Palacios Public Git Repository

To check out Palacios, execute:

  git clone http://v3vee.org/palacios/palacios.web/palacios.git

This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, execute:

  cd palacios
  git checkout --track -b devel origin/devel

The other branches work the same way.
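To see which branches are available before switching, you can list the remote-tracking branches (standard git, nothing Palacios-specific):

  git branch -r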


Changes to try to make time handling better. Also disabled TSC offsetting for now...
[palacios.git] / palacios/src/palacios/vmx.c
/*
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National
 * Science Foundation and the Department of Energy.
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at
 * http://www.v3vee.org
 *
 * Copyright (c) 2011, Jack Lange <jarusl@cs.northwestern.edu>
 * Copyright (c) 2011, The V3VEE Project <http://www.v3vee.org>
 * All rights reserved.
 *
 * Author: Jack Lange <jarusl@cs.northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */

#include <palacios/vmx.h>
#include <palacios/vmm.h>
#include <palacios/vmx_handler.h>
#include <palacios/vmcs.h>
#include <palacios/vmx_lowlevel.h>
#include <palacios/vmm_lowlevel.h>
#include <palacios/vmm_ctrl_regs.h>
#include <palacios/vmm_config.h>
#include <palacios/vmm_time.h>
#include <palacios/vm_guest_mem.h>
#include <palacios/vmm_direct_paging.h>
#include <palacios/vmx_io.h>
#include <palacios/vmx_msr.h>
#include <palacios/vmm_decoder.h>
#include <palacios/vmm_barrier.h>

#ifdef V3_CONFIG_CHECKPOINT
#include <palacios/vmm_checkpoint.h>
#endif

#include <palacios/vmx_ept.h>
#include <palacios/vmx_assist.h>
#include <palacios/vmx_hw_info.h>

#ifndef V3_CONFIG_DEBUG_VMX
#undef PrintDebug
#define PrintDebug(fmt, args...)
#endif

/* These fields contain the hardware feature sets supported by the local CPU */
static struct vmx_hw_info hw_info;

extern v3_cpu_arch_t v3_cpu_types[];

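/* Physical addresses of the per-CPU VMXON regions; filled in by v3_init_vmx_cpu() */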
static addr_t host_vmcs_ptrs[V3_CONFIG_MAX_CPUS] = { [0 ... V3_CONFIG_MAX_CPUS - 1] = 0};

extern int v3_vmx_launch(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
extern int v3_vmx_resume(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);

static inline int check_vmcs_write(vmcs_field_t field, addr_t val) {
    int ret = 0;

    ret = vmcs_write(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMWRITE error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
        return 1;
    }

    return 0;
}

static inline int check_vmcs_read(vmcs_field_t field, void * val) {
    int ret = 0;

    ret = vmcs_read(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMREAD error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
    }

    return ret;
}

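/* Allocates and zeroes one page for a VMCS (also reused for the per-CPU VMXON
 * region below), stamps it with the hardware revision ID, and returns its
 * *physical* address, since VMPTRLD and VMXON take physical addresses.
 * Note: callers assume the page allocation succeeds. */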
static addr_t allocate_vmcs() {
    struct vmcs_data * vmcs_page = NULL;

    PrintDebug("Allocating page\n");

    vmcs_page = (struct vmcs_data *)V3_VAddr(V3_AllocPages(1));
    memset(vmcs_page, 0, 4096);

    vmcs_page->revision = hw_info.basic_info.revision;
    PrintDebug("VMX Revision: 0x%x\n", vmcs_page->revision);

    return (addr_t)V3_PAddr((void *)vmcs_page);
}


static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state) {
    int vmx_ret = 0;

    // disable global interrupts for vm state initialization
    v3_disable_ints();

    PrintDebug("Loading VMCS\n");
    vmx_ret = vmcs_load(vmx_state->vmcs_ptr_phys);
    vmx_state->state = VMX_UNLAUNCHED;

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMPTRLD failed\n");
        return -1;
    }

    /*** Setup default state from HW ***/

    vmx_state->pin_ctrls.value = hw_info.pin_ctrls.def_val;
    vmx_state->pri_proc_ctrls.value = hw_info.proc_ctrls.def_val;
    vmx_state->exit_ctrls.value = hw_info.exit_ctrls.def_val;
    vmx_state->entry_ctrls.value = hw_info.entry_ctrls.def_val;
    vmx_state->sec_proc_ctrls.value = hw_info.sec_proc_ctrls.def_val;

    /* Print Control MSRs */
    PrintDebug("CR0 MSR: %p\n", (void *)(addr_t)hw_info.cr0.value);
    PrintDebug("CR4 MSR: %p\n", (void *)(addr_t)hw_info.cr4.value);


    /******* Setup Host State **********/

    /* Cache GDTR, IDTR, and TR in host struct */
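    /* SGDT/SIDT store a 2-byte limit followed by the base address, and STR stores
     * a 2-byte selector, so the packed struct below overlays both layouts: the
     * 16-bit field holds the limit (or selector) and 'base' holds the table base. */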
    addr_t gdtr_base;
    struct {
        uint16_t selector;
        addr_t   base;
    } __attribute__((packed)) tmp_seg;

    __asm__ __volatile__(
                         "sgdt (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    gdtr_base = tmp_seg.base;
    vmx_state->host_state.gdtr.base = gdtr_base;

    __asm__ __volatile__(
                         "sidt (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    vmx_state->host_state.idtr.base = tmp_seg.base;

    __asm__ __volatile__(
                         "str (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    vmx_state->host_state.tr.selector = tmp_seg.selector;

    /* The GDT *index* is bits 3-15 of the selector. */
    struct tss_descriptor * desc = NULL;
    desc = (struct tss_descriptor *)(gdtr_base + (8 * (tmp_seg.selector >> 3)));

    tmp_seg.base = ((desc->base1) |
                    (desc->base2 << 16) |
                    (desc->base3 << 24) |
#ifdef __V3_64BIT__
                    ((uint64_t)desc->base4 << 32)
#else
                    (0)
#endif
                    );

    vmx_state->host_state.tr.base = tmp_seg.base;


    /********** Setup VMX Control Fields ***********/

    /* Add external interrupts, NMI exiting, and virtual NMI */
    vmx_state->pin_ctrls.nmi_exit = 1;
    vmx_state->pin_ctrls.ext_int_exit = 1;

    vmx_state->pri_proc_ctrls.hlt_exit = 1;

    vmx_state->pri_proc_ctrls.pause_exit = 0;
    vmx_state->pri_proc_ctrls.tsc_offset = 1;
#ifdef V3_CONFIG_TIME_VIRTUALIZE_TSC
    vmx_state->pri_proc_ctrls.rdtsc_exit = 1;
#endif
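    /* When full TSC virtualization is configured, RDTSC exits so the VMM can supply
     * the guest's TSC; otherwise the guest reads the host TSC through the hardware
     * TSC offset enabled above (the offset writes themselves are currently disabled
     * in v3_vmx_enter(); see the commit message). */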

    /* Setup IO map */
    vmx_state->pri_proc_ctrls.use_io_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_A_ADDR, (addr_t)V3_PAddr(core->vm_info->io_map.arch_data));
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_B_ADDR, 
            (addr_t)V3_PAddr(core->vm_info->io_map.arch_data) + PAGE_SIZE_4KB);

    vmx_state->pri_proc_ctrls.use_msr_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_MSR_BITMAP, (addr_t)V3_PAddr(core->vm_info->msr_map.arch_data));

#ifdef __V3_64BIT__
    // Ensure host runs in 64-bit mode at each VM EXIT
    vmx_state->exit_ctrls.host_64_on = 1;
#endif

    // Hook all accesses to EFER register
    v3_hook_msr(core->vm_info, EFER_MSR, 
                &v3_handle_efer_read,
                &v3_handle_efer_write, 
                core);

    // Restore host's EFER register on each VM EXIT
    vmx_state->exit_ctrls.ld_efer = 1;

    // Save/restore guest's EFER register to/from VMCS on VM EXIT/ENTRY
    vmx_state->exit_ctrls.save_efer = 1;
    vmx_state->entry_ctrls.ld_efer  = 1;

    // Cause VM_EXIT whenever CR4.VMXE or CR4.PAE bits are written
    vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);


    /* Setup paging */
    if (core->shdw_pg_mode == SHADOW_PAGING) {
        PrintDebug("Creating initial shadow page table\n");

        if (v3_init_passthrough_pts(core) == -1) {
            PrintError("Could not initialize passthrough page tables\n");
            return -1;
        }

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        core->ctrl_regs.cr3 = core->direct_map_pt;

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Add CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 1;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 1;

        vmx_state->pri_proc_ctrls.invlpg_exit = 1;

        /* Add page fault exits */
        vmx_state->excp_bmap.pf = 1;

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_CPU)) {

        // CR0_PE, CR0_PG, and CR0_WP are defined in the branch above
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Disable CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;

        /* Add page fault exits */
        //      vmx_state->excp_bmap.pf = 1; // This should never happen..., enabled to catch bugs

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging

        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        int i = 0;
        // For now we assume that unrestricted guest mode is available whenever EPT is

        core->vm_regs.rsp = 0x00;
        core->rip = 0xfff0;
        core->vm_regs.rdx = 0x00000f00;
        core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
        core->ctrl_regs.cr0 = 0x60010010; // Set the WP flag so the memory hooks work in real-mode

        core->segments.cs.selector = 0xf000;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = 0x0000000f0000LL;

        // (raw attributes = 0xf3)
        core->segments.cs.type = 0xb;
        core->segments.cs.system = 0x1;
        core->segments.cs.dpl = 0x0;
        core->segments.cs.present = 1;

        struct v3_segment * segregs [] = {&(core->segments.ss), &(core->segments.ds), 
                                          &(core->segments.es), &(core->segments.fs), 
                                          &(core->segments.gs), NULL};

        for ( i = 0; segregs[i] != NULL; i++) {
            struct v3_segment * seg = segregs[i];

            seg->selector = 0x0000;
            //    seg->base = seg->selector << 4;
            seg->base = 0x00000000;
            seg->limit = 0xffff;

            seg->type = 0x3;
            seg->system = 0x1;
            seg->dpl = 0x0;
            seg->present = 1;
            //    seg->granularity = 1;

        }

        core->segments.gdtr.limit = 0x0000ffff;
        core->segments.gdtr.base = 0x0000000000000000LL;

        core->segments.idtr.limit = 0x0000ffff;
        core->segments.idtr.base = 0x0000000000000000LL;

        core->segments.ldtr.selector = 0x0000;
        core->segments.ldtr.limit = 0x0000ffff;
        core->segments.ldtr.base = 0x0000000000000000LL;
        core->segments.ldtr.type = 2;
        core->segments.ldtr.present = 1;

        core->segments.tr.selector = 0x0000;
        core->segments.tr.limit = 0x0000ffff;
        core->segments.tr.base = 0x0000000000000000LL;
        core->segments.tr.type = 0xb;
        core->segments.tr.present = 1;

        //      core->dbg_regs.dr6 = 0x00000000ffff0ff0LL;
        core->dbg_regs.dr7 = 0x0000000000000400LL;

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
        vmx_state->sec_proc_ctrls.unrstrct_guest = 1; // enable unrestricted guest operation

        /* Disable shadow paging stuff */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;

        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else {
        PrintError("Invalid virtual paging mode\n");
        return -1;
    }


    // hook vmx msrs

    // Setup SYSCALL/SYSENTER MSRs in load/store area

    // save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
    {
#define IA32_STAR 0xc0000081
#define IA32_LSTAR 0xc0000082
#define IA32_FMASK 0xc0000084
#define IA32_KERN_GS_BASE 0xc0000102

#define IA32_CSTAR 0xc0000083 // Compatibility mode STAR (ignored for now... hopefully it's not that important...)

        int msr_ret = 0;

        struct vmcs_msr_entry * exit_store_msrs = NULL;
        struct vmcs_msr_entry * exit_load_msrs = NULL;
        struct vmcs_msr_entry * entry_load_msrs = NULL;
        int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;

        V3_Print("Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);

        if (max_msrs < 4) {
            PrintError("Max MSR cache size is too small (%d)\n", max_msrs);
            return -1;
        }

        vmx_state->msr_area = V3_VAddr(V3_AllocPages(1));

        if (vmx_state->msr_area == NULL) {
            PrintError("could not allocate msr load/store area\n");
            return -1;
        }

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);

        exit_store_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area);
        exit_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 4));
        entry_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 8));
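        /* The single msr_area page is carved into three consecutive 4-entry
         * arrays (exit-store, exit-load, entry-load), matching the counts
         * written to the VMCS above. */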

        exit_store_msrs[0].index = IA32_STAR;
        exit_store_msrs[1].index = IA32_LSTAR;
        exit_store_msrs[2].index = IA32_FMASK;
        exit_store_msrs[3].index = IA32_KERN_GS_BASE;

        // Propagate the index table into the two load areas
        // (note that the destination is memcpy's first argument)
        memcpy(exit_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);
        memcpy(entry_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);

        v3_get_msr(IA32_STAR, &(exit_load_msrs[0].hi), &(exit_load_msrs[0].lo));
        v3_get_msr(IA32_LSTAR, &(exit_load_msrs[1].hi), &(exit_load_msrs[1].lo));
        v3_get_msr(IA32_FMASK, &(exit_load_msrs[2].hi), &(exit_load_msrs[2].lo));
        v3_get_msr(IA32_KERN_GS_BASE, &(exit_load_msrs[3].hi), &(exit_load_msrs[3].lo));

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(exit_store_msrs));
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(exit_load_msrs));
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(entry_load_msrs));

    }

    /* Sanity check ctrl/reg fields against hw_defaults */


    /*** Write all the info to the VMCS ***/

    /*
    {
        // IS THIS NECESSARY???
#define DEBUGCTL_MSR 0x1d9
        struct v3_msr tmp_msr;
        v3_get_msr(DEBUGCTL_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
        vmx_ret |= check_vmcs_write(VMCS_GUEST_DBG_CTL, tmp_msr.value);
        core->dbg_regs.dr7 = 0x400;
    }
    */

#ifdef __V3_64BIT__
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffffffffffULL);
#else
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffUL);
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR_HIGH, (addr_t)0xffffffffUL);
#endif

    if (v3_update_vmcs_ctrl_fields(core)) {
        PrintError("Could not write control fields!\n");
        return -1;
    }

    if (v3_update_vmcs_host_state(core)) {
        PrintError("Could not write host state\n");
        return -1;
    }

    // Re-enable global interrupts now that the VM state is initialized.
    // If another VM kicks us off this CPU, it will update our vmx state
    // so that we know to reload ourselves.
    v3_enable_ints();

    return 0;
}

int v3_init_vmx_vmcs(struct guest_info * core, v3_vm_class_t vm_class) {
    struct vmx_data * vmx_state = NULL;
    int vmx_ret = 0;

    vmx_state = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));
    memset(vmx_state, 0, sizeof(struct vmx_data));

    PrintDebug("vmx_data pointer: %p\n", (void *)vmx_state);

    PrintDebug("Allocating VMCS\n");
    vmx_state->vmcs_ptr_phys = allocate_vmcs();

    PrintDebug("VMCS pointer: %p\n", (void *)(vmx_state->vmcs_ptr_phys));

    core->vmm_data = vmx_state;
    vmx_state->state = VMX_UNLAUNCHED;

    PrintDebug("Initializing VMCS (addr=%p)\n", core->vmm_data);

    // TODO: Fix vmcs fields so they're 32-bit

    PrintDebug("Clearing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMCLEAR failed\n");
        return -1;
    }

    if (vm_class == V3_PC_VM) {
        PrintDebug("Initializing VMCS\n");
        if (init_vmcs_bios(core, vmx_state) == -1) {
            PrintError("Error initializing VMCS to BIOS state\n");
            return -1;
        }
    } else {
        PrintError("Invalid VM Class\n");
        return -1;
    }

    PrintDebug("Serializing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    return 0;
}

int v3_deinit_vmx_vmcs(struct guest_info * core) {
    struct vmx_data * vmx_state = core->vmm_data;

    V3_FreePages((void *)(vmx_state->vmcs_ptr_phys), 1);
    V3_FreePages(V3_PAddr(vmx_state->msr_area), 1);

    V3_Free(vmx_state);

    return 0;
}

#ifdef V3_CONFIG_CHECKPOINT
/* 
 * JRL: This is broken
 */
int v3_vmx_save_core(struct guest_info * core, void * ctx){
    uint64_t vmcs_ptr = vmcs_store();

    v3_chkpt_save(ctx, "vmcs_data", PAGE_SIZE, (void *)vmcs_ptr);

    return 0;
}

int v3_vmx_load_core(struct guest_info * core, void * ctx){
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    struct cr0_32 * shadow_cr0;
    char vmcs[PAGE_SIZE_4KB];

    v3_chkpt_load(ctx, "vmcs_data", PAGE_SIZE_4KB, vmcs);

    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmcs_load((addr_t)vmcs);

    v3_vmx_save_vmcs(core);

    shadow_cr0 = (struct cr0_32 *)&(core->ctrl_regs.cr0);

    /* Get the CPU mode to set the guest_ia32e entry ctrl */

    if (core->shdw_pg_mode == SHADOW_PAGING) {
        if (v3_get_vm_mem_mode(core) == VIRTUAL_MEM) {
            if (v3_activate_shadow_pt(core) == -1) {
                PrintError("Failed to activate shadow page tables\n");
                return -1;
            }
        } else {
            if (v3_activate_passthrough_pt(core) == -1) {
                PrintError("Failed to activate passthrough page tables\n");
                return -1;
            }
        }
    }

    return 0;
}
#endif

static int update_irq_exit_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));

    if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 0)) {
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Calling v3_injecting_intr\n");
#endif
        info->intr_core_state.irq_started = 0;
        v3_injecting_intr(info, info->intr_core_state.irq_vector, V3_EXTERNAL_IRQ);
    }

    return 0;
}

static int update_irq_entry_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;
    struct vmcs_interrupt_state intr_core_state;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
    check_vmcs_read(VMCS_GUEST_INT_STATE, &(intr_core_state));

    /* Check for pending exceptions to inject */
    if (v3_excp_pending(info)) {
        struct vmx_entry_int_info int_info;
        int_info.value = 0;

        // In VMX, almost every exception is hardware
        // Software exceptions are pretty much only for breakpoint or overflow
        int_info.type = 3;
        int_info.vector = v3_get_excp_number(info);

        if (info->excp_state.excp_error_code_valid) {
            check_vmcs_write(VMCS_ENTRY_EXCP_ERR, info->excp_state.excp_error_code);
            int_info.error_code = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("Injecting exception %d with error code %x\n", 
                    int_info.vector, info->excp_state.excp_error_code);
#endif
        }

        int_info.valid = 1;
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Injecting exception %d (EIP=%p)\n", int_info.vector, (void *)(addr_t)info->rip);
#endif
        check_vmcs_write(VMCS_ENTRY_INT_INFO, int_info.value);

        v3_injecting_excp(info, int_info.vector);

    } else if ((((struct rflags *)&(info->ctrl_regs.rflags))->intr == 1) && 
               (intr_core_state.val == 0)) {

        if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 1)) {

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("IRQ pending from previous injection\n");
#endif

            // Copy the IDT vectoring info over to reinject the old interrupt
            if (idt_vec_info.error_code == 1) {
                uint32_t err_code = 0;

                check_vmcs_read(VMCS_IDT_VECTOR_ERR, &err_code);
                check_vmcs_write(VMCS_ENTRY_EXCP_ERR, err_code);
            }

            idt_vec_info.undef = 0;
            check_vmcs_write(VMCS_ENTRY_INT_INFO, idt_vec_info.value);

        } else {
            struct vmx_entry_int_info ent_int;
            ent_int.value = 0;

            switch (v3_intr_pending(info)) {
                case V3_EXTERNAL_IRQ: {
                    info->intr_core_state.irq_vector = v3_get_intr(info); 
                    ent_int.vector = info->intr_core_state.irq_vector;
                    ent_int.type = 0;
                    ent_int.error_code = 0;
                    ent_int.valid = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
                    V3_Print("Injecting Interrupt %d at exit %u (EIP=%p)\n", 
                               info->intr_core_state.irq_vector, 
                               (uint32_t)info->num_exits, 
                               (void *)(addr_t)info->rip);
#endif

                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);
                    info->intr_core_state.irq_started = 1;

                    break;
                }
                case V3_NMI:
                    PrintDebug("Injecting NMI\n");

                    ent_int.type = 2;
                    ent_int.vector = 2;
                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_SOFTWARE_INTR:
                    PrintDebug("Injecting software interrupt\n");
                    ent_int.type = 4;

                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_VIRTUAL_IRQ:
                    // Not sure what to do here; Intel doesn't have virtual IRQs
                    // May be the same as external interrupts/IRQs

                    break;
                case V3_INVALID_INTR:
                default:
                    break;
            }
        }
    } else if ((v3_intr_pending(info)) && (vmx_info->pri_proc_ctrls.int_wndw_exit == 0)) {
        // Enable INTR window exiting so we know when IF=1
        uint32_t instr_len;

        check_vmcs_read(VMCS_EXIT_INSTR_LEN, &instr_len);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Enabling Interrupt-Window exiting: %d\n", instr_len);
#endif

        vmx_info->pri_proc_ctrls.int_wndw_exit = 1;
        check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
    }

    return 0;
}

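/* Ring buffer of the 10 most recent exits, indexed by num_exits % 10 */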
static struct vmx_exit_info exit_log[10];

static void print_exit_log(struct guest_info * info) {
    int cnt = info->num_exits % 10;
    int i = 0;

    V3_Print("\nExit Log (%d total exits):\n", (uint32_t)info->num_exits);

    for (i = 0; i < 10; i++) {
        struct vmx_exit_info * tmp = &exit_log[cnt];

        V3_Print("%d:\texit_reason = %p\n", i, (void *)(addr_t)tmp->exit_reason);
        V3_Print("\texit_qual = %p\n", (void *)tmp->exit_qual);
        V3_Print("\tint_info = %p\n", (void *)(addr_t)tmp->int_info);
        V3_Print("\tint_err = %p\n", (void *)(addr_t)tmp->int_err);
        V3_Print("\tinstr_info = %p\n", (void *)(addr_t)tmp->instr_info);

        cnt--;

        if (cnt == -1) {
            cnt = 9;
        }

    }

}

/* 
 * CAUTION and DANGER!!! 
 * 
 * The VMCS CANNOT(!!) be accessed outside of the cli/sti calls inside this function.
 * When executing a symbiotic call, the VMCS WILL be overwritten, so any dependencies 
 * on its contents will cause things to break. The contents at the time of the exit WILL 
 * change before the exit handler is executed.
 */
int v3_vmx_enter(struct guest_info * info) {
    int ret = 0;
    // uint32_t tsc_offset_low, tsc_offset_high; // unused while TSC offsetting is disabled (see below)
    struct vmx_exit_info exit_info;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    // Perform any additional yielding needed for time adjustment
    v3_adjust_time(info);

    // disable global interrupts for vm state transition
    v3_disable_ints();

    // Update timer devices late after being in the VM so that as much 
    // of the time in the VM is accounted for as possible. Also do it before
    // updating IRQ entry state so that any interrupts the timers raise get 
    // handled on the next VM entry. Must be done with interrupts disabled.
    v3_update_timers(info);

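    // If some other VMCS is current on this CPU (e.g. another core ran here),
    // make ours current again and fall back to a fresh VMLAUNCH (VMX_UNLAUNCHED)
    // instead of a VMRESUME.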
    if (vmcs_store() != vmx_info->vmcs_ptr_phys) {
        vmcs_clear(vmx_info->vmcs_ptr_phys);
        vmcs_load(vmx_info->vmcs_ptr_phys);
        vmx_info->state = VMX_UNLAUNCHED;
    }

    v3_vmx_restore_vmcs(info);

#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_entry_state(info);
    }
#else 
    update_irq_entry_state(info);
#endif

    {
        addr_t guest_cr3;
        vmcs_read(VMCS_GUEST_CR3, &guest_cr3);
        vmcs_write(VMCS_GUEST_CR3, guest_cr3);
    }

    // Perform last-minute time bookkeeping prior to entering the VM
    v3_time_enter_vm(info);

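    // TSC offsetting is currently disabled (see the commit message); the VMCS
    // offset writes below are kept for reference: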
    // tsc_offset_high = (uint32_t)((v3_tsc_host_offset(&info->time_state) >> 32) & 0xffffffff);
    // tsc_offset_low = (uint32_t)(v3_tsc_host_offset(&info->time_state) & 0xffffffff);
    // check_vmcs_write(VMCS_TSC_OFFSET_HIGH, tsc_offset_high);
    // check_vmcs_write(VMCS_TSC_OFFSET, tsc_offset_low);

    if (v3_update_vmcs_host_state(info)) {
        v3_enable_ints();
        PrintError("Could not write host state\n");
        return -1;
    }

    if (vmx_info->state == VMX_UNLAUNCHED) {
        vmx_info->state = VMX_LAUNCHED;

        info->vm_info->run_state = VM_RUNNING;
        ret = v3_vmx_launch(&(info->vm_regs), info, &(info->ctrl_regs));
    } else {
        V3_ASSERT(vmx_info->state != VMX_UNLAUNCHED);
        ret = v3_vmx_resume(&(info->vm_regs), info, &(info->ctrl_regs));
    }

    //  PrintDebug("VMX Exit: ret=%d\n", ret);

    if (ret != VMX_SUCCESS) {
        uint32_t error = 0;

        vmcs_read(VMCS_INSTR_ERR, &error);

        v3_enable_ints();

        PrintError("VMENTRY Error: %d\n", error);
        return -1;
    }

    // Immediate exit from VM time bookkeeping
    v3_time_exit_vm(info);

    info->num_exits++;

    /* Update guest state */
    v3_vmx_save_vmcs(info);

    // info->cpl = info->segments.cs.selector & 0x3;

    info->mem_mode = v3_get_vm_mem_mode(info);
    info->cpu_mode = v3_get_vm_cpu_mode(info);

    check_vmcs_read(VMCS_EXIT_INSTR_LEN, &(exit_info.instr_len));
    check_vmcs_read(VMCS_EXIT_INSTR_INFO, &(exit_info.instr_info));
    check_vmcs_read(VMCS_EXIT_REASON, &(exit_info.exit_reason));
    check_vmcs_read(VMCS_EXIT_QUAL, &(exit_info.exit_qual));
    check_vmcs_read(VMCS_EXIT_INT_INFO, &(exit_info.int_info));
    check_vmcs_read(VMCS_EXIT_INT_ERR, &(exit_info.int_err));
    check_vmcs_read(VMCS_GUEST_LINEAR_ADDR, &(exit_info.guest_linear_addr));

    if (info->shdw_pg_mode == NESTED_PAGING) {
        check_vmcs_read(VMCS_GUEST_PHYS_ADDR, &(exit_info.ept_fault_addr));
    }

    //PrintDebug("VMX Exit taken, id-qual: %u-%lu\n", exit_info.exit_reason, exit_info.exit_qual);

    exit_log[info->num_exits % 10] = exit_info;

#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_exit_state(info);
    }
#else
    update_irq_exit_state(info);
#endif

    if (exit_info.exit_reason == VMEXIT_INTR_WINDOW) {
        // This is a special case whose only job is to inject an interrupt
        vmcs_read(VMCS_PROC_CTRLS, &(vmx_info->pri_proc_ctrls.value));
        vmx_info->pri_proc_ctrls.int_wndw_exit = 0;
        vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Interrupts available again! (RIP=%llx)\n", info->rip);
#endif
    }

    // reenable global interrupts after vm exit
    v3_enable_ints();

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    if (v3_handle_vmx_exit(info, &exit_info) == -1) {
        PrintError("Error in VMX exit handler (Exit reason=%x)\n", exit_info.exit_reason);
        return -1;
    }

    return 0;
}

int v3_start_vmx_guest(struct guest_info * info) {

    PrintDebug("Starting VMX core %u\n", info->vcpu_id);

    if (info->vcpu_id == 0) {
        info->core_run_state = CORE_RUNNING;
        info->vm_info->run_state = VM_RUNNING;
    } else {

        PrintDebug("VMX core %u: Waiting for core initialization\n", info->vcpu_id);

        while (info->core_run_state == CORE_STOPPED) {
            v3_yield(info);
            //PrintDebug("VMX core %u: still waiting for INIT\n", info->vcpu_id);
        }

        PrintDebug("VMX core %u initialized\n", info->vcpu_id);

        // We'll be paranoid about race conditions here
        v3_wait_at_barrier(info);
    }

    PrintDebug("VMX core %u: I am starting at CS=0x%x (base=0x%p, limit=0x%x), RIP=0x%p\n",
               info->vcpu_id, info->segments.cs.selector, (void *)(info->segments.cs.base),
               info->segments.cs.limit, (void *)(info->rip));

    PrintDebug("VMX core %u: Launching VMX VM on logical core %u\n", info->vcpu_id, info->pcpu_id);

    v3_start_time(info);

    while (1) {

        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }

        if (v3_vmx_enter(info) == -1) {

            addr_t host_addr;
            addr_t linear_addr = 0;

            info->vm_info->run_state = VM_ERROR;

            V3_Print("VMX core %u: VMX ERROR!!\n", info->vcpu_id); 

            v3_print_guest_state(info);

            V3_Print("VMX core %u\n", info->vcpu_id); 

            linear_addr = get_addr_linear(info, info->rip, &(info->segments.cs));

            if (info->mem_mode == PHYSICAL_MEM) {
                v3_gpa_to_hva(info, linear_addr, &host_addr);
            } else if (info->mem_mode == VIRTUAL_MEM) {
                v3_gva_to_hva(info, linear_addr, &host_addr);
            }

            V3_Print("VMX core %u: Host Address of rip = 0x%p\n", info->vcpu_id, (void *)host_addr);

            V3_Print("VMX core %u: Instr (15 bytes) at %p:\n", info->vcpu_id, (void *)host_addr);
            v3_dump_mem((uint8_t *)host_addr, 15);

            v3_print_stack(info);

            v3_print_vmcs();
            print_exit_log(info);
            return -1;
        }

        v3_wait_at_barrier(info);

        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }
/*
        if ((info->num_exits % 5000) == 0) {
            V3_Print("VMX Exit number %d\n", (uint32_t)info->num_exits);
        }
*/

    }

    return 0;
}


#define VMX_FEATURE_CONTROL_MSR     0x0000003a
#define CPUID_VMX_FEATURES 0x00000005  /* LOCK and VMXON */
#define CPUID_1_ECX_VTXFLAG 0x00000020
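/* CPUID leaf 1 reports VMX support in ECX bit 5; the feature-control MSR's lock
 * bit and VMXON-enable bit must both be set (typically by the BIOS) for VMXON
 * to be legal. */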

int v3_is_vmx_capable() {
    v3_msr_t feature_msr;
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

    v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);

    PrintDebug("ECX: 0x%x\n", ecx);

    if (ecx & CPUID_1_ECX_VTXFLAG) {
        v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));

        PrintDebug("MSRREGlow: 0x%.8x\n", feature_msr.lo);

        if ((feature_msr.lo & CPUID_VMX_FEATURES) != CPUID_VMX_FEATURES) {
            PrintDebug("VMX is locked -- enable in the BIOS\n");
            return 0;
        }

    } else {
        PrintDebug("VMX not supported on this CPU\n");
        return 0;
    }

    return 1;
}


int v3_reset_vmx_vm_core(struct guest_info * core, addr_t rip) {
    // init vmcs bios
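    // 'rip' here is the SIPI start vector: with EPT + unrestricted guest the core
    // can be reset directly into real mode at vector<<12 (CS selector = vector<<8);
    // otherwise the vector is presumably handed off via RDX/RBX for emulated startup.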

    if ((core->shdw_pg_mode == NESTED_PAGING) && 
        (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        // easy 
        core->rip = 0;
        core->segments.cs.selector = rip << 8;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = rip << 12;
    } else {
        core->vm_regs.rdx = core->vcpu_id;
        core->vm_regs.rbx = rip;
    }

    return 0;
}


void v3_init_vmx_cpu(int cpu_id) {
    addr_t vmx_on_region = 0;

    if (cpu_id == 0) {
        if (v3_init_vmx_hw(&hw_info) == -1) {
            PrintError("Could not initialize VMX hardware features on cpu %d\n", cpu_id);
            return;
        }
    }

    enable_vmx();

    // Setup VMXON Region
    vmx_on_region = allocate_vmcs();

    if (vmx_on(vmx_on_region) == VMX_SUCCESS) {
        V3_Print("VMX Enabled\n");
        host_vmcs_ptrs[cpu_id] = vmx_on_region;
    } else {
        V3_Print("VMX already enabled\n");
        V3_FreePages((void *)vmx_on_region, 1);
    }

    PrintDebug("VMXON pointer: 0x%p\n", (void *)host_vmcs_ptrs[cpu_id]);

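    // Classify this CPU's virtualization support from the allowed secondary
    // processor-based controls probed at init time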
    {
        struct vmx_sec_proc_ctrls sec_proc_ctrls;
        sec_proc_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.sec_proc_ctrls));

        if (sec_proc_ctrls.enable_ept == 0) {
            V3_Print("VMX EPT (Nested) Paging not supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_CPU;
        } else if (sec_proc_ctrls.unrstrct_guest == 0) {
            V3_Print("VMX EPT (Nested) Paging supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_CPU;
        } else {
            V3_Print("VMX EPT (Nested) Paging + Unrestricted guest supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_UG_CPU;
        }
    }
}


void v3_deinit_vmx_cpu(int cpu_id) {
    extern v3_cpu_arch_t v3_cpu_types[];
    v3_cpu_types[cpu_id] = V3_INVALID_CPU;

    if (host_vmcs_ptrs[cpu_id] != 0) {
        V3_Print("Disabling VMX\n");

        if (vmx_off() != VMX_SUCCESS) {
            PrintError("Error executing VMXOFF\n");
        }

        V3_FreePages((void *)host_vmcs_ptrs[cpu_id], 1);

        host_vmcs_ptrs[cpu_id] = 0;
    }
}