Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git

This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, simply execute

  cd palacios
  git checkout --track -b devel origin/devel

The other branches are similar.
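
For example, to work from a release branch instead, the command is analogous (the branch name below is illustrative; run "git branch -r" to list the branches that actually exist):

  git checkout --track -b Release-1.2 origin/Release-1.2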


File: palacios/src/palacios/vmx.c (at commit e733eb5500aa331f45168e174761a4ac0d2e763b)
/*
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National
 * Science Foundation and the Department of Energy.
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at
 * http://www.v3vee.org
 *
 * Copyright (c) 2011, Jack Lange <jarusl@cs.northwestern.edu>
 * Copyright (c) 2011, The V3VEE Project <http://www.v3vee.org>
 * All rights reserved.
 *
 * Author: Jack Lange <jarusl@cs.northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */


#include <palacios/vmx.h>
#include <palacios/vmm.h>
#include <palacios/vmx_handler.h>
#include <palacios/vmcs.h>
#include <palacios/vmx_lowlevel.h>
#include <palacios/vmm_lowlevel.h>
#include <palacios/vmm_ctrl_regs.h>
#include <palacios/vmm_config.h>
#include <palacios/vmm_time.h>
#include <palacios/vm_guest_mem.h>
#include <palacios/vmm_direct_paging.h>
#include <palacios/vmx_io.h>
#include <palacios/vmx_msr.h>
#include <palacios/vmm_decoder.h>
#include <palacios/vmm_barrier.h>

#ifdef V3_CONFIG_CHECKPOINT
#include <palacios/vmm_checkpoint.h>
#endif

#include <palacios/vmx_ept.h>
#include <palacios/vmx_assist.h>
#include <palacios/vmx_hw_info.h>

#ifndef V3_CONFIG_DEBUG_VMX
#undef PrintDebug
#define PrintDebug(fmt, args...)
#endif


/* These fields contain the hardware feature sets supported by the local CPU */
static struct vmx_hw_info hw_info;

extern v3_cpu_arch_t v3_cpu_types[];

static addr_t host_vmcs_ptrs[V3_CONFIG_MAX_CPUS] = { [0 ... V3_CONFIG_MAX_CPUS - 1] = 0 };

extern int v3_vmx_launch(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
extern int v3_vmx_resume(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);

static inline int check_vmcs_write(vmcs_field_t field, addr_t val) {
    int ret = 0;

    ret = vmcs_write(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMWRITE error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
        return 1;
    }

    return 0;
}

static inline int check_vmcs_read(vmcs_field_t field, void * val) {
    int ret = 0;

    ret = vmcs_read(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMREAD error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
    }

    return ret;
}
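
/* Note the asymmetry between the two helpers above: check_vmcs_write()
 * normalizes any failure to 1 so that callers can accumulate errors with |=,
 * while check_vmcs_read() passes the raw vmcs_read() status through. */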



static addr_t allocate_vmcs() {
    struct vmcs_data * vmcs_page = NULL;

    PrintDebug("Allocating page\n");

    vmcs_page = (struct vmcs_data *)V3_VAddr(V3_AllocPages(1));
    memset(vmcs_page, 0, 4096);

    vmcs_page->revision = hw_info.basic_info.revision;
    PrintDebug("VMX Revision: 0x%x\n", vmcs_page->revision);

    return (addr_t)V3_PAddr((void *)vmcs_page);
}
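
/* Per the Intel SDM, a VMCS region must be 4KB aligned and must begin with the
 * VMCS revision identifier reported by the IA32_VMX_BASIC MSR; writing
 * hw_info.basic_info.revision above satisfies that. The same helper is reused
 * to allocate the VMXON region in v3_init_vmx_cpu() below. */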



static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state) {
    int vmx_ret = 0;

    // disable global interrupts for vm state initialization
    v3_disable_ints();

    PrintDebug("Loading VMCS\n");
    vmx_ret = vmcs_load(vmx_state->vmcs_ptr_phys);
    vmx_state->state = VMX_UNLAUNCHED;

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMPTRLD failed\n");
        return -1;
    }


    /*** Setup default state from HW ***/

    vmx_state->pin_ctrls.value = hw_info.pin_ctrls.def_val;
    vmx_state->pri_proc_ctrls.value = hw_info.proc_ctrls.def_val;
    vmx_state->exit_ctrls.value = hw_info.exit_ctrls.def_val;
    vmx_state->entry_ctrls.value = hw_info.entry_ctrls.def_val;
    vmx_state->sec_proc_ctrls.value = hw_info.sec_proc_ctrls.def_val;

    /* Print Control MSRs */
    PrintDebug("CR0 MSR: %p\n", (void *)(addr_t)hw_info.cr0.value);
    PrintDebug("CR4 MSR: %p\n", (void *)(addr_t)hw_info.cr4.value);



    /******* Setup Host State **********/

    /* Cache GDTR, IDTR, and TR in host struct */
    addr_t gdtr_base;
    struct {
        uint16_t selector;
        addr_t   base;
    } __attribute__((packed)) tmp_seg;


    __asm__ __volatile__(
                         "sgdt (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    gdtr_base = tmp_seg.base;
    vmx_state->host_state.gdtr.base = gdtr_base;

    __asm__ __volatile__(
                         "sidt (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    vmx_state->host_state.idtr.base = tmp_seg.base;

    __asm__ __volatile__(
                         "str (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    vmx_state->host_state.tr.selector = tmp_seg.selector;

    /* The GDTR *index* is bits 3-15 of the selector. */
    struct tss_descriptor * desc = NULL;
    desc = (struct tss_descriptor *)(gdtr_base + (8 * (tmp_seg.selector >> 3)));

    tmp_seg.base = ((desc->base1) |
                    (desc->base2 << 16) |
                    (desc->base3 << 24) |
#ifdef __V3_64BIT__
                    ((uint64_t)desc->base4 << 32)
#else
                    (0)
#endif
                    );

    vmx_state->host_state.tr.base = tmp_seg.base;


    /********** Setup VMX Control Fields ***********/

    /* Add external interrupts, NMI exiting, and virtual NMI */
    vmx_state->pin_ctrls.nmi_exit = 1;
    vmx_state->pin_ctrls.ext_int_exit = 1;


    vmx_state->pri_proc_ctrls.hlt_exit = 1;


    vmx_state->pri_proc_ctrls.pause_exit = 0;
    vmx_state->pri_proc_ctrls.tsc_offset = 1;
#ifdef V3_CONFIG_TIME_VIRTUALIZE_TSC
    vmx_state->pri_proc_ctrls.rdtsc_exit = 1;
#endif

    /* Setup IO map */
    vmx_state->pri_proc_ctrls.use_io_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_A_ADDR, (addr_t)V3_PAddr(core->vm_info->io_map.arch_data));
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_B_ADDR,
            (addr_t)V3_PAddr(core->vm_info->io_map.arch_data) + PAGE_SIZE_4KB);


    vmx_state->pri_proc_ctrls.use_msr_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_MSR_BITMAP, (addr_t)V3_PAddr(core->vm_info->msr_map.arch_data));


#ifdef __V3_64BIT__
    // Ensure host runs in 64-bit mode at each VM EXIT
    vmx_state->exit_ctrls.host_64_on = 1;
#endif

    // Hook all accesses to EFER register
    v3_hook_msr(core->vm_info, EFER_MSR,
                &v3_handle_efer_read,
                &v3_handle_efer_write,
                core);

    // Restore host's EFER register on each VM EXIT
    vmx_state->exit_ctrls.ld_efer = 1;

    // Save/restore guest's EFER register to/from VMCS on VM EXIT/ENTRY
    vmx_state->exit_ctrls.save_efer = 1;
    vmx_state->entry_ctrls.ld_efer  = 1;

    // Cause VM_EXIT whenever CR4.VMXE or CR4.PAE bits are written
    vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);

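    /* The CR0/CR4 guest/host masks written here and below mark those bits as
     * host-owned: guest reads of a masked bit return the read-shadow value,
     * and guest writes that would change a masked bit cause a CR-access exit. */
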
    /* Setup paging */
    if (core->shdw_pg_mode == SHADOW_PAGING) {
        PrintDebug("Creating initial shadow page table\n");

        if (v3_init_passthrough_pts(core) == -1) {
            PrintError("Could not initialize passthrough page tables\n");
            return -1;
        }

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        core->ctrl_regs.cr3 = core->direct_map_pt;

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Add CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 1;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 1;

        vmx_state->pri_proc_ctrls.invlpg_exit = 1;

        /* Add page fault exits */
        vmx_state->excp_bmap.pf = 1;

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

    } else if ((core->shdw_pg_mode == NESTED_PAGING) &&
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_CPU)) {

        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Disable CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;

        /* Add page fault exits */
        //      vmx_state->excp_bmap.pf = 1; // This should never happen..., enabled to catch bugs

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging

        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else if ((core->shdw_pg_mode == NESTED_PAGING) &&
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        int i = 0;
        // For now we will assume that unrestricted guest mode is assured with EPT


        core->vm_regs.rsp = 0x00;
        core->rip = 0xfff0;
        core->vm_regs.rdx = 0x00000f00;
        core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
        core->ctrl_regs.cr0 = 0x60010010; // Set the WP flag so the memory hooks work in real-mode


        core->segments.cs.selector = 0xf000;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = 0x0000000f0000LL;

        // (raw attributes = 0xf3)
        core->segments.cs.type = 0xb;
        core->segments.cs.system = 0x1;
        core->segments.cs.dpl = 0x0;
        core->segments.cs.present = 1;
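
        /* With CS.base = 0xf0000 and RIP = 0xfff0, the first instruction is
         * fetched from physical address 0xffff0 -- the standard x86 reset
         * vector at the top of the BIOS region. */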


        struct v3_segment * segregs [] = {&(core->segments.ss), &(core->segments.ds),
                                          &(core->segments.es), &(core->segments.fs),
                                          &(core->segments.gs), NULL};

        for (i = 0; segregs[i] != NULL; i++) {
            struct v3_segment * seg = segregs[i];

            seg->selector = 0x0000;
            //    seg->base = seg->selector << 4;
            seg->base = 0x00000000;
            seg->limit = 0xffff;


            seg->type = 0x3;
            seg->system = 0x1;
            seg->dpl = 0x0;
            seg->present = 1;
            //    seg->granularity = 1;

        }


        core->segments.gdtr.limit = 0x0000ffff;
        core->segments.gdtr.base = 0x0000000000000000LL;

        core->segments.idtr.limit = 0x0000ffff;
        core->segments.idtr.base = 0x0000000000000000LL;

        core->segments.ldtr.selector = 0x0000;
        core->segments.ldtr.limit = 0x0000ffff;
        core->segments.ldtr.base = 0x0000000000000000LL;
        core->segments.ldtr.type = 2;
        core->segments.ldtr.present = 1;

        core->segments.tr.selector = 0x0000;
        core->segments.tr.limit = 0x0000ffff;
        core->segments.tr.base = 0x0000000000000000LL;
        core->segments.tr.type = 0xb;
        core->segments.tr.present = 1;

        //      core->dbg_regs.dr6 = 0x00000000ffff0ff0LL;
        core->dbg_regs.dr7 = 0x0000000000000400LL;

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
        vmx_state->sec_proc_ctrls.unrstrct_guest = 1; // enable unrestricted guest operation


        /* Disable shadow paging stuff */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;


        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else {
        PrintError("Invalid Virtual paging mode\n");
        return -1;
    }


    // hook vmx msrs

    // Setup SYSCALL/SYSENTER MSRs in load/store area

    // save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
    {
#define IA32_STAR 0xc0000081
#define IA32_LSTAR 0xc0000082
#define IA32_FMASK 0xc0000084
#define IA32_KERN_GS_BASE 0xc0000102

#define IA32_CSTAR 0xc0000083 // Compatibility mode STAR (ignored for now... hopefully it's not that important...)

        int msr_ret = 0;

        struct vmcs_msr_entry * exit_store_msrs = NULL;
        struct vmcs_msr_entry * exit_load_msrs = NULL;
        struct vmcs_msr_entry * entry_load_msrs = NULL;
        int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;

        V3_Print("Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);

        if (max_msrs < 4) {
            PrintError("Max MSR cache size is too small (%d)\n", max_msrs);
            return -1;
        }

        vmx_state->msr_area = V3_VAddr(V3_AllocPages(1));

        if (vmx_state->msr_area == NULL) {
            PrintError("could not allocate msr load/store area\n");
            return -1;
        }

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);


        exit_store_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area);
        exit_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 4));
        entry_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 8));


        exit_store_msrs[0].index = IA32_STAR;
        exit_store_msrs[1].index = IA32_LSTAR;
        exit_store_msrs[2].index = IA32_FMASK;
        exit_store_msrs[3].index = IA32_KERN_GS_BASE;

        // Propagate the MSR index list into the exit-load and entry-load areas
        memcpy(exit_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);
        memcpy(entry_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);


        v3_get_msr(IA32_STAR, &(exit_load_msrs[0].hi), &(exit_load_msrs[0].lo));
        v3_get_msr(IA32_LSTAR, &(exit_load_msrs[1].hi), &(exit_load_msrs[1].lo));
        v3_get_msr(IA32_FMASK, &(exit_load_msrs[2].hi), &(exit_load_msrs[2].lo));
        v3_get_msr(IA32_KERN_GS_BASE, &(exit_load_msrs[3].hi), &(exit_load_msrs[3].lo));

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(exit_store_msrs));
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(exit_load_msrs));
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(entry_load_msrs));

    }
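
    /* Layout of the single msr_area page set up above:
     *   entries 0-3  : VM-exit store list  (guest STAR/LSTAR/FMASK/KERNEL_GS_BASE saved on exit)
     *   entries 4-7  : VM-exit load list   (host values captured via v3_get_msr, reloaded on exit)
     *   entries 8-11 : VM-entry load list  (guest values, reloaded on entry)
     * All three lists carry the same four MSR indices. */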

    /* Sanity check ctrl/reg fields against hw_defaults */




    /*** Write all the info to the VMCS ***/

    /*
    {
        // IS THIS NECESSARY???
#define DEBUGCTL_MSR 0x1d9
        struct v3_msr tmp_msr;
        v3_get_msr(DEBUGCTL_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
        vmx_ret |= check_vmcs_write(VMCS_GUEST_DBG_CTL, tmp_msr.value);
        core->dbg_regs.dr7 = 0x400;
    }
    */

#ifdef __V3_64BIT__
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffffffffffULL);
#else
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffUL);
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR_HIGH, (addr_t)0xffffffffUL);
#endif


    if (v3_update_vmcs_ctrl_fields(core)) {
        PrintError("Could not write control fields!\n");
        return -1;
    }

    if (v3_update_vmcs_host_state(core)) {
        PrintError("Could not write host state\n");
        return -1;
    }

    // reenable global interrupts for vm state initialization now
    // that the vm state is initialized. If another VM kicks us off,
    // it'll update our vmx state so that we know to reload ourself
    v3_enable_ints();

    return 0;
}

int v3_init_vmx_vmcs(struct guest_info * core, v3_vm_class_t vm_class) {
    struct vmx_data * vmx_state = NULL;
    int vmx_ret = 0;

    vmx_state = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));
    memset(vmx_state, 0, sizeof(struct vmx_data));

    PrintDebug("vmx_data pointer: %p\n", (void *)vmx_state);

    PrintDebug("Allocating VMCS\n");
    vmx_state->vmcs_ptr_phys = allocate_vmcs();

    PrintDebug("VMCS pointer: %p\n", (void *)(vmx_state->vmcs_ptr_phys));

    core->vmm_data = vmx_state;
    vmx_state->state = VMX_UNLAUNCHED;

    PrintDebug("Initializing VMCS (addr=%p)\n", core->vmm_data);

    // TODO: Fix vmcs fields so they're 32-bit

    PrintDebug("Clearing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMCLEAR failed\n");
        return -1;
    }

    if (vm_class == V3_PC_VM) {
        PrintDebug("Initializing VMCS\n");
        if (init_vmcs_bios(core, vmx_state) == -1) {
            PrintError("Error initializing VMCS to BIOS state\n");
            return -1;
        }
    } else {
        PrintError("Invalid VM Class\n");
        return -1;
    }

    PrintDebug("Serializing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    return 0;
}


int v3_deinit_vmx_vmcs(struct guest_info * core) {
    struct vmx_data * vmx_state = core->vmm_data;

    V3_FreePages((void *)(vmx_state->vmcs_ptr_phys), 1);
    V3_FreePages(V3_PAddr(vmx_state->msr_area), 1);

    V3_Free(vmx_state);

    return 0;
}



#ifdef V3_CONFIG_CHECKPOINT
/*
 * JRL: This is broken
 */
int v3_vmx_save_core(struct guest_info * core, void * ctx){
    uint64_t vmcs_ptr = vmcs_store();

    v3_chkpt_save(ctx, "vmcs_data", PAGE_SIZE, (void *)vmcs_ptr);

    return 0;
}

int v3_vmx_load_core(struct guest_info * core, void * ctx){
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    struct cr0_32 * shadow_cr0;
    char vmcs[PAGE_SIZE_4KB];

    v3_chkpt_load(ctx, "vmcs_data", PAGE_SIZE_4KB, vmcs);

    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmcs_load((addr_t)vmcs);

    v3_vmx_save_vmcs(core);

    shadow_cr0 = (struct cr0_32 *)&(core->ctrl_regs.cr0);


    /* Get the CPU mode to set the guest_ia32e entry ctrl */

    if (core->shdw_pg_mode == SHADOW_PAGING) {
        if (v3_get_vm_mem_mode(core) == VIRTUAL_MEM) {
            if (v3_activate_shadow_pt(core) == -1) {
                PrintError("Failed to activate shadow page tables\n");
                return -1;
            }
        } else {
            if (v3_activate_passthrough_pt(core) == -1) {
                PrintError("Failed to activate passthrough page tables\n");
                return -1;
            }
        }
    }

    return 0;
}
#endif


static int update_irq_exit_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));

    if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 0)) {
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Calling v3_injecting_intr\n");
#endif
        info->intr_core_state.irq_started = 0;
        v3_injecting_intr(info, info->intr_core_state.irq_vector, V3_EXTERNAL_IRQ);
    }

    return 0;
}

static int update_irq_entry_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;
    struct vmcs_interrupt_state intr_core_state;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
    check_vmcs_read(VMCS_GUEST_INT_STATE, &(intr_core_state));

    /* Check for pending exceptions to inject */
    if (v3_excp_pending(info)) {
        struct vmx_entry_int_info int_info;
        int_info.value = 0;

        // In VMX, almost every exception is hardware
        // Software exceptions are pretty much only for breakpoint or overflow
        int_info.type = 3;
        int_info.vector = v3_get_excp_number(info);

        if (info->excp_state.excp_error_code_valid) {
            check_vmcs_write(VMCS_ENTRY_EXCP_ERR, info->excp_state.excp_error_code);
            int_info.error_code = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("Injecting exception %d with error code %x\n",
                    int_info.vector, info->excp_state.excp_error_code);
#endif
        }

        int_info.valid = 1;
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Injecting exception %d (EIP=%p)\n", int_info.vector, (void *)(addr_t)info->rip);
#endif
        check_vmcs_write(VMCS_ENTRY_INT_INFO, int_info.value);

        v3_injecting_excp(info, int_info.vector);

    } else if ((((struct rflags *)&(info->ctrl_regs.rflags))->intr == 1) &&
               (intr_core_state.val == 0)) {

        if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 1)) {

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("IRQ pending from previous injection\n");
#endif

            // Copy the IDT vectoring info over to reinject the old interrupt
            if (idt_vec_info.error_code == 1) {
                uint32_t err_code = 0;

                check_vmcs_read(VMCS_IDT_VECTOR_ERR, &err_code);
                check_vmcs_write(VMCS_ENTRY_EXCP_ERR, err_code);
            }

            idt_vec_info.undef = 0;
            check_vmcs_write(VMCS_ENTRY_INT_INFO, idt_vec_info.value);

        } else {
            struct vmx_entry_int_info ent_int;
            ent_int.value = 0;

            switch (v3_intr_pending(info)) {
                case V3_EXTERNAL_IRQ: {
                    info->intr_core_state.irq_vector = v3_get_intr(info);
                    ent_int.vector = info->intr_core_state.irq_vector;
                    ent_int.type = 0;
                    ent_int.error_code = 0;
                    ent_int.valid = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
                    V3_Print("Injecting Interrupt %d at exit %u(EIP=%p)\n",
                               info->intr_core_state.irq_vector,
                               (uint32_t)info->num_exits,
                               (void *)(addr_t)info->rip);
#endif

                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);
                    info->intr_core_state.irq_started = 1;

                    break;
                }
                case V3_NMI:
                    PrintDebug("Injecting NMI\n");

                    ent_int.type = 2;
                    ent_int.vector = 2;
                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_SOFTWARE_INTR:
                    PrintDebug("Injecting software interrupt\n");
                    ent_int.type = 4;

                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_VIRTUAL_IRQ:
                    // Not sure what to do here, Intel doesn't have virtual IRQs
                    // May be the same as external interrupts/IRQs

                    break;
                case V3_INVALID_INTR:
                default:
                    break;
            }
        }
    } else if ((v3_intr_pending(info)) && (vmx_info->pri_proc_ctrls.int_wndw_exit == 0)) {
        // Enable INTR window exiting so we know when IF=1
        uint32_t instr_len;

        check_vmcs_read(VMCS_EXIT_INSTR_LEN, &instr_len);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Enabling Interrupt-Window exiting: %d\n", instr_len);
#endif

        vmx_info->pri_proc_ctrls.int_wndw_exit = 1;
        check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
    }


    return 0;
}
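
/* The interrupt-window technique used above: when a virtual IRQ is pending but
 * the guest is not interruptible (IF=0 or blocked by interrupt state), nothing
 * is injected; instead int_wndw_exit is set so the guest exits the moment it
 * becomes interruptible. v3_vmx_enter() clears the control when it sees
 * VMEXIT_INTR_WINDOW, and the injection happens on the following entry. */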



static struct vmx_exit_info exit_log[10];

static void print_exit_log(struct guest_info * info) {
    int cnt = info->num_exits % 10;
    int i = 0;


    V3_Print("\nExit Log (%d total exits):\n", (uint32_t)info->num_exits);

    for (i = 0; i < 10; i++) {
        struct vmx_exit_info * tmp = &exit_log[cnt];

        V3_Print("%d:\texit_reason = %p\n", i, (void *)(addr_t)tmp->exit_reason);
        V3_Print("\texit_qual = %p\n", (void *)tmp->exit_qual);
        V3_Print("\tint_info = %p\n", (void *)(addr_t)tmp->int_info);
        V3_Print("\tint_err = %p\n", (void *)(addr_t)tmp->int_err);
        V3_Print("\tinstr_info = %p\n", (void *)(addr_t)tmp->instr_info);

        cnt--;

        if (cnt == -1) {
            cnt = 9;
        }

    }

}

/*
 * CAUTION and DANGER!!!
 *
 * The VMCS CANNOT(!!) be accessed outside of the cli/sti calls inside this function
 * When executing a symbiotic call, the VMCS WILL be overwritten, so any dependencies
 * on its contents will cause things to break. The contents at the time of the exit WILL
 * change before the exit handler is executed.
 */
int v3_vmx_enter(struct guest_info * info) {
    int ret = 0;
    uint32_t tsc_offset_low, tsc_offset_high;
    struct vmx_exit_info exit_info;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    // Perform any additional yielding needed for time adjustment
    v3_adjust_time(info);

    // disable global interrupts for vm state transition
    v3_disable_ints();

    // Update timer devices late after being in the VM so that as much
    // of the time in the VM is accounted for as possible. Also do it before
    // updating IRQ entry state so that any interrupts the timers raise get
    // handled on the next VM entry. Must be done with interrupts disabled.
    v3_update_timers(info);

    if (vmcs_store() != vmx_info->vmcs_ptr_phys) {
        vmcs_clear(vmx_info->vmcs_ptr_phys);
        vmcs_load(vmx_info->vmcs_ptr_phys);
        vmx_info->state = VMX_UNLAUNCHED;
    }

    v3_vmx_restore_vmcs(info);


#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_entry_state(info);
    }
#else
    update_irq_entry_state(info);
#endif

    {
        addr_t guest_cr3;
        vmcs_read(VMCS_GUEST_CR3, &guest_cr3);
        vmcs_write(VMCS_GUEST_CR3, guest_cr3);
    }

    // Perform last-minute time bookkeeping prior to entering the VM
    v3_time_enter_vm(info);

    tsc_offset_high = (uint32_t)((v3_tsc_host_offset(&info->time_state) >> 32) & 0xffffffff);
    tsc_offset_low = (uint32_t)(v3_tsc_host_offset(&info->time_state) & 0xffffffff);
    check_vmcs_write(VMCS_TSC_OFFSET_HIGH, tsc_offset_high);
    check_vmcs_write(VMCS_TSC_OFFSET, tsc_offset_low);


    if (v3_update_vmcs_host_state(info)) {
        v3_enable_ints();
        PrintError("Could not write host state\n");
        return -1;
    }


    if (vmx_info->state == VMX_UNLAUNCHED) {
        vmx_info->state = VMX_LAUNCHED;

        info->vm_info->run_state = VM_RUNNING;
        ret = v3_vmx_launch(&(info->vm_regs), info, &(info->ctrl_regs));
    } else {
        V3_ASSERT(vmx_info->state != VMX_UNLAUNCHED);
        ret = v3_vmx_resume(&(info->vm_regs), info, &(info->ctrl_regs));
    }

    //  PrintDebug("VMX Exit: ret=%d\n", ret);

    if (ret != VMX_SUCCESS) {
        uint32_t error = 0;

        vmcs_read(VMCS_INSTR_ERR, &error);

        v3_enable_ints();

        PrintError("VMENTRY Error: %d\n", error);
        return -1;
    }

    // Immediate exit from VM time bookkeeping
    v3_time_exit_vm(info);

    info->num_exits++;

    /* Update guest state */
    v3_vmx_save_vmcs(info);

    // info->cpl = info->segments.cs.selector & 0x3;

    info->mem_mode = v3_get_vm_mem_mode(info);
    info->cpu_mode = v3_get_vm_cpu_mode(info);


    check_vmcs_read(VMCS_EXIT_INSTR_LEN, &(exit_info.instr_len));
    check_vmcs_read(VMCS_EXIT_INSTR_INFO, &(exit_info.instr_info));
    check_vmcs_read(VMCS_EXIT_REASON, &(exit_info.exit_reason));
    check_vmcs_read(VMCS_EXIT_QUAL, &(exit_info.exit_qual));
    check_vmcs_read(VMCS_EXIT_INT_INFO, &(exit_info.int_info));
    check_vmcs_read(VMCS_EXIT_INT_ERR, &(exit_info.int_err));
    check_vmcs_read(VMCS_GUEST_LINEAR_ADDR, &(exit_info.guest_linear_addr));

    if (info->shdw_pg_mode == NESTED_PAGING) {
        check_vmcs_read(VMCS_GUEST_PHYS_ADDR, &(exit_info.ept_fault_addr));
    }

    //PrintDebug("VMX Exit taken, id-qual: %u-%lu\n", exit_info.exit_reason, exit_info.exit_qual);

    exit_log[info->num_exits % 10] = exit_info;

#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_exit_state(info);
    }
#else
    update_irq_exit_state(info);
#endif

    if (exit_info.exit_reason == VMEXIT_INTR_WINDOW) {
        // This is a special case whose only job is to inject an interrupt
        vmcs_read(VMCS_PROC_CTRLS, &(vmx_info->pri_proc_ctrls.value));
        vmx_info->pri_proc_ctrls.int_wndw_exit = 0;
        vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Interrupts available again! (RIP=%llx)\n", info->rip);
#endif
    }

    // reenable global interrupts after vm exit
    v3_enable_ints();

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    if (v3_handle_vmx_exit(info, &exit_info) == -1) {
        PrintError("Error in VMX exit handler (Exit reason=%x)\n", exit_info.exit_reason);
        return -1;
    }

    return 0;
}
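
/* Note on launch vs. resume above: VMX requires VMLAUNCH for the first entry
 * on a given VMCS and VMRESUME for subsequent entries. vmx_info->state tracks
 * which applies, and is reset to VMX_UNLAUNCHED whenever the VMCS is cleared
 * and reloaded (e.g., after the core migrates to a different physical CPU). */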


int v3_start_vmx_guest(struct guest_info * info) {

    PrintDebug("Starting VMX core %u\n", info->vcpu_id);

    if (info->vcpu_id == 0) {
        info->core_run_state = CORE_RUNNING;
        info->vm_info->run_state = VM_RUNNING;
    } else {

        PrintDebug("VMX core %u: Waiting for core initialization\n", info->vcpu_id);

        while (info->core_run_state == CORE_STOPPED) {
            v3_yield(info);
            //PrintDebug("VMX core %u: still waiting for INIT\n", info->vcpu_id);
        }

        PrintDebug("VMX core %u initialized\n", info->vcpu_id);

        // We'll be paranoid about race conditions here
        v3_wait_at_barrier(info);
    }


    PrintDebug("VMX core %u: I am starting at CS=0x%x (base=0x%p, limit=0x%x), RIP=0x%p\n",
               info->vcpu_id, info->segments.cs.selector, (void *)(info->segments.cs.base),
               info->segments.cs.limit, (void *)(info->rip));


    PrintDebug("VMX core %u: Launching VMX VM on logical core %u\n", info->vcpu_id, info->pcpu_id);

    v3_start_time(info);

    while (1) {

        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }

        if (v3_vmx_enter(info) == -1) {

            addr_t host_addr;
            addr_t linear_addr = 0;

            info->vm_info->run_state = VM_ERROR;

            V3_Print("VMX core %u: VMX ERROR!!\n", info->vcpu_id);

            v3_print_guest_state(info);

            V3_Print("VMX core %u\n", info->vcpu_id);

            linear_addr = get_addr_linear(info, info->rip, &(info->segments.cs));

            if (info->mem_mode == PHYSICAL_MEM) {
                v3_gpa_to_hva(info, linear_addr, &host_addr);
            } else if (info->mem_mode == VIRTUAL_MEM) {
                v3_gva_to_hva(info, linear_addr, &host_addr);
            }

            V3_Print("VMX core %u: Host Address of rip = 0x%p\n", info->vcpu_id, (void *)host_addr);

            V3_Print("VMX core %u: Instr (15 bytes) at %p:\n", info->vcpu_id, (void *)host_addr);
            v3_dump_mem((uint8_t *)host_addr, 15);

            v3_print_stack(info);


            v3_print_vmcs();
            print_exit_log(info);
            return -1;
        }

        v3_wait_at_barrier(info);


        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }
/*
        if ((info->num_exits % 5000) == 0) {
            V3_Print("VMX Exit number %d\n", (uint32_t)info->num_exits);
        }
*/

    }

    return 0;
}




#define VMX_FEATURE_CONTROL_MSR     0x0000003a
#define CPUID_VMX_FEATURES 0x00000005  /* LOCK and VMXON */
#define CPUID_1_ECX_VTXFLAG 0x00000020

int v3_is_vmx_capable() {
    v3_msr_t feature_msr;
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

    v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);

    PrintDebug("ECX: 0x%x\n", ecx);

    if (ecx & CPUID_1_ECX_VTXFLAG) {
        v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));

        PrintDebug("MSRREGlow: 0x%.8x\n", feature_msr.lo);

        if ((feature_msr.lo & CPUID_VMX_FEATURES) != CPUID_VMX_FEATURES) {
            PrintDebug("VMX is locked -- enable in the BIOS\n");
            return 0;
        }

    } else {
        PrintDebug("VMX not supported on this cpu\n");
        return 0;
    }

    return 1;
}


int v3_reset_vmx_vm_core(struct guest_info * core, addr_t rip) {
    // init vmcs bios

    if ((core->shdw_pg_mode == NESTED_PAGING) &&
        (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        // easy
        core->rip = 0;
        core->segments.cs.selector = rip << 8;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = rip << 12;
    } else {
        core->vm_regs.rdx = core->vcpu_id;
        core->vm_regs.rbx = rip;
    }

    return 0;
}
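
/* In the EPT + unrestricted-guest case above, the SIPI start vector (passed in
 * as rip) is encoded the standard x86 way: execution begins at
 * CS.selector = vector << 8, CS.base = vector << 12, RIP = 0, i.e. physical
 * address vector * 0x1000. */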



void v3_init_vmx_cpu(int cpu_id) {
    addr_t vmx_on_region = 0;

    if (cpu_id == 0) {
        if (v3_init_vmx_hw(&hw_info) == -1) {
            PrintError("Could not initialize VMX hardware features on cpu %d\n", cpu_id);
            return;
        }
    }

    enable_vmx();


    // Setup VMXON Region
    vmx_on_region = allocate_vmcs();


    if (vmx_on(vmx_on_region) == VMX_SUCCESS) {
        V3_Print("VMX Enabled\n");
        host_vmcs_ptrs[cpu_id] = vmx_on_region;
    } else {
        V3_Print("VMX already enabled\n");
        V3_FreePages((void *)vmx_on_region, 1);
    }

    PrintDebug("VMXON pointer: 0x%p\n", (void *)host_vmcs_ptrs[cpu_id]);

    {
        struct vmx_sec_proc_ctrls sec_proc_ctrls;
        sec_proc_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.sec_proc_ctrls));

        if (sec_proc_ctrls.enable_ept == 0) {
            V3_Print("VMX EPT (Nested) Paging not supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_CPU;
        } else if (sec_proc_ctrls.unrstrct_guest == 0) {
            V3_Print("VMX EPT (Nested) Paging supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_CPU;
        } else {
            V3_Print("VMX EPT (Nested) Paging + Unrestricted guest supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_UG_CPU;
        }
    }
}


void v3_deinit_vmx_cpu(int cpu_id) {
    extern v3_cpu_arch_t v3_cpu_types[];
    v3_cpu_types[cpu_id] = V3_INVALID_CPU;

    if (host_vmcs_ptrs[cpu_id] != 0) {
        V3_Print("Disabling VMX\n");

        if (vmx_off() != VMX_SUCCESS) {
            PrintError("Error executing VMXOFF\n");
        }

        V3_FreePages((void *)host_vmcs_ptrs[cpu_id], 1);

        host_vmcs_ptrs[cpu_id] = 0;
    }
}