Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git

This gives you the master branch. You probably want the devel branch or one of the release branches instead. To switch to the devel branch, execute

  cd palacios
  git checkout --track -b devel origin/devel

The other branches work the same way.
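To see which branches are available, run

  git branch -r

and substitute the branch name for devel in the checkout command above.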


exported functionality for vmcs flushing via vmcs_clear
[palacios.git] / palacios / src / palacios / vmx.c
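
The flush functionality exported here, v3_flush_vmx_vm_core(), VMCLEARs a core's VMCS and marks it unlaunched; the next v3_vmx_enter() then reloads the VMCS with VMPTRLD and uses VMLAUNCH rather than VMRESUME. A minimal sketch of how host code might use it when moving a core between physical CPUs (the helper name is hypothetical, not part of this file):

  /* Hypothetical host-side helper: evict the core's VMCS from the
   * current CPU before rescheduling it elsewhere. */
  static void migrate_vmx_core(struct guest_info * core) {
      v3_flush_vmx_vm_core(core);   /* VMCLEAR + state = VMX_UNLAUNCHED */
      /* ... reschedule core on another physical CPU; v3_vmx_enter()
       * will VMPTRLD and VMLAUNCH on the next entry ... */
  }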
/* 
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National 
 * Science Foundation and the Department of Energy.  
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at 
 * http://www.v3vee.org
 *
 * Copyright (c) 2011, Jack Lange <jarusl@cs.northwestern.edu>
 * Copyright (c) 2011, The V3VEE Project <http://www.v3vee.org> 
 * All rights reserved.
 *
 * Author: Jack Lange <jarusl@cs.northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */


#include <palacios/vmx.h>
#include <palacios/vmm.h>
#include <palacios/vmx_handler.h>
#include <palacios/vmcs.h>
#include <palacios/vmx_lowlevel.h>
#include <palacios/vmm_lowlevel.h>
#include <palacios/vmm_ctrl_regs.h>
#include <palacios/vmm_config.h>
#include <palacios/vmm_time.h>
#include <palacios/vm_guest_mem.h>
#include <palacios/vmm_direct_paging.h>
#include <palacios/vmx_io.h>
#include <palacios/vmx_msr.h>
#include <palacios/vmm_decoder.h>
#include <palacios/vmm_barrier.h>

#ifdef V3_CONFIG_CHECKPOINT
#include <palacios/vmm_checkpoint.h>
#endif

#include <palacios/vmx_ept.h>
#include <palacios/vmx_assist.h>
#include <palacios/vmx_hw_info.h>

#ifndef V3_CONFIG_DEBUG_VMX
#undef PrintDebug
#define PrintDebug(fmt, args...)
#endif


/* These fields contain the hardware feature sets supported by the local CPU */
static struct vmx_hw_info hw_info;

extern v3_cpu_arch_t v3_cpu_types[];

static addr_t host_vmcs_ptrs[V3_CONFIG_MAX_CPUS] = { [0 ... V3_CONFIG_MAX_CPUS - 1] = 0};

extern int v3_vmx_launch(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
extern int v3_vmx_resume(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);

static inline int check_vmcs_write(vmcs_field_t field, addr_t val) {
    int ret = 0;

    ret = vmcs_write(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMWRITE error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
        return 1;
    }

    return 0;
}

static inline int check_vmcs_read(vmcs_field_t field, void * val) {
    int ret = 0;

    ret = vmcs_read(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMREAD error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
    }

    return ret;
}
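
/* Note the asymmetry: check_vmcs_write() returns 1 on failure so
 * results can be accumulated with |= , while check_vmcs_read() returns
 * the raw vmcs_read() result. */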
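/* Allocates one zeroed 4 KB region usable as a VMCS (or, in
 * v3_init_vmx_cpu below, as the VMXON region). Hardware requires the
 * VMX revision id from the IA32_VMX_BASIC MSR to be written into the
 * start of the region before it is handed to VMPTRLD/VMXON. */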
static addr_t allocate_vmcs() {
    struct vmcs_data * vmcs_page = NULL;

    PrintDebug("Allocating page\n");

    vmcs_page = (struct vmcs_data *)V3_VAddr(V3_AllocPages(1));
    memset(vmcs_page, 0, 4096);

    vmcs_page->revision = hw_info.basic_info.revision;
    PrintDebug("VMX Revision: 0x%x\n", vmcs_page->revision);

    return (addr_t)V3_PAddr((void *)vmcs_page);
}




static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state) {
    int vmx_ret = 0;

    // disable global interrupts for vm state initialization
    v3_disable_ints();

    PrintDebug("Loading VMCS\n");
    vmx_ret = vmcs_load(vmx_state->vmcs_ptr_phys);
    vmx_state->state = VMX_UNLAUNCHED;

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMPTRLD failed\n");
        return -1;
    }


    /*** Setup default state from HW ***/

    vmx_state->pin_ctrls.value = hw_info.pin_ctrls.def_val;
    vmx_state->pri_proc_ctrls.value = hw_info.proc_ctrls.def_val;
    vmx_state->exit_ctrls.value = hw_info.exit_ctrls.def_val;
    vmx_state->entry_ctrls.value = hw_info.entry_ctrls.def_val;
    vmx_state->sec_proc_ctrls.value = hw_info.sec_proc_ctrls.def_val;

    /* Print Control MSRs */
    PrintDebug("CR0 MSR: %p\n", (void *)(addr_t)hw_info.cr0.value);
    PrintDebug("CR4 MSR: %p\n", (void *)(addr_t)hw_info.cr4.value);



    /******* Setup Host State **********/

    /* Cache GDTR, IDTR, and TR in host struct */
    addr_t gdtr_base;
    struct {
        uint16_t selector;
        addr_t   base;
    } __attribute__((packed)) tmp_seg;


    __asm__ __volatile__(
                         "sgdt (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    gdtr_base = tmp_seg.base;
    vmx_state->host_state.gdtr.base = gdtr_base;

    __asm__ __volatile__(
                         "sidt (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    vmx_state->host_state.idtr.base = tmp_seg.base;

    __asm__ __volatile__(
                         "str (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    vmx_state->host_state.tr.selector = tmp_seg.selector;

    /* The GDTR *index* is bits 3-15 of the selector. */
    struct tss_descriptor * desc = NULL;
    desc = (struct tss_descriptor *)(gdtr_base + (8 * (tmp_seg.selector >> 3)));

    tmp_seg.base = ((desc->base1) |
                    (desc->base2 << 16) |
                    (desc->base3 << 24) |
#ifdef __V3_64BIT__
                    ((uint64_t)desc->base4 << 32)
#else 
                    (0)
#endif
                    );

    vmx_state->host_state.tr.base = tmp_seg.base;


    /********** Setup VMX Control Fields ***********/

    /* Add external interrupts, NMI exiting, and virtual NMI */
    vmx_state->pin_ctrls.nmi_exit = 1;
    vmx_state->pin_ctrls.ext_int_exit = 1;


    vmx_state->pri_proc_ctrls.hlt_exit = 1;


    vmx_state->pri_proc_ctrls.pause_exit = 0;
    vmx_state->pri_proc_ctrls.tsc_offset = 1;
#ifdef V3_CONFIG_TIME_VIRTUALIZE_TSC
    vmx_state->pri_proc_ctrls.rdtsc_exit = 1;
#endif

    /* Setup IO map */
    vmx_state->pri_proc_ctrls.use_io_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_A_ADDR, (addr_t)V3_PAddr(core->vm_info->io_map.arch_data));
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_B_ADDR, 
            (addr_t)V3_PAddr(core->vm_info->io_map.arch_data) + PAGE_SIZE_4KB);
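    /* Bitmap A covers I/O ports 0x0000-0x7fff and bitmap B covers
     * 0x8000-0xffff; each is one 4 KB page, so B sits one page past A. */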

    vmx_state->pri_proc_ctrls.use_msr_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_MSR_BITMAP, (addr_t)V3_PAddr(core->vm_info->msr_map.arch_data));



#ifdef __V3_64BIT__
    // Ensure host runs in 64-bit mode at each VM EXIT
    vmx_state->exit_ctrls.host_64_on = 1;
#endif

    // Hook all accesses to EFER register
    v3_hook_msr(core->vm_info, EFER_MSR, 
                &v3_handle_efer_read,
                &v3_handle_efer_write, 
                core);

    // Restore host's EFER register on each VM EXIT
    vmx_state->exit_ctrls.ld_efer = 1;

    // Save/restore guest's EFER register to/from VMCS on VM EXIT/ENTRY
    vmx_state->exit_ctrls.save_efer = 1;
    vmx_state->entry_ctrls.ld_efer  = 1;

    // Cause VM_EXIT whenever CR4.VMXE or CR4.PAE bits are written
    vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);
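    /* Bits set in the CR4 guest/host mask are owned by the host: guest
     * reads of those bits return the CR4 read shadow, and guest writes
     * that change them trigger a CR-access VM exit. */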

    /* Setup paging */
    if (core->shdw_pg_mode == SHADOW_PAGING) {
        PrintDebug("Creating initial shadow page table\n");

        if (v3_init_passthrough_pts(core) == -1) {
            PrintError("Could not initialize passthrough page tables\n");
            return -1;
        }

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        core->ctrl_regs.cr3 = core->direct_map_pt;

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Add CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 1;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 1;

        vmx_state->pri_proc_ctrls.invlpg_exit = 1;

        /* Add page fault exits */
        vmx_state->excp_bmap.pf = 1;

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_CPU)) {

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Disable CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;

        /* Add page fault exits */
        //      vmx_state->excp_bmap.pf = 1; // This should never happen..., enabled to catch bugs

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging



        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        int i = 0;
        // For now we will assume that unrestricted guest mode is assured w/ EPT


        core->vm_regs.rsp = 0x00;
        core->rip = 0xfff0;
        core->vm_regs.rdx = 0x00000f00;
        core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
        core->ctrl_regs.cr0 = 0x60010010; // Set the WP flag so the memory hooks work in real-mode


        core->segments.cs.selector = 0xf000;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = 0x0000000f0000LL;

        // (raw attributes = 0xf3)
        core->segments.cs.type = 0xb;
        core->segments.cs.system = 0x1;
        core->segments.cs.dpl = 0x0;
        core->segments.cs.present = 1;
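        /* This reproduces the classic x86 startup state: CS:IP =
         * f000:fff0 with CS.base 0xf0000, so the guest begins at the
         * BIOS entry point at linear address 0xffff0. */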


        struct v3_segment * segregs [] = {&(core->segments.ss), &(core->segments.ds), 
                                          &(core->segments.es), &(core->segments.fs), 
                                          &(core->segments.gs), NULL};

        for ( i = 0; segregs[i] != NULL; i++) {
            struct v3_segment * seg = segregs[i];

            seg->selector = 0x0000;
            //    seg->base = seg->selector << 4;
            seg->base = 0x00000000;
            seg->limit = 0xffff;


            seg->type = 0x3;
            seg->system = 0x1;
            seg->dpl = 0x0;
            seg->present = 1;
            //    seg->granularity = 1;

        }


        core->segments.gdtr.limit = 0x0000ffff;
        core->segments.gdtr.base = 0x0000000000000000LL;

        core->segments.idtr.limit = 0x0000ffff;
        core->segments.idtr.base = 0x0000000000000000LL;

        core->segments.ldtr.selector = 0x0000;
        core->segments.ldtr.limit = 0x0000ffff;
        core->segments.ldtr.base = 0x0000000000000000LL;
        core->segments.ldtr.type = 2;
        core->segments.ldtr.present = 1;

        core->segments.tr.selector = 0x0000;
        core->segments.tr.limit = 0x0000ffff;
        core->segments.tr.base = 0x0000000000000000LL;
        core->segments.tr.type = 0xb;
        core->segments.tr.present = 1;

        //      core->dbg_regs.dr6 = 0x00000000ffff0ff0LL;
        core->dbg_regs.dr7 = 0x0000000000000400LL;

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
        vmx_state->sec_proc_ctrls.unrstrct_guest = 1; // enable unrestricted guest operation


        /* Disable shadow paging stuff */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;


        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else {
        PrintError("Invalid Virtual paging mode\n");
        return -1;
    }


    // hook vmx msrs

    // Setup SYSCALL/SYSENTER MSRs in load/store area

    // save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
    {
#define IA32_STAR 0xc0000081
#define IA32_LSTAR 0xc0000082
#define IA32_FMASK 0xc0000084
#define IA32_KERN_GS_BASE 0xc0000102

#define IA32_CSTAR 0xc0000083 // Compatibility mode STAR (ignored for now... hopefully it's not that important...)

        int msr_ret = 0;

        struct vmcs_msr_entry * exit_store_msrs = NULL;
        struct vmcs_msr_entry * exit_load_msrs = NULL;
        struct vmcs_msr_entry * entry_load_msrs = NULL;
        int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;

        V3_Print("Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);

        if (max_msrs < 4) {
            PrintError("Max MSR cache size is too small (%d)\n", max_msrs);
            return -1;
        }

        vmx_state->msr_area = V3_VAddr(V3_AllocPages(1));

        if (vmx_state->msr_area == NULL) {
            PrintError("could not allocate msr load/store area\n");
            return -1;
        }

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);


        exit_store_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area);
        exit_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 4));
        entry_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 8));
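        /* Layout of the single msr_area page: entries 0-3 are the
         * exit-store list, 4-7 the exit-load list, and 8-11 the
         * entry-load list; the counts written above are 4 each. */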

        exit_store_msrs[0].index = IA32_STAR;
        exit_store_msrs[1].index = IA32_LSTAR;
        exit_store_msrs[2].index = IA32_FMASK;
        exit_store_msrs[3].index = IA32_KERN_GS_BASE;

        // Replicate the MSR index list into the exit-load and entry-load areas
        memcpy(exit_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);
        memcpy(entry_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);


        v3_get_msr(IA32_STAR, &(exit_load_msrs[0].hi), &(exit_load_msrs[0].lo));
        v3_get_msr(IA32_LSTAR, &(exit_load_msrs[1].hi), &(exit_load_msrs[1].lo));
        v3_get_msr(IA32_FMASK, &(exit_load_msrs[2].hi), &(exit_load_msrs[2].lo));
        v3_get_msr(IA32_KERN_GS_BASE, &(exit_load_msrs[3].hi), &(exit_load_msrs[3].lo));

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(exit_store_msrs));
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(exit_load_msrs));
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(entry_load_msrs));

    }

    /* Sanity check ctrl/reg fields against hw_defaults */




    /*** Write all the info to the VMCS ***/

    /*
    {
        // IS THIS NECESSARY???
#define DEBUGCTL_MSR 0x1d9
        struct v3_msr tmp_msr;
        v3_get_msr(DEBUGCTL_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
        vmx_ret |= check_vmcs_write(VMCS_GUEST_DBG_CTL, tmp_msr.value);
        core->dbg_regs.dr7 = 0x400;
    }
    */

#ifdef __V3_64BIT__
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffffffffffULL);
#else
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffUL);
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR_HIGH, (addr_t)0xffffffffUL);
#endif
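    /* The VMCS link pointer must be all 1s when VMCS shadowing is not
     * in use; any other value fails the VM-entry checks. */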


    if (v3_update_vmcs_ctrl_fields(core)) {
        PrintError("Could not write control fields!\n");
        return -1;
    }

    if (v3_update_vmcs_host_state(core)) {
        PrintError("Could not write host state\n");
        return -1;
    }

    // Re-enable global interrupts now that the vm state is initialized.
    // If another VM kicks us off this CPU, it will update our vmx state
    // so that we know to reload ourselves.
    v3_enable_ints();

    return 0;
}

int v3_init_vmx_vmcs(struct guest_info * core, v3_vm_class_t vm_class) {
    struct vmx_data * vmx_state = NULL;
    int vmx_ret = 0;

    vmx_state = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));

    if (vmx_state == NULL) {
        PrintError("Could not allocate vmx_data\n");
        return -1;
    }

    memset(vmx_state, 0, sizeof(struct vmx_data));

    PrintDebug("vmx_data pointer: %p\n", (void *)vmx_state);

    PrintDebug("Allocating VMCS\n");
    vmx_state->vmcs_ptr_phys = allocate_vmcs();

    PrintDebug("VMCS pointer: %p\n", (void *)(vmx_state->vmcs_ptr_phys));

    core->vmm_data = vmx_state;
    vmx_state->state = VMX_UNLAUNCHED;

    PrintDebug("Initializing VMCS (addr=%p)\n", core->vmm_data);

    // TODO: Fix vmcs fields so they're 32-bit

    PrintDebug("Clearing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMCLEAR failed\n");
        return -1; 
    }

    if (vm_class == V3_PC_VM) {
        PrintDebug("Initializing VMCS\n");
        if (init_vmcs_bios(core, vmx_state) == -1) {
            PrintError("Error initializing VMCS to BIOS state\n");
            return -1;
        }
    } else {
        PrintError("Invalid VM Class\n");
        return -1;
    }

    PrintDebug("Serializing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    return 0;
}


int v3_deinit_vmx_vmcs(struct guest_info * core) {
    struct vmx_data * vmx_state = core->vmm_data;

    V3_FreePages((void *)(vmx_state->vmcs_ptr_phys), 1);
    V3_FreePages(V3_PAddr(vmx_state->msr_area), 1);

    V3_Free(vmx_state);

    return 0;
}


#ifdef V3_CONFIG_CHECKPOINT
/* 
 * JRL: This is broken
 */
int v3_vmx_save_core(struct guest_info * core, void * ctx){
    uint64_t vmcs_ptr = vmcs_store();

    v3_chkpt_save(ctx, "vmcs_data", PAGE_SIZE, (void *)vmcs_ptr);

    return 0;
}

int v3_vmx_load_core(struct guest_info * core, void * ctx){
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    struct cr0_32 * shadow_cr0;
    char vmcs[PAGE_SIZE_4KB];

    v3_chkpt_load(ctx, "vmcs_data", PAGE_SIZE_4KB, vmcs);

    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmcs_load((addr_t)vmcs);

    v3_vmx_save_vmcs(core);

    shadow_cr0 = (struct cr0_32 *)&(core->ctrl_regs.cr0);


    /* Get the CPU mode to set the guest_ia32e entry ctrl */

    if (core->shdw_pg_mode == SHADOW_PAGING) {
        if (v3_get_vm_mem_mode(core) == VIRTUAL_MEM) {
            if (v3_activate_shadow_pt(core) == -1) {
                PrintError("Failed to activate shadow page tables\n");
                return -1;
            }
        } else {
            if (v3_activate_passthrough_pt(core) == -1) {
                PrintError("Failed to activate passthrough page tables\n");
                return -1;
            }
        }
    }

    return 0;
}
#endif


void v3_flush_vmx_vm_core(struct guest_info * core) {
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmx_info->state = VMX_UNLAUNCHED;
}


static int update_irq_exit_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));

    if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 0)) {
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Calling v3_injecting_intr\n");
#endif
        info->intr_core_state.irq_started = 0;
        v3_injecting_intr(info, info->intr_core_state.irq_vector, V3_EXTERNAL_IRQ);
    }

    return 0;
}

static int update_irq_entry_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;
    struct vmcs_interrupt_state intr_core_state;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
    check_vmcs_read(VMCS_GUEST_INT_STATE, &(intr_core_state));
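    /* Interruption 'type' encodings used below (per the Intel SDM):
     * 0 = external interrupt, 2 = NMI, 3 = hardware exception,
     * 4 = software interrupt. */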
    /* Check for pending exceptions to inject */
    if (v3_excp_pending(info)) {
        struct vmx_entry_int_info int_info;
        int_info.value = 0;

        // In VMX, almost every exception is hardware
        // Software exceptions are pretty much only for breakpoint or overflow
        int_info.type = 3;
        int_info.vector = v3_get_excp_number(info);

        if (info->excp_state.excp_error_code_valid) {
            check_vmcs_write(VMCS_ENTRY_EXCP_ERR, info->excp_state.excp_error_code);
            int_info.error_code = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("Injecting exception %d with error code %x\n", 
                    int_info.vector, info->excp_state.excp_error_code);
#endif
        }

        int_info.valid = 1;
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Injecting exception %d (EIP=%p)\n", int_info.vector, (void *)(addr_t)info->rip);
#endif
        check_vmcs_write(VMCS_ENTRY_INT_INFO, int_info.value);

        v3_injecting_excp(info, int_info.vector);

    } else if ((((struct rflags *)&(info->ctrl_regs.rflags))->intr == 1) && 
               (intr_core_state.val == 0)) {

        if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 1)) {

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("IRQ pending from previous injection\n");
#endif

            // Copy the IDT vectoring info over to reinject the old interrupt
            if (idt_vec_info.error_code == 1) {
                uint32_t err_code = 0;

                check_vmcs_read(VMCS_IDT_VECTOR_ERR, &err_code);
                check_vmcs_write(VMCS_ENTRY_EXCP_ERR, err_code);
            }

            idt_vec_info.undef = 0;
            check_vmcs_write(VMCS_ENTRY_INT_INFO, idt_vec_info.value);

        } else {
            struct vmx_entry_int_info ent_int;
            ent_int.value = 0;

            switch (v3_intr_pending(info)) {
                case V3_EXTERNAL_IRQ: {
                    info->intr_core_state.irq_vector = v3_get_intr(info); 
                    ent_int.vector = info->intr_core_state.irq_vector;
                    ent_int.type = 0;
                    ent_int.error_code = 0;
                    ent_int.valid = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
                    V3_Print("Injecting Interrupt %d at exit %u(EIP=%p)\n", 
                               info->intr_core_state.irq_vector, 
                               (uint32_t)info->num_exits, 
                               (void *)(addr_t)info->rip);
#endif

                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);
                    info->intr_core_state.irq_started = 1;

                    break;
                }
                case V3_NMI:
                    PrintDebug("Injecting NMI\n");

                    ent_int.type = 2;
                    ent_int.vector = 2;
                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_SOFTWARE_INTR:
                    PrintDebug("Injecting software interrupt\n");
                    ent_int.type = 4;

                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_VIRTUAL_IRQ:
                    // Not sure what to do here, Intel doesn't have virtual IRQs
                    // May be the same as external interrupts/IRQs

                    break;
                case V3_INVALID_INTR:
                default:
                    break;
            }
        }
    } else if ((v3_intr_pending(info)) && (vmx_info->pri_proc_ctrls.int_wndw_exit == 0)) {
        // Enable INTR window exiting so we know when IF=1
        uint32_t instr_len;

        check_vmcs_read(VMCS_EXIT_INSTR_LEN, &instr_len);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Enabling Interrupt-Window exiting: %d\n", instr_len);
#endif

        vmx_info->pri_proc_ctrls.int_wndw_exit = 1;
        check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
    }


    return 0;
}


static struct vmx_exit_info exit_log[10];

static void print_exit_log(struct guest_info * info) {
    int cnt = info->num_exits % 10;
    int i = 0;


    V3_Print("\nExit Log (%d total exits):\n", (uint32_t)info->num_exits);

    for (i = 0; i < 10; i++) {
        struct vmx_exit_info * tmp = &exit_log[cnt];

        V3_Print("%d:\texit_reason = %p\n", i, (void *)(addr_t)tmp->exit_reason);
        V3_Print("\texit_qual = %p\n", (void *)tmp->exit_qual);
        V3_Print("\tint_info = %p\n", (void *)(addr_t)tmp->int_info);
        V3_Print("\tint_err = %p\n", (void *)(addr_t)tmp->int_err);
        V3_Print("\tinstr_info = %p\n", (void *)(addr_t)tmp->instr_info);

        cnt--;

        if (cnt == -1) {
            cnt = 9;
        }

    }

}

/* 
 * CAUTION and DANGER!!! 
 * 
 * The VMCS CANNOT(!!) be accessed outside of the cli/sti calls inside this function
 * When executing a symbiotic call, the VMCS WILL be overwritten, so any dependencies 
 * on its contents will cause things to break. The contents at the time of the exit WILL 
 * change before the exit handler is executed.
 */
int v3_vmx_enter(struct guest_info * info) {
    int ret = 0;
    uint32_t tsc_offset_low, tsc_offset_high;
    struct vmx_exit_info exit_info;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    // Perform any additional yielding needed for time adjustment
    v3_adjust_time(info);

    // disable global interrupts for vm state transition
    v3_disable_ints();

    // Update timer devices late after being in the VM so that as much 
    // of the time in the VM is accounted for as possible. Also do it before
    // updating IRQ entry state so that any interrupts the timers raise get 
    // handled on the next VM entry. Must be done with interrupts disabled.
    v3_update_timers(info);

    if (vmcs_store() != vmx_info->vmcs_ptr_phys) {
        vmcs_clear(vmx_info->vmcs_ptr_phys);
        vmcs_load(vmx_info->vmcs_ptr_phys);
        vmx_info->state = VMX_UNLAUNCHED;
    }

    v3_vmx_restore_vmcs(info);


#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_entry_state(info);
    }
#else 
    update_irq_entry_state(info);
#endif

    {
        addr_t guest_cr3;
        vmcs_read(VMCS_GUEST_CR3, &guest_cr3);
        vmcs_write(VMCS_GUEST_CR3, guest_cr3);
    }

    // Perform last-minute time bookkeeping prior to entering the VM
    v3_time_enter_vm(info);

    tsc_offset_high = (uint32_t)((v3_tsc_host_offset(&info->time_state) >> 32) & 0xffffffff);
    tsc_offset_low = (uint32_t)(v3_tsc_host_offset(&info->time_state) & 0xffffffff);
    check_vmcs_write(VMCS_TSC_OFFSET_HIGH, tsc_offset_high);
    check_vmcs_write(VMCS_TSC_OFFSET, tsc_offset_low);
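    /* With the tsc_offset control enabled in init_vmcs_bios(), guest
     * TSC reads return host TSC + this offset; the 64-bit offset is
     * written as two 32-bit VMCS fields. */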

    if (v3_update_vmcs_host_state(info)) {
        v3_enable_ints();
        PrintError("Could not write host state\n");
        return -1;
    }


    if (vmx_info->state == VMX_UNLAUNCHED) {
        vmx_info->state = VMX_LAUNCHED;

        info->vm_info->run_state = VM_RUNNING;
        ret = v3_vmx_launch(&(info->vm_regs), info, &(info->ctrl_regs));
    } else {
        V3_ASSERT(vmx_info->state != VMX_UNLAUNCHED);
        ret = v3_vmx_resume(&(info->vm_regs), info, &(info->ctrl_regs));
    }


    //  PrintDebug("VMX Exit: ret=%d\n", ret);

    if (ret != VMX_SUCCESS) {
        uint32_t error = 0;
        vmcs_read(VMCS_INSTR_ERR, &error);

        v3_enable_ints();

        PrintError("VMENTRY Error: %d (launch_ret = %d)\n", error, ret);
        return -1;
    }



    // Immediate exit from VM time bookkeeping
    v3_time_exit_vm(info);

    info->num_exits++;

    /* Update guest state */
    v3_vmx_save_vmcs(info);

    // info->cpl = info->segments.cs.selector & 0x3;

    info->mem_mode = v3_get_vm_mem_mode(info);
    info->cpu_mode = v3_get_vm_cpu_mode(info);


    check_vmcs_read(VMCS_EXIT_INSTR_LEN, &(exit_info.instr_len));
    check_vmcs_read(VMCS_EXIT_INSTR_INFO, &(exit_info.instr_info));
    check_vmcs_read(VMCS_EXIT_REASON, &(exit_info.exit_reason));
    check_vmcs_read(VMCS_EXIT_QUAL, &(exit_info.exit_qual));
    check_vmcs_read(VMCS_EXIT_INT_INFO, &(exit_info.int_info));
    check_vmcs_read(VMCS_EXIT_INT_ERR, &(exit_info.int_err));
    check_vmcs_read(VMCS_GUEST_LINEAR_ADDR, &(exit_info.guest_linear_addr));

    if (info->shdw_pg_mode == NESTED_PAGING) {
        check_vmcs_read(VMCS_GUEST_PHYS_ADDR, &(exit_info.ept_fault_addr));
    }

    //PrintDebug("VMX Exit taken, id-qual: %u-%lu\n", exit_info.exit_reason, exit_info.exit_qual);

    exit_log[info->num_exits % 10] = exit_info;

#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_exit_state(info);
    }
#else
    update_irq_exit_state(info);
#endif

    if (exit_info.exit_reason == VMEXIT_INTR_WINDOW) {
        // This is a special case whose only job is to inject an interrupt
        vmcs_read(VMCS_PROC_CTRLS, &(vmx_info->pri_proc_ctrls.value));
        vmx_info->pri_proc_ctrls.int_wndw_exit = 0;
        vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
       V3_Print("Interrupts available again! (RIP=%llx)\n", info->rip);
#endif
    }

    // reenable global interrupts after vm exit
    v3_enable_ints();

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    if (v3_handle_vmx_exit(info, &exit_info) == -1) {
        PrintError("Error in VMX exit handler (Exit reason=%x)\n", exit_info.exit_reason);
        return -1;
    }

    return 0;
}


int v3_start_vmx_guest(struct guest_info * info) {

    PrintDebug("Starting VMX core %u\n", info->vcpu_id);

    if (info->vcpu_id == 0) {
        info->core_run_state = CORE_RUNNING;
        info->vm_info->run_state = VM_RUNNING;
    } else {

        PrintDebug("VMX core %u: Waiting for core initialization\n", info->vcpu_id);

        while (info->core_run_state == CORE_STOPPED) {
            v3_yield(info);
            //PrintDebug("VMX core %u: still waiting for INIT\n",info->vcpu_id);
        }

        PrintDebug("VMX core %u initialized\n", info->vcpu_id);

        // We'll be paranoid about race conditions here
        v3_wait_at_barrier(info);
    }


    PrintDebug("VMX core %u: I am starting at CS=0x%x (base=0x%p, limit=0x%x),  RIP=0x%p\n",
               info->vcpu_id, info->segments.cs.selector, (void *)(info->segments.cs.base),
               info->segments.cs.limit, (void *)(info->rip));


    PrintDebug("VMX core %u: Launching VMX VM on logical core %u\n", info->vcpu_id, info->pcpu_id);

    v3_start_time(info);

    while (1) {

        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }

        if (v3_vmx_enter(info) == -1) {

            addr_t host_addr;
            addr_t linear_addr = 0;

            info->vm_info->run_state = VM_ERROR;

            V3_Print("VMX core %u: VMX ERROR!!\n", info->vcpu_id); 

            v3_print_guest_state(info);

            V3_Print("VMX core %u\n", info->vcpu_id); 

            linear_addr = get_addr_linear(info, info->rip, &(info->segments.cs));

            if (info->mem_mode == PHYSICAL_MEM) {
                v3_gpa_to_hva(info, linear_addr, &host_addr);
            } else if (info->mem_mode == VIRTUAL_MEM) {
                v3_gva_to_hva(info, linear_addr, &host_addr);
            }

            V3_Print("VMX core %u: Host Address of rip = 0x%p\n", info->vcpu_id, (void *)host_addr);

            V3_Print("VMX core %u: Instr (15 bytes) at %p:\n", info->vcpu_id, (void *)host_addr);
            v3_dump_mem((uint8_t *)host_addr, 15);

            v3_print_stack(info);


            v3_print_vmcs();
            print_exit_log(info);
            return -1;
        }

        v3_wait_at_barrier(info);


        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }
/*
        if ((info->num_exits % 5000) == 0) {
            V3_Print("VMX Exit number %d\n", (uint32_t)info->num_exits);
        }
*/

    }

    return 0;
}




#define VMX_FEATURE_CONTROL_MSR     0x0000003a
#define CPUID_VMX_FEATURES 0x00000005  /* LOCK and VMXON */
#define CPUID_1_ECX_VTXFLAG 0x00000020
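/* CPUID.01H:ECX bit 5 (0x20) reports VMX support; bits 0 (lock) and 2
 * (enable VMXON outside SMX) of the IA32_FEATURE_CONTROL MSR (0x3a)
 * must both be set for VMXON to work, hence the 0x5 mask above. */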

int v3_is_vmx_capable() {
    v3_msr_t feature_msr;
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

    v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);

    PrintDebug("ECX: 0x%x\n", ecx);

    if (ecx & CPUID_1_ECX_VTXFLAG) {
        v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));

        PrintDebug("MSRREGlow: 0x%.8x\n", feature_msr.lo);

        if ((feature_msr.lo & CPUID_VMX_FEATURES) != CPUID_VMX_FEATURES) {
            PrintDebug("VMX is locked -- enable in the BIOS\n");
            return 0;
        }

    } else {
        PrintDebug("VMX not supported on this cpu\n");
        return 0;
    }

    return 1;
}


int v3_reset_vmx_vm_core(struct guest_info * core, addr_t rip) {
    // init vmcs bios

    if ((core->shdw_pg_mode == NESTED_PAGING) && 
        (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        // With unrestricted guest mode the core can start directly in
        // real mode: treat rip as a SIPI-style start vector, placing
        // execution at (vector << 12) with CS selector (vector << 8)
        core->rip = 0;
        core->segments.cs.selector = rip << 8;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = rip << 12;
    } else {
        core->vm_regs.rdx = core->vcpu_id;
        core->vm_regs.rbx = rip;
    }

    return 0;
}



void v3_init_vmx_cpu(int cpu_id) {
    addr_t vmx_on_region = 0;

    if (cpu_id == 0) {
        if (v3_init_vmx_hw(&hw_info) == -1) {
            PrintError("Could not initialize VMX hardware features on cpu %d\n", cpu_id);
            return;
        }
    }

    enable_vmx();


    // Setup VMXON Region
    vmx_on_region = allocate_vmcs();


    if (vmx_on(vmx_on_region) == VMX_SUCCESS) {
        V3_Print("VMX Enabled\n");
        host_vmcs_ptrs[cpu_id] = vmx_on_region;
    } else {
        V3_Print("VMX already enabled\n");
        V3_FreePages((void *)vmx_on_region, 1);
    }

    PrintDebug("VMXON pointer: 0x%p\n", (void *)host_vmcs_ptrs[cpu_id]);    

    {
        struct vmx_sec_proc_ctrls sec_proc_ctrls;
        sec_proc_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.sec_proc_ctrls));

        if (sec_proc_ctrls.enable_ept == 0) {
            V3_Print("VMX EPT (Nested) Paging not supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_CPU;
        } else if (sec_proc_ctrls.unrstrct_guest == 0) {
            V3_Print("VMX EPT (Nested) Paging supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_CPU;
        } else {
            V3_Print("VMX EPT (Nested) Paging + Unrestricted guest supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_UG_CPU;
        }
    }
}


void v3_deinit_vmx_cpu(int cpu_id) {
    extern v3_cpu_arch_t v3_cpu_types[];
    v3_cpu_types[cpu_id] = V3_INVALID_CPU;

    if (host_vmcs_ptrs[cpu_id] != 0) {
        V3_Print("Disabling VMX\n");

        if (vmx_off() != VMX_SUCCESS) {
            PrintError("Error executing VMXOFF\n");
        }

        V3_FreePages((void *)host_vmcs_ptrs[cpu_id], 1);

        host_vmcs_ptrs[cpu_id] = 0;
    }
}