Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git

This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, execute

  cd palacios
  git checkout --track -b devel origin/devel

The other branches work the same way; an illustrative example follows.
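
To track a hypothetical release branch named Release-1.2, for instance (the branch name here is illustrative; run "git branch -r" in the clone to list the real ones):

  git checkout --track -b Release-1.2 origin/Release-1.2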


Fix to MSR save/restore handling to avoid VMX ABORT errors
[palacios.git] / palacios / src / palacios / vmx.c
/* 
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National 
 * Science Foundation and the Department of Energy.  
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at 
 * http://www.v3vee.org
 *
 * Copyright (c) 2011, Jack Lange <jarusl@cs.northwestern.edu>
 * Copyright (c) 2011, The V3VEE Project <http://www.v3vee.org> 
 * All rights reserved.
 *
 * Author: Jack Lange <jarusl@cs.northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */


#include <palacios/vmx.h>
#include <palacios/vmm.h>
#include <palacios/vmx_handler.h>
#include <palacios/vmcs.h>
#include <palacios/vmx_lowlevel.h>
#include <palacios/vmm_lowlevel.h>
#include <palacios/vmm_ctrl_regs.h>
#include <palacios/vmm_config.h>
#include <palacios/vmm_time.h>
#include <palacios/vm_guest_mem.h>
#include <palacios/vmm_direct_paging.h>
#include <palacios/vmx_io.h>
#include <palacios/vmx_msr.h>
#include <palacios/vmm_decoder.h>
#include <palacios/vmm_barrier.h>

#ifdef V3_CONFIG_CHECKPOINT
#include <palacios/vmm_checkpoint.h>
#endif

#include <palacios/vmx_ept.h>
#include <palacios/vmx_assist.h>
#include <palacios/vmx_hw_info.h>

#ifndef V3_CONFIG_DEBUG_VMX
#undef PrintDebug
#define PrintDebug(fmt, args...)
#endif


/* These fields contain the hardware feature sets supported by the local CPU */
static struct vmx_hw_info hw_info;

extern v3_cpu_arch_t v3_cpu_types[];

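/* Per-CPU physical addresses of the VMXON regions; a zero entry means
 * VMX has not yet been enabled on that CPU (see v3_init_vmx_cpu()). */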
static addr_t host_vmcs_ptrs[V3_CONFIG_MAX_CPUS] = { [0 ... V3_CONFIG_MAX_CPUS - 1] = 0};

extern int v3_vmx_launch(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
extern int v3_vmx_resume(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);

static int inline check_vmcs_write(vmcs_field_t field, addr_t val) {
    int ret = 0;

    ret = vmcs_write(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMWRITE error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
        return 1;
    }

    return 0;
}

static int inline check_vmcs_read(vmcs_field_t field, void * val) {
    int ret = 0;

    ret = vmcs_read(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMREAD error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
    }

    return ret;
}




static addr_t allocate_vmcs() {
    struct vmcs_data * vmcs_page = NULL;

    PrintDebug("Allocating page\n");

    vmcs_page = (struct vmcs_data *)V3_VAddr(V3_AllocPages(1));
    memset(vmcs_page, 0, 4096);

    vmcs_page->revision = hw_info.basic_info.revision;
    PrintDebug("VMX Revision: 0x%x\n", vmcs_page->revision);

    return (addr_t)V3_PAddr((void *)vmcs_page);
}




static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state) {
    int vmx_ret = 0;

    // disable global interrupts for vm state initialization
    v3_disable_ints();

    PrintDebug("Loading VMCS\n");
    vmx_ret = vmcs_load(vmx_state->vmcs_ptr_phys);
    vmx_state->state = VMX_UNLAUNCHED;

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMPTRLD failed\n");
        return -1;
    }


    /*** Setup default state from HW ***/

    vmx_state->pin_ctrls.value = hw_info.pin_ctrls.def_val;
    vmx_state->pri_proc_ctrls.value = hw_info.proc_ctrls.def_val;
    vmx_state->exit_ctrls.value = hw_info.exit_ctrls.def_val;
    vmx_state->entry_ctrls.value = hw_info.entry_ctrls.def_val;
    vmx_state->sec_proc_ctrls.value = hw_info.sec_proc_ctrls.def_val;

    /* Print Control MSRs */
    PrintDebug("CR0 MSR: %p\n", (void *)(addr_t)hw_info.cr0.value);
    PrintDebug("CR4 MSR: %p\n", (void *)(addr_t)hw_info.cr4.value);



    /******* Setup Host State **********/

    /* Cache GDTR, IDTR, and TR in host struct */


    /********** Setup VMX Control Fields ***********/

    /* Add external interrupts, NMI exiting, and virtual NMI */
    vmx_state->pin_ctrls.nmi_exit = 1;
    vmx_state->pin_ctrls.ext_int_exit = 1;


    vmx_state->pri_proc_ctrls.hlt_exit = 1;


    vmx_state->pri_proc_ctrls.pause_exit = 0;
    vmx_state->pri_proc_ctrls.tsc_offset = 1;
#ifdef V3_CONFIG_TIME_VIRTUALIZE_TSC
    vmx_state->pri_proc_ctrls.rdtsc_exit = 1;
#endif

    /* Setup IO map */
    vmx_state->pri_proc_ctrls.use_io_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_A_ADDR, (addr_t)V3_PAddr(core->vm_info->io_map.arch_data));
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_B_ADDR, 
            (addr_t)V3_PAddr(core->vm_info->io_map.arch_data) + PAGE_SIZE_4KB);


    vmx_state->pri_proc_ctrls.use_msr_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_MSR_BITMAP, (addr_t)V3_PAddr(core->vm_info->msr_map.arch_data));



#ifdef __V3_64BIT__
    // Ensure host runs in 64-bit mode at each VM EXIT
    vmx_state->exit_ctrls.host_64_on = 1;
#endif

    // Hook all accesses to EFER register
    v3_hook_msr(core->vm_info, EFER_MSR, 
                &v3_handle_efer_read,
                &v3_handle_efer_write, 
                core);

    // Restore host's EFER register on each VM EXIT
    vmx_state->exit_ctrls.ld_efer = 1;

    // Save/restore guest's EFER register to/from VMCS on VM EXIT/ENTRY
    vmx_state->exit_ctrls.save_efer = 1;
    vmx_state->entry_ctrls.ld_efer  = 1;

    // Cause VM_EXIT whenever CR4.VMXE or CR4.PAE bits are written
    vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);


    /* Setup paging */
    if (core->shdw_pg_mode == SHADOW_PAGING) {
        PrintDebug("Creating initial shadow page table\n");

        if (v3_init_passthrough_pts(core) == -1) {
            PrintError("Could not initialize passthrough page tables\n");
            return -1;
        }

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        core->ctrl_regs.cr3 = core->direct_map_pt;

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Add CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 1;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 1;

        vmx_state->pri_proc_ctrls.invlpg_exit = 1;

        /* Add page fault exits */
        vmx_state->excp_bmap.pf = 1;

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_CPU)) {

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Disable CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;

        /* Add page fault exits */
        //      vmx_state->excp_bmap.pf = 1; // This should never happen..., enabled to catch bugs

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging



        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        int i = 0;
        // For now we will assume that unrestricted guest mode is assured w/ EPT


        core->vm_regs.rsp = 0x00;
        core->rip = 0xfff0;
        core->vm_regs.rdx = 0x00000f00;
        core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
        core->ctrl_regs.cr0 = 0x60010010; // Set the WP flag so the memory hooks work in real-mode


        core->segments.cs.selector = 0xf000;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = 0x0000000f0000LL;

        // (raw attributes = 0xf3)
        core->segments.cs.type = 0xb;
        core->segments.cs.system = 0x1;
        core->segments.cs.dpl = 0x0;
        core->segments.cs.present = 1;



        struct v3_segment * segregs [] = {&(core->segments.ss), &(core->segments.ds), 
                                          &(core->segments.es), &(core->segments.fs), 
                                          &(core->segments.gs), NULL};

        for ( i = 0; segregs[i] != NULL; i++) {
            struct v3_segment * seg = segregs[i];

            seg->selector = 0x0000;
            //    seg->base = seg->selector << 4;
            seg->base = 0x00000000;
            seg->limit = 0xffff;


            seg->type = 0x3;
            seg->system = 0x1;
            seg->dpl = 0x0;
            seg->present = 1;
            //    seg->granularity = 1;

        }


        core->segments.gdtr.limit = 0x0000ffff;
        core->segments.gdtr.base = 0x0000000000000000LL;

        core->segments.idtr.limit = 0x0000ffff;
        core->segments.idtr.base = 0x0000000000000000LL;

        core->segments.ldtr.selector = 0x0000;
        core->segments.ldtr.limit = 0x0000ffff;
        core->segments.ldtr.base = 0x0000000000000000LL;
        core->segments.ldtr.type = 2;
        core->segments.ldtr.present = 1;

        core->segments.tr.selector = 0x0000;
        core->segments.tr.limit = 0x0000ffff;
        core->segments.tr.base = 0x0000000000000000LL;
        core->segments.tr.type = 0xb;
        core->segments.tr.present = 1;

        //      core->dbg_regs.dr6 = 0x00000000ffff0ff0LL;
        core->dbg_regs.dr7 = 0x0000000000000400LL;

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
        vmx_state->sec_proc_ctrls.unrstrct_guest = 1; // enable unrestricted guest operation


        /* Disable shadow paging stuff */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;


        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else {
        PrintError("Invalid Virtual paging mode\n");
        return -1;
    }


    // hook vmx msrs

    // Setup SYSCALL/SYSENTER MSRs in load/store area

    // save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
    {

        struct vmcs_msr_save_area * msr_entries = NULL;
        int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;
        int msr_ret = 0;

        V3_Print("Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);

        if (max_msrs < 4) {
            PrintError("Max MSR cache size is too small (%d)\n", max_msrs);
            return -1;
        }

        vmx_state->msr_area_paddr = (addr_t)V3_AllocPages(1);

        if (vmx_state->msr_area_paddr == (addr_t)NULL) {
            PrintError("could not allocate msr load/store area\n");
            return -1;
        }

        msr_entries = (struct vmcs_msr_save_area *)V3_VAddr((void *)(vmx_state->msr_area_paddr));
        vmx_state->msr_area = msr_entries; // cache in vmx_info

        memset(msr_entries, 0, PAGE_SIZE);

        msr_entries->guest_star.index = IA32_STAR_MSR;
        msr_entries->guest_lstar.index = IA32_LSTAR_MSR;
        msr_entries->guest_fmask.index = IA32_FMASK_MSR;
        msr_entries->guest_kern_gs.index = IA32_KERN_GS_BASE_MSR;

        msr_entries->host_star.index = IA32_STAR_MSR;
        msr_entries->host_lstar.index = IA32_LSTAR_MSR;
        msr_entries->host_fmask.index = IA32_FMASK_MSR;
        msr_entries->host_kern_gs.index = IA32_KERN_GS_BASE_MSR;

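        /*
         * Per the Intel SDM: the exit MSR-store area records the guest's
         * MSRs at each VM exit, the entry MSR-load area restores them on
         * the next VM entry, and the exit MSR-load area restores the
         * host's MSRs at VM exit. A malformed entry in the exit MSR-load
         * area triggers a VMX abort, which is what this save/restore
         * setup is careful to avoid.
         */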
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(msr_entries->guest_msrs));
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(msr_entries->guest_msrs));
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(msr_entries->host_msrs));


        msr_ret |= v3_hook_msr(core->vm_info, IA32_STAR_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, IA32_LSTAR_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, IA32_FMASK_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, IA32_KERN_GS_BASE_MSR, NULL, NULL, NULL);


        // IMPORTANT: These MSRs appear to be cached by the hardware....
        msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_CS_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_ESP_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_EIP_MSR, NULL, NULL, NULL);

        msr_ret |= v3_hook_msr(core->vm_info, FS_BASE_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, GS_BASE_MSR, NULL, NULL, NULL);


        // Not sure what to do about this... Does not appear to be an explicit hardware cache version...
        msr_ret |= v3_hook_msr(core->vm_info, IA32_CSTAR_MSR, NULL, NULL, NULL);

        if (msr_ret != 0) {
            PrintError("Error configuring MSR save/restore area\n");
            return -1;
        }


    }

    /* Sanity check ctrl/reg fields against hw_defaults */




    /*** Write all the info to the VMCS ***/

    /*
    {
        // IS THIS NECESSARY???
#define DEBUGCTL_MSR 0x1d9
        struct v3_msr tmp_msr;
        v3_get_msr(DEBUGCTL_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
        vmx_ret |= check_vmcs_write(VMCS_GUEST_DBG_CTL, tmp_msr.value);
        core->dbg_regs.dr7 = 0x400;
    }
    */

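    /*
     * The VMCS link pointer is only used with VMCS shadowing, which is
     * not enabled here; in that case the SDM requires the field be set
     * to all 1s, otherwise VM entry fails.
     */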
#ifdef __V3_64BIT__
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffffffffffULL);
#else
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffUL);
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR_HIGH, (addr_t)0xffffffffUL);
#endif




    if (v3_update_vmcs_ctrl_fields(core)) {
        PrintError("Could not write control fields!\n");
        return -1;
    }

    /*
    if (v3_update_vmcs_host_state(core)) {
        PrintError("Could not write host state\n");
        return -1;
    }
    */

    // reenable global interrupts for vm state initialization now
    // that the vm state is initialized. If another VM kicks us off, 
    // it'll update our vmx state so that we know to reload ourselves
    v3_enable_ints();

    return 0;
}

int v3_init_vmx_vmcs(struct guest_info * core, v3_vm_class_t vm_class) {
    struct vmx_data * vmx_state = NULL;
    int vmx_ret = 0;

    vmx_state = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));
    memset(vmx_state, 0, sizeof(struct vmx_data));

    PrintDebug("vmx_data pointer: %p\n", (void *)vmx_state);

    PrintDebug("Allocating VMCS\n");
    vmx_state->vmcs_ptr_phys = allocate_vmcs();

    PrintDebug("VMCS pointer: %p\n", (void *)(vmx_state->vmcs_ptr_phys));

    core->vmm_data = vmx_state;
    vmx_state->state = VMX_UNLAUNCHED;

    PrintDebug("Initializing VMCS (addr=%p)\n", core->vmm_data);

    // TODO: Fix vmcs fields so they're 32-bit

    PrintDebug("Clearing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMCLEAR failed\n");
        return -1; 
    }

    if (vm_class == V3_PC_VM) {
        PrintDebug("Initializing VMCS\n");
        if (init_vmcs_bios(core, vmx_state) == -1) {
            PrintError("Error initializing VMCS to BIOS state\n");
            return -1;
        }
    } else {
        PrintError("Invalid VM Class\n");
        return -1;
    }

    PrintDebug("Serializing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    return 0;
}


int v3_deinit_vmx_vmcs(struct guest_info * core) {
    struct vmx_data * vmx_state = core->vmm_data;

    V3_FreePages((void *)(vmx_state->vmcs_ptr_phys), 1);
    V3_FreePages(V3_PAddr(vmx_state->msr_area), 1);

    V3_Free(vmx_state);

    return 0;
}



#ifdef V3_CONFIG_CHECKPOINT
/* 
 * JRL: This is broken
 */
int v3_vmx_save_core(struct guest_info * core, void * ctx){
    uint64_t vmcs_ptr = vmcs_store();

    v3_chkpt_save(ctx, "vmcs_data", PAGE_SIZE, (void *)vmcs_ptr);

    return 0;
}

int v3_vmx_load_core(struct guest_info * core, void * ctx){
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    struct cr0_32 * shadow_cr0;
    char vmcs[PAGE_SIZE_4KB];

    v3_chkpt_load(ctx, "vmcs_data", PAGE_SIZE_4KB, vmcs);

    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmcs_load((addr_t)vmcs);

    v3_vmx_save_vmcs(core);

    shadow_cr0 = (struct cr0_32 *)&(core->ctrl_regs.cr0);


    /* Get the CPU mode to set the guest_ia32e entry ctrl */

    if (core->shdw_pg_mode == SHADOW_PAGING) {
        if (v3_get_vm_mem_mode(core) == VIRTUAL_MEM) {
            if (v3_activate_shadow_pt(core) == -1) {
                PrintError("Failed to activate shadow page tables\n");
                return -1;
            }
        } else {
            if (v3_activate_passthrough_pt(core) == -1) {
                PrintError("Failed to activate passthrough page tables\n");
                return -1;
            }
        }
    }

    return 0;
}
#endif


void v3_flush_vmx_vm_core(struct guest_info * core) {
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmx_info->state = VMX_UNLAUNCHED;
}



static int update_irq_exit_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));

    if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 0)) {
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Calling v3_injecting_intr\n");
#endif
        info->intr_core_state.irq_started = 0;
        v3_injecting_intr(info, info->intr_core_state.irq_vector, V3_EXTERNAL_IRQ);
    }

    return 0;
}

static int update_irq_entry_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;
    struct vmcs_interrupt_state intr_core_state;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
    check_vmcs_read(VMCS_GUEST_INT_STATE, &(intr_core_state));

    /* Check for pending exceptions to inject */
    if (v3_excp_pending(info)) {
        struct vmx_entry_int_info int_info;
        int_info.value = 0;

        // In VMX, almost every exception is hardware
        // Software exceptions are pretty much only for breakpoint or overflow
        int_info.type = 3;
        int_info.vector = v3_get_excp_number(info);

        if (info->excp_state.excp_error_code_valid) {
            check_vmcs_write(VMCS_ENTRY_EXCP_ERR, info->excp_state.excp_error_code);
            int_info.error_code = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("Injecting exception %d with error code %x\n", 
                    int_info.vector, info->excp_state.excp_error_code);
#endif
        }

        int_info.valid = 1;
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Injecting exception %d (EIP=%p)\n", int_info.vector, (void *)(addr_t)info->rip);
#endif
        check_vmcs_write(VMCS_ENTRY_INT_INFO, int_info.value);

        v3_injecting_excp(info, int_info.vector);

    } else if ((((struct rflags *)&(info->ctrl_regs.rflags))->intr == 1) && 
               (intr_core_state.val == 0)) {

        if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 1)) {

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("IRQ pending from previous injection\n");
#endif

            // Copy the IDT vectoring info over to reinject the old interrupt
            if (idt_vec_info.error_code == 1) {
                uint32_t err_code = 0;

                check_vmcs_read(VMCS_IDT_VECTOR_ERR, &err_code);
                check_vmcs_write(VMCS_ENTRY_EXCP_ERR, err_code);
            }

            idt_vec_info.undef = 0;
            check_vmcs_write(VMCS_ENTRY_INT_INFO, idt_vec_info.value);

        } else {
            struct vmx_entry_int_info ent_int;
            ent_int.value = 0;

            switch (v3_intr_pending(info)) {
                case V3_EXTERNAL_IRQ: {
                    info->intr_core_state.irq_vector = v3_get_intr(info); 
                    ent_int.vector = info->intr_core_state.irq_vector;
                    ent_int.type = 0;
                    ent_int.error_code = 0;
                    ent_int.valid = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
                    V3_Print("Injecting Interrupt %d at exit %u (EIP=%p)\n", 
                               info->intr_core_state.irq_vector, 
                               (uint32_t)info->num_exits, 
                               (void *)(addr_t)info->rip);
#endif

                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);
                    info->intr_core_state.irq_started = 1;

                    break;
                }
                case V3_NMI:
                    PrintDebug("Injecting NMI\n");

                    ent_int.type = 2;
                    ent_int.vector = 2;
                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_SOFTWARE_INTR:
                    PrintDebug("Injecting software interrupt\n");
                    ent_int.type = 4;

                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_VIRTUAL_IRQ:
                    // Not sure what to do here, Intel doesn't have virtual IRQs
                    // May be the same as external interrupts/IRQs

                    break;
                case V3_INVALID_INTR:
                default:
                    break;
            }
        }
    } else if ((v3_intr_pending(info)) && (vmx_info->pri_proc_ctrls.int_wndw_exit == 0)) {
        // Enable INTR window exiting so we know when IF=1
        uint32_t instr_len;

        check_vmcs_read(VMCS_EXIT_INSTR_LEN, &instr_len);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Enabling Interrupt-Window exiting: %d\n", instr_len);
#endif

        vmx_info->pri_proc_ctrls.int_wndw_exit = 1;
        check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
    }


    return 0;
}



static struct vmx_exit_info exit_log[10];

static void print_exit_log(struct guest_info * info) {
    int cnt = info->num_exits % 10;
    int i = 0;


    V3_Print("\nExit Log (%d total exits):\n", (uint32_t)info->num_exits);

    for (i = 0; i < 10; i++) {
        struct vmx_exit_info * tmp = &exit_log[cnt];

        V3_Print("%d:\texit_reason = %p\n", i, (void *)(addr_t)tmp->exit_reason);
        V3_Print("\texit_qual = %p\n", (void *)tmp->exit_qual);
        V3_Print("\tint_info = %p\n", (void *)(addr_t)tmp->int_info);
        V3_Print("\tint_err = %p\n", (void *)(addr_t)tmp->int_err);
        V3_Print("\tinstr_info = %p\n", (void *)(addr_t)tmp->instr_info);

        cnt--;

        if (cnt == -1) {
            cnt = 9;
        }

    }

}

/* 
 * CAUTION and DANGER!!! 
 * 
 * The VMCS CANNOT(!!) be accessed outside of the cli/sti calls inside this function
 * When executing a symbiotic call, the VMCS WILL be overwritten, so any dependencies 
 * on its contents will cause things to break. The contents at the time of the exit WILL 
 * change before the exit handler is executed.
 */
int v3_vmx_enter(struct guest_info * info) {
    int ret = 0;
    uint32_t tsc_offset_low, tsc_offset_high;
    struct vmx_exit_info exit_info;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    // Perform any additional yielding needed for time adjustment
    v3_adjust_time(info);

    // disable global interrupts for vm state transition
    v3_disable_ints();

    // Update timer devices late after being in the VM so that as much 
    // of the time in the VM is accounted for as possible. Also do it before
    // updating IRQ entry state so that any interrupts the timers raise get 
    // handled on the next VM entry. Must be done with interrupts disabled.
    v3_update_timers(info);

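    /*
     * If a different VMCS is current on this CPU (e.g., another core ran
     * here since our last entry), clear and reload ours. VMCLEAR resets
     * the launch state, so the next entry must use VMLAUNCH rather than
     * VMRESUME; hence the state is set back to VMX_UNLAUNCHED.
     */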
    if (vmcs_store() != vmx_info->vmcs_ptr_phys) {
        vmcs_clear(vmx_info->vmcs_ptr_phys);
        vmcs_load(vmx_info->vmcs_ptr_phys);
        vmx_info->state = VMX_UNLAUNCHED;
    }

    v3_vmx_restore_vmcs(info);


#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_entry_state(info);
    }
#else 
    update_irq_entry_state(info);
#endif

    {
        addr_t guest_cr3;
        vmcs_read(VMCS_GUEST_CR3, &guest_cr3);
        vmcs_write(VMCS_GUEST_CR3, guest_cr3);
    }

    // Perform last-minute time bookkeeping prior to entering the VM
    v3_time_enter_vm(info);

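    // VMCS_TSC_OFFSET is a 64-bit field; the _HIGH variant accesses its
    // upper 32 bits, so the offset is written as two 32-bit halves here.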
    tsc_offset_high = (uint32_t)((v3_tsc_host_offset(&info->time_state) >> 32) & 0xffffffff);
    tsc_offset_low = (uint32_t)(v3_tsc_host_offset(&info->time_state) & 0xffffffff);
    check_vmcs_write(VMCS_TSC_OFFSET_HIGH, tsc_offset_high);
    check_vmcs_write(VMCS_TSC_OFFSET, tsc_offset_low);

    if (v3_update_vmcs_host_state(info)) {
        v3_enable_ints();
        PrintError("Could not write host state\n");
        return -1;
    }


    if (vmx_info->state == VMX_UNLAUNCHED) {
        vmx_info->state = VMX_LAUNCHED;
        ret = v3_vmx_launch(&(info->vm_regs), info, &(info->ctrl_regs));
    } else {
        V3_ASSERT(vmx_info->state != VMX_UNLAUNCHED);
        ret = v3_vmx_resume(&(info->vm_regs), info, &(info->ctrl_regs));
    }



    //  PrintDebug("VMX Exit: ret=%d\n", ret);

    if (ret != VMX_SUCCESS) {
        uint32_t error = 0;
        vmcs_read(VMCS_INSTR_ERR, &error);

        v3_enable_ints();

        PrintError("VMENTRY Error: %d (launch_ret = %d)\n", error, ret);
        return -1;
    }



    // Immediate exit from VM time bookkeeping
    v3_time_exit_vm(info);

    info->num_exits++;

    /* Update guest state */
    v3_vmx_save_vmcs(info);

    // info->cpl = info->segments.cs.selector & 0x3;

    info->mem_mode = v3_get_vm_mem_mode(info);
    info->cpu_mode = v3_get_vm_cpu_mode(info);


    check_vmcs_read(VMCS_EXIT_INSTR_LEN, &(exit_info.instr_len));
    check_vmcs_read(VMCS_EXIT_INSTR_INFO, &(exit_info.instr_info));
    check_vmcs_read(VMCS_EXIT_REASON, &(exit_info.exit_reason));
    check_vmcs_read(VMCS_EXIT_QUAL, &(exit_info.exit_qual));
    check_vmcs_read(VMCS_EXIT_INT_INFO, &(exit_info.int_info));
    check_vmcs_read(VMCS_EXIT_INT_ERR, &(exit_info.int_err));
    check_vmcs_read(VMCS_GUEST_LINEAR_ADDR, &(exit_info.guest_linear_addr));

    if (info->shdw_pg_mode == NESTED_PAGING) {
        check_vmcs_read(VMCS_GUEST_PHYS_ADDR, &(exit_info.ept_fault_addr));
    }

    //PrintDebug("VMX Exit taken, id-qual: %u-%lu\n", exit_info.exit_reason, exit_info.exit_qual);

    exit_log[info->num_exits % 10] = exit_info;

#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_exit_state(info);
    }
#else
    update_irq_exit_state(info);
#endif

    if (exit_info.exit_reason == VMEXIT_INTR_WINDOW) {
        // This is a special case whose only job is to inject an interrupt
        vmcs_read(VMCS_PROC_CTRLS, &(vmx_info->pri_proc_ctrls.value));
        vmx_info->pri_proc_ctrls.int_wndw_exit = 0;
        vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
       V3_Print("Interrupts available again! (RIP=%llx)\n", info->rip);
#endif
    }

    // reenable global interrupts after vm exit
    v3_enable_ints();

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    if (v3_handle_vmx_exit(info, &exit_info) == -1) {
        PrintError("Error in VMX exit handler (Exit reason=%x)\n", exit_info.exit_reason);
        return -1;
    }

    return 0;
}


int v3_start_vmx_guest(struct guest_info * info) {

    PrintDebug("Starting VMX core %u\n", info->vcpu_id);

    if (info->vcpu_id == 0) {
        info->core_run_state = CORE_RUNNING;
    } else {

        PrintDebug("VMX core %u: Waiting for core initialization\n", info->vcpu_id);

        while (info->core_run_state == CORE_STOPPED) {

            if (info->vm_info->run_state == VM_STOPPED) {
                // The VM was stopped before this core was initialized. 
                return 0;
            }

            v3_yield(info);
            //PrintDebug("VMX core %u: still waiting for INIT\n",info->vcpu_id);
        }

        PrintDebug("VMX core %u initialized\n", info->vcpu_id);

        // We'll be paranoid about race conditions here
        v3_wait_at_barrier(info);
    }


    PrintDebug("VMX core %u: I am starting at CS=0x%x (base=0x%p, limit=0x%x), RIP=0x%p\n",
               info->vcpu_id, info->segments.cs.selector, (void *)(info->segments.cs.base),
               info->segments.cs.limit, (void *)(info->rip));


    PrintDebug("VMX core %u: Launching VMX VM on logical core %u\n", info->vcpu_id, info->pcpu_id);

    v3_start_time(info);

    while (1) {

        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }

        if (v3_vmx_enter(info) == -1) {

            addr_t host_addr;
            addr_t linear_addr = 0;

            info->vm_info->run_state = VM_ERROR;

            V3_Print("VMX core %u: VMX ERROR!!\n", info->vcpu_id); 

            v3_print_guest_state(info);

            V3_Print("VMX core %u\n", info->vcpu_id); 

            linear_addr = get_addr_linear(info, info->rip, &(info->segments.cs));

            if (info->mem_mode == PHYSICAL_MEM) {
                v3_gpa_to_hva(info, linear_addr, &host_addr);
            } else if (info->mem_mode == VIRTUAL_MEM) {
                v3_gva_to_hva(info, linear_addr, &host_addr);
            }

            V3_Print("VMX core %u: Host Address of rip = 0x%p\n", info->vcpu_id, (void *)host_addr);

            V3_Print("VMX core %u: Instr (15 bytes) at %p:\n", info->vcpu_id, (void *)host_addr);
            v3_dump_mem((uint8_t *)host_addr, 15);

            v3_print_stack(info);


            v3_print_vmcs();
            print_exit_log(info);
            return -1;
        }

        v3_wait_at_barrier(info);


        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }
/*
        if ((info->num_exits % 5000) == 0) {
            V3_Print("VMX Exit number %d\n", (uint32_t)info->num_exits);
        }
*/

    }

    return 0;
}




#define VMX_FEATURE_CONTROL_MSR     0x0000003a
#define CPUID_VMX_FEATURES 0x00000005  /* LOCK and VMXON */
#define CPUID_1_ECX_VTXFLAG 0x00000020
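/* In IA32_FEATURE_CONTROL (MSR 0x3a), bit 0 is the lock bit and bit 2
 * enables VMXON outside SMX operation; both (mask 0x5) must be set by
 * firmware for VMX to be usable. CPUID.1:ECX bit 5 advertises VMX. */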

int v3_is_vmx_capable() {
    v3_msr_t feature_msr;
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

    v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);

    PrintDebug("ECX: 0x%x\n", ecx);

    if (ecx & CPUID_1_ECX_VTXFLAG) {
        v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));

        PrintDebug("MSRREGlow: 0x%.8x\n", feature_msr.lo);

        if ((feature_msr.lo & CPUID_VMX_FEATURES) != CPUID_VMX_FEATURES) {
            PrintDebug("VMX is locked -- enable in the BIOS\n");
            return 0;
        }

    } else {
        PrintDebug("VMX not supported on this cpu\n");
        return 0;
    }

    return 1;
}


int v3_reset_vmx_vm_core(struct guest_info * core, addr_t rip) {
    // init vmcs bios

    if ((core->shdw_pg_mode == NESTED_PAGING) && 
        (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        // easy 
        core->rip = 0;
        core->segments.cs.selector = rip << 8;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = rip << 12;
    } else {
        core->vm_regs.rdx = core->vcpu_id;
        core->vm_regs.rbx = rip;
    }

    return 0;
}



void v3_init_vmx_cpu(int cpu_id) {
    addr_t vmx_on_region = 0;

    if (cpu_id == 0) {
        if (v3_init_vmx_hw(&hw_info) == -1) {
            PrintError("Could not initialize VMX hardware features on cpu %d\n", cpu_id);
            return;
        }
    }

    enable_vmx();


    // Setup VMXON Region
    vmx_on_region = allocate_vmcs();


    if (vmx_on(vmx_on_region) == VMX_SUCCESS) {
        V3_Print("VMX Enabled\n");
        host_vmcs_ptrs[cpu_id] = vmx_on_region;
    } else {
        V3_Print("VMX already enabled\n");
        V3_FreePages((void *)vmx_on_region, 1);
    }

    PrintDebug("VMXON pointer: 0x%p\n", (void *)host_vmcs_ptrs[cpu_id]);

    {
        struct vmx_sec_proc_ctrls sec_proc_ctrls;
        sec_proc_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.sec_proc_ctrls));

        if (sec_proc_ctrls.enable_ept == 0) {
            V3_Print("VMX EPT (Nested) Paging not supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_CPU;
        } else if (sec_proc_ctrls.unrstrct_guest == 0) {
            V3_Print("VMX EPT (Nested) Paging supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_CPU;
        } else {
            V3_Print("VMX EPT (Nested) Paging + Unrestricted guest supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_UG_CPU;
        }
    }
}


void v3_deinit_vmx_cpu(int cpu_id) {
    extern v3_cpu_arch_t v3_cpu_types[];
    v3_cpu_types[cpu_id] = V3_INVALID_CPU;

    if (host_vmcs_ptrs[cpu_id] != 0) {
        V3_Print("Disabling VMX\n");

        if (vmx_off() != VMX_SUCCESS) {
            PrintError("Error executing VMXOFF\n");
        }

        V3_FreePages((void *)host_vmcs_ptrs[cpu_id], 1);

        host_vmcs_ptrs[cpu_id] = 0;
    }
}