Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git

This will give you the master branch. You probably want the devel branch or one of the release branches instead. To switch to the devel branch, execute

  cd palacios
  git checkout --track -b devel origin/devel

The other branches work the same way.
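
For example, to track one of the release branches (the branch name below is illustrative; list the real ones with "git branch -r"):

  git checkout --track -b Release-1.2 origin/Release-1.2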


Commit: moved VM runstate settings to avoid race condition
File: palacios/src/palacios/vmx.c
/* 
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National 
 * Science Foundation and the Department of Energy.  
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at 
 * http://www.v3vee.org
 *
 * Copyright (c) 2011, Jack Lange <jarusl@cs.northwestern.edu>
 * Copyright (c) 2011, The V3VEE Project <http://www.v3vee.org> 
 * All rights reserved.
 *
 * Author: Jack Lange <jarusl@cs.northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */


#include <palacios/vmx.h>
#include <palacios/vmm.h>
#include <palacios/vmx_handler.h>
#include <palacios/vmcs.h>
#include <palacios/vmx_lowlevel.h>
#include <palacios/vmm_lowlevel.h>
#include <palacios/vmm_ctrl_regs.h>
#include <palacios/vmm_config.h>
#include <palacios/vmm_time.h>
#include <palacios/vm_guest_mem.h>
#include <palacios/vmm_direct_paging.h>
#include <palacios/vmx_io.h>
#include <palacios/vmx_msr.h>
#include <palacios/vmm_decoder.h>
#include <palacios/vmm_barrier.h>

#ifdef V3_CONFIG_CHECKPOINT
#include <palacios/vmm_checkpoint.h>
#endif

#include <palacios/vmx_ept.h>
#include <palacios/vmx_assist.h>
#include <palacios/vmx_hw_info.h>

#ifndef V3_CONFIG_DEBUG_VMX
#undef PrintDebug
#define PrintDebug(fmt, args...)
#endif


/* These fields contain the hardware feature sets supported by the local CPU */
static struct vmx_hw_info hw_info;

extern v3_cpu_arch_t v3_cpu_types[];

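/* One VMXON region pointer per physical CPU (the [0 ... N] form is a GCC
 * range designator); a zero entry means VMX is not yet enabled on that CPU. */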
static addr_t host_vmcs_ptrs[V3_CONFIG_MAX_CPUS] = { [0 ... V3_CONFIG_MAX_CPUS - 1] = 0};

extern int v3_vmx_launch(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
extern int v3_vmx_resume(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);

static inline int check_vmcs_write(vmcs_field_t field, addr_t val) {
    int ret = 0;

    ret = vmcs_write(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMWRITE error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
        return 1;
    }

    return 0;
}

static inline int check_vmcs_read(vmcs_field_t field, void * val) {
    int ret = 0;

    ret = vmcs_read(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMREAD error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
    }

    return ret;
}




static addr_t allocate_vmcs() {
    struct vmcs_data * vmcs_page = NULL;

    PrintDebug("Allocating page\n");

    vmcs_page = (struct vmcs_data *)V3_VAddr(V3_AllocPages(1));
    memset(vmcs_page, 0, 4096);

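    /* Hardware requires the first 32 bits of a VMCS (or VMXON) region to
     * hold the processor's VMCS revision identifier */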
    vmcs_page->revision = hw_info.basic_info.revision;
    PrintDebug("VMX Revision: 0x%x\n", vmcs_page->revision);

    return (addr_t)V3_PAddr((void *)vmcs_page);
}




static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state) {
    int vmx_ret = 0;

    // disable global interrupts for vm state initialization
    v3_disable_ints();

    PrintDebug("Loading VMCS\n");
    vmx_ret = vmcs_load(vmx_state->vmcs_ptr_phys);
    vmx_state->state = VMX_UNLAUNCHED;

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMPTRLD failed\n");
        return -1;
    }


    /*** Setup default state from HW ***/

    vmx_state->pin_ctrls.value = hw_info.pin_ctrls.def_val;
    vmx_state->pri_proc_ctrls.value = hw_info.proc_ctrls.def_val;
    vmx_state->exit_ctrls.value = hw_info.exit_ctrls.def_val;
    vmx_state->entry_ctrls.value = hw_info.entry_ctrls.def_val;
    vmx_state->sec_proc_ctrls.value = hw_info.sec_proc_ctrls.def_val;

    /* Print Control MSRs */
    PrintDebug("CR0 MSR: %p\n", (void *)(addr_t)hw_info.cr0.value);
    PrintDebug("CR4 MSR: %p\n", (void *)(addr_t)hw_info.cr4.value);



    /******* Setup Host State **********/

    /* Cache GDTR, IDTR, and TR in host struct */


    /********** Setup VMX Control Fields ***********/

    /* Add external interrupts, NMI exiting, and virtual NMI */
    vmx_state->pin_ctrls.nmi_exit = 1;
    vmx_state->pin_ctrls.ext_int_exit = 1;


    vmx_state->pri_proc_ctrls.hlt_exit = 1;


    vmx_state->pri_proc_ctrls.pause_exit = 0;
    vmx_state->pri_proc_ctrls.tsc_offset = 1;
#ifdef V3_CONFIG_TIME_VIRTUALIZE_TSC
    vmx_state->pri_proc_ctrls.rdtsc_exit = 1;
#endif

    /* Setup IO map */
    vmx_state->pri_proc_ctrls.use_io_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_A_ADDR, (addr_t)V3_PAddr(core->vm_info->io_map.arch_data));
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_B_ADDR, 
            (addr_t)V3_PAddr(core->vm_info->io_map.arch_data) + PAGE_SIZE_4KB);


    vmx_state->pri_proc_ctrls.use_msr_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_MSR_BITMAP, (addr_t)V3_PAddr(core->vm_info->msr_map.arch_data));



#ifdef __V3_64BIT__
    // Ensure host runs in 64-bit mode at each VM EXIT
    vmx_state->exit_ctrls.host_64_on = 1;
#endif

    // Hook all accesses to EFER register
    v3_hook_msr(core->vm_info, EFER_MSR, 
                &v3_handle_efer_read,
                &v3_handle_efer_write, 
                core);

    // Restore host's EFER register on each VM EXIT
    vmx_state->exit_ctrls.ld_efer = 1;

    // Save/restore guest's EFER register to/from VMCS on VM EXIT/ENTRY
    vmx_state->exit_ctrls.save_efer = 1;
    vmx_state->entry_ctrls.ld_efer  = 1;

    // Cause VM_EXIT whenever CR4.VMXE or CR4.PAE bits are written
    vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);


    /* Setup paging */
    if (core->shdw_pg_mode == SHADOW_PAGING) {
        PrintDebug("Creating initial shadow page table\n");

        if (v3_init_passthrough_pts(core) == -1) {
            PrintError("Could not initialize passthrough page tables\n");
            return -1;
        }

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        core->ctrl_regs.cr3 = core->direct_map_pt;

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Add CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 1;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 1;

        vmx_state->pri_proc_ctrls.invlpg_exit = 1;

        /* Add page fault exits */
        vmx_state->excp_bmap.pf = 1;

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_CPU)) {

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Disable CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;

        /* Add page fault exits */
        //      vmx_state->excp_bmap.pf = 1; // This should never happen..., enabled to catch bugs

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging



        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        int i = 0;
        // For now we will assume that unrestricted guest mode is assured w/ EPT

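        /* Architectural reset state: CS base 0xF0000 plus RIP 0xFFF0 puts the
         * first instruction fetch at 0xFFFF0, the real-mode BIOS entry point */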
        core->vm_regs.rsp = 0x00;
        core->rip = 0xfff0;
        core->vm_regs.rdx = 0x00000f00;
        core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
        core->ctrl_regs.cr0 = 0x60010010; // Set the WP flag so the memory hooks work in real-mode


        core->segments.cs.selector = 0xf000;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = 0x0000000f0000LL;

        // (raw attributes = 0xf3)
        core->segments.cs.type = 0xb;
        core->segments.cs.system = 0x1;
        core->segments.cs.dpl = 0x0;
        core->segments.cs.present = 1;



        struct v3_segment * segregs [] = {&(core->segments.ss), &(core->segments.ds), 
                                          &(core->segments.es), &(core->segments.fs), 
                                          &(core->segments.gs), NULL};

        for ( i = 0; segregs[i] != NULL; i++) {
            struct v3_segment * seg = segregs[i];

            seg->selector = 0x0000;
            //    seg->base = seg->selector << 4;
            seg->base = 0x00000000;
            seg->limit = 0xffff;


            seg->type = 0x3;
            seg->system = 0x1;
            seg->dpl = 0x0;
            seg->present = 1;
            //    seg->granularity = 1;

        }


        core->segments.gdtr.limit = 0x0000ffff;
        core->segments.gdtr.base = 0x0000000000000000LL;

        core->segments.idtr.limit = 0x0000ffff;
        core->segments.idtr.base = 0x0000000000000000LL;

        core->segments.ldtr.selector = 0x0000;
        core->segments.ldtr.limit = 0x0000ffff;
        core->segments.ldtr.base = 0x0000000000000000LL;
        core->segments.ldtr.type = 2;
        core->segments.ldtr.present = 1;

        core->segments.tr.selector = 0x0000;
        core->segments.tr.limit = 0x0000ffff;
        core->segments.tr.base = 0x0000000000000000LL;
        core->segments.tr.type = 0xb;
        core->segments.tr.present = 1;

        //      core->dbg_regs.dr6 = 0x00000000ffff0ff0LL;
        core->dbg_regs.dr7 = 0x0000000000000400LL;

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
        vmx_state->sec_proc_ctrls.unrstrct_guest = 1; // enable unrestricted guest operation


        /* Disable shadow paging stuff */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;


        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else {
        PrintError("Invalid Virtual paging mode\n");
        return -1;
    }


    // hook vmx msrs

    // Setup SYSCALL/SYSENTER MSRs in load/store area

    // save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
    {
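        /*
         * Layout of the single msr_area page set up below: three consecutive
         * arrays of four struct vmcs_msr_entry each --
         *   entries  [0..3]: VM-exit store area (guest MSR values saved on exit)
         *   entries  [4..7]: VM-exit load area  (host MSR values restored on exit)
         *   entries [8..11]: VM-entry load area (guest MSR values loaded on entry)
         */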
        int msr_ret = 0;

        struct vmcs_msr_entry * exit_store_msrs = NULL;
        struct vmcs_msr_entry * exit_load_msrs = NULL;
        struct vmcs_msr_entry * entry_load_msrs = NULL;
        int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;

        V3_Print("Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);

        if (max_msrs < 4) {
            PrintError("Max MSR cache size is too small (%d)\n", max_msrs);
            return -1;
        }

        vmx_state->msr_area = V3_VAddr(V3_AllocPages(1));

        if (vmx_state->msr_area == NULL) {
            PrintError("could not allocate msr load/store area\n");
            return -1;
        }

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);


        exit_store_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area);
        exit_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 4));
        entry_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 8));


        exit_store_msrs[0].index = IA32_STAR_MSR;
        exit_store_msrs[1].index = IA32_LSTAR_MSR;
        exit_store_msrs[2].index = IA32_FMASK_MSR;
        exit_store_msrs[3].index = IA32_KERN_GS_BASE_MSR;

        // Copy the MSR indices set above into the exit-load and entry-load areas
        memcpy(exit_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);
        memcpy(entry_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);


        v3_get_msr(IA32_STAR_MSR, &(exit_load_msrs[0].hi), &(exit_load_msrs[0].lo));
        v3_get_msr(IA32_LSTAR_MSR, &(exit_load_msrs[1].hi), &(exit_load_msrs[1].lo));
        v3_get_msr(IA32_FMASK_MSR, &(exit_load_msrs[2].hi), &(exit_load_msrs[2].lo));
        v3_get_msr(IA32_KERN_GS_BASE_MSR, &(exit_load_msrs[3].hi), &(exit_load_msrs[3].lo));

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(exit_store_msrs));
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(exit_load_msrs));
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(entry_load_msrs));


        v3_hook_msr(core->vm_info, IA32_STAR_MSR, NULL, NULL, NULL);
        v3_hook_msr(core->vm_info, IA32_LSTAR_MSR, NULL, NULL, NULL);
        v3_hook_msr(core->vm_info, IA32_FMASK_MSR, NULL, NULL, NULL);
        v3_hook_msr(core->vm_info, IA32_KERN_GS_BASE_MSR, NULL, NULL, NULL);


        // IMPORTANT: These SYSCALL MSRs are currently not handled by hardware or cached
        // We should really emulate these ourselves, or ideally include them in the MSR store area if there is room
        v3_hook_msr(core->vm_info, IA32_CSTAR_MSR, NULL, NULL, NULL);
        v3_hook_msr(core->vm_info, SYSENTER_CS_MSR, NULL, NULL, NULL);
        v3_hook_msr(core->vm_info, SYSENTER_ESP_MSR, NULL, NULL, NULL);
        v3_hook_msr(core->vm_info, SYSENTER_EIP_MSR, NULL, NULL, NULL);

        v3_hook_msr(core->vm_info, FS_BASE_MSR, NULL, NULL, NULL);
        v3_hook_msr(core->vm_info, GS_BASE_MSR, NULL, NULL, NULL);


    }

    /* Sanity check ctrl/reg fields against hw_defaults */




    /*** Write all the info to the VMCS ***/

    /*
    {
        // IS THIS NECESSARY???
#define DEBUGCTL_MSR 0x1d9
        struct v3_msr tmp_msr;
        v3_get_msr(DEBUGCTL_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
        vmx_ret |= check_vmcs_write(VMCS_GUEST_DBG_CTL, tmp_msr.value);
        core->dbg_regs.dr7 = 0x400;
    }
    */

#ifdef __V3_64BIT__
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffffffffffULL);
#else
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffUL);
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR_HIGH, (addr_t)0xffffffffUL);
#endif




    if (v3_update_vmcs_ctrl_fields(core)) {
        PrintError("Could not write control fields!\n");
        return -1;
    }

    if (v3_update_vmcs_host_state(core)) {
        PrintError("Could not write host state\n");
        return -1;
    }

    // reenable global interrupts for vm state initialization now
    // that the vm state is initialized. If another VM kicks us off, 
    // it'll update our vmx state so that we know to reload ourself
    v3_enable_ints();

    return 0;
}

int v3_init_vmx_vmcs(struct guest_info * core, v3_vm_class_t vm_class) {
    struct vmx_data * vmx_state = NULL;
    int vmx_ret = 0;

    vmx_state = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));
    memset(vmx_state, 0, sizeof(struct vmx_data));

    PrintDebug("vmx_data pointer: %p\n", (void *)vmx_state);

    PrintDebug("Allocating VMCS\n");
    vmx_state->vmcs_ptr_phys = allocate_vmcs();

    PrintDebug("VMCS pointer: %p\n", (void *)(vmx_state->vmcs_ptr_phys));

    core->vmm_data = vmx_state;
    vmx_state->state = VMX_UNLAUNCHED;

    PrintDebug("Initializing VMCS (addr=%p)\n", core->vmm_data);

    // TODO: Fix vmcs fields so they're 32-bit

    PrintDebug("Clearing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMCLEAR failed\n");
        return -1; 
    }

    if (vm_class == V3_PC_VM) {
        PrintDebug("Initializing VMCS\n");
        if (init_vmcs_bios(core, vmx_state) == -1) {
            PrintError("Error initializing VMCS to BIOS state\n");
            return -1;
        }
    } else {
        PrintError("Invalid VM Class\n");
        return -1;
    }

    PrintDebug("Serializing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    return 0;
}


int v3_deinit_vmx_vmcs(struct guest_info * core) {
    struct vmx_data * vmx_state = core->vmm_data;

    V3_FreePages((void *)(vmx_state->vmcs_ptr_phys), 1);
    V3_FreePages(V3_PAddr(vmx_state->msr_area), 1);

    V3_Free(vmx_state);

    return 0;
}



#ifdef V3_CONFIG_CHECKPOINT
/* 
 * JRL: This is broken
 */
int v3_vmx_save_core(struct guest_info * core, void * ctx){
    uint64_t vmcs_ptr = vmcs_store();

    v3_chkpt_save(ctx, "vmcs_data", PAGE_SIZE, (void *)vmcs_ptr);

    return 0;
}

int v3_vmx_load_core(struct guest_info * core, void * ctx){
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    struct cr0_32 * shadow_cr0;
    char vmcs[PAGE_SIZE_4KB];

    v3_chkpt_load(ctx, "vmcs_data", PAGE_SIZE_4KB, vmcs);

    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmcs_load((addr_t)vmcs);

    v3_vmx_save_vmcs(core);

    shadow_cr0 = (struct cr0_32 *)&(core->ctrl_regs.cr0);


    /* Get the CPU mode to set the guest_ia32e entry ctrl */

    if (core->shdw_pg_mode == SHADOW_PAGING) {
        if (v3_get_vm_mem_mode(core) == VIRTUAL_MEM) {
            if (v3_activate_shadow_pt(core) == -1) {
                PrintError("Failed to activate shadow page tables\n");
                return -1;
            }
        } else {
            if (v3_activate_passthrough_pt(core) == -1) {
                PrintError("Failed to activate passthrough page tables\n");
                return -1;
            }
        }
    }

    return 0;
}
#endif


void v3_flush_vmx_vm_core(struct guest_info * core) {
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmx_info->state = VMX_UNLAUNCHED;
}



static int update_irq_exit_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));

    if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 0)) {
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Calling v3_injecting_intr\n");
#endif
        info->intr_core_state.irq_started = 0;
        v3_injecting_intr(info, info->intr_core_state.irq_vector, V3_EXTERNAL_IRQ);
    }

    return 0;
}

static int update_irq_entry_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;
    struct vmcs_interrupt_state intr_core_state;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
    check_vmcs_read(VMCS_GUEST_INT_STATE, &(intr_core_state));

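    /* VM-entry interruption-information format (Intel SDM): bits 7:0 vector,
     * bits 10:8 type (0 = external IRQ, 2 = NMI, 3 = hardware exception,
     * 4 = software interrupt), bit 11 deliver-error-code, bit 31 valid */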
    /* Check for pending exceptions to inject */
    if (v3_excp_pending(info)) {
        struct vmx_entry_int_info int_info;
        int_info.value = 0;

        // In VMX, almost every exception is hardware
        // Software exceptions are pretty much only for breakpoint or overflow
        int_info.type = 3;
        int_info.vector = v3_get_excp_number(info);

        if (info->excp_state.excp_error_code_valid) {
            check_vmcs_write(VMCS_ENTRY_EXCP_ERR, info->excp_state.excp_error_code);
            int_info.error_code = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("Injecting exception %d with error code %x\n", 
                    int_info.vector, info->excp_state.excp_error_code);
#endif
        }

        int_info.valid = 1;
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Injecting exception %d (EIP=%p)\n", int_info.vector, (void *)(addr_t)info->rip);
#endif
        check_vmcs_write(VMCS_ENTRY_INT_INFO, int_info.value);

        v3_injecting_excp(info, int_info.vector);

    } else if ((((struct rflags *)&(info->ctrl_regs.rflags))->intr == 1) && 
               (intr_core_state.val == 0)) {

        if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 1)) {

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("IRQ pending from previous injection\n");
#endif

            // Copy the IDT vectoring info over to reinject the old interrupt
            if (idt_vec_info.error_code == 1) {
                uint32_t err_code = 0;

                check_vmcs_read(VMCS_IDT_VECTOR_ERR, &err_code);
                check_vmcs_write(VMCS_ENTRY_EXCP_ERR, err_code);
            }

            idt_vec_info.undef = 0;
            check_vmcs_write(VMCS_ENTRY_INT_INFO, idt_vec_info.value);

        } else {
            struct vmx_entry_int_info ent_int;
            ent_int.value = 0;

            switch (v3_intr_pending(info)) {
                case V3_EXTERNAL_IRQ: {
                    info->intr_core_state.irq_vector = v3_get_intr(info); 
                    ent_int.vector = info->intr_core_state.irq_vector;
                    ent_int.type = 0;
                    ent_int.error_code = 0;
                    ent_int.valid = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
                    V3_Print("Injecting Interrupt %d at exit %u(EIP=%p)\n", 
                               info->intr_core_state.irq_vector, 
                               (uint32_t)info->num_exits, 
                               (void *)(addr_t)info->rip);
#endif

                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);
                    info->intr_core_state.irq_started = 1;

                    break;
                }
                case V3_NMI:
                    PrintDebug("Injecting NMI\n");

                    ent_int.type = 2;
                    ent_int.vector = 2;
                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_SOFTWARE_INTR:
                    PrintDebug("Injecting software interrupt\n");
                    ent_int.type = 4;

                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_VIRTUAL_IRQ:
                    // Not sure what to do here, Intel doesn't have virtual IRQs
                    // May be the same as external interrupts/IRQs

                    break;
                case V3_INVALID_INTR:
                default:
                    break;
            }
        }
    } else if ((v3_intr_pending(info)) && (vmx_info->pri_proc_ctrls.int_wndw_exit == 0)) {
        // Enable INTR window exiting so we know when IF=1
        uint32_t instr_len;

        check_vmcs_read(VMCS_EXIT_INSTR_LEN, &instr_len);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Enabling Interrupt-Window exiting: %d\n", instr_len);
#endif

        vmx_info->pri_proc_ctrls.int_wndw_exit = 1;
        check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
    }


    return 0;
}


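/* Ring buffer of the last 10 exits, indexed by num_exits % 10 */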
static struct vmx_exit_info exit_log[10];

static void print_exit_log(struct guest_info * info) {
    int cnt = info->num_exits % 10;
    int i = 0;


    V3_Print("\nExit Log (%d total exits):\n", (uint32_t)info->num_exits);

    for (i = 0; i < 10; i++) {
        struct vmx_exit_info * tmp = &exit_log[cnt];

        V3_Print("%d:\texit_reason = %p\n", i, (void *)(addr_t)tmp->exit_reason);
        V3_Print("\texit_qual = %p\n", (void *)tmp->exit_qual);
        V3_Print("\tint_info = %p\n", (void *)(addr_t)tmp->int_info);
        V3_Print("\tint_err = %p\n", (void *)(addr_t)tmp->int_err);
        V3_Print("\tinstr_info = %p\n", (void *)(addr_t)tmp->instr_info);

        cnt--;

        if (cnt == -1) {
            cnt = 9;
        }

    }

}

/* 
 * CAUTION and DANGER!!! 
 * 
 * The VMCS CANNOT(!!) be accessed outside of the cli/sti calls inside this function
 * When executing a symbiotic call, the VMCS WILL be overwritten, so any dependencies 
 * on its contents will cause things to break. The contents at the time of the exit WILL 
 * change before the exit handler is executed.
 */
int v3_vmx_enter(struct guest_info * info) {
    int ret = 0;
    uint32_t tsc_offset_low, tsc_offset_high;
    struct vmx_exit_info exit_info;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    // Perform any additional yielding needed for time adjustment
    v3_adjust_time(info);

    // disable global interrupts for vm state transition
    v3_disable_ints();

    // Update timer devices late after being in the VM so that as much 
    // of the time in the VM is accounted for as possible. Also do it before
    // updating IRQ entry state so that any interrupts the timers raise get 
    // handled on the next VM entry. Must be done with interrupts disabled.
    v3_update_timers(info);

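    /* If this CPU currently has some other VMCS loaded (e.g. another core ran
     * here), clear and reload ours; a cleared VMCS must be VMLAUNCHed again */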
    if (vmcs_store() != vmx_info->vmcs_ptr_phys) {
        vmcs_clear(vmx_info->vmcs_ptr_phys);
        vmcs_load(vmx_info->vmcs_ptr_phys);
        vmx_info->state = VMX_UNLAUNCHED;
    }

    v3_vmx_restore_vmcs(info);


#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_entry_state(info);
    }
#else 
    update_irq_entry_state(info);
#endif

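    /* Read the guest CR3 field out of the VMCS and write the same value back */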
    {
        addr_t guest_cr3;
        vmcs_read(VMCS_GUEST_CR3, &guest_cr3);
        vmcs_write(VMCS_GUEST_CR3, guest_cr3);
    }

    // Perform last-minute time bookkeeping prior to entering the VM
    v3_time_enter_vm(info);

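    /* The VMCS TSC-offset field is 64 bits wide; it is written here as two
     * 32-bit halves (VMCS_TSC_OFFSET_HIGH holds the upper word) */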
    tsc_offset_high = (uint32_t)((v3_tsc_host_offset(&info->time_state) >> 32) & 0xffffffff);
    tsc_offset_low = (uint32_t)(v3_tsc_host_offset(&info->time_state) & 0xffffffff);
    check_vmcs_write(VMCS_TSC_OFFSET_HIGH, tsc_offset_high);
    check_vmcs_write(VMCS_TSC_OFFSET, tsc_offset_low);

    if (v3_update_vmcs_host_state(info)) {
        v3_enable_ints();
        PrintError("Could not write host state\n");
        return -1;
    }

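    /* A VMCS must be entered with VMLAUNCH the first time after a VMCLEAR;
     * every later entry on the same VMCS must use VMRESUME */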
    if (vmx_info->state == VMX_UNLAUNCHED) {
        vmx_info->state = VMX_LAUNCHED;
        ret = v3_vmx_launch(&(info->vm_regs), info, &(info->ctrl_regs));
    } else {
        V3_ASSERT(vmx_info->state != VMX_UNLAUNCHED);
        ret = v3_vmx_resume(&(info->vm_regs), info, &(info->ctrl_regs));
    }



    //  PrintDebug("VMX Exit: ret=%d\n", ret);

    if (ret != VMX_SUCCESS) {
        uint32_t error = 0;
        vmcs_read(VMCS_INSTR_ERR, &error);

        v3_enable_ints();

        PrintError("VMENTRY Error: %d (launch_ret = %d)\n", error, ret);
        return -1;
    }



    // Immediate exit from VM time bookkeeping
    v3_time_exit_vm(info);

    info->num_exits++;

    /* Update guest state */
    v3_vmx_save_vmcs(info);

    // info->cpl = info->segments.cs.selector & 0x3;

    info->mem_mode = v3_get_vm_mem_mode(info);
    info->cpu_mode = v3_get_vm_cpu_mode(info);


    check_vmcs_read(VMCS_EXIT_INSTR_LEN, &(exit_info.instr_len));
    check_vmcs_read(VMCS_EXIT_INSTR_INFO, &(exit_info.instr_info));
    check_vmcs_read(VMCS_EXIT_REASON, &(exit_info.exit_reason));
    check_vmcs_read(VMCS_EXIT_QUAL, &(exit_info.exit_qual));
    check_vmcs_read(VMCS_EXIT_INT_INFO, &(exit_info.int_info));
    check_vmcs_read(VMCS_EXIT_INT_ERR, &(exit_info.int_err));
    check_vmcs_read(VMCS_GUEST_LINEAR_ADDR, &(exit_info.guest_linear_addr));

    if (info->shdw_pg_mode == NESTED_PAGING) {
        check_vmcs_read(VMCS_GUEST_PHYS_ADDR, &(exit_info.ept_fault_addr));
    }

    //PrintDebug("VMX Exit taken, id-qual: %u-%lu\n", exit_info.exit_reason, exit_info.exit_qual);

    exit_log[info->num_exits % 10] = exit_info;

#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_exit_state(info);
    }
#else
    update_irq_exit_state(info);
#endif

    if (exit_info.exit_reason == VMEXIT_INTR_WINDOW) {
        // This is a special case whose only job is to inject an interrupt
        vmcs_read(VMCS_PROC_CTRLS, &(vmx_info->pri_proc_ctrls.value));
        vmx_info->pri_proc_ctrls.int_wndw_exit = 0;
        vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
       V3_Print("Interrupts available again! (RIP=%llx)\n", info->rip);
#endif
    }

    // reenable global interrupts after vm exit
    v3_enable_ints();

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    if (v3_handle_vmx_exit(info, &exit_info) == -1) {
        PrintError("Error in VMX exit handler (Exit reason=%x)\n", exit_info.exit_reason);
        return -1;
    }

    return 0;
}


int v3_start_vmx_guest(struct guest_info * info) {

    PrintDebug("Starting VMX core %u\n", info->vcpu_id);

    if (info->vcpu_id == 0) {
        info->core_run_state = CORE_RUNNING;
    } else {

        PrintDebug("VMX core %u: Waiting for core initialization\n", info->vcpu_id);

        while (info->core_run_state == CORE_STOPPED) {

            if (info->vm_info->run_state == VM_STOPPED) {
                // The VM was stopped before this core was initialized. 
                return 0;
            }

            v3_yield(info);
            //PrintDebug("VMX core %u: still waiting for INIT\n",info->vcpu_id);
        }

        PrintDebug("VMX core %u initialized\n", info->vcpu_id);

        // We'll be paranoid about race conditions here
        v3_wait_at_barrier(info);
    }


    PrintDebug("VMX core %u: I am starting at CS=0x%x (base=0x%p, limit=0x%x),  RIP=0x%p\n",
               info->vcpu_id, info->segments.cs.selector, (void *)(info->segments.cs.base),
               info->segments.cs.limit, (void *)(info->rip));


    PrintDebug("VMX core %u: Launching VMX VM on logical core %u\n", info->vcpu_id, info->pcpu_id);

    v3_start_time(info);

    while (1) {

        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }

        if (v3_vmx_enter(info) == -1) {

            addr_t host_addr = 0;
            addr_t linear_addr = 0;

            info->vm_info->run_state = VM_ERROR;

            V3_Print("VMX core %u: VMX ERROR!!\n", info->vcpu_id); 

            v3_print_guest_state(info);

            V3_Print("VMX core %u\n", info->vcpu_id); 

            linear_addr = get_addr_linear(info, info->rip, &(info->segments.cs));

            if (info->mem_mode == PHYSICAL_MEM) {
                v3_gpa_to_hva(info, linear_addr, &host_addr);
            } else if (info->mem_mode == VIRTUAL_MEM) {
                v3_gva_to_hva(info, linear_addr, &host_addr);
            }

            V3_Print("VMX core %u: Host Address of rip = 0x%p\n", info->vcpu_id, (void *)host_addr);

            V3_Print("VMX core %u: Instr (15 bytes) at %p:\n", info->vcpu_id, (void *)host_addr);
            v3_dump_mem((uint8_t *)host_addr, 15);

            v3_print_stack(info);


            v3_print_vmcs();
            print_exit_log(info);
            return -1;
        }

        v3_wait_at_barrier(info);


        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }
/*
        if ((info->num_exits % 5000) == 0) {
            V3_Print("VMX Exit number %d\n", (uint32_t)info->num_exits);
        }
*/

    }

    return 0;
}




#define VMX_FEATURE_CONTROL_MSR     0x0000003a
#define CPUID_VMX_FEATURES 0x00000005  /* LOCK and VMXON */
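/* Bit 0 of IA32_FEATURE_CONTROL is the lock bit and bit 2 enables VMXON
 * outside SMX operation; both (0x5) must be set for VMXON to succeed */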
#define CPUID_1_ECX_VTXFLAG 0x00000020

int v3_is_vmx_capable() {
    v3_msr_t feature_msr;
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

    v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);

    PrintDebug("ECX: 0x%x\n", ecx);

    if (ecx & CPUID_1_ECX_VTXFLAG) {
        v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));

        PrintDebug("MSRREGlow: 0x%.8x\n", feature_msr.lo);

        if ((feature_msr.lo & CPUID_VMX_FEATURES) != CPUID_VMX_FEATURES) {
            PrintDebug("VMX is locked -- enable in the BIOS\n");
            return 0;
        }

    } else {
        PrintDebug("VMX not supported on this cpu\n");
        return 0;
    }

    return 1;
}


int v3_reset_vmx_vm_core(struct guest_info * core, addr_t rip) {
    // init vmcs bios

    if ((core->shdw_pg_mode == NESTED_PAGING) && 
        (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        // easy 
        core->rip = 0;
        core->segments.cs.selector = rip << 8;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = rip << 12;
    } else {
        core->vm_regs.rdx = core->vcpu_id;
        core->vm_regs.rbx = rip;
    }

    return 0;
}



void v3_init_vmx_cpu(int cpu_id) {
    addr_t vmx_on_region = 0;

    if (cpu_id == 0) {
        if (v3_init_vmx_hw(&hw_info) == -1) {
            PrintError("Could not initialize VMX hardware features on cpu %d\n", cpu_id);
            return;
        }
    }

    enable_vmx();


    // Setup VMXON Region
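    /* A VMXON region has the same size and revision-id requirement as a
     * VMCS, so the VMCS allocator is reused here */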
    vmx_on_region = allocate_vmcs();


    if (vmx_on(vmx_on_region) == VMX_SUCCESS) {
        V3_Print("VMX Enabled\n");
        host_vmcs_ptrs[cpu_id] = vmx_on_region;
    } else {
        V3_Print("VMX already enabled\n");
        V3_FreePages((void *)vmx_on_region, 1);
    }

    PrintDebug("VMXON pointer: 0x%p\n", (void *)host_vmcs_ptrs[cpu_id]);

    {
        struct vmx_sec_proc_ctrls sec_proc_ctrls;
        sec_proc_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.sec_proc_ctrls));

        if (sec_proc_ctrls.enable_ept == 0) {
            V3_Print("VMX EPT (Nested) Paging not supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_CPU;
        } else if (sec_proc_ctrls.unrstrct_guest == 0) {
            V3_Print("VMX EPT (Nested) Paging supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_CPU;
        } else {
            V3_Print("VMX EPT (Nested) Paging + Unrestricted guest supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_UG_CPU;
        }
    }
}


void v3_deinit_vmx_cpu(int cpu_id) {
    extern v3_cpu_arch_t v3_cpu_types[];
    v3_cpu_types[cpu_id] = V3_INVALID_CPU;

    if (host_vmcs_ptrs[cpu_id] != 0) {
        V3_Print("Disabling VMX\n");

        if (vmx_off() != VMX_SUCCESS) {
            PrintError("Error executing VMXOFF\n");
        }

        V3_FreePages((void *)host_vmcs_ptrs[cpu_id], 1);

        host_vmcs_ptrs[cpu_id] = 0;
    }
}