Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git

This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, simply execute

  cd palacios
  git checkout --track -b devel origin/devel

The other branches are similar.
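
For example, you can list the remote branches first and then track whichever one you want (standard git; substitute a real branch name from the listing for the placeholder):

  git branch -r
  git checkout --track -b <branch> origin/<branch>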


Minor time fixes
[palacios.git] / palacios / src / palacios / vmx.c
/* 
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National 
 * Science Foundation and the Department of Energy.  
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at 
 * http://www.v3vee.org
 *
 * Copyright (c) 2011, Jack Lange <jarusl@cs.northwestern.edu>
 * Copyright (c) 2011, The V3VEE Project <http://www.v3vee.org> 
 * All rights reserved.
 *
 * Author: Jack Lange <jarusl@cs.northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */


#include <palacios/vmx.h>
#include <palacios/vmm.h>
#include <palacios/vmx_handler.h>
#include <palacios/vmcs.h>
#include <palacios/vmx_lowlevel.h>
#include <palacios/vmm_lowlevel.h>
#include <palacios/vmm_ctrl_regs.h>
#include <palacios/vmm_config.h>
#include <palacios/vmm_time.h>
#include <palacios/vm_guest_mem.h>
#include <palacios/vmm_direct_paging.h>
#include <palacios/vmx_io.h>
#include <palacios/vmx_msr.h>
#include <palacios/vmm_decoder.h>
#include <palacios/vmm_barrier.h>

#ifdef V3_CONFIG_CHECKPOINT
#include <palacios/vmm_checkpoint.h>
#endif

#include <palacios/vmx_ept.h>
#include <palacios/vmx_assist.h>
#include <palacios/vmx_hw_info.h>

#ifndef V3_CONFIG_DEBUG_VMX
#undef PrintDebug
#define PrintDebug(fmt, args...)
#endif


/* These fields contain the hardware feature sets supported by the local CPU */
static struct vmx_hw_info hw_info;

extern v3_cpu_arch_t v3_cpu_types[];

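/* Physical addresses of the per-CPU VMXON regions; a value of 0 means
 * VMX has not been enabled on that CPU. */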
static addr_t host_vmcs_ptrs[V3_CONFIG_MAX_CPUS] = { [0 ... V3_CONFIG_MAX_CPUS - 1] = 0};

extern int v3_vmx_launch(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
extern int v3_vmx_resume(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);

static inline int check_vmcs_write(vmcs_field_t field, addr_t val) {
    int ret = 0;

    ret = vmcs_write(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMWRITE error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
        return 1;
    }

    return 0;
}

static inline int check_vmcs_read(vmcs_field_t field, void * val) {
    int ret = 0;

    ret = vmcs_read(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMREAD error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
    }

    return ret;
}




static addr_t allocate_vmcs(void) {
    struct vmcs_data * vmcs_page = NULL;

    PrintDebug("Allocating page\n");

    vmcs_page = (struct vmcs_data *)V3_VAddr(V3_AllocPages(1));
    memset(vmcs_page, 0, 4096);

    vmcs_page->revision = hw_info.basic_info.revision;
    PrintDebug("VMX Revision: 0x%x\n", vmcs_page->revision);

    return (addr_t)V3_PAddr((void *)vmcs_page);
}




static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state) {
    int vmx_ret = 0;

    // disable global interrupts for vm state initialization
    v3_disable_ints();

    PrintDebug("Loading VMCS\n");
    vmx_ret = vmcs_load(vmx_state->vmcs_ptr_phys);
    vmx_state->state = VMX_UNLAUNCHED;

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMPTRLD failed\n");
        return -1;
    }


    /*** Setup default state from HW ***/

    vmx_state->pin_ctrls.value = hw_info.pin_ctrls.def_val;
    vmx_state->pri_proc_ctrls.value = hw_info.proc_ctrls.def_val;
    vmx_state->exit_ctrls.value = hw_info.exit_ctrls.def_val;
    vmx_state->entry_ctrls.value = hw_info.entry_ctrls.def_val;
    vmx_state->sec_proc_ctrls.value = hw_info.sec_proc_ctrls.def_val;

    /* Print Control MSRs */
    PrintDebug("CR0 MSR: %p\n", (void *)(addr_t)hw_info.cr0.value);
    PrintDebug("CR4 MSR: %p\n", (void *)(addr_t)hw_info.cr4.value);



    /******* Setup Host State **********/

    /* Cache GDTR, IDTR, and TR in host struct */


    /********** Setup VMX Control Fields ***********/

    /* Add external interrupts, NMI exiting, and virtual NMI */
    vmx_state->pin_ctrls.nmi_exit = 1;
    vmx_state->pin_ctrls.ext_int_exit = 1;


    vmx_state->pri_proc_ctrls.hlt_exit = 1;


    vmx_state->pri_proc_ctrls.pause_exit = 0;
    vmx_state->pri_proc_ctrls.tsc_offset = 1;
#ifdef V3_CONFIG_TIME_VIRTUALIZE_TSC
    vmx_state->pri_proc_ctrls.rdtsc_exit = 1;
#endif

    /* Setup IO map */
    vmx_state->pri_proc_ctrls.use_io_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_A_ADDR, (addr_t)V3_PAddr(core->vm_info->io_map.arch_data));
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_B_ADDR, 
            (addr_t)V3_PAddr(core->vm_info->io_map.arch_data) + PAGE_SIZE_4KB);


    vmx_state->pri_proc_ctrls.use_msr_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_MSR_BITMAP, (addr_t)V3_PAddr(core->vm_info->msr_map.arch_data));



#ifdef __V3_64BIT__
    // Ensure host runs in 64-bit mode at each VM EXIT
    vmx_state->exit_ctrls.host_64_on = 1;
#endif

    // Hook all accesses to EFER register
    v3_hook_msr(core->vm_info, EFER_MSR, 
                &v3_handle_efer_read,
                &v3_handle_efer_write, 
                core);

    // Restore host's EFER register on each VM EXIT
    vmx_state->exit_ctrls.ld_efer = 1;

    // Save/restore guest's EFER register to/from VMCS on VM EXIT/ENTRY
    vmx_state->exit_ctrls.save_efer = 1;
    vmx_state->entry_ctrls.ld_efer  = 1;

    // Cause VM_EXIT whenever CR4.VMXE or CR4.PAE bits are written
    vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);


    // Set up the guest's initial PAT field
    vmx_ret |= check_vmcs_write(VMCS_GUEST_PAT, 0x0007040600070406LL);

    /* Setup paging */
    if (core->shdw_pg_mode == SHADOW_PAGING) {
        PrintDebug("Creating initial shadow page table\n");

        if (v3_init_passthrough_pts(core) == -1) {
            PrintError("Could not initialize passthrough page tables\n");
            return -1;
        }

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        core->ctrl_regs.cr3 = core->direct_map_pt;

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Add CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 1;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 1;

        vmx_state->pri_proc_ctrls.invlpg_exit = 1;

        /* Add page fault exits */
        vmx_state->excp_bmap.pf = 1;

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_CPU)) {

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Disable CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;

        /* Add page fault exits */
        //      vmx_state->excp_bmap.pf = 1; // This should never happen..., enabled to catch bugs

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging



        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        int i = 0;
        // For now we will assume that unrestricted guest mode is assured w/ EPT


        core->vm_regs.rsp = 0x00;
        core->rip = 0xfff0;
        core->vm_regs.rdx = 0x00000f00;
        core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
        core->ctrl_regs.cr0 = 0x00000030; 
        core->ctrl_regs.cr4 = 0x00002010; // Enable VMX and PSE flag


        core->segments.cs.selector = 0xf000;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = 0x0000000f0000LL;

        // (raw attributes = 0xf3)
        core->segments.cs.type = 0xb;
        core->segments.cs.system = 0x1;
        core->segments.cs.dpl = 0x0;
        core->segments.cs.present = 1;



        struct v3_segment * segregs [] = {&(core->segments.ss), &(core->segments.ds), 
                                          &(core->segments.es), &(core->segments.fs), 
                                          &(core->segments.gs), NULL};

        for ( i = 0; segregs[i] != NULL; i++) {
            struct v3_segment * seg = segregs[i];

            seg->selector = 0x0000;
            //    seg->base = seg->selector << 4;
            seg->base = 0x00000000;
            seg->limit = 0xffff;


            seg->type = 0x3;
            seg->system = 0x1;
            seg->dpl = 0x0;
            seg->present = 1;
            //    seg->granularity = 1;

        }


        core->segments.gdtr.limit = 0x0000ffff;
        core->segments.gdtr.base = 0x0000000000000000LL;

        core->segments.idtr.limit = 0x0000ffff;
        core->segments.idtr.base = 0x0000000000000000LL;

        core->segments.ldtr.selector = 0x0000;
        core->segments.ldtr.limit = 0x0000ffff;
        core->segments.ldtr.base = 0x0000000000000000LL;
        core->segments.ldtr.type = 0x2;
        core->segments.ldtr.present = 1;

        core->segments.tr.selector = 0x0000;
        core->segments.tr.limit = 0x0000ffff;
        core->segments.tr.base = 0x0000000000000000LL;
        core->segments.tr.type = 0xb;
        core->segments.tr.present = 1;

        //      core->dbg_regs.dr6 = 0x00000000ffff0ff0LL;
        core->dbg_regs.dr7 = 0x0000000000000400LL;

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
        vmx_state->sec_proc_ctrls.unrstrct_guest = 1; // enable unrestricted guest operation


        /* Disable shadow paging stuff */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;


        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else {
        PrintError("Invalid Virtual paging mode\n");
        return -1;
    }


    // hook vmx msrs

    // Setup SYSCALL/SYSENTER MSRs in load/store area

    // save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
    {

        struct vmcs_msr_save_area * msr_entries = NULL;
        int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;
        int msr_ret = 0;

        V3_Print("Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);

        if (max_msrs < 4) {
            PrintError("Max MSR cache size is too small (%d)\n", max_msrs);
            return -1;
        }

        vmx_state->msr_area_paddr = (addr_t)V3_AllocPages(1);

        if (vmx_state->msr_area_paddr == (addr_t)NULL) {
            PrintError("could not allocate msr load/store area\n");
            return -1;
        }

        msr_entries = (struct vmcs_msr_save_area *)V3_VAddr((void *)(vmx_state->msr_area_paddr));
        vmx_state->msr_area = msr_entries; // cache in vmx_info

        memset(msr_entries, 0, PAGE_SIZE);

        msr_entries->guest_star.index = IA32_STAR_MSR;
        msr_entries->guest_lstar.index = IA32_LSTAR_MSR;
        msr_entries->guest_fmask.index = IA32_FMASK_MSR;
        msr_entries->guest_kern_gs.index = IA32_KERN_GS_BASE_MSR;

        msr_entries->host_star.index = IA32_STAR_MSR;
        msr_entries->host_lstar.index = IA32_LSTAR_MSR;
        msr_entries->host_fmask.index = IA32_FMASK_MSR;
        msr_entries->host_kern_gs.index = IA32_KERN_GS_BASE_MSR;

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(msr_entries->guest_msrs));
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(msr_entries->guest_msrs));
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(msr_entries->host_msrs));


        msr_ret |= v3_hook_msr(core->vm_info, IA32_STAR_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, IA32_LSTAR_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, IA32_FMASK_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, IA32_KERN_GS_BASE_MSR, NULL, NULL, NULL);


        // IMPORTANT: These MSRs appear to be cached by the hardware....
        msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_CS_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_ESP_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_EIP_MSR, NULL, NULL, NULL);

        msr_ret |= v3_hook_msr(core->vm_info, FS_BASE_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, GS_BASE_MSR, NULL, NULL, NULL);


        // Not sure what to do about this... Does not appear to be an explicit hardware cache version...
        msr_ret |= v3_hook_msr(core->vm_info, IA32_CSTAR_MSR, NULL, NULL, NULL);

        if (msr_ret != 0) {
            PrintError("Error configuring MSR save/restore area\n");
            return -1;
        }


    }

    /* Sanity check ctrl/reg fields against hw_defaults */




    /*** Write all the info to the VMCS ***/

    /*
    {
        // IS THIS NECESSARY???
#define DEBUGCTL_MSR 0x1d9
        struct v3_msr tmp_msr;
        v3_get_msr(DEBUGCTL_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
        vmx_ret |= check_vmcs_write(VMCS_GUEST_DBG_CTL, tmp_msr.value);
        core->dbg_regs.dr7 = 0x400;
    }
    */

#ifdef __V3_64BIT__
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffffffffffULL);
#else
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffUL);
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR_HIGH, (addr_t)0xffffffffUL);
#endif



    if (v3_update_vmcs_ctrl_fields(core)) {
        PrintError("Could not write control fields!\n");
        return -1;
    }

    /*
    if (v3_update_vmcs_host_state(core)) {
        PrintError("Could not write host state\n");
        return -1;
    }
    */

    // Re-enable global interrupts now that the VM state is initialized.
    // If another VM kicks us off this CPU, it will update our vmx state
    // so that we know to reload ourselves.
    v3_enable_ints();

    return 0;
}

int v3_init_vmx_vmcs(struct guest_info * core, v3_vm_class_t vm_class) {
    struct vmx_data * vmx_state = NULL;
    int vmx_ret = 0;

    vmx_state = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));
    memset(vmx_state, 0, sizeof(struct vmx_data));

    PrintDebug("vmx_data pointer: %p\n", (void *)vmx_state);

    PrintDebug("Allocating VMCS\n");
    vmx_state->vmcs_ptr_phys = allocate_vmcs();

    PrintDebug("VMCS pointer: %p\n", (void *)(vmx_state->vmcs_ptr_phys));

    core->vmm_data = vmx_state;
    vmx_state->state = VMX_UNLAUNCHED;

    PrintDebug("Initializing VMCS (addr=%p)\n", core->vmm_data);

    // TODO: Fix vmcs fields so they're 32-bit

    PrintDebug("Clearing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMCLEAR failed\n");
        return -1; 
    }

    if (vm_class == V3_PC_VM) {
        PrintDebug("Initializing VMCS\n");
        if (init_vmcs_bios(core, vmx_state) == -1) {
            PrintError("Error initializing VMCS to BIOS state\n");
            return -1;
        }
    } else {
        PrintError("Invalid VM Class\n");
        return -1;
    }

    PrintDebug("Serializing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    return 0;
}


int v3_deinit_vmx_vmcs(struct guest_info * core) {
    struct vmx_data * vmx_state = core->vmm_data;

    V3_FreePages((void *)(vmx_state->vmcs_ptr_phys), 1);
    V3_FreePages(V3_PAddr(vmx_state->msr_area), 1);

    V3_Free(vmx_state);

    return 0;
}



#ifdef V3_CONFIG_CHECKPOINT
/* 
 * JRL: This is broken
 */
int v3_vmx_save_core(struct guest_info * core, void * ctx){
    uint64_t vmcs_ptr = vmcs_store();

    v3_chkpt_save(ctx, "vmcs_data", PAGE_SIZE, (void *)vmcs_ptr);

    return 0;
}

int v3_vmx_load_core(struct guest_info * core, void * ctx){
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    struct cr0_32 * shadow_cr0;
    char vmcs[PAGE_SIZE_4KB];

    v3_chkpt_load(ctx, "vmcs_data", PAGE_SIZE_4KB, vmcs);

    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmcs_load((addr_t)vmcs);

    v3_vmx_save_vmcs(core);

    shadow_cr0 = (struct cr0_32 *)&(core->ctrl_regs.cr0);


    /* Get the CPU mode to set the guest_ia32e entry ctrl */

    if (core->shdw_pg_mode == SHADOW_PAGING) {
        if (v3_get_vm_mem_mode(core) == VIRTUAL_MEM) {
            if (v3_activate_shadow_pt(core) == -1) {
                PrintError("Failed to activate shadow page tables\n");
                return -1;
            }
        } else {
            if (v3_activate_passthrough_pt(core) == -1) {
                PrintError("Failed to activate passthrough page tables\n");
                return -1;
            }
        }
    }

    return 0;
}
#endif


void v3_flush_vmx_vm_core(struct guest_info * core) {
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmx_info->state = VMX_UNLAUNCHED;
}



static int update_irq_exit_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));

    if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 0)) {
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Calling v3_injecting_intr\n");
#endif
        info->intr_core_state.irq_started = 0;
        v3_injecting_intr(info, info->intr_core_state.irq_vector, V3_EXTERNAL_IRQ);
    }

    return 0;
}

static int update_irq_entry_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;
    struct vmcs_interrupt_state intr_core_state;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
    check_vmcs_read(VMCS_GUEST_INT_STATE, &(intr_core_state));

    /* Check for pending exceptions to inject */
    if (v3_excp_pending(info)) {
        struct vmx_entry_int_info int_info;
        int_info.value = 0;

        // In VMX, almost every exception is hardware
        // Software exceptions are pretty much only for breakpoint or overflow
        int_info.type = 3;
        int_info.vector = v3_get_excp_number(info);

        if (info->excp_state.excp_error_code_valid) {
            check_vmcs_write(VMCS_ENTRY_EXCP_ERR, info->excp_state.excp_error_code);
            int_info.error_code = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("Injecting exception %d with error code %x\n", 
                    int_info.vector, info->excp_state.excp_error_code);
#endif
        }

        int_info.valid = 1;
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Injecting exception %d (EIP=%p)\n", int_info.vector, (void *)(addr_t)info->rip);
#endif
        check_vmcs_write(VMCS_ENTRY_INT_INFO, int_info.value);

        v3_injecting_excp(info, int_info.vector);

    } else if ((((struct rflags *)&(info->ctrl_regs.rflags))->intr == 1) && 
               (intr_core_state.val == 0)) {

        if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 1)) {

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("IRQ pending from previous injection\n");
#endif

            // Copy the IDT vectoring info over to reinject the old interrupt
            if (idt_vec_info.error_code == 1) {
                uint32_t err_code = 0;

                check_vmcs_read(VMCS_IDT_VECTOR_ERR, &err_code);
                check_vmcs_write(VMCS_ENTRY_EXCP_ERR, err_code);
            }

            idt_vec_info.undef = 0;
            check_vmcs_write(VMCS_ENTRY_INT_INFO, idt_vec_info.value);

        } else {
            struct vmx_entry_int_info ent_int;
            ent_int.value = 0;

            switch (v3_intr_pending(info)) {
                case V3_EXTERNAL_IRQ: {
                    info->intr_core_state.irq_vector = v3_get_intr(info); 
                    ent_int.vector = info->intr_core_state.irq_vector;
                    ent_int.type = 0;
                    ent_int.error_code = 0;
                    ent_int.valid = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
                    V3_Print("Injecting Interrupt %d at exit %u (EIP=%p)\n", 
                               info->intr_core_state.irq_vector, 
                               (uint32_t)info->num_exits, 
                               (void *)(addr_t)info->rip);
#endif

                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);
                    info->intr_core_state.irq_started = 1;

                    break;
                }
                case V3_NMI:
                    PrintDebug("Injecting NMI\n");

                    ent_int.type = 2;
                    ent_int.vector = 2;
                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_SOFTWARE_INTR:
                    PrintDebug("Injecting software interrupt\n");
                    ent_int.type = 4;

                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_VIRTUAL_IRQ:
                    // Not sure what to do here, Intel doesn't have virtual IRQs
                    // May be the same as external interrupts/IRQs

                    break;
                case V3_INVALID_INTR:
                default:
                    break;
            }
        }
    } else if ((v3_intr_pending(info)) && (vmx_info->pri_proc_ctrls.int_wndw_exit == 0)) {
        // Enable INTR window exiting so we know when IF=1
        uint32_t instr_len;

        check_vmcs_read(VMCS_EXIT_INSTR_LEN, &instr_len);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Enabling Interrupt-Window exiting: %d\n", instr_len);
#endif

        vmx_info->pri_proc_ctrls.int_wndw_exit = 1;
        check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
    }


    return 0;
}



static struct vmx_exit_info exit_log[10];

static void print_exit_log(struct guest_info * info) {
    int cnt = info->num_exits % 10;
    int i = 0;


    V3_Print("\nExit Log (%d total exits):\n", (uint32_t)info->num_exits);

    for (i = 0; i < 10; i++) {
        struct vmx_exit_info * tmp = &exit_log[cnt];

        V3_Print("%d:\texit_reason = %p\n", i, (void *)(addr_t)tmp->exit_reason);
        V3_Print("\texit_qual = %p\n", (void *)tmp->exit_qual);
        V3_Print("\tint_info = %p\n", (void *)(addr_t)tmp->int_info);
        V3_Print("\tint_err = %p\n", (void *)(addr_t)tmp->int_err);
        V3_Print("\tinstr_info = %p\n", (void *)(addr_t)tmp->instr_info);

        cnt--;

        if (cnt == -1) {
            cnt = 9;
        }

    }

}

int
v3_vmx_schedule_timeout(struct guest_info * info)
{
    struct vmx_data * vmx_state = (struct vmx_data *)(info->vmm_data);
    sint64_t cycles;
    uint32_t timeout;

    /* Check if the hardware supports an active timeout */
#define VMX_ACTIVE_PREEMPT_TIMER_PIN 0x40
    if (hw_info.pin_ctrls.req_mask & VMX_ACTIVE_PREEMPT_TIMER_PIN) {
        /* The hardware doesn't support us modifying this pin control */
        return 0;
    }

    /* Check if we have one to schedule and schedule it if we do */
    cycles = (sint64_t)info->time_state.next_timeout - (sint64_t)v3_get_guest_time(&info->time_state);
    if (info->time_state.next_timeout == (ullong_t) -1)  {
        timeout = 0;
        vmx_state->pin_ctrls.active_preempt_timer = 0;
    } else if (cycles < 0) {
        /* Set the timeout to 0 to force an immediate re-exit, since the timeout expired
         * between when we checked it and now. IF SOMEONE CONTINUALLY SETS A SHORT TIMEOUT,
         * THIS CAN LOCK US OUT OF THE GUEST! */
        timeout = 0;
        vmx_state->pin_ctrls.active_preempt_timer = 1;
    } else {
        /* The hardware supports scheduling a timeout, and we have one to 
         * schedule */
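        /* The preemption timer counts down at the TSC rate divided by
         * 2^tsc_multiple (a rate divisor the hardware reports in its VMX
         * MISC info), so convert TSC cycles to timer ticks by shifting. */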
        timeout = (uint32_t)cycles >> hw_info.misc_info.tsc_multiple;
        vmx_state->pin_ctrls.active_preempt_timer = 1;
    }

    /* Actually program the timer based on the settings above. */
    check_vmcs_write(VMCS_PREEMPT_TIMER, timeout);
    check_vmcs_write(VMCS_PIN_CTRLS, vmx_state->pin_ctrls.value);
    return 0;
}

/* 
 * CAUTION and DANGER!!! 
 * 
 * The VMCS CANNOT(!!) be accessed outside of the cli/sti calls inside this function.
 * When executing a symbiotic call, the VMCS WILL be overwritten, so any dependencies 
 * on its contents will cause things to break. The contents at the time of the exit WILL 
 * change before the exit handler is executed.
 */
int v3_vmx_enter(struct guest_info * info) {
    int ret = 0;
    sint64_t tsc_offset;
    uint32_t tsc_offset_low, tsc_offset_high;
    struct vmx_exit_info exit_info;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    // Perform any additional yielding needed for time adjustment
    v3_adjust_time(info);

    // Check for timeout - since this calls generic hooks in devices
    // that may do things like pause the VM, it cannot be done with
    // interrupts disabled.
    v3_check_timeout(info);

    // disable global interrupts for vm state transition
    v3_disable_ints();

    // Update timer devices late after being in the VM so that as much 
    // of the time in the VM is accounted for as possible. Also do it before
    // updating IRQ entry state so that any interrupts the timers raise get 
    // handled on the next VM entry. Must be done with interrupts disabled.
    v3_update_timers(info);

    if (vmcs_store() != vmx_info->vmcs_ptr_phys) {
        vmcs_clear(vmx_info->vmcs_ptr_phys);
        vmcs_load(vmx_info->vmcs_ptr_phys);
        vmx_info->state = VMX_UNLAUNCHED;
    }

    v3_vmx_restore_vmcs(info);


#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_entry_state(info);
    }
#else 
    update_irq_entry_state(info);
#endif

    {
        addr_t guest_cr3;
        vmcs_read(VMCS_GUEST_CR3, &guest_cr3);
        vmcs_write(VMCS_GUEST_CR3, guest_cr3);
    }

    // Update vmx active preemption timer to exit at the next timeout if 
    // the hardware supports it.
    v3_vmx_schedule_timeout(info);

    // Perform last-minute time bookkeeping prior to entering the VM
    v3_time_enter_vm(info);

    tsc_offset = v3_tsc_host_offset(&info->time_state);
    tsc_offset_high = (uint32_t)(( tsc_offset >> 32) & 0xffffffff);
    tsc_offset_low = (uint32_t)(tsc_offset & 0xffffffff);

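    // The TSC offset is a 64-bit VMCS field; it is written here as two
    // 32-bit halves so the same code works on 32-bit and 64-bit hosts.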
    check_vmcs_write(VMCS_TSC_OFFSET_HIGH, tsc_offset_high);
    check_vmcs_write(VMCS_TSC_OFFSET, tsc_offset_low);

    if (v3_update_vmcs_host_state(info)) {
        v3_enable_ints();
        PrintError("Could not write host state\n");
        return -1;
    }


    if (vmx_info->state == VMX_UNLAUNCHED) {
        vmx_info->state = VMX_LAUNCHED;
        ret = v3_vmx_launch(&(info->vm_regs), info, &(info->ctrl_regs));
    } else {
        V3_ASSERT(vmx_info->state != VMX_UNLAUNCHED);
        ret = v3_vmx_resume(&(info->vm_regs), info, &(info->ctrl_regs));
    }



    //  PrintDebug("VMX Exit: ret=%d\n", ret);

    if (ret != VMX_SUCCESS) {
        uint32_t error = 0;
        vmcs_read(VMCS_INSTR_ERR, &error);

        v3_enable_ints();

        PrintError("VMENTRY Error: %d (launch_ret = %d)\n", error, ret);
        return -1;
    }



    // Immediate exit from VM time bookkeeping
    v3_time_exit_vm(info);

    info->num_exits++;

    /* Update guest state */
    v3_vmx_save_vmcs(info);

    // info->cpl = info->segments.cs.selector & 0x3;

    info->mem_mode = v3_get_vm_mem_mode(info);
    info->cpu_mode = v3_get_vm_cpu_mode(info);


    check_vmcs_read(VMCS_EXIT_INSTR_LEN, &(exit_info.instr_len));
    check_vmcs_read(VMCS_EXIT_INSTR_INFO, &(exit_info.instr_info));
    check_vmcs_read(VMCS_EXIT_REASON, &(exit_info.exit_reason));
    check_vmcs_read(VMCS_EXIT_QUAL, &(exit_info.exit_qual));
    check_vmcs_read(VMCS_EXIT_INT_INFO, &(exit_info.int_info));
    check_vmcs_read(VMCS_EXIT_INT_ERR, &(exit_info.int_err));
    check_vmcs_read(VMCS_GUEST_LINEAR_ADDR, &(exit_info.guest_linear_addr));

    if (info->shdw_pg_mode == NESTED_PAGING) {
        check_vmcs_read(VMCS_GUEST_PHYS_ADDR, &(exit_info.ept_fault_addr));
    }

    //PrintDebug("VMX Exit taken, id-qual: %u-%lu\n", exit_info.exit_reason, exit_info.exit_qual);

    exit_log[info->num_exits % 10] = exit_info;

#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_exit_state(info);
    }
#else
    update_irq_exit_state(info);
#endif

    if (exit_info.exit_reason == VMEXIT_INTR_WINDOW) {
        // This is a special case whose only job is to inject an interrupt
        vmcs_read(VMCS_PROC_CTRLS, &(vmx_info->pri_proc_ctrls.value));
        vmx_info->pri_proc_ctrls.int_wndw_exit = 0;
        vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Interrupts available again! (RIP=%llx)\n", info->rip);
#endif
    }

    // reenable global interrupts after vm exit
    v3_enable_ints();

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    if (v3_handle_vmx_exit(info, &exit_info) == -1) {
        PrintError("Error in VMX exit handler (Exit reason=%x)\n", exit_info.exit_reason);
        return -1;
    }

    return 0;
}


int v3_start_vmx_guest(struct guest_info * info) {

    PrintDebug("Starting VMX core %u\n", info->vcpu_id);

    if (info->vcpu_id == 0) {
        info->core_run_state = CORE_RUNNING;
    } else {

        PrintDebug("VMX core %u: Waiting for core initialization\n", info->vcpu_id);

        while (info->core_run_state == CORE_STOPPED) {

            if (info->vm_info->run_state == VM_STOPPED) {
                // The VM was stopped before this core was initialized. 
                return 0;
            }

            v3_yield(info);
            //PrintDebug("VMX core %u: still waiting for INIT\n",info->vcpu_id);
        }

        PrintDebug("VMX core %u initialized\n", info->vcpu_id);

        // We'll be paranoid about race conditions here
        v3_wait_at_barrier(info);
    }


    PrintDebug("VMX core %u: I am starting at CS=0x%x (base=0x%p, limit=0x%x), RIP=0x%p\n",
               info->vcpu_id, info->segments.cs.selector, (void *)(info->segments.cs.base),
               info->segments.cs.limit, (void *)(info->rip));


    PrintDebug("VMX core %u: Launching VMX VM on logical core %u\n", info->vcpu_id, info->pcpu_id);

    v3_start_time(info);

    while (1) {

        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }

        if (v3_vmx_enter(info) == -1) {

            addr_t host_addr;
            addr_t linear_addr = 0;

            info->vm_info->run_state = VM_ERROR;

            V3_Print("VMX core %u: VMX ERROR!!\n", info->vcpu_id); 

            v3_print_guest_state(info);

            V3_Print("VMX core %u\n", info->vcpu_id); 

            linear_addr = get_addr_linear(info, info->rip, &(info->segments.cs));

            if (info->mem_mode == PHYSICAL_MEM) {
                v3_gpa_to_hva(info, linear_addr, &host_addr);
            } else if (info->mem_mode == VIRTUAL_MEM) {
                v3_gva_to_hva(info, linear_addr, &host_addr);
            }

            V3_Print("VMX core %u: Host Address of rip = 0x%p\n", info->vcpu_id, (void *)host_addr);

            V3_Print("VMX core %u: Instr (15 bytes) at %p:\n", info->vcpu_id, (void *)host_addr);
            v3_dump_mem((uint8_t *)host_addr, 15);

            v3_print_stack(info);


            v3_print_vmcs();
            print_exit_log(info);
            return -1;
        }

        v3_wait_at_barrier(info);


        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }
/*
        if ((info->num_exits % 5000) == 0) {
            V3_Print("VMX Exit number %d\n", (uint32_t)info->num_exits);
        }
*/

    }

    return 0;
}



#define VMX_FEATURE_CONTROL_MSR     0x0000003a
#define CPUID_VMX_FEATURES 0x00000005  /* LOCK and VMXON */
#define CPUID_1_ECX_VTXFLAG 0x00000020

int v3_is_vmx_capable(void) {
    v3_msr_t feature_msr;
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

    v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);

    PrintDebug("ECX: 0x%x\n", ecx);

    if (ecx & CPUID_1_ECX_VTXFLAG) {
        v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));

        PrintDebug("MSRREGlow: 0x%.8x\n", feature_msr.lo);

        if ((feature_msr.lo & CPUID_VMX_FEATURES) != CPUID_VMX_FEATURES) {
            PrintDebug("VMX is locked -- enable in the BIOS\n");
            return 0;
        }

    } else {
        PrintDebug("VMX not supported on this cpu\n");
        return 0;
    }

    return 1;
}


int v3_reset_vmx_vm_core(struct guest_info * core, addr_t rip) {
    // init vmcs bios

    if ((core->shdw_pg_mode == NESTED_PAGING) && 
        (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        // easy 
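        // A SIPI vector V starts a core in real mode at CS selector
        // V << 8 (CS base V << 12) with IP = 0, which is what the
        // assignments below encode; here `rip` carries the SIPI vector.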
        core->rip = 0;
        core->segments.cs.selector = rip << 8;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = rip << 12;
    } else {
        core->vm_regs.rdx = core->vcpu_id;
        core->vm_regs.rbx = rip;
    }

    return 0;
}



void v3_init_vmx_cpu(int cpu_id) {
    addr_t vmx_on_region = 0;

    if (cpu_id == 0) {
        if (v3_init_vmx_hw(&hw_info) == -1) {
            PrintError("Could not initialize VMX hardware features on cpu %d\n", cpu_id);
            return;
        }
    }

    enable_vmx();


    // Setup VMXON Region
    vmx_on_region = allocate_vmcs();


    if (vmx_on(vmx_on_region) == VMX_SUCCESS) {
        V3_Print("VMX Enabled\n");
        host_vmcs_ptrs[cpu_id] = vmx_on_region;
    } else {
        V3_Print("VMX already enabled\n");
        V3_FreePages((void *)vmx_on_region, 1);
    }

    PrintDebug("VMXON pointer: 0x%p\n", (void *)host_vmcs_ptrs[cpu_id]);

    {
        struct vmx_sec_proc_ctrls sec_proc_ctrls;
        sec_proc_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.sec_proc_ctrls));

        if (sec_proc_ctrls.enable_ept == 0) {
            V3_Print("VMX EPT (Nested) Paging not supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_CPU;
        } else if (sec_proc_ctrls.unrstrct_guest == 0) {
            V3_Print("VMX EPT (Nested) Paging supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_CPU;
        } else {
            V3_Print("VMX EPT (Nested) Paging + Unrestricted guest supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_UG_CPU;
        }
    }
}


void v3_deinit_vmx_cpu(int cpu_id) {
    extern v3_cpu_arch_t v3_cpu_types[];
    v3_cpu_types[cpu_id] = V3_INVALID_CPU;

    if (host_vmcs_ptrs[cpu_id] != 0) {
        V3_Print("Disabling VMX\n");

        if (vmx_off() != VMX_SUCCESS) {
            PrintError("Error executing VMXOFF\n");
        }

        V3_FreePages((void *)host_vmcs_ptrs[cpu_id], 1);

        host_vmcs_ptrs[cpu_id] = 0;
    }
}