Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git

This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, execute

  cd palacios
  git checkout --track -b devel origin/devel

The other branches are similar.
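For example, to track a hypothetical release branch named release-1.2 (run "git branch -r" to list the branches that actually exist), execute

  git checkout --track -b release-1.2 origin/release-1.2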


fixes for unrestricted guest support
palacios.git: palacios/src/palacios/vmx.c
/* 
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National 
 * Science Foundation and the Department of Energy.  
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at 
 * http://www.v3vee.org
 *
 * Copyright (c) 2011, Jack Lange <jarusl@cs.northwestern.edu>
 * Copyright (c) 2011, The V3VEE Project <http://www.v3vee.org> 
 * All rights reserved.
 *
 * Author: Jack Lange <jarusl@cs.northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */


#include <palacios/vmx.h>
#include <palacios/vmm.h>
#include <palacios/vmx_handler.h>
#include <palacios/vmcs.h>
#include <palacios/vmx_lowlevel.h>
#include <palacios/vmm_lowlevel.h>
#include <palacios/vmm_ctrl_regs.h>
#include <palacios/vmm_config.h>
#include <palacios/vmm_time.h>
#include <palacios/vm_guest_mem.h>
#include <palacios/vmm_direct_paging.h>
#include <palacios/vmx_io.h>
#include <palacios/vmx_msr.h>
#include <palacios/vmm_decoder.h>
#include <palacios/vmm_barrier.h>

#ifdef V3_CONFIG_CHECKPOINT
#include <palacios/vmm_checkpoint.h>
#endif

#include <palacios/vmx_ept.h>
#include <palacios/vmx_assist.h>
#include <palacios/vmx_hw_info.h>

#ifndef V3_CONFIG_DEBUG_VMX
#undef PrintDebug
#define PrintDebug(fmt, args...)
#endif


/* These fields contain the hardware feature sets supported by the local CPU */
static struct vmx_hw_info hw_info;

extern v3_cpu_arch_t v3_cpu_types[];

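/* Physical addresses of the per-CPU VMXON regions allocated in v3_init_vmx_cpu();
 * an entry of 0 means VMX has not been enabled on that CPU.
 */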
static addr_t host_vmcs_ptrs[V3_CONFIG_MAX_CPUS] = { [0 ... V3_CONFIG_MAX_CPUS - 1] = 0};

extern int v3_vmx_launch(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
extern int v3_vmx_resume(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);

static int inline check_vmcs_write(vmcs_field_t field, addr_t val) {
    int ret = 0;

    ret = vmcs_write(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMWRITE error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
        return 1;
    }

    return 0;
}

static int inline check_vmcs_read(vmcs_field_t field, void * val) {
    int ret = 0;

    ret = vmcs_read(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMREAD error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
    }

    return ret;
}



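/* Allocate a zeroed, page-sized VMCS region. Per the Intel SDM, the region
 * must be page aligned and must be stamped with the CPU's VMCS revision
 * identifier (reported by the IA32_VMX_BASIC MSR) before it is handed to
 * VMPTRLD or VMXON.
 */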
static addr_t allocate_vmcs() {
    struct vmcs_data * vmcs_page = NULL;

    PrintDebug("Allocating page\n");

    vmcs_page = (struct vmcs_data *)V3_VAddr(V3_AllocPages(1));
    memset(vmcs_page, 0, 4096);

    vmcs_page->revision = hw_info.basic_info.revision;
    PrintDebug("VMX Revision: 0x%x\n", vmcs_page->revision);

    return (addr_t)V3_PAddr((void *)vmcs_page);
}




static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state) {
    int vmx_ret = 0;

    // disable global interrupts for vm state initialization
    v3_disable_ints();

    PrintDebug("Loading VMCS\n");
    vmx_ret = vmcs_load(vmx_state->vmcs_ptr_phys);
    vmx_state->state = VMX_UNLAUNCHED;

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMPTRLD failed\n");
        return -1;
    }


    /*** Setup default state from HW ***/

    vmx_state->pin_ctrls.value = hw_info.pin_ctrls.def_val;
    vmx_state->pri_proc_ctrls.value = hw_info.proc_ctrls.def_val;
    vmx_state->exit_ctrls.value = hw_info.exit_ctrls.def_val;
    vmx_state->entry_ctrls.value = hw_info.entry_ctrls.def_val;
    vmx_state->sec_proc_ctrls.value = hw_info.sec_proc_ctrls.def_val;

    /* Print Control MSRs */
    PrintDebug("CR0 MSR: %p\n", (void *)(addr_t)hw_info.cr0.value);
    PrintDebug("CR4 MSR: %p\n", (void *)(addr_t)hw_info.cr4.value);



    /******* Setup Host State **********/

    /* Cache GDTR, IDTR, and TR in host struct */


    /********** Setup VMX Control Fields ***********/

    /* Add external interrupts, NMI exiting, and virtual NMI */
    vmx_state->pin_ctrls.nmi_exit = 1;
    vmx_state->pin_ctrls.ext_int_exit = 1;


    vmx_state->pri_proc_ctrls.hlt_exit = 1;


    vmx_state->pri_proc_ctrls.pause_exit = 0;
    vmx_state->pri_proc_ctrls.tsc_offset = 1;
#ifdef V3_CONFIG_TIME_VIRTUALIZE_TSC
    vmx_state->pri_proc_ctrls.rdtsc_exit = 1;
#endif

    /* Setup IO map */
    vmx_state->pri_proc_ctrls.use_io_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_A_ADDR, (addr_t)V3_PAddr(core->vm_info->io_map.arch_data));
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_B_ADDR, 
            (addr_t)V3_PAddr(core->vm_info->io_map.arch_data) + PAGE_SIZE_4KB);


    vmx_state->pri_proc_ctrls.use_msr_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_MSR_BITMAP, (addr_t)V3_PAddr(core->vm_info->msr_map.arch_data));



#ifdef __V3_64BIT__
    // Ensure host runs in 64-bit mode at each VM EXIT
    vmx_state->exit_ctrls.host_64_on = 1;
#endif

    // Hook all accesses to EFER register
    v3_hook_msr(core->vm_info, EFER_MSR, 
                &v3_handle_efer_read,
                &v3_handle_efer_write, 
                core);

    // Restore host's EFER register on each VM EXIT
    vmx_state->exit_ctrls.ld_efer = 1;

    // Save/restore guest's EFER register to/from VMCS on VM EXIT/ENTRY
    vmx_state->exit_ctrls.save_efer = 1;
    vmx_state->entry_ctrls.ld_efer  = 1;

    // Cause VM_EXIT whenever CR4.VMXE or CR4.PAE bits are written
    vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);

    // Setup Guest's initial PAT field
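    /* 0x0007040600070406 is the architectural power-on default of IA32_PAT:
     * entries { WB, WT, UC-, UC }, mirrored in both halves of the MSR.
     */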
    vmx_ret |= check_vmcs_write(VMCS_GUEST_PAT, 0x0007040600070406LL);

    /* Setup paging */
    if (core->shdw_pg_mode == SHADOW_PAGING) {
        PrintDebug("Creating initial shadow page table\n");

        if (v3_init_passthrough_pts(core) == -1) {
            PrintError("Could not initialize passthrough page tables\n");
            return -1;
        }
        
#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        core->ctrl_regs.cr3 = core->direct_map_pt;

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Add CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 1;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 1;
        
        vmx_state->pri_proc_ctrls.invlpg_exit = 1;
        
        /* Add page fault exits */
        vmx_state->excp_bmap.pf = 1;

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_CPU)) {

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Disable CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;

        /* Add page fault exits */
        //      vmx_state->excp_bmap.pf = 1; // This should never happen..., enabled to catch bugs
        
        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging



        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        int i = 0;
        // For now we will assume that unrestricted guest mode is assured w/ EPT

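        /* Note: per the Intel SDM, VM entry fails if "unrestricted guest" is
         * enabled while "enable EPT" is clear, so this mode is only selected
         * on CPUs that advertise both (V3_VMX_EPT_UG_CPU).
         */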

        core->vm_regs.rsp = 0x00;
        core->rip = 0xfff0;
        core->vm_regs.rdx = 0x00000f00;
        core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
        core->ctrl_regs.cr0 = 0x00000030; 
        core->ctrl_regs.cr4 = 0x00002010; // Enable VMX and PSE flag
        

        core->segments.cs.selector = 0xf000;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = 0x0000000f0000LL;
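        /* CS.base (0xf0000) + RIP (0xfff0) = 0xffff0, the traditional BIOS
         * reset entry point aliased below 1MB.
         */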

        // (raw attributes = 0x9b: present, system, code type 0xb)
        core->segments.cs.type = 0xb;
        core->segments.cs.system = 0x1;
        core->segments.cs.dpl = 0x0;
        core->segments.cs.present = 1;



        struct v3_segment * segregs [] = {&(core->segments.ss), &(core->segments.ds), 
                                          &(core->segments.es), &(core->segments.fs), 
                                          &(core->segments.gs), NULL};

        for ( i = 0; segregs[i] != NULL; i++) {
            struct v3_segment * seg = segregs[i];
        
            seg->selector = 0x0000;
            //    seg->base = seg->selector << 4;
            seg->base = 0x00000000;
            seg->limit = 0xffff;


            seg->type = 0x3;
            seg->system = 0x1;
            seg->dpl = 0x0;
            seg->present = 1;
            //    seg->granularity = 1;

        }


        core->segments.gdtr.limit = 0x0000ffff;
        core->segments.gdtr.base = 0x0000000000000000LL;

        core->segments.idtr.limit = 0x0000ffff;
        core->segments.idtr.base = 0x0000000000000000LL;

        core->segments.ldtr.selector = 0x0000;
        core->segments.ldtr.limit = 0x0000ffff;
        core->segments.ldtr.base = 0x0000000000000000LL;
        core->segments.ldtr.type = 0x2;
        core->segments.ldtr.present = 1;

        core->segments.tr.selector = 0x0000;
        core->segments.tr.limit = 0x0000ffff;
        core->segments.tr.base = 0x0000000000000000LL;
        core->segments.tr.type = 0xb;
        core->segments.tr.present = 1;

        //      core->dbg_regs.dr6 = 0x00000000ffff0ff0LL;
        core->dbg_regs.dr7 = 0x0000000000000400LL;

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
        vmx_state->sec_proc_ctrls.unrstrct_guest = 1; // enable unrestricted guest operation


        /* Disable shadow paging stuff */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;


        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else {
        PrintError("Invalid Virtual paging mode\n");
        return -1;
    }


    // hook vmx msrs

    // Setup SYSCALL/SYSENTER MSRs in load/store area
    
    // save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
    {

        struct vmcs_msr_save_area * msr_entries = NULL;
        int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;
        int msr_ret = 0;

        V3_Print("Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);

        if (max_msrs < 4) {
            PrintError("Max MSR cache size is too small (%d)\n", max_msrs);
            return -1;
        }

        vmx_state->msr_area_paddr = (addr_t)V3_AllocPages(1);
        
        if (vmx_state->msr_area_paddr == (addr_t)NULL) {
            PrintError("could not allocate msr load/store area\n");
            return -1;
        }

        msr_entries = (struct vmcs_msr_save_area *)V3_VAddr((void *)(vmx_state->msr_area_paddr));
        vmx_state->msr_area = msr_entries; // cache in vmx_info

        memset(msr_entries, 0, PAGE_SIZE);

        msr_entries->guest_star.index = IA32_STAR_MSR;
        msr_entries->guest_lstar.index = IA32_LSTAR_MSR;
        msr_entries->guest_fmask.index = IA32_FMASK_MSR;
        msr_entries->guest_kern_gs.index = IA32_KERN_GS_BASE_MSR;

        msr_entries->host_star.index = IA32_STAR_MSR;
        msr_entries->host_lstar.index = IA32_LSTAR_MSR;
        msr_entries->host_fmask.index = IA32_FMASK_MSR;
        msr_entries->host_kern_gs.index = IA32_KERN_GS_BASE_MSR;

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(msr_entries->guest_msrs));
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(msr_entries->guest_msrs));
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(msr_entries->host_msrs));


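        /* These MSRs are hooked with NULL read/write handlers; presumably
         * this leaves guest accesses untrapped, since the values are context
         * switched through the MSR load/store area set up above rather than
         * emulated on each access.
         */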
        msr_ret |= v3_hook_msr(core->vm_info, IA32_STAR_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, IA32_LSTAR_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, IA32_FMASK_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, IA32_KERN_GS_BASE_MSR, NULL, NULL, NULL);


        // IMPORTANT: These MSRs appear to be cached by the hardware....
        msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_CS_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_ESP_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_EIP_MSR, NULL, NULL, NULL);

        msr_ret |= v3_hook_msr(core->vm_info, FS_BASE_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, GS_BASE_MSR, NULL, NULL, NULL);


        // Not sure what to do about this... Does not appear to be an explicit hardware cache version...
        msr_ret |= v3_hook_msr(core->vm_info, IA32_CSTAR_MSR, NULL, NULL, NULL);

        if (msr_ret != 0) {
            PrintError("Error configuring MSR save/restore area\n");
            return -1;
        }


    }    

    /* Sanity check ctrl/reg fields against hw_defaults */




    /*** Write all the info to the VMCS ***/
  
    /*
    {
        // IS THIS NECESSARY???
#define DEBUGCTL_MSR 0x1d9
        struct v3_msr tmp_msr;
        v3_get_msr(DEBUGCTL_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
        vmx_ret |= check_vmcs_write(VMCS_GUEST_DBG_CTL, tmp_msr.value);
        core->dbg_regs.dr7 = 0x400;
    }
    */

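    /* Per the Intel SDM, the VMCS link pointer must be set to all 1s
     * (0xffffffffffffffff) when VMCS shadowing is not in use; otherwise
     * VM entry fails its guest-state checks.
     */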
#ifdef __V3_64BIT__
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffffffffffULL);
#else
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffUL);
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR_HIGH, (addr_t)0xffffffffUL);
#endif




    if (v3_update_vmcs_ctrl_fields(core)) {
        PrintError("Could not write control fields!\n");
        return -1;
    }
    
    /*
    if (v3_update_vmcs_host_state(core)) {
        PrintError("Could not write host state\n");
        return -1;
    }
    */

    // Reenable global interrupts now that the vm state is initialized.
    // If another VM kicks us off, it'll update our vmx state so that
    // we know to reload ourselves.
    v3_enable_ints();

    return 0;
}

int v3_init_vmx_vmcs(struct guest_info * core, v3_vm_class_t vm_class) {
    struct vmx_data * vmx_state = NULL;
    int vmx_ret = 0;
    
    vmx_state = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));
    memset(vmx_state, 0, sizeof(struct vmx_data));

    PrintDebug("vmx_data pointer: %p\n", (void *)vmx_state);

    PrintDebug("Allocating VMCS\n");
    vmx_state->vmcs_ptr_phys = allocate_vmcs();

    PrintDebug("VMCS pointer: %p\n", (void *)(vmx_state->vmcs_ptr_phys));

    core->vmm_data = vmx_state;
    vmx_state->state = VMX_UNLAUNCHED;

    PrintDebug("Initializing VMCS (addr=%p)\n", core->vmm_data);
    
    // TODO: Fix vmcs fields so they're 32-bit

    PrintDebug("Clearing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMCLEAR failed\n");
        return -1; 
    }

    if (vm_class == V3_PC_VM) {
        PrintDebug("Initializing VMCS\n");
        if (init_vmcs_bios(core, vmx_state) == -1) {
            PrintError("Error initializing VMCS to BIOS state\n");
            return -1;
        }
    } else {
        PrintError("Invalid VM Class\n");
        return -1;
    }

    PrintDebug("Serializing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    return 0;
}


int v3_deinit_vmx_vmcs(struct guest_info * core) {
    struct vmx_data * vmx_state = core->vmm_data;

    V3_FreePages((void *)(vmx_state->vmcs_ptr_phys), 1);
    V3_FreePages(V3_PAddr(vmx_state->msr_area), 1);

    V3_Free(vmx_state);

    return 0;
}



#ifdef V3_CONFIG_CHECKPOINT
/* 
 * JRL: This is broken
 */
int v3_vmx_save_core(struct guest_info * core, void * ctx){
    uint64_t vmcs_ptr = vmcs_store();

    v3_chkpt_save(ctx, "vmcs_data", PAGE_SIZE, (void *)vmcs_ptr);

    return 0;
}

int v3_vmx_load_core(struct guest_info * core, void * ctx){
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    struct cr0_32 * shadow_cr0;
    char vmcs[PAGE_SIZE_4KB];

    v3_chkpt_load(ctx, "vmcs_data", PAGE_SIZE_4KB, vmcs);

    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmcs_load((addr_t)vmcs);

    v3_vmx_save_vmcs(core);

    shadow_cr0 = (struct cr0_32 *)&(core->ctrl_regs.cr0);


    /* Get the CPU mode to set the guest_ia32e entry ctrl */

    if (core->shdw_pg_mode == SHADOW_PAGING) {
        if (v3_get_vm_mem_mode(core) == VIRTUAL_MEM) {
            if (v3_activate_shadow_pt(core) == -1) {
                PrintError("Failed to activate shadow page tables\n");
                return -1;
            }
        } else {
            if (v3_activate_passthrough_pt(core) == -1) {
                PrintError("Failed to activate passthrough page tables\n");
                return -1;
            }
        }
    }

    return 0;
}
#endif


void v3_flush_vmx_vm_core(struct guest_info * core) {
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmx_info->state = VMX_UNLAUNCHED;
}



static int update_irq_exit_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));

    if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 0)) {
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Calling v3_injecting_intr\n");
#endif
        info->intr_core_state.irq_started = 0;
        v3_injecting_intr(info, info->intr_core_state.irq_vector, V3_EXTERNAL_IRQ);
    }

    return 0;
}

static int update_irq_entry_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;
    struct vmcs_interrupt_state intr_core_state;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
    check_vmcs_read(VMCS_GUEST_INT_STATE, &(intr_core_state));

    /* Check for pending exceptions to inject */
    if (v3_excp_pending(info)) {
        struct vmx_entry_int_info int_info;
        int_info.value = 0;

        // In VMX, almost every exception is hardware
        // Software exceptions are pretty much only for breakpoint or overflow
        int_info.type = 3;
        int_info.vector = v3_get_excp_number(info);

        if (info->excp_state.excp_error_code_valid) {
            check_vmcs_write(VMCS_ENTRY_EXCP_ERR, info->excp_state.excp_error_code);
            int_info.error_code = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("Injecting exception %d with error code %x\n", 
                    int_info.vector, info->excp_state.excp_error_code);
#endif
        }

        int_info.valid = 1;
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Injecting exception %d (EIP=%p)\n", int_info.vector, (void *)(addr_t)info->rip);
#endif
        check_vmcs_write(VMCS_ENTRY_INT_INFO, int_info.value);

        v3_injecting_excp(info, int_info.vector);

    } else if ((((struct rflags *)&(info->ctrl_regs.rflags))->intr == 1) && 
               (intr_core_state.val == 0)) {
       
        if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 1)) {

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("IRQ pending from previous injection\n");
#endif

            // Copy the IDT vectoring info over to reinject the old interrupt
            if (idt_vec_info.error_code == 1) {
                uint32_t err_code = 0;

                check_vmcs_read(VMCS_IDT_VECTOR_ERR, &err_code);
                check_vmcs_write(VMCS_ENTRY_EXCP_ERR, err_code);
            }

            idt_vec_info.undef = 0;
            check_vmcs_write(VMCS_ENTRY_INT_INFO, idt_vec_info.value);

        } else {
            struct vmx_entry_int_info ent_int;
            ent_int.value = 0;

            switch (v3_intr_pending(info)) {
                case V3_EXTERNAL_IRQ: {
                    info->intr_core_state.irq_vector = v3_get_intr(info); 
                    ent_int.vector = info->intr_core_state.irq_vector;
                    ent_int.type = 0;
                    ent_int.error_code = 0;
                    ent_int.valid = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
                    V3_Print("Injecting Interrupt %d at exit %u(EIP=%p)\n", 
                               info->intr_core_state.irq_vector, 
                               (uint32_t)info->num_exits, 
                               (void *)(addr_t)info->rip);
#endif

                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);
                    info->intr_core_state.irq_started = 1;

                    break;
                }
                case V3_NMI:
                    PrintDebug("Injecting NMI\n");

                    ent_int.type = 2;
                    ent_int.vector = 2;
                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_SOFTWARE_INTR:
                    PrintDebug("Injecting software interrupt\n");
                    ent_int.type = 4;

                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_VIRTUAL_IRQ:
                    // Not sure what to do here, Intel doesn't have virtual IRQs
                    // May be the same as external interrupts/IRQs

                    break;
                case V3_INVALID_INTR:
                default:
                    break;
            }
        }
    } else if ((v3_intr_pending(info)) && (vmx_info->pri_proc_ctrls.int_wndw_exit == 0)) {
        // Enable INTR window exiting so we know when IF=1
        uint32_t instr_len;

        check_vmcs_read(VMCS_EXIT_INSTR_LEN, &instr_len);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Enabling Interrupt-Window exiting: %d\n", instr_len);
#endif

        vmx_info->pri_proc_ctrls.int_wndw_exit = 1;
        check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
    }


    return 0;
}



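/* Ring buffer of the 10 most recent exits, indexed by num_exits modulo 10
 * and dumped by print_exit_log() when a core hits a fatal error.
 */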
static struct vmx_exit_info exit_log[10];

static void print_exit_log(struct guest_info * info) {
    int cnt = info->num_exits % 10;
    int i = 0;
    

    V3_Print("\nExit Log (%d total exits):\n", (uint32_t)info->num_exits);

    for (i = 0; i < 10; i++) {
        struct vmx_exit_info * tmp = &exit_log[cnt];

        V3_Print("%d:\texit_reason = %p\n", i, (void *)(addr_t)tmp->exit_reason);
        V3_Print("\texit_qual = %p\n", (void *)tmp->exit_qual);
        V3_Print("\tint_info = %p\n", (void *)(addr_t)tmp->int_info);
        V3_Print("\tint_err = %p\n", (void *)(addr_t)tmp->int_err);
        V3_Print("\tinstr_info = %p\n", (void *)(addr_t)tmp->instr_info);

        cnt--;

        if (cnt == -1) {
            cnt = 9;
        }

    }

}

/* 
 * CAUTION and DANGER!!! 
 * 
 * The VMCS CANNOT(!!) be accessed outside of the cli/sti calls inside this function.
 * When executing a symbiotic call, the VMCS WILL be overwritten, so any dependencies 
 * on its contents will cause things to break. The contents at the time of the exit WILL 
 * change before the exit handler is executed.
 */
int v3_vmx_enter(struct guest_info * info) {
    int ret = 0;
    uint32_t tsc_offset_low, tsc_offset_high;
    struct vmx_exit_info exit_info;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    // Perform any additional yielding needed for time adjustment
    v3_adjust_time(info);

    // disable global interrupts for vm state transition
    v3_disable_ints();

    // Update timer devices late after being in the VM so that as much 
    // of the time in the VM is accounted for as possible. Also do it before
    // updating IRQ entry state so that any interrupts the timers raise get 
    // handled on the next VM entry. Must be done with interrupts disabled.
    v3_update_timers(info);

    if (vmcs_store() != vmx_info->vmcs_ptr_phys) {
        vmcs_clear(vmx_info->vmcs_ptr_phys);
        vmcs_load(vmx_info->vmcs_ptr_phys);
        vmx_info->state = VMX_UNLAUNCHED;
    }

    v3_vmx_restore_vmcs(info);


#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_entry_state(info);
    }
#else 
    update_irq_entry_state(info);
#endif

    {
        addr_t guest_cr3;
        vmcs_read(VMCS_GUEST_CR3, &guest_cr3);
        vmcs_write(VMCS_GUEST_CR3, guest_cr3);
    }

    // Perform last-minute time bookkeeping prior to entering the VM
    v3_time_enter_vm(info);

    tsc_offset_high = (uint32_t)((v3_tsc_host_offset(&info->time_state) >> 32) & 0xffffffff);
    tsc_offset_low = (uint32_t)(v3_tsc_host_offset(&info->time_state) & 0xffffffff);
    check_vmcs_write(VMCS_TSC_OFFSET_HIGH, tsc_offset_high);
    check_vmcs_write(VMCS_TSC_OFFSET, tsc_offset_low);

    if (v3_update_vmcs_host_state(info)) {
        v3_enable_ints();
        PrintError("Could not write host state\n");
        return -1;
    }


    if (vmx_info->state == VMX_UNLAUNCHED) {
        vmx_info->state = VMX_LAUNCHED;
        ret = v3_vmx_launch(&(info->vm_regs), info, &(info->ctrl_regs));
    } else {
        V3_ASSERT(vmx_info->state != VMX_UNLAUNCHED);
        ret = v3_vmx_resume(&(info->vm_regs), info, &(info->ctrl_regs));
    }
    


    //  PrintDebug("VMX Exit: ret=%d\n", ret);

    if (ret != VMX_SUCCESS) {
        uint32_t error = 0;
        vmcs_read(VMCS_INSTR_ERR, &error);

        v3_enable_ints();

        PrintError("VMENTRY Error: %d (launch_ret = %d)\n", error, ret);
        return -1;
    }



    // Immediate exit from VM time bookkeeping
    v3_time_exit_vm(info);

    info->num_exits++;

    /* Update guest state */
    v3_vmx_save_vmcs(info);

    // info->cpl = info->segments.cs.selector & 0x3;

    info->mem_mode = v3_get_vm_mem_mode(info);
    info->cpu_mode = v3_get_vm_cpu_mode(info);


    check_vmcs_read(VMCS_EXIT_INSTR_LEN, &(exit_info.instr_len));
    check_vmcs_read(VMCS_EXIT_INSTR_INFO, &(exit_info.instr_info));
    check_vmcs_read(VMCS_EXIT_REASON, &(exit_info.exit_reason));
    check_vmcs_read(VMCS_EXIT_QUAL, &(exit_info.exit_qual));
    check_vmcs_read(VMCS_EXIT_INT_INFO, &(exit_info.int_info));
    check_vmcs_read(VMCS_EXIT_INT_ERR, &(exit_info.int_err));
    check_vmcs_read(VMCS_GUEST_LINEAR_ADDR, &(exit_info.guest_linear_addr));

    if (info->shdw_pg_mode == NESTED_PAGING) {
        check_vmcs_read(VMCS_GUEST_PHYS_ADDR, &(exit_info.ept_fault_addr));
    }

    //PrintDebug("VMX Exit taken, id-qual: %u-%lu\n", exit_info.exit_reason, exit_info.exit_qual);

    exit_log[info->num_exits % 10] = exit_info;

#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_exit_state(info);
    }
#else
    update_irq_exit_state(info);
#endif

    if (exit_info.exit_reason == VMEXIT_INTR_WINDOW) {
        // This is a special case whose only job is to inject an interrupt
        vmcs_read(VMCS_PROC_CTRLS, &(vmx_info->pri_proc_ctrls.value));
        vmx_info->pri_proc_ctrls.int_wndw_exit = 0;
        vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
       V3_Print("Interrupts available again! (RIP=%llx)\n", info->rip);
#endif
    }

    // reenable global interrupts after vm exit
    v3_enable_ints();

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    if (v3_handle_vmx_exit(info, &exit_info) == -1) {
        PrintError("Error in VMX exit handler (Exit reason=%x)\n", exit_info.exit_reason);
        return -1;
    }

    return 0;
}


int v3_start_vmx_guest(struct guest_info * info) {

    PrintDebug("Starting VMX core %u\n", info->vcpu_id);

    if (info->vcpu_id == 0) {
        info->core_run_state = CORE_RUNNING;
    } else {

        PrintDebug("VMX core %u: Waiting for core initialization\n", info->vcpu_id);

        while (info->core_run_state == CORE_STOPPED) {

            if (info->vm_info->run_state == VM_STOPPED) {
                // The VM was stopped before this core was initialized. 
                return 0;
            }

            v3_yield(info);
            //PrintDebug("VMX core %u: still waiting for INIT\n",info->vcpu_id);
        }
        
        PrintDebug("VMX core %u initialized\n", info->vcpu_id);

        // We'll be paranoid about race conditions here
        v3_wait_at_barrier(info);
    }


    PrintDebug("VMX core %u: I am starting at CS=0x%x (base=0x%p, limit=0x%x),  RIP=0x%p\n",
               info->vcpu_id, info->segments.cs.selector, (void *)(info->segments.cs.base),
               info->segments.cs.limit, (void *)(info->rip));


    PrintDebug("VMX core %u: Launching VMX VM on logical core %u\n", info->vcpu_id, info->pcpu_id);

    v3_start_time(info);

    while (1) {

        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }

        if (v3_vmx_enter(info) == -1) {

            addr_t host_addr;
            addr_t linear_addr = 0;
            
            info->vm_info->run_state = VM_ERROR;
            
            V3_Print("VMX core %u: VMX ERROR!!\n", info->vcpu_id); 
            
            v3_print_guest_state(info);
            
            V3_Print("VMX core %u\n", info->vcpu_id); 

            linear_addr = get_addr_linear(info, info->rip, &(info->segments.cs));
            
            if (info->mem_mode == PHYSICAL_MEM) {
                v3_gpa_to_hva(info, linear_addr, &host_addr);
            } else if (info->mem_mode == VIRTUAL_MEM) {
                v3_gva_to_hva(info, linear_addr, &host_addr);
            }
            
            V3_Print("VMX core %u: Host Address of rip = 0x%p\n", info->vcpu_id, (void *)host_addr);
            
            V3_Print("VMX core %u: Instr (15 bytes) at %p:\n", info->vcpu_id, (void *)host_addr);
            v3_dump_mem((uint8_t *)host_addr, 15);
            
            v3_print_stack(info);


            v3_print_vmcs();
            print_exit_log(info);
            return -1;
        }

        v3_wait_at_barrier(info);


        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }
/*
        if ((info->num_exits % 5000) == 0) {
            V3_Print("VMX Exit number %d\n", (uint32_t)info->num_exits);
        }
*/

    }

    return 0;
}




#define VMX_FEATURE_CONTROL_MSR     0x0000003a
#define CPUID_VMX_FEATURES 0x00000005  /* LOCK and VMXON */
#define CPUID_1_ECX_VTXFLAG 0x00000020

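/* In IA32_FEATURE_CONTROL (MSR 0x3a), bit 0 is the lock bit and bit 2
 * enables VMXON outside SMX operation; the 0x5 mask above checks that the
 * BIOS has locked the MSR with VMXON enabled.
 */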
int v3_is_vmx_capable() {
    v3_msr_t feature_msr;
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

    v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);

    PrintDebug("ECX: 0x%x\n", ecx);

    if (ecx & CPUID_1_ECX_VTXFLAG) {
        v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));
        
        PrintDebug("MSRREGlow: 0x%.8x\n", feature_msr.lo);

        if ((feature_msr.lo & CPUID_VMX_FEATURES) != CPUID_VMX_FEATURES) {
            PrintDebug("VMX is locked -- enable in the BIOS\n");
            return 0;
        }

    } else {
        PrintDebug("VMX not supported on this cpu\n");
        return 0;
    }

    return 1;
}


int v3_reset_vmx_vm_core(struct guest_info * core, addr_t rip) {
    // init vmcs bios
    
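    /* For unrestricted-guest cores, 'rip' is presumably an INIT/SIPI start
     * vector: vector << 8 becomes the real-mode CS selector and vector << 12
     * the CS base, while RIP itself is reset to 0.
     */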
    if ((core->shdw_pg_mode == NESTED_PAGING) && 
        (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        // easy 
        core->rip = 0;
        core->segments.cs.selector = rip << 8;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = rip << 12;
    } else {
        core->vm_regs.rdx = core->vcpu_id;
        core->vm_regs.rbx = rip;
    }

    return 0;
}



void v3_init_vmx_cpu(int cpu_id) {
    addr_t vmx_on_region = 0;

    if (cpu_id == 0) {
        if (v3_init_vmx_hw(&hw_info) == -1) {
            PrintError("Could not initialize VMX hardware features on cpu %d\n", cpu_id);
            return;
        }
    }

    enable_vmx();


    // Setup VMXON Region
    vmx_on_region = allocate_vmcs();


    if (vmx_on(vmx_on_region) == VMX_SUCCESS) {
        V3_Print("VMX Enabled\n");
        host_vmcs_ptrs[cpu_id] = vmx_on_region;
    } else {
        V3_Print("VMX already enabled\n");
        V3_FreePages((void *)vmx_on_region, 1);
    }

    PrintDebug("VMXON pointer: 0x%p\n", (void *)host_vmcs_ptrs[cpu_id]);    

    {
        struct vmx_sec_proc_ctrls sec_proc_ctrls;
        sec_proc_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.sec_proc_ctrls));
        
        if (sec_proc_ctrls.enable_ept == 0) {
            V3_Print("VMX EPT (Nested) Paging not supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_CPU;
        } else if (sec_proc_ctrls.unrstrct_guest == 0) {
            V3_Print("VMX EPT (Nested) Paging supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_CPU;
        } else {
            V3_Print("VMX EPT (Nested) Paging + Unrestricted guest supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_UG_CPU;
        }
    }
}


void v3_deinit_vmx_cpu(int cpu_id) {
    extern v3_cpu_arch_t v3_cpu_types[];
    v3_cpu_types[cpu_id] = V3_INVALID_CPU;

    if (host_vmcs_ptrs[cpu_id] != 0) {
        V3_Print("Disabling VMX\n");

        if (vmx_off() != VMX_SUCCESS) {
            PrintError("Error executing VMXOFF\n");
        }

        V3_FreePages((void *)host_vmcs_ptrs[cpu_id], 1);

        host_vmcs_ptrs[cpu_id] = 0;
    }
}