Palacios Public Git Repository

To checkout Palacios execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git
This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, simply execute
  cd palacios
  git checkout --track -b devel origin/devel
The other branches are similar.


Added a global machine type to determine the machine architecture
[palacios.git] / palacios / src / palacios / vmx.c
1 /* 
2  * This file is part of the Palacios Virtual Machine Monitor developed
3  * by the V3VEE Project with funding from the United States National 
4  * Science Foundation and the Department of Energy.  
5  *
6  * The V3VEE Project is a joint project between Northwestern University
7  * and the University of New Mexico.  You can find out more at 
8  * http://www.v3vee.org
9  *
10  * Copyright (c) 2011, Jack Lange <jarusl@cs.northwestern.edu>
11  * Copyright (c) 2011, The V3VEE Project <http://www.v3vee.org> 
12  * All rights reserved.
13  *
14  * Author: Jack Lange <jarusl@cs.northwestern.edu>
15  *
16  * This is free software.  You are permitted to use,
17  * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
18  */
19
20
21 #include <palacios/vmx.h>
22 #include <palacios/vmm.h>
23 #include <palacios/vmx_handler.h>
24 #include <palacios/vmcs.h>
25 #include <palacios/vmx_lowlevel.h>
26 #include <palacios/vmm_lowlevel.h>
27 #include <palacios/vmm_ctrl_regs.h>
28 #include <palacios/vmm_config.h>
29 #include <palacios/vmm_time.h>
30 #include <palacios/vm_guest_mem.h>
31 #include <palacios/vmm_direct_paging.h>
32 #include <palacios/vmx_io.h>
33 #include <palacios/vmx_msr.h>
34 #include <palacios/vmm_decoder.h>
35 #include <palacios/vmm_barrier.h>
36
37 #ifdef V3_CONFIG_CHECKPOINT
38 #include <palacios/vmm_checkpoint.h>
39 #endif
40
41 #include <palacios/vmx_ept.h>
42 #include <palacios/vmx_assist.h>
43 #include <palacios/vmx_hw_info.h>
44
45 #ifndef V3_CONFIG_DEBUG_VMX
46 #undef PrintDebug
47 #define PrintDebug(fmt, args...)
48 #endif
49
50
/* These fields contain the hardware feature sets supported by the local CPU */
static struct vmx_hw_info hw_info;

/* Per-physical-CPU virtualization capability table; defined elsewhere in Palacios */
extern v3_cpu_arch_t v3_cpu_types[];

/* Per-physical-CPU host VMCS region physical addresses (0 = not set up).
 * NOTE(review): not referenced in the visible portion of this file --
 * presumably used by the VMX enable/disable code further down; confirm. */
static addr_t host_vmcs_ptrs[V3_CONFIG_MAX_CPUS] = { [0 ... V3_CONFIG_MAX_CPUS - 1] = 0};

/* Low-level assembly entry points performing VMLAUNCH / VMRESUME for a core */
extern int v3_vmx_launch(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
extern int v3_vmx_resume(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
60
61 static int inline check_vmcs_write(vmcs_field_t field, addr_t val) {
62     int ret = 0;
63
64     ret = vmcs_write(field, val);
65
66     if (ret != VMX_SUCCESS) {
67         PrintError("VMWRITE error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
68         return 1;
69     }
70
71     return 0;
72 }
73
74 static int inline check_vmcs_read(vmcs_field_t field, void * val) {
75     int ret = 0;
76
77     ret = vmcs_read(field, val);
78
79     if (ret != VMX_SUCCESS) {
80         PrintError("VMREAD error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
81     }
82
83     return ret;
84 }
85
86
87
88
89 static addr_t allocate_vmcs() {
90     struct vmcs_data * vmcs_page = NULL;
91
92     PrintDebug("Allocating page\n");
93
94     vmcs_page = (struct vmcs_data *)V3_VAddr(V3_AllocPages(1));
95     memset(vmcs_page, 0, 4096);
96
97     vmcs_page->revision = hw_info.basic_info.revision;
98     PrintDebug("VMX Revision: 0x%x\n", vmcs_page->revision);
99
100     return (addr_t)V3_PAddr((void *)vmcs_page);
101 }
102
103
104
105
106 static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state) {
107     int vmx_ret = 0;
108
109     // disable global interrupts for vm state initialization
110     v3_disable_ints();
111
112     PrintDebug("Loading VMCS\n");
113     vmx_ret = vmcs_load(vmx_state->vmcs_ptr_phys);
114     vmx_state->state = VMX_UNLAUNCHED;
115
116     if (vmx_ret != VMX_SUCCESS) {
117         PrintError("VMPTRLD failed\n");
118         return -1;
119     }
120
121
122     /*** Setup default state from HW ***/
123
124     vmx_state->pin_ctrls.value = hw_info.pin_ctrls.def_val;
125     vmx_state->pri_proc_ctrls.value = hw_info.proc_ctrls.def_val;
126     vmx_state->exit_ctrls.value = hw_info.exit_ctrls.def_val;
127     vmx_state->entry_ctrls.value = hw_info.entry_ctrls.def_val;
128     vmx_state->sec_proc_ctrls.value = hw_info.sec_proc_ctrls.def_val;
129
130     /* Print Control MSRs */
131     PrintDebug("CR0 MSR: %p\n", (void *)(addr_t)hw_info.cr0.value);
132     PrintDebug("CR4 MSR: %p\n", (void *)(addr_t)hw_info.cr4.value);
133
134
135
136     /******* Setup Host State **********/
137
138     /* Cache GDTR, IDTR, and TR in host struct */
139
140
141     /********** Setup VMX Control Fields ***********/
142
143     /* Add external interrupts, NMI exiting, and virtual NMI */
144     vmx_state->pin_ctrls.nmi_exit = 1;
145     vmx_state->pin_ctrls.ext_int_exit = 1;
146
147
148     vmx_state->pri_proc_ctrls.hlt_exit = 1;
149
150
151     vmx_state->pri_proc_ctrls.pause_exit = 0;
152     vmx_state->pri_proc_ctrls.tsc_offset = 1;
153 #ifdef V3_CONFIG_TIME_VIRTUALIZE_TSC
154     vmx_state->pri_proc_ctrls.rdtsc_exit = 1;
155 #endif
156
157     /* Setup IO map */
158     vmx_state->pri_proc_ctrls.use_io_bitmap = 1;
159     vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_A_ADDR, (addr_t)V3_PAddr(core->vm_info->io_map.arch_data));
160     vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_B_ADDR, 
161             (addr_t)V3_PAddr(core->vm_info->io_map.arch_data) + PAGE_SIZE_4KB);
162
163
164     vmx_state->pri_proc_ctrls.use_msr_bitmap = 1;
165     vmx_ret |= check_vmcs_write(VMCS_MSR_BITMAP, (addr_t)V3_PAddr(core->vm_info->msr_map.arch_data));
166
167
168
169 #ifdef __V3_64BIT__
170     // Ensure host runs in 64-bit mode at each VM EXIT
171     vmx_state->exit_ctrls.host_64_on = 1;
172 #endif
173
174     // Hook all accesses to EFER register
175     v3_hook_msr(core->vm_info, EFER_MSR, 
176                 &v3_handle_efer_read,
177                 &v3_handle_efer_write, 
178                 core);
179
180     // Restore host's EFER register on each VM EXIT
181     vmx_state->exit_ctrls.ld_efer = 1;
182
183     // Save/restore guest's EFER register to/from VMCS on VM EXIT/ENTRY
184     vmx_state->exit_ctrls.save_efer = 1;
185     vmx_state->entry_ctrls.ld_efer  = 1;
186
187     // Cause VM_EXIT whenever CR4.VMXE or CR4.PAE bits are written
188     vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);
189
190
191     // Setup Guests initial PAT field
192     vmx_ret |= check_vmcs_write(VMCS_GUEST_PAT, 0x0007040600070406LL);
193
194     /* Setup paging */
195     if (core->shdw_pg_mode == SHADOW_PAGING) {
196         PrintDebug("Creating initial shadow page table\n");
197
198         if (v3_init_passthrough_pts(core) == -1) {
199             PrintError("Could not initialize passthrough page tables\n");
200             return -1;
201         }
202         
203 #define CR0_PE 0x00000001
204 #define CR0_PG 0x80000000
205 #define CR0_WP 0x00010000 // To ensure mem hooks work
206         vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));
207
208         core->ctrl_regs.cr3 = core->direct_map_pt;
209
210         // vmx_state->pinbased_ctrls |= NMI_EXIT;
211
212         /* Add CR exits */
213         vmx_state->pri_proc_ctrls.cr3_ld_exit = 1;
214         vmx_state->pri_proc_ctrls.cr3_str_exit = 1;
215         
216         vmx_state->pri_proc_ctrls.invlpg_exit = 1;
217         
218         /* Add page fault exits */
219         vmx_state->excp_bmap.pf = 1;
220
221         // Setup VMX Assist
222         v3_vmxassist_init(core, vmx_state);
223
224     } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
225                (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_CPU)) {
226
227 #define CR0_PE 0x00000001
228 #define CR0_PG 0x80000000
229 #define CR0_WP 0x00010000 // To ensure mem hooks work
230         vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));
231
232         // vmx_state->pinbased_ctrls |= NMI_EXIT;
233
234         /* Disable CR exits */
235         vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
236         vmx_state->pri_proc_ctrls.cr3_str_exit = 0;
237
238         vmx_state->pri_proc_ctrls.invlpg_exit = 0;
239
240         /* Add page fault exits */
241         //      vmx_state->excp_bmap.pf = 1; // This should never happen..., enabled to catch bugs
242         
243         // Setup VMX Assist
244         v3_vmxassist_init(core, vmx_state);
245
246         /* Enable EPT */
247         vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
248         vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
249
250
251
252         if (v3_init_ept(core, &hw_info) == -1) {
253             PrintError("Error initializing EPT\n");
254             return -1;
255         }
256
257     } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
258                (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
259         int i = 0;
260         // For now we will assume that unrestricted guest mode is assured w/ EPT
261
262
263         core->vm_regs.rsp = 0x00;
264         core->rip = 0xfff0;
265         core->vm_regs.rdx = 0x00000f00;
266         core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
267         core->ctrl_regs.cr0 = 0x00000030; 
268         core->ctrl_regs.cr4 = 0x00002010; // Enable VMX and PSE flag
269         
270
271         core->segments.cs.selector = 0xf000;
272         core->segments.cs.limit = 0xffff;
273         core->segments.cs.base = 0x0000000f0000LL;
274
275         // (raw attributes = 0xf3)
276         core->segments.cs.type = 0xb;
277         core->segments.cs.system = 0x1;
278         core->segments.cs.dpl = 0x0;
279         core->segments.cs.present = 1;
280
281
282
283         struct v3_segment * segregs [] = {&(core->segments.ss), &(core->segments.ds), 
284                                           &(core->segments.es), &(core->segments.fs), 
285                                           &(core->segments.gs), NULL};
286
287         for ( i = 0; segregs[i] != NULL; i++) {
288             struct v3_segment * seg = segregs[i];
289         
290             seg->selector = 0x0000;
291             //    seg->base = seg->selector << 4;
292             seg->base = 0x00000000;
293             seg->limit = 0xffff;
294
295
296             seg->type = 0x3;
297             seg->system = 0x1;
298             seg->dpl = 0x0;
299             seg->present = 1;
300             //    seg->granularity = 1;
301
302         }
303
304
305         core->segments.gdtr.limit = 0x0000ffff;
306         core->segments.gdtr.base = 0x0000000000000000LL;
307
308         core->segments.idtr.limit = 0x0000ffff;
309         core->segments.idtr.base = 0x0000000000000000LL;
310
311         core->segments.ldtr.selector = 0x0000;
312         core->segments.ldtr.limit = 0x0000ffff;
313         core->segments.ldtr.base = 0x0000000000000000LL;
314         core->segments.ldtr.type = 0x2;
315         core->segments.ldtr.present = 1;
316
317         core->segments.tr.selector = 0x0000;
318         core->segments.tr.limit = 0x0000ffff;
319         core->segments.tr.base = 0x0000000000000000LL;
320         core->segments.tr.type = 0xb;
321         core->segments.tr.present = 1;
322
323         //      core->dbg_regs.dr6 = 0x00000000ffff0ff0LL;
324         core->dbg_regs.dr7 = 0x0000000000000400LL;
325
326         /* Enable EPT */
327         vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
328         vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
329         vmx_state->sec_proc_ctrls.unrstrct_guest = 1; // enable unrestricted guest operation
330
331
332         /* Disable shadow paging stuff */
333         vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
334         vmx_state->pri_proc_ctrls.cr3_str_exit = 0;
335
336         vmx_state->pri_proc_ctrls.invlpg_exit = 0;
337
338
339         if (v3_init_ept(core, &hw_info) == -1) {
340             PrintError("Error initializing EPT\n");
341             return -1;
342         }
343
344     } else {
345         PrintError("Invalid Virtual paging mode\n");
346         return -1;
347     }
348
349
350     // hook vmx msrs
351
352     // Setup SYSCALL/SYSENTER MSRs in load/store area
353     
354     // save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
355     {
356
357         struct vmcs_msr_save_area * msr_entries = NULL;
358         int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;
359         int msr_ret = 0;
360
361         V3_Print("Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);
362
363         if (max_msrs < 4) {
364             PrintError("Max MSR cache size is too small (%d)\n", max_msrs);
365             return -1;
366         }
367
368         vmx_state->msr_area_paddr = (addr_t)V3_AllocPages(1);
369         
370         if (vmx_state->msr_area_paddr == (addr_t)NULL) {
371             PrintError("could not allocate msr load/store area\n");
372             return -1;
373         }
374
375         msr_entries = (struct vmcs_msr_save_area *)V3_VAddr((void *)(vmx_state->msr_area_paddr));
376         vmx_state->msr_area = msr_entries; // cache in vmx_info
377
378         memset(msr_entries, 0, PAGE_SIZE);
379
380         msr_entries->guest_star.index = IA32_STAR_MSR;
381         msr_entries->guest_lstar.index = IA32_LSTAR_MSR;
382         msr_entries->guest_fmask.index = IA32_FMASK_MSR;
383         msr_entries->guest_kern_gs.index = IA32_KERN_GS_BASE_MSR;
384
385         msr_entries->host_star.index = IA32_STAR_MSR;
386         msr_entries->host_lstar.index = IA32_LSTAR_MSR;
387         msr_entries->host_fmask.index = IA32_FMASK_MSR;
388         msr_entries->host_kern_gs.index = IA32_KERN_GS_BASE_MSR;
389
390         msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
391         msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
392         msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);
393
394         msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(msr_entries->guest_msrs));
395         msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(msr_entries->guest_msrs));
396         msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(msr_entries->host_msrs));
397
398
399         msr_ret |= v3_hook_msr(core->vm_info, IA32_STAR_MSR, NULL, NULL, NULL);
400         msr_ret |= v3_hook_msr(core->vm_info, IA32_LSTAR_MSR, NULL, NULL, NULL);
401         msr_ret |= v3_hook_msr(core->vm_info, IA32_FMASK_MSR, NULL, NULL, NULL);
402         msr_ret |= v3_hook_msr(core->vm_info, IA32_KERN_GS_BASE_MSR, NULL, NULL, NULL);
403
404
405         // IMPORTANT: These MSRs appear to be cached by the hardware....
406         msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_CS_MSR, NULL, NULL, NULL);
407         msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_ESP_MSR, NULL, NULL, NULL);
408         msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_EIP_MSR, NULL, NULL, NULL);
409
410         msr_ret |= v3_hook_msr(core->vm_info, FS_BASE_MSR, NULL, NULL, NULL);
411         msr_ret |= v3_hook_msr(core->vm_info, GS_BASE_MSR, NULL, NULL, NULL);
412
413
414         // Not sure what to do about this... Does not appear to be an explicit hardware cache version...
415         msr_ret |= v3_hook_msr(core->vm_info, IA32_CSTAR_MSR, NULL, NULL, NULL);
416
417         if (msr_ret != 0) {
418             PrintError("Error configuring MSR save/restore area\n");
419             return -1;
420         }
421
422
423     }    
424
425     /* Sanity check ctrl/reg fields against hw_defaults */
426
427
428
429
430     /*** Write all the info to the VMCS ***/
431   
432     /*
433     {
434         // IS THIS NECESSARY???
435 #define DEBUGCTL_MSR 0x1d9
436         struct v3_msr tmp_msr;
437         v3_get_msr(DEBUGCTL_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
438         vmx_ret |= check_vmcs_write(VMCS_GUEST_DBG_CTL, tmp_msr.value);
439         core->dbg_regs.dr7 = 0x400;
440     }
441     */
442
443 #ifdef __V3_64BIT__
444     vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffffffffffULL);
445 #else
446     vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffUL);
447     vmx_ret |= check_vmcs_write(VMCS_LINK_PTR_HIGH, (addr_t)0xffffffffUL);
448 #endif
449
450
451  
452
453     if (v3_update_vmcs_ctrl_fields(core)) {
454         PrintError("Could not write control fields!\n");
455         return -1;
456     }
457     
458     /*
459     if (v3_update_vmcs_host_state(core)) {
460         PrintError("Could not write host state\n");
461         return -1;
462     }
463     */
464
465     // reenable global interrupts for vm state initialization now
466     // that the vm state is initialized. If another VM kicks us off, 
467     // it'll update our vmx state so that we know to reload ourself
468     v3_enable_ints();
469
470     return 0;
471 }
472
473 int v3_init_vmx_vmcs(struct guest_info * core, v3_vm_class_t vm_class) {
474     struct vmx_data * vmx_state = NULL;
475     int vmx_ret = 0;
476     
477     vmx_state = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));
478     memset(vmx_state, 0, sizeof(struct vmx_data));
479
480     PrintDebug("vmx_data pointer: %p\n", (void *)vmx_state);
481
482     PrintDebug("Allocating VMCS\n");
483     vmx_state->vmcs_ptr_phys = allocate_vmcs();
484
485     PrintDebug("VMCS pointer: %p\n", (void *)(vmx_state->vmcs_ptr_phys));
486
487     core->vmm_data = vmx_state;
488     vmx_state->state = VMX_UNLAUNCHED;
489
490     PrintDebug("Initializing VMCS (addr=%p)\n", core->vmm_data);
491     
492     // TODO: Fix vmcs fields so they're 32-bit
493
494     PrintDebug("Clearing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
495     vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);
496
497     if (vmx_ret != VMX_SUCCESS) {
498         PrintError("VMCLEAR failed\n");
499         return -1; 
500     }
501
502     if (vm_class == V3_PC_VM) {
503         PrintDebug("Initializing VMCS\n");
504         if (init_vmcs_bios(core, vmx_state) == -1) {
505             PrintError("Error initializing VMCS to BIOS state\n");
506             return -1;
507         }
508     } else {
509         PrintError("Invalid VM Class\n");
510         return -1;
511     }
512
513     PrintDebug("Serializing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
514     vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);
515
516     return 0;
517 }
518
519
520 int v3_deinit_vmx_vmcs(struct guest_info * core) {
521     struct vmx_data * vmx_state = core->vmm_data;
522
523     V3_FreePages((void *)(vmx_state->vmcs_ptr_phys), 1);
524     V3_FreePages(V3_PAddr(vmx_state->msr_area), 1);
525
526     V3_Free(vmx_state);
527
528     return 0;
529 }
530
531
532
533 #ifdef V3_CONFIG_CHECKPOINT
534 /* 
535  * JRL: This is broken
536  */
537 int v3_vmx_save_core(struct guest_info * core, void * ctx){
538     uint64_t vmcs_ptr = vmcs_store();
539
540     v3_chkpt_save(ctx, "vmcs_data", PAGE_SIZE, (void *)vmcs_ptr);
541
542     return 0;
543 }
544
/*
 * Restore a core's VMCS state from a checkpoint and re-activate the
 * appropriate paging mode.
 *
 * NOTE(review): this section is flagged "JRL: This is broken" in the
 * original source; visible issues:
 *  - the v3_chkpt_load() return value is unchecked
 *  - vmcs_load() is handed the *virtual* address of a stack buffer,
 *    while every other vmcs_load() call in this file passes the physical
 *    address of a dedicated page-aligned VMCS region -- presumably this
 *    needs the same treatment (TODO confirm vmcs_load()'s contract)
 *  - shadow_cr0 is computed but never used
 */
int v3_vmx_load_core(struct guest_info * core, void * ctx){
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    struct cr0_32 * shadow_cr0;
    char vmcs[PAGE_SIZE_4KB];   // NOTE(review): not guaranteed page-aligned

    // Pull the checkpointed VMCS image into the local buffer
    v3_chkpt_load(ctx, "vmcs_data", PAGE_SIZE_4KB, vmcs);

    // Drop the core's current VMCS and make the loaded image active
    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmcs_load((addr_t)vmcs);

    // Copy guest state out of the (now active) VMCS into the guest_info struct
    v3_vmx_save_vmcs(core);

    shadow_cr0 = (struct cr0_32 *)&(core->ctrl_regs.cr0);


    /* Get the CPU mode to set the guest_ia32e entry ctrl */

    // Re-establish shadow/passthrough page tables to match the restored state
    if (core->shdw_pg_mode == SHADOW_PAGING) {
        if (v3_get_vm_mem_mode(core) == VIRTUAL_MEM) {
            if (v3_activate_shadow_pt(core) == -1) {
                PrintError("Failed to activate shadow page tables\n");
                return -1;
            }
        } else {
            if (v3_activate_passthrough_pt(core) == -1) {
                PrintError("Failed to activate passthrough page tables\n");
                return -1;
            }
        }
    }

    return 0;
}
578 #endif
579
580
581 void v3_flush_vmx_vm_core(struct guest_info * core) {
582     struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
583     vmcs_clear(vmx_info->vmcs_ptr_phys);
584     vmx_info->state = VMX_UNLAUNCHED;
585 }
586
587
588
589 static int update_irq_exit_state(struct guest_info * info) {
590     struct vmx_exit_idt_vec_info idt_vec_info;
591
592     check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
593
594     if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 0)) {
595 #ifdef V3_CONFIG_DEBUG_INTERRUPTS
596         V3_Print("Calling v3_injecting_intr\n");
597 #endif
598         info->intr_core_state.irq_started = 0;
599         v3_injecting_intr(info, info->intr_core_state.irq_vector, V3_EXTERNAL_IRQ);
600     }
601
602     return 0;
603 }
604
/*
 * Pre-entry event injection.
 *
 * Priority order (as implemented below):
 *   1. pending guest exceptions (injected as hardware exceptions)
 *   2. pending interrupts, if the guest is interruptible
 *      (RFLAGS.IF set and no VMCS interruptibility blocking)
 *      -- re-injecting a previously interrupted event if the exit's
 *         IDT-vectoring info says one was cut short
 *   3. otherwise, if an interrupt is pending but the guest is not
 *      interruptible, enable interrupt-window exiting so we get control
 *      as soon as it becomes interruptible.
 *
 * Always returns 0.
 */
static int update_irq_entry_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;
    struct vmcs_interrupt_state intr_core_state;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
    check_vmcs_read(VMCS_GUEST_INT_STATE, &(intr_core_state));

    /* Check for pending exceptions to inject */
    if (v3_excp_pending(info)) {
        struct vmx_entry_int_info int_info;
        int_info.value = 0;

        // In VMX, almost every exception is hardware
        // Software exceptions are pretty much only for breakpoint or overflow
        int_info.type = 3;
        int_info.vector = v3_get_excp_number(info);

        if (info->excp_state.excp_error_code_valid) {
            // Error code goes in its own VMCS field, flagged in the info word
            check_vmcs_write(VMCS_ENTRY_EXCP_ERR, info->excp_state.excp_error_code);
            int_info.error_code = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("Injecting exception %d with error code %x\n", 
                    int_info.vector, info->excp_state.excp_error_code);
#endif
        }

        int_info.valid = 1;
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Injecting exception %d (EIP=%p)\n", int_info.vector, (void *)(addr_t)info->rip);
#endif
        check_vmcs_write(VMCS_ENTRY_INT_INFO, int_info.value);

        v3_injecting_excp(info, int_info.vector);

    } else if ((((struct rflags *)&(info->ctrl_regs.rflags))->intr == 1) && 
               (intr_core_state.val == 0)) {
       
        // Guest is interruptible (IF=1, no STI/MOV-SS blocking)
        if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 1)) {

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("IRQ pending from previous injection\n");
#endif

            // Copy the IDT vectoring info over to reinject the old interrupt
            if (idt_vec_info.error_code == 1) {
                uint32_t err_code = 0;

                check_vmcs_read(VMCS_IDT_VECTOR_ERR, &err_code);
                check_vmcs_write(VMCS_ENTRY_EXCP_ERR, err_code);
            }

            idt_vec_info.undef = 0;
            check_vmcs_write(VMCS_ENTRY_INT_INFO, idt_vec_info.value);

        } else {
            struct vmx_entry_int_info ent_int;
            ent_int.value = 0;

            switch (v3_intr_pending(info)) {
                case V3_EXTERNAL_IRQ: {
                    // type 0 = external interrupt
                    info->intr_core_state.irq_vector = v3_get_intr(info); 
                    ent_int.vector = info->intr_core_state.irq_vector;
                    ent_int.type = 0;
                    ent_int.error_code = 0;
                    ent_int.valid = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
                    V3_Print("Injecting Interrupt %d at exit %u(EIP=%p)\n", 
                               info->intr_core_state.irq_vector, 
                               (uint32_t)info->num_exits, 
                               (void *)(addr_t)info->rip);
#endif

                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);
                    // completion is confirmed later by update_irq_exit_state()
                    info->intr_core_state.irq_started = 1;

                    break;
                }
                case V3_NMI:
                    PrintDebug("Injecting NMI\n");

                    // type 2 = NMI, architecturally always vector 2
                    ent_int.type = 2;
                    ent_int.vector = 2;
                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_SOFTWARE_INTR:
                    // type 4 = software interrupt
                    // NOTE(review): ent_int.vector is never set here (stays 0),
                    // and VMCS_ENTRY_INSTR_LEN is not written -- both look
                    // required for software-interrupt injection; confirm.
                    PrintDebug("Injecting software interrupt\n");
                    ent_int.type = 4;

                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_VIRTUAL_IRQ:
                    // Not sure what to do here, Intel doesn't have virtual IRQs
                    // May be the same as external interrupts/IRQs

                    break;
                case V3_INVALID_INTR:
                default:
                    break;
            }
        }
    } else if ((v3_intr_pending(info)) && (vmx_info->pri_proc_ctrls.int_wndw_exit == 0)) {
        // Enable INTR window exiting so we know when IF=1
        uint32_t instr_len;

        check_vmcs_read(VMCS_EXIT_INSTR_LEN, &instr_len);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Enabling Interrupt-Window exiting: %d\n", instr_len);
#endif

        vmx_info->pri_proc_ctrls.int_wndw_exit = 1;
        check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
    }


    return 0;
}
729
730
731
/* Ring buffer of the 10 most recent VM exits; v3_vmx_enter stores each
 * exit's info at exit_log[num_exits % 10] after incrementing num_exits. */
static struct vmx_exit_info exit_log[10];

/* Dump the exit ring buffer for a core, walking backwards from the most
 * recent exit to the oldest. */
static void print_exit_log(struct guest_info * info) {
    int cnt = info->num_exits % 10;   // slot of the most recent exit
    int i = 0;
    

    V3_Print("\nExit Log (%d total exits):\n", (uint32_t)info->num_exits);

    for (i = 0; i < 10; i++) {
        struct vmx_exit_info * tmp = &exit_log[cnt];

        V3_Print("%d:\texit_reason = %p\n", i, (void *)(addr_t)tmp->exit_reason);
        V3_Print("\texit_qual = %p\n", (void *)tmp->exit_qual);
        V3_Print("\tint_info = %p\n", (void *)(addr_t)tmp->int_info);
        V3_Print("\tint_err = %p\n", (void *)(addr_t)tmp->int_err);
        V3_Print("\tinstr_info = %p\n", (void *)(addr_t)tmp->instr_info);

        cnt--;

        // wrap backwards around the ring buffer
        if (cnt == -1) {
            cnt = 9;
        }

    }

}
759
760 int
761 v3_vmx_schedule_timeout(struct guest_info * info)
762 {
763     struct vmx_data * vmx_state = (struct vmx_data *)(info->vmm_data);
764     sint64_t cycles;
765     uint32_t timeout;
766
767     /* Check if the hardware supports an active timeout */
768 #define VMX_ACTIVE_PREEMPT_TIMER_PIN 0x40
769     if (hw_info.pin_ctrls.req_mask & VMX_ACTIVE_PREEMPT_TIMER_PIN) {
770         /* The hardware doesn't support us modifying this pin control */
771         return 0;
772     }
773
774     /* Check if we have one to schedule and schedule it if we do */
775     cycles = (sint64_t)info->time_state.next_timeout - (sint64_t)v3_get_guest_time(&info->time_state);
776     if (info->time_state.next_timeout == (ullong_t) -1)  {
777         timeout = 0;
778         vmx_state->pin_ctrls.active_preempt_timer = 0;
779     } else if (cycles < 0) {
780         /* set the timeout to 0 to force an immediate re-exit since it expired between
781          * when we checked a timeout and now. IF SOMEONE CONTINAULLY SETS A SHORT TIMEOUT,
782          * THIS CAN LOCK US OUT OF THE GUEST! */
783         timeout = 0;
784         vmx_state->pin_ctrls.active_preempt_timer = 1;
785     } else {
786         /* The hardware supports scheduling a timeout, and we have one to 
787          * schedule */
788         timeout = (uint32_t)cycles >> hw_info.misc_info.tsc_multiple;
789         vmx_state->pin_ctrls.active_preempt_timer = 1;
790     }
791
792     /* Actually program the timer based on the settings above. */
793     check_vmcs_write(VMCS_PREEMPT_TIMER, timeout);
794     check_vmcs_write(VMCS_PIN_CTRLS, vmx_state->pin_ctrls.value);
795     return 0;
796 }
797
798 /* 
799  * CAUTION and DANGER!!! 
800  * 
801  * The VMCS CANNOT(!!) be accessed outside of the cli/sti calls inside this function
802  * When exectuing a symbiotic call, the VMCS WILL be overwritten, so any dependencies 
803  * on its contents will cause things to break. The contents at the time of the exit WILL 
804  * change before the exit handler is executed.
805  */
806 int v3_vmx_enter(struct guest_info * info) {
807     int ret = 0;
808     uint32_t tsc_offset_low, tsc_offset_high;
809     struct vmx_exit_info exit_info;
810     struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);
811
812     // Conditionally yield the CPU if the timeslice has expired
813     v3_yield_cond(info);
814
815     // Perform any additional yielding needed for time adjustment
816     v3_adjust_time(info);
817
818     // Check for timeout - since this calls generic hooks in devices
819     // that may do things like pause the VM, it cannot be with interrupts
820     // disabled.
821     v3_check_timeout(info);
822
823     // disable global interrupts for vm state transition
824     v3_disable_ints();
825
826     // Update timer devices late after being in the VM so that as much 
827     // of the time in the VM is accounted for as possible. Also do it before
828     // updating IRQ entry state so that any interrupts the timers raise get 
829     // handled on the next VM entry. Must be done with interrupts disabled.
830     v3_update_timers(info);
831
832     if (vmcs_store() != vmx_info->vmcs_ptr_phys) {
833         vmcs_clear(vmx_info->vmcs_ptr_phys);
834         vmcs_load(vmx_info->vmcs_ptr_phys);
835         vmx_info->state = VMX_UNLAUNCHED;
836     }
837
838     v3_vmx_restore_vmcs(info);
839
840
841 #ifdef V3_CONFIG_SYMCALL
842     if (info->sym_core_state.symcall_state.sym_call_active == 0) {
843         update_irq_entry_state(info);
844     }
845 #else 
846     update_irq_entry_state(info);
847 #endif
848
849     {
850         addr_t guest_cr3;
851         vmcs_read(VMCS_GUEST_CR3, &guest_cr3);
852         vmcs_write(VMCS_GUEST_CR3, guest_cr3);
853     }
854
855     // Update vmx active preemption timer to exit at the next timeout if 
856     // the hardware supports it.
857     v3_vmx_schedule_timeout(info);
858
859     // Perform last-minute time bookkeeping prior to entering the VM
860     v3_time_enter_vm(info);
861
862     tsc_offset_high = (uint32_t)((v3_tsc_host_offset(&info->time_state) >> 32) & 0xffffffff);
863     tsc_offset_low = (uint32_t)(v3_tsc_host_offset(&info->time_state) & 0xffffffff);
864     check_vmcs_write(VMCS_TSC_OFFSET_HIGH, tsc_offset_high);
865     check_vmcs_write(VMCS_TSC_OFFSET, tsc_offset_low);
866
867     if (v3_update_vmcs_host_state(info)) {
868         v3_enable_ints();
869         PrintError("Could not write host state\n");
870         return -1;
871     }
872
873
874     if (vmx_info->state == VMX_UNLAUNCHED) {
875         vmx_info->state = VMX_LAUNCHED;
876         ret = v3_vmx_launch(&(info->vm_regs), info, &(info->ctrl_regs));
877     } else {
878         V3_ASSERT(vmx_info->state != VMX_UNLAUNCHED);
879         ret = v3_vmx_resume(&(info->vm_regs), info, &(info->ctrl_regs));
880     }
881     
882
883
884     //  PrintDebug("VMX Exit: ret=%d\n", ret);
885
886     if (ret != VMX_SUCCESS) {
887         uint32_t error = 0;
888         vmcs_read(VMCS_INSTR_ERR, &error);
889
890         v3_enable_ints();
891
892         PrintError("VMENTRY Error: %d (launch_ret = %d)\n", error, ret);
893         return -1;
894     }
895
896
897
898     // Immediate exit from VM time bookkeeping
899     v3_time_exit_vm(info);
900
901     info->num_exits++;
902
903     /* Update guest state */
904     v3_vmx_save_vmcs(info);
905
906     // info->cpl = info->segments.cs.selector & 0x3;
907
908     info->mem_mode = v3_get_vm_mem_mode(info);
909     info->cpu_mode = v3_get_vm_cpu_mode(info);
910
911
912     check_vmcs_read(VMCS_EXIT_INSTR_LEN, &(exit_info.instr_len));
913     check_vmcs_read(VMCS_EXIT_INSTR_INFO, &(exit_info.instr_info));
914     check_vmcs_read(VMCS_EXIT_REASON, &(exit_info.exit_reason));
915     check_vmcs_read(VMCS_EXIT_QUAL, &(exit_info.exit_qual));
916     check_vmcs_read(VMCS_EXIT_INT_INFO, &(exit_info.int_info));
917     check_vmcs_read(VMCS_EXIT_INT_ERR, &(exit_info.int_err));
918     check_vmcs_read(VMCS_GUEST_LINEAR_ADDR, &(exit_info.guest_linear_addr));
919
920     if (info->shdw_pg_mode == NESTED_PAGING) {
921         check_vmcs_read(VMCS_GUEST_PHYS_ADDR, &(exit_info.ept_fault_addr));
922     }
923
924     //PrintDebug("VMX Exit taken, id-qual: %u-%lu\n", exit_info.exit_reason, exit_info.exit_qual);
925
926     exit_log[info->num_exits % 10] = exit_info;
927
928 #ifdef V3_CONFIG_SYMCALL
929     if (info->sym_core_state.symcall_state.sym_call_active == 0) {
930         update_irq_exit_state(info);
931     }
932 #else
933     update_irq_exit_state(info);
934 #endif
935
936     if (exit_info.exit_reason == VMEXIT_INTR_WINDOW) {
937         // This is a special case whose only job is to inject an interrupt
938         vmcs_read(VMCS_PROC_CTRLS, &(vmx_info->pri_proc_ctrls.value));
939         vmx_info->pri_proc_ctrls.int_wndw_exit = 0;
940         vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
941
942 #ifdef V3_CONFIG_DEBUG_INTERRUPTS
943        V3_Print("Interrupts available again! (RIP=%llx)\n", info->rip);
944 #endif
945     }
946
947     // reenable global interrupts after vm exit
948     v3_enable_ints();
949
950     // Conditionally yield the CPU if the timeslice has expired
951     v3_yield_cond(info);
952
953     if (v3_handle_vmx_exit(info, &exit_info) == -1) {
954         PrintError("Error in VMX exit handler (Exit reason=%x)\n", exit_info.exit_reason);
955         return -1;
956     }
957
958     return 0;
959 }
960
961
962 int v3_start_vmx_guest(struct guest_info * info) {
963
964     PrintDebug("Starting VMX core %u\n", info->vcpu_id);
965
966     if (info->vcpu_id == 0) {
967         info->core_run_state = CORE_RUNNING;
968     } else {
969
970         PrintDebug("VMX core %u: Waiting for core initialization\n", info->vcpu_id);
971
972         while (info->core_run_state == CORE_STOPPED) {
973
974             if (info->vm_info->run_state == VM_STOPPED) {
975                 // The VM was stopped before this core was initialized. 
976                 return 0;
977             }
978
979             v3_yield(info);
980             //PrintDebug("VMX core %u: still waiting for INIT\n",info->vcpu_id);
981         }
982         
983         PrintDebug("VMX core %u initialized\n", info->vcpu_id);
984
985         // We'll be paranoid about race conditions here
986         v3_wait_at_barrier(info);
987     }
988
989
990     PrintDebug("VMX core %u: I am starting at CS=0x%x (base=0x%p, limit=0x%x),  RIP=0x%p\n",
991                info->vcpu_id, info->segments.cs.selector, (void *)(info->segments.cs.base),
992                info->segments.cs.limit, (void *)(info->rip));
993
994
995     PrintDebug("VMX core %u: Launching VMX VM on logical core %u\n", info->vcpu_id, info->pcpu_id);
996
997     v3_start_time(info);
998
999     while (1) {
1000
1001         if (info->vm_info->run_state == VM_STOPPED) {
1002             info->core_run_state = CORE_STOPPED;
1003             break;
1004         }
1005
1006         if (v3_vmx_enter(info) == -1) {
1007
1008             addr_t host_addr;
1009             addr_t linear_addr = 0;
1010             
1011             info->vm_info->run_state = VM_ERROR;
1012             
1013             V3_Print("VMX core %u: VMX ERROR!!\n", info->vcpu_id); 
1014             
1015             v3_print_guest_state(info);
1016             
1017             V3_Print("VMX core %u\n", info->vcpu_id); 
1018
1019             linear_addr = get_addr_linear(info, info->rip, &(info->segments.cs));
1020             
1021             if (info->mem_mode == PHYSICAL_MEM) {
1022                 v3_gpa_to_hva(info, linear_addr, &host_addr);
1023             } else if (info->mem_mode == VIRTUAL_MEM) {
1024                 v3_gva_to_hva(info, linear_addr, &host_addr);
1025             }
1026             
1027             V3_Print("VMX core %u: Host Address of rip = 0x%p\n", info->vcpu_id, (void *)host_addr);
1028             
1029             V3_Print("VMX core %u: Instr (15 bytes) at %p:\n", info->vcpu_id, (void *)host_addr);
1030             v3_dump_mem((uint8_t *)host_addr, 15);
1031             
1032             v3_print_stack(info);
1033
1034
1035             v3_print_vmcs();
1036             print_exit_log(info);
1037             return -1;
1038         }
1039
1040         v3_wait_at_barrier(info);
1041
1042
1043         if (info->vm_info->run_state == VM_STOPPED) {
1044             info->core_run_state = CORE_STOPPED;
1045             break;
1046         }
1047 /*
1048         if ((info->num_exits % 5000) == 0) {
1049             V3_Print("VMX Exit number %d\n", (uint32_t)info->num_exits);
1050         }
1051 */
1052
1053     }
1054
1055     return 0;
1056 }
1057
1058
1059
1060
1061 #define VMX_FEATURE_CONTROL_MSR     0x0000003a
1062 #define CPUID_VMX_FEATURES 0x00000005  /* LOCK and VMXON */
1063 #define CPUID_1_ECX_VTXFLAG 0x00000020
1064
1065 int v3_is_vmx_capable() {
1066     v3_msr_t feature_msr;
1067     uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
1068
1069     v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);
1070
1071     PrintDebug("ECX: 0x%x\n", ecx);
1072
1073     if (ecx & CPUID_1_ECX_VTXFLAG) {
1074         v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));
1075         
1076         PrintDebug("MSRREGlow: 0x%.8x\n", feature_msr.lo);
1077
1078         if ((feature_msr.lo & CPUID_VMX_FEATURES) != CPUID_VMX_FEATURES) {
1079             PrintDebug("VMX is locked -- enable in the BIOS\n");
1080             return 0;
1081         }
1082
1083     } else {
1084         PrintDebug("VMX not supported on this cpu\n");
1085         return 0;
1086     }
1087
1088     return 1;
1089 }
1090
1091
1092 int v3_reset_vmx_vm_core(struct guest_info * core, addr_t rip) {
1093     // init vmcs bios
1094     
1095     if ((core->shdw_pg_mode == NESTED_PAGING) && 
1096         (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
1097         // easy 
1098         core->rip = 0;
1099         core->segments.cs.selector = rip << 8;
1100         core->segments.cs.limit = 0xffff;
1101         core->segments.cs.base = rip << 12;
1102     } else {
1103         core->vm_regs.rdx = core->vcpu_id;
1104         core->vm_regs.rbx = rip;
1105     }
1106
1107     return 0;
1108 }
1109
1110
1111
/*
 * Per-CPU VMX bring-up: probe hardware capabilities (first CPU only),
 * enable VMX in CR4, allocate a VMXON region, execute VMXON, and record
 * the CPU's VMX feature level (plain / EPT / EPT+unrestricted-guest)
 * in v3_cpu_types[].  Statement order matters: the capability MSRs must
 * be read before VMXON is attempted.
 */
void v3_init_vmx_cpu(int cpu_id) {
    addr_t vmx_on_region = 0;
    extern v3_cpu_arch_t v3_mach_type;

    // Probe the VMX capability MSRs only once, on the first CPU brought up
    // (v3_mach_type has not been set yet at that point).
    if (v3_mach_type == V3_INVALID_CPU) {
        if (v3_init_vmx_hw(&hw_info) == -1) {
            PrintError("Could not initialize VMX hardware features on cpu %d\n", cpu_id);
            return;
        }
    }

    // Set CR4.VMXE so VMXON is legal on this CPU.
    enable_vmx();


    // Setup VMXON Region
    vmx_on_region = allocate_vmcs();


    if (vmx_on(vmx_on_region) == VMX_SUCCESS) {
        V3_Print("VMX Enabled\n");
        host_vmcs_ptrs[cpu_id] = vmx_on_region;
    } else {
        // VMXON failed (treated here as "already enabled"); the freshly
        // allocated region is not needed.
        V3_Print("VMX already enabled\n");
        V3_FreePages((void *)vmx_on_region, 1);
    }

    // NOTE(review): on the failure path above host_vmcs_ptrs[cpu_id] was
    // never assigned by this call, so this may print a stale/zero value.
    PrintDebug("VMXON pointer: 0x%p\n", (void *)host_vmcs_ptrs[cpu_id]);    

    // Classify this CPU's paging support from the secondary proc-based
    // VM-execution controls: EPT and unrestricted-guest bits.
    {
        struct vmx_sec_proc_ctrls sec_proc_ctrls;
        sec_proc_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.sec_proc_ctrls));
        
        if (sec_proc_ctrls.enable_ept == 0) {
            V3_Print("VMX EPT (Nested) Paging not supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_CPU;
        } else if (sec_proc_ctrls.unrstrct_guest == 0) {
            V3_Print("VMX EPT (Nested) Paging supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_CPU;
        } else {
            V3_Print("VMX EPT (Nested) Paging + Unrestricted guest supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_UG_CPU;
        }
    }
    
}
1157
1158
1159 void v3_deinit_vmx_cpu(int cpu_id) {
1160     extern v3_cpu_arch_t v3_cpu_types[];
1161     v3_cpu_types[cpu_id] = V3_INVALID_CPU;
1162
1163     if (host_vmcs_ptrs[cpu_id] != 0) {
1164         V3_Print("Disabling VMX\n");
1165
1166         if (vmx_off() != VMX_SUCCESS) {
1167             PrintError("Error executing VMXOFF\n");
1168         }
1169
1170         V3_FreePages((void *)host_vmcs_ptrs[cpu_id], 1);
1171
1172         host_vmcs_ptrs[cpu_id] = 0;
1173     }
1174 }