Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git

This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, execute

  cd palacios
  git checkout --track -b devel origin/devel

The other branches can be checked out the same way, as shown below.
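For example, to work from a release branch instead, track it by the same pattern (the branch name below is illustrative; run "git branch -r" inside the clone to see which release branches actually exist):

  git checkout --track -b Release-1.2 origin/Release-1.2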


Commit: HVM capability enhancement: asynchronous upcalls to ROS userspace
File: palacios/src/palacios/vmx.c
/* 
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National 
 * Science Foundation and the Department of Energy.  
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at 
 * http://www.v3vee.org
 *
 * Copyright (c) 2011, Jack Lange <jarusl@cs.northwestern.edu>
 * Copyright (c) 2011, The V3VEE Project <http://www.v3vee.org> 
 * All rights reserved.
 *
 * Author: Jack Lange <jarusl@cs.northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */


#include <palacios/vmx.h>
#include <palacios/vmm.h>
#include <palacios/vmx_handler.h>
#include <palacios/vmcs.h>
#include <palacios/vmx_lowlevel.h>
#include <palacios/vmm_lowlevel.h>
#include <palacios/vmm_ctrl_regs.h>
#include <palacios/vmm_config.h>
#include <palacios/vmm_time.h>
#include <palacios/vm_guest_mem.h>
#include <palacios/vmm_direct_paging.h>
#include <palacios/vmx_io.h>
#include <palacios/vmx_msr.h>
#include <palacios/vmm_decoder.h>
#include <palacios/vmm_barrier.h>
#include <palacios/vmm_timeout.h>
#include <palacios/vmm_debug.h>

#ifdef V3_CONFIG_CHECKPOINT
#include <palacios/vmm_checkpoint.h>
#endif

#include <palacios/vmx_ept.h>
#include <palacios/vmx_assist.h>
#include <palacios/vmx_hw_info.h>

#ifdef V3_CONFIG_MEM_TRACK
#include <palacios/vmm_mem_track.h>
#endif

#ifndef V3_CONFIG_DEBUG_VMX
#undef PrintDebug
#define PrintDebug(fmt, args...)
#endif


/* These fields contain the hardware feature sets supported by the local CPU */
static struct vmx_hw_info hw_info;

extern v3_cpu_arch_t v3_mach_type;

static addr_t host_vmcs_ptrs[V3_CONFIG_MAX_CPUS] = { [0 ... V3_CONFIG_MAX_CPUS - 1] = 0};

extern int v3_vmx_launch(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
extern int v3_vmx_resume(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);

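/* Thin wrappers around vmcs_write()/vmcs_read() that log failures.
 * Note the differing conventions: check_vmcs_write() returns 0/1,
 * while check_vmcs_read() returns the raw VMX status code.
 */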
static inline int check_vmcs_write(vmcs_field_t field, addr_t val) {
    int ret = 0;

    ret = vmcs_write(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError(VM_NONE, VCORE_NONE, "VMWRITE error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
        return 1;
    }

    return 0;
}

static inline int check_vmcs_read(vmcs_field_t field, void * val) {
    int ret = 0;

    ret = vmcs_read(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError(VM_NONE, VCORE_NONE, "VMREAD error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
    }

    return ret;
}

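/* Allocate and zero a single page for a VMCS (or VMXON) region, stamp it
 * with the hardware revision ID, and return its physical address.
 * Returns -1 (cast to addr_t) if the page allocation fails.
 */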
static addr_t allocate_vmcs() {
    void * temp;
    struct vmcs_data * vmcs_page = NULL;

    PrintDebug(VM_NONE, VCORE_NONE, "Allocating page\n");

    temp = V3_AllocPages(1); // need not be shadow-safe, not exposed to guest
    if (!temp) { 
        PrintError(VM_NONE, VCORE_NONE, "Cannot allocate VMCS\n");
        return -1;
    }
    vmcs_page = (struct vmcs_data *)V3_VAddr(temp);
    memset(vmcs_page, 0, 4096);

    vmcs_page->revision = hw_info.basic_info.revision;
    PrintDebug(VM_NONE, VCORE_NONE, "VMX Revision: 0x%x\n", vmcs_page->revision);

    return (addr_t)V3_PAddr((void *)vmcs_page);
}


#if 0
static int debug_efer_read(struct guest_info * core, uint_t msr, struct v3_msr * src, void * priv_data) {
    struct v3_msr * efer = (struct v3_msr *)&(core->ctrl_regs.efer);
    V3_Print(core->vm_info, core, "\n\nEFER READ (val = %p)\n", (void *)efer->value);
    
    v3_print_guest_state(core);
    v3_print_vmcs();

    src->value = efer->value;
    return 0;
}

static int debug_efer_write(struct guest_info * core, uint_t msr, struct v3_msr src, void * priv_data) {
    struct v3_msr * efer = (struct v3_msr *)&(core->ctrl_regs.efer);
    V3_Print(core->vm_info, core, "\n\nEFER WRITE (old_val = %p) (new_val = %p)\n", (void *)efer->value, (void *)src.value);
    
    v3_print_guest_state(core);
    v3_print_vmcs();

    efer->value = src.value;

    return 0;
}
#endif

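/* One-time VMCS setup for a core: load the VMCS, install the default
 * pin/proc/exit/entry controls reported by the hardware, configure paging
 * (shadow, EPT, or EPT + unrestricted guest), and set up the MSR
 * save/load areas. Runs with global interrupts disabled.
 */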
static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state) {
    int vmx_ret = 0;

    /* Get Available features */
    struct vmx_pin_ctrls avail_pin_ctrls;
    avail_pin_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.pin_ctrls));
    /* ** */


    // disable global interrupts for vm state initialization
    v3_disable_ints();

    PrintDebug(core->vm_info, core, "Loading VMCS\n");
    vmx_ret = vmcs_load(vmx_state->vmcs_ptr_phys);
    vmx_state->state = VMX_UNLAUNCHED;

    if (vmx_ret != VMX_SUCCESS) {
        PrintError(core->vm_info, core, "VMPTRLD failed\n");
        return -1;
    }


    /*** Setup default state from HW ***/

    vmx_state->pin_ctrls.value = hw_info.pin_ctrls.def_val;
    vmx_state->pri_proc_ctrls.value = hw_info.proc_ctrls.def_val;
    vmx_state->exit_ctrls.value = hw_info.exit_ctrls.def_val;
    vmx_state->entry_ctrls.value = hw_info.entry_ctrls.def_val;
    vmx_state->sec_proc_ctrls.value = hw_info.sec_proc_ctrls.def_val;

    /* Print Control MSRs */
    V3_Print(core->vm_info, core, "CR0 MSR: req_val=%p, req_mask=%p\n", (void *)(addr_t)hw_info.cr0.req_val, (void *)(addr_t)hw_info.cr0.req_mask);
    V3_Print(core->vm_info, core, "CR4 MSR: req_val=%p, req_mask=%p\n", (void *)(addr_t)hw_info.cr4.req_val, (void *)(addr_t)hw_info.cr4.req_mask);



    /******* Setup Host State **********/

    /* Cache GDTR, IDTR, and TR in host struct */


    /********** Setup VMX Control Fields ***********/

    /* Add external interrupts, NMI exiting, and virtual NMI */
    vmx_state->pin_ctrls.nmi_exit = 1;
    vmx_state->pin_ctrls.virt_nmi = 1;
    vmx_state->pin_ctrls.ext_int_exit = 1;



    /* We enable the preemption timer by default to measure accurate guest time */
    if (avail_pin_ctrls.active_preempt_timer) {
        V3_Print(core->vm_info, core, "VMX Preemption Timer is available\n");
        vmx_state->pin_ctrls.active_preempt_timer = 1;
        vmx_state->exit_ctrls.save_preempt_timer = 1;
    }

    // we want it to use this when halting
    vmx_state->pri_proc_ctrls.hlt_exit = 1;

    // cpuid tells it that it does not have these instructions
    vmx_state->pri_proc_ctrls.monitor_exit = 1;
    vmx_state->pri_proc_ctrls.mwait_exit = 1;

    // we don't need to handle a pause, although this is where
    // we could pull out of a spin lock acquire or schedule to find its partner
    vmx_state->pri_proc_ctrls.pause_exit = 0;

    vmx_state->pri_proc_ctrls.tsc_offset = 1;
#ifdef V3_CONFIG_TIME_VIRTUALIZE_TSC
    vmx_state->pri_proc_ctrls.rdtsc_exit = 1;
#endif

    /* Setup IO map */
    vmx_state->pri_proc_ctrls.use_io_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_A_ADDR, (addr_t)V3_PAddr(core->vm_info->io_map.arch_data));
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_B_ADDR, 
            (addr_t)V3_PAddr(core->vm_info->io_map.arch_data) + PAGE_SIZE_4KB);


    vmx_state->pri_proc_ctrls.use_msr_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_MSR_BITMAP, (addr_t)V3_PAddr(core->vm_info->msr_map.arch_data));



#ifdef __V3_64BIT__
    // Ensure host runs in 64-bit mode at each VM EXIT
    vmx_state->exit_ctrls.host_64_on = 1;
#endif



    // Restore host's EFER register on each VM EXIT
    vmx_state->exit_ctrls.ld_efer = 1;

    // Save/restore guest's EFER register to/from VMCS on VM EXIT/ENTRY
    vmx_state->exit_ctrls.save_efer = 1;
    vmx_state->entry_ctrls.ld_efer  = 1;

    vmx_state->exit_ctrls.save_pat = 1;
    vmx_state->exit_ctrls.ld_pat = 1;
    vmx_state->entry_ctrls.ld_pat = 1;

    /* Temporary GPF trap */
    //  vmx_state->excp_bmap.gp = 1;

    // Set up the guest's initial PAT field
    vmx_ret |= check_vmcs_write(VMCS_GUEST_PAT, 0x0007040600070406LL);

    // Capture CR8 mods so that we can keep the apic_tpr correct
    vmx_state->pri_proc_ctrls.cr8_ld_exit = 1;
    vmx_state->pri_proc_ctrls.cr8_str_exit = 1;


    /* Setup paging */
    if (core->shdw_pg_mode == SHADOW_PAGING) {
        PrintDebug(core->vm_info, core, "Creating initial shadow page table\n");

        if (v3_init_passthrough_pts(core) == -1) {
            PrintError(core->vm_info, core, "Could not initialize passthrough page tables\n");
            return -1;
        }
        
#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
#define CR0_NE 0x00000020
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP | CR0_NE));


        // Cause VM_EXIT whenever CR4.VMXE or CR4.PAE bits are written
        vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);

        v3_activate_passthrough_pt(core);

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Add CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 1;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 1;

        // Note that we intercept cr4.pae writes
        // and we have cr4 read-shadowed to the shadow pager's cr4

        vmx_state->pri_proc_ctrls.invlpg_exit = 1;
        
        /* Add page fault exits */
        vmx_state->excp_bmap.pf = 1;

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

        // Hook all accesses to EFER register
        v3_hook_msr(core->vm_info, EFER_MSR, 
                    &v3_handle_efer_read,
                    &v3_handle_efer_write, 
                    core);

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_mach_type == V3_VMX_EPT_CPU)) {

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
#define CR0_NE 0x00000020
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP | CR0_NE));

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        // Cause VM_EXIT whenever CR4.VMXE or CR4.PAE bits are written
        vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);
        
        /* Disable CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;

        /* Add page fault exits */
        //      vmx_state->excp_bmap.pf = 1; // This should never happen..., enabled to catch bugs
        
        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging



        if (v3_init_nested_paging_core(core, &hw_info) == -1) {
            PrintError(core->vm_info, core, "Error initializing EPT\n");
            return -1;
        }

        // Hook all accesses to EFER register
        v3_hook_msr(core->vm_info, EFER_MSR, NULL, NULL, NULL);

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_mach_type == V3_VMX_EPT_UG_CPU)) {
        int i = 0;
        // For now we will assume that unrestricted guest mode is assured w/ EPT


        core->vm_regs.rsp = 0x00;
        core->rip = 0xfff0;
        core->vm_regs.rdx = 0x00000f00;
        core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
        core->ctrl_regs.cr0 = 0x60010030; 
        core->ctrl_regs.cr4 = 0x00002010; // Enable VMX and PSE flag
        

        core->segments.cs.selector = 0xf000;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = 0x0000000f0000LL;

        // (raw attributes = 0xf3)
        core->segments.cs.type = 0xb;
        core->segments.cs.system = 0x1;
        core->segments.cs.dpl = 0x0;
        core->segments.cs.present = 1;



        struct v3_segment * segregs [] = {&(core->segments.ss), &(core->segments.ds), 
                                          &(core->segments.es), &(core->segments.fs), 
                                          &(core->segments.gs), NULL};

        for (i = 0; segregs[i] != NULL; i++) {
            struct v3_segment * seg = segregs[i];
        
            seg->selector = 0x0000;
            //    seg->base = seg->selector << 4;
            seg->base = 0x00000000;
            seg->limit = 0xffff;


            seg->type = 0x3;
            seg->system = 0x1;
            seg->dpl = 0x0;
            seg->present = 1;
            //    seg->granularity = 1;

        }


        core->segments.gdtr.limit = 0x0000ffff;
        core->segments.gdtr.base = 0x0000000000000000LL;

        core->segments.idtr.limit = 0x0000ffff;
        core->segments.idtr.base = 0x0000000000000000LL;

        core->segments.ldtr.selector = 0x0000;
        core->segments.ldtr.limit = 0x0000ffff;
        core->segments.ldtr.base = 0x0000000000000000LL;
        core->segments.ldtr.type = 0x2;
        core->segments.ldtr.present = 1;

        core->segments.tr.selector = 0x0000;
        core->segments.tr.limit = 0x0000ffff;
        core->segments.tr.base = 0x0000000000000000LL;
        core->segments.tr.type = 0xb;
        core->segments.tr.present = 1;

        //      core->dbg_regs.dr6 = 0x00000000ffff0ff0LL;
        core->dbg_regs.dr7 = 0x0000000000000400LL;

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
        vmx_state->sec_proc_ctrls.unrstrct_guest = 1; // enable unrestricted guest operation


        /* Disable shadow paging stuff */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;


        // Cause VM_EXIT whenever the CR4.VMXE bit is set
        vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE);
#define CR0_NE 0x00000020
#define CR0_CD 0x40000000
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, CR0_NE | CR0_CD);
        ((struct cr0_32 *)&(core->shdw_pg_state.guest_cr0))->ne = 1;
        ((struct cr0_32 *)&(core->shdw_pg_state.guest_cr0))->cd = 0;

        if (v3_init_nested_paging_core(core, &hw_info) == -1) {
            PrintError(core->vm_info, core, "Error initializing EPT\n");
            return -1;
        }

        // Hook all accesses to EFER register
        //      v3_hook_msr(core->vm_info, EFER_MSR, &debug_efer_read, &debug_efer_write, core);
        v3_hook_msr(core->vm_info, EFER_MSR, NULL, NULL, NULL);
    } else {
        PrintError(core->vm_info, core, "Invalid Virtual paging mode (pg_mode=%d) (mach_type=%d)\n", core->shdw_pg_mode, v3_mach_type);
        return -1;
    }


    // hook vmx msrs

    // Setup SYSCALL/SYSENTER MSRs in load/store area
    
    // save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
    {

        struct vmcs_msr_save_area * msr_entries = NULL;
        int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;
        int msr_ret = 0;

        V3_Print(core->vm_info, core, "Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);

        if (max_msrs < 4) {
            PrintError(core->vm_info, core, "Max MSR cache size is too small (%d)\n", max_msrs);
            return -1;
        }

        vmx_state->msr_area_paddr = (addr_t)V3_AllocPages(1); // need not be shadow-safe, not exposed to guest
        
        if (vmx_state->msr_area_paddr == (addr_t)NULL) {
            PrintError(core->vm_info, core, "could not allocate msr load/store area\n");
            return -1;
        }

        msr_entries = (struct vmcs_msr_save_area *)V3_VAddr((void *)(vmx_state->msr_area_paddr));
        vmx_state->msr_area = msr_entries; // cache in vmx_info

        memset(msr_entries, 0, PAGE_SIZE);

        msr_entries->guest_star.index = IA32_STAR_MSR;
        msr_entries->guest_lstar.index = IA32_LSTAR_MSR;
        msr_entries->guest_fmask.index = IA32_FMASK_MSR;
        msr_entries->guest_kern_gs.index = IA32_KERN_GS_BASE_MSR;

        msr_entries->host_star.index = IA32_STAR_MSR;
        msr_entries->host_lstar.index = IA32_LSTAR_MSR;
        msr_entries->host_fmask.index = IA32_FMASK_MSR;
        msr_entries->host_kern_gs.index = IA32_KERN_GS_BASE_MSR;

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(msr_entries->guest_msrs));
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(msr_entries->guest_msrs));
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(msr_entries->host_msrs));


        msr_ret |= v3_hook_msr(core->vm_info, IA32_STAR_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, IA32_LSTAR_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, IA32_FMASK_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, IA32_KERN_GS_BASE_MSR, NULL, NULL, NULL);


        // IMPORTANT: These MSRs appear to be cached by the hardware....
        msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_CS_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_ESP_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_EIP_MSR, NULL, NULL, NULL);

        msr_ret |= v3_hook_msr(core->vm_info, FS_BASE_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, GS_BASE_MSR, NULL, NULL, NULL);

        msr_ret |= v3_hook_msr(core->vm_info, IA32_PAT_MSR, NULL, NULL, NULL);

        // Not sure what to do about this... Does not appear to be an explicit hardware cache version...
        msr_ret |= v3_hook_msr(core->vm_info, IA32_CSTAR_MSR, NULL, NULL, NULL);

        if (msr_ret != 0) {
            PrintError(core->vm_info, core, "Error configuring MSR save/restore area\n");
            return -1;
        }

    }

    /* Sanity check ctrl/reg fields against hw_defaults */



    /*** Write all the info to the VMCS ***/

    /*
    {
        // IS THIS NECESSARY???
#define DEBUGCTL_MSR 0x1d9
        struct v3_msr tmp_msr;
        v3_get_msr(DEBUGCTL_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
        vmx_ret |= check_vmcs_write(VMCS_GUEST_DBG_CTL, tmp_msr.value);
        core->dbg_regs.dr7 = 0x400;
    }
    */

#ifdef __V3_64BIT__
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffffffffffULL);
#else
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffUL);
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR_HIGH, (addr_t)0xffffffffUL);
#endif

    if (vmx_ret != VMX_SUCCESS) { 
        PrintError(core->vm_info, core, "Error configuring VMX\n");
        return -1;
    }


    if (v3_update_vmcs_ctrl_fields(core)) {
        PrintError(core->vm_info, core, "Could not write control fields!\n");
        return -1;
    }
    
    /*
    if (v3_update_vmcs_host_state(core)) {
        PrintError(core->vm_info, core, "Could not write host state\n");
        return -1;
    }
    */

    // reenable global interrupts for vm state initialization now
    // that the vm state is initialized. If another VM kicks us off, 
    // it'll update our vmx state so that we know to reload ourselves
    v3_enable_ints();

    return 0;
}

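/* Per-core VMCS construction, invoked directly or via V3_Call_On_CPU() so
 * that the VMCLEAR/VMPTRLD sequence executes on a VMX-capable physical CPU.
 * Leaves the core in CORE_STOPPED on success.
 */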
static void __init_vmx_vmcs(void * arg) {
    struct guest_info * core = arg;
    struct vmx_data * vmx_state = NULL;
    int vmx_ret = 0;
    
    vmx_state = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));

    if (!vmx_state) {
        PrintError(core->vm_info, core, "Unable to allocate in initializing vmx vmcs\n");
        return;
    }

    memset(vmx_state, 0, sizeof(struct vmx_data));

    PrintDebug(core->vm_info, core, "vmx_data pointer: %p\n", (void *)vmx_state);

    PrintDebug(core->vm_info, core, "Allocating VMCS\n");
    vmx_state->vmcs_ptr_phys = allocate_vmcs();

    if (vmx_state->vmcs_ptr_phys == (addr_t)-1) {
        // allocate_vmcs() returns -1 if the underlying page allocation fails
        PrintError(core->vm_info, core, "Could not allocate VMCS\n");
        V3_Free(vmx_state);
        return;
    }

    PrintDebug(core->vm_info, core, "VMCS pointer: %p\n", (void *)(vmx_state->vmcs_ptr_phys));

    core->vmm_data = vmx_state;
    vmx_state->state = VMX_UNLAUNCHED;

    PrintDebug(core->vm_info, core, "Initializing VMCS (addr=%p)\n", core->vmm_data);
    
    // TODO: Fix vmcs fields so they're 32-bit

    PrintDebug(core->vm_info, core, "Clearing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    if (vmx_ret != VMX_SUCCESS) {
        PrintError(core->vm_info, core, "VMCLEAR failed\n");
        return; 
    }

    if (core->vm_info->vm_class == V3_PC_VM) {
        PrintDebug(core->vm_info, core, "Initializing VMCS\n");
        if (init_vmcs_bios(core, vmx_state) == -1) {
            PrintError(core->vm_info, core, "Error initializing VMCS to BIOS state\n");
            return;
        }
    } else {
        PrintError(core->vm_info, core, "Invalid VM Class\n");
        return;
    }

    PrintDebug(core->vm_info, core, "Serializing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    if (vmx_ret != VMX_SUCCESS) { 
        PrintError(core->vm_info, core, "VMCS Clear failed\n");
        return;
    }

    core->core_run_state = CORE_STOPPED;
    return;
}

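/* Entry point for core initialization. If the current physical CPU has not
 * been identified as VMX-capable, the work is pushed to the first CPU that
 * has been.
 */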
int v3_init_vmx_vmcs(struct guest_info * core, v3_vm_class_t vm_class) {
    extern v3_cpu_arch_t v3_cpu_types[];

    if (v3_cpu_types[V3_Get_CPU()] == V3_INVALID_CPU) {
        int i = 0;

        for (i = 0; i < V3_CONFIG_MAX_CPUS; i++) {
            if (v3_cpu_types[i] != V3_INVALID_CPU) {
                break;
            }
        }

        if (i == V3_CONFIG_MAX_CPUS) {
            PrintError(core->vm_info, core, "Could not find VALID CPU for VMX guest initialization\n");
            return -1;
        }

        V3_Call_On_CPU(i, __init_vmx_vmcs, core);

    } else {
        __init_vmx_vmcs(core);
    }

    if (core->core_run_state != CORE_STOPPED) {
        PrintError(core->vm_info, core, "Error initializing VMX Core\n");
        return -1;
    }

    return 0;
}


int v3_deinit_vmx_vmcs(struct guest_info * core) {
    struct vmx_data * vmx_state = core->vmm_data;

    V3_FreePages((void *)(vmx_state->vmcs_ptr_phys), 1);
    V3_FreePages(V3_PAddr(vmx_state->msr_area), 1);

    V3_Free(vmx_state);

    return 0;
}



#ifdef V3_CONFIG_CHECKPOINT
/* 
 * JRL: This is broken
 */
int v3_vmx_save_core(struct guest_info * core, void * ctx){
  struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
  
  // note that the vmcs pointer is an HPA, but we need an HVA
  if (v3_chkpt_save(ctx, "vmcs_data", PAGE_SIZE_4KB, 
                    V3_VAddr((void*) (vmx_info->vmcs_ptr_phys)))) {
    PrintError(core->vm_info, core, "Could not save vmcs data for VMX\n");
    return -1;
  }
  
  return 0;
}

int v3_vmx_load_core(struct guest_info * core, void * ctx){
  struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
  struct cr0_32 * shadow_cr0;
  addr_t vmcs_page_paddr;  //HPA
  
  vmcs_page_paddr = (addr_t) V3_AllocPages(1); // need not be shadow-safe, not exposed to guest
  
  if (!vmcs_page_paddr) { 
    PrintError(core->vm_info, core, "Could not allocate space for a vmcs in VMX\n");
    return -1;
  }
  
  if (v3_chkpt_load(ctx, "vmcs_data", PAGE_SIZE_4KB, 
                    V3_VAddr((void *)vmcs_page_paddr)) == -1) { 
    PrintError(core->vm_info, core, "Could not load vmcs data for VMX\n");
    V3_FreePages((void*)vmcs_page_paddr,1);
    return -1;
  }

  vmcs_clear(vmx_info->vmcs_ptr_phys);
  
  // Probably need to delete the old one... 
  V3_FreePages((void*)(vmx_info->vmcs_ptr_phys),1);
  
  vmcs_load(vmcs_page_paddr);
  
  v3_vmx_save_vmcs(core);

  shadow_cr0 = (struct cr0_32 *)&(core->ctrl_regs.cr0);


  /* Get the CPU mode to set the guest_ia32e entry ctrl */
  
  if (core->shdw_pg_mode == SHADOW_PAGING) {
    if (v3_get_vm_mem_mode(core) == VIRTUAL_MEM) {
      if (v3_activate_shadow_pt(core) == -1) {
        PrintError(core->vm_info, core, "Failed to activate shadow page tables\n");
        return -1;
      }
    } else {
      if (v3_activate_passthrough_pt(core) == -1) {
        PrintError(core->vm_info, core, "Failed to activate passthrough page tables\n");
        return -1;
      }
    }
  }
  
  return 0;
}
#endif


void v3_flush_vmx_vm_core(struct guest_info * core) {
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmx_info->state = VMX_UNLAUNCHED;
}

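/* If an interrupt injection was in progress and the IDT vectoring info
 * shows the exit did not interrupt its delivery, notify the interrupt
 * subsystem that the IRQ has been delivered.
 */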
static int update_irq_exit_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));

    if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 0)) {
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print(info->vm_info, info, "Calling v3_injecting_intr\n");
#endif
        info->intr_core_state.irq_started = 0;
        v3_injecting_intr(info, info->intr_core_state.irq_vector, V3_EXTERNAL_IRQ);
    }

    return 0;
}

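/* Decide what (if anything) to inject on the next VM entry: pending
 * exceptions first; then, if the guest is interruptible, either re-inject
 * an interrupted event from the IDT vectoring info or inject a new
 * IRQ/NMI/software interrupt; otherwise request interrupt-window exiting
 * so we are notified when injection becomes possible.
 */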
static int update_irq_entry_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;
    struct vmcs_interrupt_state intr_core_state;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
    check_vmcs_read(VMCS_GUEST_INT_STATE, &(intr_core_state));

    /* Check for pending exceptions to inject */
    if (v3_excp_pending(info)) {
        struct vmx_entry_int_info int_info;
        int_info.value = 0;

        // In VMX, almost every exception is hardware
        // Software exceptions are pretty much only for breakpoint or overflow
        int_info.type = 3;
        int_info.vector = v3_get_excp_number(info);

        if (info->excp_state.excp_error_code_valid) {
            check_vmcs_write(VMCS_ENTRY_EXCP_ERR, info->excp_state.excp_error_code);
            int_info.error_code = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print(info->vm_info, info, "Injecting exception %d with error code %x\n", 
                    int_info.vector, info->excp_state.excp_error_code);
#endif
        }

        int_info.valid = 1;
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print(info->vm_info, info, "Injecting exception %d (EIP=%p)\n", int_info.vector, (void *)(addr_t)info->rip);
#endif
        check_vmcs_write(VMCS_ENTRY_INT_INFO, int_info.value);

        v3_injecting_excp(info, int_info.vector);

    } else if ((((struct rflags *)&(info->ctrl_regs.rflags))->intr == 1) && 
               (intr_core_state.val == 0)) {
       
        if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 1)) {

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print(info->vm_info, info, "IRQ pending from previous injection\n");
#endif

            // Copy the IDT vectoring info over to reinject the old interrupt
            if (idt_vec_info.error_code == 1) {
                uint32_t err_code = 0;

                check_vmcs_read(VMCS_IDT_VECTOR_ERR, &err_code);
                check_vmcs_write(VMCS_ENTRY_EXCP_ERR, err_code);
            }

            idt_vec_info.undef = 0;
            check_vmcs_write(VMCS_ENTRY_INT_INFO, idt_vec_info.value);

        } else {
            struct vmx_entry_int_info ent_int;
            ent_int.value = 0;

            switch (v3_intr_pending(info)) {
                case V3_EXTERNAL_IRQ: {
                  
                    int irq = v3_get_intr(info); 

                    if (irq < 0) {
                      break;
                    }

                    info->intr_core_state.irq_vector = irq; 
                    ent_int.vector = info->intr_core_state.irq_vector;
                    ent_int.type = 0;
                    ent_int.error_code = 0;
                    ent_int.valid = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
                    V3_Print(info->vm_info, info, "Injecting Interrupt %d at exit %u (EIP=%p)\n", 
                               info->intr_core_state.irq_vector, 
                               (uint32_t)info->num_exits, 
                               (void *)(addr_t)info->rip);
#endif

                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);
                    info->intr_core_state.irq_started = 1;

                    break;
                }
                case V3_NMI:
                    PrintDebug(info->vm_info, info, "Injecting NMI\n");

                    ent_int.type = 2;
                    ent_int.vector = 2;
                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_SOFTWARE_INTR:
                    PrintDebug(info->vm_info, info, "Injecting software interrupt\n");
                    ent_int.type = 4;

                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_VIRTUAL_IRQ:
                    // Not sure what to do here, Intel doesn't have virtual IRQs
                    // May be the same as external interrupts/IRQs

                    break;
                case V3_INVALID_INTR:
                default:
                    break;
            }
        }
    } else if ((v3_intr_pending(info)) && (vmx_info->pri_proc_ctrls.int_wndw_exit == 0)) {
        // Enable INTR window exiting so we know when IF=1
        uint32_t instr_len;

        check_vmcs_read(VMCS_EXIT_INSTR_LEN, &instr_len);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print(info->vm_info, info, "Enabling Interrupt-Window exiting: %d\n", instr_len);
#endif

        vmx_info->pri_proc_ctrls.int_wndw_exit = 1;
        check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
    }


    return 0;
}

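/* Ring buffers holding the last 10 exits (and the guest RIPs at which they
 * occurred) for post-mortem debugging via print_exit_log().
 */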
static struct vmx_exit_info exit_log[10];
static uint64_t rip_log[10];



static void print_exit_log(struct guest_info * info) {
    int cnt = info->num_exits % 10;
    int i = 0;
    

    V3_Print(info->vm_info, info, "\nExit Log (%d total exits):\n", (uint32_t)info->num_exits);

    for (i = 0; i < 10; i++) {
        struct vmx_exit_info * tmp = &exit_log[cnt];

        V3_Print(info->vm_info, info, "%d:\texit_reason = %p\n", i, (void *)(addr_t)tmp->exit_reason);
        V3_Print(info->vm_info, info, "\texit_qual = %p\n", (void *)tmp->exit_qual);
        V3_Print(info->vm_info, info, "\tint_info = %p\n", (void *)(addr_t)tmp->int_info);
        V3_Print(info->vm_info, info, "\tint_err = %p\n", (void *)(addr_t)tmp->int_err);
        V3_Print(info->vm_info, info, "\tinstr_info = %p\n", (void *)(addr_t)tmp->instr_info);
        V3_Print(info->vm_info, info, "\tguest_linear_addr = %p\n", (void *)(addr_t)tmp->guest_linear_addr);
        V3_Print(info->vm_info, info, "\tRIP = %p\n", (void *)rip_log[cnt]);

        cnt--;

        if (cnt == -1) {
            cnt = 9;
        }

    }

}

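/* Configure TSC handling for the next entry: trap RDTSC when full TSC
 * virtualization is requested, otherwise let the guest read the TSC
 * directly and apply a TSC offset (zero in passthrough mode).
 */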
int v3_vmx_config_tsc_virtualization(struct guest_info * info) {
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    if (info->time_state.flags & VM_TIME_TRAP_RDTSC) {
        if (!vmx_info->pri_proc_ctrls.rdtsc_exit) {
            vmx_info->pri_proc_ctrls.rdtsc_exit = 1;
            check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
        }
    } else {
        sint64_t tsc_offset;
        uint32_t tsc_offset_low, tsc_offset_high;

        if (vmx_info->pri_proc_ctrls.rdtsc_exit) {
            vmx_info->pri_proc_ctrls.rdtsc_exit = 0;
            check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
        }

        if (info->time_state.flags & VM_TIME_TSC_PASSTHROUGH) {
            tsc_offset = 0;
        } else {
            tsc_offset = v3_tsc_host_offset(&info->time_state);
        }
        tsc_offset_high = (uint32_t)((tsc_offset >> 32) & 0xffffffff);
        tsc_offset_low = (uint32_t)(tsc_offset & 0xffffffff);

        check_vmcs_write(VMCS_TSC_OFFSET_HIGH, tsc_offset_high);
        check_vmcs_write(VMCS_TSC_OFFSET, tsc_offset_low);
    }
    return 0;
}

/* 
 * CAUTION and DANGER!!! 
 * 
 * The VMCS CANNOT(!!) be accessed outside of the cli/sti calls inside this function.
 * When executing a symbiotic call, the VMCS WILL be overwritten, so any dependencies 
 * on its contents will cause things to break. The contents at the time of the exit WILL 
 * change before the exit handler is executed.
 */
int v3_vmx_enter(struct guest_info * info) {
    int ret = 0;
    struct vmx_exit_info exit_info;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);
    uint64_t guest_cycles = 0;

    // Conditionally yield the CPU if the timeslice has expired
    v3_schedule(info);

#ifdef V3_CONFIG_MEM_TRACK
    v3_mem_track_entry(info);
#endif

#ifdef V3_CONFIG_HVM
    v3_handle_hvm_entry(info);
#endif

    // Update timer devices late after being in the VM so that as much 
    // of the time in the VM is accounted for as possible. Also do it before
    // updating IRQ entry state so that any interrupts the timers raise get 
    // handled on the next VM entry.
    v3_advance_time(info, NULL);
    v3_update_timers(info);

    // disable global interrupts for vm state transition
    v3_disable_ints();

    if (vmcs_store() != vmx_info->vmcs_ptr_phys) {
        vmcs_clear(vmx_info->vmcs_ptr_phys);
        vmcs_load(vmx_info->vmcs_ptr_phys);
        vmx_info->state = VMX_UNLAUNCHED;
    }

    v3_vmx_restore_vmcs(info);


#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_entry_state(info);
    }
#else 
    update_irq_entry_state(info);
#endif

    {
        addr_t guest_cr3;
        vmcs_read(VMCS_GUEST_CR3, &guest_cr3);
        vmcs_write(VMCS_GUEST_CR3, guest_cr3);
    }


    // Perform last-minute time setup prior to entering the VM
    v3_vmx_config_tsc_virtualization(info);

    if (v3_update_vmcs_host_state(info)) {
        v3_enable_ints();
        PrintError(info->vm_info, info, "Could not write host state\n");
        return -1;
    }
    
    if (vmx_info->pin_ctrls.active_preempt_timer) {
        /* Preemption timer is active */
        uint32_t preempt_window = 0xffffffff;

        if (info->timeouts.timeout_active) {
            preempt_window = info->timeouts.next_timeout;
        }
        
        check_vmcs_write(VMCS_PREEMPT_TIMER, preempt_window);
    }

    V3_FP_ENTRY_RESTORE(info);

    {   
        uint64_t entry_tsc = 0;
        uint64_t exit_tsc = 0;

#ifdef V3_CONFIG_PWRSTAT_TELEMETRY
        v3_pwrstat_telemetry_enter(info);
#endif

#ifdef V3_CONFIG_PMU_TELEMETRY
        v3_pmu_telemetry_enter(info);
#endif

        if (vmx_info->state == VMX_UNLAUNCHED) {
            vmx_info->state = VMX_LAUNCHED;
            rdtscll(entry_tsc);
            ret = v3_vmx_launch(&(info->vm_regs), info, &(info->ctrl_regs));
            rdtscll(exit_tsc);

        } else {
            V3_ASSERT(info->vm_info, info, vmx_info->state != VMX_UNLAUNCHED);
            rdtscll(entry_tsc);
            ret = v3_vmx_resume(&(info->vm_regs), info, &(info->ctrl_regs));
            rdtscll(exit_tsc);
        }

        guest_cycles = exit_tsc - entry_tsc;

#ifdef V3_CONFIG_PMU_TELEMETRY
        v3_pmu_telemetry_exit(info);
#endif

#ifdef V3_CONFIG_PWRSTAT_TELEMETRY
        v3_pwrstat_telemetry_exit(info);
#endif
    }

    //  PrintDebug(info->vm_info, info, "VMX Exit: ret=%d\n", ret);

    if (ret != VMX_SUCCESS) {
        uint32_t error = 0;
        vmcs_read(VMCS_INSTR_ERR, &error);

        v3_enable_ints();

        PrintError(info->vm_info, info, "VMENTRY Error: %d (launch_ret = %d)\n", error, ret);
        return -1;
    }


    info->num_exits++;

    V3_FP_EXIT_SAVE(info);

    /* If we have the preemption timer, then use it to get a more accurate guest time */
    if (vmx_info->pin_ctrls.active_preempt_timer) {
        uint32_t cycles_left = 0;
        check_vmcs_read(VMCS_PREEMPT_TIMER, &(cycles_left));

        if (info->timeouts.timeout_active) {
            guest_cycles = info->timeouts.next_timeout - cycles_left;
        } else {
            guest_cycles = 0xffffffff - cycles_left;
        }
    }

    // Immediate exit from VM time bookkeeping
    v3_advance_time(info, &guest_cycles);

    /* Update guest state */
    v3_vmx_save_vmcs(info);

    // info->cpl = info->segments.cs.selector & 0x3;

    info->mem_mode = v3_get_vm_mem_mode(info);
    info->cpu_mode = v3_get_vm_cpu_mode(info);



    check_vmcs_read(VMCS_EXIT_INSTR_LEN, &(exit_info.instr_len));
    check_vmcs_read(VMCS_EXIT_INSTR_INFO, &(exit_info.instr_info));
    check_vmcs_read(VMCS_EXIT_REASON, &(exit_info.exit_reason));
    check_vmcs_read(VMCS_EXIT_QUAL, &(exit_info.exit_qual));
    check_vmcs_read(VMCS_EXIT_INT_INFO, &(exit_info.int_info));
    check_vmcs_read(VMCS_EXIT_INT_ERR, &(exit_info.int_err));
    check_vmcs_read(VMCS_GUEST_LINEAR_ADDR, &(exit_info.guest_linear_addr));

    if (info->shdw_pg_mode == NESTED_PAGING) {
        check_vmcs_read(VMCS_GUEST_PHYS_ADDR, &(exit_info.ept_fault_addr));
    } else {
        exit_info.ept_fault_addr = 0;
    }

    //PrintDebug(info->vm_info, info, "VMX Exit taken, id-qual: %u-%lu\n", exit_info.exit_reason, exit_info.exit_qual);

    exit_log[info->num_exits % 10] = exit_info;
    rip_log[info->num_exits % 10] = get_addr_linear(info, info->rip, &(info->segments.cs));

#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_exit_state(info);
    }
#else
    update_irq_exit_state(info);
#endif

    if (exit_info.exit_reason == VMX_EXIT_INTR_WINDOW) {
        // This is a special case whose only job is to inject an interrupt
        vmcs_read(VMCS_PROC_CTRLS, &(vmx_info->pri_proc_ctrls.value));
        vmx_info->pri_proc_ctrls.int_wndw_exit = 0;
        vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print(info->vm_info, info, "Interrupts available again! (RIP=%llx)\n", info->rip);
#endif
    }


    // Lastly we check for an NMI exit, and reinject if so
    {
        struct vmx_basic_exit_info * basic_info = (struct vmx_basic_exit_info *)&(exit_info.exit_reason);

        if (basic_info->reason == VMX_EXIT_INFO_EXCEPTION_OR_NMI) {
            if ((uint8_t)exit_info.int_info == 2) {
                asm("int $2");
            }
        }
    }

    // reenable global interrupts after vm exit
    v3_enable_ints();

    // Conditionally yield the CPU if the timeslice has expired
    v3_schedule(info);
    v3_advance_time(info, NULL);
    v3_update_timers(info);

    if (v3_handle_vmx_exit(info, &exit_info) == -1) {
        PrintError(info->vm_info, info, "Error in VMX exit handler (Exit reason=%x)\n", exit_info.exit_reason);
        return -1;
    }

    if (info->timeouts.timeout_active) {
        /* Check to see if any timeouts have expired */
        v3_handle_timeouts(info, guest_cycles);
    }

#ifdef V3_CONFIG_HVM
    v3_handle_hvm_exit(info);
#endif

#ifdef V3_CONFIG_MEM_TRACK
    v3_mem_track_exit(info);
#endif

    return 0;
}

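/* Core run loop: wait (if necessary) for core 0 to release this core, then
 * repeatedly enter the guest until the VM stops or an entry fails, dumping
 * guest state, the VMCS, and the exit log on failure.
 */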
int v3_start_vmx_guest(struct guest_info * info) {

    PrintDebug(info->vm_info, info, "Starting VMX core %u\n", info->vcpu_id);

#ifdef V3_CONFIG_MULTIBOOT
    if (v3_setup_multiboot_core_for_boot(info)) { 
        PrintError(info->vm_info, info, "Failed to setup Multiboot core...\n");
        return -1;
    }
#endif

#ifdef V3_CONFIG_HVM
    if (v3_setup_hvm_hrt_core_for_boot(info)) { 
        PrintError(info->vm_info, info, "Failed to setup HRT core...\n");
        return -1;
    }
#endif
    
    while (1) {
        if (info->core_run_state == CORE_STOPPED) {
            if (info->vcpu_id == 0) {
                info->core_run_state = CORE_RUNNING;
            } else {
                
                PrintDebug(info->vm_info, info, "VMX core %u: Waiting for core initialization\n", info->vcpu_id);

                V3_NO_WORK(info);
                
                while (info->core_run_state == CORE_STOPPED) {
                    
                    if (info->vm_info->run_state == VM_STOPPED) {
                        // The VM was stopped before this core was initialized. 
                        return 0;
                    }

                    V3_STILL_NO_WORK(info);
                    //PrintDebug(info->vm_info, info, "VMX core %u: still waiting for INIT\n", info->vcpu_id);
                }

                V3_HAVE_WORK_AGAIN(info);

                PrintDebug(info->vm_info, info, "VMX core %u initialized\n", info->vcpu_id);
                
                // We'll be paranoid about race conditions here
                v3_wait_at_barrier(info);
            }
            
            
            PrintDebug(info->vm_info, info, "VMX core %u: I am starting at CS=0x%x (base=0x%p, limit=0x%x), RIP=0x%p\n",
                       info->vcpu_id, info->segments.cs.selector, (void *)(info->segments.cs.base),
                       info->segments.cs.limit, (void *)(info->rip));
            
            
            PrintDebug(info->vm_info, info, "VMX core %u: Launching VMX VM on logical core %u\n", info->vcpu_id, info->pcpu_id);
            
            v3_start_time(info);
            
            
            if (info->vm_info->run_state == VM_STOPPED) {
                info->core_run_state = CORE_STOPPED;
                break;
            }
        }
        
        
#ifdef V3_CONFIG_HVM
        if (v3_handle_hvm_reset(info) > 0) {
            continue;
        }
#endif
        
#ifdef V3_CONFIG_MULTIBOOT
        if (v3_handle_multiboot_reset(info) > 0) {
            continue;
        }
#endif

#ifdef V3_CONFIG_PMU_TELEMETRY
        v3_pmu_telemetry_start(info);
#endif
        
#ifdef V3_CONFIG_PWRSTAT_TELEMETRY
        v3_pwrstat_telemetry_start(info);
#endif
        
        
        if (v3_vmx_enter(info) == -1) {
            
            addr_t host_addr;
            addr_t linear_addr = 0;
            
            info->vm_info->run_state = VM_ERROR;
            
            V3_Print(info->vm_info, info, "VMX core %u: VMX ERROR!!\n", info->vcpu_id); 
            
            v3_print_guest_state(info);
            
            V3_Print(info->vm_info, info, "VMX core %u\n", info->vcpu_id); 

            linear_addr = get_addr_linear(info, info->rip, &(info->segments.cs));
            
            if (info->mem_mode == PHYSICAL_MEM) {
                if (v3_gpa_to_hva(info, linear_addr, &host_addr)) {
                    PrintError(info->vm_info, info, "Cannot translate address\n");
                    return -1;
                }
            } else if (info->mem_mode == VIRTUAL_MEM) {
                if (v3_gva_to_hva(info, linear_addr, &host_addr)) {
                    PrintError(info->vm_info, info, "Cannot translate address\n");
                    return -1;
                }
            }
            
            V3_Print(info->vm_info, info, "VMX core %u: Host Address of rip = 0x%p\n", info->vcpu_id, (void *)host_addr);
            
            V3_Print(info->vm_info, info, "VMX core %u: Instr (15 bytes) at %p:\n", info->vcpu_id, (void *)host_addr);
            v3_dump_mem((uint8_t *)host_addr, 15);
            
            v3_print_stack(info);


            v3_print_vmcs();
            print_exit_log(info);
            return -1;
        }

        v3_wait_at_barrier(info);


        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }
/*
        if ((info->num_exits % 5000) == 0) {
            V3_Print(info->vm_info, info, "VMX Exit number %d\n", (uint32_t)info->num_exits);
        }
*/

    }

#ifdef V3_CONFIG_PMU_TELEMETRY
    v3_pmu_telemetry_end(info);
#endif

#ifdef V3_CONFIG_PWRSTAT_TELEMETRY
    v3_pwrstat_telemetry_end(info);
#endif

    return 0;
}




#define VMX_FEATURE_CONTROL_MSR     0x0000003a
#define CPUID_VMX_FEATURES 0x00000005  /* LOCK and VMXON */
#define CPUID_1_ECX_VTXFLAG 0x00000020

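/* Check CPUID.1:ECX for the VMX flag, then confirm via the feature-control
 * MSR that the BIOS has locked VMX on (lock + VMXON enable bits set).
 */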
int v3_is_vmx_capable() {
    v3_msr_t feature_msr;
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

    v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);

    PrintDebug(VM_NONE, VCORE_NONE, "ECX: 0x%x\n", ecx);

    if (ecx & CPUID_1_ECX_VTXFLAG) {
        v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));
        
        PrintDebug(VM_NONE, VCORE_NONE, "MSRREGlow: 0x%.8x\n", feature_msr.lo);

        if ((feature_msr.lo & CPUID_VMX_FEATURES) != CPUID_VMX_FEATURES) {
            PrintDebug(VM_NONE, VCORE_NONE, "VMX is locked -- enable in the BIOS\n");
            return 0;
        }

    } else {
        PrintDebug(VM_NONE, VCORE_NONE, "VMX not supported on this CPU\n");
        return 0;
    }

    return 1;
}

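/* Reset a core for reboot. With EPT + unrestricted guest the core can be
 * placed directly into real mode at the given vector; otherwise the vector
 * is stashed in registers (presumably for a bootstrap shim to pick up).
 */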
int v3_reset_vmx_vm_core(struct guest_info * core, addr_t rip) {
    // init vmcs bios
    
    if ((core->shdw_pg_mode == NESTED_PAGING) && 
        (v3_mach_type == V3_VMX_EPT_UG_CPU)) {
        // easy 
        core->rip = 0;
        core->segments.cs.selector = rip << 8;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = rip << 12;
    } else {
        core->vm_regs.rdx = core->vcpu_id;
        core->vm_regs.rbx = rip;
    }

    return 0;
}

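/* Per-CPU bring-up: probe hardware features once, execute VMXON with a
 * freshly allocated region, and classify the CPU (VMX, VMX+EPT, or
 * VMX+EPT+unrestricted guest) from the secondary processor controls.
 */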
void v3_init_vmx_cpu(int cpu_id) {
    addr_t vmx_on_region = 0;
    extern v3_cpu_arch_t v3_mach_type;
    extern v3_cpu_arch_t v3_cpu_types[];

    if (v3_mach_type == V3_INVALID_CPU) {
        if (v3_init_vmx_hw(&hw_info) == -1) {
            PrintError(VM_NONE, VCORE_NONE, "Could not initialize VMX hardware features on cpu %d\n", cpu_id);
            return;
        }
    }

    enable_vmx();


    // Setup VMXON Region
    vmx_on_region = allocate_vmcs();

    if (vmx_on_region == (addr_t)-1) {
        // allocate_vmcs() returns -1 if the page allocation fails
        PrintError(VM_NONE, VCORE_NONE, "Could not allocate VMXON region on cpu %d\n", cpu_id);
        return;
    }

    if (vmx_on(vmx_on_region) == VMX_SUCCESS) {
        V3_Print(VM_NONE, VCORE_NONE, "VMX Enabled\n");
        host_vmcs_ptrs[cpu_id] = vmx_on_region;
    } else {
        V3_Print(VM_NONE, VCORE_NONE, "VMX already enabled\n");
        V3_FreePages((void *)vmx_on_region, 1);
    }

    PrintDebug(VM_NONE, VCORE_NONE, "VMXON pointer: 0x%p\n", (void *)host_vmcs_ptrs[cpu_id]);

    {
        struct vmx_sec_proc_ctrls sec_proc_ctrls;
        sec_proc_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.sec_proc_ctrls));
        
        if (sec_proc_ctrls.enable_ept == 0) {
            V3_Print(VM_NONE, VCORE_NONE, "VMX EPT (Nested) Paging not supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_CPU;
        } else if (sec_proc_ctrls.unrstrct_guest == 0) {
            V3_Print(VM_NONE, VCORE_NONE, "VMX EPT (Nested) Paging supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_CPU;
        } else {
            V3_Print(VM_NONE, VCORE_NONE, "VMX EPT (Nested) Paging + Unrestricted guest supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_UG_CPU;
        }
    }
    
}


void v3_deinit_vmx_cpu(int cpu_id) {
    extern v3_cpu_arch_t v3_cpu_types[];
    v3_cpu_types[cpu_id] = V3_INVALID_CPU;

    if (host_vmcs_ptrs[cpu_id] != 0) {
        V3_Print(VM_NONE, VCORE_NONE, "Disabling VMX\n");

        if (vmx_off() != VMX_SUCCESS) {
            PrintError(VM_NONE, VCORE_NONE, "Error executing VMXOFF\n");
        }

        V3_FreePages((void *)host_vmcs_ptrs[cpu_id], 1);

        host_vmcs_ptrs[cpu_id] = 0;
    }
}