Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git

This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, execute

  cd palacios
  git checkout --track -b devel origin/devel

The other branches work the same way; see the example below.
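For example, to track a release branch (the name "Release-1.2" here is only illustrative; run "git branch -r" inside the clone to list the branches that actually exist):

  git checkout --track -b Release-1.2 origin/Release-1.2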


Basic HRT startup for HVM, plus assorted cleanup
palacios/src/palacios/vmx.c
/* 
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National 
 * Science Foundation and the Department of Energy.  
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at 
 * http://www.v3vee.org
 *
 * Copyright (c) 2011, Jack Lange <jarusl@cs.northwestern.edu>
 * Copyright (c) 2011, The V3VEE Project <http://www.v3vee.org> 
 * All rights reserved.
 *
 * Author: Jack Lange <jarusl@cs.northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */


#include <palacios/vmx.h>
#include <palacios/vmm.h>
#include <palacios/vmx_handler.h>
#include <palacios/vmcs.h>
#include <palacios/vmx_lowlevel.h>
#include <palacios/vmm_lowlevel.h>
#include <palacios/vmm_ctrl_regs.h>
#include <palacios/vmm_config.h>
#include <palacios/vmm_time.h>
#include <palacios/vm_guest_mem.h>
#include <palacios/vmm_direct_paging.h>
#include <palacios/vmx_io.h>
#include <palacios/vmx_msr.h>
#include <palacios/vmm_decoder.h>
#include <palacios/vmm_barrier.h>
#include <palacios/vmm_timeout.h>
#include <palacios/vmm_debug.h>

#ifdef V3_CONFIG_CHECKPOINT
#include <palacios/vmm_checkpoint.h>
#endif

#include <palacios/vmx_ept.h>
#include <palacios/vmx_assist.h>
#include <palacios/vmx_hw_info.h>

#ifdef V3_CONFIG_MEM_TRACK
#include <palacios/vmm_mem_track.h>
#endif

#ifndef V3_CONFIG_DEBUG_VMX
#undef PrintDebug
#define PrintDebug(fmt, args...)
#endif


/* These fields contain the hardware feature sets supported by the local CPU */
static struct vmx_hw_info hw_info;

extern v3_cpu_arch_t v3_mach_type;

static addr_t host_vmcs_ptrs[V3_CONFIG_MAX_CPUS] = { [0 ... V3_CONFIG_MAX_CPUS - 1] = 0};

extern int v3_vmx_launch(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
extern int v3_vmx_resume(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);

static inline int check_vmcs_write(vmcs_field_t field, addr_t val) {
    int ret = 0;

    ret = vmcs_write(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError(VM_NONE, VCORE_NONE, "VMWRITE error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
        return 1;
    }

    return 0;
}

static inline int check_vmcs_read(vmcs_field_t field, void * val) {
    int ret = 0;

    ret = vmcs_read(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError(VM_NONE, VCORE_NONE, "VMREAD error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
    }

    return ret;
}
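
/* Note the convention above: both helpers return nonzero on failure
 * (check_vmcs_write maps any error to 1, check_vmcs_read returns the raw
 * vmcs_read code), so callers can accumulate results with |= and test once. */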


static addr_t allocate_vmcs() {
    void * temp;
    struct vmcs_data * vmcs_page = NULL;

    PrintDebug(VM_NONE, VCORE_NONE, "Allocating page\n");

    temp = V3_AllocPages(1); // need not be shadow-safe, not exposed to guest
    if (!temp) { 
        PrintError(VM_NONE, VCORE_NONE, "Cannot allocate VMCS\n");
        return -1;
    }
    vmcs_page = (struct vmcs_data *)V3_VAddr(temp);
    memset(vmcs_page, 0, 4096);

    vmcs_page->revision = hw_info.basic_info.revision;
    PrintDebug(VM_NONE, VCORE_NONE, "VMX Revision: 0x%x\n", vmcs_page->revision);

    return (addr_t)V3_PAddr((void *)vmcs_page);
}


#if 0
static int debug_efer_read(struct guest_info * core, uint_t msr, struct v3_msr * src, void * priv_data) {
    struct v3_msr * efer = (struct v3_msr *)&(core->ctrl_regs.efer);
    V3_Print(core->vm_info, core, "\n\nEFER READ (val = %p)\n", (void *)efer->value);

    v3_print_guest_state(core);
    v3_print_vmcs();

    src->value = efer->value;
    return 0;
}

static int debug_efer_write(struct guest_info * core, uint_t msr, struct v3_msr src, void * priv_data) {
    struct v3_msr * efer = (struct v3_msr *)&(core->ctrl_regs.efer);
    V3_Print(core->vm_info, core, "\n\nEFER WRITE (old_val = %p) (new_val = %p)\n", (void *)efer->value, (void *)src.value);

    v3_print_guest_state(core);
    v3_print_vmcs();

    efer->value = src.value;

    return 0;
}
#endif


static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state) {
    int vmx_ret = 0;

    /* Get Available features */
    struct vmx_pin_ctrls avail_pin_ctrls;
    avail_pin_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.pin_ctrls));
    /* ** */

    // disable global interrupts for vm state initialization
    v3_disable_ints();

    PrintDebug(core->vm_info, core, "Loading VMCS\n");
    vmx_ret = vmcs_load(vmx_state->vmcs_ptr_phys);
    vmx_state->state = VMX_UNLAUNCHED;

    if (vmx_ret != VMX_SUCCESS) {
        PrintError(core->vm_info, core, "VMPTRLD failed\n");
        return -1;
    }

    /*** Setup default state from HW ***/

    vmx_state->pin_ctrls.value = hw_info.pin_ctrls.def_val;
    vmx_state->pri_proc_ctrls.value = hw_info.proc_ctrls.def_val;
    vmx_state->exit_ctrls.value = hw_info.exit_ctrls.def_val;
    vmx_state->entry_ctrls.value = hw_info.entry_ctrls.def_val;
    vmx_state->sec_proc_ctrls.value = hw_info.sec_proc_ctrls.def_val;

    /* Print Control MSRs */
    V3_Print(core->vm_info, core, "CR0 MSR: req_val=%p, req_mask=%p\n", (void *)(addr_t)hw_info.cr0.req_val, (void *)(addr_t)hw_info.cr0.req_mask);
    V3_Print(core->vm_info, core, "CR4 MSR: req_val=%p, req_mask=%p\n", (void *)(addr_t)hw_info.cr4.req_val, (void *)(addr_t)hw_info.cr4.req_mask);


    /******* Setup Host State **********/

    /* Cache GDTR, IDTR, and TR in host struct */


    /********** Setup VMX Control Fields ***********/

    /* Add external interrupts, NMI exiting, and virtual NMI */
    vmx_state->pin_ctrls.nmi_exit = 1;
    vmx_state->pin_ctrls.virt_nmi = 1;
    vmx_state->pin_ctrls.ext_int_exit = 1;


    /* We enable the preemption timer by default to measure accurate guest time */
    if (avail_pin_ctrls.active_preempt_timer) {
        V3_Print(core->vm_info, core, "VMX Preemption Timer is available\n");
        vmx_state->pin_ctrls.active_preempt_timer = 1;
        vmx_state->exit_ctrls.save_preempt_timer = 1;
    }
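    /* The preemption timer counts down at a rate derived from the TSC while
     * the guest runs; with save_preempt_timer set, the remaining count can be
     * read back from the VMCS at exit, which v3_vmx_enter() uses below to
     * reconstruct how many cycles the guest actually ran. */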

    // we want it to use this when halting
    vmx_state->pri_proc_ctrls.hlt_exit = 1;

    // cpuid tells it that it does not have these instructions
    vmx_state->pri_proc_ctrls.monitor_exit = 1;
    vmx_state->pri_proc_ctrls.mwait_exit = 1;

    // we don't need to handle a pause, although this is where
    // we could pull out of a spin lock acquire or schedule to find its partner
    vmx_state->pri_proc_ctrls.pause_exit = 0;

    vmx_state->pri_proc_ctrls.tsc_offset = 1;
#ifdef V3_CONFIG_TIME_VIRTUALIZE_TSC
    vmx_state->pri_proc_ctrls.rdtsc_exit = 1;
#endif

    /* Setup IO map */
    vmx_state->pri_proc_ctrls.use_io_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_A_ADDR, (addr_t)V3_PAddr(core->vm_info->io_map.arch_data));
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_B_ADDR, 
            (addr_t)V3_PAddr(core->vm_info->io_map.arch_data) + PAGE_SIZE_4KB);

    vmx_state->pri_proc_ctrls.use_msr_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_MSR_BITMAP, (addr_t)V3_PAddr(core->vm_info->msr_map.arch_data));

#ifdef __V3_64BIT__
    // Ensure host runs in 64-bit mode at each VM EXIT
    vmx_state->exit_ctrls.host_64_on = 1;
#endif

    // Restore host's EFER register on each VM EXIT
    vmx_state->exit_ctrls.ld_efer = 1;

    // Save/restore guest's EFER register to/from VMCS on VM EXIT/ENTRY
    vmx_state->exit_ctrls.save_efer = 1;
    vmx_state->entry_ctrls.ld_efer  = 1;

    vmx_state->exit_ctrls.save_pat = 1;
    vmx_state->exit_ctrls.ld_pat = 1;
    vmx_state->entry_ctrls.ld_pat = 1;

    /* Temporary GPF trap */
    //  vmx_state->excp_bmap.gp = 1;

    // Setup guest's initial PAT field
    vmx_ret |= check_vmcs_write(VMCS_GUEST_PAT, 0x0007040600070406LL);
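    // 0x0007040600070406 is the architectural power-on default PAT: each byte
    // is one PAT entry (0x06 = WB, 0x04 = WT, 0x07 = UC-, 0x00 = UC)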

    // Capture CR8 mods so that we can keep the apic_tpr correct
    vmx_state->pri_proc_ctrls.cr8_ld_exit = 1;
    vmx_state->pri_proc_ctrls.cr8_str_exit = 1;


    /* Setup paging */
    if (core->shdw_pg_mode == SHADOW_PAGING) {
        PrintDebug(core->vm_info, core, "Creating initial shadow page table\n");

        if (v3_init_passthrough_pts(core) == -1) {
            PrintError(core->vm_info, core, "Could not initialize passthrough page tables\n");
            return -1;
        }

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
#define CR0_NE 0x00000020
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP | CR0_NE));

        // Cause VM_EXIT whenever CR4.VMXE or CR4.PAE bits are written
        vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);

        v3_activate_passthrough_pt(core);

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Add CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 1;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 1;

        // Note that we intercept cr4.pae writes
        // and we have cr4 read-shadowed to the shadow pager's cr4

        vmx_state->pri_proc_ctrls.invlpg_exit = 1;

        /* Add page fault exits */
        vmx_state->excp_bmap.pf = 1;

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

        // Hook all accesses to EFER register
        v3_hook_msr(core->vm_info, EFER_MSR, 
                    &v3_handle_efer_read,
                    &v3_handle_efer_write, 
                    core);

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_mach_type == V3_VMX_EPT_CPU)) {

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
#define CR0_NE 0x00000020
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP | CR0_NE));

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        // Cause VM_EXIT whenever CR4.VMXE or CR4.PAE bits are written
        vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);

        /* Disable CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;

        /* Add page fault exits */
        //      vmx_state->excp_bmap.pf = 1; // This should never happen..., enabled to catch bugs

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging

        if (v3_init_nested_paging_core(core, &hw_info) == -1) {
            PrintError(core->vm_info, core, "Error initializing EPT\n");
            return -1;
        }

        // Hook all accesses to EFER register
        v3_hook_msr(core->vm_info, EFER_MSR, NULL, NULL, NULL);

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_mach_type == V3_VMX_EPT_UG_CPU)) {
        int i = 0;
        // For now we will assume that unrestricted guest mode is assured w/ EPT

        core->vm_regs.rsp = 0x00;
        core->rip = 0xfff0;
        core->vm_regs.rdx = 0x00000f00;
        core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
        core->ctrl_regs.cr0 = 0x60010030;
        core->ctrl_regs.cr4 = 0x00002010; // Enable VMX and PSE flag
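        // cr0 0x60010030 = CD | NW | WP | NE | ET; cr4 0x00002010 = VMXE | PSE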

        core->segments.cs.selector = 0xf000;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = 0x0000000f0000LL;

        // (raw attributes = 0xf3)
        core->segments.cs.type = 0xb;
        core->segments.cs.system = 0x1;
        core->segments.cs.dpl = 0x0;
        core->segments.cs.present = 1;

        struct v3_segment * segregs [] = {&(core->segments.ss), &(core->segments.ds), 
                                          &(core->segments.es), &(core->segments.fs), 
                                          &(core->segments.gs), NULL};

        for (i = 0; segregs[i] != NULL; i++) {
            struct v3_segment * seg = segregs[i];

            seg->selector = 0x0000;
            //    seg->base = seg->selector << 4;
            seg->base = 0x00000000;
            seg->limit = 0xffff;

            seg->type = 0x3;
            seg->system = 0x1;
            seg->dpl = 0x0;
            seg->present = 1;
            //    seg->granularity = 1;
        }

        core->segments.gdtr.limit = 0x0000ffff;
        core->segments.gdtr.base = 0x0000000000000000LL;

        core->segments.idtr.limit = 0x0000ffff;
        core->segments.idtr.base = 0x0000000000000000LL;

        core->segments.ldtr.selector = 0x0000;
        core->segments.ldtr.limit = 0x0000ffff;
        core->segments.ldtr.base = 0x0000000000000000LL;
        core->segments.ldtr.type = 0x2;
        core->segments.ldtr.present = 1;

        core->segments.tr.selector = 0x0000;
        core->segments.tr.limit = 0x0000ffff;
        core->segments.tr.base = 0x0000000000000000LL;
        core->segments.tr.type = 0xb;
        core->segments.tr.present = 1;

        //      core->dbg_regs.dr6 = 0x00000000ffff0ff0LL;
        core->dbg_regs.dr7 = 0x0000000000000400LL;

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
        vmx_state->sec_proc_ctrls.unrstrct_guest = 1; // enable unrestricted guest operation

        /* Disable shadow paging stuff */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;

        // Cause VM_EXIT whenever the CR4.VMXE bit is set
        vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE);
#define CR0_NE 0x00000020
#define CR0_CD 0x40000000
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, CR0_NE | CR0_CD);
        ((struct cr0_32 *)&(core->shdw_pg_state.guest_cr0))->ne = 1;
        ((struct cr0_32 *)&(core->shdw_pg_state.guest_cr0))->cd = 0;

        if (v3_init_nested_paging_core(core, &hw_info) == -1) {
            PrintError(core->vm_info, core, "Error initializing EPT\n");
            return -1;
        }

        // Hook all accesses to EFER register
        //      v3_hook_msr(core->vm_info, EFER_MSR, &debug_efer_read, &debug_efer_write, core);
        v3_hook_msr(core->vm_info, EFER_MSR, NULL, NULL, NULL);
    } else {
        PrintError(core->vm_info, core, "Invalid Virtual paging mode (pg_mode=%d) (mach_type=%d)\n", core->shdw_pg_mode, v3_mach_type);
        return -1;
    }


    // hook vmx msrs

    // Setup SYSCALL/SYSENTER MSRs in load/store area

    // save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
    {
        struct vmcs_msr_save_area * msr_entries = NULL;
        int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;
        int msr_ret = 0;

        V3_Print(core->vm_info, core, "Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);

        if (max_msrs < 4) {
            PrintError(core->vm_info, core, "Max MSR cache size is too small (%d)\n", max_msrs);
            return -1;
        }

        vmx_state->msr_area_paddr = (addr_t)V3_AllocPages(1); // need not be shadow-safe, not exposed to guest

        if (vmx_state->msr_area_paddr == (addr_t)NULL) {
            PrintError(core->vm_info, core, "could not allocate msr load/store area\n");
            return -1;
        }

        msr_entries = (struct vmcs_msr_save_area *)V3_VAddr((void *)(vmx_state->msr_area_paddr));
        vmx_state->msr_area = msr_entries; // cache in vmx_info

        memset(msr_entries, 0, PAGE_SIZE);

        msr_entries->guest_star.index = IA32_STAR_MSR;
        msr_entries->guest_lstar.index = IA32_LSTAR_MSR;
        msr_entries->guest_fmask.index = IA32_FMASK_MSR;
        msr_entries->guest_kern_gs.index = IA32_KERN_GS_BASE_MSR;

        msr_entries->host_star.index = IA32_STAR_MSR;
        msr_entries->host_lstar.index = IA32_LSTAR_MSR;
        msr_entries->host_fmask.index = IA32_FMASK_MSR;
        msr_entries->host_kern_gs.index = IA32_KERN_GS_BASE_MSR;

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(msr_entries->guest_msrs));
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(msr_entries->guest_msrs));
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(msr_entries->host_msrs));
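        /* Layout: on VM exit the four guest_* entries are stored to guest_msrs
         * and the four host_* entries are loaded from host_msrs; on VM entry
         * the guest_* entries are loaded back, hence all three counts of 4. */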

        msr_ret |= v3_hook_msr(core->vm_info, IA32_STAR_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, IA32_LSTAR_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, IA32_FMASK_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, IA32_KERN_GS_BASE_MSR, NULL, NULL, NULL);

        // IMPORTANT: These MSRs appear to be cached by the hardware....
        msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_CS_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_ESP_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_EIP_MSR, NULL, NULL, NULL);

        msr_ret |= v3_hook_msr(core->vm_info, FS_BASE_MSR, NULL, NULL, NULL);
        msr_ret |= v3_hook_msr(core->vm_info, GS_BASE_MSR, NULL, NULL, NULL);

        msr_ret |= v3_hook_msr(core->vm_info, IA32_PAT_MSR, NULL, NULL, NULL);

        // Not sure what to do about this... Does not appear to be an explicit hardware cache version...
        msr_ret |= v3_hook_msr(core->vm_info, IA32_CSTAR_MSR, NULL, NULL, NULL);

        if (msr_ret != 0) {
            PrintError(core->vm_info, core, "Error configuring MSR save/restore area\n");
            return -1;
        }
    }

    /* Sanity check ctrl/reg fields against hw_defaults */


    /*** Write all the info to the VMCS ***/

    /*
    {
        // IS THIS NECESSARY???
#define DEBUGCTL_MSR 0x1d9
        struct v3_msr tmp_msr;
        v3_get_msr(DEBUGCTL_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
        vmx_ret |= check_vmcs_write(VMCS_GUEST_DBG_CTL, tmp_msr.value);
        core->dbg_regs.dr7 = 0x400;
    }
    */

#ifdef __V3_64BIT__
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffffffffffULL);
#else
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffUL);
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR_HIGH, (addr_t)0xffffffffUL);
#endif
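    /* A VMCS link pointer of all 1s is required when VMCS shadowing is not in
     * use; any other value fails the VM-entry checks. */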

    if (v3_update_vmcs_ctrl_fields(core)) {
        PrintError(core->vm_info, core, "Could not write control fields!\n");
        return -1;
    }

    /*
    if (v3_update_vmcs_host_state(core)) {
        PrintError(core->vm_info, core, "Could not write host state\n");
        return -1;
    }
    */

    // reenable global interrupts now that vm state initialization is done.
    // If another VM kicks us off, it'll update our vmx state so that we
    // know to reload ourselves
    v3_enable_ints();

    return 0;
}


static void __init_vmx_vmcs(void * arg) {
    struct guest_info * core = arg;
    struct vmx_data * vmx_state = NULL;
    int vmx_ret = 0;

    vmx_state = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));

    if (!vmx_state) {
        PrintError(core->vm_info, core, "Unable to allocate in initializing vmx vmcs\n");
        return;
    }

    memset(vmx_state, 0, sizeof(struct vmx_data));

    PrintDebug(core->vm_info, core, "vmx_data pointer: %p\n", (void *)vmx_state);

    PrintDebug(core->vm_info, core, "Allocating VMCS\n");
    vmx_state->vmcs_ptr_phys = allocate_vmcs();

    PrintDebug(core->vm_info, core, "VMCS pointer: %p\n", (void *)(vmx_state->vmcs_ptr_phys));

    core->vmm_data = vmx_state;
    vmx_state->state = VMX_UNLAUNCHED;

    PrintDebug(core->vm_info, core, "Initializing VMCS (addr=%p)\n", core->vmm_data);

    // TODO: Fix vmcs fields so they're 32-bit

    PrintDebug(core->vm_info, core, "Clearing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    if (vmx_ret != VMX_SUCCESS) {
        PrintError(core->vm_info, core, "VMCLEAR failed\n");
        return;
    }

    if (core->vm_info->vm_class == V3_PC_VM) {
        PrintDebug(core->vm_info, core, "Initializing VMCS\n");
        if (init_vmcs_bios(core, vmx_state) == -1) {
            PrintError(core->vm_info, core, "Error initializing VMCS to BIOS state\n");
            return;
        }
    } else {
        PrintError(core->vm_info, core, "Invalid VM Class\n");
        return;
    }

    PrintDebug(core->vm_info, core, "Serializing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    core->core_run_state = CORE_STOPPED;
    return;
}



int v3_init_vmx_vmcs(struct guest_info * core, v3_vm_class_t vm_class) {
    extern v3_cpu_arch_t v3_cpu_types[];

    if (v3_cpu_types[V3_Get_CPU()] == V3_INVALID_CPU) {
        int i = 0;

        for (i = 0; i < V3_CONFIG_MAX_CPUS; i++) {
            if (v3_cpu_types[i] != V3_INVALID_CPU) {
                break;
            }
        }

        if (i == V3_CONFIG_MAX_CPUS) {
            PrintError(core->vm_info, core, "Could not find VALID CPU for VMX guest initialization\n");
            return -1;
        }

        V3_Call_On_CPU(i, __init_vmx_vmcs, core);

    } else {
        __init_vmx_vmcs(core);
    }

    if (core->core_run_state != CORE_STOPPED) {
        PrintError(core->vm_info, core, "Error initializing VMX Core\n");
        return -1;
    }

    return 0;
}


int v3_deinit_vmx_vmcs(struct guest_info * core) {
    struct vmx_data * vmx_state = core->vmm_data;

    V3_FreePages((void *)(vmx_state->vmcs_ptr_phys), 1);
    V3_FreePages(V3_PAddr(vmx_state->msr_area), 1);

    V3_Free(vmx_state);

    return 0;
}



#ifdef V3_CONFIG_CHECKPOINT
/* 
 * JRL: This is broken
 */
int v3_vmx_save_core(struct guest_info * core, void * ctx){
  struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);

  // note that the vmcs pointer is an HPA, but we need an HVA
  if (v3_chkpt_save(ctx, "vmcs_data", PAGE_SIZE_4KB, 
                    V3_VAddr((void*) (vmx_info->vmcs_ptr_phys)))) {
    PrintError(core->vm_info, core, "Could not save vmcs data for VMX\n");
    return -1;
  }

  return 0;
}

int v3_vmx_load_core(struct guest_info * core, void * ctx){
  struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
  struct cr0_32 * shadow_cr0;
  addr_t vmcs_page_paddr;  //HPA

  vmcs_page_paddr = (addr_t) V3_AllocPages(1); // need not be shadow-safe, not exposed to guest

  if (!vmcs_page_paddr) { 
    PrintError(core->vm_info, core, "Could not allocate space for a vmcs in VMX\n");
    return -1;
  }

  if (v3_chkpt_load(ctx, "vmcs_data", PAGE_SIZE_4KB, 
                    V3_VAddr((void *)vmcs_page_paddr)) == -1) { 
    PrintError(core->vm_info, core, "Could not load vmcs data for VMX\n");
    V3_FreePages((void*)vmcs_page_paddr, 1);
    return -1;
  }

  vmcs_clear(vmx_info->vmcs_ptr_phys);

  // Probably need to delete the old one... 
  V3_FreePages((void*)(vmx_info->vmcs_ptr_phys), 1);

  vmcs_load(vmcs_page_paddr);

  v3_vmx_save_vmcs(core);

  shadow_cr0 = (struct cr0_32 *)&(core->ctrl_regs.cr0);

  /* Get the CPU mode to set the guest_ia32e entry ctrl */

  if (core->shdw_pg_mode == SHADOW_PAGING) {
    if (v3_get_vm_mem_mode(core) == VIRTUAL_MEM) {
      if (v3_activate_shadow_pt(core) == -1) {
        PrintError(core->vm_info, core, "Failed to activate shadow page tables\n");
        return -1;
      }
    } else {
      if (v3_activate_passthrough_pt(core) == -1) {
        PrintError(core->vm_info, core, "Failed to activate passthrough page tables\n");
        return -1;
      }
    }
  }

  return 0;
}
#endif


void v3_flush_vmx_vm_core(struct guest_info * core) {
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmx_info->state = VMX_UNLAUNCHED;
}



static int update_irq_exit_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));

    if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 0)) {
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print(info->vm_info, info, "Calling v3_injecting_intr\n");
#endif
        info->intr_core_state.irq_started = 0;
        v3_injecting_intr(info, info->intr_core_state.irq_vector, V3_EXTERNAL_IRQ);
    }

    return 0;
}

static int update_irq_entry_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;
    struct vmcs_interrupt_state intr_core_state;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
    check_vmcs_read(VMCS_GUEST_INT_STATE, &(intr_core_state));
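
    /* Reference for the interruption "type" values used below (from the
     * VM-entry interruption-information format): 0 = external interrupt,
     * 2 = NMI, 3 = hardware exception, 4 = software interrupt. */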

    /* Check for pending exceptions to inject */
    if (v3_excp_pending(info)) {
        struct vmx_entry_int_info int_info;
        int_info.value = 0;

        // In VMX, almost every exception is hardware
        // Software exceptions are pretty much only for breakpoint or overflow
        int_info.type = 3;
        int_info.vector = v3_get_excp_number(info);

        if (info->excp_state.excp_error_code_valid) {
            check_vmcs_write(VMCS_ENTRY_EXCP_ERR, info->excp_state.excp_error_code);
            int_info.error_code = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print(info->vm_info, info, "Injecting exception %d with error code %x\n", 
                    int_info.vector, info->excp_state.excp_error_code);
#endif
        }

        int_info.valid = 1;
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print(info->vm_info, info, "Injecting exception %d (EIP=%p)\n", int_info.vector, (void *)(addr_t)info->rip);
#endif
        check_vmcs_write(VMCS_ENTRY_INT_INFO, int_info.value);

        v3_injecting_excp(info, int_info.vector);

    } else if ((((struct rflags *)&(info->ctrl_regs.rflags))->intr == 1) && 
               (intr_core_state.val == 0)) {

        if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 1)) {

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print(info->vm_info, info, "IRQ pending from previous injection\n");
#endif

            // Copy the IDT vectoring info over to reinject the old interrupt
            if (idt_vec_info.error_code == 1) {
                uint32_t err_code = 0;

                check_vmcs_read(VMCS_IDT_VECTOR_ERR, &err_code);
                check_vmcs_write(VMCS_ENTRY_EXCP_ERR, err_code);
            }

            idt_vec_info.undef = 0;
            check_vmcs_write(VMCS_ENTRY_INT_INFO, idt_vec_info.value);

        } else {
            struct vmx_entry_int_info ent_int;
            ent_int.value = 0;

            switch (v3_intr_pending(info)) {
                case V3_EXTERNAL_IRQ: {

                    int irq = v3_get_intr(info);

                    if (irq < 0) {
                        break;
                    }

                    info->intr_core_state.irq_vector = irq;
                    ent_int.vector = info->intr_core_state.irq_vector;
                    ent_int.type = 0;
                    ent_int.error_code = 0;
                    ent_int.valid = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
                    V3_Print(info->vm_info, info, "Injecting Interrupt %d at exit %u (EIP=%p)\n", 
                               info->intr_core_state.irq_vector, 
                               (uint32_t)info->num_exits, 
                               (void *)(addr_t)info->rip);
#endif

                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);
                    info->intr_core_state.irq_started = 1;

                    break;
                }
                case V3_NMI:
                    PrintDebug(info->vm_info, info, "Injecting NMI\n");

                    ent_int.type = 2;
                    ent_int.vector = 2;
                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_SOFTWARE_INTR:
                    PrintDebug(info->vm_info, info, "Injecting software interrupt\n");
                    ent_int.type = 4;

                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_VIRTUAL_IRQ:
                    // Not sure what to do here; Intel doesn't have virtual IRQs
                    // May be the same as external interrupts/IRQs
                    break;
                case V3_INVALID_INTR:
                default:
                    break;
            }
        }
    } else if ((v3_intr_pending(info)) && (vmx_info->pri_proc_ctrls.int_wndw_exit == 0)) {
        // Enable INTR window exiting so we know when IF=1
        uint32_t instr_len;

        check_vmcs_read(VMCS_EXIT_INSTR_LEN, &instr_len);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print(info->vm_info, info, "Enabling Interrupt-Window exiting: %d\n", instr_len);
#endif

        vmx_info->pri_proc_ctrls.int_wndw_exit = 1;
        check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
    }

    return 0;
}



static struct vmx_exit_info exit_log[10];
static uint64_t rip_log[10];


static void print_exit_log(struct guest_info * info) {
    int cnt = info->num_exits % 10;
    int i = 0;

    V3_Print(info->vm_info, info, "\nExit Log (%d total exits):\n", (uint32_t)info->num_exits);

    for (i = 0; i < 10; i++) {
        struct vmx_exit_info * tmp = &exit_log[cnt];

        V3_Print(info->vm_info, info, "%d:\texit_reason = %p\n", i, (void *)(addr_t)tmp->exit_reason);
        V3_Print(info->vm_info, info, "\texit_qual = %p\n", (void *)tmp->exit_qual);
        V3_Print(info->vm_info, info, "\tint_info = %p\n", (void *)(addr_t)tmp->int_info);
        V3_Print(info->vm_info, info, "\tint_err = %p\n", (void *)(addr_t)tmp->int_err);
        V3_Print(info->vm_info, info, "\tinstr_info = %p\n", (void *)(addr_t)tmp->instr_info);
        V3_Print(info->vm_info, info, "\tguest_linear_addr = %p\n", (void *)(addr_t)tmp->guest_linear_addr);
        V3_Print(info->vm_info, info, "\tRIP = %p\n", (void *)rip_log[cnt]);

        cnt--;

        if (cnt == -1) {
            cnt = 9;
        }
    }
}

int 
v3_vmx_config_tsc_virtualization(struct guest_info * info) {
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    if (info->time_state.flags & VM_TIME_TRAP_RDTSC) {
        if (!vmx_info->pri_proc_ctrls.rdtsc_exit) {
            vmx_info->pri_proc_ctrls.rdtsc_exit = 1;
            check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
        }
    } else {
        sint64_t tsc_offset;
        uint32_t tsc_offset_low, tsc_offset_high;

        if (vmx_info->pri_proc_ctrls.rdtsc_exit) {
            vmx_info->pri_proc_ctrls.rdtsc_exit = 0;
            check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
        }

        if (info->time_state.flags & VM_TIME_TSC_PASSTHROUGH) {
            tsc_offset = 0;
        } else {
            tsc_offset = v3_tsc_host_offset(&info->time_state);
        }
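
        /* With rdtsc exiting off and tsc offsetting on, the guest reads
         * (host TSC + tsc_offset), so a zero offset gives passthrough. */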
        tsc_offset_high = (uint32_t)((tsc_offset >> 32) & 0xffffffff);
        tsc_offset_low = (uint32_t)(tsc_offset & 0xffffffff);

        check_vmcs_write(VMCS_TSC_OFFSET_HIGH, tsc_offset_high);
        check_vmcs_write(VMCS_TSC_OFFSET, tsc_offset_low);
    }

    return 0;
}

/* 
 * CAUTION and DANGER!!! 
 * 
 * The VMCS CANNOT(!!) be accessed outside of the cli/sti calls inside this function
 * When executing a symbiotic call, the VMCS WILL be overwritten, so any dependencies 
 * on its contents will cause things to break. The contents at the time of the exit WILL 
 * change before the exit handler is executed.
 */
int v3_vmx_enter(struct guest_info * info) {
    int ret = 0;
    struct vmx_exit_info exit_info;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);
    uint64_t guest_cycles = 0;

    // Conditionally yield the CPU if the timeslice has expired
    v3_schedule(info);

#ifdef V3_CONFIG_MEM_TRACK
    v3_mem_track_entry(info);
#endif

    // Update timer devices late after being in the VM so that as much 
    // of the time in the VM is accounted for as possible. Also do it before
    // updating IRQ entry state so that any interrupts the timers raise get 
    // handled on the next VM entry.
    v3_advance_time(info, NULL);
    v3_update_timers(info);

    // disable global interrupts for vm state transition
    v3_disable_ints();

    if (vmcs_store() != vmx_info->vmcs_ptr_phys) {
        vmcs_clear(vmx_info->vmcs_ptr_phys);
        vmcs_load(vmx_info->vmcs_ptr_phys);
        vmx_info->state = VMX_UNLAUNCHED;
    }
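    /* The check above uses vmcs_store (VMPTRST) to get this CPU's current
     * VMCS; if another VMCS became current since we last ran, we make ours
     * current again and fall back to VMLAUNCH, since VMRESUME is only valid
     * for a VMCS that is both launched and current. */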

    v3_vmx_restore_vmcs(info);

#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_entry_state(info);
    }
#else 
    update_irq_entry_state(info);
#endif

    {
        addr_t guest_cr3;
        vmcs_read(VMCS_GUEST_CR3, &guest_cr3);
        vmcs_write(VMCS_GUEST_CR3, guest_cr3);
    }

    // Perform last-minute time setup prior to entering the VM
    v3_vmx_config_tsc_virtualization(info);

    if (v3_update_vmcs_host_state(info)) {
        v3_enable_ints();
        PrintError(info->vm_info, info, "Could not write host state\n");
        return -1;
    }

    if (vmx_info->pin_ctrls.active_preempt_timer) {
        /* Preemption timer is active */
        uint32_t preempt_window = 0xffffffff;

        if (info->timeouts.timeout_active) {
            preempt_window = info->timeouts.next_timeout;
        }

        check_vmcs_write(VMCS_PREEMPT_TIMER, preempt_window);
    }

    V3_FP_ENTRY_RESTORE(info);

    {
        uint64_t entry_tsc = 0;
        uint64_t exit_tsc = 0;

#ifdef V3_CONFIG_PWRSTAT_TELEMETRY
        v3_pwrstat_telemetry_enter(info);
#endif

#ifdef V3_CONFIG_PMU_TELEMETRY
        v3_pmu_telemetry_enter(info);
#endif

        if (vmx_info->state == VMX_UNLAUNCHED) {
            vmx_info->state = VMX_LAUNCHED;
            rdtscll(entry_tsc);
            ret = v3_vmx_launch(&(info->vm_regs), info, &(info->ctrl_regs));
            rdtscll(exit_tsc);

        } else {
            V3_ASSERT(info->vm_info, info, vmx_info->state != VMX_UNLAUNCHED);
            rdtscll(entry_tsc);
            ret = v3_vmx_resume(&(info->vm_regs), info, &(info->ctrl_regs));
            rdtscll(exit_tsc);
        }

        guest_cycles = exit_tsc - entry_tsc;

#ifdef V3_CONFIG_PMU_TELEMETRY
        v3_pmu_telemetry_exit(info);
#endif

#ifdef V3_CONFIG_PWRSTAT_TELEMETRY
        v3_pwrstat_telemetry_exit(info);
#endif
    }

    //  PrintDebug(info->vm_info, info, "VMX Exit: ret=%d\n", ret);

    if (ret != VMX_SUCCESS) {
        uint32_t error = 0;
        vmcs_read(VMCS_INSTR_ERR, &error);

        v3_enable_ints();

        PrintError(info->vm_info, info, "VMENTRY Error: %d (launch_ret = %d)\n", error, ret);
        return -1;
    }

    info->num_exits++;

    V3_FP_EXIT_SAVE(info);

    /* If we have the preemption timer, then use it to get more accurate guest time */
    if (vmx_info->pin_ctrls.active_preempt_timer) {
        uint32_t cycles_left = 0;
        check_vmcs_read(VMCS_PREEMPT_TIMER, &(cycles_left));

        if (info->timeouts.timeout_active) {
            guest_cycles = info->timeouts.next_timeout - cycles_left;
        } else {
            guest_cycles = 0xffffffff - cycles_left;
        }
    }

    // Immediate exit from VM time bookkeeping
    v3_advance_time(info, &guest_cycles);

    /* Update guest state */
    v3_vmx_save_vmcs(info);

    // info->cpl = info->segments.cs.selector & 0x3;

    info->mem_mode = v3_get_vm_mem_mode(info);
    info->cpu_mode = v3_get_vm_cpu_mode(info);


    check_vmcs_read(VMCS_EXIT_INSTR_LEN, &(exit_info.instr_len));
    check_vmcs_read(VMCS_EXIT_INSTR_INFO, &(exit_info.instr_info));
    check_vmcs_read(VMCS_EXIT_REASON, &(exit_info.exit_reason));
    check_vmcs_read(VMCS_EXIT_QUAL, &(exit_info.exit_qual));
    check_vmcs_read(VMCS_EXIT_INT_INFO, &(exit_info.int_info));
    check_vmcs_read(VMCS_EXIT_INT_ERR, &(exit_info.int_err));
    check_vmcs_read(VMCS_GUEST_LINEAR_ADDR, &(exit_info.guest_linear_addr));

    if (info->shdw_pg_mode == NESTED_PAGING) {
        check_vmcs_read(VMCS_GUEST_PHYS_ADDR, &(exit_info.ept_fault_addr));
    }

    //PrintDebug(info->vm_info, info, "VMX Exit taken, id-qual: %u-%lu\n", exit_info.exit_reason, exit_info.exit_qual);

    exit_log[info->num_exits % 10] = exit_info;
    rip_log[info->num_exits % 10] = get_addr_linear(info, info->rip, &(info->segments.cs));

#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_exit_state(info);
    }
#else
    update_irq_exit_state(info);
#endif

    if (exit_info.exit_reason == VMX_EXIT_INTR_WINDOW) {
        // This is a special case whose only job is to inject an interrupt
        vmcs_read(VMCS_PROC_CTRLS, &(vmx_info->pri_proc_ctrls.value));
        vmx_info->pri_proc_ctrls.int_wndw_exit = 0;
        vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print(info->vm_info, info, "Interrupts available again! (RIP=%llx)\n", info->rip);
#endif
    }

    // Lastly we check for an NMI exit, and reinject if so
    {
        struct vmx_basic_exit_info * basic_info = (struct vmx_basic_exit_info *)&(exit_info.exit_reason);

        if (basic_info->reason == VMX_EXIT_INFO_EXCEPTION_OR_NMI) {
            if ((uint8_t)exit_info.int_info == 2) {
                asm("int $2");
            }
        }
    }
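    /* With NMI exiting enabled, an NMI causes a VM exit instead of invoking
     * the host's NMI handler, so "int $2" is executed here to deliver it to
     * the host manually. */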

    // reenable global interrupts after vm exit
    v3_enable_ints();

    // Conditionally yield the CPU if the timeslice has expired
    v3_schedule(info);
    v3_advance_time(info, NULL);
    v3_update_timers(info);

    if (v3_handle_vmx_exit(info, &exit_info) == -1) {
        PrintError(info->vm_info, info, "Error in VMX exit handler (Exit reason=%x)\n", exit_info.exit_reason);
        return -1;
    }

    if (info->timeouts.timeout_active) {
        /* Check to see if any timeouts have expired */
        v3_handle_timeouts(info, guest_cycles);
    }

#ifdef V3_CONFIG_MEM_TRACK
    v3_mem_track_exit(info);
#endif

    return 0;
}


int v3_start_vmx_guest(struct guest_info * info) {

    PrintDebug(info->vm_info, info, "Starting VMX core %u\n", info->vcpu_id);

#ifdef V3_CONFIG_HVM
    if (v3_setup_hvm_vm_for_boot(info->vm_info)) { 
        PrintError(info->vm_info, VCORE_NONE, "HVM setup for boot failed\n");
        return -1;
    }
#endif

    while (1) {
        if (info->core_run_state == CORE_STOPPED) {
            if (info->vcpu_id == 0) {
                info->core_run_state = CORE_RUNNING;
            } else {

                PrintDebug(info->vm_info, info, "VMX core %u: Waiting for core initialization\n", info->vcpu_id);

                V3_NO_WORK(info);

                while (info->core_run_state == CORE_STOPPED) {

                    if (info->vm_info->run_state == VM_STOPPED) {
                        // The VM was stopped before this core was initialized. 
                        return 0;
                    }

                    V3_STILL_NO_WORK(info);
                    //PrintDebug(info->vm_info, info, "VMX core %u: still waiting for INIT\n", info->vcpu_id);
                }

                V3_HAVE_WORK_AGAIN(info);

                PrintDebug(info->vm_info, info, "VMX core %u initialized\n", info->vcpu_id);

                // We'll be paranoid about race conditions here
                v3_wait_at_barrier(info);
            }

            PrintDebug(info->vm_info, info, "VMX core %u: I am starting at CS=0x%x (base=0x%p, limit=0x%x), RIP=0x%p\n",
                       info->vcpu_id, info->segments.cs.selector, (void *)(info->segments.cs.base),
                       info->segments.cs.limit, (void *)(info->rip));

            PrintDebug(info->vm_info, info, "VMX core %u: Launching VMX VM on logical core %u\n", info->vcpu_id, info->pcpu_id);

            v3_start_time(info);

            if (info->vm_info->run_state == VM_STOPPED) {
                info->core_run_state = CORE_STOPPED;
                break;
            }
        }

#ifdef V3_CONFIG_PMU_TELEMETRY
        v3_pmu_telemetry_start(info);
#endif

#ifdef V3_CONFIG_PWRSTAT_TELEMETRY
        v3_pwrstat_telemetry_start(info);
#endif

        if (v3_vmx_enter(info) == -1) {

            addr_t host_addr;
            addr_t linear_addr = 0;

            info->vm_info->run_state = VM_ERROR;

            V3_Print(info->vm_info, info, "VMX core %u: VMX ERROR!!\n", info->vcpu_id);

            v3_print_guest_state(info);

            V3_Print(info->vm_info, info, "VMX core %u\n", info->vcpu_id);

            linear_addr = get_addr_linear(info, info->rip, &(info->segments.cs));

            if (info->mem_mode == PHYSICAL_MEM) {
                v3_gpa_to_hva(info, linear_addr, &host_addr);
            } else if (info->mem_mode == VIRTUAL_MEM) {
                v3_gva_to_hva(info, linear_addr, &host_addr);
            }

            V3_Print(info->vm_info, info, "VMX core %u: Host Address of rip = 0x%p\n", info->vcpu_id, (void *)host_addr);

            V3_Print(info->vm_info, info, "VMX core %u: Instr (15 bytes) at %p:\n", info->vcpu_id, (void *)host_addr);
            v3_dump_mem((uint8_t *)host_addr, 15);

            v3_print_stack(info);

            v3_print_vmcs();
            print_exit_log(info);
            return -1;
        }

        v3_wait_at_barrier(info);

        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }
/*
        if ((info->num_exits % 5000) == 0) {
            V3_Print(info->vm_info, info, "VMX Exit number %d\n", (uint32_t)info->num_exits);
        }
*/
    }

#ifdef V3_CONFIG_PMU_TELEMETRY
    v3_pmu_telemetry_end(info);
#endif

#ifdef V3_CONFIG_PWRSTAT_TELEMETRY
    v3_pwrstat_telemetry_end(info);
#endif

    return 0;
}




#define VMX_FEATURE_CONTROL_MSR     0x0000003a
#define CPUID_VMX_FEATURES 0x00000005  /* LOCK and VMXON */
#define CPUID_1_ECX_VTXFLAG 0x00000020

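/* IA32_FEATURE_CONTROL (MSR 0x3a): bit 0 is the lock bit and bit 2 enables
 * VMXON outside SMX; the 0x5 check below requires both, otherwise the BIOS
 * has locked VMX off. */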
int v3_is_vmx_capable() {
    v3_msr_t feature_msr;
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

    v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);

    PrintDebug(VM_NONE, VCORE_NONE, "ECX: 0x%x\n", ecx);

    if (ecx & CPUID_1_ECX_VTXFLAG) {
        v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));

        PrintDebug(VM_NONE, VCORE_NONE, "MSRREGlow: 0x%.8x\n", feature_msr.lo);

        if ((feature_msr.lo & CPUID_VMX_FEATURES) != CPUID_VMX_FEATURES) {
            PrintDebug(VM_NONE, VCORE_NONE, "VMX is locked -- enable in the BIOS\n");
            return 0;
        }

    } else {
        PrintDebug(VM_NONE, VCORE_NONE, "VMX not supported on this cpu\n");
        return 0;
    }

    return 1;
}


int v3_reset_vmx_vm_core(struct guest_info * core, addr_t rip) {
    // init vmcs bios

    if ((core->shdw_pg_mode == NESTED_PAGING) && 
        (v3_mach_type == V3_VMX_EPT_UG_CPU)) {
        // easy 
        core->rip = 0;
        core->segments.cs.selector = rip << 8;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = rip << 12;
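        /* This mirrors SIPI semantics: for a start vector V, CS.selector
         * becomes V << 8 and CS.base becomes V << 12, with RIP left at 0. */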
    } else {
        core->vm_regs.rdx = core->vcpu_id;
        core->vm_regs.rbx = rip;
    }

    return 0;
}



void v3_init_vmx_cpu(int cpu_id) {
    addr_t vmx_on_region = 0;
    extern v3_cpu_arch_t v3_mach_type;
    extern v3_cpu_arch_t v3_cpu_types[];

    if (v3_mach_type == V3_INVALID_CPU) {
        if (v3_init_vmx_hw(&hw_info) == -1) {
            PrintError(VM_NONE, VCORE_NONE, "Could not initialize VMX hardware features on cpu %d\n", cpu_id);
            return;
        }
    }

    enable_vmx();

    // Setup VMXON Region
    vmx_on_region = allocate_vmcs();

    if (vmx_on(vmx_on_region) == VMX_SUCCESS) {
        V3_Print(VM_NONE, VCORE_NONE, "VMX Enabled\n");
        host_vmcs_ptrs[cpu_id] = vmx_on_region;
    } else {
        V3_Print(VM_NONE, VCORE_NONE, "VMX already enabled\n");
        V3_FreePages((void *)vmx_on_region, 1);
    }

    PrintDebug(VM_NONE, VCORE_NONE, "VMXON pointer: 0x%p\n", (void *)host_vmcs_ptrs[cpu_id]);

    {
        struct vmx_sec_proc_ctrls sec_proc_ctrls;
        sec_proc_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.sec_proc_ctrls));

        if (sec_proc_ctrls.enable_ept == 0) {
            V3_Print(VM_NONE, VCORE_NONE, "VMX EPT (Nested) Paging not supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_CPU;
        } else if (sec_proc_ctrls.unrstrct_guest == 0) {
            V3_Print(VM_NONE, VCORE_NONE, "VMX EPT (Nested) Paging supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_CPU;
        } else {
            V3_Print(VM_NONE, VCORE_NONE, "VMX EPT (Nested) Paging + Unrestricted guest supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_UG_CPU;
        }
    }
}


void v3_deinit_vmx_cpu(int cpu_id) {
    extern v3_cpu_arch_t v3_cpu_types[];
    v3_cpu_types[cpu_id] = V3_INVALID_CPU;

    if (host_vmcs_ptrs[cpu_id] != 0) {
        V3_Print(VM_NONE, VCORE_NONE, "Disabling VMX\n");

        if (vmx_off() != VMX_SUCCESS) {
            PrintError(VM_NONE, VCORE_NONE, "Error executing VMXOFF\n");
        }

        V3_FreePages((void *)host_vmcs_ptrs[cpu_id], 1);

        host_vmcs_ptrs[cpu_id] = 0;
    }
}