Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git

This will give you the master branch. You probably want the devel branch or
one of the release branches. To switch to the devel branch, execute

  cd palacios
  git checkout --track -b devel origin/devel

The other branches work the same way; an example follows.
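For example, to work on a release branch (the branch name below is
illustrative; run the first command to see which remote branches actually
exist):

  git branch -r
  git checkout --track -b release-1.2 origin/release-1.2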


palacios.git: palacios/src/palacios/vmx.c (commit 419b7066152a1478a991059b7b256c7529f1e196)
/*
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National
 * Science Foundation and the Department of Energy.
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at
 * http://www.v3vee.org
 *
 * Copyright (c) 2011, Jack Lange <jarusl@cs.northwestern.edu>
 * Copyright (c) 2011, The V3VEE Project <http://www.v3vee.org>
 * All rights reserved.
 *
 * Author: Jack Lange <jarusl@cs.northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */


#include <palacios/vmx.h>
#include <palacios/vmm.h>
#include <palacios/vmx_handler.h>
#include <palacios/vmcs.h>
#include <palacios/vmx_lowlevel.h>
#include <palacios/vmm_lowlevel.h>
#include <palacios/vmm_ctrl_regs.h>
#include <palacios/vmm_config.h>
#include <palacios/vmm_time.h>
#include <palacios/vm_guest_mem.h>
#include <palacios/vmm_direct_paging.h>
#include <palacios/vmx_io.h>
#include <palacios/vmx_msr.h>
#include <palacios/vmm_decoder.h>

#include <palacios/vmx_ept.h>
#include <palacios/vmx_assist.h>
#include <palacios/vmx_hw_info.h>

#ifndef V3_CONFIG_DEBUG_VMX
#undef PrintDebug
#define PrintDebug(fmt, args...)
#endif


/* These fields contain the hardware feature sets supported by the local CPU */
static struct vmx_hw_info hw_info;

extern v3_cpu_arch_t v3_cpu_types[];

static addr_t host_vmcs_ptrs[V3_CONFIG_MAX_CPUS] = { [0 ... V3_CONFIG_MAX_CPUS - 1] = 0};

extern int v3_vmx_launch(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
extern int v3_vmx_resume(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);

static inline int check_vmcs_write(vmcs_field_t field, addr_t val) {
    int ret = 0;

    ret = vmcs_write(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMWRITE error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
        return 1;
    }

    return 0;
}

static inline int check_vmcs_read(vmcs_field_t field, void * val) {
    int ret = 0;

    ret = vmcs_read(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMREAD error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
    }

    return ret;
}



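/* Allocate and zero a 4KB VMCS region. Per the Intel SDM, software must
 * write the processor's VMCS revision identifier (reported in
 * IA32_VMX_BASIC) into the first dword of the region before it can be used
 * with VMPTRLD or VMXON. Returns the *physical* address of the region,
 * since that is what the VMX instructions operate on. */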
static addr_t allocate_vmcs() {
    struct vmcs_data * vmcs_page = NULL;

    PrintDebug("Allocating page\n");

    vmcs_page = (struct vmcs_data *)V3_VAddr(V3_AllocPages(1));
    memset(vmcs_page, 0, 4096);

    vmcs_page->revision = hw_info.basic_info.revision;
    PrintDebug("VMX Revision: 0x%x\n", vmcs_page->revision);

    return (addr_t)V3_PAddr((void *)vmcs_page);
}



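/* Bring a freshly cleared VMCS to the state a BIOS expects at power-on:
 * load the VMCS, derive default pin/proc/exit/entry controls from the
 * hardware capability MSRs, capture the host's GDTR/IDTR/TR, configure the
 * IO and MSR bitmaps, and set up shadow paging or EPT depending on what
 * the CPU supports. Runs with global interrupts disabled throughout. */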
static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state) {
    int vmx_ret = 0;

    // disable global interrupts for vm state initialization
    v3_disable_ints();

    PrintDebug("Loading VMCS\n");
    vmx_ret = vmcs_load(vmx_state->vmcs_ptr_phys);
    vmx_state->state = VMX_UNLAUNCHED;

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMPTRLD failed\n");
        v3_enable_ints();
        return -1;
    }


    /*** Setup default state from HW ***/

    vmx_state->pin_ctrls.value = hw_info.pin_ctrls.def_val;
    vmx_state->pri_proc_ctrls.value = hw_info.proc_ctrls.def_val;
    vmx_state->exit_ctrls.value = hw_info.exit_ctrls.def_val;
    vmx_state->entry_ctrls.value = hw_info.entry_ctrls.def_val;
    vmx_state->sec_proc_ctrls.value = hw_info.sec_proc_ctrls.def_val;

    /* Print Control MSRs */
    PrintDebug("CR0 MSR: %p\n", (void *)(addr_t)hw_info.cr0.value);
    PrintDebug("CR4 MSR: %p\n", (void *)(addr_t)hw_info.cr4.value);



    /******* Setup Host State **********/

    /* Cache GDTR, IDTR, and TR in host struct */
    addr_t gdtr_base;
    struct {
        uint16_t selector;
        addr_t   base;
    } __attribute__((packed)) tmp_seg;


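    /* SGDT/SIDT store a pseudo-descriptor (16-bit limit followed by the
     * base address) at the memory operand, while STR stores only the
     * 16-bit task register selector. The packed tmp_seg struct above
     * overlays both formats: after SGDT/SIDT, 'selector' holds the limit
     * and 'base' holds the base; after STR, only 'selector' is written. */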
    __asm__ __volatile__(
                         "sgdt (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    gdtr_base = tmp_seg.base;
    vmx_state->host_state.gdtr.base = gdtr_base;

    __asm__ __volatile__(
                         "sidt (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    vmx_state->host_state.idtr.base = tmp_seg.base;

    __asm__ __volatile__(
                         "str (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    vmx_state->host_state.tr.selector = tmp_seg.selector;

    /* The selector's GDT *index* is bits 3-15; each GDT entry is 8 bytes. */
    struct tss_descriptor * desc = NULL;
    desc = (struct tss_descriptor *)(gdtr_base + (8 * (tmp_seg.selector >> 3)));

    tmp_seg.base = ((desc->base1) |
                    (desc->base2 << 16) |
                    (desc->base3 << 24) |
#ifdef __V3_64BIT__
                    ((uint64_t)desc->base4 << 32)
#else
                    (0)
#endif
                    );

    vmx_state->host_state.tr.base = tmp_seg.base;


    /********** Setup VMX Control Fields ***********/

    /* Exit on external interrupts and NMIs */
    vmx_state->pin_ctrls.nmi_exit = 1;
    vmx_state->pin_ctrls.ext_int_exit = 1;


    vmx_state->pri_proc_ctrls.hlt_exit = 1;


    vmx_state->pri_proc_ctrls.pause_exit = 0;
    vmx_state->pri_proc_ctrls.tsc_offset = 1;
#ifdef V3_CONFIG_TIME_VIRTUALIZE_TSC
    vmx_state->pri_proc_ctrls.rdtsc_exit = 1;
#endif

    /* Setup IO map */
    vmx_state->pri_proc_ctrls.use_io_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_A_ADDR, (addr_t)V3_PAddr(core->vm_info->io_map.arch_data));
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_B_ADDR,
            (addr_t)V3_PAddr(core->vm_info->io_map.arch_data) + PAGE_SIZE_4KB);


    vmx_state->pri_proc_ctrls.use_msr_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_MSR_BITMAP, (addr_t)V3_PAddr(core->vm_info->msr_map.arch_data));



#ifdef __V3_64BIT__
    // Ensure host runs in 64-bit mode at each VM EXIT
    vmx_state->exit_ctrls.host_64_on = 1;
#endif

    // Hook all accesses to EFER register
    v3_hook_msr(core->vm_info, EFER_MSR,
                &v3_handle_efer_read,
                &v3_handle_efer_write,
                core);

    // Restore host's EFER register on each VM EXIT
    vmx_state->exit_ctrls.ld_efer = 1;

    // Save/restore guest's EFER register to/from VMCS on VM EXIT/ENTRY
    vmx_state->exit_ctrls.save_efer = 1;
    vmx_state->entry_ctrls.ld_efer  = 1;

    // Cause VM_EXIT whenever CR4.VMXE or CR4.PAE bits are written
    vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);


    /* Setup paging */
    if (core->shdw_pg_mode == SHADOW_PAGING) {
        PrintDebug("Creating initial shadow page table\n");

        if (v3_init_passthrough_pts(core) == -1) {
            PrintError("Could not initialize passthrough page tables\n");
            v3_enable_ints();
            return -1;
        }

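        /* Bits set in the CR0 guest/host mask are "owned" by the VMM:
         * guest writes to them trigger a CR-access exit, and guest reads
         * return the CR0 read shadow rather than the real CR0. PE/PG/WP
         * are trapped here so shadow paging and the memory hooks stay
         * consistent with the guest's view of paging. */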
#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        core->ctrl_regs.cr3 = core->direct_map_pt;

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Add CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 1;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 1;

        vmx_state->pri_proc_ctrls.invlpg_exit = 1;

        /* Add page fault exits */
        vmx_state->excp_bmap.pf = 1;

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

    } else if ((core->shdw_pg_mode == NESTED_PAGING) &&
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_CPU)) {

        // CR0_PE/CR0_PG/CR0_WP are defined in the shadow-paging case above
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Disable CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;

        /* Add page fault exits */
        //      vmx_state->excp_bmap.pf = 1; // This should never happen..., enabled to catch bugs

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging



        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            v3_enable_ints();
            return -1;
        }

    } else if ((core->shdw_pg_mode == NESTED_PAGING) &&
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        int i = 0;
        // For now we will assume that unrestricted guest mode is assured w/ EPT


        core->vm_regs.rsp = 0x00;
        core->rip = 0xfff0;
        core->vm_regs.rdx = 0x00000f00;
        core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
        core->ctrl_regs.cr0 = 0x60010010; // Set the WP flag so the memory hooks work in real-mode


        core->segments.cs.selector = 0xf000;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = 0x0000000f0000LL;

        // (raw attributes = 0xf3)
        core->segments.cs.type = 0xb;
        core->segments.cs.system = 0x1;
        core->segments.cs.dpl = 0x0;
        core->segments.cs.present = 1;



        struct v3_segment * segregs [] = {&(core->segments.ss), &(core->segments.ds),
                                          &(core->segments.es), &(core->segments.fs),
                                          &(core->segments.gs), NULL};

        for ( i = 0; segregs[i] != NULL; i++) {
            struct v3_segment * seg = segregs[i];

            seg->selector = 0x0000;
            //    seg->base = seg->selector << 4;
            seg->base = 0x00000000;
            seg->limit = 0xffff;


            seg->type = 0x3;
            seg->system = 0x1;
            seg->dpl = 0x0;
            seg->present = 1;
            //    seg->granularity = 1;

        }


        core->segments.gdtr.limit = 0x0000ffff;
        core->segments.gdtr.base = 0x0000000000000000LL;

        core->segments.idtr.limit = 0x0000ffff;
        core->segments.idtr.base = 0x0000000000000000LL;

        core->segments.ldtr.selector = 0x0000;
        core->segments.ldtr.limit = 0x0000ffff;
        core->segments.ldtr.base = 0x0000000000000000LL;
        core->segments.ldtr.type = 2;
        core->segments.ldtr.present = 1;

        core->segments.tr.selector = 0x0000;
        core->segments.tr.limit = 0x0000ffff;
        core->segments.tr.base = 0x0000000000000000LL;
        core->segments.tr.type = 0xb;
        core->segments.tr.present = 1;

        //      core->dbg_regs.dr6 = 0x00000000ffff0ff0LL;
        core->dbg_regs.dr7 = 0x0000000000000400LL;

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
        vmx_state->sec_proc_ctrls.unrstrct_guest = 1; // enable unrestricted guest operation


        /* Disable shadow paging stuff */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;


        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            v3_enable_ints();
            return -1;
        }

    } else {
        PrintError("Invalid virtual paging mode\n");
        v3_enable_ints();
        return -1;
    }

    // hook vmx msrs

    // Setup SYSCALL/SYSENTER MSRs in load/store area

    // save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
    {
#define IA32_STAR 0xc0000081
#define IA32_LSTAR 0xc0000082
#define IA32_FMASK 0xc0000084
#define IA32_KERN_GS_BASE 0xc0000102

#define IA32_CSTAR 0xc0000083 // Compatibility mode STAR (ignored for now... hopefully it's not that important...)

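        /* VMX defines three MSR autoload/autostore areas, each an in-memory
         * array of (index, value) entries:
         *   - exit store:  guest MSR values are saved here on VM exit
         *   - exit load:   host MSR values are restored from here on VM exit
         *   - entry load:  guest MSR values are loaded from here on VM entry
         * All three areas below share one page and track the same four
         * syscall-related MSRs. */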
        int msr_ret = 0;

        struct vmcs_msr_entry * exit_store_msrs = NULL;
        struct vmcs_msr_entry * exit_load_msrs = NULL;
        struct vmcs_msr_entry * entry_load_msrs = NULL;
        int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;

        V3_Print("Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);

        if (max_msrs < 4) {
            PrintError("Max MSR cache size is too small (%d)\n", max_msrs);
            v3_enable_ints();
            return -1;
        }

        vmx_state->msr_area = V3_VAddr(V3_AllocPages(1));

        if (vmx_state->msr_area == NULL) {
            PrintError("could not allocate msr load/store area\n");
            v3_enable_ints();
            return -1;
        }

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);


        exit_store_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area);
        exit_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 4));
        entry_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 8));


        exit_store_msrs[0].index = IA32_STAR;
        exit_store_msrs[1].index = IA32_LSTAR;
        exit_store_msrs[2].index = IA32_FMASK;
        exit_store_msrs[3].index = IA32_KERN_GS_BASE;

        // Copy the configured MSR indices into the exit-load and entry-load areas
        memcpy(exit_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);
        memcpy(entry_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);


        v3_get_msr(IA32_STAR, &(exit_load_msrs[0].hi), &(exit_load_msrs[0].lo));
        v3_get_msr(IA32_LSTAR, &(exit_load_msrs[1].hi), &(exit_load_msrs[1].lo));
        v3_get_msr(IA32_FMASK, &(exit_load_msrs[2].hi), &(exit_load_msrs[2].lo));
        v3_get_msr(IA32_KERN_GS_BASE, &(exit_load_msrs[3].hi), &(exit_load_msrs[3].lo));

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(exit_store_msrs));
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(exit_load_msrs));
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(entry_load_msrs));

    }

    /* Sanity check ctrl/reg fields against hw_defaults */




    /*** Write all the info to the VMCS ***/

    /*
    {
        // IS THIS NECESSARY???
#define DEBUGCTL_MSR 0x1d9
        struct v3_msr tmp_msr;
        v3_get_msr(DEBUGCTL_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
        vmx_ret |= check_vmcs_write(VMCS_GUEST_DBG_CTL, tmp_msr.value);
        core->dbg_regs.dr7 = 0x400;
    }
    */

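    /* The SDM requires the VMCS link pointer to be set to all 1s
     * (0xffffffff_ffffffff) unless VMCS shadowing is in use; otherwise
     * VM entry fails its consistency checks. */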
#ifdef __V3_64BIT__
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffffffffffULL);
#else
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffUL);
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR_HIGH, (addr_t)0xffffffffUL);
#endif




    if (v3_update_vmcs_ctrl_fields(core)) {
        PrintError("Could not write control fields!\n");
        v3_enable_ints();
        return -1;
    }

    if (v3_update_vmcs_host_state(core)) {
        PrintError("Could not write host state\n");
        v3_enable_ints();
        return -1;
    }

    // Reenable global interrupts now that the vm state is initialized.
    // If another VM kicks us off this core, it will update our vmx state
    // so that we know to reload ourselves
    v3_enable_ints();

    return 0;
}

int v3_init_vmx_vmcs(struct guest_info * core, v3_vm_class_t vm_class) {
    struct vmx_data * vmx_state = NULL;
    int vmx_ret = 0;

    vmx_state = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));

    if (vmx_state == NULL) {
        PrintError("Could not allocate vmx_data\n");
        return -1;
    }

    memset(vmx_state, 0, sizeof(struct vmx_data));

    PrintDebug("vmx_data pointer: %p\n", (void *)vmx_state);

    PrintDebug("Allocating VMCS\n");
    vmx_state->vmcs_ptr_phys = allocate_vmcs();

    PrintDebug("VMCS pointer: %p\n", (void *)(vmx_state->vmcs_ptr_phys));

    core->vmm_data = vmx_state;
    vmx_state->state = VMX_UNLAUNCHED;

    PrintDebug("Initializing VMCS (addr=%p)\n", core->vmm_data);

    // TODO: Fix vmcs fields so they're 32-bit

    PrintDebug("Clearing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMCLEAR failed\n");
        return -1;
    }

    if (vm_class == V3_PC_VM) {
        PrintDebug("Initializing VMCS\n");
        if (init_vmcs_bios(core, vmx_state) == -1) {
            PrintError("Error initializing VMCS to BIOS state\n");
            return -1;
        }
    } else {
        PrintError("Invalid VM Class\n");
        return -1;
    }

    PrintDebug("Serializing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMCLEAR failed\n");
        return -1;
    }

    return 0;
}


int v3_deinit_vmx_vmcs(struct guest_info * core) {
    struct vmx_data * vmx_state = core->vmm_data;

    V3_FreePages((void *)(vmx_state->vmcs_ptr_phys), 1);
    V3_FreePages(V3_PAddr(vmx_state->msr_area), 1);

    V3_Free(vmx_state);

    return 0;
}

static int update_irq_exit_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));

    if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 0)) {
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Calling v3_injecting_intr\n");
#endif
        info->intr_core_state.irq_started = 0;
        v3_injecting_intr(info, info->intr_core_state.irq_vector, V3_EXTERNAL_IRQ);
    }

    return 0;
}

static int update_irq_entry_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;
    struct vmcs_interrupt_state intr_core_state;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
    check_vmcs_read(VMCS_GUEST_INT_STATE, &(intr_core_state));

    /* Check for pending exceptions to inject */
    if (v3_excp_pending(info)) {
        struct vmx_entry_int_info int_info;
        int_info.value = 0;

        // In VMX, almost every exception is hardware
        // Software exceptions are pretty much only for breakpoint or overflow
        int_info.type = 3;
        int_info.vector = v3_get_excp_number(info);

        if (info->excp_state.excp_error_code_valid) {
            check_vmcs_write(VMCS_ENTRY_EXCP_ERR, info->excp_state.excp_error_code);
            int_info.error_code = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("Injecting exception %d with error code %x\n",
                    int_info.vector, info->excp_state.excp_error_code);
#endif
        }

        int_info.valid = 1;
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Injecting exception %d (EIP=%p)\n", int_info.vector, (void *)(addr_t)info->rip);
#endif
        check_vmcs_write(VMCS_ENTRY_INT_INFO, int_info.value);

        v3_injecting_excp(info, int_info.vector);

    } else if ((((struct rflags *)&(info->ctrl_regs.rflags))->intr == 1) &&
               (intr_core_state.val == 0)) {

        if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 1)) {

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("IRQ pending from previous injection\n");
#endif

            // Copy the IDT vectoring info over to reinject the old interrupt
            if (idt_vec_info.error_code == 1) {
                uint32_t err_code = 0;

                check_vmcs_read(VMCS_IDT_VECTOR_ERR, &err_code);
                check_vmcs_write(VMCS_ENTRY_EXCP_ERR, err_code);
            }

            idt_vec_info.undef = 0;
            check_vmcs_write(VMCS_ENTRY_INT_INFO, idt_vec_info.value);

        } else {
            struct vmx_entry_int_info ent_int;
            ent_int.value = 0;

            switch (v3_intr_pending(info)) {
                case V3_EXTERNAL_IRQ: {
                    info->intr_core_state.irq_vector = v3_get_intr(info);
                    ent_int.vector = info->intr_core_state.irq_vector;
                    ent_int.type = 0;
                    ent_int.error_code = 0;
                    ent_int.valid = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
                    V3_Print("Injecting Interrupt %d at exit %u (EIP=%p)\n",
                               info->intr_core_state.irq_vector,
                               (uint32_t)info->num_exits,
                               (void *)(addr_t)info->rip);
#endif

                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);
                    info->intr_core_state.irq_started = 1;

                    break;
                }
                case V3_NMI:
                    PrintDebug("Injecting NMI\n");

                    ent_int.type = 2;
                    ent_int.vector = 2;
                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_SOFTWARE_INTR:
                    PrintDebug("Injecting software interrupt\n");
                    ent_int.type = 4;

                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_VIRTUAL_IRQ:
                    // Not sure what to do here; Intel doesn't have virtual IRQs
                    // May be the same as external interrupts/IRQs

                    break;
                case V3_INVALID_INTR:
                default:
                    break;
            }
        }
    } else if ((v3_intr_pending(info)) && (vmx_info->pri_proc_ctrls.int_wndw_exit == 0)) {
        // Enable INTR window exiting so we know when IF=1
        uint32_t instr_len;

        check_vmcs_read(VMCS_EXIT_INSTR_LEN, &instr_len);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Enabling Interrupt-Window exiting: %d\n", instr_len);
#endif

        vmx_info->pri_proc_ctrls.int_wndw_exit = 1;
        check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
    }


    return 0;
}



static struct vmx_exit_info exit_log[10];

static void print_exit_log(struct guest_info * info) {
    int cnt = info->num_exits % 10;
    int i = 0;


    V3_Print("\nExit Log (%d total exits):\n", (uint32_t)info->num_exits);

    for (i = 0; i < 10; i++) {
        struct vmx_exit_info * tmp = &exit_log[cnt];

        V3_Print("%d:\texit_reason = %p\n", i, (void *)(addr_t)tmp->exit_reason);
        V3_Print("\texit_qual = %p\n", (void *)tmp->exit_qual);
        V3_Print("\tint_info = %p\n", (void *)(addr_t)tmp->int_info);
        V3_Print("\tint_err = %p\n", (void *)(addr_t)tmp->int_err);
        V3_Print("\tinstr_info = %p\n", (void *)(addr_t)tmp->instr_info);

        cnt--;

        if (cnt == -1) {
            cnt = 9;
        }

    }

}

/*
 * CAUTION and DANGER!!!
 *
 * The VMCS CANNOT(!!) be accessed outside of the cli/sti calls inside this function.
 * When executing a symbiotic call, the VMCS WILL be overwritten, so any dependencies
 * on its contents will cause things to break. The contents at the time of the exit WILL
 * change before the exit handler is executed.
 */
int v3_vmx_enter(struct guest_info * info) {
    int ret = 0;
    uint32_t tsc_offset_low, tsc_offset_high;
    struct vmx_exit_info exit_info;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    // Perform any additional yielding needed for time adjustment
    v3_adjust_time(info);

    // disable global interrupts for vm state transition
    v3_disable_ints();

    // Update timer devices late after being in the VM so that as much
    // of the time in the VM is accounted for as possible. Also do it before
    // updating IRQ entry state so that any interrupts the timers raise get
    // handled on the next VM entry. Must be done with interrupts disabled.
    v3_update_timers(info);

    if (vmcs_store() != vmx_info->vmcs_ptr_phys) {
        vmcs_clear(vmx_info->vmcs_ptr_phys);
        vmcs_load(vmx_info->vmcs_ptr_phys);
        vmx_info->state = VMX_UNLAUNCHED;
    }

    v3_vmx_restore_vmcs(info);


#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_entry_state(info);
    }
#else
    update_irq_entry_state(info);
#endif

    {
        addr_t guest_cr3;
        vmcs_read(VMCS_GUEST_CR3, &guest_cr3);
        vmcs_write(VMCS_GUEST_CR3, guest_cr3);
    }

    // Perform last-minute time bookkeeping prior to entering the VM
    v3_time_enter_vm(info);

    tsc_offset_high = (uint32_t)((v3_tsc_host_offset(&info->time_state) >> 32) & 0xffffffff);
    tsc_offset_low = (uint32_t)(v3_tsc_host_offset(&info->time_state) & 0xffffffff);
    check_vmcs_write(VMCS_TSC_OFFSET_HIGH, tsc_offset_high);
    check_vmcs_write(VMCS_TSC_OFFSET, tsc_offset_low);


    if (v3_update_vmcs_host_state(info)) {
        v3_enable_ints();
        PrintError("Could not write host state\n");
        return -1;
    }

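    // A VMCS that has been VMCLEARed must first be entered with VMLAUNCH;
    // VMRESUME is only valid (and cheaper) once the VMCS is in the
    // "launched" state, which is what vmx_info->state tracks here.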
    if (vmx_info->state == VMX_UNLAUNCHED) {
        vmx_info->state = VMX_LAUNCHED;

        info->vm_info->run_state = VM_RUNNING;
        ret = v3_vmx_launch(&(info->vm_regs), info, &(info->ctrl_regs));
    } else {
        V3_ASSERT(vmx_info->state != VMX_UNLAUNCHED);
        ret = v3_vmx_resume(&(info->vm_regs), info, &(info->ctrl_regs));
    }

    //  PrintDebug("VMX Exit: ret=%d\n", ret);

    if (ret != VMX_SUCCESS) {
        uint32_t error = 0;

        vmcs_read(VMCS_INSTR_ERR, &error);

        v3_enable_ints();

        PrintError("VMENTRY Error: %d\n", error);
        return -1;
    }

    // Immediate exit from VM time bookkeeping
    v3_time_exit_vm(info);

    info->num_exits++;

    /* Update guest state */
    v3_vmx_save_vmcs(info);

    // info->cpl = info->segments.cs.selector & 0x3;

    info->mem_mode = v3_get_vm_mem_mode(info);
    info->cpu_mode = v3_get_vm_cpu_mode(info);


    check_vmcs_read(VMCS_EXIT_INSTR_LEN, &(exit_info.instr_len));
    check_vmcs_read(VMCS_EXIT_INSTR_INFO, &(exit_info.instr_info));
    check_vmcs_read(VMCS_EXIT_REASON, &(exit_info.exit_reason));
    check_vmcs_read(VMCS_EXIT_QUAL, &(exit_info.exit_qual));
    check_vmcs_read(VMCS_EXIT_INT_INFO, &(exit_info.int_info));
    check_vmcs_read(VMCS_EXIT_INT_ERR, &(exit_info.int_err));
    check_vmcs_read(VMCS_GUEST_LINEAR_ADDR, &(exit_info.guest_linear_addr));

    if (info->shdw_pg_mode == NESTED_PAGING) {
        check_vmcs_read(VMCS_GUEST_PHYS_ADDR, &(exit_info.ept_fault_addr));
    }

    //PrintDebug("VMX Exit taken, id-qual: %u-%lu\n", exit_info.exit_reason, exit_info.exit_qual);

    exit_log[info->num_exits % 10] = exit_info;

#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_exit_state(info);
    }
#else
    update_irq_exit_state(info);
#endif

    if (exit_info.exit_reason == VMEXIT_INTR_WINDOW) {
        // This is a special case whose only job is to inject an interrupt
        vmcs_read(VMCS_PROC_CTRLS, &(vmx_info->pri_proc_ctrls.value));
        vmx_info->pri_proc_ctrls.int_wndw_exit = 0;
        vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Interrupts available again! (RIP=%llx)\n", info->rip);
#endif
    }

    // reenable global interrupts after vm exit
    v3_enable_ints();

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    if (v3_handle_vmx_exit(info, &exit_info) == -1) {
        PrintError("Error in VMX exit handler (Exit reason=%x)\n", exit_info.exit_reason);
        return -1;
    }

    return 0;
}


int v3_start_vmx_guest(struct guest_info * info) {

    PrintDebug("Starting VMX core %u\n", info->vcpu_id);

    if (info->vcpu_id == 0) {
        info->core_run_state = CORE_RUNNING;
        info->vm_info->run_state = VM_RUNNING;
    } else {

        PrintDebug("VMX core %u: Waiting for core initialization\n", info->vcpu_id);

        while (info->core_run_state == CORE_STOPPED) {
            v3_yield(info);
            //PrintDebug("VMX core %u: still waiting for INIT\n", info->vcpu_id);
        }

        PrintDebug("VMX core %u initialized\n", info->vcpu_id);
    }


    PrintDebug("VMX core %u: I am starting at CS=0x%x (base=0x%p, limit=0x%x), RIP=0x%p\n",
               info->vcpu_id, info->segments.cs.selector, (void *)(info->segments.cs.base),
               info->segments.cs.limit, (void *)(info->rip));


    PrintDebug("VMX core %u: Launching VMX VM on logical core %u\n", info->vcpu_id, info->pcpu_id);

    v3_start_time(info);

    while (1) {

        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }

        if (v3_vmx_enter(info) == -1) {

            addr_t host_addr = 0;
            addr_t linear_addr = 0;

            info->vm_info->run_state = VM_ERROR;

            V3_Print("VMX core %u: VMX ERROR!!\n", info->vcpu_id);

            v3_print_guest_state(info);

            V3_Print("VMX core %u\n", info->vcpu_id);

            linear_addr = get_addr_linear(info, info->rip, &(info->segments.cs));

            if (info->mem_mode == PHYSICAL_MEM) {
                v3_gpa_to_hva(info, linear_addr, &host_addr);
            } else if (info->mem_mode == VIRTUAL_MEM) {
                v3_gva_to_hva(info, linear_addr, &host_addr);
            }

            V3_Print("VMX core %u: Host Address of rip = 0x%p\n", info->vcpu_id, (void *)host_addr);

            V3_Print("VMX core %u: Instr (15 bytes) at %p:\n", info->vcpu_id, (void *)host_addr);
            v3_dump_mem((uint8_t *)host_addr, 15);

            v3_print_stack(info);


            v3_print_vmcs();
            print_exit_log(info);
            return -1;
        }



        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }
/*
        if ((info->num_exits % 5000) == 0) {
            V3_Print("VMX Exit number %d\n", (uint32_t)info->num_exits);
        }
*/

    }

    return 0;
}



#define VMX_FEATURE_CONTROL_MSR     0x0000003a
#define CPUID_VMX_FEATURES 0x00000005  /* LOCK and VMXON */
#define CPUID_1_ECX_VTXFLAG 0x00000020

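/* Check CPUID.1:ECX.VMX[bit 5] for VMX support, then verify that the
 * IA32_FEATURE_CONTROL MSR (0x3a) permits VMXON: bit 0 is the lock bit and
 * bit 2 enables VMXON outside SMX (together, CPUID_VMX_FEATURES = 0x5).
 * If the BIOS locked the MSR with VMXON disabled, VMX cannot be enabled
 * without a reset. */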
int v3_is_vmx_capable() {
    v3_msr_t feature_msr;
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

    v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);

    PrintDebug("ECX: 0x%x\n", ecx);

    if (ecx & CPUID_1_ECX_VTXFLAG) {
        v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));

        PrintDebug("MSRREGlow: 0x%.8x\n", feature_msr.lo);

        if ((feature_msr.lo & CPUID_VMX_FEATURES) != CPUID_VMX_FEATURES) {
            PrintDebug("VMX is locked -- enable in the BIOS\n");
            return 0;
        }

    } else {
        PrintDebug("VMX not supported on this cpu\n");
        return 0;
    }

    return 1;
}


int v3_reset_vmx_vm_core(struct guest_info * core, addr_t rip) {
    // init vmcs bios

    if ((core->shdw_pg_mode == NESTED_PAGING) &&
        (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        // Unrestricted guest mode: restart the core in real mode with CS
        // derived from the startup vector (selector = vector << 8,
        // base = vector << 12), matching SIPI semantics
        core->rip = 0;
        core->segments.cs.selector = rip << 8;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = rip << 12;
    } else {
        core->vm_regs.rdx = core->vcpu_id;
        core->vm_regs.rbx = rip;
    }

    return 0;
}


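/* Per-CPU VMX bring-up: read the hardware capability MSRs once (on cpu 0),
 * enable VMX operation, and execute VMXON with a dedicated per-CPU region.
 * The VMXON region begins with the same revision-identifier dword as a
 * VMCS, which is why allocate_vmcs() is reused for it. Finally, record
 * which paging features (EPT, unrestricted guest) this CPU offers. */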
void v3_init_vmx_cpu(int cpu_id) {
    addr_t vmx_on_region = 0;

    if (cpu_id == 0) {
        if (v3_init_vmx_hw(&hw_info) == -1) {
            PrintError("Could not initialize VMX hardware features on cpu %d\n", cpu_id);
            return;
        }
    }

    enable_vmx();


    // Setup VMXON Region
    vmx_on_region = allocate_vmcs();


    if (vmx_on(vmx_on_region) == VMX_SUCCESS) {
        V3_Print("VMX Enabled\n");
        host_vmcs_ptrs[cpu_id] = vmx_on_region;
    } else {
        V3_Print("VMX already enabled\n");
        V3_FreePages((void *)vmx_on_region, 1);
    }

    PrintDebug("VMXON pointer: 0x%p\n", (void *)host_vmcs_ptrs[cpu_id]);

    {
        struct vmx_sec_proc_ctrls sec_proc_ctrls;
        sec_proc_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.sec_proc_ctrls));

        if (sec_proc_ctrls.enable_ept == 0) {
            V3_Print("VMX EPT (Nested) Paging not supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_CPU;
        } else if (sec_proc_ctrls.unrstrct_guest == 0) {
            V3_Print("VMX EPT (Nested) Paging supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_CPU;
        } else {
            V3_Print("VMX EPT (Nested) Paging + Unrestricted guest supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_UG_CPU;
        }
    }
}


void v3_deinit_vmx_cpu(int cpu_id) {
    extern v3_cpu_arch_t v3_cpu_types[];
    v3_cpu_types[cpu_id] = V3_INVALID_CPU;

    if (host_vmcs_ptrs[cpu_id] != 0) {
        V3_Print("Disabling VMX\n");

        if (vmx_off() != VMX_SUCCESS) {
            PrintError("Error executing VMXOFF\n");
        }

        V3_FreePages((void *)host_vmcs_ptrs[cpu_id], 1);

        host_vmcs_ptrs[cpu_id] = 0;
    }
}