Palacios Public Git Repository

To check out Palacios, execute:

  git clone http://v3vee.org/palacios/palacios.web/palacios.git
This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, simply execute
  cd palacios
  git checkout --track -b devel origin/devel
The other branches are similar.


bc45c9b444b882e33666fa869e989be46e4d13d4
[palacios.git] / palacios / src / palacios / vmx.c
1 /* 
2  * This file is part of the Palacios Virtual Machine Monitor developed
3  * by the V3VEE Project with funding from the United States National 
4  * Science Foundation and the Department of Energy.  
5  *
6  * The V3VEE Project is a joint project between Northwestern University
7  * and the University of New Mexico.  You can find out more at 
8  * http://www.v3vee.org
9  *
10  * Copyright (c) 2011, Jack Lange <jarusl@cs.northwestern.edu>
11  * Copyright (c) 2011, The V3VEE Project <http://www.v3vee.org> 
12  * All rights reserved.
13  *
14  * Author: Jack Lange <jarusl@cs.northwestern.edu>
15  *
16  * This is free software.  You are permitted to use,
17  * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
18  */
19
20
21 #include <palacios/vmx.h>
22 #include <palacios/vmm.h>
23 #include <palacios/vmx_handler.h>
24 #include <palacios/vmcs.h>
25 #include <palacios/vmx_lowlevel.h>
26 #include <palacios/vmm_lowlevel.h>
27 #include <palacios/vmm_ctrl_regs.h>
28 #include <palacios/vmm_config.h>
29 #include <palacios/vmm_time.h>
30 #include <palacios/vm_guest_mem.h>
31 #include <palacios/vmm_direct_paging.h>
32 #include <palacios/vmx_io.h>
33 #include <palacios/vmx_msr.h>
34 #include <palacios/vmm_decoder.h>
35 #include <palacios/vmm_barrier.h>
36
37 #ifdef V3_CONFIG_CHECKPOINT
38 #include <palacios/vmm_checkpoint.h>
39 #endif
40
41 #include <palacios/vmx_ept.h>
42 #include <palacios/vmx_assist.h>
43 #include <palacios/vmx_hw_info.h>
44
45 #ifndef V3_CONFIG_DEBUG_VMX
46 #undef PrintDebug
47 #define PrintDebug(fmt, args...)
48 #endif
49
50
/* These fields contain the hardware feature sets supported by the local CPU */
static struct vmx_hw_info hw_info;

/* Per-physical-CPU virtualization capability, filled in elsewhere (see vmm.h) */
extern v3_cpu_arch_t v3_cpu_types[];

/* Per-physical-CPU host VMCS region pointers, one slot per possible CPU.
 * NOTE(review): not referenced in this chunk -- presumably set up by the
 * VMXON init path elsewhere in this file; confirm before removing. */
static addr_t host_vmcs_ptrs[V3_CONFIG_MAX_CPUS] = { [0 ... V3_CONFIG_MAX_CPUS - 1] = 0};

/* Low-level VM entry trampolines (VMLAUNCH/VMRESUME paths), defined in
 * assembly/lowlevel code; both return the VMX instruction status. */
extern int v3_vmx_launch(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
extern int v3_vmx_resume(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
60
61 static int inline check_vmcs_write(vmcs_field_t field, addr_t val) {
62     int ret = 0;
63
64     ret = vmcs_write(field, val);
65
66     if (ret != VMX_SUCCESS) {
67         PrintError("VMWRITE error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
68         return 1;
69     }
70
71     return 0;
72 }
73
74 static int inline check_vmcs_read(vmcs_field_t field, void * val) {
75     int ret = 0;
76
77     ret = vmcs_read(field, val);
78
79     if (ret != VMX_SUCCESS) {
80         PrintError("VMREAD error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
81     }
82
83     return ret;
84 }
85
86
87
88
89 static addr_t allocate_vmcs() {
90     struct vmcs_data * vmcs_page = NULL;
91
92     PrintDebug("Allocating page\n");
93
94     vmcs_page = (struct vmcs_data *)V3_VAddr(V3_AllocPages(1));
95     memset(vmcs_page, 0, 4096);
96
97     vmcs_page->revision = hw_info.basic_info.revision;
98     PrintDebug("VMX Revision: 0x%x\n", vmcs_page->revision);
99
100     return (addr_t)V3_PAddr((void *)vmcs_page);
101 }
102
103
104
105
106 static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state) {
107     int vmx_ret = 0;
108
109     // disable global interrupts for vm state initialization
110     v3_disable_ints();
111
112     PrintDebug("Loading VMCS\n");
113     vmx_ret = vmcs_load(vmx_state->vmcs_ptr_phys);
114     vmx_state->state = VMX_UNLAUNCHED;
115
116     if (vmx_ret != VMX_SUCCESS) {
117         PrintError("VMPTRLD failed\n");
118         return -1;
119     }
120
121
122     /*** Setup default state from HW ***/
123
124     vmx_state->pin_ctrls.value = hw_info.pin_ctrls.def_val;
125     vmx_state->pri_proc_ctrls.value = hw_info.proc_ctrls.def_val;
126     vmx_state->exit_ctrls.value = hw_info.exit_ctrls.def_val;
127     vmx_state->entry_ctrls.value = hw_info.entry_ctrls.def_val;
128     vmx_state->sec_proc_ctrls.value = hw_info.sec_proc_ctrls.def_val;
129
130     /* Print Control MSRs */
131     PrintDebug("CR0 MSR: %p\n", (void *)(addr_t)hw_info.cr0.value);
132     PrintDebug("CR4 MSR: %p\n", (void *)(addr_t)hw_info.cr4.value);
133
134
135
136     /******* Setup Host State **********/
137
138     /* Cache GDTR, IDTR, and TR in host struct */
139
140
141     /********** Setup VMX Control Fields ***********/
142
143     /* Add external interrupts, NMI exiting, and virtual NMI */
144     vmx_state->pin_ctrls.nmi_exit = 1;
145     vmx_state->pin_ctrls.ext_int_exit = 1;
146
147
148     vmx_state->pri_proc_ctrls.hlt_exit = 1;
149
150
151     vmx_state->pri_proc_ctrls.pause_exit = 0;
152     vmx_state->pri_proc_ctrls.tsc_offset = 1;
153 #ifdef V3_CONFIG_TIME_VIRTUALIZE_TSC
154     vmx_state->pri_proc_ctrls.rdtsc_exit = 1;
155 #endif
156
157     /* Setup IO map */
158     vmx_state->pri_proc_ctrls.use_io_bitmap = 1;
159     vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_A_ADDR, (addr_t)V3_PAddr(core->vm_info->io_map.arch_data));
160     vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_B_ADDR, 
161             (addr_t)V3_PAddr(core->vm_info->io_map.arch_data) + PAGE_SIZE_4KB);
162
163
164     vmx_state->pri_proc_ctrls.use_msr_bitmap = 1;
165     vmx_ret |= check_vmcs_write(VMCS_MSR_BITMAP, (addr_t)V3_PAddr(core->vm_info->msr_map.arch_data));
166
167
168
169 #ifdef __V3_64BIT__
170     // Ensure host runs in 64-bit mode at each VM EXIT
171     vmx_state->exit_ctrls.host_64_on = 1;
172 #endif
173
174     // Hook all accesses to EFER register
175     v3_hook_msr(core->vm_info, EFER_MSR, 
176                 &v3_handle_efer_read,
177                 &v3_handle_efer_write, 
178                 core);
179
180     // Restore host's EFER register on each VM EXIT
181     vmx_state->exit_ctrls.ld_efer = 1;
182
183     // Save/restore guest's EFER register to/from VMCS on VM EXIT/ENTRY
184     vmx_state->exit_ctrls.save_efer = 1;
185     vmx_state->entry_ctrls.ld_efer  = 1;
186
187     // Cause VM_EXIT whenever CR4.VMXE or CR4.PAE bits are written
188     vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);
189
190
191     /* Setup paging */
192     if (core->shdw_pg_mode == SHADOW_PAGING) {
193         PrintDebug("Creating initial shadow page table\n");
194
195         if (v3_init_passthrough_pts(core) == -1) {
196             PrintError("Could not initialize passthrough page tables\n");
197             return -1;
198         }
199         
200 #define CR0_PE 0x00000001
201 #define CR0_PG 0x80000000
202 #define CR0_WP 0x00010000 // To ensure mem hooks work
203         vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));
204
205         core->ctrl_regs.cr3 = core->direct_map_pt;
206
207         // vmx_state->pinbased_ctrls |= NMI_EXIT;
208
209         /* Add CR exits */
210         vmx_state->pri_proc_ctrls.cr3_ld_exit = 1;
211         vmx_state->pri_proc_ctrls.cr3_str_exit = 1;
212         
213         vmx_state->pri_proc_ctrls.invlpg_exit = 1;
214         
215         /* Add page fault exits */
216         vmx_state->excp_bmap.pf = 1;
217
218         // Setup VMX Assist
219         v3_vmxassist_init(core, vmx_state);
220
221     } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
222                (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_CPU)) {
223
224 #define CR0_PE 0x00000001
225 #define CR0_PG 0x80000000
226 #define CR0_WP 0x00010000 // To ensure mem hooks work
227         vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));
228
229         // vmx_state->pinbased_ctrls |= NMI_EXIT;
230
231         /* Disable CR exits */
232         vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
233         vmx_state->pri_proc_ctrls.cr3_str_exit = 0;
234
235         vmx_state->pri_proc_ctrls.invlpg_exit = 0;
236
237         /* Add page fault exits */
238         //      vmx_state->excp_bmap.pf = 1; // This should never happen..., enabled to catch bugs
239         
240         // Setup VMX Assist
241         v3_vmxassist_init(core, vmx_state);
242
243         /* Enable EPT */
244         vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
245         vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
246
247
248
249         if (v3_init_ept(core, &hw_info) == -1) {
250             PrintError("Error initializing EPT\n");
251             return -1;
252         }
253
254     } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
255                (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
256         int i = 0;
257         // For now we will assume that unrestricted guest mode is assured w/ EPT
258
259
260         core->vm_regs.rsp = 0x00;
261         core->rip = 0xfff0;
262         core->vm_regs.rdx = 0x00000f00;
263         core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
264         core->ctrl_regs.cr0 = 0x60010010; // Set the WP flag so the memory hooks work in real-mode
265
266
267         core->segments.cs.selector = 0xf000;
268         core->segments.cs.limit = 0xffff;
269         core->segments.cs.base = 0x0000000f0000LL;
270
271         // (raw attributes = 0xf3)
272         core->segments.cs.type = 0xb;
273         core->segments.cs.system = 0x1;
274         core->segments.cs.dpl = 0x0;
275         core->segments.cs.present = 1;
276
277
278
279         struct v3_segment * segregs [] = {&(core->segments.ss), &(core->segments.ds), 
280                                           &(core->segments.es), &(core->segments.fs), 
281                                           &(core->segments.gs), NULL};
282
283         for ( i = 0; segregs[i] != NULL; i++) {
284             struct v3_segment * seg = segregs[i];
285         
286             seg->selector = 0x0000;
287             //    seg->base = seg->selector << 4;
288             seg->base = 0x00000000;
289             seg->limit = 0xffff;
290
291
292             seg->type = 0x3;
293             seg->system = 0x1;
294             seg->dpl = 0x0;
295             seg->present = 1;
296             //    seg->granularity = 1;
297
298         }
299
300
301         core->segments.gdtr.limit = 0x0000ffff;
302         core->segments.gdtr.base = 0x0000000000000000LL;
303
304         core->segments.idtr.limit = 0x0000ffff;
305         core->segments.idtr.base = 0x0000000000000000LL;
306
307         core->segments.ldtr.selector = 0x0000;
308         core->segments.ldtr.limit = 0x0000ffff;
309         core->segments.ldtr.base = 0x0000000000000000LL;
310         core->segments.ldtr.type = 2;
311         core->segments.ldtr.present = 1;
312
313         core->segments.tr.selector = 0x0000;
314         core->segments.tr.limit = 0x0000ffff;
315         core->segments.tr.base = 0x0000000000000000LL;
316         core->segments.tr.type = 0xb;
317         core->segments.tr.present = 1;
318
319         //      core->dbg_regs.dr6 = 0x00000000ffff0ff0LL;
320         core->dbg_regs.dr7 = 0x0000000000000400LL;
321
322         /* Enable EPT */
323         vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
324         vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
325         vmx_state->sec_proc_ctrls.unrstrct_guest = 1; // enable unrestricted guest operation
326
327
328         /* Disable shadow paging stuff */
329         vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
330         vmx_state->pri_proc_ctrls.cr3_str_exit = 0;
331
332         vmx_state->pri_proc_ctrls.invlpg_exit = 0;
333
334
335         if (v3_init_ept(core, &hw_info) == -1) {
336             PrintError("Error initializing EPT\n");
337             return -1;
338         }
339
340     } else {
341         PrintError("Invalid Virtual paging mode\n");
342         return -1;
343     }
344
345
346     // hook vmx msrs
347
348     // Setup SYSCALL/SYSENTER MSRs in load/store area
349     
350     // save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
351     {
352         int msr_ret = 0;
353
354         struct vmcs_msr_entry * exit_store_msrs = NULL;
355         struct vmcs_msr_entry * exit_load_msrs = NULL;
356         struct vmcs_msr_entry * entry_load_msrs = NULL;;
357         int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;
358
359         V3_Print("Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);
360
361         if (max_msrs < 4) {
362             PrintError("Max MSR cache size is too small (%d)\n", max_msrs);
363             return -1;
364         }
365
366         vmx_state->msr_area = V3_VAddr(V3_AllocPages(1));
367
368         if (vmx_state->msr_area == NULL) {
369             PrintError("could not allocate msr load/store area\n");
370             return -1;
371         }
372
373         msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
374         msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
375         msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);
376         
377         
378         exit_store_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area);
379         exit_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 4));
380         entry_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 8));
381
382
383         exit_store_msrs[0].index = IA32_STAR_MSR;
384         exit_store_msrs[1].index = IA32_LSTAR_MSR;
385         exit_store_msrs[2].index = IA32_FMASK_MSR;
386         exit_store_msrs[3].index = IA32_KERN_GS_BASE_MSR;
387         
388         memcpy(exit_store_msrs, exit_load_msrs, sizeof(struct vmcs_msr_entry) * 4);
389         memcpy(exit_store_msrs, entry_load_msrs, sizeof(struct vmcs_msr_entry) * 4);
390
391         
392         v3_get_msr(IA32_STAR_MSR, &(exit_load_msrs[0].hi), &(exit_load_msrs[0].lo));
393         v3_get_msr(IA32_LSTAR_MSR, &(exit_load_msrs[1].hi), &(exit_load_msrs[1].lo));
394         v3_get_msr(IA32_FMASK_MSR, &(exit_load_msrs[2].hi), &(exit_load_msrs[2].lo));
395         v3_get_msr(IA32_KERN_GS_BASE_MSR, &(exit_load_msrs[3].hi), &(exit_load_msrs[3].lo));
396
397         msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(exit_store_msrs));
398         msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(exit_load_msrs));
399         msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(entry_load_msrs));
400
401
402         v3_hook_msr(core->vm_info, IA32_STAR_MSR, NULL, NULL, NULL);
403         v3_hook_msr(core->vm_info, IA32_LSTAR_MSR, NULL, NULL, NULL);
404         v3_hook_msr(core->vm_info, IA32_FMASK_MSR, NULL, NULL, NULL);
405         v3_hook_msr(core->vm_info, IA32_KERN_GS_BASE_MSR, NULL, NULL, NULL);
406
407
408         // IMPORTANT: These SYSCALL MSRs are currently not handled by hardware or cached
409         // We should really emulate these ourselves, or ideally include them in the MSR store area if there is room
410         v3_hook_msr(core->vm_info, IA32_CSTAR_MSR, NULL, NULL, NULL);
411         v3_hook_msr(core->vm_info, SYSENTER_CS_MSR, NULL, NULL, NULL);
412         v3_hook_msr(core->vm_info, SYSENTER_ESP_MSR, NULL, NULL, NULL);
413         v3_hook_msr(core->vm_info, SYSENTER_EIP_MSR, NULL, NULL, NULL);
414         
415         v3_hook_msr(core->vm_info, FS_BASE_MSR, NULL, NULL, NULL);
416         v3_hook_msr(core->vm_info, GS_BASE_MSR, NULL, NULL, NULL);
417         
418
419     }    
420
421     /* Sanity check ctrl/reg fields against hw_defaults */
422
423
424
425
426     /*** Write all the info to the VMCS ***/
427   
428     /*
429     {
430         // IS THIS NECESSARY???
431 #define DEBUGCTL_MSR 0x1d9
432         struct v3_msr tmp_msr;
433         v3_get_msr(DEBUGCTL_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
434         vmx_ret |= check_vmcs_write(VMCS_GUEST_DBG_CTL, tmp_msr.value);
435         core->dbg_regs.dr7 = 0x400;
436     }
437     */
438
439 #ifdef __V3_64BIT__
440     vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffffffffffULL);
441 #else
442     vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffUL);
443     vmx_ret |= check_vmcs_write(VMCS_LINK_PTR_HIGH, (addr_t)0xffffffffUL);
444 #endif
445
446
447  
448
449     if (v3_update_vmcs_ctrl_fields(core)) {
450         PrintError("Could not write control fields!\n");
451         return -1;
452     }
453     
454     if (v3_update_vmcs_host_state(core)) {
455         PrintError("Could not write host state\n");
456         return -1;
457     }
458
459     // reenable global interrupts for vm state initialization now
460     // that the vm state is initialized. If another VM kicks us off, 
461     // it'll update our vmx state so that we know to reload ourself
462     v3_enable_ints();
463
464     return 0;
465 }
466
467 int v3_init_vmx_vmcs(struct guest_info * core, v3_vm_class_t vm_class) {
468     struct vmx_data * vmx_state = NULL;
469     int vmx_ret = 0;
470     
471     vmx_state = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));
472     memset(vmx_state, 0, sizeof(struct vmx_data));
473
474     PrintDebug("vmx_data pointer: %p\n", (void *)vmx_state);
475
476     PrintDebug("Allocating VMCS\n");
477     vmx_state->vmcs_ptr_phys = allocate_vmcs();
478
479     PrintDebug("VMCS pointer: %p\n", (void *)(vmx_state->vmcs_ptr_phys));
480
481     core->vmm_data = vmx_state;
482     vmx_state->state = VMX_UNLAUNCHED;
483
484     PrintDebug("Initializing VMCS (addr=%p)\n", core->vmm_data);
485     
486     // TODO: Fix vmcs fields so they're 32-bit
487
488     PrintDebug("Clearing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
489     vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);
490
491     if (vmx_ret != VMX_SUCCESS) {
492         PrintError("VMCLEAR failed\n");
493         return -1; 
494     }
495
496     if (vm_class == V3_PC_VM) {
497         PrintDebug("Initializing VMCS\n");
498         if (init_vmcs_bios(core, vmx_state) == -1) {
499             PrintError("Error initializing VMCS to BIOS state\n");
500             return -1;
501         }
502     } else {
503         PrintError("Invalid VM Class\n");
504         return -1;
505     }
506
507     PrintDebug("Serializing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
508     vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);
509
510     return 0;
511 }
512
513
514 int v3_deinit_vmx_vmcs(struct guest_info * core) {
515     struct vmx_data * vmx_state = core->vmm_data;
516
517     V3_FreePages((void *)(vmx_state->vmcs_ptr_phys), 1);
518     V3_FreePages(V3_PAddr(vmx_state->msr_area), 1);
519
520     V3_Free(vmx_state);
521
522     return 0;
523 }
524
525
526
527 #ifdef V3_CONFIG_CHECKPOINT
528 /* 
529  * JRL: This is broken
530  */
/* Save this core's VMCS region into a checkpoint context.
 *
 * NOTE(review): flagged "broken" by the author (see comment above).
 * vmcs_store() yields the current VMCS *physical* address, which is then
 * passed to v3_chkpt_save() as a directly dereferenceable pointer --
 * presumably incorrect unless phys==virt here; confirm against the host
 * mapping. Also, the in-memory VMCS layout is implementation-opaque, so a
 * raw page copy may not capture valid state. The v3_chkpt_save() return
 * value is ignored; this function always reports success.
 */
int v3_vmx_save_core(struct guest_info * core, void * ctx){
    uint64_t vmcs_ptr = vmcs_store();

    v3_chkpt_save(ctx, "vmcs_data", PAGE_SIZE, (void *)vmcs_ptr);

    return 0;
}
538
/* Restore this core's VMCS state from a checkpoint context.
 *
 * Loads the saved VMCS page into a stack buffer, makes it current, pulls
 * the guest state out of it into the guest_info struct, and re-activates
 * the appropriate page tables for shadow paging.
 *
 * NOTE(review): part of the checkpoint path flagged "broken" above.
 * vmcs_load() is given the *virtual* address of a stack buffer, but
 * VMPTRLD takes a physical address of a properly aligned VMCS region --
 * verify this against vmcs_load()'s contract.
 * Returns 0 on success, -1 if page table activation fails.
 */
int v3_vmx_load_core(struct guest_info * core, void * ctx){
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    struct cr0_32 * shadow_cr0;
    // Temporary buffer holding the checkpointed VMCS image
    char vmcs[PAGE_SIZE_4KB];

    // Return value ignored; a failed load silently proceeds with stale state
    v3_chkpt_load(ctx, "vmcs_data", PAGE_SIZE_4KB, vmcs);

    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmcs_load((addr_t)vmcs);

    // Copy guest register/segment state from the loaded VMCS into core
    v3_vmx_save_vmcs(core);

    // NOTE(review): computed but never used below -- likely leftover from
    // the guest_ia32e logic hinted at by the comment that follows
    shadow_cr0 = (struct cr0_32 *)&(core->ctrl_regs.cr0);


    /* Get the CPU mode to set the guest_ia32e entry ctrl */

    if (core->shdw_pg_mode == SHADOW_PAGING) {
        if (v3_get_vm_mem_mode(core) == VIRTUAL_MEM) {
            if (v3_activate_shadow_pt(core) == -1) {
                PrintError("Failed to activate shadow page tables\n");
                return -1;
            }
        } else {
            if (v3_activate_passthrough_pt(core) == -1) {
                PrintError("Failed to activate passthrough page tables\n");
                return -1;
            }
        }
    }

    return 0;
}
572 #endif
573
574
575 void v3_flush_vmx_vm_core(struct guest_info * core) {
576     struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
577     vmcs_clear(vmx_info->vmcs_ptr_phys);
578     vmx_info->state = VMX_UNLAUNCHED;
579 }
580
581
582
583 static int update_irq_exit_state(struct guest_info * info) {
584     struct vmx_exit_idt_vec_info idt_vec_info;
585
586     check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
587
588     if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 0)) {
589 #ifdef V3_CONFIG_DEBUG_INTERRUPTS
590         V3_Print("Calling v3_injecting_intr\n");
591 #endif
592         info->intr_core_state.irq_started = 0;
593         v3_injecting_intr(info, info->intr_core_state.irq_vector, V3_EXTERNAL_IRQ);
594     }
595
596     return 0;
597 }
598
/* Pre-entry event injection: decide what (if anything) to inject into the
 * guest on the next VM entry.
 *
 * Priority order implemented below:
 *   1. Pending guest exceptions (always injectable).
 *   2. If the guest is interruptible (RFLAGS.IF set, no interrupt-state
 *      blocking): re-inject an interrupted prior injection, else inject
 *      the next pending interrupt/NMI/software interrupt.
 *   3. If interrupts are pending but the guest is not interruptible,
 *      arm interrupt-window exiting so we get control when IF goes to 1.
 *
 * Must be called with the core's VMCS current. Returns 0.
 */
static int update_irq_entry_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;
    struct vmcs_interrupt_state intr_core_state;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
    check_vmcs_read(VMCS_GUEST_INT_STATE, &(intr_core_state));

    /* Check for pending exceptions to inject */
    if (v3_excp_pending(info)) {
        struct vmx_entry_int_info int_info;
        int_info.value = 0;

        // In VMX, almost every exception is hardware
        // Software exceptions are pretty much only for breakpoint or overflow
        int_info.type = 3;
        int_info.vector = v3_get_excp_number(info);

        if (info->excp_state.excp_error_code_valid) {
            // The error code goes in its own VMCS field; the entry-info
            // field only carries the "deliver error code" flag
            check_vmcs_write(VMCS_ENTRY_EXCP_ERR, info->excp_state.excp_error_code);
            int_info.error_code = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("Injecting exception %d with error code %x\n", 
                    int_info.vector, info->excp_state.excp_error_code);
#endif
        }

        int_info.valid = 1;
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Injecting exception %d (EIP=%p)\n", int_info.vector, (void *)(addr_t)info->rip);
#endif
        check_vmcs_write(VMCS_ENTRY_INT_INFO, int_info.value);

        // Tell the exception core this exception is now in flight
        v3_injecting_excp(info, int_info.vector);

    } else if ((((struct rflags *)&(info->ctrl_regs.rflags))->intr == 1) && 
               (intr_core_state.val == 0)) {
       
        // Guest is interruptible: IF=1 and no STI/MOV-SS blocking
        if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 1)) {

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("IRQ pending from previous injection\n");
#endif

            // Copy the IDT vectoring info over to reinject the old interrupt
            if (idt_vec_info.error_code == 1) {
                uint32_t err_code = 0;

                check_vmcs_read(VMCS_IDT_VECTOR_ERR, &err_code);
                check_vmcs_write(VMCS_ENTRY_EXCP_ERR, err_code);
            }

            // The IDT-vectoring layout matches the entry-interrupt-info
            // layout except for this bit, which must be cleared
            idt_vec_info.undef = 0;
            check_vmcs_write(VMCS_ENTRY_INT_INFO, idt_vec_info.value);

        } else {
            struct vmx_entry_int_info ent_int;
            ent_int.value = 0;

            switch (v3_intr_pending(info)) {
                case V3_EXTERNAL_IRQ: {
                    // type 0 == external interrupt
                    info->intr_core_state.irq_vector = v3_get_intr(info); 
                    ent_int.vector = info->intr_core_state.irq_vector;
                    ent_int.type = 0;
                    ent_int.error_code = 0;
                    ent_int.valid = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
                    V3_Print("Injecting Interrupt %d at exit %u(EIP=%p)\n", 
                               info->intr_core_state.irq_vector, 
                               (uint32_t)info->num_exits, 
                               (void *)(addr_t)info->rip);
#endif

                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);
                    // Remember the in-flight injection for update_irq_exit_state()
                    info->intr_core_state.irq_started = 1;

                    break;
                }
                case V3_NMI:
                    PrintDebug("Injecting NMI\n");

                    // type 2 == NMI, which is architecturally vector 2
                    ent_int.type = 2;
                    ent_int.vector = 2;
                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_SOFTWARE_INTR:
                    // type 4 == software interrupt
                    // NOTE(review): ent_int.vector is left 0 here --
                    // presumably it should be set from the pending software
                    // interrupt's vector; confirm against the swintr code
                    PrintDebug("Injecting software interrupt\n");
                    ent_int.type = 4;

                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_VIRTUAL_IRQ:
                    // Not sure what to do here, Intel doesn't have virtual IRQs
                    // May be the same as external interrupts/IRQs

                    break;
                case V3_INVALID_INTR:
                default:
                    break;
            }
        }
    } else if ((v3_intr_pending(info)) && (vmx_info->pri_proc_ctrls.int_wndw_exit == 0)) {
        // Enable INTR window exiting so we know when IF=1
        uint32_t instr_len;

        check_vmcs_read(VMCS_EXIT_INSTR_LEN, &instr_len);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Enabling Interrupt-Window exiting: %d\n", instr_len);
#endif

        vmx_info->pri_proc_ctrls.int_wndw_exit = 1;
        check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
    }


    return 0;
}
723
724
725
726 static struct vmx_exit_info exit_log[10];
727
728 static void print_exit_log(struct guest_info * info) {
729     int cnt = info->num_exits % 10;
730     int i = 0;
731     
732
733     V3_Print("\nExit Log (%d total exits):\n", (uint32_t)info->num_exits);
734
735     for (i = 0; i < 10; i++) {
736         struct vmx_exit_info * tmp = &exit_log[cnt];
737
738         V3_Print("%d:\texit_reason = %p\n", i, (void *)(addr_t)tmp->exit_reason);
739         V3_Print("\texit_qual = %p\n", (void *)tmp->exit_qual);
740         V3_Print("\tint_info = %p\n", (void *)(addr_t)tmp->int_info);
741         V3_Print("\tint_err = %p\n", (void *)(addr_t)tmp->int_err);
742         V3_Print("\tinstr_info = %p\n", (void *)(addr_t)tmp->instr_info);
743
744         cnt--;
745
746         if (cnt == -1) {
747             cnt = 9;
748         }
749
750     }
751
752 }
753
754 /* 
755  * CAUTION and DANGER!!! 
756  * 
757  * The VMCS CANNOT(!!) be accessed outside of the cli/sti calls inside this function
758  * When exectuing a symbiotic call, the VMCS WILL be overwritten, so any dependencies 
759  * on its contents will cause things to break. The contents at the time of the exit WILL 
760  * change before the exit handler is executed.
761  */
/* Run the guest core for one VM entry/exit cycle.
 *
 * Sequence: yield/time bookkeeping -> disable host interrupts -> make this
 * core's VMCS current -> restore guest state & inject events -> program TSC
 * offset and host state -> VMLAUNCH/VMRESUME -> save exit state -> re-enable
 * host interrupts -> dispatch the exit handler.
 *
 * See the CAUTION comment above: the VMCS must not be touched outside the
 * interrupt-disabled window in this function.
 * Returns 0 on a handled exit, -1 on entry failure or handler error.
 */
int v3_vmx_enter(struct guest_info * info) {
    int ret = 0;
    uint32_t tsc_offset_low, tsc_offset_high;
    struct vmx_exit_info exit_info;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    // Perform any additional yielding needed for time adjustment
    v3_adjust_time(info);

    // disable global interrupts for vm state transition
    v3_disable_ints();

    // Update timer devices late after being in the VM so that as much 
    // of the time in the VM is accounted for as possible. Also do it before
    // updating IRQ entry state so that any interrupts the timers raise get 
    // handled on the next VM entry. Must be done with interrupts disabled.
    v3_update_timers(info);

    // If another core's VMCS is current on this CPU, swap ours back in.
    // After VMCLEAR the VMCS must be VMLAUNCH'd, not VMRESUME'd.
    if (vmcs_store() != vmx_info->vmcs_ptr_phys) {
        vmcs_clear(vmx_info->vmcs_ptr_phys);
        vmcs_load(vmx_info->vmcs_ptr_phys);
        vmx_info->state = VMX_UNLAUNCHED;
    }

    v3_vmx_restore_vmcs(info);


#ifdef V3_CONFIG_SYMCALL
    // Skip event injection while a symbiotic call is in progress
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_entry_state(info);
    }
#else 
    update_irq_entry_state(info);
#endif

    // NOTE(review): reads guest CR3 and writes the same value back --
    // presumably a deliberate workaround (e.g. to force a VMCS field
    // refresh); no comment explains it, confirm before removing
    {
        addr_t guest_cr3;
        vmcs_read(VMCS_GUEST_CR3, &guest_cr3);
        vmcs_write(VMCS_GUEST_CR3, guest_cr3);
    }

    // Perform last-minute time bookkeeping prior to entering the VM
    v3_time_enter_vm(info);

    // Program the guest TSC offset as two 32-bit VMCS writes
    tsc_offset_high = (uint32_t)((v3_tsc_host_offset(&info->time_state) >> 32) & 0xffffffff);
    tsc_offset_low = (uint32_t)(v3_tsc_host_offset(&info->time_state) & 0xffffffff);
    check_vmcs_write(VMCS_TSC_OFFSET_HIGH, tsc_offset_high);
    check_vmcs_write(VMCS_TSC_OFFSET, tsc_offset_low);

    if (v3_update_vmcs_host_state(info)) {
        v3_enable_ints();
        PrintError("Could not write host state\n");
        return -1;
    }


    // First entry after load/clear must use VMLAUNCH; after that, VMRESUME
    if (vmx_info->state == VMX_UNLAUNCHED) {
        vmx_info->state = VMX_LAUNCHED;

        info->vm_info->run_state = VM_RUNNING;
        ret = v3_vmx_launch(&(info->vm_regs), info, &(info->ctrl_regs));
    } else {
        V3_ASSERT(vmx_info->state != VMX_UNLAUNCHED);
        ret = v3_vmx_resume(&(info->vm_regs), info, &(info->ctrl_regs));
    }
    


    //  PrintDebug("VMX Exit: ret=%d\n", ret);

    if (ret != VMX_SUCCESS) {
        uint32_t error = 0;
        vmcs_read(VMCS_INSTR_ERR, &error);

        v3_enable_ints();

        // NOTE(review): this path skips v3_time_exit_vm(), so time state
        // may be left inconsistent after a failed entry -- confirm intent
        PrintError("VMENTRY Error: %d (launch_ret = %d)\n", error, ret);
        return -1;
    }



    // Immediate exit from VM time bookkeeping
    v3_time_exit_vm(info);

    info->num_exits++;

    /* Update guest state */
    v3_vmx_save_vmcs(info);

    // info->cpl = info->segments.cs.selector & 0x3;

    info->mem_mode = v3_get_vm_mem_mode(info);
    info->cpu_mode = v3_get_vm_cpu_mode(info);


    // Snapshot all exit information now; the VMCS contents may be
    // clobbered once interrupts are re-enabled (see CAUTION above)
    check_vmcs_read(VMCS_EXIT_INSTR_LEN, &(exit_info.instr_len));
    check_vmcs_read(VMCS_EXIT_INSTR_INFO, &(exit_info.instr_info));
    check_vmcs_read(VMCS_EXIT_REASON, &(exit_info.exit_reason));
    check_vmcs_read(VMCS_EXIT_QUAL, &(exit_info.exit_qual));
    check_vmcs_read(VMCS_EXIT_INT_INFO, &(exit_info.int_info));
    check_vmcs_read(VMCS_EXIT_INT_ERR, &(exit_info.int_err));
    check_vmcs_read(VMCS_GUEST_LINEAR_ADDR, &(exit_info.guest_linear_addr));

    if (info->shdw_pg_mode == NESTED_PAGING) {
        check_vmcs_read(VMCS_GUEST_PHYS_ADDR, &(exit_info.ept_fault_addr));
    }

    //PrintDebug("VMX Exit taken, id-qual: %u-%lu\n", exit_info.exit_reason, exit_info.exit_qual);

    // Record the exit in the debug ring buffer (see print_exit_log)
    exit_log[info->num_exits % 10] = exit_info;

#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_exit_state(info);
    }
#else
    update_irq_exit_state(info);
#endif

    if (exit_info.exit_reason == VMEXIT_INTR_WINDOW) {
        // This is a special case whose only job is to inject an interrupt
        vmcs_read(VMCS_PROC_CTRLS, &(vmx_info->pri_proc_ctrls.value));
        vmx_info->pri_proc_ctrls.int_wndw_exit = 0;
        vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
       V3_Print("Interrupts available again! (RIP=%llx)\n", info->rip);
#endif
    }

    // reenable global interrupts after vm exit
    v3_enable_ints();

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    if (v3_handle_vmx_exit(info, &exit_info) == -1) {
        PrintError("Error in VMX exit handler (Exit reason=%x)\n", exit_info.exit_reason);
        return -1;
    }

    return 0;
}
909
910
911 int v3_start_vmx_guest(struct guest_info * info) {
912
913     PrintDebug("Starting VMX core %u\n", info->vcpu_id);
914
915     if (info->vcpu_id == 0) {
916         info->core_run_state = CORE_RUNNING;
917         info->vm_info->run_state = VM_RUNNING;
918     } else {
919
920         PrintDebug("VMX core %u: Waiting for core initialization\n", info->vcpu_id);
921
922         while (info->core_run_state == CORE_STOPPED) {
923
924             if (info->vm_info->run_state == VM_STOPPED) {
925                 // The VM was stopped before this core was initialized. 
926                 return 0;
927             }
928
929             v3_yield(info);
930             //PrintDebug("VMX core %u: still waiting for INIT\n",info->vcpu_id);
931         }
932         
933         PrintDebug("VMX core %u initialized\n", info->vcpu_id);
934
935         // We'll be paranoid about race conditions here
936         v3_wait_at_barrier(info);
937     }
938
939
940     PrintDebug("VMX core %u: I am starting at CS=0x%x (base=0x%p, limit=0x%x),  RIP=0x%p\n",
941                info->vcpu_id, info->segments.cs.selector, (void *)(info->segments.cs.base),
942                info->segments.cs.limit, (void *)(info->rip));
943
944
945     PrintDebug("VMX core %u: Launching VMX VM on logical core %u\n", info->vcpu_id, info->pcpu_id);
946
947     v3_start_time(info);
948
949     while (1) {
950
951         if (info->vm_info->run_state == VM_STOPPED) {
952             info->core_run_state = CORE_STOPPED;
953             break;
954         }
955
956         if (v3_vmx_enter(info) == -1) {
957
958             addr_t host_addr;
959             addr_t linear_addr = 0;
960             
961             info->vm_info->run_state = VM_ERROR;
962             
963             V3_Print("VMX core %u: VMX ERROR!!\n", info->vcpu_id); 
964             
965             v3_print_guest_state(info);
966             
967             V3_Print("VMX core %u\n", info->vcpu_id); 
968
969             linear_addr = get_addr_linear(info, info->rip, &(info->segments.cs));
970             
971             if (info->mem_mode == PHYSICAL_MEM) {
972                 v3_gpa_to_hva(info, linear_addr, &host_addr);
973             } else if (info->mem_mode == VIRTUAL_MEM) {
974                 v3_gva_to_hva(info, linear_addr, &host_addr);
975             }
976             
977             V3_Print("VMX core %u: Host Address of rip = 0x%p\n", info->vcpu_id, (void *)host_addr);
978             
979             V3_Print("VMX core %u: Instr (15 bytes) at %p:\n", info->vcpu_id, (void *)host_addr);
980             v3_dump_mem((uint8_t *)host_addr, 15);
981             
982             v3_print_stack(info);
983
984
985             v3_print_vmcs();
986             print_exit_log(info);
987             return -1;
988         }
989
990         v3_wait_at_barrier(info);
991
992
993         if (info->vm_info->run_state == VM_STOPPED) {
994             info->core_run_state = CORE_STOPPED;
995             break;
996         }
997 /*
998         if ((info->num_exits % 5000) == 0) {
999             V3_Print("VMX Exit number %d\n", (uint32_t)info->num_exits);
1000         }
1001 */
1002
1003     }
1004
1005     return 0;
1006 }
1007
1008
1009
1010
1011 #define VMX_FEATURE_CONTROL_MSR     0x0000003a
1012 #define CPUID_VMX_FEATURES 0x00000005  /* LOCK and VMXON */
1013 #define CPUID_1_ECX_VTXFLAG 0x00000020
1014
1015 int v3_is_vmx_capable() {
1016     v3_msr_t feature_msr;
1017     uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
1018
1019     v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);
1020
1021     PrintDebug("ECX: 0x%x\n", ecx);
1022
1023     if (ecx & CPUID_1_ECX_VTXFLAG) {
1024         v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));
1025         
1026         PrintDebug("MSRREGlow: 0x%.8x\n", feature_msr.lo);
1027
1028         if ((feature_msr.lo & CPUID_VMX_FEATURES) != CPUID_VMX_FEATURES) {
1029             PrintDebug("VMX is locked -- enable in the BIOS\n");
1030             return 0;
1031         }
1032
1033     } else {
1034         PrintDebug("VMX not supported on this cpu\n");
1035         return 0;
1036     }
1037
1038     return 1;
1039 }
1040
1041
1042 int v3_reset_vmx_vm_core(struct guest_info * core, addr_t rip) {
1043     // init vmcs bios
1044     
1045     if ((core->shdw_pg_mode == NESTED_PAGING) && 
1046         (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
1047         // easy 
1048         core->rip = 0;
1049         core->segments.cs.selector = rip << 8;
1050         core->segments.cs.limit = 0xffff;
1051         core->segments.cs.base = rip << 12;
1052     } else {
1053         core->vm_regs.rdx = core->vcpu_id;
1054         core->vm_regs.rbx = rip;
1055     }
1056
1057     return 0;
1058 }
1059
1060
1061
1062 void v3_init_vmx_cpu(int cpu_id) {
1063     addr_t vmx_on_region = 0;
1064
1065     if (cpu_id == 0) {
1066         if (v3_init_vmx_hw(&hw_info) == -1) {
1067             PrintError("Could not initialize VMX hardware features on cpu %d\n", cpu_id);
1068             return;
1069         }
1070     }
1071
1072     enable_vmx();
1073
1074
1075     // Setup VMXON Region
1076     vmx_on_region = allocate_vmcs();
1077
1078
1079     if (vmx_on(vmx_on_region) == VMX_SUCCESS) {
1080         V3_Print("VMX Enabled\n");
1081         host_vmcs_ptrs[cpu_id] = vmx_on_region;
1082     } else {
1083         V3_Print("VMX already enabled\n");
1084         V3_FreePages((void *)vmx_on_region, 1);
1085     }
1086
1087     PrintDebug("VMXON pointer: 0x%p\n", (void *)host_vmcs_ptrs[cpu_id]);    
1088
1089     {
1090         struct vmx_sec_proc_ctrls sec_proc_ctrls;
1091         sec_proc_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.sec_proc_ctrls));
1092         
1093         if (sec_proc_ctrls.enable_ept == 0) {
1094             V3_Print("VMX EPT (Nested) Paging not supported\n");
1095             v3_cpu_types[cpu_id] = V3_VMX_CPU;
1096         } else if (sec_proc_ctrls.unrstrct_guest == 0) {
1097             V3_Print("VMX EPT (Nested) Paging supported\n");
1098             v3_cpu_types[cpu_id] = V3_VMX_EPT_CPU;
1099         } else {
1100             V3_Print("VMX EPT (Nested) Paging + Unrestricted guest supported\n");
1101             v3_cpu_types[cpu_id] = V3_VMX_EPT_UG_CPU;
1102         }
1103     }
1104 }
1105
1106
1107 void v3_deinit_vmx_cpu(int cpu_id) {
1108     extern v3_cpu_arch_t v3_cpu_types[];
1109     v3_cpu_types[cpu_id] = V3_INVALID_CPU;
1110
1111     if (host_vmcs_ptrs[cpu_id] != 0) {
1112         V3_Print("Disabling VMX\n");
1113
1114         if (vmx_off() != VMX_SUCCESS) {
1115             PrintError("Error executing VMXOFF\n");
1116         }
1117
1118         V3_FreePages((void *)host_vmcs_ptrs[cpu_id], 1);
1119
1120         host_vmcs_ptrs[cpu_id] = 0;
1121     }
1122 }