Palacios Public Git Repository

To check out Palacios, execute:

  git clone http://v3vee.org/palacios/palacios.web/palacios.git

This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, execute:
  cd palacios
  git checkout --track -b devel origin/devel

The other branches work the same way. For example, to check out a release branch (the branch name below is illustrative; list the real ones with git branch -r):
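  git branch -r
  git checkout --track -b Release-1.2 origin/Release-1.2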


added a global machine type to determine machine architecture
palacios/src/palacios/vmx.c
index 419b706..4f37e8e 100644
 #include <palacios/vmx_io.h>
 #include <palacios/vmx_msr.h>
 #include <palacios/vmm_decoder.h>
+#include <palacios/vmm_barrier.h>
+
+#ifdef V3_CONFIG_CHECKPOINT
+#include <palacios/vmm_checkpoint.h>
+#endif
 
 #include <palacios/vmx_ept.h>
 #include <palacios/vmx_assist.h>
@@ -131,53 +136,6 @@ static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state)
     /******* Setup Host State **********/
 
     /* Cache GDTR, IDTR, and TR in host struct */
-    addr_t gdtr_base;
-    struct {
-        uint16_t selector;
-        addr_t   base;
-    } __attribute__((packed)) tmp_seg;
-    
-
-    __asm__ __volatile__(
-                        "sgdt (%0);"
-                        :
-                        : "q"(&tmp_seg)
-                        : "memory"
-                        );
-    gdtr_base = tmp_seg.base;
-    vmx_state->host_state.gdtr.base = gdtr_base;
-
-    __asm__ __volatile__(
-                        "sidt (%0);"
-                        :
-                        : "q"(&tmp_seg)
-                        : "memory"
-                        );
-    vmx_state->host_state.idtr.base = tmp_seg.base;
-
-    __asm__ __volatile__(
-                        "str (%0);"
-                        :
-                        : "q"(&tmp_seg)
-                        : "memory"
-                        );
-    vmx_state->host_state.tr.selector = tmp_seg.selector;
-
-    /* The GDTR *index* is bits 3-15 of the selector. */
-    struct tss_descriptor * desc = NULL;
-    desc = (struct tss_descriptor *)(gdtr_base + (8 * (tmp_seg.selector >> 3)));
-
-    tmp_seg.base = ((desc->base1) |
-                   (desc->base2 << 16) |
-                   (desc->base3 << 24) |
-#ifdef __V3_64BIT__
-                   ((uint64_t)desc->base4 << 32)
-#else 
-                   (0)
-#endif
-                   );
-
-    vmx_state->host_state.tr.base = tmp_seg.base;
 
 
     /********** Setup VMX Control Fields ***********/
@@ -208,10 +166,6 @@ static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state)
 
 
 
-    
-
-
-
 #ifdef __V3_64BIT__
     // Ensure host runs in 64-bit mode at each VM EXIT
     vmx_state->exit_ctrls.host_64_on = 1;
@@ -234,6 +188,9 @@ static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state)
     vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);
 
 
+    // Setup Guest's initial PAT field
+    vmx_ret |= check_vmcs_write(VMCS_GUEST_PAT, 0x0007040600070406LL);
+
     /* Setup paging */
     if (core->shdw_pg_mode == SHADOW_PAGING) {
         PrintDebug("Creating initial shadow page table\n");
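
For reference, the guest PAT value written above is the architectural power-on default; decoded byte by byte (low byte first, each byte selecting the memory type for one PAT entry):

  0x0007040600070406
    PA0/PA4 = 0x06  Writeback (WB)
    PA1/PA5 = 0x04  Writethrough (WT)
    PA2/PA6 = 0x07  Uncached (UC-)
    PA3/PA7 = 0x00  Uncacheable (UC)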
@@ -307,8 +264,9 @@ static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state)
        core->rip = 0xfff0;
        core->vm_regs.rdx = 0x00000f00;
        core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
-       core->ctrl_regs.cr0 = 0x60010010; // Set the WP flag so the memory hooks work in real-mode
-
+       core->ctrl_regs.cr0 = 0x00000030; 
+       core->ctrl_regs.cr4 = 0x00002010; // Enable VMX and PSE flag
+       
 
        core->segments.cs.selector = 0xf000;
        core->segments.cs.limit = 0xffff;
@@ -353,7 +311,7 @@ static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state)
        core->segments.ldtr.selector = 0x0000;
        core->segments.ldtr.limit = 0x0000ffff;
        core->segments.ldtr.base = 0x0000000000000000LL;
-       core->segments.ldtr.type = 2;
+       core->segments.ldtr.type = 0x2;
        core->segments.ldtr.present = 1;
 
        core->segments.tr.selector = 0x0000;
@@ -395,19 +353,10 @@ static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state)
     
     // save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
     {
-#define IA32_STAR 0xc0000081
-#define IA32_LSTAR 0xc0000082
-#define IA32_FMASK 0xc0000084
-#define IA32_KERN_GS_BASE 0xc0000102
-
-#define IA32_CSTAR 0xc0000083 // Compatibility mode STAR (ignored for now... hopefully its not that important...)
-
-       int msr_ret = 0;
 
-       struct vmcs_msr_entry * exit_store_msrs = NULL;
-       struct vmcs_msr_entry * exit_load_msrs = NULL;
-       struct vmcs_msr_entry * entry_load_msrs = NULL;;
+       struct vmcs_msr_save_area * msr_entries = NULL;
        int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;
+       int msr_ret = 0;
 
        V3_Print("Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);
 
@@ -416,40 +365,60 @@ static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state)
            return -1;
        }
 
-       vmx_state->msr_area = V3_VAddr(V3_AllocPages(1));
-
-       if (vmx_state->msr_area == NULL) {
+       vmx_state->msr_area_paddr = (addr_t)V3_AllocPages(1);
+       
+       if (vmx_state->msr_area_paddr == (addr_t)NULL) {
            PrintError("could not allocate msr load/store area\n");
            return -1;
        }
 
+       msr_entries = (struct vmcs_msr_save_area *)V3_VAddr((void *)(vmx_state->msr_area_paddr));
+       vmx_state->msr_area = msr_entries; // cache in vmx_info
+
+       memset(msr_entries, 0, PAGE_SIZE);
+
+       msr_entries->guest_star.index = IA32_STAR_MSR;
+       msr_entries->guest_lstar.index = IA32_LSTAR_MSR;
+       msr_entries->guest_fmask.index = IA32_FMASK_MSR;
+       msr_entries->guest_kern_gs.index = IA32_KERN_GS_BASE_MSR;
+
+       msr_entries->host_star.index = IA32_STAR_MSR;
+       msr_entries->host_lstar.index = IA32_LSTAR_MSR;
+       msr_entries->host_fmask.index = IA32_FMASK_MSR;
+       msr_entries->host_kern_gs.index = IA32_KERN_GS_BASE_MSR;
+
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);
-       
-       
-       exit_store_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area);
-       exit_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 4));
-       entry_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 8));
 
+       msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(msr_entries->guest_msrs));
+       msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(msr_entries->guest_msrs));
+       msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(msr_entries->host_msrs));
 
-       exit_store_msrs[0].index = IA32_STAR;
-       exit_store_msrs[1].index = IA32_LSTAR;
-       exit_store_msrs[2].index = IA32_FMASK;
-       exit_store_msrs[3].index = IA32_KERN_GS_BASE;
-       
-       memcpy(exit_store_msrs, exit_load_msrs, sizeof(struct vmcs_msr_entry) * 4);
-       memcpy(exit_store_msrs, entry_load_msrs, sizeof(struct vmcs_msr_entry) * 4);
 
-       
-       v3_get_msr(IA32_STAR, &(exit_load_msrs[0].hi), &(exit_load_msrs[0].lo));
-       v3_get_msr(IA32_LSTAR, &(exit_load_msrs[1].hi), &(exit_load_msrs[1].lo));
-       v3_get_msr(IA32_FMASK, &(exit_load_msrs[2].hi), &(exit_load_msrs[2].lo));
-       v3_get_msr(IA32_KERN_GS_BASE, &(exit_load_msrs[3].hi), &(exit_load_msrs[3].lo));
+       msr_ret |= v3_hook_msr(core->vm_info, IA32_STAR_MSR, NULL, NULL, NULL);
+       msr_ret |= v3_hook_msr(core->vm_info, IA32_LSTAR_MSR, NULL, NULL, NULL);
+       msr_ret |= v3_hook_msr(core->vm_info, IA32_FMASK_MSR, NULL, NULL, NULL);
+       msr_ret |= v3_hook_msr(core->vm_info, IA32_KERN_GS_BASE_MSR, NULL, NULL, NULL);
+
+
+       // IMPORTANT: These MSRs appear to be cached by the hardware....
+       msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_CS_MSR, NULL, NULL, NULL);
+       msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_ESP_MSR, NULL, NULL, NULL);
+       msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_EIP_MSR, NULL, NULL, NULL);
+
+       msr_ret |= v3_hook_msr(core->vm_info, FS_BASE_MSR, NULL, NULL, NULL);
+       msr_ret |= v3_hook_msr(core->vm_info, GS_BASE_MSR, NULL, NULL, NULL);
+
+
+       // Not sure what to do about this... Does not appear to be an explicit hardware cache version...
+       msr_ret |= v3_hook_msr(core->vm_info, IA32_CSTAR_MSR, NULL, NULL, NULL);
+
+       if (msr_ret != 0) {
+           PrintError("Error configuring MSR save/restore area\n");
+           return -1;
+       }
 
-       msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(exit_store_msrs));
-       msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(exit_load_msrs));
-       msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(entry_load_msrs));
 
     }    
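
The restructured MSR handling above assumes a vmcs_msr_save_area type that overlays named guest/host entries on two four-entry arrays, so the same memory serves as both the VM-exit store list and the VM-entry load list (guest_msrs), and as the VM-exit load list (host_msrs). A sketch of the layout implied by the accesses in this hunk (the real definition lives in a vmx header; the anonymous unions are an inference, not the verified source):

  struct vmcs_msr_save_area {
      union {
          struct vmcs_msr_entry guest_msrs[4];
          struct {
              struct vmcs_msr_entry guest_star;
              struct vmcs_msr_entry guest_lstar;
              struct vmcs_msr_entry guest_fmask;
              struct vmcs_msr_entry guest_kern_gs;
          };
      };
      union {
          struct vmcs_msr_entry host_msrs[4];
          struct {
              struct vmcs_msr_entry host_star;
              struct vmcs_msr_entry host_lstar;
              struct vmcs_msr_entry host_fmask;
              struct vmcs_msr_entry host_kern_gs;
          };
      };
  } __attribute__((packed));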
 
@@ -486,10 +455,12 @@ static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state)
         return -1;
     }
     
+    /*
     if (v3_update_vmcs_host_state(core)) {
         PrintError("Could not write host state\n");
         return -1;
     }
+    */
 
     // reenable global interrupts for vm state initialization now
     // that the vm state is initialized. If another VM kicks us off, 
@@ -558,6 +529,63 @@ int v3_deinit_vmx_vmcs(struct guest_info * core) {
 }
 
 
+
+#ifdef V3_CONFIG_CHECKPOINT
+/* 
+ * JRL: This is broken
+ */
+int v3_vmx_save_core(struct guest_info * core, void * ctx){
+    uint64_t vmcs_ptr = vmcs_store();
+
+    v3_chkpt_save(ctx, "vmcs_data", PAGE_SIZE, (void *)vmcs_ptr);
+
+    return 0;
+}
+
+int v3_vmx_load_core(struct guest_info * core, void * ctx){
+    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
+    struct cr0_32 * shadow_cr0;
+    char vmcs[PAGE_SIZE_4KB];
+
+    v3_chkpt_load(ctx, "vmcs_data", PAGE_SIZE_4KB, vmcs);
+
+    vmcs_clear(vmx_info->vmcs_ptr_phys);
+    vmcs_load((addr_t)vmcs);
+
+    v3_vmx_save_vmcs(core);
+
+    shadow_cr0 = (struct cr0_32 *)&(core->ctrl_regs.cr0);
+
+
+    /* Get the CPU mode to set the guest_ia32e entry ctrl */
+
+    if (core->shdw_pg_mode == SHADOW_PAGING) {
+       if (v3_get_vm_mem_mode(core) == VIRTUAL_MEM) {
+           if (v3_activate_shadow_pt(core) == -1) {
+               PrintError("Failed to activate shadow page tables\n");
+               return -1;
+           }
+       } else {
+           if (v3_activate_passthrough_pt(core) == -1) {
+               PrintError("Failed to activate passthrough page tables\n");
+               return -1;
+           }
+       }
+    }
+
+    return 0;
+}
+#endif
+
+
+void v3_flush_vmx_vm_core(struct guest_info * core) {
+    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
+    vmcs_clear(vmx_info->vmcs_ptr_phys);
+    vmx_info->state = VMX_UNLAUNCHED;
+}
+
+
+
 static int update_irq_exit_state(struct guest_info * info) {
     struct vmx_exit_idt_vec_info idt_vec_info;
 
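
Two notes on the hunks here. The checkpoint save/load code above is flagged as broken by its author ("JRL: This is broken"), likely because the in-memory VMCS layout is implementation-specific: copying the raw VMCS page is not architecturally guaranteed to capture or restore valid state, since fields are only portably accessed through VMREAD/VMWRITE. The next hunk adds v3_vmx_schedule_timeout(), which programs the VMX preemption timer; that timer counts down once every 2^X TSC cycles, where X is reported in bits 4:0 of the IA32_VMX_MISC MSR (presumably what hw_info.misc_info.tsc_multiple caches), hence the shift:

  timer_ticks = tsc_cycles >> X;   /* X = IA32_VMX_MISC[4:0] */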
@@ -729,6 +757,44 @@ static void print_exit_log(struct guest_info * info) {
 
 }
 
+int
+v3_vmx_schedule_timeout(struct guest_info * info)
+{
+    struct vmx_data * vmx_state = (struct vmx_data *)(info->vmm_data);
+    sint64_t cycles;
+    uint32_t timeout;
+
+    /* Check if the hardware supports an active timeout */
+#define VMX_ACTIVE_PREEMPT_TIMER_PIN 0x40
+    if (hw_info.pin_ctrls.req_mask & VMX_ACTIVE_PREEMPT_TIMER_PIN) {
+       /* The hardware doesn't support us modifying this pin control */
+       return 0;
+    }
+
+    /* Check if we have one to schedule and schedule it if we do */
+    cycles = (sint64_t)info->time_state.next_timeout - (sint64_t)v3_get_guest_time(&info->time_state);
+    if (info->time_state.next_timeout == (ullong_t) -1)  {
+       timeout = 0;
+        vmx_state->pin_ctrls.active_preempt_timer = 0;
+    } else if (cycles < 0) {
+       /* set the timeout to 0 to force an immediate re-exit since it expired between
+        * when we checked a timeout and now. IF SOMEONE CONTINUALLY SETS A SHORT TIMEOUT,
+        * THIS CAN LOCK US OUT OF THE GUEST! */
+       timeout = 0;
+        vmx_state->pin_ctrls.active_preempt_timer = 1;
+    } else {
+        /* The hardware supports scheduling a timeout, and we have one to 
+         * schedule */
+        timeout = (uint32_t)cycles >> hw_info.misc_info.tsc_multiple;
+        vmx_state->pin_ctrls.active_preempt_timer = 1;
+    }
+
+    /* Actually program the timer based on the settings above. */
+    check_vmcs_write(VMCS_PREEMPT_TIMER, timeout);
+    check_vmcs_write(VMCS_PIN_CTRLS, vmx_state->pin_ctrls.value);
+    return 0;
+}
+
 /* 
  * CAUTION and DANGER!!! 
  * 
@@ -749,11 +815,16 @@ int v3_vmx_enter(struct guest_info * info) {
     // Perform any additional yielding needed for time adjustment
     v3_adjust_time(info);
 
+    // Check for timeout - since this calls generic hooks in devices
+    // that may do things like pause the VM, it cannot be with interrupts
+    // disabled.
+    v3_check_timeout(info);
+
     // disable global interrupts for vm state transition
     v3_disable_ints();
 
     // Update timer devices late after being in the VM so that as much 
-    // of hte time in the VM is accounted for as possible. Also do it before
+    // of the time in the VM is accounted for as possible. Also do it before
     // updating IRQ entry state so that any interrupts the timers raise get 
     // handled on the next VM entry. Must be done with interrupts disabled.
     v3_update_timers(info);
@@ -781,6 +852,10 @@ int v3_vmx_enter(struct guest_info * info) {
        vmcs_write(VMCS_GUEST_CR3, guest_cr3);
     }
 
+    // Update vmx active preemption timer to exit at the next timeout if 
+    // the hardware supports it.
+    v3_vmx_schedule_timeout(info);
+
     // Perform last-minute time bookkeeping prior to entering the VM
     v3_time_enter_vm(info);
 
@@ -789,7 +864,6 @@ int v3_vmx_enter(struct guest_info * info) {
     check_vmcs_write(VMCS_TSC_OFFSET_HIGH, tsc_offset_high);
     check_vmcs_write(VMCS_TSC_OFFSET, tsc_offset_low);
 
-
     if (v3_update_vmcs_host_state(info)) {
        v3_enable_ints();
         PrintError("Could not write host state\n");
@@ -799,27 +873,28 @@ int v3_vmx_enter(struct guest_info * info) {
 
     if (vmx_info->state == VMX_UNLAUNCHED) {
        vmx_info->state = VMX_LAUNCHED;
-
-       info->vm_info->run_state = VM_RUNNING;
        ret = v3_vmx_launch(&(info->vm_regs), info, &(info->ctrl_regs));
     } else {
        V3_ASSERT(vmx_info->state != VMX_UNLAUNCHED);
        ret = v3_vmx_resume(&(info->vm_regs), info, &(info->ctrl_regs));
     }
     
+
+
     //  PrintDebug("VMX Exit: ret=%d\n", ret);
 
     if (ret != VMX_SUCCESS) {
        uint32_t error = 0;
-
         vmcs_read(VMCS_INSTR_ERR, &error);
 
        v3_enable_ints();
 
-        PrintError("VMENTRY Error: %d\n", error);
+       PrintError("VMENTRY Error: %d (launch_ret = %d)\n", error, ret);
        return -1;
     }
 
+
+
     // Immediate exit from VM time bookkeeping
     v3_time_exit_vm(info);
 
@@ -890,17 +965,25 @@ int v3_start_vmx_guest(struct guest_info * info) {
 
     if (info->vcpu_id == 0) {
        info->core_run_state = CORE_RUNNING;
-       info->vm_info->run_state = VM_RUNNING;
     } else {
 
         PrintDebug("VMX core %u: Waiting for core initialization\n", info->vcpu_id);
 
         while (info->core_run_state == CORE_STOPPED) {
+
+           if (info->vm_info->run_state == VM_STOPPED) {
+               // The VM was stopped before this core was initialized. 
+               return 0;
+           }
+
             v3_yield(info);
             //PrintDebug("VMX core %u: still waiting for INIT\n",info->vcpu_id);
         }
        
        PrintDebug("VMX core %u initialized\n", info->vcpu_id);
+
+       // We'll be paranoid about race conditions here
+       v3_wait_at_barrier(info);
     }
 
 
@@ -954,6 +1037,7 @@ int v3_start_vmx_guest(struct guest_info * info) {
            return -1;
        }
 
+       v3_wait_at_barrier(info);
 
 
        if (info->vm_info->run_state == VM_STOPPED) {
@@ -1027,8 +1111,9 @@ int v3_reset_vmx_vm_core(struct guest_info * core, addr_t rip) {
 
 void v3_init_vmx_cpu(int cpu_id) {
     addr_t vmx_on_region = 0;
+    extern v3_cpu_arch_t v3_mach_type;
 
-    if (cpu_id == 0) {
+    if (v3_mach_type == V3_INVALID_CPU) {
        if (v3_init_vmx_hw(&hw_info) == -1) {
            PrintError("Could not initialize VMX hardware features on cpu %d\n", cpu_id);
            return;
@@ -1067,6 +1152,7 @@ void v3_init_vmx_cpu(int cpu_id) {
            v3_cpu_types[cpu_id] = V3_VMX_EPT_UG_CPU;
        }
     }
+    
 }
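
The headline change (per the commit message) shows in the final hunks: one-time VMX hardware probing is now keyed off a global machine-type variable rather than the assumption that cpu_id 0 initializes first. A minimal sketch of the pattern, assuming v3_mach_type is defined once in the core VMM and starts out as V3_INVALID_CPU (names beyond those visible in the diff are assumptions):

  /* global, defined once in the core VMM */
  v3_cpu_arch_t v3_mach_type = V3_INVALID_CPU;

  void v3_init_vmx_cpu(int cpu_id) {
      extern v3_cpu_arch_t v3_mach_type;

      /* The first core to initialize probes the hardware, whichever core
       * that is; the old test (cpu_id == 0) silently assumed core 0 was
       * always handed to Palacios first. */
      if (v3_mach_type == V3_INVALID_CPU) {
          if (v3_init_vmx_hw(&hw_info) == -1) {
              PrintError("Could not initialize VMX hardware features on cpu %d\n", cpu_id);
              return;
          }
      }

      /* ... per-CPU VMXON region setup continues as in the diff ... */
  }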