Palacios Public Git Repository

To check out Palacios, execute:

  git clone http://v3vee.org/palacios/palacios.web/palacios.git

This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, execute:

  cd palacios
  git checkout --track -b devel origin/devel

The other branches are checked out the same way; substitute the branch name for devel.


Floating point context-switching and checkpoint/load
Peter Dinda [Fri, 25 Oct 2013 23:30:39 +0000 (18:30 -0500)]
This integrates:

- the option to do floating point context-switching
- conservative code to do context-switching in Palacios
- a lazy floating point save/restore host interface
- an implementation of this interface in the linux module
- liberal code to use this interface in Palacios
- floating point checkpointing

This also includes a performance tuning element that
is hard to separate.
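
Below is a minimal sketch (not part of the commit) of the host-side contract this change introduces: a host implements the two lazy-FPU hooks and registers them with V3_Init_Lazy_FPU(). The hook bodies here are host-specific placeholders; only the interface and registration function come from this commit.

  #include <interfaces/vmm_lazy_fpu.h>

  /* Placeholder hook bodies; a real host marks the current thread as
     owning the FPU (used_fpu) and lazily restores its state (need_fpu). */
  static void host_used_fpu(void) { /* e.g., set TS_USEDFPU and clts() */ }
  static void host_need_fpu(void) { /* e.g., restore if CR0.TS is set */ }

  static struct v3_lazy_fpu_iface fpu_hooks = {
      .used_fpu = host_used_fpu,
      .need_fpu = host_need_fpu,
  };

  void host_register_lazy_fpu(void) {
      V3_Init_Lazy_FPU(&fpu_hooks); /* Palacios calls the hooks on exit/entry */
  }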

19 files changed:
Kconfig
linux_module/palacios-stubs.c
linux_module/palacios.h
palacios/include/interfaces/vmm_lazy_fpu.h [new file with mode: 0644]
palacios/include/palacios/vm_guest.h
palacios/include/palacios/vmm.h
palacios/include/palacios/vmm_fp.h [new file with mode: 0644]
palacios/include/palacios/vmm_perftune.h
palacios/include/palacios/vmm_types.h
palacios/src/interfaces/Kconfig
palacios/src/interfaces/Makefile
palacios/src/interfaces/vmm_lazy_fpu.c [new file with mode: 0644]
palacios/src/palacios/Makefile
palacios/src/palacios/svm.c
palacios/src/palacios/vmm.c
palacios/src/palacios/vmm_barrier.c
palacios/src/palacios/vmm_checkpoint.c
palacios/src/palacios/vmm_fp.c [new file with mode: 0644]
palacios/src/palacios/vmx.c

diff --git a/Kconfig b/Kconfig
index 9c9dfe2..028645f 100644 (file)
--- a/Kconfig
+++ b/Kconfig
@@ -75,6 +75,7 @@ config VMX
          Compile with support for Intel VMX
 
 
+
 config FRAME_POINTER
        bool "Compile with Frame pointers"
        default n
@@ -144,6 +145,49 @@ config MAX_CPUS
 endmenu
 
 source "palacios/src/interfaces/Kconfig"
+
+menu "Virtual core specialization"
+
+config CUSTOM_CPUID
+       bool "Use custom CPU information (vendor, etc)"
+       default y
+       help
+         If set, the CPU information will be for a special V3VEE vendor.
+         This should result in identical guest kernel setup, regardless
+         of the underlying hardware, but it also means that the guest kernel
+         has no chance of employing CPU-specific bug fixes.
+
+config STRICT_MSR_SEMANTICS
+       bool "Use strict RDMSR/WRMSR semantics"
+       default y
+       help
+         Use strict MSR semantics - when an unhandled MSR is read or written,
+         a GPF is generated.  This is typically used with CUSTOM_CPUID on.
+
+config FP_SWITCH
+       bool "Floating point context switching"
+       default y
+       help
+         If set, floating point state is handled across context switches
+         (VM1->VM2->VM1 and/or VM->HOST->VM).  This can be disabled
+         for environments where a single VM is the only user of FP.
+         Note that even if disabled, FP save/restore code is included
+         to support checkpoint/restore.
+
+config LAZY_FP_SWITCH
+       bool "Use host-based lazy floating point context switching"
+       depends on FP_SWITCH && HOST_LAZY_FPU_SWITCH
+       default y
+       help
+         When true, the host's lazy floating point save/restore
+         mechanism is notified on each exit and entry.  If false,
+         the floating point state is explicitly saved on each exit
+         and restored on each entry---this save/restore is entirely
+         done in Palacios.
+
+endmenu
+
 source "palacios/src/extensions/Kconfig"
 
 config TELEMETRY
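
For orientation, a hedged illustration (not in the diff) of how the settings above are expected to surface in code once Kconfig turns them into V3_CONFIG_* macros; the guard names match those used by vmm_fp.h later in this commit.

  #ifndef V3_CONFIG_FP_SWITCH
    /* no FP context switching: state is only touched for checkpoint/restore */
  #elif defined(V3_CONFIG_LAZY_FP_SWITCH)
    /* defer save/restore to the host's lazy mechanism via the FPU hooks */
  #else
    /* conservative: explicit FXSAVE/FXRSTOR on every exit and entry */
  #endif
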
index 08021e4..decae1f 100644 (file)
 #include <linux/smp.h>
 #include <linux/vmalloc.h>
 
+#include <asm/i387.h>
+
 #include <palacios/vmm.h>
 #include <palacios/vmm_host_events.h>
+
+#ifdef V3_CONFIG_HOST_LAZY_FPU_SWITCH
+#include <interfaces/vmm_lazy_fpu.h>
+#endif
+
 #include "palacios.h"
 
 #include "mm.h"
@@ -25,6 +32,8 @@
 #include "memcheck.h"
 #include "lockcheck.h"
 
+
+
 // The following can be used to track heap bugs
 // zero memory after allocation
 #define ALLOC_ZERO_MEM 0
@@ -169,14 +178,14 @@ void *palacios_allocate_pages(int num_pages, unsigned int alignment, int node_id
     void * pg_addr = NULL;
 
     if (num_pages<=0) { 
-      ERROR("ALERT ALERT Attempt to allocate zero or fewer pages\n");
+       ERROR("ALERT ALERT Attempt to allocate zero or fewer pages (%d pages, alignment %d, node %d, constraints 0x%x)\n",num_pages, alignment, node_id, constraints);
       return NULL;
     }
 
     pg_addr = (void *)alloc_palacios_pgs(num_pages, alignment, node_id, constraints);
 
     if (!pg_addr) { 
-       ERROR("ALERT ALERT  Page allocation has FAILED Warning\n");
+       ERROR("ALERT ALERT  Page allocation has FAILED Warning (%d pages, alignment %d, node %d, constraints 0x%x)\n",num_pages, alignment, node_id, constraints);
        return NULL;
     }
 
@@ -195,6 +204,10 @@ void *palacios_allocate_pages(int num_pages, unsigned int alignment, int node_id
  */
 
 void palacios_free_pages(void * page_paddr, int num_pages) {
+    if (!page_paddr) { 
+       ERROR("Ignoring free pages: 0x%p (0x%lx)for %d pages\n", page_paddr, (uintptr_t)page_paddr, num_pages);
+       dump_stack();
+    }
     pg_frees += num_pages;
     free_palacios_pgs((uintptr_t)page_paddr, num_pages);
     MEMCHECK_FREE_PAGES(page_paddr,num_pages*4096);
@@ -294,6 +307,10 @@ palacios_free(
        void *                  addr
 )
 {
+    if (!addr) {
+       ERROR("Ignoring free : 0x%p\n", addr);
+       dump_stack();
+    }
     frees++;
     kfree(addr-ALLOC_PAD);
     MEMCHECK_KFREE(addr-ALLOC_PAD);
@@ -359,17 +376,25 @@ static int lnx_thread_target(void * arg) {
       allow_signal(SIGKILL);
     */
 
+#ifdef V3_CONFIG_HOST_LAZY_FPU_SWITCH
+    // We are a kernel thread that needs FPU save/restore state
+    // vcores definitely need this, all the other threads get it too, 
+    // but they just won't use it
+    fpu_alloc(&(current->thread.fpu));
+#endif
 
     ret = thread_info->fn(thread_info->arg);
 
-
     INFO("Palacios Thread (%s) EXITING\n", thread_info->name);
 
     palacios_free(thread_info);
     // handle cleanup 
 
+    // We rely on do_exit to free the fpu data
+    // since we could get switched at any point until the thread is done... 
+
     do_exit(ret);
-    
+
     return 0; // should not get here.
 }
 
@@ -764,6 +789,33 @@ palacios_mutex_unlock_irqrestore(void *mutex, void *flags)
     LOCKCHECK_UNLOCK_IRQRESTORE_POST(mutex,(unsigned long)flags);
 }
 
+void palacios_used_fpu(void)
+{
+   struct thread_info *cur = current_thread_info();
+
+   // We assume we are not preemptible here...
+   cur->status |= TS_USEDFPU;
+   clts(); 
+   // After this, FP Save should be handled by Linux if it
+   // switches to a different task and that task uses FPU
+}
+
+static inline int ists(void)
+{
+   return read_cr0() & X86_CR0_TS;
+}
+
+void palacios_need_fpu(void)
+{
+    // We assume we are not preemptible here... 
+    if (ists()) { 
+      // we have been switched back to from somewhere else...
+      // Do a restore now - this will also do a clts()
+      math_state_restore();
+    }
+}
+
+
 /**
  * Structure used by the Palacios hypervisor to interface with the host kernel.
  */
@@ -796,6 +848,15 @@ static struct v3_os_hooks palacios_os_hooks = {
 };
 
 
+#ifdef V3_CONFIG_HOST_LAZY_FPU_SWITCH
+// Note that this host interface is defined here since it's
+// intertwined with thread creation... 
+static struct v3_lazy_fpu_iface palacios_fpu_hooks = {
+        .used_fpu               = palacios_used_fpu,
+        .need_fpu               = palacios_need_fpu
+};
+
+#endif
 
 
 int palacios_vmm_init( char *options )
@@ -842,6 +903,10 @@ int palacios_vmm_init( char *options )
 
     Init_V3(&palacios_os_hooks, cpu_mask, num_cpus, options);
 
+#ifdef V3_CONFIG_HOST_LAZY_FPU_SWITCH
+    V3_Init_Lazy_FPU(&palacios_fpu_hooks);
+#endif
+
     return 0;
 
 }
index c9cbb96..b4c17e6 100644 (file)
@@ -166,6 +166,8 @@ void  palacios_yield_cpu(void);
 void  palacios_sleep_cpu(unsigned int us);
 unsigned int palacios_get_cpu(void);
 unsigned int palacios_get_cpu_khz(void);
+void  palacios_used_fpu(void);
+void  palacios_need_fpu(void);
 void *palacios_mutex_alloc(void);         // allocates and inits a lock
 void  palacios_mutex_init(void *mutex);   // only inits a lock
 void  palacios_mutex_deinit(void *mutex); // only deinits a lock
diff --git a/palacios/include/interfaces/vmm_lazy_fpu.h b/palacios/include/interfaces/vmm_lazy_fpu.h
new file mode 100644 (file)
index 0000000..a50dc14
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * This file is part of the Palacios Virtual Machine Monitor developed
+ * by the V3VEE Project with funding from the United States National
+ * Science Foundation and the Department of Energy.
+ *
+ * The V3VEE Project is a joint project between Northwestern University
+ * and the University of New Mexico.  You can find out more at
+ * http://www.v3vee.org
+ *
+ * Copyright (c) 2013, The V3VEE Project <http://www.v3vee.org>
+ * All rights reserved.
+ *
+ * Author: Peter Dinda <pdinda@northwestern.edu>
+ *
+ * This is free software.  You are permitted to use,
+ * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
+ */
+
+#ifndef __VMM_LAZY_FPU
+#define __VMM_LAZY_FPU
+
+#include <palacios/vmm_types.h>
+
+
+struct v3_lazy_fpu_iface {
+
+    // if these two are provided then lazy FP save/restore handled by host
+    // indicate that the calling thread has used floating point
+    void (*used_fpu)(void);
+    // indicate that the calling thread wants to use floating point again
+    void (*need_fpu)(void);
+
+};
+
+
+/*
+ *  function prototypes
+ */
+
+extern void V3_Init_Lazy_FPU(struct v3_lazy_fpu_iface * palacios_lazy_fpu);
+
+#ifdef __V3VEE__
+
+#define V3_LAZY_FPU_USED()                                                  \
+  do {                                                                     \
+    extern struct v3_lazy_fpu_iface * palacios_lazy_fpu_hooks;              \
+    if ((palacios_lazy_fpu_hooks) && (palacios_lazy_fpu_hooks)->used_fpu)         { \
+      (palacios_lazy_fpu_hooks)->used_fpu();                                \
+    }                                                                       \
+  } while (0)
+
+#define V3_LAZY_FPU_NEED()                                                 \
+  do {                                                                     \
+    extern struct v3_lazy_fpu_iface * palacios_lazy_fpu_hooks;             \
+    if ((palacios_lazy_fpu_hooks) && (palacios_lazy_fpu_hooks)->need_fpu)         { \
+       (palacios_lazy_fpu_hooks)->need_fpu();                              \
+    }                                                                      \
+  } while (0)
+
+#endif
+
+#endif
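
A hedged sketch (not part of this file) of how VMM-side code is expected to drive these macros around guest execution; enter_guest() is a hypothetical stand-in for the vendor-specific entry paths (v3_svm_enter()/v3_vmx_enter() in this commit).

  #ifdef __V3VEE__
  static void run_guest_once(struct guest_info *core) {
      V3_LAZY_FPU_NEED();   /* about to touch FP state on the guest's behalf */
      enter_guest(core);    /* hypothetical VMRUN/VMLAUNCH wrapper */
      V3_LAZY_FPU_USED();   /* tell the host this thread's FPU state is live */
  }
  #endif
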
index 5272b0d..4a9d075 100644 (file)
@@ -27,6 +27,7 @@
 #include <palacios/vmm_mem_hook.h>
 #include <palacios/vmm_io.h>
 #include <palacios/vmm_shadow_paging.h>
+#include <palacios/vmm_direct_paging.h>
 #include <palacios/vmm_intr.h>
 #include <palacios/vmm_excp.h>
 #include <palacios/vmm_dev_mgr.h>
@@ -43,7 +44,7 @@
 #include <palacios/vmm_events.h>
 #include <palacios/vmm_scheduler.h>
 #include <palacios/vmm_fw_cfg.h>
-
+#include <palacios/vmm_fp.h>
 #include <palacios/vmm_perftune.h>
 
 #ifdef V3_CONFIG_TELEMETRY
@@ -86,6 +87,7 @@ struct guest_info {
 
     v3_paging_mode_t shdw_pg_mode;
     struct v3_shdw_pg_state shdw_pg_state;
+    //struct v3_nested_pg_state nested_pg_state;
     addr_t direct_map_pt;
     
 
@@ -116,6 +118,7 @@ struct guest_info {
     struct v3_segments segments;
     struct v3_msrs     msrs;
 
+    struct v3_fp_state fp_state;
 
     void * vmm_data;
 
@@ -177,6 +180,7 @@ struct v3_vm_info {
     struct v3_mem_hooks mem_hooks;
 
     struct v3_shdw_impl_state shdw_impl;
+    //struct v3_nested_impl_state nested_impl;
     void * sched_priv_data;
 
     struct v3_io_map io_map;
index 983cd78..2b02058 100644 (file)
@@ -369,8 +369,6 @@ struct v3_os_hooks {
 
     unsigned int (*get_cpu)(void);
 
-
-
     void * (*start_kernel_thread)(int (*fn)(void * arg), void * arg, char * thread_name); 
     void (*interrupt_cpu)(struct v3_vm_info * vm, int logical_cpu, int vector);
     void (*call_on_cpu)(int logical_cpu, void (*fn)(void * arg), void * arg);
diff --git a/palacios/include/palacios/vmm_fp.h b/palacios/include/palacios/vmm_fp.h
new file mode 100644 (file)
index 0000000..76b377d
--- /dev/null
@@ -0,0 +1,224 @@
+/*
+ * This file is part of the Palacios Virtual Machine Monitor developed
+ * by the V3VEE Project with funding from the United States National 
+ * Science Foundation and the Department of Energy.  
+ *
+ * The V3VEE Project is a joint project between Northwestern University
+ * and the University of New Mexico.  You can find out more at 
+ * http://www.v3vee.org
+ *
+ * Copyright (c) 2013, Peter Dinda <pdinda@northwestern.edu> 
+ * Copyright (c) 2013, The V3VEE Project <http://www.v3vee.org> 
+ * All rights reserved.
+ *
+ * Author: Peter Dinda <pdinda@northwestern.edu>
+ *
+ * This is free software.  You are permitted to use,
+ * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
+ */
+
+#ifndef __VMM_FP_H
+#define __VMM_FP_H
+
+#include <palacios/vmm_types.h>
+#include <palacios/vmm.h>
+#ifdef V3_CONFIG_LAZY_FP_SWITCH
+#include <interfaces/vmm_lazy_fpu.h>
+#endif
+
+// the FPRs are arranged into the 
+// precise layout of the FXSAVE/FXRSTOR instructions
+// bytes 32+, which is common for all three variants
+// 8*6 reserved + 8*10 (fpu/mmx) + 16*16 (xmm) 
+// + 3*16 (res) + 3*16 (ava) = 480 bytes
+// another 32 bytes are used for the store header
+// which varies depending on machine mode
+struct v3_fp_regs {
+  v3_fp_mmx_reg_t   stmm0;  // stmm0..7 are the x87 stack or mmx regs
+  uint8_t           res0[6]; 
+  v3_fp_mmx_reg_t   stmm1;  
+  uint8_t           res1[6]; 
+  v3_fp_mmx_reg_t   stmm2;  
+  uint8_t           res2[6]; 
+  v3_fp_mmx_reg_t   stmm3;  
+  uint8_t           res3[6]; 
+  v3_fp_mmx_reg_t   stmm4;  
+  uint8_t           res4[6]; 
+  v3_fp_mmx_reg_t   stmm5;
+  uint8_t           res5[6]; 
+  v3_fp_mmx_reg_t   stmm6;  
+  uint8_t           res6[6]; 
+  v3_fp_mmx_reg_t   stmm7;  
+  uint8_t           res7[6]; 
+  v3_xmm_reg_t      xmm0;   // xmm0..7 are the "classic" SSE regs
+  v3_xmm_reg_t      xmm1;
+  v3_xmm_reg_t      xmm2;
+  v3_xmm_reg_t      xmm3;
+  v3_xmm_reg_t      xmm4;
+  v3_xmm_reg_t      xmm5;
+  v3_xmm_reg_t      xmm6;
+  v3_xmm_reg_t      xmm7;
+  v3_xmm_reg_t      xmm8;    // xmm8..15 are the "new" SSE regs
+  v3_xmm_reg_t      xmm9;
+  v3_xmm_reg_t      xmm10;
+  v3_xmm_reg_t      xmm11;
+  v3_xmm_reg_t      xmm12;
+  v3_xmm_reg_t      xmm13;
+  v3_xmm_reg_t      xmm14;
+  v3_xmm_reg_t      xmm15;
+  v3_xmm_reg_t      res16;  // reserved
+  v3_xmm_reg_t      res17;
+  v3_xmm_reg_t      res18;
+  v3_xmm_reg_t      ava19;
+  v3_xmm_reg_t      ava20;
+  v3_xmm_reg_t      ava21;
+} __attribute__((packed)) __attribute__((aligned(16)));
+
+// FXSAVE, 32 bit mode header (32 bytes)
+// V3_FP_MODE_32
+struct v3_fp_32_state {
+  uint16_t          fcw;
+  uint16_t          fsw;
+  uint8_t           ftw;
+  uint8_t           res0;
+  uint16_t          fop;
+  uint32_t          fip; //fpu instruction pointer
+  uint16_t          fcs; //fpu code segment selector
+  uint16_t          res1;
+  uint32_t          fdp; //fpu data pointer
+  uint16_t          fds; //fpu data segment selector
+  uint16_t          res2;
+  uint32_t          mxcsr;
+  uint32_t          mxcsr_mask;
+} __attribute__((packed)) __attribute__((aligned(16)));
+
+// FXSAVE, 64 bit mode header, REX.W=1 (32 bytes)
+// V3_FP_MODE_64
+struct v3_fp_64_state {
+  uint16_t          fcw;
+  uint16_t          fsw;
+  uint8_t           ftw;
+  uint8_t           res0;
+  uint16_t          fop;
+  uint64_t          fip; //fpu instruction pointer
+  uint64_t          fdp; //fpu data pointer
+  uint32_t          mxcsr;
+  uint32_t          mxcsr_mask;
+} __attribute__((packed)) __attribute__((aligned(16)));
+
+
+// FXSAVE, 64 bit mode header, REX.W=0 (32 bytes)
+// V3_FP_MODE_64_COMPAT
+struct v3_fp_64compat_state {
+  uint16_t          fcw;
+  uint16_t          fsw;
+  uint8_t           ftw;
+  uint8_t           res0;
+  uint16_t          fop;
+  uint32_t          fip; //fpu instruction pointer
+  uint16_t          fcs; //fpu code segment selector
+  uint16_t          res1;
+  uint32_t          fdp; //fpu data pointer
+  uint16_t          fds; //fpu data segment selector
+  uint16_t          res2;
+  uint32_t          mxcsr;
+  uint32_t          mxcsr_mask;
+} __attribute__((packed)) __attribute__((aligned(16)));
+
+
+//
+// This is an FXSAVE block
+//    
+struct v3_fp_state_core {
+  union {
+    struct v3_fp_32_state fp32;
+    struct v3_fp_64_state fp64;
+    struct v3_fp_64compat_state fp64compat;
+  } header;
+  struct v3_fp_regs fprs;
+} __attribute__((packed)) __attribute__((aligned(16)));
+  
+struct v3_fp_state {
+  // Do we need to restore on next entry?
+  int need_restore;
+  // The mode in which the state was saved (selects which header layout applies)
+  enum {V3_FP_MODE_32=0, V3_FP_MODE_64, V3_FP_MODE_64_COMPAT} state_type;
+  struct v3_fp_state_core  state __attribute__((aligned(16)));
+} ;
+
+
+struct guest_info;
+
+// Can we save FP state on this core?
+int v3_can_handle_fp_state(void);
+
+// Save state from this core to the structure
+int v3_get_fp_state(struct guest_info *core);
+
+// Restore FP state from this structure to this core
+int v3_put_fp_state(struct guest_info *core);
+
+int v3_init_fp(void);
+int v3_deinit_fp(void);
+
+#ifndef V3_CONFIG_FP_SWITCH
+
+#define V3_FP_EXIT_SAVE(core) 
+#define V3_FP_ENTRY_RESTORE(core)
+
+#else
+
+#ifdef V3_CONFIG_LAZY_FP_SWITCH
+
+
+/* Ideally these would use the TS trick to do lazy calls to used_fpu() */
+#define V3_FP_EXIT_SAVE(core)                                               \
+  do {                                                                      \
+    extern struct v3_lazy_fpu_iface * palacios_lazy_fpu_hooks;              \
+    if ((palacios_lazy_fpu_hooks) && (palacios_lazy_fpu_hooks)->used_fpu) { \
+      (palacios_lazy_fpu_hooks)->used_fpu();                                \
+    } else {                                                                \
+      v3_get_fp_state(core);                                                \
+    }                                                                       \
+  } while (0)
+
+#define V3_FP_ENTRY_RESTORE(core)                                           \
+  do {                                                                      \
+    extern struct v3_lazy_fpu_iface * palacios_lazy_fpu_hooks;              \
+    if ((core)->fp_state.need_restore) {                                    \
+      v3_put_fp_state(core);                                                \
+      (core)->fp_state.need_restore=0;                                      \
+    } else {                                                                \
+      if ((palacios_lazy_fpu_hooks) && (palacios_lazy_fpu_hooks)->need_fpu) { \
+        (palacios_lazy_fpu_hooks)->need_fpu();                              \
+      } else {                                                              \
+        v3_put_fp_state(core);                                              \
+      }                                                                     \
+    }                                                                       \
+  } while (0)
+
+#else
+
+// conservative FPU switching
+
+#define V3_FP_EXIT_SAVE(core) v3_get_fp_state(core)
+#define V3_FP_ENTRY_RESTORE(core) v3_put_fp_state(core)
+
+#endif
+
+#endif
+
+#ifdef V3_CONFIG_CHECKPOINT
+
+struct v3_chkpt_ctx;
+
+// save state from structure to checkpoint/migration context
+int v3_save_fp_state(struct v3_chkpt_ctx *ctx, struct guest_info *core);
+
+// load state from checkpoint/migration context to structure
+int v3_load_fp_state(struct v3_chkpt_ctx *ctx, struct guest_info *core);
+
+
+#endif
+
+#endif
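
As a sanity check on the layout comments above, a hedged compile-time assertion sketch (not part of the commit): a standard FXSAVE image is 512 bytes, i.e. the 32-byte header plus the 480 bytes of register state described in the comments.

  #include <palacios/vmm_fp.h>

  /* Negative array size forces a compile error if the layout drifts. */
  typedef char fp_regs_size_check[(sizeof(struct v3_fp_regs) == 480) ? 1 : -1];
  typedef char fp_core_size_check[(sizeof(struct v3_fp_state_core) == 512) ? 1 : -1];
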
index 4346374..7876efa 100644 (file)
@@ -24,6 +24,7 @@
 
 #include <palacios/vmm_types.h>
 
+#include <palacios/vmm_time.h>
 
 struct v3_yield_strategy {
     enum {
@@ -58,6 +59,24 @@ void     v3_strategy_driven_yield(struct guest_info *core, uint64_t time_since_l
 
 uint64_t v3_cycle_diff_in_usec(struct guest_info *core, uint64_t earlier_cycles, uint64_t later_cycles);
 
+// The following three macros are intended to make it easy to use
+// strategy-driven yield.  Call the first one when you run out of work,
+// call the second each time you want to yield because you are still
+// out of work, and call the third when you have work to do again.
+//
+// This assumes the thread is locked to a core and may behave strangely
+// if this is not the case.
+
+#define  V3_NO_WORK(core) {                               \
+  uint64_t _v3_strat_local_first=0, _v3_strat_local_cur=0; \
+  _v3_strat_local_first=v3_get_host_time(core ? &(core->time_state) : 0); 
+  
+  
+#define  V3_STILL_NO_WORK(core)            \
+  _v3_strat_local_cur=v3_get_host_time(core ? &(core->time_state) : 0);              \
+  v3_strategy_driven_yield(core,v3_cycle_diff_in_usec(core,_v3_strat_local_first,_v3_strat_local_cur)); 
+
+#define  V3_HAVE_WORK_AGAIN(core) }
 
 #endif
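
A hedged usage sketch for the three macros above (not from this commit): a wait loop that charges its idle time to the strategy-driven yield logic. wait_condition() is a hypothetical predicate standing in for, e.g., the core_run_state checks in svm.c and vmx.c.

  static void wait_for_work(struct guest_info *core) {
      V3_NO_WORK(core);              /* opens a block and timestamps idle start */
      while (!wait_condition(core)) {
          V3_STILL_NO_WORK(core);    /* yield, reporting elapsed idle time */
      }
      V3_HAVE_WORK_AGAIN(core);      /* closes the block opened by V3_NO_WORK */
  }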
 
index 82e83c6..01e22b7 100644 (file)
@@ -76,6 +76,10 @@ typedef char sint8_t;
 
 typedef ulong_t addr_t;
 typedef ullong_t v3_reg_t;
+
+typedef uint8_t v3_xmm_reg_t[16];
+typedef uint8_t v3_fp_mmx_reg_t[10];
+
 #endif /* ! __V3VEE__ */
 
 #endif
index a72ce87..581852a 100644 (file)
@@ -88,4 +88,11 @@ config HOST_PWRSTAT
        help
                Select this if you would like to access energy/power
                measurements within Palacios
+
+config HOST_LAZY_FPU_SWITCH
+       bool "Host provides lazy FPU context switching"
+       default n
+       help
+               Select this if your host provides lazy context switch support
+               for floating point state and you would like Palacios to use it
 endmenu
index ae10d74..262b6cc 100644 (file)
@@ -10,6 +10,7 @@ obj-$(V3_CONFIG_HOST_HYPERCALL) += vmm_host_hypercall.o
 obj-$(V3_CONFIG_HOST_PCI) += host_pci.o
 obj-$(V3_CONFIG_HOST_PMU) += vmm_pmu.o
 obj-$(V3_CONFIG_HOST_PWRSTAT) += vmm_pwrstat.o
+obj-$(V3_CONFIG_HOST_LAZY_FPU_SWITCH) += vmm_lazy_fpu.o
 
 obj-y += null.o
 obj-y += vmm_numa.o
diff --git a/palacios/src/interfaces/vmm_lazy_fpu.c b/palacios/src/interfaces/vmm_lazy_fpu.c
new file mode 100644 (file)
index 0000000..7562ba0
--- /dev/null
@@ -0,0 +1,36 @@
+/* 
+ * This file is part of the Palacios Virtual Machine Monitor developed
+ * by the V3VEE Project with funding from the United States National 
+ * Science Foundation and the Department of Energy.  
+ *
+ * The V3VEE Project is a joint project between Northwestern University
+ * and the University of New Mexico.  You can find out more at 
+ * http://www.v3vee.org
+ *
+ * Copyright (c) 2013, The V3VEE Project <http://www.v3vee.org> 
+ * All rights reserved.
+ *
+ * Author: Peter Dinda <pdinda@northwestern.edu>
+ * 
+ * This is free software.  You are permitted to use,
+ * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
+ */
+
+#include <palacios/vmm.h>
+#include <palacios/vmm_debug.h>
+#include <palacios/vmm_types.h>
+#include <palacios/vm_guest.h>
+#include <palacios/vmm_lowlevel.h>
+
+#include <interfaces/vmm_lazy_fpu.h>
+
+struct v3_lazy_fpu_iface * palacios_lazy_fpu_hooks = 0;
+
+
+
+void V3_Init_Lazy_FPU (struct v3_lazy_fpu_iface * lazy_fpu_iface) 
+{
+    palacios_lazy_fpu_hooks = lazy_fpu_iface;
+}
+
+
index 18cdea1..2ad3889 100644 (file)
@@ -20,6 +20,7 @@ obj-y := \
        vmm_io.o \
        vmm_lock.o \
        vmm_mem.o \
+        vmm_fp.o \
        vmm_msr.o \
        vmm_paging.o \
        vmm_options.o \
index 71e62d6..05d4b7a 100644 (file)
@@ -39,6 +39,7 @@
 #include <palacios/vmm_barrier.h>
 #include <palacios/vmm_debug.h>
 
+#include <palacios/vmm_perftune.h>
 
 
 #ifdef V3_CONFIG_CHECKPOINT
@@ -666,6 +667,8 @@ int v3_svm_enter(struct guest_info * info) {
     guest_state->rip = info->rip;
     guest_state->rsp = info->vm_regs.rsp;
 
+    V3_FP_ENTRY_RESTORE(info);
+
 #ifdef V3_CONFIG_SYMCALL
     if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_entry_state(info);
@@ -733,6 +736,8 @@ int v3_svm_enter(struct guest_info * info) {
 
     info->num_exits++;
 
+    V3_FP_EXIT_SAVE(info);
+
     // Save Guest state from VMCB
     info->rip = guest_state->rip;
     info->vm_regs.rsp = guest_state->rsp;
@@ -823,7 +828,9 @@ int v3_start_svm_guest(struct guest_info * info) {
                info->core_run_state = CORE_RUNNING;
            } else  { 
                PrintDebug(info->vm_info, info, "SVM core %u (on %u): Waiting for core initialization\n", info->vcpu_id, info->pcpu_id);
-               
+
+               V3_NO_WORK(info);
+
                while (info->core_run_state == CORE_STOPPED) {
                    
                    if (info->vm_info->run_state == VM_STOPPED) {
@@ -831,9 +838,12 @@ int v3_start_svm_guest(struct guest_info * info) {
                        return 0;
                    }
                    
-                   v3_yield(info,-1);
+                   V3_STILL_NO_WORK(info);
+
                    //PrintDebug(info->vm_info, info, "SVM core %u: still waiting for INIT\n", info->vcpu_id);
                }
+
+               V3_HAVE_WORK_AGAIN(info);
                
                PrintDebug(info->vm_info, info, "SVM core %u(on %u) initialized\n", info->vcpu_id, info->pcpu_id);
                
index cfef4f9..8c34b68 100644 (file)
@@ -55,6 +55,8 @@ int v3_dbg_enable = 0;
 static void init_cpu(void * arg) {
     uint32_t cpu_id = (uint32_t)(addr_t)arg;
 
+    v3_init_fp();
+
 #ifdef V3_CONFIG_SVM
     if (v3_is_svm_capable()) {
         PrintDebug(VM_NONE, VCORE_NONE, "Machine is SVM Capable\n");
@@ -100,6 +102,9 @@ static void deinit_cpu(void * arg) {
            PrintError(VM_NONE, VCORE_NONE, "CPU has no virtualization Extensions\n");
            break;
     }
+
+    v3_deinit_fp();
+
 }
 
 void Init_V3(struct v3_os_hooks * hooks, char * cpu_mask, int num_cpus, char *options) {
@@ -689,6 +694,7 @@ static int sim_callback(struct guest_info * core, void * private_data) {
     V3_Print(core->vm_info, core, "Simulation callback activated (guest_rip=%p)\n", (void *)core->rip);
 
     while (v3_bitmap_check(timeout_map, core->vcpu_id) == 1) {
+        // We spin here if there is no one to yield to
        v3_yield(NULL,-1);
     }
 
@@ -759,7 +765,8 @@ int v3_simulate_vm(struct v3_vm_info * vm, unsigned int msecs) {
        if (all_blocked == 1) {
            break;
        }
-
+       
+       // Intentionally spin if there is no one to yield to
        v3_yield(NULL,-1);
     }
 
index 35efe0f..ba88e2b 100644 (file)
@@ -120,6 +120,7 @@ int v3_wait_for_barrier(struct v3_vm_info * vm_info, struct guest_info * local_c
            break;
        }
 
+        // return immediately and spin if there is no one to yield to 
        v3_yield(local_core,-1);
     }
 
@@ -198,6 +199,10 @@ int v3_wait_at_barrier(struct guest_info * core) {
        return 0;
     }
 
+#ifdef V3_CONFIG_LAZY_FP_SWITCH
+    v3_get_fp_state(core); // snapshot FP state now regardless of lazy eval
+#endif
+
     V3_Print(core->vm_info, core, "Core %d waiting at barrier\n", core->vcpu_id);
 
     /*  Barrier has been activated. 
@@ -211,8 +216,13 @@ int v3_wait_at_barrier(struct guest_info * core) {
 
     // wait for cpu bit to clear
     while (v3_bitmap_check(&(barrier->cpu_map), core->vcpu_id)) {
+        // Barrier wait will spin if there is no competing work
        v3_yield(core,-1);
     }
+    
+#ifdef V3_CONFIG_LAZY_FP_SWITCH
+    core->fp_state.need_restore=1;  // restore FP on next entry
+#endif
 
     return 0;
 }
index 92c5e16..7a564e6 100644 (file)
@@ -412,15 +412,15 @@ struct mem_migration_state {
     struct v3_bitmap  modified_pages; 
 };
 
-static int paging_callback(struct guest_info *core, 
-                          struct v3_shdw_pg_event *event,
-                          void      *priv_data)
+static int shadow_paging_callback(struct guest_info *core, 
+                                 struct v3_shdw_pg_event *event,
+                                 void      *priv_data)
 {
     struct mem_migration_state *m = (struct mem_migration_state *)priv_data;
     
     if (event->event_type==SHADOW_PAGEFAULT &&
        event->event_order==SHADOW_PREIMPL &&
-       event->error_code.write) { 
+       event->error_code.write) { // Note, assumes VTLB behavior where we will see the write even if preceded by a read
        addr_t gpa;
        if (!v3_gva_to_gpa(core,event->gva,&gpa)) {
            // write to this page
@@ -434,7 +434,30 @@ static int paging_callback(struct guest_info *core,
     
     return 0;
 }
-       
+
+
+/*
+static int nested_paging_callback(struct guest_info *core, 
+                                 struct v3_nested_pg_event *event,
+                                 void      *priv_data)
+{
+    struct mem_migration_state *m = (struct mem_migration_state *)priv_data;
+    
+    if (event->event_type==NESTED_PAGEFAULT &&
+       event->event_order==NESTED_PREIMPL &&
+       event->error_code.write) { // Assumes we will see a write after reads
+       if (event->gpa<core->vm_info->mem_size) { 
+         v3_bitmap_set(&(m->modified_pages),(event->gpa)>>12);
+       } else {
+         // no worries, this isn't physical memory
+       }
+    } else {
+      // we don't care about other events
+    }
+    
+    return 0;
+}
+*/     
 
 
 static struct mem_migration_state *start_page_tracking(struct v3_vm_info *vm)
@@ -456,10 +479,27 @@ static struct mem_migration_state *start_page_tracking(struct v3_vm_info *vm)
        V3_Free(m);
     }
 
-    v3_register_shadow_paging_event_callback(vm,paging_callback,m);
+    // We assume that the migrator has already verified that all cores are
+    // using the identical model (shadow or nested)
+    // This must not change over the execution of the migration
 
-    for (i=0;i<vm->num_cores;i++) {
+    if (vm->cores[0].shdw_pg_mode==SHADOW_PAGING) { 
+      v3_register_shadow_paging_event_callback(vm,shadow_paging_callback,m);
+
+      for (i=0;i<vm->num_cores;i++) {
        v3_invalidate_shadow_pts(&(vm->cores[i]));
+      }
+    } else if (vm->cores[0].shdw_pg_mode==NESTED_PAGING) { 
+      //v3_register_nested_paging_event_callback(vm,nested_paging_callback,m);
+      
+      for (i=0;i<vm->num_cores;i++) {
+       //v3_invalidate_nested_addr_range(&(vm->cores[i]),0,vm->mem_size-1);
+      }
+    } else {
+      PrintError(vm, VCORE_NONE, "Unsupported paging mode\n");
+      v3_bitmap_deinit(&(m->modified_pages));
+      V3_Free(m);
+      return 0;
     }
     
     // and now we should get callbacks as writes happen
@@ -469,11 +509,15 @@ static struct mem_migration_state *start_page_tracking(struct v3_vm_info *vm)
 
 static void stop_page_tracking(struct mem_migration_state *m)
 {
-    v3_unregister_shadow_paging_event_callback(m->vm,paging_callback,m);
-    
-    v3_bitmap_deinit(&(m->modified_pages));
+  if (m->vm->cores[0].shdw_pg_mode==SHADOW_PAGING) { 
+    v3_unregister_shadow_paging_event_callback(m->vm,shadow_paging_callback,m);
+  } else {
+    //v3_unregister_nested_paging_event_callback(m->vm,nested_paging_callback,m);
+  }
     
-    V3_Free(m);
+  v3_bitmap_deinit(&(m->modified_pages));
+  
+  V3_Free(m);
 }
 
            
@@ -731,6 +775,10 @@ static int load_core(struct guest_info * info, struct v3_chkpt * chkpt, v3_chkpt
        PrintError(info->vm_info, info, "Could not open context to load core\n");
        goto loadfailout;
     }
+    
+    // Run state is needed to determine when AP cores need
+    // to be immediately run after resume
+    V3_CHKPT_LOAD(ctx,"run_state",info->core_run_state,loadfailout);
 
     V3_CHKPT_LOAD(ctx, "RIP", info->rip, loadfailout);
     
@@ -798,6 +846,11 @@ static int load_core(struct guest_info * info, struct v3_chkpt * chkpt, v3_chkpt
     V3_CHKPT_LOAD(ctx, "GUEST_CR0", info->shdw_pg_state.guest_cr0, loadfailout);
     V3_CHKPT_LOAD(ctx, "GUEST_EFER", info->shdw_pg_state.guest_efer, loadfailout);
 
+    // floating point
+    if (v3_load_fp_state(ctx,info)) {
+      goto loadfailout;
+    }
+
     v3_chkpt_close_ctx(ctx); ctx=0;
 
     PrintDebug(info->vm_info, info, "Finished reading guest_info information\n");
@@ -912,6 +965,7 @@ static int save_core(struct guest_info * info, struct v3_chkpt * chkpt, v3_chkpt
        goto savefailout;
     }
 
+    V3_CHKPT_SAVE(ctx,"run_state",info->core_run_state,savefailout);
 
     V3_CHKPT_SAVE(ctx, "RIP", info->rip, savefailout);
     
@@ -979,6 +1033,11 @@ static int save_core(struct guest_info * info, struct v3_chkpt * chkpt, v3_chkpt
     V3_CHKPT_SAVE(ctx, "GUEST_CR0", info->shdw_pg_state.guest_cr0, savefailout);
     V3_CHKPT_SAVE(ctx, "GUEST_EFER", info->shdw_pg_state.guest_efer, savefailout);
 
+    // floating point
+    if (v3_save_fp_state(ctx,info)) {
+      goto savefailout;
+    }
+
     v3_chkpt_close_ctx(ctx); ctx=0;
 
     if (opts & V3_CHKPT_OPT_SKIP_ARCHDEP) {
@@ -1200,11 +1259,15 @@ int v3_chkpt_send_vm(struct v3_vm_info * vm, char * store, char * url, v3_chkpt_
     struct mem_migration_state *mm_state;
     int i;
 
-    // Currently will work only for shadow paging
-    for (i=0;i<vm->num_cores;i++) { 
-      if (vm->cores[i].shdw_pg_mode!=SHADOW_PAGING && !(opts & V3_CHKPT_OPT_SKIP_MEM)) { 
-       PrintError(vm, VCORE_NONE, "Cannot currently handle nested paging\n");
-       return -1;
+    // Cores must all be in the same mode
+    // or we must be skipping memory
+    if (!(opts & V3_CHKPT_OPT_SKIP_MEM)) { 
+      v3_paging_mode_t mode = vm->cores[0].shdw_pg_mode;
+      for (i=1;i<vm->num_cores;i++) { 
+       if (vm->cores[i].shdw_pg_mode != mode) { 
+         PrintError(vm, VCORE_NONE, "Cores having different paging modes (nested and shadow) are not supported\n");
+         return -1;
+       }
       }
     }
     
diff --git a/palacios/src/palacios/vmm_fp.c b/palacios/src/palacios/vmm_fp.c
new file mode 100644 (file)
index 0000000..d3b6ca3
--- /dev/null
@@ -0,0 +1,179 @@
+/* 
+ * This file is part of the Palacios Virtual Machine Monitor developed
+ * by the V3VEE Project with funding from the United States National 
+ * Science Foundation and the Department of Energy.  
+ *
+ * The V3VEE Project is a joint project between Northwestern University
+ * and the University of New Mexico.  You can find out more at 
+ * http://www.v3vee.org
+ *
+ * Copyright (c) 2013, Peter Dinda <pdinda@northwestern.edu> 
+ * Copyright (c) 2013, The V3VEE Project <http://www.v3vee.org> 
+ * All rights reserved.
+ *
+ * Author: Peter Dinda <pdinda@northwestern.edu>
+ *
+ * This is free software.  You are permitted to use,
+ * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
+ */
+
+#include <palacios/vmm.h>
+#include <palacios/vmm_fp.h>
+#include <palacios/vm_guest.h>
+#include <palacios/vmm_lowlevel.h>
+
+#ifdef V3_CONFIG_CHECKPOINT
+#include <palacios/vmm_checkpoint.h>
+#endif
+
+
+static int can_do_fp=-1;
+
+// assumes identical on all cores...
+int v3_can_handle_fp_state()
+{
+  if (can_do_fp!=-1) { 
+    return can_do_fp;
+  } else {
+    uint32_t eax, ebx, ecx, edx;
+
+    v3_cpuid(CPUID_FEATURE_IDS,&eax,&ebx,&ecx,&edx);
+    
+    can_do_fp= !!(edx & (1<<25)); // do we have SSE?
+    
+    return can_do_fp;
+  }
+}
+
+int v3_init_fp()
+{
+  if (v3_can_handle_fp_state()) { 
+    V3_Print(VM_NONE,VCORE_NONE,"Floating point save/restore init:  available on this hardware\n");
+  } else {
+    V3_Print(VM_NONE,VCORE_NONE,"Floating point save/restore init:  UNAVAILABLE ON THIS HARDWARE\n");
+  }
+  return 0;
+}
+
+int v3_deinit_fp()
+{
+  V3_Print(VM_NONE,VCORE_NONE,"Floating point save/restore deinited\n");
+  return 0;
+}
+
+#define EFER_MSR 0xc0000080
+
+
+int v3_get_fp_state(struct guest_info *core)
+{ 
+  if (v3_can_handle_fp_state()) { 
+    /*
+      If the fast-FXSAVE/FXRSTOR (FFXSR) feature is enabled in EFER,
+      FXSAVE and FXRSTOR do not save or restore the XMM0–15 registers
+      when executed in 64-bit mode at CPL 0.  The x87 environment and
+      MXCSR are saved whether fast-FXSAVE/FXRSTOR is enabled or not.
+      Software can use the CPUID instruction to determine whether the
+      fast-FXSAVE/FXRSTOR feature is available
+      (CPUID Fn8000_0001h_EDX[FFXSR]).  The fast-FXSAVE/FXRSTOR feature
+      has no effect on FXSAVE/FXRSTOR in non-64-bit mode or when CPL > 0.
+    */
+
+    // We need to ensure that fast-FXSAVE/FXRSTOR is not enabled;
+    // otherwise we will NOT get the XMM regs, since we are running at CPL 0
+    //
+
+    int restore=0;
+    uint32_t high,low;
+    
+    v3_get_msr(EFER_MSR,&high,&low);
+    
+    if (low & (0x1<<14)) { 
+      // fast save is in effect
+      low &= ~(0x1<<14);
+      restore=1;
+      v3_set_msr(EFER_MSR, high, low);
+    }
+    
+    __asm__ __volatile__(" rex64/fxsave %0 ; "
+                        : "=m"(core->fp_state.state)); /* no input, no clobber */
+    if (restore) { 
+      low |= 0x1<<14;
+      v3_set_msr(EFER_MSR, high, low);
+    }
+
+    // this is a giant guess
+    // we really need to capture the state type as seen in the guest, not here...
+    core->fp_state.state_type=V3_FP_MODE_64;
+    
+    return 0;
+
+  } else {
+    return -1;
+  }
+}
+
+
+// Restore FP state from this structure to this core
+int v3_put_fp_state(struct guest_info *core)
+{
+  if (v3_can_handle_fp_state()) {
+    // We need to ensure that fast-FXSAVE/FXRSTOR is not enabled;
+    // otherwise we will NOT restore the XMM regs, since we are running at CPL 0
+    //
+
+    int restore=0;
+    uint32_t high,low;
+    
+    v3_get_msr(EFER_MSR,&high,&low);
+    
+    if (low & (0x1<<14)) { 
+      // fast restore is in effect
+      low &= ~(0x1<<14);
+      restore=1;
+      v3_set_msr(EFER_MSR, high, low);
+    }
+
+    __asm__ __volatile__(" rex64/fxrstor %0; "
+                        : /* no output */
+                        : "m"((core->fp_state.state)) ); /* no clobber*/
+
+    
+    if (restore) { 
+      low |= 0x1<<14;
+      v3_set_msr(EFER_MSR, high, low);
+    }
+
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+#ifdef V3_CONFIG_CHECKPOINT
+
+
+int v3_save_fp_state(struct v3_chkpt_ctx *ctx, struct guest_info *core)
+{
+  V3_CHKPT_SAVE(ctx, "FP_STATE_TYPE", core->fp_state.state_type, savefailout);
+  if (v3_chkpt_save(ctx,"FP_STATE_BLOB",sizeof(core->fp_state.state),&(core->fp_state.state))) { 
+    goto savefailout;
+  }
+  
+  return 0;
+
+ savefailout:
+  PrintError(core->vm_info,core,"Unable to save floating point state\n");
+  return -1;
+}
+
+
+int v3_load_fp_state(struct v3_chkpt_ctx *ctx, struct guest_info *core)
+{
+  V3_CHKPT_LOAD(ctx, "FP_STATE_TYPE", core->fp_state.state_type, loadfailout);
+  if (v3_chkpt_load(ctx,"FP_STATE_BLOB",sizeof(core->fp_state.state),&(core->fp_state.state))) { 
+    goto loadfailout;
+  }
+  
+  return 0;
+
+ loadfailout:
+  PrintError(core->vm_info,core,"Unable to load floating point state\n");
+  return -1;
+}
+
+#endif
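
To tie the pieces together, a hedged sketch (not from the diff) of how the conservative configuration pairs these calls around a guest run; compare the svm.c hunk above and the vmx.c hunk below. enter_guest() is a hypothetical wrapper for the actual VMRUN/VMLAUNCH path.

  int run_core_once(struct guest_info *core) {
      V3_FP_ENTRY_RESTORE(core);   /* v3_put_fp_state() in the eager configuration */
      enter_guest(core);           /* hypothetical guest entry */
      V3_FP_EXIT_SAVE(core);       /* v3_get_fp_state() in the eager configuration */
      return 0;
  }
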
index ae3fad4..de81dfc 100644 (file)
@@ -1028,7 +1028,8 @@ int v3_vmx_enter(struct guest_info * info) {
        
        check_vmcs_write(VMCS_PREEMPT_TIMER, preempt_window);
     }
-   
+
+    V3_FP_ENTRY_RESTORE(info);
 
     {  
        uint64_t entry_tsc = 0;
@@ -1081,6 +1082,8 @@ int v3_vmx_enter(struct guest_info * info) {
 
     info->num_exits++;
 
+    V3_FP_EXIT_SAVE(info);
+
     /* If we have the preemption time, then use it to get more accurate guest time */
     if (vmx_info->pin_ctrls.active_preempt_timer) {
        uint32_t cycles_left = 0;
@@ -1187,6 +1190,8 @@ int v3_start_vmx_guest(struct guest_info * info) {
            } else {
                
                PrintDebug(info->vm_info, info, "VMX core %u: Waiting for core initialization\n", info->vcpu_id);
+
+                V3_NO_WORK(info);
                
                while (info->core_run_state == CORE_STOPPED) {
                    
@@ -1194,11 +1199,13 @@ int v3_start_vmx_guest(struct guest_info * info) {
                        // The VM was stopped before this core was initialized. 
                        return 0;
                    }
-                   
-                   v3_yield(info,-1);
+
+                   V3_STILL_NO_WORK(info);
                    //PrintDebug(info->vm_info, info, "VMX core %u: still waiting for INIT\n",info->vcpu_id);
                }
-               
+
+               V3_HAVE_WORK_AGAIN(info);
+
                PrintDebug(info->vm_info, info, "VMX core %u initialized\n", info->vcpu_id);
                
                // We'll be paranoid about race conditions here