From: Peter Dinda
Date: Fri, 25 Oct 2013 23:30:39 +0000 (-0500)
Subject: Floating point context-switching and checkpoint/load
X-Git-Url: http://v3vee.org/palacios/gitweb/gitweb.cgi?p=palacios.git;a=commitdiff_plain;h=9feccf93cd8327d1d30a404a92f19716bf5a1e96

Floating point context-switching and checkpoint/load

This integrates:
- the option to do floating point context-switching
- conservative code to do context-switching in Palacios
- a lazy floating point save/restore host interface
- an implementation of this interface in the linux module
- liberal code to use this interface in Palacios
- floating point checkpointing

This also includes a performance tuning element that is hard to separate.
---

diff --git a/Kconfig b/Kconfig
index 9c9dfe2..028645f 100644
--- a/Kconfig
+++ b/Kconfig
@@ -75,6 +75,7 @@ config VMX
 	Compile with support for Intel VMX
 
+
 config FRAME_POINTER
 	bool "Compile with Frame pointers"
 	default n
@@ -144,6 +145,49 @@ config MAX_CPUS
 endmenu
 
 source "palacios/src/interfaces/Kconfig"
+
+menu "Virtual core specialization"
+
+config CUSTOM_CPUID
+	bool "Use custom CPU information (vendor, etc)"
+	default y
+	help
+	  If set, the CPU information will be for a special V3VEE vendor.
+	  This should result in identical guest kernel setup, regardless
+	  of the underlying hardware, but it also means that the guest kernel
+	  has no chance of employing CPU-specific bug fixes.
+
+config STRICT_MSR_SEMANTICS
+	bool "Use strict RDMSR/WRMSR semantics"
+	default y
+	help
+	  Use strict MSR semantics - when an unhandled MSR is read or written,
+	  a GPF is generated.  This is typically used with CUSTOM_CPUID on.
+
+config FP_SWITCH
+	bool "Floating point context switching"
+	default y
+	help
+	  If set, floating point is handled for context switches
+	  (VM1->VM2->VM1 and/or VM->HOST->VM).  This can be disabled
+	  for environments where a single VM is the only user of FP.
+	  Note that even if disabled, FP save/restore code is included
+	  for support of checkpoint/restore.
+
+config LAZY_FP_SWITCH
+	bool "Use host-based lazy floating point context switching"
+	depends on FP_SWITCH && HOST_LAZY_FPU_SWITCH
+	default y
+	help
+	  When true, the host's lazy floating point save/restore
+	  mechanism is notified on each exit and entry.  If false,
+	  the floating point state is explicitly saved on each exit
+	  and restored on each entry---this save/restore is entirely
+	  done in Palacios.
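+# (Illustration, not part of the build: with FP_SWITCH=y but LAZY_FP_SWITCH=n,
+# Palacios itself does a full FXSAVE on every VM exit and a full FXRSTOR on
+# every VM entry; with LAZY_FP_SWITCH=y it instead notifies the host through
+# the used_fpu()/need_fpu() hooks and lets the host defer the actual save
+# until some other thread touches the FPU.)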
+
+
+endmenu
+
 source "palacios/src/extensions/Kconfig"
 
 config TELEMETRY
diff --git a/linux_module/palacios-stubs.c b/linux_module/palacios-stubs.c
index 08021e4..decae1f 100644
--- a/linux_module/palacios-stubs.c
+++ b/linux_module/palacios-stubs.c
@@ -16,8 +16,15 @@
 #include
 #include
+#include
+
 #include
 #include
+
+#ifdef V3_CONFIG_HOST_LAZY_FPU_SWITCH
+#include
+#endif
+
 #include "palacios.h"
 #include "mm.h"
@@ -25,6 +32,8 @@
 #include "memcheck.h"
 #include "lockcheck.h"
 
+
+
 // The following can be used to track heap bugs
 // zero memory after allocation
 #define ALLOC_ZERO_MEM 0
@@ -169,14 +178,14 @@ void *palacios_allocate_pages(int num_pages, unsigned int alignment, int node_id
     void * pg_addr = NULL;
 
     if (num_pages<=0) {
-	ERROR("ALERT ALERT Attempt to allocate zero or fewer pages\n");
+	ERROR("ALERT ALERT Attempt to allocate zero or fewer pages (%d pages, alignment %d, node %d, constraints 0x%x)\n", num_pages, alignment, node_id, constraints);
 	return NULL;
     }
 
     pg_addr = (void *)alloc_palacios_pgs(num_pages, alignment, node_id, constraints);
 
     if (!pg_addr) {
-	ERROR("ALERT ALERT Page allocation has FAILED Warning\n");
+	ERROR("ALERT ALERT Page allocation has FAILED Warning (%d pages, alignment %d, node %d, constraints 0x%x)\n", num_pages, alignment, node_id, constraints);
 	return NULL;
     }
@@ -195,6 +204,11 @@ void *palacios_allocate_pages(int num_pages, unsigned int alignment, int node_id
  */
 void palacios_free_pages(void * page_paddr, int num_pages) {
+    if (!page_paddr) {
+	ERROR("Ignoring free pages: 0x%p (0x%lx) for %d pages\n", page_paddr, (uintptr_t)page_paddr, num_pages);
+	dump_stack();
+	return;
+    }
     pg_frees += num_pages;
     free_palacios_pgs((uintptr_t)page_paddr, num_pages);
     MEMCHECK_FREE_PAGES(page_paddr, num_pages*4096);
@@ -294,6 +307,11 @@ palacios_free(
 	void * addr
 )
 {
+    if (!addr) {
+	ERROR("Ignoring free: 0x%p\n", addr);
+	dump_stack();
+	return;
+    }
     frees++;
     kfree(addr-ALLOC_PAD);
     MEMCHECK_KFREE(addr-ALLOC_PAD);
@@ -359,17 +376,25 @@ static int lnx_thread_target(void * arg) {
     allow_signal(SIGKILL);
     */
 
+#ifdef V3_CONFIG_HOST_LAZY_FPU_SWITCH
+    // We are a kernel thread that needs FPU save/restore state
+    // vcores definitely need this, all the other threads get it too,
+    // but they just won't use it
+    fpu_alloc(&(current->thread.fpu));
+#endif
+
     ret = thread_info->fn(thread_info->arg);
 
-    INFO("Palacios Thread (%s) EXITING\n", thread_info->name);
 
     palacios_free(thread_info);
     // handle cleanup
 
+    // We rely on do_exit to free the fpu data
+    // since we could get switched at any point until the thread is done...
+
     do_exit(ret);
 
     return 0; // should not get here.
 }
@@ -764,6 +789,33 @@ palacios_mutex_unlock_irqrestore(void *mutex, void *flags)
     LOCKCHECK_UNLOCK_IRQRESTORE_POST(mutex,(unsigned long)flags);
 }
 
+void palacios_used_fpu(void)
+{
+    struct thread_info *cur = current_thread_info();
+
+    // We assume we are not preemptible here...
+    cur->status |= TS_USEDFPU;
+    clts();
+    // After this, FP save should be handled by Linux if it
+    // switches to a different task and that task uses the FPU
+}
+
+inline int ists(void)
+{
+    return read_cr0() & X86_CR0_TS;
+}
+
+void palacios_need_fpu(void)
+{
+    // We assume we are not preemptible here...
+    if (ists()) {
+	// we have been switched back to from somewhere else...
+	// Do a restore now - this will also do a clts()
+	math_state_restore();
+    }
+}
+
+
 /**
  * Structure used by the Palacios hypervisor to interface with the host kernel.
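 * (Note: the lazy-FPU hooks above, palacios_used_fpu() and palacios_need_fpu(),
 * are deliberately not part of this structure; they are registered separately
 * through V3_Init_Lazy_FPU() below, since they are intertwined with thread
 * creation.)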
 */
@@ -796,6 +848,15 @@ static struct v3_os_hooks palacios_os_hooks = {
 
 };
 
+#ifdef V3_CONFIG_HOST_LAZY_FPU_SWITCH
+// Note that this host interface is defined here since it's
+// intertwined with thread creation...
+static struct v3_lazy_fpu_iface palacios_fpu_hooks = {
+    .used_fpu = palacios_used_fpu,
+    .need_fpu = palacios_need_fpu
+};
+
+#endif
 
 int palacios_vmm_init( char *options )
@@ -842,6 +903,10 @@ int palacios_vmm_init( char *options )
 
     Init_V3(&palacios_os_hooks, cpu_mask, num_cpus, options);
 
+#ifdef V3_CONFIG_HOST_LAZY_FPU_SWITCH
+    V3_Init_Lazy_FPU(&palacios_fpu_hooks);
+#endif
+
     return 0;
 }
diff --git a/linux_module/palacios.h b/linux_module/palacios.h
index c9cbb96..b4c17e6 100644
--- a/linux_module/palacios.h
+++ b/linux_module/palacios.h
@@ -166,6 +166,8 @@ void palacios_yield_cpu(void);
 void palacios_sleep_cpu(unsigned int us);
 unsigned int palacios_get_cpu(void);
 unsigned int palacios_get_cpu_khz(void);
+void palacios_used_fpu(void);
+void palacios_need_fpu(void);
 void *palacios_mutex_alloc(void);        // allocates and inits a lock
 void palacios_mutex_init(void *mutex);   // only inits a lock
 void palacios_mutex_deinit(void *mutex); // only deinits a lock
diff --git a/palacios/include/interfaces/vmm_lazy_fpu.h b/palacios/include/interfaces/vmm_lazy_fpu.h
new file mode 100644
index 0000000..a50dc14
--- /dev/null
+++ b/palacios/include/interfaces/vmm_lazy_fpu.h
@@ -0,0 +1,62 @@
+/*
+ * This file is part of the Palacios Virtual Machine Monitor developed
+ * by the V3VEE Project with funding from the United States National
+ * Science Foundation and the Department of Energy.
+ *
+ * The V3VEE Project is a joint project between Northwestern University
+ * and the University of New Mexico.  You can find out more at
+ * http://www.v3vee.org
+ *
+ * Copyright (c) 2013, The V3VEE Project
+ * All rights reserved.
+ *
+ * Author: Peter Dinda
+ *
+ * This is free software.  You are permitted to use,
+ * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
+ */
+
+#ifndef __VMM_LAZY_FPU
+#define __VMM_LAZY_FPU
+
+#include
+
+
+struct v3_lazy_fpu_iface {
+
+    // if these two are provided, then lazy FP save/restore is handled by the host
+
+    // indicate that the calling thread has used floating point
+    void (*used_fpu)(void);
+
+    // indicate that the calling thread wants to use floating point again
+    void (*need_fpu)(void);
+
+};
+
+
+/*
+ * function prototypes
+ */
+
+extern void V3_Init_Lazy_FPU(struct v3_lazy_fpu_iface * palacios_lazy_fpu);
+
+#ifdef __V3VEE__
+
+#define V3_LAZY_FPU_USED()						\
+    do {								\
+	extern struct v3_lazy_fpu_iface * palacios_lazy_fpu_hooks;	\
+	if ((palacios_lazy_fpu_hooks) && (palacios_lazy_fpu_hooks)->used_fpu) { \
+	    (palacios_lazy_fpu_hooks)->used_fpu();			\
+	}								\
+    } while (0)
+
+#define V3_LAZY_FPU_NEED()						\
+    do {								\
+	extern struct v3_lazy_fpu_iface * palacios_lazy_fpu_hooks;	\
+	if ((palacios_lazy_fpu_hooks) && (palacios_lazy_fpu_hooks)->need_fpu) { \
+	    (palacios_lazy_fpu_hooks)->need_fpu();			\
+	}								\
+    } while (0)
+
+#endif
+
+#endif
diff --git a/palacios/include/palacios/vm_guest.h b/palacios/include/palacios/vm_guest.h
index 5272b0d..4a9d075 100644
--- a/palacios/include/palacios/vm_guest.h
+++ b/palacios/include/palacios/vm_guest.h
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -43,7 +44,7 @@
 #include
 #include
 #include
-
+#include
 #include
 
 #ifdef V3_CONFIG_TELEMETRY
@@ -86,6 +87,7 @@ struct guest_info {
     v3_paging_mode_t shdw_pg_mode;
     struct v3_shdw_pg_state shdw_pg_state;
+    //struct v3_nested_pg_state nested_pg_state;
     addr_t direct_map_pt;
 
@@ -116,6 +118,7 @@ struct guest_info {
     struct v3_segments segments;
     struct v3_msrs msrs;
 
+    struct v3_fp_state fp_state;
 
     void * vmm_data;
@@ -177,6 +180,7 @@ struct v3_vm_info {
     struct v3_mem_hooks mem_hooks;
 
     struct v3_shdw_impl_state shdw_impl;
+    //struct v3_nested_impl_state nested_impl;
 
     void * sched_priv_data;
     struct v3_io_map io_map;
diff --git a/palacios/include/palacios/vmm.h b/palacios/include/palacios/vmm.h
index 983cd78..2b02058 100644
--- a/palacios/include/palacios/vmm.h
+++ b/palacios/include/palacios/vmm.h
@@ -369,8 +369,6 @@ struct v3_os_hooks {
     unsigned int (*get_cpu)(void);
 
-
-
     void * (*start_kernel_thread)(int (*fn)(void * arg), void * arg, char * thread_name);
     void (*interrupt_cpu)(struct v3_vm_info * vm, int logical_cpu, int vector);
     void (*call_on_cpu)(int logical_cpu, void (*fn)(void * arg), void * arg);
diff --git a/palacios/include/palacios/vmm_fp.h b/palacios/include/palacios/vmm_fp.h
new file mode 100644
index 0000000..76b377d
--- /dev/null
+++ b/palacios/include/palacios/vmm_fp.h
@@ -0,0 +1,224 @@
+/*
+ * This file is part of the Palacios Virtual Machine Monitor developed
+ * by the V3VEE Project with funding from the United States National
+ * Science Foundation and the Department of Energy.
+ *
+ * The V3VEE Project is a joint project between Northwestern University
+ * and the University of New Mexico.  You can find out more at
+ * http://www.v3vee.org
+ *
+ * Copyright (c) 2013, Peter Dinda
+ * Copyright (c) 2013, The V3VEE Project
+ * All rights reserved.
+ *
+ * Author: Peter Dinda
+ *
+ * This is free software.  You are permitted to use,
+ * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
+ */
+
+#ifndef __VMM_FP_H
+#define __VMM_FP_H
+
+#include
+#include
+#ifdef V3_CONFIG_LAZY_FP_SWITCH
+#include
+#endif
+
+// the FPRs are arranged into the
+// precise layout of the FXSAVE/FXRSTOR instructions
+// bytes 32+, which is common for all three variants
+// 8*6 reserved + 8*10 (fpu/mmx) + 16*16 (xmm)
+// + 3*16 (res) + 3*16 (ava) = 480 bytes
+// another 32 bytes are used for the store header,
+// which varies depending on machine mode
+struct v3_fp_regs {
+    v3_fp_mmx_reg_t stmm0;  // stmm0..7 are the x87 stack or mmx regs
+    uint8_t         res0[6];
+    v3_fp_mmx_reg_t stmm1;
+    uint8_t         res1[6];
+    v3_fp_mmx_reg_t stmm2;
+    uint8_t         res2[6];
+    v3_fp_mmx_reg_t stmm3;
+    uint8_t         res3[6];
+    v3_fp_mmx_reg_t stmm4;
+    uint8_t         res4[6];
+    v3_fp_mmx_reg_t stmm5;
+    uint8_t         res5[6];
+    v3_fp_mmx_reg_t stmm6;
+    uint8_t         res6[6];
+    v3_fp_mmx_reg_t stmm7;
+    uint8_t         res7[6];
+    v3_xmm_reg_t    xmm0;   // xmm0..7 are the "classic" SSE regs
+    v3_xmm_reg_t    xmm1;
+    v3_xmm_reg_t    xmm2;
+    v3_xmm_reg_t    xmm3;
+    v3_xmm_reg_t    xmm4;
+    v3_xmm_reg_t    xmm5;
+    v3_xmm_reg_t    xmm6;
+    v3_xmm_reg_t    xmm7;
+    v3_xmm_reg_t    xmm8;   // xmm8..15 are the "new" SSE regs
+    v3_xmm_reg_t    xmm9;
+    v3_xmm_reg_t    xmm10;
+    v3_xmm_reg_t    xmm11;
+    v3_xmm_reg_t    xmm12;
+    v3_xmm_reg_t    xmm13;
+    v3_xmm_reg_t    xmm14;
+    v3_xmm_reg_t    xmm15;
+    v3_xmm_reg_t    res16;  // reserved
+    v3_xmm_reg_t    res17;
+    v3_xmm_reg_t    res18;
+    v3_xmm_reg_t    ava19;
+    v3_xmm_reg_t    ava20;
+    v3_xmm_reg_t    ava21;
+} __attribute__((packed)) __attribute__((aligned(16)));
+
+// FXSAVE, 32 bit mode header (32 bytes)
+// V3_FP_MODE_32
+struct v3_fp_32_state {
+    uint16_t fcw;
+    uint16_t fsw;
+    uint8_t  ftw;
+    uint8_t  res0;
+    uint16_t fop;
+    uint32_t fip;        // fpu instruction pointer
+    uint16_t fcs;        // fpu code segment selector
+    uint16_t res1;
+    uint32_t fdp;        // fpu data pointer
+    uint16_t fds;        // fpu data segment selector
+    uint16_t res2;
+    uint32_t mxcsr;
+    uint32_t mxcsr_mask;
+} __attribute__((packed)) __attribute__((aligned(16)));
+
+// FXSAVE, 64 bit mode header, REX.W=1 (32 bytes)
+// V3_FP_MODE_64
+struct v3_fp_64_state {
+    uint16_t fcw;
+    uint16_t fsw;
+    uint8_t  ftw;
+    uint8_t  res0;
+    uint16_t fop;
+    uint64_t fip;        // fpu instruction pointer
+    uint64_t fdp;        // fpu data pointer
+    uint32_t mxcsr;
+    uint32_t mxcsr_mask;
+} __attribute__((packed)) __attribute__((aligned(16)));
+
+
+// FXSAVE, 64 bit mode header, REX.W=0 (32 bytes)
+// V3_FP_MODE_64_COMPAT
+struct v3_fp_64compat_state {
+    uint16_t fcw;
+    uint16_t fsw;
+    uint8_t  ftw;
+    uint8_t  res0;
+    uint16_t fop;
+    uint32_t fip;        // fpu instruction pointer
+    uint16_t fcs;        // fpu code segment selector
+    uint16_t res1;
+    uint32_t fdp;        // fpu data pointer
+    uint16_t fds;        // fpu data segment selector
+    uint16_t res2;
+    uint32_t mxcsr;
+    uint32_t mxcsr_mask;
+} __attribute__((packed)) __attribute__((aligned(16)));
+
+
+//
+// This is an FXSAVE block
+//
+struct v3_fp_state_core {
+    union {
+	struct v3_fp_32_state       fp32;
+	struct v3_fp_64_state       fp64;
+	struct v3_fp_64compat_state fp64compat;
+    } header;
+    struct v3_fp_regs fprs;
+} __attribute__((packed)) __attribute__((aligned(16)));
+
+struct v3_fp_state {
+    // Do we need to restore on next entry?
+    int need_restore;
+    // The mode in which the saved state should be interpreted
+    enum {V3_FP_MODE_32=0, V3_FP_MODE_64, V3_FP_MODE_64_COMPAT} state_type;
+    struct v3_fp_state_core state __attribute__((aligned(16)));
+};
+
+
+struct guest_info;
+
+// Can we save FP state on this core?
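+// (The check below CPUIDs once for SSE support (CPUID.01h:EDX bit 25, which
+// implies FXSAVE/FXRSTOR support) and caches the answer; see
+// v3_can_handle_fp_state() in palacios/src/palacios/vmm_fp.c.)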
+int v3_can_handle_fp_state();
+
+// Save state from this core to the structure
+int v3_get_fp_state(struct guest_info *core);
+
+// Restore FP state from this structure to this core
+int v3_put_fp_state(struct guest_info *core);
+
+int v3_init_fp(void);
+int v3_deinit_fp(void);
+
+#ifndef V3_CONFIG_FP_SWITCH
+
+#define V3_FP_EXIT_SAVE(core)
+#define V3_FP_ENTRY_RESTORE(core)
+
+#else
+
+#ifdef V3_CONFIG_LAZY_FP_SWITCH
+
+/* Ideally these would use the TS trick to do lazy calls to used_fpu() */
+#define V3_FP_EXIT_SAVE(core)						\
+    do {								\
+	extern struct v3_lazy_fpu_iface * palacios_lazy_fpu_hooks;	\
+	if ((palacios_lazy_fpu_hooks) && (palacios_lazy_fpu_hooks)->used_fpu) { \
+	    (palacios_lazy_fpu_hooks)->used_fpu();			\
+	} else {							\
+	    v3_get_fp_state(core);					\
+	}								\
+    } while (0)
+
+#define V3_FP_ENTRY_RESTORE(core)					\
+    do {								\
+	extern struct v3_lazy_fpu_iface * palacios_lazy_fpu_hooks;	\
+	if ((core)->fp_state.need_restore) {				\
+	    v3_put_fp_state(core);					\
+	    (core)->fp_state.need_restore=0;				\
+	} else {							\
+	    if ((palacios_lazy_fpu_hooks) && (palacios_lazy_fpu_hooks)->need_fpu) { \
+		(palacios_lazy_fpu_hooks)->need_fpu();			\
+	    } else {							\
+		v3_put_fp_state(core);					\
+	    }								\
+	}								\
+    } while (0)
+
+#else
+
+// conservative FPU switching
+
+#define V3_FP_EXIT_SAVE(core) v3_get_fp_state(core)
+#define V3_FP_ENTRY_RESTORE(core) v3_put_fp_state(core)
+
+#endif
+
+#endif
+
+#ifdef V3_CONFIG_CHECKPOINT
+
+struct v3_chkpt_ctx;
+
+// save state from structure to checkpoint/migration context
+int v3_save_fp_state(struct v3_chkpt_ctx *ctx, struct guest_info *core);
+
+// load state from checkpoint/migration context to structure
+int v3_load_fp_state(struct v3_chkpt_ctx *ctx, struct guest_info *core);
+
+#endif
+
+#endif
diff --git a/palacios/include/palacios/vmm_perftune.h b/palacios/include/palacios/vmm_perftune.h
index 4346374..7876efa 100644
--- a/palacios/include/palacios/vmm_perftune.h
+++ b/palacios/include/palacios/vmm_perftune.h
@@ -24,6 +24,7 @@
 
 #include
 
+#include
 
 struct v3_yield_strategy {
     enum {
@@ -58,6 +59,24 @@ void v3_strategy_driven_yield(struct guest_info *core, uint64_t time_since_l
 
 uint64_t v3_cycle_diff_in_usec(struct guest_info *core, uint64_t earlier_cycles, uint64_t later_cycles);
 
+// The following three macros are intended to make it easy to use
+// strategy-driven yield.  Call the first one when you run out of work,
+// then call the second each time you want to yield because you are
+// still out of work, and then call the third one when you have work
+// to do again.
+//
+// This assumes the thread is locked to a core and may behave strangely if
+// this is not the case.
+
+#define V3_NO_WORK(core) {						\
+    uint64_t _v3_strat_local_first=0, _v3_strat_local_cur=0;		\
+    _v3_strat_local_first=v3_get_host_time(core ? &(core->time_state) : 0);
+
+#define V3_STILL_NO_WORK(core)						\
+    _v3_strat_local_cur=v3_get_host_time(core ? &(core->time_state) : 0); \
+    v3_strategy_driven_yield(core,v3_cycle_diff_in_usec(core,_v3_strat_local_first,_v3_strat_local_cur));
+
+#define V3_HAVE_WORK_AGAIN(core) }
 
 #endif
diff --git a/palacios/include/palacios/vmm_types.h b/palacios/include/palacios/vmm_types.h
index 82e83c6..01e22b7 100644
--- a/palacios/include/palacios/vmm_types.h
+++ b/palacios/include/palacios/vmm_types.h
@@ -76,6 +76,10 @@ typedef char sint8_t;
 typedef ulong_t addr_t;
 typedef ullong_t v3_reg_t;
 
+
+typedef uint8_t v3_xmm_reg_t[16];
+typedef uint8_t v3_fp_mmx_reg_t[10];
+
 #endif /* !
__V3VEE__ */
 
 #endif
diff --git a/palacios/src/interfaces/Kconfig b/palacios/src/interfaces/Kconfig
index a72ce87..581852a 100644
--- a/palacios/src/interfaces/Kconfig
+++ b/palacios/src/interfaces/Kconfig
@@ -88,4 +88,11 @@ config HOST_PWRSTAT
 	help
 	  Select this if you would like to access energy/power
 	  measurements within Palacios
+
+config HOST_LAZY_FPU_SWITCH
+	bool "Host provides lazy FPU context switching"
+	default n
+	help
+	  Select this if your host provides lazy context switch support
+	  for floating point state and you would like Palacios to use it
 endmenu
diff --git a/palacios/src/interfaces/Makefile b/palacios/src/interfaces/Makefile
index ae10d74..262b6cc 100644
--- a/palacios/src/interfaces/Makefile
+++ b/palacios/src/interfaces/Makefile
@@ -10,6 +10,7 @@ obj-$(V3_CONFIG_HOST_HYPERCALL) += vmm_host_hypercall.o
 obj-$(V3_CONFIG_HOST_PCI) += host_pci.o
 obj-$(V3_CONFIG_HOST_PMU) += vmm_pmu.o
 obj-$(V3_CONFIG_HOST_PWRSTAT) += vmm_pwrstat.o
+obj-$(V3_CONFIG_HOST_LAZY_FPU_SWITCH) += vmm_lazy_fpu.o
 obj-y += null.o
 obj-y += vmm_numa.o
diff --git a/palacios/src/interfaces/vmm_lazy_fpu.c b/palacios/src/interfaces/vmm_lazy_fpu.c
new file mode 100644
index 0000000..7562ba0
--- /dev/null
+++ b/palacios/src/interfaces/vmm_lazy_fpu.c
@@ -0,0 +1,36 @@
+/*
+ * This file is part of the Palacios Virtual Machine Monitor developed
+ * by the V3VEE Project with funding from the United States National
+ * Science Foundation and the Department of Energy.
+ *
+ * The V3VEE Project is a joint project between Northwestern University
+ * and the University of New Mexico.  You can find out more at
+ * http://www.v3vee.org
+ *
+ * Copyright (c) 2013, The V3VEE Project
+ * All rights reserved.
+ *
+ * Author: Peter Dinda
+ *
+ * This is free software.  You are permitted to use,
+ * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+struct v3_lazy_fpu_iface * palacios_lazy_fpu_hooks = 0;
+
+
+void V3_Init_Lazy_FPU(struct v3_lazy_fpu_iface * lazy_fpu_iface)
+{
+    palacios_lazy_fpu_hooks = lazy_fpu_iface;
+}
diff --git a/palacios/src/palacios/Makefile b/palacios/src/palacios/Makefile
index 18cdea1..2ad3889 100644
--- a/palacios/src/palacios/Makefile
+++ b/palacios/src/palacios/Makefile
@@ -20,6 +20,7 @@ obj-y := \
 	vmm_io.o \
 	vmm_lock.o \
 	vmm_mem.o \
+	vmm_fp.o \
 	vmm_msr.o \
 	vmm_paging.o \
 	vmm_options.o \
diff --git a/palacios/src/palacios/svm.c b/palacios/src/palacios/svm.c
index 71e62d6..05d4b7a 100644
--- a/palacios/src/palacios/svm.c
+++ b/palacios/src/palacios/svm.c
@@ -39,6 +39,7 @@
 #include
 #include
 
+#include
 
 #ifdef V3_CONFIG_CHECKPOINT
@@ -666,6 +667,8 @@ int v3_svm_enter(struct guest_info * info) {
     guest_state->rip = info->rip;
     guest_state->rsp = info->vm_regs.rsp;
 
+    V3_FP_ENTRY_RESTORE(info);
+
 #ifdef V3_CONFIG_SYMCALL
     if (info->sym_core_state.symcall_state.sym_call_active == 0) {
 	update_irq_entry_state(info);
@@ -733,6 +736,8 @@ int v3_svm_enter(struct guest_info * info) {
 
     info->num_exits++;
 
+    V3_FP_EXIT_SAVE(info);
+
     // Save Guest state from VMCB
     info->rip = guest_state->rip;
     info->vm_regs.rsp = guest_state->rsp;
@@ -823,7 +828,9 @@ int v3_start_svm_guest(struct guest_info * info) {
 	info->core_run_state = CORE_RUNNING;
     } else {
 	PrintDebug(info->vm_info, info, "SVM core %u (on %u): Waiting for core initialization\n", info->vcpu_id, info->pcpu_id);
-
+
+	V3_NO_WORK(info);
+
 	while (info->core_run_state == CORE_STOPPED) {
 
 	    if (info->vm_info->run_state == VM_STOPPED) {
@@ -831,9 +838,12 @@ int v3_start_svm_guest(struct guest_info * info) {
 		return 0;
 	    }
 
-	    v3_yield(info,-1);
+	    V3_STILL_NO_WORK(info);
+
 	    //PrintDebug(info->vm_info, info, "SVM core %u: still waiting for INIT\n", info->vcpu_id);
 	}
 
+	V3_HAVE_WORK_AGAIN(info);
+
 	PrintDebug(info->vm_info, info, "SVM core %u(on %u) initialized\n", info->vcpu_id, info->pcpu_id);
diff --git a/palacios/src/palacios/vmm.c b/palacios/src/palacios/vmm.c
index cfef4f9..8c34b68 100644
--- a/palacios/src/palacios/vmm.c
+++ b/palacios/src/palacios/vmm.c
@@ -55,6 +55,8 @@ int v3_dbg_enable = 0;
 static void init_cpu(void * arg) {
     uint32_t cpu_id = (uint32_t)(addr_t)arg;
 
+    v3_init_fp();
+
 #ifdef V3_CONFIG_SVM
     if (v3_is_svm_capable()) {
 	PrintDebug(VM_NONE, VCORE_NONE, "Machine is SVM Capable\n");
@@ -100,6 +102,9 @@ static void deinit_cpu(void * arg) {
 	    PrintError(VM_NONE, VCORE_NONE, "CPU has no virtualization Extensions\n");
 	    break;
     }
+
+    v3_deinit_fp();
+
 }
 
 void Init_V3(struct v3_os_hooks * hooks, char * cpu_mask, int num_cpus, char *options) {
@@ -689,6 +694,7 @@ static int sim_callback(struct guest_info * core, void * private_data) {
     V3_Print(core->vm_info, core, "Simulation callback activated (guest_rip=%p)\n", (void *)core->rip);
 
     while (v3_bitmap_check(timeout_map, core->vcpu_id) == 1) {
+	// We spin here if there is no one to yield to
 	v3_yield(NULL,-1);
     }
 
@@ -759,7 +765,8 @@ int v3_simulate_vm(struct v3_vm_info * vm, unsigned int msecs) {
 	if (all_blocked == 1) {
 	    break;
 	}
-
+
+	// Intentionally spin if there is no one to yield to
 	v3_yield(NULL,-1);
     }
 
diff --git a/palacios/src/palacios/vmm_barrier.c b/palacios/src/palacios/vmm_barrier.c
index 35efe0f..ba88e2b 100644
--- a/palacios/src/palacios/vmm_barrier.c
+++ b/palacios/src/palacios/vmm_barrier.c
@@ -120,6 +120,7 @@ int v3_wait_for_barrier(struct v3_vm_info * vm_info, struct guest_info * local_c
 	    break;
 	}
 
+	// return immediately and spin if there is no one to yield to
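+	// (Aside: a minimal sketch of the strategy-driven-yield pattern used
+	// in svm.c/vmx.c above, assuming a core-pinned thread; work_available()
+	// is hypothetical:
+	//     V3_NO_WORK(core);
+	//     while (!work_available(core)) {
+	//         V3_STILL_NO_WORK(core);
+	//     }
+	//     V3_HAVE_WORK_AGAIN(core);
+	// )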
 	v3_yield(local_core,-1);
     }
 
@@ -198,6 +199,10 @@ int v3_wait_at_barrier(struct guest_info * core) {
 	return 0;
     }
 
+#ifdef V3_CONFIG_LAZY_FP_SWITCH
+    v3_get_fp_state(core); // snapshot FP state now regardless of lazy eval
+#endif
+
     V3_Print(core->vm_info, core, "Core %d waiting at barrier\n", core->vcpu_id);
 
     /* Barrier has been activated.
@@ -211,8 +216,13 @@ int v3_wait_at_barrier(struct guest_info * core) {
 
     // wait for cpu bit to clear
     while (v3_bitmap_check(&(barrier->cpu_map), core->vcpu_id)) {
+	// Barrier wait will spin if there is no competing work
 	v3_yield(core,-1);
     }
 
+#ifdef V3_CONFIG_LAZY_FP_SWITCH
+    core->fp_state.need_restore=1;  // restore FP on next entry
+#endif
+
     return 0;
 }
diff --git a/palacios/src/palacios/vmm_checkpoint.c b/palacios/src/palacios/vmm_checkpoint.c
index 92c5e16..7a564e6 100644
--- a/palacios/src/palacios/vmm_checkpoint.c
+++ b/palacios/src/palacios/vmm_checkpoint.c
@@ -412,15 +412,15 @@ struct mem_migration_state {
     struct v3_bitmap modified_pages;
 };
 
-static int paging_callback(struct guest_info *core,
-			   struct v3_shdw_pg_event *event,
-			   void *priv_data)
+static int shadow_paging_callback(struct guest_info *core,
+				  struct v3_shdw_pg_event *event,
+				  void *priv_data)
 {
     struct mem_migration_state *m = (struct mem_migration_state *)priv_data;
 
     if (event->event_type==SHADOW_PAGEFAULT &&
 	event->event_order==SHADOW_PREIMPL &&
-	event->error_code.write) {
+	event->error_code.write) { // Note, assumes VTLB behavior where we will see the write even if preceded by a read
 	addr_t gpa;
 	if (!v3_gva_to_gpa(core,event->gva,&gpa)) {
 	    // write to this page
@@ -434,7 +434,30 @@ static int paging_callback(struct guest_info *core,
 
     return 0;
 }
-
+
+
+/*
+static int nested_paging_callback(struct guest_info *core,
+				  struct v3_nested_pg_event *event,
+				  void *priv_data)
+{
+    struct mem_migration_state *m = (struct mem_migration_state *)priv_data;
+
+    if (event->event_type==NESTED_PAGEFAULT &&
+	event->event_order==NESTED_PREIMPL &&
+	event->error_code.write) { // Assumes we will see a write after reads
+	if (event->gpa < core->vm_info->mem_size) {
+	    v3_bitmap_set(&(m->modified_pages),(event->gpa)>>12);
+	} else {
+	    // no worries, this isn't physical memory
+	}
+    } else {
+	// we don't care about other events
+    }
+
+    return 0;
+}
+*/
 
 static struct mem_migration_state *start_page_tracking(struct v3_vm_info *vm)
@@ -456,10 +479,27 @@ static struct mem_migration_state *start_page_tracking(struct v3_vm_info *vm)
 	V3_Free(m);
     }
 
-    v3_register_shadow_paging_event_callback(vm,paging_callback,m);
+    // We assume that the migrator has already verified that all cores are
+    // using the identical model (shadow or nested)
+    // This must not change over the execution of the migration
 
-    for (i=0;i<vm->num_cores;i++) {
+    if (vm->cores[0].shdw_pg_mode==SHADOW_PAGING) {
+	v3_register_shadow_paging_event_callback(vm,shadow_paging_callback,m);
+
+	for (i=0;i<vm->num_cores;i++) {
+	    v3_invalidate_shadow_pts(&(vm->cores[i]));
+	}
+    } else if (vm->cores[0].shdw_pg_mode==NESTED_PAGING) {
+	//v3_register_nested_paging_event_callback(vm,nested_paging_callback,m);
+
+	for (i=0;i<vm->num_cores;i++) {
+	    //v3_invalidate_nested_addr_range(&(vm->cores[i]),0,vm->mem_size-1);
+	}
+    } else {
+	PrintError(vm, VCORE_NONE, "Unsupported paging mode\n");
+	v3_bitmap_deinit(&(m->modified_pages));
+	V3_Free(m);
+	return 0;
     }
 
     // and now we should get callbacks as writes happen
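+    // (A sketch of how a migrator might consume the dirty map once tracking
+    // stops; resend_page() is hypothetical:
+    //     addr_t gpa;
+    //     for (gpa=0; gpa < m->vm->mem_size; gpa += 4096) {
+    //         if (v3_bitmap_check(&(m->modified_pages), gpa>>12)) {
+    //             resend_page(m->vm, gpa);
+    //         }
+    //     }
+    // )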
@@ -469,11 +509,15 @@
 
 static void stop_page_tracking(struct mem_migration_state *m)
 {
-    v3_unregister_shadow_paging_event_callback(m->vm,paging_callback,m);
-
-    v3_bitmap_deinit(&(m->modified_pages));
+    if (m->vm->cores[0].shdw_pg_mode==SHADOW_PAGING) {
+	v3_unregister_shadow_paging_event_callback(m->vm,shadow_paging_callback,m);
+    } else {
+	//v3_unregister_nested_paging_event_callback(m->vm,nested_paging_callback,m);
+    }
 
-    V3_Free(m);
+    v3_bitmap_deinit(&(m->modified_pages));
+
+    V3_Free(m);
 }
 
@@ -731,6 +775,10 @@ static int load_core(struct guest_info * info, struct v3_chkpt * chkpt, v3_chkpt
 	PrintError(info->vm_info, info, "Could not open context to load core\n");
 	goto loadfailout;
     }
+
+    // Run state is needed to determine when AP cores need
+    // to be immediately run after resume
+    V3_CHKPT_LOAD(ctx,"run_state",info->core_run_state,loadfailout);
 
     V3_CHKPT_LOAD(ctx, "RIP", info->rip, loadfailout);
@@ -798,6 +846,11 @@ static int load_core(struct guest_info * info, struct v3_chkpt * chkpt, v3_chkpt
     V3_CHKPT_LOAD(ctx, "GUEST_CR0", info->shdw_pg_state.guest_cr0, loadfailout);
     V3_CHKPT_LOAD(ctx, "GUEST_EFER", info->shdw_pg_state.guest_efer, loadfailout);
 
+    // floating point
+    if (v3_load_fp_state(ctx,info)) {
+	goto loadfailout;
+    }
+
     v3_chkpt_close_ctx(ctx); ctx=0;
 
     PrintDebug(info->vm_info, info, "Finished reading guest_info information\n");
@@ -912,6 +965,7 @@ static int save_core(struct guest_info * info, struct v3_chkpt * chkpt, v3_chkpt
 	goto savefailout;
     }
 
+    V3_CHKPT_SAVE(ctx,"run_state",info->core_run_state,savefailout);
 
     V3_CHKPT_SAVE(ctx, "RIP", info->rip, savefailout);
@@ -979,6 +1033,11 @@ static int save_core(struct guest_info * info, struct v3_chkpt * chkpt, v3_chkpt
     V3_CHKPT_SAVE(ctx, "GUEST_CR0", info->shdw_pg_state.guest_cr0, savefailout);
     V3_CHKPT_SAVE(ctx, "GUEST_EFER", info->shdw_pg_state.guest_efer, savefailout);
 
+    // floating point
+    if (v3_save_fp_state(ctx,info)) {
+	goto savefailout;
+    }
+
     v3_chkpt_close_ctx(ctx); ctx=0;
 
     if (opts & V3_CHKPT_OPT_SKIP_ARCHDEP) {
@@ -1200,11 +1259,15 @@ int v3_chkpt_send_vm(struct v3_vm_info * vm, char * store, char * url, v3_chkpt_
     struct mem_migration_state *mm_state;
     int i;
 
-    // Currently will work only for shadow paging
-    for (i=0;i<vm->num_cores;i++) {
-	if (vm->cores[i].shdw_pg_mode!=SHADOW_PAGING && !(opts & V3_CHKPT_OPT_SKIP_MEM)) {
-	    PrintError(vm, VCORE_NONE, "Cannot currently handle nested paging\n");
-	    return -1;
+    // Cores must all be in the same mode
+    // or we must be skipping memory
+    if (!(opts & V3_CHKPT_OPT_SKIP_MEM)) {
+	v3_paging_mode_t mode = vm->cores[0].shdw_pg_mode;
+	for (i=1;i<vm->num_cores;i++) {
+	    if (vm->cores[i].shdw_pg_mode != mode) {
+		PrintError(vm, VCORE_NONE, "Cores having different paging modes (nested and shadow) are not supported\n");
+		return -1;
+	    }
 	}
     }
 
diff --git a/palacios/src/palacios/vmm_fp.c b/palacios/src/palacios/vmm_fp.c
new file mode 100644
index 0000000..d3b6ca3
--- /dev/null
+++ b/palacios/src/palacios/vmm_fp.c
@@ -0,0 +1,179 @@
+/*
+ * This file is part of the Palacios Virtual Machine Monitor developed
+ * by the V3VEE Project with funding from the United States National
+ * Science Foundation and the Department of Energy.
+ *
+ * The V3VEE Project is a joint project between Northwestern University
+ * and the University of New Mexico.  You can find out more at
+ * http://www.v3vee.org
+ *
+ * Copyright (c) 2013, Peter Dinda
+ * Copyright (c) 2013, The V3VEE Project
+ * All rights reserved.
+ *
+ * Author: Peter Dinda
+ *
+ * This is free software.  You are permitted to use,
+ * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
+ */
+
+#include
+#include
+#include
+#include
+
+#ifdef V3_CONFIG_CHECKPOINT
+#include
+#endif
+
+
+static int can_do_fp=-1;
+
+// assumes identical on all cores...
+int v3_can_handle_fp_state()
+{
+    if (can_do_fp!=-1) {
+	return can_do_fp;
+    } else {
+	uint32_t eax, ebx, ecx, edx;
+
+	v3_cpuid(CPUID_FEATURE_IDS,&eax,&ebx,&ecx,&edx);
+
+	can_do_fp= !!(edx & (1<<25)); // do we have SSE?
+
+	return can_do_fp;
+    }
+}
+
+int v3_init_fp()
+{
+    if (v3_can_handle_fp_state()) {
+	V3_Print(VM_NONE,VCORE_NONE,"Floating point save/restore init: available on this hardware\n");
+    } else {
+	V3_Print(VM_NONE,VCORE_NONE,"Floating point save/restore init: UNAVAILABLE ON THIS HARDWARE\n");
+    }
+    return 0;
+}
+
+int v3_deinit_fp()
+{
+    V3_Print(VM_NONE,VCORE_NONE,"Floating point save/restore deinited\n");
+    return 0;
+}
+
+#define EFER_MSR 0xc0000080
+
+
+int v3_get_fp_state(struct guest_info *core)
+{
+    if (v3_can_handle_fp_state()) {
+
+	/*
+	  If the fast-FXSAVE/FXRSTOR (FFXSR) feature is enabled in EFER,
+	  FXSAVE and FXRSTOR do not save or restore the XMM0-15 registers
+	  when executed in 64-bit mode at CPL 0.  The x87 environment and
+	  MXCSR are saved whether fast-FXSAVE/FXRSTOR is enabled or not.
+	  Software can use the CPUID instruction to determine whether the
+	  fast-FXSAVE/FXRSTOR feature is available
+	  (CPUID Fn8000_0001h_EDX[FFXSR]).  The fast-FXSAVE/FXRSTOR feature
+	  has no effect on FXSAVE/FXRSTOR in non 64-bit mode or when CPL > 0.
+	*/
+
+	// We need to ensure that fast-FXSAVE/FXRSTOR is not on,
+	// otherwise we will NOT have the XMM regs since we are running at CPL 0
+	//
+
+	int restore=0;
+	uint32_t high,low;
+
+	v3_get_msr(EFER_MSR,&high,&low);
+
+	if (low & (0x1<<14)) {
+	    // fast save is in effect
+	    low &= ~(0x1<<14);
+	    restore=1;
+	    v3_set_msr(EFER_MSR, high, low);
+	}
+
+	__asm__ __volatile__(" rex64/fxsave %0 ; "
+			     : "=m"(core->fp_state.state)); /* no input, no clobber */
+
+	if (restore) {
+	    low |= 0x1<<14;
+	    v3_set_msr(EFER_MSR, high, low);
+	}
+
+	// this is a giant guess
+	// we really need to capture the state type as seen in the guest, not here...
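+	// (A hedged alternative, untested and not what the code below does:
+	// the mode could in principle be derived from the guest's EFER.LMA
+	// and CS.L/CS.D bits, e.g. V3_FP_MODE_64 only when LMA=1 and CS.L=1.)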
+	core->fp_state.state_type=V3_FP_MODE_64;
+
+	return 0;
+
+    } else {
+	return -1;
+    }
+}
+
+
+// Restore FP state from this structure to this core
+int v3_put_fp_state(struct guest_info *core)
+{
+    if (v3_can_handle_fp_state()) {
+	// We need to ensure that fast-FXSAVE/FXRSTOR is not on,
+	// otherwise we will NOT have the XMM regs since we are running at CPL 0
+	//
+
+	int restore=0;
+	uint32_t high,low;
+
+	v3_get_msr(EFER_MSR,&high,&low);
+
+	if (low & (0x1<<14)) {
+	    // fast restore is in effect
+	    low &= ~(0x1<<14);
+	    restore=1;
+	    v3_set_msr(EFER_MSR, high, low);
+	}
+
+	__asm__ __volatile__(" rex64/fxrstor %0; "
+			     : /* no output */
+			     : "m"((core->fp_state.state)) ); /* no clobber */
+
+
+	if (restore) {
+	    low |= 0x1<<14;
+	    v3_set_msr(EFER_MSR, high, low);
+	}
+
+	return 0;
+    } else {
+	return -1;
+    }
+}
+
+#ifdef V3_CONFIG_CHECKPOINT
+
+
+int v3_save_fp_state(struct v3_chkpt_ctx *ctx, struct guest_info *core)
+{
+    V3_CHKPT_SAVE(ctx, "FP_STATE_TYPE", core->fp_state.state_type, savefailout);
+
+    if (v3_chkpt_save(ctx,"FP_STATE_BLOB",sizeof(core->fp_state.state),&(core->fp_state.state))) {
+	goto savefailout;
+    }
+
+    return 0;
+
+ savefailout:
+    PrintError(core->vm_info,core,"Unable to save floating point state\n");
+    return -1;
+}
+
+
+int v3_load_fp_state(struct v3_chkpt_ctx *ctx, struct guest_info *core)
+{
+    V3_CHKPT_LOAD(ctx, "FP_STATE_TYPE", core->fp_state.state_type, loadfailout);
+
+    if (v3_chkpt_load(ctx,"FP_STATE_BLOB",sizeof(core->fp_state.state),&(core->fp_state.state))) {
+	goto loadfailout;
+    }
+
+    return 0;
+
+ loadfailout:
+    PrintError(core->vm_info,core,"Unable to load floating point state\n");
+    return -1;
+}
+
+#endif
diff --git a/palacios/src/palacios/vmx.c b/palacios/src/palacios/vmx.c
index ae3fad4..de81dfc 100644
--- a/palacios/src/palacios/vmx.c
+++ b/palacios/src/palacios/vmx.c
@@ -1028,7 +1028,8 @@ int v3_vmx_enter(struct guest_info * info) {
 	check_vmcs_write(VMCS_PREEMPT_TIMER, preempt_window);
     }
 
-
+    V3_FP_ENTRY_RESTORE(info);
+
     {
 	uint64_t entry_tsc = 0;
@@ -1081,6 +1082,8 @@ int v3_vmx_enter(struct guest_info * info) {
 
     info->num_exits++;
 
+    V3_FP_EXIT_SAVE(info);
+
     /* If we have the preemption time, then use it to get more accurate guest time */
     if (vmx_info->pin_ctrls.active_preempt_timer) {
 	uint32_t cycles_left = 0;
@@ -1187,6 +1190,8 @@ int v3_start_vmx_guest(struct guest_info * info) {
 
     } else {
 	PrintDebug(info->vm_info, info, "VMX core %u: Waiting for core initialization\n", info->vcpu_id);
+
+	V3_NO_WORK(info);
 
 	while (info->core_run_state == CORE_STOPPED) {
 
@@ -1194,11 +1199,13 @@ int v3_start_vmx_guest(struct guest_info * info) {
 		// The VM was stopped before this core was initialized.
 		return 0;
 	    }
-
-	    v3_yield(info,-1);
+
+	    V3_STILL_NO_WORK(info);
+
 	    //PrintDebug(info->vm_info, info, "VMX core %u: still waiting for INIT\n",info->vcpu_id);
 	}
-
+
+	V3_HAVE_WORK_AGAIN(info);
+
 	PrintDebug(info->vm_info, info, "VMX core %u initialized\n", info->vcpu_id);
 
 	// We'll be paranoid about race conditions here