/*
 * Linux and Palacios headers needed by this file.  The specific kernel
 * header names are inferred from what the code below actually uses.
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/kthread.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
#include <asm/irq_vectors.h>
#include <asm/io.h>

#include <palacios/vmm.h>
#include <palacios/vmm_host_events.h>

#ifdef V3_CONFIG_HOST_LAZY_FPU_SWITCH
#include <asm/i387.h>
#endif

#include "palacios.h"
#include "mm.h"
#include "memcheck.h"
#include "lockcheck.h"


// The following can be used to track memory bugs
// zero memory after allocation (now applies to valloc and page alloc as well)
#define ALLOC_ZERO_MEM 1
// pad allocations by this many bytes on both ends of block (heap only)
#define ALLOC_PAD      0


u32 pg_allocs = 0;
u32 pg_frees  = 0;
u32 mallocs   = 0;
u32 frees     = 0;
u32 vmallocs  = 0;
u32 vfrees    = 0;

static struct v3_vm_info * irq_to_guest_map[256];

extern unsigned int cpu_khz;

extern int cpu_list[NR_CPUS];
extern int cpu_list_len;

static char *print_buffer[NR_CPUS];

static void deinit_print_buffers(void)
{
    int i;

    for (i = 0; i < NR_CPUS; i++) {
        if (print_buffer[i]) {
            palacios_free(print_buffer[i]);
            print_buffer[i] = 0;
        }
    }
}

static int init_print_buffers(void)
{
    int i;

    for (i = 0; i < NR_CPUS; i++) {
        print_buffer[i] = palacios_alloc(V3_PRINTK_BUF_SIZE);
        if (!print_buffer[i]) {
            ERROR("Cannot allocate print buffer for cpu %d\n", i);
            deinit_print_buffers();
            return -1;
        }
        memset(print_buffer[i], 0, V3_PRINTK_BUF_SIZE);
    }

    return 0;
}

/**
 * Prints a message to the console.
 */
void palacios_print_scoped(void * vm, int vcore, const char *fmt, ...)
{
#if V3_PRINTK_OLD_STYLE_FUNC

    va_list ap;

    va_start(ap, fmt);
    vprintk(fmt, ap);
    va_end(ap);

    return;

#else

    va_list ap;
    char *buf;
    unsigned int cpu = palacios_get_cpu();
    struct v3_guest *guest = (struct v3_guest *)vm;

    buf = print_buffer[cpu];

    if (!buf) {
        printk(KERN_INFO "palacios (pcore %u): output skipped - no allocated buffer\n", cpu);
        return;
    }

    va_start(ap, fmt);
    vsnprintf(buf, V3_PRINTK_BUF_SIZE, fmt, ap);
    va_end(ap);

    if (guest) {
        if (vcore >= 0) {
            printk(KERN_INFO "palacios (pcore %u vm %s vcore %u): %s", cpu, guest->name, vcore, buf);
        } else {
            printk(KERN_INFO "palacios (pcore %u vm %s): %s", cpu, guest->name, buf);
        }
    } else {
        printk(KERN_INFO "palacios (pcore %u): %s", cpu, buf);
    }

    return;

#endif
}

/*
 * Allocates a contiguous region of pages of the requested size.
 * Returns the physical address of the first page in the region.
 */
void *palacios_allocate_pages(int num_pages, unsigned int alignment, int node_id,
                              int (*filter_func)(void *paddr, void *filter_state),
                              void *filter_state)
{
    void * pg_addr = NULL;

    if (num_pages <= 0) {
        ERROR("ALERT ALERT Attempt to allocate zero or fewer pages (%d pages, alignment %d, node %d, filter_func %p, filter_state %p)\n",
              num_pages, alignment, node_id, filter_func, filter_state);
        return NULL;
    }

    pg_addr = (void *)alloc_palacios_pgs(num_pages, alignment, node_id, filter_func, filter_state);

    if (!pg_addr) {
        ERROR("ALERT ALERT Page allocation has FAILED Warning (%d pages, alignment %d, node %d, filter_func %p, filter_state %p)\n",
              num_pages, alignment, node_id, filter_func, filter_state);
        return NULL;
    }

    pg_allocs += num_pages;

#if ALLOC_ZERO_MEM
    memset(__va(pg_addr), 0, num_pages * 4096);
#endif

    MEMCHECK_ALLOC_PAGES(pg_addr, num_pages * 4096);

    return pg_addr;
}

/**
 * Frees pages previously allocated via palacios_allocate_pages().
 * The caller passes the physical address of the first page in the
 * region and the number of pages to free.
 */
void palacios_free_pages(void * page_paddr, int num_pages)
{
    if (!page_paddr) {
        ERROR("Ignoring free pages: 0x%p (0x%lx) for %d pages\n", page_paddr, (uintptr_t)page_paddr, num_pages);
        dump_stack();
        return;
    }

    pg_frees += num_pages;
    free_palacios_pgs((uintptr_t)page_paddr, num_pages);

    MEMCHECK_FREE_PAGES(page_paddr, num_pages * 4096);
}
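/*
 * Illustrative sketch only (not part of the original file): a hypothetical
 * host-side caller of the page allocator above.  palacios_allocate_pages()
 * returns a *physical* address, so the memory must be converted with __va()
 * (or palacios_paddr_to_vaddr()) before the CPU touches it.  The alignment
 * of 4096 and node_id of -1 ("any node") are assumed typical values.
 */
#if 0
static void example_page_alloc_use(void)
{
    void *paddr = palacios_allocate_pages(1, 4096, -1, NULL, NULL);

    if (paddr) {
        memset(__va(paddr), 0xAA, 4096);   /* touch it through the kernel virtual mapping */
        palacios_free_pages(paddr, 1);
    }
}
#endif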
void * palacios_alloc_extended(unsigned int size, unsigned int flags, int node)
{
    void * addr = NULL;

    if (size == 0) {
        // note that modern kernels will respond to a zero byte
        // kmalloc and return the address 0x10...  In Palacios,
        // we will simply not allow 0 byte allocs at all, of any kind
        ERROR("ALERT ALERT attempt to kmalloc zero bytes rejected\n");
        return NULL;
    }

    if (node == -1) {
        addr = kmalloc(size + 2 * ALLOC_PAD, flags);
    } else {
        addr = kmalloc_node(size + 2 * ALLOC_PAD, flags, node);
    }

    if (!addr || IS_ERR(addr)) {
        ERROR("ALERT ALERT kmalloc has FAILED FAILED FAILED\n");
        return NULL;
    }

    mallocs++;

#if ALLOC_ZERO_MEM
    memset(addr, 0, size + 2 * ALLOC_PAD);
#endif

    MEMCHECK_KMALLOC(addr, size + 2 * ALLOC_PAD);

    return addr + ALLOC_PAD;
}

void * palacios_valloc(unsigned int size)
{
    void * addr = NULL;

    if (size == 0) {
        ERROR("ALERT ALERT attempt to vmalloc zero bytes rejected\n");
        return NULL;
    }

    addr = vmalloc(size);

    if (!addr || IS_ERR(addr)) {
        ERROR("ALERT ALERT vmalloc has FAILED FAILED FAILED\n");
        return NULL;
    }

    vmallocs++;

#if ALLOC_ZERO_MEM
    memset(addr, 0, size);
#endif

    MEMCHECK_VMALLOC(addr, size);

    return addr;
}

void palacios_vfree(void *p)
{
    if (!p) {
        ERROR("Ignoring vfree: 0x%p\n", p);
        dump_stack();
        return;
    }

    vfree(p);
    vfrees++;

    MEMCHECK_VFREE(p);
}

/**
 * Allocates 'size' bytes of kernel memory.
 * Returns the kernel virtual address of the memory allocated.
 */
void * palacios_alloc(unsigned int size)
{
    // It is very important that this test remains, since this function is
    // used extensively throughout palacios and the linux module, both in
    // places where interrupts are off and where they are on.  A GFP_KERNEL
    // allocation done with interrupts off can lead to DEADLOCK.
    if (irqs_disabled() || in_atomic()) {
        return palacios_alloc_extended(size, GFP_ATOMIC, -1);
    } else {
        return palacios_alloc_extended(size, GFP_KERNEL, -1);
    }
}

/**
 * Frees memory that was previously allocated by palacios_alloc().
 */
void palacios_free(void * addr)
{
    if (!addr) {
        ERROR("Ignoring free : 0x%p\n", addr);
        dump_stack();
        return;
    }

    frees++;
    kfree(addr - ALLOC_PAD);

    MEMCHECK_KFREE(addr - ALLOC_PAD);
}

/**
 * Converts a kernel virtual address to the corresponding physical address.
 */
void * palacios_vaddr_to_paddr(void * vaddr)
{
    return (void *)__pa(vaddr);
}

/**
 * Converts a physical address to the corresponding kernel virtual address.
 */
void * palacios_paddr_to_vaddr(void * paddr)
{
    return __va(paddr);
}

/**
 * Runs a function on the specified CPU.
 */
void palacios_xcall(int cpu_id, void (*fn)(void *arg), void * arg)
{
    // We set wait to 1, but I'm not sure this is necessary
    smp_call_function_single(cpu_id, fn, arg, 1);

    return;
}

#define MAX_THREAD_NAME 32

struct lnx_thread_arg {
    int (*fn)(void * arg);
    void * arg;
    char name[MAX_THREAD_NAME];
};

static int lnx_thread_target(void * arg)
{
    struct lnx_thread_arg * thread_info = (struct lnx_thread_arg *)arg;
    int ret = 0;
    /*
      INFO("Daemonizing new Palacios thread (name=%s)\n", thread_info->name);

      daemonize(thread_info->name);
      allow_signal(SIGKILL);
    */

#ifdef V3_CONFIG_HOST_LAZY_FPU_SWITCH
    // We are a kernel thread that needs FPU save/restore state.
    // vcores definitely need this; all the other threads get it too,
    // but they just won't use it.
    fpu_alloc(&(current->thread.fpu));
#endif

    ret = thread_info->fn(thread_info->arg);

    INFO("Palacios Thread (%s) EXITING\n", thread_info->name);

    palacios_free(thread_info);
    // handle cleanup

    // We rely on do_exit to free the fpu data
    // since we could get switched at any point until the thread is done...
    do_exit(ret);

    return 0; // should not get here.
}
/**
 * Creates a kernel thread.
 */
void * palacios_create_and_start_kernel_thread(int (*fn)(void * arg), void * arg, char * thread_name)
{
    struct lnx_thread_arg * thread_info = palacios_alloc(sizeof(struct lnx_thread_arg));

    if (!thread_info) {
        ERROR("ALERT ALERT Unable to allocate thread\n");
        return NULL;
    }

    thread_info->fn = fn;
    thread_info->arg = arg;
    strncpy(thread_info->name, thread_name, MAX_THREAD_NAME);
    thread_info->name[MAX_THREAD_NAME - 1] = 0;

    return kthread_run(lnx_thread_target, thread_info, thread_info->name);
}

/**
 * Starts a kernel thread on the specified CPU.
 */
void * palacios_create_thread_on_cpu(int cpu_id, int (*fn)(void * arg), void * arg, char * thread_name)
{
    struct task_struct * thread = NULL;
    struct lnx_thread_arg * thread_info = palacios_alloc(sizeof(struct lnx_thread_arg));

    if (!thread_info) {
        ERROR("ALERT ALERT Unable to allocate thread to start on cpu\n");
        return NULL;
    }

    thread_info->fn = fn;
    thread_info->arg = arg;
    strncpy(thread_info->name, thread_name, MAX_THREAD_NAME);
    thread_info->name[MAX_THREAD_NAME - 1] = 0;

    thread = kthread_create(lnx_thread_target, thread_info, thread_info->name);

    if (!thread || IS_ERR(thread)) {
        WARNING("Palacios error creating thread: %s\n", thread_info->name);
        palacios_free(thread_info);
        return NULL;
    }

    if (set_cpus_allowed_ptr(thread, cpumask_of(cpu_id)) != 0) {
        WARNING("Attempt to start thread on disallowed CPU\n");
        kthread_stop(thread);
        palacios_free(thread_info);
        return NULL;
    }

    return thread;
}

void palacios_start_thread(void * th)
{
    struct task_struct * thread = (struct task_struct *)th;

    wake_up_process(thread);
}

/* Convenience wrapper */
void * palacios_create_and_start_thread_on_cpu(int cpu_id, int (*fn)(void * arg), void * arg, char * thread_name)
{
    void *t = palacios_create_thread_on_cpu(cpu_id, fn, arg, thread_name);

    if (t) {
        palacios_start_thread(t);
    }

    return t;
}
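/*
 * Illustrative sketch only (not part of the original file): how a hypothetical
 * caller might use the wrappers above to spawn a worker pinned to CPU 0.
 * The worker function, its argument, and the thread name are made up.
 */
#if 0
static int example_worker(void * arg)
{
    INFO("example worker running with arg %p\n", arg);
    return 0;
}

static void example_spawn_worker(void)
{
    void *t = palacios_create_and_start_thread_on_cpu(0, example_worker, NULL, "example-worker");

    if (!t) {
        ERROR("could not start example worker\n");
    }
}
#endif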
/**
 * Rebinds a kernel thread to the specified CPU.
 * The thread will be running on the target CPU on return.
 * A non-zero return value means failure.
 */
int palacios_move_thread_to_cpu(int new_cpu_id, void * thread_ptr)
{
    struct task_struct * thread = (struct task_struct *)thread_ptr;

    INFO("Moving thread (%p) to cpu %d\n", thread, new_cpu_id);

    if (thread == NULL) {
        thread = current;
    }

    /*
     * Bind to the specified CPU.  When this call returns,
     * the thread should be running on the target CPU.
     */
    return set_cpus_allowed_ptr(thread, cpumask_of(new_cpu_id));
}

/**
 * Returns the CPU ID that the caller is running on.
 */
unsigned int palacios_get_cpu(void)
{
    /* We want to call smp_processor_id(),
     * but this is not safe if kernel preemption is possible.
     * We need to ensure that the palacios threads are bound to a given cpu.
     */
    unsigned int cpu_id = get_cpu();
    put_cpu();
    return cpu_id;
}

/**
 * Interrupts the physical CPU corresponding to the specified logical guest cpu.
 *
 * NOTE:
 * This is dependent on the implementation of xcall_reschedule().  Currently
 * xcall_reschedule does not explicitly call schedule() on the destination CPU,
 * but instead relies on the return to user space to handle it.  Because
 * palacios is a kernel thread, schedule will not be called, which is correct.
 * If it ever changes to induce side effects, we'll need to figure something
 * else out...
 */

#include <asm/apic.h>   /* header name inferred; the code below uses the apic driver directly */

static void palacios_interrupt_cpu(struct v3_vm_info * vm, int cpu_id, int vector)
{
    if (vector == 0) {
        smp_send_reschedule(cpu_id);
    } else {
        apic->send_IPI_mask(cpumask_of(cpu_id), vector);
    }

    return;
}

/**
 * Dispatches an interrupt to Palacios for handling.
 */
static void palacios_dispatch_interrupt(int vector, void * dev, struct pt_regs * regs)
{
    struct v3_interrupt intr = {
        .irq        = vector,
        .error      = regs->orig_ax,
        .should_ack = 1,
    };

    if (irq_to_guest_map[vector]) {
        v3_deliver_irq(irq_to_guest_map[vector], &intr);
    }
}

/**
 * Instructs the kernel to forward the specified IRQ to Palacios.
 */
static int palacios_hook_interrupt(struct v3_vm_info * vm, unsigned int vector)
{
    INFO("hooking vector %d\n", vector);

    if (irq_to_guest_map[vector]) {
        WARNING("%s: Interrupt vector %u is already hooked.\n", __func__, vector);
        return -1;
    }

    DEBUG("%s: Hooking interrupt vector %u to vm %p.\n", __func__, vector, vm);

    irq_to_guest_map[vector] = vm;

    /*
     * NOTE: Normally PCI devices are supposed to be level sensitive,
     *       but we need them to be edge sensitive so that they are
     *       properly latched by Palacios.  Leaving them as level
     *       sensitive would lead to an interrupt storm.
     */
    //ioapic_set_trigger_for_vector(vector, ioapic_edge_sensitive);

    //set_idtvec_handler(vector, palacios_dispatch_interrupt);
    if (vector < 32) {
        ERROR("unexpected vector for hooking\n");
        return -1;
    } else {
        int device_id = 0;
        int flag = 0;
        int error;

        DEBUG("hooking vector: %d\n", vector);

        if (vector == 32) {
            flag = IRQF_TIMER;
        } else {
            flag = IRQF_SHARED;
        }

        error = request_irq((vector - 32), (void *)palacios_dispatch_interrupt, flag,
                            "interrupt_for_palacios", &device_id);

        if (error) {
            ERROR("error code for request_irq is %d\n", error);
            ERROR("request vector %d failed", vector);
            return -1;
        }
    }

    return 0;
}

/**
 * Acknowledges an interrupt.
 */
static int palacios_ack_interrupt(int vector)
{
    ack_APIC_irq();
    DEBUG("Pretending to ack interrupt, vector=%d\n", vector);
    return 0;
}

/**
 * Returns the CPU frequency in kilohertz.
 */
unsigned int palacios_get_cpu_khz(void)
{
    INFO("cpu_khz is %u\n", cpu_khz);

    if (cpu_khz == 0) {
        INFO("faking cpu_khz to 1000000\n");
        return 1000000;
    } else {
        return cpu_khz;
    }
    //return 1000000;
}

/**
 * Yields the CPU so other host OS tasks can run.
 * This will return immediately if there is no other runnable thread,
 * and there is no real bound on how long it will yield.
 */
void palacios_yield_cpu(void)
{
    schedule();
    return;
}

/**
 * Sleeps for (at least) the given number of microseconds, yielding the CPU
 * so other host OS tasks can run.  With us == 0 this behaves like a plain
 * yield, and there is no real bound on how long the sleep will last.
 */
void palacios_sleep_cpu(unsigned int us)
{
    set_current_state(TASK_INTERRUPTIBLE);

    if (us) {
        unsigned int uspj = 1000000U / HZ;
        unsigned int jiffies = us / uspj + ((us % uspj) != 0);    // ceiling
        schedule_timeout(jiffies);
    } else {
        schedule();
    }

    return;
}

void palacios_wakeup_cpu(void *thread)
{
    wake_up_process(thread);
    return;
}
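/*
 * Worked example (illustrative only) of the ceiling computation in
 * palacios_sleep_cpu(): with HZ = 250, uspj = 1000000 / 250 = 4000 us per
 * jiffy.  A request of us = 10000 gives 10000 / 4000 = 2 with a non-zero
 * remainder, so 2 + 1 = 3 jiffies (12000 us) are requested, ensuring the
 * sleep is at least as long as asked for.
 */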
/**
 * Allocates a mutex.
 * Returns NULL on failure.
 */
void * palacios_mutex_alloc(void)
{
    spinlock_t *lock = palacios_alloc(sizeof(spinlock_t));

    if (lock) {
        spin_lock_init(lock);
        LOCKCHECK_ALLOC(lock);
    } else {
        ERROR("ALERT ALERT Unable to allocate lock\n");
        return NULL;
    }

    return lock;
}

void palacios_mutex_init(void *mutex)
{
    spinlock_t *lock = (spinlock_t *)mutex;

    if (lock) {
        spin_lock_init(lock);
        LOCKCHECK_ALLOC(lock);
    }
}

void palacios_mutex_deinit(void *mutex)
{
    spinlock_t *lock = (spinlock_t *)mutex;

    if (lock) {
        // no actual spin_lock_deinit on linux
        // our purpose here is to drive the lock checker
        LOCKCHECK_FREE(lock);
    }
}

/**
 * Frees a mutex.
 */
void palacios_mutex_free(void * mutex)
{
    palacios_free(mutex);
    LOCKCHECK_FREE(mutex);
}

/**
 * Locks a mutex.
 */
void palacios_mutex_lock(void * mutex, int must_spin)
{
    LOCKCHECK_LOCK_PRE(mutex);
    spin_lock((spinlock_t *)mutex);
    LOCKCHECK_LOCK_POST(mutex);
}

/**
 * Locks a mutex, disabling interrupts on this core.
 */
void * palacios_mutex_lock_irqsave(void * mutex, int must_spin)
{
    unsigned long flags;

    LOCKCHECK_LOCK_IRQSAVE_PRE(mutex, flags);
    spin_lock_irqsave((spinlock_t *)mutex, flags);
    LOCKCHECK_LOCK_IRQSAVE_POST(mutex, flags);

    return (void *)flags;
}

/**
 * Unlocks a mutex.
 */
void palacios_mutex_unlock(void * mutex)
{
    LOCKCHECK_UNLOCK_PRE(mutex);
    spin_unlock((spinlock_t *)mutex);
    LOCKCHECK_UNLOCK_POST(mutex);
}

/**
 * Unlocks a mutex and restores the previous interrupt state on this core.
 */
void palacios_mutex_unlock_irqrestore(void *mutex, void *flags)
{
    LOCKCHECK_UNLOCK_IRQRESTORE_PRE(mutex, (unsigned long)flags);
    // This is correct; flags is opaque
    spin_unlock_irqrestore((spinlock_t *)mutex, (unsigned long)flags);
    LOCKCHECK_UNLOCK_IRQRESTORE_POST(mutex, (unsigned long)flags);
}
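/*
 * Illustrative sketch only (not part of the original file): a hypothetical
 * caller using the irqsave lock variants above.  The void * returned by
 * palacios_mutex_lock_irqsave() simply carries the saved flags and must be
 * passed back, unchanged, to palacios_mutex_unlock_irqrestore().
 */
#if 0
static void *example_lock;           /* assumed to come from palacios_mutex_alloc() */
static unsigned long example_count;

static void example_locked_increment(void)
{
    void *flags = palacios_mutex_lock_irqsave(example_lock, 1);
    example_count++;
    palacios_mutex_unlock_irqrestore(example_lock, flags);
}
#endif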
void palacios_used_fpu(void)
{
    // We assume we are not preemptible here...
#ifndef TS_USEDFPU
    struct task_struct *tsk = current;
    tsk->thread.fpu.has_fpu = 1;
#else
    struct thread_info *cur = current_thread_info();
    cur->status |= TS_USEDFPU;
#endif
    clts();
    // After this, FP Save should be handled by Linux if it
    // switches to a different task and that task uses FPU
}

inline int ists(void)
{
    return read_cr0() & X86_CR0_TS;
}

void palacios_need_fpu(void)
{
    // We assume we are not preemptible here...
    if (ists()) {
        // we have been switched back to from somewhere else...
        // Do a restore now - this will also do a clts()
        math_state_restore();
    }
}

/**
 * Structure used by the Palacios hypervisor to interface with the host kernel.
 */
static struct v3_os_hooks palacios_os_hooks = {
    .print                   = palacios_print_scoped,
    .allocate_pages          = palacios_allocate_pages,
    .free_pages              = palacios_free_pages,
    .vmalloc                 = palacios_valloc,
    .vfree                   = palacios_vfree,
    .malloc                  = palacios_alloc,
    .free                    = palacios_free,
    .vaddr_to_paddr          = palacios_vaddr_to_paddr,
    .paddr_to_vaddr          = palacios_paddr_to_vaddr,
    .hook_interrupt          = palacios_hook_interrupt,
    .ack_irq                 = palacios_ack_interrupt,
    .get_cpu_khz             = palacios_get_cpu_khz,
    .start_kernel_thread     = palacios_create_and_start_kernel_thread,
    .yield_cpu               = palacios_yield_cpu,
    .sleep_cpu               = palacios_sleep_cpu,
    .wakeup_cpu              = palacios_wakeup_cpu,
    .mutex_alloc             = palacios_mutex_alloc,
    .mutex_free              = palacios_mutex_free,
    .mutex_lock              = palacios_mutex_lock,
    .mutex_unlock            = palacios_mutex_unlock,
    .mutex_lock_irqsave      = palacios_mutex_lock_irqsave,
    .mutex_unlock_irqrestore = palacios_mutex_unlock_irqrestore,
    .get_cpu                 = palacios_get_cpu,
    .interrupt_cpu           = palacios_interrupt_cpu,
    .call_on_cpu             = palacios_xcall,
    .create_thread_on_cpu    = palacios_create_thread_on_cpu,
    .start_thread            = palacios_start_thread,
    .move_thread_to_cpu      = palacios_move_thread_to_cpu,
};


#ifdef V3_CONFIG_HOST_LAZY_FPU_SWITCH
// Note that this host interface is defined here since it's
// intertwined with thread creation...
static struct v3_lazy_fpu_iface palacios_fpu_hooks = {
    .used_fpu = palacios_used_fpu,
    .need_fpu = palacios_need_fpu
};
#endif


int palacios_vmm_init(char *options)
{
    int num_cpus = num_online_cpus();
    char * cpu_mask = NULL;

    if (cpu_list_len > 0) {
        int major = 0;
        int minor = 0;
        int i = 0;

        cpu_mask = palacios_alloc((num_cpus / 8) + 1);

        if (!cpu_mask) {
            ERROR("Cannot allocate cpu mask\n");
            return -1;
        }

        memset(cpu_mask, 0, (num_cpus / 8) + 1);

        for (i = 0; i < cpu_list_len; i++) {
            if (cpu_list[i] >= num_cpus) {
                WARNING("CPU (%d) exceeds number of available CPUs.  Ignoring...\n", cpu_list[i]);
                continue;
            }

            major = cpu_list[i] / 8;
            minor = cpu_list[i] % 8;

            *(cpu_mask + major) |= (0x1 << minor);
        }
    }

    memset(irq_to_guest_map, 0, sizeof(struct v3_vm_info *) * 256);

    if (init_print_buffers()) {
        ERROR("Cannot initialize print buffers\n");
        if (cpu_mask) {
            palacios_free(cpu_mask);
        }
        return -1;
    }

    INFO("palacios_init starting - calling init_v3\n");

    Init_V3(&palacios_os_hooks, cpu_mask, num_cpus, options);

#ifdef V3_CONFIG_HOST_LAZY_FPU_SWITCH
    V3_Init_Lazy_FPU(&palacios_fpu_hooks);
#endif

    return 0;
}


int palacios_vmm_exit(void)
{
    Shutdown_V3();

    INFO("palacios shutdown complete\n");

    deinit_print_buffers();

    return 0;
}
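/*
 * Illustrative sketch only (not part of the original file): how a hypothetical
 * module entry point might drive palacios_vmm_init()/palacios_vmm_exit().
 * The real module init/exit code lives elsewhere in the Linux module; the
 * function names and the empty options string below are made up.
 */
#if 0
static int __init example_palacios_init(void)
{
    /* options would normally come from a module parameter */
    return palacios_vmm_init("");
}

static void __exit example_palacios_exit(void)
{
    palacios_vmm_exit();
}
#endif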