1 #include <linux/kernel.h>
2 #include <linux/kthread.h>
3 #include <linux/spinlock.h>
5 #include <linux/interrupt.h>
6 #include <linux/linkage.h>
7 #include <linux/sched.h>
8 #include <linux/uaccess.h>
9 #include <asm/irq_vectors.h>
11 #include <asm/thread_info.h>
13 #include <linux/version.h>
14 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,0)
15 #include <asm/fpu-internal.h>
18 #include <linux/init.h>
19 #include <linux/module.h>
20 #include <linux/kthread.h>
21 #include <asm/uaccess.h>
22 #include <linux/smp.h>
23 #include <linux/vmalloc.h>
27 #include <palacios/vmm.h>
28 #include <palacios/vmm_host_events.h>
30 #ifdef V3_CONFIG_HOST_LAZY_FPU_SWITCH
31 #include <interfaces/vmm_lazy_fpu.h>
36 #include "util-hashtable.h"
41 #include "lockcheck.h"
45 // The following can be used to track memory bugs
46 // zero memory after allocation (now applies to valloc and page alloc as well)
47 #define ALLOC_ZERO_MEM 1
48 // pad allocations by this many bytes on both ends of block (heap only)
// Per-vector table (256 entries) recording which VM, if any, an interrupt
// vector has been hooked to; written by palacios_hook_interrupt() and read
// by palacios_dispatch_interrupt().
59 static struct v3_vm_info * irq_to_guest_map[256];
// Defined elsewhere; logged by palacios_get_cpu_khz().
62 extern unsigned int cpu_khz;
// Defined elsewhere; list of CPU ids (and its length) that
// palacios_vmm_init() turns into a CPU bitmask.
64 extern int cpu_list[NR_CPUS];
65 extern int cpu_list_len;
// Defined elsewhere; maps a thread (keyed by its task pointer) to the
// v3_resource_control_t registered for it.
68 extern struct hashtable *v3_thread_resource_map;
// One print buffer pointer per possible CPU, indexed by CPU id.
71 static char *print_buffer[NR_CPUS];
// Walks all NR_CPUS slots of print_buffer and hands every non-NULL buffer
// to palacios_free().
73 static void deinit_print_buffers(void)
77 for (i=0;i<NR_CPUS;i++) {
78 if (print_buffer[i]) {
79 palacios_free(print_buffer[i]);
// Clears the print_buffer table and, when V3_PRINTK_OLD_STYLE_OUTPUT is not
// set, allocates and zeroes one V3_PRINTK_BUF_SIZE buffer per possible CPU.
// palacios_vmm_init() treats a non-zero return value as failure.
85 static int init_print_buffers(void)
// Start with every slot NULL so a partial allocation can be unwound.
89 memset(print_buffer,0,sizeof(char*)*NR_CPUS);
91 #if !V3_PRINTK_OLD_STYLE_OUTPUT
93 for (i=0;i<NR_CPUS;i++) {
94 print_buffer[i] = palacios_alloc(V3_PRINTK_BUF_SIZE);
95 if (!print_buffer[i]) {
96 ERROR("Cannot allocate print buffer for cpu %d\n",i);
// Release the buffers already allocated for earlier CPUs.
97 deinit_print_buffers();
100 memset(print_buffer[i],0,V3_PRINTK_BUF_SIZE);
110 * Prints a message to the console.
// printf-style console output routine, installed as the .print hook.
// vm is an opaque pointer that is interpreted as a struct v3_guest.
// The three printk formats below tag the message with pcore + vm + vcore,
// pcore + vm, or pcore only.
// NOTE(review): the conditions selecting between those formats are not
// visible here - presumably they depend on whether vm and vcore are supplied.
112 void palacios_print_scoped(void * vm, int vcore, const char *fmt, ...) {
114 #if V3_PRINTK_OLD_STYLE_OUTPUT
128 unsigned int cpu = palacios_get_cpu();
129 struct v3_guest *guest = (struct v3_guest *)vm;
// Format into the buffer that belongs to the CPU we are running on.
131 buf = print_buffer[cpu];
134 printk(KERN_INFO "palacios (pcore %u): output skipped - no allocated buffer\n",cpu);
// Output is bounded by the buffer size.
139 vsnprintf(buf,V3_PRINTK_BUF_SIZE, fmt, ap);
// Optional scan of the formatted text; the alert below reports 8-bit characters.
142 #if V3_PRINTK_CHECK_7BIT
// NOTE(review): strlen(buf) is re-evaluated on every loop iteration.
146 for (i=0;i<strlen(buf);i++) {
153 printk(KERN_INFO "palacios (pcore %u): ALERT ALERT 8 BIT CHAR (c=%d) DETECTED\n", cpu,c);
160 printk(KERN_INFO "palacios (pcore %u vm %s vcore %u): %s",
166 printk(KERN_INFO "palacios (pcore %u vm %s): %s",
172 printk(KERN_INFO "palacios (pcore %u): %s",
185 * Allocates a contiguous region of pages of the requested size.
186 * Returns the physical address of the first page in the region.
// Page allocation routine, installed as the .allocate_pages hook.
// Obtains num_pages pages from alloc_palacios_pgs() and tracks the result in
// pg_addr, which is treated as a physical address (see the __va() use below).
// If the calling thread has an entry in v3_thread_resource_map, fields of
// that resource-control structure can replace the caller's alignment, node
// and filter arguments.
188 void *palacios_allocate_pages(int num_pages, unsigned int alignment, int node_id, int (*filter_func)(void *paddr, void *filter_state), void *filter_state) {
189 void * pg_addr = NULL;
190 v3_resource_control_t *r;
193 ERROR("ALERT ALERT Attempt to allocate zero or fewer pages (%d pages, alignment %d, node %d, filter_func %p, filter_state %p)\n",num_pages, alignment, node_id, filter_func, filter_state);
// Look up per-thread overrides, keyed by the current task pointer.
197 if ((r=(v3_resource_control_t *)palacios_htable_search(v3_thread_resource_map,(addr_t)current))) {
198 // thread has a registered resource control structure
199 // these override any default values
200 // INFO("Overridden page search: (pre) alignment=%x, node_id=%x, filter_func=%p, filter_state=%p\n",alignment,node_id,filter_func,filter_state);
// Only an alignment of exactly 4096 is replaced by the thread's setting.
201 if (alignment==4096) {
202 alignment = r->pg_alignment;
// NOTE(review): the conditions guarding the node and filter overrides are
// not visible here.
205 node_id = r->pg_node_id;
208 filter_func = r->pg_filter_func;
209 filter_state = r->pg_filter_state;
211 //INFO("Overridden page search: (post) alignment=%x, node_id=%x, filter_func=%p, filter_state=%p\n",alignment,node_id,filter_func,filter_state);
214 pg_addr = (void *)alloc_palacios_pgs(num_pages, alignment, node_id, filter_func, filter_state);
217 ERROR("ALERT ALERT Page allocation has FAILED Warning (%d pages, alignment %d, node %d, filter_func %p, filter_state %p)\n",num_pages, alignment, node_id, filter_func, filter_state);
// Running count of pages handed out.
221 pg_allocs += num_pages;
// Zero the new pages through their kernel virtual mapping.
// NOTE(review): num_pages*4096 is evaluated in int arithmetic - confirm it
// cannot overflow for the largest request.
224 memset(__va(pg_addr),0,num_pages*4096);
227 MEMCHECK_ALLOC_PAGES(pg_addr,num_pages*4096);
234 * Frees pages previously allocated via palacios_allocate_pages().
235 * page_paddr is the physical address of the first page and
236 * num_pages is the number of pages to release.
// Page release routine, installed as the .free_pages hook.
// Hands num_pages pages starting at physical address page_paddr to
// free_palacios_pgs(), updating the pg_frees counter and the memory checker.
239 void palacios_free_pages(void * page_paddr, int num_pages) {
// Logged when a request is ignored; the rejecting condition is not visible here.
241 ERROR("Ignoring free pages: 0x%p (0x%lx)for %d pages\n", page_paddr, (uintptr_t)page_paddr, num_pages);
// Running count of pages given back.
245 pg_frees += num_pages;
246 free_palacios_pgs((uintptr_t)page_paddr, num_pages);
247 MEMCHECK_FREE_PAGES(page_paddr,num_pages*4096);
// kmalloc wrapper taking explicit GFP flags and a NUMA node.
// Allocates size bytes plus ALLOC_PAD bytes of padding on each side and
// returns a pointer just past the leading pad.
// NOTE(review): both kmalloc() and kmalloc_node() calls are visible but the
// condition choosing between them is not - presumably node == -1 selects
// plain kmalloc(); confirm.
253 palacios_alloc_extended(unsigned int size, unsigned int flags, int node) {
257 // note that modern kernels will respond to a zero byte
258 // kmalloc and return the address 0x10... In Palacios,
259 // we will simply not allow 0 byte allocs at all, of any kind
260 ERROR("ALERT ALERT attempt to kmalloc zero bytes rejected\n");
265 addr = kmalloc(size+2*ALLOC_PAD, flags);
267 addr = kmalloc_node(size+2*ALLOC_PAD, flags, node);
// Treat both NULL and error-encoded pointers as allocation failure.
270 if (!addr || IS_ERR(addr)) {
271 ERROR("ALERT ALERT kmalloc has FAILED FAILED FAILED\n");
// Zero the whole block, padding included.
278 memset(addr,0,size+2*ALLOC_PAD);
// The memory checker is told about the raw (unpadded-offset) address.
281 MEMCHECK_KMALLOC(addr,size+2*ALLOC_PAD);
// Callers see the address after the leading pad; the free path subtracts
// ALLOC_PAD again before calling kfree().
283 return addr+ALLOC_PAD;
// vmalloc wrapper, installed as the .vmalloc hook.
// Logs an error for zero-byte requests and for failed allocations, and
// registers successful allocations with the memory checker.
287 palacios_valloc(unsigned int size)
292 ERROR("ALERT ALERT attempt to vmalloc zero bytes rejected\n");
296 addr = vmalloc(size);
// Treat both NULL and error-encoded pointers as allocation failure.
298 if (!addr || IS_ERR(addr)) {
299 ERROR("ALERT ALERT vmalloc has FAILED FAILED FAILED\n");
309 MEMCHECK_VMALLOC(addr,size);
// Release routine for palacios_valloc() memory, installed as the .vfree hook.
314 void palacios_vfree(void *p)
// Logged when the pointer is ignored.
// NOTE(review): the guarding condition is not visible - presumably p == NULL.
317 ERROR("Ignoring vfree: 0x%p\n",p);
327 * Allocates 'size' bytes of kernel memory.
328 * Returns the kernel virtual address of the memory allocated.
// General-purpose allocator, installed as the .malloc hook.
// Forwards to palacios_alloc_extended() with node -1, choosing GFP_ATOMIC
// when interrupts are disabled or the caller is in atomic context and
// GFP_KERNEL otherwise.
331 palacios_alloc(unsigned int size) {
333 // It is very important that this test remains since
334 // this function is used extensively throughout palacios and the linux
335 // module, both in places where interrupts are off and where they are on
336 // a GFP_KERNEL call, when done with interrupts off can lead to DEADLOCK
337 if (irqs_disabled() || in_atomic()) {
338 return palacios_alloc_extended(size,GFP_ATOMIC,-1);
340 return palacios_alloc_extended(size,GFP_KERNEL,-1);
346 * Frees memory that was previously allocated by palacios_alloc().
// Body of the kernel-memory free routine.
// NOTE(review): the signature line is not visible here - presumably this is
// palacios_free(void *addr), the function installed as the .free hook.
354 ERROR("Ignoring free : 0x%p\n", addr);
// Allocations are handed out ALLOC_PAD bytes past the real kmalloc address,
// so step back by ALLOC_PAD before freeing.
359 kfree(addr-ALLOC_PAD);
360 MEMCHECK_KFREE(addr-ALLOC_PAD);
364 * Converts a kernel virtual address to the corresponding physical address.
// Address translation routine, installed as the .vaddr_to_paddr hook.
// Returns __pa(vaddr) cast to a pointer.
367 palacios_vaddr_to_paddr(
371 return (void*) __pa(vaddr);
376 * Converts a physical address to the corresponding kernel virtual address.
// Address translation routine, installed as the .paddr_to_vaddr hook.
// NOTE(review): the body is not visible here - presumably the inverse of
// palacios_vaddr_to_paddr(); confirm.
379 palacios_paddr_to_vaddr(
387 * Runs a function on the specified CPU.
// Cross-CPU call routine.
// NOTE(review): the start of the signature is not visible here - presumably
// this is palacios_xcall, the function installed as the .call_on_cpu hook.
392 void (*fn)(void *arg),
398 // We set wait to 1, but I'm not sure this is necessary
// Runs fn(arg) on cpu_id; the final argument is the wait flag.
399 smp_call_function_single(cpu_id, fn, arg, 1);
// Size of the thread-name buffer, including the terminating NUL.
405 #define MAX_THREAD_NAME 32
// Startup package handed to lnx_thread_target(): the function to run, the
// resource-control record to register for the thread, and the thread name.
407 struct lnx_thread_arg {
408 int (*fn)(void * arg);
410 v3_resource_control_t *resource_control;
411 char name[MAX_THREAD_NAME];
// Entry trampoline shared by all Palacios kernel threads.
// arg is the struct lnx_thread_arg built by the thread-creation routines.
// Registers the thread's resource control in v3_thread_resource_map, runs
// the requested function, then removes the registration and frees the
// lnx_thread_arg.
414 static int lnx_thread_target(void * arg) {
415 struct lnx_thread_arg * thread_info = (struct lnx_thread_arg *)arg;
// NOTE(review): the daemonize path is presumably guarded by a condition on a
// line not visible here.
418 INFO("Daemonizing new Palacios thread (name=%s)\n", thread_info->name);
420 daemonize(thread_info->name);
421 allow_signal(SIGKILL);
424 #ifdef V3_CONFIG_HOST_LAZY_FPU_SWITCH
425 // We are a kernel thread that needs FPU save/restore state
426 // vcores definitely need this, all the other threads get it too,
427 // but they just won't use it
429 fpu_alloc(&(current->thread.fpu));
// Key the resource-control record by this thread's task pointer so that
// palacios_allocate_pages() can find it.
432 palacios_htable_insert(v3_thread_resource_map,(addr_t)current,(addr_t)thread_info->resource_control);
// Run the caller's function.
434 ret = thread_info->fn(thread_info->arg);
436 INFO("Palacios Thread (%s) EXITING\n", thread_info->name);
438 palacios_htable_remove(v3_thread_resource_map,(addr_t)current,0);
// The startup package is released here, after the function has returned.
440 palacios_free(thread_info);
443 // We rely on do_exit to free the fpu data
444 // since we could get switched at any point until the thread is done...
448 return 0; // should not get here.
452 * Creates a kernel thread.
// Creates and immediately starts a kernel thread that runs fn(arg) through
// lnx_thread_target(); installed as the .start_kernel_thread hook.
// Returns the value produced by kthread_run().
// NOTE(review): in the lines visible here thread_info is not freed if
// kthread_run() fails - confirm whether that path is handled.
455 palacios_create_and_start_kernel_thread(
456 int (*fn) (void * arg),
459 v3_resource_control_t *resource_control) {
461 struct lnx_thread_arg * thread_info = palacios_alloc(sizeof(struct lnx_thread_arg));
464 ERROR("ALERT ALERT Unable to allocate thread\n");
468 thread_info->fn = fn;
469 thread_info->arg = arg;
// Copy the name and force NUL termination, truncating long names.
470 strncpy(thread_info->name,thread_name,MAX_THREAD_NAME);
471 thread_info->name[MAX_THREAD_NAME-1] =0;
472 thread_info->resource_control = resource_control;
474 return kthread_run( lnx_thread_target, thread_info, thread_info->name );
479 * Starts a kernel thread on the specified CPU.
// Creates a kernel thread restricted to cpu_id without waking it; installed
// as the .create_thread_on_cpu hook. The thread runs fn(arg) through
// lnx_thread_target() once palacios_start_thread() wakes it.
482 palacios_create_thread_on_cpu(int cpu_id,
483 int (*fn)(void * arg),
486 v3_resource_control_t *resource_control) {
487 struct task_struct * thread = NULL;
488 struct lnx_thread_arg * thread_info = palacios_alloc(sizeof(struct lnx_thread_arg));
491 ERROR("ALERT ALERT Unable to allocate thread to start on cpu\n");
495 thread_info->fn = fn;
496 thread_info->arg = arg;
// Copy the name and force NUL termination, truncating long names.
497 strncpy(thread_info->name,thread_name,MAX_THREAD_NAME);
498 thread_info->name[MAX_THREAD_NAME-1] =0;
499 thread_info->resource_control=resource_control;
501 thread = kthread_create( lnx_thread_target, thread_info, thread_info->name );
// Creation failed: the startup package was never handed to a thread, so
// free it here.
503 if (!thread || IS_ERR(thread)) {
504 WARNING("Palacios error creating thread: %s\n", thread_info->name);
505 palacios_free(thread_info);
// Restrict the new thread to the requested CPU; on failure stop the thread
// and free the startup package.
509 if (set_cpus_allowed_ptr(thread, cpumask_of(cpu_id)) != 0) {
510 WARNING("Attempt to start thread on disallowed CPU\n");
511 kthread_stop(thread);
512 palacios_free(thread_info);
// Wakes a thread; installed as the .start_thread hook.
// th is an opaque handle that is treated as a struct task_struct pointer.
520 palacios_start_thread(void * th){
522 struct task_struct * thread = (struct task_struct *)th;
523 wake_up_process(thread);
// Convenience wrapper: creates a thread on cpu_id with
// palacios_create_thread_on_cpu() and then starts it with
// palacios_start_thread().
531 palacios_create_and_start_thread_on_cpu(int cpu_id,
532 int (*fn)(void * arg),
535 v3_resource_control_t *resource_control) {
537 void *t = palacios_create_thread_on_cpu(cpu_id, fn, arg, thread_name, resource_control);
540 palacios_start_thread(t);
549 * Rebind a kernel thread to the specified CPU
550 * The thread will be running on target CPU on return
551 * non-zero return means failure
// Rebinds an existing thread to new_cpu_id; installed as the
// .move_thread_to_cpu hook. The thread handle is treated as a
// struct task_struct pointer, and the result of set_cpus_allowed_ptr() is
// returned to the caller.
554 palacios_move_thread_to_cpu(int new_cpu_id,
556 struct task_struct * thread = (struct task_struct *)thread_ptr;
558 INFO("Moving thread (%p) to cpu %d\n", thread, new_cpu_id);
// A NULL handle is checked for before the affinity change.
560 if (thread == NULL) {
565 * Bind to the specified CPU. When this call returns,
566 * the thread should be running on the target CPU.
568 return set_cpus_allowed_ptr(thread, cpumask_of(new_cpu_id));
573 * Returns the CPU ID that the caller is running on.
// CPU id query, installed as the .get_cpu hook; obtains the id with get_cpu().
// NOTE(review): get_cpu() disables preemption; the matching put_cpu() is not
// visible in the lines shown here - confirm it is present.
576 palacios_get_cpu(void)
579 /* We want to call smp_processor_id()
580 * But this is not safe if kernel preemption is possible
581 * We need to ensure that the palacios threads are bound to a give cpu
584 unsigned int cpu_id = get_cpu();
590 * Interrupts the physical CPU corresponding to the specified logical guest cpu.
593 * This is dependent on the implementation of xcall_reschedule(). Currently
594 * xcall_reschedule does not explicitly call schedule() on the destination CPU,
595 * but instead relies on the return to user space to handle it. Because
596 * palacios is a kernel thread schedule will not be called, which is correct.
597 * If it ever changes to induce side effects, we'll need to figure something
601 #include <asm/apic.h>
// Interrupts physical CPU cpu_id on behalf of vm; installed as the
// .interrupt_cpu hook. Two mechanisms are visible: a reschedule IPI, or an
// IPI carrying an explicit vector sent through the APIC.
// NOTE(review): the condition choosing between them is not visible here -
// presumably vector == 0 selects the reschedule IPI; confirm.
604 palacios_interrupt_cpu(
605 struct v3_vm_info * vm,
611 smp_send_reschedule(cpu_id);
613 apic->send_IPI_mask(cpumask_of(cpu_id), vector);
619 * Dispatches an interrupt to Palacios for handling.
// IRQ handler that palacios_hook_interrupt() registers with request_irq().
// Packages the interrupt into a struct v3_interrupt and, if the vector is
// mapped to a VM in irq_to_guest_map, delivers it with v3_deliver_irq().
// NOTE(review): vector indexes the 256-entry table without a range check in
// the lines visible here.
622 palacios_dispatch_interrupt( int vector, void * dev, struct pt_regs * regs ) {
623 struct v3_interrupt intr = {
625 .error = regs->orig_ax,
629 if (irq_to_guest_map[vector]) {
630 v3_deliver_irq(irq_to_guest_map[vector], &intr);
636 * Instructs the kernel to forward the specified IRQ to Palacios.
// Forwards a host interrupt vector to a VM; installed as the
// .hook_interrupt hook. Records vm in irq_to_guest_map[vector] and requests
// the corresponding Linux IRQ with palacios_dispatch_interrupt as handler.
// NOTE(review): vector indexes the 256-entry table without a range check in
// the lines visible here.
639 palacios_hook_interrupt(struct v3_vm_info * vm,
640 unsigned int vector ) {
641 INFO("hooking vector %d\n", vector);
// A vector can be hooked by only one VM at a time.
643 if (irq_to_guest_map[vector]) {
645 "%s: Interrupt vector %u is already hooked.\n",
651 "%s: Hooking interrupt vector %u to vm %p.\n",
652 __func__, vector, vm);
654 irq_to_guest_map[vector] = vm;
657 * NOTE: Normally PCI devices are supposed to be level sensitive,
658 * but we need them to be edge sensitive so that they are
659 * properly latched by Palacios. Leaving them as level
660 * sensitive would lead to an interrupt storm.
662 //ioapic_set_trigger_for_vector(vector, ioapic_edge_sensitive);
664 //set_idtvec_handler(vector, palacios_dispatch_interrupt);
666 ERROR("unexpected vector for hooking\n");
674 DEBUG("hooking vector: %d\n", vector);
// The IRQ number passed to Linux is the vector minus 32.
682 error = request_irq((vector - 32),
683 (void *)palacios_dispatch_interrupt,
685 "interrupt_for_palacios",
689 ERROR("error code for request_irq is %d\n", error);
690 ERROR("request vector %d failed", vector);
701 * Acknowledges an interrupt.
// Interrupt acknowledgement routine, installed as the .ack_irq hook.
// The only action visible here is a DEBUG message stating that the ack is
// pretended.
704 palacios_ack_interrupt(
709 DEBUG("Pretending to ack interrupt, vector=%d\n", vector);
714 * Returns the CPU frequency in kilohertz.
// CPU frequency query, installed as the .get_cpu_khz hook.
// Logs the kernel's cpu_khz value; a second path logs that the value is
// being faked as 1000000.
// NOTE(review): the condition selecting the fake value is not visible here -
// presumably cpu_khz == 0; confirm.
717 palacios_get_cpu_khz(void)
719 INFO("cpu_khz is %u\n", cpu_khz);
722 INFO("faking cpu_khz to 1000000\n");
731 * Yield the CPU so other host OS tasks can run.
732 * This will return immediately if there is no other thread that is runnable
733 * And there is no real bound on how long it will yield
// CPU yield routine, installed as the .yield_cpu hook.
// NOTE(review): the body is not visible in the lines shown here.
736 palacios_yield_cpu(void)
743 * Put the calling thread to sleep so other host OS tasks can run.
744 * The requested time in microseconds is rounded up to whole jiffies.
745 * And there is no real bound on how long it will actually sleep
// Sleep routine, installed as the .sleep_cpu hook.
// Marks the calling thread TASK_INTERRUPTIBLE and calls schedule_timeout()
// with us microseconds converted to jiffies, rounded up.
747 void palacios_sleep_cpu(unsigned int us)
750 set_current_state(TASK_INTERRUPTIBLE);
// Microseconds per jiffy, derived from the kernel tick rate HZ.
752 unsigned int uspj = 1000000U/HZ;
// NOTE(review): this local has the same name as the kernel's global jiffies
// counter and hides it inside this scope.
753 unsigned int jiffies = us/uspj + ((us%uspj) !=0); // ceiling
754 schedule_timeout(jiffies);
// Wake-up routine, installed as the .wakeup_cpu hook.
// thread is an opaque handle passed straight to wake_up_process().
761 void palacios_wakeup_cpu(void *thread)
763 wake_up_process(thread);
769 * Returns NULL on failure.
// Allocates a spinlock with palacios_alloc(), initialises it and registers
// it with the lock checker; installed as the .mutex_alloc hook.
// An error is logged when the allocation fails.
772 palacios_mutex_alloc(void)
774 spinlock_t *lock = palacios_alloc(sizeof(spinlock_t));
777 spin_lock_init(lock);
778 LOCKCHECK_ALLOC(lock);
780 ERROR("ALERT ALERT Unable to allocate lock\n");
// Initialises caller-provided storage as a spinlock and registers it with
// the lock checker. mutex is treated as a spinlock_t pointer.
787 void palacios_mutex_init(void *mutex)
789 spinlock_t *lock = (spinlock_t*)mutex;
792 spin_lock_init(lock);
793 LOCKCHECK_ALLOC(lock);
// Counterpart of palacios_mutex_init(): unregisters the lock from the lock
// checker. The storage itself is not freed in the lines visible here.
797 void palacios_mutex_deinit(void *mutex)
799 spinlock_t *lock = (spinlock_t*)mutex;
802 // no actual spin_lock_deinit on linux
803 // our purpose here is to drive the lock checker
804 LOCKCHECK_FREE(lock);
// Frees a lock obtained from palacios_mutex_alloc() and unregisters it from
// the lock checker; installed as the .mutex_free hook.
// NOTE(review): LOCKCHECK_FREE() runs after palacios_free(mutex) - confirm
// that the checker only uses the pointer value and never dereferences it.
813 palacios_mutex_free(void * mutex) {
814 palacios_free(mutex);
815 LOCKCHECK_FREE(mutex);
// Acquires the spinlock behind mutex, bracketed by lock-checker calls;
// installed as the .mutex_lock hook.
// must_spin is not referenced in the lines visible here.
822 palacios_mutex_lock(void * mutex, int must_spin) {
824 LOCKCHECK_LOCK_PRE(mutex);
825 spin_lock((spinlock_t *)mutex);
826 LOCKCHECK_LOCK_POST(mutex);
831 * Locks a mutex, disabling interrupts on this core
// Acquires the spinlock behind mutex with spin_lock_irqsave(); installed as
// the .mutex_lock_irqsave hook. The saved interrupt flags are returned
// packed into a pointer and must be passed back to
// palacios_mutex_unlock_irqrestore().
// must_spin is not referenced in the lines visible here.
834 palacios_mutex_lock_irqsave(void * mutex, int must_spin) {
838 LOCKCHECK_LOCK_IRQSAVE_PRE(mutex,flags);
839 spin_lock_irqsave((spinlock_t *)mutex,flags);
840 LOCKCHECK_LOCK_IRQSAVE_POST(mutex,flags);
// The flags word travels to the caller as an opaque pointer-sized value.
842 return (void *)flags;
// Releases the spinlock behind mutex, bracketed by lock-checker calls;
// installed as the .mutex_unlock hook.
850 palacios_mutex_unlock(
854 LOCKCHECK_UNLOCK_PRE(mutex);
855 spin_unlock((spinlock_t *)mutex);
856 LOCKCHECK_UNLOCK_POST(mutex);
861 * Unlocks a mutex and restores previous interrupt state on this core
// Releases the spinlock behind mutex with spin_unlock_irqrestore();
// installed as the .mutex_unlock_irqrestore hook.
// flags is the opaque value returned by palacios_mutex_lock_irqsave(); it is
// cast back to unsigned long before use.
864 palacios_mutex_unlock_irqrestore(void *mutex, void *flags)
866 LOCKCHECK_UNLOCK_IRQRESTORE_PRE(mutex,(unsigned long)flags);
867 // This is correct, flags is opaque
868 spin_unlock_irqrestore((spinlock_t *)mutex,(unsigned long)flags);
869 LOCKCHECK_UNLOCK_IRQRESTORE_POST(mutex,(unsigned long)flags);
// Marks the current task as having used the FPU; installed as the
// .used_fpu lazy-FPU hook. Two variants are visible: setting
// thread.fpu.has_fpu on the task, or setting TS_USEDFPU in the thread_info
// status word.
// NOTE(review): the selection between them is not visible here - presumably
// a kernel-version check; confirm.
872 void palacios_used_fpu(void)
874 // We assume we are not preemptible here...
876 struct task_struct *tsk = current;
877 tsk->thread.fpu.has_fpu = 1;
879 struct thread_info *cur = current_thread_info();
880 cur->status |= TS_USEDFPU;
883 // After this, FP Save should be handled by Linux if it
884 // switches to a different task and that task uses FPU
// Returns non-zero when the TS bit is set in control register CR0.
887 inline int ists(void)
889 return read_cr0() & X86_CR0_TS;
// Makes the FPU usable by the current task again via math_state_restore();
// installed as the .need_fpu lazy-FPU hook.
// NOTE(review): the condition guarding the restore is not visible here -
// presumably a check using ists(); confirm.
892 void palacios_need_fpu(void)
894 // We assume we are not preemptible here...
896 // we have been switched back to from somewhere else...
897 // Do a restore now - this will also do a clts()
898 math_state_restore();
904 * Structure used by the Palacios hypervisor to interface with the host kernel.
// Table of host-OS callbacks passed to Init_V3() by palacios_vmm_init().
// Each member points at the Linux implementation defined in this file.
906 static struct v3_os_hooks palacios_os_hooks = {
907 .print = palacios_print_scoped,
908 .allocate_pages = palacios_allocate_pages,
909 .free_pages = palacios_free_pages,
910 .vmalloc = palacios_valloc,
911 .vfree = palacios_vfree,
912 .malloc = palacios_alloc,
913 .free = palacios_free,
914 .vaddr_to_paddr = palacios_vaddr_to_paddr,
915 .paddr_to_vaddr = palacios_paddr_to_vaddr,
916 .hook_interrupt = palacios_hook_interrupt,
917 .ack_irq = palacios_ack_interrupt,
918 .get_cpu_khz = palacios_get_cpu_khz,
919 .start_kernel_thread = palacios_create_and_start_kernel_thread,
920 .yield_cpu = palacios_yield_cpu,
921 .sleep_cpu = palacios_sleep_cpu,
922 .wakeup_cpu = palacios_wakeup_cpu,
923 .mutex_alloc = palacios_mutex_alloc,
924 .mutex_free = palacios_mutex_free,
925 .mutex_lock = palacios_mutex_lock,
926 .mutex_unlock = palacios_mutex_unlock,
927 .mutex_lock_irqsave = palacios_mutex_lock_irqsave,
928 .mutex_unlock_irqrestore= palacios_mutex_unlock_irqrestore,
929 .get_cpu = palacios_get_cpu,
930 .interrupt_cpu = palacios_interrupt_cpu,
931 .call_on_cpu = palacios_xcall,
932 .create_thread_on_cpu = palacios_create_thread_on_cpu,
933 .start_thread = palacios_start_thread,
934 .move_thread_to_cpu = palacios_move_thread_to_cpu,
938 #ifdef V3_CONFIG_HOST_LAZY_FPU_SWITCH
939 // Note that this host interface is defined here since it's
940 // intertwined with thread creation...
// Lazy-FPU callback table passed to V3_Init_Lazy_FPU() by
// palacios_vmm_init() when V3_CONFIG_HOST_LAZY_FPU_SWITCH is defined.
941 static struct v3_lazy_fpu_iface palacios_fpu_hooks = {
942 .used_fpu = palacios_used_fpu,
943 .need_fpu = palacios_need_fpu
// Host-side initialisation of the VMM.
// Builds an optional CPU bitmask from cpu_list, clears the IRQ-to-guest map,
// sets up the per-CPU print buffers and calls Init_V3() with the host hook
// table; the lazy-FPU hooks are registered when configured.
// options is passed through to Init_V3() unchanged.
949 int palacios_vmm_init( char *options )
951 int num_cpus = num_online_cpus();
952 char * cpu_mask = NULL;
// A mask is only built when a CPU list was supplied; otherwise cpu_mask
// stays NULL.
954 if (cpu_list_len > 0) {
// One bit per online CPU, rounded up to whole bytes.
959 cpu_mask = palacios_alloc((num_cpus / 8) + 1);
962 ERROR("Cannot allocate cpu mask\n");
966 memset(cpu_mask, 0, (num_cpus / 8) + 1);
968 for (i = 0; i < cpu_list_len; i++) {
// NOTE(review): only an upper-bound check is visible; confirm negative
// cpu_list entries cannot reach the indexing below.
969 if (cpu_list[i] >= num_cpus) {
970 WARNING("CPU (%d) exceeds number of available CPUs. Ignoring...\n", cpu_list[i]);
// major is the byte index in the mask, minor the bit within that byte.
974 major = cpu_list[i] / 8;
975 minor = cpu_list[i] % 8;
977 *(cpu_mask + major) |= (0x1 << minor);
981 memset(irq_to_guest_map, 0, sizeof(struct v3_vm_info *) * 256);
983 if (init_print_buffers()) {
984 ERROR("Cannot initialize print buffers\n");
// Do not leak the mask when bailing out.
985 palacios_free(cpu_mask);
989 INFO("palacios_init starting - calling init_v3\n");
991 Init_V3(&palacios_os_hooks, cpu_mask, num_cpus, options);
993 #ifdef V3_CONFIG_HOST_LAZY_FPU_SWITCH
994 V3_Init_Lazy_FPU(&palacios_fpu_hooks);
// Host-side shutdown of the VMM. The visible steps log completion and
// release the per-CPU print buffers; earlier shutdown steps are not visible
// in the lines shown here.
1002 int palacios_vmm_exit( void ) {
1006 INFO("palacios shutdown complete\n");
// Buffers are released after the final message has been logged.
1008 deinit_print_buffers();