/* Device File Virtualization Host Module (c) Akhil Guliani and William Gross, 2015 Adapted from MPI module (c) 2012 Peter Dinda */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "devfile_hc.h" #define DEEP_DEBUG 1 #define SHALLOW_DEBUG 1 #if DEEP_DEBUG #define DEEP_DEBUG_PRINT(fmt, args...) printk(("devfile: " fmt), ##args) #else #define DEEP_DEBUG_PRINT(fmt, args...) #endif #if SHALLOW_DEBUG #define SHALLOW_DEBUG_PRINT(fmt, args...) printk(("devfile: " fmt), ##args) #else #define SHALLOW_DEBUG_PRINT(fmt, args...) #endif #define ERROR(fmt, args...) printk(("devfile: " fmt), ##args) #define INFO(fmt, args...) printk(("devfile: " fmt), ##args) #define PRINT_CONSOLE(fmt,args...) printf(("devfile: " fmt),##args) // Added to make unique id's for IOCTL #define MY_MACIG 'G' #define INIT_IOCTL _IOR(MY_MACIG, 0, int) #define SHADOW_SYSCALL_DONE _IOW(MY_MACIG, 2, int) #define DEVFILE_NAME "v3-devfile" static int devfile_major_num = 0; static struct class *devfile_class = 0; static struct cdev devfile_dev; struct devfile_state { void *shared_mem_va; uint64_t shared_mem_uva; uint64_t shared_mem_pa; uint64_t returned_fd; wait_queue_head_t user_wait_queue; wait_queue_head_t host_wait_queue; enum { WAIT_FOR_INIT, WAIT_ON_GUEST, WAIT_ON_SHADOW} state; } ; // Currently this proof of concept supports a single userland/VM binding // and is serially reusable static struct devfile_state *state=0; static inline struct devfile_state *find_matching_state(palacios_core_t core) { return state; } /* Hypercall helpers */ static void get_args_64(palacios_core_t core, struct guest_accessors *acc, uint64_t *a1, uint64_t *a2, uint64_t *a3, uint64_t *a4, uint64_t *a5, uint64_t *a6, uint64_t *a7, uint64_t *a8) { *a1 = acc->get_rcx(core); *a2 = acc->get_rdx(core); *a3 = acc->get_rsi(core); *a4 = acc->get_rdi(core); *a5 = acc->get_r8(core); *a6 = acc->get_r9(core); *a7 = acc->get_r10(core); *a8 = acc->get_r11(core); } static void get_args_32(palacios_core_t core, struct guest_accessors *acc, uint64_t *a1, uint64_t *a2, uint64_t *a3, uint64_t *a4, uint64_t *a5, uint64_t *a6, uint64_t *a7, uint64_t *a8) { uint64_t rsp; uint32_t temp; rsp = acc->get_rsp(core); acc->read_gva(core,rsp,4,&temp); *a1=temp; acc->read_gva(core,rsp+4,4,&temp); *a2=temp; acc->read_gva(core,rsp+8,4,&temp); *a3=temp; acc->read_gva(core,rsp+12,4,&temp); *a4=temp; acc->read_gva(core,rsp+16,4,&temp); *a5=temp; acc->read_gva(core,rsp+20,4,&temp); *a6=temp; acc->read_gva(core,rsp+24,4,&temp); *a7=temp; acc->read_gva(core,rsp+28,4,&temp); *a8=temp; } static void get_args(palacios_core_t core, struct guest_accessors *acc, uint64_t *a1, uint64_t *a2, uint64_t *a3, uint64_t *a4, uint64_t *a5, uint64_t *a6, uint64_t *a7, uint64_t *a8) { uint64_t rbx; uint32_t ebx; rbx=acc->get_rbx(core); ebx=rbx&0xffffffff; switch (ebx) { case 0x64646464: DEEP_DEBUG_PRINT("64 bit hcall\n"); return get_args_64(core,acc,a1,a2,a3,a4,a5,a6,a7,a8); break; case 0x32323232: DEEP_DEBUG_PRINT("32 bit hcall\n"); return get_args_32(core,acc,a1,a2,a3,a4,a5,a6,a7,a8); break; default: ERROR("UNKNOWN hcall calling convention\n"); break; } } static void put_return(palacios_core_t core, struct guest_accessors *acc, uint64_t rc, uint64_t errno) { acc->set_rax(core,rc); acc->set_rbx(core,rc); } /* Convert all hypercall pointer arguments from GVAs to GPAs The host userland is responsible for converting from GVAs to HVAs. The assumption here is that any pointer argument points to a structure that does NOT span a page boundary. The guest userland is responsible for assuring that this is the case. */ static int deref_args(palacios_core_t core, struct guest_accessors *acc, uint64_t* a1, uint64_t* a2, uint64_t* a3, uint64_t* a4, uint64_t* a5, uint64_t* a6, uint64_t bvec) { if (bvec & 1){ uint64_t a1tmp = *a1; acc->gva_to_gpa(core,a1tmp,a1); } if (bvec & 2){ uint64_t a2tmp = *a2; acc->gva_to_gpa(core,a2tmp,a2); } if (bvec & 4){ uint64_t a3tmp = *a3; acc->gva_to_gpa(core,a3tmp,a3); } if (bvec & 8){ uint64_t a4tmp = *a4; acc->gva_to_gpa(core,a4tmp,a4); } if (bvec & 16){ uint64_t a5tmp = *a5; acc->gva_to_gpa(core,a5tmp,a5); } if (bvec & 32){ uint64_t a6tmp = *a6; acc->gva_to_gpa(core,a6tmp,a6); } return 0; } #if 0 /* Create /dev/v3-devfile in the host */ // User mode helper call to create module private chardev for ioctls static int setup_mknod_call(int major_num) { //www.ibm.com/developerworks/library/l-user-space-apps/ struct subprocess_info *sub_info; char buf[20]; snprintf(buf,20,"%d",major_num); const char *argv[] = { "/bin/mknod", "/dev/" DEVFILE_NAME,"c", buf, "0", NULL }; static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL }; sub_info = call_usermodehelper_setup( (char*)argv[0], (char**)argv, envp, GFP_ATOMIC ); if (sub_info == NULL) { ERROR("failed to create %s\n",DEVFILE_PATH); return -ENOMEM; } SHALLOW_DEBUG_PRINT("set up usermode call\n"); return call_usermodehelper_exec( sub_info, UMH_WAIT_PROC ); } #endif static uint64_t devfile_syscall_return(struct devfile_state *s, uint64_t *errno) { uint64_t rc; uint64_t *shared_page = (uint64_t*)(s->shared_mem_va); s->state=WAIT_ON_SHADOW; // kick the the user if needed //!! IDEA: We can add Usermode Helper to start shadow process instead // and wait for it to send us an ioctl to wake up the module. wake_up_interruptible(&(s->user_wait_queue)); // goto sleep until we see a message received // part of a separate ioctl SHALLOW_DEBUG_PRINT("waiting For Shadow Process\n"); while (wait_event_interruptible(s->host_wait_queue, (s->state==WAIT_ON_GUEST)) != 0) {} SHALLOW_DEBUG_PRINT("waiting done\n"); // Get the returned value and errno rc = *(shared_page +8); *errno = *(shared_page +9); SHALLOW_DEBUG_PRINT("waiting done %016llu (errno %016llu)\n",rc,*errno); return rc; } static int devfile_syscall_hcall(struct devfile_state *s, palacios_core_t core, uint64_t sys_code, uint64_t a1, uint64_t a2,uint64_t a3, uint64_t a4, uint64_t a5, uint64_t a6, uint64_t bit_vec, uint64_t *errno) { //Using shared memory page uint64_t ret; uint64_t *shared_page = (uint64_t*)(s->shared_mem_va); *(shared_page +0) = sys_code; *(shared_page +1) = a1; *(shared_page +2) = a2; *(shared_page +3) = a3; *(shared_page +4) = a4; *(shared_page +5) = a5; *(shared_page +6) = a6; *(shared_page +7) = bit_vec; SHALLOW_DEBUG_PRINT("Host Module to wait on shadow\n"); //Now wait for rc and errno to be written to the shared page ret = devfile_syscall_return(s, errno); SHALLOW_DEBUG_PRINT("SYSCALL HCALL %016llu (errno %016llu)\n",ret,*errno); return ret; } // The main Interface for Hypercalls int devfile_hypercall(palacios_core_t *core, unsigned int hid, struct guest_accessors *acc, void *p) { uint64_t a1,a2,a3,a4,a5,a6,bit_vec,sys_code; uint64_t rc; uint64_t errno; struct devfile_state *s = find_matching_state(core); if (s->state == WAIT_FOR_INIT){ SHALLOW_DEBUG_PRINT("Shared Memory Not Yet Initialized, returning syscall hypercall\n"); return -1; } sys_code = 0; bit_vec = 0; DEEP_DEBUG_PRINT("devfile_hypercall(%p,0x%x,%p,%p)\n", core,hid,acc,p); get_args(core,acc,&sys_code,&a1,&a2,&a3,&a4,&a5,&a6,&bit_vec); DEEP_DEBUG_PRINT("original arguments: %016llu, %016llu, %016llu, %016llu, %016llu, %016llu, %016llu, %016llu\n", sys_code,a1,a2,a3,a4,a5,a6,bit_vec); // Convert any pointer arguments from GVAs to GPAs deref_args(core,acc,&a1,&a2,&a3,&a4,&a5,&a6,bit_vec); DEEP_DEBUG_PRINT("derefed arguments: %016llu, %016llu, %016llu, %016llu, %016llu, %016llu, %016llu, %016llu\n", sys_code,a1,a2,a3,a4,a5,a6,bit_vec); rc = devfile_syscall_hcall(s,core,sys_code,a1,a2,a3,a4,a5,a6,bit_vec,&errno); SHALLOW_DEBUG_PRINT("Syscall rc: %016llu errno=%016llu\n",rc,errno); put_return(core,acc,rc,errno); return 0; } static int devfile_open(struct inode * inode, struct file * filp) { struct devfile_state *s = state; if (s) { ERROR("attempting to open devfile that is already open\n"); return -EINVAL; } s=(struct devfile_state*)kmalloc(sizeof(struct devfile_state),GFP_KERNEL); if (!s) { ERROR("Failed to allocate space for open\n"); return -EINVAL; } // This hideousness is here because in this POC we // are simply allowing a single userland to be tied to // a single VM. At the same time, we are making // the rest of the code more flexible for the future state = s; memset(s,0,sizeof(*s)); init_waitqueue_head(&s->user_wait_queue); init_waitqueue_head(&s->host_wait_queue); s->state = WAIT_FOR_INIT; filp->private_data = (void*) s; return 0; } static int devfile_close(struct inode * inode, struct file * filp) { struct devfile_state *s = filp->private_data; if (s) { if (s->state==WAIT_ON_SHADOW) { ERROR("Odd, userland is closing devfile while we are waiting for it\n"); } kfree(s); state=0; } return 0; } static long devfile_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct devfile_state *s = filp->private_data; switch(cmd) { case INIT_IOCTL: s->shared_mem_pa = (uint64_t)arg; s->shared_mem_va = __va(s->shared_mem_pa); SHALLOW_DEBUG_PRINT("Shared Memory Physical Address: %016llu\n",s->shared_mem_pa); SHALLOW_DEBUG_PRINT("Shared Memory Kernel VA: %p\n",s->shared_mem_va); //Change State to wait on guest s->state = WAIT_ON_GUEST; break; case SHADOW_SYSCALL_DONE: s->state = WAIT_ON_GUEST; wake_up_interruptible(&(s->host_wait_queue)); break; default: return -EINVAL; } return 0; } static unsigned int devfile_poll(struct file * filp, struct poll_table_struct * poll_tb) { struct devfile_state *s = filp->private_data; SHALLOW_DEBUG_PRINT("poll\n"); // register ourselves on the user wait queue poll_wait(filp, &(s->user_wait_queue), poll_tb); if (s->state==WAIT_ON_SHADOW) { // Yes, we have a request if you want it! DEEP_DEBUG_PRINT("poll done immediate\n"); return POLLIN | POLLRDNORM; } // No request yet, so we need to wait for one to show up. DEEP_DEBUG_PRINT("poll delayed\n"); // We will get called again when that queue is woken up return 0; } static struct file_operations devfile_fops = { .open = devfile_open, .release = devfile_close, .poll = devfile_poll, .unlocked_ioctl = devfile_ioctl, .compat_ioctl = devfile_ioctl }; EXPORT_SYMBOL(devfile_hypercall); int init_module(void) { dev_t dev; SHALLOW_DEBUG_PRINT("INIT\n"); devfile_class = class_create(THIS_MODULE,"devfile"); if (!devfile_class || IS_ERR(devfile_class)) { ERROR("Cannot register devfile device class\n"); return PTR_ERR(devfile_class); } dev = MKDEV(0,0); if (alloc_chrdev_region(&dev,0,1,"devfile")<0) { ERROR("Failed to alloc chrdev region\n"); return -1; } devfile_major_num = MAJOR(dev); dev = MKDEV(devfile_major_num,1); cdev_init(&devfile_dev, &devfile_fops); devfile_dev.owner = THIS_MODULE; devfile_dev.ops = &devfile_fops; cdev_add(&devfile_dev, dev, 1); device_create(devfile_class, NULL, dev, NULL, "v3-devfile"); #if 0 // Setup chardev for IOCTL major = register_chrdev(0,"dfvDev", &fops); if(major < 0){ SHALLOW_DEBUG_PRINT("registering dfvDev char device failed with %d\n", major); return major; } SHALLOW_DEBUG_PRINT("assigned major: %d\n", major); SHALLOW_DEBUG_PRINT("creating node with mknod %s c %d 0\n", DEVFILE_PATH, major); // Call Helper API function to setup chardev rc = setup_mknod_call(major); SHALLOW_DEBUG_PRINT("UMH api mknod %s c %d 0 -- ret: %d\n\n", DEVFILE_PATH, major,rc); #endif INFO("inited\n"); return 0; } void cleanup_module(void) { dev_t dev = MKDEV(devfile_major_num,1); unregister_chrdev_region(MKDEV(devfile_major_num,0),1); cdev_del(&devfile_dev); device_destroy(devfile_class,dev); class_destroy(devfile_class); #if 0 unregister_chrdev(major, "dfvDev"); #endif if (state) { kfree(state); } INFO("deinited\n"); }