2 Palacios main control interface
6 #include <linux/kernel.h>
7 #include <linux/module.h>
8 #include <linux/moduleparam.h>
9 #include <linux/errno.h>
10 #include <linux/percpu.h>
12 #include <linux/uaccess.h>
13 #include <linux/device.h>
14 #include <linux/cdev.h>
18 #include <linux/file.h>
19 #include <linux/spinlock.h>
20 #include <linux/kthread.h>
22 #include <linux/proc_fs.h>
23 #include <linux/seq_file.h>
25 #include <palacios/vmm.h>
31 #include "allow_devmem.h"
33 #include "lockcheck.h"
35 #include "linux-exts.h"
37 #include "util-hashtable.h"
// Module metadata, load-time parameters and file-scope state of the control interface.
39 MODULE_LICENSE("GPL");
// cpu_list module parameter: an int array with room for NR_CPUS entries.
42 int cpu_list[NR_CPUS] = {};
// The number of entries actually supplied is written through &cpu_list_len; sysfs mode 0644.
44 module_param_array(cpu_list, int, &cpu_list_len, 0644);
45 MODULE_PARM_DESC(cpu_list, "Comma-delimited list of CPUs that Palacios will run on");
// allow_devmem module parameter (int, default 0, permission bits 0).
47 static int allow_devmem = 0;
48 module_param(allow_devmem, int, 0);
49 MODULE_PARM_DESC(allow_devmem, "Allow general user-space /dev/mem access even if kernel is strict");
51 // Palacios options parameter: a string (charp) that v3_init hands to palacios_vmm_init()
53 module_param(options, charp, 0);
54 MODULE_PARM_DESC(options, "Generic options to internal Palacios modules");
// Major number of the V3 devices; v3_init fills it in from the region returned by alloc_chrdev_region().
60 static int v3_major_num = 0;
// Table of guests indexed by VM minor number; a NULL slot is unused.
62 static struct v3_guest * guest_map[MAX_VMS] = {[0 ... MAX_VMS - 1] = 0};
// The "v3vee" /proc directory created in v3_init; handed out by palacios_get_procdir().
63 static struct proc_dir_entry * palacios_proc_dir = NULL;
// Device class ("vms") created in v3_init; not static, so visible outside this file.
65 struct class * v3_class = NULL;
// Character device object backing the control device.
66 static struct cdev ctrl_dev;
69 // mapping from thread ids to their resource control blocks
// Created in v3_init with palacios_create_htable() and released with palacios_free_htable().
70 struct hashtable *v3_thread_resource_map=0;
// Walks guest_map looking for a slot that is still NULL on behalf of a new guest.
// The caller in v3_dev_ioctl treats a return value of -1 as "too many VMs" and
// otherwise uses the result as the minor number of the guest's device.
72 static int register_vm(struct v3_guest * guest) {
75 for (i = 0; i < MAX_VMS; i++) {
76 if (guest_map[i] == NULL) {
// ioctl entry point of the V3 control device; v3_ctrl_fops installs it as both
// .unlocked_ioctl and .compat_ioctl.
//   filp  - the open control-device file
//   ioctl - command number
//   arg   - command argument: a user-space pointer (argp) for V3_CREATE_GUEST,
//           V3_ADD_MEMORY and V3_REMOVE_MEMORY, a plain VM index for V3_FREE_GUEST
// Commands without an explicit case are looked up with get_global_ctrl() and
// forwarded to the handler registered for them.
87 static long v3_dev_ioctl(struct file * filp,
88 unsigned int ioctl, unsigned long arg) {
89 void __user * argp = (void __user *)arg;
90 DEBUG("V3 IOCTL %d\n", ioctl);
// Create a guest from an image descriptor (struct v3_guest_img) supplied by user space.
94 case V3_CREATE_GUEST:{
96 struct v3_guest_img user_image;
97 struct v3_guest * guest = palacios_alloc(sizeof(struct v3_guest));
100 ERROR("Palacios: Error allocating Kernel guest_image\n");
104 memset(guest, 0, sizeof(struct v3_guest));
106 INFO("Palacios: Creating V3 Guest...\n");
// The value returned by register_vm() becomes the minor number of the guest's device.
108 vm_minor = register_vm(guest);
110 if (vm_minor == -1) {
111 ERROR("Palacios Error: Too many VMs are currently running\n");
115 guest->vm_dev = MKDEV(v3_major_num, vm_minor);
// Fetch the descriptor itself; the image bytes are copied separately below.
117 if (copy_from_user(&user_image, argp, sizeof(struct v3_guest_img))) {
118 ERROR("Palacios Error: copy from user error getting guest image...\n");
// NOTE(review): img_size is taken straight from the user-supplied descriptor and is used
// as both the allocation size and the copy length; no upper bound is visible here.
122 guest->img_size = user_image.size;
124 DEBUG("Palacios: Allocating kernel memory for guest image (%llu bytes)\n", user_image.size);
125 // overflow possible here, but only if guest image is probably too large for kernel anyway...
126 guest->img = palacios_valloc(guest->img_size);
129 ERROR("Palacios Error: Could not allocate space for guest image\n");
// Pull the image bytes in from the user-space pointer carried in the descriptor.
133 if (copy_from_user(guest->img, user_image.guest_data, guest->img_size)) {
134 ERROR("Palacios: Error loading guest data\n");
// Copy at most 128 bytes of the name and force termination at index 127.
138 strncpy(guest->name, user_image.name, 128);
139 guest->name[127] = 0;
141 INIT_LIST_HEAD(&(guest->exts));
143 if (create_palacios_vm(guest) == -1) {
144 ERROR("Palacios: Error creating guest\n");
// Cleanup of a guest that was not created: release the image buffer, hand the
// guest_map slot back and free the guest structure.
152 palacios_vfree(guest->img);
154 guest_map[vm_minor] = NULL;
156 palacios_free(guest);
// Free the guest whose guest_map index is passed directly in arg.
162 case V3_FREE_GUEST: {
163 unsigned long vm_idx = arg;
164 struct v3_guest * guest;
// Reject indices outside guest_map before touching the table.
166 if (vm_idx >= MAX_VMS) {
167 ERROR("Invalid VM index: %ld\n", vm_idx);
171 guest = guest_map[vm_idx];
174 ERROR("No VM at index %ld\n",vm_idx);
178 INFO("Freeing VM (%s) (%p)\n", guest->name, guest);
180 if (free_palacios_vm(guest)<0) {
181 ERROR("Cannot free guest at index %ld\n",vm_idx);
// The slot becomes available again.
185 guest_map[vm_idx] = NULL;
// Hand a memory region (struct v3_mem_region copied from user space) to add_palacios_memory().
188 case V3_ADD_MEMORY: {
189 struct v3_mem_region mem;
191 memset(&mem, 0, sizeof(struct v3_mem_region));
193 if (copy_from_user(&mem, argp, sizeof(struct v3_mem_region))) {
194 ERROR("copy from user error getting mem_region...\n");
198 DEBUG("Adding %llu pages to Palacios memory\n", mem.num_pages);
200 if (add_palacios_memory(&mem) == -1) {
201 ERROR("Error adding memory to Palacios\n");
// Counterpart of V3_ADD_MEMORY: the copied region is passed to remove_palacios_memory().
208 case V3_REMOVE_MEMORY: {
209 struct v3_mem_region mem;
211 memset(&mem, 0, sizeof(struct v3_mem_region));
213 if (copy_from_user(&mem, argp, sizeof(struct v3_mem_region))) {
214 ERROR("copy from user error getting mem_region...\n");
218 DEBUG("Removing memory at address %p\n", (void*)(mem.base_addr));
220 if (remove_palacios_memory(&mem) == -1) {
221 ERROR("Error removing memory from Palacios\n");
// Reset the memory manager by deinitializing it and then initializing it again.
230 case V3_RESET_MEMORY: {
231 DEBUG("Resetting memory\n");
232 if (palacios_deinit_mm() == -1) {
233 ERROR("Error deiniting the Palacios memory manager\n");
236 if (palacios_init_mm()) {
237 ERROR("Error initing the Palacios memory manager\n");
// Any other command: look up a registered global control and call its handler
// with the unmodified command number and argument.
244 struct global_ctrl * ctrl = get_global_ctrl(ioctl);
247 return ctrl->handler(ioctl, arg);
250 WARNING("\tUnhandled global ctrl cmd: %d\n", ioctl);
// File operations of the control device; native and compat ioctl calls share v3_dev_ioctl.
261 static struct file_operations v3_ctrl_fops = {
262 .owner = THIS_MODULE,
263 .unlocked_ioctl = v3_dev_ioctl,
264 .compat_ioctl = v3_dev_ioctl,
// Returns palacios_proc_dir, the "v3vee" /proc directory entry set up in v3_init.
// The value is NULL until v3_init has called proc_mkdir() successfully.
269 struct proc_dir_entry *palacios_get_procdir(void)
271 // INFO("Returning procdir=%p\n",palacios_proc_dir);
272 return palacios_proc_dir;
// Caps applied by read_guests_details to the number of per-vcore and per-region
// records it allocates space for and prints for a single VM.
276 #define MAX_CORES 1024
277 #define MAX_REGIONS 1024
// Smaller of x and y. An argument can be evaluated more than once, so do not pass
// expressions with side effects.
278 #define MIN(x,y) ((x)<(y) ? (x) : (y))
// seq_file show routine behind /proc/v3vee/v3-guests-details (see guests_full_proc_open).
// For every occupied guest_map slot it prints the VM's base state, one line per
// vcore and one line per memory region.
//   s - seq_file the report is written to
//   v - not used in the visible code
280 static int read_guests_details(struct seq_file *s, void *v)
284 uint64_t num_vcores, num_regions;
285 uint64_t alloc_num_vcores, alloc_num_regions;
286 struct v3_vm_base_state *base=0;
287 struct v3_vm_core_state *core=0;
288 struct v3_vm_mem_state *mem=0;
// The base-state buffer is allocated once and reused for every VM.
291 base = palacios_valloc(sizeof(struct v3_vm_base_state));
295 ERROR("No space for base state structure\n");
300 for(i = 0; i < MAX_VMS; i++) {
302 if (guest_map[i] != NULL) {
// Ask how many vcores and regions this VM has, then cap both so the buffers stay bounded.
304 v3_get_state_sizes_vm(guest_map[i]->v3_ctx,&num_vcores,&num_regions);
306 alloc_num_vcores = MIN(num_vcores,MAX_CORES);
307 alloc_num_regions = MIN(num_regions,MAX_REGIONS);
// Header plus one trailing record per (capped) vcore.
309 core = palacios_valloc(sizeof(struct v3_vm_core_state) + alloc_num_vcores*sizeof(struct v3_vm_vcore_state));
312 ERROR("No space for core state structure\n");
// Header plus one trailing record per (capped) region.
316 mem = palacios_valloc(sizeof(struct v3_vm_mem_state) + alloc_num_regions*sizeof(struct v3_vm_mem_region));
319 ERROR("No space for memory state structure\n");
324 "---------------------------------------------------------------------------------------\n");
328 "Device: /dev/v3-vm%d\n",
329 i,guest_map[i]->name, i);
// Record the capped counts in the buffers before the query.
// NOTE(review): presumably this tells v3_get_state_vm how many records fit - confirm.
332 core->num_vcores=alloc_num_vcores;
333 mem->num_regions=alloc_num_regions;
335 if (v3_get_state_vm(guest_map[i]->v3_ctx, base, core, mem)) {
336 ERROR("Cannot get VM info\n");
337 seq_printf(s, "<unable to get data for this VM>\n");
342 "Cores: %llu (%llu shown)\n"
343 "Regions: %llu (%llu shown)\n"
344 "Memsize: %llu (%llu ROS)\n\n",
// VM type and VM state are translated to strings; unrecognized values print as "UNKNOWN".
345 base->vm_type==V3_VM_GENERAL ? "general" :
346 base->vm_type==V3_VM_HVM ? "HVM" : "UNKNOWN",
347 base->state==V3_VM_INVALID ? "INVALID" :
348 base->state==V3_VM_RUNNING ? "running" :
349 base->state==V3_VM_STOPPED ? "stopped" :
350 base->state==V3_VM_PAUSED ? "paused" :
351 base->state==V3_VM_ERROR ? "ERROR" :
352 base->state==V3_VM_SIMULATING ? "simulating" :
353 base->state==V3_VM_RESETTING ? "resetting" : "UNKNOWN",
361 seq_printf(s, "Core States\n");
// One line per vcore record: state, physical core, exit count, last RIP, CPU mode,
// memory mode, memory state and vcore type.
363 for (j=0;j<core->num_vcores;j++) {
365 " vcore %u %s on pcore %lu %llu exits rip=0x%p %s %s %s %s\n",
367 core->vcore[j].state==V3_VCORE_INVALID ? "INVALID" :
368 core->vcore[j].state==V3_VCORE_RUNNING ? "running" :
369 core->vcore[j].state==V3_VCORE_STOPPED ? "stopped" :
370 core->vcore[j].state==V3_VCORE_RESETTING ? "resetting" : "UNKNOWN",
371 core->vcore[j].pcore,
372 core->vcore[j].num_exits,
373 core->vcore[j].last_rip,
374 core->vcore[j].cpu_mode==V3_VCORE_CPU_REAL ? "real" :
375 core->vcore[j].cpu_mode==V3_VCORE_CPU_PROTECTED ? "protected" :
376 core->vcore[j].cpu_mode==V3_VCORE_CPU_PROTECTED_PAE ? "protectedpae" :
377 core->vcore[j].cpu_mode==V3_VCORE_CPU_LONG ? "long" :
378 core->vcore[j].cpu_mode==V3_VCORE_CPU_LONG_32_COMPAT ? "long32" :
379 core->vcore[j].cpu_mode==V3_VCORE_CPU_LONG_16_COMPAT ? "long16" : "UNKNOWN",
380 core->vcore[j].mem_mode==V3_VCORE_MEM_MODE_PHYSICAL ? "physical" :
381 core->vcore[j].mem_mode==V3_VCORE_MEM_MODE_VIRTUAL ? "virtual" : "UNKNOWN",
382 core->vcore[j].mem_state==V3_VCORE_MEM_STATE_SHADOW ? "shadow" :
383 core->vcore[j].mem_state==V3_VCORE_MEM_STATE_NESTED ? "nested" : "UNKNOWN",
384 core->vcore[j].vcore_type==V3_VCORE_GENERAL ? "" :
385 core->vcore[j].vcore_type==V3_VCORE_ROS ? "ros" :
386 core->vcore[j].vcore_type==V3_VCORE_HRT ? "hrt" : "UNKNOWN");
390 seq_printf(s, "\nMemory Regions\n");
// One line per region record: host physical range (start to start+size), the NUMA node
// of the start address, guest physical address, and the swapped/pinned flags.
391 for (j=0;j<mem->num_regions;j++) {
392 seq_printf(s," region %u has HPAs 0x%016llx-0x%016llx (node %d) GPA 0x%016llx %s %s\n",
393 j, (uint64_t)mem->region[j].host_paddr, (uint64_t)mem->region[j].host_paddr+mem->region[j].size,
394 numa_addr_to_node((uintptr_t)(mem->region[j].host_paddr)),
395 (uint64_t)mem->region[j].guest_paddr,
396 mem->region[j].swapped ? "swapped" : "",
397 mem->region[j].pinned ? "pinned" : "");
402 "---------------------------------------------------------------------------------------\n");
// Per-VM buffers are released and the pointers cleared, so the frees further down
// do not release them a second time.
404 palacios_vfree(mem); mem=0;
405 palacios_vfree(core); core=0;
// Final cleanup: release whatever is still allocated.
413 if (mem) { palacios_vfree(mem); } // dead code but kept for clarity
414 if (core) { palacios_vfree(core); }
415 if (base) { palacios_vfree(base); }
// seq_file show routine behind /proc/v3vee/v3-guests (see guests_short_proc_open).
// Prints one summary line per occupied guest_map slot: name, device path, state,
// vcore/region/memory figures and VM type.
//   s - seq_file the report is written to
//   v - not used in the visible code
420 static int read_guests(struct seq_file *s, void *v)
423 struct v3_vm_base_state *base=0;
424 struct v3_vm_core_state *core=0;
425 struct v3_vm_mem_state *mem=0;
426 uint64_t num_vcores, num_regions;
429 INFO("READ GUEST\n");
// All three buffers are allocated once, before the loop. core and mem are sized for
// their headers only, with no room for per-vcore or per-region records.
431 base = palacios_valloc(sizeof(struct v3_vm_base_state));
434 ERROR("No space for base state structure\n");
438 core = palacios_valloc(sizeof(struct v3_vm_core_state));
441 ERROR("No space for core state structure\n");
445 mem = palacios_valloc(sizeof(struct v3_vm_mem_state));
448 ERROR("No space for memory state structure\n");
453 for(i = 0; i < MAX_VMS; i++) {
454 if (guest_map[i] != NULL) {
456 v3_get_state_sizes_vm(guest_map[i]->v3_ctx,&num_vcores,&num_regions);
458 seq_printf(s,"%s\t/dev/v3-vm%d", guest_map[i]->name, i);
460 // Skip getting per core and per-region
464 if (v3_get_state_vm(guest_map[i]->v3_ctx, base, core, mem)) {
465 ERROR("Cannot get VM info\n");
466 seq_printf(s, "\t<unable to get data for this VM>\n");
468 seq_printf(s,"\t%s\t%llu vcores\t%llu regions\t%llu mem\t%s\n",
// NOTE(review): unlike read_guests_details, this chain has no V3_VM_RESETTING arm,
// so that state is reported as "UNKNOWN" here - confirm whether that is intended.
469 base->state==V3_VM_INVALID ? "INVALID" :
470 base->state==V3_VM_RUNNING ? "running" :
471 base->state==V3_VM_STOPPED ? "stopped" :
472 base->state==V3_VM_PAUSED ? "paused" :
473 base->state==V3_VM_ERROR ? "ERROR" :
474 base->state==V3_VM_SIMULATING ? "simulating" : "UNKNOWN",
478 base->vm_type == V3_VM_GENERAL ? "general" :
479 base->vm_type == V3_VM_HVM ? "hvm" : "UNKNOWN");
// Release whichever buffers were allocated.
486 if (mem) { palacios_vfree(mem); }
487 if (core) { palacios_vfree(core); }
488 if (base) { palacios_vfree(base); }
// open handler of guest_short_proc_ops: binds the file to read_guests via single_open(),
// passing the proc entry's data pointer as the seq_file private data.
494 static int guests_short_proc_open(struct inode * inode, struct file * filp)
496 struct proc_dir_entry * proc_entry = PDE(inode);
497 return single_open(filp, read_guests, proc_entry->data);
// open handler of guest_full_proc_ops: binds the file to read_guests_details via
// single_open(), passing the proc entry's data pointer as the seq_file private data.
500 static int guests_full_proc_open(struct inode * inode, struct file * filp)
502 struct proc_dir_entry * proc_entry = PDE(inode);
503 return single_open(filp, read_guests_details, proc_entry->data);
// File operations that v3_init attaches to the "v3-guests-details" proc entry.
509 static struct file_operations guest_full_proc_ops = {
510 .owner = THIS_MODULE,
511 .open = guests_full_proc_open,
// single_release pairs with the single_open() call made in the open handler.
514 .release = single_release,
// File operations that v3_init attaches to the "v3-guests" proc entry.
517 static struct file_operations guest_short_proc_ops = {
518 .owner = THIS_MODULE,
519 .open = guests_short_proc_open,
// single_release pairs with the single_open() call made in the open handler.
522 .release = single_release,
525 // Supply basic information that the user-space tools need
526 // to manipulate Palacios. The current use case here is to
527 // convey memory information
// seq_file show routine behind /proc/v3vee/v3-info (see info_proc_open). It prints
// MAX_ORDER, node and CPU counts, the compiled and run-time mem_block_size, the
// CPU-to-node mapping, and node-to-node and CPU-to-CPU distance tables.
//   s - seq_file the report is written to
//   v - not used in the visible code
528 static int read_info(struct seq_file *s, void *v)
530 uint64_t mem_block_size;
533 seq_printf(s,"kernel MAX_ORDER:\t%d\n",MAX_ORDER);
534 seq_printf(s,"number of nodes:\t%d\n", numa_num_nodes());
535 seq_printf(s,"number of cpus: \t%d\n", num_online_cpus());
536 seq_printf(s,"\npalacios compiled mem_block_size:\t%d\n", V3_CONFIG_MEM_BLOCK_SIZE);
// Run-time block size: the compiled default unless a "mem_block_size" option is set,
// in which case the option string is parsed (base 0, i.e. prefix selects the radix).
537 if (!v3_lookup_option("mem_block_size")) {
538 mem_block_size = V3_CONFIG_MEM_BLOCK_SIZE;
540 if (strict_strtoull(v3_lookup_option("mem_block_size"), 0, &mem_block_size)) {
545 seq_printf(s,"palacios run-time mem_block_size:\t%llu\n", mem_block_size);
547 seq_printf(s,"\nCPU to node mappings\n");
// NOTE(review): the loops below use 0..num_online_cpus()-1 as CPU ids, which assumes
// the online CPU ids are contiguous from 0 - confirm.
// While printing the mapping, track the highest node id seen; it bounds the node table.
548 for (i=0;i<num_online_cpus();i++) {
549 seq_printf(s,"cpu %d -> node %d\n", i, numa_cpu_to_node(i));
550 if (numa_cpu_to_node(i)>max_node) {
551 max_node=numa_cpu_to_node(i);
554 seq_printf(s,"\nNode to node distances\n");
// Column header row of the node table.
555 for (j=0;j<=max_node;j++) {
556 seq_printf(s," \t%2d", j);
// One row per node i, with the distance from i to every node j.
559 for (i=0;i<=max_node;i++) {
560 seq_printf(s,"%2d ",i);
561 for (j=0;j<=max_node;j++) {
562 seq_printf(s,"\t%2d", numa_get_distance(i,j));
566 seq_printf(s,"\nCPU to CPU distances\n");
// Column header row of the CPU table.
567 for (j=0;j<num_online_cpus();j++) {
568 seq_printf(s," \t%2d", j);
// CPU-to-CPU distance is the distance between the nodes the two CPUs map to.
571 for (i=0;i<num_online_cpus();i++) {
572 seq_printf(s,"%2d ",i);
573 for (j=0;j<num_online_cpus();j++) {
574 seq_printf(s,"\t%2d", numa_get_distance(numa_cpu_to_node(i),numa_cpu_to_node(j)));
// open handler of info_proc_ops: binds the file to read_info via single_open(),
// passing the proc entry's data pointer as the seq_file private data.
581 static int info_proc_open(struct inode * inode, struct file * filp)
583 struct proc_dir_entry * proc_entry = PDE(inode);
584 return single_open(filp, read_info, proc_entry->data);
// File operations that v3_init attaches to the "v3-info" proc entry.
589 static struct file_operations info_proc_ops = {
590 .owner = THIS_MODULE,
591 .open = info_proc_open,
// single_release pairs with the single_open() call made in the open handler.
594 .release = single_release,
// Hash callback for v3_thread_resource_map: feeds the key, cast to long, to
// palacios_hash_long() with a second argument of 64.
598 static inline uint_t thr_hash_func(addr_t key)
600 return palacios_hash_long((long)key,64);
// Key comparison callback handed to palacios_create_htable() together with
// thr_hash_func when v3_thread_resource_map is created in v3_init.
603 static inline int thr_hash_comp(addr_t k1, addr_t k2)
// Module entry point (registered with module_init). In order it creates the
// thread/resource hash table and the "v3vee" /proc directory, initializes the memory
// manager, NUMA support, Palacios itself and the Linux extensions, creates the device
// class, character-device region and control device, and finally adds the
// v3-guests, v3-guests-details and v3-info proc entries.
// The tail of the function undoes these steps when something fails.
608 static int __init v3_init(void) {
610 dev_t dev = MKDEV(0, 0); // We dynamically assign the major number
// Table created with a capacity hint of MAX_THREADS and the two thr_hash_* callbacks.
617 if (!(v3_thread_resource_map = palacios_create_htable(MAX_THREADS,thr_hash_func,thr_hash_comp))) {
618 ERROR("Could not create thread/resource map\n");
623 palacios_proc_dir = proc_mkdir("v3vee", NULL);
624 if (!palacios_proc_dir) {
625 ERROR("Could not create proc entry\n");
630 // this will populate the v3vee tree...
631 if (palacios_init_mm()) {
636 palacios_allow_devmem();
639 // numa is now a required interface and we need it
640 // up before primary initialization
641 palacios_init_numa();
643 // Initialize Palacios
644 palacios_vmm_init(options);
646 // initialize extensions
647 init_lnx_extensions();
650 v3_class = class_create(THIS_MODULE, "vms");
// NOTE(review): when v3_class is NULL rather than an error pointer, PTR_ERR() would
// yield 0, so ret would not carry an error code - confirm.
651 if (!v3_class || IS_ERR(v3_class)) {
652 ERROR("Failed to register V3 VM device class\n");
653 ret = PTR_ERR(v3_class);
657 INFO("intializing V3 Control device\n");
// Reserve MAX_VMS + 1 minors starting at minor 0 under a dynamically chosen major.
659 ret = alloc_chrdev_region(&dev, 0, MAX_VMS + 1, "v3vee");
662 ERROR("Error registering device region for V3 devices\n");
666 v3_major_num = MAJOR(dev);
// The control device is given minor MAX_VMS + 1.
// NOTE(review): the region reserved above covers minors 0..MAX_VMS, so this minor
// appears to lie one past the reserved range - confirm.
668 dev = MKDEV(v3_major_num, MAX_VMS + 1);
671 DEBUG("Creating V3 Control device: Major %d, Minor %d\n", v3_major_num, MINOR(dev));
672 cdev_init(&ctrl_dev, &v3_ctrl_fops);
673 ctrl_dev.owner = THIS_MODULE;
674 ctrl_dev.ops = &v3_ctrl_fops;
675 cdev_add(&ctrl_dev, dev, 1);
// Create the device node, named "v3vee", in the class made above.
677 device_create(v3_class, NULL, dev, NULL, "v3vee");
680 ERROR("Error adding v3 control device\n");
// Read-only (0444) proc entries under the v3vee directory, each wired to its own
// file_operations table.
685 struct proc_dir_entry *entry;
687 entry = create_proc_entry("v3-guests", 0444, palacios_proc_dir);
689 entry->proc_fops = &guest_short_proc_ops;
690 INFO("/proc/v3vee/v3-guests successfully created\n");
692 ERROR("Could not create proc entry\n");
695 entry = create_proc_entry("v3-guests-details", 0444, palacios_proc_dir);
697 entry->proc_fops = &guest_full_proc_ops;
698 INFO("/proc/v3vee/v3-guests-details successfully created\n");
700 ERROR("Could not create proc entry\n");
704 entry = create_proc_entry("v3-info", 0444, palacios_proc_dir);
706 entry->proc_fops = &info_proc_ops;
707 INFO("/proc/v3vee/v3-info successfully created\n");
709 ERROR("Could not create proc entry\n");
// Failure unwinding: tear down in the reverse of the order in which things were set up.
719 remove_proc_entry("v3-guests-details", palacios_proc_dir);
721 remove_proc_entry("v3-guests", palacios_proc_dir);
723 device_destroy(v3_class, dev);
725 unregister_chrdev_region(MKDEV(v3_major_num, 0), MAX_VMS + 1);
727 class_destroy(v3_class);
730 palacios_restore_devmem();
732 palacios_deinit_mm();
734 remove_proc_entry("v3vee", NULL);
736 palacios_free_htable(v3_thread_resource_map,0,0);
// Module exit point (registered with module_exit). Stops and frees every guest still
// present in guest_map, then tears down NUMA support, the character-device region,
// the control device and class, the Linux extensions, the memory manager, the proc
// entries and the thread/resource hash table. Allocation counters are logged along
// the way via DEBUG.
745 static void __exit v3_exit(void) {
746 extern u32 pg_allocs;
753 struct v3_guest * guest;
757 /* Stop and free any running VMs */
758 for (i = 0; i < MAX_VMS; i++) {
759 if (guest_map[i] != NULL) {
760 guest = (struct v3_guest *)(guest_map[i]);
// A guest without a Palacios context cannot be stopped; it is reported and skipped.
762 if (!guest->v3_ctx) {
763 ERROR("Orphan VM detected and skipped: index=%d name=%s\n", i, guest->name);
// A failed stop is only logged; the guest is freed regardless.
767 if (v3_stop_vm(guest->v3_ctx) < 0)
768 ERROR("Couldn't stop VM %d\n", i);
770 free_palacios_vm(guest);
// Same device number v3_init used for the control device.
775 dev = MKDEV(v3_major_num, MAX_VMS + 1);
777 INFO("Removing V3 Control device\n");
782 palacios_deinit_numa();
784 DEBUG("Palacios Mallocs = %d, Frees = %d\n", mallocs, frees);
785 DEBUG("Palacios Vmallocs = %d, Vfrees = %d\n", vmallocs, vfrees);
786 DEBUG("Palacios Page Allocs = %d, Page Frees = %d\n", pg_allocs, pg_frees);
// Release the MAX_VMS + 1 minors reserved in v3_init.
788 unregister_chrdev_region(MKDEV(v3_major_num, 0), MAX_VMS + 1);
792 device_destroy(v3_class, dev);
793 class_destroy(v3_class);
796 deinit_lnx_extensions();
799 palacios_restore_devmem();
802 palacios_deinit_mm();
// Remove the three proc files first, then the v3vee directory that contains them.
804 remove_proc_entry("v3-info", palacios_proc_dir);
805 remove_proc_entry("v3-guests-details", palacios_proc_dir);
806 remove_proc_entry("v3-guests", palacios_proc_dir);
807 remove_proc_entry("v3vee", NULL);
809 DEBUG("Palacios Module Mallocs = %d, Frees = %d\n", mod_allocs, mod_frees);
811 palacios_free_htable(v3_thread_resource_map,0,0);
// Register v3_init and v3_exit as the module's load and unload handlers.
819 module_init(v3_init);
820 module_exit(v3_exit);
// Allocation wrapper that forwards size and flags to palacios_alloc_extended().
// NOTE(review): the third argument is fixed at -1; its meaning is defined by
// palacios_alloc_extended and is not visible here - confirm.
824 void * trace_malloc(size_t size, gfp_t flags) {
828 addr = palacios_alloc_extended(size, flags, -1);
// Free wrapper: drops the const qualifier from objp and passes it to palacios_free().
834 void trace_free(const void * objp) {
836 palacios_free((void*)objp);