From: Peter Dinda
Date: Thu, 10 Oct 2013 20:22:01 +0000 (-0500)
Subject: Dynamic migration of memory from node to node
X-Git-Url: http://v3vee.org/palacios/gitweb/gitweb.cgi?p=palacios.git;a=commitdiff_plain;h=b07dd4e35a37f1db04c4f52f9904ee7206673ac6

Dynamic migration of memory from node to node

This adds the ability to move memory regions of a VM from user space:

    v3_mem_move /dev/v3-vmX guest_paddr host_cpuid

This will move the memory region in which guest_paddr is contained from
its current NUMA node to the node affiliated with host_cpuid.

Combined with v3_core_move, this should provide the ability to
dynamically manipulate NUMA mappings at runtime, down to the granularity
of v3_mem_block_size (which is selected at insmod time).
---
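Usage sketch (not part of this patch): because each base region covers
v3_mem_block_size bytes, an entire guest's memory can be migrated by
issuing one V3_VM_MOVE_MEM ioctl per block. The sketch below reuses the
V3_VM_MOVE_MEM number and struct v3_mem_move_cmd that this patch adds to
linux_usr/v3_ctrl.h; the tool name v3_mem_move_all and the choice to take
the memory size and block size as arguments are illustrative assumptions
(the block size is fixed at insmod time, and the current layout can be
read from /proc/v3vee/v3-guests-details).

    /* v3_mem_move_all.c - hypothetical helper, for illustration only.
     * Moves every v3_mem_block_size-sized base region of a guest to
     * memory with affinity for one physical CPU, one ioctl per block. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>

    #include "v3_ctrl.h"   /* V3_VM_MOVE_MEM, struct v3_mem_move_cmd (added below) */

    int main(int argc, char *argv[]) {
        unsigned long long gpa, mem_size, block_size;
        struct v3_mem_move_cmd cmd;
        unsigned short pcore;
        int vm_fd;

        if (argc < 5) {
            printf("usage: v3_mem_move_all <vm device> <mem size (hex)> <block size (hex)> <target pcpu>\n");
            return -1;
        }

        mem_size   = strtoull(argv[2], 0, 16);
        block_size = strtoull(argv[3], 0, 16);
        pcore      = (unsigned short) atoi(argv[4]);

        vm_fd = open(argv[1], O_RDONLY);
        if (vm_fd < 0) {
            perror("open");
            return -1;
        }

        /* each call migrates the whole base region containing gpa */
        for (gpa = 0; gpa < mem_size; gpa += block_size) {
            cmd.gpa = gpa;
            cmd.pcore_id = pcore;
            if (ioctl(vm_fd, V3_VM_MOVE_MEM, &cmd) < 0) {
                printf("move of region at gpa 0x%llx failed\n", gpa);
                close(vm_fd);
                return -1;
            }
        }

        close(vm_fd);
        return 0;
    }

The resulting placement can then be checked in /proc/v3vee/v3-guests-details,
which with this patch reports the NUMA node of each region.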
diff --git a/linux_module/main.c b/linux_module/main.c
index 8162172..932b739 100644
--- a/linux_module/main.c
+++ b/linux_module/main.c
@@ -355,8 +355,9 @@ static int read_guests_details(struct seq_file *s, void *v)
 	    seq_printf(s, "\nMemory Regions\n");
 	    for (j=0;j<mem->num_regions;j++) {
-		seq_printf(s,"  region %u has HPAs 0x%p-0x%p\n",
-			   j, mem->region[j].host_paddr, mem->region[j].host_paddr+mem->region[j].size);
+		seq_printf(s,"  region %u has HPAs 0x%p-0x%p (node %d)\n",
+			   j, mem->region[j].host_paddr, mem->region[j].host_paddr+mem->region[j].size,
+			   numa_addr_to_node((uintptr_t)(mem->region[j].host_paddr)));
 	    }
 	}
 
 	seq_printf(s,
diff --git a/linux_module/palacios.h b/linux_module/palacios.h
index 7b3a32b..c9cbb96 100644
--- a/linux_module/palacios.h
+++ b/linux_module/palacios.h
@@ -39,6 +39,8 @@
 #define V3_VM_SEND 34
 #define V3_VM_RECEIVE 35
 
+#define V3_VM_MOVE_MEM 36
+
 #define V3_VM_FB_INPUT 257
 #define V3_VM_FB_QUERY 258
 
@@ -77,6 +79,11 @@ struct v3_core_move_cmd {
     unsigned short pcore_id;
 } __attribute__((packed));
 
+struct v3_mem_move_cmd{
+    unsigned long long gpa;
+    unsigned short pcore_id;
+} __attribute__((packed));
+
 struct v3_chkpt_info {
     char store[128];
     char url[256]; /* This might need to be bigger... */
@@ -156,7 +163,7 @@ void *palacios_start_kernel_thread(int (*fn)(void * arg), void *arg, char *thread_name);
 void *palacios_start_thread_on_cpu(int cpu_id, int (*fn)(void * arg), void *arg, char *thread_name);
 int palacios_move_thread_to_cpu(int new_cpu_id, void *thread_ptr);
 void palacios_yield_cpu(void);
-void palacios_yield_cpu_timed(unsigned int us);
+void palacios_sleep_cpu(unsigned int us);
 unsigned int palacios_get_cpu(void);
 unsigned int palacios_get_cpu_khz(void);
 void *palacios_mutex_alloc(void);      // allocates and inits a lock
diff --git a/linux_module/vm.c b/linux_module/vm.c
index ea585ab..2ff482b 100644
--- a/linux_module/vm.c
+++ b/linux_module/vm.c
@@ -317,13 +317,36 @@ static long v3_vm_ioctl(struct file * filp,
 
 	    memset(&cmd, 0, sizeof(struct v3_core_move_cmd));
 
 	    if (copy_from_user(&cmd, argp, sizeof(struct v3_core_move_cmd))) {
-		WARNING("copy from user error getting migrate command...\n");
+		WARNING("copy from user error getting migrate core command...\n");
 		return -EFAULT;
 	    }
 
 	    INFO("moving guest %s vcore %d to CPU %d\n", guest->name, cmd.vcore_id, cmd.pcore_id);
 
-	    v3_move_vm_core(guest->v3_ctx, cmd.vcore_id, cmd.pcore_id);
+	    if (v3_move_vm_core(guest->v3_ctx, cmd.vcore_id, cmd.pcore_id)) {
+		ERROR("Could not move core\n");
+		return -EFAULT;
+	    }
+
+	    break;
+	}
+	case V3_VM_MOVE_MEM: {
+	    struct v3_mem_move_cmd cmd;
+	    void __user * argp = (void __user *)arg;
+
+	    memset(&cmd, 0, sizeof(struct v3_mem_move_cmd));
+
+	    if (copy_from_user(&cmd, argp, sizeof(struct v3_mem_move_cmd))) {
+		WARNING("copy from user error getting migrate memory command...\n");
+		return -EFAULT;
+	    }
+
+	    INFO("moving guest %s memory at gpa %p to memory with affinity for CPU %d\n", guest->name, (void*)(cmd.gpa), cmd.pcore_id);
+
+	    if (v3_move_vm_mem(guest->v3_ctx, (void*)(cmd.gpa), cmd.pcore_id)) {
+		ERROR("Could not move memory\n");
+		return -EFAULT;
+	    }
 
 	    break;
 	}
diff --git a/linux_usr/Makefile b/linux_usr/Makefile
index 0cf09ce..51fcc74 100644
--- a/linux_usr/Makefile
+++ b/linux_usr/Makefile
@@ -20,6 +20,7 @@ BASE_EXECS = 	v3_mem \
 		v3_pause \
 		v3_continue \
 		v3_core_move \
+		v3_mem_move \
 		v3_load \
 		v3_save \
 		v3_cons \
diff --git a/linux_usr/v3_ctrl.h b/linux_usr/v3_ctrl.h
index e5e0d76..20eadc4 100644
--- a/linux_usr/v3_ctrl.h
+++ b/linux_usr/v3_ctrl.h
@@ -43,6 +43,8 @@
 #define V3_VM_SEND 34
 #define V3_VM_RECEIVE 35
 
+#define V3_VM_MOVE_MEM 36
+
 #define V3_VM_FB_INPUT 257
 #define V3_VM_FB_QUERY 258
 
@@ -78,6 +80,11 @@ struct v3_core_move_cmd{
     unsigned short pcore_id;
 } __attribute__((packed));
 
+struct v3_mem_move_cmd{
+    unsigned long long gpa;
+    unsigned short pcore_id;
+} __attribute__((packed));
+
 struct v3_debug_cmd {
     unsigned int core;
     unsigned int cmd;
diff --git a/linux_usr/v3_mem_move.c b/linux_usr/v3_mem_move.c
new file mode 100644
index 0000000..145217c
--- /dev/null
+++ b/linux_usr/v3_mem_move.c
@@ -0,0 +1,59 @@
+/*
+ * V3 memory movement control
+ * (c) Peter Dinda 2013
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+
+#include "v3_ctrl.h"
+
+
+int main(int argc, char* argv[]) {
+    int vm_fd;
+    char * vm_dev = NULL;
+    struct v3_mem_move_cmd cmd;
+
+    if (argc < 4) {
+	printf("usage: v3_mem_move <vm_device> <guest_physical_addr> <target_physical_cpu>\n\n");
+	printf("Moves the memory region into which the guest_physical_addr is mapped\n");
+	printf("to host physical memory that has highest affinity for the target_physical_cpu.\n");
+	printf("You can find the current memory mapping via /proc/v3vee/v3-guests-details\n\n");
+	printf(" guest_physical_addr  - hex address\n");
(0..numcpus-1)\n\n"); + return -1; + } + + vm_dev = argv[1]; + cmd.gpa = strtoll(argv[2],0,16); + cmd.pcore_id = atoi(argv[3]); + + printf("Migrating memory region of %p to memory with affinity for physical CPU %d\n", cmd.gpa, cmd.pcore_id); + + vm_fd = open(vm_dev, O_RDONLY); + + if (vm_fd == -1) { + printf("Error opening VM device: %s\n", vm_dev); + return -1; + } + + int err = ioctl(vm_fd, V3_VM_MOVE_MEM, &cmd); + + if (err < 0) { + printf("Error write memory migrating command to vm\n"); + return -1; + } + + close(vm_fd); + + return 0; +} + + diff --git a/palacios/include/palacios/vmm.h b/palacios/include/palacios/vmm.h index c4d0e17..983cd78 100644 --- a/palacios/include/palacios/vmm.h +++ b/palacios/include/palacios/vmm.h @@ -449,6 +449,7 @@ int v3_send_vm(struct v3_vm_info * vm, char * store, char * url, unsigned long l int v3_receive_vm(struct v3_vm_info * vm, char * store, char * url, unsigned long long opts); int v3_move_vm_core(struct v3_vm_info * vm, int vcore_id, int target_cpu); +int v3_move_vm_mem(struct v3_vm_info * vm, void *gpa, int target_cpu); int v3_free_vm(struct v3_vm_info * vm); diff --git a/palacios/include/palacios/vmm_mem.h b/palacios/include/palacios/vmm_mem.h index ff20faf..7bec3e6 100644 --- a/palacios/include/palacios/vmm_mem.h +++ b/palacios/include/palacios/vmm_mem.h @@ -52,6 +52,7 @@ typedef struct { uint8_t exec : 1; uint8_t base : 1; uint8_t alloced : 1; + uint8_t limit32 : 1; // must be < 4GB in host } __attribute__((packed)); } __attribute__((packed)); } __attribute__((packed)) v3_mem_flags_t; diff --git a/palacios/src/palacios/vmm.c b/palacios/src/palacios/vmm.c index e9634d6..cfef4f9 100644 --- a/palacios/src/palacios/vmm.c +++ b/palacios/src/palacios/vmm.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include #ifdef V3_CONFIG_SVM #include @@ -489,7 +491,121 @@ int v3_move_vm_core(struct v3_vm_info * vm, int vcore_id, int target_cpu) { return 0; } +/* move a memory region to memory with affinity for a specific physical core */ +int v3_move_vm_mem(struct v3_vm_info * vm, void *gpa, int target_cpu) { + int old_node; + int new_node; + struct v3_mem_region *reg; + void *new_hpa; + int num_pages; + void *old_hpa; + int i; + + old_node = v3_numa_gpa_to_node(vm,(addr_t)gpa); + + if (old_node<0) { + PrintError(vm, VCORE_NONE, "Cannot determine current node of gpa %p\n",gpa); + return -1; + } + + new_node = v3_numa_cpu_to_node(target_cpu); + + if (new_node<0) { + PrintError(vm, VCORE_NONE, "Cannot determine current node of cpu %d\n",target_cpu); + return -1; + } + + if (new_node==old_node) { + PrintDebug(vm, VCORE_NONE, "Affinity is already established - ignoring request\n"); + return 0; + } + + // We are now going to change the universe, so + // we'll barrier everyone first + + while (v3_raise_barrier(vm, NULL) == -1); + + // get region + + reg = v3_get_mem_region(vm, V3_MEM_CORE_ANY, (addr_t) gpa); + + if (!reg) { + PrintError(vm, VCORE_NONE, "Attempt to migrate non-existent memory\n"); + goto out_fail; + } + + if (!(reg->flags.base) || !(reg->flags.alloced)) { + PrintError(vm, VCORE_NONE, "Attempt to migrate invalid region: base=%d alloced=%d\n", reg->flags.base, reg->flags.alloced); + goto out_fail; + } + + // we now have the allocated base region corresponding to - and not a copy + // we will rewrite this region after moving its contents + + // first, let's double check that we are in fact changing the numa_id... 
+    if (reg->numa_id==new_node) {
+	PrintDebug(vm, VCORE_NONE, "Affinity for this base region is already established - ignoring...\n");
+	goto out_success;
+    }
+
+    // region uses exclusive addressing [guest_start,guest_end)
+    num_pages = (reg->guest_end-reg->guest_start)/PAGE_SIZE;
+
+    // Now we allocate space for the new region with the same constraints as
+    // it originally had
+    new_hpa = V3_AllocPagesExtended(num_pages,
+				    PAGE_SIZE_4KB,
+				    new_node,
+				    reg->flags.limit32 ? V3_ALLOC_PAGES_CONSTRAINT_4GB : 0);
+
+    if (!new_hpa) {
+	PrintError(vm, VCORE_NONE, "Cannot allocate memory for new base region...\n");
+	goto out_fail;
+    }
+
+    // Note, assumes virtual contiguity in the host OS...
+    memcpy(V3_VAddr((void*)new_hpa), V3_VAddr((void*)(reg->host_addr)), num_pages*PAGE_SIZE);
+
+    old_hpa = (void*)(reg->host_addr);
+    old_node = (int)(reg->numa_id);
+
+    reg->host_addr = (addr_t)new_hpa;
+    reg->numa_id = v3_numa_hpa_to_node((addr_t)new_hpa);
+
+    // flush all page tables / kill all humans
+
+    for (i=0;i<vm->num_cores;i++) {
+	if (vm->cores[i].shdw_pg_mode==SHADOW_PAGING) {
+	    v3_invalidate_shadow_pts(&(vm->cores[i]));
+	} else if (vm->cores[i].shdw_pg_mode==NESTED_PAGING) {
+	    // nested invalidator uses inclusive addressing [start,end], not [start,end)
+	    v3_invalidate_nested_addr_range(&(vm->cores[i]),reg->guest_start,reg->guest_end-1);
+	} else {
+	    PrintError(vm,VCORE_NONE, "Cannot determine how to invalidate paging structures! Reverting to previous region.\n");
+	    // We'll restore things...
+	    reg->host_addr = (addr_t) old_hpa;
+	    reg->numa_id = old_node;
+	    V3_FreePages(new_hpa,num_pages);
+	    goto out_fail;
+	}
+    }
+
+    // Now the old region can go away...
+    V3_FreePages(old_hpa,num_pages);
+
+    PrintDebug(vm,VCORE_NONE,"Migration of memory complete - new region is %p to %p\n",
+	       (void*)(reg->host_addr),(void*)(reg->host_addr+num_pages*PAGE_SIZE-1));
+
+ out_success:
+    v3_lower_barrier(vm);
+    return 0;
+
+
+ out_fail:
+    v3_lower_barrier(vm);
+    return -1;
+}
 
 int v3_stop_vm(struct v3_vm_info * vm) {
diff --git a/palacios/src/palacios/vmm_mem.c b/palacios/src/palacios/vmm_mem.c
index bc9490f..3b62842 100644
--- a/palacios/src/palacios/vmm_mem.c
+++ b/palacios/src/palacios/vmm_mem.c
@@ -27,6 +27,7 @@
 
 #include 
 #include 
+#include 
 
 
 uint64_t v3_mem_block_size = V3_CONFIG_MEM_BLOCK_SIZE;
@@ -185,13 +186,18 @@ int v3_init_mem_map(struct v3_vm_info * vm) {
 
 	// Clear the memory...
 	memset(V3_VAddr((void *)region->host_addr), 0, v3_mem_block_size);
+
+	// Note assigned numa ID could be different than our request...
+	region->numa_id = v3_numa_hpa_to_node(region->host_addr);
 
 	region->flags.read = 1;
 	region->flags.write = 1;
 	region->flags.exec = 1;
 	region->flags.base = 1;
 	region->flags.alloced = 1;
+	region->flags.limit32 = will_use_shadow_paging(vm);
+
 	region->unhandled = unhandled_err;
     }