From: Peter Dinda
Date: Thu, 10 Oct 2013 20:22:01 +0000 (-0500)
Subject: Dynamic migration of memory from node to node
X-Git-Url: http://v3vee.org/palacios/gitweb/gitweb.cgi?p=palacios.git;a=commitdiff_plain;h=b07dd4e35a37f1db04c4f52f9904ee7206673ac6

Dynamic migration of memory from node to node

This adds the ability to move memory regions of a VM from user space:

    v3_mem_move /dev/v3-vmX guest_paddr host_cpuid

This will move the memory region in which guest_paddr is contained from
its current NUMA node to the node affiliated with host_cpuid.

Combined with v3_core_move, this should provide the ability to
dynamically manipulate NUMA mappings at runtime, down to the granularity
of v3_mem_block_size (which is selected at insmod time).
---
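Usage sketch (not part of this patch): because each base region covers
v3_mem_block_size bytes, an entire guest's memory can be migrated by
issuing one V3_VM_MOVE_MEM ioctl per block. The sketch below reuses the
V3_VM_MOVE_MEM number and struct v3_mem_move_cmd that this patch adds to
linux_usr/v3_ctrl.h; the tool name v3_mem_move_all and the choice to take
the memory size and block size as arguments are illustrative assumptions
(the block size is fixed at insmod time, and the current layout can be
read from /proc/v3vee/v3-guests-details).

    /* v3_mem_move_all.c - hypothetical helper, for illustration only.
     * Moves every v3_mem_block_size-sized base region of a guest to
     * memory with affinity for one physical CPU, one ioctl per block. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>

    #include "v3_ctrl.h"   /* V3_VM_MOVE_MEM, struct v3_mem_move_cmd (added below) */

    int main(int argc, char *argv[]) {
        unsigned long long gpa, mem_size, block_size;
        struct v3_mem_move_cmd cmd;
        unsigned short pcore;
        int vm_fd;

        if (argc < 5) {
            printf("usage: v3_mem_move_all <vm device> <mem size (hex)> <block size (hex)> <target pcpu>\n");
            return -1;
        }

        mem_size   = strtoull(argv[2], 0, 16);
        block_size = strtoull(argv[3], 0, 16);
        pcore      = (unsigned short) atoi(argv[4]);

        vm_fd = open(argv[1], O_RDONLY);
        if (vm_fd < 0) {
            perror("open");
            return -1;
        }

        /* each call migrates the whole base region containing gpa */
        for (gpa = 0; gpa < mem_size; gpa += block_size) {
            cmd.gpa = gpa;
            cmd.pcore_id = pcore;
            if (ioctl(vm_fd, V3_VM_MOVE_MEM, &cmd) < 0) {
                printf("move of region at gpa 0x%llx failed\n", gpa);
                close(vm_fd);
                return -1;
            }
        }

        close(vm_fd);
        return 0;
    }

The resulting placement can then be checked in /proc/v3vee/v3-guests-details,
which with this patch reports the NUMA node of each region.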
diff --git a/linux_module/main.c b/linux_module/main.c
index 8162172..932b739 100644
--- a/linux_module/main.c
+++ b/linux_module/main.c
@@ -355,8 +355,9 @@ static int read_guests_details(struct seq_file *s, void *v)
 	    seq_printf(s, "\nMemory Regions\n");
 	    for (j=0;j<mem->num_regions;j++) {
-		seq_printf(s,"  region %u has HPAs 0x%p-0x%p\n",
-			   j, mem->region[j].host_paddr, mem->region[j].host_paddr+mem->region[j].size);
+		seq_printf(s,"  region %u has HPAs 0x%p-0x%p (node %d)\n",
+			   j, mem->region[j].host_paddr, mem->region[j].host_paddr+mem->region[j].size,
+			   numa_addr_to_node((uintptr_t)(mem->region[j].host_paddr)));
 	    }
 	}
 
 	seq_printf(s,
diff --git a/linux_module/palacios.h b/linux_module/palacios.h
index 7b3a32b..c9cbb96 100644
--- a/linux_module/palacios.h
+++ b/linux_module/palacios.h
@@ -39,6 +39,8 @@
 #define V3_VM_SEND 34
 #define V3_VM_RECEIVE 35
 
+#define V3_VM_MOVE_MEM 36
+
 #define V3_VM_FB_INPUT 257
 #define V3_VM_FB_QUERY 258
 
@@ -77,6 +79,11 @@ struct v3_core_move_cmd {
     unsigned short pcore_id;
 } __attribute__((packed));
 
+struct v3_mem_move_cmd{
+    unsigned long long gpa;
+    unsigned short pcore_id;
+} __attribute__((packed));
+
 struct v3_chkpt_info {
     char store[128];
     char url[256]; /* This might need to be bigger... */
@@ -156,7 +163,7 @@ void *palacios_start_kernel_thread(int (*fn)(void * arg), void *arg, char *thread_name);
 void *palacios_start_thread_on_cpu(int cpu_id, int (*fn)(void * arg), void *arg, char *thread_name);
 int palacios_move_thread_to_cpu(int new_cpu_id, void *thread_ptr);
 void palacios_yield_cpu(void);
-void palacios_yield_cpu_timed(unsigned int us);
+void palacios_sleep_cpu(unsigned int us);
 unsigned int palacios_get_cpu(void);
 unsigned int palacios_get_cpu_khz(void);
 void *palacios_mutex_alloc(void);      // allocates and inits a lock
diff --git a/linux_module/vm.c b/linux_module/vm.c
index ea585ab..2ff482b 100644
--- a/linux_module/vm.c
+++ b/linux_module/vm.c
@@ -317,13 +317,36 @@ static long v3_vm_ioctl(struct file * filp,
 
 	    memset(&cmd, 0, sizeof(struct v3_core_move_cmd));
 
 	    if (copy_from_user(&cmd, argp, sizeof(struct v3_core_move_cmd))) {
-		WARNING("copy from user error getting migrate command...\n");
+		WARNING("copy from user error getting migrate core command...\n");
 		return -EFAULT;
 	    }
 
 	    INFO("moving guest %s vcore %d to CPU %d\n", guest->name, cmd.vcore_id, cmd.pcore_id);
 
-	    v3_move_vm_core(guest->v3_ctx, cmd.vcore_id, cmd.pcore_id);
+	    if (v3_move_vm_core(guest->v3_ctx, cmd.vcore_id, cmd.pcore_id)) {
+		ERROR("Could not move core\n");
+		return -EFAULT;
+	    }
+
+	    break;
+	}
+	case V3_VM_MOVE_MEM: {
+	    struct v3_mem_move_cmd cmd;
+	    void __user * argp = (void __user *)arg;
+
+	    memset(&cmd, 0, sizeof(struct v3_mem_move_cmd));
+
+	    if (copy_from_user(&cmd, argp, sizeof(struct v3_mem_move_cmd))) {
+		WARNING("copy from user error getting migrate memory command...\n");
+		return -EFAULT;
+	    }
+
+	    INFO("moving guest %s memory at gpa %p to memory with affinity for CPU %d\n", guest->name, (void*)(cmd.gpa), cmd.pcore_id);
+
+	    if (v3_move_vm_mem(guest->v3_ctx, (void*)(cmd.gpa), cmd.pcore_id)) {
+		ERROR("Could not move memory\n");
+		return -EFAULT;
+	    }
 
 	    break;
 	}
diff --git a/linux_usr/Makefile b/linux_usr/Makefile
index 0cf09ce..51fcc74 100644
--- a/linux_usr/Makefile
+++ b/linux_usr/Makefile
@@ -20,6 +20,7 @@ BASE_EXECS = 	v3_mem \
 		v3_pause \
 		v3_continue \
 		v3_core_move \
+		v3_mem_move \
 		v3_load \
 		v3_save \
 		v3_cons \
diff --git a/linux_usr/v3_ctrl.h b/linux_usr/v3_ctrl.h
index e5e0d76..20eadc4 100644
--- a/linux_usr/v3_ctrl.h
+++ b/linux_usr/v3_ctrl.h
@@ -43,6 +43,8 @@
 #define V3_VM_SEND 34
 #define V3_VM_RECEIVE 35
 
+#define V3_VM_MOVE_MEM 36
+
 #define V3_VM_FB_INPUT 257
 #define V3_VM_FB_QUERY 258
 
@@ -78,6 +80,11 @@ struct v3_core_move_cmd{
     unsigned short pcore_id;
 } __attribute__((packed));
 
+struct v3_mem_move_cmd{
+    unsigned long long gpa;
+    unsigned short pcore_id;
+} __attribute__((packed));
+
 struct v3_debug_cmd {
     unsigned int core;
     unsigned int cmd;
diff --git a/linux_usr/v3_mem_move.c b/linux_usr/v3_mem_move.c
new file mode 100644
index 0000000..145217c
--- /dev/null
+++ b/linux_usr/v3_mem_move.c
@@ -0,0 +1,59 @@
+/*
+ * V3 memory movement control
+ * (c) Peter Dinda 2013
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+
+#include "v3_ctrl.h"
+
+
+int main(int argc, char* argv[]) {
+    int vm_fd;
+    char * vm_dev = NULL;
+    struct v3_mem_move_cmd cmd;
+
+    if (argc < 4) {
+	printf("usage: v3_mem_move <vm_device> <guest_physical_addr> <target_physical_cpu>\n\n");
+	printf("Moves the memory region into which the guest_physical_addr is mapped\n");
+	printf("to host physical memory that has highest affinity for the target_physical_cpu.\n");
+	printf("You can find the current memory mapping via /proc/v3vee/v3-guests-details\n\n");
+	printf(" guest_physical_addr  - hex address\n");
(0..numcpus-1)\n\n"); + return -1; + } + + vm_dev = argv[1]; + cmd.gpa = strtoll(argv[2],0,16); + cmd.pcore_id = atoi(argv[3]); + + printf("Migrating memory region of %p to memory with affinity for physical CPU %d\n", cmd.gpa, cmd.pcore_id); + + vm_fd = open(vm_dev, O_RDONLY); + + if (vm_fd == -1) { + printf("Error opening VM device: %s\n", vm_dev); + return -1; + } + + int err = ioctl(vm_fd, V3_VM_MOVE_MEM, &cmd); + + if (err < 0) { + printf("Error write memory migrating command to vm\n"); + return -1; + } + + close(vm_fd); + + return 0; +} + + diff --git a/palacios/include/palacios/vmm.h b/palacios/include/palacios/vmm.h index c4d0e17..983cd78 100644 --- a/palacios/include/palacios/vmm.h +++ b/palacios/include/palacios/vmm.h @@ -449,6 +449,7 @@ int v3_send_vm(struct v3_vm_info * vm, char * store, char * url, unsigned long l int v3_receive_vm(struct v3_vm_info * vm, char * store, char * url, unsigned long long opts); int v3_move_vm_core(struct v3_vm_info * vm, int vcore_id, int target_cpu); +int v3_move_vm_mem(struct v3_vm_info * vm, void *gpa, int target_cpu); int v3_free_vm(struct v3_vm_info * vm); diff --git a/palacios/include/palacios/vmm_mem.h b/palacios/include/palacios/vmm_mem.h index ff20faf..7bec3e6 100644 --- a/palacios/include/palacios/vmm_mem.h +++ b/palacios/include/palacios/vmm_mem.h @@ -52,6 +52,7 @@ typedef struct { uint8_t exec : 1; uint8_t base : 1; uint8_t alloced : 1; + uint8_t limit32 : 1; // must be < 4GB in host } __attribute__((packed)); } __attribute__((packed)); } __attribute__((packed)) v3_mem_flags_t; diff --git a/palacios/src/palacios/vmm.c b/palacios/src/palacios/vmm.c index e9634d6..cfef4f9 100644 --- a/palacios/src/palacios/vmm.c +++ b/palacios/src/palacios/vmm.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include #ifdef V3_CONFIG_SVM #include @@ -489,7 +491,121 @@ int v3_move_vm_core(struct v3_vm_info * vm, int vcore_id, int target_cpu) { return 0; } +/* move a memory region to memory with affinity for a specific physical core */ +int v3_move_vm_mem(struct v3_vm_info * vm, void *gpa, int target_cpu) { + int old_node; + int new_node; + struct v3_mem_region *reg; + void *new_hpa; + int num_pages; + void *old_hpa; + int i; + + old_node = v3_numa_gpa_to_node(vm,(addr_t)gpa); + + if (old_node<0) { + PrintError(vm, VCORE_NONE, "Cannot determine current node of gpa %p\n",gpa); + return -1; + } + + new_node = v3_numa_cpu_to_node(target_cpu); + + if (new_node<0) { + PrintError(vm, VCORE_NONE, "Cannot determine current node of cpu %d\n",target_cpu); + return -1; + } + + if (new_node==old_node) { + PrintDebug(vm, VCORE_NONE, "Affinity is already established - ignoring request\n"); + return 0; + } + + // We are now going to change the universe, so + // we'll barrier everyone first + + while (v3_raise_barrier(vm, NULL) == -1); + + // get region + + reg = v3_get_mem_region(vm, V3_MEM_CORE_ANY, (addr_t) gpa); + + if (!reg) { + PrintError(vm, VCORE_NONE, "Attempt to migrate non-existent memory\n"); + goto out_fail; + } + + if (!(reg->flags.base) || !(reg->flags.alloced)) { + PrintError(vm, VCORE_NONE, "Attempt to migrate invalid region: base=%d alloced=%d\n", reg->flags.base, reg->flags.alloced); + goto out_fail; + } + + // we now have the allocated base region corresponding to - and not a copy + // we will rewrite this region after moving its contents + + // first, let's double check that we are in fact changing the numa_id... 
+    if (reg->numa_id==new_node) {
+	PrintDebug(vm, VCORE_NONE, "Affinity for this base region is already established - ignoring...\n");
+	goto out_success;
+    }
+
+    // region uses exclusive addressing [guest_start,guest_end)
+    num_pages = (reg->guest_end-reg->guest_start)/PAGE_SIZE;
+
+    // Now we allocate space for the new region with the same constraints as
+    // it originally had
+    new_hpa = V3_AllocPagesExtended(num_pages,
+				    PAGE_SIZE_4KB,
+				    new_node,
+				    reg->flags.limit32 ? V3_ALLOC_PAGES_CONSTRAINT_4GB : 0);
+
+    if (!new_hpa) {
+	PrintError(vm, VCORE_NONE, "Cannot allocate memory for new base region...\n");
+	goto out_fail;
+    }
+
+    // Note, assumes virtual contiguity in the host OS...
+    memcpy(V3_VAddr((void*)new_hpa), V3_VAddr((void*)(reg->host_addr)), num_pages*PAGE_SIZE);
+
+    old_hpa = (void*)(reg->host_addr);
+    old_node = (int)(reg->numa_id);
+
+    reg->host_addr = (addr_t)new_hpa;
+    reg->numa_id = v3_numa_hpa_to_node((addr_t)new_hpa);
+
+    // flush all page tables / kill all humans
+
+    for (i=0;i<vm->num_cores;i++) {
+	if (vm->cores[i].shdw_pg_mode==SHADOW_PAGING) {
+	    v3_invalidate_shadow_pts(&(vm->cores[i]));
+	} else if (vm->cores[i].shdw_pg_mode==NESTED_PAGING) {
+	    // nested invalidator uses inclusive addressing [start,end], not [start,end)
+	    v3_invalidate_nested_addr_range(&(vm->cores[i]),reg->guest_start,reg->guest_end-1);
+	} else {
+	    PrintError(vm,VCORE_NONE, "Cannot determine how to invalidate paging structures! Reverting to previous region.\n");
+	    // We'll restore things...
+	    reg->host_addr = (addr_t) old_hpa;
+	    reg->numa_id = old_node;
+	    V3_FreePages(new_hpa,num_pages);
+	    goto out_fail;
+	}
+    }
+
+    // Now the old region can go away...
+    V3_FreePages(old_hpa,num_pages);
+
+    PrintDebug(vm,VCORE_NONE,"Migration of memory complete - new region is %p to %p\n",
+	       (void*)(reg->host_addr),(void*)(reg->host_addr+num_pages*PAGE_SIZE-1));
+
+ out_success:
+    v3_lower_barrier(vm);
+    return 0;
+
+
+ out_fail:
+    v3_lower_barrier(vm);
+    return -1;
+}
 
 int v3_stop_vm(struct v3_vm_info * vm) {
diff --git a/palacios/src/palacios/vmm_mem.c b/palacios/src/palacios/vmm_mem.c
index bc9490f..3b62842 100644
--- a/palacios/src/palacios/vmm_mem.c
+++ b/palacios/src/palacios/vmm_mem.c
@@ -27,6 +27,7 @@
 
 #include 
 #include 
+#include 
 
 
 uint64_t v3_mem_block_size = V3_CONFIG_MEM_BLOCK_SIZE;
@@ -185,13 +186,18 @@ int v3_init_mem_map(struct v3_vm_info * vm) {
 
 	// Clear the memory...
 	memset(V3_VAddr((void *)region->host_addr), 0, v3_mem_block_size);
+
+	// Note assigned numa ID could be different than our request...
+	region->numa_id = v3_numa_hpa_to_node(region->host_addr);
 
 	region->flags.read = 1;
 	region->flags.write = 1;
 	region->flags.exec = 1;
 	region->flags.base = 1;
 	region->flags.alloced = 1;
+	region->flags.limit32 = will_use_shadow_paging(vm);
+
 	region->unhandled = unhandled_err;
     }