Palacios Public Git Repository

To check out Palacios, execute:

  git clone http://v3vee.org/palacios/palacios.web/palacios.git
This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, simply execute:
  cd palacios
  git checkout --track -b devel origin/devel
The other branches are similar.


v3_mem enhancements - managed offlining; ability to deallocate memory (obviates v3_me...
[palacios.git] / linux_usr / v3_mem.c
index 87663ac..be66474 100644 (file)
@@ -12,7 +12,8 @@
 #include <sys/types.h> 
 #include <unistd.h> 
 #include <string.h>
-#include <dirent.h> 
+#include <dirent.h>
+#include <alloca.h> 
 
 #include "v3_ctrl.h"
 
 
 #define BUF_SIZE 128
 
+// Path of the file recording the memory regions this tool has offlined;
+// main() builds it as $PALACIOS_DIR/.v3offlinedmem.
+char offname[256];
+// Stream open on offname: opened "a+" by main(), reopened "w+" by write_offlined().
+FILE *off;
 
+// In-memory copy of the offline file, filled in by read_offlined():
+// num_offline entries, each a start address and a length in bytes.
+int num_offline;
+unsigned long long *start_offline;
+unsigned long long *len_offline;
+
+// Helpers that load, rewrite, search and release the offline map.
+static int read_offlined();
+static int write_offlined();
+static int find_offlined(unsigned long long base_addr);
+static int clear_offlined();
 
 
 static int offline_memory(unsigned long long mem_size_bytes,
@@ -29,48 +40,59 @@ static int offline_memory(unsigned long long mem_size_bytes,
                          unsigned long long *num_bytes, 
                          unsigned long long *base_addr);
 
+// Re-online the memory blocks covering [base_addr, base_addr+num_bytes).
+// Parameter names follow the definition and the call in main(): address
+// first, length second.
+static int online_memory(unsigned long long base_addr,
+                        unsigned long long num_bytes);
+
+
+
+
 int main(int argc, char * argv[]) {
     unsigned long long mem_size_bytes = 0;
     unsigned long long mem_min_start = 0;
     int v3_fd = -1;
     int request = 0;
     int limit32 = 0;
+    int help=0;
+    int alloffline=0;
+    // Must start as NONE: the usage check below tests op==NONE when neither
+    // -a nor -r was given, and an uninitialized op makes that test undefined.
+    enum {NONE, ADD, REMOVE} op = NONE;
     int node = -1;
     int c;
     unsigned long long num_bytes, base_addr;
     struct v3_mem_region mem;
 
-    if (argc<2 || argc>5) {
-       printf("usage: v3_mem [-r] [-l] [-n k] <memory size (MB)> [min_start (MB)]\n\n"
-              "Allocate memory for use by Palacios.\n\n"
-              "With    -r    this requests in-kernel allocation.\n"
-              "Without -r    this attempts to offline memory via hot\n"
-              "              remove.\n"
-              "With    -l    the request or offlining is limited to first 4 GB\n"
-              "Without -l    the request or offlining has no limits\n\n"
-              "With    -n k  the request is for numa node k\n"
-              "Without -n k  the request can be on any numa node\n\n"
-              "For offlining, min_start is the minimum allowable starting address.\n"
-               "This is zero by default\n\n");  
-        return -1;
-    }
-
-    while ((c=getopt(argc,argv,"rln:"))!=-1) {
+    while ((c=getopt(argc,argv,"harklm:n:"))!=-1) {
        switch (c) {
+           case 'h':
+               help=1;
+               break;
+           case 'a':
+               op=ADD;
+               break;
            case 'r':
+               op=REMOVE;
+               break;
+           case 'k':
                request=1;
                break;
            case 'l':
                limit32=1;
                break;
+           case 'm':
+               mem_min_start = atoll(optarg) * (1024*1024);
+               break;
            case 'n':
                node = atoi(optarg);
                break;
            case '?':
                if (optopt=='n') { 
                    printf("-n requires the numa node...\n");
+            return -1;
+               } else if (optopt=='m') { 
+                   printf("-m requires the minimum starting address (in MB)...\n");
+            return -1;
                } else {
                    printf("Unknown option %c\n",optopt);
+            return -1;
                }
                break;
            default:
@@ -78,48 +100,166 @@ int main(int argc, char * argv[]) {
                break;
        }
     }
+
+
+    if (op==NONE || optind==argc || help) {
+       printf("usage: v3_mem [ [-k] [-l] [-n k] [-m n] -a <memory size (MB)>] | [-r <hexaddr> | offline]\n\n"
+              "Palacios Memory Management\n\nMemory Addition\n"
+              " -a <mem>      Allocate memory for use by Palacios (MB).\n\n"
+              " With    -k    this requests in-kernel allocation\n"
+              " Without -k    this attempts to offline memory via hot remove\n\n"
+              " With    -l    the request or offlining is limited to first 4 GB\n"
+              " Without -l    the request or offlining has no limits\n\n"
+              " With    -m n  the search for offlineable memory starts at n MB\n"
+              " Without -m n  the search for offlineable memory starts at 0 MB\n\n"
+              " With    -n i  the request is for numa node i\n"
+              " Without -n i  the request can be satified on any numa node\n\n"
+              "Memory Removal\n"
+              " -r <hexaddr>  Free Palacios memory containing hexaddr, online it if needed\n"
+              " -r offline    Free all offline Palacios memory and online it\n"
+              );
        
+       return -1;
+    }
 
-    mem_size_bytes = atoll(argv[optind]) * (1024 * 1024);
+    if (op==ADD) {
+       mem_size_bytes = atoll(argv[optind]) * (1024 * 1024);
+    } else if (op==REMOVE) { 
+       if (!strcasecmp(argv[optind],"offline")) {
+           alloffline=1;
+       } else {
+           base_addr=strtoll(argv[optind],NULL,16);
+       }
+    }
+    
+    if (!getenv("PALACIOS_DIR")) { 
+       printf("Please set the PALACIOS_DIR variable\n");
+       return -1;
+    }
 
-    if ((optind+1) < argc) { 
-        mem_min_start = atoll(argv[optind+1]) * (1024 * 1024);
+    // PALACIOS_DIR comes from the environment and can be arbitrarily long;
+    // build the path with a bounded write and refuse a truncated result.
+    if (snprintf(offname,sizeof(offname),"%s/.v3offlinedmem",getenv("PALACIOS_DIR"))>=(int)sizeof(offname)) {
+       printf("PALACIOS_DIR is too long\n");
+       return -1;
+    }
+
+    if (!(off=fopen(offname,"a+"))) { 
+       printf("Cannot open or create offline memory file %s",offname);
+       return -1;
+    }
+
+    // removing all offlined memory we added is a special case
+    if (op==REMOVE && alloffline) {
+       int i;
+       int rc=0;
+
+       // we just need to reinvoke ourselves
+       read_offlined();
+       for (i=0;i<num_offline;i++) {
+           char cmd[256];
+           sprintf(cmd,"v3_mem -r %llx", start_offline[i]);
+           rc|=system(cmd);
+       }
+       clear_offlined();
+       return rc;
     }
 
+       
     v3_fd = open(v3_dev, O_RDONLY);
     
     if (v3_fd == -1) {
        printf("Error opening V3Vee control device\n");
+       fclose(off);
        return -1;
     }
 
-    if (!request) { 
-       printf("Trying to offline memory (size=%llu, min_start=%llu, limit32=%d)\n",mem_size_bytes,mem_min_start,limit32);
-       if (offline_memory(mem_size_bytes,mem_min_start,limit32, &num_bytes, &base_addr)) { 
-           printf("Could not offline memory\n");
+
+    if (op==ADD) { 
+       if (!request) { 
+           printf("Trying to offline memory (size=%llu, min_start=%llu, limit32=%d)\n",mem_size_bytes,mem_min_start,limit32);
+           if (offline_memory(mem_size_bytes,mem_min_start,limit32, &num_bytes, &base_addr)) { 
+               printf("Could not offline memory\n");
+               fclose(off);
+               close(v3_fd);
+               return -1;
+           }
+           
+           fprintf(off,"%llx\t%llx\n",base_addr, num_bytes);
+           
+           mem.type=PREALLOCATED;
+           mem.node=node;
+           mem.base_addr=base_addr;
+           mem.num_pages=num_bytes/4096;
+           
+       } else {
+           printf("Generating memory allocation request (size=%llu, limit32=%d)\n", mem_size_bytes, limit32);
+           mem.type = limit32 ? REQUESTED32 : REQUESTED;
+           mem.node = node;
+           mem.base_addr = 0;
+           mem.num_pages = mem_size_bytes / 4096;
+       }
+       
+       printf("Allocation request is: type=%d, node=%d, base_addr=0x%llx, num_pages=%llu\n",
+              mem.type, mem.node, mem.base_addr, mem.num_pages);
+       
+       if (ioctl(v3_fd, V3_ADD_MEMORY, &mem)<0) { 
+           printf("Request rejected by Palacios\n");
+           close(v3_fd);
+           fclose(off);
            return -1;
+       } else {
+           printf("Request accepted by Palacios\n");
+           close(v3_fd);       
+           fclose(off);
+           return 0;
        }
-       mem.type=PREALLOCATED;
-       mem.node=0;
+       
+    } else if (op==REMOVE) { 
+       int entry;
+
+       read_offlined();
+
+       entry=find_offlined(base_addr);
+       
+       if (entry<0) { 
+           // no need to offline
+           mem.type=REQUESTED;
+       } else {
+           mem.type=PREALLOCATED;
+       }
+       
        mem.base_addr=base_addr;
-       mem.num_pages=num_bytes/4096;
-    } else {
-       printf("Generating memory allocation request (size=%llu, limit32=%d)\n", mem_size_bytes, limit32);
-       mem.type = limit32 ? REQUESTED32 : REQUESTED;
-       mem.node = node;
-       num_bytes = mem_size_bytes;
-       base_addr = 0;
-    }
-    
-    if (ioctl(v3_fd, V3_ADD_MEMORY, &mem)<0) { 
-       printf("Request rejected by Palacios\n");
-       close(v3_fd);
-       return -1;
-    } else {
+
+       // now remove it from palacios
+       printf("Deallocation request is: type=%d, base_addr=0x%llx\n",
+              mem.type, mem.base_addr);
+       
+       if (ioctl(v3_fd, V3_REMOVE_MEMORY, &mem)<0) { 
+           printf("Request rejected by Palacios\n");
+           close(v3_fd);
+           fclose(off);
+           return -1;
+       } 
+
        printf("Request accepted by Palacios\n");
-       close(v3_fd);   
+
+       if (entry>=0) { 
+           printf("Onlining the memory to make it available to the kernel\n");
+           online_memory(start_offline[entry],len_offline[entry]);
+       
+           len_offline[entry] = 0;
+
+           write_offlined();
+
+           
+       } else {
+           printf("Memory was deallocated in the kernel\n");
+       }
+
+       clear_offlined();
+       close(v3_fd);
+       fclose(off);
+
        return 0;
     }
+
 } 
 
 
@@ -135,11 +275,31 @@ static int dir_filter(const struct dirent * dir) {
+// Ordering callback for two directory entries: compares the integers parsed
+// from each name starting at character offset 6 (presumably skipping a
+// "memory" prefix, as in the memoryN paths used elsewhere in this file --
+// confirm against dir_filter).  Returns <0, 0 or >0.
 static int dir_cmp(const struct dirent **dir1, const struct dirent ** dir2) {
     int num1 = atoi((*dir1)->d_name + 6);
     int num2 = atoi((*dir2)->d_name + 6);
-
+    
     return num1 - num2;
 }
 
 
+
+/*
+ * UNWIND(first,last): re-online memory blocks first..last (inclusive) by
+ * writing "online" to each block's state file under SYS_PATH.  If a state
+ * file cannot be opened, it returns -1 from the ENCLOSING function.
+ *
+ * Both bounds are copied into uniquely named locals before the loop index
+ * is declared.  Callers pass expressions that mention their own `i`
+ * (e.g. UNWIND(reg_start, i+reg_start-1)); were the macro to declare a
+ * plain `i` first, that argument would bind to the macro's index and the
+ * loop bound would move along with it.
+ */
+#define UNWIND(first,last)                                     \
+do {                                                           \
+    int unwind_first_ = (first);                               \
+    int unwind_last_ = (last);                                 \
+    int unwind_blk_;                                           \
+    for (unwind_blk_ = unwind_first_;                          \
+        unwind_blk_ <= unwind_last_; unwind_blk_++) {          \
+       FILE *unwind_f_;                                        \
+       char unwind_name_[256];                                 \
+       snprintf(unwind_name_,sizeof(unwind_name_),             \
+                "%smemory%d/state",SYS_PATH,unwind_blk_);      \
+       unwind_f_=fopen(unwind_name_,"r+");                     \
+       if (!unwind_f_) {                                       \
+           perror("Cannot open state file");                   \
+           return -1;                                          \
+       }                                                       \
+       printf("Re-onlining block %d (%s)\n",                   \
+              unwind_blk_,unwind_name_);                       \
+       fprintf(unwind_f_,"online\n");                          \
+       fclose(unwind_f_);                                      \
+    }                                                          \
+} while (0)
+
+
 static int offline_memory(unsigned long long mem_size_bytes,
                          unsigned long long mem_min_start,
                          int limit32, 
@@ -154,6 +314,7 @@ static int offline_memory(unsigned long long mem_size_bytes,
     int reg_start = 0;
     int mem_ready = 0;
     
+    
 
     printf("Trying to find %dMB (%d bytes) of memory above %llu with limit32=%d\n", mem_size_bytes/(1024*1024), mem_size_bytes, mem_min_start, limit32);
        
@@ -210,7 +371,8 @@ static int offline_memory(unsigned long long mem_size_bytes,
        size = bitmap_entries / 8;
        if (bitmap_entries % 8) size++;
            
-       bitmap = malloc(size);
+       bitmap = alloca(size);
+
        if (!bitmap) {
            printf("ERROR: could not allocate space for bitmap\n");
            return -1;
@@ -244,36 +406,67 @@ static int offline_memory(unsigned long long mem_size_bytes,
                continue;
            } 
                
+
+           // The prospective block must be (a) removable, and (b) currently online
                
            printf("Checking %s...", fname);
                
            block_fd = open(fname, O_RDONLY);
                
            if (block_fd == -1) {
-               printf("Hotpluggable memory not supported...\n");
+               printf("Hotpluggable memory not supported or could not determine if block is removable...\n");
                return -1;
            }
                
            if (read(block_fd, status_str, BUF_SIZE) <= 0) {
-               perror("Could not read block status");
+               perror("Could not read block removability information\n");
                return -1;
            }
+           
+           status_str[BUF_SIZE-1]=0;
                
            close(block_fd);
                
            if (atoi(status_str) == 1) {
-               printf("Removable\n");
-               bitmap[major] |= (0x1 << minor);
+               printf("Removable ");
            } else {
                printf("Not removable\n");
+               continue;
+           }
+           
+           snprintf(fname, BUF_SIZE, "%s%s/state", SYS_PATH, tmp_dir->d_name);
+           
+           block_fd = open(fname, O_RDONLY);
+           
+           if (block_fd<0) { 
+               perror("Could not open block state\n");
+               return -1;
            }
+
+           if (read(block_fd, status_str, BUF_SIZE) <=0) { 
+               perror("Could not read block state information\n");
+               return -1;
+           }
+
+           status_str[BUF_SIZE-1]=0;
+
+           close(block_fd);
+
+           if (!strncasecmp(status_str,"offline",7)) {
+               printf("and Already Offline (unusable)\n");
+           } else if (!strncasecmp(status_str,"online",6)) { 
+               printf("and Online (usable)\n");
+               bitmap[major] |= (0x1 << minor);
+           } else {
+               printf("and in Unknown State '%s' (unusable)\n",status_str);
+           }
+           
        }
        
     }
     
     while (!mem_ready) {
-       
-       
+
        /* Scan bitmap for enough consecutive space */
        {
            // num_blocks: The number of blocks we need to find
@@ -304,6 +497,7 @@ static int offline_memory(unsigned long long mem_size_bytes,
            
            if (run_len < num_blocks) {
                fprintf(stderr, "Could not find enough consecutive memory blocks... (found %d)\n", run_len);
+               // no offlining yet, so no need to unwind here
                return -1;
            }
        }
@@ -325,6 +519,7 @@ static int offline_memory(unsigned long long mem_size_bytes,
                
                if (block_file == NULL) {
                    perror("Could not open block file");
+                   UNWIND(reg_start, i+reg_start-1);
                    return -1;
                }
                
@@ -333,6 +528,7 @@ static int offline_memory(unsigned long long mem_size_bytes,
                fprintf(block_file, "offline\n");
                
                fclose(block_file);
+               
            }
        }
        
@@ -362,20 +558,22 @@ static int offline_memory(unsigned long long mem_size_bytes,
                block_fd = open(fname, O_RDONLY);
                
                if (block_fd == -1) {
-                   perror("Could not open block file");
+                   perror("Could not open block state file");
                    return -1;
                }
                
                if (read(block_fd, status_buf, BUF_SIZE) <= 0) {
-                   perror("Could not read block status");
+                   perror("Could not read block state");
                    return -1;
                }
+
+               // Terminate at the last valid index, as done for status_str
+               // above; index BUF_SIZE is one past a BUF_SIZE-byte buffer
+               // (status_buf's declaration is outside this hunk -- confirm).
+               status_buf[BUF_SIZE-1]=0;
                
                printf("Checking offlined block %d (%s)...", i + reg_start, fname);
                
                int ret = strncmp(status_buf, "offline", strlen("offline"));
                
-               if (ret != 0) {
+               if (ret != 0) {  // uh oh
                    int j = 0;
                    int major = (i + reg_start) / 8;
                    int minor = (i + reg_start) % 8;
@@ -384,45 +582,181 @@ static int offline_memory(unsigned long long mem_size_bytes,
                    
                    mem_ready = 0; // Keep searching
                    
-                   printf("ERROR (%d)\n", ret);
-                   
-                   for (j = 0; j < i; j++) {
-                       FILE * block_file = NULL;
-                       char fname[256];
-                       
-                       memset(fname, 0, 256);
-                       
-                       snprintf(fname, 256, "%smemory%d/state", SYS_PATH, j + reg_start);
-                       
-                       block_file = fopen(fname, "r+");
-                       
-                       if (block_file == NULL) {
-                           perror("Could not open block file");
-                           return -1;
-                       }
-                       
-                       fprintf(block_file, "online\n");
-                       
-                       fclose(block_file);
-                   }
+                   printf("ERROR - block status is '%s'\n", status_buf);
+
+                   // Unwind space
+                   UNWIND(reg_start,reg_start+num_blocks-1);
                    
                    break;
                } 
+           }
+           
+           printf("Offlined Memory OK\n");
                
-               printf("OK\n");
+       }
+    }
+    
+    /* Memory is offlined. Calculate size and phys start addr to send to Palacios */
+    *num_bytes = (unsigned long long)(num_blocks) * (unsigned long long)(block_size_bytes);
+    *base_addr = (unsigned long long)(reg_start) * (unsigned long long)(block_size_bytes);
+    
+    return 0;
+}
+
+
+/*
+ * Bring a physical memory range back online for the kernel.
+ *
+ * base_addr  start address of the range
+ * num_bytes  length of the range in bytes
+ *
+ * Reads the memory block size (parsed as hexadecimal) from
+ * SYS_PATH "block_size_bytes", converts the range into block numbers and
+ * writes "online" to each block's state file.
+ *
+ * Returns 0 on success, -1 if the block size cannot be determined or a
+ * block state file cannot be opened.
+ */
+static int online_memory(unsigned long long base_addr,
+                        unsigned long long num_bytes)
+{
+    
+    unsigned int block_size_bytes = 0;
+    int num_blocks = 0;    
+    int reg_start = 0;
+    
+
+    printf("Trying to online memory from %llu to %llu\n",base_addr,base_addr+num_bytes-1);
+       
+    /* Figure out the block size */
+    {
+       int tmp_fd = 0;
+       ssize_t got = 0;
+       char tmp_buf[BUF_SIZE];
+       
+       tmp_fd = open(SYS_PATH "block_size_bytes", O_RDONLY);
+       
+       if (tmp_fd == -1) {
+           perror("Could not open block size file: " SYS_PATH "block_size_bytes");
+           return -1;
+       }
+    
+       /* leave one byte for the terminator the number parser needs */
+       got = read(tmp_fd, tmp_buf, BUF_SIZE-1);
+
+       if (got <= 0) {
+           /* report before close() so errno still describes the read */
+           perror("Could not read block size file: " SYS_PATH "block_size_bytes");
+           close(tmp_fd);
+           return -1;
+       }
+
+       tmp_buf[got] = 0;
+       
+       close(tmp_fd);
+       
+       block_size_bytes = strtoull(tmp_buf, NULL, 16);
+
+       /* a zero size would divide by zero below */
+       if (block_size_bytes == 0) {
+           printf("Invalid memory block size\n");
+           return -1;
+       }
+       
+       printf("Memory block size is %uMB (%u bytes)\n", block_size_bytes / (1024 * 1024), block_size_bytes);
+       
+    }
+    
+    /* round the length up to whole blocks */
+    num_blocks =  num_bytes / block_size_bytes;
+    if (num_bytes % block_size_bytes) num_blocks++;
+
+    reg_start = base_addr / block_size_bytes;
+
+    printf("That is %d blocks of size %u starting at block %d\n", num_blocks, block_size_bytes, reg_start);
+   
+    
+       
+    /* Online memory blocks starting at reg_start */
+    {
+       int i = 0;
+           
+       for (i = 0; i < num_blocks; i++) {      
+           FILE * block_file = NULL;
+           char fname[256];
                 
+           memset(fname, 0, 256);
+           
+           snprintf(fname, 256, "%smemory%d/state", SYS_PATH, i + reg_start);
+           
+           block_file = fopen(fname, "r+");
+           
+           if (block_file == NULL) {
+               perror("Could not open block file");
+               return -1;
             }
+               
+           
+           printf("Onlining block %d (%s)\n", i + reg_start, fname);
+           fprintf(block_file, "online\n");
             
+           fclose(block_file);
             
         }
     }
     
-    free(bitmap);
+    return 0;
     
-    /* Memory is offlined. Calculate size and phys start addr to send to Palacios */
-    *num_bytes = (unsigned long long)(num_blocks) * (unsigned long long)(block_size_bytes);
-    *base_addr = (unsigned long long)(reg_start) * (unsigned long long)(block_size_bytes);
+}
+
+
+
+/*
+ * Load the offline-memory file (stream `off`) into the global map.
+ *
+ * The file holds one "start<TAB>length" pair per line, both hexadecimal.
+ * The first pass counts the entries, the second fills start_offline[] and
+ * len_offline[]; num_offline is set to the number of entries loaded.
+ *
+ * An empty file is not an error: the arrays stay NULL and num_offline is 0.
+ * Returns 0 on success, -1 if the arrays cannot be allocated (the map is
+ * then left empty).
+ */
+static int read_offlined()
+{
+    unsigned long long base, len;
+    int i;
+
+    rewind(off);
+    
+    num_offline=0;
+    while (fscanf(off,"%llx\t%llx\n",&base,&len)==2) { num_offline++; }
+
+    start_offline=NULL;
+    len_offline=NULL;
+
+    if (num_offline==0) {
+       // calloc(0,...) is allowed to return NULL, which must not be
+       // mistaken for an allocation failure
+       return 0;
+    }
+
+    start_offline=calloc(num_offline, sizeof(*start_offline));
+    len_offline=calloc(num_offline, sizeof(*len_offline));
+
+    if (!start_offline || !len_offline) { 
+       printf("Cannot allocate space to load offline map\n");
+       // release whichever half did get allocated
+       free(start_offline);
+       free(len_offline);
+       start_offline=NULL;
+       len_offline=NULL;
+       num_offline=0;
+       return -1;
+    }
+
+    rewind(off);
+    for (i=0;i<num_offline;i++) { 
+       if (fscanf(off,"%llx\t%llx",&(start_offline[i]),&(len_offline[i]))!=2) {
+           // the file no longer matches the first pass; keep what was read
+           num_offline=i;
+           break;
+       }
+    }
+    // the stream is now positioned just past the last entry read
+    return 0;
+}
 
+
+/*
+ * Rewrite the offline-memory file from the in-memory map, skipping
+ * entries whose length has been set to zero.
+ *
+ * The replacement stream is opened before the old one is closed, so on
+ * failure `off` still refers to the previous, valid stream and callers
+ * may keep using or closing it.
+ *
+ * Returns 0 on success, -1 if the file cannot be opened for writing.
+ */
+static int write_offlined()
+{
+    int i;
+    FILE *fresh;
+
+    if (!(fresh=fopen(offname,"w+"))) {  // truncate
+       printf("Cannot open %s for writing!\n",offname);
+       return -1;
+    }
+
+    fclose(off);
+    off=fresh;
+
+    for (i=0;i<num_offline;i++) { 
+       if (len_offline[i]) { 
+           fprintf(off,"%llx\t%llx\n",start_offline[i],len_offline[i]);
+       }
+    }
+    // we are now back to the end, and can keep appending
+    return 0;
+}
+
+
+/*
+ * Release the in-memory offline map and mark it empty.
+ * Always returns 0.
+ */
+static int clear_offlined()
+{
+    free(start_offline);
+    free(len_offline);
+    // reset the globals so a later find_offlined() or clear_offlined()
+    // cannot index or free the released arrays
+    start_offline=NULL;
+    len_offline=NULL;
+    num_offline=0;
     return 0;
 }
 
+/*
+ * Look up the recorded region that contains base_addr.
+ * Returns the index into start_offline[]/len_offline[] of the first
+ * region whose [start, start+len) range covers base_addr, or -1 if no
+ * region does.
+ */
+static int find_offlined(unsigned long long base_addr)
+{
+    int idx=0;
+
+    while (idx<num_offline) {
+       unsigned long long lo=start_offline[idx];
+       unsigned long long hi=lo+len_offline[idx];   // exclusive upper bound
+
+       if (!(base_addr<lo || base_addr>=hi)) {
+           return idx;
+       }
+
+       idx++;
+    }
+
+    return -1;
+
+}
+
+
+    
+
+