X-Git-Url: http://v3vee.org/palacios/gitweb/gitweb.cgi?a=blobdiff_plain;f=linux_usr%2Fv3_mem.c;h=4d08bd4fd666873c027b8923914f6d4d03b7df24;hb=68f8c4cd303c5da40c1083cbabdaf6395e4dbaa1;hp=87663ac4d4202d6b0e0cb21112b348b80d58b10e;hpb=37c08f88840030dae5893a8ad148262f3e08e9be;p=palacios.git diff --git a/linux_usr/v3_mem.c b/linux_usr/v3_mem.c index 87663ac..4d08bd4 100644 --- a/linux_usr/v3_mem.c +++ b/linux_usr/v3_mem.c @@ -12,114 +12,329 @@ #include #include #include -#include +#include +#include #include "v3_ctrl.h" +// set to zero to ignore, or set +// to a level likely given the largest contiguous +// page allocation outside of the base regions +// note that the seed pools provide 2-4 MB chunks +// to start +#define PALACIOS_MIN_ALLOC (64*4096ULL) + #define SYS_PATH "/sys/devices/system/memory/" #define BUF_SIZE 128 +int verbose=0; + +char offname[256]; +FILE *off; + +int num_offline; +unsigned long long *start_offline; +unsigned long long *len_offline; + +unsigned long long kernel_max_order; +unsigned long long kernel_max_page_alloc_bytes; +unsigned long long kernel_num_nodes; +unsigned long long kernel_num_cpus; +unsigned long long palacios_compiled_mem_block_size; +unsigned long long palacios_runtime_mem_block_size; +#define VPRINTF(...) do { if (verbose) { printf(__VA_ARGS__); } } while (0) +#define EPRINTF(...) do { fprintf(stderr,__VA_ARGS__); } while (0) + + +static int read_offlined(); +static int write_offlined(); +static int find_offlined(unsigned long long base_addr); +static int clear_offlined(); + static int offline_memory(unsigned long long mem_size_bytes, unsigned long long mem_min_start, int limit32, + int node, unsigned long long *num_bytes, unsigned long long *base_addr); +static int online_memory(unsigned long long num_bytes, + unsigned long long base_addr); + + +static int get_kernel_setup(); + + + int main(int argc, char * argv[]) { unsigned long long mem_size_bytes = 0; unsigned long long mem_min_start = 0; int v3_fd = -1; int request = 0; int limit32 = 0; + int help=0; + int alloffline=0; + enum {NONE, ADD, REMOVE} op; int node = -1; int c; unsigned long long num_bytes, base_addr; struct v3_mem_region mem; - if (argc<2 || argc>5) { - printf("usage: v3_mem [-r] [-l] [-n k] [min_start (MB)]\n\n" - "Allocate memory for use by Palacios.\n\n" - "With -r this requests in-kernel allocation.\n" - "Without -r this attempts to offline memory via hot\n" - " remove.\n" - "With -l the request or offlining is limited to first 4 GB\n" - "Without -l the request or offlining has no limits\n\n" - "With -n k the request is for numa node k\n" - "Without -n k the request can be on any numa node\n\n" - "For offlining, min_start is the minimum allowable starting address.\n" - "This is zero by default\n\n"); - return -1; - } - - while ((c=getopt(argc,argv,"rln:"))!=-1) { + while ((c=getopt(argc,argv,"hvarklm:n:"))!=-1) { switch (c) { + case 'h': + help=1; + break; + case 'v': + verbose=1; + break; + case 'a': + op=ADD; + break; case 'r': + op=REMOVE; + break; + case 'k': request=1; break; case 'l': limit32=1; break; + case 'm': + mem_min_start = atoll(optarg) * (1024*1024); + break; case 'n': node = atoi(optarg); break; case '?': if (optopt=='n') { - printf("-n requires the numa node...\n"); + EPRINTF("-n requires the numa node...\n"); + return -1; + } else if (optopt=='m') { + EPRINTF("-m requires the minimum starting address (in MB)...\n"); + return -1; } else { - printf("Unknown option %c\n",optopt); + EPRINTF("Unknown option %c\n",optopt); + return -1; } break; default: - printf("Unknown option %c\n",optopt); + EPRINTF("Unknown option %c\n",optopt); break; } } + + + if (op==NONE || optind==argc || help) { + EPRINTF("usage: v3_mem [-h] [-v] [ [-k] [-l] [-n k] [-m n] -a ] | [-r | offline]\n\n" + "Palacios Memory Management\n\nMemory Addition\n" + " -a Allocate memory for use by Palacios (MB).\n\n" + " With -k this requests in-kernel allocation\n" + " Without -k this attempts to offline memory via hot remove\n\n" + " With -l the request or offlining is limited to first 4 GB\n" + " Without -l the request or offlining has no limits\n\n" + " With -m n the search for offlineable memory starts at n MB\n" + " Without -m n the search for offlineable memory starts at 0 MB\n\n" + " With -n i the request is for numa node i\n" + " Without -n i the request can be satified on any numa node\n\n" + "Memory Removal\n" + " -r Free Palacios memory containing hexaddr, online it if needed\n" + " -r offline Free all offline Palacios memory and online it\n\n" + "Shared Options\n" + " -v Verbose\n" + " -h Help\n" + ); + return -1; + } + + + if (get_kernel_setup()) { + EPRINTF("Cannot read kernel setup\n"); + return -1; + } + + if (op==ADD) { + mem_size_bytes = (unsigned long long) (atof(argv[optind]) * (1024 * 1024)); + + if (mem_size_bytes < palacios_runtime_mem_block_size || + (PALACIOS_MIN_ALLOC!=0 && mem_size_bytes < PALACIOS_MIN_ALLOC)) { + EPRINTF("Trying to add a smaller single chunk of memory than Palacios needs\n" + "Your request: %llu bytes\n" + "Palacios run-time memory block size: %llu bytes\n" + "Palacios minimal contiguous alloc: %llu bytes\n", + mem_size_bytes, palacios_runtime_mem_block_size, + PALACIOS_MIN_ALLOC); + return -1; + } + + if (request && mem_size_bytes > kernel_max_page_alloc_bytes) { + EPRINTF("Trying to request a larger single chunk of memory than the kernel can allocate\n" + "Your request: %llu bytes\n" + "Kernel largest page allocation: %llu bytes\n" + "Kernel MAX_ORDER: %llu\n", + mem_size_bytes, kernel_max_page_alloc_bytes, kernel_max_order); + return -1; + } + + if (node>=0 && node>=kernel_num_nodes) { + EPRINTF("Trying to request or allocate memory for a nonexistent node\n" + "Your request: node %d\n" + "Kernel number of nodes: %llu\n", + node, kernel_num_nodes); + } + + + } else if (op==REMOVE) { + if (!strcasecmp(argv[optind],"offline")) { + alloffline=1; + } else { + base_addr=strtoll(argv[optind],NULL,16); + } + } + + if (!getenv("PALACIOS_DIR")) { + EPRINTF("Please set the PALACIOS_DIR variable\n"); + return -1; + } + + strcpy(offname,getenv("PALACIOS_DIR")); + strcat(offname,"/.v3offlinedmem"); - mem_size_bytes = atoll(argv[optind]) * (1024 * 1024); + if (!(off=fopen(offname,"a+"))) { + EPRINTF("Cannot open or create offline memory file %s",offname); + return -1; + } - if ((optind+1) < argc) { - mem_min_start = atoll(argv[optind+1]) * (1024 * 1024); + // removing all offlined memory we added is a special case + if (op==REMOVE && alloffline) { + int i; + int rc=0; + + // we just need to reinvoke ourselves + read_offlined(); + for (i=0;i=0) { + VPRINTF("Onlining the memory to make it available to the kernel\n"); + online_memory(start_offline[entry],len_offline[entry]); + + len_offline[entry] = 0; + + write_offlined(); + + printf("Memory at 0x%llx has been onlined\n",mem.base_addr); + + } else { + VPRINTF("Memory was deallocated in the kernel\n"); + printf("Memory at 0x%llx has been onlined\n",mem.base_addr); + } + + clear_offlined(); close(v3_fd); - return -1; - } else { - printf("Request accepted by Palacios\n"); - close(v3_fd); + fclose(off); + return 0; } + } @@ -135,14 +350,35 @@ static int dir_filter(const struct dirent * dir) { static int dir_cmp(const struct dirent **dir1, const struct dirent ** dir2) { int num1 = atoi((*dir1)->d_name + 6); int num2 = atoi((*dir2)->d_name + 6); - + return num1 - num2; } + +#define UNWIND(first,last) \ +do { \ + int i; \ + for (i = first; i <= last; i++) { \ + FILE *f; \ + char name[256]; \ + snprintf(name,256,"%smemory%d/state",SYS_PATH,i); \ + f=fopen(name,"r+"); \ + if (!f) { \ + perror("Cannot open state file\n"); \ + return -1; \ + } \ + VPRINTF("Re-onlining block %d (%s)\n",i,name); \ + fprintf(f,"online\n"); \ + fclose(f); \ + } \ +} while (0) + + static int offline_memory(unsigned long long mem_size_bytes, unsigned long long mem_min_start, int limit32, + int node, unsigned long long *num_bytes, unsigned long long *base_addr) { @@ -154,8 +390,9 @@ static int offline_memory(unsigned long long mem_size_bytes, int reg_start = 0; int mem_ready = 0; + - printf("Trying to find %dMB (%d bytes) of memory above %llu with limit32=%d\n", mem_size_bytes/(1024*1024), mem_size_bytes, mem_min_start, limit32); + VPRINTF("Trying to find %lluMB (%llu bytes) of memory above %llu with limit32=%d\n", mem_size_bytes/(1024*1024), mem_size_bytes, mem_min_start, limit32); /* Figure out the block size */ { @@ -178,7 +415,7 @@ static int offline_memory(unsigned long long mem_size_bytes, block_size_bytes = strtoll(tmp_buf, NULL, 16); - printf("Memory block size is %dMB (%d bytes)\n", block_size_bytes / (1024 * 1024), block_size_bytes); + VPRINTF("Memory block size is %dMB (%d bytes)\n", block_size_bytes / (1024 * 1024), block_size_bytes); } @@ -189,7 +426,7 @@ static int offline_memory(unsigned long long mem_size_bytes, mem_min_start = block_size_bytes * ((mem_min_start / block_size_bytes) + (!!(mem_min_start % block_size_bytes))); - printf("Looking for %d blocks of memory starting at %p (block %llu) with limit32=%d\n", num_blocks, (void*)mem_min_start, mem_min_start/block_size_bytes,limit32); + VPRINTF("Looking for %d blocks of memory starting at %p (block %llu) with limit32=%d for node %d\n", num_blocks, (void*)mem_min_start, mem_min_start/block_size_bytes,limit32,node); // We now need to find consecutive offlinable memory blocks @@ -210,9 +447,10 @@ static int offline_memory(unsigned long long mem_size_bytes, size = bitmap_entries / 8; if (bitmap_entries % 8) size++; - bitmap = malloc(size); + bitmap = alloca(size); + if (!bitmap) { - printf("ERROR: could not allocate space for bitmap\n"); + VPRINTF("ERROR: could not allocate space for bitmap\n"); return -1; } @@ -223,11 +461,16 @@ static int offline_memory(unsigned long long mem_size_bytes, int block_fd = 0; char status_str[BUF_SIZE]; char fname[BUF_SIZE]; + char nname[BUF_SIZE]; + struct stat s; memset(status_str, 0, BUF_SIZE); + memset(fname, 0, BUF_SIZE); - snprintf(fname, BUF_SIZE, "%s%s/removable", SYS_PATH, tmp_dir->d_name); + + memset(nname, 0, BUF_SIZE); + snprintf(nname, BUF_SIZE, "%s%s/node%d", SYS_PATH, tmp_dir->d_name,node); j = atoi(tmp_dir->d_name + 6); int major = j / 8; @@ -235,45 +478,84 @@ static int offline_memory(unsigned long long mem_size_bytes, if (ilimit_block) { - printf("Skipping %s due to 32 bit constraint\n",fname); + VPRINTF("Skipping %s due to 32 bit constraint\n",fname); continue; } + // The prospective block must be (a) removable, and (b) currently online + // and for the needed node - printf("Checking %s...", fname); + VPRINTF("Checking %s...", fname); + if (node>=0) { + if (stat(nname,&s)) { + VPRINTF("Skipping %s due to it being in the wrong node\n", fname); + continue; + } + } + + block_fd = open(fname, O_RDONLY); if (block_fd == -1) { - printf("Hotpluggable memory not supported...\n"); + EPRINTF("Hotpluggable memory not supported or could not determine if block is removable...\n"); return -1; } if (read(block_fd, status_str, BUF_SIZE) <= 0) { - perror("Could not read block status"); + perror("Could not read block removability information\n"); return -1; } + + status_str[BUF_SIZE-1]=0; close(block_fd); if (atoi(status_str) == 1) { - printf("Removable\n"); + VPRINTF("Removable "); + } else { + VPRINTF("Not removable\n"); + continue; + } + + snprintf(fname, BUF_SIZE, "%s%s/state", SYS_PATH, tmp_dir->d_name); + + block_fd = open(fname, O_RDONLY); + + if (block_fd<0) { + perror("Could not open block state\n"); + return -1; + } + + if (read(block_fd, status_str, BUF_SIZE) <=0) { + perror("Could not read block state information\n"); + return -1; + } + + status_str[BUF_SIZE-1]=0; + + close(block_fd); + + if (!strncasecmp(status_str,"offline",7)) { + VPRINTF("and Already Offline (unusable)\n"); + } else if (!strncasecmp(status_str,"online",6)) { + VPRINTF("and Online (usable)\n"); bitmap[major] |= (0x1 << minor); } else { - printf("Not removable\n"); + VPRINTF("and in Unknown State '%s' (unusable)\n",status_str); } + } } while (!mem_ready) { - - + /* Scan bitmap for enough consecutive space */ { // num_blocks: The number of blocks we need to find @@ -303,7 +585,8 @@ static int offline_memory(unsigned long long mem_size_bytes, if (run_len < num_blocks) { - fprintf(stderr, "Could not find enough consecutive memory blocks... (found %d)\n", run_len); + EPRINTF("Could not find enough consecutive memory blocks... (found %d)\n", run_len); + // no offlining yet, so no need to unwind here return -1; } } @@ -325,14 +608,16 @@ static int offline_memory(unsigned long long mem_size_bytes, if (block_file == NULL) { perror("Could not open block file"); + UNWIND(reg_start, i+reg_start-1); return -1; } - printf("Offlining block %d (%s)\n", i + reg_start, fname); + VPRINTF("Offlining block %d (%s)\n", i + reg_start, fname); fprintf(block_file, "offline\n"); fclose(block_file); + } } @@ -362,67 +647,251 @@ static int offline_memory(unsigned long long mem_size_bytes, block_fd = open(fname, O_RDONLY); if (block_fd == -1) { - perror("Could not open block file"); + perror("Could not open block state file"); return -1; } if (read(block_fd, status_buf, BUF_SIZE) <= 0) { - perror("Could not read block status"); + perror("Could not read block state"); return -1; } + + status_buf[BUF_SIZE-1]=0; - printf("Checking offlined block %d (%s)...", i + reg_start, fname); + VPRINTF("Checking offlined block %d (%s)...", i + reg_start, fname); int ret = strncmp(status_buf, "offline", strlen("offline")); - if (ret != 0) { + if (ret != 0) { // uh oh int j = 0; int major = (i + reg_start) / 8; int minor = (i + reg_start) % 8; + char * pos; bitmap[major] &= ~(0x1 << minor); // mark the block as not removable in bitmap mem_ready = 0; // Keep searching - printf("ERROR (%d)\n", ret); - - for (j = 0; j < i; j++) { - FILE * block_file = NULL; - char fname[256]; - - memset(fname, 0, 256); - - snprintf(fname, 256, "%smemory%d/state", SYS_PATH, j + reg_start); - - block_file = fopen(fname, "r+"); - - if (block_file == NULL) { - perror("Could not open block file"); - return -1; - } - - fprintf(block_file, "online\n"); - - fclose(block_file); - } + // remove trailing newline + if ((pos=strchr(status_buf, '\n')) != NULL) { + *pos = '\0'; + } + + EPRINTF("ERROR - block status is '%s'\n", status_buf); + + // Unwind space + UNWIND(reg_start,reg_start+num_blocks-1); break; } + } + + VPRINTF("Offlined Memory OK\n"); - printf("OK\n"); + } + } + + /* Memory is offlined. Calculate size and phys start addr to send to Palacios */ + *num_bytes = (unsigned long long)(num_blocks) * (unsigned long long)(block_size_bytes); + *base_addr = (unsigned long long)(reg_start) * (unsigned long long)(block_size_bytes); + + return 0; +} + + +static int online_memory(unsigned long long base_addr, + unsigned long long num_bytes) +{ + + unsigned int block_size_bytes = 0; + int bitmap_entries = 0; + unsigned char * bitmap = NULL; + unsigned int num_blocks = 0; + int reg_start = 0; + int mem_ready = 0; + + + + VPRINTF("Trying to online memory from %llu to %llu\n",base_addr,base_addr+num_bytes-1); + + /* Figure out the block size */ + { + int tmp_fd = 0; + char tmp_buf[BUF_SIZE]; + + tmp_fd = open(SYS_PATH "block_size_bytes", O_RDONLY); + + if (tmp_fd == -1) { + perror("Could not open block size file: " SYS_PATH "block_size_bytes"); + return -1; + } + + if (read(tmp_fd, tmp_buf, BUF_SIZE) <= 0) { + perror("Could not read block size file: " SYS_PATH "block_size_bytes"); + return -1; + } + + close(tmp_fd); + + block_size_bytes = strtoll(tmp_buf, NULL, 16); + + VPRINTF("Memory block size is %dMB (%d bytes)\n", block_size_bytes / (1024 * 1024), block_size_bytes); + + } + + num_blocks = num_bytes / block_size_bytes; + if (num_bytes % block_size_bytes) num_blocks++; + + reg_start = base_addr / block_size_bytes; + + VPRINTF("That is %u blocks of size %u starting at block %d\n", num_blocks, block_size_bytes, reg_start); + + + + /* Online memory blocks starting at reg_start */ + { + int i = 0; + + for (i = 0; i < num_blocks; i++) { + FILE * block_file = NULL; + char fname[256]; + memset(fname, 0, 256); + + snprintf(fname, 256, "%smemory%d/state", SYS_PATH, i + reg_start); + + block_file = fopen(fname, "r+"); + + if (block_file == NULL) { + perror("Could not open block file"); + return -1; } + + VPRINTF("Onlining block %d (%s)\n", i + reg_start, fname); + fprintf(block_file, "online\n"); + + fclose(block_file); } } - free(bitmap); + return 0; - /* Memory is offlined. Calculate size and phys start addr to send to Palacios */ - *num_bytes = (unsigned long long)(num_blocks) * (unsigned long long)(block_size_bytes); - *base_addr = (unsigned long long)(reg_start) * (unsigned long long)(block_size_bytes); +} + + +static int read_offlined() +{ + rewind(off); + unsigned long long base, len; + int i; + + num_offline=0; + while (fscanf(off,"%llx\t%llx\n",&base,&len)==2) { num_offline++; } + + + start_offline=(unsigned long long *)calloc(num_offline, sizeof(unsigned long long)); + len_offline=(unsigned long long *)calloc(num_offline, sizeof(unsigned long long)); + + if (!start_offline || !len_offline) { + EPRINTF("Cannot allocate space to load offline map\n"); + return -1; + } + + rewind(off); + for (i=0;i=start_offline[i] && + base_addr<(start_offline[i]+len_offline[i])) { + return i; + } + } + + return -1; + +} + + + +static int get_kernel_setup() +{ + FILE *f; + + f = fopen("/proc/v3vee/v3-info", "r"); + + if (!f) { + EPRINTF("Cannot open /proc/v3vee/v3-info\n"); + return -1; + } + + if (fscanf(f,"kernel MAX_ORDER:\t%llu\n",&kernel_max_order)!=1) { + EPRINTF("Cannot read kernel MAX_ORDER\n"); + return -1; + } + + kernel_max_page_alloc_bytes = 4096ULL * (0x1ULL << kernel_max_order); + + if (fscanf(f,"number of nodes:\t%llu\n",&kernel_num_nodes)!=1) { + EPRINTF("Cannot read kernel number of numa nodes\n"); + return -1; + } + + if (fscanf(f,"number of cpus:\t%llu\n",&kernel_num_cpus)!=1) { + EPRINTF("Cannot read kernel number of cpus\n"); + return -1; + } + + if (fscanf(f,"palacios compiled mem_block_size:\t%llu\n",&palacios_compiled_mem_block_size)!=1) { + EPRINTF("Cannot read palacios compiled mem_block_size\n"); + return -1; + } + + if (fscanf(f,"palacios run-time mem_block_size:\t%llu\n",&palacios_runtime_mem_block_size)!=1) { + EPRINTF("Cannot read palacios run-time mem_block_size\n"); + return -1; + } + + return 0; +} + +