10 #include <sys/ioctl.h>
12 #include <sys/types.h>
20 // Set to zero to ignore this minimum, or set it
21 // to a level that is likely to be achievable given the largest contiguous
22 // page allocation outside of the base regions;
23 // note that the seed pools provide 2-4 MB chunks
25 #define PALACIOS_MIN_ALLOC (64*4096ULL)  // 64 * 4096 bytes (256 KiB); requests smaller than this are rejected unless it is 0
27 #define SYS_PATH "/sys/devices/system/memory/"  // sysfs directory holding block_size_bytes and the per-block memoryN/ entries
37 unsigned long long *start_offline;  // start addresses of the regions listed in the offline-memory file (allocated and filled by read_offlined)
38 unsigned long long *len_offline;  // matching region lengths in bytes; an entry with length 0 is skipped by write_offlined
40 unsigned long long kernel_max_order;  // "kernel MAX_ORDER" value parsed from /proc/v3vee/v3-info
41 unsigned long long kernel_max_page_alloc_bytes;  // 4096 * 2^kernel_max_order, computed in get_kernel_setup
42 unsigned long long kernel_num_nodes;  // "number of nodes" value parsed from /proc/v3vee/v3-info
43 unsigned long long kernel_num_cpus;  // "number of cpus" value parsed from /proc/v3vee/v3-info
44 unsigned long long palacios_compiled_mem_block_size;  // "palacios compiled mem_block_size" value parsed from /proc/v3vee/v3-info
45 unsigned long long palacios_runtime_mem_block_size;  // "palacios run-time mem_block_size" value; smallest request size main accepts
48 #define VPRINTF(...) do { if (verbose) { printf(__VA_ARGS__); } } while (0)  // printf to stdout, but only when `verbose` is nonzero
49 #define EPRINTF(...) do { fprintf(stderr,__VA_ARGS__); } while (0)  // unconditional fprintf to stderr
52 static int read_offlined(void);  // load the entries of the offline-memory file into start_offline[] / len_offline[]
53 static int write_offlined(void);  // rewrite the offline-memory file from the arrays, omitting zero-length entries
54 static int find_offlined(unsigned long long base_addr);  // look up the entry whose [start, start+len) range contains base_addr; callers use the result as an index and treat it as "not found" otherwise (TODO confirm sentinel)
55 static int clear_offlined(void);  // NOTE(review): body not visible here; presumably releases the loaded offline map
58 static int offline_memory(unsigned long long mem_size_bytes,
59 unsigned long long mem_min_start,
62 unsigned long long *num_bytes,
63 unsigned long long *base_addr);
65 static int online_memory(unsigned long long base_addr,  // start address of the region to bring back online
66 unsigned long long num_bytes);  // region length in bytes; names now match the definition and its (start, length) call site
69 static int get_kernel_setup(void);  // parse /proc/v3vee/v3-info into the kernel_* and palacios_* globals; main treats a nonzero return as failure
73 int main(int argc, char * argv[]) {
74 unsigned long long mem_size_bytes = 0;
75 unsigned long long mem_min_start = 0;
81 enum {NONE, ADD, REMOVE} op;
84 unsigned long long num_bytes, base_addr;
85 struct v3_mem_region mem;
87 while ((c=getopt(argc,argv,"hvarklm:n:"))!=-1) {
108 mem_min_start = atoll(optarg) * (1024*1024);
115 EPRINTF("-n requires the numa node...\n");
117 } else if (optopt=='m') {
118 EPRINTF("-m requires the minimum starting address (in MB)...\n");
121 EPRINTF("Unknown option %c\n",optopt);
126 EPRINTF("Unknown option %c\n",optopt);
132 if (op==NONE || optind==argc || help) {
133 EPRINTF("usage: v3_mem [-h] [-v] [ [-k] [-l] [-n k] [-m n] -a <memory size (MB)>] | [-r <hexaddr> | offline]\n\n"
134 "Palacios Memory Management\n\nMemory Addition\n"
135 " -a <mem> Allocate memory for use by Palacios (MB).\n\n"
136 " With -k this requests in-kernel allocation\n"
137 " Without -k this attempts to offline memory via hot remove\n\n"
138 " With -l the request or offlining is limited to first 4 GB\n"
139 " Without -l the request or offlining has no limits\n\n"
140 " With -m n the search for offlineable memory starts at n MB\n"
141 " Without -m n the search for offlineable memory starts at 0 MB\n\n"
142 " With -n i the request is for numa node i\n"
143 " Without -n i the request can be satified on any numa node\n\n"
145 " -r <hexaddr> Free Palacios memory containing hexaddr, online it if needed\n"
146 " -r offline Free all offline Palacios memory and online it\n\n"
156 if (get_kernel_setup()) {
157 EPRINTF("Cannot read kernel setup\n");
162 mem_size_bytes = (unsigned long long) (atof(argv[optind]) * (1024 * 1024));
164 if (mem_size_bytes < palacios_runtime_mem_block_size ||
165 (PALACIOS_MIN_ALLOC!=0 && mem_size_bytes < PALACIOS_MIN_ALLOC)) {
166 EPRINTF("Trying to add a smaller single chunk of memory than Palacios needs\n"
167 "Your request: %llu bytes\n"
168 "Palacios run-time memory block size: %llu bytes\n"
169 "Palacios minimal contiguous alloc: %llu bytes\n",
170 mem_size_bytes, palacios_runtime_mem_block_size,
175 if (request && mem_size_bytes > kernel_max_page_alloc_bytes) {
176 EPRINTF("Trying to request a larger single chunk of memory than the kernel can allocate\n"
177 "Your request: %llu bytes\n"
178 "Kernel largest page allocation: %llu bytes\n"
179 "Kernel MAX_ORDER: %llu\n",
180 mem_size_bytes, kernel_max_page_alloc_bytes, kernel_max_order);
184 if (node>=0 && node>=kernel_num_nodes) {
185 EPRINTF("Trying to request or allocate memory for a nonexistent node\n"
186 "Your request: node %d\n"
187 "Kernel number of nodes: %llu\n",
188 node, kernel_num_nodes);
192 } else if (op==REMOVE) {
193 if (!strcasecmp(argv[optind],"offline")) {
196 base_addr=strtoll(argv[optind],NULL,16);
200 if (!getenv("PALACIOS_DIR")) {
201 EPRINTF("Please set the PALACIOS_DIR variable\n");
205 strcpy(offname,getenv("PALACIOS_DIR"));
206 strcat(offname,"/.v3offlinedmem");
208 if (!(off=fopen(offname,"a+"))) {
209 EPRINTF("Cannot open or create offline memory file %s",offname);
213 // removing all offlined memory we added is a special case
214 if (op==REMOVE && alloffline) {
218 // we just need to reinvoke ourselves
220 for (i=0;i<num_offline;i++) {
222 sprintf(cmd,"v3_mem -r %llx", start_offline[i]);
230 v3_fd = open(v3_dev, O_RDONLY);
233 EPRINTF("Error opening V3Vee control device\n");
242 VPRINTF("Trying to offline memory (size=%llu, min_start=%llu, limit32=%d)\n",mem_size_bytes,mem_min_start,limit32);
243 if (offline_memory(mem_size_bytes,mem_min_start,limit32,node,&num_bytes, &base_addr)) {
244 EPRINTF("Could not offline memory\n");
250 fprintf(off,"%llx\t%llx\n",base_addr, num_bytes);
252 printf("Memory of size %llu at 0x%llx has been offlined\n",num_bytes,base_addr);
254 mem.type=PREALLOCATED;
256 mem.base_addr=base_addr;
257 mem.num_pages=num_bytes/4096;
261 VPRINTF("Generating memory allocation request (size=%llu, limit32=%d)\n", mem_size_bytes, limit32);
262 mem.type = limit32 ? REQUESTED32 : REQUESTED;
265 mem.num_pages = mem_size_bytes / 4096;
268 VPRINTF("Allocation request is: type=%d, node=%d, base_addr=0x%llx, num_pages=%llu\n",
269 mem.type, mem.node, mem.base_addr, mem.num_pages);
271 if (ioctl(v3_fd, V3_ADD_MEMORY, &mem)<0) {
272 EPRINTF("Request rejected by Palacios\n");
273 printf("Allocation of memory by Palacios has failed. Check dmesg output for more information.\n");
278 VPRINTF("Request accepted by Palacios\n");
279 printf("%llu bytes of memory has been allocated by Palacios\n",mem.num_pages*4096);
285 } else if (op==REMOVE) {
290 entry=find_offlined(base_addr);
293 // no need to offline
296 mem.type=PREALLOCATED;
299 mem.base_addr=base_addr;
301 // now remove it from palacios
302 VPRINTF("Deallocation request is: type=%d, base_addr=0x%llx\n",
303 mem.type, mem.base_addr);
305 if (ioctl(v3_fd, V3_REMOVE_MEMORY, &mem)<0) {
306 EPRINTF("Request rejected by Palacios\n");
312 VPRINTF("Request accepted by Palacios\n");
314 printf("Memory at 0x%llx has been deallocated by Palacios\n", mem.base_addr);
317 VPRINTF("Onlining the memory to make it available to the kernel\n");
318 online_memory(start_offline[entry],len_offline[entry]);
320 len_offline[entry] = 0;
324 printf("Memory at 0x%llx has been onlined\n",mem.base_addr);
327 VPRINTF("Memory was deallocated in the kernel\n");
328 printf("Memory at 0x%llx has been onlined\n",mem.base_addr);
341 static int dir_filter(const struct dirent * dir) {
342 if (strncmp("memory", dir->d_name, 6) == 0) {
350 static int dir_cmp(const struct dirent **dir1, const struct dirent ** dir2) {
351 int num1 = atoi((*dir1)->d_name + 6);
352 int num2 = atoi((*dir2)->d_name + 6);
359 #define UNWIND(first,last) \
362 for (i = first; i <= last; i++) { \
365 snprintf(name,256,"%smemory%d/state",SYS_PATH,i); \
366 f=fopen(name,"r+"); \
368 perror("Cannot open state file\n"); \
371 VPRINTF("Re-onlining block %d (%s)\n",i,name); \
372 fprintf(f,"online\n"); \
378 static int offline_memory(unsigned long long mem_size_bytes,
379 unsigned long long mem_min_start,
382 unsigned long long *num_bytes,
383 unsigned long long *base_addr)
386 unsigned int block_size_bytes = 0;
387 int bitmap_entries = 0;
388 unsigned char * bitmap = NULL;
395 VPRINTF("Trying to find %lluMB (%llu bytes) of memory above %llu with limit32=%d\n", mem_size_bytes/(1024*1024), mem_size_bytes, mem_min_start, limit32);
397 /* Figure out the block size */
400 char tmp_buf[BUF_SIZE];
402 tmp_fd = open(SYS_PATH "block_size_bytes", O_RDONLY);
405 perror("Could not open block size file: " SYS_PATH "block_size_bytes");
409 if (read(tmp_fd, tmp_buf, BUF_SIZE) <= 0) {
410 perror("Could not read block size file: " SYS_PATH "block_size_bytes");
416 block_size_bytes = strtoll(tmp_buf, NULL, 16);
418 VPRINTF("Memory block size is %dMB (%d bytes)\n", block_size_bytes / (1024 * 1024), block_size_bytes);
423 num_blocks = mem_size_bytes / block_size_bytes;
424 if (mem_size_bytes % block_size_bytes) num_blocks++;
426 mem_min_start = block_size_bytes *
427 ((mem_min_start / block_size_bytes) + (!!(mem_min_start % block_size_bytes)));
429 VPRINTF("Looking for %d blocks of memory starting at %p (block %llu) with limit32=%d for node %d\n", num_blocks, (void*)mem_min_start, mem_min_start/block_size_bytes,limit32,node);
432 // We now need to find <num_blocks> consecutive offlinable memory blocks
434 /* Scan the memory directories */
436 struct dirent ** namelist = NULL;
441 int first_block = mem_min_start/block_size_bytes;
442 int limit_block = 0xffffffff / block_size_bytes; // for 32 bit limiting
444 last_block = scandir(SYS_PATH, &namelist, dir_filter, dir_cmp);
445 bitmap_entries = atoi(namelist[last_block - 1]->d_name + 6) + 1;
447 size = bitmap_entries / 8;
448 if (bitmap_entries % 8) size++;
450 bitmap = alloca(size);
453 VPRINTF("ERROR: could not allocate space for bitmap\n");
457 memset(bitmap, 0, size);
459 for (i = 0 ; j < bitmap_entries - 1; i++) {
460 struct dirent * tmp_dir = namelist[i];
462 char status_str[BUF_SIZE];
463 char fname[BUF_SIZE];
464 char nname[BUF_SIZE];
467 memset(status_str, 0, BUF_SIZE);
469 memset(fname, 0, BUF_SIZE);
470 snprintf(fname, BUF_SIZE, "%s%s/removable", SYS_PATH, tmp_dir->d_name);
472 memset(nname, 0, BUF_SIZE);
473 snprintf(nname, BUF_SIZE, "%s%s/node%d", SYS_PATH, tmp_dir->d_name,node);
475 j = atoi(tmp_dir->d_name + 6);
481 VPRINTF("Skipping %s due to minimum start constraint\n",fname);
485 if (limit32 && i>limit_block) {
486 VPRINTF("Skipping %s due to 32 bit constraint\n",fname);
490 // The prospective block must be (a) removable, and (b) currently online
491 // and for the needed node
493 VPRINTF("Checking %s...", fname);
496 if (stat(nname,&s)) {
497 VPRINTF("Skipping %s due to it being in the wrong node\n", fname);
503 block_fd = open(fname, O_RDONLY);
505 if (block_fd == -1) {
506 EPRINTF("Hotpluggable memory not supported or could not determine if block is removable...\n");
510 if (read(block_fd, status_str, BUF_SIZE) <= 0) {
511 perror("Could not read block removability information\n");
515 status_str[BUF_SIZE-1]=0;
519 if (atoi(status_str) == 1) {
520 VPRINTF("Removable ");
522 VPRINTF("Not removable\n");
526 snprintf(fname, BUF_SIZE, "%s%s/state", SYS_PATH, tmp_dir->d_name);
528 block_fd = open(fname, O_RDONLY);
531 perror("Could not open block state\n");
535 if (read(block_fd, status_str, BUF_SIZE) <=0) {
536 perror("Could not read block state information\n");
540 status_str[BUF_SIZE-1]=0;
544 if (!strncasecmp(status_str,"offline",7)) {
545 VPRINTF("and Already Offline (unusable)\n");
546 } else if (!strncasecmp(status_str,"online",6)) {
547 VPRINTF("and Online (usable)\n");
548 bitmap[major] |= (0x1 << minor);
550 VPRINTF("and in Unknown State '%s' (unusable)\n",status_str);
559 /* Scan bitmap for enough consecutive space */
561 // num_blocks: The number of blocks we need to find
562 // bitmap: bitmap of blocks (1 == allocatable)
563 // bitmap_entries: number of blocks in the system/number of bits in bitmap
564 // reg_start: The block index where our allocation will start
569 for (i = 0; i < bitmap_entries; i++) {
573 if (!(bitmap[i_major] & (0x1 << i_minor))) {
574 reg_start = i + 1; // skip the region start to next entry
581 if (run_len >= num_blocks) {
587 if (run_len < num_blocks) {
588 EPRINTF("Could not find enough consecutive memory blocks... (found %d)\n", run_len);
589 // no offlining yet, so no need to unwind here
595 /* Offline memory blocks starting at reg_start */
599 for (i = 0; i < num_blocks; i++) {
600 FILE * block_file = NULL;
603 memset(fname, 0, 256);
605 snprintf(fname, 256, "%smemory%d/state", SYS_PATH, i + reg_start);
607 block_file = fopen(fname, "r+");
609 if (block_file == NULL) {
610 perror("Could not open block file");
611 UNWIND(reg_start, i+reg_start-1);
616 VPRINTF("Offlining block %d (%s)\n", i + reg_start, fname);
617 fprintf(block_file, "offline\n");
625 /* We asked to offline a set of blocks, but Linux could have lied.
626 * To be safe, check whether the blocks were actually offlined and start again if not
632 mem_ready = 1; // Hopefully we are ok...
635 for (i = 0; i < num_blocks; i++) {
637 char fname[BUF_SIZE];
638 char status_buf[BUF_SIZE];
641 memset(fname, 0, BUF_SIZE);
642 memset(status_buf, 0, BUF_SIZE);
644 snprintf(fname, BUF_SIZE, "%smemory%d/state", SYS_PATH, i + reg_start);
647 block_fd = open(fname, O_RDONLY);
649 if (block_fd == -1) {
650 perror("Could not open block state file");
654 if (read(block_fd, status_buf, BUF_SIZE) <= 0) {
655 perror("Could not read block state");
659 status_buf[BUF_SIZE-1]=0;
661 VPRINTF("Checking offlined block %d (%s)...", i + reg_start, fname);
663 int ret = strncmp(status_buf, "offline", strlen("offline"));
665 if (ret != 0) { // uh oh
667 int major = (i + reg_start) / 8;
668 int minor = (i + reg_start) % 8;
671 bitmap[major] &= ~(0x1 << minor); // mark the block as not removable in bitmap
673 mem_ready = 0; // Keep searching
675 // remove trailing newline
676 if ((pos=strchr(status_buf, '\n')) != NULL) {
680 EPRINTF("ERROR - block status is '%s'\n", status_buf);
683 UNWIND(reg_start,reg_start+num_blocks-1);
689 VPRINTF("Offlined Memory OK\n");
694 /* Memory is offlined. Calculate size and phys start addr to send to Palacios */
695 *num_bytes = (unsigned long long)(num_blocks) * (unsigned long long)(block_size_bytes);
696 *base_addr = (unsigned long long)(reg_start) * (unsigned long long)(block_size_bytes);
702 static int online_memory(unsigned long long base_addr,
703 unsigned long long num_bytes)
706 unsigned int block_size_bytes = 0;
707 int bitmap_entries = 0;
708 unsigned char * bitmap = NULL;
709 unsigned int num_blocks = 0;
715 VPRINTF("Trying to online memory from %llu to %llu\n",base_addr,base_addr+num_bytes-1);
717 /* Figure out the block size */
720 char tmp_buf[BUF_SIZE];
722 tmp_fd = open(SYS_PATH "block_size_bytes", O_RDONLY);
725 perror("Could not open block size file: " SYS_PATH "block_size_bytes");
729 if (read(tmp_fd, tmp_buf, BUF_SIZE) <= 0) {
730 perror("Could not read block size file: " SYS_PATH "block_size_bytes");
736 block_size_bytes = strtoll(tmp_buf, NULL, 16);
738 VPRINTF("Memory block size is %dMB (%d bytes)\n", block_size_bytes / (1024 * 1024), block_size_bytes);
742 num_blocks = num_bytes / block_size_bytes;
743 if (num_bytes % block_size_bytes) num_blocks++;
745 reg_start = base_addr / block_size_bytes;
747 VPRINTF("That is %u blocks of size %u starting at block %d\n", num_blocks, block_size_bytes, reg_start);
751 /* Online memory blocks starting at reg_start */
755 for (i = 0; i < num_blocks; i++) {
756 FILE * block_file = NULL;
759 memset(fname, 0, 256);
761 snprintf(fname, 256, "%smemory%d/state", SYS_PATH, i + reg_start);
763 block_file = fopen(fname, "r+");
765 if (block_file == NULL) {
766 perror("Could not open block file");
771 VPRINTF("Onlining block %d (%s)\n", i + reg_start, fname);
772 fprintf(block_file, "online\n");
785 static int read_offlined()
788 unsigned long long base, len;
792 while (fscanf(off,"%llx\t%llx\n",&base,&len)==2) { num_offline++; }
795 start_offline=(unsigned long long *)calloc(num_offline, sizeof(unsigned long long));
796 len_offline=(unsigned long long *)calloc(num_offline, sizeof(unsigned long long));
798 if (!start_offline || !len_offline) {
799 EPRINTF("Cannot allocate space to load offline map\n");
804 for (i=0;i<num_offline;i++) {
805 fscanf(off,"%llx\t%llx",&(start_offline[i]),&(len_offline[i]));
807 // we are now back to the end, and can keep appending
812 static int write_offlined()
817 if (!(off=fopen(offname,"w+"))) { // truncate
818 EPRINTF("Cannot open %s for writing!\n",offname);
822 for (i=0;i<num_offline;i++) {
823 if (len_offline[i]) {
824 fprintf(off,"%llx\t%llx\n",start_offline[i],len_offline[i]);
827 // we are now back to the end, and can keep appending
832 static int clear_offlined()
839 static int find_offlined(unsigned long long base_addr)
843 for (i=0;i<num_offline;i++) {
844 if (base_addr>=start_offline[i] &&
845 base_addr<(start_offline[i]+len_offline[i])) {
856 static int get_kernel_setup()
860 f = fopen("/proc/v3vee/v3-info", "r");
863 EPRINTF("Cannot open /proc/v3vee/v3-info\n");
867 if (fscanf(f,"kernel MAX_ORDER:\t%llu\n",&kernel_max_order)!=1) {
868 EPRINTF("Cannot read kernel MAX_ORDER\n");
872 kernel_max_page_alloc_bytes = 4096ULL * (0x1ULL << kernel_max_order);
874 if (fscanf(f,"number of nodes:\t%llu\n",&kernel_num_nodes)!=1) {
875 EPRINTF("Cannot read kernel number of numa nodes\n");
879 if (fscanf(f,"number of cpus:\t%llu\n",&kernel_num_cpus)!=1) {
880 EPRINTF("Cannot read kernel number of cpus\n");
884 if (fscanf(f,"palacios compiled mem_block_size:\t%llu\n",&palacios_compiled_mem_block_size)!=1) {
885 EPRINTF("Cannot read palacios compiled mem_block_size\n");
889 if (fscanf(f,"palacios run-time mem_block_size:\t%llu\n",&palacios_runtime_mem_block_size)!=1) {
890 EPRINTF("Cannot read palacios run-time mem_block_size\n");