Constraints in page allocation, and code changes to use them; shadow paging allocati...

[palacios.git] / linux_module / mm.c
diff --git a/linux_module/mm.c b/linux_module/mm.c

index 3edbd23..5e98f79 100644 (file)
--- a/linux_module/mm.c
+++ b/linux_module/mm.c
@@ -10,247 +10,289 @@
 //static struct list_head pools;
 
 #include "palacios.h"
+#include "mm.h"
+#include "buddy.h"
+#include "numa.h"
+#include "palacios/vmm.h"
 
-#define OFFLINE_POOL_THRESHOLD 12
 
-struct mempool {
-    uintptr_t base_addr;
-    u64 num_pages;
+static struct buddy_memzone ** memzones = NULL;
+static uintptr_t * seed_addrs = NULL;
 
-    u8 * bitmap;
-};
 
+// Allocate num_pages pages from the per-node buddy zones; alignment is in bytes (NOTE(review): alignment is not referenced in the body below)
+uintptr_t alloc_palacios_pgs(u64 num_pages, u32 alignment, int node_id, int constraints) { // returns the buddy_alloc() result, or 0 for an unknown constraint or an invalid node
+    uintptr_t addr = 0; 
+    int any = node_id==-1; // a node_id of -1 means any node is acceptable
+    int buddy_constraints=0;
 
-static struct mempool pool;
+    if (constraints && constraints!=V3_ALLOC_PAGES_CONSTRAINT_4GB) { // only "no constraint" or exactly the 4GB constraint is accepted
+       ERROR("Unknown constraint mask 0x%x\n",constraints);
+       return 0;
+    }
+    
+    if (constraints & V3_ALLOC_PAGES_CONSTRAINT_4GB) { // translate the Palacios flag into the buddy allocator's flag
+       buddy_constraints |= LWK_BUDDY_CONSTRAINT_4GB;
+    }
 
-static inline int get_page_bit(int index) {
-    int major = index / 8;
-    int minor = index % 8;
 
-    return (pool.bitmap[major] & (0x1 << minor));
-}
+    if (node_id == -1) {
+       int cpu_id = get_cpu();
+       put_cpu();
+       
+       node_id = numa_cpu_to_node(cpu_id); // try first preferentially for the calling pcore
 
-static inline void set_page_bit(int index) {
-    int major = index / 8;
-    int minor = index % 8;
+    } else if (numa_num_nodes() == 1) {
+       node_id = 0;
+    } else if (node_id >= numa_num_nodes()) { // NOTE(review): with more than one node, a negative node_id other than -1 passes these checks
+       ERROR("Requesting memory from an invalid NUMA node. (Node: %d) (%d nodes on system)\n", 
+             node_id, numa_num_nodes());
+       return 0;
+    }
 
-    pool.bitmap[major] |= (0x1 << minor);
-}
+    addr = buddy_alloc(memzones[node_id], get_order(num_pages * PAGE_SIZE) + PAGE_SHIFT, buddy_constraints); // first attempt: the requested (or preferred) node
 
-static inline void clear_page_bit(int index) {
-    int major = index / 8;
-    int minor = index % 8;
+    if (!addr && any) { // fall back to the other nodes only when the caller asked for "any"
+       int i;
+       // do a scan to see if we can satisfy request on any node
+       for (i=0; i< numa_num_nodes(); i++) { 
+           if (i!=node_id) { // the preferred node was already tried above
+               addr = buddy_alloc(memzones[i], get_order(num_pages * PAGE_SIZE) + PAGE_SHIFT, buddy_constraints);
+               if (addr) {
+                   break;
+               }
+           }
+       }
+    }
+               
 
-    pool.bitmap[major] &= ~(0x1 << minor);
+    //DEBUG("Returning from alloc addr=%p, vaddr=%p\n", (void *)addr, __va(addr));
+    return addr;
 }
 
 
-uintptr_t get_palacios_base_addr(void) {
-    return pool.base_addr;
-}
 
-u64 get_palacios_num_pages(void) {
-    return pool.num_pages;
+void free_palacios_pgs(uintptr_t pg_addr, u64 num_pages) { // hands num_pages pages at pg_addr back to the buddy zone of the node that numa_addr_to_node() reports
+    int node_id = numa_addr_to_node(pg_addr); // NOTE(review): add_palacios_memory treats -1 from this call as an error; here the result indexes memzones unchecked
+
+    //DEBUG("Freeing Memory page %p\n", (void *)pg_addr);
+    buddy_free(memzones[node_id], pg_addr, get_order(num_pages * PAGE_SIZE) + PAGE_SHIFT); // same order expression that alloc_palacios_pgs passes to buddy_alloc
+    
+    return;
 }
 
 
-static uintptr_t alloc_contig_pgs(u64 num_pages, u32 alignment) {
-    int step = 1;
-    int i = 0;
-    int start = 0;
+unsigned long long pow2(int i) // returns 2 raised to the power i, computed by repeated doubling
+{
+    unsigned long long x=1;
+    for (;i!=0;i--) { x*=2; } // assumes i >= 0: the loop counts i down to zero, doubling x once per step
+    return x;
+}
 
-    DEBUG("Allocating %llu pages (align=%lu)\n", 
-          num_pages, (unsigned long)alignment);
+int add_palacios_memory(struct v3_mem_region *r) { // adds region r as a pool to the buddy zone of the node holding its base address; returns 0 on success, -1 on failure
+    int pool_order = 0;
+    int node_id = 0;
 
-    if (pool.bitmap == NULL) {
-       ERROR("ERROR: Attempting to allocate from non initialized memory\n");
-       return 0;
-    }
+    struct v3_mem_region *keep; // heap copy of *r that is handed to buddy_add_pool() as its last argument
 
-    if (alignment > 0) {
-       step = alignment / PAGE_SIZE;
-    }
+    INFO("Palacios Memory Add Request: type=%d, node=%d, base_addr=0x%llx, num_pages=%llu\n",r->type,r->node,r->base_addr,r->num_pages);
 
-    // Start the search at the correct alignment 
-    if (pool.base_addr % alignment) {
-       start = ((alignment - (pool.base_addr % alignment)) >> 12);
+    // fixup request regardless of its type (note that r->num_pages may be modified in place below)
+    if (r->num_pages*4096 < V3_CONFIG_MEM_BLOCK_SIZE) { // NOTE(review): hard-coded 4096 here, while the rest of the function uses PAGE_SIZE
+       WARNING("Allocating a memory pool smaller than the Palacios block size - may not be useful\n");
     }
 
-    DEBUG("\t Start idx %d (base_addr=%p)\n", start, (void *)(u64)pool.base_addr);
+    if (pow2(get_order(r->num_pages*PAGE_SIZE)) != r->num_pages) { // true when num_pages is not a power of two
+       WARNING("Allocating a memory pool that is not a power of two (is %llu) - it will be rounded down!\n", r->num_pages); // NOTE(review): Linux get_order() rounds up, so this presumably increases num_pages rather than rounding down - confirm
+       r->num_pages=pow2(get_order(r->num_pages*PAGE_SIZE));
+       WARNING("Rounded request is for %llu pages\n", r->num_pages);
+    }
 
-    for (i = start; i < (pool.num_pages - num_pages); i += step) {
-       if (get_page_bit(i) == 0) {
-           int j = 0;
-           int collision = 0;
 
-           for (j = i; (j - i) < num_pages; j++) {
-               if (get_page_bit(j) == 1) {
-                   collision = 1;
-                   break;
-               }
-           }
+    if (!(keep=palacios_alloc(sizeof(struct v3_mem_region)))) { 
+       ERROR("Error allocating space for tracking region\n");
+       return -1;
+    }
 
-           if (collision == 1) {
-               break;
-           }
 
-           for (j = i; (j - i) < num_pages; j++) {
-               set_page_bit(j);
-           }
+    if (r->type==REQUESTED || r->type==REQUESTED32) { // for these two types the pages are allocated here and r->base_addr is overwritten
+       struct page *pgs;
 
-           return pool.base_addr + (i * PAGE_SIZE);
+       INFO("Attempting to allocate %llu pages of %s memory\n", r->num_pages,
+            r->type==REQUESTED ? "64 bit (unrestricted)" : 
+            r->type==REQUESTED32 ? "32 bit (restricted)" : "unknown (assuming 64 bit unrestricted)"); // the final "unknown" arm cannot be reached inside this if
+            
+       pgs = alloc_pages_node(r->node, 
+                              r->type==REQUESTED ? GFP_KERNEL :
+                              r->type==REQUESTED32 ? GFP_DMA32 : GFP_KERNEL, // the trailing GFP_KERNEL arm is likewise unreachable here
+                              get_order(r->num_pages*PAGE_SIZE));
+       if (!pgs) { 
+           ERROR("Unable to satisfy allocation request\n");
+           palacios_free(keep);
+           return -1;
        }
+       r->base_addr = page_to_pfn(pgs) << PAGE_SHIFT; // record the address of the freshly allocated block in the request
     }
+       
 
-    ERROR("ALERT ALERT Allocation of Large Number of Contiguous Pages FAILED\n"); 
+    *keep = *r; // snapshot taken after the num_pages/base_addr fixups above
 
-    return 0;
-}
+    node_id = numa_addr_to_node(r->base_addr);
 
+    if (node_id == -1) {
+       ERROR("Error locating node for addr %p\n", (void *)(r->base_addr));
+       return -1; // NOTE(review): unlike the other failure paths, this one returns without freeing keep or any pages allocated above
+    }
 
-// alignment is in bytes
-uintptr_t alloc_palacios_pgs(u64 num_pages, u32 alignment, int node) {
-    uintptr_t addr = 0; 
+    if ((node_id != r->node) && (r->node!=-1)) { // informational only: the pool is still added to the node the address maps to
+       INFO("Memory add request is for node %d, but memory is in node %d\n",r->node,node_id);
+    }
 
-    if (num_pages < OFFLINE_POOL_THRESHOLD) {
-       struct page * pgs = NULL;
-       void *temp;
-       int order = get_order(num_pages * PAGE_SIZE);
-        
-       pgs = alloc_pages(GFP_DMA32, order);
-    
-       if (!pgs) { 
-           ERROR("Could not allocate small number of contigious pages - retrying with internal allocation\n");
-           goto trybig;
+    pool_order = get_order(r->num_pages * PAGE_SIZE) + PAGE_SHIFT; // same order expression that alloc_palacios_pgs passes to buddy_alloc
+
+    if (buddy_add_pool(memzones[node_id], r->base_addr, pool_order, keep)) {
+       ERROR("ALERT ALERT ALERT Unable to add pool to buddy allocator...\n");
+       if (r->type==REQUESTED || r->type==REQUESTED32) { 
+           free_pages((uintptr_t)__va(r->base_addr), get_order(r->num_pages)); // NOTE(review): the allocation above used get_order(r->num_pages*PAGE_SIZE); here get_order() receives a raw page count - confirm the orders match
        }
- 
-       /* DEBUG("%llu pages (order=%d) aquired from alloc_pages\n", 
-              num_pages, order); */
+       palacios_free(keep);
+       return -1;
+    }
 
-       addr = page_to_pfn(pgs) << PAGE_SHIFT; 
+    return 0;
+}
 
-       temp = (void*)addr;
 
-       if ( (temp>=(void*)(pool.base_addr) && 
-             (temp<((void*)(pool.base_addr)+pool.num_pages*PAGE_SIZE))) 
-            || ((temp+num_pages*PAGE_SIZE)>=(void*)(pool.base_addr) && 
-                ((temp+num_pages*PAGE_SIZE)<((void*)(pool.base_addr)+pool.num_pages*PAGE_SIZE))) ) {
 
-           ERROR("ALERT ALERT Allocation of small number of contiguous pages returned block that "
-                 "OVERLAPS with the offline page pool addr=%p, addr+numpages=%p, "
-                 "pool.base_addr=%p, pool.base_addr+pool.numpages=%p\n", 
-                 temp, temp+num_pages*PAGE_SIZE, (void*)(pool.base_addr), 
-                 (void*)(pool.base_addr)+pool.num_pages*PAGE_SIZE);
-       }
+int palacios_remove_memory(uintptr_t base_addr) { // removes the pool at base_addr from its node's buddy zone; returns 0 on success, -1 if buddy_remove_pool() fails
+    int node_id = numa_addr_to_node(base_addr); // NOTE(review): result is used as an index into memzones without a -1 check
+    struct v3_mem_region *r; // filled in by buddy_remove_pool(); presumably the tracking region passed to buddy_add_pool() - confirm
 
-       
-    } else {
-    trybig:
-       //DEBUG("Allocating %llu pages from bitmap allocator\n", num_pages);
-       //addr = pool.base_addr;
-       addr = alloc_contig_pgs(num_pages, alignment);
-       if (!addr) { 
-           ERROR("Could not allocate large number of contiguous pages\n");
-       }
+    if (buddy_remove_pool(memzones[node_id], base_addr, 0, (void**)(&r))) { //unforced remove
+       ERROR("Cannot remove memory at base address 0x%p because it is in use\n", (void*)base_addr);
+       return -1;
     }
 
+    if (r->type==REQUESTED || r->type==REQUESTED32) { // these are the region types whose pages add_palacios_memory allocates itself
+       free_pages((uintptr_t)__va(r->base_addr), get_order(r->num_pages)); // NOTE(review): add_palacios_memory allocates with get_order(num_pages*PAGE_SIZE); here get_order() receives a raw page count - confirm the orders match
+    } else {
+       // user space responsible for onlining
+    }
+    
+    palacios_free(r);
 
-    //DEBUG("Returning from alloc addr=%p, vaddr=%p\n", (void *)addr, __va(addr));
-    return addr;
+    return 0;
 }
 
 
 
-void free_palacios_pgs(uintptr_t pg_addr, int num_pages) {
-    //DEBUG("Freeing Memory page %p\n", (void *)pg_addr);
-
-    if ((pg_addr >= pool.base_addr) && 
-       (pg_addr < pool.base_addr + (PAGE_SIZE * pool.num_pages))) {
-       int pg_idx = (pg_addr - pool.base_addr) / PAGE_SIZE;
-       int i = 0;
-
-
-       if (num_pages<OFFLINE_POOL_THRESHOLD) { 
-           ERROR("ALERT ALERT  small page deallocation from offline pool\n");
-           return;
-        }      
+int palacios_deinit_mm( void ) { // tears down each per-node buddy zone, frees the seed blocks and both tracking arrays; always returns 0
 
-       if ((pg_idx + num_pages) > pool.num_pages) {
-           ERROR("Freeing memory bounds exceeded for offline pool\n");
-           return;
-       }
+    int i = 0;
 
-       for (i = 0; i < num_pages; i++) {
-           if (get_page_bit(pg_idx + i) == 0) { 
-               ERROR("Trying to free unallocated page from offline pool\n");
+    if (memzones) { // nothing to do when the zone array was never allocated
+       for (i = 0; i < numa_num_nodes(); i++) {
+           
+           if (memzones[i]) {
+               buddy_deinit(memzones[i]);
+           }
+           
+           // note that the memory is not onlined here - offlining and onlining
+           // is the responsibility of the caller
+           
+           if (seed_addrs[i]) { // NOTE(review): seed_addrs is only guarded by the memzones check; palacios_init_mm calls this function on the path where seed_addrs is still NULL
+               // free the seed regions
+               free_pages((uintptr_t)__va(seed_addrs[i]), MAX_ORDER - 1); // same order that palacios_init_mm uses for the seed allocation
            }
-           clear_page_bit(pg_idx + i);
        }
        
-    } else {
-       if (num_pages>=OFFLINE_POOL_THRESHOLD) {
-          ERROR("ALERT ALERT Large page deallocation from linux pool\n");
-       }
-       __free_pages(pfn_to_page(pg_addr >> PAGE_SHIFT), get_order(num_pages * PAGE_SIZE));
+       palacios_free(memzones); // NOTE(review): memzones and seed_addrs are not reset to NULL after being freed
+       palacios_free(seed_addrs);
     }
+
+    return 0;
 }
 
+int palacios_init_mm( void ) { // allocates the per-node zone and seed-address arrays, then creates one buddy zone per NUMA node seeded with an order MAX_ORDER-1 block; returns 0 on success, -1 on failure
+    int num_nodes = numa_num_nodes();
+    int node_id = 0;
 
-int add_palacios_memory(uintptr_t base_addr, u64 num_pages) {
-    /* JRL: OK.... so this is horrible, terrible and if anyone else did it I would yell at them.
-     * But... the fact that you can do this in C is so ridiculous that I can't help myself.
-     * Note that we're repurposing "true" to be 1 here
-     */
+    INFO("memory manager init: MAX_ORDER=%d (%llu bytes)\n",MAX_ORDER, PAGE_SIZE*pow2(MAX_ORDER));
 
-    int bitmap_size = (num_pages / 8) + ((num_pages % 8) > 0); 
+    memzones = palacios_alloc_extended(sizeof(struct buddy_memzone *) * num_nodes, GFP_KERNEL,-1);
 
-    if (pool.num_pages != 0) {
-       ERROR("ERROR: Memory has already been added\n");
+    if (!memzones) { 
+       ERROR("Cannot allocate space for memory zones\n");
+       palacios_deinit_mm(); // memzones is NULL here, so palacios_deinit_mm skips all of its cleanup
        return -1;
     }
 
-    DEBUG("Managing %dMB of memory starting at %llu (%lluMB)\n", 
-          (unsigned int)(num_pages * PAGE_SIZE) / (1024 * 1024), 
-          (unsigned long long)base_addr, 
-          (unsigned long long)(base_addr / (1024 * 1024)));
+    memset(memzones, 0, sizeof(struct buddy_memzone *) * num_nodes); // zeroed so cleanup can tell which zones were created
 
+    seed_addrs = palacios_alloc_extended(sizeof(uintptr_t) * num_nodes, GFP_KERNEL,-1);
 
-    pool.bitmap = palacios_alloc(bitmap_size);
-    
-    if (IS_ERR(pool.bitmap)) {
-       ERROR("Error allocating Palacios MM bitmap\n");
+    if (!seed_addrs) { 
+       ERROR("Cannot allocate space for seed addrs\n");
+       palacios_deinit_mm(); // NOTE(review): memzones is non-NULL and seed_addrs is NULL here, yet palacios_deinit_mm indexes seed_addrs[i]
        return -1;
     }
-    
-    memset(pool.bitmap, 0, bitmap_size);
 
-    pool.base_addr = base_addr;
-    pool.num_pages = num_pages;
+    memset(seed_addrs, 0, sizeof(uintptr_t) * num_nodes); // a zero entry means "no seed block" to palacios_deinit_mm
 
-    return 0;
-}
+    for (node_id = 0; node_id < num_nodes; node_id++) {
+       struct buddy_memzone * zone = NULL;
 
+       // Seed the allocator with a small set of pages to allow initialization to complete. 
+       // For now we will just grab some random pages, but in the future we will need to grab NUMA specific regions
+       // See: alloc_pages_node()
 
+       {
+           struct page * pgs;
 
-int palacios_init_mm( void ) {
+           // attempt to first allocate below 4 GB for compatibility with
+           // 32 bit shadow paging
+           pgs = alloc_pages_node(node_id, GFP_DMA32, MAX_ORDER - 1);
 
-    pool.base_addr = 0;
-    pool.num_pages = 0;
-    pool.bitmap = NULL;
+           if (!pgs) {
+               INFO("Could not allocate initial memory block for node %d beloew 4GB\n", node_id);
+               
+               pgs = alloc_pages_node(node_id, GFP_KERNEL, MAX_ORDER - 1); // second attempt without the GFP_DMA32 restriction
 
-    return 0;
-}
+               if (!pgs) {
+                   INFO("Could not allocate initial memory block for node %d beloew 4GB\n", node_id); // NOTE(review): repeats the "below 4GB" message although this follows the GFP_KERNEL attempt
+                   if (!pgs) { // NOTE(review): redundant - already inside an identical !pgs check
+                       ERROR("Could not allocate initial memory block for node %d without restrictions\n", node_id);
+                       BUG_ON(!pgs); // NOTE(review): pgs is known to be NULL here, so this presumably never returns and the cleanup below is dead code - confirm
+                       palacios_deinit_mm();
+                       return -1;
+                   }
+               }
+           }
 
-int palacios_deinit_mm( void ) {
+           seed_addrs[node_id] = page_to_pfn(pgs) << PAGE_SHIFT; // remembered so palacios_deinit_mm can free the seed block
+       }
+
+       zone = buddy_init(get_order(V3_CONFIG_MEM_BLOCK_SIZE) + PAGE_SHIFT, PAGE_SHIFT, node_id);
 
-    palacios_free(pool.bitmap);
+       if (zone == NULL) {
+           ERROR("Could not initialization memory management for node %d\n", node_id);
+           palacios_deinit_mm();
+           return -1;
+       }
 
-    pool.bitmap=0;
-    pool.base_addr=0;
-    pool.num_pages=0;
+       printk("Zone initialized, Adding seed region (order=%d)\n", 
+              (MAX_ORDER - 1) + PAGE_SHIFT);
+
+       if (buddy_add_pool(zone, seed_addrs[node_id], (MAX_ORDER - 1) + PAGE_SHIFT,0)) { 
+           ERROR("Could not add pool to buddy allocator\n");
+           palacios_deinit_mm(); // NOTE(review): zone has not been stored in memzones[node_id] yet, so palacios_deinit_mm will not call buddy_deinit on it
+           return -1;
+       }
+
+       memzones[node_id] = zone; // published only after the seed pool was added successfully
+    }
 
-    // note that the memory is not onlined here - offlining and onlining
-    // is the resposibility of the caller
-    
     return 0;
+
 }
+