From: Jack Lange Date: Tue, 19 Oct 2010 17:04:40 +0000 (-0500) Subject: large page changes X-Git-Url: http://v3vee.org/palacios/gitweb/gitweb.cgi?a=commitdiff_plain;h=e8fd1090974b1f82edd92d155ccdad6ad24b074b;p=palacios.git large page changes --- diff --git a/palacios/include/palacios/vm_guest.h b/palacios/include/palacios/vm_guest.h index 322f390..a5bb46d 100644 --- a/palacios/include/palacios/vm_guest.h +++ b/palacios/include/palacios/vm_guest.h @@ -71,7 +71,8 @@ struct guest_info { uint32_t flags; struct { uint8_t use_large_pages : 1; /* Enable virtual page tables to use large pages */ - uint32_t rsvd : 31; + uint8_t use_giant_pages : 1; /* Enable virtual page tables to use giant (1GB) pages */ + uint32_t rsvd : 30; } __attribute__((packed)); } __attribute__((packed)); diff --git a/palacios/include/palacios/vmm_mem.h b/palacios/include/palacios/vmm_mem.h index 32d6ae6..58b5dd3 100644 --- a/palacios/include/palacios/vmm_mem.h +++ b/palacios/include/palacios/vmm_mem.h @@ -103,14 +103,13 @@ int v3_add_shadow_mem(struct v3_vm_info * vm, uint16_t core_id, struct v3_mem_region * v3_get_mem_region(struct v3_vm_info * vm, uint16_t core_id, addr_t guest_addr); -struct v3_mem_region * v3_get_next_mem_region(struct v3_vm_info * vm, uint16_t core_id, addr_t guest_addr); +uint32_t v3_get_max_page_size(struct guest_info * core, addr_t fault_addr, v3_cpu_mode_t mode); + void v3_print_mem_map(struct v3_vm_info * vm); -uint32_t v3_get_max_page_size(struct guest_info * core, addr_t fault_addr, uint32_t req_size); -uint32_t v3_compute_page_alignment(addr_t addr); #endif // ! __V3VEE__ diff --git a/palacios/src/palacios/mmu/vmm_shdw_pg_tlb_32.h b/palacios/src/palacios/mmu/vmm_shdw_pg_tlb_32.h index 83bfb10..5acac32 100644 --- a/palacios/src/palacios/mmu/vmm_shdw_pg_tlb_32.h +++ b/palacios/src/palacios/mmu/vmm_shdw_pg_tlb_32.h @@ -133,25 +133,21 @@ static inline int handle_shadow_pagefault_32(struct guest_info * info, addr_t fa if (shadow_pde_access == PT_ACCESS_NOT_PRESENT) { - if (info->use_large_pages && guest_pde->large_page) { + if ((info->use_large_pages == 1) && (guest_pde->large_page == 1)) { // Check underlying physical memory map to see if a large page is viable - addr_t guest_pa = BASE_TO_PAGE_ADDR_4MB(((pde32_4MB_t *)guest_pde)->page_base_addr); - addr_t host_pa; - if (v3_get_max_page_size(info, guest_pa, PAGE_SIZE_4MB) < PAGE_SIZE_4MB) { - PrintDebug("Underlying physical memory map doesn't allow use of a large page.\n"); - // Fallthrough to small pages - } else if ((v3_gpa_to_hpa(info, guest_pa, &host_pa) != 0) - || (v3_compute_page_alignment(host_pa) < PAGE_SIZE_4MB)) { - PrintDebug("Host memory alignment doesn't allow use of a large page.\n"); - // Fallthrough to small pages - } else if (handle_4MB_shadow_pagefault_pde_32(info, fault_addr, error_code, shadow_pde_access, - (pde32_4MB_t *)shadow_pde, (pde32_4MB_t *)guest_pde) == 0) { + addr_t guest_pa = BASE_TO_PAGE_ADDR_4MB(((pde32_4MB_t *)guest_pde)->page_base_addr); + uint32_t page_size = v3_get_max_page_size(info, guest_pa, PROTECTED); + + if (page_size == PAGE_SIZE_4MB) { + PrintError("using large page for fault_addr %p (gpa=%p)\n", (void *)fault_addr, (void *)guest_pa); + if (handle_4MB_shadow_pagefault_pde_32(info, fault_addr, error_code, shadow_pde_access, + (pde32_4MB_t *)shadow_pde, (pde32_4MB_t *)guest_pde) == -1) { + PrintError("Error handling large pagefault with large page\n"); + return -1; + } + return 0; - } else { - PrintError("Error handling large pagefault with large page\n"); - return -1; - } - // Fallthrough to handle the 
region with small pages + } } struct shadow_page_data * shdw_page = create_new_shadow_pt(info); @@ -176,7 +172,6 @@ static inline int handle_shadow_pagefault_32(struct guest_info * info, addr_t fa } } - // VMM Specific options shadow_pde->write_through = guest_pde->write_through; shadow_pde->cache_disable = guest_pde->cache_disable; @@ -185,14 +180,12 @@ static inline int handle_shadow_pagefault_32(struct guest_info * info, addr_t fa guest_pde->accessed = 1; - shadow_pde->pt_base_addr = PAGE_BASE_ADDR(shdw_page->page_pa); } else { shadow_pt = (pte32_t *)V3_VAddr((void *)BASE_TO_PAGE_ADDR(shadow_pde->pt_base_addr)); } - - + if (guest_pde->large_page == 0) { if (v3_gpa_to_hva(info, BASE_TO_PAGE_ADDR(guest_pde->pt_base_addr), (addr_t*)&guest_pt) == -1) { // Machine check the guest @@ -486,7 +479,8 @@ static int handle_4MB_shadow_pagefault_pde_32(struct guest_info * info, return -1; } - PrintDebug("\tMapping shadow page (%p)\n", (void *)BASE_TO_PAGE_ADDR(shadow_pte->page_base_addr)); + PrintError("shadow PA = %p\n", (void *)shadow_pa); + large_guest_pde->vmm_info = V3_LARGE_PG; /* For invalidations */ large_shadow_pde->page_base_addr = PAGE_BASE_ADDR_4MB(shadow_pa); @@ -494,6 +488,8 @@ static int handle_4MB_shadow_pagefault_pde_32(struct guest_info * info, large_shadow_pde->present = 1; large_shadow_pde->user_page = 1; + PrintDebug("\tMapping shadow page (%p)\n", (void *)BASE_TO_PAGE_ADDR_4MB(large_shadow_pde->page_base_addr)); + if (shdw_reg->flags.write == 0) { large_shadow_pde->writable = 0; } else { diff --git a/palacios/src/palacios/mmu/vmm_shdw_pg_tlb_64.h b/palacios/src/palacios/mmu/vmm_shdw_pg_tlb_64.h index 011a882..38aebe6 100644 --- a/palacios/src/palacios/mmu/vmm_shdw_pg_tlb_64.h +++ b/palacios/src/palacios/mmu/vmm_shdw_pg_tlb_64.h @@ -338,23 +338,19 @@ static int handle_pde_shadow_pagefault_64(struct guest_info * info, addr_t fault if (shadow_pde_access == PT_ACCESS_NOT_PRESENT) { // Check if we can use large pages and the guest memory is properly aligned // to potentially use a large page - if (info->use_large_pages && guest_pde->large_page) { - // Check underlying physical memory map to see if a large page is viable + + if ((info->use_large_pages == 1) && (guest_pde->large_page == 1)) { addr_t guest_pa = BASE_TO_PAGE_ADDR_2MB(((pde64_2MB_t *)guest_pde)->page_base_addr); - addr_t host_pa; - if (v3_get_max_page_size(info, guest_pa, PAGE_SIZE_2MB) < PAGE_SIZE_2MB) { - PrintDebug("Underlying physical memory map doesn't allow use of a large page.\n"); - // Fallthrough to small pages - } else if ((v3_gpa_to_hpa(info, guest_pa, &host_pa) != 0) - || (v3_compute_page_alignment(host_pa) < PAGE_SIZE_2MB)) { - PrintDebug("Host memory alignment doesn't allow use of a large page.\n"); - // Fallthrough to small pages - } else if (handle_2MB_shadow_pagefault_pde_64(info, fault_addr, error_code, shadow_pde_access, - (pde64_2MB_t *)shadow_pde, (pde64_2MB_t *)guest_pde) == 0) { - return 0; - } else { - PrintError("Error handling large pagefault with large page\n"); - return -1; + uint32_t page_size = v3_get_max_page_size(info, guest_pa, LONG); + + if (page_size == PAGE_SIZE_2MB) { + if (handle_2MB_shadow_pagefault_pde_64(info, fault_addr, error_code, shadow_pde_access, + (pde64_2MB_t *)shadow_pde, (pde64_2MB_t *)guest_pde) == -1) { + PrintError("Error handling large pagefault with large page\n"); + return -1; + } + + return 0; } // Fallthrough to handle the region with small pages } @@ -367,7 +363,6 @@ static int handle_pde_shadow_pagefault_64(struct guest_info * info, addr_t fault 
shadow_pde->present = 1; shadow_pde->user_page = guest_pde->user_page; - if (guest_pde->large_page == 0) { shadow_pde->writable = guest_pde->writable; } else { diff --git a/palacios/src/palacios/vmm_direct_paging_64.h b/palacios/src/palacios/vmm_direct_paging_64.h index 97324d4..baae5d5 100644 --- a/palacios/src/palacios/vmm_direct_paging_64.h +++ b/palacios/src/palacios/vmm_direct_paging_64.h @@ -54,8 +54,8 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr * 1. the guest is configured to use large pages and * 2. the memory regions can be referenced by a large page */ - if ((core->use_large_pages == 1) ) { - page_size = v3_get_max_page_size(core, fault_addr, PAGE_SIZE_2MB); + if ((core->use_large_pages == 1) || (core->use_giant_pages == 1)) { + page_size = v3_get_max_page_size(core, fault_addr, LONG); } PrintDebug("Using page size of %dKB\n", page_size / 1024); diff --git a/palacios/src/palacios/vmm_mem.c b/palacios/src/palacios/vmm_mem.c index 7994cc4..2a47822 100644 --- a/palacios/src/palacios/vmm_mem.c +++ b/palacios/src/palacios/vmm_mem.c @@ -139,7 +139,6 @@ int v3_add_shadow_mem( struct v3_vm_info * vm, uint16_t core_id, entry->host_addr = host_addr; - entry->flags.read = 1; entry->flags.write = 1; entry->flags.exec = 1; @@ -157,7 +156,7 @@ int v3_add_shadow_mem( struct v3_vm_info * vm, uint16_t core_id, static inline struct v3_mem_region * __insert_mem_region(struct v3_vm_info * vm, - struct v3_mem_region * region) { + struct v3_mem_region * region) { struct rb_node ** p = &(vm->mem_map.mem_regions.rb_node); struct rb_node * parent = NULL; struct v3_mem_region * tmp_region; @@ -291,52 +290,119 @@ struct v3_mem_region * v3_get_mem_region(struct v3_vm_info * vm, uint16_t core_i -/* Given an address, find the successor region. If the address is within a region, return that - * region. Input is an address, because the address may not have a region associated with it. - * - * Returns a region following or touching the given address. If address is invalid, NULL is - * returned, else the base region is returned if no region exists at or after the given address. +/* This returns the next memory region based on a given address. + * If the address falls inside a sub region, that region is returned. + * If the address falls outside a sub region, the next sub region is returned + * NOTE that we have to be careful about core_ids here... */ -struct v3_mem_region * v3_get_next_mem_region( struct v3_vm_info * vm, uint16_t core_id, addr_t guest_addr) { - struct rb_node * current_n = vm->mem_map.mem_regions.rb_node; - struct rb_node * successor_n = NULL; /* left-most node greater than guest_addr */ - struct v3_mem_region * current_r = NULL; - - /* current_n tries to find the region containing guest_addr, going right when smaller and left when - * greater. Each time current_n becomes greater than guest_addr, update successor <- current_n. - * current_n becomes successively closer to guest_addr than the previous time it was greater - * than guest_addr. 
- */ - - /* | is address, ---- is region, + is intersection */ - while (current_n) { - current_r = rb_entry(current_n, struct v3_mem_region, tree_node); - if (current_r->guest_start > guest_addr) { /* | ---- */ - successor_n = current_n; - current_n = current_n->rb_left; +static struct v3_mem_region * get_next_mem_region( struct v3_vm_info * vm, uint16_t core_id, addr_t guest_addr) { + struct rb_node * n = vm->mem_map.mem_regions.rb_node; + struct v3_mem_region * reg = NULL; + struct v3_mem_region * parent = NULL; + + while (n) { + + reg = rb_entry(n, struct v3_mem_region, tree_node); + + if (guest_addr < reg->guest_start) { + n = n->rb_left; + } else if (guest_addr >= reg->guest_end) { + n = n->rb_right; } else { - if (current_r->guest_end > guest_addr) { - return current_r; /* +--- or --+- */ + if (reg->core_id == V3_MEM_CORE_ANY) { + // found relevant region, it's available on all cores + return reg; + } else if (core_id == reg->core_id) { + // found relevant region, it's available on the indicated core + return reg; + } else if (core_id < reg->core_id) { + // go left, core too big + n = n->rb_left; + } else if (core_id > reg->core_id) { + // go right, core too small + n = n->rb_right; + } else { + PrintError("v3_get_mem_region: Impossible!\n"); + return NULL; } - current_n = current_n->rb_right; /* ---- | */ + } + + if ((reg->core_id == core_id) || (reg->core_id == V3_MEM_CORE_ANY)) { + parent = reg; } } - /* Address does not have its own region. Check if it's a valid address in the base region */ - if (guest_addr >= vm->mem_map.base_region.guest_end) { - PrintError("%s: Guest Address Exceeds Base Memory Size (ga=%p), (limit=%p)\n", - __FUNCTION__, (void *)guest_addr, (void *)vm->mem_map.base_region.guest_end); - v3_print_mem_map(vm); - return NULL; + if (parent->guest_start > guest_addr) { + return parent; + } else if (parent->guest_end < guest_addr) { + struct rb_node * node = &(parent->tree_node); + + while ((node = v3_rb_next(node)) != NULL) { + struct v3_mem_region * next_reg = rb_entry(node, struct v3_mem_region, tree_node); + + if ((next_reg->core_id == V3_MEM_CORE_ANY) || + (next_reg->core_id == core_id)) { + + // This check is not strictly necessary, but it makes it clearer + if (next_reg->guest_start > guest_addr) { + return next_reg; + } + } + } } - return &(vm->mem_map.base_region); + return NULL; } +/* Given an address region of memory, find if there are any regions that overlap with it. + * This checks that the range lies in a single region, and returns that region if it does, + * this can be either the base region or a sub region. 
+ * IF there are multiple regions in the range then it returns NULL + */ +static struct v3_mem_region * get_overlapping_region(struct v3_vm_info * vm, uint16_t core_id, + addr_t start_gpa, addr_t end_gpa) { + struct v3_mem_region * start_region = v3_get_mem_region(vm, core_id, start_gpa); + + if (start_region == NULL) { + PrintError("Invalid memory region\n"); + return NULL; + } + + + if (start_region->guest_end < end_gpa) { + // Region ends before range + return NULL; + } else if (start_region->flags.base == 0) { + // sub region overlaps range + return start_region; + } else { + // Base region, now we have to scan forward for the next sub region + struct v3_mem_region * next_reg = get_next_mem_region(vm, core_id, start_gpa); + + if (next_reg == NULL) { + // no sub regions after start_addr, base region is ok + return start_region; + } else if (next_reg->guest_start >= end_gpa) { + // Next sub region begins outside range + return start_region; + } else { + return NULL; + } + } + + + // Should never get here + return NULL; +} + + + + + void v3_delete_mem_region(struct v3_vm_info * vm, struct v3_mem_region * reg) { int i = 0; @@ -387,110 +453,77 @@ void v3_delete_mem_region(struct v3_vm_info * vm, struct v3_mem_region * reg) { } // Determine if a given address can be handled by a large page of the requested size -uint32_t v3_get_max_page_size(struct guest_info * core, addr_t fault_addr, uint32_t req_size) { - addr_t pg_start = 0UL, pg_end = 0UL; // large page containing the faulting addres - struct v3_mem_region * pg_next_reg = NULL; // next immediate mem reg after page start addr +uint32_t v3_get_max_page_size(struct guest_info * core, addr_t page_addr, v3_cpu_mode_t mode) { + addr_t pg_start = 0; + addr_t pg_end = 0; uint32_t page_size = PAGE_SIZE_4KB; + struct v3_mem_region * reg = NULL; - /* If the guest has been configured for large pages, then we must check for hooked regions of - * memory which may overlap with the large page containing the faulting address (due to - * potentially differing access policies in place for e.g. i/o devices and APIC). A large page - * can be used if a) no region overlaps the page [or b) a region does overlap but fully contains - * the page]. The [bracketed] text pertains to the #if 0'd code below, state D. TODO modify this - * note if someone decides to enable this optimization. It can be tested with the SeaStar - * mapping. 
- * - * Examples: (CAPS regions are returned by v3_get_next_mem_region; state A returns the base reg) - * - * |region| |region| 2MiB mapped (state A) - * |reg| |REG| 2MiB mapped (state B) - * |region| |reg| |REG| |region| |reg| 4KiB mapped (state C) - * |reg| |reg| |--REGION---| [2MiB mapped (state D)] - * |--------------------------------------------| RAM - * ^ fault addr - * |----|----|----|----|----|page|----|----|----| 2MB pages - * >>>>>>>>>>>>>>>>>>>> search space - */ - - - // guest page maps to a host page + offset (so when we shift, it aligns with a host page) - switch (req_size) { - case PAGE_SIZE_4KB: - return PAGE_SIZE_4KB; - case PAGE_SIZE_2MB: - pg_start = PAGE_ADDR_2MB(fault_addr); - pg_end = (pg_start + PAGE_SIZE_2MB); - break; - case PAGE_SIZE_4MB: - pg_start = PAGE_ADDR_4MB(fault_addr); - pg_end = (pg_start + PAGE_SIZE_4MB); - break; - case PAGE_SIZE_1GB: - pg_start = PAGE_ADDR_1GB(fault_addr); - pg_end = (pg_start + PAGE_SIZE_1GB); - break; - default: - PrintError("Invalid large page size requested.\n"); - return -1; - } - - //PrintDebug("%s: page [%p,%p) contains address\n", __FUNCTION__, (void *)pg_start, (void *)pg_end); - pg_next_reg = v3_get_next_mem_region(core->vm_info, core->cpu_id, pg_start); + PrintError("Getting max page size for addr %p\n", (void *)page_addr); + + switch (mode) { + case PROTECTED: + if (core->use_large_pages == 1) { + pg_start = PAGE_ADDR_4MB(page_addr); + pg_end = (pg_start + PAGE_SIZE_4MB); - if (pg_next_reg == NULL) { - PrintError("%s: Error: address not in base region, %p\n", __FUNCTION__, (void *)fault_addr); - return PAGE_SIZE_4KB; - } + reg = get_overlapping_region(core->vm_info, core->cpu_id, pg_start, pg_end); - if (pg_next_reg->flags.base == 1) { - page_size = req_size; // State A - //PrintDebug("%s: base region [%p,%p) contains page.\n", __FUNCTION__, - // (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end); - } else { -#if 0 // State B/C and D optimization - if ((pg_next_reg->guest_end >= pg_end) && - ((pg_next_reg->guest_start >= pg_end) || (pg_next_reg->guest_start <= pg_start))) { - page_size = req_size; - } + if ((reg) && ((reg->host_addr % PAGE_SIZE_4MB) == 0)) { + page_size = PAGE_SIZE_4MB; + } + } + break; + case PROTECTED_PAE: + if (core->use_large_pages == 1) { + pg_start = PAGE_ADDR_2MB(page_addr); + pg_end = (pg_start + PAGE_SIZE_2MB); - PrintDebug("%s: region [%p,%p) %s partially overlap with page\n", __FUNCTION__, - (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end, - (page_size == req_size) ? "does not" : "does"); + reg = get_overlapping_region(core->vm_info, core->cpu_id, pg_start, pg_end); -#else // State B/C - if (pg_next_reg->guest_start >= pg_end) { - - page_size = req_size; - } + if ((reg) && ((reg->host_addr % PAGE_SIZE_2MB) == 0)) { + page_size = PAGE_SIZE_2MB; + } + } + break; + case LONG: + case LONG_32_COMPAT: + case LONG_16_COMPAT: + if (core->use_giant_pages == 1) { + pg_start = PAGE_ADDR_1GB(page_addr); + pg_end = (pg_start + PAGE_SIZE_1GB); + + reg = get_overlapping_region(core->vm_info, core->cpu_id, pg_start, pg_end); + + if ((reg) && ((reg->host_addr % PAGE_SIZE_1GB) == 0)) { + page_size = PAGE_SIZE_1GB; + break; + } + } - PrintDebug("%s: region [%p,%p) %s overlap with page\n", __FUNCTION__, - (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end, - (page_size == req_size) ? 
"does not" : "does"); + if (core->use_large_pages == 1) { + pg_start = PAGE_ADDR_2MB(page_addr); + pg_end = (pg_start + PAGE_SIZE_2MB); -#endif + reg = get_overlapping_region(core->vm_info, core->cpu_id, pg_start, pg_end); + + if ((reg) && ((reg->host_addr % PAGE_SIZE_2MB) == 0)) { + page_size = PAGE_SIZE_2MB; + } + } + break; + default: + PrintError("Invalid CPU mode: %s\n", v3_cpu_mode_to_str(v3_get_vm_cpu_mode(core))); + return -1; } + + PrintError("Returning PAGE size = %d\n", page_size); return page_size; } -// For an address on a page of size page_size, compute the actual alignment -// of the physical page it maps to -uint32_t v3_compute_page_alignment(addr_t page_addr) -{ - if (PAGE_OFFSET_1GB(page_addr) == 0) { - return PAGE_SIZE_1GB; - } else if (PAGE_OFFSET_4MB(page_addr) == 0) { - return PAGE_SIZE_4MB; - } else if (PAGE_OFFSET_2MB(page_addr) == 0) { - return PAGE_SIZE_2MB; - } else if (PAGE_OFFSET_4KB(page_addr) == 0) { - return PAGE_SIZE_4KB; - } else { - PrintError("Non-page aligned address passed to %s.\n", __FUNCTION__); - return 0; - } -} + void v3_print_mem_map(struct v3_vm_info * vm) { struct rb_node * node = v3_rb_first(&(vm->mem_map.mem_regions)); diff --git a/palacios/src/palacios/vmm_shadow_paging.c b/palacios/src/palacios/vmm_shadow_paging.c index 434b34e..65d19ee 100644 --- a/palacios/src/palacios/vmm_shadow_paging.c +++ b/palacios/src/palacios/vmm_shadow_paging.c @@ -47,6 +47,8 @@ #endif +static const char default_strategy[] = "VTLB"; + static struct hashtable * master_shdw_pg_table = NULL; @@ -146,10 +148,14 @@ int v3_init_shdw_impl(struct v3_vm_info * vm) { struct v3_shdw_pg_impl * impl = NULL; PrintDebug("Checking if shadow paging requested.\n"); - if (pg_mode && (strcasecmp(pg_mode, "nested") == 0)) { + if ((pg_mode != NULL) && (strcasecmp(pg_mode, "nested") == 0)) { PrintDebug("Nested paging specified - not initializing shadow paging.\n"); return 0; } + + if (pg_strat == NULL) { + pg_strat = (char *)default_strategy; + } V3_Print("Initialization of Shadow Paging implementation\n");