From: Patrick Bridges
Date: Wed, 11 Aug 2010 17:21:48 +0000 (-0600)
Subject: Changes to support large shadow pages *correctly*.
X-Git-Url: http://v3vee.org/palacios/gitweb/gitweb.cgi?a=commitdiff_plain;h=8684ef59c3ede8fe0c33b2f04dbe30a287e7b353;p=palacios.git

Changes to support large shadow pages *correctly*.
---

diff --git a/palacios/include/palacios/vmm_mem.h b/palacios/include/palacios/vmm_mem.h
index a8e776a..651e82e 100644
--- a/palacios/include/palacios/vmm_mem.h
+++ b/palacios/include/palacios/vmm_mem.h
@@ -109,7 +109,7 @@ struct v3_mem_region * v3_get_next_mem_region(struct v3_vm_info * vm, uint16_t c
 
 void v3_print_mem_map(struct v3_vm_info * vm);
 
-
+uint32_t v3_get_max_page_size(struct guest_info * core, addr_t fault_addr, uint32_t req_size);
 
diff --git a/palacios/src/palacios/mmu/vmm_shdw_pg_tlb_64.h b/palacios/src/palacios/mmu/vmm_shdw_pg_tlb_64.h
index db59d7d..aff8034 100644
--- a/palacios/src/palacios/mmu/vmm_shdw_pg_tlb_64.h
+++ b/palacios/src/palacios/mmu/vmm_shdw_pg_tlb_64.h
@@ -332,23 +332,32 @@ static int handle_pde_shadow_pagefault_64(struct guest_info * info, addr_t fault
 	return 0;
     }
 
-    // Handle as a shadow large page if possible
-    if (guest_pde->large_page
-	&& (info->vm_info->mem_align >= PAGE_SIZE_2MB)) {
-	if (handle_2MB_shadow_pagefault_pde_64(info, fault_addr, error_code, shadow_pde_access,
-					       (pde64_2MB_t *)shadow_pde, (pde64_2MB_t *)guest_pde) == -1) {
-	    PrintError("Error handling large pagefault with large page\n");
-	    return -1;
-	} else {
-	    return 0;
-	}
-    }
-
     pte64_t * shadow_pt = NULL;
     pte64_t * guest_pt = NULL;
 
     // get the next shadow page level, allocate if not present
     if (shadow_pde_access == PT_ACCESS_NOT_PRESENT) {
+	// Check if we can use large pages and the guest memory is properly aligned
+	// to potentially use a large page
+	if (info->use_large_pages && guest_pde->large_page
+	    && (info->vm_info->mem_align >= PAGE_SIZE_2MB)) {
+	    // Check underlying physical memory map to see if a large page is viable
+	    addr_t guest_pa = BASE_TO_PAGE_ADDR_2MB(((pde64_2MB_t *)guest_pde)->page_base_addr);
+	    uint32_t max_size = v3_get_max_page_size(info, guest_pa, PAGE_SIZE_2MB);
+
+	    if (max_size >= PAGE_SIZE_2MB) {
+		if (handle_2MB_shadow_pagefault_pde_64(info, fault_addr, error_code, shadow_pde_access,
+						       (pde64_2MB_t *)shadow_pde, (pde64_2MB_t *)guest_pde) == 0) {
+		    return 0;
+		} else {
+		    PrintError("Error handling large pagefault with large page\n");
+		    return -1;
+		}
+	    } else {
+		PrintDebug("Underlying physical memory map doesn't allow use of a large page.\n");
+	    }
+	    // Fallthrough to handle the region with small pages
+	}
+
 	struct shadow_page_data * shdw_page = create_new_shadow_pt(info);
 	shadow_pt = (pte64_t *)V3_VAddr((void *)shdw_page->page_pa);
 
diff --git a/palacios/src/palacios/vmm_config.c b/palacios/src/palacios/vmm_config.c
index 22ab611..7a596d2 100644
--- a/palacios/src/palacios/vmm_config.c
+++ b/palacios/src/palacios/vmm_config.c
@@ -301,6 +301,7 @@ static int determine_paging_mode(struct guest_info *info, v3_cfg_tree_t * core_c
 	info->shdw_pg_mode = SHADOW_PAGING;
     }
 
+
     if (info->shdw_pg_mode == NESTED_PAGING) {
 	PrintDebug("Guest Paging Mode: NESTED_PAGING\n");
     } else if (info->shdw_pg_mode == SHADOW_PAGING) {
@@ -309,6 +310,12 @@ static int determine_paging_mode(struct guest_info *info, v3_cfg_tree_t * core_c
 	PrintError("Guest paging mode incorrectly set.\n");
 	return -1;
     }
+
+    if (strcasecmp(v3_cfg_val(pg_tree, "large_pages"), "true") == 0) {
+	info->use_large_pages = 1;
+	PrintDebug("Use of large pages in memory virtualization enabled.\n");
+    }
+
     return 0;
 }
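Taken together, the shadow-paging hunks above gate the 2MB fast path on four conditions: the new `large_pages` configuration flag, a guest PDE that actually maps a large page, guest RAM aligned to at least a 2MB boundary, and a physical memory map in which no hooked region overlaps the 2MB frame. The following is a minimal standalone sketch of that decision, not Palacios code: `struct core_ctx`, `get_max_page_size_stub()`, and `can_use_large_shadow_page()` are simplified, hypothetical stand-ins for the real definitions.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE_2MB (2U * 1024 * 1024)

/* Hypothetical, pared-down stand-in for the fields the real handler reads. */
struct core_ctx {
    int      use_large_pages; /* set when the "large_pages" option is "true" */
    uint32_t mem_align;       /* guest RAM alignment (vm_info->mem_align)    */
};

/* Stand-in for v3_get_max_page_size(); the real one walks the memory map. */
static uint32_t get_max_page_size_stub(uintptr_t guest_pa, uint32_t req_size) {
    (void)guest_pa;
    return req_size; /* pretend no hooked region overlaps the frame */
}

/* Returns 1 when a fault may be satisfied with a 2MB shadow mapping,
 * 0 when the handler must fall through to 4KB pages. */
static int can_use_large_shadow_page(struct core_ctx * core,
                                     int guest_pde_is_large,
                                     uintptr_t guest_pa) {
    if (!core->use_large_pages || !guest_pde_is_large) {
        return 0; /* feature disabled, or the guest mapped small pages */
    }
    if (core->mem_align < PAGE_SIZE_2MB) {
        return 0; /* guest RAM not 2MB-aligned in host memory */
    }
    /* Last, ask the memory map whether the whole 2MB frame is safe. */
    return get_max_page_size_stub(guest_pa, PAGE_SIZE_2MB) >= PAGE_SIZE_2MB;
}

int main(void) {
    struct core_ctx core = { 1, PAGE_SIZE_2MB };
    printf("large page ok: %d\n",
           can_use_large_shadow_page(&core, 1, 0x40200000UL));
    return 0;
}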
diff --git a/palacios/src/palacios/vmm_direct_paging_64.h b/palacios/src/palacios/vmm_direct_paging_64.h
index d45ae17..97324d4 100644
--- a/palacios/src/palacios/vmm_direct_paging_64.h
+++ b/palacios/src/palacios/vmm_direct_paging_64.h
@@ -29,75 +29,6 @@
 
 // Reference: AMD Software Developer Manual Vol.2 Ch.5 "Page Translation and Protection"
 
-static uint32_t get_page_size(struct guest_info * core, addr_t fault_addr) {
-    addr_t pg_start = 0UL, pg_end = 0UL; // 2MiB page containing the faulting address
-    struct v3_mem_region * pg_next_reg = NULL; // next immediate mem reg after page start addr
-    uint32_t page_size = PAGE_SIZE_4KB;
-
-    /* If the guest has been configured for 2MiB pages, then we must check for hooked regions of
-     * memory which may overlap with the 2MiB page containing the faulting address (due to
-     * potentially differing access policies in place for e.g. i/o devices and APIC). A 2MiB page
-     * can be used if a) no region overlaps the page [or b) a region does overlap but fully contains
-     * the page]. The [bracketed] text pertains to the #if 0'd code below, state D. TODO modify this
-     * note if someone decides to enable this optimization. It can be tested with the SeaStar
-     * mapping.
-     *
-     * Examples: (CAPS regions are returned by v3_get_next_mem_region; state A returns the base reg)
-     *
-     *   |region|        |region|                       2MiB mapped (state A)
-     *                 |reg|          |REG|             2MiB mapped (state B)
-     * |region|  |reg|      |REG|  |region|  |reg|      4KiB mapped (state C)
-     *     |reg|  |reg|        |--REGION---|           [2MiB mapped (state D)]
-     * |--------------------------------------------|   RAM
-     *                            ^                     fault addr
-     * |----|----|----|----|----|page|----|----|----|   2MB pages
-     *                           >>>>>>>>>>>>>>>>>>>>   search space
-     */
-
-
-    // guest page maps to a host page + offset (so when we shift, it aligns with a host page)
-    pg_start = PAGE_ADDR_2MB(fault_addr);
-    pg_end = (pg_start + PAGE_SIZE_2MB);
-
-    PrintDebug("%s: page [%p,%p) contains address\n", __FUNCTION__, (void *)pg_start, (void *)pg_end);
-
-    pg_next_reg = v3_get_next_mem_region(core->vm_info, core->cpu_id, pg_start);
-
-    if (pg_next_reg == NULL) {
-	PrintError("%s: Error: address not in base region, %p\n", __FUNCTION__, (void *)fault_addr);
-	return PAGE_SIZE_4KB;
-    }
-
-    if (pg_next_reg->flags.base == 1) {
-	page_size = PAGE_SIZE_2MB; // State A
-    } else {
-#if 0 // State B/C and D optimization
-	if ((pg_next_reg->guest_end >= pg_end) &&
-	    ((pg_next_reg->guest_start >= pg_end) || (pg_next_reg->guest_start <= pg_start))) {
-	    page_size = PAGE_SIZE_2MB;
-	}
-
-	PrintDebug("%s: region [%p,%p) %s partially overlap with page\n", __FUNCTION__,
-		   (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end,
-		   (page_size == PAGE_SIZE_2MB) ? "does not" : "does");
-
-#else // State B/C
-	if (pg_next_reg->guest_start >= pg_end) {
-
-	    page_size = PAGE_SIZE_2MB;
-	}
-
-	PrintDebug("%s: region [%p,%p) %s overlap with page\n", __FUNCTION__,
-		   (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end,
-		   (page_size == PAGE_SIZE_2MB) ? "does not" : "does");
-
-#endif
-    }
-
-    return page_size;
-}
-
-
 static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr_t fault_addr, pf_error_t error_code) {
     pml4e64_t * pml = NULL;
     pdpe64_t * pdpe = NULL;
@@ -124,7 +55,7 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr
      * 2. the memory regions can be referenced by a large page */
     if ((core->use_large_pages == 1)) {
-	page_size = get_page_size(core, fault_addr);
+	page_size = v3_get_max_page_size(core, fault_addr, PAGE_SIZE_2MB);
     }
 
     PrintDebug("Using page size of %dKB\n", page_size / 1024);
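The 2MiB alignment arithmetic that both the removed `get_page_size()` and the new `v3_get_max_page_size()` rely on is easy to verify in isolation. The sketch below recomputes the page bounds for an arbitrary fault address; `page_addr_2mb()` is a local, assumed stand-in for the Palacios `PAGE_ADDR_2MB()` macro.

#include <stdint.h>
#include <stdio.h>

typedef uintptr_t addr_t;

#define PAGE_SIZE_2MB (2UL * 1024 * 1024)

/* Local stand-in for PAGE_ADDR_2MB(): mask off the low 21 bits to find
 * the start of the 2MiB page containing the address. */
static addr_t page_addr_2mb(addr_t addr) {
    return addr & ~(PAGE_SIZE_2MB - 1);
}

int main(void) {
    addr_t fault_addr = 0x40321000UL;            /* arbitrary example */
    addr_t pg_start = page_addr_2mb(fault_addr); /* 0x40200000 */
    addr_t pg_end   = pg_start + PAGE_SIZE_2MB;  /* 0x40400000 */

    /* Mirrors the handler's "page [start,end) contains address" debug line. */
    printf("page [%p,%p) contains %p\n",
           (void *)pg_start, (void *)pg_end, (void *)fault_addr);
    return 0;
}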
diff --git a/palacios/src/palacios/vmm_mem.c b/palacios/src/palacios/vmm_mem.c
index c45f18d..78ee376 100644
--- a/palacios/src/palacios/vmm_mem.c
+++ b/palacios/src/palacios/vmm_mem.c
@@ -385,6 +385,93 @@ void v3_delete_mem_region(struct v3_vm_info * vm, struct v3_mem_region * reg) {
 }
 
 
+// Determine if a given address can be handled by a large page of the requested size
+uint32_t v3_get_max_page_size(struct guest_info * core, addr_t fault_addr, uint32_t req_size) {
+    addr_t pg_start = 0UL, pg_end = 0UL; // large page containing the faulting address
+    struct v3_mem_region * pg_next_reg = NULL; // next immediate mem reg after page start addr
+    uint32_t page_size = PAGE_SIZE_4KB;
+
+    /* If the guest has been configured for large pages, then we must check for hooked regions of
+     * memory which may overlap with the large page containing the faulting address (due to
+     * potentially differing access policies in place for e.g. i/o devices and APIC). A large page
+     * can be used if a) no region overlaps the page [or b) a region does overlap but fully contains
+     * the page]. The [bracketed] text pertains to the #if 0'd code below, state D. TODO modify this
+     * note if someone decides to enable this optimization. It can be tested with the SeaStar
+     * mapping.
+     *
+     * Examples: (CAPS regions are returned by v3_get_next_mem_region; state A returns the base reg)
+     *
+     *   |region|        |region|                       2MiB mapped (state A)
+     *                 |reg|          |REG|             2MiB mapped (state B)
+     * |region|  |reg|      |REG|  |region|  |reg|      4KiB mapped (state C)
+     *     |reg|  |reg|        |--REGION---|           [2MiB mapped (state D)]
+     * |--------------------------------------------|   RAM
+     *                            ^                     fault addr
+     * |----|----|----|----|----|page|----|----|----|   2MB pages
+     *                           >>>>>>>>>>>>>>>>>>>>   search space
+     */
+
+
+    // guest page maps to a host page + offset (so when we shift, it aligns with a host page)
+    switch (req_size) {
+	case PAGE_SIZE_4KB:
+	    return PAGE_SIZE_4KB;
+	case PAGE_SIZE_2MB:
+	    pg_start = PAGE_ADDR_2MB(fault_addr);
+	    pg_end = (pg_start + PAGE_SIZE_2MB);
+	    break;
+	case PAGE_SIZE_4MB:
+	    pg_start = PAGE_ADDR_4MB(fault_addr);
+	    pg_end = (pg_start + PAGE_SIZE_4MB);
+	    break;
+	case PAGE_SIZE_1GB:
+	    pg_start = PAGE_ADDR_1GB(fault_addr);
+	    pg_end = (pg_start + PAGE_SIZE_1GB);
+	    break;
+	default:
+	    PrintError("Invalid large page size requested.\n");
+	    return -1;
+    }
+
+    PrintDebug("%s: page [%p,%p) contains address\n", __FUNCTION__, (void *)pg_start, (void *)pg_end);
+
+    pg_next_reg = v3_get_next_mem_region(core->vm_info, core->cpu_id, pg_start);
+
+    if (pg_next_reg == NULL) {
+	PrintError("%s: Error: address not in base region, %p\n", __FUNCTION__, (void *)fault_addr);
+	return PAGE_SIZE_4KB;
+    }
+
+    if (pg_next_reg->flags.base == 1) {
+	page_size = req_size; // State A
+	PrintDebug("%s: base region [%p,%p) contains page.\n", __FUNCTION__,
+		   (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end);
+    } else {
+#if 0 // State B/C and D optimization
+	if ((pg_next_reg->guest_end >= pg_end) &&
+	    ((pg_next_reg->guest_start >= pg_end) || (pg_next_reg->guest_start <= pg_start))) {
+	    page_size = req_size;
+	}
+
+	PrintDebug("%s: region [%p,%p) %s partially overlap with page\n", __FUNCTION__,
+		   (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end,
+		   (page_size == req_size) ? "does not" : "does");
+
+#else // State B/C
+	if (pg_next_reg->guest_start >= pg_end) {
+
+	    page_size = req_size;
+	}
+
+	PrintDebug("%s: region [%p,%p) %s overlap with page\n", __FUNCTION__,
+		   (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end,
+		   (page_size == req_size) ? "does not" : "does");
+
+#endif
+    }
+
+    return page_size;
+}
+
 
 void v3_print_mem_map(struct v3_vm_info * vm) {
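The region test above reduces to a small decision procedure: the requested size is granted when the page lies entirely inside the base region (state A), or when the next hooked region starts at or beyond the page's end (state B); any overlap forces a 4KB fallback (state C). A self-contained illustration follows, with a toy `struct region` standing in for `struct v3_mem_region` and the region lookup replaced by a plain argument.

#include <stdint.h>
#include <stdio.h>

typedef uintptr_t addr_t;

#define PAGE_SIZE_4KB (4096U)
#define PAGE_SIZE_2MB (2U * 1024 * 1024)

/* Toy stand-in for struct v3_mem_region: just the fields the check reads. */
struct region {
    addr_t guest_start;
    addr_t guest_end;
    int    is_base;  /* flags.base in the real structure */
};

/* Decide whether the 2MB page ending at pg_end may be mapped large, given
 * the next region at or after the page's start (NULL if there is none). */
static uint32_t max_page_size(struct region * next, addr_t pg_end) {
    if (next == NULL) {
        return PAGE_SIZE_4KB;                /* address not in a base region */
    }
    if (next->is_base) {
        return PAGE_SIZE_2MB;                /* state A */
    }
    return (next->guest_start >= pg_end) ? PAGE_SIZE_2MB   /* state B */
                                         : PAGE_SIZE_4KB;  /* state C */
}

int main(void) {
    addr_t pg_start = 0x40200000UL;
    addr_t pg_end   = pg_start + PAGE_SIZE_2MB;

    struct region apic     = { 0x40250000UL, 0x40251000UL, 0 }; /* overlaps page  */
    struct region far_hook = { 0x50000000UL, 0x50001000UL, 0 }; /* beyond pg_end  */

    printf("overlapping hook -> %u KB\n", (unsigned)(max_page_size(&apic, pg_end) / 1024));
    printf("distant hook     -> %u KB\n", (unsigned)(max_page_size(&far_hook, pg_end) / 1024));
    return 0;
}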
"does not" : "does"); + +#else // State B/C + if (pg_next_reg->guest_start >= pg_end) { + + page_size = req_size; + } + + PrintDebug("%s: region [%p,%p) %s overlap with page\n", __FUNCTION__, + (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end, + (page_size == req_size) ? "does not" : "does"); + +#endif + } + + return page_size; +} void v3_print_mem_map(struct v3_vm_info * vm) { diff --git a/utils/guest_creator/default.xml b/utils/guest_creator/default.xml index db20b2c..3e60cd6 100644 --- a/utils/guest_creator/default.xml +++ b/utils/guest_creator/default.xml @@ -3,16 +3,17 @@ - 256 + 256 enable VTLB + 100