Changes to support large shadow pages *correctly*.

diff --git a/palacios/include/palacios/vmm_mem.h b/palacios/include/palacios/vmm_mem.h

index a8e776a..651e82e 100644 (file)
--- a/palacios/include/palacios/vmm_mem.h
+++ b/palacios/include/palacios/vmm_mem.h
@@ -109,7 +109,7 @@ struct v3_mem_region * v3_get_next_mem_region(struct v3_vm_info * vm, uint16_t c
 
 void v3_print_mem_map(struct v3_vm_info * vm);
 
-
+uint32_t v3_get_max_page_size(struct guest_info * core, addr_t fault_addr, uint32_t req_size);
 
 
 
diff --git a/palacios/src/palacios/mmu/vmm_shdw_pg_tlb_64.h b/palacios/src/palacios/mmu/vmm_shdw_pg_tlb_64.h

index db59d7d..aff8034 100644 (file)
--- a/palacios/src/palacios/mmu/vmm_shdw_pg_tlb_64.h
+++ b/palacios/src/palacios/mmu/vmm_shdw_pg_tlb_64.h
@@ -332,23 +332,32 @@ static int handle_pde_shadow_pagefault_64(struct guest_info * info, addr_t fault
        return 0;
     }
 
-    // Handle as a shadow large page if possible
-    if (guest_pde->large_page 
-       && (info->vm_info->mem_align >= PAGE_SIZE_2MB)) {
-       if (handle_2MB_shadow_pagefault_pde_64(info, fault_addr, error_code, shadow_pde_access,
-                                              (pde64_2MB_t *)shadow_pde, (pde64_2MB_t *)guest_pde) == -1) {
-           PrintError("Error handling large pagefault with large page\n");
-           return -1;
-       } else {
-           return 0;
-       }
-    } 
-
     pte64_t * shadow_pt = NULL;
     pte64_t * guest_pt = NULL;
 
     // get the next shadow page level, allocate if not present
     if (shadow_pde_access == PT_ACCESS_NOT_PRESENT) {
+        // Check if  we can use large pages and the guest memory is properly aligned
+        // to potentially use a large page
+        if (info->use_large_pages && guest_pde->large_page 
+           && (info->vm_info->mem_align >= PAGE_SIZE_2MB)) {
+            // Check underlying physical memory map to see if a large page is viable
+           addr_t guest_pa = BASE_TO_PAGE_ADDR_2MB(((pde64_2MB_t *)guest_pde)->page_base_addr);
+           uint32_t max_size = v3_get_max_page_size(info, guest_pa, PAGE_SIZE_2MB);
+           if (max_size >= PAGE_SIZE_2MB) {
+               if (handle_2MB_shadow_pagefault_pde_64(info, fault_addr, error_code, shadow_pde_access,
+                                                      (pde64_2MB_t *)shadow_pde, (pde64_2MB_t *)guest_pde) ==  0) {
+                   return 0;
+               } else {
+                   PrintError("Error handling large pagefault with large page\n");
+                   return -1;
+               }
+           } else {
+               PrintDebug("Underlying physical memory map doesn't allow use of a large page.\n");
+           }
+           // Fallthrough to handle the region with small pages
+       }
+
        struct shadow_page_data * shdw_page = create_new_shadow_pt(info);
        shadow_pt = (pte64_t *)V3_VAddr((void *)shdw_page->page_pa);
 
diff --git a/palacios/src/palacios/vmm_config.c b/palacios/src/palacios/vmm_config.c

index 22ab611..7a596d2 100644 (file)
--- a/palacios/src/palacios/vmm_config.c
+++ b/palacios/src/palacios/vmm_config.c
@@ -301,6 +301,7 @@ static int determine_paging_mode(struct guest_info *info, v3_cfg_tree_t * core_c
        info->shdw_pg_mode = SHADOW_PAGING;
     }
 
+
     if (info->shdw_pg_mode == NESTED_PAGING) {
        PrintDebug("Guest Paging Mode: NESTED_PAGING\n");
     } else if (info->shdw_pg_mode == SHADOW_PAGING) {
@@ -309,6 +310,12 @@ static int determine_paging_mode(struct guest_info *info, v3_cfg_tree_t * core_c
        PrintError("Guest paging mode incorrectly set.\n");
        return -1;
     }
+
+    if (strcasecmp(v3_cfg_val(pg_tree, "large_pages"), "true") == 0) {
+       info->use_large_pages = 1;
+       PrintDebug("Use of large pages in memory virtualization enabled.\n");
+    }
+
     return 0;
 }
 
diff --git a/palacios/src/palacios/vmm_direct_paging_64.h b/palacios/src/palacios/vmm_direct_paging_64.h

index d45ae17..97324d4 100644 (file)
--- a/palacios/src/palacios/vmm_direct_paging_64.h
+++ b/palacios/src/palacios/vmm_direct_paging_64.h
@@ -29,75 +29,6 @@
 
 // Reference: AMD Software Developer Manual Vol.2 Ch.5 "Page Translation and Protection"
 
-static uint32_t get_page_size(struct guest_info * core, addr_t fault_addr) {
-    addr_t pg_start = 0UL, pg_end = 0UL; // 2MiB page containing the faulting address
-    struct v3_mem_region * pg_next_reg = NULL; // next immediate mem reg after page start addr
-    uint32_t page_size = PAGE_SIZE_4KB;
-
-   /* If the guest has been configured for 2MiB pages, then we must check for hooked regions of
-     * memory which may overlap with the 2MiB page containing the faulting address (due to
-     * potentially differing access policies in place for e.g. i/o devices and APIC). A 2MiB page
-     * can be used if a) no region overlaps the page [or b) a region does overlap but fully contains
-     * the page]. The [bracketed] text pertains to the #if 0'd code below, state D. TODO modify this
-     * note if someone decides to enable this optimization. It can be tested with the SeaStar
-     * mapping.
-     *
-     * Examples: (CAPS regions are returned by v3_get_next_mem_region; state A returns the base reg)
-     *
-     *    |region| |region|                               2MiB mapped (state A)
-     *                   |reg|          |REG|             2MiB mapped (state B)
-     *   |region|     |reg|   |REG| |region|   |reg|      4KiB mapped (state C)
-     *        |reg|  |reg|   |--REGION---|                [2MiB mapped (state D)]
-     * |--------------------------------------------|     RAM
-     *                             ^                      fault addr
-     * |----|----|----|----|----|page|----|----|----|     2MB pages
-     *                           >>>>>>>>>>>>>>>>>>>>     search space
-     */
-
-
-    // guest page maps to a host page + offset (so when we shift, it aligns with a host page)
-    pg_start = PAGE_ADDR_2MB(fault_addr);
-    pg_end = (pg_start + PAGE_SIZE_2MB);
-
-    PrintDebug("%s: page   [%p,%p) contains address\n", __FUNCTION__, (void *)pg_start, (void *)pg_end);
-
-    pg_next_reg = v3_get_next_mem_region(core->vm_info, core->cpu_id, pg_start);
-
-    if (pg_next_reg == NULL) {
-       PrintError("%s: Error: address not in base region, %p\n", __FUNCTION__, (void *)fault_addr);
-       return PAGE_SIZE_4KB;
-    }
-
-    if (pg_next_reg->flags.base == 1) {
-       page_size = PAGE_SIZE_2MB; // State A
-    } else {
-#if 0       // State B/C and D optimization
-       if ((pg_next_reg->guest_end >= pg_end) &&
-           ((pg_next_reg->guest_start >= pg_end) || (pg_next_reg->guest_start <= pg_start))) {     
-           page_size = PAGE_SIZE_2MB;
-       }
-
-       PrintDebug("%s: region [%p,%p) %s partially overlap with page\n", __FUNCTION__,
-                  (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end, 
-                  (page_size == PAGE_SIZE_2MB) ? "does not" : "does");
-
-#else       // State B/C
-       if (pg_next_reg->guest_start >= pg_end) {
-           
-           page_size = PAGE_SIZE_2MB;
-       }
-
-       PrintDebug("%s: region [%p,%p) %s overlap with page\n", __FUNCTION__,
-                  (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end,
-                  (page_size == PAGE_SIZE_2MB) ? "does not" : "does");
-
-#endif
-    }
-
-    return page_size;
-}
-
-
 static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr_t fault_addr, pf_error_t error_code) {
     pml4e64_t * pml      = NULL;
     pdpe64_t * pdpe      = NULL;
@@ -124,7 +55,7 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr
      *         2. the memory regions can be referenced by a large page
      */
     if ((core->use_large_pages == 1) ) {
-       page_size = get_page_size(core, fault_addr);
+       page_size = v3_get_max_page_size(core, fault_addr, PAGE_SIZE_2MB);
     }
 
     PrintDebug("Using page size of %dKB\n", page_size / 1024);
diff --git a/palacios/src/palacios/vmm_mem.c b/palacios/src/palacios/vmm_mem.c

index c45f18d..78ee376 100644 (file)
--- a/palacios/src/palacios/vmm_mem.c
+++ b/palacios/src/palacios/vmm_mem.c
@@ -385,6 +385,93 @@ void v3_delete_mem_region(struct v3_vm_info * vm, struct v3_mem_region * reg) {
 
 }
 
+// Determine the largest page size (req_size or 4KB) usable for the page containing fault_addr.
+// Returns req_size when the region reported by v3_get_next_mem_region() for the start of the
+// req_size-aligned page is the base region or begins at/after the page end; otherwise returns
+// PAGE_SIZE_4KB. An unsupported req_size or a failed region lookup also yields PAGE_SIZE_4KB.
+uint32_t v3_get_max_page_size(struct guest_info * core, addr_t fault_addr, uint32_t req_size) {
+    addr_t pg_start = 0UL, pg_end = 0UL; // large page containing the faulting address
+    struct v3_mem_region * pg_next_reg = NULL; // next immediate mem reg after page start addr
+    uint32_t page_size = PAGE_SIZE_4KB;
+
+   /* If the guest has been configured for large pages, then we must check for hooked regions of
+     * memory which may overlap with the large page containing the faulting address (due to
+     * potentially differing access policies in place for e.g. i/o devices and APIC). A large page
+     * can be used if a) no region overlaps the page [or b) a region does overlap but fully contains
+     * the page]. The [bracketed] text pertains to the #if 0'd code below, state D. TODO modify this
+     * note if someone decides to enable this optimization. It can be tested with the SeaStar
+     * mapping.
+     *
+     * Examples: (CAPS regions are returned by v3_get_next_mem_region; state A returns the base reg)
+     *
+     *    |region| |region|                               2MiB mapped (state A)
+     *                   |reg|          |REG|             2MiB mapped (state B)
+     *   |region|     |reg|   |REG| |region|   |reg|      4KiB mapped (state C)
+     *        |reg|  |reg|   |--REGION---|                [2MiB mapped (state D)]
+     * |--------------------------------------------|     RAM
+     *                             ^                      fault addr
+     * |----|----|----|----|----|page|----|----|----|     2MB pages
+     *                           >>>>>>>>>>>>>>>>>>>>     search space
+     */
+
+    // guest page maps to a host page + offset (so when we shift, it aligns with a host page)
+    switch (req_size) {
+       case PAGE_SIZE_4KB:
+               return PAGE_SIZE_4KB;
+       case PAGE_SIZE_2MB:
+               pg_start = PAGE_ADDR_2MB(fault_addr);
+               pg_end = (pg_start + PAGE_SIZE_2MB);
+               break;
+       case PAGE_SIZE_4MB:
+               pg_start = PAGE_ADDR_4MB(fault_addr);
+               pg_end = (pg_start + PAGE_SIZE_4MB);
+               break;
+       case PAGE_SIZE_1GB:
+               pg_start = PAGE_ADDR_1GB(fault_addr);
+               pg_end = (pg_start + PAGE_SIZE_1GB);
+               break;
+       default:
+               PrintError("Invalid large page size requested.\n");
+               return PAGE_SIZE_4KB; // not -1: as uint32_t that is 0xFFFFFFFF and passes ">= size" checks
+    }
+
+    PrintDebug("%s: page   [%p,%p) contains address\n", __FUNCTION__, (void *)pg_start, (void *)pg_end);
+
+    pg_next_reg = v3_get_next_mem_region(core->vm_info, core->cpu_id, pg_start);
+
+    if (pg_next_reg == NULL) {
+       PrintError("%s: Error: address not in base region, %p\n", __FUNCTION__, (void *)fault_addr);
+       return PAGE_SIZE_4KB;
+    }
+
+    if (pg_next_reg->flags.base == 1) {
+       page_size = req_size; // State A
+       PrintDebug("%s: base region [%p,%p) contains page.\n", __FUNCTION__,
+                  (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end);
+    } else {
+#if 0       // State B/C and D optimization
+       if ((pg_next_reg->guest_end >= pg_end) &&
+           ((pg_next_reg->guest_start >= pg_end) || (pg_next_reg->guest_start <= pg_start))) {     
+           page_size = req_size;
+       }
+
+       PrintDebug("%s: region [%p,%p) %s partially overlap with page\n", __FUNCTION__,
+                  (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end, 
+                  (page_size == req_size) ? "does not" : "does");
+
+#else       // State B/C
+       if (pg_next_reg->guest_start >= pg_end) {
+           page_size = req_size; // State B: next region starts beyond this page
+       }
+
+       PrintDebug("%s: region [%p,%p) %s overlap with page\n", __FUNCTION__,
+                  (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end,
+                  (page_size == req_size) ? "does not" : "does");
+#endif
+    }
+
+    return page_size;
+}
 
 
 void v3_print_mem_map(struct v3_vm_info * vm) {
diff --git a/utils/guest_creator/default.xml b/utils/guest_creator/default.xml

index db20b2c..3e60cd6 100644 (file)
--- a/utils/guest_creator/default.xml
+++ b/utils/guest_creator/default.xml
@@ -3,16 +3,17 @@
 <vm class="PC"> 
 
        <!-- Memory in MB -->
-       <memory alignment="4KB">256</memory> 
+       <memory alignment="2MB">256</memory> 
 
        <!-- Basic VMM system flags -->
        <telemetry>enable</telemetry>
        <paging mode="shadow">
                <strategy>VTLB</strategy>
+               <large_pages/>
        </paging>
 <!--
        <paging mode="nested">
-               <pagesize>4KB</pagesize>
+               <large_pages>true</large_pages>
        </paging>
 -->
        <schedule_hz>100</schedule_hz>
palacios/include/palacios/vmm_mem.h		patch \| blob \| history
palacios/src/palacios/mmu/vmm_shdw_pg_tlb_64.h		patch \| blob \| history
palacios/src/palacios/vmm_config.c		patch \| blob \| history
palacios/src/palacios/vmm_direct_paging_64.h		patch \| blob \| history
palacios/src/palacios/vmm_mem.c		patch \| blob \| history
utils/guest_creator/default.xml		patch \| blob \| history