Refactoring and additions to direct paging (nested and passthrough)

[palacios.git] / palacios / src / palacios / vmm_direct_paging_64.h
diff --git a/palacios/src/palacios/vmm_direct_paging_64.h b/palacios/src/palacios/vmm_direct_paging_64.h

index 80dce91..6afb9fb 100644 (file)
--- a/palacios/src/palacios/vmm_direct_paging_64.h
+++ b/palacios/src/palacios/vmm_direct_paging_64.h
@@ -27,85 +27,12 @@
 #include <palacios/vm_guest_mem.h>
 #include <palacios/vm_guest.h>
 
-// Reference: AMD Software Developer Manual Vol.2 Ch.5 "Page Translation and Protection"
-
-static int get_page_size() {
-
-    // Need to fix this....
-    return PAGE_SIZE_4KB; 
-
-
-#if 0
-   struct v3_mem_region * base_reg = &(info->vm_info->mem_map.base_region);
-
-   /* If the guest has been configured for 2MiB pages, then we must check for hooked regions of
-     * memory which may overlap with the 2MiB page containing the faulting address (due to
-     * potentially differing access policies in place for e.g. i/o devices and APIC). A 2MiB page
-     * can be used if a) no region overlaps the page [or b) a region does overlap but fully contains
-     * the page]. The [bracketed] text pertains to the #if 0'd code below, state D. TODO modify this
-     * note if someone decides to enable this optimization. It can be tested with the SeaStar
-     * mapping.
-     *
-     * Examples: (CAPS regions are returned by v3_get_next_mem_region; state A returns the base reg)
-     *
-     *    |region| |region|                               2MiB mapped (state A)
-     *                   |reg|          |REG|             2MiB mapped (state B)
-     *   |region|     |reg|   |REG| |region|   |reg|      4KiB mapped (state C)
-     *        |reg|  |reg|   |--REGION---|                [2MiB mapped (state D)]
-     * |--------------------------------------------|     RAM
-     *                             ^                      fault addr
-     * |----|----|----|----|----|page|----|----|----|     2MB pages
-     *                           >>>>>>>>>>>>>>>>>>>>     search space
-     */
-    addr_t pg_start = 0UL, pg_end = 0UL; // 2MiB page containing the faulting address
-    struct v3_mem_region * pg_next_reg = NULL; // next immediate mem reg after page start addr
-    bool use_large_page = false;
-
-    if (region == NULL) {
-       PrintError("%s: invalid region, addr=%p\n", __FUNCTION__, (void *)fault_addr);
-       return -1;
-    }
-
-    // set use_large_page here
-    if (info->vm_info->paging_size == PAGING_2MB) {
-
-       // guest page maps to a host page + offset (so when we shift, it aligns with a host page)
-       pg_start = PAGE_ADDR_2MB(fault_addr);
-       pg_end = (pg_start + PAGE_SIZE_2MB);
-
-       PrintDebug("%s: page   [%p,%p) contains address\n", __FUNCTION__, (void *)pg_start, (void *)pg_end);
-
-       pg_next_reg = v3_get_next_mem_region(info->vm_info, info->cpu_id, pg_start);
-
-       if (pg_next_reg == NULL) {
-           PrintError("%s: Error: address not in base region, %p\n", __FUNCTION__, (void *)fault_addr);
-           return -1;
-       }
-
-       if (pg_next_reg->base == 1) { // next region == base region
-           use_large_page = 1; // State A
-       } else {
-#if 0       // State B/C and D optimization
-           use_large_page = (pg_next_reg->guest_end >= pg_end) &&
-               ((pg_next_reg->guest_start >= pg_end) || (pg_next_reg->guest_start <= pg_start));
-           PrintDebug("%s: region [%p,%p) %s partial overlap with page\n", __FUNCTION__,
-                   (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end,
-                   (use_large_page ? "does not have" : "has"));
-#else       // State B/C
-           use_large_page = (pg_next_reg->guest_start >= pg_end);
-           PrintDebug("%s: region [%p,%p) %s overlap with page\n", __FUNCTION__,
-                   (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end,
-                   (use_large_page ? "does not have" : "has"));
-#endif
-       }
-    }
-
-    PrintDebug("%s: Address gets a 2MiB page? %s\n", __FUNCTION__, (use_large_page ? "yes" : "no"));
-#endif
-}
+/* This always builds 4-level page tables, but large pages are allowed. */
 
+// Reference: AMD Software Developer Manual Vol.2 Ch.5 "Page Translation and Protection"
 
-static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr_t fault_addr, pf_error_t error_code) {
+static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr_t fault_addr, pf_error_t error_code,
+                                                 addr_t *actual_start, addr_t *actual_end) {
     pml4e64_t * pml      = NULL;
     pdpe64_t * pdpe      = NULL;
     pde64_t * pde        = NULL;
@@ -118,18 +45,24 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr
     int pde_index  = PDE64_INDEX(fault_addr);
     int pte_index  = PTE64_INDEX(fault_addr);
 
-    struct v3_mem_region * region =  v3_get_mem_region(core->vm_info, core->cpu_id, fault_addr);
+    struct v3_mem_region * region =  v3_get_mem_region(core->vm_info, core->vcpu_id, fault_addr);
     int page_size = PAGE_SIZE_4KB;
 
+    if (region == NULL) {
+       PrintError(core->vm_info, core, "%s: invalid region, addr=%p\n", __FUNCTION__, (void *)fault_addr);
+       return -1;
+    }
 
     /*  Check if:
      *  1. the guest is configured to use large pages and 
      *         2. the memory regions can be referenced by a large page
      */
-    if ((core->use_large_pages == 1) ) {
-       page_size = get_page_size();
+    if ((core->use_large_pages == 1) || (core->use_giant_pages == 1)) {
+       page_size = v3_get_max_page_size(core, fault_addr, LONG);
     }
 
+    PrintDebug(core->vm_info, core, "Using page size of %dKB\n", page_size / 1024);
+
  
     // Lookup the correct PML address based on the PAGING MODE
     if (core->shdw_pg_mode == SHADOW_PAGING) {
@@ -140,7 +73,7 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr
 
     //Fix up the PML entry
     if (pml[pml_index].present == 0) {
-       pdpe = (pdpe64_t *)create_generic_pt_page();
+       pdpe = (pdpe64_t *)create_generic_pt_page(core);
    
        // Set default PML Flags...
        pml[pml_index].present = 1;
@@ -154,7 +87,7 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr
 
     // Fix up the PDPE entry
     if (pdpe[pdpe_index].present == 0) {
-       pde = (pde64_t *)create_generic_pt_page();
+       pde = (pde64_t *)create_generic_pt_page(core);
        
        // Set default PDPE Flags...
        pdpe[pdpe_index].present = 1;
@@ -171,6 +104,9 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr
        pde2mb = (pde64_2MB_t *)pde; // all but these two lines are the same for PTE
        pde2mb[pde_index].large_page = 1;
 
+       *actual_start = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(fault_addr));
+       *actual_end = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(fault_addr)+1)-1;
+
        if (pde2mb[pde_index].present == 0) {
            pde2mb[pde_index].user_page = 1;
 
@@ -186,7 +122,7 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr
                }
 
                if (v3_gpa_to_hpa(core, fault_addr, &host_addr) == -1) {
-                   PrintError("Error Could not translate fault addr (%p)\n", (void *)fault_addr);
+                   PrintError(core->vm_info, core, "Error Could not translate fault addr (%p)\n", (void *)fault_addr);
                    return -1;
                }
 
@@ -207,9 +143,12 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr
 
     // Continue with the 4KiB page heirarchy
     
+    *actual_start = BASE_TO_PAGE_ADDR_4KB(PAGE_BASE_ADDR_4KB(fault_addr));
+    *actual_end = BASE_TO_PAGE_ADDR_4KB(PAGE_BASE_ADDR_4KB(fault_addr)+1)-1;
+
     // Fix up the PDE entry
     if (pde[pde_index].present == 0) {
-       pte = (pte64_t *)create_generic_pt_page();
+       pte = (pte64_t *)create_generic_pt_page(core);
        
        pde[pde_index].present = 1;
        pde[pde_index].writable = 1;
@@ -236,7 +175,7 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr
            }
 
            if (v3_gpa_to_hpa(core, fault_addr, &host_addr) == -1) {
-               PrintError("Error Could not translate fault addr (%p)\n", (void *)fault_addr);
+               PrintError(core->vm_info, core, "Error Could not translate fault addr (%p)\n", (void *)fault_addr);
                return -1;
            }
 
@@ -254,7 +193,8 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr
     return 0;
 }
 
-static inline int invalidate_addr_64(struct guest_info * core, addr_t inv_addr) {
+static inline int invalidate_addr_64_internal(struct guest_info * core, addr_t inv_addr,
+                                             addr_t *actual_start, uint64_t *actual_size) {
     pml4e64_t * pml = NULL;
     pdpe64_t * pdpe = NULL;
     pde64_t * pde = NULL;
@@ -279,34 +219,91 @@ static inline int invalidate_addr_64(struct guest_info * core, addr_t inv_addr)
     }
 
     if (pml[pml_index].present == 0) {
-       return 0;
+        *actual_start = BASE_TO_PAGE_ADDR_512GB(PAGE_BASE_ADDR_512GB(inv_addr));
+        *actual_size = PAGE_SIZE_512GB;
+       return 0;
     }
 
     pdpe = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pml[pml_index].pdp_base_addr));
 
     if (pdpe[pdpe_index].present == 0) {
+        *actual_start = BASE_TO_PAGE_ADDR_1GB(PAGE_BASE_ADDR_1GB(inv_addr));
+        *actual_size = PAGE_SIZE_1GB;
        return 0;
     } else if (pdpe[pdpe_index].large_page == 1) { // 1GiB
        pdpe[pdpe_index].present = 0;
+       pdpe[pdpe_index].writable = 0;
+       pdpe[pdpe_index].user_page = 0;
+        *actual_start = BASE_TO_PAGE_ADDR_1GB(PAGE_BASE_ADDR_1GB(inv_addr));
+        *actual_size = PAGE_SIZE_1GB;
        return 0;
     }
 
     pde = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pdpe[pdpe_index].pd_base_addr));
 
     if (pde[pde_index].present == 0) {
+        *actual_start = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(inv_addr));
+        *actual_size = PAGE_SIZE_2MB;
        return 0;
     } else if (pde[pde_index].large_page == 1) { // 2MiB
        pde[pde_index].present = 0;
+       pde[pde_index].writable = 0;
+       pde[pde_index].user_page = 0;
+        *actual_start = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(inv_addr));
+        *actual_size = PAGE_SIZE_2MB;
        return 0;
     }
 
     pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pde[pde_index].pt_base_addr));
 
     pte[pte_index].present = 0; // 4KiB
+    pte[pte_index].writable = 0;
+    pte[pte_index].user_page = 0;
+
+    *actual_start = BASE_TO_PAGE_ADDR_4KB(PAGE_BASE_ADDR_4KB(inv_addr));
+    *actual_size = PAGE_SIZE_4KB;
 
     return 0;
 }
 
+// Invalidate the mapping that covers inv_addr. On success the inclusive range
+// affected by the page table walk is reported as [*actual_start, *actual_end].
+static inline int invalidate_addr_64(struct guest_info * core, addr_t inv_addr, 
+                                    addr_t *actual_start, addr_t *actual_end)
+{
+  uint64_t len = 0;
+  int rc = invalidate_addr_64_internal(core,inv_addr,actual_start,&len);
+
+  // only trust start/len when the walk succeeded (NOTE(review): its failure paths are not all visible here)
+  if (rc == 0) { *actual_end = *actual_start + len - 1; }
+  return rc;
+}
+   
+// Invalidate every mapping that overlaps [inv_addr_start, inv_addr_end] (inclusive).
+// On success [*actual_start, *actual_end] is the inclusive range actually covered.
+static inline int invalidate_addr_64_range(struct guest_info * core, addr_t inv_addr_start, addr_t inv_addr_end, 
+                                          addr_t *actual_start, addr_t *actual_end)
+{
+  addr_t next = inv_addr_start;
+  addr_t start = 0;
+  addr_t last = 0;
+  uint64_t len = 0;
+  int rc;
+  // empty/inverted request: nothing to walk, so echo the request instead of reporting garbage
+  if (inv_addr_start > inv_addr_end) { *actual_start = inv_addr_start; *actual_end = inv_addr_end; return 0; }
+  do {
+    rc = invalidate_addr_64_internal(core,next,&start, &len);
+    if (rc) { return rc; }
+    // first iteration, capture where we start invalidating
+    if (next==inv_addr_start) { *actual_start = start; }
+    last = start + len - 1;
+    *actual_end = last;
+    next = last + 1;
+    // compare the inclusive end so a step that wraps next to 0 cannot restart the walk
+  } while (last < inv_addr_end);
+  return 0;
+}
+
 
 
 #endif