#include <palacios/vm_guest_mem.h>
#include <palacios/vm_guest.h>
-// Reference: AMD Software Developer Manual Vol.2 Ch.5 "Page Translation and Protection"
-
-static int check_large_page_ok() {
-
- // Need to fix this....
- return 0;
-
-
-#if 0
- struct v3_mem_region * base_reg = &(info->vm_info->mem_map.base_region);
-
- /* If the guest has been configured for 2MiB pages, then we must check for hooked regions of
- * memory which may overlap with the 2MiB page containing the faulting address (due to
- * potentially differing access policies in place for e.g. i/o devices and APIC). A 2MiB page
- * can be used if a) no region overlaps the page [or b) a region does overlap but fully contains
- * the page]. The [bracketed] text pertains to the #if 0'd code below, state D. TODO modify this
- * note if someone decides to enable this optimization. It can be tested with the SeaStar
- * mapping.
- *
- * Examples: (CAPS regions are returned by v3_get_next_mem_region; state A returns the base reg)
- *
- * |region| |region| 2MiB mapped (state A)
- * |reg| |REG| 2MiB mapped (state B)
- * |region| |reg| |REG| |region| |reg| 4KiB mapped (state C)
- * |reg| |reg| |--REGION---| [2MiB mapped (state D)]
- * |--------------------------------------------| RAM
- * ^ fault addr
- * |----|----|----|----|----|page|----|----|----| 2MB pages
- * >>>>>>>>>>>>>>>>>>>> search space
- */
- addr_t pg_start = 0UL, pg_end = 0UL; // 2MiB page containing the faulting address
- struct v3_mem_region * pg_next_reg = NULL; // next immediate mem reg after page start addr
- bool use_large_page = false;
-
- if (region == NULL) {
- PrintError("%s: invalid region, addr=%p\n", __FUNCTION__, (void *)fault_addr);
- return -1;
- }
-
- // set use_large_page here
- if (info->vm_info->paging_size == PAGING_2MB) {
-
- // guest page maps to a host page + offset (so when we shift, it aligns with a host page)
- pg_start = PAGE_ADDR_2MB(fault_addr);
- pg_end = (pg_start + PAGE_SIZE_2MB);
-
- PrintDebug("%s: page [%p,%p) contains address\n", __FUNCTION__, (void *)pg_start, (void *)pg_end);
-
- pg_next_reg = v3_get_next_mem_region(info->vm_info, info->cpu_id, pg_start);
-
- if (pg_next_reg == NULL) {
- PrintError("%s: Error: address not in base region, %p\n", __FUNCTION__, (void *)fault_addr);
- return -1;
- }
-
- if ((pg_next_reg->base == 1) { // next region == base region
- use_large_page = 1; // State A
- } else {
-#if 0 // State B/C and D optimization
- use_large_page = (pg_next_reg->guest_end >= pg_end) &&
- ((pg_next_reg->guest_start >= pg_end) || (pg_next_reg->guest_start <= pg_start));
- PrintDebug("%s: region [%p,%p) %s partial overlap with page\n", __FUNCTION__,
- (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end,
- (use_large_page ? "does not have" : "has"));
-#else // State B/C
- use_large_page = (pg_next_reg->guest_start >= pg_end);
- PrintDebug("%s: region [%p,%p) %s overlap with page\n", __FUNCTION__,
- (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end,
- (use_large_page ? "does not have" : "has"));
-#endif
- }
- }
-
- PrintDebug("%s: Address gets a 2MiB page? %s\n", __FUNCTION__, (use_large_page ? "yes" : "no"));
-#endif
-}
+/* this always builds 4 level page tables, but large pages are allowed
+ *
+ * Handles a passthrough page fault at fault_addr for the given core when
+ * 64-bit page tables are in use.
+ *
+ * The memory region containing fault_addr is looked up with
+ * v3_get_mem_region(); if there is none, an error is printed and -1 is
+ * returned.
+ *
+ * The page size starts out as PAGE_SIZE_4KB. When core->use_large_pages or
+ * core->use_giant_pages is set, v3_get_max_page_size() is asked which size
+ * to use for fault_addr.
+ *
+ * actual_start / actual_end (outputs) receive the first and last byte
+ * address of the page chosen for the fault: the enclosing 2MiB page when
+ * page_size == PAGE_SIZE_2MB, otherwise the enclosing 4KiB page.
+ *
+ * Returns 0 on success, -1 when v3_gpa_to_hpa() cannot translate
+ * fault_addr, or whatever region->unhandled() returns when the fault is
+ * passed on to the region's handler.
+ *
+ * NOTE(review): PAGE_SIZE_2MB is the only large size special-cased in this
+ * function as far as can be seen; confirm what happens when
+ * v3_get_max_page_size() returns a bigger size for giant pages.
+ */
+// Reference: AMD Software Developer Manual Vol.2 Ch.5 "Page Translation and Protection"
-static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr_t fault_addr, pf_error_t error_code) {
+static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr_t fault_addr, pf_error_t error_code,
+ addr_t *actual_start, addr_t *actual_end) {
pml4e64_t * pml = NULL;
pdpe64_t * pdpe = NULL;
pde64_t * pde = NULL;
int pde_index = PDE64_INDEX(fault_addr);
int pte_index = PTE64_INDEX(fault_addr);
- struct v3_mem_region * region = v3_get_mem_region(core->vm_info, core->cpu_id, fault_addr);
- int use_large_page = 0;
+ struct v3_mem_region * region = v3_get_mem_region(core->vm_info, core->vcpu_id, fault_addr);
+ int page_size = PAGE_SIZE_4KB;
+ if (region == NULL) {
+ PrintError(core->vm_info, core, "%s: invalid region, addr=%p\n", __FUNCTION__, (void *)fault_addr);
+ return -1;
+ }
/* Check if:
* 1. the guest is configured to use large pages and
* 2. the memory regions can be referenced by a large page
*/
- if ((core->use_large_pages == 1) && (check_large_page_ok() == 1)) {
- use_large_page = 1;
+ if ((core->use_large_pages == 1) || (core->use_giant_pages == 1)) {
+ page_size = v3_get_max_page_size(core, fault_addr, LONG);
}
+ PrintDebug(core->vm_info, core, "Using page size of %dKB\n", page_size / 1024);
+
// Lookup the correct PML address based on the PAGING MODE
if (core->shdw_pg_mode == SHADOW_PAGING) {
//Fix up the PML entry
if (pml[pml_index].present == 0) {
- pdpe = (pdpe64_t *)create_generic_pt_page();
+ pdpe = (pdpe64_t *)create_generic_pt_page(core);
// Set default PML Flags...
pml[pml_index].present = 1;
// Fix up the PDPE entry
if (pdpe[pdpe_index].present == 0) {
- pde = (pde64_t *)create_generic_pt_page();
+ pde = (pde64_t *)create_generic_pt_page(core);
// Set default PDPE Flags...
pdpe[pdpe_index].present = 1;
}
// Fix up the 2MiB PDE and exit here
- if (use_large_page == 1) {
+ if (page_size == PAGE_SIZE_2MB) {
pde2mb = (pde64_2MB_t *)pde; // all but these two lines are the same for PTE
pde2mb[pde_index].large_page = 1;
+ *actual_start = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(fault_addr));
+ *actual_end = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(fault_addr)+1)-1;
+
if (pde2mb[pde_index].present == 0) {
pde2mb[pde_index].user_page = 1;
}
if (v3_gpa_to_hpa(core, fault_addr, &host_addr) == -1) {
- PrintError("Error Could not translate fault addr (%p)\n", (void *)fault_addr);
+ PrintError(core->vm_info, core, "Error Could not translate fault addr (%p)\n", (void *)fault_addr);
return -1;
}
return region->unhandled(core, fault_addr, fault_addr, region, error_code);
}
- }
+
+ // All done
+ return 0;
+ }
// Continue with the 4KiB page heirarchy
+
+ *actual_start = BASE_TO_PAGE_ADDR_4KB(PAGE_BASE_ADDR_4KB(fault_addr));
+ *actual_end = BASE_TO_PAGE_ADDR_4KB(PAGE_BASE_ADDR_4KB(fault_addr)+1)-1;
// Fix up the PDE entry
if (pde[pde_index].present == 0) {
- pte = (pte64_t *)create_generic_pt_page();
+ pte = (pte64_t *)create_generic_pt_page(core);
pde[pde_index].present = 1;
pde[pde_index].writable = 1;
pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR_4KB(pde[pde_index].pt_base_addr));
}
-
// Fix up the PTE entry
if (pte[pte_index].present == 0) {
pte[pte_index].user_page = 1;
}
if (v3_gpa_to_hpa(core, fault_addr, &host_addr) == -1) {
- PrintError("Error Could not translate fault addr (%p)\n", (void *)fault_addr);
+ PrintError(core->vm_info, core, "Error Could not translate fault addr (%p)\n", (void *)fault_addr);
return -1;
}
return 0;
}
-static inline int invalidate_addr_64(struct guest_info * core, addr_t inv_addr) {
+static inline int invalidate_addr_64_internal(struct guest_info * core, addr_t inv_addr,
+ addr_t *actual_start, uint64_t *actual_size) { /*
+ * Invalidates the 64-bit page table mapping that covers inv_addr and
+ * reports which span of addresses that decision applies to.
+ *
+ * The walk goes PML4E -> PDPE -> PDE -> PTE and stops at the first level
+ * that settles the outcome. At that level *actual_start is set to
+ * inv_addr rounded down to the level's page size and *actual_size to the
+ * matching PAGE_SIZE_* constant, and 0 is returned:
+ *   - PML4 entry not present:  512GiB span, nothing is cleared
+ *   - PDPE not present:        1GiB span, nothing is cleared
+ *   - PDPE maps a large page:  1GiB span, entry cleared
+ *   - PDE not present:         2MiB span, nothing is cleared
+ *   - PDE maps a large page:   2MiB span, entry cleared
+ *   - otherwise:               4KiB span, PTE cleared
+ * "Cleared" means the present, writable and user_page bits of the entry
+ * are all set to 0.
+ *
+ * NOTE(review): confirm whether any failure path returns before
+ * *actual_start / *actual_size have been written; callers that derive an
+ * end address from them need to know.
+ */
pml4e64_t * pml = NULL;
pdpe64_t * pdpe = NULL;
pde64_t * pde = NULL;
}
if (pml[pml_index].present == 0) {
- return 0;
+ *actual_start = BASE_TO_PAGE_ADDR_512GB(PAGE_BASE_ADDR_512GB(inv_addr));
+ *actual_size = PAGE_SIZE_512GB;
+ return 0;
}
pdpe = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pml[pml_index].pdp_base_addr));
if (pdpe[pdpe_index].present == 0) {
+ *actual_start = BASE_TO_PAGE_ADDR_1GB(PAGE_BASE_ADDR_1GB(inv_addr));
+ *actual_size = PAGE_SIZE_1GB;
return 0;
} else if (pdpe[pdpe_index].large_page == 1) { // 1GiB
pdpe[pdpe_index].present = 0;
+ pdpe[pdpe_index].writable = 0;
+ pdpe[pdpe_index].user_page = 0;
+ *actual_start = BASE_TO_PAGE_ADDR_1GB(PAGE_BASE_ADDR_1GB(inv_addr));
+ *actual_size = PAGE_SIZE_1GB;
return 0;
}
pde = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pdpe[pdpe_index].pd_base_addr));
if (pde[pde_index].present == 0) {
+ *actual_start = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(inv_addr));
+ *actual_size = PAGE_SIZE_2MB;
return 0;
} else if (pde[pde_index].large_page == 1) { // 2MiB
pde[pde_index].present = 0;
+ pde[pde_index].writable = 0;
+ pde[pde_index].user_page = 0;
+ *actual_start = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(inv_addr));
+ *actual_size = PAGE_SIZE_2MB;
return 0;
}
pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pde[pde_index].pt_base_addr));
pte[pte_index].present = 0; // 4KiB
+ pte[pte_index].writable = 0;
+ pte[pte_index].user_page = 0;
+
+ *actual_start = BASE_TO_PAGE_ADDR_4KB(PAGE_BASE_ADDR_4KB(inv_addr));
+ *actual_size = PAGE_SIZE_4KB;
return 0;
}
+/*
+ * Invalidate the 64-bit mapping covering inv_addr.
+ *
+ * Thin wrapper around invalidate_addr_64_internal() that converts the
+ * (start, size) pair it reports into an inclusive [start, end] range.
+ *
+ * core          - core whose page tables are modified
+ * inv_addr      - address whose mapping is to be invalidated
+ * actual_start  - out: first address of the span that was affected
+ * actual_end    - out: last address (inclusive) of the span that was affected
+ *
+ * Returns the result of invalidate_addr_64_internal(). When that is
+ * non-zero, *actual_end is left untouched: the internal routine is not
+ * known to have filled in its outputs on failure, so no end address is
+ * derived from them.
+ */
+static inline int invalidate_addr_64(struct guest_info * core, addr_t inv_addr,
+ addr_t *actual_start, addr_t *actual_end)
+{
+    uint64_t len = 0;
+    int rc;
+
+    rc = invalidate_addr_64_internal(core, inv_addr, actual_start, &len);
+
+    if (rc) {
+        // Do not compute an end address from outputs that may not have been written
+        return rc;
+    }
+
+    // len is a byte count, so the inclusive end is one less than start + len
+    *actual_end = *actual_start + len - 1;
+
+    return 0;
+}
+
+/*
+ * Invalidate every 64-bit mapping that intersects the inclusive address
+ * range [inv_addr_start, inv_addr_end].
+ *
+ * The range is walked one mapping at a time: each call to
+ * invalidate_addr_64_internal() reports the span it dealt with, and the walk
+ * resumes at the first address past that span.
+ *
+ * core            - core whose page tables are modified
+ * inv_addr_start  - first address to invalidate
+ * inv_addr_end    - last address to invalidate (inclusive)
+ * actual_start    - out: first address of the first span that was affected
+ * actual_end      - out: last address (inclusive) of the last span that was
+ *                   affected
+ *
+ * Returns 0 on success, -1 if inv_addr_start > inv_addr_end (outputs are
+ * not written), or the non-zero result of invalidate_addr_64_internal() if
+ * a step fails. After a failing step the outputs describe only the spans
+ * that were successfully processed before it (and are unwritten if the very
+ * first step failed).
+ */
+static inline int invalidate_addr_64_range(struct guest_info * core, addr_t inv_addr_start, addr_t inv_addr_end,
+ addr_t *actual_start, addr_t *actual_end)
+{
+    addr_t next = inv_addr_start;
+    addr_t start = 0;
+    uint64_t len = 0;
+    int first = 1;
+    int rc;
+
+    if (inv_addr_start > inv_addr_end) {
+        // An inverted range would otherwise fall through with both outputs unwritten
+        PrintError(core->vm_info, core, "%s: invalid range [%p,%p]\n", __FUNCTION__,
+                   (void *)inv_addr_start, (void *)inv_addr_end);
+        return -1;
+    }
+
+    do {
+        rc = invalidate_addr_64_internal(core, next, &start, &len);
+
+        if (rc) {
+            // start/len are not trusted after a failure, so nothing is derived from them
+            return rc;
+        }
+
+        if (first) {
+            // first iteration, capture where we start invalidating
+            *actual_start = start;
+            first = 0;
+        }
+
+        // Keep the end inclusive at every step so it is valid whenever we return
+        *actual_end = start + len - 1;
+
+        next = start + len;
+
+        // "next > start" stops the walk if the span reached the top of the
+        // address space and start + len wrapped around
+    } while ((next > start) && (next <= inv_addr_end));
+
+    return 0;
+}
+
#endif