X-Git-Url: http://v3vee.org/palacios/gitweb/gitweb.cgi?a=blobdiff_plain;f=palacios%2Fsrc%2Fpalacios%2Fvmm_direct_paging_64.h;h=6afb9fb88f7ecad95383935f314f8fb6e11c2867;hb=7a10bbf26095ed08095f38de6c1db4b3a131d6f0;hp=d45ae175ec9451523ae9c17d0a5eccb018826c77;hpb=1242db041374fb8d6d09e692b6ab95f7b86319ac;p=palacios.git diff --git a/palacios/src/palacios/vmm_direct_paging_64.h b/palacios/src/palacios/vmm_direct_paging_64.h index d45ae17..6afb9fb 100644 --- a/palacios/src/palacios/vmm_direct_paging_64.h +++ b/palacios/src/palacios/vmm_direct_paging_64.h @@ -27,78 +27,12 @@ #include #include -// Reference: AMD Software Developer Manual Vol.2 Ch.5 "Page Translation and Protection" - -static uint32_t get_page_size(struct guest_info * core, addr_t fault_addr) { - addr_t pg_start = 0UL, pg_end = 0UL; // 2MiB page containing the faulting address - struct v3_mem_region * pg_next_reg = NULL; // next immediate mem reg after page start addr - uint32_t page_size = PAGE_SIZE_4KB; - - /* If the guest has been configured for 2MiB pages, then we must check for hooked regions of - * memory which may overlap with the 2MiB page containing the faulting address (due to - * potentially differing access policies in place for e.g. i/o devices and APIC). A 2MiB page - * can be used if a) no region overlaps the page [or b) a region does overlap but fully contains - * the page]. The [bracketed] text pertains to the #if 0'd code below, state D. TODO modify this - * note if someone decides to enable this optimization. It can be tested with the SeaStar - * mapping. - * - * Examples: (CAPS regions are returned by v3_get_next_mem_region; state A returns the base reg) - * - * |region| |region| 2MiB mapped (state A) - * |reg| |REG| 2MiB mapped (state B) - * |region| |reg| |REG| |region| |reg| 4KiB mapped (state C) - * |reg| |reg| |--REGION---| [2MiB mapped (state D)] - * |--------------------------------------------| RAM - * ^ fault addr - * |----|----|----|----|----|page|----|----|----| 2MB pages - * >>>>>>>>>>>>>>>>>>>> search space - */ - - - // guest page maps to a host page + offset (so when we shift, it aligns with a host page) - pg_start = PAGE_ADDR_2MB(fault_addr); - pg_end = (pg_start + PAGE_SIZE_2MB); - - PrintDebug("%s: page [%p,%p) contains address\n", __FUNCTION__, (void *)pg_start, (void *)pg_end); - - pg_next_reg = v3_get_next_mem_region(core->vm_info, core->cpu_id, pg_start); - - if (pg_next_reg == NULL) { - PrintError("%s: Error: address not in base region, %p\n", __FUNCTION__, (void *)fault_addr); - return PAGE_SIZE_4KB; - } - - if (pg_next_reg->flags.base == 1) { - page_size = PAGE_SIZE_2MB; // State A - } else { -#if 0 // State B/C and D optimization - if ((pg_next_reg->guest_end >= pg_end) && - ((pg_next_reg->guest_start >= pg_end) || (pg_next_reg->guest_start <= pg_start))) { - page_size = PAGE_SIZE_2MB; - } - - PrintDebug("%s: region [%p,%p) %s partially overlap with page\n", __FUNCTION__, - (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end, - (page_size == PAGE_SIZE_2MB) ? "does not" : "does"); - -#else // State B/C - if (pg_next_reg->guest_start >= pg_end) { - - page_size = PAGE_SIZE_2MB; - } - - PrintDebug("%s: region [%p,%p) %s overlap with page\n", __FUNCTION__, - (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end, - (page_size == PAGE_SIZE_2MB) ? "does not" : "does"); - -#endif - } - - return page_size; -} +/* this always builds 4 level page tables, but large pages are allowed */ +// Reference: AMD Software Developer Manual Vol.2 Ch.5 "Page Translation and Protection" -static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr_t fault_addr, pf_error_t error_code) { +static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr_t fault_addr, pf_error_t error_code, + addr_t *actual_start, addr_t *actual_end) { pml4e64_t * pml = NULL; pdpe64_t * pdpe = NULL; pde64_t * pde = NULL; @@ -111,11 +45,11 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr int pde_index = PDE64_INDEX(fault_addr); int pte_index = PTE64_INDEX(fault_addr); - struct v3_mem_region * region = v3_get_mem_region(core->vm_info, core->cpu_id, fault_addr); + struct v3_mem_region * region = v3_get_mem_region(core->vm_info, core->vcpu_id, fault_addr); int page_size = PAGE_SIZE_4KB; if (region == NULL) { - PrintError("%s: invalid region, addr=%p\n", __FUNCTION__, (void *)fault_addr); + PrintError(core->vm_info, core, "%s: invalid region, addr=%p\n", __FUNCTION__, (void *)fault_addr); return -1; } @@ -123,11 +57,11 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr * 1. the guest is configured to use large pages and * 2. the memory regions can be referenced by a large page */ - if ((core->use_large_pages == 1) ) { - page_size = get_page_size(core, fault_addr); + if ((core->use_large_pages == 1) || (core->use_giant_pages == 1)) { + page_size = v3_get_max_page_size(core, fault_addr, LONG); } - PrintDebug("Using page size of %dKB\n", page_size / 1024); + PrintDebug(core->vm_info, core, "Using page size of %dKB\n", page_size / 1024); // Lookup the correct PML address based on the PAGING MODE @@ -139,7 +73,7 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr //Fix up the PML entry if (pml[pml_index].present == 0) { - pdpe = (pdpe64_t *)create_generic_pt_page(); + pdpe = (pdpe64_t *)create_generic_pt_page(core); // Set default PML Flags... pml[pml_index].present = 1; @@ -153,7 +87,7 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr // Fix up the PDPE entry if (pdpe[pdpe_index].present == 0) { - pde = (pde64_t *)create_generic_pt_page(); + pde = (pde64_t *)create_generic_pt_page(core); // Set default PDPE Flags... pdpe[pdpe_index].present = 1; @@ -170,6 +104,9 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr pde2mb = (pde64_2MB_t *)pde; // all but these two lines are the same for PTE pde2mb[pde_index].large_page = 1; + *actual_start = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(fault_addr)); + *actual_end = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(fault_addr)+1)-1; + if (pde2mb[pde_index].present == 0) { pde2mb[pde_index].user_page = 1; @@ -185,7 +122,7 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr } if (v3_gpa_to_hpa(core, fault_addr, &host_addr) == -1) { - PrintError("Error Could not translate fault addr (%p)\n", (void *)fault_addr); + PrintError(core->vm_info, core, "Error Could not translate fault addr (%p)\n", (void *)fault_addr); return -1; } @@ -206,9 +143,12 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr // Continue with the 4KiB page heirarchy + *actual_start = BASE_TO_PAGE_ADDR_4KB(PAGE_BASE_ADDR_4KB(fault_addr)); + *actual_end = BASE_TO_PAGE_ADDR_4KB(PAGE_BASE_ADDR_4KB(fault_addr)+1)-1; + // Fix up the PDE entry if (pde[pde_index].present == 0) { - pte = (pte64_t *)create_generic_pt_page(); + pte = (pte64_t *)create_generic_pt_page(core); pde[pde_index].present = 1; pde[pde_index].writable = 1; @@ -235,7 +175,7 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr } if (v3_gpa_to_hpa(core, fault_addr, &host_addr) == -1) { - PrintError("Error Could not translate fault addr (%p)\n", (void *)fault_addr); + PrintError(core->vm_info, core, "Error Could not translate fault addr (%p)\n", (void *)fault_addr); return -1; } @@ -253,7 +193,8 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr return 0; } -static inline int invalidate_addr_64(struct guest_info * core, addr_t inv_addr) { +static inline int invalidate_addr_64_internal(struct guest_info * core, addr_t inv_addr, + addr_t *actual_start, uint64_t *actual_size) { pml4e64_t * pml = NULL; pdpe64_t * pdpe = NULL; pde64_t * pde = NULL; @@ -278,34 +219,91 @@ static inline int invalidate_addr_64(struct guest_info * core, addr_t inv_addr) } if (pml[pml_index].present == 0) { - return 0; + *actual_start = BASE_TO_PAGE_ADDR_512GB(PAGE_BASE_ADDR_512GB(inv_addr)); + *actual_size = PAGE_SIZE_512GB; + return 0; } pdpe = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pml[pml_index].pdp_base_addr)); if (pdpe[pdpe_index].present == 0) { + *actual_start = BASE_TO_PAGE_ADDR_1GB(PAGE_BASE_ADDR_1GB(inv_addr)); + *actual_size = PAGE_SIZE_1GB; return 0; } else if (pdpe[pdpe_index].large_page == 1) { // 1GiB pdpe[pdpe_index].present = 0; + pdpe[pdpe_index].writable = 0; + pdpe[pdpe_index].user_page = 0; + *actual_start = BASE_TO_PAGE_ADDR_1GB(PAGE_BASE_ADDR_1GB(inv_addr)); + *actual_size = PAGE_SIZE_1GB; return 0; } pde = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pdpe[pdpe_index].pd_base_addr)); if (pde[pde_index].present == 0) { + *actual_start = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(inv_addr)); + *actual_size = PAGE_SIZE_2MB; return 0; } else if (pde[pde_index].large_page == 1) { // 2MiB pde[pde_index].present = 0; + pde[pde_index].writable = 0; + pde[pde_index].user_page = 0; + *actual_start = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(inv_addr)); + *actual_size = PAGE_SIZE_2MB; return 0; } pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pde[pde_index].pt_base_addr)); pte[pte_index].present = 0; // 4KiB + pte[pte_index].writable = 0; + pte[pte_index].user_page = 0; + + *actual_start = BASE_TO_PAGE_ADDR_4KB(PAGE_BASE_ADDR_4KB(inv_addr)); + *actual_size = PAGE_SIZE_4KB; return 0; } +static inline int invalidate_addr_64(struct guest_info * core, addr_t inv_addr, + addr_t *actual_start, addr_t *actual_end) +{ + uint64_t len; + int rc; + + rc = invalidate_addr_64_internal(core,inv_addr,actual_start,&len); + + *actual_end = *actual_start + len - 1; + + return rc; +} + +static inline int invalidate_addr_64_range(struct guest_info * core, addr_t inv_addr_start, addr_t inv_addr_end, + addr_t *actual_start, addr_t *actual_end) +{ + addr_t next; + addr_t start; + uint64_t len; + int rc; + + for (next=inv_addr_start; next<=inv_addr_end; ) { + rc = invalidate_addr_64_internal(core,next,&start, &len); + if (next==inv_addr_start) { + // first iteration, capture where we start invalidating + *actual_start = start; + } + if (rc) { + return rc; + } + next = start + len; + *actual_end = next; + } + // last iteration, actual_end is off by one + (*actual_end)--; + return 0; +} + #endif