/*
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National
 * Science Foundation and the Department of Energy.
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at
 * http://www.v3vee.org
 *
 * Copyright (c) 2008, Steven Jaconette <stevenjaconette2007@u.northwestern.edu>
 * Copyright (c) 2008, Jack Lange <jarusl@cs.northwestern.edu>
 * Copyright (c) 2008, The V3VEE Project <http://www.v3vee.org>
 * All rights reserved.
 *
 * Author: Steven Jaconette <stevenjaconette2007@u.northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */

#ifndef __VMM_DIRECT_PAGING_64_H__
#define __VMM_DIRECT_PAGING_64_H__

#include <palacios/vmm_mem.h>
#include <palacios/vmm_paging.h>
#include <palacios/vmm.h>
#include <palacios/vm_guest_mem.h>
#include <palacios/vm_guest.h>

// Reference: AMD Software Developer Manual Vol.2 Ch.5 "Page Translation and Protection"
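
/* Decides whether the faulting guest physical address can be backed by a 2MiB
 * large page or must fall back to a 4KiB page, based on the guest's configured
 * paging_size and on whether any non-base memory region overlaps the 2MiB page
 * containing the fault address. Returns PAGE_SIZE_2MB when a large page is safe
 * to use, otherwise PAGE_SIZE_4KB.
 */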
static inline int get_page_size(struct guest_info * info, addr_t fault_addr) {

    // Need to fix this....
    struct v3_mem_region * base_reg = &(info->vm_info->mem_map.base_region);

    /* If the guest has been configured for 2MiB pages, then we must check for hooked regions of
     * memory which may overlap with the 2MiB page containing the faulting address (due to
     * potentially differing access policies in place for e.g. i/o devices and APIC). A 2MiB page
     * can be used if a) no region overlaps the page [or b) a region does overlap but fully contains
     * the page]. The [bracketed] text pertains to the #if 0'd code below, state D. TODO modify this
     * note if someone decides to enable this optimization. It can be tested with the SeaStar
     * mapping.
     *
     * Examples: (CAPS regions are returned by v3_get_next_mem_region; state A returns the base reg)
     *
     *    |region|   |region|                             2MiB mapped (state A)
     *                 |reg|        |REG|                 2MiB mapped (state B)
     *  |region|  |reg|  |REG|  |region|  |reg|           4KiB mapped (state C)
     *      |reg|  |reg|   |--REGION---|                 [2MiB mapped (state D)]
     * |--------------------------------------------|     RAM
     * |----|----|----|----|----|page|----|----|----|     2MB pages
     *                          >>>>>>>>>>>>>>>>>>>>      search space
     */
    addr_t pg_start = 0UL, pg_end = 0UL; // 2MiB page containing the faulting address
    struct v3_mem_region * pg_next_reg = NULL; // next immediate mem reg after page start addr
    bool use_large_page = false;

    if ((fault_addr < base_reg->guest_start) || (fault_addr >= base_reg->guest_end)) {
        PrintError("%s: invalid region, addr=%p\n", __FUNCTION__, (void *)fault_addr);
        return -1;
    }

    // set use_large_page here
    if (info->vm_info->paging_size == PAGING_2MB) {

        // guest page maps to a host page + offset (so when we shift, it aligns with a host page)
        pg_start = PAGE_ADDR_2MB(fault_addr);
        pg_end = (pg_start + PAGE_SIZE_2MB);

        PrintDebug("%s: page [%p,%p) contains address\n", __FUNCTION__, (void *)pg_start, (void *)pg_end);

        pg_next_reg = v3_get_next_mem_region(info->vm_info, info->cpu_id, pg_start);

        if (pg_next_reg == NULL) {
            PrintError("%s: Error: address not in base region, %p\n", __FUNCTION__, (void *)fault_addr);
            return -1;
        }

        if (pg_next_reg->base == 1) {
            use_large_page = 1; // State A
        } else {
#if 0       // State B/C and D optimization
            use_large_page = (pg_next_reg->guest_end >= pg_end) &&
                ((pg_next_reg->guest_start >= pg_end) || (pg_next_reg->guest_start <= pg_start));
            PrintDebug("%s: region [%p,%p) %s partial overlap with page\n", __FUNCTION__,
                       (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end,
                       (use_large_page ? "does not have" : "has"));
#else       // State B/C
            use_large_page = (pg_next_reg->guest_start >= pg_end);
            PrintDebug("%s: region [%p,%p) %s overlap with page\n", __FUNCTION__,
                       (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end,
                       (use_large_page ? "does not have" : "has"));
#endif
        }
    }

    PrintDebug("%s: Address gets a 2MiB page? %s\n", __FUNCTION__, (use_large_page ? "yes" : "no"));

    return (use_large_page ? PAGE_SIZE_2MB : PAGE_SIZE_4KB);
}
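
/* Lazily builds the 64-bit passthrough (direct-mapped) page tables. On each fault
 * the handler walks PML4 -> PDP -> PD (-> PT), allocating any missing intermediate
 * table, and installs either a 2MiB or a 4KiB leaf entry that maps the faulting
 * guest physical address to its host physical address (via v3_gpa_to_hpa), with
 * access permissions taken from the owning v3_mem_region.
 */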
static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr_t fault_addr, pf_error_t error_code) {
    pml4e64_t * pml = NULL;
    pdpe64_t * pdpe = NULL;
    pde64_t * pde = NULL;
    pde64_2MB_t * pde2mb = NULL;
    pte64_t * pte = NULL;
    addr_t host_addr = 0;

    // 9-bit table index for each level of the 4-level long-mode walk
    int pml_index = PML4E64_INDEX(fault_addr);
    int pdpe_index = PDPE64_INDEX(fault_addr);
    int pde_index = PDE64_INDEX(fault_addr);
    int pte_index = PTE64_INDEX(fault_addr);

    struct v3_mem_region * region = v3_get_mem_region(core->vm_info, core->cpu_id, fault_addr);
    int page_size = PAGE_SIZE_4KB;

    if (region == NULL) {
        PrintError("%s: invalid region, addr=%p\n", __FUNCTION__, (void *)fault_addr);
        return -1;
    }

    /* Use a large page only if:
     * 1. the guest is configured to use large pages and
     * 2. the memory regions can be referenced by a large page
     */
    if (core->use_large_pages == 1) {
        page_size = get_page_size(core, fault_addr);
    }

    // Lookup the correct PML address based on the PAGING MODE
    if (core->shdw_pg_mode == SHADOW_PAGING) {
        pml = CR3_TO_PML4E64_VA(core->ctrl_regs.cr3);
    } else {
        pml = CR3_TO_PML4E64_VA(core->direct_map_pt);
    }

    // Fix up the PML entry
    if (pml[pml_index].present == 0) {
        pdpe = (pdpe64_t *)create_generic_pt_page();

        // Set default PML Flags...
        pml[pml_index].present = 1;
        pml[pml_index].writable = 1;
        pml[pml_index].user_page = 1;

        pml[pml_index].pdp_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pdpe));
    } else {
        pdpe = V3_VAddr((void*)BASE_TO_PAGE_ADDR_4KB(pml[pml_index].pdp_base_addr));
    }

    // Fix up the PDPE entry
    if (pdpe[pdpe_index].present == 0) {
        pde = (pde64_t *)create_generic_pt_page();

        // Set default PDPE Flags...
        pdpe[pdpe_index].present = 1;
        pdpe[pdpe_index].writable = 1;
        pdpe[pdpe_index].user_page = 1;

        pdpe[pdpe_index].pd_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pde));
    } else {
        pde = V3_VAddr((void*)BASE_TO_PAGE_ADDR_4KB(pdpe[pdpe_index].pd_base_addr));
    }
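
    /* With a 2MiB mapping the PDE itself is the leaf entry: the large_page bit is
     * set and page_base_addr points at a 2MiB-aligned host frame, so no PTE level
     * is allocated or walked for this address.
     */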
    // Fix up the 2MiB PDE and exit here
    if (page_size == PAGE_SIZE_2MB) {
        pde2mb = (pde64_2MB_t *)pde; // all but these two lines are the same for PTE
        pde2mb[pde_index].large_page = 1;

        if (pde2mb[pde_index].present == 0) {
            pde2mb[pde_index].user_page = 1;

            if ((region->flags.alloced == 1) &&
                (region->flags.read == 1)) {
                pde2mb[pde_index].present = 1;

                if (region->flags.write == 1) {
                    pde2mb[pde_index].writable = 1;
                } else {
                    pde2mb[pde_index].writable = 0;
                }

                if (v3_gpa_to_hpa(core, fault_addr, &host_addr) == -1) {
                    PrintError("Error: Could not translate fault addr (%p)\n", (void *)fault_addr);
                    return -1;
                }

                pde2mb[pde_index].page_base_addr = PAGE_BASE_ADDR_2MB(host_addr);
            } else {
                return region->unhandled(core, fault_addr, fault_addr, region, error_code);
            }
        } else {
            // We fix all permissions on the first pass,
            // so we only get here if it's an unhandled exception
            return region->unhandled(core, fault_addr, fault_addr, region, error_code);
        }

        return 0;
    }

    // Continue with the 4KiB page hierarchy

    // Fix up the PDE entry
    if (pde[pde_index].present == 0) {
        pte = (pte64_t *)create_generic_pt_page();

        // Set default PDE Flags...
        pde[pde_index].present = 1;
        pde[pde_index].writable = 1;
        pde[pde_index].user_page = 1;

        pde[pde_index].pt_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pte));
    } else {
        pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR_4KB(pde[pde_index].pt_base_addr));
    }

    // Fix up the PTE entry
    if (pte[pte_index].present == 0) {
        pte[pte_index].user_page = 1;

        if ((region->flags.alloced == 1) &&
            (region->flags.read == 1)) {
            pte[pte_index].present = 1;

            if (region->flags.write == 1) {
                pte[pte_index].writable = 1;
            } else {
                pte[pte_index].writable = 0;
            }

            if (v3_gpa_to_hpa(core, fault_addr, &host_addr) == -1) {
                PrintError("Error: Could not translate fault addr (%p)\n", (void *)fault_addr);
                return -1;
            }

            pte[pte_index].page_base_addr = PAGE_BASE_ADDR_4KB(host_addr);
        } else {
            return region->unhandled(core, fault_addr, fault_addr, region, error_code);
        }
    } else {
        // We fix all permissions on the first pass,
        // so we only get here if it's an unhandled exception
        return region->unhandled(core, fault_addr, fault_addr, region, error_code);
    }

    return 0;
}
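
/* Clears the leaf page table entry that currently maps inv_addr, whatever its
 * size (1GiB PDPE, 2MiB PDE, or 4KiB PTE), so a later access to that address
 * faults again and the mapping can be rebuilt by handle_passthrough_pagefault_64().
 */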
static inline int invalidate_addr_64(struct guest_info * core, addr_t inv_addr) {
    pml4e64_t * pml = NULL;
    pdpe64_t * pdpe = NULL;
    pde64_t * pde = NULL;
    pte64_t * pte = NULL;

    // clear the page table entry
    int pml_index = PML4E64_INDEX(inv_addr);
    int pdpe_index = PDPE64_INDEX(inv_addr);
    int pde_index = PDE64_INDEX(inv_addr);
    int pte_index = PTE64_INDEX(inv_addr);

    // Lookup the correct PML address based on the PAGING MODE
    if (core->shdw_pg_mode == SHADOW_PAGING) {
        pml = CR3_TO_PML4E64_VA(core->ctrl_regs.cr3);
    } else {
        pml = CR3_TO_PML4E64_VA(core->direct_map_pt);
    }

    if (pml[pml_index].present == 0) {
        return 0;
    }

    pdpe = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pml[pml_index].pdp_base_addr));

    if (pdpe[pdpe_index].present == 0) {
        return 0;
    } else if (pdpe[pdpe_index].large_page == 1) { // 1GiB
        pdpe[pdpe_index].present = 0;
        return 0;
    }

    pde = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pdpe[pdpe_index].pd_base_addr));

    if (pde[pde_index].present == 0) {
        return 0;
    } else if (pde[pde_index].large_page == 1) { // 2MiB
        pde[pde_index].present = 0;
        return 0;
    }

    pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pde[pde_index].pt_base_addr));

    pte[pte_index].present = 0; // 4KiB

    return 0;
}

#endif