2 * This file is part of the Palacios Virtual Machine Monitor developed
3 * by the V3VEE Project with funding from the United States National
4 * Science Foundation and the Department of Energy.
6 * The V3VEE Project is a joint project between Northwestern University
7 * and the University of New Mexico. You can find out more at http://www.v3vee.org
10 * Copyright (c) 2008, Steven Jaconette <stevenjaconette2007@u.northwestern.edu>
11 * Copyright (c) 2008, Jack Lange <jarusl@cs.northwestern.edu>
12 * Copyright (c) 2008, The V3VEE Project <http://www.v3vee.org>
13 * All rights reserved.
15 * Author: Steven Jaconette <stevenjaconette2007@u.northwestern.edu>
17 * This is free software. You are permitted to use,
18 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
21 #ifndef __VMM_DIRECT_PAGING_64_H__
22 #define __VMM_DIRECT_PAGING_64_H__
24 #include <palacios/vmm_mem.h>
25 #include <palacios/vmm_paging.h>
26 #include <palacios/vmm.h>
27 #include <palacios/vm_guest_mem.h>
28 #include <palacios/vm_guest.h>
30 /* this always builds 4 level page tables, but large pages are allowed */
32 // Reference: AMD Software Developer Manual Vol.2 Ch.5 "Page Translation and Protection"
/*
 * Handle a passthrough page fault by filling in the 4-level (PML4 -> PDP ->
 * PD -> PT) page table path that maps fault_addr.
 *
 * core         - guest core whose page tables are fixed up
 * fault_addr   - faulting address; used to look up the memory region, to
 *                derive the table indices, and as the input to v3_gpa_to_hpa
 * error_code   - page fault error code; only forwarded to region->unhandled
 * actual_start - out: first byte of the page containing fault_addr
 * actual_end   - out: last byte (inclusive) of that page
 *
 * Intermediate tables that are not present are allocated with
 * create_generic_pt_page() and linked in as present/writable/user.
 * The leaf is a 2MiB PDE when page_size is PAGE_SIZE_2MB, otherwise a 4KiB PTE.
 * A non-present leaf is mapped only if the region is both alloced and
 * readable; it is made writable only if the region is writable.
 *
 * Returns: the result of region->unhandled() on the paths where it is called.
 * NOTE(review): the remaining return statements, several closing braces and
 * else lines, and the declarations of pde, pte and host_addr are not visible
 * in this excerpt -- confirm against the full file.
 */
34 static inline int handle_passthrough_pagefault_64(struct guest_info * core, addr_t fault_addr, pf_error_t error_code,
35 addr_t *actual_start, addr_t *actual_end) {
36 pml4e64_t * pml = NULL;
37 pdpe64_t * pdpe = NULL;
39 pde64_2MB_t * pde2mb = NULL;
// Index of fault_addr's entry at each of the four table levels
43 int pml_index = PML4E64_INDEX(fault_addr);
44 int pdpe_index = PDPE64_INDEX(fault_addr);
45 int pde_index = PDE64_INDEX(fault_addr);
46 int pte_index = PTE64_INDEX(fault_addr);
48 struct v3_mem_region * region = v3_get_mem_region(core->vm_info, core->vcpu_id, fault_addr);
// Default to 4KiB mappings; may be raised below when large/giant pages are enabled
49 int page_size = PAGE_SIZE_4KB;
// NOTE(review): the condition guarding this error report is not visible here;
// presumably it fires when region is NULL -- confirm
52 PrintError(core->vm_info, core, "%s: invalid region, addr=%p\n", __FUNCTION__, (void *)fault_addr);
57 * 1. the guest is configured to use large pages and
58 * 2. the memory regions can be referenced by a large page
60 if ((core->use_large_pages == 1) || (core->use_giant_pages == 1)) {
61 page_size = v3_get_max_page_size(core, fault_addr, LONG);
64 PrintDebug(core->vm_info, core, "Using page size of %dKB\n", page_size / 1024);
67 // Lookup the correct PML address based on the PAGING MODE
// Shadow paging takes the root from ctrl_regs.cr3; the other visible
// assignment takes it from direct_map_pt
68 if (core->shdw_pg_mode == SHADOW_PAGING) {
69 pml = CR3_TO_PML4E64_VA(core->ctrl_regs.cr3);
71 pml = CR3_TO_PML4E64_VA(core->direct_map_pt);
74 //Fix up the PML entry
75 if (pml[pml_index].present == 0) {
// NOTE(review): no NULL check on the create_generic_pt_page() result is
// visible before it is used -- confirm
76 pdpe = (pdpe64_t *)create_generic_pt_page(core);
78 // Set default PML Flags...
79 pml[pml_index].present = 1;
80 pml[pml_index].writable = 1;
81 pml[pml_index].user_page = 1;
// Link the new PDP table into the PML4 entry
83 pml[pml_index].pdp_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pdpe));
// Entry already present: follow the stored base address to the existing PDP table
85 pdpe = V3_VAddr((void*)BASE_TO_PAGE_ADDR_4KB(pml[pml_index].pdp_base_addr));
88 // Fix up the PDPE entry
89 if (pdpe[pdpe_index].present == 0) {
90 pde = (pde64_t *)create_generic_pt_page(core);
92 // Set default PDPE Flags...
93 pdpe[pdpe_index].present = 1;
94 pdpe[pdpe_index].writable = 1;
95 pdpe[pdpe_index].user_page = 1;
// Link the new page directory into the PDP entry
97 pdpe[pdpe_index].pd_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pde));
// Entry already present: follow the stored base address to the existing page directory
99 pde = V3_VAddr((void*)BASE_TO_PAGE_ADDR_4KB(pdpe[pdpe_index].pd_base_addr));
102 // Fix up the 2MiB PDE and exit here
// NOTE(review): only PAGE_SIZE_2MB is special-cased in the visible code; any
// other page_size value continues to the 4KiB path -- confirm this is
// intended when use_giant_pages is set
103 if (page_size == PAGE_SIZE_2MB) {
104 pde2mb = (pde64_2MB_t *)pde; // all but these two lines are the same for PTE
105 pde2mb[pde_index].large_page = 1;
// Report the inclusive 2MiB-aligned range containing fault_addr
107 *actual_start = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(fault_addr));
108 *actual_end = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(fault_addr)+1)-1;
110 if (pde2mb[pde_index].present == 0) {
111 pde2mb[pde_index].user_page = 1;
// Only map the page if the region is both alloced and readable
113 if ( (region->flags.alloced == 1) &&
114 (region->flags.read == 1)) {
116 pde2mb[pde_index].present = 1;
// Writable bit follows the region's write permission
118 if (region->flags.write == 1) {
119 pde2mb[pde_index].writable = 1;
121 pde2mb[pde_index].writable = 0;
// Translate fault_addr to a host physical address for the mapping
124 if (v3_gpa_to_hpa(core, fault_addr, &host_addr) == -1) {
125 PrintError(core->vm_info, core, "Error Could not translate fault addr (%p)\n", (void *)fault_addr);
129 pde2mb[pde_index].page_base_addr = PAGE_BASE_ADDR_2MB(host_addr);
// Presumably the branch for a region that is not alloced+readable: defer to
// the region's handler -- confirm
131 return region->unhandled(core, fault_addr, fault_addr, region, error_code);
134 // We fix all permissions on the first pass,
135 // so we only get here if it's an unhandled exception
137 return region->unhandled(core, fault_addr, fault_addr, region, error_code);
144 // Continue with the 4KiB page hierarchy
// Report the inclusive 4KiB-aligned range containing fault_addr
146 *actual_start = BASE_TO_PAGE_ADDR_4KB(PAGE_BASE_ADDR_4KB(fault_addr));
147 *actual_end = BASE_TO_PAGE_ADDR_4KB(PAGE_BASE_ADDR_4KB(fault_addr)+1)-1;
149 // Fix up the PDE entry
150 if (pde[pde_index].present == 0) {
151 pte = (pte64_t *)create_generic_pt_page(core);
153 pde[pde_index].present = 1;
154 pde[pde_index].writable = 1;
155 pde[pde_index].user_page = 1;
// Link the new page table into the PDE
157 pde[pde_index].pt_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pte));
// Entry already present: follow the stored base address to the existing page table
159 pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR_4KB(pde[pde_index].pt_base_addr));
162 // Fix up the PTE entry
163 if (pte[pte_index].present == 0) {
164 pte[pte_index].user_page = 1;
// Only map the page if the region is both alloced and readable
166 if ((region->flags.alloced == 1) &&
167 (region->flags.read == 1)) {
169 pte[pte_index].present = 1;
// Writable bit follows the region's write permission
171 if (region->flags.write == 1) {
172 pte[pte_index].writable = 1;
174 pte[pte_index].writable = 0;
// Translate fault_addr to a host physical address for the mapping
177 if (v3_gpa_to_hpa(core, fault_addr, &host_addr) == -1) {
178 PrintError(core->vm_info, core, "Error Could not translate fault addr (%p)\n", (void *)fault_addr);
182 pte[pte_index].page_base_addr = PAGE_BASE_ADDR_4KB(host_addr);
// Presumably the branch for a region that is not alloced+readable: defer to
// the region's handler -- confirm
184 return region->unhandled(core, fault_addr, fault_addr, region, error_code);
187 // We fix all permissions on the first pass,
188 // so we only get here if it's an unhandled exception
190 return region->unhandled(core, fault_addr, fault_addr, region, error_code);
/*
 * Invalidate the page table mapping that covers inv_addr and report the
 * address range that mapping spans.
 *
 * core         - guest core whose page tables are walked
 * inv_addr     - address whose mapping is to be invalidated
 * actual_start - out: aligned start of the range covered by the entry at
 *                which the walk stopped
 * actual_size  - out: size of that range (512GB, 1GB, 2MB or 4KB)
 *
 * The walk stops at the first level whose entry is not present (nothing is
 * cleared; the span of that entry is reported), or at a 1GiB/2MiB large page
 * or a 4KiB PTE, whose present/writable/user_page bits are cleared.
 *
 * NOTE(review): the return statements and closing braces of this function
 * are not visible in this excerpt, so the return values are not documented.
 */
196 static inline int invalidate_addr_64_internal(struct guest_info * core, addr_t inv_addr,
197 addr_t *actual_start, uint64_t *actual_size) {
198 pml4e64_t * pml = NULL;
199 pdpe64_t * pdpe = NULL;
200 pde64_t * pde = NULL;
201 pte64_t * pte = NULL;
207 // clear the page table entry
// Index of inv_addr's entry at each of the four table levels
208 int pml_index = PML4E64_INDEX(inv_addr);
209 int pdpe_index = PDPE64_INDEX(inv_addr);
210 int pde_index = PDE64_INDEX(inv_addr);
211 int pte_index = PTE64_INDEX(inv_addr);
214 // Lookup the correct PML address based on the PAGING MODE
215 if (core->shdw_pg_mode == SHADOW_PAGING) {
216 pml = CR3_TO_PML4E64_VA(core->ctrl_regs.cr3);
218 pml = CR3_TO_PML4E64_VA(core->direct_map_pt);
// PML4 entry not present: report the whole 512GiB span it would cover
221 if (pml[pml_index].present == 0) {
222 *actual_start = BASE_TO_PAGE_ADDR_512GB(PAGE_BASE_ADDR_512GB(inv_addr));
223 *actual_size = PAGE_SIZE_512GB;
// NOTE(review): this function uses BASE_TO_PAGE_ADDR where
// handle_passthrough_pagefault_64 uses BASE_TO_PAGE_ADDR_4KB -- presumably
// equivalent; confirm
227 pdpe = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pml[pml_index].pdp_base_addr));
// PDP entry not present: report the 1GiB span it would cover
229 if (pdpe[pdpe_index].present == 0) {
230 *actual_start = BASE_TO_PAGE_ADDR_1GB(PAGE_BASE_ADDR_1GB(inv_addr));
231 *actual_size = PAGE_SIZE_1GB;
233 } else if (pdpe[pdpe_index].large_page == 1) { // 1GiB
// Clear the 1GiB large-page mapping
234 pdpe[pdpe_index].present = 0;
235 pdpe[pdpe_index].writable = 0;
236 pdpe[pdpe_index].user_page = 0;
237 *actual_start = BASE_TO_PAGE_ADDR_1GB(PAGE_BASE_ADDR_1GB(inv_addr));
238 *actual_size = PAGE_SIZE_1GB;
242 pde = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pdpe[pdpe_index].pd_base_addr));
// PDE not present: report the 2MiB span it would cover
244 if (pde[pde_index].present == 0) {
245 *actual_start = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(inv_addr));
246 *actual_size = PAGE_SIZE_2MB;
248 } else if (pde[pde_index].large_page == 1) { // 2MiB
// Clear the 2MiB large-page mapping
249 pde[pde_index].present = 0;
250 pde[pde_index].writable = 0;
251 pde[pde_index].user_page = 0;
252 *actual_start = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(inv_addr));
253 *actual_size = PAGE_SIZE_2MB;
257 pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pde[pde_index].pt_base_addr));
// Clear the leaf entry; no present check on the PTE is visible before this
259 pte[pte_index].present = 0; // 4KiB
260 pte[pte_index].writable = 0;
261 pte[pte_index].user_page = 0;
263 *actual_start = BASE_TO_PAGE_ADDR_4KB(PAGE_BASE_ADDR_4KB(inv_addr));
264 *actual_size = PAGE_SIZE_4KB;
/*
 * Invalidate the mapping covering inv_addr and report the affected range as
 * an inclusive [*actual_start, *actual_end] pair.
 *
 * Delegates to invalidate_addr_64_internal() and converts the returned
 * start/length into an inclusive end address.
 *
 * NOTE(review): the declarations of rc and len, the opening/closing braces
 * and the return statement are not visible in this excerpt; presumably rc is
 * returned to the caller -- confirm.
 */
269 static inline int invalidate_addr_64(struct guest_info * core, addr_t inv_addr,
270 addr_t *actual_start, addr_t *actual_end)
275 rc = invalidate_addr_64_internal(core,inv_addr,actual_start,&len);
// Inclusive end: last byte of the invalidated range
277 *actual_end = *actual_start + len - 1;
282 static inline int invalidate_addr_64_range(struct guest_info * core, addr_t inv_addr_start, addr_t inv_addr_end,
283 addr_t *actual_start, addr_t *actual_end)
290 for (next=inv_addr_start; next<=inv_addr_end; ) {
291 rc = invalidate_addr_64_internal(core,next,&start, &len);
292 if (next==inv_addr_start) {
293 // first iteration, capture where we start invalidating
294 *actual_start = start;
302 // last iteration, actual_end is off by one