2 * This file is part of the Palacios Virtual Machine Monitor developed
3 * by the V3VEE Project with funding from the United States National
4 * Science Foundation and the Department of Energy.
6 * The V3VEE Project is a joint project between Northwestern University
7 * and the University of New Mexico. You can find out more at
10 * Copyright (c) 2011, Jack Lange <jacklange@cs.pitt.edu>
11 * All rights reserved.
13 * Author: Jack Lange <jacklange@cs.pitt.edu> (implementation)
14 * Peter Dinda <pdinda@northwestern.edu> (invalidation)
16 * This is free software. You are permitted to use,
17 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
20 #include <palacios/vmm.h>
21 #include <palacios/vmx_ept.h>
22 #include <palacios/vmx_lowlevel.h>
23 #include <palacios/vmm_paging.h>
24 #include <palacios/vm_guest_mem.h>
  Note that the Intel nested page tables have a slightly different format
  than regular page tables. Also note that our implementation
  uses only 64-bit (4-level) page tables. This is unlike the SVM
  nested paging implementation.
40 static int handle_vmx_nested_pagefault(struct guest_info * info, addr_t fault_addr, void *info)
42 PrintError(info->vm_info, info, "Cannot do nested page fault as VMX is not enabled.\n");
45 static int handle_vmx_invalidate_nested_addr(struct guest_info * info, addr_t inv_addr)
47 PrintError(info->vm_info, info, "Cannot do invalidate nested addr as VMX is not enabled.\n");
50 static int handle_vmx_invalidate_nested_addr_range(struct guest_info * info,
51 addr_t inv_addr_start, addr_t inv_addr_end)
53 PrintError(info->vm_info, info, "Cannot do invalidate nested addr range as VMX is not enabled.\n");
59 static struct vmx_ept_msr * ept_info = NULL;
62 static addr_t create_ept_page() {
66 temp = V3_AllocPages(1); // need not be shadow-safe, not exposed to guest
68 PrintError(VM_NONE, VCORE_NONE, "Cannot allocate EPT page\n");
71 page = V3_VAddr(temp);
72 memset(page, 0, PAGE_SIZE);
80 static int init_ept(struct guest_info * core, struct vmx_hw_info * hw_info) {
81 addr_t ept_pa = (addr_t)V3_PAddr((void *)create_ept_page());
82 vmx_eptp_t * ept_ptr = (vmx_eptp_t *)&(core->direct_map_pt);
85 ept_info = &(hw_info->ept_info);
87 /* TODO: Should we set this to WB?? */
90 if (ept_info->pg_walk_len4) {
93 PrintError(core->vm_info, core, "Unsupported EPT Table depth\n");
97 ept_ptr->pml_base_addr = PAGE_BASE_ADDR(ept_pa);
104 static inline void ept_exit_qual_to_pf_error(struct ept_exit_qual *qual, pf_error_t *error)
106 memset(error,0,sizeof(pf_error_t));
107 error->present = qual->present;
108 error->write = qual->write;
109 error->ifetch = qual->ifetch;
113 /* We can use the default paging macros, since the formats are close enough to allow it */
116 static int handle_vmx_nested_pagefault(struct guest_info * core, addr_t fault_addr, void *pfinfo,
117 addr_t *actual_start, addr_t *actual_end )
119 struct ept_exit_qual * ept_qual = (struct ept_exit_qual *) pfinfo;
120 ept_pml4_t * pml = NULL;
121 // ept_pdp_1GB_t * pdpe1gb = NULL;
122 ept_pdp_t * pdpe = NULL;
123 ept_pde_2MB_t * pde2mb = NULL;
124 ept_pde_t * pde = NULL;
125 ept_pte_t * pte = NULL;
126 addr_t host_addr = 0;
128 int pml_index = PML4E64_INDEX(fault_addr);
129 int pdpe_index = PDPE64_INDEX(fault_addr);
130 int pde_index = PDE64_INDEX(fault_addr);
131 int pte_index = PTE64_INDEX(fault_addr);
133 struct v3_mem_region * region = v3_get_mem_region(core->vm_info, core->vcpu_id, fault_addr);
134 int page_size = PAGE_SIZE_4KB;
137 pf_error_t error_code;
139 ept_exit_qual_to_pf_error(ept_qual, &error_code);
141 PrintDebug(info->vm_info, info, "Nested PageFault: fault_addr=%p, error_code=%u, exit_qual=0x%llx\n", (void *)fault_addr, *(uint_t *)&error_code, qual->value);
144 if (region == NULL) {
145 PrintError(core->vm_info, core, "invalid region, addr=%p\n", (void *)fault_addr);
149 if ((core->use_large_pages == 1) || (core->use_giant_pages == 1)) {
150 page_size = v3_get_max_page_size(core, fault_addr, LONG);
155 pml = (ept_pml4_t *)CR3_TO_PML4E64_VA(core->direct_map_pt);
159 //Fix up the PML entry
160 if (pml[pml_index].read == 0) {
161 pdpe = (ept_pdp_t *)create_ept_page();
163 // Set default PML Flags...
164 pml[pml_index].read = 1;
165 pml[pml_index].write = 1;
166 pml[pml_index].exec = 1;
168 pml[pml_index].pdp_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pdpe));
170 pdpe = V3_VAddr((void *)BASE_TO_PAGE_ADDR_4KB(pml[pml_index].pdp_base_addr));
174 // Fix up the PDPE entry
175 if (pdpe[pdpe_index].read == 0) {
176 pde = (ept_pde_t *)create_ept_page();
178 // Set default PDPE Flags...
179 pdpe[pdpe_index].read = 1;
180 pdpe[pdpe_index].write = 1;
181 pdpe[pdpe_index].exec = 1;
183 pdpe[pdpe_index].pd_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pde));
185 pde = V3_VAddr((void *)BASE_TO_PAGE_ADDR_4KB(pdpe[pdpe_index].pd_base_addr));
190 // Fix up the 2MiB PDE and exit here
191 if (page_size == PAGE_SIZE_2MB) {
192 pde2mb = (ept_pde_2MB_t *)pde; // all but these two lines are the same for PTE
193 pde2mb[pde_index].large_page = 1;
195 *actual_start = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(fault_addr));
196 *actual_end = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(fault_addr)+1)-1;
198 if (pde2mb[pde_index].read == 0) {
200 if ( (region->flags.alloced == 1) &&
201 (region->flags.read == 1)) {
203 pde2mb[pde_index].read = 1;
204 pde2mb[pde_index].exec = 1;
205 pde2mb[pde_index].ipat = 1;
206 pde2mb[pde_index].mt = 6;
208 if (region->flags.write == 1) {
209 pde2mb[pde_index].write = 1;
211 pde2mb[pde_index].write = 0;
214 if (v3_gpa_to_hpa(core, fault_addr, &host_addr) == -1) {
215 PrintError(core->vm_info, core, "Error: Could not translate fault addr (%p)\n", (void *)fault_addr);
219 pde2mb[pde_index].page_base_addr = PAGE_BASE_ADDR_2MB(host_addr);
221 return region->unhandled(core, fault_addr, fault_addr, region, error_code);
224 // We fix all permissions on the first pass,
225 // so we only get here if its an unhandled exception
227 return region->unhandled(core, fault_addr, fault_addr, region, error_code);
233 // Continue with the 4KiB page heirarchy
236 // Fix up the PDE entry
237 if (pde[pde_index].read == 0) {
238 pte = (ept_pte_t *)create_ept_page();
240 pde[pde_index].read = 1;
241 pde[pde_index].write = 1;
242 pde[pde_index].exec = 1;
244 pde[pde_index].pt_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pte));
246 pte = V3_VAddr((void *)BASE_TO_PAGE_ADDR_4KB(pde[pde_index].pt_base_addr));
250 *actual_start = BASE_TO_PAGE_ADDR_4KB(PAGE_BASE_ADDR_4KB(fault_addr));
251 *actual_end = BASE_TO_PAGE_ADDR_4KB(PAGE_BASE_ADDR_4KB(fault_addr)+1)-1;
254 // Fix up the PTE entry
255 if (pte[pte_index].read == 0) {
257 if ( (region->flags.alloced == 1) &&
258 (region->flags.read == 1)) {
260 pte[pte_index].read = 1;
261 pte[pte_index].exec = 1;
262 pte[pte_index].ipat = 1;
263 pte[pte_index].mt = 6;
265 if (region->flags.write == 1) {
266 pte[pte_index].write = 1;
268 pte[pte_index].write = 0;
271 if (v3_gpa_to_hpa(core, fault_addr, &host_addr) == -1) {
272 PrintError(core->vm_info, core, "Error Could not translate fault addr (%p)\n", (void *)fault_addr);
277 pte[pte_index].page_base_addr = PAGE_BASE_ADDR_4KB(host_addr);
279 return region->unhandled(core, fault_addr, fault_addr, region, error_code);
282 // We fix all permissions on the first pass,
283 // so we only get here if its an unhandled exception
285 return region->unhandled(core, fault_addr, fault_addr, region, error_code);
293 static int handle_vmx_invalidate_nested_addr_internal(struct guest_info *core, addr_t inv_addr,
294 addr_t *actual_start, uint64_t *actual_size) {
295 ept_pml4_t *pml = NULL;
296 ept_pdp_t *pdpe = NULL;
297 ept_pde_t *pde = NULL;
298 ept_pte_t *pte = NULL;
302 // clear the page table entry
304 int pml_index = PML4E64_INDEX(inv_addr);
305 int pdpe_index = PDPE64_INDEX(inv_addr);
306 int pde_index = PDE64_INDEX(inv_addr);
307 int pte_index = PTE64_INDEX(inv_addr);
310 pml = (ept_pml4_t *)CR3_TO_PML4E64_VA(core->direct_map_pt);
313 // note that there are no present bits in EPT, so we
314 // use the read bit to signify this.
315 // either an entry is read/write/exec or it is none of these
317 if (pml[pml_index].read == 0) {
318 // already invalidated
319 *actual_start = BASE_TO_PAGE_ADDR_512GB(PAGE_BASE_ADDR_512GB(inv_addr));
320 *actual_size = PAGE_SIZE_512GB;
324 pdpe = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pml[pml_index].pdp_base_addr));
326 if (pdpe[pdpe_index].read == 0) {
327 // already invalidated
328 *actual_start = BASE_TO_PAGE_ADDR_1GB(PAGE_BASE_ADDR_1GB(inv_addr));
329 *actual_size = PAGE_SIZE_1GB;
331 } else if (pdpe[pdpe_index].large_page == 1) { // 1GiB
332 pdpe[pdpe_index].read = 0;
333 pdpe[pdpe_index].write = 0;
334 pdpe[pdpe_index].exec = 0;
335 *actual_start = BASE_TO_PAGE_ADDR_1GB(PAGE_BASE_ADDR_1GB(inv_addr));
336 *actual_size = PAGE_SIZE_1GB;
340 pde = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pdpe[pdpe_index].pd_base_addr));
342 if (pde[pde_index].read == 0) {
343 // already invalidated
344 *actual_start = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(inv_addr));
345 *actual_size = PAGE_SIZE_2MB;
347 } else if (pde[pde_index].large_page == 1) { // 2MiB
348 pde[pde_index].read = 0;
349 pde[pde_index].write = 0;
350 pde[pde_index].exec = 0;
351 *actual_start = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(inv_addr));
352 *actual_size = PAGE_SIZE_2MB;
356 pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pde[pde_index].pt_base_addr));
358 pte[pte_index].read = 0; // 4KiB
359 pte[pte_index].write = 0;
360 pte[pte_index].exec = 0;
362 *actual_start = BASE_TO_PAGE_ADDR_4KB(PAGE_BASE_ADDR_4KB(inv_addr));
363 *actual_size = PAGE_SIZE_4KB;
369 static int handle_vmx_invalidate_nested_addr(struct guest_info *core, addr_t inv_addr,
370 addr_t *actual_start, addr_t *actual_end)
375 rc = handle_vmx_invalidate_nested_addr_internal(core,inv_addr,actual_start,&len);
377 *actual_end = *actual_start + len - 1;
383 static int handle_vmx_invalidate_nested_addr_range(struct guest_info *core,
384 addr_t inv_addr_start, addr_t inv_addr_end,
385 addr_t *actual_start, addr_t *actual_end)
392 for (next=inv_addr_start; next<=inv_addr_end; ) {
393 rc = handle_vmx_invalidate_nested_addr_internal(core,next,&start, &len);
394 if (next==inv_addr_start) {
395 // first iteration, capture where we start invalidating
396 *actual_start = start;
404 // last iteration, actual_end is off by one