/*
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National
 * Science Foundation and the Department of Energy.
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at
 * http://www.v3vee.org
 *
 * Copyright (c) 2011, Jack Lange <jacklange@cs.pitt.edu>
 * All rights reserved.
 *
 * Author: Jack Lange <jacklange@cs.pitt.edu> (implementation)
 *         Peter Dinda <pdinda@northwestern.edu> (invalidation)
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */
#include <palacios/vmm.h>
#include <palacios/vmx_ept.h>
#include <palacios/vmx_lowlevel.h>
#include <palacios/vmm_paging.h>
#include <palacios/vm_guest_mem.h>
/*
  Note that Intel nested page tables have a slightly different format
  than regular page tables.  Also note that our implementation
  uses only 64-bit (4-level) page tables.  This is unlike the SVM
  nested paging implementation.
*/
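
/*
  EPT entries carry no present bit; an entry is "present" exactly when at
  least one of its read/write/exec permission bits is set.  A minimal
  illustrative helper capturing that convention (the helper name is ours;
  the code below open-codes this test rather than calling it):
*/
static inline int ept_entry_is_present(ept_pte_t * entry) {
    return (entry->read || entry->write || entry->exec);
}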
#ifndef V3_CONFIG_VMX

static int handle_vmx_nested_pagefault(struct guest_info * core, addr_t fault_addr, void * pfinfo,
                                       addr_t * actual_start, addr_t * actual_end)
{
    PrintError(core->vm_info, core, "Cannot do nested page fault as VMX is not enabled.\n");
    return -1;
}

static int handle_vmx_invalidate_nested_addr(struct guest_info * core, addr_t inv_addr,
                                             addr_t * actual_start, addr_t * actual_end)
{
    PrintError(core->vm_info, core, "Cannot do invalidate nested addr as VMX is not enabled.\n");
    return -1;
}

static int handle_vmx_invalidate_nested_addr_range(struct guest_info * core,
                                                   addr_t inv_addr_start, addr_t inv_addr_end,
                                                   addr_t * actual_start, addr_t * actual_end)
{
    PrintError(core->vm_info, core, "Cannot do invalidate nested addr range as VMX is not enabled.\n");
    return -1;
}

#else
static struct vmx_ept_msr * ept_info = NULL;
static addr_t create_ept_page() {
    void * temp;
    void * page = 0;

    temp = V3_AllocPages(1); // need not be shadow-safe, not exposed to guest
    if (!temp) {
        PrintError(VM_NONE, VCORE_NONE, "Cannot allocate EPT page\n");
        return 0;
    }

    page = V3_VAddr(temp);
    memset(page, 0, PAGE_SIZE);

    return (addr_t)page;
}
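
/*
  Note: create_ept_page() returns the host *virtual* address of a zeroed
  page; callers that need the physical address (e.g. for the base-address
  fields of paging entries) convert it with V3_PAddr(), as init_ept() and
  the fault handler below do.
*/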
static int init_ept(struct guest_info * core, struct vmx_hw_info * hw_info) {
    addr_t ept_pa = (addr_t)V3_PAddr((void *)create_ept_page());
    vmx_eptp_t * ept_ptr = (vmx_eptp_t *)&(core->direct_map_pt);

    ept_info = &(hw_info->ept_info);

    /* TODO: Should we set this to WB?? */
    ept_ptr->psmt = 0;

    if (ept_info->pg_walk_len4) {
        ept_ptr->pwl1 = 3;
    } else {
        PrintError(core->vm_info, core, "Unsupported EPT Table depth\n");
        return -1;
    }

    ept_ptr->pml_base_addr = PAGE_BASE_ADDR(ept_pa);

    PrintDebug(core->vm_info, core, "init_ept direct_map_pt=%p\n", (void *)(core->direct_map_pt));

    return 0;
}
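
/*
  For reference (Intel SDM, EPTP format): pwl1 holds the page-walk length
  minus one, so the value 3 above selects the 4-level walk this file
  assumes, and psmt selects the paging-structure memory type (0 =
  uncacheable, 6 = write-back, which is what the TODO above is asking
  about).
*/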
// You would think we could just use the regular 64-bit PT free
// routine, but no, because the EPT format is slightly different, in that
// it has no present bit....  We signify presence via the read bit.
static void delete_page_tables_ept64(ept_pml4_t * pml4) {
    int i, j, k;

    if (pml4 == NULL) {
        return;
    }

    PrintDebug(VM_NONE, VCORE_NONE, "Deleting EPT Page Tables -- PML4 (%p)\n", pml4);

    for (i = 0; i < MAX_PML4E64_ENTRIES; i++) {
        if (!pml4[i].read && !pml4[i].write && !pml4[i].exec) {
            continue;
        }

        ept_pdp_t * pdpe = (ept_pdp_t *)V3_VAddr((void *)(addr_t)BASE_TO_PAGE_ADDR_4KB(pml4[i].pdp_base_addr));

        for (j = 0; j < MAX_PDPE64_ENTRIES; j++) {
            if ((!pdpe[j].read && !pdpe[j].write && !pdpe[j].exec) || (pdpe[j].large_page == 1)) {
                continue;
            }

            ept_pde_t * pde = (ept_pde_t *)V3_VAddr((void *)(addr_t)BASE_TO_PAGE_ADDR_4KB(pdpe[j].pd_base_addr));

            for (k = 0; k < MAX_PDE64_ENTRIES; k++) {
                if ((!pde[k].read && !pde[k].write && !pde[k].exec) || (pde[k].large_page == 1)) {
                    continue;
                }

                V3_FreePages((void *)(addr_t)BASE_TO_PAGE_ADDR_4KB(pde[k].pt_base_addr), 1);
            }

            V3_FreePages(V3_PAddr(pde), 1);
        }

        V3_FreePages(V3_PAddr(pdpe), 1);
    }

    V3_FreePages(V3_PAddr(pml4), 1);
}
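
/*
  Large-page entries are skipped in the inner loops above because they map
  a frame directly and so have no lower-level table of their own to free;
  only the paging structures are freed here, never the guest frames they
  map.
*/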
static int deinit_ept(struct guest_info * core) {
    ept_pml4_t * pml = NULL;

    pml = (ept_pml4_t *)CR3_TO_PML4E64_VA(core->direct_map_pt);

    delete_page_tables_ept64(pml);

    core->direct_map_pt = 0;

    return 0;
}
static inline void ept_exit_qual_to_pf_error(struct ept_exit_qual * qual, pf_error_t * error)
{
    memset(error, 0, sizeof(pf_error_t));
    error->present = qual->present;
    error->write   = qual->write;
    error->ifetch  = qual->ifetch;
}
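
/*
  Only the exit-qualification bits that the generic fault path consumes
  (present, write, instruction fetch) are carried over into the pf_error_t
  that region->unhandled() eventually sees below.
*/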
/* We can use the default paging macros, since the formats are close enough to allow it */
static int handle_vmx_nested_pagefault(struct guest_info * core, addr_t fault_addr, void * pfinfo,
                                       addr_t * actual_start, addr_t * actual_end)
{
    struct ept_exit_qual * ept_qual = (struct ept_exit_qual *) pfinfo;
    ept_pml4_t    * pml = NULL;
    //  ept_pdp_1GB_t * pdpe1gb = NULL;
    ept_pdp_t     * pdpe = NULL;
    ept_pde_2MB_t * pde2mb = NULL;
    ept_pde_t     * pde = NULL;
    ept_pte_t     * pte = NULL;
    addr_t host_addr = 0;

    int pml_index  = PML4E64_INDEX(fault_addr);
    int pdpe_index = PDPE64_INDEX(fault_addr);
    int pde_index  = PDE64_INDEX(fault_addr);
    int pte_index  = PTE64_INDEX(fault_addr);

    struct v3_mem_region * region = v3_get_mem_region(core->vm_info, core->vcpu_id, fault_addr);
    int page_size = PAGE_SIZE_4KB;

    pf_error_t error_code;

    ept_exit_qual_to_pf_error(ept_qual, &error_code);

    PrintDebug(core->vm_info, core, "Nested PageFault: fault_addr=%p, error_code=%u, exit_qual=0x%llx\n",
               (void *)fault_addr, *(uint_t *)&error_code, ept_qual->value);

    if (region == NULL) {
        PrintError(core->vm_info, core, "invalid region, addr=%p\n", (void *)fault_addr);
        return -1;
    }

    if ((core->use_large_pages == 1) || (core->use_giant_pages == 1)) {
        page_size = v3_get_max_page_size(core, fault_addr, LONG);
    }
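
    /*
      page_size is now the largest mapping size usable at fault_addr given
      the region layout; only 4KB and 2MB mappings are actually built
      below (the 1GB path is stubbed out via the commented-out pdpe1gb
      declaration above).
    */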
    pml = (ept_pml4_t *)CR3_TO_PML4E64_VA(core->direct_map_pt);

    // Fix up the PML entry
    if (pml[pml_index].read == 0) {
        pdpe = (ept_pdp_t *)create_ept_page();

        // Set default PML Flags...
        pml[pml_index].read = 1;
        pml[pml_index].write = 1;
        pml[pml_index].exec = 1;

        pml[pml_index].pdp_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pdpe));
    } else {
        pdpe = V3_VAddr((void *)BASE_TO_PAGE_ADDR_4KB(pml[pml_index].pdp_base_addr));
    }
    // Fix up the PDPE entry
    if (pdpe[pdpe_index].read == 0) {
        pde = (ept_pde_t *)create_ept_page();

        // Set default PDPE Flags...
        pdpe[pdpe_index].read = 1;
        pdpe[pdpe_index].write = 1;
        pdpe[pdpe_index].exec = 1;

        pdpe[pdpe_index].pd_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pde));
    } else {
        pde = V3_VAddr((void *)BASE_TO_PAGE_ADDR_4KB(pdpe[pdpe_index].pd_base_addr));
    }
    // Fix up the 2MiB PDE and exit here
    if (page_size == PAGE_SIZE_2MB) {
        pde2mb = (ept_pde_2MB_t *)pde; // all but these two lines are the same for PTE

        pde2mb[pde_index].large_page = 1;

        *actual_start = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(fault_addr));
        *actual_end   = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(fault_addr) + 1) - 1;

        if (pde2mb[pde_index].read == 0) {

            if ( (region->flags.alloced == 1) &&
                 (region->flags.read == 1)) {
                // Full access
                pde2mb[pde_index].read = 1;
                pde2mb[pde_index].exec = 1;
                pde2mb[pde_index].ipat = 1;
                pde2mb[pde_index].mt = 6;

                if (region->flags.write == 1) {
                    pde2mb[pde_index].write = 1;
                } else {
                    pde2mb[pde_index].write = 0;
                }

                if (v3_gpa_to_hpa(core, fault_addr, &host_addr) == -1) {
                    PrintError(core->vm_info, core, "Error: Could not translate fault addr (%p)\n", (void *)fault_addr);
                    return -1;
                }

                pde2mb[pde_index].page_base_addr = PAGE_BASE_ADDR_2MB(host_addr);
            } else {
                return region->unhandled(core, fault_addr, fault_addr, region, error_code);
            }
        } else {
            // We fix all permissions on the first pass,
            // so we only get here if it's an unhandled exception
            return region->unhandled(core, fault_addr, fault_addr, region, error_code);
        }

        return 0;
    }
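
    /*
      For the leaf flags set above (and in the 4KiB path below): mt = 6
      selects the write-back EPT memory type, and ipat tells the hardware
      to ignore the guest PAT for this mapping (Intel SDM, EPT leaf-entry
      format).
    */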
    // Continue with the 4KiB page hierarchy
    // Fix up the PDE entry
    if (pde[pde_index].read == 0) {
        pte = (ept_pte_t *)create_ept_page();

        // Set default PDE Flags...
        pde[pde_index].read = 1;
        pde[pde_index].write = 1;
        pde[pde_index].exec = 1;

        pde[pde_index].pt_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pte));
    } else {
        pte = V3_VAddr((void *)BASE_TO_PAGE_ADDR_4KB(pde[pde_index].pt_base_addr));
    }
    *actual_start = BASE_TO_PAGE_ADDR_4KB(PAGE_BASE_ADDR_4KB(fault_addr));
    *actual_end   = BASE_TO_PAGE_ADDR_4KB(PAGE_BASE_ADDR_4KB(fault_addr) + 1) - 1;

    // Fix up the PTE entry
    if (pte[pte_index].read == 0) {

        if ( (region->flags.alloced == 1) &&
             (region->flags.read == 1)) {
            // Full access
            pte[pte_index].read = 1;
            pte[pte_index].exec = 1;
            pte[pte_index].ipat = 1;
            pte[pte_index].mt = 6;

            if (region->flags.write == 1) {
                pte[pte_index].write = 1;
            } else {
                pte[pte_index].write = 0;
            }

            if (v3_gpa_to_hpa(core, fault_addr, &host_addr) == -1) {
                PrintError(core->vm_info, core, "Error: Could not translate fault addr (%p)\n", (void *)fault_addr);
                return -1;
            }

            pte[pte_index].page_base_addr = PAGE_BASE_ADDR_4KB(host_addr);
        } else {
            return region->unhandled(core, fault_addr, fault_addr, region, error_code);
        }
    } else {
        // We fix all permissions on the first pass,
        // so we only get here if it's an unhandled exception
        return region->unhandled(core, fault_addr, fault_addr, region, error_code);
    }

    return 0;
}
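
/*
  Sketch of how the handler above is typically driven from an EPT-violation
  VM exit.  This is an illustration, not the actual dispatch code (which
  lives in the VMX exit handler); the VMCS field names are the ones
  Palacios defines, but treat the exact sequence as an assumption:

    struct ept_exit_qual qual;
    addr_t fault_addr = 0, start = 0, end = 0;

    vmcs_read(VMCS_EXIT_QUAL, &(qual.value));
    vmcs_read(VMCS_GUEST_PHYS_ADDR, &fault_addr);

    if (handle_vmx_nested_pagefault(core, fault_addr, &qual, &start, &end) == -1) {
        // could not back the faulting GPA -- fatal for this core
    }
*/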
static int handle_vmx_invalidate_nested_addr_internal(struct guest_info * core, addr_t inv_addr,
                                                      addr_t * actual_start, uint64_t * actual_size) {
    ept_pml4_t * pml = NULL;
    ept_pdp_t  * pdpe = NULL;
    ept_pde_t  * pde = NULL;
    ept_pte_t  * pte = NULL;

    // clear the page table entry

    int pml_index  = PML4E64_INDEX(inv_addr);
    int pdpe_index = PDPE64_INDEX(inv_addr);
    int pde_index  = PDE64_INDEX(inv_addr);
    int pte_index  = PTE64_INDEX(inv_addr);
    pml = (ept_pml4_t *)CR3_TO_PML4E64_VA(core->direct_map_pt);

    // note that there are no present bits in EPT, so we
    // use the read bit to signify this.
    // either an entry is read/write/exec or it is none of these
    if (pml[pml_index].read == 0) {
        // already invalidated
        *actual_start = BASE_TO_PAGE_ADDR_512GB(PAGE_BASE_ADDR_512GB(inv_addr));
        *actual_size = PAGE_SIZE_512GB;
        return 0;
    }

    pdpe = V3_VAddr((void *)BASE_TO_PAGE_ADDR(pml[pml_index].pdp_base_addr));

    if (pdpe[pdpe_index].read == 0) {
        // already invalidated
        *actual_start = BASE_TO_PAGE_ADDR_1GB(PAGE_BASE_ADDR_1GB(inv_addr));
        *actual_size = PAGE_SIZE_1GB;
        return 0;
    } else if (pdpe[pdpe_index].large_page == 1) { // 1GiB
        pdpe[pdpe_index].read = 0;
        pdpe[pdpe_index].write = 0;
        pdpe[pdpe_index].exec = 0;
        *actual_start = BASE_TO_PAGE_ADDR_1GB(PAGE_BASE_ADDR_1GB(inv_addr));
        *actual_size = PAGE_SIZE_1GB;
        return 0;
    }
    pde = V3_VAddr((void *)BASE_TO_PAGE_ADDR(pdpe[pdpe_index].pd_base_addr));

    if (pde[pde_index].read == 0) {
        // already invalidated
        *actual_start = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(inv_addr));
        *actual_size = PAGE_SIZE_2MB;
        return 0;
    } else if (pde[pde_index].large_page == 1) { // 2MiB
        pde[pde_index].read = 0;
        pde[pde_index].write = 0;
        pde[pde_index].exec = 0;
        *actual_start = BASE_TO_PAGE_ADDR_2MB(PAGE_BASE_ADDR_2MB(inv_addr));
        *actual_size = PAGE_SIZE_2MB;
        return 0;
    }

    pte = V3_VAddr((void *)BASE_TO_PAGE_ADDR(pde[pde_index].pt_base_addr));

    pte[pte_index].read = 0; // 4KiB
    pte[pte_index].write = 0;
    pte[pte_index].exec = 0;

    *actual_start = BASE_TO_PAGE_ADDR_4KB(PAGE_BASE_ADDR_4KB(inv_addr));
    *actual_size = PAGE_SIZE_4KB;

    return 0;
}
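
/*
  Note that *actual_start / *actual_size are rounded out to the granularity
  of the entry that was (or would have been) cleared, so the caller may be
  told that more than the requested address was invalidated.
*/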
static int handle_vmx_invalidate_nested_addr(struct guest_info * core, addr_t inv_addr,
                                             addr_t * actual_start, addr_t * actual_end)
{
    uint64_t len = 0;
    int rc;

    rc = handle_vmx_invalidate_nested_addr_internal(core, inv_addr, actual_start, &len);

    *actual_end = *actual_start + len - 1;

    return rc;
}
static int handle_vmx_invalidate_nested_addr_range(struct guest_info * core,
                                                   addr_t inv_addr_start, addr_t inv_addr_end,
                                                   addr_t * actual_start, addr_t * actual_end)
{
    addr_t next;
    addr_t start = 0;
    uint64_t len = 0;
    int rc;

    for (next = inv_addr_start; next <= inv_addr_end; ) {
        rc = handle_vmx_invalidate_nested_addr_internal(core, next, &start, &len);
        if (next == inv_addr_start) {
            // first iteration, capture where we start invalidating
            *actual_start = start;
        }
        if (rc) {
            return rc;
        }
        next = start + len;
    }

    // last iteration, actual_end is off by one
    *actual_end = next - 1;

    return 0;
}

#endif /* V3_CONFIG_VMX */
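
/*
  Hypothetical usage sketch for the range invalidator (the flush step is an
  assumption -- the actual INVEPT is issued elsewhere in the VMX code):

    addr_t start = 0, end = 0;

    if (handle_vmx_invalidate_nested_addr_range(core, gpa, gpa + size - 1,
                                                &start, &end) == 0) {
        // [start, end] covers everything actually invalidated; the hardware
        // EPT caches still need to be flushed before the next VM entry.
    }
*/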