/*
 * Shadow page cache implementation, adapted from Linux's KVM implementation.
 * This module is licensed under the GPL.
 */

#include <palacios/vmm_shadow_paging.h>
#include <palacios/vmm_ctrl_regs.h>

#include <palacios/vm_guest.h>
#include <palacios/vm_guest_mem.h>

#include <palacios/vmm_paging.h>

#ifndef V3_CONFIG_DEBUG_SHDW_CACHE
#undef PrintDebug
#define PrintDebug(fmt, ...)
#endif

#ifdef V3_CONFIG_SHADOW_CACHE

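/*
 * Bookkeeping structures, following the KVM design this module is adapted from:
 *  - pde_chain: once more than one shadow PDE points at the same shadow page,
 *    the parent PDE addresses are kept in a chain of these blocks.
 *  - rmap: reverse map; tracks the shadow PTEs that map a guest frame, so all
 *    mappings of a frame can be found and write-protected.
 */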
struct pde_chain {
    addr_t shadow_pdes[NR_PTE_CHAIN_ENTRIES];
    struct hlist_node link;
};

struct rmap {
    addr_t shadow_ptes[RMAP_EXT];
    struct rmap * more;
};

static inline int activate_shadow_pt_32(struct guest_info * core);

static inline unsigned shadow_page_table_hashfn(addr_t guest_fn)
{
    // identity hash; callers reduce the result modulo NUM_SHADOW_PAGES
    return guest_fn;
}

static void * shadow_cache_alloc(struct shadow_cache * mc, size_t size)
{
    void * p;

    if (!mc->nobjs) {
        // the cache is expected to have been topped up before this is called
        PrintDebug(VM_NONE, VCORE_NONE, "at shadow_cache_alloc: cache is empty\n");
    }

    p = mc->objects[--mc->nobjs];

    return p;
}

static void shadow_cache_free(struct shadow_cache * mc, void * obj)
{
    if (mc->nobjs < NR_MEM_OBJS) {
        mc->objects[mc->nobjs++] = obj;
    } else {
        V3_Free(obj);
    }
}

static struct rmap * shadow_alloc_rmap(struct guest_info * core)
{
    return shadow_cache_alloc(&core->shadow_rmap_cache, sizeof(struct rmap));
}

static void shadow_free_rmap(struct guest_info * core, struct rmap * rd)
{
    return shadow_cache_free(&core->shadow_rmap_cache, rd);
}

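/*
 * The topup pattern (also from KVM): pre-fill a small object pool up front so
 * that allocations on the fault-handling path can be served from the pool
 * without calling out to the host allocator.
 */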
int shadow_topup_cache(struct shadow_cache * cache, size_t objsize, int min) {

    void * obj;

    if (cache->nobjs >= min) return 0;

    while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
        obj = V3_Malloc(objsize);
        if (!obj) {
            PrintDebug(VM_NONE, VCORE_NONE, "at shadow_topup_cache: object allocation failed\n");
            return -1;
        }
        cache->objects[cache->nobjs++] = obj;
    }

    return 0;
}

static int shadow_topup_caches(struct guest_info * core) {
    int r;

    r = shadow_topup_cache(&core->shadow_pde_chain_cache,
                           sizeof(struct pde_chain), 4);
    if (r) return r;

    r = shadow_topup_cache(&core->shadow_rmap_cache,
                           sizeof(struct rmap), 1);

    return r;
}

static struct pde_chain * shadow_alloc_pde_chain(struct guest_info * core)
{
    return shadow_cache_alloc(&core->shadow_pde_chain_cache,
                              sizeof(struct pde_chain));
}

static void shadow_free_pde_chain(struct guest_info * core, struct pde_chain * pc)
{
    PrintDebug(core->vm_info, core, "shdw_free_pdechain: start\n");
    shadow_cache_free(&core->shadow_pde_chain_cache, pc);
    PrintDebug(core->vm_info, core, "shdw_free_pdechain: return\n");
}

static void shadow_free_page(struct guest_info * core, struct shadow_page_cache_data * page)
{
    list_del(&page->link);

    // recycle the backing frame rather than keeping the old contents around
    V3_FreePages((void *)page->page_pa, 1);
    page->page_pa = (addr_t)V3_AllocPages(1);

    if (!page->page_pa) {
        PrintError(core->vm_info, core, "Freeing shadow page failed on allocation\n");
        return;
    }

    list_add(&page->link, &core->free_pages);
    ++core->n_free_shadow_pages;
}

static struct shadow_page_cache_data * shadow_alloc_page(struct guest_info * core, addr_t shadow_pde) {

    struct shadow_page_cache_data * page;

    if (list_empty(&core->free_pages)) return NULL;

    page = list_entry(core->free_pages.next, struct shadow_page_cache_data, link);
    list_del(&page->link);

    list_add(&page->link, &core->active_shadow_pages);
    page->multimapped = 0;
    page->shadow_pde = shadow_pde;
    --core->n_free_shadow_pages;

    PrintDebug(core->vm_info, core, "alloc_page: n_free_shdw_pg %d page_pa %p page_va %p\n",
               core->n_free_shadow_pages, (void *)(page->page_pa), V3_VAddr((void *)(page->page_pa)));

    addr_t shdw_page = (addr_t)V3_VAddr((void *)(page->page_pa));
    memset((void *)shdw_page, 0, PAGE_SIZE_4KB);

    return page;
}

static void shadow_zap_page(struct guest_info * core, struct shadow_page_cache_data * page);

static void free_shadow_pages(struct guest_info * core)
{
    struct shadow_page_cache_data * page;

    while (!list_empty(&core->active_shadow_pages)) {
        page = container_of(core->active_shadow_pages.next,
                            struct shadow_page_cache_data, link);
        shadow_zap_page(core, page);
    }

    while (!list_empty(&core->free_pages)) {
        page = list_entry(core->free_pages.next, struct shadow_page_cache_data, link);
        list_del(&page->link);
        V3_FreePages((void *)page->page_pa, 1);
        page->page_pa = ~(addr_t)0; // invalid address
    }
}

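/* Pre-allocate the fixed pool of NUM_SHADOW_PAGES shadow pages for this core. */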
static int alloc_shadow_pages(struct guest_info * core)
{
    int i;
    struct shadow_page_cache_data * page_header = NULL;

    for (i = 0; i < NUM_SHADOW_PAGES; i++) {
        page_header = &core->page_header_buf[i];

        INIT_LIST_HEAD(&page_header->link);
        if (!(page_header->page_pa = (addr_t)V3_AllocPages(1))) {
            PrintError(core->vm_info, core, "Allocation failed in allocating shadow page\n");
            goto error_1;
        }

        addr_t shdw_page = (addr_t)V3_VAddr((void *)(page_header->page_pa));
        memset((void *)shdw_page, 0, PAGE_SIZE_4KB);

        list_add(&page_header->link, &core->free_pages);
        ++core->n_free_shadow_pages;
        PrintDebug(core->vm_info, core, "alloc_shdw_pg: n_free_shdw_pg %d page_pa %p\n",
                   core->n_free_shadow_pages, (void *)page_header->page_pa);
    }

    return 0;

error_1:
    free_shadow_pages(core);
    return -1; // out of memory
}

static void shadow_page_add_shadow_pde(struct guest_info * core,
                                       struct shadow_page_cache_data * page, addr_t shadow_pde)
{
    struct pde_chain * pde_chain;
    struct hlist_node * node;
    int i;
    addr_t old;

    if (!shadow_pde) return;

    if (!page->multimapped) {
        old = page->shadow_pde;

        if (!old) {
            page->shadow_pde = shadow_pde;
            return;
        }

        // second parent: switch to multimapped mode
        page->multimapped = 1;
        pde_chain = shadow_alloc_pde_chain(core);
        INIT_HLIST_HEAD(&page->shadow_pdes);
        hlist_add_head(&pde_chain->link, &page->shadow_pdes);
        pde_chain->shadow_pdes[0] = old;
    }

    hlist_for_each_entry(pde_chain, node, &page->shadow_pdes, link) {
        if (pde_chain->shadow_pdes[NR_PTE_CHAIN_ENTRIES - 1]) continue;
        for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
            if (!pde_chain->shadow_pdes[i]) {
                pde_chain->shadow_pdes[i] = shadow_pde;
                return;
            }
    }

    pde_chain = shadow_alloc_pde_chain(core);

    hlist_add_head(&pde_chain->link, &page->shadow_pdes);
    pde_chain->shadow_pdes[0] = shadow_pde;
}

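/*
 * Drop one parent shadow PDE from this page's tracking, compacting the
 * pde_chain entries and freeing a chain block once it becomes empty.
 */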
static void shadow_page_remove_shadow_pde(struct guest_info * core,
                                          struct shadow_page_cache_data * page, addr_t shadow_pde)
{
    struct pde_chain * pde_chain;
    struct hlist_node * node;
    int i;

    PrintDebug(core->vm_info, core, "rm_shdw_pde: multimap %d\n", page->multimapped);
    if (!page->multimapped) {
        PrintDebug(core->vm_info, core, "rm_shdw_pde: no multimap\n");
        if (page->shadow_pde != shadow_pde)
            PrintDebug(core->vm_info, core, "rm_shdw_pde: error, page->shadow_pde is not equal to shadow_pde\n");
        page->shadow_pde = 0;
        PrintDebug(core->vm_info, core, "rm_shdw_pde: return\n");
        return;
    }

    PrintDebug(core->vm_info, core, "rm_shdw_pde: multimap\n");

    hlist_for_each_entry(pde_chain, node, &page->shadow_pdes, link)
        for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
            if (!pde_chain->shadow_pdes[i]) break;
            if (pde_chain->shadow_pdes[i] != shadow_pde) continue;

            PrintDebug(core->vm_info, core, "rm_shdw_pde: found shadow_pde at i %d\n", i);
            while (i + 1 < NR_PTE_CHAIN_ENTRIES && pde_chain->shadow_pdes[i + 1]) {
                pde_chain->shadow_pdes[i] = pde_chain->shadow_pdes[i + 1];
                ++i;
            }
            pde_chain->shadow_pdes[i] = 0;

            if (i == 0) {
                PrintDebug(core->vm_info, core, "rm_shdw_pde: only one!\n");
                hlist_del(&pde_chain->link);
                shadow_free_pde_chain(core, pde_chain);
                if (hlist_empty(&page->shadow_pdes)) {
                    page->multimapped = 0;
                    page->shadow_pde = 0;
                }
            }

            PrintDebug(core->vm_info, core, "rm_shdw_pde: return\n");
            return;
        }

    PrintDebug(core->vm_info, core, "rm_shdw_pde: return\n");
}

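/*
 * Given a guest PDE and the shadow PDE that shadows it, reconstruct the role
 * the child shadow page would have been created with, look it up in the hash
 * table, and unlink this shadow PDE from it.
 */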
static void shadow_page_search_shadow_pde(struct guest_info * core, addr_t shadow_pde,
                                          addr_t guest_pde, unsigned hlevel) {

    struct shadow_page_cache_data * shdw_page;
    unsigned index;
    struct hlist_head * bucket;
    struct hlist_node * node;
    int hugepage_access = 0;
    union shadow_page_role role;
    addr_t pt_base_addr = 0;
    int metaphysical = 0;

    PrintDebug(core->vm_info, core, "shadow_page_search_shadow_pde\n");
    v3_cpu_mode_t mode = v3_get_vm_cpu_mode(core);

    if (mode == PROTECTED) {

        PrintDebug(core->vm_info, core, "shadow_page_search_shadow_pde: PROTECTED\n");
        pt_base_addr = ((pde32_t *)guest_pde)->pt_base_addr;

        if (((pde32_t *)guest_pde)->large_page == 1) {
            PrintDebug(core->vm_info, core, "shadow_page_search_shadow_pde: large page\n");
            hugepage_access = (((pde32_4MB_t *)guest_pde)->writable) | (((pde32_4MB_t *)guest_pde)->user_page << 1);
            metaphysical = 1;
            pt_base_addr = (addr_t)PAGE_BASE_ADDR(BASE_TO_PAGE_ADDR_4MB(((pde32_4MB_t *)guest_pde)->page_base_addr));
        }

        role.glevels = PT32_ROOT_LEVEL; // max level
        role.hlevels = PT_PAGE_TABLE_LEVEL;
        role.metaphysical = metaphysical;
        role.hugepage_access = hugepage_access;

    } else if (mode == LONG_32_COMPAT || mode == LONG) {

        PrintDebug(core->vm_info, core, "shadow_page_search_shadow_pde: LONG_32_COMPAT/LONG\n");
        pt_base_addr = ((pde64_t *)guest_pde)->pt_base_addr;

        if (hlevel == PT_DIRECTORY_LEVEL) {
            if (((pde64_t *)guest_pde)->large_page == 1) {
                hugepage_access = (((pde64_2MB_t *)guest_pde)->writable) | (((pde64_2MB_t *)guest_pde)->user_page << 1);
                metaphysical = 1;
                pt_base_addr = (addr_t)PAGE_BASE_ADDR(BASE_TO_PAGE_ADDR_2MB(((pde64_2MB_t *)guest_pde)->page_base_addr));
            }
            role.hlevels = PT_PAGE_TABLE_LEVEL;

        } else if (hlevel == PT32E_ROOT_LEVEL) {
            if (((pdpe64_t *)guest_pde)->large_page == 1) {
                hugepage_access = (((pdpe64_1GB_t *)guest_pde)->writable) | (((pdpe64_1GB_t *)guest_pde)->user_page << 1);
                metaphysical = 1;
                pt_base_addr = (addr_t)PAGE_BASE_ADDR(BASE_TO_PAGE_ADDR_1GB(((pdpe64_1GB_t *)guest_pde)->page_base_addr));
            }
            role.hlevels = PT_DIRECTORY_LEVEL;

        } else if (hlevel == PT64_ROOT_LEVEL) {
            if (((pdpe64_t *)guest_pde)->large_page == 1) {
                hugepage_access = (((pdpe64_1GB_t *)guest_pde)->writable) | (((pdpe64_1GB_t *)guest_pde)->user_page << 1);
                metaphysical = 1;
                pt_base_addr = (addr_t)PAGE_BASE_ADDR(BASE_TO_PAGE_ADDR_1GB(((pdpe64_1GB_t *)guest_pde)->page_base_addr));
            }
            role.hlevels = PT32E_ROOT_LEVEL;
        }

        role.glevels = PT64_ROOT_LEVEL; // store numeric
        role.metaphysical = metaphysical;
        role.hugepage_access = hugepage_access;
    }

    index = shadow_page_table_hashfn(pt_base_addr) % NUM_SHADOW_PAGES;
    bucket = &core->shadow_page_hash[index];

    hlist_for_each_entry(shdw_page, node, bucket, hash_link)
        if (shdw_page->guest_fn == pt_base_addr && shdw_page->role.word == role.word) {
            PrintDebug(core->vm_info, core, "shadow_page_search_shadow_pde: found\n");
            shadow_page_remove_shadow_pde(core, shdw_page, (addr_t)shadow_pde);
        }
}

static struct shadow_page_cache_data * shadow_page_lookup_page(struct guest_info * core, addr_t guest_fn, int opt) // purpose of this is write protection
{
    unsigned index;
    struct hlist_head * bucket;
    struct shadow_page_cache_data * page;
    struct hlist_node * node;

    PrintDebug(core->vm_info, core, "lookup: guest_fn addr %p\n", (void *)BASE_TO_PAGE_ADDR(guest_fn));

    index = shadow_page_table_hashfn(guest_fn) % NUM_SHADOW_PAGES;
    bucket = &core->shadow_page_hash[index];
    PrintDebug(core->vm_info, core, "lookup: index %d bucket %p\n", index, (void *)bucket);

    hlist_for_each_entry(page, node, bucket, hash_link) {
        PrintDebug(core->vm_info, core, "lookup: page->gfn %p gfn %p metaphysical %d\n",
                   (void *)BASE_TO_PAGE_ADDR(page->guest_fn), (void *)BASE_TO_PAGE_ADDR(guest_fn), page->role.metaphysical);
        if (page->guest_fn == guest_fn && !page->role.metaphysical) {
            return page;
        }
        else if (opt && (page->guest_fn == guest_fn)) {
            // with opt set, metaphysical pages match as well
            return page;
        }
    }

    return NULL;
}

static void rmap_remove(struct guest_info * core, addr_t shadow_pte);
static void rmap_write_protect(struct guest_info * core, addr_t guest_fn);

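/*
 * Central lookup: find a shadow page by (guest frame, role), or allocate,
 * hash, and link a new one if no match exists. Newly created non-metaphysical
 * pages are write-protected via the rmap so that guest writes to the page
 * table they shadow will trap.
 */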
struct shadow_page_cache_data * shadow_page_get_page(struct guest_info * core,
                                                     addr_t guest_fn,
                                                     unsigned level,
                                                     int metaphysical,
                                                     unsigned hugepage_access,
                                                     addr_t shadow_pde,
                                                     int force) // 0: default; 1: bypass the cache; 2: suppress debug output
{
    struct shadow_page_cache_data * page;
    union shadow_page_role role;
    unsigned index;
    struct hlist_head * bucket;
    struct hlist_node * node;
    v3_cpu_mode_t mode = v3_get_vm_cpu_mode(core);

    if (mode == REAL || mode == PROTECTED) role.glevels = PT32_ROOT_LEVEL;
    // exceptional; in the long term this should be an argument
    else if (mode == PROTECTED_PAE) role.glevels = PT32E_ROOT_LEVEL;
    else if (mode == LONG || mode == LONG_32_COMPAT) role.glevels = PT64_ROOT_LEVEL;

    role.hlevels = level;
    role.metaphysical = metaphysical;
    role.hugepage_access = hugepage_access;

    index = shadow_page_table_hashfn(guest_fn) % NUM_SHADOW_PAGES;
    bucket = &core->shadow_page_hash[index];

    if (force != 2) PrintDebug(core->vm_info, core, "get_page: lvl %d idx %d gfn %p role %x\n", level, index, (void *)guest_fn, role.word);

    hlist_for_each_entry(page, node, bucket, hash_link) {
        if (page->guest_fn == guest_fn && page->role.word == role.word) {
            shadow_page_add_shadow_pde(core, page, shadow_pde); // guest_fn is right there

            if (force != 2) PrintDebug(core->vm_info, core, "get_page: found guest_fn %p, index %d, multi %d, next %p\n",
                                       (void *)page->guest_fn, index, page->multimapped, (void *)page->hash_link.next);
            if (force == 0 || force == 2)
                return page;
            else
                shadow_zap_page(core, page);
        } else {
            PrintDebug(core->vm_info, core, "get_page: not found; guest_fn %p, index %d, multimapped %d, next %p\n",
                       (void *)page->guest_fn, index, page->multimapped, (void *)page->hash_link.next);
        }
    }

    if (force != 2) PrintDebug(core->vm_info, core, "get_page: not found\n");

    page = shadow_alloc_page(core, shadow_pde);

    if (!page) return page;

    page->guest_fn = guest_fn;
    page->role = role;
    page->multimapped = 0;
    page->shadow_pde = 0;

    if (force != 2) PrintDebug(core->vm_info, core, "get_page: hadd h->first %p, n %p, n->next %p\n",
                               (void *)bucket->first, (void *)&page->hash_link, (void *)page->hash_link.next);

    hlist_add_head(&page->hash_link, bucket);
    shadow_page_add_shadow_pde(core, page, shadow_pde);

    if (force != 2) PrintDebug(core->vm_info, core, "get_page: hadd h->first %p, n %p, n->next %p\n",
                               (void *)bucket->first, (void *)&page->hash_link, (void *)page->hash_link.next);

    if (!metaphysical) rmap_write_protect(core, guest_fn); // in case an rmapped guest_fn is being allocated as a pt or pd
    if (force != 2) PrintDebug(core->vm_info, core, "get_page: return\n");

    return page;
}

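/*
 * Zero every entry of this shadow page and release whatever the entries
 * pinned: rmap entries at the PTE level, parent links of child shadow pages
 * at the PDE level and above.
 */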
static void shadow_page_unlink_children(struct guest_info * core, struct shadow_page_cache_data * page) {
    unsigned i;

    uint32_t * shdw32_table;
    uint32_t * shdw32_entry;
    uint64_t * shdw64_table;
    uint64_t * shdw64_entry;

    uint32_t * guest32_table;
    uint32_t * guest32_entry;
    uint64_t * guest64_table;
    uint64_t * guest64_entry;

    v3_cpu_mode_t mode = v3_get_vm_cpu_mode(core);

    if (page->role.hlevels == PT_PAGE_TABLE_LEVEL) {

        if (mode == PROTECTED) {

            shdw32_table = (uint32_t *)V3_VAddr((void *)(addr_t)CR3_TO_PDE32_PA(page->page_pa));
            PrintDebug(core->vm_info, core, "ulink_chil: pte lvl\n");

            for (i = 0; i < PT32_ENT_PER_PAGE; ++i) {
                shdw32_entry = (uint32_t *)&(shdw32_table[i]);
                if (*shdw32_entry & PT_PRESENT_MASK) {
                    rmap_remove(core, (addr_t)shdw32_entry);
                    PrintDebug(core->vm_info, core, "ulink_chil: %d pte: shadow %x\n", i, *shdw32_entry);
                }
                memset((void *)shdw32_entry, 0, sizeof(uint32_t));
            }
            PrintDebug(core->vm_info, core, "ulink_chil: return pte\n");
            return;

        } else if (mode == LONG_32_COMPAT || mode == LONG) {

            shdw64_table = (uint64_t *)V3_VAddr((void *)(addr_t)CR3_TO_PML4E64_PA(page->page_pa));
            PrintDebug(core->vm_info, core, "ulink_chil: pte lvl\n");

            for (i = 0; i < PT_ENT_PER_PAGE; ++i) {
                shdw64_entry = (uint64_t *)&(shdw64_table[i]);
                if (*shdw64_entry & PT_PRESENT_MASK) {
                    rmap_remove(core, (addr_t)shdw64_entry);
                    PrintDebug(core->vm_info, core, "ulink_chil: %d pte: shadow %p\n", i, (void *)*((uint64_t *)shdw64_entry));
                }
                memset((void *)shdw64_entry, 0, sizeof(uint64_t));
            }
            PrintDebug(core->vm_info, core, "ulink_chil: return pte\n");
            return;
        }
    }

    PrintDebug(core->vm_info, core, "ulink_chil: pde lvl\n");
    if (mode == PROTECTED) {

        shdw32_table = (uint32_t *)V3_VAddr((void *)(addr_t)CR3_TO_PDE32_PA(page->page_pa));

        if (guest_pa_to_host_va(core, BASE_TO_PAGE_ADDR(page->guest_fn), (addr_t *)&guest32_table) == -1) {
            PrintError(core->vm_info, core, "Invalid Guest PDE Address: 0x%p\n", (void *)BASE_TO_PAGE_ADDR(page->guest_fn));
            return;
        }

        for (i = 0; i < PT32_ENT_PER_PAGE; ++i) {
            int present;
            shdw32_entry = (uint32_t *)&(shdw32_table[i]);
            guest32_entry = (uint32_t *)&(guest32_table[i]);
            present = *shdw32_entry & PT_PRESENT_MASK;
            if (present) PrintDebug(core->vm_info, core, "ulink_chil: pde %dth: shadow %x\n", i, *((uint32_t *)shdw32_entry));
            memset((void *)shdw32_entry, 0, sizeof(uint32_t));
            if (present != 1) continue;

            shadow_page_search_shadow_pde(core, (addr_t)shdw32_entry, (addr_t)guest32_entry, page->role.hlevels);
        }
        PrintDebug(core->vm_info, core, "ulink_child: before return at pde level\n");
        return;

    } else if (mode == LONG_32_COMPAT || mode == LONG) {

        shdw64_table = (uint64_t *)V3_VAddr((void *)(addr_t)CR3_TO_PML4E64_PA(page->page_pa));

        if (guest_pa_to_host_va(core, BASE_TO_PAGE_ADDR(page->guest_fn), (addr_t *)&guest64_table) == -1) {
            if (page->role.hlevels == PT_DIRECTORY_LEVEL)
                PrintError(core->vm_info, core, "Invalid Guest PDE Address: 0x%p\n", (void *)BASE_TO_PAGE_ADDR(page->guest_fn));
            if (page->role.hlevels == PT32E_ROOT_LEVEL)
                PrintError(core->vm_info, core, "Invalid Guest PDPE Address: 0x%p\n", (void *)BASE_TO_PAGE_ADDR(page->guest_fn));
            if (page->role.hlevels == PT64_ROOT_LEVEL)
                PrintError(core->vm_info, core, "Invalid Guest PML4E Address: 0x%p\n", (void *)BASE_TO_PAGE_ADDR(page->guest_fn));
            return;
        }

        for (i = 0; i < PT_ENT_PER_PAGE; ++i) {
            int present;
            shdw64_entry = (uint64_t *)&(shdw64_table[i]);
            guest64_entry = (uint64_t *)&(guest64_table[i]);
            present = *shdw64_entry & PT_PRESENT_MASK;
            if (present) PrintDebug(core->vm_info, core, "ulink_chil: pde: shadow %p\n", (void *)*((uint64_t *)shdw64_entry));
            memset((void *)shdw64_entry, 0, sizeof(uint64_t));
            if (present != 1) continue;

            shadow_page_search_shadow_pde(core, (addr_t)shdw64_entry, (addr_t)guest64_entry, page->role.hlevels);
        }
        // PrintDebug(core->vm_info, core, "ulink_chil: return pde\n");
    }
}

static void shadow_page_put_page(struct guest_info * core, struct shadow_page_cache_data * page, addr_t shadow_pde) {

    PrintDebug(core->vm_info, core, "put_page: start\n");
    shadow_page_remove_shadow_pde(core, page, shadow_pde);

    PrintDebug(core->vm_info, core, "put_page: end\n");
}

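/*
 * Retire a shadow page: detach it from all parent shadow PDEs, unlink its
 * children, then either free it or, if it backs the current guest CR3, keep
 * it parked on the active list for reuse.
 */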
static void shadow_zap_page(struct guest_info * core, struct shadow_page_cache_data * page) {

    addr_t shadow_pde;
    addr_t cr3_base_addr = 0;
    v3_cpu_mode_t mode = v3_get_vm_cpu_mode(core);

    PrintDebug(core->vm_info, core, "zap: multimapped %d, metaphysical %d\n", page->multimapped, page->role.metaphysical);

    // detach every parent shadow PDE that points at this page
    while (page->multimapped || page->shadow_pde) {
        if (!page->multimapped) {
            shadow_pde = page->shadow_pde;
        } else {
            struct pde_chain * chain;
            chain = container_of(page->shadow_pdes.first, struct pde_chain, link);
            shadow_pde = chain->shadow_pdes[0];
        }
        shadow_page_put_page(core, page, shadow_pde);
        PrintDebug(core->vm_info, core, "zap_parent: pde: shadow %p\n", (void *)*((addr_t *)shadow_pde));
        memset((void *)shadow_pde, 0, sizeof(uint32_t)); // note: clears only the low 32 bits of the parent entry
    }

    shadow_page_unlink_children(core, page);

    PrintDebug(core->vm_info, core, "zap: end of unlink\n");

    if (mode == PROTECTED) {
        cr3_base_addr = ((struct cr3_32 *)&(core->shdw_pg_state.guest_cr3))->pdt_base_addr;
    } else if (mode == LONG_32_COMPAT || mode == LONG) {
        cr3_base_addr = ((struct cr3_64 *)&(core->shdw_pg_state.guest_cr3))->pml4t_base_addr;
    }

    PrintDebug(core->vm_info, core, "zap: before hlist_del\n");
    PrintDebug(core->vm_info, core, "zap: page->guest_fn %p\n", (void *)page->guest_fn);

    if (page->guest_fn != (addr_t)(cr3_base_addr)) {
        PrintDebug(core->vm_info, core, "zap: first hlist_del\n");

        hlist_del(&page->hash_link);
        shadow_free_page(core, page);

    } else {
        PrintDebug(core->vm_info, core, "zap: second hlist_del\n");

        // the page backing the current guest CR3 stays on the active list
        list_del(&page->link);
        list_add(&page->link, &core->active_shadow_pages);
    }

    PrintDebug(core->vm_info, core, "zap: end hlist_del\n");
}

int shadow_zap_hierarchy_32(struct guest_info * core, struct shadow_page_cache_data * page) {

    unsigned i;
    pde32_t * shadow_pd;
    pde32_t * shadow_pde;
    pde32_t * guest_pd;
    pde32_t * guest_pde;

    if (page->role.hlevels != 2) return -1;

    shadow_pd = CR3_TO_PDE32_VA(page->page_pa);
    if (guest_pa_to_host_va(core, BASE_TO_PAGE_ADDR(page->guest_fn), (addr_t *)&guest_pd) == -1) {
        PrintError(core->vm_info, core, "Invalid Guest PDE Address: 0x%p\n", (void *)BASE_TO_PAGE_ADDR(page->guest_fn));
        return -1;
    }

    for (i = 0; i < PT32_ENT_PER_PAGE; ++i) {
        int present;
        shadow_pde = (pde32_t *)&(shadow_pd[i]);
        guest_pde = (pde32_t *)&(guest_pd[i]);
        present = shadow_pde->present;
        if (shadow_pde->present) PrintDebug(core->vm_info, core, "ulink_child: pde shadow %x\n", *((uint32_t *)shadow_pde));
        memset((void *)shadow_pde, 0, sizeof(uint32_t));
        if (present != 1) continue;

        struct shadow_page_cache_data * shdw_page;
        unsigned index;
        struct hlist_head * bucket;
        struct hlist_node * node;
        int hugepage_access = 0;
        int metaphysical = 0;
        union shadow_page_role role;
        v3_cpu_mode_t mode = v3_get_vm_cpu_mode(core);

        if (((pde32_t *)guest_pde)->large_page == 1) {
            hugepage_access = (((pde32_4MB_t *)guest_pde)->writable) | (((pde32_4MB_t *)guest_pde)->user_page << 1);
            metaphysical = 1;
        }

        if (mode == REAL || mode == PROTECTED) role.glevels = PT32_ROOT_LEVEL;
        // exceptional; in the long term this should be an argument
        else if (mode == PROTECTED_PAE) role.glevels = PT32E_ROOT_LEVEL;
        else if (mode == LONG || mode == LONG_32_COMPAT) role.glevels = PT64_ROOT_LEVEL;

        role.hlevels = PT_PAGE_TABLE_LEVEL;
        role.metaphysical = metaphysical;
        role.hugepage_access = hugepage_access;

        index = shadow_page_table_hashfn(guest_pde->pt_base_addr) % NUM_SHADOW_PAGES;
        bucket = &core->shadow_page_hash[index];

        hlist_for_each_entry(shdw_page, node, bucket, hash_link)
            if (shdw_page->guest_fn == (guest_pde->pt_base_addr) && (shdw_page->role.word == role.word)) {
                shadow_zap_page(core, shdw_page);
            }
    }

    shadow_zap_page(core, page);

    return 0;
}

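/*
 * Zap every non-metaphysical shadow page that shadows the given guest frame,
 * lifting its write protection. Returns nonzero if any page was zapped.
 */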
int shadow_unprotect_page(struct guest_info * core, addr_t guest_fn) {

    unsigned index;
    struct hlist_head * bucket;
    struct shadow_page_cache_data * page = NULL;
    struct hlist_node * node;
    struct hlist_node * n;
    int r = 0;

    index = shadow_page_table_hashfn(guest_fn) % NUM_SHADOW_PAGES;
    bucket = &core->shadow_page_hash[index];
    PrintDebug(core->vm_info, core, "unprotect: gfn %p\n", (void *)guest_fn);

    hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
    // hlist_for_each_entry(page, node, bucket, hash_link) {
        if ((page->guest_fn == guest_fn) && !(page->role.metaphysical)) {
            PrintDebug(core->vm_info, core, "unprotect: match page.gfn %p page.role %x gfn %p\n", (void *)page->guest_fn, page->role.word, (void *)guest_fn);
            shadow_zap_page(core, page);
            r = 1;
        }
    }

    PrintDebug(core->vm_info, core, "at shadow_unprotect_page return %d\n", r);
    return r;
}

/*
 * Reverse mapping data structures:
 *
 * If page_private bit zero is zero, then page_private points to the shadow
 * page table entry that points to the page's address.
 *
 * If page_private bit zero is one, then page_private & ~1 points to a
 * struct rmap containing more mappings.
 */

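/*
 * Example of the encoding (illustrative addresses): if frame F is mapped by a
 * single shadow PTE at 0x1000, mem_map[F] holds 0x1000. When a second PTE is
 * added, a struct rmap D is allocated to hold both, and mem_map[F] becomes
 * (addr_t)D | 1; bit zero distinguishes the two cases.
 */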
void rmap_add(struct guest_info * core, addr_t shadow_pte) {
    struct rmap * desc;
    addr_t page_private = 0;
    gen_pt_t * shadow_pte_gen;
    addr_t page_base_addr = 0;
    addr_t * mem_map;
    int i;

    v3_cpu_mode_t mode = v3_get_vm_cpu_mode(core);

    shadow_pte_gen = (gen_pt_t *)shadow_pte;

    if (mode == PROTECTED) {
        page_base_addr = ((pte32_t *)shadow_pte)->page_base_addr;
        PrintDebug(core->vm_info, core, "at rmap_add shadow_pte: %x\n", (uint32_t)*((uint32_t *)shadow_pte));

    } else if (mode == LONG_32_COMPAT || mode == LONG) {
        page_base_addr = ((pte64_t *)shadow_pte)->page_base_addr;
        PrintDebug(core->vm_info, core, "at rmap_add shadow_pte: %p\n", (void *)*((uint64_t *)shadow_pte));
    }

    PrintDebug(core->vm_info, core, "debug rmap: at rmap_add shadow_pte->page_base_addr (%p), shadow_pte_present %d, shadow_pte_writable %d\n",
               (void *)BASE_TO_PAGE_ADDR(page_base_addr), (shadow_pte_gen->present), (shadow_pte_gen->writable));

    // only present, writable mappings are tracked in the reverse map
    if (shadow_pte_gen->present == 0 || shadow_pte_gen->writable == 0)
        return;

    PrintDebug(core->vm_info, core, "at rmap_add host_fn %p\n", (void *)BASE_TO_PAGE_ADDR(page_base_addr));

    mem_map = core->vm_info->mem_map.base_region.mem_map;
    page_private = mem_map[page_base_addr];

    PrintDebug(core->vm_info, core, "at rmap_add page_private %p\n", (void *)page_private);

    if (!page_private) {
        PrintDebug(core->vm_info, core, "at rmap_add initial\n");
        mem_map[page_base_addr] = (addr_t)shadow_pte;
        PrintDebug(core->vm_info, core, "rmap_add: shadow_pte %p\n", (void *)shadow_pte);

    } else if (!(page_private & 1)) {
        PrintDebug(core->vm_info, core, "at rmap_add into multi\n");

        desc = shadow_alloc_rmap(core);
        desc->shadow_ptes[0] = page_private;
        desc->shadow_ptes[1] = shadow_pte;
        mem_map[page_base_addr] = (addr_t)desc | 1;

        PrintDebug(core->vm_info, core, "rmap_add: desc %p desc|1 %p\n", (void *)desc, (void *)((addr_t)desc | 1));

    } else {
        PrintDebug(core->vm_info, core, "at rmap_add multimap\n");
        desc = (struct rmap *)(page_private & ~1ul);

        while (desc->more && desc->shadow_ptes[RMAP_EXT - 1]) desc = desc->more;

        if (desc->shadow_ptes[RMAP_EXT - 1]) {
            desc->more = shadow_alloc_rmap(core);
            desc = desc->more;
        }

        for (i = 0; desc->shadow_ptes[i]; ++i) ;
        desc->shadow_ptes[i] = shadow_pte;
    }
}

static void rmap_desc_remove_entry(struct guest_info * core,
                                   addr_t * page_private,
                                   struct rmap * desc,
                                   int i,
                                   struct rmap * prev_desc)
{
    int j;

    // move the last occupied slot into the vacated one
    for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) ;
    desc->shadow_ptes[i] = desc->shadow_ptes[j];
    desc->shadow_ptes[j] = 0;

    PrintDebug(core->vm_info, core, "rmap_desc_rm: i %d j %d\n", i, j);

    if (j != 0) return;

    if (!prev_desc && !desc->more) {
        PrintDebug(core->vm_info, core, "rmap_desc_rm: no more no less\n");
        *page_private = desc->shadow_ptes[0];
    } else { // more should be null
        if (prev_desc) {
            PrintDebug(core->vm_info, core, "rmap_desc_rm: no more\n");
            prev_desc->more = desc->more;
        } else {
            PrintDebug(core->vm_info, core, "rmap_desc_rm: no less\n");
            *page_private = (addr_t)desc->more | 1;
        }
    }
    shadow_free_rmap(core, desc);
}

static void rmap_remove(struct guest_info * core, addr_t shadow_pte) {
    struct rmap * desc;
    struct rmap * prev_desc;
    addr_t page_private = 0;
    gen_pt_t * shadow_pte_gen;
    addr_t page_base_addr = 0;
    addr_t * mem_map;
    int i;

    v3_cpu_mode_t mode = v3_get_vm_cpu_mode(core);

    if (mode == PROTECTED) {
        PrintDebug(core->vm_info, core, "rmap_rm: PROTECTED %d\n", mode);
        page_base_addr = ((pte32_t *)shadow_pte)->page_base_addr;

    } else if (mode == LONG_32_COMPAT || mode == LONG) {
        PrintDebug(core->vm_info, core, "rmap_rm: LONG_32_COMPAT/LONG %d\n", mode);
        page_base_addr = ((pte64_t *)shadow_pte)->page_base_addr;

    } else {
        PrintDebug(core->vm_info, core, "rmap_rm: mode %d\n", mode);
        return;
    }

    shadow_pte_gen = (gen_pt_t *)shadow_pte;

    if (shadow_pte_gen->present == 0 || shadow_pte_gen->writable == 0) {
        PrintDebug(core->vm_info, core, "rmap_rm: present %d, write %d, pte %p\n",
                   shadow_pte_gen->present, shadow_pte_gen->writable,
                   (void *)*((addr_t *)shadow_pte));
        return;
    }

    PrintDebug(core->vm_info, core, "rmap_rm: shadow_pte->page_base_addr (%p)\n", (void *)BASE_TO_PAGE_ADDR(page_base_addr));

    mem_map = core->vm_info->mem_map.base_region.mem_map;
    page_private = mem_map[page_base_addr];

    PrintDebug(core->vm_info, core, "rmap_rm: page_private %p page_private&1 %p\n", (void *)page_private, (void *)(page_private & 1));

    if (!page_private) {
        PrintDebug(core->vm_info, core, "rmap_rm: single page_private %p\n", (void *)page_private);

    } else if (!(page_private & 1)) {
        PrintDebug(core->vm_info, core, "rmap_rm: multi page_private %p\n", (void *)page_private);
        mem_map[page_base_addr] = (addr_t)0;

    } else {
        PrintDebug(core->vm_info, core, "rmap_rm: multimap page_private %p\n", (void *)page_private);
        desc = (struct rmap *)(page_private & ~1ul);
        prev_desc = NULL;

        while (desc) {
            PrintDebug(core->vm_info, core, "rmap_rm: desc loop\n");
            for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
                if (desc->shadow_ptes[i] == shadow_pte) {
                    PrintDebug(core->vm_info, core, "rmap_rm: rmap_desc_remove_entry i %d\n", i);
                    rmap_desc_remove_entry(core, &mem_map[page_base_addr], desc, i, prev_desc);
                    return;
                }
            prev_desc = desc;
            desc = desc->more;
        }
    }
}

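/*
 * Write-protect every shadow PTE that maps the given guest frame by walking
 * the frame's reverse map. rmap_remove() is called before clearing the
 * writable bit because the rmap only tracks writable mappings.
 */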
static void rmap_write_protect(struct guest_info * core, addr_t guest_fn) {
    struct rmap * desc;
    // pte32_t * shadow_pte;
    addr_t shadow_pte;
    addr_t page_private;
    addr_t host_pa;

    PrintDebug(core->vm_info, core, "rmap_wrprot: gfn %p\n", (void *)guest_fn);

    if (guest_pa_to_host_pa(core, BASE_TO_PAGE_ADDR(guest_fn), &host_pa) != 0) {
        PrintDebug(core->vm_info, core, "rmap_wrprot: error\n");
        return;
    }

    page_private = core->vm_info->mem_map.base_region.mem_map[PAGE_BASE_ADDR(host_pa)];

    PrintDebug(core->vm_info, core, "rmap_wrprot: host_fn %p\n", (void *)PAGE_BASE_ADDR(host_pa));

    while (page_private) {
        PrintDebug(core->vm_info, core, "rmap_wrprot: page_private %p\n", (void *)page_private);
        if (!(page_private & 1)) {
            PrintDebug(core->vm_info, core, "rmap_wrprot: reverse desc single\n");
            shadow_pte = page_private;

        } else {
            desc = (struct rmap *)(page_private & ~1ul);
            PrintDebug(core->vm_info, core, "rmap_wrprot: reverse desc multimap\n");
            shadow_pte = desc->shadow_ptes[0];
        }

        PrintDebug(core->vm_info, core, "rmap_wrprot: pg_priv %p, host_fn %p, shdw_pte %p\n",
                   (void *)page_private, (void *)PAGE_BASE_ADDR(host_pa), (void *)*((uint64_t *)shadow_pte));

        rmap_remove(core, shadow_pte);

        // PrintDebug(core->vm_info, core, "rmap_wrprot: shadow_pte->page_base_addr (%p)\n",
        //            (void *)BASE_TO_PAGE_ADDR(shadow_pte->page_base_addr));

        ((gen_pt_t *)shadow_pte)->writable = 0;
        PrintDebug(core->vm_info, core, "rmap_wrprot: %p\n", (void *)*((uint64_t *)shadow_pte));

        page_private = core->vm_info->mem_map.base_region.mem_map[PAGE_BASE_ADDR(host_pa)];

        PrintDebug(core->vm_info, core, "rmap_wrprot: page_private %p\n", (void *)page_private);
    }

    PrintDebug(core->vm_info, core, "rmap_wrprot: done\n");
}

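/*
 * Called before an emulated guest write to a page that is shadowed as a page
 * table. Misaligned or flooded writes zap the page (it is probably not really
 * a page table); otherwise only the affected shadow entry is cleared so the
 * fault path can re-shadow it.
 */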
void shadow_page_pre_write(struct guest_info * core, addr_t guest_pa, int bytes, int force) {
    // the guest frame number is not the guest physical address
    addr_t guest_fn = PAGE_BASE_ADDR(guest_pa);
    struct shadow_page_cache_data * page;
    struct hlist_node * node, * n;
    struct hlist_head * bucket;
    unsigned index;

    uint32_t * shdw32_table = NULL;
    uint32_t * shdw32_entry = NULL;
    uint64_t * shdw64_table = NULL;
    uint64_t * shdw64_entry = NULL;

    unsigned pte_size;
    unsigned offset = PAGE_OFFSET(guest_pa);
    unsigned misaligned = 0;
    int level;
    int flooded = 0;

    v3_cpu_mode_t mode = v3_get_vm_cpu_mode(core);

    if (guest_fn == core->last_pt_write_guest_fn) {
        ++core->last_pt_write_count;
        if (core->last_pt_write_count >= 3) flooded = 1;
    } else {
        core->last_pt_write_guest_fn = guest_fn;
        core->last_pt_write_count = 1;
    }

    PrintDebug(core->vm_info, core, "shdw_pre-write: gpa %p byte %d force %d flood %d last_gfn %p last_cnt %d\n",
               (void *)guest_pa, bytes, force, flooded, (void *)core->last_pt_write_guest_fn, core->last_pt_write_count);

    index = shadow_page_table_hashfn(guest_fn) % NUM_SHADOW_PAGES;
    bucket = &core->shadow_page_hash[index];

    PrintDebug(core->vm_info, core, "shdw_pre-write: check point after bucket\n");

    // hlist_for_each_entry_safe(page, node, bucket, hash_link) {
    hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {

        if (page->guest_fn != guest_fn || page->role.metaphysical) continue;

        pte_size = 4; // because 32-bit non-PAE for now
        pte_size = page->role.glevels == 2 ? 4 : 8;

        // flags writes that are not pte-aligned or that straddle a pte boundary
        if (!force) misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);

        if (misaligned || flooded || force) {
            /*
             * Misaligned accesses are too much trouble to fix up;
             * they also usually indicate that a page is not being
             * used as a page table.
             */
            PrintDebug(core->vm_info, core, "shdw_pre-write: misaligned\n");
            shadow_zap_page(core, page);
            continue;
        }

        level = page->role.hlevels;

        PrintDebug(core->vm_info, core, "shdw_pre-write: found one page at level %d\n", level);

        if (mode == PROTECTED) {
            shdw32_table = (uint32_t *)V3_VAddr((void *)(addr_t)BASE_TO_PAGE_ADDR(PAGE_BASE_ADDR(page->page_pa)));
            shdw32_entry = (uint32_t *)&(shdw32_table[offset / sizeof(uint32_t)]);

            if (*shdw32_entry & PT_PRESENT_MASK) {
                if (level == PT_PAGE_TABLE_LEVEL) {
                    PrintDebug(core->vm_info, core, "shdw_pre-write: pte idx %d\n", (unsigned int)(offset / sizeof(uint32_t)));
                    rmap_remove(core, (addr_t)shdw32_entry);
                    memset((void *)shdw32_entry, 0, sizeof(uint32_t));
                } else {
                    shadow_page_remove_shadow_pde(core, page, (addr_t)shdw32_entry);
                    memset((void *)shdw32_entry, 0, sizeof(uint32_t));
                }
            }

        } else if (mode == LONG_32_COMPAT || mode == LONG) {

            shdw64_table = (uint64_t *)V3_VAddr((void *)(addr_t)BASE_TO_PAGE_ADDR(PAGE_BASE_ADDR(page->page_pa)));
            shdw64_entry = (uint64_t *)&(shdw64_table[offset / sizeof(uint64_t)]);

            if (*shdw64_entry & PT_PRESENT_MASK) {
                if (level == PT_PAGE_TABLE_LEVEL) {
                    PrintDebug(core->vm_info, core, "shdw_pre-write: pte idx %d\n", (unsigned int)(offset / sizeof(uint64_t)));
                    rmap_remove(core, (addr_t)shdw64_entry);
                    memset((void *)shdw64_entry, 0, sizeof(uint64_t));
                } else {
                    shadow_page_remove_shadow_pde(core, page, (addr_t)shdw64_entry);
                    memset((void *)shdw64_entry, 0, sizeof(uint64_t));
                }
            }
        }
    }
}

// emulation for synchronization
void shadow_page_post_write(struct guest_info * core, addr_t guest_pa) {

}

int shadow_unprotect_page_virt(struct guest_info * core, addr_t guest_va) {
    addr_t guest_pa;

    if (guest_va_to_guest_pa(core, guest_va, &guest_pa) != 0) {
        PrintError(core->vm_info, core, "In GVA->GPA: Invalid GVA(%p)->GPA lookup\n",
                   (void *)guest_va);
        return -1;
    }

    return shadow_unprotect_page(core, PAGE_BASE_ADDR(guest_pa));
}

void shadow_free_some_pages(struct guest_info * core) {
    while (core->n_free_shadow_pages < REFILE_PAGES) {
        struct shadow_page_cache_data * page;
        page = container_of(core->active_shadow_pages.prev,
                            struct shadow_page_cache_data, link);
        shadow_zap_page(core, page);
    }
}

void shadow_free_all_pages(struct guest_info * core) {

    struct shadow_page_cache_data * sp, * node;
    list_for_each_entry_safe(sp, node, &core->active_shadow_pages, link) {
        shadow_zap_page(core, sp);
    }
}

static struct shadow_page_cache_data * create_new_shadow_pt(struct guest_info * core);

#include "vmm_shdw_pg_cache_32.h"
#include "vmm_shdw_pg_cache_32pae.h"
#include "vmm_shdw_pg_cache_64.h"

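/*
 * Glue for the generic shadow paging layer: the VTLB_CACHING implementation
 * is exposed through a v3_shdw_pg_impl and registered at the bottom of this
 * file.
 */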
static int vtlb_caching_init(struct v3_vm_info * vm, v3_cfg_tree_t * cfg) {

    V3_Print(vm, VCORE_NONE, "VTLB Caching initialization\n");

    return 0;
}

static int vtlb_caching_deinit(struct v3_vm_info * vm) {
    return 0;
}

static int vtlb_caching_local_init(struct guest_info * core) {

    V3_Print(core->vm_info, core, "VTLB local initialization\n");

    INIT_LIST_HEAD(&core->active_shadow_pages);
    INIT_LIST_HEAD(&core->free_pages);

    alloc_shadow_pages(core);

    shadow_topup_caches(core);

    core->prev_cr3_pdt_base = 0;

    return 0;
}

static int vtlb_caching_activate_shdw_pt(struct guest_info * core) {
    switch (v3_get_vm_cpu_mode(core)) {

        case PROTECTED:
            return activate_shadow_pt_32(core);
        case PROTECTED_PAE:
            return activate_shadow_pt_32pae(core);
        case LONG:
        case LONG_32_COMPAT:
        case LONG_16_COMPAT:
            return activate_shadow_pt_64(core);
        default:
            PrintError(core->vm_info, core, "Invalid CPU mode: %s\n", v3_cpu_mode_to_str(v3_get_vm_cpu_mode(core)));
            return -1;
    }
}

static int vtlb_caching_invalidate_shdw_pt(struct guest_info * core) {
    // a flush of the virtual TLB is handled by reactivating the shadow page tables
    return vtlb_caching_activate_shdw_pt(core);
}

static int vtlb_caching_handle_pf(struct guest_info * core, addr_t fault_addr, pf_error_t error_code) {

    switch (v3_get_vm_cpu_mode(core)) {
        case PROTECTED:
            return handle_shadow_pagefault_32(core, fault_addr, error_code);
        case PROTECTED_PAE:
            return handle_shadow_pagefault_32pae(core, fault_addr, error_code);
        case LONG:
        case LONG_32_COMPAT:
        case LONG_16_COMPAT:
            return handle_shadow_pagefault_64(core, fault_addr, error_code);
        default:
            PrintError(core->vm_info, core, "Unhandled CPU Mode: %s\n", v3_cpu_mode_to_str(v3_get_vm_cpu_mode(core)));
            return -1;
    }
}

static int vtlb_caching_handle_invlpg(struct guest_info * core, addr_t vaddr) {

    switch (v3_get_vm_cpu_mode(core)) {
        case PROTECTED:
            return handle_shadow_invlpg_32(core, vaddr);
        case PROTECTED_PAE:
            return handle_shadow_invlpg_32pae(core, vaddr);
        case LONG:
        case LONG_32_COMPAT:
        case LONG_16_COMPAT:
            return handle_shadow_invlpg_64(core, vaddr);
        default:
            PrintError(core->vm_info, core, "Invalid CPU mode: %s\n", v3_cpu_mode_to_str(v3_get_vm_cpu_mode(core)));
            return -1;
    }
}

static struct v3_shdw_pg_impl vtlb_caching_impl = {
    .name = "VTLB_CACHING",
    .init = vtlb_caching_init,
    .deinit = vtlb_caching_deinit,
    .local_init = vtlb_caching_local_init,
    .handle_pagefault = vtlb_caching_handle_pf,
    .handle_invlpg = vtlb_caching_handle_invlpg,
    .activate_shdw_pt = vtlb_caching_activate_shdw_pt,
    .invalidate_shdw_pt = vtlb_caching_invalidate_shdw_pt
};

register_shdw_pg_impl(&vtlb_caching_impl);

#endif // V3_CONFIG_SHADOW_CACHE