From: Peter Dinda Date: Wed, 4 Aug 2010 00:19:20 +0000 (-0500) Subject: Functional 2 core linux guest X-Git-Url: http://v3vee.org/palacios/gitweb/gitweb.cgi?p=palacios.git;a=commitdiff_plain;h=e531d13b53ac8b32bca19131dd7a2824cb17eff9 Functional 2 core linux guest Main addition is logical destination mode for IPIs --- diff --git a/palacios/include/devices/apic.h b/palacios/include/devices/apic.h index f78e2a6..e4452e7 100644 --- a/palacios/include/devices/apic.h +++ b/palacios/include/devices/apic.h @@ -25,6 +25,7 @@ #include + int v3_apic_raise_intr(struct guest_info * info, struct vm_device * apic_dev, int intr_num); diff --git a/palacios/include/devices/icc_bus.h b/palacios/include/devices/icc_bus.h index da5f39f..c3ec43a 100644 --- a/palacios/include/devices/icc_bus.h +++ b/palacios/include/devices/icc_bus.h @@ -23,6 +23,7 @@ struct v3_icc_ops { int (*raise_intr)(struct guest_info * core, int intr_num, void * private_data); + int (*should_deliver_flat)(struct guest_info * core, uint8_t mda, void * private_data); }; @@ -39,9 +40,10 @@ int v3_icc_register_ioapic(struct v3_vm_info *vm, struct vm_device * icc_bus, ui * @param apic_src - The source APIC id. * @param apic_num - The remote APIC number. * @param icr - A copy of the APIC's ICR. 
(LAPIC-style ICR, clone from redir table for ioapics) + * @param dfr - A copy of the APIC's DFR (LAPIC-style DFR) & @param extirq - irq for external interrupts (e.g., from 8259) */ -int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t apic_src, uint64_t icr, uint32_t ext_irq); +int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t apic_src, uint64_t icr, uint32_t dfr, uint32_t ext_irq); #if 0 diff --git a/palacios/include/palacios/vm_guest.h b/palacios/include/palacios/vm_guest.h index b13f462..286592d 100644 --- a/palacios/include/palacios/vm_guest.h +++ b/palacios/include/palacios/vm_guest.h @@ -66,7 +66,6 @@ struct guest_info { struct v3_shdw_pg_state shdw_pg_state; addr_t direct_map_pt; - // This structure is how we get interrupts for the guest struct v3_intr_core_state intr_core_state; @@ -120,6 +119,8 @@ struct v3_vm_info { addr_t mem_size; // In bytes for now struct v3_mem_map mem_map; + v3_paging_size_t paging_size; // for nested paging + struct v3_mem_hooks mem_hooks; struct v3_shdw_impl_state shdw_impl; diff --git a/palacios/include/palacios/vmm.h b/palacios/include/palacios/vmm.h index bf13c3f..f37ad83 100644 --- a/palacios/include/palacios/vmm.h +++ b/palacios/include/palacios/vmm.h @@ -67,14 +67,26 @@ struct guest_info; -#define V3_AllocPages(num_pages) \ - ({ \ - extern struct v3_os_hooks * os_hooks; \ - void * ptr = 0; \ - if ((os_hooks) && (os_hooks)->allocate_pages) { \ - ptr = (os_hooks)->allocate_pages(num_pages); \ - } \ - ptr; \ +/* 4KB-aligned */ +#define V3_AllocPages(num_pages) \ + ({ \ + extern struct v3_os_hooks * os_hooks; \ + void * ptr = 0; \ + if ((os_hooks) && (os_hooks)->allocate_pages) { \ + ptr = (os_hooks)->allocate_pages(num_pages,PAGE_SIZE_4KB); \ + } \ + ptr; \ + }) + + +#define V3_AllocAlignedPages(num_pages, align) \ + ({ \ + extern struct v3_os_hooks * os_hooks; \ + void * ptr = 0; \ + if ((os_hooks) && (os_hooks)->allocate_pages) { \ + ptr = (os_hooks)->allocate_pages(num_pages,align); \ + } \ + ptr; \ }) @@ -239,7 
+251,7 @@ struct v3_os_hooks { void (*print)(const char * format, ...) __attribute__ ((format (printf, 1, 2))); - void *(*allocate_pages)(int numPages); + void *(*allocate_pages)(int numPages, unsigned int alignment); void (*free_page)(void * page); void *(*malloc)(unsigned int size); @@ -266,7 +278,7 @@ struct v3_os_hooks { unsigned int (*get_cpu)(void); void (*interrupt_cpu)(struct v3_vm_info * vm, int logical_cpu, int vector); void (*call_on_cpu)(int logical_cpu, void (*fn)(void * arg), void * arg); - void * (*start_thread_on_cpu)(int logical_cpu, int (*fn)(void * arg), void * arg, char * thread_name); + void * (*start_thread_on_cpu)(int cpu_id, int (*fn)(void * arg), void * arg, char * thread_name); }; diff --git a/palacios/include/palacios/vmm_mem.h b/palacios/include/palacios/vmm_mem.h index 7b6d5d4..a8e776a 100644 --- a/palacios/include/palacios/vmm_mem.h +++ b/palacios/include/palacios/vmm_mem.h @@ -103,6 +103,7 @@ int v3_add_shadow_mem(struct v3_vm_info * vm, uint16_t core_id, struct v3_mem_region * v3_get_mem_region(struct v3_vm_info * vm, uint16_t core_id, addr_t guest_addr); +struct v3_mem_region * v3_get_next_mem_region(struct v3_vm_info * vm, uint16_t core_id, addr_t guest_addr); diff --git a/palacios/include/palacios/vmm_types.h b/palacios/include/palacios/vmm_types.h index 0c95d0d..fc4fd5f 100644 --- a/palacios/include/palacios/vmm_types.h +++ b/palacios/include/palacios/vmm_types.h @@ -29,6 +29,7 @@ typedef enum {SHADOW_PAGING, NESTED_PAGING} v3_paging_mode_t; typedef enum {VM_RUNNING, VM_STOPPED, VM_SUSPENDED, VM_ERROR, VM_EMULATING} v3_vm_operating_mode_t; +typedef enum {PAGING_4KB, PAGING_2MB} v3_paging_size_t; typedef enum {INIT, SIPI, REAL, /*UNREAL,*/ PROTECTED, PROTECTED_PAE, LONG, LONG_32_COMPAT, LONG_16_COMPAT} v3_cpu_mode_t; typedef enum {PHYSICAL_MEM, VIRTUAL_MEM} v3_mem_mode_t; diff --git a/palacios/src/devices/apic.c b/palacios/src/devices/apic.c index ad684f1..3064490 100644 --- a/palacios/src/devices/apic.c +++ 
b/palacios/src/devices/apic.c @@ -118,6 +118,8 @@ typedef enum { APIC_TMR_INT, APIC_THERM_INT, APIC_PERF_INT, + + struct apic_msr { union { uint64_t value; @@ -133,8 +135,6 @@ struct apic_msr { } __attribute__((packed)); - - struct apic_state { addr_t base_addr; @@ -188,6 +188,10 @@ struct apic_state { v3_lock_t lock; }; + + + + static int apic_read(struct guest_info * core, addr_t guest_addr, void * dst, uint_t length, void * priv_data); static int apic_write(struct guest_info * core, addr_t guest_addr, void * src, uint_t length, void * priv_data); @@ -888,7 +892,9 @@ static int apic_write(struct guest_info * core, addr_t guest_addr, void * src, u // ICC??? PrintDebug("apic %u: core %u: sending cmd 0x%llx to apic %u\n",apic->lapic_id.val,core->cpu_id, apic->int_cmd.val, apic->int_cmd.dst); - v3_icc_send_ipi(apic->icc_bus, apic->lapic_id.val, apic->int_cmd.val,0); + if (v3_icc_send_ipi(apic->icc_bus, apic->lapic_id.val, apic->int_cmd.val,apic->dst_fmt.val,0)==-1) { + return -1; + } break; case INT_CMD_HI_OFFSET: apic->int_cmd.hi = op_val; @@ -1098,8 +1104,20 @@ static struct v3_device_ops dev_ops = { +static int apic_should_deliver_flat(struct guest_info * core, uint8_t mda, void * private_data) +{ + struct apic_state * apic = (struct apic_state *)private_data; + + if (mda==0xff || (apic->log_dst.dst_log_id & mda)) { + return 1; + } else { + return 0; + } +} + static struct v3_icc_ops icc_ops = { .raise_intr = apic_raise_intr, + .should_deliver_flat = apic_should_deliver_flat, }; diff --git a/palacios/src/devices/icc_bus.c b/palacios/src/devices/icc_bus.c index c02e7f5..99eed8f 100644 --- a/palacios/src/devices/icc_bus.c +++ b/palacios/src/devices/icc_bus.c @@ -23,7 +23,6 @@ #include #include - #define MAX_APICS 256 #ifndef CONFIG_DEBUG_ICC_BUS @@ -202,11 +201,13 @@ static int deliver(uint32_t src_apic, struct apic_data *dest_apic, struct int_cm // icr_data contains interrupt vector *except* for ext_int // in which case it is given via irq // -int 
v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t src_apic, uint64_t icr_data, uint32_t extirq) { +int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t src_apic, uint64_t icr_data, + uint32_t dfr_data, uint32_t extirq) { PrintDebug("icc_bus: icc_bus=%p, src_apic=%u, icr_data=%llx, extirq=%u\n",icc_bus,src_apic,icr_data,extirq); struct int_cmd_reg *icr = (struct int_cmd_reg *)&icr_data; + struct dst_fmt_reg *dfr = (struct dst_fmt_reg*)&dfr_data; struct icc_bus_state * state = (struct icc_bus_state *)icc_bus->private_data; // initial sanity checks @@ -218,23 +219,63 @@ int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t src_apic, uint64_t icr_ PrintError("icc_bus: Attempted send to unregistered apic id=%u\n",icr->dst); return -1; } - - struct apic_data * dest_apic = &(state->apics[icr->dst]); - PrintDebug("icc_bus: IPI %s %u from %s %u to %s %u (icr=0x%llx) (extirq=%u)\n", - deliverymode_str[icr->del_mode], icr->vec, src_apic==state->ioapic_id ? "ioapic" : "apic", - src_apic, shorthand_str[icr->dst_shorthand], icr->dst,icr->val, + PrintDebug("icc_bus: IPI %s %u from %s %u to %s %s %u (icr=0x%llx, dfr=0x%x) (extirq=%u)\n", + deliverymode_str[icr->del_mode], icr->vec, + src_apic==state->ioapic_id ? "ioapic" : "apic", + src_apic, + icr->dst_mode==0 ? 
"(physical)" : "(logical)", + shorthand_str[icr->dst_shorthand], icr->dst,icr->val, dfr->val, extirq); + /* + + if (icr->dst==state->ioapic_id) { + PrintError("icc_bus: Attempted send to ioapic ignored\n"); + return -1; + } + */ switch (icr->dst_shorthand) { case 0: // no shorthand - if (deliver(src_apic,dest_apic,icr,state,extirq)) { - return -1; + if (icr->dst_mode==0) { + // physical delivery + struct apic_data * dest_apic = &(state->apics[icr->dst]); + if (deliver(src_apic,dest_apic,icr,state,extirq)) { + return -1; + } + } else { + // logical delivery + uint8_t mda = icr->dst; // message destination address, not physical address + + if (dfr->model==0xf) { + // flat model + // deliver irq if + // mda of sender & ldr of receiver is nonzero + // mda=0xff means broadcaset to all + + int i; + for (i=0;i<MAX_APICS;i++) { + struct apic_data *dest_apic=&(state->apics[i]); + if (dest_apic->present && + dest_apic->ops->should_deliver_flat(dest_apic->core, + mda, + dest_apic->priv_data)) { + if (deliver(src_apic,dest_apic,icr,state,extirq)) { + return -1; + } + } + } + } else { + // cluster model + PrintError("icc_bus: use of cluster model not yet supported\n"); + return -1; + } } + break; case 1: // self @@ -242,6 +283,7 @@ int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t src_apic, uint64_t icr_ PrintError("icc_bus: ioapic attempting to send to itself\n"); return -1; } + struct apic_data *dest_apic=&(state->apics[src_apic]); if (deliver(src_apic,dest_apic,icr,state,extirq)) { return -1; } @@ -251,7 +293,7 @@ int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t src_apic, uint64_t icr_ case 3: { // all and all-but-me int i; for (i=0;i<MAX_APICS;i++) { - dest_apic=&(state->apics[i]); + struct apic_data *dest_apic=&(state->apics[i]); if (dest_apic->present && (i!=src_apic || icr->dst_shorthand==2)) { if (deliver(src_apic,dest_apic,icr,state,extirq)) { return -1; @@ -260,7 +302,7 @@ int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t src_apic, uint64_t icr_ } } break; - } + } return 0; } diff --git a/palacios/src/devices/io_apic.c 
b/palacios/src/devices/io_apic.c index ad238f3..dbbab94 100644 --- a/palacios/src/devices/io_apic.c +++ b/palacios/src/devices/io_apic.c @@ -291,7 +291,9 @@ static int ioapic_raise_irq(struct v3_vm_info * vm, void * private_data, int irq icr.dst_shorthand=0; // no shorthand icr.rsvd2=0; - v3_icc_send_ipi(ioapic->icc_bus, ioapic->ioapic_id.val,icr.val, irq); + // Note: 0 yhere is "cluster model", but it should be irrelevant + // since we are sending this as a physical destination + v3_icc_send_ipi(ioapic->icc_bus, ioapic->ioapic_id.val,icr.val, 0, irq); } return 0; diff --git a/palacios/src/palacios/svm.c b/palacios/src/palacios/svm.c index 34fed45..7b33c8c 100644 --- a/palacios/src/palacios/svm.c +++ b/palacios/src/palacios/svm.c @@ -44,6 +44,12 @@ #include +#ifndef CONFIG_DEBUG_SVM +#undef PrintDebug +#define PrintDebug(fmt, args...) +#endif + + uint32_t v3_last_exit; // This is a global pointer to the host's VMCB diff --git a/palacios/src/palacios/vmm_config.c b/palacios/src/palacios/vmm_config.c index 9ba0a11..86eb1dd 100644 --- a/palacios/src/palacios/vmm_config.c +++ b/palacios/src/palacios/vmm_config.c @@ -199,7 +199,7 @@ static int pre_config_vm(struct v3_vm_info * vm, v3_cfg_tree_t * vm_cfg) { PrintDebug("Memory=%s\n", memory_str); // Amount of ram the Guest will have, always in MB - vm->mem_size = atoi(memory_str) * 1024 * 1024; + vm->mem_size = (unsigned long)atoi(memory_str) * 1024UL * 1024UL; if (strcasecmp(vm_class, "PC") == 0) { vm->vm_class = V3_PC_VM; @@ -208,7 +208,6 @@ static int pre_config_vm(struct v3_vm_info * vm, v3_cfg_tree_t * vm_cfg) { return -1; } - #ifdef CONFIG_TELEMETRY { char * telemetry = v3_cfg_val(vm_cfg, "telemetry"); @@ -247,7 +246,8 @@ static int determine_paging_mode(struct guest_info *info, v3_cfg_tree_t * core_c v3_cfg_tree_t *vm_tree = info->vm_info->cfg_data->cfg; v3_cfg_tree_t *pg_tree = v3_cfg_subtree(vm_tree, "paging"); - char *pg_mode = v3_cfg_val(pg_tree, "mode"); + char *pg_mode = v3_cfg_val(pg_tree, "mode"); + char 
*page_size = v3_cfg_val(pg_tree, "page_size"); PrintDebug("Paging mode specified as %s\n", pg_mode); @@ -266,12 +266,21 @@ static int determine_paging_mode(struct guest_info *info, v3_cfg_tree_t * core_c info->shdw_pg_mode = SHADOW_PAGING; } } else { - PrintDebug("No paging mode specified in configuration.\n"); + PrintDebug("No paging type specified in configuration. Defaulting to shadow paging\n"); info->shdw_pg_mode = SHADOW_PAGING; } if (info->shdw_pg_mode == NESTED_PAGING) { PrintDebug("Guest Paging Mode: NESTED_PAGING\n"); + if (strcasecmp(page_size, "4kb") == 0) { /* TODO: this may not be an ideal place for this */ + info->vm_info->paging_size = PAGING_4KB; + } else if (strcasecmp(page_size, "2mb") == 0) { + info->vm_info->paging_size = PAGING_2MB; + } else { + PrintError("Invalid VM paging size: '%s'\n", page_size); + return -1; + } + PrintDebug("VM page size=%s\n", page_size); } else if (info->shdw_pg_mode == SHADOW_PAGING) { PrintDebug("Guest Paging Mode: SHADOW_PAGING\n"); } else { diff --git a/palacios/src/palacios/vmm_direct_paging_64.h b/palacios/src/palacios/vmm_direct_paging_64.h index a0408d9..c4c41e3 100644 --- a/palacios/src/palacios/vmm_direct_paging_64.h +++ b/palacios/src/palacios/vmm_direct_paging_64.h @@ -27,33 +27,91 @@ #include #include - +// Reference: AMD Software Developer Manual Vol.2 Ch.5 "Page Translation and Protection" static inline int handle_passthrough_pagefault_64(struct guest_info * info, addr_t fault_addr, pf_error_t error_code) { - pml4e64_t * pml = NULL; - pdpe64_t * pdpe = NULL; - pde64_t * pde = NULL; - pte64_t * pte = NULL; - addr_t host_addr = 0; - - int pml_index = PML4E64_INDEX(fault_addr); + pml4e64_t * pml = NULL; + pdpe64_t * pdpe = NULL; + pde64_t * pde = NULL; + pde64_2MB_t * pde2mb = NULL; + pte64_t * pte = NULL; + addr_t host_addr = 0; + + int pml_index = PML4E64_INDEX(fault_addr); int pdpe_index = PDPE64_INDEX(fault_addr); - int pde_index = PDE64_INDEX(fault_addr); - int pte_index = PTE64_INDEX(fault_addr); - - 
- + int pde_index = PDE64_INDEX(fault_addr); + int pte_index = PTE64_INDEX(fault_addr); struct v3_mem_region * region = v3_get_mem_region(info->vm_info, info->cpu_id, fault_addr); - + struct v3_mem_region * base_reg = &(info->vm_info->mem_map.base_region); + + /* If the guest has been configured for 2MiB pages, then we must check for hooked regions of + * memory which may overlap with the 2MiB page containing the faulting address (due to + * potentially differing access policies in place for e.g. i/o devices and APIC). A 2MiB page + * can be used if a) no region overlaps the page [or b) a region does overlap but fully contains + * the page]. The [bracketed] text pertains to the #if 0'd code below, state D. TODO modify this + * note if someone decides to enable this optimization. It can be tested with the SeaStar + * mapping. + * + * Examples: (CAPS regions are returned by v3_get_next_mem_region; state A returns the base reg) + * + * |region| |region| 2MiB mapped (state A) + * |reg| |REG| 2MiB mapped (state B) + * |region| |reg| |REG| |region| |reg| 4KiB mapped (state C) + * |reg| |reg| |--REGION---| [2MiB mapped (state D)] + * |--------------------------------------------| RAM + * ^ fault addr + * |----|----|----|----|----|page|----|----|----| 2MB pages + * >>>>>>>>>>>>>>>>>>>> search space + */ + addr_t pg_start = 0UL, pg_end = 0UL; // 2MiB page containing the faulting address + struct v3_mem_region * pg_next_reg = NULL; // next immediate mem reg after page start addr + bool use_large_page = false; + if (region == NULL) { - PrintError("Invalid region in passthrough page fault 64, addr=%p\n", - (void *)fault_addr); + PrintError("%s: invalid region, addr=%p\n", __FUNCTION__, (void *)fault_addr); return -1; } + // set use_large_page here + if (info->vm_info->paging_size == PAGING_2MB) { + + // guest page maps to a host page + offset (so when we shift, it aligns with a host page) + pg_start = PAGE_ADDR_2MB(fault_addr); + pg_end = (pg_start + PAGE_SIZE_2MB); + + 
PrintDebug("%s: page [%p,%p) contains address\n", __FUNCTION__, (void *)pg_start, (void *)pg_end); + + pg_next_reg = v3_get_next_mem_region(info->vm_info, info->cpu_id, pg_start); + + if (pg_next_reg == NULL) { + PrintError("%s: Error: address not in base region, %p\n", __FUNCTION__, (void *)fault_addr); + return -1; + } + + if ((pg_next_reg->guest_start == base_reg->guest_start) && + (pg_next_reg->guest_end == base_reg->guest_end)) { // next region == base region + use_large_page = 1; // State A + } else { +#if 0 // State B/C and D optimization + use_large_page = (pg_next_reg->guest_end >= pg_end) && + ((pg_next_reg->guest_start >= pg_end) || (pg_next_reg->guest_start <= pg_start)); + PrintDebug("%s: region [%p,%p) %s partial overlap with page\n", __FUNCTION__, + (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end, + (use_large_page ? "does not have" : "has")); +#else // State B/C + use_large_page = (pg_next_reg->guest_start >= pg_end); + PrintDebug("%s: region [%p,%p) %s overlap with page\n", __FUNCTION__, + (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end, + (use_large_page ? "does not have" : "has")); +#endif + } + } + + PrintDebug("%s: Address gets a 2MiB page? %s\n", __FUNCTION__, (use_large_page ? 
"yes" : "no")); + // Lookup the correct PML address based on the PAGING MODE if (info->shdw_pg_mode == SHADOW_PAGING) { pml = CR3_TO_PML4E64_VA(info->ctrl_regs.cr3); @@ -70,9 +128,9 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * info, pml[pml_index].writable = 1; pml[pml_index].user_page = 1; - pml[pml_index].pdp_base_addr = PAGE_BASE_ADDR((addr_t)V3_PAddr(pdpe)); + pml[pml_index].pdp_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pdpe)); } else { - pdpe = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pml[pml_index].pdp_base_addr)); + pdpe = V3_VAddr((void*)BASE_TO_PAGE_ADDR_4KB(pml[pml_index].pdp_base_addr)); } // Fix up the PDPE entry @@ -84,11 +142,48 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * info, pdpe[pdpe_index].writable = 1; pdpe[pdpe_index].user_page = 1; - pdpe[pdpe_index].pd_base_addr = PAGE_BASE_ADDR((addr_t)V3_PAddr(pde)); + pdpe[pdpe_index].pd_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pde)); } else { - pde = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pdpe[pdpe_index].pd_base_addr)); + pde = V3_VAddr((void*)BASE_TO_PAGE_ADDR_4KB(pdpe[pdpe_index].pd_base_addr)); + } + + // Fix up the 2MiB PDE and exit here + if (use_large_page) { + + pde2mb = (pde64_2MB_t *)pde; // all but these two lines are the same for PTE + pde2mb[pde_index].large_page = 1; + + if (pde2mb[pde_index].present == 0) { + pde2mb[pde_index].user_page = 1; + + if ((region->flags.alloced == 1) && (region->flags.read == 1)) { + // Full access + pde2mb[pde_index].present = 1; + + if (region->flags.write == 1) { + pde2mb[pde_index].writable = 1; + } else { + pde2mb[pde_index].writable = 0; + } + + if (v3_gpa_to_hpa(info, fault_addr, &host_addr) == -1) { + PrintError("Error Could not translate fault addr (%p)\n", (void *)fault_addr); + return -1; + } + + pde2mb[pde_index].page_base_addr = PAGE_BASE_ADDR_2MB(host_addr); + } else { + return region->unhandled(info, fault_addr, fault_addr, region, error_code); + } + } else { + // We fix all permissions on 
the first pass, + // so we only get here if its an unhandled exception + + return region->unhandled(info, fault_addr, fault_addr, region, error_code); + } } + // Continue with the 4KiB page heirarchy // Fix up the PDE entry if (pde[pde_index].present == 0) { @@ -98,9 +193,9 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * info, pde[pde_index].writable = 1; pde[pde_index].user_page = 1; - pde[pde_index].pt_base_addr = PAGE_BASE_ADDR((addr_t)V3_PAddr(pte)); + pde[pde_index].pt_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pte)); } else { - pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pde[pde_index].pt_base_addr)); + pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR_4KB(pde[pde_index].pt_base_addr)); } @@ -124,7 +219,7 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * info, return -1; } - pte[pte_index].page_base_addr = PAGE_BASE_ADDR(host_addr); + pte[pte_index].page_base_addr = PAGE_BASE_ADDR_4KB(host_addr); } else { return region->unhandled(info, fault_addr, fault_addr, region, error_code); } @@ -170,7 +265,7 @@ static inline int invalidate_addr_64(struct guest_info * info, addr_t inv_addr) if (pdpe[pdpe_index].present == 0) { return 0; - } else if (pdpe[pdpe_index].large_page == 1) { + } else if (pdpe[pdpe_index].large_page == 1) { // 1GiB pdpe[pdpe_index].present = 0; return 0; } @@ -179,14 +274,14 @@ static inline int invalidate_addr_64(struct guest_info * info, addr_t inv_addr) if (pde[pde_index].present == 0) { return 0; - } else if (pde[pde_index].large_page == 1) { + } else if (pde[pde_index].large_page == 1) { // 2MiB pde[pde_index].present = 0; return 0; } pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pde[pde_index].pt_base_addr)); - pte[pte_index].present = 0; + pte[pte_index].present = 0; // 4KiB return 0; } diff --git a/palacios/src/palacios/vmm_mem.c b/palacios/src/palacios/vmm_mem.c index 80e17a3..1c6d51a 100644 --- a/palacios/src/palacios/vmm_mem.c +++ b/palacios/src/palacios/vmm_mem.c @@ -64,9 +64,10 @@ int 
v3_init_mem_map(struct v3_vm_info * vm) { // There is an underlying region that contains all of the guest memory // PrintDebug("Mapping %d pages of memory (%u bytes)\n", (int)mem_pages, (uint_t)info->mem_size); + // 2MB page alignment needed for 2MB hardware nested paging map->base_region.guest_start = 0; map->base_region.guest_end = mem_pages * PAGE_SIZE_4KB; - map->base_region.host_addr = (addr_t)V3_AllocPages(mem_pages); + map->base_region.host_addr = (addr_t)V3_AllocAlignedPages(mem_pages, PAGE_SIZE_2MB); map->base_region.flags.read = 1; map->base_region.flags.write = 1; @@ -189,8 +190,7 @@ struct v3_mem_region * __insert_mem_region(struct v3_vm_info * vm, -int v3_insert_mem_region(struct v3_vm_info * vm, - struct v3_mem_region * region) { +int v3_insert_mem_region(struct v3_vm_info * vm, struct v3_mem_region * region) { struct v3_mem_region * ret; int i = 0; @@ -289,6 +289,43 @@ struct v3_mem_region * v3_get_mem_region(struct v3_vm_info * vm, uint16_t core_i } +/* Search the "hooked" memory regions for a region that ends after the given address. If the + * address is invalid, return NULL. Else, return the first region found or the base region if no + * region ends after the given address. + */ +struct v3_mem_region * v3_get_next_mem_region( struct v3_vm_info * vm, uint16_t core_id, addr_t guest_addr) { + struct rb_node * n = vm->mem_map.mem_regions.rb_node; + struct v3_mem_region * reg = NULL; + + // Keep going to the right in the tree while the address is greater than the current region's + // end address. 
+ while (n) { + reg = rb_entry(n, struct v3_mem_region, tree_node); + if (guest_addr >= reg->guest_end) { // reg is [start,end) + n = n->rb_right; + } else { + // PAD this may be buggy since there is no guarantees that + // the cores are in order + if ((core_id == reg->core_id) || (reg->core_id == V3_MEM_CORE_ANY)) { + return reg; + } else { + n = n->rb_right; + } + } + } + + // There is no registered region, so we check if it's a valid address in the base region + + if (guest_addr >= vm->mem_map.base_region.guest_end) { + PrintError("%s: Guest Address Exceeds Base Memory Size (ga=%p), (limit=%p)\n", + __FUNCTION__, (void *)guest_addr, (void *)vm->mem_map.base_region.guest_end); + v3_print_mem_map(vm); + return NULL; + } + + return &(vm->mem_map.base_region); +} + void v3_delete_mem_region(struct v3_vm_info * vm, struct v3_mem_region * reg) { diff --git a/utils/guest_creator/default.xml b/utils/guest_creator/default.xml index 6316742..b1727ac 100644 --- a/utils/guest_creator/default.xml +++ b/utils/guest_creator/default.xml @@ -12,7 +12,7 @@ 100