#include <palacios/vmm_dev_mgr.h>
+
int v3_apic_raise_intr(struct guest_info * info, struct vm_device * apic_dev, int intr_num);
// Callback table an APIC implementation registers with the ICC bus.
struct v3_icc_ops {
// Raise interrupt vector intr_num on the given core — presumably injects
// it into that core's APIC; confirm against the registered apic device.
int (*raise_intr)(struct guest_info * core, int intr_num, void * private_data);
+ // Logical/flat-model delivery filter: return nonzero if the APIC behind
+ // private_data should accept an IPI whose message destination address is mda.
+ int (*should_deliver_flat)(struct guest_info * core, uint8_t mda, void * private_data);
};
* @param apic_src - The source APIC id.
* @param apic_num - The remote APIC number.
* @param icr - A copy of the APIC's ICR. (LAPIC-style ICR, clone from redir table for ioapics)
+ * @param dfr - A copy of the APIC's DFR (LAPIC-style DFR)
* @param extirq - irq for external interrupts (e.g., from the 8259 PIC)
*/
-int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t apic_src, uint64_t icr, uint32_t ext_irq);
+int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t apic_src, uint64_t icr, uint32_t dfr, uint32_t ext_irq);
#if 0
struct v3_shdw_pg_state shdw_pg_state;
addr_t direct_map_pt;
-
// This structure is how we get interrupts for the guest
struct v3_intr_core_state intr_core_state;
addr_t mem_size; // In bytes for now
struct v3_mem_map mem_map;
+ v3_paging_size_t paging_size; // for nested paging
+
struct v3_mem_hooks mem_hooks;
struct v3_shdw_impl_state shdw_impl;
-#define V3_AllocPages(num_pages) \
- ({ \
- extern struct v3_os_hooks * os_hooks; \
- void * ptr = 0; \
- if ((os_hooks) && (os_hooks)->allocate_pages) { \
- ptr = (os_hooks)->allocate_pages(num_pages); \
- } \
- ptr; \
+/* Allocate num_pages pages with the default 4KB alignment.
+ * Evaluates to the pointer from the host's allocate_pages hook,
+ * or NULL (0) when no hook is installed. */
+#define V3_AllocPages(num_pages) \
+ ({ \
+ extern struct v3_os_hooks * os_hooks; \
+ void * ptr = 0; \
+ if ((os_hooks) && (os_hooks)->allocate_pages) { \
+ ptr = (os_hooks)->allocate_pages(num_pages,PAGE_SIZE_4KB); \
+ } \
+ ptr; \
+ })
+
+
+/* Allocate num_pages pages aligned to 'align' bytes (e.g. PAGE_SIZE_2MB
+ * for large-page-backed guest memory). NULL (0) when no hook installed. */
+#define V3_AllocAlignedPages(num_pages, align) \
+ ({ \
+ extern struct v3_os_hooks * os_hooks; \
+ void * ptr = 0; \
+ if ((os_hooks) && (os_hooks)->allocate_pages) { \
+ ptr = (os_hooks)->allocate_pages(num_pages,align); \
+ } \
+ ptr; \
})
void (*print)(const char * format, ...)
__attribute__ ((format (printf, 1, 2)));
- void *(*allocate_pages)(int numPages);
+ void *(*allocate_pages)(int numPages, unsigned int alignment);
void (*free_page)(void * page);
void *(*malloc)(unsigned int size);
unsigned int (*get_cpu)(void);
void (*interrupt_cpu)(struct v3_vm_info * vm, int logical_cpu, int vector);
void (*call_on_cpu)(int logical_cpu, void (*fn)(void * arg), void * arg);
- void * (*start_thread_on_cpu)(int logical_cpu, int (*fn)(void * arg), void * arg, char * thread_name);
+ void * (*start_thread_on_cpu)(int cpu_id, int (*fn)(void * arg), void * arg, char * thread_name);
};
struct v3_mem_region * v3_get_mem_region(struct v3_vm_info * vm, uint16_t core_id, addr_t guest_addr);
+struct v3_mem_region * v3_get_next_mem_region(struct v3_vm_info * vm, uint16_t core_id, addr_t guest_addr);
typedef enum {SHADOW_PAGING, NESTED_PAGING} v3_paging_mode_t;
typedef enum {VM_RUNNING, VM_STOPPED, VM_SUSPENDED, VM_ERROR, VM_EMULATING} v3_vm_operating_mode_t;
+typedef enum {PAGING_4KB, PAGING_2MB} v3_paging_size_t;
typedef enum {INIT, SIPI, REAL, /*UNREAL,*/ PROTECTED, PROTECTED_PAE, LONG, LONG_32_COMPAT, LONG_16_COMPAT} v3_cpu_mode_t;
typedef enum {PHYSICAL_MEM, VIRTUAL_MEM} v3_mem_mode_t;
+
+
struct apic_msr {
union {
uint64_t value;
} __attribute__((packed));
-
-
struct apic_state {
addr_t base_addr;
v3_lock_t lock;
};
+
+
+
+
static int apic_read(struct guest_info * core, addr_t guest_addr, void * dst, uint_t length, void * priv_data);
static int apic_write(struct guest_info * core, addr_t guest_addr, void * src, uint_t length, void * priv_data);
// ICC???
PrintDebug("apic %u: core %u: sending cmd 0x%llx to apic %u\n",apic->lapic_id.val,core->cpu_id,
apic->int_cmd.val, apic->int_cmd.dst);
- v3_icc_send_ipi(apic->icc_bus, apic->lapic_id.val, apic->int_cmd.val,0);
+ if (v3_icc_send_ipi(apic->icc_bus, apic->lapic_id.val, apic->int_cmd.val,apic->dst_fmt.val,0)==-1) {
+ return -1;
+ }
break;
case INT_CMD_HI_OFFSET:
apic->int_cmd.hi = op_val;
+/* ICC-bus callback: decide whether this local APIC accepts a logical-mode,
+ * flat-model IPI. mda is the message destination address taken from the ICR.
+ * Accept (return 1) when mda is the broadcast value 0xff, or when the APIC's
+ * logical destination id intersects the mda bitmask; reject (return 0)
+ * otherwise. core is unused here; private_data is the apic_state. */
+static int apic_should_deliver_flat(struct guest_info * core, uint8_t mda, void * private_data)
+{
+ struct apic_state * apic = (struct apic_state *)private_data;
+
+ if (mda==0xff || (apic->log_dst.dst_log_id & mda)) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
// Callbacks this local APIC exports to the ICC bus.
static struct v3_icc_ops icc_ops = {
.raise_intr = apic_raise_intr,
+ .should_deliver_flat = apic_should_deliver_flat,
};
#include <devices/icc_bus.h>
#include <devices/apic_regs.h>
-
#define MAX_APICS 256
#ifndef CONFIG_DEBUG_ICC_BUS
// icr_data contains interrupt vector *except* for ext_int
// in which case it is given via irq
//
+/*
+ * Send an IPI (or, for ext_int delivery, an external irq) over the ICC bus.
+ *
+ * src_apic - source APIC id (may be the ioapic's id)
+ * icr_data - LAPIC-style ICR: vector, delivery mode, destination mode
+ *            (physical/logical), shorthand, and destination
+ * dfr_data - sender's DFR; selects flat vs. cluster logical model
+ * extirq   - irq number used when delivering external interrupts
+ *
+ * Returns 0 on success, -1 on error.
+ *
+ * NOTE(review): this view is patch residue — the initial sanity-check
+ * condition, the "case 2" label, and some break statements live in hunks
+ * not visible here; confirm against the full source before relying on
+ * the control flow between cases.
+ */
-int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t src_apic, uint64_t icr_data, uint32_t extirq) {
+int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t src_apic, uint64_t icr_data,
+ uint32_t dfr_data, uint32_t extirq) {
PrintDebug("icc_bus: icc_bus=%p, src_apic=%u, icr_data=%llx, extirq=%u\n",icc_bus,src_apic,icr_data,extirq);
struct int_cmd_reg *icr = (struct int_cmd_reg *)&icr_data;
+ struct dst_fmt_reg *dfr = (struct dst_fmt_reg*)&dfr_data;
struct icc_bus_state * state = (struct icc_bus_state *)icc_bus->private_data;
// initial sanity checks
PrintError("icc_bus: Attempted send to unregistered apic id=%u\n",icr->dst);
return -1;
}
-
- struct apic_data * dest_apic = &(state->apics[icr->dst]);
- PrintDebug("icc_bus: IPI %s %u from %s %u to %s %u (icr=0x%llx) (extirq=%u)\n",
- deliverymode_str[icr->del_mode], icr->vec, src_apic==state->ioapic_id ? "ioapic" : "apic",
- src_apic, shorthand_str[icr->dst_shorthand], icr->dst,icr->val,
+ PrintDebug("icc_bus: IPI %s %u from %s %u to %s %s %u (icr=0x%llx, dfr=0x%x) (extirq=%u)\n",
+ deliverymode_str[icr->del_mode], icr->vec,
+ src_apic==state->ioapic_id ? "ioapic" : "apic",
+ src_apic,
+ icr->dst_mode==0 ? "(physical)" : "(logical)",
+ shorthand_str[icr->dst_shorthand], icr->dst,icr->val, dfr->val,
extirq);
+ /*
+
+ if (icr->dst==state->ioapic_id) {
+ PrintError("icc_bus: Attempted send to ioapic ignored\n");
+ return -1;
+ }
+ */
switch (icr->dst_shorthand) {
case 0: // no shorthand
- if (deliver(src_apic,dest_apic,icr,state,extirq)) {
- return -1;
+ if (icr->dst_mode==0) {
+ // physical delivery: icr->dst is the target APIC id
+ struct apic_data * dest_apic = &(state->apics[icr->dst]);
+ if (deliver(src_apic,dest_apic,icr,state,extirq)) {
+ return -1;
+ }
+ } else {
+ // logical delivery
+ uint8_t mda = icr->dst; // message destination address, not physical address
+
+ if (dfr->model==0xf) {
+ // flat model
+ // deliver irq if
+ // mda of sender & ldr of receiver is nonzero
+ // mda=0xff means broadcast to all
+
+ int i;
+ for (i=0;i<MAX_APICS;i++) {
+ struct apic_data *dest_apic=&(state->apics[i]);
+ if (dest_apic->present &&
+ dest_apic->ops->should_deliver_flat(dest_apic->core,
+ mda,
+ dest_apic->priv_data)) {
+ if (deliver(src_apic,dest_apic,icr,state,extirq)) {
+ return -1;
+ }
+ }
+ }
+ } else {
+ // cluster model
+ PrintError("icc_bus: use of cluster model not yet supported\n");
+ return -1;
+ }
}
+
break;
case 1: // self
PrintError("icc_bus: ioapic attempting to send to itself\n");
return -1;
}
+ struct apic_data *dest_apic=&(state->apics[src_apic]);
if (deliver(src_apic,dest_apic,icr,state,extirq)) {
return -1;
}
case 3: { // all and all-but-me
int i;
for (i=0;i<MAX_APICS;i++) {
- dest_apic=&(state->apics[i]);
+ struct apic_data *dest_apic=&(state->apics[i]);
if (dest_apic->present && (i!=src_apic || icr->dst_shorthand==2)) {
if (deliver(src_apic,dest_apic,icr,state,extirq)) {
return -1;
}
}
break;
- }
+ }
return 0;
}
icr.dst_shorthand=0; // no shorthand
icr.rsvd2=0;
- v3_icc_send_ipi(ioapic->icc_bus, ioapic->ioapic_id.val,icr.val, irq);
+ // Note: 0 here is "cluster model", but it should be irrelevant
+ // since we are sending this as a physical destination
+ v3_icc_send_ipi(ioapic->icc_bus, ioapic->ioapic_id.val,icr.val, 0, irq);
}
return 0;
#include <palacios/vmm_sprintf.h>
+#ifndef CONFIG_DEBUG_SVM
+#undef PrintDebug
+#define PrintDebug(fmt, args...)
+#endif
+
+
uint32_t v3_last_exit;
// This is a global pointer to the host's VMCB
PrintDebug("Memory=%s\n", memory_str);
// Amount of ram the Guest will have, always in MB
- vm->mem_size = atoi(memory_str) * 1024 * 1024;
+ vm->mem_size = (unsigned long)atoi(memory_str) * 1024UL * 1024UL;
if (strcasecmp(vm_class, "PC") == 0) {
vm->vm_class = V3_PC_VM;
return -1;
}
-
#ifdef CONFIG_TELEMETRY
{
char * telemetry = v3_cfg_val(vm_cfg, "telemetry");
v3_cfg_tree_t *vm_tree = info->vm_info->cfg_data->cfg;
v3_cfg_tree_t *pg_tree = v3_cfg_subtree(vm_tree, "paging");
- char *pg_mode = v3_cfg_val(pg_tree, "mode");
+ char *pg_mode = v3_cfg_val(pg_tree, "mode");
+ char *page_size = v3_cfg_val(pg_tree, "page_size");
PrintDebug("Paging mode specified as %s\n", pg_mode);
info->shdw_pg_mode = SHADOW_PAGING;
}
} else {
- PrintDebug("No paging mode specified in configuration.\n");
+ PrintDebug("No paging type specified in configuration. Defaulting to shadow paging\n");
info->shdw_pg_mode = SHADOW_PAGING;
}
if (info->shdw_pg_mode == NESTED_PAGING) {
PrintDebug("Guest Paging Mode: NESTED_PAGING\n");
+ if (strcasecmp(page_size, "4kb") == 0) { /* TODO: this may not be an ideal place for this */
+ info->vm_info->paging_size = PAGING_4KB;
+ } else if (strcasecmp(page_size, "2mb") == 0) {
+ info->vm_info->paging_size = PAGING_2MB;
+ } else {
+ PrintError("Invalid VM paging size: '%s'\n", page_size);
+ return -1;
+ }
+ PrintDebug("VM page size=%s\n", page_size);
} else if (info->shdw_pg_mode == SHADOW_PAGING) {
PrintDebug("Guest Paging Mode: SHADOW_PAGING\n");
} else {
#include <palacios/vm_guest_mem.h>
#include <palacios/vm_guest.h>
-
+// Reference: AMD Software Developer Manual Vol.2 Ch.5 "Page Translation and Protection"
+//
+// Handle a passthrough (nested-paging-style) page fault in 64-bit paging:
+// build page-table entries mapping the faulting guest address, using a 2MiB
+// large page when the VM is configured for PAGING_2MB and no hooked region
+// overlaps the candidate 2MiB page.
+// NOTE(review): patch residue — some interior hunks of this function are
+// not visible in this view.
static inline int handle_passthrough_pagefault_64(struct guest_info * info,
addr_t fault_addr,
pf_error_t error_code) {
- pml4e64_t * pml = NULL;
- pdpe64_t * pdpe = NULL;
- pde64_t * pde = NULL;
- pte64_t * pte = NULL;
- addr_t host_addr = 0;
-
- int pml_index = PML4E64_INDEX(fault_addr);
+ pml4e64_t * pml = NULL;
+ pdpe64_t * pdpe = NULL;
+ pde64_t * pde = NULL;
+ pde64_2MB_t * pde2mb = NULL; // PDE viewed as a 2MiB large-page entry
+ pte64_t * pte = NULL;
+ addr_t host_addr = 0;
+
+ int pml_index = PML4E64_INDEX(fault_addr);
int pdpe_index = PDPE64_INDEX(fault_addr);
- int pde_index = PDE64_INDEX(fault_addr);
- int pte_index = PTE64_INDEX(fault_addr);
-
-
-
+ int pde_index = PDE64_INDEX(fault_addr);
+ int pte_index = PTE64_INDEX(fault_addr);
struct v3_mem_region * region = v3_get_mem_region(info->vm_info, info->cpu_id, fault_addr);
-
+ struct v3_mem_region * base_reg = &(info->vm_info->mem_map.base_region);
+
+ /* If the guest has been configured for 2MiB pages, then we must check for hooked regions of
+ * memory which may overlap with the 2MiB page containing the faulting address (due to
+ * potentially differing access policies in place for e.g. i/o devices and APIC). A 2MiB page
+ * can be used if a) no region overlaps the page [or b) a region does overlap but fully contains
+ * the page]. The [bracketed] text pertains to the #if 0'd code below, state D. TODO modify this
+ * note if someone decides to enable this optimization. It can be tested with the SeaStar
+ * mapping.
+ *
+ * Examples: (CAPS regions are returned by v3_get_next_mem_region; state A returns the base reg)
+ *
+ * |region| |region| 2MiB mapped (state A)
+ * |reg| |REG| 2MiB mapped (state B)
+ * |region| |reg| |REG| |region| |reg| 4KiB mapped (state C)
+ * |reg| |reg| |--REGION---| [2MiB mapped (state D)]
+ * |--------------------------------------------| RAM
+ * ^ fault addr
+ * |----|----|----|----|----|page|----|----|----| 2MB pages
+ * >>>>>>>>>>>>>>>>>>>> search space
+ */
+ addr_t pg_start = 0UL, pg_end = 0UL; // 2MiB page containing the faulting address
+ struct v3_mem_region * pg_next_reg = NULL; // next immediate mem reg after page start addr
+ bool use_large_page = false;
+
if (region == NULL) {
- PrintError("Invalid region in passthrough page fault 64, addr=%p\n",
- (void *)fault_addr);
+ PrintError("%s: invalid region, addr=%p\n", __FUNCTION__, (void *)fault_addr);
return -1;
}
+ // Decide whether the faulting address can be backed by a 2MiB host page
+ // (states A/B/C in the diagram above)
+ if (info->vm_info->paging_size == PAGING_2MB) {
+
+ // guest page maps to a host page + offset (so when we shift, it aligns with a host page)
+ pg_start = PAGE_ADDR_2MB(fault_addr);
+ pg_end = (pg_start + PAGE_SIZE_2MB);
+
+ PrintDebug("%s: page [%p,%p) contains address\n", __FUNCTION__, (void *)pg_start, (void *)pg_end);
+
+ pg_next_reg = v3_get_next_mem_region(info->vm_info, info->cpu_id, pg_start);
+
+ if (pg_next_reg == NULL) {
+ PrintError("%s: Error: address not in base region, %p\n", __FUNCTION__, (void *)fault_addr);
+ return -1;
+ }
+
+ if ((pg_next_reg->guest_start == base_reg->guest_start) &&
+ (pg_next_reg->guest_end == base_reg->guest_end)) { // next region == base region
+ use_large_page = 1; // State A
+ } else {
+#if 0 // State B/C and D optimization
+ use_large_page = (pg_next_reg->guest_end >= pg_end) &&
+ ((pg_next_reg->guest_start >= pg_end) || (pg_next_reg->guest_start <= pg_start));
+ PrintDebug("%s: region [%p,%p) %s partial overlap with page\n", __FUNCTION__,
+ (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end,
+ (use_large_page ? "does not have" : "has"));
+#else // State B/C
+ use_large_page = (pg_next_reg->guest_start >= pg_end);
+ PrintDebug("%s: region [%p,%p) %s overlap with page\n", __FUNCTION__,
+ (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end,
+ (use_large_page ? "does not have" : "has"));
+#endif
+ }
+ }
+
+ PrintDebug("%s: Address gets a 2MiB page? %s\n", __FUNCTION__, (use_large_page ? "yes" : "no"));
+
// Lookup the correct PML address based on the PAGING MODE
if (info->shdw_pg_mode == SHADOW_PAGING) {
pml = CR3_TO_PML4E64_VA(info->ctrl_regs.cr3);
pml[pml_index].writable = 1;
pml[pml_index].user_page = 1;
- pml[pml_index].pdp_base_addr = PAGE_BASE_ADDR((addr_t)V3_PAddr(pdpe));
+ pml[pml_index].pdp_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pdpe));
} else {
- pdpe = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pml[pml_index].pdp_base_addr));
+ pdpe = V3_VAddr((void*)BASE_TO_PAGE_ADDR_4KB(pml[pml_index].pdp_base_addr));
}
// Fix up the PDPE entry
pdpe[pdpe_index].writable = 1;
pdpe[pdpe_index].user_page = 1;
- pdpe[pdpe_index].pd_base_addr = PAGE_BASE_ADDR((addr_t)V3_PAddr(pde));
+ pdpe[pdpe_index].pd_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pde));
} else {
- pde = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pdpe[pdpe_index].pd_base_addr));
+ pde = V3_VAddr((void*)BASE_TO_PAGE_ADDR_4KB(pdpe[pdpe_index].pd_base_addr));
+ }
+
+ // Fix up the 2MiB PDE and exit here
+ // NOTE(review): the success-path "return 0" after filling this entry is
+ // presumably in a hunk not shown here — confirm against the full source.
+ if (use_large_page) {
+
+ pde2mb = (pde64_2MB_t *)pde; // all but these two lines are the same for PTE
+ pde2mb[pde_index].large_page = 1;
+
+ if (pde2mb[pde_index].present == 0) {
+ pde2mb[pde_index].user_page = 1;
+
+ if ((region->flags.alloced == 1) && (region->flags.read == 1)) {
+ // Full access
+ pde2mb[pde_index].present = 1;
+
+ if (region->flags.write == 1) {
+ pde2mb[pde_index].writable = 1;
+ } else {
+ pde2mb[pde_index].writable = 0;
+ }
+
+ if (v3_gpa_to_hpa(info, fault_addr, &host_addr) == -1) {
+ PrintError("Error Could not translate fault addr (%p)\n", (void *)fault_addr);
+ return -1;
+ }
+
+ pde2mb[pde_index].page_base_addr = PAGE_BASE_ADDR_2MB(host_addr);
+ } else {
+ return region->unhandled(info, fault_addr, fault_addr, region, error_code);
+ }
+ } else {
+ // We fix all permissions on the first pass,
+ // so we only get here if it's an unhandled exception
+
+ return region->unhandled(info, fault_addr, fault_addr, region, error_code);
+ }
+ }
+ // Continue with the 4KiB page hierarchy
// Fix up the PDE entry
if (pde[pde_index].present == 0) {
pde[pde_index].writable = 1;
pde[pde_index].user_page = 1;
- pde[pde_index].pt_base_addr = PAGE_BASE_ADDR((addr_t)V3_PAddr(pte));
+ pde[pde_index].pt_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pte));
} else {
- pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pde[pde_index].pt_base_addr));
+ pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR_4KB(pde[pde_index].pt_base_addr));
}
return -1;
}
- pte[pte_index].page_base_addr = PAGE_BASE_ADDR(host_addr);
+ pte[pte_index].page_base_addr = PAGE_BASE_ADDR_4KB(host_addr);
} else {
return region->unhandled(info, fault_addr, fault_addr, region, error_code);
}
if (pdpe[pdpe_index].present == 0) {
return 0;
- } else if (pdpe[pdpe_index].large_page == 1) {
+ } else if (pdpe[pdpe_index].large_page == 1) { // 1GiB
pdpe[pdpe_index].present = 0;
return 0;
}
if (pde[pde_index].present == 0) {
return 0;
- } else if (pde[pde_index].large_page == 1) {
+ } else if (pde[pde_index].large_page == 1) { // 2MiB
pde[pde_index].present = 0;
return 0;
}
pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pde[pde_index].pt_base_addr));
- pte[pte_index].present = 0;
+ pte[pte_index].present = 0; // 4KiB
return 0;
}
// There is an underlying region that contains all of the guest memory
// PrintDebug("Mapping %d pages of memory (%u bytes)\n", (int)mem_pages, (uint_t)info->mem_size);
+ // 2MB page alignment needed for 2MB hardware nested paging
map->base_region.guest_start = 0;
map->base_region.guest_end = mem_pages * PAGE_SIZE_4KB;
- map->base_region.host_addr = (addr_t)V3_AllocPages(mem_pages);
+ map->base_region.host_addr = (addr_t)V3_AllocAlignedPages(mem_pages, PAGE_SIZE_2MB);
map->base_region.flags.read = 1;
map->base_region.flags.write = 1;
-int v3_insert_mem_region(struct v3_vm_info * vm,
- struct v3_mem_region * region) {
+int v3_insert_mem_region(struct v3_vm_info * vm, struct v3_mem_region * region) {
struct v3_mem_region * ret;
int i = 0;
}
+/* Search the "hooked" memory regions for a region that ends after the given address. If the
+ * address is invalid (beyond the end of the base region), return NULL. Else, return the first
+ * region found or the base region if no region ends after the given address.
+ *
+ * NOTE(review): the walk below only ever descends to the right; combined with
+ * the core_id filtering (see PAD note), it may skip candidate regions in left
+ * subtrees — confirm against the rb-tree's ordering invariants before relying
+ * on strict "first region" semantics.
+ */
+struct v3_mem_region * v3_get_next_mem_region( struct v3_vm_info * vm, uint16_t core_id, addr_t guest_addr) {
+ struct rb_node * n = vm->mem_map.mem_regions.rb_node;
+ struct v3_mem_region * reg = NULL;
+
+ // Keep going to the right in the tree while the address is greater than the current region's
+ // end address.
+ while (n) {
+ reg = rb_entry(n, struct v3_mem_region, tree_node);
+ if (guest_addr >= reg->guest_end) { // reg is [start,end)
+ n = n->rb_right;
+ } else {
+ // PAD this may be buggy since there is no guarantee that
+ // the cores are in order
+ if ((core_id == reg->core_id) || (reg->core_id == V3_MEM_CORE_ANY)) {
+ return reg;
+ } else {
+ n = n->rb_right;
+ }
+ }
+ }
+
+ // There is no registered region, so we check if it's a valid address in the base region
+
+ if (guest_addr >= vm->mem_map.base_region.guest_end) {
+ PrintError("%s: Guest Address Exceeds Base Memory Size (ga=%p), (limit=%p)\n",
+ __FUNCTION__, (void *)guest_addr, (void *)vm->mem_map.base_region.guest_end);
+ v3_print_mem_map(vm);
+ return NULL;
+ }
+
+ return &(vm->mem_map.base_region);
+}
+
void v3_delete_mem_region(struct v3_vm_info * vm, struct v3_mem_region * reg) {
</paging>
<!--
<paging mode="nested">
- <pagesize>4KB</pagesize>
+ <page_size>2MB</page_size>
</paging>
-->
<schedule_hz>100</schedule_hz>