#include <palacios/vmm_dev_mgr.h>
+
int v3_apic_raise_intr(struct guest_info * info, struct vm_device * apic_dev, int intr_num);
// Callback table an APIC implementation registers with the ICC bus.
struct v3_icc_ops {
// Raise interrupt vector intr_num on the given core — presumably injects
// it into that core's APIC; confirm against the registered apic device.
int (*raise_intr)(struct guest_info * core, int intr_num, void * private_data);
+ // Logical/flat-model delivery filter: return nonzero if the APIC behind
+ // private_data should accept an IPI whose message destination address is mda.
+ int (*should_deliver_flat)(struct guest_info * core, uint8_t mda, void * private_data);
};
* @param apic_src - The source APIC id.
* @param apic_num - The remote APIC number.
* @param icr - A copy of the APIC's ICR. (LAPIC-style ICR, clone from redir table for ioapics)
+ * @param dfr - A copy of the APIC's DFR (LAPIC-style DFR)
* @param extirq - irq for external interrupts (e.g., from the 8259 PIC)
*/
-int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t apic_src, uint64_t icr, uint32_t ext_irq);
+int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t apic_src, uint64_t icr, uint32_t dfr, uint32_t ext_irq);
#if 0
struct v3_shdw_pg_state shdw_pg_state;
addr_t direct_map_pt;
-
// This structure is how we get interrupts for the guest
struct v3_intr_core_state intr_core_state;
addr_t mem_size; // In bytes for now
struct v3_mem_map mem_map;
+ v3_paging_size_t paging_size; // for nested paging
+
struct v3_mem_hooks mem_hooks;
struct v3_shdw_impl_state shdw_impl;
-#define V3_AllocPages(num_pages) \
- ({ \
- extern struct v3_os_hooks * os_hooks; \
- void * ptr = 0; \
- if ((os_hooks) && (os_hooks)->allocate_pages) { \
- ptr = (os_hooks)->allocate_pages(num_pages); \
- } \
- ptr; \
+/* Allocate num_pages pages with the default 4KB alignment.
+ * Evaluates to the pointer from the host's allocate_pages hook,
+ * or NULL (0) when no hook is installed. */
+#define V3_AllocPages(num_pages) \
+ ({ \
+ extern struct v3_os_hooks * os_hooks; \
+ void * ptr = 0; \
+ if ((os_hooks) && (os_hooks)->allocate_pages) { \
+ ptr = (os_hooks)->allocate_pages(num_pages,PAGE_SIZE_4KB); \
+ } \
+ ptr; \
+ })
+
+
+/* Allocate num_pages pages aligned to 'align' bytes (e.g. PAGE_SIZE_2MB
+ * for large-page-backed guest memory). NULL (0) when no hook installed. */
+#define V3_AllocAlignedPages(num_pages, align) \
+ ({ \
+ extern struct v3_os_hooks * os_hooks; \
+ void * ptr = 0; \
+ if ((os_hooks) && (os_hooks)->allocate_pages) { \
+ ptr = (os_hooks)->allocate_pages(num_pages,align); \
+ } \
+ ptr; \
})
void (*print)(const char * format, ...)
__attribute__ ((format (printf, 1, 2)));
- void *(*allocate_pages)(int numPages);
+ void *(*allocate_pages)(int numPages, unsigned int alignment);
void (*free_page)(void * page);
void *(*malloc)(unsigned int size);
unsigned int (*get_cpu)(void);
void (*interrupt_cpu)(struct v3_vm_info * vm, int logical_cpu, int vector);
void (*call_on_cpu)(int logical_cpu, void (*fn)(void * arg), void * arg);
- void * (*start_thread_on_cpu)(int logical_cpu, int (*fn)(void * arg), void * arg, char * thread_name);
+ void * (*start_thread_on_cpu)(int cpu_id, int (*fn)(void * arg), void * arg, char * thread_name);
};
struct v3_mem_region * v3_get_mem_region(struct v3_vm_info * vm, uint16_t core_id, addr_t guest_addr);
+struct v3_mem_region * v3_get_next_mem_region(struct v3_vm_info * vm, uint16_t core_id, addr_t guest_addr);
typedef enum {SHADOW_PAGING, NESTED_PAGING} v3_paging_mode_t;
typedef enum {VM_RUNNING, VM_STOPPED, VM_SUSPENDED, VM_ERROR, VM_EMULATING} v3_vm_operating_mode_t;
+typedef enum {PAGING_4KB, PAGING_2MB} v3_paging_size_t;
typedef enum {INIT, SIPI, REAL, /*UNREAL,*/ PROTECTED, PROTECTED_PAE, LONG, LONG_32_COMPAT, LONG_16_COMPAT} v3_cpu_mode_t;
typedef enum {PHYSICAL_MEM, VIRTUAL_MEM} v3_mem_mode_t;
+
+
struct apic_msr {
union {
uint64_t value;
} __attribute__((packed));
-
-
struct apic_state {
addr_t base_addr;
v3_lock_t lock;
};
+
+
+
+
static int apic_read(struct guest_info * core, addr_t guest_addr, void * dst, uint_t length, void * priv_data);
static int apic_write(struct guest_info * core, addr_t guest_addr, void * src, uint_t length, void * priv_data);
// ICC???
PrintDebug("apic %u: core %u: sending cmd 0x%llx to apic %u\n",apic->lapic_id.val,core->cpu_id,
apic->int_cmd.val, apic->int_cmd.dst);
- v3_icc_send_ipi(apic->icc_bus, apic->lapic_id.val, apic->int_cmd.val,0);
+ if (v3_icc_send_ipi(apic->icc_bus, apic->lapic_id.val, apic->int_cmd.val,apic->dst_fmt.val,0)==-1) {
+ return -1;
+ }
break;
case INT_CMD_HI_OFFSET:
apic->int_cmd.hi = op_val;
+/* ICC-bus callback: decide whether this local APIC accepts a logical-mode,
+ * flat-model IPI. mda is the message destination address taken from the ICR.
+ * Accept (return 1) when mda is the broadcast value 0xff, or when the APIC's
+ * logical destination id intersects the mda bitmask; reject (return 0)
+ * otherwise. core is unused here; private_data is the apic_state. */
+static int apic_should_deliver_flat(struct guest_info * core, uint8_t mda, void * private_data)
+{
+ struct apic_state * apic = (struct apic_state *)private_data;
+
+ if (mda==0xff || (apic->log_dst.dst_log_id & mda)) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
// Callbacks this local APIC exports to the ICC bus.
static struct v3_icc_ops icc_ops = {
.raise_intr = apic_raise_intr,
+ .should_deliver_flat = apic_should_deliver_flat,
};
#include <devices/icc_bus.h>
#include <devices/apic_regs.h>
-
#define MAX_APICS 256
#ifndef CONFIG_DEBUG_ICC_BUS
// icr_data contains interrupt vector *except* for ext_int
// in which case it is given via irq
//
+/*
+ * Send an IPI (or, for ext_int delivery, an external irq) over the ICC bus.
+ *
+ * src_apic - source APIC id (may be the ioapic's id)
+ * icr_data - LAPIC-style ICR: vector, delivery mode, destination mode
+ *            (physical/logical), shorthand, and destination
+ * dfr_data - sender's DFR; selects flat vs. cluster logical model
+ * extirq   - irq number used when delivering external interrupts
+ *
+ * Returns 0 on success, -1 on error.
+ *
+ * NOTE(review): this view is patch residue — the initial sanity-check
+ * condition, the "case 2" label, and some break statements live in hunks
+ * not visible here; confirm against the full source before relying on
+ * the control flow between cases.
+ */
-int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t src_apic, uint64_t icr_data, uint32_t extirq) {
+int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t src_apic, uint64_t icr_data,
+ uint32_t dfr_data, uint32_t extirq) {
PrintDebug("icc_bus: icc_bus=%p, src_apic=%u, icr_data=%llx, extirq=%u\n",icc_bus,src_apic,icr_data,extirq);
struct int_cmd_reg *icr = (struct int_cmd_reg *)&icr_data;
+ struct dst_fmt_reg *dfr = (struct dst_fmt_reg*)&dfr_data;
struct icc_bus_state * state = (struct icc_bus_state *)icc_bus->private_data;
// initial sanity checks
PrintError("icc_bus: Attempted send to unregistered apic id=%u\n",icr->dst);
return -1;
}
-
- struct apic_data * dest_apic = &(state->apics[icr->dst]);
- PrintDebug("icc_bus: IPI %s %u from %s %u to %s %u (icr=0x%llx) (extirq=%u)\n",
- deliverymode_str[icr->del_mode], icr->vec, src_apic==state->ioapic_id ? "ioapic" : "apic",
- src_apic, shorthand_str[icr->dst_shorthand], icr->dst,icr->val,
+ PrintDebug("icc_bus: IPI %s %u from %s %u to %s %s %u (icr=0x%llx, dfr=0x%x) (extirq=%u)\n",
+ deliverymode_str[icr->del_mode], icr->vec,
+ src_apic==state->ioapic_id ? "ioapic" : "apic",
+ src_apic,
+ icr->dst_mode==0 ? "(physical)" : "(logical)",
+ shorthand_str[icr->dst_shorthand], icr->dst,icr->val, dfr->val,
extirq);
+ /*
+
+ if (icr->dst==state->ioapic_id) {
+ PrintError("icc_bus: Attempted send to ioapic ignored\n");
+ return -1;
+ }
+ */
switch (icr->dst_shorthand) {
case 0: // no shorthand
- if (deliver(src_apic,dest_apic,icr,state,extirq)) {
- return -1;
+ if (icr->dst_mode==0) {
+ // physical delivery: icr->dst is the target APIC id
+ struct apic_data * dest_apic = &(state->apics[icr->dst]);
+ if (deliver(src_apic,dest_apic,icr,state,extirq)) {
+ return -1;
+ }
+ } else {
+ // logical delivery
+ uint8_t mda = icr->dst; // message destination address, not physical address
+
+ if (dfr->model==0xf) {
+ // flat model
+ // deliver irq if
+ // mda of sender & ldr of receiver is nonzero
+ // mda=0xff means broadcast to all
+
+ int i;
+ for (i=0;i<MAX_APICS;i++) {
+ struct apic_data *dest_apic=&(state->apics[i]);
+ if (dest_apic->present &&
+ dest_apic->ops->should_deliver_flat(dest_apic->core,
+ mda,
+ dest_apic->priv_data)) {
+ if (deliver(src_apic,dest_apic,icr,state,extirq)) {
+ return -1;
+ }
+ }
+ }
+ } else {
+ // cluster model
+ PrintError("icc_bus: use of cluster model not yet supported\n");
+ return -1;
+ }
}
+
break;
case 1: // self
PrintError("icc_bus: ioapic attempting to send to itself\n");
return -1;
}
+ struct apic_data *dest_apic=&(state->apics[src_apic]);
if (deliver(src_apic,dest_apic,icr,state,extirq)) {
return -1;
}
case 3: { // all and all-but-me
int i;
for (i=0;i<MAX_APICS;i++) {
- dest_apic=&(state->apics[i]);
+ struct apic_data *dest_apic=&(state->apics[i]);
if (dest_apic->present && (i!=src_apic || icr->dst_shorthand==2)) {
if (deliver(src_apic,dest_apic,icr,state,extirq)) {
return -1;
}
}
break;
- }
+ }
return 0;
}
icr.dst_shorthand=0; // no shorthand
icr.rsvd2=0;
- v3_icc_send_ipi(ioapic->icc_bus, ioapic->ioapic_id.val,icr.val, irq);
+ // Note: 0 here is "cluster model", but it should be irrelevant
+ // since we are sending this as a physical destination
+ v3_icc_send_ipi(ioapic->icc_bus, ioapic->ioapic_id.val,icr.val, 0, irq);
}
return 0;
#include <palacios/vmm_sprintf.h>
+#ifndef CONFIG_DEBUG_SVM
+#undef PrintDebug
+#define PrintDebug(fmt, args...)
+#endif
+
+
uint32_t v3_last_exit;
// This is a global pointer to the host's VMCB
PrintDebug("Memory=%s\n", memory_str);
// Amount of ram the Guest will have, always in MB
- vm->mem_size = atoi(memory_str) * 1024 * 1024;
+ vm->mem_size = (unsigned long)atoi(memory_str) * 1024UL * 1024UL;
if (strcasecmp(vm_class, "PC") == 0) {
vm->vm_class = V3_PC_VM;
return -1;
}
-
#ifdef CONFIG_TELEMETRY
{
char * telemetry = v3_cfg_val(vm_cfg, "telemetry");
v3_cfg_tree_t *vm_tree = info->vm_info->cfg_data->cfg;
v3_cfg_tree_t *pg_tree = v3_cfg_subtree(vm_tree, "paging");
- char *pg_mode = v3_cfg_val(pg_tree, "mode");
+ char *pg_mode = v3_cfg_val(pg_tree, "mode");
+ char *page_size = v3_cfg_val(pg_tree, "page_size");
PrintDebug("Paging mode specified as %s\n", pg_mode);
info->shdw_pg_mode = SHADOW_PAGING;
}
} else {
- PrintDebug("No paging mode specified in configuration.\n");
+ PrintDebug("No paging type specified in configuration. Defaulting to shadow paging\n");
info->shdw_pg_mode = SHADOW_PAGING;
}
if (info->shdw_pg_mode == NESTED_PAGING) {
PrintDebug("Guest Paging Mode: NESTED_PAGING\n");
+ if (strcasecmp(page_size, "4kb") == 0) { /* TODO: this may not be an ideal place for this */
+ info->vm_info->paging_size = PAGING_4KB;
+ } else if (strcasecmp(page_size, "2mb") == 0) {
+ info->vm_info->paging_size = PAGING_2MB;
+ } else {
+ PrintError("Invalid VM paging size: '%s'\n", page_size);
+ return -1;
+ }
+ PrintDebug("VM page size=%s\n", page_size);
} else if (info->shdw_pg_mode == SHADOW_PAGING) {
PrintDebug("Guest Paging Mode: SHADOW_PAGING\n");
} else {
#include <palacios/vm_guest_mem.h>
#include <palacios/vm_guest.h>
-
+// Reference: AMD Software Developer Manual Vol.2 Ch.5 "Page Translation and Protection"
+//
+// Handle a passthrough (nested-paging-style) page fault in 64-bit paging:
+// build page-table entries mapping the faulting guest address, using a 2MiB
+// large page when the VM is configured for PAGING_2MB and no hooked region
+// overlaps the candidate 2MiB page.
+// NOTE(review): patch residue — some interior hunks of this function are
+// not visible in this view.
static inline int handle_passthrough_pagefault_64(struct guest_info * info,
addr_t fault_addr,
pf_error_t error_code) {
- pml4e64_t * pml = NULL;
- pdpe64_t * pdpe = NULL;
- pde64_t * pde = NULL;
- pte64_t * pte = NULL;
- addr_t host_addr = 0;
-
- int pml_index = PML4E64_INDEX(fault_addr);
+ pml4e64_t * pml = NULL;
+ pdpe64_t * pdpe = NULL;
+ pde64_t * pde = NULL;
+ pde64_2MB_t * pde2mb = NULL; // PDE viewed as a 2MiB large-page entry
+ pte64_t * pte = NULL;
+ addr_t host_addr = 0;
+
+ int pml_index = PML4E64_INDEX(fault_addr);
int pdpe_index = PDPE64_INDEX(fault_addr);
- int pde_index = PDE64_INDEX(fault_addr);
- int pte_index = PTE64_INDEX(fault_addr);
-
-
-
+ int pde_index = PDE64_INDEX(fault_addr);
+ int pte_index = PTE64_INDEX(fault_addr);
struct v3_mem_region * region = v3_get_mem_region(info->vm_info, info->cpu_id, fault_addr);
-
+ struct v3_mem_region * base_reg = &(info->vm_info->mem_map.base_region);
+
+ /* If the guest has been configured for 2MiB pages, then we must check for hooked regions of
+ * memory which may overlap with the 2MiB page containing the faulting address (due to
+ * potentially differing access policies in place for e.g. i/o devices and APIC). A 2MiB page
+ * can be used if a) no region overlaps the page [or b) a region does overlap but fully contains
+ * the page]. The [bracketed] text pertains to the #if 0'd code below, state D. TODO modify this
+ * note if someone decides to enable this optimization. It can be tested with the SeaStar
+ * mapping.
+ *
+ * Examples: (CAPS regions are returned by v3_get_next_mem_region; state A returns the base reg)
+ *
+ * |region| |region| 2MiB mapped (state A)
+ * |reg| |REG| 2MiB mapped (state B)
+ * |region| |reg| |REG| |region| |reg| 4KiB mapped (state C)
+ * |reg| |reg| |--REGION---| [2MiB mapped (state D)]
+ * |--------------------------------------------| RAM
+ * ^ fault addr
+ * |----|----|----|----|----|page|----|----|----| 2MB pages
+ * >>>>>>>>>>>>>>>>>>>> search space
+ */
+ addr_t pg_start = 0UL, pg_end = 0UL; // 2MiB page containing the faulting address
+ struct v3_mem_region * pg_next_reg = NULL; // next immediate mem reg after page start addr
+ bool use_large_page = false;
+
if (region == NULL) {
- PrintError("Invalid region in passthrough page fault 64, addr=%p\n",
- (void *)fault_addr);
+ PrintError("%s: invalid region, addr=%p\n", __FUNCTION__, (void *)fault_addr);
return -1;
}
+ // Decide whether the faulting address can be backed by a 2MiB host page
+ // (states A/B/C in the diagram above)
+ if (info->vm_info->paging_size == PAGING_2MB) {
+
+ // guest page maps to a host page + offset (so when we shift, it aligns with a host page)
+ pg_start = PAGE_ADDR_2MB(fault_addr);
+ pg_end = (pg_start + PAGE_SIZE_2MB);
+
+ PrintDebug("%s: page [%p,%p) contains address\n", __FUNCTION__, (void *)pg_start, (void *)pg_end);
+
+ pg_next_reg = v3_get_next_mem_region(info->vm_info, info->cpu_id, pg_start);
+
+ if (pg_next_reg == NULL) {
+ PrintError("%s: Error: address not in base region, %p\n", __FUNCTION__, (void *)fault_addr);
+ return -1;
+ }
+
+ if ((pg_next_reg->guest_start == base_reg->guest_start) &&
+ (pg_next_reg->guest_end == base_reg->guest_end)) { // next region == base region
+ use_large_page = 1; // State A
+ } else {
+#if 0 // State B/C and D optimization
+ use_large_page = (pg_next_reg->guest_end >= pg_end) &&
+ ((pg_next_reg->guest_start >= pg_end) || (pg_next_reg->guest_start <= pg_start));
+ PrintDebug("%s: region [%p,%p) %s partial overlap with page\n", __FUNCTION__,
+ (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end,
+ (use_large_page ? "does not have" : "has"));
+#else // State B/C
+ use_large_page = (pg_next_reg->guest_start >= pg_end);
+ PrintDebug("%s: region [%p,%p) %s overlap with page\n", __FUNCTION__,
+ (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end,
+ (use_large_page ? "does not have" : "has"));
+#endif
+ }
+ }
+
+ PrintDebug("%s: Address gets a 2MiB page? %s\n", __FUNCTION__, (use_large_page ? "yes" : "no"));
+
// Lookup the correct PML address based on the PAGING MODE
if (info->shdw_pg_mode == SHADOW_PAGING) {
pml = CR3_TO_PML4E64_VA(info->ctrl_regs.cr3);
pml[pml_index].writable = 1;
pml[pml_index].user_page = 1;
- pml[pml_index].pdp_base_addr = PAGE_BASE_ADDR((addr_t)V3_PAddr(pdpe));
+ pml[pml_index].pdp_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pdpe));
} else {
- pdpe = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pml[pml_index].pdp_base_addr));
+ pdpe = V3_VAddr((void*)BASE_TO_PAGE_ADDR_4KB(pml[pml_index].pdp_base_addr));
}
// Fix up the PDPE entry
pdpe[pdpe_index].writable = 1;
pdpe[pdpe_index].user_page = 1;
- pdpe[pdpe_index].pd_base_addr = PAGE_BASE_ADDR((addr_t)V3_PAddr(pde));
+ pdpe[pdpe_index].pd_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pde));
} else {
- pde = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pdpe[pdpe_index].pd_base_addr));
+ pde = V3_VAddr((void*)BASE_TO_PAGE_ADDR_4KB(pdpe[pdpe_index].pd_base_addr));
+ }
+
+ // Fix up the 2MiB PDE and exit here
+ // NOTE(review): the success-path "return 0" after filling this entry is
+ // presumably in a hunk not shown here — confirm against the full source.
+ if (use_large_page) {
+
+ pde2mb = (pde64_2MB_t *)pde; // all but these two lines are the same for PTE
+ pde2mb[pde_index].large_page = 1;
+
+ if (pde2mb[pde_index].present == 0) {
+ pde2mb[pde_index].user_page = 1;
+
+ if ((region->flags.alloced == 1) && (region->flags.read == 1)) {
+ // Full access
+ pde2mb[pde_index].present = 1;
+
+ if (region->flags.write == 1) {
+ pde2mb[pde_index].writable = 1;
+ } else {
+ pde2mb[pde_index].writable = 0;
+ }
+
+ if (v3_gpa_to_hpa(info, fault_addr, &host_addr) == -1) {
+ PrintError("Error Could not translate fault addr (%p)\n", (void *)fault_addr);
+ return -1;
+ }
+
+ pde2mb[pde_index].page_base_addr = PAGE_BASE_ADDR_2MB(host_addr);
+ } else {
+ return region->unhandled(info, fault_addr, fault_addr, region, error_code);
+ }
+ } else {
+ // We fix all permissions on the first pass,
+ // so we only get here if it's an unhandled exception
+
+ return region->unhandled(info, fault_addr, fault_addr, region, error_code);
+ }
+ }
+ // Continue with the 4KiB page hierarchy
// Fix up the PDE entry
if (pde[pde_index].present == 0) {
pde[pde_index].writable = 1;
pde[pde_index].user_page = 1;
- pde[pde_index].pt_base_addr = PAGE_BASE_ADDR((addr_t)V3_PAddr(pte));
+ pde[pde_index].pt_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pte));
} else {
- pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pde[pde_index].pt_base_addr));
+ pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR_4KB(pde[pde_index].pt_base_addr));
}
return -1;
}
- pte[pte_index].page_base_addr = PAGE_BASE_ADDR(host_addr);
+ pte[pte_index].page_base_addr = PAGE_BASE_ADDR_4KB(host_addr);
} else {
return region->unhandled(info, fault_addr, fault_addr, region, error_code);
}
if (pdpe[pdpe_index].present == 0) {
return 0;
- } else if (pdpe[pdpe_index].large_page == 1) {
+ } else if (pdpe[pdpe_index].large_page == 1) { // 1GiB
pdpe[pdpe_index].present = 0;
return 0;
}
if (pde[pde_index].present == 0) {
return 0;
- } else if (pde[pde_index].large_page == 1) {
+ } else if (pde[pde_index].large_page == 1) { // 2MiB
pde[pde_index].present = 0;
return 0;
}
pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pde[pde_index].pt_base_addr));
- pte[pte_index].present = 0;
+ pte[pte_index].present = 0; // 4KiB
return 0;
}
// There is an underlying region that contains all of the guest memory
// PrintDebug("Mapping %d pages of memory (%u bytes)\n", (int)mem_pages, (uint_t)info->mem_size);
+ // 2MB page alignment needed for 2MB hardware nested paging
map->base_region.guest_start = 0;
map->base_region.guest_end = mem_pages * PAGE_SIZE_4KB;
- map->base_region.host_addr = (addr_t)V3_AllocPages(mem_pages);
+ map->base_region.host_addr = (addr_t)V3_AllocAlignedPages(mem_pages, PAGE_SIZE_2MB);
map->base_region.flags.read = 1;
map->base_region.flags.write = 1;
-int v3_insert_mem_region(struct v3_vm_info * vm,
- struct v3_mem_region * region) {
+int v3_insert_mem_region(struct v3_vm_info * vm, struct v3_mem_region * region) {
struct v3_mem_region * ret;
int i = 0;
}
+/* Search the "hooked" memory regions for a region that ends after the given address. If the
+ * address is invalid (beyond the end of the base region), return NULL. Else, return the first
+ * region found or the base region if no region ends after the given address.
+ *
+ * NOTE(review): the walk below only ever descends to the right; combined with
+ * the core_id filtering (see PAD note), it may skip candidate regions in left
+ * subtrees — confirm against the rb-tree's ordering invariants before relying
+ * on strict "first region" semantics.
+ */
+struct v3_mem_region * v3_get_next_mem_region( struct v3_vm_info * vm, uint16_t core_id, addr_t guest_addr) {
+ struct rb_node * n = vm->mem_map.mem_regions.rb_node;
+ struct v3_mem_region * reg = NULL;
+
+ // Keep going to the right in the tree while the address is greater than the current region's
+ // end address.
+ while (n) {
+ reg = rb_entry(n, struct v3_mem_region, tree_node);
+ if (guest_addr >= reg->guest_end) { // reg is [start,end)
+ n = n->rb_right;
+ } else {
+ // PAD this may be buggy since there is no guarantee that
+ // the cores are in order
+ if ((core_id == reg->core_id) || (reg->core_id == V3_MEM_CORE_ANY)) {
+ return reg;
+ } else {
+ n = n->rb_right;
+ }
+ }
+ }
+
+ // There is no registered region, so we check if it's a valid address in the base region
+
+ if (guest_addr >= vm->mem_map.base_region.guest_end) {
+ PrintError("%s: Guest Address Exceeds Base Memory Size (ga=%p), (limit=%p)\n",
+ __FUNCTION__, (void *)guest_addr, (void *)vm->mem_map.base_region.guest_end);
+ v3_print_mem_map(vm);
+ return NULL;
+ }
+
+ return &(vm->mem_map.base_region);
+}
+
void v3_delete_mem_region(struct v3_vm_info * vm, struct v3_mem_region * reg) {
</paging>
<!--
<paging mode="nested">
- <pagesize>4KB</pagesize>
+ <page_size>2MB</page_size>
</paging>
-->
<schedule_hz>100</schedule_hz>