From: Peter Dinda <pdinda@northwestern.edu>
Date: Wed, 4 Aug 2010 00:19:20 +0000 (-0500)
Subject: Functional 2 core linux guest
X-Git-Url: http://v3vee.org/palacios/gitweb/gitweb.cgi?p=palacios.git;a=commitdiff_plain;h=e531d13b53ac8b32bca19131dd7a2824cb17eff9

Functional 2 core linux guest

Main addition is logical destination mode for IPIs
---

diff --git a/palacios/include/devices/apic.h b/palacios/include/devices/apic.h
index f78e2a6..e4452e7 100644
--- a/palacios/include/devices/apic.h
+++ b/palacios/include/devices/apic.h
@@ -25,6 +25,7 @@
 #include <palacios/vmm_dev_mgr.h>
 
 
+
 int v3_apic_raise_intr(struct guest_info * info, struct vm_device * apic_dev, int intr_num);
 
 
diff --git a/palacios/include/devices/icc_bus.h b/palacios/include/devices/icc_bus.h
index da5f39f..c3ec43a 100644
--- a/palacios/include/devices/icc_bus.h
+++ b/palacios/include/devices/icc_bus.h
@@ -23,6 +23,7 @@
 
 struct v3_icc_ops {
     int (*raise_intr)(struct guest_info * core, int intr_num, void * private_data);
+    int (*should_deliver_flat)(struct guest_info * core, uint8_t mda, void * private_data); // nonzero => this apic accepts flat-model logical destination mda
 };
 
 
@@ -39,9 +40,10 @@ int v3_icc_register_ioapic(struct v3_vm_info *vm, struct vm_device * icc_bus, ui
  * @param apic_src - The source APIC id.
  * @param apic_num - The remote APIC number.
  * @param icr      - A copy of the APIC's ICR.  (LAPIC-style ICR, clone from redir table for ioapics)
+ * @param dfr      - A copy of the APIC's DFR   (LAPIC-style DFR)
  & @param extirq   - irq for external interrupts (e.g., from 8259)
  */
-int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t apic_src, uint64_t icr, uint32_t ext_irq);
+int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t apic_src, uint64_t icr, uint32_t dfr, uint32_t ext_irq);
 
 
 #if 0
diff --git a/palacios/include/palacios/vm_guest.h b/palacios/include/palacios/vm_guest.h
index b13f462..286592d 100644
--- a/palacios/include/palacios/vm_guest.h
+++ b/palacios/include/palacios/vm_guest.h
@@ -66,7 +66,6 @@ struct guest_info {
     struct v3_shdw_pg_state shdw_pg_state;
     addr_t direct_map_pt;
 
-
     // This structure is how we get interrupts for the guest
     struct v3_intr_core_state intr_core_state;
 
@@ -120,6 +119,8 @@ struct v3_vm_info {
     addr_t mem_size; // In bytes for now
     struct v3_mem_map mem_map;
 
+    v3_paging_size_t paging_size; // for nested paging
+
     struct v3_mem_hooks mem_hooks;
 
     struct v3_shdw_impl_state shdw_impl;
diff --git a/palacios/include/palacios/vmm.h b/palacios/include/palacios/vmm.h
index bf13c3f..f37ad83 100644
--- a/palacios/include/palacios/vmm.h
+++ b/palacios/include/palacios/vmm.h
@@ -67,14 +67,26 @@ struct guest_info;
 
 
 
-#define V3_AllocPages(num_pages)				\
-    ({								\
-	extern struct v3_os_hooks * os_hooks;			\
-	void * ptr = 0;						\
-	if ((os_hooks) && (os_hooks)->allocate_pages) {		\
-	    ptr = (os_hooks)->allocate_pages(num_pages);	\
-	}							\
-	ptr;							\
+/* Allocate num_pages pages, 4KB-aligned, through the host OS allocate_pages hook.
+ * Shorthand for V3_AllocAlignedPages() with PAGE_SIZE_4KB as the alignment.
+ */
+#define V3_AllocPages(num_pages)					\
+    V3_AllocAlignedPages(num_pages, PAGE_SIZE_4KB)
+
+
+/* Allocate num_pages pages with the given alignment through the host OS
+ * allocate_pages hook. Evaluates to the pointer returned by the hook, or to
+ * a null pointer when os_hooks or its allocate_pages entry is not set.
+ * num_pages and align are each expanded exactly once.
+ */
+#define V3_AllocAlignedPages(num_pages, align)				\
+    ({									\
+	extern struct v3_os_hooks * os_hooks;				\
+	void * ptr = 0;							\
+	if ((os_hooks) && (os_hooks)->allocate_pages) {			\
+	    ptr = (os_hooks)->allocate_pages(num_pages, align);		\
+	}								\
+	ptr;								\
     })
 
 
@@ -239,7 +251,7 @@ struct v3_os_hooks {
     void (*print)(const char * format, ...)
   	__attribute__ ((format (printf, 1, 2)));
   
-    void *(*allocate_pages)(int numPages);
+    void *(*allocate_pages)(int numPages, unsigned int alignment);
     void (*free_page)(void * page);
 
     void *(*malloc)(unsigned int size);
@@ -266,7 +278,7 @@ struct v3_os_hooks {
     unsigned int (*get_cpu)(void);
     void (*interrupt_cpu)(struct v3_vm_info * vm, int logical_cpu, int vector);
     void (*call_on_cpu)(int logical_cpu, void (*fn)(void * arg), void * arg);
-    void * (*start_thread_on_cpu)(int logical_cpu, int (*fn)(void * arg), void * arg, char * thread_name);
+    void * (*start_thread_on_cpu)(int cpu_id, int (*fn)(void * arg), void * arg, char * thread_name);
 
 };
 
diff --git a/palacios/include/palacios/vmm_mem.h b/palacios/include/palacios/vmm_mem.h
index 7b6d5d4..a8e776a 100644
--- a/palacios/include/palacios/vmm_mem.h
+++ b/palacios/include/palacios/vmm_mem.h
@@ -103,6 +103,7 @@ int v3_add_shadow_mem(struct v3_vm_info * vm, uint16_t core_id,
 
 
 struct v3_mem_region * v3_get_mem_region(struct v3_vm_info * vm, uint16_t core_id, addr_t guest_addr);
+struct v3_mem_region * v3_get_next_mem_region(struct v3_vm_info * vm, uint16_t core_id, addr_t guest_addr);
 
 
 
diff --git a/palacios/include/palacios/vmm_types.h b/palacios/include/palacios/vmm_types.h
index 0c95d0d..fc4fd5f 100644
--- a/palacios/include/palacios/vmm_types.h
+++ b/palacios/include/palacios/vmm_types.h
@@ -29,6 +29,7 @@
 typedef enum {SHADOW_PAGING, NESTED_PAGING} v3_paging_mode_t;
 typedef enum {VM_RUNNING, VM_STOPPED, VM_SUSPENDED, VM_ERROR, VM_EMULATING} v3_vm_operating_mode_t;
 
+typedef enum {PAGING_4KB, PAGING_2MB} v3_paging_size_t;
 
 typedef enum {INIT, SIPI, REAL, /*UNREAL,*/ PROTECTED, PROTECTED_PAE, LONG, LONG_32_COMPAT, LONG_16_COMPAT} v3_cpu_mode_t;
 typedef enum {PHYSICAL_MEM, VIRTUAL_MEM} v3_mem_mode_t;
diff --git a/palacios/src/devices/apic.c b/palacios/src/devices/apic.c
index ad684f1..3064490 100644
--- a/palacios/src/devices/apic.c
+++ b/palacios/src/devices/apic.c
@@ -118,6 +118,8 @@ typedef enum { APIC_TMR_INT, APIC_THERM_INT, APIC_PERF_INT,
 
 
 
+
+
 struct apic_msr {
     union {
 	uint64_t value;
@@ -133,8 +135,6 @@ struct apic_msr {
 } __attribute__((packed));
 
 
-
-
 struct apic_state {
     addr_t base_addr;
 
@@ -188,6 +188,10 @@ struct apic_state {
     v3_lock_t  lock;
 };
 
+
+
+
+
 static int apic_read(struct guest_info * core, addr_t guest_addr, void * dst, uint_t length, void * priv_data);
 static int apic_write(struct guest_info * core, addr_t guest_addr, void * src, uint_t length, void * priv_data);
 
@@ -888,7 +892,9 @@ static int apic_write(struct guest_info * core, addr_t guest_addr, void * src, u
 	    // ICC???
 	    PrintDebug("apic %u: core %u: sending cmd 0x%llx to apic %u\n",apic->lapic_id.val,core->cpu_id,
 		       apic->int_cmd.val, apic->int_cmd.dst);
-	    v3_icc_send_ipi(apic->icc_bus, apic->lapic_id.val, apic->int_cmd.val,0);
+	    if (v3_icc_send_ipi(apic->icc_bus, apic->lapic_id.val, apic->int_cmd.val,apic->dst_fmt.val,0)==-1) { 
+		return -1;
+	    }
 	    break;
 	case INT_CMD_HI_OFFSET:
 	    apic->int_cmd.hi = op_val;
@@ -1098,8 +1104,20 @@ static struct v3_device_ops dev_ops = {
 
 
 
+// ICC bus callback for flat-model logical destination mode: returns 1 if this
+// apic should accept a message sent to destination mask mda, and 0 otherwise.
+static int apic_should_deliver_flat(struct guest_info * core, uint8_t mda, void * private_data)
+{
+  struct apic_state * target = (struct apic_state *)private_data;
+  int shares_bit = ((target->log_dst.dst_log_id & mda) != 0);
+
+  // an mda of 0xff is accepted unconditionally (broadcast to all)
+  return ((mda == 0xff) || shares_bit) ? 1 : 0;
+}
+
 static struct v3_icc_ops icc_ops = {
     .raise_intr = apic_raise_intr,
+    .should_deliver_flat = apic_should_deliver_flat, // flat-model logical destination filter
 };
 
 
diff --git a/palacios/src/devices/icc_bus.c b/palacios/src/devices/icc_bus.c
index c02e7f5..99eed8f 100644
--- a/palacios/src/devices/icc_bus.c
+++ b/palacios/src/devices/icc_bus.c
@@ -23,7 +23,6 @@
 #include <devices/icc_bus.h>
 #include <devices/apic_regs.h>
 
-
 #define MAX_APICS 256
 
 #ifndef CONFIG_DEBUG_ICC_BUS
@@ -202,11 +201,13 @@ static int deliver(uint32_t src_apic, struct apic_data *dest_apic, struct int_cm
 // icr_data contains interrupt vector *except* for ext_int
 // in which case it is given via irq
 //
-int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t src_apic, uint64_t icr_data, uint32_t extirq) {
+int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t src_apic, uint64_t icr_data, 
+		    uint32_t dfr_data, uint32_t extirq) {
 
     PrintDebug("icc_bus: icc_bus=%p, src_apic=%u, icr_data=%llx, extirq=%u\n",icc_bus,src_apic,icr_data,extirq);
 
     struct int_cmd_reg *icr = (struct int_cmd_reg *)&icr_data;
+    struct dst_fmt_reg *dfr = (struct dst_fmt_reg*)&dfr_data;
     struct icc_bus_state * state = (struct icc_bus_state *)icc_bus->private_data;
 
     // initial sanity checks
@@ -218,23 +219,63 @@ int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t src_apic, uint64_t icr_
 	PrintError("icc_bus: Attempted send to unregistered apic id=%u\n",icr->dst);
 	return -1;
     }
-    
-    struct apic_data * dest_apic =  &(state->apics[icr->dst]);
 
-    PrintDebug("icc_bus: IPI %s %u from %s %u to %s %u (icr=0x%llx) (extirq=%u)\n",
-	       deliverymode_str[icr->del_mode], icr->vec, src_apic==state->ioapic_id ? "ioapic" : "apic",
-	       src_apic, shorthand_str[icr->dst_shorthand], icr->dst,icr->val,
+    PrintDebug("icc_bus: IPI %s %u from %s %u to %s %s %u (icr=0x%llx, dfr=0x%x) (extirq=%u)\n",
+	       deliverymode_str[icr->del_mode], icr->vec, 
+	       src_apic==state->ioapic_id ? "ioapic" : "apic",
+	       src_apic, 	       
+	       icr->dst_mode==0 ? "(physical)" : "(logical)", 
+	       shorthand_str[icr->dst_shorthand], icr->dst,icr->val, dfr->val,
 	       extirq);
 
+    /*
+
+    if (icr->dst==state->ioapic_id) { 
+	PrintError("icc_bus: Attempted send to ioapic ignored\n");
+	return -1;
+    }
+    */
 
 
 
     switch (icr->dst_shorthand) {
 
 	case 0:  // no shorthand
-	    if (deliver(src_apic,dest_apic,icr,state,extirq)) { 
-		return -1;
+	    if (icr->dst_mode==0) { 
+		// physical delivery
+		struct apic_data * dest_apic =  &(state->apics[icr->dst]);
+		if (deliver(src_apic,dest_apic,icr,state,extirq)) { 
+		    return -1;
+		}
+	    } else {
+		// logical delivery
+		uint8_t mda = icr->dst; // message destination address, not physical address
+		
+		if (dfr->model==0xf) { 
+		    // flat model
+		    // deliver irq if
+		    // mda of sender & ldr of receiver is nonzero
+		    // mda=0xff means broadcast to all
+		    
+		    int i;
+		    for (i=0;i<MAX_APICS;i++) { 
+			struct apic_data *dest_apic=&(state->apics[i]);
+			if (dest_apic->present &&
+			    dest_apic->ops->should_deliver_flat(dest_apic->core,
+								mda,
+								dest_apic->priv_data)) { 
+			    if (deliver(src_apic,dest_apic,icr,state,extirq)) { 
+				return -1;
+			    }
+			}
+		    }
+		} else {
+		    // cluster model
+		    PrintError("icc_bus: use of cluster model not yet supported\n");
+		    return -1;
+		}
 	    }
+		
 	    break;
 
 	case 1:  // self
@@ -242,6 +283,7 @@ int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t src_apic, uint64_t icr_
 		PrintError("icc_bus: ioapic attempting to send to itself\n");
 		return -1;
 	    }
+	    struct apic_data *dest_apic=&(state->apics[src_apic]);
 	    if (deliver(src_apic,dest_apic,icr,state,extirq)) { 
 		return -1;
 	    }
@@ -251,7 +293,7 @@ int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t src_apic, uint64_t icr_
 	case 3: { // all and all-but-me
 	    int i;
 	    for (i=0;i<MAX_APICS;i++) { 
-		dest_apic=&(state->apics[i]);
+		struct apic_data *dest_apic=&(state->apics[i]);
 		if (dest_apic->present && (i!=src_apic || icr->dst_shorthand==2)) { 
 		    if (deliver(src_apic,dest_apic,icr,state,extirq)) { 
 			return -1;
@@ -260,7 +302,7 @@ int v3_icc_send_ipi(struct vm_device * icc_bus, uint32_t src_apic, uint64_t icr_
 	    }
 	}
 	    break;
-    }
+	    }
 
     return 0;
 }
diff --git a/palacios/src/devices/io_apic.c b/palacios/src/devices/io_apic.c
index ad238f3..dbbab94 100644
--- a/palacios/src/devices/io_apic.c
+++ b/palacios/src/devices/io_apic.c
@@ -291,7 +291,9 @@ static int ioapic_raise_irq(struct v3_vm_info * vm, void * private_data, int irq
 	icr.dst_shorthand=0; // no shorthand
 	icr.rsvd2=0;
 
-	v3_icc_send_ipi(ioapic->icc_bus, ioapic->ioapic_id.val,icr.val, irq);
+	// Note: 0 here is "cluster model", but it should be irrelevant
+	// since we are sending this as a physical destination
+	v3_icc_send_ipi(ioapic->icc_bus, ioapic->ioapic_id.val,icr.val, 0, irq);
     }
 
     return 0;
diff --git a/palacios/src/palacios/svm.c b/palacios/src/palacios/svm.c
index 34fed45..7b33c8c 100644
--- a/palacios/src/palacios/svm.c
+++ b/palacios/src/palacios/svm.c
@@ -44,6 +44,12 @@
 #include <palacios/vmm_sprintf.h>
 
 
+#ifndef CONFIG_DEBUG_SVM
+#undef PrintDebug
+#define PrintDebug(fmt, args...)
+#endif
+
+
 uint32_t v3_last_exit;
 
 // This is a global pointer to the host's VMCB
diff --git a/palacios/src/palacios/vmm_config.c b/palacios/src/palacios/vmm_config.c
index 9ba0a11..86eb1dd 100644
--- a/palacios/src/palacios/vmm_config.c
+++ b/palacios/src/palacios/vmm_config.c
@@ -199,7 +199,7 @@ static int pre_config_vm(struct v3_vm_info * vm, v3_cfg_tree_t * vm_cfg) {
     PrintDebug("Memory=%s\n", memory_str);
 
     // Amount of ram the Guest will have, always in MB
-    vm->mem_size = atoi(memory_str) * 1024 * 1024;
+    vm->mem_size = (unsigned long)atoi(memory_str) * 1024UL * 1024UL;
     
     if (strcasecmp(vm_class, "PC") == 0) {
 	vm->vm_class = V3_PC_VM;
@@ -208,7 +208,6 @@ static int pre_config_vm(struct v3_vm_info * vm, v3_cfg_tree_t * vm_cfg) {
 	return -1;
     }
 
-
 #ifdef CONFIG_TELEMETRY
     {
 	char * telemetry = v3_cfg_val(vm_cfg, "telemetry");
@@ -247,7 +246,8 @@ static int determine_paging_mode(struct guest_info *info, v3_cfg_tree_t * core_c
 
     v3_cfg_tree_t *vm_tree = info->vm_info->cfg_data->cfg;
     v3_cfg_tree_t *pg_tree = v3_cfg_subtree(vm_tree, "paging");
-    char *pg_mode = v3_cfg_val(pg_tree, "mode");
+    char *pg_mode          = v3_cfg_val(pg_tree, "mode");
+    char *page_size        = v3_cfg_val(pg_tree, "page_size");
     
     PrintDebug("Paging mode specified as %s\n", pg_mode);
 
@@ -266,12 +266,21 @@ static int determine_paging_mode(struct guest_info *info, v3_cfg_tree_t * core_c
 	    info->shdw_pg_mode = SHADOW_PAGING;
 	}
     } else {
-	PrintDebug("No paging mode specified in configuration.\n");
+	PrintDebug("No paging type specified in configuration. Defaulting to shadow paging\n");
 	info->shdw_pg_mode = SHADOW_PAGING;
     }
 
     if (info->shdw_pg_mode == NESTED_PAGING) {
     	PrintDebug("Guest Paging Mode: NESTED_PAGING\n");
+	if ((page_size == NULL) || (strcasecmp(page_size, "4kb") == 0)) { /* no page_size given => 4KB; TODO: this may not be an ideal place for this */
+	    info->vm_info->paging_size = PAGING_4KB;
+	} else if (strcasecmp(page_size, "2mb") == 0) {
+	    info->vm_info->paging_size = PAGING_2MB;
+	} else {
+	    PrintError("Invalid VM paging size: '%s'\n", page_size);
+	    return -1;
+	}
+	PrintDebug("VM page size=%s\n", (info->vm_info->paging_size == PAGING_2MB) ? "2MB" : "4KB");
     } else if (info->shdw_pg_mode == SHADOW_PAGING) {
         PrintDebug("Guest Paging Mode: SHADOW_PAGING\n");
     } else {
diff --git a/palacios/src/palacios/vmm_direct_paging_64.h b/palacios/src/palacios/vmm_direct_paging_64.h
index a0408d9..c4c41e3 100644
--- a/palacios/src/palacios/vmm_direct_paging_64.h
+++ b/palacios/src/palacios/vmm_direct_paging_64.h
@@ -27,33 +27,91 @@
 #include <palacios/vm_guest_mem.h>
 #include <palacios/vm_guest.h>
 
-
+// Reference: AMD Software Developer Manual Vol.2 Ch.5 "Page Translation and Protection"
 
 static inline int handle_passthrough_pagefault_64(struct guest_info * info, 
 						  addr_t fault_addr, 
 						  pf_error_t error_code) {
-    pml4e64_t * pml = NULL;
-    pdpe64_t * pdpe = NULL;
-    pde64_t * pde = NULL;
-    pte64_t * pte = NULL;
-    addr_t host_addr = 0;
-
-    int pml_index = PML4E64_INDEX(fault_addr);
+    pml4e64_t * pml      = NULL;
+    pdpe64_t * pdpe      = NULL;
+    pde64_t * pde        = NULL;
+    pde64_2MB_t * pde2mb = NULL;
+    pte64_t * pte        = NULL;
+    addr_t host_addr     = 0;
+
+    int pml_index  = PML4E64_INDEX(fault_addr);
     int pdpe_index = PDPE64_INDEX(fault_addr);
-    int pde_index = PDE64_INDEX(fault_addr);
-    int pte_index = PTE64_INDEX(fault_addr);
-
-
-    
+    int pde_index  = PDE64_INDEX(fault_addr);
+    int pte_index  = PTE64_INDEX(fault_addr);
 
     struct v3_mem_region * region =  v3_get_mem_region(info->vm_info, info->cpu_id, fault_addr);
-  
+    struct v3_mem_region * base_reg = &(info->vm_info->mem_map.base_region);
+
+    /* If the guest has been configured for 2MiB pages, then we must check for hooked regions of
+     * memory which may overlap with the 2MiB page containing the faulting address (due to
+     * potentially differing access policies in place for e.g. i/o devices and APIC). A 2MiB page
+     * can be used if a) no region overlaps the page [or b) a region does overlap but fully contains
+     * the page]. The [bracketed] text pertains to the #if 0'd code below, state D. TODO modify this
+     * note if someone decides to enable this optimization. It can be tested with the SeaStar
+     * mapping.
+     *
+     * Examples: (CAPS regions are returned by v3_get_next_mem_region; state A returns the base reg)
+     *
+     *    |region| |region|                               2MiB mapped (state A)
+     *                   |reg|          |REG|             2MiB mapped (state B)
+     *   |region|     |reg|   |REG| |region|   |reg|      4KiB mapped (state C)
+     *        |reg|  |reg|   |--REGION---|                [2MiB mapped (state D)]
+     * |--------------------------------------------|     RAM
+     *                             ^                      fault addr
+     * |----|----|----|----|----|page|----|----|----|     2MB pages
+     *                           >>>>>>>>>>>>>>>>>>>>     search space
+     */
+    addr_t pg_start = 0UL, pg_end = 0UL; // 2MiB page containing the faulting address
+    struct v3_mem_region * pg_next_reg = NULL; // next immediate mem reg after page start addr
+    bool use_large_page = false;
+
     if (region == NULL) {
-	PrintError("Invalid region in passthrough page fault 64, addr=%p\n", 
-		   (void *)fault_addr);
+	PrintError("%s: invalid region, addr=%p\n", __FUNCTION__, (void *)fault_addr);
 	return -1;
     }
 
+    // set use_large_page here
+    if (info->vm_info->paging_size == PAGING_2MB) {
+
+	// guest page maps to a host page + offset (so when we shift, it aligns with a host page)
+	pg_start = PAGE_ADDR_2MB(fault_addr);
+	pg_end = (pg_start + PAGE_SIZE_2MB);
+
+	PrintDebug("%s: page   [%p,%p) contains address\n", __FUNCTION__, (void *)pg_start, (void *)pg_end);
+
+	pg_next_reg = v3_get_next_mem_region(info->vm_info, info->cpu_id, pg_start);
+
+	if (pg_next_reg == NULL) {
+	    PrintError("%s: Error: address not in base region, %p\n", __FUNCTION__, (void *)fault_addr);
+	    return -1;
+	}
+
+	if ((pg_next_reg->guest_start == base_reg->guest_start) &&
+		(pg_next_reg->guest_end == base_reg->guest_end)) { // next region == base region
+	    use_large_page = 1; // State A
+	} else {
+#if 0       // State B/C and D optimization
+	    use_large_page = (pg_next_reg->guest_end >= pg_end) &&
+		((pg_next_reg->guest_start >= pg_end) || (pg_next_reg->guest_start <= pg_start));
+	    PrintDebug("%s: region [%p,%p) %s partial overlap with page\n", __FUNCTION__,
+		    (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end,
+		    (use_large_page ? "does not have" : "has"));
+#else       // State B/C
+	    use_large_page = (pg_next_reg->guest_start >= pg_end);
+	    PrintDebug("%s: region [%p,%p) %s overlap with page\n", __FUNCTION__,
+		    (void *)pg_next_reg->guest_start, (void *)pg_next_reg->guest_end,
+		    (use_large_page ? "does not have" : "has"));
+#endif
+	}
+    }
+
+    PrintDebug("%s: Address gets a 2MiB page? %s\n", __FUNCTION__, (use_large_page ? "yes" : "no"));
+
     // Lookup the correct PML address based on the PAGING MODE
     if (info->shdw_pg_mode == SHADOW_PAGING) {
 	pml = CR3_TO_PML4E64_VA(info->ctrl_regs.cr3);
@@ -70,9 +128,9 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * info,
         pml[pml_index].writable = 1;
         pml[pml_index].user_page = 1;
 
-	pml[pml_index].pdp_base_addr = PAGE_BASE_ADDR((addr_t)V3_PAddr(pdpe));    
+	pml[pml_index].pdp_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pdpe));
     } else {
-	pdpe = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pml[pml_index].pdp_base_addr));
+	pdpe = V3_VAddr((void*)BASE_TO_PAGE_ADDR_4KB(pml[pml_index].pdp_base_addr));
     }
 
     // Fix up the PDPE entry
@@ -84,11 +142,48 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * info,
 	pdpe[pdpe_index].writable = 1;
 	pdpe[pdpe_index].user_page = 1;
 
-	pdpe[pdpe_index].pd_base_addr = PAGE_BASE_ADDR((addr_t)V3_PAddr(pde));    
+	pdpe[pdpe_index].pd_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pde));
     } else {
-	pde = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pdpe[pdpe_index].pd_base_addr));
+	pde = V3_VAddr((void*)BASE_TO_PAGE_ADDR_4KB(pdpe[pdpe_index].pd_base_addr));
+    }
+
+    // Fix up the 2MiB PDE and exit here
+    if (use_large_page) {
+	pde2mb = (pde64_2MB_t *)pde; // all but these two lines are the same for PTE
+	pde2mb[pde_index].large_page = 1;
+
+	if (pde2mb[pde_index].present == 0) {
+	    pde2mb[pde_index].user_page = 1;
+
+	    if ((region->flags.alloced == 1) && (region->flags.read == 1)) {
+		// Full access
+		pde2mb[pde_index].present = 1;
+		if (region->flags.write == 1) {
+		    pde2mb[pde_index].writable = 1;
+		} else {
+		    pde2mb[pde_index].writable = 0;
+		}
+
+		if (v3_gpa_to_hpa(info, fault_addr, &host_addr) == -1) {
+		    PrintError("Error Could not translate fault addr (%p)\n", (void *)fault_addr);
+		    return -1;
+		}
+
+		pde2mb[pde_index].page_base_addr = PAGE_BASE_ADDR_2MB(host_addr);
+	    } else {
+		return region->unhandled(info, fault_addr, fault_addr, region, error_code);
+	    }
+	} else {
+	    // We fix all permissions on the first pass,
+	    // so we only get here if its an unhandled exception
+	    return region->unhandled(info, fault_addr, fault_addr, region, error_code);
+	}
+
+	// The 2MiB mapping is installed: do not fall through and treat this PDE as a page table pointer
+	return 0;
     }
 
+    // Continue with the 4KiB page hierarchy
 
     // Fix up the PDE entry
     if (pde[pde_index].present == 0) {
@@ -98,9 +193,9 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * info,
 	pde[pde_index].writable = 1;
 	pde[pde_index].user_page = 1;
 	
-	pde[pde_index].pt_base_addr = PAGE_BASE_ADDR((addr_t)V3_PAddr(pte));
+	pde[pde_index].pt_base_addr = PAGE_BASE_ADDR_4KB((addr_t)V3_PAddr(pte));
     } else {
-	pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pde[pde_index].pt_base_addr));
+	pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR_4KB(pde[pde_index].pt_base_addr));
     }
 
 
@@ -124,7 +219,7 @@ static inline int handle_passthrough_pagefault_64(struct guest_info * info,
 		return -1;
    	    }
 
-	    pte[pte_index].page_base_addr = PAGE_BASE_ADDR(host_addr);
+	    pte[pte_index].page_base_addr = PAGE_BASE_ADDR_4KB(host_addr);
 	} else {
 	    return region->unhandled(info, fault_addr, fault_addr, region, error_code);
 	}
@@ -170,7 +265,7 @@ static inline int invalidate_addr_64(struct guest_info * info, addr_t inv_addr)
 
     if (pdpe[pdpe_index].present == 0) {
 	return 0;
-    } else if (pdpe[pdpe_index].large_page == 1) {
+    } else if (pdpe[pdpe_index].large_page == 1) { // 1GiB
 	pdpe[pdpe_index].present = 0;
 	return 0;
     }
@@ -179,14 +274,14 @@ static inline int invalidate_addr_64(struct guest_info * info, addr_t inv_addr)
 
     if (pde[pde_index].present == 0) {
 	return 0;
-    } else if (pde[pde_index].large_page == 1) {
+    } else if (pde[pde_index].large_page == 1) { // 2MiB
 	pde[pde_index].present = 0;
 	return 0;
     }
 
     pte = V3_VAddr((void*)BASE_TO_PAGE_ADDR(pde[pde_index].pt_base_addr));
 
-    pte[pte_index].present = 0;
+    pte[pte_index].present = 0; // 4KiB
 
     return 0;
 }
diff --git a/palacios/src/palacios/vmm_mem.c b/palacios/src/palacios/vmm_mem.c
index 80e17a3..1c6d51a 100644
--- a/palacios/src/palacios/vmm_mem.c
+++ b/palacios/src/palacios/vmm_mem.c
@@ -64,9 +64,10 @@ int v3_init_mem_map(struct v3_vm_info * vm) {
     // There is an underlying region that contains all of the guest memory
     // PrintDebug("Mapping %d pages of memory (%u bytes)\n", (int)mem_pages, (uint_t)info->mem_size);
 
+    // 2MB page alignment needed for 2MB hardware nested paging
     map->base_region.guest_start = 0;
     map->base_region.guest_end = mem_pages * PAGE_SIZE_4KB;
-    map->base_region.host_addr = (addr_t)V3_AllocPages(mem_pages);
+    map->base_region.host_addr = (addr_t)V3_AllocAlignedPages(mem_pages, PAGE_SIZE_2MB);
 
     map->base_region.flags.read = 1;
     map->base_region.flags.write = 1;
@@ -189,8 +190,7 @@ struct v3_mem_region * __insert_mem_region(struct v3_vm_info * vm,
 
 
 
-int v3_insert_mem_region(struct v3_vm_info * vm, 
-			    struct v3_mem_region * region) {
+int v3_insert_mem_region(struct v3_vm_info * vm, struct v3_mem_region * region) {
     struct v3_mem_region * ret;
     int i = 0;
 
@@ -289,6 +289,43 @@ struct v3_mem_region * v3_get_mem_region(struct v3_vm_info * vm, uint16_t core_i
 }
 
 
+/* Walk the "hooked" memory region tree from its root, moving right past each region that ends at or
+ * before guest_addr (or that belongs to another core), and return the first region reached that ends
+ * after guest_addr and is usable by core_id. If none is reached, return the base region, or NULL when
+ * guest_addr lies beyond the end of the base region.
+ * NOTE(review): the walk never descends left, so the result may not be the lowest matching region.
+ */
+struct v3_mem_region * v3_get_next_mem_region( struct v3_vm_info * vm, uint16_t core_id, addr_t guest_addr) {
+    struct rb_node * n = vm->mem_map.mem_regions.rb_node;
+    struct v3_mem_region * reg = NULL;
+
+    // Keep going right in the tree while the address is at or past the current region's end address.
+    while (n) {
+        reg = rb_entry(n, struct v3_mem_region, tree_node);
+        if (guest_addr >= reg->guest_end) { // reg is [start,end)
+            n = n->rb_right;
+        } else {
+	    // PAD: this may be buggy since there are no guarantees that the cores are in order
+	    if ((core_id == reg->core_id) || (reg->core_id == V3_MEM_CORE_ANY)) {
+		return reg;
+	    } else {
+		n = n->rb_right;
+	    }
+        }
+    }
+    
+    // No hooked region was returned, so we check if it's a valid address in the base region
+    
+    if (guest_addr >= vm->mem_map.base_region.guest_end) {
+	PrintError("%s: Guest Address Exceeds Base Memory Size (ga=%p), (limit=%p)\n",
+		   __FUNCTION__, (void *)guest_addr, (void *)vm->mem_map.base_region.guest_end);
+        v3_print_mem_map(vm);
+        return NULL;
+    }
+    
+    return &(vm->mem_map.base_region);
+}
+
 
 
 void v3_delete_mem_region(struct v3_vm_info * vm, struct v3_mem_region * reg) {
diff --git a/utils/guest_creator/default.xml b/utils/guest_creator/default.xml
index 6316742..b1727ac 100644
--- a/utils/guest_creator/default.xml
+++ b/utils/guest_creator/default.xml
@@ -12,7 +12,7 @@
 	</paging>
 <!--
 	<paging mode="nested">
-		<pagesize>4KB</pagesize>
+		<page_size>2MB</page_size>
 	</paging>
 -->
 	<schedule_hz>100</schedule_hz>