From: Peter Dinda Date: Sun, 2 Aug 2015 23:31:43 +0000 (-0500) Subject: HVM capability enhancements X-Git-Url: http://v3vee.org/palacios/gitweb/gitweb.cgi?p=palacios.git;a=commitdiff_plain;h=142b59ca44b3d00f0479910c986f6791216651e7 HVM capability enhancements - ROS / HRT boot-time interaction protocol enhancements - ROS / HRT / VMM run-time interaction protocol - ROS->HRT resets - ROS->HRT address space merges - ROS->HRT sequential and parallel function invocations - More generalized paging environment build for HRT including offsets, PIC, etc. - Refactoring between multiboot and HVM - More consistent magic numbers - Descriptor, EFER, and ctrl reg corrections --- diff --git a/palacios/include/palacios/vmm_hvm.h b/palacios/include/palacios/vmm_hvm.h index 7a1dceb..7f82220 100644 --- a/palacios/include/palacios/vmm_hvm.h +++ b/palacios/include/palacios/vmm_hvm.h @@ -24,6 +24,7 @@ #ifdef __V3VEE__ #include +#include struct v3_vm_hvm { uint8_t is_hvm; @@ -32,6 +33,23 @@ struct v3_vm_hvm { struct v3_cfg_file *hrt_file; uint64_t hrt_entry_addr; enum { HRT_BLOB, HRT_ELF64, HRT_MBOOT2, HRT_MBOOT64 } hrt_type; + + // The following parallel the content of mb_info_hrt_t in + // the extended multiboot header. 
They reflect how the + // HRT has actually been mapped, as opposed to the requested + // mapping/flags from the mb_mb64_hrt_t + uint64_t hrt_flags; + uint64_t max_mem_mapped; + uint64_t gva_offset; + uint64_t gva_entry; + uint64_t comm_page_gpa; + uint8_t hrt_int_vector; + + void *comm_page_hpa; + void *comm_page_hva; + + enum {HRT_IDLE=0, HRT_CALL, HRT_PARCALL, HRT_MERGE} trans_state; + uint64_t trans_count; }; struct v3_core_hvm { @@ -39,6 +57,8 @@ struct v3_core_hvm { uint64_t last_boot_start; }; + + struct v3_xml; int v3_init_hvm(); @@ -68,11 +88,110 @@ void v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_in uint32_t *start_apic, uint32_t *num_apics); +int v3_build_hrt_multiboot_tag(struct guest_info *core, mb_info_hrt_t *hrt); + int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm); int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core); int v3_handle_hvm_reset(struct guest_info *core); +/* + HVM/HRT interaction is as follows: + + 1. MB_TAG_MB64_HRT tag in the HRT multiboot kernel signifies it + is handled by the HVM. + 2. The flags and other info in the tag indicate the properties of the HRT + to the HVM. (see vmm_multiboot.h), in particular: + - position independence + - ability to be initially mapped with an offset + between virtual and physical addresses, for example + to hoist it into the same position that the ROS kernel + will occupy in the virtual address space of a ROS + process + - how much physical address space we will initially map + and what kind of page tables are used to map it + - what physical page (4KB) should we reserve for use + in HVM/HRT communication (particularly upcalls) + - the interrupt vector used to upcall from the HVM to the HRT + 3. The MB_INFO_HRT_TAG within the multiboot info structures the + HRT sees on boot indicates that HRT functionality is established and + gives details of operation to the HRT, including the following. 
+ See vmm_multiboot.c for more info + - apics and ioapic ids, and indications of which apics + and which entries on ioapics are exclusively for HRT use + - physical address range that is exclusively for HRT use + - where the physical address range exclusively for HRT use + is mapped into the virtual address space (offset). The + ROS part of the physical address space is always identity mapped + initially. + - the amount of physical memory that has been mapped + - the physical address of the page the HVM will use to + communicate with the HRT + - the interrupt vector the HVM will use to upcall the HRT + - flags copied from the HRT's HRT tag (position independence, + page table model, offset, etc) + 4. Downcalls: + hypercall 0xf00df00d with arguments depending on operation + with examples described below. + 5. Upcalls + interrupt injected by VMM or a magic #PF + communication via a shared memory page, contents below + + Upcalls + + Type of upcall is determined by the first 64 bits in the comm page + + 0x0 => Null (test) + 0x20 => Invoke function in HRT + Next 64 bits contains address of structure + describing function call. This is typically the ROS + trying to get the HRT to run a function for it. + ROS is responsible for assuring that this address + (and other addresses) are correct with respect to + mappings. That is, for a non-merged address space, + the ROS needs to supply physical addresses so that + they can be used (with the identity-mapped ROS physical + memory.) If it wants to use virtual addresses, it + needs to first merge the address spaces. + 0x21 => Invoke function in HRT in parallel + Exactly like previous, but the upcall is happening + simultaneously on all HRT cores. 
+ 0x30 => Merge address space + Next 64 bits contains the ROS CR3 that we will use + to map the user portion of ROS address space into + the HRT address space + 0x31 => Unmerge address space + return the ROS memory mapping to normal (physical/virtual identity) + + Downcalls + + HVM_HCALL is the general hypercall number used to talk to the HVM + The first argument is the request number (below). The other arguments + depend on the first. + + 0x0 => Null, just for timing + 0x1 => Reboot ROS + 0x2 => Reboot HRT + 0x3 => Reboot Both + 0xf => Get HRT transaction state + + 0x20 => Invoke function (ROS->HRT) + first argument is pointer to structure describing call + 0x21 => Invoke function in parallel (ROS->HRT) + same as above, but simultaneously on all HRT cores + 0x2f => Function execution complete (HRT->ROS, once per core) + 0x30 => Merge address space (ROS->HRT) + no arguments (CR3 implicit). Merge the current + address space in the ROS with the address space on + the HRT + 0x31 => Unmerge address space (ROS->HRT) + release any address space merger and restore identity mapping + 0x3f => Merge request complete (HRT->ROS) + +*/ + + + #endif /* ! __V3VEE__ */ diff --git a/palacios/include/palacios/vmm_multiboot.h b/palacios/include/palacios/vmm_multiboot.h index c5c370c..f4aaac1 100644 --- a/palacios/include/palacios/vmm_multiboot.h +++ b/palacios/include/palacios/vmm_multiboot.h @@ -26,6 +26,13 @@ #include +/****************************************************************** + Data contained in the ELF file we will attempt to boot +******************************************************************/ + +#define ELF_MAGIC 0x464c457f +#define MB2_MAGIC 0xe85250d6 + typedef struct mb_header { uint32_t magic; uint32_t arch; @@ -89,10 +96,35 @@ typedef struct mb_modalign { // version of multiboot. 
The existence of // this tag indicates that this special mode is // requested -#define MB_TAG_MB64_HRT 0xf00d +#define MB_TAG_MB64_HRT 0xf00d typedef struct mb_mb64_hrt { mb_tag_t tag; - uint32_t hrt_flags; + uint64_t hrt_flags; + // whether this kernel is relocable +#define MB_TAG_MB64_HRT_FLAG_RELOC 0x1 + // How to map the memory in the initial PTs + // highest set bit wins +#define MB_TAG_MB64_HRT_FLAG_MAP_4KB 0x100 +#define MB_TAG_MB64_HRT_FLAG_MAP_2MB 0x200 +#define MB_TAG_MB64_HRT_FLAG_MAP_1GB 0x400 +#define MB_TAG_MB64_HRT_FLAG_MAP_512GB 0x800 + + // How much physical address space to map in the + // initial page tables (bytes) + // + uint64_t max_mem_to_map; + // offset of the GVA->GPA mappings (GVA of GPA 0) + uint64_t gva_offset; + // 64 bit entry address (=0 to use entry tag (which will be offset by gva_offset)) + uint64_t gva_entry; + // desired address of the page the VMM, HRT, and ROS share + // for communication. "page" here a 4 KB quantity + uint64_t comm_page_gpa; + // desired interrupt vector that should be used for upcalls + // the default for this is 255 + uint8_t hrt_int_vector; + uint8_t reserved[7]; + } __attribute__((packed)) mb_mb64_hrt_t; typedef struct mb_data { @@ -106,6 +138,131 @@ typedef struct mb_data { mb_mb64_hrt_t *mb64_hrt; } mb_data_t; + + +// We are not doing: +// +// - BIOS Boot Device +// - Modules +// - ELF symbols +// - Boot Loader name +// - APM table +// - VBE info +// - Framebuffer info +// + + + +/****************************************************************** + Data we will pass to the kernel via rbx +******************************************************************/ + +#define MB2_INFO_MAGIC 0x36d76289 + + +typedef struct mb_info_header { + uint32_t totalsize; + uint32_t reserved; +} __attribute__((packed)) mb_info_header_t; + +// A tag of type 0, size 8 indicates last value +// +typedef struct mb_info_tag { + uint32_t type; + uint32_t size; +} __attribute__((packed)) mb_info_tag_t; + + +#define MB_INFO_MEM_TAG 4 
+typedef struct mb_info_mem { + mb_info_tag_t tag; + uint32_t mem_lower; // 0..640K in KB + uint32_t mem_upper; // in KB to first hole - 1 MB +} __attribute__((packed)) mb_info_mem_t; + +#define MB_INFO_CMDLINE_TAG 1 +// note alignment of 8 bytes required for each... +typedef struct mb_info_cmdline { + mb_info_tag_t tag; + uint32_t size; // includes zero termination + uint8_t string[]; // zero terminated +} __attribute__((packed)) mb_info_cmdline_t; + + +#define MEM_RAM 1 +#define MEM_ACPI 3 +#define MEM_RESV 4 + +typedef struct mb_info_memmap_entry { + uint64_t base_addr; + uint64_t length; + uint32_t type; + uint32_t reserved; +} __attribute__((packed)) mb_info_memmap_entry_t; + +#define MB_INFO_MEMMAP_TAG 6 +// note alignment of 8 bytes required for each... +typedef struct mb_info_memmap { + mb_info_tag_t tag; + uint32_t entry_size; // multiple of 8 + uint32_t entry_version; // 0 + mb_info_memmap_entry_t entries[]; +} __attribute__((packed)) mb_info_memmap_t; + +#define MB_INFO_HRT_TAG 0xf00df00d +typedef struct mb_info_hrt { + mb_info_tag_t tag; + // apic ids are 0..num_apics-1 + // ioapics follow + // apic and ioapic addresses are the well known places + uint32_t total_num_apics; + // first apic the HRT owns (HRT core 0) + uint32_t first_hrt_apic_id; + // can the HRT use an ioapic? + uint32_t have_hrt_ioapic; + // if so, this is the first entry on the + // ioapic that can be used by the HRT + uint32_t first_hrt_ioapic_entry; + // CPU speed + uint64_t cpu_freq_khz; + // copy of the HRT flags from the kernel (indicating + // page table mapping type, position independence, etc. + // these reflect how it has actually been mapped + uint64_t hrt_flags; + // the amount of physical address space that has been mapped + // initially. + uint64_t max_mem_mapped; + // The first physical address the HRT should + // (nominally) use. 
Physical addresses below this are + // visible to the ROS + uint64_t first_hrt_gpa; + // Where the intial boot state starts in the physical address + // space. This includes INT HANDLER,IDT,GDT,TSS, PAGETABLES, + // and MBINFO, but not the scratch stacks + // This is essentially the content of CR3 - 1 page on boot + uint64_t boot_state_gpa; + // Where GPA 0 is mapped in the virtual address space + uint64_t gva_offset; + + // Typically: + // first_hrt_vaddr==first_hrt_paddr => no address space coalescing + // first_hrt_vaddr>first_hrt_paddr => address space coalescing + // For example, first_hrt_vaddr might be set to the start of linux kernel + // This then allows us to coalesce user portion of the address space of + // a linux process and the HRT + // for communication. "page" here a 4 KB quantity + + // address of the page the VMM, HRT, and ROS share + uint64_t comm_page_gpa; + // interrupt vector used to upcall to HRT (==0 if none) + // downcalls are done with HVM hypercall 0xf00df00d + uint8_t hrt_int_vector; + uint8_t reserved[7]; +} __attribute__((packed)) mb_info_hrt_t; + + + + struct v3_vm_multiboot { uint8_t is_multiboot; struct v3_cfg_file *mb_file; @@ -118,6 +275,7 @@ struct v3_vm_multiboot { // There is no core structure for // multiboot capability + struct v3_xml; int v3_init_multiboot(); diff --git a/palacios/src/palacios/vmm_hvm.c b/palacios/src/palacios/vmm_hvm.c index 7fb278b..3e506d7 100644 --- a/palacios/src/palacios/vmm_hvm.c +++ b/palacios/src/palacios/vmm_hvm.c @@ -56,7 +56,7 @@ RAM (MB) Note these are backward compatible - + (MB) @@ -69,12 +69,6 @@ #endif -// if set, we will map the first 1 GB of memory using a 3 level -// hierarchy, for compatibility with Nautilus out of the box. 
-// Otherwise we will map the first 512 GB using a 2 level -// hieratchy -#define HVM_MAP_1G_2M 1 - int v3_init_hvm() { PrintDebug(VM_NONE,VCORE_NONE, "hvm: init\n"); @@ -87,19 +81,245 @@ int v3_deinit_hvm() return 0; } +// ignore requests from when we are in the wrong state +#define ENFORCE_STATE_MACHINE 1 + +// invoke the HRT using a page fault instead of +// the SWINTR mechanism +#define USE_UPCALL_MAGIC_PF 1 +#define UPCALL_MAGIC_ADDRESS 0x0000800df00df00dULL +#define UPCALL_MAGIC_ERROR 0xf00df00d + +/* + 64 bit only hypercall: + rax = hypercall number + rbx = 0x646464... + then args are: rcx, rdx, rsi, rdi r8, r9, r10, r11 + rcx = 1st arg +*/ static int hvm_hcall_handler(struct guest_info * core , hcall_id_t hcall_id, void * priv_data) { uint64_t c; + uint64_t bitness = core->vm_regs.rbx; + uint64_t a1 = core->vm_regs.rcx; + uint64_t a2 = core->vm_regs.rdx; + struct v3_vm_hvm *h = &core->vm_info->hvm_state; + + + if (bitness!=0x6464646464646464) { + PrintError(core->vm_info,core,"hvm: unable to handle non-64 bit hypercall\n"); + core->vm_regs.rax = -1; + return 0; + } + + switch (a1) { + case 0x0: // null + + rdtscll(c); + + V3_Print(core->vm_info,core, "hvm: received hypercall %x rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n", + hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, core->hvm_state.last_boot_start, core->num_exits); + //v3_print_core_telemetry(core); + // v3_print_guest_state(core); + core->vm_regs.rax = 0; + break; + + case 0x1: // reset ros + PrintDebug(core->vm_info,core, "hvm: reset ROS\n"); + if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ROS,0)) { + PrintError(core->vm_info,core, "hvm: reset of ROS failed\n"); + core->vm_regs.rax = -1; + } else { + core->vm_regs.rax = 0; + } + break; + + case 0x2: // reset hrt + PrintDebug(core->vm_info,core, "hvm: reset HRT\n"); + if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_HRT,0)) { + 
PrintError(core->vm_info,core, "hvm: reset of HRT failed\n"); + core->vm_regs.rax = -1; + } else { + core->vm_regs.rax = 0; + } + break; - rdtscll(c); + case 0x3: // reset both + PrintDebug(core->vm_info,core, "hvm: reset ROS+HRT\n"); + if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ALL,0)) { + PrintError(core->vm_info,core, "hvm: reset of HRT failed\n"); + core->vm_regs.rax = -1; + } else { + core->vm_regs.rax = 0; + } + break; + + case 0xf: // get HRT state + core->vm_regs.rax = h->trans_state; + //PrintDebug(core->vm_info,core,"hvm: get HRT transaction state 0x%llx\n",core->vm_regs.rax); + break; + case 0x20: // invoke function (ROS->HRT) + case 0x21: // invoke parallel function (ROS->HRT) + if (v3_is_hvm_hrt_core(core)) { + PrintError(core->vm_info,core,"hvm: %s function invocation not supported from HRT core\n", a1==0x20 ? "" : "parallel"); + core->vm_regs.rax = -1; + } else { + if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { + PrintError(core->vm_info,core, "hvm: cannot invoke %s function %p in state %d\n",a1==0x20 ? "" : "parallel", (void*)a2,h->trans_state); + core->vm_regs.rax = -1; + } else { + uint64_t *page = (uint64_t *) h->comm_page_hva; + uint64_t first, last, cur; + + PrintDebug(core->vm_info,core, "hvm: %s invoke function %p\n",a1==0x20 ? 
"" : "parallel",(void*)a2); + page[0] = a1; + page[1] = a2; + + if (a1==0x20) { + first=last=h->first_hrt_core; + } else { + first=h->first_hrt_core; + last=core->vm_info->num_cores-1; + } + + core->vm_regs.rax = 0; + + h->trans_count = last-first+1; + + for (cur=first;cur<=last;cur++) { + +#if USE_UPCALL_MAGIC_PF + PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",cur); + core->vm_info->cores[cur].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS; + if (v3_raise_exception_with_error(&core->vm_info->cores[cur], + PF_EXCEPTION, + UPCALL_MAGIC_ERROR)) { + PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",cur); + core->vm_regs.rax = -1; + break; + } +#else + PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %llu\n",h->hrt_int_vector,cur); + if (v3_raise_swintr(&core->vm_info->cores[cur],h->hrt_int_vector)) { + PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",cur); + core->vm_regs.rax = -1; + break; + } +#endif + // Force core to exit now + v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0); + + } + if (core->vm_regs.rax==0) { + if (a1==0x20) { + h->trans_state = HRT_CALL; + } else { + h->trans_state = HRT_PARCALL; + } + } else { + PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n"); + h->trans_state = HRT_IDLE; + h->trans_count = 0; + } + } + } + break; - V3_Print(core->vm_info,core, "hvm: received hypercall %x rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n", - hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, c-core->hvm_state.last_boot_start, core->num_exits); - //v3_print_core_telemetry(core); - // v3_print_guest_state(core); + case 0x2f: // function exec done + if (v3_is_hvm_ros_core(core)) { + PrintError(core->vm_info,core, "hvm: request for exec done from ROS core\n"); + core->vm_regs.rax=-1; + } else { + if (ENFORCE_STATE_MACHINE && 
h->trans_state!=HRT_CALL && h->trans_state!=HRT_PARCALL) { + PrintError(core->vm_info,core,"hvm: function completion when not in HRT_CALL or HRT_PARCALL state\n"); + core->vm_regs.rax=-1; + } else { + uint64_t one=1; + PrintDebug(core->vm_info,core, "hvm: function complete\n"); + if (__sync_fetch_and_sub(&h->trans_count,one)==1) { + // last one, switch state + h->trans_state=HRT_IDLE; + PrintDebug(core->vm_info,core, "hvm: function complete - back to idle\n"); + } + core->vm_regs.rax=0; + } + } + + break; + + case 0x30: // merge address space + case 0x31: // unmerge address space + if (v3_is_hvm_hrt_core(core)) { + PrintError(core->vm_info,core,"hvm: request to %smerge address space from HRT core\n", a1==0x30 ? "" : "un"); + core->vm_regs.rax=-1; + } else { + if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { + PrintError(core->vm_info,core,"hvm: request to %smerge address space in non-idle state\n",a1==0x30 ? "" : "un"); + core->vm_regs.rax=-1; + } else { + uint64_t *page = (uint64_t *) h->comm_page_hva; + + PrintDebug(core->vm_info,core,"hvm: %smerge address space request with %p\n",a1==0x30 ? 
"" : "un",(void*)core->ctrl_regs.cr3); + // should sanity check to make sure guest is in 64 bit without anything strange + + page[0] = a1; + page[1] = core->ctrl_regs.cr3; // this is a do-not-care for an unmerge + + core->vm_regs.rax = 0; +#if USE_UPCALL_MAGIC_PF + PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %u\n",h->first_hrt_core); + core->vm_info->cores[h->first_hrt_core].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS; + if (v3_raise_exception_with_error(&core->vm_info->cores[h->first_hrt_core], + PF_EXCEPTION, + UPCALL_MAGIC_ERROR)) { + PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %u\n",h->first_hrt_core); + core->vm_regs.rax = -1; + break; + } +#else + PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %u\n",h->hrt_int_vector,h->first_hrt_core); + if (v3_raise_swintr(&core->vm_info->cores[h->first_hrt_core],h->hrt_int_vector)) { + PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %u\n",h->first_hrt_core); + core->vm_regs.rax = -1; + } +#endif + // Force core to exit now + v3_interrupt_cpu(core->vm_info,core->vm_info->cores[h->first_hrt_core].pcpu_id,0); + + h->trans_state = HRT_MERGE; + } + + } + + break; + + + case 0x3f: // merge operation done + if (v3_is_hvm_ros_core(core)) { + PrintError(core->vm_info,core, "hvm: request for merge done from ROS core\n"); + core->vm_regs.rax=-1; + } else { + if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_MERGE) { + PrintError(core->vm_info,core,"hvm: merge/unmerge done when in non-idle state\n"); + core->vm_regs.rax=-1; + } else { + PrintDebug(core->vm_info,core, "hvm: merge or unmerge complete - back to idle\n"); + h->trans_state=HRT_IDLE; + core->vm_regs.rax=0; + } + } + + break; + + default: + PrintError(core->vm_info,core,"hvm: unknown hypercall %llx\n",a1); + core->vm_regs.rax=-1; + break; + } + return 0; } @@ -153,7 +373,7 @@ int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config) } vm->hvm_state.first_hrt_gpa = 
((uint64_t)atoi(ros_mem))*1024*1024; - + if (!(hrt_config=v3_cfg_subtree(hvm_config,"hrt"))) { PrintError(vm,VCORE_NONE,"hvm: HVM configuration without HRT block...\n"); return -1; @@ -206,6 +426,15 @@ int v3_deinit_hvm_vm(struct v3_vm_info *vm) v3_remove_hypercall(vm,HVM_HCALL); + if (vm->hvm_state.comm_page_hpa) { + struct v3_mem_region *r = v3_get_mem_region(vm,-1,(addr_t)vm->hvm_state.comm_page_hpa); + if (!r) { + PrintError(vm,VCORE_NONE,"hvm: odd, VM has comm_page_hpa, but no shadow memory\n"); + } else { + v3_delete_mem_region(vm,r); + } + } + return 0; } @@ -331,15 +560,15 @@ void v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_in #define MAX(x,y) ((x)>(y)?(x):(y)) #define MIN(x,y) ((x)<(y)?(x):(y)) -#ifdef HVM_MAP_1G_2M -#define BOOT_STATE_END_ADDR (MIN(vm->mem_size,0x40000000ULL)) -#else -#define BOOT_STATE_END_ADDR (MIN(vm->mem_size,0x800000000ULL)) -#endif +static uint64_t boot_state_end_addr(struct v3_vm_info *vm) +{ + return PAGE_ADDR(vm->mem_size); +} + static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) { - *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR - PAGE_SIZE); + *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - PAGE_SIZE); *limit = PAGE_SIZE; } @@ -391,7 +620,7 @@ static void write_null_int_handler(struct v3_vm_info *vm) static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) { - *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR - 2 * PAGE_SIZE); + *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - 2 * PAGE_SIZE); *limit = 16*256; } @@ -403,7 +632,7 @@ static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) // 3 ist => (stack) = 0 => current stack // 5 reserved => 0 // 4 type => 0xe=>INT, 0xf=>TRAP -// 1 reserved => 0 +// 1 reserved => 0 (indicates "system" by being zero) // 2 dpl => 0 // 1 present => 1 // 16 offsetmid => 0 @@ -414,7 +643,7 @@ static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) // // Note little endian 
// -static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 } ; +static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 } ; static uint64_t idt64_int_gate_entry_mask[2] = { 0x00008e0000080000, 0x0 }; static void write_idt(struct v3_vm_info *vm) @@ -431,6 +660,8 @@ static void write_idt(struct v3_vm_info *vm) get_null_int_handler_loc(vm,&handler,&handler_len); + handler += vm->hvm_state.gva_offset; + memcpy(trap_gate,idt64_trap_gate_entry_mask,16); memcpy(int_gate,idt64_int_gate_entry_mask,16); @@ -469,7 +700,7 @@ static void write_idt(struct v3_vm_info *vm) static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) { - *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR - 3 * PAGE_SIZE); + *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 3 * PAGE_SIZE); *limit = 8*3; } @@ -494,7 +725,7 @@ static void write_gdt(struct v3_vm_info *vm) static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) { - *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR - 4 * PAGE_SIZE); + *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 4 * PAGE_SIZE); *limit = PAGE_SIZE; } @@ -510,159 +741,307 @@ static void write_tss(struct v3_vm_info *vm) PrintDebug(vm,VCORE_NONE,"hvm: wrote TSS at %p\n",base); } + +#define TOP_HALF_START 0xffff800000000000ULL +#define BOTTOM_HALF_END 0x00007fffffffffffULL + + +#define L4_UNIT PAGE_SIZE +#define L3_UNIT (512ULL * L4_UNIT) +#define L2_UNIT (512ULL * L3_UNIT) +#define L1_UNIT (512ULL * L2_UNIT) + +static void compute_pts_4KB(struct v3_vm_info *vm, + uint64_t *l1, uint64_t *l2, uint64_t *l3, uint64_t *l4) +{ + + // we map the physical memory up to max_mem_mapped either at 0x0 or at TOP_HALF start + // that is, it either fills in the first 256 rows of PML4 or the last 256 rows of PML4 + // so it is the same number of page tables regardless + + uint64_t max_gva = vm->hvm_state.max_mem_mapped; + + *l1 = 1; // 1 PML4 + *l2 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*512ULL*4096ULL),512); + *l3 = 
CEIL_DIV(CEIL_DIV(max_gva,512ULL*4096ULL),512); + *l4 = CEIL_DIV(CEIL_DIV(max_gva,4096ULL),512); +} + + + /* - PTS MAP FIRST 512 GB identity mapped: - 1 second level - 512 entries + PTS MAP using 1 GB pages + n second levels pts, highest gva, highest address 1 top level - 1 entries + OR - PTS MAP FIRST 1 GB identity mapped: - 1 third level - 512 entries - 1 second level - 1 entries - 1 top level - 1 entries + PTS MAP using 2 MB pages + n third level pts, highest gva, highest address + m second level pts, highest gva, highest address + 1 top level pt + +OR + + PTS MAP using 4 KB pages + n 4th level, highest gva, highest address + m 3rd level, highest gva, hihgest address + l second level, highest gva, highest address + 1 top level pt + +OR + PTS MAP using 512 GB pages when this becomes available + */ + static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) { -#ifdef HVM_MAP_1G_2M - *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR-(5+2)*PAGE_SIZE); - *limit = 3*PAGE_SIZE; -#else - *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR-(5+1)*PAGE_SIZE); - *limit = 2*PAGE_SIZE; -#endif + uint64_t l1,l2,l3,l4; + uint64_t num_pt; + + compute_pts_4KB(vm,&l1,&l2,&l3,&l4); + + if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { + num_pt = l1; + } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { + num_pt = l1 + l2; + } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) { + num_pt = l1 + l2 + l3; + } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { + num_pt = l1 + l2 + l3 + l4; + } else { + PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT location flags=0x%llx memsize=0x%llx\n",vm->hvm_state.hrt_flags,(uint64_t)vm->mem_size); + return; + } + + *base = (void*)PAGE_ADDR(boot_state_end_addr(vm)-(4+num_pt)*PAGE_SIZE); + *limit = num_pt*PAGE_SIZE; } -#ifndef HVM_MAP_1G_2M -static void write_pt_2level_512GB(struct v3_vm_info *vm) +static void write_pts(struct v3_vm_info *vm) { - void *base; uint64_t size; - 
struct pml4e64 pml4e; - struct pdpe64 pdpe; - uint64_t i; - - get_pt_loc(vm,&base, &size); - if (size!=2*PAGE_SIZE) { - PrintError(vm,VCORE_NONE,"Cannot support pt request, defaulting\n"); + uint64_t num_l1, num_l2, num_l3, num_l4; + void *start_l1, *start_l2, *start_l3, *start_l4; + uint64_t max_level; + void *cur_pt; + void *cur_gva; + void *cur_gpa; + void *min_gpa = 0; + void *max_gpa = (void*) vm->hvm_state.max_mem_mapped; + void *min_gva = (void*) vm->hvm_state.gva_offset; +#ifdef V3_CONFIG_DEBUG_HVM + void *max_gva = min_gva+vm->hvm_state.max_mem_mapped; +#endif + uint64_t i, pt; + uint64_t i_start,i_end; + + struct pml4e64 *pml4e; + struct pdpe64 *pdpe; + struct pde64 *pde; + struct pte64 *pte; + + if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { + PrintError(vm,VCORE_NONE,"hvm: Attempt to build 512 GB pages\n"); + max_level = 1; + } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { + max_level = 2; + } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) { + max_level = 3; + } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { + max_level = 4; + } else { + PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT levels\n"); + return; } - if (vm->mem_size > 0x800000000ULL) { - PrintError(vm,VCORE_NONE, "VM has more than 512 GB\n"); + get_pt_loc(vm,&start_l1,&size); + compute_pts_4KB(vm,&num_l1,&num_l2,&num_l3,&num_l4); + + start_l2=start_l1+PAGE_SIZE*num_l1; + start_l3=start_l2+PAGE_SIZE*num_l2; + start_l4=start_l3+PAGE_SIZE*num_l3; + + PrintDebug(vm,VCORE_NONE,"hvm: writing %llu levels of PTs start at address %p\n", max_level,start_l1); + PrintDebug(vm,VCORE_NONE,"hvm: min_gva=%p, max_gva=%p, min_gpa=%p, max_gpa=%p\n",min_gva,max_gva,min_gpa,max_gpa); + PrintDebug(vm,VCORE_NONE,"hvm: num_l1=%llu, num_l2=%llu, num_l3=%llu, num_l4=%llu\n", num_l1, num_l2, num_l3, num_l4); + PrintDebug(vm,VCORE_NONE,"hvm: start_l1=%p, start_l2=%p, start_l3=%p, start_l4=%p\n", start_l1, start_l2, start_l3, start_l4); + + 
cur_pt=start_l1; + + // build PML4 (only one) + if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pml4e)) { + PrintError(vm,VCORE_NONE,"hvm: Cannot translate pml4 location\n"); + return; } - memset(&pdpe,0,sizeof(pdpe)); - pdpe.present=1; - pdpe.writable=1; - pdpe.large_page=1; - - for (i=0;i<512;i++) { - pdpe.pd_base_addr = i*0x40000; // 0x4000 = 256K pages = 1 GB - v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+PAGE_SIZE+i*sizeof(pdpe)),sizeof(pdpe),(uint8_t*)&pdpe); + memset(pml4e,0,PAGE_SIZE); + + if (min_gva==0x0) { + i_start=0; i_end = num_l2; + } else if (min_gva==(void*)TOP_HALF_START) { + i_start=256; i_end=256+num_l2; + } else { + PrintError(vm,VCORE_NONE,"hvm: unsupported gva offset\n"); + return; } - memset(&pml4e,0,sizeof(pml4e)); - pml4e.present=1; - pml4e.writable=1; - pml4e.pdp_base_addr = PAGE_BASE_ADDR((addr_t)(base+PAGE_SIZE)); + for (i=i_start, cur_gva=min_gva, cur_gpa=min_gpa; + (icores[0],(addr_t)base,sizeof(pml4e),(uint8_t*)&pml4e); + pml4e[i].present=1; + pml4e[i].writable=1; + + if (max_level==1) { + PrintError(vm,VCORE_NONE,"hvm: Intel has not yet defined a PML4E large page\n"); + pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa)); + //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr); + } else { + pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(start_l2+(i-i_start)*PAGE_SIZE)); + //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr); + } + } - for (i=1;i<512;i++) { - pml4e.present=0; - v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*sizeof(pml4e)),sizeof(pml4e),(uint8_t*)&pml4e); + // 512 GB only + if (max_level==1) { + return; } - PrintDebug(vm,VCORE_NONE,"hvm: Wrote page tables (1 PML4, 1 PDPE) at %p (512 GB mapped)\n",base); -} -#else -static void write_pt_3level_1GB(struct v3_vm_info *vm) -{ - void *base; - uint64_t size; - struct pml4e64 pml4e; - struct pdpe64 pdpe; - struct pde64 pde; + for 
(cur_pt=start_l2, pt=0, cur_gpa=min_gpa, cur_gva=min_gva; + ptcores[0],(addr_t)cur_pt,(addr_t*)&pdpe)) { + PrintError(vm,VCORE_NONE,"hvm: Cannot translate pdpe location\n"); + return; + } + + memset(pdpe,0,PAGE_SIZE); + + for (i=0; + i<512 && cur_gpamem_size > 0x40000000ULL) { - PrintError(vm,VCORE_NONE, "VM has more than 1 GB\n"); + + //1 GB only + if (max_level==2) { + return; } - memset(&pde,0,sizeof(pde)); - pde.present=1; - pde.writable=1; - pde.large_page=1; - - for (i=0;i<512;i++) { - pde.pt_base_addr = i*0x200; // 0x200 = 512 pages = 2 MB - v3_write_gpa_memory(&vm->cores[0], - (addr_t)(base+2*PAGE_SIZE+i*sizeof(pde)), - sizeof(pde),(uint8_t*)&pde); - } + for (cur_pt=start_l3, pt=0, cur_gpa=min_gpa, cur_gva=min_gva; + ptcores[0],(addr_t)cur_pt,(addr_t*)&pde)) { + PrintError(vm,VCORE_NONE,"hvm: Cannot translate pde location\n"); + return; + } + + memset(pde,0,PAGE_SIZE); + + for (i=0; + i<512 && cur_gpacores[0],(addr_t)base+PAGE_SIZE,sizeof(pdpe),(uint8_t*)&pdpe); - - for (i=1;i<512;i++) { - pdpe.present = 0; - v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+PAGE_SIZE+i*sizeof(pdpe)),sizeof(pdpe),(uint8_t*)&pdpe); + //2 MB only + if (max_level==3) { + return; } - memset(&pml4e,0,sizeof(pml4e)); - pml4e.present=1; - pml4e.writable=1; - pml4e.pdp_base_addr = PAGE_BASE_ADDR((addr_t)(base+PAGE_SIZE)); - v3_write_gpa_memory(&vm->cores[0],(addr_t)base,sizeof(pml4e),(uint8_t*)&pml4e); + // 4 KB + for (cur_pt=start_l4, pt=0, cur_gpa=min_gpa, cur_gva=min_gva; + ptcores[0],(addr_t)(base+i*sizeof(pml4e)),sizeof(pml4e),(uint8_t*)&pml4e); + // build PTE + if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pte)) { + PrintError(vm,VCORE_NONE,"hvm: Cannot translate pte location\n"); + return; + } + + memset(pte,0,PAGE_SIZE); + + for (i=0; + i<512 && cur_gpavm_info; + + hrt->tag.type = MB_INFO_HRT_TAG; + hrt->tag.size = sizeof(mb_info_hrt_t); + + hrt->total_num_apics = vm->num_cores; + hrt->first_hrt_apic_id = vm->hvm_state.first_hrt_core; + hrt->have_hrt_ioapic=0; + 
hrt->first_hrt_ioapic_entry=0; + + hrt->cpu_freq_khz = V3_CPU_KHZ(); + + hrt->hrt_flags = vm->hvm_state.hrt_flags; + hrt->max_mem_mapped = vm->hvm_state.max_mem_mapped; + hrt->first_hrt_gpa = vm->hvm_state.first_hrt_gpa; + hrt->gva_offset = vm->hvm_state.gva_offset; + hrt->comm_page_gpa = vm->hvm_state.comm_page_gpa; + hrt->hrt_int_vector = vm->hvm_state.hrt_int_vector; + + return 0; } static void write_mb_info(struct v3_vm_info *vm) @@ -753,77 +1132,151 @@ static mb_header_t *find_mb_header(uint8_t *data, uint64_t size) } -// -// BROKEN - THIS DOES NOT DO WHAT YOU THINK -// -static int setup_elf(struct v3_vm_info *vm, void *base, uint64_t limit) +static int configure_hrt(struct v3_vm_info *vm, mb_data_t *mb) { - v3_write_gpa_memory(&vm->cores[0],(addr_t)base,vm->hvm_state.hrt_file->size,vm->hvm_state.hrt_file->data); - - vm->hvm_state.hrt_entry_addr = (uint64_t) (base+0x40); - - PrintDebug(vm,VCORE_NONE,"hvm: wrote HRT ELF %s at %p\n", vm->hvm_state.hrt_file->tag,base); - PrintDebug(vm,VCORE_NONE,"hvm: set ELF entry to %p and hoping for the best...\n", (void*) vm->hvm_state.hrt_entry_addr); + struct v3_vm_hvm *h = &vm->hvm_state; + uint64_t f = mb->mb64_hrt->hrt_flags; + uint64_t maxmap = mb->mb64_hrt->max_mem_to_map; + uint64_t gvaoff = mb->mb64_hrt->gva_offset; + uint64_t gvaentry = mb->mb64_hrt->gva_entry; + uint64_t commgpa = mb->mb64_hrt->comm_page_gpa; + uint8_t vec = mb->mb64_hrt->hrt_int_vector; - vm->hvm_state.hrt_type = HRT_ELF64; - return 0; + PrintDebug(vm,VCORE_NONE,"hvm: HRT request: flags=0x%llx max_map=0x%llx gva_off=%llx gva_entry=%llx comm_page=0x%llx vector=0x%x\n", + f, maxmap, gvaoff,gvaentry,commgpa, vec); -} + if (maxmap<0x100000000ULL) { + PrintDebug(vm,VCORE_NONE,"hvm: revising request up to 4 GB max map\n"); + maxmap=0x100000000ULL; + } -static int setup_mb_kernel(struct v3_vm_info *vm, void *base, uint64_t limit) -{ - mb_data_t mb; + if (f & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { + PrintError(vm,VCORE_NONE,"hvm: support for 512 GB pages is 
not yet available in hardware\n"); + return -1; + } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { + f &= ~0x3c; + f |= MB_TAG_MB64_HRT_FLAG_MAP_1GB; + h->max_mem_mapped = maxmap; + PrintDebug(vm,VCORE_NONE,"hvm: 1 GB pages selected\n"); + } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_2MB) { + f &= ~0x3c; + f |= MB_TAG_MB64_HRT_FLAG_MAP_2MB; + h->max_mem_mapped = maxmap; + PrintDebug(vm,VCORE_NONE,"hvm: 2 MB pages selected\n"); + } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { + f &= ~0x3c; + f |= MB_TAG_MB64_HRT_FLAG_MAP_4KB; + h->max_mem_mapped = maxmap; + PrintDebug(vm,VCORE_NONE,"hvm: 4 KB pages selected\n"); + } else { + PrintError(vm,VCORE_NONE,"hvm: no page table model is requested\n"); + return -1; + } - if (v3_parse_multiboot_header(vm->hvm_state.hrt_file,&mb)) { - PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n"); + if (f & MB_TAG_MB64_HRT_FLAG_RELOC) { + PrintError(vm,VCORE_NONE,"hvm: relocatable hrt not currently supported\n"); return -1; } + h->hrt_flags = f; - if (v3_write_multiboot_kernel(vm,&mb,vm->hvm_state.hrt_file,base,limit)) { - PrintError(vm,VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n"); + if (maxmap>h->max_mem_mapped) { + PrintError(vm,VCORE_NONE,"hvm: requested 0x%llx bytes mapped, which is more than currently supported\n",maxmap); return -1; } - /* - if (!mb.addr || !mb.entry) { - PrintError(vm,VCORE_NONE, "hvm: kernel is missing address or entry point\n"); + if (gvaoff!=0 && gvaoff!=TOP_HALF_START) { + PrintError(vm,VCORE_NONE,"hvm: currently only GVA offsets of 0 and %llx are supported\n", TOP_HALF_START); return -1; } + + h->gva_offset = gvaoff; - if (((void*)(uint64_t)(mb.addr->header_addr) < base ) || - ((void*)(uint64_t)(mb.addr->load_end_addr) > base+limit) || - ((void*)(uint64_t)(mb.addr->bss_end_addr) > base+limit)) { - PrintError(vm,VCORE_NONE, "hvm: kernel is not within the allowed portion of HVM\n"); + h->gva_entry = gvaentry; + + if (mb->addr->load_addr < h->first_hrt_gpa) { + 
PrintError(vm,VCORE_NONE,"hvm: load start address of HRT is below first HRT GPA\n"); return -1; } + + if (mb->addr->bss_end_addr > (vm->mem_size-(1024*1024*64))) { + PrintError(vm,VCORE_NONE,"hvm: bss end address of HRT above last allowed GPA\n"); + return -1; + } + + if (vec<32) { + PrintError(vm,VCORE_NONE,"hvm: cannot support vector %x\n",vec); + return -1; + } + + h->hrt_int_vector = vec; + + + if (commgpa < vm->mem_size) { + PrintError(vm,VCORE_NONE,"hvm: cannot map comm page over physical memory\n"); + return -1; + } - offset = mb.addr->load_addr - mb.addr->header_addr; + h->comm_page_gpa = commgpa; - // Skip the ELF header - assume 1 page... weird.... - // FIX ME TO CONFORM TO MULTIBOOT.C - v3_write_gpa_memory(&vm->cores[0], - (addr_t)(mb.addr->load_addr), - vm->hvm_state.hrt_file->size-PAGE_SIZE-offset, - vm->hvm_state.hrt_file->data+PAGE_SIZE+offset); + if (!h->comm_page_hpa) { + if (!(h->comm_page_hpa=V3_AllocPages(1))) { + PrintError(vm,VCORE_NONE,"hvm: unable to allocate space for comm page\n"); + return -1; + } + h->comm_page_hva = V3_VAddr(h->comm_page_hpa); - // vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + PAGE_SIZE; //HACK PAD + memset(h->comm_page_hva,0,PAGE_SIZE_4KB); + + if (v3_add_shadow_mem(vm,-1,h->comm_page_gpa,h->comm_page_gpa+PAGE_SIZE_4KB,(addr_t)h->comm_page_hpa)) { + PrintError(vm,VCORE_NONE,"hvm: unable to map communication page\n"); + V3_FreePages((void*)(h->comm_page_gpa),1); + return -1; + } + + + PrintDebug(vm,VCORE_NONE,"hvm: added comm page for first time\n"); + } + memset(h->comm_page_hva,0,PAGE_SIZE_4KB); + + + PrintDebug(vm,VCORE_NONE,"hvm: HRT configuration: flags=0x%llx max_mem_mapped=0x%llx gva_offset=0x%llx gva_entry=0x%llx comm_page=0x%llx vector=0x%x\n", + h->hrt_flags,h->max_mem_mapped, h->gva_offset,h->gva_entry, h->comm_page_gpa, h->hrt_int_vector); + + return 0; - PrintDebug(vm,VCORE_NONE, - "hvm: wrote 0x%llx bytes starting at offset 0x%llx to %p; set entry to %p\n", - (uint64_t) 
vm->hvm_state.hrt_file->size-PAGE_SIZE-offset, - (uint64_t) PAGE_SIZE+offset, - (void*)(addr_t)(mb.addr->load_addr), - (void*) vm->hvm_state.hrt_entry_addr); +} +static int setup_mb_kernel_hrt(struct v3_vm_info *vm) +{ + mb_data_t mb; - */ + if (v3_parse_multiboot_header(vm->hvm_state.hrt_file,&mb)) { + PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n"); + return -1; + } - vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr; + if (configure_hrt(vm,&mb)) { + PrintError(vm,VCORE_NONE, "hvm: cannot configure HRT\n"); + return -1; + } + if (v3_write_multiboot_kernel(vm,&mb,vm->hvm_state.hrt_file, + (void*)vm->hvm_state.first_hrt_gpa, + vm->mem_size-vm->hvm_state.first_hrt_gpa)) { + PrintError(vm,VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n"); + return -1; + } + + if (vm->hvm_state.gva_entry) { + vm->hvm_state.hrt_entry_addr = vm->hvm_state.gva_entry; + } else { + vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + vm->hvm_state.gva_offset; + } + vm->hvm_state.hrt_type = HRT_MBOOT64; return 0; @@ -833,37 +1286,17 @@ static int setup_mb_kernel(struct v3_vm_info *vm, void *base, uint64_t limit) static int setup_hrt(struct v3_vm_info *vm) { - void *base; - uint64_t limit; - - get_hrt_loc(vm,&base,&limit); - - if (vm->hvm_state.hrt_file->size > limit) { - PrintError(vm,VCORE_NONE,"hvm: Cannot map HRT because it is too big (%llu bytes, but only have %llu space\n", vm->hvm_state.hrt_file->size, (uint64_t)limit); - return -1; - } + if (is_elf(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size) && + find_mb_header(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) { - if (!is_elf(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) { - PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not an ELF but we are going to act like it is!\n"); - if (setup_elf(vm,base,limit)) { - PrintError(vm,VCORE_NONE,"hvm: Fake ELF setup failed\n"); + PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot 
kernel\n"); + if (setup_mb_kernel_hrt(vm)) { + PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n"); return -1; - } - vm->hvm_state.hrt_type=HRT_BLOB; + } } else { - if (find_mb_header(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) { - PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n"); - if (setup_mb_kernel(vm,base,limit)) { - PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n"); - return -1; - } - } else { - PrintDebug(vm,VCORE_NONE,"hvm: supplied HRT is an ELF\n"); - if (setup_elf(vm,base,limit)) { - PrintError(vm,VCORE_NONE,"hvm: Fake ELF setup failed\n"); - return -1; - } - } + PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not a multiboot kernel\n"); + return -1; } return 0; @@ -887,8 +1320,9 @@ static int setup_hrt(struct v3_vm_info *vm) GDT (1 page - page aligned) TSS (1 page - page asligned) PAGETABLES (identy map of first N GB) - ROOT PT first, followed by 2nd level, etc. - Currently PML4 followed by 1 PDPE for 512 GB of mapping + ROOT PT first (lowest memory addr), followed by 2nd level PTs in order, + followed by 3rd level PTs in order, followed by 4th level + PTs in order. 
MBINFO_PAGE SCRATCH_STACK_HRT_CORE0 SCRATCH_STACK_HRT_CORE1 @@ -898,7 +1332,8 @@ static int setup_hrt(struct v3_vm_info *vm) HRT (as many pages as needed, page-aligned, starting at first HRT address) --- ROS - + + */ @@ -911,20 +1346,22 @@ int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm) PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory begins\n"); + if (setup_hrt(vm)) { + PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n"); + return -1; + } + + // the locations of all the other items are determined by + // the HRT setup, so these must happen after + write_null_int_handler(vm); write_idt(vm); write_gdt(vm); write_tss(vm); - write_pt(vm); + write_pts(vm); - - if (setup_hrt(vm)) { - PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n"); - return -1; - } - - // need to parse HRT first + // this must happen last write_mb_info(vm); PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory done\n"); @@ -940,7 +1377,7 @@ int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm) TS points to stub TSS CR3 points to root page table CR0 has PE and PG - EFER has LME AND LMA + EFER has LME AND LMA (and NX for compatibility with Linux) RSP is TOS of core's scratch stack (looks like a call) RAX = MB magic cookie @@ -948,6 +1385,8 @@ int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm) RCX = this core id / apic id (0..N-1) RDX = this core id - first HRT core ID (==0 for the first HRT core) + All addresses are virtual addresses, offset as needed by gva_offset + Other regs are zeroed shadow/nested paging state reset for long mode @@ -957,18 +1396,20 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) { void *base; uint64_t limit; + uint64_t gva_offset; rdtscll(core->hvm_state.last_boot_start); + if (!core->hvm_state.is_hrt) { PrintDebug(core->vm_info,core,"hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id); return 0; } - PrintDebug(core->vm_info, core, "hvm: setting up HRT core (%u) for boot\n", core->vcpu_id); - + PrintDebug(core->vm_info, core, 
"hvm: setting up HRT core (%u) for boot\n", core->vcpu_id); + gva_offset = core->vm_info->hvm_state.gva_offset; memset(&core->vm_regs,0,sizeof(core->vm_regs)); memset(&core->ctrl_regs,0,sizeof(core->ctrl_regs)); @@ -990,7 +1431,7 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) // multiboot info pointer get_mb_info_loc(core->vm_info, &base,&limit); - core->vm_regs.rbx = (uint64_t) base; + core->vm_regs.rbx = (uint64_t) base + gva_offset; // core number core->vm_regs.rcx = core->vcpu_id; @@ -1001,6 +1442,7 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) // Now point to scratch stack for this core // it begins at an ofset relative to the MB info page get_mb_info_loc(core->vm_info, &base,&limit); + base = base + gva_offset; base -= core->vm_regs.rdx * SCRATCH_STACK_SIZE; core->vm_regs.rsp = (v3_reg_t) base; core->vm_regs.rbp = (v3_reg_t) base-8; @@ -1008,14 +1450,19 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) // push onto the stack a bad rbp and bad return address core->vm_regs.rsp-=16; v3_set_gpa_memory(core, - core->vm_regs.rsp, + core->vm_regs.rsp-gva_offset, 16, 0xff); // HRT entry point get_hrt_loc(core->vm_info, &base,&limit); - core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr ; + if (core->vm_info->hvm_state.gva_entry) { + core->rip = core->vm_info->hvm_state.gva_entry; + } else { + core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr + gva_offset; + } + PrintDebug(core->vm_info,core,"hvm: hrt core %u has rip=%p, rsp=%p, rbp=%p, rax=%p, rbx=%p, rcx=%p, rdx=%p\n", @@ -1036,7 +1483,7 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) // CR2: don't care (output from #PF) // CE3: set to our PML4E, without setting PCD or PWT get_pt_loc(core->vm_info, &base,&limit); - core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base); + core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base); // not offset as this is a GPA core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3; // CR4: PGE, PAE, PSE (last byte: 1 0 1 
1 0 0 0 0) @@ -1044,8 +1491,8 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4; // CR8 as usual // RFLAGS zeroed is fine: come in with interrupts off - // EFER needs SVME LMA LME (last 16 bits: 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 - core->ctrl_regs.efer = 0x1500; + // EFER needs SVME LMA LME (last 16 bits: 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 + core->ctrl_regs.efer = 0x1d00; core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer; @@ -1065,47 +1512,50 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) // Install our stub IDT get_idt_loc(core->vm_info, &base,&limit); + base += gva_offset; core->segments.idtr.selector = 0; // entry 0 (NULL) of the GDT - core->segments.idtr.base = (addr_t) base; + core->segments.idtr.base = (addr_t) base; // only base+limit are used core->segments.idtr.limit = limit-1; - core->segments.idtr.type = 0xe; - core->segments.idtr.system = 1; + core->segments.idtr.type = 0x0; + core->segments.idtr.system = 0; core->segments.idtr.dpl = 0; - core->segments.idtr.present = 1; - core->segments.idtr.long_mode = 1; + core->segments.idtr.present = 0; + core->segments.idtr.long_mode = 0; // Install our stub GDT get_gdt_loc(core->vm_info, &base,&limit); - core->segments.gdtr.selector = 0; + base += gva_offset; + core->segments.gdtr.selector = 0; // entry 0 (NULL) of the GDT core->segments.gdtr.base = (addr_t) base; - core->segments.gdtr.limit = limit-1; - core->segments.gdtr.type = 0x6; - core->segments.gdtr.system = 1; + core->segments.gdtr.limit = limit-1; // only base+limit are used + core->segments.gdtr.type = 0x0; + core->segments.gdtr.system = 0; core->segments.gdtr.dpl = 0; - core->segments.gdtr.present = 1; - core->segments.gdtr.long_mode = 1; + core->segments.gdtr.present = 0; + core->segments.gdtr.long_mode = 0; // And our TSS get_tss_loc(core->vm_info, &base,&limit); + base += gva_offset; core->segments.tr.selector = 0; core->segments.tr.base = (addr_t) base; 
core->segments.tr.limit = limit-1; - core->segments.tr.type = 0x6; - core->segments.tr.system = 1; + core->segments.tr.type = 0x9; + core->segments.tr.system = 0; // available 64 bit TSS core->segments.tr.dpl = 0; core->segments.tr.present = 1; - core->segments.tr.long_mode = 1; + core->segments.tr.long_mode = 0; // not used - base = 0x0; + base = 0x0; // these are not offset as we want to make all gvas visible limit = -1; // And CS core->segments.cs.selector = 0x8 ; // entry 1 of GDT (RPL=0) - core->segments.cs.base = (addr_t) base; - core->segments.cs.limit = limit; - core->segments.cs.type = 0xe; - core->segments.cs.system = 0; - core->segments.cs.dpl = 0; + core->segments.cs.base = (addr_t) base; // not used + core->segments.cs.limit = limit; // not used + core->segments.cs.type = 0xe; // only C is used + core->segments.cs.system = 1; // not a system segment + core->segments.cs.dpl = 0; core->segments.cs.present = 1; core->segments.cs.long_mode = 1; @@ -1113,8 +1563,8 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) core->segments.ds.selector = 0x10; // entry 2 of GDT (RPL=0) core->segments.ds.base = (addr_t) base; core->segments.ds.limit = limit; - core->segments.ds.type = 0x6; - core->segments.ds.system = 0; + core->segments.ds.type = 0x6; // ignored + core->segments.ds.system = 1; // not a system segment core->segments.ds.dpl = 0; core->segments.ds.present = 1; core->segments.ds.long_mode = 1; @@ -1184,6 +1634,7 @@ int v3_handle_hvm_reset(struct guest_info *core) if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) { // leader core->vm_info->run_state = VM_RUNNING; + core->vm_info->hvm_state.trans_state = HRT_IDLE; } v3_counting_barrier(&core->vm_info->reset_barrier); diff --git a/palacios/src/palacios/vmm_hvm_lowlevel.S b/palacios/src/palacios/vmm_hvm_lowlevel.S index b6d1a76..ffd63f2 100644 --- a/palacios/src/palacios/vmm_hvm_lowlevel.S +++ b/palacios/src/palacios/vmm_hvm_lowlevel.S @@ -1,7 +1,7 @@ /* HVM environment code and data */ 
/* This must match the number in vmm_hypercall.h */ -#define HVM_HCALL $0xf000 +#define HVM_HCALL $0xf00d /* The default installed interrupt handlers simply hcall @@ -45,4 +45,4 @@ v3_hvm_vmx_null_int_handler_end: - \ No newline at end of file + diff --git a/palacios/src/palacios/vmm_multiboot.c b/palacios/src/palacios/vmm_multiboot.c index c007291..a9a20b3 100644 --- a/palacios/src/palacios/vmm_multiboot.c +++ b/palacios/src/palacios/vmm_multiboot.c @@ -144,94 +144,6 @@ int v3_deinit_multiboot_core(struct guest_info *core) #define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"multiboot: " fmt,##args) - -/****************************************************************** - Data contained in the ELF file we will attempt to boot -******************************************************************/ - -#define ELF_MAGIC 0x464c457f -#define MB2_MAGIC 0xe85250d6 - - -/****************************************************************** - Data we will pass to the kernel via rbx -******************************************************************/ - -#define MB2_INFO_MAGIC 0x36d76289 - -typedef struct mb_info_header { - uint32_t totalsize; - uint32_t reserved; -} __attribute__((packed)) mb_info_header_t; - -// A tag of type 0, size 8 indicates last value -// -typedef struct mb_info_tag { - uint32_t type; - uint32_t size; -} __attribute__((packed)) mb_info_tag_t; - - -#define MB_INFO_MEM_TAG 4 -typedef struct mb_info_mem { - mb_info_tag_t tag; - uint32_t mem_lower; // 0..640K in KB - uint32_t mem_upper; // in KB to first hole - 1 MB -} __attribute__((packed)) mb_info_mem_t; - -#define MB_INFO_CMDLINE_TAG 1 -// note alignment of 8 bytes required for each... 
-typedef struct mb_info_cmdline { - mb_info_tag_t tag; - uint32_t size; // includes zero termination - uint8_t string[]; // zero terminated -} __attribute__((packed)) mb_info_cmdline_t; - - -#define MEM_RAM 1 -#define MEM_ACPI 3 -#define MEM_RESV 4 - -typedef struct mb_info_memmap_entry { - uint64_t base_addr; - uint64_t length; - uint32_t type; - uint32_t reserved; -} __attribute__((packed)) mb_info_memmap_entry_t; - -#define MB_INFO_MEMMAP_TAG 6 -// note alignment of 8 bytes required for each... -typedef struct mb_info_memmap { - mb_info_tag_t tag; - uint32_t entry_size; // multiple of 8 - uint32_t entry_version; // 0 - mb_info_memmap_entry_t entries[]; -} __attribute__((packed)) mb_info_memmap_t; - -#define MB_INFO_HRT_TAG 0xf00df00d -typedef struct mb_info_hrt { - mb_info_tag_t tag; - // apic ids are 0..num_apics-1 - // apic and ioapic addresses are the well known places - uint32_t total_num_apics; - uint32_t first_hrt_apic_id; - uint32_t have_hrt_ioapic; - uint32_t first_hrt_ioapic_entry; - uint64_t first_hrt_addr; -} __attribute__((packed)) mb_info_hrt_t; - - -// We are not doing: -// -// - BIOS Boot Devie -// - Modules -// - ELF symbols -// - Boot Loader name -// - APM table -// - VBE info -// - Framebuffer info -// - static int is_elf(uint8_t *data, uint64_t size) { if (*((uint32_t*)data)==ELF_MAGIC) { @@ -382,7 +294,7 @@ static int parse_multiboot_kernel(uint8_t *data, uint64_t size, mb_data_t *mb) INFO(" size = 0x%x\n", mb_modalign->size); } break; -#if 0 +#if 1 case MB_TAG_MB64_HRT: { if (mb_mb64_hrt) { ERROR("Multiple mb64_hrt tags found!\n"); @@ -539,13 +451,7 @@ uint64_t v3_build_multiboot_table(struct guest_info *core, uint8_t *dest, uint64 #ifdef V3_CONFIG_HVM if (core->vm_info->hvm_state.is_hvm && v3_is_hvm_hrt_core(core)) { - hrt->tag.type = MB_INFO_HRT_TAG; - hrt->tag.size = sizeof(mb_info_hrt_t); - hrt->total_num_apics = vm->num_cores; - hrt->first_hrt_apic_id = vm->hvm_state.first_hrt_core; - hrt->have_hrt_ioapic=0; - 
hrt->first_hrt_ioapic_entry=0; - hrt->first_hrt_addr = vm->hvm_state.first_hrt_gpa; + v3_build_hrt_multiboot_tag(core,hrt); } #endif