Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git
This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, simply execute
  cd palacios
  git checkout --track -b devel origin/devel
The other branches are similar.


HVM Enhancements + Bug Fixes
[palacios.git] / palacios / src / palacios / vmm_hvm.c
index 24ae9fd..ec1c42c 100644 (file)
 #include <palacios/vmm_debug.h>
 
 
+/*
+ * In-memory layout of the GDT state area that hypercall 0x51 builds
+ * from the calling core's segment state and copies to the guest
+ * virtual address registered via hypercall 0x52 (hrt_gdt_gva).
+ *
+ * The structure is packed: the guest reads it byte-for-byte, so no
+ * padding may be inserted between fields.
+ */
+struct gdt_area {
+    /* GDTR image (limit + base), packed so that base follows limit directly */
+    struct {
+        uint16_t limit;
+        uint64_t base;
+    } __attribute__((packed)) gdtr;
+
+    uint64_t fsbase;   /* FS base requested by the caller (hypercall arg a2) */
+
+    /* segment selectors captured from the calling core */
+    uint16_t cs;
+    uint16_t ds;
+    uint16_t es;
+    uint16_t fs;
+    uint16_t gs;
+    uint16_t ss;
+
+    /* GDT entries follow the header; C99 flexible array member, so
+       sizeof(struct gdt_area) is the header size only */
+    uint64_t gdt[];
+} __attribute__((packed));
+
+
 /*
 
   MEM     = Total size of memory in the GPA (in MB)
@@ -56,7 +74,7 @@
   <mem ... >RAM</mem>   (MB)  Note these are  
   <cores count="CORES" ...>   backward compatible
 
-  <hvm enable="y">
+  <hvm enable="y" >
     <ros cores="ROS_CORES" mem="ROS_MEM" /> (MB)
     <hrt file_id="hrtelf" />
   </hvm>
 #endif
 
 
-// if set, we will map the first 1 GB of memory using a 3 level
-// hierarchy, for compatibility with Nautilus out of the box.
-// Otherwise we will map the first 512 GB using a 2 level
-// hieratchy
-#define HVM_MAP_1G_2M 1
-
 int v3_init_hvm()
 {
     PrintDebug(VM_NONE,VCORE_NONE, "hvm: init\n");
@@ -87,22 +99,620 @@ int v3_deinit_hvm()
     return 0;
 }
 
+// ignore requests from when we are in the wrong state
+#define ENFORCE_STATE_MACHINE 1
+
+// invoke the HRT using one of the following mechanisms
+#define UPCALL_MAGIC_ADDRESS 0x0000800df00df00dULL
+#define UPCALL_MAGIC_ERROR   0xf00df00d
+
+
+/*
+ * Deliver an upcall to virtual core number `num` of the VM that `core`
+ * belongs to, using whichever mechanism was selected at build time:
+ *
+ *   V3_CONFIG_HVM_UPCALL_MAGIC_GPF  - inject #GP with error code UPCALL_MAGIC_ERROR
+ *   V3_CONFIG_HVM_UPCALL_MAGIC_PF   - set CR2 to UPCALL_MAGIC_ADDRESS and inject
+ *                                     #PF with error code UPCALL_MAGIC_ERROR
+ *   V3_CONFIG_HVM_UPCALL_MAGIC_SWIN - raise a software interrupt on the HRT vector
+ *
+ * `core` is only used to reach vm_info and as the logging context.
+ *
+ * Returns 0 on success, -1 if the injection failed or if no mechanism
+ * is compiled in.
+ */
+static int magic_upcall(struct guest_info *core, uint64_t num)
+{
+#ifdef V3_CONFIG_HVM_UPCALL_MAGIC_GPF
+    PrintDebug(core->vm_info, core, "hvm: injecting magic #GP into core %llu\n",num);
+    if (v3_raise_exception_with_error(&core->vm_info->cores[num],
+                                     GPF_EXCEPTION, 
+                                     UPCALL_MAGIC_ERROR)) { 
+       PrintError(core->vm_info, core,"hvm: cannot inject HRT #GP to core %llu\n",num);
+       return -1;
+    } else {
+       return 0;
+    }
+#endif
 
+#ifdef V3_CONFIG_HVM_UPCALL_MAGIC_PF
+    PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",num);
+    // the faulting address identifies this #PF as an upcall
+    core->vm_info->cores[num].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
+    if (v3_raise_exception_with_error(&core->vm_info->cores[num],
+                                     PF_EXCEPTION, 
+                                     UPCALL_MAGIC_ERROR)) { 
+       PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",num);
+       return -1;
+    } else {
+       return 0;
+    }
+#endif
+#ifdef V3_CONFIG_HVM_UPCALL_MAGIC_SWIN
+    // NOTE(review): the vector is read from hvm_state to match the rest of
+    // this file; confirm hrt_int_vector lives there (the previous code
+    // referenced a non-existent hvm_info and an undefined `cur`).
+    PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%x into core %llu\n",core->vm_info->hvm_state.hrt_int_vector,num);
+    if (v3_raise_swintr(&core->vm_info->cores[num],core->vm_info->hvm_state.hrt_int_vector)) { 
+       PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",num);
+       return -1;
+    } else {
+       return 0;
+    }
+#endif
+
+    // reached only when none of the mechanisms above is configured
+    PrintError(core->vm_info,core,"hvm: no upcall mechanism is enabled!\n");
+    return -1;
+}
+
+
+/*
+  64 bit only hypercall:
+
+  rax = hypercall number
+  rbx = 0x646464...
+  then args are:  rcx, rdx, rsi, rdi, r8, r9, r10, r11
+  rcx = 1st arg
+*/
+// HVM hypercall dispatcher.
+//
+// The request number is taken from rcx (a1); rdx (a2) and rsi (a3) carry
+// the request's arguments.  rbx must hold the 64 bit marker
+// 0x6464646464646464 or the call is rejected.  The result is handed back
+// to the guest in rax: 0 on success and -1 on failure, except for
+// request 0xf (returns the current transaction state) and request 0x41
+// (returns the result of v3_hvm_signal_ros).
+//
+// The whole handler runs with hypercall_lock held and always returns 0
+// to its caller; failures are reported to the guest only through rax.
 static int hvm_hcall_handler(struct guest_info * core , hcall_id_t hcall_id, void * priv_data)
 {
     uint64_t c;
+    uint64_t bitness = core->vm_regs.rbx;
+    uint64_t a1 = core->vm_regs.rcx;
+    uint64_t a2 = core->vm_regs.rdx;
+    uint64_t a3 = core->vm_regs.rsi;
+    struct v3_vm_hvm *h = &core->vm_info->hvm_state;
+    addr_t irq_state;
+
+    // Let's be paranoid here
+    irq_state = v3_lock_irqsave(h->hypercall_lock);
+
+    if (bitness!=0x6464646464646464ULL) { 
+       PrintError(core->vm_info,core,"hvm: unable to handle non-64 bit hypercall\n");
+       core->vm_regs.rax = -1;
+       v3_unlock_irqrestore(h->hypercall_lock,irq_state);
+       return 0;
+    }
+
+    switch (a1) {
+       case 0x0:   // null
+           
+           rdtscll(c);
+           
+           // report elapsed cycles (c - last_boot_start), as the message states
+           V3_Print(core->vm_info,core, "hvm: received hypercall %x  rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n",
+                    hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, c-core->hvm_state.last_boot_start, core->num_exits);
+           //v3_print_core_telemetry(core);
+           //    v3_print_guest_state(core);
+           core->vm_regs.rax = 0;
+           break;
+           
+       case 0x1: // reset ros
+           PrintDebug(core->vm_info,core, "hvm: reset ROS\n");
+           if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ROS,0)) { 
+               PrintError(core->vm_info,core, "hvm: reset of ROS failed\n");
+               core->vm_regs.rax = -1;
+           } else {
+               core->vm_regs.rax = 0;
+           }
+           break;
+
+       case 0x2: // reset hrt
+           PrintDebug(core->vm_info,core, "hvm: reset HRT\n");
+           if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_HRT,0)) { 
+               PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
+               core->vm_regs.rax = -1;
+           } else {
+               core->vm_regs.rax = 0;
+           }
+           break;
+
+       case 0x3: // reset both
+           PrintDebug(core->vm_info,core, "hvm: reset ROS+HRT\n");
+           if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ALL,0)) { 
+               PrintError(core->vm_info,core, "hvm: reset of ROS+HRT failed\n");
+               core->vm_regs.rax = -1;
+           } else {
+               core->vm_regs.rax = 0;
+           }
+           break;
+           
+       case 0x8: // replace HRT image
+           // a2 = gva of image
+           // a3 = size of image
+           PrintDebug(core->vm_info,core,"hvm: request replacement HRT image addr=0x%llx size=0x%llx\n",a2,a3);
+
+           if (h->hrt_image) { 
+               // delete old
+               V3_VFree(h->hrt_image);
+               h->hrt_image = 0;
+               // the recorded size must never describe a freed buffer
+               h->hrt_image_size = 0;
+           }
+
+           h->hrt_image = V3_VMalloc(a3);
+
+           if (!(h->hrt_image)) {
+               PrintError(core->vm_info,core, "hvm: failed to allocate space for replacement image\n");
+               core->vm_regs.rax = -1;
+           } else {
+               if (v3_read_gva_memory(core, a2, a3, (uint8_t*) h->hrt_image)!=a3) { 
+                   PrintError(core->vm_info, core, "hvm: cannot read replacement image\n");
+                   // do not keep a partially filled buffer around as "the image"
+                   V3_VFree(h->hrt_image);
+                   h->hrt_image = 0;
+                   core->vm_regs.rax = -1;
+               } else {
+                   h->hrt_image_size = a3; 
+                   core->vm_regs.rax = 0;
+               }
+           }
+
+           if (core->vm_regs.rax) { 
+               PrintError(core->vm_info,core,"hvm: Failed to replace HRT image\n");
+           } else {
+               PrintDebug(core->vm_info,core,"hvm: HRT image successfully replaced\n");
+           }
+
+           break;
+
+
+       case 0xf: // get HRT state
+           // rax carries the transaction state; the ROS event is copied out to a2
+           core->vm_regs.rax = h->trans_state;
+           if (v3_write_gva_memory(core, a2, sizeof(h->ros_event), (uint8_t*) &h->ros_event)!=sizeof(h->ros_event)) { 
+               PrintError(core->vm_info, core, "hvm: cannot write back ROS event state to %p - continuing\n",(void*)a2);
+           }
+           //PrintDebug(core->vm_info,core,"hvm: get HRT transaction state 0x%llx\n",core->vm_regs.rax);
+           break;
+
+       case 0x10: // post a ROS event (a2 = gva of event descriptor)
+           PrintDebug(core->vm_info, core, "hvm: ROS event request\n");
+           if (h->ros_event.event_type!=ROS_NONE) { 
+               PrintError(core->vm_info, core, "hvm: ROS event is already in progress\n");
+               core->vm_regs.rax = -1;
+           } else {
+               if (v3_read_gva_memory(core, a2, sizeof(h->ros_event), (uint8_t*)&h->ros_event)!=sizeof(h->ros_event)) { 
+                   PrintError(core->vm_info, core, "hvm: cannot read ROS event from %p\n",(void*)a2);
+                   core->vm_regs.rax = -1;
+               } else {
+                   core->vm_regs.rax = 0;
+                   PrintDebug(core->vm_info, core, "hvm: copied new ROS event (type=%s)\n",
+                              h->ros_event.event_type == ROS_PAGE_FAULT ? "page fault" : 
+                              (h->ros_event.event_type == ROS_SYSCALL ? "syscall" : "none"));
+                   
+               }
+           }
+
+           break;
+
+       case 0x1e: // ack result (HRT has read the result of the finished event)
+           if (h->ros_event.event_type != ROS_DONE) {
+               PrintError(core->vm_info, core, "hvm: cannot ack event result when not in ROS_DONE state\n");
+               core->vm_regs.rax = -1;
+           } else {
+               h->ros_event.event_type=ROS_NONE;
+               PrintDebug(core->vm_info, core, "hvm: HRT core acks event result\n");
+               core->vm_regs.rax = 0;
+           }
+           break;
+
+       case 0x1f: // completion of ROS event (a2 = result code)
+           PrintDebug(core->vm_info, core, "hvm: completion of ROS event (rc=0x%llx)\n",a2);
+           h->ros_event.event_type=ROS_DONE;
+           h->ros_event.last_ros_event_result = a2;
+           // report success explicitly, like every other request does
+           core->vm_regs.rax = 0;
+           break;
+
+       case 0x20: // invoke function (ROS->HRT)
+       case 0x21: // invoke parallel function (ROS->HRT)
+           if (v3_is_hvm_hrt_core(core)) { 
+               PrintError(core->vm_info,core,"hvm: %s function invocation not supported from HRT core\n", a1==0x20 ? "" : "parallel");
+               core->vm_regs.rax = -1;
+           } else {
+               if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { 
+                   PrintError(core->vm_info,core, "hvm: cannot invoke %s function %p in state %d\n",a1==0x20 ? "" : "parallel", (void*)a2,h->trans_state);
+                   core->vm_regs.rax = -1;
+               } else {
+                   uint64_t *page = (uint64_t *) h->comm_page_hva;
+                   uint64_t first, last, cur;
+
+                   PrintDebug(core->vm_info,core, "hvm: %s invoke function %p\n",a1==0x20 ? "" : "parallel",(void*)a2);
+                   // request and argument are passed through the comm page
+                   page[0] = a1;
+                   page[1] = a2;
+
+                   // a plain call targets the first HRT core only, a parallel
+                   // call targets every core from the first HRT core upward
+                   if (a1==0x20) { 
+                       first=last=h->first_hrt_core;
+                   } else {
+                       first=h->first_hrt_core;
+                       last=core->vm_info->num_cores-1;
+                   }
+
+                   core->vm_regs.rax = 0;
+
+                   // number of completions (0x2f) expected before returning to idle
+                   h->trans_count = last-first+1;
+
+                   for (cur=first;cur<=last;cur++) { 
+                       if (magic_upcall(core,cur)) {
+                           core->vm_regs.rax = -1;
+                           break;
+                       }
+                       // Force core to exit now
+                       v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
+                   }
+                   if (core->vm_regs.rax==0) { 
+                       if (a1==0x20) { 
+                           h->trans_state = HRT_CALL;
+                       } else {
+                           h->trans_state = HRT_PARCALL;
+                       }
+                   }  else {
+                       PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
+                       h->trans_state = HRT_IDLE;
+                       h->trans_count = 0;
+                   }
+               }
+           }
+           break;
+
+
+       case 0x28: // setup for synchronous operation (ROS->HRT)
+       case 0x29: // teardown for synchronous operation (ROS->HRT)
+           if (v3_is_hvm_hrt_core(core)) { 
+               PrintError(core->vm_info,core,"hvm: %ssynchronization invocation not supported from HRT core\n",a1==0x29 ? "de" : "");
+               core->vm_regs.rax = -1;
+           } else {
+               // setup is legal only from idle, teardown only from sync
+               if (ENFORCE_STATE_MACHINE && 
+                   ((a1==0x28 && h->trans_state!=HRT_IDLE) || (a1==0x29 && h->trans_state!=HRT_SYNC))) { 
+                   PrintError(core->vm_info,core, "hvm: cannot invoke %ssynchronization in state %d\n",a1==0x29 ? "de" : "", h->trans_state);
+                   core->vm_regs.rax = -1;
+               } else {
+                   uint64_t *page = (uint64_t *) h->comm_page_hva;
+                   uint64_t first, last, cur;
+
+                   PrintDebug(core->vm_info,core, "hvm: invoke %ssynchronization on address %p\n",a1==0x29 ? "de" : "",(void*)a2);
+                   page[0] = a1;
+                   page[1] = a2;
+
+                   first=last=h->first_hrt_core;  // initially we will sync only with BSP
+
+                   core->vm_regs.rax = 0;
+
+                   h->trans_count = last-first+1;
+
+                   for (cur=first;cur<=last;cur++) { 
+                       
+                       if (magic_upcall(core,cur)) { 
+                           core->vm_regs.rax = -1;
+                           break;
+                       }
+                       // Force core to exit now
+                       v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
+                         
+                   }
+                   if (core->vm_regs.rax==0) { 
+                       if (a1==0x28) { 
+                           h->trans_state = HRT_SYNCSETUP;
+                       } else {
+                           h->trans_state = HRT_SYNCTEARDOWN;                      
+                       }
+                   }  else {
+                       PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
+                       h->trans_state = HRT_IDLE;
+                       h->trans_count = 0;
+                   }
+               }
+           }
+           break;
+
+       case 0x2f: // function exec or sync done
+           if (v3_is_hvm_ros_core(core)) { 
+               PrintError(core->vm_info,core, "hvm: request for exec or sync done from ROS core\n");
+               core->vm_regs.rax=-1;
+           } else {
+               if (ENFORCE_STATE_MACHINE && 
+                   h->trans_state!=HRT_CALL && 
+                   h->trans_state!=HRT_PARCALL && 
+                   h->trans_state!=HRT_SYNCSETUP &&
+                   h->trans_state!=HRT_SYNCTEARDOWN) {
+                   PrintError(core->vm_info,core,"hvm: function or sync completion when not in HRT_CALL, HRT_PARCALL, HRT_SYNCSETUP, or HRT_SYNCTEARDOWN state\n");
+                   core->vm_regs.rax=-1;
+               } else {
+                   uint64_t one=1;
+                   PrintDebug(core->vm_info,core, "hvm: function or sync complete\n");
+                   // the fetched (pre-decrement) value is 1 for the last outstanding core
+                   if (__sync_fetch_and_sub(&h->trans_count,one)==1) {
+                       // last one, switch state
+                       if (h->trans_state==HRT_SYNCSETUP) { 
+                           h->trans_state=HRT_SYNC;
+                           PrintDebug(core->vm_info,core, "hvm: function complete - now synchronous\n");
+                       } else {
+                           h->trans_state=HRT_IDLE;
+                       }
+                   }
+                   core->vm_regs.rax=0;
+               }
+           }
+                   
+           break;
+
+       case 0x30: // merge address space
+       case 0x31: // unmerge address space
+           if (v3_is_hvm_hrt_core(core)) { 
+               PrintError(core->vm_info,core,"hvm: request to %smerge address space from HRT core\n", a1==0x30 ? "" : "un");
+               core->vm_regs.rax=-1;
+           } else {
+               if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { 
+                   PrintError(core->vm_info,core,"hvm: request to %smerge address space in non-idle state (%d)\n",a1==0x30 ? "" : "un", h->trans_state);
+                   core->vm_regs.rax=-1;
+               } else {
+                   uint64_t *page = (uint64_t *) h->comm_page_hva;
+
+                   PrintDebug(core->vm_info,core,"hvm: %smerge address space request with %p\n",a1==0x30 ? "" : "un",(void*)core->ctrl_regs.cr3);
+                   // should sanity check to make sure guest is in 64 bit without anything strange
+
+                   page[0] = a1;
+                   page[1] = core->ctrl_regs.cr3;  // this is a do-not-care for an unmerge
+
+                   core->vm_regs.rax = 0;
+
+                   h->trans_state = HRT_MERGE;
+
+                   if (magic_upcall(core,h->first_hrt_core)) {
+                       // nobody will ever report completion (0x3f), so do not
+                       // leave the state machine stuck in HRT_MERGE
+                       PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT merge upcall failure\n");
+                       h->trans_state = HRT_IDLE;
+                       core->vm_regs.rax = -1;
+                       break;
+                   }
+
+                   // Force core to exit now
+                   v3_interrupt_cpu(core->vm_info,core->vm_info->cores[h->first_hrt_core].pcpu_id,0);
+
+               }
+               
+           }
+               
+           break;
+           
+
+       case 0x3f: // merge operation done
+           if (v3_is_hvm_ros_core(core)) { 
+               PrintError(core->vm_info,core, "hvm: request for merge done from ROS core\n");
+               core->vm_regs.rax=-1;
+           } else {
+               if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_MERGE) {
+                   PrintError(core->vm_info,core,"hvm: merge/unmerge done when not in merge state\n");
+                   core->vm_regs.rax=-1;
+               } else {
+                   PrintDebug(core->vm_info,core, "hvm: merge or unmerge complete - back to idle\n");
+                   h->trans_state=HRT_IDLE;
+                   core->vm_regs.rax=0;
+               }
+           }
+                   
+           break;
+           
+       case 0x40: // install or remove signal handler
+           // a2 = handler, a3 = stack; both zero removes the handler
+           if (v3_is_hvm_hrt_core(core)) { 
+               PrintError(core->vm_info,core, "hvm: HRT cannot install signal handler...\n");
+               core->vm_regs.rax=-1;
+           } else {
+               PrintDebug(core->vm_info,core,"hvm: install signal handler for CR3=%p, handler=%p, stack=%p\n",(void*)core->ctrl_regs.cr3, (void*)a2, (void*)a3);
+               if (h->ros_signal.code) { 
+                   PrintError(core->vm_info,core,"hvm: signal is pending...\n");
+                   core->vm_regs.rax=-1;
+               } else {
+                   if ((a2 || a3) && (h->ros_signal.handler || h->ros_signal.stack)) { 
+                       PrintError(core->vm_info,core,"hvm: attempt to replace existing handler without removing it first\n");
+                       core->vm_regs.rax=-1;
+                   } else {
+                       // actually make the change
+                       h->ros_signal.handler=a2;
+                       h->ros_signal.stack=a3;
+                       h->ros_signal.cr3=core->ctrl_regs.cr3;
+                       core->vm_regs.rax=0;
+
+                       // test by signalling back a hello 
+                       // if (a2 && a3) { 
+                       //    v3_hvm_signal_ros(core->vm_info,0xf00d);
+                       //}
+                   }
+               }
+           }
+           break;
 
-    rdtscll(c);
+       case 0x41: // raise signal in the ROS from HRT or ROS
+           PrintDebug(core->vm_info,core,"hvm: HRT raises signal code=0x%llx\n", a2);
+           core->vm_regs.rax = v3_hvm_signal_ros(core->vm_info,a2);
+           break;
 
+       case 0x51: // fill GDT area (ROS only)
+           // a2 = fsbase to hand to the HRT, a3 is passed through in page[2]
+           //
+           // All preconditions are checked before anything is allocated
+           // or written, so a rejected request leaves the HRT's save
+           // area untouched.
+           if (v3_is_hvm_hrt_core(core)) {
+               PrintError(core->vm_info, core, "hvm: HRT cannot request a GDT area fill\n");
+               core->vm_regs.rax = -1;
+           } else if (!h->hrt_gdt_gva) {
+               PrintError(core->vm_info, core, "hvm: HRT has not registered a GDT state save area\n");
+               core->vm_regs.rax = -1;
+           } else if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { 
+               PrintError(core->vm_info,core, "hvm: cannot sync GDT in state %d\n", h->trans_state);
+               core->vm_regs.rax = -1;
+           } else {
+               struct guest_info * hrt_core = &core->vm_info->cores[h->first_hrt_core];
+               // NOTE(review): gdtr.limit is used as a byte count here; if it
+               // holds the architectural limit (size-1) the final byte of the
+               // table is not copied - confirm against the HRT side.
+               uint_t area_size = sizeof(struct gdt_area) + core->segments.gdtr.limit;
+               struct gdt_area * area = V3_Malloc(area_size);
+               uint64_t *page = (uint64_t *) h->comm_page_hva;
+               uint64_t first, last, cur;
+
+               if (!area) {
+                   PrintError(core->vm_info, core, "hvm: could not allocate GDT area\n");
+                   core->vm_regs.rax = -1;
+                   break;
+               }
+
+               PrintDebug(core->vm_info, core, "hvm: ROS requests to fill GDT area with fsbase=%p\n", (void*)a2);
+
+               // the GDT entries sit directly behind the header in the HRT's area
+               area->gdtr.base  = h->hrt_gdt_gva + sizeof(struct gdt_area);
+               area->gdtr.limit = core->segments.gdtr.limit;
+               area->fsbase     = a2;
+               area->cs         = core->segments.cs.selector;
+               area->ds         = core->segments.ds.selector;
+               area->es         = core->segments.es.selector;
+               area->fs         = core->segments.fs.selector;
+               area->gs         = core->segments.gs.selector;
+               area->ss         = core->segments.ss.selector;
+               
+               if (v3_read_gva_memory(core, 
+                                      core->segments.gdtr.base,
+                                      core->segments.gdtr.limit,
+                                      (uint8_t*)area->gdt) != core->segments.gdtr.limit) {
+                   PrintError(core->vm_info, core, "hvm: could not copy GDT from ROS\n");
+                   core->vm_regs.rax = -1;
+                   V3_Free(area);
+                   break;
+               }
+
+               // copy the entire area over
+               PrintDebug(core->vm_info, core, "hvm: copying %u bytes into GDT area\n", area_size);
+
+               if (v3_write_gva_memory(hrt_core, h->hrt_gdt_gva, area_size, (uchar_t*)area) != area_size) {
+                   PrintError(core->vm_info, core, "hvm: could not copy GDT area\n");
+                   core->vm_regs.rax = -1;
+                   V3_Free(area);
+                   break;
+               }
+
+               // the staging buffer is not needed once the guest copy exists
+               V3_Free(area);
+
+               PrintDebug(core->vm_info,core, "hvm: sync GDT\n");
+               page[0] = a1;
+               page[1] = h->hrt_gdt_gva;
+               page[2] = a3;
+
+               first=last=h->first_hrt_core;
+               
+               core->vm_regs.rax = 0;
+               
+               h->trans_count = last-first+1;
+
+               for (cur=first;cur<=last;cur++) { 
+                   if (magic_upcall(core,cur)) {
+                       core->vm_regs.rax = -1;
+                       break;
+                   }
+                   // Force core to exit now
+                   v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
+               }
+               
+               if (core->vm_regs.rax==0) { 
+                   h->trans_state = HRT_GDTSYNC;
+               }  else {
+                   PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT GDT SYNC failure\n");
+                   h->trans_state = HRT_IDLE;
+                   h->trans_count = 0;
+               }
+               
+           }
+           
+           break;
+        
+       case 0x52: // register HRT GDT area
+           // a2 = gva of the HRT's GDT save area
+           if (!v3_is_hvm_hrt_core(core)) {
+               PrintError(core->vm_info, core, "hvm: ROS cannot install a GDT area\n"); 
+               core->vm_regs.rax = -1;
+           } else {
+               PrintDebug(core->vm_info, core, "hvm: HRT registers GDT save area at gva=%p\n", (void*)a2);
+               h->hrt_gdt_gva = a2;
+               core->vm_regs.rax = 0;
+           }
 
-    V3_Print(core->vm_info,core, "hvm: received hypercall %x  rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n",
-            hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, c-core->hvm_state.last_boot_start, core->num_exits);
-    //v3_print_core_telemetry(core);
-    //    v3_print_guest_state(core);
+        PrintDebug(core->vm_info, core, "hvm: Printing current HRT GDT...\n");
+#ifdef V3_CONFIG_DEBUG_HVM
+        v3_print_gdt(core, core->segments.gdtr.base);
+#endif
+       
+        break;
+       
+       case 0x53: // restore GDT
+
+           if (v3_is_hvm_hrt_core(core)) {
+               PrintError(core->vm_info, core, "hvm: HRT cannot request GDT restoration\n");
+               core->vm_regs.rax = -1;
+               break;
+           } else {
+               PrintDebug(core->vm_info, core, "hvm: ROS requesting to restore original GDT\n");
+               core->vm_regs.rax = 0;
+           }
+           
+           if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { 
+               PrintError(core->vm_info,core, "hvm: cannot sync GDT in state %d\n", h->trans_state);
+               core->vm_regs.rax = -1;
+               break;
+           } else {
+               uint64_t *page = (uint64_t *) h->comm_page_hva;
+               uint64_t first, last, cur;
+               
+               PrintDebug(core->vm_info,core, "hvm: restore GDT\n");
+               page[0] = a1;
+               
+               first=last=h->first_hrt_core;
+               
+               core->vm_regs.rax = 0;
+               
+               h->trans_count = last-first+1;
+               
+               for (cur=first;cur<=last;cur++) { 
+                   if (magic_upcall(core,cur)) {
+                       core->vm_regs.rax = -1;
+                       break;
+                   }
+                   // Force core to exit now
+                   v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
+               }
+               
+               if (core->vm_regs.rax==0) { 
+                   h->trans_state = HRT_GDTSYNC;
+               }  else {
+                   PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT GDT SYNC failure\n");
+                   h->trans_state = HRT_IDLE;
+                   h->trans_count = 0;
+               }
+           }
+           
+           break;
+           
+       case 0x5f: // GDT sync operation done
+           if (v3_is_hvm_ros_core(core)) { 
+               PrintError(core->vm_info,core, "hvm: invalid request for GDT sync done from ROS core\n");
+               core->vm_regs.rax=-1;
+           } else {
+               if (ENFORCE_STATE_MACHINE && h->trans_state != HRT_GDTSYNC) {
+                   PrintError(core->vm_info,core,"hvm: GDT sync done when in incorrect state (%d)\n", h->trans_state);
+                   core->vm_regs.rax=-1;
+               } else {
+                   PrintDebug(core->vm_info,core, "hvm: GDT sync complete - back to idle\n");
+                   PrintDebug(core->vm_info, core, "hvm: Dumping new HRT GDT...\n");
+#ifdef V3_CONFIG_DEBUG_HVM
+                   v3_print_gdt(core, core->segments.gdtr.base);
+#endif
+                   h->trans_state=HRT_IDLE;
+                   core->vm_regs.rax=0;
+               }
+               
+           }
+           break;
 
+       default:
+           PrintError(core->vm_info,core,"hvm: unknown hypercall %llx\n",a1);
+           core->vm_regs.rax=-1;
+           break;
+    }
+               
+    v3_unlock_irqrestore(h->hypercall_lock,irq_state);
     return 0;
 }
 
+
 #define CEIL_DIV(x,y) (((x)/(y)) + !!((x)%(y)))
 
 int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
@@ -153,7 +763,7 @@ int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
     }
 
     vm->hvm_state.first_hrt_gpa = ((uint64_t)atoi(ros_mem))*1024*1024;
-       
+
     if (!(hrt_config=v3_cfg_subtree(hvm_config,"hrt"))) { 
        PrintError(vm,VCORE_NONE,"hvm: HVM configuration without HRT block...\n");
        return -1;
@@ -177,6 +787,8 @@ int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
        return -1;
     }
 
+    v3_lock_init(&(vm->hvm_state.hypercall_lock));
+
     // XXX sanity check config here
 
     vm->hvm_state.is_hvm=1;
@@ -204,8 +816,26 @@ int v3_deinit_hvm_vm(struct v3_vm_info *vm)
 {
     PrintDebug(vm, VCORE_NONE, "hvm: HVM VM deinit\n");
 
+
+    if (vm->hvm_state.hrt_image) { 
+       V3_VFree(vm->hvm_state.hrt_image);
+       vm->hvm_state.hrt_image=0;
+       vm->hvm_state.hrt_image_size=0;
+    }
+
     v3_remove_hypercall(vm,HVM_HCALL);
 
+    v3_lock_deinit(&(vm->hvm_state.hypercall_lock));
+
+    if (vm->hvm_state.comm_page_hpa) { 
+       struct v3_mem_region *r = v3_get_mem_region(vm,-1,(addr_t)vm->hvm_state.comm_page_hpa);
+       if (!r) { 
+           PrintError(vm,VCORE_NONE,"hvm: odd, VM has comm_page_hpa, but no shadow memory\n");
+       } else {
+           v3_delete_mem_region(vm,r);
+       }
+    }
+
     return 0;
 }
 
@@ -263,7 +893,7 @@ uint32_t v3_get_hvm_hrt_cores(struct v3_vm_info *vm)
 int v3_is_hvm_ros_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
 {
     if (vm->hvm_state.is_hvm) { 
-       return gpa>=0 && gpa<vm->hvm_state.first_hrt_gpa;
+       return gpa<vm->hvm_state.first_hrt_gpa;
     } else {
        return 1;
     }
@@ -331,15 +961,15 @@ void     v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_in
 #define MAX(x,y) ((x)>(y)?(x):(y))
 #define MIN(x,y) ((x)<(y)?(x):(y))
 
-#ifdef HVM_MAP_1G_2M
-#define BOOT_STATE_END_ADDR (MIN(vm->mem_size,0x40000000ULL))
-#else
-#define BOOT_STATE_END_ADDR (MIN(vm->mem_size,0x800000000ULL))
-#endif
 
+// End address used as the anchor for the HRT boot state pages: the VM's
+// memory size passed through PAGE_ADDR (presumably page-aligning it -
+// confirm against the PAGE_ADDR definition).
+static uint64_t boot_state_end_addr(struct v3_vm_info *vm) 
+{
+    uint64_t end_addr = PAGE_ADDR(vm->mem_size);
+
+    return end_addr;
+}
+   
 static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
 {
-    *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR - PAGE_SIZE);
+    *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - PAGE_SIZE);
     *limit = PAGE_SIZE;
 }
 
@@ -391,7 +1021,7 @@ static void write_null_int_handler(struct v3_vm_info *vm)
 
 static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
 {
-    *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR - 2 * PAGE_SIZE);
+    *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - 2 * PAGE_SIZE);
     *limit = 16*256;
 }
 
@@ -403,7 +1033,7 @@ static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
 //    3 ist        => (stack) = 0 => current stack
 //    5 reserved   => 0
 //    4 type       => 0xe=>INT, 0xf=>TRAP 
-//    1 reserved   => 0
+//    1 reserved   => 0  (indicates "system" by being zero)
 //    2 dpl        => 0
 //    1 present    => 1
 //   16 offsetmid  => 0
@@ -414,7 +1044,7 @@ static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
 // 
 // Note little endian
 //
-static uint64_t idt64_trap_gate_entry_mask[2] = {  0x00008f0000080000, 0x0 } ;
+static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 } ;
 static uint64_t idt64_int_gate_entry_mask[2] =  { 0x00008e0000080000, 0x0 };
 
 static void write_idt(struct v3_vm_info *vm)
@@ -431,6 +1061,8 @@ static void write_idt(struct v3_vm_info *vm)
 
     get_null_int_handler_loc(vm,&handler,&handler_len);
 
+    handler += vm->hvm_state.gva_offset;
+
     memcpy(trap_gate,idt64_trap_gate_entry_mask,16);
     memcpy(int_gate,idt64_int_gate_entry_mask,16);
 
@@ -469,7 +1101,7 @@ static void write_idt(struct v3_vm_info *vm)
 
 static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
 {
-    *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR - 3 * PAGE_SIZE);
+    *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 3 * PAGE_SIZE);
     *limit = 8*3;
 }
 
@@ -494,7 +1126,7 @@ static void write_gdt(struct v3_vm_info *vm)
 
 static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
 {
-    *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR - 4 * PAGE_SIZE);
+    *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 4 * PAGE_SIZE);  // TSS: 4th page below boot_state_end_addr()
     *limit = PAGE_SIZE;
 }
 
@@ -510,159 +1142,307 @@ static void write_tss(struct v3_vm_info *vm)
     PrintDebug(vm,VCORE_NONE,"hvm: wrote TSS at %p\n",base);
 }
 
+
+#define TOP_HALF_START  0xffff800000000000ULL
+#define BOTTOM_HALF_END 0x00007fffffffffffULL
+
+
+#define L4_UNIT PAGE_SIZE
+#define L3_UNIT (512ULL * L4_UNIT)
+#define L2_UNIT (512ULL * L3_UNIT)
+#define L1_UNIT (512ULL * L2_UNIT)
+
+static void compute_pts_4KB(struct v3_vm_info *vm, 
+                           uint64_t *l1, uint64_t *l2, uint64_t *l3, uint64_t *l4)    
+{
+    // Computes how many page-table pages each level needs to map max_mem_mapped bytes
+    // with 4 KB pages: *l1 = PML4 pages, *l2/*l3/*l4 = 2nd/3rd/4th level pages.
+    // We map the physical memory up to max_mem_mapped either at 0x0 or at TOP_HALF_START,
+    // that is, it either fills in the first 256 rows of PML4 or the last 256 rows of PML4,
+    // so it is the same number of page tables regardless.
+    uint64_t max_gva = vm->hvm_state.max_mem_mapped;  // a size in bytes, despite the name
+
+    *l1 = 1;  // 1 PML4
+    *l2 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*512ULL*4096ULL),512);  // 1 GB entries, 512 per table (assumes CEIL_DIV rounds up)
+    *l3 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*4096ULL),512);  // 2 MB entries, 512 per table
+    *l4 = CEIL_DIV(CEIL_DIV(max_gva,4096ULL),512);  // 4 KB entries, 512 per table
+}
+
+
+
 /*
-  PTS MAP FIRST 512 GB identity mapped: 
-  1 second level
-     512 entries
+  PTS MAP using 1 GB pages
+  n second levels pts, highest gva, highest address
   1 top level
-     1 entries
+
 
 OR
   
-  PTS MAP FIRST 1 GB identity mapped:
-  1 third level
-    512 entries
-  1 second level
-    1 entries
-  1 top level
-    1 entries
+  PTS MAP using 2 MB pages
+  n third level pts, highest gva, highest address
+  m second level pts, highest gva, highest address
+  1 top level pt
+
+OR
+
+  PTS MAP using 4 KB pages
+  n 4th level, highest gva, highest address
+  m 3rd level, highest gva, highest address
+  l second level, highest gva, highest address
+  1 top level pt
+
+OR
+  PTS MAP using 512 GB pages when this becomes available
+
 */
 
+
 static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
 {
-#ifdef HVM_MAP_1G_2M
-    *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR-(5+2)*PAGE_SIZE);
-    *limit =  3*PAGE_SIZE;
-#else
-    *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR-(5+1)*PAGE_SIZE);
-    *limit =  2*PAGE_SIZE;
-#endif
+    uint64_t l1,l2,l3,l4;
+    uint64_t num_pt;
+
+    compute_pts_4KB(vm,&l1,&l2,&l3,&l4);
+
+    if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { 
+       num_pt = l1;
+    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { 
+       num_pt = l1 + l2;
+    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
+       num_pt = l1 + l2 + l3;
+    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { 
+       num_pt = l1 + l2 + l3 + l4;
+    } else {
+       PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT location flags=0x%llx memsize=0x%llx\n",vm->hvm_state.hrt_flags,(uint64_t)vm->mem_size);
+       return;
+    }
+
+    *base = (void*)PAGE_ADDR(boot_state_end_addr(vm)-(4+num_pt)*PAGE_SIZE);
+    *limit = num_pt*PAGE_SIZE;
 }
 
-#ifndef HVM_MAP_1G_2M
-static void write_pt_2level_512GB(struct v3_vm_info *vm)
+static void write_pts(struct v3_vm_info *vm)  // builds the HRT's page tables in guest memory
 {
-    void *base;
     uint64_t size;
-    struct pml4e64 pml4e;
-    struct pdpe64 pdpe;
-    uint64_t i;
-
-    get_pt_loc(vm,&base, &size);
-    if (size!=2*PAGE_SIZE) { 
-       PrintError(vm,VCORE_NONE,"Cannot support pt request, defaulting\n");
+    uint64_t num_l1, num_l2, num_l3, num_l4;
+    void *start_l1, *start_l2, *start_l3, *start_l4;
+    uint64_t max_level;  // deepest level to build: 1 = PML4 only ... 4 = down to 4 KB PTEs
+    void *cur_pt;
+    void *cur_gva;
+    void *cur_gpa;
+    void *min_gpa = 0;  // the mapping always starts at GPA 0
+    void *max_gpa = (void*) vm->hvm_state.max_mem_mapped;
+    void *min_gva = (void*) vm->hvm_state.gva_offset;  // GPA 0 appears at this GVA
+#ifdef V3_CONFIG_DEBUG_HVM
+    void *max_gva = min_gva+vm->hvm_state.max_mem_mapped;
+#endif
+    uint64_t i, pt;
+    uint64_t i_start,i_end;
+    
+    struct pml4e64 *pml4e;
+    struct pdpe64 *pdpe;
+    struct pde64 *pde;
+    struct pte64 *pte;
+    // the page size flag decides how many levels of tables get built
+    if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { 
+       PrintError(vm,VCORE_NONE,"hvm: Attempt to build 512 GB pages\n");
+       max_level = 1;
+    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { 
+       max_level = 2;
+    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
+       max_level = 3;
+    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { 
+       max_level = 4;
+    } else {
+       PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT levels\n");
+       return;
     }
 
-    if (vm->mem_size > 0x800000000ULL) { 
-       PrintError(vm,VCORE_NONE, "VM has more than 512 GB\n");
+    get_pt_loc(vm,&start_l1,&size);
+    compute_pts_4KB(vm,&num_l1,&num_l2,&num_l3,&num_l4);
+    // tables are laid out contiguously in GPA order: PML4, then all L2, all L3, all L4 tables
+    start_l2=start_l1+PAGE_SIZE*num_l1;
+    start_l3=start_l2+PAGE_SIZE*num_l2;
+    start_l4=start_l3+PAGE_SIZE*num_l3;
+
+    PrintDebug(vm,VCORE_NONE,"hvm: writing %llu levels of PTs start at address %p\n", max_level,start_l1);
+    PrintDebug(vm,VCORE_NONE,"hvm: min_gva=%p, max_gva=%p, min_gpa=%p, max_gpa=%p\n",min_gva,max_gva,min_gpa,max_gpa);
+    PrintDebug(vm,VCORE_NONE,"hvm: num_l1=%llu, num_l2=%llu, num_l3=%llu, num_l4=%llu\n", num_l1, num_l2, num_l3, num_l4);
+    PrintDebug(vm,VCORE_NONE,"hvm: start_l1=%p, start_l2=%p, start_l3=%p, start_l4=%p\n", start_l1, start_l2, start_l3, start_l4);
+
+    cur_pt=start_l1;
+
+    // build PML4 (only one)
+    if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pml4e)) { 
+       PrintError(vm,VCORE_NONE,"hvm: Cannot translate pml4 location\n");
+       return;
     }
 
-    memset(&pdpe,0,sizeof(pdpe));
-    pdpe.present=1;
-    pdpe.writable=1;
-    pdpe.large_page=1;
-    
-    for (i=0;i<512;i++) {
-       pdpe.pd_base_addr = i*0x40000;  // 0x4000 = 256K pages = 1 GB
-       v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+PAGE_SIZE+i*sizeof(pdpe)),sizeof(pdpe),(uint8_t*)&pdpe);
+    memset(pml4e,0,PAGE_SIZE);
+
+    if (min_gva==0x0) { 
+       i_start=0; i_end = num_l2;  // one PML4 entry per second level table
+    } else if (min_gva==(void*)TOP_HALF_START) { 
+       i_start=256; i_end=256+num_l2;  // PML4 entry 256 is the first one covering TOP_HALF_START
+    } else {
+       PrintError(vm,VCORE_NONE,"hvm: unsupported gva offset\n");
+       return;
     }
 
-    memset(&pml4e,0,sizeof(pml4e));
-    pml4e.present=1;
-    pml4e.writable=1;
-    pml4e.pdp_base_addr = PAGE_BASE_ADDR((addr_t)(base+PAGE_SIZE));
+    for (i=i_start, cur_gva=min_gva, cur_gpa=min_gpa;
+        (i<i_end);
+        i++, cur_gva+=L1_UNIT, cur_gpa+=L1_UNIT) {
 
-    v3_write_gpa_memory(&vm->cores[0],(addr_t)base,sizeof(pml4e),(uint8_t*)&pml4e);    
+       pml4e[i].present=1;
+       pml4e[i].writable=1;
+       
+       if (max_level==1) { 
+           PrintError(vm,VCORE_NONE,"hvm: Intel has not yet defined a PML4E large page\n");
+           pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
+           //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
+       } else {
+           pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(start_l2+(i-i_start)*PAGE_SIZE));  // k-th used entry -> k-th L2 table
+           //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
+       }
+    }
 
-    for (i=1;i<512;i++) {
-       pml4e.present=0;
-       v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*sizeof(pml4e)),sizeof(pml4e),(uint8_t*)&pml4e);
+    // 512 GB only
+    if (max_level==1) {
+       return;
     }
 
-    PrintDebug(vm,VCORE_NONE,"hvm: Wrote page tables (1 PML4, 1 PDPE) at %p (512 GB mapped)\n",base);
-}
 
-#else 
 
-static void write_pt_3level_1GB(struct v3_vm_info *vm)
-{
-    void *base;
-    uint64_t size;
-    struct pml4e64 pml4e;
-    struct pdpe64 pdpe;
-    struct pde64 pde;
+    for (cur_pt=start_l2, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
+        pt<num_l2;
+        cur_pt+=PAGE_SIZE, pt++) { 
 
-    uint64_t i;
+       // build PDPE
+       if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pdpe)) { 
+           PrintError(vm,VCORE_NONE,"hvm: Cannot translate pdpe location\n");
+           return;
+       }
+       // start with every entry not present
+       memset(pdpe,0,PAGE_SIZE);
+       // fill entries until the table is full or max_mem_mapped is covered; cur_gpa carries over to the next table
+       for (i=0; 
+            i<512 && cur_gpa<max_gpa; 
+            i++, cur_gva+=L2_UNIT, cur_gpa+=L2_UNIT) {
 
-    get_pt_loc(vm,&base, &size);
-    if (size!=3*PAGE_SIZE) { 
-       PrintError(vm,VCORE_NONE,"Cannot support pt request, defaulting\n");
+           pdpe[i].present=1;
+           pdpe[i].writable=1;
+       
+           if (max_level==2) { 
+               pdpe[i].large_page=1;  // 1 GB page: the entry maps the GPA directly
+               pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
+               //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
+           } else {
+               pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(start_l3+(pt*512+i)*PAGE_SIZE));  // L3 tables are numbered consecutively across L2 tables
+               //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
+           }
+       }
     }
-
-    if (vm->mem_size > 0x40000000ULL) { 
-       PrintError(vm,VCORE_NONE, "VM has more than 1 GB\n");
+       
+    //1 GB only
+    if (max_level==2) { 
+       return;
     }
 
-    memset(&pde,0,sizeof(pde));
-    pde.present=1;
-    pde.writable=1;
-    pde.large_page=1;
-    
-    for (i=0;i<512;i++) {
-       pde.pt_base_addr = i*0x200;  // 0x200 = 512 pages = 2 MB
-       v3_write_gpa_memory(&vm->cores[0],
-                           (addr_t)(base+2*PAGE_SIZE+i*sizeof(pde)),
-                           sizeof(pde),(uint8_t*)&pde);
-    }
+    for (cur_pt=start_l3, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
+        pt<num_l3;
+        cur_pt+=PAGE_SIZE, pt++) { 
 
-    memset(&pdpe,0,sizeof(pdpe));
-    pdpe.present=1;
-    pdpe.writable=1;
-    pdpe.large_page=0;
+       // build PDE
+       if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pde)) { 
+           PrintError(vm,VCORE_NONE,"hvm: Cannot translate pde location\n");
+           return;
+       }
+       
+       memset(pde,0,PAGE_SIZE);
+       
+       for (i=0; 
+            i<512 && cur_gpa<max_gpa; 
+            i++, cur_gva+=L3_UNIT, cur_gpa+=L3_UNIT) {
 
-    pdpe.pd_base_addr = PAGE_BASE_ADDR((addr_t)(base+2*PAGE_SIZE));
+           pde[i].present=1;
+           pde[i].writable=1;
+       
+           if (max_level==3) { 
+               pde[i].large_page=1;  // 2 MB page: the entry maps the GPA directly
+               pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
+               //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t) pde[i].pt_base_addr);
+           } else {
+               pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(start_l4+(pt*512+i)*PAGE_SIZE));  // L4 tables are numbered consecutively across L3 tables
+               //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pde[i].pt_base_addr);
+           }
+       }
+    }
 
-    v3_write_gpa_memory(&vm->cores[0],(addr_t)base+PAGE_SIZE,sizeof(pdpe),(uint8_t*)&pdpe);    
-    
-    for (i=1;i<512;i++) {
-       pdpe.present = 0; 
-       v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+PAGE_SIZE+i*sizeof(pdpe)),sizeof(pdpe),(uint8_t*)&pdpe);
+    //2 MB only
+    if (max_level==3) { 
+       return;
     }
 
-    memset(&pml4e,0,sizeof(pml4e));
-    pml4e.present=1;
-    pml4e.writable=1;
-    pml4e.pdp_base_addr = PAGE_BASE_ADDR((addr_t)(base+PAGE_SIZE));
 
-    v3_write_gpa_memory(&vm->cores[0],(addr_t)base,sizeof(pml4e),(uint8_t*)&pml4e);    
+    // 4 KB
+    for (cur_pt=start_l4, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
+        pt<num_l4;
+        cur_pt+=PAGE_SIZE, pt++) { 
 
-    for (i=1;i<512;i++) {
-       pml4e.present=0;
-       v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*sizeof(pml4e)),sizeof(pml4e),(uint8_t*)&pml4e);
+       // build PTE
+       if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pte)) { 
+           PrintError(vm,VCORE_NONE,"hvm: Cannot translate pte location\n");
+           return;
+       }
+       
+       memset(pte,0,PAGE_SIZE);
+       
+       for (i=0; 
+            i<512 && cur_gpa<max_gpa; 
+            i++, cur_gva+=L4_UNIT, cur_gpa+=L4_UNIT) {
+
+           pte[i].present=1;
+           pte[i].writable=1;
+           pte[i].page_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));  // leaf entry: maps one 4 KB page of guest physical memory
+           //PrintDebug(vm,VCORE_NONE,"hvm: pte: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pte[i].page_base_addr);
+       }
     }
 
-    PrintDebug(vm,VCORE_NONE,"hvm: Wrote page tables (1 PML4, 1 PDPE, 1 PDP) at %p (1 GB mapped)\n",base);
+    return;
 }
 
-#endif
 
-static void write_pt(struct v3_vm_info *vm)
+static void get_mb_info_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
 {
-#ifdef HVM_MAP_1G_2M
-    write_pt_3level_1GB(vm);
-#else
-    write_pt_2level_512GB(vm);
-#endif
+    // the multiboot info page is the single page immediately below the page tables
+    get_pt_loc(vm,base, limit);
+    *base-=PAGE_SIZE;   // step down one page from the first page-table page
+    *limit=PAGE_SIZE;   // the info area is exactly one page
 }
 
-static void get_mb_info_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
+
+int v3_build_hrt_multiboot_tag(struct guest_info *core, mb_info_hrt_t *hrt)
 {
-#ifdef HVM_MAP_1G_2M
-    *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR-(6+2)*PAGE_SIZE);
-#else
-    *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR-(6+1)*PAGE_SIZE);
-#endif
-    *limit =  PAGE_SIZE;
+    struct v3_vm_info *vm = core->vm_info;
+    // Fills in the caller-supplied HRT multiboot info tag from the VM's state; always returns 0
+    hrt->tag.type = MB_INFO_HRT_TAG;
+    hrt->tag.size = sizeof(mb_info_hrt_t);
+    // APIC count is the VM's core count; the first HRT APIC id is the first HRT core
+    hrt->total_num_apics = vm->num_cores;
+    hrt->first_hrt_apic_id = vm->hvm_state.first_hrt_core;
+    hrt->have_hrt_ioapic=0;          // no HRT ioapic is reported
+    hrt->first_hrt_ioapic_entry=0;
+
+    hrt->cpu_freq_khz = V3_CPU_KHZ();
+    // the remaining fields are copied verbatim from the VM's hvm_state
+    hrt->hrt_flags = vm->hvm_state.hrt_flags;
+    hrt->max_mem_mapped = vm->hvm_state.max_mem_mapped;
+    hrt->first_hrt_gpa = vm->hvm_state.first_hrt_gpa;
+    hrt->gva_offset = vm->hvm_state.gva_offset;
+    hrt->comm_page_gpa = vm->hvm_state.comm_page_gpa;
+    hrt->hrt_int_vector = vm->hvm_state.hrt_int_vector;
+    
+    return 0;
 }
 
 static void write_mb_info(struct v3_vm_info *vm) 
@@ -753,77 +1533,156 @@ static mb_header_t *find_mb_header(uint8_t *data, uint64_t size)
 }
 
 
-// 
-// BROKEN - THIS DOES NOT DO WHAT YOU THINK
-//
-static int setup_elf(struct v3_vm_info *vm, void *base, uint64_t limit)
+static int configure_hrt(struct v3_vm_info *vm, mb_data_t *mb)
 {
-    v3_write_gpa_memory(&vm->cores[0],(addr_t)base,vm->hvm_state.hrt_file->size,vm->hvm_state.hrt_file->data);
-
-    vm->hvm_state.hrt_entry_addr = (uint64_t) (base+0x40);
-
-    PrintDebug(vm,VCORE_NONE,"hvm: wrote HRT ELF %s at %p\n", vm->hvm_state.hrt_file->tag,base);
-    PrintDebug(vm,VCORE_NONE,"hvm: set ELF entry to %p and hoping for the best...\n", (void*) vm->hvm_state.hrt_entry_addr);
+    struct v3_vm_hvm *h = &vm->hvm_state;
+    uint64_t f = mb->mb64_hrt->hrt_flags;
+    uint64_t maxmap = mb->mb64_hrt->max_mem_to_map;
+    uint64_t gvaoff = mb->mb64_hrt->gva_offset;
+    uint64_t gvaentry = mb->mb64_hrt->gva_entry;
+    uint64_t commgpa = mb->mb64_hrt->comm_page_gpa;
+    uint8_t  vec = mb->mb64_hrt->hrt_int_vector;
     
-    vm->hvm_state.hrt_type = HRT_ELF64;
 
-    return 0;
+    PrintDebug(vm,VCORE_NONE,"hvm: HRT request: flags=0x%llx max_map=0x%llx gva_off=%llx gva_entry=%llx comm_page=0x%llx vector=0x%x\n",
+              f, maxmap, gvaoff,gvaentry,commgpa, vec);
 
-}
+    if (maxmap<0x100000000ULL) { 
+       PrintDebug(vm,VCORE_NONE,"hvm: revising request up to 4 GB max map\n");
+       maxmap=0x100000000ULL;
+    }
 
-static int setup_mb_kernel(struct v3_vm_info *vm, void *base, uint64_t limit)
-{
-    mb_data_t mb;
+    if (f & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { 
+       PrintError(vm,VCORE_NONE,"hvm: support for 512 GB pages is not yet available in hardware\n");
+       return -1;
+    } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { 
+       f &= ~0x3c;
+       f |= MB_TAG_MB64_HRT_FLAG_MAP_1GB;
+       h->max_mem_mapped = maxmap;
+       PrintDebug(vm,VCORE_NONE,"hvm: 1 GB pages selected\n");
+    } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_2MB) { 
+       f &= ~0x3c;
+       f |= MB_TAG_MB64_HRT_FLAG_MAP_2MB;
+       h->max_mem_mapped = maxmap;
+       PrintDebug(vm,VCORE_NONE,"hvm: 2 MB pages selected\n");
+    } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { 
+       f &= ~0x3c;
+       f |= MB_TAG_MB64_HRT_FLAG_MAP_4KB;
+       h->max_mem_mapped = maxmap;
+       PrintDebug(vm,VCORE_NONE,"hvm: 4 KB pages selected\n");
+    } else {
+       PrintError(vm,VCORE_NONE,"hvm: no page table model is requested\n");
+       return -1;
+    }
 
-    if (v3_parse_multiboot_header(vm->hvm_state.hrt_file,&mb)) { 
-       PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n");
+    if (f & MB_TAG_MB64_HRT_FLAG_RELOC) {
+       PrintError(vm,VCORE_NONE,"hvm: relocatable hrt not currently supported\n");
        return -1;
     }
 
+    h->hrt_flags = f;
 
-    if (v3_write_multiboot_kernel(vm,&mb,vm->hvm_state.hrt_file,base,limit)) {
-       PrintError(vm,VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n");
+    if (maxmap>h->max_mem_mapped) { 
+       PrintError(vm,VCORE_NONE,"hvm: requested 0x%llx bytes mapped, which is more than currently supported\n",maxmap);
        return -1;
     }
 
-    /*
-    if (!mb.addr || !mb.entry) { 
-       PrintError(vm,VCORE_NONE, "hvm: kernel is missing address or entry point\n");
+    if (gvaoff!=0 && gvaoff!=TOP_HALF_START) { 
+       PrintError(vm,VCORE_NONE,"hvm: currently only GVA offsets of 0 and %llx are supported\n", TOP_HALF_START);
        return -1;
     }
+    
+    h->gva_offset = gvaoff;
+
+    h->gva_entry = gvaentry;
 
-    if (((void*)(uint64_t)(mb.addr->header_addr) < base ) ||
-       ((void*)(uint64_t)(mb.addr->load_end_addr) > base+limit) ||
-       ((void*)(uint64_t)(mb.addr->bss_end_addr) > base+limit)) { 
-       PrintError(vm,VCORE_NONE, "hvm: kernel is not within the allowed portion of HVM\n");
+    if (mb->addr->load_addr < h->first_hrt_gpa) { 
+       PrintError(vm,VCORE_NONE,"hvm: load start address of HRT is below first HRT GPA\n");
+       return -1;
+    }
+    
+    if (mb->addr->bss_end_addr > (vm->mem_size-(1024*1024*64))) {
+       PrintError(vm,VCORE_NONE,"hvm: bss end address of HRT above last allowed GPA\n");
        return -1;
     }
+    
+    if (vec<32) { 
+       PrintError(vm,VCORE_NONE,"hvm: cannot support vector %x\n",vec);
+       return -1;
+    }
+    
+    h->hrt_int_vector = vec;
+    
+    
+    if (commgpa < vm->mem_size) { 
+       PrintError(vm,VCORE_NONE,"hvm: cannot map comm page over physical memory\n");
+       return -1;
+    } 
 
-    offset = mb.addr->load_addr - mb.addr->header_addr;
+    h->comm_page_gpa = commgpa;
 
-    // Skip the ELF header - assume 1 page... weird.... 
-    // FIX ME TO CONFORM TO MULTIBOOT.C
-    v3_write_gpa_memory(&vm->cores[0],
-                       (addr_t)(mb.addr->load_addr),
-                       vm->hvm_state.hrt_file->size-PAGE_SIZE-offset,
-                       vm->hvm_state.hrt_file->data+PAGE_SIZE+offset);
+    if (!h->comm_page_hpa) { 
+       if (!(h->comm_page_hpa=V3_AllocPages(1))) { 
+           PrintError(vm,VCORE_NONE,"hvm: unable to allocate space for comm page\n");
+           return -1;
+       }
 
+       h->comm_page_hva = V3_VAddr(h->comm_page_hpa);
+       
+       memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
        
-    // vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + PAGE_SIZE; //HACK PAD
+       if (v3_add_shadow_mem(vm,-1,h->comm_page_gpa,h->comm_page_gpa+PAGE_SIZE_4KB,(addr_t)h->comm_page_hpa)) { 
+           PrintError(vm,VCORE_NONE,"hvm: unable to map communication page\n");
+           V3_FreePages((void*)(h->comm_page_gpa),1);
+           return -1;
+       }
+       
+       
+       PrintDebug(vm,VCORE_NONE,"hvm: added comm page for first time\n");
+    }
 
+    memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
+    
+    
+    PrintDebug(vm,VCORE_NONE,"hvm: HRT configuration: flags=0x%llx max_mem_mapped=0x%llx gva_offset=0x%llx gva_entry=0x%llx comm_page=0x%llx vector=0x%x\n",
+              h->hrt_flags,h->max_mem_mapped, h->gva_offset,h->gva_entry, h->comm_page_gpa, h->hrt_int_vector);
+    
+    return 0;
 
-    PrintDebug(vm,VCORE_NONE,
-              "hvm: wrote 0x%llx bytes starting at offset 0x%llx to %p; set entry to %p\n",
-              (uint64_t) vm->hvm_state.hrt_file->size-PAGE_SIZE-offset,
-              (uint64_t) PAGE_SIZE+offset,
-              (void*)(addr_t)(mb.addr->load_addr),
-              (void*) vm->hvm_state.hrt_entry_addr);
+}
 
+static int setup_mb_kernel_hrt(struct v3_vm_info *vm, void *data, uint64_t size)
+{
+    mb_data_t mb;
 
-    */
+    if (v3_parse_multiboot_header(data, size, &mb)) { 
+       PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n");
+       return -1;
+    }
+
+    if (!mb.mb64_hrt) { 
+       PrintError(vm,VCORE_NONE,"hvm: invalid HRT - there is no MB64_HRT tag\n");
+       return -1;
+    }
 
-    vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr;
+    if (configure_hrt(vm,&mb)) {
+       PrintError(vm,VCORE_NONE, "hvm: cannot configure HRT\n");
+       return -1;
+    }
     
+    if (v3_write_multiboot_kernel(vm,&mb,data,size,
+                                 (void*)vm->hvm_state.first_hrt_gpa,
+                                 vm->mem_size-vm->hvm_state.first_hrt_gpa)) {
+       PrintError(vm,VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n");
+       return -1;
+    }
+
+    if (vm->hvm_state.gva_entry) { 
+       vm->hvm_state.hrt_entry_addr = vm->hvm_state.gva_entry;
+    } else {
+       vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + vm->hvm_state.gva_offset;
+    }
+
     vm->hvm_state.hrt_type = HRT_MBOOT64;
 
     return 0;
@@ -833,37 +1692,29 @@ static int setup_mb_kernel(struct v3_vm_info *vm, void *base, uint64_t limit)
 
 static int setup_hrt(struct v3_vm_info *vm)
 {
-    void *base;
-    uint64_t limit;
-
-    get_hrt_loc(vm,&base,&limit);
+    void *data;
+    uint64_t size;
 
-    if (vm->hvm_state.hrt_file->size > limit) { 
-       PrintError(vm,VCORE_NONE,"hvm: Cannot map HRT because it is too big (%llu bytes, but only have %llu space\n", vm->hvm_state.hrt_file->size, (uint64_t)limit);
-       return -1;
+    // If the ROS has installed an image, it takes priority
+    if (vm->hvm_state.hrt_image) { 
+       data = vm->hvm_state.hrt_image;
+       size = vm->hvm_state.hrt_image_size;
+    } else {
+       data = vm->hvm_state.hrt_file->data;
+       size = vm->hvm_state.hrt_file->size;
     }
+       
+    if (is_elf(data,size) &&
+       find_mb_header(data,size)) {
 
-    if (!is_elf(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) { 
-       PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not an ELF but we are going to act like it is!\n");
-       if (setup_elf(vm,base,limit)) {
-           PrintError(vm,VCORE_NONE,"hvm: Fake ELF setup failed\n");
+       PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n");
+       if (setup_mb_kernel_hrt(vm,data,size)) { 
+           PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n");
            return -1;
-       }
-       vm->hvm_state.hrt_type=HRT_BLOB;
+       } 
     } else {
-       if (find_mb_header(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) { 
-           PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n");
-           if (setup_mb_kernel(vm,base,limit)) { 
-               PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n");
-               return -1;
-           } 
-       } else {
-           PrintDebug(vm,VCORE_NONE,"hvm: supplied HRT is an ELF\n");
-           if (setup_elf(vm,base,limit)) {
-               PrintError(vm,VCORE_NONE,"hvm: Fake ELF setup failed\n");
-               return -1;
-           }
-       }
+       PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not a multiboot kernel\n");
+       return -1;
     }
 
     return 0;
@@ -887,8 +1738,9 @@ static int setup_hrt(struct v3_vm_info *vm)
   GDT (1 page - page aligned)
   TSS (1 page - page asligned)
   PAGETABLES  (identy map of first N GB)
-     ROOT PT first, followed by 2nd level, etc.
-     Currently PML4 followed by 1 PDPE for 512 GB of mapping
+     ROOT PT first (lowest memory addr), followed by 2nd level PTs in order,
+     followed by 3rd level PTs in order, followed by 4th level
+     PTs in order.  
   MBINFO_PAGE
   SCRATCH_STACK_HRT_CORE0 
   SCRATCH_STACK_HRT_CORE1
@@ -898,7 +1750,8 @@ static int setup_hrt(struct v3_vm_info *vm)
   HRT (as many pages as needed, page-aligned, starting at first HRT address)
   ---
   ROS
-      
+
+
 */
 
 
@@ -911,20 +1764,22 @@ int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm)
 
     PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory begins\n");
 
+    if (setup_hrt(vm)) {
+       PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n");
+       return -1;
+    } 
+
+    // the locations of all the other items are determined by
+    // the HRT setup, so these must happen after
+
     write_null_int_handler(vm);
     write_idt(vm);
     write_gdt(vm);
     write_tss(vm);
 
-    write_pt(vm);
-
-    
-    if (setup_hrt(vm)) {
-       PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n");
-       return -1;
-    } 
+    write_pts(vm);
 
-    // need to parse HRT first
+    // this must happen last
     write_mb_info(vm);
 
     PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory done\n");
@@ -939,8 +1794,8 @@ int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm)
    GDTR points to stub GDT
    TS   points to stub TSS
    CR3 points to root page table
-   CR0 has PE and PG
-   EFER has LME AND LMA
+   CR0 has PE, PG, and WP
+   EFER has LME AND LMA (and NX for compatibility with Linux)
    RSP is TOS of core's scratch stack (looks like a call)
 
    RAX = MB magic cookie
@@ -948,6 +1803,8 @@ int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm)
    RCX = this core id / apic id (0..N-1)
    RDX = this core id - first HRT core ID (==0 for the first HRT core)
 
+   All addresses are virtual addresses, offset as needed by gva_offset
+
    Other regs are zeroed
 
    shadow/nested paging state reset for long mode
@@ -957,18 +1814,20 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
 {
     void *base;
     uint64_t limit;
+    uint64_t gva_offset;
 
     rdtscll(core->hvm_state.last_boot_start);
+    
 
     if (!core->hvm_state.is_hrt) { 
        PrintDebug(core->vm_info,core,"hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id);
        return 0;
     }
 
-    PrintDebug(core->vm_info, core, "hvm: setting up HRT core (%u) for boot\n", core->vcpu_id);
 
-    
+    PrintDebug(core->vm_info, core, "hvm: setting up HRT core (%u) for boot\n", core->vcpu_id);
 
+    gva_offset = core->vm_info->hvm_state.gva_offset;
     
     memset(&core->vm_regs,0,sizeof(core->vm_regs));
     memset(&core->ctrl_regs,0,sizeof(core->ctrl_regs));
@@ -990,7 +1849,7 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
 
     // multiboot info pointer
     get_mb_info_loc(core->vm_info, &base,&limit);
-    core->vm_regs.rbx = (uint64_t) base;  
+    core->vm_regs.rbx = (uint64_t) base + gva_offset;  
 
     // core number
     core->vm_regs.rcx = core->vcpu_id;
@@ -1001,6 +1860,7 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
     // Now point to scratch stack for this core
     // it begins at an ofset relative to the MB info page
     get_mb_info_loc(core->vm_info, &base,&limit);
+    base = base + gva_offset;
     base -= core->vm_regs.rdx * SCRATCH_STACK_SIZE;
     core->vm_regs.rsp = (v3_reg_t) base;  
     core->vm_regs.rbp = (v3_reg_t) base-8; 
@@ -1008,14 +1868,19 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
     // push onto the stack a bad rbp and bad return address
     core->vm_regs.rsp-=16;
     v3_set_gpa_memory(core,
-                     core->vm_regs.rsp,
+                     core->vm_regs.rsp-gva_offset,
                      16,
                      0xff);
 
 
     // HRT entry point
     get_hrt_loc(core->vm_info, &base,&limit);
-    core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr ; 
+    if (core->vm_info->hvm_state.gva_entry) { 
+      core->rip = core->vm_info->hvm_state.gva_entry;
+    } else {
+      core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr + gva_offset; 
+    }
+      
 
 
     PrintDebug(core->vm_info,core,"hvm: hrt core %u has rip=%p, rsp=%p, rbp=%p, rax=%p, rbx=%p, rcx=%p, rdx=%p\n",
@@ -1029,14 +1894,14 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
               (void*)(core->vm_regs.rdx));
 
     // Setup CRs for long mode and our stub page table
-    // CR0: PG, PE
-    core->ctrl_regs.cr0 = 0x80000001;
+    // CR0: PG, PE, and WP for catching COW faults in kernel-mode (which is not default behavior)
+    core->ctrl_regs.cr0 = 0x80010001;
     core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0;
 
     // CR2: don't care (output from #PF)
     // CE3: set to our PML4E, without setting PCD or PWT
     get_pt_loc(core->vm_info, &base,&limit);
-    core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base);
+    core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base);  // not offset as this is a GPA
     core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3;
 
     // CR4: PGE, PAE, PSE (last byte: 1 0 1 1 0 0 0 0)
@@ -1044,8 +1909,8 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
     core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4;
     // CR8 as usual
     // RFLAGS zeroed is fine: come in with interrupts off
-    // EFER needs SVME LMA LME (last 16 bits: 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0
-    core->ctrl_regs.efer = 0x1500;
+    // EFER needs SVME, NXE, LMA and LME (last 16 bits: 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0)
+    core->ctrl_regs.efer = 0x1d00;
     core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer;
 
 
@@ -1065,47 +1930,50 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
     
     // Install our stub IDT
     get_idt_loc(core->vm_info, &base,&limit);
+    base += gva_offset;
     core->segments.idtr.selector = 0;  // entry 0 (NULL) of the GDT
-    core->segments.idtr.base = (addr_t) base;
+    core->segments.idtr.base = (addr_t) base;  // only base+limit are used
     core->segments.idtr.limit = limit-1;
-    core->segments.idtr.type = 0xe;
-    core->segments.idtr.system = 1; 
+    core->segments.idtr.type = 0x0;
+    core->segments.idtr.system = 0; 
     core->segments.idtr.dpl = 0;
-    core->segments.idtr.present = 1;
-    core->segments.idtr.long_mode = 1;
+    core->segments.idtr.present = 0;
+    core->segments.idtr.long_mode = 0;
 
     // Install our stub GDT
     get_gdt_loc(core->vm_info, &base,&limit);
-    core->segments.gdtr.selector = 0;
+    base += gva_offset;
+    core->segments.gdtr.selector = 0;  // entry 0 (NULL) of the GDT
     core->segments.gdtr.base = (addr_t) base;
-    core->segments.gdtr.limit = limit-1;
-    core->segments.gdtr.type = 0x6;
-    core->segments.gdtr.system = 1; 
+    core->segments.gdtr.limit = limit-1;   // only base+limit are used
+    core->segments.gdtr.type = 0x0;
+    core->segments.gdtr.system = 0; 
     core->segments.gdtr.dpl = 0;
-    core->segments.gdtr.present = 1;
-    core->segments.gdtr.long_mode = 1;
+    core->segments.gdtr.present = 0;
+    core->segments.gdtr.long_mode = 0;
     
     // And our TSS
     get_tss_loc(core->vm_info, &base,&limit);
+    base += gva_offset;  
     core->segments.tr.selector = 0;
     core->segments.tr.base = (addr_t) base;
     core->segments.tr.limit = limit-1;
-    core->segments.tr.type = 0x6;
-    core->segments.tr.system = 1; 
+    core->segments.tr.type = 0x9;
+    core->segments.tr.system = 0;   // available 64 bit TSS 
     core->segments.tr.dpl = 0;
     core->segments.tr.present = 1;
-    core->segments.tr.long_mode = 1;
+    core->segments.tr.long_mode = 0; // not used
     
-    base = 0x0;
+    base = 0x0; // these are not offset as we want to make all gvas visible
     limit = -1;
 
     // And CS
     core->segments.cs.selector = 0x8 ; // entry 1 of GDT (RPL=0)
-    core->segments.cs.base = (addr_t) base;
-    core->segments.cs.limit = limit;
-    core->segments.cs.type = 0xe;
-    core->segments.cs.system = 0; 
-    core->segments.cs.dpl = 0;
+    core->segments.cs.base = (addr_t) base;   // not used
+    core->segments.cs.limit = limit;          // not used
+    core->segments.cs.type = 0xe;             // only C is used
+    core->segments.cs.system = 1;             // not a system segment
+    core->segments.cs.dpl = 0;                       
     core->segments.cs.present = 1;
     core->segments.cs.long_mode = 1;
 
@@ -1113,8 +1981,8 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
     core->segments.ds.selector = 0x10; // entry 2 of GDT (RPL=0)
     core->segments.ds.base = (addr_t) base;
     core->segments.ds.limit = limit;
-    core->segments.ds.type = 0x6;
-    core->segments.ds.system = 0; 
+    core->segments.ds.type = 0x6;            // ignored
+    core->segments.ds.system = 1;            // not a system segment
     core->segments.ds.dpl = 0;
     core->segments.ds.present = 1;
     core->segments.ds.long_mode = 1;
@@ -1166,21 +2034,31 @@ int v3_handle_hvm_reset(struct guest_info *core)
            // and recopy the .data, but for now we'll just
            // do everything
            rc |= v3_setup_hvm_vm_for_boot(core->vm_info);
+
+           if (rc) { 
+               PrintError(core->vm_info,core,"hvm: failed to setup HVM VM for boot rc=%d\n",rc);
+           }
        }
 
        // now everyone is ready to reset
        rc |= v3_setup_hvm_hrt_core_for_boot(core);
 
+       if (rc) { 
+           PrintError(core->vm_info,core,"hvm: failed to setup HVM core for boot rc=%d\n",rc);
+       }
+
        core->core_run_state = CORE_RUNNING;
 
        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) { 
            // leader
            core->vm_info->run_state = VM_RUNNING;
+            core->vm_info->hvm_state.trans_state = HRT_IDLE;
        }
 
        v3_counting_barrier(&core->vm_info->reset_barrier);
 
        if (rc<0) { 
+           PrintError(core->vm_info,core,"hvm: reset failed\n");
            return rc;
        } else {
            return 1;
@@ -1191,3 +2069,111 @@ int v3_handle_hvm_reset(struct guest_info *core)
        return 0;
     }
 }
+// Deliver a pending ROS signal on this core if it can be injected now: writes a 6-entry frame below the installed stack and points rip at the installed handler. Returns 0 (injected, or nothing to do) or -1 if the frame write fails.
+int v3_handle_hvm_entry(struct guest_info *core)
+{
+    if (!core->vm_info->hvm_state.is_hvm        // not relevant to non-HVM
+       || core->hvm_state.is_hrt              // not relevant to an HRT in an HVM
+       || !core->vm_info->hvm_state.ros_signal.code) { // not relevant if there is no code to inject
+
+       // Note that the above check for code is a plain (non-atomic) read and
+       // could race with a writer; if that happens, we'll simply inject at the
+       // next opportunity instead of this one (see below for atomic update)
+       return 0;
+    } else {
+       struct v3_ros_signal *s = &core->vm_info->hvm_state.ros_signal;
+
+       // HVM ROS: inject only if every one of the following holds
+       if (! (s->handler && // handler installed
+              s->cr3 &&     // process installed
+              s->stack &&   // stack installed
+              core->cpl == 3 &&  // user mode
+              core->ctrl_regs.cr3 == s->cr3) // right process active
+           ) {
+           // Cannot inject at this time; the code is left pending
+           return 0;
+       } else {
+           // We can inject now, let's atomically see if we have something
+           // and commit to doing it if we do
+           uint64_t code;
+
+           // Atomically fetch the code and reset it to zero to allow the next one
+           code = __sync_fetch_and_and(&(s->code), 0);
+
+           if (!code) { 
+               // nothing to do after all (code was already zero when fetched)
+               return 0;
+           } else {
+
+               // actually do inject
+
+               uint64_t rsp;
+               uint64_t frame[6];
+               
+               PrintDebug(core->vm_info,core,"hvm: ROS interrupt starting with rip=%p rsp=%p\n", (void*) core->rip, (void*) core->vm_regs.rsp);
+               // build interrupt frame; frame[0] lands at the new rsp, frame[5] at the highest address
+               frame[0] = code;                       // signal code
+               frame[1] = core->rip;                  // return rip
+               frame[2] = core->segments.cs.selector; // return cs
+               frame[3] = core->ctrl_regs.rflags;     // return rflags
+               frame[4] = core->vm_regs.rsp;          // return rsp
+               frame[5] = core->segments.ss.selector; // return ss
+               
+               rsp = (s->stack - 16) & (~0xf); // step 16 bytes below the installed stack, then round down to 16 byte alignment
+               rsp -= sizeof(frame);           // room for the frame (6 x 8 = 48 bytes, so rsp stays 16 byte aligned)
+               
+
+               if (v3_write_gva_memory(core,(addr_t)rsp,sizeof(frame),(uint8_t*)frame)!=sizeof(frame)) { 
+                   PrintError(core->vm_info,core,"hvm: failed to write interrupt frame\n");
+                   // we just lost this inject (the code was already reset above)
+                   return -1;
+               }
+               
+               // now make us look like we are jumping to the entry
+               core->rip = s->handler;
+               core->vm_regs.rsp = rsp;
+
+               PrintDebug(core->vm_info,core,"hvm: ROS frame is 0x%llx|0x%llx|0x%llx|0x%llx|0x%llx|0x%llx and and on entry rip=%p and rsp=%p\n", frame[0],frame[1],frame[2],frame[3],frame[4],frame[5],(void*) core->rip, (void*) core->vm_regs.rsp);
+               
+               // and we should be good to go
+               return 0;
+           } 
+       }
+    }
+}
+// HVM exit handler for a core: currently does no work and always returns 0.
+int v3_handle_hvm_exit(struct guest_info *core)
+{
+    // currently nothing to do; core is unused
+    return 0;
+}
+
+// Raise signal "code" (must be nonzero) to the ROS. Returns 0 if the signal was newly asserted; -1 if code is zero, no handler/stack is installed, or a signal is already asserted.
+int v3_hvm_signal_ros(struct v3_vm_info *vm, uint64_t code)
+{
+    struct v3_ros_signal *s = &vm->hvm_state.ros_signal;
+
+    if (!code) { 
+       PrintError(vm,VCORE_NONE,"hvm: cannot signal ros with code zero\n");
+       return -1;
+    }
+
+    // handler and stack must be installed (cr3 is not checked here)
+    if (!s->handler || !s->stack) { 
+       PrintError(vm,VCORE_NONE,"hvm: cannot signal ros with no installed handler\n");
+       return -1;
+    } else {
+       // we set the code only if we are idle (code 0),
+       // and we do so atomically with a compare-and-swap
+       if (!__sync_bool_compare_and_swap(&(s->code), 0, code)) {
+           PrintError(vm,VCORE_NONE,"hvm: signal was already asserted\n");
+           return -1;
+       } else {
+           PrintDebug(vm,VCORE_NONE,"hvm: raised signal 0x%llx to the ROS\n",code);
+           return 0;
+       }
+    }
+}
+
+
+