X-Git-Url: http://v3vee.org/palacios/gitweb/gitweb.cgi?a=blobdiff_plain;f=palacios%2Fsrc%2Fpalacios%2Fvmm_hvm.c;h=ec1c42c7a0a8af41ed0278e0f06137aa4c764aa6;hb=d85300ed95766164d14a7f3b6c1c681b8b9a9c52;hp=b1f7013d8cbeeca843f97724ac18a54250978c30;hpb=60ad6a41c6d0ee08ed689e8505eb0c3df0c2a289;p=palacios.git

diff --git a/palacios/src/palacios/vmm_hvm.c b/palacios/src/palacios/vmm_hvm.c
index b1f7013..ec1c42c 100644
--- a/palacios/src/palacios/vmm_hvm.c
+++ b/palacios/src/palacios/vmm_hvm.c
@@ -28,8 +28,26 @@
 #include 
 
-#include 
-#include 
+#include 
+
+
+struct gdt_area {
+    struct {
+	uint16_t limit;
+	uint64_t base;
+    } __attribute__((packed)) gdtr;
+
+    uint64_t fsbase;
+    uint16_t cs;
+    uint16_t ds;
+    uint16_t es;
+    uint16_t fs;
+    uint16_t gs;
+    uint16_t ss;
+
+    uint64_t gdt[0];
+} __attribute__((packed));
+
 
 /*
@@ -56,7 +74,7 @@
      RAM   (MB)  Note these are  backward compatible
 
-    
+     (MB)
 
@@ -68,6 +86,7 @@
 #define PrintDebug(fmt, args...)
 #endif
 
+
 int v3_init_hvm()
 {
     PrintDebug(VM_NONE,VCORE_NONE, "hvm: init\n");
@@ -80,14 +99,620 @@ int v3_deinit_hvm()
     return 0;
 }
 
+// ignore requests made when we are in the wrong state
+#define ENFORCE_STATE_MACHINE 1
+
+// invoke the HRT using one of the following mechanisms
+#define UPCALL_MAGIC_ADDRESS 0x0000800df00df00dULL
+#define UPCALL_MAGIC_ERROR   0xf00df00d
+
+
+static int magic_upcall(struct guest_info *core, uint64_t num)
+{
+#ifdef V3_CONFIG_HVM_UPCALL_MAGIC_GPF
+    PrintDebug(core->vm_info, core, "hvm: injecting magic #GP into core %llu\n",num);
+    if (v3_raise_exception_with_error(&core->vm_info->cores[num],
+				      GPF_EXCEPTION,
+				      UPCALL_MAGIC_ERROR)) {
+	PrintError(core->vm_info, core,"hvm: cannot inject HRT #GP to core %llu\n",num);
+	return -1;
+    } else {
+	return 0;
+    }
+#endif
+#ifdef V3_CONFIG_HVM_UPCALL_MAGIC_PF
+    PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",num);
+    core->vm_info->cores[num].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
+    if (v3_raise_exception_with_error(&core->vm_info->cores[num],
+				      PF_EXCEPTION,
+				      UPCALL_MAGIC_ERROR)) {
+	PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",num);
+	return -1;
+    } else {
+	return 0;
+    }
+#endif
+#ifdef V3_CONFIG_HVM_UPCALL_MAGIC_SWIN
+    PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%x into core %llu\n",core->vm_info->hvm_state.hrt_int_vector,num);
+    if (v3_raise_swintr(&core->vm_info->cores[num],core->vm_info->hvm_state.hrt_int_vector)) {
+	PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",num);
+	return -1;
+    } else {
+	return 0;
+    }
+#endif
+
+    PrintError(core->vm_info,core,"hvm: no upcall mechanism is enabled!\n");
+    return -1;
+}
+
+
+/*
+  64 bit only hypercall:
+
+  rax = hypercall number
+  rbx = 0x6464646464646464
+  then args are:  rcx, rdx, rsi, rdi, r8, r9, r10, r11
+  rcx = 1st arg
+*/
 static int hvm_hcall_handler(struct guest_info * core , hcall_id_t hcall_id, void * priv_data)
 {
-    V3_Print(core->vm_info,core, "hvm: received hypercall %x rax=%llx rbx=%llx rcx=%llx\n",
-	     hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx);
+    uint64_t c;
+    uint64_t bitness = core->vm_regs.rbx;
+    uint64_t a1 = core->vm_regs.rcx;
+    uint64_t a2 = core->vm_regs.rdx;
+    uint64_t a3 = core->vm_regs.rsi;
+    struct v3_vm_hvm *h = &core->vm_info->hvm_state;
+    addr_t irq_state;
+
+    // Let's be paranoid here
+    irq_state = v3_lock_irqsave(h->hypercall_lock);
+
+    if (bitness!=0x6464646464646464) {
+	PrintError(core->vm_info,core,"hvm: unable to handle non-64 bit hypercall\n");
+	core->vm_regs.rax = -1;
+	v3_unlock_irqrestore(h->hypercall_lock,irq_state);
+	return 0;
+    }
+
+    switch (a1) {
+	case 0x0: // null
+
+	    rdtscll(c);
+
+	    V3_Print(core->vm_info,core, "hvm: received hypercall %x rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n",
+		     hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, core->hvm_state.last_boot_start, core->num_exits);
+	    //v3_print_core_telemetry(core);
+	    // v3_print_guest_state(core);
+	    core->vm_regs.rax = 0;
+	    break;
+
+	case 0x1: // reset ros
+	    PrintDebug(core->vm_info,core, "hvm: reset ROS\n");
+	    if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ROS,0)) {
+		PrintError(core->vm_info,core, "hvm: reset of ROS failed\n");
+		core->vm_regs.rax = -1;
+	    } else {
+		core->vm_regs.rax = 0;
+	    }
+	    break;
+
+	case 0x2: // reset hrt
+	    PrintDebug(core->vm_info,core, "hvm: reset HRT\n");
+	    if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_HRT,0)) {
+		PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
+		core->vm_regs.rax = -1;
+	    } else {
+		core->vm_regs.rax = 0;
+	    }
+	    break;
+
+	case 0x3: // reset both
+	    PrintDebug(core->vm_info,core, "hvm: reset ROS+HRT\n");
+	    if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ALL,0)) {
+		PrintError(core->vm_info,core, "hvm: reset of ROS+HRT failed\n");
+		core->vm_regs.rax = -1;
+	    } else {
+		core->vm_regs.rax = 0;
+	    }
+	    break;
+
+	case 0x8: // replace HRT image
+	    // a2 = gva of image
+	    // a3 = size of image
+	    PrintDebug(core->vm_info,core,"hvm: request replacement HRT image addr=0x%llx size=0x%llx\n",a2,a3);
+
+	    if (h->hrt_image) {
+		// delete old
+		V3_VFree(h->hrt_image);
+		h->hrt_image = 0;
+	    }
+
+	    h->hrt_image = V3_VMalloc(a3);
+
+	    if (!(h->hrt_image)) {
+		PrintError(core->vm_info,core, "hvm: failed to allocate space for replacement image\n");
+		core->vm_regs.rax = -1;
+	    } else {
+		if (v3_read_gva_memory(core, a2, a3, (uint8_t*) h->hrt_image)!=a3) {
+		    PrintError(core->vm_info, core, "hvm: cannot read replacement image\n");
+		    core->vm_regs.rax = -1;
+		} else {
+		    h->hrt_image_size = a3;
+		    core->vm_regs.rax = 0;
+		}
+	    }
+
+	    if (core->vm_regs.rax) {
+		PrintError(core->vm_info,core,"hvm: Failed to replace HRT image\n");
+	    } else {
+		PrintDebug(core->vm_info,core,"hvm: HRT image successfully replaced\n");
+	    }
+
+	    break;
+
+
+	case 0xf: // get HRT transaction state and ROS event state
+	    core->vm_regs.rax = h->trans_state;
+	    if (v3_write_gva_memory(core, a2, sizeof(h->ros_event), (uint8_t*) &h->ros_event)!=sizeof(h->ros_event)) {
+		PrintError(core->vm_info, core, "hvm: cannot write back ROS event state to %p - continuing\n",(void*)a2);
+	    }
+	    //PrintDebug(core->vm_info,core,"hvm: get HRT transaction state 0x%llx\n",core->vm_regs.rax);
+	    break;
+
+	case 0x10:
+	    PrintDebug(core->vm_info, core, "hvm: ROS event request\n");
+	    
if (h->ros_event.event_type!=ROS_NONE) { + PrintError(core->vm_info, core, "hvm: ROS event is already in progress\n"); + core->vm_regs.rax = -1; + } else { + if (v3_read_gva_memory(core, a2, sizeof(h->ros_event), (uint8_t*)&h->ros_event)!=sizeof(h->ros_event)) { + PrintError(core->vm_info, core, "hvm: cannot read ROS event from %p\n",(void*)a2); + core->vm_regs.rax = -1; + } else { + core->vm_regs.rax = 0; + PrintDebug(core->vm_info, core, "hvm: copied new ROS event (type=%s)\n", + h->ros_event.event_type == ROS_PAGE_FAULT ? "page fault" : + (h->ros_event.event_type == ROS_SYSCALL ? "syscall" : "none")); + + } + } + + break; + + case 0x1e: // ack result (HRT has read the result of the finished event) + if (h->ros_event.event_type != ROS_DONE) { + PrintError(core->vm_info, core, "hvm: cannot ack event result when not in ROS_DONE state\n"); + core->vm_regs.rax = -1; + } else { + h->ros_event.event_type=ROS_NONE; + PrintDebug(core->vm_info, core, "hvm: HRT core acks event result\n"); + core->vm_regs.rax = 0; + } + break; + + case 0x1f: + PrintDebug(core->vm_info, core, "hvm: completion of ROS event (rc=0x%llx)\n",a2); + h->ros_event.event_type=ROS_DONE; + h->ros_event.last_ros_event_result = a2; + break; + + case 0x20: // invoke function (ROS->HRT) + case 0x21: // invoke parallel function (ROS->HRT) + if (v3_is_hvm_hrt_core(core)) { + PrintError(core->vm_info,core,"hvm: %s function invocation not supported from HRT core\n", a1==0x20 ? "" : "parallel"); + core->vm_regs.rax = -1; + } else { + if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { + PrintError(core->vm_info,core, "hvm: cannot invoke %s function %p in state %d\n",a1==0x20 ? "" : "parallel", (void*)a2,h->trans_state); + core->vm_regs.rax = -1; + } else { + uint64_t *page = (uint64_t *) h->comm_page_hva; + uint64_t first, last, cur; + + PrintDebug(core->vm_info,core, "hvm: %s invoke function %p\n",a1==0x20 ? "" : "parallel",(void*)a2); + page[0] = a1; + page[1] = a2; + + if (a1==0x20) { + first=last=h->first_hrt_core; + } else { + first=h->first_hrt_core; + last=core->vm_info->num_cores-1; + } + + core->vm_regs.rax = 0; + + h->trans_count = last-first+1; + + for (cur=first;cur<=last;cur++) { + if (magic_upcall(core,cur)) { + core->vm_regs.rax = -1; + break; + } + // Force core to exit now + v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0); + } + if (core->vm_regs.rax==0) { + if (a1==0x20) { + h->trans_state = HRT_CALL; + } else { + h->trans_state = HRT_PARCALL; + } + } else { + PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n"); + h->trans_state = HRT_IDLE; + h->trans_count = 0; + } + } + } + break; + + + case 0x28: // setup for synchronous operation (ROS->HRT) + case 0x29: // teardown for synchronous operation (ROS->HRT) + if (v3_is_hvm_hrt_core(core)) { + PrintError(core->vm_info,core,"hvm: %ssynchronization invocation not supported from HRT core\n",a1==0x29 ? "de" : ""); + core->vm_regs.rax = -1; + } else { + if (ENFORCE_STATE_MACHINE && + ((a1==0x28 && h->trans_state!=HRT_IDLE) || (a1==0x29 && h->trans_state!=HRT_SYNC))) { + PrintError(core->vm_info,core, "hvm: cannot invoke %ssynchronization in state %d\n",a1==0x29 ? "de" : "", h->trans_state); + core->vm_regs.rax = -1; + } else { + uint64_t *page = (uint64_t *) h->comm_page_hva; + uint64_t first, last, cur; + + PrintDebug(core->vm_info,core, "hvm: invoke %ssynchronization on address %p\n",a1==0x29 ? 
"de" : "",(void*)a2); + page[0] = a1; + page[1] = a2; + + first=last=h->first_hrt_core; // initially we will sync only with BSP + + core->vm_regs.rax = 0; + + h->trans_count = last-first+1; + + for (cur=first;cur<=last;cur++) { + + if (magic_upcall(core,cur)) { + core->vm_regs.rax = -1; + break; + } + // Force core to exit now + v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0); + + } + if (core->vm_regs.rax==0) { + if (a1==0x28) { + h->trans_state = HRT_SYNCSETUP; + } else { + h->trans_state = HRT_SYNCTEARDOWN; + } + } else { + PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n"); + h->trans_state = HRT_IDLE; + h->trans_count = 0; + } + } + } + break; + + case 0x2f: // function exec or sync done + if (v3_is_hvm_ros_core(core)) { + PrintError(core->vm_info,core, "hvm: request for exec or sync done from ROS core\n"); + core->vm_regs.rax=-1; + } else { + if (ENFORCE_STATE_MACHINE && + h->trans_state!=HRT_CALL && + h->trans_state!=HRT_PARCALL && + h->trans_state!=HRT_SYNCSETUP && + h->trans_state!=HRT_SYNCTEARDOWN) { + PrintError(core->vm_info,core,"hvm: function or sync completion when not in HRT_CALL, HRT_PARCALL, HRT_SYNCSETUP, or HRT_SYNCTEARDOWN state\n"); + core->vm_regs.rax=-1; + } else { + uint64_t one=1; + PrintDebug(core->vm_info,core, "hvm: function or sync complete\n"); + if (__sync_fetch_and_sub(&h->trans_count,one)==1) { + // last one, switch state + if (h->trans_state==HRT_SYNCSETUP) { + h->trans_state=HRT_SYNC; + PrintDebug(core->vm_info,core, "hvm: function complete - now synchronous\n"); + } else { + h->trans_state=HRT_IDLE; + } + } + core->vm_regs.rax=0; + } + } + + break; + + case 0x30: // merge address space + case 0x31: // unmerge address space + if (v3_is_hvm_hrt_core(core)) { + PrintError(core->vm_info,core,"hvm: request to %smerge address space from HRT core\n", a1==0x30 ? "" : "un"); + core->vm_regs.rax=-1; + } else { + if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { + PrintError(core->vm_info,core,"hvm: request to %smerge address space in non-idle state (%d)\n",a1==0x30 ? "" : "un", h->trans_state); + core->vm_regs.rax=-1; + } else { + uint64_t *page = (uint64_t *) h->comm_page_hva; + + PrintDebug(core->vm_info,core,"hvm: %smerge address space request with %p\n",a1==0x30 ? 
"" : "un",(void*)core->ctrl_regs.cr3); + // should sanity check to make sure guest is in 64 bit without anything strange + + page[0] = a1; + page[1] = core->ctrl_regs.cr3; // this is a do-not-care for an unmerge + + core->vm_regs.rax = 0; + + h->trans_state = HRT_MERGE; + + if (magic_upcall(core,h->first_hrt_core)) { + core->vm_regs.rax = -1; + break; + } + + // Force core to exit now + v3_interrupt_cpu(core->vm_info,core->vm_info->cores[h->first_hrt_core].pcpu_id,0); + + } + + } + + break; + + + case 0x3f: // merge operation done + if (v3_is_hvm_ros_core(core)) { + PrintError(core->vm_info,core, "hvm: request for merge done from ROS core\n"); + core->vm_regs.rax=-1; + } else { + if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_MERGE) { + PrintError(core->vm_info,core,"hvm: merge/unmerge done when in non-idle state\n"); + core->vm_regs.rax=-1; + } else { + PrintDebug(core->vm_info,core, "hvm: merge or unmerge complete - back to idle\n"); + h->trans_state=HRT_IDLE; + core->vm_regs.rax=0; + } + } + + break; + + case 0x40: // install or remove signal handler + if (v3_is_hvm_hrt_core(core)) { + PrintError(core->vm_info,core, "hvm: HRT cannot install signal handler...\n"); + core->vm_regs.rax=-1; + } else { + PrintDebug(core->vm_info,core,"hvm: install signal handler for CR3=%p, handler=%p, stack=%p\n",(void*)core->ctrl_regs.cr3, (void*)a2, (void*)a3); + if (h->ros_signal.code) { + PrintError(core->vm_info,core,"hvm: signal is pending...\n"); + core->vm_regs.rax=-1; + } else { + if ((a2 || a3) && (h->ros_signal.handler || h->ros_signal.stack)) { + PrintError(core->vm_info,core,"hvm: attempt to replace existing handler without removing it first\n"); + core->vm_regs.rax=-1; + } else { + // actually make the change + h->ros_signal.handler=a2; + h->ros_signal.stack=a3; + h->ros_signal.cr3=core->ctrl_regs.cr3; + core->vm_regs.rax=0; + + // test by signalling back a hello + // if (a2 && a3) { + // v3_hvm_signal_ros(core->vm_info,0xf00d); + //} + } + } + } + break; + + case 0x41: // raise signal in the ROS from HRT or ROS + PrintDebug(core->vm_info,core,"hvm: HRT raises signal code=0x%llx\n", a2); + core->vm_regs.rax = v3_hvm_signal_ros(core->vm_info,a2); + break; + + case 0x51: // fill GDT area (HRT only) + if (v3_is_hvm_hrt_core(core)) { + PrintError(core->vm_info, core, "hvm: HRT cannot request a GDT area fill\n"); + core->vm_regs.rax = -1; + } else { + struct guest_info * hrt_core = &core->vm_info->cores[h->first_hrt_core]; + struct gdt_area * area = V3_Malloc(sizeof(struct gdt_area) + core->segments.gdtr.limit); + if (!area) { + PrintError(core->vm_info, core, "hvm: could not allocate GDT area\n"); + core->vm_regs.rax = -1; + break; + } + + PrintDebug(core->vm_info, core, "hvm: ROS requests to fill GDT area with fsbase=%p\n", (void*)a2); + + if (!h->hrt_gdt_gva) { + PrintError(core->vm_info, core, "hvm: HRT has not registered a GDT state save area\n"); + core->vm_regs.rax = -1; + V3_Free(area); + break; + } + + area->gdtr.base = h->hrt_gdt_gva + sizeof(struct gdt_area); + area->gdtr.limit = core->segments.gdtr.limit; + area->fsbase = a2; + area->cs = core->segments.cs.selector; + area->ds = core->segments.ds.selector; + area->es = core->segments.es.selector; + area->fs = core->segments.fs.selector; + area->gs = core->segments.gs.selector; + area->ss = core->segments.ss.selector; + + if (v3_read_gva_memory(core, + core->segments.gdtr.base, + core->segments.gdtr.limit, + (uint8_t*)area->gdt) != core->segments.gdtr.limit) { + PrintError(core->vm_info, core, "hvm: could not copy GDT from ROS\n"); + 
core->vm_regs.rax = -1; + V3_Free(area); + break; + } + + uint_t area_size = sizeof(struct gdt_area) + core->segments.gdtr.limit; + + // copy the entire area over + PrintDebug(core->vm_info, core, "hvm: copying %u bytes into GDT area\n", area_size); + + if (v3_write_gva_memory(hrt_core, h->hrt_gdt_gva, area_size, (uchar_t*)area) != area_size) { + PrintError(core->vm_info, core, "hvm: could not copy GDT area\n"); + core->vm_regs.rax = -1; + V3_Free(area); + break; + } + + if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { + PrintError(core->vm_info,core, "hvm: cannot sync GDT in state %d\n", h->trans_state); + core->vm_regs.rax = -1; + V3_Free(area); + break; + } else { + uint64_t *page = (uint64_t *) h->comm_page_hva; + uint64_t first, last, cur; + + PrintDebug(core->vm_info,core, "hvm: sync GDT\n"); + page[0] = a1; + page[1] = h->hrt_gdt_gva; + page[2] = a3; + + first=last=h->first_hrt_core; + + core->vm_regs.rax = 0; + + h->trans_count = last-first+1; + + for (cur=first;cur<=last;cur++) { + if (magic_upcall(core,cur)) { + core->vm_regs.rax = -1; + break; + } + // Force core to exit now + v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0); + } + + if (core->vm_regs.rax==0) { + h->trans_state = HRT_GDTSYNC; + } else { + PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT GDT SYNC failure\n"); + h->trans_state = HRT_IDLE; + h->trans_count = 0; + } + + V3_Free(area); + + } + + } + + break; + + case 0x52: // register HRT GDT area + if (!v3_is_hvm_hrt_core(core)) { + PrintError(core->vm_info, core, "hvm: ROS cannot install a GDT area\n"); + core->vm_regs.rax = -1; + } else { + PrintDebug(core->vm_info, core, "hvm: HRT registers GDT save area at gva=%p\n", (void*)a2); + h->hrt_gdt_gva = a2; + core->vm_regs.rax = 0; + } + + PrintDebug(core->vm_info, core, "hvm: Printing current HRT GDT...\n"); +#ifdef V3_CONFIG_DEBUG_HVM + v3_print_gdt(core, core->segments.gdtr.base); +#endif + + break; + + case 0x53: // restore GDT + + if (v3_is_hvm_hrt_core(core)) { + PrintError(core->vm_info, core, "hvm: HRT cannot request GDT restoration\n"); + core->vm_regs.rax = -1; + break; + } else { + PrintDebug(core->vm_info, core, "hvm: ROS requesting to restore original GDT\n"); + core->vm_regs.rax = 0; + } + + if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { + PrintError(core->vm_info,core, "hvm: cannot sync GDT in state %d\n", h->trans_state); + core->vm_regs.rax = -1; + break; + } else { + uint64_t *page = (uint64_t *) h->comm_page_hva; + uint64_t first, last, cur; + + PrintDebug(core->vm_info,core, "hvm: restore GDT\n"); + page[0] = a1; + + first=last=h->first_hrt_core; + + core->vm_regs.rax = 0; + + h->trans_count = last-first+1; + + for (cur=first;cur<=last;cur++) { + if (magic_upcall(core,cur)) { + core->vm_regs.rax = -1; + break; + } + // Force core to exit now + v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0); + } + + if (core->vm_regs.rax==0) { + h->trans_state = HRT_GDTSYNC; + } else { + PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT GDT SYNC failure\n"); + h->trans_state = HRT_IDLE; + h->trans_count = 0; + } + } + + break; + + case 0x5f: // GDT sync operation done + if (v3_is_hvm_ros_core(core)) { + PrintError(core->vm_info,core, "hvm: invalid request for GDT sync done from ROS core\n"); + core->vm_regs.rax=-1; + } else { + if (ENFORCE_STATE_MACHINE && h->trans_state != HRT_GDTSYNC) { + PrintError(core->vm_info,core,"hvm: GDT sync done when in incorrect state (%d)\n", h->trans_state); + core->vm_regs.rax=-1; + } else 
{
+		    PrintDebug(core->vm_info,core, "hvm: GDT sync complete - back to idle\n");
+		    PrintDebug(core->vm_info, core, "hvm: Dumping new HRT GDT...\n");
+#ifdef V3_CONFIG_DEBUG_HVM
+		    v3_print_gdt(core, core->segments.gdtr.base);
+#endif
+		    h->trans_state=HRT_IDLE;
+		    core->vm_regs.rax=0;
+		}
+
+	    }
+	    break;
+
+	default:
+	    PrintError(core->vm_info,core,"hvm: unknown hypercall %llx\n",a1);
+	    core->vm_regs.rax=-1;
+	    break;
+    }
+
+    v3_unlock_irqrestore(h->hypercall_lock,irq_state);
 
     return 0;
 }
 
+
 #define CEIL_DIV(x,y) (((x)/(y)) + !!((x)%(y)))
 
 int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
@@ -98,7 +723,7 @@ int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
     char *enable;
     char *ros_cores;
     char *ros_mem;
-    char *hrt_file_id;
+    char *hrt_file_id=0;
 
     PrintDebug(vm, VCORE_NONE, "hvm: vm init\n");
 
@@ -138,7 +763,7 @@ int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
     }
 
     vm->hvm_state.first_hrt_gpa = ((uint64_t)atoi(ros_mem))*1024*1024;
-
+    
     if (!(hrt_config=v3_cfg_subtree(hvm_config,"hrt"))) {
 	PrintError(vm,VCORE_NONE,"hvm: HVM configuration without HRT block...\n");
 	return -1;
@@ -162,6 +787,8 @@ int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
 	return -1;
     }
 
+    v3_lock_init(&(vm->hvm_state.hypercall_lock));
+
     // XXX sanity check config here
 
     vm->hvm_state.is_hvm=1;
@@ -189,8 +816,26 @@ int v3_deinit_hvm_vm(struct v3_vm_info *vm)
 {
     PrintDebug(vm, VCORE_NONE, "hvm: HVM VM deinit\n");
 
+
+    if (vm->hvm_state.hrt_image) {
+	V3_VFree(vm->hvm_state.hrt_image);
+	vm->hvm_state.hrt_image=0;
+	vm->hvm_state.hrt_image_size=0;
+    }
+
     v3_remove_hypercall(vm,HVM_HCALL);
+    v3_lock_deinit(&(vm->hvm_state.hypercall_lock));
+
+    if (vm->hvm_state.comm_page_hpa) {
+	struct v3_mem_region *r = v3_get_mem_region(vm,-1,(addr_t)vm->hvm_state.comm_page_hpa);
+	if (!r) {
+	    PrintError(vm,VCORE_NONE,"hvm: odd, VM has comm_page_hpa, but no shadow memory\n");
+	} else {
+	    v3_delete_mem_region(vm,r);
+	}
+    }
+
     return 0;
 }
 
@@ -223,11 +868,7 @@ uint64_t v3_get_hvm_ros_memsize(struct v3_vm_info *vm)
 }
 uint64_t v3_get_hvm_hrt_memsize(struct v3_vm_info *vm)
 {
-    if (vm->hvm_state.is_hvm) {
-	return vm->mem_size - vm->hvm_state.first_hrt_gpa;
-    } else {
-	return 0;
-    }
+    return vm->mem_size;
 }
 
 uint32_t v3_get_hvm_ros_cores(struct v3_vm_info *vm)
@@ -252,7 +893,7 @@ uint32_t v3_get_hvm_hrt_cores(struct v3_vm_info *vm)
 int v3_is_hvm_ros_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
 {
     if (vm->hvm_state.is_hvm) {
-	return gpa>=0 && gpa<vm->hvm_state.first_hrt_gpa;
+	return gpa<vm->hvm_state.first_hrt_gpa;
     } else {
 	return 1;
     }
@@ -317,10 +958,18 @@ void v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_in
     }
 }
 
+#define MAX(x,y) ((x)>(y)?(x):(y))
+#define MIN(x,y) ((x)<(y)?(x):(y))
+
+static uint64_t boot_state_end_addr(struct v3_vm_info *vm)
+{
+    return PAGE_ADDR(vm->mem_size);
+}
+
 static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
 {
-    *base = (void*) PAGE_ADDR(vm->mem_size - PAGE_SIZE);
+    *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - PAGE_SIZE);
     *limit = PAGE_SIZE;
 }
 
@@ -372,7 +1021,7 @@ static void write_null_int_handler(struct v3_vm_info *vm)
 
 static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
 {
-    *base = (void*) PAGE_ADDR(vm->mem_size - 2 * PAGE_SIZE);
+    *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - 2 * PAGE_SIZE);
     *limit = 16*256;
 }
 
@@ -384,7 +1033,7 @@ static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
 //   3 ist      => (stack) = 0 => current stack
 //   5 reserved => 0
 //   4 type     => 0xe=>INT, 0xf=>TRAP
-//   1 reserved => 0
+//   1 reserved => 0  (indicates "system" by being zero)
 //   2 dpl      => 0
 //   1 present  => 1
 //  16 offsetmid => 0
@@ -395,7 +1044,7 @@
 //
 // Note little endian
 //
-static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 } ;
+static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 };
 static uint64_t idt64_int_gate_entry_mask[2] =  { 0x00008e0000080000, 0x0 };
 
 static void write_idt(struct v3_vm_info *vm)
@@ -412,6 +1061,8 @@ static void write_idt(struct v3_vm_info *vm)
 
     get_null_int_handler_loc(vm,&handler,&handler_len);
 
+    handler += vm->hvm_state.gva_offset;
+
     memcpy(trap_gate,idt64_trap_gate_entry_mask,16);
     memcpy(int_gate,idt64_int_gate_entry_mask,16);
 
@@ -450,7 +1101,7 @@ static void write_idt(struct v3_vm_info *vm)
 
 static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
 {
-    *base = (void*)PAGE_ADDR(vm->mem_size - 3 * PAGE_SIZE);
+    *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 3 * PAGE_SIZE);
     *limit = 8*3;
 }
 
@@ -475,140 +1126,598 @@ static void write_gdt(struct v3_vm_info *vm)
 
 static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
 {
-    *base = (void*)PAGE_ADDR(vm->mem_size - 4 * PAGE_SIZE);
+    *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 4 * PAGE_SIZE);
     *limit = PAGE_SIZE;
 }
 
-static uint64_t tss_data=0x0;
-
 static void write_tss(struct v3_vm_info *vm)
 {
     void *base;
     uint64_t limit;
-    int i;
 
     get_tss_loc(vm,&base,&limit);
-
-    for (i=0;i<limit/8;i++) {
-	v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+8*i),8,(uint8_t*) &tss_data);
-    }
+
+    v3_set_gpa_memory(&vm->cores[0],(addr_t)base,limit,0);
 
     PrintDebug(vm,VCORE_NONE,"hvm: wrote TSS at %p\n",base);
 }
 
+
+#define TOP_HALF_START  0xffff800000000000ULL
+#define BOTTOM_HALF_END 0x00007fffffffffffULL
+
+
+#define L4_UNIT PAGE_SIZE
+#define L3_UNIT (512ULL * L4_UNIT)
+#define L2_UNIT (512ULL * L3_UNIT)
+#define L1_UNIT (512ULL * L2_UNIT)
+
+static void compute_pts_4KB(struct v3_vm_info *vm,
+			    uint64_t *l1, uint64_t *l2, uint64_t *l3, uint64_t *l4)
+{
+
+    // we map the physical memory up to max_mem_mapped either at 0x0 or at TOP_HALF start
+    // that is, it either fills in the first 256 rows of PML4 or the last 256 rows of PML4
+    // so it is the same number of page tables regardless
+
+    uint64_t max_gva = vm->hvm_state.max_mem_mapped;
+
+    *l1 = 1;  // 1 PML4
+    *l2 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*512ULL*4096ULL),512);
+    *l3 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*4096ULL),512);
+    *l4 = CEIL_DIV(CEIL_DIV(max_gva,4096ULL),512);
+}
+
+
+
 /*
-  PTS MAP FIRST 512 GB identity mapped:
-  1 second level
-     512 entries
+  PTS MAP using 1 GB pages
+  n second levels pts, highest gva, highest address
   1 top level
-  1 entries
+
+
+OR
+
+  PTS MAP using 2 MB pages
+  n third level pts, highest gva, highest address
+  m second level pts, highest gva, highest address
+  1 top level pt
+
+OR
+
+  PTS MAP using 4 KB pages
+  n 4th level, highest gva, highest address
+  m 3rd level, highest gva, highest address
+  l second level, highest gva, highest address
+  1 top level pt
+
+OR
+  PTS MAP using 512 GB pages when this becomes available
+
+*/
+
+
 static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
 {
-    *base = (void*)PAGE_ADDR(vm->mem_size-(5+1)*PAGE_SIZE);
-    *limit = 2*PAGE_SIZE;
+    uint64_t l1,l2,l3,l4;
+    uint64_t num_pt;
+
+    compute_pts_4KB(vm,&l1,&l2,&l3,&l4);
+
+    if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
+	num_pt = l1;
+    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
+	num_pt = 
l1 + l2; + } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) { + num_pt = l1 + l2 + l3; + } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { + num_pt = l1 + l2 + l3 + l4; + } else { + PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT location flags=0x%llx memsize=0x%llx\n",vm->hvm_state.hrt_flags,(uint64_t)vm->mem_size); + return; + } + + *base = (void*)PAGE_ADDR(boot_state_end_addr(vm)-(4+num_pt)*PAGE_SIZE); + *limit = num_pt*PAGE_SIZE; } -static void write_pt(struct v3_vm_info *vm) +static void write_pts(struct v3_vm_info *vm) { - void *base; uint64_t size; - struct pml4e64 pml4e; - struct pdpe64 pdpe; - uint64_t i; + uint64_t num_l1, num_l2, num_l3, num_l4; + void *start_l1, *start_l2, *start_l3, *start_l4; + uint64_t max_level; + void *cur_pt; + void *cur_gva; + void *cur_gpa; + void *min_gpa = 0; + void *max_gpa = (void*) vm->hvm_state.max_mem_mapped; + void *min_gva = (void*) vm->hvm_state.gva_offset; +#ifdef V3_CONFIG_DEBUG_HVM + void *max_gva = min_gva+vm->hvm_state.max_mem_mapped; +#endif + uint64_t i, pt; + uint64_t i_start,i_end; + + struct pml4e64 *pml4e; + struct pdpe64 *pdpe; + struct pde64 *pde; + struct pte64 *pte; + + if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { + PrintError(vm,VCORE_NONE,"hvm: Attempt to build 512 GB pages\n"); + max_level = 1; + } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { + max_level = 2; + } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) { + max_level = 3; + } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { + max_level = 4; + } else { + PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT levels\n"); + return; + } - get_pt_loc(vm,&base, &size); - if (size!=2*PAGE_SIZE) { - PrintError(vm,VCORE_NONE,"Cannot support pt request, defaulting\n"); + get_pt_loc(vm,&start_l1,&size); + compute_pts_4KB(vm,&num_l1,&num_l2,&num_l3,&num_l4); + + start_l2=start_l1+PAGE_SIZE*num_l1; + start_l3=start_l2+PAGE_SIZE*num_l2; + start_l4=start_l3+PAGE_SIZE*num_l3; + + PrintDebug(vm,VCORE_NONE,"hvm: writing %llu levels of PTs start at address %p\n", max_level,start_l1); + PrintDebug(vm,VCORE_NONE,"hvm: min_gva=%p, max_gva=%p, min_gpa=%p, max_gpa=%p\n",min_gva,max_gva,min_gpa,max_gpa); + PrintDebug(vm,VCORE_NONE,"hvm: num_l1=%llu, num_l2=%llu, num_l3=%llu, num_l4=%llu\n", num_l1, num_l2, num_l3, num_l4); + PrintDebug(vm,VCORE_NONE,"hvm: start_l1=%p, start_l2=%p, start_l3=%p, start_l4=%p\n", start_l1, start_l2, start_l3, start_l4); + + cur_pt=start_l1; + + // build PML4 (only one) + if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pml4e)) { + PrintError(vm,VCORE_NONE,"hvm: Cannot translate pml4 location\n"); + return; } - memset(&pdpe,0,sizeof(pdpe)); - pdpe.present=1; - pdpe.writable=1; - pdpe.large_page=1; - - for (i=0;i<512;i++) { - pdpe.pd_base_addr = i*0x40000; // 0x4000 = 256K pages = 1 GB - v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+PAGE_SIZE+i*sizeof(pdpe)),sizeof(pdpe),(uint8_t*)&pdpe); + memset(pml4e,0,PAGE_SIZE); + + if (min_gva==0x0) { + i_start=0; i_end = num_l2; + } else if (min_gva==(void*)TOP_HALF_START) { + i_start=256; i_end=256+num_l2; + } else { + PrintError(vm,VCORE_NONE,"hvm: unsupported gva offset\n"); + return; + } + + for (i=i_start, cur_gva=min_gva, cur_gpa=min_gpa; + (icores[0],(addr_t)cur_pt,(addr_t*)&pdpe)) { + PrintError(vm,VCORE_NONE,"hvm: Cannot translate pdpe location\n"); + return; + } + + memset(pdpe,0,PAGE_SIZE); + + for (i=0; + i<512 && cur_gpacores[0],(addr_t)base,sizeof(pml4e),(uint8_t*)&pml4e); + // build PDE + if 
(v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pde)) { + PrintError(vm,VCORE_NONE,"hvm: Cannot translate pde location\n"); + return; + } + + memset(pde,0,PAGE_SIZE); + + for (i=0; + i<512 && cur_gpacores[0],(addr_t)(base+i*sizeof(pml4e)),sizeof(pml4e),(uint8_t*)&pml4e); + pde[i].present=1; + pde[i].writable=1; + + if (max_level==3) { + pde[i].large_page=1; + pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa)); + //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t) pde[i].pt_base_addr); + } else { + pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(start_l4+(pt*512+i)*PAGE_SIZE)); + //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pde[i].pt_base_addr); + } + } } - PrintDebug(vm,VCORE_NONE,"hvm: Wrote page tables (1 PML4, 1 PDPE) at %p\n",base); + //2 MB only + if (max_level==3) { + return; + } + + + // 4 KB + for (cur_pt=start_l4, pt=0, cur_gpa=min_gpa, cur_gva=min_gva; + ptcores[0],(addr_t)cur_pt,(addr_t*)&pte)) { + PrintError(vm,VCORE_NONE,"hvm: Cannot translate pte location\n"); + return; + } + + memset(pte,0,PAGE_SIZE); + + for (i=0; + i<512 && cur_gpamem_size-(6+1)*PAGE_SIZE); - *limit = PAGE_SIZE; + + get_pt_loc(vm,base, limit); + *base-=PAGE_SIZE; + *limit=PAGE_SIZE; } -static void write_bp(struct v3_vm_info *vm) + +int v3_build_hrt_multiboot_tag(struct guest_info *core, mb_info_hrt_t *hrt) { - void *base; - uint64_t limit; - uint64_t data=-1; - int i; + struct v3_vm_info *vm = core->vm_info; - get_bp_loc(vm,&base,&limit); - - for (i=0;icores[0],(addr_t)(base+i*8),8,(uint8_t*)&data); - } + hrt->tag.type = MB_INFO_HRT_TAG; + hrt->tag.size = sizeof(mb_info_hrt_t); + + hrt->total_num_apics = vm->num_cores; + hrt->first_hrt_apic_id = vm->hvm_state.first_hrt_core; + hrt->have_hrt_ioapic=0; + hrt->first_hrt_ioapic_entry=0; - PrintDebug(vm,VCORE_NONE,"hvm: wrote boundary page at %p\n", base); + hrt->cpu_freq_khz = V3_CPU_KHZ(); + + hrt->hrt_flags = vm->hvm_state.hrt_flags; + hrt->max_mem_mapped = vm->hvm_state.max_mem_mapped; + hrt->first_hrt_gpa = vm->hvm_state.first_hrt_gpa; + hrt->gva_offset = vm->hvm_state.gva_offset; + hrt->comm_page_gpa = vm->hvm_state.comm_page_gpa; + hrt->hrt_int_vector = vm->hvm_state.hrt_int_vector; + return 0; } -#define MIN_STACK (4096*4) +static void write_mb_info(struct v3_vm_info *vm) +{ + if (vm->hvm_state.hrt_type!=HRT_MBOOT64) { + PrintError(vm, VCORE_NONE,"hvm: Cannot handle this HRT type\n"); + return; + } else { + uint8_t buf[256]; + uint64_t size; + void *base; + uint64_t limit; + + get_mb_info_loc(vm,&base,&limit); + + if ((size=v3_build_multiboot_table(&vm->cores[vm->hvm_state.first_hrt_core],buf,256))==-1) { + PrintError(vm,VCORE_NONE,"hvm: Failed to build MB info\n"); + return; + } + + if (size>limit) { + PrintError(vm,VCORE_NONE,"hvm: MB info is too large\n"); + return; + } + + v3_write_gpa_memory(&vm->cores[vm->hvm_state.first_hrt_core], + (addr_t)base, + size, + buf); + + PrintDebug(vm,VCORE_NONE, "hvm: wrote MB info at %p\n", base); + } +} + +#define SCRATCH_STACK_SIZE 4096 static void get_hrt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) { - void *bp_base; - uint64_t bp_limit; + void *mb_base; + uint64_t mb_limit; - get_bp_loc(vm,&bp_base,&bp_limit); + get_mb_info_loc(vm,&mb_base,&mb_limit); - // assume at least a minimal stack - - bp_base-=MIN_STACK; + mb_base-=SCRATCH_STACK_SIZE*v3_get_hvm_hrt_cores(vm); *base = (void*)PAGE_ADDR(vm->hvm_state.first_hrt_gpa); - if (bp_base < *base+PAGE_SIZE) { + if (mb_base < *base+PAGE_SIZE) { PrintError(vm,VCORE_NONE,"hvm: 
HRT stack collides with HRT\n");
     }
 
-    *limit = bp_base - *base;
+    *limit = mb_base - *base;
+}
+
+
+#define ERROR(fmt, args...) PrintError(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
+#define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
+
+#define ELF_MAGIC    0x464c457f
+#define MB2_MAGIC    0xe85250d6
+
+#define MB2_INFO_MAGIC    0x36d76289
+
+static int is_elf(uint8_t *data, uint64_t size)
+{
+    if (*((uint32_t*)data)==ELF_MAGIC) {
+	return 1;
+    } else {
+	return 0;
+    }
+}
+
+static mb_header_t *find_mb_header(uint8_t *data, uint64_t size)
+{
+    uint64_t limit = size > 32768 ? 32768 : size;
+    uint64_t i;
+
+    // Scan for the .boot magic cookie
+    // must be in first 32K, assume 4 byte aligned
+    for (i=0;i<limit;i+=4) {
+	if (*((uint32_t*)&data[i])==MB2_MAGIC) {
+	    return (mb_header_t *) &data[i];
+	}
+    }
+    return 0;
+}
+
+
+static int configure_hrt(struct v3_vm_info *vm, mb_data_t *mb)
+{
+    struct v3_vm_hvm *h = &vm->hvm_state;
+    uint64_t f = mb->mb64_hrt->hrt_flags;
+    uint64_t maxmap = mb->mb64_hrt->max_mem_to_map;
+    uint64_t gvaoff = mb->mb64_hrt->gva_offset;
+    uint64_t gvaentry = mb->mb64_hrt->gva_entry;
+    uint64_t commgpa = mb->mb64_hrt->comm_page_gpa;
+    uint8_t  vec = mb->mb64_hrt->hrt_int_vector;
+
+
+    PrintDebug(vm,VCORE_NONE,"hvm: HRT request: flags=0x%llx max_map=0x%llx gva_off=%llx gva_entry=%llx comm_page=0x%llx vector=0x%x\n",
+	       f, maxmap, gvaoff,gvaentry,commgpa, vec);
+
+    if (maxmap<0x100000000ULL) {
+	PrintDebug(vm,VCORE_NONE,"hvm: revising request up to 4 GB max map\n");
+	maxmap=0x100000000ULL;
+    }
+
+    if (f & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
+	PrintError(vm,VCORE_NONE,"hvm: support for 512 GB pages is not yet available in hardware\n");
+	return -1;
+    } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
+	f &= ~0x3c;
+	f |= MB_TAG_MB64_HRT_FLAG_MAP_1GB;
+	h->max_mem_mapped = maxmap;
+	PrintDebug(vm,VCORE_NONE,"hvm: 1 GB pages selected\n");
+    } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
+	f &= ~0x3c;
+	f |= MB_TAG_MB64_HRT_FLAG_MAP_2MB;
+	h->max_mem_mapped = maxmap;
+	PrintDebug(vm,VCORE_NONE,"hvm: 2 MB pages selected\n");
+    } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_4KB) {
+	f &= ~0x3c;
+	f |= MB_TAG_MB64_HRT_FLAG_MAP_4KB;
+	h->max_mem_mapped = maxmap;
+	PrintDebug(vm,VCORE_NONE,"hvm: 4 KB pages selected\n");
+    } else {
+	PrintError(vm,VCORE_NONE,"hvm: no page table model is requested\n");
+	return -1;
+    }
+
+    if (f & MB_TAG_MB64_HRT_FLAG_RELOC) {
+	PrintError(vm,VCORE_NONE,"hvm: relocatable hrt not currently supported\n");
+	return -1;
+    }
+
+    h->hrt_flags = f;
+
+    if (maxmap>h->max_mem_mapped) {
+	PrintError(vm,VCORE_NONE,"hvm: requested 0x%llx bytes mapped, which is more than currently supported\n",maxmap);
+	return -1;
+    }
+
+    if (gvaoff!=0 && gvaoff!=TOP_HALF_START) {
+	PrintError(vm,VCORE_NONE,"hvm: currently only GVA offsets of 0 and %llx are supported\n", TOP_HALF_START);
+	return -1;
+    }
+
+    h->gva_offset = gvaoff;
+
+    h->gva_entry = gvaentry;
+
+    if (mb->addr->load_addr < h->first_hrt_gpa) {
+	PrintError(vm,VCORE_NONE,"hvm: load start address of HRT is below first HRT GPA\n");
+	return -1;
+    }
+
+    if (mb->addr->bss_end_addr > (vm->mem_size-(1024*1024*64))) {
+	PrintError(vm,VCORE_NONE,"hvm: bss end address of HRT above last allowed GPA\n");
+	return -1;
+    }
+
+    if (vec<32) {
+	PrintError(vm,VCORE_NONE,"hvm: cannot support vector %x\n",vec);
+	return -1;
+    }
+
+    h->hrt_int_vector = vec;
+
+
+    if (commgpa < vm->mem_size) {
+	PrintError(vm,VCORE_NONE,"hvm: cannot map comm page over physical memory\n");
+	return -1;
+    }
+
+    h->comm_page_gpa = commgpa;
+
+    if (!h->comm_page_hpa) {
+	if (!(h->comm_page_hpa=V3_AllocPages(1))) {
+	    PrintError(vm,VCORE_NONE,"hvm: unable to allocate space for comm page\n");
+	    return -1;
+	}
+
+	h->comm_page_hva = V3_VAddr(h->comm_page_hpa);
+
+	memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
+
+	if (v3_add_shadow_mem(vm,-1,h->comm_page_gpa,h->comm_page_gpa+PAGE_SIZE_4KB,(addr_t)h->comm_page_hpa)) {
+	    PrintError(vm,VCORE_NONE,"hvm: unable to map communication page\n");
+	    V3_FreePages(h->comm_page_hpa,1);
+	    return -1;
+	}
+
+
+	PrintDebug(vm,VCORE_NONE,"hvm: added comm page for first time\n");
+    }
+
+    memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
+
+
+    PrintDebug(vm,VCORE_NONE,"hvm: HRT configuration: flags=0x%llx max_mem_mapped=0x%llx gva_offset=0x%llx gva_entry=0x%llx comm_page=0x%llx vector=0x%x\n",
+	       h->hrt_flags,h->max_mem_mapped, h->gva_offset,h->gva_entry, h->comm_page_gpa, h->hrt_int_vector);
+
+    return 0;
+
+}
+
+static int setup_mb_kernel_hrt(struct v3_vm_info *vm, void *data, uint64_t size)
+{
+    mb_data_t mb;
+
+    if (v3_parse_multiboot_header(data, size, &mb)) {
+	PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n");
+	return -1;
+    }
+
+    if (!mb.mb64_hrt) {
+	PrintError(vm,VCORE_NONE,"hvm: invalid HRT - there is no MB64_HRT tag\n");
+	return -1;
+    }
+
+    if (configure_hrt(vm,&mb)) {
+	PrintError(vm,VCORE_NONE, "hvm: cannot configure HRT\n");
+	return -1;
+    }
+
+    if (v3_write_multiboot_kernel(vm,&mb,data,size,
+				  (void*)vm->hvm_state.first_hrt_gpa,
+				  vm->mem_size-vm->hvm_state.first_hrt_gpa)) {
+	PrintError(vm,VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n");
+	return -1;
+    }
+
+    if (vm->hvm_state.gva_entry) {
+	vm->hvm_state.hrt_entry_addr = vm->hvm_state.gva_entry;
+    } else {
+	vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + vm->hvm_state.gva_offset;
+    }
+
+    vm->hvm_state.hrt_type = HRT_MBOOT64;
+
+    return 0;
+
+}
+
+
+static int setup_hrt(struct v3_vm_info *vm)
+{
+    void *data;
+    uint64_t size;
+
+    // If the ROS has installed an image, it takes priority
+    if (vm->hvm_state.hrt_image) {
+	data = vm->hvm_state.hrt_image;
+	size = vm->hvm_state.hrt_image_size;
+    } else {
+	data = vm->hvm_state.hrt_file->data;
+	size = vm->hvm_state.hrt_file->size;
+    }
+
+    if (is_elf(data,size) &&
+	find_mb_header(data,size)) {
+
+	PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n");
+	if (setup_mb_kernel_hrt(vm,data,size)) {
+	    PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n");
+	    return -1;
+	}
+    } else {
+	PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not a multiboot kernel\n");
+	return -1;
+    }
+
+    return 0;
+}
 
 
 /*
@@ -629,15 +1738,20 @@ static void write_hrt(struct v3_vm_info *vm)
 
   GDT (1 page - page aligned)
   TSS (1 page - page aligned)
   PAGETABLES  (identity map of first N GB)
-     ROOT PT first, followed by 2nd level, etc.
-     Currently PML4 followed by 1 PDPE for 512 GB of mapping
-  BOUNDARY PAGE (all 0xff - avoid smashing page tables in case we keep going...)
-  (stack - we will push machine description)
+     ROOT PT first (lowest memory addr), followed by 2nd level PTs in order,
+     followed by 3rd level PTs in order, followed by 4th level
+     PTs in order.
+  MBINFO_PAGE
+  SCRATCH_STACK_HRT_CORE0
+  SCRATCH_STACK_HRT_CORE1
+  ..
+  SCRATCH_STACK_HRT_COREN
   ...
  HRT (as many pages as needed, page-aligned, starting at first HRT address)
  ---
  ROS
-
+
+
 */
 
@@ -650,17 +1764,23 @@ int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm)
 
     PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory begins\n");
 
+    if (setup_hrt(vm)) {
+	PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n");
+	return -1;
+    }
+
+    // the locations of all the other items are determined by
+    // the HRT setup, so these must happen after
+
     write_null_int_handler(vm);
     write_idt(vm);
     write_gdt(vm);
     write_tss(vm);
 
-    write_pt(vm);
-
-    write_bp(vm);
-
-    write_hrt(vm);
+    write_pts(vm);
 
+    // this must happen last
+    write_mb_info(vm);
 
     PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory done\n");
 
@@ -668,20 +1788,22 @@ int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm)
 }
 
 /*
-  On entry:
+  On entry for every core:
 
    IDTR points to stub IDT
    GDTR points to stub GDT
   TR points to stub TSS
   CR3 points to root page table
-   CR0 has PE and PG
-   EFER has LME AND LMA
-   RSP is TOS (looks like a call)
-      INFO <= RDI
-      0 (fake return address) <= RSP
-
-   RIP is entry point to HRT
-   RDI points to machine info on stack
+   CR0 has PE, PG, and WP
+   EFER has LME AND LMA (and NX for compatibility with Linux)
+   RSP is TOS of core's scratch stack (looks like a call)
+
+   RAX = MB magic cookie
+   RBX = address of multiboot info table
+   RCX = this core id / apic id (0..N-1)
+   RDX = this core id - first HRT core ID (==0 for the first HRT core)
+
+   All addresses are virtual addresses, offset as needed by gva_offset
 
    Other regs are zeroed
 
@@ -692,14 +1814,20 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
 {
     void *base;
     uint64_t limit;
+    uint64_t gva_offset;
+
+    rdtscll(core->hvm_state.last_boot_start);
+
 
     if (!core->hvm_state.is_hrt) {
 	PrintDebug(core->vm_info,core,"hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id);
 	return 0;
     }
 
+    PrintDebug(core->vm_info, core, "hvm: setting up HRT core (%u) for boot\n", core->vcpu_id);
+
+    gva_offset = core->vm_info->hvm_state.gva_offset;
 
     memset(&core->vm_regs,0,sizeof(core->vm_regs));
     memset(&core->ctrl_regs,0,sizeof(core->ctrl_regs));
@@ -715,31 +1843,75 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
     core->mem_mode = VIRTUAL_MEM;
     core->core_run_state = CORE_RUNNING ;
 
-    // We are going to enter right into the HRT
-    // HRT stack and argument passing
-    get_bp_loc(core->vm_info, &base,&limit);
-    // TODO: push description here
-    core->vm_regs.rsp = (v3_reg_t) base;  // so if we ret, we will blow up
-    core->vm_regs.rbp = (v3_reg_t) base;
-    // TODO: RDI should really get pointer to description
-    core->vm_regs.rdi = (v3_reg_t) base;
+
+    // magic
+    core->vm_regs.rax = MB2_INFO_MAGIC;
+
+    // multiboot info pointer
+    get_mb_info_loc(core->vm_info, &base,&limit);
+    core->vm_regs.rbx = (uint64_t) base + gva_offset;
+
+    // core number
+    core->vm_regs.rcx = core->vcpu_id;
+
+    // HRT core number
+    core->vm_regs.rdx = core->vcpu_id - core->vm_info->hvm_state.first_hrt_core;
+
+    // Now point to scratch stack for this core
+    // it begins at an offset relative to the MB info page
+    get_mb_info_loc(core->vm_info, &base,&limit);
+    base = base + gva_offset;
+    base -= core->vm_regs.rdx * SCRATCH_STACK_SIZE;
+    core->vm_regs.rsp = (v3_reg_t) base;
+    core->vm_regs.rbp = (v3_reg_t) base-8;
+
+    // push onto the stack a bad rbp and bad return address
+    core->vm_regs.rsp-=16;
+    v3_set_gpa_memory(core,
+		      core->vm_regs.rsp-gva_offset,
+		      16,
+		      0xff);
+
+
+    // HRT entry point
     get_hrt_loc(core->vm_info, &base,&limit);
-    core->rip = (uint64_t) base + 0x40; // hack for test.o
+    if 
(core->vm_info->hvm_state.gva_entry) { + core->rip = core->vm_info->hvm_state.gva_entry; + } else { + core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr + gva_offset; + } + + + + PrintDebug(core->vm_info,core,"hvm: hrt core %u has rip=%p, rsp=%p, rbp=%p, rax=%p, rbx=%p, rcx=%p, rdx=%p\n", + (core->vcpu_id - core->vm_info->hvm_state.first_hrt_core), + (void*)(core->rip), + (void*)(core->vm_regs.rsp), + (void*)(core->vm_regs.rbp), + (void*)(core->vm_regs.rax), + (void*)(core->vm_regs.rbx), + (void*)(core->vm_regs.rcx), + (void*)(core->vm_regs.rdx)); // Setup CRs for long mode and our stub page table - // CR0: PG, PE - core->ctrl_regs.cr0 = 0x80000001; + // CR0: PG, PE, and WP for catching COW faults in kernel-mode (which is not default behavior) + core->ctrl_regs.cr0 = 0x80010001; + core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0; + // CR2: don't care (output from #PF) // CE3: set to our PML4E, without setting PCD or PWT get_pt_loc(core->vm_info, &base,&limit); - core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base); + core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base); // not offset as this is a GPA + core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3; + // CR4: PGE, PAE, PSE (last byte: 1 0 1 1 0 0 0 0) core->ctrl_regs.cr4 = 0xb0; + core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4; // CR8 as usual // RFLAGS zeroed is fine: come in with interrupts off - // EFER needs SVME LMA LME (last 16 bites: 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 - core->ctrl_regs.efer = 0x1500; + // EFER needs SVME LMA LME (last 16 bits: 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 + core->ctrl_regs.efer = 0x1d00; + core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer; /* @@ -758,47 +1930,50 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) // Install our stub IDT get_idt_loc(core->vm_info, &base,&limit); + base += gva_offset; core->segments.idtr.selector = 0; // entry 0 (NULL) of the GDT - core->segments.idtr.base = (addr_t) base; + core->segments.idtr.base = (addr_t) base; // only base+limit are used core->segments.idtr.limit = limit-1; - core->segments.idtr.type = 0xe; - core->segments.idtr.system = 1; + core->segments.idtr.type = 0x0; + core->segments.idtr.system = 0; core->segments.idtr.dpl = 0; - core->segments.idtr.present = 1; - core->segments.idtr.long_mode = 1; + core->segments.idtr.present = 0; + core->segments.idtr.long_mode = 0; // Install our stub GDT get_gdt_loc(core->vm_info, &base,&limit); - core->segments.gdtr.selector = 0; + base += gva_offset; + core->segments.gdtr.selector = 0; // entry 0 (NULL) of the GDT core->segments.gdtr.base = (addr_t) base; - core->segments.gdtr.limit = limit-1; - core->segments.gdtr.type = 0x6; - core->segments.gdtr.system = 1; + core->segments.gdtr.limit = limit-1; // only base+limit are used + core->segments.gdtr.type = 0x0; + core->segments.gdtr.system = 0; core->segments.gdtr.dpl = 0; - core->segments.gdtr.present = 1; - core->segments.gdtr.long_mode = 1; + core->segments.gdtr.present = 0; + core->segments.gdtr.long_mode = 0; // And our TSS get_tss_loc(core->vm_info, &base,&limit); + base += gva_offset; core->segments.tr.selector = 0; core->segments.tr.base = (addr_t) base; core->segments.tr.limit = limit-1; - core->segments.tr.type = 0x6; - core->segments.tr.system = 1; + core->segments.tr.type = 0x9; + core->segments.tr.system = 0; // available 64 bit TSS core->segments.tr.dpl = 0; core->segments.tr.present = 1; - core->segments.tr.long_mode = 1; + core->segments.tr.long_mode = 0; // not used - base = 0x0; + base = 0x0; // these are not offset as we want to 
make all gvas visible limit = -1; // And CS core->segments.cs.selector = 0x8 ; // entry 1 of GDT (RPL=0) - core->segments.cs.base = (addr_t) base; - core->segments.cs.limit = limit; - core->segments.cs.type = 0xe; - core->segments.cs.system = 0; - core->segments.cs.dpl = 0; + core->segments.cs.base = (addr_t) base; // not used + core->segments.cs.limit = limit; // not used + core->segments.cs.type = 0xe; // only C is used + core->segments.cs.system = 1; // not a system segment + core->segments.cs.dpl = 0; core->segments.cs.present = 1; core->segments.cs.long_mode = 1; @@ -806,8 +1981,8 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) core->segments.ds.selector = 0x10; // entry 2 of GDT (RPL=0) core->segments.ds.base = (addr_t) base; core->segments.ds.limit = limit; - core->segments.ds.type = 0x6; - core->segments.ds.system = 0; + core->segments.ds.type = 0x6; // ignored + core->segments.ds.system = 1; // not a system segment core->segments.ds.dpl = 0; core->segments.ds.present = 1; core->segments.ds.long_mode = 1; @@ -817,12 +1992,188 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) memcpy(&core->segments.fs,&core->segments.ds,sizeof(core->segments.ds)); memcpy(&core->segments.gs,&core->segments.ds,sizeof(core->segments.ds)); + // reset paging here for shadow... if (core->shdw_pg_mode != NESTED_PAGING) { PrintError(core->vm_info, core, "hvm: shadow paging guest... this will end badly\n"); + return -1; } return 0; } + +int v3_handle_hvm_reset(struct guest_info *core) +{ + + if (core->core_run_state != CORE_RESETTING) { + return 0; + } + + if (!core->vm_info->hvm_state.is_hvm) { + return 0; + } + + if (v3_is_hvm_hrt_core(core)) { + // this is an HRT reset + int rc=0; + + // wait for all the HRT cores + v3_counting_barrier(&core->vm_info->reset_barrier); + + if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) { + // I am leader + core->vm_info->run_state = VM_RESETTING; + } + + core->core_run_state = CORE_RESETTING; + + if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) { + // we really only need to clear the bss + // and recopy the .data, but for now we'll just + // do everything + rc |= v3_setup_hvm_vm_for_boot(core->vm_info); + + if (rc) { + PrintError(core->vm_info,core,"hvm: failed to setup HVM VM for boot rc=%d\n",rc); + } + } + + // now everyone is ready to reset + rc |= v3_setup_hvm_hrt_core_for_boot(core); + + if (rc) { + PrintError(core->vm_info,core,"hvm: failed to setup HVM core for boot rc=%d\n",rc); + } + + core->core_run_state = CORE_RUNNING; + + if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) { + // leader + core->vm_info->run_state = VM_RUNNING; + core->vm_info->hvm_state.trans_state = HRT_IDLE; + } + + v3_counting_barrier(&core->vm_info->reset_barrier); + + if (rc<0) { + PrintError(core->vm_info,core,"hvm: reset failed\n"); + return rc; + } else { + return 1; + } + + } else { + // ROS core will be handled by normal reset functionality + return 0; + } +} + +int v3_handle_hvm_entry(struct guest_info *core) +{ + if (!core->vm_info->hvm_state.is_hvm // not relevant to non-HVM + || core->hvm_state.is_hrt // not relevant to an HRT in an HVM + || !core->vm_info->hvm_state.ros_signal.code) { // not relevant if there is no code to inject + + // Note that above check for code could race with a writer, but + // if that happens, we'll simply inject at the next opportunity instead of + // this one (see below for atomic update) + return 0; + } else { + struct v3_ros_signal *s = &core->vm_info->hvm_state.ros_signal; + + // HVM ROS + 
if (! (s->handler &&      // handler installed
+	       s->cr3 &&           // process installed
+	       s->stack &&         // stack installed
+	       core->cpl == 3 &&   // user mode
+	       core->ctrl_regs.cr3 == s->cr3)  // right process active
+	    ) {
+	    // Cannot inject at this time
+	    return 0;
+	} else {
+	    // We can inject now, let's atomically see if we have something
+	    // and commit to doing it if we do
+	    uint64_t code;
+
+	    // Get code, reset to allow next one
+	    code = __sync_fetch_and_and(&(s->code), 0);
+
+	    if (!code) {
+		// nothing to do after all
+		return 0;
+	    } else {
+
+		// actually do inject
+
+		uint64_t rsp;
+		uint64_t frame[6];
+
+		PrintDebug(core->vm_info,core,"hvm: ROS interrupt starting with rip=%p rsp=%p\n", (void*) core->rip, (void*) core->vm_regs.rsp);
+		// build interrupt frame
+		frame[0] = code;
+		frame[1] = core->rip;
+		frame[2] = core->segments.cs.selector; // return cs
+		frame[3] = core->ctrl_regs.rflags;
+		frame[4] = core->vm_regs.rsp;
+		frame[5] = core->segments.ss.selector; // return ss
+
+		rsp = (s->stack - 16) & (~0xf);  // We should be 16 byte aligned to start
+		rsp -= sizeof(frame);
+
+
+		if (v3_write_gva_memory(core,(addr_t)rsp,sizeof(frame),(uint8_t*)frame)!=sizeof(frame)) {
+		    PrintError(core->vm_info,core,"hvm: failed to write interrupt frame\n");
+		    // we just lost this inject
+		    return -1;
+		}
+
+		// now make us look like we are jumping to the entry
+		core->rip = s->handler;
+		core->vm_regs.rsp = rsp;
+
+		PrintDebug(core->vm_info,core,"hvm: ROS frame is 0x%llx|0x%llx|0x%llx|0x%llx|0x%llx|0x%llx and on entry rip=%p and rsp=%p\n", frame[0],frame[1],frame[2],frame[3],frame[4],frame[5],(void*) core->rip, (void*) core->vm_regs.rsp);
+
+		// and we should be good to go
+		return 0;
+	    }
+	}
+    }
+}
+
+int v3_handle_hvm_exit(struct guest_info *core)
+{
+    // currently nothing
+    return 0;
+}
+
+
+int v3_hvm_signal_ros(struct v3_vm_info *vm, uint64_t code)
+{
+    struct v3_ros_signal *s = &vm->hvm_state.ros_signal;
+
+    if (!code) {
+	PrintError(vm,VCORE_NONE,"hvm: cannot signal ros with code zero\n");
+	return -1;
+    }
+
+    // handler, etc, must exist
+    if (!s->handler || !s->stack) {
+	PrintError(vm,VCORE_NONE,"hvm: cannot signal ros with no installed handler\n");
+	return -1;
+    } else {
+	// we set the code only if we are idle (code 0),
+	// and we do so only atomically
+	if (!__sync_bool_compare_and_swap(&(s->code), 0, code)) {
+	    PrintError(vm,VCORE_NONE,"hvm: signal was already asserted\n");
+	    return -1;
+	} else {
+	    PrintDebug(vm,VCORE_NONE,"hvm: raised signal 0x%llx to the ROS\n",code);
+	    return 0;
+	}
+    }
+}
+
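
For reference, a 64-bit guest exercises this interface with the convention documented
above hvm_hcall_handler: rax carries the hypercall number under which the VMM registered
hvm_hcall_handler, rbx carries the 0x6464646464646464 marker, and the arguments travel in
rcx, rdx, rsi, and so on. A minimal guest-side wrapper might look like the sketch below.
This is not part of the patch: it assumes an AMD/SVM host, where Palacios hypercalls are
made with VMMCALL (an Intel/VMX guest would use VMCALL), and HVM_HCALL_NUM is a
placeholder that must match the HVM_HCALL value configured into the VMM.

    #include <stdint.h>

    #define HVM_HCALL_NUM    0xf000                 /* placeholder - match the VMM's HVM_HCALL */
    #define HVM_HCALL_MAGIC  0x6464646464646464ULL  /* "I am 64 bit" marker checked by the handler */

    static inline uint64_t hvm_hcall(uint64_t a1, uint64_t a2, uint64_t a3)
    {
        uint64_t rax = HVM_HCALL_NUM;   /* in: hypercall number; out: result (rax) */

        __asm__ __volatile__ ("vmmcall"
                              : "+a" (rax)
                              : "b" (HVM_HCALL_MAGIC),  /* rbx = bitness marker */
                                "c" (a1),               /* rcx = sub-command (a1) */
                                "d" (a2),               /* rdx = a2 */
                                "S" (a3)                /* rsi = a3 */
                              : "memory");
        return rax;
    }

For example, hvm_hcall(0x0,0,0) performs the null "ping" hypercall, and
hvm_hcall(0xf,event_gva,0) returns the HRT transaction state while writing the current
ROS event back to event_gva.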
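
The page-table sizing in compute_pts_4KB is just repeated ceiling division by the
512-entry fan-out of each x86-64 paging level, so the cost of each mapping granularity
can be computed ahead of time. A small self-contained check (example numbers of mine,
not from the patch) for max_mem_mapped = 4 GB:

    #include <stdio.h>
    #include <stdint.h>

    #define CEIL_DIV(x,y) (((x)/(y)) + !!((x)%(y)))

    int main(void)
    {
        uint64_t max_gva = 4ULL*1024*1024*1024;  /* 4 GB mapped at the gva offset */

        uint64_t l1 = 1;                                                     /* one PML4 */
        uint64_t l2 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*512ULL*4096ULL),512); /* PDPTs    */
        uint64_t l3 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*4096ULL),512);        /* PDs      */
        uint64_t l4 = CEIL_DIV(CEIL_DIV(max_gva,4096ULL),512);               /* PTs      */

        /* prints l1=1 l2=1 l3=4 l4=2048 */
        printf("l1=%llu l2=%llu l3=%llu l4=%llu\n",
               (unsigned long long)l1, (unsigned long long)l2,
               (unsigned long long)l3, (unsigned long long)l4);
        return 0;
    }

So, matching the num_pt selection in get_pt_loc, mapping 4 GB with 4 KB pages costs
l1+l2+l3+l4 = 2054 page-table pages, with 2 MB pages it costs l1+l2+l3 = 6, and with
1 GB pages only l1+l2 = 2.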
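
The signal-injection path in v3_handle_hvm_entry pushes a six-word frame onto the stack
registered via hypercall 0x40 and redirects rip to the registered handler. Words 1-5 are
laid out exactly like a hardware x86-64 interrupt frame, so after popping the leading
code word a guest handler can return with iretq. A sketch of the layout as the ROS
handler sees it at its entry rsp (the struct name is mine; the field order is
frame[0]..frame[5] from the code above):

    #include <stdint.h>

    struct ros_signal_frame {
        uint64_t code;    /* frame[0]: signal code (handler pops this first) */
        uint64_t rip;     /* frame[1]: interrupted rip                       */
        uint64_t cs;      /* frame[2]: return cs                             */
        uint64_t rflags;  /* frame[3]: saved rflags                          */
        uint64_t rsp;     /* frame[4]: interrupted rsp                       */
        uint64_t ss;      /* frame[5]: return ss                             */
    };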