From: Peter Dinda Date: Sun, 2 Aug 2015 23:31:43 +0000 (-0500) Subject: HVM capability enhancments X-Git-Url: http://v3vee.org/palacios/gitweb/gitweb.cgi?a=commitdiff_plain;h=142b59ca44b3d00f0479910c986f6791216651e7;p=palacios.git HVM capability enhancments - ROS / HRT boot-time interaction protocol enhancements - ROS / HRT / VMM run-time interaction protocol - ROS->HRT resets - ROS->HRT address space merges - ROS->HRT sequential and parallel function invocations - More generalized paging environment build for HRT including offsets, PIC, etc. - Refactoring between multiboot and HVM - More consistent magic numbers - Descriptor, EFER, and ctrl reg corrections --- diff --git a/palacios/include/palacios/vmm_hvm.h b/palacios/include/palacios/vmm_hvm.h index 7a1dceb..7f82220 100644 --- a/palacios/include/palacios/vmm_hvm.h +++ b/palacios/include/palacios/vmm_hvm.h @@ -24,6 +24,7 @@ #ifdef __V3VEE__ #include +#include struct v3_vm_hvm { uint8_t is_hvm; @@ -32,6 +33,23 @@ struct v3_vm_hvm { struct v3_cfg_file *hrt_file; uint64_t hrt_entry_addr; enum { HRT_BLOB, HRT_ELF64, HRT_MBOOT2, HRT_MBOOT64 } hrt_type; + + // The following parallel the content of mb_info_hrt_t in + // the extended multiboot header. 
They reflect how the + // HRT has actually been mapped, as opposed to the requested + // mapping/flags from the mb_mb64_hrt_t + uint64_t hrt_flags; + uint64_t max_mem_mapped; + uint64_t gva_offset; + uint64_t gva_entry; + uint64_t comm_page_gpa; + uint8_t hrt_int_vector; + + void *comm_page_hpa; + void *comm_page_hva; + + enum {HRT_IDLE=0, HRT_CALL, HRT_PARCALL, HRT_MERGE} trans_state; + uint64_t trans_count; }; struct v3_core_hvm { @@ -39,6 +57,8 @@ struct v3_core_hvm { uint64_t last_boot_start; }; + + struct v3_xml; int v3_init_hvm(); @@ -68,11 +88,110 @@ void v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_in uint32_t *start_apic, uint32_t *num_apics); +int v3_build_hrt_multiboot_tag(struct guest_info *core, mb_info_hrt_t *hrt); + int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm); int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core); int v3_handle_hvm_reset(struct guest_info *core); +/* + HVM/HRT interaction is as follows: + + 1. MB_TAG_MB64_HRT tag in the HRT multiboot kernel signifies it + is handled by the HVM. + 2. The flags and other info in the tag indicate the properties of the HRT + to the HVM. (see vmm_multiboot.h), in particular: + - position independence + - ability to be initially mapped with an offset + between virtual and physical addresses, for example + to hoist it into the same position that the ROS kernel + will occupy in the virtual address space of a ROS + process + - how much physical address space we will initially map + and what kind of page tables are used to map it + - what physical page (4KB) should we reserve for use + in HVM/HRT communication (particularly upcalls) + - the interrupt vector used to upcall from the HVM to the HRT + 3. The MB_INFO_HRT_TAG within the multiboot info structures the + HRT sees on boot indicates that HRT functionality is established and + gives details of operation to the HRT, including the following. 
+ See vmm_multiboot.c for more info + - apics and ioapic ids, and indications of which apics + and which entries on ioapics are exclusively for HRT use + - physical address range that is exclusively for HRT use + - where the physical address range exclusively for HRT use + is mapped into the virtual address space (offset). The + ROS part of the physical address space is always identity mapped + initially. + - the amount of physical memory that has been mapped + - the physical address of the page the HVM will use to + communicate with the HRT + - the interrupt vector the HVM will use to upcall the HRT + - flags copied from the HRT's HRT tag (position independence, + page table model, offset, etc) + 4. Downcalls: + hypercall 0xf00df00d with arguments depending on operation + with examples described below. + 5. Upcalls + interrupt injected by VMM or a magic #PF + communication via a shared memory page, contents below + + Upcalls + + Type of upcall is determined by the first 64 bits in the comm page + + 0x0 => Null (test) + 0x20 => Invoke function in HRT + Next 64 bits contain address of structure + describing function call. This is typically the ROS + trying to get the HRT to run a function for it. + ROS is responsible for assuring that this address + (and other addresses) are correct with respect to + mappings. That is, for a non-merged address space, + the ROS needs to supply physical addresses so that + they can be used (with the identity-mapped ROS physical + memory.) If it wants to use virtual addresses, it + needs to first merge the address spaces. + 0x21 => Invoke function in HRT in parallel + Exactly like previous, but the upcall is happening + simultaneously on all HRT cores. 
+ 0x30 => Merge address space + Next 64 bits contain the ROS CR3 that we will use + to map the user portion of ROS address space into + the HRT address space + 0x31 => Unmerge address space + return the ROS memory mapping to normal (physical/virtual identity) + + Downcalls + + HVM_HCALL is the general hypercall number used to talk to the HVM + The first argument is the request number (below). The other arguments + depend on the first. + + 0x0 => Null, just for timing + 0x1 => Reboot ROS + 0x2 => Reboot HRT + 0x3 => Reboot Both + 0xf => Get HRT transaction state + + 0x20 => Invoke function (ROS->HRT) + argument after the request number is pointer to structure describing call + 0x21 => Invoke function in parallel (ROS->HRT) + same as above, but simultaneously on all HRT cores + 0x2f => Function execution complete (HRT->ROS, once per core) + 0x30 => Merge address space (ROS->HRT) + no arguments (CR3 implicit). Merge the current + address space in the ROS with the address space on + the HRT + 0x31 => Unmerge address space (ROS->HRT) + release any address space merger and restore identity mapping + 0x3f => Merge request complete (HRT->ROS) + +*/ + + + #endif /* ! __V3VEE__ */ diff --git a/palacios/include/palacios/vmm_multiboot.h b/palacios/include/palacios/vmm_multiboot.h index c5c370c..f4aaac1 100644 --- a/palacios/include/palacios/vmm_multiboot.h +++ b/palacios/include/palacios/vmm_multiboot.h @@ -26,6 +26,13 @@ #include +/****************************************************************** + Data contained in the ELF file we will attempt to boot +******************************************************************/ + +#define ELF_MAGIC 0x464c457f +#define MB2_MAGIC 0xe85250d6 + typedef struct mb_header { uint32_t magic; uint32_t arch; @@ -89,10 +96,35 @@ typedef struct mb_modalign { // version of multiboot. 
The existence of // this tag indicates that this special mode is // requested -#define MB_TAG_MB64_HRT 0xf00d +#define MB_TAG_MB64_HRT 0xf00d typedef struct mb_mb64_hrt { mb_tag_t tag; - uint32_t hrt_flags; + uint64_t hrt_flags; + // whether this kernel is relocable +#define MB_TAG_MB64_HRT_FLAG_RELOC 0x1 + // How to map the memory in the initial PTs + // highest set bit wins +#define MB_TAG_MB64_HRT_FLAG_MAP_4KB 0x100 +#define MB_TAG_MB64_HRT_FLAG_MAP_2MB 0x200 +#define MB_TAG_MB64_HRT_FLAG_MAP_1GB 0x400 +#define MB_TAG_MB64_HRT_FLAG_MAP_512GB 0x800 + + // How much physical address space to map in the + // initial page tables (bytes) + // + uint64_t max_mem_to_map; + // offset of the GVA->GPA mappings (GVA of GPA 0) + uint64_t gva_offset; + // 64 bit entry address (=0 to use entry tag (which will be offset by gva_offset)) + uint64_t gva_entry; + // desired address of the page the VMM, HRT, and ROS share + // for communication. "page" here a 4 KB quantity + uint64_t comm_page_gpa; + // desired interrupt vector that should be used for upcalls + // the default for this is 255 + uint8_t hrt_int_vector; + uint8_t reserved[7]; + } __attribute__((packed)) mb_mb64_hrt_t; typedef struct mb_data { @@ -106,6 +138,131 @@ typedef struct mb_data { mb_mb64_hrt_t *mb64_hrt; } mb_data_t; + + +// We are not doing: +// +// - BIOS Boot Device +// - Modules +// - ELF symbols +// - Boot Loader name +// - APM table +// - VBE info +// - Framebuffer info +// + + + +/****************************************************************** + Data we will pass to the kernel via rbx +******************************************************************/ + +#define MB2_INFO_MAGIC 0x36d76289 + + +typedef struct mb_info_header { + uint32_t totalsize; + uint32_t reserved; +} __attribute__((packed)) mb_info_header_t; + +// A tag of type 0, size 8 indicates last value +// +typedef struct mb_info_tag { + uint32_t type; + uint32_t size; +} __attribute__((packed)) mb_info_tag_t; + + +#define MB_INFO_MEM_TAG 4 
+typedef struct mb_info_mem { + mb_info_tag_t tag; + uint32_t mem_lower; // 0..640K in KB + uint32_t mem_upper; // in KB to first hole - 1 MB +} __attribute__((packed)) mb_info_mem_t; + +#define MB_INFO_CMDLINE_TAG 1 +// note alignment of 8 bytes required for each... +typedef struct mb_info_cmdline { + mb_info_tag_t tag; + uint32_t size; // includes zero termination + uint8_t string[]; // zero terminated +} __attribute__((packed)) mb_info_cmdline_t; + + +#define MEM_RAM 1 +#define MEM_ACPI 3 +#define MEM_RESV 4 + +typedef struct mb_info_memmap_entry { + uint64_t base_addr; + uint64_t length; + uint32_t type; + uint32_t reserved; +} __attribute__((packed)) mb_info_memmap_entry_t; + +#define MB_INFO_MEMMAP_TAG 6 +// note alignment of 8 bytes required for each... +typedef struct mb_info_memmap { + mb_info_tag_t tag; + uint32_t entry_size; // multiple of 8 + uint32_t entry_version; // 0 + mb_info_memmap_entry_t entries[]; +} __attribute__((packed)) mb_info_memmap_t; + +#define MB_INFO_HRT_TAG 0xf00df00d +typedef struct mb_info_hrt { + mb_info_tag_t tag; + // apic ids are 0..num_apics-1 + // ioapics follow + // apic and ioapic addresses are the well known places + uint32_t total_num_apics; + // first apic the HRT owns (HRT core 0) + uint32_t first_hrt_apic_id; + // can the HRT use an ioapic? + uint32_t have_hrt_ioapic; + // if so, this is the first entry on the + // ioapic that can be used by the HRT + uint32_t first_hrt_ioapic_entry; + // CPU speed + uint64_t cpu_freq_khz; + // copy of the HRT flags from the kernel (indicating + // page table mapping type, position independence, etc. + // these reflect how it has actually been mapped + uint64_t hrt_flags; + // the amount of physical address space that has been mapped + // initially. + uint64_t max_mem_mapped; + // The first physical address the HRT should + // (nominally) use. 
Physical addresses below this are + // visible to the ROS + uint64_t first_hrt_gpa; + // Where the initial boot state starts in the physical address + // space. This includes INT HANDLER,IDT,GDT,TSS, PAGETABLES, + // and MBINFO, but not the scratch stacks + // This is essentially the content of CR3 - 1 page on boot + uint64_t boot_state_gpa; + // Where GPA 0 is mapped in the virtual address space + uint64_t gva_offset; + + // Typically: + // first_hrt_vaddr==first_hrt_paddr => no address space coalescing + // first_hrt_vaddr>first_hrt_paddr => address space coalescing + // For example, first_hrt_vaddr might be set to the start of linux kernel + // This then allows us to coalesce user portion of the address space of + // a linux process and the HRT + // for communication. "page" here a 4 KB quantity + + // address of the page the VMM, HRT, and ROS share + uint64_t comm_page_gpa; + // interrupt vector used to upcall to HRT (==0 if none) + // downcalls are done with HVM hypercall 0xf00df00d + uint8_t hrt_int_vector; + uint8_t reserved[7]; +} __attribute__((packed)) mb_info_hrt_t; + + + + struct v3_vm_multiboot { uint8_t is_multiboot; struct v3_cfg_file *mb_file; @@ -118,6 +275,7 @@ struct v3_vm_multiboot { // There is no core structure for // multiboot capability + struct v3_xml; int v3_init_multiboot(); diff --git a/palacios/src/palacios/vmm_hvm.c b/palacios/src/palacios/vmm_hvm.c index 7fb278b..3e506d7 100644 --- a/palacios/src/palacios/vmm_hvm.c +++ b/palacios/src/palacios/vmm_hvm.c @@ -56,7 +56,7 @@ RAM (MB) Note these are backward compatible - + (MB) @@ -69,12 +69,6 @@ #endif -// if set, we will map the first 1 GB of memory using a 3 level -// hierarchy, for compatibility with Nautilus out of the box. 
-// Otherwise we will map the first 512 GB using a 2 level -// hieratchy -#define HVM_MAP_1G_2M 1 - int v3_init_hvm() { PrintDebug(VM_NONE,VCORE_NONE, "hvm: init\n"); @@ -87,19 +81,245 @@ int v3_deinit_hvm() return 0; } +// ignore requests from when we are in the wrong state +#define ENFORCE_STATE_MACHINE 1 + +// invoke the HRT using a page fault instead of +// the SWINTR mechanism +#define USE_UPCALL_MAGIC_PF 1 +#define UPCALL_MAGIC_ADDRESS 0x0000800df00df00dULL +#define UPCALL_MAGIC_ERROR 0xf00df00d + +/* + 64 bit only hypercall: + rax = hypercall number + rbx = 0x646464... + then args are: rcx, rdx, rsi, rdi r8, r9, r10, r11 + rcx = 1st arg +*/ static int hvm_hcall_handler(struct guest_info * core , hcall_id_t hcall_id, void * priv_data) { uint64_t c; + uint64_t bitness = core->vm_regs.rbx; + uint64_t a1 = core->vm_regs.rcx; + uint64_t a2 = core->vm_regs.rdx; + struct v3_vm_hvm *h = &core->vm_info->hvm_state; + + + if (bitness!=0x6464646464646464) { + PrintError(core->vm_info,core,"hvm: unable to handle non-64 bit hypercall\n"); + core->vm_regs.rax = -1; + return 0; + } + + switch (a1) { + case 0x0: // null + + rdtscll(c); + + V3_Print(core->vm_info,core, "hvm: received hypercall %x rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n", + hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, core->hvm_state.last_boot_start, core->num_exits); + //v3_print_core_telemetry(core); + // v3_print_guest_state(core); + core->vm_regs.rax = 0; + break; + + case 0x1: // reset ros + PrintDebug(core->vm_info,core, "hvm: reset ROS\n"); + if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ROS,0)) { + PrintError(core->vm_info,core, "hvm: reset of ROS failed\n"); + core->vm_regs.rax = -1; + } else { + core->vm_regs.rax = 0; + } + break; + + case 0x2: // reset hrt + PrintDebug(core->vm_info,core, "hvm: reset HRT\n"); + if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_HRT,0)) { + 
PrintError(core->vm_info,core, "hvm: reset of HRT failed\n"); + core->vm_regs.rax = -1; + } else { + core->vm_regs.rax = 0; + } + break; - rdtscll(c); + case 0x3: // reset both + PrintDebug(core->vm_info,core, "hvm: reset ROS+HRT\n"); + if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ALL,0)) { + PrintError(core->vm_info,core, "hvm: reset of HRT failed\n"); + core->vm_regs.rax = -1; + } else { + core->vm_regs.rax = 0; + } + break; + + case 0xf: // get HRT state + core->vm_regs.rax = h->trans_state; + //PrintDebug(core->vm_info,core,"hvm: get HRT transaction state 0x%llx\n",core->vm_regs.rax); + break; + case 0x20: // invoke function (ROS->HRT) + case 0x21: // invoke parallel function (ROS->HRT) + if (v3_is_hvm_hrt_core(core)) { + PrintError(core->vm_info,core,"hvm: %s function invocation not supported from HRT core\n", a1==0x20 ? "" : "parallel"); + core->vm_regs.rax = -1; + } else { + if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { + PrintError(core->vm_info,core, "hvm: cannot invoke %s function %p in state %d\n",a1==0x20 ? "" : "parallel", (void*)a2,h->trans_state); + core->vm_regs.rax = -1; + } else { + uint64_t *page = (uint64_t *) h->comm_page_hva; + uint64_t first, last, cur; + + PrintDebug(core->vm_info,core, "hvm: %s invoke function %p\n",a1==0x20 ? 
"" : "parallel",(void*)a2); + page[0] = a1; + page[1] = a2; + + if (a1==0x20) { + first=last=h->first_hrt_core; + } else { + first=h->first_hrt_core; + last=core->vm_info->num_cores-1; + } + + core->vm_regs.rax = 0; + + h->trans_count = last-first+1; + + for (cur=first;cur<=last;cur++) { + +#if USE_UPCALL_MAGIC_PF + PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",cur); + core->vm_info->cores[cur].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS; + if (v3_raise_exception_with_error(&core->vm_info->cores[cur], + PF_EXCEPTION, + UPCALL_MAGIC_ERROR)) { + PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",cur); + core->vm_regs.rax = -1; + break; + } +#else + PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %llu\n",h->hrt_int_vector,cur); + if (v3_raise_swintr(&core->vm_info->cores[cur],h->hrt_int_vector)) { + PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",cur); + core->vm_regs.rax = -1; + break; + } +#endif + // Force core to exit now + v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0); + + } + if (core->vm_regs.rax==0) { + if (a1==0x20) { + h->trans_state = HRT_CALL; + } else { + h->trans_state = HRT_PARCALL; + } + } else { + PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n"); + h->trans_state = HRT_IDLE; + h->trans_count = 0; + } + } + } + break; - V3_Print(core->vm_info,core, "hvm: received hypercall %x rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n", - hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, c-core->hvm_state.last_boot_start, core->num_exits); - //v3_print_core_telemetry(core); - // v3_print_guest_state(core); + case 0x2f: // function exec done + if (v3_is_hvm_ros_core(core)) { + PrintError(core->vm_info,core, "hvm: request for exec done from ROS core\n"); + core->vm_regs.rax=-1; + } else { + if (ENFORCE_STATE_MACHINE && 
h->trans_state!=HRT_CALL && h->trans_state!=HRT_PARCALL) { + PrintError(core->vm_info,core,"hvm: function completion when not in HRT_CALL or HRT_PARCALL state\n"); + core->vm_regs.rax=-1; + } else { + uint64_t one=1; + PrintDebug(core->vm_info,core, "hvm: function complete\n"); + if (__sync_fetch_and_sub(&h->trans_count,one)==1) { + // last one, switch state + h->trans_state=HRT_IDLE; + PrintDebug(core->vm_info,core, "hvm: function complete - back to idle\n"); + } + core->vm_regs.rax=0; + } + } + + break; + + case 0x30: // merge address space + case 0x31: // unmerge address space + if (v3_is_hvm_hrt_core(core)) { + PrintError(core->vm_info,core,"hvm: request to %smerge address space from HRT core\n", a1==0x30 ? "" : "un"); + core->vm_regs.rax=-1; + } else { + if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { + PrintError(core->vm_info,core,"hvm: request to %smerge address space in non-idle state\n",a1==0x30 ? "" : "un"); + core->vm_regs.rax=-1; + } else { + uint64_t *page = (uint64_t *) h->comm_page_hva; + + PrintDebug(core->vm_info,core,"hvm: %smerge address space request with %p\n",a1==0x30 ? 
"" : "un",(void*)core->ctrl_regs.cr3); + // should sanity check to make sure guest is in 64 bit without anything strange + + page[0] = a1; + page[1] = core->ctrl_regs.cr3; // this is a do-not-care for an unmerge + + core->vm_regs.rax = 0; +#if USE_UPCALL_MAGIC_PF + PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %u\n",h->first_hrt_core); + core->vm_info->cores[h->first_hrt_core].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS; + if (v3_raise_exception_with_error(&core->vm_info->cores[h->first_hrt_core], + PF_EXCEPTION, + UPCALL_MAGIC_ERROR)) { + PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %u\n",h->first_hrt_core); + core->vm_regs.rax = -1; + break; + } +#else + PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %u\n",h->hrt_int_vector,h->first_hrt_core); + if (v3_raise_swintr(&core->vm_info->cores[h->first_hrt_core],h->hrt_int_vector)) { + PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %u\n",h->first_hrt_core); + core->vm_regs.rax = -1; + } +#endif + // Force core to exit now + v3_interrupt_cpu(core->vm_info,core->vm_info->cores[h->first_hrt_core].pcpu_id,0); + + h->trans_state = HRT_MERGE; + } + + } + + break; + + + case 0x3f: // merge operation done + if (v3_is_hvm_ros_core(core)) { + PrintError(core->vm_info,core, "hvm: request for merge done from ROS core\n"); + core->vm_regs.rax=-1; + } else { + if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_MERGE) { + PrintError(core->vm_info,core,"hvm: merge/unmerge done when in non-idle state\n"); + core->vm_regs.rax=-1; + } else { + PrintDebug(core->vm_info,core, "hvm: merge or unmerge complete - back to idle\n"); + h->trans_state=HRT_IDLE; + core->vm_regs.rax=0; + } + } + + break; + + default: + PrintError(core->vm_info,core,"hvm: unknown hypercall %llx\n",a1); + core->vm_regs.rax=-1; + break; + } + return 0; } @@ -153,7 +373,7 @@ int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config) } vm->hvm_state.first_hrt_gpa = 
((uint64_t)atoi(ros_mem))*1024*1024; - + if (!(hrt_config=v3_cfg_subtree(hvm_config,"hrt"))) { PrintError(vm,VCORE_NONE,"hvm: HVM configuration without HRT block...\n"); return -1; @@ -206,6 +426,15 @@ int v3_deinit_hvm_vm(struct v3_vm_info *vm) v3_remove_hypercall(vm,HVM_HCALL); + if (vm->hvm_state.comm_page_hpa) { + struct v3_mem_region *r = v3_get_mem_region(vm,-1,(addr_t)vm->hvm_state.comm_page_hpa); + if (!r) { + PrintError(vm,VCORE_NONE,"hvm: odd, VM has comm_page_hpa, but no shadow memory\n"); + } else { + v3_delete_mem_region(vm,r); + } + } + return 0; } @@ -331,15 +560,15 @@ void v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_in #define MAX(x,y) ((x)>(y)?(x):(y)) #define MIN(x,y) ((x)<(y)?(x):(y)) -#ifdef HVM_MAP_1G_2M -#define BOOT_STATE_END_ADDR (MIN(vm->mem_size,0x40000000ULL)) -#else -#define BOOT_STATE_END_ADDR (MIN(vm->mem_size,0x800000000ULL)) -#endif +static uint64_t boot_state_end_addr(struct v3_vm_info *vm) +{ + return PAGE_ADDR(vm->mem_size); +} + static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) { - *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR - PAGE_SIZE); + *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - PAGE_SIZE); *limit = PAGE_SIZE; } @@ -391,7 +620,7 @@ static void write_null_int_handler(struct v3_vm_info *vm) static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) { - *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR - 2 * PAGE_SIZE); + *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - 2 * PAGE_SIZE); *limit = 16*256; } @@ -403,7 +632,7 @@ static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) // 3 ist => (stack) = 0 => current stack // 5 reserved => 0 // 4 type => 0xe=>INT, 0xf=>TRAP -// 1 reserved => 0 +// 1 reserved => 0 (indicates "system" by being zero) // 2 dpl => 0 // 1 present => 1 // 16 offsetmid => 0 @@ -414,7 +643,7 @@ static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) // // Note little endian 
// -static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 } ; +static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 } ; static uint64_t idt64_int_gate_entry_mask[2] = { 0x00008e0000080000, 0x0 }; static void write_idt(struct v3_vm_info *vm) @@ -431,6 +660,8 @@ static void write_idt(struct v3_vm_info *vm) get_null_int_handler_loc(vm,&handler,&handler_len); + handler += vm->hvm_state.gva_offset; + memcpy(trap_gate,idt64_trap_gate_entry_mask,16); memcpy(int_gate,idt64_int_gate_entry_mask,16); @@ -469,7 +700,7 @@ static void write_idt(struct v3_vm_info *vm) static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) { - *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR - 3 * PAGE_SIZE); + *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 3 * PAGE_SIZE); *limit = 8*3; } @@ -494,7 +725,7 @@ static void write_gdt(struct v3_vm_info *vm) static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) { - *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR - 4 * PAGE_SIZE); + *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 4 * PAGE_SIZE); *limit = PAGE_SIZE; } @@ -510,159 +741,307 @@ static void write_tss(struct v3_vm_info *vm) PrintDebug(vm,VCORE_NONE,"hvm: wrote TSS at %p\n",base); } + +#define TOP_HALF_START 0xffff800000000000ULL +#define BOTTOM_HALF_END 0x00007fffffffffffULL + + +#define L4_UNIT PAGE_SIZE +#define L3_UNIT (512ULL * L4_UNIT) +#define L2_UNIT (512ULL * L3_UNIT) +#define L1_UNIT (512ULL * L2_UNIT) + +static void compute_pts_4KB(struct v3_vm_info *vm, + uint64_t *l1, uint64_t *l2, uint64_t *l3, uint64_t *l4) +{ + + // we map the physical memory up to max_mem_mapped either at 0x0 or at TOP_HALF start + // that is, it either fills in the first 256 rows of PML4 or the last 256 rows of PML4 + // so it is the same number of page tables regardless + + uint64_t max_gva = vm->hvm_state.max_mem_mapped; + + *l1 = 1; // 1 PML4 + *l2 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*512ULL*4096ULL),512); + *l3 = 
CEIL_DIV(CEIL_DIV(max_gva,512ULL*4096ULL),512); + *l4 = CEIL_DIV(CEIL_DIV(max_gva,4096ULL),512); +} + + + /* - PTS MAP FIRST 512 GB identity mapped: - 1 second level - 512 entries + PTS MAP using 1 GB pages + n second levels pts, highest gva, highest address 1 top level - 1 entries + OR - PTS MAP FIRST 1 GB identity mapped: - 1 third level - 512 entries - 1 second level - 1 entries - 1 top level - 1 entries + PTS MAP using 2 MB pages + n third level pts, highest gva, highest address + m second level pts, highest gva, highest address + 1 top level pt + +OR + + PTS MAP using 4 KB pages + n 4th level, highest gva, highest address + m 3rd level, highest gva, hihgest address + l second level, highest gva, highest address + 1 top level pt + +OR + PTS MAP using 512 GB pages when this becomes available + */ + static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) { -#ifdef HVM_MAP_1G_2M - *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR-(5+2)*PAGE_SIZE); - *limit = 3*PAGE_SIZE; -#else - *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR-(5+1)*PAGE_SIZE); - *limit = 2*PAGE_SIZE; -#endif + uint64_t l1,l2,l3,l4; + uint64_t num_pt; + + compute_pts_4KB(vm,&l1,&l2,&l3,&l4); + + if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { + num_pt = l1; + } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { + num_pt = l1 + l2; + } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) { + num_pt = l1 + l2 + l3; + } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { + num_pt = l1 + l2 + l3 + l4; + } else { + PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT location flags=0x%llx memsize=0x%llx\n",vm->hvm_state.hrt_flags,(uint64_t)vm->mem_size); + return; + } + + *base = (void*)PAGE_ADDR(boot_state_end_addr(vm)-(4+num_pt)*PAGE_SIZE); + *limit = num_pt*PAGE_SIZE; } -#ifndef HVM_MAP_1G_2M -static void write_pt_2level_512GB(struct v3_vm_info *vm) +static void write_pts(struct v3_vm_info *vm) { - void *base; uint64_t size; - 
struct pml4e64 pml4e; - struct pdpe64 pdpe; - uint64_t i; - - get_pt_loc(vm,&base, &size); - if (size!=2*PAGE_SIZE) { - PrintError(vm,VCORE_NONE,"Cannot support pt request, defaulting\n"); + uint64_t num_l1, num_l2, num_l3, num_l4; + void *start_l1, *start_l2, *start_l3, *start_l4; + uint64_t max_level; + void *cur_pt; + void *cur_gva; + void *cur_gpa; + void *min_gpa = 0; + void *max_gpa = (void*) vm->hvm_state.max_mem_mapped; + void *min_gva = (void*) vm->hvm_state.gva_offset; +#ifdef V3_CONFIG_DEBUG_HVM + void *max_gva = min_gva+vm->hvm_state.max_mem_mapped; +#endif + uint64_t i, pt; + uint64_t i_start,i_end; + + struct pml4e64 *pml4e; + struct pdpe64 *pdpe; + struct pde64 *pde; + struct pte64 *pte; + + if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { + PrintError(vm,VCORE_NONE,"hvm: Attempt to build 512 GB pages\n"); + max_level = 1; + } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { + max_level = 2; + } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) { + max_level = 3; + } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { + max_level = 4; + } else { + PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT levels\n"); + return; } - if (vm->mem_size > 0x800000000ULL) { - PrintError(vm,VCORE_NONE, "VM has more than 512 GB\n"); + get_pt_loc(vm,&start_l1,&size); + compute_pts_4KB(vm,&num_l1,&num_l2,&num_l3,&num_l4); + + start_l2=start_l1+PAGE_SIZE*num_l1; + start_l3=start_l2+PAGE_SIZE*num_l2; + start_l4=start_l3+PAGE_SIZE*num_l3; + + PrintDebug(vm,VCORE_NONE,"hvm: writing %llu levels of PTs start at address %p\n", max_level,start_l1); + PrintDebug(vm,VCORE_NONE,"hvm: min_gva=%p, max_gva=%p, min_gpa=%p, max_gpa=%p\n",min_gva,max_gva,min_gpa,max_gpa); + PrintDebug(vm,VCORE_NONE,"hvm: num_l1=%llu, num_l2=%llu, num_l3=%llu, num_l4=%llu\n", num_l1, num_l2, num_l3, num_l4); + PrintDebug(vm,VCORE_NONE,"hvm: start_l1=%p, start_l2=%p, start_l3=%p, start_l4=%p\n", start_l1, start_l2, start_l3, start_l4); + + 
cur_pt=start_l1; + + // build PML4 (only one) + if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pml4e)) { + PrintError(vm,VCORE_NONE,"hvm: Cannot translate pml4 location\n"); + return; } - memset(&pdpe,0,sizeof(pdpe)); - pdpe.present=1; - pdpe.writable=1; - pdpe.large_page=1; - - for (i=0;i<512;i++) { - pdpe.pd_base_addr = i*0x40000; // 0x4000 = 256K pages = 1 GB - v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+PAGE_SIZE+i*sizeof(pdpe)),sizeof(pdpe),(uint8_t*)&pdpe); + memset(pml4e,0,PAGE_SIZE); + + if (min_gva==0x0) { + i_start=0; i_end = num_l2; + } else if (min_gva==(void*)TOP_HALF_START) { + i_start=256; i_end=256+num_l2; + } else { + PrintError(vm,VCORE_NONE,"hvm: unsupported gva offset\n"); + return; } - memset(&pml4e,0,sizeof(pml4e)); - pml4e.present=1; - pml4e.writable=1; - pml4e.pdp_base_addr = PAGE_BASE_ADDR((addr_t)(base+PAGE_SIZE)); + for (i=i_start, cur_gva=min_gva, cur_gpa=min_gpa; + (icores[0],(addr_t)base,sizeof(pml4e),(uint8_t*)&pml4e); + pml4e[i].present=1; + pml4e[i].writable=1; + + if (max_level==1) { + PrintError(vm,VCORE_NONE,"hvm: Intel has not yet defined a PML4E large page\n"); + pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa)); + //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr); + } else { + pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(start_l2+(i-i_start)*PAGE_SIZE)); + //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr); + } + } - for (i=1;i<512;i++) { - pml4e.present=0; - v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*sizeof(pml4e)),sizeof(pml4e),(uint8_t*)&pml4e); + // 512 GB only + if (max_level==1) { + return; } - PrintDebug(vm,VCORE_NONE,"hvm: Wrote page tables (1 PML4, 1 PDPE) at %p (512 GB mapped)\n",base); -} -#else -static void write_pt_3level_1GB(struct v3_vm_info *vm) -{ - void *base; - uint64_t size; - struct pml4e64 pml4e; - struct pdpe64 pdpe; - struct pde64 pde; + for 
(cur_pt=start_l2, pt=0, cur_gpa=min_gpa, cur_gva=min_gva; + ptcores[0],(addr_t)cur_pt,(addr_t*)&pdpe)) { + PrintError(vm,VCORE_NONE,"hvm: Cannot translate pdpe location\n"); + return; + } + + memset(pdpe,0,PAGE_SIZE); + + for (i=0; + i<512 && cur_gpamem_size > 0x40000000ULL) { - PrintError(vm,VCORE_NONE, "VM has more than 1 GB\n"); + + //1 GB only + if (max_level==2) { + return; } - memset(&pde,0,sizeof(pde)); - pde.present=1; - pde.writable=1; - pde.large_page=1; - - for (i=0;i<512;i++) { - pde.pt_base_addr = i*0x200; // 0x200 = 512 pages = 2 MB - v3_write_gpa_memory(&vm->cores[0], - (addr_t)(base+2*PAGE_SIZE+i*sizeof(pde)), - sizeof(pde),(uint8_t*)&pde); - } + for (cur_pt=start_l3, pt=0, cur_gpa=min_gpa, cur_gva=min_gva; + ptcores[0],(addr_t)cur_pt,(addr_t*)&pde)) { + PrintError(vm,VCORE_NONE,"hvm: Cannot translate pde location\n"); + return; + } + + memset(pde,0,PAGE_SIZE); + + for (i=0; + i<512 && cur_gpacores[0],(addr_t)base+PAGE_SIZE,sizeof(pdpe),(uint8_t*)&pdpe); - - for (i=1;i<512;i++) { - pdpe.present = 0; - v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+PAGE_SIZE+i*sizeof(pdpe)),sizeof(pdpe),(uint8_t*)&pdpe); + //2 MB only + if (max_level==3) { + return; } - memset(&pml4e,0,sizeof(pml4e)); - pml4e.present=1; - pml4e.writable=1; - pml4e.pdp_base_addr = PAGE_BASE_ADDR((addr_t)(base+PAGE_SIZE)); - v3_write_gpa_memory(&vm->cores[0],(addr_t)base,sizeof(pml4e),(uint8_t*)&pml4e); + // 4 KB + for (cur_pt=start_l4, pt=0, cur_gpa=min_gpa, cur_gva=min_gva; + ptcores[0],(addr_t)(base+i*sizeof(pml4e)),sizeof(pml4e),(uint8_t*)&pml4e); + // build PTE + if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pte)) { + PrintError(vm,VCORE_NONE,"hvm: Cannot translate pte location\n"); + return; + } + + memset(pte,0,PAGE_SIZE); + + for (i=0; + i<512 && cur_gpavm_info; + + hrt->tag.type = MB_INFO_HRT_TAG; + hrt->tag.size = sizeof(mb_info_hrt_t); + + hrt->total_num_apics = vm->num_cores; + hrt->first_hrt_apic_id = vm->hvm_state.first_hrt_core; + hrt->have_hrt_ioapic=0; + 
hrt->first_hrt_ioapic_entry=0; + + hrt->cpu_freq_khz = V3_CPU_KHZ(); + + hrt->hrt_flags = vm->hvm_state.hrt_flags; + hrt->max_mem_mapped = vm->hvm_state.max_mem_mapped; + hrt->first_hrt_gpa = vm->hvm_state.first_hrt_gpa; + hrt->gva_offset = vm->hvm_state.gva_offset; + hrt->comm_page_gpa = vm->hvm_state.comm_page_gpa; + hrt->hrt_int_vector = vm->hvm_state.hrt_int_vector; + + return 0; } static void write_mb_info(struct v3_vm_info *vm) @@ -753,77 +1132,151 @@ static mb_header_t *find_mb_header(uint8_t *data, uint64_t size) } -// -// BROKEN - THIS DOES NOT DO WHAT YOU THINK -// -static int setup_elf(struct v3_vm_info *vm, void *base, uint64_t limit) +static int configure_hrt(struct v3_vm_info *vm, mb_data_t *mb) { - v3_write_gpa_memory(&vm->cores[0],(addr_t)base,vm->hvm_state.hrt_file->size,vm->hvm_state.hrt_file->data); - - vm->hvm_state.hrt_entry_addr = (uint64_t) (base+0x40); - - PrintDebug(vm,VCORE_NONE,"hvm: wrote HRT ELF %s at %p\n", vm->hvm_state.hrt_file->tag,base); - PrintDebug(vm,VCORE_NONE,"hvm: set ELF entry to %p and hoping for the best...\n", (void*) vm->hvm_state.hrt_entry_addr); + struct v3_vm_hvm *h = &vm->hvm_state; + uint64_t f = mb->mb64_hrt->hrt_flags; + uint64_t maxmap = mb->mb64_hrt->max_mem_to_map; + uint64_t gvaoff = mb->mb64_hrt->gva_offset; + uint64_t gvaentry = mb->mb64_hrt->gva_entry; + uint64_t commgpa = mb->mb64_hrt->comm_page_gpa; + uint8_t vec = mb->mb64_hrt->hrt_int_vector; - vm->hvm_state.hrt_type = HRT_ELF64; - return 0; + PrintDebug(vm,VCORE_NONE,"hvm: HRT request: flags=0x%llx max_map=0x%llx gva_off=%llx gva_entry=%llx comm_page=0x%llx vector=0x%x\n", + f, maxmap, gvaoff,gvaentry,commgpa, vec); -} + if (maxmap<0x100000000ULL) { + PrintDebug(vm,VCORE_NONE,"hvm: revising request up to 4 GB max map\n"); + maxmap=0x100000000ULL; + } -static int setup_mb_kernel(struct v3_vm_info *vm, void *base, uint64_t limit) -{ - mb_data_t mb; + if (f & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { + PrintError(vm,VCORE_NONE,"hvm: support for 512 GB pages is 
not yet available in hardware\n"); + return -1; + } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { + f &= ~0x3c; + f |= MB_TAG_MB64_HRT_FLAG_MAP_1GB; + h->max_mem_mapped = maxmap; + PrintDebug(vm,VCORE_NONE,"hvm: 1 GB pages selected\n"); + } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_2MB) { + f &= ~0x3c; + f |= MB_TAG_MB64_HRT_FLAG_MAP_2MB; + h->max_mem_mapped = maxmap; + PrintDebug(vm,VCORE_NONE,"hvm: 2 MB pages selected\n"); + } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { + f &= ~0x3c; + f |= MB_TAG_MB64_HRT_FLAG_MAP_4KB; + h->max_mem_mapped = maxmap; + PrintDebug(vm,VCORE_NONE,"hvm: 4 KB pages selected\n"); + } else { + PrintError(vm,VCORE_NONE,"hvm: no page table model is requested\n"); + return -1; + } - if (v3_parse_multiboot_header(vm->hvm_state.hrt_file,&mb)) { - PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n"); + if (f & MB_TAG_MB64_HRT_FLAG_RELOC) { + PrintError(vm,VCORE_NONE,"hvm: relocatable hrt not currently supported\n"); return -1; } + h->hrt_flags = f; - if (v3_write_multiboot_kernel(vm,&mb,vm->hvm_state.hrt_file,base,limit)) { - PrintError(vm,VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n"); + if (maxmap>h->max_mem_mapped) { + PrintError(vm,VCORE_NONE,"hvm: requested 0x%llx bytes mapped, which is more than currently supported\n",maxmap); return -1; } - /* - if (!mb.addr || !mb.entry) { - PrintError(vm,VCORE_NONE, "hvm: kernel is missing address or entry point\n"); + if (gvaoff!=0 && gvaoff!=TOP_HALF_START) { + PrintError(vm,VCORE_NONE,"hvm: currently only GVA offsets of 0 and %llx are supported\n", TOP_HALF_START); return -1; } + + h->gva_offset = gvaoff; - if (((void*)(uint64_t)(mb.addr->header_addr) < base ) || - ((void*)(uint64_t)(mb.addr->load_end_addr) > base+limit) || - ((void*)(uint64_t)(mb.addr->bss_end_addr) > base+limit)) { - PrintError(vm,VCORE_NONE, "hvm: kernel is not within the allowed portion of HVM\n"); + h->gva_entry = gvaentry; + + if (mb->addr->load_addr < h->first_hrt_gpa) { + 
PrintError(vm,VCORE_NONE,"hvm: load start address of HRT is below first HRT GPA\n"); return -1; } + + if (mb->addr->bss_end_addr > (vm->mem_size-(1024*1024*64))) { + PrintError(vm,VCORE_NONE,"hvm: bss end address of HRT above last allowed GPA\n"); + return -1; + } + + if (vec<32) { + PrintError(vm,VCORE_NONE,"hvm: cannot support vector %x\n",vec); + return -1; + } + + h->hrt_int_vector = vec; + + + if (commgpa < vm->mem_size) { + PrintError(vm,VCORE_NONE,"hvm: cannot map comm page over physical memory\n"); + return -1; + } - offset = mb.addr->load_addr - mb.addr->header_addr; + h->comm_page_gpa = commgpa; - // Skip the ELF header - assume 1 page... weird.... - // FIX ME TO CONFORM TO MULTIBOOT.C - v3_write_gpa_memory(&vm->cores[0], - (addr_t)(mb.addr->load_addr), - vm->hvm_state.hrt_file->size-PAGE_SIZE-offset, - vm->hvm_state.hrt_file->data+PAGE_SIZE+offset); + if (!h->comm_page_hpa) { + if (!(h->comm_page_hpa=V3_AllocPages(1))) { + PrintError(vm,VCORE_NONE,"hvm: unable to allocate space for comm page\n"); + return -1; + } + h->comm_page_hva = V3_VAddr(h->comm_page_hpa); - // vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + PAGE_SIZE; //HACK PAD + memset(h->comm_page_hva,0,PAGE_SIZE_4KB); + + if (v3_add_shadow_mem(vm,-1,h->comm_page_gpa,h->comm_page_gpa+PAGE_SIZE_4KB,(addr_t)h->comm_page_hpa)) { + PrintError(vm,VCORE_NONE,"hvm: unable to map communication page\n"); + V3_FreePages((void*)(h->comm_page_gpa),1); + return -1; + } + + + PrintDebug(vm,VCORE_NONE,"hvm: added comm page for first time\n"); + } + memset(h->comm_page_hva,0,PAGE_SIZE_4KB); + + + PrintDebug(vm,VCORE_NONE,"hvm: HRT configuration: flags=0x%llx max_mem_mapped=0x%llx gva_offset=0x%llx gva_entry=0x%llx comm_page=0x%llx vector=0x%x\n", + h->hrt_flags,h->max_mem_mapped, h->gva_offset,h->gva_entry, h->comm_page_gpa, h->hrt_int_vector); + + return 0; - PrintDebug(vm,VCORE_NONE, - "hvm: wrote 0x%llx bytes starting at offset 0x%llx to %p; set entry to %p\n", - (uint64_t) 
vm->hvm_state.hrt_file->size-PAGE_SIZE-offset, - (uint64_t) PAGE_SIZE+offset, - (void*)(addr_t)(mb.addr->load_addr), - (void*) vm->hvm_state.hrt_entry_addr); +} +static int setup_mb_kernel_hrt(struct v3_vm_info *vm) +{ + mb_data_t mb; - */ + if (v3_parse_multiboot_header(vm->hvm_state.hrt_file,&mb)) { + PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n"); + return -1; + } - vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr; + if (configure_hrt(vm,&mb)) { + PrintError(vm,VCORE_NONE, "hvm: cannot configure HRT\n"); + return -1; + } + if (v3_write_multiboot_kernel(vm,&mb,vm->hvm_state.hrt_file, + (void*)vm->hvm_state.first_hrt_gpa, + vm->mem_size-vm->hvm_state.first_hrt_gpa)) { + PrintError(vm,VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n"); + return -1; + } + + if (vm->hvm_state.gva_entry) { + vm->hvm_state.hrt_entry_addr = vm->hvm_state.gva_entry; + } else { + vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + vm->hvm_state.gva_offset; + } + vm->hvm_state.hrt_type = HRT_MBOOT64; return 0; @@ -833,37 +1286,17 @@ static int setup_mb_kernel(struct v3_vm_info *vm, void *base, uint64_t limit) static int setup_hrt(struct v3_vm_info *vm) { - void *base; - uint64_t limit; - - get_hrt_loc(vm,&base,&limit); - - if (vm->hvm_state.hrt_file->size > limit) { - PrintError(vm,VCORE_NONE,"hvm: Cannot map HRT because it is too big (%llu bytes, but only have %llu space\n", vm->hvm_state.hrt_file->size, (uint64_t)limit); - return -1; - } + if (is_elf(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size) && + find_mb_header(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) { - if (!is_elf(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) { - PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not an ELF but we are going to act like it is!\n"); - if (setup_elf(vm,base,limit)) { - PrintError(vm,VCORE_NONE,"hvm: Fake ELF setup failed\n"); + PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot 
kernel\n"); + if (setup_mb_kernel_hrt(vm)) { + PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n"); return -1; - } - vm->hvm_state.hrt_type=HRT_BLOB; + } } else { - if (find_mb_header(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) { - PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n"); - if (setup_mb_kernel(vm,base,limit)) { - PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n"); - return -1; - } - } else { - PrintDebug(vm,VCORE_NONE,"hvm: supplied HRT is an ELF\n"); - if (setup_elf(vm,base,limit)) { - PrintError(vm,VCORE_NONE,"hvm: Fake ELF setup failed\n"); - return -1; - } - } + PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not a multiboot kernel\n"); + return -1; } return 0; @@ -887,8 +1320,9 @@ static int setup_hrt(struct v3_vm_info *vm) GDT (1 page - page aligned) TSS (1 page - page asligned) PAGETABLES (identy map of first N GB) - ROOT PT first, followed by 2nd level, etc. - Currently PML4 followed by 1 PDPE for 512 GB of mapping + ROOT PT first (lowest memory addr), followed by 2nd level PTs in order, + followed by 3rd level PTs in order, followed by 4th level + PTs in order. 
MBINFO_PAGE SCRATCH_STACK_HRT_CORE0 SCRATCH_STACK_HRT_CORE1 @@ -898,7 +1332,8 @@ static int setup_hrt(struct v3_vm_info *vm) HRT (as many pages as needed, page-aligned, starting at first HRT address) --- ROS - + + */ @@ -911,20 +1346,22 @@ int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm) PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory begins\n"); + if (setup_hrt(vm)) { + PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n"); + return -1; + } + + // the locations of all the other items are determined by + // the HRT setup, so these must happen after + write_null_int_handler(vm); write_idt(vm); write_gdt(vm); write_tss(vm); - write_pt(vm); + write_pts(vm); - - if (setup_hrt(vm)) { - PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n"); - return -1; - } - - // need to parse HRT first + // this must happen last write_mb_info(vm); PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory done\n"); @@ -940,7 +1377,7 @@ int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm) TS points to stub TSS CR3 points to root page table CR0 has PE and PG - EFER has LME AND LMA + EFER has LME AND LMA (and NX for compatibility with Linux) RSP is TOS of core's scratch stack (looks like a call) RAX = MB magic cookie @@ -948,6 +1385,8 @@ int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm) RCX = this core id / apic id (0..N-1) RDX = this core id - first HRT core ID (==0 for the first HRT core) + All addresses are virtual addresses, offset as needed by gva_offset + Other regs are zeroed shadow/nested paging state reset for long mode @@ -957,18 +1396,20 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) { void *base; uint64_t limit; + uint64_t gva_offset; rdtscll(core->hvm_state.last_boot_start); + if (!core->hvm_state.is_hrt) { PrintDebug(core->vm_info,core,"hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id); return 0; } - PrintDebug(core->vm_info, core, "hvm: setting up HRT core (%u) for boot\n", core->vcpu_id); - + PrintDebug(core->vm_info, core, 
"hvm: setting up HRT core (%u) for boot\n", core->vcpu_id); + gva_offset = core->vm_info->hvm_state.gva_offset; memset(&core->vm_regs,0,sizeof(core->vm_regs)); memset(&core->ctrl_regs,0,sizeof(core->ctrl_regs)); @@ -990,7 +1431,7 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) // multiboot info pointer get_mb_info_loc(core->vm_info, &base,&limit); - core->vm_regs.rbx = (uint64_t) base; + core->vm_regs.rbx = (uint64_t) base + gva_offset; // core number core->vm_regs.rcx = core->vcpu_id; @@ -1001,6 +1442,7 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) // Now point to scratch stack for this core // it begins at an ofset relative to the MB info page get_mb_info_loc(core->vm_info, &base,&limit); + base = base + gva_offset; base -= core->vm_regs.rdx * SCRATCH_STACK_SIZE; core->vm_regs.rsp = (v3_reg_t) base; core->vm_regs.rbp = (v3_reg_t) base-8; @@ -1008,14 +1450,19 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) // push onto the stack a bad rbp and bad return address core->vm_regs.rsp-=16; v3_set_gpa_memory(core, - core->vm_regs.rsp, + core->vm_regs.rsp-gva_offset, 16, 0xff); // HRT entry point get_hrt_loc(core->vm_info, &base,&limit); - core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr ; + if (core->vm_info->hvm_state.gva_entry) { + core->rip = core->vm_info->hvm_state.gva_entry; + } else { + core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr + gva_offset; + } + PrintDebug(core->vm_info,core,"hvm: hrt core %u has rip=%p, rsp=%p, rbp=%p, rax=%p, rbx=%p, rcx=%p, rdx=%p\n", @@ -1036,7 +1483,7 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) // CR2: don't care (output from #PF) // CE3: set to our PML4E, without setting PCD or PWT get_pt_loc(core->vm_info, &base,&limit); - core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base); + core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base); // not offset as this is a GPA core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3; // CR4: PGE, PAE, PSE (last byte: 1 0 1 
1 0 0 0 0) @@ -1044,8 +1491,8 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4; // CR8 as usual // RFLAGS zeroed is fine: come in with interrupts off - // EFER needs SVME LMA LME (last 16 bits: 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 - core->ctrl_regs.efer = 0x1500; + // EFER needs SVME LMA LME (last 16 bits: 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 + core->ctrl_regs.efer = 0x1d00; core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer; @@ -1065,47 +1512,50 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) // Install our stub IDT get_idt_loc(core->vm_info, &base,&limit); + base += gva_offset; core->segments.idtr.selector = 0; // entry 0 (NULL) of the GDT - core->segments.idtr.base = (addr_t) base; + core->segments.idtr.base = (addr_t) base; // only base+limit are used core->segments.idtr.limit = limit-1; - core->segments.idtr.type = 0xe; - core->segments.idtr.system = 1; + core->segments.idtr.type = 0x0; + core->segments.idtr.system = 0; core->segments.idtr.dpl = 0; - core->segments.idtr.present = 1; - core->segments.idtr.long_mode = 1; + core->segments.idtr.present = 0; + core->segments.idtr.long_mode = 0; // Install our stub GDT get_gdt_loc(core->vm_info, &base,&limit); - core->segments.gdtr.selector = 0; + base += gva_offset; + core->segments.gdtr.selector = 0; // entry 0 (NULL) of the GDT core->segments.gdtr.base = (addr_t) base; - core->segments.gdtr.limit = limit-1; - core->segments.gdtr.type = 0x6; - core->segments.gdtr.system = 1; + core->segments.gdtr.limit = limit-1; // only base+limit are used + core->segments.gdtr.type = 0x0; + core->segments.gdtr.system = 0; core->segments.gdtr.dpl = 0; - core->segments.gdtr.present = 1; - core->segments.gdtr.long_mode = 1; + core->segments.gdtr.present = 0; + core->segments.gdtr.long_mode = 0; // And our TSS get_tss_loc(core->vm_info, &base,&limit); + base += gva_offset; core->segments.tr.selector = 0; core->segments.tr.base = (addr_t) base; 
core->segments.tr.limit = limit-1; - core->segments.tr.type = 0x6; - core->segments.tr.system = 1; + core->segments.tr.type = 0x9; + core->segments.tr.system = 0; // available 64 bit TSS core->segments.tr.dpl = 0; core->segments.tr.present = 1; - core->segments.tr.long_mode = 1; + core->segments.tr.long_mode = 0; // not used - base = 0x0; + base = 0x0; // these are not offset as we want to make all gvas visible limit = -1; // And CS core->segments.cs.selector = 0x8 ; // entry 1 of GDT (RPL=0) - core->segments.cs.base = (addr_t) base; - core->segments.cs.limit = limit; - core->segments.cs.type = 0xe; - core->segments.cs.system = 0; - core->segments.cs.dpl = 0; + core->segments.cs.base = (addr_t) base; // not used + core->segments.cs.limit = limit; // not used + core->segments.cs.type = 0xe; // only C is used + core->segments.cs.system = 1; // not a system segment + core->segments.cs.dpl = 0; core->segments.cs.present = 1; core->segments.cs.long_mode = 1; @@ -1113,8 +1563,8 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) core->segments.ds.selector = 0x10; // entry 2 of GDT (RPL=0) core->segments.ds.base = (addr_t) base; core->segments.ds.limit = limit; - core->segments.ds.type = 0x6; - core->segments.ds.system = 0; + core->segments.ds.type = 0x6; // ignored + core->segments.ds.system = 1; // not a system segment core->segments.ds.dpl = 0; core->segments.ds.present = 1; core->segments.ds.long_mode = 1; @@ -1184,6 +1634,7 @@ int v3_handle_hvm_reset(struct guest_info *core) if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) { // leader core->vm_info->run_state = VM_RUNNING; + core->vm_info->hvm_state.trans_state = HRT_IDLE; } v3_counting_barrier(&core->vm_info->reset_barrier); diff --git a/palacios/src/palacios/vmm_hvm_lowlevel.S b/palacios/src/palacios/vmm_hvm_lowlevel.S index b6d1a76..ffd63f2 100644 --- a/palacios/src/palacios/vmm_hvm_lowlevel.S +++ b/palacios/src/palacios/vmm_hvm_lowlevel.S @@ -1,7 +1,7 @@ /* HVM environment code and data */ 
/* This must match the number in vmm_hypercall.h */ -#define HVM_HCALL $0xf000 +#define HVM_HCALL $0xf00d /* The default installed interrupt handlers simply hcall @@ -45,4 +45,4 @@ v3_hvm_vmx_null_int_handler_end: - \ No newline at end of file + diff --git a/palacios/src/palacios/vmm_multiboot.c b/palacios/src/palacios/vmm_multiboot.c index c007291..a9a20b3 100644 --- a/palacios/src/palacios/vmm_multiboot.c +++ b/palacios/src/palacios/vmm_multiboot.c @@ -144,94 +144,6 @@ int v3_deinit_multiboot_core(struct guest_info *core) #define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"multiboot: " fmt,##args) - -/****************************************************************** - Data contained in the ELF file we will attempt to boot -******************************************************************/ - -#define ELF_MAGIC 0x464c457f -#define MB2_MAGIC 0xe85250d6 - - -/****************************************************************** - Data we will pass to the kernel via rbx -******************************************************************/ - -#define MB2_INFO_MAGIC 0x36d76289 - -typedef struct mb_info_header { - uint32_t totalsize; - uint32_t reserved; -} __attribute__((packed)) mb_info_header_t; - -// A tag of type 0, size 8 indicates last value -// -typedef struct mb_info_tag { - uint32_t type; - uint32_t size; -} __attribute__((packed)) mb_info_tag_t; - - -#define MB_INFO_MEM_TAG 4 -typedef struct mb_info_mem { - mb_info_tag_t tag; - uint32_t mem_lower; // 0..640K in KB - uint32_t mem_upper; // in KB to first hole - 1 MB -} __attribute__((packed)) mb_info_mem_t; - -#define MB_INFO_CMDLINE_TAG 1 -// note alignment of 8 bytes required for each... 
-typedef struct mb_info_cmdline { - mb_info_tag_t tag; - uint32_t size; // includes zero termination - uint8_t string[]; // zero terminated -} __attribute__((packed)) mb_info_cmdline_t; - - -#define MEM_RAM 1 -#define MEM_ACPI 3 -#define MEM_RESV 4 - -typedef struct mb_info_memmap_entry { - uint64_t base_addr; - uint64_t length; - uint32_t type; - uint32_t reserved; -} __attribute__((packed)) mb_info_memmap_entry_t; - -#define MB_INFO_MEMMAP_TAG 6 -// note alignment of 8 bytes required for each... -typedef struct mb_info_memmap { - mb_info_tag_t tag; - uint32_t entry_size; // multiple of 8 - uint32_t entry_version; // 0 - mb_info_memmap_entry_t entries[]; -} __attribute__((packed)) mb_info_memmap_t; - -#define MB_INFO_HRT_TAG 0xf00df00d -typedef struct mb_info_hrt { - mb_info_tag_t tag; - // apic ids are 0..num_apics-1 - // apic and ioapic addresses are the well known places - uint32_t total_num_apics; - uint32_t first_hrt_apic_id; - uint32_t have_hrt_ioapic; - uint32_t first_hrt_ioapic_entry; - uint64_t first_hrt_addr; -} __attribute__((packed)) mb_info_hrt_t; - - -// We are not doing: -// -// - BIOS Boot Devie -// - Modules -// - ELF symbols -// - Boot Loader name -// - APM table -// - VBE info -// - Framebuffer info -// - static int is_elf(uint8_t *data, uint64_t size) { if (*((uint32_t*)data)==ELF_MAGIC) { @@ -382,7 +294,7 @@ static int parse_multiboot_kernel(uint8_t *data, uint64_t size, mb_data_t *mb) INFO(" size = 0x%x\n", mb_modalign->size); } break; -#if 0 +#if 1 case MB_TAG_MB64_HRT: { if (mb_mb64_hrt) { ERROR("Multiple mb64_hrt tags found!\n"); @@ -539,13 +451,7 @@ uint64_t v3_build_multiboot_table(struct guest_info *core, uint8_t *dest, uint64 #ifdef V3_CONFIG_HVM if (core->vm_info->hvm_state.is_hvm && v3_is_hvm_hrt_core(core)) { - hrt->tag.type = MB_INFO_HRT_TAG; - hrt->tag.size = sizeof(mb_info_hrt_t); - hrt->total_num_apics = vm->num_cores; - hrt->first_hrt_apic_id = vm->hvm_state.first_hrt_core; - hrt->have_hrt_ioapic=0; - 
hrt->first_hrt_ioapic_entry=0; - hrt->first_hrt_addr = vm->hvm_state.first_hrt_gpa; + v3_build_hrt_multiboot_tag(core,hrt); } #endif