From: Peter Dinda Date: Tue, 19 May 2015 16:27:47 +0000 (-0500) Subject: Addition of basic multiboot functionality plus refactor of HVM X-Git-Url: http://v3vee.org/palacios/gitweb/gitweb.cgi?p=palacios.git;a=commitdiff_plain;h=3650bf4aaa4f83afae52c8fee98fce6e3ee68deb Addition of basic multiboot functionality plus refactor of HVM functionality to extend multiboot --- diff --git a/Kconfig b/Kconfig index 5288b22..e8cae68 100644 --- a/Kconfig +++ b/Kconfig @@ -57,6 +57,8 @@ config OTHER_OS endchoice +source "Kconfig.stdlibs" + config CRAY_XT bool "Red Storm (Cray XT3/XT4)" help @@ -260,185 +262,7 @@ config DEBUG_CHECKPOINT endmenu - -source "Kconfig.stdlibs" - - -menu "Virtual Paging" - -config NESTED_PAGING - bool "Enable nested paging" - default y - help - Enable nested paging (should always be on) - -config SHADOW_PAGING - bool "Enable shadow paging" - default y - help - Enables shadow paging for virtual machines - - -config SHADOW_PAGING_VTLB - bool "Virtual TLB" - default y - depends on SHADOW_PAGING - help - Enables Virtual TLB implemenation for shadow paging - Virtual TLB now uses PAE so there are no 4 GB restrictions - - -config DEBUG_SHDW_PG_VTLB - bool "Enable VTLB debugging" - default n - depends on SHADOW_PAGING_VTLB - help - Enables debugging messages for VTLB implementation - -config SHADOW_PAGING_CACHE - bool "Shadow Page Cache" - default n - depends on SHADOW_PAGING && EXPERIMENTAL - help - Enables caching implementation of shadow paging - -config DEBUG_SHADOW_PAGING_CACHE - bool "Enable Shadow Page Cache Debugging" - default n - depends on SHADOW_PAGING_CACHE - help - Enables debugging messages for the VTLB + Caching implementation - -#config SHADOW_PAGING_KVM -# bool "KVM-style Shadow Pager" -# default n -# depends on SHADOW_PAGING && EXPERIMENTAL -# help -# Enables shadow pager derived from KVM -# You probably do not want this and it will probably not compile! -# -#config DEBUG_SHADOW_PAGING_KVM -# bool "Enable KVM-style Shadow Pager Debugging" -# default n -# depends on SHADOW_PAGING_KVM -# help -# Enables debugging messages for the KVM-style shadow pager - - -config SWAPPING - bool "Enable swapping" - default n - depends on (SHADOW_PAGING || NESTED_PAGING) && FILE - help - Enables swapping of regions of guest physical memory to a file - -config DEBUG_SWAPPING - bool "Enable swapping debugging" - default n - depends on SWAPPING - help - Provides debugging output from the swapping system - -config MEM_TRACK - bool "Enable memory access tracking" - default n - depends on SHADOW_PAGING || NESTED_PAGING - help - Allows tracking of memory accesses on a page granularity - -config DEBUG_MEM_TRACK - bool "Enable memory access tracking debugging" - default n - depends on MEM_TRACK - help - Provides debugging output for memory access tracking - -endmenu - -menu "Symbiotic Functions" - -config SYMBIOTIC - bool "Enable Symbiotic Functionality" - default n - help - Enable Symbiotic components of the VMM. - This includes the SymSpy interface. 
- -config SYMCALL - bool "Symbiotic upcalls" - default n - depends on SYMBIOTIC && EXPERIMENTAL - help - Enables the Symbiotic upcall interface - -config SWAPBYPASS - bool "SwapBypass" - default n - depends on SYMBIOTIC && SYMCALL && EXPERIMENTAL - help - This enables the SwapBypass architecture - -config SWAPBYPASS_TELEMETRY - bool "Enable SwapBypass Telemetry" - default n - depends on TELEMETRY && SWAPBYPASS - help - Enable the telemetry information for the SwapBypass subsystem - -menuconfig SYMMOD - bool "Symbiotic Modules" - default n - depends on EXPERIMENTAL -# depends on SYMBIOTIC - help - Enable Symbiotic module loading - - -endmenu - -menu "VNET" - -config VNET - bool "Enable Vnet in Palacios" - default n - help - Enable the Vnet in Palacios - -config DEBUG_VNET - depends on VNET - bool "Enable Vnet Debug in Palacios" - default n - help - Enable the Vnet debug in Palacios - - -endmenu - -source "palacios/src/gears/Kconfig" - - -menu "HVM" - -config HVM - bool "Support Hybrid Virtual Machines" - default n - help - If set, it is possible to make VMs that are partitioned - (cores, memory, devices, hardware access, etc) into - a part ("the ROS") that supports normal VM operation and - a part ("the HRT") that supports Hybrid Run-Times, - for example Nautilus-based HRTs for parallel languages. - -config DEBUG_HVM - depends on HVM - bool "Enable HVM debugging in Palacios" - default n - help - Enable HVM debugging output - -endmenu - -menu "Debug configuration" +menu "Debug Configuration" ## Is unwind information useful @@ -585,7 +409,106 @@ config DEBUG_MEM_ALLOC endmenu -menu "BIOS Selection" + + +menu "Virtual Paging" + +config NESTED_PAGING + bool "Enable nested paging" + default y + help + Enable nested paging (should always be on) + +config SHADOW_PAGING + bool "Enable shadow paging" + default y + help + Enables shadow paging for virtual machines + + +config SHADOW_PAGING_VTLB + bool "Virtual TLB" + default y + depends on SHADOW_PAGING + help + Enables Virtual TLB implemenation for shadow paging + Virtual TLB now uses PAE so there are no 4 GB restrictions + + +config DEBUG_SHDW_PG_VTLB + bool "Enable VTLB debugging" + default n + depends on SHADOW_PAGING_VTLB + help + Enables debugging messages for VTLB implementation + +config SHADOW_PAGING_CACHE + bool "Shadow Page Cache" + default n + depends on SHADOW_PAGING && EXPERIMENTAL + help + Enables caching implementation of shadow paging + +config DEBUG_SHADOW_PAGING_CACHE + bool "Enable Shadow Page Cache Debugging" + default n + depends on SHADOW_PAGING_CACHE + help + Enables debugging messages for the VTLB + Caching implementation + +#config SHADOW_PAGING_KVM +# bool "KVM-style Shadow Pager" +# default n +# depends on SHADOW_PAGING && EXPERIMENTAL +# help +# Enables shadow pager derived from KVM +# You probably do not want this and it will probably not compile! 
+# +#config DEBUG_SHADOW_PAGING_KVM +# bool "Enable KVM-style Shadow Pager Debugging" +# default n +# depends on SHADOW_PAGING_KVM +# help +# Enables debugging messages for the KVM-style shadow pager + + +config SWAPPING + bool "Enable swapping" + default n + depends on (SHADOW_PAGING || NESTED_PAGING) && FILE + help + Enables swapping of regions of guest physical memory to a file + +config DEBUG_SWAPPING + bool "Enable swapping debugging" + default n + depends on SWAPPING + help + Provides debugging output from the swapping system + +config MEM_TRACK + bool "Enable memory access tracking" + default n + depends on SHADOW_PAGING || NESTED_PAGING + help + Allows tracking of memory accesses on a page granularity + +config DEBUG_MEM_TRACK + bool "Enable memory access tracking debugging" + default n + depends on MEM_TRACK + help + Provides debugging output for memory access tracking + +endmenu + + +source "palacios/src/devices/Kconfig" + +menu "Boot Environments" + + +menu "BIOS" choice prompt "Boot Code Selection" @@ -613,7 +536,6 @@ config OTHERBIOS endchoice - config SEABIOS_PATH string "Path to pre-built SEABIOS binary" depends on SEABIOS @@ -666,8 +588,116 @@ config VMXASSIST_PATH This is vmxassist image to boot real mode guests on Intel VMX Platforms +endmenu + +menu Multiboot + +config MULTIBOOT + bool "Support Multiboot2-compliant boot" + default y + help + If set, it is possible to boot a multiboot2 compliant + kernel directly. + +config DEBUG_MULTIBOOT + depends on MULTIBOOT + bool "Enable Multiboot2 debugging in Palacios" + default n + help + Enable Multiboot2 debugging output + endmenu -source "palacios/src/devices/Kconfig" +endmenu + +menu "Symbiosis" + +config SYMBIOTIC + bool "Enable Symbiotic Functionality" + default n + help + Enable Symbiotic components of the VMM. + This includes the SymSpy interface. + +config SYMCALL + bool "Symbiotic upcalls" + default n + depends on SYMBIOTIC && EXPERIMENTAL + help + Enables the Symbiotic upcall interface + +config SWAPBYPASS + bool "SwapBypass" + default n + depends on SYMBIOTIC && SYMCALL && EXPERIMENTAL + help + This enables the SwapBypass architecture + +config SWAPBYPASS_TELEMETRY + bool "Enable SwapBypass Telemetry" + default n + depends on TELEMETRY && SWAPBYPASS + help + Enable the telemetry information for the SwapBypass subsystem + +menuconfig SYMMOD + bool "Symbiotic Modules" + default n + depends on EXPERIMENTAL +# depends on SYMBIOTIC + help + Enable Symbiotic module loading + + +endmenu + +menu "VNET" + +config VNET + bool "Enable Vnet in Palacios" + default n + help + Enable the Vnet in Palacios + +config DEBUG_VNET + depends on VNET + bool "Enable Vnet Debug in Palacios" + default n + help + Enable the Vnet debug in Palacios + + +endmenu + +source "palacios/src/gears/Kconfig" + + +menu HVM + +config HVM + bool "Support Hybrid Virtual Machines" + depends on MULTIBOOT + default n + help + If set, it is possible to make VMs that are partitioned + (cores, memory, devices, hardware access, etc) into + a part ("the ROS") that supports normal VM operation and + a part ("the HRT") that supports Hybrid Run-Times, + for example Nautilus-based HRTs for parallel languages. 
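+# An HVM guest's HRT image is supplied via the hrt_file_id handled in
+# palacios/src/palacios/vmm_hvm.c; with MULTIBOOT enabled that image may be
+# a raw blob, a plain ELF, or a Multiboot2-compliant kernel.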
+ +config DEBUG_HVM + depends on HVM + bool "Enable HVM debugging in Palacios" + default n + help + Enable HVM debugging output + +endmenu + + + + + + diff --git a/palacios/include/palacios/vm_guest.h b/palacios/include/palacios/vm_guest.h index 3029c3c..e646c86 100644 --- a/palacios/include/palacios/vm_guest.h +++ b/palacios/include/palacios/vm_guest.h @@ -69,6 +69,10 @@ struct v3_sym_core_state; #include #endif +#ifdef V3_CONFIG_MULTIBOOT +#include +#endif + #ifdef V3_CONFIG_HVM #include #endif @@ -264,10 +268,15 @@ struct v3_vm_info { struct v3_vm_mem_track memtrack_state; #endif +#ifdef V3_CONFIG_MULTIBOOT + struct v3_vm_multiboot mb_state; +#endif + #ifdef V3_CONFIG_HVM struct v3_vm_hvm hvm_state; #endif + uint64_t yield_cycle_period; diff --git a/palacios/include/palacios/vmm_hvm.h b/palacios/include/palacios/vmm_hvm.h index 6d145c2..576828c 100644 --- a/palacios/include/palacios/vmm_hvm.h +++ b/palacios/include/palacios/vmm_hvm.h @@ -30,10 +30,13 @@ struct v3_vm_hvm { uint32_t first_hrt_core; uint64_t first_hrt_gpa; struct v3_cfg_file *hrt_file; + uint64_t hrt_entry_addr; + enum { HRT_BLOB, HRT_ELF64, HRT_MBOOT2, HRT_MBOOT64 } hrt_type; }; struct v3_core_hvm { uint8_t is_hrt; + uint64_t last_boot_start; }; struct v3_xml; diff --git a/palacios/src/palacios/Makefile b/palacios/src/palacios/Makefile index 0ca0f57..1bbce2a 100644 --- a/palacios/src/palacios/Makefile +++ b/palacios/src/palacios/Makefile @@ -93,6 +93,7 @@ obj-$(V3_CONFIG_SYMMOD) += vmm_symmod.o obj-$(V3_CONFIG_MEM_TRACK) += vmm_mem_track.o +obj-$(V3_CONFIG_MULTIBOOT) += vmm_multiboot.o obj-$(V3_CONFIG_HVM) += vmm_hvm.o vmm_hvm_lowlevel.o obj-y += mmu/ diff --git a/palacios/src/palacios/svm.c b/palacios/src/palacios/svm.c index 05fd183..0e37c4c 100644 --- a/palacios/src/palacios/svm.c +++ b/palacios/src/palacios/svm.c @@ -850,6 +850,13 @@ int v3_start_svm_guest(struct guest_info * info) { PrintDebug(info->vm_info, info, "Starting SVM core %u (on logical core %u)\n", info->vcpu_id, info->pcpu_id); +#ifdef V3_CONFIG_MULTIBOOT + if (v3_setup_multiboot_core_for_boot(info)) { + PrintError(info->vm_info, info, "Failed to setup Multiboot core...\n"); + return -1; + } +#endif + #ifdef V3_CONFIG_HVM if (v3_setup_hvm_hrt_core_for_boot(info)) { PrintError(info->vm_info, info, "Failed to setup HRT core...\n"); diff --git a/palacios/src/palacios/vm_guest.c b/palacios/src/palacios/vm_guest.c index d325e6b..41158e7 100644 --- a/palacios/src/palacios/vm_guest.c +++ b/palacios/src/palacios/vm_guest.c @@ -322,6 +322,11 @@ int v3_free_vm_internal(struct v3_vm_info * vm) { v3_deinit_hvm_vm(vm); #endif +#ifdef V3_CONFIG_MULTIBOOT + v3_deinit_multiboot_vm(vm); +#endif + + #ifdef V3_CONFIG_SYMBIOTIC v3_deinit_symbiotic_vm(vm); #endif @@ -474,6 +479,10 @@ int v3_free_core(struct guest_info * core) { v3_deinit_hvm_core(core); #endif +#ifdef V3_CONFIG_MULTIBOOT + v3_deinit_multiboot_core(core); +#endif + v3_deinit_decoder(core); v3_deinit_intr_controllers(core); diff --git a/palacios/src/palacios/vmm.c b/palacios/src/palacios/vmm.c index b7f45cd..6257294 100644 --- a/palacios/src/palacios/vmm.c +++ b/palacios/src/palacios/vmm.c @@ -154,6 +154,10 @@ void Init_V3(struct v3_os_hooks * hooks, char * cpu_mask, int num_cpus, char *op // Parse host-os defined options into an easily-accessed format. 
v3_parse_options(options); +#ifdef V3_CONFIG_MULTIBOOT + v3_init_multiboot(); +#endif + #ifdef V3_CONFIG_HVM v3_init_hvm(); #endif @@ -263,6 +267,10 @@ void Shutdown_V3() { v3_deinit_hvm(); #endif +#ifdef V3_CONFIG_MULTIBOOT + v3_deinit_multiboot(); +#endif + v3_deinit_options(); @@ -385,6 +393,12 @@ int v3_start_vm(struct v3_vm_info * vm, unsigned int cpu_mask) { return -1; } +#if V3_CONFIG_MULTIBOOT + if (v3_setup_multiboot_vm_for_boot(vm)) { + PrintError(vm, VCORE_NONE, "Multiboot setup for boot failed\n"); + return -1; + } +#endif #if V3_CONFIG_HVM if (v3_setup_hvm_vm_for_boot(vm)) { PrintError(vm, VCORE_NONE, "HVM setup for boot failed\n"); diff --git a/palacios/src/palacios/vmm_config.c b/palacios/src/palacios/vmm_config.c index 38cfb70..7987957 100644 --- a/palacios/src/palacios/vmm_config.c +++ b/palacios/src/palacios/vmm_config.c @@ -37,6 +37,10 @@ #include #endif +#ifdef V3_CONFIG_MULTIBOOT +#include +#endif + #ifdef V3_CONFIG_HVM #include #endif @@ -360,6 +364,12 @@ static int pre_config_vm(struct v3_vm_info * vm, v3_cfg_tree_t * vm_cfg) { return -1; } +#ifdef V3_CONFIG_MULTIBOOT + if (v3_init_multiboot_vm(vm,vm_cfg)) { + PrintError(vm,VCORE_NONE,"Cannot initialize Multiboot for VM\n"); + return -1; + } +#endif #ifdef V3_CONFIG_HVM if (v3_init_hvm_vm(vm,vm_cfg)) { PrintError(vm,VCORE_NONE,"Cannot initialize HVM for VM\n"); @@ -434,6 +444,12 @@ static int pre_config_core(struct guest_info * info, v3_cfg_tree_t * core_cfg) { return -1; } +#ifdef V3_CONFIG_MULTIBOOT + if (v3_init_multiboot_core(info)) { + PrintError(info->vm_info, info, "Error Initializing Multiboot Core\n"); + return -1; + } +#endif #ifdef V3_CONFIG_HVM if (v3_init_hvm_core(info)) { PrintError(info->vm_info, info, "Error Initializing HVM Core\n"); diff --git a/palacios/src/palacios/vmm_hvm.c b/palacios/src/palacios/vmm_hvm.c index b1f7013..edff2fd 100644 --- a/palacios/src/palacios/vmm_hvm.c +++ b/palacios/src/palacios/vmm_hvm.c @@ -28,8 +28,8 @@ #include -#include -#include +#include + /* @@ -68,6 +68,13 @@ #define PrintDebug(fmt, args...) #endif + +// if set, we will map the first 1 GB of memory using a 3 level +// hierarchy, for compatibility with Nautilus out of the box. 
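+// (Three levels here means PML4 -> PDPT -> PD, with the single PD's 512
+// entries written as 2 MB large pages, i.e. exactly the first 1 GB.)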
+// Otherwise we will map the first 512 GB using a 2 level +// hieratchy +#define HVM_MAP_1G_2M 1 + int v3_init_hvm() { PrintDebug(VM_NONE,VCORE_NONE, "hvm: init\n"); @@ -83,8 +90,16 @@ int v3_deinit_hvm() static int hvm_hcall_handler(struct guest_info * core , hcall_id_t hcall_id, void * priv_data) { - V3_Print(core->vm_info,core, "hvm: received hypercall %x rax=%llx rbx=%llx rcx=%llx\n", - hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx); + uint64_t c; + + rdtscll(c); + + + V3_Print(core->vm_info,core, "hvm: received hypercall %x rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n", + hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, c-core->hvm_state.last_boot_start, core->num_exits); + v3_print_core_telemetry(core); + // v3_print_guest_state(core); + return 0; } @@ -98,7 +113,7 @@ int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config) char *enable; char *ros_cores; char *ros_mem; - char *hrt_file_id; + char *hrt_file_id=0; PrintDebug(vm, VCORE_NONE, "hvm: vm init\n"); @@ -317,10 +332,18 @@ void v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_in } } +#define MAX(x,y) ((x)>(y)?(x):(y)) +#define MIN(x,y) ((x)<(y)?(x):(y)) + +#ifdef HVM_MAP_1G_2M +#define BOOT_STATE_END_ADDR (MIN(vm->mem_size,0x40000000ULL)) +#else +#define BOOT_STATE_END_ADDR (MIN(vm->mem_size,0x800000000ULL)) +#endif static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) { - *base = (void*) PAGE_ADDR(vm->mem_size - PAGE_SIZE); + *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR - PAGE_SIZE); *limit = PAGE_SIZE; } @@ -372,7 +395,7 @@ static void write_null_int_handler(struct v3_vm_info *vm) static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) { - *base = (void*) PAGE_ADDR(vm->mem_size - 2 * PAGE_SIZE); + *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR - 2 * PAGE_SIZE); *limit = 16*256; } @@ -450,7 +473,7 @@ static void write_idt(struct v3_vm_info *vm) static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) { - *base = (void*)PAGE_ADDR(vm->mem_size - 3 * PAGE_SIZE); + *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR - 3 * PAGE_SIZE); *limit = 8*3; } @@ -475,7 +498,7 @@ static void write_gdt(struct v3_vm_info *vm) static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) { - *base = (void*)PAGE_ADDR(vm->mem_size - 4 * PAGE_SIZE); + *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR - 4 * PAGE_SIZE); *limit = PAGE_SIZE; } @@ -501,15 +524,31 @@ static void write_tss(struct v3_vm_info *vm) 512 entries 1 top level 1 entries + +OR + + PTS MAP FIRST 1 GB identity mapped: + 1 third level + 512 entries + 1 second level + 1 entries + 1 top level + 1 entries */ static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) { - *base = (void*)PAGE_ADDR(vm->mem_size-(5+1)*PAGE_SIZE); +#ifdef HVM_MAP_1G_2M + *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR-(5+2)*PAGE_SIZE); + *limit = 3*PAGE_SIZE; +#else + *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR-(5+1)*PAGE_SIZE); *limit = 2*PAGE_SIZE; +#endif } -static void write_pt(struct v3_vm_info *vm) +#ifndef HVM_MAP_1G_2M +static void write_pt_2level_512GB(struct v3_vm_info *vm) { void *base; uint64_t size; @@ -522,6 +561,10 @@ static void write_pt(struct v3_vm_info *vm) PrintError(vm,VCORE_NONE,"Cannot support pt request, defaulting\n"); } + if (vm->mem_size > 0x800000000ULL) { + PrintError(vm,VCORE_NONE, "VM has more than 512 GB\n"); + } + 
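+    // The two pages written below are one PML4 page whose first entry points
+    // to one PDPT page whose 512 entries are written as 1 GB large pages,
+    // identity-mapping the first 512 GB (hence the size check above).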
memset(&pdpe,0,sizeof(pdpe)); pdpe.present=1; pdpe.writable=1; @@ -544,12 +587,89 @@ static void write_pt(struct v3_vm_info *vm) v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*sizeof(pml4e)),sizeof(pml4e),(uint8_t*)&pml4e); } - PrintDebug(vm,VCORE_NONE,"hvm: Wrote page tables (1 PML4, 1 PDPE) at %p\n",base); + PrintDebug(vm,VCORE_NONE,"hvm: Wrote page tables (1 PML4, 1 PDPE) at %p (512 GB mapped)\n",base); +} + +#else + +static void write_pt_3level_1GB(struct v3_vm_info *vm) +{ + void *base; + uint64_t size; + struct pml4e64 pml4e; + struct pdpe64 pdpe; + struct pde64 pde; + + uint64_t i; + + get_pt_loc(vm,&base, &size); + if (size!=3*PAGE_SIZE) { + PrintError(vm,VCORE_NONE,"Cannot support pt request, defaulting\n"); + } + + if (vm->mem_size > 0x40000000ULL) { + PrintError(vm,VCORE_NONE, "VM has more than 1 GB\n"); + } + + memset(&pde,0,sizeof(pde)); + pde.present=1; + pde.writable=1; + pde.large_page=1; + + for (i=0;i<512;i++) { + pde.pt_base_addr = i*0x200; // 0x200 = 512 pages = 2 MB + v3_write_gpa_memory(&vm->cores[0], + (addr_t)(base+2*PAGE_SIZE+i*sizeof(pde)), + sizeof(pde),(uint8_t*)&pde); + } + + memset(&pdpe,0,sizeof(pdpe)); + pdpe.present=1; + pdpe.writable=1; + pdpe.large_page=0; + + pdpe.pd_base_addr = PAGE_BASE_ADDR((addr_t)(base+2*PAGE_SIZE)); + + v3_write_gpa_memory(&vm->cores[0],(addr_t)base+PAGE_SIZE,sizeof(pdpe),(uint8_t*)&pdpe); + + for (i=1;i<512;i++) { + pdpe.present = 0; + v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+PAGE_SIZE+i*sizeof(pdpe)),sizeof(pdpe),(uint8_t*)&pdpe); + } + + memset(&pml4e,0,sizeof(pml4e)); + pml4e.present=1; + pml4e.writable=1; + pml4e.pdp_base_addr = PAGE_BASE_ADDR((addr_t)(base+PAGE_SIZE)); + + v3_write_gpa_memory(&vm->cores[0],(addr_t)base,sizeof(pml4e),(uint8_t*)&pml4e); + + for (i=1;i<512;i++) { + pml4e.present=0; + v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*sizeof(pml4e)),sizeof(pml4e),(uint8_t*)&pml4e); + } + + PrintDebug(vm,VCORE_NONE,"hvm: Wrote page tables (1 PML4, 1 PDPE, 1 PDP) at %p (1 GB mapped)\n",base); +} + +#endif + +static void write_pt(struct v3_vm_info *vm) +{ +#ifdef HVM_MAP_1G_2M + write_pt_3level_1GB(vm); +#else + write_pt_2level_512GB(vm); +#endif } static void get_bp_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) { - *base = (void*) PAGE_ADDR(vm->mem_size-(6+1)*PAGE_SIZE); +#ifdef HVM_MAP_1G_2M + *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR-(6+2)*PAGE_SIZE); +#else + *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR-(6+1)*PAGE_SIZE); +#endif *limit = PAGE_SIZE; } @@ -593,22 +713,146 @@ static void get_hrt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit) *limit = bp_base - *base; } -static void write_hrt(struct v3_vm_info *vm) + +#define ERROR(fmt, args...) PrintError(VM_NONE,VCORE_NONE,"hvm: " fmt,##args) +#define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"hvm: " fmt,##args) + +#define ELF_MAGIC 0x464c457f +#define MB2_MAGIC 0xe85250d6 + +#define MB2_INFO_MAGIC 0x36d76289 + +static int is_elf(uint8_t *data, uint64_t size) +{ + if (*((uint32_t*)data)==ELF_MAGIC) { + return 1; + } else { + return 0; + } +} + +static mb_header_t *find_mb_header(uint8_t *data, uint64_t size) +{ + uint64_t limit = size > 32768 ? 
32768 : size; + uint64_t i; + + // Scan for the .boot magic cookie + // must be in first 32K, assume 4 byte aligned + for (i=0;icores[0],(addr_t)base,vm->hvm_state.hrt_file->size,vm->hvm_state.hrt_file->data); + + vm->hvm_state.hrt_entry_addr = (uint64_t) (base+0x40); + + PrintDebug(vm,VCORE_NONE,"hvm: wrote HRT ELF %s at %p\n", vm->hvm_state.hrt_file->tag,base); + PrintDebug(vm,VCORE_NONE,"hvm: set ELF entry to %p and hoping for the best...\n", (void*) vm->hvm_state.hrt_entry_addr); + + vm->hvm_state.hrt_type = HRT_ELF64; + + return 0; + +} + +static int setup_mb_kernel(struct v3_vm_info *vm, void *base, uint64_t limit) +{ + mb_data_t mb; + uint32_t offset; + + + // FIX USING GENERIC TOOLS + + if (v3_parse_multiboot_header(vm->hvm_state.hrt_file,&mb)) { + PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n"); + return -1; + } + + if (!mb.addr || !mb.entry) { + PrintError(vm,VCORE_NONE, "hvm: kernel is missing address or entry point\n"); + return -1; + } + + if (((void*)(uint64_t)(mb.addr->header_addr) < base ) || + ((void*)(uint64_t)(mb.addr->load_end_addr) > base+limit) || + ((void*)(uint64_t)(mb.addr->bss_end_addr) > base+limit)) { + PrintError(vm,VCORE_NONE, "hvm: kernel is not within the allowed portion of HVM\n"); + return -1; + } + + offset = mb.addr->load_addr - mb.addr->header_addr; + + // Skip the ELF header - assume 1 page... weird.... + v3_write_gpa_memory(&vm->cores[0], + (addr_t)(mb.addr->load_addr), + vm->hvm_state.hrt_file->size-PAGE_SIZE-offset, + vm->hvm_state.hrt_file->data+PAGE_SIZE+offset); + + + // vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + PAGE_SIZE; //HACK PAD + + vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr; + + vm->hvm_state.hrt_type = HRT_MBOOT64; + + PrintDebug(vm,VCORE_NONE, + "hvm: wrote 0x%llx bytes starting at offset 0x%llx to %p; set entry to %p\n", + (uint64_t) vm->hvm_state.hrt_file->size-PAGE_SIZE-offset, + (uint64_t) PAGE_SIZE+offset, + (void*)(addr_t)(mb.addr->load_addr), + (void*) vm->hvm_state.hrt_entry_addr); + return 0; + +} + + +static int setup_hrt(struct v3_vm_info *vm) { void *base; uint64_t limit; get_hrt_loc(vm,&base,&limit); - + if (vm->hvm_state.hrt_file->size > limit) { PrintError(vm,VCORE_NONE,"hvm: Cannot map HRT because it is too big (%llu bytes, but only have %llu space\n", vm->hvm_state.hrt_file->size, (uint64_t)limit); - return; + return -1; } - v3_write_gpa_memory(&vm->cores[0],(addr_t)base,vm->hvm_state.hrt_file->size,vm->hvm_state.hrt_file->data); + if (!is_elf(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) { + PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not an ELF but we are going to act like it is!\n"); + if (setup_elf(vm,base,limit)) { + PrintError(vm,VCORE_NONE,"hvm: Fake ELF setup failed\n"); + return -1; + } + vm->hvm_state.hrt_type=HRT_BLOB; + } else { + if (find_mb_header(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) { + PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n"); + if (setup_mb_kernel(vm,base,limit)) { + PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n"); + return -1; + } + } else { + PrintDebug(vm,VCORE_NONE,"hvm: supplied HRT is an ELF\n"); + if (setup_elf(vm,base,limit)) { + PrintError(vm,VCORE_NONE,"hvm: Fake ELF setup failed\n"); + return -1; + } + } + } - PrintDebug(vm,VCORE_NONE,"hvm: wrote HRT %s at %p\n", vm->hvm_state.hrt_file->tag,base); - + return 0; } @@ -659,7 +903,10 @@ int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm) write_bp(vm); - write_hrt(vm); + if (setup_hrt(vm)) { + 
PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n"); + return -1; + } PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory done\n"); @@ -693,6 +940,8 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) void *base; uint64_t limit; + rdtscll(core->hvm_state.last_boot_start); + if (!core->hvm_state.is_hrt) { PrintDebug(core->vm_info,core,"hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id); return 0; @@ -725,21 +974,27 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) core->vm_regs.rdi = (v3_reg_t) base; // HRT entry point get_hrt_loc(core->vm_info, &base,&limit); - core->rip = (uint64_t) base + 0x40; // hack for test.o + core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr ; // Setup CRs for long mode and our stub page table // CR0: PG, PE core->ctrl_regs.cr0 = 0x80000001; + core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0; + // CR2: don't care (output from #PF) // CE3: set to our PML4E, without setting PCD or PWT get_pt_loc(core->vm_info, &base,&limit); core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base); + core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3; + // CR4: PGE, PAE, PSE (last byte: 1 0 1 1 0 0 0 0) core->ctrl_regs.cr4 = 0xb0; + core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4; // CR8 as usual // RFLAGS zeroed is fine: come in with interrupts off - // EFER needs SVME LMA LME (last 16 bites: 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 + // EFER needs SVME LMA LME (last 16 bits: 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 core->ctrl_regs.efer = 0x1500; + core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer; /* @@ -817,10 +1072,83 @@ int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core) memcpy(&core->segments.fs,&core->segments.ds,sizeof(core->segments.ds)); memcpy(&core->segments.gs,&core->segments.ds,sizeof(core->segments.ds)); + + if (core->vm_info->hvm_state.hrt_type==HRT_MBOOT64) { + /* + Temporary hackery for multiboot2 "64" + We will push the MB structure onto the stack and update RSP + and RBX + */ + uint8_t buf[256]; + uint64_t size; + + if ((size=v3_build_multiboot_table(core,buf,256))==-1) { + PrintError(core->vm_info,core,"hvm: Failed to write MB info\n"); + return -1; + } + core->vm_regs.rsp -= size; + + v3_write_gpa_memory(core, + core->vm_regs.rsp, + size, + buf); + + PrintDebug(core->vm_info,core, "hvm: wrote MB info at %p\n", (void*)core->vm_regs.rsp); + + if (core->vcpu_id == core->vm_info->hvm_state.first_hrt_core) { + // We are the BSP for this HRT + // this is where rbx needs to point + core->vm_regs.rbx = core->vm_regs.rsp; + PrintDebug(core->vm_info,core, "hvm: \"BSP\" core\n"); + } else { + // We are an AP for this HRT + // so we don't get the multiboot struct + core->vm_regs.rbx = 0; + PrintDebug(core->vm_info,core, "hvm: \"AP\" core\n"); + } + + + + // one more push, something that looks like a return address + size=0; + core->vm_regs.rsp -= 8; + + v3_write_gpa_memory(core, + core->vm_regs.rsp, + 8, + (uint8_t*) &size); + + // Now for our magic - this signals + // the kernel that a multiboot loader loaded it + // and that rbx points to its offered data + core->vm_regs.rax = MB2_INFO_MAGIC; + + /* + Note that "real" MB starts in protected mode without paging + This hack starts in long mode... 
so these requirements go + out the window for a large part + + Requirements: + + OK EAX has magic + OK EBX points to MB info + OK CS = base 0, offset big, code (LONG MODE) + OK DS,ES,FS,GS,SS => base 0, offset big, data (LONG MODE) + OK A20 gate on + XXX CR0 PE on PG off (nope) + XXX EFLAGS IF and VM off + */ + + + + } + + // reset paging here for shadow... if (core->shdw_pg_mode != NESTED_PAGING) { PrintError(core->vm_info, core, "hvm: shadow paging guest... this will end badly\n"); + return -1; } diff --git a/palacios/src/palacios/vmm_multiboot.c b/palacios/src/palacios/vmm_multiboot.c new file mode 100644 index 0000000..0a7f60b --- /dev/null +++ b/palacios/src/palacios/vmm_multiboot.c @@ -0,0 +1,986 @@ +/* + * This file is part of the Palacios Virtual Machine Monitor developed + * by the V3VEE Project with funding from the United States National + * Science Foundation and the Department of Energy. + * + * The V3VEE Project is a joint project between Northwestern University + * and the University of New Mexico. You can find out more at + * http://www.v3vee.org + * + * Copyright (c) 2015, The V3VEE Project + * All rights reserved. + * + * Author: Peter Dinda + * + * This is free software. You are permitted to use, + * redistribute, and modify it as specified in the file "V3VEE_LICENSE". + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + + +/* + + In a Pal file: + + + + + + + + +*/ + +#ifndef V3_CONFIG_DEBUG_MULTIBOOT +#undef PrintDebug +#define PrintDebug(fmt, args...) +#endif + + +int v3_init_multiboot() +{ + PrintDebug(VM_NONE,VCORE_NONE, "multiboot: init\n"); + return 0; +} + +int v3_deinit_multiboot() +{ + PrintDebug(VM_NONE,VCORE_NONE, "multiboot: deinit\n"); + return 0; +} + + + +#define CEIL_DIV(x,y) (((x)/(y)) + !!((x)%(y))) + +int v3_init_multiboot_vm(struct v3_vm_info *vm, struct v3_xml *config) +{ + v3_cfg_tree_t *mb_config; + char *enable; + char *mb_file_id=0; + + PrintDebug(vm, VCORE_NONE, "multiboot: vm init\n"); + + memset(&vm->mb_state,0,sizeof(struct v3_vm_multiboot)); + vm->mb_state.is_multiboot=0; + + if (!config || !(mb_config=v3_cfg_subtree(config,"multiboot"))) { + PrintDebug(vm,VCORE_NONE,"multiboot: no multiboot configuration found - normal boot will occur\n"); + goto out_ok; + } + + if (!(enable=v3_cfg_val(mb_config,"enable")) || strcasecmp(enable,"y")) { + PrintDebug(vm,VCORE_NONE,"multiboot: multiboot configuration disabled\n"); + goto out_ok; + } + + if (!(mb_file_id=v3_cfg_val(mb_config,"file_id"))) { + PrintError(vm,VCORE_NONE,"multiboot: multiboot block without file_id...\n"); + return -1; + } + + vm->mb_state.mb_file = v3_cfg_get_file(vm,mb_file_id); + + if (!vm->mb_state.mb_file) { + PrintError(vm,VCORE_NONE,"multiboot: multiboot block contains bad file_id (%s)\n",mb_file_id); + return -1; + } + + vm->mb_state.is_multiboot=1; + + out_ok: + if (vm->mb_state.is_multiboot) { + V3_Print(vm,VCORE_NONE,"multiboot: file_id=%s (tag %s)]\n", + mb_file_id, + vm->mb_state.mb_file->tag); + } else { + V3_Print(vm,VCORE_NONE,"multiboot: This is not a multiboot VM\n"); + } + return 0; + +} + + +int v3_deinit_multiboot_vm(struct v3_vm_info *vm) +{ + PrintDebug(vm, VCORE_NONE, "multiboot: multiboot VM deinit\n"); + + return 0; +} + +int v3_init_multiboot_core(struct guest_info *core) +{ + PrintDebug(core->vm_info, VCORE_NONE, "multiboot: multiboot core init\n"); + + // Nothing to do at this point + + return 0; +} + +int v3_deinit_multiboot_core(struct guest_info *core) +{ + PrintDebug(core->vm_info, 
VCORE_NONE, "multiboot: multiboot core deinit\n"); + + return 0; +} + + + + +#define ERROR(fmt, args...) PrintError(VM_NONE,VCORE_NONE,"multiboot: " fmt,##args) +#define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"multiboot: " fmt,##args) + + + +/****************************************************************** + Data contained in the ELF file we will attempt to boot +******************************************************************/ + +#define ELF_MAGIC 0x464c457f +#define MB2_MAGIC 0xe85250d6 + + +/****************************************************************** + Data we will pass to the kernel via rbx +******************************************************************/ + +#define MB2_INFO_MAGIC 0x36d76289 + +typedef struct mb_info_header { + uint32_t totalsize; + uint32_t reserved; +} __attribute__((packed)) mb_info_header_t; + +// A tag of type 0, size 8 indicates last value +// +typedef struct mb_info_tag { + uint32_t type; + uint32_t size; +} __attribute__((packed)) mb_info_tag_t; + + +#define MB_INFO_MEM_TAG 4 +typedef struct mb_info_mem { + mb_info_tag_t tag; + uint32_t mem_lower; // 0..640K in KB + uint32_t mem_upper; // in KB to first hole - 1 MB +} __attribute__((packed)) mb_info_mem_t; + +#define MB_INFO_CMDLINE_TAG 1 +// note alignment of 8 bytes required for each... +typedef struct mb_info_cmdline { + mb_info_tag_t tag; + uint32_t size; // includes zero termination + uint8_t string[]; // zero terminated +} __attribute__((packed)) mb_info_cmdline_t; + + +#define MEM_RAM 1 +#define MEM_ACPI 3 +#define MEM_RESV 4 + +typedef struct mb_info_memmap_entry { + uint64_t base_addr; + uint64_t length; + uint32_t type; + uint32_t reserved; +} __attribute__((packed)) mb_info_memmap_entry_t; + +#define MB_INFO_MEMMAP_TAG 6 +// note alignment of 8 bytes required for each... +typedef struct mb_info_memmap { + mb_info_tag_t tag; + uint32_t entry_size; // multiple of 8 + uint32_t entry_version; // 0 + mb_info_memmap_entry_t entries[]; +} __attribute__((packed)) mb_info_memmap_t; + +#define MB_INFO_HRT_TAG 0xf00df00d +typedef struct mb_info_hrt { + mb_info_tag_t tag; + // apic ids are 0..num_apics-1 + // apic and ioapic addresses are the well known places + uint32_t total_num_apics; + uint32_t first_hrt_apic_id; + uint32_t have_hrt_ioapic; + uint32_t first_hrt_ioapic_entry; +} __attribute__((packed)) mb_info_hrt_t; + + +// We are not doing: +// +// - BIOS Boot Devie +// - Modules +// - ELF symbols +// - Boot Loader name +// - APM table +// - VBE info +// - Framebuffer info +// + +static int is_elf(uint8_t *data, uint64_t size) +{ + if (*((uint32_t*)data)==ELF_MAGIC) { + return 1; + } else { + return 0; + } +} + +static mb_header_t *find_mb_header(uint8_t *data, uint64_t size) +{ + uint64_t limit = size > 32768 ? 
32768 : size; + uint64_t i; + + // Scan for the .boot magic cookie + // must be in first 32K, assume 4 byte aligned + for (i=0;iarch, mb_header->headerlen); + + mb_tag = (mb_tag_t*)((void*)mb_header+16); + + while (!(mb_tag->type==0 && mb_tag->size==8)) { + INFO("tag: type 0x%x flags=0x%x size=0x%x\n",mb_tag->type, mb_tag->flags,mb_tag->size); + switch (mb_tag->type) { + case MB_TAG_INFO: { + if (mb_inf) { + ERROR("Multiple info tags found!\n"); + return -1; + } + mb_inf = (mb_info_t*)mb_tag; + INFO(" info request - types follow\n"); + for (i=0;(mb_tag->size-8)/4;i++) { + INFO(" %llu: type 0x%x\n", i, mb_inf->types[i]); + } + } + break; + + case MB_TAG_ADDRESS: { + if (mb_addr) { + ERROR("Multiple address tags found!\n"); + return -1; + } + mb_addr = (mb_addr_t*)mb_tag; + INFO(" address\n"); + INFO(" header_addr = 0x%x\n", mb_addr->header_addr); + INFO(" load_addr = 0x%x\n", mb_addr->load_addr); + INFO(" load_end_addr = 0x%x\n", mb_addr->load_end_addr); + INFO(" bss_end_addr = 0x%x\n", mb_addr->bss_end_addr); + } + break; + + case MB_TAG_ENTRY: { + if (mb_entry) { + ERROR("Multiple entry tags found!\n"); + return -1; + } + mb_entry=(mb_entry_t*)mb_tag; + INFO(" entry\n"); + INFO(" entry_addr = 0x%x\n", mb_entry->entry_addr); + } + break; + + case MB_TAG_FLAGS: { + if (mb_flags) { + ERROR("Multiple flags tags found!\n"); + return -1; + } + mb_flags = (mb_flags_t*)mb_tag; + INFO(" flags\n"); + INFO(" console_flags = 0x%x\n", mb_flags->console_flags); + } + break; + + case MB_TAG_FRAMEBUF: { + if (mb_framebuf) { + ERROR("Multiple framebuf tags found!\n"); + return -1; + } + mb_framebuf = (mb_framebuf_t*)mb_tag; + INFO(" framebuf\n"); + INFO(" width = 0x%x\n", mb_framebuf->width); + INFO(" height = 0x%x\n", mb_framebuf->height); + INFO(" depth = 0x%x\n", mb_framebuf->depth); + } + break; + + case MB_TAG_MODALIGN: { + if (mb_modalign) { + ERROR("Multiple modalign tags found!\n"); + return -1; + } + mb_modalign = (mb_modalign_t*)mb_tag; + INFO(" modalign\n"); + INFO(" size = 0x%x\n", mb_modalign->size); + } + break; +#if 0 + case MB_TAG_MB64_HRT: { + if (mb_mb64_hrt) { + ERROR("Multiple mb64_hrt tags found!\n"); + return -1; + } + mb_mb64_hrt = (mb_mb64_hrt_t*)mb_tag; + INFO(" mb64_hrt\n"); + } + break; +#endif + + default: + INFO("Unknown tag... Skipping...\n"); + break; + } + mb_tag = (mb_tag_t *)(((void*)mb_tag) + mb_tag->size); + } + + // copy out to caller + mb->header=mb_header; + mb->info=mb_inf; + mb->addr=mb_addr; + mb->entry=mb_entry; + mb->flags=mb_flags; + mb->framebuf=mb_framebuf; + mb->modalign=mb_modalign; + mb->mb64_hrt=mb_mb64_hrt; + + return 0; +} + + +int v3_parse_multiboot_header(struct v3_cfg_file *file, mb_data_t *result) +{ + return parse_multiboot_kernel(file->data,file->size,result); +} + + +#define APIC_BASE 0xfee00000 +#define IOAPIC_BASE 0xfec00000 + +/* + MB_INFO_HEADER + MB_HRT (if this is an HVM + MB_BASIC_MEMORY + MB_MEMORY_MAP + 0..640K RAM + 640K..1024 reserved + 1024..ioapic_base RAM + ioapic_base to ioapic_base+page reserved + ioapic_base+page to apic_base ram + apic_base oto apic_base+page reserved + apic_base+page to total RAM + + + The multiboot structure that is written reflects the + perspective of the core given the kind of VM it is part of. 
+ + Regular VM + - core does not matter + - all memory visible + + HVM + ROS core + - only ROS memory visible + - regular multiboot or bios boot assumed + HRT core + - full HRT memory visible + - HRT64 multiboot assumed + +*/ + +uint64_t v3_build_multiboot_table(struct guest_info *core, uint8_t *dest, uint64_t size) +{ + struct v3_vm_info *vm = core->vm_info; + mb_info_header_t *header; +#ifdef V3_CONFIG_HVM + mb_info_hrt_t *hrt; +#endif + mb_info_mem_t *mem; + mb_info_memmap_t *memmap; + mb_info_tag_t *tag; + uint64_t num_mem, cur_mem; + + uint64_t total_mem = vm->mem_size; + +#ifdef V3_CONFIG_HVM + if (vm->hvm_state.is_hvm) { + if (v3_is_hvm_ros_core(core)) { + PrintDebug(core->vm_info,core,"multiboot: hvm: building mb table from ROS core perspective\n"); + total_mem = v3_get_hvm_ros_memsize(vm); + } else { + PrintDebug(core->vm_info,core,"multiboot: hvm: building mb table from HRT core perspective\n"); + total_mem = v3_get_hvm_hrt_memsize(vm); + } + } +#endif + + // assume we have > 1 MB + apic+ioapic + num_mem = 5; + if (total_mem>IOAPIC_BASE+PAGE_SIZE) { + num_mem++; + } + if (total_mem>APIC_BASE+PAGE_SIZE) { + num_mem++; + } + + + uint64_t needed = + sizeof(mb_info_header_t) + +#ifdef V3_CONFIG_HVM + core->vm_info->hvm_state.is_hvm && core->hvm_state.is_hrt ? sizeof(mb_info_hrt_t) : 0 +#endif + + + sizeof(mb_info_mem_t) + + sizeof(mb_info_memmap_t) + + sizeof(mb_info_memmap_entry_t) * num_mem + + sizeof(mb_info_tag_t); + + if (needed>size) { + return 0; + } + + uint8_t *next; + + if (needed>size) { + ERROR("Cannot fit MB info in needed space\n"); + return -1; + } + + next = dest; + + header = (mb_info_header_t*)next; + next += sizeof(mb_info_header_t); + +#if V3_CONFIG_HVM + if (core->vm_info->hvm_state.is_hvm && v3_is_hvm_hrt_core(core)) { + hrt = (mb_info_hrt_t*)next; + next += sizeof(mb_info_hrt_t); + } +#endif + + mem = (mb_info_mem_t*)next; + next += sizeof(mb_info_mem_t); + + memmap = (mb_info_memmap_t*)next; + next += sizeof(mb_info_memmap_t) + num_mem * sizeof(mb_info_memmap_entry_t); + + tag = (mb_info_tag_t*)next; + next += sizeof(mb_info_tag_t); + + header->totalsize = (uint32_t)(next - dest); + header->reserved = 0; + +#ifdef V3_CONFIG_HVM + if (core->vm_info->hvm_state.is_hvm && v3_is_hvm_hrt_core(core)) { + hrt->tag.type = MB_INFO_HRT_TAG; + hrt->tag.size = sizeof(mb_info_hrt_t); + hrt->total_num_apics = vm->num_cores; + hrt->first_hrt_apic_id = vm->hvm_state.first_hrt_core; + hrt->have_hrt_ioapic=0; + hrt->first_hrt_ioapic_entry=0; + } +#endif + + mem->tag.type = MB_INFO_MEM_TAG; + mem->tag.size = sizeof(mb_info_mem_t); + mem->mem_lower = 640; // thank you, bill gates + mem->mem_upper = (total_mem - 1024 * 1024) / 1024; + + memmap->tag.type = MB_INFO_MEMMAP_TAG; + memmap->tag.size = sizeof(mb_info_memmap_t) + num_mem * sizeof(mb_info_memmap_entry_t); + memmap->entry_size = 24; + memmap->entry_version = 0; + + cur_mem=0; + + // first 640K + memmap->entries[cur_mem].base_addr = 0; + memmap->entries[cur_mem].length = 640*1024; + memmap->entries[cur_mem].type = MEM_RAM; + memmap->entries[cur_mem].reserved = 0; + cur_mem++; + + // legacy io (640K->1 MB) + memmap->entries[cur_mem].base_addr = 640*1024; + memmap->entries[cur_mem].length = 384*1024; + memmap->entries[cur_mem].type = MEM_RESV; + memmap->entries[cur_mem].reserved = 1; + cur_mem++; + + // first meg to ioapic + memmap->entries[cur_mem].base_addr = 1024*1024; + memmap->entries[cur_mem].length = (total_mem < IOAPIC_BASE ? 
total_mem : IOAPIC_BASE) - 1024*1024; + memmap->entries[cur_mem].type = MEM_RAM; + memmap->entries[cur_mem].reserved = 0; + cur_mem++; + + // ioapic reservation + memmap->entries[cur_mem].base_addr = IOAPIC_BASE; + memmap->entries[cur_mem].length = PAGE_SIZE; + memmap->entries[cur_mem].type = MEM_RESV; + memmap->entries[cur_mem].reserved = 1; + cur_mem++; + + if (total_mem > (IOAPIC_BASE + PAGE_SIZE)) { + // memory between ioapic and apic + memmap->entries[cur_mem].base_addr = IOAPIC_BASE+PAGE_SIZE; + memmap->entries[cur_mem].length = (total_mem < APIC_BASE ? total_mem : APIC_BASE) - (IOAPIC_BASE+PAGE_SIZE);; + memmap->entries[cur_mem].type = MEM_RAM; + memmap->entries[cur_mem].reserved = 0; + cur_mem++; + } + + // apic + memmap->entries[cur_mem].base_addr = APIC_BASE; + memmap->entries[cur_mem].length = PAGE_SIZE; + memmap->entries[cur_mem].type = MEM_RESV; + memmap->entries[cur_mem].reserved = 1; + cur_mem++; + + if (total_mem > (APIC_BASE + PAGE_SIZE)) { + // memory after apic + memmap->entries[cur_mem].base_addr = APIC_BASE+PAGE_SIZE; + memmap->entries[cur_mem].length = total_mem - (APIC_BASE+PAGE_SIZE); + memmap->entries[cur_mem].type = MEM_RAM; + memmap->entries[cur_mem].reserved = 0; + cur_mem++; + } + + for (cur_mem=0;cur_mementries[cur_mem].base_addr, + memmap->entries[cur_mem].length, + memmap->entries[cur_mem].type, + memmap->entries[cur_mem].reserved ? "reserved" : ""); + } + + + + // This demarcates end of list + tag->type = 0; + tag->size = 8; + + return header->totalsize; + +} + + +int v3_write_multiboot_kernel(struct v3_vm_info *vm, mb_data_t *mb, struct v3_cfg_file *file, + void *base, uint64_t limit) +{ + uint32_t offset; + + if (!mb->addr || !mb->entry) { + PrintError(vm,VCORE_NONE, "multiboot: kernel is missing address or entry point\n"); + return -1; + } + + if (((void*)(uint64_t)(mb->addr->header_addr) < base ) || + ((void*)(uint64_t)(mb->addr->load_end_addr) > base+limit) || + ((void*)(uint64_t)(mb->addr->bss_end_addr) > base+limit)) { + PrintError(vm,VCORE_NONE, "multiboot: kernel is not within the allowed portion of VM\n"); + return -1; + } + + offset = mb->addr->load_addr - mb->addr->header_addr; + + // Skip the ELF header - assume 1 page... weird.... 
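+    // (offset = load_addr - header_addr: given the assumption that the
+    //  multiboot header sits one page into the file, the bytes destined for
+    //  load_addr start at file offset PAGE_SIZE + offset, which is what the
+    //  copy below uses.)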
+ // We are trying to do as little ELF loading here as humanly possible + v3_write_gpa_memory(&vm->cores[0], + (addr_t)(mb->addr->load_addr), + file->size-PAGE_SIZE-offset, + file->data+PAGE_SIZE+offset); + + PrintDebug(vm,VCORE_NONE, + "multiboot: wrote 0x%llx bytes starting at offset 0x%llx to %p\n", + (uint64_t) file->size-PAGE_SIZE-offset, + (uint64_t) PAGE_SIZE+offset, + (void*)(addr_t)(mb->addr->load_addr)); + + return 0; + +} + + +static int setup_multiboot_kernel(struct v3_vm_info *vm) +{ + void *base = 0; + uint64_t limit = vm->mem_size; + + + if (vm->mb_state.mb_file->size > limit) { + PrintError(vm,VCORE_NONE,"multiboot: Cannot map kernel because it is too big (%llu bytes, but only have %llu space\n", vm->mb_state.mb_file->size, (uint64_t)limit); + return -1; + } + + if (!is_elf(vm->mb_state.mb_file->data,vm->mb_state.mb_file->size)) { + PrintError(vm,VCORE_NONE,"multiboot: supplied kernel is not an ELF\n"); + return -1; + } else { + if (find_mb_header(vm->mb_state.mb_file->data,vm->mb_state.mb_file->size)) { + PrintDebug(vm,VCORE_NONE,"multiboot: appears to be a multiboot kernel\n"); + if (v3_parse_multiboot_header(vm->mb_state.mb_file,&vm->mb_state.mb_data)) { + PrintError(vm,VCORE_NONE,"multiboot: cannot parse multiboot kernel header\n"); + return -1; + } + if (v3_write_multiboot_kernel(vm, &(vm->mb_state.mb_data),vm->mb_state.mb_file,base,limit)) { + PrintError(vm,VCORE_NONE,"multiboot: multiboot kernel setup failed\n"); + return -1; + } + } else { + PrintError(vm,VCORE_NONE,"multiboot: multiboot kernel has no header\n"); + return -1; + } + } + + return 0; + +} + +// 32 bit GDT entries +// +// base24-31 flags2 limit16-19 access8 base16-23 base0-15 limit0-15 +// null 0 0 0 0 0 0 0 +// code 0 1100 f 10011010 0 0 ffff +// data 0 1100 f 10010010 0 0 ffff +// +// null = 00 00 00 00 00 00 00 00 +// code = 00 cf 9a 00 00 00 ff ff +// data = 00 cf 92 00 00 00 ff ff +// +static uint64_t gdt32[3] = { + 0x0000000000000000, /* null */ + 0x00cf9a000000ffff, /* code (note lme=0) */ + 0x00cf92000000ffff, /* data */ +}; + +static void write_gdt(struct v3_vm_info *vm, void *base, uint64_t limit) +{ + v3_write_gpa_memory(&vm->cores[0],(addr_t)base,limit,(uint8_t*) gdt32); + + PrintDebug(vm,VCORE_NONE,"multiboot: wrote GDT at %p\n",base); +} + + +static void write_tss(struct v3_vm_info *vm, void *base, uint64_t limit) +{ + int i; + uint64_t tss_data=0x0; + + for (i=0;icores[0],(addr_t)(base+8*i),8,(uint8_t*) &tss_data); + } + + PrintDebug(vm,VCORE_NONE,"multiboot: wrote TSS at %p\n",base); +} + +static void write_table(struct v3_vm_info *vm, void *base, uint64_t limit) +{ + uint64_t size; + uint8_t buf[256]; + + limit = limit < 256 ? 
limit : 256; + + size = v3_build_multiboot_table(&vm->cores[0], buf, limit); + + if (size>256 || size==0) { + PrintError(vm,VCORE_NONE,"multiboot: cannot build multiboot table\n"); + return; + } + + v3_write_gpa_memory(&vm->cores[0],(addr_t)base,size,buf); + +} + + + +/* + GPA layout: + + GDT + TSS + MBinfo + Kernel at its desired load address (or error) + +*/ + + +int v3_setup_multiboot_vm_for_boot(struct v3_vm_info *vm) +{ + void *kernel_start_gpa; + void *kernel_end_gpa; + void *mb_gpa; + void *tss_gpa; + void *gdt_gpa; + + if (!vm->mb_state.is_multiboot) { + PrintDebug(vm,VCORE_NONE,"multiboot: skipping multiboot setup for boot as this is not a multiboot VM\n"); + return 0; + } + + + if (setup_multiboot_kernel(vm)) { + PrintError(vm,VCORE_NONE,"multiboot: failed to setup kernel\n"); + return -1; + } + + kernel_start_gpa = (void*) (uint64_t) (vm->mb_state.mb_data.addr->load_addr); + kernel_end_gpa = (void*) (uint64_t) (vm->mb_state.mb_data.addr->bss_end_addr); + + // Is there room below the kernel? + if ((uint64_t)kernel_start_gpa > 19*4096 ) { + // at least 3 pages between 64K and start of kernel + // place at 64K + mb_gpa=(void*)(16*4096); + } else { + // is there room above the kernel? + if ((uint64_t)kernel_end_gpa < vm->mem_size-4*4096) { + if (((uint64_t)kernel_end_gpa + 4 * 4096) <= 0xffffffff) { + mb_gpa=(void*) (4096*((uint64_t)kernel_end_gpa/4096 + 1)); + } else { + PrintError(vm,VCORE_NONE,"multiboot: no room for mb data below 4 GB\n"); + return -1; + } + } else { + PrintError(vm,VCORE_NONE,"multiboot: no room for mb data above kernel\n"); + return -1; + } + } + + PrintDebug(vm,VCORE_NONE,"multiboot: mb data will start at %p\n",mb_gpa); + + vm->mb_state.mb_data_gpa=mb_gpa; + + tss_gpa = mb_gpa + 1 * 4096; + gdt_gpa = mb_gpa + 2 * 4096; + + write_table(vm,mb_gpa,4096); + + write_tss(vm,tss_gpa,4096); + + write_gdt(vm,gdt_gpa,4096); + + PrintDebug(vm,VCORE_NONE,"multiboot: setup of memory done\n"); + + return 0; +} + +/* + On entry: + + IDTR not set + GDTR points to stub GDT + TR points to stub TSS + CR0 has PE and not PG + EIP is entry point to kernel + EBX points to multiboot info + EAX multiboot magic cookie + +*/ +int v3_setup_multiboot_core_for_boot(struct guest_info *core) +{ + void *base; + uint64_t limit; + + if (!core->vm_info->mb_state.is_multiboot) { + PrintDebug(core->vm_info,core,"multiboot: skipping mb core setup as this is not an mb VM\n"); + return 0; + } + + if (core->vcpu_id != 0) { + PrintDebug(core->vm_info,core,"multiboot: skipping mb core setup as this is not the BSP core\n"); + return 0; + } + + + PrintDebug(core->vm_info, core, "multiboot: setting up MB BSP core for boot\n"); + + + memset(&core->vm_regs,0,sizeof(core->vm_regs)); + memset(&core->ctrl_regs,0,sizeof(core->ctrl_regs)); + memset(&core->dbg_regs,0,sizeof(core->dbg_regs)); + memset(&core->segments,0,sizeof(core->segments)); + memset(&core->msrs,0,sizeof(core->msrs)); + memset(&core->fp_state,0,sizeof(core->fp_state)); + + // We need to be in protected mode at ring zero + core->cpl = 0; // we are going right into the kernel + core->cpu_mode = PROTECTED; + core->mem_mode = PHYSICAL_MEM; + // default run-state is fine, we are core zero + // core->core_run_state = CORE_RUNNING ; + + // right into the kernel + core->rip = (uint64_t) core->vm_info->mb_state.mb_data.entry->entry_addr; + + // Setup CRs for protected mode + // CR0: PE (but no PG) + core->ctrl_regs.cr0 = 0x1; + core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0; + + // CR2: don't care (output from #PF) + // CR3: don't care (no paging) + 
core->ctrl_regs.cr3 = 0; + core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3; + + // CR4: no features + core->ctrl_regs.cr4 = 0x0; + core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4; + // CR8 as usual + // RFLAGS zeroed is fine: come in with interrupts off + // EFER needs SVME and LME but not LMA (last 16 bits: 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 + core->ctrl_regs.efer = 0x1400; + core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer; + + + /* + Notes on selectors: + + selector is 13 bits of index, 1 bit table indicator + (0=>GDT), 2 bit RPL + + index is scaled by 8, even in long mode, where some entries + are 16 bytes long.... + -> code, data descriptors have 8 byte format + because base, limit, etc, are ignored (no segmentation) + -> interrupt/trap gates have 16 byte format + because offset needs to be 64 bits + */ + + // There is no IDTR set and interrupts are disabled + + // Install our stub GDT + core->segments.gdtr.selector = 0; + core->segments.gdtr.base = (addr_t) core->vm_info->mb_state.mb_data_gpa+2*4096; + core->segments.gdtr.limit = 4096-1; + core->segments.gdtr.type = 0x6; + core->segments.gdtr.system = 1; + core->segments.gdtr.dpl = 0; + core->segments.gdtr.present = 1; + core->segments.gdtr.long_mode = 0; + + // And our TSS + core->segments.tr.selector = 0; + core->segments.tr.base = (addr_t) core->vm_info->mb_state.mb_data_gpa+1*4096; + core->segments.tr.limit = 4096-1; + core->segments.tr.type = 0x6; + core->segments.tr.system = 1; + core->segments.tr.dpl = 0; + core->segments.tr.present = 1; + core->segments.tr.long_mode = 0; + + base = 0x0; + limit = -1; + + // And CS + core->segments.cs.selector = 0x8 ; // entry 1 of GDT (RPL=0) + core->segments.cs.base = (addr_t) base; + core->segments.cs.limit = limit; + core->segments.cs.type = 0xe; + core->segments.cs.system = 0; + core->segments.cs.dpl = 0; + core->segments.cs.present = 1; + core->segments.cs.long_mode = 0; + + // DS, SS, etc are identical + core->segments.ds.selector = 0x10; // entry 2 of GDT (RPL=0) + core->segments.ds.base = (addr_t) base; + core->segments.ds.limit = limit; + core->segments.ds.type = 0x6; + core->segments.ds.system = 0; + core->segments.ds.dpl = 0; + core->segments.ds.present = 1; + core->segments.ds.long_mode = 0; + + memcpy(&core->segments.ss,&core->segments.ds,sizeof(core->segments.ds)); + memcpy(&core->segments.es,&core->segments.ds,sizeof(core->segments.ds)); + memcpy(&core->segments.fs,&core->segments.ds,sizeof(core->segments.ds)); + memcpy(&core->segments.gs,&core->segments.ds,sizeof(core->segments.ds)); + + + + // Now for our magic - this signals + // the kernel that a multiboot loader loaded it + // and that rbx points to its offered data + core->vm_regs.rax = MB2_INFO_MAGIC; + + core->vm_regs.rbx = (uint64_t) (core->vm_info->mb_state.mb_data_gpa); + + // reset paging here for shadow... + + if (core->shdw_pg_mode != NESTED_PAGING) { + PrintError(core->vm_info, core, "multiboot: shadow paging guest... 
this will end badly\n"); + return -1; + } + + + return 0; +} diff --git a/palacios/src/palacios/vmx.c b/palacios/src/palacios/vmx.c index daf8eee..23d631a 100644 --- a/palacios/src/palacios/vmx.c +++ b/palacios/src/palacios/vmx.c @@ -1198,6 +1198,13 @@ int v3_start_vmx_guest(struct guest_info * info) { PrintDebug(info->vm_info, info, "Starting VMX core %u\n", info->vcpu_id); +#ifdef V3_CONFIG_MULTIBOOT + if (v3_setup_multiboot_core_for_boot(info)) { + PrintError(info->vm_info, info, "Failed to setup Multiboot core...\n"); + return -1; + } +#endif + #ifdef V3_CONFIG_HVM if (v3_setup_hvm_hrt_core_for_boot(info)) { PrintError(info->vm_info, info, "Failed to setup HRT core...\n");
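
For reference, a sketch of the guest configuration stanza that the new
v3_init_multiboot_vm() looks for (attribute names taken from the
v3_cfg_val() calls above; the file_id value is illustrative and must match
a file declared elsewhere in the same .pal configuration):

    <multiboot enable="y" file_id="mb_kernel" />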