endchoice
+source "Kconfig.stdlibs"
+
config CRAY_XT
bool "Red Storm (Cray XT3/XT4)"
help
endmenu
-
-source "Kconfig.stdlibs"
-
-
-menu "Virtual Paging"
-
-config NESTED_PAGING
- bool "Enable nested paging"
- default y
- help
- Enable nested paging (should always be on)
-
-config SHADOW_PAGING
- bool "Enable shadow paging"
- default y
- help
- Enables shadow paging for virtual machines
-
-
-config SHADOW_PAGING_VTLB
- bool "Virtual TLB"
- default y
- depends on SHADOW_PAGING
- help
- Enables Virtual TLB implemenation for shadow paging
- Virtual TLB now uses PAE so there are no 4 GB restrictions
-
-
-config DEBUG_SHDW_PG_VTLB
- bool "Enable VTLB debugging"
- default n
- depends on SHADOW_PAGING_VTLB
- help
- Enables debugging messages for VTLB implementation
-
-config SHADOW_PAGING_CACHE
- bool "Shadow Page Cache"
- default n
- depends on SHADOW_PAGING && EXPERIMENTAL
- help
- Enables caching implementation of shadow paging
-
-config DEBUG_SHADOW_PAGING_CACHE
- bool "Enable Shadow Page Cache Debugging"
- default n
- depends on SHADOW_PAGING_CACHE
- help
- Enables debugging messages for the VTLB + Caching implementation
-
-#config SHADOW_PAGING_KVM
-# bool "KVM-style Shadow Pager"
-# default n
-# depends on SHADOW_PAGING && EXPERIMENTAL
-# help
-# Enables shadow pager derived from KVM
-# You probably do not want this and it will probably not compile!
-#
-#config DEBUG_SHADOW_PAGING_KVM
-# bool "Enable KVM-style Shadow Pager Debugging"
-# default n
-# depends on SHADOW_PAGING_KVM
-# help
-# Enables debugging messages for the KVM-style shadow pager
-
-
-config SWAPPING
- bool "Enable swapping"
- default n
- depends on (SHADOW_PAGING || NESTED_PAGING) && FILE
- help
- Enables swapping of regions of guest physical memory to a file
-
-config DEBUG_SWAPPING
- bool "Enable swapping debugging"
- default n
- depends on SWAPPING
- help
- Provides debugging output from the swapping system
-
-config MEM_TRACK
- bool "Enable memory access tracking"
- default n
- depends on SHADOW_PAGING || NESTED_PAGING
- help
- Allows tracking of memory accesses on a page granularity
-
-config DEBUG_MEM_TRACK
- bool "Enable memory access tracking debugging"
- default n
- depends on MEM_TRACK
- help
- Provides debugging output for memory access tracking
-
-endmenu
-
-menu "Symbiotic Functions"
-
-config SYMBIOTIC
- bool "Enable Symbiotic Functionality"
- default n
- help
- Enable Symbiotic components of the VMM.
- This includes the SymSpy interface.
-
-config SYMCALL
- bool "Symbiotic upcalls"
- default n
- depends on SYMBIOTIC && EXPERIMENTAL
- help
- Enables the Symbiotic upcall interface
-
-config SWAPBYPASS
- bool "SwapBypass"
- default n
- depends on SYMBIOTIC && SYMCALL && EXPERIMENTAL
- help
- This enables the SwapBypass architecture
-
-config SWAPBYPASS_TELEMETRY
- bool "Enable SwapBypass Telemetry"
- default n
- depends on TELEMETRY && SWAPBYPASS
- help
- Enable the telemetry information for the SwapBypass subsystem
-
-menuconfig SYMMOD
- bool "Symbiotic Modules"
- default n
- depends on EXPERIMENTAL
-# depends on SYMBIOTIC
- help
- Enable Symbiotic module loading
-
-
-endmenu
-
-menu "VNET"
-
-config VNET
- bool "Enable Vnet in Palacios"
- default n
- help
- Enable the Vnet in Palacios
-
-config DEBUG_VNET
- depends on VNET
- bool "Enable Vnet Debug in Palacios"
- default n
- help
- Enable the Vnet debug in Palacios
-
-
-endmenu
-
-source "palacios/src/gears/Kconfig"
-
-
-menu "HVM"
-
-config HVM
- bool "Support Hybrid Virtual Machines"
- default n
- help
- If set, it is possible to make VMs that are partitioned
- (cores, memory, devices, hardware access, etc) into
- a part ("the ROS") that supports normal VM operation and
- a part ("the HRT") that supports Hybrid Run-Times,
- for example Nautilus-based HRTs for parallel languages.
-
-config DEBUG_HVM
- depends on HVM
- bool "Enable HVM debugging in Palacios"
- default n
- help
- Enable HVM debugging output
-
-endmenu
-
-menu "Debug configuration"
+menu "Debug Configuration"
## Is unwind information useful
endmenu
-menu "BIOS Selection"
+
+
+menu "Virtual Paging"
+
+config NESTED_PAGING
+ bool "Enable nested paging"
+ default y
+ help
+ Enable nested paging (should always be on)
+
+config SHADOW_PAGING
+ bool "Enable shadow paging"
+ default y
+ help
+ Enables shadow paging for virtual machines
+
+
+config SHADOW_PAGING_VTLB
+ bool "Virtual TLB"
+ default y
+ depends on SHADOW_PAGING
+ help
+	  Enables Virtual TLB implementation for shadow paging
+ Virtual TLB now uses PAE so there are no 4 GB restrictions
+
+
+config DEBUG_SHDW_PG_VTLB
+ bool "Enable VTLB debugging"
+ default n
+ depends on SHADOW_PAGING_VTLB
+ help
+ Enables debugging messages for VTLB implementation
+
+config SHADOW_PAGING_CACHE
+ bool "Shadow Page Cache"
+ default n
+ depends on SHADOW_PAGING && EXPERIMENTAL
+ help
+ Enables caching implementation of shadow paging
+
+config DEBUG_SHADOW_PAGING_CACHE
+ bool "Enable Shadow Page Cache Debugging"
+ default n
+ depends on SHADOW_PAGING_CACHE
+ help
+ Enables debugging messages for the VTLB + Caching implementation
+
+#config SHADOW_PAGING_KVM
+# bool "KVM-style Shadow Pager"
+# default n
+# depends on SHADOW_PAGING && EXPERIMENTAL
+# help
+# Enables shadow pager derived from KVM
+# You probably do not want this and it will probably not compile!
+#
+#config DEBUG_SHADOW_PAGING_KVM
+# bool "Enable KVM-style Shadow Pager Debugging"
+# default n
+# depends on SHADOW_PAGING_KVM
+# help
+# Enables debugging messages for the KVM-style shadow pager
+
+
+config SWAPPING
+ bool "Enable swapping"
+ default n
+ depends on (SHADOW_PAGING || NESTED_PAGING) && FILE
+ help
+ Enables swapping of regions of guest physical memory to a file
+
+config DEBUG_SWAPPING
+ bool "Enable swapping debugging"
+ default n
+ depends on SWAPPING
+ help
+ Provides debugging output from the swapping system
+
+config MEM_TRACK
+ bool "Enable memory access tracking"
+ default n
+ depends on SHADOW_PAGING || NESTED_PAGING
+ help
+ Allows tracking of memory accesses on a page granularity
+
+config DEBUG_MEM_TRACK
+ bool "Enable memory access tracking debugging"
+ default n
+ depends on MEM_TRACK
+ help
+ Provides debugging output for memory access tracking
+
+endmenu
+
+
+source "palacios/src/devices/Kconfig"
+
+menu "Boot Environments"
+
+
+menu "BIOS"
choice
prompt "Boot Code Selection"
endchoice
-
config SEABIOS_PATH
string "Path to pre-built SEABIOS binary"
depends on SEABIOS
This is vmxassist image to boot real mode guests on
Intel VMX Platforms
+endmenu
+
+menu "Multiboot"
+
+config MULTIBOOT
+	bool "Support Multiboot2-compliant boot"
+	default y
+	help
+	  If set, it is possible to boot a multiboot2 compliant
+	  kernel directly.
+
+config DEBUG_MULTIBOOT
+	depends on MULTIBOOT
+	bool "Enable Multiboot2 debugging in Palacios"
+	default n
+	help
+	  Enable Multiboot2 debugging output
+
+endmenu
-source "palacios/src/devices/Kconfig"
+endmenu
+
+menu "Symbiosis"
+
+config SYMBIOTIC
+ bool "Enable Symbiotic Functionality"
+ default n
+ help
+ Enable Symbiotic components of the VMM.
+ This includes the SymSpy interface.
+
+config SYMCALL
+ bool "Symbiotic upcalls"
+ default n
+ depends on SYMBIOTIC && EXPERIMENTAL
+ help
+ Enables the Symbiotic upcall interface
+
+config SWAPBYPASS
+ bool "SwapBypass"
+ default n
+ depends on SYMBIOTIC && SYMCALL && EXPERIMENTAL
+ help
+ This enables the SwapBypass architecture
+
+config SWAPBYPASS_TELEMETRY
+ bool "Enable SwapBypass Telemetry"
+ default n
+ depends on TELEMETRY && SWAPBYPASS
+ help
+ Enable the telemetry information for the SwapBypass subsystem
+
+menuconfig SYMMOD
+ bool "Symbiotic Modules"
+ default n
+ depends on EXPERIMENTAL
+# depends on SYMBIOTIC
+ help
+ Enable Symbiotic module loading
+
+
+endmenu
+
+menu "VNET"
+
+config VNET
+ bool "Enable Vnet in Palacios"
+ default n
+ help
+ Enable the Vnet in Palacios
+
+config DEBUG_VNET
+ depends on VNET
+ bool "Enable Vnet Debug in Palacios"
+ default n
+ help
+ Enable the Vnet debug in Palacios
+
+
+endmenu
+
+source "palacios/src/gears/Kconfig"
+
+
+menu "HVM"
+
+config HVM
+	bool "Support Hybrid Virtual Machines"
+	depends on MULTIBOOT
+	default n
+	help
+	  If set, it is possible to make VMs that are partitioned
+	  (cores, memory, devices, hardware access, etc) into
+	  a part ("the ROS") that supports normal VM operation and
+	  a part ("the HRT") that supports Hybrid Run-Times,
+	  for example Nautilus-based HRTs for parallel languages.
+
+config DEBUG_HVM
+	depends on HVM
+	bool "Enable HVM debugging in Palacios"
+	default n
+	help
+	  Enable HVM debugging output
+
+endmenu
+
+
+
+
+
+
#include <palacios/vm_guest_mem.h>
-#include <stdio.h>
-#include <stdlib.h>
+#include <palacios/vmm_debug.h>
+
/*
#define PrintDebug(fmt, args...)
#endif
+
+// if set, we will map the first 1 GB of memory using a 3 level
+// hierarchy, for compatibility with Nautilus out of the box.
+// Otherwise we will map the first 512 GB using a 2 level
+// hierarchy
+#define HVM_MAP_1G_2M 1
+
int v3_init_hvm()
{
PrintDebug(VM_NONE,VCORE_NONE, "hvm: init\n");
static int hvm_hcall_handler(struct guest_info * core , hcall_id_t hcall_id, void * priv_data)
{
- V3_Print(core->vm_info,core, "hvm: received hypercall %x rax=%llx rbx=%llx rcx=%llx\n",
- hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx);
+ uint64_t c;
+
+ rdtscll(c);
+
+
+ V3_Print(core->vm_info,core, "hvm: received hypercall %x rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n",
+ hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, c-core->hvm_state.last_boot_start, core->num_exits);
+ v3_print_core_telemetry(core);
+ // v3_print_guest_state(core);
+
return 0;
}
char *enable;
char *ros_cores;
char *ros_mem;
- char *hrt_file_id;
+ char *hrt_file_id=0;
PrintDebug(vm, VCORE_NONE, "hvm: vm init\n");
}
}
+#define MAX(x,y) ((x)>(y)?(x):(y))
+#define MIN(x,y) ((x)<(y)?(x):(y))
+
+#ifdef HVM_MAP_1G_2M
+#define BOOT_STATE_END_ADDR (MIN(vm->mem_size,0x40000000ULL))
+#else
+#define BOOT_STATE_END_ADDR (MIN(vm->mem_size,0x800000000ULL))
+#endif
static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
- *base = (void*) PAGE_ADDR(vm->mem_size - PAGE_SIZE);
+ *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR - PAGE_SIZE);
*limit = PAGE_SIZE;
}
static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
- *base = (void*) PAGE_ADDR(vm->mem_size - 2 * PAGE_SIZE);
+ *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR - 2 * PAGE_SIZE);
*limit = 16*256;
}
static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
- *base = (void*)PAGE_ADDR(vm->mem_size - 3 * PAGE_SIZE);
+ *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR - 3 * PAGE_SIZE);
*limit = 8*3;
}
static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
- *base = (void*)PAGE_ADDR(vm->mem_size - 4 * PAGE_SIZE);
+ *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR - 4 * PAGE_SIZE);
*limit = PAGE_SIZE;
}
512 entries
1 top level
1 entries
+
+OR
+
+ PTS MAP FIRST 1 GB identity mapped:
+ 1 third level
+ 512 entries
+ 1 second level
+ 1 entries
+ 1 top level
+ 1 entries
*/
static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
- *base = (void*)PAGE_ADDR(vm->mem_size-(5+1)*PAGE_SIZE);
+#ifdef HVM_MAP_1G_2M
+ *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR-(5+2)*PAGE_SIZE);
+ *limit = 3*PAGE_SIZE;
+#else
+ *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR-(5+1)*PAGE_SIZE);
*limit = 2*PAGE_SIZE;
+#endif
}
-static void write_pt(struct v3_vm_info *vm)
+#ifndef HVM_MAP_1G_2M
+static void write_pt_2level_512GB(struct v3_vm_info *vm)
{
void *base;
uint64_t size;
PrintError(vm,VCORE_NONE,"Cannot support pt request, defaulting\n");
}
+ if (vm->mem_size > 0x800000000ULL) {
+ PrintError(vm,VCORE_NONE, "VM has more than 512 GB\n");
+ }
+
memset(&pdpe,0,sizeof(pdpe));
pdpe.present=1;
pdpe.writable=1;
v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*sizeof(pml4e)),sizeof(pml4e),(uint8_t*)&pml4e);
}
- PrintDebug(vm,VCORE_NONE,"hvm: Wrote page tables (1 PML4, 1 PDPE) at %p\n",base);
+ PrintDebug(vm,VCORE_NONE,"hvm: Wrote page tables (1 PML4, 1 PDPE) at %p (512 GB mapped)\n",base);
+}
+
+#else
+
+static void write_pt_3level_1GB(struct v3_vm_info *vm)
+{
+ void *base;
+ uint64_t size;
+ struct pml4e64 pml4e;
+ struct pdpe64 pdpe;
+ struct pde64 pde;
+
+ uint64_t i;
+
+ get_pt_loc(vm,&base, &size);
+ if (size!=3*PAGE_SIZE) {
+ PrintError(vm,VCORE_NONE,"Cannot support pt request, defaulting\n");
+ }
+
+ if (vm->mem_size > 0x40000000ULL) {
+ PrintError(vm,VCORE_NONE, "VM has more than 1 GB\n");
+ }
+
+ memset(&pde,0,sizeof(pde));
+ pde.present=1;
+ pde.writable=1;
+ pde.large_page=1;
+
+ for (i=0;i<512;i++) {
+ pde.pt_base_addr = i*0x200; // 0x200 = 512 pages = 2 MB
+ v3_write_gpa_memory(&vm->cores[0],
+ (addr_t)(base+2*PAGE_SIZE+i*sizeof(pde)),
+ sizeof(pde),(uint8_t*)&pde);
+ }
+
+ memset(&pdpe,0,sizeof(pdpe));
+ pdpe.present=1;
+ pdpe.writable=1;
+ pdpe.large_page=0;
+
+ pdpe.pd_base_addr = PAGE_BASE_ADDR((addr_t)(base+2*PAGE_SIZE));
+
+ v3_write_gpa_memory(&vm->cores[0],(addr_t)base+PAGE_SIZE,sizeof(pdpe),(uint8_t*)&pdpe);
+
+ for (i=1;i<512;i++) {
+ pdpe.present = 0;
+ v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+PAGE_SIZE+i*sizeof(pdpe)),sizeof(pdpe),(uint8_t*)&pdpe);
+ }
+
+ memset(&pml4e,0,sizeof(pml4e));
+ pml4e.present=1;
+ pml4e.writable=1;
+ pml4e.pdp_base_addr = PAGE_BASE_ADDR((addr_t)(base+PAGE_SIZE));
+
+ v3_write_gpa_memory(&vm->cores[0],(addr_t)base,sizeof(pml4e),(uint8_t*)&pml4e);
+
+ for (i=1;i<512;i++) {
+ pml4e.present=0;
+ v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*sizeof(pml4e)),sizeof(pml4e),(uint8_t*)&pml4e);
+ }
+
+ PrintDebug(vm,VCORE_NONE,"hvm: Wrote page tables (1 PML4, 1 PDPE, 1 PDP) at %p (1 GB mapped)\n",base);
+}
+
+#endif
+
+static void write_pt(struct v3_vm_info *vm)
+{
+#ifdef HVM_MAP_1G_2M
+ write_pt_3level_1GB(vm);
+#else
+ write_pt_2level_512GB(vm);
+#endif
}
static void get_bp_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
- *base = (void*) PAGE_ADDR(vm->mem_size-(6+1)*PAGE_SIZE);
+#ifdef HVM_MAP_1G_2M
+ *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR-(6+2)*PAGE_SIZE);
+#else
+ *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR-(6+1)*PAGE_SIZE);
+#endif
*limit = PAGE_SIZE;
}
*limit = bp_base - *base;
}
-static void write_hrt(struct v3_vm_info *vm)
+
+#define ERROR(fmt, args...) PrintError(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
+#define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
+
+#define ELF_MAGIC 0x464c457f
+#define MB2_MAGIC 0xe85250d6
+
+#define MB2_INFO_MAGIC 0x36d76289
+
+// Return nonzero if the buffer begins with the 4-byte ELF magic.
+// Guard against buffers too small to hold the magic before the
+// 32-bit read (the original dereferenced unconditionally).
+static int is_elf(uint8_t *data, uint64_t size)
+{
+    if (size < 4) {
+	return 0;
+    }
+    if (*((uint32_t*)data)==ELF_MAGIC) {
+	return 1;
+    } else {
+	return 0;
+    }
+}
+
+// Scan for the multiboot2 magic cookie. The MB2 header must appear
+// in the first 32K of the image and is assumed 4-byte aligned.
+// Returns a pointer into `data`, or 0 if no header is found.
+static mb_header_t *find_mb_header(uint8_t *data, uint64_t size)
+{
+    uint64_t limit = size > 32768 ? 32768 : size;
+    uint64_t i;
+
+    // stop while a full 4-byte read still fits, so we never read
+    // up to 3 bytes past the end of a buffer whose size is not a
+    // multiple of 4
+    for (i=0;i+4<=limit;i+=4) {
+	if (*((uint32_t*)&data[i])==MB2_MAGIC) {
+	    INFO("Found multiboot header at offset 0x%llx\n",i);
+	    return (mb_header_t *) &data[i];
+	}
+    }
+    return 0;
+}
+
+
+//
+// BROKEN - THIS DOES NOT DO WHAT YOU THINK
+//
+static int setup_elf(struct v3_vm_info *vm, void *base, uint64_t limit)
+{
+ v3_write_gpa_memory(&vm->cores[0],(addr_t)base,vm->hvm_state.hrt_file->size,vm->hvm_state.hrt_file->data);
+
+ vm->hvm_state.hrt_entry_addr = (uint64_t) (base+0x40);
+
+ PrintDebug(vm,VCORE_NONE,"hvm: wrote HRT ELF %s at %p\n", vm->hvm_state.hrt_file->tag,base);
+ PrintDebug(vm,VCORE_NONE,"hvm: set ELF entry to %p and hoping for the best...\n", (void*) vm->hvm_state.hrt_entry_addr);
+
+ vm->hvm_state.hrt_type = HRT_ELF64;
+
+ return 0;
+
+}
+
+static int setup_mb_kernel(struct v3_vm_info *vm, void *base, uint64_t limit)
+{
+ mb_data_t mb;
+ uint32_t offset;
+
+
+ // FIX USING GENERIC TOOLS
+
+ if (v3_parse_multiboot_header(vm->hvm_state.hrt_file,&mb)) {
+ PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n");
+ return -1;
+ }
+
+ if (!mb.addr || !mb.entry) {
+ PrintError(vm,VCORE_NONE, "hvm: kernel is missing address or entry point\n");
+ return -1;
+ }
+
+ if (((void*)(uint64_t)(mb.addr->header_addr) < base ) ||
+ ((void*)(uint64_t)(mb.addr->load_end_addr) > base+limit) ||
+ ((void*)(uint64_t)(mb.addr->bss_end_addr) > base+limit)) {
+ PrintError(vm,VCORE_NONE, "hvm: kernel is not within the allowed portion of HVM\n");
+ return -1;
+ }
+
+ offset = mb.addr->load_addr - mb.addr->header_addr;
+
+ // Skip the ELF header - assume 1 page... weird....
+ v3_write_gpa_memory(&vm->cores[0],
+ (addr_t)(mb.addr->load_addr),
+ vm->hvm_state.hrt_file->size-PAGE_SIZE-offset,
+ vm->hvm_state.hrt_file->data+PAGE_SIZE+offset);
+
+
+ // vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + PAGE_SIZE; //HACK PAD
+
+ vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr;
+
+ vm->hvm_state.hrt_type = HRT_MBOOT64;
+
+ PrintDebug(vm,VCORE_NONE,
+ "hvm: wrote 0x%llx bytes starting at offset 0x%llx to %p; set entry to %p\n",
+ (uint64_t) vm->hvm_state.hrt_file->size-PAGE_SIZE-offset,
+ (uint64_t) PAGE_SIZE+offset,
+ (void*)(addr_t)(mb.addr->load_addr),
+ (void*) vm->hvm_state.hrt_entry_addr);
+ return 0;
+
+}
+
+
+static int setup_hrt(struct v3_vm_info *vm)
{
void *base;
uint64_t limit;
get_hrt_loc(vm,&base,&limit);
-
+
if (vm->hvm_state.hrt_file->size > limit) {
PrintError(vm,VCORE_NONE,"hvm: Cannot map HRT because it is too big (%llu bytes, but only have %llu space\n", vm->hvm_state.hrt_file->size, (uint64_t)limit);
- return;
+ return -1;
}
- v3_write_gpa_memory(&vm->cores[0],(addr_t)base,vm->hvm_state.hrt_file->size,vm->hvm_state.hrt_file->data);
+ if (!is_elf(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) {
+ PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not an ELF but we are going to act like it is!\n");
+ if (setup_elf(vm,base,limit)) {
+ PrintError(vm,VCORE_NONE,"hvm: Fake ELF setup failed\n");
+ return -1;
+ }
+ vm->hvm_state.hrt_type=HRT_BLOB;
+ } else {
+ if (find_mb_header(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) {
+ PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n");
+ if (setup_mb_kernel(vm,base,limit)) {
+ PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n");
+ return -1;
+ }
+ } else {
+ PrintDebug(vm,VCORE_NONE,"hvm: supplied HRT is an ELF\n");
+ if (setup_elf(vm,base,limit)) {
+ PrintError(vm,VCORE_NONE,"hvm: Fake ELF setup failed\n");
+ return -1;
+ }
+ }
+ }
- PrintDebug(vm,VCORE_NONE,"hvm: wrote HRT %s at %p\n", vm->hvm_state.hrt_file->tag,base);
-
+ return 0;
}
write_bp(vm);
- write_hrt(vm);
+ if (setup_hrt(vm)) {
+ PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n");
+ return -1;
+ }
PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory done\n");
void *base;
uint64_t limit;
+ rdtscll(core->hvm_state.last_boot_start);
+
if (!core->hvm_state.is_hrt) {
PrintDebug(core->vm_info,core,"hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id);
return 0;
core->vm_regs.rdi = (v3_reg_t) base;
// HRT entry point
get_hrt_loc(core->vm_info, &base,&limit);
- core->rip = (uint64_t) base + 0x40; // hack for test.o
+ core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr ;
// Setup CRs for long mode and our stub page table
// CR0: PG, PE
core->ctrl_regs.cr0 = 0x80000001;
+ core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0;
+
// CR2: don't care (output from #PF)
// CE3: set to our PML4E, without setting PCD or PWT
get_pt_loc(core->vm_info, &base,&limit);
core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base);
+ core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3;
+
// CR4: PGE, PAE, PSE (last byte: 1 0 1 1 0 0 0 0)
core->ctrl_regs.cr4 = 0xb0;
+ core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4;
// CR8 as usual
// RFLAGS zeroed is fine: come in with interrupts off
- // EFER needs SVME LMA LME (last 16 bites: 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0
+ // EFER needs SVME LMA LME (last 16 bits: 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0
core->ctrl_regs.efer = 0x1500;
+ core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer;
/*
memcpy(&core->segments.fs,&core->segments.ds,sizeof(core->segments.ds));
memcpy(&core->segments.gs,&core->segments.ds,sizeof(core->segments.ds));
+
+ if (core->vm_info->hvm_state.hrt_type==HRT_MBOOT64) {
+ /*
+ Temporary hackery for multiboot2 "64"
+ We will push the MB structure onto the stack and update RSP
+ and RBX
+ */
+ uint8_t buf[256];
+ uint64_t size;
+
+ if ((size=v3_build_multiboot_table(core,buf,256))==-1) {
+ PrintError(core->vm_info,core,"hvm: Failed to write MB info\n");
+ return -1;
+ }
+ core->vm_regs.rsp -= size;
+
+ v3_write_gpa_memory(core,
+ core->vm_regs.rsp,
+ size,
+ buf);
+
+ PrintDebug(core->vm_info,core, "hvm: wrote MB info at %p\n", (void*)core->vm_regs.rsp);
+
+ if (core->vcpu_id == core->vm_info->hvm_state.first_hrt_core) {
+ // We are the BSP for this HRT
+ // this is where rbx needs to point
+ core->vm_regs.rbx = core->vm_regs.rsp;
+ PrintDebug(core->vm_info,core, "hvm: \"BSP\" core\n");
+ } else {
+ // We are an AP for this HRT
+ // so we don't get the multiboot struct
+ core->vm_regs.rbx = 0;
+ PrintDebug(core->vm_info,core, "hvm: \"AP\" core\n");
+ }
+
+
+
+ // one more push, something that looks like a return address
+ size=0;
+ core->vm_regs.rsp -= 8;
+
+ v3_write_gpa_memory(core,
+ core->vm_regs.rsp,
+ 8,
+ (uint8_t*) &size);
+
+ // Now for our magic - this signals
+ // the kernel that a multiboot loader loaded it
+ // and that rbx points to its offered data
+ core->vm_regs.rax = MB2_INFO_MAGIC;
+
+ /*
+ Note that "real" MB starts in protected mode without paging
+ This hack starts in long mode... so these requirements go
+ out the window for a large part
+
+ Requirements:
+
+ OK EAX has magic
+ OK EBX points to MB info
+ OK CS = base 0, offset big, code (LONG MODE)
+ OK DS,ES,FS,GS,SS => base 0, offset big, data (LONG MODE)
+ OK A20 gate on
+ XXX CR0 PE on PG off (nope)
+ XXX EFLAGS IF and VM off
+ */
+
+
+
+ }
+
+
// reset paging here for shadow...
if (core->shdw_pg_mode != NESTED_PAGING) {
PrintError(core->vm_info, core, "hvm: shadow paging guest... this will end badly\n");
+ return -1;
}
--- /dev/null
+/*
+ * This file is part of the Palacios Virtual Machine Monitor developed
+ * by the V3VEE Project with funding from the United States National
+ * Science Foundation and the Department of Energy.
+ *
+ * The V3VEE Project is a joint project between Northwestern University
+ * and the University of New Mexico. You can find out more at
+ * http://www.v3vee.org
+ *
+ * Copyright (c) 2015, The V3VEE Project <http://www.v3vee.org>
+ * All rights reserved.
+ *
+ * Author: Peter Dinda <pdinda@northwestern.edu>
+ *
+ * This is free software. You are permitted to use,
+ * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
+ */
+
+#include <palacios/vmm_mem.h>
+#include <palacios/vmm.h>
+#include <palacios/vmm_util.h>
+#include <palacios/vmm_emulator.h>
+#include <palacios/vm_guest.h>
+#include <palacios/vmm_debug.h>
+#include <palacios/vmm_hypercall.h>
+
+#include <palacios/vmm_xml.h>
+
+#include <palacios/vm_guest_mem.h>
+
+#include <palacios/vmm_debug.h>
+
+
+/*
+
+ In a Pal file:
+
+ <files>
+ <file id="multibootelf" filename="multibootelf.o" />
+ </files>
+
+ <multiboot enable="y" file_id="multibootelf" />
+
+
+*/
+
+#ifndef V3_CONFIG_DEBUG_MULTIBOOT
+#undef PrintDebug
+#define PrintDebug(fmt, args...)
+#endif
+
+
+int v3_init_multiboot()
+{
+ PrintDebug(VM_NONE,VCORE_NONE, "multiboot: init\n");
+ return 0;
+}
+
+int v3_deinit_multiboot()
+{
+ PrintDebug(VM_NONE,VCORE_NONE, "multiboot: deinit\n");
+ return 0;
+}
+
+
+
+#define CEIL_DIV(x,y) (((x)/(y)) + !!((x)%(y)))
+
+// Parse the <multiboot> block of the VM configuration, if any, and
+// record the referenced kernel file in vm->mb_state. A missing or
+// disabled block is not an error - the VM simply boots normally.
+// Returns 0 on success, -1 on a malformed/unresolvable config.
+int v3_init_multiboot_vm(struct v3_vm_info *vm, struct v3_xml *config)
+{
+    v3_cfg_tree_t *mb_config;
+    char *enable;
+    char *mb_file_id=0;
+
+    PrintDebug(vm, VCORE_NONE, "multiboot: vm init\n");
+
+    memset(&vm->mb_state,0,sizeof(struct v3_vm_multiboot));
+    vm->mb_state.is_multiboot=0;
+
+    // no <multiboot> subtree => normal (non-multiboot) boot
+    if (!config || !(mb_config=v3_cfg_subtree(config,"multiboot"))) {
+	PrintDebug(vm,VCORE_NONE,"multiboot: no multiboot configuration found - normal boot will occur\n");
+	goto out_ok;
+    }
+
+    if (!(enable=v3_cfg_val(mb_config,"enable")) || strcasecmp(enable,"y")) {
+	PrintDebug(vm,VCORE_NONE,"multiboot: multiboot configuration disabled\n");
+	goto out_ok;
+    }
+
+    if (!(mb_file_id=v3_cfg_val(mb_config,"file_id"))) {
+	PrintError(vm,VCORE_NONE,"multiboot: multiboot block without file_id...\n");
+	return -1;
+    }
+
+    vm->mb_state.mb_file = v3_cfg_get_file(vm,mb_file_id);
+
+    if (!vm->mb_state.mb_file) {
+	PrintError(vm,VCORE_NONE,"multiboot: multiboot block contains bad file_id (%s)\n",mb_file_id);
+	return -1;
+    }
+
+    vm->mb_state.is_multiboot=1;
+
+ out_ok:
+    if (vm->mb_state.is_multiboot) {
+	// fixed: format string previously had a stray ']' after '(tag %s)'
+	V3_Print(vm,VCORE_NONE,"multiboot: file_id=%s (tag %s)\n",
+		 mb_file_id,
+		 vm->mb_state.mb_file->tag);
+    } else {
+	V3_Print(vm,VCORE_NONE,"multiboot: This is not a multiboot VM\n");
+    }
+    return 0;
+
+}
+
+
+int v3_deinit_multiboot_vm(struct v3_vm_info *vm)
+{
+ PrintDebug(vm, VCORE_NONE, "multiboot: multiboot VM deinit\n");
+
+ return 0;
+}
+
+int v3_init_multiboot_core(struct guest_info *core)
+{
+ PrintDebug(core->vm_info, VCORE_NONE, "multiboot: multiboot core init\n");
+
+ // Nothing to do at this point
+
+ return 0;
+}
+
+int v3_deinit_multiboot_core(struct guest_info *core)
+{
+ PrintDebug(core->vm_info, VCORE_NONE, "multiboot: multiboot core deinit\n");
+
+ return 0;
+}
+
+
+
+
+#define ERROR(fmt, args...) PrintError(VM_NONE,VCORE_NONE,"multiboot: " fmt,##args)
+#define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"multiboot: " fmt,##args)
+
+
+
+/******************************************************************
+ Data contained in the ELF file we will attempt to boot
+******************************************************************/
+
+#define ELF_MAGIC 0x464c457f
+#define MB2_MAGIC 0xe85250d6
+
+
+/******************************************************************
+ Data we will pass to the kernel via rbx
+******************************************************************/
+
+#define MB2_INFO_MAGIC 0x36d76289
+
+typedef struct mb_info_header {
+ uint32_t totalsize;
+ uint32_t reserved;
+} __attribute__((packed)) mb_info_header_t;
+
+// A tag of type 0, size 8 indicates last value
+//
+typedef struct mb_info_tag {
+ uint32_t type;
+ uint32_t size;
+} __attribute__((packed)) mb_info_tag_t;
+
+
+#define MB_INFO_MEM_TAG 4
+typedef struct mb_info_mem {
+ mb_info_tag_t tag;
+ uint32_t mem_lower; // 0..640K in KB
+ uint32_t mem_upper; // in KB to first hole - 1 MB
+} __attribute__((packed)) mb_info_mem_t;
+
+#define MB_INFO_CMDLINE_TAG 1
+// note alignment of 8 bytes required for each...
+typedef struct mb_info_cmdline {
+ mb_info_tag_t tag;
+ uint32_t size; // includes zero termination
+ uint8_t string[]; // zero terminated
+} __attribute__((packed)) mb_info_cmdline_t;
+
+
+#define MEM_RAM 1
+#define MEM_ACPI 3
+#define MEM_RESV 4
+
+typedef struct mb_info_memmap_entry {
+ uint64_t base_addr;
+ uint64_t length;
+ uint32_t type;
+ uint32_t reserved;
+} __attribute__((packed)) mb_info_memmap_entry_t;
+
+#define MB_INFO_MEMMAP_TAG 6
+// note alignment of 8 bytes required for each...
+typedef struct mb_info_memmap {
+ mb_info_tag_t tag;
+ uint32_t entry_size; // multiple of 8
+ uint32_t entry_version; // 0
+ mb_info_memmap_entry_t entries[];
+} __attribute__((packed)) mb_info_memmap_t;
+
+#define MB_INFO_HRT_TAG 0xf00df00d
+typedef struct mb_info_hrt {
+ mb_info_tag_t tag;
+ // apic ids are 0..num_apics-1
+ // apic and ioapic addresses are the well known places
+ uint32_t total_num_apics;
+ uint32_t first_hrt_apic_id;
+ uint32_t have_hrt_ioapic;
+ uint32_t first_hrt_ioapic_entry;
+} __attribute__((packed)) mb_info_hrt_t;
+
+
+// We are not doing:
+//
+// - BIOS Boot Device
+// - Modules
+// - ELF symbols
+// - Boot Loader name
+// - APM table
+// - VBE info
+// - Framebuffer info
+//
+
+// Return nonzero if the buffer begins with the 4-byte ELF magic.
+// Guard against buffers too small to hold the magic before the
+// 32-bit read (the original dereferenced unconditionally).
+static int is_elf(uint8_t *data, uint64_t size)
+{
+    if (size < 4) {
+	return 0;
+    }
+    if (*((uint32_t*)data)==ELF_MAGIC) {
+	return 1;
+    } else {
+	return 0;
+    }
+}
+
+// Scan for the multiboot2 magic cookie. The MB2 header must appear
+// in the first 32K of the image and is assumed 4-byte aligned.
+// Returns a pointer into `data`, or 0 if no header is found.
+static mb_header_t *find_mb_header(uint8_t *data, uint64_t size)
+{
+    uint64_t limit = size > 32768 ? 32768 : size;
+    uint64_t i;
+
+    // stop while a full 4-byte read still fits, so we never read
+    // up to 3 bytes past the end of a buffer whose size is not a
+    // multiple of 4
+    for (i=0;i+4<=limit;i+=4) {
+	if (*((uint32_t*)&data[i])==MB2_MAGIC) {
+	    INFO("Found multiboot header at offset 0x%llx\n",i);
+	    return (mb_header_t *) &data[i];
+	}
+    }
+    return 0;
+}
+
+// Sum the first `size` 32-bit words; the data is valid if the
+// unsigned sum wraps to zero. Index widened to uint64_t to match
+// `size` (was `int`, a sign/width mismatch for large sizes).
+static int checksum4_ok(uint32_t *data, uint64_t size)
+{
+    uint64_t i;
+    uint32_t sum=0;
+
+    for (i=0;i<size;i++) {
+	sum+=data[i];
+    }
+
+    return sum==0;
+}
+
+// Parse a multiboot2 kernel image: locate and validate the MB2
+// header, then walk its tag list, recording a pointer to each
+// recognized tag (rejecting duplicates) into *mb.
+// Returns 0 on success, -1 on any parse failure.
+static int parse_multiboot_kernel(uint8_t *data, uint64_t size, mb_data_t *mb)
+{
+    uint64_t i;
+
+    mb_header_t *mb_header=0;
+    mb_tag_t *mb_tag=0;
+    mb_info_t *mb_inf=0;
+    mb_addr_t *mb_addr=0;
+    mb_entry_t *mb_entry=0;
+    mb_flags_t *mb_flags=0;
+    mb_framebuf_t *mb_framebuf=0;
+    mb_modalign_t *mb_modalign=0;
+    mb_mb64_hrt_t *mb_mb64_hrt=0;
+
+
+    if (!is_elf(data,size)) {
+	ERROR("HRT is not an ELF\n");
+	return -1;
+    }
+
+    mb_header = find_mb_header(data,size);
+
+    if (!mb_header) {
+	ERROR("No multiboot header found\n");
+	return -1;
+    }
+
+    // Checksum applies only to the header itself (4 words), not to
+    // the subsequent tags...
+    if (!checksum4_ok((uint32_t*)mb_header,4)) {
+	ERROR("Multiboot header has bad checksum\n");
+	return -1;
+    }
+
+    INFO("Multiboot header: arch=0x%x, headerlen=0x%x\n", mb_header->arch, mb_header->headerlen);
+
+    // tags begin immediately after the 16-byte fixed header
+    mb_tag = (mb_tag_t*)((void*)mb_header+16);
+
+    // a tag of type 0 / size 8 terminates the list
+    while (!(mb_tag->type==0 && mb_tag->size==8)) {
+	INFO("tag: type 0x%x flags=0x%x size=0x%x\n",mb_tag->type, mb_tag->flags,mb_tag->size);
+	switch (mb_tag->type) {
+	    case MB_TAG_INFO: {
+		if (mb_inf) {
+		    ERROR("Multiple info tags found!\n");
+		    return -1;
+		}
+		mb_inf = (mb_info_t*)mb_tag;
+		INFO(" info request - types follow\n");
+		// bug fix: condition previously read "(mb_tag->size-8)/4"
+		// without comparing i, looping without bound
+		for (i=0;i<(mb_tag->size-8)/4;i++) {
+		    INFO(" %llu: type 0x%x\n", i, mb_inf->types[i]);
+		}
+	    }
+		break;
+
+	    case MB_TAG_ADDRESS: {
+		if (mb_addr) {
+		    ERROR("Multiple address tags found!\n");
+		    return -1;
+		}
+		mb_addr = (mb_addr_t*)mb_tag;
+		INFO(" address\n");
+		INFO("   header_addr     =  0x%x\n", mb_addr->header_addr);
+		INFO("   load_addr       =  0x%x\n", mb_addr->load_addr);
+		INFO("   load_end_addr   =  0x%x\n", mb_addr->load_end_addr);
+		INFO("   bss_end_addr    =  0x%x\n", mb_addr->bss_end_addr);
+	    }
+		break;
+
+	    case MB_TAG_ENTRY: {
+		if (mb_entry) {
+		    ERROR("Multiple entry tags found!\n");
+		    return -1;
+		}
+		mb_entry=(mb_entry_t*)mb_tag;
+		INFO(" entry\n");
+		INFO("   entry_addr      =  0x%x\n", mb_entry->entry_addr);
+	    }
+		break;
+
+	    case MB_TAG_FLAGS: {
+		if (mb_flags) {
+		    ERROR("Multiple flags tags found!\n");
+		    return -1;
+		}
+		mb_flags = (mb_flags_t*)mb_tag;
+		INFO(" flags\n");
+		INFO("   console_flags   =  0x%x\n", mb_flags->console_flags);
+	    }
+		break;
+
+	    case MB_TAG_FRAMEBUF: {
+		if (mb_framebuf) {
+		    ERROR("Multiple framebuf tags found!\n");
+		    return -1;
+		}
+		mb_framebuf = (mb_framebuf_t*)mb_tag;
+		INFO(" framebuf\n");
+		INFO("   width           =  0x%x\n", mb_framebuf->width);
+		INFO("   height          =  0x%x\n", mb_framebuf->height);
+		INFO("   depth           =  0x%x\n", mb_framebuf->depth);
+	    }
+		break;
+
+	    case MB_TAG_MODALIGN: {
+		if (mb_modalign) {
+		    ERROR("Multiple modalign tags found!\n");
+		    return -1;
+		}
+		mb_modalign = (mb_modalign_t*)mb_tag;
+		INFO(" modalign\n");
+		INFO("   size            =  0x%x\n", mb_modalign->size);
+	    }
+		break;
+#if 0
+	    case MB_TAG_MB64_HRT: {
+		if (mb_mb64_hrt) {
+		    ERROR("Multiple mb64_hrt tags found!\n");
+		    return -1;
+		}
+		mb_mb64_hrt = (mb_mb64_hrt_t*)mb_tag;
+		INFO(" mb64_hrt\n");
+	    }
+		break;
+#endif
+
+	    default:
+		INFO("Unknown tag... Skipping...\n");
+		break;
+	}
+	// NOTE(review): MB2 header tags are 8-byte aligned; advancing by the
+	// raw size assumes producers emit padded sizes - confirm against spec
+	mb_tag = (mb_tag_t *)(((void*)mb_tag) + mb_tag->size);
+    }
+
+    // copy out to caller
+    mb->header=mb_header;
+    mb->info=mb_inf;
+    mb->addr=mb_addr;
+    mb->entry=mb_entry;
+    mb->flags=mb_flags;
+    mb->framebuf=mb_framebuf;
+    mb->modalign=mb_modalign;
+    mb->mb64_hrt=mb_mb64_hrt;
+
+    return 0;
+}
+
+
+// Public entry point: parse the multiboot header of the supplied kernel
+// image file, filling *result with pointers to the tags found.
+// Returns 0 on success, -1 on error (propagated from parse_multiboot_kernel).
+int v3_parse_multiboot_header(struct v3_cfg_file *file, mb_data_t *result)
+{
+    return parse_multiboot_kernel(file->data,file->size,result);
+}
+
+
+#define APIC_BASE 0xfee00000
+#define IOAPIC_BASE 0xfec00000
+
+/*
+ MB_INFO_HEADER
+  MB_HRT (if this is an HVM)
+  MB_BASIC_MEMORY
+  MB_MEMORY_MAP
+     0..640K RAM
+     640K..1024K reserved
+     1024K..ioapic_base RAM
+     ioapic_base to ioapic_base+page reserved
+     ioapic_base+page to apic_base ram
+     apic_base to apic_base+page reserved
+ apic_base+page to total RAM
+
+
+ The multiboot structure that is written reflects the
+ perspective of the core given the kind of VM it is part of.
+
+ Regular VM
+ - core does not matter
+ - all memory visible
+
+ HVM
+ ROS core
+ - only ROS memory visible
+ - regular multiboot or bios boot assumed
+ HRT core
+ - full HRT memory visible
+ - HRT64 multiboot assumed
+
+*/
+
+// Build the multiboot2 information table for the given core into dest
+// (at most size bytes).  Returns the number of bytes written, or 0 on
+// failure (the buffer was too small).
+uint64_t v3_build_multiboot_table(struct guest_info *core, uint8_t *dest, uint64_t size)
+{
+    struct v3_vm_info *vm = core->vm_info;
+    mb_info_header_t *header;
+#ifdef V3_CONFIG_HVM
+    mb_info_hrt_t *hrt = 0;        // set only when building for an HRT core
+#endif
+    mb_info_mem_t *mem;
+    mb_info_memmap_t *memmap;
+    mb_info_tag_t *tag;
+    uint64_t num_mem, cur_mem;
+
+    uint64_t total_mem = vm->mem_size;
+
+#ifdef V3_CONFIG_HVM
+    // In an HVM, the core sees only the memory of its own partition
+    if (vm->hvm_state.is_hvm) {
+        if (v3_is_hvm_ros_core(core)) {
+            PrintDebug(core->vm_info,core,"multiboot: hvm: building mb table from ROS core perspective\n");
+            total_mem = v3_get_hvm_ros_memsize(vm);
+        } else {
+            PrintDebug(core->vm_info,core,"multiboot: hvm: building mb table from HRT core perspective\n");
+            total_mem = v3_get_hvm_hrt_memsize(vm);
+        }
+    }
+#endif
+
+    // Five entries always exist (low 640K, legacy I/O hole, 1MB..IOAPIC,
+    // IOAPIC page, APIC page); RAM beyond each device page adds one more.
+    num_mem = 5;
+    if (total_mem > IOAPIC_BASE + PAGE_SIZE) {
+        num_mem++;
+    }
+    if (total_mem > APIC_BASE + PAGE_SIZE) {
+        num_mem++;
+    }
+
+    uint64_t needed =
+        sizeof(mb_info_header_t) +
+#ifdef V3_CONFIG_HVM
+        // The parentheses matter: without them "a + cond ? b : 0 + c"
+        // parses as "(a + cond) ? b : (0 + c)", badly miscomputing needed
+        ((vm->hvm_state.is_hvm && v3_is_hvm_hrt_core(core)) ? sizeof(mb_info_hrt_t) : 0) +
+#endif
+        sizeof(mb_info_mem_t) +
+        sizeof(mb_info_memmap_t) +
+        sizeof(mb_info_memmap_entry_t) * num_mem +
+        sizeof(mb_info_tag_t);
+
+    if (needed > size) {
+        // 0 is this function's failure indicator; the return type is
+        // unsigned, so returning -1 would look like a huge table size
+        ERROR("Cannot fit MB info in needed space\n");
+        return 0;
+    }
+
+    // Lay the pieces out back to back in the destination buffer
+    uint8_t *next = dest;
+
+    header = (mb_info_header_t*)next;
+    next += sizeof(mb_info_header_t);
+
+#ifdef V3_CONFIG_HVM
+    if (vm->hvm_state.is_hvm && v3_is_hvm_hrt_core(core)) {
+        hrt = (mb_info_hrt_t*)next;
+        next += sizeof(mb_info_hrt_t);
+    }
+#endif
+
+    mem = (mb_info_mem_t*)next;
+    next += sizeof(mb_info_mem_t);
+
+    memmap = (mb_info_memmap_t*)next;
+    next += sizeof(mb_info_memmap_t) + num_mem * sizeof(mb_info_memmap_entry_t);
+
+    tag = (mb_info_tag_t*)next;
+    next += sizeof(mb_info_tag_t);
+
+    header->totalsize = (uint32_t)(next - dest);
+    header->reserved = 0;
+
+#ifdef V3_CONFIG_HVM
+    // hrt is non-null exactly when the HRT slot was laid out above
+    if (hrt) {
+        hrt->tag.type = MB_INFO_HRT_TAG;
+        hrt->tag.size = sizeof(mb_info_hrt_t);
+        hrt->total_num_apics = vm->num_cores;
+        hrt->first_hrt_apic_id = vm->hvm_state.first_hrt_core;
+        hrt->have_hrt_ioapic = 0;
+        hrt->first_hrt_ioapic_entry = 0;
+    }
+#endif
+
+    mem->tag.type = MB_INFO_MEM_TAG;
+    mem->tag.size = sizeof(mb_info_mem_t);
+    mem->mem_lower = 640;  // KB of conventional memory; thank you, bill gates
+    mem->mem_upper = (total_mem - 1024 * 1024) / 1024;  // KB above 1 MB
+
+    memmap->tag.type = MB_INFO_MEMMAP_TAG;
+    memmap->tag.size = sizeof(mb_info_memmap_t) + num_mem * sizeof(mb_info_memmap_entry_t);
+    memmap->entry_size = 24;     // multiboot2 fixed entry size (base+length+type+reserved)
+    memmap->entry_version = 0;
+
+    cur_mem = 0;
+
+    // first 640K
+    memmap->entries[cur_mem].base_addr = 0;
+    memmap->entries[cur_mem].length = 640*1024;
+    memmap->entries[cur_mem].type = MEM_RAM;
+    memmap->entries[cur_mem].reserved = 0;
+    cur_mem++;
+
+    // legacy io (640K->1 MB)
+    memmap->entries[cur_mem].base_addr = 640*1024;
+    memmap->entries[cur_mem].length = 384*1024;
+    memmap->entries[cur_mem].type = MEM_RESV;
+    memmap->entries[cur_mem].reserved = 1;
+    cur_mem++;
+
+    // first meg to ioapic
+    memmap->entries[cur_mem].base_addr = 1024*1024;
+    memmap->entries[cur_mem].length = (total_mem < IOAPIC_BASE ? total_mem : IOAPIC_BASE) - 1024*1024;
+    memmap->entries[cur_mem].type = MEM_RAM;
+    memmap->entries[cur_mem].reserved = 0;
+    cur_mem++;
+
+    // ioapic reservation
+    memmap->entries[cur_mem].base_addr = IOAPIC_BASE;
+    memmap->entries[cur_mem].length = PAGE_SIZE;
+    memmap->entries[cur_mem].type = MEM_RESV;
+    memmap->entries[cur_mem].reserved = 1;
+    cur_mem++;
+
+    if (total_mem > (IOAPIC_BASE + PAGE_SIZE)) {
+        // memory between ioapic and apic
+        memmap->entries[cur_mem].base_addr = IOAPIC_BASE+PAGE_SIZE;
+        memmap->entries[cur_mem].length = (total_mem < APIC_BASE ? total_mem : APIC_BASE) - (IOAPIC_BASE+PAGE_SIZE);
+        memmap->entries[cur_mem].type = MEM_RAM;
+        memmap->entries[cur_mem].reserved = 0;
+        cur_mem++;
+    }
+
+    // apic
+    memmap->entries[cur_mem].base_addr = APIC_BASE;
+    memmap->entries[cur_mem].length = PAGE_SIZE;
+    memmap->entries[cur_mem].type = MEM_RESV;
+    memmap->entries[cur_mem].reserved = 1;
+    cur_mem++;
+
+    if (total_mem > (APIC_BASE + PAGE_SIZE)) {
+        // memory after apic
+        memmap->entries[cur_mem].base_addr = APIC_BASE+PAGE_SIZE;
+        memmap->entries[cur_mem].length = total_mem - (APIC_BASE+PAGE_SIZE);
+        memmap->entries[cur_mem].type = MEM_RAM;
+        memmap->entries[cur_mem].reserved = 0;
+        cur_mem++;
+    }
+
+    for (cur_mem=0;cur_mem<num_mem;cur_mem++) {
+        PrintDebug(vm, VCORE_NONE,
+                   "multiboot: entry %llu: %p (%llx bytes) - type %x %s\n",
+                   cur_mem,
+                   (void*) memmap->entries[cur_mem].base_addr,
+                   memmap->entries[cur_mem].length,
+                   memmap->entries[cur_mem].type,
+                   memmap->entries[cur_mem].reserved ? "reserved" : "");
+    }
+
+    // This demarcates end of list
+    tag->type = 0;
+    tag->size = 8;
+
+    return header->totalsize;
+
+}
+
+
+// Copy the multiboot kernel image into guest physical memory at the load
+// address requested by its MB address tag, validating that the image fits
+// in [base, base+limit) and that the copy arithmetic cannot underflow.
+// Returns 0 on success, -1 on error.
+int v3_write_multiboot_kernel(struct v3_vm_info *vm, mb_data_t *mb, struct v3_cfg_file *file,
+			      void *base, uint64_t limit)
+{
+    uint32_t offset;
+
+    if (!mb->addr || !mb->entry) {
+	PrintError(vm,VCORE_NONE, "multiboot: kernel is missing address or entry point\n");
+	return -1;
+    }
+
+    if (((void*)(uint64_t)(mb->addr->header_addr) < base ) ||
+	((void*)(uint64_t)(mb->addr->load_end_addr) > base+limit) ||
+	((void*)(uint64_t)(mb->addr->bss_end_addr) > base+limit)) {
+	PrintError(vm,VCORE_NONE, "multiboot: kernel is not within the allowed portion of VM\n");
+	return -1;
+    }
+
+    if (mb->addr->load_addr < mb->addr->header_addr) {
+	// offset is unsigned; this ordering would underflow it below
+	PrintError(vm,VCORE_NONE, "multiboot: kernel load address is below its header address\n");
+	return -1;
+    }
+
+    offset = mb->addr->load_addr - mb->addr->header_addr;
+
+    if (file->size < PAGE_SIZE + offset) {
+	// the copy length below would underflow for a too-small file
+	PrintError(vm,VCORE_NONE, "multiboot: kernel file too small for the claimed load offset\n");
+	return -1;
+    }
+
+    // Skip the ELF header - assume 1 page... weird....
+    // We are trying to do as little ELF loading here as humanly possible
+    v3_write_gpa_memory(&vm->cores[0],
+			(addr_t)(mb->addr->load_addr),
+			file->size-PAGE_SIZE-offset,
+			file->data+PAGE_SIZE+offset);
+
+    PrintDebug(vm,VCORE_NONE,
+	       "multiboot: wrote 0x%llx bytes starting at offset 0x%llx to %p\n",
+	       (uint64_t) file->size-PAGE_SIZE-offset,
+	       (uint64_t) PAGE_SIZE+offset,
+	       (void*)(addr_t)(mb->addr->load_addr));
+
+    return 0;
+
+}
+
+
+// Validate the configured kernel image (size, ELF magic, MB header),
+// parse its multiboot header, and copy it into the guest.
+// Returns 0 on success, -1 on error.
+static int setup_multiboot_kernel(struct v3_vm_info *vm)
+{
+    void *base = 0;
+    uint64_t limit = vm->mem_size;
+
+    if (vm->mb_state.mb_file->size > limit) {
+	PrintError(vm,VCORE_NONE,"multiboot: Cannot map kernel because it is too big (%llu bytes, but only have %llu space)\n", (uint64_t)vm->mb_state.mb_file->size, (uint64_t)limit);
+	return -1;
+    }
+
+    if (!is_elf(vm->mb_state.mb_file->data,vm->mb_state.mb_file->size)) {
+	PrintError(vm,VCORE_NONE,"multiboot: supplied kernel is not an ELF\n");
+	return -1;
+    }
+
+    if (!find_mb_header(vm->mb_state.mb_file->data,vm->mb_state.mb_file->size)) {
+	PrintError(vm,VCORE_NONE,"multiboot: multiboot kernel has no header\n");
+	return -1;
+    }
+
+    PrintDebug(vm,VCORE_NONE,"multiboot: appears to be a multiboot kernel\n");
+
+    if (v3_parse_multiboot_header(vm->mb_state.mb_file,&vm->mb_state.mb_data)) {
+	PrintError(vm,VCORE_NONE,"multiboot: cannot parse multiboot kernel header\n");
+	return -1;
+    }
+
+    if (v3_write_multiboot_kernel(vm, &(vm->mb_state.mb_data),vm->mb_state.mb_file,base,limit)) {
+	PrintError(vm,VCORE_NONE,"multiboot: multiboot kernel setup failed\n");
+	return -1;
+    }
+
+    return 0;
+
+}
+
+// 32 bit GDT entries
+//
+// base24-31 flags2 limit16-19 access8 base16-23 base0-15 limit0-15
+// null 0 0 0 0 0 0 0
+// code 0 1100 f 10011010 0 0 ffff
+// data 0 1100 f 10010010 0 0 ffff
+//
+// null = 00 00 00 00 00 00 00 00
+// code = 00 cf 9a 00 00 00 ff ff
+// data = 00 cf 92 00 00 00 ff ff
+//
+// Stub protected-mode GDT for the BSP: null, flat code, and flat data
+// descriptors, per the bit-layout table above.
+static uint64_t gdt32[3] = {
+    0x0000000000000000, /* null */
+    0x00cf9a000000ffff, /* code (note lme=0) */
+    0x00cf92000000ffff, /* data */
+};
+
+// Write the stub GDT into guest memory at base, zero-filling the rest of
+// the limit-byte region.  The previous code copied `limit` bytes (callers
+// pass 4096) straight out of the 24-byte gdt32 array, reading far past
+// the end of the array and filling the guest GDT with host garbage.
+static void write_gdt(struct v3_vm_info *vm, void *base, uint64_t limit)
+{
+    uint64_t gdt_len = limit < sizeof(gdt32) ? limit : sizeof(gdt32);
+    uint64_t zero = 0;
+    uint64_t off;
+
+    // real descriptors first...
+    v3_write_gpa_memory(&vm->cores[0],(addr_t)base,gdt_len,(uint8_t*) gdt32);
+
+    // ...then deterministic (null) descriptors for the remainder
+    for (off = gdt_len; off + 8 <= limit; off += 8) {
+	v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+off),8,(uint8_t*)&zero);
+    }
+
+    PrintDebug(vm,VCORE_NONE,"multiboot: wrote GDT at %p\n",base);
+}
+
+
+// Zero-fill the TSS region at base, 8 bytes at a time.  Any tail smaller
+// than 8 bytes (limit not a multiple of 8) is left untouched, exactly as
+// before.
+static void write_tss(struct v3_vm_info *vm, void *base, uint64_t limit)
+{
+    uint64_t zero = 0;
+    uint64_t off;
+
+    for (off = 0; off + 8 <= limit; off += 8) {
+	v3_write_gpa_memory(&vm->cores[0],(addr_t)(base + off),8,(uint8_t*)&zero);
+    }
+
+    PrintDebug(vm,VCORE_NONE,"multiboot: wrote TSS at %p\n",base);
+}
+
+// Build the multiboot info table for core 0 into a staging buffer and
+// copy it into guest memory at base.  Silently does nothing (beyond an
+// error print) if the table cannot be built.
+static void write_table(struct v3_vm_info *vm, void *base, uint64_t limit)
+{
+    uint8_t staging[256];
+    uint64_t table_len;
+
+    if (limit > sizeof(staging)) {
+	limit = sizeof(staging);
+    }
+
+    table_len = v3_build_multiboot_table(&vm->cores[0], staging, limit);
+
+    if (table_len == 0 || table_len > sizeof(staging)) {
+	PrintError(vm,VCORE_NONE,"multiboot: cannot build multiboot table\n");
+	return;
+    }
+
+    v3_write_gpa_memory(&vm->cores[0],(addr_t)base,table_len,staging);
+
+}
+
+
+
+/*
+ GPA layout:
+
+ GDT
+ TSS
+ MBinfo
+ Kernel at its desired load address (or error)
+
+*/
+
+
+// Load the kernel into the guest and place the three metadata pages in
+// guest physical memory.  The pages are consecutive:
+//   mb_gpa + 0     multiboot info table
+//   mb_gpa + 4096  TSS
+//   mb_gpa + 8192  GDT
+// Returns 0 on success (or non-MB VM), -1 on error.
+int v3_setup_multiboot_vm_for_boot(struct v3_vm_info *vm)
+{
+    void *kernel_start_gpa;
+    void *kernel_end_gpa;
+    void *mb_gpa;
+    void *tss_gpa;
+    void *gdt_gpa;
+
+    if (!vm->mb_state.is_multiboot) {
+	PrintDebug(vm,VCORE_NONE,"multiboot: skipping multiboot setup for boot as this is not a multiboot VM\n");
+	return 0;
+    }
+
+    // Parse and copy the kernel to its requested load address first, so
+    // we know which GPAs it occupies
+    if (setup_multiboot_kernel(vm)) {
+	PrintError(vm,VCORE_NONE,"multiboot: failed to setup kernel\n");
+	return -1;
+    }
+
+    kernel_start_gpa = (void*) (uint64_t) (vm->mb_state.mb_data.addr->load_addr);
+    kernel_end_gpa = (void*) (uint64_t) (vm->mb_state.mb_data.addr->bss_end_addr);
+
+    // Is there room below the kernel?
+    if ((uint64_t)kernel_start_gpa > 19*4096 ) {
+	// at least 3 pages between 64K (16*4096) and start of kernel --
+	// enough for the info/TSS/GDT pages
+	// place at 64K
+	mb_gpa=(void*)(16*4096);
+    } else {
+	// is there room above the kernel?
+	if ((uint64_t)kernel_end_gpa < vm->mem_size-4*4096) {
+	    // the info address is later handed to the guest in EBX (32
+	    // bits), so everything must stay below 4 GB
+	    if (((uint64_t)kernel_end_gpa + 4 * 4096) <= 0xffffffff) {
+		// first page boundary after the end of the kernel image
+		mb_gpa=(void*) (4096*((uint64_t)kernel_end_gpa/4096 + 1));
+	    } else {
+		PrintError(vm,VCORE_NONE,"multiboot: no room for mb data below 4 GB\n");
+		return -1;
+	    }
+	} else {
+	    PrintError(vm,VCORE_NONE,"multiboot: no room for mb data above kernel\n");
+	    return -1;
+	}
+    }
+
+    PrintDebug(vm,VCORE_NONE,"multiboot: mb data will start at %p\n",mb_gpa);
+
+    vm->mb_state.mb_data_gpa=mb_gpa;
+
+    // one page each, consecutively: info, TSS, GDT
+    tss_gpa = mb_gpa + 1 * 4096;
+    gdt_gpa = mb_gpa + 2 * 4096;
+
+    write_table(vm,mb_gpa,4096);
+
+    write_tss(vm,tss_gpa,4096);
+
+    write_gdt(vm,gdt_gpa,4096);
+
+    PrintDebug(vm,VCORE_NONE,"multiboot: setup of memory done\n");
+
+    return 0;
+}
+
+/*
+ On entry:
+
+ IDTR not set
+ GDTR points to stub GDT
+ TR points to stub TSS
+ CR0 has PE and not PG
+ EIP is entry point to kernel
+ EBX points to multiboot info
+ EAX multiboot magic cookie
+
+*/
+// Initialize the BSP's register and segment state so the guest enters the
+// multiboot kernel directly in 32-bit protected mode (no paging), with
+// EAX holding the MB2 magic and EBX pointing at the info table.
+// Non-BSP cores and non-multiboot VMs are left untouched.
+// Returns 0 on success, -1 on error.
+int v3_setup_multiboot_core_for_boot(struct guest_info *core)
+{
+    void *base;
+    uint64_t limit;
+
+    if (!core->vm_info->mb_state.is_multiboot) {
+	PrintDebug(core->vm_info,core,"multiboot: skipping mb core setup as this is not an mb VM\n");
+	return 0;
+    }
+
+    // only the BSP (vcpu 0) boots into the kernel
+    if (core->vcpu_id != 0) {
+	PrintDebug(core->vm_info,core,"multiboot: skipping mb core setup as this is not the BSP core\n");
+	return 0;
+    }
+
+
+    PrintDebug(core->vm_info, core, "multiboot: setting up MB BSP core for boot\n");
+
+    // start from a clean slate for all architectural state
+    memset(&core->vm_regs,0,sizeof(core->vm_regs));
+    memset(&core->ctrl_regs,0,sizeof(core->ctrl_regs));
+    memset(&core->dbg_regs,0,sizeof(core->dbg_regs));
+    memset(&core->segments,0,sizeof(core->segments));
+    memset(&core->msrs,0,sizeof(core->msrs));
+    memset(&core->fp_state,0,sizeof(core->fp_state));
+
+    // We need to be in protected mode at ring zero
+    core->cpl = 0; // we are going right into the kernel
+    core->cpu_mode = PROTECTED;
+    core->mem_mode = PHYSICAL_MEM;
+    // default run-state is fine, we are core zero
+    // core->core_run_state = CORE_RUNNING ;
+
+    // right into the kernel, at the entry point from the MB entry tag
+    core->rip = (uint64_t) core->vm_info->mb_state.mb_data.entry->entry_addr;
+
+    // Setup CRs for protected mode
+    // CR0: PE (but no PG)
+    core->ctrl_regs.cr0 = 0x1;
+    core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0;
+
+    // CR2: don't care (output from #PF)
+    // CR3: don't care (no paging)
+    core->ctrl_regs.cr3 = 0;
+    core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3;
+
+    // CR4: no features
+    core->ctrl_regs.cr4 = 0x0;
+    core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4;
+    // CR8 as usual
+    // RFLAGS zeroed is fine: come in with interrupts off
+    // EFER needs SVME and LME but not LMA (last 16 bits: 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0)
+    // NOTE(review): 0x1400 sets bits 12 and 10; architecturally EFER bit 8
+    // is LME and bit 10 is LMA, so this value looks like SVME|LMA rather
+    // than the SVME|LME the comment above describes -- confirm intent
+    core->ctrl_regs.efer = 0x1400;
+    core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer;
+
+
+    /*
+	Notes on selectors:
+
+	selector is 13 bits of index, 1 bit table indicator
+	(0=>GDT), 2 bit RPL
+
+	index is scaled by 8, even in long mode, where some entries
+	are 16 bytes long....
+	   -> code, data descriptors have 8 byte format
+	      because base, limit, etc, are ignored (no segmentation)
+	   -> interrupt/trap gates have 16 byte format
+	      because offset needs to be 64 bits
+    */
+
+    // There is no IDTR set and interrupts are disabled
+
+    // Install our stub GDT (page 2 of the mb data area)
+    core->segments.gdtr.selector = 0;
+    core->segments.gdtr.base = (addr_t) core->vm_info->mb_state.mb_data_gpa+2*4096;
+    core->segments.gdtr.limit = 4096-1;
+    core->segments.gdtr.type = 0x6;
+    core->segments.gdtr.system = 1;
+    core->segments.gdtr.dpl = 0;
+    core->segments.gdtr.present = 1;
+    core->segments.gdtr.long_mode = 0;
+
+    // And our TSS (page 1 of the mb data area)
+    core->segments.tr.selector = 0;
+    core->segments.tr.base = (addr_t) core->vm_info->mb_state.mb_data_gpa+1*4096;
+    core->segments.tr.limit = 4096-1;
+    core->segments.tr.type = 0x6;
+    core->segments.tr.system = 1;
+    core->segments.tr.dpl = 0;
+    core->segments.tr.present = 1;
+    core->segments.tr.long_mode = 0;
+
+    // flat segments: base 0, maximum limit
+    base = 0x0;
+    limit = -1;
+
+    // And CS
+    core->segments.cs.selector = 0x8 ; // entry 1 of GDT (RPL=0)
+    core->segments.cs.base = (addr_t) base;
+    core->segments.cs.limit = limit;
+    core->segments.cs.type = 0xe;
+    core->segments.cs.system = 0;
+    core->segments.cs.dpl = 0;
+    core->segments.cs.present = 1;
+    core->segments.cs.long_mode = 0;
+
+    // DS, SS, etc are identical
+    core->segments.ds.selector = 0x10; // entry 2 of GDT (RPL=0)
+    core->segments.ds.base = (addr_t) base;
+    core->segments.ds.limit = limit;
+    core->segments.ds.type = 0x6;
+    core->segments.ds.system = 0;
+    core->segments.ds.dpl = 0;
+    core->segments.ds.present = 1;
+    core->segments.ds.long_mode = 0;
+
+    memcpy(&core->segments.ss,&core->segments.ds,sizeof(core->segments.ds));
+    memcpy(&core->segments.es,&core->segments.ds,sizeof(core->segments.ds));
+    memcpy(&core->segments.fs,&core->segments.ds,sizeof(core->segments.ds));
+    memcpy(&core->segments.gs,&core->segments.ds,sizeof(core->segments.ds));
+
+
+
+    // Now for our magic - this signals
+    // the kernel that a multiboot loader loaded it
+    // and that rbx points to its offered data
+    core->vm_regs.rax = MB2_INFO_MAGIC;
+
+    core->vm_regs.rbx = (uint64_t) (core->vm_info->mb_state.mb_data_gpa);
+
+    // reset paging here for shadow...
+
+    // only nested paging is supported for multiboot boot
+    if (core->shdw_pg_mode != NESTED_PAGING) {
+	PrintError(core->vm_info, core, "multiboot: shadow paging guest... this will end badly\n");
+	return -1;
+    }
+
+
+    return 0;
+}