2 * This file is part of the Palacios Virtual Machine Monitor developed
3 * by the V3VEE Project with funding from the United States National
4 * Science Foundation and the Department of Energy.
6 * The V3VEE Project is a joint project between Northwestern University
7 * and the University of New Mexico. You can find out more at
10 * Copyright (c) 2015, The V3VEE Project <http://www.v3vee.org>
11 * All rights reserved.
13 * Author: Peter Dinda <pdinda@northwestern.edu>
15 * This is free software. You are permitted to use,
16 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
19 #include <palacios/vmm_mem.h>
20 #include <palacios/vmm.h>
21 #include <palacios/vmm_util.h>
22 #include <palacios/vmm_emulator.h>
23 #include <palacios/vm_guest.h>
24 #include <palacios/vmm_debug.h>
25 #include <palacios/vmm_hypercall.h>
27 #include <palacios/vmm_xml.h>
29 #include <palacios/vm_guest_mem.h>
31 #include <palacios/vmm_debug.h>
36 MEM = Total size of memory in the GPA (in MB)
37 ROS_MEM = Total size of memory for the ROS (in MB) (<RAM)
39 GPAs [0,ROS_MEM) are what the ROS sees
40 GPAs [ROS_MEM, MEM) are HRT only
41 GPAS [0,MEM) are accessible by the HRT
43 CORES = Total number of cores in VM
 44    ROS_CORES = Total number of cores for the ROS
46 Cores [0,ROS_CORES) are what the ROS sees
47 Cores [ROS_CORES,CORES) are HRT only
48 Cores [0,CORES) are accessible by the HRT
53 <file id="hrtelf" filename="hrtelf.o" />
56 <mem ... >RAM</mem> (MB) Note these are
57 <cores count="CORES" ...> backward compatible
60 <ros cores="ROS_CORES" mem="ROS_MEM" /> (MB)
61 <hrt file_id="hrtelf" /hrt>
// Compile out HVM debug output unless the build enables it.
#ifndef V3_CONFIG_DEBUG_HVM
#define PrintDebug(fmt, args...)

// if set, we will map the first 1 GB of memory using a 3 level
// hierarchy, for compatibility with Nautilus out of the box.
// Otherwise we will map the first 512 GB using a 2 level hierarchy.
#define HVM_MAP_1G_2M 1
    // Global HVM subsystem init path: announce initialization.
    PrintDebug(VM_NONE,VCORE_NONE, "hvm: init\n");
    // Global HVM subsystem teardown path: announce deinitialization.
    PrintDebug(VM_NONE,VCORE_NONE, "hvm: deinit\n");
// Handler registered for the HVM hypercall (HVM_HCALL): logs the hypercall
// id, the guest's rax/rbx/rcx, and cycle/exit counters.
// NOTE(review): 'c' (cycle count) is declared in lines elided from this view.
static int hvm_hcall_handler(struct guest_info * core , hcall_id_t hcall_id, void * priv_data)
    V3_Print(core->vm_info,core, "hvm: received hypercall %x rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n",
             hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, c-core->hvm_state.last_boot_start, core->num_exits);
    //v3_print_core_telemetry(core);
    // v3_print_guest_state(core);
// Integer ceiling division: number of y-sized chunks needed to cover x.
// Both arguments are evaluated twice, exactly as in the previous form.
#define CEIL_DIV(x,y) ((x) % (y) ? (x) / (y) + 1 : (x) / (y))
// Parse the <hvm> subtree of the VM configuration and populate
// vm->hvm_state.  With no (or disabled) <hvm> block, the VM is a pure ROS
// VM (is_hvm stays 0 and all cores/memory belong to the ROS).  Otherwise
// the ROS gets cores [0,ROS_CORES) and GPAs [0,ROS_MEM), and the HVM
// hypercall is registered.
// NOTE(review): return statements and closing braces are elided from this
// excerpt; error paths presumably return nonzero.
int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
    v3_cfg_tree_t *hvm_config;
    v3_cfg_tree_t *ros_config;
    v3_cfg_tree_t *hrt_config;

    PrintDebug(vm, VCORE_NONE, "hvm: vm init\n");

    // Defaults: not an HVM; ROS owns every core and all of guest memory.
    memset(&vm->hvm_state,0,sizeof(struct v3_vm_hvm));
    vm->hvm_state.is_hvm=0;
    vm->hvm_state.first_hrt_core=vm->num_cores;
    vm->hvm_state.first_hrt_gpa=vm->mem_size;

    if (!config || !(hvm_config=v3_cfg_subtree(config,"hvm"))) {
        PrintDebug(vm,VCORE_NONE,"hvm: no HVM configuration found (all HW is ROS)\n");

    // enable="y" is required to activate HVM mode.
    if (!(enable=v3_cfg_val(hvm_config,"enable")) || strcasecmp(enable,"y")) {
        PrintDebug(vm,VCORE_NONE,"hvm: HVM configuration disabled (all HW is ROS)\n");

    if (!(ros_config=v3_cfg_subtree(hvm_config,"ros"))) {
        PrintError(vm,VCORE_NONE,"hvm: HVM configuration without ROS block...\n");

    if (!(ros_cores=v3_cfg_val(ros_config,"cores"))) {
        PrintError(vm,VCORE_NONE,"hvm: ROS block without cores...\n");

    // First HRT core is the one immediately after the last ROS core.
    vm->hvm_state.first_hrt_core = ((uint32_t)atoi(ros_cores));

    if (!(ros_mem=v3_cfg_val(ros_config,"mem"))) {
        PrintError(vm,VCORE_NONE,"hvm: ROS block without mem...\n");

    // ROS memory size is given in MB; convert to bytes.
    // NOTE(review): atoi() reports no errors; bad input silently becomes 0.
    vm->hvm_state.first_hrt_gpa = ((uint64_t)atoi(ros_mem))*1024*1024;

    if (!(hrt_config=v3_cfg_subtree(hvm_config,"hrt"))) {
        PrintError(vm,VCORE_NONE,"hvm: HVM configuration without HRT block...\n");

    if (!(hrt_file_id=v3_cfg_val(hrt_config,"file_id"))) {
        PrintError(vm,VCORE_NONE,"hvm: HRT block without file_id...\n");

    vm->hvm_state.hrt_file = v3_cfg_get_file(vm,hrt_file_id);

    if (!vm->hvm_state.hrt_file) {
        PrintError(vm,VCORE_NONE,"hvm: HRT block contains bad file_id (%s)\n",hrt_file_id);

    if (v3_register_hypercall(vm, HVM_HCALL,
                              hvm_hcall_handler, 0)) {
        PrintError(vm,VCORE_NONE, "hvm: cannot register hypercall....\n");

    // XXX sanity check config here

    vm->hvm_state.is_hvm=1;

    if (vm->hvm_state.is_hvm) {
        V3_Print(vm,VCORE_NONE,"hvm: [ROS: cores 0..%u, mem 0..%p] [HRT: cores %u..%u, mem %p..%p, file_id=%s (tag %s)]\n",
                 vm->hvm_state.first_hrt_core-1,
                 (void*) vm->hvm_state.first_hrt_gpa-1,
                 vm->hvm_state.first_hrt_core,
                 (void*) vm->hvm_state.first_hrt_gpa,
                 (void*)vm->mem_size-1,
                 vm->hvm_state.hrt_file->tag);
        V3_Print(vm,VCORE_NONE,"hvm: This is a pure ROS VM\n");
// Per-VM HVM teardown: unregister the HVM hypercall.
int v3_deinit_hvm_vm(struct v3_vm_info *vm)
    PrintDebug(vm, VCORE_NONE, "hvm: HVM VM deinit\n");

    v3_remove_hypercall(vm,HVM_HCALL);
// Per-core HVM init: mark the core as an HRT core iff this VM is an HVM
// and the core's vcpu id falls at or beyond the first HRT core.
int v3_init_hvm_core(struct guest_info *core)
    memset(&core->hvm_state,0,sizeof(core->hvm_state));
    if (core->vm_info->hvm_state.is_hvm) {
        if (core->vcpu_id >= core->vm_info->hvm_state.first_hrt_core) {
            core->hvm_state.is_hrt=1;
// Per-core HVM teardown (currently only logs).
int v3_deinit_hvm_core(struct guest_info *core)
    PrintDebug(core->vm_info, VCORE_NONE, "hvm: HVM core deinit\n");
// Bytes of guest memory visible to the ROS: [0, first_hrt_gpa) for an HVM.
// (The non-HVM branch — presumably returning vm->mem_size — is elided here.)
uint64_t v3_get_hvm_ros_memsize(struct v3_vm_info *vm)
    if (vm->hvm_state.is_hvm) {
        return vm->hvm_state.first_hrt_gpa;
// Bytes of guest memory visible to the HRT (body elided in this excerpt;
// per the layout above, the HRT can see all of [0, mem_size)).
uint64_t v3_get_hvm_hrt_memsize(struct v3_vm_info *vm)
// Number of cores owned by the ROS: [0, first_hrt_core) for an HVM,
// otherwise every core in the VM.
uint32_t v3_get_hvm_ros_cores(struct v3_vm_info *vm)
    if (vm->hvm_state.is_hvm) {
        return vm->hvm_state.first_hrt_core;
    return vm->num_cores;
// Number of cores dedicated to the HRT: [first_hrt_core, num_cores) for an
// HVM (non-HVM branch — presumably returning 0 — is elided here).
uint32_t v3_get_hvm_hrt_cores(struct v3_vm_info *vm)
    if (vm->hvm_state.is_hvm) {
        return vm->num_cores - vm->hvm_state.first_hrt_core;
// True if gpa lies in the ROS-visible region [0, first_hrt_gpa).
// NOTE(review): gpa>=0 is vacuously true if addr_t is unsigned — confirm;
// only the upper-bound test is meaningful then.
int v3_is_hvm_ros_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
    if (vm->hvm_state.is_hvm) {
        return gpa>=0 && gpa<vm->hvm_state.first_hrt_gpa;
// True if gpa lies in the HRT-only region [first_hrt_gpa, mem_size).
int v3_is_hvm_hrt_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
    if (vm->hvm_state.is_hvm) {
        return gpa>=vm->hvm_state.first_hrt_gpa && gpa<vm->mem_size;
// True if this core is dedicated to the HRT.
int v3_is_hvm_hrt_core(struct guest_info *core)
    return core->hvm_state.is_hrt;
// True if this core belongs to the ROS (i.e., is not an HRT core).
int v3_is_hvm_ros_core(struct guest_info *core)
    return !core->hvm_state.is_hrt;
// IPI delivery policy.  With src==NULL (ioapic/msi origin — branch elided),
// deliver only to non-HRT cores.  Core-to-core: HRT cores may signal anyone;
// ROS cores may signal only ROS cores.
int v3_hvm_should_deliver_ipi(struct guest_info *src, struct guest_info *dest)
        // ioapic or msi to apic
        return !dest->hvm_state.is_hrt;
    return src->hvm_state.is_hrt || (!src->hvm_state.is_hrt && !dest->hvm_state.is_hrt) ;
// Report, via *start_apic/*num_apics, which APICs are visible.  A NULL core
// (the elided first branch) means "as seen from ioapic/msi": an HVM exposes
// only the ROS APICs; a non-HVM exposes all.  For a real core: HRT cores
// see every APIC, ROS cores see only ROS APICs.
// NOTE(review): *start_apic assignments are elided from this excerpt.
void v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_info *vm,
                                    uint32_t *start_apic, uint32_t *num_apics)
        // Seen from ioapic, msi, etc:
        if (vm->hvm_state.is_hvm) {
            // HVM VM shows only the ROS cores/apics to ioapic, msi, etc
            *num_apics = vm->hvm_state.first_hrt_core;
            // Non-HVM shows all cores/APICs to apic, msi, etc.
            *num_apics = vm->num_cores;
        if (core->hvm_state.is_hrt) {
            // HRT core/apic sees all apics
            // (this policy may change...)
            *num_apics = vm->num_cores;
            // non-HRT core/apic sees only non-HRT cores/apics
            *num_apics = vm->hvm_state.first_hrt_core;
// NOTE: classic min/max macros — arguments are evaluated twice; do not pass
// expressions with side effects.
#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)<(y)?(x):(y))

// End of the boot-state region: memory size capped at 1 GB (HVM_MAP_1G_2M)
// or 512 GB (otherwise); the #if/#else selecting between the two is elided.
#define BOOT_STATE_END_ADDR (MIN(vm->mem_size,0x40000000ULL))
#define BOOT_STATE_END_ADDR (MIN(vm->mem_size,0x800000000ULL))
// Location of the stub interrupt handler: the last page below
// BOOT_STATE_END_ADDR (limit assignment elided; one page per layout above).
static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
    *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR - PAGE_SIZE);
// Detected host CPU virtualization type (defined elsewhere in Palacios).
extern v3_cpu_arch_t v3_mach_type;

// Start/end symbols bracketing the SVM and VMX null-interrupt-handler stubs
// (provided by assembly elsewhere); their difference gives the stub length.
extern void *v3_hvm_svm_null_int_handler_start;
extern void *v3_hvm_svm_null_int_handler_end;
extern void *v3_hvm_vmx_null_int_handler_start;
extern void *v3_hvm_vmx_null_int_handler_end;
// Copy the architecture-appropriate (SVM vs VMX) null interrupt handler
// stub into the guest at the location given by get_null_int_handler_loc().
static void write_null_int_handler(struct v3_vm_info *vm)
    get_null_int_handler_loc(vm,&base,&limit);

    // Select the stub matching the host virtualization hardware.
    switch (v3_mach_type) {
        case V3_SVM_REV3_CPU:
            data = (void*) &v3_hvm_svm_null_int_handler_start;
            len = (void*) &v3_hvm_svm_null_int_handler_end - data;
        case V3_VMX_EPT_UG_CPU:
            data = (void*) &v3_hvm_vmx_null_int_handler_start;
            len = (void*) &v3_hvm_vmx_null_int_handler_end - data;
            PrintError(vm,VCORE_NONE,"hvm: cannot determine CPU type to select null interrupt handler...\n");

    v3_write_gpa_memory(&vm->cores[0],(addr_t)(base),len,(uint8_t*)data);

    PrintDebug(vm,VCORE_NONE,"hvm: wrote null interrupt handler at %p (%llu bytes)\n",base,len);
// Location of the stub IDT: the page second from the top of the boot-state
// region (limit assignment elided from this excerpt).
static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
    *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR - 2 * PAGE_SIZE);
// default IDT entries (int and trap gates)
//
// Format is 16 bytes long:
//   16 selector   => (target code selector) => 0x8  // entry 1 of GDT
//    3 ist        => (stack) = 0 => current stack
//    4 type       => 0xe=>INT, 0xf=>TRAP
//   32 offsethigh => 0 (total is a 64 bit offset)
//
// 00 00 | 08 00 | 00 | 8[typenybble] | offsetmid | offsethigh | reserved
//
// Note little endian
//
// Templates with selector/type/present bits set and zeroed offset fields;
// write_idt() patches the handler offset into them.
static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 } ;
static uint64_t idt64_int_gate_entry_mask[2] = { 0x00008e0000080000, 0x0 };
// Build a 256-entry long-mode IDT in guest memory.  Every vector points at
// the null handler: exceptions (elided loop over 0..31) get trap gates,
// the remaining vectors (32..255) get interrupt gates.
static void write_idt(struct v3_vm_info *vm)
    uint64_t handler_len;
    uint64_t trap_gate[2];
    uint64_t int_gate[2];

    get_idt_loc(vm,&base,&limit);

    get_null_int_handler_loc(vm,&handler,&handler_len);

    // Start from the templates, then splice the handler address in.
    memcpy(trap_gate,idt64_trap_gate_entry_mask,16);
    memcpy(int_gate,idt64_int_gate_entry_mask,16);

    // update the entries for the handler location
    hand = (uint8_t*) &handler;

    // The 64-bit offset is split into 16/16/32-bit fields (see format above).
    mask = (uint8_t *)trap_gate;
    memcpy(&(mask[0]),&(hand[0]),2); // offset low
    memcpy(&(mask[6]),&(hand[2]),2); // offset med
    memcpy(&(mask[8]),&(hand[4]),4); // offset high

    mask = (uint8_t *)int_gate;
    memcpy(&(mask[0]),&(hand[0]),2); // offset low
    memcpy(&(mask[6]),&(hand[2]),2); // offset med
    memcpy(&(mask[8]),&(hand[4]),4); // offset high

    PrintDebug(vm,VCORE_NONE,"hvm: Adding default null trap and int gates\n");

    v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)trap_gate);

    for (i=32;i<256;i++) {
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)int_gate);

    PrintDebug(vm,VCORE_NONE,"hvm: wrote IDT at %p\n",base);
// Location of the stub GDT: third page from the top of the boot-state
// region (limit assignment elided from this excerpt).
static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
    *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR - 3 * PAGE_SIZE);
// Minimal long-mode GDT: null, 64-bit code, data.  Base/limit fields are
// irrelevant in long mode (no segmentation).
static uint64_t gdt64[3] = {
    0x0000000000000000, /* null */
    0x00a09a0000000000, /* code (note lme bit) */
    0x00a0920000000000, /* data (most entries don't matter) */
// Copy the stub GDT into guest memory at its reserved page.
static void write_gdt(struct v3_vm_info *vm)
    get_gdt_loc(vm,&base,&limit);
    v3_write_gpa_memory(&vm->cores[0],(addr_t)base,limit,(uint8_t*) gdt64);

    PrintDebug(vm,VCORE_NONE,"hvm: wrote GDT at %p\n",base);
// Location of the stub TSS: fourth page from the top of the boot-state
// region (limit assignment elided from this excerpt).
static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
    *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR - 4 * PAGE_SIZE);
// Zero-fill the stub TSS page in guest memory.
static void write_tss(struct v3_vm_info *vm)
    get_tss_loc(vm,&base,&limit);

    v3_set_gpa_memory(&vm->cores[0],(addr_t)base,limit,0);

    PrintDebug(vm,VCORE_NONE,"hvm: wrote TSS at %p\n",base);
514 PTS MAP FIRST 512 GB identity mapped:
522 PTS MAP FIRST 1 GB identity mapped:
// Location/size of the identity-map page tables below the TSS page:
// 3 pages (PML4+PDPE+PD) for the 1 GB/2 MB-page layout, otherwise 2 pages
// (PML4+PDPE) for the 512 GB layout.  The #ifdef selecting between the two
// branches is elided from this excerpt.
static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
    *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR-(5+2)*PAGE_SIZE);
    *limit = 3*PAGE_SIZE;
    *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR-(5+1)*PAGE_SIZE);
    *limit = 2*PAGE_SIZE;
#ifndef HVM_MAP_1G_2M
// Build a 2-level identity map of the first 512 GB using 1 GB pages:
// one PML4 whose first entry points at one PDPE page of 512 1 GB entries.
static void write_pt_2level_512GB(struct v3_vm_info *vm)
    struct pml4e64 pml4e;

    get_pt_loc(vm,&base, &size);
    if (size!=2*PAGE_SIZE) {
        PrintError(vm,VCORE_NONE,"Cannot support pt request, defaulting\n");

    if (vm->mem_size > 0x800000000ULL) {
        PrintError(vm,VCORE_NONE, "VM has more than 512 GB\n");

    memset(&pdpe,0,sizeof(pdpe));

    // Fill the PDPE page: entry i identity-maps GB i.
    for (i=0;i<512;i++) {
        pdpe.pd_base_addr = i*0x40000; // 0x40000 = 256K pages = 1 GB
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+PAGE_SIZE+i*sizeof(pdpe)),sizeof(pdpe),(uint8_t*)&pdpe);

    // PML4 entry 0 points at the PDPE page just written.
    memset(&pml4e,0,sizeof(pml4e));

    pml4e.pdp_base_addr = PAGE_BASE_ADDR((addr_t)(base+PAGE_SIZE));

    v3_write_gpa_memory(&vm->cores[0],(addr_t)base,sizeof(pml4e),(uint8_t*)&pml4e);

    // Remaining PML4 entries (presumably marked not-present in elided code).
    for (i=1;i<512;i++) {
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*sizeof(pml4e)),sizeof(pml4e),(uint8_t*)&pml4e);

    PrintDebug(vm,VCORE_NONE,"hvm: Wrote page tables (1 PML4, 1 PDPE) at %p (512 GB mapped)\n",base);
// Build a 3-level identity map of the first 1 GB using 2 MB pages:
// PML4 -> PDPE -> one PD page of 512 2 MB entries.
static void write_pt_3level_1GB(struct v3_vm_info *vm)
    struct pml4e64 pml4e;

    get_pt_loc(vm,&base, &size);
    if (size!=3*PAGE_SIZE) {
        PrintError(vm,VCORE_NONE,"Cannot support pt request, defaulting\n");

    if (vm->mem_size > 0x40000000ULL) {
        PrintError(vm,VCORE_NONE, "VM has more than 1 GB\n");

    // Fill the PD page: entry i identity-maps 2 MB chunk i.
    memset(&pde,0,sizeof(pde));

    for (i=0;i<512;i++) {
        pde.pt_base_addr = i*0x200; // 0x200 = 512 pages = 2 MB
        v3_write_gpa_memory(&vm->cores[0],
                            (addr_t)(base+2*PAGE_SIZE+i*sizeof(pde)),
                            sizeof(pde),(uint8_t*)&pde);

    // PDPE entry 0 points at the PD page just written.
    memset(&pdpe,0,sizeof(pdpe));

    pdpe.pd_base_addr = PAGE_BASE_ADDR((addr_t)(base+2*PAGE_SIZE));

    v3_write_gpa_memory(&vm->cores[0],(addr_t)base+PAGE_SIZE,sizeof(pdpe),(uint8_t*)&pdpe);

    for (i=1;i<512;i++) {
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+PAGE_SIZE+i*sizeof(pdpe)),sizeof(pdpe),(uint8_t*)&pdpe);

    // PML4 entry 0 points at the PDPE page.
    memset(&pml4e,0,sizeof(pml4e));

    pml4e.pdp_base_addr = PAGE_BASE_ADDR((addr_t)(base+PAGE_SIZE));

    v3_write_gpa_memory(&vm->cores[0],(addr_t)base,sizeof(pml4e),(uint8_t*)&pml4e);

    for (i=1;i<512;i++) {
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*sizeof(pml4e)),sizeof(pml4e),(uint8_t*)&pml4e);

    PrintDebug(vm,VCORE_NONE,"hvm: Wrote page tables (1 PML4, 1 PDPE, 1 PDP) at %p (1 GB mapped)\n",base);
// Dispatch to the layout selected by HVM_MAP_1G_2M (the #ifdef/#else
// surrounding the two calls is elided from this excerpt).
static void write_pt(struct v3_vm_info *vm)
    write_pt_3level_1GB(vm);
    write_pt_2level_512GB(vm);
// Location of the multiboot info page: directly below the page tables
// (which span 2 or 1 extra pages depending on HVM_MAP_1G_2M; the #ifdef
// and limit assignment are elided from this excerpt).
static void get_mb_info_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
    *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR-(6+2)*PAGE_SIZE);
    *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR-(6+1)*PAGE_SIZE);
// Build the multiboot info table into a scratch buffer and copy it into
// the guest at the MB info page.  Only valid for HRT_MBOOT64 HRTs.
static void write_mb_info(struct v3_vm_info *vm)
    if (vm->hvm_state.hrt_type!=HRT_MBOOT64) {
        PrintError(vm, VCORE_NONE,"hvm: Cannot handle this HRT type\n");

    get_mb_info_loc(vm,&base,&limit);

    // Build into a 256-byte stack buffer (buf declared in elided lines).
    if ((size=v3_build_multiboot_table(&vm->cores[vm->hvm_state.first_hrt_core],buf,256))==-1) {
        PrintError(vm,VCORE_NONE,"hvm: Failed to build MB info\n");

        PrintError(vm,VCORE_NONE,"hvm: MB info is too large\n");

    v3_write_gpa_memory(&vm->cores[vm->hvm_state.first_hrt_core],

    PrintDebug(vm,VCORE_NONE, "hvm: wrote MB info at %p\n", base);
// One page of scratch stack per HRT core, carved out below the MB info page.
#define SCRATCH_STACK_SIZE 4096

// Compute where the HRT image may live: from the first HRT GPA (page
// aligned) up to the bottom of the per-core scratch stacks.
static void get_hrt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
    get_mb_info_loc(vm,&mb_base,&mb_limit);

    // Stacks grow down from the MB info page, one per HRT core.
    mb_base-=SCRATCH_STACK_SIZE*v3_get_hvm_hrt_cores(vm);

    *base = (void*)PAGE_ADDR(vm->hvm_state.first_hrt_gpa);

    // Require at least one page for the HRT itself.
    if (mb_base < *base+PAGE_SIZE) {
        PrintError(vm,VCORE_NONE,"hvm: HRT stack colides with HRT\n");

    *limit = mb_base - *base;
// Shorthand logging for the HRT image parsing code below.
#define ERROR(fmt, args...) PrintError(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
#define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)

// "\x7fELF" as a little-endian 32-bit word.
#define ELF_MAGIC 0x464c457f
// Multiboot2 header magic (in the kernel image).
#define MB2_MAGIC 0xe85250d6

// Multiboot2 magic the bootloader passes to the kernel in EAX/RAX.
#define MB2_INFO_MAGIC 0x36d76289
// True if the buffer starts with the ELF magic.
// NOTE(review): reads data via uint32_t* — assumes adequate alignment.
static int is_elf(uint8_t *data, uint64_t size)
    if (*((uint32_t*)data)==ELF_MAGIC) {
// Scan the image for a Multiboot2 header; returns a pointer into 'data'
// or (in elided code) NULL if none is found.
static mb_header_t *find_mb_header(uint8_t *data, uint64_t size)
    uint64_t limit = size > 32768 ? 32768 : size;

    // Scan for the .boot magic cookie
    // must be in first 32K, assume 4 byte aligned
    for (i=0;i<limit;i+=4) {
        if (*((uint32_t*)&data[i])==MB2_MAGIC) {
            INFO("Found multiboot header at offset 0x%llx\n",i);
            return (mb_header_t *) &data[i];
// BROKEN - THIS DOES NOT DO WHAT YOU THINK
//
// Blindly copies the raw "ELF" file into guest memory and guesses the
// entry point at a fixed offset (base+0x40) — it does not parse program
// headers.  Kept only as a fallback for blob-style HRTs.
static int setup_elf(struct v3_vm_info *vm, void *base, uint64_t limit)
    v3_write_gpa_memory(&vm->cores[0],(addr_t)base,vm->hvm_state.hrt_file->size,vm->hvm_state.hrt_file->data);

    // Hard-coded entry offset; see BROKEN note above.
    vm->hvm_state.hrt_entry_addr = (uint64_t) (base+0x40);

    PrintDebug(vm,VCORE_NONE,"hvm: wrote HRT ELF %s at %p\n", vm->hvm_state.hrt_file->tag,base);
    PrintDebug(vm,VCORE_NONE,"hvm: set ELF entry to %p and hoping for the best...\n", (void*) vm->hvm_state.hrt_entry_addr);

    vm->hvm_state.hrt_type = HRT_ELF64;
// Load a Multiboot2 HRT kernel: parse its header, write the kernel into
// [base, base+limit), validate the load addresses, and record the entry
// point.  Returns nonzero (in elided code) on failure.
static int setup_mb_kernel(struct v3_vm_info *vm, void *base, uint64_t limit)
    if (v3_parse_multiboot_header(vm->hvm_state.hrt_file,&mb)) {
        PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n");

    if (v3_write_multiboot_kernel(vm,&mb,vm->hvm_state.hrt_file,base,limit)) {
        PrintError(vm,VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n");

    if (!mb.addr || !mb.entry) {
        PrintError(vm,VCORE_NONE, "hvm: kernel is missing address or entry point\n");

    // The kernel's declared load range must fall inside the HRT region.
    if (((void*)(uint64_t)(mb.addr->header_addr) < base ) ||
        ((void*)(uint64_t)(mb.addr->load_end_addr) > base+limit) ||
        ((void*)(uint64_t)(mb.addr->bss_end_addr) > base+limit)) {
        PrintError(vm,VCORE_NONE, "hvm: kernel is not within the allowed portion of HVM\n");

    offset = mb.addr->load_addr - mb.addr->header_addr;

    // Skip the ELF header - assume 1 page... weird....
    // FIX ME TO CONFORM TO MULTIBOOT.C
    v3_write_gpa_memory(&vm->cores[0],
                        (addr_t)(mb.addr->load_addr),
                        vm->hvm_state.hrt_file->size-PAGE_SIZE-offset,
                        vm->hvm_state.hrt_file->data+PAGE_SIZE+offset);

    //	vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + PAGE_SIZE; //HACK PAD

    PrintDebug(vm,VCORE_NONE,
               "hvm: wrote 0x%llx bytes starting at offset 0x%llx to %p; set entry to %p\n",
               (uint64_t) vm->hvm_state.hrt_file->size-PAGE_SIZE-offset,
               (uint64_t) PAGE_SIZE+offset,
               (void*)(addr_t)(mb.addr->load_addr),
               (void*) vm->hvm_state.hrt_entry_addr);

    vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr;

    vm->hvm_state.hrt_type = HRT_MBOOT64;
// Load the configured HRT image into the HRT region.  Dispatch order:
// non-ELF -> treat as raw blob via setup_elf(); ELF with a Multiboot2
// header -> setup_mb_kernel(); plain ELF -> setup_elf() fallback.
static int setup_hrt(struct v3_vm_info *vm)
    get_hrt_loc(vm,&base,&limit);

    if (vm->hvm_state.hrt_file->size > limit) {
        PrintError(vm,VCORE_NONE,"hvm: Cannot map HRT because it is too big (%llu bytes, but only have %llu space\n", vm->hvm_state.hrt_file->size, (uint64_t)limit);

    if (!is_elf(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) {
        PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not an ELF but we are going to act like it is!\n");
        if (setup_elf(vm,base,limit)) {
            PrintError(vm,VCORE_NONE,"hvm: Fake ELF setup failed\n");
        vm->hvm_state.hrt_type=HRT_BLOB;
        if (find_mb_header(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) {
            PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n");
            if (setup_mb_kernel(vm,base,limit)) {
                PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n");
            PrintDebug(vm,VCORE_NONE,"hvm: supplied HRT is an ELF\n");
            if (setup_elf(vm,base,limit)) {
                PrintError(vm,VCORE_NONE,"hvm: Fake ELF setup failed\n");
882 We do not touch the ROS portion of the address space.
883 The HRT portion looks like:
885 INT_HANDLER (1 page - page aligned)
886 IDT (1 page - page aligned)
887 GDT (1 page - page aligned)
 888       TSS (1 page - page aligned)
 889       PAGETABLES (identity map of first N GB)
890 ROOT PT first, followed by 2nd level, etc.
891 Currently PML4 followed by 1 PDPE for 512 GB of mapping
893 SCRATCH_STACK_HRT_CORE0
894 SCRATCH_STACK_HRT_CORE1
896 SCRATCH_STACK_HRT_COREN
898 HRT (as many pages as needed, page-aligned, starting at first HRT address)
// Populate the HRT portion of guest memory for boot, per the layout
// described above (null handler, IDT, GDT, TSS, page tables, MB info, HRT
// image).  No-op for a non-HVM VM.
int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm)
    if (!vm->hvm_state.is_hvm) {
        PrintDebug(vm,VCORE_NONE,"hvm: skipping HVM setup for boot as this is not an HVM\n");

    PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory begins\n");

    write_null_int_handler(vm);

        PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n");

    // need to parse HRT first

    PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory done\n");
936 On entry for every core:
938 IDTR points to stub IDT
939 GDTR points to stub GDT
940 TS points to stub TSS
941 CR3 points to root page table
944 RSP is TOS of core's scratch stack (looks like a call)
946 RAX = MB magic cookie
947 RBX = address of multiboot info table
948 RCX = this core id / apic id (0..N-1)
949 RDX = this core id - first HRT core ID (==0 for the first HRT core)
951 Other regs are zeroed
953 shadow/nested paging state reset for long mode
// Put an HRT core directly into 64-bit long mode at the HRT entry point,
// per the "On entry for every core" contract above: zeroed registers except
// the Multiboot magic/info/core-id registers, a per-core scratch stack, the
// stub IDT/GDT/TSS installed, and CR0/CR3/CR4/EFER set for paged long mode.
// No-op for non-HRT cores.
int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
    rdtscll(core->hvm_state.last_boot_start);

    if (!core->hvm_state.is_hrt) {
        PrintDebug(core->vm_info,core,"hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id);

    PrintDebug(core->vm_info, core, "hvm: setting up HRT core (%u) for boot\n", core->vcpu_id);

    // Start from a clean architectural state.
    memset(&core->vm_regs,0,sizeof(core->vm_regs));
    memset(&core->ctrl_regs,0,sizeof(core->ctrl_regs));
    memset(&core->dbg_regs,0,sizeof(core->dbg_regs));
    memset(&core->segments,0,sizeof(core->segments));
    memset(&core->msrs,0,sizeof(core->msrs));
    memset(&core->fp_state,0,sizeof(core->fp_state));

    // We are in long mode with virtual memory and we want
    // to start immediately
    core->cpl = 0; // we are going right into the kernel
    core->cpu_mode = LONG;
    core->mem_mode = VIRTUAL_MEM;
    core->core_run_state = CORE_RUNNING ;

    // Multiboot2 handoff: magic cookie in RAX...
    core->vm_regs.rax = MB2_INFO_MAGIC;

    // multiboot info pointer
    get_mb_info_loc(core->vm_info, &base,&limit);
    core->vm_regs.rbx = (uint64_t) base;

    // RCX = absolute core id; RDX = id relative to the first HRT core.
    core->vm_regs.rcx = core->vcpu_id;

    core->vm_regs.rdx = core->vcpu_id - core->vm_info->hvm_state.first_hrt_core;

    // Now point to scratch stack for this core
    // it begins at an offset relative to the MB info page
    get_mb_info_loc(core->vm_info, &base,&limit);
    base -= core->vm_regs.rdx * SCRATCH_STACK_SIZE;
    core->vm_regs.rsp = (v3_reg_t) base;
    core->vm_regs.rbp = (v3_reg_t) base-8;

    // push onto the stack a bad rbp and bad return address
    core->vm_regs.rsp-=16;
    v3_set_gpa_memory(core,

    get_hrt_loc(core->vm_info, &base,&limit);
    core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr ;

    PrintDebug(core->vm_info,core,"hvm: hrt core %u has rip=%p, rsp=%p, rbp=%p, rax=%p, rbx=%p, rcx=%p, rdx=%p\n",
               (core->vcpu_id - core->vm_info->hvm_state.first_hrt_core),
               (void*)(core->vm_regs.rsp),
               (void*)(core->vm_regs.rbp),
               (void*)(core->vm_regs.rax),
               (void*)(core->vm_regs.rbx),
               (void*)(core->vm_regs.rcx),
               (void*)(core->vm_regs.rdx));

    // Setup CRs for long mode and our stub page table
    // CR0: PG + PE set.
    core->ctrl_regs.cr0 = 0x80000001;
    core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0;

    // CR2: don't care (output from #PF)
    // CR3: set to our PML4E, without setting PCD or PWT
    get_pt_loc(core->vm_info, &base,&limit);
    core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base);
    core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3;

    // CR4: PGE, PAE, PSE (last byte: 1 0 1 1 0 0 0 0)
    core->ctrl_regs.cr4 = 0xb0;
    core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4;

    // RFLAGS zeroed is fine: come in with interrupts off
    // EFER needs SVME LMA LME (last 16 bits: 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0
    core->ctrl_regs.efer = 0x1500;
    core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer;

    // Install our stub IDT
    get_idt_loc(core->vm_info, &base,&limit);
    core->segments.idtr.selector = 0; // entry 0 (NULL) of the GDT
    core->segments.idtr.base = (addr_t) base;
    core->segments.idtr.limit = limit-1;
    core->segments.idtr.type = 0xe;
    core->segments.idtr.system = 1;
    core->segments.idtr.dpl = 0;
    core->segments.idtr.present = 1;
    core->segments.idtr.long_mode = 1;

    // Install our stub GDT
    get_gdt_loc(core->vm_info, &base,&limit);
    core->segments.gdtr.selector = 0;
    core->segments.gdtr.base = (addr_t) base;
    core->segments.gdtr.limit = limit-1;
    core->segments.gdtr.type = 0x6;
    core->segments.gdtr.system = 1;
    core->segments.gdtr.dpl = 0;
    core->segments.gdtr.present = 1;
    core->segments.gdtr.long_mode = 1;

    // Install our stub TSS.
    get_tss_loc(core->vm_info, &base,&limit);
    core->segments.tr.selector = 0;
    core->segments.tr.base = (addr_t) base;
    core->segments.tr.limit = limit-1;
    core->segments.tr.type = 0x6;
    core->segments.tr.system = 1;
    core->segments.tr.dpl = 0;
    core->segments.tr.present = 1;
    core->segments.tr.long_mode = 1;

    // Flat 64-bit code segment.
    core->segments.cs.selector = 0x8 ; // entry 1 of GDT (RPL=0)
    core->segments.cs.base = (addr_t) base;
    core->segments.cs.limit = limit;
    core->segments.cs.type = 0xe;
    core->segments.cs.system = 0;
    core->segments.cs.dpl = 0;
    core->segments.cs.present = 1;
    core->segments.cs.long_mode = 1;

    // DS, SS, etc are identical
    core->segments.ds.selector = 0x10; // entry 2 of GDT (RPL=0)
    core->segments.ds.base = (addr_t) base;
    core->segments.ds.limit = limit;
    core->segments.ds.type = 0x6;
    core->segments.ds.system = 0;
    core->segments.ds.dpl = 0;
    core->segments.ds.present = 1;
    core->segments.ds.long_mode = 1;

    memcpy(&core->segments.ss,&core->segments.ds,sizeof(core->segments.ds));
    memcpy(&core->segments.es,&core->segments.ds,sizeof(core->segments.ds));
    memcpy(&core->segments.fs,&core->segments.ds,sizeof(core->segments.ds));
    memcpy(&core->segments.gs,&core->segments.ds,sizeof(core->segments.ds));

    // reset paging here for shadow...

    if (core->shdw_pg_mode != NESTED_PAGING) {
        PrintError(core->vm_info, core, "hvm: shadow paging guest... this will end badly\n");
// Handle a reset request on an HVM core.  HRT cores synchronize on the VM's
// reset barrier twice (entry and exit); the first HRT core additionally
// drives VM-level state and re-runs the whole-VM boot setup, while every
// HRT core re-runs its own core boot setup.  ROS cores fall through to the
// normal reset path.
int v3_handle_hvm_reset(struct guest_info *core)
    if (core->core_run_state != CORE_RESETTING) {

    if (!core->vm_info->hvm_state.is_hvm) {

    if (v3_is_hvm_hrt_core(core)) {
        // this is an HRT reset

        // wait for all the HRT cores
        v3_counting_barrier(&core->vm_info->reset_barrier);

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
            core->vm_info->run_state = VM_RESETTING;

        core->core_run_state = CORE_RESETTING;

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
            // we really only need to clear the bss
            // and recopy the .data, but for now we'll just
            rc |= v3_setup_hvm_vm_for_boot(core->vm_info);
                PrintError(core->vm_info,core,"hvm: failed to setup HVM VM for boot rc=%d\n",rc);

        // now everyone is ready to reset
        rc |= v3_setup_hvm_hrt_core_for_boot(core);
            PrintError(core->vm_info,core,"hvm: failed to setup HVM core for boot rc=%d\n",rc);

        core->core_run_state = CORE_RUNNING;

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
            core->vm_info->run_state = VM_RUNNING;

        v3_counting_barrier(&core->vm_info->reset_barrier);

            PrintError(core->vm_info,core,"hvm: reset failed\n");

        // ROS core will be handled by normal reset functionality