2 * This file is part of the Palacios Virtual Machine Monitor developed
3 * by the V3VEE Project with funding from the United States National
4 * Science Foundation and the Department of Energy.
6 * The V3VEE Project is a joint project between Northwestern University
7 * and the University of New Mexico. You can find out more at
10 * Copyright (c) 2015, The V3VEE Project <http://www.v3vee.org>
11 * All rights reserved.
13 * Author: Peter Dinda <pdinda@northwestern.edu>
15 * This is free software. You are permitted to use,
16 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
19 #include <palacios/vmm_mem.h>
20 #include <palacios/vmm.h>
21 #include <palacios/vmm_util.h>
22 #include <palacios/vmm_emulator.h>
23 #include <palacios/vm_guest.h>
24 #include <palacios/vmm_debug.h>
25 #include <palacios/vmm_hypercall.h>
27 #include <palacios/vmm_xml.h>
29 #include <palacios/vm_guest_mem.h>
31 #include <palacios/vmm_debug.h>
36 MEM = Total size of memory in the GPA (in MB)
37 ROS_MEM = Total size of memory for the ROS (in MB) (<RAM)
39 GPAs [0,ROS_MEM) are what the ROS sees
40 GPAs [ROS_MEM, MEM) are HRT only
41 GPAs [0,MEM) are accessible by the HRT
43 CORES = Total number of cores in VM
44 ROS_CORES = Total number of cores for the ROS
46 Cores [0,ROS_CORES) are what the ROS sees
47 Cores [ROS_CORES,CORES) are HRT only
48 Cores [0,CORES) are accessible by the HRT
53 <file id="hrtelf" filename="hrtelf.o" />
56 <mem ... >RAM</mem> (MB) Note these are
57 <cores count="CORES" ...> backward compatible
60 <ros cores="ROS_CORES" mem="ROS_MEM" /> (MB)
61 <hrt file_id="hrtelf" /hrt>
66 #ifndef V3_CONFIG_DEBUG_HVM
// With HVM debug output compiled out, PrintDebug becomes a no-op in this file.
// NOTE(review): the matching #endif is not visible in this fragment.
68 #define PrintDebug(fmt, args...)
72 // if set, we will map the first 1 GB of memory using a 3 level
73 // hierarchy, for compatibility with Nautilus out of the box.
74 // Otherwise we will map the first 512 GB using a 2 level
// hierarchy (see write_pt_3level_1GB / write_pt_2level_512GB below).
76 #define HVM_MAP_1G_2M 1
// NOTE(review): fragments of the global HVM init/deinit routines; the
// enclosing function signatures are on lines not visible in this view.
// Both appear to only trace their invocation.
80 PrintDebug(VM_NONE,VCORE_NONE, "hvm: init\n");
86 PrintDebug(VM_NONE,VCORE_NONE, "hvm: deinit\n");
// Handler for the HVM hypercall (registered below under HVM_HCALL).
// Logs the hypercall id, the guest's rax/rbx/rcx, cycle counts, and the
// exit count, then dumps per-core telemetry.
// NOTE(review): 'c' is a cycle-count captured on lines not shown here
// (presumably via rdtscll) -- confirm against the full file.
91 static int hvm_hcall_handler(struct guest_info * core , hcall_id_t hcall_id, void * priv_data)
98 V3_Print(core->vm_info,core, "hvm: received hypercall %x rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n",
99 hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, c-core->hvm_state.last_boot_start, core->num_exits);
100 v3_print_core_telemetry(core);
101 // v3_print_guest_state(core);
// Integer ceiling division: x/y rounded up ( !! converts any nonzero
// remainder into +1 ). Arguments are evaluated twice -- avoid side effects.
106 #define CEIL_DIV(x,y) (((x)/(y)) + !!((x)%(y)))
// Parse the <hvm> subtree of the VM configuration and initialize
// vm->hvm_state. Defaults to a pure-ROS VM (is_hvm=0, ROS owns all
// cores and memory); an enabled <hvm> block with valid <ros> and <hrt>
// children partitions cores at first_hrt_core and GPAs at first_hrt_gpa.
// Returns via paths partially elided in this fragment; error paths log
// via PrintError. NOTE(review): several declarations (enable, ros_cores,
// ros_mem, hrt_file_id) and returns are on lines not visible here.
108 int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
110 v3_cfg_tree_t *hvm_config;
111 v3_cfg_tree_t *ros_config;
112 v3_cfg_tree_t *hrt_config;
118 PrintDebug(vm, VCORE_NONE, "hvm: vm init\n");
// Default: not an HVM -- ROS sees every core and all of guest memory.
123 memset(&vm->hvm_state,0,sizeof(struct v3_vm_hvm));
124 vm->hvm_state.is_hvm=0;
125 vm->hvm_state.first_hrt_core=vm->num_cores;
126 vm->hvm_state.first_hrt_gpa=vm->mem_size;
// No <hvm> block at all => plain ROS VM (not an error).
128 if (!config || !(hvm_config=v3_cfg_subtree(config,"hvm"))) {
129 PrintDebug(vm,VCORE_NONE,"hvm: no HVM configuration found (all HW is ROS)\n");
// <hvm enable="..."> must be exactly "y" (case-insensitive) to activate.
133 if (!(enable=v3_cfg_val(hvm_config,"enable")) || strcasecmp(enable,"y")) {
134 PrintDebug(vm,VCORE_NONE,"hvm: HVM configuration disabled (all HW is ROS)\n");
// From here on a malformed <hvm> block is an error, not a fallback.
138 if (!(ros_config=v3_cfg_subtree(hvm_config,"ros"))) {
139 PrintError(vm,VCORE_NONE,"hvm: HVM configuration without ROS block...\n");
143 if (!(ros_cores=v3_cfg_val(ros_config,"cores"))) {
144 PrintError(vm,VCORE_NONE,"hvm: ROS block without cores...\n");
// Cores [0,ROS_CORES) belong to the ROS; the rest are HRT-only.
// NOTE(review): atoi gives no error detection; strtoul would be safer.
148 vm->hvm_state.first_hrt_core = ((uint32_t)atoi(ros_cores));
150 if (!(ros_mem=v3_cfg_val(ros_config,"mem"))) {
151 PrintError(vm,VCORE_NONE,"hvm: ROS block without mem...\n");
// ROS memory size is given in MB; convert to bytes for the GPA split.
155 vm->hvm_state.first_hrt_gpa = ((uint64_t)atoi(ros_mem))*1024*1024;
157 if (!(hrt_config=v3_cfg_subtree(hvm_config,"hrt"))) {
158 PrintError(vm,VCORE_NONE,"hvm: HVM configuration without HRT block...\n");
162 if (!(hrt_file_id=v3_cfg_val(hrt_config,"file_id"))) {
163 PrintError(vm,VCORE_NONE,"hvm: HRT block without file_id...\n");
// Resolve the HRT image (ELF / multiboot kernel / blob) from the file list.
167 vm->hvm_state.hrt_file = v3_cfg_get_file(vm,hrt_file_id);
169 if (!vm->hvm_state.hrt_file) {
170 PrintError(vm,VCORE_NONE,"hvm: HRT block contains bad file_id (%s)\n",hrt_file_id);
// The HRT communicates with the VMM through this hypercall.
174 if (v3_register_hypercall(vm, HVM_HCALL,
175 hvm_hcall_handler, 0)) {
176 PrintError(vm,VCORE_NONE, "hvm: cannot register hypercall....\n");
180 // XXX sanity check config here
182 vm->hvm_state.is_hvm=1;
// Summarize the resulting core/memory partitioning.
185 if (vm->hvm_state.is_hvm) {
186 V3_Print(vm,VCORE_NONE,"hvm: [ROS: cores 0..%u, mem 0..%p] [HRT: cores %u..%u, mem %p..%p, file_id=%s (tag %s)]\n",
187 vm->hvm_state.first_hrt_core-1,
188 (void*) vm->hvm_state.first_hrt_gpa-1,
189 vm->hvm_state.first_hrt_core,
191 (void*) vm->hvm_state.first_hrt_gpa,
192 (void*)vm->mem_size-1,
194 vm->hvm_state.hrt_file->tag);
196 V3_Print(vm,VCORE_NONE,"hvm: This is a pure ROS VM\n");
// VM-level HVM teardown: unregister the HVM hypercall.
203 int v3_deinit_hvm_vm(struct v3_vm_info *vm)
205 PrintDebug(vm, VCORE_NONE, "hvm: HVM VM deinit\n");
207 v3_remove_hypercall(vm,HVM_HCALL);
// Per-core HVM init: mark a core as an HRT core if its vcpu_id falls at
// or above the first HRT core of an HVM-enabled VM; otherwise it is ROS.
212 int v3_init_hvm_core(struct guest_info *core)
214 memset(&core->hvm_state,0,sizeof(core->hvm_state));
215 if (core->vm_info->hvm_state.is_hvm) {
216 if (core->vcpu_id >= core->vm_info->hvm_state.first_hrt_core) {
217 core->hvm_state.is_hrt=1;
// Per-core HVM teardown: nothing to release; trace only.
223 int v3_deinit_hvm_core(struct guest_info *core)
225 PrintDebug(core->vm_info, VCORE_NONE, "hvm: HVM core deinit\n");
// Bytes of guest-physical memory visible to the ROS: [0, first_hrt_gpa)
// for an HVM; presumably the whole of vm->mem_size otherwise (else-branch
// not visible in this fragment).
231 uint64_t v3_get_hvm_ros_memsize(struct v3_vm_info *vm)
233 if (vm->hvm_state.is_hvm) {
234 return vm->hvm_state.first_hrt_gpa;
// Bytes of guest-physical memory reserved exclusively for the HRT:
// [first_hrt_gpa, mem_size) for an HVM (non-HVM branch not visible here).
239 uint64_t v3_get_hvm_hrt_memsize(struct v3_vm_info *vm)
241 if (vm->hvm_state.is_hvm) {
242 return vm->mem_size - vm->hvm_state.first_hrt_gpa;
// Number of cores visible to the ROS: [0, first_hrt_core) for an HVM,
// all cores for a plain VM.
248 uint32_t v3_get_hvm_ros_cores(struct v3_vm_info *vm)
250 if (vm->hvm_state.is_hvm) {
251 return vm->hvm_state.first_hrt_core;
253 return vm->num_cores;
// Number of HRT-only cores: [first_hrt_core, num_cores) for an HVM
// (non-HVM branch, presumably 0, not visible in this fragment).
257 uint32_t v3_get_hvm_hrt_cores(struct v3_vm_info *vm)
259 if (vm->hvm_state.is_hvm) {
260 return vm->num_cores - vm->hvm_state.first_hrt_core;
// True iff gpa lies in the ROS-visible region [0, first_hrt_gpa).
// NOTE(review): if addr_t is unsigned, "gpa>=0" is always true and only
// the upper bound actually constrains -- harmless but worth confirming.
267 int v3_is_hvm_ros_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
269 if (vm->hvm_state.is_hvm) {
270 return gpa>=0 && gpa<vm->hvm_state.first_hrt_gpa;
// True iff gpa lies in the HRT-only region [first_hrt_gpa, mem_size).
276 int v3_is_hvm_hrt_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
278 if (vm->hvm_state.is_hvm) {
279 return gpa>=vm->hvm_state.first_hrt_gpa && gpa<vm->mem_size;
// True iff this core was marked as an HRT core at init time.
285 int v3_is_hvm_hrt_core(struct guest_info *core)
287 return core->hvm_state.is_hrt;
// Complement of v3_is_hvm_hrt_core: a core is ROS iff it is not HRT.
290 int v3_is_hvm_ros_core(struct guest_info *core)
292 return !core->hvm_state.is_hrt;
// IPI delivery policy. With no source core (ioapic/MSI origin -- branch
// condition not visible here) only non-HRT destinations may receive.
// Core-to-core: HRT sources may target anyone; ROS sources only ROS.
295 int v3_hvm_should_deliver_ipi(struct guest_info *src, struct guest_info *dest)
298 // ioapic or msi to apic
299 return !dest->hvm_state.is_hrt;
302 return src->hvm_state.is_hrt || (!src->hvm_state.is_hrt && !dest->hvm_state.is_hrt) ;
// Report, via *start_apic/*num_apics, which APICs are visible. With
// core==NULL (the ioapic/MSI view -- the branch condition is on a line
// not visible here) an HVM exposes only the ROS apics; otherwise a core
// sees all apics if it is HRT, or only the ROS apics if it is ROS.
// NOTE(review): the *start_apic assignments are on elided lines.
306 void v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_info *vm,
307 uint32_t *start_apic, uint32_t *num_apics)
310 // Seen from ioapic, msi, etc:
311 if (vm->hvm_state.is_hvm) {
312 // HVM VM shows only the ROS cores/apics to ioapic, msi, etc
314 *num_apics = vm->hvm_state.first_hrt_core;
316 // Non-HVM shows all cores/APICs to apic, msi, etc.
318 *num_apics = vm->num_cores;
322 if (core->hvm_state.is_hrt) {
323 // HRT core/apic sees all apics
324 // (this policy may change...)
326 *num_apics = vm->num_cores;
328 // non-HRT core/apic sees only non-HRT cores/apics
330 *num_apics = vm->hvm_state.first_hrt_core;
// MIN/MAX evaluate arguments twice -- avoid side effects in callers.
335 #define MAX(x,y) ((x)>(y)?(x):(y))
336 #define MIN(x,y) ((x)<(y)?(x):(y))
// End of the identity-mapped boot region: capped at 1 GB (3-level map)
// or 512 GB (2-level map); the #ifdef HVM_MAP_1G_2M guards selecting
// between these two definitions are on lines not visible here.
339 #define BOOT_STATE_END_ADDR (MIN(vm->mem_size,0x40000000ULL))
341 #define BOOT_STATE_END_ADDR (MIN(vm->mem_size,0x800000000ULL))
// The null interrupt handler stub lives in the last page below
// BOOT_STATE_END_ADDR (the *limit assignment is elided in this view).
344 static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
346 *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR - PAGE_SIZE);
350 extern v3_cpu_arch_t v3_mach_type;
// Start/end markers of the assembly null-interrupt-handler stubs; one
// variant per virtualization architecture (SVM vs VMX).
352 extern void *v3_hvm_svm_null_int_handler_start;
353 extern void *v3_hvm_svm_null_int_handler_end;
354 extern void *v3_hvm_vmx_null_int_handler_start;
355 extern void *v3_hvm_vmx_null_int_handler_end;
// Copy the architecture-appropriate null interrupt handler stub into the
// guest page reserved for it by get_null_int_handler_loc.
357 static void write_null_int_handler(struct v3_vm_info *vm)
364 get_null_int_handler_loc(vm,&base,&limit);
// Select the stub matching the host CPU's virtualization extensions.
// (Additional case labels appear on lines elided from this view.)
366 switch (v3_mach_type) {
369 case V3_SVM_REV3_CPU:
370 data = (void*) &v3_hvm_svm_null_int_handler_start;
371 len = (void*) &v3_hvm_svm_null_int_handler_end - data;
377 case V3_VMX_EPT_UG_CPU:
378 data = (void*) &v3_hvm_vmx_null_int_handler_start;
379 len = (void*) &v3_hvm_vmx_null_int_handler_end - data;
383 PrintError(vm,VCORE_NONE,"hvm: cannot determine CPU type to select null interrupt handler...\n");
// Copy the stub into the guest at the reserved GPA.
389 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base),len,(uint8_t*)data);
392 PrintDebug(vm,VCORE_NONE,"hvm: wrote null interrupt handler at %p (%llu bytes)\n",base,len);
// The stub IDT occupies the page below the null-handler page
// (the *limit assignment is on a line elided from this view).
396 static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
398 *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR - 2 * PAGE_SIZE);
402 // default IDT entries (int and trap gates)
404 // Format is 16 bytes long (field widths in bits):
406 //   16 selector => (target code selector) => 0x8 // entry 1 of GDT
407 //    3 ist      => (stack) = 0 => current stack
409 //    4 type     => 0xe=>INT, 0xf=>TRAP
414 //   32 offsethigh => 0 (total is a 64 bit offset)
417 // offsetlow 00 00 | selector 08 00 | ist 00 | 8[type nybble] | offsetmid | offsethigh | reserved
419 // Note little endian
// Template gate descriptors: present bit + DPL0 + gate type + selector 8,
// with all offset fields zero (write_idt patches the offsets in).
421 static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 } ;
422 static uint64_t idt64_int_gate_entry_mask[2] = { 0x00008e0000080000, 0x0 };
// Build the 256-entry stub IDT in guest memory. Every entry points at
// the null interrupt handler: entries 0..31 (exceptions) get trap gates,
// entries 32..255 (external interrupts) get interrupt gates.
424 static void write_idt(struct v3_vm_info *vm)
429 uint64_t handler_len;
431 uint64_t trap_gate[2];
432 uint64_t int_gate[2];
434 get_idt_loc(vm,&base,&limit);
436 get_null_int_handler_loc(vm,&handler,&handler_len);
// Start from the descriptor templates...
438 memcpy(trap_gate,idt64_trap_gate_entry_mask,16);
439 memcpy(int_gate,idt64_int_gate_entry_mask,16);
442 // update the entries for the handler location
// ...then splice the 64-bit handler address into the three split
// offset fields of each 16-byte gate descriptor (little endian).
446 hand = (uint8_t*) &handler;
448 mask = (uint8_t *)trap_gate;
449 memcpy(&(mask[0]),&(hand[0]),2); // offset low
450 memcpy(&(mask[6]),&(hand[2]),2); // offset med
451 memcpy(&(mask[8]),&(hand[4]),4); // offset high
453 mask = (uint8_t *)int_gate;
454 memcpy(&(mask[0]),&(hand[0]),2); // offset low
455 memcpy(&(mask[6]),&(hand[2]),2); // offset med
456 memcpy(&(mask[8]),&(hand[4]),4); // offset high
458 PrintDebug(vm,VCORE_NONE,"hvm: Adding default null trap and int gates\n");
// (The loop header for entries 0..31 is on a line elided from this view.)
462 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)trap_gate);
465 for (i=32;i<256;i++) {
466 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)int_gate);
469 PrintDebug(vm,VCORE_NONE,"hvm: wrote IDT at %p\n",base);
// The stub GDT occupies the page below the IDT page
// (the *limit assignment is on a line elided from this view).
474 static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
476 *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR - 3 * PAGE_SIZE);
// Minimal long-mode GDT: null, 64-bit code, data. In long mode
// base/limit are ignored for code/data, so only type/L bits matter.
480 static uint64_t gdt64[3] = {
481 0x0000000000000000, /* null */
482 0x00a09a0000000000, /* code (note lme bit) */
483 0x00a0920000000000, /* data (most entries don't matter) */
// Copy the stub GDT into the guest page reserved for it.
486 static void write_gdt(struct v3_vm_info *vm)
491 get_gdt_loc(vm,&base,&limit);
492 v3_write_gpa_memory(&vm->cores[0],(addr_t)base,limit,(uint8_t*) gdt64);
494 PrintDebug(vm,VCORE_NONE,"hvm: wrote GDT at %p\n",base);
// The stub TSS occupies the page below the GDT page
// (the *limit assignment is on a line elided from this view).
499 static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
501 *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR - 4 * PAGE_SIZE);
505 static uint64_t tss_data=0x0;
// Fill the TSS page with zeros, 8 bytes at a time -- a do-nothing TSS.
507 static void write_tss(struct v3_vm_info *vm)
513 get_tss_loc(vm,&base,&limit);
514 for (i=0;i<limit/8;i++) {
515 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+8*i),8,(uint8_t*) &tss_data);
518 PrintDebug(vm,VCORE_NONE,"hvm: wrote TSS at %p\n",base);
522 PTS MAP FIRST 512 GB identity mapped:
530 PTS MAP FIRST 1 GB identity mapped:
// Page tables sit below the TSS page: 3 pages (PML4+PDPT+PD) for the
// 1 GB/2 MB-page map, 2 pages (PML4+PDPT) for the 512 GB/1 GB-page map.
// The #ifdef HVM_MAP_1G_2M selecting between the two arms is elided here.
539 static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
542 *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR-(5+2)*PAGE_SIZE);
543 *limit = 3*PAGE_SIZE;
545 *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR-(5+1)*PAGE_SIZE);
546 *limit = 2*PAGE_SIZE;
550 #ifndef HVM_MAP_1G_2M
// Build a 2-level identity map of the first 512 GB using 1 GB pages:
// one PML4 page (only entry 0 valid) pointing at one PDPT page whose
// 512 entries each map 1 GB.
551 static void write_pt_2level_512GB(struct v3_vm_info *vm)
555 struct pml4e64 pml4e;
559 get_pt_loc(vm,&base, &size);
560 if (size!=2*PAGE_SIZE) {
561 PrintError(vm,VCORE_NONE,"Cannot support pt request, defaulting\n");
564 if (vm->mem_size > 0x800000000ULL) {
565 PrintError(vm,VCORE_NONE, "VM has more than 512 GB\n");
// PDPT entries: flag setup lines are elided in this view.
568 memset(&pdpe,0,sizeof(pdpe));
573 for (i=0;i<512;i++) {
574 pdpe.pd_base_addr = i*0x40000; // 0x40000 = 256K 4K-frames = 1 GB per entry
575 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+PAGE_SIZE+i*sizeof(pdpe)),sizeof(pdpe),(uint8_t*)&pdpe);
// PML4: entry 0 points at the PDPT page just written...
578 memset(&pml4e,0,sizeof(pml4e));
581 pml4e.pdp_base_addr = PAGE_BASE_ADDR((addr_t)(base+PAGE_SIZE));
583 v3_write_gpa_memory(&vm->cores[0],(addr_t)base,sizeof(pml4e),(uint8_t*)&pml4e);
// ...and entries 1..511 are written non-present (cleared on elided lines).
585 for (i=1;i<512;i++) {
587 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*sizeof(pml4e)),sizeof(pml4e),(uint8_t*)&pml4e);
590 PrintDebug(vm,VCORE_NONE,"hvm: Wrote page tables (1 PML4, 1 PDPE) at %p (512 GB mapped)\n",base);
// Build a 3-level identity map of the first 1 GB using 2 MB pages:
// one PML4 (entry 0 valid) -> one PDPT (entry 0 valid) -> one PD whose
// 512 entries each map 2 MB. Used for Nautilus compatibility.
595 static void write_pt_3level_1GB(struct v3_vm_info *vm)
599 struct pml4e64 pml4e;
605 get_pt_loc(vm,&base, &size);
606 if (size!=3*PAGE_SIZE) {
607 PrintError(vm,VCORE_NONE,"Cannot support pt request, defaulting\n");
610 if (vm->mem_size > 0x40000000ULL) {
611 PrintError(vm,VCORE_NONE, "VM has more than 1 GB\n");
// PD entries: flag setup lines are elided in this view.
614 memset(&pde,0,sizeof(pde));
619 for (i=0;i<512;i++) {
620 pde.pt_base_addr = i*0x200; // 0x200 = 512 4K-frames = 2 MB per entry
621 v3_write_gpa_memory(&vm->cores[0],
622 (addr_t)(base+2*PAGE_SIZE+i*sizeof(pde)),
623 sizeof(pde),(uint8_t*)&pde);
// PDPT: entry 0 points at the PD page just written...
626 memset(&pdpe,0,sizeof(pdpe));
631 pdpe.pd_base_addr = PAGE_BASE_ADDR((addr_t)(base+2*PAGE_SIZE));
633 v3_write_gpa_memory(&vm->cores[0],(addr_t)base+PAGE_SIZE,sizeof(pdpe),(uint8_t*)&pdpe);
// ...entries 1..511 are written non-present (cleared on elided lines).
635 for (i=1;i<512;i++) {
637 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+PAGE_SIZE+i*sizeof(pdpe)),sizeof(pdpe),(uint8_t*)&pdpe);
// PML4: same pattern -- entry 0 valid, rest non-present.
640 memset(&pml4e,0,sizeof(pml4e));
643 pml4e.pdp_base_addr = PAGE_BASE_ADDR((addr_t)(base+PAGE_SIZE));
645 v3_write_gpa_memory(&vm->cores[0],(addr_t)base,sizeof(pml4e),(uint8_t*)&pml4e);
647 for (i=1;i<512;i++) {
649 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*sizeof(pml4e)),sizeof(pml4e),(uint8_t*)&pml4e);
652 PrintDebug(vm,VCORE_NONE,"hvm: Wrote page tables (1 PML4, 1 PDPE, 1 PDP) at %p (1 GB mapped)\n",base);
// Dispatch to the configured page-table builder (the #ifdef HVM_MAP_1G_2M
// guards selecting between the two calls are on lines elided here).
657 static void write_pt(struct v3_vm_info *vm)
660 write_pt_3level_1GB(vm);
662 write_pt_2level_512GB(vm);
// The boundary page sits directly below the page tables (2 or 1 PT pages
// depending on HVM_MAP_1G_2M; the #ifdef arms and *limit are elided).
666 static void get_bp_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
669 *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR-(6+2)*PAGE_SIZE);
671 *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR-(6+1)*PAGE_SIZE);
// Fill the boundary page with a sentinel pattern ('data' is initialized
// on an elided line) so stack overruns into it are detectable.
676 static void write_bp(struct v3_vm_info *vm)
683 get_bp_loc(vm,&base,&limit);
685 for (i=0;i<limit/8;i++) {
686 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*8),8,(uint8_t*)&data);
689 PrintDebug(vm,VCORE_NONE,"hvm: wrote boundary page at %p\n", base);
// Minimum stack the HRT must be left between its image and the boundary page.
693 #define MIN_STACK (4096*4)
// The HRT image region: starts at the first HRT GPA and extends up to the
// boundary page; fails if that leaves no room (stack adjustment of
// bp_base by MIN_STACK happens on lines elided from this view).
696 static void get_hrt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
701 get_bp_loc(vm,&bp_base,&bp_limit);
703 // assume at least a minimal stack
707 *base = (void*)PAGE_ADDR(vm->hvm_state.first_hrt_gpa);
709 if (bp_base < *base+PAGE_SIZE) {
710 PrintError(vm,VCORE_NONE,"hvm: HRT stack colides with HRT\n");
713 *limit = bp_base - *base;
// Convenience logging wrappers used by the multiboot scanning code.
717 #define ERROR(fmt, args...) PrintError(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
718 #define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
// "\x7fELF" as a little-endian 32-bit word, and the multiboot2 header /
// bootloader-info magics.
720 #define ELF_MAGIC 0x464c457f
721 #define MB2_MAGIC 0xe85250d6
723 #define MB2_INFO_MAGIC 0x36d76289
// True iff the buffer starts with the ELF magic (size is currently
// unused on the visible lines -- no bounds check before the 4-byte read).
725 static int is_elf(uint8_t *data, uint64_t size)
727 if (*((uint32_t*)data)==ELF_MAGIC) {
// Scan the first 32 KB of the image (or the whole image if smaller) for
// the 4-byte-aligned multiboot2 magic; return a pointer to the header,
// or NULL (on an elided line) if not found.
734 static mb_header_t *find_mb_header(uint8_t *data, uint64_t size)
736 uint64_t limit = size > 32768 ? 32768 : size;
739 // Scan for the .boot magic cookie
740 // must be in first 32K, assume 4 byte aligned
741 for (i=0;i<limit;i+=4) {
742 if (*((uint32_t*)&data[i])==MB2_MAGIC) {
743 INFO("Found multiboot header at offset 0x%llx\n",i);
744 return (mb_header_t *) &data[i];
752 // BROKEN - THIS DOES NOT DO WHAT YOU THINK
//
// Blindly copies the whole HRT file to 'base' and guesses that the entry
// point is at base+0x40 (no ELF program-header parsing at all) -- hence
// the warning above. Kept as a fallback for non-multiboot images.
754 static int setup_elf(struct v3_vm_info *vm, void *base, uint64_t limit)
756 v3_write_gpa_memory(&vm->cores[0],(addr_t)base,vm->hvm_state.hrt_file->size,vm->hvm_state.hrt_file->data);
758 vm->hvm_state.hrt_entry_addr = (uint64_t) (base+0x40);
760 PrintDebug(vm,VCORE_NONE,"hvm: wrote HRT ELF %s at %p\n", vm->hvm_state.hrt_file->tag,base);
761 PrintDebug(vm,VCORE_NONE,"hvm: set ELF entry to %p and hoping for the best...\n", (void*) vm->hvm_state.hrt_entry_addr);
763 vm->hvm_state.hrt_type = HRT_ELF64;
// Load a multiboot2 "64" kernel: parse its header, validate that the
// declared load range fits within [base, base+limit), then copy the file
// (minus an assumed one-page ELF header) to its requested load address
// and record the header-declared entry point.
769 static int setup_mb_kernel(struct v3_vm_info *vm, void *base, uint64_t limit)
775 // FIX USING GENERIC TOOLS
777 if (v3_parse_multiboot_header(vm->hvm_state.hrt_file,&mb)) {
778 PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n");
782 if (!mb.addr || !mb.entry) {
783 PrintError(vm,VCORE_NONE, "hvm: kernel is missing address or entry point\n");
// Refuse kernels whose load/bss range escapes the allotted HRT region.
787 if (((void*)(uint64_t)(mb.addr->header_addr) < base ) ||
788 ((void*)(uint64_t)(mb.addr->load_end_addr) > base+limit) ||
789 ((void*)(uint64_t)(mb.addr->bss_end_addr) > base+limit)) {
790 PrintError(vm,VCORE_NONE, "hvm: kernel is not within the allowed portion of HVM\n");
// File offset of the loadable portion relative to the header.
794 offset = mb.addr->load_addr - mb.addr->header_addr;
796 // Skip the ELF header - assume 1 page... weird....
797 v3_write_gpa_memory(&vm->cores[0],
798 (addr_t)(mb.addr->load_addr),
799 vm->hvm_state.hrt_file->size-PAGE_SIZE-offset,
800 vm->hvm_state.hrt_file->data+PAGE_SIZE+offset);
803 // vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + PAGE_SIZE; //HACK PAD
805 vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr;
807 vm->hvm_state.hrt_type = HRT_MBOOT64;
809 PrintDebug(vm,VCORE_NONE,
810 "hvm: wrote 0x%llx bytes starting at offset 0x%llx to %p; set entry to %p\n",
811 (uint64_t) vm->hvm_state.hrt_file->size-PAGE_SIZE-offset,
812 (uint64_t) PAGE_SIZE+offset,
813 (void*)(addr_t)(mb.addr->load_addr),
814 (void*) vm->hvm_state.hrt_entry_addr);
// Place the HRT image into the guest: size-check it against the HRT
// region, then dispatch -- non-ELF => treat as raw blob via the fake ELF
// path; ELF with a multiboot2 header => proper multiboot load; plain
// ELF => fake ELF path again.
820 static int setup_hrt(struct v3_vm_info *vm)
825 get_hrt_loc(vm,&base,&limit);
827 if (vm->hvm_state.hrt_file->size > limit) {
828 PrintError(vm,VCORE_NONE,"hvm: Cannot map HRT because it is too big (%llu bytes, but only have %llu space\n", vm->hvm_state.hrt_file->size, (uint64_t)limit);
832 if (!is_elf(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) {
833 PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not an ELF but we are going to act like it is!\n");
834 if (setup_elf(vm,base,limit)) {
835 PrintError(vm,VCORE_NONE,"hvm: Fake ELF setup failed\n");
// Downgrade the type recorded by setup_elf: this was really a raw blob.
838 vm->hvm_state.hrt_type=HRT_BLOB;
840 if (find_mb_header(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) {
841 PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n");
842 if (setup_mb_kernel(vm,base,limit)) {
843 PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n");
847 PrintDebug(vm,VCORE_NONE,"hvm: supplied HRT is an ELF\n");
848 if (setup_elf(vm,base,limit)) {
849 PrintError(vm,VCORE_NONE,"hvm: Fake ELF setup failed\n");
868 We do not touch the ROS portion of the address space.
869 The HRT portion (top of the boot region, growing downward) looks like:
871 INT_HANDLER (1 page - page aligned)
872 IDT (1 page - page aligned)
873 GDT (1 page - page aligned)
874 TSS (1 page - page aligned)
875 PAGETABLES (identity map of first N GB)
876 ROOT PT first, followed by 2nd level, etc.
877 Currently PML4 followed by 1 PDPE for 512 GB of mapping
878 BOUNDARY PAGE (all 0xff - avoid smashing page tables in case we keep going...)
879 (stack - we will push machine description)
881 HRT (as many pages as needed, page-aligned, starting at first HRT address)
// Populate the HRT portion of guest memory per the layout above: the
// null handler, IDT, GDT, TSS, page tables, boundary page, and the HRT
// image itself (most write_* calls are on lines elided from this view).
888 int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm)
890 if (!vm->hvm_state.is_hvm) {
891 PrintDebug(vm,VCORE_NONE,"hvm: skipping HVM setup for boot as this is not an HVM\n");
895 PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory begins\n");
897 write_null_int_handler(vm);
907 PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n");
912 PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory done\n");
920 IDTR points to stub IDT
921 GDTR points to stub GDT
922 TS points to stub TSS
923 CR3 points to root page table
926 RSP is TOS (looks like a call)
928 0 (fake return address) <= RSP
930 RIP is entry point to HRT
931 RDI points to machine info on stack
933 Other regs are zeroed
935 shadow/nested paging state reset for long mode
// Put an HRT core directly into 64-bit long mode at the HRT entry point,
// with the stub IDT/GDT/TSS/page tables built by v3_setup_hvm_vm_for_boot,
// and (for multiboot HRTs) the MB2 info struct pushed on the stack.
// NOTE(review): this function is truncated at the end of this view.
938 int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
// Record when this (re)boot started, for the hypercall-time cycle delta.
943 rdtscll(core->hvm_state.last_boot_start);
945 if (!core->hvm_state.is_hrt) {
946 PrintDebug(core->vm_info,core,"hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id);
950 PrintDebug(core->vm_info, core, "hvm: setting up HRT core (%u) for boot\n", core->vcpu_id);
// Start from a clean slate: zero all architectural state we then set.
953 memset(&core->vm_regs,0,sizeof(core->vm_regs));
954 memset(&core->ctrl_regs,0,sizeof(core->ctrl_regs));
955 memset(&core->dbg_regs,0,sizeof(core->dbg_regs));
956 memset(&core->segments,0,sizeof(core->segments));
957 memset(&core->msrs,0,sizeof(core->msrs));
958 memset(&core->fp_state,0,sizeof(core->fp_state));
960 // We are in long mode with virtual memory and we want
961 // to start immediately
962 core->cpl = 0; // we are going right into the kernel
963 core->cpu_mode = LONG;
964 core->mem_mode = VIRTUAL_MEM;
965 core->core_run_state = CORE_RUNNING ;
967 // We are going to enter right into the HRT
968 // HRT stack and argument passing
969 get_bp_loc(core->vm_info, &base,&limit);
970 // TODO: push description here
971 core->vm_regs.rsp = (v3_reg_t) base; // so if we ret, we will blow up
972 core->vm_regs.rbp = (v3_reg_t) base;
973 // TODO: RDI should really get pointer to description
974 core->vm_regs.rdi = (v3_reg_t) base;
// Entry point recorded by setup_elf/setup_mb_kernel.
976 get_hrt_loc(core->vm_info, &base,&limit);
977 core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr ;
979 // Setup CRs for long mode and our stub page table
// CR0: PG | PE.
981 core->ctrl_regs.cr0 = 0x80000001;
982 core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0;
984 // CR2: don't care (output from #PF)
985 // CR3: set to our PML4E, without setting PCD or PWT
986 get_pt_loc(core->vm_info, &base,&limit);
987 core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base);
988 core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3;
990 // CR4: PGE, PAE, PSE (last byte: 1 0 1 1 0 0 0 0)
991 core->ctrl_regs.cr4 = 0xb0;
992 core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4;
994 // RFLAGS zeroed is fine: come in with interrupts off
995 // EFER needs SVME LMA LME (last 16 bits: 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0)
996 core->ctrl_regs.efer = 0x1500;
997 core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer;
1003 selector is 13 bits of index, 1 bit table indicator
1006 index is scaled by 8, even in long mode, where some entries
1007 are 16 bytes long....
1008 -> code, data descriptors have 8 byte format
1009 because base, limit, etc, are ignored (no segmentation)
1010 -> interrupt/trap gates have 16 byte format
1011 because offset needs to be 64 bits
1014 // Install our stub IDT
1015 get_idt_loc(core->vm_info, &base,&limit);
1016 core->segments.idtr.selector = 0; // entry 0 (NULL) of the GDT
1017 core->segments.idtr.base = (addr_t) base;
1018 core->segments.idtr.limit = limit-1;
1019 core->segments.idtr.type = 0xe;
1020 core->segments.idtr.system = 1;
1021 core->segments.idtr.dpl = 0;
1022 core->segments.idtr.present = 1;
1023 core->segments.idtr.long_mode = 1;
1025 // Install our stub GDT
1026 get_gdt_loc(core->vm_info, &base,&limit);
1027 core->segments.gdtr.selector = 0;
1028 core->segments.gdtr.base = (addr_t) base;
1029 core->segments.gdtr.limit = limit-1;
1030 core->segments.gdtr.type = 0x6;
1031 core->segments.gdtr.system = 1;
1032 core->segments.gdtr.dpl = 0;
1033 core->segments.gdtr.present = 1;
1034 core->segments.gdtr.long_mode = 1;
// Install our stub TSS (task register).
1037 get_tss_loc(core->vm_info, &base,&limit);
1038 core->segments.tr.selector = 0;
1039 core->segments.tr.base = (addr_t) base;
1040 core->segments.tr.limit = limit-1;
1041 core->segments.tr.type = 0x6;
1042 core->segments.tr.system = 1;
1043 core->segments.tr.dpl = 0;
1044 core->segments.tr.present = 1;
1045 core->segments.tr.long_mode = 1;
// CS: flat 64-bit code segment (base/limit variables are set on elided lines).
1051 core->segments.cs.selector = 0x8 ; // entry 1 of GDT (RPL=0)
1052 core->segments.cs.base = (addr_t) base;
1053 core->segments.cs.limit = limit;
1054 core->segments.cs.type = 0xe;
1055 core->segments.cs.system = 0;
1056 core->segments.cs.dpl = 0;
1057 core->segments.cs.present = 1;
1058 core->segments.cs.long_mode = 1;
1060 // DS, SS, etc are identical
1061 core->segments.ds.selector = 0x10; // entry 2 of GDT (RPL=0)
1062 core->segments.ds.base = (addr_t) base;
1063 core->segments.ds.limit = limit;
1064 core->segments.ds.type = 0x6;
1065 core->segments.ds.system = 0;
1066 core->segments.ds.dpl = 0;
1067 core->segments.ds.present = 1;
1068 core->segments.ds.long_mode = 1;
1070 memcpy(&core->segments.ss,&core->segments.ds,sizeof(core->segments.ds));
1071 memcpy(&core->segments.es,&core->segments.ds,sizeof(core->segments.ds));
1072 memcpy(&core->segments.fs,&core->segments.ds,sizeof(core->segments.ds));
1073 memcpy(&core->segments.gs,&core->segments.ds,sizeof(core->segments.ds));
1076 if (core->vm_info->hvm_state.hrt_type==HRT_MBOOT64) {
1078 Temporary hackery for multiboot2 "64"
1079 We will push the MB structure onto the stack and update RSP
// Build the MB2 info table into a local buffer ('buf', declared on an
// elided line), then push it onto the HRT stack.
1085 if ((size=v3_build_multiboot_table(core,buf,256))==-1) {
1086 PrintError(core->vm_info,core,"hvm: Failed to write MB info\n");
1089 core->vm_regs.rsp -= size;
1091 v3_write_gpa_memory(core,
1096 PrintDebug(core->vm_info,core, "hvm: wrote MB info at %p\n", (void*)core->vm_regs.rsp);
1098 if (core->vcpu_id == core->vm_info->hvm_state.first_hrt_core) {
1099 // We are the BSP for this HRT
1100 // this is where rbx needs to point
1101 core->vm_regs.rbx = core->vm_regs.rsp;
1102 PrintDebug(core->vm_info,core, "hvm: \"BSP\" core\n");
1104 // We are an AP for this HRT
1105 // so we don't get the multiboot struct
1106 core->vm_regs.rbx = 0;
1107 PrintDebug(core->vm_info,core, "hvm: \"AP\" core\n");
1112 // one more push, something that looks like a return address
1114 core->vm_regs.rsp -= 8;
1116 v3_write_gpa_memory(core,
1121 // Now for our magic - this signals
1122 // the kernel that a multiboot loader loaded it
1123 // and that rbx points to its offered data
1124 core->vm_regs.rax = MB2_INFO_MAGIC;
1127 Note that "real" MB starts in protected mode without paging
1128 This hack starts in long mode... so these requirements go
1129 out the window for a large part
1134 OK EBX points to MB info
1135 OK CS = base 0, offset big, code (LONG MODE)
1136 OK DS,ES,FS,GS,SS => base 0, offset big, data (LONG MODE)
1138 XXX CR0 PE on PG off (nope)
1139 XXX EFLAGS IF and VM off
1147 // reset paging here for shadow...
1149 if (core->shdw_pg_mode != NESTED_PAGING) {
1150 PrintError(core->vm_info, core, "hvm: shadow paging guest... this will end badly\n");