2 * This file is part of the Palacios Virtual Machine Monitor developed
3 * by the V3VEE Project with funding from the United States National
4 * Science Foundation and the Department of Energy.
6 * The V3VEE Project is a joint project between Northwestern University
7 * and the University of New Mexico. You can find out more at
10 * Copyright (c) 2015, The V3VEE Project <http://www.v3vee.org>
11 * All rights reserved.
13 * Author: Peter Dinda <pdinda@northwestern.edu>
15 * This is free software. You are permitted to use,
16 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
19 #include <palacios/vmm_mem.h>
20 #include <palacios/vmm.h>
21 #include <palacios/vmm_util.h>
22 #include <palacios/vmm_emulator.h>
23 #include <palacios/vm_guest.h>
24 #include <palacios/vmm_debug.h>
25 #include <palacios/vmm_hypercall.h>
27 #include <palacios/vmm_xml.h>
29 #include <palacios/vm_guest_mem.h>
31 #include <palacios/vmm_debug.h>
36 MEM = Total size of memory in the GPA (in MB)
37 ROS_MEM = Total size of memory for the ROS (in MB) (<RAM)
39 GPAs [0,ROS_MEM) are what the ROS sees
40 GPAs [ROS_MEM, MEM) are HRT only
41 GPAS [0,MEM) are accessible by the HRT
43 CORES = Total number of cores in VM
44 ROS_CORES = Total number of cores for the ROS
46 Cores [0,ROS_CORES) are what the ROS sees
47 Cores [ROS_CORES,CORES) are HRT only
48 Cores [0,CORES) are accessible by the HRT
53 <file id="hrtelf" filename="hrtelf.o" />
56 <mem ... >RAM</mem> (MB) Note these are
57 <cores count="CORES" ...> backward compatible
60 <ros cores="ROS_CORES" mem="ROS_MEM" /> (MB)
61 <hrt file_id="hrtelf" /hrt>
66 #ifndef V3_CONFIG_DEBUG_HVM
68 #define PrintDebug(fmt, args...)
74 PrintDebug(VM_NONE,VCORE_NONE, "hvm: init\n");
80 PrintDebug(VM_NONE,VCORE_NONE, "hvm: deinit\n");
84 // ignore requests from when we are in the wrong state
85 #define ENFORCE_STATE_MACHINE 1
87 // invoke the HRT using a page fault instead of
88 // the SWINTR mechanism
89 #define USE_UPCALL_MAGIC_PF 1
// NOTE(review): presumably chosen as a non-canonical/unmapped address so the
// magic #PF is unambiguous to the HRT - confirm against HRT-side handler.
90 #define UPCALL_MAGIC_ADDRESS 0x0000800df00df00dULL
// Error code pushed with the magic #PF so the HRT can recognize the upcall.
91 #define UPCALL_MAGIC_ERROR 0xf00df00d
94 64 bit only hypercall:
96 rax = hypercall number
98 then args are: rcx, rdx, rsi, rdi r8, r9, r10, r11
// Handle the HVM hypercall interface (64-bit callers only):
//   rbx = bitness tag (0x6464646464646464 marks a 64-bit caller)
//   rcx (a1) = hypercall subcode, rdx (a2) = argument
//   result returned in rax (0 = ok, -1 = error).
// NOTE(review): this excerpt elides some original lines (switch scaffolding,
// break statements, closing braces) - verify control flow in full source.
101 static int hvm_hcall_handler(struct guest_info * core , hcall_id_t hcall_id, void * priv_data)
104 uint64_t bitness = core->vm_regs.rbx;
105 uint64_t a1 = core->vm_regs.rcx;
106 uint64_t a2 = core->vm_regs.rdx;
107 struct v3_vm_hvm *h = &core->vm_info->hvm_state;
// Reject anything not flagged as a 64-bit hypercall.
110 if (bitness!=0x6464646464646464) {
111 PrintError(core->vm_info,core,"hvm: unable to handle non-64 bit hypercall\n");
112 core->vm_regs.rax = -1;
121 V3_Print(core->vm_info,core, "hvm: received hypercall %x rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n",
122 hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, core->hvm_state.last_boot_start, core->num_exits);
123 //v3_print_core_telemetry(core);
124 // v3_print_guest_state(core);
125 core->vm_regs.rax = 0;
// 0x1/0x2/0x3: reset the ROS, the HRT, or both via the extended reset path.
128 case 0x1: // reset ros
129 PrintDebug(core->vm_info,core, "hvm: reset ROS\n");
130 if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ROS,0)) {
131 PrintError(core->vm_info,core, "hvm: reset of ROS failed\n");
132 core->vm_regs.rax = -1;
134 core->vm_regs.rax = 0;
138 case 0x2: // reset hrt
139 PrintDebug(core->vm_info,core, "hvm: reset HRT\n");
140 if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_HRT,0)) {
141 PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
142 core->vm_regs.rax = -1;
144 core->vm_regs.rax = 0;
148 case 0x3: // reset both
149 PrintDebug(core->vm_info,core, "hvm: reset ROS+HRT\n");
150 if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ALL,0)) {
151 PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
152 core->vm_regs.rax = -1;
154 core->vm_regs.rax = 0;
// 0xf: let the caller poll the ROS<->HRT transaction state machine.
158 case 0xf: // get HRT state
159 core->vm_regs.rax = h->trans_state;
160 //PrintDebug(core->vm_info,core,"hvm: get HRT transaction state 0x%llx\n",core->vm_regs.rax);
// 0x20/0x21: ROS asks the HRT to run a function (0x21 = on all HRT cores).
// The target HRT core(s) are kicked with either a magic #PF or a software
// interrupt, depending on USE_UPCALL_MAGIC_PF.
163 case 0x20: // invoke function (ROS->HRT)
164 case 0x21: // invoke parallel function (ROS->HRT)
165 if (v3_is_hvm_hrt_core(core)) {
166 PrintError(core->vm_info,core,"hvm: %s function invocation not supported from HRT core\n", a1==0x20 ? "" : "parallel");
167 core->vm_regs.rax = -1;
169 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) {
170 PrintError(core->vm_info,core, "hvm: cannot invoke %s function %p in state %d\n",a1==0x20 ? "" : "parallel", (void*)a2,h->trans_state);
171 core->vm_regs.rax = -1;
173 uint64_t *page = (uint64_t *) h->comm_page_hva;
174 uint64_t first, last, cur;
176 PrintDebug(core->vm_info,core, "hvm: %s invoke function %p\n",a1==0x20 ? "" : "parallel",(void*)a2);
// Serial call targets only the first HRT core; parallel targets all of them.
181 first=last=h->first_hrt_core;
183 first=h->first_hrt_core;
184 last=core->vm_info->num_cores-1;
187 core->vm_regs.rax = 0;
// Count of cores that must report completion (decremented in case 0x2f).
189 h->trans_count = last-first+1;
191 for (cur=first;cur<=last;cur++) {
193 #if USE_UPCALL_MAGIC_PF
194 PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",cur);
195 core->vm_info->cores[cur].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
196 if (v3_raise_exception_with_error(&core->vm_info->cores[cur],
198 UPCALL_MAGIC_ERROR)) {
199 PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",cur);
200 core->vm_regs.rax = -1;
204 PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %llu\n",h->hrt_int_vector,cur);
205 if (v3_raise_swintr(&core->vm_info->cores[cur],h->hrt_int_vector)) {
206 PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",cur);
207 core->vm_regs.rax = -1;
211 // Force core to exit now
212 v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
// Only advance the state machine if every injection succeeded.
215 if (core->vm_regs.rax==0) {
217 h->trans_state = HRT_CALL;
219 h->trans_state = HRT_PARCALL;
222 PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
223 h->trans_state = HRT_IDLE;
// 0x2f: an HRT core reports completion; the last finisher (atomic
// countdown on trans_count) returns the state machine to idle.
231 case 0x2f: // function exec done
232 if (v3_is_hvm_ros_core(core)) {
233 PrintError(core->vm_info,core, "hvm: request for exec done from ROS core\n");
234 core->vm_regs.rax=-1;
236 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_CALL && h->trans_state!=HRT_PARCALL) {
237 PrintError(core->vm_info,core,"hvm: function completion when not in HRT_CALL or HRT_PARCALL state\n");
238 core->vm_regs.rax=-1;
241 PrintDebug(core->vm_info,core, "hvm: function complete\n");
242 if (__sync_fetch_and_sub(&h->trans_count,one)==1) {
243 // last one, switch state
244 h->trans_state=HRT_IDLE;
245 PrintDebug(core->vm_info,core, "hvm: function complete - back to idle\n");
// 0x30/0x31: ROS asks the HRT to (un)merge the ROS address space into its
// own; the caller's CR3 is published in the comm page, then the first HRT
// core is kicked exactly as for a function invocation.
253 case 0x30: // merge address space
254 case 0x31: // unmerge address space
255 if (v3_is_hvm_hrt_core(core)) {
256 PrintError(core->vm_info,core,"hvm: request to %smerge address space from HRT core\n", a1==0x30 ? "" : "un");
257 core->vm_regs.rax=-1;
259 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) {
260 PrintError(core->vm_info,core,"hvm: request to %smerge address space in non-idle state\n",a1==0x30 ? "" : "un");
261 core->vm_regs.rax=-1;
263 uint64_t *page = (uint64_t *) h->comm_page_hva;
265 PrintDebug(core->vm_info,core,"hvm: %smerge address space request with %p\n",a1==0x30 ? "" : "un",(void*)core->ctrl_regs.cr3);
266 // should sanity check to make sure guest is in 64 bit without anything strange
269 page[1] = core->ctrl_regs.cr3; // this is a do-not-care for an unmerge
271 core->vm_regs.rax = 0;
272 #if USE_UPCALL_MAGIC_PF
273 PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %u\n",h->first_hrt_core);
274 core->vm_info->cores[h->first_hrt_core].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
275 if (v3_raise_exception_with_error(&core->vm_info->cores[h->first_hrt_core],
277 UPCALL_MAGIC_ERROR)) {
278 PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %u\n",h->first_hrt_core);
279 core->vm_regs.rax = -1;
283 PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %u\n",h->hrt_int_vector,h->first_hrt_core);
284 if (v3_raise_swintr(&core->vm_info->cores[h->first_hrt_core],h->hrt_int_vector)) {
285 PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %u\n",h->first_hrt_core);
286 core->vm_regs.rax = -1;
289 // Force core to exit now
290 v3_interrupt_cpu(core->vm_info,core->vm_info->cores[h->first_hrt_core].pcpu_id,0);
292 h->trans_state = HRT_MERGE;
// 0x3f: the HRT reports that the (un)merge finished.
300 case 0x3f: // merge operation done
301 if (v3_is_hvm_ros_core(core)) {
302 PrintError(core->vm_info,core, "hvm: request for merge done from ROS core\n");
303 core->vm_regs.rax=-1;
305 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_MERGE) {
306 PrintError(core->vm_info,core,"hvm: merge/unmerge done when in non-idle state\n");
307 core->vm_regs.rax=-1;
309 PrintDebug(core->vm_info,core, "hvm: merge or unmerge complete - back to idle\n");
310 h->trans_state=HRT_IDLE;
// Unknown hypercall subcodes are rejected.
318 PrintError(core->vm_info,core,"hvm: unknown hypercall %llx\n",a1);
319 core->vm_regs.rax=-1;
// Integer ceiling division: CEIL_DIV(x,y) == ceil(x/y) for y > 0.
// (Arguments are evaluated more than once - avoid side effects.)
326 #define CEIL_DIV(x,y) (((x)/(y)) + !!((x)%(y)))
// Parse the optional <hvm> configuration subtree and initialize per-VM HVM
// state. With no (or disabled) HVM config the VM is left as a pure ROS VM
// (first_hrt_core == num_cores, first_hrt_gpa == mem_size).
// NOTE(review): this excerpt elides some lines (declarations, returns).
328 int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
330 v3_cfg_tree_t *hvm_config;
331 v3_cfg_tree_t *ros_config;
332 v3_cfg_tree_t *hrt_config;
338 PrintDebug(vm, VCORE_NONE, "hvm: vm init\n");
// Defaults: not an HVM; all cores and all memory belong to the ROS.
343 memset(&vm->hvm_state,0,sizeof(struct v3_vm_hvm));
344 vm->hvm_state.is_hvm=0;
345 vm->hvm_state.first_hrt_core=vm->num_cores;
346 vm->hvm_state.first_hrt_gpa=vm->mem_size;
348 if (!config || !(hvm_config=v3_cfg_subtree(config,"hvm"))) {
349 PrintDebug(vm,VCORE_NONE,"hvm: no HVM configuration found (all HW is ROS)\n");
353 if (!(enable=v3_cfg_val(hvm_config,"enable")) || strcasecmp(enable,"y")) {
354 PrintDebug(vm,VCORE_NONE,"hvm: HVM configuration disabled (all HW is ROS)\n");
358 if (!(ros_config=v3_cfg_subtree(hvm_config,"ros"))) {
359 PrintError(vm,VCORE_NONE,"hvm: HVM configuration without ROS block...\n");
363 if (!(ros_cores=v3_cfg_val(ros_config,"cores"))) {
364 PrintError(vm,VCORE_NONE,"hvm: ROS block without cores...\n");
// NOTE(review): atoi() gives no error reporting; strtol() would allow
// rejecting malformed "cores"/"mem" values.
368 vm->hvm_state.first_hrt_core = ((uint32_t)atoi(ros_cores));
370 if (!(ros_mem=v3_cfg_val(ros_config,"mem"))) {
371 PrintError(vm,VCORE_NONE,"hvm: ROS block without mem...\n");
// ROS memory size is given in MB.
375 vm->hvm_state.first_hrt_gpa = ((uint64_t)atoi(ros_mem))*1024*1024;
377 if (!(hrt_config=v3_cfg_subtree(hvm_config,"hrt"))) {
378 PrintError(vm,VCORE_NONE,"hvm: HVM configuration without HRT block...\n");
382 if (!(hrt_file_id=v3_cfg_val(hrt_config,"file_id"))) {
383 PrintError(vm,VCORE_NONE,"hvm: HRT block without file_id...\n");
387 vm->hvm_state.hrt_file = v3_cfg_get_file(vm,hrt_file_id);
389 if (!vm->hvm_state.hrt_file) {
390 PrintError(vm,VCORE_NONE,"hvm: HRT block contains bad file_id (%s)\n",hrt_file_id);
394 if (v3_register_hypercall(vm, HVM_HCALL,
395 hvm_hcall_handler, 0)) {
396 PrintError(vm,VCORE_NONE, "hvm: cannot register hypercall....\n");
400 // XXX sanity check config here
402 vm->hvm_state.is_hvm=1;
405 if (vm->hvm_state.is_hvm) {
406 V3_Print(vm,VCORE_NONE,"hvm: [ROS: cores 0..%u, mem 0..%p] [HRT: cores %u..%u, mem %p..%p, file_id=%s (tag %s)]\n",
407 vm->hvm_state.first_hrt_core-1,
408 (void*) vm->hvm_state.first_hrt_gpa-1,
409 vm->hvm_state.first_hrt_core,
411 (void*) vm->hvm_state.first_hrt_gpa,
412 (void*)vm->mem_size-1,
414 vm->hvm_state.hrt_file->tag);
416 V3_Print(vm,VCORE_NONE,"hvm: This is a pure ROS VM\n");
// Tear down per-VM HVM state: unregister the hypercall and release the
// shadow memory region backing the ROS<->HRT communication page, if any.
423 int v3_deinit_hvm_vm(struct v3_vm_info *vm)
425 PrintDebug(vm, VCORE_NONE, "hvm: HVM VM deinit\n");
427 v3_remove_hypercall(vm,HVM_HCALL);
429 if (vm->hvm_state.comm_page_hpa) {
// NOTE(review): the region lookup uses the comm page HPA where the mapping
// was installed by GPA (see v3_add_shadow_mem(...comm_page_gpa...) in
// configure_hrt) - looks like the wrong key; confirm against full source.
430 struct v3_mem_region *r = v3_get_mem_region(vm,-1,(addr_t)vm->hvm_state.comm_page_hpa);
432 PrintError(vm,VCORE_NONE,"hvm: odd, VM has comm_page_hpa, but no shadow memory\n");
434 v3_delete_mem_region(vm,r);
// Per-core HVM init: on an HVM VM, cores with vcpu_id at or above
// first_hrt_core are marked as HRT-only cores.
441 int v3_init_hvm_core(struct guest_info *core)
443 memset(&core->hvm_state,0,sizeof(core->hvm_state));
444 if (core->vm_info->hvm_state.is_hvm) {
445 if (core->vcpu_id >= core->vm_info->hvm_state.first_hrt_core) {
446 core->hvm_state.is_hrt=1;
// Per-core HVM teardown - currently only logs.
452 int v3_deinit_hvm_core(struct guest_info *core)
454 PrintDebug(core->vm_info, VCORE_NONE, "hvm: HVM core deinit\n");
// Memory visible to the ROS: [0, first_hrt_gpa) on an HVM. The non-HVM
// return path (presumably vm->mem_size) is elided from this excerpt.
460 uint64_t v3_get_hvm_ros_memsize(struct v3_vm_info *vm)
462 if (vm->hvm_state.is_hvm) {
463 return vm->hvm_state.first_hrt_gpa;
468 uint64_t v3_get_hvm_hrt_memsize(struct v3_vm_info *vm)
// Cores visible to the ROS: [0, first_hrt_core) on an HVM, otherwise all.
473 uint32_t v3_get_hvm_ros_cores(struct v3_vm_info *vm)
475 if (vm->hvm_state.is_hvm) {
476 return vm->hvm_state.first_hrt_core;
478 return vm->num_cores;
// Number of HRT-only cores: [first_hrt_core, num_cores) on an HVM.
// The non-HVM return path (presumably 0) is elided from this excerpt.
482 uint32_t v3_get_hvm_hrt_cores(struct v3_vm_info *vm)
484 if (vm->hvm_state.is_hvm) {
485 return vm->num_cores - vm->hvm_state.first_hrt_core;
// True iff gpa lies in the ROS-visible range [0, first_hrt_gpa).
// NOTE(review): addr_t is presumably unsigned, so "gpa>=0" is always true
// (harmless, but a -Wtype-limits warning) - confirm addr_t's definition.
492 int v3_is_hvm_ros_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
494 if (vm->hvm_state.is_hvm) {
495 return gpa>=0 && gpa<vm->hvm_state.first_hrt_gpa;
// True iff gpa lies in the HRT-only range [first_hrt_gpa, mem_size).
501 int v3_is_hvm_hrt_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
503 if (vm->hvm_state.is_hvm) {
504 return gpa>=vm->hvm_state.first_hrt_gpa && gpa<vm->mem_size;
510 int v3_is_hvm_hrt_core(struct guest_info *core)
512 return core->hvm_state.is_hrt;
515 int v3_is_hvm_ros_core(struct guest_info *core)
517 return !core->hvm_state.is_hrt;
// Decide whether an interrupt from src should be delivered to dest.
// NOTE(review): the branch distinguishing the device (src==NULL?) case
// from the core-to-core case is elided from this excerpt.
520 int v3_hvm_should_deliver_ipi(struct guest_info *src, struct guest_info *dest)
523 // ioapic or msi to apic
524 return !dest->hvm_state.is_hrt;
// Core-to-core: HRT sources may target anyone; ROS sources only ROS cores.
527 return src->hvm_state.is_hrt || (!src->hvm_state.is_hrt && !dest->hvm_state.is_hrt) ;
// Report the APIC id range visible to a source: a device (ioapic/msi) when
// no core is given, otherwise the given core. Devices and ROS cores see
// only ROS APICs on an HVM; HRT cores see all APICs.
// (*start_apic assignments are elided from this excerpt.)
531 void v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_info *vm,
532 uint32_t *start_apic, uint32_t *num_apics)
535 // Seen from ioapic, msi, etc:
536 if (vm->hvm_state.is_hvm) {
537 // HVM VM shows only the ROS cores/apics to ioapic, msi, etc
539 *num_apics = vm->hvm_state.first_hrt_core;
541 // Non-HVM shows all cores/APICs to apic, msi, etc.
543 *num_apics = vm->num_cores;
547 if (core->hvm_state.is_hrt) {
548 // HRT core/apic sees all apics
549 // (this policy may change...)
551 *num_apics = vm->num_cores;
553 // non-HRT core/apic sees only non-HRT cores/apics
555 *num_apics = vm->hvm_state.first_hrt_core;
// Classic double-evaluation hazard: do not pass expressions with side
// effects to these macros.
560 #define MAX(x,y) ((x)>(y)?(x):(y))
561 #define MIN(x,y) ((x)<(y)?(x):(y))
564 static uint64_t boot_state_end_addr(struct v3_vm_info *vm)
566 return PAGE_ADDR(vm->mem_size);
// The null interrupt handler occupies the last page of guest physical
// memory. (The *limit assignment is elided from this excerpt.)
569 static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
571 *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - PAGE_SIZE);
575 extern v3_cpu_arch_t v3_mach_type;
577 extern void *v3_hvm_svm_null_int_handler_start;
578 extern void *v3_hvm_svm_null_int_handler_end;
579 extern void *v3_hvm_vmx_null_int_handler_start;
580 extern void *v3_hvm_vmx_null_int_handler_end;
// Copy the pre-assembled SVM- or VMX-specific null interrupt handler stub
// (linked into the VMM image between the start/end symbols) into its
// reserved page of guest physical memory.
582 static void write_null_int_handler(struct v3_vm_info *vm)
589 get_null_int_handler_loc(vm,&base,&limit);
591 switch (v3_mach_type) {
594 case V3_SVM_REV3_CPU:
595 data = (void*) &v3_hvm_svm_null_int_handler_start;
// NOTE(review): void* arithmetic is a GCC extension (sizeof(void)==1).
596 len = (void*) &v3_hvm_svm_null_int_handler_end - data;
602 case V3_VMX_EPT_UG_CPU:
603 data = (void*) &v3_hvm_vmx_null_int_handler_start;
604 len = (void*) &v3_hvm_vmx_null_int_handler_end - data;
608 PrintError(vm,VCORE_NONE,"hvm: cannot determine CPU type to select null interrupt handler...\n");
614 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base),len,(uint8_t*)data);
617 PrintDebug(vm,VCORE_NONE,"hvm: wrote null interrupt handler at %p (%llu bytes)\n",base,len);
// The stub IDT occupies the page just below the null interrupt handler.
621 static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
623 *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - 2 * PAGE_SIZE);
627 // default IDT entries (int and trap gates)
629 // Format is 16 bytes long:
631 // 16 selector => (target code selector) => 0x8 // entry 1 of GDT
632 // 3 ist => (stack) = 0 => current stack
634 // 4 type => 0xe=>INT, 0xf=>TRAP
635 // 1 reserved => 0 (indicates "system" by being zero)
639 // 32 offsethigh => 0 (total is a 64 bit offset)
642 // 00 00 | 08 00 | 00 | 8[typenybble] | offsetmid | offsethigh | reserved
644 // Note little endian
// Template 64-bit IDT gate descriptors (code selector 0x8, present, type
// 0xf = trap gate / 0xe = interrupt gate); the three split offset fields
// are patched in by write_idt() below.
646 static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 } ;
647 static uint64_t idt64_int_gate_entry_mask[2] = { 0x00008e0000080000, 0x0 };
// Build the stub IDT: trap gates for vectors 0..31 (exceptions),
// interrupt gates for 32..255, all pointing at the null handler.
649 static void write_idt(struct v3_vm_info *vm)
654 uint64_t handler_len;
656 uint64_t trap_gate[2];
657 uint64_t int_gate[2];
659 get_idt_loc(vm,&base,&limit);
661 get_null_int_handler_loc(vm,&handler,&handler_len);
// The IDT holds virtual addresses, so apply the configured GVA offset.
663 handler += vm->hvm_state.gva_offset;
665 memcpy(trap_gate,idt64_trap_gate_entry_mask,16);
666 memcpy(int_gate,idt64_int_gate_entry_mask,16);
669 // update the entries for the handler location
673 hand = (uint8_t*) &handler;
// Scatter the handler address into the gate's split offset fields
// (bits 15:0 at byte 0, 31:16 at byte 6, 63:32 at byte 8).
675 mask = (uint8_t *)trap_gate;
676 memcpy(&(mask[0]),&(hand[0]),2); // offset low
677 memcpy(&(mask[6]),&(hand[2]),2); // offset med
678 memcpy(&(mask[8]),&(hand[4]),4); // offset high
680 mask = (uint8_t *)int_gate;
681 memcpy(&(mask[0]),&(hand[0]),2); // offset low
682 memcpy(&(mask[6]),&(hand[2]),2); // offset med
683 memcpy(&(mask[8]),&(hand[4]),4); // offset high
685 PrintDebug(vm,VCORE_NONE,"hvm: Adding default null trap and int gates\n");
689 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)trap_gate);
692 for (i=32;i<256;i++) {
693 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)int_gate);
696 PrintDebug(vm,VCORE_NONE,"hvm: wrote IDT at %p\n",base);
// The stub GDT occupies the page below the IDT.
701 static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
703 *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 3 * PAGE_SIZE);
// Minimal long-mode GDT: null descriptor, 64-bit code segment (L bit set),
// data segment.
707 static uint64_t gdt64[3] = {
708 0x0000000000000000, /* null */
709 0x00a09a0000000000, /* code (note lme bit) */
710 0x00a0920000000000, /* data (most entries don't matter) */
// Copy the stub GDT into its reserved page of guest memory.
// NOTE(review): "limit" bytes are written from the 24-byte gdt64 table;
// if limit is a full page this over-reads the source array - confirm
// whether limit is clamped in the elided portion of the full source.
713 static void write_gdt(struct v3_vm_info *vm)
718 get_gdt_loc(vm,&base,&limit);
719 v3_write_gpa_memory(&vm->cores[0],(addr_t)base,limit,(uint8_t*) gdt64);
721 PrintDebug(vm,VCORE_NONE,"hvm: wrote GDT at %p\n",base);
// The stub TSS occupies the page below the GDT.
726 static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
728 *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 4 * PAGE_SIZE);
// Zero-fill the TSS page (a null TSS suffices for the stub environment).
732 static void write_tss(struct v3_vm_info *vm)
737 get_tss_loc(vm,&base,&limit);
739 v3_set_gpa_memory(&vm->cores[0],(addr_t)base,limit,0);
741 PrintDebug(vm,VCORE_NONE,"hvm: wrote TSS at %p\n",base);
// Boundaries of the canonical x86-64 address halves.
745 #define TOP_HALF_START 0xffff800000000000ULL
746 #define BOTTOM_HALF_END 0x00007fffffffffffULL
// Bytes of address space covered by one entry at each paging level:
// L4 (PTE) = 4 KB, L3 (PDE) = 2 MB, L2 (PDPE) = 1 GB, L1 (PML4E) = 512 GB.
749 #define L4_UNIT PAGE_SIZE
750 #define L3_UNIT (512ULL * L4_UNIT)
751 #define L2_UNIT (512ULL * L3_UNIT)
752 #define L1_UNIT (512ULL * L2_UNIT)
// Compute the number of page-table pages needed at each level to map
// max_mem_mapped with 4 KB pages. (The *l1 assignment - one PML4 page -
// is elided from this excerpt.)
754 static void compute_pts_4KB(struct v3_vm_info *vm,
755 uint64_t *l1, uint64_t *l2, uint64_t *l3, uint64_t *l4)
758 // we map the physical memory up to max_mem_mapped either at 0x0 or at TOP_HALF start
759 // that is, it either fills in the first 256 rows of PML4 or the last 256 rows of PML4
760 // so it is the same number of page tables regardless
762 uint64_t max_gva = vm->hvm_state.max_mem_mapped;
// ceil(entries_needed_at_level / 512) tables per level.
765 *l2 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*512ULL*4096ULL),512);
766 *l3 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*4096ULL),512);
767 *l4 = CEIL_DIV(CEIL_DIV(max_gva,4096ULL),512);
773 PTS MAP using 1 GB pages
774 n second levels pts, highest gva, highest address
780 PTS MAP using 2 MB pages
781 n third level pts, highest gva, highest address
782 m second level pts, highest gva, highest address
787 PTS MAP using 4 KB pages
788 n 4th level, highest gva, highest address
789 m 3rd level, highest gva, highest address
790 l second level, highest gva, highest address
794 PTS MAP using 512 GB pages when this becomes available
// Locate the page-table region: num_pt pages directly below the 4 fixed
// pages (handler, IDT, GDT, TSS), where num_pt depends on the page size
// the HRT asked to be mapped with.
799 static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
801 uint64_t l1,l2,l3,l4;
804 compute_pts_4KB(vm,&l1,&l2,&l3,&l4);
// (num_pt assignments for the 512GB/1GB cases are elided in this excerpt.)
806 if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
808 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
810 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
811 num_pt = l1 + l2 + l3;
812 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) {
813 num_pt = l1 + l2 + l3 + l4;
815 PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT location flags=0x%llx memsize=0x%llx\n",vm->hvm_state.hrt_flags,(uint64_t)vm->mem_size);
819 *base = (void*)PAGE_ADDR(boot_state_end_addr(vm)-(4+num_pt)*PAGE_SIZE);
820 *limit = num_pt*PAGE_SIZE;
// Build the HRT's long-mode page-table hierarchy in guest memory, mapping
// GVAs [gva_offset, gva_offset+max_mem_mapped) onto GPAs [0, max_mem_mapped)
// using the largest page size the HRT requested. Levels are written
// top-down (PML4, then PDP, PD, PT), terminating early at the level where
// large pages are used.
// NOTE(review): much scaffolding (declarations, loop bounds, max_level
// assignments, breaks) is elided from this excerpt; verify control flow
// against the full source.
823 static void write_pts(struct v3_vm_info *vm)
826 uint64_t num_l1, num_l2, num_l3, num_l4;
827 void *start_l1, *start_l2, *start_l3, *start_l4;
833 void *max_gpa = (void*) vm->hvm_state.max_mem_mapped;
834 void *min_gva = (void*) vm->hvm_state.gva_offset;
835 #ifdef V3_CONFIG_DEBUG_HVM
836 void *max_gva = min_gva+vm->hvm_state.max_mem_mapped;
839 uint64_t i_start,i_end;
841 struct pml4e64 *pml4e;
// Select the deepest level to build from the HRT's mapping flags.
846 if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
847 PrintError(vm,VCORE_NONE,"hvm: Attempt to build 512 GB pages\n");
849 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
851 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
853 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) {
856 PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT levels\n");
860 get_pt_loc(vm,&start_l1,&size);
861 compute_pts_4KB(vm,&num_l1,&num_l2,&num_l3,&num_l4);
// Levels are laid out consecutively: PML4, then PDPs, PDs, PTs.
863 start_l2=start_l1+PAGE_SIZE*num_l1;
864 start_l3=start_l2+PAGE_SIZE*num_l2;
865 start_l4=start_l3+PAGE_SIZE*num_l3;
867 PrintDebug(vm,VCORE_NONE,"hvm: writing %llu levels of PTs start at address %p\n", max_level,start_l1);
868 PrintDebug(vm,VCORE_NONE,"hvm: min_gva=%p, max_gva=%p, min_gpa=%p, max_gpa=%p\n",min_gva,max_gva,min_gpa,max_gpa);
869 PrintDebug(vm,VCORE_NONE,"hvm: num_l1=%llu, num_l2=%llu, num_l3=%llu, num_l4=%llu\n", num_l1, num_l2, num_l3, num_l4);
870 PrintDebug(vm,VCORE_NONE,"hvm: start_l1=%p, start_l2=%p, start_l3=%p, start_l4=%p\n", start_l1, start_l2, start_l3, start_l4);
874 // build PML4 (only one)
875 if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pml4e)) {
876 PrintError(vm,VCORE_NONE,"hvm: Cannot translate pml4 location\n");
880 memset(pml4e,0,PAGE_SIZE);
// A gva offset of 0 fills PML4 rows from 0; TOP_HALF_START from row 256.
883 i_start=0; i_end = num_l2;
884 } else if (min_gva==(void*)TOP_HALF_START) {
885 i_start=256; i_end=256+num_l2;
887 PrintError(vm,VCORE_NONE,"hvm: unsupported gva offset\n");
891 for (i=i_start, cur_gva=min_gva, cur_gpa=min_gpa;
893 i++, cur_gva+=L1_UNIT, cur_gpa+=L1_UNIT) {
899 PrintError(vm,VCORE_NONE,"hvm: Intel has not yet defined a PML4E large page\n");
900 pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
901 //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
903 pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(start_l2+(i-i_start)*PAGE_SIZE));
904 //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
// Second level (PDP): either 1 GB large pages or pointers into the PDs.
915 for (cur_pt=start_l2, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
917 cur_pt+=PAGE_SIZE, pt++) {
920 if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pdpe)) {
921 PrintError(vm,VCORE_NONE,"hvm: Cannot translate pdpe location\n");
925 memset(pdpe,0,PAGE_SIZE);
928 i<512 && cur_gpa<max_gpa;
929 i++, cur_gva+=L2_UNIT, cur_gpa+=L2_UNIT) {
935 pdpe[i].large_page=1;
936 pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
937 //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
939 pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(start_l3+(pt*512+i)*PAGE_SIZE));
940 //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
// Third level (PD): either 2 MB large pages or pointers into the PTs.
950 for (cur_pt=start_l3, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
952 cur_pt+=PAGE_SIZE, pt++) {
955 if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pde)) {
956 PrintError(vm,VCORE_NONE,"hvm: Cannot translate pde location\n");
960 memset(pde,0,PAGE_SIZE);
963 i<512 && cur_gpa<max_gpa;
964 i++, cur_gva+=L3_UNIT, cur_gpa+=L3_UNIT) {
971 pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
972 //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t) pde[i].pt_base_addr);
974 pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(start_l4+(pt*512+i)*PAGE_SIZE));
975 //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pde[i].pt_base_addr);
// Fourth level (PT): terminal 4 KB mappings.
987 for (cur_pt=start_l4, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
989 cur_pt+=PAGE_SIZE, pt++) {
992 if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pte)) {
993 PrintError(vm,VCORE_NONE,"hvm: Cannot translate pte location\n");
997 memset(pte,0,PAGE_SIZE);
1000 i<512 && cur_gpa<max_gpa;
1001 i++, cur_gva+=L4_UNIT, cur_gpa+=L4_UNIT) {
1005 pte[i].page_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
1006 //PrintDebug(vm,VCORE_NONE,"hvm: pte: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pte[i].page_base_addr);
// The multiboot info table is placed relative to the page-table region
// (the adjustment below the PTs is elided from this excerpt).
1014 static void get_mb_info_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
1017 get_pt_loc(vm,base, limit);
// Fill in the Palacios-specific HRT multiboot tag handed to the HRT at
// boot: APIC topology, memory split, mapping flags, and upcall channel.
1023 int v3_build_hrt_multiboot_tag(struct guest_info *core, mb_info_hrt_t *hrt)
1025 struct v3_vm_info *vm = core->vm_info;
1027 hrt->tag.type = MB_INFO_HRT_TAG;
1028 hrt->tag.size = sizeof(mb_info_hrt_t);
1030 hrt->total_num_apics = vm->num_cores;
1031 hrt->first_hrt_apic_id = vm->hvm_state.first_hrt_core;
// No IOAPIC is exposed to the HRT.
1032 hrt->have_hrt_ioapic=0;
1033 hrt->first_hrt_ioapic_entry=0;
1035 hrt->cpu_freq_khz = V3_CPU_KHZ();
1037 hrt->hrt_flags = vm->hvm_state.hrt_flags;
1038 hrt->max_mem_mapped = vm->hvm_state.max_mem_mapped;
1039 hrt->first_hrt_gpa = vm->hvm_state.first_hrt_gpa;
1040 hrt->gva_offset = vm->hvm_state.gva_offset;
1041 hrt->comm_page_gpa = vm->hvm_state.comm_page_gpa;
1042 hrt->hrt_int_vector = vm->hvm_state.hrt_int_vector;
// Build the multiboot info table into a 256-byte scratch buffer and copy
// it into guest memory at the location reported by get_mb_info_loc().
// Only HRT_MBOOT64 HRTs are supported.
1047 static void write_mb_info(struct v3_vm_info *vm)
1049 if (vm->hvm_state.hrt_type!=HRT_MBOOT64) {
1050 PrintError(vm, VCORE_NONE,"hvm: Cannot handle this HRT type\n");
1058 get_mb_info_loc(vm,&base,&limit);
1060 if ((size=v3_build_multiboot_table(&vm->cores[vm->hvm_state.first_hrt_core],buf,256))==-1) {
1061 PrintError(vm,VCORE_NONE,"hvm: Failed to build MB info\n");
1066 PrintError(vm,VCORE_NONE,"hvm: MB info is too large\n");
1070 v3_write_gpa_memory(&vm->cores[vm->hvm_state.first_hrt_core],
1075 PrintDebug(vm,VCORE_NONE, "hvm: wrote MB info at %p\n", base);
// Per-HRT-core boot stack size; stacks sit below the MB info table.
1079 #define SCRATCH_STACK_SIZE 4096
// HRT image region: from its first (page-aligned) GPA up to the bottom of
// the scratch stacks; errors out if the stacks would overlap the image.
1082 static void get_hrt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
1087 get_mb_info_loc(vm,&mb_base,&mb_limit);
1089 mb_base-=SCRATCH_STACK_SIZE*v3_get_hvm_hrt_cores(vm);
1091 *base = (void*)PAGE_ADDR(vm->hvm_state.first_hrt_gpa);
1093 if (mb_base < *base+PAGE_SIZE) {
1094 PrintError(vm,VCORE_NONE,"hvm: HRT stack colides with HRT\n");
1097 *limit = mb_base - *base;
// Logging wrappers used while probing the HRT image, and the magic numbers
// involved: ELF ("\x7fELF" little-endian), multiboot2 header magic, and
// the multiboot2 info magic passed to the kernel in EAX/RAX.
1101 #define ERROR(fmt, args...) PrintError(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
1102 #define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
1104 #define ELF_MAGIC 0x464c457f
1105 #define MB2_MAGIC 0xe85250d6
1107 #define MB2_INFO_MAGIC 0x36d76289
// True iff the image begins with the ELF magic (return paths elided here).
// NOTE(review): assumes size >= 4 and data suitably aligned for a u32 load.
1109 static int is_elf(uint8_t *data, uint64_t size)
1111 if (*((uint32_t*)data)==ELF_MAGIC) {
// Scan the first 32 KB of the image (4-byte aligned, per the multiboot2
// spec) for the header magic; returns a pointer to the header, or
// (elided from this excerpt) NULL if not found.
1118 static mb_header_t *find_mb_header(uint8_t *data, uint64_t size)
1120 uint64_t limit = size > 32768 ? 32768 : size;
1123 // Scan for the .boot magic cookie
1124 // must be in first 32K, assume 4 byte aligned
1125 for (i=0;i<limit;i+=4) {
1126 if (*((uint32_t*)&data[i])==MB2_MAGIC) {
1127 INFO("Found multiboot header at offset 0x%llx\n",i);
1128 return (mb_header_t *) &data[i];
// Validate the HRT's multiboot64 requests (page size, mapping size, GVA
// offset/entry, comm page GPA, interrupt vector) and commit them to
// hvm_state, allocating and shadow-mapping the ROS<->HRT communication
// page on first configuration.
// NOTE(review): several lines (flag resets, returns, braces) are elided
// from this excerpt.
1135 static int configure_hrt(struct v3_vm_info *vm, mb_data_t *mb)
1137 struct v3_vm_hvm *h = &vm->hvm_state;
1138 uint64_t f = mb->mb64_hrt->hrt_flags;
1139 uint64_t maxmap = mb->mb64_hrt->max_mem_to_map;
1140 uint64_t gvaoff = mb->mb64_hrt->gva_offset;
1141 uint64_t gvaentry = mb->mb64_hrt->gva_entry;
1142 uint64_t commgpa = mb->mb64_hrt->comm_page_gpa;
1143 uint8_t vec = mb->mb64_hrt->hrt_int_vector;
1146 PrintDebug(vm,VCORE_NONE,"hvm: HRT request: flags=0x%llx max_map=0x%llx gva_off=%llx gva_entry=%llx comm_page=0x%llx vector=0x%x\n",
1147 f, maxmap, gvaoff,gvaentry,commgpa, vec);
// Always map at least 4 GB.
1149 if (maxmap<0x100000000ULL) {
1150 PrintDebug(vm,VCORE_NONE,"hvm: revising request up to 4 GB max map\n");
1151 maxmap=0x100000000ULL;
// Choose the largest supported page size among those the HRT allows
// (512 GB pages are rejected - no hardware support).
1154 if (f & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
1155 PrintError(vm,VCORE_NONE,"hvm: support for 512 GB pages is not yet available in hardware\n");
1157 } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
1159 f |= MB_TAG_MB64_HRT_FLAG_MAP_1GB;
1160 h->max_mem_mapped = maxmap;
1161 PrintDebug(vm,VCORE_NONE,"hvm: 1 GB pages selected\n");
1162 } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
1164 f |= MB_TAG_MB64_HRT_FLAG_MAP_2MB;
1165 h->max_mem_mapped = maxmap;
1166 PrintDebug(vm,VCORE_NONE,"hvm: 2 MB pages selected\n");
1167 } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_4KB) {
1169 f |= MB_TAG_MB64_HRT_FLAG_MAP_4KB;
1170 h->max_mem_mapped = maxmap;
1171 PrintDebug(vm,VCORE_NONE,"hvm: 4 KB pages selected\n");
1173 PrintError(vm,VCORE_NONE,"hvm: no page table model is requested\n");
1177 if (f & MB_TAG_MB64_HRT_FLAG_RELOC) {
1178 PrintError(vm,VCORE_NONE,"hvm: relocatable hrt not currently supported\n");
1184 if (maxmap>h->max_mem_mapped) {
1185 PrintError(vm,VCORE_NONE,"hvm: requested 0x%llx bytes mapped, which is more than currently supported\n",maxmap);
// Only identity (0) or top-half canonical GVA offsets are supported.
1189 if (gvaoff!=0 && gvaoff!=TOP_HALF_START) {
1190 PrintError(vm,VCORE_NONE,"hvm: currently only GVA offsets of 0 and %llx are supported\n", TOP_HALF_START);
1194 h->gva_offset = gvaoff;
1196 h->gva_entry = gvaentry;
// The HRT image must fit within [first_hrt_gpa, mem_size - 64 MB).
1198 if (mb->addr->load_addr < h->first_hrt_gpa) {
1199 PrintError(vm,VCORE_NONE,"hvm: load start address of HRT is below first HRT GPA\n");
1203 if (mb->addr->bss_end_addr > (vm->mem_size-(1024*1024*64))) {
1204 PrintError(vm,VCORE_NONE,"hvm: bss end address of HRT above last allowed GPA\n");
1209 PrintError(vm,VCORE_NONE,"hvm: cannot support vector %x\n",vec);
1213 h->hrt_int_vector = vec;
// The comm page must live outside guest physical memory.
1216 if (commgpa < vm->mem_size) {
1217 PrintError(vm,VCORE_NONE,"hvm: cannot map comm page over physical memory\n");
1221 h->comm_page_gpa = commgpa;
// First configuration: allocate and shadow-map the comm page; later
// reconfigurations just clear it.
1223 if (!h->comm_page_hpa) {
1224 if (!(h->comm_page_hpa=V3_AllocPages(1))) {
1225 PrintError(vm,VCORE_NONE,"hvm: unable to allocate space for comm page\n");
1229 h->comm_page_hva = V3_VAddr(h->comm_page_hpa);
1231 memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
1233 if (v3_add_shadow_mem(vm,-1,h->comm_page_gpa,h->comm_page_gpa+PAGE_SIZE_4KB,(addr_t)h->comm_page_hpa)) {
1234 PrintError(vm,VCORE_NONE,"hvm: unable to map communication page\n");
// NOTE(review): this frees the GPA, but V3_AllocPages returned
// h->comm_page_hpa - looks like the wrong pointer is freed; confirm.
1235 V3_FreePages((void*)(h->comm_page_gpa),1);
1240 PrintDebug(vm,VCORE_NONE,"hvm: added comm page for first time\n");
1243 memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
1246 PrintDebug(vm,VCORE_NONE,"hvm: HRT configuration: flags=0x%llx max_mem_mapped=0x%llx gva_offset=0x%llx gva_entry=0x%llx comm_page=0x%llx vector=0x%x\n",
1247 h->hrt_flags,h->max_mem_mapped, h->gva_offset,h->gva_entry, h->comm_page_gpa, h->hrt_int_vector);
// Parse, validate/configure, and load a multiboot64 HRT kernel image into
// guest memory, then record the guest-virtual entry point (the image's
// entry plus gva_offset, unless the HRT supplied an explicit gva_entry).
1253 static int setup_mb_kernel_hrt(struct v3_vm_info *vm)
1257 if (v3_parse_multiboot_header(vm->hvm_state.hrt_file,&mb)) {
1258 PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n");
1262 if (configure_hrt(vm,&mb)) {
1263 PrintError(vm,VCORE_NONE, "hvm: cannot configure HRT\n");
1267 if (v3_write_multiboot_kernel(vm,&mb,vm->hvm_state.hrt_file,
1268 (void*)vm->hvm_state.first_hrt_gpa,
1269 vm->mem_size-vm->hvm_state.first_hrt_gpa)) {
1270 PrintError(vm,VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n");
1274 if (vm->hvm_state.gva_entry) {
1275 vm->hvm_state.hrt_entry_addr = vm->hvm_state.gva_entry;
1277 vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + vm->hvm_state.gva_offset;
1280 vm->hvm_state.hrt_type = HRT_MBOOT64;
// Dispatch on the HRT image type: only ELF images containing a multiboot2
// header are supported.
1287 static int setup_hrt(struct v3_vm_info *vm)
1289 if (is_elf(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size) &&
1290 find_mb_header(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) {
1292 PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n");
1293 if (setup_mb_kernel_hrt(vm)) {
1294 PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n");
1298 PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not a multiboot kernel\n");
1315 We do not touch the ROS portion of the address space.
1316 The HRT portion looks like:
1318 INT_HANDLER (1 page - page aligned)
1319 IDT (1 page - page aligned)
1320 GDT (1 page - page aligned)
1321 TSS (1 page - page aligned)
1322 PAGETABLES (identy map of first N GB)
1323 ROOT PT first (lowest memory addr), followed by 2nd level PTs in order,
1324 followed by 3rd level PTs in order, followed by 4th level
1327 SCRATCH_STACK_HRT_CORE0
1328 SCRATCH_STACK_HRT_CORE1
1330 SCRATCH_STACK_HRT_COREN
1332 HRT (as many pages as needed, page-aligned, starting at first HRT address)
// One-time, VM-wide setup of the HRT portion of guest memory prior to
// boot: loads the HRT image, then lays down the supporting structures
// (interrupt handler stub, IDT/GDT/TSS, page tables — see the layout
// comment above).  No-op (success) for non-HVM VMs.
// Returns 0 on success, nonzero on failure (returns elided in this chunk).
1340 int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm)
1342 if (!vm->hvm_state.is_hvm) {
1343 PrintDebug(vm,VCORE_NONE,"hvm: skipping HVM setup for boot as this is not an HVM\n");
1347 PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory begins\n");
     // Load the HRT kernel image first — everything else is placed
     // relative to where the HRT setup puts things.
1349 if (setup_hrt(vm)) {
1350 PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n");
1354 // the locations of all the other items are determined by
1355 // the HRT setup, so these must happen after
1357 write_null_int_handler(vm);
     // NOTE(review): additional write_* calls (IDT/GDT/TSS/page tables/
     // multiboot info) are elided from this chunk.
1364 // this must happen last
1367 PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory done\n");
1373 On entry for every core:
1375 IDTR points to stub IDT
1376 GDTR points to stub GDT
1377 TR points to stub TSS
1378 CR3 points to root page table
1380 EFER has LME AND LMA (and NX for compatibility with Linux)
1381 RSP is TOS of core's scratch stack (looks like a call)
1383 RAX = MB magic cookie
1384 RBX = address of multiboot info table
1385 RCX = this core id / apic id (0..N-1)
1386 RDX = this core id - first HRT core ID (==0 for the first HRT core)
1388 All addresses are virtual addresses, offset as needed by gva_offset
1390 Other regs are zeroed
1392 shadow/nested paging state reset for long mode
// Per-core setup of an HRT core's architectural state so that it enters
// the HRT directly in 64-bit long mode with paging enabled (see the
// entry-state comment above).  No-op (success) for non-HRT cores.
// Returns 0 on success (returns/locals `base`,`limit` elided in this chunk).
1395 int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
1399 uint64_t gva_offset;
     // Timestamp the start of this core's boot for later statistics.
1401 rdtscll(core->hvm_state.last_boot_start);
1404 if (!core->hvm_state.is_hrt) {
1405 PrintDebug(core->vm_info,core,"hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id);
1410 PrintDebug(core->vm_info, core, "hvm: setting up HRT core (%u) for boot\n", core->vcpu_id);
1412 gva_offset = core->vm_info->hvm_state.gva_offset;
     // Start from a clean slate: all GPRs, control/debug regs, segments,
     // MSRs, and FP state zeroed before selectively filling them in.
1414 memset(&core->vm_regs,0,sizeof(core->vm_regs));
1415 memset(&core->ctrl_regs,0,sizeof(core->ctrl_regs));
1416 memset(&core->dbg_regs,0,sizeof(core->dbg_regs));
1417 memset(&core->segments,0,sizeof(core->segments));
1418 memset(&core->msrs,0,sizeof(core->msrs));
1419 memset(&core->fp_state,0,sizeof(core->fp_state));
1421 // We are in long mode with virtual memory and we want
1422 // to start immediately
1423 core->cpl = 0; // we are going right into the kernel
1424 core->cpu_mode = LONG;
1425 core->mem_mode = VIRTUAL_MEM;
1426 core->core_run_state = CORE_RUNNING ;
     // RAX carries the multiboot2 magic cookie the kernel checks on entry.
1430 core->vm_regs.rax = MB2_INFO_MAGIC;
1432 // multiboot info pointer
1433 get_mb_info_loc(core->vm_info, &base,&limit);
1434 core->vm_regs.rbx = (uint64_t) base + gva_offset;
     // RCX = this core's vcpu/apic id; RDX = id relative to first HRT core.
1437 core->vm_regs.rcx = core->vcpu_id;
1440 core->vm_regs.rdx = core->vcpu_id - core->vm_info->hvm_state.first_hrt_core;
1442 // Now point to scratch stack for this core
1443 // it begins at an offset relative to the MB info page
     // (stacks grow down from the MB info page, one SCRATCH_STACK_SIZE
     // region per HRT core, indexed by the relative id in RDX)
1444 get_mb_info_loc(core->vm_info, &base,&limit);
1445 base = base + gva_offset;
1446 base -= core->vm_regs.rdx * SCRATCH_STACK_SIZE;
1447 core->vm_regs.rsp = (v3_reg_t) base;
1448 core->vm_regs.rbp = (v3_reg_t) base-8;
1450 // push onto the stack a bad rbp and bad return address
1451 core->vm_regs.rsp-=16;
     // note: rsp is a GVA here, so subtract gva_offset to get the GPA to write
1452 v3_set_gpa_memory(core,
1453 core->vm_regs.rsp-gva_offset,
     // RIP: an explicitly configured GVA entry wins; otherwise the HRT
     // entry address recorded at load time, shifted by the gva offset.
1459 get_hrt_loc(core->vm_info, &base,&limit);
1460 if (core->vm_info->hvm_state.gva_entry) {
1461 core->rip = core->vm_info->hvm_state.gva_entry;
1463 core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr + gva_offset;
1468 PrintDebug(core->vm_info,core,"hvm: hrt core %u has rip=%p, rsp=%p, rbp=%p, rax=%p, rbx=%p, rcx=%p, rdx=%p\n",
1469 (core->vcpu_id - core->vm_info->hvm_state.first_hrt_core),
1471 (void*)(core->vm_regs.rsp),
1472 (void*)(core->vm_regs.rbp),
1473 (void*)(core->vm_regs.rax),
1474 (void*)(core->vm_regs.rbx),
1475 (void*)(core->vm_regs.rcx),
1476 (void*)(core->vm_regs.rdx));
1478 // Setup CRs for long mode and our stub page table
     // CR0 = 0x80000001: PG (bit 31) and PE (bit 0) set
1480 core->ctrl_regs.cr0 = 0x80000001;
1481 core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0;
1483 // CR2: don't care (output from #PF)
1484 // CR3: set to our PML4E, without setting PCD or PWT
1485 get_pt_loc(core->vm_info, &base,&limit);
1486 core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base); // not offset as this is a GPA
1487 core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3;
1489 // CR4: PGE, PAE, PSE (last byte: 1 0 1 1 0 0 0 0)
1490 core->ctrl_regs.cr4 = 0xb0;
1491 core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4;
1493 // RFLAGS zeroed is fine: come in with interrupts off
1494 // EFER needs SVME NXE LMA LME (last 16 bits: 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0
1495 core->ctrl_regs.efer = 0x1d00;
1496 core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer;
1502 selector is 13 bits of index, 1 bit table indicator
1505 index is scaled by 8, even in long mode, where some entries
1506 are 16 bytes long....
1507 -> code, data descriptors have 8 byte format
1508 because base, limit, etc, are ignored (no segmentation)
1509 -> interrupt/trap gates have 16 byte format
1510 because offset needs to be 64 bits
1513 // Install our stub IDT
1514 get_idt_loc(core->vm_info, &base,&limit);
     // For the IDTR/GDTR pseudo-registers only base+limit matter; the
     // remaining attribute fields are zeroed for cleanliness.
1516 core->segments.idtr.selector = 0; // entry 0 (NULL) of the GDT
1517 core->segments.idtr.base = (addr_t) base; // only base+limit are used
1518 core->segments.idtr.limit = limit-1;
1519 core->segments.idtr.type = 0x0;
1520 core->segments.idtr.system = 0;
1521 core->segments.idtr.dpl = 0;
1522 core->segments.idtr.present = 0;
1523 core->segments.idtr.long_mode = 0;
1525 // Install our stub GDT
1526 get_gdt_loc(core->vm_info, &base,&limit);
1528 core->segments.gdtr.selector = 0; // entry 0 (NULL) of the GDT
1529 core->segments.gdtr.base = (addr_t) base;
1530 core->segments.gdtr.limit = limit-1; // only base+limit are used
1531 core->segments.gdtr.type = 0x0;
1532 core->segments.gdtr.system = 0;
1533 core->segments.gdtr.dpl = 0;
1534 core->segments.gdtr.present = 0;
1535 core->segments.gdtr.long_mode = 0;
     // Task register: type 0x9 = available 64-bit TSS; system=0 marks a
     // system (non code/data) descriptor
1538 get_tss_loc(core->vm_info, &base,&limit);
1540 core->segments.tr.selector = 0;
1541 core->segments.tr.base = (addr_t) base;
1542 core->segments.tr.limit = limit-1;
1543 core->segments.tr.type = 0x9;
1544 core->segments.tr.system = 0; // available 64 bit TSS
1545 core->segments.tr.dpl = 0;
1546 core->segments.tr.present = 1;
1547 core->segments.tr.long_mode = 0; // not used
1549 base = 0x0; // these are not offset as we want to make all gvas visible
     // 64-bit code segment: base/limit ignored in long mode
1553 core->segments.cs.selector = 0x8 ; // entry 1 of GDT (RPL=0)
1554 core->segments.cs.base = (addr_t) base; // not used
1555 core->segments.cs.limit = limit; // not used
1556 core->segments.cs.type = 0xe; // only C is used
1557 core->segments.cs.system = 1; // not a system segment
1558 core->segments.cs.dpl = 0;
1559 core->segments.cs.present = 1;
1560 core->segments.cs.long_mode = 1;
1562 // DS, SS, etc are identical
1563 core->segments.ds.selector = 0x10; // entry 2 of GDT (RPL=0)
1564 core->segments.ds.base = (addr_t) base;
1565 core->segments.ds.limit = limit;
1566 core->segments.ds.type = 0x6; // ignored
1567 core->segments.ds.system = 1; // not a system segment
1568 core->segments.ds.dpl = 0;
1569 core->segments.ds.present = 1;
1570 core->segments.ds.long_mode = 1;
     // All remaining data segment registers mirror DS.
1572 memcpy(&core->segments.ss,&core->segments.ds,sizeof(core->segments.ds));
1573 memcpy(&core->segments.es,&core->segments.ds,sizeof(core->segments.ds));
1574 memcpy(&core->segments.fs,&core->segments.ds,sizeof(core->segments.ds));
1575 memcpy(&core->segments.gs,&core->segments.ds,sizeof(core->segments.ds));
1578 // reset paging here for shadow...
     // Only nested paging is supported for HRT cores; shadow paging is
     // flagged but (in this visible code) not handled.
1580 if (core->shdw_pg_mode != NESTED_PAGING) {
1581 PrintError(core->vm_info, core, "hvm: shadow paging guest... this will end badly\n");
1589 int v3_handle_hvm_reset(struct guest_info *core)
1592 if (core->core_run_state != CORE_RESETTING) {
1596 if (!core->vm_info->hvm_state.is_hvm) {
1600 if (v3_is_hvm_hrt_core(core)) {
1601 // this is an HRT reset
1604 // wait for all the HRT cores
1605 v3_counting_barrier(&core->vm_info->reset_barrier);
1607 if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
1609 core->vm_info->run_state = VM_RESETTING;
1612 core->core_run_state = CORE_RESETTING;
1614 if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
1615 // we really only need to clear the bss
1616 // and recopy the .data, but for now we'll just
1618 rc |= v3_setup_hvm_vm_for_boot(core->vm_info);
1621 PrintError(core->vm_info,core,"hvm: failed to setup HVM VM for boot rc=%d\n",rc);
1625 // now everyone is ready to reset
1626 rc |= v3_setup_hvm_hrt_core_for_boot(core);
1629 PrintError(core->vm_info,core,"hvm: failed to setup HVM core for boot rc=%d\n",rc);
1632 core->core_run_state = CORE_RUNNING;
1634 if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
1636 core->vm_info->run_state = VM_RUNNING;
1637 core->vm_info->hvm_state.trans_state = HRT_IDLE;
1640 v3_counting_barrier(&core->vm_info->reset_barrier);
1643 PrintError(core->vm_info,core,"hvm: reset failed\n");
1650 // ROS core will be handled by normal reset functionality