2 * This file is part of the Palacios Virtual Machine Monitor developed
3 * by the V3VEE Project with funding from the United States National
4 * Science Foundation and the Department of Energy.
6 * The V3VEE Project is a joint project between Northwestern University
7 * and the University of New Mexico. You can find out more at
10 * Copyright (c) 2015, The V3VEE Project <http://www.v3vee.org>
11 * All rights reserved.
13 * Author: Peter Dinda <pdinda@northwestern.edu>
15 * This is free software. You are permitted to use,
16 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
19 #include <palacios/vmm_mem.h>
20 #include <palacios/vmm.h>
21 #include <palacios/vmm_util.h>
22 #include <palacios/vmm_emulator.h>
23 #include <palacios/vm_guest.h>
24 #include <palacios/vmm_debug.h>
25 #include <palacios/vmm_hypercall.h>
27 #include <palacios/vmm_xml.h>
29 #include <palacios/vm_guest_mem.h>
31 #include <palacios/vmm_debug.h>
36 MEM = Total size of memory in the GPA (in MB)
37 ROS_MEM = Total size of memory for the ROS (in MB) (<RAM)
39 GPAs [0,ROS_MEM) are what the ROS sees
40 GPAs [ROS_MEM, MEM) are HRT only
41 GPAs [0,MEM) are accessible by the HRT
43 CORES = Total number of cores in VM
44 ROS_CORES = Total number of cores for the ROS
46 Cores [0,ROS_CORES) are what the ROS sees
47 Cores [ROS_CORES,CORES) are HRT only
48 Cores [0,CORES) are accessible by the HRT
53 <file id="hrtelf" filename="hrtelf.o" />
56 <mem ... >RAM</mem> (MB) Note these are
57 <cores count="CORES" ...> backward compatible
60 <ros cores="ROS_CORES" mem="ROS_MEM" /> (MB)
61 <hrt file_id="hrtelf" /hrt>
66 #ifndef V3_CONFIG_DEBUG_HVM
68 #define PrintDebug(fmt, args...)
74 PrintDebug(VM_NONE,VCORE_NONE, "hvm: init\n");
80 PrintDebug(VM_NONE,VCORE_NONE, "hvm: deinit\n");
84 // ignore requests from when we are in the wrong state
85 #define ENFORCE_STATE_MACHINE 1
87 // invoke the HRT using a page fault instead of
88 // the SWINTR mechanism
89 #define USE_UPCALL_MAGIC_PF 1
90 #define UPCALL_MAGIC_ADDRESS 0x0000800df00df00dULL
91 #define UPCALL_MAGIC_ERROR 0xf00df00d
94 64 bit only hypercall:
96 rax = hypercall number
98 then args are: rcx, rdx, rsi, rdi r8, r9, r10, r11
/*
 * Central hypercall dispatcher for HVM (hybrid ROS+HRT) VMs.
 * 64-bit-only convention: rax = hypercall number, rbx = bitness magic,
 * remaining args in rcx, rdx, ...; the result is returned to the guest
 * in rax (0 = success, (uint64_t)-1 = failure).
 *
 * NOTE(review): the original line numbering of this listing has gaps,
 * so the switch scaffolding (switch statement, breaks, closing braces)
 * is not visible here; the comments below annotate only the visible
 * per-case bodies.
 */
101 static int hvm_hcall_handler(struct guest_info * core , hcall_id_t hcall_id, void * priv_data)
104 uint64_t bitness = core->vm_regs.rbx;
105 uint64_t a1 = core->vm_regs.rcx;
106 uint64_t a2 = core->vm_regs.rdx;
107 struct v3_vm_hvm *h = &core->vm_info->hvm_state;
// Only 64-bit guests may issue HVM hypercalls; rbx must carry the
// 0x6464... ("dddd...") magic to prove 64-bit mode.
110 if (bitness!=0x6464646464646464) {
111 PrintError(core->vm_info,core,"hvm: unable to handle non-64 bit hypercall\n");
112 core->vm_regs.rax = -1;
121 V3_Print(core->vm_info,core, "hvm: received hypercall %x rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n",
122 hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, core->hvm_state.last_boot_start, core->num_exits);
123 //v3_print_core_telemetry(core);
124 // v3_print_guest_state(core);
125 core->vm_regs.rax = 0;
// Dispatch on a1 (rcx).  Reset requests first.
128 case 0x1: // reset ros
129 PrintDebug(core->vm_info,core, "hvm: reset ROS\n");
130 if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ROS,0)) {
131 PrintError(core->vm_info,core, "hvm: reset of ROS failed\n");
132 core->vm_regs.rax = -1;
134 core->vm_regs.rax = 0;
138 case 0x2: // reset hrt
139 PrintDebug(core->vm_info,core, "hvm: reset HRT\n");
140 if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_HRT,0)) {
141 PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
142 core->vm_regs.rax = -1;
144 core->vm_regs.rax = 0;
148 case 0x3: // reset both
149 PrintDebug(core->vm_info,core, "hvm: reset ROS+HRT\n");
150 if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ALL,0)) {
// NOTE(review): error message says "HRT" but this is the ROS+HRT path.
151 PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
152 core->vm_regs.rax = -1;
154 core->vm_regs.rax = 0;
// Query the transaction state machine.
158 case 0xf: // get HRT state
159 core->vm_regs.rax = h->trans_state;
160 //PrintDebug(core->vm_info,core,"hvm: get HRT transaction state 0x%llx\n",core->vm_regs.rax);
// ROS->HRT function invocation: 0x20 = single (first HRT core only),
// 0x21 = parallel (all HRT cores).  Legal only from a ROS core and
// only when the state machine is HRT_IDLE.
163 case 0x20: // invoke function (ROS->HRT)
164 case 0x21: // invoke parallel function (ROS->HRT)
165 if (v3_is_hvm_hrt_core(core)) {
166 PrintError(core->vm_info,core,"hvm: %s function invocation not supported from HRT core\n", a1==0x20 ? "" : "parallel");
167 core->vm_regs.rax = -1;
169 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) {
170 PrintError(core->vm_info,core, "hvm: cannot invoke %s function %p in state %d\n",a1==0x20 ? "" : "parallel", (void*)a2,h->trans_state);
171 core->vm_regs.rax = -1;
173 uint64_t *page = (uint64_t *) h->comm_page_hva;
174 uint64_t first, last, cur;
176 PrintDebug(core->vm_info,core, "hvm: %s invoke function %p\n",a1==0x20 ? "" : "parallel",(void*)a2);
// Target set: just the first HRT core for 0x20; all HRT cores
// (first_hrt_core .. num_cores-1) for 0x21.
181 first=last=h->first_hrt_core;
183 first=h->first_hrt_core;
184 last=core->vm_info->num_cores-1;
187 core->vm_regs.rax = 0;
// trans_count tracks how many HRT cores still owe a completion (0x2f).
189 h->trans_count = last-first+1;
191 for (cur=first;cur<=last;cur++) {
// Upcall delivery: either a "magic" #PF (cr2 = magic address, magic
// error code) or the configured software interrupt vector.
193 #if USE_UPCALL_MAGIC_PF
194 PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",cur);
195 core->vm_info->cores[cur].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
196 if (v3_raise_exception_with_error(&core->vm_info->cores[cur],
198 UPCALL_MAGIC_ERROR)) {
199 PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",cur);
200 core->vm_regs.rax = -1;
204 PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %llu\n",h->hrt_int_vector,cur);
205 if (v3_raise_swintr(&core->vm_info->cores[cur],h->hrt_int_vector)) {
206 PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",cur);
207 core->vm_regs.rax = -1;
211 // Force core to exit now
212 v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
// Advance the state machine only if every injection succeeded.
215 if (core->vm_regs.rax==0) {
217 h->trans_state = HRT_CALL;
219 h->trans_state = HRT_PARCALL;
222 PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
223 h->trans_state = HRT_IDLE;
// Synchronous-operation setup (0x28) / teardown (0x29), ROS->HRT.
// Setup requires HRT_IDLE; teardown requires HRT_SYNC.
231 case 0x28: // setup for synchronous operation (ROS->HRT)
232 case 0x29: // teardown for synchronous operation (ROS->HRT)
233 if (v3_is_hvm_hrt_core(core)) {
234 PrintError(core->vm_info,core,"hvm: %ssynchronization invocation not supported from HRT core\n",a1==0x29 ? "de" : "");
235 core->vm_regs.rax = -1;
237 if (ENFORCE_STATE_MACHINE &&
238 ((a1==0x28 && h->trans_state!=HRT_IDLE) || (a1==0x29 && h->trans_state!=HRT_SYNC))) {
239 PrintError(core->vm_info,core, "hvm: cannot invoke %ssynchronization in state %d\n",a1==0x29 ? "de" : "", h->trans_state);
240 core->vm_regs.rax = -1;
242 uint64_t *page = (uint64_t *) h->comm_page_hva;
243 uint64_t first, last, cur;
245 PrintDebug(core->vm_info,core, "hvm: invoke %ssynchronization on address %p\n",a1==0x29 ? "de" : "",(void*)a2);
249 first=last=h->first_hrt_core; // initially we will sync only with BSP
251 core->vm_regs.rax = 0;
253 h->trans_count = last-first+1;
255 for (cur=first;cur<=last;cur++) {
257 #if USE_UPCALL_MAGIC_PF
258 PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",cur);
259 core->vm_info->cores[cur].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
260 if (v3_raise_exception_with_error(&core->vm_info->cores[cur],
262 UPCALL_MAGIC_ERROR)) {
263 PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",cur);
264 core->vm_regs.rax = -1;
268 PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %llu\n",h->hrt_int_vector,cur);
269 if (v3_raise_swintr(&core->vm_info->cores[cur],h->hrt_int_vector)) {
270 PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",cur);
271 core->vm_regs.rax = -1;
275 // Force core to exit now
276 v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
279 if (core->vm_regs.rax==0) {
281 h->trans_state = HRT_SYNCSETUP;
283 h->trans_state = HRT_SYNCTEARDOWN;
286 PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
287 h->trans_state = HRT_IDLE;
// Completion notification from an HRT core (0x2f): the last reporter
// (atomic decrement of trans_count reaching zero) advances the state.
294 case 0x2f: // function exec or sync done
295 if (v3_is_hvm_ros_core(core)) {
296 PrintError(core->vm_info,core, "hvm: request for exec or sync done from ROS core\n");
297 core->vm_regs.rax=-1;
299 if (ENFORCE_STATE_MACHINE &&
300 h->trans_state!=HRT_CALL &&
301 h->trans_state!=HRT_PARCALL &&
302 h->trans_state!=HRT_SYNCSETUP &&
303 h->trans_state!=HRT_SYNCTEARDOWN) {
304 PrintError(core->vm_info,core,"hvm: function or sync completion when not in HRT_CALL, HRT_PARCALL, HRT_SYNCSETUP, or HRT_SYNCTEARDOWN state\n");
305 core->vm_regs.rax=-1;
308 PrintDebug(core->vm_info,core, "hvm: function or sync complete\n");
// GCC atomic builtin: only the final completer sees the count hit 1.
309 if (__sync_fetch_and_sub(&h->trans_count,one)==1) {
310 // last one, switch state
311 if (h->trans_state==HRT_SYNCSETUP) {
312 h->trans_state=HRT_SYNC;
313 PrintDebug(core->vm_info,core, "hvm: function complete - now synchronous\n");
315 h->trans_state=HRT_IDLE;
// Address-space merge (0x30) / unmerge (0x31), ROS->HRT: pass the ROS
// cr3 through the comm page, then upcall the first HRT core.
324 case 0x30: // merge address space
325 case 0x31: // unmerge address space
326 if (v3_is_hvm_hrt_core(core)) {
327 PrintError(core->vm_info,core,"hvm: request to %smerge address space from HRT core\n", a1==0x30 ? "" : "un");
328 core->vm_regs.rax=-1;
330 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) {
331 PrintError(core->vm_info,core,"hvm: request to %smerge address space in non-idle state\n",a1==0x30 ? "" : "un");
332 core->vm_regs.rax=-1;
334 uint64_t *page = (uint64_t *) h->comm_page_hva;
336 PrintDebug(core->vm_info,core,"hvm: %smerge address space request with %p\n",a1==0x30 ? "" : "un",(void*)core->ctrl_regs.cr3);
337 // should sanity check to make sure guest is in 64 bit without anything strange
340 page[1] = core->ctrl_regs.cr3; // this is a do-not-care for an unmerge
342 core->vm_regs.rax = 0;
343 #if USE_UPCALL_MAGIC_PF
344 PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %u\n",h->first_hrt_core);
345 core->vm_info->cores[h->first_hrt_core].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
346 if (v3_raise_exception_with_error(&core->vm_info->cores[h->first_hrt_core],
348 UPCALL_MAGIC_ERROR)) {
349 PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %u\n",h->first_hrt_core);
350 core->vm_regs.rax = -1;
354 PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %u\n",h->hrt_int_vector,h->first_hrt_core);
355 if (v3_raise_swintr(&core->vm_info->cores[h->first_hrt_core],h->hrt_int_vector)) {
356 PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %u\n",h->first_hrt_core);
357 core->vm_regs.rax = -1;
360 // Force core to exit now
361 v3_interrupt_cpu(core->vm_info,core->vm_info->cores[h->first_hrt_core].pcpu_id,0);
363 h->trans_state = HRT_MERGE;
// Merge/unmerge completion (0x3f), HRT side: return to idle.
371 case 0x3f: // merge operation done
372 if (v3_is_hvm_ros_core(core)) {
373 PrintError(core->vm_info,core, "hvm: request for merge done from ROS core\n");
374 core->vm_regs.rax=-1;
376 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_MERGE) {
377 PrintError(core->vm_info,core,"hvm: merge/unmerge done when in non-idle state\n");
378 core->vm_regs.rax=-1;
380 PrintDebug(core->vm_info,core, "hvm: merge or unmerge complete - back to idle\n");
381 h->trans_state=HRT_IDLE;
// Anything else is an unknown hypercall number.
389 PrintError(core->vm_info,core,"hvm: unknown hypercall %llx\n",a1);
390 core->vm_regs.rax=-1;
397 #define CEIL_DIV(x,y) (((x)/(y)) + !!((x)%(y)))
/*
 * VM-level HVM initialization from the XML config.
 * Defaults to a pure ROS VM (is_hvm=0, first_hrt_core=num_cores,
 * first_hrt_gpa=mem_size); becomes an HVM VM only if an enabled <hvm>
 * subtree with valid <ros> and <hrt> blocks is present.  Registers the
 * HVM hypercall handler when HVM is enabled.
 * Returns 0 on success / pure-ROS fallthrough, nonzero on config error
 * (error paths are not fully visible in this listing).
 */
399 int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
401 v3_cfg_tree_t *hvm_config;
402 v3_cfg_tree_t *ros_config;
403 v3_cfg_tree_t *hrt_config;
409 PrintDebug(vm, VCORE_NONE, "hvm: vm init\n");
// Default: everything (all cores, all GPAs) belongs to the ROS.
414 memset(&vm->hvm_state,0,sizeof(struct v3_vm_hvm));
415 vm->hvm_state.is_hvm=0;
416 vm->hvm_state.first_hrt_core=vm->num_cores;
417 vm->hvm_state.first_hrt_gpa=vm->mem_size;
419 if (!config || !(hvm_config=v3_cfg_subtree(config,"hvm"))) {
420 PrintDebug(vm,VCORE_NONE,"hvm: no HVM configuration found (all HW is ROS)\n");
// enable="y" is required; anything else leaves the VM pure ROS.
424 if (!(enable=v3_cfg_val(hvm_config,"enable")) || strcasecmp(enable,"y")) {
425 PrintDebug(vm,VCORE_NONE,"hvm: HVM configuration disabled (all HW is ROS)\n");
429 if (!(ros_config=v3_cfg_subtree(hvm_config,"ros"))) {
430 PrintError(vm,VCORE_NONE,"hvm: HVM configuration without ROS block...\n");
434 if (!(ros_cores=v3_cfg_val(ros_config,"cores"))) {
435 PrintError(vm,VCORE_NONE,"hvm: ROS block without cores...\n");
// ROS owns cores [0, ros_cores); HRT owns the rest.
439 vm->hvm_state.first_hrt_core = ((uint32_t)atoi(ros_cores));
441 if (!(ros_mem=v3_cfg_val(ros_config,"mem"))) {
442 PrintError(vm,VCORE_NONE,"hvm: ROS block without mem...\n");
// "mem" is given in MB; convert to a byte GPA boundary.
// NOTE(review): atoi() gives no error reporting; strtol would be safer.
446 vm->hvm_state.first_hrt_gpa = ((uint64_t)atoi(ros_mem))*1024*1024;
448 if (!(hrt_config=v3_cfg_subtree(hvm_config,"hrt"))) {
449 PrintError(vm,VCORE_NONE,"hvm: HVM configuration without HRT block...\n");
453 if (!(hrt_file_id=v3_cfg_val(hrt_config,"file_id"))) {
454 PrintError(vm,VCORE_NONE,"hvm: HRT block without file_id...\n");
458 vm->hvm_state.hrt_file = v3_cfg_get_file(vm,hrt_file_id);
460 if (!vm->hvm_state.hrt_file) {
461 PrintError(vm,VCORE_NONE,"hvm: HRT block contains bad file_id (%s)\n",hrt_file_id);
465 if (v3_register_hypercall(vm, HVM_HCALL,
466 hvm_hcall_handler, 0)) {
467 PrintError(vm,VCORE_NONE, "hvm: cannot register hypercall....\n");
471 // XXX sanity check config here
473 vm->hvm_state.is_hvm=1;
// Final summary of the ROS/HRT partitioning.
476 if (vm->hvm_state.is_hvm) {
477 V3_Print(vm,VCORE_NONE,"hvm: [ROS: cores 0..%u, mem 0..%p] [HRT: cores %u..%u, mem %p..%p, file_id=%s (tag %s)]\n",
478 vm->hvm_state.first_hrt_core-1,
479 (void*) vm->hvm_state.first_hrt_gpa-1,
480 vm->hvm_state.first_hrt_core,
482 (void*) vm->hvm_state.first_hrt_gpa,
483 (void*)vm->mem_size-1,
485 vm->hvm_state.hrt_file->tag);
487 V3_Print(vm,VCORE_NONE,"hvm: This is a pure ROS VM\n");
/*
 * VM-level HVM teardown: unregister the hypercall and, if a comm page
 * was allocated, remove its shadow memory region.
 * NOTE(review): the region lookup uses comm_page_hpa (a host physical
 * address) as the lookup key, while v3_add_shadow_mem in configure_hrt
 * maps the region at comm_page_gpa — confirm which address space
 * v3_get_mem_region expects here.
 */
494 int v3_deinit_hvm_vm(struct v3_vm_info *vm)
496 PrintDebug(vm, VCORE_NONE, "hvm: HVM VM deinit\n");
498 v3_remove_hypercall(vm,HVM_HCALL);
500 if (vm->hvm_state.comm_page_hpa) {
501 struct v3_mem_region *r = v3_get_mem_region(vm,-1,(addr_t)vm->hvm_state.comm_page_hpa);
503 PrintError(vm,VCORE_NONE,"hvm: odd, VM has comm_page_hpa, but no shadow memory\n");
505 v3_delete_mem_region(vm,r);
512 int v3_init_hvm_core(struct guest_info *core)
514 memset(&core->hvm_state,0,sizeof(core->hvm_state));
515 if (core->vm_info->hvm_state.is_hvm) {
516 if (core->vcpu_id >= core->vm_info->hvm_state.first_hrt_core) {
517 core->hvm_state.is_hrt=1;
523 int v3_deinit_hvm_core(struct guest_info *core)
525 PrintDebug(core->vm_info, VCORE_NONE, "hvm: HVM core deinit\n");
/*
 * Amount of guest physical memory visible to the ROS: in an HVM VM the
 * ROS sees GPAs [0, first_hrt_gpa).
 * NOTE(review): the non-HVM branch is missing from this listing —
 * presumably it returns the full vm->mem_size; confirm in the original.
 */
531 uint64_t v3_get_hvm_ros_memsize(struct v3_vm_info *vm)
533 if (vm->hvm_state.is_hvm) {
534 return vm->hvm_state.first_hrt_gpa;
/*
 * Amount of guest physical memory visible to the HRT.
 * NOTE(review): the body of this function is missing from this listing.
 */
539 uint64_t v3_get_hvm_hrt_memsize(struct v3_vm_info *vm)
544 uint32_t v3_get_hvm_ros_cores(struct v3_vm_info *vm)
546 if (vm->hvm_state.is_hvm) {
547 return vm->hvm_state.first_hrt_core;
549 return vm->num_cores;
/*
 * Number of cores owned by the HRT: cores [first_hrt_core, num_cores)
 * in an HVM VM.
 * NOTE(review): the non-HVM branch is missing from this listing —
 * presumably it returns 0; confirm in the original.
 */
553 uint32_t v3_get_hvm_hrt_cores(struct v3_vm_info *vm)
555 if (vm->hvm_state.is_hvm) {
556 return vm->num_cores - vm->hvm_state.first_hrt_core;
/*
 * True if the given GPA lies in the ROS-visible range [0, first_hrt_gpa).
 * NOTE(review): if addr_t is unsigned, the "gpa>=0" test is vacuous
 * (and may draw a -Wtype-limits warning); only the upper bound matters.
 * The non-HVM branch is missing from this listing.
 */
563 int v3_is_hvm_ros_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
565 if (vm->hvm_state.is_hvm) {
566 return gpa>=0 && gpa<vm->hvm_state.first_hrt_gpa;
/*
 * True if the given GPA lies in the HRT-only range
 * [first_hrt_gpa, mem_size).  The non-HVM branch is missing from this
 * listing — presumably it returns 0; confirm in the original.
 */
572 int v3_is_hvm_hrt_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
574 if (vm->hvm_state.is_hvm) {
575 return gpa>=vm->hvm_state.first_hrt_gpa && gpa<vm->mem_size;
581 int v3_is_hvm_hrt_core(struct guest_info *core)
583 return core->hvm_state.is_hrt;
586 int v3_is_hvm_ros_core(struct guest_info *core)
588 return !core->hvm_state.is_hrt;
/*
 * IPI delivery policy between cores.  Device-originated interrupts
 * (ioapic/msi — presumably the src==NULL path; the guard condition is
 * missing from this listing) reach only ROS cores.  Between apics,
 * an HRT source may target anyone; a ROS source may only target ROS.
 */
591 int v3_hvm_should_deliver_ipi(struct guest_info *src, struct guest_info *dest)
594 // ioapic or msi to apic
595 return !dest->hvm_state.is_hrt;
598 return src->hvm_state.is_hrt || (!src->hvm_state.is_hrt && !dest->hvm_state.is_hrt) ;
/*
 * Report which apics are visible to a given viewer via *start_apic /
 * *num_apics.  Device viewers (ioapic/msi) and ROS cores see only the
 * ROS apics [0, first_hrt_core); HRT cores see all apics.
 * NOTE(review): the *start_apic assignments are missing from this
 * listing; only the *num_apics assignments are visible.
 */
602 void v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_info *vm,
603 uint32_t *start_apic, uint32_t *num_apics)
606 // Seen from ioapic, msi, etc:
607 if (vm->hvm_state.is_hvm) {
608 // HVM VM shows only the ROS cores/apics to ioapic, msi, etc
610 *num_apics = vm->hvm_state.first_hrt_core;
612 // Non-HVM shows all cores/APICs to apic, msi, etc.
614 *num_apics = vm->num_cores;
618 if (core->hvm_state.is_hrt) {
619 // HRT core/apic sees all apics
620 // (this policy may change...)
622 *num_apics = vm->num_cores;
624 // non-HRT core/apic sees only non-HRT cores/apics
626 *num_apics = vm->hvm_state.first_hrt_core;
631 #define MAX(x,y) ((x)>(y)?(x):(y))
632 #define MIN(x,y) ((x)<(y)?(x):(y))
635 static uint64_t boot_state_end_addr(struct v3_vm_info *vm)
637 return PAGE_ADDR(vm->mem_size);
/*
 * Location of the null interrupt handler stub: the last page below the
 * boot-state end address.  NOTE(review): the *limit assignment is
 * missing from this listing (presumably PAGE_SIZE).
 */
640 static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
642 *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - PAGE_SIZE);
646 extern v3_cpu_arch_t v3_mach_type;
648 extern void *v3_hvm_svm_null_int_handler_start;
649 extern void *v3_hvm_svm_null_int_handler_end;
650 extern void *v3_hvm_vmx_null_int_handler_start;
651 extern void *v3_hvm_vmx_null_int_handler_end;
/*
 * Copy the architecture-appropriate null interrupt handler stub
 * (SVM or VMX variant, delimited by linker/extern symbols) into the
 * guest at the location chosen by get_null_int_handler_loc().
 */
653 static void write_null_int_handler(struct v3_vm_info *vm)
660 get_null_int_handler_loc(vm,&base,&limit);
// Select the stub matching the virtualization hardware.
662 switch (v3_mach_type) {
665 case V3_SVM_REV3_CPU:
666 data = (void*) &v3_hvm_svm_null_int_handler_start;
667 len = (void*) &v3_hvm_svm_null_int_handler_end - data;
673 case V3_VMX_EPT_UG_CPU:
674 data = (void*) &v3_hvm_vmx_null_int_handler_start;
675 len = (void*) &v3_hvm_vmx_null_int_handler_end - data;
679 PrintError(vm,VCORE_NONE,"hvm: cannot determine CPU type to select null interrupt handler...\n");
685 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base),len,(uint8_t*)data);
688 PrintDebug(vm,VCORE_NONE,"hvm: wrote null interrupt handler at %p (%llu bytes)\n",base,len);
/*
 * Location of the boot IDT: the second page below the boot-state end
 * address (the null handler occupies the first).  NOTE(review): the
 * *limit assignment is missing from this listing.
 */
692 static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
694 *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - 2 * PAGE_SIZE);
698 // default IDT entries (int and trap gates)
700 // Format is 16 bytes long:
702 // 16 selector => (target code selector) => 0x8 // entry 1 of GDT
703 // 3 ist => (stack) = 0 => current stack
705 // 4 type => 0xe=>INT, 0xf=>TRAP
706 // 1 reserved => 0 (indicates "system" by being zero)
710 // 32 offsethigh => 0 (total is a 64 bit offset)
713 // 00 00 | 08 00 | 00 | 8[typenybble] | offsetmid | offsethigh | reserved
715 // Note little endian
// Template 16-byte long-mode gate descriptors (see format comment above):
// selector 0x8 (GDT code entry), present, type 0xf (trap) / 0xe (int);
// the three offset fields are zero here and patched in by write_idt().
717 static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 } ;
718 static uint64_t idt64_int_gate_entry_mask[2] = { 0x00008e0000080000, 0x0 };
/*
 * Build the boot IDT in guest memory: every vector points at the null
 * interrupt handler (adjusted by the HRT's GVA offset); vectors 0..31
 * get trap gates, 32..255 get interrupt gates.  The 64-bit handler
 * address is scattered into the gate's low/mid/high offset fields.
 */
720 static void write_idt(struct v3_vm_info *vm)
725 uint64_t handler_len;
727 uint64_t trap_gate[2];
728 uint64_t int_gate[2];
730 get_idt_loc(vm,&base,&limit);
732 get_null_int_handler_loc(vm,&handler,&handler_len);
// Handler will be executed at its virtual address in the HRT mapping.
734 handler += vm->hvm_state.gva_offset;
736 memcpy(trap_gate,idt64_trap_gate_entry_mask,16);
737 memcpy(int_gate,idt64_int_gate_entry_mask,16);
740 // update the entries for the handler location
744 hand = (uint8_t*) &handler;
// Patch offset fields: bytes [0:2)=offset low, [6:8)=mid, [8:12)=high.
746 mask = (uint8_t *)trap_gate;
747 memcpy(&(mask[0]),&(hand[0]),2); // offset low
748 memcpy(&(mask[6]),&(hand[2]),2); // offset med
749 memcpy(&(mask[8]),&(hand[4]),4); // offset high
751 mask = (uint8_t *)int_gate;
752 memcpy(&(mask[0]),&(hand[0]),2); // offset low
753 memcpy(&(mask[6]),&(hand[2]),2); // offset med
754 memcpy(&(mask[8]),&(hand[4]),4); // offset high
756 PrintDebug(vm,VCORE_NONE,"hvm: Adding default null trap and int gates\n");
// Vectors 0..31 (exceptions) as trap gates, 32..255 as int gates.
760 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)trap_gate);
763 for (i=32;i<256;i++) {
764 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)int_gate);
767 PrintDebug(vm,VCORE_NONE,"hvm: wrote IDT at %p\n",base);
/*
 * Location of the boot GDT: third page below the boot-state end address.
 * NOTE(review): the *limit assignment is missing from this listing.
 */
772 static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
774 *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 3 * PAGE_SIZE);
// Minimal 64-bit boot GDT: null descriptor, a long-mode code segment
// (L bit set — selector 0x8, matching the IDT gate templates), and a
// data segment.
778 static uint64_t gdt64[3] = {
779 0x0000000000000000, /* null */
780 0x00a09a0000000000, /* code (note lme bit) */
781 0x00a0920000000000, /* data (most entries don't matter) */
/*
 * Copy the static gdt64 table into guest memory at the boot GDT
 * location.  (Local declarations of base/limit are not visible in this
 * listing.)
 */
784 static void write_gdt(struct v3_vm_info *vm)
789 get_gdt_loc(vm,&base,&limit);
790 v3_write_gpa_memory(&vm->cores[0],(addr_t)base,limit,(uint8_t*) gdt64);
792 PrintDebug(vm,VCORE_NONE,"hvm: wrote GDT at %p\n",base);
/*
 * Location of the boot TSS: fourth page below the boot-state end
 * address.  NOTE(review): the *limit assignment is missing from this
 * listing.
 */
797 static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
799 *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 4 * PAGE_SIZE);
/*
 * Zero-fill the boot TSS page in guest memory; a zeroed TSS is
 * sufficient for the HRT's bootstrap.
 */
803 static void write_tss(struct v3_vm_info *vm)
808 get_tss_loc(vm,&base,&limit);
810 v3_set_gpa_memory(&vm->cores[0],(addr_t)base,limit,0);
812 PrintDebug(vm,VCORE_NONE,"hvm: wrote TSS at %p\n",base);
816 #define TOP_HALF_START 0xffff800000000000ULL
817 #define BOTTOM_HALF_END 0x00007fffffffffffULL
820 #define L4_UNIT PAGE_SIZE
821 #define L3_UNIT (512ULL * L4_UNIT)
822 #define L2_UNIT (512ULL * L3_UNIT)
823 #define L1_UNIT (512ULL * L2_UNIT)
/*
 * Count the page-table pages needed at each paging level to map
 * max_mem_mapped bytes with 4 KB pages: one PML4 (*l1 — its assignment
 * is missing from this listing), then ceil-divided counts of PDP, PD,
 * and PT pages (512 8-byte entries per page).
 */
825 static void compute_pts_4KB(struct v3_vm_info *vm,
826 uint64_t *l1, uint64_t *l2, uint64_t *l3, uint64_t *l4)
829 // we map the physical memory up to max_mem_mapped either at 0x0 or at TOP_HALF start
830 // that is, it either fills in the first 256 rows of PML4 or the last 256 rows of PML4
831 // so it is the same number of page tables regardless
833 uint64_t max_gva = vm->hvm_state.max_mem_mapped;
836 *l2 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*512ULL*4096ULL),512);
837 *l3 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*4096ULL),512);
838 *l4 = CEIL_DIV(CEIL_DIV(max_gva,4096ULL),512);
844 PTS MAP using 1 GB pages
845 n second levels pts, highest gva, highest address
851 PTS MAP using 2 MB pages
852 n third level pts, highest gva, highest address
853 m second level pts, highest gva, highest address
858 PTS MAP using 4 KB pages
859 n 4th level, highest gva, highest address
860 m 3rd level, highest gva, highest address
861 l second level, highest gva, highest address
865 PTS MAP using 512 GB pages when this becomes available
/*
 * Location and size of the boot page tables: how many PT pages are
 * needed depends on the page size the HRT requested via its multiboot
 * flags (fewer levels for larger pages).  The tables sit immediately
 * below the 4 fixed boot-state pages (null handler, IDT, GDT, TSS).
 * NOTE(review): the 512GB/1GB num_pt assignments are missing from this
 * listing.
 */
870 static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
872 uint64_t l1,l2,l3,l4;
875 compute_pts_4KB(vm,&l1,&l2,&l3,&l4);
877 if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
879 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
881 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
882 num_pt = l1 + l2 + l3;
883 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) {
884 num_pt = l1 + l2 + l3 + l4;
886 PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT location flags=0x%llx memsize=0x%llx\n",vm->hvm_state.hrt_flags,(uint64_t)vm->mem_size);
890 *base = (void*)PAGE_ADDR(boot_state_end_addr(vm)-(4+num_pt)*PAGE_SIZE);
891 *limit = num_pt*PAGE_SIZE;
/*
 * Build the HRT's identity-style boot page tables in guest memory,
 * mapping GVAs [gva_offset, gva_offset + max_mem_mapped) to GPAs
 * [0, max_mem_mapped).  The depth of the hierarchy that is actually
 * written depends on the page size the HRT requested (large_page set
 * at the level where translation stops).  Each table page is resolved
 * to an HVA via v3_gpa_to_hva and filled in place.
 */
894 static void write_pts(struct v3_vm_info *vm)
897 uint64_t num_l1, num_l2, num_l3, num_l4;
898 void *start_l1, *start_l2, *start_l3, *start_l4;
904 void *max_gpa = (void*) vm->hvm_state.max_mem_mapped;
905 void *min_gva = (void*) vm->hvm_state.gva_offset;
906 #ifdef V3_CONFIG_DEBUG_HVM
907 void *max_gva = min_gva+vm->hvm_state.max_mem_mapped;
910 uint64_t i_start,i_end;
912 struct pml4e64 *pml4e;
// Decide how many levels to actually populate from the HRT's flags.
917 if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
918 PrintError(vm,VCORE_NONE,"hvm: Attempt to build 512 GB pages\n");
920 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
922 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
924 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) {
927 PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT levels\n");
// The per-level tables are laid out back-to-back starting at start_l1.
931 get_pt_loc(vm,&start_l1,&size);
932 compute_pts_4KB(vm,&num_l1,&num_l2,&num_l3,&num_l4);
934 start_l2=start_l1+PAGE_SIZE*num_l1;
935 start_l3=start_l2+PAGE_SIZE*num_l2;
936 start_l4=start_l3+PAGE_SIZE*num_l3;
938 PrintDebug(vm,VCORE_NONE,"hvm: writing %llu levels of PTs start at address %p\n", max_level,start_l1);
939 PrintDebug(vm,VCORE_NONE,"hvm: min_gva=%p, max_gva=%p, min_gpa=%p, max_gpa=%p\n",min_gva,max_gva,min_gpa,max_gpa);
940 PrintDebug(vm,VCORE_NONE,"hvm: num_l1=%llu, num_l2=%llu, num_l3=%llu, num_l4=%llu\n", num_l1, num_l2, num_l3, num_l4);
941 PrintDebug(vm,VCORE_NONE,"hvm: start_l1=%p, start_l2=%p, start_l3=%p, start_l4=%p\n", start_l1, start_l2, start_l3, start_l4);
945 // build PML4 (only one)
946 if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pml4e)) {
947 PrintError(vm,VCORE_NONE,"hvm: Cannot translate pml4 location\n");
951 memset(pml4e,0,PAGE_SIZE);
// GVA offset selects low-half (rows 0..) or top-half (rows 256..) PML4 slots.
954 i_start=0; i_end = num_l2;
955 } else if (min_gva==(void*)TOP_HALF_START) {
956 i_start=256; i_end=256+num_l2;
958 PrintError(vm,VCORE_NONE,"hvm: unsupported gva offset\n");
962 for (i=i_start, cur_gva=min_gva, cur_gpa=min_gpa;
964 i++, cur_gva+=L1_UNIT, cur_gpa+=L1_UNIT) {
970 PrintError(vm,VCORE_NONE,"hvm: Intel has not yet defined a PML4E large page\n");
971 pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
972 //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
974 pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(start_l2+(i-i_start)*PAGE_SIZE));
975 //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
// Level 2 (PDP): either 1 GB large pages or pointers to PD pages.
986 for (cur_pt=start_l2, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
988 cur_pt+=PAGE_SIZE, pt++) {
991 if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pdpe)) {
992 PrintError(vm,VCORE_NONE,"hvm: Cannot translate pdpe location\n");
996 memset(pdpe,0,PAGE_SIZE);
999 i<512 && cur_gpa<max_gpa;
1000 i++, cur_gva+=L2_UNIT, cur_gpa+=L2_UNIT) {
1006 pdpe[i].large_page=1;
1007 pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
1008 //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
1010 pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(start_l3+(pt*512+i)*PAGE_SIZE));
1011 //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
// Level 3 (PD): either 2 MB large pages or pointers to PT pages.
1021 for (cur_pt=start_l3, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
1023 cur_pt+=PAGE_SIZE, pt++) {
1026 if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pde)) {
1027 PrintError(vm,VCORE_NONE,"hvm: Cannot translate pde location\n");
1031 memset(pde,0,PAGE_SIZE);
1034 i<512 && cur_gpa<max_gpa;
1035 i++, cur_gva+=L3_UNIT, cur_gpa+=L3_UNIT) {
1041 pde[i].large_page=1;
1042 pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
1043 //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t) pde[i].pt_base_addr);
1045 pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(start_l4+(pt*512+i)*PAGE_SIZE));
1046 //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pde[i].pt_base_addr);
// Level 4 (PT): terminal 4 KB page mappings.
1058 for (cur_pt=start_l4, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
1060 cur_pt+=PAGE_SIZE, pt++) {
1063 if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pte)) {
1064 PrintError(vm,VCORE_NONE,"hvm: Cannot translate pte location\n");
1068 memset(pte,0,PAGE_SIZE);
1071 i<512 && cur_gpa<max_gpa;
1072 i++, cur_gva+=L4_UNIT, cur_gpa+=L4_UNIT) {
1076 pte[i].page_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
1077 //PrintDebug(vm,VCORE_NONE,"hvm: pte: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pte[i].page_base_addr);
/*
 * Location of the multiboot info table, derived from the page-table
 * location.  NOTE(review): the adjustment of base/limit after the
 * get_pt_loc call is missing from this listing.
 */
1085 static void get_mb_info_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
1088 get_pt_loc(vm,base, limit);
/*
 * Populate the HRT-specific multiboot info tag handed to the HRT at
 * boot: apic topology, CPU frequency, and the negotiated HVM layout
 * (flags, mapped-memory limit, first HRT GPA, GVA offset, comm page,
 * upcall interrupt vector).  Returns 0 (return not visible in listing).
 */
1094 int v3_build_hrt_multiboot_tag(struct guest_info *core, mb_info_hrt_t *hrt)
1096 struct v3_vm_info *vm = core->vm_info;
1098 hrt->tag.type = MB_INFO_HRT_TAG;
1099 hrt->tag.size = sizeof(mb_info_hrt_t);
1101 hrt->total_num_apics = vm->num_cores;
1102 hrt->first_hrt_apic_id = vm->hvm_state.first_hrt_core;
// No ioapic is exposed to the HRT.
1103 hrt->have_hrt_ioapic=0;
1104 hrt->first_hrt_ioapic_entry=0;
1106 hrt->cpu_freq_khz = V3_CPU_KHZ();
// Echo back the configuration negotiated in configure_hrt().
1108 hrt->hrt_flags = vm->hvm_state.hrt_flags;
1109 hrt->max_mem_mapped = vm->hvm_state.max_mem_mapped;
1110 hrt->first_hrt_gpa = vm->hvm_state.first_hrt_gpa;
1111 hrt->gva_offset = vm->hvm_state.gva_offset;
1112 hrt->comm_page_gpa = vm->hvm_state.comm_page_gpa;
1113 hrt->hrt_int_vector = vm->hvm_state.hrt_int_vector;
/*
 * Build the multiboot info table (into a 256-byte scratch buffer) for
 * the first HRT core and write it into guest memory at the MB info
 * location.  Only HRT_MBOOT64 HRTs are supported.
 */
1118 static void write_mb_info(struct v3_vm_info *vm)
1120 if (vm->hvm_state.hrt_type!=HRT_MBOOT64) {
1121 PrintError(vm, VCORE_NONE,"hvm: Cannot handle this HRT type\n");
1129 get_mb_info_loc(vm,&base,&limit);
1131 if ((size=v3_build_multiboot_table(&vm->cores[vm->hvm_state.first_hrt_core],buf,256))==-1) {
1132 PrintError(vm,VCORE_NONE,"hvm: Failed to build MB info\n");
// Guard against the table exceeding the reserved region.
1137 PrintError(vm,VCORE_NONE,"hvm: MB info is too large\n");
1141 v3_write_gpa_memory(&vm->cores[vm->hvm_state.first_hrt_core],
1146 PrintDebug(vm,VCORE_NONE, "hvm: wrote MB info at %p\n", base);
1150 #define SCRATCH_STACK_SIZE 4096
/*
 * Region available for the HRT image itself: from the first HRT GPA up
 * to just below the MB info table minus one scratch stack per HRT core.
 * NOTE(review): the error string at line 1165 misspells "collides";
 * left as-is since runtime strings are not altered here.
 */
1153 static void get_hrt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
1158 get_mb_info_loc(vm,&mb_base,&mb_limit);
// Reserve a scratch stack per HRT core directly below the MB info.
1160 mb_base-=SCRATCH_STACK_SIZE*v3_get_hvm_hrt_cores(vm);
1162 *base = (void*)PAGE_ADDR(vm->hvm_state.first_hrt_gpa);
1164 if (mb_base < *base+PAGE_SIZE) {
1165 PrintError(vm,VCORE_NONE,"hvm: HRT stack colides with HRT\n");
1168 *limit = mb_base - *base;
1172 #define ERROR(fmt, args...) PrintError(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
1173 #define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
1175 #define ELF_MAGIC 0x464c457f
1176 #define MB2_MAGIC 0xe85250d6
1178 #define MB2_INFO_MAGIC 0x36d76289
1180 static int is_elf(uint8_t *data, uint64_t size)
1182 if (*((uint32_t*)data)==ELF_MAGIC) {
1189 static mb_header_t *find_mb_header(uint8_t *data, uint64_t size)
1191 uint64_t limit = size > 32768 ? 32768 : size;
1194 // Scan for the .boot magic cookie
1195 // must be in first 32K, assume 4 byte aligned
1196 for (i=0;i<limit;i+=4) {
1197 if (*((uint32_t*)&data[i])==MB2_MAGIC) {
1198 INFO("Found multiboot header at offset 0x%llx\n",i);
1199 return (mb_header_t *) &data[i];
/*
 * Negotiate the HRT's runtime environment from its multiboot64 HRT tag:
 * page-size model, maximum memory to map, GVA offset/entry, upcall
 * vector, and the communication page.  Writes the accepted values into
 * vm->hvm_state.  Returns 0 on success, -1 on any unsupported request
 * (returns/braces are not fully visible in this listing).
 */
1206 static int configure_hrt(struct v3_vm_info *vm, mb_data_t *mb)
1208 struct v3_vm_hvm *h = &vm->hvm_state;
1209 uint64_t f = mb->mb64_hrt->hrt_flags;
1210 uint64_t maxmap = mb->mb64_hrt->max_mem_to_map;
1211 uint64_t gvaoff = mb->mb64_hrt->gva_offset;
1212 uint64_t gvaentry = mb->mb64_hrt->gva_entry;
1213 uint64_t commgpa = mb->mb64_hrt->comm_page_gpa;
1214 uint8_t vec = mb->mb64_hrt->hrt_int_vector;
1217 PrintDebug(vm,VCORE_NONE,"hvm: HRT request: flags=0x%llx max_map=0x%llx gva_off=%llx gva_entry=%llx comm_page=0x%llx vector=0x%x\n",
1218 f, maxmap, gvaoff,gvaentry,commgpa, vec);
// Always map at least 4 GB regardless of what the HRT asked for.
1220 if (maxmap<0x100000000ULL) {
1221 PrintDebug(vm,VCORE_NONE,"hvm: revising request up to 4 GB max map\n");
1222 maxmap=0x100000000ULL;
// Select the largest page size the HRT's flags allow (512 GB rejected).
1225 if (f & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
1226 PrintError(vm,VCORE_NONE,"hvm: support for 512 GB pages is not yet available in hardware\n");
1228 } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
1230 f |= MB_TAG_MB64_HRT_FLAG_MAP_1GB;
1231 h->max_mem_mapped = maxmap;
1232 PrintDebug(vm,VCORE_NONE,"hvm: 1 GB pages selected\n");
1233 } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
1235 f |= MB_TAG_MB64_HRT_FLAG_MAP_2MB;
1236 h->max_mem_mapped = maxmap;
1237 PrintDebug(vm,VCORE_NONE,"hvm: 2 MB pages selected\n");
1238 } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_4KB) {
1240 f |= MB_TAG_MB64_HRT_FLAG_MAP_4KB;
1241 h->max_mem_mapped = maxmap;
1242 PrintDebug(vm,VCORE_NONE,"hvm: 4 KB pages selected\n");
1244 PrintError(vm,VCORE_NONE,"hvm: no page table model is requested\n");
1248 if (f & MB_TAG_MB64_HRT_FLAG_RELOC) {
1249 PrintError(vm,VCORE_NONE,"hvm: relocatable hrt not currently supported\n");
1255 if (maxmap>h->max_mem_mapped) {
1256 PrintError(vm,VCORE_NONE,"hvm: requested 0x%llx bytes mapped, which is more than currently supported\n",maxmap);
// Only identity (offset 0) or top-half GVA layouts are supported.
1260 if (gvaoff!=0 && gvaoff!=TOP_HALF_START) {
1261 PrintError(vm,VCORE_NONE,"hvm: currently only GVA offsets of 0 and %llx are supported\n", TOP_HALF_START);
1265 h->gva_offset = gvaoff;
1267 h->gva_entry = gvaentry;
// The HRT image must live entirely within the HRT-only GPA region.
1269 if (mb->addr->load_addr < h->first_hrt_gpa) {
1270 PrintError(vm,VCORE_NONE,"hvm: load start address of HRT is below first HRT GPA\n");
1274 if (mb->addr->bss_end_addr > (vm->mem_size-(1024*1024*64))) {
1275 PrintError(vm,VCORE_NONE,"hvm: bss end address of HRT above last allowed GPA\n");
1280 PrintError(vm,VCORE_NONE,"hvm: cannot support vector %x\n",vec);
1284 h->hrt_int_vector = vec;
// Comm page must sit above physical memory so it cannot shadow RAM.
1287 if (commgpa < vm->mem_size) {
1288 PrintError(vm,VCORE_NONE,"hvm: cannot map comm page over physical memory\n");
1292 h->comm_page_gpa = commgpa;
// Allocate and map the comm page once; on re-entry just clear it.
1294 if (!h->comm_page_hpa) {
1295 if (!(h->comm_page_hpa=V3_AllocPages(1))) {
1296 PrintError(vm,VCORE_NONE,"hvm: unable to allocate space for comm page\n");
1300 h->comm_page_hva = V3_VAddr(h->comm_page_hpa);
1302 memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
1304 if (v3_add_shadow_mem(vm,-1,h->comm_page_gpa,h->comm_page_gpa+PAGE_SIZE_4KB,(addr_t)h->comm_page_hpa)) {
1305 PrintError(vm,VCORE_NONE,"hvm: unable to map communication page\n");
// NOTE(review): this frees comm_page_gpa, but the page was allocated
// as comm_page_hpa — likely should be V3_FreePages((void*)(h->comm_page_hpa),1).
1306 V3_FreePages((void*)(h->comm_page_gpa),1);
1311 PrintDebug(vm,VCORE_NONE,"hvm: added comm page for first time\n");
1314 memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
1317 PrintDebug(vm,VCORE_NONE,"hvm: HRT configuration: flags=0x%llx max_mem_mapped=0x%llx gva_offset=0x%llx gva_entry=0x%llx comm_page=0x%llx vector=0x%x\n",
1318 h->hrt_flags,h->max_mem_mapped, h->gva_offset,h->gva_entry, h->comm_page_gpa, h->hrt_int_vector);
1324 static int setup_mb_kernel_hrt(struct v3_vm_info *vm)
1328 if (v3_parse_multiboot_header(vm->hvm_state.hrt_file,&mb)) {
1329 PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n");
1333 if (configure_hrt(vm,&mb)) {
1334 PrintError(vm,VCORE_NONE, "hvm: cannot configure HRT\n");
1338 if (v3_write_multiboot_kernel(vm,&mb,vm->hvm_state.hrt_file,
1339 (void*)vm->hvm_state.first_hrt_gpa,
1340 vm->mem_size-vm->hvm_state.first_hrt_gpa)) {
1341 PrintError(vm,VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n");
1345 if (vm->hvm_state.gva_entry) {
1346 vm->hvm_state.hrt_entry_addr = vm->hvm_state.gva_entry;
1348 vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + vm->hvm_state.gva_offset;
1351 vm->hvm_state.hrt_type = HRT_MBOOT64;
1358 static int setup_hrt(struct v3_vm_info *vm)
1360 if (is_elf(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size) &&
1361 find_mb_header(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) {
1363 PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n");
1364 if (setup_mb_kernel_hrt(vm)) {
1365 PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n");
1369 PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not a multiboot kernel\n");
1386 We do not touch the ROS portion of the address space.
1387 The HRT portion looks like:
1389 INT_HANDLER (1 page - page aligned)
1390 IDT (1 page - page aligned)
1391 GDT (1 page - page aligned)
TSS (1 page - page aligned)
PAGETABLES (identity map of first N GB)
1394 ROOT PT first (lowest memory addr), followed by 2nd level PTs in order,
1395 followed by 3rd level PTs in order, followed by 4th level
1398 SCRATCH_STACK_HRT_CORE0
1399 SCRATCH_STACK_HRT_CORE1
1401 SCRATCH_STACK_HRT_COREN
1403 HRT (as many pages as needed, page-aligned, starting at first HRT address)
1411 int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm)
1413 if (!vm->hvm_state.is_hvm) {
1414 PrintDebug(vm,VCORE_NONE,"hvm: skipping HVM setup for boot as this is not an HVM\n");
1418 PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory begins\n");
1420 if (setup_hrt(vm)) {
1421 PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n");
1425 // the locations of all the other items are determined by
1426 // the HRT setup, so these must happen after
1428 write_null_int_handler(vm);
1435 // this must happen last
1438 PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory done\n");
1444 On entry for every core:
1446 IDTR points to stub IDT
1447 GDTR points to stub GDT
TR (task register) points to stub TSS
1449 CR3 points to root page table
1451 EFER has LME AND LMA (and NX for compatibility with Linux)
1452 RSP is TOS of core's scratch stack (looks like a call)
1454 RAX = MB magic cookie
1455 RBX = address of multiboot info table
1456 RCX = this core id / apic id (0..N-1)
1457 RDX = this core id - first HRT core ID (==0 for the first HRT core)
1459 All addresses are virtual addresses, offset as needed by gva_offset
1461 Other regs are zeroed
1463 shadow/nested paging state reset for long mode
/*
 * Per-core setup so an HRT core boots directly into 64-bit long mode at
 * the HRT entry point.  See the "On entry for every core" comment above
 * for the register/segment contract this function establishes.
 *
 * NOTE(review): this excerpt appears to have short lines elided
 * (declarations of `base`/`limit`, closing braces, else/return lines,
 * and trailing call arguments); code below is reproduced as-is.
 */
int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
uint64_t gva_offset;

// timestamp the start of this (re)boot
rdtscll(core->hvm_state.last_boot_start);

if (!core->hvm_state.is_hrt) {
    // ROS cores are brought up by the normal boot path, not here
    PrintDebug(core->vm_info,core,"hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id);

PrintDebug(core->vm_info, core, "hvm: setting up HRT core (%u) for boot\n", core->vcpu_id);

// every GVA handed to the HRT is offset by this amount (0 or TOP_HALF_START)
gva_offset = core->vm_info->hvm_state.gva_offset;

// start from a clean slate: zero all architectural state, then fill it in
memset(&core->vm_regs,0,sizeof(core->vm_regs));
memset(&core->ctrl_regs,0,sizeof(core->ctrl_regs));
memset(&core->dbg_regs,0,sizeof(core->dbg_regs));
memset(&core->segments,0,sizeof(core->segments));
memset(&core->msrs,0,sizeof(core->msrs));
memset(&core->fp_state,0,sizeof(core->fp_state));

// We are in long mode with virtual memory and we want
// to start immediately
core->cpl = 0; // we are going right into the kernel
core->cpu_mode = LONG;
core->mem_mode = VIRTUAL_MEM;
core->core_run_state = CORE_RUNNING ;

// RAX: multiboot magic cookie the HRT expects at entry
core->vm_regs.rax = MB2_INFO_MAGIC;

// multiboot info pointer
get_mb_info_loc(core->vm_info, &base,&limit);
core->vm_regs.rbx = (uint64_t) base + gva_offset;

// RCX: this core's vcpu/apic id
core->vm_regs.rcx = core->vcpu_id;

// RDX: id relative to the first HRT core (0 for the first HRT core)
core->vm_regs.rdx = core->vcpu_id - core->vm_info->hvm_state.first_hrt_core;

// Now point to scratch stack for this core
// it begins at an offset relative to the MB info page
get_mb_info_loc(core->vm_info, &base,&limit);
base = base + gva_offset;
base -= core->vm_regs.rdx * SCRATCH_STACK_SIZE;
core->vm_regs.rsp = (v3_reg_t) base;
core->vm_regs.rbp = (v3_reg_t) base-8;

// push onto the stack a bad rbp and bad return address
core->vm_regs.rsp-=16;
// NOTE(review): remaining v3_set_gpa_memory() arguments (length, fill byte)
// appear elided from this excerpt
v3_set_gpa_memory(core,
                  core->vm_regs.rsp-gva_offset,

// RIP: the HRT entry point; a requested GVA entry overrides the recorded one
get_hrt_loc(core->vm_info, &base,&limit);
if (core->vm_info->hvm_state.gva_entry) {
    core->rip = core->vm_info->hvm_state.gva_entry;
    core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr + gva_offset;

PrintDebug(core->vm_info,core,"hvm: hrt core %u has rip=%p, rsp=%p, rbp=%p, rax=%p, rbx=%p, rcx=%p, rdx=%p\n",
           (core->vcpu_id - core->vm_info->hvm_state.first_hrt_core),
           (void*)(core->vm_regs.rsp),
           (void*)(core->vm_regs.rbp),
           (void*)(core->vm_regs.rax),
           (void*)(core->vm_regs.rbx),
           (void*)(core->vm_regs.rcx),
           (void*)(core->vm_regs.rdx));

// Setup CRs for long mode and our stub page table
// CR0: 0x80000001 = PG (bit 31) | PE (bit 0)
core->ctrl_regs.cr0 = 0x80000001;
core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0;

// CR2: don't care (output from #PF)
// CR3: set to our PML4E, without setting PCD or PWT
get_pt_loc(core->vm_info, &base,&limit);
core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base); // not offset as this is a GPA
core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3;

// CR4: PGE, PAE, PSE (last byte: 1 0 1 1 0 0 0 0)
core->ctrl_regs.cr4 = 0xb0;
core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4;

// RFLAGS zeroed is fine: come in with interrupts off
// EFER: 0x1d00 = SVME (12) | NXE (11) | LMA (10) | LME (8)
core->ctrl_regs.efer = 0x1d00;
core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer;

/*
  selector is 13 bits of index, 1 bit table indicator
  (0 => GDT), 2 bits RPL

  index is scaled by 8, even in long mode, where some entries
  are 16 bytes long....
  -> code, data descriptors have 8 byte format
  because base, limit, etc, are ignored (no segmentation)
  -> interrupt/trap gates have 16 byte format
  because offset needs to be 64 bits
*/

// Install our stub IDT
get_idt_loc(core->vm_info, &base,&limit);
core->segments.idtr.selector = 0; // entry 0 (NULL) of the GDT
core->segments.idtr.base = (addr_t) base; // only base+limit are used
core->segments.idtr.limit = limit-1;
core->segments.idtr.type = 0x0;
core->segments.idtr.system = 0;
core->segments.idtr.dpl = 0;
core->segments.idtr.present = 0;
core->segments.idtr.long_mode = 0;

// Install our stub GDT
get_gdt_loc(core->vm_info, &base,&limit);
core->segments.gdtr.selector = 0; // entry 0 (NULL) of the GDT
core->segments.gdtr.base = (addr_t) base;
core->segments.gdtr.limit = limit-1; // only base+limit are used
core->segments.gdtr.type = 0x0;
core->segments.gdtr.system = 0;
core->segments.gdtr.dpl = 0;
core->segments.gdtr.present = 0;
core->segments.gdtr.long_mode = 0;

// Install our stub TSS (loaded into the task register)
get_tss_loc(core->vm_info, &base,&limit);
core->segments.tr.selector = 0;
core->segments.tr.base = (addr_t) base;
core->segments.tr.limit = limit-1;
core->segments.tr.type = 0x9;
core->segments.tr.system = 0; // available 64 bit TSS
core->segments.tr.dpl = 0;
core->segments.tr.present = 1;
core->segments.tr.long_mode = 0; // not used

base = 0x0; // these are not offset as we want to make all gvas visible
// NOTE(review): a reassignment of `limit` may be elided here in this excerpt

// Flat 64-bit code segment
core->segments.cs.selector = 0x8 ; // entry 1 of GDT (RPL=0)
core->segments.cs.base = (addr_t) base; // not used
core->segments.cs.limit = limit; // not used
core->segments.cs.type = 0xe; // only C is used
core->segments.cs.system = 1; // not a system segment
core->segments.cs.dpl = 0;
core->segments.cs.present = 1;
core->segments.cs.long_mode = 1;

// DS, SS, etc are identical
core->segments.ds.selector = 0x10; // entry 2 of GDT (RPL=0)
core->segments.ds.base = (addr_t) base;
core->segments.ds.limit = limit;
core->segments.ds.type = 0x6; // ignored
core->segments.ds.system = 1; // not a system segment
core->segments.ds.dpl = 0;
core->segments.ds.present = 1;
core->segments.ds.long_mode = 1;

memcpy(&core->segments.ss,&core->segments.ds,sizeof(core->segments.ds));
memcpy(&core->segments.es,&core->segments.ds,sizeof(core->segments.ds));
memcpy(&core->segments.fs,&core->segments.ds,sizeof(core->segments.ds));
memcpy(&core->segments.gs,&core->segments.ds,sizeof(core->segments.ds));

// reset paging here for shadow...
if (core->shdw_pg_mode != NESTED_PAGING) {
    PrintError(core->vm_info, core, "hvm: shadow paging guest... this will end badly\n");
1660 int v3_handle_hvm_reset(struct guest_info *core)
1663 if (core->core_run_state != CORE_RESETTING) {
1667 if (!core->vm_info->hvm_state.is_hvm) {
1671 if (v3_is_hvm_hrt_core(core)) {
1672 // this is an HRT reset
1675 // wait for all the HRT cores
1676 v3_counting_barrier(&core->vm_info->reset_barrier);
1678 if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
1680 core->vm_info->run_state = VM_RESETTING;
1683 core->core_run_state = CORE_RESETTING;
1685 if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
1686 // we really only need to clear the bss
1687 // and recopy the .data, but for now we'll just
1689 rc |= v3_setup_hvm_vm_for_boot(core->vm_info);
1692 PrintError(core->vm_info,core,"hvm: failed to setup HVM VM for boot rc=%d\n",rc);
1696 // now everyone is ready to reset
1697 rc |= v3_setup_hvm_hrt_core_for_boot(core);
1700 PrintError(core->vm_info,core,"hvm: failed to setup HVM core for boot rc=%d\n",rc);
1703 core->core_run_state = CORE_RUNNING;
1705 if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
1707 core->vm_info->run_state = VM_RUNNING;
1708 core->vm_info->hvm_state.trans_state = HRT_IDLE;
1711 v3_counting_barrier(&core->vm_info->reset_barrier);
1714 PrintError(core->vm_info,core,"hvm: reset failed\n");
1721 // ROS core will be handled by normal reset functionality