2 * This file is part of the Palacios Virtual Machine Monitor developed
3 * by the V3VEE Project with funding from the United States National
4 * Science Foundation and the Department of Energy.
6 * The V3VEE Project is a joint project between Northwestern University
7 * and the University of New Mexico. You can find out more at
10 * Copyright (c) 2015, The V3VEE Project <http://www.v3vee.org>
11 * All rights reserved.
13 * Author: Peter Dinda <pdinda@northwestern.edu>
15 * This is free software. You are permitted to use,
16 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
19 #include <palacios/vmm_mem.h>
20 #include <palacios/vmm.h>
21 #include <palacios/vmm_util.h>
22 #include <palacios/vmm_emulator.h>
23 #include <palacios/vm_guest.h>
24 #include <palacios/vmm_debug.h>
25 #include <palacios/vmm_hypercall.h>
27 #include <palacios/vmm_xml.h>
29 #include <palacios/vm_guest_mem.h>
31 #include <palacios/vmm_debug.h>
36 MEM = Total size of memory in the GPA (in MB)
37 ROS_MEM = Total size of memory for the ROS (in MB) (<RAM)
39 GPAs [0,ROS_MEM) are what the ROS sees
40 GPAs [ROS_MEM, MEM) are HRT only
41 GPAS [0,MEM) are accessible by the HRT
43 CORES = Total number of cores in VM
44 ROS_CORES = Total number of cores for the ROS
46 Cores [0,ROS_CORES) are what the ROS sees
47 Cores [ROS_CORES,CORES) are HRT only
48 Cores [0,CORES) are accessible by the HRT
53 <file id="hrtelf" filename="hrtelf.o" />
56 <mem ... >RAM</mem> (MB) Note these are
57 <cores count="CORES" ...> backward compatible
60 <ros cores="ROS_CORES" mem="ROS_MEM" /> (MB)
61 <hrt file_id="hrtelf" /hrt>
66 #ifndef V3_CONFIG_DEBUG_HVM
68 #define PrintDebug(fmt, args...)
74 PrintDebug(VM_NONE,VCORE_NONE, "hvm: init\n");
80 PrintDebug(VM_NONE,VCORE_NONE, "hvm: deinit\n");
84 // ignore requests from when we are in the wrong state
85 #define ENFORCE_STATE_MACHINE 1
87 // invoke the HRT using a page fault instead of
88 // the SWINTR mechanism
89 #define USE_UPCALL_MAGIC_PF 1
90 #define UPCALL_MAGIC_ADDRESS 0x0000800df00df00dULL
91 #define UPCALL_MAGIC_ERROR 0xf00df00d
94 64 bit only hypercall:
96 rax = hypercall number
98 then args are: rcx, rdx, rsi, rdi r8, r9, r10, r11
101 static int hvm_hcall_handler(struct guest_info * core , hcall_id_t hcall_id, void * priv_data)
104 uint64_t bitness = core->vm_regs.rbx;
105 uint64_t a1 = core->vm_regs.rcx;
106 uint64_t a2 = core->vm_regs.rdx;
107 uint64_t a3 = core->vm_regs.rsi;
108 struct v3_vm_hvm *h = &core->vm_info->hvm_state;
111 if (bitness!=0x6464646464646464) {
112 PrintError(core->vm_info,core,"hvm: unable to handle non-64 bit hypercall\n");
113 core->vm_regs.rax = -1;
122 V3_Print(core->vm_info,core, "hvm: received hypercall %x rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n",
123 hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, core->hvm_state.last_boot_start, core->num_exits);
124 //v3_print_core_telemetry(core);
125 // v3_print_guest_state(core);
126 core->vm_regs.rax = 0;
129 case 0x1: // reset ros
130 PrintDebug(core->vm_info,core, "hvm: reset ROS\n");
131 if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ROS,0)) {
132 PrintError(core->vm_info,core, "hvm: reset of ROS failed\n");
133 core->vm_regs.rax = -1;
135 core->vm_regs.rax = 0;
139 case 0x2: // reset hrt
140 PrintDebug(core->vm_info,core, "hvm: reset HRT\n");
141 if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_HRT,0)) {
142 PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
143 core->vm_regs.rax = -1;
145 core->vm_regs.rax = 0;
149 case 0x3: // reset both
150 PrintDebug(core->vm_info,core, "hvm: reset ROS+HRT\n");
151 if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ALL,0)) {
152 PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
153 core->vm_regs.rax = -1;
155 core->vm_regs.rax = 0;
159 case 0x8: // replace HRT image
161 // a3 = size of image
162 PrintDebug(core->vm_info,core,"hvm: request replacement HRT image addr=0x%llx size=0x%llx\n",a2,a3);
166 V3_VFree(h->hrt_image);
170 h->hrt_image = V3_VMalloc(a3);
172 if (!(h->hrt_image)) {
173 PrintError(core->vm_info,core, "hvm: failed to allocate space for replacement image\n");
174 core->vm_regs.rax = -1;
176 if (v3_read_gva_memory(core, a2, a3, (uint8_t*) h->hrt_image)!=a3) {
177 PrintError(core->vm_info, core, "hvm: cannot read replacement image\n");
178 core->vm_regs.rax = -1;
180 h->hrt_image_size = a3;
181 core->vm_regs.rax = 0;
185 if (core->vm_regs.rax) {
186 PrintError(core->vm_info,core,"hvm: Failed to replace HRT image\n");
188 PrintDebug(core->vm_info,core,"hvm: HRT image successfully replaced\n");
193 case 0xf: // get HRT state
194 core->vm_regs.rax = h->trans_state;
195 if (v3_write_gva_memory(core, a2, sizeof(h->ros_event), (uint8_t*) &h->ros_event)!=sizeof(h->ros_event)) {
196 PrintError(core->vm_info, core, "hvm: cannot write back ROS event state to %p - continuing\n",(void*)a2);
198 //PrintDebug(core->vm_info,core,"hvm: get HRT transaction state 0x%llx\n",core->vm_regs.rax);
202 PrintDebug(core->vm_info, core, "hvm: ROS event request\n");
203 if (h->ros_event.event_type!=ROS_NONE) {
204 PrintError(core->vm_info, core, "hvm: ROS event is already in progress\n");
205 core->vm_regs.rax = -1;
207 if (v3_read_gva_memory(core, a2, sizeof(h->ros_event), (uint8_t*)&h->ros_event)!=sizeof(h->ros_event)) {
208 PrintError(core->vm_info, core, "hvm: cannot read ROS event from %p\n",(void*)a2);
209 core->vm_regs.rax = -1;
211 core->vm_regs.rax = 0;
218 PrintDebug(core->vm_info, core, "hvm: completion of ROS event (rc=0x%llx)\n",a2);
219 h->ros_event.event_type=ROS_NONE;
220 h->ros_event.last_ros_event_result = a2;
223 case 0x20: // invoke function (ROS->HRT)
224 case 0x21: // invoke parallel function (ROS->HRT)
225 if (v3_is_hvm_hrt_core(core)) {
226 PrintError(core->vm_info,core,"hvm: %s function invocation not supported from HRT core\n", a1==0x20 ? "" : "parallel");
227 core->vm_regs.rax = -1;
229 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) {
230 PrintError(core->vm_info,core, "hvm: cannot invoke %s function %p in state %d\n",a1==0x20 ? "" : "parallel", (void*)a2,h->trans_state);
231 core->vm_regs.rax = -1;
233 uint64_t *page = (uint64_t *) h->comm_page_hva;
234 uint64_t first, last, cur;
236 PrintDebug(core->vm_info,core, "hvm: %s invoke function %p\n",a1==0x20 ? "" : "parallel",(void*)a2);
241 first=last=h->first_hrt_core;
243 first=h->first_hrt_core;
244 last=core->vm_info->num_cores-1;
247 core->vm_regs.rax = 0;
249 h->trans_count = last-first+1;
251 for (cur=first;cur<=last;cur++) {
253 #if USE_UPCALL_MAGIC_PF
254 PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",cur);
255 core->vm_info->cores[cur].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
256 if (v3_raise_exception_with_error(&core->vm_info->cores[cur],
258 UPCALL_MAGIC_ERROR)) {
259 PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",cur);
260 core->vm_regs.rax = -1;
264 PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %llu\n",h->hrt_int_vector,cur);
265 if (v3_raise_swintr(&core->vm_info->cores[cur],h->hrt_int_vector)) {
266 PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",cur);
267 core->vm_regs.rax = -1;
271 // Force core to exit now
272 v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
275 if (core->vm_regs.rax==0) {
277 h->trans_state = HRT_CALL;
279 h->trans_state = HRT_PARCALL;
282 PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
283 h->trans_state = HRT_IDLE;
291 case 0x28: // setup for synchronous operation (ROS->HRT)
292 case 0x29: // teardown for synchronous operation (ROS->HRT)
293 if (v3_is_hvm_hrt_core(core)) {
294 PrintError(core->vm_info,core,"hvm: %ssynchronization invocation not supported from HRT core\n",a1==0x29 ? "de" : "");
295 core->vm_regs.rax = -1;
297 if (ENFORCE_STATE_MACHINE &&
298 ((a1==0x28 && h->trans_state!=HRT_IDLE) || (a1==0x29 && h->trans_state!=HRT_SYNC))) {
299 PrintError(core->vm_info,core, "hvm: cannot invoke %ssynchronization in state %d\n",a1==0x29 ? "de" : "", h->trans_state);
300 core->vm_regs.rax = -1;
302 uint64_t *page = (uint64_t *) h->comm_page_hva;
303 uint64_t first, last, cur;
305 PrintDebug(core->vm_info,core, "hvm: invoke %ssynchronization on address %p\n",a1==0x29 ? "de" : "",(void*)a2);
309 first=last=h->first_hrt_core; // initially we will sync only with BSP
311 core->vm_regs.rax = 0;
313 h->trans_count = last-first+1;
315 for (cur=first;cur<=last;cur++) {
317 #if USE_UPCALL_MAGIC_PF
318 PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",cur);
319 core->vm_info->cores[cur].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
320 if (v3_raise_exception_with_error(&core->vm_info->cores[cur],
322 UPCALL_MAGIC_ERROR)) {
323 PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",cur);
324 core->vm_regs.rax = -1;
328 PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %llu\n",h->hrt_int_vector,cur);
329 if (v3_raise_swintr(&core->vm_info->cores[cur],h->hrt_int_vector)) {
330 PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",cur);
331 core->vm_regs.rax = -1;
335 // Force core to exit now
336 v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
339 if (core->vm_regs.rax==0) {
341 h->trans_state = HRT_SYNCSETUP;
343 h->trans_state = HRT_SYNCTEARDOWN;
346 PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
347 h->trans_state = HRT_IDLE;
354 case 0x2f: // function exec or sync done
355 if (v3_is_hvm_ros_core(core)) {
356 PrintError(core->vm_info,core, "hvm: request for exec or sync done from ROS core\n");
357 core->vm_regs.rax=-1;
359 if (ENFORCE_STATE_MACHINE &&
360 h->trans_state!=HRT_CALL &&
361 h->trans_state!=HRT_PARCALL &&
362 h->trans_state!=HRT_SYNCSETUP &&
363 h->trans_state!=HRT_SYNCTEARDOWN) {
364 PrintError(core->vm_info,core,"hvm: function or sync completion when not in HRT_CALL, HRT_PARCALL, HRT_SYNCSETUP, or HRT_SYNCTEARDOWN state\n");
365 core->vm_regs.rax=-1;
368 PrintDebug(core->vm_info,core, "hvm: function or sync complete\n");
369 if (__sync_fetch_and_sub(&h->trans_count,one)==1) {
370 // last one, switch state
371 if (h->trans_state==HRT_SYNCSETUP) {
372 h->trans_state=HRT_SYNC;
373 PrintDebug(core->vm_info,core, "hvm: function complete - now synchronous\n");
375 h->trans_state=HRT_IDLE;
384 case 0x30: // merge address space
385 case 0x31: // unmerge address space
386 if (v3_is_hvm_hrt_core(core)) {
387 PrintError(core->vm_info,core,"hvm: request to %smerge address space from HRT core\n", a1==0x30 ? "" : "un");
388 core->vm_regs.rax=-1;
390 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) {
391 PrintError(core->vm_info,core,"hvm: request to %smerge address space in non-idle state\n",a1==0x30 ? "" : "un");
392 core->vm_regs.rax=-1;
394 uint64_t *page = (uint64_t *) h->comm_page_hva;
396 PrintDebug(core->vm_info,core,"hvm: %smerge address space request with %p\n",a1==0x30 ? "" : "un",(void*)core->ctrl_regs.cr3);
397 // should sanity check to make sure guest is in 64 bit without anything strange
400 page[1] = core->ctrl_regs.cr3; // this is a do-not-care for an unmerge
402 core->vm_regs.rax = 0;
403 #if USE_UPCALL_MAGIC_PF
404 PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %u\n",h->first_hrt_core);
405 core->vm_info->cores[h->first_hrt_core].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
406 if (v3_raise_exception_with_error(&core->vm_info->cores[h->first_hrt_core],
408 UPCALL_MAGIC_ERROR)) {
409 PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %u\n",h->first_hrt_core);
410 core->vm_regs.rax = -1;
414 PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %u\n",h->hrt_int_vector,h->first_hrt_core);
415 if (v3_raise_swintr(&core->vm_info->cores[h->first_hrt_core],h->hrt_int_vector)) {
416 PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %u\n",h->first_hrt_core);
417 core->vm_regs.rax = -1;
420 // Force core to exit now
421 v3_interrupt_cpu(core->vm_info,core->vm_info->cores[h->first_hrt_core].pcpu_id,0);
423 h->trans_state = HRT_MERGE;
431 case 0x3f: // merge operation done
432 if (v3_is_hvm_ros_core(core)) {
433 PrintError(core->vm_info,core, "hvm: request for merge done from ROS core\n");
434 core->vm_regs.rax=-1;
436 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_MERGE) {
437 PrintError(core->vm_info,core,"hvm: merge/unmerge done when in non-idle state\n");
438 core->vm_regs.rax=-1;
440 PrintDebug(core->vm_info,core, "hvm: merge or unmerge complete - back to idle\n");
441 h->trans_state=HRT_IDLE;
449 PrintError(core->vm_info,core,"hvm: unknown hypercall %llx\n",a1);
450 core->vm_regs.rax=-1;
457 #define CEIL_DIV(x,y) (((x)/(y)) + !!((x)%(y)))
// VM-level HVM initialization, driven by the <hvm> XML configuration subtree
// (see the format sketch at the top of the file). Defaults to "not HVM":
// first_hrt_core = num_cores and first_hrt_gpa = mem_size, i.e. all
// hardware belongs to the ROS. Returns 0 in the visible paths.
// NOTE(review): extract is missing lines (declarations of enable/ros_cores/
// ros_mem/hrt_file_id, braces, returns) relative to the full file.
459 int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
461 v3_cfg_tree_t *hvm_config;
462 v3_cfg_tree_t *ros_config;
463 v3_cfg_tree_t *hrt_config;
469 PrintDebug(vm, VCORE_NONE, "hvm: vm init\n");
// Start from a clean, non-HVM state.
474 memset(&vm->hvm_state,0,sizeof(struct v3_vm_hvm));
475 vm->hvm_state.is_hvm=0;
476 vm->hvm_state.first_hrt_core=vm->num_cores;
477 vm->hvm_state.first_hrt_gpa=vm->mem_size;
// No <hvm> subtree, or enable!="y": the VM is a pure ROS VM.
479 if (!config || !(hvm_config=v3_cfg_subtree(config,"hvm"))) {
480 PrintDebug(vm,VCORE_NONE,"hvm: no HVM configuration found (all HW is ROS)\n");
484 if (!(enable=v3_cfg_val(hvm_config,"enable")) || strcasecmp(enable,"y")) {
485 PrintDebug(vm,VCORE_NONE,"hvm: HVM configuration disabled (all HW is ROS)\n");
// <ros cores=... mem=...> is mandatory when HVM is enabled.
489 if (!(ros_config=v3_cfg_subtree(hvm_config,"ros"))) {
490 PrintError(vm,VCORE_NONE,"hvm: HVM configuration without ROS block...\n");
494 if (!(ros_cores=v3_cfg_val(ros_config,"cores"))) {
495 PrintError(vm,VCORE_NONE,"hvm: ROS block without cores...\n");
// Cores [0,ROS_CORES) are ROS; the first HRT core is ROS_CORES.
499 vm->hvm_state.first_hrt_core = ((uint32_t)atoi(ros_cores));
501 if (!(ros_mem=v3_cfg_val(ros_config,"mem"))) {
502 PrintError(vm,VCORE_NONE,"hvm: ROS block without mem...\n");
// ROS memory size is given in MB; convert to bytes for the GPA split.
506 vm->hvm_state.first_hrt_gpa = ((uint64_t)atoi(ros_mem))*1024*1024;
// <hrt file_id=...> names the registered file holding the HRT image.
508 if (!(hrt_config=v3_cfg_subtree(hvm_config,"hrt"))) {
509 PrintError(vm,VCORE_NONE,"hvm: HVM configuration without HRT block...\n");
513 if (!(hrt_file_id=v3_cfg_val(hrt_config,"file_id"))) {
514 PrintError(vm,VCORE_NONE,"hvm: HRT block without file_id...\n");
518 vm->hvm_state.hrt_file = v3_cfg_get_file(vm,hrt_file_id);
520 if (!vm->hvm_state.hrt_file) {
521 PrintError(vm,VCORE_NONE,"hvm: HRT block contains bad file_id (%s)\n",hrt_file_id);
// Register the hypercall handled by hvm_hcall_handler above.
525 if (v3_register_hypercall(vm, HVM_HCALL,
526 hvm_hcall_handler, 0)) {
527 PrintError(vm,VCORE_NONE, "hvm: cannot register hypercall....\n");
531 // XXX sanity check config here
533 vm->hvm_state.is_hvm=1;
// Summarize the resulting ROS/HRT core and memory split.
536 if (vm->hvm_state.is_hvm) {
537 V3_Print(vm,VCORE_NONE,"hvm: [ROS: cores 0..%u, mem 0..%p] [HRT: cores %u..%u, mem %p..%p, file_id=%s (tag %s)]\n",
538 vm->hvm_state.first_hrt_core-1,
539 (void*) vm->hvm_state.first_hrt_gpa-1,
540 vm->hvm_state.first_hrt_core,
542 (void*) vm->hvm_state.first_hrt_gpa,
543 (void*)vm->mem_size-1,
545 vm->hvm_state.hrt_file->tag);
547 V3_Print(vm,VCORE_NONE,"hvm: This is a pure ROS VM\n");
// VM-level HVM teardown: frees any replacement HRT image, removes the
// HVM hypercall, and deletes the comm-page shadow memory region (if one
// was mapped by configure_hrt). NOTE(review): extract is missing lines.
554 int v3_deinit_hvm_vm(struct v3_vm_info *vm)
556 PrintDebug(vm, VCORE_NONE, "hvm: HVM VM deinit\n");
// Free the replacement HRT image installed via hypercall 0x8, if any.
558 if (vm->hvm_state.hrt_image) {
559 V3_VFree(vm->hvm_state.hrt_image);
560 vm->hvm_state.hrt_image=0;
561 vm->hvm_state.hrt_image_size=0;
564 v3_remove_hypercall(vm,HVM_HCALL);
// Tear down the ROS<->HRT communication page mapping.
566 if (vm->hvm_state.comm_page_hpa) {
567 struct v3_mem_region *r = v3_get_mem_region(vm,-1,(addr_t)vm->hvm_state.comm_page_hpa);
569 PrintError(vm,VCORE_NONE,"hvm: odd, VM has comm_page_hpa, but no shadow memory\n");
571 v3_delete_mem_region(vm,r);
// Per-core HVM initialization: zero the core's HVM state and, on an HVM VM,
// mark the core as an HRT core if its vcpu_id is at or past the ROS/HRT split.
578 int v3_init_hvm_core(struct guest_info *core)
580 memset(&core->hvm_state,0,sizeof(core->hvm_state));
581 if (core->vm_info->hvm_state.is_hvm) {
582 if (core->vcpu_id >= core->vm_info->hvm_state.first_hrt_core) {
583 core->hvm_state.is_hrt=1;
// Per-core HVM teardown; nothing to release in the visible code.
589 int v3_deinit_hvm_core(struct guest_info *core)
591 PrintDebug(core->vm_info, VCORE_NONE, "hvm: HVM core deinit\n");
// Bytes of guest-physical memory visible to the ROS: the split point
// (first_hrt_gpa) for an HVM VM. NOTE(review): the non-HVM return is
// outside this extract — presumably vm->mem_size; verify in full source.
597 uint64_t v3_get_hvm_ros_memsize(struct v3_vm_info *vm)
599 if (vm->hvm_state.is_hvm) {
600 return vm->hvm_state.first_hrt_gpa;
605 uint64_t v3_get_hvm_hrt_memsize(struct v3_vm_info *vm)
// Number of cores visible to the ROS: the split point for an HVM VM,
// otherwise every core in the VM.
610 uint32_t v3_get_hvm_ros_cores(struct v3_vm_info *vm)
612 if (vm->hvm_state.is_hvm) {
613 return vm->hvm_state.first_hrt_core;
615 return vm->num_cores;
// Number of HRT-only cores: those above the split for an HVM VM.
// NOTE(review): the non-HVM return is outside this extract — presumably 0.
619 uint32_t v3_get_hvm_hrt_cores(struct v3_vm_info *vm)
621 if (vm->hvm_state.is_hvm) {
622 return vm->num_cores - vm->hvm_state.first_hrt_core;
// True iff the GPA lies in the ROS-visible region [0, first_hrt_gpa)
// of an HVM VM. NOTE(review): the non-HVM return is outside this extract.
629 int v3_is_hvm_ros_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
631 if (vm->hvm_state.is_hvm) {
632 return gpa<vm->hvm_state.first_hrt_gpa;
// True iff the GPA lies in the HRT-only region [first_hrt_gpa, mem_size)
// of an HVM VM. NOTE(review): the non-HVM return is outside this extract.
638 int v3_is_hvm_hrt_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
640 if (vm->hvm_state.is_hvm) {
641 return gpa>=vm->hvm_state.first_hrt_gpa && gpa<vm->mem_size;
// True iff this core was marked as an HRT core in v3_init_hvm_core.
647 int v3_is_hvm_hrt_core(struct guest_info *core)
649 return core->hvm_state.is_hrt;
// True iff this core belongs to the ROS (i.e., is not an HRT core).
652 int v3_is_hvm_ros_core(struct guest_info *core)
654 return !core->hvm_state.is_hrt;
// IPI delivery policy. With no source core (ioapic/msi-originated), deliver
// only to ROS cores. Core-to-core: HRT may signal anyone; ROS may signal
// only ROS. NOTE(review): the `if (!src)` guard separating the two returns
// is outside this extract — confirm against the full source.
657 int v3_hvm_should_deliver_ipi(struct guest_info *src, struct guest_info *dest)
660 // ioapic or msi to apic
661 return !dest->hvm_state.is_hrt;
664 return src->hvm_state.is_hrt || (!src->hvm_state.is_hrt && !dest->hvm_state.is_hrt) ;
// Report the range of APICs visible to a given observer. When core==NULL
// (ioapic/msi view), an HVM VM exposes only the ROS APICs; otherwise an HRT
// core sees all APICs while a ROS core sees only ROS APICs.
// NOTE(review): the `*start_apic = 0` assignments and the core==NULL guard
// are outside this extract.
668 void v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_info *vm,
669 uint32_t *start_apic, uint32_t *num_apics)
672 // Seen from ioapic, msi, etc:
673 if (vm->hvm_state.is_hvm) {
674 // HVM VM shows only the ROS cores/apics to ioapic, msi, etc
676 *num_apics = vm->hvm_state.first_hrt_core;
678 // Non-HVM shows all cores/APICs to apic, msi, etc.
680 *num_apics = vm->num_cores;
684 if (core->hvm_state.is_hrt) {
685 // HRT core/apic sees all apics
686 // (this policy may change...)
688 *num_apics = vm->num_cores;
690 // non-HRT core/apic sees only non-HRT cores/apics
692 *num_apics = vm->hvm_state.first_hrt_core;
697 #define MAX(x,y) ((x)>(y)?(x):(y))
698 #define MIN(x,y) ((x)<(y)?(x):(y))
// Top of guest-physical memory, page-aligned; the HRT boot structures
// (null handler, IDT, GDT, TSS, page tables, MB info) are laid out
// downward from this address.
701 static uint64_t boot_state_end_addr(struct v3_vm_info *vm)
703 return PAGE_ADDR(vm->mem_size);
// The null interrupt handler occupies the last page below the boot-state end.
706 static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
708 *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - PAGE_SIZE);
712 extern v3_cpu_arch_t v3_mach_type;
714 extern void *v3_hvm_svm_null_int_handler_start;
715 extern void *v3_hvm_svm_null_int_handler_end;
716 extern void *v3_hvm_vmx_null_int_handler_start;
717 extern void *v3_hvm_vmx_null_int_handler_end;
// Copy the architecture-appropriate (SVM vs VMX) null interrupt handler
// stub into its reserved guest page. The stubs are linked into the VMM as
// the v3_hvm_{svm,vmx}_null_int_handler_{start,end} symbols declared above.
// NOTE(review): other CPU-type cases and the error return are missing from
// this extract.
719 static void write_null_int_handler(struct v3_vm_info *vm)
726 get_null_int_handler_loc(vm,&base,&limit);
728 switch (v3_mach_type) {
731 case V3_SVM_REV3_CPU:
732 data = (void*) &v3_hvm_svm_null_int_handler_start;
733 len = (void*) &v3_hvm_svm_null_int_handler_end - data;
739 case V3_VMX_EPT_UG_CPU:
740 data = (void*) &v3_hvm_vmx_null_int_handler_start;
741 len = (void*) &v3_hvm_vmx_null_int_handler_end - data;
745 PrintError(vm,VCORE_NONE,"hvm: cannot determine CPU type to select null interrupt handler...\n");
// Write the stub into guest-physical memory at its reserved page.
751 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base),len,(uint8_t*)data);
754 PrintDebug(vm,VCORE_NONE,"hvm: wrote null interrupt handler at %p (%llu bytes)\n",base,len);
// The IDT occupies the second page below the boot-state end (just under
// the null interrupt handler page).
758 static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
760 *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - 2 * PAGE_SIZE);
764 // default IDT entries (int and trap gates)
766 // Format is 16 bytes long:
768 // 16 selector => (target code selector) => 0x8 // entry 1 of GDT
769 // 3 ist => (stack) = 0 => current stack
771 // 4 type => 0xe=>INT, 0xf=>TRAP
772 // 1 reserved => 0 (indicates "system" by being zero)
776 // 32 offsethigh => 0 (total is a 64 bit offset)
779 // 00 00 | 08 00 | 00 | 8[typenybble] | offsetmid | offsethigh | reserved
781 // Note little endian
783 static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 } ;
784 static uint64_t idt64_int_gate_entry_mask[2] = { 0x00008e0000080000, 0x0 };
// Build a default 64-bit IDT in guest memory: every vector points at the
// null interrupt handler (adjusted by gva_offset), using the trap-gate and
// int-gate templates defined above. Per the visible loop bounds, vectors
// 32..255 get int gates; the earlier (partially elided) loop writes trap
// gates — presumably for vectors 0..31; confirm against the full source.
786 static void write_idt(struct v3_vm_info *vm)
791 uint64_t handler_len;
793 uint64_t trap_gate[2];
794 uint64_t int_gate[2];
796 get_idt_loc(vm,&base,&limit);
798 get_null_int_handler_loc(vm,&handler,&handler_len);
// The handler is reached through the HRT's virtual mapping.
800 handler += vm->hvm_state.gva_offset;
// Start from the 16-byte gate templates (selector/type bits preset).
802 memcpy(trap_gate,idt64_trap_gate_entry_mask,16);
803 memcpy(int_gate,idt64_int_gate_entry_mask,16);
806 // update the entries for the handler location
// Scatter the 64-bit handler address into the gate's split offset fields
// (see the gate layout comment above this function).
810 hand = (uint8_t*) &handler;
812 mask = (uint8_t *)trap_gate;
813 memcpy(&(mask[0]),&(hand[0]),2); // offset low
814 memcpy(&(mask[6]),&(hand[2]),2); // offset med
815 memcpy(&(mask[8]),&(hand[4]),4); // offset high
817 mask = (uint8_t *)int_gate;
818 memcpy(&(mask[0]),&(hand[0]),2); // offset low
819 memcpy(&(mask[6]),&(hand[2]),2); // offset med
820 memcpy(&(mask[8]),&(hand[4]),4); // offset high
822 PrintDebug(vm,VCORE_NONE,"hvm: Adding default null trap and int gates\n");
826 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)trap_gate);
829 for (i=32;i<256;i++) {
830 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)int_gate);
833 PrintDebug(vm,VCORE_NONE,"hvm: wrote IDT at %p\n",base);
// The GDT occupies the third page below the boot-state end.
838 static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
840 *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 3 * PAGE_SIZE);
844 static uint64_t gdt64[3] = {
845 0x0000000000000000, /* null */
846 0x00a09a0000000000, /* code (note lme bit) */
847 0x00a0920000000000, /* data (most entries don't matter) */
// Copy the minimal 3-entry long-mode GDT (null / code / data, defined in
// gdt64 above) into its reserved guest page.
850 static void write_gdt(struct v3_vm_info *vm)
855 get_gdt_loc(vm,&base,&limit);
856 v3_write_gpa_memory(&vm->cores[0],(addr_t)base,limit,(uint8_t*) gdt64);
858 PrintDebug(vm,VCORE_NONE,"hvm: wrote GDT at %p\n",base);
// The TSS occupies the fourth page below the boot-state end.
863 static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
865 *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 4 * PAGE_SIZE);
// Zero-fill the reserved TSS page in guest memory (an all-zero TSS
// suffices for the HRT's minimal boot environment).
869 static void write_tss(struct v3_vm_info *vm)
874 get_tss_loc(vm,&base,&limit);
876 v3_set_gpa_memory(&vm->cores[0],(addr_t)base,limit,0);
878 PrintDebug(vm,VCORE_NONE,"hvm: wrote TSS at %p\n",base);
882 #define TOP_HALF_START 0xffff800000000000ULL
883 #define BOTTOM_HALF_END 0x00007fffffffffffULL
886 #define L4_UNIT PAGE_SIZE
887 #define L3_UNIT (512ULL * L4_UNIT)
888 #define L2_UNIT (512ULL * L3_UNIT)
889 #define L1_UNIT (512ULL * L2_UNIT)
// Compute how many page-table pages each paging level needs to map
// [0, max_mem_mapped) with 4KB pages: CEIL_DIV of the covered range by the
// per-table coverage (512 entries per table). l1 = PML4 count (the elided
// line presumably sets *l1 = 1), l2 = PDP, l3 = PD, l4 = PT counts.
// The mapping starts at either 0 or TOP_HALF_START, so the counts are the
// same regardless of gva_offset (see the comment lines below).
891 static void compute_pts_4KB(struct v3_vm_info *vm,
892 uint64_t *l1, uint64_t *l2, uint64_t *l3, uint64_t *l4)
895 // we map the physical memory up to max_mem_mapped either at 0x0 or at TOP_HALF start
896 // that is, it either fills in the first 256 rows of PML4 or the last 256 rows of PML4
897 // so it is the same number of page tables regardless
899 uint64_t max_gva = vm->hvm_state.max_mem_mapped;
902 *l2 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*512ULL*4096ULL),512);
903 *l3 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*4096ULL),512);
904 *l4 = CEIL_DIV(CEIL_DIV(max_gva,4096ULL),512);
910 PTS MAP using 1 GB pages
911 n second levels pts, highest gva, highest address
917 PTS MAP using 2 MB pages
918 n third level pts, highest gva, highest address
919 m second level pts, highest gva, highest address
924 PTS MAP using 4 KB pages
925 n 4th level, highest gva, highest address
926 m 3rd level, highest gva, highest address
927 l second level, highest gva, highest address
931 PTS MAP using 512 GB pages when this becomes available
// Locate the HRT page tables: how many PT pages are needed depends on the
// page size the HRT requested (hrt_flags), per the PTS MAP sketches above.
// The tables sit immediately below the 4 fixed boot pages (handler, IDT,
// GDT, TSS). NOTE(review): the 512GB/1GB num_pt assignments are missing
// from this extract.
938 static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
940 uint64_t l1,l2,l3,l4;
941 compute_pts_4KB(vm,&l1,&l2,&l3,&l4);
943 if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
945 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
947 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
// 2MB pages need PML4+PDP+PD levels; 4KB pages additionally need PTs.
948 num_pt = l1 + l2 + l3;
949 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) {
950 num_pt = l1 + l2 + l3 + l4;
952 PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT location flags=0x%llx memsize=0x%llx\n",vm->hvm_state.hrt_flags,(uint64_t)vm->mem_size);
// Tables live just below the 4 fixed boot-state pages.
956 *base = (void*)PAGE_ADDR(boot_state_end_addr(vm)-(4+num_pt)*PAGE_SIZE);
957 *limit = num_pt*PAGE_SIZE;
// Build the HRT's identity-style page tables in guest memory, mapping
// GVA [min_gva, min_gva+max_mem_mapped) to GPA [0, max_mem_mapped), at the
// page size the HRT requested (1GB / 2MB / 4KB — 512GB is rejected). At the
// terminal level for the chosen page size, entries point directly at GPAs
// (large_page set); otherwise they point at the next level's table pages.
// Each level's tables are edited through v3_gpa_to_hva, i.e. directly in
// host memory backing the guest pages.
// NOTE(review): this extract is missing many lines (max_level assignments,
// loop headers' middle clauses, present-bit sets, braces); comments
// describe only the visible structure.
960 static void write_pts(struct v3_vm_info *vm)
963 uint64_t num_l1, num_l2, num_l3, num_l4;
964 void *start_l1, *start_l2, *start_l3, *start_l4;
970 void *max_gpa = (void*) vm->hvm_state.max_mem_mapped;
971 void *min_gva = (void*) vm->hvm_state.gva_offset;
972 #ifdef V3_CONFIG_DEBUG_HVM
973 void *max_gva = min_gva+vm->hvm_state.max_mem_mapped;
976 uint64_t i_start,i_end;
978 struct pml4e64 *pml4e;
// Decide the deepest level to build from the HRT's requested page size.
983 if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
984 PrintError(vm,VCORE_NONE,"hvm: Attempt to build 512 GB pages\n");
986 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
988 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
990 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) {
993 PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT levels\n");
// Lay the levels out contiguously: PML4, then PDPs, PDs, PTs.
997 get_pt_loc(vm,&start_l1,&size);
998 compute_pts_4KB(vm,&num_l1,&num_l2,&num_l3,&num_l4);
1000 start_l2=start_l1+PAGE_SIZE*num_l1;
1001 start_l3=start_l2+PAGE_SIZE*num_l2;
1002 start_l4=start_l3+PAGE_SIZE*num_l3;
1004 PrintDebug(vm,VCORE_NONE,"hvm: writing %llu levels of PTs start at address %p\n", max_level,start_l1);
1005 PrintDebug(vm,VCORE_NONE,"hvm: min_gva=%p, max_gva=%p, min_gpa=%p, max_gpa=%p\n",min_gva,max_gva,min_gpa,max_gpa);
1006 PrintDebug(vm,VCORE_NONE,"hvm: num_l1=%llu, num_l2=%llu, num_l3=%llu, num_l4=%llu\n", num_l1, num_l2, num_l3, num_l4);
1007 PrintDebug(vm,VCORE_NONE,"hvm: start_l1=%p, start_l2=%p, start_l3=%p, start_l4=%p\n", start_l1, start_l2, start_l3, start_l4);
1011 // build PML4 (only one)
1012 if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pml4e)) {
1013 PrintError(vm,VCORE_NONE,"hvm: Cannot translate pml4 location\n");
1017 memset(pml4e,0,PAGE_SIZE);
// Low-half mapping fills PML4 slots from 0; top-half mapping from 256.
1020 i_start=0; i_end = num_l2;
1021 } else if (min_gva==(void*)TOP_HALF_START) {
1022 i_start=256; i_end=256+num_l2;
1024 PrintError(vm,VCORE_NONE,"hvm: unsupported gva offset\n");
1028 for (i=i_start, cur_gva=min_gva, cur_gpa=min_gpa;
1030 i++, cur_gva+=L1_UNIT, cur_gpa+=L1_UNIT) {
1033 pml4e[i].writable=1;
// No 512GB (PML4-level) large pages exist in hardware.
1036 PrintError(vm,VCORE_NONE,"hvm: Intel has not yet defined a PML4E large page\n");
1037 pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
1038 //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
// Non-terminal PML4 entries point at the corresponding PDP table page.
1040 pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(start_l2+(i-i_start)*PAGE_SIZE));
1041 //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
// --- PDP (L2) tables: terminal when building 1GB pages. ---
1052 for (cur_pt=start_l2, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
1054 cur_pt+=PAGE_SIZE, pt++) {
1057 if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pdpe)) {
1058 PrintError(vm,VCORE_NONE,"hvm: Cannot translate pdpe location\n");
1062 memset(pdpe,0,PAGE_SIZE);
1065 i<512 && cur_gpa<max_gpa;
1066 i++, cur_gva+=L2_UNIT, cur_gpa+=L2_UNIT) {
// Terminal: a 1GB large page mapping straight to the GPA.
1072 pdpe[i].large_page=1;
1073 pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
1074 //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
// Non-terminal: point at the corresponding PD table page.
1076 pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(start_l3+(pt*512+i)*PAGE_SIZE));
1077 //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
// --- PD (L3) tables: terminal when building 2MB pages. ---
1087 for (cur_pt=start_l3, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
1089 cur_pt+=PAGE_SIZE, pt++) {
1092 if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pde)) {
1093 PrintError(vm,VCORE_NONE,"hvm: Cannot translate pde location\n");
1097 memset(pde,0,PAGE_SIZE);
1100 i<512 && cur_gpa<max_gpa;
1101 i++, cur_gva+=L3_UNIT, cur_gpa+=L3_UNIT) {
1107 pde[i].large_page=1;
1108 pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
1109 //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t) pde[i].pt_base_addr);
1111 pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(start_l4+(pt*512+i)*PAGE_SIZE));
1112 //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pde[i].pt_base_addr);
// --- PT (L4) tables: always terminal 4KB mappings. ---
1124 for (cur_pt=start_l4, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
1126 cur_pt+=PAGE_SIZE, pt++) {
1129 if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pte)) {
1130 PrintError(vm,VCORE_NONE,"hvm: Cannot translate pte location\n");
1134 memset(pte,0,PAGE_SIZE);
1137 i<512 && cur_gpa<max_gpa;
1138 i++, cur_gva+=L4_UNIT, cur_gpa+=L4_UNIT) {
1142 pte[i].page_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
1143 //PrintDebug(vm,VCORE_NONE,"hvm: pte: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pte[i].page_base_addr);
// The multiboot info block sits relative to the page tables; this starts
// from get_pt_loc's result. NOTE(review): the adjustment lines that derive
// the final base/limit are missing from this extract.
1151 static void get_mb_info_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
1154 get_pt_loc(vm,base, limit);
// Fill in the HRT-specific multiboot tag from the VM's negotiated HVM state:
// APIC topology, CPU frequency, and the memory/comm-page/interrupt
// parameters established by configure_hrt.
1160 int v3_build_hrt_multiboot_tag(struct guest_info *core, mb_info_hrt_t *hrt)
1162 struct v3_vm_info *vm = core->vm_info;
1164 hrt->tag.type = MB_INFO_HRT_TAG;
1165 hrt->tag.size = sizeof(mb_info_hrt_t);
// APIC IDs: all cores are enumerated; HRT APICs start at the core split.
1167 hrt->total_num_apics = vm->num_cores;
1168 hrt->first_hrt_apic_id = vm->hvm_state.first_hrt_core;
1169 hrt->have_hrt_ioapic=0;
1170 hrt->first_hrt_ioapic_entry=0;
1172 hrt->cpu_freq_khz = V3_CPU_KHZ();
// Parameters negotiated with the HRT image (see configure_hrt).
1174 hrt->hrt_flags = vm->hvm_state.hrt_flags;
1175 hrt->max_mem_mapped = vm->hvm_state.max_mem_mapped;
1176 hrt->first_hrt_gpa = vm->hvm_state.first_hrt_gpa;
1177 hrt->gva_offset = vm->hvm_state.gva_offset;
1178 hrt->comm_page_gpa = vm->hvm_state.comm_page_gpa;
1179 hrt->hrt_int_vector = vm->hvm_state.hrt_int_vector;
// Render the multiboot info table into a 256-byte scratch buffer and copy
// it into the guest at the MB-info location (only for HRT_MBOOT64 HRTs).
// NOTE(review): the buf declaration and the size-vs-limit comparison line
// are missing from this extract.
1184 static void write_mb_info(struct v3_vm_info *vm)
1186 if (vm->hvm_state.hrt_type!=HRT_MBOOT64) {
1187 PrintError(vm, VCORE_NONE,"hvm: Cannot handle this HRT type\n");
1195 get_mb_info_loc(vm,&base,&limit);
1197 if ((size=v3_build_multiboot_table(&vm->cores[vm->hvm_state.first_hrt_core],buf,256))==-1) {
1198 PrintError(vm,VCORE_NONE,"hvm: Failed to build MB info\n");
1203 PrintError(vm,VCORE_NONE,"hvm: MB info is too large\n");
1207 v3_write_gpa_memory(&vm->cores[vm->hvm_state.first_hrt_core],
1212 PrintDebug(vm,VCORE_NONE, "hvm: wrote MB info at %p\n", base);
1216 #define SCRATCH_STACK_SIZE 4096
// Region available for the HRT image: from the first HRT GPA up to the
// bottom of the per-HRT-core scratch stacks carved out below the MB info
// block. Fails if the stacks would collide with the HRT itself.
1219 static void get_hrt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
1224 get_mb_info_loc(vm,&mb_base,&mb_limit);
// Reserve one SCRATCH_STACK_SIZE stack per HRT core below the MB info.
1226 mb_base-=SCRATCH_STACK_SIZE*v3_get_hvm_hrt_cores(vm);
1228 *base = (void*)PAGE_ADDR(vm->hvm_state.first_hrt_gpa);
1230 if (mb_base < *base+PAGE_SIZE) {
1231 PrintError(vm,VCORE_NONE,"hvm: HRT stack colides with HRT\n");
1234 *limit = mb_base - *base;
1238 #define ERROR(fmt, args...) PrintError(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
1239 #define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
1241 #define ELF_MAGIC 0x464c457f
1242 #define MB2_MAGIC 0xe85250d6
1244 #define MB2_INFO_MAGIC 0x36d76289
// True iff the buffer starts with the ELF magic (0x7f 'E' 'L' 'F').
// NOTE(review): the return statements are outside this extract.
1246 static int is_elf(uint8_t *data, uint64_t size)
1248 if (*((uint32_t*)data)==ELF_MAGIC) {
// Scan the first 32KB of the image (4-byte aligned, per the multiboot2
// spec's header placement rule) for the MB2 magic; return a pointer to the
// header within the buffer, or (in the elided tail) presumably NULL.
1255 static mb_header_t *find_mb_header(uint8_t *data, uint64_t size)
1257 uint64_t limit = size > 32768 ? 32768 : size;
1260 // Scan for the .boot magic cookie
1261 // must be in first 32K, assume 4 byte aligned
1262 for (i=0;i<limit;i+=4) {
1263 if (*((uint32_t*)&data[i])==MB2_MAGIC) {
1264 INFO("Found multiboot header at offset 0x%llx\n",i);
1265 return (mb_header_t *) &data[i];
// Validate the HRT's MB64_HRT tag request (paging model, mapped-memory
// ceiling, GVA offset/entry, comm page GPA, interrupt vector) and commit
// the accepted configuration into vm->hvm_state.  Also allocates and maps
// the one-page communication page on first use.
// Fix: on a failed v3_add_shadow_mem, free the host allocation
// (comm_page_hpa), not the guest physical address (comm_page_gpa).
1272 static int configure_hrt(struct v3_vm_info *vm, mb_data_t *mb)
1274 struct v3_vm_hvm *h = &vm->hvm_state;
// fields requested by the HRT image's multiboot MB64_HRT tag
1275 uint64_t f = mb->mb64_hrt->hrt_flags;
1276 uint64_t maxmap = mb->mb64_hrt->max_mem_to_map;
1277 uint64_t gvaoff = mb->mb64_hrt->gva_offset;
1278 uint64_t gvaentry = mb->mb64_hrt->gva_entry;
1279 uint64_t commgpa = mb->mb64_hrt->comm_page_gpa;
1280 uint8_t vec = mb->mb64_hrt->hrt_int_vector;
1283 PrintDebug(vm,VCORE_NONE,"hvm: HRT request: flags=0x%llx max_map=0x%llx gva_off=%llx gva_entry=%llx comm_page=0x%llx vector=0x%x\n",
1284 f, maxmap, gvaoff,gvaentry,commgpa, vec);
// always map at least 4 GB regardless of what was asked for
1286 if (maxmap<0x100000000ULL) {
1287 PrintDebug(vm,VCORE_NONE,"hvm: revising request up to 4 GB max map\n");
1288 maxmap=0x100000000ULL;
// choose the largest page size the HRT will accept (512GB unsupported)
1291 if (f & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
1292 PrintError(vm,VCORE_NONE,"hvm: support for 512 GB pages is not yet available in hardware\n");
1294 } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
1296 f |= MB_TAG_MB64_HRT_FLAG_MAP_1GB;
1297 h->max_mem_mapped = maxmap;
1298 PrintDebug(vm,VCORE_NONE,"hvm: 1 GB pages selected\n");
1299 } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
1301 f |= MB_TAG_MB64_HRT_FLAG_MAP_2MB;
1302 h->max_mem_mapped = maxmap;
1303 PrintDebug(vm,VCORE_NONE,"hvm: 2 MB pages selected\n");
1304 } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_4KB) {
1306 f |= MB_TAG_MB64_HRT_FLAG_MAP_4KB;
1307 h->max_mem_mapped = maxmap;
1308 PrintDebug(vm,VCORE_NONE,"hvm: 4 KB pages selected\n");
1310 PrintError(vm,VCORE_NONE,"hvm: no page table model is requested\n");
1314 if (f & MB_TAG_MB64_HRT_FLAG_RELOC) {
1315 PrintError(vm,VCORE_NONE,"hvm: relocatable hrt not currently supported\n");
1321 if (maxmap>h->max_mem_mapped) {
1322 PrintError(vm,VCORE_NONE,"hvm: requested 0x%llx bytes mapped, which is more than currently supported\n",maxmap);
// only identity (0) or top-half GVA offsets are supported
1326 if (gvaoff!=0 && gvaoff!=TOP_HALF_START) {
1327 PrintError(vm,VCORE_NONE,"hvm: currently only GVA offsets of 0 and %llx are supported\n", TOP_HALF_START);
1331 h->gva_offset = gvaoff;
1333 h->gva_entry = gvaentry;
// sanity-check that the image fits within the HRT portion of the GPA space
1335 if (mb->addr->load_addr < h->first_hrt_gpa) {
1336 PrintError(vm,VCORE_NONE,"hvm: load start address of HRT is below first HRT GPA\n");
1340 if (mb->addr->bss_end_addr > (vm->mem_size-(1024*1024*64))) {
1341 PrintError(vm,VCORE_NONE,"hvm: bss end address of HRT above last allowed GPA\n");
1346 PrintError(vm,VCORE_NONE,"hvm: cannot support vector %x\n",vec);
1350 h->hrt_int_vector = vec;
// comm page must live outside guest physical memory
1353 if (commgpa < vm->mem_size) {
1354 PrintError(vm,VCORE_NONE,"hvm: cannot map comm page over physical memory\n");
1358 h->comm_page_gpa = commgpa;
// allocate and map the comm page once; on later calls (reset) just clear it
1360 if (!h->comm_page_hpa) {
1361 if (!(h->comm_page_hpa=V3_AllocPages(1))) {
1362 PrintError(vm,VCORE_NONE,"hvm: unable to allocate space for comm page\n");
1366 h->comm_page_hva = V3_VAddr(h->comm_page_hpa);
1368 memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
1370 if (v3_add_shadow_mem(vm,-1,h->comm_page_gpa,h->comm_page_gpa+PAGE_SIZE_4KB,(addr_t)h->comm_page_hpa)) {
1371 PrintError(vm,VCORE_NONE,"hvm: unable to map communication page\n");
// free the host-side allocation made above (was wrongly freeing the GPA)
1372 V3_FreePages((void*)(h->comm_page_hpa),1);
1377 PrintDebug(vm,VCORE_NONE,"hvm: added comm page for first time\n");
1380 memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
1383 PrintDebug(vm,VCORE_NONE,"hvm: HRT configuration: flags=0x%llx max_mem_mapped=0x%llx gva_offset=0x%llx gva_entry=0x%llx comm_page=0x%llx vector=0x%x\n",
1384 h->hrt_flags,h->max_mem_mapped, h->gva_offset,h->gva_entry, h->comm_page_gpa, h->hrt_int_vector);
// Parse a multiboot HRT kernel image, apply its MB64_HRT configuration,
// copy the kernel into the HRT region of guest memory, and record the
// entry address (either the requested GVA entry or the image's entry
// point shifted by gva_offset).  Returns via elided error/success paths.
1390 static int setup_mb_kernel_hrt(struct v3_vm_info *vm, void *data, uint64_t size)
1394 if (v3_parse_multiboot_header(data, size, &mb)) {
1395 PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n");
1400 PrintError(vm,VCORE_NONE,"hvm: invalid HRT - there is no MB64_HRT tag\n");
1404 if (configure_hrt(vm,&mb)) {
1405 PrintError(vm,VCORE_NONE, "hvm: cannot configure HRT\n");
// place the kernel into [first_hrt_gpa, mem_size)
1409 if (v3_write_multiboot_kernel(vm,&mb,data,size,
1410 (void*)vm->hvm_state.first_hrt_gpa,
1411 vm->mem_size-vm->hvm_state.first_hrt_gpa)) {
1412 PrintError(vm,VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n");
// an explicit GVA entry from the HRT tag overrides the image entry point
1416 if (vm->hvm_state.gva_entry) {
1417 vm->hvm_state.hrt_entry_addr = vm->hvm_state.gva_entry;
1419 vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + vm->hvm_state.gva_offset;
1422 vm->hvm_state.hrt_type = HRT_MBOOT64;
// Select the HRT image source (a ROS-installed image takes priority over
// the configured file) and hand it to the multiboot setup path.  Only
// multiboot ELF kernels are accepted.
1429 static int setup_hrt(struct v3_vm_info *vm)
1434 // If the ROS has installed an image, it takes priority
1435 if (vm->hvm_state.hrt_image) {
1436 data = vm->hvm_state.hrt_image;
1437 size = vm->hvm_state.hrt_image_size;
// otherwise fall back to the HRT file given in the VM configuration
1439 data = vm->hvm_state.hrt_file->data;
1440 size = vm->hvm_state.hrt_file->size;
// must be an ELF image containing a Multiboot2 header
1443 if (is_elf(data,size) &&
1444 find_mb_header(data,size)) {
1446 PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n");
1447 if (setup_mb_kernel_hrt(vm,data,size)) {
1448 PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n");
1452 PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not a multiboot kernel\n");
1469 We do not touch the ROS portion of the address space.
1470 The HRT portion looks like:
1472 INT_HANDLER (1 page - page aligned)
1473 IDT (1 page - page aligned)
1474 GDT (1 page - page aligned)
1475 TSS (1 page - page aligned)
1476 PAGETABLES (identity map of first N GB)
1477 ROOT PT first (lowest memory addr), followed by 2nd level PTs in order,
1478 followed by 3rd level PTs in order, followed by 4th level
1481 SCRATCH_STACK_HRT_CORE0
1482 SCRATCH_STACK_HRT_CORE1
1484 SCRATCH_STACK_HRT_COREN
1486 HRT (as many pages as needed, page-aligned, starting at first HRT address)
// Lay out the HRT portion of the guest physical address space for boot:
// install the HRT image first (its size determines where everything else
// goes), then the supporting structures (null interrupt handler, etc. —
// several steps are in elided lines).  No-op for non-HVM VMs.
1494 int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm)
1496 if (!vm->hvm_state.is_hvm) {
1497 PrintDebug(vm,VCORE_NONE,"hvm: skipping HVM setup for boot as this is not an HVM\n");
1501 PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory begins\n");
1503 if (setup_hrt(vm)) {
1504 PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n");
1508 // the locations of all the other items are determined by
1509 // the HRT setup, so these must happen after
1511 write_null_int_handler(vm);
1518 // this must happen last
1521 PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory done\n");
1527 On entry for every core:
1529 IDTR points to stub IDT
1530 GDTR points to stub GDT
1531 TS points to stub TSS
1532 CR3 points to root page table
1534 EFER has LME AND LMA (and NX for compatibility with Linux)
1535 RSP is TOS of core's scratch stack (looks like a call)
1537 RAX = MB magic cookie
1538 RBX = address of multiboot info table
1539 RCX = this core id / apic id (0..N-1)
1540 RDX = this core id - first HRT core ID (==0 for the first HRT core)
1542 All addresses are virtual addresses, offset as needed by gva_offset
1544 Other regs are zeroed
1546 shadow/nested paging state reset for long mode
// Initialize an HRT core's architectural state so it enters the HRT
// directly in 64-bit long mode with paging on: zeroed registers, multiboot
// magic/info in RAX/RBX, core ids in RCX/RDX, a per-core scratch stack,
// stub IDT/GDT/TSS, and CR0/CR3/CR4/EFER set for long mode.  Non-HRT
// (ROS) cores are skipped.  All GVAs are shifted by gva_offset.
1549 int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
1553 uint64_t gva_offset;
// timestamp the start of this core's boot for later measurement
1555 rdtscll(core->hvm_state.last_boot_start);
1558 if (!core->hvm_state.is_hrt) {
1559 PrintDebug(core->vm_info,core,"hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id);
1564 PrintDebug(core->vm_info, core, "hvm: setting up HRT core (%u) for boot\n", core->vcpu_id);
1566 gva_offset = core->vm_info->hvm_state.gva_offset;
// start from a completely clean register/segment/MSR/FP state
1568 memset(&core->vm_regs,0,sizeof(core->vm_regs));
1569 memset(&core->ctrl_regs,0,sizeof(core->ctrl_regs));
1570 memset(&core->dbg_regs,0,sizeof(core->dbg_regs));
1571 memset(&core->segments,0,sizeof(core->segments));
1572 memset(&core->msrs,0,sizeof(core->msrs));
1573 memset(&core->fp_state,0,sizeof(core->fp_state));
1575 // We are in long mode with virtual memory and we want
1576 // to start immediately
1577 core->cpl = 0; // we are going right into the kernel
1578 core->cpu_mode = LONG;
1579 core->mem_mode = VIRTUAL_MEM;
1580 core->core_run_state = CORE_RUNNING ;
// multiboot entry contract: RAX = boot-info magic
1584 core->vm_regs.rax = MB2_INFO_MAGIC;
1586 // multiboot info pointer
1587 get_mb_info_loc(core->vm_info, &base,&limit);
1588 core->vm_regs.rbx = (uint64_t) base + gva_offset;
// RCX = absolute vcpu/apic id; RDX = id relative to the first HRT core
1591 core->vm_regs.rcx = core->vcpu_id;
1594 core->vm_regs.rdx = core->vcpu_id - core->vm_info->hvm_state.first_hrt_core;
1596 // Now point to scratch stack for this core
1597 // it begins at an offset relative to the MB info page
1598 get_mb_info_loc(core->vm_info, &base,&limit);
1599 base = base + gva_offset;
1600 base -= core->vm_regs.rdx * SCRATCH_STACK_SIZE;
1601 core->vm_regs.rsp = (v3_reg_t) base;
1602 core->vm_regs.rbp = (v3_reg_t) base-8;
1604 // push onto the stack a bad rbp and bad return address
1605 core->vm_regs.rsp-=16;
// write through the GPA (undo the gva_offset applied above)
1606 v3_set_gpa_memory(core,
1607 core->vm_regs.rsp-gva_offset,
// RIP: explicit GVA entry wins, else the recorded HRT entry + gva_offset
1613 get_hrt_loc(core->vm_info, &base,&limit);
1614 if (core->vm_info->hvm_state.gva_entry) {
1615 core->rip = core->vm_info->hvm_state.gva_entry;
1617 core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr + gva_offset;
1622 PrintDebug(core->vm_info,core,"hvm: hrt core %u has rip=%p, rsp=%p, rbp=%p, rax=%p, rbx=%p, rcx=%p, rdx=%p\n",
1623 (core->vcpu_id - core->vm_info->hvm_state.first_hrt_core),
1625 (void*)(core->vm_regs.rsp),
1626 (void*)(core->vm_regs.rbp),
1627 (void*)(core->vm_regs.rax),
1628 (void*)(core->vm_regs.rbx),
1629 (void*)(core->vm_regs.rcx),
1630 (void*)(core->vm_regs.rdx));
1632 // Setup CRs for long mode and our stub page table
// CR0: PG | PE (paging and protection enabled)
1634 core->ctrl_regs.cr0 = 0x80000001;
1635 core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0;
1637 // CR2: don't care (output from #PF)
1638 // CR3: set to our PML4E, without setting PCD or PWT
1639 get_pt_loc(core->vm_info, &base,&limit);
1640 core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base); // not offset as this is a GPA
1641 core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3;
1643 // CR4: PGE, PAE, PSE (last byte: 1 0 1 1 0 0 0 0)
1644 core->ctrl_regs.cr4 = 0xb0;
1645 core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4;
1647 // RFLAGS zeroed is fine: come in with interrupts off
1648 // EFER needs SVME LMA LME (last 16 bits: 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0)
1649 core->ctrl_regs.efer = 0x1d00;
1650 core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer;
1656 selector is 13 bits of index, 1 bit table indicator
1659 index is scaled by 8, even in long mode, where some entries
1660 are 16 bytes long....
1661 -> code, data descriptors have 8 byte format
1662 because base, limit, etc, are ignored (no segmentation)
1663 -> interrupt/trap gates have 16 byte format
1664 because offset needs to be 64 bits
1667 // Install our stub IDT
1668 get_idt_loc(core->vm_info, &base,&limit);
1670 core->segments.idtr.selector = 0; // entry 0 (NULL) of the GDT
1671 core->segments.idtr.base = (addr_t) base; // only base+limit are used
1672 core->segments.idtr.limit = limit-1;
1673 core->segments.idtr.type = 0x0;
1674 core->segments.idtr.system = 0;
1675 core->segments.idtr.dpl = 0;
1676 core->segments.idtr.present = 0;
1677 core->segments.idtr.long_mode = 0;
1679 // Install our stub GDT
1680 get_gdt_loc(core->vm_info, &base,&limit);
1682 core->segments.gdtr.selector = 0; // entry 0 (NULL) of the GDT
1683 core->segments.gdtr.base = (addr_t) base;
1684 core->segments.gdtr.limit = limit-1; // only base+limit are used
1685 core->segments.gdtr.type = 0x0;
1686 core->segments.gdtr.system = 0;
1687 core->segments.gdtr.dpl = 0;
1688 core->segments.gdtr.present = 0;
1689 core->segments.gdtr.long_mode = 0;
// Install our stub TSS via the task register
1692 get_tss_loc(core->vm_info, &base,&limit);
1694 core->segments.tr.selector = 0;
1695 core->segments.tr.base = (addr_t) base;
1696 core->segments.tr.limit = limit-1;
1697 core->segments.tr.type = 0x9;
1698 core->segments.tr.system = 0; // available 64 bit TSS
1699 core->segments.tr.dpl = 0;
1700 core->segments.tr.present = 1;
1701 core->segments.tr.long_mode = 0; // not used
1703 base = 0x0; // these are not offset as we want to make all gvas visible
// CS: flat 64-bit code segment (base/limit ignored in long mode)
1707 core->segments.cs.selector = 0x8 ; // entry 1 of GDT (RPL=0)
1708 core->segments.cs.base = (addr_t) base; // not used
1709 core->segments.cs.limit = limit; // not used
1710 core->segments.cs.type = 0xe; // only C is used
1711 core->segments.cs.system = 1; // not a system segment
1712 core->segments.cs.dpl = 0;
1713 core->segments.cs.present = 1;
1714 core->segments.cs.long_mode = 1;
1716 // DS, SS, etc are identical
1717 core->segments.ds.selector = 0x10; // entry 2 of GDT (RPL=0)
1718 core->segments.ds.base = (addr_t) base;
1719 core->segments.ds.limit = limit;
1720 core->segments.ds.type = 0x6; // ignored
1721 core->segments.ds.system = 1; // not a system segment
1722 core->segments.ds.dpl = 0;
1723 core->segments.ds.present = 1;
1724 core->segments.ds.long_mode = 1;
// all remaining data segments mirror DS
1726 memcpy(&core->segments.ss,&core->segments.ds,sizeof(core->segments.ds));
1727 memcpy(&core->segments.es,&core->segments.ds,sizeof(core->segments.ds));
1728 memcpy(&core->segments.fs,&core->segments.ds,sizeof(core->segments.ds));
1729 memcpy(&core->segments.gs,&core->segments.ds,sizeof(core->segments.ds));
1732 // reset paging here for shadow...
// HVM HRT cores are only supported with nested paging
1734 if (core->shdw_pg_mode != NESTED_PAGING) {
1735 PrintError(core->vm_info, core, "hvm: shadow paging guest... this will end badly\n");
1743 int v3_handle_hvm_reset(struct guest_info *core)
1746 if (core->core_run_state != CORE_RESETTING) {
1750 if (!core->vm_info->hvm_state.is_hvm) {
1754 if (v3_is_hvm_hrt_core(core)) {
1755 // this is an HRT reset
1758 // wait for all the HRT cores
1759 v3_counting_barrier(&core->vm_info->reset_barrier);
1761 if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
1763 core->vm_info->run_state = VM_RESETTING;
1766 core->core_run_state = CORE_RESETTING;
1768 if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
1769 // we really only need to clear the bss
1770 // and recopy the .data, but for now we'll just
1772 rc |= v3_setup_hvm_vm_for_boot(core->vm_info);
1775 PrintError(core->vm_info,core,"hvm: failed to setup HVM VM for boot rc=%d\n",rc);
1779 // now everyone is ready to reset
1780 rc |= v3_setup_hvm_hrt_core_for_boot(core);
1783 PrintError(core->vm_info,core,"hvm: failed to setup HVM core for boot rc=%d\n",rc);
1786 core->core_run_state = CORE_RUNNING;
1788 if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
1790 core->vm_info->run_state = VM_RUNNING;
1791 core->vm_info->hvm_state.trans_state = HRT_IDLE;
1794 v3_counting_barrier(&core->vm_info->reset_barrier);
1797 PrintError(core->vm_info,core,"hvm: reset failed\n");
1804 // ROS core will be handled by normal reset functionality