2 * This file is part of the Palacios Virtual Machine Monitor developed
3 * by the V3VEE Project with funding from the United States National
4 * Science Foundation and the Department of Energy.
6 * The V3VEE Project is a joint project between Northwestern University
7 * and the University of New Mexico. You can find out more at
10 * Copyright (c) 2015, The V3VEE Project <http://www.v3vee.org>
11 * All rights reserved.
13 * Author: Peter Dinda <pdinda@northwestern.edu>
15 * This is free software. You are permitted to use,
16 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
19 #include <palacios/vmm_mem.h>
20 #include <palacios/vmm.h>
21 #include <palacios/vmm_util.h>
22 #include <palacios/vmm_emulator.h>
23 #include <palacios/vm_guest.h>
24 #include <palacios/vmm_debug.h>
25 #include <palacios/vmm_hypercall.h>
27 #include <palacios/vmm_xml.h>
29 #include <palacios/vm_guest_mem.h>
31 #include <palacios/vmm_debug.h>
36 MEM = Total size of memory in the GPA (in MB)
37 ROS_MEM = Total size of memory for the ROS (in MB) (<RAM)
39 GPAs [0,ROS_MEM) are what the ROS sees
40 GPAs [ROS_MEM, MEM) are HRT only
41 GPAS [0,MEM) are accessible by the HRT
43 CORES = Total number of cores in VM
44 ROS_CORES = Total number of cores for the ROS
46 Cores [0,ROS_CORES) are what the ROS sees
47 Cores [ROS_CORES,CORES) are HRT only
48 Cores [0,CORES) are accessible by the HRT
53 <file id="hrtelf" filename="hrtelf.o" />
56 <mem ... >RAM</mem> (MB) Note these are
57 <cores count="CORES" ...> backward compatible
60 <ros cores="ROS_CORES" mem="ROS_MEM" /> (MB)
61 <hrt file_id="hrtelf" /hrt>
// Compile out PrintDebug entirely unless HVM debugging was enabled in
// the build configuration.
66 #ifndef V3_CONFIG_DEBUG_HVM
68 #define PrintDebug(fmt, args...)
74 PrintDebug(VM_NONE,VCORE_NONE, "hvm: init\n");
80 PrintDebug(VM_NONE,VCORE_NONE, "hvm: deinit\n");
// When nonzero, hypercalls that arrive while the ROS<->HRT transaction
// state machine is in the wrong state are rejected instead of acted on.
84 // ignore requests from when we are in the wrong state
85 #define ENFORCE_STATE_MACHINE 1
// Upcalls into the HRT can be delivered either by injecting a "magic"
// page fault (CR2 set to the sentinel address below, with a sentinel
// error code) or by raising a software interrupt on the target core.
87 // invoke the HRT using a page fault instead of
88 // the SWINTR mechanism
89 #define USE_UPCALL_MAGIC_PF 1
90 #define UPCALL_MAGIC_ADDRESS 0x0000800df00df00dULL
91 #define UPCALL_MAGIC_ERROR 0xf00df00d
// Hypercall ABI summary (see hvm_hcall_handler below).
94 64 bit only hypercall:
96 rax = hypercall number
98 then args are: rcx, rdx, rsi, rdi r8, r9, r10, r11
// Central hypercall handler for HVM (hybrid ROS/HRT) VMs.  The guest
// puts the hypercall number in rax, a 64-bitness cookie in rbx, and
// arguments in rcx/rdx/rsi; status is returned to the guest in rax
// (0 = success, -1 = failure).
101 static int hvm_hcall_handler(struct guest_info * core , hcall_id_t hcall_id, void * priv_data)
// Decode register-passed arguments and grab the per-VM HVM state.
104 uint64_t bitness = core->vm_regs.rbx;
105 uint64_t a1 = core->vm_regs.rcx;
106 uint64_t a2 = core->vm_regs.rdx;
107 uint64_t a3 = core->vm_regs.rsi;
108 struct v3_vm_hvm *h = &core->vm_info->hvm_state;
// Only the 64-bit ABI is supported; the cookie is eight copies of 0x64.
111 if (bitness!=0x6464646464646464) {
112 PrintError(core->vm_info,core,"hvm: unable to handle non-64 bit hypercall\n");
113 core->vm_regs.rax = -1;
// Trace the hypercall ("c" is presumably a cycle count captured on an
// elided line above -- confirm against the full source).
122 V3_Print(core->vm_info,core, "hvm: received hypercall %x rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n",
123 hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, core->hvm_state.last_boot_start, core->num_exits);
124 //v3_print_core_telemetry(core);
125 // v3_print_guest_state(core);
// Default to success; individual cases overwrite rax on failure.
126 core->vm_regs.rax = 0;
// --- dispatch on the hypercall number ---
129 case 0x1: // reset ros
130 PrintDebug(core->vm_info,core, "hvm: reset ROS\n");
131 if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ROS,0)) {
132 PrintError(core->vm_info,core, "hvm: reset of ROS failed\n");
133 core->vm_regs.rax = -1;
135 core->vm_regs.rax = 0;
139 case 0x2: // reset hrt
140 PrintDebug(core->vm_info,core, "hvm: reset HRT\n");
141 if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_HRT,0)) {
142 PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
143 core->vm_regs.rax = -1;
145 core->vm_regs.rax = 0;
149 case 0x3: // reset both
150 PrintDebug(core->vm_info,core, "hvm: reset ROS+HRT\n");
151 if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ALL,0)) {
// NOTE(review): message says "HRT" but this is the ROS+HRT reset path.
152 PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
153 core->vm_regs.rax = -1;
155 core->vm_regs.rax = 0;
// Replace the stored HRT image with one supplied by the guest:
// a2 = GVA of the new image, a3 = its size in bytes.
159 case 0x8: // replace HRT image
161 // a3 = size of image
162 PrintDebug(core->vm_info,core,"hvm: request replacement HRT image addr=0x%llx size=0x%llx\n",a2,a3);
// Release any previously installed replacement before allocating anew.
166 V3_VFree(h->hrt_image);
170 h->hrt_image = V3_VMalloc(a3);
172 if (!(h->hrt_image)) {
173 PrintError(core->vm_info,core, "hvm: failed to allocate space for replacement image\n");
174 core->vm_regs.rax = -1;
// Copy the image out of guest memory; partial reads are failures.
176 if (v3_read_gva_memory(core, a2, a3, (uint8_t*) h->hrt_image)!=a3) {
177 PrintError(core->vm_info, core, "hvm: cannot read replacement image\n");
178 core->vm_regs.rax = -1;
180 h->hrt_image_size = a3;
181 core->vm_regs.rax = 0;
185 if (core->vm_regs.rax) {
186 PrintError(core->vm_info,core,"hvm: Failed to replace HRT image\n");
188 PrintDebug(core->vm_info,core,"hvm: HRT image successfully replaced\n");
// Report the transaction state in rax and copy the current ROS event
// record back to the guest buffer at GVA a2 (failure to write is
// logged but not fatal).
193 case 0xf: // get HRT state
194 core->vm_regs.rax = h->trans_state;
195 if (v3_write_gva_memory(core, a2, sizeof(h->ros_event), (uint8_t*) &h->ros_event)!=sizeof(h->ros_event)) {
196 PrintError(core->vm_info, core, "hvm: cannot write back ROS event state to %p - continuing\n",(void*)a2);
198 //PrintDebug(core->vm_info,core,"hvm: get HRT transaction state 0x%llx\n",core->vm_regs.rax);
// Accept a new ROS event request (read from guest GVA a2); only one
// event may be outstanding at a time.
202 PrintDebug(core->vm_info, core, "hvm: ROS event request\n");
203 if (h->ros_event.event_type!=ROS_NONE) {
204 PrintError(core->vm_info, core, "hvm: ROS event is already in progress\n");
205 core->vm_regs.rax = -1;
207 if (v3_read_gva_memory(core, a2, sizeof(h->ros_event), (uint8_t*)&h->ros_event)!=sizeof(h->ros_event)) {
208 PrintError(core->vm_info, core, "hvm: cannot read ROS event from %p\n",(void*)a2);
209 core->vm_regs.rax = -1;
211 core->vm_regs.rax = 0;
// Completion of a ROS event: a2 carries the result code.
218 PrintDebug(core->vm_info, core, "hvm: completion of ROS event (rc=0x%llx)\n",a2);
219 h->ros_event.event_type=ROS_NONE;
220 h->ros_event.last_ros_event_result = a2;
// ROS asks the HRT to run a function: 0x20 on the first HRT core only,
// 0x21 in parallel on every HRT core.  a2 = function to invoke.
223 case 0x20: // invoke function (ROS->HRT)
224 case 0x21: // invoke parallel function (ROS->HRT)
225 if (v3_is_hvm_hrt_core(core)) {
226 PrintError(core->vm_info,core,"hvm: %s function invocation not supported from HRT core\n", a1==0x20 ? "" : "parallel");
227 core->vm_regs.rax = -1;
229 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) {
230 PrintError(core->vm_info,core, "hvm: cannot invoke %s function %p in state %d\n",a1==0x20 ? "" : "parallel", (void*)a2,h->trans_state);
231 core->vm_regs.rax = -1;
233 uint64_t *page = (uint64_t *) h->comm_page_hva;
234 uint64_t first, last, cur;
236 PrintDebug(core->vm_info,core, "hvm: %s invoke function %p\n",a1==0x20 ? "" : "parallel",(void*)a2);
// Target core range: 0x20 hits only the first HRT core, 0x21 hits all
// cores from first_hrt_core through the last core of the VM.
241 first=last=h->first_hrt_core;
243 first=h->first_hrt_core;
244 last=core->vm_info->num_cores-1;
247 core->vm_regs.rax = 0;
// Number of target cores whose completions we expect (see case 0x2f).
249 h->trans_count = last-first+1;
251 for (cur=first;cur<=last;cur++) {
// Deliver the upcall via magic #PF or software interrupt, per build
// option (see USE_UPCALL_MAGIC_PF above).
253 #if USE_UPCALL_MAGIC_PF
254 PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",cur);
255 core->vm_info->cores[cur].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
256 if (v3_raise_exception_with_error(&core->vm_info->cores[cur],
258 UPCALL_MAGIC_ERROR)) {
259 PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",cur);
260 core->vm_regs.rax = -1;
264 PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %llu\n",h->hrt_int_vector,cur);
265 if (v3_raise_swintr(&core->vm_info->cores[cur],h->hrt_int_vector)) {
266 PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",cur);
267 core->vm_regs.rax = -1;
271 // Force core to exit now
272 v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
// On success move to the matching in-progress state; on any injection
// failure fall back to idle (state may be inconsistent -- logged).
275 if (core->vm_regs.rax==0) {
277 h->trans_state = HRT_CALL;
279 h->trans_state = HRT_PARCALL;
282 PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
283 h->trans_state = HRT_IDLE;
// Set up (0x28) / tear down (0x29) synchronous ROS<->HRT operation.
291 case 0x28: // setup for synchronous operation (ROS->HRT)
292 case 0x29: // teardown for synchronous operation (ROS->HRT)
293 if (v3_is_hvm_hrt_core(core)) {
294 PrintError(core->vm_info,core,"hvm: %ssynchronization invocation not supported from HRT core\n",a1==0x29 ? "de" : "");
295 core->vm_regs.rax = -1;
// Setup requires IDLE; teardown requires an established SYNC state.
297 if (ENFORCE_STATE_MACHINE &&
298 ((a1==0x28 && h->trans_state!=HRT_IDLE) || (a1==0x29 && h->trans_state!=HRT_SYNC))) {
299 PrintError(core->vm_info,core, "hvm: cannot invoke %ssynchronization in state %d\n",a1==0x29 ? "de" : "", h->trans_state);
300 core->vm_regs.rax = -1;
302 uint64_t *page = (uint64_t *) h->comm_page_hva;
303 uint64_t first, last, cur;
305 PrintDebug(core->vm_info,core, "hvm: invoke %ssynchronization on address %p\n",a1==0x29 ? "de" : "",(void*)a2);
309 first=last=h->first_hrt_core; // initially we will sync only with BSP
311 core->vm_regs.rax = 0;
313 h->trans_count = last-first+1;
315 for (cur=first;cur<=last;cur++) {
317 #if USE_UPCALL_MAGIC_PF
318 PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",cur);
319 core->vm_info->cores[cur].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
320 if (v3_raise_exception_with_error(&core->vm_info->cores[cur],
322 UPCALL_MAGIC_ERROR)) {
323 PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",cur);
324 core->vm_regs.rax = -1;
328 PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %llu\n",h->hrt_int_vector,cur);
329 if (v3_raise_swintr(&core->vm_info->cores[cur],h->hrt_int_vector)) {
330 PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",cur);
331 core->vm_regs.rax = -1;
335 // Force core to exit now
336 v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
339 if (core->vm_regs.rax==0) {
341 h->trans_state = HRT_SYNCSETUP;
343 h->trans_state = HRT_SYNCTEARDOWN;
346 PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
347 h->trans_state = HRT_IDLE;
// HRT cores report completion of an exec/sync request; the last core
// to check in advances the transaction state machine.
354 case 0x2f: // function exec or sync done
355 if (v3_is_hvm_ros_core(core)) {
356 PrintError(core->vm_info,core, "hvm: request for exec or sync done from ROS core\n");
357 core->vm_regs.rax=-1;
359 if (ENFORCE_STATE_MACHINE &&
360 h->trans_state!=HRT_CALL &&
361 h->trans_state!=HRT_PARCALL &&
362 h->trans_state!=HRT_SYNCSETUP &&
363 h->trans_state!=HRT_SYNCTEARDOWN) {
364 PrintError(core->vm_info,core,"hvm: function or sync completion when not in HRT_CALL, HRT_PARCALL, HRT_SYNCSETUP, or HRT_SYNCTEARDOWN state\n");
365 core->vm_regs.rax=-1;
368 PrintDebug(core->vm_info,core, "hvm: function or sync complete\n");
// Atomic decrement of the outstanding-core count; only the core that
// takes it to zero performs the state transition.
369 if (__sync_fetch_and_sub(&h->trans_count,one)==1) {
370 // last one, switch state
371 if (h->trans_state==HRT_SYNCSETUP) {
372 h->trans_state=HRT_SYNC;
373 PrintDebug(core->vm_info,core, "hvm: function complete - now synchronous\n");
375 h->trans_state=HRT_IDLE;
// Merge (0x30) / unmerge (0x31) the caller's address space into the
// HRT; the caller's CR3 is passed to the HRT through the comm page and
// the request is delivered to the first HRT core.
384 case 0x30: // merge address space
385 case 0x31: // unmerge address space
386 if (v3_is_hvm_hrt_core(core)) {
387 PrintError(core->vm_info,core,"hvm: request to %smerge address space from HRT core\n", a1==0x30 ? "" : "un");
388 core->vm_regs.rax=-1;
390 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) {
391 PrintError(core->vm_info,core,"hvm: request to %smerge address space in non-idle state\n",a1==0x30 ? "" : "un");
392 core->vm_regs.rax=-1;
394 uint64_t *page = (uint64_t *) h->comm_page_hva;
396 PrintDebug(core->vm_info,core,"hvm: %smerge address space request with %p\n",a1==0x30 ? "" : "un",(void*)core->ctrl_regs.cr3);
397 // should sanity check to make sure guest is in 64 bit without anything strange
400 page[1] = core->ctrl_regs.cr3; // this is a do-not-care for an unmerge
402 core->vm_regs.rax = 0;
403 #if USE_UPCALL_MAGIC_PF
404 PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %u\n",h->first_hrt_core);
405 core->vm_info->cores[h->first_hrt_core].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
406 if (v3_raise_exception_with_error(&core->vm_info->cores[h->first_hrt_core],
408 UPCALL_MAGIC_ERROR)) {
409 PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %u\n",h->first_hrt_core);
410 core->vm_regs.rax = -1;
414 PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %u\n",h->hrt_int_vector,h->first_hrt_core);
415 if (v3_raise_swintr(&core->vm_info->cores[h->first_hrt_core],h->hrt_int_vector)) {
416 PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %u\n",h->first_hrt_core);
417 core->vm_regs.rax = -1;
420 // Force core to exit now
421 v3_interrupt_cpu(core->vm_info,core->vm_info->cores[h->first_hrt_core].pcpu_id,0);
423 h->trans_state = HRT_MERGE;
// HRT reports that a merge/unmerge finished; return to idle.
431 case 0x3f: // merge operation done
432 if (v3_is_hvm_ros_core(core)) {
433 PrintError(core->vm_info,core, "hvm: request for merge done from ROS core\n");
434 core->vm_regs.rax=-1;
436 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_MERGE) {
437 PrintError(core->vm_info,core,"hvm: merge/unmerge done when in non-idle state\n");
438 core->vm_regs.rax=-1;
440 PrintDebug(core->vm_info,core, "hvm: merge or unmerge complete - back to idle\n");
441 h->trans_state=HRT_IDLE;
// Install (a2/a3 nonzero) or remove (both zero) the ROS signal
// handler; a2 = handler entry GVA, a3 = stack GVA, bound to the CR3 in
// effect at install time.
448 case 0x40: // install or remove signal handler
449 if (v3_is_hvm_hrt_core(core)) {
450 PrintError(core->vm_info,core, "hvm: HRT cannot install signal handler...\n");
451 core->vm_regs.rax=-1;
453 PrintDebug(core->vm_info,core,"hvm: install signal handler for CR3=%p, handler=%p, stack=%p\n",(void*)core->ctrl_regs.cr3, (void*)a2, (void*)a3);
454 if (h->ros_signal.code) {
455 PrintError(core->vm_info,core,"hvm: signal is pending...\n");
456 core->vm_regs.rax=-1;
// Refuse to overwrite an installed handler; it must be removed first.
458 if ((a2 || a3) && (h->ros_signal.handler || h->ros_signal.stack)) {
459 PrintError(core->vm_info,core,"hvm: attempt to replace existing handler without removing it first\n");
460 core->vm_regs.rax=-1;
462 // actually make the change
463 h->ros_signal.handler=a2;
464 h->ros_signal.stack=a3;
465 h->ros_signal.cr3=core->ctrl_regs.cr3;
468 // test by signalling back a hello
470 // v3_hvm_signal_ros(core->vm_info,0xf00d);
// Raise a signal in the ROS; a2 = signal code; result goes to rax.
477 case 0x41: // raise signal in the ROS from HRT or ROS
478 PrintDebug(core->vm_info,core,"hvm: HRT raises signal code=0x%llx\n", a2);
479 core->vm_regs.rax = v3_hvm_signal_ros(core->vm_info,a2);
// Unknown hypercall number.
483 PrintError(core->vm_info,core,"hvm: unknown hypercall %llx\n",a1);
484 core->vm_regs.rax=-1;
// Ceiling division: number of y-sized chunks needed to hold x.
491 #define CEIL_DIV(x,y) (((x)/(y)) + !!((x)%(y)))
// Parse the <hvm> configuration subtree and initialize per-VM HVM
// state.  A missing or disabled <hvm> block leaves the VM as a pure
// ROS VM (is_hvm=0); otherwise the ROS core/memory split and the HRT
// image file are recorded and the HVM hypercall is registered.
493 int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
495 v3_cfg_tree_t *hvm_config;
496 v3_cfg_tree_t *ros_config;
497 v3_cfg_tree_t *hrt_config;
503 PrintDebug(vm, VCORE_NONE, "hvm: vm init\n");
// Defaults: not HVM; ROS owns all cores and all of guest memory.
508 memset(&vm->hvm_state,0,sizeof(struct v3_vm_hvm));
509 vm->hvm_state.is_hvm=0;
510 vm->hvm_state.first_hrt_core=vm->num_cores;
511 vm->hvm_state.first_hrt_gpa=vm->mem_size;
513 if (!config || !(hvm_config=v3_cfg_subtree(config,"hvm"))) {
514 PrintDebug(vm,VCORE_NONE,"hvm: no HVM configuration found (all HW is ROS)\n");
518 if (!(enable=v3_cfg_val(hvm_config,"enable")) || strcasecmp(enable,"y")) {
519 PrintDebug(vm,VCORE_NONE,"hvm: HVM configuration disabled (all HW is ROS)\n");
523 if (!(ros_config=v3_cfg_subtree(hvm_config,"ros"))) {
524 PrintError(vm,VCORE_NONE,"hvm: HVM configuration without ROS block...\n");
528 if (!(ros_cores=v3_cfg_val(ros_config,"cores"))) {
529 PrintError(vm,VCORE_NONE,"hvm: ROS block without cores...\n");
// NOTE(review): atoi gives no error reporting; strtoul would allow
// rejecting malformed core counts.
533 vm->hvm_state.first_hrt_core = ((uint32_t)atoi(ros_cores));
535 if (!(ros_mem=v3_cfg_val(ros_config,"mem"))) {
536 PrintError(vm,VCORE_NONE,"hvm: ROS block without mem...\n");
// ROS memory size is configured in MB; convert to bytes.
540 vm->hvm_state.first_hrt_gpa = ((uint64_t)atoi(ros_mem))*1024*1024;
542 if (!(hrt_config=v3_cfg_subtree(hvm_config,"hrt"))) {
543 PrintError(vm,VCORE_NONE,"hvm: HVM configuration without HRT block...\n");
547 if (!(hrt_file_id=v3_cfg_val(hrt_config,"file_id"))) {
548 PrintError(vm,VCORE_NONE,"hvm: HRT block without file_id...\n");
552 vm->hvm_state.hrt_file = v3_cfg_get_file(vm,hrt_file_id);
554 if (!vm->hvm_state.hrt_file) {
555 PrintError(vm,VCORE_NONE,"hvm: HRT block contains bad file_id (%s)\n",hrt_file_id);
559 if (v3_register_hypercall(vm, HVM_HCALL,
560 hvm_hcall_handler, 0)) {
561 PrintError(vm,VCORE_NONE, "hvm: cannot register hypercall....\n");
565 // XXX sanity check config here
567 vm->hvm_state.is_hvm=1;
570 if (vm->hvm_state.is_hvm) {
571 V3_Print(vm,VCORE_NONE,"hvm: [ROS: cores 0..%u, mem 0..%p] [HRT: cores %u..%u, mem %p..%p, file_id=%s (tag %s)]\n",
572 vm->hvm_state.first_hrt_core-1,
// NOTE(review): casts bind tighter than '-', so this is
// ((void*)first_hrt_gpa) - 1 (GNU void* arithmetic), not
// (void*)(first_hrt_gpa - 1); same value here, but the intent should
// be made explicit with parentheses.  Same applies to line 577.
573 (void*) vm->hvm_state.first_hrt_gpa-1,
574 vm->hvm_state.first_hrt_core,
576 (void*) vm->hvm_state.first_hrt_gpa,
577 (void*)vm->mem_size-1,
579 vm->hvm_state.hrt_file->tag);
581 V3_Print(vm,VCORE_NONE,"hvm: This is a pure ROS VM\n");
// Tear down per-VM HVM state: free any replacement HRT image,
// unregister the HVM hypercall, and release the comm page's shadow
// memory region if one was created.
588 int v3_deinit_hvm_vm(struct v3_vm_info *vm)
590 PrintDebug(vm, VCORE_NONE, "hvm: HVM VM deinit\n");
592 if (vm->hvm_state.hrt_image) {
593 V3_VFree(vm->hvm_state.hrt_image);
// Clear the pointer and size so a double deinit is harmless.
594 vm->hvm_state.hrt_image=0;
595 vm->hvm_state.hrt_image_size=0;
598 v3_remove_hypercall(vm,HVM_HCALL);
600 if (vm->hvm_state.comm_page_hpa) {
601 struct v3_mem_region *r = v3_get_mem_region(vm,-1,(addr_t)vm->hvm_state.comm_page_hpa);
603 PrintError(vm,VCORE_NONE,"hvm: odd, VM has comm_page_hpa, but no shadow memory\n");
605 v3_delete_mem_region(vm,r);
// Initialize per-core HVM state.  In an HVM VM, cores at or above
// first_hrt_core are marked as HRT cores; all others are ROS cores.
612 int v3_init_hvm_core(struct guest_info *core)
614 memset(&core->hvm_state,0,sizeof(core->hvm_state));
615 if (core->vm_info->hvm_state.is_hvm) {
616 if (core->vcpu_id >= core->vm_info->hvm_state.first_hrt_core) {
617 core->hvm_state.is_hrt=1;
// Per-core teardown; nothing to release beyond logging.
623 int v3_deinit_hvm_core(struct guest_info *core)
625 PrintDebug(core->vm_info, VCORE_NONE, "hvm: HVM core deinit\n");
// ROS-visible memory: GPAs [0, first_hrt_gpa) in an HVM VM; presumably
// the full mem_size otherwise (the else branch is elided here).
631 uint64_t v3_get_hvm_ros_memsize(struct v3_vm_info *vm)
633 if (vm->hvm_state.is_hvm) {
634 return vm->hvm_state.first_hrt_gpa;
// HRT-visible memory size (body elided in this excerpt).
639 uint64_t v3_get_hvm_hrt_memsize(struct v3_vm_info *vm)
// ROS cores are [0, first_hrt_core); a non-HVM VM is all ROS.
644 uint32_t v3_get_hvm_ros_cores(struct v3_vm_info *vm)
646 if (vm->hvm_state.is_hvm) {
647 return vm->hvm_state.first_hrt_core;
649 return vm->num_cores;
// HRT cores are [first_hrt_core, num_cores) in an HVM VM.
653 uint32_t v3_get_hvm_hrt_cores(struct v3_vm_info *vm)
655 if (vm->hvm_state.is_hvm) {
656 return vm->num_cores - vm->hvm_state.first_hrt_core;
// GPA classification: ROS memory is below first_hrt_gpa...
663 int v3_is_hvm_ros_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
665 if (vm->hvm_state.is_hvm) {
666 return gpa<vm->hvm_state.first_hrt_gpa;
// ...and HRT-only memory is [first_hrt_gpa, mem_size).
672 int v3_is_hvm_hrt_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
674 if (vm->hvm_state.is_hvm) {
675 return gpa>=vm->hvm_state.first_hrt_gpa && gpa<vm->mem_size;
// Core classification helpers (is_hrt is 0 or 1; see v3_init_hvm_core).
681 int v3_is_hvm_hrt_core(struct guest_info *core)
683 return core->hvm_state.is_hrt;
686 int v3_is_hvm_ros_core(struct guest_info *core)
688 return !core->hvm_state.is_hrt;
// IPI filtering policy: devices (ioapic/msi) may only target ROS
// cores; core-to-core IPIs are allowed from HRT cores to anywhere and
// between ROS cores, but never from ROS into the HRT.
691 int v3_hvm_should_deliver_ipi(struct guest_info *src, struct guest_info *dest)
694 // ioapic or msi to apic
695 return !dest->hvm_state.is_hrt;
698 return src->hvm_state.is_hrt || (!src->hvm_state.is_hrt && !dest->hvm_state.is_hrt) ;
// Report which APIC range a given observer can see (core==NULL is
// presumably the device/ioapic view -- the start_apic assignments are
// elided in this excerpt).
702 void v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_info *vm,
703 uint32_t *start_apic, uint32_t *num_apics)
706 // Seen from ioapic, msi, etc:
707 if (vm->hvm_state.is_hvm) {
708 // HVM VM shows only the ROS cores/apics to ioapic, msi, etc
710 *num_apics = vm->hvm_state.first_hrt_core;
712 // Non-HVM shows all cores/APICs to apic, msi, etc.
714 *num_apics = vm->num_cores;
718 if (core->hvm_state.is_hrt) {
719 // HRT core/apic sees all apics
720 // (this policy may change...)
722 *num_apics = vm->num_cores;
724 // non-HRT core/apic sees only non-HRT cores/apics
726 *num_apics = vm->hvm_state.first_hrt_core;
// NOTE(review): classic double-evaluation hazard if called with
// side-effecting arguments; fine for the plain-variable uses here.
731 #define MAX(x,y) ((x)>(y)?(x):(y))
732 #define MIN(x,y) ((x)<(y)?(x):(y))
735 static uint64_t boot_state_end_addr(struct v3_vm_info *vm)
737 return PAGE_ADDR(vm->mem_size);
// The null interrupt handler lives in the last page of guest memory
// (the *limit assignment is elided in this excerpt).
740 static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
742 *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - PAGE_SIZE);
// Handler code blobs are provided by arch-specific assembly; start/end
// symbols delimit the SVM and VMX variants.
746 extern v3_cpu_arch_t v3_mach_type;
748 extern void *v3_hvm_svm_null_int_handler_start;
749 extern void *v3_hvm_svm_null_int_handler_end;
750 extern void *v3_hvm_vmx_null_int_handler_start;
751 extern void *v3_hvm_vmx_null_int_handler_end;
// Copy the appropriate null interrupt handler blob into guest memory
// at the location chosen above, selecting SVM vs VMX by machine type.
753 static void write_null_int_handler(struct v3_vm_info *vm)
760 get_null_int_handler_loc(vm,&base,&limit);
762 switch (v3_mach_type) {
765 case V3_SVM_REV3_CPU:
766 data = (void*) &v3_hvm_svm_null_int_handler_start;
767 len = (void*) &v3_hvm_svm_null_int_handler_end - data;
773 case V3_VMX_EPT_UG_CPU:
774 data = (void*) &v3_hvm_vmx_null_int_handler_start;
775 len = (void*) &v3_hvm_vmx_null_int_handler_end - data;
779 PrintError(vm,VCORE_NONE,"hvm: cannot determine CPU type to select null interrupt handler...\n");
785 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base),len,(uint8_t*)data);
788 PrintDebug(vm,VCORE_NONE,"hvm: wrote null interrupt handler at %p (%llu bytes)\n",base,len);
// The IDT occupies the page two pages below the end of guest memory
// (the *limit assignment is elided in this excerpt).
792 static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
794 *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - 2 * PAGE_SIZE);
798 // default IDT entries (int and trap gates)
800 // Format is 16 bytes long:
802 // 16 selector => (target code selector) => 0x8 // entry 1 of GDT
803 // 3 ist => (stack) = 0 => current stack
805 // 4 type => 0xe=>INT, 0xf=>TRAP
806 // 1 reserved => 0 (indicates "system" by being zero)
810 // 32 offsethigh => 0 (total is a 64 bit offset)
813 // 00 00 | 08 00 | 00 | 8[typenybble] | offsetmid | offsethigh | reserved
815 // Note little endian
// Template 64-bit gate descriptors: selector=0x8, present, DPL 0,
// type 0xf (trap) / 0xe (interrupt); offset fields are patched below.
817 static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 } ;
818 static uint64_t idt64_int_gate_entry_mask[2] = { 0x00008e0000080000, 0x0 };
// Build a default IDT in guest memory: every vector points at the null
// interrupt handler (trap gates for vectors 0..31, interrupt gates for
// 32..255).
820 static void write_idt(struct v3_vm_info *vm)
825 uint64_t handler_len;
827 uint64_t trap_gate[2];
828 uint64_t int_gate[2];
830 get_idt_loc(vm,&base,&limit);
832 get_null_int_handler_loc(vm,&handler,&handler_len);
// The IDT holds virtual addresses: shift the handler's GPA by the
// HRT's GVA offset.
834 handler += vm->hvm_state.gva_offset;
836 memcpy(trap_gate,idt64_trap_gate_entry_mask,16);
837 memcpy(int_gate,idt64_int_gate_entry_mask,16);
840 // update the entries for the handler location
// Scatter the 64-bit handler address into the descriptor's split
// offset fields (bits 0-15, 16-31, 32-63), little endian.
844 hand = (uint8_t*) &handler;
846 mask = (uint8_t *)trap_gate;
847 memcpy(&(mask[0]),&(hand[0]),2); // offset low
848 memcpy(&(mask[6]),&(hand[2]),2); // offset med
849 memcpy(&(mask[8]),&(hand[4]),4); // offset high
851 mask = (uint8_t *)int_gate;
852 memcpy(&(mask[0]),&(hand[0]),2); // offset low
853 memcpy(&(mask[6]),&(hand[2]),2); // offset med
854 memcpy(&(mask[8]),&(hand[4]),4); // offset high
856 PrintDebug(vm,VCORE_NONE,"hvm: Adding default null trap and int gates\n");
// 16 bytes per descriptor; the first loop (vectors < 32) is elided
// here but its body writes trap gates.
860 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)trap_gate);
863 for (i=32;i<256;i++) {
864 v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)int_gate);
867 PrintDebug(vm,VCORE_NONE,"hvm: wrote IDT at %p\n",base);
// The GDT occupies the page three pages below the end of guest memory
// (the *limit assignment is elided in this excerpt).
872 static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
874 *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 3 * PAGE_SIZE);
878 static uint64_t gdt64[3] = {
879 0x0000000000000000, /* null */
880 0x00a09a0000000000, /* code (note lme bit) */
881 0x00a0920000000000, /* data (most entries don't matter) */
// Copy the static gdt64 table into guest memory at the GDT location.
// NOTE(review): limit comes from get_gdt_loc (assignment elided in
// this excerpt); if it exceeds sizeof(gdt64) (24 bytes) this copy
// reads past the table -- confirm against the full source.
884 static void write_gdt(struct v3_vm_info *vm)
889 get_gdt_loc(vm,&base,&limit);
890 v3_write_gpa_memory(&vm->cores[0],(addr_t)base,limit,(uint8_t*) gdt64);
892 PrintDebug(vm,VCORE_NONE,"hvm: wrote GDT at %p\n",base);
// The TSS occupies the page four pages below the end of guest memory
// (the *limit assignment is elided in this excerpt).
897 static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
899 *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 4 * PAGE_SIZE);
// A zero-filled TSS is sufficient for the HRT's boot environment.
903 static void write_tss(struct v3_vm_info *vm)
908 get_tss_loc(vm,&base,&limit);
910 v3_set_gpa_memory(&vm->cores[0],(addr_t)base,limit,0);
912 PrintDebug(vm,VCORE_NONE,"hvm: wrote TSS at %p\n",base);
// Canonical-address halves of the x86-64 virtual address space; the
// HRT mapping is placed either at 0 or at the top half start.
916 #define TOP_HALF_START 0xffff800000000000ULL
917 #define BOTTOM_HALF_END 0x00007fffffffffffULL
// Bytes of virtual address space covered by one entry at each paging
// level: L4=PTE (4 KB), L3=PDE (2 MB), L2=PDPE (1 GB), L1=PML4E (512 GB).
920 #define L4_UNIT PAGE_SIZE
921 #define L3_UNIT (512ULL * L4_UNIT)
922 #define L2_UNIT (512ULL * L3_UNIT)
923 #define L1_UNIT (512ULL * L2_UNIT)
// Compute how many page-table pages are needed at each level to map
// max_mem_mapped bytes with 4 KB pages (l1 = PML4 count, presumably 1,
// assigned on an elided line; l2/l3/l4 = PDP/PD/PT page counts).
925 static void compute_pts_4KB(struct v3_vm_info *vm,
926 uint64_t *l1, uint64_t *l2, uint64_t *l3, uint64_t *l4)
929 // we map the physical memory up to max_mem_mapped either at 0x0 or at TOP_HALF start
930 // that is, it either fills in the first 256 rows of PML4 or the last 256 rows of PML4
931 // so it is the same number of page tables regardless
933 uint64_t max_gva = vm->hvm_state.max_mem_mapped;
// Each table page holds 512 entries; divide the entry count by 512.
936 *l2 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*512ULL*4096ULL),512);
937 *l3 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*4096ULL),512);
938 *l4 = CEIL_DIV(CEIL_DIV(max_gva,4096ULL),512);
944 PTS MAP using 1 GB pages
945 n second levels pts, highest gva, highest address
951 PTS MAP using 2 MB pages
952 n third level pts, highest gva, highest address
953 m second level pts, highest gva, highest address
958 PTS MAP using 4 KB pages
959 n 4th level, highest gva, highest address
960 m 3rd level, highest gva, highest address
961 l second level, highest gva, highest address
965 PTS MAP using 512 GB pages when this becomes available
// Locate the page-table area: it sits immediately below the four
// fixed boot-state pages (null handler, IDT, GDT, TSS), sized by how
// many table pages the requested page-size model needs.
970 static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
972 uint64_t l1,l2,l3,l4;
975 compute_pts_4KB(vm,&l1,&l2,&l3,&l4);
// Larger page sizes need fewer levels of tables (the 512GB/1GB
// num_pt assignments are elided in this excerpt).
977 if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
979 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
981 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
982 num_pt = l1 + l2 + l3;
983 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) {
984 num_pt = l1 + l2 + l3 + l4;
986 PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT location flags=0x%llx memsize=0x%llx\n",vm->hvm_state.hrt_flags,(uint64_t)vm->mem_size);
// 4 pages of fixed boot state above the tables; see the get_*_loc
// helpers earlier in this file.
990 *base = (void*)PAGE_ADDR(boot_state_end_addr(vm)-(4+num_pt)*PAGE_SIZE);
991 *limit = num_pt*PAGE_SIZE;
// Construct the HRT's boot page tables directly in guest memory,
// identity-mapping (modulo gva_offset) [0, max_mem_mapped) at either
// GVA 0 or TOP_HALF_START, stopping at the level implied by the
// requested page size (1 GB / 2 MB / 4 KB).
994 static void write_pts(struct v3_vm_info *vm)
997 uint64_t num_l1, num_l2, num_l3, num_l4;
998 void *start_l1, *start_l2, *start_l3, *start_l4;
1004 void *max_gpa = (void*) vm->hvm_state.max_mem_mapped;
1005 void *min_gva = (void*) vm->hvm_state.gva_offset;
// max_gva is only used by the debug prints below.
1006 #ifdef V3_CONFIG_DEBUG_HVM
1007 void *max_gva = min_gva+vm->hvm_state.max_mem_mapped;
1010 uint64_t i_start,i_end;
1012 struct pml4e64 *pml4e;
1013 struct pdpe64 *pdpe;
// Decide the deepest level to build from the HRT's page-size flags
// (the max_level assignments are elided in this excerpt).
1017 if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
1018 PrintError(vm,VCORE_NONE,"hvm: Attempt to build 512 GB pages\n");
1020 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
1022 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
1024 } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) {
1027 PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT levels\n");
// Lay out the four levels contiguously inside the PT area.
1031 get_pt_loc(vm,&start_l1,&size);
1032 compute_pts_4KB(vm,&num_l1,&num_l2,&num_l3,&num_l4);
1034 start_l2=start_l1+PAGE_SIZE*num_l1;
1035 start_l3=start_l2+PAGE_SIZE*num_l2;
1036 start_l4=start_l3+PAGE_SIZE*num_l3;
1038 PrintDebug(vm,VCORE_NONE,"hvm: writing %llu levels of PTs start at address %p\n", max_level,start_l1);
1039 PrintDebug(vm,VCORE_NONE,"hvm: min_gva=%p, max_gva=%p, min_gpa=%p, max_gpa=%p\n",min_gva,max_gva,min_gpa,max_gpa);
1040 PrintDebug(vm,VCORE_NONE,"hvm: num_l1=%llu, num_l2=%llu, num_l3=%llu, num_l4=%llu\n", num_l1, num_l2, num_l3, num_l4);
1041 PrintDebug(vm,VCORE_NONE,"hvm: start_l1=%p, start_l2=%p, start_l3=%p, start_l4=%p\n", start_l1, start_l2, start_l3, start_l4);
1045 // build PML4 (only one)
// Tables are written through the host mapping of the guest page.
1046 if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pml4e)) {
1047 PrintError(vm,VCORE_NONE,"hvm: Cannot translate pml4 location\n");
1051 memset(pml4e,0,PAGE_SIZE);
// Bottom-half mapping uses PML4 rows [0,num_l2); top-half mapping
// uses rows [256, 256+num_l2).
1054 i_start=0; i_end = num_l2;
1055 } else if (min_gva==(void*)TOP_HALF_START) {
1056 i_start=256; i_end=256+num_l2;
1058 PrintError(vm,VCORE_NONE,"hvm: unsupported gva offset\n");
1062 for (i=i_start, cur_gva=min_gva, cur_gpa=min_gpa;
1064 i++, cur_gva+=L1_UNIT, cur_gpa+=L1_UNIT) {
1067 pml4e[i].writable=1;
// 512 GB pages do not exist; a "large" PML4E is an error.
1070 PrintError(vm,VCORE_NONE,"hvm: Intel has not yet defined a PML4E large page\n");
1071 pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
1072 //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
// Otherwise point this PML4E at the corresponding PDP page.
1074 pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(start_l2+(i-i_start)*PAGE_SIZE));
1075 //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
// --- PDP (L2) pages: either 1 GB large pages or pointers to PDs ---
1086 for (cur_pt=start_l2, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
1088 cur_pt+=PAGE_SIZE, pt++) {
1091 if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pdpe)) {
1092 PrintError(vm,VCORE_NONE,"hvm: Cannot translate pdpe location\n");
1096 memset(pdpe,0,PAGE_SIZE);
1099 i<512 && cur_gpa<max_gpa;
1100 i++, cur_gva+=L2_UNIT, cur_gpa+=L2_UNIT) {
// 1 GB page: map the GPA directly at this level.
1106 pdpe[i].large_page=1;
1107 pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
1108 //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
1110 pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(start_l3+(pt*512+i)*PAGE_SIZE));
1111 //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
// --- PD (L3) pages: either 2 MB large pages or pointers to PTs ---
1121 for (cur_pt=start_l3, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
1123 cur_pt+=PAGE_SIZE, pt++) {
1126 if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pde)) {
1127 PrintError(vm,VCORE_NONE,"hvm: Cannot translate pde location\n");
1131 memset(pde,0,PAGE_SIZE);
1134 i<512 && cur_gpa<max_gpa;
1135 i++, cur_gva+=L3_UNIT, cur_gpa+=L3_UNIT) {
// 2 MB page: map the GPA directly at this level.
1141 pde[i].large_page=1;
1142 pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
1143 //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t) pde[i].pt_base_addr);
1145 pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(start_l4+(pt*512+i)*PAGE_SIZE));
1146 //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pde[i].pt_base_addr);
// --- PT (L4) pages: 4 KB leaf mappings ---
1158 for (cur_pt=start_l4, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
1160 cur_pt+=PAGE_SIZE, pt++) {
1163 if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pte)) {
1164 PrintError(vm,VCORE_NONE,"hvm: Cannot translate pte location\n");
1168 memset(pte,0,PAGE_SIZE);
1171 i<512 && cur_gpa<max_gpa;
1172 i++, cur_gva+=L4_UNIT, cur_gpa+=L4_UNIT) {
1176 pte[i].page_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
1177 //PrintDebug(vm,VCORE_NONE,"hvm: pte: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pte[i].page_base_addr);
// The multiboot info block is placed relative to the page-table area
// (adjustment after this call is elided in this excerpt).
1185 static void get_mb_info_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
1188 get_pt_loc(vm,base, limit);
// Fill in the HRT-specific multiboot info tag handed to the HRT at
// boot: APIC topology, CPU frequency, and the memory/communication
// layout negotiated during configuration.
1194 int v3_build_hrt_multiboot_tag(struct guest_info *core, mb_info_hrt_t *hrt)
1196 struct v3_vm_info *vm = core->vm_info;
1198 hrt->tag.type = MB_INFO_HRT_TAG;
1199 hrt->tag.size = sizeof(mb_info_hrt_t);
// APIC ids mirror core numbering: HRT apics start at first_hrt_core.
1201 hrt->total_num_apics = vm->num_cores;
1202 hrt->first_hrt_apic_id = vm->hvm_state.first_hrt_core;
// No HRT-private ioapic is provided.
1203 hrt->have_hrt_ioapic=0;
1204 hrt->first_hrt_ioapic_entry=0;
1206 hrt->cpu_freq_khz = V3_CPU_KHZ();
// Hand the HRT the mapping/upcall parameters established earlier.
1208 hrt->hrt_flags = vm->hvm_state.hrt_flags;
1209 hrt->max_mem_mapped = vm->hvm_state.max_mem_mapped;
1210 hrt->first_hrt_gpa = vm->hvm_state.first_hrt_gpa;
1211 hrt->gva_offset = vm->hvm_state.gva_offset;
1212 hrt->comm_page_gpa = vm->hvm_state.comm_page_gpa;
1213 hrt->hrt_int_vector = vm->hvm_state.hrt_int_vector;
// Build the multiboot info table into a local buffer and copy it into
// guest memory at the MB info location.  Only the 64-bit multiboot
// HRT type is supported.
1218 static void write_mb_info(struct v3_vm_info *vm)
1220 if (vm->hvm_state.hrt_type!=HRT_MBOOT64) {
1221 PrintError(vm, VCORE_NONE,"hvm: Cannot handle this HRT type\n");
1229 get_mb_info_loc(vm,&base,&limit);
// The table is built on behalf of the first HRT core; buf is a local
// staging buffer (declaration elided in this excerpt), 256-byte cap.
1231 if ((size=v3_build_multiboot_table(&vm->cores[vm->hvm_state.first_hrt_core],buf,256))==-1) {
1232 PrintError(vm,VCORE_NONE,"hvm: Failed to build MB info\n");
1237 PrintError(vm,VCORE_NONE,"hvm: MB info is too large\n");
1241 v3_write_gpa_memory(&vm->cores[vm->hvm_state.first_hrt_core],
1246 PrintDebug(vm,VCORE_NONE, "hvm: wrote MB info at %p\n", base);
// Per-HRT-core scratch stack size, carved below the MB info block.
1250 #define SCRATCH_STACK_SIZE 4096
// Compute the region available to hold the HRT image itself: from the
// page-aligned first HRT GPA up to the bottom of the scratch stacks.
1253 static void get_hrt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
1258 get_mb_info_loc(vm,&mb_base,&mb_limit);
// Reserve one scratch stack per HRT core below the MB info block.
1260 mb_base-=SCRATCH_STACK_SIZE*v3_get_hvm_hrt_cores(vm);
1262 *base = (void*)PAGE_ADDR(vm->hvm_state.first_hrt_gpa);
// Require at least one page for the HRT image itself.
// NOTE(review): "colides" in the message below is a typo ("collides");
// left as-is since it is a runtime string.
1264 if (mb_base < *base+PAGE_SIZE) {
1265 PrintError(vm,VCORE_NONE,"hvm: HRT stack colides with HRT\n");
1268 *limit = mb_base - *base;
// Shorthand logging macros used by the image-parsing helpers below.
1272 #define ERROR(fmt, args...) PrintError(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
1273 #define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
1275 #define ELF_MAGIC 0x464c457f
1276 #define MB2_MAGIC 0xe85250d6
1278 #define MB2_INFO_MAGIC 0x36d76289
1280 static int is_elf(uint8_t *data, uint64_t size)
1282 if (*((uint32_t*)data)==ELF_MAGIC) {
// Scan the image for a Multiboot2 header and return a pointer to it,
// or (in elided code) NULL if none is found.
// Per the MB2 spec the header magic must appear 4-byte aligned within
// the first 32 KB of the image, so the scan is bounded accordingly.
1289 static mb_header_t *find_mb_header(uint8_t *data, uint64_t size)
1291     uint64_t limit = size > 32768 ? 32768 : size;
1294     // Scan for the .boot magic cookie
1295     // must be in first 32K, assume 4 byte aligned
1296     for (i=0;i<limit;i+=4) {
1297 	if (*((uint32_t*)&data[i])==MB2_MAGIC) {
1298 	    INFO("Found multiboot header at offset 0x%llx\n",i);
1299 	    return (mb_header_t *) &data[i];
// Validate the HRT's requests from its MB64_HRT multiboot tag and commit
// the resulting configuration into the VM's hvm_state: paging model,
// amount of memory mapped, GVA offset/entry, communication page, and the
// interrupt vector used for ROS->HRT upcalls.  Also allocates and maps
// the comm page the first time through.  Returns 0 on success (elided),
// with error paths on any unsupported or inconsistent request.
1306 static int configure_hrt(struct v3_vm_info *vm, mb_data_t *mb)
1308     struct v3_vm_hvm *h = &vm->hvm_state;
// Requests as stated by the HRT image itself.
1309     uint64_t f = mb->mb64_hrt->hrt_flags;
1310     uint64_t maxmap = mb->mb64_hrt->max_mem_to_map;
1311     uint64_t gvaoff = mb->mb64_hrt->gva_offset;
1312     uint64_t gvaentry = mb->mb64_hrt->gva_entry;
1313     uint64_t commgpa = mb->mb64_hrt->comm_page_gpa;
1314     uint8_t  vec = mb->mb64_hrt->hrt_int_vector;
1317     PrintDebug(vm,VCORE_NONE,"hvm: HRT request: flags=0x%llx max_map=0x%llx gva_off=%llx gva_entry=%llx comm_page=0x%llx vector=0x%x\n",
1318 	       f, maxmap, gvaoff,gvaentry,commgpa, vec);
// Always map at least 4 GB regardless of what the HRT asked for.
1320     if (maxmap<0x100000000ULL) {
1321 	PrintDebug(vm,VCORE_NONE,"hvm: revising request up to 4 GB max map\n");
1322 	maxmap=0x100000000ULL;
// Select the page-table granularity: exactly one of 512GB/1GB/2MB/4KB.
// 512 GB pages are rejected outright; the others are accepted in
// decreasing-size priority order.
1325     if (f & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
1326 	PrintError(vm,VCORE_NONE,"hvm: support for 512 GB pages is not yet available in hardware\n");
1328     } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
1330 	f |= MB_TAG_MB64_HRT_FLAG_MAP_1GB;
1331 	h->max_mem_mapped = maxmap;
1332 	PrintDebug(vm,VCORE_NONE,"hvm: 1 GB pages selected\n");
1333     } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
1335 	f |= MB_TAG_MB64_HRT_FLAG_MAP_2MB;
1336 	h->max_mem_mapped = maxmap;
1337 	PrintDebug(vm,VCORE_NONE,"hvm: 2 MB pages selected\n");
1338     } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_4KB) {
1340 	f |= MB_TAG_MB64_HRT_FLAG_MAP_4KB;
1341 	h->max_mem_mapped = maxmap;
1342 	PrintDebug(vm,VCORE_NONE,"hvm: 4 KB pages selected\n");
// No paging-model flag at all is an error.
1344 	PrintError(vm,VCORE_NONE,"hvm: no page table model is requested\n");
// Relocatable HRT images are not supported.
1348     if (f & MB_TAG_MB64_HRT_FLAG_RELOC) {
1349 	PrintError(vm,VCORE_NONE,"hvm: relocatable hrt not currently supported\n");
// NOTE(review): h->max_mem_mapped was just assigned maxmap above, so this
// check presumably compares against a platform cap set in elided code —
// confirm against the full file.
1355     if (maxmap>h->max_mem_mapped) {
1356 	PrintError(vm,VCORE_NONE,"hvm: requested 0x%llx bytes mapped, which is more than currently supported\n",maxmap);
// Only an identity (0) or top-half GVA offset is supported.
1360     if (gvaoff!=0 && gvaoff!=TOP_HALF_START) {
1361 	PrintError(vm,VCORE_NONE,"hvm: currently only GVA offsets of 0 and %llx are supported\n", TOP_HALF_START);
1365     h->gva_offset = gvaoff;
1367     h->gva_entry = gvaentry;
// The HRT image must load entirely within the HRT-only GPA region.
1369     if (mb->addr->load_addr < h->first_hrt_gpa) {
1370 	PrintError(vm,VCORE_NONE,"hvm: load start address of HRT is below first HRT GPA\n");
// Keep 64 MB of headroom at the top of guest memory above the BSS end.
1374     if (mb->addr->bss_end_addr > (vm->mem_size-(1024*1024*64))) {
1375 	PrintError(vm,VCORE_NONE,"hvm: bss end address of HRT above last allowed GPA\n");
// Vector validation condition is elided; out-of-range vectors error here.
1380 	PrintError(vm,VCORE_NONE,"hvm: cannot support vector %x\n",vec);
1384     h->hrt_int_vector = vec;
// The comm page must sit above guest physical memory so the mapping
// does not shadow real RAM.
1387     if (commgpa < vm->mem_size) {
1388 	PrintError(vm,VCORE_NONE,"hvm: cannot map comm page over physical memory\n");
1392     h->comm_page_gpa = commgpa;
// Allocate and map the comm page only on first configuration; on
// reconfiguration (e.g. reset) just clear the existing page.
1394     if (!h->comm_page_hpa) {
1395 	if (!(h->comm_page_hpa=V3_AllocPages(1))) {
1396 	    PrintError(vm,VCORE_NONE,"hvm: unable to allocate space for comm page\n");
1400 	h->comm_page_hva = V3_VAddr(h->comm_page_hpa);
1402 	memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
1404 	if (v3_add_shadow_mem(vm,-1,h->comm_page_gpa,h->comm_page_gpa+PAGE_SIZE_4KB,(addr_t)h->comm_page_hpa)) {
1405 	    PrintError(vm,VCORE_NONE,"hvm: unable to map communication page\n");
// NOTE(review): this frees the GPA value, not the allocated host page —
// it almost certainly should be V3_FreePages(h->comm_page_hpa,1).
1406 	    V3_FreePages((void*)(h->comm_page_gpa),1);
1411 	PrintDebug(vm,VCORE_NONE,"hvm: added comm page for first time\n");
// Already mapped from a previous boot: just zero it.
1414 	memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
1417     PrintDebug(vm,VCORE_NONE,"hvm: HRT configuration: flags=0x%llx max_mem_mapped=0x%llx gva_offset=0x%llx gva_entry=0x%llx comm_page=0x%llx vector=0x%x\n",
1418 	       h->hrt_flags,h->max_mem_mapped, h->gva_offset,h->gva_entry, h->comm_page_gpa, h->hrt_int_vector);
// Parse a multiboot64 HRT kernel image, apply its configuration requests,
// load it into the HRT region of guest memory, and record its entry
// address.  Marks the VM's HRT type as HRT_MBOOT64 on success.
1424 static int setup_mb_kernel_hrt(struct v3_vm_info *vm, void *data, uint64_t size)
1428     if (v3_parse_multiboot_header(data, size, &mb)) {
1429 	PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n");
// Image must carry the MB64_HRT tag to be usable as an HRT.
1434 	PrintError(vm,VCORE_NONE,"hvm: invalid HRT - there is no MB64_HRT tag\n");
// Validate and commit the HRT's requests (paging model, comm page, etc.).
1438     if (configure_hrt(vm,&mb)) {
1439         PrintError(vm,VCORE_NONE, "hvm: cannot configure HRT\n");
// Load the kernel image into [first_hrt_gpa, mem_size).
1443     if (v3_write_multiboot_kernel(vm,&mb,data,size,
1444 				  (void*)vm->hvm_state.first_hrt_gpa,
1445 				  vm->mem_size-vm->hvm_state.first_hrt_gpa)) {
1446 	PrintError(vm,VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n");
// An explicit GVA entry overrides the image's physical entry point;
// otherwise use the image entry address offset into the GVA space.
1450     if (vm->hvm_state.gva_entry) {
1451       vm->hvm_state.hrt_entry_addr = vm->hvm_state.gva_entry;
1453       vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + vm->hvm_state.gva_offset;
1456     vm->hvm_state.hrt_type = HRT_MBOOT64;
// Locate the HRT image (a ROS-installed image takes priority over the
// configured file) and hand it to the multiboot setup path.  Only ELF
// images containing a Multiboot2 header are accepted.
1463 static int setup_hrt(struct v3_vm_info *vm)
1468     // If the ROS has installed an image, it takes priority
1469     if (vm->hvm_state.hrt_image) {
1470 	data = vm->hvm_state.hrt_image;
1471 	size = vm->hvm_state.hrt_image_size;
// Otherwise fall back to the file given in the VM configuration.
1473 	data = vm->hvm_state.hrt_file->data;
1474 	size = vm->hvm_state.hrt_file->size;
1477     if (is_elf(data,size) &&
1478 	find_mb_header(data,size)) {
1480 	PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n");
1481 	if (setup_mb_kernel_hrt(vm,data,size)) {
1482 	    PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n");
1486 	PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not a multiboot kernel\n");
1503 We do not touch the ROS portion of the address space.
1504 The HRT portion looks like:
1506 INT_HANDLER (1 page - page aligned)
1507 IDT (1 page - page aligned)
1508 GDT (1 page - page aligned)
1509   TSS (1 page - page aligned)
1510   PAGETABLES (identity map of first N GB)
1511 ROOT PT first (lowest memory addr), followed by 2nd level PTs in order,
1512 followed by 3rd level PTs in order, followed by 4th level
1515 SCRATCH_STACK_HRT_CORE0
1516 SCRATCH_STACK_HRT_CORE1
1518 SCRATCH_STACK_HRT_COREN
1520 HRT (as many pages as needed, page-aligned, starting at first HRT address)
// One-time (per boot) VM-wide HVM memory setup: load the HRT image and
// then lay out the supporting structures (interrupt handler stub, IDT,
// GDT, TSS, page tables, MB info — see the layout comment above) whose
// locations depend on where the HRT landed.  No-op for non-HVM VMs.
1528 int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm)
1530     if (!vm->hvm_state.is_hvm) {
1531 	PrintDebug(vm,VCORE_NONE,"hvm: skipping HVM setup for boot as this is not an HVM\n");
1535     PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory begins\n");
1537     if (setup_hrt(vm)) {
1538 	PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n");
1542     // the locations of all the other items are determined by
1543     // the HRT setup, so these must happen after
1545     write_null_int_handler(vm);
1552     // this must happen last
1555     PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory done\n");
1561 On entry for every core:
1563 IDTR points to stub IDT
1564 GDTR points to stub GDT
1565   TR points to stub TSS
1566 CR3 points to root page table
1568 EFER has LME AND LMA (and NX for compatibility with Linux)
1569 RSP is TOS of core's scratch stack (looks like a call)
1571 RAX = MB magic cookie
1572 RBX = address of multiboot info table
1573 RCX = this core id / apic id (0..N-1)
1574 RDX = this core id - first HRT core ID (==0 for the first HRT core)
1576 All addresses are virtual addresses, offset as needed by gva_offset
1578 Other regs are zeroed
1580 shadow/nested paging state reset for long mode
// Put a single HRT core directly into 64-bit long mode with paging on,
// registers loaded per the entry contract documented above (RAX=MB magic,
// RBX=MB info GVA, RCX=apic id, RDX=HRT-relative core id, RSP=scratch
// stack top), and stub IDT/GDT/TSS plus the prebuilt identity page tables
// installed.  No-op for non-HRT cores.  Returns 0 on success (elided).
1583 int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
1587     uint64_t gva_offset;
// Record boot-start timestamp for later timing/diagnostics.
1589     rdtscll(core->hvm_state.last_boot_start);
1592     if (!core->hvm_state.is_hrt) {
1593 	PrintDebug(core->vm_info,core,"hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id);
1598     PrintDebug(core->vm_info, core, "hvm: setting up HRT core (%u) for boot\n", core->vcpu_id);
1600     gva_offset = core->vm_info->hvm_state.gva_offset;
// Start from a clean slate: zero all architectural register state.
1602     memset(&core->vm_regs,0,sizeof(core->vm_regs));
1603     memset(&core->ctrl_regs,0,sizeof(core->ctrl_regs));
1604     memset(&core->dbg_regs,0,sizeof(core->dbg_regs));
1605     memset(&core->segments,0,sizeof(core->segments));
1606     memset(&core->msrs,0,sizeof(core->msrs));
1607     memset(&core->fp_state,0,sizeof(core->fp_state));
1609     // We are in long mode with virtual memory and we want
1610     // to start immediately
1611     core->cpl = 0; // we are going right into the kernel
1612     core->cpu_mode = LONG;
1613     core->mem_mode = VIRTUAL_MEM;
1614     core->core_run_state = CORE_RUNNING ;
// Multiboot2 entry contract: RAX carries the boot-info magic.
1618     core->vm_regs.rax = MB2_INFO_MAGIC;
1620     // multiboot info pointer
1621     get_mb_info_loc(core->vm_info, &base,&limit);
1622     core->vm_regs.rbx = (uint64_t) base + gva_offset;
// RCX = absolute core/apic id.
1625     core->vm_regs.rcx = core->vcpu_id;
// RDX = id relative to the first HRT core (0 for the first HRT core).
1628     core->vm_regs.rdx = core->vcpu_id - core->vm_info->hvm_state.first_hrt_core;
1630     // Now point to scratch stack for this core
1631     // it begins at an offset relative to the MB info page
1632     get_mb_info_loc(core->vm_info, &base,&limit);
1633     base = base + gva_offset;
1634     base -= core->vm_regs.rdx * SCRATCH_STACK_SIZE;
1635     core->vm_regs.rsp = (v3_reg_t) base;
1636     core->vm_regs.rbp = (v3_reg_t) base-8;
1638     // push onto the stack a bad rbp and bad return address
1639     core->vm_regs.rsp-=16;
// Write the fake frame via GPA (subtract the GVA offset back out).
1640     v3_set_gpa_memory(core,
1641 		      core->vm_regs.rsp-gva_offset,
// Entry point: explicit GVA entry if configured, else the recorded
// HRT entry address shifted by the GVA offset.
1647     get_hrt_loc(core->vm_info, &base,&limit);
1648     if (core->vm_info->hvm_state.gva_entry) {
1649       core->rip = core->vm_info->hvm_state.gva_entry;
1651       core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr + gva_offset;
1656     PrintDebug(core->vm_info,core,"hvm: hrt core %u has rip=%p, rsp=%p, rbp=%p, rax=%p, rbx=%p, rcx=%p, rdx=%p\n",
1657 	       (core->vcpu_id - core->vm_info->hvm_state.first_hrt_core),
1659 	       (void*)(core->vm_regs.rsp),
1660 	       (void*)(core->vm_regs.rbp),
1661 	       (void*)(core->vm_regs.rax),
1662 	       (void*)(core->vm_regs.rbx),
1663 	       (void*)(core->vm_regs.rcx),
1664 	       (void*)(core->vm_regs.rdx));
1666     // Setup CRs for long mode and our stub page table
// CR0 = 0x80000001: PG (bit 31) and PE (bit 0).
1668     core->ctrl_regs.cr0 = 0x80000001;
1669     core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0;
1671     // CR2: don't care (output from #PF)
1672     // CR3: set to our PML4E, without setting PCD or PWT
1673     get_pt_loc(core->vm_info, &base,&limit);
1674     core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base);  // not offset as this is a GPA
1675     core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3;
1677     // CR4: PGE, PAE, PSE (last byte: 1 0 1 1 0 0 0 0)
1678     core->ctrl_regs.cr4 = 0xb0;
1679     core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4;
1681     // RFLAGS zeroed is fine: come in with interrupts off
1682     // EFER needs SVME LMA LME (last 16 bits: 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0
1683     core->ctrl_regs.efer = 0x1d00;
1684     core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer;
1690       selector is 13 bits of index, 1 bit table indicator
1693       index is scaled by 8, even in long mode, where some entries
1694       are 16 bytes long....
1695          -> code, data descriptors have 8 byte format
1696             because base, limit, etc, are ignored (no segmentation)
1697          -> interrupt/trap gates have 16 byte format
1698             because offset needs to be 64 bits
1701     // Install our stub IDT
1702     get_idt_loc(core->vm_info, &base,&limit);
1704     core->segments.idtr.selector = 0;  // entry 0 (NULL) of the GDT
1705     core->segments.idtr.base = (addr_t) base;  // only base+limit are used
1706     core->segments.idtr.limit = limit-1;
1707     core->segments.idtr.type = 0x0;
1708     core->segments.idtr.system = 0;
1709     core->segments.idtr.dpl = 0;
1710     core->segments.idtr.present = 0;
1711     core->segments.idtr.long_mode = 0;
1713     // Install our stub GDT
1714     get_gdt_loc(core->vm_info, &base,&limit);
1716     core->segments.gdtr.selector = 0;  // entry 0 (NULL) of the GDT
1717     core->segments.gdtr.base = (addr_t) base;
1718     core->segments.gdtr.limit = limit-1;   // only base+limit are used
1719     core->segments.gdtr.type = 0x0;
1720     core->segments.gdtr.system = 0;
1721     core->segments.gdtr.dpl = 0;
1722     core->segments.gdtr.present = 0;
1723     core->segments.gdtr.long_mode = 0;
// Task register points at the stub TSS.
1726     get_tss_loc(core->vm_info, &base,&limit);
1728     core->segments.tr.selector = 0;
1729     core->segments.tr.base = (addr_t) base;
1730     core->segments.tr.limit = limit-1;
1731     core->segments.tr.type = 0x9;
1732     core->segments.tr.system = 0;   // available 64 bit TSS
1733     core->segments.tr.dpl = 0;
1734     core->segments.tr.present = 1;
1735     core->segments.tr.long_mode = 0; // not used
1737     base = 0x0; // these are not offset as we want to make all gvas visible
// Flat 64-bit code segment.
1741     core->segments.cs.selector = 0x8 ; // entry 1 of GDT (RPL=0)
1742     core->segments.cs.base = (addr_t) base;  // not used
1743     core->segments.cs.limit = limit;         // not used
1744     core->segments.cs.type = 0xe;            // only C is used
1745     core->segments.cs.system = 1;            // not a system segment
1746     core->segments.cs.dpl = 0;
1747     core->segments.cs.present = 1;
1748     core->segments.cs.long_mode = 1;
1750     // DS, SS, etc are identical
1751     core->segments.ds.selector = 0x10; // entry 2 of GDT (RPL=0)
1752     core->segments.ds.base = (addr_t) base;
1753     core->segments.ds.limit = limit;
1754     core->segments.ds.type = 0x6;            // ignored
1755     core->segments.ds.system = 1;            // not a system segment
1756     core->segments.ds.dpl = 0;
1757     core->segments.ds.present = 1;
1758     core->segments.ds.long_mode = 1;
// Replicate the flat data segment into the remaining selectors.
1760     memcpy(&core->segments.ss,&core->segments.ds,sizeof(core->segments.ds));
1761     memcpy(&core->segments.es,&core->segments.ds,sizeof(core->segments.ds));
1762     memcpy(&core->segments.fs,&core->segments.ds,sizeof(core->segments.ds));
1763     memcpy(&core->segments.gs,&core->segments.ds,sizeof(core->segments.ds));
1766     // reset paging here for shadow...
// Only nested paging is expected; shadow paging is unsupported here.
1768     if (core->shdw_pg_mode != NESTED_PAGING) {
1769 	PrintError(core->vm_info, core, "hvm: shadow paging guest... this will end badly\n");
// Handle a reset request on an HRT core: rendezvous all HRT cores at the
// reset barrier, have the first HRT core redo the VM-wide HVM setup
// (reloading the HRT image), then re-initialize each core's boot state
// and release everyone back to running.  ROS cores fall through to the
// generic reset path.
1777 int v3_handle_hvm_reset(struct guest_info *core)
// Only act when this core has actually been put into the resetting state.
1780     if (core->core_run_state != CORE_RESETTING) {
1784     if (!core->vm_info->hvm_state.is_hvm) {
1788     if (v3_is_hvm_hrt_core(core)) {
1789 	// this is an HRT reset
1792 	// wait for all the HRT cores
1793 	v3_counting_barrier(&core->vm_info->reset_barrier);
// First HRT core flips the VM into the resetting state on behalf of all.
1795 	if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
1797 	    core->vm_info->run_state = VM_RESETTING;
1800 	core->core_run_state = CORE_RESETTING;
1802 	if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
1803 	    // we really only need to clear the bss
1804 	    // and recopy the .data, but for now we'll just
// ...redo the whole VM-wide setup (reloads the full HRT image).
1806 	    rc |= v3_setup_hvm_vm_for_boot(core->vm_info);
1809 		PrintError(core->vm_info,core,"hvm: failed to setup HVM VM for boot rc=%d\n",rc);
1813 	// now everyone is ready to reset
// Per-core register/segment/paging state back to boot values.
1814 	rc |= v3_setup_hvm_hrt_core_for_boot(core);
1817 	    PrintError(core->vm_info,core,"hvm: failed to setup HVM core for boot rc=%d\n",rc);
1820 	core->core_run_state = CORE_RUNNING;
// First HRT core marks the VM running again and idles the transaction
// state machine.
1822 	if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
1824 	    core->vm_info->run_state = VM_RUNNING;
1825 	    core->vm_info->hvm_state.trans_state = HRT_IDLE;
// Final rendezvous so no core races ahead of the others.
1828 	v3_counting_barrier(&core->vm_info->reset_barrier);
1831 	    PrintError(core->vm_info,core,"hvm: reset failed\n");
1838 	// ROS core will be handled by normal reset functionality
// On each ROS-core VM entry, check whether a pending ROS signal can be
// delivered.  Delivery mimics a user-level interrupt: an iret-style frame
// (SS, RSP, RFLAGS, CS, RIP return state) is pushed onto the registered
// user stack and RIP/RSP are redirected to the registered handler.
// Injection only happens when the target process is active in user mode;
// otherwise the signal stays pending for a later entry.
1844 int v3_handle_hvm_entry(struct guest_info *core)
1846     if (!core->vm_info->hvm_state.is_hvm                         // not relevant to non-HVM
1847 	|| core->hvm_state.is_hrt                                // not relevant to an HRT in an HVM
1848 	|| !core->vm_info->hvm_state.ros_signal.code) {          // not relevant if there is no code to inject
1850 	// Note that above check for code could race with a writer, but
1851 	// if that happens, we'll simply inject at the next opportunity instead of
1852 	// this one (see below for atomic update)
1855 	struct v3_ros_signal *s = &core->vm_info->hvm_state.ros_signal;
// Preconditions for delivery: handler/process/stack registered, core in
// user mode (CPL 3), and the registered process (by CR3) currently active.
1858 	if (! (s->handler &&                   // handler installed
1859 	       s->cr3 &&                       // process installed
1860 	       s->stack &&                     // stack installed
1861 	       core->cpl == 3 &&               // user mode
1862 	       core->ctrl_regs.cr3 == s->cr3)  // right process active
1864 	    // Cannot inject at this time
1867 	    // We can inject now, let's atomically see if we have something
1868 	    // and commit to doing it if we do
1871 	    // Get code, reset to allow next one
// Atomic swap-to-zero claims the pending code exactly once even if a
// writer races with us.
1872 	    code = __sync_fetch_and_and(&(s->code), 0);
1875 		// nothing to do after all
1879 		// actually do inject
1884 		PrintDebug(core->vm_info,core,"hvm: ROS interrupt starting with rip=%p rsp=%p\n", (void*) core->rip, (void*) core->vm_regs.rsp);
1885 		// build interrupt frame
// frame[0] presumably carries the signal code (assignment elided).
1887 		frame[1] = core->rip;
1888 		frame[2] = core->segments.cs.selector; // return cs
1889 		frame[3] = core->ctrl_regs.rflags;
1890 		frame[4] = core->vm_regs.rsp;
1891 		frame[5] = core->segments.ss.selector; // return ss
// Place the frame on the registered user stack, 8-byte aligned.
1893 		rsp = (s->stack - 8) & (~0x7); // make sure we are aligned
1894 		rsp -= sizeof(frame);
1897 		if (v3_write_gva_memory(core,(addr_t)rsp,sizeof(frame),(uint8_t*)frame)!=sizeof(frame)) {
1898 		    PrintError(core->vm_info,core,"hvm: failed to write interrupt frame\n");
1899 		    // we just lost this inject
1903 		// now make us look like we are jumping to the entry
1904 		core->rip = s->handler;
1905 		core->vm_regs.rsp = rsp;
1907 		PrintDebug(core->vm_info,core,"hvm: ROS frame is 0x%llx|0x%llx|0x%llx|0x%llx|0x%llx|0x%llx and and on entry rip=%p and rsp=%p\n", frame[0],frame[1],frame[2],frame[3],frame[4],frame[5],(void*) core->rip, (void*) core->vm_regs.rsp);
1909 		// and we should be good to go
// VM-exit hook counterpart to v3_handle_hvm_entry; currently a no-op
// kept so the entry/exit hook pair stays symmetric.
1916 int v3_handle_hvm_exit(struct guest_info *core)
1918     // currently nothing
1922 int v3_hvm_signal_ros(struct v3_vm_info *vm, uint64_t code)
1924 struct v3_ros_signal *s = &vm->hvm_state.ros_signal;
1927 PrintError(vm,VCORE_NONE,"hvm: cannot signal ros with code zero\n");
1931 // handler, etc, must exist
1932 if (!s->handler || !s->stack) {
1933 PrintError(vm,VCORE_NONE,"hvm: cannot signal ros with no installed handler\n");
1936 // we set the code only if we are idle (code 0),
1937 // and we do so only
1938 if (!__sync_bool_compare_and_swap(&(s->code), 0, code)) {
1939 PrintError(vm,VCORE_NONE,"hvm: signal was already asserted\n");
1942 PrintDebug(vm,VCORE_NONE,"hvm: raised signal 0x%llx to the ROS\n",code);