 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National
 * Science Foundation and the Department of Energy.
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico. You can find out more at
 * http://www.v3vee.org
 *
 * Copyright (c) 2015, The V3VEE Project <http://www.v3vee.org>
 * All rights reserved.
 *
 * Author: Peter Dinda <pdinda@northwestern.edu>
 *
 * This is free software. You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
#include <palacios/vmm_mem.h>
#include <palacios/vmm.h>
#include <palacios/vmm_util.h>
#include <palacios/vmm_emulator.h>
#include <palacios/vm_guest.h>
#include <palacios/vmm_debug.h>
#include <palacios/vmm_hypercall.h>

#include <palacios/vmm_xml.h>

#include <palacios/vm_guest_mem.h>
    } __attribute__((packed)) gdtr;
} __attribute__((packed));
/*
  MEM       = Total size of memory in the GPA (in MB)
  ROS_MEM   = Total size of memory for the ROS (in MB) (<MEM)

  GPAs [0,ROS_MEM) are what the ROS sees
  GPAs [ROS_MEM, MEM) are HRT only
  GPAs [0,MEM) are accessible by the HRT

  CORES     = Total number of cores in VM
  ROS_CORES = Total number of cores for the ROS

  Cores [0,ROS_CORES) are what the ROS sees
  Cores [ROS_CORES,CORES) are HRT only
  Cores [0,CORES) are accessible by the HRT
  <file id="hrtelf" filename="hrtelf.o" />

  <mem ... >MEM</mem>                       (MB)  Note these are
  <cores count="CORES" ...>                       backward compatible

  <ros cores="ROS_CORES" mem="ROS_MEM" />   (MB)
  <hrt file_id="hrtelf" />
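
  For example, a sketch with illustrative values (per the parsing code
  below, the <hvm> block must carry enable="y" and is read with
  v3_cfg_subtree/v3_cfg_val):

  <hvm enable="y">
     <ros cores="2" mem="512" />
     <hrt file_id="hrtelf" />
  </hvm>
*/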
#ifndef V3_CONFIG_DEBUG_HVM
#undef PrintDebug
#define PrintDebug(fmt, args...)
#endif

    PrintDebug(VM_NONE, VCORE_NONE, "hvm: init\n");

    PrintDebug(VM_NONE, VCORE_NONE, "hvm: deinit\n");
// ignore requests made when we are in the wrong state
#define ENFORCE_STATE_MACHINE 1

// invoke the HRT using one of the following mechanisms
#define UPCALL_MAGIC_ADDRESS 0x0000800df00df00dULL
#define UPCALL_MAGIC_ERROR   0xf00df00d
static int magic_upcall(struct guest_info *core, uint64_t num)
{
#ifdef V3_CONFIG_HVM_UPCALL_MAGIC_GPF
    PrintDebug(core->vm_info, core, "hvm: injecting magic #GP into core %llu\n", num);
    if (v3_raise_exception_with_error(&core->vm_info->cores[num],
				      GPF_EXCEPTION,
				      UPCALL_MAGIC_ERROR)) {
	PrintError(core->vm_info, core, "hvm: cannot inject HRT #GP to core %llu\n", num);

#ifdef V3_CONFIG_HVM_UPCALL_MAGIC_PF
    PrintDebug(core->vm_info, core, "hvm: injecting magic #PF into core %llu\n", num);
    core->vm_info->cores[num].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
    if (v3_raise_exception_with_error(&core->vm_info->cores[num],
				      PF_EXCEPTION,
				      UPCALL_MAGIC_ERROR)) {
	PrintError(core->vm_info, core, "hvm: cannot inject HRT #PF to core %llu\n", num);

#ifdef V3_CONFIG_HVM_UPCALL_MAGIC_SWIN
    PrintDebug(core->vm_info, core, "hvm: injecting SW intr 0x%x into core %llu\n", core->vm_info->hvm_state.hrt_int_vector, num);
    if (v3_raise_swintr(&core->vm_info->cores[num], core->vm_info->hvm_state.hrt_int_vector)) {
	PrintError(core->vm_info, core, "hvm: cannot inject HRT interrupt to core %llu\n", num);

    PrintError(core->vm_info, core, "hvm: no upcall mechanism is enabled!\n");
/*
  64 bit only hypercall:

  rax = hypercall number
  rbx = 0x6464646464646464 (marks the caller as 64 bit)
  then args are: rcx, rdx, rsi, rdi, r8, r9, r10, r11
*/
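// A minimal guest-side sketch of issuing one of these hypercalls
// (illustrative only, not part of this file): HVM_HCALL is the number this
// file registers, and the sketch assumes an AMD guest (vmmcall; a VMX guest
// would use vmcall). The operation code lands in rcx (a1 in the handler
// below), with its arguments in rdx and rsi.
//
//   static inline uint64_t hvm_hcall(uint64_t op, uint64_t arg1, uint64_t arg2)
//   {
//       uint64_t rc;
//       uint64_t bits = 0x6464646464646464ULL;   // mark this as a 64 bit caller
//
//       asm volatile ("vmmcall"
//                     : "=a"(rc)                 // result comes back in rax
//                     : "a"((uint64_t)HVM_HCALL), "b"(bits),
//                       "c"(op), "d"(arg1), "S"(arg2)
//                     : "memory");
//
//       return rc;                               // 0 on success, -1 on error
//   }
//
// For example, hvm_hcall(0x2, 0, 0) would request an HRT reset.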
static int hvm_hcall_handler(struct guest_info *core, hcall_id_t hcall_id, void *priv_data)
{
    uint64_t c;
    uint64_t bitness = core->vm_regs.rbx;
    uint64_t a1 = core->vm_regs.rcx;
    uint64_t a2 = core->vm_regs.rdx;
    uint64_t a3 = core->vm_regs.rsi;
    struct v3_vm_hvm *h = &core->vm_info->hvm_state;
    addr_t irq_state;

    // Let's be paranoid here
    irq_state = v3_lock_irqsave(h->hypercall_lock);

    if (bitness != 0x6464646464646464) {
	PrintError(core->vm_info, core, "hvm: unable to handle non-64 bit hypercall\n");
	core->vm_regs.rax = -1;
	v3_unlock_irqrestore(h->hypercall_lock, irq_state);
	return 0;
    }

    rdtscll(c);

    V3_Print(core->vm_info, core, "hvm: received hypercall %x rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n",
	     hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, c - core->hvm_state.last_boot_start, core->num_exits);
    //v3_print_core_telemetry(core);
    // v3_print_guest_state(core);
    core->vm_regs.rax = 0;
    switch (a1) {

	case 0x1: // reset ros
	    PrintDebug(core->vm_info, core, "hvm: reset ROS\n");
	    if (v3_reset_vm_extended(core->vm_info, V3_VM_RESET_ROS, 0)) {
		PrintError(core->vm_info, core, "hvm: reset of ROS failed\n");
		core->vm_regs.rax = -1;
	    } else {
		core->vm_regs.rax = 0;
	    }
	    break;

	case 0x2: // reset hrt
	    PrintDebug(core->vm_info, core, "hvm: reset HRT\n");
	    if (v3_reset_vm_extended(core->vm_info, V3_VM_RESET_HRT, 0)) {
		PrintError(core->vm_info, core, "hvm: reset of HRT failed\n");
		core->vm_regs.rax = -1;
	    } else {
		core->vm_regs.rax = 0;
	    }
	    break;

	case 0x3: // reset both
	    PrintDebug(core->vm_info, core, "hvm: reset ROS+HRT\n");
	    if (v3_reset_vm_extended(core->vm_info, V3_VM_RESET_ALL, 0)) {
		PrintError(core->vm_info, core, "hvm: reset of ROS+HRT failed\n");
		core->vm_regs.rax = -1;
	    } else {
		core->vm_regs.rax = 0;
	    }
	    break;
	case 0x8: // replace HRT image
	    // a2 = gva of image
	    // a3 = size of image
	    PrintDebug(core->vm_info, core, "hvm: request replacement HRT image addr=0x%llx size=0x%llx\n", a2, a3);

	    if (h->hrt_image) {
		V3_VFree(h->hrt_image);
		h->hrt_image = 0;
	    }

	    h->hrt_image = V3_VMalloc(a3);

	    if (!(h->hrt_image)) {
		PrintError(core->vm_info, core, "hvm: failed to allocate space for replacement image\n");
		core->vm_regs.rax = -1;
	    } else {
		if (v3_read_gva_memory(core, a2, a3, (uint8_t*) h->hrt_image) != a3) {
		    PrintError(core->vm_info, core, "hvm: cannot read replacement image\n");
		    core->vm_regs.rax = -1;
		} else {
		    h->hrt_image_size = a3;
		    core->vm_regs.rax = 0;
		}
	    }

	    if (core->vm_regs.rax) {
		PrintError(core->vm_info, core, "hvm: Failed to replace HRT image\n");
	    } else {
		PrintDebug(core->vm_info, core, "hvm: HRT image successfully replaced\n");
	    }

	    break;
	case 0xf: // get HRT state
	    core->vm_regs.rax = h->trans_state;
	    if (v3_write_gva_memory(core, a2, sizeof(h->ros_event), (uint8_t*) &h->ros_event) != sizeof(h->ros_event)) {
		PrintError(core->vm_info, core, "hvm: cannot write back ROS event state to %p - continuing\n", (void*)a2);
	    }
	    //PrintDebug(core->vm_info, core, "hvm: get HRT transaction state 0x%llx\n", core->vm_regs.rax);
	    break;
	    PrintDebug(core->vm_info, core, "hvm: ROS event request\n");
	    if (h->ros_event.event_type != ROS_NONE) {
		PrintError(core->vm_info, core, "hvm: ROS event is already in progress\n");
		core->vm_regs.rax = -1;
	    } else {
		if (v3_read_gva_memory(core, a2, sizeof(h->ros_event), (uint8_t*)&h->ros_event) != sizeof(h->ros_event)) {
		    PrintError(core->vm_info, core, "hvm: cannot read ROS event from %p\n", (void*)a2);
		    core->vm_regs.rax = -1;
		} else {
		    core->vm_regs.rax = 0;
		    PrintDebug(core->vm_info, core, "hvm: copied new ROS event (type=%s)\n",
			       h->ros_event.event_type == ROS_PAGE_FAULT ? "page fault" :
			       (h->ros_event.event_type == ROS_SYSCALL ? "syscall" : "none"));
		}
	    }

	    break;
	case 0x1e: // ack result (HRT has read the result of the finished event)
	    if (h->ros_event.event_type != ROS_DONE) {
		PrintError(core->vm_info, core, "hvm: cannot ack event result when not in ROS_DONE state\n");
		core->vm_regs.rax = -1;
	    } else {
		h->ros_event.event_type = ROS_NONE;
		PrintDebug(core->vm_info, core, "hvm: HRT core acks event result\n");
		core->vm_regs.rax = 0;
	    }
	    break;

	    PrintDebug(core->vm_info, core, "hvm: completion of ROS event (rc=0x%llx)\n", a2);
	    h->ros_event.event_type = ROS_DONE;
	    h->ros_event.last_ros_event_result = a2;
	    break;
	case 0x20: // invoke function (ROS->HRT)
	case 0x21: // invoke parallel function (ROS->HRT)
	    if (v3_is_hvm_hrt_core(core)) {
		PrintError(core->vm_info, core, "hvm: %s function invocation not supported from HRT core\n", a1==0x20 ? "" : "parallel");
		core->vm_regs.rax = -1;
	    } else {
		if (ENFORCE_STATE_MACHINE && h->trans_state != HRT_IDLE) {
		    PrintError(core->vm_info, core, "hvm: cannot invoke %s function %p in state %d\n", a1==0x20 ? "" : "parallel", (void*)a2, h->trans_state);
		    core->vm_regs.rax = -1;
		} else {
		    uint64_t *page = (uint64_t *) h->comm_page_hva;
		    uint64_t first, last, cur;

		    PrintDebug(core->vm_info, core, "hvm: %s invoke function %p\n", a1==0x20 ? "" : "parallel", (void*)a2);

		    if (a1 == 0x20) {
			first = last = h->first_hrt_core;
		    } else {
			first = h->first_hrt_core;
			last  = core->vm_info->num_cores - 1;
		    }

		    core->vm_regs.rax = 0;

		    h->trans_count = last - first + 1;

		    for (cur = first; cur <= last; cur++) {
			if (magic_upcall(core, cur)) {
			    core->vm_regs.rax = -1;
			    break;
			}
			// Force core to exit now
			v3_interrupt_cpu(core->vm_info, core->vm_info->cores[cur].pcpu_id, 0);
		    }

		    if (core->vm_regs.rax == 0) {
			if (a1 == 0x20) {
			    h->trans_state = HRT_CALL;
			} else {
			    h->trans_state = HRT_PARCALL;
			}
		    } else {
			PrintError(core->vm_info, core, "hvm: in inconsistent state due to HRT call failure\n");
			h->trans_state = HRT_IDLE;
		    }
		}
	    }

	    break;
	case 0x28: // setup for synchronous operation (ROS->HRT)
	case 0x29: // teardown for synchronous operation (ROS->HRT)
	    if (v3_is_hvm_hrt_core(core)) {
		PrintError(core->vm_info, core, "hvm: %ssynchronization invocation not supported from HRT core\n", a1==0x29 ? "de" : "");
		core->vm_regs.rax = -1;
	    } else {
		if (ENFORCE_STATE_MACHINE &&
		    ((a1==0x28 && h->trans_state != HRT_IDLE) || (a1==0x29 && h->trans_state != HRT_SYNC))) {
		    PrintError(core->vm_info, core, "hvm: cannot invoke %ssynchronization in state %d\n", a1==0x29 ? "de" : "", h->trans_state);
		    core->vm_regs.rax = -1;
		} else {
		    uint64_t *page = (uint64_t *) h->comm_page_hva;
		    uint64_t first, last, cur;

		    PrintDebug(core->vm_info, core, "hvm: invoke %ssynchronization on address %p\n", a1==0x29 ? "de" : "", (void*)a2);

		    first = last = h->first_hrt_core; // initially we will sync only with BSP

		    core->vm_regs.rax = 0;

		    h->trans_count = last - first + 1;

		    for (cur = first; cur <= last; cur++) {

			if (magic_upcall(core, cur)) {
			    core->vm_regs.rax = -1;
			    break;
			}
			// Force core to exit now
			v3_interrupt_cpu(core->vm_info, core->vm_info->cores[cur].pcpu_id, 0);
		    }

		    if (core->vm_regs.rax == 0) {
			if (a1 == 0x28) {
			    h->trans_state = HRT_SYNCSETUP;
			} else {
			    h->trans_state = HRT_SYNCTEARDOWN;
			}
		    } else {
			PrintError(core->vm_info, core, "hvm: in inconsistent state due to HRT call failure\n");
			h->trans_state = HRT_IDLE;
		    }
		}
	    }

	    break;
	case 0x2f: // function exec or sync done
	    if (v3_is_hvm_ros_core(core)) {
		PrintError(core->vm_info, core, "hvm: request for exec or sync done from ROS core\n");
		core->vm_regs.rax = -1;
	    } else {
		if (ENFORCE_STATE_MACHINE &&
		    h->trans_state != HRT_CALL &&
		    h->trans_state != HRT_PARCALL &&
		    h->trans_state != HRT_SYNCSETUP &&
		    h->trans_state != HRT_SYNCTEARDOWN) {
		    PrintError(core->vm_info, core, "hvm: function or sync completion when not in HRT_CALL, HRT_PARCALL, HRT_SYNCSETUP, or HRT_SYNCTEARDOWN state\n");
		    core->vm_regs.rax = -1;
		} else {
		    PrintDebug(core->vm_info, core, "hvm: function or sync complete\n");
		    if (__sync_fetch_and_sub(&h->trans_count, 1) == 1) {
			// last one, switch state
			if (h->trans_state == HRT_SYNCSETUP) {
			    h->trans_state = HRT_SYNC;
			    PrintDebug(core->vm_info, core, "hvm: function complete - now synchronous\n");
			} else {
			    h->trans_state = HRT_IDLE;
			}
		    }
		}
	    }

	    break;
	case 0x30: // merge address space
	case 0x31: // unmerge address space
	    if (v3_is_hvm_hrt_core(core)) {
		PrintError(core->vm_info, core, "hvm: request to %smerge address space from HRT core\n", a1==0x30 ? "" : "un");
		core->vm_regs.rax = -1;
	    } else {
		if (ENFORCE_STATE_MACHINE && h->trans_state != HRT_IDLE) {
		    PrintError(core->vm_info, core, "hvm: request to %smerge address space in non-idle state (%d)\n", a1==0x30 ? "" : "un", h->trans_state);
		    core->vm_regs.rax = -1;
		} else {
		    uint64_t *page = (uint64_t *) h->comm_page_hva;

		    PrintDebug(core->vm_info, core, "hvm: %smerge address space request with %p\n", a1==0x30 ? "" : "un", (void*)core->ctrl_regs.cr3);
		    // should sanity check to make sure guest is in 64 bit without anything strange

		    page[1] = core->ctrl_regs.cr3;  // this is a do-not-care for an unmerge

		    core->vm_regs.rax = 0;

		    h->trans_state = HRT_MERGE;

		    if (magic_upcall(core, h->first_hrt_core)) {
			core->vm_regs.rax = -1;
		    }

		    // Force core to exit now
		    v3_interrupt_cpu(core->vm_info, core->vm_info->cores[h->first_hrt_core].pcpu_id, 0);
		}
	    }

	    break;
	case 0x3f: // merge operation done
	    if (v3_is_hvm_ros_core(core)) {
		PrintError(core->vm_info, core, "hvm: request for merge done from ROS core\n");
		core->vm_regs.rax = -1;
	    } else {
		if (ENFORCE_STATE_MACHINE && h->trans_state != HRT_MERGE) {
		    PrintError(core->vm_info, core, "hvm: merge/unmerge done when not in merge state\n");
		    core->vm_regs.rax = -1;
		} else {
		    PrintDebug(core->vm_info, core, "hvm: merge or unmerge complete - back to idle\n");
		    h->trans_state = HRT_IDLE;
		}
	    }

	    break;
	case 0x40: // install or remove signal handler
	    if (v3_is_hvm_hrt_core(core)) {
		PrintError(core->vm_info, core, "hvm: HRT cannot install signal handler...\n");
		core->vm_regs.rax = -1;
	    } else {
		PrintDebug(core->vm_info, core, "hvm: install signal handler for CR3=%p, handler=%p, stack=%p\n", (void*)core->ctrl_regs.cr3, (void*)a2, (void*)a3);
		if (h->ros_signal.code) {
		    PrintError(core->vm_info, core, "hvm: signal is pending...\n");
		    core->vm_regs.rax = -1;
		} else {
		    if ((a2 || a3) && (h->ros_signal.handler || h->ros_signal.stack)) {
			PrintError(core->vm_info, core, "hvm: attempt to replace existing handler without removing it first\n");
			core->vm_regs.rax = -1;
		    } else {
			// actually make the change
			h->ros_signal.handler = a2;
			h->ros_signal.stack   = a3;
			h->ros_signal.cr3     = core->ctrl_regs.cr3;

			// test by signalling back a hello
			// v3_hvm_signal_ros(core->vm_info, 0xf00d);
		    }
		}
	    }

	    break;
	case 0x41: // raise signal in the ROS from HRT or ROS
	    PrintDebug(core->vm_info, core, "hvm: HRT raises signal code=0x%llx\n", a2);
	    core->vm_regs.rax = v3_hvm_signal_ros(core->vm_info, a2);

	    break;
	case 0x51: // fill GDT area (HRT only)
	    if (v3_is_hvm_hrt_core(core)) {
		PrintError(core->vm_info, core, "hvm: HRT cannot request a GDT area fill\n");
		core->vm_regs.rax = -1;
	    } else {
		struct guest_info *hrt_core = &core->vm_info->cores[h->first_hrt_core];
		struct gdt_area *area = V3_Malloc(sizeof(struct gdt_area) + core->segments.gdtr.limit);

		if (!area) {
		    PrintError(core->vm_info, core, "hvm: could not allocate GDT area\n");
		    core->vm_regs.rax = -1;
		    break;
		}

		PrintDebug(core->vm_info, core, "hvm: ROS requests to fill GDT area with fsbase=%p\n", (void*)a2);

		if (!h->hrt_gdt_gva) {
		    PrintError(core->vm_info, core, "hvm: HRT has not registered a GDT state save area\n");
		    core->vm_regs.rax = -1;
		} else {
		    area->gdtr.base  = h->hrt_gdt_gva + sizeof(struct gdt_area);
		    area->gdtr.limit = core->segments.gdtr.limit;

		    area->cs = core->segments.cs.selector;
		    area->ds = core->segments.ds.selector;
		    area->es = core->segments.es.selector;
		    area->fs = core->segments.fs.selector;
		    area->gs = core->segments.gs.selector;
		    area->ss = core->segments.ss.selector;

		    if (v3_read_gva_memory(core,
					   core->segments.gdtr.base,
					   core->segments.gdtr.limit,
					   (uint8_t*)area->gdt) != core->segments.gdtr.limit) {
			PrintError(core->vm_info, core, "hvm: could not copy GDT from ROS\n");
			core->vm_regs.rax = -1;
		    } else {
			uint_t area_size = sizeof(struct gdt_area) + core->segments.gdtr.limit;

			// copy the entire area over
			PrintDebug(core->vm_info, core, "hvm: copying %u bytes into GDT area\n", area_size);

			if (v3_write_gva_memory(hrt_core, h->hrt_gdt_gva, area_size, (uchar_t*)area) != area_size) {
			    PrintError(core->vm_info, core, "hvm: could not copy GDT area\n");
			    core->vm_regs.rax = -1;
			} else {
			    if (ENFORCE_STATE_MACHINE && h->trans_state != HRT_IDLE) {
				PrintError(core->vm_info, core, "hvm: cannot sync GDT in state %d\n", h->trans_state);
				core->vm_regs.rax = -1;
			    } else {
				uint64_t *page = (uint64_t *) h->comm_page_hva;
				uint64_t first, last, cur;

				PrintDebug(core->vm_info, core, "hvm: sync GDT\n");

				page[1] = h->hrt_gdt_gva;

				first = last = h->first_hrt_core;

				core->vm_regs.rax = 0;

				h->trans_count = last - first + 1;

				for (cur = first; cur <= last; cur++) {
				    if (magic_upcall(core, cur)) {
					core->vm_regs.rax = -1;
					break;
				    }
				    // Force core to exit now
				    v3_interrupt_cpu(core->vm_info, core->vm_info->cores[cur].pcpu_id, 0);
				}

				if (core->vm_regs.rax == 0) {
				    h->trans_state = HRT_GDTSYNC;
				} else {
				    PrintError(core->vm_info, core, "hvm: in inconsistent state due to HRT GDT SYNC failure\n");
				    h->trans_state = HRT_IDLE;
				}
			    }
			}
		    }
		}

		V3_Free(area);
	    }

	    break;
	case 0x52: // register HRT GDT area
	    if (!v3_is_hvm_hrt_core(core)) {
		PrintError(core->vm_info, core, "hvm: ROS cannot install a GDT area\n");
		core->vm_regs.rax = -1;
	    } else {
		PrintDebug(core->vm_info, core, "hvm: HRT registers GDT save area at gva=%p\n", (void*)a2);
		h->hrt_gdt_gva = a2;
		core->vm_regs.rax = 0;

		PrintDebug(core->vm_info, core, "hvm: Printing current HRT GDT...\n");
#ifdef V3_CONFIG_DEBUG_HVM
		v3_print_gdt(core, core->segments.gdtr.base);
#endif
	    }

	    break;
	case 0x53: // restore GDT
	    if (v3_is_hvm_hrt_core(core)) {
		PrintError(core->vm_info, core, "hvm: HRT cannot request GDT restoration\n");
		core->vm_regs.rax = -1;
	    } else {
		PrintDebug(core->vm_info, core, "hvm: ROS requesting to restore original GDT\n");
		core->vm_regs.rax = 0;

		if (ENFORCE_STATE_MACHINE && h->trans_state != HRT_IDLE) {
		    PrintError(core->vm_info, core, "hvm: cannot sync GDT in state %d\n", h->trans_state);
		    core->vm_regs.rax = -1;
		} else {
		    uint64_t *page = (uint64_t *) h->comm_page_hva;
		    uint64_t first, last, cur;

		    PrintDebug(core->vm_info, core, "hvm: restore GDT\n");

		    first = last = h->first_hrt_core;

		    core->vm_regs.rax = 0;

		    h->trans_count = last - first + 1;

		    for (cur = first; cur <= last; cur++) {
			if (magic_upcall(core, cur)) {
			    core->vm_regs.rax = -1;
			    break;
			}
			// Force core to exit now
			v3_interrupt_cpu(core->vm_info, core->vm_info->cores[cur].pcpu_id, 0);
		    }

		    if (core->vm_regs.rax == 0) {
			h->trans_state = HRT_GDTSYNC;
		    } else {
			PrintError(core->vm_info, core, "hvm: in inconsistent state due to HRT GDT SYNC failure\n");
			h->trans_state = HRT_IDLE;
		    }
		}
	    }

	    break;
	case 0x5f: // GDT sync operation done
	    if (v3_is_hvm_ros_core(core)) {
		PrintError(core->vm_info, core, "hvm: invalid request for GDT sync done from ROS core\n");
		core->vm_regs.rax = -1;
	    } else {
		if (ENFORCE_STATE_MACHINE && h->trans_state != HRT_GDTSYNC) {
		    PrintError(core->vm_info, core, "hvm: GDT sync done when in incorrect state (%d)\n", h->trans_state);
		    core->vm_regs.rax = -1;
		} else {
		    PrintDebug(core->vm_info, core, "hvm: GDT sync complete - back to idle\n");
		    PrintDebug(core->vm_info, core, "hvm: Dumping new HRT GDT...\n");
#ifdef V3_CONFIG_DEBUG_HVM
		    v3_print_gdt(core, core->segments.gdtr.base);
#endif
		    h->trans_state = HRT_IDLE;
		}
	    }

	    break;
	default:
	    PrintError(core->vm_info, core, "hvm: unknown hypercall %llx\n", a1);
	    core->vm_regs.rax = -1;
	    break;
    }

    v3_unlock_irqrestore(h->hypercall_lock, irq_state);

    return 0;
}
#define CEIL_DIV(x,y) (((x)/(y)) + !!((x)%(y)))
int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
{
    v3_cfg_tree_t *hvm_config;
    v3_cfg_tree_t *ros_config;
    v3_cfg_tree_t *hrt_config;
    char *enable;
    char *ros_cores;
    char *ros_mem;
    char *hrt_file_id;

    PrintDebug(vm, VCORE_NONE, "hvm: vm init\n");

    memset(&vm->hvm_state, 0, sizeof(struct v3_vm_hvm));
    vm->hvm_state.is_hvm = 0;
    vm->hvm_state.first_hrt_core = vm->num_cores;
    vm->hvm_state.first_hrt_gpa = vm->mem_size;

    if (!config || !(hvm_config = v3_cfg_subtree(config, "hvm"))) {
	PrintDebug(vm, VCORE_NONE, "hvm: no HVM configuration found (all HW is ROS)\n");
	return 0;
    }

    if (!(enable = v3_cfg_val(hvm_config, "enable")) || strcasecmp(enable, "y")) {
	PrintDebug(vm, VCORE_NONE, "hvm: HVM configuration disabled (all HW is ROS)\n");
	return 0;
    }

    if (!(ros_config = v3_cfg_subtree(hvm_config, "ros"))) {
	PrintError(vm, VCORE_NONE, "hvm: HVM configuration without ROS block...\n");
	return -1;
    }

    if (!(ros_cores = v3_cfg_val(ros_config, "cores"))) {
	PrintError(vm, VCORE_NONE, "hvm: ROS block without cores...\n");
	return -1;
    }

    vm->hvm_state.first_hrt_core = ((uint32_t)atoi(ros_cores));

    if (!(ros_mem = v3_cfg_val(ros_config, "mem"))) {
	PrintError(vm, VCORE_NONE, "hvm: ROS block without mem...\n");
	return -1;
    }

    vm->hvm_state.first_hrt_gpa = ((uint64_t)atoi(ros_mem))*1024*1024;

    if (!(hrt_config = v3_cfg_subtree(hvm_config, "hrt"))) {
	PrintError(vm, VCORE_NONE, "hvm: HVM configuration without HRT block...\n");
	return -1;
    }

    if (!(hrt_file_id = v3_cfg_val(hrt_config, "file_id"))) {
	PrintError(vm, VCORE_NONE, "hvm: HRT block without file_id...\n");
	return -1;
    }

    vm->hvm_state.hrt_file = v3_cfg_get_file(vm, hrt_file_id);

    if (!vm->hvm_state.hrt_file) {
	PrintError(vm, VCORE_NONE, "hvm: HRT block contains bad file_id (%s)\n", hrt_file_id);
	return -1;
    }

    if (v3_register_hypercall(vm, HVM_HCALL,
			      hvm_hcall_handler, 0)) {
	PrintError(vm, VCORE_NONE, "hvm: cannot register hypercall...\n");
	return -1;
    }

    v3_lock_init(&(vm->hvm_state.hypercall_lock));

    // XXX sanity check config here

    vm->hvm_state.is_hvm = 1;

    if (vm->hvm_state.is_hvm) {
	V3_Print(vm, VCORE_NONE, "hvm: [ROS: cores 0..%u, mem 0..%p] [HRT: cores %u..%u, mem %p..%p, file_id=%s (tag %s)]\n",
		 vm->hvm_state.first_hrt_core - 1,
		 (void*)(vm->hvm_state.first_hrt_gpa - 1),
		 vm->hvm_state.first_hrt_core,
		 vm->num_cores - 1,
		 (void*)vm->hvm_state.first_hrt_gpa,
		 (void*)(vm->mem_size - 1),
		 hrt_file_id,
		 vm->hvm_state.hrt_file->tag);
    } else {
	V3_Print(vm, VCORE_NONE, "hvm: This is a pure ROS VM\n");
    }

    return 0;
}
int v3_deinit_hvm_vm(struct v3_vm_info *vm)
{
    PrintDebug(vm, VCORE_NONE, "hvm: HVM VM deinit\n");

    if (vm->hvm_state.hrt_image) {
	V3_VFree(vm->hvm_state.hrt_image);
	vm->hvm_state.hrt_image = 0;
	vm->hvm_state.hrt_image_size = 0;
    }

    v3_remove_hypercall(vm, HVM_HCALL);

    v3_lock_deinit(&(vm->hvm_state.hypercall_lock));

    if (vm->hvm_state.comm_page_hpa) {
	struct v3_mem_region *r = v3_get_mem_region(vm, -1, (addr_t)vm->hvm_state.comm_page_hpa);
	if (!r) {
	    PrintError(vm, VCORE_NONE, "hvm: odd, VM has comm_page_hpa, but no shadow memory\n");
	} else {
	    v3_delete_mem_region(vm, r);
	}
    }

    return 0;
}
int v3_init_hvm_core(struct guest_info *core)
{
    memset(&core->hvm_state, 0, sizeof(core->hvm_state));
    if (core->vm_info->hvm_state.is_hvm) {
	if (core->vcpu_id >= core->vm_info->hvm_state.first_hrt_core) {
	    core->hvm_state.is_hrt = 1;
	}
    }
    return 0;
}

int v3_deinit_hvm_core(struct guest_info *core)
{
    PrintDebug(core->vm_info, VCORE_NONE, "hvm: HVM core deinit\n");

    return 0;
}
uint64_t v3_get_hvm_ros_memsize(struct v3_vm_info *vm)
{
    if (vm->hvm_state.is_hvm) {
	return vm->hvm_state.first_hrt_gpa;
    }
    return vm->mem_size;
}

uint64_t v3_get_hvm_hrt_memsize(struct v3_vm_info *vm)
{
    return vm->mem_size;
}

uint32_t v3_get_hvm_ros_cores(struct v3_vm_info *vm)
{
    if (vm->hvm_state.is_hvm) {
	return vm->hvm_state.first_hrt_core;
    }
    return vm->num_cores;
}

uint32_t v3_get_hvm_hrt_cores(struct v3_vm_info *vm)
{
    if (vm->hvm_state.is_hvm) {
	return vm->num_cores - vm->hvm_state.first_hrt_core;
    }
    return 0;
}
int v3_is_hvm_ros_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
{
    if (vm->hvm_state.is_hvm) {
	return gpa < vm->hvm_state.first_hrt_gpa;
    }
    return 1;
}

int v3_is_hvm_hrt_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
{
    if (vm->hvm_state.is_hvm) {
	return gpa >= vm->hvm_state.first_hrt_gpa && gpa < vm->mem_size;
    }
    return 0;
}

int v3_is_hvm_hrt_core(struct guest_info *core)
{
    return core->hvm_state.is_hrt;
}

int v3_is_hvm_ros_core(struct guest_info *core)
{
    return !core->hvm_state.is_hrt;
}

int v3_hvm_should_deliver_ipi(struct guest_info *src, struct guest_info *dest)
{
    if (!src) {
	// ioapic or msi to apic
	return !dest->hvm_state.is_hrt;
    }

    return src->hvm_state.is_hrt || (!src->hvm_state.is_hrt && !dest->hvm_state.is_hrt);
}
void v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_info *vm,
				    uint32_t *start_apic, uint32_t *num_apics)
{
    if (!core) {
	// Seen from ioapic, msi, etc:
	if (vm->hvm_state.is_hvm) {
	    // HVM VM shows only the ROS cores/apics to ioapic, msi, etc
	    *start_apic = 0;
	    *num_apics = vm->hvm_state.first_hrt_core;
	} else {
	    // Non-HVM shows all cores/APICs to apic, msi, etc.
	    *start_apic = 0;
	    *num_apics = vm->num_cores;
	}
    } else {
	if (core->hvm_state.is_hrt) {
	    // HRT core/apic sees all apics
	    // (this policy may change...)
	    *start_apic = 0;
	    *num_apics = vm->num_cores;
	} else {
	    // non-HRT core/apic sees only non-HRT cores/apics
	    *start_apic = 0;
	    *num_apics = vm->hvm_state.first_hrt_core;
	}
    }
}
#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)<(y)?(x):(y))

static uint64_t boot_state_end_addr(struct v3_vm_info *vm)
{
    return PAGE_ADDR(vm->mem_size);
}

static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - PAGE_SIZE);
    *limit = PAGE_SIZE;
}
extern v3_cpu_arch_t v3_mach_type;

extern void *v3_hvm_svm_null_int_handler_start;
extern void *v3_hvm_svm_null_int_handler_end;
extern void *v3_hvm_vmx_null_int_handler_start;
extern void *v3_hvm_vmx_null_int_handler_end;

static void write_null_int_handler(struct v3_vm_info *vm)
{
    void *base;
    uint64_t limit;
    void *data;
    uint64_t len;

    get_null_int_handler_loc(vm, &base, &limit);

    switch (v3_mach_type) {
#ifdef V3_CONFIG_SVM
	case V3_SVM_CPU:
	case V3_SVM_REV3_CPU:
	    data = (void*) &v3_hvm_svm_null_int_handler_start;
	    len  = (void*) &v3_hvm_svm_null_int_handler_end - data;
	    break;
#endif
#ifdef V3_CONFIG_VMX
	case V3_VMX_CPU:
	case V3_VMX_EPT_CPU:
	case V3_VMX_EPT_UG_CPU:
	    data = (void*) &v3_hvm_vmx_null_int_handler_start;
	    len  = (void*) &v3_hvm_vmx_null_int_handler_end - data;
	    break;
#endif
	default:
	    PrintError(vm, VCORE_NONE, "hvm: cannot determine CPU type to select null interrupt handler...\n");
	    data = 0;
	    len  = 0;
	    break;
    }

    v3_write_gpa_memory(&vm->cores[0], (addr_t)(base), len, (uint8_t*)data);

    PrintDebug(vm, VCORE_NONE, "hvm: wrote null interrupt handler at %p (%llu bytes)\n", base, len);
}
static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - 2 * PAGE_SIZE);
    *limit = 16 * 256;  // 256 sixteen-byte gates
}

// default IDT entries (int and trap gates)
//
// Format is 16 bytes long:
//
// 16 offsetlo   => 0
// 16 selector   => (target code selector) => 0x8 // entry 1 of GDT
//  3 ist        => (stack) = 0 => current stack
//  5 reserved   => 0
//  4 type       => 0xe=>INT, 0xf=>TRAP
//  1 reserved   => 0 (indicates "system" by being zero)
//  2 dpl        => 0
//  1 present    => 1
// 16 offsetmid  => 0
// 32 offsethigh => 0 (total is a 64 bit offset)
// 32 reserved   => 0
//
// 00 00 | 08 00 | 00 | 8[typenybble] | offsetmid | offsethigh | reserved
//
// Note little endian

static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 };
static uint64_t idt64_int_gate_entry_mask[2]  = { 0x00008e0000080000, 0x0 };
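// Worked example (a sketch): for a handler at GVA 0x100800, the fixups in
// write_idt() below place 0x0800 in bytes 0-1 (offset low), 0x0010 in bytes
// 6-7 (offset mid), and 0 in bytes 8-11 (offset high), so the interrupt
// gate becomes { 0x00108e0000080800, 0x0000000000000000 }.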
static void write_idt(struct v3_vm_info *vm)
{
    void *base;
    uint64_t limit;
    void *handler;
    uint64_t handler_len;
    int i;
    uint64_t trap_gate[2];
    uint64_t int_gate[2];

    get_idt_loc(vm, &base, &limit);

    get_null_int_handler_loc(vm, &handler, &handler_len);

    handler += vm->hvm_state.gva_offset;

    memcpy(trap_gate, idt64_trap_gate_entry_mask, 16);
    memcpy(int_gate,  idt64_int_gate_entry_mask,  16);

    // update the entries for the handler location
    uint8_t *mask;
    uint8_t *hand;

    hand = (uint8_t*) &handler;

    mask = (uint8_t *)trap_gate;
    memcpy(&(mask[0]), &(hand[0]), 2); // offset low
    memcpy(&(mask[6]), &(hand[2]), 2); // offset med
    memcpy(&(mask[8]), &(hand[4]), 4); // offset high

    mask = (uint8_t *)int_gate;
    memcpy(&(mask[0]), &(hand[0]), 2); // offset low
    memcpy(&(mask[6]), &(hand[2]), 2); // offset med
    memcpy(&(mask[8]), &(hand[4]), 4); // offset high

    PrintDebug(vm, VCORE_NONE, "hvm: Adding default null trap and int gates\n");

    for (i = 0; i < 32; i++) {
	v3_write_gpa_memory(&vm->cores[0], (addr_t)(base + i*16), 16, (uint8_t*)trap_gate);
    }

    for (i = 32; i < 256; i++) {
	v3_write_gpa_memory(&vm->cores[0], (addr_t)(base + i*16), 16, (uint8_t*)int_gate);
    }

    PrintDebug(vm, VCORE_NONE, "hvm: wrote IDT at %p\n", base);
}
static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 3 * PAGE_SIZE);
    *limit = 24;  // three eight-byte descriptors
}

static uint64_t gdt64[3] = {
    0x0000000000000000, /* null */
    0x00a09a0000000000, /* code (note lme bit) */
    0x00a0920000000000, /* data (most entries don't matter) */
};

static void write_gdt(struct v3_vm_info *vm)
{
    void *base;
    uint64_t limit;

    get_gdt_loc(vm, &base, &limit);
    v3_write_gpa_memory(&vm->cores[0], (addr_t)base, limit, (uint8_t*) gdt64);

    PrintDebug(vm, VCORE_NONE, "hvm: wrote GDT at %p\n", base);
}

static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 4 * PAGE_SIZE);
    *limit = PAGE_SIZE;
}

static void write_tss(struct v3_vm_info *vm)
{
    void *base;
    uint64_t limit;

    get_tss_loc(vm, &base, &limit);

    v3_set_gpa_memory(&vm->cores[0], (addr_t)base, limit, 0);

    PrintDebug(vm, VCORE_NONE, "hvm: wrote TSS at %p\n", base);
}
#define TOP_HALF_START  0xffff800000000000ULL
#define BOTTOM_HALF_END 0x00007fffffffffffULL

#define L4_UNIT PAGE_SIZE
#define L3_UNIT (512ULL * L4_UNIT)
#define L2_UNIT (512ULL * L3_UNIT)
#define L1_UNIT (512ULL * L2_UNIT)

static void compute_pts_4KB(struct v3_vm_info *vm,
			    uint64_t *l1, uint64_t *l2, uint64_t *l3, uint64_t *l4)
{
    // we map the physical memory up to max_mem_mapped either at 0x0 or at TOP_HALF start
    // that is, it either fills in the first 256 rows of PML4 or the last 256 rows of PML4
    // so it is the same number of page tables regardless

    uint64_t max_gva = vm->hvm_state.max_mem_mapped;

    *l1 = 1;  // a single PML4
    *l2 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*512ULL*4096ULL),512);
    *l3 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*4096ULL),512);
    *l4 = CEIL_DIV(CEIL_DIV(max_gva,4096ULL),512);
}
/*
  PTS MAP using 1 GB pages
  n second level pts, highest gva, highest address

  PTS MAP using 2 MB pages
  n third level pts, highest gva, highest address
  m second level pts, highest gva, highest address

  PTS MAP using 4 KB pages
  n 4th level, highest gva, highest address
  m 3rd level, highest gva, highest address
  l second level, highest gva, highest address

  PTS MAP using 512 GB pages when this becomes available
*/
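/*
  Worked example (a sketch): for max_gva = 4 GB, compute_pts_4KB() gives
  l1=1, l2=1, l3=4, l4=2048, so the 1 GB page model needs l1+l2 = 2
  page-table pages, the 2 MB model needs l1+l2+l3 = 6, and the 4 KB model
  needs l1+l2+l3+l4 = 2054.
*/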
static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    uint64_t l1, l2, l3, l4;
    uint64_t num_pt;

    compute_pts_4KB(vm, &l1, &l2, &l3, &l4);

    if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
	num_pt = l1;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
	num_pt = l1 + l2;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
	num_pt = l1 + l2 + l3;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) {
	num_pt = l1 + l2 + l3 + l4;
    } else {
	PrintError(vm, VCORE_NONE, "hvm: Cannot determine PT location flags=0x%llx memsize=0x%llx\n", vm->hvm_state.hrt_flags, (uint64_t)vm->mem_size);
	return;
    }

    *base  = (void*)PAGE_ADDR(boot_state_end_addr(vm) - (4+num_pt)*PAGE_SIZE);
    *limit = num_pt*PAGE_SIZE;
}
static void write_pts(struct v3_vm_info *vm)
{
    uint64_t size;
    uint64_t num_l1, num_l2, num_l3, num_l4;
    void *start_l1, *start_l2, *start_l3, *start_l4;
    uint64_t max_level;
    void *cur_pt;
    void *cur_gva;
    void *cur_gpa;
    void *min_gpa = 0;
    void *max_gpa = (void*) vm->hvm_state.max_mem_mapped;
    void *min_gva = (void*) vm->hvm_state.gva_offset;
#ifdef V3_CONFIG_DEBUG_HVM
    void *max_gva = min_gva + vm->hvm_state.max_mem_mapped;
#endif
    uint64_t i, pt;
    uint64_t i_start, i_end;

    struct pml4e64 *pml4e;
    struct pdpe64 *pdpe;
    struct pde64 *pde;
    struct pte64 *pte;

    if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
	PrintError(vm, VCORE_NONE, "hvm: Attempt to build 512 GB pages\n");
	return;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
	max_level = 2;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
	max_level = 3;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) {
	max_level = 4;
    } else {
	PrintError(vm, VCORE_NONE, "hvm: Cannot determine PT levels\n");
	return;
    }

    get_pt_loc(vm, &start_l1, &size);
    compute_pts_4KB(vm, &num_l1, &num_l2, &num_l3, &num_l4);

    start_l2 = start_l1 + PAGE_SIZE*num_l1;
    start_l3 = start_l2 + PAGE_SIZE*num_l2;
    start_l4 = start_l3 + PAGE_SIZE*num_l3;

    PrintDebug(vm, VCORE_NONE, "hvm: writing %llu levels of PTs starting at address %p\n", max_level, start_l1);
    PrintDebug(vm, VCORE_NONE, "hvm: min_gva=%p, max_gva=%p, min_gpa=%p, max_gpa=%p\n", min_gva, max_gva, min_gpa, max_gpa);
    PrintDebug(vm, VCORE_NONE, "hvm: num_l1=%llu, num_l2=%llu, num_l3=%llu, num_l4=%llu\n", num_l1, num_l2, num_l3, num_l4);
    PrintDebug(vm, VCORE_NONE, "hvm: start_l1=%p, start_l2=%p, start_l3=%p, start_l4=%p\n", start_l1, start_l2, start_l3, start_l4);
    // build PML4 (only one)
    cur_pt = start_l1;

    if (v3_gpa_to_hva(&vm->cores[0], (addr_t)cur_pt, (addr_t*)&pml4e)) {
	PrintError(vm, VCORE_NONE, "hvm: Cannot translate pml4 location\n");
	return;
    }

    memset(pml4e, 0, PAGE_SIZE);

    if (min_gva == 0x0) {
	i_start = 0;   i_end = num_l2;
    } else if (min_gva == (void*)TOP_HALF_START) {
	i_start = 256; i_end = 256 + num_l2;
    } else {
	PrintError(vm, VCORE_NONE, "hvm: unsupported gva offset\n");
	return;
    }

    for (i = i_start, cur_gva = min_gva, cur_gpa = min_gpa;
	 i < i_end && cur_gpa < max_gpa;
	 i++, cur_gva += L1_UNIT, cur_gpa += L1_UNIT) {

	pml4e[i].present = 1;
	pml4e[i].writable = 1;

	if (max_level == 1) {
	    PrintError(vm, VCORE_NONE, "hvm: Intel has not yet defined a PML4E large page\n");
	    pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
	    //PrintDebug(vm, VCORE_NONE, "hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
	} else {
	    pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(start_l2 + (i - i_start)*PAGE_SIZE));
	    //PrintDebug(vm, VCORE_NONE, "hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
	}
    }
    // build PDPEs
    for (cur_pt = start_l2, pt = 0, cur_gpa = min_gpa, cur_gva = min_gva;
	 pt < num_l2;
	 cur_pt += PAGE_SIZE, pt++) {

	if (v3_gpa_to_hva(&vm->cores[0], (addr_t)cur_pt, (addr_t*)&pdpe)) {
	    PrintError(vm, VCORE_NONE, "hvm: Cannot translate pdpe location\n");
	    return;
	}

	memset(pdpe, 0, PAGE_SIZE);

	for (i = 0;
	     i < 512 && cur_gpa < max_gpa;
	     i++, cur_gva += L2_UNIT, cur_gpa += L2_UNIT) {

	    pdpe[i].present = 1;
	    pdpe[i].writable = 1;

	    if (max_level == 2) {
		pdpe[i].large_page = 1;
		pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
		//PrintDebug(vm, VCORE_NONE, "hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
	    } else {
		pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(start_l3 + (pt*512 + i)*PAGE_SIZE));
		//PrintDebug(vm, VCORE_NONE, "hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
	    }
	}
    }
    // build PDEs
    for (cur_pt = start_l3, pt = 0, cur_gpa = min_gpa, cur_gva = min_gva;
	 pt < num_l3;
	 cur_pt += PAGE_SIZE, pt++) {

	if (v3_gpa_to_hva(&vm->cores[0], (addr_t)cur_pt, (addr_t*)&pde)) {
	    PrintError(vm, VCORE_NONE, "hvm: Cannot translate pde location\n");
	    return;
	}

	memset(pde, 0, PAGE_SIZE);

	for (i = 0;
	     i < 512 && cur_gpa < max_gpa;
	     i++, cur_gva += L3_UNIT, cur_gpa += L3_UNIT) {

	    pde[i].present = 1;
	    pde[i].writable = 1;

	    if (max_level == 3) {
		pde[i].large_page = 1;
		pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
		//PrintDebug(vm, VCORE_NONE, "hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t) pde[i].pt_base_addr);
	    } else {
		pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(start_l4 + (pt*512 + i)*PAGE_SIZE));
		//PrintDebug(vm, VCORE_NONE, "hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pde[i].pt_base_addr);
	    }
	}
    }
    // build PTEs
    for (cur_pt = start_l4, pt = 0, cur_gpa = min_gpa, cur_gva = min_gva;
	 pt < num_l4;
	 cur_pt += PAGE_SIZE, pt++) {

	if (v3_gpa_to_hva(&vm->cores[0], (addr_t)cur_pt, (addr_t*)&pte)) {
	    PrintError(vm, VCORE_NONE, "hvm: Cannot translate pte location\n");
	    return;
	}

	memset(pte, 0, PAGE_SIZE);

	for (i = 0;
	     i < 512 && cur_gpa < max_gpa;
	     i++, cur_gva += L4_UNIT, cur_gpa += L4_UNIT) {

	    pte[i].present = 1;
	    pte[i].writable = 1;
	    pte[i].page_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
	    //PrintDebug(vm, VCORE_NONE, "hvm: pte: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pte[i].page_base_addr);
	}
    }
}
static void get_mb_info_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    // the MB info page lives just below the lowest page table page
    get_pt_loc(vm, base, limit);
    *base -= PAGE_SIZE;
    *limit = PAGE_SIZE;
}
int v3_build_hrt_multiboot_tag(struct guest_info *core, mb_info_hrt_t *hrt)
{
    struct v3_vm_info *vm = core->vm_info;

    hrt->tag.type = MB_INFO_HRT_TAG;
    hrt->tag.size = sizeof(mb_info_hrt_t);

    hrt->total_num_apics = vm->num_cores;
    hrt->first_hrt_apic_id = vm->hvm_state.first_hrt_core;
    hrt->have_hrt_ioapic = 0;
    hrt->first_hrt_ioapic_entry = 0;

    hrt->cpu_freq_khz = V3_CPU_KHZ();

    hrt->hrt_flags = vm->hvm_state.hrt_flags;
    hrt->max_mem_mapped = vm->hvm_state.max_mem_mapped;
    hrt->first_hrt_gpa = vm->hvm_state.first_hrt_gpa;
    hrt->gva_offset = vm->hvm_state.gva_offset;
    hrt->comm_page_gpa = vm->hvm_state.comm_page_gpa;
    hrt->hrt_int_vector = vm->hvm_state.hrt_int_vector;

    return 0;
}
static void write_mb_info(struct v3_vm_info *vm)
{
    if (vm->hvm_state.hrt_type != HRT_MBOOT64) {
	PrintError(vm, VCORE_NONE, "hvm: Cannot handle this HRT type\n");
	return;
    } else {
	uint8_t buf[256];
	uint64_t size;
	void *base;
	uint64_t limit;

	get_mb_info_loc(vm, &base, &limit);

	if ((size = v3_build_multiboot_table(&vm->cores[vm->hvm_state.first_hrt_core], buf, 256)) == -1) {
	    PrintError(vm, VCORE_NONE, "hvm: Failed to build MB info\n");
	    return;
	}

	if (size > limit) {
	    PrintError(vm, VCORE_NONE, "hvm: MB info is too large\n");
	    return;
	}

	v3_write_gpa_memory(&vm->cores[vm->hvm_state.first_hrt_core],
			    (addr_t)base,
			    size,
			    buf);

	PrintDebug(vm, VCORE_NONE, "hvm: wrote MB info at %p\n", base);
    }
}
#define SCRATCH_STACK_SIZE 4096

static void get_hrt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    void *mb_base;
    uint64_t mb_limit;

    get_mb_info_loc(vm, &mb_base, &mb_limit);

    mb_base -= SCRATCH_STACK_SIZE * v3_get_hvm_hrt_cores(vm);

    *base = (void*)PAGE_ADDR(vm->hvm_state.first_hrt_gpa);

    if (mb_base < *base + PAGE_SIZE) {
	PrintError(vm, VCORE_NONE, "hvm: HRT stack collides with HRT\n");
    }

    *limit = mb_base - *base;
}
#define ERROR(fmt, args...) PrintError(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
#define INFO(fmt, args...)  PrintDebug(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)

#define ELF_MAGIC      0x464c457f
#define MB2_MAGIC      0xe85250d6

#define MB2_INFO_MAGIC 0x36d76289

static int is_elf(uint8_t *data, uint64_t size)
{
    if (*((uint32_t*)data) == ELF_MAGIC) {
	return 1;
    }
    return 0;
}

static mb_header_t *find_mb_header(uint8_t *data, uint64_t size)
{
    uint64_t limit = size > 32768 ? 32768 : size;
    uint64_t i;

    // Scan for the .boot magic cookie
    // must be in first 32K, assume 4 byte aligned
    for (i = 0; i < limit; i += 4) {
	if (*((uint32_t*)&data[i]) == MB2_MAGIC) {
	    INFO("Found multiboot header at offset 0x%llx\n", i);
	    return (mb_header_t *) &data[i];
	}
    }
    return 0;
}
static int configure_hrt(struct v3_vm_info *vm, mb_data_t *mb)
{
    struct v3_vm_hvm *h = &vm->hvm_state;
    uint64_t f        = mb->mb64_hrt->hrt_flags;
    uint64_t maxmap   = mb->mb64_hrt->max_mem_to_map;
    uint64_t gvaoff   = mb->mb64_hrt->gva_offset;
    uint64_t gvaentry = mb->mb64_hrt->gva_entry;
    uint64_t commgpa  = mb->mb64_hrt->comm_page_gpa;
    uint8_t  vec      = mb->mb64_hrt->hrt_int_vector;

    PrintDebug(vm, VCORE_NONE, "hvm: HRT request: flags=0x%llx max_map=0x%llx gva_off=%llx gva_entry=%llx comm_page=0x%llx vector=0x%x\n",
	       f, maxmap, gvaoff, gvaentry, commgpa, vec);

    if (maxmap < 0x100000000ULL) {
	PrintDebug(vm, VCORE_NONE, "hvm: revising request up to 4 GB max map\n");
	maxmap = 0x100000000ULL;
    }

    if (f & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
	PrintError(vm, VCORE_NONE, "hvm: support for 512 GB pages is not yet available in hardware\n");
	return -1;
    } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
	f |= MB_TAG_MB64_HRT_FLAG_MAP_1GB;
	h->max_mem_mapped = maxmap;
	PrintDebug(vm, VCORE_NONE, "hvm: 1 GB pages selected\n");
    } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
	f |= MB_TAG_MB64_HRT_FLAG_MAP_2MB;
	h->max_mem_mapped = maxmap;
	PrintDebug(vm, VCORE_NONE, "hvm: 2 MB pages selected\n");
    } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_4KB) {
	f |= MB_TAG_MB64_HRT_FLAG_MAP_4KB;
	h->max_mem_mapped = maxmap;
	PrintDebug(vm, VCORE_NONE, "hvm: 4 KB pages selected\n");
    } else {
	PrintError(vm, VCORE_NONE, "hvm: no page table model is requested\n");
	return -1;
    }

    h->hrt_flags = f;

    if (f & MB_TAG_MB64_HRT_FLAG_RELOC) {
	PrintError(vm, VCORE_NONE, "hvm: relocatable hrt not currently supported\n");
	return -1;
    }

    if (maxmap > h->max_mem_mapped) {
	PrintError(vm, VCORE_NONE, "hvm: requested 0x%llx bytes mapped, which is more than currently supported\n", maxmap);
	return -1;
    }

    if (gvaoff != 0 && gvaoff != TOP_HALF_START) {
	PrintError(vm, VCORE_NONE, "hvm: currently only GVA offsets of 0 and %llx are supported\n", TOP_HALF_START);
	return -1;
    }

    h->gva_offset = gvaoff;

    h->gva_entry = gvaentry;

    if (mb->addr->load_addr < h->first_hrt_gpa) {
	PrintError(vm, VCORE_NONE, "hvm: load start address of HRT is below first HRT GPA\n");
	return -1;
    }

    if (mb->addr->bss_end_addr > (vm->mem_size - (1024*1024*64))) {
	PrintError(vm, VCORE_NONE, "hvm: bss end address of HRT above last allowed GPA\n");
	return -1;
    }

    if (vec < 32) {
	PrintError(vm, VCORE_NONE, "hvm: cannot support vector %x\n", vec);
	return -1;
    }

    h->hrt_int_vector = vec;

    if (commgpa < vm->mem_size) {
	PrintError(vm, VCORE_NONE, "hvm: cannot map comm page over physical memory\n");
	return -1;
    }

    h->comm_page_gpa = commgpa;

    if (!h->comm_page_hpa) {
	if (!(h->comm_page_hpa = V3_AllocPages(1))) {
	    PrintError(vm, VCORE_NONE, "hvm: unable to allocate space for comm page\n");
	    return -1;
	}

	h->comm_page_hva = V3_VAddr(h->comm_page_hpa);

	memset(h->comm_page_hva, 0, PAGE_SIZE_4KB);

	if (v3_add_shadow_mem(vm, -1, h->comm_page_gpa, h->comm_page_gpa + PAGE_SIZE_4KB, (addr_t)h->comm_page_hpa)) {
	    PrintError(vm, VCORE_NONE, "hvm: unable to map communication page\n");
	    V3_FreePages((void*)(h->comm_page_hpa), 1);
	    return -1;
	}

	PrintDebug(vm, VCORE_NONE, "hvm: added comm page for first time\n");
    }

    memset(h->comm_page_hva, 0, PAGE_SIZE_4KB);

    PrintDebug(vm, VCORE_NONE, "hvm: HRT configuration: flags=0x%llx max_mem_mapped=0x%llx gva_offset=0x%llx gva_entry=0x%llx comm_page=0x%llx vector=0x%x\n",
	       h->hrt_flags, h->max_mem_mapped, h->gva_offset, h->gva_entry, h->comm_page_gpa, h->hrt_int_vector);

    return 0;
}
static int setup_mb_kernel_hrt(struct v3_vm_info *vm, void *data, uint64_t size)
{
    mb_data_t mb;

    if (v3_parse_multiboot_header(data, size, &mb)) {
	PrintError(vm, VCORE_NONE, "hvm: failed to parse multiboot kernel header\n");
	return -1;
    }

    if (!mb.mb64_hrt) {
	PrintError(vm, VCORE_NONE, "hvm: invalid HRT - there is no MB64_HRT tag\n");
	return -1;
    }

    if (configure_hrt(vm, &mb)) {
	PrintError(vm, VCORE_NONE, "hvm: cannot configure HRT\n");
	return -1;
    }

    if (v3_write_multiboot_kernel(vm, &mb, data, size,
				  (void*)vm->hvm_state.first_hrt_gpa,
				  vm->mem_size - vm->hvm_state.first_hrt_gpa)) {
	PrintError(vm, VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n");
	return -1;
    }

    if (vm->hvm_state.gva_entry) {
	vm->hvm_state.hrt_entry_addr = vm->hvm_state.gva_entry;
    } else {
	vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + vm->hvm_state.gva_offset;
    }

    vm->hvm_state.hrt_type = HRT_MBOOT64;

    return 0;
}
static int setup_hrt(struct v3_vm_info *vm)
{
    void *data;
    uint64_t size;

    // If the ROS has installed an image, it takes priority
    if (vm->hvm_state.hrt_image) {
	data = vm->hvm_state.hrt_image;
	size = vm->hvm_state.hrt_image_size;
    } else {
	data = vm->hvm_state.hrt_file->data;
	size = vm->hvm_state.hrt_file->size;
    }

    if (is_elf(data, size) &&
	find_mb_header(data, size)) {

	PrintDebug(vm, VCORE_NONE, "hvm: appears to be a multiboot kernel\n");
	if (setup_mb_kernel_hrt(vm, data, size)) {
	    PrintError(vm, VCORE_NONE, "hvm: multiboot kernel setup failed\n");
	    return -1;
	}
    } else {
	PrintError(vm, VCORE_NONE, "hvm: supplied HRT is not a multiboot kernel\n");
	return -1;
    }

    return 0;
}
/*
  We do not touch the ROS portion of the address space.
  The HRT portion looks like:

  INT_HANDLER (1 page - page aligned)
  IDT         (1 page - page aligned)
  GDT         (1 page - page aligned)
  TSS         (1 page - page aligned)
  PAGETABLES  (identity map of first N GB)
     ROOT PT first (lowest memory addr), followed by 2nd level PTs in order,
     followed by 3rd level PTs in order, followed by 4th level
  MB INFO PAGE
  SCRATCH_STACK_HRT_CORE0
  SCRATCH_STACK_HRT_CORE1
  ..
  SCRATCH_STACK_HRT_COREN

  HRT (as many pages as needed, page-aligned, starting at first HRT address)
*/
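/*
  As a concrete sketch (assuming a 512 MB guest, so boot_state_end_addr()
  returns 0x20000000): the null interrupt handler page sits at 0x1ffff000,
  the IDT at 0x1fffe000, the GDT at 0x1fffd000, and the TSS at 0x1fffc000,
  with the page tables in the num_pt pages ending at 0x1fffc000, and the
  MB info page and per-core scratch stacks below them (see get_mb_info_loc()
  and get_hrt_loc()).
*/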
int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm)
{
    if (!vm->hvm_state.is_hvm) {
	PrintDebug(vm, VCORE_NONE, "hvm: skipping HVM setup for boot as this is not an HVM\n");
	return 0;
    }

    PrintDebug(vm, VCORE_NONE, "hvm: setup of HVM memory begins\n");

    if (setup_hrt(vm)) {
	PrintError(vm, VCORE_NONE, "hvm: failed to setup HRT\n");
	return -1;
    }

    // the locations of all the other items are determined by
    // the HRT setup, so these must happen after

    write_null_int_handler(vm);
    write_idt(vm);
    write_gdt(vm);
    write_tss(vm);

    write_pts(vm);

    // this must happen last
    write_mb_info(vm);

    PrintDebug(vm, VCORE_NONE, "hvm: setup of HVM memory done\n");

    return 0;
}
/*
  On entry for every core:

  IDTR points to stub IDT
  GDTR points to stub GDT
  TR   points to stub TSS
  CR3 points to root page table
  CR0 has PE, PG, and WP
  EFER has LME and LMA (and NX for compatibility with Linux)
  RSP is TOS of core's scratch stack (looks like a call)

  RAX = MB magic cookie
  RBX = address of multiboot info table
  RCX = this core id / apic id (0..N-1)
  RDX = this core id - first HRT core ID (==0 for the first HRT core)

  All addresses are virtual addresses, offset as needed by gva_offset

  Other regs are zeroed

  shadow/nested paging state reset for long mode
*/
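/*
  A sketch of a matching HRT entry stub (illustrative only; the real entry
  point comes from the multiboot kernel image named in the config):

      _start:                        // rip points here (hrt_entry_addr)
          cmpl $0x36d76289, %eax     // MB2_INFO_MAGIC handed over in rax
          jne  .halt
          movq %rbx, %rdi            // multiboot info table (GVA)
          movq %rdx, %rsi            // HRT-relative core id (0 on first HRT core)
          callq hrt_main             // scratch stack is already usable
      .halt:
          hlt
          jmp .halt
*/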
int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
{
    void *base;
    uint64_t limit;
    uint64_t gva_offset;

    rdtscll(core->hvm_state.last_boot_start);

    if (!core->hvm_state.is_hrt) {
	PrintDebug(core->vm_info, core, "hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id);
	return 0;
    }

    PrintDebug(core->vm_info, core, "hvm: setting up HRT core (%u) for boot\n", core->vcpu_id);

    gva_offset = core->vm_info->hvm_state.gva_offset;

    memset(&core->vm_regs, 0, sizeof(core->vm_regs));
    memset(&core->ctrl_regs, 0, sizeof(core->ctrl_regs));
    memset(&core->dbg_regs, 0, sizeof(core->dbg_regs));
    memset(&core->segments, 0, sizeof(core->segments));
    memset(&core->msrs, 0, sizeof(core->msrs));
    memset(&core->fp_state, 0, sizeof(core->fp_state));

    // We are in long mode with virtual memory and we want
    // to start immediately
    core->cpl = 0; // we are going right into the kernel
    core->cpu_mode = LONG;
    core->mem_mode = VIRTUAL_MEM;
    core->core_run_state = CORE_RUNNING;

    // MB magic cookie
    core->vm_regs.rax = MB2_INFO_MAGIC;

    // multiboot info pointer
    get_mb_info_loc(core->vm_info, &base, &limit);
    core->vm_regs.rbx = (uint64_t) base + gva_offset;

    // core id / apic id
    core->vm_regs.rcx = core->vcpu_id;

    // HRT-relative core id
    core->vm_regs.rdx = core->vcpu_id - core->vm_info->hvm_state.first_hrt_core;
    // Now point to scratch stack for this core
    // it begins at an offset relative to the MB info page
    get_mb_info_loc(core->vm_info, &base, &limit);
    base = base + gva_offset;
    base -= core->vm_regs.rdx * SCRATCH_STACK_SIZE;
    core->vm_regs.rsp = (v3_reg_t) base;
    core->vm_regs.rbp = (v3_reg_t) base - 8;

    // push onto the stack a bad rbp and bad return address
    core->vm_regs.rsp -= 16;
    v3_set_gpa_memory(core,
		      core->vm_regs.rsp - gva_offset,
		      16,
		      0xff);

    // rip
    get_hrt_loc(core->vm_info, &base, &limit);
    if (core->vm_info->hvm_state.gva_entry) {
	core->rip = core->vm_info->hvm_state.gva_entry;
    } else {
	core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr + gva_offset;
    }

    PrintDebug(core->vm_info, core, "hvm: hrt core %u has rip=%p, rsp=%p, rbp=%p, rax=%p, rbx=%p, rcx=%p, rdx=%p\n",
	       (core->vcpu_id - core->vm_info->hvm_state.first_hrt_core),
	       (void*)(core->rip),
	       (void*)(core->vm_regs.rsp),
	       (void*)(core->vm_regs.rbp),
	       (void*)(core->vm_regs.rax),
	       (void*)(core->vm_regs.rbx),
	       (void*)(core->vm_regs.rcx),
	       (void*)(core->vm_regs.rdx));
    // Setup CRs for long mode and our stub page table
    // CR0: PG, PE, and WP for catching COW faults in kernel-mode (which is not default behavior)
    core->ctrl_regs.cr0 = 0x80010001;
    core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0;

    // CR2: don't care (output from #PF)
    // CR3: set to our PML4E, without setting PCD or PWT
    get_pt_loc(core->vm_info, &base, &limit);
    core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base); // not offset as this is a GPA
    core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3;

    // CR4: PGE, PAE, PSE (last byte: 1 0 1 1 0 0 0 0)
    core->ctrl_regs.cr4 = 0xb0;
    core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4;

    // RFLAGS zeroed is fine: come in with interrupts off
    // EFER needs SVME, NXE, LMA, LME (last 16 bits: 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0)
    core->ctrl_regs.efer = 0x1d00;
    core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer;
    /*
      selector is 13 bits of index, 1 bit table indicator
      (0=>GDT), 2 bits RPL

      index is scaled by 8, even in long mode, where some entries
      are 16 bytes long....
         -> code, data descriptors have 8 byte format
            because base, limit, etc, are ignored (no segmentation)
         -> interrupt/trap gates have 16 byte format
            because offset needs to be 64 bits
    */
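    // For example, selector 0x8 below is index 1, table indicator 0 (GDT),
    // RPL 0 - the code descriptor at gdt64[1]; selector 0x10 is index 2,
    // the data descriptor at gdt64[2].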
    // Install our stub IDT
    get_idt_loc(core->vm_info, &base, &limit);
    base += gva_offset;
    core->segments.idtr.selector = 0;  // entry 0 (NULL) of the GDT
    core->segments.idtr.base = (addr_t) base;  // only base+limit are used
    core->segments.idtr.limit = limit - 1;
    core->segments.idtr.type = 0x0;
    core->segments.idtr.system = 0;
    core->segments.idtr.dpl = 0;
    core->segments.idtr.present = 0;
    core->segments.idtr.long_mode = 0;

    // Install our stub GDT
    get_gdt_loc(core->vm_info, &base, &limit);
    base += gva_offset;
    core->segments.gdtr.selector = 0;  // entry 0 (NULL) of the GDT
    core->segments.gdtr.base = (addr_t) base;
    core->segments.gdtr.limit = limit - 1;  // only base+limit are used
    core->segments.gdtr.type = 0x0;
    core->segments.gdtr.system = 0;
    core->segments.gdtr.dpl = 0;
    core->segments.gdtr.present = 0;
    core->segments.gdtr.long_mode = 0;

    // And our stub TSS
    get_tss_loc(core->vm_info, &base, &limit);
    base += gva_offset;
    core->segments.tr.selector = 0;
    core->segments.tr.base = (addr_t) base;
    core->segments.tr.limit = limit - 1;
    core->segments.tr.type = 0x9;
    core->segments.tr.system = 0;  // available 64 bit TSS
    core->segments.tr.dpl = 0;
    core->segments.tr.present = 1;
    core->segments.tr.long_mode = 0;  // not used

    base = 0x0; // these are not offset as we want to make all gvas visible
    limit = -1; // not used

    // CS
    core->segments.cs.selector = 0x8;   // entry 1 of GDT (RPL=0)
    core->segments.cs.base = (addr_t) base;  // not used
    core->segments.cs.limit = limit;         // not used
    core->segments.cs.type = 0xe;       // only C is used
    core->segments.cs.system = 1;       // not a system segment
    core->segments.cs.dpl = 0;
    core->segments.cs.present = 1;
    core->segments.cs.long_mode = 1;

    // DS, SS, etc are identical
    core->segments.ds.selector = 0x10;  // entry 2 of GDT (RPL=0)
    core->segments.ds.base = (addr_t) base;
    core->segments.ds.limit = limit;
    core->segments.ds.type = 0x6;       // ignored
    core->segments.ds.system = 1;       // not a system segment
    core->segments.ds.dpl = 0;
    core->segments.ds.present = 1;
    core->segments.ds.long_mode = 1;

    memcpy(&core->segments.ss, &core->segments.ds, sizeof(core->segments.ds));
    memcpy(&core->segments.es, &core->segments.ds, sizeof(core->segments.ds));
    memcpy(&core->segments.fs, &core->segments.ds, sizeof(core->segments.ds));
    memcpy(&core->segments.gs, &core->segments.ds, sizeof(core->segments.ds));

    // reset paging here for shadow...

    if (core->shdw_pg_mode != NESTED_PAGING) {
	PrintError(core->vm_info, core, "hvm: shadow paging guest... this will end badly\n");
	return -1;
    }

    return 0;
}
int v3_handle_hvm_reset(struct guest_info *core)
{
    if (core->core_run_state != CORE_RESETTING) {
	return 0;
    }

    if (!core->vm_info->hvm_state.is_hvm) {
	return 0;
    }

    if (v3_is_hvm_hrt_core(core)) {
	// this is an HRT reset
	int rc = 0;

	// wait for all the HRT cores
	v3_counting_barrier(&core->vm_info->reset_barrier);

	if (core->vcpu_id == core->vm_info->hvm_state.first_hrt_core) {
	    // I am the leader
	    core->vm_info->run_state = VM_RESETTING;
	}

	core->core_run_state = CORE_RESETTING;

	if (core->vcpu_id == core->vm_info->hvm_state.first_hrt_core) {
	    // we really only need to clear the bss
	    // and recopy the .data, but for now we'll just
	    // do everything
	    rc |= v3_setup_hvm_vm_for_boot(core->vm_info);

	    if (rc) {
		PrintError(core->vm_info, core, "hvm: failed to setup HVM VM for boot rc=%d\n", rc);
	    }
	}

	// now everyone is ready to reset
	rc |= v3_setup_hvm_hrt_core_for_boot(core);

	if (rc) {
	    PrintError(core->vm_info, core, "hvm: failed to setup HVM core for boot rc=%d\n", rc);
	}

	core->core_run_state = CORE_RUNNING;

	if (core->vcpu_id == core->vm_info->hvm_state.first_hrt_core) {
	    core->vm_info->run_state = VM_RUNNING;
	    core->vm_info->hvm_state.trans_state = HRT_IDLE;
	}

	v3_counting_barrier(&core->vm_info->reset_barrier);

	if (rc) {
	    PrintError(core->vm_info, core, "hvm: reset failed\n");
	    return -1;
	}

	return 1;

    } else {
	// ROS core will be handled by normal reset functionality
	return 0;
    }
}
int v3_handle_hvm_entry(struct guest_info *core)
{
    if (!core->vm_info->hvm_state.is_hvm               // not relevant to non-HVM
	|| core->hvm_state.is_hrt                      // not relevant to an HRT in an HVM
	|| !core->vm_info->hvm_state.ros_signal.code) { // not relevant if there is no code to inject

	// Note that above check for code could race with a writer, but
	// if that happens, we'll simply inject at the next opportunity instead of
	// this one (see below for atomic update)
	return 0;
    }

    struct v3_ros_signal *s = &core->vm_info->hvm_state.ros_signal;
    uint64_t code;

    if (! (s->handler &&                    // handler installed
	   s->cr3 &&                        // process installed
	   s->stack &&                      // stack installed
	   core->cpl == 3 &&                // user mode
	   core->ctrl_regs.cr3 == s->cr3)   // right process active
	) {
	// Cannot inject at this time
	return 0;
    }

    // We can inject now, let's atomically see if we have something
    // and commit to doing it if we do

    // Get code, reset to allow next one
    code = __sync_fetch_and_and(&(s->code), 0);

    if (!code) {
	// nothing to do after all
	return 0;
    }

    // actually do inject
    uint64_t frame[6];
    uint64_t rsp;

    PrintDebug(core->vm_info, core, "hvm: ROS interrupt starting with rip=%p rsp=%p\n", (void*) core->rip, (void*) core->vm_regs.rsp);

    // build interrupt frame
    frame[0] = code;
    frame[1] = core->rip;
    frame[2] = core->segments.cs.selector; // return cs
    frame[3] = core->ctrl_regs.rflags;
    frame[4] = core->vm_regs.rsp;
    frame[5] = core->segments.ss.selector; // return ss

    rsp = (s->stack - 16) & (~0xf); // We should be 16 byte aligned to start
    rsp -= sizeof(frame);

    if (v3_write_gva_memory(core, (addr_t)rsp, sizeof(frame), (uint8_t*)frame) != sizeof(frame)) {
	PrintError(core->vm_info, core, "hvm: failed to write interrupt frame\n");
	// we just lost this inject
	return 0;
    }

    // now make us look like we are jumping to the entry
    core->rip = s->handler;
    core->vm_regs.rsp = rsp;

    PrintDebug(core->vm_info, core, "hvm: ROS frame is 0x%llx|0x%llx|0x%llx|0x%llx|0x%llx|0x%llx and on entry rip=%p and rsp=%p\n", frame[0], frame[1], frame[2], frame[3], frame[4], frame[5], (void*) core->rip, (void*) core->vm_regs.rsp);

    // and we should be good to go
    return 0;
}
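/*
  A sketch of the contract the installed user-level handler sees
  (illustrative): v3_handle_hvm_entry() transfers to s->handler with rsp
  pointing at the frame built above; frame[1..5] mirror a hardware
  interrupt frame (return rip, cs, rflags, rsp, ss), so after consuming
  the leading slot the handler can resume the interrupted code with iretq.
*/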
int v3_handle_hvm_exit(struct guest_info *core)
{
    // currently nothing
    return 0;
}

int v3_hvm_signal_ros(struct v3_vm_info *vm, uint64_t code)
{
    struct v3_ros_signal *s = &vm->hvm_state.ros_signal;

    if (!code) {
	PrintError(vm, VCORE_NONE, "hvm: cannot signal ros with code zero\n");
	return -1;
    }

    // handler, etc, must exist
    if (!s->handler || !s->stack) {
	PrintError(vm, VCORE_NONE, "hvm: cannot signal ros with no installed handler\n");
	return -1;
    }

    // we set the code only if we are idle (code 0),
    // and we do so atomically
    if (!__sync_bool_compare_and_swap(&(s->code), 0, code)) {
	PrintError(vm, VCORE_NONE, "hvm: signal was already asserted\n");
	return -1;
    }

    PrintDebug(vm, VCORE_NONE, "hvm: raised signal 0x%llx to the ROS\n", code);

    return 0;
}