2 * This file is part of the Palacios Virtual Machine Monitor developed
3 * by the V3VEE Project with funding from the United States National
4 * Science Foundation and the Department of Energy.
6 * The V3VEE Project is a joint project between Northwestern University
7 * and the University of New Mexico. You can find out more at
10 * Copyright (c) 2015, The V3VEE Project <http://www.v3vee.org>
11 * All rights reserved.
13 * Author: Peter Dinda <pdinda@northwestern.edu>
15 * This is free software. You are permitted to use,
16 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
19 #include <palacios/vmm_mem.h>
20 #include <palacios/vmm.h>
21 #include <palacios/vmm_util.h>
22 #include <palacios/vmm_emulator.h>
23 #include <palacios/vm_guest.h>
24 #include <palacios/vmm_debug.h>
25 #include <palacios/vmm_hypercall.h>
27 #include <palacios/vmm_xml.h>
29 #include <palacios/vm_guest_mem.h>
31 #include <palacios/vmm_debug.h>
36 MEM = Total size of memory in the GPA (in MB)
37 ROS_MEM = Total size of memory for the ROS (in MB) (<RAM)
39 GPAs [0,ROS_MEM) are what the ROS sees
40 GPAs [ROS_MEM, MEM) are HRT only
41 GPAS [0,MEM) are accessible by the HRT
43 CORES = Total number of cores in VM
ROS_CORES = Total number of cores for the ROS
46 Cores [0,ROS_CORES) are what the ROS sees
47 Cores [ROS_CORES,CORES) are HRT only
48 Cores [0,CORES) are accessible by the HRT
53 <file id="hrtelf" filename="hrtelf.o" />
56 <mem ... >RAM</mem> (MB) Note these are
57 <cores count="CORES" ...> backward compatible
60 <ros cores="ROS_CORES" mem="ROS_MEM" /> (MB)
<hrt file_id="hrtelf" />
66 #ifndef V3_CONFIG_DEBUG_HVM
68 #define PrintDebug(fmt, args...)
74 PrintDebug(VM_NONE,VCORE_NONE, "hvm: init\n");
80 PrintDebug(VM_NONE,VCORE_NONE, "hvm: deinit\n");
84 // ignore requests from when we are in the wrong state
85 #define ENFORCE_STATE_MACHINE 1
87 // invoke the HRT using a page fault instead of
88 // the SWINTR mechanism
89 #define USE_UPCALL_MAGIC_PF 1
90 #define UPCALL_MAGIC_ADDRESS 0x0000800df00df00dULL
91 #define UPCALL_MAGIC_ERROR 0xf00df00d
94 64 bit only hypercall:
96 rax = hypercall number
98 then args are: rcx, rdx, rsi, rdi r8, r9, r10, r11
/*
 * Central hypercall dispatcher for an HVM (hybrid ROS+HRT) VM.
 *
 * ABI (64-bit only): rax = hypercall number, rbx = bitness cookie,
 * then rcx (a1) selects the sub-operation and rdx (a2) carries its argument.
 * Results are returned in rax (0 = success, -1 = failure).
 *
 * NOTE(review): this excerpt is gappy — the switch header, break statements,
 * and many closing braces are elided; comments below describe only what the
 * visible lines establish.
 */
static int hvm_hcall_handler(struct guest_info * core , hcall_id_t hcall_id, void * priv_data)
// Decode the standard HVM hypercall register convention.
uint64_t bitness = core->vm_regs.rbx;
uint64_t a1 = core->vm_regs.rcx;
uint64_t a2 = core->vm_regs.rdx;
struct v3_vm_hvm *h = &core->vm_info->hvm_state;

// Only 64-bit callers are supported; rbx must carry the 64-bit magic cookie
// ('dddddddd' repeated as bytes).
if (bitness!=0x6464646464646464) {
PrintError(core->vm_info,core,"hvm: unable to handle non-64 bit hypercall\n");
core->vm_regs.rax = -1;

// Trace every hypercall arrival (c is presumably a cycle counter read above —
// its definition is elided from this excerpt).
V3_Print(core->vm_info,core, "hvm: received hypercall %x rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n",
hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, core->hvm_state.last_boot_start, core->num_exits);
//v3_print_core_telemetry(core);
// v3_print_guest_state(core);
core->vm_regs.rax = 0;

// --- Reset operations ---
case 0x1: // reset ros
PrintDebug(core->vm_info,core, "hvm: reset ROS\n");
if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ROS,0)) {
PrintError(core->vm_info,core, "hvm: reset of ROS failed\n");
core->vm_regs.rax = -1;
core->vm_regs.rax = 0;

case 0x2: // reset hrt
PrintDebug(core->vm_info,core, "hvm: reset HRT\n");
if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_HRT,0)) {
PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
core->vm_regs.rax = -1;
core->vm_regs.rax = 0;

case 0x3: // reset both
PrintDebug(core->vm_info,core, "hvm: reset ROS+HRT\n");
if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ALL,0)) {
PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
core->vm_regs.rax = -1;
core->vm_regs.rax = 0;

// --- State query: returns transaction state in rax and copies the pending
// ROS event record out to the guest buffer at GVA a2 (best effort). ---
case 0xf: // get HRT state
core->vm_regs.rax = h->trans_state;
if (v3_write_gva_memory(core, a2, sizeof(h->ros_event), (uint8_t*) &h->ros_event)!=sizeof(h->ros_event)) {
PrintError(core->vm_info, core, "hvm: cannot write back ROS event state to %p - continuing\n",(void*)a2);
//PrintDebug(core->vm_info,core,"hvm: get HRT transaction state 0x%llx\n",core->vm_regs.rax);

// --- ROS event request: reads an event record from guest GVA a2; only one
// event may be in flight at a time. ---
PrintDebug(core->vm_info, core, "hvm: ROS event request\n");
if (h->ros_event.event_type!=ROS_NONE) {
PrintError(core->vm_info, core, "hvm: ROS event is already in progress\n");
core->vm_regs.rax = -1;
if (v3_read_gva_memory(core, a2, sizeof(h->ros_event), (uint8_t*)&h->ros_event)!=sizeof(h->ros_event)) {
PrintError(core->vm_info, core, "hvm: cannot read ROS event from %p\n",(void*)a2);
core->vm_regs.rax = -1;
core->vm_regs.rax = 0;

// --- ROS event completion: a2 carries the result code. ---
PrintDebug(core->vm_info, core, "hvm: completion of ROS event (rc=0x%llx)\n",a2);
h->ros_event.event_type=ROS_NONE;
h->ros_event.last_ros_event_result = a2;

// --- Function invocation (ROS->HRT): upcall into the HRT core(s) either via
// a magic page fault (USE_UPCALL_MAGIC_PF) or a software interrupt. ---
case 0x20: // invoke function (ROS->HRT)
case 0x21: // invoke parallel function (ROS->HRT)
if (v3_is_hvm_hrt_core(core)) {
PrintError(core->vm_info,core,"hvm: %s function invocation not supported from HRT core\n", a1==0x20 ? "" : "parallel");
core->vm_regs.rax = -1;
// State machine: invocations are only legal from HRT_IDLE.
if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) {
PrintError(core->vm_info,core, "hvm: cannot invoke %s function %p in state %d\n",a1==0x20 ? "" : "parallel", (void*)a2,h->trans_state);
core->vm_regs.rax = -1;
uint64_t *page = (uint64_t *) h->comm_page_hva;
uint64_t first, last, cur;

PrintDebug(core->vm_info,core, "hvm: %s invoke function %p\n",a1==0x20 ? "" : "parallel",(void*)a2);

// 0x20 targets only the first HRT core; 0x21 targets all HRT cores.
first=last=h->first_hrt_core;
first=h->first_hrt_core;
last=core->vm_info->num_cores-1;

core->vm_regs.rax = 0;

// trans_count tracks how many HRT cores must report completion (0x2f).
h->trans_count = last-first+1;

for (cur=first;cur<=last;cur++) {

#if USE_UPCALL_MAGIC_PF
// Inject a page fault at the magic address/error code; the HRT's
// handler recognizes it as an upcall rather than a real fault.
PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",cur);
core->vm_info->cores[cur].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
if (v3_raise_exception_with_error(&core->vm_info->cores[cur],
UPCALL_MAGIC_ERROR)) {
PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",cur);
core->vm_regs.rax = -1;
// Alternative upcall path: software interrupt on the HRT vector.
PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %llu\n",h->hrt_int_vector,cur);
if (v3_raise_swintr(&core->vm_info->cores[cur],h->hrt_int_vector)) {
PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",cur);
core->vm_regs.rax = -1;

// Force core to exit now
v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);

// Advance the state machine only if all injections succeeded.
if (core->vm_regs.rax==0) {
h->trans_state = HRT_CALL;
h->trans_state = HRT_PARCALL;
PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
h->trans_state = HRT_IDLE;

// --- Synchronous-operation setup/teardown (ROS->HRT); mirrors the function
// invocation path but transitions to HRT_SYNCSETUP / HRT_SYNCTEARDOWN. ---
case 0x28: // setup for synchronous operation (ROS->HRT)
case 0x29: // teardown for synchronous operation (ROS->HRT)
if (v3_is_hvm_hrt_core(core)) {
PrintError(core->vm_info,core,"hvm: %ssynchronization invocation not supported from HRT core\n",a1==0x29 ? "de" : "");
core->vm_regs.rax = -1;
// Setup is legal only from HRT_IDLE; teardown only from HRT_SYNC.
if (ENFORCE_STATE_MACHINE &&
((a1==0x28 && h->trans_state!=HRT_IDLE) || (a1==0x29 && h->trans_state!=HRT_SYNC))) {
PrintError(core->vm_info,core, "hvm: cannot invoke %ssynchronization in state %d\n",a1==0x29 ? "de" : "", h->trans_state);
core->vm_regs.rax = -1;
uint64_t *page = (uint64_t *) h->comm_page_hva;
uint64_t first, last, cur;

PrintDebug(core->vm_info,core, "hvm: invoke %ssynchronization on address %p\n",a1==0x29 ? "de" : "",(void*)a2);

first=last=h->first_hrt_core; // initially we will sync only with BSP

core->vm_regs.rax = 0;

h->trans_count = last-first+1;

for (cur=first;cur<=last;cur++) {

#if USE_UPCALL_MAGIC_PF
PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",cur);
core->vm_info->cores[cur].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
if (v3_raise_exception_with_error(&core->vm_info->cores[cur],
UPCALL_MAGIC_ERROR)) {
PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",cur);
core->vm_regs.rax = -1;
PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %llu\n",h->hrt_int_vector,cur);
if (v3_raise_swintr(&core->vm_info->cores[cur],h->hrt_int_vector)) {
PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",cur);
core->vm_regs.rax = -1;

// Force core to exit now
v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);

if (core->vm_regs.rax==0) {
h->trans_state = HRT_SYNCSETUP;
h->trans_state = HRT_SYNCTEARDOWN;
PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
h->trans_state = HRT_IDLE;

// --- Completion (HRT->VMM): the last HRT core to report switches state. ---
case 0x2f: // function exec or sync done
if (v3_is_hvm_ros_core(core)) {
PrintError(core->vm_info,core, "hvm: request for exec or sync done from ROS core\n");
core->vm_regs.rax=-1;
if (ENFORCE_STATE_MACHINE &&
h->trans_state!=HRT_CALL &&
h->trans_state!=HRT_PARCALL &&
h->trans_state!=HRT_SYNCSETUP &&
h->trans_state!=HRT_SYNCTEARDOWN) {
PrintError(core->vm_info,core,"hvm: function or sync completion when not in HRT_CALL, HRT_PARCALL, HRT_SYNCSETUP, or HRT_SYNCTEARDOWN state\n");
core->vm_regs.rax=-1;
PrintDebug(core->vm_info,core, "hvm: function or sync complete\n");
// Atomic decrement: only the final completing core performs the state
// transition ('one' is presumably a constant 1 defined above — elided here).
if (__sync_fetch_and_sub(&h->trans_count,one)==1) {
// last one, switch state
if (h->trans_state==HRT_SYNCSETUP) {
h->trans_state=HRT_SYNC;
PrintDebug(core->vm_info,core, "hvm: function complete - now synchronous\n");
h->trans_state=HRT_IDLE;

// --- Address-space merge/unmerge (ROS->HRT): passes the ROS CR3 to the
// first HRT core via the comm page and upcalls it. ---
case 0x30: // merge address space
case 0x31: // unmerge address space
if (v3_is_hvm_hrt_core(core)) {
PrintError(core->vm_info,core,"hvm: request to %smerge address space from HRT core\n", a1==0x30 ? "" : "un");
core->vm_regs.rax=-1;
if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) {
PrintError(core->vm_info,core,"hvm: request to %smerge address space in non-idle state\n",a1==0x30 ? "" : "un");
core->vm_regs.rax=-1;
uint64_t *page = (uint64_t *) h->comm_page_hva;

PrintDebug(core->vm_info,core,"hvm: %smerge address space request with %p\n",a1==0x30 ? "" : "un",(void*)core->ctrl_regs.cr3);
// should sanity check to make sure guest is in 64 bit without anything strange

page[1] = core->ctrl_regs.cr3; // this is a do-not-care for an unmerge

core->vm_regs.rax = 0;
#if USE_UPCALL_MAGIC_PF
PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %u\n",h->first_hrt_core);
core->vm_info->cores[h->first_hrt_core].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
if (v3_raise_exception_with_error(&core->vm_info->cores[h->first_hrt_core],
UPCALL_MAGIC_ERROR)) {
PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %u\n",h->first_hrt_core);
core->vm_regs.rax = -1;
PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %u\n",h->hrt_int_vector,h->first_hrt_core);
if (v3_raise_swintr(&core->vm_info->cores[h->first_hrt_core],h->hrt_int_vector)) {
PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %u\n",h->first_hrt_core);
core->vm_regs.rax = -1;

// Force core to exit now
v3_interrupt_cpu(core->vm_info,core->vm_info->cores[h->first_hrt_core].pcpu_id,0);

h->trans_state = HRT_MERGE;

// --- Merge/unmerge completion (HRT->VMM). ---
case 0x3f: // merge operation done
if (v3_is_hvm_ros_core(core)) {
PrintError(core->vm_info,core, "hvm: request for merge done from ROS core\n");
core->vm_regs.rax=-1;
if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_MERGE) {
PrintError(core->vm_info,core,"hvm: merge/unmerge done when in non-idle state\n");
core->vm_regs.rax=-1;
PrintDebug(core->vm_info,core, "hvm: merge or unmerge complete - back to idle\n");
h->trans_state=HRT_IDLE;

// Unknown sub-operation.
PrintError(core->vm_info,core,"hvm: unknown hypercall %llx\n",a1);
core->vm_regs.rax=-1;
422 #define CEIL_DIV(x,y) (((x)/(y)) + !!((x)%(y)))
/*
 * Parse the <hvm> subtree of the VM configuration and initialize per-VM HVM
 * state. With no (or disabled) <hvm> config the VM is a pure ROS VM:
 * first_hrt_core == num_cores and first_hrt_gpa == mem_size, i.e. the HRT
 * partition is empty. Registers the HVM hypercall handler when enabled.
 *
 * NOTE(review): excerpt is gappy — returns, braces, and some local
 * declarations (enable, ros_cores, ros_mem, hrt_file_id) are elided.
 */
int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
v3_cfg_tree_t *hvm_config;
v3_cfg_tree_t *ros_config;
v3_cfg_tree_t *hrt_config;

PrintDebug(vm, VCORE_NONE, "hvm: vm init\n");

// Defaults: not an HVM; ROS owns all cores and all of guest memory.
memset(&vm->hvm_state,0,sizeof(struct v3_vm_hvm));
vm->hvm_state.is_hvm=0;
vm->hvm_state.first_hrt_core=vm->num_cores;
vm->hvm_state.first_hrt_gpa=vm->mem_size;

if (!config || !(hvm_config=v3_cfg_subtree(config,"hvm"))) {
PrintDebug(vm,VCORE_NONE,"hvm: no HVM configuration found (all HW is ROS)\n");

// The <hvm> block must explicitly say enable="y".
if (!(enable=v3_cfg_val(hvm_config,"enable")) || strcasecmp(enable,"y")) {
PrintDebug(vm,VCORE_NONE,"hvm: HVM configuration disabled (all HW is ROS)\n");

// <ros cores="..." mem="..."> partitions cores and GPA space.
if (!(ros_config=v3_cfg_subtree(hvm_config,"ros"))) {
PrintError(vm,VCORE_NONE,"hvm: HVM configuration without ROS block...\n");

if (!(ros_cores=v3_cfg_val(ros_config,"cores"))) {
PrintError(vm,VCORE_NONE,"hvm: ROS block without cores...\n");

// ROS gets cores [0, ROS_CORES); the HRT starts at core ROS_CORES.
vm->hvm_state.first_hrt_core = ((uint32_t)atoi(ros_cores));

if (!(ros_mem=v3_cfg_val(ros_config,"mem"))) {
PrintError(vm,VCORE_NONE,"hvm: ROS block without mem...\n");

// ROS mem is given in MB; the HRT region begins at that GPA.
vm->hvm_state.first_hrt_gpa = ((uint64_t)atoi(ros_mem))*1024*1024;

// <hrt file_id="..."> names the HRT kernel image in the file registry.
if (!(hrt_config=v3_cfg_subtree(hvm_config,"hrt"))) {
PrintError(vm,VCORE_NONE,"hvm: HVM configuration without HRT block...\n");

if (!(hrt_file_id=v3_cfg_val(hrt_config,"file_id"))) {
PrintError(vm,VCORE_NONE,"hvm: HRT block without file_id...\n");

vm->hvm_state.hrt_file = v3_cfg_get_file(vm,hrt_file_id);

if (!vm->hvm_state.hrt_file) {
PrintError(vm,VCORE_NONE,"hvm: HRT block contains bad file_id (%s)\n",hrt_file_id);

if (v3_register_hypercall(vm, HVM_HCALL,
hvm_hcall_handler, 0)) {
PrintError(vm,VCORE_NONE, "hvm: cannot register hypercall....\n");

// XXX sanity check config here

vm->hvm_state.is_hvm=1;

// Summarize the resulting ROS/HRT partitioning.
if (vm->hvm_state.is_hvm) {
V3_Print(vm,VCORE_NONE,"hvm: [ROS: cores 0..%u, mem 0..%p] [HRT: cores %u..%u, mem %p..%p, file_id=%s (tag %s)]\n",
vm->hvm_state.first_hrt_core-1,
(void*) vm->hvm_state.first_hrt_gpa-1,
vm->hvm_state.first_hrt_core,
(void*) vm->hvm_state.first_hrt_gpa,
(void*)vm->mem_size-1,
vm->hvm_state.hrt_file->tag);
V3_Print(vm,VCORE_NONE,"hvm: This is a pure ROS VM\n");
/*
 * Tear down per-VM HVM state: unregister the hypercall and, if a comm page
 * was allocated, delete its shadow memory region (the page itself is
 * presumably freed in elided lines — confirm against the full source).
 */
int v3_deinit_hvm_vm(struct v3_vm_info *vm)
PrintDebug(vm, VCORE_NONE, "hvm: HVM VM deinit\n");

v3_remove_hypercall(vm,HVM_HCALL);

if (vm->hvm_state.comm_page_hpa) {
struct v3_mem_region *r = v3_get_mem_region(vm,-1,(addr_t)vm->hvm_state.comm_page_hpa);
// Region lookup failing here indicates inconsistent bookkeeping.
PrintError(vm,VCORE_NONE,"hvm: odd, VM has comm_page_hpa, but no shadow memory\n");
v3_delete_mem_region(vm,r);
/*
 * Initialize per-core HVM state. Cores with vcpu_id >= first_hrt_core belong
 * to the HRT partition and are marked is_hrt.
 */
int v3_init_hvm_core(struct guest_info *core)
memset(&core->hvm_state,0,sizeof(core->hvm_state));
if (core->vm_info->hvm_state.is_hvm) {
if (core->vcpu_id >= core->vm_info->hvm_state.first_hrt_core) {
core->hvm_state.is_hrt=1;
/* Per-core HVM teardown; currently only logs (no per-core resources visible here). */
int v3_deinit_hvm_core(struct guest_info *core)
PrintDebug(core->vm_info, VCORE_NONE, "hvm: HVM core deinit\n");
/*
 * Size of the ROS-visible GPA range: [0, first_hrt_gpa) for an HVM
 * (the non-HVM return path is elided from this excerpt — presumably mem_size).
 */
uint64_t v3_get_hvm_ros_memsize(struct v3_vm_info *vm)
if (vm->hvm_state.is_hvm) {
return vm->hvm_state.first_hrt_gpa;
/*
 * Size of the memory visible to the HRT (body elided from this excerpt;
 * per the partitioning comment above, the HRT sees GPAs [0, MEM)).
 */
uint64_t v3_get_hvm_hrt_memsize(struct v3_vm_info *vm)
/* Number of cores visible to the ROS: [0, first_hrt_core) for an HVM, all cores otherwise. */
uint32_t v3_get_hvm_ros_cores(struct v3_vm_info *vm)
if (vm->hvm_state.is_hvm) {
return vm->hvm_state.first_hrt_core;
return vm->num_cores;
/*
 * Number of HRT-only cores: [first_hrt_core, num_cores) for an HVM
 * (the non-HVM return path — presumably 0 — is elided from this excerpt).
 */
uint32_t v3_get_hvm_hrt_cores(struct v3_vm_info *vm)
if (vm->hvm_state.is_hvm) {
return vm->num_cores - vm->hvm_state.first_hrt_core;
/*
 * True if gpa lies in the ROS partition [0, first_hrt_gpa) of an HVM
 * (the non-HVM path is elided from this excerpt).
 */
int v3_is_hvm_ros_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
if (vm->hvm_state.is_hvm) {
return gpa<vm->hvm_state.first_hrt_gpa;
/*
 * True if gpa lies in the HRT-only partition [first_hrt_gpa, mem_size) of an
 * HVM (the non-HVM path is elided from this excerpt).
 */
int v3_is_hvm_hrt_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
if (vm->hvm_state.is_hvm) {
return gpa>=vm->hvm_state.first_hrt_gpa && gpa<vm->mem_size;
/* True if this core belongs to the HRT partition. */
int v3_is_hvm_hrt_core(struct guest_info *core)
return core->hvm_state.is_hrt;
/* True if this core belongs to the ROS partition (i.e., not an HRT core). */
int v3_is_hvm_ros_core(struct guest_info *core)
return !core->hvm_state.is_hrt;
/*
 * IPI delivery policy across the ROS/HRT boundary.
 * With no source core (ioapic/msi-originated — the src==NULL test is elided
 * here), deliver only to non-HRT cores. Core-to-core: HRT cores may target
 * anyone; ROS cores may only target other ROS cores.
 */
int v3_hvm_should_deliver_ipi(struct guest_info *src, struct guest_info *dest)
// ioapic or msi to apic
return !dest->hvm_state.is_hrt;

return src->hvm_state.is_hrt || (!src->hvm_state.is_hrt && !dest->hvm_state.is_hrt) ;
/*
 * Report the APIC-id window [*start_apic, *start_apic + *num_apics) visible
 * to a given vantage point. core==NULL (test elided in this excerpt) means
 * the view from ioapic/msi; otherwise the view from a specific core.
 * (*start_apic assignments — presumably 0 — are elided.)
 */
void v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_info *vm,
uint32_t *start_apic, uint32_t *num_apics)
// Seen from ioapic, msi, etc:
if (vm->hvm_state.is_hvm) {
// HVM VM shows only the ROS cores/apics to ioapic, msi, etc
*num_apics = vm->hvm_state.first_hrt_core;
// Non-HVM shows all cores/APICs to apic, msi, etc.
*num_apics = vm->num_cores;

if (core->hvm_state.is_hrt) {
// HRT core/apic sees all apics
// (this policy may change...)
*num_apics = vm->num_cores;
// non-HRT core/apic sees only non-HRT cores/apics
*num_apics = vm->hvm_state.first_hrt_core;
656 #define MAX(x,y) ((x)>(y)?(x):(y))
657 #define MIN(x,y) ((x)<(y)?(x):(y))
/*
 * Top of the region used for HRT boot state (GDT/IDT/TSS/page tables/MB info
 * are all placed working downward from here): the page-aligned end of guest
 * physical memory.
 */
static uint64_t boot_state_end_addr(struct v3_vm_info *vm)
return PAGE_ADDR(vm->mem_size);
/*
 * GPA of the null interrupt handler stub: the last page below the boot-state
 * end (*limit assignment — presumably PAGE_SIZE — is elided in this excerpt).
 */
static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
*base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - PAGE_SIZE);
671 extern v3_cpu_arch_t v3_mach_type;
673 extern void *v3_hvm_svm_null_int_handler_start;
674 extern void *v3_hvm_svm_null_int_handler_end;
675 extern void *v3_hvm_vmx_null_int_handler_start;
676 extern void *v3_hvm_vmx_null_int_handler_end;
/*
 * Copy the architecture-appropriate (SVM vs VMX) null interrupt handler stub
 * into the guest at the location chosen by get_null_int_handler_loc(). The
 * stubs are assembly blobs bracketed by the extern start/end symbols above.
 *
 * NOTE(review): local declarations (base, limit, data, len) and several case
 * labels of the CPU-type switch are elided from this excerpt.
 */
static void write_null_int_handler(struct v3_vm_info *vm)
get_null_int_handler_loc(vm,&base,&limit);

switch (v3_mach_type) {
case V3_SVM_REV3_CPU:
// Stub length is the distance between the start/end linker symbols.
data = (void*) &v3_hvm_svm_null_int_handler_start;
len = (void*) &v3_hvm_svm_null_int_handler_end - data;
case V3_VMX_EPT_UG_CPU:
data = (void*) &v3_hvm_vmx_null_int_handler_start;
len = (void*) &v3_hvm_vmx_null_int_handler_end - data;
PrintError(vm,VCORE_NONE,"hvm: cannot determine CPU type to select null interrupt handler...\n");

v3_write_gpa_memory(&vm->cores[0],(addr_t)(base),len,(uint8_t*)data);

PrintDebug(vm,VCORE_NONE,"hvm: wrote null interrupt handler at %p (%llu bytes)\n",base,len);
/*
 * GPA of the HRT IDT: second page below the boot-state end, just under the
 * null-handler page (*limit assignment elided in this excerpt).
 */
static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
*base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - 2 * PAGE_SIZE);
723 // default IDT entries (int and trap gates)
725 // Format is 16 bytes long:
727 // 16 selector => (target code selector) => 0x8 // entry 1 of GDT
728 // 3 ist => (stack) = 0 => current stack
730 // 4 type => 0xe=>INT, 0xf=>TRAP
731 // 1 reserved => 0 (indicates "system" by being zero)
735 // 32 offsethigh => 0 (total is a 64 bit offset)
738 // 00 00 | 08 00 | 00 | 8[typenybble] | offsetmid | offsethigh | reserved
740 // Note little endian
742 static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 } ;
743 static uint64_t idt64_int_gate_entry_mask[2] = { 0x00008e0000080000, 0x0 };
/*
 * Build and write a 256-entry 64-bit IDT into the guest. Entries 0-31 are
 * trap gates, 32-255 interrupt gates; every entry points at the null
 * interrupt handler (offset split across bytes 0-1, 6-7, 8-11 of the
 * 16-byte long-mode gate descriptor, per the mask layout documented above).
 *
 * NOTE(review): local declarations (base, limit, handler, hand, mask, i) and
 * the first for-loop header are elided from this excerpt.
 */
static void write_idt(struct v3_vm_info *vm)
uint64_t handler_len;
uint64_t trap_gate[2];
uint64_t int_gate[2];

get_idt_loc(vm,&base,&limit);

get_null_int_handler_loc(vm,&handler,&handler_len);

// The gate must hold the handler's virtual address, so apply the gva offset.
handler += vm->hvm_state.gva_offset;

// Start from the skeleton gate descriptors (selector/type preset, offset 0).
memcpy(trap_gate,idt64_trap_gate_entry_mask,16);
memcpy(int_gate,idt64_int_gate_entry_mask,16);

// update the entries for the handler location
hand = (uint8_t*) &handler;

mask = (uint8_t *)trap_gate;
memcpy(&(mask[0]),&(hand[0]),2); // offset low
memcpy(&(mask[6]),&(hand[2]),2); // offset med
memcpy(&(mask[8]),&(hand[4]),4); // offset high

mask = (uint8_t *)int_gate;
memcpy(&(mask[0]),&(hand[0]),2); // offset low
memcpy(&(mask[6]),&(hand[2]),2); // offset med
memcpy(&(mask[8]),&(hand[4]),4); // offset high

PrintDebug(vm,VCORE_NONE,"hvm: Adding default null trap and int gates\n");

// Vectors 0-31 (exceptions): trap gates.
v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)trap_gate);

// Vectors 32-255 (external/software interrupts): interrupt gates.
for (i=32;i<256;i++) {
v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)int_gate);

PrintDebug(vm,VCORE_NONE,"hvm: wrote IDT at %p\n",base);
/*
 * GPA of the HRT GDT: third page below the boot-state end (*limit assignment
 * elided in this excerpt).
 */
static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
*base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 3 * PAGE_SIZE);
803 static uint64_t gdt64[3] = {
804 0x0000000000000000, /* null */
805 0x00a09a0000000000, /* code (note lme bit) */
806 0x00a0920000000000, /* data (most entries don't matter) */
/*
 * Write the static 3-entry 64-bit GDT (null / code / data, defined in gdt64
 * above) into the guest. Local declarations (base, limit) are elided here.
 */
static void write_gdt(struct v3_vm_info *vm)
get_gdt_loc(vm,&base,&limit);
v3_write_gpa_memory(&vm->cores[0],(addr_t)base,limit,(uint8_t*) gdt64);

PrintDebug(vm,VCORE_NONE,"hvm: wrote GDT at %p\n",base);
/*
 * GPA of the HRT TSS: fourth page below the boot-state end (*limit
 * assignment elided in this excerpt).
 */
static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
*base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 4 * PAGE_SIZE);
/*
 * Zero-fill the TSS page in the guest (an all-zero TSS suffices for the HRT
 * bootstrap). Local declarations (base, limit) are elided here.
 */
static void write_tss(struct v3_vm_info *vm)
get_tss_loc(vm,&base,&limit);

v3_set_gpa_memory(&vm->cores[0],(addr_t)base,limit,0);

PrintDebug(vm,VCORE_NONE,"hvm: wrote TSS at %p\n",base);
841 #define TOP_HALF_START 0xffff800000000000ULL
842 #define BOTTOM_HALF_END 0x00007fffffffffffULL
845 #define L4_UNIT PAGE_SIZE
846 #define L3_UNIT (512ULL * L4_UNIT)
847 #define L2_UNIT (512ULL * L3_UNIT)
848 #define L1_UNIT (512ULL * L2_UNIT)
/*
 * Count the page-table pages needed at each level to map
 * [0, max_mem_mapped) with 4 KB pages: each level needs CEIL_DIV(entries,
 * 512) pages, where the entry count is the mapped size divided by that
 * level's coverage. (*l1 assignment — presumably 1, a single PML4 — is
 * elided from this excerpt.)
 */
static void compute_pts_4KB(struct v3_vm_info *vm,
uint64_t *l1, uint64_t *l2, uint64_t *l3, uint64_t *l4)
// we map the physical memory up to max_mem_mapped either at 0x0 or at TOP_HALF start
// that is, it either fills in the first 256 rows of PML4 or the last 256 rows of PML4
// so it is the same number of page tables regardless

uint64_t max_gva = vm->hvm_state.max_mem_mapped;

*l2 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*512ULL*4096ULL),512);
*l3 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*4096ULL),512);
*l4 = CEIL_DIV(CEIL_DIV(max_gva,4096ULL),512);
869 PTS MAP using 1 GB pages
870 n second levels pts, highest gva, highest address
876 PTS MAP using 2 MB pages
877 n third level pts, highest gva, highest address
878 m second level pts, highest gva, highest address
883 PTS MAP using 4 KB pages
884 n 4th level, highest gva, highest address
m 3rd level, highest gva, highest address
886 l second level, highest gva, highest address
890 PTS MAP using 512 GB pages when this becomes available
/*
 * Compute where the HRT page tables live: num_pt pages immediately below the
 * TSS page (i.e., below the 4 fixed boot-state pages). The page count
 * depends on the largest page size the HRT's multiboot flags request; fewer
 * levels are materialized for larger pages.
 *
 * NOTE(review): the num_pt declaration and the assignments for the 512GB/1GB
 * branches are elided from this excerpt.
 */
static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
uint64_t l1,l2,l3,l4;

compute_pts_4KB(vm,&l1,&l2,&l3,&l4);

if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
} else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
} else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
num_pt = l1 + l2 + l3;
} else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) {
num_pt = l1 + l2 + l3 + l4;
PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT location flags=0x%llx memsize=0x%llx\n",vm->hvm_state.hrt_flags,(uint64_t)vm->mem_size);

*base = (void*)PAGE_ADDR(boot_state_end_addr(vm)-(4+num_pt)*PAGE_SIZE);
*limit = num_pt*PAGE_SIZE;
/*
 * Construct the HRT's identity-with-offset page tables directly in guest
 * memory: map GPA [0, max_mem_mapped) at GVA gva_offset (either 0 or
 * TOP_HALF_START), stopping at the level implied by the requested page size
 * (large_page set at PDPE for 1GB, PDE for 2MB, else full 4KB PTEs).
 * Each level's tables are laid out contiguously (start_l1..start_l4) inside
 * the region returned by get_pt_loc(); tables are filled via gpa->hva
 * translation of each table page.
 *
 * NOTE(review): this excerpt elides the max_level/size/cur_pt/cur_gva/
 * cur_gpa/pt/i declarations, the max_level assignments per branch, the
 * early-out tests between levels, and the pdpe/pde/pte declarations.
 */
static void write_pts(struct v3_vm_info *vm)
uint64_t num_l1, num_l2, num_l3, num_l4;
void *start_l1, *start_l2, *start_l3, *start_l4;
void *max_gpa = (void*) vm->hvm_state.max_mem_mapped;
void *min_gva = (void*) vm->hvm_state.gva_offset;
#ifdef V3_CONFIG_DEBUG_HVM
// max_gva is only needed for the debug print below.
void *max_gva = min_gva+vm->hvm_state.max_mem_mapped;
uint64_t i_start,i_end;

struct pml4e64 *pml4e;

// Determine how many levels to build from the HRT's paging flags.
if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
PrintError(vm,VCORE_NONE,"hvm: Attempt to build 512 GB pages\n");
} else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
} else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
} else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) {
PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT levels\n");

get_pt_loc(vm,&start_l1,&size);
compute_pts_4KB(vm,&num_l1,&num_l2,&num_l3,&num_l4);

// Levels are packed back-to-back: L1 (PML4), then PDPTs, PDs, PTs.
start_l2=start_l1+PAGE_SIZE*num_l1;
start_l3=start_l2+PAGE_SIZE*num_l2;
start_l4=start_l3+PAGE_SIZE*num_l3;

PrintDebug(vm,VCORE_NONE,"hvm: writing %llu levels of PTs start at address %p\n", max_level,start_l1);
PrintDebug(vm,VCORE_NONE,"hvm: min_gva=%p, max_gva=%p, min_gpa=%p, max_gpa=%p\n",min_gva,max_gva,min_gpa,max_gpa);
PrintDebug(vm,VCORE_NONE,"hvm: num_l1=%llu, num_l2=%llu, num_l3=%llu, num_l4=%llu\n", num_l1, num_l2, num_l3, num_l4);
PrintDebug(vm,VCORE_NONE,"hvm: start_l1=%p, start_l2=%p, start_l3=%p, start_l4=%p\n", start_l1, start_l2, start_l3, start_l4);

// build PML4 (only one)
if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pml4e)) {
PrintError(vm,VCORE_NONE,"hvm: Cannot translate pml4 location\n");

memset(pml4e,0,PAGE_SIZE);

// Low-half mapping fills PML4 slots from 0; top-half from slot 256.
i_start=0; i_end = num_l2;
} else if (min_gva==(void*)TOP_HALF_START) {
i_start=256; i_end=256+num_l2;
PrintError(vm,VCORE_NONE,"hvm: unsupported gva offset\n");

for (i=i_start, cur_gva=min_gva, cur_gpa=min_gpa;
i++, cur_gva+=L1_UNIT, cur_gpa+=L1_UNIT) {

// 512 GB pages would terminate here, but no such PML4E large page exists.
PrintError(vm,VCORE_NONE,"hvm: Intel has not yet defined a PML4E large page\n");
pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
//PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
// Otherwise point at the corresponding next-level (PDPT) table page.
pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(start_l2+(i-i_start)*PAGE_SIZE));
//PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);

// Build the PDPTs (level 2 of the walk).
for (cur_pt=start_l2, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
cur_pt+=PAGE_SIZE, pt++) {

if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pdpe)) {
PrintError(vm,VCORE_NONE,"hvm: Cannot translate pdpe location\n");

memset(pdpe,0,PAGE_SIZE);

i<512 && cur_gpa<max_gpa;
i++, cur_gva+=L2_UNIT, cur_gpa+=L2_UNIT) {

// 1 GB pages terminate the walk here.
pdpe[i].large_page=1;
pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
//PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(start_l3+(pt*512+i)*PAGE_SIZE));
//PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);

// Build the PDs (level 3).
for (cur_pt=start_l3, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
cur_pt+=PAGE_SIZE, pt++) {

if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pde)) {
PrintError(vm,VCORE_NONE,"hvm: Cannot translate pde location\n");

memset(pde,0,PAGE_SIZE);

i<512 && cur_gpa<max_gpa;
i++, cur_gva+=L3_UNIT, cur_gpa+=L3_UNIT) {

// 2 MB pages terminate the walk here.
pde[i].large_page=1;
pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
//PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t) pde[i].pt_base_addr);
pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(start_l4+(pt*512+i)*PAGE_SIZE));
//PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pde[i].pt_base_addr);

// Build the PTs (level 4, 4 KB pages).
for (cur_pt=start_l4, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
cur_pt+=PAGE_SIZE, pt++) {

if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pte)) {
PrintError(vm,VCORE_NONE,"hvm: Cannot translate pte location\n");

memset(pte,0,PAGE_SIZE);

i<512 && cur_gpa<max_gpa;
i++, cur_gva+=L4_UNIT, cur_gpa+=L4_UNIT) {

pte[i].page_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
//PrintDebug(vm,VCORE_NONE,"hvm: pte: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pte[i].page_base_addr);
/*
 * Location for the multiboot info structure: derived from the page-table
 * region (the adjustment below the PT base is elided from this excerpt).
 */
static void get_mb_info_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
get_pt_loc(vm,base, limit);
/*
 * Fill in the HRT-specific multiboot info tag handed to the HRT kernel:
 * APIC topology, CPU frequency, and the negotiated HVM parameters
 * (flags, mapped memory, partition boundary, gva offset, comm page,
 * upcall vector). Returns via an elided statement (presumably 0).
 */
int v3_build_hrt_multiboot_tag(struct guest_info *core, mb_info_hrt_t *hrt)
struct v3_vm_hvm *vm = core->vm_info;

hrt->tag.type = MB_INFO_HRT_TAG;
hrt->tag.size = sizeof(mb_info_hrt_t);

// APIC ids [first_hrt_core, num_cores) belong to the HRT; no HRT-private ioapic.
hrt->total_num_apics = vm->num_cores;
hrt->first_hrt_apic_id = vm->hvm_state.first_hrt_core;
hrt->have_hrt_ioapic=0;
hrt->first_hrt_ioapic_entry=0;

hrt->cpu_freq_khz = V3_CPU_KHZ();

// Echo back the configuration negotiated in configure_hrt().
hrt->hrt_flags = vm->hvm_state.hrt_flags;
hrt->max_mem_mapped = vm->hvm_state.max_mem_mapped;
hrt->first_hrt_gpa = vm->hvm_state.first_hrt_gpa;
hrt->gva_offset = vm->hvm_state.gva_offset;
hrt->comm_page_gpa = vm->hvm_state.comm_page_gpa;
hrt->hrt_int_vector = vm->hvm_state.hrt_int_vector;
/*
 * Build the multiboot info table into a local 256-byte buffer and copy it to
 * the guest at the mb-info location. Only HRT_MBOOT64 HRTs are supported.
 *
 * NOTE(review): the base/limit/size/buf declarations and the size>limit
 * comparison are elided from this excerpt.
 */
static void write_mb_info(struct v3_vm_info *vm)
if (vm->hvm_state.hrt_type!=HRT_MBOOT64) {
PrintError(vm, VCORE_NONE,"hvm: Cannot handle this HRT type\n");

get_mb_info_loc(vm,&base,&limit);

if ((size=v3_build_multiboot_table(&vm->cores[vm->hvm_state.first_hrt_core],buf,256))==-1) {
PrintError(vm,VCORE_NONE,"hvm: Failed to build MB info\n");

// Built table must fit in the reserved region.
PrintError(vm,VCORE_NONE,"hvm: MB info is too large\n");

v3_write_gpa_memory(&vm->cores[vm->hvm_state.first_hrt_core],

PrintDebug(vm,VCORE_NONE, "hvm: wrote MB info at %p\n", base);
1175 #define SCRATCH_STACK_SIZE 4096
/*
 * Region available for the HRT image itself: from its first GPA up to the
 * bottom of the boot-state structures, minus one scratch stack per HRT core
 * carved out below the MB info. Fails if the stacks would overlap the HRT.
 *
 * NOTE(review): mb_base/mb_limit declarations are elided from this excerpt.
 */
static void get_hrt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
get_mb_info_loc(vm,&mb_base,&mb_limit);

// Reserve a SCRATCH_STACK_SIZE stack per HRT core below the MB info.
mb_base-=SCRATCH_STACK_SIZE*v3_get_hvm_hrt_cores(vm);

*base = (void*)PAGE_ADDR(vm->hvm_state.first_hrt_gpa);

if (mb_base < *base+PAGE_SIZE) {
PrintError(vm,VCORE_NONE,"hvm: HRT stack colides with HRT\n");

*limit = mb_base - *base;
1197 #define ERROR(fmt, args...) PrintError(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
1198 #define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
1200 #define ELF_MAGIC 0x464c457f
1201 #define MB2_MAGIC 0xe85250d6
1203 #define MB2_INFO_MAGIC 0x36d76289
/*
 * Check for the ELF magic (0x7f 'E' 'L' 'F', little-endian 0x464c457f) at
 * the start of the image (return statements elided from this excerpt).
 */
static int is_elf(uint8_t *data, uint64_t size)
if (*((uint32_t*)data)==ELF_MAGIC) {
/*
 * Scan the first 32 KB of the image, 4-byte aligned, for the multiboot2
 * header magic; return a pointer to the header or (via an elided return)
 * NULL if not found.
 */
static mb_header_t *find_mb_header(uint8_t *data, uint64_t size)
uint64_t limit = size > 32768 ? 32768 : size;

// Scan for the .boot magic cookie
// must be in first 32K, assume 4 byte aligned
for (i=0;i<limit;i+=4) {
if (*((uint32_t*)&data[i])==MB2_MAGIC) {
INFO("Found multiboot header at offset 0x%llx\n",i);
return (mb_header_t *) &data[i];
/*
 * Validate the HRT's multiboot-declared requirements against what the VMM
 * supports and record the negotiated configuration into vm->hvm_state:
 * paging mode (512GB rejected; 1GB/2MB/4KB accepted), mapped memory size
 * (raised to at least 4 GB), gva offset (0 or TOP_HALF_START only), load
 * address bounds, upcall vector, and the comm page (allocated and shadow-
 * mapped at commgpa on first configuration, re-zeroed on reconfiguration).
 *
 * NOTE(review): excerpt is gappy — return statements, some vector/range
 * checks, and the flag clearing before the f |= lines are elided.
 */
static int configure_hrt(struct v3_vm_info *vm, mb_data_t *mb)
struct v3_vm_hvm *h = &vm->hvm_state;
uint64_t f = mb->mb64_hrt->hrt_flags;
uint64_t maxmap = mb->mb64_hrt->max_mem_to_map;
uint64_t gvaoff = mb->mb64_hrt->gva_offset;
uint64_t gvaentry = mb->mb64_hrt->gva_entry;
uint64_t commgpa = mb->mb64_hrt->comm_page_gpa;
uint8_t vec = mb->mb64_hrt->hrt_int_vector;

PrintDebug(vm,VCORE_NONE,"hvm: HRT request: flags=0x%llx max_map=0x%llx gva_off=%llx gva_entry=%llx comm_page=0x%llx vector=0x%x\n",
f, maxmap, gvaoff,gvaentry,commgpa, vec);

// Always map at least 4 GB regardless of what the HRT asked for.
if (maxmap<0x100000000ULL) {
PrintDebug(vm,VCORE_NONE,"hvm: revising request up to 4 GB max map\n");
maxmap=0x100000000ULL;

// Select the paging model, preferring the largest page size requested.
if (f & MB_TAG_MB64_HRT_FLAG_MAP_512GB) {
PrintError(vm,VCORE_NONE,"hvm: support for 512 GB pages is not yet available in hardware\n");
} else if (f & MB_TAG_MB64_HRT_FLAG_MAP_1GB) {
f |= MB_TAG_MB64_HRT_FLAG_MAP_1GB;
h->max_mem_mapped = maxmap;
PrintDebug(vm,VCORE_NONE,"hvm: 1 GB pages selected\n");
} else if (f & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
f |= MB_TAG_MB64_HRT_FLAG_MAP_2MB;
h->max_mem_mapped = maxmap;
PrintDebug(vm,VCORE_NONE,"hvm: 2 MB pages selected\n");
} else if (f & MB_TAG_MB64_HRT_FLAG_MAP_4KB) {
f |= MB_TAG_MB64_HRT_FLAG_MAP_4KB;
h->max_mem_mapped = maxmap;
PrintDebug(vm,VCORE_NONE,"hvm: 4 KB pages selected\n");
PrintError(vm,VCORE_NONE,"hvm: no page table model is requested\n");

if (f & MB_TAG_MB64_HRT_FLAG_RELOC) {
PrintError(vm,VCORE_NONE,"hvm: relocatable hrt not currently supported\n");

if (maxmap>h->max_mem_mapped) {
PrintError(vm,VCORE_NONE,"hvm: requested 0x%llx bytes mapped, which is more than currently supported\n",maxmap);

// Only identity (0) and top-half GVA layouts are supported.
if (gvaoff!=0 && gvaoff!=TOP_HALF_START) {
PrintError(vm,VCORE_NONE,"hvm: currently only GVA offsets of 0 and %llx are supported\n", TOP_HALF_START);

h->gva_offset = gvaoff;

h->gva_entry = gvaentry;

// HRT image must load entirely within its GPA partition, with headroom
// (64 MB) reserved below the top of memory for the boot-state structures.
if (mb->addr->load_addr < h->first_hrt_gpa) {
PrintError(vm,VCORE_NONE,"hvm: load start address of HRT is below first HRT GPA\n");

if (mb->addr->bss_end_addr > (vm->mem_size-(1024*1024*64))) {
PrintError(vm,VCORE_NONE,"hvm: bss end address of HRT above last allowed GPA\n");

PrintError(vm,VCORE_NONE,"hvm: cannot support vector %x\n",vec);

h->hrt_int_vector = vec;

// The comm page must sit above guest RAM so the shadow mapping cannot
// shadow real memory.
if (commgpa < vm->mem_size) {
PrintError(vm,VCORE_NONE,"hvm: cannot map comm page over physical memory\n");

h->comm_page_gpa = commgpa;

// First configuration: allocate and map the comm page. On reconfiguration
// the existing page is simply re-zeroed (second memset below).
if (!h->comm_page_hpa) {
if (!(h->comm_page_hpa=V3_AllocPages(1))) {
PrintError(vm,VCORE_NONE,"hvm: unable to allocate space for comm page\n");

h->comm_page_hva = V3_VAddr(h->comm_page_hpa);

memset(h->comm_page_hva,0,PAGE_SIZE_4KB);

if (v3_add_shadow_mem(vm,-1,h->comm_page_gpa,h->comm_page_gpa+PAGE_SIZE_4KB,(addr_t)h->comm_page_hpa)) {
PrintError(vm,VCORE_NONE,"hvm: unable to map communication page\n");
// NOTE(review): this frees the GPA value, not comm_page_hpa — looks like
// a bug in the original (should free the allocated HPA); confirm upstream.
V3_FreePages((void*)(h->comm_page_gpa),1);

PrintDebug(vm,VCORE_NONE,"hvm: added comm page for first time\n");

memset(h->comm_page_hva,0,PAGE_SIZE_4KB);

PrintDebug(vm,VCORE_NONE,"hvm: HRT configuration: flags=0x%llx max_mem_mapped=0x%llx gva_offset=0x%llx gva_entry=0x%llx comm_page=0x%llx vector=0x%x\n",
h->hrt_flags,h->max_mem_mapped, h->gva_offset,h->gva_entry, h->comm_page_gpa, h->hrt_int_vector);
/*
 * setup_mb_kernel_hrt() - parse the HRT file's multiboot header,
 * negotiate its configuration (configure_hrt), write the kernel image
 * into the HRT-only GPA region, and record the entry address and type.
 *
 * Returns 0 on success; presumably nonzero on the PrintError paths
 * (explicit returns not visible in this excerpt — TODO confirm).
 */
1349 static int setup_mb_kernel_hrt(struct v3_vm_info *vm)
1353     if (v3_parse_multiboot_header(vm->hvm_state.hrt_file,&mb)) {
1354 	PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n");
1358     if (configure_hrt(vm,&mb)) {
1359 	PrintError(vm,VCORE_NONE, "hvm: cannot configure HRT\n");
     // Load the image starting at the first HRT GPA, with the remainder
     // of guest physical memory as the available region.
1363     if (v3_write_multiboot_kernel(vm,&mb,vm->hvm_state.hrt_file,
1364 				  (void*)vm->hvm_state.first_hrt_gpa,
1365 				  vm->mem_size-vm->hvm_state.first_hrt_gpa)) {
1366 	PrintError(vm,VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n");
     // A requested GVA entry point overrides the multiboot entry address;
     // otherwise the multiboot entry is shifted by the GVA offset.
1370     if (vm->hvm_state.gva_entry) {
1371 	vm->hvm_state.hrt_entry_addr = vm->hvm_state.gva_entry;
1373 	vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + vm->hvm_state.gva_offset;
1376     vm->hvm_state.hrt_type = HRT_MBOOT64;
/*
 * setup_hrt() - dispatch HRT image setup based on the image format.
 * Only ELF images containing a multiboot header are currently accepted;
 * anything else is rejected with an error.
 */
1383 static int setup_hrt(struct v3_vm_info *vm)
1385     if (is_elf(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size) &&
1386 	find_mb_header(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) {
1388 	PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n");
1389 	if (setup_mb_kernel_hrt(vm)) {
1390 	    PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n");
1394 	PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not a multiboot kernel\n");
1411 We do not touch the ROS portion of the address space.
1412 The HRT portion looks like:
1414 INT_HANDLER (1 page - page aligned)
1415 IDT (1 page - page aligned)
1416 GDT (1 page - page aligned)
1417    TSS (1 page - page aligned)
1418    PAGETABLES  (identity map of first N GB)
1419 ROOT PT first (lowest memory addr), followed by 2nd level PTs in order,
1420 followed by 3rd level PTs in order, followed by 4th level
1423 SCRATCH_STACK_HRT_CORE0
1424 SCRATCH_STACK_HRT_CORE1
1426 SCRATCH_STACK_HRT_COREN
1428 HRT (as many pages as needed, page-aligned, starting at first HRT address)
/*
 * v3_setup_hvm_vm_for_boot() - VM-wide HVM boot setup: load and
 * configure the HRT image, then lay out the supporting structures
 * (e.g. the null interrupt handler) whose locations depend on where
 * the HRT landed.  No-op for non-HVM VMs.
 */
1436 int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm)
1438     if (!vm->hvm_state.is_hvm) {
1439 	PrintDebug(vm,VCORE_NONE,"hvm: skipping HVM setup for boot as this is not an HVM\n");
1443     PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory begins\n");
1445     if (setup_hrt(vm)) {
1446 	PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n");
1450     // the locations of all the other items are determined by
1451     // the HRT setup, so these must happen after
1453     write_null_int_handler(vm);
1460     // this must happen last
1463     PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory done\n");
1469 On entry for every core:
1471 IDTR points to stub IDT
1472 GDTR points to stub GDT
1473    TR points to stub TSS
1474 CR3 points to root page table
1476 EFER has LME AND LMA (and NX for compatibility with Linux)
1477 RSP is TOS of core's scratch stack (looks like a call)
1479 RAX = MB magic cookie
1480 RBX = address of multiboot info table
1481 RCX = this core id / apic id (0..N-1)
1482 RDX = this core id - first HRT core ID (==0 for the first HRT core)
1484 All addresses are virtual addresses, offset as needed by gva_offset
1486 Other regs are zeroed
1488 shadow/nested paging state reset for long mode
/*
 * v3_setup_hvm_hrt_core_for_boot() - force an HRT core directly into
 * the 64-bit long-mode state described in the comment above: GPRs set
 * to the multiboot handoff convention, RSP/RBP on the core's scratch
 * stack, CRs/EFER configured for paging in long mode, stub
 * IDT/GDT/TSS installed, and flat 64-bit code/data segments loaded.
 * No-op for non-HRT cores.
 */
1491 int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
1495     uint64_t gva_offset;
     // timestamp the start of this (re)boot
1497     rdtscll(core->hvm_state.last_boot_start);
1500     if (!core->hvm_state.is_hrt) {
1501 	PrintDebug(core->vm_info,core,"hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id);
1506     PrintDebug(core->vm_info, core, "hvm: setting up HRT core (%u) for boot\n", core->vcpu_id);
1508     gva_offset = core->vm_info->hvm_state.gva_offset;
     // start from a clean slate: zero all architectural state we set below
1510     memset(&core->vm_regs,0,sizeof(core->vm_regs));
1511     memset(&core->ctrl_regs,0,sizeof(core->ctrl_regs));
1512     memset(&core->dbg_regs,0,sizeof(core->dbg_regs));
1513     memset(&core->segments,0,sizeof(core->segments));
1514     memset(&core->msrs,0,sizeof(core->msrs));
1515     memset(&core->fp_state,0,sizeof(core->fp_state));
1517     // We are in long mode with virtual memory and we want
1518     // to start immediately
1519     core->cpl = 0; // we are going right into the kernel
1520     core->cpu_mode = LONG;
1521     core->mem_mode = VIRTUAL_MEM;
1522     core->core_run_state = CORE_RUNNING ;
     // RAX = multiboot magic cookie, per the handoff convention above
1526     core->vm_regs.rax = MB2_INFO_MAGIC;
1528     // multiboot info pointer
1529     get_mb_info_loc(core->vm_info, &base,&limit);
1530     core->vm_regs.rbx = (uint64_t) base + gva_offset;
     // RCX = this core's apic/vcpu id
1533     core->vm_regs.rcx = core->vcpu_id;
     // RDX = id relative to the first HRT core (0 for the first HRT core)
1536     core->vm_regs.rdx = core->vcpu_id - core->vm_info->hvm_state.first_hrt_core;
1538     // Now point to scratch stack for this core
1539     // it begins at an offset relative to the MB info page
1540     get_mb_info_loc(core->vm_info, &base,&limit);
1541     base = base + gva_offset;
1542     base -= core->vm_regs.rdx * SCRATCH_STACK_SIZE;
1543     core->vm_regs.rsp = (v3_reg_t) base;
1544     core->vm_regs.rbp = (v3_reg_t) base-8;
1546     // push onto the stack a bad rbp and bad return address
1547     core->vm_regs.rsp-=16;
     // write through the GPA (subtract gva_offset to undo the GVA shift)
1548     v3_set_gpa_memory(core,
1549 		      core->vm_regs.rsp-gva_offset,
     // RIP: requested GVA entry wins; otherwise the HRT entry address
     // shifted into the GVA window
1555     get_hrt_loc(core->vm_info, &base,&limit);
1556     if (core->vm_info->hvm_state.gva_entry) {
1557 	core->rip = core->vm_info->hvm_state.gva_entry;
1559 	core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr + gva_offset;
1564     PrintDebug(core->vm_info,core,"hvm: hrt core %u has rip=%p, rsp=%p, rbp=%p, rax=%p, rbx=%p, rcx=%p, rdx=%p\n",
1565 	       (core->vcpu_id - core->vm_info->hvm_state.first_hrt_core),
1567 	       (void*)(core->vm_regs.rsp),
1568 	       (void*)(core->vm_regs.rbp),
1569 	       (void*)(core->vm_regs.rax),
1570 	       (void*)(core->vm_regs.rbx),
1571 	       (void*)(core->vm_regs.rcx),
1572 	       (void*)(core->vm_regs.rdx));
1574     // Setup CRs for long mode and our stub page table
     // CR0: PG (bit 31) and PE (bit 0)
1576     core->ctrl_regs.cr0 = 0x80000001;
1577     core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0;
1579     // CR2: don't care (output from #PF)
1580     // CR3: set to our PML4E, without setting PCD or PWT
1581     get_pt_loc(core->vm_info, &base,&limit);
1582     core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base);  // not offset as this is a GPA
1583     core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3;
1585     // CR4: PGE, PAE, PSE (last byte: 1 0 1 1 0 0 0 0)
1586     core->ctrl_regs.cr4 = 0xb0;
1587     core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4;
1589     // RFLAGS zeroed is fine: come in with interrupts off
     // EFER = 0x1d00: SVME (bit 12), NXE (bit 11), LMA (bit 10), LME (bit 8)
1591     core->ctrl_regs.efer = 0x1d00;
1592     core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer;
1598        selector is 13 bits of index, 1 bit table indicator
1601        index is scaled by 8, even in long mode, where some entries
1602        are 16 bytes long....
1603           -> code, data descriptors have 8 byte format
1604              because base, limit, etc, are ignored (no segmentation)
1605           -> interrupt/trap gates have 16 byte format
1606              because offset needs to be 64 bits
1609     // Install our stub IDT
1610     get_idt_loc(core->vm_info, &base,&limit);
1612     core->segments.idtr.selector = 0;  // entry 0 (NULL) of the GDT
1613     core->segments.idtr.base = (addr_t) base;  // only base+limit are used
1614     core->segments.idtr.limit = limit-1;
1615     core->segments.idtr.type = 0x0;
1616     core->segments.idtr.system = 0;
1617     core->segments.idtr.dpl = 0;
1618     core->segments.idtr.present = 0;
1619     core->segments.idtr.long_mode = 0;
1621     // Install our stub GDT
1622     get_gdt_loc(core->vm_info, &base,&limit);
1624     core->segments.gdtr.selector = 0;  // entry 0 (NULL) of the GDT
1625     core->segments.gdtr.base = (addr_t) base;
1626     core->segments.gdtr.limit = limit-1;   // only base+limit are used
1627     core->segments.gdtr.type = 0x0;
1628     core->segments.gdtr.system = 0;
1629     core->segments.gdtr.dpl = 0;
1630     core->segments.gdtr.present = 0;
1631     core->segments.gdtr.long_mode = 0;
     // Install our stub TSS (via the task register)
1634     get_tss_loc(core->vm_info, &base,&limit);
1636     core->segments.tr.selector = 0;
1637     core->segments.tr.base = (addr_t) base;
1638     core->segments.tr.limit = limit-1;
1639     core->segments.tr.type = 0x9;
1640     core->segments.tr.system = 0;   // available 64 bit TSS
1641     core->segments.tr.dpl = 0;
1642     core->segments.tr.present = 1;
1643     core->segments.tr.long_mode = 0; // not used
1645     base = 0x0; // these are not offset as we want to make all gvas visible
     // Flat 64-bit code segment
1649     core->segments.cs.selector = 0x8 ; // entry 1 of GDT (RPL=0)
1650     core->segments.cs.base = (addr_t) base;  // not used
1651     core->segments.cs.limit = limit;         // not used
1652     core->segments.cs.type = 0xe;            // only C is used
1653     core->segments.cs.system = 1;            // not a system segment
1654     core->segments.cs.dpl = 0;
1655     core->segments.cs.present = 1;
1656     core->segments.cs.long_mode = 1;
1658     // DS, SS, etc are identical
1659     core->segments.ds.selector = 0x10; // entry 2 of GDT (RPL=0)
1660     core->segments.ds.base = (addr_t) base;
1661     core->segments.ds.limit = limit;
1662     core->segments.ds.type = 0x6;            // ignored
1663     core->segments.ds.system = 1;            // not a system segment
1664     core->segments.ds.dpl = 0;
1665     core->segments.ds.present = 1;
1666     core->segments.ds.long_mode = 1;
1668     memcpy(&core->segments.ss,&core->segments.ds,sizeof(core->segments.ds));
1669     memcpy(&core->segments.es,&core->segments.ds,sizeof(core->segments.ds));
1670     memcpy(&core->segments.fs,&core->segments.ds,sizeof(core->segments.ds));
1671     memcpy(&core->segments.gs,&core->segments.ds,sizeof(core->segments.ds));
1674     // reset paging here for shadow...
1676     if (core->shdw_pg_mode != NESTED_PAGING) {
1677 	PrintError(core->vm_info, core, "hvm: shadow paging guest... this will end badly\n");
1685 int v3_handle_hvm_reset(struct guest_info *core)
1688 if (core->core_run_state != CORE_RESETTING) {
1692 if (!core->vm_info->hvm_state.is_hvm) {
1696 if (v3_is_hvm_hrt_core(core)) {
1697 // this is an HRT reset
1700 // wait for all the HRT cores
1701 v3_counting_barrier(&core->vm_info->reset_barrier);
1703 if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
1705 core->vm_info->run_state = VM_RESETTING;
1708 core->core_run_state = CORE_RESETTING;
1710 if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
1711 // we really only need to clear the bss
1712 // and recopy the .data, but for now we'll just
1714 rc |= v3_setup_hvm_vm_for_boot(core->vm_info);
1717 PrintError(core->vm_info,core,"hvm: failed to setup HVM VM for boot rc=%d\n",rc);
1721 // now everyone is ready to reset
1722 rc |= v3_setup_hvm_hrt_core_for_boot(core);
1725 PrintError(core->vm_info,core,"hvm: failed to setup HVM core for boot rc=%d\n",rc);
1728 core->core_run_state = CORE_RUNNING;
1730 if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
1732 core->vm_info->run_state = VM_RUNNING;
1733 core->vm_info->hvm_state.trans_state = HRT_IDLE;
1736 v3_counting_barrier(&core->vm_info->reset_barrier);
1739 PrintError(core->vm_info,core,"hvm: reset failed\n");
1746 // ROS core will be handled by normal reset functionality