Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git

This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, simply execute

  cd palacios
  git checkout --track -b devel origin/devel

The other branches are similar.
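For example, to work on a release branch instead (the branch name below is only illustrative; run "git branch -r" inside the clone to list the branches that actually exist), execute

  git checkout --track -b Release-1.2 origin/Release-1.2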


File: palacios/src/palacios/vmm_hvm.c (commit 3e506d76982cf95ff0c9a95518d6f3a3763a10fb)
/* 
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National 
 * Science Foundation and the Department of Energy.  
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at 
 * http://www.v3vee.org
 *
 * Copyright (c) 2015, The V3VEE Project <http://www.v3vee.org> 
 * All rights reserved.
 *
 * Author:  Peter Dinda <pdinda@northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */

#include <palacios/vmm_mem.h>
#include <palacios/vmm.h>
#include <palacios/vmm_util.h>
#include <palacios/vmm_emulator.h>
#include <palacios/vm_guest.h>
#include <palacios/vmm_debug.h>
#include <palacios/vmm_hypercall.h>

#include <palacios/vmm_xml.h>

#include <palacios/vm_guest_mem.h>


/*

  MEM     = Total size of memory in the GPA (in MB)
  ROS_MEM = Total size of memory for the ROS (in MB) (<MEM)

  GPAs [0,ROS_MEM) are what the ROS sees
  GPAs [ROS_MEM, MEM) are HRT only
  GPAs [0,MEM) are accessible by the HRT

  CORES     = Total number of cores in VM
  ROS_CORES = Total number of cores for the ROS

  Cores [0,ROS_CORES) are what the ROS sees
  Cores [ROS_CORES,CORES) are HRT only
  Cores [0,CORES) are accessible by the HRT

  In a Pal file:

  <files> 
    <file id="hrtelf" filename="hrtelf.o" />
  </files>

  <mem ... >MEM</mem>         (MB)  Note these are
  <cores count="CORES" ...>   backward compatible

  <hvm enable="y" >
    <ros cores="ROS_CORES" mem="ROS_MEM" /> (MB)
    <hrt file_id="hrtelf" />
  </hvm>

*/
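
/*
  Worked example (illustrative values): with MEM=1024 and ROS_MEM=512,
  GPAs [0,512 MB) are visible to the ROS and GPAs [512 MB,1024 MB) are
  HRT-only; with CORES=4 and ROS_CORES=2, cores 0-1 belong to the ROS
  and cores 2-3 are HRT-only.  The HRT sees all 1024 MB and all 4 cores.
*/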

#ifndef V3_CONFIG_DEBUG_HVM
#undef PrintDebug
#define PrintDebug(fmt, args...)
#endif


int v3_init_hvm()
{
    PrintDebug(VM_NONE,VCORE_NONE, "hvm: init\n");
    return 0;
}

int v3_deinit_hvm()
{
    PrintDebug(VM_NONE,VCORE_NONE, "hvm: deinit\n");
    return 0;
}

// ignore requests that arrive when we are in the wrong state
#define ENFORCE_STATE_MACHINE 1

// invoke the HRT using a page fault instead of
// the SWINTR mechanism
#define USE_UPCALL_MAGIC_PF  1
#define UPCALL_MAGIC_ADDRESS 0x0000800df00df00dULL
#define UPCALL_MAGIC_ERROR   0xf00df00d

/*
  64 bit only hypercall:

  rax = hypercall number
  rbx = 0x6464646464646464 (64-bit marker)
  then args are: rcx, rdx, rsi, rdi, r8, r9, r10, r11
  rcx = 1st arg
*/
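
/*
  Sketch of a guest-side invocation of the null hypercall (0x0) under the
  convention above.  This is an assumption-laden illustration, not code
  shipped with Palacios: HVM_HCALL is the hypercall number registered in
  v3_init_hvm_vm() below, and "vmmcall" would be "vmcall" on Intel VMX.
*/
#if 0
static inline long hvm_null_hcall(void)
{
    long rc;
    asm volatile ("vmmcall"
                  : "=a" (rc)
                  : "a" ((uint64_t)HVM_HCALL),      // rax = hypercall number
                    "b" (0x6464646464646464ULL),    // rbx = 64-bit marker
                    "c" (0x0ULL)                    // rcx = a1 = null request
                  : "memory");
    return rc;                                      // 0 on success, -1 on error
}
#endif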
static int hvm_hcall_handler(struct guest_info * core , hcall_id_t hcall_id, void * priv_data)
{
    uint64_t c;
    uint64_t bitness = core->vm_regs.rbx;
    uint64_t a1 = core->vm_regs.rcx;
    uint64_t a2 = core->vm_regs.rdx;
    struct v3_vm_hvm *h = &core->vm_info->hvm_state;


    if (bitness!=0x6464646464646464) { 
        PrintError(core->vm_info,core,"hvm: unable to handle non-64 bit hypercall\n");
        core->vm_regs.rax = -1;
        return 0;
    }

    switch (a1) {
        case 0x0:   // null

            rdtscll(c);

            V3_Print(core->vm_info,core, "hvm: received hypercall %x  rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n",
                     hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, core->hvm_state.last_boot_start, core->num_exits);
            //v3_print_core_telemetry(core);
            //v3_print_guest_state(core);
            core->vm_regs.rax = 0;
            break;

        case 0x1: // reset ros
            PrintDebug(core->vm_info,core, "hvm: reset ROS\n");
            if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ROS,0)) { 
                PrintError(core->vm_info,core, "hvm: reset of ROS failed\n");
                core->vm_regs.rax = -1;
            } else {
                core->vm_regs.rax = 0;
            }
            break;

        case 0x2: // reset hrt
            PrintDebug(core->vm_info,core, "hvm: reset HRT\n");
            if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_HRT,0)) { 
                PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
                core->vm_regs.rax = -1;
            } else {
                core->vm_regs.rax = 0;
            }
            break;

        case 0x3: // reset both
            PrintDebug(core->vm_info,core, "hvm: reset ROS+HRT\n");
            if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ALL,0)) { 
                PrintError(core->vm_info,core, "hvm: reset of ROS+HRT failed\n");
                core->vm_regs.rax = -1;
            } else {
                core->vm_regs.rax = 0;
            }
            break;

        case 0xf: // get HRT state
            core->vm_regs.rax = h->trans_state;
            //PrintDebug(core->vm_info,core,"hvm: get HRT transaction state 0x%llx\n",core->vm_regs.rax);
            break;

        case 0x20: // invoke function (ROS->HRT)
        case 0x21: // invoke parallel function (ROS->HRT)
            if (v3_is_hvm_hrt_core(core)) { 
                PrintError(core->vm_info,core,"hvm: %s function invocation not supported from HRT core\n", a1==0x20 ? "" : "parallel");
                core->vm_regs.rax = -1;
            } else {
                if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { 
                    PrintError(core->vm_info,core, "hvm: cannot invoke %s function %p in state %d\n",a1==0x20 ? "" : "parallel", (void*)a2,h->trans_state);
                    core->vm_regs.rax = -1;
                } else {
                    uint64_t *page = (uint64_t *) h->comm_page_hva;
                    uint64_t first, last, cur;

                    PrintDebug(core->vm_info,core, "hvm: %s invoke function %p\n",a1==0x20 ? "" : "parallel",(void*)a2);
                    page[0] = a1;
                    page[1] = a2;

                    if (a1==0x20) { 
                        first=last=h->first_hrt_core;
                    } else {
                        first=h->first_hrt_core;
                        last=core->vm_info->num_cores-1;
                    }

                    core->vm_regs.rax = 0;

                    h->trans_count = last-first+1;

                    for (cur=first;cur<=last;cur++) { 

#if USE_UPCALL_MAGIC_PF
                        PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",cur);
                        core->vm_info->cores[cur].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
                        if (v3_raise_exception_with_error(&core->vm_info->cores[cur],
                                                          PF_EXCEPTION, 
                                                          UPCALL_MAGIC_ERROR)) { 
                            PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",cur);
                            core->vm_regs.rax = -1;
                            break;
                        }
#else
                        PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%x into core %llu\n",h->hrt_int_vector,cur);
                        if (v3_raise_swintr(&core->vm_info->cores[cur],h->hrt_int_vector)) { 
                            PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",cur);
                            core->vm_regs.rax = -1;
                            break;
                        }
#endif
                        // Force core to exit now
                        v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);

                    }
                    if (core->vm_regs.rax==0) { 
                        if (a1==0x20) { 
                            h->trans_state = HRT_CALL;
                        } else {
                            h->trans_state = HRT_PARCALL;
                        }
                    } else {
                        PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
                        h->trans_state = HRT_IDLE;
                        h->trans_count = 0;
                    }
                }
            }
            break;


        case 0x2f: // function exec done
            if (v3_is_hvm_ros_core(core)) { 
                PrintError(core->vm_info,core, "hvm: request for exec done from ROS core\n");
                core->vm_regs.rax=-1;
            } else {
                if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_CALL && h->trans_state!=HRT_PARCALL) {
                    PrintError(core->vm_info,core,"hvm: function completion when not in HRT_CALL or HRT_PARCALL state\n");
                    core->vm_regs.rax=-1;
                } else {
                    uint64_t one=1;
                    PrintDebug(core->vm_info,core, "hvm: function complete\n");
                    if (__sync_fetch_and_sub(&h->trans_count,one)==1) {
                        // last one, switch state
                        h->trans_state=HRT_IDLE;
                        PrintDebug(core->vm_info,core, "hvm: function complete - back to idle\n");
                    }
                    core->vm_regs.rax=0;
                }
            }

            break;

        case 0x30: // merge address space
        case 0x31: // unmerge address space
            if (v3_is_hvm_hrt_core(core)) { 
                PrintError(core->vm_info,core,"hvm: request to %smerge address space from HRT core\n", a1==0x30 ? "" : "un");
                core->vm_regs.rax=-1;
            } else {
                if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { 
                    PrintError(core->vm_info,core,"hvm: request to %smerge address space in non-idle state\n",a1==0x30 ? "" : "un");
                    core->vm_regs.rax=-1;
                } else {
                    uint64_t *page = (uint64_t *) h->comm_page_hva;

                    PrintDebug(core->vm_info,core,"hvm: %smerge address space request with %p\n",a1==0x30 ? "" : "un",(void*)core->ctrl_regs.cr3);
                    // should sanity check to make sure guest is in 64 bit without anything strange

                    page[0] = a1;
                    page[1] = core->ctrl_regs.cr3;  // this is a do-not-care for an unmerge

                    core->vm_regs.rax = 0;
#if USE_UPCALL_MAGIC_PF
                    PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %u\n",h->first_hrt_core);
                    core->vm_info->cores[h->first_hrt_core].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
                    if (v3_raise_exception_with_error(&core->vm_info->cores[h->first_hrt_core],
                                                      PF_EXCEPTION,  
                                                      UPCALL_MAGIC_ERROR)) { 
                        PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %u\n",h->first_hrt_core);
                        core->vm_regs.rax = -1;
                        break;
                    }
#else
                    PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%x into core %u\n",h->hrt_int_vector,h->first_hrt_core);
                    if (v3_raise_swintr(&core->vm_info->cores[h->first_hrt_core],h->hrt_int_vector)) { 
                        PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %u\n",h->first_hrt_core);
                        core->vm_regs.rax = -1;
                    } 
#endif
                    // Force core to exit now
                    v3_interrupt_cpu(core->vm_info,core->vm_info->cores[h->first_hrt_core].pcpu_id,0);

                    h->trans_state = HRT_MERGE;
                }

            }

            break;


        case 0x3f: // merge operation done
            if (v3_is_hvm_ros_core(core)) { 
                PrintError(core->vm_info,core, "hvm: request for merge done from ROS core\n");
                core->vm_regs.rax=-1;
            } else {
                if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_MERGE) {
                    PrintError(core->vm_info,core,"hvm: merge/unmerge done when not in merge state\n");
                    core->vm_regs.rax=-1;
                } else {
                    PrintDebug(core->vm_info,core, "hvm: merge or unmerge complete - back to idle\n");
                    h->trans_state=HRT_IDLE;
                    core->vm_regs.rax=0;
                }
            }

            break;

        default:
            PrintError(core->vm_info,core,"hvm: unknown hypercall %llx\n",a1);
            core->vm_regs.rax=-1;
            break;
    }

    return 0;
}

#define CEIL_DIV(x,y) (((x)/(y)) + !!((x)%(y)))
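// e.g., CEIL_DIV(1024,512)==2 but CEIL_DIV(1025,512)==3; used below to count page-table pages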

int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
{
    v3_cfg_tree_t *hvm_config;
    v3_cfg_tree_t *ros_config;
    v3_cfg_tree_t *hrt_config;
    char *enable;
    char *ros_cores;
    char *ros_mem;
    char *hrt_file_id=0;

    PrintDebug(vm, VCORE_NONE, "hvm: vm init\n");

    /* 
       Defaults - all ROS
    */
    memset(&vm->hvm_state,0,sizeof(struct v3_vm_hvm));
    vm->hvm_state.is_hvm=0;
    vm->hvm_state.first_hrt_core=vm->num_cores;
    vm->hvm_state.first_hrt_gpa=vm->mem_size;

    if (!config || !(hvm_config=v3_cfg_subtree(config,"hvm"))) {
        PrintDebug(vm,VCORE_NONE,"hvm: no HVM configuration found (all HW is ROS)\n");
        goto out_ok;
    }

    if (!(enable=v3_cfg_val(hvm_config,"enable")) || strcasecmp(enable,"y")) {
        PrintDebug(vm,VCORE_NONE,"hvm: HVM configuration disabled (all HW is ROS)\n");
        goto out_ok;
    }

    if (!(ros_config=v3_cfg_subtree(hvm_config,"ros"))) { 
        PrintError(vm,VCORE_NONE,"hvm: HVM configuration without ROS block...\n");
        return -1;
    }

    if (!(ros_cores=v3_cfg_val(ros_config,"cores"))) { 
        PrintError(vm,VCORE_NONE,"hvm: ROS block without cores...\n");
        return -1;
    }

    vm->hvm_state.first_hrt_core = ((uint32_t)atoi(ros_cores));

    if (!(ros_mem=v3_cfg_val(ros_config,"mem"))) { 
        PrintError(vm,VCORE_NONE,"hvm: ROS block without mem...\n");
        return -1;
    }

    vm->hvm_state.first_hrt_gpa = ((uint64_t)atoi(ros_mem))*1024*1024;

    if (!(hrt_config=v3_cfg_subtree(hvm_config,"hrt"))) { 
        PrintError(vm,VCORE_NONE,"hvm: HVM configuration without HRT block...\n");
        return -1;
    }

    if (!(hrt_file_id=v3_cfg_val(hrt_config,"file_id"))) { 
        PrintError(vm,VCORE_NONE,"hvm: HRT block without file_id...\n");
        return -1;
    }

    vm->hvm_state.hrt_file = v3_cfg_get_file(vm,hrt_file_id);

    if (!vm->hvm_state.hrt_file) { 
        PrintError(vm,VCORE_NONE,"hvm: HRT block contains bad file_id (%s)\n",hrt_file_id);
        return -1;
    }

    if (v3_register_hypercall(vm, HVM_HCALL, 
                              hvm_hcall_handler, 0)) { 
        PrintError(vm,VCORE_NONE, "hvm: cannot register hypercall....\n");
        return -1;
    }

    // XXX sanity check config here

    vm->hvm_state.is_hvm=1;

 out_ok:
    if (vm->hvm_state.is_hvm) {
        V3_Print(vm,VCORE_NONE,"hvm: [ROS: cores 0..%u, mem 0..%p] [HRT: cores %u..%u, mem %p..%p, file_id=%s (tag %s)]\n",
                 vm->hvm_state.first_hrt_core-1,
                 (void*) vm->hvm_state.first_hrt_gpa-1,
                 vm->hvm_state.first_hrt_core,
                 vm->num_cores-1,
                 (void*) vm->hvm_state.first_hrt_gpa,
                 (void*)vm->mem_size-1,
                 hrt_file_id,
                 vm->hvm_state.hrt_file->tag);
    } else {
        V3_Print(vm,VCORE_NONE,"hvm: This is a pure ROS VM\n");
    }
    return 0;

}


int v3_deinit_hvm_vm(struct v3_vm_info *vm)
{
    PrintDebug(vm, VCORE_NONE, "hvm: HVM VM deinit\n");

    v3_remove_hypercall(vm,HVM_HCALL);

    if (vm->hvm_state.comm_page_hpa) { 
        struct v3_mem_region *r = v3_get_mem_region(vm,-1,(addr_t)vm->hvm_state.comm_page_hpa);
        if (!r) { 
            PrintError(vm,VCORE_NONE,"hvm: odd, VM has comm_page_hpa, but no shadow memory\n");
        } else {
            v3_delete_mem_region(vm,r);
        }
    }

    return 0;
}

int v3_init_hvm_core(struct guest_info *core)
{
    memset(&core->hvm_state,0,sizeof(core->hvm_state));
    if (core->vm_info->hvm_state.is_hvm) { 
        if (core->vcpu_id >= core->vm_info->hvm_state.first_hrt_core) { 
            core->hvm_state.is_hrt=1;
        }
    }
    return 0;
}

int v3_deinit_hvm_core(struct guest_info *core)
{
    PrintDebug(core->vm_info, VCORE_NONE, "hvm: HVM core deinit\n");

    return 0;
}


uint64_t v3_get_hvm_ros_memsize(struct v3_vm_info *vm)
{
    if (vm->hvm_state.is_hvm) { 
        return vm->hvm_state.first_hrt_gpa;
    } else {
        return vm->mem_size;
    }
}

uint64_t v3_get_hvm_hrt_memsize(struct v3_vm_info *vm)
{
    return vm->mem_size;
}

uint32_t v3_get_hvm_ros_cores(struct v3_vm_info *vm)
{
    if (vm->hvm_state.is_hvm) { 
        return vm->hvm_state.first_hrt_core;
    } else {
        return vm->num_cores;
    }
}

uint32_t v3_get_hvm_hrt_cores(struct v3_vm_info *vm)
{
    if (vm->hvm_state.is_hvm) { 
        return vm->num_cores - vm->hvm_state.first_hrt_core;
    } else {
        return 0;
    }
}


int v3_is_hvm_ros_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
{
    if (vm->hvm_state.is_hvm) { 
        return gpa>=0 && gpa<vm->hvm_state.first_hrt_gpa;
    } else {
        return 1;
    }
}

int v3_is_hvm_hrt_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
{
    if (vm->hvm_state.is_hvm) { 
        return gpa>=vm->hvm_state.first_hrt_gpa && gpa<vm->mem_size;
    } else {
        return 0;
    }
}

int v3_is_hvm_hrt_core(struct guest_info *core)
{
    return core->hvm_state.is_hrt;
}

int v3_is_hvm_ros_core(struct guest_info *core)
{
    return !core->hvm_state.is_hrt;
}

int      v3_hvm_should_deliver_ipi(struct guest_info *src, struct guest_info *dest)
{
    if (!src) {
        // ioapic or msi to apic
        return !dest->hvm_state.is_hrt;
    } else {
        // apic to apic
        return src->hvm_state.is_hrt || (!src->hvm_state.is_hrt && !dest->hvm_state.is_hrt);
    }
}

void     v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_info *vm, 
                                        uint32_t *start_apic, uint32_t *num_apics)
{
    if (!core) { 
        // Seen from ioapic, msi, etc: 
        if (vm->hvm_state.is_hvm) {
            // HVM VM shows only the ROS cores/apics to ioapic, msi, etc
            *start_apic = 0;
            *num_apics = vm->hvm_state.first_hrt_core;
        } else {
            // Non-HVM shows all cores/apics to ioapic, msi, etc.
            *start_apic = 0;
            *num_apics = vm->num_cores;
        }
    } else {
        // Seen from apic
        if (core->hvm_state.is_hrt) { 
            // HRT core/apic sees all apics
            // (this policy may change...)
            *start_apic = 0;
            *num_apics = vm->num_cores;
        } else {
            // non-HRT core/apic sees only non-HRT cores/apics
            *start_apic = 0;
            *num_apics = vm->hvm_state.first_hrt_core;
        }
    }
}

#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)<(y)?(x):(y))


static uint64_t boot_state_end_addr(struct v3_vm_info *vm) 
{
    return PAGE_ADDR(vm->mem_size);
}

static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - PAGE_SIZE);
    *limit = PAGE_SIZE;
}

extern v3_cpu_arch_t v3_mach_type;

extern void *v3_hvm_svm_null_int_handler_start;
extern void *v3_hvm_svm_null_int_handler_end;
extern void *v3_hvm_vmx_null_int_handler_start;
extern void *v3_hvm_vmx_null_int_handler_end;

static void write_null_int_handler(struct v3_vm_info *vm)
{
    void *base;
    uint64_t limit;
    void *data;
    uint64_t len;

    get_null_int_handler_loc(vm,&base,&limit);

    switch (v3_mach_type) {
#ifdef V3_CONFIG_SVM
        case V3_SVM_CPU:
        case V3_SVM_REV3_CPU:
            data = (void*) &v3_hvm_svm_null_int_handler_start;
            len = (void*) &v3_hvm_svm_null_int_handler_end - data;
            break;
#endif
#ifdef V3_CONFIG_VMX
        case V3_VMX_CPU:
        case V3_VMX_EPT_CPU:
        case V3_VMX_EPT_UG_CPU:
            data = (void*) &v3_hvm_vmx_null_int_handler_start;
            len = (void*) &v3_hvm_vmx_null_int_handler_end - data;
            break;
#endif
        default:
            PrintError(vm,VCORE_NONE,"hvm: cannot determine CPU type to select null interrupt handler...\n");
            data = 0;
            len = 0;
    }

    if (data) {
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base),len,(uint8_t*)data);
    }

    PrintDebug(vm,VCORE_NONE,"hvm: wrote null interrupt handler at %p (%llu bytes)\n",base,len);
}


static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - 2 * PAGE_SIZE);
    *limit = 16*256;
}

// default IDT entries (int and trap gates)
//
// Format is 16 bytes long:
//   16 offsetlo   => 0
//   16 selector   => (target code selector) => 0x8 // entry 1 of GDT
//    3 ist        => (stack) = 0 => current stack
//    5 reserved   => 0
//    4 type       => 0xe=>INT, 0xf=>TRAP 
//    1 reserved   => 0  (indicates "system" by being zero)
//    2 dpl        => 0
//    1 present    => 1
//   16 offsetmid  => 0
//   32 offsethigh => 0   (total is a 64 bit offset)
//   32 reserved   => 0
//
// 00 00 | 08 00 | 00 | 8[typenybble] | offsetmid | offsethigh | reserved
// 
// Note little endian
//
static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 };
static uint64_t idt64_int_gate_entry_mask[2] =  { 0x00008e0000080000, 0x0 };
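
/*
  Worked example (hypothetical handler GVA, for illustration only): if the
  null handler lived at 0x0000000012345678, then offsetlo=0x5678,
  offsetmid=0x1234, offsethigh=0x00000000, and the int gate's first 8
  bytes would become (little endian):

      78 56 | 08 00 | 00 | 8e | 34 12

  which is exactly the patching that write_idt() below performs on the
  masks via its three memcpy()s.
*/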

static void write_idt(struct v3_vm_info *vm)
{
    void *base;
    uint64_t limit;
    void *handler;
    uint64_t handler_len;
    int i;
    uint64_t trap_gate[2];
    uint64_t int_gate[2];

    get_idt_loc(vm,&base,&limit);

    get_null_int_handler_loc(vm,&handler,&handler_len);

    handler += vm->hvm_state.gva_offset;

    memcpy(trap_gate,idt64_trap_gate_entry_mask,16);
    memcpy(int_gate,idt64_int_gate_entry_mask,16);

    if (handler) {
        // update the entries for the handler location
        uint8_t *mask;
        uint8_t *hand;

        hand = (uint8_t*) &handler;

        mask = (uint8_t *)trap_gate;
        memcpy(&(mask[0]),&(hand[0]),2); // offset low
        memcpy(&(mask[6]),&(hand[2]),2); // offset med
        memcpy(&(mask[8]),&(hand[4]),4); // offset high

        mask = (uint8_t *)int_gate;
        memcpy(&(mask[0]),&(hand[0]),2); // offset low
        memcpy(&(mask[6]),&(hand[2]),2); // offset med
        memcpy(&(mask[8]),&(hand[4]),4); // offset high

        PrintDebug(vm,VCORE_NONE,"hvm: Adding default null trap and int gates\n");
    }

    for (i=0;i<32;i++) { 
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)trap_gate);
    }

    for (i=32;i<256;i++) { 
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)int_gate);
    }

    PrintDebug(vm,VCORE_NONE,"hvm: wrote IDT at %p\n",base);
}



static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 3 * PAGE_SIZE);
    *limit = 8*3;
}

static uint64_t gdt64[3] = {
    0x0000000000000000, /* null */
    0x00a09a0000000000, /* code (note lme bit) */
    0x00a0920000000000, /* data (most entries don't matter) */
};

static void write_gdt(struct v3_vm_info *vm)
{
    void *base;
    uint64_t limit;

    get_gdt_loc(vm,&base,&limit);
    v3_write_gpa_memory(&vm->cores[0],(addr_t)base,limit,(uint8_t*) gdt64);

    PrintDebug(vm,VCORE_NONE,"hvm: wrote GDT at %p\n",base);
}



static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 4 * PAGE_SIZE);
    *limit = PAGE_SIZE;
}

static void write_tss(struct v3_vm_info *vm)
{
    void *base;
    uint64_t limit;

    get_tss_loc(vm,&base,&limit);

    v3_set_gpa_memory(&vm->cores[0],(addr_t)base,limit,0);

    PrintDebug(vm,VCORE_NONE,"hvm: wrote TSS at %p\n",base);
}


#define TOP_HALF_START  0xffff800000000000ULL
#define BOTTOM_HALF_END 0x00007fffffffffffULL


#define L4_UNIT PAGE_SIZE
#define L3_UNIT (512ULL * L4_UNIT)
#define L2_UNIT (512ULL * L3_UNIT)
#define L1_UNIT (512ULL * L2_UNIT)

static void compute_pts_4KB(struct v3_vm_info *vm, 
                            uint64_t *l1, uint64_t *l2, uint64_t *l3, uint64_t *l4)    
{

    // we map the physical memory up to max_mem_mapped either at 0x0 or at TOP_HALF start
    // that is, it either fills in the first 256 rows of PML4 or the last 256 rows of PML4
    // so it is the same number of page tables regardless

    uint64_t max_gva = vm->hvm_state.max_mem_mapped;

    *l1 = 1;  // 1 PML4
    *l2 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*512ULL*4096ULL),512);
    *l3 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*4096ULL),512);
    *l4 = CEIL_DIV(CEIL_DIV(max_gva,4096ULL),512);
}
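
/*
  Worked example: for max_mem_mapped = 4 GB (the floor imposed in
  configure_hrt() below), compute_pts_4KB() yields
     *l1 = 1                                         (one PML4)
     *l2 = CEIL_DIV(CEIL_DIV(4GB, 1GB), 512) = 1     (one PDP)
     *l3 = CEIL_DIV(CEIL_DIV(4GB, 2MB), 512) = 4     (four PDs)
     *l4 = CEIL_DIV(CEIL_DIV(4GB, 4KB), 512) = 2048  (2048 PTs)
  so a full 4-level (4 KB page) mapping needs 2054 page-table pages.
*/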



/*
  PTS MAP using 1 GB pages
  n second level pts, highest gva, highest address
  1 top level


OR

  PTS MAP using 2 MB pages
  n third level pts, highest gva, highest address
  m second level pts, highest gva, highest address
  1 top level pt

OR

  PTS MAP using 4 KB pages
  n 4th level, highest gva, highest address
  m 3rd level, highest gva, highest address
  l second level, highest gva, highest address
  1 top level pt

OR
  PTS MAP using 512 GB pages when this becomes available

*/


static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    uint64_t l1,l2,l3,l4;
    uint64_t num_pt;

    compute_pts_4KB(vm,&l1,&l2,&l3,&l4);

    if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { 
        num_pt = l1;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { 
        num_pt = l1 + l2;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
        num_pt = l1 + l2 + l3;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { 
        num_pt = l1 + l2 + l3 + l4;
    } else {
        PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT location flags=0x%llx memsize=0x%llx\n",vm->hvm_state.hrt_flags,(uint64_t)vm->mem_size);
        return;
    }

    *base = (void*)PAGE_ADDR(boot_state_end_addr(vm)-(4+num_pt)*PAGE_SIZE);
    *limit = num_pt*PAGE_SIZE;
}

static void write_pts(struct v3_vm_info *vm)
{
    uint64_t size;
    uint64_t num_l1, num_l2, num_l3, num_l4;
    void *start_l1, *start_l2, *start_l3, *start_l4;
    uint64_t max_level;
    void *cur_pt;
    void *cur_gva;
    void *cur_gpa;
    void *min_gpa = 0;
    void *max_gpa = (void*) vm->hvm_state.max_mem_mapped;
    void *min_gva = (void*) vm->hvm_state.gva_offset;
#ifdef V3_CONFIG_DEBUG_HVM
    void *max_gva = min_gva+vm->hvm_state.max_mem_mapped;
#endif
    uint64_t i, pt;
    uint64_t i_start,i_end;

    struct pml4e64 *pml4e;
    struct pdpe64 *pdpe;
    struct pde64 *pde;
    struct pte64 *pte;

    if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { 
        PrintError(vm,VCORE_NONE,"hvm: Attempt to build 512 GB pages\n");
        max_level = 1;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { 
        max_level = 2;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
        max_level = 3;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { 
        max_level = 4;
    } else {
        PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT levels\n");
        return;
    }

    get_pt_loc(vm,&start_l1,&size);
    compute_pts_4KB(vm,&num_l1,&num_l2,&num_l3,&num_l4);

    start_l2=start_l1+PAGE_SIZE*num_l1;
    start_l3=start_l2+PAGE_SIZE*num_l2;
    start_l4=start_l3+PAGE_SIZE*num_l3;

    PrintDebug(vm,VCORE_NONE,"hvm: writing %llu levels of PTs starting at address %p\n", max_level,start_l1);
    PrintDebug(vm,VCORE_NONE,"hvm: min_gva=%p, max_gva=%p, min_gpa=%p, max_gpa=%p\n",min_gva,max_gva,min_gpa,max_gpa);
    PrintDebug(vm,VCORE_NONE,"hvm: num_l1=%llu, num_l2=%llu, num_l3=%llu, num_l4=%llu\n", num_l1, num_l2, num_l3, num_l4);
    PrintDebug(vm,VCORE_NONE,"hvm: start_l1=%p, start_l2=%p, start_l3=%p, start_l4=%p\n", start_l1, start_l2, start_l3, start_l4);

    cur_pt=start_l1;

    // build PML4 (only one)
    if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pml4e)) { 
        PrintError(vm,VCORE_NONE,"hvm: Cannot translate pml4 location\n");
        return;
    }

    memset(pml4e,0,PAGE_SIZE);

    if (min_gva==0x0) { 
        i_start=0; i_end = num_l2;
    } else if (min_gva==(void*)TOP_HALF_START) { 
        i_start=256; i_end=256+num_l2;
    } else {
        PrintError(vm,VCORE_NONE,"hvm: unsupported gva offset\n");
        return;
    }

    for (i=i_start, cur_gva=min_gva, cur_gpa=min_gpa;
         (i<i_end);
         i++, cur_gva+=L1_UNIT, cur_gpa+=L1_UNIT) {

        pml4e[i].present=1;
        pml4e[i].writable=1;

        if (max_level==1) { 
            PrintError(vm,VCORE_NONE,"hvm: Intel has not yet defined a PML4E large page\n");
            pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
            //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
        } else {
            pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(start_l2+(i-i_start)*PAGE_SIZE));
            //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
        }
    }

    // 512 GB only
    if (max_level==1) {
        return;
    }



    for (cur_pt=start_l2, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
         pt<num_l2;
         cur_pt+=PAGE_SIZE, pt++) { 

        // build PDPE
        if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pdpe)) { 
            PrintError(vm,VCORE_NONE,"hvm: Cannot translate pdpe location\n");
            return;
        }

        memset(pdpe,0,PAGE_SIZE);

        for (i=0; 
             i<512 && cur_gpa<max_gpa; 
             i++, cur_gva+=L2_UNIT, cur_gpa+=L2_UNIT) {

            pdpe[i].present=1;
            pdpe[i].writable=1;

            if (max_level==2) { 
                pdpe[i].large_page=1;
                pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
                //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
            } else {
                pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(start_l3+(pt*512+i)*PAGE_SIZE));
                //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
            }
        }
    }

    //1 GB only
    if (max_level==2) { 
        return;
    }

    for (cur_pt=start_l3, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
         pt<num_l3;
         cur_pt+=PAGE_SIZE, pt++) { 

        // build PDE
        if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pde)) { 
            PrintError(vm,VCORE_NONE,"hvm: Cannot translate pde location\n");
            return;
        }

        memset(pde,0,PAGE_SIZE);

        for (i=0; 
             i<512 && cur_gpa<max_gpa; 
             i++, cur_gva+=L3_UNIT, cur_gpa+=L3_UNIT) {

            pde[i].present=1;
            pde[i].writable=1;

            if (max_level==3) { 
                pde[i].large_page=1;
                pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
                //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t) pde[i].pt_base_addr);
            } else {
                pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(start_l4+(pt*512+i)*PAGE_SIZE));
                //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pde[i].pt_base_addr);
            }
        }
    }

    //2 MB only
    if (max_level==3) { 
        return;
    }


    // 4 KB
    for (cur_pt=start_l4, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
         pt<num_l4;
         cur_pt+=PAGE_SIZE, pt++) { 

        // build PTE
        if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pte)) { 
            PrintError(vm,VCORE_NONE,"hvm: Cannot translate pte location\n");
            return;
        }

        memset(pte,0,PAGE_SIZE);

        for (i=0; 
             i<512 && cur_gpa<max_gpa; 
             i++, cur_gva+=L4_UNIT, cur_gpa+=L4_UNIT) {

            pte[i].present=1;
            pte[i].writable=1;
            pte[i].page_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
            //PrintDebug(vm,VCORE_NONE,"hvm: pte: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pte[i].page_base_addr);
        }
    }

    return;
}


static void get_mb_info_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{

    get_pt_loc(vm,base, limit);
    *base-=PAGE_SIZE;
    *limit=PAGE_SIZE;
}


int v3_build_hrt_multiboot_tag(struct guest_info *core, mb_info_hrt_t *hrt)
{
    struct v3_vm_info *vm = core->vm_info;

    hrt->tag.type = MB_INFO_HRT_TAG;
    hrt->tag.size = sizeof(mb_info_hrt_t);

    hrt->total_num_apics = vm->num_cores;
    hrt->first_hrt_apic_id = vm->hvm_state.first_hrt_core;
    hrt->have_hrt_ioapic=0;
    hrt->first_hrt_ioapic_entry=0;

    hrt->cpu_freq_khz = V3_CPU_KHZ();

    hrt->hrt_flags = vm->hvm_state.hrt_flags;
    hrt->max_mem_mapped = vm->hvm_state.max_mem_mapped;
    hrt->first_hrt_gpa = vm->hvm_state.first_hrt_gpa;
    hrt->gva_offset = vm->hvm_state.gva_offset;
    hrt->comm_page_gpa = vm->hvm_state.comm_page_gpa;
    hrt->hrt_int_vector = vm->hvm_state.hrt_int_vector;

    return 0;
}

static void write_mb_info(struct v3_vm_info *vm) 
{
    if (vm->hvm_state.hrt_type!=HRT_MBOOT64) { 
        PrintError(vm, VCORE_NONE,"hvm: Cannot handle this HRT type\n");
        return;
    } else {
        uint8_t buf[256];
        uint64_t size;
        void *base;
        uint64_t limit;

        get_mb_info_loc(vm,&base,&limit);

        if ((size=v3_build_multiboot_table(&vm->cores[vm->hvm_state.first_hrt_core],buf,256))==-1) { 
            PrintError(vm,VCORE_NONE,"hvm: Failed to build MB info\n");
            return;
        }

        if (size>limit) { 
            PrintError(vm,VCORE_NONE,"hvm: MB info is too large\n");
            return;
        }

        v3_write_gpa_memory(&vm->cores[vm->hvm_state.first_hrt_core],
                            (addr_t)base,
                            size,
                            buf);

        PrintDebug(vm,VCORE_NONE, "hvm: wrote MB info at %p\n", base);
    }
}

#define SCRATCH_STACK_SIZE 4096


static void get_hrt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    void *mb_base;
    uint64_t mb_limit;

    get_mb_info_loc(vm,&mb_base,&mb_limit);

    mb_base-=SCRATCH_STACK_SIZE*v3_get_hvm_hrt_cores(vm);

    *base = (void*)PAGE_ADDR(vm->hvm_state.first_hrt_gpa);

    if (mb_base < *base+PAGE_SIZE) { 
        PrintError(vm,VCORE_NONE,"hvm: HRT stack collides with HRT\n");
    }

    *limit = mb_base - *base;
}


#define ERROR(fmt, args...) PrintError(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
#define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)

#define ELF_MAGIC    0x464c457f
#define MB2_MAGIC    0xe85250d6

#define MB2_INFO_MAGIC    0x36d76289

static int is_elf(uint8_t *data, uint64_t size)
{
    if (*((uint32_t*)data)==ELF_MAGIC) {
        return 1;
    } else { 
        return 0;
    }
}

static mb_header_t *find_mb_header(uint8_t *data, uint64_t size)
{
    uint64_t limit = size > 32768 ? 32768 : size;
    uint64_t i;

    // Scan for the .boot magic cookie
    // must be in first 32K, assume 4 byte aligned
    for (i=0;i<limit;i+=4) { 
        if (*((uint32_t*)&data[i])==MB2_MAGIC) {
            INFO("Found multiboot header at offset 0x%llx\n",i);
            return (mb_header_t *) &data[i];
        }
    }
    return 0;
}


static int configure_hrt(struct v3_vm_info *vm, mb_data_t *mb)
{
    struct v3_vm_hvm *h = &vm->hvm_state;
    uint64_t f = mb->mb64_hrt->hrt_flags;
    uint64_t maxmap = mb->mb64_hrt->max_mem_to_map;
    uint64_t gvaoff = mb->mb64_hrt->gva_offset;
    uint64_t gvaentry = mb->mb64_hrt->gva_entry;
    uint64_t commgpa = mb->mb64_hrt->comm_page_gpa;
    uint8_t  vec = mb->mb64_hrt->hrt_int_vector;


    PrintDebug(vm,VCORE_NONE,"hvm: HRT request: flags=0x%llx max_map=0x%llx gva_off=%llx gva_entry=%llx comm_page=0x%llx vector=0x%x\n",
               f, maxmap, gvaoff,gvaentry,commgpa, vec);

    if (maxmap<0x100000000ULL) { 
        PrintDebug(vm,VCORE_NONE,"hvm: revising request up to 4 GB max map\n");
        maxmap=0x100000000ULL;
    }

    if (f & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { 
        PrintError(vm,VCORE_NONE,"hvm: support for 512 GB pages is not yet available in hardware\n");
        return -1;
    } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { 
        f &= ~0x3c;
        f |= MB_TAG_MB64_HRT_FLAG_MAP_1GB;
        h->max_mem_mapped = maxmap;
        PrintDebug(vm,VCORE_NONE,"hvm: 1 GB pages selected\n");
    } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_2MB) { 
        f &= ~0x3c;
        f |= MB_TAG_MB64_HRT_FLAG_MAP_2MB;
        h->max_mem_mapped = maxmap;
        PrintDebug(vm,VCORE_NONE,"hvm: 2 MB pages selected\n");
    } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { 
        f &= ~0x3c;
        f |= MB_TAG_MB64_HRT_FLAG_MAP_4KB;
        h->max_mem_mapped = maxmap;
        PrintDebug(vm,VCORE_NONE,"hvm: 4 KB pages selected\n");
    } else {
        PrintError(vm,VCORE_NONE,"hvm: no page table model is requested\n");
        return -1;
    }

    if (f & MB_TAG_MB64_HRT_FLAG_RELOC) {
        PrintError(vm,VCORE_NONE,"hvm: relocatable hrt not currently supported\n");
        return -1;
    }

    h->hrt_flags = f;

    if (maxmap>h->max_mem_mapped) { 
        PrintError(vm,VCORE_NONE,"hvm: requested 0x%llx bytes mapped, which is more than currently supported\n",maxmap);
        return -1;
    }

    if (gvaoff!=0 && gvaoff!=TOP_HALF_START) { 
        PrintError(vm,VCORE_NONE,"hvm: currently only GVA offsets of 0 and %llx are supported\n", TOP_HALF_START);
        return -1;
    }

    h->gva_offset = gvaoff;

    h->gva_entry = gvaentry;

    if (mb->addr->load_addr < h->first_hrt_gpa) { 
        PrintError(vm,VCORE_NONE,"hvm: load start address of HRT is below first HRT GPA\n");
        return -1;
    }

    if (mb->addr->bss_end_addr > (vm->mem_size-(1024*1024*64))) {
        PrintError(vm,VCORE_NONE,"hvm: bss end address of HRT above last allowed GPA\n");
        return -1;
    }

    if (vec<32) { 
        PrintError(vm,VCORE_NONE,"hvm: cannot support vector %x\n",vec);
        return -1;
    }

    h->hrt_int_vector = vec;


    if (commgpa < vm->mem_size) { 
        PrintError(vm,VCORE_NONE,"hvm: cannot map comm page over physical memory\n");
        return -1;
    } 

    h->comm_page_gpa = commgpa;

    if (!h->comm_page_hpa) { 
        if (!(h->comm_page_hpa=V3_AllocPages(1))) { 
            PrintError(vm,VCORE_NONE,"hvm: unable to allocate space for comm page\n");
            return -1;
        }

        h->comm_page_hva = V3_VAddr(h->comm_page_hpa);

        memset(h->comm_page_hva,0,PAGE_SIZE_4KB);

        if (v3_add_shadow_mem(vm,-1,h->comm_page_gpa,h->comm_page_gpa+PAGE_SIZE_4KB,(addr_t)h->comm_page_hpa)) { 
            PrintError(vm,VCORE_NONE,"hvm: unable to map communication page\n");
            V3_FreePages((void*)(h->comm_page_hpa),1);  // free the HPA we allocated, not the GPA
            return -1;
        }


        PrintDebug(vm,VCORE_NONE,"hvm: added comm page for first time\n");
    }

    memset(h->comm_page_hva,0,PAGE_SIZE_4KB);


    PrintDebug(vm,VCORE_NONE,"hvm: HRT configuration: flags=0x%llx max_mem_mapped=0x%llx gva_offset=0x%llx gva_entry=0x%llx comm_page=0x%llx vector=0x%x\n",
               h->hrt_flags,h->max_mem_mapped, h->gva_offset,h->gva_entry, h->comm_page_gpa, h->hrt_int_vector);

    return 0;

}

static int setup_mb_kernel_hrt(struct v3_vm_info *vm)
{
    mb_data_t mb;

    if (v3_parse_multiboot_header(vm->hvm_state.hrt_file,&mb)) { 
        PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n");
        return -1;
    }

    if (configure_hrt(vm,&mb)) {
        PrintError(vm,VCORE_NONE, "hvm: cannot configure HRT\n");
        return -1;
    }

    if (v3_write_multiboot_kernel(vm,&mb,vm->hvm_state.hrt_file,
                                  (void*)vm->hvm_state.first_hrt_gpa,
                                  vm->mem_size-vm->hvm_state.first_hrt_gpa)) {
        PrintError(vm,VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n");
        return -1;
    }

    if (vm->hvm_state.gva_entry) { 
        vm->hvm_state.hrt_entry_addr = vm->hvm_state.gva_entry;
    } else {
        vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + vm->hvm_state.gva_offset;
    }

    vm->hvm_state.hrt_type = HRT_MBOOT64;

    return 0;

}


static int setup_hrt(struct v3_vm_info *vm)
{
    if (is_elf(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size) && 
        find_mb_header(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) { 

        PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n");
        if (setup_mb_kernel_hrt(vm)) { 
            PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n");
            return -1;
        } 
    } else {
        PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not a multiboot kernel\n");
        return -1;
    }

    return 0;
}


/*
  GPA layout:

  HRT
  ---
  ROS

  We do not touch the ROS portion of the address space.
  The HRT portion looks like:

  INT_HANDLER (1 page - page aligned)
  IDT (1 page - page aligned)
  GDT (1 page - page aligned)
  TSS (1 page - page aligned)
  PAGETABLES  (identity map of first N GB)
     ROOT PT first (lowest memory addr), followed by 2nd level PTs in order,
     followed by 3rd level PTs in order, followed by 4th level
     PTs in order.  
  MBINFO_PAGE
  SCRATCH_STACK_HRT_CORE0 
  SCRATCH_STACK_HRT_CORE1
  ..
  SCRATCH_STACK_HRT_COREN
  ...
  HRT (as many pages as needed, page-aligned, starting at first HRT address)
  ---
  ROS

*/
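
/*
  Worked example (illustrative numbers only): with mem_size = 1 GB,
  max_mem_mapped = 4 GB, and 2 MB HRT pages (num_pt = l1+l2+l3 = 1+1+4 = 6
  per compute_pts_4KB()), the get_*_loc() functions above lay this out as:

  0x40000000  end of boot state (boot_state_end_addr)
  0x3FFFF000  null interrupt handler page
  0x3FFFE000  IDT
  0x3FFFD000  GDT
  0x3FFFC000  TSS
  0x3FFF6000  page tables (6 pages, root PT lowest)
  0x3FFF5000  multiboot info page
              scratch stacks descend from here, SCRATCH_STACK_SIZE per HRT core
*/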


int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm)
{
    if (!vm->hvm_state.is_hvm) { 
        PrintDebug(vm,VCORE_NONE,"hvm: skipping HVM setup for boot as this is not an HVM\n");
        return 0;
    }

    PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory begins\n");

    if (setup_hrt(vm)) {
        PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n");
        return -1;
    } 

    // the locations of all the other items are determined by
    // the HRT setup, so these must happen after

    write_null_int_handler(vm);
    write_idt(vm);
    write_gdt(vm);
    write_tss(vm);

    write_pts(vm);

    // this must happen last
    write_mb_info(vm);

    PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory done\n");

    return 0;
}

/*
  On entry for every core:

   IDTR points to stub IDT
   GDTR points to stub GDT
   TR   points to stub TSS
   CR3 points to root page table
   CR0 has PE and PG
   EFER has LME and LMA (and NX for compatibility with Linux)
   RSP is TOS of core's scratch stack (looks like a call)

   RAX = MB magic cookie
   RBX = address of multiboot info table
   RCX = this core id / apic id (0..N-1)
   RDX = this core id - first HRT core ID (==0 for the first HRT core)

   All addresses are virtual addresses, offset as needed by gva_offset

   Other regs are zeroed

   shadow/nested paging state reset for long mode

*/
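
/*
  Sketch of the first instructions a multiboot64 HRT might execute on this
  handoff (illustrative GAS syntax; this is not Palacios code and no HRT
  is obligated to look like this):

      cmpl $0x36d76289, %eax          # MB2_INFO_MAGIC, placed in RAX below
      jne  .halt                      # unexpected entry - stop
      movq %rbx, mb_info_ptr(%rip)    # save multiboot info GVA for parsing
      ...
  .halt:
      hlt
      jmp .halt
*/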
int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
{
    void *base;
    uint64_t limit;
    uint64_t gva_offset;

    rdtscll(core->hvm_state.last_boot_start);


    if (!core->hvm_state.is_hrt) { 
        PrintDebug(core->vm_info,core,"hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id);
        return 0;
    }


    PrintDebug(core->vm_info, core, "hvm: setting up HRT core (%u) for boot\n", core->vcpu_id);

    gva_offset = core->vm_info->hvm_state.gva_offset;

    memset(&core->vm_regs,0,sizeof(core->vm_regs));
    memset(&core->ctrl_regs,0,sizeof(core->ctrl_regs));
    memset(&core->dbg_regs,0,sizeof(core->dbg_regs));
    memset(&core->segments,0,sizeof(core->segments));    
    memset(&core->msrs,0,sizeof(core->msrs));    
    memset(&core->fp_state,0,sizeof(core->fp_state));    

    // We are in long mode with virtual memory and we want
    // to start immediately
    core->cpl = 0; // we are going right into the kernel
    core->cpu_mode = LONG;
    core->mem_mode = VIRTUAL_MEM; 
    core->core_run_state = CORE_RUNNING;


    // magic
    core->vm_regs.rax = MB2_INFO_MAGIC;

    // multiboot info pointer
    get_mb_info_loc(core->vm_info, &base,&limit);
    core->vm_regs.rbx = (uint64_t) base + gva_offset;  

    // core number
    core->vm_regs.rcx = core->vcpu_id;

    // HRT core number
    core->vm_regs.rdx = core->vcpu_id - core->vm_info->hvm_state.first_hrt_core;

    // Now point to scratch stack for this core
    // it begins at an offset relative to the MB info page
    get_mb_info_loc(core->vm_info, &base,&limit);
    base = base + gva_offset;
    base -= core->vm_regs.rdx * SCRATCH_STACK_SIZE;
    core->vm_regs.rsp = (v3_reg_t) base;  
    core->vm_regs.rbp = (v3_reg_t) base-8; 

    // push onto the stack a bad rbp and bad return address
    core->vm_regs.rsp-=16;
    v3_set_gpa_memory(core,
                      core->vm_regs.rsp-gva_offset,
                      16,
                      0xff);


    // HRT entry point
    get_hrt_loc(core->vm_info, &base,&limit);
    if (core->vm_info->hvm_state.gva_entry) { 
        core->rip = core->vm_info->hvm_state.gva_entry;
    } else {
        core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr + gva_offset; 
    }



    PrintDebug(core->vm_info,core,"hvm: hrt core %u has rip=%p, rsp=%p, rbp=%p, rax=%p, rbx=%p, rcx=%p, rdx=%p\n",
               (core->vcpu_id - core->vm_info->hvm_state.first_hrt_core),
               (void*)(core->rip),
               (void*)(core->vm_regs.rsp),
               (void*)(core->vm_regs.rbp),
               (void*)(core->vm_regs.rax),
               (void*)(core->vm_regs.rbx),
               (void*)(core->vm_regs.rcx),
               (void*)(core->vm_regs.rdx));

    // Setup CRs for long mode and our stub page table
    // CR0: PG, PE
    core->ctrl_regs.cr0 = 0x80000001;
    core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0;

    // CR2: don't care (output from #PF)
    // CR3: set to our PML4, without setting PCD or PWT
    get_pt_loc(core->vm_info, &base,&limit);
    core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base);  // not offset as this is a GPA
    core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3;

    // CR4: PGE, PAE, PSE (last byte: 1 0 1 1 0 0 0 0)
    core->ctrl_regs.cr4 = 0xb0;
    core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4;
    // CR8 as usual
    // RFLAGS zeroed is fine: come in with interrupts off
    // EFER needs SVME NXE LMA LME (last 16 bits: 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0)
    core->ctrl_regs.efer = 0x1d00;
    core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer;


    /* 
       Notes on selectors:

       selector is 13 bits of index, 1 bit table indicator 
       (0=>GDT), 2 bit RPL

       index is scaled by 8, even in long mode, where some entries 
       are 16 bytes long.... 
          -> code, data descriptors have 8 byte format
             because base, limit, etc, are ignored (no segmentation)
          -> interrupt/trap gates have 16 byte format 
             because offset needs to be 64 bits
    */
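
    /*
      e.g., selector 0x08 = index 1, GDT, RPL 0 (the code descriptor in
      gdt64[] above), and selector 0x10 = index 2, GDT, RPL 0 (the data
      descriptor); these are exactly the CS and DS values installed below.
    */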

    // Install our stub IDT
    get_idt_loc(core->vm_info, &base,&limit);
    base += gva_offset;
    core->segments.idtr.selector = 0;  // entry 0 (NULL) of the GDT
    core->segments.idtr.base = (addr_t) base;  // only base+limit are used
    core->segments.idtr.limit = limit-1;
    core->segments.idtr.type = 0x0;
    core->segments.idtr.system = 0; 
    core->segments.idtr.dpl = 0;
    core->segments.idtr.present = 0;
    core->segments.idtr.long_mode = 0;

    // Install our stub GDT
    get_gdt_loc(core->vm_info, &base,&limit);
    base += gva_offset;
    core->segments.gdtr.selector = 0;  // entry 0 (NULL) of the GDT
    core->segments.gdtr.base = (addr_t) base;
    core->segments.gdtr.limit = limit-1;   // only base+limit are used
    core->segments.gdtr.type = 0x0;
    core->segments.gdtr.system = 0; 
    core->segments.gdtr.dpl = 0;
    core->segments.gdtr.present = 0;
    core->segments.gdtr.long_mode = 0;

    // And our TSS
    get_tss_loc(core->vm_info, &base,&limit);
    base += gva_offset;  
    core->segments.tr.selector = 0;
    core->segments.tr.base = (addr_t) base;
    core->segments.tr.limit = limit-1;
    core->segments.tr.type = 0x9;
    core->segments.tr.system = 0;   // available 64 bit TSS 
    core->segments.tr.dpl = 0;
    core->segments.tr.present = 1;
    core->segments.tr.long_mode = 0; // not used

    base = 0x0; // these are not offset as we want to make all gvas visible
    limit = -1;

    // And CS
    core->segments.cs.selector = 0x8; // entry 1 of GDT (RPL=0)
    core->segments.cs.base = (addr_t) base;   // not used
    core->segments.cs.limit = limit;          // not used
    core->segments.cs.type = 0xe;             // only C is used
    core->segments.cs.system = 1;             // not a system segment
    core->segments.cs.dpl = 0;                       
    core->segments.cs.present = 1;
    core->segments.cs.long_mode = 1;

    // DS, SS, etc are identical
    core->segments.ds.selector = 0x10; // entry 2 of GDT (RPL=0)
    core->segments.ds.base = (addr_t) base;
    core->segments.ds.limit = limit;
    core->segments.ds.type = 0x6;            // ignored
    core->segments.ds.system = 1;            // not a system segment
    core->segments.ds.dpl = 0;
    core->segments.ds.present = 1;
    core->segments.ds.long_mode = 1;

    memcpy(&core->segments.ss,&core->segments.ds,sizeof(core->segments.ds));
    memcpy(&core->segments.es,&core->segments.ds,sizeof(core->segments.ds));
    memcpy(&core->segments.fs,&core->segments.ds,sizeof(core->segments.ds));
    memcpy(&core->segments.gs,&core->segments.ds,sizeof(core->segments.ds));


    // reset paging here for shadow... 

    if (core->shdw_pg_mode != NESTED_PAGING) { 
        PrintError(core->vm_info, core, "hvm: shadow paging guest... this will end badly\n");
        return -1;
    }


    return 0;
}

int v3_handle_hvm_reset(struct guest_info *core)
{

    if (core->core_run_state != CORE_RESETTING) { 
        return 0;
    }

    if (!core->vm_info->hvm_state.is_hvm) { 
        return 0;
    }

    if (v3_is_hvm_hrt_core(core)) { 
        // this is an HRT reset
        int rc=0;

        // wait for all the HRT cores
        v3_counting_barrier(&core->vm_info->reset_barrier);

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) { 
            // I am leader
            core->vm_info->run_state = VM_RESETTING;
        }

        core->core_run_state = CORE_RESETTING;

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
            // we really only need to clear the bss
            // and recopy the .data, but for now we'll just
            // do everything
            rc |= v3_setup_hvm_vm_for_boot(core->vm_info);

            if (rc) { 
                PrintError(core->vm_info,core,"hvm: failed to setup HVM VM for boot rc=%d\n",rc);
            }
        }

        // now everyone is ready to reset
        rc |= v3_setup_hvm_hrt_core_for_boot(core);

        if (rc) { 
            PrintError(core->vm_info,core,"hvm: failed to setup HVM core for boot rc=%d\n",rc);
        }

        core->core_run_state = CORE_RUNNING;

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) { 
            // leader
            core->vm_info->run_state = VM_RUNNING;
            core->vm_info->hvm_state.trans_state = HRT_IDLE;
        }

        v3_counting_barrier(&core->vm_info->reset_barrier);

        if (rc<0) { 
            PrintError(core->vm_info,core,"hvm: reset failed\n");
            return rc;
        } else {
            return 1;
        }

    } else { 
        // ROS core will be handled by normal reset functionality
        return 0;
    }
}