Palacios Public Git Repository

To checkout Palacios execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git
This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, simply execute
  cd palacios
  git checkout --track -b devel origin/devel
The other branches are similar.


HVM synchronous operation facility
[palacios.git] / palacios / src / palacios / vmm_hvm.c
1 /* 
2  * This file is part of the Palacios Virtual Machine Monitor developed
3  * by the V3VEE Project with funding from the United States National 
4  * Science Foundation and the Department of Energy.  
5  *
6  * The V3VEE Project is a joint project between Northwestern University
7  * and the University of New Mexico.  You can find out more at 
8  * http://www.v3vee.org
9  *
10  * Copyright (c) 2015, The V3VEE Project <http://www.v3vee.org> 
11  * All rights reserved.
12  *
13  * Author:  Peter Dinda <pdinda@northwestern.edu>
14  *
15  * This is free software.  You are permitted to use,
16  * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
17  */
18
19 #include <palacios/vmm_mem.h>
20 #include <palacios/vmm.h>
21 #include <palacios/vmm_util.h>
22 #include <palacios/vmm_emulator.h>
23 #include <palacios/vm_guest.h>
24 #include <palacios/vmm_debug.h>
25 #include <palacios/vmm_hypercall.h>
26
27 #include <palacios/vmm_xml.h>
28
29 #include <palacios/vm_guest_mem.h>
30
31 #include <palacios/vmm_debug.h>
32
33
34 /*
35
36   MEM     = Total size of memory in the GPA (in MB)
37   ROS_MEM = Total size of memory for the ROS (in MB) (<RAM)
38
39   GPAs [0,ROS_MEM) are what the ROS sees
40   GPAs [ROS_MEM, MEM) are HRT only
41   GPAS [0,MEM) are accessible by the HRT
42
43   CORES   = Total number of cores in VM
44   ROS_CORES = Total number of cores for the ROS
45
46   Cores [0,ROS_CORES) are what the ROS sees
47   Cores [ROS_CORES,CORES) are HRT only
48   Cores [0,CORES) are accessible by the HRT
49
50   In a Pal file:
51
52   <files> 
53     <file id="hrtelf" filename="hrtelf.o" />
54   </files>
55
56   <mem ... >RAM</mem>   (MB)  Note these are  
57   <cores count="CORES" ...>   backward compatible
58
59   <hvm enable="y" >
60     <ros cores="ROS_CORES" mem="ROS_MEM" /> (MB)
61     <hrt file_id="hrtelf" />
62   </hvm>
63
64 */
65
66 #ifndef V3_CONFIG_DEBUG_HVM
67 #undef PrintDebug
68 #define PrintDebug(fmt, args...)
69 #endif
70
71
// Global (VMM-wide) initialization of the HVM facility.
// Currently there is no global state to set up; this only logs.
// Returns 0 (always succeeds).
int v3_init_hvm()
{
    PrintDebug(VM_NONE,VCORE_NONE, "hvm: init\n");
    return 0;
}
77
// Global (VMM-wide) teardown of the HVM facility.
// Mirrors v3_init_hvm(); nothing to release at present.
// Returns 0 (always succeeds).
int v3_deinit_hvm()
{
    PrintDebug(VM_NONE,VCORE_NONE, "hvm: deinit\n");
    return 0;
}
83
84 // ignore requests from when we are in the wrong state
85 #define ENFORCE_STATE_MACHINE 1
86
87 // invoke the HRT using a page fault instead of
88 // the SWINTR mechanism
89 #define USE_UPCALL_MAGIC_PF  1
90 #define UPCALL_MAGIC_ADDRESS 0x0000800df00df00dULL
91 #define UPCALL_MAGIC_ERROR   0xf00df00d
92
93 /*
94   64 bit only hypercall:
95
96   rax = hypercall number
97   rbx = 0x646464...
98   then args are:  rcx, rdx, rsi, rdi r8, r9, r10, r11
99   rcx = 1st arg
100 */
101 static int hvm_hcall_handler(struct guest_info * core , hcall_id_t hcall_id, void * priv_data)
102 {
103     uint64_t c;
104     uint64_t bitness = core->vm_regs.rbx;
105     uint64_t a1 = core->vm_regs.rcx;
106     uint64_t a2 = core->vm_regs.rdx;
107     struct v3_vm_hvm *h = &core->vm_info->hvm_state;
108
109
110     if (bitness!=0x6464646464646464) { 
111         PrintError(core->vm_info,core,"hvm: unable to handle non-64 bit hypercall\n");
112         core->vm_regs.rax = -1;
113         return 0;
114     }
115
116     switch (a1) {
117         case 0x0:   // null
118             
119             rdtscll(c);
120             
121             V3_Print(core->vm_info,core, "hvm: received hypercall %x  rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n",
122                      hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, core->hvm_state.last_boot_start, core->num_exits);
123             //v3_print_core_telemetry(core);
124             //    v3_print_guest_state(core);
125             core->vm_regs.rax = 0;
126             break;
127             
128         case 0x1: // reset ros
129             PrintDebug(core->vm_info,core, "hvm: reset ROS\n");
130             if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ROS,0)) { 
131                 PrintError(core->vm_info,core, "hvm: reset of ROS failed\n");
132                 core->vm_regs.rax = -1;
133             } else {
134                 core->vm_regs.rax = 0;
135             }
136             break;
137
138         case 0x2: // reset hrt
139             PrintDebug(core->vm_info,core, "hvm: reset HRT\n");
140             if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_HRT,0)) { 
141                 PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
142                 core->vm_regs.rax = -1;
143             } else {
144                 core->vm_regs.rax = 0;
145             }
146             break;
147
148         case 0x3: // reset both
149             PrintDebug(core->vm_info,core, "hvm: reset ROS+HRT\n");
150             if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ALL,0)) { 
151                 PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
152                 core->vm_regs.rax = -1;
153             } else {
154                 core->vm_regs.rax = 0;
155             }
156             break;
157             
158         case 0xf: // get HRT state
159             core->vm_regs.rax = h->trans_state;
160             //PrintDebug(core->vm_info,core,"hvm: get HRT transaction state 0x%llx\n",core->vm_regs.rax);
161             break;
162
163         case 0x20: // invoke function (ROS->HRT)
164         case 0x21: // invoke parallel function (ROS->HRT)
165             if (v3_is_hvm_hrt_core(core)) { 
166                 PrintError(core->vm_info,core,"hvm: %s function invocation not supported from HRT core\n", a1==0x20 ? "" : "parallel");
167                 core->vm_regs.rax = -1;
168             } else {
169                 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { 
170                     PrintError(core->vm_info,core, "hvm: cannot invoke %s function %p in state %d\n",a1==0x20 ? "" : "parallel", (void*)a2,h->trans_state);
171                     core->vm_regs.rax = -1;
172                 } else {
173                     uint64_t *page = (uint64_t *) h->comm_page_hva;
174                     uint64_t first, last, cur;
175
176                     PrintDebug(core->vm_info,core, "hvm: %s invoke function %p\n",a1==0x20 ? "" : "parallel",(void*)a2);
177                     page[0] = a1;
178                     page[1] = a2;
179
180                     if (a1==0x20) { 
181                         first=last=h->first_hrt_core;
182                     } else {
183                         first=h->first_hrt_core;
184                         last=core->vm_info->num_cores-1;
185                     }
186
187                     core->vm_regs.rax = 0;
188
189                     h->trans_count = last-first+1;
190
191                     for (cur=first;cur<=last;cur++) { 
192
193 #if USE_UPCALL_MAGIC_PF
194                         PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",cur);
195                         core->vm_info->cores[cur].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
196                         if (v3_raise_exception_with_error(&core->vm_info->cores[cur],
197                                                           PF_EXCEPTION, 
198                                                           UPCALL_MAGIC_ERROR)) { 
199                             PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",cur);
200                             core->vm_regs.rax = -1;
201                             break;
202                         }
203 #else
204                         PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %llu\n",h->hrt_int_vector,cur);
205                         if (v3_raise_swintr(&core->vm_info->cores[cur],h->hrt_int_vector)) { 
206                             PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",cur);
207                             core->vm_regs.rax = -1;
208                             break;
209                         }
210 #endif
211                         // Force core to exit now
212                         v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
213                           
214                     }
215                     if (core->vm_regs.rax==0) { 
216                         if (a1==0x20) { 
217                             h->trans_state = HRT_CALL;
218                         } else {
219                             h->trans_state = HRT_PARCALL;
220                         }
221                     }  else {
222                         PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
223                         h->trans_state = HRT_IDLE;
224                         h->trans_count = 0;
225                     }
226                 }
227             }
228             break;
229
230
231         case 0x28: // setup for synchronous operation (ROS->HRT)
232         case 0x29: // teardown for synchronous operation (ROS->HRT)
233             if (v3_is_hvm_hrt_core(core)) { 
234                 PrintError(core->vm_info,core,"hvm: %ssynchronization invocation not supported from HRT core\n",a1==0x29 ? "de" : "");
235                 core->vm_regs.rax = -1;
236             } else {
237                 if (ENFORCE_STATE_MACHINE && 
238                     ((a1==0x28 && h->trans_state!=HRT_IDLE) || (a1==0x29 && h->trans_state!=HRT_SYNC))) { 
239                     PrintError(core->vm_info,core, "hvm: cannot invoke %ssynchronization in state %d\n",a1==0x29 ? "de" : "", h->trans_state);
240                     core->vm_regs.rax = -1;
241                 } else {
242                     uint64_t *page = (uint64_t *) h->comm_page_hva;
243                     uint64_t first, last, cur;
244
245                     PrintDebug(core->vm_info,core, "hvm: invoke %ssynchronization on address %p\n",a1==0x29 ? "de" : "",(void*)a2);
246                     page[0] = a1;
247                     page[1] = a2;
248
249                     first=last=h->first_hrt_core;  // initially we will sync only with BSP
250
251                     core->vm_regs.rax = 0;
252
253                     h->trans_count = last-first+1;
254
255                     for (cur=first;cur<=last;cur++) { 
256
257 #if USE_UPCALL_MAGIC_PF
258                         PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",cur);
259                         core->vm_info->cores[cur].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
260                         if (v3_raise_exception_with_error(&core->vm_info->cores[cur],
261                                                           PF_EXCEPTION, 
262                                                           UPCALL_MAGIC_ERROR)) { 
263                             PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",cur);
264                             core->vm_regs.rax = -1;
265                             break;
266                         }
267 #else
268                         PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %llu\n",h->hrt_int_vector,cur);
269                         if (v3_raise_swintr(&core->vm_info->cores[cur],h->hrt_int_vector)) { 
270                             PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",cur);
271                             core->vm_regs.rax = -1;
272                             break;
273                         }
274 #endif
275                         // Force core to exit now
276                         v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
277                           
278                     }
279                     if (core->vm_regs.rax==0) { 
280                         if (a1==0x28) { 
281                             h->trans_state = HRT_SYNCSETUP;
282                         } else {
283                             h->trans_state = HRT_SYNCTEARDOWN;                      
284                         }
285                     }  else {
286                         PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
287                         h->trans_state = HRT_IDLE;
288                         h->trans_count = 0;
289                     }
290                 }
291             }
292             break;
293
294         case 0x2f: // function exec or sync done
295             if (v3_is_hvm_ros_core(core)) { 
296                 PrintError(core->vm_info,core, "hvm: request for exec or sync done from ROS core\n");
297                 core->vm_regs.rax=-1;
298             } else {
299                 if (ENFORCE_STATE_MACHINE && 
300                     h->trans_state!=HRT_CALL && 
301                     h->trans_state!=HRT_PARCALL && 
302                     h->trans_state!=HRT_SYNCSETUP &&
303                     h->trans_state!=HRT_SYNCTEARDOWN) {
304                     PrintError(core->vm_info,core,"hvm: function or sync completion when not in HRT_CALL, HRT_PARCALL, HRT_SYNCSETUP, or HRT_SYNCTEARDOWN state\n");
305                     core->vm_regs.rax=-1;
306                 } else {
307                     uint64_t one=1;
308                     PrintDebug(core->vm_info,core, "hvm: function or sync complete\n");
309                     if (__sync_fetch_and_sub(&h->trans_count,one)==1) {
310                         // last one, switch state
311                         if (h->trans_state==HRT_SYNCSETUP) { 
312                             h->trans_state=HRT_SYNC;
313                             PrintDebug(core->vm_info,core, "hvm: function complete - now synchronous\n");
314                         } else {
315                             h->trans_state=HRT_IDLE;
316                         }
317                     }
318                     core->vm_regs.rax=0;
319                 }
320             }
321                     
322             break;
323
324         case 0x30: // merge address space
325         case 0x31: // unmerge address space
326             if (v3_is_hvm_hrt_core(core)) { 
327                 PrintError(core->vm_info,core,"hvm: request to %smerge address space from HRT core\n", a1==0x30 ? "" : "un");
328                 core->vm_regs.rax=-1;
329             } else {
330                 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { 
331                     PrintError(core->vm_info,core,"hvm: request to %smerge address space in non-idle state\n",a1==0x30 ? "" : "un");
332                     core->vm_regs.rax=-1;
333                 } else {
334                     uint64_t *page = (uint64_t *) h->comm_page_hva;
335
336                     PrintDebug(core->vm_info,core,"hvm: %smerge address space request with %p\n",a1==0x30 ? "" : "un",(void*)core->ctrl_regs.cr3);
337                     // should sanity check to make sure guest is in 64 bit without anything strange
338
339                     page[0] = a1;
340                     page[1] = core->ctrl_regs.cr3;  // this is a do-not-care for an unmerge
341
342                     core->vm_regs.rax = 0;
343 #if USE_UPCALL_MAGIC_PF
344                     PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %u\n",h->first_hrt_core);
345                     core->vm_info->cores[h->first_hrt_core].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
346                     if (v3_raise_exception_with_error(&core->vm_info->cores[h->first_hrt_core],
347                                                       PF_EXCEPTION,  
348                                                       UPCALL_MAGIC_ERROR)) { 
349                       PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %u\n",h->first_hrt_core);
350                       core->vm_regs.rax = -1;
351                       break;
352                     }
353 #else
354                     PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %u\n",h->hrt_int_vector,h->first_hrt_core);
355                     if (v3_raise_swintr(&core->vm_info->cores[h->first_hrt_core],h->hrt_int_vector)) { 
356                         PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %u\n",h->first_hrt_core);
357                         core->vm_regs.rax = -1;
358                     } 
359 #endif          
360                     // Force core to exit now
361                     v3_interrupt_cpu(core->vm_info,core->vm_info->cores[h->first_hrt_core].pcpu_id,0);
362
363                     h->trans_state = HRT_MERGE;
364                 }
365                 
366             }
367                 
368             break;
369             
370
371         case 0x3f: // merge operation done
372             if (v3_is_hvm_ros_core(core)) { 
373                 PrintError(core->vm_info,core, "hvm: request for merge done from ROS core\n");
374                 core->vm_regs.rax=-1;
375             } else {
376                 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_MERGE) {
377                     PrintError(core->vm_info,core,"hvm: merge/unmerge done when in non-idle state\n");
378                     core->vm_regs.rax=-1;
379                 } else {
380                     PrintDebug(core->vm_info,core, "hvm: merge or unmerge complete - back to idle\n");
381                     h->trans_state=HRT_IDLE;
382                     core->vm_regs.rax=0;
383                 }
384             }
385                     
386             break;
387
388         default:
389             PrintError(core->vm_info,core,"hvm: unknown hypercall %llx\n",a1);
390             core->vm_regs.rax=-1;
391             break;
392     }
393                 
394     return 0;
395 }
396
397 #define CEIL_DIV(x,y) (((x)/(y)) + !!((x)%(y)))
398
// Per-VM HVM initialization, driven by the <hvm> block of the VM's
// configuration (see the format comment at the top of this file).
//
// On success, vm->hvm_state describes the ROS/HRT split:
//   first_hrt_core - first core belonging to the HRT (== num_cores if no HRT)
//   first_hrt_gpa  - first GPA belonging to the HRT  (== mem_size if no HRT)
//   is_hvm         - nonzero iff an enabled <hvm> block was parsed
//
// Returns 0 on success (including the "no HVM config, pure ROS" case),
// -1 on a malformed <hvm> block or hypercall registration failure.
int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
{
    v3_cfg_tree_t *hvm_config;
    v3_cfg_tree_t *ros_config;
    v3_cfg_tree_t *hrt_config;
    char *enable;
    char *ros_cores;
    char *ros_mem;
    char *hrt_file_id=0;

    PrintDebug(vm, VCORE_NONE, "hvm: vm init\n");

    /* 
       Defaults - all ROS
    */
    memset(&vm->hvm_state,0,sizeof(struct v3_vm_hvm));
    vm->hvm_state.is_hvm=0;
    vm->hvm_state.first_hrt_core=vm->num_cores;
    vm->hvm_state.first_hrt_gpa=vm->mem_size;

    // No <hvm> block at all => pure ROS VM, which is success
    if (!config || !(hvm_config=v3_cfg_subtree(config,"hvm"))) {
        PrintDebug(vm,VCORE_NONE,"hvm: no HVM configuration found (all HW is ROS)\n");
        goto out_ok;
    }
    
    // <hvm enable="y"> required; anything else means disabled
    if (!(enable=v3_cfg_val(hvm_config,"enable")) || strcasecmp(enable,"y")) {
        PrintDebug(vm,VCORE_NONE,"hvm: HVM configuration disabled (all HW is ROS)\n");
        goto out_ok;
    }

    // From here on, a malformed block is a hard error
    if (!(ros_config=v3_cfg_subtree(hvm_config,"ros"))) { 
        PrintError(vm,VCORE_NONE,"hvm: HVM configuration without ROS block...\n");
        return -1;
    }
 
    if (!(ros_cores=v3_cfg_val(ros_config,"cores"))) { 
        PrintError(vm,VCORE_NONE,"hvm: ROS block without cores...\n");
        return -1;
    }
   
    // NOTE(review): atoi() gives 0 on garbage input with no error indication;
    // a bad "cores" value silently becomes 0 ROS cores
    vm->hvm_state.first_hrt_core = ((uint32_t)atoi(ros_cores));
    
    if (!(ros_mem=v3_cfg_val(ros_config,"mem"))) { 
        PrintError(vm,VCORE_NONE,"hvm: ROS block without mem...\n");
        return -1;
    }

    // "mem" is in MB; convert to a byte boundary
    vm->hvm_state.first_hrt_gpa = ((uint64_t)atoi(ros_mem))*1024*1024;

    if (!(hrt_config=v3_cfg_subtree(hvm_config,"hrt"))) { 
        PrintError(vm,VCORE_NONE,"hvm: HVM configuration without HRT block...\n");
        return -1;
    }
 
    if (!(hrt_file_id=v3_cfg_val(hrt_config,"file_id"))) { 
        PrintError(vm,VCORE_NONE,"hvm: HRT block without file_id...\n");
        return -1;
    }

    // Resolve the HRT ELF image registered under this file_id
    vm->hvm_state.hrt_file = v3_cfg_get_file(vm,hrt_file_id);
    
    if (!vm->hvm_state.hrt_file) { 
        PrintError(vm,VCORE_NONE,"hvm: HRT block contains bad file_id (%s)\n",hrt_file_id);
        return -1;
    }

    // ROS<->HRT communication goes through this hypercall
    if (v3_register_hypercall(vm, HVM_HCALL, 
                              hvm_hcall_handler, 0)) { 
        PrintError(vm,VCORE_NONE, "hvm: cannot register hypercall....\n");
        return -1;
    }

    // XXX sanity check config here

    vm->hvm_state.is_hvm=1;

 out_ok:
    if (vm->hvm_state.is_hvm) {
        V3_Print(vm,VCORE_NONE,"hvm: [ROS: cores 0..%u, mem 0..%p] [HRT: cores %u..%u, mem %p..%p, file_id=%s (tag %s)]\n",
                 vm->hvm_state.first_hrt_core-1,
                 (void*) vm->hvm_state.first_hrt_gpa-1,
                 vm->hvm_state.first_hrt_core,
                 vm->num_cores-1,
                 (void*) vm->hvm_state.first_hrt_gpa,
                 (void*)vm->mem_size-1,
                 hrt_file_id,
                 vm->hvm_state.hrt_file->tag);
    } else {
        V3_Print(vm,VCORE_NONE,"hvm: This is a pure ROS VM\n");
    }
    return 0;
    
}
492
493
494 int v3_deinit_hvm_vm(struct v3_vm_info *vm)
495 {
496     PrintDebug(vm, VCORE_NONE, "hvm: HVM VM deinit\n");
497
498     v3_remove_hypercall(vm,HVM_HCALL);
499
500     if (vm->hvm_state.comm_page_hpa) { 
501         struct v3_mem_region *r = v3_get_mem_region(vm,-1,(addr_t)vm->hvm_state.comm_page_hpa);
502         if (!r) { 
503             PrintError(vm,VCORE_NONE,"hvm: odd, VM has comm_page_hpa, but no shadow memory\n");
504         } else {
505             v3_delete_mem_region(vm,r);
506         }
507     }
508
509     return 0;
510 }
511
512 int v3_init_hvm_core(struct guest_info *core)
513 {
514     memset(&core->hvm_state,0,sizeof(core->hvm_state));
515     if (core->vm_info->hvm_state.is_hvm) { 
516         if (core->vcpu_id >= core->vm_info->hvm_state.first_hrt_core) { 
517             core->hvm_state.is_hrt=1;
518         }
519     }
520     return 0;
521 }
522
// Per-core HVM teardown.  No per-core HVM resources exist yet, so this
// only logs.  Returns 0 (always succeeds).
int v3_deinit_hvm_core(struct guest_info *core)
{
    PrintDebug(core->vm_info, VCORE_NONE, "hvm: HVM core deinit\n");

    return 0;
}
529
530
531 uint64_t v3_get_hvm_ros_memsize(struct v3_vm_info *vm)
532 {
533     if (vm->hvm_state.is_hvm) { 
534         return vm->hvm_state.first_hrt_gpa;
535     } else {
536         return vm->mem_size;
537     }
538 }
// Size in bytes of the GPA range the HRT can see.  The HRT always sees
// the entire GPA space [0, MEM) (see the layout comment at file top).
uint64_t v3_get_hvm_hrt_memsize(struct v3_vm_info *vm)
{
    return vm->mem_size;
}
543
544 uint32_t v3_get_hvm_ros_cores(struct v3_vm_info *vm)
545 {
546     if (vm->hvm_state.is_hvm) { 
547         return vm->hvm_state.first_hrt_core;
548     } else {
549         return vm->num_cores;
550     }
551 }
552
553 uint32_t v3_get_hvm_hrt_cores(struct v3_vm_info *vm)
554 {
555     if (vm->hvm_state.is_hvm) { 
556         return vm->num_cores - vm->hvm_state.first_hrt_core;
557     } else {
558         return 0;
559     }
560 }
561
562
563 int v3_is_hvm_ros_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
564 {
565     if (vm->hvm_state.is_hvm) { 
566         return gpa>=0 && gpa<vm->hvm_state.first_hrt_gpa;
567     } else {
568         return 1;
569     }
570 }
571
572 int v3_is_hvm_hrt_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
573 {
574     if (vm->hvm_state.is_hvm) { 
575         return gpa>=vm->hvm_state.first_hrt_gpa && gpa<vm->mem_size;
576     } else {
577         return 0;
578     }
579 }
580
581 int v3_is_hvm_hrt_core(struct guest_info *core)
582 {
583     return core->hvm_state.is_hrt;
584 }
585
586 int v3_is_hvm_ros_core(struct guest_info *core)
587 {
588     return !core->hvm_state.is_hrt;
589 }
590
591 int      v3_hvm_should_deliver_ipi(struct guest_info *src, struct guest_info *dest)
592 {
593     if (!src) {
594         // ioapic or msi to apic
595         return !dest->hvm_state.is_hrt;
596     } else {
597         // apic to apic
598         return src->hvm_state.is_hrt || (!src->hvm_state.is_hrt && !dest->hvm_state.is_hrt) ;
599     }
600 }
601
602 void     v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_info *vm, 
603                                         uint32_t *start_apic, uint32_t *num_apics)
604 {
605     if (!core) { 
606         // Seen from ioapic, msi, etc: 
607         if (vm->hvm_state.is_hvm) {
608             // HVM VM shows only the ROS cores/apics to ioapic, msi, etc
609             *start_apic = 0;
610             *num_apics = vm->hvm_state.first_hrt_core;
611         } else {
612             // Non-HVM shows all cores/APICs to apic, msi, etc.
613             *start_apic = 0;
614             *num_apics = vm->num_cores;
615         }
616     } else {
617         // Seen from apic
618         if (core->hvm_state.is_hrt) { 
619             // HRT core/apic sees all apics
620             // (this policy may change...)
621             *start_apic = 0;
622             *num_apics = vm->num_cores;
623         } else {
624             // non-HRT core/apic sees only non-HRT cores/apics
625             *start_apic = 0 ;
626             *num_apics = vm->hvm_state.first_hrt_core;
627         }
628     }
629 }
630
631 #define MAX(x,y) ((x)>(y)?(x):(y))
632 #define MIN(x,y) ((x)<(y)?(x):(y))
633
634
635 static uint64_t boot_state_end_addr(struct v3_vm_info *vm) 
636 {
637     return PAGE_ADDR(vm->mem_size);
638 }
639    
640 static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
641 {
642     *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - PAGE_SIZE);
643     *limit = PAGE_SIZE;
644 }
645
646 extern v3_cpu_arch_t v3_mach_type;
647
648 extern void *v3_hvm_svm_null_int_handler_start;
649 extern void *v3_hvm_svm_null_int_handler_end;
650 extern void *v3_hvm_vmx_null_int_handler_start;
651 extern void *v3_hvm_vmx_null_int_handler_end;
652
653 static void write_null_int_handler(struct v3_vm_info *vm)
654 {
655     void *base;
656     uint64_t limit;
657     void *data;
658     uint64_t len;
659
660     get_null_int_handler_loc(vm,&base,&limit);
661
662     switch (v3_mach_type) {
663 #ifdef V3_CONFIG_SVM
664         case V3_SVM_CPU:
665         case V3_SVM_REV3_CPU:
666             data = (void*) &v3_hvm_svm_null_int_handler_start;
667             len = (void*) &v3_hvm_svm_null_int_handler_end - data;
668             break;
669 #endif
670 #if V3_CONFIG_VMX
671         case V3_VMX_CPU:
672         case V3_VMX_EPT_CPU:
673         case V3_VMX_EPT_UG_CPU:
674             data = (void*) &v3_hvm_vmx_null_int_handler_start;
675             len = (void*) &v3_hvm_vmx_null_int_handler_end - data;
676             break;
677 #endif
678         default:
679             PrintError(vm,VCORE_NONE,"hvm: cannot determine CPU type to select null interrupt handler...\n");
680             data = 0;
681             len = 0;
682     }
683
684     if (data) {
685         v3_write_gpa_memory(&vm->cores[0],(addr_t)(base),len,(uint8_t*)data);
686     }
687
688     PrintDebug(vm,VCORE_NONE,"hvm: wrote null interrupt handler at %p (%llu bytes)\n",base,len);
689 }
690
691
692 static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
693 {
694     *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - 2 * PAGE_SIZE);
695     *limit = 16*256;
696 }
697
698 // default IDT entries (int and trap gates)
699 //
700 // Format is 16 bytes long:
701 //   16 offsetlo   => 0
702 //   16 selector   => (target code selector) => 0x8 // entry 1 of GDT
703 //    3 ist        => (stack) = 0 => current stack
704 //    5 reserved   => 0
705 //    4 type       => 0xe=>INT, 0xf=>TRAP 
706 //    1 reserved   => 0  (indicates "system" by being zero)
707 //    2 dpl        => 0
708 //    1 present    => 1
709 //   16 offsetmid  => 0
710 //   32 offsethigh => 0   (total is a 64 bit offset)
711 //   32 reserved   => 0
712 //
713 // 00 00 | 08 00 | 00 | 8[typenybble] | offsetmid | offsethigh | reserved
714 // 
715 // Note little endian
716 //
// Template descriptors per the layout above; write_idt() patches in the
// handler offset fields before installing them.
static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 } ;  // type 0xf = trap gate
static uint64_t idt64_int_gate_entry_mask[2] =  { 0x00008e0000080000, 0x0 };   // type 0xe = interrupt gate
719
// Build the guest's 256-entry long-mode IDT: vectors 0-31 get trap
// gates, 32-255 get interrupt gates, all pointing at the null interrupt
// handler (adjusted by the HRT's GVA offset).  The offset is spliced
// into the 16/16/32-bit offset fields of the gate templates byte-by-byte
// (little endian layout; see the format comment above).
static void write_idt(struct v3_vm_info *vm)
{
    void *base;
    uint64_t limit;
    void *handler;
    uint64_t handler_len;
    int i;
    uint64_t trap_gate[2];
    uint64_t int_gate[2];

    get_idt_loc(vm,&base,&limit);

    get_null_int_handler_loc(vm,&handler,&handler_len);

    // convert handler GPA to the HRT's virtual address
    // (arithmetic on void* is a GCC extension)
    handler += vm->hvm_state.gva_offset;

    memcpy(trap_gate,idt64_trap_gate_entry_mask,16);
    memcpy(int_gate,idt64_int_gate_entry_mask,16);

    if (handler) {
        // update the entries for the handler location
        uint8_t *mask;
        uint8_t *hand;
        
        hand = (uint8_t*) &handler;

        mask = (uint8_t *)trap_gate;
        memcpy(&(mask[0]),&(hand[0]),2); // offset low
        memcpy(&(mask[6]),&(hand[2]),2); // offset med
        memcpy(&(mask[8]),&(hand[4]),4); // offset high

        mask = (uint8_t *)int_gate;
        memcpy(&(mask[0]),&(hand[0]),2); // offset low
        memcpy(&(mask[6]),&(hand[2]),2); // offset med
        memcpy(&(mask[8]),&(hand[4]),4); // offset high

        PrintDebug(vm,VCORE_NONE,"hvm: Adding default null trap and int gates\n");
    }

    // vectors 0..31: trap gates (exceptions)
    for (i=0;i<32;i++) { 
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)trap_gate);
    }

    // vectors 32..255: interrupt gates
    for (i=32;i<256;i++) { 
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)int_gate);
    }

    PrintDebug(vm,VCORE_NONE,"hvm: wrote IDT at %p\n",base);
}
769
770
771
772 static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
773 {
774     *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 3 * PAGE_SIZE);
775     *limit = 8*3;
776 }
777
// Minimal long-mode GDT written into the guest by write_gdt():
// selector 0x8 is the 64-bit code segment the IDT gate templates expect.
static uint64_t gdt64[3] = {
    0x0000000000000000, /* null */
    0x00a09a0000000000, /* code (note lme bit) */
    0x00a0920000000000, /* data (most entries don't matter) */
};
783
// Copy the static gdt64[] table into the guest page reserved for the GDT.
static void write_gdt(struct v3_vm_info *vm)
{
    void *base;
    uint64_t limit;

    get_gdt_loc(vm,&base,&limit);
    v3_write_gpa_memory(&vm->cores[0],(addr_t)base,limit,(uint8_t*) gdt64);

    PrintDebug(vm,VCORE_NONE,"hvm: wrote GDT at %p\n",base);
}
794
795
796
797 static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
798 {
799     *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 4 * PAGE_SIZE);
800     *limit = PAGE_SIZE;
801 }
802
// Zero-fill the guest page reserved for the TSS (an all-zero TSS is
// sufficient here; no IST stacks are configured).
static void write_tss(struct v3_vm_info *vm)
{
    void *base;
    uint64_t limit;

    get_tss_loc(vm,&base,&limit);

    v3_set_gpa_memory(&vm->cores[0],(addr_t)base,limit,0);

    PrintDebug(vm,VCORE_NONE,"hvm: wrote TSS at %p\n",base);
}
814
815
816 #define TOP_HALF_START  0xffff800000000000ULL
817 #define BOTTOM_HALF_END 0x00007fffffffffffULL
818
819
820 #define L4_UNIT PAGE_SIZE
821 #define L3_UNIT (512ULL * L4_UNIT)
822 #define L2_UNIT (512ULL * L3_UNIT)
823 #define L1_UNIT (512ULL * L2_UNIT)
824
825 static void compute_pts_4KB(struct v3_vm_info *vm, 
826                             uint64_t *l1, uint64_t *l2, uint64_t *l3, uint64_t *l4)    
827 {
828
829     // we map the physical memory up to max_mem_mapped either at 0x0 or at TOP_HALF start
830     // that is, it either fills in the first 256 rows of PML4 or the last 256 rows of PML4
831     // so it is the same number of page tables regardless
832
833     uint64_t max_gva = vm->hvm_state.max_mem_mapped;
834
835     *l1 = 1;  // 1 PML4
836     *l2 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*512ULL*4096ULL),512);
837     *l3 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*4096ULL),512);
838     *l4 = CEIL_DIV(CEIL_DIV(max_gva,4096ULL),512);
839 }
840
841
842
843 /*
844   PTS MAP using 1 GB pages
845   n second levels pts, highest gva, highest address
846   1 top level
847
848
849 OR
850   
851   PTS MAP using 2 MB pages
852   n third level pts, highest gva, highest address
853   m second level pts, highest gva, highest address
854   1 top level pt
855
856 OR
857
858   PTS MAP using 4 KB pages
859   n 4th level, highest gva, highest address
860   m 3rd level, highest gva, hihgest address
861   l second level, highest gva, highest address
862   1 top level pt
863
864 OR
865   PTS MAP using 512 GB pages when this becomes available
866
867 */
868
869
870 static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
871 {
872     uint64_t l1,l2,l3,l4;
873     uint64_t num_pt;
874
875     compute_pts_4KB(vm,&l1,&l2,&l3,&l4);
876
877     if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { 
878         num_pt = l1;
879     } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { 
880         num_pt = l1 + l2;
881     } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
882         num_pt = l1 + l2 + l3;
883     } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { 
884         num_pt = l1 + l2 + l3 + l4;
885     } else {
886         PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT location flags=0x%llx memsize=0x%llx\n",vm->hvm_state.hrt_flags,(uint64_t)vm->mem_size);
887         return;
888     }
889
890     *base = (void*)PAGE_ADDR(boot_state_end_addr(vm)-(4+num_pt)*PAGE_SIZE);
891     *limit = num_pt*PAGE_SIZE;
892 }
893
/*
 * Build the stub long-mode page tables in guest memory.
 *
 * The tables identity-map GPAs [0, max_mem_mapped) to GVAs starting at
 * gva_offset (which must be 0 or TOP_HALF_START).  Depending on the
 * HRT's requested granularity we stop after 1 level (512 GB pages -
 * not actually supported by hardware), 2 levels (1 GB pages), 3 levels
 * (2 MB pages), or 4 levels (4 KB pages).  Tables are laid out
 * level-by-level in the region returned by get_pt_loc(): the single
 * PML4 first, then all PDPTs, then all PDs, then all PTs.
 *
 * Errors (untranslatable GPAs, unknown flags) are logged and abort the
 * build, leaving the tables partially written.
 */
static void write_pts(struct v3_vm_info *vm)
{
    uint64_t size;
    uint64_t num_l1, num_l2, num_l3, num_l4;
    void *start_l1, *start_l2, *start_l3, *start_l4;
    uint64_t max_level;
    void *cur_pt;
    void *cur_gva;
    void *cur_gpa;
    void *min_gpa = 0;
    void *max_gpa = (void*) vm->hvm_state.max_mem_mapped;
    void *min_gva = (void*) vm->hvm_state.gva_offset;
#ifdef V3_CONFIG_DEBUG_HVM
    void *max_gva = min_gva+vm->hvm_state.max_mem_mapped;
#endif
    uint64_t i, pt;
    uint64_t i_start,i_end;
    
    struct pml4e64 *pml4e;
    struct pdpe64 *pdpe;
    struct pde64 *pde;
    struct pte64 *pte;

    // how deep we build is selected by the HRT's page-size request
    if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { 
        PrintError(vm,VCORE_NONE,"hvm: Attempt to build 512 GB pages\n");
        max_level = 1;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { 
        max_level = 2;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
        max_level = 3;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { 
        max_level = 4;
    } else {
        PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT levels\n");
        return;
    }

    get_pt_loc(vm,&start_l1,&size);
    compute_pts_4KB(vm,&num_l1,&num_l2,&num_l3,&num_l4);

    // tables for each level are contiguous, packed one level after another
    start_l2=start_l1+PAGE_SIZE*num_l1;
    start_l3=start_l2+PAGE_SIZE*num_l2;
    start_l4=start_l3+PAGE_SIZE*num_l3;

    PrintDebug(vm,VCORE_NONE,"hvm: writing %llu levels of PTs start at address %p\n", max_level,start_l1);
    PrintDebug(vm,VCORE_NONE,"hvm: min_gva=%p, max_gva=%p, min_gpa=%p, max_gpa=%p\n",min_gva,max_gva,min_gpa,max_gpa);
    PrintDebug(vm,VCORE_NONE,"hvm: num_l1=%llu, num_l2=%llu, num_l3=%llu, num_l4=%llu\n", num_l1, num_l2, num_l3, num_l4);
    PrintDebug(vm,VCORE_NONE,"hvm: start_l1=%p, start_l2=%p, start_l3=%p, start_l4=%p\n", start_l1, start_l2, start_l3, start_l4);

    cur_pt=start_l1;

    // build PML4 (only one)
    if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pml4e)) { 
        PrintError(vm,VCORE_NONE,"hvm: Cannot translate pml4 location\n");
        return;
    }

    memset(pml4e,0,PAGE_SIZE);

    // GVA 0 uses the first 256 PML4 slots; TOP_HALF_START the last 256
    if (min_gva==0x0) { 
        i_start=0; i_end = num_l2;
    } else if (min_gva==(void*)TOP_HALF_START) { 
        i_start=256; i_end=256+num_l2;
    } else {
        PrintError(vm,VCORE_NONE,"hvm: unsupported gva offset\n");
        return;
    }

    for (i=i_start, cur_gva=min_gva, cur_gpa=min_gpa;
         (i<i_end);
         i++, cur_gva+=L1_UNIT, cur_gpa+=L1_UNIT) {

        pml4e[i].present=1;
        pml4e[i].writable=1;
        
        if (max_level==1) { 
            // would be a 512 GB "large page" mapping; no such thing yet
            PrintError(vm,VCORE_NONE,"hvm: Intel has not yet defined a PML4E large page\n");
            pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
            //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
        } else {
            // point at the corresponding PDPT page in the level-2 run
            pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(start_l2+(i-i_start)*PAGE_SIZE));
            //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
        }
    }

    // 512 GB only
    if (max_level==1) {
        return;
    }



    // build all PDPTs (level 2)
    for (cur_pt=start_l2, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
         pt<num_l2;
         cur_pt+=PAGE_SIZE, pt++) { 

        // build PDPE
        if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pdpe)) { 
            PrintError(vm,VCORE_NONE,"hvm: Cannot translate pdpe location\n");
            return;
        }
        
        memset(pdpe,0,PAGE_SIZE);
        
        for (i=0; 
             i<512 && cur_gpa<max_gpa; 
             i++, cur_gva+=L2_UNIT, cur_gpa+=L2_UNIT) {

            pdpe[i].present=1;
            pdpe[i].writable=1;
        
            if (max_level==2) { 
                // 1 GB page: map the GPA directly
                pdpe[i].large_page=1;
                pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
                //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
            } else {
                // point at the corresponding PD page in the level-3 run
                pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(start_l3+(pt*512+i)*PAGE_SIZE));
                //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
            }
        }
    }
        
    //1 GB only
    if (max_level==2) { 
        return;
    }

    // build all PDs (level 3)
    for (cur_pt=start_l3, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
         pt<num_l3;
         cur_pt+=PAGE_SIZE, pt++) { 

        // build PDE
        if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pde)) { 
            PrintError(vm,VCORE_NONE,"hvm: Cannot translate pde location\n");
            return;
        }
        
        memset(pde,0,PAGE_SIZE);
        
        for (i=0; 
             i<512 && cur_gpa<max_gpa; 
             i++, cur_gva+=L3_UNIT, cur_gpa+=L3_UNIT) {

            pde[i].present=1;
            pde[i].writable=1;
        
            if (max_level==3) { 
                // 2 MB page: map the GPA directly
                pde[i].large_page=1;
                pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
                //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t) pde[i].pt_base_addr);
            } else {
                // point at the corresponding PT page in the level-4 run
                pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(start_l4+(pt*512+i)*PAGE_SIZE));
                //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pde[i].pt_base_addr);
            }
        }
    }

    //2 MB only
    if (max_level==3) { 
        return;
    }


    // 4 KB: build all PTs (level 4)
    for (cur_pt=start_l4, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
         pt<num_l4;
         cur_pt+=PAGE_SIZE, pt++) { 

        // build PTE
        if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pte)) { 
            PrintError(vm,VCORE_NONE,"hvm: Cannot translate pte location\n");
            return;
        }
        
        memset(pte,0,PAGE_SIZE);
        
        for (i=0; 
             i<512 && cur_gpa<max_gpa; 
             i++, cur_gva+=L4_UNIT, cur_gpa+=L4_UNIT) {

            pte[i].present=1;
            pte[i].writable=1;
            pte[i].page_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
            //PrintDebug(vm,VCORE_NONE,"hvm: pte: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pte[i].page_base_addr);
        }
    }

    return;
}
1083
1084
1085 static void get_mb_info_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
1086 {
1087     
1088     get_pt_loc(vm,base, limit);
1089     *base-=PAGE_SIZE;
1090     *limit=PAGE_SIZE;
1091 }
1092
1093
1094 int v3_build_hrt_multiboot_tag(struct guest_info *core, mb_info_hrt_t *hrt)
1095 {
1096     struct v3_vm_info *vm = core->vm_info;
1097
1098     hrt->tag.type = MB_INFO_HRT_TAG;
1099     hrt->tag.size = sizeof(mb_info_hrt_t);
1100
1101     hrt->total_num_apics = vm->num_cores;
1102     hrt->first_hrt_apic_id = vm->hvm_state.first_hrt_core;
1103     hrt->have_hrt_ioapic=0;
1104     hrt->first_hrt_ioapic_entry=0;
1105
1106     hrt->cpu_freq_khz = V3_CPU_KHZ();
1107
1108     hrt->hrt_flags = vm->hvm_state.hrt_flags;
1109     hrt->max_mem_mapped = vm->hvm_state.max_mem_mapped;
1110     hrt->first_hrt_gpa = vm->hvm_state.first_hrt_gpa;
1111     hrt->gva_offset = vm->hvm_state.gva_offset;
1112     hrt->comm_page_gpa = vm->hvm_state.comm_page_gpa;
1113     hrt->hrt_int_vector = vm->hvm_state.hrt_int_vector;
1114     
1115     return 0;
1116 }
1117
1118 static void write_mb_info(struct v3_vm_info *vm) 
1119 {
1120     if (vm->hvm_state.hrt_type!=HRT_MBOOT64) { 
1121         PrintError(vm, VCORE_NONE,"hvm: Cannot handle this HRT type\n");
1122         return;
1123     } else {
1124         uint8_t buf[256];
1125         uint64_t size;
1126         void *base;
1127         uint64_t limit;
1128
1129         get_mb_info_loc(vm,&base,&limit);
1130         
1131         if ((size=v3_build_multiboot_table(&vm->cores[vm->hvm_state.first_hrt_core],buf,256))==-1) { 
1132             PrintError(vm,VCORE_NONE,"hvm: Failed to build MB info\n");
1133             return;
1134         }
1135
1136         if (size>limit) { 
1137             PrintError(vm,VCORE_NONE,"hvm: MB info is too large\n");
1138             return;
1139         }
1140         
1141         v3_write_gpa_memory(&vm->cores[vm->hvm_state.first_hrt_core],
1142                             (addr_t)base,
1143                             size,
1144                             buf);
1145
1146         PrintDebug(vm,VCORE_NONE, "hvm: wrote MB info at %p\n", base);
1147     }
1148 }
1149
1150 #define SCRATCH_STACK_SIZE 4096
1151
1152
1153 static void get_hrt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
1154 {
1155     void *mb_base;
1156     uint64_t mb_limit;
1157     
1158     get_mb_info_loc(vm,&mb_base,&mb_limit);
1159     
1160     mb_base-=SCRATCH_STACK_SIZE*v3_get_hvm_hrt_cores(vm);
1161
1162     *base = (void*)PAGE_ADDR(vm->hvm_state.first_hrt_gpa);
1163
1164     if (mb_base < *base+PAGE_SIZE) { 
1165         PrintError(vm,VCORE_NONE,"hvm: HRT stack colides with HRT\n");
1166     }
1167
1168     *limit = mb_base - *base;
1169 }
1170
1171
// Logging helpers for contexts where no VM pointer is at hand
#define ERROR(fmt, args...) PrintError(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
#define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)

// First 4 bytes of an ELF image ("\x7fELF" read little-endian)
#define ELF_MAGIC    0x464c457f
// Multiboot2 header magic cookie
#define MB2_MAGIC    0xe85250d6

// Magic value handed to a multiboot2 kernel in RAX at entry
#define MB2_INFO_MAGIC    0x36d76289
1179
/*
 * Return 1 if the image begins with the ELF magic, 0 otherwise.
 * 0x464c457f is "\x7fELF" read as a little-endian 32-bit word.
 * (size is currently unused; the 4 magic bytes are assumed present.)
 */
static int is_elf(uint8_t *data, uint64_t size)
{
    return (*((uint32_t*)data) == 0x464c457f) ? 1 : 0;
}
1188
1189 static mb_header_t *find_mb_header(uint8_t *data, uint64_t size)
1190 {
1191     uint64_t limit = size > 32768 ? 32768 : size;
1192     uint64_t i;
1193
1194     // Scan for the .boot magic cookie
1195     // must be in first 32K, assume 4 byte aligned
1196     for (i=0;i<limit;i+=4) { 
1197         if (*((uint32_t*)&data[i])==MB2_MAGIC) {
1198             INFO("Found multiboot header at offset 0x%llx\n",i);
1199             return (mb_header_t *) &data[i];
1200         }
1201     }
1202     return 0;
1203 }
1204
1205
1206 static int configure_hrt(struct v3_vm_info *vm, mb_data_t *mb)
1207 {
1208     struct v3_vm_hvm *h = &vm->hvm_state;
1209     uint64_t f = mb->mb64_hrt->hrt_flags;
1210     uint64_t maxmap = mb->mb64_hrt->max_mem_to_map;
1211     uint64_t gvaoff = mb->mb64_hrt->gva_offset;
1212     uint64_t gvaentry = mb->mb64_hrt->gva_entry;
1213     uint64_t commgpa = mb->mb64_hrt->comm_page_gpa;
1214     uint8_t  vec = mb->mb64_hrt->hrt_int_vector;
1215     
1216
1217     PrintDebug(vm,VCORE_NONE,"hvm: HRT request: flags=0x%llx max_map=0x%llx gva_off=%llx gva_entry=%llx comm_page=0x%llx vector=0x%x\n",
1218                f, maxmap, gvaoff,gvaentry,commgpa, vec);
1219
1220     if (maxmap<0x100000000ULL) { 
1221         PrintDebug(vm,VCORE_NONE,"hvm: revising request up to 4 GB max map\n");
1222         maxmap=0x100000000ULL;
1223     }
1224
1225     if (f & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { 
1226         PrintError(vm,VCORE_NONE,"hvm: support for 512 GB pages is not yet available in hardware\n");
1227         return -1;
1228     } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { 
1229         f &= ~0x3c;
1230         f |= MB_TAG_MB64_HRT_FLAG_MAP_1GB;
1231         h->max_mem_mapped = maxmap;
1232         PrintDebug(vm,VCORE_NONE,"hvm: 1 GB pages selected\n");
1233     } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_2MB) { 
1234         f &= ~0x3c;
1235         f |= MB_TAG_MB64_HRT_FLAG_MAP_2MB;
1236         h->max_mem_mapped = maxmap;
1237         PrintDebug(vm,VCORE_NONE,"hvm: 2 MB pages selected\n");
1238     } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { 
1239         f &= ~0x3c;
1240         f |= MB_TAG_MB64_HRT_FLAG_MAP_4KB;
1241         h->max_mem_mapped = maxmap;
1242         PrintDebug(vm,VCORE_NONE,"hvm: 4 KB pages selected\n");
1243     } else {
1244         PrintError(vm,VCORE_NONE,"hvm: no page table model is requested\n");
1245         return -1;
1246     }
1247
1248     if (f & MB_TAG_MB64_HRT_FLAG_RELOC) {
1249         PrintError(vm,VCORE_NONE,"hvm: relocatable hrt not currently supported\n");
1250         return -1;
1251     }
1252
1253     h->hrt_flags = f;
1254
1255     if (maxmap>h->max_mem_mapped) { 
1256         PrintError(vm,VCORE_NONE,"hvm: requested 0x%llx bytes mapped, which is more than currently supported\n",maxmap);
1257         return -1;
1258     }
1259
1260     if (gvaoff!=0 && gvaoff!=TOP_HALF_START) { 
1261         PrintError(vm,VCORE_NONE,"hvm: currently only GVA offsets of 0 and %llx are supported\n", TOP_HALF_START);
1262         return -1;
1263     }
1264     
1265     h->gva_offset = gvaoff;
1266
1267     h->gva_entry = gvaentry;
1268
1269     if (mb->addr->load_addr < h->first_hrt_gpa) { 
1270         PrintError(vm,VCORE_NONE,"hvm: load start address of HRT is below first HRT GPA\n");
1271         return -1;
1272     }
1273     
1274     if (mb->addr->bss_end_addr > (vm->mem_size-(1024*1024*64))) {
1275         PrintError(vm,VCORE_NONE,"hvm: bss end address of HRT above last allowed GPA\n");
1276         return -1;
1277     }
1278     
1279     if (vec<32) { 
1280         PrintError(vm,VCORE_NONE,"hvm: cannot support vector %x\n",vec);
1281         return -1;
1282     }
1283     
1284     h->hrt_int_vector = vec;
1285     
1286     
1287     if (commgpa < vm->mem_size) { 
1288         PrintError(vm,VCORE_NONE,"hvm: cannot map comm page over physical memory\n");
1289         return -1;
1290     } 
1291
1292     h->comm_page_gpa = commgpa;
1293
1294     if (!h->comm_page_hpa) { 
1295         if (!(h->comm_page_hpa=V3_AllocPages(1))) { 
1296             PrintError(vm,VCORE_NONE,"hvm: unable to allocate space for comm page\n");
1297             return -1;
1298         }
1299
1300         h->comm_page_hva = V3_VAddr(h->comm_page_hpa);
1301         
1302         memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
1303         
1304         if (v3_add_shadow_mem(vm,-1,h->comm_page_gpa,h->comm_page_gpa+PAGE_SIZE_4KB,(addr_t)h->comm_page_hpa)) { 
1305             PrintError(vm,VCORE_NONE,"hvm: unable to map communication page\n");
1306             V3_FreePages((void*)(h->comm_page_gpa),1);
1307             return -1;
1308         }
1309         
1310         
1311         PrintDebug(vm,VCORE_NONE,"hvm: added comm page for first time\n");
1312     }
1313
1314     memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
1315     
1316     
1317     PrintDebug(vm,VCORE_NONE,"hvm: HRT configuration: flags=0x%llx max_mem_mapped=0x%llx gva_offset=0x%llx gva_entry=0x%llx comm_page=0x%llx vector=0x%x\n",
1318                h->hrt_flags,h->max_mem_mapped, h->gva_offset,h->gva_entry, h->comm_page_gpa, h->hrt_int_vector);
1319     
1320     return 0;
1321
1322 }
1323
1324 static int setup_mb_kernel_hrt(struct v3_vm_info *vm)
1325 {
1326     mb_data_t mb;
1327
1328     if (v3_parse_multiboot_header(vm->hvm_state.hrt_file,&mb)) { 
1329         PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n");
1330         return -1;
1331     }
1332
1333     if (configure_hrt(vm,&mb)) {
1334         PrintError(vm,VCORE_NONE, "hvm: cannot configure HRT\n");
1335         return -1;
1336     }
1337     
1338     if (v3_write_multiboot_kernel(vm,&mb,vm->hvm_state.hrt_file,
1339                                   (void*)vm->hvm_state.first_hrt_gpa,
1340                                   vm->mem_size-vm->hvm_state.first_hrt_gpa)) {
1341         PrintError(vm,VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n");
1342         return -1;
1343     }
1344
1345     if (vm->hvm_state.gva_entry) { 
1346         vm->hvm_state.hrt_entry_addr = vm->hvm_state.gva_entry;
1347     } else {
1348         vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + vm->hvm_state.gva_offset;
1349     }
1350
1351     vm->hvm_state.hrt_type = HRT_MBOOT64;
1352
1353     return 0;
1354
1355 }
1356
1357
1358 static int setup_hrt(struct v3_vm_info *vm)
1359 {
1360     if (is_elf(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size) && 
1361         find_mb_header(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) { 
1362
1363         PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n");
1364         if (setup_mb_kernel_hrt(vm)) { 
1365             PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n");
1366             return -1;
1367         } 
1368     } else {
1369         PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not a multiboot kernel\n");
1370         return -1;
1371     }
1372
1373     return 0;
1374 }
1375
1376
1377         
1378
1379 /*
1380   GPA layout:
1381
1382   HRT
1383   ---
1384   ROS
1385
1386   We do not touch the ROS portion of the address space.
1387   The HRT portion looks like:
1388
1389   INT_HANDLER (1 page - page aligned)
1390   IDT (1 page - page aligned)
1391   GDT (1 page - page aligned)
  TSS (1 page - page aligned)
  PAGETABLES  (identity map of first N GB)
1394      ROOT PT first (lowest memory addr), followed by 2nd level PTs in order,
1395      followed by 3rd level PTs in order, followed by 4th level
1396      PTs in order.  
1397   MBINFO_PAGE
1398   SCRATCH_STACK_HRT_CORE0 
1399   SCRATCH_STACK_HRT_CORE1
1400   ..
1401   SCRATCH_STACK_HRT_COREN
1402   ...
1403   HRT (as many pages as needed, page-aligned, starting at first HRT address)
1404   ---
1405   ROS
1406
1407
1408 */
1409
1410
/*
 * Populate the HRT portion of the GPA space for boot: load the HRT
 * image, then write the interrupt handler stub, IDT, GDT, TSS, page
 * tables, and finally the multiboot info page (see layout comment
 * above).  No-op for non-HVM VMs.  Returns 0 on success, -1 on
 * failure.
 */
int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm)
{
    if (!vm->hvm_state.is_hvm) { 
        PrintDebug(vm,VCORE_NONE,"hvm: skipping HVM setup for boot as this is not an HVM\n");
        return 0;
    }

    PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory begins\n");

    if (setup_hrt(vm)) {
        PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n");
        return -1;
    } 

    // the locations of all the other items are determined by
    // the HRT setup, so these must happen after
    write_null_int_handler(vm);
    write_idt(vm);
    write_gdt(vm);
    write_tss(vm);

    write_pts(vm);

    // this must happen last
    write_mb_info(vm);

    PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory done\n");

    return 0;
}
1442
1443 /*
1444   On entry for every core:
1445
1446    IDTR points to stub IDT
1447    GDTR points to stub GDT
1448    TS   points to stub TSS
1449    CR3 points to root page table
1450    CR0 has PE and PG
1451    EFER has LME AND LMA (and NX for compatibility with Linux)
1452    RSP is TOS of core's scratch stack (looks like a call)
1453
1454    RAX = MB magic cookie
1455    RBX = address of multiboot info table
1456    RCX = this core id / apic id (0..N-1)
1457    RDX = this core id - first HRT core ID (==0 for the first HRT core)
1458
1459    All addresses are virtual addresses, offset as needed by gva_offset
1460
1461    Other regs are zeroed
1462
1463    shadow/nested paging state reset for long mode
1464
1465 */
/*
 * Put an HRT core into the register/segment/paging state described in
 * the comment above: 64-bit long mode, paging on via the stub page
 * tables, multiboot-style entry registers, and the core's scratch
 * stack.  No-op for non-HRT cores.  Returns 0 on success, -1 if the
 * VM is not using nested paging.
 */
int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
{
    void *base;
    uint64_t limit;
    uint64_t gva_offset;

    // record when this (re)boot started
    rdtscll(core->hvm_state.last_boot_start);
    

    if (!core->hvm_state.is_hrt) { 
        PrintDebug(core->vm_info,core,"hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id);
        return 0;
    }


    PrintDebug(core->vm_info, core, "hvm: setting up HRT core (%u) for boot\n", core->vcpu_id);

    gva_offset = core->vm_info->hvm_state.gva_offset;
    
    // start from a clean slate
    memset(&core->vm_regs,0,sizeof(core->vm_regs));
    memset(&core->ctrl_regs,0,sizeof(core->ctrl_regs));
    memset(&core->dbg_regs,0,sizeof(core->dbg_regs));
    memset(&core->segments,0,sizeof(core->segments));    
    memset(&core->msrs,0,sizeof(core->msrs));    
    memset(&core->fp_state,0,sizeof(core->fp_state));    

    // We are in long mode with virtual memory and we want
    // to start immediately
    core->cpl = 0; // we are going right into the kernel
    core->cpu_mode = LONG;
    core->mem_mode = VIRTUAL_MEM; 
    core->core_run_state = CORE_RUNNING ;


    // magic
    core->vm_regs.rax = MB2_INFO_MAGIC;

    // multiboot info pointer
    get_mb_info_loc(core->vm_info, &base,&limit);
    core->vm_regs.rbx = (uint64_t) base + gva_offset;  

    // core number
    core->vm_regs.rcx = core->vcpu_id;
    
    // HRT core number
    core->vm_regs.rdx = core->vcpu_id - core->vm_info->hvm_state.first_hrt_core;

    // Now point to scratch stack for this core
    // it begins at an offset relative to the MB info page
    get_mb_info_loc(core->vm_info, &base,&limit);
    base = base + gva_offset;
    base -= core->vm_regs.rdx * SCRATCH_STACK_SIZE;
    core->vm_regs.rsp = (v3_reg_t) base;  
    core->vm_regs.rbp = (v3_reg_t) base-8; 

    // push onto the stack a bad rbp and bad return address
    core->vm_regs.rsp-=16;
    v3_set_gpa_memory(core,
                      core->vm_regs.rsp-gva_offset,
                      16,
                      0xff);


    // HRT entry point
    get_hrt_loc(core->vm_info, &base,&limit);
    if (core->vm_info->hvm_state.gva_entry) { 
      core->rip = core->vm_info->hvm_state.gva_entry;
    } else {
      core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr + gva_offset; 
    }
      


    PrintDebug(core->vm_info,core,"hvm: hrt core %u has rip=%p, rsp=%p, rbp=%p, rax=%p, rbx=%p, rcx=%p, rdx=%p\n",
               (core->vcpu_id - core->vm_info->hvm_state.first_hrt_core),
               (void*)(core->rip),
               (void*)(core->vm_regs.rsp),
               (void*)(core->vm_regs.rbp),
               (void*)(core->vm_regs.rax),
               (void*)(core->vm_regs.rbx),
               (void*)(core->vm_regs.rcx),
               (void*)(core->vm_regs.rdx));

    // Setup CRs for long mode and our stub page table
    // CR0: PG, PE
    core->ctrl_regs.cr0 = 0x80000001;
    core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0;

    // CR2: don't care (output from #PF)
    // CR3: set to our PML4E, without setting PCD or PWT
    get_pt_loc(core->vm_info, &base,&limit);
    core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base);  // not offset as this is a GPA
    core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3;

    // CR4: PGE, PAE, PSE (last byte: 1 0 1 1 0 0 0 0)
    core->ctrl_regs.cr4 = 0xb0;
    core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4;
    // CR8 as usual
    // RFLAGS zeroed is fine: come in with interrupts off
    // EFER needs SVME LMA LME (last 16 bits: 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0
    core->ctrl_regs.efer = 0x1d00;
    core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer;


    /* 
       Notes on selectors:

       selector is 13 bits of index, 1 bit table indicator 
       (0=>GDT), 2 bit RPL
       
       index is scaled by 8, even in long mode, where some entries 
       are 16 bytes long.... 
          -> code, data descriptors have 8 byte format
             because base, limit, etc, are ignored (no segmentation)
          -> interrupt/trap gates have 16 byte format 
             because offset needs to be 64 bits
    */
    
    // Install our stub IDT
    get_idt_loc(core->vm_info, &base,&limit);
    base += gva_offset;
    core->segments.idtr.selector = 0;  // entry 0 (NULL) of the GDT
    core->segments.idtr.base = (addr_t) base;  // only base+limit are used
    core->segments.idtr.limit = limit-1;
    core->segments.idtr.type = 0x0;
    core->segments.idtr.system = 0; 
    core->segments.idtr.dpl = 0;
    core->segments.idtr.present = 0;
    core->segments.idtr.long_mode = 0;

    // Install our stub GDT
    get_gdt_loc(core->vm_info, &base,&limit);
    base += gva_offset;
    core->segments.gdtr.selector = 0;  // entry 0 (NULL) of the GDT
    core->segments.gdtr.base = (addr_t) base;
    core->segments.gdtr.limit = limit-1;   // only base+limit are used
    core->segments.gdtr.type = 0x0;
    core->segments.gdtr.system = 0; 
    core->segments.gdtr.dpl = 0;
    core->segments.gdtr.present = 0;
    core->segments.gdtr.long_mode = 0;
    
    // And our TSS
    get_tss_loc(core->vm_info, &base,&limit);
    base += gva_offset;  
    core->segments.tr.selector = 0;
    core->segments.tr.base = (addr_t) base;
    core->segments.tr.limit = limit-1;
    core->segments.tr.type = 0x9;
    core->segments.tr.system = 0;   // available 64 bit TSS 
    core->segments.tr.dpl = 0;
    core->segments.tr.present = 1;
    core->segments.tr.long_mode = 0; // not used
    
    base = 0x0; // these are not offset as we want to make all gvas visible
    limit = -1;

    // And CS
    core->segments.cs.selector = 0x8 ; // entry 1 of GDT (RPL=0)
    core->segments.cs.base = (addr_t) base;   // not used
    core->segments.cs.limit = limit;          // not used
    core->segments.cs.type = 0xe;             // only C is used
    core->segments.cs.system = 1;             // not a system segment
    core->segments.cs.dpl = 0;                       
    core->segments.cs.present = 1;
    core->segments.cs.long_mode = 1;

    // DS, SS, etc are identical
    core->segments.ds.selector = 0x10; // entry 2 of GDT (RPL=0)
    core->segments.ds.base = (addr_t) base;
    core->segments.ds.limit = limit;
    core->segments.ds.type = 0x6;            // ignored
    core->segments.ds.system = 1;            // not a system segment
    core->segments.ds.dpl = 0;
    core->segments.ds.present = 1;
    core->segments.ds.long_mode = 1;
    
    memcpy(&core->segments.ss,&core->segments.ds,sizeof(core->segments.ds));
    memcpy(&core->segments.es,&core->segments.ds,sizeof(core->segments.ds));
    memcpy(&core->segments.fs,&core->segments.ds,sizeof(core->segments.ds));
    memcpy(&core->segments.gs,&core->segments.ds,sizeof(core->segments.ds));
    

    // reset paging here for shadow... 
    // only nested paging is supported for HRT cores
    if (core->shdw_pg_mode != NESTED_PAGING) { 
        PrintError(core->vm_info, core, "hvm: shadow paging guest... this will end badly\n");
        return -1;
    }


    return 0;
}
1659
/*
 * Handle a reset request on a core of an HVM VM.
 *
 * Returns 0 when there is nothing HVM-specific to do (core is not
 * resetting, the VM is not an HVM, or this is a ROS core handled by
 * the normal reset path), 1 when the HRT reset completed, or a
 * negative value on failure.
 *
 * All HRT cores rendezvous at a counting barrier; the first HRT core
 * acts as leader, rewriting the boot state (v3_setup_hvm_vm_for_boot)
 * while the others wait, then every HRT core re-initializes its own
 * register state and they rendezvous again before resuming.
 */
int v3_handle_hvm_reset(struct guest_info *core)
{

    if (core->core_run_state != CORE_RESETTING) { 
        return 0;
    }

    if (!core->vm_info->hvm_state.is_hvm) { 
        return 0;
    }

    if (v3_is_hvm_hrt_core(core)) { 
        // this is an HRT reset
        int rc=0;

        // wait for all the HRT cores
        v3_counting_barrier(&core->vm_info->reset_barrier);

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) { 
            // I am leader
            core->vm_info->run_state = VM_RESETTING;
        }

        core->core_run_state = CORE_RESETTING;

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
            // we really only need to clear the bss
            // and recopy the .data, but for now we'll just
            // do everything
            rc |= v3_setup_hvm_vm_for_boot(core->vm_info);

            if (rc) { 
                PrintError(core->vm_info,core,"hvm: failed to setup HVM VM for boot rc=%d\n",rc);
            }
        }

        // now everyone is ready to reset
        rc |= v3_setup_hvm_hrt_core_for_boot(core);

        if (rc) { 
            PrintError(core->vm_info,core,"hvm: failed to setup HVM core for boot rc=%d\n",rc);
        }

        core->core_run_state = CORE_RUNNING;

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) { 
            // leader declares the VM runnable again
            core->vm_info->run_state = VM_RUNNING;
            core->vm_info->hvm_state.trans_state = HRT_IDLE;
        }

        // second rendezvous: no core resumes until all are re-initialized
        v3_counting_barrier(&core->vm_info->reset_barrier);

        if (rc<0) { 
            PrintError(core->vm_info,core,"hvm: reset failed\n");
            return rc;
        } else {
            return 1;
        }

    } else { 
        // ROS core will be handled by normal reset functionality
        return 0;
    }
}