Palacios Public Git Repository

To checkout Palacios execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git
This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, simply execute
  cd palacios
  git checkout --track -b devel origin/devel
The other branches are similar.


43e7c332a0efd87f01a00b35a5c5bb49b4d48a57
[palacios.git] / palacios / src / palacios / vmm_hvm.c
1 /* 
2  * This file is part of the Palacios Virtual Machine Monitor developed
3  * by the V3VEE Project with funding from the United States National 
4  * Science Foundation and the Department of Energy.  
5  *
6  * The V3VEE Project is a joint project between Northwestern University
7  * and the University of New Mexico.  You can find out more at 
8  * http://www.v3vee.org
9  *
10  * Copyright (c) 2015, The V3VEE Project <http://www.v3vee.org> 
11  * All rights reserved.
12  *
13  * Author:  Peter Dinda <pdinda@northwestern.edu>
14  *
15  * This is free software.  You are permitted to use,
16  * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
17  */
18
19 #include <palacios/vmm_mem.h>
20 #include <palacios/vmm.h>
21 #include <palacios/vmm_util.h>
22 #include <palacios/vmm_emulator.h>
23 #include <palacios/vm_guest.h>
24 #include <palacios/vmm_debug.h>
25 #include <palacios/vmm_hypercall.h>
26
27 #include <palacios/vmm_xml.h>
28
29 #include <palacios/vm_guest_mem.h>
30
31 #include <palacios/vmm_debug.h>
32
33
34 /*
35
36   MEM     = Total size of memory in the GPA (in MB)
37   ROS_MEM = Total size of memory for the ROS (in MB) (<RAM)
38
39   GPAs [0,ROS_MEM) are what the ROS sees
40   GPAs [ROS_MEM, MEM) are HRT only
41   GPAS [0,MEM) are accessible by the HRT
42
43   CORES   = Total number of cores in VM
44   ROS_CORES = Total number of cores for the ROS
45
46   Cores [0,ROS_CORES) are what the ROS sees
47   Cores [ROS_CORES,CORES) are HRT only
48   Cores [0,CORES) are accessible by the HRT
49
50   In a Pal file:
51
52   <files> 
53     <file id="hrtelf" filename="hrtelf.o" />
54   </files>
55
56   <mem ... >RAM</mem>   (MB)  Note these are  
57   <cores count="CORES" ...>   backward compatible
58
59   <hvm enable="y" >
60     <ros cores="ROS_CORES" mem="ROS_MEM" /> (MB)
61     <hrt file_id="hrtelf" />
62   </hvm>
63
64 */
65
66 #ifndef V3_CONFIG_DEBUG_HVM
67 #undef PrintDebug
68 #define PrintDebug(fmt, args...)
69 #endif
70
71
// Global (module-level) HVM initialization.  Nothing to do beyond
// announcing ourselves; per-VM state is set up in v3_init_hvm_vm().
int v3_init_hvm()
{
    PrintDebug(VM_NONE,VCORE_NONE, "hvm: init\n");
    return 0;
}
77
// Global (module-level) HVM teardown.  Per-VM state is torn down in
// v3_deinit_hvm_vm(); there is no global state to release here.
int v3_deinit_hvm()
{
    PrintDebug(VM_NONE,VCORE_NONE, "hvm: deinit\n");
    return 0;
}
83
84 // ignore requests from when we are in the wrong state
85 #define ENFORCE_STATE_MACHINE 1
86
87 // invoke the HRT using a page fault instead of
88 // the SWINTR mechanism
89 #define USE_UPCALL_MAGIC_PF  1
90 #define UPCALL_MAGIC_ADDRESS 0x0000800df00df00dULL
91 #define UPCALL_MAGIC_ERROR   0xf00df00d
92
93 /*
94   64 bit only hypercall:
95
96   rax = hypercall number
97   rbx = 0x646464...
98   then args are:  rcx, rdx, rsi, rdi r8, r9, r10, r11
99   rcx = 1st arg
100 */
101 static int hvm_hcall_handler(struct guest_info * core , hcall_id_t hcall_id, void * priv_data)
102 {
103     uint64_t c;
104     uint64_t bitness = core->vm_regs.rbx;
105     uint64_t a1 = core->vm_regs.rcx;
106     uint64_t a2 = core->vm_regs.rdx;
107     uint64_t a3 = core->vm_regs.rsi;
108     struct v3_vm_hvm *h = &core->vm_info->hvm_state;
109
110
111     if (bitness!=0x6464646464646464) { 
112         PrintError(core->vm_info,core,"hvm: unable to handle non-64 bit hypercall\n");
113         core->vm_regs.rax = -1;
114         return 0;
115     }
116
117     switch (a1) {
118         case 0x0:   // null
119             
120             rdtscll(c);
121             
122             V3_Print(core->vm_info,core, "hvm: received hypercall %x  rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n",
123                      hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, core->hvm_state.last_boot_start, core->num_exits);
124             //v3_print_core_telemetry(core);
125             //    v3_print_guest_state(core);
126             core->vm_regs.rax = 0;
127             break;
128             
129         case 0x1: // reset ros
130             PrintDebug(core->vm_info,core, "hvm: reset ROS\n");
131             if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ROS,0)) { 
132                 PrintError(core->vm_info,core, "hvm: reset of ROS failed\n");
133                 core->vm_regs.rax = -1;
134             } else {
135                 core->vm_regs.rax = 0;
136             }
137             break;
138
139         case 0x2: // reset hrt
140             PrintDebug(core->vm_info,core, "hvm: reset HRT\n");
141             if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_HRT,0)) { 
142                 PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
143                 core->vm_regs.rax = -1;
144             } else {
145                 core->vm_regs.rax = 0;
146             }
147             break;
148
149         case 0x3: // reset both
150             PrintDebug(core->vm_info,core, "hvm: reset ROS+HRT\n");
151             if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ALL,0)) { 
152                 PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
153                 core->vm_regs.rax = -1;
154             } else {
155                 core->vm_regs.rax = 0;
156             }
157             break;
158             
159         case 0x8: // replace HRT image
160             // a2 = gva of image
161             // a3 = size of image
162             PrintDebug(core->vm_info,core,"hvm: request replacement HRT image addr=0x%llx size=0x%llx\n",a2,a3);
163
164             if (h->hrt_image) { 
165                 // delete old
166                 V3_VFree(h->hrt_image);
167                 h->hrt_image = 0;
168             }
169
170             h->hrt_image = V3_VMalloc(a3);
171
172             if (!(h->hrt_image)) {
173                 PrintError(core->vm_info,core, "hvm: failed to allocate space for replacement image\n");
174                 core->vm_regs.rax = -1;
175             } else {
176                 if (v3_read_gva_memory(core, a2, a3, (uint8_t*) h->hrt_image)!=a3) { 
177                     PrintError(core->vm_info, core, "hvm: cannot read replacement image\n");
178                     core->vm_regs.rax = -1;
179                 } else {
180                     h->hrt_image_size = a3; 
181                     core->vm_regs.rax = 0;
182                 }
183             }
184
185             if (core->vm_regs.rax) { 
186                 PrintError(core->vm_info,core,"hvm: Failed to replace HRT image\n");
187             } else {
188                 PrintDebug(core->vm_info,core,"hvm: HRT image successfully replaced\n");
189             }
190
191             break;
192
193         case 0xf: // get HRT state
194             core->vm_regs.rax = h->trans_state;
195             if (v3_write_gva_memory(core, a2, sizeof(h->ros_event), (uint8_t*) &h->ros_event)!=sizeof(h->ros_event)) { 
196                 PrintError(core->vm_info, core, "hvm: cannot write back ROS event state to %p - continuing\n",(void*)a2);
197             }
198             //PrintDebug(core->vm_info,core,"hvm: get HRT transaction state 0x%llx\n",core->vm_regs.rax);
199             break;
200
201         case 0x10:
202             PrintDebug(core->vm_info, core, "hvm: ROS event request\n");
203             if (h->ros_event.event_type!=ROS_NONE) { 
204                 PrintError(core->vm_info, core, "hvm: ROS event is already in progress\n");
205                 core->vm_regs.rax = -1;
206             } else {
207                 if (v3_read_gva_memory(core, a2, sizeof(h->ros_event), (uint8_t*)&h->ros_event)!=sizeof(h->ros_event)) { 
208                     PrintError(core->vm_info, core, "hvm: cannot read ROS event from %p\n",(void*)a2);
209                     core->vm_regs.rax = -1;
210                 } else {
211                     core->vm_regs.rax = 0;
212                 }
213             }
214
215             break;
216
217         case 0x1f:
218             PrintDebug(core->vm_info, core, "hvm: completion of ROS event (rc=0x%llx)\n",a2);
219             h->ros_event.event_type=ROS_NONE;
220             h->ros_event.last_ros_event_result = a2;
221             break;
222
223         case 0x20: // invoke function (ROS->HRT)
224         case 0x21: // invoke parallel function (ROS->HRT)
225             if (v3_is_hvm_hrt_core(core)) { 
226                 PrintError(core->vm_info,core,"hvm: %s function invocation not supported from HRT core\n", a1==0x20 ? "" : "parallel");
227                 core->vm_regs.rax = -1;
228             } else {
229                 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { 
230                     PrintError(core->vm_info,core, "hvm: cannot invoke %s function %p in state %d\n",a1==0x20 ? "" : "parallel", (void*)a2,h->trans_state);
231                     core->vm_regs.rax = -1;
232                 } else {
233                     uint64_t *page = (uint64_t *) h->comm_page_hva;
234                     uint64_t first, last, cur;
235
236                     PrintDebug(core->vm_info,core, "hvm: %s invoke function %p\n",a1==0x20 ? "" : "parallel",(void*)a2);
237                     page[0] = a1;
238                     page[1] = a2;
239
240                     if (a1==0x20) { 
241                         first=last=h->first_hrt_core;
242                     } else {
243                         first=h->first_hrt_core;
244                         last=core->vm_info->num_cores-1;
245                     }
246
247                     core->vm_regs.rax = 0;
248
249                     h->trans_count = last-first+1;
250
251                     for (cur=first;cur<=last;cur++) { 
252
253 #if USE_UPCALL_MAGIC_PF
254                         PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",cur);
255                         core->vm_info->cores[cur].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
256                         if (v3_raise_exception_with_error(&core->vm_info->cores[cur],
257                                                           PF_EXCEPTION, 
258                                                           UPCALL_MAGIC_ERROR)) { 
259                             PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",cur);
260                             core->vm_regs.rax = -1;
261                             break;
262                         }
263 #else
264                         PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %llu\n",h->hrt_int_vector,cur);
265                         if (v3_raise_swintr(&core->vm_info->cores[cur],h->hrt_int_vector)) { 
266                             PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",cur);
267                             core->vm_regs.rax = -1;
268                             break;
269                         }
270 #endif
271                         // Force core to exit now
272                         v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
273                           
274                     }
275                     if (core->vm_regs.rax==0) { 
276                         if (a1==0x20) { 
277                             h->trans_state = HRT_CALL;
278                         } else {
279                             h->trans_state = HRT_PARCALL;
280                         }
281                     }  else {
282                         PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
283                         h->trans_state = HRT_IDLE;
284                         h->trans_count = 0;
285                     }
286                 }
287             }
288             break;
289
290
291         case 0x28: // setup for synchronous operation (ROS->HRT)
292         case 0x29: // teardown for synchronous operation (ROS->HRT)
293             if (v3_is_hvm_hrt_core(core)) { 
294                 PrintError(core->vm_info,core,"hvm: %ssynchronization invocation not supported from HRT core\n",a1==0x29 ? "de" : "");
295                 core->vm_regs.rax = -1;
296             } else {
297                 if (ENFORCE_STATE_MACHINE && 
298                     ((a1==0x28 && h->trans_state!=HRT_IDLE) || (a1==0x29 && h->trans_state!=HRT_SYNC))) { 
299                     PrintError(core->vm_info,core, "hvm: cannot invoke %ssynchronization in state %d\n",a1==0x29 ? "de" : "", h->trans_state);
300                     core->vm_regs.rax = -1;
301                 } else {
302                     uint64_t *page = (uint64_t *) h->comm_page_hva;
303                     uint64_t first, last, cur;
304
305                     PrintDebug(core->vm_info,core, "hvm: invoke %ssynchronization on address %p\n",a1==0x29 ? "de" : "",(void*)a2);
306                     page[0] = a1;
307                     page[1] = a2;
308
309                     first=last=h->first_hrt_core;  // initially we will sync only with BSP
310
311                     core->vm_regs.rax = 0;
312
313                     h->trans_count = last-first+1;
314
315                     for (cur=first;cur<=last;cur++) { 
316
317 #if USE_UPCALL_MAGIC_PF
318                         PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",cur);
319                         core->vm_info->cores[cur].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
320                         if (v3_raise_exception_with_error(&core->vm_info->cores[cur],
321                                                           PF_EXCEPTION, 
322                                                           UPCALL_MAGIC_ERROR)) { 
323                             PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",cur);
324                             core->vm_regs.rax = -1;
325                             break;
326                         }
327 #else
328                         PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %llu\n",h->hrt_int_vector,cur);
329                         if (v3_raise_swintr(&core->vm_info->cores[cur],h->hrt_int_vector)) { 
330                             PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",cur);
331                             core->vm_regs.rax = -1;
332                             break;
333                         }
334 #endif
335                         // Force core to exit now
336                         v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
337                           
338                     }
339                     if (core->vm_regs.rax==0) { 
340                         if (a1==0x28) { 
341                             h->trans_state = HRT_SYNCSETUP;
342                         } else {
343                             h->trans_state = HRT_SYNCTEARDOWN;                      
344                         }
345                     }  else {
346                         PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
347                         h->trans_state = HRT_IDLE;
348                         h->trans_count = 0;
349                     }
350                 }
351             }
352             break;
353
354         case 0x2f: // function exec or sync done
355             if (v3_is_hvm_ros_core(core)) { 
356                 PrintError(core->vm_info,core, "hvm: request for exec or sync done from ROS core\n");
357                 core->vm_regs.rax=-1;
358             } else {
359                 if (ENFORCE_STATE_MACHINE && 
360                     h->trans_state!=HRT_CALL && 
361                     h->trans_state!=HRT_PARCALL && 
362                     h->trans_state!=HRT_SYNCSETUP &&
363                     h->trans_state!=HRT_SYNCTEARDOWN) {
364                     PrintError(core->vm_info,core,"hvm: function or sync completion when not in HRT_CALL, HRT_PARCALL, HRT_SYNCSETUP, or HRT_SYNCTEARDOWN state\n");
365                     core->vm_regs.rax=-1;
366                 } else {
367                     uint64_t one=1;
368                     PrintDebug(core->vm_info,core, "hvm: function or sync complete\n");
369                     if (__sync_fetch_and_sub(&h->trans_count,one)==1) {
370                         // last one, switch state
371                         if (h->trans_state==HRT_SYNCSETUP) { 
372                             h->trans_state=HRT_SYNC;
373                             PrintDebug(core->vm_info,core, "hvm: function complete - now synchronous\n");
374                         } else {
375                             h->trans_state=HRT_IDLE;
376                         }
377                     }
378                     core->vm_regs.rax=0;
379                 }
380             }
381                     
382             break;
383
384         case 0x30: // merge address space
385         case 0x31: // unmerge address space
386             if (v3_is_hvm_hrt_core(core)) { 
387                 PrintError(core->vm_info,core,"hvm: request to %smerge address space from HRT core\n", a1==0x30 ? "" : "un");
388                 core->vm_regs.rax=-1;
389             } else {
390                 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { 
391                     PrintError(core->vm_info,core,"hvm: request to %smerge address space in non-idle state\n",a1==0x30 ? "" : "un");
392                     core->vm_regs.rax=-1;
393                 } else {
394                     uint64_t *page = (uint64_t *) h->comm_page_hva;
395
396                     PrintDebug(core->vm_info,core,"hvm: %smerge address space request with %p\n",a1==0x30 ? "" : "un",(void*)core->ctrl_regs.cr3);
397                     // should sanity check to make sure guest is in 64 bit without anything strange
398
399                     page[0] = a1;
400                     page[1] = core->ctrl_regs.cr3;  // this is a do-not-care for an unmerge
401
402                     core->vm_regs.rax = 0;
403 #if USE_UPCALL_MAGIC_PF
404                     PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %u\n",h->first_hrt_core);
405                     core->vm_info->cores[h->first_hrt_core].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
406                     if (v3_raise_exception_with_error(&core->vm_info->cores[h->first_hrt_core],
407                                                       PF_EXCEPTION,  
408                                                       UPCALL_MAGIC_ERROR)) { 
409                       PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %u\n",h->first_hrt_core);
410                       core->vm_regs.rax = -1;
411                       break;
412                     }
413 #else
414                     PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %u\n",h->hrt_int_vector,h->first_hrt_core);
415                     if (v3_raise_swintr(&core->vm_info->cores[h->first_hrt_core],h->hrt_int_vector)) { 
416                         PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %u\n",h->first_hrt_core);
417                         core->vm_regs.rax = -1;
418                     } 
419 #endif          
420                     // Force core to exit now
421                     v3_interrupt_cpu(core->vm_info,core->vm_info->cores[h->first_hrt_core].pcpu_id,0);
422
423                     h->trans_state = HRT_MERGE;
424                 }
425                 
426             }
427                 
428             break;
429             
430
431         case 0x3f: // merge operation done
432             if (v3_is_hvm_ros_core(core)) { 
433                 PrintError(core->vm_info,core, "hvm: request for merge done from ROS core\n");
434                 core->vm_regs.rax=-1;
435             } else {
436                 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_MERGE) {
437                     PrintError(core->vm_info,core,"hvm: merge/unmerge done when in non-idle state\n");
438                     core->vm_regs.rax=-1;
439                 } else {
440                     PrintDebug(core->vm_info,core, "hvm: merge or unmerge complete - back to idle\n");
441                     h->trans_state=HRT_IDLE;
442                     core->vm_regs.rax=0;
443                 }
444             }
445                     
446             break;
447             
448         case 0x40: // install or remove signal handler
449             if (v3_is_hvm_hrt_core(core)) { 
450                 PrintError(core->vm_info,core, "hvm: HRT cannot install signal handler...\n");
451                 core->vm_regs.rax=-1;
452             } else {
453                 PrintDebug(core->vm_info,core,"hvm: install signal handler for CR3=%p, handler=%p, stack=%p\n",(void*)core->ctrl_regs.cr3, (void*)a2, (void*)a3);
454                 if (h->ros_signal.code) { 
455                     PrintError(core->vm_info,core,"hvm: signal is pending...\n");
456                     core->vm_regs.rax=-1;
457                 } else {
458                     if ((a2 || a3) && (h->ros_signal.handler || h->ros_signal.stack)) { 
459                         PrintError(core->vm_info,core,"hvm: attempt to replace existing handler without removing it first\n");
460                         core->vm_regs.rax=-1;
461                     } else {
462                         // actually make the change
463                         h->ros_signal.handler=a2;
464                         h->ros_signal.stack=a3;
465                         h->ros_signal.cr3=core->ctrl_regs.cr3;
466                         core->vm_regs.rax=0;
467
468                         // test by signalling back a hello 
469                         // if (a2 && a3) { 
470                         //    v3_hvm_signal_ros(core->vm_info,0xf00d);
471                         //}
472                     }
473                 }
474             }
475             break;
476
477         case 0x41: // raise signal in the ROS from HRT or ROS
478             PrintDebug(core->vm_info,core,"hvm: HRT raises signal code=0x%llx\n", a2);
479             core->vm_regs.rax = v3_hvm_signal_ros(core->vm_info,a2);
480             break;
481
482         default:
483             PrintError(core->vm_info,core,"hvm: unknown hypercall %llx\n",a1);
484             core->vm_regs.rax=-1;
485             break;
486     }
487                 
488     return 0;
489 }
490
491 #define CEIL_DIV(x,y) (((x)/(y)) + !!((x)%(y)))
492
/*
  Per-VM HVM initialization from the <hvm> block of the configuration.
  Defaults to a pure-ROS VM (is_hvm=0, all cores and all memory visible
  to the ROS) when the block is missing or disabled.  Returns 0 on
  success (including the pure-ROS case), -1 on malformed configuration.
*/
int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
{
    v3_cfg_tree_t *hvm_config;
    v3_cfg_tree_t *ros_config;
    v3_cfg_tree_t *hrt_config;
    char *enable;
    char *ros_cores;
    char *ros_mem;
    char *hrt_file_id=0;

    PrintDebug(vm, VCORE_NONE, "hvm: vm init\n");

    /* 
       Defaults - all ROS
    */
    memset(&vm->hvm_state,0,sizeof(struct v3_vm_hvm));
    vm->hvm_state.is_hvm=0;
    // with no HRT, the "first HRT core/gpa" sentinels sit just past the
    // last real core / byte, so the ROS sees everything
    vm->hvm_state.first_hrt_core=vm->num_cores;
    vm->hvm_state.first_hrt_gpa=vm->mem_size;

    if (!config || !(hvm_config=v3_cfg_subtree(config,"hvm"))) {
        PrintDebug(vm,VCORE_NONE,"hvm: no HVM configuration found (all HW is ROS)\n");
        goto out_ok;
    }
    
    if (!(enable=v3_cfg_val(hvm_config,"enable")) || strcasecmp(enable,"y")) {
        PrintDebug(vm,VCORE_NONE,"hvm: HVM configuration disabled (all HW is ROS)\n");
        goto out_ok;
    }

    // enabled HVM requires a <ros> block with cores and mem attributes
    if (!(ros_config=v3_cfg_subtree(hvm_config,"ros"))) { 
        PrintError(vm,VCORE_NONE,"hvm: HVM configuration without ROS block...\n");
        return -1;
    }
 
    if (!(ros_cores=v3_cfg_val(ros_config,"cores"))) { 
        PrintError(vm,VCORE_NONE,"hvm: ROS block without cores...\n");
        return -1;
    }
   
    vm->hvm_state.first_hrt_core = ((uint32_t)atoi(ros_cores));
    
    if (!(ros_mem=v3_cfg_val(ros_config,"mem"))) { 
        PrintError(vm,VCORE_NONE,"hvm: ROS block without mem...\n");
        return -1;
    }

    // config value is in MB; convert to a byte address
    vm->hvm_state.first_hrt_gpa = ((uint64_t)atoi(ros_mem))*1024*1024;

    // ...and an <hrt> block naming the HRT image file
    if (!(hrt_config=v3_cfg_subtree(hvm_config,"hrt"))) { 
        PrintError(vm,VCORE_NONE,"hvm: HVM configuration without HRT block...\n");
        return -1;
    }
 
    if (!(hrt_file_id=v3_cfg_val(hrt_config,"file_id"))) { 
        PrintError(vm,VCORE_NONE,"hvm: HRT block without file_id...\n");
        return -1;
    }

    vm->hvm_state.hrt_file = v3_cfg_get_file(vm,hrt_file_id);
    
    if (!vm->hvm_state.hrt_file) { 
        PrintError(vm,VCORE_NONE,"hvm: HRT block contains bad file_id (%s)\n",hrt_file_id);
        return -1;
    }

    if (v3_register_hypercall(vm, HVM_HCALL, 
                              hvm_hcall_handler, 0)) { 
        PrintError(vm,VCORE_NONE, "hvm: cannot register hypercall....\n");
        return -1;
    }

    // XXX sanity check config here

    vm->hvm_state.is_hvm=1;

 out_ok:
    if (vm->hvm_state.is_hvm) {
        V3_Print(vm,VCORE_NONE,"hvm: [ROS: cores 0..%u, mem 0..%p] [HRT: cores %u..%u, mem %p..%p, file_id=%s (tag %s)]\n",
                 vm->hvm_state.first_hrt_core-1,
                 (void*) vm->hvm_state.first_hrt_gpa-1,
                 vm->hvm_state.first_hrt_core,
                 vm->num_cores-1,
                 (void*) vm->hvm_state.first_hrt_gpa,
                 (void*)vm->mem_size-1,
                 hrt_file_id,
                 vm->hvm_state.hrt_file->tag);
    } else {
        V3_Print(vm,VCORE_NONE,"hvm: This is a pure ROS VM\n");
    }
    return 0;
    
}
586
587
/*
  Per-VM HVM teardown: frees any replacement HRT image, unregisters the
  hypercall, and deletes the shadow memory region backing the
  communication page (if one was ever mapped).
*/
int v3_deinit_hvm_vm(struct v3_vm_info *vm)
{
    PrintDebug(vm, VCORE_NONE, "hvm: HVM VM deinit\n");

    if (vm->hvm_state.hrt_image) { 
        V3_VFree(vm->hvm_state.hrt_image);
        vm->hvm_state.hrt_image=0;
        vm->hvm_state.hrt_image_size=0;
    }

    v3_remove_hypercall(vm,HVM_HCALL);

    if (vm->hvm_state.comm_page_hpa) { 
        // -1 core id: look the region up VM-wide rather than per-core
        struct v3_mem_region *r = v3_get_mem_region(vm,-1,(addr_t)vm->hvm_state.comm_page_hpa);
        if (!r) { 
            PrintError(vm,VCORE_NONE,"hvm: odd, VM has comm_page_hpa, but no shadow memory\n");
        } else {
            v3_delete_mem_region(vm,r);
        }
    }

    return 0;
}
611
612 int v3_init_hvm_core(struct guest_info *core)
613 {
614     memset(&core->hvm_state,0,sizeof(core->hvm_state));
615     if (core->vm_info->hvm_state.is_hvm) { 
616         if (core->vcpu_id >= core->vm_info->hvm_state.first_hrt_core) { 
617             core->hvm_state.is_hrt=1;
618         }
619     }
620     return 0;
621 }
622
// Per-core HVM teardown.  Currently there is no per-core state to free.
int v3_deinit_hvm_core(struct guest_info *core)
{
    PrintDebug(core->vm_info, VCORE_NONE, "hvm: HVM core deinit\n");

    return 0;
}
629
630
631 uint64_t v3_get_hvm_ros_memsize(struct v3_vm_info *vm)
632 {
633     if (vm->hvm_state.is_hvm) { 
634         return vm->hvm_state.first_hrt_gpa;
635     } else {
636         return vm->mem_size;
637     }
638 }
// Bytes of guest physical memory visible to the HRT: the HRT can see
// the entire GPA space [0,mem_size), HVM or not.
uint64_t v3_get_hvm_hrt_memsize(struct v3_vm_info *vm)
{
    return vm->mem_size;
}
643
644 uint32_t v3_get_hvm_ros_cores(struct v3_vm_info *vm)
645 {
646     if (vm->hvm_state.is_hvm) { 
647         return vm->hvm_state.first_hrt_core;
648     } else {
649         return vm->num_cores;
650     }
651 }
652
653 uint32_t v3_get_hvm_hrt_cores(struct v3_vm_info *vm)
654 {
655     if (vm->hvm_state.is_hvm) { 
656         return vm->num_cores - vm->hvm_state.first_hrt_core;
657     } else {
658         return 0;
659     }
660 }
661
662
663 int v3_is_hvm_ros_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
664 {
665     if (vm->hvm_state.is_hvm) { 
666         return gpa<vm->hvm_state.first_hrt_gpa;
667     } else {
668         return 1;
669     }
670 }
671
672 int v3_is_hvm_hrt_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
673 {
674     if (vm->hvm_state.is_hvm) { 
675         return gpa>=vm->hvm_state.first_hrt_gpa && gpa<vm->mem_size;
676     } else {
677         return 0;
678     }
679 }
680
// True iff this core was tagged as an HRT core at init time.
int v3_is_hvm_hrt_core(struct guest_info *core)
{
    return core->hvm_state.is_hrt;
}
685
// True iff this core belongs to the ROS (i.e., is not an HRT core).
int v3_is_hvm_ros_core(struct guest_info *core)
{
    return !core->hvm_state.is_hrt;
}
690
691 int      v3_hvm_should_deliver_ipi(struct guest_info *src, struct guest_info *dest)
692 {
693     if (!src) {
694         // ioapic or msi to apic
695         return !dest->hvm_state.is_hrt;
696     } else {
697         // apic to apic
698         return src->hvm_state.is_hrt || (!src->hvm_state.is_hrt && !dest->hvm_state.is_hrt) ;
699     }
700 }
701
702 void     v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_info *vm, 
703                                         uint32_t *start_apic, uint32_t *num_apics)
704 {
705     if (!core) { 
706         // Seen from ioapic, msi, etc: 
707         if (vm->hvm_state.is_hvm) {
708             // HVM VM shows only the ROS cores/apics to ioapic, msi, etc
709             *start_apic = 0;
710             *num_apics = vm->hvm_state.first_hrt_core;
711         } else {
712             // Non-HVM shows all cores/APICs to apic, msi, etc.
713             *start_apic = 0;
714             *num_apics = vm->num_cores;
715         }
716     } else {
717         // Seen from apic
718         if (core->hvm_state.is_hrt) { 
719             // HRT core/apic sees all apics
720             // (this policy may change...)
721             *start_apic = 0;
722             *num_apics = vm->num_cores;
723         } else {
724             // non-HRT core/apic sees only non-HRT cores/apics
725             *start_apic = 0 ;
726             *num_apics = vm->hvm_state.first_hrt_core;
727         }
728     }
729 }
730
731 #define MAX(x,y) ((x)>(y)?(x):(y))
732 #define MIN(x,y) ((x)<(y)?(x):(y))
733
734
// Page-aligned GPA just past the end of guest memory; the fixed boot
// state structures (null handler, IDT, ...) are laid out downward from here.
static uint64_t boot_state_end_addr(struct v3_vm_info *vm) 
{
    return PAGE_ADDR(vm->mem_size);
}
739    
// The null interrupt handler stub occupies the last page of guest memory.
static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - PAGE_SIZE);
    *limit = PAGE_SIZE;
}
745
746 extern v3_cpu_arch_t v3_mach_type;
747
748 extern void *v3_hvm_svm_null_int_handler_start;
749 extern void *v3_hvm_svm_null_int_handler_end;
750 extern void *v3_hvm_vmx_null_int_handler_start;
751 extern void *v3_hvm_vmx_null_int_handler_end;
752
753 static void write_null_int_handler(struct v3_vm_info *vm)
754 {
755     void *base;
756     uint64_t limit;
757     void *data;
758     uint64_t len;
759
760     get_null_int_handler_loc(vm,&base,&limit);
761
762     switch (v3_mach_type) {
763 #ifdef V3_CONFIG_SVM
764         case V3_SVM_CPU:
765         case V3_SVM_REV3_CPU:
766             data = (void*) &v3_hvm_svm_null_int_handler_start;
767             len = (void*) &v3_hvm_svm_null_int_handler_end - data;
768             break;
769 #endif
770 #if V3_CONFIG_VMX
771         case V3_VMX_CPU:
772         case V3_VMX_EPT_CPU:
773         case V3_VMX_EPT_UG_CPU:
774             data = (void*) &v3_hvm_vmx_null_int_handler_start;
775             len = (void*) &v3_hvm_vmx_null_int_handler_end - data;
776             break;
777 #endif
778         default:
779             PrintError(vm,VCORE_NONE,"hvm: cannot determine CPU type to select null interrupt handler...\n");
780             data = 0;
781             len = 0;
782     }
783
784     if (data) {
785         v3_write_gpa_memory(&vm->cores[0],(addr_t)(base),len,(uint8_t*)data);
786     }
787
788     PrintDebug(vm,VCORE_NONE,"hvm: wrote null interrupt handler at %p (%llu bytes)\n",base,len);
789 }
790
791
792 static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
793 {
794     *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - 2 * PAGE_SIZE);
795     *limit = 16*256;
796 }
797
798 // default IDT entries (int and trap gates)
799 //
800 // Format is 16 bytes long:
801 //   16 offsetlo   => 0
802 //   16 selector   => (target code selector) => 0x8 // entry 1 of GDT
803 //    3 ist        => (stack) = 0 => current stack
804 //    5 reserved   => 0
805 //    4 type       => 0xe=>INT, 0xf=>TRAP 
806 //    1 reserved   => 0  (indicates "system" by being zero)
807 //    2 dpl        => 0
808 //    1 present    => 1
809 //   16 offsetmid  => 0
810 //   32 offsethigh => 0   (total is a 64 bit offset)
811 //   32 reserved   => 0
812 //
813 // 00 00 | 08 00 | 00 | 8[typenybble] | offsetmid | offsethigh | reserved
814 // 
815 // Note little endian
816 //
// 16-byte gate templates with zero offset fields; write_idt() patches in
// the handler offset.  Type nybble 0xf => trap gate, 0xe => interrupt gate.
static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 } ;
static uint64_t idt64_int_gate_entry_mask[2] =  { 0x00008e0000080000, 0x0 };
819
/*
 * Build the stub IDT in guest memory.  Every exception vector (0-31) gets
 * a trap gate and every interrupt vector (32-255) gets an interrupt gate,
 * all pointing at the null interrupt handler.  The handler address written
 * into the gates is a GVA (GPA + gva_offset).
 */
static void write_idt(struct v3_vm_info *vm)
{
    void *base;
    uint64_t limit;
    void *handler;
    uint64_t handler_len;
    int i;
    uint64_t trap_gate[2];
    uint64_t int_gate[2];

    get_idt_loc(vm,&base,&limit);

    get_null_int_handler_loc(vm,&handler,&handler_len);

    // gates hold virtual addresses, so shift the handler GPA by the offset
    handler += vm->hvm_state.gva_offset;

    // start from the templates (selector/type/present set, offset zero)
    memcpy(trap_gate,idt64_trap_gate_entry_mask,16);
    memcpy(int_gate,idt64_int_gate_entry_mask,16);

    if (handler) {
        // update the entries for the handler location
        uint8_t *mask;
        uint8_t *hand;
        
        hand = (uint8_t*) &handler;

        // splice the 64-bit handler address into the gate's split
        // offset fields (bits 0-15, 16-31, 32-63 of the descriptor)
        mask = (uint8_t *)trap_gate;
        memcpy(&(mask[0]),&(hand[0]),2); // offset low
        memcpy(&(mask[6]),&(hand[2]),2); // offset med
        memcpy(&(mask[8]),&(hand[4]),4); // offset high

        mask = (uint8_t *)int_gate;
        memcpy(&(mask[0]),&(hand[0]),2); // offset low
        memcpy(&(mask[6]),&(hand[2]),2); // offset med
        memcpy(&(mask[8]),&(hand[4]),4); // offset high

        PrintDebug(vm,VCORE_NONE,"hvm: Adding default null trap and int gates\n");
    }

    // vectors 0-31: exceptions get trap gates
    for (i=0;i<32;i++) { 
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)trap_gate);
    }

    // vectors 32-255: external interrupts get interrupt gates
    for (i=32;i<256;i++) { 
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)int_gate);
    }

    PrintDebug(vm,VCORE_NONE,"hvm: wrote IDT at %p\n",base);
}
869
870
871
872 static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
873 {
874     *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 3 * PAGE_SIZE);
875     *limit = 8*3;
876 }
877
// Minimal long-mode GDT: null descriptor, 64-bit code segment, data segment.
// Base/limit fields are ignored in long mode, so only the access bits matter.
static uint64_t gdt64[3] = {
    0x0000000000000000, /* null */
    0x00a09a0000000000, /* code (note lme bit) */
    0x00a0920000000000, /* data (most entries don't matter) */
};
883
884 static void write_gdt(struct v3_vm_info *vm)
885 {
886     void *base;
887     uint64_t limit;
888
889     get_gdt_loc(vm,&base,&limit);
890     v3_write_gpa_memory(&vm->cores[0],(addr_t)base,limit,(uint8_t*) gdt64);
891
892     PrintDebug(vm,VCORE_NONE,"hvm: wrote GDT at %p\n",base);
893 }
894
895
896
897 static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
898 {
899     *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 4 * PAGE_SIZE);
900     *limit = PAGE_SIZE;
901 }
902
903 static void write_tss(struct v3_vm_info *vm)
904 {
905     void *base;
906     uint64_t limit;
907
908     get_tss_loc(vm,&base,&limit);
909
910     v3_set_gpa_memory(&vm->cores[0],(addr_t)base,limit,0);
911
912     PrintDebug(vm,VCORE_NONE,"hvm: wrote TSS at %p\n",base);
913 }
914
915
916 #define TOP_HALF_START  0xffff800000000000ULL
917 #define BOTTOM_HALF_END 0x00007fffffffffffULL
918
919
920 #define L4_UNIT PAGE_SIZE
921 #define L3_UNIT (512ULL * L4_UNIT)
922 #define L2_UNIT (512ULL * L3_UNIT)
923 #define L1_UNIT (512ULL * L2_UNIT)
924
// Compute the number of page-table pages needed at each paging level to
// map [0, max_mem_mapped) down to 4 KB granularity.  l1..l4 are the counts
// of PML4, PDP, PD, and PT pages respectively.
static void compute_pts_4KB(struct v3_vm_info *vm, 
                            uint64_t *l1, uint64_t *l2, uint64_t *l3, uint64_t *l4)    
{

    // we map the physical memory up to max_mem_mapped either at 0x0 or at TOP_HALF start
    // that is, it either fills in the first 256 rows of PML4 or the last 256 rows of PML4
    // so it is the same number of page tables regardless

    uint64_t max_gva = vm->hvm_state.max_mem_mapped;

    *l1 = 1;  // 1 PML4
    *l2 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*512ULL*4096ULL),512);  // one PDP per 512 GB mapped
    *l3 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*4096ULL),512);         // one PD per 1 GB mapped
    *l4 = CEIL_DIV(CEIL_DIV(max_gva,4096ULL),512);                // one PT per 2 MB mapped
}
940
941
942
943 /*
944   PTS MAP using 1 GB pages
945   n second levels pts, highest gva, highest address
946   1 top level
947
948
949 OR
950   
951   PTS MAP using 2 MB pages
952   n third level pts, highest gva, highest address
953   m second level pts, highest gva, highest address
954   1 top level pt
955
956 OR
957
958   PTS MAP using 4 KB pages
959   n 4th level, highest gva, highest address
960   m 3rd level, highest gva, hihgest address
961   l second level, highest gva, highest address
962   1 top level pt
963
964 OR
965   PTS MAP using 512 GB pages when this becomes available
966
967 */
968
969
970 static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
971 {
972     uint64_t l1,l2,l3,l4;
973     uint64_t num_pt;
974
975     compute_pts_4KB(vm,&l1,&l2,&l3,&l4);
976
977     if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { 
978         num_pt = l1;
979     } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { 
980         num_pt = l1 + l2;
981     } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
982         num_pt = l1 + l2 + l3;
983     } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { 
984         num_pt = l1 + l2 + l3 + l4;
985     } else {
986         PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT location flags=0x%llx memsize=0x%llx\n",vm->hvm_state.hrt_flags,(uint64_t)vm->mem_size);
987         return;
988     }
989
990     *base = (void*)PAGE_ADDR(boot_state_end_addr(vm)-(4+num_pt)*PAGE_SIZE);
991     *limit = num_pt*PAGE_SIZE;
992 }
993
/*
 * Build the HRT's boot page tables directly in guest memory at the region
 * given by get_pt_loc().  The tables map [min_gva, min_gva+max_mem_mapped)
 * to GPAs [0, max_mem_mapped), where min_gva is either 0 or TOP_HALF_START.
 * The deepest level built (and therefore the page size used) is chosen
 * from the HRT's flags.  Tables are laid out level by level: the single
 * PML4 first, then all PDPs, all PDs, then all PTs.
 */
static void write_pts(struct v3_vm_info *vm)
{
    uint64_t size;
    uint64_t num_l1, num_l2, num_l3, num_l4;
    void *start_l1, *start_l2, *start_l3, *start_l4;
    uint64_t max_level;
    void *cur_pt;
    void *cur_gva;
    void *cur_gpa;
    void *min_gpa = 0;
    void *max_gpa = (void*) vm->hvm_state.max_mem_mapped;
    void *min_gva = (void*) vm->hvm_state.gva_offset;
#ifdef V3_CONFIG_DEBUG_HVM
    void *max_gva = min_gva+vm->hvm_state.max_mem_mapped;
#endif
    uint64_t i, pt;
    uint64_t i_start,i_end;
    
    struct pml4e64 *pml4e;
    struct pdpe64 *pdpe;
    struct pde64 *pde;
    struct pte64 *pte;

    // max_level: deepest table level to build (1=PML4 ... 4=PT)
    if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { 
        PrintError(vm,VCORE_NONE,"hvm: Attempt to build 512 GB pages\n");
        max_level = 1;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { 
        max_level = 2;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
        max_level = 3;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { 
        max_level = 4;
    } else {
        PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT levels\n");
        return;
    }

    get_pt_loc(vm,&start_l1,&size);
    compute_pts_4KB(vm,&num_l1,&num_l2,&num_l3,&num_l4);

    // each level's tables are contiguous, starting right after the previous level's
    start_l2=start_l1+PAGE_SIZE*num_l1;
    start_l3=start_l2+PAGE_SIZE*num_l2;
    start_l4=start_l3+PAGE_SIZE*num_l3;

    PrintDebug(vm,VCORE_NONE,"hvm: writing %llu levels of PTs start at address %p\n", max_level,start_l1);
    PrintDebug(vm,VCORE_NONE,"hvm: min_gva=%p, max_gva=%p, min_gpa=%p, max_gpa=%p\n",min_gva,max_gva,min_gpa,max_gpa);
    PrintDebug(vm,VCORE_NONE,"hvm: num_l1=%llu, num_l2=%llu, num_l3=%llu, num_l4=%llu\n", num_l1, num_l2, num_l3, num_l4);
    PrintDebug(vm,VCORE_NONE,"hvm: start_l1=%p, start_l2=%p, start_l3=%p, start_l4=%p\n", start_l1, start_l2, start_l3, start_l4);

    cur_pt=start_l1;

    // build PML4 (only one)
    if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pml4e)) { 
        PrintError(vm,VCORE_NONE,"hvm: Cannot translate pml4 location\n");
        return;
    }

    memset(pml4e,0,PAGE_SIZE);

    // a lower-half mapping fills PML4 slots [0,num_l2); a top-half mapping
    // fills slots [256, 256+num_l2)
    if (min_gva==0x0) { 
        i_start=0; i_end = num_l2;
    } else if (min_gva==(void*)TOP_HALF_START) { 
        i_start=256; i_end=256+num_l2;
    } else {
        PrintError(vm,VCORE_NONE,"hvm: unsupported gva offset\n");
        return;
    }

    for (i=i_start, cur_gva=min_gva, cur_gpa=min_gpa;
         (i<i_end);
         i++, cur_gva+=L1_UNIT, cur_gpa+=L1_UNIT) {

        pml4e[i].present=1;
        pml4e[i].writable=1;
        
        if (max_level==1) { 
            // 512 GB "large page": no such thing in current hardware
            PrintError(vm,VCORE_NONE,"hvm: Intel has not yet defined a PML4E large page\n");
            pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
            //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
        } else {
            // point at the corresponding PDP table in the next-level region
            pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(start_l2+(i-i_start)*PAGE_SIZE));
            //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
        }
    }

    // 512 GB only
    if (max_level==1) {
        return;
    }



    for (cur_pt=start_l2, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
         pt<num_l2;
         cur_pt+=PAGE_SIZE, pt++) { 

        // build PDPE
        if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pdpe)) { 
            PrintError(vm,VCORE_NONE,"hvm: Cannot translate pdpe location\n");
            return;
        }
        
        memset(pdpe,0,PAGE_SIZE);
        
        // each PDPE maps 1 GB (L2_UNIT); stop early once max_gpa is covered
        for (i=0; 
             i<512 && cur_gpa<max_gpa; 
             i++, cur_gva+=L2_UNIT, cur_gpa+=L2_UNIT) {

            pdpe[i].present=1;
            pdpe[i].writable=1;
        
            if (max_level==2) { 
                // terminal 1 GB large page mapping straight to the GPA
                pdpe[i].large_page=1;
                pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
                //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
            } else {
                // point at PD number (pt*512 + i) in the next-level region
                pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(start_l3+(pt*512+i)*PAGE_SIZE));
                //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
            }
        }
    }
        
    //1 GB only
    if (max_level==2) { 
        return;
    }

    for (cur_pt=start_l3, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
         pt<num_l3;
         cur_pt+=PAGE_SIZE, pt++) { 

        // build PDE
        if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pde)) { 
            PrintError(vm,VCORE_NONE,"hvm: Cannot translate pde location\n");
            return;
        }
        
        memset(pde,0,PAGE_SIZE);
        
        // each PDE maps 2 MB (L3_UNIT)
        for (i=0; 
             i<512 && cur_gpa<max_gpa; 
             i++, cur_gva+=L3_UNIT, cur_gpa+=L3_UNIT) {

            pde[i].present=1;
            pde[i].writable=1;
        
            if (max_level==3) { 
                // terminal 2 MB large page mapping straight to the GPA
                pde[i].large_page=1;
                pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
                //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t) pde[i].pt_base_addr);
            } else {
                // point at PT number (pt*512 + i) in the next-level region
                pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(start_l4+(pt*512+i)*PAGE_SIZE));
                //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pde[i].pt_base_addr);
            }
        }
    }

    //2 MB only
    if (max_level==3) { 
        return;
    }


    // 4 KB
    for (cur_pt=start_l4, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
         pt<num_l4;
         cur_pt+=PAGE_SIZE, pt++) { 

        // build PTE
        if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pte)) { 
            PrintError(vm,VCORE_NONE,"hvm: Cannot translate pte location\n");
            return;
        }
        
        memset(pte,0,PAGE_SIZE);
        
        // each PTE maps a single 4 KB page (L4_UNIT)
        for (i=0; 
             i<512 && cur_gpa<max_gpa; 
             i++, cur_gva+=L4_UNIT, cur_gpa+=L4_UNIT) {

            pte[i].present=1;
            pte[i].writable=1;
            pte[i].page_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
            //PrintDebug(vm,VCORE_NONE,"hvm: pte: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pte[i].page_base_addr);
        }
    }

    return;
}
1183
1184
1185 static void get_mb_info_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
1186 {
1187     
1188     get_pt_loc(vm,base, limit);
1189     *base-=PAGE_SIZE;
1190     *limit=PAGE_SIZE;
1191 }
1192
1193
/*
 * Fill in the HRT-specific multiboot info tag that is handed to the HRT
 * at boot.  Copies the APIC topology and the HVM configuration recorded
 * in vm->hvm_state (established earlier by configure_hrt()).
 * Always returns 0.
 */
int v3_build_hrt_multiboot_tag(struct guest_info *core, mb_info_hrt_t *hrt)
{
    struct v3_vm_info *vm = core->vm_info;

    hrt->tag.type = MB_INFO_HRT_TAG;
    hrt->tag.size = sizeof(mb_info_hrt_t);

    // APIC topology: HRT cores are those with id >= first_hrt_core
    hrt->total_num_apics = vm->num_cores;
    hrt->first_hrt_apic_id = vm->hvm_state.first_hrt_core;
    hrt->have_hrt_ioapic=0;
    hrt->first_hrt_ioapic_entry=0;

    hrt->cpu_freq_khz = V3_CPU_KHZ();

    // memory/paging/communication configuration
    hrt->hrt_flags = vm->hvm_state.hrt_flags;
    hrt->max_mem_mapped = vm->hvm_state.max_mem_mapped;
    hrt->first_hrt_gpa = vm->hvm_state.first_hrt_gpa;
    hrt->gva_offset = vm->hvm_state.gva_offset;
    hrt->comm_page_gpa = vm->hvm_state.comm_page_gpa;
    hrt->hrt_int_vector = vm->hvm_state.hrt_int_vector;
    
    return 0;
}
1217
/*
 * Build the multiboot info table for the HRT and write it into the
 * reserved MB info page in guest memory.  Only HRT_MBOOT64 HRTs are
 * supported.
 */
static void write_mb_info(struct v3_vm_info *vm) 
{
    if (vm->hvm_state.hrt_type!=HRT_MBOOT64) { 
        PrintError(vm, VCORE_NONE,"hvm: Cannot handle this HRT type\n");
        return;
    } else {
        uint8_t buf[256];
        uint64_t size;
        void *base;
        uint64_t limit;

        get_mb_info_loc(vm,&base,&limit);
        
        // build the table into a scratch buffer; -1 (wrapped to
        // UINT64_MAX in size) indicates failure
        if ((size=v3_build_multiboot_table(&vm->cores[vm->hvm_state.first_hrt_core],buf,256))==-1) { 
            PrintError(vm,VCORE_NONE,"hvm: Failed to build MB info\n");
            return;
        }

        if (size>limit) { 
            PrintError(vm,VCORE_NONE,"hvm: MB info is too large\n");
            return;
        }
        
        v3_write_gpa_memory(&vm->cores[vm->hvm_state.first_hrt_core],
                            (addr_t)base,
                            size,
                            buf);

        PrintDebug(vm,VCORE_NONE, "hvm: wrote MB info at %p\n", base);
    }
}
1249
1250 #define SCRATCH_STACK_SIZE 4096
1251
1252
1253 static void get_hrt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
1254 {
1255     void *mb_base;
1256     uint64_t mb_limit;
1257     
1258     get_mb_info_loc(vm,&mb_base,&mb_limit);
1259     
1260     mb_base-=SCRATCH_STACK_SIZE*v3_get_hvm_hrt_cores(vm);
1261
1262     *base = (void*)PAGE_ADDR(vm->hvm_state.first_hrt_gpa);
1263
1264     if (mb_base < *base+PAGE_SIZE) { 
1265         PrintError(vm,VCORE_NONE,"hvm: HRT stack colides with HRT\n");
1266     }
1267
1268     *limit = mb_base - *base;
1269 }
1270
1271
1272 #define ERROR(fmt, args...) PrintError(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
1273 #define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
1274
1275 #define ELF_MAGIC    0x464c457f
1276 #define MB2_MAGIC    0xe85250d6
1277
1278 #define MB2_INFO_MAGIC    0x36d76289
1279
1280 static int is_elf(uint8_t *data, uint64_t size)
1281 {
1282     if (*((uint32_t*)data)==ELF_MAGIC) {
1283         return 1;
1284     } else { 
1285         return 0;
1286     }
1287 }
1288
1289 static mb_header_t *find_mb_header(uint8_t *data, uint64_t size)
1290 {
1291     uint64_t limit = size > 32768 ? 32768 : size;
1292     uint64_t i;
1293
1294     // Scan for the .boot magic cookie
1295     // must be in first 32K, assume 4 byte aligned
1296     for (i=0;i<limit;i+=4) { 
1297         if (*((uint32_t*)&data[i])==MB2_MAGIC) {
1298             INFO("Found multiboot header at offset 0x%llx\n",i);
1299             return (mb_header_t *) &data[i];
1300         }
1301     }
1302     return 0;
1303 }
1304
1305
1306 static int configure_hrt(struct v3_vm_info *vm, mb_data_t *mb)
1307 {
1308     struct v3_vm_hvm *h = &vm->hvm_state;
1309     uint64_t f = mb->mb64_hrt->hrt_flags;
1310     uint64_t maxmap = mb->mb64_hrt->max_mem_to_map;
1311     uint64_t gvaoff = mb->mb64_hrt->gva_offset;
1312     uint64_t gvaentry = mb->mb64_hrt->gva_entry;
1313     uint64_t commgpa = mb->mb64_hrt->comm_page_gpa;
1314     uint8_t  vec = mb->mb64_hrt->hrt_int_vector;
1315     
1316
1317     PrintDebug(vm,VCORE_NONE,"hvm: HRT request: flags=0x%llx max_map=0x%llx gva_off=%llx gva_entry=%llx comm_page=0x%llx vector=0x%x\n",
1318                f, maxmap, gvaoff,gvaentry,commgpa, vec);
1319
1320     if (maxmap<0x100000000ULL) { 
1321         PrintDebug(vm,VCORE_NONE,"hvm: revising request up to 4 GB max map\n");
1322         maxmap=0x100000000ULL;
1323     }
1324
1325     if (f & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { 
1326         PrintError(vm,VCORE_NONE,"hvm: support for 512 GB pages is not yet available in hardware\n");
1327         return -1;
1328     } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { 
1329         f &= ~0x3c;
1330         f |= MB_TAG_MB64_HRT_FLAG_MAP_1GB;
1331         h->max_mem_mapped = maxmap;
1332         PrintDebug(vm,VCORE_NONE,"hvm: 1 GB pages selected\n");
1333     } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_2MB) { 
1334         f &= ~0x3c;
1335         f |= MB_TAG_MB64_HRT_FLAG_MAP_2MB;
1336         h->max_mem_mapped = maxmap;
1337         PrintDebug(vm,VCORE_NONE,"hvm: 2 MB pages selected\n");
1338     } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { 
1339         f &= ~0x3c;
1340         f |= MB_TAG_MB64_HRT_FLAG_MAP_4KB;
1341         h->max_mem_mapped = maxmap;
1342         PrintDebug(vm,VCORE_NONE,"hvm: 4 KB pages selected\n");
1343     } else {
1344         PrintError(vm,VCORE_NONE,"hvm: no page table model is requested\n");
1345         return -1;
1346     }
1347
1348     if (f & MB_TAG_MB64_HRT_FLAG_RELOC) {
1349         PrintError(vm,VCORE_NONE,"hvm: relocatable hrt not currently supported\n");
1350         return -1;
1351     }
1352
1353     h->hrt_flags = f;
1354
1355     if (maxmap>h->max_mem_mapped) { 
1356         PrintError(vm,VCORE_NONE,"hvm: requested 0x%llx bytes mapped, which is more than currently supported\n",maxmap);
1357         return -1;
1358     }
1359
1360     if (gvaoff!=0 && gvaoff!=TOP_HALF_START) { 
1361         PrintError(vm,VCORE_NONE,"hvm: currently only GVA offsets of 0 and %llx are supported\n", TOP_HALF_START);
1362         return -1;
1363     }
1364     
1365     h->gva_offset = gvaoff;
1366
1367     h->gva_entry = gvaentry;
1368
1369     if (mb->addr->load_addr < h->first_hrt_gpa) { 
1370         PrintError(vm,VCORE_NONE,"hvm: load start address of HRT is below first HRT GPA\n");
1371         return -1;
1372     }
1373     
1374     if (mb->addr->bss_end_addr > (vm->mem_size-(1024*1024*64))) {
1375         PrintError(vm,VCORE_NONE,"hvm: bss end address of HRT above last allowed GPA\n");
1376         return -1;
1377     }
1378     
1379     if (vec<32) { 
1380         PrintError(vm,VCORE_NONE,"hvm: cannot support vector %x\n",vec);
1381         return -1;
1382     }
1383     
1384     h->hrt_int_vector = vec;
1385     
1386     
1387     if (commgpa < vm->mem_size) { 
1388         PrintError(vm,VCORE_NONE,"hvm: cannot map comm page over physical memory\n");
1389         return -1;
1390     } 
1391
1392     h->comm_page_gpa = commgpa;
1393
1394     if (!h->comm_page_hpa) { 
1395         if (!(h->comm_page_hpa=V3_AllocPages(1))) { 
1396             PrintError(vm,VCORE_NONE,"hvm: unable to allocate space for comm page\n");
1397             return -1;
1398         }
1399
1400         h->comm_page_hva = V3_VAddr(h->comm_page_hpa);
1401         
1402         memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
1403         
1404         if (v3_add_shadow_mem(vm,-1,h->comm_page_gpa,h->comm_page_gpa+PAGE_SIZE_4KB,(addr_t)h->comm_page_hpa)) { 
1405             PrintError(vm,VCORE_NONE,"hvm: unable to map communication page\n");
1406             V3_FreePages((void*)(h->comm_page_gpa),1);
1407             return -1;
1408         }
1409         
1410         
1411         PrintDebug(vm,VCORE_NONE,"hvm: added comm page for first time\n");
1412     }
1413
1414     memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
1415     
1416     
1417     PrintDebug(vm,VCORE_NONE,"hvm: HRT configuration: flags=0x%llx max_mem_mapped=0x%llx gva_offset=0x%llx gva_entry=0x%llx comm_page=0x%llx vector=0x%x\n",
1418                h->hrt_flags,h->max_mem_mapped, h->gva_offset,h->gva_entry, h->comm_page_gpa, h->hrt_int_vector);
1419     
1420     return 0;
1421
1422 }
1423
/*
 * Parse a multiboot HRT kernel image, apply its HVM configuration request,
 * and load it into guest memory starting at first_hrt_gpa.  Records the
 * entry point (as a GVA) and HRT type in vm->hvm_state.
 * Returns 0 on success, -1 on failure.
 */
static int setup_mb_kernel_hrt(struct v3_vm_info *vm, void *data, uint64_t size)
{
    mb_data_t mb;

    if (v3_parse_multiboot_header(data, size, &mb)) { 
        PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n");
        return -1;
    }

    // the HRT-specific tag is mandatory for an HVM kernel
    if (!mb.mb64_hrt) { 
        PrintError(vm,VCORE_NONE,"hvm: invalid HRT - there is no MB64_HRT tag\n");
        return -1;
    }

    if (configure_hrt(vm,&mb)) {
        PrintError(vm,VCORE_NONE, "hvm: cannot configure HRT\n");
        return -1;
    }
    
    if (v3_write_multiboot_kernel(vm,&mb,data,size,
                                  (void*)vm->hvm_state.first_hrt_gpa,
                                  vm->mem_size-vm->hvm_state.first_hrt_gpa)) {
        PrintError(vm,VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n");
        return -1;
    }

    // an explicit GVA entry point overrides the one in the MB header
    if (vm->hvm_state.gva_entry) { 
        vm->hvm_state.hrt_entry_addr = vm->hvm_state.gva_entry;
    } else {
        vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + vm->hvm_state.gva_offset;
    }

    vm->hvm_state.hrt_type = HRT_MBOOT64;

    return 0;

}
1461
1462
1463 static int setup_hrt(struct v3_vm_info *vm)
1464 {
1465     void *data;
1466     uint64_t size;
1467
1468     // If the ROS has installed an image, it takes priority
1469     if (vm->hvm_state.hrt_image) { 
1470         data = vm->hvm_state.hrt_image;
1471         size = vm->hvm_state.hrt_image_size;
1472     } else {
1473         data = vm->hvm_state.hrt_file->data;
1474         size = vm->hvm_state.hrt_file->size;
1475     }
1476         
1477     if (is_elf(data,size) &&
1478         find_mb_header(data,size)) {
1479
1480         PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n");
1481         if (setup_mb_kernel_hrt(vm,data,size)) { 
1482             PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n");
1483             return -1;
1484         } 
1485     } else {
1486         PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not a multiboot kernel\n");
1487         return -1;
1488     }
1489
1490     return 0;
1491 }
1492
1493
1494         
1495
1496 /*
1497   GPA layout:
1498
1499   HRT
1500   ---
1501   ROS
1502
1503   We do not touch the ROS portion of the address space.
1504   The HRT portion looks like:
1505
1506   INT_HANDLER (1 page - page aligned)
1507   IDT (1 page - page aligned)
1508   GDT (1 page - page aligned)
1509   TSS (1 page - page asligned)
1510   PAGETABLES  (identy map of first N GB)
1511      ROOT PT first (lowest memory addr), followed by 2nd level PTs in order,
1512      followed by 3rd level PTs in order, followed by 4th level
1513      PTs in order.  
1514   MBINFO_PAGE
1515   SCRATCH_STACK_HRT_CORE0 
1516   SCRATCH_STACK_HRT_CORE1
1517   ..
1518   SCRATCH_STACK_HRT_COREN
1519   ...
1520   HRT (as many pages as needed, page-aligned, starting at first HRT address)
1521   ---
1522   ROS
1523
1524
1525 */
1526
1527
/*
 * Lay out all HVM boot state in guest memory: load the HRT image, then
 * write the null interrupt handler, IDT, GDT, TSS, page tables, and
 * finally the multiboot info table.  A no-op for non-HVM VMs.
 * Returns 0 on success, -1 if the HRT cannot be set up.
 */
int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm)
{
    if (!vm->hvm_state.is_hvm) { 
        PrintDebug(vm,VCORE_NONE,"hvm: skipping HVM setup for boot as this is not an HVM\n");
        return 0;
    }

    PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory begins\n");

    // setup_hrt() establishes hrt_flags/gva_offset/etc., which determine
    // where everything else lands
    if (setup_hrt(vm)) {
        PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n");
        return -1;
    } 

    // the locations of all the other items are determined by
    // the HRT setup, so these must happen after

    write_null_int_handler(vm);
    write_idt(vm);
    write_gdt(vm);
    write_tss(vm);

    write_pts(vm);

    // this must happen last
    write_mb_info(vm);

    PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory done\n");

    return 0;
}
1559
1560 /*
1561   On entry for every core:
1562
1563    IDTR points to stub IDT
1564    GDTR points to stub GDT
1565    TS   points to stub TSS
1566    CR3 points to root page table
1567    CR0 has PE and PG
1568    EFER has LME AND LMA (and NX for compatibility with Linux)
1569    RSP is TOS of core's scratch stack (looks like a call)
1570
1571    RAX = MB magic cookie
1572    RBX = address of multiboot info table
1573    RCX = this core id / apic id (0..N-1)
1574    RDX = this core id - first HRT core ID (==0 for the first HRT core)
1575
1576    All addresses are virtual addresses, offset as needed by gva_offset
1577
1578    Other regs are zeroed
1579
1580    shadow/nested paging state reset for long mode
1581
1582 */
1583 int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
1584 {
1585     void *base;
1586     uint64_t limit;
1587     uint64_t gva_offset;
1588
1589     rdtscll(core->hvm_state.last_boot_start);
1590     
1591
1592     if (!core->hvm_state.is_hrt) { 
1593         PrintDebug(core->vm_info,core,"hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id);
1594         return 0;
1595     }
1596
1597
1598     PrintDebug(core->vm_info, core, "hvm: setting up HRT core (%u) for boot\n", core->vcpu_id);
1599
1600     gva_offset = core->vm_info->hvm_state.gva_offset;
1601     
1602     memset(&core->vm_regs,0,sizeof(core->vm_regs));
1603     memset(&core->ctrl_regs,0,sizeof(core->ctrl_regs));
1604     memset(&core->dbg_regs,0,sizeof(core->dbg_regs));
1605     memset(&core->segments,0,sizeof(core->segments));    
1606     memset(&core->msrs,0,sizeof(core->msrs));    
1607     memset(&core->fp_state,0,sizeof(core->fp_state));    
1608
1609     // We are in long mode with virtual memory and we want
1610     // to start immediatley
1611     core->cpl = 0; // we are going right into the kernel
1612     core->cpu_mode = LONG;
1613     core->mem_mode = VIRTUAL_MEM; 
1614     core->core_run_state = CORE_RUNNING ;
1615
1616
1617     // magic
1618     core->vm_regs.rax = MB2_INFO_MAGIC;
1619
1620     // multiboot info pointer
1621     get_mb_info_loc(core->vm_info, &base,&limit);
1622     core->vm_regs.rbx = (uint64_t) base + gva_offset;  
1623
1624     // core number
1625     core->vm_regs.rcx = core->vcpu_id;
1626     
1627     // HRT core number
1628     core->vm_regs.rdx = core->vcpu_id - core->vm_info->hvm_state.first_hrt_core;
1629
1630     // Now point to scratch stack for this core
1631     // it begins at an ofset relative to the MB info page
1632     get_mb_info_loc(core->vm_info, &base,&limit);
1633     base = base + gva_offset;
1634     base -= core->vm_regs.rdx * SCRATCH_STACK_SIZE;
1635     core->vm_regs.rsp = (v3_reg_t) base;  
1636     core->vm_regs.rbp = (v3_reg_t) base-8; 
1637
1638     // push onto the stack a bad rbp and bad return address
1639     core->vm_regs.rsp-=16;
1640     v3_set_gpa_memory(core,
1641                       core->vm_regs.rsp-gva_offset,
1642                       16,
1643                       0xff);
1644
1645
1646     // HRT entry point
1647     get_hrt_loc(core->vm_info, &base,&limit);
1648     if (core->vm_info->hvm_state.gva_entry) { 
1649       core->rip = core->vm_info->hvm_state.gva_entry;
1650     } else {
1651       core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr + gva_offset; 
1652     }
1653       
1654
1655
1656     PrintDebug(core->vm_info,core,"hvm: hrt core %u has rip=%p, rsp=%p, rbp=%p, rax=%p, rbx=%p, rcx=%p, rdx=%p\n",
1657                (core->vcpu_id - core->vm_info->hvm_state.first_hrt_core),
1658                (void*)(core->rip),
1659                (void*)(core->vm_regs.rsp),
1660                (void*)(core->vm_regs.rbp),
1661                (void*)(core->vm_regs.rax),
1662                (void*)(core->vm_regs.rbx),
1663                (void*)(core->vm_regs.rcx),
1664                (void*)(core->vm_regs.rdx));
1665
1666     // Setup CRs for long mode and our stub page table
1667     // CR0: PG, PE
1668     core->ctrl_regs.cr0 = 0x80000001;
1669     core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0;
1670
1671     // CR2: don't care (output from #PF)
1672     // CE3: set to our PML4E, without setting PCD or PWT
1673     get_pt_loc(core->vm_info, &base,&limit);
1674     core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base);  // not offset as this is a GPA
1675     core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3;
1676
1677     // CR4: PGE, PAE, PSE (last byte: 1 0 1 1 0 0 0 0)
1678     core->ctrl_regs.cr4 = 0xb0;
1679     core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4;
1680     // CR8 as usual
1681     // RFLAGS zeroed is fine: come in with interrupts off
1682     // EFER needs SVME LMA LME (last 16 bits: 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0
1683     core->ctrl_regs.efer = 0x1d00;
1684     core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer;
1685
1686
1687     /* 
1688        Notes on selectors:
1689
1690        selector is 13 bits of index, 1 bit table indicator 
1691        (0=>GDT), 2 bit RPL
1692        
1693        index is scaled by 8, even in long mode, where some entries 
1694        are 16 bytes long.... 
1695           -> code, data descriptors have 8 byte format
1696              because base, limit, etc, are ignored (no segmentation)
1697           -> interrupt/trap gates have 16 byte format 
1698              because offset needs to be 64 bits
1699     */
1700     
1701     // Install our stub IDT
1702     get_idt_loc(core->vm_info, &base,&limit);
1703     base += gva_offset;
1704     core->segments.idtr.selector = 0;  // entry 0 (NULL) of the GDT
1705     core->segments.idtr.base = (addr_t) base;  // only base+limit are used
1706     core->segments.idtr.limit = limit-1;
1707     core->segments.idtr.type = 0x0;
1708     core->segments.idtr.system = 0; 
1709     core->segments.idtr.dpl = 0;
1710     core->segments.idtr.present = 0;
1711     core->segments.idtr.long_mode = 0;
1712
1713     // Install our stub GDT
1714     get_gdt_loc(core->vm_info, &base,&limit);
1715     base += gva_offset;
1716     core->segments.gdtr.selector = 0;  // entry 0 (NULL) of the GDT
1717     core->segments.gdtr.base = (addr_t) base;
1718     core->segments.gdtr.limit = limit-1;   // only base+limit are used
1719     core->segments.gdtr.type = 0x0;
1720     core->segments.gdtr.system = 0; 
1721     core->segments.gdtr.dpl = 0;
1722     core->segments.gdtr.present = 0;
1723     core->segments.gdtr.long_mode = 0;
1724     
1725     // And our TSS
1726     get_tss_loc(core->vm_info, &base,&limit);
1727     base += gva_offset;  
1728     core->segments.tr.selector = 0;
1729     core->segments.tr.base = (addr_t) base;
1730     core->segments.tr.limit = limit-1;
1731     core->segments.tr.type = 0x9;
1732     core->segments.tr.system = 0;   // available 64 bit TSS 
1733     core->segments.tr.dpl = 0;
1734     core->segments.tr.present = 1;
1735     core->segments.tr.long_mode = 0; // not used
1736     
1737     base = 0x0; // these are not offset as we want to make all gvas visible
1738     limit = -1;
1739
1740     // And CS
1741     core->segments.cs.selector = 0x8 ; // entry 1 of GDT (RPL=0)
1742     core->segments.cs.base = (addr_t) base;   // not used
1743     core->segments.cs.limit = limit;          // not used
1744     core->segments.cs.type = 0xe;             // only C is used
1745     core->segments.cs.system = 1;             // not a system segment
1746     core->segments.cs.dpl = 0;                       
1747     core->segments.cs.present = 1;
1748     core->segments.cs.long_mode = 1;
1749
1750     // DS, SS, etc are identical
1751     core->segments.ds.selector = 0x10; // entry 2 of GDT (RPL=0)
1752     core->segments.ds.base = (addr_t) base;
1753     core->segments.ds.limit = limit;
1754     core->segments.ds.type = 0x6;            // ignored
1755     core->segments.ds.system = 1;            // not a system segment
1756     core->segments.ds.dpl = 0;
1757     core->segments.ds.present = 1;
1758     core->segments.ds.long_mode = 1;
1759     
1760     memcpy(&core->segments.ss,&core->segments.ds,sizeof(core->segments.ds));
1761     memcpy(&core->segments.es,&core->segments.ds,sizeof(core->segments.ds));
1762     memcpy(&core->segments.fs,&core->segments.ds,sizeof(core->segments.ds));
1763     memcpy(&core->segments.gs,&core->segments.ds,sizeof(core->segments.ds));
1764     
1765
1766     // reset paging here for shadow... 
1767
1768     if (core->shdw_pg_mode != NESTED_PAGING) { 
1769         PrintError(core->vm_info, core, "hvm: shadow paging guest... this will end badly\n");
1770         return -1;
1771     }
1772
1773
1774     return 0;
1775 }
1776
/*
 * Perform the HRT side of an HVM reset on this core.
 *
 * Returns:
 *    0  - not applicable (core not resetting, VM is not an HVM, or this
 *         is a ROS core, which is reset via the normal path)
 *    1  - HRT reset completed successfully on this core
 *   <0  - reset failed (rc from the setup functions)
 *
 * All HRT cores rendezvous at the reset barrier twice: once before the
 * boot-state rebuild, and once after, so that no core resumes running
 * until every core has been reinitialized.  The first HRT core acts as
 * leader and additionally rebuilds VM-wide boot state and drives the
 * VM run-state transitions.
 */
int v3_handle_hvm_reset(struct guest_info *core)
{

    if (core->core_run_state != CORE_RESETTING) { 
        return 0;
    }

    if (!core->vm_info->hvm_state.is_hvm) { 
        return 0;
    }

    if (v3_is_hvm_hrt_core(core)) { 
        // this is an HRT reset
        int rc=0;

        // wait for all the HRT cores
        v3_counting_barrier(&core->vm_info->reset_barrier);

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) { 
            // I am leader
            core->vm_info->run_state = VM_RESETTING;
        }

        core->core_run_state = CORE_RESETTING;

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
            // Leader alone rebuilds the VM-wide boot state.
            // we really only need to clear the bss
            // and recopy the .data, but for now we'll just
            // do everything
            rc |= v3_setup_hvm_vm_for_boot(core->vm_info);

            if (rc) { 
                PrintError(core->vm_info,core,"hvm: failed to setup HVM VM for boot rc=%d\n",rc);
            }
        }

        // now everyone is ready to reset
        // Each core (leader included) reinitializes its own boot state.
        rc |= v3_setup_hvm_hrt_core_for_boot(core);

        if (rc) { 
            PrintError(core->vm_info,core,"hvm: failed to setup HVM core for boot rc=%d\n",rc);
        }

        core->core_run_state = CORE_RUNNING;

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) { 
            // leader
            core->vm_info->run_state = VM_RUNNING;
            core->vm_info->hvm_state.trans_state = HRT_IDLE;
        }

        // Second rendezvous: no core proceeds until all have rebuilt state.
        v3_counting_barrier(&core->vm_info->reset_barrier);

        // NOTE(review): rc is accumulated with |=, so the rc<0 test assumes
        // the setup functions return -1 (all bits set) on error -- confirm.
        if (rc<0) { 
            PrintError(core->vm_info,core,"hvm: reset failed\n");
            return rc;
        } else {
            return 1;
        }

    } else { 
        // ROS core will be handled by normal reset functionality
        return 0;
    }
}
1842
1843
1844 int v3_handle_hvm_entry(struct guest_info *core)
1845 {
1846     if (!core->vm_info->hvm_state.is_hvm        // not relevant to non-HVM
1847         || core->hvm_state.is_hrt              // not relevant to an HRT in an HVM
1848         || !core->vm_info->hvm_state.ros_signal.code) { // not relevant if there is no code to inject
1849
1850         // Note that above check for code could race with a writer, but
1851         // if that happens, we'll simply inject at the next opportunity instead of 
1852         // this one (see below for atomic update)
1853         return 0;
1854     } else {
1855         struct v3_ros_signal *s = &core->vm_info->hvm_state.ros_signal;
1856
1857         // HVM ROS
1858         if (! (s->handler && // handler installed
1859                s->cr3 &&     // process installed
1860                s->stack &&   // stack installed
1861                core->cpl == 3 &&  // user mode
1862                core->ctrl_regs.cr3 == s->cr3) // right process active
1863             ) {
1864             // Cannot inject at this time
1865             return 0;
1866         } else {
1867             // We can inject now, let's atomically see if we have something
1868             // and commit to doing it if we do
1869             uint64_t code;
1870
1871             // Get code, reset to allow next one
1872             code = __sync_fetch_and_and(&(s->code), 0);
1873
1874             if (!code) { 
1875                 // nothing to do after all
1876                 return 0;
1877             } else {
1878
1879                 // actually do inject
1880
1881                 uint64_t rsp;
1882                 uint64_t frame[6];
1883                 
1884                 PrintDebug(core->vm_info,core,"hvm: ROS interrupt starting with rip=%p rsp=%p\n", (void*) core->rip, (void*) core->vm_regs.rsp);
1885                 // build interrupt frame
1886                 frame[0] = code;
1887                 frame[1] = core->rip;
1888                 frame[2] = core->segments.cs.selector; // return cs
1889                 frame[3] = core->ctrl_regs.rflags;
1890                 frame[4] = core->vm_regs.rsp;
1891                 frame[5] = core->segments.ss.selector; // return ss
1892                 
1893                 rsp = (s->stack - 8) & (~0x7); // make sure we are aligned
1894                 rsp -= sizeof(frame);
1895                 
1896
1897                 if (v3_write_gva_memory(core,(addr_t)rsp,sizeof(frame),(uint8_t*)frame)!=sizeof(frame)) { 
1898                     PrintError(core->vm_info,core,"hvm: failed to write interrupt frame\n");
1899                     // we just lost this inject
1900                     return -1;
1901                 }
1902                 
1903                 // now make us look like we are jumping to the entry
1904                 core->rip = s->handler;
1905                 core->vm_regs.rsp = rsp;
1906
1907                 PrintDebug(core->vm_info,core,"hvm: ROS frame is 0x%llx|0x%llx|0x%llx|0x%llx|0x%llx|0x%llx and and on entry rip=%p and rsp=%p\n", frame[0],frame[1],frame[2],frame[3],frame[4],frame[5],(void*) core->rip, (void*) core->vm_regs.rsp);
1908                 
1909                 // and we should be good to go
1910                 return 0;
1911             } 
1912         }
1913     }
1914 }
1915
/*
 * Hook invoked on VM exit of an HVM core.
 * No exit-time processing is needed at present; always succeeds.
 */
int v3_handle_hvm_exit(struct guest_info *core)
{
    (void)core;   // unused for now

    return 0;
}
1921
1922 int v3_hvm_signal_ros(struct v3_vm_info *vm, uint64_t code)
1923 {
1924     struct v3_ros_signal *s = &vm->hvm_state.ros_signal;
1925
1926     if (!code) { 
1927         PrintError(vm,VCORE_NONE,"hvm: cannot signal ros with code zero\n");
1928         return -1;
1929     }
1930
1931     // handler, etc, must exist
1932     if (!s->handler || !s->stack) { 
1933         PrintError(vm,VCORE_NONE,"hvm: cannot signal ros with no installed handler\n");
1934         return -1;
1935     } else {
1936         // we set the code only if we are idle (code 0), 
1937         // and we do so only 
1938         if (!__sync_bool_compare_and_swap(&(s->code), 0, code)) {
1939             PrintError(vm,VCORE_NONE,"hvm: signal was already asserted\n");
1940             return -1;
1941         } else {
1942             PrintDebug(vm,VCORE_NONE,"hvm: raised signal 0x%llx to the ROS\n",code);
1943             return 0;
1944         }
1945     }
1946 }
1947
1948
1949