Palacios Public Git Repository

To checkout Palacios execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git
This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, simply execute
  cd palacios
  git checkout --track -b devel origin/devel
The other branches are similar.


HRT image replacement from ROS application
[palacios.git] / palacios / src / palacios / vmm_hvm.c
1 /* 
2  * This file is part of the Palacios Virtual Machine Monitor developed
3  * by the V3VEE Project with funding from the United States National 
4  * Science Foundation and the Department of Energy.  
5  *
6  * The V3VEE Project is a joint project between Northwestern University
7  * and the University of New Mexico.  You can find out more at 
8  * http://www.v3vee.org
9  *
10  * Copyright (c) 2015, The V3VEE Project <http://www.v3vee.org> 
11  * All rights reserved.
12  *
13  * Author:  Peter Dinda <pdinda@northwestern.edu>
14  *
15  * This is free software.  You are permitted to use,
16  * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
17  */
18
19 #include <palacios/vmm_mem.h>
20 #include <palacios/vmm.h>
21 #include <palacios/vmm_util.h>
22 #include <palacios/vmm_emulator.h>
23 #include <palacios/vm_guest.h>
24 #include <palacios/vmm_debug.h>
25 #include <palacios/vmm_hypercall.h>
26
27 #include <palacios/vmm_xml.h>
28
29 #include <palacios/vm_guest_mem.h>
30
31 #include <palacios/vmm_debug.h>
32
33
34 /*
35
36   MEM     = Total size of memory in the GPA (in MB)
37   ROS_MEM = Total size of memory for the ROS (in MB) (<RAM)
38
39   GPAs [0,ROS_MEM) are what the ROS sees
40   GPAs [ROS_MEM, MEM) are HRT only
41   GPAS [0,MEM) are accessible by the HRT
42
43   CORES   = Total number of cores in VM
44   ROS_CORES = Total numbber of cores for the ROS
45
46   Cores [0,ROS_CORES) are what the ROS sees
47   Cores [ROS_CORES,CORES) are HRT only
48   Cores [0,CORES) are accessible by the HRT
49
50   In a Pal file:
51
52   <files> 
53     <file id="hrtelf" filename="hrtelf.o" />
54   </files>
55
56   <mem ... >RAM</mem>   (MB)  Note these are  
57   <cores count="CORES" ...>   backward compatible
58
59   <hvm enable="y" >
60     <ros cores="ROS_CORES" mem="ROS_MEM" /> (MB)
61     <hrt file_id="hrtelf" /hrt>
62   </hvm>
63
64 */
65
66 #ifndef V3_CONFIG_DEBUG_HVM
67 #undef PrintDebug
68 #define PrintDebug(fmt, args...)
69 #endif
70
71
72 int v3_init_hvm()
73 {
74     PrintDebug(VM_NONE,VCORE_NONE, "hvm: init\n");
75     return 0;
76 }
77
78 int v3_deinit_hvm()
79 {
80     PrintDebug(VM_NONE,VCORE_NONE, "hvm: deinit\n");
81     return 0;
82 }
83
84 // ignore requests from when we are in the wrong state
85 #define ENFORCE_STATE_MACHINE 1
86
87 // invoke the HRT using a page fault instead of
88 // the SWINTR mechanism
89 #define USE_UPCALL_MAGIC_PF  1
90 #define UPCALL_MAGIC_ADDRESS 0x0000800df00df00dULL
91 #define UPCALL_MAGIC_ERROR   0xf00df00d
92
93 /*
94   64 bit only hypercall:
95
96   rax = hypercall number
97   rbx = 0x646464...
98   then args are:  rcx, rdx, rsi, rdi r8, r9, r10, r11
99   rcx = 1st arg
100 */
101 static int hvm_hcall_handler(struct guest_info * core , hcall_id_t hcall_id, void * priv_data)
102 {
103     uint64_t c;
104     uint64_t bitness = core->vm_regs.rbx;
105     uint64_t a1 = core->vm_regs.rcx;
106     uint64_t a2 = core->vm_regs.rdx;
107     uint64_t a3 = core->vm_regs.rsi;
108     struct v3_vm_hvm *h = &core->vm_info->hvm_state;
109
110
111     if (bitness!=0x6464646464646464) { 
112         PrintError(core->vm_info,core,"hvm: unable to handle non-64 bit hypercall\n");
113         core->vm_regs.rax = -1;
114         return 0;
115     }
116
117     switch (a1) {
118         case 0x0:   // null
119             
120             rdtscll(c);
121             
122             V3_Print(core->vm_info,core, "hvm: received hypercall %x  rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n",
123                      hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, core->hvm_state.last_boot_start, core->num_exits);
124             //v3_print_core_telemetry(core);
125             //    v3_print_guest_state(core);
126             core->vm_regs.rax = 0;
127             break;
128             
129         case 0x1: // reset ros
130             PrintDebug(core->vm_info,core, "hvm: reset ROS\n");
131             if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ROS,0)) { 
132                 PrintError(core->vm_info,core, "hvm: reset of ROS failed\n");
133                 core->vm_regs.rax = -1;
134             } else {
135                 core->vm_regs.rax = 0;
136             }
137             break;
138
139         case 0x2: // reset hrt
140             PrintDebug(core->vm_info,core, "hvm: reset HRT\n");
141             if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_HRT,0)) { 
142                 PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
143                 core->vm_regs.rax = -1;
144             } else {
145                 core->vm_regs.rax = 0;
146             }
147             break;
148
149         case 0x3: // reset both
150             PrintDebug(core->vm_info,core, "hvm: reset ROS+HRT\n");
151             if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ALL,0)) { 
152                 PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
153                 core->vm_regs.rax = -1;
154             } else {
155                 core->vm_regs.rax = 0;
156             }
157             break;
158             
159         case 0x8: // replace HRT image
160             // a2 = gva of image
161             // a3 = size of image
162             PrintDebug(core->vm_info,core,"hvm: request replacement HRT image addr=0x%llx size=0x%llx\n",a2,a3);
163
164             if (h->hrt_image) { 
165                 // delete old
166                 V3_VFree(h->hrt_image);
167                 h->hrt_image = 0;
168             }
169
170             h->hrt_image = V3_VMalloc(a3);
171
172             if (!(h->hrt_image)) {
173                 PrintError(core->vm_info,core, "hvm: failed to allocate space for replacement image\n");
174                 core->vm_regs.rax = -1;
175             } else {
176                 if (v3_read_gva_memory(core, a2, a3, (uint8_t*) h->hrt_image)!=a3) { 
177                     PrintError(core->vm_info, core, "hvm: cannot read replacement image\n");
178                     core->vm_regs.rax = -1;
179                 } else {
180                     h->hrt_image_size = a3; 
181                     core->vm_regs.rax = 0;
182                 }
183             }
184
185             if (core->vm_regs.rax) { 
186                 PrintError(core->vm_info,core,"hvm: Failed to replace HRT image\n");
187             } else {
188                 PrintDebug(core->vm_info,core,"hvm: HRT image successfully replaced\n");
189             }
190
191             break;
192
193         case 0xf: // get HRT state
194             core->vm_regs.rax = h->trans_state;
195             if (v3_write_gva_memory(core, a2, sizeof(h->ros_event), (uint8_t*) &h->ros_event)!=sizeof(h->ros_event)) { 
196                 PrintError(core->vm_info, core, "hvm: cannot write back ROS event state to %p - continuing\n",(void*)a2);
197             }
198             //PrintDebug(core->vm_info,core,"hvm: get HRT transaction state 0x%llx\n",core->vm_regs.rax);
199             break;
200
201         case 0x10:
202             PrintDebug(core->vm_info, core, "hvm: ROS event request\n");
203             if (h->ros_event.event_type!=ROS_NONE) { 
204                 PrintError(core->vm_info, core, "hvm: ROS event is already in progress\n");
205                 core->vm_regs.rax = -1;
206             } else {
207                 if (v3_read_gva_memory(core, a2, sizeof(h->ros_event), (uint8_t*)&h->ros_event)!=sizeof(h->ros_event)) { 
208                     PrintError(core->vm_info, core, "hvm: cannot read ROS event from %p\n",(void*)a2);
209                     core->vm_regs.rax = -1;
210                 } else {
211                     core->vm_regs.rax = 0;
212                 }
213             }
214
215             break;
216
217         case 0x1f:
218             PrintDebug(core->vm_info, core, "hvm: completion of ROS event (rc=0x%llx)\n",a2);
219             h->ros_event.event_type=ROS_NONE;
220             h->ros_event.last_ros_event_result = a2;
221             break;
222
223         case 0x20: // invoke function (ROS->HRT)
224         case 0x21: // invoke parallel function (ROS->HRT)
225             if (v3_is_hvm_hrt_core(core)) { 
226                 PrintError(core->vm_info,core,"hvm: %s function invocation not supported from HRT core\n", a1==0x20 ? "" : "parallel");
227                 core->vm_regs.rax = -1;
228             } else {
229                 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { 
230                     PrintError(core->vm_info,core, "hvm: cannot invoke %s function %p in state %d\n",a1==0x20 ? "" : "parallel", (void*)a2,h->trans_state);
231                     core->vm_regs.rax = -1;
232                 } else {
233                     uint64_t *page = (uint64_t *) h->comm_page_hva;
234                     uint64_t first, last, cur;
235
236                     PrintDebug(core->vm_info,core, "hvm: %s invoke function %p\n",a1==0x20 ? "" : "parallel",(void*)a2);
237                     page[0] = a1;
238                     page[1] = a2;
239
240                     if (a1==0x20) { 
241                         first=last=h->first_hrt_core;
242                     } else {
243                         first=h->first_hrt_core;
244                         last=core->vm_info->num_cores-1;
245                     }
246
247                     core->vm_regs.rax = 0;
248
249                     h->trans_count = last-first+1;
250
251                     for (cur=first;cur<=last;cur++) { 
252
253 #if USE_UPCALL_MAGIC_PF
254                         PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",cur);
255                         core->vm_info->cores[cur].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
256                         if (v3_raise_exception_with_error(&core->vm_info->cores[cur],
257                                                           PF_EXCEPTION, 
258                                                           UPCALL_MAGIC_ERROR)) { 
259                             PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",cur);
260                             core->vm_regs.rax = -1;
261                             break;
262                         }
263 #else
264                         PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %llu\n",h->hrt_int_vector,cur);
265                         if (v3_raise_swintr(&core->vm_info->cores[cur],h->hrt_int_vector)) { 
266                             PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",cur);
267                             core->vm_regs.rax = -1;
268                             break;
269                         }
270 #endif
271                         // Force core to exit now
272                         v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
273                           
274                     }
275                     if (core->vm_regs.rax==0) { 
276                         if (a1==0x20) { 
277                             h->trans_state = HRT_CALL;
278                         } else {
279                             h->trans_state = HRT_PARCALL;
280                         }
281                     }  else {
282                         PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
283                         h->trans_state = HRT_IDLE;
284                         h->trans_count = 0;
285                     }
286                 }
287             }
288             break;
289
290
291         case 0x28: // setup for synchronous operation (ROS->HRT)
292         case 0x29: // teardown for synchronous operation (ROS->HRT)
293             if (v3_is_hvm_hrt_core(core)) { 
294                 PrintError(core->vm_info,core,"hvm: %ssynchronization invocation not supported from HRT core\n",a1==0x29 ? "de" : "");
295                 core->vm_regs.rax = -1;
296             } else {
297                 if (ENFORCE_STATE_MACHINE && 
298                     ((a1==0x28 && h->trans_state!=HRT_IDLE) || (a1==0x29 && h->trans_state!=HRT_SYNC))) { 
299                     PrintError(core->vm_info,core, "hvm: cannot invoke %ssynchronization in state %d\n",a1==0x29 ? "de" : "", h->trans_state);
300                     core->vm_regs.rax = -1;
301                 } else {
302                     uint64_t *page = (uint64_t *) h->comm_page_hva;
303                     uint64_t first, last, cur;
304
305                     PrintDebug(core->vm_info,core, "hvm: invoke %ssynchronization on address %p\n",a1==0x29 ? "de" : "",(void*)a2);
306                     page[0] = a1;
307                     page[1] = a2;
308
309                     first=last=h->first_hrt_core;  // initially we will sync only with BSP
310
311                     core->vm_regs.rax = 0;
312
313                     h->trans_count = last-first+1;
314
315                     for (cur=first;cur<=last;cur++) { 
316
317 #if USE_UPCALL_MAGIC_PF
318                         PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %llu\n",cur);
319                         core->vm_info->cores[cur].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
320                         if (v3_raise_exception_with_error(&core->vm_info->cores[cur],
321                                                           PF_EXCEPTION, 
322                                                           UPCALL_MAGIC_ERROR)) { 
323                             PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",cur);
324                             core->vm_regs.rax = -1;
325                             break;
326                         }
327 #else
328                         PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %llu\n",h->hrt_int_vector,cur);
329                         if (v3_raise_swintr(&core->vm_info->cores[cur],h->hrt_int_vector)) { 
330                             PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",cur);
331                             core->vm_regs.rax = -1;
332                             break;
333                         }
334 #endif
335                         // Force core to exit now
336                         v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
337                           
338                     }
339                     if (core->vm_regs.rax==0) { 
340                         if (a1==0x28) { 
341                             h->trans_state = HRT_SYNCSETUP;
342                         } else {
343                             h->trans_state = HRT_SYNCTEARDOWN;                      
344                         }
345                     }  else {
346                         PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
347                         h->trans_state = HRT_IDLE;
348                         h->trans_count = 0;
349                     }
350                 }
351             }
352             break;
353
354         case 0x2f: // function exec or sync done
355             if (v3_is_hvm_ros_core(core)) { 
356                 PrintError(core->vm_info,core, "hvm: request for exec or sync done from ROS core\n");
357                 core->vm_regs.rax=-1;
358             } else {
359                 if (ENFORCE_STATE_MACHINE && 
360                     h->trans_state!=HRT_CALL && 
361                     h->trans_state!=HRT_PARCALL && 
362                     h->trans_state!=HRT_SYNCSETUP &&
363                     h->trans_state!=HRT_SYNCTEARDOWN) {
364                     PrintError(core->vm_info,core,"hvm: function or sync completion when not in HRT_CALL, HRT_PARCALL, HRT_SYNCSETUP, or HRT_SYNCTEARDOWN state\n");
365                     core->vm_regs.rax=-1;
366                 } else {
367                     uint64_t one=1;
368                     PrintDebug(core->vm_info,core, "hvm: function or sync complete\n");
369                     if (__sync_fetch_and_sub(&h->trans_count,one)==1) {
370                         // last one, switch state
371                         if (h->trans_state==HRT_SYNCSETUP) { 
372                             h->trans_state=HRT_SYNC;
373                             PrintDebug(core->vm_info,core, "hvm: function complete - now synchronous\n");
374                         } else {
375                             h->trans_state=HRT_IDLE;
376                         }
377                     }
378                     core->vm_regs.rax=0;
379                 }
380             }
381                     
382             break;
383
384         case 0x30: // merge address space
385         case 0x31: // unmerge address space
386             if (v3_is_hvm_hrt_core(core)) { 
387                 PrintError(core->vm_info,core,"hvm: request to %smerge address space from HRT core\n", a1==0x30 ? "" : "un");
388                 core->vm_regs.rax=-1;
389             } else {
390                 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { 
391                     PrintError(core->vm_info,core,"hvm: request to %smerge address space in non-idle state\n",a1==0x30 ? "" : "un");
392                     core->vm_regs.rax=-1;
393                 } else {
394                     uint64_t *page = (uint64_t *) h->comm_page_hva;
395
396                     PrintDebug(core->vm_info,core,"hvm: %smerge address space request with %p\n",a1==0x30 ? "" : "un",(void*)core->ctrl_regs.cr3);
397                     // should sanity check to make sure guest is in 64 bit without anything strange
398
399                     page[0] = a1;
400                     page[1] = core->ctrl_regs.cr3;  // this is a do-not-care for an unmerge
401
402                     core->vm_regs.rax = 0;
403 #if USE_UPCALL_MAGIC_PF
404                     PrintDebug(core->vm_info,core,"hvm: injecting magic #PF into core %u\n",h->first_hrt_core);
405                     core->vm_info->cores[h->first_hrt_core].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
406                     if (v3_raise_exception_with_error(&core->vm_info->cores[h->first_hrt_core],
407                                                       PF_EXCEPTION,  
408                                                       UPCALL_MAGIC_ERROR)) { 
409                       PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %u\n",h->first_hrt_core);
410                       core->vm_regs.rax = -1;
411                       break;
412                     }
413 #else
414                     PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %u\n",h->hrt_int_vector,h->first_hrt_core);
415                     if (v3_raise_swintr(&core->vm_info->cores[h->first_hrt_core],h->hrt_int_vector)) { 
416                         PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %u\n",h->first_hrt_core);
417                         core->vm_regs.rax = -1;
418                     } 
419 #endif          
420                     // Force core to exit now
421                     v3_interrupt_cpu(core->vm_info,core->vm_info->cores[h->first_hrt_core].pcpu_id,0);
422
423                     h->trans_state = HRT_MERGE;
424                 }
425                 
426             }
427                 
428             break;
429             
430
431         case 0x3f: // merge operation done
432             if (v3_is_hvm_ros_core(core)) { 
433                 PrintError(core->vm_info,core, "hvm: request for merge done from ROS core\n");
434                 core->vm_regs.rax=-1;
435             } else {
436                 if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_MERGE) {
437                     PrintError(core->vm_info,core,"hvm: merge/unmerge done when in non-idle state\n");
438                     core->vm_regs.rax=-1;
439                 } else {
440                     PrintDebug(core->vm_info,core, "hvm: merge or unmerge complete - back to idle\n");
441                     h->trans_state=HRT_IDLE;
442                     core->vm_regs.rax=0;
443                 }
444             }
445                     
446             break;
447
448         default:
449             PrintError(core->vm_info,core,"hvm: unknown hypercall %llx\n",a1);
450             core->vm_regs.rax=-1;
451             break;
452     }
453                 
454     return 0;
455 }
456
457 #define CEIL_DIV(x,y) (((x)/(y)) + !!((x)%(y)))
458
459 int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
460 {
461     v3_cfg_tree_t *hvm_config;
462     v3_cfg_tree_t *ros_config;
463     v3_cfg_tree_t *hrt_config;
464     char *enable;
465     char *ros_cores;
466     char *ros_mem;
467     char *hrt_file_id=0;
468
469     PrintDebug(vm, VCORE_NONE, "hvm: vm init\n");
470
471     /* 
472        Defaults - all ROS
473     */
474     memset(&vm->hvm_state,0,sizeof(struct v3_vm_hvm));
475     vm->hvm_state.is_hvm=0;
476     vm->hvm_state.first_hrt_core=vm->num_cores;
477     vm->hvm_state.first_hrt_gpa=vm->mem_size;
478
479     if (!config || !(hvm_config=v3_cfg_subtree(config,"hvm"))) {
480         PrintDebug(vm,VCORE_NONE,"hvm: no HVM configuration found (all HW is ROS)\n");
481         goto out_ok;
482     }
483     
484     if (!(enable=v3_cfg_val(hvm_config,"enable")) || strcasecmp(enable,"y")) {
485         PrintDebug(vm,VCORE_NONE,"hvm: HVM configuration disabled (all HW is ROS)\n");
486         goto out_ok;
487     }
488
489     if (!(ros_config=v3_cfg_subtree(hvm_config,"ros"))) { 
490         PrintError(vm,VCORE_NONE,"hvm: HVM configuration without ROS block...\n");
491         return -1;
492     }
493  
494     if (!(ros_cores=v3_cfg_val(ros_config,"cores"))) { 
495         PrintError(vm,VCORE_NONE,"hvm: ROS block without cores...\n");
496         return -1;
497     }
498    
499     vm->hvm_state.first_hrt_core = ((uint32_t)atoi(ros_cores));
500     
501     if (!(ros_mem=v3_cfg_val(ros_config,"mem"))) { 
502         PrintError(vm,VCORE_NONE,"hvm: ROS block without mem...\n");
503         return -1;
504     }
505
506     vm->hvm_state.first_hrt_gpa = ((uint64_t)atoi(ros_mem))*1024*1024;
507
508     if (!(hrt_config=v3_cfg_subtree(hvm_config,"hrt"))) { 
509         PrintError(vm,VCORE_NONE,"hvm: HVM configuration without HRT block...\n");
510         return -1;
511     }
512  
513     if (!(hrt_file_id=v3_cfg_val(hrt_config,"file_id"))) { 
514         PrintError(vm,VCORE_NONE,"hvm: HRT block without file_id...\n");
515         return -1;
516     }
517
518     vm->hvm_state.hrt_file = v3_cfg_get_file(vm,hrt_file_id);
519     
520     if (!vm->hvm_state.hrt_file) { 
521         PrintError(vm,VCORE_NONE,"hvm: HRT block contains bad file_id (%s)\n",hrt_file_id);
522         return -1;
523     }
524
525     if (v3_register_hypercall(vm, HVM_HCALL, 
526                               hvm_hcall_handler, 0)) { 
527         PrintError(vm,VCORE_NONE, "hvm: cannot register hypercall....\n");
528         return -1;
529     }
530
531     // XXX sanity check config here
532
533     vm->hvm_state.is_hvm=1;
534
535  out_ok:
536     if (vm->hvm_state.is_hvm) {
537         V3_Print(vm,VCORE_NONE,"hvm: [ROS: cores 0..%u, mem 0..%p] [HRT: cores %u..%u, mem %p..%p, file_id=%s (tag %s)]\n",
538                  vm->hvm_state.first_hrt_core-1,
539                  (void*) vm->hvm_state.first_hrt_gpa-1,
540                  vm->hvm_state.first_hrt_core,
541                  vm->num_cores-1,
542                  (void*) vm->hvm_state.first_hrt_gpa,
543                  (void*)vm->mem_size-1,
544                  hrt_file_id,
545                  vm->hvm_state.hrt_file->tag);
546     } else {
547         V3_Print(vm,VCORE_NONE,"hvm: This is a pure ROS VM\n");
548     }
549     return 0;
550     
551 }
552
553
554 int v3_deinit_hvm_vm(struct v3_vm_info *vm)
555 {
556     PrintDebug(vm, VCORE_NONE, "hvm: HVM VM deinit\n");
557
558     if (vm->hvm_state.hrt_image) { 
559         V3_VFree(vm->hvm_state.hrt_image);
560         vm->hvm_state.hrt_image=0;
561         vm->hvm_state.hrt_image_size=0;
562     }
563
564     v3_remove_hypercall(vm,HVM_HCALL);
565
566     if (vm->hvm_state.comm_page_hpa) { 
567         struct v3_mem_region *r = v3_get_mem_region(vm,-1,(addr_t)vm->hvm_state.comm_page_hpa);
568         if (!r) { 
569             PrintError(vm,VCORE_NONE,"hvm: odd, VM has comm_page_hpa, but no shadow memory\n");
570         } else {
571             v3_delete_mem_region(vm,r);
572         }
573     }
574
575     return 0;
576 }
577
578 int v3_init_hvm_core(struct guest_info *core)
579 {
580     memset(&core->hvm_state,0,sizeof(core->hvm_state));
581     if (core->vm_info->hvm_state.is_hvm) { 
582         if (core->vcpu_id >= core->vm_info->hvm_state.first_hrt_core) { 
583             core->hvm_state.is_hrt=1;
584         }
585     }
586     return 0;
587 }
588
589 int v3_deinit_hvm_core(struct guest_info *core)
590 {
591     PrintDebug(core->vm_info, VCORE_NONE, "hvm: HVM core deinit\n");
592
593     return 0;
594 }
595
596
597 uint64_t v3_get_hvm_ros_memsize(struct v3_vm_info *vm)
598 {
599     if (vm->hvm_state.is_hvm) { 
600         return vm->hvm_state.first_hrt_gpa;
601     } else {
602         return vm->mem_size;
603     }
604 }
605 uint64_t v3_get_hvm_hrt_memsize(struct v3_vm_info *vm)
606 {
607     return vm->mem_size;
608 }
609
610 uint32_t v3_get_hvm_ros_cores(struct v3_vm_info *vm)
611 {
612     if (vm->hvm_state.is_hvm) { 
613         return vm->hvm_state.first_hrt_core;
614     } else {
615         return vm->num_cores;
616     }
617 }
618
619 uint32_t v3_get_hvm_hrt_cores(struct v3_vm_info *vm)
620 {
621     if (vm->hvm_state.is_hvm) { 
622         return vm->num_cores - vm->hvm_state.first_hrt_core;
623     } else {
624         return 0;
625     }
626 }
627
628
629 int v3_is_hvm_ros_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
630 {
631     if (vm->hvm_state.is_hvm) { 
632         return gpa<vm->hvm_state.first_hrt_gpa;
633     } else {
634         return 1;
635     }
636 }
637
638 int v3_is_hvm_hrt_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
639 {
640     if (vm->hvm_state.is_hvm) { 
641         return gpa>=vm->hvm_state.first_hrt_gpa && gpa<vm->mem_size;
642     } else {
643         return 0;
644     }
645 }
646
647 int v3_is_hvm_hrt_core(struct guest_info *core)
648 {
649     return core->hvm_state.is_hrt;
650 }
651
652 int v3_is_hvm_ros_core(struct guest_info *core)
653 {
654     return !core->hvm_state.is_hrt;
655 }
656
657 int      v3_hvm_should_deliver_ipi(struct guest_info *src, struct guest_info *dest)
658 {
659     if (!src) {
660         // ioapic or msi to apic
661         return !dest->hvm_state.is_hrt;
662     } else {
663         // apic to apic
664         return src->hvm_state.is_hrt || (!src->hvm_state.is_hrt && !dest->hvm_state.is_hrt) ;
665     }
666 }
667
668 void     v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_info *vm, 
669                                         uint32_t *start_apic, uint32_t *num_apics)
670 {
671     if (!core) { 
672         // Seen from ioapic, msi, etc: 
673         if (vm->hvm_state.is_hvm) {
674             // HVM VM shows only the ROS cores/apics to ioapic, msi, etc
675             *start_apic = 0;
676             *num_apics = vm->hvm_state.first_hrt_core;
677         } else {
678             // Non-HVM shows all cores/APICs to apic, msi, etc.
679             *start_apic = 0;
680             *num_apics = vm->num_cores;
681         }
682     } else {
683         // Seen from apic
684         if (core->hvm_state.is_hrt) { 
685             // HRT core/apic sees all apics
686             // (this policy may change...)
687             *start_apic = 0;
688             *num_apics = vm->num_cores;
689         } else {
690             // non-HRT core/apic sees only non-HRT cores/apics
691             *start_apic = 0 ;
692             *num_apics = vm->hvm_state.first_hrt_core;
693         }
694     }
695 }
696
697 #define MAX(x,y) ((x)>(y)?(x):(y))
698 #define MIN(x,y) ((x)<(y)?(x):(y))
699
700
701 static uint64_t boot_state_end_addr(struct v3_vm_info *vm) 
702 {
703     return PAGE_ADDR(vm->mem_size);
704 }
705    
706 static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
707 {
708     *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - PAGE_SIZE);
709     *limit = PAGE_SIZE;
710 }
711
712 extern v3_cpu_arch_t v3_mach_type;
713
714 extern void *v3_hvm_svm_null_int_handler_start;
715 extern void *v3_hvm_svm_null_int_handler_end;
716 extern void *v3_hvm_vmx_null_int_handler_start;
717 extern void *v3_hvm_vmx_null_int_handler_end;
718
719 static void write_null_int_handler(struct v3_vm_info *vm)
720 {
721     void *base;
722     uint64_t limit;
723     void *data;
724     uint64_t len;
725
726     get_null_int_handler_loc(vm,&base,&limit);
727
728     switch (v3_mach_type) {
729 #ifdef V3_CONFIG_SVM
730         case V3_SVM_CPU:
731         case V3_SVM_REV3_CPU:
732             data = (void*) &v3_hvm_svm_null_int_handler_start;
733             len = (void*) &v3_hvm_svm_null_int_handler_end - data;
734             break;
735 #endif
736 #if V3_CONFIG_VMX
737         case V3_VMX_CPU:
738         case V3_VMX_EPT_CPU:
739         case V3_VMX_EPT_UG_CPU:
740             data = (void*) &v3_hvm_vmx_null_int_handler_start;
741             len = (void*) &v3_hvm_vmx_null_int_handler_end - data;
742             break;
743 #endif
744         default:
745             PrintError(vm,VCORE_NONE,"hvm: cannot determine CPU type to select null interrupt handler...\n");
746             data = 0;
747             len = 0;
748     }
749
750     if (data) {
751         v3_write_gpa_memory(&vm->cores[0],(addr_t)(base),len,(uint8_t*)data);
752     }
753
754     PrintDebug(vm,VCORE_NONE,"hvm: wrote null interrupt handler at %p (%llu bytes)\n",base,len);
755 }
756
757
758 static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
759 {
760     *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - 2 * PAGE_SIZE);
761     *limit = 16*256;
762 }
763
764 // default IDT entries (int and trap gates)
765 //
766 // Format is 16 bytes long:
767 //   16 offsetlo   => 0
768 //   16 selector   => (target code selector) => 0x8 // entry 1 of GDT
769 //    3 ist        => (stack) = 0 => current stack
770 //    5 reserved   => 0
771 //    4 type       => 0xe=>INT, 0xf=>TRAP 
772 //    1 reserved   => 0  (indicates "system" by being zero)
773 //    2 dpl        => 0
774 //    1 present    => 1
775 //   16 offsetmid  => 0
776 //   32 offsethigh => 0   (total is a 64 bit offset)
777 //   32 reserved   => 0
778 //
779 // 00 00 | 08 00 | 00 | 8[typenybble] | offsetmid | offsethigh | reserved
780 // 
781 // Note little endian
782 //
783 static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 } ;
784 static uint64_t idt64_int_gate_entry_mask[2] =  { 0x00008e0000080000, 0x0 };
785
// Build and write the stub IDT into guest memory.
//
// Entries 0-31 (exceptions) are trap gates, entries 32-255 (external
// interrupts) are interrupt gates; all of them point at the single null
// handler written by write_null_int_handler().
static void write_idt(struct v3_vm_info *vm)
{
    void *base;              // GPA of the IDT
    uint64_t limit;          // IDT size in bytes
    void *handler;           // GVA of the null handler
    uint64_t handler_len;
    int i;
    uint64_t trap_gate[2];   // 16-byte gate descriptors being assembled
    uint64_t int_gate[2];

    get_idt_loc(vm,&base,&limit);

    get_null_int_handler_loc(vm,&handler,&handler_len);

    // convert the handler's GPA into the GVA the guest will vector to
    handler += vm->hvm_state.gva_offset;

    memcpy(trap_gate,idt64_trap_gate_entry_mask,16);
    memcpy(int_gate,idt64_int_gate_entry_mask,16);

    // NOTE(review): this null check happens AFTER the gva_offset is added,
    // so it only fires when both the handler base and the offset are zero —
    // presumably intended as "no handler configured"; confirm.
    if (handler) {
        // update the entries for the handler location
        uint8_t *mask;
        uint8_t *hand;
        
        hand = (uint8_t*) &handler;

        // scatter the 64-bit handler address into the split offset fields
        // of the gate descriptor (bytes 0-1, 6-7, 8-11)
        mask = (uint8_t *)trap_gate;
        memcpy(&(mask[0]),&(hand[0]),2); // offset low
        memcpy(&(mask[6]),&(hand[2]),2); // offset med
        memcpy(&(mask[8]),&(hand[4]),4); // offset high

        mask = (uint8_t *)int_gate;
        memcpy(&(mask[0]),&(hand[0]),2); // offset low
        memcpy(&(mask[6]),&(hand[2]),2); // offset med
        memcpy(&(mask[8]),&(hand[4]),4); // offset high

        PrintDebug(vm,VCORE_NONE,"hvm: Adding default null trap and int gates\n");
    }

    // exceptions 0..31 get trap gates
    for (i=0;i<32;i++) { 
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)trap_gate);
    }

    // vectors 32..255 get interrupt gates
    for (i=32;i<256;i++) { 
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)int_gate);
    }

    PrintDebug(vm,VCORE_NONE,"hvm: wrote IDT at %p\n",base);
}
835
836
837
838 static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
839 {
840     *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 3 * PAGE_SIZE);
841     *limit = 8*3;
842 }
843
// Stub long-mode GDT: in 64-bit mode base/limit are ignored for code/data,
// so only the type bits and the L (long-mode) bit matter.
static uint64_t gdt64[3] = {
    0x0000000000000000, /* null */
    0x00a09a0000000000, /* code (note lme bit) */
    0x00a0920000000000, /* data (most entries don't matter) */
};
849
850 static void write_gdt(struct v3_vm_info *vm)
851 {
852     void *base;
853     uint64_t limit;
854
855     get_gdt_loc(vm,&base,&limit);
856     v3_write_gpa_memory(&vm->cores[0],(addr_t)base,limit,(uint8_t*) gdt64);
857
858     PrintDebug(vm,VCORE_NONE,"hvm: wrote GDT at %p\n",base);
859 }
860
861
862
863 static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
864 {
865     *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 4 * PAGE_SIZE);
866     *limit = PAGE_SIZE;
867 }
868
869 static void write_tss(struct v3_vm_info *vm)
870 {
871     void *base;
872     uint64_t limit;
873
874     get_tss_loc(vm,&base,&limit);
875
876     v3_set_gpa_memory(&vm->cores[0],(addr_t)base,limit,0);
877
878     PrintDebug(vm,VCORE_NONE,"hvm: wrote TSS at %p\n",base);
879 }
880
881
// Canonical-address halves of the 48-bit x86-64 virtual address space
#define TOP_HALF_START  0xffff800000000000ULL
#define BOTTOM_HALF_END 0x00007fffffffffffULL


// GVA span covered by a single entry at each paging level
#define L4_UNIT PAGE_SIZE           // PTE:   4 KB
#define L3_UNIT (512ULL * L4_UNIT)  // PDE:   2 MB
#define L2_UNIT (512ULL * L3_UNIT)  // PDPE:  1 GB
#define L1_UNIT (512ULL * L2_UNIT)  // PML4E: 512 GB
890
891 static void compute_pts_4KB(struct v3_vm_info *vm, 
892                             uint64_t *l1, uint64_t *l2, uint64_t *l3, uint64_t *l4)    
893 {
894
895     // we map the physical memory up to max_mem_mapped either at 0x0 or at TOP_HALF start
896     // that is, it either fills in the first 256 rows of PML4 or the last 256 rows of PML4
897     // so it is the same number of page tables regardless
898
899     uint64_t max_gva = vm->hvm_state.max_mem_mapped;
900
901     *l1 = 1;  // 1 PML4
902     *l2 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*512ULL*4096ULL),512);
903     *l3 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*4096ULL),512);
904     *l4 = CEIL_DIV(CEIL_DIV(max_gva,4096ULL),512);
905 }
906
907
908
909 /*
910   PTS MAP using 1 GB pages
911   n second levels pts, highest gva, highest address
912   1 top level
913
914
915 OR
916   
917   PTS MAP using 2 MB pages
918   n third level pts, highest gva, highest address
919   m second level pts, highest gva, highest address
920   1 top level pt
921
922 OR
923
924   PTS MAP using 4 KB pages
925   n 4th level, highest gva, highest address
  m 3rd level, highest gva, highest address
927   l second level, highest gva, highest address
928   1 top level pt
929
930 OR
931   PTS MAP using 512 GB pages when this becomes available
932
933 */
934
935
936 static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
937 {
938     uint64_t l1,l2,l3,l4;
939     uint64_t num_pt;
940
941     compute_pts_4KB(vm,&l1,&l2,&l3,&l4);
942
943     if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { 
944         num_pt = l1;
945     } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { 
946         num_pt = l1 + l2;
947     } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
948         num_pt = l1 + l2 + l3;
949     } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { 
950         num_pt = l1 + l2 + l3 + l4;
951     } else {
952         PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT location flags=0x%llx memsize=0x%llx\n",vm->hvm_state.hrt_flags,(uint64_t)vm->mem_size);
953         return;
954     }
955
956     *base = (void*)PAGE_ADDR(boot_state_end_addr(vm)-(4+num_pt)*PAGE_SIZE);
957     *limit = num_pt*PAGE_SIZE;
958 }
959
// Build the boot page tables in guest memory.
//
// Maps GPAs [0, max_mem_mapped) at GVA gva_offset (either 0 or
// TOP_HALF_START) using the largest page size requested by the HRT's
// flags.  Tables are laid out contiguously: the PML4 first, then all
// PDPs, then all PDs, then all PTs (see get_pt_loc()).  Construction
// stops early at the level corresponding to the selected page size.
static void write_pts(struct v3_vm_info *vm)
{
    uint64_t size;
    uint64_t num_l1, num_l2, num_l3, num_l4;
    void *start_l1, *start_l2, *start_l3, *start_l4;
    uint64_t max_level;      // deepest level to build: 1=PML4 ... 4=PT
    void *cur_pt;            // GPA of the table page being filled
    void *cur_gva;
    void *cur_gpa;
    void *min_gpa = 0;
    void *max_gpa = (void*) vm->hvm_state.max_mem_mapped;
    void *min_gva = (void*) vm->hvm_state.gva_offset;
#ifdef V3_CONFIG_DEBUG_HVM
    void *max_gva = min_gva+vm->hvm_state.max_mem_mapped;
#endif
    uint64_t i, pt;
    uint64_t i_start,i_end;
    
    struct pml4e64 *pml4e;
    struct pdpe64 *pdpe;
    struct pde64 *pde;
    struct pte64 *pte;

    // translate the requested mapping granularity into a table depth
    if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { 
        PrintError(vm,VCORE_NONE,"hvm: Attempt to build 512 GB pages\n");
        max_level = 1;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { 
        max_level = 2;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
        max_level = 3;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { 
        max_level = 4;
    } else {
        PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT levels\n");
        return;
    }

    get_pt_loc(vm,&start_l1,&size);
    compute_pts_4KB(vm,&num_l1,&num_l2,&num_l3,&num_l4);

    // each level's tables immediately follow the previous level's
    start_l2=start_l1+PAGE_SIZE*num_l1;
    start_l3=start_l2+PAGE_SIZE*num_l2;
    start_l4=start_l3+PAGE_SIZE*num_l3;

    PrintDebug(vm,VCORE_NONE,"hvm: writing %llu levels of PTs start at address %p\n", max_level,start_l1);
    PrintDebug(vm,VCORE_NONE,"hvm: min_gva=%p, max_gva=%p, min_gpa=%p, max_gpa=%p\n",min_gva,max_gva,min_gpa,max_gpa);
    PrintDebug(vm,VCORE_NONE,"hvm: num_l1=%llu, num_l2=%llu, num_l3=%llu, num_l4=%llu\n", num_l1, num_l2, num_l3, num_l4);
    PrintDebug(vm,VCORE_NONE,"hvm: start_l1=%p, start_l2=%p, start_l3=%p, start_l4=%p\n", start_l1, start_l2, start_l3, start_l4);

    cur_pt=start_l1;

    // build PML4 (only one)
    if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pml4e)) { 
        PrintError(vm,VCORE_NONE,"hvm: Cannot translate pml4 location\n");
        return;
    }

    memset(pml4e,0,PAGE_SIZE);

    // bottom-half mapping fills the first PML4 slots; top-half mapping
    // fills the slots starting at index 256
    if (min_gva==0x0) { 
        i_start=0; i_end = num_l2;
    } else if (min_gva==(void*)TOP_HALF_START) { 
        i_start=256; i_end=256+num_l2;
    } else {
        PrintError(vm,VCORE_NONE,"hvm: unsupported gva offset\n");
        return;
    }

    for (i=i_start, cur_gva=min_gva, cur_gpa=min_gpa;
         (i<i_end);
         i++, cur_gva+=L1_UNIT, cur_gpa+=L1_UNIT) {

        pml4e[i].present=1;
        pml4e[i].writable=1;
        
        if (max_level==1) { 
            PrintError(vm,VCORE_NONE,"hvm: Intel has not yet defined a PML4E large page\n");
            pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
            //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
        } else {
            // point at the corresponding PDP table
            pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(start_l2+(i-i_start)*PAGE_SIZE));
            //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
        }
    }

    // 512 GB only
    if (max_level==1) {
        return;
    }



    for (cur_pt=start_l2, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
         pt<num_l2;
         cur_pt+=PAGE_SIZE, pt++) { 

        // build PDPE
        if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pdpe)) { 
            PrintError(vm,VCORE_NONE,"hvm: Cannot translate pdpe location\n");
            return;
        }
        
        memset(pdpe,0,PAGE_SIZE);
        
        // note: cur_gpa/cur_gva carry across table pages; entries past
        // max_gpa are simply left zeroed (not present)
        for (i=0; 
             i<512 && cur_gpa<max_gpa; 
             i++, cur_gva+=L2_UNIT, cur_gpa+=L2_UNIT) {

            pdpe[i].present=1;
            pdpe[i].writable=1;
        
            if (max_level==2) { 
                // terminal 1 GB page
                pdpe[i].large_page=1;
                pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
                //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
            } else {
                pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(start_l3+(pt*512+i)*PAGE_SIZE));
                //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
            }
        }
    }
        
    //1 GB only
    if (max_level==2) { 
        return;
    }

    for (cur_pt=start_l3, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
         pt<num_l3;
         cur_pt+=PAGE_SIZE, pt++) { 

        // build PDE
        if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pde)) { 
            PrintError(vm,VCORE_NONE,"hvm: Cannot translate pde location\n");
            return;
        }
        
        memset(pde,0,PAGE_SIZE);
        
        for (i=0; 
             i<512 && cur_gpa<max_gpa; 
             i++, cur_gva+=L3_UNIT, cur_gpa+=L3_UNIT) {

            pde[i].present=1;
            pde[i].writable=1;
        
            if (max_level==3) { 
                // terminal 2 MB page
                pde[i].large_page=1;
                pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
                //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t) pde[i].pt_base_addr);
            } else {
                pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(start_l4+(pt*512+i)*PAGE_SIZE));
                //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pde[i].pt_base_addr);
            }
        }
    }

    //2 MB only
    if (max_level==3) { 
        return;
    }


    // 4 KB
    for (cur_pt=start_l4, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
         pt<num_l4;
         cur_pt+=PAGE_SIZE, pt++) { 

        // build PTE
        if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pte)) { 
            PrintError(vm,VCORE_NONE,"hvm: Cannot translate pte location\n");
            return;
        }
        
        memset(pte,0,PAGE_SIZE);
        
        for (i=0; 
             i<512 && cur_gpa<max_gpa; 
             i++, cur_gva+=L4_UNIT, cur_gpa+=L4_UNIT) {

            pte[i].present=1;
            pte[i].writable=1;
            pte[i].page_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
            //PrintDebug(vm,VCORE_NONE,"hvm: pte: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pte[i].page_base_addr);
        }
    }

    return;
}
1149
1150
1151 static void get_mb_info_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
1152 {
1153     
1154     get_pt_loc(vm,base, limit);
1155     *base-=PAGE_SIZE;
1156     *limit=PAGE_SIZE;
1157 }
1158
1159
// Fill in the HRT-specific multiboot info tag from the VM's HVM state.
//
// \param core  requesting core (used only to reach the VM)
// \param hrt   caller-provided tag to populate
// \return 0 (always succeeds)
int v3_build_hrt_multiboot_tag(struct guest_info *core, mb_info_hrt_t *hrt)
{
    struct v3_vm_info *vm = core->vm_info;

    hrt->tag.type = MB_INFO_HRT_TAG;
    hrt->tag.size = sizeof(mb_info_hrt_t);

    // APIC/core topology: HRT cores are the tail [first_hrt_core, num_cores)
    hrt->total_num_apics = vm->num_cores;
    hrt->first_hrt_apic_id = vm->hvm_state.first_hrt_core;
    hrt->have_hrt_ioapic=0;            // no HRT-private IOAPIC support yet
    hrt->first_hrt_ioapic_entry=0;

    hrt->cpu_freq_khz = V3_CPU_KHZ();

    // memory layout and communication parameters negotiated in configure_hrt()
    hrt->hrt_flags = vm->hvm_state.hrt_flags;
    hrt->max_mem_mapped = vm->hvm_state.max_mem_mapped;
    hrt->first_hrt_gpa = vm->hvm_state.first_hrt_gpa;
    hrt->gva_offset = vm->hvm_state.gva_offset;
    hrt->comm_page_gpa = vm->hvm_state.comm_page_gpa;
    hrt->hrt_int_vector = vm->hvm_state.hrt_int_vector;
    
    return 0;
}
1183
1184 static void write_mb_info(struct v3_vm_info *vm) 
1185 {
1186     if (vm->hvm_state.hrt_type!=HRT_MBOOT64) { 
1187         PrintError(vm, VCORE_NONE,"hvm: Cannot handle this HRT type\n");
1188         return;
1189     } else {
1190         uint8_t buf[256];
1191         uint64_t size;
1192         void *base;
1193         uint64_t limit;
1194
1195         get_mb_info_loc(vm,&base,&limit);
1196         
1197         if ((size=v3_build_multiboot_table(&vm->cores[vm->hvm_state.first_hrt_core],buf,256))==-1) { 
1198             PrintError(vm,VCORE_NONE,"hvm: Failed to build MB info\n");
1199             return;
1200         }
1201
1202         if (size>limit) { 
1203             PrintError(vm,VCORE_NONE,"hvm: MB info is too large\n");
1204             return;
1205         }
1206         
1207         v3_write_gpa_memory(&vm->cores[vm->hvm_state.first_hrt_core],
1208                             (addr_t)base,
1209                             size,
1210                             buf);
1211
1212         PrintDebug(vm,VCORE_NONE, "hvm: wrote MB info at %p\n", base);
1213     }
1214 }
1215
1216 #define SCRATCH_STACK_SIZE 4096
1217
1218
1219 static void get_hrt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
1220 {
1221     void *mb_base;
1222     uint64_t mb_limit;
1223     
1224     get_mb_info_loc(vm,&mb_base,&mb_limit);
1225     
1226     mb_base-=SCRATCH_STACK_SIZE*v3_get_hvm_hrt_cores(vm);
1227
1228     *base = (void*)PAGE_ADDR(vm->hvm_state.first_hrt_gpa);
1229
1230     if (mb_base < *base+PAGE_SIZE) { 
1231         PrintError(vm,VCORE_NONE,"hvm: HRT stack colides with HRT\n");
1232     }
1233
1234     *limit = mb_base - *base;
1235 }
1236
1237
// Logging shorthands scoped to this file (no VM/core context available)
#define ERROR(fmt, args...) PrintError(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
#define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)

#define ELF_MAGIC    0x464c457f   // "\x7fELF" read as a little-endian uint32
#define MB2_MAGIC    0xe85250d6   // multiboot2 header magic

#define MB2_INFO_MAGIC    0x36d76289   // value handed to the kernel in RAX at boot
1245
// Check whether the image begins with the ELF magic ("\x7fELF").
//
// \param data  candidate kernel image
// \param size  number of valid bytes at data
// \return 1 if the image starts with the ELF magic, 0 otherwise
//
// The size check prevents reading past the end of a <4-byte buffer (the
// original ignored size entirely), and memcmp avoids the unaligned,
// strict-aliasing-violating uint32_t load.
static int is_elf(uint8_t *data, uint64_t size)
{
    static const uint8_t elf_magic[4] = { 0x7f, 'E', 'L', 'F' };  // ELF_MAGIC, little endian

    if (size < 4) {
        return 0;
    }

    return memcmp(data, elf_magic, 4) == 0 ? 1 : 0;
}
1254
1255 static mb_header_t *find_mb_header(uint8_t *data, uint64_t size)
1256 {
1257     uint64_t limit = size > 32768 ? 32768 : size;
1258     uint64_t i;
1259
1260     // Scan for the .boot magic cookie
1261     // must be in first 32K, assume 4 byte aligned
1262     for (i=0;i<limit;i+=4) { 
1263         if (*((uint32_t*)&data[i])==MB2_MAGIC) {
1264             INFO("Found multiboot header at offset 0x%llx\n",i);
1265             return (mb_header_t *) &data[i];
1266         }
1267     }
1268     return 0;
1269 }
1270
1271
1272 static int configure_hrt(struct v3_vm_info *vm, mb_data_t *mb)
1273 {
1274     struct v3_vm_hvm *h = &vm->hvm_state;
1275     uint64_t f = mb->mb64_hrt->hrt_flags;
1276     uint64_t maxmap = mb->mb64_hrt->max_mem_to_map;
1277     uint64_t gvaoff = mb->mb64_hrt->gva_offset;
1278     uint64_t gvaentry = mb->mb64_hrt->gva_entry;
1279     uint64_t commgpa = mb->mb64_hrt->comm_page_gpa;
1280     uint8_t  vec = mb->mb64_hrt->hrt_int_vector;
1281     
1282
1283     PrintDebug(vm,VCORE_NONE,"hvm: HRT request: flags=0x%llx max_map=0x%llx gva_off=%llx gva_entry=%llx comm_page=0x%llx vector=0x%x\n",
1284                f, maxmap, gvaoff,gvaentry,commgpa, vec);
1285
1286     if (maxmap<0x100000000ULL) { 
1287         PrintDebug(vm,VCORE_NONE,"hvm: revising request up to 4 GB max map\n");
1288         maxmap=0x100000000ULL;
1289     }
1290
1291     if (f & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { 
1292         PrintError(vm,VCORE_NONE,"hvm: support for 512 GB pages is not yet available in hardware\n");
1293         return -1;
1294     } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { 
1295         f &= ~0x3c;
1296         f |= MB_TAG_MB64_HRT_FLAG_MAP_1GB;
1297         h->max_mem_mapped = maxmap;
1298         PrintDebug(vm,VCORE_NONE,"hvm: 1 GB pages selected\n");
1299     } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_2MB) { 
1300         f &= ~0x3c;
1301         f |= MB_TAG_MB64_HRT_FLAG_MAP_2MB;
1302         h->max_mem_mapped = maxmap;
1303         PrintDebug(vm,VCORE_NONE,"hvm: 2 MB pages selected\n");
1304     } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { 
1305         f &= ~0x3c;
1306         f |= MB_TAG_MB64_HRT_FLAG_MAP_4KB;
1307         h->max_mem_mapped = maxmap;
1308         PrintDebug(vm,VCORE_NONE,"hvm: 4 KB pages selected\n");
1309     } else {
1310         PrintError(vm,VCORE_NONE,"hvm: no page table model is requested\n");
1311         return -1;
1312     }
1313
1314     if (f & MB_TAG_MB64_HRT_FLAG_RELOC) {
1315         PrintError(vm,VCORE_NONE,"hvm: relocatable hrt not currently supported\n");
1316         return -1;
1317     }
1318
1319     h->hrt_flags = f;
1320
1321     if (maxmap>h->max_mem_mapped) { 
1322         PrintError(vm,VCORE_NONE,"hvm: requested 0x%llx bytes mapped, which is more than currently supported\n",maxmap);
1323         return -1;
1324     }
1325
1326     if (gvaoff!=0 && gvaoff!=TOP_HALF_START) { 
1327         PrintError(vm,VCORE_NONE,"hvm: currently only GVA offsets of 0 and %llx are supported\n", TOP_HALF_START);
1328         return -1;
1329     }
1330     
1331     h->gva_offset = gvaoff;
1332
1333     h->gva_entry = gvaentry;
1334
1335     if (mb->addr->load_addr < h->first_hrt_gpa) { 
1336         PrintError(vm,VCORE_NONE,"hvm: load start address of HRT is below first HRT GPA\n");
1337         return -1;
1338     }
1339     
1340     if (mb->addr->bss_end_addr > (vm->mem_size-(1024*1024*64))) {
1341         PrintError(vm,VCORE_NONE,"hvm: bss end address of HRT above last allowed GPA\n");
1342         return -1;
1343     }
1344     
1345     if (vec<32) { 
1346         PrintError(vm,VCORE_NONE,"hvm: cannot support vector %x\n",vec);
1347         return -1;
1348     }
1349     
1350     h->hrt_int_vector = vec;
1351     
1352     
1353     if (commgpa < vm->mem_size) { 
1354         PrintError(vm,VCORE_NONE,"hvm: cannot map comm page over physical memory\n");
1355         return -1;
1356     } 
1357
1358     h->comm_page_gpa = commgpa;
1359
1360     if (!h->comm_page_hpa) { 
1361         if (!(h->comm_page_hpa=V3_AllocPages(1))) { 
1362             PrintError(vm,VCORE_NONE,"hvm: unable to allocate space for comm page\n");
1363             return -1;
1364         }
1365
1366         h->comm_page_hva = V3_VAddr(h->comm_page_hpa);
1367         
1368         memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
1369         
1370         if (v3_add_shadow_mem(vm,-1,h->comm_page_gpa,h->comm_page_gpa+PAGE_SIZE_4KB,(addr_t)h->comm_page_hpa)) { 
1371             PrintError(vm,VCORE_NONE,"hvm: unable to map communication page\n");
1372             V3_FreePages((void*)(h->comm_page_gpa),1);
1373             return -1;
1374         }
1375         
1376         
1377         PrintDebug(vm,VCORE_NONE,"hvm: added comm page for first time\n");
1378     }
1379
1380     memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
1381     
1382     
1383     PrintDebug(vm,VCORE_NONE,"hvm: HRT configuration: flags=0x%llx max_mem_mapped=0x%llx gva_offset=0x%llx gva_entry=0x%llx comm_page=0x%llx vector=0x%x\n",
1384                h->hrt_flags,h->max_mem_mapped, h->gva_offset,h->gva_entry, h->comm_page_gpa, h->hrt_int_vector);
1385     
1386     return 0;
1387
1388 }
1389
1390 static int setup_mb_kernel_hrt(struct v3_vm_info *vm, void *data, uint64_t size)
1391 {
1392     mb_data_t mb;
1393
1394     if (v3_parse_multiboot_header(data, size, &mb)) { 
1395         PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n");
1396         return -1;
1397     }
1398
1399     if (!mb.mb64_hrt) { 
1400         PrintError(vm,VCORE_NONE,"hvm: invalid HRT - there is no MB64_HRT tag\n");
1401         return -1;
1402     }
1403
1404     if (configure_hrt(vm,&mb)) {
1405         PrintError(vm,VCORE_NONE, "hvm: cannot configure HRT\n");
1406         return -1;
1407     }
1408     
1409     if (v3_write_multiboot_kernel(vm,&mb,data,size,
1410                                   (void*)vm->hvm_state.first_hrt_gpa,
1411                                   vm->mem_size-vm->hvm_state.first_hrt_gpa)) {
1412         PrintError(vm,VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n");
1413         return -1;
1414     }
1415
1416     if (vm->hvm_state.gva_entry) { 
1417         vm->hvm_state.hrt_entry_addr = vm->hvm_state.gva_entry;
1418     } else {
1419         vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + vm->hvm_state.gva_offset;
1420     }
1421
1422     vm->hvm_state.hrt_type = HRT_MBOOT64;
1423
1424     return 0;
1425
1426 }
1427
1428
1429 static int setup_hrt(struct v3_vm_info *vm)
1430 {
1431     void *data;
1432     uint64_t size;
1433
1434     // If the ROS has installed an image, it takes priority
1435     if (vm->hvm_state.hrt_image) { 
1436         data = vm->hvm_state.hrt_image;
1437         size = vm->hvm_state.hrt_image_size;
1438     } else {
1439         data = vm->hvm_state.hrt_file->data;
1440         size = vm->hvm_state.hrt_file->size;
1441     }
1442         
1443     if (is_elf(data,size) &&
1444         find_mb_header(data,size)) {
1445
1446         PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n");
1447         if (setup_mb_kernel_hrt(vm,data,size)) { 
1448             PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n");
1449             return -1;
1450         } 
1451     } else {
1452         PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not a multiboot kernel\n");
1453         return -1;
1454     }
1455
1456     return 0;
1457 }
1458
1459
1460         
1461
1462 /*
1463   GPA layout:
1464
1465   HRT
1466   ---
1467   ROS
1468
1469   We do not touch the ROS portion of the address space.
1470   The HRT portion looks like:
1471
1472   INT_HANDLER (1 page - page aligned)
1473   IDT (1 page - page aligned)
1474   GDT (1 page - page aligned)
  TSS (1 page - page aligned)
  PAGETABLES  (identity map of first N GB)
1477      ROOT PT first (lowest memory addr), followed by 2nd level PTs in order,
1478      followed by 3rd level PTs in order, followed by 4th level
1479      PTs in order.  
1480   MBINFO_PAGE
1481   SCRATCH_STACK_HRT_CORE0 
1482   SCRATCH_STACK_HRT_CORE1
1483   ..
1484   SCRATCH_STACK_HRT_COREN
1485   ...
1486   HRT (as many pages as needed, page-aligned, starting at first HRT address)
1487   ---
1488   ROS
1489
1490
1491 */
1492
1493
1494 int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm)
1495 {
1496     if (!vm->hvm_state.is_hvm) { 
1497         PrintDebug(vm,VCORE_NONE,"hvm: skipping HVM setup for boot as this is not an HVM\n");
1498         return 0;
1499     }
1500
1501     PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory begins\n");
1502
1503     if (setup_hrt(vm)) {
1504         PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n");
1505         return -1;
1506     } 
1507
1508     // the locations of all the other items are determined by
1509     // the HRT setup, so these must happen after
1510
1511     write_null_int_handler(vm);
1512     write_idt(vm);
1513     write_gdt(vm);
1514     write_tss(vm);
1515
1516     write_pts(vm);
1517
1518     // this must happen last
1519     write_mb_info(vm);
1520
1521     PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory done\n");
1522
1523     return 0;
1524 }
1525
1526 /*
1527   On entry for every core:
1528
1529    IDTR points to stub IDT
1530    GDTR points to stub GDT
   TR   points to stub TSS
1532    CR3 points to root page table
1533    CR0 has PE and PG
1534    EFER has LME AND LMA (and NX for compatibility with Linux)
1535    RSP is TOS of core's scratch stack (looks like a call)
1536
1537    RAX = MB magic cookie
1538    RBX = address of multiboot info table
1539    RCX = this core id / apic id (0..N-1)
1540    RDX = this core id - first HRT core ID (==0 for the first HRT core)
1541
1542    All addresses are virtual addresses, offset as needed by gva_offset
1543
1544    Other regs are zeroed
1545
1546    shadow/nested paging state reset for long mode
1547
1548 */
// Prepare an HRT core's register and segment state for direct entry into
// the HRT kernel in 64-bit long mode (see the contract in the comment
// block above this function).
//
// \param core  the core to set up; non-HRT cores are skipped
// \return 0 on success (or skip), -1 on failure (shadow paging guests)
int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
{
    void *base;
    uint64_t limit;
    uint64_t gva_offset;

    // record boot start time for later boot-latency reporting
    rdtscll(core->hvm_state.last_boot_start);
    

    if (!core->hvm_state.is_hrt) { 
        PrintDebug(core->vm_info,core,"hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id);
        return 0;
    }


    PrintDebug(core->vm_info, core, "hvm: setting up HRT core (%u) for boot\n", core->vcpu_id);

    gva_offset = core->vm_info->hvm_state.gva_offset;
    
    // start from a clean slate: zero all architectural state
    memset(&core->vm_regs,0,sizeof(core->vm_regs));
    memset(&core->ctrl_regs,0,sizeof(core->ctrl_regs));
    memset(&core->dbg_regs,0,sizeof(core->dbg_regs));
    memset(&core->segments,0,sizeof(core->segments));    
    memset(&core->msrs,0,sizeof(core->msrs));    
    memset(&core->fp_state,0,sizeof(core->fp_state));    

    // We are in long mode with virtual memory and we want
    // to start immediately
    core->cpl = 0; // we are going right into the kernel
    core->cpu_mode = LONG;
    core->mem_mode = VIRTUAL_MEM; 
    core->core_run_state = CORE_RUNNING ;


    // magic
    core->vm_regs.rax = MB2_INFO_MAGIC;

    // multiboot info pointer
    get_mb_info_loc(core->vm_info, &base,&limit);
    core->vm_regs.rbx = (uint64_t) base + gva_offset;  

    // core number
    core->vm_regs.rcx = core->vcpu_id;
    
    // HRT core number
    core->vm_regs.rdx = core->vcpu_id - core->vm_info->hvm_state.first_hrt_core;

    // Now point to scratch stack for this core
    // it begins at an ofset relative to the MB info page
    get_mb_info_loc(core->vm_info, &base,&limit);
    base = base + gva_offset;
    base -= core->vm_regs.rdx * SCRATCH_STACK_SIZE;
    core->vm_regs.rsp = (v3_reg_t) base;  
    core->vm_regs.rbp = (v3_reg_t) base-8; 

    // push onto the stack a bad rbp and bad return address
    // (0xff fill so a stray return faults visibly)
    core->vm_regs.rsp-=16;
    v3_set_gpa_memory(core,
                      core->vm_regs.rsp-gva_offset,
                      16,
                      0xff);


    // HRT entry point
    get_hrt_loc(core->vm_info, &base,&limit);
    if (core->vm_info->hvm_state.gva_entry) { 
      core->rip = core->vm_info->hvm_state.gva_entry;
    } else {
      core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr + gva_offset; 
    }
      


    PrintDebug(core->vm_info,core,"hvm: hrt core %u has rip=%p, rsp=%p, rbp=%p, rax=%p, rbx=%p, rcx=%p, rdx=%p\n",
               (core->vcpu_id - core->vm_info->hvm_state.first_hrt_core),
               (void*)(core->rip),
               (void*)(core->vm_regs.rsp),
               (void*)(core->vm_regs.rbp),
               (void*)(core->vm_regs.rax),
               (void*)(core->vm_regs.rbx),
               (void*)(core->vm_regs.rcx),
               (void*)(core->vm_regs.rdx));

    // Setup CRs for long mode and our stub page table
    // CR0: PG, PE
    core->ctrl_regs.cr0 = 0x80000001;
    core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0;

    // CR2: don't care (output from #PF)
    // CR3: set to our PML4E, without setting PCD or PWT
    get_pt_loc(core->vm_info, &base,&limit);
    core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base);  // not offset as this is a GPA
    core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3;

    // CR4: PGE, PAE, PSE (last byte: 1 0 1 1 0 0 0 0)
    core->ctrl_regs.cr4 = 0xb0;
    core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4;
    // CR8 as usual
    // RFLAGS zeroed is fine: come in with interrupts off
    // EFER needs SVME LMA LME (last 16 bits: 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0
    core->ctrl_regs.efer = 0x1d00;
    core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer;


    /* 
       Notes on selectors:

       selector is 13 bits of index, 1 bit table indicator 
       (0=>GDT), 2 bit RPL
       
       index is scaled by 8, even in long mode, where some entries 
       are 16 bytes long.... 
          -> code, data descriptors have 8 byte format
             because base, limit, etc, are ignored (no segmentation)
          -> interrupt/trap gates have 16 byte format 
             because offset needs to be 64 bits
    */
    
    // Install our stub IDT
    // NOTE(review): present=0 on the idtr/gdtr pseudo-descriptors below —
    // only base+limit are consumed for these; confirm the other flag
    // fields are ignored on load.
    get_idt_loc(core->vm_info, &base,&limit);
    base += gva_offset;
    core->segments.idtr.selector = 0;  // entry 0 (NULL) of the GDT
    core->segments.idtr.base = (addr_t) base;  // only base+limit are used
    core->segments.idtr.limit = limit-1;
    core->segments.idtr.type = 0x0;
    core->segments.idtr.system = 0; 
    core->segments.idtr.dpl = 0;
    core->segments.idtr.present = 0;
    core->segments.idtr.long_mode = 0;

    // Install our stub GDT
    get_gdt_loc(core->vm_info, &base,&limit);
    base += gva_offset;
    core->segments.gdtr.selector = 0;  // entry 0 (NULL) of the GDT
    core->segments.gdtr.base = (addr_t) base;
    core->segments.gdtr.limit = limit-1;   // only base+limit are used
    core->segments.gdtr.type = 0x0;
    core->segments.gdtr.system = 0; 
    core->segments.gdtr.dpl = 0;
    core->segments.gdtr.present = 0;
    core->segments.gdtr.long_mode = 0;
    
    // And our TSS
    get_tss_loc(core->vm_info, &base,&limit);
    base += gva_offset;  
    core->segments.tr.selector = 0;
    core->segments.tr.base = (addr_t) base;
    core->segments.tr.limit = limit-1;
    core->segments.tr.type = 0x9;
    core->segments.tr.system = 0;   // available 64 bit TSS 
    core->segments.tr.dpl = 0;
    core->segments.tr.present = 1;
    core->segments.tr.long_mode = 0; // not used
    
    base = 0x0; // these are not offset as we want to make all gvas visible
    limit = -1;

    // And CS
    core->segments.cs.selector = 0x8 ; // entry 1 of GDT (RPL=0)
    core->segments.cs.base = (addr_t) base;   // not used
    core->segments.cs.limit = limit;          // not used
    core->segments.cs.type = 0xe;             // only C is used
    core->segments.cs.system = 1;             // not a system segment
    core->segments.cs.dpl = 0;                       
    core->segments.cs.present = 1;
    core->segments.cs.long_mode = 1;

    // DS, SS, etc are identical
    core->segments.ds.selector = 0x10; // entry 2 of GDT (RPL=0)
    core->segments.ds.base = (addr_t) base;
    core->segments.ds.limit = limit;
    core->segments.ds.type = 0x6;            // ignored
    core->segments.ds.system = 1;            // not a system segment
    core->segments.ds.dpl = 0;
    core->segments.ds.present = 1;
    core->segments.ds.long_mode = 1;
    
    memcpy(&core->segments.ss,&core->segments.ds,sizeof(core->segments.ds));
    memcpy(&core->segments.es,&core->segments.ds,sizeof(core->segments.ds));
    memcpy(&core->segments.fs,&core->segments.ds,sizeof(core->segments.ds));
    memcpy(&core->segments.gs,&core->segments.ds,sizeof(core->segments.ds));
    

    // reset paging here for shadow... 

    if (core->shdw_pg_mode != NESTED_PAGING) { 
        PrintError(core->vm_info, core, "hvm: shadow paging guest... this will end badly\n");
        return -1;
    }


    return 0;
}
1742
/*
 * Handle a reset request on behalf of an HVM core.
 *
 * Returns 0 when this call is a no-op for the core (the core is not in
 * CORE_RESETTING, the VM is not an HVM, or the core is a ROS core — ROS
 * cores are reset via the normal, non-HVM reset path).  For HRT cores,
 * returns 1 on a successful HRT reset, or the negative accumulated error
 * code on failure.
 *
 * HRT reset protocol (all HRT cores execute this in lockstep):
 *   1. rendezvous at reset_barrier so no HRT core is still executing
 *      old state when re-imaging begins;
 *   2. the leader (first_hrt_core) marks the VM resetting and rebuilds
 *      the VM-wide boot image; every core then rebuilds its own
 *      per-core boot state;
 *   3. rendezvous at reset_barrier again so all cores resume together.
 */
int v3_handle_hvm_reset(struct guest_info *core)
{

    // Only act if this core has actually been placed into reset.
    if (core->core_run_state != CORE_RESETTING) { 
        return 0;
    }

    // Non-HVM VMs are handled entirely by the generic reset path.
    if (!core->vm_info->hvm_state.is_hvm) { 
        return 0;
    }

    if (v3_is_hvm_hrt_core(core)) { 
        // this is an HRT reset
        int rc=0;

        // wait for all the HRT cores
        v3_counting_barrier(&core->vm_info->reset_barrier);

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) { 
            // I am leader
            core->vm_info->run_state = VM_RESETTING;
        }

        core->core_run_state = CORE_RESETTING;

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
            // we really only need to clear the bss
            // and recopy the .data, but for now we'll just
            // do everything
            // NOTE(review): rc |= accumulation assumes failures return -1
            // (all bits set) so the final sign test works — confirm the
            // setup functions never return other negative codes mixed
            // with positive values.
            rc |= v3_setup_hvm_vm_for_boot(core->vm_info);

            if (rc) { 
                PrintError(core->vm_info,core,"hvm: failed to setup HVM VM for boot rc=%d\n",rc);
            }
        }

        // now everyone is ready to reset
        // (every HRT core, leader included, rebuilds its own core state)
        rc |= v3_setup_hvm_hrt_core_for_boot(core);

        if (rc) { 
            PrintError(core->vm_info,core,"hvm: failed to setup HVM core for boot rc=%d\n",rc);
        }

        // NOTE(review): the core is marked running even when rc indicates
        // a setup failure; all cores must still reach the second barrier,
        // but resuming a half-initialized core looks questionable — verify
        // intended failure semantics before changing.
        core->core_run_state = CORE_RUNNING;

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) { 
            // leader
            core->vm_info->run_state = VM_RUNNING;
            core->vm_info->hvm_state.trans_state = HRT_IDLE;
        }

        // second rendezvous: no core resumes until every core has
        // finished reconfiguring itself
        v3_counting_barrier(&core->vm_info->reset_barrier);

        if (rc<0) { 
            PrintError(core->vm_info,core,"hvm: reset failed\n");
            return rc;
        } else {
            return 1;
        }

    } else { 
        // ROS core will be handled by normal reset functionality
        return 0;
    }
}