Palacios Public Git Repository

To checkout Palacios execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git
This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, simply execute
  cd palacios
  git checkout --track -b devel origin/devel
The other branches are similar.


HVM Enhancements + Bug Fixes
[palacios.git] / palacios / src / palacios / vmm_hvm.c
1 /* 
2  * This file is part of the Palacios Virtual Machine Monitor developed
3  * by the V3VEE Project with funding from the United States National 
4  * Science Foundation and the Department of Energy.  
5  *
6  * The V3VEE Project is a joint project between Northwestern University
7  * and the University of New Mexico.  You can find out more at 
8  * http://www.v3vee.org
9  *
10  * Copyright (c) 2015, The V3VEE Project <http://www.v3vee.org> 
11  * All rights reserved.
12  *
13  * Author:  Peter Dinda <pdinda@northwestern.edu>
14  *
15  * This is free software.  You are permitted to use,
16  * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
17  */
18
19 #include <palacios/vmm_mem.h>
20 #include <palacios/vmm.h>
21 #include <palacios/vmm_util.h>
22 #include <palacios/vmm_emulator.h>
23 #include <palacios/vm_guest.h>
24 #include <palacios/vmm_debug.h>
25 #include <palacios/vmm_hypercall.h>
26
27 #include <palacios/vmm_xml.h>
28
29 #include <palacios/vm_guest_mem.h>
30
31 #include <palacios/vmm_debug.h>
32
33
/*
  Save area used to hand the ROS GDT (plus segment state) over to the
  HRT.  Hypercall 0x51 fills one of these and copies it into guest
  memory at the gva the HRT registered via hypercall 0x52, so the
  layout must match the HRT side byte-for-byte - hence the packing.
*/
struct gdt_area {
    struct {
        uint16_t limit;
        uint64_t base;
    } __attribute__((packed)) gdtr;   // lgdt-style pseudo-descriptor for the copied GDT

    uint64_t fsbase;   // FS base requested by the ROS (a2 of hypercall 0x51)
    uint16_t cs;       // ROS segment selectors captured at the time of the call
    uint16_t ds;
    uint16_t es;
    uint16_t fs;
    uint16_t gs;
    uint16_t ss;

    uint64_t gdt[0];   // trailing copy of the GDT itself (old-style flexible array member)
} __attribute__((packed));
50
51
52 /*
53
54   MEM     = Total size of memory in the GPA (in MB)
55   ROS_MEM = Total size of memory for the ROS (in MB) (<RAM)
56
57   GPAs [0,ROS_MEM) are what the ROS sees
58   GPAs [ROS_MEM, MEM) are HRT only
59   GPAS [0,MEM) are accessible by the HRT
60
61   CORES   = Total number of cores in VM
62   ROS_CORES = Total number of cores for the ROS
63
64   Cores [0,ROS_CORES) are what the ROS sees
65   Cores [ROS_CORES,CORES) are HRT only
66   Cores [0,CORES) are accessible by the HRT
67
68   In a Pal file:
69
70   <files> 
71     <file id="hrtelf" filename="hrtelf.o" />
72   </files>
73
74   <mem ... >RAM</mem>   (MB)  Note these are  
75   <cores count="CORES" ...>   backward compatible
76
77   <hvm enable="y" >
78     <ros cores="ROS_CORES" mem="ROS_MEM" /> (MB)
79     <hrt file_id="hrtelf" />
80   </hvm>
81
82 */
83
84 #ifndef V3_CONFIG_DEBUG_HVM
85 #undef PrintDebug
86 #define PrintDebug(fmt, args...)
87 #endif
88
89
90 int v3_init_hvm()
91 {
92     PrintDebug(VM_NONE,VCORE_NONE, "hvm: init\n");
93     return 0;
94 }
95
96 int v3_deinit_hvm()
97 {
98     PrintDebug(VM_NONE,VCORE_NONE, "hvm: deinit\n");
99     return 0;
100 }
101
102 // ignore requests from when we are in the wrong state
103 #define ENFORCE_STATE_MACHINE 1
104
105 // invoke the HRT using one of the followng mechanisms
106 #define UPCALL_MAGIC_ADDRESS 0x0000800df00df00dULL
107 #define UPCALL_MAGIC_ERROR   0xf00df00d
108
109
110 static int magic_upcall(struct guest_info *core, uint64_t num)
111 {
112 #ifdef V3_CONFIG_HVM_UPCALL_MAGIC_GPF
113     PrintDebug(core->vm_info, core, "hvm: injecting magic #GP into core %llu\n",num);
114     if (v3_raise_exception_with_error(&core->vm_info->cores[num],
115                                       GPF_EXCEPTION, 
116                                       UPCALL_MAGIC_ERROR)) { 
117         PrintError(core->vm_info, core,"hvm: cannot inject HRT #GP to core %llu\n",num);
118         return -1;
119     } else {
120         return 0;
121     }
122 #endif
123
124 #ifdef V3_CONFIG_HVM_UPCALL_MAGIC_PF
125     PrintDebug(core->vm_info,core,"hvm: injecting magic #GP into core %llu\n",num);
126     core->vm_info->cores[num].ctrl_regs.cr2 = UPCALL_MAGIC_ADDRESS;
127     if (v3_raise_exception_with_error(&core->vm_info->cores[num],
128                                       PF_EXCEPTION, 
129                                       UPCALL_MAGIC_ERROR)) { 
130         PrintError(core->vm_info,core, "hvm: cannot inject HRT #PF to core %llu\n",num);
131         return -1;
132     } else {
133         return 0;
134     }
135 #endif
136 #ifdef V3_CONFIG_HVM_UPCALL_MAGIC_SWIN
137     PrintDebug(core->vm_info,core,"hvm: injecting SW intr 0x%u into core %llu\n",core->vm_info->hvm_info.hrt_int_vector,num);
138     if (v3_raise_swintr(&core->vm_info->cores[cur],core->vm_info->hvm_info-->hrt_int_vector)) { 
139         PrintError(core->vm_info,core, "hvm: cannot inject HRT interrupt to core %llu\n",cur);
140         return -1;
141     } else {
142         return 0;
143     }
144 #endif
145
146     PrintError(core->vm_info,core,"hvm: no upcall mechanism is enabled!\n");
147     return -1;
148 }
149
150
151 /*
152   64 bit only hypercall:
153
154   rax = hypercall number
155   rbx = 0x646464...
156   then args are:  rcx, rdx, rsi, rdi r8, r9, r10, r11
157   rcx = 1st arg
158 */
/*
  Hypercall dispatcher for HVM (hybrid ROS/HRT) requests.

  Calling convention (64-bit only, see comment above): the guest puts
  0x6464... in rbx, the operation code in rcx (a1), and up to two
  arguments in rdx (a2) and rsi (a3).  The per-operation status is
  returned to the guest in rax (0 = success, (uint64_t)-1 = failure);
  the function itself always returns 0 to the hypercall framework.

  The entire handler runs under the per-VM hypercall lock with
  interrupts saved/disabled, serializing the h->trans_state /
  h->trans_count state machine across cores.
*/
static int hvm_hcall_handler(struct guest_info * core , hcall_id_t hcall_id, void * priv_data)
{
    uint64_t c;
    uint64_t bitness = core->vm_regs.rbx;   // must be the 64-bit magic tag
    uint64_t a1 = core->vm_regs.rcx;        // operation code
    uint64_t a2 = core->vm_regs.rdx;        // first argument
    uint64_t a3 = core->vm_regs.rsi;        // second argument
    struct v3_vm_hvm *h = &core->vm_info->hvm_state;
    addr_t irq_state;

    // Let's be paranoid here
    irq_state = v3_lock_irqsave(h->hypercall_lock);

    // reject anything that is not tagged as a 64-bit hypercall
    if (bitness!=0x6464646464646464) { 
        PrintError(core->vm_info,core,"hvm: unable to handle non-64 bit hypercall\n");
        core->vm_regs.rax = -1;
        v3_unlock_irqrestore(h->hypercall_lock,irq_state);
        return 0;
    }

    switch (a1) {
        case 0x0:   // null - diagnostic ping that dumps cycle/exit counters
            
            rdtscll(c);
            
            V3_Print(core->vm_info,core, "hvm: received hypercall %x  rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n",
                     hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, core->hvm_state.last_boot_start, core->num_exits);
            //v3_print_core_telemetry(core);
            //    v3_print_guest_state(core);
            core->vm_regs.rax = 0;
            break;
            
        case 0x1: // reset ros
            PrintDebug(core->vm_info,core, "hvm: reset ROS\n");
            if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ROS,0)) { 
                PrintError(core->vm_info,core, "hvm: reset of ROS failed\n");
                core->vm_regs.rax = -1;
            } else {
                core->vm_regs.rax = 0;
            }
            break;

        case 0x2: // reset hrt
            PrintDebug(core->vm_info,core, "hvm: reset HRT\n");
            if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_HRT,0)) { 
                PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
                core->vm_regs.rax = -1;
            } else {
                core->vm_regs.rax = 0;
            }
            break;

        case 0x3: // reset both
            PrintDebug(core->vm_info,core, "hvm: reset ROS+HRT\n");
            // NOTE(review): error message below says "HRT" but this is an ALL reset
            if (v3_reset_vm_extended(core->vm_info,V3_VM_RESET_ALL,0)) { 
                PrintError(core->vm_info,core, "hvm: reset of HRT failed\n");
                core->vm_regs.rax = -1;
            } else {
                core->vm_regs.rax = 0;
            }
            break;
            
        case 0x8: // replace HRT image
            // a2 = gva of image
            // a3 = size of image
            PrintDebug(core->vm_info,core,"hvm: request replacement HRT image addr=0x%llx size=0x%llx\n",a2,a3);

            if (h->hrt_image) { 
                // delete old
                V3_VFree(h->hrt_image);
                h->hrt_image = 0;
            }

            h->hrt_image = V3_VMalloc(a3);

            if (!(h->hrt_image)) {
                PrintError(core->vm_info,core, "hvm: failed to allocate space for replacement image\n");
                core->vm_regs.rax = -1;
            } else {
                // copy the new image out of the caller's address space
                if (v3_read_gva_memory(core, a2, a3, (uint8_t*) h->hrt_image)!=a3) { 
                    PrintError(core->vm_info, core, "hvm: cannot read replacement image\n");
                    core->vm_regs.rax = -1;
                } else {
                    h->hrt_image_size = a3; 
                    core->vm_regs.rax = 0;
                }
            }

            if (core->vm_regs.rax) { 
                PrintError(core->vm_info,core,"hvm: Failed to replace HRT image\n");
            } else {
                PrintDebug(core->vm_info,core,"hvm: HRT image successfully replaced\n");
            }

            break;


        case 0xf: // get HRT state; also writes the current ROS event back to gva a2
            core->vm_regs.rax = h->trans_state;
            if (v3_write_gva_memory(core, a2, sizeof(h->ros_event), (uint8_t*) &h->ros_event)!=sizeof(h->ros_event)) { 
                PrintError(core->vm_info, core, "hvm: cannot write back ROS event state to %p - continuing\n",(void*)a2);
            }
            //PrintDebug(core->vm_info,core,"hvm: get HRT transaction state 0x%llx\n",core->vm_regs.rax);
            break;

        case 0x10: // post a new ROS event (read from gva a2); fails if one is pending
            PrintDebug(core->vm_info, core, "hvm: ROS event request\n");
            if (h->ros_event.event_type!=ROS_NONE) { 
                PrintError(core->vm_info, core, "hvm: ROS event is already in progress\n");
                core->vm_regs.rax = -1;
            } else {
                if (v3_read_gva_memory(core, a2, sizeof(h->ros_event), (uint8_t*)&h->ros_event)!=sizeof(h->ros_event)) { 
                    PrintError(core->vm_info, core, "hvm: cannot read ROS event from %p\n",(void*)a2);
                    core->vm_regs.rax = -1;
                } else {
                    core->vm_regs.rax = 0;
                    PrintDebug(core->vm_info, core, "hvm: copied new ROS event (type=%s)\n",
                               h->ros_event.event_type == ROS_PAGE_FAULT ? "page fault" : 
                               (h->ros_event.event_type == ROS_SYSCALL ? "syscall" : "none"));
                    
                }
            }

            break;

        case 0x1e: // ack result (HRT has read the result of the finished event)
            if (h->ros_event.event_type != ROS_DONE) {
                PrintError(core->vm_info, core, "hvm: cannot ack event result when not in ROS_DONE state\n");
                core->vm_regs.rax = -1;
            } else {
                h->ros_event.event_type=ROS_NONE;
                PrintDebug(core->vm_info, core, "hvm: HRT core acks event result\n");
                core->vm_regs.rax = 0;
            }
            break;

        case 0x1f: // ROS event completed; a2 carries the result code
            // NOTE(review): rax is not set on this path - caller sees its own prior rax
            PrintDebug(core->vm_info, core, "hvm: completion of ROS event (rc=0x%llx)\n",a2);
            h->ros_event.event_type=ROS_DONE;
            h->ros_event.last_ros_event_result = a2;
            break;

        case 0x20: // invoke function (ROS->HRT)
        case 0x21: // invoke parallel function (ROS->HRT)
            if (v3_is_hvm_hrt_core(core)) { 
                PrintError(core->vm_info,core,"hvm: %s function invocation not supported from HRT core\n", a1==0x20 ? "" : "parallel");
                core->vm_regs.rax = -1;
            } else {
                if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { 
                    PrintError(core->vm_info,core, "hvm: cannot invoke %s function %p in state %d\n",a1==0x20 ? "" : "parallel", (void*)a2,h->trans_state);
                    core->vm_regs.rax = -1;
                } else {
                    // pass the request through the shared communication page
                    uint64_t *page = (uint64_t *) h->comm_page_hva;
                    uint64_t first, last, cur;

                    PrintDebug(core->vm_info,core, "hvm: %s invoke function %p\n",a1==0x20 ? "" : "parallel",(void*)a2);
                    page[0] = a1;
                    page[1] = a2;

                    // 0x20 targets only the HRT BSP; 0x21 targets all HRT cores
                    if (a1==0x20) { 
                        first=last=h->first_hrt_core;
                    } else {
                        first=h->first_hrt_core;
                        last=core->vm_info->num_cores-1;
                    }

                    core->vm_regs.rax = 0;

                    // number of completions (hypercall 0x2f) we expect back
                    h->trans_count = last-first+1;

                    for (cur=first;cur<=last;cur++) { 
                        if (magic_upcall(core,cur)) {
                            core->vm_regs.rax = -1;
                            break;
                        }
                        // Force core to exit now
                        v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
                    }
                    if (core->vm_regs.rax==0) { 
                        if (a1==0x20) { 
                            h->trans_state = HRT_CALL;
                        } else {
                            h->trans_state = HRT_PARCALL;
                        }
                    }  else {
                        PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
                        h->trans_state = HRT_IDLE;
                        h->trans_count = 0;
                    }
                }
            }
            break;


        case 0x28: // setup for synchronous operation (ROS->HRT)
        case 0x29: // teardown for synchronous operation (ROS->HRT)
            if (v3_is_hvm_hrt_core(core)) { 
                PrintError(core->vm_info,core,"hvm: %ssynchronization invocation not supported from HRT core\n",a1==0x29 ? "de" : "");
                core->vm_regs.rax = -1;
            } else {
                // setup is legal only from IDLE, teardown only from SYNC
                if (ENFORCE_STATE_MACHINE && 
                    ((a1==0x28 && h->trans_state!=HRT_IDLE) || (a1==0x29 && h->trans_state!=HRT_SYNC))) { 
                    PrintError(core->vm_info,core, "hvm: cannot invoke %ssynchronization in state %d\n",a1==0x29 ? "de" : "", h->trans_state);
                    core->vm_regs.rax = -1;
                } else {
                    uint64_t *page = (uint64_t *) h->comm_page_hva;
                    uint64_t first, last, cur;

                    PrintDebug(core->vm_info,core, "hvm: invoke %ssynchronization on address %p\n",a1==0x29 ? "de" : "",(void*)a2);
                    page[0] = a1;
                    page[1] = a2;

                    first=last=h->first_hrt_core;  // initially we will sync only with BSP

                    core->vm_regs.rax = 0;

                    h->trans_count = last-first+1;

                    for (cur=first;cur<=last;cur++) { 
                        
                        if (magic_upcall(core,cur)) { 
                            core->vm_regs.rax = -1;
                            break;
                        }
                        // Force core to exit now
                        v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
                          
                    }
                    if (core->vm_regs.rax==0) { 
                        if (a1==0x28) { 
                            h->trans_state = HRT_SYNCSETUP;
                        } else {
                            h->trans_state = HRT_SYNCTEARDOWN;                      
                        }
                    }  else {
                        PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT call failure\n");
                        h->trans_state = HRT_IDLE;
                        h->trans_count = 0;
                    }
                }
            }
            break;

        case 0x2f: // function exec or sync done (HRT->ROS completion)
            if (v3_is_hvm_ros_core(core)) { 
                PrintError(core->vm_info,core, "hvm: request for exec or sync done from ROS core\n");
                core->vm_regs.rax=-1;
            } else {
                if (ENFORCE_STATE_MACHINE && 
                    h->trans_state!=HRT_CALL && 
                    h->trans_state!=HRT_PARCALL && 
                    h->trans_state!=HRT_SYNCSETUP &&
                    h->trans_state!=HRT_SYNCTEARDOWN) {
                    PrintError(core->vm_info,core,"hvm: function or sync completion when not in HRT_CALL, HRT_PARCALL, HRT_SYNCSETUP, or HRT_SYNCTEARDOWN state\n");
                    core->vm_regs.rax=-1;
                } else {
                    uint64_t one=1;
                    PrintDebug(core->vm_info,core, "hvm: function or sync complete\n");
                    // atomically count down outstanding completions; the last
                    // finisher advances the state machine
                    if (__sync_fetch_and_sub(&h->trans_count,one)==1) {
                        // last one, switch state
                        if (h->trans_state==HRT_SYNCSETUP) { 
                            h->trans_state=HRT_SYNC;
                            PrintDebug(core->vm_info,core, "hvm: function complete - now synchronous\n");
                        } else {
                            h->trans_state=HRT_IDLE;
                        }
                    }
                    core->vm_regs.rax=0;
                }
            }
                    
            break;

        case 0x30: // merge address space
        case 0x31: // unmerge address space
            if (v3_is_hvm_hrt_core(core)) { 
                PrintError(core->vm_info,core,"hvm: request to %smerge address space from HRT core\n", a1==0x30 ? "" : "un");
                core->vm_regs.rax=-1;
            } else {
                if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { 
                    PrintError(core->vm_info,core,"hvm: request to %smerge address space in non-idle state (%d)\n",a1==0x30 ? "" : "un", h->trans_state);
                    core->vm_regs.rax=-1;
                } else {
                    uint64_t *page = (uint64_t *) h->comm_page_hva;

                    PrintDebug(core->vm_info,core,"hvm: %smerge address space request with %p\n",a1==0x30 ? "" : "un",(void*)core->ctrl_regs.cr3);
                    // should sanity check to make sure guest is in 64 bit without anything strange

                    page[0] = a1;
                    page[1] = core->ctrl_regs.cr3;  // this is a do-not-care for an unmerge

                    core->vm_regs.rax = 0;

                    h->trans_state = HRT_MERGE;

                    // only the HRT BSP participates in a merge/unmerge
                    if (magic_upcall(core,h->first_hrt_core)) {
                        core->vm_regs.rax = -1;
                        break;
                    }

                    // Force core to exit now
                    v3_interrupt_cpu(core->vm_info,core->vm_info->cores[h->first_hrt_core].pcpu_id,0);

                }
                
            }
                
            break;
            

        case 0x3f: // merge operation done (HRT->ROS completion)
            if (v3_is_hvm_ros_core(core)) { 
                PrintError(core->vm_info,core, "hvm: request for merge done from ROS core\n");
                core->vm_regs.rax=-1;
            } else {
                if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_MERGE) {
                    PrintError(core->vm_info,core,"hvm: merge/unmerge done when in non-idle state\n");
                    core->vm_regs.rax=-1;
                } else {
                    PrintDebug(core->vm_info,core, "hvm: merge or unmerge complete - back to idle\n");
                    h->trans_state=HRT_IDLE;
                    core->vm_regs.rax=0;
                }
            }
                    
            break;
            
        case 0x40: // install or remove signal handler (a2=handler, a3=stack; 0/0 removes)
            if (v3_is_hvm_hrt_core(core)) { 
                PrintError(core->vm_info,core, "hvm: HRT cannot install signal handler...\n");
                core->vm_regs.rax=-1;
            } else {
                PrintDebug(core->vm_info,core,"hvm: install signal handler for CR3=%p, handler=%p, stack=%p\n",(void*)core->ctrl_regs.cr3, (void*)a2, (void*)a3);
                if (h->ros_signal.code) { 
                    PrintError(core->vm_info,core,"hvm: signal is pending...\n");
                    core->vm_regs.rax=-1;
                } else {
                    if ((a2 || a3) && (h->ros_signal.handler || h->ros_signal.stack)) { 
                        PrintError(core->vm_info,core,"hvm: attempt to replace existing handler without removing it first\n");
                        core->vm_regs.rax=-1;
                    } else {
                        // actually make the change
                        h->ros_signal.handler=a2;
                        h->ros_signal.stack=a3;
                        h->ros_signal.cr3=core->ctrl_regs.cr3;
                        core->vm_regs.rax=0;

                        // test by signalling back a hello 
                        // if (a2 && a3) { 
                        //    v3_hvm_signal_ros(core->vm_info,0xf00d);
                        //}
                    }
                }
            }
            break;

        case 0x41: // raise signal in the ROS from HRT or ROS
            PrintDebug(core->vm_info,core,"hvm: HRT raises signal code=0x%llx\n", a2);
            core->vm_regs.rax = v3_hvm_signal_ros(core->vm_info,a2);
            break;

        case 0x51: // fill GDT area (ROS only; copies ROS GDT/segment state to the HRT's registered save area)
            if (v3_is_hvm_hrt_core(core)) {
                PrintError(core->vm_info, core, "hvm: HRT cannot request a GDT area fill\n");
                core->vm_regs.rax = -1;
            } else {
                struct guest_info * hrt_core = &core->vm_info->cores[h->first_hrt_core];
                // temporary host-side staging buffer: header + full GDT copy
                struct gdt_area * area = V3_Malloc(sizeof(struct gdt_area) + core->segments.gdtr.limit);
                if (!area) {
                    PrintError(core->vm_info, core, "hvm: could not allocate GDT area\n");
                    core->vm_regs.rax = -1;
                    break;
                }

                PrintDebug(core->vm_info, core, "hvm: ROS requests to fill GDT area with fsbase=%p\n", (void*)a2);

                // the HRT must have registered a save area first (hypercall 0x52)
                if (!h->hrt_gdt_gva) {
                    PrintError(core->vm_info, core, "hvm: HRT has not registered a GDT state save area\n");
                    core->vm_regs.rax = -1;
                    V3_Free(area);
                    break;
                }

                // the copied GDT lands immediately after the header in the save area
                area->gdtr.base  = h->hrt_gdt_gva + sizeof(struct gdt_area);
                area->gdtr.limit = core->segments.gdtr.limit;
                area->fsbase     = a2;
                area->cs         = core->segments.cs.selector;
                area->ds         = core->segments.ds.selector;
                area->es         = core->segments.es.selector;
                area->fs         = core->segments.fs.selector;
                area->gs         = core->segments.gs.selector;
                area->ss         = core->segments.ss.selector;
                
                if (v3_read_gva_memory(core, 
                                       core->segments.gdtr.base,
                                       core->segments.gdtr.limit,
                                       (uint8_t*)area->gdt) != core->segments.gdtr.limit) {
                    PrintError(core->vm_info, core, "hvm: could not copy GDT from ROS\n");
                    core->vm_regs.rax = -1;
                    V3_Free(area);
                    break;
                }
                                        
                uint_t area_size = sizeof(struct gdt_area) + core->segments.gdtr.limit;

                // copy the entire area over
                PrintDebug(core->vm_info, core, "hvm: copying %u bytes into GDT area\n", area_size);

                if (v3_write_gva_memory(hrt_core, h->hrt_gdt_gva, area_size, (uchar_t*)area) != area_size) {
                    PrintError(core->vm_info, core, "hvm: could not copy GDT area\n");
                    core->vm_regs.rax = -1;
                    V3_Free(area);
                    break;
                }

                if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { 
                    PrintError(core->vm_info,core, "hvm: cannot sync GDT in state %d\n", h->trans_state);
                    core->vm_regs.rax = -1;
                    V3_Free(area);
                    break;
                } else {
                    // tell the HRT BSP (via the comm page + upcall) to load the new GDT
                    uint64_t *page = (uint64_t *) h->comm_page_hva;
                    uint64_t first, last, cur;

                    PrintDebug(core->vm_info,core, "hvm: sync GDT\n");
                    page[0] = a1;
                    page[1] = h->hrt_gdt_gva;
                    page[2] = a3;

                    first=last=h->first_hrt_core;
                    
                    core->vm_regs.rax = 0;
                    
                    h->trans_count = last-first+1;

                    for (cur=first;cur<=last;cur++) { 
                        if (magic_upcall(core,cur)) {
                            core->vm_regs.rax = -1;
                            break;
                        }
                        // Force core to exit now
                        v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
                    }
                    
                    if (core->vm_regs.rax==0) { 
                        h->trans_state = HRT_GDTSYNC;
                    }  else {
                        PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT GDT SYNC failure\n");
                        h->trans_state = HRT_IDLE;
                        h->trans_count = 0;
                    }

                    V3_Free(area);

                }
                
            }
            
            break;
        
        case 0x52: // register HRT GDT area (HRT records gva a2 for later 0x51 fills)
            if (!v3_is_hvm_hrt_core(core)) {
                PrintError(core->vm_info, core, "hvm: ROS cannot install a GDT area\n"); 
                core->vm_regs.rax = -1;
            } else {
                PrintDebug(core->vm_info, core, "hvm: HRT registers GDT save area at gva=%p\n", (void*)a2);
                h->hrt_gdt_gva = a2;
                core->vm_regs.rax = 0;
            }

        // NOTE(review): this dump runs even on the error path above, and
        // dumps the calling core's GDT, whichever core that is
        PrintDebug(core->vm_info, core, "hvm: Printing current HRT GDT...\n");
#ifdef V3_CONFIG_DEBUG_HVM
        v3_print_gdt(core, core->segments.gdtr.base);
#endif
        
        break;
        
        case 0x53: // restore GDT (ROS asks HRT to go back to its original GDT)

            if (v3_is_hvm_hrt_core(core)) {
                PrintError(core->vm_info, core, "hvm: HRT cannot request GDT restoration\n");
                core->vm_regs.rax = -1;
                break;
            } else {
                PrintDebug(core->vm_info, core, "hvm: ROS requesting to restore original GDT\n");
                core->vm_regs.rax = 0;
            }
            
            if (ENFORCE_STATE_MACHINE && h->trans_state!=HRT_IDLE) { 
                PrintError(core->vm_info,core, "hvm: cannot sync GDT in state %d\n", h->trans_state);
                core->vm_regs.rax = -1;
                break;
            } else {
                uint64_t *page = (uint64_t *) h->comm_page_hva;
                uint64_t first, last, cur;
                
                PrintDebug(core->vm_info,core, "hvm: restore GDT\n");
                page[0] = a1;
                
                first=last=h->first_hrt_core;
                
                core->vm_regs.rax = 0;
                
                h->trans_count = last-first+1;
                
                for (cur=first;cur<=last;cur++) { 
                    if (magic_upcall(core,cur)) {
                        core->vm_regs.rax = -1;
                        break;
                    }
                    // Force core to exit now
                    v3_interrupt_cpu(core->vm_info,core->vm_info->cores[cur].pcpu_id,0);
                }
                
                if (core->vm_regs.rax==0) { 
                    h->trans_state = HRT_GDTSYNC;
                }  else {
                    PrintError(core->vm_info,core,"hvm: in inconsistent state due to HRT GDT SYNC failure\n");
                    h->trans_state = HRT_IDLE;
                    h->trans_count = 0;
                }
            }
            
            break;
            
        case 0x5f: // GDT sync operation done (HRT->ROS completion)
            if (v3_is_hvm_ros_core(core)) { 
                PrintError(core->vm_info,core, "hvm: invalid request for GDT sync done from ROS core\n");
                core->vm_regs.rax=-1;
            } else {
                if (ENFORCE_STATE_MACHINE && h->trans_state != HRT_GDTSYNC) {
                    PrintError(core->vm_info,core,"hvm: GDT sync done when in incorrect state (%d)\n", h->trans_state);
                    core->vm_regs.rax=-1;
                } else {
                    PrintDebug(core->vm_info,core, "hvm: GDT sync complete - back to idle\n");
                    PrintDebug(core->vm_info, core, "hvm: Dumping new HRT GDT...\n");
#ifdef V3_CONFIG_DEBUG_HVM
                    v3_print_gdt(core, core->segments.gdtr.base);
#endif
                    h->trans_state=HRT_IDLE;
                    core->vm_regs.rax=0;
                }
                
            }
            break;

        default:
            PrintError(core->vm_info,core,"hvm: unknown hypercall %llx\n",a1);
            core->vm_regs.rax=-1;
            break;
    }
                
    v3_unlock_irqrestore(h->hypercall_lock,irq_state);
    return 0;
}
714
715
716 #define CEIL_DIV(x,y) (((x)/(y)) + !!((x)%(y)))
717
// Parse the optional <hvm> subtree of the VM configuration and initialize
// vm->hvm_state.  If HVM is absent or disabled the VM is a pure ROS VM
// (all cores and memory belong to the ROS).  Otherwise the core and GPA
// spaces are split at first_hrt_core / first_hrt_gpa, the HRT image file
// is located, and the HVM hypercall + its lock are set up.
// Returns 0 on success (including the pure-ROS case), -1 on a malformed
// HVM configuration.
int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
{
    v3_cfg_tree_t *hvm_config;
    v3_cfg_tree_t *ros_config;
    v3_cfg_tree_t *hrt_config;
    char *enable;
    char *ros_cores;
    char *ros_mem;
    char *hrt_file_id=0;

    PrintDebug(vm, VCORE_NONE, "hvm: vm init\n");

    /* 
       Defaults - all ROS
    */
    memset(&vm->hvm_state,0,sizeof(struct v3_vm_hvm));
    vm->hvm_state.is_hvm=0;
    // default split points place everything in the ROS
    vm->hvm_state.first_hrt_core=vm->num_cores;
    vm->hvm_state.first_hrt_gpa=vm->mem_size;

    if (!config || !(hvm_config=v3_cfg_subtree(config,"hvm"))) {
        PrintDebug(vm,VCORE_NONE,"hvm: no HVM configuration found (all HW is ROS)\n");
        goto out_ok;
    }
    
    if (!(enable=v3_cfg_val(hvm_config,"enable")) || strcasecmp(enable,"y")) {
        PrintDebug(vm,VCORE_NONE,"hvm: HVM configuration disabled (all HW is ROS)\n");
        goto out_ok;
    }

    if (!(ros_config=v3_cfg_subtree(hvm_config,"ros"))) { 
        PrintError(vm,VCORE_NONE,"hvm: HVM configuration without ROS block...\n");
        return -1;
    }
 
    if (!(ros_cores=v3_cfg_val(ros_config,"cores"))) { 
        PrintError(vm,VCORE_NONE,"hvm: ROS block without cores...\n");
        return -1;
    }
   
    // cores [0, first_hrt_core) are ROS; the rest are HRT
    // NOTE(review): atoi() yields 0 on unparsable input; config is
    // assumed well-formed here (see the "XXX sanity check" below)
    vm->hvm_state.first_hrt_core = ((uint32_t)atoi(ros_cores));
    
    if (!(ros_mem=v3_cfg_val(ros_config,"mem"))) { 
        PrintError(vm,VCORE_NONE,"hvm: ROS block without mem...\n");
        return -1;
    }

    // "mem" is in MB; GPAs [0, first_hrt_gpa) belong to the ROS
    vm->hvm_state.first_hrt_gpa = ((uint64_t)atoi(ros_mem))*1024*1024;

    if (!(hrt_config=v3_cfg_subtree(hvm_config,"hrt"))) { 
        PrintError(vm,VCORE_NONE,"hvm: HVM configuration without HRT block...\n");
        return -1;
    }
 
    if (!(hrt_file_id=v3_cfg_val(hrt_config,"file_id"))) { 
        PrintError(vm,VCORE_NONE,"hvm: HRT block without file_id...\n");
        return -1;
    }

    vm->hvm_state.hrt_file = v3_cfg_get_file(vm,hrt_file_id);
    
    if (!vm->hvm_state.hrt_file) { 
        PrintError(vm,VCORE_NONE,"hvm: HRT block contains bad file_id (%s)\n",hrt_file_id);
        return -1;
    }

    if (v3_register_hypercall(vm, HVM_HCALL, 
                              hvm_hcall_handler, 0)) { 
        PrintError(vm,VCORE_NONE, "hvm: cannot register hypercall....\n");
        return -1;
    }

    // serializes hvm_hcall_handler invocations across cores
    v3_lock_init(&(vm->hvm_state.hypercall_lock));

    // XXX sanity check config here

    vm->hvm_state.is_hvm=1;

 out_ok:
    if (vm->hvm_state.is_hvm) {
        V3_Print(vm,VCORE_NONE,"hvm: [ROS: cores 0..%u, mem 0..%p] [HRT: cores %u..%u, mem %p..%p, file_id=%s (tag %s)]\n",
                 vm->hvm_state.first_hrt_core-1,
                 (void*) vm->hvm_state.first_hrt_gpa-1,
                 vm->hvm_state.first_hrt_core,
                 vm->num_cores-1,
                 (void*) vm->hvm_state.first_hrt_gpa,
                 (void*)vm->mem_size-1,
                 hrt_file_id,
                 vm->hvm_state.hrt_file->tag);
    } else {
        V3_Print(vm,VCORE_NONE,"hvm: This is a pure ROS VM\n");
    }
    return 0;
    
}
813
814
815 int v3_deinit_hvm_vm(struct v3_vm_info *vm)
816 {
817     PrintDebug(vm, VCORE_NONE, "hvm: HVM VM deinit\n");
818
819
820     if (vm->hvm_state.hrt_image) { 
821         V3_VFree(vm->hvm_state.hrt_image);
822         vm->hvm_state.hrt_image=0;
823         vm->hvm_state.hrt_image_size=0;
824     }
825
826     v3_remove_hypercall(vm,HVM_HCALL);
827
828     v3_lock_deinit(&(vm->hvm_state.hypercall_lock));
829
830     if (vm->hvm_state.comm_page_hpa) { 
831         struct v3_mem_region *r = v3_get_mem_region(vm,-1,(addr_t)vm->hvm_state.comm_page_hpa);
832         if (!r) { 
833             PrintError(vm,VCORE_NONE,"hvm: odd, VM has comm_page_hpa, but no shadow memory\n");
834         } else {
835             v3_delete_mem_region(vm,r);
836         }
837     }
838
839     return 0;
840 }
841
842 int v3_init_hvm_core(struct guest_info *core)
843 {
844     memset(&core->hvm_state,0,sizeof(core->hvm_state));
845     if (core->vm_info->hvm_state.is_hvm) { 
846         if (core->vcpu_id >= core->vm_info->hvm_state.first_hrt_core) { 
847             core->hvm_state.is_hrt=1;
848         }
849     }
850     return 0;
851 }
852
// Per-core HVM teardown.  No per-core HVM state currently needs
// explicit release; this only logs.  Always returns 0.
int v3_deinit_hvm_core(struct guest_info *core)
{
    PrintDebug(core->vm_info, VCORE_NONE, "hvm: HVM core deinit\n");

    return 0;
}
859
860
861 uint64_t v3_get_hvm_ros_memsize(struct v3_vm_info *vm)
862 {
863     if (vm->hvm_state.is_hvm) { 
864         return vm->hvm_state.first_hrt_gpa;
865     } else {
866         return vm->mem_size;
867     }
868 }
// The HRT's physical address space always extends to the top of guest
// memory (the HRT can see both the ROS portion and its own).
uint64_t v3_get_hvm_hrt_memsize(struct v3_vm_info *vm)
{
    return vm->mem_size;
}
873
874 uint32_t v3_get_hvm_ros_cores(struct v3_vm_info *vm)
875 {
876     if (vm->hvm_state.is_hvm) { 
877         return vm->hvm_state.first_hrt_core;
878     } else {
879         return vm->num_cores;
880     }
881 }
882
883 uint32_t v3_get_hvm_hrt_cores(struct v3_vm_info *vm)
884 {
885     if (vm->hvm_state.is_hvm) { 
886         return vm->num_cores - vm->hvm_state.first_hrt_core;
887     } else {
888         return 0;
889     }
890 }
891
892
893 int v3_is_hvm_ros_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
894 {
895     if (vm->hvm_state.is_hvm) { 
896         return gpa<vm->hvm_state.first_hrt_gpa;
897     } else {
898         return 1;
899     }
900 }
901
902 int v3_is_hvm_hrt_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
903 {
904     if (vm->hvm_state.is_hvm) { 
905         return gpa>=vm->hvm_state.first_hrt_gpa && gpa<vm->mem_size;
906     } else {
907         return 0;
908     }
909 }
910
// Nonzero iff this core was assigned to the HRT at core init time.
int v3_is_hvm_hrt_core(struct guest_info *core)
{
    return core->hvm_state.is_hrt;
}
915
// Nonzero iff this core belongs to the ROS (i.e. is not an HRT core).
int v3_is_hvm_ros_core(struct guest_info *core)
{
    return !core->hvm_state.is_hrt;
}
920
921 int      v3_hvm_should_deliver_ipi(struct guest_info *src, struct guest_info *dest)
922 {
923     if (!src) {
924         // ioapic or msi to apic
925         return !dest->hvm_state.is_hrt;
926     } else {
927         // apic to apic
928         return src->hvm_state.is_hrt || (!src->hvm_state.is_hrt && !dest->hvm_state.is_hrt) ;
929     }
930 }
931
932 void     v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_info *vm, 
933                                         uint32_t *start_apic, uint32_t *num_apics)
934 {
935     if (!core) { 
936         // Seen from ioapic, msi, etc: 
937         if (vm->hvm_state.is_hvm) {
938             // HVM VM shows only the ROS cores/apics to ioapic, msi, etc
939             *start_apic = 0;
940             *num_apics = vm->hvm_state.first_hrt_core;
941         } else {
942             // Non-HVM shows all cores/APICs to apic, msi, etc.
943             *start_apic = 0;
944             *num_apics = vm->num_cores;
945         }
946     } else {
947         // Seen from apic
948         if (core->hvm_state.is_hrt) { 
949             // HRT core/apic sees all apics
950             // (this policy may change...)
951             *start_apic = 0;
952             *num_apics = vm->num_cores;
953         } else {
954             // non-HRT core/apic sees only non-HRT cores/apics
955             *start_apic = 0 ;
956             *num_apics = vm->hvm_state.first_hrt_core;
957         }
958     }
959 }
960
961 #define MAX(x,y) ((x)>(y)?(x):(y))
962 #define MIN(x,y) ((x)<(y)?(x):(y))
963
964
// The HRT boot state (handler, IDT, GDT, TSS, page tables, MB info,
// scratch stacks) is laid out downward from the top of guest memory;
// this is its exclusive upper bound.
static uint64_t boot_state_end_addr(struct v3_vm_info *vm) 
{
    return PAGE_ADDR(vm->mem_size);
}
969    
// The null interrupt handler stub occupies the single topmost page of
// guest memory.
static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - PAGE_SIZE);
    *limit = PAGE_SIZE;
}
975
976 extern v3_cpu_arch_t v3_mach_type;
977
978 extern void *v3_hvm_svm_null_int_handler_start;
979 extern void *v3_hvm_svm_null_int_handler_end;
980 extern void *v3_hvm_vmx_null_int_handler_start;
981 extern void *v3_hvm_vmx_null_int_handler_end;
982
983 static void write_null_int_handler(struct v3_vm_info *vm)
984 {
985     void *base;
986     uint64_t limit;
987     void *data;
988     uint64_t len;
989
990     get_null_int_handler_loc(vm,&base,&limit);
991
992     switch (v3_mach_type) {
993 #ifdef V3_CONFIG_SVM
994         case V3_SVM_CPU:
995         case V3_SVM_REV3_CPU:
996             data = (void*) &v3_hvm_svm_null_int_handler_start;
997             len = (void*) &v3_hvm_svm_null_int_handler_end - data;
998             break;
999 #endif
1000 #if V3_CONFIG_VMX
1001         case V3_VMX_CPU:
1002         case V3_VMX_EPT_CPU:
1003         case V3_VMX_EPT_UG_CPU:
1004             data = (void*) &v3_hvm_vmx_null_int_handler_start;
1005             len = (void*) &v3_hvm_vmx_null_int_handler_end - data;
1006             break;
1007 #endif
1008         default:
1009             PrintError(vm,VCORE_NONE,"hvm: cannot determine CPU type to select null interrupt handler...\n");
1010             data = 0;
1011             len = 0;
1012     }
1013
1014     if (data) {
1015         v3_write_gpa_memory(&vm->cores[0],(addr_t)(base),len,(uint8_t*)data);
1016     }
1017
1018     PrintDebug(vm,VCORE_NONE,"hvm: wrote null interrupt handler at %p (%llu bytes)\n",base,len);
1019 }
1020
1021
// The IDT occupies the second page from the top of guest memory:
// 256 vectors x 16-byte long-mode gate descriptors = 4096 bytes.
static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    *base = (void*) PAGE_ADDR(boot_state_end_addr(vm) - 2 * PAGE_SIZE);
    *limit = 16*256;
}
1027
1028 // default IDT entries (int and trap gates)
1029 //
1030 // Format is 16 bytes long:
1031 //   16 offsetlo   => 0
1032 //   16 selector   => (target code selector) => 0x8 // entry 1 of GDT
1033 //    3 ist        => (stack) = 0 => current stack
1034 //    5 reserved   => 0
1035 //    4 type       => 0xe=>INT, 0xf=>TRAP 
1036 //    1 reserved   => 0  (indicates "system" by being zero)
1037 //    2 dpl        => 0
1038 //    1 present    => 1
1039 //   16 offsetmid  => 0
1040 //   32 offsethigh => 0   (total is a 64 bit offset)
1041 //   32 reserved   => 0
1042 //
1043 // 00 00 | 08 00 | 00 | 8[typenybble] | offsetmid | offsethigh | reserved
1044 // 
1045 // Note little endian
1046 //
static uint64_t idt64_trap_gate_entry_mask[2] = { 0x00008f0000080000, 0x0 } ; // type 0xf (trap), selector 0x8, present, offset 0
static uint64_t idt64_int_gate_entry_mask[2] =  { 0x00008e0000080000, 0x0 };  // type 0xe (int),  selector 0x8, present, offset 0
1049
// Build the HRT's initial IDT in guest memory.  Every vector points at
// the null interrupt handler stub: vectors 0-31 get trap gates, the
// rest get interrupt gates.  The 64-bit handler address is scattered
// into the three offset fields of each 16-byte gate descriptor.
static void write_idt(struct v3_vm_info *vm)
{
    void *base;
    uint64_t limit;
    void *handler;
    uint64_t handler_len;
    int i;
    uint64_t trap_gate[2];
    uint64_t int_gate[2];

    get_idt_loc(vm,&base,&limit);

    get_null_int_handler_loc(vm,&handler,&handler_len);

    // the IDT holds GVAs, so shift the handler's GPA by the gva offset
    handler += vm->hvm_state.gva_offset;

    // start from the template gates (type/selector/present already set)
    memcpy(trap_gate,idt64_trap_gate_entry_mask,16);
    memcpy(int_gate,idt64_int_gate_entry_mask,16);

    if (handler) {
        // update the entries for the handler location
        uint8_t *mask;
        uint8_t *hand;
        
        hand = (uint8_t*) &handler;

        mask = (uint8_t *)trap_gate;
        memcpy(&(mask[0]),&(hand[0]),2); // offset low
        memcpy(&(mask[6]),&(hand[2]),2); // offset med
        memcpy(&(mask[8]),&(hand[4]),4); // offset high

        mask = (uint8_t *)int_gate;
        memcpy(&(mask[0]),&(hand[0]),2); // offset low
        memcpy(&(mask[6]),&(hand[2]),2); // offset med
        memcpy(&(mask[8]),&(hand[4]),4); // offset high

        PrintDebug(vm,VCORE_NONE,"hvm: Adding default null trap and int gates\n");
    }

    // exception vectors (0-31) are trap gates
    for (i=0;i<32;i++) { 
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)trap_gate);
    }

    // everything else is an interrupt gate
    for (i=32;i<256;i++) { 
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)int_gate);
    }

    PrintDebug(vm,VCORE_NONE,"hvm: wrote IDT at %p\n",base);
}
1099
1100
1101
// The GDT occupies the third page from the top of guest memory; only
// three 8-byte descriptors (null/code/data) are written.
static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 3 * PAGE_SIZE);
    *limit = 8*3;
}
1107
// Fixed long-mode GDT installed for the HRT by write_gdt().
static uint64_t gdt64[3] = {
    0x0000000000000000, /* null */
    0x00a09a0000000000, /* code (note lme bit) */
    0x00a0920000000000, /* data (most entries don't matter) */
};
1113
1114 static void write_gdt(struct v3_vm_info *vm)
1115 {
1116     void *base;
1117     uint64_t limit;
1118
1119     get_gdt_loc(vm,&base,&limit);
1120     v3_write_gpa_memory(&vm->cores[0],(addr_t)base,limit,(uint8_t*) gdt64);
1121
1122     PrintDebug(vm,VCORE_NONE,"hvm: wrote GDT at %p\n",base);
1123 }
1124
1125
1126
// The TSS occupies the fourth page from the top of guest memory.
static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    *base = (void*)PAGE_ADDR(boot_state_end_addr(vm) - 4 * PAGE_SIZE);
    *limit = PAGE_SIZE;
}
1132
1133 static void write_tss(struct v3_vm_info *vm)
1134 {
1135     void *base;
1136     uint64_t limit;
1137
1138     get_tss_loc(vm,&base,&limit);
1139
1140     v3_set_gpa_memory(&vm->cores[0],(addr_t)base,limit,0);
1141
1142     PrintDebug(vm,VCORE_NONE,"hvm: wrote TSS at %p\n",base);
1143 }
1144
1145
1146 #define TOP_HALF_START  0xffff800000000000ULL
1147 #define BOTTOM_HALF_END 0x00007fffffffffffULL
1148
1149
1150 #define L4_UNIT PAGE_SIZE
1151 #define L3_UNIT (512ULL * L4_UNIT)
1152 #define L2_UNIT (512ULL * L3_UNIT)
1153 #define L1_UNIT (512ULL * L2_UNIT)
1154
1155 static void compute_pts_4KB(struct v3_vm_info *vm, 
1156                             uint64_t *l1, uint64_t *l2, uint64_t *l3, uint64_t *l4)    
1157 {
1158
1159     // we map the physical memory up to max_mem_mapped either at 0x0 or at TOP_HALF start
1160     // that is, it either fills in the first 256 rows of PML4 or the last 256 rows of PML4
1161     // so it is the same number of page tables regardless
1162
1163     uint64_t max_gva = vm->hvm_state.max_mem_mapped;
1164
1165     *l1 = 1;  // 1 PML4
1166     *l2 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*512ULL*4096ULL),512);
1167     *l3 = CEIL_DIV(CEIL_DIV(max_gva,512ULL*4096ULL),512);
1168     *l4 = CEIL_DIV(CEIL_DIV(max_gva,4096ULL),512);
1169 }
1170
1171
1172
1173 /*
1174   PTS MAP using 1 GB pages
1175   n second levels pts, highest gva, highest address
1176   1 top level
1177
1178
1179 OR
1180   
1181   PTS MAP using 2 MB pages
1182   n third level pts, highest gva, highest address
1183   m second level pts, highest gva, highest address
1184   1 top level pt
1185
1186 OR
1187
1188   PTS MAP using 4 KB pages
1189   n 4th level, highest gva, highest address
  m 3rd level, highest gva, highest address
1191   l second level, highest gva, highest address
1192   1 top level pt
1193
1194 OR
1195   PTS MAP using 512 GB pages when this becomes available
1196
1197 */
1198
1199
1200 static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
1201 {
1202     uint64_t l1,l2,l3,l4;
1203     uint64_t num_pt;
1204
1205     compute_pts_4KB(vm,&l1,&l2,&l3,&l4);
1206
1207     if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { 
1208         num_pt = l1;
1209     } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { 
1210         num_pt = l1 + l2;
1211     } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
1212         num_pt = l1 + l2 + l3;
1213     } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { 
1214         num_pt = l1 + l2 + l3 + l4;
1215     } else {
1216         PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT location flags=0x%llx memsize=0x%llx\n",vm->hvm_state.hrt_flags,(uint64_t)vm->mem_size);
1217         return;
1218     }
1219
1220     *base = (void*)PAGE_ADDR(boot_state_end_addr(vm)-(4+num_pt)*PAGE_SIZE);
1221     *limit = num_pt*PAGE_SIZE;
1222 }
1223
// Construct the HRT's initial page tables in guest memory, identity-
// mapping GPAs [0, max_mem_mapped) at GVA gva_offset (either 0x0 or
// TOP_HALF_START).  The paging depth (1..4 levels) is selected by the
// MAP_* flag negotiated in configure_hrt(); at the deepest level the
// entries are large pages (or 4 KB pages for a full 4-level walk).
// Tables were placed contiguously by get_pt_loc(): one PML4, then all
// PDPTs, then all PDs, then all PTs.
static void write_pts(struct v3_vm_info *vm)
{
    uint64_t size;
    uint64_t num_l1, num_l2, num_l3, num_l4;
    void *start_l1, *start_l2, *start_l3, *start_l4;
    uint64_t max_level;
    void *cur_pt;
    void *cur_gva;
    void *cur_gpa;
    void *min_gpa = 0;
    void *max_gpa = (void*) vm->hvm_state.max_mem_mapped;
    void *min_gva = (void*) vm->hvm_state.gva_offset;
#ifdef V3_CONFIG_DEBUG_HVM
    void *max_gva = min_gva+vm->hvm_state.max_mem_mapped;
#endif
    uint64_t i, pt;
    uint64_t i_start,i_end;
    
    struct pml4e64 *pml4e;
    struct pdpe64 *pdpe;
    struct pde64 *pde;
    struct pte64 *pte;

    // translate the negotiated paging model into a walk depth
    if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { 
        PrintError(vm,VCORE_NONE,"hvm: Attempt to build 512 GB pages\n");
        max_level = 1;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { 
        max_level = 2;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_2MB) {
        max_level = 3;
    } else if (vm->hvm_state.hrt_flags & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { 
        max_level = 4;
    } else {
        PrintError(vm,VCORE_NONE,"hvm: Cannot determine PT levels\n");
        return;
    }

    get_pt_loc(vm,&start_l1,&size);
    compute_pts_4KB(vm,&num_l1,&num_l2,&num_l3,&num_l4);

    // tables of each level are laid out back to back
    start_l2=start_l1+PAGE_SIZE*num_l1;
    start_l3=start_l2+PAGE_SIZE*num_l2;
    start_l4=start_l3+PAGE_SIZE*num_l3;

    PrintDebug(vm,VCORE_NONE,"hvm: writing %llu levels of PTs start at address %p\n", max_level,start_l1);
    PrintDebug(vm,VCORE_NONE,"hvm: min_gva=%p, max_gva=%p, min_gpa=%p, max_gpa=%p\n",min_gva,max_gva,min_gpa,max_gpa);
    PrintDebug(vm,VCORE_NONE,"hvm: num_l1=%llu, num_l2=%llu, num_l3=%llu, num_l4=%llu\n", num_l1, num_l2, num_l3, num_l4);
    PrintDebug(vm,VCORE_NONE,"hvm: start_l1=%p, start_l2=%p, start_l3=%p, start_l4=%p\n", start_l1, start_l2, start_l3, start_l4);

    cur_pt=start_l1;

    // build PML4 (only one)
    if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pml4e)) { 
        PrintError(vm,VCORE_NONE,"hvm: Cannot translate pml4 location\n");
        return;
    }

    memset(pml4e,0,PAGE_SIZE);

    // bottom-half mappings fill PML4 slots from 0; top-half mappings
    // fill from slot 256 (the first canonical top-half entry)
    if (min_gva==0x0) { 
        i_start=0; i_end = num_l2;
    } else if (min_gva==(void*)TOP_HALF_START) { 
        i_start=256; i_end=256+num_l2;
    } else {
        PrintError(vm,VCORE_NONE,"hvm: unsupported gva offset\n");
        return;
    }

    for (i=i_start, cur_gva=min_gva, cur_gpa=min_gpa;
         (i<i_end);
         i++, cur_gva+=L1_UNIT, cur_gpa+=L1_UNIT) {

        pml4e[i].present=1;
        pml4e[i].writable=1;
        
        if (max_level==1) { 
            PrintError(vm,VCORE_NONE,"hvm: Intel has not yet defined a PML4E large page\n");
            pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
            //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
        } else {
            pml4e[i].pdp_base_addr = PAGE_BASE_ADDR((addr_t)(start_l2+(i-i_start)*PAGE_SIZE));
            //PrintDebug(vm,VCORE_NONE,"hvm: pml4: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pml4e[i].pdp_base_addr);
        }
    }

    // 512 GB only
    if (max_level==1) {
        return;
    }



    for (cur_pt=start_l2, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
         pt<num_l2;
         cur_pt+=PAGE_SIZE, pt++) { 

        // build PDPE
        if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pdpe)) { 
            PrintError(vm,VCORE_NONE,"hvm: Cannot translate pdpe location\n");
            return;
        }
        
        memset(pdpe,0,PAGE_SIZE);
        
        for (i=0; 
             i<512 && cur_gpa<max_gpa; 
             i++, cur_gva+=L2_UNIT, cur_gpa+=L2_UNIT) {

            pdpe[i].present=1;
            pdpe[i].writable=1;
        
            if (max_level==2) { 
                // terminal level: 1 GB large page mapping the GPA
                pdpe[i].large_page=1;
                pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
                //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
            } else {
                // interior level: point at the (pt*512+i)-th PD
                pdpe[i].pd_base_addr = PAGE_BASE_ADDR((addr_t)(start_l3+(pt*512+i)*PAGE_SIZE));
                //PrintDebug(vm,VCORE_NONE,"hvm: pdpe: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pdpe[i].pd_base_addr);
            }
        }
    }
        
    //1 GB only
    if (max_level==2) { 
        return;
    }

    for (cur_pt=start_l3, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
         pt<num_l3;
         cur_pt+=PAGE_SIZE, pt++) { 

        // build PDE
        if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pde)) { 
            PrintError(vm,VCORE_NONE,"hvm: Cannot translate pde location\n");
            return;
        }
        
        memset(pde,0,PAGE_SIZE);
        
        for (i=0; 
             i<512 && cur_gpa<max_gpa; 
             i++, cur_gva+=L3_UNIT, cur_gpa+=L3_UNIT) {

            pde[i].present=1;
            pde[i].writable=1;
        
            if (max_level==3) { 
                // terminal level: 2 MB large page mapping the GPA
                pde[i].large_page=1;
                pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
                //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t) pde[i].pt_base_addr);
            } else {
                // interior level: point at the (pt*512+i)-th PT
                pde[i].pt_base_addr = PAGE_BASE_ADDR((addr_t)(start_l4+(pt*512+i)*PAGE_SIZE));
                //PrintDebug(vm,VCORE_NONE,"hvm: pde: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pde[i].pt_base_addr);
            }
        }
    }

    //2 MB only
    if (max_level==3) { 
        return;
    }


    // 4 KB
    for (cur_pt=start_l4, pt=0, cur_gpa=min_gpa, cur_gva=min_gva;
         pt<num_l4;
         cur_pt+=PAGE_SIZE, pt++) { 

        // build PTE
        if (v3_gpa_to_hva(&vm->cores[0],(addr_t)cur_pt,(addr_t*)&pte)) { 
            PrintError(vm,VCORE_NONE,"hvm: Cannot translate pte location\n");
            return;
        }
        
        memset(pte,0,PAGE_SIZE);
        
        for (i=0; 
             i<512 && cur_gpa<max_gpa; 
             i++, cur_gva+=L4_UNIT, cur_gpa+=L4_UNIT) {

            pte[i].present=1;
            pte[i].writable=1;
            pte[i].page_base_addr = PAGE_BASE_ADDR((addr_t)(cur_gpa));
            //PrintDebug(vm,VCORE_NONE,"hvm: pte: gva %p to frame 0%llx\n", cur_gva, (uint64_t)pte[i].page_base_addr);
        }
    }

    return;
}
1413
1414
// The multiboot info page sits in the single page immediately below
// the page-table block.
static void get_mb_info_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    
    get_pt_loc(vm,base, limit);
    *base-=PAGE_SIZE;
    *limit=PAGE_SIZE;
}
1422
1423
1424 int v3_build_hrt_multiboot_tag(struct guest_info *core, mb_info_hrt_t *hrt)
1425 {
1426     struct v3_vm_info *vm = core->vm_info;
1427
1428     hrt->tag.type = MB_INFO_HRT_TAG;
1429     hrt->tag.size = sizeof(mb_info_hrt_t);
1430
1431     hrt->total_num_apics = vm->num_cores;
1432     hrt->first_hrt_apic_id = vm->hvm_state.first_hrt_core;
1433     hrt->have_hrt_ioapic=0;
1434     hrt->first_hrt_ioapic_entry=0;
1435
1436     hrt->cpu_freq_khz = V3_CPU_KHZ();
1437
1438     hrt->hrt_flags = vm->hvm_state.hrt_flags;
1439     hrt->max_mem_mapped = vm->hvm_state.max_mem_mapped;
1440     hrt->first_hrt_gpa = vm->hvm_state.first_hrt_gpa;
1441     hrt->gva_offset = vm->hvm_state.gva_offset;
1442     hrt->comm_page_gpa = vm->hvm_state.comm_page_gpa;
1443     hrt->hrt_int_vector = vm->hvm_state.hrt_int_vector;
1444     
1445     return 0;
1446 }
1447
1448 static void write_mb_info(struct v3_vm_info *vm) 
1449 {
1450     if (vm->hvm_state.hrt_type!=HRT_MBOOT64) { 
1451         PrintError(vm, VCORE_NONE,"hvm: Cannot handle this HRT type\n");
1452         return;
1453     } else {
1454         uint8_t buf[256];
1455         uint64_t size;
1456         void *base;
1457         uint64_t limit;
1458
1459         get_mb_info_loc(vm,&base,&limit);
1460         
1461         if ((size=v3_build_multiboot_table(&vm->cores[vm->hvm_state.first_hrt_core],buf,256))==-1) { 
1462             PrintError(vm,VCORE_NONE,"hvm: Failed to build MB info\n");
1463             return;
1464         }
1465
1466         if (size>limit) { 
1467             PrintError(vm,VCORE_NONE,"hvm: MB info is too large\n");
1468             return;
1469         }
1470         
1471         v3_write_gpa_memory(&vm->cores[vm->hvm_state.first_hrt_core],
1472                             (addr_t)base,
1473                             size,
1474                             buf);
1475
1476         PrintDebug(vm,VCORE_NONE, "hvm: wrote MB info at %p\n", base);
1477     }
1478 }
1479
1480 #define SCRATCH_STACK_SIZE 4096
1481
1482
// Compute the region available for the HRT image itself: from the
// first HRT GPA up to the bottom of the per-HRT-core scratch stacks
// (which sit below the multiboot info page).
static void get_hrt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
    void *mb_base;
    uint64_t mb_limit;
    
    get_mb_info_loc(vm,&mb_base,&mb_limit);
    
    // one scratch stack per HRT core, growing down from the MB info page
    mb_base-=SCRATCH_STACK_SIZE*v3_get_hvm_hrt_cores(vm);

    *base = (void*)PAGE_ADDR(vm->hvm_state.first_hrt_gpa);

    // NOTE(review): on collision this only warns and continues; the
    // limit below then underflows (mb_base < *base).  Confirm callers
    // can tolerate this before relying on the error path.
    if (mb_base < *base+PAGE_SIZE) { 
        PrintError(vm,VCORE_NONE,"hvm: HRT stack colides with HRT\n");
    }

    *limit = mb_base - *base;
}
1500
1501
1502 #define ERROR(fmt, args...) PrintError(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
1503 #define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
1504
1505 #define ELF_MAGIC    0x464c457f
1506 #define MB2_MAGIC    0xe85250d6
1507
1508 #define MB2_INFO_MAGIC    0x36d76289
1509
1510 static int is_elf(uint8_t *data, uint64_t size)
1511 {
1512     if (*((uint32_t*)data)==ELF_MAGIC) {
1513         return 1;
1514     } else { 
1515         return 0;
1516     }
1517 }
1518
1519 static mb_header_t *find_mb_header(uint8_t *data, uint64_t size)
1520 {
1521     uint64_t limit = size > 32768 ? 32768 : size;
1522     uint64_t i;
1523
1524     // Scan for the .boot magic cookie
1525     // must be in first 32K, assume 4 byte aligned
1526     for (i=0;i<limit;i+=4) { 
1527         if (*((uint32_t*)&data[i])==MB2_MAGIC) {
1528             INFO("Found multiboot header at offset 0x%llx\n",i);
1529             return (mb_header_t *) &data[i];
1530         }
1531     }
1532     return 0;
1533 }
1534
1535
1536 static int configure_hrt(struct v3_vm_info *vm, mb_data_t *mb)
1537 {
1538     struct v3_vm_hvm *h = &vm->hvm_state;
1539     uint64_t f = mb->mb64_hrt->hrt_flags;
1540     uint64_t maxmap = mb->mb64_hrt->max_mem_to_map;
1541     uint64_t gvaoff = mb->mb64_hrt->gva_offset;
1542     uint64_t gvaentry = mb->mb64_hrt->gva_entry;
1543     uint64_t commgpa = mb->mb64_hrt->comm_page_gpa;
1544     uint8_t  vec = mb->mb64_hrt->hrt_int_vector;
1545     
1546
1547     PrintDebug(vm,VCORE_NONE,"hvm: HRT request: flags=0x%llx max_map=0x%llx gva_off=%llx gva_entry=%llx comm_page=0x%llx vector=0x%x\n",
1548                f, maxmap, gvaoff,gvaentry,commgpa, vec);
1549
1550     if (maxmap<0x100000000ULL) { 
1551         PrintDebug(vm,VCORE_NONE,"hvm: revising request up to 4 GB max map\n");
1552         maxmap=0x100000000ULL;
1553     }
1554
1555     if (f & MB_TAG_MB64_HRT_FLAG_MAP_512GB) { 
1556         PrintError(vm,VCORE_NONE,"hvm: support for 512 GB pages is not yet available in hardware\n");
1557         return -1;
1558     } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_1GB) { 
1559         f &= ~0x3c;
1560         f |= MB_TAG_MB64_HRT_FLAG_MAP_1GB;
1561         h->max_mem_mapped = maxmap;
1562         PrintDebug(vm,VCORE_NONE,"hvm: 1 GB pages selected\n");
1563     } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_2MB) { 
1564         f &= ~0x3c;
1565         f |= MB_TAG_MB64_HRT_FLAG_MAP_2MB;
1566         h->max_mem_mapped = maxmap;
1567         PrintDebug(vm,VCORE_NONE,"hvm: 2 MB pages selected\n");
1568     } else if (f & MB_TAG_MB64_HRT_FLAG_MAP_4KB) { 
1569         f &= ~0x3c;
1570         f |= MB_TAG_MB64_HRT_FLAG_MAP_4KB;
1571         h->max_mem_mapped = maxmap;
1572         PrintDebug(vm,VCORE_NONE,"hvm: 4 KB pages selected\n");
1573     } else {
1574         PrintError(vm,VCORE_NONE,"hvm: no page table model is requested\n");
1575         return -1;
1576     }
1577
1578     if (f & MB_TAG_MB64_HRT_FLAG_RELOC) {
1579         PrintError(vm,VCORE_NONE,"hvm: relocatable hrt not currently supported\n");
1580         return -1;
1581     }
1582
1583     h->hrt_flags = f;
1584
1585     if (maxmap>h->max_mem_mapped) { 
1586         PrintError(vm,VCORE_NONE,"hvm: requested 0x%llx bytes mapped, which is more than currently supported\n",maxmap);
1587         return -1;
1588     }
1589
1590     if (gvaoff!=0 && gvaoff!=TOP_HALF_START) { 
1591         PrintError(vm,VCORE_NONE,"hvm: currently only GVA offsets of 0 and %llx are supported\n", TOP_HALF_START);
1592         return -1;
1593     }
1594     
1595     h->gva_offset = gvaoff;
1596
1597     h->gva_entry = gvaentry;
1598
1599     if (mb->addr->load_addr < h->first_hrt_gpa) { 
1600         PrintError(vm,VCORE_NONE,"hvm: load start address of HRT is below first HRT GPA\n");
1601         return -1;
1602     }
1603     
1604     if (mb->addr->bss_end_addr > (vm->mem_size-(1024*1024*64))) {
1605         PrintError(vm,VCORE_NONE,"hvm: bss end address of HRT above last allowed GPA\n");
1606         return -1;
1607     }
1608     
1609     if (vec<32) { 
1610         PrintError(vm,VCORE_NONE,"hvm: cannot support vector %x\n",vec);
1611         return -1;
1612     }
1613     
1614     h->hrt_int_vector = vec;
1615     
1616     
1617     if (commgpa < vm->mem_size) { 
1618         PrintError(vm,VCORE_NONE,"hvm: cannot map comm page over physical memory\n");
1619         return -1;
1620     } 
1621
1622     h->comm_page_gpa = commgpa;
1623
1624     if (!h->comm_page_hpa) { 
1625         if (!(h->comm_page_hpa=V3_AllocPages(1))) { 
1626             PrintError(vm,VCORE_NONE,"hvm: unable to allocate space for comm page\n");
1627             return -1;
1628         }
1629
1630         h->comm_page_hva = V3_VAddr(h->comm_page_hpa);
1631         
1632         memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
1633         
1634         if (v3_add_shadow_mem(vm,-1,h->comm_page_gpa,h->comm_page_gpa+PAGE_SIZE_4KB,(addr_t)h->comm_page_hpa)) { 
1635             PrintError(vm,VCORE_NONE,"hvm: unable to map communication page\n");
1636             V3_FreePages((void*)(h->comm_page_gpa),1);
1637             return -1;
1638         }
1639         
1640         
1641         PrintDebug(vm,VCORE_NONE,"hvm: added comm page for first time\n");
1642     }
1643
1644     memset(h->comm_page_hva,0,PAGE_SIZE_4KB);
1645     
1646     
1647     PrintDebug(vm,VCORE_NONE,"hvm: HRT configuration: flags=0x%llx max_mem_mapped=0x%llx gva_offset=0x%llx gva_entry=0x%llx comm_page=0x%llx vector=0x%x\n",
1648                h->hrt_flags,h->max_mem_mapped, h->gva_offset,h->gva_entry, h->comm_page_gpa, h->hrt_int_vector);
1649     
1650     return 0;
1651
1652 }
1653
// Parse a multiboot HRT image, negotiate its configuration, and load
// it into the HRT portion of guest memory.  Records the entry address
// (GVA entry if the HRT supplied one, otherwise the multiboot entry
// shifted by the GVA offset) and marks the HRT type as HRT_MBOOT64.
// Returns 0 on success, -1 on any parse/config/load failure.
static int setup_mb_kernel_hrt(struct v3_vm_info *vm, void *data, uint64_t size)
{
    mb_data_t mb;

    if (v3_parse_multiboot_header(data, size, &mb)) { 
        PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n");
        return -1;
    }

    // the MB64_HRT tag carries the HRT's configuration request
    if (!mb.mb64_hrt) { 
        PrintError(vm,VCORE_NONE,"hvm: invalid HRT - there is no MB64_HRT tag\n");
        return -1;
    }

    if (configure_hrt(vm,&mb)) {
        PrintError(vm,VCORE_NONE, "hvm: cannot configure HRT\n");
        return -1;
    }
    
    if (v3_write_multiboot_kernel(vm,&mb,data,size,
                                  (void*)vm->hvm_state.first_hrt_gpa,
                                  vm->mem_size-vm->hvm_state.first_hrt_gpa)) {
        PrintError(vm,VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n");
        return -1;
    }

    // an explicit GVA entry from the HRT overrides the multiboot entry
    if (vm->hvm_state.gva_entry) { 
        vm->hvm_state.hrt_entry_addr = vm->hvm_state.gva_entry;
    } else {
        vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + vm->hvm_state.gva_offset;
    }

    vm->hvm_state.hrt_type = HRT_MBOOT64;

    return 0;

}
1691
1692
1693 static int setup_hrt(struct v3_vm_info *vm)
1694 {
1695     void *data;
1696     uint64_t size;
1697
1698     // If the ROS has installed an image, it takes priority
1699     if (vm->hvm_state.hrt_image) { 
1700         data = vm->hvm_state.hrt_image;
1701         size = vm->hvm_state.hrt_image_size;
1702     } else {
1703         data = vm->hvm_state.hrt_file->data;
1704         size = vm->hvm_state.hrt_file->size;
1705     }
1706         
1707     if (is_elf(data,size) &&
1708         find_mb_header(data,size)) {
1709
1710         PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n");
1711         if (setup_mb_kernel_hrt(vm,data,size)) { 
1712             PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n");
1713             return -1;
1714         } 
1715     } else {
1716         PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not a multiboot kernel\n");
1717         return -1;
1718     }
1719
1720     return 0;
1721 }
1722
1723
1724         
1725
1726 /*
1727   GPA layout:
1728
1729   HRT
1730   ---
1731   ROS
1732
1733   We do not touch the ROS portion of the address space.
1734   The HRT portion looks like:
1735
1736   INT_HANDLER (1 page - page aligned)
1737   IDT (1 page - page aligned)
1738   GDT (1 page - page aligned)
  TSS (1 page - page aligned)
  PAGETABLES  (identity map of first N GB)
1741      ROOT PT first (lowest memory addr), followed by 2nd level PTs in order,
1742      followed by 3rd level PTs in order, followed by 4th level
1743      PTs in order.  
1744   MBINFO_PAGE
1745   SCRATCH_STACK_HRT_CORE0 
1746   SCRATCH_STACK_HRT_CORE1
1747   ..
1748   SCRATCH_STACK_HRT_COREN
1749   ...
1750   HRT (as many pages as needed, page-aligned, starting at first HRT address)
1751   ---
1752   ROS
1753
1754
1755 */
1756
1757
/*
 * Build the HRT portion of guest physical memory prior to boot:
 * place the HRT kernel, then write the stub structures whose
 * locations are derived from the HRT placement (see layout
 * comment above).
 *
 * Returns 0 on success (including the non-HVM no-op case),
 * -1 on failure.
 */
int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm)
{
    if (!vm->hvm_state.is_hvm) { 
        PrintDebug(vm,VCORE_NONE,"hvm: skipping HVM setup for boot as this is not an HVM\n");
        return 0;
    }

    PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory begins\n");

    // load the HRT kernel image and place it in the HRT region
    if (setup_hrt(vm)) {
        PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n");
        return -1;
    } 

    // the locations of all the other items are determined by
    // the HRT setup, so these must happen after

    write_null_int_handler(vm);   // stub interrupt handler page
    write_idt(vm);                // stub IDT
    write_gdt(vm);                // stub GDT
    write_tss(vm);                // stub TSS

    write_pts(vm);                // identity-map page tables

    // this must happen last
    write_mb_info(vm);

    PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory done\n");

    return 0;
}
1789
1790 /*
1791   On entry for every core:
1792
1793    IDTR points to stub IDT
1794    GDTR points to stub GDT
   TR   points to stub TSS
1796    CR3 points to root page table
1797    CR0 has PE, PG, and WP
1798    EFER has LME AND LMA (and NX for compatibility with Linux)
1799    RSP is TOS of core's scratch stack (looks like a call)
1800
1801    RAX = MB magic cookie
1802    RBX = address of multiboot info table
1803    RCX = this core id / apic id (0..N-1)
1804    RDX = this core id - first HRT core ID (==0 for the first HRT core)
1805
1806    All addresses are virtual addresses, offset as needed by gva_offset
1807
1808    Other regs are zeroed
1809
1810    shadow/nested paging state reset for long mode
1811
1812 */
/*
 * Initialize an HRT core's architectural state so that it boots
 * directly into the HRT entry point in 64-bit long mode with paging
 * enabled (see the entry-contract comment above for the register
 * conventions the HRT sees).
 *
 * Returns 0 on success (including the non-HRT-core no-op case),
 * -1 on failure (shadow paging guests are not supported).
 */
int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
{
    void *base;
    uint64_t limit;
    uint64_t gva_offset;

    // timestamp the start of this boot for later reporting
    rdtscll(core->hvm_state.last_boot_start);
    

    if (!core->hvm_state.is_hrt) { 
        PrintDebug(core->vm_info,core,"hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id);
        return 0;
    }


    PrintDebug(core->vm_info, core, "hvm: setting up HRT core (%u) for boot\n", core->vcpu_id);

    gva_offset = core->vm_info->hvm_state.gva_offset;
    
    // start from a clean slate: zero all register and segment state
    memset(&core->vm_regs,0,sizeof(core->vm_regs));
    memset(&core->ctrl_regs,0,sizeof(core->ctrl_regs));
    memset(&core->dbg_regs,0,sizeof(core->dbg_regs));
    memset(&core->segments,0,sizeof(core->segments));    
    memset(&core->msrs,0,sizeof(core->msrs));    
    memset(&core->fp_state,0,sizeof(core->fp_state));    

    // We are in long mode with virtual memory and we want
    // to start immediately
    core->cpl = 0; // we are going right into the kernel
    core->cpu_mode = LONG;
    core->mem_mode = VIRTUAL_MEM; 
    core->core_run_state = CORE_RUNNING ;


    // magic
    core->vm_regs.rax = MB2_INFO_MAGIC;

    // multiboot info pointer (a GVA: the GPA plus gva_offset)
    get_mb_info_loc(core->vm_info, &base,&limit);
    core->vm_regs.rbx = (uint64_t) base + gva_offset;  

    // core number
    core->vm_regs.rcx = core->vcpu_id;
    
    // HRT core number (0 for the first HRT core)
    core->vm_regs.rdx = core->vcpu_id - core->vm_info->hvm_state.first_hrt_core;

    // Now point to scratch stack for this core
    // it begins at an offset relative to the MB info page
    get_mb_info_loc(core->vm_info, &base,&limit);
    base = base + gva_offset;
    base -= core->vm_regs.rdx * SCRATCH_STACK_SIZE;
    core->vm_regs.rsp = (v3_reg_t) base;  
    core->vm_regs.rbp = (v3_reg_t) base-8; 

    // push onto the stack a bad rbp and bad return address
    // (0xff bytes) so a backtrace terminates recognizably
    core->vm_regs.rsp-=16;
    v3_set_gpa_memory(core,
                      core->vm_regs.rsp-gva_offset,
                      16,
                      0xff);


    // HRT entry point: an explicit gva_entry overrides the
    // entry address recorded during HRT setup
    get_hrt_loc(core->vm_info, &base,&limit);
    if (core->vm_info->hvm_state.gva_entry) { 
      core->rip = core->vm_info->hvm_state.gva_entry;
    } else {
      core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr + gva_offset; 
    }
      


    PrintDebug(core->vm_info,core,"hvm: hrt core %u has rip=%p, rsp=%p, rbp=%p, rax=%p, rbx=%p, rcx=%p, rdx=%p\n",
               (core->vcpu_id - core->vm_info->hvm_state.first_hrt_core),
               (void*)(core->rip),
               (void*)(core->vm_regs.rsp),
               (void*)(core->vm_regs.rbp),
               (void*)(core->vm_regs.rax),
               (void*)(core->vm_regs.rbx),
               (void*)(core->vm_regs.rcx),
               (void*)(core->vm_regs.rdx));

    // Setup CRs for long mode and our stub page table
    // CR0: PG, PE, and WP for catching COW faults in kernel-mode (which is not default behavior)
    core->ctrl_regs.cr0 = 0x80010001;
    core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0;

    // CR2: don't care (output from #PF)
    // CR3: set to our PML4E, without setting PCD or PWT
    get_pt_loc(core->vm_info, &base,&limit);
    core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base);  // not offset as this is a GPA
    core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3;

    // CR4: PGE, PAE, PSE (last byte: 1 0 1 1 0 0 0 0)
    core->ctrl_regs.cr4 = 0xb0;
    core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4;
    // CR8 as usual
    // RFLAGS zeroed is fine: come in with interrupts off
    // EFER needs SVME LMA LME (last 16 bits: 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0)
    core->ctrl_regs.efer = 0x1d00;
    core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer;


    /* 
       Notes on selectors:

       selector is 13 bits of index, 1 bit table indicator 
       (0=>GDT), 2 bit RPL
       
       index is scaled by 8, even in long mode, where some entries 
       are 16 bytes long.... 
          -> code, data descriptors have 8 byte format
             because base, limit, etc, are ignored (no segmentation)
          -> interrupt/trap gates have 16 byte format 
             because offset needs to be 64 bits
    */
    
    // Install our stub IDT
    get_idt_loc(core->vm_info, &base,&limit);
    base += gva_offset;
    core->segments.idtr.selector = 0;  // entry 0 (NULL) of the GDT
    core->segments.idtr.base = (addr_t) base;  // only base+limit are used
    core->segments.idtr.limit = limit-1;
    core->segments.idtr.type = 0x0;
    core->segments.idtr.system = 0; 
    core->segments.idtr.dpl = 0;
    core->segments.idtr.present = 0;
    core->segments.idtr.long_mode = 0;

    // Install our stub GDT
    get_gdt_loc(core->vm_info, &base,&limit);
    base += gva_offset;
    core->segments.gdtr.selector = 0;  // entry 0 (NULL) of the GDT
    core->segments.gdtr.base = (addr_t) base;
    core->segments.gdtr.limit = limit-1;   // only base+limit are used
    core->segments.gdtr.type = 0x0;
    core->segments.gdtr.system = 0; 
    core->segments.gdtr.dpl = 0;
    core->segments.gdtr.present = 0;
    core->segments.gdtr.long_mode = 0;
    
    // And our TSS
    get_tss_loc(core->vm_info, &base,&limit);
    base += gva_offset;  
    core->segments.tr.selector = 0;
    core->segments.tr.base = (addr_t) base;
    core->segments.tr.limit = limit-1;
    core->segments.tr.type = 0x9;
    core->segments.tr.system = 0;   // available 64 bit TSS 
    core->segments.tr.dpl = 0;
    core->segments.tr.present = 1;
    core->segments.tr.long_mode = 0; // not used
    
    base = 0x0; // these are not offset as we want to make all gvas visible
    limit = -1;

    // And CS
    core->segments.cs.selector = 0x8 ; // entry 1 of GDT (RPL=0)
    core->segments.cs.base = (addr_t) base;   // not used
    core->segments.cs.limit = limit;          // not used
    core->segments.cs.type = 0xe;             // only C is used
    core->segments.cs.system = 1;             // not a system segment
    core->segments.cs.dpl = 0;                       
    core->segments.cs.present = 1;
    core->segments.cs.long_mode = 1;

    // DS, SS, etc are identical
    core->segments.ds.selector = 0x10; // entry 2 of GDT (RPL=0)
    core->segments.ds.base = (addr_t) base;
    core->segments.ds.limit = limit;
    core->segments.ds.type = 0x6;            // ignored
    core->segments.ds.system = 1;            // not a system segment
    core->segments.ds.dpl = 0;
    core->segments.ds.present = 1;
    core->segments.ds.long_mode = 1;
    
    memcpy(&core->segments.ss,&core->segments.ds,sizeof(core->segments.ds));
    memcpy(&core->segments.es,&core->segments.ds,sizeof(core->segments.ds));
    memcpy(&core->segments.fs,&core->segments.ds,sizeof(core->segments.ds));
    memcpy(&core->segments.gs,&core->segments.ds,sizeof(core->segments.ds));
    

    // reset paging here for shadow... 

    // only nested paging is supported for HRT cores
    if (core->shdw_pg_mode != NESTED_PAGING) { 
        PrintError(core->vm_info, core, "hvm: shadow paging guest... this will end badly\n");
        return -1;
    }


    return 0;
}
2006
/*
 * Handle a reset request on an HVM core.
 *
 * For HRT cores: all HRT cores rendezvous at the reset barrier, the
 * first HRT core (the leader) rebuilds the HVM memory image, every
 * HRT core then resets its own boot state, and all rendezvous again
 * before resuming.
 *
 * Returns 0 when no HVM-specific handling applies (core not
 * resetting, VM not an HVM, or a ROS core), 1 on a completed HRT
 * reset, and a negative value on failure.
 */
int v3_handle_hvm_reset(struct guest_info *core)
{

    if (core->core_run_state != CORE_RESETTING) { 
        return 0;
    }

    if (!core->vm_info->hvm_state.is_hvm) { 
        return 0;
    }

    if (v3_is_hvm_hrt_core(core)) { 
        // this is an HRT reset
        int rc=0;

        // wait for all the HRT cores
        v3_counting_barrier(&core->vm_info->reset_barrier);

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) { 
            // I am leader
            core->vm_info->run_state = VM_RESETTING;
        }

        core->core_run_state = CORE_RESETTING;

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
            // leader rebuilds the HVM memory image;
            // we really only need to clear the bss
            // and recopy the .data, but for now we'll just
            // do everything
            rc |= v3_setup_hvm_vm_for_boot(core->vm_info);

            if (rc) { 
                PrintError(core->vm_info,core,"hvm: failed to setup HVM VM for boot rc=%d\n",rc);
            }
        }

        // now everyone is ready to reset
        // each core reinitializes its own boot state
        rc |= v3_setup_hvm_hrt_core_for_boot(core);

        if (rc) { 
            PrintError(core->vm_info,core,"hvm: failed to setup HVM core for boot rc=%d\n",rc);
        }

        core->core_run_state = CORE_RUNNING;

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) { 
            // leader declares the VM running again and idles the HRT
            core->vm_info->run_state = VM_RUNNING;
            core->vm_info->hvm_state.trans_state = HRT_IDLE;
        }

        // wait for every HRT core to finish before any resumes
        v3_counting_barrier(&core->vm_info->reset_barrier);

        if (rc<0) { 
            PrintError(core->vm_info,core,"hvm: reset failed\n");
            return rc;
        } else {
            return 1;
        }

    } else { 
        // ROS core will be handled by normal reset functionality
        return 0;
    }
}
2072
/*
 * On VM entry for a ROS core, attempt to deliver a pending ROS signal
 * by simulating a user-level interrupt: push an interrupt-style frame
 * (signal code, return rip/cs/rflags/rsp/ss) onto the registered user
 * stack and redirect execution to the registered handler.
 *
 * Delivery only occurs when a handler, stack, and process (cr3) are
 * installed, the target process is the one currently active, and the
 * core is in user mode (cpl 3); otherwise the signal stays pending.
 *
 * Returns 0 on success or when nothing is injected; -1 if the frame
 * could not be written (that signal is then lost).
 */
int v3_handle_hvm_entry(struct guest_info *core)
{
    if (!core->vm_info->hvm_state.is_hvm        // not relevant to non-HVM
        || core->hvm_state.is_hrt              // not relevant to an HRT in an HVM
        || !core->vm_info->hvm_state.ros_signal.code) { // not relevant if there is no code to inject

        // Note that above check for code could race with a writer, but
        // if that happens, we'll simply inject at the next opportunity instead of 
        // this one (see below for atomic update)
        return 0;
    } else {
        struct v3_ros_signal *s = &core->vm_info->hvm_state.ros_signal;

        // HVM ROS
        if (! (s->handler && // handler installed
               s->cr3 &&     // process installed
               s->stack &&   // stack installed
               core->cpl == 3 &&  // user mode
               core->ctrl_regs.cr3 == s->cr3) // right process active
            ) {
            // Cannot inject at this time
            return 0;
        } else {
            // We can inject now, let's atomically see if we have something
            // and commit to doing it if we do
            uint64_t code;

            // Get code, reset to allow next one
            code = __sync_fetch_and_and(&(s->code), 0);

            if (!code) { 
                // nothing to do after all
                return 0;
            } else {

                // actually do inject

                uint64_t rsp;
                uint64_t frame[6];
                
                PrintDebug(core->vm_info,core,"hvm: ROS interrupt starting with rip=%p rsp=%p\n", (void*) core->rip, (void*) core->vm_regs.rsp);
                // build interrupt frame
                frame[0] = code;
                frame[1] = core->rip;
                frame[2] = core->segments.cs.selector; // return cs
                frame[3] = core->ctrl_regs.rflags;
                frame[4] = core->vm_regs.rsp;
                frame[5] = core->segments.ss.selector; // return ss
                
                rsp = (s->stack - 16) & (~0xf); // We should be 16 byte aligned to start
                rsp -= sizeof(frame);
                

                if (v3_write_gva_memory(core,(addr_t)rsp,sizeof(frame),(uint8_t*)frame)!=sizeof(frame)) { 
                    PrintError(core->vm_info,core,"hvm: failed to write interrupt frame\n");
                    // we just lost this inject
                    return -1;
                }
                
                // now make us look like we are jumping to the entry
                core->rip = s->handler;
                core->vm_regs.rsp = rsp;

                PrintDebug(core->vm_info,core,"hvm: ROS frame is 0x%llx|0x%llx|0x%llx|0x%llx|0x%llx|0x%llx and and on entry rip=%p and rsp=%p\n", frame[0],frame[1],frame[2],frame[3],frame[4],frame[5],(void*) core->rip, (void*) core->vm_regs.rsp);
                
                // and we should be good to go
                return 0;
            } 
        }
    }
}
2144
/* HVM-specific VM-exit hook; no HVM exit-path work is needed at present. */
int v3_handle_hvm_exit(struct guest_info *core)
{
    (void)core; // unused until HVM exit handling is required
    return 0;
}
2150
2151
2152 int v3_hvm_signal_ros(struct v3_vm_info *vm, uint64_t code)
2153 {
2154     struct v3_ros_signal *s = &vm->hvm_state.ros_signal;
2155
2156     if (!code) { 
2157         PrintError(vm,VCORE_NONE,"hvm: cannot signal ros with code zero\n");
2158         return -1;
2159     }
2160
2161     // handler, etc, must exist
2162     if (!s->handler || !s->stack) { 
2163         PrintError(vm,VCORE_NONE,"hvm: cannot signal ros with no installed handler\n");
2164         return -1;
2165     } else {
2166         // we set the code only if we are idle (code 0), 
2167         // and we do so only 
2168         if (!__sync_bool_compare_and_swap(&(s->code), 0, code)) {
2169             PrintError(vm,VCORE_NONE,"hvm: signal was already asserted\n");
2170             return -1;
2171         } else {
2172             PrintDebug(vm,VCORE_NONE,"hvm: raised signal 0x%llx to the ROS\n",code);
2173             return 0;
2174         }
2175     }
2176 }
2177
2178
2179