Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git
This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, simply execute
  cd palacios
  git checkout --track -b devel origin/devel
The other branches are similar.


VM Reset Bugfixes
[palacios.git] / palacios / src / palacios / vmm_hvm.c
1 /* 
2  * This file is part of the Palacios Virtual Machine Monitor developed
3  * by the V3VEE Project with funding from the United States National 
4  * Science Foundation and the Department of Energy.  
5  *
6  * The V3VEE Project is a joint project between Northwestern University
7  * and the University of New Mexico.  You can find out more at 
8  * http://www.v3vee.org
9  *
10  * Copyright (c) 2015, The V3VEE Project <http://www.v3vee.org> 
11  * All rights reserved.
12  *
13  * Author:  Peter Dinda <pdinda@northwestern.edu>
14  *
15  * This is free software.  You are permitted to use,
16  * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
17  */
18
19 #include <palacios/vmm_mem.h>
20 #include <palacios/vmm.h>
21 #include <palacios/vmm_util.h>
22 #include <palacios/vmm_emulator.h>
23 #include <palacios/vm_guest.h>
24 #include <palacios/vmm_debug.h>
25 #include <palacios/vmm_hypercall.h>
26
27 #include <palacios/vmm_xml.h>
28
29 #include <palacios/vm_guest_mem.h>
30
31 #include <palacios/vmm_debug.h>
32
33
34 /*
35
36   MEM     = Total size of memory in the GPA (in MB)
37   ROS_MEM = Total size of memory for the ROS (in MB) (<RAM)
38
39   GPAs [0,ROS_MEM) are what the ROS sees
40   GPAs [ROS_MEM, MEM) are HRT only
41   GPAS [0,MEM) are accessible by the HRT
42
43   CORES   = Total number of cores in VM
44   ROS_CORES = Total number of cores for the ROS
45
46   Cores [0,ROS_CORES) are what the ROS sees
47   Cores [ROS_CORES,CORES) are HRT only
48   Cores [0,CORES) are accessible by the HRT
49
50   In a Pal file:
51
52   <files> 
53     <file id="hrtelf" filename="hrtelf.o" />
54   </files>
55
56   <mem ... >RAM</mem>   (MB)  Note these are  
57   <cores count="CORES" ...>   backward compatible
58
59   <hvm enable="y">
60     <ros cores="ROS_CORES" mem="ROS_MEM" /> (MB)
61     <hrt file_id="hrtelf" /hrt>
62   </hvm>
63
64 */
65
66 #ifndef V3_CONFIG_DEBUG_HVM
67 #undef PrintDebug
68 #define PrintDebug(fmt, args...)
69 #endif
70
71
72 // if set, we will map the first 1 GB of memory using a 3 level
73 // hierarchy, for compatibility with Nautilus out of the box.
74 // Otherwise we will map the first 512 GB using a 2 level
75 // hierarchy
76 #define HVM_MAP_1G_2M 1
77
78 int v3_init_hvm()
79 {
80     PrintDebug(VM_NONE,VCORE_NONE, "hvm: init\n");
81     return 0;
82 }
83
84 int v3_deinit_hvm()
85 {
86     PrintDebug(VM_NONE,VCORE_NONE, "hvm: deinit\n");
87     return 0;
88 }
89
90
91 static int hvm_hcall_handler(struct guest_info * core , hcall_id_t hcall_id, void * priv_data)
92 {
93     uint64_t c;
94
95     rdtscll(c);
96
97
98     V3_Print(core->vm_info,core, "hvm: received hypercall %x  rax=%llx rbx=%llx rcx=%llx at cycle count %llu (%llu cycles since last boot start) num_exits=%llu since initial boot\n",
99              hcall_id, core->vm_regs.rax, core->vm_regs.rbx, core->vm_regs.rcx, c, c-core->hvm_state.last_boot_start, core->num_exits);
100     //v3_print_core_telemetry(core);
101     //    v3_print_guest_state(core);
102
103     return 0;
104 }
105
106 #define CEIL_DIV(x,y) (((x)/(y)) + !!((x)%(y)))
107
/*
  Parse the <hvm> subtree of the VM configuration (if any) and
  initialize vm->hvm_state.

  Defaults (no <hvm> block, or enable != "y"): the whole machine is
  ROS - is_hvm==0, first_hrt_core==num_cores, first_hrt_gpa==mem_size.

  An enabled HVM configuration requires:
    <ros cores="N" mem="M"/>   (M in MB)
    <hrt file_id="..."/>       (must name a registered file)

  Returns 0 on success (including the pure-ROS case), -1 on a
  malformed HVM configuration.
*/
int v3_init_hvm_vm(struct v3_vm_info *vm, struct v3_xml *config)
{
    v3_cfg_tree_t *hvm_config;
    v3_cfg_tree_t *ros_config;
    v3_cfg_tree_t *hrt_config;
    char *enable;
    char *ros_cores;
    char *ros_mem;
    char *hrt_file_id=0;

    PrintDebug(vm, VCORE_NONE, "hvm: vm init\n");

    /* 
       Defaults - all ROS
    */
    memset(&vm->hvm_state,0,sizeof(struct v3_vm_hvm));
    vm->hvm_state.is_hvm=0;
    vm->hvm_state.first_hrt_core=vm->num_cores;
    vm->hvm_state.first_hrt_gpa=vm->mem_size;

    if (!config || !(hvm_config=v3_cfg_subtree(config,"hvm"))) {
        PrintDebug(vm,VCORE_NONE,"hvm: no HVM configuration found (all HW is ROS)\n");
        goto out_ok;
    }
    
    if (!(enable=v3_cfg_val(hvm_config,"enable")) || strcasecmp(enable,"y")) {
        PrintDebug(vm,VCORE_NONE,"hvm: HVM configuration disabled (all HW is ROS)\n");
        goto out_ok;
    }

    if (!(ros_config=v3_cfg_subtree(hvm_config,"ros"))) { 
        PrintError(vm,VCORE_NONE,"hvm: HVM configuration without ROS block...\n");
        return -1;
    }
 
    if (!(ros_cores=v3_cfg_val(ros_config,"cores"))) { 
        PrintError(vm,VCORE_NONE,"hvm: ROS block without cores...\n");
        return -1;
    }
   
    // cores [0, first_hrt_core) belong to the ROS
    vm->hvm_state.first_hrt_core = ((uint32_t)atoi(ros_cores));
    
    if (!(ros_mem=v3_cfg_val(ros_config,"mem"))) { 
        PrintError(vm,VCORE_NONE,"hvm: ROS block without mem...\n");
        return -1;
    }

    // config value is in MB; GPAs [0, first_hrt_gpa) belong to the ROS
    vm->hvm_state.first_hrt_gpa = ((uint64_t)atoi(ros_mem))*1024*1024;
        
    if (!(hrt_config=v3_cfg_subtree(hvm_config,"hrt"))) { 
        PrintError(vm,VCORE_NONE,"hvm: HVM configuration without HRT block...\n");
        return -1;
    }
 
    if (!(hrt_file_id=v3_cfg_val(hrt_config,"file_id"))) { 
        PrintError(vm,VCORE_NONE,"hvm: HRT block without file_id...\n");
        return -1;
    }

    vm->hvm_state.hrt_file = v3_cfg_get_file(vm,hrt_file_id);
    
    if (!vm->hvm_state.hrt_file) { 
        PrintError(vm,VCORE_NONE,"hvm: HRT block contains bad file_id (%s)\n",hrt_file_id);
        return -1;
    }

    if (v3_register_hypercall(vm, HVM_HCALL, 
                              hvm_hcall_handler, 0)) { 
        PrintError(vm,VCORE_NONE, "hvm: cannot register hypercall....\n");
        return -1;
    }

    // XXX sanity check config here
    // NOTE(review): ros_cores/ros_mem are not range-checked against
    // num_cores/mem_size - a bad config is only caught later, if at all

    vm->hvm_state.is_hvm=1;

 out_ok:
    if (vm->hvm_state.is_hvm) {
        V3_Print(vm,VCORE_NONE,"hvm: [ROS: cores 0..%u, mem 0..%p] [HRT: cores %u..%u, mem %p..%p, file_id=%s (tag %s)]\n",
                 vm->hvm_state.first_hrt_core-1,
                 (void*) vm->hvm_state.first_hrt_gpa-1,
                 vm->hvm_state.first_hrt_core,
                 vm->num_cores-1,
                 (void*) vm->hvm_state.first_hrt_gpa,
                 (void*)vm->mem_size-1,
                 hrt_file_id,
                 vm->hvm_state.hrt_file->tag);
    } else {
        V3_Print(vm,VCORE_NONE,"hvm: This is a pure ROS VM\n");
    }
    return 0;
    
}
201
202
203 int v3_deinit_hvm_vm(struct v3_vm_info *vm)
204 {
205     PrintDebug(vm, VCORE_NONE, "hvm: HVM VM deinit\n");
206
207     v3_remove_hypercall(vm,HVM_HCALL);
208
209     return 0;
210 }
211
212 int v3_init_hvm_core(struct guest_info *core)
213 {
214     memset(&core->hvm_state,0,sizeof(core->hvm_state));
215     if (core->vm_info->hvm_state.is_hvm) { 
216         if (core->vcpu_id >= core->vm_info->hvm_state.first_hrt_core) { 
217             core->hvm_state.is_hrt=1;
218         }
219     }
220     return 0;
221 }
222
223 int v3_deinit_hvm_core(struct guest_info *core)
224 {
225     PrintDebug(core->vm_info, VCORE_NONE, "hvm: HVM core deinit\n");
226
227     return 0;
228 }
229
230
231 uint64_t v3_get_hvm_ros_memsize(struct v3_vm_info *vm)
232 {
233     if (vm->hvm_state.is_hvm) { 
234         return vm->hvm_state.first_hrt_gpa;
235     } else {
236         return vm->mem_size;
237     }
238 }
239 uint64_t v3_get_hvm_hrt_memsize(struct v3_vm_info *vm)
240 {
241     return vm->mem_size;
242 }
243
244 uint32_t v3_get_hvm_ros_cores(struct v3_vm_info *vm)
245 {
246     if (vm->hvm_state.is_hvm) { 
247         return vm->hvm_state.first_hrt_core;
248     } else {
249         return vm->num_cores;
250     }
251 }
252
253 uint32_t v3_get_hvm_hrt_cores(struct v3_vm_info *vm)
254 {
255     if (vm->hvm_state.is_hvm) { 
256         return vm->num_cores - vm->hvm_state.first_hrt_core;
257     } else {
258         return 0;
259     }
260 }
261
262
263 int v3_is_hvm_ros_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
264 {
265     if (vm->hvm_state.is_hvm) { 
266         return gpa>=0 && gpa<vm->hvm_state.first_hrt_gpa;
267     } else {
268         return 1;
269     }
270 }
271
272 int v3_is_hvm_hrt_mem_gpa(struct v3_vm_info *vm, addr_t gpa)
273 {
274     if (vm->hvm_state.is_hvm) { 
275         return gpa>=vm->hvm_state.first_hrt_gpa && gpa<vm->mem_size;
276     } else {
277         return 0;
278     }
279 }
280
/* Nonzero iff this core was designated an HRT core at core init. */
int v3_is_hvm_hrt_core(struct guest_info *core)
{
    return core->hvm_state.is_hrt;
}
285
286 int v3_is_hvm_ros_core(struct guest_info *core)
287 {
288     return !core->hvm_state.is_hrt;
289 }
290
291 int      v3_hvm_should_deliver_ipi(struct guest_info *src, struct guest_info *dest)
292 {
293     if (!src) {
294         // ioapic or msi to apic
295         return !dest->hvm_state.is_hrt;
296     } else {
297         // apic to apic
298         return src->hvm_state.is_hrt || (!src->hvm_state.is_hrt && !dest->hvm_state.is_hrt) ;
299     }
300 }
301
302 void     v3_hvm_find_apics_seen_by_core(struct guest_info *core, struct v3_vm_info *vm, 
303                                         uint32_t *start_apic, uint32_t *num_apics)
304 {
305     if (!core) { 
306         // Seen from ioapic, msi, etc: 
307         if (vm->hvm_state.is_hvm) {
308             // HVM VM shows only the ROS cores/apics to ioapic, msi, etc
309             *start_apic = 0;
310             *num_apics = vm->hvm_state.first_hrt_core;
311         } else {
312             // Non-HVM shows all cores/APICs to apic, msi, etc.
313             *start_apic = 0;
314             *num_apics = vm->num_cores;
315         }
316     } else {
317         // Seen from apic
318         if (core->hvm_state.is_hrt) { 
319             // HRT core/apic sees all apics
320             // (this policy may change...)
321             *start_apic = 0;
322             *num_apics = vm->num_cores;
323         } else {
324             // non-HRT core/apic sees only non-HRT cores/apics
325             *start_apic = 0 ;
326             *num_apics = vm->hvm_state.first_hrt_core;
327         }
328     }
329 }
330
331 #define MAX(x,y) ((x)>(y)?(x):(y))
332 #define MIN(x,y) ((x)<(y)?(x):(y))
333
334 #ifdef HVM_MAP_1G_2M
335 #define BOOT_STATE_END_ADDR (MIN(vm->mem_size,0x40000000ULL))
336 #else
337 #define BOOT_STATE_END_ADDR (MIN(vm->mem_size,0x800000000ULL))
338 #endif
339
340 static void get_null_int_handler_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
341 {
342     *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR - PAGE_SIZE);
343     *limit = PAGE_SIZE;
344 }
345
346 extern v3_cpu_arch_t v3_mach_type;
347
348 extern void *v3_hvm_svm_null_int_handler_start;
349 extern void *v3_hvm_svm_null_int_handler_end;
350 extern void *v3_hvm_vmx_null_int_handler_start;
351 extern void *v3_hvm_vmx_null_int_handler_end;
352
353 static void write_null_int_handler(struct v3_vm_info *vm)
354 {
355     void *base;
356     uint64_t limit;
357     void *data;
358     uint64_t len;
359
360     get_null_int_handler_loc(vm,&base,&limit);
361
362     switch (v3_mach_type) {
363 #ifdef V3_CONFIG_SVM
364         case V3_SVM_CPU:
365         case V3_SVM_REV3_CPU:
366             data = (void*) &v3_hvm_svm_null_int_handler_start;
367             len = (void*) &v3_hvm_svm_null_int_handler_end - data;
368             break;
369 #endif
370 #if V3_CONFIG_VMX
371         case V3_VMX_CPU:
372         case V3_VMX_EPT_CPU:
373         case V3_VMX_EPT_UG_CPU:
374             data = (void*) &v3_hvm_vmx_null_int_handler_start;
375             len = (void*) &v3_hvm_vmx_null_int_handler_end - data;
376             break;
377 #endif
378         default:
379             PrintError(vm,VCORE_NONE,"hvm: cannot determine CPU type to select null interrupt handler...\n");
380             data = 0;
381             len = 0;
382     }
383
384     if (data) {
385         v3_write_gpa_memory(&vm->cores[0],(addr_t)(base),len,(uint8_t*)data);
386     }
387
388     PrintDebug(vm,VCORE_NONE,"hvm: wrote null interrupt handler at %p (%llu bytes)\n",base,len);
389 }
390
391
392 static void get_idt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
393 {
394     *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR - 2 * PAGE_SIZE);
395     *limit = 16*256;
396 }
397
398 // default IDT entries (int and trap gates)
399 //
400 // Format is 16 bytes long:
401 //   16 offsetlo   => 0
402 //   16 selector   => (target code selector) => 0x8 // entry 1 of GDT
403 //    3 ist        => (stack) = 0 => current stack
404 //    5 reserved   => 0
405 //    4 type       => 0xe=>INT, 0xf=>TRAP 
406 //    1 reserved   => 0
407 //    2 dpl        => 0
408 //    1 present    => 1
409 //   16 offsetmid  => 0
410 //   32 offsethigh => 0   (total is a 64 bit offset)
411 //   32 reserved   => 0
412 //
413 // 00 00 | 08 00 | 00 | 8[typenybble] | offsetmid | offsethigh | reserved
414 // 
415 // Note little endian
416 //
// Gate templates (low/high 64 bits of a 16-byte IDT entry): selector
// 0x8, present, DPL 0, type nybble 0xf (trap) / 0xe (int); the three
// offset fields are zero and are patched in by write_idt().
static uint64_t idt64_trap_gate_entry_mask[2] = {  0x00008f0000080000, 0x0 } ;
static uint64_t idt64_int_gate_entry_mask[2] =  { 0x00008e0000080000, 0x0 };
419
/*
  Build the stub IDT in guest memory: 256 16-byte gates, all pointing
  at the null interrupt handler.  Vectors 0-31 (exceptions) get trap
  gates; vectors 32-255 get interrupt gates.
*/
static void write_idt(struct v3_vm_info *vm)
{
    void *base;
    uint64_t limit;
    void *handler;
    uint64_t handler_len;
    int i;
    uint64_t trap_gate[2];
    uint64_t int_gate[2];

    get_idt_loc(vm,&base,&limit);

    get_null_int_handler_loc(vm,&handler,&handler_len);

    // start from the templates (selector/type/present set, offsets zero)
    memcpy(trap_gate,idt64_trap_gate_entry_mask,16);
    memcpy(int_gate,idt64_int_gate_entry_mask,16);

    if (handler) {
        // update the entries for the handler location:
        // scatter the handler address bytes into the gate's three
        // offset fields (bytes [0:1]=low, [2:3]=mid, [4:7]=high)
        uint8_t *mask;
        uint8_t *hand;
        
        hand = (uint8_t*) &handler;

        mask = (uint8_t *)trap_gate;
        memcpy(&(mask[0]),&(hand[0]),2); // offset low
        memcpy(&(mask[6]),&(hand[2]),2); // offset med
        memcpy(&(mask[8]),&(hand[4]),4); // offset high

        mask = (uint8_t *)int_gate;
        memcpy(&(mask[0]),&(hand[0]),2); // offset low
        memcpy(&(mask[6]),&(hand[2]),2); // offset med
        memcpy(&(mask[8]),&(hand[4]),4); // offset high

        PrintDebug(vm,VCORE_NONE,"hvm: Adding default null trap and int gates\n");
    }

    // exception vectors 0..31: trap gates
    for (i=0;i<32;i++) { 
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)trap_gate);
    }

    // interrupt vectors 32..255: interrupt gates
    for (i=32;i<256;i++) { 
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*16),16,(uint8_t*)int_gate);
    }

    PrintDebug(vm,VCORE_NONE,"hvm: wrote IDT at %p\n",base);
}
467
468
469
470 static void get_gdt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
471 {
472     *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR - 3 * PAGE_SIZE);
473     *limit = 8*3;
474 }
475
// Minimal long-mode GDT: null descriptor, 64-bit code, data.
static uint64_t gdt64[3] = {
    0x0000000000000000, /* null */
    0x00a09a0000000000, /* code (note lme bit) */
    0x00a0920000000000, /* data (most entries don't matter) */
};
481
482 static void write_gdt(struct v3_vm_info *vm)
483 {
484     void *base;
485     uint64_t limit;
486
487     get_gdt_loc(vm,&base,&limit);
488     v3_write_gpa_memory(&vm->cores[0],(addr_t)base,limit,(uint8_t*) gdt64);
489
490     PrintDebug(vm,VCORE_NONE,"hvm: wrote GDT at %p\n",base);
491 }
492
493
494
495 static void get_tss_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
496 {
497     *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR - 4 * PAGE_SIZE);
498     *limit = PAGE_SIZE;
499 }
500
501 static void write_tss(struct v3_vm_info *vm)
502 {
503     void *base;
504     uint64_t limit;
505
506     get_tss_loc(vm,&base,&limit);
507
508     v3_set_gpa_memory(&vm->cores[0],(addr_t)base,limit,0);
509
510     PrintDebug(vm,VCORE_NONE,"hvm: wrote TSS at %p\n",base);
511 }
512
513 /*
514   PTS MAP FIRST 512 GB identity mapped: 
515   1 second level
516      512 entries
517   1 top level
518      1 entries
519
520 OR
521   
522   PTS MAP FIRST 1 GB identity mapped:
523   1 third level
524     512 entries
525   1 second level
526     1 entries
527   1 top level
528     1 entries
529 */
530
/*
  Locate the boot page-table area, directly below the TSS page:
  3 pages (PML4, PDPE, PD) for the 1 GB / 2 MB-page layout, or
  2 pages (PML4, PDPE) for the 512 GB / 1 GB-page layout.
*/
static void get_pt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
#ifdef HVM_MAP_1G_2M
    *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR-(5+2)*PAGE_SIZE);
    *limit =  3*PAGE_SIZE;
#else
    *base = (void*)PAGE_ADDR(BOOT_STATE_END_ADDR-(5+1)*PAGE_SIZE);
    *limit =  2*PAGE_SIZE;
#endif
}
541
542 #ifndef HVM_MAP_1G_2M
/*
  Build an identity map of the first 512 GB using two levels:
  a PML4 with only entry 0 present, pointing at one PDPE page whose
  512 1 GB large-page entries cover [0, 512 GB).  Tables are written
  directly into guest memory at the area from get_pt_loc().
  Oversized requests are logged but not treated as fatal.
*/
static void write_pt_2level_512GB(struct v3_vm_info *vm)
{
    void *base;
    uint64_t size;
    struct pml4e64 pml4e;
    struct pdpe64 pdpe;
    uint64_t i;

    get_pt_loc(vm,&base, &size);
    if (size!=2*PAGE_SIZE) { 
        PrintError(vm,VCORE_NONE,"Cannot support pt request, defaulting\n");
    }

    if (vm->mem_size > 0x800000000ULL) { 
        PrintError(vm,VCORE_NONE, "VM has more than 512 GB\n");
    }

    memset(&pdpe,0,sizeof(pdpe));
    pdpe.present=1;
    pdpe.writable=1;
    pdpe.large_page=1;
    
    // second page of the area: PDPE table, 512 x 1 GB pages
    for (i=0;i<512;i++) {
        pdpe.pd_base_addr = i*0x40000;  // 0x40000 = 256K 4KB pages = 1 GB
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+PAGE_SIZE+i*sizeof(pdpe)),sizeof(pdpe),(uint8_t*)&pdpe);
    }

    // first page: PML4 entry 0 -> the PDPE page
    memset(&pml4e,0,sizeof(pml4e));
    pml4e.present=1;
    pml4e.writable=1;
    pml4e.pdp_base_addr = PAGE_BASE_ADDR((addr_t)(base+PAGE_SIZE));

    v3_write_gpa_memory(&vm->cores[0],(addr_t)base,sizeof(pml4e),(uint8_t*)&pml4e);    

    // remaining PML4 entries explicitly written as not-present
    for (i=1;i<512;i++) {
        pml4e.present=0;
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*sizeof(pml4e)),sizeof(pml4e),(uint8_t*)&pml4e);
    }

    PrintDebug(vm,VCORE_NONE,"hvm: Wrote page tables (1 PML4, 1 PDPE) at %p (512 GB mapped)\n",base);
}
584
585 #else 
586
/*
  Build an identity map of the first 1 GB using three levels:
  PML4 (entry 0) -> PDPE (entry 0) -> PD whose 512 2 MB large-page
  entries cover [0, 1 GB).  Upper-level entries beyond index 0 are
  explicitly written as not-present.  Oversized requests are logged
  but not treated as fatal.
*/
static void write_pt_3level_1GB(struct v3_vm_info *vm)
{
    void *base;
    uint64_t size;
    struct pml4e64 pml4e;
    struct pdpe64 pdpe;
    struct pde64 pde;

    uint64_t i;

    get_pt_loc(vm,&base, &size);
    if (size!=3*PAGE_SIZE) { 
        PrintError(vm,VCORE_NONE,"Cannot support pt request, defaulting\n");
    }

    if (vm->mem_size > 0x40000000ULL) { 
        PrintError(vm,VCORE_NONE, "VM has more than 1 GB\n");
    }

    memset(&pde,0,sizeof(pde));
    pde.present=1;
    pde.writable=1;
    pde.large_page=1;
    
    // third page of the area: PD, 512 x 2 MB pages
    for (i=0;i<512;i++) {
        pde.pt_base_addr = i*0x200;  // 0x200 = 512 pages = 2 MB
        v3_write_gpa_memory(&vm->cores[0],
                            (addr_t)(base+2*PAGE_SIZE+i*sizeof(pde)),
                            sizeof(pde),(uint8_t*)&pde);
    }

    // second page: PDPE entry 0 -> the PD page
    memset(&pdpe,0,sizeof(pdpe));
    pdpe.present=1;
    pdpe.writable=1;
    pdpe.large_page=0;

    pdpe.pd_base_addr = PAGE_BASE_ADDR((addr_t)(base+2*PAGE_SIZE));

    v3_write_gpa_memory(&vm->cores[0],(addr_t)base+PAGE_SIZE,sizeof(pdpe),(uint8_t*)&pdpe);    
    
    for (i=1;i<512;i++) {
        pdpe.present = 0; 
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+PAGE_SIZE+i*sizeof(pdpe)),sizeof(pdpe),(uint8_t*)&pdpe);
    }

    // first page: PML4 entry 0 -> the PDPE page
    memset(&pml4e,0,sizeof(pml4e));
    pml4e.present=1;
    pml4e.writable=1;
    pml4e.pdp_base_addr = PAGE_BASE_ADDR((addr_t)(base+PAGE_SIZE));

    v3_write_gpa_memory(&vm->cores[0],(addr_t)base,sizeof(pml4e),(uint8_t*)&pml4e);    

    for (i=1;i<512;i++) {
        pml4e.present=0;
        v3_write_gpa_memory(&vm->cores[0],(addr_t)(base+i*sizeof(pml4e)),sizeof(pml4e),(uint8_t*)&pml4e);
    }

    PrintDebug(vm,VCORE_NONE,"hvm: Wrote page tables (1 PML4, 1 PDPE, 1 PDP) at %p (1 GB mapped)\n",base);
}
646
647 #endif
648
/* Write the boot page tables, using the layout selected by HVM_MAP_1G_2M. */
static void write_pt(struct v3_vm_info *vm)
{
#ifdef HVM_MAP_1G_2M
    write_pt_3level_1GB(vm);
#else
    write_pt_2level_512GB(vm);
#endif
}
657
/*
  Locate the one-page multiboot info area, which sits immediately
  below the page-table area (hence the dependence on HVM_MAP_1G_2M,
  which changes the page-table area's size).
*/
static void get_mb_info_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
{
#ifdef HVM_MAP_1G_2M
    *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR-(6+2)*PAGE_SIZE);
#else
    *base = (void*) PAGE_ADDR(BOOT_STATE_END_ADDR-(6+1)*PAGE_SIZE);
#endif
    *limit =  PAGE_SIZE;
}
667
668 static void write_mb_info(struct v3_vm_info *vm) 
669 {
670     if (vm->hvm_state.hrt_type!=HRT_MBOOT64) { 
671         PrintError(vm, VCORE_NONE,"hvm: Cannot handle this HRT type\n");
672         return;
673     } else {
674         uint8_t buf[256];
675         uint64_t size;
676         void *base;
677         uint64_t limit;
678
679         get_mb_info_loc(vm,&base,&limit);
680         
681         if ((size=v3_build_multiboot_table(&vm->cores[vm->hvm_state.first_hrt_core],buf,256))==-1) { 
682             PrintError(vm,VCORE_NONE,"hvm: Failed to build MB info\n");
683             return;
684         }
685
686         if (size>limit) { 
687             PrintError(vm,VCORE_NONE,"hvm: MB info is too large\n");
688             return;
689         }
690         
691         v3_write_gpa_memory(&vm->cores[vm->hvm_state.first_hrt_core],
692                             (addr_t)base,
693                             size,
694                             buf);
695
696         PrintDebug(vm,VCORE_NONE, "hvm: wrote MB info at %p\n", base);
697     }
698 }
699
700 #define SCRATCH_STACK_SIZE 4096
701
702
703 static void get_hrt_loc(struct v3_vm_info *vm, void **base, uint64_t *limit)
704 {
705     void *mb_base;
706     uint64_t mb_limit;
707     
708     get_mb_info_loc(vm,&mb_base,&mb_limit);
709     
710     mb_base-=SCRATCH_STACK_SIZE*v3_get_hvm_hrt_cores(vm);
711
712     *base = (void*)PAGE_ADDR(vm->hvm_state.first_hrt_gpa);
713
714     if (mb_base < *base+PAGE_SIZE) { 
715         PrintError(vm,VCORE_NONE,"hvm: HRT stack colides with HRT\n");
716     }
717
718     *limit = mb_base - *base;
719 }
720
721
722 #define ERROR(fmt, args...) PrintError(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
723 #define INFO(fmt, args...) PrintDebug(VM_NONE,VCORE_NONE,"hvm: " fmt,##args)
724
725 #define ELF_MAGIC    0x464c457f
726 #define MB2_MAGIC    0xe85250d6
727
728 #define MB2_INFO_MAGIC    0x36d76289
729
/*
  Does the supplied image begin with the ELF magic ("\x7fELF")?
  Uses a byte-wise compare, so it is safe for unaligned buffers and
  independent of host endianness, and it rejects images shorter than
  4 bytes (the original read 4 bytes unconditionally, overrunning
  short buffers).
*/
static int is_elf(uint8_t *data, uint64_t size)
{
    static const uint8_t elf_magic[4] = { 0x7f, 'E', 'L', 'F' };

    if (size < 4) {
        return 0;
    }

    return memcmp(data, elf_magic, 4) == 0;
}
738
739 static mb_header_t *find_mb_header(uint8_t *data, uint64_t size)
740 {
741     uint64_t limit = size > 32768 ? 32768 : size;
742     uint64_t i;
743
744     // Scan for the .boot magic cookie
745     // must be in first 32K, assume 4 byte aligned
746     for (i=0;i<limit;i+=4) { 
747         if (*((uint32_t*)&data[i])==MB2_MAGIC) {
748             INFO("Found multiboot header at offset 0x%llx\n",i);
749             return (mb_header_t *) &data[i];
750         }
751     }
752     return 0;
753 }
754
755
756 // 
757 // BROKEN - THIS DOES NOT DO WHAT YOU THINK
758 //
/*
  (See BROKEN note above.)  Copies the raw HRT file - ELF headers and
  all - to "base" and blindly assumes the entry point is at base+0x40;
  no real ELF loading is performed.  "limit" is unused here; the
  caller has already verified the file fits.  Always returns 0.
*/
static int setup_elf(struct v3_vm_info *vm, void *base, uint64_t limit)
{
    v3_write_gpa_memory(&vm->cores[0],(addr_t)base,vm->hvm_state.hrt_file->size,vm->hvm_state.hrt_file->data);

    // entry assumed at fixed offset 0x40 - only works for images
    // built to tolerate this
    vm->hvm_state.hrt_entry_addr = (uint64_t) (base+0x40);

    PrintDebug(vm,VCORE_NONE,"hvm: wrote HRT ELF %s at %p\n", vm->hvm_state.hrt_file->tag,base);
    PrintDebug(vm,VCORE_NONE,"hvm: set ELF entry to %p and hoping for the best...\n", (void*) vm->hvm_state.hrt_entry_addr);
    
    vm->hvm_state.hrt_type = HRT_ELF64;

    return 0;

}
773
774 static int setup_mb_kernel(struct v3_vm_info *vm, void *base, uint64_t limit)
775 {
776     mb_data_t mb;
777
778     if (v3_parse_multiboot_header(vm->hvm_state.hrt_file,&mb)) { 
779         PrintError(vm,VCORE_NONE, "hvm: failed to parse multiboot kernel header\n");
780         return -1;
781     }
782
783
784     if (v3_write_multiboot_kernel(vm,&mb,vm->hvm_state.hrt_file,base,limit)) {
785         PrintError(vm,VCORE_NONE, "hvm: failed to write multiboot kernel into memory\n");
786         return -1;
787     }
788
789     /*
790     if (!mb.addr || !mb.entry) { 
791         PrintError(vm,VCORE_NONE, "hvm: kernel is missing address or entry point\n");
792         return -1;
793     }
794
795     if (((void*)(uint64_t)(mb.addr->header_addr) < base ) ||
796         ((void*)(uint64_t)(mb.addr->load_end_addr) > base+limit) ||
797         ((void*)(uint64_t)(mb.addr->bss_end_addr) > base+limit)) { 
798         PrintError(vm,VCORE_NONE, "hvm: kernel is not within the allowed portion of HVM\n");
799         return -1;
800     }
801
802     offset = mb.addr->load_addr - mb.addr->header_addr;
803
804     // Skip the ELF header - assume 1 page... weird.... 
805     // FIX ME TO CONFORM TO MULTIBOOT.C
806     v3_write_gpa_memory(&vm->cores[0],
807                         (addr_t)(mb.addr->load_addr),
808                         vm->hvm_state.hrt_file->size-PAGE_SIZE-offset,
809                         vm->hvm_state.hrt_file->data+PAGE_SIZE+offset);
810
811         
812     // vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr + PAGE_SIZE; //HACK PAD
813
814
815     PrintDebug(vm,VCORE_NONE,
816                "hvm: wrote 0x%llx bytes starting at offset 0x%llx to %p; set entry to %p\n",
817                (uint64_t) vm->hvm_state.hrt_file->size-PAGE_SIZE-offset,
818                (uint64_t) PAGE_SIZE+offset,
819                (void*)(addr_t)(mb.addr->load_addr),
820                (void*) vm->hvm_state.hrt_entry_addr);
821
822
823     */
824
825     vm->hvm_state.hrt_entry_addr = (uint64_t) mb.entry->entry_addr;
826     
827     vm->hvm_state.hrt_type = HRT_MBOOT64;
828
829     return 0;
830
831 }
832
833
834 static int setup_hrt(struct v3_vm_info *vm)
835 {
836     void *base;
837     uint64_t limit;
838
839     get_hrt_loc(vm,&base,&limit);
840
841     if (vm->hvm_state.hrt_file->size > limit) { 
842         PrintError(vm,VCORE_NONE,"hvm: Cannot map HRT because it is too big (%llu bytes, but only have %llu space\n", vm->hvm_state.hrt_file->size, (uint64_t)limit);
843         return -1;
844     }
845
846     if (!is_elf(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) { 
847         PrintError(vm,VCORE_NONE,"hvm: supplied HRT is not an ELF but we are going to act like it is!\n");
848         if (setup_elf(vm,base,limit)) {
849             PrintError(vm,VCORE_NONE,"hvm: Fake ELF setup failed\n");
850             return -1;
851         }
852         vm->hvm_state.hrt_type=HRT_BLOB;
853     } else {
854         if (find_mb_header(vm->hvm_state.hrt_file->data,vm->hvm_state.hrt_file->size)) { 
855             PrintDebug(vm,VCORE_NONE,"hvm: appears to be a multiboot kernel\n");
856             if (setup_mb_kernel(vm,base,limit)) { 
857                 PrintError(vm,VCORE_NONE,"hvm: multiboot kernel setup failed\n");
858                 return -1;
859             } 
860         } else {
861             PrintDebug(vm,VCORE_NONE,"hvm: supplied HRT is an ELF\n");
862             if (setup_elf(vm,base,limit)) {
863                 PrintError(vm,VCORE_NONE,"hvm: Fake ELF setup failed\n");
864                 return -1;
865             }
866         }
867     }
868
869     return 0;
870 }
871
872
873         
874
875 /*
876   GPA layout:
877
878   HRT
879   ---
880   ROS
881
882   We do not touch the ROS portion of the address space.
883   The HRT portion looks like:
884
885   INT_HANDLER (1 page - page aligned)
886   IDT (1 page - page aligned)
887   GDT (1 page - page aligned)
888   TSS (1 page - page aligned)
889   PAGETABLES  (identity map of first N GB)
890      ROOT PT first, followed by 2nd level, etc.
891      Currently PML4 followed by 1 PDPE for 512 GB of mapping
892   MBINFO_PAGE
893   SCRATCH_STACK_HRT_CORE0 
894   SCRATCH_STACK_HRT_CORE1
895   ..
896   SCRATCH_STACK_HRT_COREN
897   ...
898   HRT (as many pages as needed, page-aligned, starting at first HRT address)
899   ---
900   ROS
901       
902 */
903
904
905 int v3_setup_hvm_vm_for_boot(struct v3_vm_info *vm)
906 {
907     if (!vm->hvm_state.is_hvm) { 
908         PrintDebug(vm,VCORE_NONE,"hvm: skipping HVM setup for boot as this is not an HVM\n");
909         return 0;
910     }
911
912     PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory begins\n");
913
914     write_null_int_handler(vm);
915     write_idt(vm);
916     write_gdt(vm);
917     write_tss(vm);
918
919     write_pt(vm);
920
921     
922     if (setup_hrt(vm)) {
923         PrintError(vm,VCORE_NONE,"hvm: failed to setup HRT\n");
924         return -1;
925     } 
926
927     // need to parse HRT first
928     write_mb_info(vm);
929
930     PrintDebug(vm,VCORE_NONE,"hvm: setup of HVM memory done\n");
931
932     return 0;
933 }
934
935 /*
936   On entry for every core:
937
938    IDTR points to stub IDT
939    GDTR points to stub GDT
940    TS   points to stub TSS
941    CR3 points to root page table
942    CR0 has PE and PG
943    EFER has LME AND LMA
944    RSP is TOS of core's scratch stack (looks like a call)
945
946    RAX = MB magic cookie
947    RBX = address of multiboot info table
948    RCX = this core id / apic id (0..N-1)
949    RDX = this core id - first HRT core ID (==0 for the first HRT core)
950
951    Other regs are zeroed
952
953    shadow/nested paging state reset for long mode
954
955 */
956 int v3_setup_hvm_hrt_core_for_boot(struct guest_info *core)
957 {
958     void *base;
959     uint64_t limit;
960
961     rdtscll(core->hvm_state.last_boot_start);
962
963     if (!core->hvm_state.is_hrt) { 
964         PrintDebug(core->vm_info,core,"hvm: skipping HRT setup for core %u as it is not an HRT core\n", core->vcpu_id);
965         return 0;
966     }
967
968     PrintDebug(core->vm_info, core, "hvm: setting up HRT core (%u) for boot\n", core->vcpu_id);
969
970     
971
972     
973     memset(&core->vm_regs,0,sizeof(core->vm_regs));
974     memset(&core->ctrl_regs,0,sizeof(core->ctrl_regs));
975     memset(&core->dbg_regs,0,sizeof(core->dbg_regs));
976     memset(&core->segments,0,sizeof(core->segments));    
977     memset(&core->msrs,0,sizeof(core->msrs));    
978     memset(&core->fp_state,0,sizeof(core->fp_state));    
979
980     // We are in long mode with virtual memory and we want
981     // to start immediatley
982     core->cpl = 0; // we are going right into the kernel
983     core->cpu_mode = LONG;
984     core->mem_mode = VIRTUAL_MEM; 
985     core->core_run_state = CORE_RUNNING ;
986
987
988     // magic
989     core->vm_regs.rax = MB2_INFO_MAGIC;
990
991     // multiboot info pointer
992     get_mb_info_loc(core->vm_info, &base,&limit);
993     core->vm_regs.rbx = (uint64_t) base;  
994
995     // core number
996     core->vm_regs.rcx = core->vcpu_id;
997     
998     // HRT core number
999     core->vm_regs.rdx = core->vcpu_id - core->vm_info->hvm_state.first_hrt_core;
1000
1001     // Now point to scratch stack for this core
1002     // it begins at an ofset relative to the MB info page
1003     get_mb_info_loc(core->vm_info, &base,&limit);
1004     base -= core->vm_regs.rdx * SCRATCH_STACK_SIZE;
1005     core->vm_regs.rsp = (v3_reg_t) base;  
1006     core->vm_regs.rbp = (v3_reg_t) base-8; 
1007
1008     // push onto the stack a bad rbp and bad return address
1009     core->vm_regs.rsp-=16;
1010     v3_set_gpa_memory(core,
1011                       core->vm_regs.rsp,
1012                       16,
1013                       0xff);
1014
1015
1016     // HRT entry point
1017     get_hrt_loc(core->vm_info, &base,&limit);
1018     core->rip = (uint64_t) core->vm_info->hvm_state.hrt_entry_addr ; 
1019
1020
1021     PrintDebug(core->vm_info,core,"hvm: hrt core %u has rip=%p, rsp=%p, rbp=%p, rax=%p, rbx=%p, rcx=%p, rdx=%p\n",
1022                (core->vcpu_id - core->vm_info->hvm_state.first_hrt_core),
1023                (void*)(core->rip),
1024                (void*)(core->vm_regs.rsp),
1025                (void*)(core->vm_regs.rbp),
1026                (void*)(core->vm_regs.rax),
1027                (void*)(core->vm_regs.rbx),
1028                (void*)(core->vm_regs.rcx),
1029                (void*)(core->vm_regs.rdx));
1030
1031     // Setup CRs for long mode and our stub page table
1032     // CR0: PG, PE
1033     core->ctrl_regs.cr0 = 0x80000001;
1034     core->shdw_pg_state.guest_cr0 = core->ctrl_regs.cr0;
1035
1036     // CR2: don't care (output from #PF)
1037     // CE3: set to our PML4E, without setting PCD or PWT
1038     get_pt_loc(core->vm_info, &base,&limit);
1039     core->ctrl_regs.cr3 = PAGE_ADDR((addr_t)base);
1040     core->shdw_pg_state.guest_cr3 = core->ctrl_regs.cr3;
1041
1042     // CR4: PGE, PAE, PSE (last byte: 1 0 1 1 0 0 0 0)
1043     core->ctrl_regs.cr4 = 0xb0;
1044     core->shdw_pg_state.guest_cr4 = core->ctrl_regs.cr4;
1045     // CR8 as usual
1046     // RFLAGS zeroed is fine: come in with interrupts off
1047     // EFER needs SVME LMA LME (last 16 bits: 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0
1048     core->ctrl_regs.efer = 0x1500;
1049     core->shdw_pg_state.guest_efer.value = core->ctrl_regs.efer;
1050
1051
1052     /* 
1053        Notes on selectors:
1054
1055        selector is 13 bits of index, 1 bit table indicator 
1056        (0=>GDT), 2 bit RPL
1057        
1058        index is scaled by 8, even in long mode, where some entries 
1059        are 16 bytes long.... 
1060           -> code, data descriptors have 8 byte format
1061              because base, limit, etc, are ignored (no segmentation)
1062           -> interrupt/trap gates have 16 byte format 
1063              because offset needs to be 64 bits
1064     */
1065     
1066     // Install our stub IDT
1067     get_idt_loc(core->vm_info, &base,&limit);
1068     core->segments.idtr.selector = 0;  // entry 0 (NULL) of the GDT
1069     core->segments.idtr.base = (addr_t) base;
1070     core->segments.idtr.limit = limit-1;
1071     core->segments.idtr.type = 0xe;
1072     core->segments.idtr.system = 1; 
1073     core->segments.idtr.dpl = 0;
1074     core->segments.idtr.present = 1;
1075     core->segments.idtr.long_mode = 1;
1076
1077     // Install our stub GDT
1078     get_gdt_loc(core->vm_info, &base,&limit);
1079     core->segments.gdtr.selector = 0;
1080     core->segments.gdtr.base = (addr_t) base;
1081     core->segments.gdtr.limit = limit-1;
1082     core->segments.gdtr.type = 0x6;
1083     core->segments.gdtr.system = 1; 
1084     core->segments.gdtr.dpl = 0;
1085     core->segments.gdtr.present = 1;
1086     core->segments.gdtr.long_mode = 1;
1087     
1088     // And our TSS
1089     get_tss_loc(core->vm_info, &base,&limit);
1090     core->segments.tr.selector = 0;
1091     core->segments.tr.base = (addr_t) base;
1092     core->segments.tr.limit = limit-1;
1093     core->segments.tr.type = 0x6;
1094     core->segments.tr.system = 1; 
1095     core->segments.tr.dpl = 0;
1096     core->segments.tr.present = 1;
1097     core->segments.tr.long_mode = 1;
1098     
1099     base = 0x0;
1100     limit = -1;
1101
1102     // And CS
1103     core->segments.cs.selector = 0x8 ; // entry 1 of GDT (RPL=0)
1104     core->segments.cs.base = (addr_t) base;
1105     core->segments.cs.limit = limit;
1106     core->segments.cs.type = 0xe;
1107     core->segments.cs.system = 0; 
1108     core->segments.cs.dpl = 0;
1109     core->segments.cs.present = 1;
1110     core->segments.cs.long_mode = 1;
1111
1112     // DS, SS, etc are identical
1113     core->segments.ds.selector = 0x10; // entry 2 of GDT (RPL=0)
1114     core->segments.ds.base = (addr_t) base;
1115     core->segments.ds.limit = limit;
1116     core->segments.ds.type = 0x6;
1117     core->segments.ds.system = 0; 
1118     core->segments.ds.dpl = 0;
1119     core->segments.ds.present = 1;
1120     core->segments.ds.long_mode = 1;
1121     
1122     memcpy(&core->segments.ss,&core->segments.ds,sizeof(core->segments.ds));
1123     memcpy(&core->segments.es,&core->segments.ds,sizeof(core->segments.ds));
1124     memcpy(&core->segments.fs,&core->segments.ds,sizeof(core->segments.ds));
1125     memcpy(&core->segments.gs,&core->segments.ds,sizeof(core->segments.ds));
1126     
1127
1128     // reset paging here for shadow... 
1129
1130     if (core->shdw_pg_mode != NESTED_PAGING) { 
1131         PrintError(core->vm_info, core, "hvm: shadow paging guest... this will end badly\n");
1132         return -1;
1133     }
1134
1135
1136     return 0;
1137 }
1138
/*
 * Handle a core-level reset request for an HVM guest.
 *
 * Return values:
 *    0 - this core is not participating in an HVM/HRT reset; the
 *        caller should fall through to the normal reset path
 *        (non-resetting core, non-HVM VM, or a ROS core)
 *    1 - this HRT core's reset was fully handled here
 *   <0 - the HRT reset was attempted but failed
 *
 * All HRT cores rendezvous here on the shared counting barrier; the
 * first HRT core ("leader") additionally performs the VM-wide state
 * transitions and the VM-wide boot setup.
 */
int v3_handle_hvm_reset(struct guest_info *core)
{

    if (core->core_run_state != CORE_RESETTING) { 
        return 0;
    }

    if (!core->vm_info->hvm_state.is_hvm) { 
        return 0;
    }

    if (v3_is_hvm_hrt_core(core)) { 
        // this is an HRT reset
        int rc=0;

        // wait for all the HRT cores to arrive before touching shared state
        v3_counting_barrier(&core->vm_info->reset_barrier);

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) { 
            // I am leader
            core->vm_info->run_state = VM_RESETTING;
        }

        core->core_run_state = CORE_RESETTING;

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) {
            // we really only need to clear the bss
            // and recopy the .data, but for now we'll just
            // do everything
            rc |= v3_setup_hvm_vm_for_boot(core->vm_info);

            if (rc) { 
                PrintError(core->vm_info,core,"hvm: failed to setup HVM VM for boot rc=%d\n",rc);
            }
        }

        // now everyone is ready to reset
        // NOTE(review): non-leader cores reach this per-core setup without
        // waiting for the leader's v3_setup_hvm_vm_for_boot() above to
        // complete; presumably the per-core setup is independent of the
        // VM-wide setup — confirm this ordering is intentional.
        rc |= v3_setup_hvm_hrt_core_for_boot(core);

        if (rc) { 
            PrintError(core->vm_info,core,"hvm: failed to setup HVM core for boot rc=%d\n",rc);
        }

        core->core_run_state = CORE_RUNNING;

        if (core->vcpu_id==core->vm_info->hvm_state.first_hrt_core) { 
            // leader
            core->vm_info->run_state = VM_RUNNING;
        }

        // second rendezvous: no core leaves until every core has finished
        // its per-core setup and marked itself running
        v3_counting_barrier(&core->vm_info->reset_barrier);

        // rc is an OR-accumulation of the (0 or -1) setup results, so a
        // negative value here means at least one setup step failed
        if (rc<0) { 
            PrintError(core->vm_info,core,"hvm: reset failed\n");
            return rc;
        } else {
            return 1;
        }

    } else { 
        // ROS core will be handled by normal reset functionality
        return 0;
    }
}