Palacios Public Git Repository

To check out Palacios, execute:

  git clone http://v3vee.org/palacios/palacios.web/palacios.git
This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, simply execute:
  cd palacios
  git checkout --track -b devel origin/devel
The other branches are similar.


Gears MPI Accelerator Service
[palacios.git] / gears / services / mpi / mpi.c
1 /* 
2    MPI module
3   
4    (c) 2012 Peter Dinda
5
6  */
7
8
9 #include <linux/module.h>
10 #include <linux/kernel.h>
11 #include <linux/slab.h>
12 #include <linux/mm.h>
13 #include <linux/sched.h>
14
15 #include <palacios/vmm.h>
16 #include <palacios/vm_guest.h>
17 #include <interfaces/vmm_host_hypercall.h>
18
19 #include "mpi_hc.h"
20
/*
 * Debug output control.  Set a flag to 1 to compile in the matching
 * printk() calls.  When a flag is 0 the macro expands to an empty
 * statement, so its arguments are neither evaluated nor type checked.
 */
#define DEEP_DEBUG    0
#define SHALLOW_DEBUG 0

/*
 * All logging macros require fmt to be a string literal so that the
 * kernel log level can be prepended by literal concatenation.
 */
#if DEEP_DEBUG
#define DEEP_DEBUG_PRINT(fmt, args...) printk(KERN_DEBUG fmt, ##args)
#else
#define DEEP_DEBUG_PRINT(fmt, args...) do { } while (0)
#endif

#if SHALLOW_DEBUG
#define SHALLOW_DEBUG_PRINT(fmt, args...) printk(KERN_DEBUG fmt, ##args)
#else
#define SHALLOW_DEBUG_PRINT(fmt, args...) do { } while (0)
#endif


/* Always-on messages, tagged with an explicit kernel log level */
#define ERROR(fmt, args...) printk(KERN_ERR fmt, ##args)
#define INFO(fmt, args...) printk(KERN_INFO fmt, ##args)

/* Number of rows in the rendezvous table (rtab) */
#define RENDEZVOUS_TABLE_MAX 32
/* Size in bytes of the executable name stored in each row */
#define EXEC_NAME_MAX 128
42
/*
 * One row of the rendezvous table.  A row is claimed by mpi_init_hcall,
 * given a rank by mpi_comm_rank_hcall, and zeroed by mpi_deinit_hcall.
 * It also holds the parameters of the row owner's blocked send and/or
 * receive so that the matching peer can complete the transfer.
 */
struct rendezvous_table_row {
  // FREE is 0 so that a zeroed row is free
  enum {
    FREE=0,
    INITED,   // set at the end of mpi_init_hcall
    RANKED,   // set by mpi_comm_rank_hcall
  }  state;
  
  char exec[EXEC_NAME_MAX];           // name copied from the guest's argv[0]; rows are paired by comparing it
  uint64_t rank;                      // rank value read from the guest in mpi_comm_rank_hcall
  struct guest_info *core;            // core that issued the init hypercall
  struct guest_accessors *acc;        // accessors saved at init, used for cross-VM copies
  uint64_t cr3;                       // acc->get_cr3(core) at init; with core it identifies the caller
  wait_queue_head_t send_wait_queue;  // a blocked sender sleeps here
  int send_pending;                   // nonzero while a send is waiting for its receiver
  uint64_t send_vaddr;                // guest virtual address of the pending send buffer
  uint64_t send_size;                 // size of the pending send (treated as bytes)
  uint64_t send_dest;                 // destination rank of the pending send
  uint64_t send_tag;                  // tag of the pending send
  uint64_t send_rc;                   // return code for the blocked sender; the receiver sets it to 0
  wait_queue_head_t recv_wait_queue;  // a blocked receiver sleeps here
  int recv_pending;                   // nonzero while a receive is waiting for its sender
  uint64_t recv_vaddr;                // guest virtual address of the pending receive buffer
  uint64_t recv_size;                 // capacity of the pending receive (treated as bytes)
  uint64_t recv_src;                  // source rank of the pending receive
  uint64_t recv_tag;                  // tag of the pending receive
  uint64_t recv_stat_vaddr;           // NOTE(review): not written or read by the code in this file section
  uint64_t recv_rc;                   // return code for the blocked receiver; the sender sets it to 0
};

// The rendezvous table: RENDEZVOUS_TABLE_MAX rows, allocated in init_module
static struct rendezvous_table_row *rtab;
73
74
75 static int mpi_init_hcall(struct guest_info *core,
76                           struct guest_accessors *acc,
77                           int *argc, 
78                           char ***argv)
79 {
80   int i;
81   struct rendezvous_table_row *r;
82   uint32_t va;
83
84   SHALLOW_DEBUG_PRINT("mpi: mpi_init_hcall(%p,%p)\n",(void*)argc,(void*)argv);
85   
86   if (!rtab) { 
87     ERROR("mpi: no rtab!\n");
88     return -1;
89   } 
90
91   for (i=0;i<RENDEZVOUS_TABLE_MAX;i++) { 
92     if (rtab[i].state==FREE) {
93       break;
94     }
95   }
96   
97   if (i==RENDEZVOUS_TABLE_MAX) { 
98     ERROR("mpi: no room in rtab\n");
99     return -1;
100   }
101
102   r=&(rtab[i]);
103   r->rank=0;
104   r->core=core;
105   r->acc=acc;
106   r->cr3=acc->get_cr3(core);
107   r->send_pending=0;
108   r->recv_pending=0;
109
110   // The following hideously assumes that FIX FIX FIX
111   // the guest app is 32 bit!  FIX FIX FIX
112   // THIS IS COMMON ASSUMPTION THROUGHOUT FIX FIX FIX
113   if (acc->read_gva(core,(uint64_t)argv,4,&va)<0) { 
114     ERROR("mpi: init cannot copy argv (first deref)\n");
115     return -1;
116   } else {
117     //now we have *argv
118     // we want **argv
119     if (acc->read_gva(core,(uint64_t)va,4,&va)<0) { 
120       ERROR("mpi: init cannot copy argv (second deref)\n");
121       return -1;
122     } else {
123       // now we have **argv, and we want the array it points to
124       if (acc->read_gva(core,(uint64_t)va,EXEC_NAME_MAX,r->exec)<0) { 
125         ERROR("mpi: init cannot copy exec name (third deref)\n");
126         return -1;
127       }
128       // for good measure
129       r->exec[EXEC_NAME_MAX-1]=0;
130     }
131   }
132   
133   init_waitqueue_head(&(r->send_wait_queue));
134   init_waitqueue_head(&(r->recv_wait_queue));
135
136   r->state=INITED;
137   
138   DEEP_DEBUG_PRINT("mpi: inited entry %d to '%s' core=%p cr3=%p\n",
139                    i,r->exec,r->core,(void*)(r->cr3));
140
141   return 0;
142 }
143
144 static int mpi_deinit_hcall(struct guest_info *core,
145                             struct guest_accessors *acc)
146 {
147   int i;
148   uint64_t cr3;
149
150   SHALLOW_DEBUG_PRINT("mpi: mpi_deinit_hcall()\n");
151
152   cr3=acc->get_cr3(core);
153
154   for (i=0;i<RENDEZVOUS_TABLE_MAX;i++) { 
155     if (rtab[i].state!=FREE && 
156         rtab[i].core==core &&
157         rtab[i].cr3==cr3) {
158       break;
159     }
160   }
161   
162   if (i==RENDEZVOUS_TABLE_MAX) { 
163     ERROR("mpi: could not find matching row in rtab to delete\n");
164     return -1;
165   }
166
167   if (rtab[i].send_pending) { 
168     ERROR("mpi: warning: deleting matching row with send pending\n");
169   }
170
171   if (rtab[i].recv_pending) { 
172     ERROR("mpi: warning: deleting matching row with recv pending\n");
173   }
174
175   DEEP_DEBUG_PRINT("mpi: removing row for core %p, cr3 %p, exec '%s'\n",
176                    core, (void*)cr3, rtab[i].exec);
177
178   
179   memset(&(rtab[i]),0,sizeof(struct rendezvous_table_row));
180   
181   return 0;
182 }
183
184 static int mpi_comm_rank_hcall(struct guest_info *core,
185                                struct guest_accessors *acc,
186                                void *comm_va, 
187                                int *rank_va)
188 {
189   int i;
190   uint64_t cr3;
191
192   SHALLOW_DEBUG_PRINT("mpi_comm_rank_hcall(%p,%p)\n",(void*)comm_va,(void*)rank_va);
193
194   cr3=acc->get_cr3(core);
195
196   for (i=0;i<RENDEZVOUS_TABLE_MAX;i++) { 
197     if (rtab[i].state==INITED && 
198         rtab[i].core==core &&
199         rtab[i].cr3==cr3) {
200       break;
201     }
202   }
203   
204   if (i==RENDEZVOUS_TABLE_MAX) { 
205     ERROR("mpi: no matching row found\n");
206     return -1;
207   }
208   
209   //
210   // The following completely ignores the communicator
211   // Throughout we assume everyone is in MPI_COMM_WORLD
212   // FIX FIX FIX FIX
213   //
214
215   if (acc->read_gva(core,(uint64_t)rank_va,4,&(rtab[i].rank))<0) { 
216     ERROR("mpi: rank cannot copy rank\n");
217     return -1;
218   } 
219
220   rtab[i].state=RANKED;
221   
222   SHALLOW_DEBUG_PRINT("mpi: ranking rcore %p, cr3 %p, exec '%s' as %llu\n",
223                       core, (void*)cr3, rtab[i].exec, rtab[i].rank);
224
225   return 0;
226 }
227
228 #define PAGE_ADDR(x) ((x)&~((uint64_t)0xfff))
229 #define PAGE_NEXT_ADDR(x) (PAGE_ADDR(x)+0x1000)
230
231
232
233 static uint64_t fast_inter_vm_copy(struct guest_info      *dest_core,
234                                    struct guest_accessors *dest_acc,
235                                    uint64_t                dest_va,
236                                    struct guest_info      *src_core,
237                                    struct guest_accessors *src_acc,
238                                    uint64_t                src_va,
239                                    uint64_t                count)
240 {
241
242   uint64_t left, chunk;
243   uint64_t src_page_left, dest_page_left;
244   uint64_t src_host_va, dest_host_va;
245   
246   left = count;
247
248   while (left) { 
249     src_page_left = PAGE_NEXT_ADDR(src_va) - src_va;
250     dest_page_left = PAGE_NEXT_ADDR(dest_va) - dest_va;
251     
252     chunk = src_page_left < dest_page_left ? src_page_left : dest_page_left;
253     chunk = chunk < left ? chunk : left;
254
255     DEEP_DEBUG_PRINT("mpi: copy chunk=%d, src_va=%p, dest_va=%p\n", 
256                      chunk, src_va, dest_va);
257
258     if (src_acc->gva_to_hva(src_core,src_va,&src_host_va)<0) { 
259       ERROR("mpi: cannot translate src address %p in VM core %p\n",src_va,src_core);
260       return count-left;
261     }
262     if (dest_acc->gva_to_hva(dest_core,dest_va,&dest_host_va)<0) { 
263       ERROR("mpi: cannot translate dest address %p in VM core %p\n",dest_va,dest_core);
264       return count-left;
265     }
266
267     DEEP_DEBUG_PRINT("mpi: copy chunk=%d, src_host_va=%p, dest_host_va=%p\n",
268                      chunk, src_host_va, dest_host_va);
269
270     memcpy((void*)dest_host_va,(void*)src_host_va,chunk);
271  
272     src_va += chunk;
273     dest_va += chunk;
274     left -= chunk;
275   }
276
277   return count;
278
279 }
280                                  
281
282
283 static int mpi_send_hcall(struct guest_info *core,
284                           struct guest_accessors *acc,
285                           void *buf, 
286                           int n, 
287                           int dtype, 
288                           int dest, 
289                           int tag, 
290                           int comm)
291 {
292   uint64_t cr3;
293   int i;
294   struct rendezvous_table_row *sender, *receiver;
295
296   SHALLOW_DEBUG_PRINT("mpi: mpi_send_hcall(%p,%p,%p,%p,%p,%p)\n",(void*)buf,(void*)n,(void*)dtype,(void*)dest,(void*)tag,(void*)comm);
297
298   cr3=acc->get_cr3(core);
299
300   // First find me
301   for (i=0;i<RENDEZVOUS_TABLE_MAX;i++) { 
302     if (rtab[i].state==RANKED && 
303         rtab[i].core==core &&
304         rtab[i].cr3==cr3) {
305       break;
306     }
307   }
308
309   if (i==RENDEZVOUS_TABLE_MAX) { 
310     ERROR("mpi: existential panic in send\n");
311     return -1;
312   }
313
314   sender=&(rtab[i]);
315
316   // Next try to find a matching receive
317
318   for (i=0;i<RENDEZVOUS_TABLE_MAX;i++) { 
319     if (&(rtab[i])!=sender &&
320         rtab[i].state==RANKED && 
321         strncmp(rtab[i].exec,sender->exec,EXEC_NAME_MAX)==0) {
322       break;
323     }
324   }
325
326   if (i==RENDEZVOUS_TABLE_MAX) { 
327     DEEP_DEBUG_PRINT("mpi: receiver does not exist yet - pending ourselves\n");
328     goto pending;
329   } else {
330     receiver=&(rtab[i]);
331     if (!(receiver->recv_pending)) { 
332       DEEP_DEBUG_PRINT("mpi: receiver has no pending receive - pending ourselves\n");
333       goto pending;
334     } 
335     // totally ignores communicator!!!  FIX FIX FIX
336     // simplistic fully qualified matching FIX FIX FIX
337     if (receiver->recv_tag==tag &&
338         receiver->recv_src==sender->rank) { 
339       // fast path
340       // totally ignores types and assumes byte xfer FIX FIX FIX
341       uint64_t size = n < receiver->recv_size ? n : receiver->recv_size;
342
343       SHALLOW_DEBUG_PRINT("mpi: mpi_send: copying %llu bytes\n", size);
344       
345       if (fast_inter_vm_copy(receiver->core,
346                              receiver->acc,
347                              receiver->recv_vaddr,
348                              core,
349                              acc,
350                              buf,
351                              size) != size) { 
352         ERROR("mpi: fast_inter_vm_copy failed in mpi_send: destvm=%p, destacc=%p, dest_va=%p, srcvm=%p, srcacc=%p, src_va=%p, size=%llu\n",receiver->core,receiver->acc,receiver->recv_vaddr,core,acc,buf,size);
353         return -1;
354       }
355                              
356
357       SHALLOW_DEBUG_PRINT("mpi: mpi_send: finished copying\n");
358       
359
360       // Now we release the receiver
361       receiver->recv_rc = 0;
362       receiver->recv_pending = 0;
363   
364       wake_up_interruptible(&(receiver->recv_wait_queue));
365
366       // And we are also done
367
368       return 0;
369
370     } else {
371       DEEP_DEBUG_PRINT("mpi: receiver's pending receive does not match - pending ourselves\n");
372       goto pending;
373     }
374   }
375       
376
377
378  pending:
379   
380   // we store our state
381   sender->send_vaddr=buf;
382   sender->send_size=n;
383   sender->send_dest=dest;
384   sender->send_tag=tag;
385   sender->send_rc=-1;
386
387   // And now we wait for the receive to do the job
388   sender->send_pending=1;
389   while (wait_event_interruptible(sender->send_wait_queue,
390                                   !(sender->send_pending)) !=0) {
391     // wait wait wait
392   }
393   
394   // released
395
396   return sender->send_rc;
397 }
398
399 static int mpi_recv_hcall(struct guest_info *core,
400                           struct guest_accessors *acc,
401                           void *buf, 
402                           int n, 
403                           int dtype, 
404                           int src, 
405                           int tag, 
406                           int comm, 
407                           void *stat) 
408 {
409   uint64_t cr3;
410   int i;
411   struct rendezvous_table_row *sender, *receiver;
412
413   SHALLOW_DEBUG_PRINT("mpi_recv_hcall(%p,%p,%p,%p,%p,%p,%p)\n",(void*)buf,(void*)n,(void*)dtype,(void*)src,(void*)tag,(void*)comm,(void*)stat);
414
415   cr3=acc->get_cr3(core);
416
417   // First find me
418   for (i=0;i<RENDEZVOUS_TABLE_MAX;i++) { 
419     if (rtab[i].state==RANKED && 
420         rtab[i].core==core &&
421         rtab[i].cr3==cr3) {
422       break;
423     }
424   }
425
426   if (i==RENDEZVOUS_TABLE_MAX) { 
427     ERROR("mpi: existential panic in receive\n");
428     return -1;
429   }
430
431   receiver=&(rtab[i]);
432
433   // Next try to find a matching send
434
435   for (i=0;i<RENDEZVOUS_TABLE_MAX;i++) { 
436     if (&(rtab[i])!=receiver &&
437         rtab[i].state==RANKED && 
438         strncmp(rtab[i].exec,receiver->exec,EXEC_NAME_MAX)==0) {
439       break;
440     }
441   }
442
443   if (i==RENDEZVOUS_TABLE_MAX) { 
444     DEEP_DEBUG_PRINT("mpi: sender does not exist yet - pending ourselves\n");
445     goto pending;
446   } else {
447     sender=&(rtab[i]);
448     if (!(sender->send_pending)) { 
449       DEEP_DEBUG_PRINT("mpi: sender has no pending receive - pending ourselves\n");
450       goto pending;
451     } 
452     // totally ignores communicator!!!  FIX FIX FIX
453     // simplistic fully qualified matching FIX FIX FIX
454     if (sender->send_tag==tag &&
455         sender->send_dest==receiver->rank) { 
456
457       uint64_t size = n < sender->send_size ? n : sender->send_size;
458       
459       SHALLOW_DEBUG_PRINT("mpi: mpi_recv: copying %llu bytes\n", size);
460
461       if (fast_inter_vm_copy(core,
462                              acc,
463                              buf,
464                              sender->core,
465                              sender->acc,
466                              sender->send_vaddr,
467                              size) != size) { 
468         ERROR("mpi: fast_inter_vm_copy failed in mpi_recv: destvm=%p, destacc=%p, dest_va=%p, srcvm=%p, srcacc=%p, src_va=%p, size=%llu\n",core,acc,buf,sender->core,sender->acc,sender->send_vaddr,size);
469         return -1;
470       }
471       
472       SHALLOW_DEBUG_PRINT("mpi: mpi_recv: finished copying\n");
473
474       // Now we release the sender
475       sender->send_rc = 0;
476       sender->send_pending = 0;
477
478       wake_up_interruptible(&(sender->send_wait_queue));
479
480       // And we are also done
481
482       return 0;
483
484     } else {
485       DEEP_DEBUG_PRINT("mpi: sender's pending send does not match - pending ourselves\n");
486       goto pending;
487     }
488   }
489       
490
491
492  pending:
493   
494   // we store our state
495   receiver->recv_vaddr=buf;
496   receiver->recv_size=n;
497   receiver->recv_src=src;
498   receiver->recv_tag=tag;
499   receiver->recv_rc=-1;
500
501   // And now we wait for the send to do the job
502   receiver->recv_pending=1;
503   while (wait_event_interruptible(receiver->recv_wait_queue,
504                                   !(receiver->recv_pending)) !=0) {
505     // wait wait wait
506   }
507
508   // released
509
510   return receiver->recv_rc;
511 }
512
513
514 static void get_args_64(palacios_core_t core,
515                         struct guest_accessors *acc,
516                         uint64_t *a1,
517                         uint64_t *a2,
518                         uint64_t *a3,
519                         uint64_t *a4,
520                         uint64_t *a5,
521                         uint64_t *a6,
522                         uint64_t *a7,
523                         uint64_t *a8)
524 {
525   *a1 = acc->get_rcx(core);
526   *a2 = acc->get_rdx(core);
527   *a3 = acc->get_rsi(core);
528   *a4 = acc->get_rdi(core);
529   *a5 = acc->get_r8(core);
530   *a6 = acc->get_r9(core);
531   *a7 = acc->get_r10(core);
532   *a8 = acc->get_r11(core);
533 }
534
535 static void get_args_32(palacios_core_t core,
536                         struct guest_accessors *acc,
537                         uint64_t *a1,
538                         uint64_t *a2,
539                         uint64_t *a3,
540                         uint64_t *a4,
541                         uint64_t *a5,
542                         uint64_t *a6,
543                         uint64_t *a7,
544                         uint64_t *a8)
545 {
546   uint64_t rsp;
547   uint32_t temp;
548
549
550   rsp = acc->get_rsp(core);
551
552   acc->read_gva(core,rsp,4,&temp); *a1=temp;
553   acc->read_gva(core,rsp+4,4,&temp); *a2=temp;
554   acc->read_gva(core,rsp+8,4,&temp); *a3=temp;
555   acc->read_gva(core,rsp+12,4,&temp); *a4=temp;
556   acc->read_gva(core,rsp+16,4,&temp); *a5=temp;
557   acc->read_gva(core,rsp+20,4,&temp); *a6=temp;
558   acc->read_gva(core,rsp+24,4,&temp); *a7=temp;
559   acc->read_gva(core,rsp+28,4,&temp); *a8=temp;
560   
561 }
562
563 static void get_args(palacios_core_t core,
564                      struct guest_accessors *acc,
565                      uint64_t *a1,
566                      uint64_t *a2,
567                      uint64_t *a3,
568                      uint64_t *a4,
569                      uint64_t *a5,
570                      uint64_t *a6,
571                      uint64_t *a7,
572                      uint64_t *a8)
573 {
574   uint64_t rbx;
575   uint32_t ebx;
576
577   rbx=acc->get_rbx(core);
578   ebx=rbx&0xffffffff;
579   
580   switch (ebx) {
581   case 0x64646464:
582     DEEP_DEBUG_PRINT("64 bit hcall\n");
583     return get_args_64(core,acc,a1,a2,a3,a4,a5,a6,a7,a8);
584     break;
585   case 0x32323232:
586     DEEP_DEBUG_PRINT("32 bit hcall\n");
587     return get_args_32(core,acc,a1,a2,a3,a4,a5,a6,a7,a8);
588     break;
589   default:
590     ERROR("UNKNOWN hcall calling convention\n");
591     break;
592   }
593 }
594
595 static void put_return(palacios_core_t core, 
596                        struct guest_accessors *acc,
597                        uint64_t rc)
598 {
599   acc->set_rax(core,rc);
600 }
601                      
602
603 int mpi_hypercall(palacios_core_t *core,
604                   unsigned int hid,
605                   struct guest_accessors *acc,
606                   void *p)
607 {
608   uint64_t a1,a2,a3,a4,a5,a6,a7,a8;
609   uint64_t rc;
610
611   DEEP_DEBUG_PRINT("palacios: mpi_hypercall(%p,0x%x,%p,%p)\n",
612                   core,hid,acc,p);
613
614   get_args(core,acc,&a1,&a2,&a3,&a4,&a5,&a6,&a7,&a8);
615
616   DEEP_DEBUG_PRINT("palacios: arguments: %p, %p, %p, %p, %p, %p, %p, %p\n",
617                     a1,a2,a3,a4,a5,a6,a7,a8);
618
619   switch (hid) { 
620   case MPI_INIT:
621     rc = mpi_init_hcall(core,acc,(int*)a1,(char ***)a2);
622     break;
623   case MPI_DEINIT:
624     rc = mpi_deinit_hcall(core,acc);
625     break;
626   case MPI_RANK:
627     rc = mpi_comm_rank_hcall(core,acc,(void*)a1,(int*)a2);
628     break;
629   case MPI_SEND:
630     rc = mpi_send_hcall(core,acc,(void*)a1,(int)a2,(int)a3,(int)a4,(int)a5,(int)a6);
631     break;
632   case MPI_RECV:
633     rc = mpi_recv_hcall(core,acc,(void*)a1,(int)a2,(int)a3,(int)a4,(int)a5,(int)a6,(void*)a7);
634     break;
635   default:
636     ERROR("palacios: mpi: unknown hcall number\n");
637     rc = -1;
638   }
639
640   put_return(core,acc,rc);
641
642   return 0;
643
644
645
646
647
648 EXPORT_SYMBOL(mpi_hypercall);
649
650
651 int init_module(void) 
652 {
653
654   rtab = kmalloc(sizeof(struct rendezvous_table_row)*RENDEZVOUS_TABLE_MAX,GFP_KERNEL);
655   if (!rtab) { 
656     ERROR("mpi: could not allocate memory\n");
657     return -1;
658   } else {
659     memset(rtab,0,sizeof(struct rendezvous_table_row)*RENDEZVOUS_TABLE_MAX);
660     INFO("mpi: inited\n");
661     return 0;
662   }
663   
664 }
665
666
667 void cleanup_module(void) 
668 {
669   if (rtab) { 
670     kfree(rtab);
671     rtab=0;
672   }
673
674   INFO("mpi: deinited\n");
675  
676 }
677
678