Palacios Public Git Repository

To check out Palacios, execute:

  git clone http://v3vee.org/palacios/palacios.web/palacios.git
This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, simply execute:
  cd palacios
  git checkout --track -b devel origin/devel
The other branches are similar.


updated virtio block device to partially handle non-aligned IO requests
[palacios.git] / palacios / src / devices / lnx_virtio_blk.c
1 /* 
2  * This file is part of the Palacios Virtual Machine Monitor developed
3  * by the V3VEE Project with funding from the United States National 
4  * Science Foundation and the Department of Energy.  
5  *
6  * The V3VEE Project is a joint project between Northwestern University
7  * and the University of New Mexico.  You can find out more at 
8  * http://www.v3vee.org
9  *
10  * Copyright (c) 2008, Jack Lange <jarusl@cs.northwestern.edu>
11  * Copyright (c) 2008, The V3VEE Project <http://www.v3vee.org> 
12  * All rights reserved.
13  *
14  * Author: Jack Lange <jarusl@cs.northwestern.edu>
15  *
16  * This is free software.  You are permitted to use,
17  * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
18  */
19
20 #include <palacios/vmm.h>
21 #include <palacios/vmm_dev_mgr.h>
22 #include <devices/lnx_virtio_pci.h>
23 #include <palacios/vm_guest_mem.h>
24
25 #include <devices/pci.h>
26
27
28
29 #ifndef V3_CONFIG_DEBUG_VIRTIO_BLK
30 #undef PrintDebug
31 #define PrintDebug(fmt, args...)
32 #endif
33
34
35 #define SECTOR_SIZE 512
36
37 #define BLK_CAPACITY_PORT     20
38 #define BLK_MAX_SIZE_PORT     28
39 #define BLK_MAX_SEG_PORT      32
40 #define BLK_CYLINDERS_PORT    36
41 #define BLK_HEADS_PORT        38
42 #define BLK_SECTS_PORT        39
43
44 #define BLK_IN_REQ            0
45 #define BLK_OUT_REQ           1
46 #define BLK_SCSI_CMD          2
47
48 #define BLK_BARRIER_FLAG     0x80000000
49
50 #define BLK_STATUS_OK             0
51 #define BLK_STATUS_ERR            1
52 #define BLK_STATUS_NOT_SUPPORTED  2
53
54
/* Device-specific configuration space exposed to the guest after the
 * standard virtio config registers; field offsets correspond to the
 * BLK_*_PORT constants above.  Layout is guest-visible, hence packed. */
struct blk_config {
    uint64_t capacity;     /* disk size in 512-byte sectors (see connect_fn) */
    uint32_t max_size;     /* max segment size; meaningful if VIRTIO_SIZE_MAX is advertised */
    uint32_t max_seg;      /* max # of segments; meaningful if VIRTIO_SEG_MAX is advertised */
    uint16_t cylinders;    /* legacy geometry; meaningful if VIRTIO_LEGACY_GEOM is advertised */
    uint8_t heads;
    uint8_t sectors;
} __attribute__((packed));
63
64
65
/* Header of a guest block request, read from the first descriptor of a
 * request chain (see handle_kick).  Guest-visible layout, hence packed. */
struct blk_op_hdr {
    uint32_t type;      /* BLK_IN_REQ, BLK_OUT_REQ, or BLK_SCSI_CMD */
    uint32_t prior;     /* request priority (not used by this device) */
    uint64_t sector;    /* starting sector; advanced as data buffers are processed */
} __attribute__((packed));
71
72 #define QUEUE_SIZE 128
73
74 /* Host Feature flags */
75 #define VIRTIO_BARRIER       0x01       /* Does host support barriers? */
76 #define VIRTIO_SIZE_MAX      0x02       /* Indicates maximum segment size */
77 #define VIRTIO_SEG_MAX       0x04       /* Indicates maximum # of segments */
78 #define VIRTIO_LEGACY_GEOM   0x10       /* Indicates support of legacy geometry */
79
80
/* Frontend state shared by all virtio-blk instances in a VM: the PCI bus
 * devices register on, and the list of instances created via connect_fn. */
struct virtio_dev_state {
    struct vm_device * pci_bus;
    struct list_head dev_list;    /* struct virtio_blk_state entries, linked via dev_link */
};
85
/* Per-device state for one virtio block PCI function. */
struct virtio_blk_state {

    struct pci_device * pci_dev;          /* PCI function registered in register_dev */
    struct blk_config block_cfg;          /* device-specific config space (capacity etc.) */
    struct virtio_config virtio_cfg;      /* standard virtio config registers */

    
    struct virtio_queue queue;            /* single request queue (virtio-blk uses only one) */

    struct v3_dev_blk_ops * ops;          /* backend read/write/get_capacity callbacks */

    void * backend_data;                  /* opaque cookie passed back to ops */

    int io_range_size;                    /* size of the IO BAR, a power of 2 (see register_dev) */

    struct virtio_dev_state * virtio_dev; /* owning frontend state */

    struct list_head dev_link;            /* membership in virtio_dev->dev_list */
};
105
106
107
108
109 static int blk_reset(struct virtio_blk_state * virtio) {
110
111     virtio->queue.ring_desc_addr = 0;
112     virtio->queue.ring_avail_addr = 0;
113     virtio->queue.ring_used_addr = 0;
114     virtio->queue.pfn = 0;
115     virtio->queue.cur_avail_idx = 0;
116
117     virtio->virtio_cfg.status = 0;
118     virtio->virtio_cfg.pci_isr = 0;
119     return 0;
120 }
121
122
123
124
125 static int handle_read_op(struct virtio_blk_state * blk_state, uint8_t * buf, uint64_t * sector, uint64_t len) {
126     int ret = -1;
127
128     PrintDebug("Reading Disk\n");
129     ret = blk_state->ops->read(buf, (*sector) * SECTOR_SIZE, len, (void *)(blk_state->backend_data));
130     *sector += (len / SECTOR_SIZE);
131
132     return ret;
133 }
134
135
136 static int handle_write_op(struct virtio_blk_state * blk_state, uint8_t * buf, uint64_t * sector, uint64_t len) {
137     int ret = -1;
138
139     PrintDebug("Writing Disk\n");
140     ret = blk_state->ops->write(buf, (*sector) * SECTOR_SIZE, len, (void *)(blk_state->backend_data));
141     *sector += (len / SECTOR_SIZE);
142
143     return ret;
144 }
145
146
147
148 // multiple block operations need to increment the sector 
149
150 static int handle_block_op(struct guest_info * core, struct virtio_blk_state * blk_state, struct blk_op_hdr * hdr, 
151                            struct vring_desc * buf_desc, uint8_t * status) {
152     uint8_t * buf = NULL;
153
154     PrintDebug("Handling Block op\n");
155     if (v3_gpa_to_hva(core, buf_desc->addr_gpa, (addr_t *)&(buf)) == -1) {
156         PrintError("Could not translate buffer address\n");
157         return -1;
158     }
159
160     PrintDebug("Sector=%p Length=%d\n", (void *)(addr_t)(hdr->sector), buf_desc->length);
161
162     if (hdr->type == BLK_IN_REQ) {
163         if (handle_read_op(blk_state, buf, &(hdr->sector), buf_desc->length) == -1) {
164             *status = BLK_STATUS_ERR;
165             return -1;
166         } else {
167             *status = BLK_STATUS_OK;
168         }
169     } else if (hdr->type == BLK_OUT_REQ) {
170         if (handle_write_op(blk_state, buf, &(hdr->sector), buf_desc->length) == -1) {
171             *status = BLK_STATUS_ERR;
172             return -1;
173         } else {
174             *status = BLK_STATUS_OK;
175         }
176     } else if (hdr->type == BLK_SCSI_CMD) {
177         PrintError("VIRTIO: SCSI Command Not supported!!!\n");
178         *status = BLK_STATUS_NOT_SUPPORTED;
179         return -1;
180     }
181
182     PrintDebug("Returning Status: %d\n", *status);
183
184     return 0;
185 }
186
187 static int get_desc_count(struct virtio_queue * q, int index) {
188     struct vring_desc * tmp_desc = &(q->desc[index]);
189     int cnt = 1;
190     
191     while (tmp_desc->flags & VIRTIO_NEXT_FLAG) {
192         tmp_desc = &(q->desc[tmp_desc->next]);
193         cnt++;
194     }
195
196     return cnt;
197 }
198
199
200
/* Drain the avail ring: process every request the guest has queued since the
 * last kick.  Each request is a descriptor chain of at least 3 entries:
 *   [0] request header (struct blk_op_hdr)
 *   [1..n-2] one or more data buffers
 *   [n-1] one-byte status writable by the device
 * After completing a request its id/length are published on the used ring,
 * and an IRQ is raised unless the guest suppressed it. */
static int handle_kick(struct guest_info * core, struct virtio_blk_state * blk_state) {  
    struct virtio_queue * q = &(blk_state->queue);

    PrintDebug("VIRTIO KICK: cur_index=%d (mod=%d), avail_index=%d\n", 
               q->cur_avail_idx, q->cur_avail_idx % QUEUE_SIZE, q->avail->index);

    /* Consume entries until we catch up with the guest's avail index. */
    while (q->cur_avail_idx != q->avail->index) {
        struct vring_desc * hdr_desc = NULL;
        struct vring_desc * buf_desc = NULL;
        struct vring_desc * status_desc = NULL;
        struct blk_op_hdr hdr;
        addr_t hdr_addr = 0;
        uint16_t desc_idx = q->avail->ring[q->cur_avail_idx % QUEUE_SIZE];
        int desc_cnt = get_desc_count(q, desc_idx);
        int i = 0;
        uint8_t * status_ptr = NULL;
        uint8_t status = BLK_STATUS_OK;
        uint32_t req_len = 0;   /* total bytes of all descriptors touched, reported on the used ring */

        PrintDebug("Descriptor Count=%d, index=%d\n", desc_cnt, q->cur_avail_idx % QUEUE_SIZE);

        /* header + at least one data buffer + status byte */
        if (desc_cnt < 3) {
            PrintError("Block operations must include at least 3 descriptors\n");
            return -1;
        }

        hdr_desc = &(q->desc[desc_idx]);


        PrintDebug("Header Descriptor (ptr=%p) gpa=%p, len=%d, flags=%x, next=%d\n", hdr_desc, 
                   (void *)(hdr_desc->addr_gpa), hdr_desc->length, hdr_desc->flags, hdr_desc->next);    

        if (v3_gpa_to_hva(core, hdr_desc->addr_gpa, &(hdr_addr)) == -1) {
            PrintError("Could not translate block header address\n");
            return -1;
        }

        // We copy the block op header out because we are going to modify its contents
        // (handle_block_op advances hdr.sector as each buffer is processed)
        memcpy(&hdr, (void *)hdr_addr, sizeof(struct blk_op_hdr));
        
        PrintDebug("Blk Op Hdr (ptr=%p) type=%d, sector=%p\n", (void *)hdr_addr, hdr.type, (void *)hdr.sector);

        desc_idx = hdr_desc->next;

        /* Walk the data buffers: all descriptors except the header and the
         * trailing status byte. */
        for (i = 0; i < desc_cnt - 2; i++) {
            uint8_t tmp_status = BLK_STATUS_OK;

            buf_desc = &(q->desc[desc_idx]);

            PrintDebug("Buffer Descriptor (ptr=%p) gpa=%p, len=%d, flags=%x, next=%d\n", buf_desc, 
                       (void *)(buf_desc->addr_gpa), buf_desc->length, buf_desc->flags, buf_desc->next);

            if (handle_block_op(core, blk_state, &hdr, buf_desc, &tmp_status) == -1) {
                PrintError("Error handling block operation\n");
                return -1;
            }

            /* Remember the first/any non-OK status for the whole request. */
            if (tmp_status != BLK_STATUS_OK) {
                status = tmp_status;
            }

            req_len += buf_desc->length;
            desc_idx = buf_desc->next;
        }

        /* Last descriptor in the chain holds the guest-visible status byte. */
        status_desc = &(q->desc[desc_idx]);

        PrintDebug("Status Descriptor (ptr=%p) gpa=%p, len=%d, flags=%x, next=%d\n", status_desc, 
                   (void *)(status_desc->addr_gpa), status_desc->length, status_desc->flags, status_desc->next);

        if (v3_gpa_to_hva(core, status_desc->addr_gpa, (addr_t *)&(status_ptr)) == -1) {
            PrintError("Could not translate status address\n");
            return -1;
        }

        req_len += status_desc->length;
        *status_ptr = status;

        /* Publish the completed request on the used ring. */
        q->used->ring[q->used->index % QUEUE_SIZE].id = q->avail->ring[q->cur_avail_idx % QUEUE_SIZE];
        q->used->ring[q->used->index % QUEUE_SIZE].length = req_len; // What do we set this to????

        q->used->index++;
        q->cur_avail_idx++;
    }

    /* Interrupt the guest unless it asked for notification suppression. */
    if (!(q->avail->flags & VIRTIO_NO_IRQ_FLAG)) {
        PrintDebug("Raising IRQ %d\n",  blk_state->pci_dev->config_header.intr_line);
        v3_pci_raise_irq(blk_state->virtio_dev->pci_bus, 0, blk_state->pci_dev);
        blk_state->virtio_cfg.pci_isr = 1;
    }

    return 0;
}
294
/* IO-port write handler for the device's PCI IO BAR.  'port' is an absolute
 * port number; the register index is its offset within the BAR (the BAR size
 * is a power of 2, so modulo works -- see register_dev).  Returns the number
 * of bytes consumed, or -1 on error. */
static int virtio_io_write(struct guest_info * core, uint16_t port, void * src, uint_t length, void * private_data) {
    struct virtio_blk_state * blk_state = (struct virtio_blk_state *)private_data;
    int port_idx = port % blk_state->io_range_size;


    PrintDebug("VIRTIO BLOCK Write for port %d (index=%d) len=%d, value=%x\n", 
               port, port_idx,  length, *(uint32_t *)src);



    switch (port_idx) {
        case GUEST_FEATURES_PORT:
            /* Guest acknowledges the feature bits it will use (32-bit only). */
            if (length != 4) {
                PrintError("Illegal write length for guest features\n");
                return -1;
            }
            
            blk_state->virtio_cfg.guest_features = *(uint32_t *)src;
            PrintDebug("Setting Guest Features to %x\n", blk_state->virtio_cfg.guest_features);

            break;
        case VRING_PG_NUM_PORT:
            /* Guest supplies the page frame number of the vring; derive the
             * descriptor/avail/used ring addresses from the legacy layout and
             * translate each to a host virtual address. */
            if (length == 4) {
                addr_t pfn = *(uint32_t *)src;
                addr_t page_addr = (pfn << VIRTIO_PAGE_SHIFT);


                blk_state->queue.pfn = pfn;
                
                blk_state->queue.ring_desc_addr = page_addr ;
                blk_state->queue.ring_avail_addr = page_addr + (QUEUE_SIZE * sizeof(struct vring_desc));
                blk_state->queue.ring_used_addr = ( blk_state->queue.ring_avail_addr + \
                                                 sizeof(struct vring_avail)    + \
                                                 (QUEUE_SIZE * sizeof(uint16_t)));
                
                // round up to next page boundary.
                blk_state->queue.ring_used_addr = (blk_state->queue.ring_used_addr + 0xfff) & ~0xfff;

                if (v3_gpa_to_hva(core, blk_state->queue.ring_desc_addr, (addr_t *)&(blk_state->queue.desc)) == -1) {
                    PrintError("Could not translate ring descriptor address\n");
                    return -1;
                }


                if (v3_gpa_to_hva(core, blk_state->queue.ring_avail_addr, (addr_t *)&(blk_state->queue.avail)) == -1) {
                    PrintError("Could not translate ring available address\n");
                    return -1;
                }


                if (v3_gpa_to_hva(core, blk_state->queue.ring_used_addr, (addr_t *)&(blk_state->queue.used)) == -1) {
                    PrintError("Could not translate ring used address\n");
                    return -1;
                }

                PrintDebug("RingDesc_addr=%p, Avail_addr=%p, Used_addr=%p\n",
                           (void *)(blk_state->queue.ring_desc_addr),
                           (void *)(blk_state->queue.ring_avail_addr),
                           (void *)(blk_state->queue.ring_used_addr));

                PrintDebug("RingDesc=%p, Avail=%p, Used=%p\n", 
                           blk_state->queue.desc, blk_state->queue.avail, blk_state->queue.used);

            } else {
                PrintError("Illegal write length for page frame number\n");
                return -1;
            }
            break;
        case VRING_Q_SEL_PORT:
            /* Virtio-blk exposes exactly one queue; any other selector is an error. */
            blk_state->virtio_cfg.vring_queue_selector = *(uint16_t *)src;

            if (blk_state->virtio_cfg.vring_queue_selector != 0) {
                PrintError("Virtio Block device only uses 1 queue, selected %d\n", 
                           blk_state->virtio_cfg.vring_queue_selector);
                return -1;
            }

            break;
        case VRING_Q_NOTIFY_PORT:
            /* Queue notification ("kick"): process pending requests now. */
            PrintDebug("Handling Kick\n");
            if (handle_kick(core, blk_state) == -1) {
                PrintError("Could not handle Block Notification\n");
                return -1;
            }
            break;
        case VIRTIO_STATUS_PORT:
            /* Writing 0 to the status register requests a device reset. */
            blk_state->virtio_cfg.status = *(uint8_t *)src;

            if (blk_state->virtio_cfg.status == 0) {
                PrintDebug("Resetting device\n");
                blk_reset(blk_state);
            }

            break;

        case VIRTIO_ISR_PORT:
            blk_state->virtio_cfg.pci_isr = *(uint8_t *)src;
            break;
        default:
            return -1;
            break;
    }

    return length;
}
400
401
/* IO-port read handler for the device's PCI IO BAR.  Multi-byte registers
 * accept partial reads at any offset within the register, as long as the
 * read does not run past the register's end.  Returns the number of bytes
 * produced, or -1 on error. */
static int virtio_io_read(struct guest_info * core, uint16_t port, void * dst, uint_t length, void * private_data) {
    struct virtio_blk_state * blk_state = (struct virtio_blk_state *)private_data;
    int port_idx = port % blk_state->io_range_size;


    PrintDebug("VIRTIO BLOCK Read  for port %d (index =%d), length=%d\n", 
               port, port_idx, length);


    switch (port_idx) {
        case HOST_FEATURES_PORT:
        case HOST_FEATURES_PORT + 1:
        case HOST_FEATURES_PORT + 2:
        case HOST_FEATURES_PORT + 3:
            /* 32-bit host feature bitmap; reject reads crossing its end. */
            if (port_idx + length > HOST_FEATURES_PORT + 4) {
                PrintError("Illegal read length for host features (len=%d)\n", length);
                return -1;
            }

            memcpy(dst, &(blk_state->virtio_cfg.host_features), length);
            break;
        case VRING_PG_NUM_PORT:
        case VRING_PG_NUM_PORT + 1:
        case VRING_PG_NUM_PORT + 2:
        case VRING_PG_NUM_PORT + 3:
            /* 32-bit vring page frame number previously written by the guest. */
            if (port_idx + length > VRING_PG_NUM_PORT + 4) {
                PrintError("Illegal read length for vring pg num (len=%d)\n", length);
                return -1;
            }

            memcpy(dst, &(blk_state->queue.pfn), length);
            break;
        case VRING_SIZE_PORT:
        case VRING_SIZE_PORT + 1:
            /* 16-bit queue size (always QUEUE_SIZE). */
            if (length > 2) {
                PrintError("Illegal read length for vring size (len=%d)\n", length);
                return -1;
            }
            
            memcpy(dst, &(blk_state->queue.queue_size), length);

            break;

        case VIRTIO_STATUS_PORT:
            if (length != 1) {
                PrintError("Illegal read length for status (len=%d)\n", length);
                return -1;
            }

            *(uint8_t *)dst = blk_state->virtio_cfg.status;
            break;

        case VIRTIO_ISR_PORT:
            /* Reading the ISR acknowledges the interrupt: clear it and
             * deassert the IRQ line. */
            *(uint8_t *)dst = blk_state->virtio_cfg.pci_isr;
            blk_state->virtio_cfg.pci_isr = 0;
            v3_pci_lower_irq(blk_state->virtio_dev->pci_bus, 0, blk_state->pci_dev);
            break;

        default:
            /* Offsets past the standard virtio registers fall into the
             * device-specific blk_config region (capacity, geometry, ...). */
            if ( (port_idx >= sizeof(struct virtio_config)) && 
                 (port_idx < (sizeof(struct virtio_config) + sizeof(struct blk_config))) ) {
                int cfg_offset = port_idx - sizeof(struct virtio_config);
                uint8_t * cfg_ptr = (uint8_t *)&(blk_state->block_cfg);

                memcpy(dst, cfg_ptr + cfg_offset, length);
                
            } else {
                PrintError("Read of Unhandled Virtio Read\n");
                return -1;
            }
          
            break;
    }

    return length;
}
478
479
/* Device-manager free hook: release every per-disk state hanging off the
 * frontend, then the frontend itself.  Uses the _safe iterator because
 * entries are deleted while walking the list. */
static int virtio_free(struct virtio_dev_state * virtio) {
    struct virtio_blk_state * blk_state = NULL;
    struct virtio_blk_state * tmp = NULL;

    list_for_each_entry_safe(blk_state, tmp, &(virtio->dev_list), dev_link) {

        // unregister from PCI
        // NOTE(review): PCI unregistration is not actually performed here --
        // the registered pci_dev is left behind; confirm whether the PCI bus
        // owns/frees it on VM teardown.

        list_del(&(blk_state->dev_link));
        V3_Free(blk_state);
    }
    

    V3_Free(virtio);

    return 0;
}
497
498
499
/* Device-manager operations table; only teardown is provided. */
static struct v3_device_ops dev_ops = {
    .free = (int (*)(void *))virtio_free,

};
504
505
506
507
508
/* Register one virtio-blk instance as a PCI function: size and install the
 * IO BAR, fill in the PCI config header, link the instance into the
 * frontend's device list, set the advertised features/queue size, and reset.
 * Returns 0 on success, -1 if PCI registration fails (in which case the
 * instance has NOT been added to the device list). */
static int register_dev(struct virtio_dev_state * virtio, struct virtio_blk_state * blk_state) {
    // initialize PCI
    struct pci_device * pci_dev = NULL;
    struct v3_pci_bar bars[6];
    int num_ports = sizeof(struct virtio_config) + sizeof(struct blk_config);
    int tmp_ports = num_ports;
    int i;



    // This gets the number of ports, rounded up to a power of 2
    blk_state->io_range_size = 1; // must be a power of 2
    
    // Doubling once per bit of num_ports overshoots by one doubling when
    // num_ports is already a power of 2; corrected below.
    while (tmp_ports > 0) {
        tmp_ports >>= 1;
        blk_state->io_range_size <<= 1;
    }
        
    // this is to account for any low order bits being set in num_ports
    // if there are none, then num_ports was already a power of 2 so we shift right to reset it
    if ((num_ports & ((blk_state->io_range_size >> 1) - 1)) == 0) {
        blk_state->io_range_size >>= 1;
    }
    
    
    // All BARs start disabled; only BAR 0 is used (IO space).
    for (i = 0; i < 6; i++) {
        bars[i].type = PCI_BAR_NONE;
    }
    
    PrintDebug("Virtio-BLK io_range_size = %d\n", blk_state->io_range_size);
    
    bars[0].type = PCI_BAR_IO;
    bars[0].default_base_port = -1;   // let the PCI layer assign the base port
    bars[0].num_ports = blk_state->io_range_size;
    
    bars[0].io_read = virtio_io_read;
    bars[0].io_write = virtio_io_write;
    bars[0].private_data = blk_state;
    
    pci_dev = v3_pci_register_device(virtio->pci_bus, PCI_STD_DEVICE, 
                                     0, PCI_AUTO_DEV_NUM, 0,
                                     "LNX_VIRTIO_BLK", bars,
                                     NULL, NULL, NULL, blk_state);
    
    if (!pci_dev) {
        PrintError("Could not register PCI Device\n");
        return -1;
    }
    
    // Standard virtio-over-PCI identification: virtio vendor ID with the
    // block-device device/subsystem IDs.
    pci_dev->config_header.vendor_id = VIRTIO_VENDOR_ID;
    pci_dev->config_header.subsystem_vendor_id = VIRTIO_SUBVENDOR_ID;
    
    
    pci_dev->config_header.device_id = VIRTIO_BLOCK_DEV_ID;
    pci_dev->config_header.class = PCI_CLASS_STORAGE;
    pci_dev->config_header.subclass = PCI_STORAGE_SUBCLASS_OTHER;
    
    pci_dev->config_header.subsystem_id = VIRTIO_BLOCK_SUBDEVICE_ID;
    
    
    pci_dev->config_header.intr_pin = 1;   // INTA#
    
    pci_dev->config_header.max_latency = 1; // ?? (qemu does it...)
    
    
    blk_state->pci_dev = pci_dev;


    /* Add backend to list of devices */
    list_add(&(blk_state->dev_link), &(virtio->dev_list));
    
    /* Block configuration */
    blk_state->virtio_cfg.host_features = VIRTIO_SEG_MAX;
    // Reserve 2 descriptors per request for the header and status byte.
    blk_state->block_cfg.max_seg = QUEUE_SIZE - 2;

    // Virtio Block only uses one queue
    blk_state->queue.queue_size = QUEUE_SIZE;

    blk_state->virtio_dev = virtio;

    blk_reset(blk_state);


    return 0;
}
594
595
596 static int connect_fn(struct v3_vm_info * vm, 
597                       void * frontend_data, 
598                       struct v3_dev_blk_ops * ops, 
599                       v3_cfg_tree_t * cfg, 
600                       void * private_data) {
601
602     struct virtio_dev_state * virtio = (struct virtio_dev_state *)frontend_data;
603
604     struct virtio_blk_state * blk_state  = (struct virtio_blk_state *)V3_Malloc(sizeof(struct virtio_blk_state));
605     memset(blk_state, 0, sizeof(struct virtio_blk_state));
606
607     register_dev(virtio, blk_state);
608
609     blk_state->ops = ops;
610     blk_state->backend_data = private_data;
611
612     blk_state->block_cfg.capacity = ops->get_capacity(private_data) / SECTOR_SIZE;
613
614     PrintDebug("Virtio Capacity = %d -- 0x%p\n", (int)(blk_state->block_cfg.capacity), 
615                (void *)(addr_t)(blk_state->block_cfg.capacity));
616
617     return 0;
618 }
619
620
621 static int virtio_init(struct v3_vm_info * vm, v3_cfg_tree_t * cfg) {
622     struct vm_device * pci_bus = v3_find_dev(vm, v3_cfg_val(cfg, "bus"));
623     struct virtio_dev_state * virtio_state = NULL;
624     char * dev_id = v3_cfg_val(cfg, "ID");
625
626     PrintDebug("Initializing VIRTIO Block device\n");
627
628     if (pci_bus == NULL) {
629         PrintError("VirtIO devices require a PCI Bus");
630         return -1;
631     }
632
633
634     virtio_state  = (struct virtio_dev_state *)V3_Malloc(sizeof(struct virtio_dev_state));
635     memset(virtio_state, 0, sizeof(struct virtio_dev_state));
636
637     INIT_LIST_HEAD(&(virtio_state->dev_list));
638     virtio_state->pci_bus = pci_bus;
639
640
641     struct vm_device * dev = v3_add_device(vm, dev_id, &dev_ops, virtio_state);
642
643     if (dev == NULL) {
644         PrintError("Could not attach device %s\n", dev_id);
645         V3_Free(virtio_state);
646         return -1;
647     }
648
649     if (v3_dev_add_blk_frontend(vm, dev_id, connect_fn, (void *)virtio_state) == -1) {
650         PrintError("Could not register %s as block frontend\n", dev_id);
651         v3_remove_device(dev);
652         return -1;
653     }
654
655     return 0;
656 }
657
658
659 device_register("LNX_VIRTIO_BLK", virtio_init)