1 /* Linux host side PCI passthrough support
2 * Jack Lange <jacklange@cs.pitt.edu>, 2012
6 #include <linux/iommu.h>
7 #include <linux/interrupt.h>
8 #include <linux/version.h>
11 #define PCI_HDR_SIZE 256
/*
 * setup_hw_pci_dev - locate and initialize a physical PCI device for
 * passthrough to a guest.
 *
 * Looks the device up by bus/devfn, enables it, reserves its regions,
 * resets it and saves its config state, caches the six BARs and the
 * expansion ROM info into the v3_dev descriptor, snapshots the first
 * 256 bytes of config space, and selects the passthrough interface.
 *
 * NOTE(review): this excerpt elides several lines (error returns,
 * closing braces, an else branch); comments describe only visible code.
 */
static int setup_hw_pci_dev(struct host_pci_device * host_dev) {
    struct pci_dev * dev = NULL;
    struct v3_host_pci_dev * v3_dev = &(host_dev->v3_dev);

    /* Legacy lookup API -- assumes PCI domain 0 (replaced by
     * pci_get_domain_bus_and_slot() in newer kernels -- TODO confirm
     * target kernel version). */
    dev = pci_get_bus_and_slot(host_dev->hw_dev.bus,
                               host_dev->hw_dev.devfn);

    /* Lookup-failure message (its guard condition is elided here). */
    printk("Could not find HW pci device (bus=%d, devfn=%d)\n",
           host_dev->hw_dev.bus, host_dev->hw_dev.devfn);

    // record pointer in dev state
    host_dev->hw_dev.dev = dev;

    /* INTx starts masked; re-enabled when the guest acks (hw_ack_irq). */
    host_dev->hw_dev.intx_disabled = 1;
    palacios_spinlock_init(&(host_dev->hw_dev.intx_lock));

    if (pci_enable_device(dev)) {
        printk("Could not enable Device\n");

    /* Claim all of the device's BAR regions under the "v3vee" name. */
    ret = pci_request_regions(dev, "v3vee");
        printk("Could not reservce PCI regions\n");

    /* Function-level reset, then snapshot config state so it can be
     * restored later. */
    pci_reset_function(host_dev->hw_dev.dev);
    pci_save_state(host_dev->hw_dev.dev);

    /* Debug dump of every PCI resource on the device. */
    for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
        printk("Resource %d\n", i);
        printk("\tflags = 0x%lx\n", pci_resource_flags(dev, i));
        printk("\t name=%s, start=%lx, size=%d\n",
               host_dev->hw_dev.dev->resource[i].name, (uintptr_t)pci_resource_start(dev, i),
               (u32)pci_resource_len(dev, i));

    printk("Rom BAR=%d\n", dev->rom_base_reg);

    /* Cache first 6 BAR regs */

    for (i = 0; i < 6; i++) {
        struct v3_host_pci_bar * bar = &(v3_dev->bars[i]);

        bar->size = pci_resource_len(dev, i);
        bar->addr = pci_resource_start(dev, i);
        flags = pci_resource_flags(dev, i);

        if (flags & IORESOURCE_IO) {
            bar->type = PT_BAR_IO;
        } else if (flags & IORESOURCE_MEM) {
            if (flags & IORESOURCE_MEM_64) {
                /* A 64-bit memory BAR occupies two slots: this index is
                 * the low half, index i+1 holds the high half. */
                struct v3_host_pci_bar * hi_bar = &(v3_dev->bars[i + 1]);

                bar->type = PT_BAR_MEM64_LO;

                hi_bar->type = PT_BAR_MEM64_HI;
                hi_bar->size = bar->size;
                hi_bar->addr = bar->addr;
                hi_bar->cacheable = ((flags & IORESOURCE_CACHEABLE) != 0);
                hi_bar->prefetchable = ((flags & IORESOURCE_PREFETCH) != 0);
            /* (intervening lines elided in this excerpt) */
            } else if (flags & IORESOURCE_DMA) {
                bar->type = PT_BAR_MEM24;
                /* else branch (plain 32-bit memory BAR) -- its "} else {"
                 * line is elided in this excerpt. */
                bar->type = PT_BAR_MEM32;

            /* Common memory-BAR attributes. */
            bar->cacheable = ((flags & IORESOURCE_CACHEABLE) != 0);
            bar->prefetchable = ((flags & IORESOURCE_PREFETCH) != 0);

            /* Fallback for resources that are neither IO nor MEM. */
            bar->type = PT_BAR_NONE;

    /* Cache expansion rom bar */

        struct resource * rom_res = &(dev->resource[PCI_ROM_RESOURCE]);
        int rom_size = pci_resource_len(dev, PCI_ROM_RESOURCE);

        //unsigned long flags;

        v3_dev->exp_rom.size = rom_size;
        v3_dev->exp_rom.addr = pci_resource_start(dev, PCI_ROM_RESOURCE);
        // flags = pci_resource_flags(dev, PCI_ROM_RESOURCE);

        v3_dev->exp_rom.type = PT_EXP_ROM;

        /* Record whether the ROM decode bit is currently enabled. */
        v3_dev->exp_rom.exp_rom_enabled = rom_res->flags & IORESOURCE_ROM_ENABLE;

    /* Cache entire configuration space */

    // Copy the configuration space to the local cached version
    for (m = 0; m < PCI_HDR_SIZE; m += 4) {
        pci_read_config_dword(dev, m, (u32 *)&(v3_dev->cfg_space[m]));

    /* HARDCODED for now but this will need to depend on IOMMU support detection */

    printk("Setting host PCI device (%s) as IOMMU\n", host_dev->name);
    v3_dev->iface = IOMMU;

    /* Alternate (non-IOMMU) path -- its guard is elided in this excerpt. */
    printk("Setting host PCI device (%s) as SYMBIOTIC\n", host_dev->name);
    v3_dev->iface = SYMBIOTIC;
/*
 * host_pci_intx_irq_handler - threaded IRQ handler for the passthrough
 * device's legacy INTx line.
 *
 * Masks the line (so the presumably level-triggered interrupt does not
 * re-fire) and forwards it to the guest as vector 0. The line is
 * re-enabled when the guest acknowledges it in hw_ack_irq().
 */
static irqreturn_t host_pci_intx_irq_handler(int irq, void * priv_data) {
    struct host_pci_device * host_dev = priv_data;

    // printk("Host PCI IRQ handler (%d)\n", irq);

    /* Mark the line disabled under intx_lock; hw_ack_irq() undoes this. */
    palacios_spinlock_lock(&(host_dev->hw_dev.intx_lock));
    disable_irq_nosync(irq);
    host_dev->hw_dev.intx_disabled = 1;
    palacios_spinlock_unlock(&(host_dev->hw_dev.intx_lock));

    /* Inject into the guest; INTx always maps to vector 0. */
    V3_host_pci_raise_irq(&(host_dev->v3_dev), 0);
/*
 * host_pci_msi_irq_handler - IRQ handler for the passthrough device's
 * MSI interrupt; forwards it to the guest as vector 0.
 */
static irqreturn_t host_pci_msi_irq_handler(int irq, void * priv_data) {
    struct host_pci_device * host_dev = priv_data;
    // printk("Host PCI MSI IRQ Handler (%d)\n", irq);

    V3_host_pci_raise_irq(&(host_dev->v3_dev), 0);
/*
 * host_pci_msix_irq_handler - IRQ handler for the passthrough device's
 * MSI-X interrupts.
 *
 * Scans the MSI-X entry table to find which guest vector index this
 * host IRQ was allocated for, and raises that vector in the guest.
 */
static irqreturn_t host_pci_msix_irq_handler(int irq, void * priv_data) {
    struct host_pci_device * host_dev = priv_data;

    // printk("Host PCI MSIX IRQ Handler (%d)\n", irq);

    for (i = 0; i < host_dev->hw_dev.num_msix_vecs; i++) {
        if (irq == host_dev->hw_dev.msix_entries[i].vector) {
            V3_host_pci_raise_irq(&(host_dev->v3_dev), i);
    /* Reached only if no table entry matched this IRQ. */
    printk("Error Could not find matching MSIX vector for IRQ %d\n", irq);
195 static int hw_pci_cmd(struct host_pci_device * host_dev, host_pci_cmd_t cmd, u64 arg) {
196 //struct v3_host_pci_dev * v3_dev = &(host_dev->v3_dev);
197 struct pci_dev * dev = host_dev->hw_dev.dev;
200 case HOST_PCI_CMD_DMA_DISABLE:
201 printk("Passthrough PCI device disabling BMDMA\n");
202 pci_clear_master(host_dev->hw_dev.dev);
204 case HOST_PCI_CMD_DMA_ENABLE:
205 printk("Passthrough PCI device Enabling BMDMA\n");
206 pci_set_master(host_dev->hw_dev.dev);
209 case HOST_PCI_CMD_INTX_DISABLE:
210 printk("Passthrough PCI device disabling INTx IRQ\n");
212 disable_irq(dev->irq);
213 free_irq(dev->irq, (void *)host_dev);
216 case HOST_PCI_CMD_INTX_ENABLE:
217 printk("Passthrough PCI device Enabling INTx IRQ\n");
219 if (request_threaded_irq(dev->irq, NULL, host_pci_intx_irq_handler,
220 IRQF_ONESHOT, "V3Vee_Host_PCI_INTx", (void *)host_dev)) {
221 printk("ERROR Could not assign IRQ to host PCI device (%s)\n", host_dev->name);
226 case HOST_PCI_CMD_MSI_DISABLE:
227 printk("Passthrough PCI device Disabling MSIs\n");
229 disable_irq(dev->irq);
230 free_irq(dev->irq, (void *)host_dev);
232 pci_disable_msi(dev);
235 case HOST_PCI_CMD_MSI_ENABLE:
236 printk("Passthrough PCI device Enabling MSI\n");
238 if (!dev->msi_enabled) {
241 if (request_irq(dev->irq, host_pci_msi_irq_handler,
242 0, "V3Vee_host_PCI_MSI", (void *)host_dev)) {
243 printk("Error Requesting IRQ %d for Passthrough MSI IRQ\n", dev->irq);
251 case HOST_PCI_CMD_MSIX_ENABLE: {
254 printk("Passthrough PCI device Enabling MSIX\n");
255 host_dev->hw_dev.num_msix_vecs = arg;;
256 host_dev->hw_dev.msix_entries = kcalloc(host_dev->hw_dev.num_msix_vecs,
257 sizeof(struct msix_entry), GFP_KERNEL);
259 for (i = 0; i < host_dev->hw_dev.num_msix_vecs; i++) {
260 host_dev->hw_dev.msix_entries[i].entry = i;
263 pci_enable_msix(dev, host_dev->hw_dev.msix_entries,
264 host_dev->hw_dev.num_msix_vecs);
266 for (i = 0; i < host_dev->hw_dev.num_msix_vecs; i++) {
267 if (request_irq(host_dev->hw_dev.msix_entries[i].vector,
268 host_pci_msix_irq_handler,
269 0, "V3VEE_host_PCI_MSIX", (void *)host_dev)) {
270 printk("Error requesting IRQ %d for Passthrough MSIX IRQ\n",
271 host_dev->hw_dev.msix_entries[i].vector);
278 case HOST_PCI_CMD_MSIX_DISABLE: {
281 printk("Passthrough PCI device Disabling MSIX\n");
283 for (i = 0; i < host_dev->hw_dev.num_msix_vecs; i++) {
284 disable_irq(host_dev->hw_dev.msix_entries[i].vector);
287 for (i = 0; i < host_dev->hw_dev.num_msix_vecs; i++) {
288 free_irq(host_dev->hw_dev.msix_entries[i].vector, (void *)host_dev);
291 host_dev->hw_dev.num_msix_vecs = 0;
292 palacios_free(host_dev->hw_dev.msix_entries);
294 pci_disable_msix(dev);
299 printk("Error: unhandled passthrough PCI command: %d\n", cmd);
/*
 * hw_ack_irq - called when the guest acknowledges an INTx interrupt.
 *
 * Re-enables the host IRQ line that host_pci_intx_irq_handler() masked,
 * and clears the intx_disabled flag. @vector is currently unused in the
 * visible code.
 */
static int hw_ack_irq(struct host_pci_device * host_dev, u32 vector) {
    struct pci_dev * dev = host_dev->hw_dev.dev;

    // printk("Acking IRQ vector %d\n", vector);

    /* irqsave variant: protects intx state against the IRQ handler and
     * callers running with interrupts in either state. */
    palacios_spinlock_lock_irqsave(&(host_dev->hw_dev.intx_lock), flags);
    // printk("Enabling IRQ %d\n", dev->irq);
    enable_irq(dev->irq);
    host_dev->hw_dev.intx_disabled = 0;
    palacios_spinlock_unlock_irqrestore(&(host_dev->hw_dev.intx_lock), flags);
326 static int reserve_hw_pci_dev(struct host_pci_device * host_dev, void * v3_ctx) {
329 struct v3_host_pci_dev * v3_dev = &(host_dev->v3_dev);
330 struct pci_dev * dev = host_dev->hw_dev.dev;
332 palacios_spinlock_lock_irqsave(&lock, flags);
333 if (host_dev->hw_dev.in_use == 0) {
334 host_dev->hw_dev.in_use = 1;
338 palacios_spinlock_unlock_irqrestore(&lock, flags);
341 if (v3_dev->iface == IOMMU) {
342 struct v3_guest_mem_region region;
346 host_dev->hw_dev.iommu_domain = iommu_domain_alloc();
348 while (V3_get_guest_mem_region(v3_ctx, ®ion, gpa)) {
350 printk("Memory region: start=%p, end=%p\n", (void *)region.start, (void *)region.end);
353 flags = IOMMU_READ | IOMMU_WRITE; // Need to see what IOMMU_CACHE means
355 /* This version could be wrong */
356 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
357 // Guest VAs start at zero and go to end of memory
358 iommu_map_range(host_dev->hw_dev.iommu_domain, 0, region.start, (region.end - region.start), flags);
360 /* Linux actually made the interface worse... Now you can only map memory in powers of 2 (meant to only be pages...) */
362 u64 size = region.end - region.start;
363 u32 page_size = 512 * 4096; // assume large 64bit pages (2MB)
364 u64 hpa = region.start;
367 if (size < page_size) {
368 page_size = 4096; // less than a 2MB granularity, so we switch to small pages (4KB)
371 printk("Mapping IOMMU region gpa=%p hpa=%p (size=%d)\n", (void *)gpa, (void *)hpa, page_size);
373 if (iommu_map(host_dev->hw_dev.iommu_domain, gpa, hpa,
374 get_order(page_size), flags)) {
375 printk("ERROR: Could not map sub region (GPA=%p) (HPA=%p) (order=%d)\n",
376 (void *)gpa, (void *)hpa, get_order(page_size));
389 if (iommu_attach_device(host_dev->hw_dev.iommu_domain, &(dev->dev))) {
390 printk("ERROR attaching host PCI device to IOMMU domain\n");
396 printk("Requesting Threaded IRQ handler for IRQ %d\n", dev->irq);
397 // setup regular IRQs until advanced IRQ mechanisms are enabled
398 if (request_threaded_irq(dev->irq, NULL, host_pci_intx_irq_handler,
399 IRQF_ONESHOT, "V3Vee_Host_PCI_INTx", (void *)host_dev)) {
400 printk("ERROR Could not assign IRQ to host PCI device (%s)\n", host_dev->name);
/*
 * write_hw_pci_config - forward a guest-initiated config-space write to
 * the physical device.
 *
 * @reg:    byte offset in config space
 * @data:   value to write (interpreted per @length)
 * @length: access width in bytes; only 1, 2, or 4 are valid
 *
 * NOTE(review): the "length == 1" guard and surrounding lines are
 * elided in this excerpt.
 */
static int write_hw_pci_config(struct host_pci_device * host_dev, u32 reg, void * data, u32 length) {
    struct pci_dev * dev = host_dev->hw_dev.dev;

        pci_write_config_byte(dev, reg, *(u8 *)data);
    } else if (length == 2) {
        pci_write_config_word(dev, reg, *(u16 *)data);
    } else if (length == 4) {
        pci_write_config_dword(dev, reg, *(u32 *)data);

    /* Unsupported access width. */
    printk("Invalid length of host PCI config update\n");
/*
 * read_hw_pci_config - forward a guest-initiated config-space read to
 * the physical device, storing the result through @data.
 *
 * @reg:    byte offset in config space
 * @data:   output buffer (must hold at least @length bytes)
 * @length: access width in bytes; only 1, 2, or 4 are valid
 *
 * NOTE(review): the "length == 1" guard and surrounding lines are
 * elided in this excerpt.
 */
static int read_hw_pci_config(struct host_pci_device * host_dev, u32 reg, void * data, u32 length) {
    struct pci_dev * dev = host_dev->hw_dev.dev;

        pci_read_config_byte(dev, reg, data);
    } else if (length == 2) {
        pci_read_config_word(dev, reg, data);
    } else if (length == 4) {
        pci_read_config_dword(dev, reg, data);

    /* Unsupported access width. */
    printk("Invalid length of host PCI config read\n");
// There should be a matching teardown function here; otherwise we are
// at least leaking the lock from the lock checker's perspective.
// We would like to be able to do a palacios_spinlock_deinit() here...