1 /* Linux host side PCI passthrough support
2 * Jack Lange <jacklange@cs.pitt.edu>, 2012
6 #include <linux/iommu.h>
7 #include <linux/interrupt.h>
8 #include <linux/version.h>
11 #define PCI_HDR_SIZE 256
/*
 * setup_hw_pci_dev() - locate the physical PCI function backing @host_dev
 * and snapshot its state (BARs, expansion ROM, full config space) into
 * host_dev->v3_dev so the VMM can emulate the device to the guest.
 *
 * NOTE(review): this view of the file is elided — the error-return paths
 * after the printk()s and the closing braces are not visible here, so the
 * cleanup behavior on failure (e.g. pci_dev_put / pci_disable_device)
 * cannot be confirmed from this chunk.
 */
14 static int setup_hw_pci_dev(struct host_pci_device * host_dev) {
16 struct pci_dev * dev = NULL;
17 struct v3_host_pci_dev * v3_dev = &(host_dev->v3_dev);
// Look the function up by user-supplied bus/devfn.
// NOTE(review): pci_get_bus_and_slot() assumes domain 0 and was removed in
// Linux 4.16 in favor of pci_get_domain_bus_and_slot() — verify target kernel.
19 dev = pci_get_bus_and_slot(host_dev->hw_dev.bus,
20 host_dev->hw_dev.devfn);
24 printk("Could not find HW pci device (bus=%d, devfn=%d)\n",
25 host_dev->hw_dev.bus, host_dev->hw_dev.devfn);
29 // record pointer in dev state
30 host_dev->hw_dev.dev = dev;
// INTx starts masked; the guest must ack (hw_ack_irq) before the line is
// re-enabled. The lock serializes enable/disable against the IRQ handler.
32 host_dev->hw_dev.intx_disabled = 1;
33 palacios_spinlock_init(&(host_dev->hw_dev.intx_lock));
35 if (pci_enable_device(dev)) {
36 printk("Could not enable Device\n");
// Claim all BAR regions under the "v3vee" owner name so other drivers
// cannot bind while the device is passed through.
// NOTE(review): "reservce" in the message below is a typo for "reserve".
40 ret = pci_request_regions(dev, "v3vee");
42 printk("Could not reservce PCI regions\n");
// Reset the function to a known-clean state and save config for restore.
47 pci_reset_function(host_dev->hw_dev.dev);
48 pci_save_state(host_dev->hw_dev.dev);
// Debug dump of every resource the kernel discovered for this device.
53 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
54 printk("Resource %d\n", i);
55 printk("\tflags = 0x%lx\n", pci_resource_flags(dev, i));
56 printk("\t name=%s, start=%lx, size=%d\n",
57 host_dev->hw_dev.dev->resource[i].name, (uintptr_t)pci_resource_start(dev, i),
58 (u32)pci_resource_len(dev, i));
// NOTE(review): dev->rom_base_reg was removed from struct pci_dev in newer
// kernels — confirm against the target kernel version.
62 printk("Rom BAR=%d\n", dev->rom_base_reg);
65 /* Cache first 6 BAR regs */
69 for (i = 0; i < 6; i++) {
70 struct v3_host_pci_bar * bar = &(v3_dev->bars[i]);
73 bar->size = pci_resource_len(dev, i);
74 bar->addr = pci_resource_start(dev, i);
75 flags = pci_resource_flags(dev, i);
77 if (flags & IORESOURCE_IO) {
78 bar->type = PT_BAR_IO;
79 } else if (flags & IORESOURCE_MEM) {
// A 64-bit memory BAR consumes two BAR slots: this one holds the low
// half, the next slot the high half.
80 if ((flags & IORESOURCE_MEM_64)) {
81 // this should never happen with i==5, but it
82 // is technically an OOB access without the modulo
83 struct v3_host_pci_bar * hi_bar = &(v3_dev->bars[(i + 1) % 6]);
85 bar->type = PT_BAR_MEM64_LO;
87 hi_bar->type = PT_BAR_MEM64_HI;
88 hi_bar->size = bar->size;
89 hi_bar->addr = bar->addr;
90 hi_bar->cacheable = ((flags & IORESOURCE_CACHEABLE) != 0);
91 hi_bar->prefetchable = ((flags & IORESOURCE_PREFETCH) != 0);
94 } else if (flags & IORESOURCE_DMA) {
95 bar->type = PT_BAR_MEM24;
97 bar->type = PT_BAR_MEM32;
100 bar->cacheable = ((flags & IORESOURCE_CACHEABLE) != 0);
101 bar->prefetchable = ((flags & IORESOURCE_PREFETCH) != 0);
// Resource slot carries none of the recognized flag bits.
104 bar->type = PT_BAR_NONE;
109 /* Cache expansion rom bar */
111 struct resource * rom_res = &(dev->resource[PCI_ROM_RESOURCE]);
112 int rom_size = pci_resource_len(dev, PCI_ROM_RESOURCE);
115 //unsigned long flags;
117 v3_dev->exp_rom.size = rom_size;
118 v3_dev->exp_rom.addr = pci_resource_start(dev, PCI_ROM_RESOURCE);
119 // flags = pci_resource_flags(dev, PCI_ROM_RESOURCE);
121 v3_dev->exp_rom.type = PT_EXP_ROM;
// Remember whether the host BIOS/OS left the ROM decode enabled.
123 v3_dev->exp_rom.exp_rom_enabled = rom_res->flags & IORESOURCE_ROM_ENABLE;
127 /* Cache entire configuration space */
131 // Copy the configuration space to the local cached version
// PCI_HDR_SIZE is 256 bytes; read it dword-at-a-time into cfg_space[].
132 for (m = 0; m < PCI_HDR_SIZE; m += 4) {
133 pci_read_config_dword(dev, m, (u32 *)&(v3_dev->cfg_space[m]));
138 /* HARDCODED for now but this will need to depend on IOMMU support detection */
140 printk("Setting host PCI device (%s) as IOMMU\n", host_dev->name);
141 v3_dev->iface = IOMMU;
143 printk("Setting host PCI device (%s) as SYMBIOTIC\n", host_dev->name);
144 v3_dev->iface = SYMBIOTIC;
/*
 * host_pci_intx_irq_handler() - threaded handler for legacy (INTx) IRQs.
 *
 * INTx is level-triggered, so the line is masked here (disable_irq_nosync)
 * before the interrupt is forwarded to the guest; hw_ack_irq() re-enables
 * it once the guest acknowledges. intx_lock serializes intx_disabled
 * against that ack path. The return statement is elided from this view.
 */
153 static irqreturn_t host_pci_intx_irq_handler(int irq, void * priv_data) {
154 struct host_pci_device * host_dev = priv_data;
156 // printk("Host PCI IRQ handler (%d)\n", irq);
158 palacios_spinlock_lock(&(host_dev->hw_dev.intx_lock));
159 disable_irq_nosync(irq);
160 host_dev->hw_dev.intx_disabled = 1;
161 palacios_spinlock_unlock(&(host_dev->hw_dev.intx_lock));
// Forward to the VMM; vector index 0 — INTx has a single line per device.
163 V3_host_pci_raise_irq(&(host_dev->v3_dev), 0);
/*
 * host_pci_msi_irq_handler() - handler for a single MSI vector.
 * MSIs are edge-style, so no mask/ack dance is needed; simply forward
 * vector 0 to the guest. The return statement is elided from this view.
 */
170 static irqreturn_t host_pci_msi_irq_handler(int irq, void * priv_data) {
171 struct host_pci_device * host_dev = priv_data;
172 // printk("Host PCI MSI IRQ Handler (%d)\n", irq);
174 V3_host_pci_raise_irq(&(host_dev->v3_dev), 0);
/*
 * host_pci_msix_irq_handler() - handler shared by all MSI-X vectors.
 *
 * All vectors are registered with the same handler, so the host IRQ number
 * is translated back to the guest-visible vector index by a linear scan of
 * msix_entries[]. The error printk presumably sits on the loop's
 * fall-through (no match) path — the intervening lines are elided here.
 */
179 static irqreturn_t host_pci_msix_irq_handler(int irq, void * priv_data) {
180 struct host_pci_device * host_dev = priv_data;
183 // printk("Host PCI MSIX IRQ Handler (%d)\n", irq);
186 for (i = 0; i < host_dev->hw_dev.num_msix_vecs; i++) {
187 if (irq == host_dev->hw_dev.msix_entries[i].vector) {
188 V3_host_pci_raise_irq(&(host_dev->v3_dev), i);
190 printk("Error Could not find matching MSIX vector for IRQ %d\n", irq);
/*
 * hw_pci_cmd() - dispatch a control command from the VMM onto the physical
 * device: bus-master DMA on/off, and enable/disable of each interrupt
 * delivery mode (INTx, MSI, MSI-X). @arg carries the MSI-X vector count
 * for HOST_PCI_CMD_MSIX_ENABLE. Case terminators (break) and the closing
 * brace are elided from this view.
 */
197 static int hw_pci_cmd(struct host_pci_device * host_dev, host_pci_cmd_t cmd, u64 arg) {
198 //struct v3_host_pci_dev * v3_dev = &(host_dev->v3_dev);
199 struct pci_dev * dev = host_dev->hw_dev.dev;
202 case HOST_PCI_CMD_DMA_DISABLE:
203 printk("Passthrough PCI device disabling BMDMA\n");
204 pci_clear_master(host_dev->hw_dev.dev);
206 case HOST_PCI_CMD_DMA_ENABLE:
207 printk("Passthrough PCI device Enabling BMDMA\n");
208 pci_set_master(host_dev->hw_dev.dev);
211 case HOST_PCI_CMD_INTX_DISABLE:
212 printk("Passthrough PCI device disabling INTx IRQ\n");
// Tear down the threaded INTx handler installed at reserve/enable time.
214 disable_irq(dev->irq);
215 free_irq(dev->irq, (void *)host_dev);
218 case HOST_PCI_CMD_INTX_ENABLE:
219 printk("Passthrough PCI device Enabling INTx IRQ\n");
// IRQF_ONESHOT keeps the line masked until the threaded handler finishes.
221 if (request_threaded_irq(dev->irq, NULL, host_pci_intx_irq_handler,
222 IRQF_ONESHOT, "V3Vee_Host_PCI_INTx", (void *)host_dev)) {
223 printk("ERROR Could not assign IRQ to host PCI device (%s)\n", host_dev->name);
228 case HOST_PCI_CMD_MSI_DISABLE:
229 printk("Passthrough PCI device Disabling MSIs\n");
// Release the handler before turning MSI off (dev->irq reverts afterwards).
231 disable_irq(dev->irq);
232 free_irq(dev->irq, (void *)host_dev);
234 pci_disable_msi(dev);
237 case HOST_PCI_CMD_MSI_ENABLE:
238 printk("Passthrough PCI device Enabling MSI\n");
// The pci_enable_msi() call itself is in the elided lines below this guard.
240 if (!dev->msi_enabled) {
243 if (request_irq(dev->irq, host_pci_msi_irq_handler,
244 0, "V3Vee_host_PCI_MSI", (void *)host_dev)) {
245 printk("Error Requesting IRQ %d for Passthrough MSI IRQ\n", dev->irq);
253 case HOST_PCI_CMD_MSIX_ENABLE: {
256 printk("Passthrough PCI device Enabling MSIX\n");
// NOTE(review): stray double semicolon on the next line (harmless but typo).
257 host_dev->hw_dev.num_msix_vecs = arg;;
// NOTE(review): kcalloc() result is not checked in the visible code; the
// loop below dereferences it — confirm a NULL check exists in elided lines.
258 host_dev->hw_dev.msix_entries = kcalloc(host_dev->hw_dev.num_msix_vecs,
259 sizeof(struct msix_entry), GFP_KERNEL);
261 for (i = 0; i < host_dev->hw_dev.num_msix_vecs; i++) {
262 host_dev->hw_dev.msix_entries[i].entry = i;
// NOTE(review): return value of pci_enable_msix() is ignored here, and the
// API was removed in Linux 4.12 (use pci_enable_msix_range / pci_alloc_irq_vectors).
265 pci_enable_msix(dev, host_dev->hw_dev.msix_entries,
266 host_dev->hw_dev.num_msix_vecs);
// One shared handler for every vector; the handler maps irq -> index.
268 for (i = 0; i < host_dev->hw_dev.num_msix_vecs; i++) {
269 if (request_irq(host_dev->hw_dev.msix_entries[i].vector,
270 host_pci_msix_irq_handler,
271 0, "V3VEE_host_PCI_MSIX", (void *)host_dev)) {
272 printk("Error requesting IRQ %d for Passthrough MSIX IRQ\n",
273 host_dev->hw_dev.msix_entries[i].vector);
280 case HOST_PCI_CMD_MSIX_DISABLE: {
283 printk("Passthrough PCI device Disabling MSIX\n");
285 for (i = 0; i < host_dev->hw_dev.num_msix_vecs; i++) {
286 disable_irq(host_dev->hw_dev.msix_entries[i].vector);
289 for (i = 0; i < host_dev->hw_dev.num_msix_vecs; i++) {
290 free_irq(host_dev->hw_dev.msix_entries[i].vector, (void *)host_dev);
293 host_dev->hw_dev.num_msix_vecs = 0;
// NOTE(review): freed with palacios_free() but allocated with kcalloc() —
// verify palacios_free() wraps kfree() on this platform.
294 palacios_free(host_dev->hw_dev.msix_entries);
296 pci_disable_msix(dev);
301 printk("Error: unhandled passthrough PCI command: %d\n", cmd);
/*
 * hw_ack_irq() - guest acknowledgment of a forwarded INTx interrupt.
 *
 * Counterpart of host_pci_intx_irq_handler(): once the guest has serviced
 * the interrupt, unmask the host line so the next level-triggered INTx can
 * fire. @vector is unused in the visible code (INTx has one line). Uses the
 * irqsave lock variant since the handler side also takes intx_lock.
 */
310 static int hw_ack_irq(struct host_pci_device * host_dev, u32 vector) {
311 struct pci_dev * dev = host_dev->hw_dev.dev;
314 // printk("Acking IRQ vector %d\n", vector);
316 palacios_spinlock_lock_irqsave(&(host_dev->hw_dev.intx_lock), flags);
317 // printk("Enabling IRQ %d\n", dev->irq);
318 enable_irq(dev->irq);
319 host_dev->hw_dev.intx_disabled = 0;
320 palacios_spinlock_unlock_irqrestore(&(host_dev->hw_dev.intx_lock), flags);
/*
 * reserve_hw_pci_dev() - claim the physical device for a guest (@v3_ctx),
 * build its IOMMU domain mapping guest-physical to host-physical memory,
 * attach the device to that domain, and install the INTx handler.
 *
 * NOTE(review): heavily elided view — declarations of ret/flags/gpa, the
 * in_use "already taken" branch, and the per-chunk gpa/hpa/size advance
 * inside the mapping loop are not visible here.
 */
328 static int reserve_hw_pci_dev(struct host_pci_device * host_dev, void * v3_ctx) {
331 struct v3_host_pci_dev * v3_dev = &(host_dev->v3_dev);
332 struct pci_dev * dev = host_dev->hw_dev.dev;
// Global lock: only one guest may own the device at a time.
334 palacios_spinlock_lock_irqsave(&lock, flags);
335 if (host_dev->hw_dev.in_use == 0) {
336 host_dev->hw_dev.in_use = 1;
340 palacios_spinlock_unlock_irqrestore(&lock, flags);
343 if (v3_dev->iface == IOMMU) {
344 struct v3_guest_mem_region region;
// NOTE(review): no-argument iommu_domain_alloc() is the pre-3.1 API; later
// kernels take a struct bus_type * — confirm target kernel version.
348 host_dev->hw_dev.iommu_domain = iommu_domain_alloc();
// Iterate over every guest memory region and identity-map gpa -> hpa.
// NOTE(review): "®ion" below is an encoding mojibake of "&region" — the
// '&' and 'r' were fused into '®' at some point; this must be repaired
// before the file can compile.
350 while (V3_get_guest_mem_region(v3_ctx, ®ion, gpa)) {
352 printk("Memory region: start=%p, end=%p\n", (void *)region.start, (void *)region.end);
355 flags = IOMMU_READ | IOMMU_WRITE; // Need to see what IOMMU_CACHE means
357 /* This version could be wrong */
358 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
359 // Guest VAs start at zero and go to end of memory
360 iommu_map_range(host_dev->hw_dev.iommu_domain, 0, region.start, (region.end - region.start), flags);
362 /* Linux actually made the interface worse... Now you can only map memory in powers of 2 (meant to only be pages...) */
364 u64 size = region.end - region.start;
365 u32 page_size = 512 * 4096; // assume large 64bit pages (2MB)
366 u64 hpa = region.start;
// Fall back to 4KB granularity for any chunk smaller than 2MB.
369 if (size < page_size) {
370 page_size = 4096; // less than a 2MB granularity, so we switch to small pages (4KB)
373 printk("Mapping IOMMU region gpa=%p hpa=%p (size=%d)\n", (void *)gpa, (void *)hpa, page_size);
// NOTE(review): iommu_map()'s 4th argument changed from gfp_order to a byte
// size in Linux 3.3; passing get_order() is only correct on the older API —
// the author's "This version could be wrong" comment above refers to this.
375 if (iommu_map(host_dev->hw_dev.iommu_domain, gpa, hpa,
376 get_order(page_size), flags)) {
377 printk("ERROR: Could not map sub region (GPA=%p) (HPA=%p) (order=%d)\n",
378 (void *)gpa, (void *)hpa, get_order(page_size));
// Point the IOMMU tables at this device's DMA stream.
391 if (iommu_attach_device(host_dev->hw_dev.iommu_domain, &(dev->dev))) {
392 printk("ERROR attaching host PCI device to IOMMU domain\n");
398 printk("Requesting Threaded IRQ handler for IRQ %d\n", dev->irq);
399 // setup regular IRQs until advanced IRQ mechanisms are enabled
400 if (request_threaded_irq(dev->irq, NULL, host_pci_intx_irq_handler,
401 IRQF_ONESHOT, "V3Vee_Host_PCI_INTx", (void *)host_dev)) {
402 printk("ERROR Could not assign IRQ to host PCI device (%s)\n", host_dev->name);
/*
 * write_hw_pci_config() - forward a guest config-space write of @length
 * bytes (1, 2 or 4) at offset @reg to the physical device. The guard that
 * filters which registers are forwarded (visible in the elided lines) and
 * the return statements are not shown in this view.
 */
413 static int write_hw_pci_config(struct host_pci_device * host_dev, u32 reg, void * data, u32 length) {
414 struct pci_dev * dev = host_dev->hw_dev.dev;
// Dispatch on access width; PCI config accesses are 1/2/4 bytes only.
421 pci_write_config_byte(dev, reg, *(u8 *)data);
422 } else if (length == 2) {
423 pci_write_config_word(dev, reg, *(u16 *)data);
424 } else if (length == 4) {
425 pci_write_config_dword(dev, reg, *(u32 *)data);
427 printk("Invalid length of host PCI config update\n");
/*
 * read_hw_pci_config() - forward a guest config-space read of @length
 * bytes (1, 2 or 4) at offset @reg from the physical device into @data.
 * Mirrors write_hw_pci_config(); the width guard and returns are elided
 * from this view.
 */
436 static int read_hw_pci_config(struct host_pci_device * host_dev, u32 reg, void * data, u32 length) {
437 struct pci_dev * dev = host_dev->hw_dev.dev;
// @data must point at a buffer of at least @length bytes (caller's contract).
441 pci_read_config_byte(dev, reg, data);
442 } else if (length == 2) {
443 pci_read_config_word(dev, reg, data);
444 } else if (length == 4) {
445 pci_read_config_dword(dev, reg, data);
447 printk("Invalid length of host PCI config read\n");
457 // There should be a matching teardown function here; without one we are
458 // at least leaking the lock from the lock checker's perspective.
459 // We would like to be able to call palacios_spinlock_deinit() here...