1 /* Linux host side PCI passthrough support
2 * Jack Lange <jacklange@cs.pitt.edu>, 2012
6 #include <linux/iommu.h>
7 #include <linux/interrupt.h>
8 #include <linux/version.h>
11 #define PCI_HDR_SIZE 256
13 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 43)
14 #define IOMMU_FOUND() iommu_present(&pci_bus_type)
15 #define IOMMU_DOMAIN_ALLOC() iommu_domain_alloc(&pci_bus_type)
17 #define IOMMU_FOUND() iommu_found()
18 #define IOMMU_DOMAIN_ALLOC() iommu_domain_alloc()
23 static int setup_hw_pci_dev(struct host_pci_device * host_dev) {
/* Locate the physical PCI function identified by host_dev->hw_dev.{bus,devfn},
 * enable it, reserve its I/O regions, and cache its BARs, expansion ROM, and
 * full 256-byte config header into host_dev->v3_dev for the guest-facing side.
 * NOTE(review): this excerpt elides several lines (error returns, some closing
 * braces, local declarations such as ret/i/m/flags); comments below describe
 * only the code that is visible here.
 */
25 struct pci_dev * dev = NULL;
26 struct v3_host_pci_dev * v3_dev = &(host_dev->v3_dev);
28 dev = pci_get_bus_and_slot(host_dev->hw_dev.bus,
29 host_dev->hw_dev.devfn);
/* Lookup failure path: report which (bus, devfn) could not be found. */
33 printk("Could not find HW pci device (bus=%d, devfn=%d)\n",
34 host_dev->hw_dev.bus, host_dev->hw_dev.devfn);
38 // record pointer in dev state
39 host_dev->hw_dev.dev = dev;
/* INTx starts masked; the lock serializes the masked/unmasked flag with the
 * INTx IRQ handler and hw_ack_irq(). */
41 host_dev->hw_dev.intx_disabled = 1;
42 palacios_spinlock_init(&(host_dev->hw_dev.intx_lock));
44 if (pci_enable_device(dev)) {
45 printk("Could not enable Device\n");
/* Claim all of the device's BAR regions under the "v3vee" owner name so no
 * other host driver can bind them while passed through. */
49 ret = pci_request_regions(dev, "v3vee");
51 printk("Could not reserve PCI regions\n");
/* Reset the function to a clean state and snapshot its config space so the
 * host can restore it later. */
56 pci_reset_function(host_dev->hw_dev.dev);
57 pci_save_state(host_dev->hw_dev.dev);
/* Debug dump of every host resource for this device. */
62 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
63 printk("Resource %d\n", i);
64 printk("\tflags = 0x%lx\n", pci_resource_flags(dev, i));
65 printk("\t name=%s, start=%lx, size=%d\n",
66 host_dev->hw_dev.dev->resource[i].name, (uintptr_t)pci_resource_start(dev, i),
67 (u32)pci_resource_len(dev, i));
71 printk("Rom BAR=%d\n", dev->rom_base_reg);
74 /* Cache first 6 BAR regs */
78 for (i = 0; i < 6; i++) {
79 struct v3_host_pci_bar * bar = &(v3_dev->bars[i]);
82 bar->size = pci_resource_len(dev, i);
83 bar->addr = pci_resource_start(dev, i);
84 flags = pci_resource_flags(dev, i);
/* Classify the BAR: I/O port, 64-bit memory (consumes two slots), 24-bit
 * DMA memory, or plain 32-bit memory. */
86 if (flags & IORESOURCE_IO) {
87 bar->type = PT_BAR_IO;
88 } else if (flags & IORESOURCE_MEM) {
89 if ((flags & IORESOURCE_MEM_64)) {
90 // this should never happen with i==5, but it
91 // is technically an OOB access without the modulo
92 struct v3_host_pci_bar * hi_bar = &(v3_dev->bars[(i + 1) % 6]);
/* A 64-bit BAR occupies slots i (low half) and i+1 (high half); mirror the
 * size/address/attributes into the high slot. */
94 bar->type = PT_BAR_MEM64_LO;
96 hi_bar->type = PT_BAR_MEM64_HI;
97 hi_bar->size = bar->size;
98 hi_bar->addr = bar->addr;
99 hi_bar->cacheable = ((flags & IORESOURCE_CACHEABLE) != 0);
100 hi_bar->prefetchable = ((flags & IORESOURCE_PREFETCH) != 0);
103 } else if (flags & IORESOURCE_DMA) {
104 bar->type = PT_BAR_MEM24;
106 bar->type = PT_BAR_MEM32;
109 bar->cacheable = ((flags & IORESOURCE_CACHEABLE) != 0);
110 bar->prefetchable = ((flags & IORESOURCE_PREFETCH) != 0);
/* No resource flags at all: mark the slot unused. */
113 bar->type = PT_BAR_NONE;
118 /* Cache expansion rom bar */
120 struct resource * rom_res = &(dev->resource[PCI_ROM_RESOURCE]);
121 int rom_size = pci_resource_len(dev, PCI_ROM_RESOURCE);
124 //unsigned long flags;
126 v3_dev->exp_rom.size = rom_size;
127 v3_dev->exp_rom.addr = pci_resource_start(dev, PCI_ROM_RESOURCE);
128 // flags = pci_resource_flags(dev, PCI_ROM_RESOURCE);
130 v3_dev->exp_rom.type = PT_EXP_ROM;
/* Preserve whether the host already had the expansion ROM decoding enabled. */
132 v3_dev->exp_rom.exp_rom_enabled = rom_res->flags & IORESOURCE_ROM_ENABLE;
136 /* Cache entire configuration space */
140 // Copy the configuration space to the local cached version
141 for (m = 0; m < PCI_HDR_SIZE; m += 4) {
142 pci_read_config_dword(dev, m, (u32 *)&(v3_dev->cfg_space[m]));
147 /* HARDCODED for now but this will need to depend on IOMMU support detection */
149 printk("Setting host PCI device (%s) as IOMMU\n", host_dev->name);
150 v3_dev->iface = IOMMU;
/* Fallback path (presumably when no IOMMU is available — elided branch). */
152 printk("Setting host PCI device (%s) as SYMBIOTIC\n", host_dev->name);
153 v3_dev->iface = SYMBIOTIC;
/* Host-side handler for the device's legacy INTx interrupt.
 * Because INTx is level-triggered, the line is masked here (and the masked
 * state recorded under intx_lock) until the guest acknowledges the interrupt,
 * at which point hw_ack_irq() re-enables it. The event is then forwarded to
 * the VMM as guest vector 0.
 */
162 static irqreturn_t host_pci_intx_irq_handler(int irq, void * priv_data) {
163 struct host_pci_device * host_dev = priv_data;
165 // printk("Host PCI IRQ handler (%d)\n", irq);
/* Mask the (level-triggered) line and record that we did so; intx_lock keeps
 * the flag consistent with hw_ack_irq() running on another CPU. */
167 palacios_spinlock_lock(&(host_dev->hw_dev.intx_lock));
168 disable_irq_nosync(irq);
169 host_dev->hw_dev.intx_disabled = 1;
170 palacios_spinlock_unlock(&(host_dev->hw_dev.intx_lock));
/* Inject the interrupt into the guest (vector index 0 == INTx). */
172 V3_host_pci_raise_irq(&(host_dev->v3_dev), 0);
/* Host-side handler for the device's MSI interrupt.
 * MSIs are edge-triggered, so no masking/ack dance is needed; the event is
 * simply forwarded to the VMM as guest vector 0.
 */
179 static irqreturn_t host_pci_msi_irq_handler(int irq, void * priv_data) {
180 struct host_pci_device * host_dev = priv_data;
181 // printk("Host PCI MSI IRQ Handler (%d)\n", irq);
183 V3_host_pci_raise_irq(&(host_dev->v3_dev), 0);
/* Host-side handler for one of the device's MSI-X interrupts.
 * All MSI-X vectors share this handler, so the host IRQ number is translated
 * back to its MSI-X table index by scanning msix_entries[], and that index is
 * forwarded to the VMM as the guest vector.
 */
188 static irqreturn_t host_pci_msix_irq_handler(int irq, void * priv_data) {
189 struct host_pci_device * host_dev = priv_data;
192 // printk("Host PCI MSIX IRQ Handler (%d)\n", irq);
/* Linear scan: find which MSI-X entry owns this host IRQ. */
195 for (i = 0; i < host_dev->hw_dev.num_msix_vecs; i++) {
196 if (irq == host_dev->hw_dev.msix_entries[i].vector) {
197 V3_host_pci_raise_irq(&(host_dev->v3_dev), i);
/* Reached when no entry matched the incoming IRQ (loop-exit path; some
 * control-flow lines are elided in this excerpt). */
199 printk("Error Could not find matching MSIX vector for IRQ %d\n", irq);
206 static int hw_pci_cmd(struct host_pci_device * host_dev, host_pci_cmd_t cmd, u64 arg) {
/* Dispatch a control command from the VMM onto the physical device:
 * bus-master DMA on/off, and enable/disable for each interrupt delivery
 * mode (INTx, MSI, MSI-X). For MSI-X, `arg` is the number of vectors.
 * NOTE(review): this excerpt elides several lines (break statements, closing
 * braces, locals such as i); comments describe only the visible code.
 */
207 //struct v3_host_pci_dev * v3_dev = &(host_dev->v3_dev);
208 struct pci_dev * dev = host_dev->hw_dev.dev;
211 case HOST_PCI_CMD_DMA_DISABLE:
212 printk("Passthrough PCI device disabling BMDMA\n");
213 pci_clear_master(host_dev->hw_dev.dev);
215 case HOST_PCI_CMD_DMA_ENABLE:
216 printk("Passthrough PCI device Enabling BMDMA\n");
217 pci_set_master(host_dev->hw_dev.dev);
220 case HOST_PCI_CMD_INTX_DISABLE:
221 printk("Passthrough PCI device disabling INTx IRQ\n");
223 disable_irq(dev->irq);
224 free_irq(dev->irq, (void *)host_dev);
227 case HOST_PCI_CMD_INTX_ENABLE:
228 printk("Passthrough PCI device Enabling INTx IRQ\n");
/* Threaded handler with IRQF_ONESHOT: the line stays masked until the
 * threaded handler completes (it is re-enabled by hw_ack_irq()). */
230 if (request_threaded_irq(dev->irq, NULL, host_pci_intx_irq_handler,
231 IRQF_ONESHOT, "V3Vee_Host_PCI_INTx", (void *)host_dev)) {
232 printk("ERROR Could not assign IRQ to host PCI device (%s)\n", host_dev->name);
237 case HOST_PCI_CMD_MSI_DISABLE:
238 printk("Passthrough PCI device Disabling MSIs\n");
240 disable_irq(dev->irq);
241 free_irq(dev->irq, (void *)host_dev);
243 pci_disable_msi(dev);
246 case HOST_PCI_CMD_MSI_ENABLE:
247 printk("Passthrough PCI device Enabling MSI\n");
/* Only enable MSI if it is not already on (enable call itself is elided
 * from this excerpt). */
249 if (!dev->msi_enabled) {
252 if (request_irq(dev->irq, host_pci_msi_irq_handler,
253 0, "V3Vee_host_PCI_MSI", (void *)host_dev)) {
254 printk("Error Requesting IRQ %d for Passthrough MSI IRQ\n", dev->irq);
262 case HOST_PCI_CMD_MSIX_ENABLE: {
265 printk("Passthrough PCI device Enabling MSIX\n");
266 host_dev->hw_dev.num_msix_vecs = arg;
/* NOTE(review): kcalloc() result is not NULL-checked and the return value of
 * pci_enable_msix() below is ignored — both should be handled. */
267 host_dev->hw_dev.msix_entries = kcalloc(host_dev->hw_dev.num_msix_vecs,
268 sizeof(struct msix_entry), GFP_KERNEL);
/* Each entry requests its own table slot index. */
270 for (i = 0; i < host_dev->hw_dev.num_msix_vecs; i++) {
271 host_dev->hw_dev.msix_entries[i].entry = i;
274 pci_enable_msix(dev, host_dev->hw_dev.msix_entries,
275 host_dev->hw_dev.num_msix_vecs);
/* Wire every allocated vector to the shared MSI-X handler. */
277 for (i = 0; i < host_dev->hw_dev.num_msix_vecs; i++) {
278 if (request_irq(host_dev->hw_dev.msix_entries[i].vector,
279 host_pci_msix_irq_handler,
280 0, "V3VEE_host_PCI_MSIX", (void *)host_dev)) {
281 printk("Error requesting IRQ %d for Passthrough MSIX IRQ\n",
282 host_dev->hw_dev.msix_entries[i].vector);
289 case HOST_PCI_CMD_MSIX_DISABLE: {
292 printk("Passthrough PCI device Disabling MSIX\n");
/* Quiesce all vectors first, then release them. */
294 for (i = 0; i < host_dev->hw_dev.num_msix_vecs; i++) {
295 disable_irq(host_dev->hw_dev.msix_entries[i].vector);
298 for (i = 0; i < host_dev->hw_dev.num_msix_vecs; i++) {
299 free_irq(host_dev->hw_dev.msix_entries[i].vector, (void *)host_dev);
/* NOTE(review): entries were allocated with kcalloc() but freed with
 * palacios_free() — confirm the two are a matching alloc/free pair. */
302 host_dev->hw_dev.num_msix_vecs = 0;
303 palacios_free(host_dev->hw_dev.msix_entries);
305 pci_disable_msix(dev);
310 printk("Error: unhandled passthrough PCI command: %d\n", cmd);
/* Guest acknowledgment of a legacy INTx interrupt.
 * Re-enables the host IRQ line that host_pci_intx_irq_handler() masked, and
 * clears the intx_disabled flag under intx_lock (irqsave variant, since the
 * masking side runs in IRQ context).
 * NOTE(review): the `flags` declaration and return are elided in this excerpt;
 * `vector` appears unused in the visible code.
 */
319 static int hw_ack_irq(struct host_pci_device * host_dev, u32 vector) {
320 struct pci_dev * dev = host_dev->hw_dev.dev;
323 // printk("Acking IRQ vector %d\n", vector);
325 palacios_spinlock_lock_irqsave(&(host_dev->hw_dev.intx_lock), flags);
326 // printk("Enabling IRQ %d\n", dev->irq);
327 enable_irq(dev->irq);
328 host_dev->hw_dev.intx_disabled = 0;
329 palacios_spinlock_unlock_irqrestore(&(host_dev->hw_dev.intx_lock), flags);
337 static int reserve_hw_pci_dev(struct host_pci_device * host_dev, void * v3_ctx) {
/* Claim the physical device for one guest (in_use flag under the global lock),
 * then — for IOMMU-mode devices — build an IOMMU domain mapping every guest
 * memory region GPA->HPA, attach the device to it, and install the default
 * threaded INTx handler.
 * NOTE(review): this excerpt elides several lines (locals such as flags/gpa,
 * error returns, loop-advance code, closing braces); comments describe only
 * the visible code. Fixed here: `&region` had been mangled into the
 * mis-encoded byte sequence `®ion` (HTML-entity corruption), which does
 * not compile.
 */
340 struct v3_host_pci_dev * v3_dev = &(host_dev->v3_dev);
341 struct pci_dev * dev = host_dev->hw_dev.dev;
/* Atomically test-and-set the in_use flag so two guests cannot claim the
 * same device. */
343 palacios_spinlock_lock_irqsave(&lock, flags);
344 if (host_dev->hw_dev.in_use == 0) {
345 host_dev->hw_dev.in_use = 1;
349 palacios_spinlock_unlock_irqrestore(&lock, flags);
352 if (v3_dev->iface == IOMMU) {
353 struct v3_guest_mem_region region;
357 host_dev->hw_dev.iommu_domain = IOMMU_DOMAIN_ALLOC();
/* Walk every guest memory region and mirror it into the IOMMU domain. */
359 while (V3_get_guest_mem_region(v3_ctx, &region, gpa)) {
361 printk("Memory region: start=%p, end=%p\n", (void *)region.start, (void *)region.end);
364 flags = IOMMU_READ | IOMMU_WRITE; // Need to see what IOMMU_CACHE means
366 /* This version could be wrong */
367 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
368 // Guest VAs start at zero and go to end of memory
369 iommu_map_range(host_dev->hw_dev.iommu_domain, 0, region.start, (region.end - region.start), flags);
371 /* Linux actually made the interface worse... Now you can only map memory in powers of 2 (meant to only be pages...) */
373 u64 size = region.end - region.start;
374 u32 page_size = 512 * 4096; // assume large 64bit pages (2MB)
375 u64 hpa = region.start;
/* Drop to 4KB granularity when the remaining region is smaller than 2MB. */
378 if (size < page_size) {
379 page_size = 4096; // less than a 2MB granularity, so we switch to small pages (4KB)
382 printk("Mapping IOMMU region gpa=%p hpa=%p (size=%d)\n", (void *)gpa, (void *)hpa, page_size);
384 if (iommu_map(host_dev->hw_dev.iommu_domain, gpa, hpa,
385 get_order(page_size), flags)) {
386 printk("ERROR: Could not map sub region (GPA=%p) (HPA=%p) (order=%d)\n",
387 (void *)gpa, (void *)hpa, get_order(page_size));
/* Bind the physical function to the freshly populated domain. */
400 if (iommu_attach_device(host_dev->hw_dev.iommu_domain, &(dev->dev))) {
401 printk("ERROR attaching host PCI device to IOMMU domain\n");
407 printk("Requesting Threaded IRQ handler for IRQ %d\n", dev->irq);
408 // setup regular IRQs until advanced IRQ mechanisms are enabled
409 if (request_threaded_irq(dev->irq, NULL, host_pci_intx_irq_handler,
410 IRQF_ONESHOT, "V3Vee_Host_PCI_INTx", (void *)host_dev)) {
411 printk("ERROR Could not assign IRQ to host PCI device (%s)\n", host_dev->name);
/* Write `length` bytes (1, 2, or 4) from `data` into the physical device's
 * config space at offset `reg`, dispatching to the matching width-specific
 * kernel accessor.
 * NOTE(review): the leading lines of the body (likely offset/width guards)
 * and the returns are elided in this excerpt.
 */
422 static int write_hw_pci_config(struct host_pci_device * host_dev, u32 reg, void * data, u32 length) {
423 struct pci_dev * dev = host_dev->hw_dev.dev;
430 pci_write_config_byte(dev, reg, *(u8 *)data);
431 } else if (length == 2) {
432 pci_write_config_word(dev, reg, *(u16 *)data);
433 } else if (length == 4) {
434 pci_write_config_dword(dev, reg, *(u32 *)data);
/* Any other width is rejected. */
436 printk("Invalid length of host PCI config update\n");
/* Read `length` bytes (1, 2, or 4) from the physical device's config space at
 * offset `reg` into `data`, dispatching to the matching width-specific kernel
 * accessor. Mirrors write_hw_pci_config().
 * NOTE(review): the leading lines of the body (likely offset/width guards)
 * and the returns are elided in this excerpt.
 */
445 static int read_hw_pci_config(struct host_pci_device * host_dev, u32 reg, void * data, u32 length) {
446 struct pci_dev * dev = host_dev->hw_dev.dev;
450 pci_read_config_byte(dev, reg, data);
451 } else if (length == 2) {
452 pci_read_config_word(dev, reg, data);
453 } else if (length == 4) {
454 pci_read_config_dword(dev, reg, data);
/* Any other width is rejected. */
456 printk("Invalid length of host PCI config read\n");
466 // There should be a matching teardown function here; without one we are,
467 // at minimum, leaking the lock from the lock checker's perspective.
468 // Ideally we would be able to call palacios_spinlock_deinit() here...