From c30984e2a7fc722519746bd86fe20f1590fa89dd Mon Sep 17 00:00:00 2001 From: Alexander Kudryavtsev Date: Fri, 23 Sep 2011 23:20:42 +0400 Subject: [PATCH 12/32] PCI MSI-X initial support --- palacios/include/devices/pci_msi_router.h | 3 + palacios/src/devices/io_apic.c | 6 + palacios/src/devices/pci_msi_router.c | 469 ++++++++++++++++++---- palacios/src/devices/pci_msi_types.h | 52 +++- palacios/src/devices/pci_passthrough.c | 628 ++++++++++++++++++++++++----- 5 files changed, 975 insertions(+), 183 deletions(-) diff --git a/palacios/include/devices/pci_msi_router.h b/palacios/include/devices/pci_msi_router.h index bc6f0ed..faacfef 100644 --- a/palacios/include/devices/pci_msi_router.h +++ b/palacios/include/devices/pci_msi_router.h @@ -46,6 +46,9 @@ int v3_msi_capability_write(struct pci_device *dev, uint_t reg_num, uint_t lengt int v3_msi_get_info(struct pci_device *dev, struct msi_info *info, void *private_data); +int v3_msix_register_device(struct pci_device *dev, int msix_cap, void* table, void* pba, void *private_data); +int v3_msix_capability_write(struct pci_device *dev, uint_t reg_num, uint_t length, void *private_data); +int v3_msix_memory_write(struct pci_device *dev, addr_t guest_addr, void * src, uint_t length, void * private_data); #endif diff --git a/palacios/src/devices/io_apic.c b/palacios/src/devices/io_apic.c index e8f33c3..80ab1d8 100644 --- a/palacios/src/devices/io_apic.c +++ b/palacios/src/devices/io_apic.c @@ -268,6 +268,9 @@ static int ioapic_raise_irq(struct v3_vm_info * vm, void * private_data, struct struct redir_tbl_entry * irq_entry = NULL; int irq = irq_data.irq; + if(irq_data.host_vector == 76) { + PrintError("benet1!\n"); + } // check whether this IRQ can be processed by IOAPIC if(irq_data.flags.valid && !irq_data.flags.uses_irqline) return -1; @@ -285,6 +288,9 @@ static int ioapic_raise_irq(struct v3_vm_info * vm, void * private_data, struct PrintDebug("ioapic %u: IOAPIC Signaling APIC to raise INTR %d\n", ioapic->ioapic_id.id, 
irq_entry->vec); + if(irq_data.host_vector == 76) { + PrintError("benet2!\n"); + } ipi.vector = irq_entry->vec; ipi.mode = irq_entry->del_mode; diff --git a/palacios/src/devices/pci_msi_router.c b/palacios/src/devices/pci_msi_router.c index 05ad81e..0320014 100644 --- a/palacios/src/devices/pci_msi_router.c +++ b/palacios/src/devices/pci_msi_router.c @@ -35,23 +35,51 @@ #define PrintDebug(fmt, args...) #endif -struct msi_desc { - int cap; - int cap_len; - - enum msi_dest_mode dest_mode; - int vector; - int vec_count; - int raise_vector_hint; - - union msi_control *control; // points to guest config_space - union msi_msg_address *msg_address_lo; // points to guest config_space - union msi_msg_data *msg_data; // points to guest config_space - uint32_t *mask_bits; // points to guest config_space - uint32_t *pending_bits; // points to guest config_space - - union msi_msg_data msg_data_cached; // contains previous value of MSI data, to check for changes - +union mem_bar { + uint32_t val; + struct { + uint32_t io: 1; // 1 - IO, 0 - mem +#define MEM_BAR_32 0 +#define MEM_BAR_64 2 + uint32_t type: 2; // 00 - 32bit, 10 - 64 bit + uint32_t prefetchable: 1; + uint32_t base: 28; + }; +} __attribute__((packed)); + +struct msi_dev_data { + struct { + uint8_t msi: 1; // MSI registered for device + uint8_t msix: 1; // MSI-X registered for device + } f; + + struct msi_desc { + int cap; // capability offset + int cap_len; + + enum msi_dest_mode dest_mode; + int vector; + int vec_count; + int raise_vector_hint; + + union msi_control *control; // points to guest config_space + union msi_msg_address *msg_address_lo; // points to guest config_space + union msi_msg_data *msg_data; // points to guest config_space + uint32_t *mask_bits; // points to guest config_space + uint32_t *pending_bits; // points to guest config_space + + union msi_msg_data msg_data_cached; // contains previous value of MSI data, to check for changes + } m; + struct msix_desc { + int cap_offset; // capability offset 
+ struct msix_tbl_entry *table; // points to MSI-X table + uint64_t *pba; // points to Pending Bits Array + int *vector_to_entry; // maps vectors to MSI-X table entries + struct msix_capability *cap; // points to device's virtual capability + + union mem_bar *tbl_bar; + union mem_bar *pba_bar; + } mx; struct pci_device *dev; }; @@ -61,7 +89,7 @@ static int msi_raise_intr(struct v3_vm_info *vm, void *private_data, struct irq_ struct msi_router_state { char *name; - struct msi_desc msi[MSI_DEV_COUNT]; + struct msi_dev_data msi[MSI_DEV_COUNT]; int last_index; int vector_to_index[MAX_IRQ]; // maps vectors to msi array elements for fast access @@ -71,25 +99,128 @@ struct msi_router_state { void *apic_dev_data; }; +// table and pba are memory locations with MSI-X table data and Pending Bits array. +int v3_msix_register_device(struct pci_device *dev, int msix_cap, void* table, void* pba, void *private_data) { + V3_ASSERT(private_data && table && pba && dev); + struct msi_router_state *state = (struct msi_router_state*)private_data; + struct msi_dev_data *mdd; + struct msix_desc *mx; + int index; + + V3_ASSERT(msix_cap && msix_cap < 0xff); + if(dev->config_space[msix_cap] != PCI_CAP_ID_MSIX) { + PrintError("Device %s: MSI-X capability ID (%x) is incorrect at supplied offset %x!\n", + dev->name, dev->config_space[msix_cap], msix_cap); + return -1; + } + if(dev->msi_handle) { + V3_ASSERT(dev->msi_handle > 0 && dev->msi_handle <= MSI_DEV_COUNT && state->msi[dev->msi_handle - 1].dev == dev); + index = dev->msi_handle - 1; + } else { + if(state->last_index == MSI_DEV_COUNT) { + PrintError("Out of MSI device slots.\n"); + return -1; + } + index = state->last_index; + ++state->last_index; + } + mdd = &state->msi[index]; + if(dev->msi_handle && mdd->f.msix) { + PrintError("MSI-X already registered for device %s!\n", dev->name); + return -1; + } + mx = &mdd->mx; + + mx->cap_offset = msix_cap; + mx->cap = (struct msix_capability *)&dev->config_space[msix_cap]; + mx->table = (struct 
msix_tbl_entry *)table; + mx->pba = pba; + if(mx->cap->control.enable) { + PrintError("MSI-X already enabled! Smth is wrong?\n"); + return -1; + } + + int tbl_size = mx->cap->control.tbl_size + 1, i; + mx->vector_to_entry = V3_Malloc((MAX_IRQ > tbl_size ? MAX_IRQ : tbl_size) * sizeof(int)); + V3_ASSERT(mx->vector_to_entry); + for(i = 0; i < (MAX_IRQ > tbl_size ? MAX_IRQ : tbl_size); ++i) { + mx->vector_to_entry[i] = -1; + } + union mem_bar *bars = (union mem_bar *)&dev->config_space[0x10], *bar; + bar = &bars[mx->cap->tbl_bir]; + if(bar->io) { + PrintError("MSI-X table BAR is not memory bar!\n"); + return -1; + } + //if(bar->type == MEM_BAR_64) // Use LOW dword; > 4GB is not supported. + // bar = &bars[mx->cap->tbl_bir + 1]; + mx->tbl_bar = bar; + + bar = &bars[mx->cap->pba_bir]; + if(bar->io) { + PrintError("MSI-X PBA BAR is not memory bar!\n"); + return -1; + } + //if(bar->type == MEM_BAR_64) // Use LOW dword; > 4GB is not supported. + // bar = &bars[mx->cap->pba_bir + 1]; + mx->pba_bar = bar; + +#ifdef V3_CONFIG_DEBUG_PCI_MSI + PrintDebug("Device %s registered MSI-X feature with MSI router: cap. at %x, func_mask %d," + " table size %d, table BAR index %d, table BAR offset %x, PBA BAR index %d," + " PBA BAR offset %x. 
Table entries:\n", dev->name, msix_cap, mx->cap->control.func_mask, + tbl_size, mx->cap->tbl_bir, mx->cap->tbl_offset << 3, mx->cap->pba_bir, mx->cap->pba_offset << 3); + for(i = 0; i < tbl_size; ++i) { + struct msix_tbl_entry *e = &mx->table[i]; + PrintDebug(" %d: addr %08x:%08x data %04x mask %d (PBA bit %d) \n", i, e->address_hi, e->address_lo.val, + e->data.val, e->ctl.mask, MSIX_PBA_BIT(mx->pba, i)); + } +#endif + + + + if(!dev->msi_handle) + dev->msi_handle = state->last_index; + mdd->dev = dev; + mdd->f.msix = 1; + + return 0; +} int v3_msi_register_device(struct pci_device *dev, int msi_cap, void *private_data) { V3_ASSERT(private_data); struct msi_router_state *state = (struct msi_router_state*)private_data; - if(dev->msi_handle) { - PrintError("MSI already registered for device %s!\n", dev->name); + struct msi_dev_data *mdd; + struct msi_desc *md; + int index; + + V3_ASSERT(msi_cap && msi_cap < 0xff); + if(dev->config_space[msi_cap] != PCI_CAP_ID_MSI) { + PrintError("Device %s: MSI capability ID (%x) is incorrect at supplied offset %x!\n", + dev->name, dev->config_space[msi_cap], msi_cap); return -1; } - if(state->last_index == MSI_DEV_COUNT) { - PrintError("Out of MSI device slots.\n"); + if(dev->msi_handle) { + V3_ASSERT(dev->msi_handle > 0 && dev->msi_handle <= MSI_DEV_COUNT && state->msi[dev->msi_handle - 1].dev == dev); + index = dev->msi_handle - 1; + } else { + if(state->last_index == MSI_DEV_COUNT) { + PrintError("Out of MSI device slots.\n"); + return -1; + } + index = state->last_index; + ++state->last_index; + } + mdd = &state->msi[index]; + if(dev->msi_handle && mdd->f.msi) { + PrintError("MSI already registered for device %s!\n", dev->name); return -1; } - - - struct msi_desc *md = &state->msi[state->last_index]; + md = &mdd->m; md->cap = msi_cap; - md->control = (union msi_control*)&dev->config_space[msi_cap + 2]; + md->control = (union msi_control*)&dev->config_space[MSI_CAP_CTL(msi_cap)]; if(md->control->enable) { PrintError("MSI already 
enabled! Smth is wrong?\n"); @@ -109,12 +240,13 @@ int v3_msi_register_device(struct pci_device *dev, int msi_cap, void *private_da md->msg_address_lo = (union msi_msg_address*)&dev->config_space[msi_cap + 4]; md->msg_data = (union msi_msg_data*)&dev->config_space[MSI_CAP_DATA(msi_cap, md->control->is_64bit_cap)]; - md->dev = dev; + if(!dev->msi_handle) + dev->msi_handle = state->last_index; + mdd->dev = dev; + mdd->f.msi = 1; - dev->msi_handle = state->last_index + 1; - ++state->last_index; - - PrintDebug("Device %s registered with MSI router: cap %x:%x, device MSI handle %d\n", dev->name, md->cap, md->cap + md->cap_len, dev->msi_handle); + PrintDebug("Device %s registered MSI feature with MSI router: cap %x:%x, device MSI handle %d\n", + dev->name, md->cap, md->cap + md->cap_len, dev->msi_handle); return 0; } @@ -123,8 +255,33 @@ int v3_msi_register_device(struct pci_device *dev, int msi_cap, void *private_da #define MSI_VECTOR_MAX 0xfe #define MSI_ADDR_HDR 0xfee +static int msi_check_msg_address_and_data(union msi_msg_address *ad, union msi_msg_data *dt) { + if(ad->addr_hdr != MSI_ADDR_HDR) { + PrintError("MSI: Strange Address header %x.\n", ad->addr_hdr); + return -1; + } + if(ad->mbz0 || ad->mbz1) { + PrintError("MSI: MBZ fields are not zero.\n"); + return -1; + } + if(dt->vector < MSI_VECTOR_MIN || dt->vector > MSI_VECTOR_MAX) { + PrintError("MSI vector out of limits!\n"); + return -1; + } + if(dt->del_mode > 1) { + PrintError("Unsupported delivery mode specified!\n"); + return -1; + } + if(dt->trigger) { + PrintError("MSI level interrupt is not supported!\n"); + return -1; + } + return 0; +} + static int msi_update_mode(struct msi_router_state *state, int msi_dev_index) { - struct msi_desc *md = &state->msi[msi_dev_index]; + struct msi_dev_data *mdd = &state->msi[msi_dev_index]; + struct msi_desc *md = &mdd->m; union msi_control *c = (union msi_control*)md->control; union msi_msg_address *ad = (union msi_msg_address *)md->msg_address_lo; union msi_msg_data 
*dt = (union msi_msg_data *)md->msg_data; @@ -141,36 +298,20 @@ static int msi_update_mode(struct msi_router_state *state, int msi_dev_index) { return -1; } if(c->is_64bit_cap) { - if(*(uint32_t*)&md->dev->config_space[MSI_CAP_ADDR_HI(md->cap, 1)] != 0) { + if(*(uint32_t*)&mdd->dev->config_space[MSI_CAP_ADDR_HI(md->cap, 1)] != 0) { PrintError("MSI upper address is non-zero!\n"); return -1; } } - if(ad->addr_hdr != MSI_ADDR_HDR) { - PrintError("MSI: Strange Address header %x.\n", ad->addr_hdr); - return -1; - } - if(ad->mbz0 || ad->mbz1) { - PrintError("MSI: MBZ fields are not zero.\n"); - return -1; - } + + if(msi_check_msg_address_and_data(ad, dt) != 0) return -1; + if(!ad->redir_cpu || !ad->dest_mode) { md->dest_mode = MODE_PHYS_DEST; } else { md->dest_mode = MODE_LOGIC_DEST; } - if(dt->vector < MSI_VECTOR_MIN || dt->vector > MSI_VECTOR_MAX) { - PrintError("MSI vector out of limits!\n"); - return -1; - } - if(dt->del_mode > 1) { - PrintError("Unsupported delivery mode specified!\n"); - return -1; - } - if(dt->trigger) { - PrintError("MSI level interrupt is not supported!\n"); - return -1; - } + md->vector = dt->vector; md->vec_count = 1 << c->mult_msg_en; @@ -179,13 +320,13 @@ static int msi_update_mode(struct msi_router_state *state, int msi_dev_index) { } PrintDebug(" enabled %d vectors starting at %x, device index %d\n", md->vec_count, md->vector, msi_dev_index); - md->dev->irq_type = PCI_IRQ_MSI; + mdd->dev->irq_type = PCI_IRQ_MSI; } else { if (md->vector) for(i = md->vector; i < md->vector + md->vec_count; ++i) { state->vector_to_index[i] = -1; } PrintDebug(" disabled %d vectors starting at %x, device index %d\n", md->vec_count, md->vector, msi_dev_index); - md->dev->irq_type = PCI_IRQ_INTX; + mdd->dev->irq_type = PCI_IRQ_INTX; } return 0; @@ -198,7 +339,8 @@ int v3_msi_get_info(struct pci_device *dev, struct msi_info *info, void *private V3_ASSERT(dev->msi_handle > 0 && dev->msi_handle <= MSI_DEV_COUNT && state->msi[dev->msi_handle - 1].dev == dev); int 
index = dev->msi_handle - 1; - struct msi_desc *md = &state->msi[index]; + V3_ASSERT(state->msi[index].f.msi); + struct msi_desc *md = &state->msi[index].m; info->enabled = md->control->enable; info->vector = md->vector; @@ -217,7 +359,9 @@ int v3_msi_capability_write(struct pci_device *dev, uint_t reg_num, uint_t lengt V3_ASSERT(dev->msi_handle > 0 && dev->msi_handle <= MSI_DEV_COUNT && state->msi[dev->msi_handle - 1].dev == dev); int index = dev->msi_handle - 1; - struct msi_desc *md = &state->msi[index]; + V3_ASSERT(state->msi[index].f.msi); + struct msi_dev_data *mdd = &state->msi[index]; + struct msi_desc *md = &mdd->m; if(reg_num < MSI_CAP_CTL(md->cap)) { PrintError("MSI: Write to read-only capability area!\n"); @@ -332,6 +476,129 @@ int v3_msi_capability_write(struct pci_device *dev, uint_t reg_num, uint_t lengt return 0; } +int v3_msix_capability_write(struct pci_device *dev, uint_t reg_num, uint_t length, void *private_data) { + V3_ASSERT(private_data); + struct msi_router_state *state = (struct msi_router_state*)private_data; + + V3_ASSERT(dev->msi_handle > 0 && dev->msi_handle <= MSI_DEV_COUNT && state->msi[dev->msi_handle - 1].dev == dev); + + int index = dev->msi_handle - 1, i; + V3_ASSERT(state->msi[index].f.msix); + struct msi_dev_data *mdd = &state->msi[index]; + struct msix_desc *mx = &mdd->mx; + + if(reg_num < MSIX_CAP_CTL(mx->cap_offset) || reg_num + length > MSIX_CAP_CTL(mx->cap_offset) + 2) { + PrintError("MSI-X: Write to read-only capability area!\n"); + return -1; + } + + dev->irq_type = mx->cap->control.enable ? 
PCI_IRQ_MSIX : PCI_IRQ_INTX; + PrintDebug("MSI-X: device %s: write to capability, new control %04x (enable %d func_mask %d) new IRQ type %d\n", + dev->name, mx->cap->control.val, mx->cap->control.enable, mx->cap->control.func_mask, + dev->irq_type); + + if(mx->cap->control.enable) { + for(i = 0; i < mx->cap->control.tbl_size + 1; ++i) { + if(!mx->table[i].ctl.mask) { + state->vector_to_index[mx->table[i].data.vector] = index; + } + } + } else { + for(i = 0; i < mx->cap->control.tbl_size + 1; ++i) { + if(!mx->table[i].ctl.mask) + state->vector_to_index[mx->table[i].data.vector] = -1; + } + } + return 0; +} + +int v3_msix_memory_write(struct pci_device *dev, addr_t guest_addr, void * src, uint_t length, void * private_data) { + V3_ASSERT(private_data); + struct msi_router_state *state = (struct msi_router_state*)private_data; + + V3_ASSERT(dev->msi_handle > 0 && dev->msi_handle <= MSI_DEV_COUNT && state->msi[dev->msi_handle - 1].dev == dev); + + int index = dev->msi_handle - 1; + V3_ASSERT(state->msi[index].f.msix); + struct msi_dev_data *mdd = &state->msi[index]; + struct msix_desc *mx = &mdd->mx; + struct msix_capability *cap = mx->cap; + + int tbl_size = (cap->control.tbl_size + 1); + uint64_t bar_start = (((uint64_t)mx->tbl_bar->base) << 4); + uint64_t bar_offset = cap->tbl_offset << 3, reg_size = tbl_size * sizeof(struct msix_tbl_entry); + uint64_t start1 = bar_start + bar_offset, end1 = bar_start + bar_offset + reg_size - 1; + + //PrintDebug("Device %s: MSI-X Memory write: addr %llx, length %u\n", dev->name, (uint64_t)guest_addr, length); + + if(guest_addr & 3) { + PrintError("MSI-X memory access not aligned!\n"); + return -1; + } + if(length != 4 && length != 8) { + PrintError("MSI-X memory access incorrect length %d!\n", length); + return -1; + } + if(guest_addr >= start1 && guest_addr + length <= end1 + 1) { + // access to MSI-X table + union msix_control *c = &mx->cap->control; + int table_num = (guest_addr - start1) / sizeof(struct msix_tbl_entry); + int 
table_offset = (guest_addr - start1) % sizeof(struct msix_tbl_entry); + struct msix_tbl_entry *table = &mx->table[table_num]; + union msix_vec_ctl old_ctl = table->ctl; + while(length) { + if (table_offset != 12) PrintDebug("Device %s: MSI-X table %d, offset %d write (value %08x).\n", dev->name, table_num, table_offset, *(uint32_t*)src); + switch(table_offset) { + case 0: // message address + table->address_lo = *(union msi_msg_address*)src; + break; + case 4: // message upper address + table->address_hi = *(uint32_t*)src; + if(table->address_hi) { + PrintError("Address high is non-zero (%x), not supported!\n", table->address_hi); + return -1; + } + break; + case 8: // message data + table->data = *(union msi_msg_data*)src; + break; + case 12: { // vector control + table->ctl = *(union msix_vec_ctl*)src; + break; + } + default: + PrintError("Impossible MSI-X table entry offset %d!\n", table_offset); + return -1; + } + if(table_offset != 12 && !table->ctl.mask) { + PrintError("Writing to MSI-X table while vector is enabled and not masked!\n"); + return -1; + } + length -= 4; + table_offset += 4; + src = (void*)((addr_t)src + 4); + } + if(old_ctl.mask && !table->ctl.mask) { + // unmasked right now + //PrintDebug("Device %s: MSI-X interrupt %x unmasked now!\n", dev->name, table->data.vector); + if(msi_check_msg_address_and_data(&table->address_lo, &table->data) != 0) return -1; + mx->vector_to_entry[table->data.vector] = table_num; + } else if(!old_ctl.mask) { + // masked right now + mx->vector_to_entry[table->data.vector] = -1; + state->vector_to_index[table->data.vector] = -1; + } + if(old_ctl.mask && !table->ctl.mask && c->enable) { + // unmasked right now and fully enabled + state->vector_to_index[table->data.vector] = index; + } + } else { + PrintError("Impossible access at %p (BAR %p)!\n", (void*)guest_addr, (void*)bar_start); + return -1; + } + + return 0; +} static int msi_raise_intr(struct v3_vm_info *vm, void *private_data, struct irq_data irq_data) { struct 
msi_router_state *state = (struct msi_router_state*)private_data; @@ -348,34 +615,55 @@ static int msi_raise_intr(struct v3_vm_info *vm, void *private_data, struct irq_ return -1; } - struct msi_desc *md = &state->msi[state->vector_to_index[irq]]; + struct msi_dev_data *mdd = &state->msi[state->vector_to_index[irq]]; + struct pci_device *dev = mdd->dev; + union msi_msg_data *data; + union msi_msg_address *addr; - if(!md->control->enable) { - PrintDebug("MSI vector 0x%x is known, but disabled.\n", irq); - return -1; - } + if(dev->irq_type == PCI_IRQ_MSI) { + struct msi_desc *md = &mdd->m; + if(!md->control->enable) { + PrintError("Error while raising MSI vector %x: MSI is disabled.\n", irq); + return -1; + } + + V3_ASSERT(irq >= md->vector && irq < md->vector + md->vec_count); - V3_ASSERT(irq >= md->vector && irq <= md->vector + md->vec_count); + if(md->control->mask_cap) { + // if masked, setup pending bit. + uint32_t irq_bit = ((uint32_t)1 << (uint32_t)(irq - md->vector)); + if(*md->mask_bits & irq_bit) { + PrintDebug("MSI vector 0x%x is masked, setting pending bit.\n", irq); + *md->pending_bits |= irq_bit; + return 0; + } + } - if(md->control->mask_cap) { - // if masked, setup pending bit. 
- uint32_t irq_bit = (1 << (uint32_t)irq); - if(*md->mask_bits & irq_bit) { - PrintDebug("MSI vector 0x%x is masked, setting pending bit.\n", irq); - *md->pending_bits |= irq_bit; - return 0; + data = md->msg_data; + addr = md->msg_address_lo; + } else if(dev->irq_type == PCI_IRQ_MSIX) { + struct msix_desc *mx = &mdd->mx; + if(!mx->cap->control.enable) { + PrintError("Error while raising MSI-X vector %x: MSI-X is disabled.\n", irq); + return -1; } + int table_num = mx->vector_to_entry[irq]; + V3_ASSERT(table_num != -1); + V3_ASSERT(mx->table[table_num].data.vector == irq); + data = &mx->table[table_num].data; + addr = &mx->table[table_num].address_lo; + } else { + PrintError("Bad device IRQ type %d!\n", dev->irq_type); + return -1; } struct v3_gen_ipi ipi; - - ipi.vector = irq; - ipi.mode = md->msg_data->del_mode; - ipi.logical = md->dest_mode; - ipi.trigger_mode = md->msg_data->trigger; - ipi.dst = md->msg_address_lo->dest_cpu; + ipi.mode = data->del_mode; + ipi.logical = MSI_DEST_MODE(addr); + ipi.trigger_mode = data->trigger; + ipi.dst = addr->dest_cpu; ipi.dst_shorthand = 0; //PrintDebug("MSI router signalling APIC to raise vector %x. Current pcpu_id %d, " @@ -399,19 +687,28 @@ static int msi_lower_intr(struct v3_vm_info *vm, void *private_data, int irq) { return -1; } - struct msi_desc *md = &state->msi[state->vector_to_index[irq]]; + struct msi_dev_data *mdd = &state->msi[state->vector_to_index[irq]]; - if(!md->control->enable) { - PrintDebug("MSI vector 0x%x is known, but disabled.\n", irq); - return -1; - } + struct pci_device *dev = mdd->dev; - V3_ASSERT(irq >= md->vector && irq <= md->vector + md->vec_count); + if(dev->irq_type == PCI_IRQ_MSI) { + struct msi_desc *md = &state->msi[state->vector_to_index[irq]].m; + if(!md->control->enable) { + PrintDebug("MSI vector 0x%x is known, but disabled.\n", irq); + return -1; + } + V3_ASSERT(irq >= md->vector && irq < md->vector + md->vec_count); - if(md->control->mask_cap) { - // clear pending bit. 
- uint32_t irq_bit = (1 << (uint32_t)irq); - *md->pending_bits &= ~irq_bit; + if(md->control->mask_cap) { + // clear pending bit. + uint32_t irq_bit = ((uint32_t)1 << (uint32_t)(irq - md->vector)); + *md->pending_bits &= ~irq_bit; + } + } else if(dev->irq_type == PCI_IRQ_MSIX) { + // XXX + } else { + PrintError("Bad device IRQ type %d!\n", dev->irq_type); + return -1; } return 0; diff --git a/palacios/src/devices/pci_msi_types.h b/palacios/src/devices/pci_msi_types.h index 648fef6..52712b9 100644 --- a/palacios/src/devices/pci_msi_types.h +++ b/palacios/src/devices/pci_msi_types.h @@ -24,6 +24,8 @@ #include +#define PCI_CAP_ID_MSI 0x05 +#define PCI_CAP_ID_MSIX 0x11 union msi_control { uint16_t val; @@ -58,8 +60,6 @@ union msi_msg_data { }; } __attribute__((packed)); - - #define MSI_CAP_CTL(cap) ((cap) + 2) #define MSI_CAP_ADDR_LO(cap) ((cap) + 4) #define MSI_CAP_ADDR_HI(cap, is_64) ((is_64) ? (cap) + 8 : 0) @@ -67,6 +67,54 @@ union msi_msg_data { #define MSI_CAP_MASK_BITS(cap, is_64) ((is_64) ? (cap) + 0x10 : (cap) + 0xc) #define MSI_CAP_PEND_BITS(cap, is_64) ((is_64) ? 
(cap) + 0x14 : (cap) + 0x10) +#define MSI_DEST_LOGICAL(msg_address_ptr) ((msg_address_ptr)->redir_cpu && (msg_address_ptr)->dest_mode) +#define MSI_DEST_PHYSICAL(msg_address_ptr) (!(msg_address_ptr)->redir_cpu || !(msg_address_ptr)->dest_mode) +#define MSI_DEST_MODE(msg_address_ptr) MSI_DEST_LOGICAL(msg_address_ptr) + +union msix_control { + uint16_t val; + struct { + uint16_t tbl_size: 11; // RO, N - 1 + uint16_t rsv1: 3; + uint16_t func_mask: 1; + uint16_t enable: 1; + }; +} __attribute__((packed)); + +struct msix_capability { + uint8_t cap_id; + uint8_t next; + union msix_control control; + // following fields are all read-only + struct { + uint32_t tbl_bir: 3; + uint32_t tbl_offset: 29; + }; + struct { + uint32_t pba_bir: 3; + uint32_t pba_offset: 29; + }; +} __attribute__((packed)); + +union msix_vec_ctl { + uint32_t val; + struct { + uint32_t mask: 1; + uint32_t rsv1: 31; + }; +} __attribute__((packed)); + +struct msix_tbl_entry { + union msi_msg_address address_lo; + uint32_t address_hi; + union msi_msg_data data; + uint16_t pad; + union msix_vec_ctl ctl; +} __attribute__((packed)); + +#define MSIX_CAP_CTL(cap) ((cap) + 2) +#define MSIX_PBA_BIT(pba, num) (int)((((uint64_t*)(pba))[(num) / 64] >> ((num) & 63)) & 1) + #endif #endif diff --git a/palacios/src/devices/pci_passthrough.c b/palacios/src/devices/pci_passthrough.c index 262f2b7..a6e5bdf 100644 --- a/palacios/src/devices/pci_passthrough.c +++ b/palacios/src/devices/pci_passthrough.c @@ -72,8 +72,6 @@ #define PCI_CB_CAPABILITY_LIST 0x14 #define PCI_CAP_LIST_NEXT 1 -#define PCI_CAP_ID_MSI 0x05 -#define PCI_CAP_ID_MSIX 0x11 union pci_addr_reg { @@ -108,26 +106,51 @@ struct pt_bar { */ uint64_t addr; + struct { + uint8_t msix: 1; // true if MSI-X memory is located inside this BAR. 
+ } flags; + uint32_t val; }; // NOTE: see Intel manual 3A, 10.11 on APIC MSI generation (address and data format) struct msi_pt_data { - uint8_t cap; - int cap_len; + // data related to MSI + struct { + uint8_t cap; + int cap_len; + + union msi_control *control; // points to guest config_space + union msi_msg_address *msg_address_lo; // points to guest config_space + union msi_msg_data *msg_data; // points to guest config_space + + union msi_msg_address msg_address_lo_orig; + union msi_msg_data msg_data_orig; + } m; - union msi_control *control; // points to guest config_space - union msi_msg_address *msg_address_lo; // points to guest config_space - union msi_msg_data *msg_data; // points to guest config_space + // data related to MSI-X + struct { + uint8_t cap_offset; + struct msix_capability *cap; + + // following table and pba are passed to MSI router + struct msix_tbl_entry *table, *orig_table; + uint64_t *pba; + + // these point to real device's table and pba (virtual addresses). + // do not access using structures unions etc. + void *hw_table; + void *hw_pba; - union msi_msg_address msg_address_lo_orig; - union msi_msg_data msg_data_orig; + } mx; void *router_private; // msi_router private data }; + // XXX: where can we put this shared data? +static int irqs_hooked[MAX_IRQ]; static int dev_vcpus[MSI_DEV_COUNT]; // indexed by pci_dev->msi_handle - 1, contains current VCPU for MSI (or -1). Length is MSI_DEV_COUNT. static int target_vcpus[V3_CONFIG_MAX_CPUS]; // for each MSI-targeted VCPU, +1. Length is V3_CONFIG_MAX_CPUS. 
@@ -215,6 +238,28 @@ static inline int pci_cfg_write(uint32_t addr, int len, uint32_t value) return 0; } +static inline uint32_t read32(void *addr) { + uint32_t val = *(volatile uint32_t*)addr; + //PrintDebug("Reading from %p value 0x%08x\n", addr, val); + return val; +} + +static inline void write32(void *addr, uint32_t val) { + //PrintDebug("Writing to %p value 0x%08x\n", addr, val); + *(volatile uint32_t*)addr = val; +} + +#define MSIX_HWTABLE_READ_ADDR_LO(base, table_index) read32((void*)((addr_t)(base) + (addr_t)(table_index) * 16 + 0)) +#define MSIX_HWTABLE_READ_ADDR_HI(base, table_index) read32((void*)((addr_t)(base) + (addr_t)(table_index) * 16 + 4)) +#define MSIX_HWTABLE_READ_DATA(base, table_index) read32((void*)((addr_t)(base) + (addr_t)(table_index) * 16 + 8)) +#define MSIX_HWTABLE_READ_VEC_CTL(base, table_index) read32((void*)((addr_t)(base) + (addr_t)(table_index) * 16 + 12)) + +#define MSIX_HWTABLE_WRITE_ADDR_LO(base, table_index, val) write32((void*)((addr_t)(base) + (addr_t)(table_index) * 16 + 0), val) +#define MSIX_HWTABLE_WRITE_ADDR_HI(base, table_index, val) write32((void*)((addr_t)(base) + (addr_t)(table_index) * 16 + 4), val) +#define MSIX_HWTABLE_WRITE_DATA(base, table_index, val) write32((void*)((addr_t)(base) + (addr_t)(table_index) * 16 + 8), val) +#define MSIX_HWTABLE_WRITE_VEC_CTL(base, table_index, val) write32((void*)((addr_t)(base) + (addr_t)(table_index) * 16 + 12), val) + + static int pci_exp_rom_init(struct vm_device * dev, struct pt_dev_state * state) { struct pci_device * pci_dev = state->pci_dev; const uint32_t exp_rom_base_reg = 12; @@ -524,7 +569,10 @@ static int pt_io_write(struct guest_info * core, uint16_t port, void * src, uint - +static int msix_set_bar_hooks(struct vm_device *dev, + int bar_index, uint64_t bar_offset, uint64_t reg_size); +static int msix_unset_bar_hooks(struct vm_device *dev, + int bar_index, uint64_t bar_offset, uint64_t reg_size); static int pci_bar_write(int bar_num, uint32_t * src, void * 
private_data) { struct vm_device * dev = (struct vm_device *)private_data; @@ -625,17 +673,23 @@ static int pci_bar_write(int bar_num, uint32_t * src, void * private_data) { } else if (vbar->type == PT_BAR_MEM64_HI) { struct pt_bar * lo_vbar = &(state->virt_bars[bar_num - 1]); - struct v3_mem_region * old_reg = v3_get_mem_region(dev->vm, V3_MEM_CORE_ANY, vbar->addr); - if (old_reg == NULL) { - // uh oh... - PrintError("Could not find PCI Passthrough memory redirection region (addr=%p)\n", - (void *)(addr_t)vbar->addr); - return -1; - } + if(vbar->flags.msix) { + if(msix_unset_bar_hooks(dev, bar_num, state->msi.mx.cap->tbl_offset << 3, + (state->msi.mx.cap->control.tbl_size + 1) * sizeof(struct msix_tbl_entry)) != 0) return -1; + } else { + struct v3_mem_region * old_reg = v3_get_mem_region(dev->vm, V3_MEM_CORE_ANY, vbar->addr); - // remove old mapping - v3_delete_mem_region(dev->vm, old_reg); + if (old_reg == NULL) { + // uh oh... + PrintError("Could not find PCI Passthrough memory redirection region (addr=%p)\n", + (void *)(addr_t)vbar->addr); + return -1; + } + + // remove old mapping + v3_delete_mem_region(dev->vm, old_reg); + } // We don't set size, because we assume region is less than 4GB @@ -646,18 +700,24 @@ static int pci_bar_write(int bar_num, uint32_t * src, void * private_data) { vbar->addr <<= 32; vbar->addr += lo_vbar->addr; - PrintDebug("Adding pci Passthrough remapping: start=%p, size=%p, end=%p\n", - (void *)(addr_t)vbar->addr, (void *)(addr_t)vbar->size, - (void *)(addr_t)(vbar->addr + vbar->size)); + if(vbar->flags.msix) { + if(msix_set_bar_hooks(dev, bar_num, state->msi.mx.cap->tbl_offset << 3, + (state->msi.mx.cap->control.tbl_size + 1) * sizeof(struct msix_tbl_entry)) != 0) + return -1; + } else { + PrintDebug("Adding pci Passthrough remapping: start=%p, size=%p, end=%p\n", + (void *)(addr_t)vbar->addr, (void *)(addr_t)vbar->size, + (void *)(addr_t)(vbar->addr + vbar->size)); - if (v3_add_shadow_mem(dev->vm, V3_MEM_CORE_ANY, vbar->addr, - 
vbar->addr + vbar->size - 1, pbar->addr) == -1) { + if (v3_add_shadow_mem(dev->vm, V3_MEM_CORE_ANY, vbar->addr, + vbar->addr + vbar->size - 1, pbar->addr) == -1) { - PrintDebug("Fail to insert shadow region (%p, %p) -> %p\n", - (void *)(addr_t)vbar->addr, - (void *)(addr_t)(vbar->addr + vbar->size - 1), - (void *)(addr_t)pbar->addr); - return -1; + PrintDebug("Fail to insert shadow region (%p, %p) -> %p\n", + (void *)(addr_t)vbar->addr, + (void *)(addr_t)(vbar->addr + vbar->size - 1), + (void *)(addr_t)pbar->addr); + return -1; + } } } else { @@ -784,7 +844,7 @@ static inline int cfg_range_hooked(uint_t reg, uint_t length, struct cfg_range_h // cheat the guest and fix its message too to be in physical fixed mode and to arbitrate // passed through lowest priority messages on our own (sending them to one core always). -static int msi_passthrough_fix_guest_message(struct vm_device *dev) { +static int msi_passthrough_fix_guest_message(struct vm_device *dev, union msi_msg_address *addr, union msi_msg_data *data) { struct pt_dev_state *state = (struct pt_dev_state *)dev->private_data; int index = state->pci_dev->msi_handle - 1; @@ -792,24 +852,17 @@ static int msi_passthrough_fix_guest_message(struct vm_device *dev) { struct v3_vm_info *vm = dev->vm; int prev_vcpu, new_vcpu = -1, i; - union msi_control *c = state->msi.control; - union msi_msg_address *addr = state->msi.msg_address_lo; - union msi_msg_data *data = state->msi.msg_data; - - // copy guest data. it will be returned to guest when it reads. 
- state->msi.msg_data_orig = *data; - state->msi.msg_address_lo_orig = *addr; - - if(!c->enable) - return 0; + union msi_msg_data data_orig = *data; + union msi_msg_address address_lo_orig = *addr; + // XXX: locking // update MSI balancing structures prev_vcpu = dev_vcpus[index]; if (prev_vcpu != -1) { --target_vcpus[prev_vcpu]; } - if(addr->redir_cpu && addr->dest_mode) { + if(MSI_DEST_LOGICAL(addr)) { // logical destination; we should fix it // find out good candidate for MSI target VCPU @@ -831,7 +884,7 @@ static int msi_passthrough_fix_guest_message(struct vm_device *dev) { addr->redir_cpu = 0; PrintDebug("MSI PT device %s: guest wanted to use low_pri messages. Original message" " address %x, data %x; fixed address %x, data %x.\n", - state->name, state->msi.msg_address_lo_orig.val, state->msi.msg_data_orig.val, + state->name, address_lo_orig.val, data_orig.val, addr->val, data->val); } else { // physical destination @@ -852,21 +905,18 @@ extern int v3_pcpu_to_apic_id[V3_CONFIG_MAX_CPUS]; static int msi_passthrough_compose_host_message( struct vm_device *dev, - struct msi_info *mi, union msi_msg_address *addr, - union msi_msg_data *data, - union msi_msg_address *phys_addr, - union msi_msg_data *phys_data) { + union msi_msg_address *phys_addr) { struct v3_vm_info *vm = dev->vm; int phys_cpu; - *phys_data = *data; *phys_addr = *addr; - if (mi->dest_mode == MODE_PHYS_DEST) { - if (mi->dest_cpu > vm->num_cores) { - PrintError("MSI PT device %s: Guest tries to use non-existent APIC ID (%x)!\n", dev->name, mi->dest_cpu); + if (MSI_DEST_PHYSICAL(addr)) { + // physical destination + if (addr->dest_cpu > vm->num_cores) { + PrintError("MSI PT device %s: Guest tries to use non-existent APIC ID (%x)!\n", dev->name, addr->dest_cpu); return -1; } phys_cpu = v3_pcpu_to_apic_id[vm->cores[addr->dest_cpu].pcpu_id]; @@ -885,18 +935,19 @@ static int msi_passthrough_compose_host_message( static int cfg_msi_passthrough_capability_read(struct vm_device *dev, uint_t reg_num, void *ptr, 
uint_t length) { struct pt_dev_state * state = (struct pt_dev_state *)dev->private_data; - int msi_cap = state->msi.cap, i; - union msi_control *c = state->msi.control; + int msi_cap = state->msi.m.cap, i; + union msi_control *c = state->msi.m.control; uint32_t pci_addr = state->phys_pci_addr.value & ~0xff; int status = 0; if(reg_num == MSI_CAP_ADDR_LO(msi_cap)) { - *(uint32_t *)ptr = state->msi.msg_address_lo_orig.val; + *(uint32_t *)ptr = state->msi.m.msg_address_lo_orig.val; } else if(reg_num == MSI_CAP_DATA(msi_cap, c->is_64bit_cap)) { - *(uint32_t *)ptr = state->msi.msg_data_orig.val; + *(uint32_t *)ptr = state->msi.m.msg_data_orig.val; } else if(c->mask_cap && reg_num == MSI_CAP_PEND_BITS(msi_cap, c->is_64bit_cap)) { status |= pci_cfg_read(pci_addr | MSI_CAP_PEND_BITS(msi_cap, c->is_64bit_cap), 4, (uint32_t*)ptr); + } else { // read virtualized part of config space for (i = 0; i < length; i++) { @@ -913,28 +964,32 @@ static int cfg_msi_passthrough_capability_read(struct vm_device *dev, uint_t reg static int cfg_msi_passthrough_capability_write(struct vm_device *dev, uint_t reg_num, void *ptr, uint_t length) { struct pt_dev_state * state = (struct pt_dev_state *)dev->private_data; - int msi_cap = state->msi.cap; + int msi_cap = state->msi.m.cap; int status = 0, i; struct msi_info mi; uint32_t pci_addr = state->phys_pci_addr.value & ~0xff; + union msi_msg_address *address = state->msi.m.msg_address_lo, phys_address; + union msi_msg_data *data = state->msi.m.msg_data, phys_data; + union msi_control *c = state->msi.m.control; - if(msi_passthrough_fix_guest_message(dev) != 0) return -1; + // copy guest data. it will be returned to guest when it reads. 
+ state->msi.m.msg_data_orig = *data; + state->msi.m.msg_address_lo_orig = *address; + + if(c->enable && msi_passthrough_fix_guest_message(dev, address, data) != 0) return -1; if(v3_msi_capability_write(state->pci_dev, reg_num, length, state->msi.router_private) != 0) return -1; if(v3_msi_get_info(state->pci_dev, &mi, state->msi.router_private) != 0) return -1; - union msi_msg_address *address = state->msi.msg_address_lo, phys_address; - union msi_msg_data *data = state->msi.msg_data, phys_data; - PrintDebug("MSI PT device %s: MSI router returned info: %s vec 0x%x vec_cnt %d dest_mode %s dest_cpu %x del_mode %s\n", dev->name, mi.enabled ? "enabled" : "disabled", mi.vector, mi.vec_count, mi.dest_mode == MODE_LOGIC_DEST ? "logic" : "phys", mi.dest_cpu, mi.del_mode == DEL_MODE_FIXED ? "fixed" : "low_pri"); - union msi_control *c = state->msi.control; + // note that all checks were already done in v3_msi_capability_write. if(reg_num == MSI_CAP_CTL(msi_cap)) { if(c->enable) { - if(msi_passthrough_compose_host_message(dev, &mi, address, data, &phys_address, &phys_data) != 0) return -1; + if(msi_passthrough_compose_host_message(dev, address, &phys_address) != 0) return -1; for(i = 0; i < mi.vec_count; ++i) { v3_hook_irq(dev->vm, phys_data.vector + i, irq_handler, dev); @@ -952,7 +1007,7 @@ static int cfg_msi_passthrough_capability_write(struct vm_device *dev, uint_t re } } else if(reg_num == MSI_CAP_ADDR_LO(msi_cap)) { if(c->enable) { - if(msi_passthrough_compose_host_message(dev, &mi, address, data, &phys_address, &phys_data) != 0) return -1; + if(msi_passthrough_compose_host_message(dev, address, &phys_address) != 0) return -1; if(c->is_64bit_cap) status |= pci_cfg_write(pci_addr | MSI_CAP_ADDR_HI(msi_cap, 1), 4, 0); @@ -966,7 +1021,7 @@ static int cfg_msi_passthrough_capability_write(struct vm_device *dev, uint_t re } } else if(reg_num == MSI_CAP_DATA(msi_cap, c->is_64bit_cap)) { if(c->enable) { - if(msi_passthrough_compose_host_message(dev, &mi, address, data, 
&phys_address, &phys_data) != 0) return -1; + if(msi_passthrough_compose_host_message(dev, address, &phys_address) != 0) return -1; for(i = 0; i < mi.vec_count; ++i) { v3_hook_irq(dev->vm, phys_data.vector + i, irq_handler, dev); @@ -993,56 +1048,439 @@ static int cfg_msi_passthrough_capability_write(struct vm_device *dev, uint_t re return 0; } +static int cfg_msix_passthrough_capability_write(struct vm_device *dev, uint_t reg_num, void *ptr, uint_t length) { + struct pt_dev_state * state = (struct pt_dev_state *)dev->private_data; + int msix_cap = state->msi.mx.cap_offset; + int status = 0; + uint32_t pci_addr = state->phys_pci_addr.value & ~0xff; + union msix_control *c = &state->msi.mx.cap->control; + + if(v3_msix_capability_write(state->pci_dev, reg_num, length, state->msi.router_private) != 0) return -1; + + + // note that all checks were already done in v3_msix_capability_write. + if(reg_num == MSIX_CAP_CTL(msix_cap)) { + + if(c->enable && !c->func_mask) { + int tbl_size = state->msi.mx.cap->control.tbl_size + 1, i; + void *base = state->msi.mx.hw_table; + for(i = 0; i < tbl_size; ++i) { + union msix_vec_ctl ctl; + ctl.val = MSIX_HWTABLE_READ_VEC_CTL(base, i); + PrintDebug(" Entry %d: addr %08x addr_hi %08x data %08x ctl %08x\n", i, + MSIX_HWTABLE_READ_ADDR_LO(base, i),MSIX_HWTABLE_READ_ADDR_HI(base, i), + MSIX_HWTABLE_READ_DATA(base, i), ctl.val); + if(!ctl.mask) { + int vec = ((union msi_msg_data)(uint16_t)MSIX_HWTABLE_READ_DATA(base, i)).vector; + if(irqs_hooked[vec] == 0) { + v3_hook_irq(dev->vm, ((union msi_msg_data)(uint16_t)MSIX_HWTABLE_READ_DATA(base, i)).vector, irq_handler, dev); + irqs_hooked[vec] = 1; + } + } + } + } + + status |= pci_cfg_write(pci_addr | MSIX_CAP_CTL(msix_cap), sizeof(c->val), c->val); + } else { + PrintError("MSI PT device %s: Unhandled MSI-X access at %x (MSI-X CAP %x)!\n", dev->name, reg_num, msix_cap); + return -1; + } + if(status) { + PrintError("MSI PT device %s: Write to MSI capability failed.\n", dev->name); + return 
status; + } + return 0; +} + +static int msix_memory_read(struct guest_info * core, addr_t guest_addr, void * dst, uint_t length, void * priv_data) { + struct vm_device *dev = (struct vm_device *)priv_data; + struct pt_dev_state *state = (struct pt_dev_state *)dev->private_data; + struct msix_capability *cap = state->msi.mx.cap; + struct pt_bar *vbar = &state->virt_bars[cap->tbl_bir]; + if(vbar->type == PT_BAR_MEM64_LO) + vbar = &state->virt_bars[cap->tbl_bir + 1]; + + uint64_t bar_offset = cap->tbl_offset << 3, reg_size = (cap->control.tbl_size + 1) * sizeof(struct msix_tbl_entry); + uint64_t start1 = vbar->addr + bar_offset, end1 = vbar->addr + bar_offset + reg_size - 1; + + if(guest_addr & 3) { + PrintError("MSI-X memory access not aligned!\n"); + return -1; + } + if(length != 4 && length != 8) { + PrintError("MSI-X memory access incorrect length %d!\n", length); + return -1; + } + if(guest_addr >= start1 && guest_addr + length <= end1 + 1) { + int table_num = (guest_addr - start1) / sizeof(struct msix_tbl_entry); + int table_offset = (guest_addr - start1) % sizeof(struct msix_tbl_entry); + PrintDebug("MSI-X table %d, offset %d read.\n", table_num, table_offset); + struct msix_tbl_entry *otable = &state->msi.mx.orig_table[table_num]; + while(length) { + switch(table_offset) { + case 0: + *(uint32_t*)dst = *(uint32_t*)&otable->address_lo; + break; + case 4: + *(uint32_t*)dst = *(uint32_t*)&otable->address_hi; + break; + case 8: + *(uint32_t*)dst = *(uint32_t*)&otable->data; + break; + case 12: + *(uint32_t*)dst = *(uint32_t*)&otable->ctl; + break; + default: + PrintError("Impossible MSI-X table entry offset %d!\n", table_offset); + return -1; + } + length -= 4; + table_offset += 4; + dst = (void*)((addr_t)dst + 4); + } + } else { + PrintError("Incorrect access address!\n"); + return -1; + } + return 0; +} + +static int msix_memory_write(struct guest_info * core, addr_t guest_addr, void * src, uint_t length, void * priv_data) { + struct vm_device *dev = (struct 
vm_device *)priv_data; + struct pt_dev_state *state = (struct pt_dev_state *)dev->private_data; + struct msix_capability *cap = state->msi.mx.cap; + struct pt_bar *vbar = &state->virt_bars[cap->tbl_bir]; + if(vbar->type == PT_BAR_MEM64_LO) + vbar = &state->virt_bars[cap->tbl_bir + 1]; + + uint64_t bar_offset = cap->tbl_offset << 3, reg_size = (cap->control.tbl_size + 1) * sizeof(struct msix_tbl_entry); + uint64_t start1 = vbar->addr + bar_offset, end1 = vbar->addr + bar_offset + reg_size - 1; + + if(guest_addr & 3) { + PrintError("MSI-X memory access not aligned!\n"); + return -1; + } + if(length != 4 && length != 8) { + PrintError("MSI-X memory access incorrect length %d!\n", length); + return -1; + } + if(guest_addr >= start1 && guest_addr + length <= end1 + 1) { + // access to MSI-X table + union msix_control *c = &state->msi.mx.cap->control; + int table_num = (guest_addr - start1) / sizeof(struct msix_tbl_entry); + int table_offset = (guest_addr - start1) % sizeof(struct msix_tbl_entry); + struct msix_tbl_entry *table = &state->msi.mx.table[table_num]; + struct msix_tbl_entry *otable = &state->msi.mx.orig_table[table_num]; + void *base = state->msi.mx.hw_table; + while(length) { + switch(table_offset) { + case 0: // message address + otable->address_lo = table->address_lo = *(union msi_msg_address*)src; + if(msi_passthrough_fix_guest_message(dev, &table->address_lo, &table->data) != 0) return -1; + if(v3_msix_memory_write(state->pci_dev, start1 + table_num * 16 + table_offset, src, 4, state->msi.router_private) != 0) return -1; + union msi_msg_address wr_address; + if(msi_passthrough_compose_host_message(dev, &table->address_lo, + &wr_address) != 0) return -1; + MSIX_HWTABLE_WRITE_ADDR_LO(base, table_num, wr_address.val); + break; + case 4: // message upper address + otable->address_hi = table->address_hi = *(uint32_t*)src; + if(v3_msix_memory_write(state->pci_dev, start1 + table_num * 16 + table_offset, src, 4, state->msi.router_private) != 0) return -1; + 
MSIX_HWTABLE_WRITE_ADDR_HI(base, table_num, table->address_hi); + break; + case 8: // message data + otable->data = table->data = *(union msi_msg_data*)src; + if(msi_passthrough_fix_guest_message(dev, &table->address_lo, &table->data) != 0) return -1; + if(v3_msix_memory_write(state->pci_dev, start1 + table_num * 16 + table_offset, src, 4, state->msi.router_private) != 0) return -1; + MSIX_HWTABLE_WRITE_DATA(base, table_num, table->data.val); + break; + case 12: { // vector control + int old_mask = table->ctl.mask; + otable->ctl = *(union msix_vec_ctl*)src; + if(v3_msix_memory_write(state->pci_dev, start1 + table_num * 16 + table_offset, src, 4, state->msi.router_private) != 0) return -1; + if(old_mask && !table->ctl.mask && c->enable && !c->func_mask) { + // interrupt is enabled, need to hook it. + if(irqs_hooked[table->data.vector] == 0) { + v3_hook_irq(dev->vm, table->data.vector, irq_handler, dev); + irqs_hooked[table->data.vector] = 1; + } + } + + MSIX_HWTABLE_WRITE_VEC_CTL(base, table_num, table->ctl.val); + break; + } + default: + PrintError("Impossible MSI-X table entry offset %d!\n", table_offset); + return -1; + } + if(c->enable && !c->func_mask && table_offset != 12 && !table->ctl.mask) { + PrintError("Writing to MSI-X table while vector is enabled and not masked!\n"); + return -1; + } + length -= 4; + table_offset += 4; + src = (void*)((addr_t)src + 4); + } + + } else { + PrintError("Incorrect access address!\n"); + return -1; + } + + return 0; +} + +static int msix_unset_bar_hooks(struct vm_device *dev, + int bar_index, uint64_t bar_offset, uint64_t reg_size) { + struct pt_dev_state *state = (struct pt_dev_state *)dev->private_data; + struct v3_vm_info * vm = dev->vm; + if(state->phys_bars[bar_index].type == PT_BAR_MEM64_LO) + ++bar_index; + struct pt_bar *vbar = &state->virt_bars[bar_index]; + + /* + PCI 2.3 specification: + "If a Base Address register that maps address space for the MSI-X Table or MSI-X PBA + also maps other usable address space that 
is not associated with MSI-X structures, + locations (e.g., for CSRs) used in the other address space must not share any naturally + aligned 4 KB address range with one where either MSI-X structure resides." + */ + + + struct v3_mem_region *old_reg; + uint64_t start0 = vbar->addr, end0 = vbar->addr + vbar->size - 1; + uint64_t start1 = vbar->addr + bar_offset, end1 = vbar->addr + bar_offset + reg_size - 1; + PrintDebug("Device %s: BAR %d (%llx:%llx), removing MSI-X memory hooks (%llx (aligned %llx):%llx)\n", + state->name, bar_index, start0, end0, start1, start1 & ~0xfff, end1); + start1 &= ~0xfff; + if(!(start1 >= start0 && start1 < end0 && end1 <= end0 && ((start1 - start0) & 0xfff) == 0)) { + PrintError("Something is wrong with ranges.\n"); + return -1; + } + if(start1 != start0) { + old_reg = v3_get_mem_region(vm, V3_MEM_CORE_ANY, start0); + if(old_reg == NULL) { + PrintError("Cannot find old memory region (addr = %llx)!\n", start0); + return -1; + } + v3_delete_mem_region(vm, old_reg); + + } + if(v3_unhook_mem(vm, V3_MEM_CORE_ANY, start1) != 0) { + PrintError("Failed to unhook MSI-X memory.\n"); + return -1; + } + + if((end0 - end1) & ~0xfff) { + old_reg = v3_get_mem_region(vm, V3_MEM_CORE_ANY, end1 + 1); + if(old_reg == NULL) { + PrintError("Cannot find old memory region (addr = %llx)!\n", end1 + 1); + return -1; + } + v3_delete_mem_region(vm, old_reg); + } + return 0; +} + +static int msix_set_bar_hooks(struct vm_device *dev, + int bar_index, uint64_t bar_offset, uint64_t reg_size) { + + struct v3_vm_info * vm = dev->vm; + struct pt_dev_state *state = (struct pt_dev_state *)dev->private_data; + if(state->phys_bars[bar_index].type == PT_BAR_MEM64_LO) + ++bar_index; + struct pt_bar *vbar = &state->virt_bars[bar_index]; + struct pt_bar *pbar = &state->phys_bars[bar_index]; + + /* + PCI 2.3 specification: + "If a Base Address register that maps address space for the MSI-X Table or MSI-X PBA + also maps other usable address space that is not associated with MSI-X 
structures, + locations (e.g., for CSRs) used in the other address space must not share any naturally + aligned 4 KB address range with one where either MSI-X structure resides." + */ + + + uint64_t start0 = vbar->addr, end0 = vbar->addr + vbar->size - 1; + uint64_t start1 = vbar->addr + bar_offset, end1 = vbar->addr + bar_offset + reg_size - 1; + PrintDebug("Device %s: BAR %d (%llx:%llx), injecting MSI-X memory hook (%llx (aligned %llx):%llx)\n", + state->name, bar_index, start0, end0, start1, start1 & ~0xfff, end1); + start1 &= ~0xfff; + if(!(start1 >= start0 && start1 < end0 && end1 <= end0 && ((start1 - start0) & 0xfff) == 0)) { + PrintError("Something is wrong with ranges.\n"); + return -1; + } + if(start1 != start0) { + if(v3_add_shadow_mem(vm, V3_MEM_CORE_ANY, start0, start1, pbar->addr) != 0) { + PrintError("Failed to add shadow mem for BAR!\n"); + return -1; + } + } + if(v3_hook_full_mem(vm, V3_MEM_CORE_ANY, start1, end1 + 1, msix_memory_read, msix_memory_write, dev) != 0) { + PrintError("Failed to hook MSI-X memory.\n"); + return -1; + } + if((end0 - end1) & ~0xfff) { + if(v3_add_shadow_mem(vm, V3_MEM_CORE_ANY, end1 + 1, end0 + 1, pbar->addr + bar_offset + reg_size) != 0) { + PrintError("Failed to add shadow mem for BAR!\n"); + return -1; + } + } + return 0; +} -static int setup_msi_passthrough(struct v3_vm_info * vm_info, struct pt_dev_state *state) +static int setup_msi_passthrough(struct vm_device * vm_dev, struct pt_dev_state *state) { + struct v3_vm_info * vm_info = vm_dev->vm; struct pci_device *dev = state->pci_dev; uint8_t unused; dev->msi_handle = 0; + state->msi.router_private = v3_pci_get_msi_router(state->pci_bus); // search for PCI MSI capability and remember its offset. 
int msi_cap = pci_find_capability_and_pointer(dev, PCI_CAP_ID_MSI, &unused); - if(!msi_cap) - return 0; + if(msi_cap) { - int msi_cap_len = 10; // Minimal MSI capability size - state->msi.control = (union msi_control*)&dev->config_space[MSI_CAP_CTL(msi_cap)]; + int msi_cap_len = 10; // Minimal MSI capability size + state->msi.m.control = (union msi_control*)&dev->config_space[MSI_CAP_CTL(msi_cap)]; - if(state->msi.control->is_64bit_cap) - msi_cap_len += 4; - if(state->msi.control->mask_cap) - msi_cap_len += 10; + if(state->msi.m.control->is_64bit_cap) + msi_cap_len += 4; + if(state->msi.m.control->mask_cap) + msi_cap_len += 10; - PrintDebug("MSI PT device %s: detected MSI capability at offset %02x, length %x, control %04x\n", - state->name, msi_cap, msi_cap_len, state->msi.control->val); + PrintDebug("MSI PT device %s: detected MSI capability at offset %02x, length %x, control %04x\n", + state->name, msi_cap, msi_cap_len, state->msi.m.control->val); - // get MSI router - state->msi.router_private = v3_pci_get_msi_router(state->pci_bus); - if(!state->msi.router_private) { - PrintError("MSI PT device %s: MSI capability passed through, but no MSI router found!\n", state->name); - return -1; - } + // get MSI router + if(!state->msi.router_private) { + PrintError("MSI PT device %s: MSI capability passed through, but no MSI router found!\n", state->name); + return -1; + } - if(v3_msi_register_device(dev, msi_cap, state->msi.router_private) != 0) { - PrintError("MSI PT device %s: Failed to register MSI-capable passthrough device in MSI router.\n", state->name); - return -1; + if(v3_msi_register_device(dev, msi_cap, state->msi.router_private) != 0) { + PrintError("MSI PT device %s: Failed to register MSI-capable passthrough device in MSI router.\n", state->name); + return -1; + } + + if(cfg_range_hook_add(msi_cap, (msi_cap_len & 3) == 0 ? 
msi_cap_len : ((msi_cap_len & ~3) + 4), + cfg_msi_passthrough_capability_read, cfg_msi_passthrough_capability_write, state) != 0) return -1; + + state->msi.m.cap = msi_cap; + state->msi.m.cap_len = msi_cap_len; + state->msi.m.control = (union msi_control*)&dev->config_space[MSI_CAP_CTL(msi_cap)]; + state->msi.m.msg_address_lo = (union msi_msg_address*)&dev->config_space[MSI_CAP_ADDR_LO(msi_cap)]; + state->msi.m.msg_data = (union msi_msg_data*)&dev->config_space[MSI_CAP_DATA(msi_cap, state->msi.m.control->is_64bit_cap)]; + + int i; + for (i = 0; i < MSI_DEV_COUNT; ++i) { + // XXX: this data is shared, need to place it somewhere + dev_vcpus[i] = -1; + } } - if(cfg_range_hook_add(msi_cap, (msi_cap_len & 3) == 0 ? msi_cap_len : ((msi_cap_len & ~3) + 4), - cfg_msi_passthrough_capability_read, cfg_msi_passthrough_capability_write, state) != 0) return -1; + int msix_cap = pci_find_capability_and_pointer(dev, PCI_CAP_ID_MSIX, &unused); + if(msix_cap) { + struct msix_capability *cap = (struct msix_capability *)&dev->config_space[msix_cap]; + int tbl_size = cap->control.tbl_size + 1; + PrintDebug("MSI PT device %s: detected MSI-X capability at offset %02x, control %04x (table size %d), " + "table BIR %d, table offset %x, PBA BIR %d, PBA offset %x\n", + state->name, msix_cap, cap->control.val, tbl_size, cap->tbl_bir, + cap->tbl_offset << 3, cap->pba_bir, cap->pba_offset << 3); + if(!state->msi.router_private) { + PrintError("MSI PT device %s: MSI-X capability passed through, but no MSI router found!\n", state->name); + return -1; + } + // need to allocate resources for table and PBA. + int pba_qw_count = tbl_size / 64 + (tbl_size % 64 ? 
1 : 0); + struct msix_tbl_entry *table = (struct msix_tbl_entry *)V3_Malloc(tbl_size * sizeof(*table)); + struct msix_tbl_entry *orig_table = (struct msix_tbl_entry *)V3_Malloc(tbl_size * sizeof(*table)); + uint64_t *pba = (uint64_t*)V3_Malloc(pba_qw_count * sizeof(*pba)); + V3_ASSERT(table && pba && orig_table); + + struct pt_bar *tbl_bar = &state->phys_bars[cap->tbl_bir]; + struct pt_bar *pba_bar = &state->phys_bars[cap->pba_bir]; + if(tbl_bar->type == PT_BAR_MEM64_LO) + tbl_bar = &state->phys_bars[cap->tbl_bir + 1]; + if(pba_bar->type == PT_BAR_MEM64_LO) + pba_bar = &state->phys_bars[cap->pba_bir + 1]; + if(tbl_bar->type != PT_BAR_MEM32 && tbl_bar->type != PT_BAR_MEM64_HI) { + PrintError("MSI PT device %s: table BIR points to bad BAR type %d!\n", state->name, tbl_bar->type); + return -1; + } + if(pba_bar->type != PT_BAR_MEM32 && pba_bar->type != PT_BAR_MEM64_HI) { + PrintError("MSI PT device %s: PBA BIR points to bad BAR type %d!\n", state->name, pba_bar->type); + return -1; + } + struct msix_tbl_entry *hw_table = (struct msix_tbl_entry *)(tbl_bar->addr + (cap->tbl_offset << 3)); + addr_t hw_table_end = (addr_t)hw_table + tbl_size * sizeof(*hw_table); + uint64_t *hw_pba = (uint64_t*)(pba_bar->addr + (cap->pba_offset << 3)); + addr_t hw_pba_end = (addr_t)hw_pba + pba_qw_count * sizeof(*hw_pba); + PrintDebug("MSI PT device %s: MSI-X table at BAR %d (%llx:%llx), location %p:%p, " + "PBA at BAR %d (%llx:%llx), location %p:%p\n", state->name, cap->tbl_bir, + tbl_bar->addr, tbl_bar->addr + tbl_bar->size, hw_table, (void*)hw_table_end, + cap->pba_bir, pba_bar->addr, pba_bar->addr + pba_bar->size, hw_pba, (void*)hw_pba_end); + V3_ASSERT(tbl_bar->addr <= (addr_t)hw_table && hw_table_end <= tbl_bar->addr + tbl_bar->size); + V3_ASSERT(pba_bar->addr <= (addr_t)hw_pba && hw_pba_end <= pba_bar->addr + pba_bar->size); + + // some hardware has mask bits not set up. this may confuse our code later and this is wrong. so we fix them. 
+ int i; + void *base = V3_VAddr(hw_table); + for(i = 0; i < tbl_size; ++i) { + union msix_vec_ctl vecc; + vecc.val = MSIX_HWTABLE_READ_VEC_CTL(base, i); + if(!vecc.mask) { + PrintError("MSI PT device %s: table entry %d, vector is not masked! Mask written to hardware.\n", state->name, i); + vecc.mask = 1; + MSIX_HWTABLE_WRITE_VEC_CTL(base, i, vecc.val); + } + } - state->msi.cap = msi_cap; - state->msi.cap_len = msi_cap_len; - state->msi.control = (union msi_control*)&dev->config_space[MSI_CAP_CTL(msi_cap)]; - state->msi.msg_address_lo = (union msi_msg_address*)&dev->config_space[MSI_CAP_ADDR_LO(msi_cap)]; - state->msi.msg_data = (union msi_msg_data*)&dev->config_space[MSI_CAP_DATA(msi_cap, state->msi.control->is_64bit_cap)]; + // copy full table data + for(i = 0; i < tbl_size; ++i) { + table[i].address_lo.val = MSIX_HWTABLE_READ_ADDR_LO(base, i); + table[i].address_hi = MSIX_HWTABLE_READ_ADDR_HI(base, i); + table[i].data.val = (uint16_t)MSIX_HWTABLE_READ_DATA(base, i); + table[i].ctl.val = MSIX_HWTABLE_READ_VEC_CTL(base, i); + } + memcpy(orig_table, table, sizeof(struct msix_tbl_entry) * tbl_size); + for(i = 0; i < pba_qw_count; ++i) { + pba[i] = read32((void*)((addr_t)V3_VAddr(hw_pba) + i * 8)); + pba[i] |= (uint64_t)read32((void*)((addr_t)V3_VAddr(hw_pba) + i * 8 + 4)) << 32; + } - int i; - for (i = 0; i < MSI_DEV_COUNT; ++i) { - // XXX: this data is shared, need to place it somewhere - dev_vcpus[i] = -1; + if(v3_msix_register_device(dev, msix_cap, table, pba, state->msi.router_private) != 0) { + PrintError("MSI PT device %s: Failed to register MSI-X-capable passthrough device in MSI router.\n", state->name); + return -1; + } + + // hook capability. 
+ if(cfg_range_hook_add(msix_cap, sizeof(struct msix_capability), + cfg_range_read_virtual, cfg_msix_passthrough_capability_write, state) != 0) return -1; + // rehook BARs + struct v3_mem_region *old_reg; + struct pt_bar *vbar = &state->virt_bars[cap->tbl_bir]; + if(vbar->type == PT_BAR_MEM64_LO) + vbar = &state->virt_bars[cap->tbl_bir + 1]; + old_reg = v3_get_mem_region(vm_info, V3_MEM_CORE_ANY, vbar->addr); + if (old_reg == NULL) { + PrintError("Could not find PCI Passthrough memory redirection region (addr=0x%x)\n", (uint32_t)vbar->addr); + return -1; + } + v3_delete_mem_region(vm_info, old_reg); + if(msix_set_bar_hooks(vm_dev, cap->tbl_bir, cap->tbl_offset << 3, tbl_size * sizeof(struct msix_tbl_entry))) return -1; + + tbl_bar->flags.msix = vbar->flags.msix = 1; + + state->msi.mx.pba = pba; + state->msi.mx.hw_pba = V3_VAddr(hw_pba); + state->msi.mx.table = table; + state->msi.mx.orig_table = orig_table; + state->msi.mx.hw_table = V3_VAddr(hw_table); + state->msi.mx.cap_offset = msix_cap; + state->msi.mx.cap = cap; } return 0; @@ -1342,7 +1780,7 @@ static int setup_virt_pci_dev(struct v3_vm_info * vm_info, struct vm_device * de cfg = v3_cfg_next_branch(cfg); } - if(setup_msi_passthrough(vm_info, state)) return -1; + if(setup_msi_passthrough(dev, state)) return -1; if(v3_sym_map_pci_passthrough(vm_info, pci_dev->bus_num, pci_dev->dev_num, pci_dev->fn_num)) return -1; -- 1.7.5.4