From 147383d4878f9e1b2446b56c4831f2c759b93f22 Mon Sep 17 00:00:00 2001
From: Alexander Kudryavtsev
Date: Fri, 23 Sep 2011 23:13:44 +0400
Subject: [PATCH 05/32] PCI MSI passthrough support

---
 palacios/src/devices/pci_passthrough.c |  416 ++++++++++++++++++++++++++++++--
 1 files changed, 399 insertions(+), 17 deletions(-)

diff --git a/palacios/src/devices/pci_passthrough.c b/palacios/src/devices/pci_passthrough.c
index 050cd02..ba12e34 100644
--- a/palacios/src/devices/pci_passthrough.c
+++ b/palacios/src/devices/pci_passthrough.c
@@ -40,6 +40,9 @@
 #include
 #include
+#include /* NOTE(review): header name is missing in this copy of the patch; restore it from the original commit */
+
+#include "pci_msi_types.h"
 
 #define MAX_PASSTHROUGH_DEVICES 64
 
 
@@ -108,6 +111,27 @@ struct pt_bar {
     uint32_t val;
 };
 
+// Per-device state for passing the MSI capability of a physical PCI device through to the guest.
+// NOTE: see Intel manual 3A, 10.11 on APIC MSI generation (address and data format)
+struct msi_pt_data {
+    uint8_t cap;      // offset of the MSI capability in config space
+    int cap_len;      // length of the MSI capability in bytes
+
+    union msi_control *control;            // points to guest config_space
+    union msi_msg_address *msg_address_lo; // points to guest config_space
+    union msi_msg_data *msg_data;          // points to guest config_space
+
+    // copies of the guest-written values, returned to the guest on reads
+    union msi_msg_address msg_address_lo_orig;
+    union msi_msg_data msg_data_orig;
+
+    void *router_private; // msi_router private data
+
+};
+
+// XXX: where can we put this shared data?
+static int dev_vcpus[MSI_DEV_COUNT];         // indexed by pci_dev->msi_handle - 1, contains current VCPU for MSI (or -1). Length is MSI_DEV_COUNT.
+static int target_vcpus[V3_CONFIG_MAX_CPUS]; // for each VCPU, number of MSI devices currently targeting it. Length is V3_CONFIG_MAX_CPUS.
 
 struct cfg_range_hook {
     uint_t start;
@@ -134,6 +158,8 @@ struct pt_dev_state {
 
     union pci_addr_reg phys_pci_addr;
 
+    struct msi_pt_data msi;
+
     // ranges which should never be accessed in real PCI config space, only virtual copy. For example, excluded capabilities.
     struct list_head cfg_virtual_ranges;
 
@@ -646,6 +672,35 @@ static int pci_bar_write(int bar_num, uint32_t * src, void * private_data) {
     return 0;
 }
 
+
+// Host interrupt handler for a passed-through device: raises the device's interrupt on the
+// virtual PCI bus (v3_pci_raise_irq) and acknowledges the host interrupt.
+// Returns 0 on success, -1 if the interrupt could not be forwarded.
+static int irq_handler(struct v3_vm_info * vm, struct v3_interrupt * intr, void * private_data) {
+    struct vm_device * dev = (struct vm_device *)private_data;
+    struct pt_dev_state * state = (struct pt_dev_state *)dev->private_data;
+    int ret = 0;
+
+    //PrintError("passthrough host irq %d\n", intr->irq);
+
+    // The vector hint is only passed on when an MSI router is attached to this device;
+    // devices without MSI capability have no router (see setup_msi_passthrough).
+    if(state->msi.router_private != NULL &&
+       v3_msi_set_vector_hint(state->pci_dev, intr->irq, state->msi.router_private) != 0) {
+        ret = -1;
+    }
+
+    if(ret == 0 && v3_pci_raise_irq(state->pci_bus, 0, state->pci_dev) != 0) {
+        ret = -1;
+    }
+
+    // Always acknowledge the host interrupt, even when forwarding failed; returning early
+    // here would leave the host vector unacknowledged.
+    V3_ACK_IRQ(intr->irq);
+
+    return ret;
+}
+
 // find capability position and position of pointer to that capability.
 static int pci_find_capability_and_pointer(struct pci_device *dev, int cap, uint8_t *ptr_pos) {
     if(!(dev->config_header.status & PCI_STATUS_CAP_LIST)) {
@@ -736,6 +791,344 @@ static inline int cfg_range_hooked(uint_t reg, uint_t length, struct cfg_range_h
     return 0;
 }
 
+// This function copies original guest data and fixes it not to use lowest priority messages.
+// The problem is that we can use only physical fixed mode in real MSI hardware (because we do
+// not have access to physical APIC). As a result, if guest enables lowest priority mode, we
+// transform it to physical fixed.
+// If we leave the guest message type as lowest priority, MSI router will send corresponding IPI
+// which will be handled by APIC code and do arbitration. It seems to be more performance efficient to
+// cheat the guest and fix its message too to be in physical fixed mode and to arbitrate
+// passed through lowest priority messages on our own (sending them to one core always).
+//
+// NOTE(review): the "_orig" copies are refreshed from config space on every call, so once the
+// message has been fixed in place a later call records the fixed values as the "original" ones.
+//
+// Returns 0 on success, -1 if no VCPU can be chosen as the new target.
+static int msi_passthrough_fix_guest_message(struct vm_device *dev) {
+
+    struct pt_dev_state *state = (struct pt_dev_state *)dev->private_data;
+    struct v3_vm_info *vm = dev->vm;
+    int index = state->pci_dev->msi_handle - 1;
+    int prev_vcpu, new_vcpu = -1, i;
+
+    union msi_control *c = state->msi.control;
+    union msi_msg_address *addr = state->msi.msg_address_lo;
+    union msi_msg_data *data = state->msi.msg_data;
+
+    V3_ASSERT(index >= 0 && index < MSI_DEV_COUNT);
+
+    // copy guest data. it will be returned to guest when it reads.
+    state->msi.msg_data_orig = *data;
+    state->msi.msg_address_lo_orig = *addr;
+
+    // Release the VCPU this device was accounted to, so that a device whose MSI gets disabled
+    // or retargeted no longer counts towards the load of its previous VCPU.
+    prev_vcpu = dev_vcpus[index];
+    if (prev_vcpu >= 0 && prev_vcpu < V3_CONFIG_MAX_CPUS) {
+        --target_vcpus[prev_vcpu];
+    }
+    dev_vcpus[index] = -1;
+
+    if(!c->enable)
+        return 0;
+
+    if(addr->redir_cpu && addr->dest_mode) {
+        // logical destination; we should fix it
+
+        // pick the least loaded VCPU of this VM (lowest index wins on ties)
+        for(i = 0; i < vm->num_cores && i < V3_CONFIG_MAX_CPUS; ++i) {
+            if(new_vcpu == -1 || target_vcpus[i] < target_vcpus[new_vcpu]) {
+                new_vcpu = i;
+            }
+        }
+
+        if(new_vcpu == -1) {
+            PrintError("MSI PT device %s: no VCPU available as MSI target.\n", state->name);
+            return -1;
+        }
+
+        // so we fix the message to have physical destination, fixed delivery mode.
+        data->del_mode = DEL_MODE_FIXED;
+        addr->dest_cpu = new_vcpu;
+        addr->dest_mode = MODE_PHYS_DEST;
+        addr->redir_cpu = 0;
+        PrintDebug("MSI PT device %s: guest wanted to use low_pri messages. Original message"
+                   " address %x, data %x; fixed address %x, data %x.\n",
+                   state->name, state->msi.msg_address_lo_orig.val, state->msi.msg_data_orig.val,
+                   addr->val, data->val);
+    } else {
+        // physical destination
+        new_vcpu = addr->dest_cpu;
+    }
+
+    // Only account destinations that name an existing VCPU: dest_cpu is guest-controlled and
+    // must not be used as an unchecked index. An invalid destination is reported later by
+    // msi_passthrough_compose_host_message.
+    if(new_vcpu >= 0 && new_vcpu < vm->num_cores && new_vcpu < V3_CONFIG_MAX_CPUS) {
+        dev_vcpus[index] = new_vcpu;
+        ++target_vcpus[new_vcpu];
+    }
+
+    PrintDebug("MSI PT: VCPU load: ");
+    for(i = 0; i < vm->num_cores; ++i) PrintDebug("%d ", target_vcpus[i]);
+    PrintDebug("\n");
+
+    return 0;
+}
+
+extern int v3_pcpu_to_apic_id[V3_CONFIG_MAX_CPUS];
+
+// Builds the message that is programmed into the physical device from the (already fixed)
+// guest message: the guest VCPU id in the destination field is replaced with the APIC id of
+// the physical CPU (pcpu_id) recorded for that VCPU.
+// Returns 0 on success, -1 if the guest message cannot be translated.
+static int msi_passthrough_compose_host_message(
+        struct vm_device *dev,
+        struct msi_info *mi,
+        union msi_msg_address *addr,
+        union msi_msg_data *data,
+        union msi_msg_address *phys_addr,
+        union msi_msg_data *phys_data) {
+
+    struct v3_vm_info *vm = dev->vm;
+    int phys_cpu;
+
+    *phys_data = *data;
+    *phys_addr = *addr;
+
+    if (mi->dest_mode == MODE_PHYS_DEST) {
+        // valid VCPU ids are 0 .. num_cores - 1; check both the router's view and the value
+        // that is actually used as the index below.
+        if (mi->dest_cpu >= vm->num_cores || addr->dest_cpu >= vm->num_cores) {
+            PrintError("MSI PT device %s: Guest tries to use non-existent APIC ID (%x)!\n", dev->name, mi->dest_cpu);
+            return -1;
+        }
+        phys_cpu = v3_pcpu_to_apic_id[vm->cores[addr->dest_cpu].pcpu_id];
+        PrintDebug("MSI PT device %s: message address transformed for physical dest_mode: vcpu id %x to pcpu id %x to real APIC id %x\n",
+                   dev->name, addr->dest_cpu, vm->cores[addr->dest_cpu].pcpu_id, phys_cpu);
+        phys_addr->dest_cpu = phys_cpu;
+    } else {
+        // Logical destination. It could not happen, because we should have called
+        // msi_passthrough_fix_guest_message
+        PrintError("MSI PT device %s: guest message address (%x) has logical destination mode, which is not expected.\n", dev->name, addr->val);
+        return -1;
+    }
+
+    return 0;
+}
+
+// Config space read hook for the MSI capability. The guest sees the address and data it wrote
+// itself (the saved "_orig" copies), pending bits are read from the physical device, and
+// everything else is served from the virtual config space.
+// Stores at most `length` bytes through ptr. Returns 0 on success, non-zero if the physical read failed.
+static int cfg_msi_passthrough_capability_read(struct vm_device *dev, uint_t reg_num, void *ptr, uint_t length) {
+    struct pt_dev_state * state = (struct pt_dev_state *)dev->private_data;
+    int msi_cap = state->msi.cap;
+    union msi_control *c = state->msi.control;
+    uint32_t pci_addr = state->phys_pci_addr.value & ~0xff;
+    uint8_t *dst = (uint8_t *)ptr;
+    uint32_t val = 0;
+    int from_val = 1;
+    int status = 0;
+    uint_t i, count;
+
+    if(reg_num == MSI_CAP_ADDR_LO(msi_cap)) {
+        val = state->msi.msg_address_lo_orig.val;
+    } else if(reg_num == MSI_CAP_DATA(msi_cap, c->is_64bit_cap)) {
+        val = state->msi.msg_data_orig.val;
+    } else if(c->mask_cap && reg_num == MSI_CAP_PEND_BITS(msi_cap, c->is_64bit_cap)) {
+        status |= pci_cfg_read(pci_addr | MSI_CAP_PEND_BITS(msi_cap, c->is_64bit_cap), 4,
+                               &val);
+    } else {
+        from_val = 0;
+    }
+
+    if(status) {
+        PrintError("MSI PT device %s: Read of MSI capability failed.\n", dev->name);
+        return status;
+    }
+
+    if(from_val) {
+        // Store only as many bytes as the caller asked for: an unconditional 32-bit store
+        // would overrun the destination on 1- and 2-byte reads.
+        count = (length < (uint_t)sizeof(val)) ? length : (uint_t)sizeof(val);
+        for (i = 0; i < count; i++) {
+            dst[i] = ((uint8_t *)&val)[i];
+        }
+    } else {
+        // read virtualized part of config space
+        for (i = 0; i < length; i++) {
+            dst[i] = state->pci_dev->config_space[reg_num + i];
+        }
+    }
+
+    return 0;
+}
+
+// Programs the translated (host) MSI message into the physical device and mirrors the guest's
+// control word. When hook_vectors is set, the host vectors named by the message are hooked first.
+// Returns -1 if the guest message cannot be translated; otherwise 0, with any config space
+// write failures accumulated in *status.
+static int msi_passthrough_program_host(struct vm_device *dev, struct msi_info *mi, int hook_vectors, int *status) {
+    struct pt_dev_state * state = (struct pt_dev_state *)dev->private_data;
+    union msi_control *c = state->msi.control;
+    int msi_cap = state->msi.cap;
+    uint32_t pci_addr = state->phys_pci_addr.value & ~0xff;
+    union msi_msg_address phys_address;
+    union msi_msg_data phys_data;
+    int i;
+
+    if(msi_passthrough_compose_host_message(dev, mi, state->msi.msg_address_lo, state->msi.msg_data,
+                                            &phys_address, &phys_data) != 0) return -1;
+
+    if(hook_vectors) {
+        // NOTE(review): the result of v3_hook_irq is not checked; this path runs on every
+        // enable/data write, so the same vector can be hooked more than once.
+        for(i = 0; i < mi->vec_count; ++i) {
+            v3_hook_irq(dev->vm, phys_data.vector + i, irq_handler, dev);
+        }
+    }
+
+    if(c->is_64bit_cap)
+        *status |= pci_cfg_write(pci_addr | MSI_CAP_ADDR_HI(msi_cap, 1), 4, 0);
+    *status |= pci_cfg_write(pci_addr | MSI_CAP_ADDR_LO(msi_cap), 4, phys_address.val);
+    *status |= pci_cfg_write(pci_addr | MSI_CAP_DATA(msi_cap, c->is_64bit_cap), 2, phys_data.val);
+    *status |= pci_cfg_write(pci_addr | MSI_CAP_CTL(msi_cap), 2, c->val);
+
+    return 0;
+}
+
+// Config space write hook for the MSI capability: fixes the guest message, passes the write to
+// the MSI router (v3_msi_capability_write) and reflects the result into the physical device.
+// Returns 0 on success, non-zero on failure.
+static int cfg_msi_passthrough_capability_write(struct vm_device *dev, uint_t reg_num, void *ptr, uint_t length) {
+    struct pt_dev_state * state = (struct pt_dev_state *)dev->private_data;
+    int msi_cap = state->msi.cap;
+    int status = 0;
+    struct msi_info mi;
+    uint32_t pci_addr = state->phys_pci_addr.value & ~0xff;
+    union msi_control *c = state->msi.control;
+
+    if(msi_passthrough_fix_guest_message(dev) != 0) return -1;
+    if(v3_msi_capability_write(state->pci_dev, reg_num, length, state->msi.router_private) != 0) return -1;
+    if(v3_msi_get_info(state->pci_dev, &mi, state->msi.router_private) != 0) return -1;
+
+    PrintDebug("MSI PT device %s: MSI router returned info: %s vec 0x%x vec_cnt %d dest_mode %s dest_cpu %x del_mode %s\n",
+               dev->name, mi.enabled ? "enabled" : "disabled", mi.vector, mi.vec_count,
+               mi.dest_mode == MODE_LOGIC_DEST ? "logic" : "phys",
+               mi.dest_cpu, mi.del_mode == DEL_MODE_FIXED ? "fixed" : "low_pri");
+
+    // note that all checks were already done in v3_msi_capability_write.
+    if(reg_num == MSI_CAP_CTL(msi_cap)) {
+        if(c->enable) {
+            if(msi_passthrough_program_host(dev, &mi, 1, &status) != 0) return -1;
+        } else {
+            // MSI disabled, need to reflect it
+            // XXX: need function to unhook irq?
+            status |= pci_cfg_write(pci_addr | MSI_CAP_CTL(msi_cap), 2, c->val);
+        }
+    } else if(reg_num == MSI_CAP_ADDR_LO(msi_cap)) {
+        if(c->enable) {
+            if(msi_passthrough_program_host(dev, &mi, 0, &status) != 0) return -1;
+        }
+    } else if(reg_num == MSI_CAP_ADDR_HI(msi_cap, c->is_64bit_cap)) {
+        if(c->enable) {
+            status |= pci_cfg_write(pci_addr | MSI_CAP_ADDR_HI(msi_cap, 1), 4, 0);
+        }
+    } else if(reg_num == MSI_CAP_DATA(msi_cap, c->is_64bit_cap)) {
+        if(c->enable) {
+            if(msi_passthrough_program_host(dev, &mi, 1, &status) != 0) return -1;
+        }
+    } else if (c->mask_cap && reg_num == MSI_CAP_MASK_BITS(msi_cap, c->is_64bit_cap)) {
+        // mask bits
+        status |= pci_cfg_write(pci_addr | MSI_CAP_MASK_BITS(msi_cap, c->is_64bit_cap), 4,
+                                *(uint32_t*)&state->pci_dev->config_space[MSI_CAP_MASK_BITS(msi_cap, c->is_64bit_cap)]);
+    } else {
+        PrintError("MSI PT device %s: Unhandled MSI access at %x (MSI CAP %x)!\n", dev->name, reg_num, msi_cap);
+        return -1;
+    }
+    if(status) {
+        PrintError("MSI PT device %s: Write to MSI capability failed.\n", dev->name);
+        return status;
+    }
+    return 0;
+}
+
+
+
+// Detects the MSI capability of the passed-through device and, if present, registers the device
+// with the MSI router and hooks config space accesses to the capability.
+// Returns 0 on success (including the case of a device without MSI capability), -1 on failure.
+static int setup_msi_passthrough(struct v3_vm_info * vm_info, struct pt_dev_state *state)
+{
+    // set once the shared balancing table has been initialized
+    static int dev_vcpus_initialized = 0;
+    struct pci_device *dev = state->pci_dev;
+    void *router;
+    uint8_t unused;
+    int msi_cap, msi_cap_len, i;
+
+    // Give the MSI state defined values first, so that devices without a usable MSI setup
+    // are recognizable later by router_private == NULL (see irq_handler).
+    state->msi.cap = 0;
+    state->msi.cap_len = 0;
+    state->msi.control = NULL;
+    state->msi.msg_address_lo = NULL;
+    state->msi.msg_data = NULL;
+    state->msi.router_private = NULL;
+
+    // search for PCI MSI capability and remember its offset.
+    msi_cap = pci_find_capability_and_pointer(dev, PCI_CAP_ID_MSI, &unused);
+    if(!msi_cap)
+        return 0;
+
+    msi_cap_len = 10; // Minimal MSI capability size
+    state->msi.control = (union msi_control*)&dev->config_space[MSI_CAP_CTL(msi_cap)];
+
+    if(state->msi.control->is_64bit_cap)
+        msi_cap_len += 4;
+    if(state->msi.control->mask_cap)
+        msi_cap_len += 10;
+
+    PrintDebug("MSI PT device %s: detected MSI capability at offset %02x, length %x, control %04x\n",
+               state->name, msi_cap, msi_cap_len, state->msi.control->val);
+
+    // get MSI router
+    router = v3_pci_get_msi_router(state->pci_bus);
+    if(!router) {
+        PrintError("MSI PT device %s: MSI capability passed through, but no MSI router found!\n", state->name);
+        return -1;
+    }
+
+    if(v3_msi_register_device(dev, msi_cap, router) != 0) {
+        PrintError("MSI PT device %s: Failed to register MSI-capable passthrough device in MSI router.\n", state->name);
+        return -1;
+    }
+
+    if(cfg_range_hook_add(msi_cap, (msi_cap_len & 3) == 0 ? msi_cap_len : ((msi_cap_len & ~3) + 4),
+            cfg_msi_passthrough_capability_read, cfg_msi_passthrough_capability_write, state) != 0) return -1;
+
+    state->msi.cap = msi_cap;
+    state->msi.cap_len = msi_cap_len;
+    state->msi.router_private = router;
+    state->msi.msg_address_lo = (union msi_msg_address*)&dev->config_space[MSI_CAP_ADDR_LO(msi_cap)];
+    state->msi.msg_data = (union msi_msg_data*)&dev->config_space[MSI_CAP_DATA(msi_cap, state->msi.control->is_64bit_cap)];
+
+    // XXX: this data is shared, need to place it somewhere
+    // Initialize the shared table only once: resetting it for every device would discard the
+    // assignments of devices set up earlier while their counts stay in target_vcpus.
+    if(!dev_vcpus_initialized) {
+        for (i = 0; i < MSI_DEV_COUNT; ++i) {
+            dev_vcpus[i] = -1;
+        }
+        dev_vcpus_initialized = 1;
+    }
+
+    return 0;
+}
+
+
+
 static int pt_config_update(uint_t reg_num, void * src, uint_t length, void * private_data) {
     struct vm_device * dev = (struct vm_device *)private_data;
     struct pt_dev_state * state = (struct pt_dev_state *)dev->private_data;
@@ -1017,6 +1410,7 @@ static int setup_virt_pci_dev(struct v3_vm_info * vm_info, struct vm_device * de
         cfg = v3_cfg_next_branch(cfg);
     }
 
+    if(setup_msi_passthrough(vm_info, state)) return -1;
     if(v3_sym_map_pci_passthrough(vm_info, pci_dev->bus_num,
                                   pci_dev->dev_num, pci_dev->fn_num)) return -1;
 
@@ -1031,21 +1425,6 @@ static struct v3_device_ops dev_ops = {
 
 
 
-static int irq_handler(struct v3_vm_info * vm, struct v3_interrupt * intr, void * private_data) {
-    struct vm_device * dev = (struct vm_device *)private_data;
-    struct pt_dev_state * state = (struct pt_dev_state *)dev->private_data;
-
-
-    v3_pci_raise_irq(state->pci_bus, 0, state->pci_dev);
-
-    V3_ACK_IRQ(intr->irq);
-
-    return 0;
-}
-
-
-
-
 static int passthrough_init(struct v3_vm_info * vm, v3_cfg_tree_t * cfg) {
     struct pt_dev_state * state = V3_Malloc(sizeof(struct pt_dev_state));
     struct vm_device * dev = NULL;
@@ -1085,8 +1464,11 @@ static int passthrough_init(struct v3_vm_info * vm, v3_cfg_tree_t * cfg) {
 
     setup_virt_pci_dev(vm, dev, cfg);
 
-    v3_hook_irq(vm, atoi(v3_cfg_val(cfg, "irq")), irq_handler, dev);
-    // v3_hook_irq(info, 64, irq_handler, dev);
+    if(v3_cfg_val(cfg, "irq")) {
+        uint_t host_vector = atoi(v3_cfg_val(cfg, "irq"));
+        V3_ASSERT(host_vector >= 0 && host_vector < MAX_IRQ);
+        v3_hook_irq(vm, host_vector, irq_handler, dev);
+    }
 
     state->pci_dev->is_passthrough = 1;
 
-- 
1.7.5.4