diff options
Diffstat (limited to 'drivers/vfio')
-rw-r--r-- | drivers/vfio/pci/vfio_pci.c | 57 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci_config.c | 173 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci_intrs.c | 68 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci_private.h | 1 | ||||
-rw-r--r-- | drivers/vfio/vfio.c | 117 |
5 files changed, 303 insertions, 113 deletions
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 8189cb6a86a..ac3725440d6 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -70,7 +70,7 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) pci_write_config_word(pdev, PCI_COMMAND, cmd); } - msix_pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX); + msix_pos = pdev->msix_cap; if (msix_pos) { u16 flags; u32 table; @@ -78,8 +78,8 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags); pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table); - vdev->msix_bar = table & PCI_MSIX_FLAGS_BIRMASK; - vdev->msix_offset = table & ~PCI_MSIX_FLAGS_BIRMASK; + vdev->msix_bar = table & PCI_MSIX_TABLE_BIR; + vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET; vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16; } else vdev->msix_bar = 0xFF; @@ -183,7 +183,7 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) u8 pos; u16 flags; - pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSI); + pos = vdev->pdev->msi_cap; if (pos) { pci_read_config_word(vdev->pdev, pos + PCI_MSI_FLAGS, &flags); @@ -194,14 +194,16 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) u8 pos; u16 flags; - pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSIX); + pos = vdev->pdev->msix_cap; if (pos) { pci_read_config_word(vdev->pdev, pos + PCI_MSIX_FLAGS, &flags); return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; } - } + } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) + if (pci_is_pcie(vdev->pdev)) + return 1; return 0; } @@ -317,6 +319,17 @@ static long vfio_pci_ioctl(void *device_data, if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) return -EINVAL; + switch (info.index) { + case VFIO_PCI_INTX_IRQ_INDEX ... 
VFIO_PCI_MSIX_IRQ_INDEX: + break; + case VFIO_PCI_ERR_IRQ_INDEX: + if (pci_is_pcie(vdev->pdev)) + break; + /* pass thru to return error */ + default: + return -EINVAL; + } + info.flags = VFIO_IRQ_INFO_EVENTFD; info.count = vfio_pci_get_irq_count(vdev, info.index); @@ -346,6 +359,7 @@ static long vfio_pci_ioctl(void *device_data, if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) { size_t size; + int max = vfio_pci_get_irq_count(vdev, hdr.index); if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL) size = sizeof(uint8_t); @@ -355,7 +369,7 @@ static long vfio_pci_ioctl(void *device_data, return -EINVAL; if (hdr.argsz - minsz < hdr.count * size || - hdr.count > vfio_pci_get_irq_count(vdev, hdr.index)) + hdr.start >= max || hdr.start + hdr.count > max) return -EINVAL; data = memdup_user((void __user *)(arg + minsz), @@ -551,11 +565,40 @@ static void vfio_pci_remove(struct pci_dev *pdev) kfree(vdev); } +static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct vfio_pci_device *vdev; + struct vfio_device *device; + + device = vfio_device_get_from_dev(&pdev->dev); + if (device == NULL) + return PCI_ERS_RESULT_DISCONNECT; + + vdev = vfio_device_data(device); + if (vdev == NULL) { + vfio_device_put(device); + return PCI_ERS_RESULT_DISCONNECT; + } + + if (vdev->err_trigger) + eventfd_signal(vdev->err_trigger, 1); + + vfio_device_put(device); + + return PCI_ERS_RESULT_CAN_RECOVER; +} + +static struct pci_error_handlers vfio_err_handlers = { + .error_detected = vfio_pci_aer_err_detected, +}; + static struct pci_driver vfio_pci_driver = { .name = "vfio-pci", .id_table = NULL, /* only dynamic ids */ .probe = vfio_pci_probe, .remove = vfio_pci_remove, + .err_handler = &vfio_err_handlers, }; static void __exit vfio_pci_cleanup(void) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 964ff22bf28..affa34745be 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -27,6 
+27,7 @@ #include <linux/pci.h> #include <linux/uaccess.h> #include <linux/vfio.h> +#include <linux/slab.h> #include "vfio_pci_private.h" @@ -273,9 +274,10 @@ static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos, return count; } -static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos, - int count, struct perm_bits *perm, - int offset, __le32 val) +/* Raw access skips any kind of virtualization */ +static int vfio_raw_config_write(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val) { int ret; @@ -286,13 +288,36 @@ static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos, return count; } -/* Default all regions to read-only, no-virtualization */ +static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val) +{ + int ret; + + ret = vfio_user_config_read(vdev->pdev, pos, val, count); + if (ret) + return pcibios_err_to_errno(ret); + + return count; +} + +/* Default capability regions to read-only, no-virtualization */ static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = { [0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } }; static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = { [0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } }; +/* + * Default unassigned regions to raw read-write access. Some devices + * require this to function as they hide registers between the gaps in + * config space (be2net). Like MMIO and I/O port registers, we have + * to trust the hardware isolation. 
+ */ +static struct perm_bits unassigned_perms = { + .readfn = vfio_raw_config_read, + .writefn = vfio_raw_config_write +}; static void free_perm_bits(struct perm_bits *perm) { @@ -778,16 +803,16 @@ int __init vfio_pci_init_perm_bits(void) /* Capabilities */ ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]); - cap_perms[PCI_CAP_ID_VPD].writefn = vfio_direct_config_write; + cap_perms[PCI_CAP_ID_VPD].writefn = vfio_raw_config_write; ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]); - cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_direct_config_write; + cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_raw_config_write; ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]); ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]); /* Extended capabilities */ ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]); ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]); - ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_direct_config_write; + ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_raw_config_write; if (ret) vfio_pci_uninit_perm_bits(); @@ -800,9 +825,6 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos) u8 cap; int base = (pos >= PCI_CFG_SPACE_SIZE) ? 
PCI_CFG_SPACE_SIZE : PCI_STD_HEADER_SIZEOF; - base /= 4; - pos /= 4; - cap = vdev->pci_config_map[pos]; if (cap == PCI_CAP_ID_BASIC) @@ -812,7 +834,7 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos) while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap) pos--; - return pos * 4; + return pos; } static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos, @@ -1016,13 +1038,9 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) return byte; case PCI_CAP_ID_EXP: /* length based on version */ - ret = pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &word); - if (ret) - return pcibios_err_to_errno(ret); - vdev->extended_caps = true; - if ((word & PCI_EXP_FLAGS_VERS) == 1) + if ((pcie_caps_reg(pdev) & PCI_EXP_FLAGS_VERS) == 1) return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1; else return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2; @@ -1229,8 +1247,8 @@ static int vfio_cap_init(struct vfio_pci_device *vdev) } /* Sanity check, do we overlap other capabilities? 
*/ - for (i = 0; i < len; i += 4) { - if (likely(map[(pos + i) / 4] == PCI_CAP_ID_INVALID)) + for (i = 0; i < len; i++) { + if (likely(map[pos + i] == PCI_CAP_ID_INVALID)) continue; pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n", @@ -1238,7 +1256,7 @@ static int vfio_cap_init(struct vfio_pci_device *vdev) pos + i, map[pos + i], cap); } - memset(map + (pos / 4), cap, len / 4); + memset(map + pos, cap, len); ret = vfio_fill_vconfig_bytes(vdev, pos, len); if (ret) return ret; @@ -1313,8 +1331,8 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) hidden = true; } - for (i = 0; i < len; i += 4) { - if (likely(map[(epos + i) / 4] == PCI_CAP_ID_INVALID)) + for (i = 0; i < len; i++) { + if (likely(map[epos + i] == PCI_CAP_ID_INVALID)) continue; pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n", @@ -1329,7 +1347,7 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) */ BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID); - memset(map + (epos / 4), ecap, len / 4); + memset(map + epos, ecap, len); ret = vfio_fill_vconfig_bytes(vdev, epos, len); if (ret) return ret; @@ -1376,10 +1394,12 @@ int vfio_config_init(struct vfio_pci_device *vdev) int ret; /* - * Config space, caps and ecaps are all dword aligned, so we can - * use one byte per dword to record the type. + * Config space, caps and ecaps are all dword aligned, so we could + * use one byte per dword to record the type. However, there are + * no requirements on the length of a capability, so the gap between + * capabilities needs byte granularity. 
*/ - map = kmalloc(pdev->cfg_size / 4, GFP_KERNEL); + map = kmalloc(pdev->cfg_size, GFP_KERNEL); if (!map) return -ENOMEM; @@ -1392,9 +1412,9 @@ int vfio_config_init(struct vfio_pci_device *vdev) vdev->pci_config_map = map; vdev->vconfig = vconfig; - memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF / 4); - memset(map + (PCI_STD_HEADER_SIZEOF / 4), PCI_CAP_ID_INVALID, - (pdev->cfg_size - PCI_STD_HEADER_SIZEOF) / 4); + memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF); + memset(map + PCI_STD_HEADER_SIZEOF, PCI_CAP_ID_INVALID, + pdev->cfg_size - PCI_STD_HEADER_SIZEOF); ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF); if (ret) @@ -1449,6 +1469,22 @@ void vfio_config_free(struct vfio_pci_device *vdev) vdev->msi_perm = NULL; } +/* + * Find the remaining number of bytes in a dword that match the given + * position. Stop at either the end of the capability or the dword boundary. + */ +static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev, + loff_t pos) +{ + u8 cap = vdev->pci_config_map[pos]; + size_t i; + + for (i = 1; (pos + i) % 4 && vdev->pci_config_map[pos + i] == cap; i++) + /* nop */; + + return i; +} + static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { @@ -1457,55 +1493,48 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, __le32 val = 0; int cap_start = 0, offset; u8 cap_id; - ssize_t ret = count; + ssize_t ret; - if (*ppos < 0 || *ppos + count > pdev->cfg_size) + if (*ppos < 0 || *ppos >= pdev->cfg_size || + *ppos + count > pdev->cfg_size) return -EFAULT; /* - * gcc can't seem to figure out we're a static function, only called - * with count of 1/2/4 and hits copy_from_user_overflow without this. + * Chop accesses into aligned chunks containing no more than a + * single capability. Caller increments to the next chunk. 
*/ - if (count > sizeof(val)) - return -EINVAL; - - cap_id = vdev->pci_config_map[*ppos / 4]; - - if (cap_id == PCI_CAP_ID_INVALID) { - if (iswrite) - return ret; /* drop */ - - /* - * Per PCI spec 3.0, section 6.1, reads from reserved and - * unimplemented registers return 0 - */ - if (copy_to_user(buf, &val, count)) - return -EFAULT; - - return ret; - } + count = min(count, vfio_pci_cap_remaining_dword(vdev, *ppos)); + if (count >= 4 && !(*ppos % 4)) + count = 4; + else if (count >= 2 && !(*ppos % 2)) + count = 2; + else + count = 1; - /* - * All capabilities are minimum 4 bytes and aligned on dword - * boundaries. Since we don't support unaligned accesses, we're - * only ever accessing a single capability. - */ - if (*ppos >= PCI_CFG_SPACE_SIZE) { - WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX); + ret = count; - perm = &ecap_perms[cap_id]; - cap_start = vfio_find_cap_start(vdev, *ppos); + cap_id = vdev->pci_config_map[*ppos]; + if (cap_id == PCI_CAP_ID_INVALID) { + perm = &unassigned_perms; + cap_start = *ppos; } else { - WARN_ON(cap_id > PCI_CAP_ID_MAX); + if (*ppos >= PCI_CFG_SPACE_SIZE) { + WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX); - perm = &cap_perms[cap_id]; + perm = &ecap_perms[cap_id]; + cap_start = vfio_find_cap_start(vdev, *ppos); + } else { + WARN_ON(cap_id > PCI_CAP_ID_MAX); - if (cap_id == PCI_CAP_ID_MSI) - perm = vdev->msi_perm; + perm = &cap_perms[cap_id]; - if (cap_id > PCI_CAP_ID_BASIC) - cap_start = vfio_find_cap_start(vdev, *ppos); + if (cap_id == PCI_CAP_ID_MSI) + perm = vdev->msi_perm; + + if (cap_id > PCI_CAP_ID_BASIC) + cap_start = vfio_find_cap_start(vdev, *ppos); + } } WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC); @@ -1545,20 +1574,8 @@ ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, char __user *buf, pos &= VFIO_PCI_OFFSET_MASK; - /* - * We want to both keep the access size the caller users as well as - * support reading large chunks of config space in a single call. 
- * PCI doesn't support unaligned accesses, so we can safely break - * those apart. - */ while (count) { - if (count >= 4 && !(pos % 4)) - ret = vfio_config_do_rw(vdev, buf, 4, &pos, iswrite); - else if (count >= 2 && !(pos % 2)) - ret = vfio_config_do_rw(vdev, buf, 2, &pos, iswrite); - else - ret = vfio_config_do_rw(vdev, buf, 1, &pos, iswrite); - + ret = vfio_config_do_rw(vdev, buf, count, &pos, iswrite); if (ret < 0) return ret; diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 3639371fa69..4bc704e1b7c 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -22,6 +22,7 @@ #include <linux/vfio.h> #include <linux/wait.h> #include <linux/workqueue.h> +#include <linux/slab.h> #include "vfio_pci_private.h" @@ -286,7 +287,8 @@ void vfio_pci_intx_mask(struct vfio_pci_device *vdev) * a signal is necessary, which can then be handled via a work queue * or directly depending on the caller. */ -int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev, void *unused) +static int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev, + void *unused) { struct pci_dev *pdev = vdev->pdev; unsigned long flags; @@ -745,6 +747,63 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev, return 0; } +static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, void *data) +{ + int32_t fd = *(int32_t *)data; + struct pci_dev *pdev = vdev->pdev; + + if ((index != VFIO_PCI_ERR_IRQ_INDEX) || + !(flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) + return -EINVAL; + + /* + * device_lock synchronizes setting and checking of + * err_trigger. The vfio_pci_aer_err_detected() is also + * called with device_lock held. 
+ */ + + /* DATA_NONE/DATA_BOOL enables loopback testing */ + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + device_lock(&pdev->dev); + if (vdev->err_trigger) + eventfd_signal(vdev->err_trigger, 1); + device_unlock(&pdev->dev); + return 0; + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t trigger = *(uint8_t *)data; + device_lock(&pdev->dev); + if (trigger && vdev->err_trigger) + eventfd_signal(vdev->err_trigger, 1); + device_unlock(&pdev->dev); + return 0; + } + + /* Handle SET_DATA_EVENTFD */ + + if (fd == -1) { + device_lock(&pdev->dev); + if (vdev->err_trigger) + eventfd_ctx_put(vdev->err_trigger); + vdev->err_trigger = NULL; + device_unlock(&pdev->dev); + return 0; + } else if (fd >= 0) { + struct eventfd_ctx *efdctx; + efdctx = eventfd_ctx_fdget(fd); + if (IS_ERR(efdctx)) + return PTR_ERR(efdctx); + device_lock(&pdev->dev); + if (vdev->err_trigger) + eventfd_ctx_put(vdev->err_trigger); + vdev->err_trigger = efdctx; + device_unlock(&pdev->dev); + return 0; + } else + return -EINVAL; +} int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, unsigned index, unsigned start, unsigned count, void *data) @@ -779,6 +838,13 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, break; } break; + case VFIO_PCI_ERR_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_TRIGGER: + if (pci_is_pcie(vdev->pdev)) + func = vfio_pci_set_err_trigger; + break; + } } if (!func) diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index d7e55d03f49..9c6d5d0f3b0 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -56,6 +56,7 @@ struct vfio_pci_device { bool has_vga; struct pci_saved_state *pci_saved_state; atomic_t refcnt; + struct eventfd_ctx *err_trigger; }; #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index fcc12f3e60a..acb7121a931 100644 --- 
a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -24,8 +24,10 @@ #include <linux/list.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/stat.h> #include <linux/string.h> #include <linux/uaccess.h> #include <linux/vfio.h> @@ -57,7 +59,7 @@ struct vfio_iommu_driver { struct vfio_container { struct kref kref; struct list_head group_list; - struct mutex group_lock; + struct rw_semaphore group_lock; struct vfio_iommu_driver *iommu_driver; void *iommu_data; }; @@ -392,12 +394,13 @@ static void vfio_device_release(struct kref *kref) } /* Device reference always implies a group reference */ -static void vfio_device_put(struct vfio_device *device) +void vfio_device_put(struct vfio_device *device) { struct vfio_group *group = device->group; kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock); vfio_group_put(group); } +EXPORT_SYMBOL_GPL(vfio_device_put); static void vfio_device_get(struct vfio_device *device) { @@ -627,6 +630,33 @@ int vfio_add_group_dev(struct device *dev, } EXPORT_SYMBOL_GPL(vfio_add_group_dev); +/** + * Get a reference to the vfio_device for a device that is known to + * be bound to a vfio driver. The driver implicitly holds a + * vfio_device reference between vfio_add_group_dev and + * vfio_del_group_dev. We can therefore use drvdata to increment + * that reference from the struct device. This additional + * reference must be released by calling vfio_device_put. 
+ */ +struct vfio_device *vfio_device_get_from_dev(struct device *dev) +{ + struct vfio_device *device = dev_get_drvdata(dev); + + vfio_device_get(device); + + return device; +} +EXPORT_SYMBOL_GPL(vfio_device_get_from_dev); + +/* + * Caller must hold a reference to the vfio_device + */ +void *vfio_device_data(struct vfio_device *device) +{ + return device->device_data; +} +EXPORT_SYMBOL_GPL(vfio_device_data); + /* Given a referenced group, check if it contains the device */ static bool vfio_dev_present(struct vfio_group *group, struct device *dev) { @@ -675,9 +705,13 @@ EXPORT_SYMBOL_GPL(vfio_del_group_dev); static long vfio_ioctl_check_extension(struct vfio_container *container, unsigned long arg) { - struct vfio_iommu_driver *driver = container->iommu_driver; + struct vfio_iommu_driver *driver; long ret = 0; + down_read(&container->group_lock); + + driver = container->iommu_driver; + switch (arg) { /* No base extensions yet */ default: @@ -707,10 +741,12 @@ static long vfio_ioctl_check_extension(struct vfio_container *container, VFIO_CHECK_EXTENSION, arg); } + up_read(&container->group_lock); + return ret; } -/* hold container->group_lock */ +/* hold write lock on container->group_lock */ static int __vfio_container_attach_groups(struct vfio_container *container, struct vfio_iommu_driver *driver, void *data) @@ -741,7 +777,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container, struct vfio_iommu_driver *driver; long ret = -ENODEV; - mutex_lock(&container->group_lock); + down_write(&container->group_lock); /* * The container is designed to be an unprivileged interface while @@ -752,7 +788,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container, * the container is deprivileged and returns to an unset state. 
*/ if (list_empty(&container->group_list) || container->iommu_driver) { - mutex_unlock(&container->group_lock); + up_write(&container->group_lock); return -EINVAL; } @@ -799,7 +835,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container, mutex_unlock(&vfio.iommu_drivers_lock); skip_drivers_unlock: - mutex_unlock(&container->group_lock); + up_write(&container->group_lock); return ret; } @@ -815,9 +851,6 @@ static long vfio_fops_unl_ioctl(struct file *filep, if (!container) return ret; - driver = container->iommu_driver; - data = container->iommu_data; - switch (cmd) { case VFIO_GET_API_VERSION: ret = VFIO_API_VERSION; @@ -829,8 +862,15 @@ static long vfio_fops_unl_ioctl(struct file *filep, ret = vfio_ioctl_set_iommu(container, arg); break; default: + down_read(&container->group_lock); + + driver = container->iommu_driver; + data = container->iommu_data; + if (driver) /* passthrough all unrecognized ioctls */ ret = driver->ops->ioctl(data, cmd, arg); + + up_read(&container->group_lock); } return ret; @@ -854,7 +894,7 @@ static int vfio_fops_open(struct inode *inode, struct file *filep) return -ENOMEM; INIT_LIST_HEAD(&container->group_list); - mutex_init(&container->group_lock); + init_rwsem(&container->group_lock); kref_init(&container->kref); filep->private_data = container; @@ -881,35 +921,55 @@ static ssize_t vfio_fops_read(struct file *filep, char __user *buf, size_t count, loff_t *ppos) { struct vfio_container *container = filep->private_data; - struct vfio_iommu_driver *driver = container->iommu_driver; + struct vfio_iommu_driver *driver; + ssize_t ret = -EINVAL; - if (unlikely(!driver || !driver->ops->read)) - return -EINVAL; + down_read(&container->group_lock); - return driver->ops->read(container->iommu_data, buf, count, ppos); + driver = container->iommu_driver; + if (likely(driver && driver->ops->read)) + ret = driver->ops->read(container->iommu_data, + buf, count, ppos); + + up_read(&container->group_lock); + + return ret; } static ssize_t 
vfio_fops_write(struct file *filep, const char __user *buf, size_t count, loff_t *ppos) { struct vfio_container *container = filep->private_data; - struct vfio_iommu_driver *driver = container->iommu_driver; + struct vfio_iommu_driver *driver; + ssize_t ret = -EINVAL; - if (unlikely(!driver || !driver->ops->write)) - return -EINVAL; + down_read(&container->group_lock); - return driver->ops->write(container->iommu_data, buf, count, ppos); + driver = container->iommu_driver; + if (likely(driver && driver->ops->write)) + ret = driver->ops->write(container->iommu_data, + buf, count, ppos); + + up_read(&container->group_lock); + + return ret; } static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma) { struct vfio_container *container = filep->private_data; - struct vfio_iommu_driver *driver = container->iommu_driver; + struct vfio_iommu_driver *driver; + int ret = -EINVAL; - if (unlikely(!driver || !driver->ops->mmap)) - return -EINVAL; + down_read(&container->group_lock); - return driver->ops->mmap(container->iommu_data, vma); + driver = container->iommu_driver; + if (likely(driver && driver->ops->mmap)) + ret = driver->ops->mmap(container->iommu_data, vma); + + up_read(&container->group_lock); + + return ret; } static const struct file_operations vfio_fops = { @@ -933,7 +993,7 @@ static void __vfio_group_unset_container(struct vfio_group *group) struct vfio_container *container = group->container; struct vfio_iommu_driver *driver; - mutex_lock(&container->group_lock); + down_write(&container->group_lock); driver = container->iommu_driver; if (driver) @@ -951,7 +1011,7 @@ static void __vfio_group_unset_container(struct vfio_group *group) container->iommu_data = NULL; } - mutex_unlock(&container->group_lock); + up_write(&container->group_lock); vfio_container_put(container); } @@ -1011,7 +1071,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) container = f.file->private_data; WARN_ON(!container); /* fget ensures we 
don't race vfio_release */ - mutex_lock(&container->group_lock); + down_write(&container->group_lock); driver = container->iommu_driver; if (driver) { @@ -1029,7 +1089,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) atomic_inc(&group->container_users); unlock_out: - mutex_unlock(&container->group_lock); + up_write(&container->group_lock); fdput(f); return ret; } @@ -1300,6 +1360,9 @@ static const struct file_operations vfio_device_fops = { */ static char *vfio_devnode(struct device *dev, umode_t *mode) { + if (MINOR(dev->devt) == 0) + *mode = S_IRUGO | S_IWUGO; + return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev)); } |