summary | refs | log | tree | commit | diff
path: root/drivers/vfio/pci/vfio_pci_intrs.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/vfio/pci/vfio_pci_intrs.c')
-rw-r--r-- drivers/vfio/pci/vfio_pci_intrs.c 886
1 files changed, 456 insertions, 430 deletions
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 4bc704e1b7c7..c76e753b3cec 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* VFIO PCI interrupt handling
*
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Derived from original vfio:
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
* Author: Tom Lyon, pugs@cisco.com
@@ -16,243 +13,100 @@
#include <linux/device.h>
#include <linux/interrupt.h>
#include <linux/eventfd.h>
+#include <linux/msi.h>
#include <linux/pci.h>
#include <linux/file.h>
-#include <linux/poll.h>
#include <linux/vfio.h>
#include <linux/wait.h>
-#include <linux/workqueue.h>
#include <linux/slab.h>
-#include "vfio_pci_private.h"
+#include "vfio_pci_priv.h"
-/*
- * IRQfd - generic
- */
-struct virqfd {
- struct vfio_pci_device *vdev;
- struct eventfd_ctx *eventfd;
- int (*handler)(struct vfio_pci_device *, void *);
- void (*thread)(struct vfio_pci_device *, void *);
- void *data;
- struct work_struct inject;
- wait_queue_t wait;
- poll_table pt;
- struct work_struct shutdown;
- struct virqfd **pvirqfd;
+struct vfio_pci_irq_ctx {
+ struct vfio_pci_core_device *vdev;
+ struct eventfd_ctx *trigger;
+ struct virqfd *unmask;
+ struct virqfd *mask;
+ char *name;
+ bool masked;
+ struct irq_bypass_producer producer;
};
-static struct workqueue_struct *vfio_irqfd_cleanup_wq;
-
-int __init vfio_pci_virqfd_init(void)
+static bool irq_is(struct vfio_pci_core_device *vdev, int type)
{
- vfio_irqfd_cleanup_wq =
- create_singlethread_workqueue("vfio-irqfd-cleanup");
- if (!vfio_irqfd_cleanup_wq)
- return -ENOMEM;
-
- return 0;
+ return vdev->irq_type == type;
}
-void vfio_pci_virqfd_exit(void)
+static bool is_intx(struct vfio_pci_core_device *vdev)
{
- destroy_workqueue(vfio_irqfd_cleanup_wq);
+ return vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX;
}
-static void virqfd_deactivate(struct virqfd *virqfd)
+static bool is_irq_none(struct vfio_pci_core_device *vdev)
{
- queue_work(vfio_irqfd_cleanup_wq, &virqfd->shutdown);
-}
-
-static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
-{
- struct virqfd *virqfd = container_of(wait, struct virqfd, wait);
- unsigned long flags = (unsigned long)key;
-
- if (flags & POLLIN) {
- /* An event has been signaled, call function */
- if ((!virqfd->handler ||
- virqfd->handler(virqfd->vdev, virqfd->data)) &&
- virqfd->thread)
- schedule_work(&virqfd->inject);
- }
-
- if (flags & POLLHUP) {
- unsigned long flags;
- spin_lock_irqsave(&virqfd->vdev->irqlock, flags);
-
- /*
- * The eventfd is closing, if the virqfd has not yet been
- * queued for release, as determined by testing whether the
- * vdev pointer to it is still valid, queue it now. As
- * with kvm irqfds, we know we won't race against the virqfd
- * going away because we hold wqh->lock to get here.
- */
- if (*(virqfd->pvirqfd) == virqfd) {
- *(virqfd->pvirqfd) = NULL;
- virqfd_deactivate(virqfd);
- }
-
- spin_unlock_irqrestore(&virqfd->vdev->irqlock, flags);
- }
-
- return 0;
-}
-
-static void virqfd_ptable_queue_proc(struct file *file,
- wait_queue_head_t *wqh, poll_table *pt)
-{
- struct virqfd *virqfd = container_of(pt, struct virqfd, pt);
- add_wait_queue(wqh, &virqfd->wait);
-}
-
-static void virqfd_shutdown(struct work_struct *work)
-{
- struct virqfd *virqfd = container_of(work, struct virqfd, shutdown);
- u64 cnt;
-
- eventfd_ctx_remove_wait_queue(virqfd->eventfd, &virqfd->wait, &cnt);
- flush_work(&virqfd->inject);
- eventfd_ctx_put(virqfd->eventfd);
-
- kfree(virqfd);
+ return !(vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX ||
+ vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX ||
+ vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX);
}
-static void virqfd_inject(struct work_struct *work)
+static
+struct vfio_pci_irq_ctx *vfio_irq_ctx_get(struct vfio_pci_core_device *vdev,
+ unsigned long index)
{
- struct virqfd *virqfd = container_of(work, struct virqfd, inject);
- if (virqfd->thread)
- virqfd->thread(virqfd->vdev, virqfd->data);
+ return xa_load(&vdev->ctx, index);
}
-static int virqfd_enable(struct vfio_pci_device *vdev,
- int (*handler)(struct vfio_pci_device *, void *),
- void (*thread)(struct vfio_pci_device *, void *),
- void *data, struct virqfd **pvirqfd, int fd)
+static void vfio_irq_ctx_free(struct vfio_pci_core_device *vdev,
+ struct vfio_pci_irq_ctx *ctx, unsigned long index)
{
- struct file *file = NULL;
- struct eventfd_ctx *ctx = NULL;
- struct virqfd *virqfd;
- int ret = 0;
- unsigned int events;
-
- virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL);
- if (!virqfd)
- return -ENOMEM;
-
- virqfd->pvirqfd = pvirqfd;
- virqfd->vdev = vdev;
- virqfd->handler = handler;
- virqfd->thread = thread;
- virqfd->data = data;
-
- INIT_WORK(&virqfd->shutdown, virqfd_shutdown);
- INIT_WORK(&virqfd->inject, virqfd_inject);
-
- file = eventfd_fget(fd);
- if (IS_ERR(file)) {
- ret = PTR_ERR(file);
- goto fail;
- }
-
- ctx = eventfd_ctx_fileget(file);
- if (IS_ERR(ctx)) {
- ret = PTR_ERR(ctx);
- goto fail;
- }
-
- virqfd->eventfd = ctx;
-
- /*
- * virqfds can be released by closing the eventfd or directly
- * through ioctl. These are both done through a workqueue, so
- * we update the pointer to the virqfd under lock to avoid
- * pushing multiple jobs to release the same virqfd.
- */
- spin_lock_irq(&vdev->irqlock);
-
- if (*pvirqfd) {
- spin_unlock_irq(&vdev->irqlock);
- ret = -EBUSY;
- goto fail;
- }
- *pvirqfd = virqfd;
-
- spin_unlock_irq(&vdev->irqlock);
-
- /*
- * Install our own custom wake-up handling so we are notified via
- * a callback whenever someone signals the underlying eventfd.
- */
- init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);
- init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc);
-
- events = file->f_op->poll(file, &virqfd->pt);
-
- /*
- * Check if there was an event already pending on the eventfd
- * before we registered and trigger it as if we didn't miss it.
- */
- if (events & POLLIN) {
- if ((!handler || handler(vdev, data)) && thread)
- schedule_work(&virqfd->inject);
- }
-
- /*
- * Do not drop the file until the irqfd is fully initialized,
- * otherwise we might race against the POLLHUP.
- */
- fput(file);
-
- return 0;
-
-fail:
- if (ctx && !IS_ERR(ctx))
- eventfd_ctx_put(ctx);
-
- if (file && !IS_ERR(file))
- fput(file);
-
- kfree(virqfd);
-
- return ret;
+ xa_erase(&vdev->ctx, index);
+ kfree(ctx);
}
-static void virqfd_disable(struct vfio_pci_device *vdev,
- struct virqfd **pvirqfd)
+static struct vfio_pci_irq_ctx *
+vfio_irq_ctx_alloc(struct vfio_pci_core_device *vdev, unsigned long index)
{
- unsigned long flags;
+ struct vfio_pci_irq_ctx *ctx;
+ int ret;
- spin_lock_irqsave(&vdev->irqlock, flags);
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT);
+ if (!ctx)
+ return NULL;
- if (*pvirqfd) {
- virqfd_deactivate(*pvirqfd);
- *pvirqfd = NULL;
+ ret = xa_insert(&vdev->ctx, index, ctx, GFP_KERNEL_ACCOUNT);
+ if (ret) {
+ kfree(ctx);
+ return NULL;
}
- spin_unlock_irqrestore(&vdev->irqlock, flags);
-
- /*
- * Block until we know all outstanding shutdown jobs have completed.
- * Even if we don't queue the job, flush the wq to be sure it's
- * been released.
- */
- flush_workqueue(vfio_irqfd_cleanup_wq);
+ return ctx;
}
/*
* INTx
*/
-static void vfio_send_intx_eventfd(struct vfio_pci_device *vdev, void *unused)
+static void vfio_send_intx_eventfd(void *opaque, void *data)
{
- if (likely(is_intx(vdev) && !vdev->virq_disabled))
- eventfd_signal(vdev->ctx[0].trigger, 1);
+ struct vfio_pci_core_device *vdev = opaque;
+
+ if (likely(is_intx(vdev) && !vdev->virq_disabled)) {
+ struct vfio_pci_irq_ctx *ctx = data;
+ struct eventfd_ctx *trigger = READ_ONCE(ctx->trigger);
+
+ if (likely(trigger))
+ eventfd_signal(trigger);
+ }
}
-void vfio_pci_intx_mask(struct vfio_pci_device *vdev)
+/* Returns true if the INTx vfio_pci_irq_ctx.masked value is changed. */
+static bool __vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
{
struct pci_dev *pdev = vdev->pdev;
+ struct vfio_pci_irq_ctx *ctx;
unsigned long flags;
+ bool masked_changed = false;
+
+ lockdep_assert_held(&vdev->igate);
spin_lock_irqsave(&vdev->irqlock, flags);
@@ -265,7 +119,14 @@ void vfio_pci_intx_mask(struct vfio_pci_device *vdev)
if (unlikely(!is_intx(vdev))) {
if (vdev->pci_2_3)
pci_intx(pdev, 0);
- } else if (!vdev->ctx[0].masked) {
+ goto out_unlock;
+ }
+
+ ctx = vfio_irq_ctx_get(vdev, 0);
+ if (WARN_ON_ONCE(!ctx))
+ goto out_unlock;
+
+ if (!ctx->masked) {
/*
* Can't use check_and_mask here because we always want to
* mask, not just when something is pending.
@@ -275,10 +136,24 @@ void vfio_pci_intx_mask(struct vfio_pci_device *vdev)
else
disable_irq_nosync(pdev->irq);
- vdev->ctx[0].masked = true;
+ ctx->masked = true;
+ masked_changed = true;
}
+out_unlock:
spin_unlock_irqrestore(&vdev->irqlock, flags);
+ return masked_changed;
+}
+
+bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
+{
+ bool mask_changed;
+
+ mutex_lock(&vdev->igate);
+ mask_changed = __vfio_pci_intx_mask(vdev);
+ mutex_unlock(&vdev->igate);
+
+ return mask_changed;
}
/*
@@ -287,10 +162,11 @@ void vfio_pci_intx_mask(struct vfio_pci_device *vdev)
* a signal is necessary, which can then be handled via a work queue
* or directly depending on the caller.
*/
-static int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev,
- void *unused)
+static int vfio_pci_intx_unmask_handler(void *opaque, void *data)
{
+ struct vfio_pci_core_device *vdev = opaque;
struct pci_dev *pdev = vdev->pdev;
+ struct vfio_pci_irq_ctx *ctx = data;
unsigned long flags;
int ret = 0;
@@ -303,7 +179,10 @@ static int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev,
if (unlikely(!is_intx(vdev))) {
if (vdev->pci_2_3)
pci_intx(pdev, 1);
- } else if (vdev->ctx[0].masked && !vdev->virq_disabled) {
+ goto out_unlock;
+ }
+
+ if (ctx->masked && !vdev->virq_disabled) {
/*
* A pending interrupt here would immediately trigger,
* but we can avoid that overhead by just re-sending
@@ -315,23 +194,36 @@ static int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev,
} else
enable_irq(pdev->irq);
- vdev->ctx[0].masked = (ret > 0);
+ ctx->masked = (ret > 0);
}
+out_unlock:
spin_unlock_irqrestore(&vdev->irqlock, flags);
return ret;
}
-void vfio_pci_intx_unmask(struct vfio_pci_device *vdev)
+static void __vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev)
+{
+ struct vfio_pci_irq_ctx *ctx = vfio_irq_ctx_get(vdev, 0);
+
+ lockdep_assert_held(&vdev->igate);
+
+ if (vfio_pci_intx_unmask_handler(vdev, ctx) > 0)
+ vfio_send_intx_eventfd(vdev, ctx);
+}
+
+void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev)
{
- if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0)
- vfio_send_intx_eventfd(vdev, NULL);
+ mutex_lock(&vdev->igate);
+ __vfio_pci_intx_unmask(vdev);
+ mutex_unlock(&vdev->igate);
}
static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
{
- struct vfio_pci_device *vdev = dev_id;
+ struct vfio_pci_irq_ctx *ctx = dev_id;
+ struct vfio_pci_core_device *vdev = ctx->vdev;
unsigned long flags;
int ret = IRQ_NONE;
@@ -339,114 +231,140 @@ static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
if (!vdev->pci_2_3) {
disable_irq_nosync(vdev->pdev->irq);
- vdev->ctx[0].masked = true;
+ ctx->masked = true;
ret = IRQ_HANDLED;
- } else if (!vdev->ctx[0].masked && /* may be shared */
+ } else if (!ctx->masked && /* may be shared */
pci_check_and_mask_intx(vdev->pdev)) {
- vdev->ctx[0].masked = true;
+ ctx->masked = true;
ret = IRQ_HANDLED;
}
spin_unlock_irqrestore(&vdev->irqlock, flags);
if (ret == IRQ_HANDLED)
- vfio_send_intx_eventfd(vdev, NULL);
+ vfio_send_intx_eventfd(vdev, ctx);
return ret;
}
-static int vfio_intx_enable(struct vfio_pci_device *vdev)
+static int vfio_intx_enable(struct vfio_pci_core_device *vdev,
+ struct eventfd_ctx *trigger)
{
+ struct pci_dev *pdev = vdev->pdev;
+ struct vfio_pci_irq_ctx *ctx;
+ unsigned long irqflags;
+ char *name;
+ int ret;
+
if (!is_irq_none(vdev))
return -EINVAL;
- if (!vdev->pdev->irq)
+ if (!pdev->irq || pdev->irq == IRQ_NOTCONNECTED)
return -ENODEV;
- vdev->ctx = kzalloc(sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL);
- if (!vdev->ctx)
+ name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", pci_name(pdev));
+ if (!name)
return -ENOMEM;
- vdev->num_ctx = 1;
+ ctx = vfio_irq_ctx_alloc(vdev, 0);
+ if (!ctx) {
+ kfree(name);
+ return -ENOMEM;
+ }
+
+ ctx->name = name;
+ ctx->trigger = trigger;
+ ctx->vdev = vdev;
/*
- * If the virtual interrupt is masked, restore it. Devices
- * supporting DisINTx can be masked at the hardware level
- * here, non-PCI-2.3 devices will have to wait until the
- * interrupt is enabled.
+ * Fill the initial masked state based on virq_disabled. After
+ * enable, changing the DisINTx bit in vconfig directly changes INTx
+ * masking. igate prevents races during setup, once running masked
+ * is protected via irqlock.
+ *
+ * Devices supporting DisINTx also reflect the current mask state in
+ * the physical DisINTx bit, which is not affected during IRQ setup.
+ *
+ * Devices without DisINTx support require an exclusive interrupt.
+ * IRQ masking is performed at the IRQ chip. Again, igate protects
+ * against races during setup and IRQ handlers and irqfds are not
+ * yet active, therefore masked is stable and can be used to
+ * conditionally auto-enable the IRQ.
+ *
+ * irq_type must be stable while the IRQ handler is registered,
+ * therefore it must be set before request_irq().
*/
- vdev->ctx[0].masked = vdev->virq_disabled;
- if (vdev->pci_2_3)
- pci_intx(vdev->pdev, !vdev->ctx[0].masked);
+ ctx->masked = vdev->virq_disabled;
+ if (vdev->pci_2_3) {
+ pci_intx(pdev, !ctx->masked);
+ irqflags = IRQF_SHARED;
+ } else {
+ irqflags = ctx->masked ? IRQF_NO_AUTOEN : 0;
+ }
vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX;
+ if (!vdev->pci_2_3)
+ irq_set_status_flags(pdev->irq, IRQ_DISABLE_UNLAZY);
+
+ ret = request_irq(pdev->irq, vfio_intx_handler,
+ irqflags, ctx->name, ctx);
+ if (ret) {
+ if (!vdev->pci_2_3)
+ irq_clear_status_flags(pdev->irq, IRQ_DISABLE_UNLAZY);
+ vdev->irq_type = VFIO_PCI_NUM_IRQS;
+ kfree(name);
+ vfio_irq_ctx_free(vdev, ctx, 0);
+ return ret;
+ }
+
return 0;
}
-static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd)
+static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev,
+ struct eventfd_ctx *trigger)
{
struct pci_dev *pdev = vdev->pdev;
- unsigned long irqflags = IRQF_SHARED;
- struct eventfd_ctx *trigger;
- unsigned long flags;
- int ret;
-
- if (vdev->ctx[0].trigger) {
- free_irq(pdev->irq, vdev);
- kfree(vdev->ctx[0].name);
- eventfd_ctx_put(vdev->ctx[0].trigger);
- vdev->ctx[0].trigger = NULL;
- }
-
- if (fd < 0) /* Disable only */
- return 0;
+ struct vfio_pci_irq_ctx *ctx;
+ struct eventfd_ctx *old;
- vdev->ctx[0].name = kasprintf(GFP_KERNEL, "vfio-intx(%s)",
- pci_name(pdev));
- if (!vdev->ctx[0].name)
- return -ENOMEM;
-
- trigger = eventfd_ctx_fdget(fd);
- if (IS_ERR(trigger)) {
- kfree(vdev->ctx[0].name);
- return PTR_ERR(trigger);
- }
+ ctx = vfio_irq_ctx_get(vdev, 0);
+ if (WARN_ON_ONCE(!ctx))
+ return -EINVAL;
- vdev->ctx[0].trigger = trigger;
+ old = ctx->trigger;
- if (!vdev->pci_2_3)
- irqflags = 0;
+ WRITE_ONCE(ctx->trigger, trigger);
- ret = request_irq(pdev->irq, vfio_intx_handler,
- irqflags, vdev->ctx[0].name, vdev);
- if (ret) {
- vdev->ctx[0].trigger = NULL;
- kfree(vdev->ctx[0].name);
- eventfd_ctx_put(trigger);
- return ret;
+ /* Releasing an old ctx requires synchronizing in-flight users */
+ if (old) {
+ synchronize_irq(pdev->irq);
+ vfio_virqfd_flush_thread(&ctx->unmask);
+ eventfd_ctx_put(old);
}
- /*
- * INTx disable will stick across the new irq setup,
- * disable_irq won't.
- */
- spin_lock_irqsave(&vdev->irqlock, flags);
- if (!vdev->pci_2_3 && vdev->ctx[0].masked)
- disable_irq_nosync(pdev->irq);
- spin_unlock_irqrestore(&vdev->irqlock, flags);
-
return 0;
}
-static void vfio_intx_disable(struct vfio_pci_device *vdev)
+static void vfio_intx_disable(struct vfio_pci_core_device *vdev)
{
- vfio_intx_set_signal(vdev, -1);
- virqfd_disable(vdev, &vdev->ctx[0].unmask);
- virqfd_disable(vdev, &vdev->ctx[0].mask);
+ struct pci_dev *pdev = vdev->pdev;
+ struct vfio_pci_irq_ctx *ctx;
+
+ ctx = vfio_irq_ctx_get(vdev, 0);
+ WARN_ON_ONCE(!ctx);
+ if (ctx) {
+ vfio_virqfd_disable(&ctx->unmask);
+ vfio_virqfd_disable(&ctx->mask);
+ free_irq(pdev->irq, ctx);
+ if (!vdev->pci_2_3)
+ irq_clear_status_flags(pdev->irq, IRQ_DISABLE_UNLAZY);
+ if (ctx->trigger)
+ eventfd_ctx_put(ctx->trigger);
+ kfree(ctx->name);
+ vfio_irq_ctx_free(vdev, ctx, 0);
+ }
vdev->irq_type = VFIO_PCI_NUM_IRQS;
- vdev->num_ctx = 0;
- kfree(vdev->ctx);
}
/*
@@ -456,50 +374,31 @@ static irqreturn_t vfio_msihandler(int irq, void *arg)
{
struct eventfd_ctx *trigger = arg;
- eventfd_signal(trigger, 1);
+ eventfd_signal(trigger);
return IRQ_HANDLED;
}
-static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix)
+static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec, bool msix)
{
struct pci_dev *pdev = vdev->pdev;
+ unsigned int flag = msix ? PCI_IRQ_MSIX : PCI_IRQ_MSI;
int ret;
+ u16 cmd;
if (!is_irq_none(vdev))
return -EINVAL;
- vdev->ctx = kzalloc(nvec * sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL);
- if (!vdev->ctx)
- return -ENOMEM;
-
- if (msix) {
- int i;
-
- vdev->msix = kzalloc(nvec * sizeof(struct msix_entry),
- GFP_KERNEL);
- if (!vdev->msix) {
- kfree(vdev->ctx);
- return -ENOMEM;
- }
-
- for (i = 0; i < nvec; i++)
- vdev->msix[i].entry = i;
-
- ret = pci_enable_msix(pdev, vdev->msix, nvec);
- if (ret) {
- kfree(vdev->msix);
- kfree(vdev->ctx);
- return ret;
- }
- } else {
- ret = pci_enable_msi_block(pdev, nvec);
- if (ret) {
- kfree(vdev->ctx);
- return ret;
- }
+ /* return the number of supported vectors if we can't get all: */
+ cmd = vfio_pci_memory_lock_and_enable(vdev);
+ ret = pci_alloc_irq_vectors(pdev, 1, nvec, flag);
+ if (ret < nvec) {
+ if (ret > 0)
+ pci_free_irq_vectors(pdev);
+ vfio_pci_memory_unlock_and_restore(vdev, cmd);
+ return ret;
}
+ vfio_pci_memory_unlock_and_restore(vdev, cmd);
- vdev->num_ctx = nvec;
vdev->irq_type = msix ? VFIO_PCI_MSIX_IRQ_INDEX :
VFIO_PCI_MSI_IRQ_INDEX;
@@ -514,59 +413,129 @@ static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix)
return 0;
}
-static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
- int vector, int fd, bool msix)
+/*
+ * vfio_msi_alloc_irq() returns the Linux IRQ number of an MSI or MSI-X device
+ * interrupt vector. If a Linux IRQ number is not available then a new
+ * interrupt is allocated if dynamic MSI-X is supported.
+ *
+ * Where is vfio_msi_free_irq()? Allocated interrupts are maintained,
+ * essentially forming a cache that subsequent allocations can draw from.
+ * Interrupts are freed using pci_free_irq_vectors() when MSI/MSI-X is
+ * disabled.
+ */
+static int vfio_msi_alloc_irq(struct vfio_pci_core_device *vdev,
+ unsigned int vector, bool msix)
{
struct pci_dev *pdev = vdev->pdev;
- int irq = msix ? vdev->msix[vector].vector : pdev->irq + vector;
- char *name = msix ? "vfio-msix" : "vfio-msi";
- struct eventfd_ctx *trigger;
- int ret;
+ struct msi_map map;
+ int irq;
+ u16 cmd;
- if (vector >= vdev->num_ctx)
+ irq = pci_irq_vector(pdev, vector);
+ if (WARN_ON_ONCE(irq == 0))
return -EINVAL;
+ if (irq > 0 || !msix || !vdev->has_dyn_msix)
+ return irq;
+
+ cmd = vfio_pci_memory_lock_and_enable(vdev);
+ map = pci_msix_alloc_irq_at(pdev, vector, NULL);
+ vfio_pci_memory_unlock_and_restore(vdev, cmd);
+
+ return map.index < 0 ? map.index : map.virq;
+}
- if (vdev->ctx[vector].trigger) {
- free_irq(irq, vdev->ctx[vector].trigger);
- kfree(vdev->ctx[vector].name);
- eventfd_ctx_put(vdev->ctx[vector].trigger);
- vdev->ctx[vector].trigger = NULL;
+static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev,
+ unsigned int vector, int fd, bool msix)
+{
+ struct pci_dev *pdev = vdev->pdev;
+ struct vfio_pci_irq_ctx *ctx;
+ struct eventfd_ctx *trigger;
+ int irq = -EINVAL, ret;
+ u16 cmd;
+
+ ctx = vfio_irq_ctx_get(vdev, vector);
+
+ if (ctx) {
+ irq_bypass_unregister_producer(&ctx->producer);
+ irq = pci_irq_vector(pdev, vector);
+ cmd = vfio_pci_memory_lock_and_enable(vdev);
+ free_irq(irq, ctx->trigger);
+ vfio_pci_memory_unlock_and_restore(vdev, cmd);
+ /* Interrupt stays allocated, will be freed at MSI-X disable. */
+ kfree(ctx->name);
+ eventfd_ctx_put(ctx->trigger);
+ vfio_irq_ctx_free(vdev, ctx, vector);
}
if (fd < 0)
return 0;
- vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "%s[%d](%s)",
- name, vector, pci_name(pdev));
- if (!vdev->ctx[vector].name)
+ if (irq == -EINVAL) {
+ /* Interrupt stays allocated, will be freed at MSI-X disable. */
+ irq = vfio_msi_alloc_irq(vdev, vector, msix);
+ if (irq < 0)
+ return irq;
+ }
+
+ ctx = vfio_irq_ctx_alloc(vdev, vector);
+ if (!ctx)
return -ENOMEM;
+ ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-msi%s[%d](%s)",
+ msix ? "x" : "", vector, pci_name(pdev));
+ if (!ctx->name) {
+ ret = -ENOMEM;
+ goto out_free_ctx;
+ }
+
trigger = eventfd_ctx_fdget(fd);
if (IS_ERR(trigger)) {
- kfree(vdev->ctx[vector].name);
- return PTR_ERR(trigger);
+ ret = PTR_ERR(trigger);
+ goto out_free_name;
}
- ret = request_irq(irq, vfio_msihandler, 0,
- vdev->ctx[vector].name, trigger);
- if (ret) {
- kfree(vdev->ctx[vector].name);
- eventfd_ctx_put(trigger);
- return ret;
+ /*
+ * If the vector was previously allocated, refresh the on-device
+ * message data before enabling in case it had been cleared or
+ * corrupted (e.g. due to backdoor resets) since writing.
+ */
+ cmd = vfio_pci_memory_lock_and_enable(vdev);
+ if (msix) {
+ struct msi_msg msg;
+
+ get_cached_msi_msg(irq, &msg);
+ pci_write_msi_msg(irq, &msg);
}
- vdev->ctx[vector].trigger = trigger;
+ ret = request_irq(irq, vfio_msihandler, 0, ctx->name, trigger);
+ vfio_pci_memory_unlock_and_restore(vdev, cmd);
+ if (ret)
+ goto out_put_eventfd_ctx;
+
+ ret = irq_bypass_register_producer(&ctx->producer, trigger, irq);
+ if (unlikely(ret)) {
+ dev_info(&pdev->dev,
+ "irq bypass producer (eventfd %p) registration fails: %d\n",
+ trigger, ret);
+ }
+ ctx->trigger = trigger;
return 0;
+
+out_put_eventfd_ctx:
+ eventfd_ctx_put(trigger);
+out_free_name:
+ kfree(ctx->name);
+out_free_ctx:
+ vfio_irq_ctx_free(vdev, ctx, vector);
+ return ret;
}
-static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start,
+static int vfio_msi_set_block(struct vfio_pci_core_device *vdev, unsigned start,
unsigned count, int32_t *fds, bool msix)
{
- int i, j, ret = 0;
-
- if (start + count > vdev->num_ctx)
- return -EINVAL;
+ unsigned int i, j;
+ int ret = 0;
for (i = 0, j = start; i < count && !ret; i++, j++) {
int fd = fds ? fds[i] : -1;
@@ -574,40 +543,44 @@ static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start,
}
if (ret) {
- for (--j; j >= start; j--)
- vfio_msi_set_vector_signal(vdev, j, -1, msix);
+ for (i = start; i < j; i++)
+ vfio_msi_set_vector_signal(vdev, i, -1, msix);
}
return ret;
}
-static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix)
+static void vfio_msi_disable(struct vfio_pci_core_device *vdev, bool msix)
{
struct pci_dev *pdev = vdev->pdev;
- int i;
-
- vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix);
-
- for (i = 0; i < vdev->num_ctx; i++) {
- virqfd_disable(vdev, &vdev->ctx[i].unmask);
- virqfd_disable(vdev, &vdev->ctx[i].mask);
+ struct vfio_pci_irq_ctx *ctx;
+ unsigned long i;
+ u16 cmd;
+
+ xa_for_each(&vdev->ctx, i, ctx) {
+ vfio_virqfd_disable(&ctx->unmask);
+ vfio_virqfd_disable(&ctx->mask);
+ vfio_msi_set_vector_signal(vdev, i, -1, msix);
}
- if (msix) {
- pci_disable_msix(vdev->pdev);
- kfree(vdev->msix);
- } else
- pci_disable_msi(pdev);
+ cmd = vfio_pci_memory_lock_and_enable(vdev);
+ pci_free_irq_vectors(pdev);
+ vfio_pci_memory_unlock_and_restore(vdev, cmd);
+
+ /*
+ * Both disable paths above use pci_intx_for_msi() to clear DisINTx
+ * via their shutdown paths. Restore for NoINTx devices.
+ */
+ if (vdev->nointx)
+ pci_intx(pdev, 0);
vdev->irq_type = VFIO_PCI_NUM_IRQS;
- vdev->num_ctx = 0;
- kfree(vdev->ctx);
}
/*
* IOCTL support
*/
-static int vfio_pci_set_intx_unmask(struct vfio_pci_device *vdev,
+static int vfio_pci_set_intx_unmask(struct vfio_pci_core_device *vdev,
unsigned index, unsigned start,
unsigned count, uint32_t flags, void *data)
{
@@ -615,25 +588,30 @@ static int vfio_pci_set_intx_unmask(struct vfio_pci_device *vdev,
return -EINVAL;
if (flags & VFIO_IRQ_SET_DATA_NONE) {
- vfio_pci_intx_unmask(vdev);
+ __vfio_pci_intx_unmask(vdev);
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
uint8_t unmask = *(uint8_t *)data;
if (unmask)
- vfio_pci_intx_unmask(vdev);
+ __vfio_pci_intx_unmask(vdev);
} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+ struct vfio_pci_irq_ctx *ctx = vfio_irq_ctx_get(vdev, 0);
int32_t fd = *(int32_t *)data;
+
+ if (WARN_ON_ONCE(!ctx))
+ return -EINVAL;
if (fd >= 0)
- return virqfd_enable(vdev, vfio_pci_intx_unmask_handler,
- vfio_send_intx_eventfd, NULL,
- &vdev->ctx[0].unmask, fd);
+ return vfio_virqfd_enable((void *) vdev,
+ vfio_pci_intx_unmask_handler,
+ vfio_send_intx_eventfd, ctx,
+ &ctx->unmask, fd);
- virqfd_disable(vdev, &vdev->ctx[0].unmask);
+ vfio_virqfd_disable(&ctx->unmask);
}
return 0;
}
-static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev,
+static int vfio_pci_set_intx_mask(struct vfio_pci_core_device *vdev,
unsigned index, unsigned start,
unsigned count, uint32_t flags, void *data)
{
@@ -641,11 +619,11 @@ static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev,
return -EINVAL;
if (flags & VFIO_IRQ_SET_DATA_NONE) {
- vfio_pci_intx_mask(vdev);
+ __vfio_pci_intx_mask(vdev);
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
uint8_t mask = *(uint8_t *)data;
if (mask)
- vfio_pci_intx_mask(vdev);
+ __vfio_pci_intx_mask(vdev);
} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
return -ENOTTY; /* XXX implement me */
}
@@ -653,7 +631,7 @@ static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev,
return 0;
}
-static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev,
+static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
unsigned index, unsigned start,
unsigned count, uint32_t flags, void *data)
{
@@ -666,19 +644,23 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev,
return -EINVAL;
if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+ struct eventfd_ctx *trigger = NULL;
int32_t fd = *(int32_t *)data;
int ret;
- if (is_intx(vdev))
- return vfio_intx_set_signal(vdev, fd);
+ if (fd >= 0) {
+ trigger = eventfd_ctx_fdget(fd);
+ if (IS_ERR(trigger))
+ return PTR_ERR(trigger);
+ }
- ret = vfio_intx_enable(vdev);
- if (ret)
- return ret;
+ if (is_intx(vdev))
+ ret = vfio_intx_set_signal(vdev, trigger);
+ else
+ ret = vfio_intx_enable(vdev, trigger);
- ret = vfio_intx_set_signal(vdev, fd);
- if (ret)
- vfio_intx_disable(vdev);
+ if (ret && trigger)
+ eventfd_ctx_put(trigger);
return ret;
}
@@ -687,21 +669,22 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev,
return -EINVAL;
if (flags & VFIO_IRQ_SET_DATA_NONE) {
- vfio_send_intx_eventfd(vdev, NULL);
+ vfio_send_intx_eventfd(vdev, vfio_irq_ctx_get(vdev, 0));
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
uint8_t trigger = *(uint8_t *)data;
if (trigger)
- vfio_send_intx_eventfd(vdev, NULL);
+ vfio_send_intx_eventfd(vdev, vfio_irq_ctx_get(vdev, 0));
}
return 0;
}
-static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev,
+static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
unsigned index, unsigned start,
unsigned count, uint32_t flags, void *data)
{
- int i;
- bool msix = (index == VFIO_PCI_MSIX_IRQ_INDEX) ? true : false;
+ struct vfio_pci_irq_ctx *ctx;
+ unsigned int i;
+ bool msix = (index == VFIO_PCI_MSIX_IRQ_INDEX);
if (irq_is(vdev, index) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
vfio_msi_disable(vdev, msix);
@@ -730,85 +713,120 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev,
return ret;
}
- if (!irq_is(vdev, index) || start + count > vdev->num_ctx)
+ if (!irq_is(vdev, index))
return -EINVAL;
for (i = start; i < start + count; i++) {
- if (!vdev->ctx[i].trigger)
+ ctx = vfio_irq_ctx_get(vdev, i);
+ if (!ctx)
continue;
if (flags & VFIO_IRQ_SET_DATA_NONE) {
- eventfd_signal(vdev->ctx[i].trigger, 1);
+ eventfd_signal(ctx->trigger);
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
uint8_t *bools = data;
if (bools[i - start])
- eventfd_signal(vdev->ctx[i].trigger, 1);
+ eventfd_signal(ctx->trigger);
}
}
return 0;
}
-static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev,
- unsigned index, unsigned start,
- unsigned count, uint32_t flags, void *data)
+static int vfio_pci_set_ctx_trigger_single(struct vfio_pci_core_device *vdev,
+ struct vfio_pci_eventfd __rcu **peventfd,
+ unsigned int count, uint32_t flags,
+ void *data)
{
- int32_t fd = *(int32_t *)data;
- struct pci_dev *pdev = vdev->pdev;
+ /* DATA_NONE/DATA_BOOL enables loopback testing */
+ if (flags & VFIO_IRQ_SET_DATA_NONE) {
+ struct vfio_pci_eventfd *eventfd;
- if ((index != VFIO_PCI_ERR_IRQ_INDEX) ||
- !(flags & VFIO_IRQ_SET_DATA_TYPE_MASK))
- return -EINVAL;
+ eventfd = rcu_dereference_protected(*peventfd,
+ lockdep_is_held(&vdev->igate));
- /*
- * device_lock synchronizes setting and checking of
- * err_trigger. The vfio_pci_aer_err_detected() is also
- * called with device_lock held.
- */
+ if (!eventfd)
+ return -EINVAL;
- /* DATA_NONE/DATA_BOOL enables loopback testing */
+ if (count) {
+ eventfd_signal(eventfd->ctx);
+ return 0;
+ }
- if (flags & VFIO_IRQ_SET_DATA_NONE) {
- device_lock(&pdev->dev);
- if (vdev->err_trigger)
- eventfd_signal(vdev->err_trigger, 1);
- device_unlock(&pdev->dev);
- return 0;
+ return vfio_pci_eventfd_replace_locked(vdev, peventfd, NULL);
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
- uint8_t trigger = *(uint8_t *)data;
- device_lock(&pdev->dev);
- if (trigger && vdev->err_trigger)
- eventfd_signal(vdev->err_trigger, 1);
- device_unlock(&pdev->dev);
+ uint8_t trigger;
+
+ if (!count)
+ return -EINVAL;
+
+ trigger = *(uint8_t *)data;
+
+ if (trigger) {
+ struct vfio_pci_eventfd *eventfd =
+ rcu_dereference_protected(*peventfd,
+ lockdep_is_held(&vdev->igate));
+
+ if (eventfd)
+ eventfd_signal(eventfd->ctx);
+ }
+
return 0;
+ } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+ int32_t fd;
+
+ if (!count)
+ return -EINVAL;
+
+ fd = *(int32_t *)data;
+ if (fd == -1) {
+ return vfio_pci_eventfd_replace_locked(vdev,
+ peventfd, NULL);
+ } else if (fd >= 0) {
+ struct eventfd_ctx *efdctx;
+ int ret;
+
+ efdctx = eventfd_ctx_fdget(fd);
+ if (IS_ERR(efdctx))
+ return PTR_ERR(efdctx);
+
+ ret = vfio_pci_eventfd_replace_locked(vdev,
+ peventfd, efdctx);
+ if (ret)
+ eventfd_ctx_put(efdctx);
+
+ return ret;
+ }
}
- /* Handle SET_DATA_EVENTFD */
+ return -EINVAL;
+}
- if (fd == -1) {
- device_lock(&pdev->dev);
- if (vdev->err_trigger)
- eventfd_ctx_put(vdev->err_trigger);
- vdev->err_trigger = NULL;
- device_unlock(&pdev->dev);
- return 0;
- } else if (fd >= 0) {
- struct eventfd_ctx *efdctx;
- efdctx = eventfd_ctx_fdget(fd);
- if (IS_ERR(efdctx))
- return PTR_ERR(efdctx);
- device_lock(&pdev->dev);
- if (vdev->err_trigger)
- eventfd_ctx_put(vdev->err_trigger);
- vdev->err_trigger = efdctx;
- device_unlock(&pdev->dev);
- return 0;
- } else
+static int vfio_pci_set_err_trigger(struct vfio_pci_core_device *vdev,
+ unsigned index, unsigned start,
+ unsigned count, uint32_t flags, void *data)
+{
+ if (index != VFIO_PCI_ERR_IRQ_INDEX || start != 0 || count > 1)
return -EINVAL;
+
+ return vfio_pci_set_ctx_trigger_single(vdev, &vdev->err_trigger,
+ count, flags, data);
}
-int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
+
+static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev,
+ unsigned index, unsigned start,
+ unsigned count, uint32_t flags, void *data)
+{
+ if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count > 1)
+ return -EINVAL;
+
+ return vfio_pci_set_ctx_trigger_single(vdev, &vdev->req_trigger,
+ count, flags, data);
+}
+
+int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags,
unsigned index, unsigned start, unsigned count,
void *data)
{
- int (*func)(struct vfio_pci_device *vdev, unsigned index,
+ int (*func)(struct vfio_pci_core_device *vdev, unsigned index,
unsigned start, unsigned count, uint32_t flags,
void *data) = NULL;
@@ -845,6 +863,14 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
func = vfio_pci_set_err_trigger;
break;
}
+ break;
+ case VFIO_PCI_REQ_IRQ_INDEX:
+ switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+ case VFIO_IRQ_SET_ACTION_TRIGGER:
+ func = vfio_pci_set_req_trigger;
+ break;
+ }
+ break;
}
if (!func)