diff options
Diffstat (limited to 'drivers/dma/idxd/cdev.c')
| -rw-r--r-- | drivers/dma/idxd/cdev.c | 540 |
1 files changed, 497 insertions, 43 deletions
diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index e9def577c697..7e4715f92773 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -6,13 +6,14 @@ #include <linux/pci.h> #include <linux/device.h> #include <linux/sched/task.h> -#include <linux/intel-svm.h> #include <linux/io-64-nonatomic-lo-hi.h> #include <linux/cdev.h> #include <linux/fs.h> #include <linux/poll.h> #include <linux/iommu.h> +#include <linux/highmem.h> #include <uapi/linux/idxd.h> +#include <linux/xarray.h> #include "registers.h" #include "idxd.h" @@ -23,6 +24,12 @@ struct idxd_cdev_context { }; /* + * Since user file names are global in DSA devices, define their ida's as + * global to avoid conflict file names. + */ +static DEFINE_IDA(file_ida); + +/* * ictx is an array based off of accelerator types. enum idxd_type * is used as index */ @@ -35,22 +42,131 @@ struct idxd_user_context { struct idxd_wq *wq; struct task_struct *task; unsigned int pasid; + struct mm_struct *mm; unsigned int flags; struct iommu_sva *sva; + struct idxd_dev idxd_dev; + u64 counters[COUNTER_MAX]; + int id; + pid_t pid; +}; + +static void idxd_cdev_evl_drain_pasid(struct idxd_wq *wq, u32 pasid); +static void idxd_xa_pasid_remove(struct idxd_user_context *ctx); + +static inline struct idxd_user_context *dev_to_uctx(struct device *dev) +{ + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return container_of(idxd_dev, struct idxd_user_context, idxd_dev); +} + +static ssize_t cr_faults_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct idxd_user_context *ctx = dev_to_uctx(dev); + + return sysfs_emit(buf, "%llu\n", ctx->counters[COUNTER_FAULTS]); +} +static DEVICE_ATTR_RO(cr_faults); + +static ssize_t cr_fault_failures_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_user_context *ctx = dev_to_uctx(dev); + + return sysfs_emit(buf, "%llu\n", ctx->counters[COUNTER_FAULT_FAILS]); +} +static DEVICE_ATTR_RO(cr_fault_failures); + +static ssize_t pid_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct idxd_user_context *ctx = dev_to_uctx(dev); + + return sysfs_emit(buf, "%u\n", ctx->pid); +} +static DEVICE_ATTR_RO(pid); + +static struct attribute *cdev_file_attributes[] = { + &dev_attr_cr_faults.attr, + &dev_attr_cr_fault_failures.attr, + &dev_attr_pid.attr, + NULL +}; + +static umode_t cdev_file_attr_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + struct idxd_user_context *ctx = dev_to_uctx(dev); + struct idxd_wq *wq = ctx->wq; + + if (!wq_pasid_enabled(wq)) + return 0; + + return a->mode; +} + +static const struct attribute_group cdev_file_attribute_group = { + .attrs = cdev_file_attributes, + .is_visible = cdev_file_attr_visible, +}; + +static const struct attribute_group *cdev_file_attribute_groups[] = { + &cdev_file_attribute_group, + NULL +}; + +static void idxd_file_dev_release(struct device *dev) +{ + struct idxd_user_context *ctx = dev_to_uctx(dev); + struct idxd_wq *wq = ctx->wq; + struct idxd_device *idxd = wq->idxd; + int rc; + + ida_free(&file_ida, ctx->id); + + /* Wait for in-flight operations to complete. */ + if (wq_shared(wq)) { + idxd_device_drain_pasid(idxd, ctx->pasid); + } else { + if (device_user_pasid_enabled(idxd)) { + /* The wq disable in the disable pasid function will drain the wq */ + rc = idxd_wq_disable_pasid(wq); + if (rc < 0) + dev_err(dev, "wq disable pasid failed.\n"); + } else { + idxd_wq_drain(wq); + } + } + + if (ctx->sva) { + idxd_cdev_evl_drain_pasid(wq, ctx->pasid); + iommu_sva_unbind_device(ctx->sva); + idxd_xa_pasid_remove(ctx); + } + kfree(ctx); + mutex_lock(&wq->wq_lock); + idxd_wq_put(wq); + mutex_unlock(&wq->wq_lock); +} + +static const struct device_type idxd_cdev_file_type = { + .name = "idxd_file", + .release = idxd_file_dev_release, + .groups = cdev_file_attribute_groups, }; static void idxd_cdev_dev_release(struct device *dev) { - struct idxd_cdev *idxd_cdev = container_of(dev, struct idxd_cdev, dev); + struct idxd_cdev *idxd_cdev = dev_to_cdev(dev); struct idxd_cdev_context *cdev_ctx; struct idxd_wq *wq = idxd_cdev->wq; cdev_ctx = &ictx[wq->idxd->data->type]; - ida_simple_remove(&cdev_ctx->minor_ida, idxd_cdev->minor); + ida_free(&cdev_ctx->minor_ida, idxd_cdev->minor); kfree(idxd_cdev); } -static struct device_type idxd_cdev_device_type = { +static const struct device_type idxd_cdev_device_type = { .name = "idxd_cdev", .release = idxd_cdev_dev_release, }; @@ -69,15 +185,46 @@ static inline struct idxd_wq *inode_wq(struct inode *inode) return idxd_cdev->wq; } +static void idxd_xa_pasid_remove(struct idxd_user_context *ctx) +{ + struct idxd_wq *wq = ctx->wq; + void *ptr; + + mutex_lock(&wq->uc_lock); + ptr = xa_cmpxchg(&wq->upasid_xa, ctx->pasid, ctx, NULL, GFP_KERNEL); + if (ptr != (void *)ctx) + dev_warn(&wq->idxd->pdev->dev, "xarray cmpxchg failed for pasid %u\n", + ctx->pasid); + mutex_unlock(&wq->uc_lock); +} + +void idxd_user_counter_increment(struct idxd_wq *wq, u32 pasid, int index) +{ + struct idxd_user_context *ctx; + + if (index >= COUNTER_MAX) + return; + + mutex_lock(&wq->uc_lock); + ctx = xa_load(&wq->upasid_xa, pasid); + if (!ctx) { + mutex_unlock(&wq->uc_lock); + return; + } + ctx->counters[index]++; + mutex_unlock(&wq->uc_lock); +} + static int idxd_cdev_open(struct inode *inode, struct file *filp) { struct idxd_user_context *ctx; struct idxd_device *idxd; struct idxd_wq *wq; - struct device *dev; + struct device *dev, *fdev; int rc = 0; - struct iommu_sva *sva; + struct iommu_sva *sva = NULL; unsigned int pasid; + struct idxd_cdev *idxd_cdev; wq = inode_wq(inode); idxd = wq->idxd; @@ -98,9 +245,10 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) ctx->wq = wq; filp->private_data = ctx; + ctx->pid = current->pid; - if (device_pasid_enabled(idxd)) { - sva = iommu_sva_bind_device(dev, current->mm, NULL); + if (device_user_pasid_enabled(idxd)) { + sva = iommu_sva_bind_device(dev, current->mm); if (IS_ERR(sva)) { rc = PTR_ERR(sva); dev_err(dev, "pasid allocation failed: %d\n", rc); @@ -109,65 +257,116 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) pasid = iommu_sva_get_pasid(sva); if (pasid == IOMMU_PASID_INVALID) { - iommu_sva_unbind_device(sva); rc = -EINVAL; - goto failed; + goto failed_get_pasid; } ctx->sva = sva; ctx->pasid = pasid; + ctx->mm = current->mm; + + mutex_lock(&wq->uc_lock); + rc = xa_insert(&wq->upasid_xa, pasid, ctx, GFP_KERNEL); + mutex_unlock(&wq->uc_lock); + if (rc < 0) + dev_warn(dev, "PASID entry already exist in xarray.\n"); if (wq_dedicated(wq)) { rc = idxd_wq_set_pasid(wq, pasid); if (rc < 0) { - iommu_sva_unbind_device(sva); dev_err(dev, "wq set pasid failed: %d\n", rc); - goto failed; + goto failed_set_pasid; } } } + idxd_cdev = wq->idxd_cdev; + ctx->id = ida_alloc(&file_ida, GFP_KERNEL); + if (ctx->id < 0) { + dev_warn(dev, "ida alloc failure\n"); + goto failed_ida; + } + ctx->idxd_dev.type = IDXD_DEV_CDEV_FILE; + fdev = user_ctx_dev(ctx); + device_initialize(fdev); + fdev->parent = cdev_dev(idxd_cdev); + fdev->bus = &dsa_bus_type; + fdev->type = &idxd_cdev_file_type; + + rc = dev_set_name(fdev, "file%d", ctx->id); + if (rc < 0) { + dev_warn(dev, "set name failure\n"); + goto failed_dev_name; + } + + rc = device_add(fdev); + if (rc < 0) { + dev_warn(dev, "file device add failure\n"); + goto failed_dev_add; + } + idxd_wq_get(wq); mutex_unlock(&wq->wq_lock); return 0; - failed: +failed_dev_add: +failed_dev_name: + put_device(fdev); +failed_ida: +failed_set_pasid: + if (device_user_pasid_enabled(idxd)) + idxd_xa_pasid_remove(ctx); +failed_get_pasid: + if (device_user_pasid_enabled(idxd) && !IS_ERR_OR_NULL(sva)) + iommu_sva_unbind_device(sva); +failed: mutex_unlock(&wq->wq_lock); kfree(ctx); return rc; } +static void idxd_cdev_evl_drain_pasid(struct idxd_wq *wq, u32 pasid) +{ + struct idxd_device *idxd = wq->idxd; + struct idxd_evl *evl = idxd->evl; + union evl_status_reg status; + u16 h, t, size; + int ent_size = evl_ent_size(idxd); + struct __evl_entry *entry_head; + + if (!evl) + return; + + mutex_lock(&evl->lock); + status.bits = ioread64(idxd->reg_base + IDXD_EVLSTATUS_OFFSET); + t = status.tail; + h = status.head; + size = evl->size; + + while (h != t) { + entry_head = (struct __evl_entry *)(evl->log + (h * ent_size)); + if (entry_head->pasid == pasid && entry_head->wq_idx == wq->id) + set_bit(h, evl->bmap); + h = (h + 1) % size; + } + if (wq->wq) + drain_workqueue(wq->wq); + + mutex_unlock(&evl->lock); +} + static int idxd_cdev_release(struct inode *node, struct file *filep) { struct idxd_user_context *ctx = filep->private_data; struct idxd_wq *wq = ctx->wq; struct idxd_device *idxd = wq->idxd; struct device *dev = &idxd->pdev->dev; - int rc; dev_dbg(dev, "%s called\n", __func__); filep->private_data = NULL; - /* Wait for in-flight operations to complete. */ - if (wq_shared(wq)) { - idxd_device_drain_pasid(idxd, ctx->pasid); - } else { - if (device_pasid_enabled(idxd)) { - /* The wq disable in the disable pasid function will drain the wq */ - rc = idxd_wq_disable_pasid(wq); - if (rc < 0) - dev_err(dev, "wq disable pasid failed.\n"); - } else { - idxd_wq_drain(wq); - } - } + device_unregister(user_ctx_dev(ctx)); - if (ctx->sva) - iommu_sva_unbind_device(ctx->sva); - kfree(ctx); - mutex_lock(&wq->wq_lock); - idxd_wq_put(wq); - mutex_unlock(&wq->wq_lock); return 0; } @@ -198,11 +397,26 @@ static int idxd_cdev_mmap(struct file *filp, struct vm_area_struct *vma) int rc; dev_dbg(&pdev->dev, "%s called\n", __func__); + + /* + * Due to an erratum in some of the devices supported by the driver, + * direct user submission to the device can be unsafe. + * (See the INTEL-SA-01084 security advisory) + * + * For the devices that exhibit this behavior, require that the user + * has CAP_SYS_RAWIO capabilities. + */ + if (!idxd->user_submission_safe && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + if (current->mm != ctx->mm) + return -EPERM; + rc = check_vma(wq, vma, __func__); if (rc < 0) return rc; - vma->vm_flags |= VM_DONTCOPY; + vm_flags_set(vma, VM_DONTCOPY); pfn = (base + idxd_get_wq_portal_full_offset(wq->id, IDXD_PORTAL_LIMITED)) >> PAGE_SHIFT; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); @@ -212,20 +426,91 @@ static int idxd_cdev_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_page_prot); } +static int idxd_submit_user_descriptor(struct idxd_user_context *ctx, + struct dsa_hw_desc __user *udesc) +{ + struct idxd_wq *wq = ctx->wq; + struct idxd_dev *idxd_dev = &wq->idxd->idxd_dev; + const uint64_t comp_addr_align = is_dsa_dev(idxd_dev) ? 0x20 : 0x40; + void __iomem *portal = idxd_wq_portal_addr(wq); + struct dsa_hw_desc descriptor __aligned(64); + int rc; + + rc = copy_from_user(&descriptor, udesc, sizeof(descriptor)); + if (rc) + return -EFAULT; + + /* + * DSA devices are capable of indirect ("batch") command submission. + * On devices where direct user submissions are not safe, we cannot + * allow this since there is no good way for us to verify these + * indirect commands. Narrow the restriction of operations with the + * BATCH opcode to only DSA version 1 devices. + */ + if (is_dsa_dev(idxd_dev) && descriptor.opcode == DSA_OPCODE_BATCH && + wq->idxd->hw.version == DEVICE_VERSION_1 && + !wq->idxd->user_submission_safe) + return -EINVAL; + /* + * As per the programming specification, the completion address must be + * aligned to 32 or 64 bytes. If this is violated the hardware + * engine can get very confused (security issue). + */ + if (!IS_ALIGNED(descriptor.completion_addr, comp_addr_align)) + return -EINVAL; + + if (wq_dedicated(wq)) + iosubmit_cmds512(portal, &descriptor, 1); + else { + descriptor.priv = 0; + descriptor.pasid = ctx->pasid; + rc = idxd_enqcmds(wq, portal, &descriptor); + if (rc < 0) + return rc; + } + + return 0; +} + +static ssize_t idxd_cdev_write(struct file *filp, const char __user *buf, size_t len, + loff_t *unused) +{ + struct dsa_hw_desc __user *udesc = (struct dsa_hw_desc __user *)buf; + struct idxd_user_context *ctx = filp->private_data; + ssize_t written = 0; + int i; + + if (current->mm != ctx->mm) + return -EPERM; + + for (i = 0; i < len/sizeof(struct dsa_hw_desc); i++) { + int rc = idxd_submit_user_descriptor(ctx, udesc + i); + + if (rc) + return written ? written : rc; + + written += sizeof(struct dsa_hw_desc); + } + + return written; +} + static __poll_t idxd_cdev_poll(struct file *filp, struct poll_table_struct *wait) { struct idxd_user_context *ctx = filp->private_data; struct idxd_wq *wq = ctx->wq; struct idxd_device *idxd = wq->idxd; - unsigned long flags; __poll_t out = 0; + if (current->mm != ctx->mm) + return POLLNVAL; + poll_wait(filp, &wq->err_queue, wait); - spin_lock_irqsave(&idxd->dev_lock, flags); + spin_lock(&idxd->dev_lock); if (idxd->sw_err.valid) out = EPOLLIN | EPOLLRDNORM; - spin_unlock_irqrestore(&idxd->dev_lock, flags); + spin_unlock(&idxd->dev_lock); return out; } @@ -235,6 +520,7 @@ static const struct file_operations idxd_cdev_fops = { .open = idxd_cdev_open, .release = idxd_cdev_release, .mmap = idxd_cdev_mmap, + .write = idxd_cdev_write, .poll = idxd_cdev_poll, }; @@ -256,11 +542,12 @@ int idxd_wq_add_cdev(struct idxd_wq *wq) if (!idxd_cdev) return -ENOMEM; + idxd_cdev->idxd_dev.type = IDXD_DEV_CDEV; idxd_cdev->wq = wq; cdev = &idxd_cdev->cdev; - dev = &idxd_cdev->dev; + dev = cdev_dev(idxd_cdev); cdev_ctx = &ictx[wq->idxd->data->type]; - minor = ida_simple_get(&cdev_ctx->minor_ida, 0, MINORMASK, GFP_KERNEL); + minor = ida_alloc_max(&cdev_ctx->minor_ida, MINORMASK, GFP_KERNEL); if (minor < 0) { kfree(idxd_cdev); return minor; @@ -268,7 +555,7 @@ int idxd_wq_add_cdev(struct idxd_wq *wq) idxd_cdev->minor = minor; device_initialize(dev); - dev->parent = &wq->conf_dev; + dev->parent = wq_confdev(wq); dev->bus = &dsa_bus_type; dev->type = &idxd_cdev_device_type; dev->devt = MKDEV(MAJOR(cdev_ctx->devt), minor); @@ -299,10 +586,104 @@ void idxd_wq_del_cdev(struct idxd_wq *wq) idxd_cdev = wq->idxd_cdev; wq->idxd_cdev = NULL; - cdev_device_del(&idxd_cdev->cdev, &idxd_cdev->dev); - put_device(&idxd_cdev->dev); + cdev_device_del(&idxd_cdev->cdev, cdev_dev(idxd_cdev)); + put_device(cdev_dev(idxd_cdev)); } +static int idxd_user_drv_probe(struct idxd_dev *idxd_dev) +{ + struct device *dev = &idxd_dev->conf_dev; + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + struct idxd_device *idxd = wq->idxd; + int rc; + + if (idxd->state != IDXD_DEV_ENABLED) + return -ENXIO; + + mutex_lock(&wq->wq_lock); + + if (!idxd_wq_driver_name_match(wq, dev)) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_DRV_NAME; + rc = -ENODEV; + goto wq_err; + } + + /* + * User type WQ is enabled only when SVA is enabled for two reasons: + * - If no IOMMU or IOMMU Passthrough without SVA, userspace + * can directly access physical address through the WQ. + * - The IDXD cdev driver does not provide any ways to pin + * user pages and translate the address from user VA to IOVA or + * PA without IOMMU SVA. Therefore the application has no way + * to instruct the device to perform DMA function. This makes + * the cdev not usable for normal application usage. + */ + if (!device_user_pasid_enabled(idxd)) { + idxd->cmd_status = IDXD_SCMD_WQ_USER_NO_IOMMU; + dev_dbg(&idxd->pdev->dev, + "User type WQ cannot be enabled without SVA.\n"); + + rc = -EOPNOTSUPP; + goto wq_err; + } + + wq->wq = create_workqueue(dev_name(wq_confdev(wq))); + if (!wq->wq) { + rc = -ENOMEM; + goto wq_err; + } + + wq->type = IDXD_WQT_USER; + rc = idxd_drv_enable_wq(wq); + if (rc < 0) + goto err; + + rc = idxd_wq_add_cdev(wq); + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_CDEV_ERR; + goto err_cdev; + } + + idxd->cmd_status = 0; + mutex_unlock(&wq->wq_lock); + return 0; + +err_cdev: + idxd_drv_disable_wq(wq); +err: + destroy_workqueue(wq->wq); + wq->type = IDXD_WQT_NONE; +wq_err: + mutex_unlock(&wq->wq_lock); + return rc; +} + +static void idxd_user_drv_remove(struct idxd_dev *idxd_dev) +{ + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + + mutex_lock(&wq->wq_lock); + idxd_wq_del_cdev(wq); + idxd_drv_disable_wq(wq); + wq->type = IDXD_WQT_NONE; + destroy_workqueue(wq->wq); + wq->wq = NULL; + mutex_unlock(&wq->wq_lock); +} + +static enum idxd_dev_type dev_types[] = { + IDXD_DEV_WQ, + IDXD_DEV_NONE, +}; + +struct idxd_device_driver idxd_user_drv = { + .probe = idxd_user_drv_probe, + .remove = idxd_user_drv_remove, + .name = "user", + .type = dev_types, +}; +EXPORT_SYMBOL_GPL(idxd_user_drv); + int idxd_cdev_register(void) { int rc, i; @@ -312,10 +693,16 @@ int idxd_cdev_register(void) rc = alloc_chrdev_region(&ictx[i].devt, 0, MINORMASK, ictx[i].name); if (rc) - return rc; + goto err_free_chrdev_region; } return 0; + +err_free_chrdev_region: + for (i--; i >= 0; i--) + unregister_chrdev_region(ictx[i].devt, MINORMASK); + + return rc; } void idxd_cdev_remove(void) @@ -327,3 +714,70 @@ void idxd_cdev_remove(void) ida_destroy(&ictx[i].minor_ida); } } + +/** + * idxd_copy_cr - copy completion record to user address space found by wq and + * PASID + * @wq: work queue + * @pasid: PASID + * @addr: user fault address to write + * @cr: completion record + * @len: number of bytes to copy + * + * This is called by a work that handles completion record fault. + * + * Return: number of bytes copied. + */ +int idxd_copy_cr(struct idxd_wq *wq, ioasid_t pasid, unsigned long addr, + void *cr, int len) +{ + struct device *dev = &wq->idxd->pdev->dev; + int left = len, status_size = 1; + struct idxd_user_context *ctx; + struct mm_struct *mm; + + mutex_lock(&wq->uc_lock); + + ctx = xa_load(&wq->upasid_xa, pasid); + if (!ctx) { + dev_warn(dev, "No user context\n"); + goto out; + } + + mm = ctx->mm; + /* + * The completion record fault handling work is running in kernel + * thread context. It temporarily switches to the mm to copy cr + * to addr in the mm. + */ + kthread_use_mm(mm); + left = copy_to_user((void __user *)addr + status_size, cr + status_size, + len - status_size); + /* + * Copy status only after the rest of completion record is copied + * successfully so that the user gets the complete completion record + * when a non-zero status is polled. + */ + if (!left) { + u8 status; + + /* + * Ensure that the completion record's status field is written + * after the rest of the completion record has been written. + * This ensures that the user receives the correct completion + * record information once polling for a non-zero status. + */ + wmb(); + status = *(u8 *)cr; + if (put_user(status, (u8 __user *)addr)) + left += status_size; + } else { + left += status_size; + } + kthread_unuse_mm(mm); + +out: + mutex_unlock(&wq->uc_lock); + + return len - left; +} |
