Diffstat (limited to 'drivers/dax/device.c')
-rw-r--r--	drivers/dax/device.c	324
1 file changed, 168 insertions, 156 deletions
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 4c0af2eb7e19..22999a402e02 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -4,7 +4,6 @@
 #include <linux/pagemap.h>
 #include <linux/module.h>
 #include <linux/device.h>
-#include <linux/pfn_t.h>
 #include <linux/cdev.h>
 #include <linux/slab.h>
 #include <linux/dax.h>
@@ -14,10 +13,10 @@
 #include "dax-private.h"
 #include "bus.h"
 
-static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
-		const char *func)
+static int __check_vma(struct dev_dax *dev_dax, vm_flags_t vm_flags,
+		unsigned long start, unsigned long end, struct file *file,
+		const char *func)
 {
-	struct dax_region *dax_region = dev_dax->region;
 	struct device *dev = &dev_dax->dev;
 	unsigned long mask;
 
@@ -25,31 +24,23 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
 		return -ENXIO;
 
 	/* prevent private mappings from being established */
-	if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
+	if ((vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
 		dev_info_ratelimited(dev,
 				"%s: %s: fail, attempted private mapping\n",
 				current->comm, func);
 		return -EINVAL;
 	}
 
-	mask = dax_region->align - 1;
-	if (vma->vm_start & mask || vma->vm_end & mask) {
+	mask = dev_dax->align - 1;
+	if (start & mask || end & mask) {
 		dev_info_ratelimited(dev,
 				"%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
-				current->comm, func, vma->vm_start, vma->vm_end,
+				current->comm, func, start, end,
 				mask);
 		return -EINVAL;
 	}
 
-	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
-			&& (vma->vm_flags & VM_DONTCOPY) == 0) {
-		dev_info_ratelimited(dev,
-				"%s: %s: fail, dax range requires MADV_DONTFORK\n",
-				current->comm, func);
-		return -EINVAL;
-	}
-
-	if (!vma_is_dax(vma)) {
+	if (!file_is_dax(file)) {
 		dev_info_ratelimited(dev,
 				"%s: %s: fail, vma is not DAX capable\n",
 				current->comm, func);
@@ -59,41 +50,80 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
 	return 0;
 }
 
+static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
+		const char *func)
+{
+	return __check_vma(dev_dax, vma->vm_flags, vma->vm_start, vma->vm_end,
+			   vma->vm_file, func);
+}
+
 /* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */
 __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
 		unsigned long size)
 {
-	struct resource *res = &dev_dax->region->res;
-	phys_addr_t phys;
-
-	phys = pgoff * PAGE_SIZE + res->start;
-	if (phys >= res->start && phys <= res->end) {
-		if (phys + size - 1 <= res->end)
+	int i;
+
+	for (i = 0; i < dev_dax->nr_range; i++) {
+		struct dev_dax_range *dax_range = &dev_dax->ranges[i];
+		struct range *range = &dax_range->range;
+		unsigned long long pgoff_end;
+		phys_addr_t phys;
+
+		pgoff_end = dax_range->pgoff + PHYS_PFN(range_len(range)) - 1;
+		if (pgoff < dax_range->pgoff || pgoff > pgoff_end)
+			continue;
+		phys = PFN_PHYS(pgoff - dax_range->pgoff) + range->start;
+		if (phys + size - 1 <= range->end)
 			return phys;
+		break;
 	}
-
 	return -1;
 }
 
+static void dax_set_mapping(struct vm_fault *vmf, unsigned long pfn,
+			    unsigned long fault_size)
+{
+	unsigned long i, nr_pages = fault_size / PAGE_SIZE;
+	struct file *filp = vmf->vma->vm_file;
+	struct dev_dax *dev_dax = filp->private_data;
+	pgoff_t pgoff;
+
+	/* mapping is only set on the head */
+	if (dev_dax->pgmap->vmemmap_shift)
+		nr_pages = 1;
+
+	pgoff = linear_page_index(vmf->vma,
+			ALIGN_DOWN(vmf->address, fault_size));
+
+	for (i = 0; i < nr_pages; i++) {
+		struct folio *folio = pfn_folio(pfn + i);
+
+		if (folio->mapping)
+			continue;
+
+		folio->mapping = filp->f_mapping;
+		folio->index = pgoff + i;
+	}
+}
+
 static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
-				struct vm_fault *vmf, pfn_t *pfn)
+				struct vm_fault *vmf)
 {
 	struct device *dev = &dev_dax->dev;
-	struct dax_region *dax_region;
 	phys_addr_t phys;
+	unsigned long pfn;
 	unsigned int fault_size = PAGE_SIZE;
 
 	if (check_vma(dev_dax, vmf->vma, __func__))
 		return VM_FAULT_SIGBUS;
 
-	dax_region = dev_dax->region;
-	if (dax_region->align > PAGE_SIZE) {
+	if (dev_dax->align > PAGE_SIZE) {
 		dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
-			dax_region->align, fault_size);
+			dev_dax->align, fault_size);
 		return VM_FAULT_SIGBUS;
 	}
 
-	if (fault_size != dax_region->align)
+	if (fault_size != dev_dax->align)
 		return VM_FAULT_SIGBUS;
 
 	phys = dax_pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE);
@@ -102,40 +132,36 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
 		return VM_FAULT_SIGBUS;
 	}
 
-	*pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+	pfn = PHYS_PFN(phys);
 
-	return vmf_insert_mixed(vmf->vma, vmf->address, *pfn);
+	dax_set_mapping(vmf, pfn, fault_size);
+
+	return vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn),
+				       vmf->flags & FAULT_FLAG_WRITE);
 }
 
 static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
-				struct vm_fault *vmf, pfn_t *pfn)
+				struct vm_fault *vmf)
 {
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	struct device *dev = &dev_dax->dev;
-	struct dax_region *dax_region;
 	phys_addr_t phys;
 	pgoff_t pgoff;
+	unsigned long pfn;
 	unsigned int fault_size = PMD_SIZE;
 
 	if (check_vma(dev_dax, vmf->vma, __func__))
 		return VM_FAULT_SIGBUS;
 
-	dax_region = dev_dax->region;
-	if (dax_region->align > PMD_SIZE) {
+	if (dev_dax->align > PMD_SIZE) {
 		dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
-			dax_region->align, fault_size);
+			dev_dax->align, fault_size);
 		return VM_FAULT_SIGBUS;
 	}
 
-	/* dax pmd mappings require pfn_t_devmap() */
-	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
-		dev_dbg(dev, "region lacks devmap flags\n");
+	if (fault_size < dev_dax->align)
 		return VM_FAULT_SIGBUS;
-	}
-
-	if (fault_size < dax_region->align)
-		return VM_FAULT_SIGBUS;
-	else if (fault_size > dax_region->align)
+	else if (fault_size > dev_dax->align)
 		return VM_FAULT_FALLBACK;
 
 	/* if we are outside of the VMA */
@@ -150,42 +176,38 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
 		return VM_FAULT_SIGBUS;
 	}
 
-	*pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+	pfn = PHYS_PFN(phys);
+
+	dax_set_mapping(vmf, pfn, fault_size);
 
-	return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE);
+	return vmf_insert_folio_pmd(vmf, page_folio(pfn_to_page(pfn)),
+				    vmf->flags & FAULT_FLAG_WRITE);
 }
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
-				struct vm_fault *vmf, pfn_t *pfn)
+				struct vm_fault *vmf)
 {
 	unsigned long pud_addr = vmf->address & PUD_MASK;
 	struct device *dev = &dev_dax->dev;
-	struct dax_region *dax_region;
 	phys_addr_t phys;
 	pgoff_t pgoff;
+	unsigned long pfn;
 	unsigned int fault_size = PUD_SIZE;
 
 	if (check_vma(dev_dax, vmf->vma, __func__))
 		return VM_FAULT_SIGBUS;
 
-	dax_region = dev_dax->region;
-	if (dax_region->align > PUD_SIZE) {
+	if (dev_dax->align > PUD_SIZE) {
 		dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
-			dax_region->align, fault_size);
+			dev_dax->align, fault_size);
 		return VM_FAULT_SIGBUS;
 	}
 
-	/* dax pud mappings require pfn_t_devmap() */
-	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
-		dev_dbg(dev, "region lacks devmap flags\n");
+	if (fault_size < dev_dax->align)
 		return VM_FAULT_SIGBUS;
-	}
-
-	if (fault_size < dax_region->align)
-		return VM_FAULT_SIGBUS;
-	else if (fault_size > dax_region->align)
+	else if (fault_size > dev_dax->align)
 		return VM_FAULT_FALLBACK;
 
 	/* if we are outside of the VMA */
@@ -200,72 +222,42 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
 		return VM_FAULT_SIGBUS;
 	}
 
-	*pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+	pfn = PHYS_PFN(phys);
+
+	dax_set_mapping(vmf, pfn, fault_size);
 
-	return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE);
+	return vmf_insert_folio_pud(vmf, page_folio(pfn_to_page(pfn)),
+				    vmf->flags & FAULT_FLAG_WRITE);
 }
 #else
 static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
-				struct vm_fault *vmf, pfn_t *pfn)
+				struct vm_fault *vmf)
 {
 	return VM_FAULT_FALLBACK;
 }
 #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
-static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
-		enum page_entry_size pe_size)
+static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, unsigned int order)
 {
 	struct file *filp = vmf->vma->vm_file;
-	unsigned long fault_size;
 	vm_fault_t rc = VM_FAULT_SIGBUS;
 	int id;
-	pfn_t pfn;
 	struct dev_dax *dev_dax = filp->private_data;
 
-	dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm,
-			(vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read",
-			vmf->vma->vm_start, vmf->vma->vm_end, pe_size);
+	dev_dbg(&dev_dax->dev, "%s: op=%s addr=%#lx order=%d\n", current->comm,
+		(vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read",
+		vmf->address & ~((1UL << (order + PAGE_SHIFT)) - 1), order);
 
 	id = dax_read_lock();
-	switch (pe_size) {
-	case PE_SIZE_PTE:
-		fault_size = PAGE_SIZE;
-		rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn);
-		break;
-	case PE_SIZE_PMD:
-		fault_size = PMD_SIZE;
-		rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn);
-		break;
-	case PE_SIZE_PUD:
-		fault_size = PUD_SIZE;
-		rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn);
-		break;
-	default:
+	if (order == 0)
+		rc = __dev_dax_pte_fault(dev_dax, vmf);
+	else if (order == PMD_ORDER)
+		rc = __dev_dax_pmd_fault(dev_dax, vmf);
+	else if (order == PUD_ORDER)
+		rc = __dev_dax_pud_fault(dev_dax, vmf);
+	else
 		rc = VM_FAULT_SIGBUS;
-	}
 
-	if (rc == VM_FAULT_NOPAGE) {
-		unsigned long i;
-		pgoff_t pgoff;
-
-		/*
-		 * In the device-dax case the only possibility for a
-		 * VM_FAULT_NOPAGE result is when device-dax capacity is
-		 * mapped. No need to consider the zero page, or racing
-		 * conflicting mappings.
-		 */
-		pgoff = linear_page_index(vmf->vma, vmf->address
-				& ~(fault_size - 1));
-		for (i = 0; i < fault_size / PAGE_SIZE; i++) {
-			struct page *page;
-
-			page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
-			if (page->mapping)
-				continue;
-			page->mapping = filp->f_mapping;
-			page->index = pgoff + i;
-		}
-	}
 	dax_read_unlock(id);
 
 	return rc;
@@ -273,16 +265,15 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
 
 static vm_fault_t dev_dax_fault(struct vm_fault *vmf)
 {
-	return dev_dax_huge_fault(vmf, PE_SIZE_PTE);
+	return dev_dax_huge_fault(vmf, 0);
 }
 
-static int dev_dax_split(struct vm_area_struct *vma, unsigned long addr)
+static int dev_dax_may_split(struct vm_area_struct *vma, unsigned long addr)
 {
 	struct file *filp = vma->vm_file;
 	struct dev_dax *dev_dax = filp->private_data;
-	struct dax_region *dax_region = dev_dax->region;
 
-	if (!IS_ALIGNED(addr, dax_region->align))
+	if (!IS_ALIGNED(addr, dev_dax->align))
 		return -EINVAL;
 	return 0;
 }
@@ -291,20 +282,20 @@ static unsigned long dev_dax_pagesize(struct vm_area_struct *vma)
 {
 	struct file *filp = vma->vm_file;
 	struct dev_dax *dev_dax = filp->private_data;
-	struct dax_region *dax_region = dev_dax->region;
 
-	return dax_region->align;
+	return dev_dax->align;
 }
 
 static const struct vm_operations_struct dax_vm_ops = {
 	.fault = dev_dax_fault,
 	.huge_fault = dev_dax_huge_fault,
-	.split = dev_dax_split,
+	.may_split = dev_dax_may_split,
 	.pagesize = dev_dax_pagesize,
 };
 
-static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
+static int dax_mmap_prepare(struct vm_area_desc *desc)
 {
+	struct file *filp = desc->file;
 	struct dev_dax *dev_dax = filp->private_data;
 	int rc, id;
 
@@ -315,13 +306,14 @@ static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
 	 * fault time.
 	 */
 	id = dax_read_lock();
-	rc = check_vma(dev_dax, vma, __func__);
+	rc = __check_vma(dev_dax, desc->vm_flags, desc->start, desc->end, filp,
+			 __func__);
 	dax_read_unlock(id);
 	if (rc)
 		return rc;
 
-	vma->vm_ops = &dax_vm_ops;
-	vma->vm_flags |= VM_HUGEPAGE;
+	desc->vm_ops = &dax_vm_ops;
+	desc->vm_flags |= VM_HUGEPAGE;
 	return 0;
 }
 
@@ -332,13 +324,11 @@ static unsigned long dax_get_unmapped_area(struct file *filp,
 {
 	unsigned long off, off_end, off_align, len_align, addr_align, align;
 	struct dev_dax *dev_dax = filp ? filp->private_data : NULL;
-	struct dax_region *dax_region;
 
 	if (!dev_dax || addr)
 		goto out;
 
-	dax_region = dev_dax->region;
-	align = dax_region->align;
+	align = dev_dax->align;
 	off = pgoff << PAGE_SHIFT;
 	off_end = off + len;
 	off_align = round_up(off, align);
@@ -350,19 +340,17 @@ static unsigned long dax_get_unmapped_area(struct file *filp,
 	if ((off + len_align) < off)
 		goto out;
 
-	addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
-			pgoff, flags);
+	addr_align = mm_get_unmapped_area(filp, addr, len_align, pgoff, flags);
 	if (!IS_ERR_VALUE(addr_align)) {
 		addr_align += (off - addr_align) & (align - 1);
 		return addr_align;
 	}
 out:
-	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+	return mm_get_unmapped_area(filp, addr, len, pgoff, flags);
 }
 
 static const struct address_space_operations dev_dax_aops = {
-	.set_page_dirty		= noop_set_page_dirty,
-	.invalidatepage		= noop_invalidatepage,
+	.dirty_folio		= noop_dirty_folio,
 };
 
 static int dax_open(struct inode *inode, struct file *filp)
@@ -398,8 +386,8 @@ static const struct file_operations dax_fops = {
 	.open = dax_open,
 	.release = dax_release,
 	.get_unmapped_area = dax_get_unmapped_area,
-	.mmap = dax_mmap,
-	.mmap_supported_flags = MAP_SYNC,
+	.mmap_prepare = dax_mmap_prepare,
+	.fop_flags = FOP_MMAP_SYNC,
 };
 
 static void dev_dax_cdev_del(void *cdev)
@@ -412,36 +400,69 @@ static void dev_dax_kill(void *dev_dax)
 	kill_dev_dax(dev_dax);
 }
 
-int dev_dax_probe(struct device *dev)
+static int dev_dax_probe(struct dev_dax *dev_dax)
 {
-	struct dev_dax *dev_dax = to_dev_dax(dev);
 	struct dax_device *dax_dev = dev_dax->dax_dev;
-	struct resource *res = &dev_dax->region->res;
+	struct device *dev = &dev_dax->dev;
+	struct dev_pagemap *pgmap;
 	struct inode *inode;
 	struct cdev *cdev;
 	void *addr;
-	int rc;
+	int rc, i;
+
+	if (static_dev_dax(dev_dax)) {
+		if (dev_dax->nr_range > 1) {
+			dev_warn(dev,
+				"static pgmap / multi-range device conflict\n");
+			return -EINVAL;
+		}
+
+		pgmap = dev_dax->pgmap;
+	} else {
+		if (dev_dax->pgmap) {
+			dev_warn(dev,
+				 "dynamic-dax with pre-populated page map\n");
+			return -EINVAL;
+		}
+
+		pgmap = devm_kzalloc(dev,
+			struct_size(pgmap, ranges, dev_dax->nr_range - 1),
+			GFP_KERNEL);
+		if (!pgmap)
+			return -ENOMEM;
+
+		pgmap->nr_range = dev_dax->nr_range;
+		dev_dax->pgmap = pgmap;
+
+		for (i = 0; i < dev_dax->nr_range; i++) {
+			struct range *range = &dev_dax->ranges[i].range;
+			pgmap->ranges[i] = *range;
+		}
+	}
+
+	for (i = 0; i < dev_dax->nr_range; i++) {
+		struct range *range = &dev_dax->ranges[i].range;
 
-	/* 1:1 map region resource range to device-dax instance range */
-	if (!devm_request_mem_region(dev, res->start, resource_size(res),
-				dev_name(dev))) {
-		dev_warn(dev, "could not reserve region %pR\n", res);
-		return -EBUSY;
+		if (!devm_request_mem_region(dev, range->start,
					range_len(range), dev_name(dev))) {
+			dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n",
+					i, range->start, range->end);
+			return -EBUSY;
+		}
 	}
 
-	dev_dax->pgmap.type = MEMORY_DEVICE_DEVDAX;
-	addr = devm_memremap_pages(dev, &dev_dax->pgmap);
+	pgmap->type = MEMORY_DEVICE_GENERIC;
+	if (dev_dax->align > PAGE_SIZE)
+		pgmap->vmemmap_shift =
+			order_base_2(dev_dax->align >> PAGE_SHIFT);
+	addr = devm_memremap_pages(dev, pgmap);
 	if (IS_ERR(addr))
 		return PTR_ERR(addr);
 
 	inode = dax_inode(dax_dev);
 	cdev = inode->i_cdev;
 	cdev_init(cdev, &dax_fops);
-	if (dev->class) {
-		/* for the CONFIG_DEV_DAX_PMEM_COMPAT case */
-		cdev->owner = dev->parent->driver->owner;
-	} else
-		cdev->owner = dev->driver->owner;
+	cdev->owner = dev->driver->owner;
 	cdev_set_parent(cdev, &dev->kobj);
 	rc = cdev_add(cdev, dev->devt, 1);
 	if (rc)
@@ -454,20 +475,10 @@ int dev_dax_probe(struct device *dev)
 	run_dax(dax_dev);
 	return devm_add_action_or_reset(dev, dev_dax_kill, dev_dax);
 }
-EXPORT_SYMBOL_GPL(dev_dax_probe);
-
-static int dev_dax_remove(struct device *dev)
-{
-	/* all probe actions are unwound by devm */
-	return 0;
-}
 
 static struct dax_device_driver device_dax_driver = {
-	.drv = {
-		.probe = dev_dax_probe,
-		.remove = dev_dax_remove,
-	},
-	.match_always = 1,
+	.probe = dev_dax_probe,
+	.type = DAXDRV_DEVICE_TYPE,
 };
 
 static int __init dax_init(void)
@@ -481,6 +492,7 @@ static void __exit dax_exit(void)
 }
 
 MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("Device DAX: direct access device driver");
 MODULE_LICENSE("GPL v2");
 module_init(dax_init);
 module_exit(dax_exit);
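
Note on the dax_pgoff_to_phys() hunk above: the old code translated a page offset against a single region resource, while the new code walks the instance's set of ranges and translates within whichever range contains the offset. The following self-contained userspace sketch models that lookup; the structure layout and the MODEL_PFN_PHYS()/MODEL_PHYS_PFN() helpers are simplified stand-ins for the kernel's, and the example ranges are made up.

/*
 * Userspace model of the multi-range lookup in dax_pgoff_to_phys();
 * names mirror the kernel's, types are simplified stand-ins.
 */
#include <stdint.h>
#include <stdio.h>

#define MODEL_PAGE_SHIFT 12
#define MODEL_PFN_PHYS(pfn)  ((uint64_t)(pfn) << MODEL_PAGE_SHIFT)
#define MODEL_PHYS_PFN(phys) ((uint64_t)(phys) >> MODEL_PAGE_SHIFT)

struct model_range { uint64_t start, end; };	/* end is inclusive */

struct model_dax_range {
	uint64_t pgoff;			/* device pgoff where this range begins */
	struct model_range range;	/* physical span backing those pages */
};

static uint64_t model_range_len(const struct model_range *r)
{
	return r->end - r->start + 1;
}

/* Mirrors the loop in dax_pgoff_to_phys(); (uint64_t)-1 means "not mapped". */
static uint64_t model_pgoff_to_phys(const struct model_dax_range *ranges,
				    int nr_range, uint64_t pgoff, uint64_t size)
{
	for (int i = 0; i < nr_range; i++) {
		const struct model_dax_range *dr = &ranges[i];
		uint64_t pgoff_end = dr->pgoff +
			MODEL_PHYS_PFN(model_range_len(&dr->range)) - 1;

		if (pgoff < dr->pgoff || pgoff > pgoff_end)
			continue;	/* offset lives in some other range */

		uint64_t phys = MODEL_PFN_PHYS(pgoff - dr->pgoff) +
			dr->range.start;
		if (phys + size - 1 <= dr->range.end)
			return phys;	/* request fits entirely in the range */
		break;			/* straddles the range end: fail */
	}
	return (uint64_t)-1;
}

int main(void)
{
	/* Two discontiguous 2 MiB physical ranges, one linear device. */
	struct model_dax_range ranges[] = {
		{ .pgoff = 0,   .range = { 0x100000000ULL, 0x1001fffffULL } },
		{ .pgoff = 512, .range = { 0x200000000ULL, 0x2001fffffULL } },
	};

	/* pgoff 513 is one page into the second range: prints 0x200001000. */
	printf("%#llx\n", (unsigned long long)
	       model_pgoff_to_phys(ranges, 2, 513, 4096));
	return 0;
}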
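Likewise, the fixup retained in dax_get_unmapped_area() — addr_align += (off - addr_align) & (align - 1); — nudges the address returned by the core mm so that the virtual address and the file offset are congruent modulo the device alignment, which is what lets PMD/PUD faults line up later. A minimal model with made-up values (the kernel over-allocates len by align beforehand so the nudged address still fits in the window):

/* Userspace model of the alignment fixup kept in dax_get_unmapped_area(). */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t align = 1ULL << 21;		/* 2 MiB device alignment */
	uint64_t off   = 5ULL << 20;		/* mmap file offset: 5 MiB */
	uint64_t addr  = 0x7f0000000000ULL;	/* address the core mm picked */

	/* The fixup from the patch: make addr and off congruent mod align. */
	uint64_t addr_align = addr + ((off - addr) & (align - 1));

	/* A huge fault at addr_align now maps an align-congruent offset. */
	assert((addr_align & (align - 1)) == (off & (align - 1)));
	printf("%#llx\n", (unsigned long long)addr_align);
	return 0;
}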
