diff options
Diffstat (limited to 'fs/userfaultfd.c')
| -rw-r--r-- | fs/userfaultfd.c | 522 |
1 files changed, 218 insertions, 304 deletions
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 17e409ceaa33..c5ba1f4487bd 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -29,14 +29,14 @@ #include <linux/ioctl.h> #include <linux/security.h> #include <linux/hugetlb.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/miscdevice.h> #include <linux/uio.h> static int sysctl_unprivileged_userfaultfd __read_mostly; #ifdef CONFIG_SYSCTL -static struct ctl_table vm_userfaultfd_table[] = { +static const struct ctl_table vm_userfaultfd_table[] = { { .procname = "unprivileged_userfaultfd", .data = &sysctl_unprivileged_userfaultfd, @@ -104,21 +104,6 @@ bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) return ctx->features & UFFD_FEATURE_WP_UNPOPULATED; } -static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, - vm_flags_t flags) -{ - const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; - - vm_flags_reset(vma, flags); - /* - * For shared mappings, we want to enable writenotify while - * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply - * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. - */ - if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) - vma_set_page_prot(vma); -} - static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { @@ -180,14 +165,14 @@ static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) { if (refcount_dec_and_test(&ctx->refcount)) { - VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock)); - VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh)); - VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock)); - VM_BUG_ON(waitqueue_active(&ctx->fault_wqh)); - VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock)); - VM_BUG_ON(waitqueue_active(&ctx->event_wqh)); - VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); - VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); + VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_pending_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_pending_wqh)); + VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_wqh)); + VM_WARN_ON_ONCE(spin_is_locked(&ctx->event_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->event_wqh)); + VM_WARN_ON_ONCE(spin_is_locked(&ctx->fd_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->fd_wqh)); mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); } @@ -248,40 +233,48 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, { struct vm_area_struct *vma = vmf->vma; pte_t *ptep, pte; - bool ret = true; assert_fault_locked(vmf); ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma)); if (!ptep) - goto out; + return true; - ret = false; - pte = huge_ptep_get(ptep); + pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep); /* * Lockless access: we're in a wait_event so it's ok if it - * changes under us. PTE markers should be handled the same as none - * ptes here. + * changes under us. + */ + + /* Entry is still missing, wait for userspace to resolve the fault. */ + if (huge_pte_none(pte)) + return true; + /* UFFD PTE markers require userspace to resolve the fault. */ + if (pte_is_uffd_marker(pte)) + return true; + /* + * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to + * resolve the fault. */ - if (huge_pte_none_mostly(pte)) - ret = true; if (!huge_pte_write(pte) && (reason & VM_UFFD_WP)) - ret = true; -out: - return ret; + return true; + + return false; } #else static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, struct vm_fault *vmf, unsigned long reason) { - return false; /* should never get here */ + /* Should never get here. */ + VM_WARN_ON_ONCE(1); + return false; } #endif /* CONFIG_HUGETLB_PAGE */ /* - * Verify the pagetables are still not ok after having reigstered into + * Verify the pagetables are still not ok after having registered into * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any * userfault that has already been resolved, if userfaultfd_read_iter and * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different @@ -299,53 +292,63 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pmd_t *pmd, _pmd; pte_t *pte; pte_t ptent; - bool ret = true; + bool ret; assert_fault_locked(vmf); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) - goto out; + return true; p4d = p4d_offset(pgd, address); if (!p4d_present(*p4d)) - goto out; + return true; pud = pud_offset(p4d, address); if (!pud_present(*pud)) - goto out; + return true; pmd = pmd_offset(pud, address); again: _pmd = pmdp_get_lockless(pmd); if (pmd_none(_pmd)) - goto out; + return true; - ret = false; - if (!pmd_present(_pmd) || pmd_devmap(_pmd)) - goto out; + /* + * A race could arise which would result in a softleaf entry such as + * migration entry unexpectedly being present in the PMD, so explicitly + * check for this and bail out if so. + */ + if (!pmd_present(_pmd)) + return false; - if (pmd_trans_huge(_pmd)) { - if (!pmd_write(_pmd) && (reason & VM_UFFD_WP)) - ret = true; - goto out; - } + if (pmd_trans_huge(_pmd)) + return !pmd_write(_pmd) && (reason & VM_UFFD_WP); pte = pte_offset_map(pmd, address); - if (!pte) { - ret = true; + if (!pte) goto again; - } + /* * Lockless access: we're in a wait_event so it's ok if it - * changes under us. PTE markers should be handled the same as none - * ptes here. + * changes under us. */ ptent = ptep_get(pte); - if (pte_none_mostly(ptent)) - ret = true; + + ret = true; + /* Entry is still missing, wait for userspace to resolve the fault. */ + if (pte_none(ptent)) + goto out; + /* UFFD PTE markers require userspace to resolve the fault. */ + if (pte_is_uffd_marker(ptent)) + goto out; + /* + * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to + * resolve the fault. + */ if (!pte_write(ptent) && (reason & VM_UFFD_WP)) - ret = true; - pte_unmap(pte); + goto out; + ret = false; out: + pte_unmap(pte); return ret; } @@ -386,15 +389,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) unsigned int blocking_state; /* - * We don't do userfault handling for the final child pid update. - * - * We also don't do userfault handling during - * coredumping. hugetlbfs has the special - * hugetlb_follow_page_mask() to skip missing pages in the - * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with - * the no_page_table() helper in follow_page_mask(), but the - * shmem_vm_ops->fault method is invoked even during - * coredumping and it ends up here. + * We don't do userfault handling for the final child pid update + * and when coredumping (faults triggered by get_dump_page()). */ if (current->flags & (PF_EXITING|PF_DUMPCORE)) goto out; @@ -405,12 +401,12 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) if (!ctx) goto out; - BUG_ON(ctx->mm != mm); + VM_WARN_ON_ONCE(ctx->mm != mm); /* Any unrecognized flag is a bug. */ - VM_BUG_ON(reason & ~__VM_UFFD_FLAGS); + VM_WARN_ON_ONCE(reason & ~__VM_UFFD_FLAGS); /* 0 or > 1 flags set is a bug; we expect exactly 1. */ - VM_BUG_ON(!reason || (reason & (reason - 1))); + VM_WARN_ON_ONCE(!reason || (reason & (reason - 1))); if (ctx->features & UFFD_FEATURE_SIGBUS) goto out; @@ -418,32 +414,6 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) goto out; /* - * If it's already released don't get it. This avoids to loop - * in __get_user_pages if userfaultfd_release waits on the - * caller of handle_userfault to release the mmap_lock. - */ - if (unlikely(READ_ONCE(ctx->released))) { - /* - * Don't return VM_FAULT_SIGBUS in this case, so a non - * cooperative manager can close the uffd after the - * last UFFDIO_COPY, without risking to trigger an - * involuntary SIGBUS if the process was starting the - * userfaultfd while the userfaultfd was still armed - * (but after the last UFFDIO_COPY). If the uffd - * wasn't already closed when the userfault reached - * this point, that would normally be solved by - * userfaultfd_must_wait returning 'false'. - * - * If we were to return VM_FAULT_SIGBUS here, the non - * cooperative manager would be instead forced to - * always call UFFDIO_UNREGISTER before it can safely - * close the uffd. - */ - ret = VM_FAULT_NOPAGE; - goto out; - } - - /* * Check that we can return VM_FAULT_RETRY. * * NOTE: it should become possible to return VM_FAULT_RETRY @@ -459,12 +429,11 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) * to be sure not to return SIGBUS erroneously on * nowait invocations. */ - BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); + VM_WARN_ON_ONCE(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); #ifdef CONFIG_DEBUG_VM if (printk_ratelimit()) { - printk(KERN_WARNING - "FAULT_FLAG_ALLOW_RETRY missing %x\n", - vmf->flags); + pr_warn("FAULT_FLAG_ALLOW_RETRY missing %x\n", + vmf->flags); dump_stack(); } #endif @@ -479,6 +448,31 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) goto out; + if (unlikely(READ_ONCE(ctx->released))) { + /* + * If a concurrent release is detected, do not return + * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always + * return VM_FAULT_RETRY with lock released proactively. + * + * If we were to return VM_FAULT_SIGBUS here, the non + * cooperative manager would be instead forced to + * always call UFFDIO_UNREGISTER before it can safely + * close the uffd, to avoid involuntary SIGBUS triggered. + * + * If we were to return VM_FAULT_NOPAGE, it would work for + * the fault path, in which the lock will be released + * later. However for GUP, faultin_page() does nothing + * special on NOPAGE, so GUP would spin retrying without + * releasing the mmap read lock, causing possible livelock. + * + * Here only VM_FAULT_RETRY would make sure the mmap lock + * be released immediately, so that the thread concurrently + * releasing the userfault would always make progress. + */ + release_fault_lock(vmf); + goto out; + } + /* take the reference before dropping the mmap_lock */ userfaultfd_ctx_get(ctx); @@ -514,12 +508,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) set_current_state(blocking_state); spin_unlock_irq(&ctx->fault_pending_wqh.lock); - if (!is_vm_hugetlb_page(vma)) - must_wait = userfaultfd_must_wait(ctx, vmf, reason); - else + if (is_vm_hugetlb_page(vma)) { must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason); - if (is_vm_hugetlb_page(vma)) hugetlb_vma_unlock_read(vma); + } else { + must_wait = userfaultfd_must_wait(ctx, vmf, reason); + } + release_fault_lock(vmf); if (likely(must_wait && !READ_ONCE(ctx->released))) { @@ -615,22 +610,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, spin_unlock_irq(&ctx->event_wqh.lock); if (release_new_ctx) { - struct vm_area_struct *vma; - struct mm_struct *mm = release_new_ctx->mm; - VMA_ITERATOR(vmi, mm, 0); - - /* the various vma->vm_userfaultfd_ctx still points to it */ - mmap_write_lock(mm); - for_each_vma(vmi, vma) { - if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { - vma_start_write(vma); - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - userfaultfd_set_vm_flags(vma, - vma->vm_flags & ~__VM_UFFD_FLAGS); - } - } - mmap_write_unlock(mm); - + userfaultfd_release_new(release_new_ctx); userfaultfd_ctx_put(release_new_ctx); } @@ -640,7 +620,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, */ out: atomic_dec(&ctx->mmap_changing); - VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0); + VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0); userfaultfd_ctx_put(ctx); } @@ -662,9 +642,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) return 0; if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) { - vma_start_write(vma); - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); + userfaultfd_reset_ctx(vma); return 0; } @@ -731,6 +709,34 @@ void dup_userfaultfd_complete(struct list_head *fcs) } } +void dup_userfaultfd_fail(struct list_head *fcs) +{ + struct userfaultfd_fork_ctx *fctx, *n; + + /* + * An error has occurred on fork, we will tear memory down, but have + * allocated memory for fctx's and raised reference counts for both the + * original and child contexts (and on the mm for each as a result). + * + * These would ordinarily be taken care of by a user handling the event, + * but we are no longer doing so, so manually clean up here. + * + * mm tear down will take care of cleaning up VMA contexts. + */ + list_for_each_entry_safe(fctx, n, fcs, list) { + struct userfaultfd_ctx *octx = fctx->orig; + struct userfaultfd_ctx *ctx = fctx->new; + + atomic_dec(&octx->mmap_changing); + VM_WARN_ON_ONCE(atomic_read(&octx->mmap_changing) < 0); + userfaultfd_ctx_put(octx); + userfaultfd_ctx_put(ctx); + + list_del(&fctx->list); + kfree(fctx); + } +} + void mremap_userfaultfd_prep(struct vm_area_struct *vma, struct vm_userfaultfd_ctx *vm_ctx) { @@ -749,9 +755,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, up_write(&ctx->map_changing_lock); } else { /* Drop uffd context if remap feature not enabled */ - vma_start_write(vma); - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); + userfaultfd_reset_ctx(vma); } } @@ -765,11 +769,6 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, if (!ctx) return; - if (to & ~PAGE_MASK) { - userfaultfd_ctx_put(ctx); - return; - } - msg_init(&ewq.msg); ewq.msg.event = UFFD_EVENT_REMAP; @@ -780,6 +779,16 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, userfaultfd_event_wait_completion(ctx, &ewq); } +void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *vm_ctx) +{ + struct userfaultfd_ctx *ctx = vm_ctx->ctx; + + if (!ctx) + return; + + userfaultfd_ctx_put(ctx); +} + bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end) { @@ -870,53 +879,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; struct mm_struct *mm = ctx->mm; - struct vm_area_struct *vma, *prev; /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; - unsigned long new_flags; - VMA_ITERATOR(vmi, mm, 0); WRITE_ONCE(ctx->released, true); - if (!mmget_not_zero(mm)) - goto wakeup; - - /* - * Flush page faults out of all CPUs. NOTE: all page faults - * must be retried without returning VM_FAULT_SIGBUS if - * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx - * changes while handle_userfault released the mmap_lock. So - * it's critical that released is set to true (above), before - * taking the mmap_lock for writing. - */ - mmap_write_lock(mm); - prev = NULL; - for_each_vma(vmi, vma) { - cond_resched(); - BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ - !!(vma->vm_flags & __VM_UFFD_FLAGS)); - if (vma->vm_userfaultfd_ctx.ctx != ctx) { - prev = vma; - continue; - } - /* Reset ptes for the whole vma range if wr-protected */ - if (userfaultfd_wp(vma)) - uffd_wp_range(vma, vma->vm_start, - vma->vm_end - vma->vm_start, false); - new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start, - vma->vm_end, new_flags, - NULL_VM_UFFD_CTX); - - vma_start_write(vma); - userfaultfd_set_vm_flags(vma, new_flags); - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + userfaultfd_release_all(mm, ctx); - prev = vma; - } - mmap_write_unlock(mm); - mmput(mm); -wakeup: /* * After no new page faults can wait on this fault_*wqh, flush * the last page faults that may have been already waiting on @@ -1293,14 +1262,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long arg) { struct mm_struct *mm = ctx->mm; - struct vm_area_struct *vma, *prev, *cur; + struct vm_area_struct *vma, *cur; int ret; struct uffdio_register uffdio_register; struct uffdio_register __user *user_uffdio_register; - unsigned long vm_flags, new_flags; + vm_flags_t vm_flags; bool found; bool basic_ioctls; - unsigned long start, end, vma_end; + unsigned long start, end; struct vma_iterator vmi; bool wp_async = userfaultfd_wp_async_ctx(ctx); @@ -1320,9 +1289,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) vm_flags |= VM_UFFD_MISSING; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { -#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP - goto out; -#endif + if (!pgtable_supports_uffd_wp()) + goto out; + vm_flags |= VM_UFFD_WP; } if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) { @@ -1371,8 +1340,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ - !!(cur->vm_flags & __VM_UFFD_FLAGS)); + VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^ + !!(cur->vm_flags & __VM_UFFD_FLAGS)); /* check not compatible vmas */ ret = -EINVAL; @@ -1426,59 +1395,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, found = true; } for_each_vma_range(vmi, cur, end); - BUG_ON(!found); - - vma_iter_set(&vmi, start); - prev = vma_prev(&vmi); - if (vma->vm_start < start) - prev = vma; - - ret = 0; - for_each_vma_range(vmi, vma, end) { - cond_resched(); - - BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); - BUG_ON(vma->vm_userfaultfd_ctx.ctx && - vma->vm_userfaultfd_ctx.ctx != ctx); - WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); - - /* - * Nothing to do: this vma is already registered into this - * userfaultfd and with the right tracking mode too. - */ - if (vma->vm_userfaultfd_ctx.ctx == ctx && - (vma->vm_flags & vm_flags) == vm_flags) - goto skip; + VM_WARN_ON_ONCE(!found); - if (vma->vm_start > start) - start = vma->vm_start; - vma_end = min(end, vma->vm_end); - - new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; - vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, - new_flags, - (struct vm_userfaultfd_ctx){ctx}); - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); - break; - } - - /* - * In the vma_merge() successful mprotect-like case 8: - * the next vma was merged into the current one and - * the current one has not been updated yet. - */ - vma_start_write(vma); - userfaultfd_set_vm_flags(vma, new_flags); - vma->vm_userfaultfd_ctx.ctx = ctx; - - if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) - hugetlb_unshare_all_pmds(vma); - - skip: - prev = vma; - start = vma->vm_end; - } + ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end, + wp_async); out_unlock: mmap_write_unlock(mm); @@ -1519,7 +1439,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, struct vm_area_struct *vma, *prev, *cur; int ret; struct uffdio_range uffdio_unregister; - unsigned long new_flags; bool found; unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; @@ -1568,8 +1487,16 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ - !!(cur->vm_flags & __VM_UFFD_FLAGS)); + VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^ + !!(cur->vm_flags & __VM_UFFD_FLAGS)); + + /* + * Prevent unregistering through a different userfaultfd than + * the one used for registration. + */ + if (cur->vm_userfaultfd_ctx.ctx && + cur->vm_userfaultfd_ctx.ctx != ctx) + goto out_unlock; /* * Check not compatible vmas, not strictly required @@ -1583,7 +1510,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, found = true; } for_each_vma_range(vmi, cur, end); - BUG_ON(!found); + VM_WARN_ON_ONCE(!found); vma_iter_set(&vmi, start); prev = vma_prev(&vmi); @@ -1594,16 +1521,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, for_each_vma_range(vmi, vma, end) { cond_resched(); - BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async)); - - /* - * Nothing to do: this vma is already registered into this - * userfaultfd and with the right tracking mode too. - */ + /* VMA not registered with userfaultfd. */ if (!vma->vm_userfaultfd_ctx.ctx) goto skip; - WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); + VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx != ctx); + VM_WARN_ON_ONCE(!vma_can_userfault(vma, vma->vm_flags, wp_async)); + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)); if (vma->vm_start > start) start = vma->vm_start; @@ -1622,27 +1546,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); } - /* Reset ptes for the whole vma range if wr-protected */ - if (userfaultfd_wp(vma)) - uffd_wp_range(vma, start, vma_end - start, false); - - new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, - new_flags, NULL_VM_UFFD_CTX); + vma = userfaultfd_clear_vma(&vmi, prev, vma, + start, vma_end); if (IS_ERR(vma)) { ret = PTR_ERR(vma); break; } - /* - * In the vma_merge() successful mprotect-like case 8: - * the next vma was merged into the current one and - * the current one has not been updated yet. - */ - vma_start_write(vma); - userfaultfd_set_vm_flags(vma, new_flags); - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - skip: prev = vma; start = vma->vm_end; @@ -1682,7 +1592,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx, * len == 0 means wake all and we don't want to wake all here, * so check it again to be sure. */ - VM_BUG_ON(!range.len); + VM_WARN_ON_ONCE(!range.len); wake_userfault(ctx, &range); ret = 0; @@ -1703,8 +1613,11 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, user_uffdio_copy = (struct uffdio_copy __user *) arg; ret = -EAGAIN; - if (atomic_read(&ctx->mmap_changing)) + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_copy->copy))) + return -EFAULT; goto out; + } ret = -EFAULT; if (copy_from_user(&uffdio_copy, user_uffdio_copy, @@ -1736,7 +1649,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, return -EFAULT; if (ret < 0) goto out; - BUG_ON(!ret); + VM_WARN_ON_ONCE(!ret); /* len == 0 would wake all */ range.len = ret; if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) { @@ -1759,8 +1672,11 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; ret = -EAGAIN; - if (atomic_read(&ctx->mmap_changing)) + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) + return -EFAULT; goto out; + } ret = -EFAULT; if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, @@ -1788,7 +1704,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, if (ret < 0) goto out; /* len == 0 would wake all */ - BUG_ON(!ret); + VM_WARN_ON_ONCE(!ret); range.len = ret; if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) { range.start = uffdio_zeropage.range.start; @@ -1862,8 +1778,11 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) user_uffdio_continue = (struct uffdio_continue __user *)arg; ret = -EAGAIN; - if (atomic_read(&ctx->mmap_changing)) + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) + return -EFAULT; goto out; + } ret = -EFAULT; if (copy_from_user(&uffdio_continue, user_uffdio_continue, @@ -1897,7 +1816,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) goto out; /* len == 0 would wake all */ - BUG_ON(!ret); + VM_WARN_ON_ONCE(!ret); range.len = ret; if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) { range.start = uffdio_continue.range.start; @@ -1919,8 +1838,11 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long user_uffdio_poison = (struct uffdio_poison __user *)arg; ret = -EAGAIN; - if (atomic_read(&ctx->mmap_changing)) + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_poison->updated))) + return -EFAULT; goto out; + } ret = -EFAULT; if (copy_from_user(&uffdio_poison, user_uffdio_poison, @@ -1951,7 +1873,7 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long goto out; /* len == 0 would wake all */ - BUG_ON(!ret); + VM_WARN_ON_ONCE(!ret); range.len = ret; if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) { range.start = uffdio_poison.range.start; @@ -1988,8 +1910,12 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx, user_uffdio_move = (struct uffdio_move __user *) arg; - if (atomic_read(&ctx->mmap_changing)) - return -EAGAIN; + ret = -EAGAIN; + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_move->move))) + return -EFAULT; + goto out; + } if (copy_from_user(&uffdio_move, user_uffdio_move, /* don't copy "move" last field */ @@ -2073,14 +1999,14 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, uffdio_api.features &= ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); #endif -#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP - uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; -#endif -#ifndef CONFIG_PTE_MARKER_UFFD_WP - uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; - uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; - uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; -#endif + if (!pgtable_supports_uffd_wp()) + uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; + + if (!uffd_supports_wp_marker()) { + uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; + uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; + uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; + } ret = -EINVAL; if (features & ~uffdio_api.features) @@ -2204,16 +2130,12 @@ static void init_once_userfaultfd_ctx(void *mem) static int new_userfaultfd(int flags) { - struct userfaultfd_ctx *ctx; - struct file *file; - int fd; + struct userfaultfd_ctx *ctx __free(kfree) = NULL; - BUG_ON(!current->mm); + VM_WARN_ON_ONCE(!current->mm); /* Check the UFFD_* constants for consistency. */ BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS); - BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC); - BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK); if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY)) return -EINVAL; @@ -2230,26 +2152,18 @@ static int new_userfaultfd(int flags) atomic_set(&ctx->mmap_changing, 0); ctx->mm = current->mm; - fd = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS); - if (fd < 0) - goto err_out; + FD_PREPARE(fdf, flags & UFFD_SHARED_FCNTL_FLAGS, + anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx, + O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), + NULL)); + if (fdf.err) + return fdf.err; - /* Create a new inode so that the LSM can block the creation. */ - file = anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx, - O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); - if (IS_ERR(file)) { - put_unused_fd(fd); - fd = PTR_ERR(file); - goto err_out; - } /* prevent the mm struct to be freed */ mmgrab(ctx->mm); - file->f_mode |= FMODE_NOWAIT; - fd_install(fd, file); - return fd; -err_out: - kmem_cache_free(userfaultfd_ctx_cachep, ctx); - return fd; + fd_prepare_file(fdf)->f_mode |= FMODE_NOWAIT; + retain_and_null_ptr(ctx); + return fd_publish(fdf); } static inline bool userfaultfd_syscall_allowed(int flags) |
