summaryrefslogtreecommitdiff
path: root/fs/userfaultfd.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/userfaultfd.c')
-rw-r--r--fs/userfaultfd.c522
1 files changed, 218 insertions, 304 deletions
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 17e409ceaa33..c5ba1f4487bd 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -29,14 +29,14 @@
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/miscdevice.h>
#include <linux/uio.h>
static int sysctl_unprivileged_userfaultfd __read_mostly;
#ifdef CONFIG_SYSCTL
-static struct ctl_table vm_userfaultfd_table[] = {
+static const struct ctl_table vm_userfaultfd_table[] = {
{
.procname = "unprivileged_userfaultfd",
.data = &sysctl_unprivileged_userfaultfd,
@@ -104,21 +104,6 @@ bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
}
-static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
- vm_flags_t flags)
-{
- const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
-
- vm_flags_reset(vma, flags);
- /*
- * For shared mappings, we want to enable writenotify while
- * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
- * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
- */
- if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
- vma_set_page_prot(vma);
-}
-
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
int wake_flags, void *key)
{
@@ -180,14 +165,14 @@ static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
if (refcount_dec_and_test(&ctx->refcount)) {
- VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
- VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
- VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
- VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
- VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
- VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
- VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
- VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
+ VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_pending_wqh.lock));
+ VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_pending_wqh));
+ VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_wqh.lock));
+ VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_wqh));
+ VM_WARN_ON_ONCE(spin_is_locked(&ctx->event_wqh.lock));
+ VM_WARN_ON_ONCE(waitqueue_active(&ctx->event_wqh));
+ VM_WARN_ON_ONCE(spin_is_locked(&ctx->fd_wqh.lock));
+ VM_WARN_ON_ONCE(waitqueue_active(&ctx->fd_wqh));
mmdrop(ctx->mm);
kmem_cache_free(userfaultfd_ctx_cachep, ctx);
}
@@ -248,40 +233,48 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
{
struct vm_area_struct *vma = vmf->vma;
pte_t *ptep, pte;
- bool ret = true;
assert_fault_locked(vmf);
ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
if (!ptep)
- goto out;
+ return true;
- ret = false;
- pte = huge_ptep_get(ptep);
+ pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep);
/*
* Lockless access: we're in a wait_event so it's ok if it
- * changes under us. PTE markers should be handled the same as none
- * ptes here.
+ * changes under us.
+ */
+
+ /* Entry is still missing, wait for userspace to resolve the fault. */
+ if (huge_pte_none(pte))
+ return true;
+ /* UFFD PTE markers require userspace to resolve the fault. */
+ if (pte_is_uffd_marker(pte))
+ return true;
+ /*
+ * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to
+ * resolve the fault.
*/
- if (huge_pte_none_mostly(pte))
- ret = true;
if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
- ret = true;
-out:
- return ret;
+ return true;
+
+ return false;
}
#else
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
struct vm_fault *vmf,
unsigned long reason)
{
- return false; /* should never get here */
+ /* Should never get here. */
+ VM_WARN_ON_ONCE(1);
+ return false;
}
#endif /* CONFIG_HUGETLB_PAGE */
/*
- * Verify the pagetables are still not ok after having reigstered into
+ * Verify the pagetables are still not ok after having registered into
* the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
* userfault that has already been resolved, if userfaultfd_read_iter and
* UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
@@ -299,53 +292,63 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
pmd_t *pmd, _pmd;
pte_t *pte;
pte_t ptent;
- bool ret = true;
+ bool ret;
assert_fault_locked(vmf);
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
- goto out;
+ return true;
p4d = p4d_offset(pgd, address);
if (!p4d_present(*p4d))
- goto out;
+ return true;
pud = pud_offset(p4d, address);
if (!pud_present(*pud))
- goto out;
+ return true;
pmd = pmd_offset(pud, address);
again:
_pmd = pmdp_get_lockless(pmd);
if (pmd_none(_pmd))
- goto out;
+ return true;
- ret = false;
- if (!pmd_present(_pmd) || pmd_devmap(_pmd))
- goto out;
+ /*
+ * A race could arise which would result in a softleaf entry such as
+ * migration entry unexpectedly being present in the PMD, so explicitly
+ * check for this and bail out if so.
+ */
+ if (!pmd_present(_pmd))
+ return false;
- if (pmd_trans_huge(_pmd)) {
- if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
- ret = true;
- goto out;
- }
+ if (pmd_trans_huge(_pmd))
+ return !pmd_write(_pmd) && (reason & VM_UFFD_WP);
pte = pte_offset_map(pmd, address);
- if (!pte) {
- ret = true;
+ if (!pte)
goto again;
- }
+
/*
* Lockless access: we're in a wait_event so it's ok if it
- * changes under us. PTE markers should be handled the same as none
- * ptes here.
+ * changes under us.
*/
ptent = ptep_get(pte);
- if (pte_none_mostly(ptent))
- ret = true;
+
+ ret = true;
+ /* Entry is still missing, wait for userspace to resolve the fault. */
+ if (pte_none(ptent))
+ goto out;
+ /* UFFD PTE markers require userspace to resolve the fault. */
+ if (pte_is_uffd_marker(ptent))
+ goto out;
+ /*
+ * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to
+ * resolve the fault.
+ */
if (!pte_write(ptent) && (reason & VM_UFFD_WP))
- ret = true;
- pte_unmap(pte);
+ goto out;
+ ret = false;
out:
+ pte_unmap(pte);
return ret;
}
@@ -386,15 +389,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
unsigned int blocking_state;
/*
- * We don't do userfault handling for the final child pid update.
- *
- * We also don't do userfault handling during
- * coredumping. hugetlbfs has the special
- * hugetlb_follow_page_mask() to skip missing pages in the
- * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
- * the no_page_table() helper in follow_page_mask(), but the
- * shmem_vm_ops->fault method is invoked even during
- * coredumping and it ends up here.
+ * We don't do userfault handling for the final child pid update
+ * and when coredumping (faults triggered by get_dump_page()).
*/
if (current->flags & (PF_EXITING|PF_DUMPCORE))
goto out;
@@ -405,12 +401,12 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
if (!ctx)
goto out;
- BUG_ON(ctx->mm != mm);
+ VM_WARN_ON_ONCE(ctx->mm != mm);
/* Any unrecognized flag is a bug. */
- VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
+ VM_WARN_ON_ONCE(reason & ~__VM_UFFD_FLAGS);
/* 0 or > 1 flags set is a bug; we expect exactly 1. */
- VM_BUG_ON(!reason || (reason & (reason - 1)));
+ VM_WARN_ON_ONCE(!reason || (reason & (reason - 1)));
if (ctx->features & UFFD_FEATURE_SIGBUS)
goto out;
@@ -418,32 +414,6 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
goto out;
/*
- * If it's already released don't get it. This avoids to loop
- * in __get_user_pages if userfaultfd_release waits on the
- * caller of handle_userfault to release the mmap_lock.
- */
- if (unlikely(READ_ONCE(ctx->released))) {
- /*
- * Don't return VM_FAULT_SIGBUS in this case, so a non
- * cooperative manager can close the uffd after the
- * last UFFDIO_COPY, without risking to trigger an
- * involuntary SIGBUS if the process was starting the
- * userfaultfd while the userfaultfd was still armed
- * (but after the last UFFDIO_COPY). If the uffd
- * wasn't already closed when the userfault reached
- * this point, that would normally be solved by
- * userfaultfd_must_wait returning 'false'.
- *
- * If we were to return VM_FAULT_SIGBUS here, the non
- * cooperative manager would be instead forced to
- * always call UFFDIO_UNREGISTER before it can safely
- * close the uffd.
- */
- ret = VM_FAULT_NOPAGE;
- goto out;
- }
-
- /*
* Check that we can return VM_FAULT_RETRY.
*
* NOTE: it should become possible to return VM_FAULT_RETRY
@@ -459,12 +429,11 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
* to be sure not to return SIGBUS erroneously on
* nowait invocations.
*/
- BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
+ VM_WARN_ON_ONCE(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
if (printk_ratelimit()) {
- printk(KERN_WARNING
- "FAULT_FLAG_ALLOW_RETRY missing %x\n",
- vmf->flags);
+ pr_warn("FAULT_FLAG_ALLOW_RETRY missing %x\n",
+ vmf->flags);
dump_stack();
}
#endif
@@ -479,6 +448,31 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
goto out;
+ if (unlikely(READ_ONCE(ctx->released))) {
+ /*
+ * If a concurrent release is detected, do not return
+ * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always
+ * return VM_FAULT_RETRY with lock released proactively.
+ *
+ * If we were to return VM_FAULT_SIGBUS here, the non
+ * cooperative manager would be instead forced to
+ * always call UFFDIO_UNREGISTER before it can safely
+ * close the uffd, to avoid involuntary SIGBUS triggered.
+ *
+ * If we were to return VM_FAULT_NOPAGE, it would work for
+ * the fault path, in which the lock will be released
+ * later. However for GUP, faultin_page() does nothing
+ * special on NOPAGE, so GUP would spin retrying without
+ * releasing the mmap read lock, causing possible livelock.
+ *
+ * Here only VM_FAULT_RETRY would make sure the mmap lock
+ * be released immediately, so that the thread concurrently
+ * releasing the userfault would always make progress.
+ */
+ release_fault_lock(vmf);
+ goto out;
+ }
+
/* take the reference before dropping the mmap_lock */
userfaultfd_ctx_get(ctx);
@@ -514,12 +508,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
set_current_state(blocking_state);
spin_unlock_irq(&ctx->fault_pending_wqh.lock);
- if (!is_vm_hugetlb_page(vma))
- must_wait = userfaultfd_must_wait(ctx, vmf, reason);
- else
+ if (is_vm_hugetlb_page(vma)) {
must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
- if (is_vm_hugetlb_page(vma))
hugetlb_vma_unlock_read(vma);
+ } else {
+ must_wait = userfaultfd_must_wait(ctx, vmf, reason);
+ }
+
release_fault_lock(vmf);
if (likely(must_wait && !READ_ONCE(ctx->released))) {
@@ -615,22 +610,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
spin_unlock_irq(&ctx->event_wqh.lock);
if (release_new_ctx) {
- struct vm_area_struct *vma;
- struct mm_struct *mm = release_new_ctx->mm;
- VMA_ITERATOR(vmi, mm, 0);
-
- /* the various vma->vm_userfaultfd_ctx still points to it */
- mmap_write_lock(mm);
- for_each_vma(vmi, vma) {
- if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
- vma_start_write(vma);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- userfaultfd_set_vm_flags(vma,
- vma->vm_flags & ~__VM_UFFD_FLAGS);
- }
- }
- mmap_write_unlock(mm);
-
+ userfaultfd_release_new(release_new_ctx);
userfaultfd_ctx_put(release_new_ctx);
}
@@ -640,7 +620,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
*/
out:
atomic_dec(&ctx->mmap_changing);
- VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
+ VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0);
userfaultfd_ctx_put(ctx);
}
@@ -662,9 +642,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
return 0;
if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) {
- vma_start_write(vma);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
+ userfaultfd_reset_ctx(vma);
return 0;
}
@@ -731,6 +709,34 @@ void dup_userfaultfd_complete(struct list_head *fcs)
}
}
+void dup_userfaultfd_fail(struct list_head *fcs)
+{
+ struct userfaultfd_fork_ctx *fctx, *n;
+
+ /*
+ * An error has occurred on fork, we will tear memory down, but have
+ * allocated memory for fctx's and raised reference counts for both the
+ * original and child contexts (and on the mm for each as a result).
+ *
+ * These would ordinarily be taken care of by a user handling the event,
+ * but we are no longer doing so, so manually clean up here.
+ *
+ * mm tear down will take care of cleaning up VMA contexts.
+ */
+ list_for_each_entry_safe(fctx, n, fcs, list) {
+ struct userfaultfd_ctx *octx = fctx->orig;
+ struct userfaultfd_ctx *ctx = fctx->new;
+
+ atomic_dec(&octx->mmap_changing);
+ VM_WARN_ON_ONCE(atomic_read(&octx->mmap_changing) < 0);
+ userfaultfd_ctx_put(octx);
+ userfaultfd_ctx_put(ctx);
+
+ list_del(&fctx->list);
+ kfree(fctx);
+ }
+}
+
void mremap_userfaultfd_prep(struct vm_area_struct *vma,
struct vm_userfaultfd_ctx *vm_ctx)
{
@@ -749,9 +755,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
up_write(&ctx->map_changing_lock);
} else {
/* Drop uffd context if remap feature not enabled */
- vma_start_write(vma);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
+ userfaultfd_reset_ctx(vma);
}
}
@@ -765,11 +769,6 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
if (!ctx)
return;
- if (to & ~PAGE_MASK) {
- userfaultfd_ctx_put(ctx);
- return;
- }
-
msg_init(&ewq.msg);
ewq.msg.event = UFFD_EVENT_REMAP;
@@ -780,6 +779,16 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
userfaultfd_event_wait_completion(ctx, &ewq);
}
+void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *vm_ctx)
+{
+ struct userfaultfd_ctx *ctx = vm_ctx->ctx;
+
+ if (!ctx)
+ return;
+
+ userfaultfd_ctx_put(ctx);
+}
+
bool userfaultfd_remove(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
@@ -870,53 +879,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
{
struct userfaultfd_ctx *ctx = file->private_data;
struct mm_struct *mm = ctx->mm;
- struct vm_area_struct *vma, *prev;
/* len == 0 means wake all */
struct userfaultfd_wake_range range = { .len = 0, };
- unsigned long new_flags;
- VMA_ITERATOR(vmi, mm, 0);
WRITE_ONCE(ctx->released, true);
- if (!mmget_not_zero(mm))
- goto wakeup;
-
- /*
- * Flush page faults out of all CPUs. NOTE: all page faults
- * must be retried without returning VM_FAULT_SIGBUS if
- * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
- * changes while handle_userfault released the mmap_lock. So
- * it's critical that released is set to true (above), before
- * taking the mmap_lock for writing.
- */
- mmap_write_lock(mm);
- prev = NULL;
- for_each_vma(vmi, vma) {
- cond_resched();
- BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
- !!(vma->vm_flags & __VM_UFFD_FLAGS));
- if (vma->vm_userfaultfd_ctx.ctx != ctx) {
- prev = vma;
- continue;
- }
- /* Reset ptes for the whole vma range if wr-protected */
- if (userfaultfd_wp(vma))
- uffd_wp_range(vma, vma->vm_start,
- vma->vm_end - vma->vm_start, false);
- new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
- vma->vm_end, new_flags,
- NULL_VM_UFFD_CTX);
-
- vma_start_write(vma);
- userfaultfd_set_vm_flags(vma, new_flags);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ userfaultfd_release_all(mm, ctx);
- prev = vma;
- }
- mmap_write_unlock(mm);
- mmput(mm);
-wakeup:
/*
* After no new page faults can wait on this fault_*wqh, flush
* the last page faults that may have been already waiting on
@@ -1293,14 +1262,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
unsigned long arg)
{
struct mm_struct *mm = ctx->mm;
- struct vm_area_struct *vma, *prev, *cur;
+ struct vm_area_struct *vma, *cur;
int ret;
struct uffdio_register uffdio_register;
struct uffdio_register __user *user_uffdio_register;
- unsigned long vm_flags, new_flags;
+ vm_flags_t vm_flags;
bool found;
bool basic_ioctls;
- unsigned long start, end, vma_end;
+ unsigned long start, end;
struct vma_iterator vmi;
bool wp_async = userfaultfd_wp_async_ctx(ctx);
@@ -1320,9 +1289,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
vm_flags |= VM_UFFD_MISSING;
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
-#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
- goto out;
-#endif
+ if (!pgtable_supports_uffd_wp())
+ goto out;
+
vm_flags |= VM_UFFD_WP;
}
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
@@ -1371,8 +1340,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
do {
cond_resched();
- BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
- !!(cur->vm_flags & __VM_UFFD_FLAGS));
+ VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & __VM_UFFD_FLAGS));
/* check not compatible vmas */
ret = -EINVAL;
@@ -1426,59 +1395,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
found = true;
} for_each_vma_range(vmi, cur, end);
- BUG_ON(!found);
-
- vma_iter_set(&vmi, start);
- prev = vma_prev(&vmi);
- if (vma->vm_start < start)
- prev = vma;
-
- ret = 0;
- for_each_vma_range(vmi, vma, end) {
- cond_resched();
-
- BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
- BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
- vma->vm_userfaultfd_ctx.ctx != ctx);
- WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
-
- /*
- * Nothing to do: this vma is already registered into this
- * userfaultfd and with the right tracking mode too.
- */
- if (vma->vm_userfaultfd_ctx.ctx == ctx &&
- (vma->vm_flags & vm_flags) == vm_flags)
- goto skip;
+ VM_WARN_ON_ONCE(!found);
- if (vma->vm_start > start)
- start = vma->vm_start;
- vma_end = min(end, vma->vm_end);
-
- new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
- vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
- new_flags,
- (struct vm_userfaultfd_ctx){ctx});
- if (IS_ERR(vma)) {
- ret = PTR_ERR(vma);
- break;
- }
-
- /*
- * In the vma_merge() successful mprotect-like case 8:
- * the next vma was merged into the current one and
- * the current one has not been updated yet.
- */
- vma_start_write(vma);
- userfaultfd_set_vm_flags(vma, new_flags);
- vma->vm_userfaultfd_ctx.ctx = ctx;
-
- if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
- hugetlb_unshare_all_pmds(vma);
-
- skip:
- prev = vma;
- start = vma->vm_end;
- }
+ ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end,
+ wp_async);
out_unlock:
mmap_write_unlock(mm);
@@ -1519,7 +1439,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
struct vm_area_struct *vma, *prev, *cur;
int ret;
struct uffdio_range uffdio_unregister;
- unsigned long new_flags;
bool found;
unsigned long start, end, vma_end;
const void __user *buf = (void __user *)arg;
@@ -1568,8 +1487,16 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
do {
cond_resched();
- BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
- !!(cur->vm_flags & __VM_UFFD_FLAGS));
+ VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & __VM_UFFD_FLAGS));
+
+ /*
+ * Prevent unregistering through a different userfaultfd than
+ * the one used for registration.
+ */
+ if (cur->vm_userfaultfd_ctx.ctx &&
+ cur->vm_userfaultfd_ctx.ctx != ctx)
+ goto out_unlock;
/*
* Check not compatible vmas, not strictly required
@@ -1583,7 +1510,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
found = true;
} for_each_vma_range(vmi, cur, end);
- BUG_ON(!found);
+ VM_WARN_ON_ONCE(!found);
vma_iter_set(&vmi, start);
prev = vma_prev(&vmi);
@@ -1594,16 +1521,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
for_each_vma_range(vmi, vma, end) {
cond_resched();
- BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));
-
- /*
- * Nothing to do: this vma is already registered into this
- * userfaultfd and with the right tracking mode too.
- */
+ /* VMA not registered with userfaultfd. */
if (!vma->vm_userfaultfd_ctx.ctx)
goto skip;
- WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
+ VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx != ctx);
+ VM_WARN_ON_ONCE(!vma_can_userfault(vma, vma->vm_flags, wp_async));
+ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));
if (vma->vm_start > start)
start = vma->vm_start;
@@ -1622,27 +1546,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
}
- /* Reset ptes for the whole vma range if wr-protected */
- if (userfaultfd_wp(vma))
- uffd_wp_range(vma, start, vma_end - start, false);
-
- new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
- new_flags, NULL_VM_UFFD_CTX);
+ vma = userfaultfd_clear_vma(&vmi, prev, vma,
+ start, vma_end);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
break;
}
- /*
- * In the vma_merge() successful mprotect-like case 8:
- * the next vma was merged into the current one and
- * the current one has not been updated yet.
- */
- vma_start_write(vma);
- userfaultfd_set_vm_flags(vma, new_flags);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-
skip:
prev = vma;
start = vma->vm_end;
@@ -1682,7 +1592,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
* len == 0 means wake all and we don't want to wake all here,
* so check it again to be sure.
*/
- VM_BUG_ON(!range.len);
+ VM_WARN_ON_ONCE(!range.len);
wake_userfault(ctx, &range);
ret = 0;
@@ -1703,8 +1613,11 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
user_uffdio_copy = (struct uffdio_copy __user *) arg;
ret = -EAGAIN;
- if (atomic_read(&ctx->mmap_changing))
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
+ return -EFAULT;
goto out;
+ }
ret = -EFAULT;
if (copy_from_user(&uffdio_copy, user_uffdio_copy,
@@ -1736,7 +1649,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
return -EFAULT;
if (ret < 0)
goto out;
- BUG_ON(!ret);
+ VM_WARN_ON_ONCE(!ret);
/* len == 0 would wake all */
range.len = ret;
if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
@@ -1759,8 +1672,11 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
ret = -EAGAIN;
- if (atomic_read(&ctx->mmap_changing))
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
+ return -EFAULT;
goto out;
+ }
ret = -EFAULT;
if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
@@ -1788,7 +1704,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
if (ret < 0)
goto out;
/* len == 0 would wake all */
- BUG_ON(!ret);
+ VM_WARN_ON_ONCE(!ret);
range.len = ret;
if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
range.start = uffdio_zeropage.range.start;
@@ -1862,8 +1778,11 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
user_uffdio_continue = (struct uffdio_continue __user *)arg;
ret = -EAGAIN;
- if (atomic_read(&ctx->mmap_changing))
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
+ return -EFAULT;
goto out;
+ }
ret = -EFAULT;
if (copy_from_user(&uffdio_continue, user_uffdio_continue,
@@ -1897,7 +1816,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
goto out;
/* len == 0 would wake all */
- BUG_ON(!ret);
+ VM_WARN_ON_ONCE(!ret);
range.len = ret;
if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
range.start = uffdio_continue.range.start;
@@ -1919,8 +1838,11 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long
user_uffdio_poison = (struct uffdio_poison __user *)arg;
ret = -EAGAIN;
- if (atomic_read(&ctx->mmap_changing))
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
+ return -EFAULT;
goto out;
+ }
ret = -EFAULT;
if (copy_from_user(&uffdio_poison, user_uffdio_poison,
@@ -1951,7 +1873,7 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long
goto out;
/* len == 0 would wake all */
- BUG_ON(!ret);
+ VM_WARN_ON_ONCE(!ret);
range.len = ret;
if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
range.start = uffdio_poison.range.start;
@@ -1988,8 +1910,12 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
user_uffdio_move = (struct uffdio_move __user *) arg;
- if (atomic_read(&ctx->mmap_changing))
- return -EAGAIN;
+ ret = -EAGAIN;
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_move->move)))
+ return -EFAULT;
+ goto out;
+ }
if (copy_from_user(&uffdio_move, user_uffdio_move,
/* don't copy "move" last field */
@@ -2073,14 +1999,14 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
uffdio_api.features &=
~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
#endif
-#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
- uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
-#endif
-#ifndef CONFIG_PTE_MARKER_UFFD_WP
- uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
- uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
- uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
-#endif
+ if (!pgtable_supports_uffd_wp())
+ uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
+
+ if (!uffd_supports_wp_marker()) {
+ uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
+ uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
+ uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
+ }
ret = -EINVAL;
if (features & ~uffdio_api.features)
@@ -2204,16 +2130,12 @@ static void init_once_userfaultfd_ctx(void *mem)
static int new_userfaultfd(int flags)
{
- struct userfaultfd_ctx *ctx;
- struct file *file;
- int fd;
+ struct userfaultfd_ctx *ctx __free(kfree) = NULL;
- BUG_ON(!current->mm);
+ VM_WARN_ON_ONCE(!current->mm);
/* Check the UFFD_* constants for consistency. */
BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
- BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
- BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
return -EINVAL;
@@ -2230,26 +2152,18 @@ static int new_userfaultfd(int flags)
atomic_set(&ctx->mmap_changing, 0);
ctx->mm = current->mm;
- fd = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
- if (fd < 0)
- goto err_out;
+ FD_PREPARE(fdf, flags & UFFD_SHARED_FCNTL_FLAGS,
+ anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
+ O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS),
+ NULL));
+ if (fdf.err)
+ return fdf.err;
- /* Create a new inode so that the LSM can block the creation. */
- file = anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
- O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
- if (IS_ERR(file)) {
- put_unused_fd(fd);
- fd = PTR_ERR(file);
- goto err_out;
- }
/* prevent the mm struct to be freed */
mmgrab(ctx->mm);
- file->f_mode |= FMODE_NOWAIT;
- fd_install(fd, file);
- return fd;
-err_out:
- kmem_cache_free(userfaultfd_ctx_cachep, ctx);
- return fd;
+ fd_prepare_file(fdf)->f_mode |= FMODE_NOWAIT;
+ retain_and_null_ptr(ctx);
+ return fd_publish(fdf);
}
static inline bool userfaultfd_syscall_allowed(int flags)