summaryrefslogtreecommitdiff
path: root/fs/userfaultfd.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/userfaultfd.c')
-rw-r--r--fs/userfaultfd.c334
1 files changed, 128 insertions, 206 deletions
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 60dcfafdc11a..22f4bf956ba1 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -31,11 +31,12 @@
#include <linux/hugetlb.h>
#include <linux/swapops.h>
#include <linux/miscdevice.h>
+#include <linux/uio.h>
static int sysctl_unprivileged_userfaultfd __read_mostly;
#ifdef CONFIG_SYSCTL
-static struct ctl_table vm_userfaultfd_table[] = {
+static const struct ctl_table vm_userfaultfd_table[] = {
{
.procname = "unprivileged_userfaultfd",
.data = &sysctl_unprivileged_userfaultfd,
@@ -103,21 +104,6 @@ bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
}
-static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
- vm_flags_t flags)
-{
- const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
-
- vm_flags_reset(vma, flags);
- /*
- * For shared mappings, we want to enable writenotify while
- * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
- * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
- */
- if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
- vma_set_page_prot(vma);
-}
-
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
int wake_flags, void *key)
{
@@ -256,7 +242,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
goto out;
ret = false;
- pte = huge_ptep_get(ptep);
+ pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep);
/*
* Lockless access: we're in a wait_event so it's ok if it
@@ -282,7 +268,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
/*
* Verify the pagetables are still not ok after having reigstered into
* the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
- * userfault that has already been resolved, if userfaultfd_read and
+ * userfault that has already been resolved, if userfaultfd_read_iter and
* UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
* threads.
*/
@@ -385,15 +371,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
unsigned int blocking_state;
/*
- * We don't do userfault handling for the final child pid update.
- *
- * We also don't do userfault handling during
- * coredumping. hugetlbfs has the special
- * hugetlb_follow_page_mask() to skip missing pages in the
- * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
- * the no_page_table() helper in follow_page_mask(), but the
- * shmem_vm_ops->fault method is invoked even during
- * coredumping and it ends up here.
+ * We don't do userfault handling for the final child pid update
+ * and when coredumping (faults triggered by get_dump_page()).
*/
if (current->flags & (PF_EXITING|PF_DUMPCORE))
goto out;
@@ -417,32 +396,6 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
goto out;
/*
- * If it's already released don't get it. This avoids to loop
- * in __get_user_pages if userfaultfd_release waits on the
- * caller of handle_userfault to release the mmap_lock.
- */
- if (unlikely(READ_ONCE(ctx->released))) {
- /*
- * Don't return VM_FAULT_SIGBUS in this case, so a non
- * cooperative manager can close the uffd after the
- * last UFFDIO_COPY, without risking to trigger an
- * involuntary SIGBUS if the process was starting the
- * userfaultfd while the userfaultfd was still armed
- * (but after the last UFFDIO_COPY). If the uffd
- * wasn't already closed when the userfault reached
- * this point, that would normally be solved by
- * userfaultfd_must_wait returning 'false'.
- *
- * If we were to return VM_FAULT_SIGBUS here, the non
- * cooperative manager would be instead forced to
- * always call UFFDIO_UNREGISTER before it can safely
- * close the uffd.
- */
- ret = VM_FAULT_NOPAGE;
- goto out;
- }
-
- /*
* Check that we can return VM_FAULT_RETRY.
*
* NOTE: it should become possible to return VM_FAULT_RETRY
@@ -478,6 +431,31 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
goto out;
+ if (unlikely(READ_ONCE(ctx->released))) {
+ /*
+ * If a concurrent release is detected, do not return
+ * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always
+ * return VM_FAULT_RETRY with lock released proactively.
+ *
+ * If we were to return VM_FAULT_SIGBUS here, the non
+ * cooperative manager would be instead forced to
+ * always call UFFDIO_UNREGISTER before it can safely
+ * close the uffd, to avoid involuntary SIGBUS triggered.
+ *
+ * If we were to return VM_FAULT_NOPAGE, it would work for
+ * the fault path, in which the lock will be released
+ * later. However for GUP, faultin_page() does nothing
+ * special on NOPAGE, so GUP would spin retrying without
+ * releasing the mmap read lock, causing possible livelock.
+ *
+ * Here only VM_FAULT_RETRY would make sure the mmap lock
+ * be released immediately, so that the thread concurrently
+ * releasing the userfault would always make progress.
+ */
+ release_fault_lock(vmf);
+ goto out;
+ }
+
/* take the reference before dropping the mmap_lock */
userfaultfd_ctx_get(ctx);
@@ -614,22 +592,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
spin_unlock_irq(&ctx->event_wqh.lock);
if (release_new_ctx) {
- struct vm_area_struct *vma;
- struct mm_struct *mm = release_new_ctx->mm;
- VMA_ITERATOR(vmi, mm, 0);
-
- /* the various vma->vm_userfaultfd_ctx still points to it */
- mmap_write_lock(mm);
- for_each_vma(vmi, vma) {
- if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
- vma_start_write(vma);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- userfaultfd_set_vm_flags(vma,
- vma->vm_flags & ~__VM_UFFD_FLAGS);
- }
- }
- mmap_write_unlock(mm);
-
+ userfaultfd_release_new(release_new_ctx);
userfaultfd_ctx_put(release_new_ctx);
}
@@ -657,10 +620,11 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
struct userfaultfd_fork_ctx *fctx;
octx = vma->vm_userfaultfd_ctx.ctx;
- if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
- vma_start_write(vma);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
+ if (!octx)
+ return 0;
+
+ if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+ userfaultfd_reset_ctx(vma);
return 0;
}
@@ -727,6 +691,34 @@ void dup_userfaultfd_complete(struct list_head *fcs)
}
}
+void dup_userfaultfd_fail(struct list_head *fcs)
+{
+ struct userfaultfd_fork_ctx *fctx, *n;
+
+ /*
+ * An error has occurred on fork, we will tear memory down, but have
+ * allocated memory for fctx's and raised reference counts for both the
+ * original and child contexts (and on the mm for each as a result).
+ *
+ * These would ordinarily be taken care of by a user handling the event,
+ * but we are no longer doing so, so manually clean up here.
+ *
+ * mm tear down will take care of cleaning up VMA contexts.
+ */
+ list_for_each_entry_safe(fctx, n, fcs, list) {
+ struct userfaultfd_ctx *octx = fctx->orig;
+ struct userfaultfd_ctx *ctx = fctx->new;
+
+ atomic_dec(&octx->mmap_changing);
+ VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0);
+ userfaultfd_ctx_put(octx);
+ userfaultfd_ctx_put(ctx);
+
+ list_del(&fctx->list);
+ kfree(fctx);
+ }
+}
+
void mremap_userfaultfd_prep(struct vm_area_struct *vma,
struct vm_userfaultfd_ctx *vm_ctx)
{
@@ -745,9 +737,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
up_write(&ctx->map_changing_lock);
} else {
/* Drop uffd context if remap feature not enabled */
- vma_start_write(vma);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
+ userfaultfd_reset_ctx(vma);
}
}
@@ -866,49 +856,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
{
struct userfaultfd_ctx *ctx = file->private_data;
struct mm_struct *mm = ctx->mm;
- struct vm_area_struct *vma, *prev;
/* len == 0 means wake all */
struct userfaultfd_wake_range range = { .len = 0, };
- unsigned long new_flags;
- VMA_ITERATOR(vmi, mm, 0);
WRITE_ONCE(ctx->released, true);
- if (!mmget_not_zero(mm))
- goto wakeup;
-
- /*
- * Flush page faults out of all CPUs. NOTE: all page faults
- * must be retried without returning VM_FAULT_SIGBUS if
- * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
- * changes while handle_userfault released the mmap_lock. So
- * it's critical that released is set to true (above), before
- * taking the mmap_lock for writing.
- */
- mmap_write_lock(mm);
- prev = NULL;
- for_each_vma(vmi, vma) {
- cond_resched();
- BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
- !!(vma->vm_flags & __VM_UFFD_FLAGS));
- if (vma->vm_userfaultfd_ctx.ctx != ctx) {
- prev = vma;
- continue;
- }
- new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
- vma->vm_end, new_flags,
- NULL_VM_UFFD_CTX);
-
- vma_start_write(vma);
- userfaultfd_set_vm_flags(vma, new_flags);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ userfaultfd_release_all(mm, ctx);
- prev = vma;
- }
- mmap_write_unlock(mm);
- mmput(mm);
-wakeup:
/*
* After no new page faults can wait on this fault_*wqh, flush
* the last page faults that may have been already waiting on
@@ -1177,34 +1131,34 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
return ret;
}
-static ssize_t userfaultfd_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t userfaultfd_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
+ struct file *file = iocb->ki_filp;
struct userfaultfd_ctx *ctx = file->private_data;
ssize_t _ret, ret = 0;
struct uffd_msg msg;
- int no_wait = file->f_flags & O_NONBLOCK;
struct inode *inode = file_inode(file);
+ bool no_wait;
if (!userfaultfd_is_initialized(ctx))
return -EINVAL;
+ no_wait = file->f_flags & O_NONBLOCK || iocb->ki_flags & IOCB_NOWAIT;
for (;;) {
- if (count < sizeof(msg))
+ if (iov_iter_count(to) < sizeof(msg))
return ret ? ret : -EINVAL;
_ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
if (_ret < 0)
return ret ? ret : _ret;
- if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
+ _ret = !copy_to_iter_full(&msg, sizeof(msg), to);
+ if (_ret)
return ret ? ret : -EFAULT;
ret += sizeof(msg);
- buf += sizeof(msg);
- count -= sizeof(msg);
/*
* Allow to read more than one fault at time but only
* block if waiting for the very first one.
*/
- no_wait = O_NONBLOCK;
+ no_wait = true;
}
}
@@ -1285,14 +1239,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
unsigned long arg)
{
struct mm_struct *mm = ctx->mm;
- struct vm_area_struct *vma, *prev, *cur;
+ struct vm_area_struct *vma, *cur;
int ret;
struct uffdio_register uffdio_register;
struct uffdio_register __user *user_uffdio_register;
- unsigned long vm_flags, new_flags;
+ unsigned long vm_flags;
bool found;
bool basic_ioctls;
- unsigned long start, end, vma_end;
+ unsigned long start, end;
struct vma_iterator vmi;
bool wp_async = userfaultfd_wp_async_ctx(ctx);
@@ -1420,57 +1374,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
} for_each_vma_range(vmi, cur, end);
BUG_ON(!found);
- vma_iter_set(&vmi, start);
- prev = vma_prev(&vmi);
- if (vma->vm_start < start)
- prev = vma;
-
- ret = 0;
- for_each_vma_range(vmi, vma, end) {
- cond_resched();
-
- BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
- BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
- vma->vm_userfaultfd_ctx.ctx != ctx);
- WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
-
- /*
- * Nothing to do: this vma is already registered into this
- * userfaultfd and with the right tracking mode too.
- */
- if (vma->vm_userfaultfd_ctx.ctx == ctx &&
- (vma->vm_flags & vm_flags) == vm_flags)
- goto skip;
-
- if (vma->vm_start > start)
- start = vma->vm_start;
- vma_end = min(end, vma->vm_end);
-
- new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
- vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
- new_flags,
- (struct vm_userfaultfd_ctx){ctx});
- if (IS_ERR(vma)) {
- ret = PTR_ERR(vma);
- break;
- }
-
- /*
- * In the vma_merge() successful mprotect-like case 8:
- * the next vma was merged into the current one and
- * the current one has not been updated yet.
- */
- vma_start_write(vma);
- userfaultfd_set_vm_flags(vma, new_flags);
- vma->vm_userfaultfd_ctx.ctx = ctx;
-
- if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
- hugetlb_unshare_all_pmds(vma);
-
- skip:
- prev = vma;
- start = vma->vm_end;
- }
+ ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end,
+ wp_async);
out_unlock:
mmap_write_unlock(mm);
@@ -1511,7 +1416,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
struct vm_area_struct *vma, *prev, *cur;
int ret;
struct uffdio_range uffdio_unregister;
- unsigned long new_flags;
bool found;
unsigned long start, end, vma_end;
const void __user *buf = (void __user *)arg;
@@ -1614,27 +1518,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
}
- /* Reset ptes for the whole vma range if wr-protected */
- if (userfaultfd_wp(vma))
- uffd_wp_range(vma, start, vma_end - start, false);
-
- new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
- new_flags, NULL_VM_UFFD_CTX);
+ vma = userfaultfd_clear_vma(&vmi, prev, vma,
+ start, vma_end);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
break;
}
- /*
- * In the vma_merge() successful mprotect-like case 8:
- * the next vma was merged into the current one and
- * the current one has not been updated yet.
- */
- vma_start_write(vma);
- userfaultfd_set_vm_flags(vma, new_flags);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-
skip:
prev = vma;
start = vma->vm_end;
@@ -1695,8 +1585,11 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
user_uffdio_copy = (struct uffdio_copy __user *) arg;
ret = -EAGAIN;
- if (atomic_read(&ctx->mmap_changing))
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
+ return -EFAULT;
goto out;
+ }
ret = -EFAULT;
if (copy_from_user(&uffdio_copy, user_uffdio_copy,
@@ -1751,8 +1644,11 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
ret = -EAGAIN;
- if (atomic_read(&ctx->mmap_changing))
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
+ return -EFAULT;
goto out;
+ }
ret = -EFAULT;
if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
@@ -1854,8 +1750,11 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
user_uffdio_continue = (struct uffdio_continue __user *)arg;
ret = -EAGAIN;
- if (atomic_read(&ctx->mmap_changing))
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
+ return -EFAULT;
goto out;
+ }
ret = -EFAULT;
if (copy_from_user(&uffdio_continue, user_uffdio_continue,
@@ -1911,8 +1810,11 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long
user_uffdio_poison = (struct uffdio_poison __user *)arg;
ret = -EAGAIN;
- if (atomic_read(&ctx->mmap_changing))
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
+ return -EFAULT;
goto out;
+ }
ret = -EFAULT;
if (copy_from_user(&uffdio_poison, user_uffdio_poison,
@@ -1980,8 +1882,12 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
user_uffdio_move = (struct uffdio_move __user *) arg;
- if (atomic_read(&ctx->mmap_changing))
- return -EAGAIN;
+ ret = -EAGAIN;
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_move->move)))
+ return -EFAULT;
+ goto out;
+ }
if (copy_from_user(&uffdio_move, user_uffdio_move,
/* don't copy "move" last field */
@@ -2049,7 +1955,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
goto out;
features = uffdio_api.features;
ret = -EINVAL;
- if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
+ if (uffdio_api.api != UFFD_API)
goto err_out;
ret = -EPERM;
if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
@@ -2073,6 +1979,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
#endif
+
+ ret = -EINVAL;
+ if (features & ~uffdio_api.features)
+ goto err_out;
+
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
@@ -2172,7 +2083,7 @@ static const struct file_operations userfaultfd_fops = {
#endif
.release = userfaultfd_release,
.poll = userfaultfd_poll,
- .read = userfaultfd_read,
+ .read_iter = userfaultfd_read_iter,
.unlocked_ioctl = userfaultfd_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.llseek = noop_llseek,
@@ -2192,6 +2103,7 @@ static void init_once_userfaultfd_ctx(void *mem)
static int new_userfaultfd(int flags)
{
struct userfaultfd_ctx *ctx;
+ struct file *file;
int fd;
BUG_ON(!current->mm);
@@ -2215,16 +2127,26 @@ static int new_userfaultfd(int flags)
init_rwsem(&ctx->map_changing_lock);
atomic_set(&ctx->mmap_changing, 0);
ctx->mm = current->mm;
- /* prevent the mm struct to be freed */
- mmgrab(ctx->mm);
+
+ fd = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
+ if (fd < 0)
+ goto err_out;
/* Create a new inode so that the LSM can block the creation. */
- fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
+ file = anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
- if (fd < 0) {
- mmdrop(ctx->mm);
- kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+ if (IS_ERR(file)) {
+ put_unused_fd(fd);
+ fd = PTR_ERR(file);
+ goto err_out;
}
+ /* prevent the mm struct to be freed */
+ mmgrab(ctx->mm);
+ file->f_mode |= FMODE_NOWAIT;
+ fd_install(fd, file);
+ return fd;
+err_out:
+ kmem_cache_free(userfaultfd_ctx_cachep, ctx);
return fd;
}