summaryrefslogtreecommitdiff
path: root/fs/userfaultfd.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/userfaultfd.c')
-rw-r--r--fs/userfaultfd.c191
1 files changed, 102 insertions, 89 deletions
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index ef4b48d1ea42..cec550c8468f 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -294,10 +294,13 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
* pmd_trans_unstable) of the pmd.
*/
_pmd = READ_ONCE(*pmd);
- if (!pmd_present(_pmd))
+ if (pmd_none(_pmd))
goto out;
ret = false;
+ if (!pmd_present(_pmd))
+ goto out;
+
if (pmd_trans_huge(_pmd))
goto out;
@@ -381,7 +384,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
* in __get_user_pages if userfaultfd_release waits on the
* caller of handle_userfault to release the mmap_sem.
*/
- if (unlikely(ACCESS_ONCE(ctx->released))) {
+ if (unlikely(READ_ONCE(ctx->released))) {
/*
* Don't return VM_FAULT_SIGBUS in this case, so a non
* cooperative manager can close the uffd after the
@@ -477,10 +480,10 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
vmf->flags, reason);
up_read(&mm->mmap_sem);
- if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
+ if (likely(must_wait && !READ_ONCE(ctx->released) &&
(return_to_userland ? !signal_pending(current) :
!fatal_signal_pending(current)))) {
- wake_up_poll(&ctx->fd_wqh, POLLIN);
+ wake_up_poll(&ctx->fd_wqh, EPOLLIN);
schedule();
ret |= VM_FAULT_MAJOR;
@@ -570,11 +573,14 @@ out:
static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
struct userfaultfd_wait_queue *ewq)
{
+ struct userfaultfd_ctx *release_new_ctx;
+
if (WARN_ON_ONCE(current->flags & PF_EXITING))
goto out;
ewq->ctx = ctx;
init_waitqueue_entry(&ewq->wq, current);
+ release_new_ctx = NULL;
spin_lock(&ctx->event_wqh.lock);
/*
@@ -586,8 +592,14 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
set_current_state(TASK_KILLABLE);
if (ewq->msg.event == 0)
break;
- if (ACCESS_ONCE(ctx->released) ||
+ if (READ_ONCE(ctx->released) ||
fatal_signal_pending(current)) {
+ /*
+ * &ewq->wq may be queued in fork_event, but
+ * __remove_wait_queue ignores the head
+ * parameter. It would be a problem if it
+ * didn't.
+ */
__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
if (ewq->msg.event == UFFD_EVENT_FORK) {
struct userfaultfd_ctx *new;
@@ -595,15 +607,14 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
new = (struct userfaultfd_ctx *)
(unsigned long)
ewq->msg.arg.reserved.reserved1;
-
- userfaultfd_ctx_put(new);
+ release_new_ctx = new;
}
break;
}
spin_unlock(&ctx->event_wqh.lock);
- wake_up_poll(&ctx->fd_wqh, POLLIN);
+ wake_up_poll(&ctx->fd_wqh, EPOLLIN);
schedule();
spin_lock(&ctx->event_wqh.lock);
@@ -611,6 +622,20 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
__set_current_state(TASK_RUNNING);
spin_unlock(&ctx->event_wqh.lock);
+ if (release_new_ctx) {
+ struct vm_area_struct *vma;
+ struct mm_struct *mm = release_new_ctx->mm;
+
+ /* the various vma->vm_userfaultfd_ctx still points to it */
+ down_write(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next)
+ if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx)
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ up_write(&mm->mmap_sem);
+
+ userfaultfd_ctx_put(release_new_ctx);
+ }
+
/*
* ctx may go away after this if the userfault pseudo fd is
* already released.
@@ -662,7 +687,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
ctx->features = octx->features;
ctx->released = false;
ctx->mm = vma->vm_mm;
- atomic_inc(&ctx->mm->mm_count);
+ mmgrab(ctx->mm);
userfaultfd_ctx_get(octx);
fctx->orig = octx;
@@ -827,7 +852,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
struct userfaultfd_wake_range range = { .len = 0, };
unsigned long new_flags;
- ACCESS_ONCE(ctx->released) = true;
+ WRITE_ONCE(ctx->released, true);
if (!mmget_not_zero(mm))
goto wakeup;
@@ -879,7 +904,7 @@ wakeup:
/* Flush pending events that may still wait on event_wqh */
wake_up_all(&ctx->event_wqh);
- wake_up_poll(&ctx->fd_wqh, POLLHUP);
+ wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
userfaultfd_ctx_put(ctx);
return 0;
}
@@ -915,23 +940,23 @@ static inline struct userfaultfd_wait_queue *find_userfault_evt(
return find_userfault_in(&ctx->event_wqh);
}
-static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
+static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
{
struct userfaultfd_ctx *ctx = file->private_data;
- unsigned int ret;
+ __poll_t ret;
poll_wait(file, &ctx->fd_wqh, wait);
switch (ctx->state) {
case UFFD_STATE_WAIT_API:
- return POLLERR;
+ return EPOLLERR;
case UFFD_STATE_RUNNING:
/*
* poll() never guarantees that read won't block.
* userfaults can be waken before they're read().
*/
if (unlikely(!(file->f_flags & O_NONBLOCK)))
- return POLLERR;
+ return EPOLLERR;
/*
* lockless access to see if there are pending faults
* __pollwait last action is the add_wait_queue but
@@ -945,14 +970,14 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
ret = 0;
smp_mb();
if (waitqueue_active(&ctx->fault_pending_wqh))
- ret = POLLIN;
+ ret = EPOLLIN;
else if (waitqueue_active(&ctx->event_wqh))
- ret = POLLIN;
+ ret = EPOLLIN;
return ret;
default:
WARN_ON_ONCE(1);
- return POLLERR;
+ return EPOLLERR;
}
}
@@ -963,24 +988,14 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
struct uffd_msg *msg)
{
int fd;
- struct file *file;
- unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS;
- fd = get_unused_fd_flags(flags);
+ fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new,
+ O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS));
if (fd < 0)
return fd;
- file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new,
- O_RDWR | flags);
- if (IS_ERR(file)) {
- put_unused_fd(fd);
- return PTR_ERR(file);
- }
-
- fd_install(fd, file);
msg->arg.reserved.reserved1 = 0;
msg->arg.fork.ufd = fd;
-
return 0;
}
@@ -1061,6 +1076,12 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
(unsigned long)
uwq->msg.arg.reserved.reserved1;
list_move(&uwq->wq.entry, &fork_event);
+ /*
+ * fork_nctx can be freed as soon as
+ * we drop the lock, unless we take a
+ * reference on it.
+ */
+ userfaultfd_ctx_get(fork_nctx);
spin_unlock(&ctx->event_wqh.lock);
ret = 0;
break;
@@ -1091,19 +1112,53 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
if (!ret && msg->event == UFFD_EVENT_FORK) {
ret = resolve_userfault_fork(ctx, fork_nctx, msg);
+ spin_lock(&ctx->event_wqh.lock);
+ if (!list_empty(&fork_event)) {
+ /*
+ * The fork thread didn't abort, so we can
+ * drop the temporary refcount.
+ */
+ userfaultfd_ctx_put(fork_nctx);
- if (!ret) {
- spin_lock(&ctx->event_wqh.lock);
- if (!list_empty(&fork_event)) {
- uwq = list_first_entry(&fork_event,
- typeof(*uwq),
- wq.entry);
- list_del(&uwq->wq.entry);
- __add_wait_queue(&ctx->event_wqh, &uwq->wq);
+ uwq = list_first_entry(&fork_event,
+ typeof(*uwq),
+ wq.entry);
+ /*
+ * If fork_event list wasn't empty and in turn
+ * the event wasn't already released by fork
+ * (the event is allocated on fork kernel
+ * stack), put the event back to its place in
+ * the event_wq. fork_event head will be freed
+ * as soon as we return so the event cannot
+ * stay queued there no matter the current
+ * "ret" value.
+ */
+ list_del(&uwq->wq.entry);
+ __add_wait_queue(&ctx->event_wqh, &uwq->wq);
+
+ /*
+ * Leave the event in the waitqueue and report
+ * error to userland if we failed to resolve
+ * the userfault fork.
+ */
+ if (likely(!ret))
userfaultfd_event_complete(ctx, uwq);
- }
- spin_unlock(&ctx->event_wqh.lock);
+ } else {
+ /*
+ * Here the fork thread aborted and the
+ * refcount from the fork thread on fork_nctx
+ * has already been released. We still hold
+ * the reference we took before releasing the
+ * lock above. If resolve_userfault_fork
+ * failed we've to drop it because the
+ * fork_nctx has to be freed in such case. If
+ * it succeeded we'll hold it because the new
+ * uffd references it.
+ */
+ if (ret)
+ userfaultfd_ctx_put(fork_nctx);
}
+ spin_unlock(&ctx->event_wqh.lock);
}
return ret;
@@ -1822,24 +1877,10 @@ static void init_once_userfaultfd_ctx(void *mem)
seqcount_init(&ctx->refile_seq);
}
-/**
- * userfaultfd_file_create - Creates a userfaultfd file pointer.
- * @flags: Flags for the userfaultfd file.
- *
- * This function creates a userfaultfd file pointer, w/out installing
- * it into the fd table. This is useful when the userfaultfd file is
- * used during the initialization of data structures that require
- * extra setup after the userfaultfd creation. So the userfaultfd
- * creation is split into the file pointer creation phase, and the
- * file descriptor installation phase. In this way races with
- * userspace closing the newly installed file descriptor can be
- * avoided. Returns a userfaultfd file pointer, or a proper error
- * pointer.
- */
-static struct file *userfaultfd_file_create(int flags)
+SYSCALL_DEFINE1(userfaultfd, int, flags)
{
- struct file *file;
struct userfaultfd_ctx *ctx;
+ int fd;
BUG_ON(!current->mm);
@@ -1847,14 +1888,12 @@ static struct file *userfaultfd_file_create(int flags)
BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
- file = ERR_PTR(-EINVAL);
if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
- goto out;
+ return -EINVAL;
- file = ERR_PTR(-ENOMEM);
ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
if (!ctx)
- goto out;
+ return -ENOMEM;
atomic_set(&ctx->refcount, 1);
ctx->flags = flags;
@@ -1865,39 +1904,13 @@ static struct file *userfaultfd_file_create(int flags)
/* prevent the mm struct to be freed */
mmgrab(ctx->mm);
- file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
- O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
- if (IS_ERR(file)) {
+ fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
+ O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
+ if (fd < 0) {
mmdrop(ctx->mm);
kmem_cache_free(userfaultfd_ctx_cachep, ctx);
}
-out:
- return file;
-}
-
-SYSCALL_DEFINE1(userfaultfd, int, flags)
-{
- int fd, error;
- struct file *file;
-
- error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
- if (error < 0)
- return error;
- fd = error;
-
- file = userfaultfd_file_create(flags);
- if (IS_ERR(file)) {
- error = PTR_ERR(file);
- goto err_put_unused_fd;
- }
- fd_install(fd, file);
-
return fd;
-
-err_put_unused_fd:
- put_unused_fd(fd);
-
- return error;
}
static int __init userfaultfd_init(void)