summaryrefslogtreecommitdiff
path: root/fs/io_uring.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--fs/io_uring.c339
1 files changed, 237 insertions, 102 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 155f3d830ddb..32b0064f806e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -541,6 +541,7 @@ enum {
REQ_F_NO_FILE_TABLE_BIT,
REQ_F_QUEUE_TIMEOUT_BIT,
REQ_F_WORK_INITIALIZED_BIT,
+ REQ_F_TASK_PINNED_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -598,10 +599,13 @@ enum {
REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT),
/* io_wq_work is initialized */
REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
+ /* req->task is refcounted */
+ REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT),
};
struct async_poll {
struct io_poll_iocb poll;
+ struct io_poll_iocb *double_poll;
struct io_wq_work work;
};
@@ -887,6 +891,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *ip,
unsigned nr_args);
static int io_grab_files(struct io_kiocb *req);
+static void io_complete_rw_common(struct kiocb *kiocb, long res);
static void io_cleanup_req(struct io_kiocb *req);
static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
int fd, struct file **out_file, bool fixed);
@@ -910,6 +915,21 @@ struct sock *io_uring_get_socket(struct file *file)
}
EXPORT_SYMBOL(io_uring_get_socket);
+static void io_get_req_task(struct io_kiocb *req)
+{
+ if (req->flags & REQ_F_TASK_PINNED)
+ return;
+ get_task_struct(req->task);
+ req->flags |= REQ_F_TASK_PINNED;
+}
+
+/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */
+static void __io_put_req_task(struct io_kiocb *req)
+{
+ if (req->flags & REQ_F_TASK_PINNED)
+ put_task_struct(req->task);
+}
+
static void io_file_put_work(struct work_struct *work);
/*
@@ -1045,8 +1065,6 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
}
spin_unlock(&current->fs->lock);
}
- if (!req->work.task_pid)
- req->work.task_pid = task_pid_vnr(current);
}
static inline void io_req_work_drop_env(struct io_kiocb *req)
@@ -1079,6 +1097,8 @@ static inline void io_prep_async_work(struct io_kiocb *req,
{
const struct io_op_def *def = &io_op_defs[req->opcode];
+ io_req_init_async(req);
+
if (req->flags & REQ_F_ISREG) {
if (def->hash_reg_file)
io_wq_hash_work(&req->work, file_inode(req->file));
@@ -1256,6 +1276,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
if (cqe) {
clear_bit(0, &ctx->sq_check_overflow);
clear_bit(0, &ctx->cq_check_overflow);
+ ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
}
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
@@ -1293,6 +1314,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
if (list_empty(&ctx->cq_overflow_list)) {
set_bit(0, &ctx->sq_check_overflow);
set_bit(0, &ctx->cq_check_overflow);
+ ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
}
req->flags |= REQ_F_OVERFLOW;
refcount_inc(&req->refs);
@@ -1398,9 +1420,7 @@ static void __io_req_aux_free(struct io_kiocb *req)
kfree(req->io);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
- if (req->task)
- put_task_struct(req->task);
-
+ __io_put_req_task(req);
io_req_work_drop_env(req);
}
@@ -1727,6 +1747,26 @@ static int io_put_kbuf(struct io_kiocb *req)
return cflags;
}
+static void io_iopoll_queue(struct list_head *again)
+{
+ struct io_kiocb *req;
+
+ do {
+ req = list_first_entry(again, struct io_kiocb, list);
+ list_del(&req->list);
+
+ /* shouldn't happen unless io_uring is dying, cancel reqs */
+ if (unlikely(!current->mm)) {
+ io_complete_rw_common(&req->rw.kiocb, -EAGAIN);
+ io_put_req(req);
+ continue;
+ }
+
+ refcount_inc(&req->refs);
+ io_queue_async_work(req);
+ } while (!list_empty(again));
+}
+
/*
* Find and free completed poll iocbs
*/
@@ -1735,12 +1775,21 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
{
struct req_batch rb;
struct io_kiocb *req;
+ LIST_HEAD(again);
+
+ /* order with ->result store in io_complete_rw_iopoll() */
+ smp_rmb();
rb.to_free = rb.need_iter = 0;
while (!list_empty(done)) {
int cflags = 0;
req = list_first_entry(done, struct io_kiocb, list);
+ if (READ_ONCE(req->result) == -EAGAIN) {
+ req->iopoll_completed = 0;
+ list_move_tail(&req->list, &again);
+ continue;
+ }
list_del(&req->list);
if (req->flags & REQ_F_BUFFER_SELECTED)
@@ -1758,18 +1807,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (ctx->flags & IORING_SETUP_SQPOLL)
io_cqring_ev_posted(ctx);
io_free_req_many(ctx, &rb);
-}
-
-static void io_iopoll_queue(struct list_head *again)
-{
- struct io_kiocb *req;
- do {
- req = list_first_entry(again, struct io_kiocb, list);
- list_del(&req->list);
- refcount_inc(&req->refs);
- io_queue_async_work(req);
- } while (!list_empty(again));
+ if (!list_empty(&again))
+ io_iopoll_queue(&again);
}
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
@@ -1777,7 +1817,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
{
struct io_kiocb *req, *tmp;
LIST_HEAD(done);
- LIST_HEAD(again);
bool spin;
int ret;
@@ -1803,13 +1842,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (!list_empty(&done))
break;
- if (req->result == -EAGAIN) {
- list_move_tail(&req->list, &again);
- continue;
- }
- if (!list_empty(&again))
- break;
-
ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
if (ret < 0)
break;
@@ -1822,9 +1854,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (!list_empty(&done))
io_iopoll_complete(ctx, nr_events, &done);
- if (!list_empty(&again))
- io_iopoll_queue(&again);
-
return ret;
}
@@ -1973,11 +2002,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
- if (res != req->result)
+ if (res != -EAGAIN && res != req->result)
req_set_fail_links(req);
- req->result = res;
- if (res != -EAGAIN)
- WRITE_ONCE(req->iopoll_completed, 1);
+
+ WRITE_ONCE(req->result, res);
+ /* order with io_poll_complete() checking ->result */
+ smp_wmb();
+ WRITE_ONCE(req->iopoll_completed, 1);
}
/*
@@ -2650,8 +2681,8 @@ copy_iov:
}
}
out_free:
- kfree(iovec);
- req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (!(req->flags & REQ_F_NEED_CLEANUP))
+ kfree(iovec);
return ret;
}
@@ -2773,8 +2804,8 @@ copy_iov:
}
}
out_free:
- req->flags &= ~REQ_F_NEED_CLEANUP;
- kfree(iovec);
+ if (!(req->flags & REQ_F_NEED_CLEANUP))
+ kfree(iovec);
return ret;
}
@@ -3524,6 +3555,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
+ io->msg.msg.msg_name = &io->msg.addr;
io->msg.iov = io->msg.fast_iov;
ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
&io->msg.iov);
@@ -3705,6 +3737,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
{
+ io->msg.msg.msg_name = &io->msg.addr;
io->msg.iov = io->msg.fast_iov;
#ifdef CONFIG_COMPAT
@@ -3813,10 +3846,16 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
kmsg->uaddr, flags);
- if (force_nonblock && ret == -EAGAIN)
- return io_setup_async_msg(req, kmsg);
+ if (force_nonblock && ret == -EAGAIN) {
+ ret = io_setup_async_msg(req, kmsg);
+ if (ret != -EAGAIN)
+ kfree(kbuf);
+ return ret;
+ }
if (ret == -ERESTARTSYS)
ret = -EINTR;
+ if (kbuf)
+ kfree(kbuf);
}
if (kmsg && kmsg->iov != kmsg->fast_iov)
@@ -4045,6 +4084,29 @@ struct io_poll_table {
int error;
};
+static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
+{
+ struct task_struct *tsk = req->task;
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret, notify = TWA_RESUME;
+
+ /*
+ * SQPOLL kernel thread doesn't need notification, just a wakeup.
+ * If we're not using an eventfd, then TWA_RESUME is always fine,
+ * as we won't have dependencies between request completions for
+ * other kernel wait conditions.
+ */
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ notify = 0;
+ else if (ctx->cq_ev_fd)
+ notify = TWA_SIGNAL;
+
+ ret = task_work_add(tsk, cb, notify);
+ if (!ret)
+ wake_up_process(tsk);
+ return ret;
+}
+
static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
__poll_t mask, task_work_func_t func)
{
@@ -4068,13 +4130,13 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
* of executing it. We can't safely execute it anyway, as we may not
* have the needed state needed for it anyway.
*/
- ret = task_work_add(tsk, &req->task_work, true);
+ ret = io_req_task_work_add(req, &req->task_work);
if (unlikely(ret)) {
WRITE_ONCE(poll->canceled, true);
tsk = io_wq_get_task(req->ctx->io_wq);
- task_work_add(tsk, &req->task_work, true);
+ task_work_add(tsk, &req->task_work, 0);
+ wake_up_process(tsk);
}
- wake_up_process(tsk);
return 1;
}
@@ -4098,9 +4160,9 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
return false;
}
-static void io_poll_remove_double(struct io_kiocb *req)
+static void io_poll_remove_double(struct io_kiocb *req, void *data)
{
- struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
+ struct io_poll_iocb *poll = data;
lockdep_assert_held(&req->ctx->completion_lock);
@@ -4120,7 +4182,7 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
{
struct io_ring_ctx *ctx = req->ctx;
- io_poll_remove_double(req);
+ io_poll_remove_double(req, req->io);
req->poll.done = true;
io_cqring_fill_event(req, error ? error : mangle_poll(mask));
io_commit_cqring(ctx);
@@ -4163,21 +4225,21 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
int sync, void *key)
{
struct io_kiocb *req = wait->private;
- struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
+ struct io_poll_iocb *poll = req->apoll->double_poll;
__poll_t mask = key_to_poll(key);
/* for instances that support it check for an event match first: */
if (mask && !(mask & poll->events))
return 0;
- if (req->poll.head) {
+ if (poll && poll->head) {
bool done;
- spin_lock(&req->poll.head->lock);
- done = list_empty(&req->poll.wait.entry);
+ spin_lock(&poll->head->lock);
+ done = list_empty(&poll->wait.entry);
if (!done)
- list_del_init(&req->poll.wait.entry);
- spin_unlock(&req->poll.head->lock);
+ list_del_init(&poll->wait.entry);
+ spin_unlock(&poll->head->lock);
if (!done)
__io_async_wake(req, poll, mask, io_poll_task_func);
}
@@ -4197,7 +4259,8 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
}
static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
- struct wait_queue_head *head)
+ struct wait_queue_head *head,
+ struct io_poll_iocb **poll_ptr)
{
struct io_kiocb *req = pt->req;
@@ -4208,7 +4271,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
*/
if (unlikely(poll->head)) {
/* already have a 2nd entry, fail a third attempt */
- if (req->io) {
+ if (*poll_ptr) {
pt->error = -EINVAL;
return;
}
@@ -4220,7 +4283,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake);
refcount_inc(&req->refs);
poll->wait.private = req;
- req->io = (void *) poll;
+ *poll_ptr = poll;
}
pt->error = 0;
@@ -4232,8 +4295,31 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
struct poll_table_struct *p)
{
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
+ struct async_poll *apoll = pt->req->apoll;
- __io_queue_proc(&pt->req->apoll->poll, pt, head);
+ __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
+}
+
+static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (mm) {
+ kthread_unuse_mm(mm);
+ mmput(mm);
+ }
+}
+
+static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ if (io_op_defs[req->opcode].needs_mm && !current->mm) {
+ if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
+ return -EFAULT;
+ kthread_use_mm(ctx->sqo_mm);
+ }
+
+ return 0;
}
static void io_async_task_func(struct callback_head *cb)
@@ -4261,20 +4347,27 @@ static void io_async_task_func(struct callback_head *cb)
}
}
+ io_poll_remove_double(req, apoll->double_poll);
spin_unlock_irq(&ctx->completion_lock);
/* restore ->work in case we need to retry again */
if (req->flags & REQ_F_WORK_INITIALIZED)
memcpy(&req->work, &apoll->work, sizeof(req->work));
+ kfree(apoll->double_poll);
kfree(apoll);
if (!canceled) {
__set_current_state(TASK_RUNNING);
+ if (io_sq_thread_acquire_mm(ctx, req)) {
+ io_cqring_add_event(req, -EFAULT);
+ goto end_req;
+ }
mutex_lock(&ctx->uring_lock);
__io_queue_sqe(req, NULL);
mutex_unlock(&ctx->uring_lock);
} else {
io_cqring_ev_posted(ctx);
+end_req:
req_set_fail_links(req);
io_double_put_req(req);
}
@@ -4348,7 +4441,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
struct async_poll *apoll;
struct io_poll_table ipt;
__poll_t mask, ret;
- bool had_io;
if (!req->file || !file_can_poll(req->file))
return false;
@@ -4360,14 +4452,13 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
if (unlikely(!apoll))
return false;
+ apoll->double_poll = NULL;
req->flags |= REQ_F_POLLED;
if (req->flags & REQ_F_WORK_INITIALIZED)
memcpy(&apoll->work, &req->work, sizeof(req->work));
- had_io = req->io != NULL;
- get_task_struct(current);
- req->task = current;
+ io_get_req_task(req);
req->apoll = apoll;
INIT_HLIST_NODE(&req->hash_node);
@@ -4383,13 +4474,11 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
io_async_wake);
if (ret) {
- ipt.error = 0;
- /* only remove double add if we did it here */
- if (!had_io)
- io_poll_remove_double(req);
+ io_poll_remove_double(req, apoll->double_poll);
spin_unlock_irq(&ctx->completion_lock);
if (req->flags & REQ_F_WORK_INITIALIZED)
memcpy(&req->work, &apoll->work, sizeof(req->work));
+ kfree(apoll->double_poll);
kfree(apoll);
return false;
}
@@ -4420,11 +4509,13 @@ static bool io_poll_remove_one(struct io_kiocb *req)
bool do_complete;
if (req->opcode == IORING_OP_POLL_ADD) {
- io_poll_remove_double(req);
+ io_poll_remove_double(req, req->io);
do_complete = __io_poll_remove_one(req, &req->poll);
} else {
struct async_poll *apoll = req->apoll;
+ io_poll_remove_double(req, apoll->double_poll);
+
/* non-poll requests have submit ref still */
do_complete = __io_poll_remove_one(req, &apoll->poll);
if (do_complete) {
@@ -4437,6 +4528,7 @@ static bool io_poll_remove_one(struct io_kiocb *req)
if (req->flags & REQ_F_WORK_INITIALIZED)
memcpy(&req->work, &apoll->work,
sizeof(req->work));
+ kfree(apoll->double_poll);
kfree(apoll);
}
}
@@ -4537,7 +4629,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
{
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
- __io_queue_proc(&pt->req->poll, pt, head);
+ __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->io);
}
static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -4555,8 +4647,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
- get_task_struct(current);
- req->task = current;
+ io_get_req_task(req);
return 0;
}
@@ -4646,7 +4737,9 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
{
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
+ if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->buf_index || sqe->len)
return -EINVAL;
req->timeout.addr = READ_ONCE(sqe->addr);
@@ -4772,7 +4865,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
enum io_wq_cancel cancel_ret;
int ret = 0;
- cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
+ cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
switch (cancel_ret) {
case IO_WQ_CANCEL_OK:
ret = 0;
@@ -4824,8 +4917,9 @@ static int io_async_cancel_prep(struct io_kiocb *req,
{
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
- sqe->cancel_flags)
+ if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
return -EINVAL;
req->cancel.addr = READ_ONCE(sqe->addr);
@@ -4843,7 +4937,9 @@ static int io_async_cancel(struct io_kiocb *req)
static int io_files_update_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- if (sqe->flags || sqe->ioprio || sqe->rw_flags)
+ if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->rw_flags)
return -EINVAL;
req->files_update.offset = READ_ONCE(sqe->off);
@@ -5308,9 +5404,6 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
const bool in_async = io_wq_current_is_worker();
- if (req->result == -EAGAIN)
- return -EAGAIN;
-
/* workqueue context doesn't hold uring_lock, grab it now */
if (in_async)
mutex_lock(&ctx->uring_lock);
@@ -5637,6 +5730,7 @@ fail_req:
* Never try inline submit of IOSQE_ASYNC is set, go straight
* to async execution.
*/
+ io_req_init_async(req);
req->work.flags |= IO_WQ_WORK_CONCURRENT;
io_queue_async_work(req);
} else {
@@ -5817,17 +5911,14 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->flags = 0;
/* one is dropped after submission, the other at completion */
refcount_set(&req->refs, 2);
- req->task = NULL;
+ req->task = current;
req->result = 0;
if (unlikely(req->opcode >= IORING_OP_LAST))
return -EINVAL;
- if (io_op_defs[req->opcode].needs_mm && !current->mm) {
- if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
- return -EFAULT;
- kthread_use_mm(ctx->sqo_mm);
- }
+ if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
+ return -EFAULT;
sqe_flags = READ_ONCE(sqe->flags);
/* enforce forwards compatibility on users */
@@ -5936,16 +6027,6 @@ fail_req:
return submitted;
}
-static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
-{
- struct mm_struct *mm = current->mm;
-
- if (mm) {
- kthread_unuse_mm(mm);
- mmput(mm);
- }
-}
-
static int io_sq_thread(void *data)
{
struct io_ring_ctx *ctx = data;
@@ -5979,7 +6060,7 @@ static int io_sq_thread(void *data)
* If submit got -EBUSY, flag us as needing the application
* to enter the kernel to reap and flush events.
*/
- if (!to_submit || ret == -EBUSY) {
+ if (!to_submit || ret == -EBUSY || need_resched()) {
/*
* Drop cur_mm before scheduling, we can't hold it for
* long periods (or over schedule()). Do this before
@@ -5995,7 +6076,7 @@ static int io_sq_thread(void *data)
* more IO, we should wait for the application to
* reap events and wake us up.
*/
- if (!list_empty(&ctx->poll_list) ||
+ if (!list_empty(&ctx->poll_list) || need_resched() ||
(!time_after(jiffies, timeout) && ret != -EBUSY &&
!percpu_ref_is_dying(&ctx->refs))) {
if (current->task_works)
@@ -6021,9 +6102,9 @@ static int io_sq_thread(void *data)
}
/* Tell userspace we may need a wakeup call */
+ spin_lock_irq(&ctx->completion_lock);
ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
- /* make sure to read SQ tail after writing flags */
- smp_mb();
+ spin_unlock_irq(&ctx->completion_lock);
to_submit = io_sqring_entries(ctx);
if (!to_submit || ret == -EBUSY) {
@@ -6041,13 +6122,17 @@ static int io_sq_thread(void *data)
schedule();
finish_wait(&ctx->sqo_wait, &wait);
+ spin_lock_irq(&ctx->completion_lock);
ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+ spin_unlock_irq(&ctx->completion_lock);
ret = 0;
continue;
}
finish_wait(&ctx->sqo_wait, &wait);
+ spin_lock_irq(&ctx->completion_lock);
ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+ spin_unlock_irq(&ctx->completion_lock);
}
mutex_lock(&ctx->uring_lock);
@@ -6146,15 +6231,23 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
do {
prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
TASK_INTERRUPTIBLE);
+ /* make sure we run task_work before checking for signals */
if (current->task_works)
task_work_run();
- if (io_should_wake(&iowq, false))
- break;
- schedule();
if (signal_pending(current)) {
+ if (current->jobctl & JOBCTL_TASK_WORK) {
+ spin_lock_irq(&current->sighand->siglock);
+ current->jobctl &= ~JOBCTL_TASK_WORK;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+ continue;
+ }
ret = -EINTR;
break;
}
+ if (io_should_wake(&iowq, false))
+ break;
+ schedule();
} while (1);
finish_wait(&ctx->wait, &iowq.wq);
@@ -6626,6 +6719,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
for (i = 0; i < nr_tables; i++)
kfree(ctx->file_data->table[i].files);
+ percpu_ref_exit(&ctx->file_data->refs);
kfree(ctx->file_data->table);
kfree(ctx->file_data);
ctx->file_data = NULL;
@@ -6778,8 +6872,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
}
table->files[index] = file;
err = io_sqe_file_register(ctx, file, i);
- if (err)
+ if (err) {
+ fput(file);
break;
+ }
}
nr_args--;
done++;
@@ -7275,9 +7371,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_mem_free(ctx->sq_sqes);
percpu_ref_exit(&ctx->refs);
- if (ctx->account_mem)
- io_unaccount_mem(ctx->user,
- ring_pages(ctx->sq_entries, ctx->cq_entries));
free_uid(ctx->user);
put_cred(ctx->creds);
kfree(ctx->cancel_hash);
@@ -7331,7 +7424,17 @@ static void io_ring_exit_work(struct work_struct *work)
if (ctx->rings)
io_cqring_overflow_flush(ctx, true);
- wait_for_completion(&ctx->ref_comp);
+ /*
+ * If we're doing polled IO and end up having requests being
+ * submitted async (out-of-line), then completions can come in while
+ * we're waiting for refs to drop. We need to reap these manually,
+ * as nobody else will be looking for them.
+ */
+ while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) {
+ io_iopoll_reap_events(ctx);
+ if (ctx->rings)
+ io_cqring_overflow_flush(ctx, true);
+ }
io_ring_ctx_free(ctx);
}
@@ -7352,6 +7455,16 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
if (ctx->rings)
io_cqring_overflow_flush(ctx, true);
idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
+
+ /*
+ * Do this upfront, so we won't have a grace period where the ring
+ * is closed but resources aren't reaped yet. This can cause
+ * spurious failure in setting up a new ring.
+ */
+ if (ctx->account_mem)
+ io_unaccount_mem(ctx->user,
+ ring_pages(ctx->sq_entries, ctx->cq_entries));
+
INIT_WORK(&ctx->exit_work, io_ring_exit_work);
queue_work(system_wq, &ctx->exit_work);
}
@@ -7365,9 +7478,22 @@ static int io_uring_release(struct inode *inode, struct file *file)
return 0;
}
+static bool io_wq_files_match(struct io_wq_work *work, void *data)
+{
+ struct files_struct *files = data;
+
+ return work->files == files;
+}
+
static void io_uring_cancel_files(struct io_ring_ctx *ctx,
struct files_struct *files)
{
+ if (list_empty_careful(&ctx->inflight_list))
+ return;
+
+ /* cancel all at once, should be faster than doing it one by one*/
+ io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
+
while (!list_empty_careful(&ctx->inflight_list)) {
struct io_kiocb *cancel_req = NULL, *req;
DEFINE_WAIT(wait);
@@ -7398,6 +7524,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
if (list_empty(&ctx->cq_overflow_list)) {
clear_bit(0, &ctx->sq_check_overflow);
clear_bit(0, &ctx->cq_check_overflow);
+ ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
}
spin_unlock_irq(&ctx->completion_lock);
@@ -7423,6 +7550,14 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
}
}
+static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct task_struct *task = data;
+
+ return req->task == task;
+}
+
static int io_uring_flush(struct file *file, void *data)
{
struct io_ring_ctx *ctx = file->private_data;
@@ -7433,7 +7568,7 @@ static int io_uring_flush(struct file *file, void *data)
* If the task is going away, cancel work it may have pending
*/
if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
- io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current));
+ io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true);
return 0;
}