From 59960b9deb5354e4cdb0b6ed3a3b653a2b4eb602 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 16:36:30 +0300 Subject: io_uring: fix lazy work init Don't leave garbage in req.work before punting async on -EAGAIN in io_iopoll_queue(). [ 140.922099] general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI ... [ 140.922105] RIP: 0010:io_worker_handle_work+0x1db/0x480 ... [ 140.922114] Call Trace: [ 140.922118] ? __next_timer_interrupt+0xe0/0xe0 [ 140.922119] io_wqe_worker+0x2a9/0x360 [ 140.922121] ? _raw_spin_unlock_irqrestore+0x24/0x40 [ 140.922124] kthread+0x12c/0x170 [ 140.922125] ? io_worker_handle_work+0x480/0x480 [ 140.922126] ? kthread_park+0x90/0x90 [ 140.922127] ret_from_fork+0x22/0x30 Fixes: 7cdaf587de7c ("io_uring: avoid whole io_wq_work copy for requests completed inline") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 155f3d830ddb..c04b20bf2803 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1087,6 +1087,7 @@ static inline void io_prep_async_work(struct io_kiocb *req, req->work.flags |= IO_WQ_WORK_UNBOUND; } + io_req_init_async(req); io_req_work_grab_env(req, def); *link = io_prep_linked_timeout(req); -- cgit From 4f26bda1522c35d2701fc219368c7101c17005c1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 10:24:03 +0300 Subject: io-wq: add an option to cancel all matched reqs This adds support for cancelling all io-wq works matching a predicate. It isn't used yet, so no change in observable behaviour. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index c04b20bf2803..94bd88556c1e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4773,7 +4773,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr) enum io_wq_cancel cancel_ret; int ret = 0; - cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr); + cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; -- cgit From 44e728b8aae0bb6d4229129083974f9dea43f50b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 10:24:04 +0300 Subject: io_uring: cancel all task's requests on exit If a process is going away, io_uring_flush() will cancel only 1 request with a matching pid. Cancel all of them Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 94bd88556c1e..ad5128a40c14 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7424,6 +7424,13 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, } } +static bool io_cancel_pid_cb(struct io_wq_work *work, void *data) +{ + pid_t pid = (pid_t) (unsigned long) data; + + return work->task_pid == pid; +} + static int io_uring_flush(struct file *file, void *data) { struct io_ring_ctx *ctx = file->private_data; @@ -7433,8 +7440,11 @@ static int io_uring_flush(struct file *file, void *data) /* * If the task is going away, cancel work it may have pending */ - if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) - io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current)); + if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) { + void *data = (void *) (unsigned long)task_pid_vnr(current); + + io_wq_cancel_cb(ctx->io_wq, io_cancel_pid_cb, data, true); + } return 0; } -- cgit From 67c4d9e693e3bb7fb968af24e3584f821a78ba56 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 10:24:05 +0300 Subject: io_uring: batch cancel in io_uring_cancel_files() Instead of waiting for each request one by one, first try to cancel all of them in a batched manner, and then go over inflight_list/etc to reap leftovers. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index ad5128a40c14..b6bcd5a7f4bc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7366,9 +7366,22 @@ static int io_uring_release(struct inode *inode, struct file *file) return 0; } +static bool io_wq_files_match(struct io_wq_work *work, void *data) +{ + struct files_struct *files = data; + + return work->files == files; +} + static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { + if (list_empty_careful(&ctx->inflight_list)) + return; + + /* cancel all at once, should be faster than doing it one by one*/ + io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true); + while (!list_empty_careful(&ctx->inflight_list)) { struct io_kiocb *cancel_req = NULL, *req; DEFINE_WAIT(wait); -- cgit From 4dd2824d6d5914949b5fe589538bc2622d84c5dd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 10:33:13 +0300 Subject: io_uring: lazy get task There will be multiple places where req->task is used, so refcount-pin it lazily with introduced *io_{get,put}_req_task(). We need to always have valid ->task for cancellation reasons, but don't care about pinning it in some cases. That's why it sets req->task in io_req_init() and implements get/put laziness with a flag. This also removes using @current from polling io_arm_poll_handler(), etc., but doesn't change observable behaviour. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index b6bcd5a7f4bc..5f946eb8b740 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -541,6 +541,7 @@ enum { REQ_F_NO_FILE_TABLE_BIT, REQ_F_QUEUE_TIMEOUT_BIT, REQ_F_WORK_INITIALIZED_BIT, + REQ_F_TASK_PINNED_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -598,6 +599,8 @@ enum { REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT), /* io_wq_work is initialized */ REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), + /* req->task is refcounted */ + REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT), }; struct async_poll { @@ -910,6 +913,21 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); +static void io_get_req_task(struct io_kiocb *req) +{ + if (req->flags & REQ_F_TASK_PINNED) + return; + get_task_struct(req->task); + req->flags |= REQ_F_TASK_PINNED; +} + +/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */ +static void __io_put_req_task(struct io_kiocb *req) +{ + if (req->flags & REQ_F_TASK_PINNED) + put_task_struct(req->task); +} + static void io_file_put_work(struct work_struct *work); /* @@ -1399,9 +1417,7 @@ static void __io_req_aux_free(struct io_kiocb *req) kfree(req->io); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - if (req->task) - put_task_struct(req->task); - + __io_put_req_task(req); io_req_work_drop_env(req); } @@ -4367,8 +4383,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) memcpy(&apoll->work, &req->work, sizeof(req->work)); had_io = req->io != NULL; - get_task_struct(current); - req->task = current; + io_get_req_task(req); req->apoll = apoll; INIT_HLIST_NODE(&req->hash_node); @@ -4556,8 +4571,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; - get_task_struct(current); - req->task = current; + io_get_req_task(req); return 0; } @@ -5818,7 +5832,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->flags = 0; /* one is dropped after submission, the other at completion */ refcount_set(&req->refs, 2); - req->task = NULL; + req->task = current; req->result = 0; if (unlikely(req->opcode >= IORING_OP_LAST)) -- cgit From 801dd57bd1d8c2c253f43635a3045bfa32a810b3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 10:33:14 +0300 Subject: io_uring: cancel by ->task not pid For an exiting process it tries to cancel all its inflight requests. Use req->task to match such instead of work.pid. We always have req->task set, and it will be valid because we're matching only current exiting task. Also, remove work.pid and everything related, it's useless now. Reported-by: Eric W. Biederman Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5f946eb8b740..e17df662c191 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1063,8 +1063,6 @@ static inline void io_req_work_grab_env(struct io_kiocb *req, } spin_unlock(¤t->fs->lock); } - if (!req->work.task_pid) - req->work.task_pid = task_pid_vnr(current); } static inline void io_req_work_drop_env(struct io_kiocb *req) @@ -7451,11 +7449,12 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, } } -static bool io_cancel_pid_cb(struct io_wq_work *work, void *data) +static bool io_cancel_task_cb(struct io_wq_work *work, void *data) { - pid_t pid = (pid_t) (unsigned long) data; + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct task_struct *task = data; - return work->task_pid == pid; + return req->task == task; } static int io_uring_flush(struct file *file, void *data) @@ -7467,11 +7466,8 @@ static int io_uring_flush(struct file *file, void *data) /* * If the task is going away, cancel work it may have pending */ - if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) { - void *data = (void *) (unsigned long)task_pid_vnr(current); - - io_wq_cancel_cb(ctx->io_wq, io_cancel_pid_cb, data, true); - } + if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) + io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true); return 0; } -- cgit From 2d7d67920e5c8e0854df23ca77da2dd5880ce5dd Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Tue, 16 Jun 2020 02:06:37 +0800 Subject: io_uring: don't fail links for EAGAIN error in IOPOLL mode In IOPOLL mode, for EAGAIN error, we'll try to submit io request again using io-wq, so don't fail rest of links if this io request has links. Cc: stable@vger.kernel.org Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index e17df662c191..eb3797714539 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1988,7 +1988,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if (res != req->result) + if (res != -EAGAIN && res != req->result) req_set_fail_links(req); req->result = res; if (res != -EAGAIN) -- cgit From bbde017a32b32d2fa8d5fddca25fade20132abf8 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Tue, 16 Jun 2020 02:06:38 +0800 Subject: io_uring: add memory barrier to synchronize io_kiocb's result and iopoll_completed In io_complete_rw_iopoll(), stores to io_kiocb's result and iopoll completed are two independent store operations, to ensure that once iopoll_completed is ture and then req->result must been perceived by the cpu executing io_do_iopoll(), proper memory barrier should be used. And in io_do_iopoll(), we check whether req->result is EAGAIN, if it is, we'll need to issue this io request using io-wq again. In order to just issue a single smp_rmb() on the completion side, move the re-submit work to io_iopoll_complete(). Cc: stable@vger.kernel.org Signed-off-by: Xiaoguang Wang [axboe: don't set ->iopoll_completed for -EAGAIN retry] Signed-off-by: Jens Axboe --- fs/io_uring.c | 53 +++++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 24 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index eb3797714539..9d2ae9aa8b45 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1742,6 +1742,18 @@ static int io_put_kbuf(struct io_kiocb *req) return cflags; } +static void io_iopoll_queue(struct list_head *again) +{ + struct io_kiocb *req; + + do { + req = list_first_entry(again, struct io_kiocb, list); + list_del(&req->list); + refcount_inc(&req->refs); + io_queue_async_work(req); + } while (!list_empty(again)); +} + /* * Find and free completed poll iocbs */ @@ -1750,12 +1762,21 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, { struct req_batch rb; struct io_kiocb *req; + LIST_HEAD(again); + + /* order with ->result store in io_complete_rw_iopoll() */ + smp_rmb(); rb.to_free = rb.need_iter = 0; while (!list_empty(done)) { int cflags = 0; req = list_first_entry(done, struct io_kiocb, list); + if (READ_ONCE(req->result) == -EAGAIN) { + req->iopoll_completed = 0; + list_move_tail(&req->list, &again); + continue; + } list_del(&req->list); if (req->flags & REQ_F_BUFFER_SELECTED) @@ -1773,18 +1794,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, if (ctx->flags & IORING_SETUP_SQPOLL) io_cqring_ev_posted(ctx); io_free_req_many(ctx, &rb); -} - -static void io_iopoll_queue(struct list_head *again) -{ - struct io_kiocb *req; - do { - req = list_first_entry(again, struct io_kiocb, list); - list_del(&req->list); - refcount_inc(&req->refs); - io_queue_async_work(req); - } while (!list_empty(again)); + if (!list_empty(&again)) + io_iopoll_queue(&again); } static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, @@ -1792,7 +1804,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, { struct io_kiocb *req, *tmp; LIST_HEAD(done); - LIST_HEAD(again); bool spin; int ret; @@ -1818,13 +1829,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (!list_empty(&done)) break; - if (req->result == -EAGAIN) { - list_move_tail(&req->list, &again); - continue; - } - if (!list_empty(&again)) - break; - ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); if (ret < 0) break; @@ -1837,9 +1841,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (!list_empty(&done)) io_iopoll_complete(ctx, nr_events, &done); - if (!list_empty(&again)) - io_iopoll_queue(&again); - return ret; } @@ -1990,9 +1991,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) if (res != -EAGAIN && res != req->result) req_set_fail_links(req); - req->result = res; - if (res != -EAGAIN) + + WRITE_ONCE(req->result, res); + /* order with io_poll_complete() checking ->result */ + if (res != -EAGAIN) { + smp_wmb(); WRITE_ONCE(req->iopoll_completed, 1); + } } /* -- cgit From 9d8426a09195e2dcf2aa249de2aaadd792d491c7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Jun 2020 18:42:49 -0600 Subject: io_uring: acquire 'mm' for task_work for SQPOLL If we're unlucky with timing, we could be running task_work after having dropped the memory context in the sq thread. Since dropping the context requires a runnable task state, we cannot reliably drop it as part of our check-for-work loop in io_sq_thread(). Instead, abstract out the mm acquire for the sq thread into a helper, and call it from the async task work handler. Cc: stable@vger.kernel.org # v5.7 Signed-off-by: Jens Axboe --- fs/io_uring.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9d2ae9aa8b45..98c83fbf4f88 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4256,6 +4256,28 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, __io_queue_proc(&pt->req->apoll->poll, pt, head); } +static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) +{ + struct mm_struct *mm = current->mm; + + if (mm) { + kthread_unuse_mm(mm); + mmput(mm); + } +} + +static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if (io_op_defs[req->opcode].needs_mm && !current->mm) { + if (unlikely(!mmget_not_zero(ctx->sqo_mm))) + return -EFAULT; + kthread_use_mm(ctx->sqo_mm); + } + + return 0; +} + static void io_async_task_func(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); @@ -4290,11 +4312,16 @@ static void io_async_task_func(struct callback_head *cb) if (!canceled) { __set_current_state(TASK_RUNNING); + if (io_sq_thread_acquire_mm(ctx, req)) { + io_cqring_add_event(req, -EFAULT); + goto end_req; + } mutex_lock(&ctx->uring_lock); __io_queue_sqe(req, NULL); mutex_unlock(&ctx->uring_lock); } else { io_cqring_ev_posted(ctx); +end_req: req_set_fail_links(req); io_double_put_req(req); } @@ -5841,11 +5868,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(req->opcode >= IORING_OP_LAST)) return -EINVAL; - if (io_op_defs[req->opcode].needs_mm && !current->mm) { - if (unlikely(!mmget_not_zero(ctx->sqo_mm))) - return -EFAULT; - kthread_use_mm(ctx->sqo_mm); - } + if (unlikely(io_sq_thread_acquire_mm(ctx, req))) + return -EFAULT; sqe_flags = READ_ONCE(sqe->flags); /* enforce forwards compatibility on users */ @@ -5954,16 +5978,6 @@ fail_req: return submitted; } -static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) -{ - struct mm_struct *mm = current->mm; - - if (mm) { - kthread_unuse_mm(mm); - mmput(mm); - } -} - static int io_sq_thread(void *data) { struct io_ring_ctx *ctx = data; -- cgit From 56952e91acc93ed624fe9da840900defb75f1323 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 17 Jun 2020 15:00:04 -0600 Subject: io_uring: reap poll completions while waiting for refs to drop on exit If we're doing polled IO and end up having requests being submitted async, then completions can come in while we're waiting for refs to drop. We need to reap these manually, as nobody else will be looking for them. Break the wait into 1/20th of a second time waits, and check for done poll completions if we time out. Otherwise we can have done poll completions sitting in ctx->poll_list, which needs us to reap them but we're just waiting for them. Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 98c83fbf4f88..2038d52c5450 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7363,7 +7363,17 @@ static void io_ring_exit_work(struct work_struct *work) if (ctx->rings) io_cqring_overflow_flush(ctx, true); - wait_for_completion(&ctx->ref_comp); + /* + * If we're doing polled IO and end up having requests being + * submitted async (out-of-line), then completions can come in while + * we're waiting for refs to drop. We need to reap these manually, + * as nobody else will be looking for them. + */ + while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) { + io_iopoll_reap_events(ctx); + if (ctx->rings) + io_cqring_overflow_flush(ctx, true); + } io_ring_ctx_free(ctx); } -- cgit From 6f2cc1664db20676069cff27a461ccc97dbfd114 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Thu, 18 Jun 2020 15:01:56 +0800 Subject: io_uring: fix possible race condition against REQ_F_NEED_CLEANUP In io_read() or io_write(), when io request is submitted successfully, it'll go through the below sequence: kfree(iovec); req->flags &= ~REQ_F_NEED_CLEANUP; return ret; But clearing REQ_F_NEED_CLEANUP might be unsafe. The io request may already have been completed, and then io_complete_rw_iopoll() and io_complete_rw() will be called, both of which will also modify req->flags if needed. This causes a race condition, with concurrent non-atomic modification of req->flags. To eliminate this race, in io_read() or io_write(), if io request is submitted successfully, we don't remove REQ_F_NEED_CLEANUP flag. If REQ_F_NEED_CLEANUP is set, we'll leave __io_req_aux_free() to the iovec cleanup work correspondingly. Cc: stable@vger.kernel.org Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2038d52c5450..a78201b96179 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2670,8 +2670,8 @@ copy_iov: } } out_free: - kfree(iovec); - req->flags &= ~REQ_F_NEED_CLEANUP; + if (!(req->flags & REQ_F_NEED_CLEANUP)) + kfree(iovec); return ret; } @@ -2793,8 +2793,8 @@ copy_iov: } } out_free: - req->flags &= ~REQ_F_NEED_CLEANUP; - kfree(iovec); + if (!(req->flags & REQ_F_NEED_CLEANUP)) + kfree(iovec); return ret; } -- cgit From 5769a351b89cd4d82016f18fa5f6c4077403564d Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Wed, 17 Jun 2020 17:53:55 +0800 Subject: io_uring: change the poll type to be 32-bits poll events should be 32-bits to cover EPOLLEXCLUSIVE. Explicit word-swap the poll32_events for big endian to make sure the ABI is not changed. We call this feature IORING_FEAT_POLL_32BITS, applications who want to use EPOLLEXCLUSIVE should check the feature bit first. Signed-off-by: Jiufei Xue Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index a78201b96179..0eb063daa9b5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4589,7 +4589,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_iocb *poll = &req->poll; - u16 events; + u32 events; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4598,7 +4598,10 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (!poll->file) return -EBADF; - events = READ_ONCE(sqe->poll_events); + events = READ_ONCE(sqe->poll32_events); +#ifdef __BIG_ENDIAN + events = swahw32(events); +#endif poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; io_get_req_task(req); @@ -7928,7 +7931,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | - IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL; + IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | + IORING_FEAT_POLL_32BITS; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -8217,7 +8221,8 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags); BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags); - BUILD_BUG_SQE_ELEM(28, __u16, poll_events); + BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events); + BUILD_BUG_SQE_ELEM(28, __u32, poll32_events); BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags); BUILD_BUG_SQE_ELEM(28, __u32, msg_flags); BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags); -- cgit From a31eb4a2f1650fa578082ad9e9845487ecd90abe Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Wed, 17 Jun 2020 17:53:56 +0800 Subject: io_uring: use EPOLLEXCLUSIVE flag to aoid thundering herd type behavior Applications can pass this flag in to avoid accept thundering herd. Signed-off-by: Jiufei Xue Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0eb063daa9b5..311e8038ae58 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4245,7 +4245,11 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->error = 0; poll->head = head; - add_wait_queue(head, &poll->wait); + + if (poll->events & EPOLLEXCLUSIVE) + add_wait_queue_exclusive(head, &poll->wait); + else + add_wait_queue(head, &poll->wait); } static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, @@ -4602,7 +4606,8 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe #ifdef __BIG_ENDIAN events = swahw32(events); #endif - poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; + poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP | + (events & EPOLLEXCLUSIVE); io_get_req_task(req); return 0; -- cgit From a087e2b519929152fdde8299457e32d5a8994a7c Mon Sep 17 00:00:00 2001 From: Bijan Mottahedeh Date: Tue, 16 Jun 2020 16:36:07 -0700 Subject: io_uring: add wrappers for memory accounting Facilitate separation of locked memory usage reporting vs. limiting for upcoming patches. No functional changes. Signed-off-by: Bijan Mottahedeh [axboe: kill unnecessary () around return in io_account_mem()] Signed-off-by: Jens Axboe --- fs/io_uring.c | 48 ++++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 20 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 311e8038ae58..9db9f09499d1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6968,12 +6968,14 @@ err: return ret; } -static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) +static inline void __io_unaccount_mem(struct user_struct *user, + unsigned long nr_pages) { atomic_long_sub(nr_pages, &user->locked_vm); } -static int io_account_mem(struct user_struct *user, unsigned long nr_pages) +static inline int __io_account_mem(struct user_struct *user, + unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -6991,6 +6993,20 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages) return 0; } +static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +{ + if (ctx->account_mem) + __io_unaccount_mem(ctx->user, nr_pages); +} + +static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +{ + if (ctx->account_mem) + return __io_account_mem(ctx->user, nr_pages); + + return 0; +} + static void io_mem_free(void *ptr) { struct page *page; @@ -7065,8 +7081,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) for (j = 0; j < imu->nr_bvecs; j++) unpin_user_page(imu->bvec[j].bv_page); - if (ctx->account_mem) - io_unaccount_mem(ctx->user, imu->nr_bvecs); + io_unaccount_mem(ctx, imu->nr_bvecs); kvfree(imu->bvec); imu->nr_bvecs = 0; } @@ -7149,11 +7164,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, start = ubuf >> PAGE_SHIFT; nr_pages = end - start; - if (ctx->account_mem) { - ret = io_account_mem(ctx->user, nr_pages); - if (ret) - goto err; - } + ret = io_account_mem(ctx, nr_pages); + if (ret) + goto err; ret = 0; if (!pages || nr_pages > got_pages) { @@ -7166,8 +7179,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, GFP_KERNEL); if (!pages || !vmas) { ret = -ENOMEM; - if (ctx->account_mem) - io_unaccount_mem(ctx->user, nr_pages); + io_unaccount_mem(ctx, nr_pages); goto err; } got_pages = nr_pages; @@ -7177,8 +7189,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, GFP_KERNEL); ret = -ENOMEM; if (!imu->bvec) { - if (ctx->account_mem) - io_unaccount_mem(ctx->user, nr_pages); + io_unaccount_mem(ctx, nr_pages); goto err; } @@ -7209,8 +7220,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, */ if (pret > 0) unpin_user_pages(pages, pret); - if (ctx->account_mem) - io_unaccount_mem(ctx->user, nr_pages); + io_unaccount_mem(ctx, nr_pages); kvfree(imu->bvec); goto err; } @@ -7315,9 +7325,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_mem_free(ctx->sq_sqes); percpu_ref_exit(&ctx->refs); - if (ctx->account_mem) - io_unaccount_mem(ctx->user, - ring_pages(ctx->sq_entries, ctx->cq_entries)); + io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries)); free_uid(ctx->user); put_cred(ctx->creds); kfree(ctx->cancel_hash); @@ -7887,7 +7895,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, account_mem = !capable(CAP_IPC_LOCK); if (account_mem) { - ret = io_account_mem(user, + ret = __io_account_mem(user, ring_pages(p->sq_entries, p->cq_entries)); if (ret) { free_uid(user); @@ -7898,7 +7906,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ctx = io_ring_ctx_alloc(p); if (!ctx) { if (account_mem) - io_unaccount_mem(user, ring_pages(p->sq_entries, + __io_unaccount_mem(user, ring_pages(p->sq_entries, p->cq_entries)); free_uid(user); return -ENOMEM; -- cgit From aad5d8da1b301fe399d65f2dcb84df2ec60caaa3 Mon Sep 17 00:00:00 2001 From: Bijan Mottahedeh Date: Tue, 16 Jun 2020 16:36:08 -0700 Subject: io_uring: rename ctx->account_mem field Rename account_mem to limit_name to clarify its purpose. Signed-off-by: Bijan Mottahedeh Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9db9f09499d1..fcaf9eee3420 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -226,7 +226,7 @@ struct io_ring_ctx { struct { unsigned int flags; unsigned int compat: 1; - unsigned int account_mem: 1; + unsigned int limit_mem: 1; unsigned int cq_overflow_flushed: 1; unsigned int drain_next: 1; unsigned int eventfd_async: 1; @@ -6995,13 +6995,13 @@ static inline int __io_account_mem(struct user_struct *user, static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { - if (ctx->account_mem) + if (ctx->limit_mem) __io_unaccount_mem(ctx->user, nr_pages); } static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { - if (ctx->account_mem) + if (ctx->limit_mem) return __io_account_mem(ctx->user, nr_pages); return 0; @@ -7853,7 +7853,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, { struct user_struct *user = NULL; struct io_ring_ctx *ctx; - bool account_mem; + bool limit_mem; int ret; if (!entries) @@ -7892,9 +7892,9 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, } user = get_uid(current_user()); - account_mem = !capable(CAP_IPC_LOCK); + limit_mem = !capable(CAP_IPC_LOCK); - if (account_mem) { + if (limit_mem) { ret = __io_account_mem(user, ring_pages(p->sq_entries, p->cq_entries)); if (ret) { @@ -7905,14 +7905,14 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ctx = io_ring_ctx_alloc(p); if (!ctx) { - if (account_mem) + if (limit_mem) __io_unaccount_mem(user, ring_pages(p->sq_entries, p->cq_entries)); free_uid(user); return -ENOMEM; } ctx->compat = in_compat_syscall(); - ctx->account_mem = account_mem; + ctx->limit_mem = limit_mem; ctx->user = user; ctx->creds = get_current_cred(); -- cgit From 309758254ea62e07471abcaeca5b5c2173f4ebc2 Mon Sep 17 00:00:00 2001 From: Bijan Mottahedeh Date: Tue, 16 Jun 2020 16:36:09 -0700 Subject: io_uring: report pinned memory usage Report pinned memory usage always, regardless of whether locked memory limit is enforced. Signed-off-by: Bijan Mottahedeh Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index fcaf9eee3420..5ea55de3edef 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6997,12 +6997,23 @@ static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { if (ctx->limit_mem) __io_unaccount_mem(ctx->user, nr_pages); + + if (ctx->sqo_mm) + atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm); } static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { - if (ctx->limit_mem) - return __io_account_mem(ctx->user, nr_pages); + int ret; + + if (ctx->limit_mem) { + ret = __io_account_mem(ctx->user, nr_pages); + if (ret) + return ret; + } + + if (ctx->sqo_mm) + atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm); return 0; } @@ -7304,8 +7315,10 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) static void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_finish_async(ctx); - if (ctx->sqo_mm) + if (ctx->sqo_mm) { mmdrop(ctx->sqo_mm); + ctx->sqo_mm = NULL; + } io_iopoll_reap_events(ctx); io_sqe_buffer_unregister(ctx); @@ -7912,7 +7925,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, return -ENOMEM; } ctx->compat = in_compat_syscall(); - ctx->limit_mem = limit_mem; ctx->user = user; ctx->creds = get_current_cred(); @@ -7960,6 +7972,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); + io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries)); + ctx->limit_mem = limit_mem; return ret; err: io_ring_ctx_wait_and_kill(ctx); -- cgit From 2e0464d48f32a9e78e2aa85cbbedc77ecbb6ed60 Mon Sep 17 00:00:00 2001 From: Bijan Mottahedeh Date: Tue, 16 Jun 2020 16:36:10 -0700 Subject: io_uring: separate reporting of ring pages from registered pages Ring pages are not pinned so it is more appropriate to report them as locked. Signed-off-by: Bijan Mottahedeh Signed-off-by: Jens Axboe --- fs/io_uring.c | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5ea55de3edef..10b293780703 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -880,6 +880,11 @@ static const struct io_op_def io_op_defs[] = { }, }; +enum io_mem_account { + ACCT_LOCKED, + ACCT_PINNED, +}; + static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); @@ -6993,16 +6998,22 @@ static inline int __io_account_mem(struct user_struct *user, return 0; } -static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, + enum io_mem_account acct) { if (ctx->limit_mem) __io_unaccount_mem(ctx->user, nr_pages); - if (ctx->sqo_mm) - atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm); + if (ctx->sqo_mm) { + if (acct == ACCT_LOCKED) + ctx->sqo_mm->locked_vm -= nr_pages; + else if (acct == ACCT_PINNED) + atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm); + } } -static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, + enum io_mem_account acct) { int ret; @@ -7012,8 +7023,12 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return ret; } - if (ctx->sqo_mm) - atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm); + if (ctx->sqo_mm) { + if (acct == ACCT_LOCKED) + ctx->sqo_mm->locked_vm += nr_pages; + else if (acct == ACCT_PINNED) + atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm); + } return 0; } @@ -7092,7 +7107,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) for (j = 0; j < imu->nr_bvecs; j++) unpin_user_page(imu->bvec[j].bv_page); - io_unaccount_mem(ctx, imu->nr_bvecs); + io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED); kvfree(imu->bvec); imu->nr_bvecs = 0; } @@ -7175,7 +7190,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, start = ubuf >> PAGE_SHIFT; nr_pages = end - start; - ret = io_account_mem(ctx, nr_pages); + ret = io_account_mem(ctx, nr_pages, ACCT_PINNED); if (ret) goto err; @@ -7190,7 +7205,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, GFP_KERNEL); if (!pages || !vmas) { ret = -ENOMEM; - io_unaccount_mem(ctx, nr_pages); + io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); goto err; } got_pages = nr_pages; @@ -7200,7 +7215,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, GFP_KERNEL); ret = -ENOMEM; if (!imu->bvec) { - io_unaccount_mem(ctx, nr_pages); + io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); goto err; } @@ -7231,7 +7246,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, */ if (pret > 0) unpin_user_pages(pages, pret); - io_unaccount_mem(ctx, nr_pages); + io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); kvfree(imu->bvec); goto err; } @@ -7338,7 +7353,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_mem_free(ctx->sq_sqes); percpu_ref_exit(&ctx->refs); - io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries)); + io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries), + ACCT_LOCKED); free_uid(ctx->user); put_cred(ctx->creds); kfree(ctx->cancel_hash); @@ -7972,7 +7988,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); - io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries)); + io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries), + ACCT_LOCKED); ctx->limit_mem = limit_mem; return ret; err: -- cgit From ac8691c415e0ce0b8734cb6d9df2df18608eebed Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Jun 2020 08:30:41 -0600 Subject: io_uring: always plug for any number of IOs Currently we only plug if we're doing more than two request. We're going to be relying on always having the plug there to pass down information, so plug unconditionally. Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 10b293780703..de894455f6bd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -676,7 +676,6 @@ struct io_kiocb { }; }; -#define IO_PLUG_THRESHOLD 2 #define IO_IOPOLL_BATCH 8 struct io_submit_state { @@ -5914,7 +5913,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, struct file *ring_file, int ring_fd) { - struct io_submit_state state, *statep = NULL; + struct io_submit_state state; struct io_kiocb *link = NULL; int i, submitted = 0; @@ -5931,10 +5930,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; - if (nr > IO_PLUG_THRESHOLD) { - io_submit_state_start(&state, nr); - statep = &state; - } + io_submit_state_start(&state, nr); ctx->ring_fd = ring_fd; ctx->ring_file = ring_file; @@ -5949,14 +5945,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, io_consume_sqe(ctx); break; } - req = io_alloc_req(ctx, statep); + req = io_alloc_req(ctx, &state); if (unlikely(!req)) { if (!submitted) submitted = -EAGAIN; break; } - err = io_init_req(ctx, req, sqe, statep); + err = io_init_req(ctx, req, sqe, &state); io_consume_sqe(ctx); /* will complete beyond this point, count as submitted */ submitted++; @@ -5982,8 +5978,7 @@ fail_req: } if (link) io_queue_link_head(link); - if (statep) - io_submit_state_end(&state); + io_submit_state_end(&state); /* Commit SQ ring head once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); -- cgit From 4503b7676a2e0abe69c2f2c0d8b03aec53f2f048 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Jun 2020 10:00:27 -0600 Subject: io_uring: catch -EIO from buffered issue request failure -EIO bubbles up like -EAGAIN if we fail to allocate a request at the lower level. Play it safe and treat it like -EAGAIN in terms of sync retry, to avoid passing back an errant -EIO. Catch some of these early for block based file, as non-mq devices generally do not support NOWAIT. That saves us some overhead by not first trying, then retrying from async context. We can go straight to async punt instead. Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index de894455f6bd..c5ee6d1a92d3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2088,6 +2088,15 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) return state->file; } +static bool io_bdev_nowait(struct block_device *bdev) +{ +#ifdef CONFIG_BLOCK + return !bdev || queue_is_mq(bdev_get_queue(bdev)); +#else + return true; +#endif +} + /* * If we tracked the file through the SCM inflight mechanism, we could support * any file. For now, just ensure that anything potentially problematic is done @@ -2097,10 +2106,19 @@ static bool io_file_supports_async(struct file *file, int rw) { umode_t mode = file_inode(file)->i_mode; - if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode)) - return true; - if (S_ISREG(mode) && file->f_op != &io_uring_fops) + if (S_ISBLK(mode)) { + if (io_bdev_nowait(file->f_inode->i_bdev)) + return true; + return false; + } + if (S_ISCHR(mode) || S_ISSOCK(mode)) return true; + if (S_ISREG(mode)) { + if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) && + file->f_op != &io_uring_fops) + return true; + return false; + } /* any ->read/write should understand O_NONBLOCK */ if (file->f_flags & O_NONBLOCK) @@ -2650,7 +2668,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) iov_count = iov_iter_count(&iter); ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); if (!ret) { - ssize_t ret2; + ssize_t ret2 = 0; if (req->file->f_op->read_iter) ret2 = call_read_iter(req->file, kiocb, &iter); @@ -2658,7 +2676,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) ret2 = loop_rw_iter(READ, req->file, kiocb, &iter); /* Catch -EAGAIN return for forced non-blocking submission */ - if (!force_nonblock || ret2 != -EAGAIN) { + if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { kiocb_done(kiocb, ret2); } else { copy_iov: -- cgit From b63534c41e20b474483b4ddf47efc858c17352e0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 4 Jun 2020 11:28:00 -0600 Subject: io_uring: re-issue block requests that failed because of resources Mark the plug with nowait == true, which will cause requests to avoid blocking on request allocation. If they do, we catch them and reissue them from a task_work based handler. Normally we can catch -EAGAIN directly, but the hard case is for split requests. As an example, the application issues a 512KB request. The block core will split this into 128KB if that's the max size for the device. The first request issues just fine, but we run into -EAGAIN for some latter splits for the same request. As the bio is split, we don't get to see the -EAGAIN until one of the actual reads complete, and hence we cannot handle it inline as part of submission. This does potentially cause re-reads of parts of the range, as the whole request is reissued. There's currently no better way to handle this. Signed-off-by: Jens Axboe --- fs/io_uring.c | 148 ++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 124 insertions(+), 24 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index c5ee6d1a92d3..f3dbf83fabf3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -900,6 +900,13 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe); +static ssize_t io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct iov_iter *iter, + bool needs_lock); +static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, + struct iovec *iovec, struct iovec *fast_iov, + struct iov_iter *iter); + static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; @@ -1978,12 +1985,115 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res) __io_cqring_add_event(req, res, cflags); } +static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) +{ + struct mm_struct *mm = current->mm; + + if (mm) { + kthread_unuse_mm(mm); + mmput(mm); + } +} + +static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if (io_op_defs[req->opcode].needs_mm && !current->mm) { + if (unlikely(!mmget_not_zero(ctx->sqo_mm))) + return -EFAULT; + kthread_use_mm(ctx->sqo_mm); + } + + return 0; +} + +#ifdef CONFIG_BLOCK +static bool io_resubmit_prep(struct io_kiocb *req, int error) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + ssize_t ret = -ECANCELED; + struct iov_iter iter; + int rw; + + if (error) { + ret = error; + goto end_req; + } + + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: + rw = READ; + break; + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: + rw = WRITE; + break; + default: + printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n", + req->opcode); + goto end_req; + } + + ret = io_import_iovec(rw, req, &iovec, &iter, false); + if (ret < 0) + goto end_req; + ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter); + if (!ret) + return true; + kfree(iovec); +end_req: + io_cqring_add_event(req, ret); + req_set_fail_links(req); + io_put_req(req); + return false; +} + +static void io_rw_resubmit(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_ring_ctx *ctx = req->ctx; + int err; + + __set_current_state(TASK_RUNNING); + + err = io_sq_thread_acquire_mm(ctx, req); + + if (io_resubmit_prep(req, err)) { + refcount_inc(&req->refs); + io_queue_async_work(req); + } +} +#endif + +static bool io_rw_reissue(struct io_kiocb *req, long res) +{ +#ifdef CONFIG_BLOCK + struct task_struct *tsk; + int ret; + + if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) + return false; + + tsk = req->task; + init_task_work(&req->task_work, io_rw_resubmit); + ret = task_work_add(tsk, &req->task_work, true); + if (!ret) + return true; +#endif + return false; +} + static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - io_complete_rw_common(kiocb, res); - io_put_req(req); + if (!io_rw_reissue(req, res)) { + io_complete_rw_common(kiocb, res); + io_put_req(req); + } } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) @@ -2169,6 +2279,9 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (kiocb->ki_flags & IOCB_NOWAIT) req->flags |= REQ_F_NOWAIT; + if (kiocb->ki_flags & IOCB_DIRECT) + io_get_req_task(req); + if (force_nonblock) kiocb->ki_flags |= IOCB_NOWAIT; @@ -2668,6 +2781,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) iov_count = iov_iter_count(&iter); ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); if (!ret) { + unsigned long nr_segs = iter.nr_segs; ssize_t ret2 = 0; if (req->file->f_op->read_iter) @@ -2679,6 +2793,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { kiocb_done(kiocb, ret2); } else { + iter.count = iov_count; + iter.nr_segs = nr_segs; copy_iov: ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); @@ -2765,6 +2881,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock) iov_count = iov_iter_count(&iter); ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); if (!ret) { + unsigned long nr_segs = iter.nr_segs; ssize_t ret2; /* @@ -2802,6 +2919,8 @@ static int io_write(struct io_kiocb *req, bool force_nonblock) if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2); } else { + iter.count = iov_count; + iter.nr_segs = nr_segs; copy_iov: ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); @@ -4282,28 +4401,6 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, __io_queue_proc(&pt->req->apoll->poll, pt, head); } -static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) -{ - struct mm_struct *mm = current->mm; - - if (mm) { - kthread_unuse_mm(mm); - mmput(mm); - } -} - -static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - if (io_op_defs[req->opcode].needs_mm && !current->mm) { - if (unlikely(!mmget_not_zero(ctx->sqo_mm))) - return -EFAULT; - kthread_use_mm(ctx->sqo_mm); - } - - return 0; -} - static void io_async_task_func(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); @@ -5814,6 +5911,9 @@ static void io_submit_state_start(struct io_submit_state *state, unsigned int max_ios) { blk_start_plug(&state->plug); +#ifdef CONFIG_BLOCK + state->plug.nowait = true; +#endif state->free_reqs = 0; state->file = NULL; state->ios_left = max_ios; -- cgit From bcf5a06304d69a3bb194a494d87b532d5e90b01c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 09:24:42 -0600 Subject: io_uring: support true async buffered reads, if file provides it If the file is flagged with FMODE_BUF_RASYNC, then we don't have to punt the buffered read to an io-wq worker. Instead we can rely on page unlocking callbacks to support retry based async IO. This is a lot more efficient than doing async thread offload. The retry is done similarly to how we handle poll based retry. From the unlock callback, we simply queue the retry to a task_work based handler. Signed-off-by: Jens Axboe --- fs/io_uring.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 135 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index f3dbf83fabf3..5d1685e206c1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -78,6 +78,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -503,6 +504,8 @@ struct io_async_rw { struct iovec *iov; ssize_t nr_segs; ssize_t size; + struct wait_page_queue wpq; + struct callback_head task_work; }; struct io_async_ctx { @@ -2750,6 +2753,126 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } +static void __io_async_buf_error(struct io_kiocb *req, int error) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->completion_lock); + io_cqring_fill_event(req, error); + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + req_set_fail_links(req); + io_double_put_req(req); +} + +static void io_async_buf_cancel(struct callback_head *cb) +{ + struct io_async_rw *rw; + struct io_kiocb *req; + + rw = container_of(cb, struct io_async_rw, task_work); + req = rw->wpq.wait.private; + __io_async_buf_error(req, -ECANCELED); +} + +static void io_async_buf_retry(struct callback_head *cb) +{ + struct io_async_rw *rw; + struct io_ring_ctx *ctx; + struct io_kiocb *req; + + rw = container_of(cb, struct io_async_rw, task_work); + req = rw->wpq.wait.private; + ctx = req->ctx; + + __set_current_state(TASK_RUNNING); + if (!io_sq_thread_acquire_mm(ctx, req)) { + mutex_lock(&ctx->uring_lock); + __io_queue_sqe(req, NULL); + mutex_unlock(&ctx->uring_lock); + } else { + __io_async_buf_error(req, -EFAULT); + } +} + +static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, + int sync, void *arg) +{ + struct wait_page_queue *wpq; + struct io_kiocb *req = wait->private; + struct io_async_rw *rw = &req->io->rw; + struct wait_page_key *key = arg; + struct task_struct *tsk; + int ret; + + wpq = container_of(wait, struct wait_page_queue, wait); + + ret = wake_page_match(wpq, key); + if (ret != 1) + return ret; + + list_del_init(&wait->entry); + + init_task_work(&rw->task_work, io_async_buf_retry); + /* submit ref gets dropped, acquire a new one */ + refcount_inc(&req->refs); + tsk = req->task; + ret = task_work_add(tsk, &rw->task_work, true); + if (unlikely(ret)) { + /* queue just for cancelation */ + init_task_work(&rw->task_work, io_async_buf_cancel); + tsk = io_wq_get_task(req->ctx->io_wq); + task_work_add(tsk, &rw->task_work, true); + } + wake_up_process(tsk); + return 1; +} + +static bool io_rw_should_retry(struct io_kiocb *req) +{ + struct kiocb *kiocb = &req->rw.kiocb; + int ret; + + /* never retry for NOWAIT, we just complete with -EAGAIN */ + if (req->flags & REQ_F_NOWAIT) + return false; + + /* already tried, or we're doing O_DIRECT */ + if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ)) + return false; + /* + * just use poll if we can, and don't attempt if the fs doesn't + * support callback based unlocks + */ + if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) + return false; + + /* + * If request type doesn't require req->io to defer in general, + * we need to allocate it here + */ + if (!req->io && __io_alloc_async_ctx(req)) + return false; + + ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq, + io_async_buf_func, req); + if (!ret) { + io_get_req_task(req); + return true; + } + + return false; +} + +static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) +{ + if (req->file->f_op->read_iter) + return call_read_iter(req->file, &req->rw.kiocb, iter); + return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter); +} + static int io_read(struct io_kiocb *req, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; @@ -2784,10 +2907,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) unsigned long nr_segs = iter.nr_segs; ssize_t ret2 = 0; - if (req->file->f_op->read_iter) - ret2 = call_read_iter(req->file, kiocb, &iter); - else - ret2 = loop_rw_iter(READ, req->file, kiocb, &iter); + ret2 = io_iter_do_read(req, &iter); /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { @@ -2804,6 +2924,17 @@ copy_iov: if (!(req->flags & REQ_F_NOWAIT) && !file_can_poll(req->file)) req->flags |= REQ_F_MUST_PUNT; + /* if we can retry, do so with the callbacks armed */ + if (io_rw_should_retry(req)) { + ret2 = io_iter_do_read(req, &iter); + if (ret2 == -EIOCBQUEUED) { + goto out_free; + } else if (ret2 != -EAGAIN) { + kiocb_done(kiocb, ret2); + goto out_free; + } + } + kiocb->ki_flags &= ~IOCB_WAITQ; return -EAGAIN; } } -- cgit From 62ef73165091476d31f31e33d9d0d48b088c129d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 21 Jun 2020 13:09:50 +0300 Subject: io_uring: remove setting REQ_F_MUST_PUNT in rw io_{read,write}() { ... copy_iov: // prep async if (!(flags & REQ_F_NOWAIT) && !file_can_poll(file)) flags |= REQ_F_MUST_PUNT; } REQ_F_MUST_PUNT there is pointless, because if it happens then REQ_F_NOWAIT is known to be _not_ set, and the request will go async path in __io_queue_sqe() anyway. file_can_poll() check is also repeated in arm_poll*(), so don't need it. Remove the mentioned assignment REQ_F_MUST_PUNT in preparation for killing the flag. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5d1685e206c1..13f72d2a3fec 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2920,10 +2920,6 @@ copy_iov: inline_vecs, &iter); if (ret) goto out_free; - /* any defer here is final, must blocking retry */ - if (!(req->flags & REQ_F_NOWAIT) && - !file_can_poll(req->file)) - req->flags |= REQ_F_MUST_PUNT; /* if we can retry, do so with the callbacks armed */ if (io_rw_should_retry(req)) { ret2 = io_iter_do_read(req, &iter); @@ -3057,10 +3053,6 @@ copy_iov: inline_vecs, &iter); if (ret) goto out_free; - /* any defer here is final, must blocking retry */ - if (!(req->flags & REQ_F_NOWAIT) && - !file_can_poll(req->file)) - req->flags |= REQ_F_MUST_PUNT; return -EAGAIN; } } -- cgit From 24c74678634b3cbdb325b3b7706366c83811b311 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 21 Jun 2020 13:09:51 +0300 Subject: io_uring: remove REQ_F_MUST_PUNT REQ_F_MUST_PUNT may seem looking good and clear, but it's the same as not having REQ_F_NOWAIT set. That rather creates more confusion. Moreover, it doesn't even affect any behaviour (e.g. see the patch removing it from io_{read,write}). Kill theg flag and update already outdated comments. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 13f72d2a3fec..93af915a98e6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -534,7 +534,6 @@ enum { REQ_F_LINK_TIMEOUT_BIT, REQ_F_TIMEOUT_BIT, REQ_F_ISREG_BIT, - REQ_F_MUST_PUNT_BIT, REQ_F_TIMEOUT_NOSEQ_BIT, REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, @@ -582,8 +581,6 @@ enum { REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), - /* must be punted even for NONBLOCK */ - REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT), /* no timeout sequence */ REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT), /* completion under lock */ @@ -2894,10 +2891,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) if (req->flags & REQ_F_LINK_HEAD) req->result = io_size; - /* - * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so - * we know to async punt it even if it was opened O_NONBLOCK - */ + /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, READ)) goto copy_iov; @@ -2993,10 +2987,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock) if (req->flags & REQ_F_LINK_HEAD) req->result = io_size; - /* - * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so - * we know to async punt it even if it was opened O_NONBLOCK - */ + /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, WRITE)) goto copy_iov; @@ -3717,8 +3708,10 @@ static int io_close(struct io_kiocb *req, bool force_nonblock) /* if the file has a flush method, be safe and punt to async */ if (close->put_file->f_op->flush && force_nonblock) { + /* was never set, but play safe */ + req->flags &= ~REQ_F_NOWAIT; /* avoid grabbing files - we don't need the files */ - req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT; + req->flags |= REQ_F_NO_FILE_TABLE; return -EAGAIN; } @@ -4645,7 +4638,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) if (!req->file || !file_can_poll(req->file)) return false; - if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED)) + if (req->flags & REQ_F_POLLED) return false; if (!def->pollin && !def->pollout) return false; @@ -5852,8 +5845,7 @@ again: * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ - if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || - (req->flags & REQ_F_MUST_PUNT))) { + if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { if (io_arm_poll_handler(req)) { if (linked_timeout) io_queue_linked_timeout(linked_timeout); -- cgit From b90cd197f9315f968d5ee4e6ee9f4e3067f2c883 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 21 Jun 2020 13:09:52 +0300 Subject: io_uring: set @poll->file after @poll init It's a good practice to modify fields of a struct after but not before it was initialised. Even though io_init_poll_iocb() doesn't touch poll->file, call it first. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 93af915a98e6..cc1f2f3b7bfa 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4596,8 +4596,8 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, struct io_ring_ctx *ctx = req->ctx; bool cancel = false; - poll->file = req->file; io_init_poll_iocb(poll, mask, wake_func); + poll->file = req->file; poll->wait.private = req; ipt->pt._key = mask; -- cgit From f6b6c7d6a9600bdbf5826f57137630e1670e2d87 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 21 Jun 2020 13:09:53 +0300 Subject: io_uring: kill NULL checks for submit state After recent changes, io_submit_sqes() always passes valid submit state, so kill leftovers checking it for NULL. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index cc1f2f3b7bfa..c686061c3762 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1376,11 +1376,7 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct io_kiocb *req; - if (!state) { - req = kmem_cache_alloc(req_cachep, gfp); - if (unlikely(!req)) - goto fallback; - } else if (!state->free_reqs) { + if (!state->free_reqs) { size_t sz; int ret; -- cgit From b772f07add1c0b22e02c0f1e96f647560679d3a9 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Tue, 23 Jun 2020 19:34:06 +0800 Subject: io_uring: fix io_sq_thread no schedule when busy When the user consumes and generates sqe at a fast rate, io_sqring_entries can always get sqe, and ret will not be equal to -EBUSY, so that io_sq_thread will never call cond_resched or schedule, and then we will get the following system error prompt: rcu: INFO: rcu_sched self-detected stall on CPU or watchdog: BUG: soft lockup-CPU#23 stuck for 112s! [io_uring-sq:1863] This patch checks whether need to call cond_resched() by checking the need_resched() function every cycle. Suggested-by: Jens Axboe Signed-off-by: Xuan Zhuo Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index a78201b96179..9de9db70b928 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6011,7 +6011,7 @@ static int io_sq_thread(void *data) * If submit got -EBUSY, flag us as needing the application * to enter the kernel to reap and flush events. */ - if (!to_submit || ret == -EBUSY) { + if (!to_submit || ret == -EBUSY || need_resched()) { /* * Drop cur_mm before scheduling, we can't hold it for * long periods (or over schedule()). Do this before @@ -6027,7 +6027,7 @@ static int io_sq_thread(void *data) * more IO, we should wait for the application to * reap events and wake us up. */ - if (!list_empty(&ctx->poll_list) || + if (!list_empty(&ctx->poll_list) || need_resched() || (!time_after(jiffies, timeout) && ret != -EBUSY && !percpu_ref_is_dying(&ctx->refs))) { if (current->task_works) -- cgit From cd664b0e35cb1202f40c259a1a5ea791d18c879d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Jun 2020 12:37:10 +0300 Subject: io_uring: fix hanging iopoll in case of -EAGAIN io_do_iopoll() won't do anything with a request unless req->iopoll_completed is set. So io_complete_rw_iopoll() has to set it, otherwise io_do_iopoll() will poll a file again and again even though the request of interest was completed long time ago. Also, remove -EAGAIN check from io_issue_sqe() as it races with the changed lines. The request will take the long way and be resubmitted from io_iopoll*(). io_kiocb's result and iopoll_completed") Fixes: bbde017a32b3 ("io_uring: add memory barrier to synchronize Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9de9db70b928..c3e5c1346cfe 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1994,10 +1994,8 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) WRITE_ONCE(req->result, res); /* order with io_poll_complete() checking ->result */ - if (res != -EAGAIN) { - smp_wmb(); - WRITE_ONCE(req->iopoll_completed, 1); - } + smp_wmb(); + WRITE_ONCE(req->iopoll_completed, 1); } /* @@ -5353,9 +5351,6 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) { const bool in_async = io_wq_current_is_worker(); - if (req->result == -EAGAIN) - return -EAGAIN; - /* workqueue context doesn't hold uring_lock, grab it now */ if (in_async) mutex_lock(&ctx->uring_lock); -- cgit From d60b5fbc1ce8210759b568da49d149b868e7c6d3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Jun 2020 12:37:11 +0300 Subject: io_uring: fix current->mm NULL dereference on exit Don't reissue requests from io_iopoll_reap_events(), the task may not have mm, which ends up with NULL. It's better to kill everything off on exit anyway. [ 677.734670] RIP: 0010:io_iopoll_complete+0x27e/0x630 ... [ 677.734679] Call Trace: [ 677.734695] ? __send_signal+0x1f2/0x420 [ 677.734698] ? _raw_spin_unlock_irqrestore+0x24/0x40 [ 677.734699] ? send_signal+0xf5/0x140 [ 677.734700] io_iopoll_getevents+0x12f/0x1a0 [ 677.734702] io_iopoll_reap_events.part.0+0x5e/0xa0 [ 677.734703] io_ring_ctx_wait_and_kill+0x132/0x1c0 [ 677.734704] io_uring_release+0x20/0x30 [ 677.734706] __fput+0xcd/0x230 [ 677.734707] ____fput+0xe/0x10 [ 677.734709] task_work_run+0x67/0xa0 [ 677.734710] do_exit+0x35d/0xb70 [ 677.734712] do_group_exit+0x43/0xa0 [ 677.734713] get_signal+0x140/0x900 [ 677.734715] do_signal+0x37/0x780 [ 677.734717] ? enqueue_hrtimer+0x41/0xb0 [ 677.734718] ? recalibrate_cpu_khz+0x10/0x10 [ 677.734720] ? ktime_get+0x3e/0xa0 [ 677.734721] ? lapic_next_deadline+0x26/0x30 [ 677.734723] ? tick_program_event+0x4d/0x90 [ 677.734724] ? __hrtimer_get_next_event+0x4d/0x80 [ 677.734726] __prepare_exit_to_usermode+0x126/0x1c0 [ 677.734741] prepare_exit_to_usermode+0x9/0x40 [ 677.734742] idtentry_exit_cond_rcu+0x4c/0x60 [ 677.734743] sysvec_reschedule_ipi+0x92/0x160 [ 677.734744] ? asm_sysvec_reschedule_ipi+0xa/0x20 [ 677.734745] asm_sysvec_reschedule_ipi+0x12/0x20 Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index c3e5c1346cfe..e507737f044e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -890,6 +890,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); static int io_grab_files(struct io_kiocb *req); +static void io_complete_rw_common(struct kiocb *kiocb, long res); static void io_cleanup_req(struct io_kiocb *req); static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, int fd, struct file **out_file, bool fixed); @@ -1749,6 +1750,14 @@ static void io_iopoll_queue(struct list_head *again) do { req = list_first_entry(again, struct io_kiocb, list); list_del(&req->list); + + /* shouldn't happen unless io_uring is dying, cancel reqs */ + if (unlikely(!current->mm)) { + io_complete_rw_common(&req->rw.kiocb, -EAGAIN); + io_put_req(req); + continue; + } + refcount_inc(&req->refs); io_queue_async_work(req); } while (!list_empty(again)); -- cgit From d3cac64c498c4fb2df46b97ee6f4c7d6d75f5e3d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Jun 2020 12:38:13 +0300 Subject: io_uring: fix NULL-mm for linked reqs __io_queue_sqe() tries to handle all request of a link, so it's not enough to grab mm in io_sq_thread_acquire_mm() based just on the head. Don't check req->needs_mm and do it always. Signed-off-by: Pavel Begunkov --- fs/io_uring.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index c686061c3762..72739188b2ff 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1991,10 +1991,9 @@ static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) } } -static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, - struct io_kiocb *req) +static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) { - if (io_op_defs[req->opcode].needs_mm && !current->mm) { + if (!current->mm) { if (unlikely(!mmget_not_zero(ctx->sqo_mm))) return -EFAULT; kthread_use_mm(ctx->sqo_mm); @@ -2003,6 +2002,14 @@ static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, return 0; } +static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if (!io_op_defs[req->opcode].needs_mm) + return 0; + return __io_sq_thread_acquire_mm(ctx); +} + #ifdef CONFIG_BLOCK static bool io_resubmit_prep(struct io_kiocb *req, int error) { @@ -2781,7 +2788,7 @@ static void io_async_buf_retry(struct callback_head *cb) ctx = req->ctx; __set_current_state(TASK_RUNNING); - if (!io_sq_thread_acquire_mm(ctx, req)) { + if (!__io_sq_thread_acquire_mm(ctx)) { mutex_lock(&ctx->uring_lock); __io_queue_sqe(req, NULL); mutex_unlock(&ctx->uring_lock); -- cgit From e1e16097e265daac918ce355bf1a0d1677adf0c7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Jun 2020 09:17:17 -0600 Subject: io_uring: provide generic io_req_complete() helper We have lots of callers of: io_cqring_add_event(req, result); io_put_req(req); Provide a helper that does this for us. It helps clean up the code, and also provides a more convenient location for us to change the completion handling. Signed-off-by: Jens Axboe --- fs/io_uring.c | 106 ++++++++++++++++++++++++---------------------------------- 1 file changed, 43 insertions(+), 63 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 72739188b2ff..17d7bafaf8cf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1335,7 +1335,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) __io_cqring_fill_event(req, res, 0); } -static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags) +static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags) { struct io_ring_ctx *ctx = req->ctx; unsigned long flags; @@ -1348,9 +1348,15 @@ static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags) io_cqring_ev_posted(ctx); } -static void io_cqring_add_event(struct io_kiocb *req, long res) +static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags) { - __io_cqring_add_event(req, res, 0); + io_cqring_add_event(req, res, cflags); + io_put_req(req); +} + +static void io_req_complete(struct io_kiocb *req, long res) +{ + __io_req_complete(req, res, 0); } static inline bool io_is_fallback_req(struct io_kiocb *req) @@ -1978,7 +1984,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res) req_set_fail_links(req); if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_kbuf(req); - __io_cqring_add_event(req, res, cflags); + io_cqring_add_event(req, res, cflags); } static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) @@ -2048,9 +2054,8 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error) return true; kfree(iovec); end_req: - io_cqring_add_event(req, ret); req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return false; } @@ -3117,10 +3122,9 @@ static int io_tee(struct io_kiocb *req, bool force_nonblock) io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); req->flags &= ~REQ_F_NEED_CLEANUP; - io_cqring_add_event(req, ret); if (ret != sp->len) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3154,10 +3158,9 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock) io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); req->flags &= ~REQ_F_NEED_CLEANUP; - io_cqring_add_event(req, ret); if (ret != sp->len) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3171,8 +3174,7 @@ static int io_nop(struct io_kiocb *req) if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - io_cqring_add_event(req, 0); - io_put_req(req); + io_req_complete(req, 0); return 0; } @@ -3211,8 +3213,7 @@ static int io_fsync(struct io_kiocb *req, bool force_nonblock) req->sync.flags & IORING_FSYNC_DATASYNC); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3245,8 +3246,7 @@ static int io_fallocate(struct io_kiocb *req, bool force_nonblock) current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3342,8 +3342,7 @@ err: req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3416,8 +3415,7 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock) io_ring_submit_lock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3504,8 +3502,7 @@ out: io_ring_submit_unlock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3548,8 +3545,7 @@ static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; #else return -EOPNOTSUPP; @@ -3585,8 +3581,7 @@ static int io_madvise(struct io_kiocb *req, bool force_nonblock) ret = do_madvise(ma->addr, ma->len, ma->advice); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; #else return -EOPNOTSUPP; @@ -3625,8 +3620,7 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock) ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3665,8 +3659,7 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3722,10 +3715,9 @@ static int io_close(struct io_kiocb *req, bool force_nonblock) ret = filp_close(close->put_file, req->work.files); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); fput(close->put_file); close->put_file = NULL; - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3759,8 +3751,7 @@ static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) req->sync.flags); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3859,10 +3850,9 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; - io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3902,10 +3892,9 @@ static int io_send(struct io_kiocb *req, bool force_nonblock) ret = -EINTR; } - io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -4102,10 +4091,9 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; - __io_cqring_add_event(req, ret, cflags); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + __io_req_complete(req, ret, cflags); return 0; } @@ -4159,10 +4147,9 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock) kfree(kbuf); req->flags &= ~REQ_F_NEED_CLEANUP; - __io_cqring_add_event(req, ret, cflags); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + __io_req_complete(req, ret, cflags); return 0; } @@ -4201,8 +4188,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock) ret = -EINTR; req_set_fail_links(req); } - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -4262,8 +4248,7 @@ static int io_connect(struct io_kiocb *req, bool force_nonblock) out: if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } #else /* !CONFIG_NET */ @@ -4555,7 +4540,7 @@ static void io_async_task_func(struct callback_head *cb) if (!canceled) { __set_current_state(TASK_RUNNING); if (io_sq_thread_acquire_mm(ctx, req)) { - io_cqring_add_event(req, -EFAULT); + io_cqring_add_event(req, -EFAULT, 0); goto end_req; } mutex_lock(&ctx->uring_lock); @@ -4804,10 +4789,9 @@ static int io_poll_remove(struct io_kiocb *req) ret = io_poll_cancel(ctx, addr); spin_unlock_irq(&ctx->completion_lock); - io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -5163,8 +5147,7 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -5657,8 +5640,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) if (ret) { req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); } io_steal_work(req, workptr); @@ -5775,8 +5757,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); io_put_req(prev); } else { - io_cqring_add_event(req, -ETIME); - io_put_req(req); + io_req_complete(req, -ETIME); } return HRTIMER_NORESTART; } @@ -5885,9 +5866,8 @@ err: /* and drop final reference, if we failed */ if (ret) { - io_cqring_add_event(req, ret); req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); } if (nxt) { req = nxt; @@ -5909,9 +5889,9 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ret) { if (ret != -EIOCBQUEUED) { fail_req: - io_cqring_add_event(req, ret); req_set_fail_links(req); - io_double_put_req(req); + io_put_req(req); + io_req_complete(req, ret); } } else if (req->flags & REQ_F_FORCE_ASYNC) { if (!req->io) { @@ -5937,8 +5917,8 @@ fail_req: static inline void io_queue_link_head(struct io_kiocb *req) { if (unlikely(req->flags & REQ_F_FAIL_LINK)) { - io_cqring_add_event(req, -ECANCELED); - io_double_put_req(req); + io_put_req(req); + io_req_complete(req, -ECANCELED); } else io_queue_sqe(req, NULL); } @@ -6195,8 +6175,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, if (unlikely(err)) { fail_req: - io_cqring_add_event(req, err); - io_double_put_req(req); + io_put_req(req); + io_req_complete(req, err); break; } -- cgit From 013538bd65fd3cdbf3ca8b0c99b962c70473c803 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Jun 2020 09:29:15 -0600 Subject: io_uring: add 'io_comp_state' to struct io_submit_state No functional changes in this patch, just in preparation for passing back pending completions to the caller and completing them in a batched fashion. Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 17d7bafaf8cf..002ab5eae20f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -678,6 +678,12 @@ struct io_kiocb { #define IO_IOPOLL_BATCH 8 +struct io_comp_state { + unsigned int nr; + struct list_head list; + struct io_ring_ctx *ctx; +}; + struct io_submit_state { struct blk_plug plug; @@ -687,6 +693,11 @@ struct io_submit_state { void *reqs[IO_IOPOLL_BATCH]; unsigned int free_reqs; + /* + * Batch completion logic + */ + struct io_comp_state comp; + /* * File reference cache */ @@ -6006,12 +6017,15 @@ static void io_submit_state_end(struct io_submit_state *state) * Start submission side cache. */ static void io_submit_state_start(struct io_submit_state *state, - unsigned int max_ios) + struct io_ring_ctx *ctx, unsigned int max_ios) { blk_start_plug(&state->plug); #ifdef CONFIG_BLOCK state->plug.nowait = true; #endif + state->comp.nr = 0; + INIT_LIST_HEAD(&state->comp.list); + state->comp.ctx = ctx; state->free_reqs = 0; state->file = NULL; state->ios_left = max_ios; @@ -6146,7 +6160,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; - io_submit_state_start(&state, nr); + io_submit_state_start(&state, ctx, nr); ctx->ring_fd = ring_fd; ctx->ring_file = ring_file; -- cgit From f13fad7ba41cef806358885fbb3f9004f3214b2d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Jun 2020 09:34:30 -0600 Subject: io_uring: pass down completion state on the issue side No functional changes in this patch, just in preparation for having the completion state be available on the issue side. Later on, this will allow requests that complete inline to be completed in batches. Signed-off-by: Jens Axboe --- fs/io_uring.c | 67 ++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 17 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 002ab5eae20f..46241c1ad1b8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -909,7 +909,8 @@ static void io_cleanup_req(struct io_kiocb *req); static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, int fd, struct file **out_file, bool fixed); static void __io_queue_sqe(struct io_kiocb *req, - const struct io_uring_sqe *sqe); + const struct io_uring_sqe *sqe, + struct io_comp_state *cs); static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter, @@ -2806,7 +2807,7 @@ static void io_async_buf_retry(struct callback_head *cb) __set_current_state(TASK_RUNNING); if (!__io_sq_thread_acquire_mm(ctx)) { mutex_lock(&ctx->uring_lock); - __io_queue_sqe(req, NULL); + __io_queue_sqe(req, NULL, NULL); mutex_unlock(&ctx->uring_lock); } else { __io_async_buf_error(req, -EFAULT); @@ -4430,7 +4431,7 @@ static void io_poll_task_func(struct callback_head *cb) struct io_ring_ctx *ctx = nxt->ctx; mutex_lock(&ctx->uring_lock); - __io_queue_sqe(nxt, NULL); + __io_queue_sqe(nxt, NULL, NULL); mutex_unlock(&ctx->uring_lock); } } @@ -4555,7 +4556,7 @@ static void io_async_task_func(struct callback_head *cb) goto end_req; } mutex_lock(&ctx->uring_lock); - __io_queue_sqe(req, NULL); + __io_queue_sqe(req, NULL, NULL); mutex_unlock(&ctx->uring_lock); } else { io_cqring_ev_posted(ctx); @@ -5352,7 +5353,7 @@ static void io_cleanup_req(struct io_kiocb *req) } static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) + bool force_nonblock, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -5637,7 +5638,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) if (!ret) { do { - ret = io_issue_sqe(req, NULL, false); + ret = io_issue_sqe(req, NULL, false, NULL); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -5814,7 +5815,8 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) return nxt; } -static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_comp_state *cs) { struct io_kiocb *linked_timeout; struct io_kiocb *nxt; @@ -5834,7 +5836,7 @@ again: old_creds = override_creds(req->work.creds); } - ret = io_issue_sqe(req, sqe, true); + ret = io_issue_sqe(req, sqe, true, cs); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -5892,7 +5894,8 @@ exit: revert_creds(old_creds); } -static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_comp_state *cs) { int ret; @@ -5921,21 +5924,22 @@ fail_req: req->work.flags |= IO_WQ_WORK_CONCURRENT; io_queue_async_work(req); } else { - __io_queue_sqe(req, sqe); + __io_queue_sqe(req, sqe, cs); } } -static inline void io_queue_link_head(struct io_kiocb *req) +static inline void io_queue_link_head(struct io_kiocb *req, + struct io_comp_state *cs) { if (unlikely(req->flags & REQ_F_FAIL_LINK)) { io_put_req(req); io_req_complete(req, -ECANCELED); } else - io_queue_sqe(req, NULL); + io_queue_sqe(req, NULL, cs); } static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **link) + struct io_kiocb **link, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -5975,7 +5979,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, /* last request of a link, enqueue the link */ if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { - io_queue_link_head(head); + io_queue_link_head(head, cs); *link = NULL; } } else { @@ -5995,18 +5999,47 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->flags |= REQ_F_FAIL_LINK; *link = req; } else { - io_queue_sqe(req, sqe); + io_queue_sqe(req, sqe, cs); } } return 0; } +static void io_submit_flush_completions(struct io_comp_state *cs) +{ + struct io_ring_ctx *ctx = cs->ctx; + + spin_lock_irq(&ctx->completion_lock); + while (!list_empty(&cs->list)) { + struct io_kiocb *req; + + req = list_first_entry(&cs->list, struct io_kiocb, list); + list_del(&req->list); + io_cqring_fill_event(req, req->result); + if (!(req->flags & REQ_F_LINK_HEAD)) { + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); + } else { + spin_unlock_irq(&ctx->completion_lock); + io_put_req(req); + spin_lock_irq(&ctx->completion_lock); + } + } + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + cs->nr = 0; +} + /* * Batched submission is done, ensure local IO is flushed out. */ static void io_submit_state_end(struct io_submit_state *state) { + if (!list_empty(&state->comp.list)) + io_submit_flush_completions(&state->comp); blk_finish_plug(&state->plug); io_state_file_put(state); if (state->free_reqs) @@ -6196,7 +6229,7 @@ fail_req: trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, true, io_async_submit(ctx)); - err = io_submit_sqe(req, sqe, &link); + err = io_submit_sqe(req, sqe, &link, &state.comp); if (err) goto fail_req; } @@ -6207,7 +6240,7 @@ fail_req: percpu_ref_put_many(&ctx->refs, nr - ref_used); } if (link) - io_queue_link_head(link); + io_queue_link_head(link, &state.comp); io_submit_state_end(&state); /* Commit SQ ring head once we've consumed and submitted all SQEs */ -- cgit From 229a7b63507a3e84afb17c3bbb67505a81d28a1d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Jun 2020 10:13:11 -0600 Subject: io_uring: pass in completion state to appropriate issue side handlers Provide the completion state to the handlers that we know can complete inline, so they can utilize this for batching completions. Cap the max batch count at 32. This should be enough to provide a good amortization of the cost of the lock+commit dance for completions, while still being low enough not to cause any real latency issues for SQPOLL applications. Xuan Zhuo reports that this changes his profile from: 17.97% [kernel] [k] copy_user_generic_unrolled 13.92% [kernel] [k] io_commit_cqring 11.04% [kernel] [k] __io_cqring_fill_event 10.33% [kernel] [k] udp_recvmsg 5.94% [kernel] [k] skb_release_data 4.31% [kernel] [k] udp_rmem_release 2.68% [kernel] [k] __check_object_size 2.24% [kernel] [k] __slab_free 2.22% [kernel] [k] _raw_spin_lock_bh 2.21% [kernel] [k] kmem_cache_free 2.13% [kernel] [k] free_pcppages_bulk 1.83% [kernel] [k] io_submit_sqes 1.38% [kernel] [k] page_frag_free 1.31% [kernel] [k] inet_recvmsg to 19.99% [kernel] [k] copy_user_generic_unrolled 11.63% [kernel] [k] skb_release_data 9.36% [kernel] [k] udp_rmem_release 8.64% [kernel] [k] udp_recvmsg 6.21% [kernel] [k] __slab_free 4.39% [kernel] [k] __check_object_size 3.64% [kernel] [k] free_pcppages_bulk 2.41% [kernel] [k] kmem_cache_free 2.00% [kernel] [k] io_submit_sqes 1.95% [kernel] [k] page_frag_free 1.54% [kernel] [k] io_put_req [...] 0.07% [kernel] [k] io_commit_cqring 0.44% [kernel] [k] __io_cqring_fill_event Signed-off-by: Jens Axboe --- fs/io_uring.c | 153 +++++++++++++++++++++++++++++++++------------------------- 1 file changed, 86 insertions(+), 67 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 46241c1ad1b8..6c9ca4fcbc31 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1360,15 +1360,50 @@ static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags) io_cqring_ev_posted(ctx); } -static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags) +static void io_submit_flush_completions(struct io_comp_state *cs) { - io_cqring_add_event(req, res, cflags); - io_put_req(req); + struct io_ring_ctx *ctx = cs->ctx; + + spin_lock_irq(&ctx->completion_lock); + while (!list_empty(&cs->list)) { + struct io_kiocb *req; + + req = list_first_entry(&cs->list, struct io_kiocb, list); + list_del(&req->list); + io_cqring_fill_event(req, req->result); + if (!(req->flags & REQ_F_LINK_HEAD)) { + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); + } else { + spin_unlock_irq(&ctx->completion_lock); + io_put_req(req); + spin_lock_irq(&ctx->completion_lock); + } + } + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + cs->nr = 0; +} + +static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags, + struct io_comp_state *cs) +{ + if (!cs) { + io_cqring_add_event(req, res, cflags); + io_put_req(req); + } else { + req->result = res; + list_add_tail(&req->list, &cs->list); + if (++cs->nr >= 32) + io_submit_flush_completions(cs); + } } static void io_req_complete(struct io_kiocb *req, long res) { - __io_req_complete(req, res, 0); + __io_req_complete(req, res, 0, NULL); } static inline bool io_is_fallback_req(struct io_kiocb *req) @@ -3179,14 +3214,14 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock) /* * IORING_OP_NOP just posts a completion event, nothing else. */ -static int io_nop(struct io_kiocb *req) +static int io_nop(struct io_kiocb *req, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - io_req_complete(req, 0); + __io_req_complete(req, 0, 0, cs); return 0; } @@ -3408,7 +3443,8 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, return i; } -static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock) +static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; @@ -3427,7 +3463,7 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock) io_ring_submit_lock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -3485,7 +3521,8 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) return i ? i : -ENOMEM; } -static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock) +static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; @@ -3514,7 +3551,7 @@ out: io_ring_submit_unlock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -3545,7 +3582,8 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, #endif } -static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock) +static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { #if defined(CONFIG_EPOLL) struct io_epoll *ie = &req->epoll; @@ -3557,7 +3595,7 @@ static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; #else return -EOPNOTSUPP; @@ -3702,7 +3740,8 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_close(struct io_kiocb *req, bool force_nonblock) +static int io_close(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_close *close = &req->close; int ret; @@ -3729,7 +3768,7 @@ static int io_close(struct io_kiocb *req, bool force_nonblock) req_set_fail_links(req); fput(close->put_file); close->put_file = NULL; - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -3815,7 +3854,8 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ret; } -static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) +static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_async_msghdr *kmsg = NULL; struct socket *sock; @@ -3864,11 +3904,12 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } -static int io_send(struct io_kiocb *req, bool force_nonblock) +static int io_send(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct socket *sock; int ret; @@ -3906,7 +3947,7 @@ static int io_send(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -4049,7 +4090,8 @@ static int io_recvmsg_prep(struct io_kiocb *req, return ret; } -static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) +static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_async_msghdr *kmsg = NULL; struct socket *sock; @@ -4105,11 +4147,12 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - __io_req_complete(req, ret, cflags); + __io_req_complete(req, ret, cflags, cs); return 0; } -static int io_recv(struct io_kiocb *req, bool force_nonblock) +static int io_recv(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_buffer *kbuf = NULL; struct socket *sock; @@ -4161,7 +4204,7 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - __io_req_complete(req, ret, cflags); + __io_req_complete(req, ret, cflags, cs); return 0; } @@ -4181,7 +4224,8 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_accept(struct io_kiocb *req, bool force_nonblock) +static int io_accept(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_accept *accept = &req->accept; unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; @@ -4200,7 +4244,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock) ret = -EINTR; req_set_fail_links(req); } - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -4224,7 +4268,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) &io->connect.address); } -static int io_connect(struct io_kiocb *req, bool force_nonblock) +static int io_connect(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_async_ctx __io, *io; unsigned file_flags; @@ -4260,7 +4305,7 @@ static int io_connect(struct io_kiocb *req, bool force_nonblock) out: if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } #else /* !CONFIG_NET */ @@ -5141,7 +5186,8 @@ static int io_files_update_prep(struct io_kiocb *req, return 0; } -static int io_files_update(struct io_kiocb *req, bool force_nonblock) +static int io_files_update(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; struct io_uring_files_update up; @@ -5159,7 +5205,7 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -5360,7 +5406,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, switch (req->opcode) { case IORING_OP_NOP: - ret = io_nop(req); + ret = io_nop(req, cs); break; case IORING_OP_READV: case IORING_OP_READ_FIXED: @@ -5422,9 +5468,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; } if (req->opcode == IORING_OP_SENDMSG) - ret = io_sendmsg(req, force_nonblock); + ret = io_sendmsg(req, force_nonblock, cs); else - ret = io_send(req, force_nonblock); + ret = io_send(req, force_nonblock, cs); break; case IORING_OP_RECVMSG: case IORING_OP_RECV: @@ -5434,9 +5480,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; } if (req->opcode == IORING_OP_RECVMSG) - ret = io_recvmsg(req, force_nonblock); + ret = io_recvmsg(req, force_nonblock, cs); else - ret = io_recv(req, force_nonblock); + ret = io_recv(req, force_nonblock, cs); break; case IORING_OP_TIMEOUT: if (sqe) { @@ -5460,7 +5506,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_accept(req, force_nonblock); + ret = io_accept(req, force_nonblock, cs); break; case IORING_OP_CONNECT: if (sqe) { @@ -5468,7 +5514,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_connect(req, force_nonblock); + ret = io_connect(req, force_nonblock, cs); break; case IORING_OP_ASYNC_CANCEL: if (sqe) { @@ -5500,7 +5546,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_close(req, force_nonblock); + ret = io_close(req, force_nonblock, cs); break; case IORING_OP_FILES_UPDATE: if (sqe) { @@ -5508,7 +5554,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_files_update(req, force_nonblock); + ret = io_files_update(req, force_nonblock, cs); break; case IORING_OP_STATX: if (sqe) { @@ -5548,7 +5594,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_epoll_ctl(req, force_nonblock); + ret = io_epoll_ctl(req, force_nonblock, cs); break; case IORING_OP_SPLICE: if (sqe) { @@ -5564,7 +5610,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_provide_buffers(req, force_nonblock); + ret = io_provide_buffers(req, force_nonblock, cs); break; case IORING_OP_REMOVE_BUFFERS: if (sqe) { @@ -5572,7 +5618,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_remove_buffers(req, force_nonblock); + ret = io_remove_buffers(req, force_nonblock, cs); break; case IORING_OP_TEE: if (sqe) { @@ -6006,33 +6052,6 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static void io_submit_flush_completions(struct io_comp_state *cs) -{ - struct io_ring_ctx *ctx = cs->ctx; - - spin_lock_irq(&ctx->completion_lock); - while (!list_empty(&cs->list)) { - struct io_kiocb *req; - - req = list_first_entry(&cs->list, struct io_kiocb, list); - list_del(&req->list); - io_cqring_fill_event(req, req->result); - if (!(req->flags & REQ_F_LINK_HEAD)) { - req->flags |= REQ_F_COMP_LOCKED; - io_put_req(req); - } else { - spin_unlock_irq(&ctx->completion_lock); - io_put_req(req); - spin_lock_irq(&ctx->completion_lock); - } - } - io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); - - io_cqring_ev_posted(ctx); - cs->nr = 0; -} - /* * Batched submission is done, ensure local IO is flushed out. */ -- cgit From a1d7c393c4711a9ce6c239c3ab053a50dc96505a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Jun 2020 11:09:46 -0600 Subject: io_uring: enable READ/WRITE to use deferred completions A bit more surgery required here, as completions are generally done through the kiocb->ki_complete() callback, even if they complete inline. This enables the regular read/write path to use the io_comp_state logic to batch inline completions. Signed-off-by: Jens Axboe --- fs/io_uring.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6c9ca4fcbc31..0bba12e4e559 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2019,7 +2019,8 @@ static inline void req_set_fail_links(struct io_kiocb *req) req->flags |= REQ_F_FAIL_LINK; } -static void io_complete_rw_common(struct kiocb *kiocb, long res) +static void io_complete_rw_common(struct kiocb *kiocb, long res, + struct io_comp_state *cs) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); int cflags = 0; @@ -2031,7 +2032,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res) req_set_fail_links(req); if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_kbuf(req); - io_cqring_add_event(req, res, cflags); + __io_req_complete(req, res, cflags, cs); } static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) @@ -2141,14 +2142,18 @@ static bool io_rw_reissue(struct io_kiocb *req, long res) return false; } +static void __io_complete_rw(struct io_kiocb *req, long res, long res2, + struct io_comp_state *cs) +{ + if (!io_rw_reissue(req, res)) + io_complete_rw_common(&req->rw.kiocb, res, cs); +} + static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - if (!io_rw_reissue(req, res)) { - io_complete_rw_common(kiocb, res); - io_put_req(req); - } + __io_complete_rw(req, res, res2, NULL); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) @@ -2382,14 +2387,15 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } -static void kiocb_done(struct kiocb *kiocb, ssize_t ret) +static void kiocb_done(struct kiocb *kiocb, ssize_t ret, + struct io_comp_state *cs) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); if (req->flags & REQ_F_CUR_POS) req->file->f_pos = kiocb->ki_pos; if (ret >= 0 && kiocb->ki_complete == io_complete_rw) - io_complete_rw(kiocb, ret, 0); + __io_complete_rw(req, ret, 0, cs); else io_rw_done(kiocb, ret); } @@ -2925,7 +2931,8 @@ static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter); } -static int io_read(struct io_kiocb *req, bool force_nonblock) +static int io_read(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; @@ -2960,7 +2967,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { - kiocb_done(kiocb, ret2); + kiocb_done(kiocb, ret2, cs); } else { iter.count = iov_count; iter.nr_segs = nr_segs; @@ -2975,7 +2982,7 @@ copy_iov: if (ret2 == -EIOCBQUEUED) { goto out_free; } else if (ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2); + kiocb_done(kiocb, ret2, cs); goto out_free; } } @@ -3021,7 +3028,8 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static int io_write(struct io_kiocb *req, bool force_nonblock) +static int io_write(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; @@ -3090,7 +3098,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock) if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) ret2 = -EAGAIN; if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2); + kiocb_done(kiocb, ret2, cs); } else { iter.count = iov_count; iter.nr_segs = nr_segs; @@ -5416,7 +5424,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_read(req, force_nonblock); + ret = io_read(req, force_nonblock, cs); break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: @@ -5426,7 +5434,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_write(req, force_nonblock); + ret = io_write(req, force_nonblock, cs); break; case IORING_OP_FSYNC: if (sqe) { -- cgit From c40f63790ec957e9449056fb78d8c2523eff96b5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Jun 2020 15:39:59 -0600 Subject: io_uring: use task_work for links if possible Currently links are always done in an async fashion, unless we catch them inline after we successfully complete a request without having to resort to blocking. This isn't necessarily the most efficient approach, it'd be more ideal if we could just use the task_work handling for this. Outside of saving an async jump, we can also do less prep work for these kinds of requests. Running dependent links from the task_work handler yields some nice performance benefits. As an example, examples/link-cp from the liburing repository uses read+write links to implement a copy operation. Without this patch, the a cache fold 4G file read from a VM runs in about 3 seconds: $ time examples/link-cp /data/file /dev/null real 0m2.986s user 0m0.051s sys 0m2.843s and a subsequent cache hot run looks like this: $ time examples/link-cp /data/file /dev/null real 0m0.898s user 0m0.069s sys 0m0.797s With this patch in place, the cold case takes about 2.4 seconds: $ time examples/link-cp /data/file /dev/null real 0m2.400s user 0m0.020s sys 0m2.366s and the cache hot case looks like this: $ time examples/link-cp /data/file /dev/null real 0m0.676s user 0m0.010s sys 0m0.665s As expected, the (mostly) cache hot case yields the biggest improvement, running about 25% faster with this change, while the cache cold case yields about a 20% increase in performance. Outside of the performance increase, we're using less CPU as well, as we're not using the async offload threads at all for this anymore. Signed-off-by: Jens Axboe --- fs/io_uring.c | 191 +++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 117 insertions(+), 74 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0bba12e4e559..b628e4429b75 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -898,6 +898,7 @@ enum io_mem_account { static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); +static void io_double_put_req(struct io_kiocb *req); static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); @@ -951,6 +952,41 @@ static void __io_put_req_task(struct io_kiocb *req) put_task_struct(req->task); } +static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) +{ + struct mm_struct *mm = current->mm; + + if (mm) { + kthread_unuse_mm(mm); + mmput(mm); + } +} + +static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) +{ + if (!current->mm) { + if (unlikely(!mmget_not_zero(ctx->sqo_mm))) + return -EFAULT; + kthread_use_mm(ctx->sqo_mm); + } + + return 0; +} + +static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if (!io_op_defs[req->opcode].needs_mm) + return 0; + return __io_sq_thread_acquire_mm(ctx); +} + +static inline void req_set_fail_links(struct io_kiocb *req) +{ + if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; +} + static void io_file_put_work(struct work_struct *work); /* @@ -1664,6 +1700,64 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) } } +static void __io_req_task_cancel(struct io_kiocb *req, int error) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->completion_lock); + io_cqring_fill_event(req, error); + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + req_set_fail_links(req); + io_double_put_req(req); +} + +static void io_req_task_cancel(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + + __io_req_task_cancel(req, -ECANCELED); +} + +static void __io_req_task_submit(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + __set_current_state(TASK_RUNNING); + if (!__io_sq_thread_acquire_mm(ctx)) { + mutex_lock(&ctx->uring_lock); + __io_queue_sqe(req, NULL, NULL); + mutex_unlock(&ctx->uring_lock); + } else { + __io_req_task_cancel(req, -EFAULT); + } +} + +static void io_req_task_submit(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + + __io_req_task_submit(req); +} + +static void io_req_task_queue(struct io_kiocb *req) +{ + struct task_struct *tsk = req->task; + int ret; + + init_task_work(&req->task_work, io_req_task_submit); + + ret = task_work_add(tsk, &req->task_work, true); + if (unlikely(ret)) { + init_task_work(&req->task_work, io_req_task_cancel); + tsk = io_wq_get_task(req->ctx->io_wq); + task_work_add(tsk, &req->task_work, true); + } + wake_up_process(tsk); +} + static void io_free_req(struct io_kiocb *req) { struct io_kiocb *nxt = NULL; @@ -1671,8 +1765,12 @@ static void io_free_req(struct io_kiocb *req) io_req_find_next(req, &nxt); __io_free_req(req); - if (nxt) - io_queue_async_work(nxt); + if (nxt) { + if (nxt->flags & REQ_F_WORK_INITIALIZED) + io_queue_async_work(nxt); + else + io_req_task_queue(nxt); + } } static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) @@ -2013,12 +2111,6 @@ static void kiocb_end_write(struct io_kiocb *req) file_end_write(req->file); } -static inline void req_set_fail_links(struct io_kiocb *req) -{ - if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; -} - static void io_complete_rw_common(struct kiocb *kiocb, long res, struct io_comp_state *cs) { @@ -2035,35 +2127,6 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res, __io_req_complete(req, res, cflags, cs); } -static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) -{ - struct mm_struct *mm = current->mm; - - if (mm) { - kthread_unuse_mm(mm); - mmput(mm); - } -} - -static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) -{ - if (!current->mm) { - if (unlikely(!mmget_not_zero(ctx->sqo_mm))) - return -EFAULT; - kthread_use_mm(ctx->sqo_mm); - } - - return 0; -} - -static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - if (!io_op_defs[req->opcode].needs_mm) - return 0; - return __io_sq_thread_acquire_mm(ctx); -} - #ifdef CONFIG_BLOCK static bool io_resubmit_prep(struct io_kiocb *req, int error) { @@ -2811,20 +2874,6 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static void __io_async_buf_error(struct io_kiocb *req, int error) -{ - struct io_ring_ctx *ctx = req->ctx; - - spin_lock_irq(&ctx->completion_lock); - io_cqring_fill_event(req, error); - io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); - - io_cqring_ev_posted(ctx); - req_set_fail_links(req); - io_double_put_req(req); -} - static void io_async_buf_cancel(struct callback_head *cb) { struct io_async_rw *rw; @@ -2832,27 +2881,18 @@ static void io_async_buf_cancel(struct callback_head *cb) rw = container_of(cb, struct io_async_rw, task_work); req = rw->wpq.wait.private; - __io_async_buf_error(req, -ECANCELED); + __io_req_task_cancel(req, -ECANCELED); } static void io_async_buf_retry(struct callback_head *cb) { struct io_async_rw *rw; - struct io_ring_ctx *ctx; struct io_kiocb *req; rw = container_of(cb, struct io_async_rw, task_work); req = rw->wpq.wait.private; - ctx = req->ctx; - __set_current_state(TASK_RUNNING); - if (!__io_sq_thread_acquire_mm(ctx)) { - mutex_lock(&ctx->uring_lock); - __io_queue_sqe(req, NULL, NULL); - mutex_unlock(&ctx->uring_lock); - } else { - __io_async_buf_error(req, -EFAULT); - } + __io_req_task_submit(req); } static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, @@ -5218,22 +5258,24 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock, } static int io_req_defer_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) + const struct io_uring_sqe *sqe, bool for_async) { ssize_t ret = 0; if (!sqe) return 0; - io_req_init_async(req); + if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) { + io_req_init_async(req); - if (io_op_defs[req->opcode].file_table) { - ret = io_grab_files(req); - if (unlikely(ret)) - return ret; - } + if (io_op_defs[req->opcode].file_table) { + ret = io_grab_files(req); + if (unlikely(ret)) + return ret; + } - io_req_work_grab_env(req, &io_op_defs[req->opcode]); + io_req_work_grab_env(req, &io_op_defs[req->opcode]); + } switch (req->opcode) { case IORING_OP_NOP: @@ -5347,7 +5389,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!req->io) { if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe); + ret = io_req_defer_prep(req, sqe, true); if (ret < 0) return ret; } @@ -5966,7 +6008,7 @@ fail_req: ret = -EAGAIN; if (io_alloc_async_ctx(req)) goto fail_req; - ret = io_req_defer_prep(req, sqe); + ret = io_req_defer_prep(req, sqe, true); if (unlikely(ret < 0)) goto fail_req; } @@ -6022,13 +6064,14 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe); + ret = io_req_defer_prep(req, sqe, false); if (ret) { /* fail even hard links since we don't submit */ head->flags |= REQ_F_FAIL_LINK; return ret; } trace_io_uring_link(ctx, req, head); + io_get_req_task(req); list_add_tail(&req->link_list, &head->link_list); /* last request of a link, enqueue the link */ @@ -6048,7 +6091,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe); + ret = io_req_defer_prep(req, sqe, true); if (ret) req->flags |= REQ_F_FAIL_LINK; *link = req; -- cgit From f4db7182e0de981a3f1b356e0cf43c6815423055 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Jun 2020 18:20:54 +0300 Subject: io-wq: return next work from ->do_work() directly It's easier to return next work from ->do_work() than having an in-out argument. Looks nicer and easier to compile. Also, merge io_wq_assign_next() into its only user. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 53 +++++++++++++++++++++-------------------------------- 1 file changed, 21 insertions(+), 32 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index b628e4429b75..2e44b3788265 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -895,7 +895,6 @@ enum io_mem_account { ACCT_PINNED, }; -static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); static void io_double_put_req(struct io_kiocb *req); @@ -1773,20 +1772,6 @@ static void io_free_req(struct io_kiocb *req) } } -static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) -{ - struct io_kiocb *link; - const struct io_op_def *def = &io_op_defs[nxt->opcode]; - - if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file) - io_wq_hash_work(&nxt->work, file_inode(nxt->file)); - - *workptr = &nxt->work; - link = io_prep_linked_timeout(nxt); - if (link) - nxt->flags |= REQ_F_QUEUE_TIMEOUT; -} - /* * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. @@ -1806,24 +1791,29 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } -static void io_steal_work(struct io_kiocb *req, - struct io_wq_work **workptr) +static struct io_wq_work *io_steal_work(struct io_kiocb *req) { + struct io_kiocb *link, *nxt = NULL; + /* - * It's in an io-wq worker, so there always should be at least - * one reference, which will be dropped in io_put_work() just - * after the current handler returns. - * - * It also means, that if the counter dropped to 1, then there is - * no asynchronous users left, so it's safe to steal the next work. + * A ref is owned by io-wq in which context we're. So, if that's the + * last one, it's safe to steal next work. False negatives are Ok, + * it just will be re-punted async in io_put_work() */ - if (refcount_read(&req->refs) == 1) { - struct io_kiocb *nxt = NULL; + if (refcount_read(&req->refs) != 1) + return NULL; - io_req_find_next(req, &nxt); - if (nxt) - io_wq_assign_next(workptr, nxt); - } + io_req_find_next(req, &nxt); + if (!nxt) + return NULL; + + if ((nxt->flags & REQ_F_ISREG) && io_op_defs[nxt->opcode].hash_reg_file) + io_wq_hash_work(&nxt->work, file_inode(nxt->file)); + + link = io_prep_linked_timeout(nxt); + if (link) + nxt->flags |= REQ_F_QUEUE_TIMEOUT; + return &nxt->work; } /* @@ -5718,9 +5708,8 @@ static void io_arm_async_linked_timeout(struct io_kiocb *req) io_queue_linked_timeout(link); } -static void io_wq_submit_work(struct io_wq_work **workptr) +static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) { - struct io_wq_work *work = *workptr; struct io_kiocb *req = container_of(work, struct io_kiocb, work); int ret = 0; @@ -5751,7 +5740,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_req_complete(req, ret); } - io_steal_work(req, workptr); + return io_steal_work(req); } static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, -- cgit From 1e16c2f917a59d27fb6b540c44d66978c8ad29ef Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 26 Jun 2020 16:32:50 -0700 Subject: io_uring: fix function args for !CONFIG_NET MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix build errors when CONFIG_NET is not set/enabled: ../fs/io_uring.c:5472:10: error: too many arguments to function ‘io_sendmsg’ ../fs/io_uring.c:5474:10: error: too many arguments to function ‘io_send’ ../fs/io_uring.c:5484:10: error: too many arguments to function ‘io_recvmsg’ ../fs/io_uring.c:5486:10: error: too many arguments to function ‘io_recv’ ../fs/io_uring.c:5510:9: error: too many arguments to function ‘io_accept’ ../fs/io_uring.c:5518:9: error: too many arguments to function ‘io_connect’ Signed-off-by: Randy Dunlap Cc: Jens Axboe Cc: io-uring@vger.kernel.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index af4d7a5c49f4..43ddda2a3d49 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4360,12 +4360,14 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EOPNOTSUPP; } -static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) +static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } -static int io_send(struct io_kiocb *req, bool force_nonblock) +static int io_send(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } @@ -4376,12 +4378,14 @@ static int io_recvmsg_prep(struct io_kiocb *req, return -EOPNOTSUPP; } -static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) +static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } -static int io_recv(struct io_kiocb *req, bool force_nonblock) +static int io_recv(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } @@ -4391,7 +4395,8 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EOPNOTSUPP; } -static int io_accept(struct io_kiocb *req, bool force_nonblock) +static int io_accept(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } @@ -4401,7 +4406,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EOPNOTSUPP; } -static int io_connect(struct io_kiocb *req, bool force_nonblock) +static int io_connect(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } -- cgit From 8ef77766ba8694968ed4ba24311b4bacee14f235 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 27 Jun 2020 14:04:59 +0300 Subject: io_uring: fix req->work corruption req->work and req->task_work are in a union, so io_req_task_queue() screws everything that was in work. De-union them for now. [ 704.367253] BUG: unable to handle page fault for address: ffffffffaf7330d0 [ 704.367256] #PF: supervisor write access in kernel mode [ 704.367256] #PF: error_code(0x0003) - permissions violation [ 704.367261] CPU: 6 PID: 1654 Comm: io_wqe_worker-0 Tainted: G I 5.8.0-rc2-00038-ge28d0bdc4863-dirty #498 [ 704.367265] RIP: 0010:_raw_spin_lock+0x1e/0x36 ... [ 704.367276] __alloc_fd+0x35/0x150 [ 704.367279] __get_unused_fd_flags+0x25/0x30 [ 704.367280] io_openat2+0xcb/0x1b0 [ 704.367283] io_issue_sqe+0x36a/0x1320 [ 704.367294] io_wq_submit_work+0x58/0x160 [ 704.367295] io_worker_handle_work+0x2a3/0x430 [ 704.367296] io_wqe_worker+0x2a0/0x350 [ 704.367301] kthread+0x136/0x180 [ 704.367304] ret_from_fork+0x22/0x30 Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 43ddda2a3d49..dcf3ffb5ecf3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -668,12 +668,12 @@ struct io_kiocb { * restore the work, if needed. */ struct { - struct callback_head task_work; struct hlist_node hash_node; struct async_poll *apoll; }; struct io_wq_work work; }; + struct callback_head task_work; }; #define IO_IOPOLL_BATCH 8 -- cgit From 906a8c3fdbc367325d4200e39212a2a7715b7b0e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 27 Jun 2020 14:04:55 +0300 Subject: io_uring: fix punting req w/o grabbed env It's not enough to check for REQ_F_WORK_INITIALIZED and punt async assuming that io_req_work_grab_env() was called, it may not have been. E.g. io_close_prep() and personality path set the flag without further async init. As a quick fix, always pass next work through io_req_task_queue(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index dcf3ffb5ecf3..483457f6a7df 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1766,12 +1766,8 @@ static void io_free_req(struct io_kiocb *req) io_req_find_next(req, &nxt); __io_free_req(req); - if (nxt) { - if (nxt->flags & REQ_F_WORK_INITIALIZED) - io_queue_async_work(nxt); - else - io_req_task_queue(nxt); - } + if (nxt) + io_req_task_queue(nxt); } /* -- cgit From 1bcb8c5d65a845e0ecb9e82237c399b29b8d15ea Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 27 Jun 2020 14:04:56 +0300 Subject: io_uring: fix feeding io-wq with uninit reqs io_steal_work() can't be sure that @nxt has req->work properly set, so we can't pass it to io-wq as is. A dirty quick fix -- drag it through io_req_task_queue(), and always return NULL from io_steal_work(). e.g. [ 50.770161] BUG: kernel NULL pointer dereference, address: 00000000 [ 50.770164] #PF: supervisor write access in kernel mode [ 50.770164] #PF: error_code(0x0002) - not-present page [ 50.770168] CPU: 1 PID: 1448 Comm: io_wqe_worker-0 Tainted: G I 5.8.0-rc2-00035-g2237d76530eb-dirty #494 [ 50.770172] RIP: 0010:override_creds+0x19/0x30 ... [ 50.770183] io_worker_handle_work+0x25c/0x430 [ 50.770185] io_wqe_worker+0x2a0/0x350 [ 50.770190] kthread+0x136/0x180 [ 50.770194] ret_from_fork+0x22/0x30 Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 483457f6a7df..658949bed77f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1791,7 +1791,7 @@ static void io_put_req(struct io_kiocb *req) static struct io_wq_work *io_steal_work(struct io_kiocb *req) { - struct io_kiocb *link, *nxt = NULL; + struct io_kiocb *nxt = NULL; /* * A ref is owned by io-wq in which context we're. So, if that's the @@ -1808,10 +1808,15 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req) if ((nxt->flags & REQ_F_ISREG) && io_op_defs[nxt->opcode].hash_reg_file) io_wq_hash_work(&nxt->work, file_inode(nxt->file)); - link = io_prep_linked_timeout(nxt); - if (link) - nxt->flags |= REQ_F_QUEUE_TIMEOUT; - return &nxt->work; + io_req_task_queue(nxt); + /* + * If we're going to return actual work, here should be timeout prep: + * + * link = io_prep_linked_timeout(nxt); + * if (link) + * nxt->flags |= REQ_F_QUEUE_TIMEOUT; + */ + return NULL; } /* -- cgit From a6d45dd0d43e6d1275e002704540688b6768bc22 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 27 Jun 2020 14:04:57 +0300 Subject: io_uring: don't mark link's head for_async No reason to mark a head of a link as for-async in io_req_defer_prep(). grab_env(), etc. That will be done further during submission if neccessary. Mark for_async=false saving extra grab_env() in many cases. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 658949bed77f..545b137c7b4a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6092,7 +6092,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe, true); + ret = io_req_defer_prep(req, sqe, false); if (ret) req->flags |= REQ_F_FAIL_LINK; *link = req; -- cgit From 710c2bfb66474a186b0196e3342d43db0e6c04e1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 27 Jun 2020 14:04:58 +0300 Subject: io_uring: fix missing io_grab_files() We won't have valid ring_fd, ring_file in task work. Grab files early. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 545b137c7b4a..4a9929c0b4ad 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5270,15 +5270,15 @@ static int io_req_defer_prep(struct io_kiocb *req, if (!sqe) return 0; - if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) { + if (io_op_defs[req->opcode].file_table) { io_req_init_async(req); + ret = io_grab_files(req); + if (unlikely(ret)) + return ret; + } - if (io_op_defs[req->opcode].file_table) { - ret = io_grab_files(req); - if (unlikely(ret)) - return ret; - } - + if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) { + io_req_init_async(req); io_req_work_grab_env(req, &io_op_defs[req->opcode]); } -- cgit From 8c9cb6cd9a46ae6fb7cb6c39cf6a48a53440feef Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:29 +0300 Subject: io_uring: fix refs underflow in io_iopoll_queue() Now io_complete_rw_common() puts a ref, extra io_req_put() in io_iopoll_queue() causes undeflow. Remove it. [ 455.998620] refcount_t: underflow; use-after-free. [ 455.998743] WARNING: CPU: 6 PID: 285394 at lib/refcount.c:28 refcount_warn_saturate+0xae/0xf0 [ 455.998772] CPU: 6 PID: 285394 Comm: read-write2 Tainted: G I E 5.8.0-rc2-00048-g1b1aa738f167-dirty #509 [ 455.998772] RIP: 0010:refcount_warn_saturate+0xae/0xf0 ... [ 455.998778] Call Trace: [ 455.998778] io_put_req+0x44/0x50 [ 455.998778] io_iopoll_complete+0x245/0x370 [ 455.998779] io_iopoll_getevents+0x12f/0x1a0 [ 455.998779] io_iopoll_reap_events.part.0+0x5e/0xa0 [ 455.998780] io_ring_ctx_wait_and_kill+0x132/0x1c0 [ 455.998780] io_uring_release+0x20/0x30 [ 455.998780] __fput+0xcd/0x230 [ 455.998781] ____fput+0xe/0x10 [ 455.998781] task_work_run+0x67/0xa0 [ 455.998781] do_exit+0x35d/0xb70 [ 455.998782] do_group_exit+0x43/0xa0 [ 455.998783] get_signal+0x140/0x900 [ 455.998783] do_signal+0x37/0x780 [ 455.998784] __prepare_exit_to_usermode+0x126/0x1c0 [ 455.998785] __syscall_return_slowpath+0x3b/0x1c0 [ 455.998785] do_syscall_64+0x5f/0xa0 [ 455.998785] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: a1d7c393c47 ("io_uring: enable READ/WRITE to use deferred completions") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4a9929c0b4ad..ab9f2f3a9b56 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1904,7 +1904,6 @@ static void io_iopoll_queue(struct list_head *again) /* shouldn't happen unless io_uring is dying, cancel reqs */ if (unlikely(!current->mm)) { io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL); - io_put_req(req); continue; } -- cgit From e6543a816edca00b6b4c48625d142059d7211059 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:30 +0300 Subject: io_uring: remove inflight batching in free_many() io_free_req_many() is used only for iopoll requests, i.e. reads/writes. Hence no need to batch inflight unhooking. For safety, it'll be done by io_dismantle_req(), which replaces __io_req_aux_free(), and looks more solid and cleaner. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 41 ++++++++--------------------------------- 1 file changed, 8 insertions(+), 33 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index ab9f2f3a9b56..9863cec8020f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1504,7 +1504,7 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file, fput(file); } -static void __io_req_aux_free(struct io_kiocb *req) +static void io_dismantle_req(struct io_kiocb *req) { if (req->flags & REQ_F_NEED_CLEANUP) io_cleanup_req(req); @@ -1514,11 +1514,6 @@ static void __io_req_aux_free(struct io_kiocb *req) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); __io_put_req_task(req); io_req_work_drop_env(req); -} - -static void __io_free_req(struct io_kiocb *req) -{ - __io_req_aux_free(req); if (req->flags & REQ_F_INFLIGHT) { struct io_ring_ctx *ctx = req->ctx; @@ -1530,7 +1525,11 @@ static void __io_free_req(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); } +} +static void __io_free_req(struct io_kiocb *req) +{ + io_dismantle_req(req); percpu_ref_put(&req->ctx->refs); if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); @@ -1549,35 +1548,11 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) if (!rb->to_free) return; if (rb->need_iter) { - int i, inflight = 0; - unsigned long flags; - - for (i = 0; i < rb->to_free; i++) { - struct io_kiocb *req = rb->reqs[i]; - - if (req->flags & REQ_F_INFLIGHT) - inflight++; - __io_req_aux_free(req); - } - if (!inflight) - goto do_free; - - spin_lock_irqsave(&ctx->inflight_lock, flags); - for (i = 0; i < rb->to_free; i++) { - struct io_kiocb *req = rb->reqs[i]; - - if (req->flags & REQ_F_INFLIGHT) { - list_del(&req->inflight_entry); - if (!--inflight) - break; - } - } - spin_unlock_irqrestore(&ctx->inflight_lock, flags); + int i; - if (waitqueue_active(&ctx->inflight_wait)) - wake_up(&ctx->inflight_wait); + for (i = 0; i < rb->to_free; i++) + io_dismantle_req(rb->reqs[i]); } -do_free: kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); percpu_ref_put_many(&ctx->refs, rb->to_free); rb->to_free = rb->need_iter = 0; -- cgit From 2757a23e7f6441eabf605ca59eeb88c34071757d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:31 +0300 Subject: io_uring: dismantle req early and remove need_iter Every request in io_req_multi_free() is has ->file set. Instead of pointlessly defering and counting reqs with file, dismantle it on place and save for batch dealloc. It also saves us from potentially skipping io_cleanup_req(), put_task(), etc. Never happens though, becacuse ->file is always there. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9863cec8020f..8cb5252269d7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1540,22 +1540,16 @@ static void __io_free_req(struct io_kiocb *req) struct req_batch { void *reqs[IO_IOPOLL_BATCH]; int to_free; - int need_iter; }; static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) { if (!rb->to_free) return; - if (rb->need_iter) { - int i; - for (i = 0; i < rb->to_free; i++) - io_dismantle_req(rb->reqs[i]); - } kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); percpu_ref_put_many(&ctx->refs, rb->to_free); - rb->to_free = rb->need_iter = 0; + rb->to_free = 0; } static bool io_link_cancel_timeout(struct io_kiocb *req) @@ -1846,9 +1840,7 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req)) return false; - if (req->file || req->io) - rb->need_iter++; - + io_dismantle_req(req); rb->reqs[rb->to_free++] = req; if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) io_free_req_many(req->ctx, rb); @@ -1900,7 +1892,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, /* order with ->result store in io_complete_rw_iopoll() */ smp_rmb(); - rb.to_free = rb.need_iter = 0; + rb.to_free = 0; while (!list_empty(done)) { int cflags = 0; -- cgit From c3524383333e4ff2f720ab0c02b3a329f72de78b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:32 +0300 Subject: io_uring: batch-free linked requests as well There is no reason to not batch deallocation of linked requests. Take away its next req first and handle it as everything else in io_req_multi_free(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8cb5252269d7..af8d1d64f858 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1728,17 +1728,21 @@ static void io_req_task_queue(struct io_kiocb *req) wake_up_process(tsk); } -static void io_free_req(struct io_kiocb *req) +static void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = NULL; io_req_find_next(req, &nxt); - __io_free_req(req); - if (nxt) io_req_task_queue(nxt); } +static void io_free_req(struct io_kiocb *req) +{ + io_queue_next(req); + __io_free_req(req); +} + /* * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. @@ -1835,16 +1839,19 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } -static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) +static inline void io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) { - if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req)) - return false; + if (unlikely(io_is_fallback_req(req))) { + io_free_req(req); + return; + } + if (req->flags & REQ_F_LINK_HEAD) + io_queue_next(req); io_dismantle_req(req); rb->reqs[rb->to_free++] = req; if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) io_free_req_many(req->ctx, rb); - return true; } static int io_put_kbuf(struct io_kiocb *req) @@ -1910,9 +1917,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, __io_cqring_fill_event(req, req->result, cflags); (*nr_events)++; - if (refcount_dec_and_test(&req->refs) && - !io_req_multi_free(&rb, req)) - io_free_req(req); + if (refcount_dec_and_test(&req->refs)) + io_req_multi_free(&rb, req); } io_commit_cqring(ctx); -- cgit From 2d6500d44c1374808040d120e625a22b013c9f0d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:33 +0300 Subject: io_uring: cosmetic changes for batch free Move all batch free bits close to each other and rename in a consistent way. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 69 ++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 37 insertions(+), 32 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index af8d1d64f858..18a452ac81cc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1537,21 +1537,6 @@ static void __io_free_req(struct io_kiocb *req) clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req); } -struct req_batch { - void *reqs[IO_IOPOLL_BATCH]; - int to_free; -}; - -static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) -{ - if (!rb->to_free) - return; - - kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); - percpu_ref_put_many(&ctx->refs, rb->to_free); - rb->to_free = 0; -} - static bool io_link_cancel_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1743,6 +1728,41 @@ static void io_free_req(struct io_kiocb *req) __io_free_req(req); } +struct req_batch { + void *reqs[IO_IOPOLL_BATCH]; + int to_free; +}; + +static void __io_req_free_batch_flush(struct io_ring_ctx *ctx, + struct req_batch *rb) +{ + kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); + percpu_ref_put_many(&ctx->refs, rb->to_free); + rb->to_free = 0; +} + +static void io_req_free_batch_finish(struct io_ring_ctx *ctx, + struct req_batch *rb) +{ + if (rb->to_free) + __io_req_free_batch_flush(ctx, rb); +} + +static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) +{ + if (unlikely(io_is_fallback_req(req))) { + io_free_req(req); + return; + } + if (req->flags & REQ_F_LINK_HEAD) + io_queue_next(req); + + io_dismantle_req(req); + rb->reqs[rb->to_free++] = req; + if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) + __io_req_free_batch_flush(req->ctx, rb); +} + /* * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. @@ -1839,21 +1859,6 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } -static inline void io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) -{ - if (unlikely(io_is_fallback_req(req))) { - io_free_req(req); - return; - } - if (req->flags & REQ_F_LINK_HEAD) - io_queue_next(req); - - io_dismantle_req(req); - rb->reqs[rb->to_free++] = req; - if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) - io_free_req_many(req->ctx, rb); -} - static int io_put_kbuf(struct io_kiocb *req) { struct io_buffer *kbuf; @@ -1918,13 +1923,13 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, (*nr_events)++; if (refcount_dec_and_test(&req->refs)) - io_req_multi_free(&rb, req); + io_req_free_batch(&rb, req); } io_commit_cqring(ctx); if (ctx->flags & IORING_SETUP_SQPOLL) io_cqring_ev_posted(ctx); - io_free_req_many(ctx, &rb); + io_req_free_batch_finish(ctx, &rb); if (!list_empty(&again)) io_iopoll_queue(&again); -- cgit From 9b0d911acce00b67f7e7336f838b732de7d917d6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:34 +0300 Subject: io_uring: kill REQ_F_LINK_NEXT After pulling nxt from a request, it's no more a links head, so clear REQ_F_LINK_HEAD. Absence of this flag also indicates that there are no linked requests, so replacing REQ_F_LINK_NEXT, which can be killed. Linked timeouts also behave leaving the flag intact when necessary. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 18a452ac81cc..14c5655c0434 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -526,7 +526,6 @@ enum { REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, REQ_F_LINK_HEAD_BIT, - REQ_F_LINK_NEXT_BIT, REQ_F_FAIL_LINK_BIT, REQ_F_INFLIGHT_BIT, REQ_F_CUR_POS_BIT, @@ -565,8 +564,6 @@ enum { /* head of a link */ REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT), - /* already grabbed next link */ - REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT), /* fail rest of links */ REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), /* on inflight list */ @@ -1559,10 +1556,6 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) struct io_ring_ctx *ctx = req->ctx; bool wake_ev = false; - /* Already got next link */ - if (req->flags & REQ_F_LINK_NEXT) - return; - /* * The list should never be empty when we are called here. But could * potentially happen if the chain is messed up, check to be on the @@ -1587,7 +1580,6 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) break; } - req->flags |= REQ_F_LINK_NEXT; if (wake_ev) io_cqring_ev_posted(ctx); } @@ -1628,6 +1620,7 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) { if (likely(!(req->flags & REQ_F_LINK_HEAD))) return; + req->flags &= ~REQ_F_LINK_HEAD; /* * If LINK is set, we have dependent requests in this chain. If we -- cgit From 6795c5aba247653f99d1f336ff496dd74659b322 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:35 +0300 Subject: io_uring: clean up req->result setting by rw Assign req->result to io_size early in io_{read,write}(), it's enough and makes it more straightforward. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 14c5655c0434..f283d111666b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2384,7 +2384,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; - req->result = 0; req->iopoll_completed = 0; } else { if (kiocb->ki_flags & IOCB_HIPRI) @@ -2957,10 +2956,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, if (!force_nonblock) kiocb->ki_flags &= ~IOCB_NOWAIT; - req->result = 0; io_size = ret; - if (req->flags & REQ_F_LINK_HEAD) - req->result = io_size; + req->result = io_size; /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, READ)) @@ -3054,10 +3051,8 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, if (!force_nonblock) req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; - req->result = 0; io_size = ret; - if (req->flags & REQ_F_LINK_HEAD) - req->result = io_size; + req->result = io_size; /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, WRITE)) -- cgit From 3adfecaa647ff8afa4b6f5907193cf751a0f8351 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:37 +0300 Subject: io_uring: do task_work_run() during iopoll There are a lot of new users of task_work, and some of task_work_add() may happen while we do io polling, thus make iopoll from time to time to do task_work_run(), so it doesn't poll for sitting there reqs. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index f283d111666b..c514a5209703 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2052,6 +2052,8 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, */ if (!(++iters & 7)) { mutex_unlock(&ctx->uring_lock); + if (current->task_works) + task_work_run(); mutex_lock(&ctx->uring_lock); } -- cgit From f3a6fa2267480d7f19fbde8316372be46055e548 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:38 +0300 Subject: io_uring: fix iopoll -EAGAIN handling req->iopoll() is not necessarily called by a task that submitted a request. Because of that, it's dangerous to grab_env() and punt async on -EGAIN, potentially grabbing another task's mm and corrupting its memory. Do resubmit from the submitter task context. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index c514a5209703..9d3d8d3866cc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -892,6 +892,7 @@ enum io_mem_account { ACCT_PINNED, }; +static bool io_rw_reissue(struct io_kiocb *req, long res); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); static void io_double_put_req(struct io_kiocb *req); @@ -1873,14 +1874,9 @@ static void io_iopoll_queue(struct list_head *again) req = list_first_entry(again, struct io_kiocb, list); list_del(&req->list); - /* shouldn't happen unless io_uring is dying, cancel reqs */ - if (unlikely(!current->mm)) { + /* should have ->mm unless io_uring is dying, kill reqs then */ + if (unlikely(!current->mm) || !io_rw_reissue(req, -EAGAIN)) io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL); - continue; - } - - refcount_inc(&req->refs); - io_queue_async_work(req); } while (!list_empty(again)); } @@ -2387,6 +2383,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; req->iopoll_completed = 0; + io_get_req_task(req); } else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL; -- cgit From fb49278624f75e15d36c3c43d322ca8961fb40e9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 12:59:48 +0300 Subject: io_uring: fix missing wake_up io_rw_reissue() Don't forget to wake up a process to which io_rw_reissue() added task_work. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9d3d8d3866cc..92c7e2a96912 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2168,8 +2168,10 @@ static bool io_rw_reissue(struct io_kiocb *req, long res) tsk = req->task; init_task_work(&req->task_work, io_rw_resubmit); ret = task_work_add(tsk, &req->task_work, true); - if (!ret) + if (!ret) { + wake_up_process(tsk); return true; + } #endif return false; } -- cgit From 7c86ffeeed303187f266ed17bd87a9b375955709 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 13:12:59 +0300 Subject: io_uring: deduplicate freeing linked timeouts Linked timeout cancellation code is repeated in in io_req_link_next() and io_fail_links(), and they differ in details even though shouldn't. Basing on the fact that there is maximum one armed linked timeout in a link, and it immediately follows the head, extract a function that will check for it and defuse. Justification: - DRY and cleaner - better inlining for io_req_link_next() (just 1 call site now) - isolates linked_timeouts from common path - reduces time under spinlock for failed links - actually less code Signed-off-by: Pavel Begunkov [axboe: fold in locking fix for io_fail_links()] Signed-off-by: Jens Axboe --- fs/io_uring.c | 107 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 58 insertions(+), 49 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 92c7e2a96912..a0aea78162a6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1552,48 +1552,57 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) return false; } -static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) +static void io_kill_linked_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *link; bool wake_ev = false; + unsigned long flags = 0; /* false positive warning */ + + if (!(req->flags & REQ_F_COMP_LOCKED)) + spin_lock_irqsave(&ctx->completion_lock, flags); + + if (list_empty(&req->link_list)) + goto out; + link = list_first_entry(&req->link_list, struct io_kiocb, link_list); + if (link->opcode != IORING_OP_LINK_TIMEOUT) + goto out; + + list_del_init(&link->link_list); + wake_ev = io_link_cancel_timeout(link); + req->flags &= ~REQ_F_LINK_TIMEOUT; +out: + if (!(req->flags & REQ_F_COMP_LOCKED)) + spin_unlock_irqrestore(&ctx->completion_lock, flags); + if (wake_ev) + io_cqring_ev_posted(ctx); +} + +static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) +{ + struct io_kiocb *nxt; /* * The list should never be empty when we are called here. But could * potentially happen if the chain is messed up, check to be on the * safe side. */ - while (!list_empty(&req->link_list)) { - struct io_kiocb *nxt = list_first_entry(&req->link_list, - struct io_kiocb, link_list); - - if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) && - (nxt->flags & REQ_F_TIMEOUT))) { - list_del_init(&nxt->link_list); - wake_ev |= io_link_cancel_timeout(nxt); - req->flags &= ~REQ_F_LINK_TIMEOUT; - continue; - } - - list_del_init(&req->link_list); - if (!list_empty(&nxt->link_list)) - nxt->flags |= REQ_F_LINK_HEAD; - *nxtptr = nxt; - break; - } + if (unlikely(list_empty(&req->link_list))) + return; - if (wake_ev) - io_cqring_ev_posted(ctx); + nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list); + list_del_init(&req->link_list); + if (!list_empty(&nxt->link_list)) + nxt->flags |= REQ_F_LINK_HEAD; + *nxtptr = nxt; } /* * Called if REQ_F_LINK_HEAD is set, and we fail the head request */ -static void io_fail_links(struct io_kiocb *req) +static void __io_fail_links(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { struct io_kiocb *link = list_first_entry(&req->link_list, @@ -1602,18 +1611,29 @@ static void io_fail_links(struct io_kiocb *req) list_del_init(&link->link_list); trace_io_uring_fail_link(req, link); - if ((req->flags & REQ_F_LINK_TIMEOUT) && - link->opcode == IORING_OP_LINK_TIMEOUT) { - io_link_cancel_timeout(link); - } else { - io_cqring_fill_event(link, -ECANCELED); - __io_double_put_req(link); - } + io_cqring_fill_event(link, -ECANCELED); + __io_double_put_req(link); req->flags &= ~REQ_F_LINK_TIMEOUT; } io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + io_cqring_ev_posted(ctx); +} + +static void io_fail_links(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (!(req->flags & REQ_F_COMP_LOCKED)) { + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + __io_fail_links(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else { + __io_fail_links(req); + } + io_cqring_ev_posted(ctx); } @@ -1623,30 +1643,19 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) return; req->flags &= ~REQ_F_LINK_HEAD; + if (req->flags & REQ_F_LINK_TIMEOUT) + io_kill_linked_timeout(req); + /* * If LINK is set, we have dependent requests in this chain. If we * didn't fail this request, queue the first one up, moving any other * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (req->flags & REQ_F_FAIL_LINK) { + if (req->flags & REQ_F_FAIL_LINK) io_fail_links(req); - } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) == - REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - /* - * If this is a timeout link, we could be racing with the - * timeout timer. Grab the completion lock for this case to - * protect against that. - */ - spin_lock_irqsave(&ctx->completion_lock, flags); - io_req_link_next(req, nxt); - spin_unlock_irqrestore(&ctx->completion_lock, flags); - } else { + else io_req_link_next(req, nxt); - } } static void __io_req_task_cancel(struct io_kiocb *req, int error) -- cgit From 9b5f7bd93272689ec8dc2cfd40a812265c23414e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 13:13:00 +0300 Subject: io_uring: replace find_next() out param with ret Generally, it's better to return a value directly than having out parameter. It's cleaner and saves from some kinds of ugly bugs. May also be faster. Return next request from io_req_find_next() and friends directly instead of passing out parameter. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index a0aea78162a6..0234dc2c9625 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1578,7 +1578,7 @@ out: io_cqring_ev_posted(ctx); } -static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) +static struct io_kiocb *io_req_link_next(struct io_kiocb *req) { struct io_kiocb *nxt; @@ -1588,13 +1588,13 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) * safe side. */ if (unlikely(list_empty(&req->link_list))) - return; + return NULL; nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list); list_del_init(&req->link_list); if (!list_empty(&nxt->link_list)) nxt->flags |= REQ_F_LINK_HEAD; - *nxtptr = nxt; + return nxt; } /* @@ -1637,10 +1637,10 @@ static void io_fail_links(struct io_kiocb *req) io_cqring_ev_posted(ctx); } -static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) +static struct io_kiocb *io_req_find_next(struct io_kiocb *req) { if (likely(!(req->flags & REQ_F_LINK_HEAD))) - return; + return NULL; req->flags &= ~REQ_F_LINK_HEAD; if (req->flags & REQ_F_LINK_TIMEOUT) @@ -1652,10 +1652,10 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (req->flags & REQ_F_FAIL_LINK) - io_fail_links(req); - else - io_req_link_next(req, nxt); + if (likely(!(req->flags & REQ_F_FAIL_LINK))) + return io_req_link_next(req); + io_fail_links(req); + return NULL; } static void __io_req_task_cancel(struct io_kiocb *req, int error) @@ -1718,9 +1718,8 @@ static void io_req_task_queue(struct io_kiocb *req) static void io_queue_next(struct io_kiocb *req) { - struct io_kiocb *nxt = NULL; + struct io_kiocb *nxt = io_req_find_next(req); - io_req_find_next(req, &nxt); if (nxt) io_req_task_queue(nxt); } @@ -1770,13 +1769,15 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. */ -__attribute__((nonnull)) -static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr) +static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) { + struct io_kiocb *nxt = NULL; + if (refcount_dec_and_test(&req->refs)) { - io_req_find_next(req, nxtptr); + nxt = io_req_find_next(req); __io_free_req(req); } + return nxt; } static void io_put_req(struct io_kiocb *req) @@ -1797,7 +1798,7 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req) if (refcount_read(&req->refs) != 1) return NULL; - io_req_find_next(req, &nxt); + nxt = io_req_find_next(req); if (!nxt) return NULL; @@ -4488,7 +4489,7 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) hash_del(&req->hash_node); io_poll_complete(req, req->result, 0); req->flags |= REQ_F_COMP_LOCKED; - io_put_req_find_next(req, nxt); + *nxt = io_put_req_find_next(req); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -5938,9 +5939,8 @@ punt: } err: - nxt = NULL; /* drop submission reference */ - io_put_req_find_next(req, &nxt); + nxt = io_put_req_find_next(req); if (linked_timeout) { if (!ret) -- cgit From a1a4661691c5f1a3af4c04f56ad68e2d1dbee3af Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 13:13:01 +0300 Subject: io_uring: kill REQ_F_TIMEOUT Now REQ_F_TIMEOUT is set but never used, kill it Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0234dc2c9625..e9c8f52daf8f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -531,7 +531,6 @@ enum { REQ_F_CUR_POS_BIT, REQ_F_NOWAIT_BIT, REQ_F_LINK_TIMEOUT_BIT, - REQ_F_TIMEOUT_BIT, REQ_F_ISREG_BIT, REQ_F_TIMEOUT_NOSEQ_BIT, REQ_F_COMP_LOCKED_BIT, @@ -574,8 +573,6 @@ enum { REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), /* has linked timeout */ REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), - /* timeout request */ - REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), /* no timeout sequence */ @@ -5063,7 +5060,6 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data = &req->io->timeout; data->req = req; - req->flags |= REQ_F_TIMEOUT; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; -- cgit From 8eb7e2d00763367f345ef0b2a2eb4f8001ae40ce Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 13:13:02 +0300 Subject: io_uring: kill REQ_F_TIMEOUT_NOSEQ There are too many useless flags, kill REQ_F_TIMEOUT_NOSEQ, which can be easily infered from req.timeout itself. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index e9c8f52daf8f..8495c17b53d6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -532,7 +532,6 @@ enum { REQ_F_NOWAIT_BIT, REQ_F_LINK_TIMEOUT_BIT, REQ_F_ISREG_BIT, - REQ_F_TIMEOUT_NOSEQ_BIT, REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, REQ_F_OVERFLOW_BIT, @@ -575,8 +574,6 @@ enum { REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), - /* no timeout sequence */ - REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT), /* completion under lock */ REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), /* needs cleanup */ @@ -1010,6 +1007,11 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref) complete(&ctx->ref_comp); } +static inline bool io_is_timeout_noseq(struct io_kiocb *req) +{ + return !req->timeout.off; +} + static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; @@ -1222,7 +1224,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) struct io_kiocb *req = list_first_entry(&ctx->timeout_list, struct io_kiocb, list); - if (req->flags & REQ_F_TIMEOUT_NOSEQ) + if (io_is_timeout_noseq(req)) break; if (req->timeout.target_seq != ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts)) @@ -5087,8 +5089,7 @@ static int io_timeout(struct io_kiocb *req) * timeout event to be satisfied. If it isn't set, then this is * a pure timeout request, sequence isn't used. */ - if (!off) { - req->flags |= REQ_F_TIMEOUT_NOSEQ; + if (io_is_timeout_noseq(req)) { entry = ctx->timeout_list.prev; goto add; } @@ -5103,7 +5104,7 @@ static int io_timeout(struct io_kiocb *req) list_for_each_prev(entry, &ctx->timeout_list) { struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); - if (nxt->flags & REQ_F_TIMEOUT_NOSEQ) + if (io_is_timeout_noseq(nxt)) continue; /* nxt.seq is behind @tail, otherwise would've been completed */ if (off >= nxt->timeout.target_seq - tail) -- cgit From ecfc51777487da4da530710e0b13de4c8cb4a6d2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 13:13:03 +0300 Subject: io_uring: fix potential use after free on fallback request free After __io_free_req() puts a ctx ref, it should be assumed that the ctx may already be gone. However, it can be accessed when putting the fallback req. Free the req first and then put the ctx. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8495c17b53d6..b54e358e6b31 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1526,12 +1526,15 @@ static void io_dismantle_req(struct io_kiocb *req) static void __io_free_req(struct io_kiocb *req) { + struct io_ring_ctx *ctx; + io_dismantle_req(req); - percpu_ref_put(&req->ctx->refs); + ctx = req->ctx; if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); else - clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req); + clear_bit_unlock(0, (unsigned long *) &ctx->fallback_req); + percpu_ref_put(&ctx->refs); } static bool io_link_cancel_timeout(struct io_kiocb *req) -- cgit From 351fd53595a3ceb88756a005e3b864f7c8cb86e4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 19:18:40 +0300 Subject: io_uring: don't pass def into io_req_work_grab_env Remove struct io_op_def *def parameter from io_req_work_grab_env(), it's trivially deducible from req->opcode and fast. The API is cleaner this way, and also helps the complier to understand that it's a real constant and could be register-cached. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index b54e358e6b31..2b7666e81c13 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1101,9 +1101,10 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static inline void io_req_work_grab_env(struct io_kiocb *req, - const struct io_op_def *def) +static inline void io_req_work_grab_env(struct io_kiocb *req) { + const struct io_op_def *def = &io_op_defs[req->opcode]; + if (!req->work.mm && def->needs_mm) { mmgrab(current->mm); req->work.mm = current->mm; @@ -1161,7 +1162,7 @@ static inline void io_prep_async_work(struct io_kiocb *req, } io_req_init_async(req); - io_req_work_grab_env(req, def); + io_req_work_grab_env(req); *link = io_prep_linked_timeout(req); } @@ -5255,7 +5256,7 @@ static int io_req_defer_prep(struct io_kiocb *req, if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) { io_req_init_async(req); - io_req_work_grab_env(req, &io_op_defs[req->opcode]); + io_req_work_grab_env(req); } switch (req->opcode) { -- cgit From edcdfcc149a8d0c11d4dd2b23b5338af22e31a5f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 19:18:41 +0300 Subject: io_uring: do init work in grab_env() Place io_req_init_async() in io_req_work_grab_env() so it won't be forgotten. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2b7666e81c13..3b2f6fd8f58f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1105,6 +1105,8 @@ static inline void io_req_work_grab_env(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; + io_req_init_async(req); + if (!req->work.mm && def->needs_mm) { mmgrab(current->mm); req->work.mm = current->mm; @@ -1161,9 +1163,7 @@ static inline void io_prep_async_work(struct io_kiocb *req, req->work.flags |= IO_WQ_WORK_UNBOUND; } - io_req_init_async(req); io_req_work_grab_env(req); - *link = io_prep_linked_timeout(req); } @@ -5254,10 +5254,8 @@ static int io_req_defer_prep(struct io_kiocb *req, return ret; } - if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) { - io_req_init_async(req); + if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) io_req_work_grab_env(req); - } switch (req->opcode) { case IORING_OP_NOP: -- cgit From debb85f496c9cc70663eac31d3ad9153839c844c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 19:18:42 +0300 Subject: io_uring: factor out grab_env() from defer_prep() Remove io_req_work_grab_env() call from io_req_defer_prep(), just call it when neccessary. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3b2f6fd8f58f..caf908382cdb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5240,7 +5240,7 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock, } static int io_req_defer_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe, bool for_async) + const struct io_uring_sqe *sqe) { ssize_t ret = 0; @@ -5254,9 +5254,6 @@ static int io_req_defer_prep(struct io_kiocb *req, return ret; } - if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) - io_req_work_grab_env(req); - switch (req->opcode) { case IORING_OP_NOP: break; @@ -5369,9 +5366,10 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!req->io) { if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe, true); + ret = io_req_defer_prep(req, sqe); if (ret < 0) return ret; + io_req_work_grab_env(req); } spin_lock_irq(&ctx->completion_lock); @@ -5983,9 +5981,10 @@ fail_req: ret = -EAGAIN; if (io_alloc_async_ctx(req)) goto fail_req; - ret = io_req_defer_prep(req, sqe, true); + ret = io_req_defer_prep(req, sqe); if (unlikely(ret < 0)) goto fail_req; + io_req_work_grab_env(req); } /* @@ -6039,7 +6038,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe, false); + ret = io_req_defer_prep(req, sqe); if (ret) { /* fail even hard links since we don't submit */ head->flags |= REQ_F_FAIL_LINK; @@ -6066,7 +6065,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe, false); + ret = io_req_defer_prep(req, sqe); if (ret) req->flags |= REQ_F_FAIL_LINK; *link = req; -- cgit From cbdcb4357c000861b77369c34e110fa893d23607 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 19:18:43 +0300 Subject: io_uring: do grab_env() just before punting Currently io_steal_work() is disabled, and every linked request should go through task_work for initialisation. Do io_req_work_grab_env() just before io-wq punting and for the whole link, so any request reachable by io_steal_work() is prepared. This is also interesting for another reason -- it localises io_req_work_grab_env() into one place just before io-wq punting, helping to to better manage req->work lifetime and add some neat cleanup/optimisations later. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 53 +++++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 24 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index caf908382cdb..9bc4339057ef 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1101,7 +1101,7 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static inline void io_req_work_grab_env(struct io_kiocb *req) +static void io_req_work_grab_env(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -1150,8 +1150,7 @@ static inline void io_req_work_drop_env(struct io_kiocb *req) } } -static inline void io_prep_async_work(struct io_kiocb *req, - struct io_kiocb **link) +static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -1164,15 +1163,22 @@ static inline void io_prep_async_work(struct io_kiocb *req, } io_req_work_grab_env(req); - *link = io_prep_linked_timeout(req); } -static inline void io_queue_async_work(struct io_kiocb *req) +static void io_prep_async_link(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *link; + struct io_kiocb *cur; - io_prep_async_work(req, &link); + io_prep_async_work(req); + if (req->flags & REQ_F_LINK_HEAD) + list_for_each_entry(cur, &req->link_list, link_list) + io_prep_async_work(cur); +} + +static void __io_queue_async_work(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *link = io_prep_linked_timeout(req); trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, &req->work, req->flags); @@ -1182,6 +1188,13 @@ static inline void io_queue_async_work(struct io_kiocb *req) io_queue_linked_timeout(link); } +static void io_queue_async_work(struct io_kiocb *req) +{ + /* init ->work of the whole link before punting */ + io_prep_async_link(req); + __io_queue_async_work(req); +} + static void io_kill_timeout(struct io_kiocb *req) { int ret; @@ -1215,7 +1228,8 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx) if (req_need_defer(req)) break; list_del_init(&req->list); - io_queue_async_work(req); + /* punt-init is done before queueing for defer */ + __io_queue_async_work(req); } while (!list_empty(&ctx->defer_list)); } @@ -1791,7 +1805,7 @@ static void io_put_req(struct io_kiocb *req) static struct io_wq_work *io_steal_work(struct io_kiocb *req) { - struct io_kiocb *nxt = NULL; + struct io_kiocb *timeout, *nxt = NULL; /* * A ref is owned by io-wq in which context we're. So, if that's the @@ -1805,18 +1819,10 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req) if (!nxt) return NULL; - if ((nxt->flags & REQ_F_ISREG) && io_op_defs[nxt->opcode].hash_reg_file) - io_wq_hash_work(&nxt->work, file_inode(nxt->file)); - - io_req_task_queue(nxt); - /* - * If we're going to return actual work, here should be timeout prep: - * - * link = io_prep_linked_timeout(nxt); - * if (link) - * nxt->flags |= REQ_F_QUEUE_TIMEOUT; - */ - return NULL; + timeout = io_prep_linked_timeout(nxt); + if (timeout) + nxt->flags |= REQ_F_QUEUE_TIMEOUT; + return &nxt->work; } /* @@ -5369,8 +5375,8 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) ret = io_req_defer_prep(req, sqe); if (ret < 0) return ret; - io_req_work_grab_env(req); } + io_prep_async_link(req); spin_lock_irq(&ctx->completion_lock); if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { @@ -5984,7 +5990,6 @@ fail_req: ret = io_req_defer_prep(req, sqe); if (unlikely(ret < 0)) goto fail_req; - io_req_work_grab_env(req); } /* -- cgit From ab0b6451db2a8ed630b89ef3826b8ea994149444 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 30 Jun 2020 08:43:15 -0600 Subject: io_uring: clean up io_kill_linked_timeout() locking Avoid jumping through hoops to silence unused variable warnings, and also fix sparse rightfully complaining about the locking context: fs/io_uring.c:1593:39: warning: context imbalance in 'io_kill_linked_timeout' - unexpected unlock Provide the functional helper as __io_kill_linked_timeout(), and have separate the locking from it. Signed-off-by: Jens Axboe --- fs/io_uring.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9bc4339057ef..3c12221f549e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1569,28 +1569,38 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) return false; } -static void io_kill_linked_timeout(struct io_kiocb *req) +static bool __io_kill_linked_timeout(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link; - bool wake_ev = false; - unsigned long flags = 0; /* false positive warning */ - - if (!(req->flags & REQ_F_COMP_LOCKED)) - spin_lock_irqsave(&ctx->completion_lock, flags); + bool wake_ev; if (list_empty(&req->link_list)) - goto out; + return false; link = list_first_entry(&req->link_list, struct io_kiocb, link_list); if (link->opcode != IORING_OP_LINK_TIMEOUT) - goto out; + return false; list_del_init(&link->link_list); wake_ev = io_link_cancel_timeout(link); req->flags &= ~REQ_F_LINK_TIMEOUT; -out: - if (!(req->flags & REQ_F_COMP_LOCKED)) + return wake_ev; +} + +static void io_kill_linked_timeout(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + bool wake_ev; + + if (!(req->flags & REQ_F_COMP_LOCKED)) { + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + wake_ev = __io_kill_linked_timeout(req); spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else { + wake_ev = __io_kill_linked_timeout(req); + } + if (wake_ev) io_cqring_ev_posted(ctx); } -- cgit From cf2f54255d0342cfbd273cbb964ad6bc7674f587 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 30 Jun 2020 15:20:40 +0300 Subject: io_uring: don't fail iopoll requeue without ->mm Actually, io_iopoll_queue() may have NULL ->mm, that's if SQ thread didn't grabbed mm before doing iopoll. Don't fail reqs there, as after recent changes it won't be punted directly but rather through task_work. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3c12221f549e..43419f5bef8c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1902,9 +1902,7 @@ static void io_iopoll_queue(struct list_head *again) do { req = list_first_entry(again, struct io_kiocb, list); list_del(&req->list); - - /* should have ->mm unless io_uring is dying, kill reqs then */ - if (unlikely(!current->mm) || !io_rw_reissue(req, -EAGAIN)) + if (!io_rw_reissue(req, -EAGAIN)) io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL); } while (!list_empty(again)); } -- cgit From ea1164e574e9af0a15ab730ead0861a4c7724142 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 30 Jun 2020 15:20:41 +0300 Subject: io_uring: fix NULL mm in io_poll_task_func() io_poll_task_func() hand-coded link submission forgetting to set TASK_RUNNING, acquire mm, etc. Call existing helper for that, i.e. __io_req_task_submit(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 43419f5bef8c..2c17c2613205 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4518,13 +4518,8 @@ static void io_poll_task_func(struct callback_head *cb) struct io_kiocb *nxt = NULL; io_poll_task_handler(req, &nxt); - if (nxt) { - struct io_ring_ctx *ctx = nxt->ctx; - - mutex_lock(&ctx->uring_lock); - __io_queue_sqe(nxt, NULL, NULL); - mutex_unlock(&ctx->uring_lock); - } + if (nxt) + __io_req_task_submit(nxt); } static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, -- cgit From 0be0b0e33b0bfd08264b108512e44b3907fe987b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 30 Jun 2020 15:20:42 +0300 Subject: io_uring: simplify io_async_task_func() Greatly simplify io_async_task_func() removing duplicated functionality of __io_req_task_submit(). This do one extra spin lock/unlock for cancelled poll case, but that shouldn't happen often. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2c17c2613205..82b35948ac5b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4608,7 +4608,6 @@ static void io_async_task_func(struct callback_head *cb) struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct async_poll *apoll = req->apoll; struct io_ring_ctx *ctx = req->ctx; - bool canceled = false; trace_io_uring_task_run(req->ctx, req->opcode, req->user_data); @@ -4618,15 +4617,8 @@ static void io_async_task_func(struct callback_head *cb) } /* If req is still hashed, it cannot have been canceled. Don't check. */ - if (hash_hashed(&req->hash_node)) { + if (hash_hashed(&req->hash_node)) hash_del(&req->hash_node); - } else { - canceled = READ_ONCE(apoll->poll.canceled); - if (canceled) { - io_cqring_fill_event(req, -ECANCELED); - io_commit_cqring(ctx); - } - } spin_unlock_irq(&ctx->completion_lock); @@ -4635,21 +4627,10 @@ static void io_async_task_func(struct callback_head *cb) memcpy(&req->work, &apoll->work, sizeof(req->work)); kfree(apoll); - if (!canceled) { - __set_current_state(TASK_RUNNING); - if (io_sq_thread_acquire_mm(ctx, req)) { - io_cqring_add_event(req, -EFAULT, 0); - goto end_req; - } - mutex_lock(&ctx->uring_lock); - __io_queue_sqe(req, NULL, NULL); - mutex_unlock(&ctx->uring_lock); - } else { - io_cqring_ev_posted(ctx); -end_req: - req_set_fail_links(req); - io_double_put_req(req); - } + if (!READ_ONCE(apoll->poll.canceled)) + __io_req_task_submit(req); + else + __io_req_task_cancel(req, -ECANCELED); } static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, -- cgit From 3fa5e0f331280237af918ab2e7a160f5a68d3e7d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 30 Jun 2020 15:20:43 +0300 Subject: io_uring: optimise io_req_find_next() fast check gcc 9.2.0 compiles io_req_find_next() as a separate function leaving the first REQ_F_LINK_HEAD fast check not inlined. Help it by splitting out the check from the function. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 82b35948ac5b..9a43847c6823 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1664,12 +1664,9 @@ static void io_fail_links(struct io_kiocb *req) io_cqring_ev_posted(ctx); } -static struct io_kiocb *io_req_find_next(struct io_kiocb *req) +static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) { - if (likely(!(req->flags & REQ_F_LINK_HEAD))) - return NULL; req->flags &= ~REQ_F_LINK_HEAD; - if (req->flags & REQ_F_LINK_TIMEOUT) io_kill_linked_timeout(req); @@ -1685,6 +1682,13 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req) return NULL; } +static struct io_kiocb *io_req_find_next(struct io_kiocb *req) +{ + if (likely(!(req->flags & REQ_F_LINK_HEAD))) + return NULL; + return __io_req_find_next(req); +} + static void __io_req_task_cancel(struct io_kiocb *req, int error) { struct io_ring_ctx *ctx = req->ctx; -- cgit From 8eb06d7e8dd853d70668617dda57de4f6cebe651 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 30 Jun 2020 15:20:39 +0300 Subject: io_uring: fix missing ->mm on exit There is a fancy bug, where exiting user task may not have ->mm, that makes task_works to try to do kthread_use_mm(ctx->sqo_mm). Don't do that if sqo_mm is NULL. [ 290.460558] WARNING: CPU: 6 PID: 150933 at kernel/kthread.c:1238 kthread_use_mm+0xf3/0x110 [ 290.460579] CPU: 6 PID: 150933 Comm: read-write2 Tainted: G I E 5.8.0-rc2-00066-g9b21720607cf #531 [ 290.460580] RIP: 0010:kthread_use_mm+0xf3/0x110 ... [ 290.460584] Call Trace: [ 290.460584] __io_sq_thread_acquire_mm.isra.0.part.0+0x25/0x30 [ 290.460584] __io_req_task_submit+0x64/0x80 [ 290.460584] io_req_task_submit+0x15/0x20 [ 290.460585] task_work_run+0x67/0xa0 [ 290.460585] do_exit+0x35d/0xb70 [ 290.460585] do_group_exit+0x43/0xa0 [ 290.460585] get_signal+0x140/0x900 [ 290.460586] do_signal+0x37/0x780 [ 290.460586] __prepare_exit_to_usermode+0x126/0x1c0 [ 290.460586] __syscall_return_slowpath+0x3b/0x1c0 [ 290.460587] do_syscall_64+0x5f/0xa0 [ 290.460587] entry_SYSCALL_64_after_hwframe+0x44/0xa9 following with faults. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9a43847c6823..cfad2acd4d86 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -958,7 +958,7 @@ static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) { if (!current->mm) { - if (unlikely(!mmget_not_zero(ctx->sqo_mm))) + if (unlikely(!ctx->sqo_mm || !mmget_not_zero(ctx->sqo_mm))) return -EFAULT; kthread_use_mm(ctx->sqo_mm); } @@ -7216,10 +7216,10 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, { int ret; - mmgrab(current->mm); - ctx->sqo_mm = current->mm; - if (ctx->flags & IORING_SETUP_SQPOLL) { + mmgrab(current->mm); + ctx->sqo_mm = current->mm; + ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto err; @@ -7263,8 +7263,10 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, return 0; err: io_finish_async(ctx); - mmdrop(ctx->sqo_mm); - ctx->sqo_mm = NULL; + if (ctx->sqo_mm) { + mmdrop(ctx->sqo_mm); + ctx->sqo_mm = NULL; + } return ret; } -- cgit From ce593a6c480a22acba08795be313c0c6d49dd35d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 30 Jun 2020 12:39:05 -0600 Subject: io_uring: use signal based task_work running Since 5.7, we've been using task_work to trigger async running of requests in the context of the original task. This generally works great, but there's a case where if the task is currently blocked in the kernel waiting on a condition to become true, it won't process task_work. Even though the task is woken, it just checks whatever condition it's waiting on, and goes back to sleep if it's still false. This is a problem if that very condition only becomes true when that task_work is run. An example of that is the task registering an eventfd with io_uring, and it's now blocked waiting on an eventfd read. That read could depend on a completion event, and that completion event won't get trigged until task_work has been run. Use the TWA_SIGNAL notification for task_work, so that we ensure that the task always runs the work when queued. Cc: stable@vger.kernel.org # v5.7 Signed-off-by: Jens Axboe --- fs/io_uring.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index e507737f044e..700644a016a7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4072,6 +4072,21 @@ struct io_poll_table { int error; }; +static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb, + int notify) +{ + struct task_struct *tsk = req->task; + int ret; + + if (req->ctx->flags & IORING_SETUP_SQPOLL) + notify = 0; + + ret = task_work_add(tsk, cb, notify); + if (!ret) + wake_up_process(tsk); + return ret; +} + static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, __poll_t mask, task_work_func_t func) { @@ -4095,13 +4110,13 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, * of executing it. We can't safely execute it anyway, as we may not * have the needed state needed for it anyway. */ - ret = task_work_add(tsk, &req->task_work, true); + ret = io_req_task_work_add(req, &req->task_work, TWA_SIGNAL); if (unlikely(ret)) { WRITE_ONCE(poll->canceled, true); tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, true); + task_work_add(tsk, &req->task_work, 0); + wake_up_process(tsk); } - wake_up_process(tsk); return 1; } @@ -6182,19 +6197,20 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, do { prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); + /* make sure we run task_work before checking for signals */ if (current->task_works) task_work_run(); - if (io_should_wake(&iowq, false)) - break; - schedule(); if (signal_pending(current)) { - ret = -EINTR; + ret = -ERESTARTSYS; break; } + if (io_should_wake(&iowq, false)) + break; + schedule(); } while (1); finish_wait(&ctx->wait, &iowq.wq); - restore_saved_sigmask_unless(ret == -EINTR); + restore_saved_sigmask_unless(ret == -ERESTARTSYS); return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } -- cgit From b7db41c9e03b5189bc94993bd50e4506ac9e34c1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 4 Jul 2020 08:55:50 -0600 Subject: io_uring: fix regression with always ignoring signals in io_cqring_wait() When switching to TWA_SIGNAL for task_work notifications, we also made any signal based condition in io_cqring_wait() return -ERESTARTSYS. This breaks applications that rely on using signals to abort someone waiting for events. Check if we have a signal pending because of queued task_work, and repeat the signal check once we've run the task_work. This provides a reliable way of telling the two apart. Additionally, only use TWA_SIGNAL if we are using an eventfd. If not, we don't have the dependency situation described in the original commit, and we can get by with just using TWA_RESUME like we previously did. Fixes: ce593a6c480a ("io_uring: use signal based task_work running") Cc: stable@vger.kernel.org # v5.7 Reported-by: Andres Freund Tested-by: Andres Freund Signed-off-by: Jens Axboe --- fs/io_uring.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 700644a016a7..d37d7ea5ebe5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4072,14 +4072,22 @@ struct io_poll_table { int error; }; -static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb, - int notify) +static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) { struct task_struct *tsk = req->task; - int ret; + struct io_ring_ctx *ctx = req->ctx; + int ret, notify = TWA_RESUME; - if (req->ctx->flags & IORING_SETUP_SQPOLL) + /* + * SQPOLL kernel thread doesn't need notification, just a wakeup. + * If we're not using an eventfd, then TWA_RESUME is always fine, + * as we won't have dependencies between request completions for + * other kernel wait conditions. + */ + if (ctx->flags & IORING_SETUP_SQPOLL) notify = 0; + else if (ctx->cq_ev_fd) + notify = TWA_SIGNAL; ret = task_work_add(tsk, cb, notify); if (!ret) @@ -4110,7 +4118,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, * of executing it. We can't safely execute it anyway, as we may not * have the needed state needed for it anyway. */ - ret = io_req_task_work_add(req, &req->task_work, TWA_SIGNAL); + ret = io_req_task_work_add(req, &req->task_work); if (unlikely(ret)) { WRITE_ONCE(poll->canceled, true); tsk = io_wq_get_task(req->ctx->io_wq); @@ -6201,7 +6209,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (current->task_works) task_work_run(); if (signal_pending(current)) { - ret = -ERESTARTSYS; + if (current->jobctl & JOBCTL_TASK_WORK) { + spin_lock_irq(¤t->sighand->siglock); + current->jobctl &= ~JOBCTL_TASK_WORK; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + continue; + } + ret = -EINTR; break; } if (io_should_wake(&iowq, false)) @@ -6210,7 +6225,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, } while (1); finish_wait(&ctx->wait, &iowq.wq); - restore_saved_sigmask_unless(ret == -ERESTARTSYS); + restore_saved_sigmask_unless(ret == -EINTR); return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } -- cgit From 4c6e277c4cc4a6b3b2b9c66a7b014787ae757cc1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 1 Jul 2020 11:29:10 -0600 Subject: io_uring: abstract out task work running Provide a helper to run task_work instead of checking and running manually in a bunch of different spots. While doing so, also move the task run state setting where we run the task work. Then we can move it out of the callback helpers. This also helps ensure we only do this once per task_work list run, not per task_work item. Suggested-by: Oleg Nesterov Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7426e4f23f9b..65a6978e1795 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1714,7 +1714,6 @@ static void __io_req_task_submit(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - __set_current_state(TASK_RUNNING); if (!__io_sq_thread_acquire_mm(ctx)) { mutex_lock(&ctx->uring_lock); __io_queue_sqe(req, NULL, NULL); @@ -1899,6 +1898,17 @@ static int io_put_kbuf(struct io_kiocb *req) return cflags; } +static inline bool io_run_task_work(void) +{ + if (current->task_works) { + __set_current_state(TASK_RUNNING); + task_work_run(); + return true; + } + + return false; +} + static void io_iopoll_queue(struct list_head *again) { struct io_kiocb *req; @@ -2079,8 +2089,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, */ if (!(++iters & 7)) { mutex_unlock(&ctx->uring_lock); - if (current->task_works) - task_work_run(); + io_run_task_work(); mutex_lock(&ctx->uring_lock); } @@ -2176,8 +2185,6 @@ static void io_rw_resubmit(struct callback_head *cb) struct io_ring_ctx *ctx = req->ctx; int err; - __set_current_state(TASK_RUNNING); - err = io_sq_thread_acquire_mm(ctx, req); if (io_resubmit_prep(req, err)) { @@ -6361,8 +6368,7 @@ static int io_sq_thread(void *data) if (!list_empty(&ctx->poll_list) || need_resched() || (!time_after(jiffies, timeout) && ret != -EBUSY && !percpu_ref_is_dying(&ctx->refs))) { - if (current->task_works) - task_work_run(); + io_run_task_work(); cond_resched(); continue; } @@ -6394,8 +6400,7 @@ static int io_sq_thread(void *data) finish_wait(&ctx->sqo_wait, &wait); break; } - if (current->task_works) { - task_work_run(); + if (io_run_task_work()) { finish_wait(&ctx->sqo_wait, &wait); continue; } @@ -6420,8 +6425,7 @@ static int io_sq_thread(void *data) timeout = jiffies + ctx->sq_thread_idle; } - if (current->task_works) - task_work_run(); + io_run_task_work(); io_sq_thread_drop_mm(ctx); revert_creds(old_cred); @@ -6486,9 +6490,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, do { if (io_cqring_events(ctx, false) >= min_events) return 0; - if (!current->task_works) + if (!io_run_task_work()) break; - task_work_run(); } while (1); if (sig) { @@ -6510,8 +6513,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); /* make sure we run task_work before checking for signals */ - if (current->task_works) - task_work_run(); + if (io_run_task_work()) + continue; if (signal_pending(current)) { if (current->jobctl & JOBCTL_TASK_WORK) { spin_lock_irq(¤t->sighand->siglock); @@ -7953,8 +7956,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, int submitted = 0; struct fd f; - if (current->task_works) - task_work_run(); + io_run_task_work(); if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) return -EINVAL; -- cgit From c2c4c83c58cbca23527fee93b49738a5a84272a1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 1 Jul 2020 15:37:11 -0600 Subject: io_uring: use new io_req_task_work_add() helper throughout Since we now have that in the 5.9 branch, convert the existing users of task_work_add() to use this new helper. Signed-off-by: Jens Axboe --- fs/io_uring.c | 77 ++++++++++++++++++++++++++++------------------------------- 1 file changed, 37 insertions(+), 40 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 65a6978e1795..2b849984bae5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1689,6 +1689,29 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req) return __io_req_find_next(req); } +static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) +{ + struct task_struct *tsk = req->task; + struct io_ring_ctx *ctx = req->ctx; + int ret, notify = TWA_RESUME; + + /* + * SQPOLL kernel thread doesn't need notification, just a wakeup. + * If we're not using an eventfd, then TWA_RESUME is always fine, + * as we won't have dependencies between request completions for + * other kernel wait conditions. + */ + if (ctx->flags & IORING_SETUP_SQPOLL) + notify = 0; + else if (ctx->cq_ev_fd) + notify = TWA_SIGNAL; + + ret = task_work_add(tsk, cb, notify); + if (!ret) + wake_up_process(tsk); + return ret; +} + static void __io_req_task_cancel(struct io_kiocb *req, int error) { struct io_ring_ctx *ctx = req->ctx; @@ -1732,18 +1755,19 @@ static void io_req_task_submit(struct callback_head *cb) static void io_req_task_queue(struct io_kiocb *req) { - struct task_struct *tsk = req->task; int ret; init_task_work(&req->task_work, io_req_task_submit); - ret = task_work_add(tsk, &req->task_work, true); + ret = io_req_task_work_add(req, &req->task_work); if (unlikely(ret)) { + struct task_struct *tsk; + init_task_work(&req->task_work, io_req_task_cancel); tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, true); + task_work_add(tsk, &req->task_work, 0); + wake_up_process(tsk); } - wake_up_process(tsk); } static void io_queue_next(struct io_kiocb *req) @@ -2197,19 +2221,15 @@ static void io_rw_resubmit(struct callback_head *cb) static bool io_rw_reissue(struct io_kiocb *req, long res) { #ifdef CONFIG_BLOCK - struct task_struct *tsk; int ret; if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) return false; - tsk = req->task; init_task_work(&req->task_work, io_rw_resubmit); - ret = task_work_add(tsk, &req->task_work, true); - if (!ret) { - wake_up_process(tsk); + ret = io_req_task_work_add(req, &req->task_work); + if (!ret) return true; - } #endif return false; } @@ -2909,7 +2929,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, struct io_kiocb *req = wait->private; struct io_async_rw *rw = &req->io->rw; struct wait_page_key *key = arg; - struct task_struct *tsk; int ret; wpq = container_of(wait, struct wait_page_queue, wait); @@ -2923,15 +2942,16 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, init_task_work(&rw->task_work, io_async_buf_retry); /* submit ref gets dropped, acquire a new one */ refcount_inc(&req->refs); - tsk = req->task; - ret = task_work_add(tsk, &rw->task_work, true); + ret = io_req_task_work_add(req, &rw->task_work); if (unlikely(ret)) { + struct task_struct *tsk; + /* queue just for cancelation */ init_task_work(&rw->task_work, io_async_buf_cancel); tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &rw->task_work, true); + task_work_add(tsk, &rw->task_work, 0); + wake_up_process(tsk); } - wake_up_process(tsk); return 1; } @@ -4424,33 +4444,9 @@ struct io_poll_table { int error; }; -static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) -{ - struct task_struct *tsk = req->task; - struct io_ring_ctx *ctx = req->ctx; - int ret, notify = TWA_RESUME; - - /* - * SQPOLL kernel thread doesn't need notification, just a wakeup. - * If we're not using an eventfd, then TWA_RESUME is always fine, - * as we won't have dependencies between request completions for - * other kernel wait conditions. - */ - if (ctx->flags & IORING_SETUP_SQPOLL) - notify = 0; - else if (ctx->cq_ev_fd) - notify = TWA_SIGNAL; - - ret = task_work_add(tsk, cb, notify); - if (!ret) - wake_up_process(tsk); - return ret; -} - static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, __poll_t mask, task_work_func_t func) { - struct task_struct *tsk; int ret; /* for instances that support it check for an event match first: */ @@ -4461,7 +4457,6 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, list_del_init(&poll->wait.entry); - tsk = req->task; req->result = mask; init_task_work(&req->task_work, func); /* @@ -4472,6 +4467,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, */ ret = io_req_task_work_add(req, &req->task_work); if (unlikely(ret)) { + struct task_struct *tsk; + WRITE_ONCE(poll->canceled, true); tsk = io_wq_get_task(req->ctx->io_wq); task_work_add(tsk, &req->task_work, 0); -- cgit From 6df1db6b542436c6d429caa66e1045862fa36155 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 3 Jul 2020 22:15:06 +0300 Subject: io_uring: fix mis-refcounting linked timeouts io_prep_linked_timeout() sets REQ_F_LINK_TIMEOUT altering refcounting of the following linked request. After that someone should call io_queue_linked_timeout(), otherwise a submission reference of the linked timeout won't be ever dropped. That's what happens in io_steal_work() if io-wq decides to postpone linked request with io_wqe_enqueue(). io_queue_linked_timeout() can also be potentially called twice without synchronisation during re-submission, e.g. io_rw_resubmit(). There are the rules, whoever did io_prep_linked_timeout() must also call io_queue_linked_timeout(). To not do it twice, io_prep_linked_timeout() will return non NULL only for the first call. That's controlled by REQ_F_LINK_TIMEOUT flag. Also kill REQ_F_QUEUE_TIMEOUT. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 33 +++++++-------------------------- 1 file changed, 7 insertions(+), 26 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2b849984bae5..cf1b3d4ac241 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -538,7 +538,6 @@ enum { REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, REQ_F_NO_FILE_TABLE_BIT, - REQ_F_QUEUE_TIMEOUT_BIT, REQ_F_WORK_INITIALIZED_BIT, REQ_F_TASK_PINNED_BIT, @@ -586,8 +585,6 @@ enum { REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), /* doesn't need file table for this request */ REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), - /* needs to queue linked timeout */ - REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT), /* io_wq_work is initialized */ REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), /* req->task is refcounted */ @@ -1842,7 +1839,7 @@ static void io_put_req(struct io_kiocb *req) static struct io_wq_work *io_steal_work(struct io_kiocb *req) { - struct io_kiocb *timeout, *nxt = NULL; + struct io_kiocb *nxt; /* * A ref is owned by io-wq in which context we're. So, if that's the @@ -1853,13 +1850,7 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req) return NULL; nxt = io_req_find_next(req); - if (!nxt) - return NULL; - - timeout = io_prep_linked_timeout(nxt); - if (timeout) - nxt->flags |= REQ_F_QUEUE_TIMEOUT; - return &nxt->work; + return nxt ? &nxt->work : NULL; } /* @@ -5702,24 +5693,15 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static void io_arm_async_linked_timeout(struct io_kiocb *req) -{ - struct io_kiocb *link; - - /* link head's timeout is queued in io_queue_async_work() */ - if (!(req->flags & REQ_F_QUEUE_TIMEOUT)) - return; - - link = list_first_entry(&req->link_list, struct io_kiocb, link_list); - io_queue_linked_timeout(link); -} - static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_kiocb *timeout; int ret = 0; - io_arm_async_linked_timeout(req); + timeout = io_prep_linked_timeout(req); + if (timeout) + io_queue_linked_timeout(timeout); /* if NO_CANCEL is set, we must still run the work */ if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) == @@ -5893,8 +5875,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) if (!(req->flags & REQ_F_LINK_HEAD)) return NULL; - /* for polled retry, if flag is set, we already went through here */ - if (req->flags & REQ_F_POLLED) + if (req->flags & REQ_F_LINK_TIMEOUT) return NULL; nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, -- cgit From 652532ad459524d32c6bf1522e0b88d83b084d1a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 3 Jul 2020 22:15:07 +0300 Subject: io_uring: keep queue_sqe()'s fail path separately A preparation path, extracts error path into a separate block. It looks saner then calling req_set_fail_links() after io_put_req_find_next(), even though it have been working well. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index cf1b3d4ac241..7147e87a24b5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5937,22 +5937,21 @@ punt: goto exit; } + if (unlikely(ret)) { err: - /* drop submission reference */ - nxt = io_put_req_find_next(req); - - if (linked_timeout) { - if (!ret) - io_queue_linked_timeout(linked_timeout); - else - io_put_req(linked_timeout); - } - - /* and drop final reference, if we failed */ - if (ret) { + /* un-prep timeout, so it'll be killed as any other linked */ + req->flags &= ~REQ_F_LINK_TIMEOUT; req_set_fail_links(req); + io_put_req(req); io_req_complete(req, ret); + goto exit; } + + /* drop submission reference */ + nxt = io_put_req_find_next(req); + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); + if (nxt) { req = nxt; -- cgit From 8b3656af2a37dc538d21e144a5a94bacae05e9f1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 3 Jul 2020 22:15:08 +0300 Subject: io_uring: fix lost cqe->flags Don't forget to fill cqe->flags properly in io_submit_flush_completions() Fixes: a1d7c393c4711 ("io_uring: enable READ/WRITE to use deferred completions") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7147e87a24b5..9464f9470bbc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1416,7 +1416,7 @@ static void io_submit_flush_completions(struct io_comp_state *cs) req = list_first_entry(&cs->list, struct io_kiocb, list); list_del(&req->list); - io_cqring_fill_event(req, req->result); + __io_cqring_fill_event(req, req->result, req->cflags); if (!(req->flags & REQ_F_LINK_HEAD)) { req->flags |= REQ_F_COMP_LOCKED; io_put_req(req); @@ -1441,6 +1441,7 @@ static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags, io_put_req(req); } else { req->result = res; + req->cflags = cflags; list_add_tail(&req->list, &cs->list); if (++cs->nr >= 32) io_submit_flush_completions(cs); -- cgit From 3aadc23e6054353ca056bf14e87250c79efbd7ed Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 6 Jul 2020 17:59:29 +0300 Subject: io_uring: don't delay iopoll'ed req completion ->iopoll() may have completed current request, but instead of reaping it, io_do_iopoll() just continues with the next request in the list. As a result it can leave just polled and completed request in the list up until next syscall. Even outer loop in io_iopoll_getevents() doesn't help the situation. E.g. poll_list: req0 -> req1 If req0->iopoll() completed both requests, and @min<=1, then @req0 will be left behind. Check whether a req was completed after ->iopoll(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9464f9470bbc..60f1a81c6c35 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2015,6 +2015,10 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (ret < 0) break; + /* iopoll may have completed current req */ + if (READ_ONCE(req->iopoll_completed)) + list_move_tail(&req->list, &done); + if (ret && spin) spin = false; ret = 0; -- cgit From eba0a4dd2aa5c47ca5b0c56ffb6d6665e047ff72 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 6 Jul 2020 17:59:30 +0300 Subject: io_uring: fix stopping iopoll'ing too early Nobody adjusts *nr_events (number of completed requests) before calling io_iopoll_getevents(), so the passed @min shouldn't be adjusted as well. Othewise it can return less than initially asked @min without hitting need_resched(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 60f1a81c6c35..332008f346e3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2044,7 +2044,7 @@ static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, ret = io_do_iopoll(ctx, nr_events, min); if (ret < 0) return ret; - if (!min || *nr_events >= min) + if (*nr_events >= min) return 0; } @@ -2087,8 +2087,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, */ mutex_lock(&ctx->uring_lock); do { - int tmin = 0; - /* * Don't enter poll loop if we already have events pending. * If we do, we can potentially be spinning for commands that @@ -2113,10 +2111,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, mutex_lock(&ctx->uring_lock); } - if (*nr_events < min) - tmin = min - *nr_events; - - ret = io_iopoll_getevents(ctx, nr_events, tmin); + ret = io_iopoll_getevents(ctx, nr_events, min); if (ret <= 0) break; ret = 0; -- cgit From 3fcee5a6d5414df8ff4ee22f2477bde76d34527c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 6 Jul 2020 17:59:31 +0300 Subject: io_uring: briefly loose locks while reaping events It's not nice to hold @uring_lock for too long io_iopoll_reap_events(). For instance, the lock is needed to publish requests to @poll_list, and that locks out tasks doing that for no good reason. Loose it occasionally. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 332008f346e3..6e3169834bf7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2069,8 +2069,13 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) /* * Ensure we allow local-to-the-cpu processing to take place, * in this case we need to ensure that we reap all events. + * Also let task_work, etc. to progress by releasing the mutex */ - cond_resched(); + if (need_resched()) { + mutex_unlock(&ctx->uring_lock); + cond_resched(); + mutex_lock(&ctx->uring_lock); + } } mutex_unlock(&ctx->uring_lock); } -- cgit From 9dedd56301564acdbb1dd37cf09250a4c7b783c9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 7 Jul 2020 16:36:20 +0300 Subject: io_uring: partially inline io_iopoll_getevents() io_iopoll_reap_events() doesn't care about returned valued of io_iopoll_getevents() and does the same checks for list emptiness and need_resched(). Just use io_do_iopoll(). io_sq_thread() doesn't check return value as well. It also passes min=0, so there never be the second iteration inside io_poll_getevents(). Inline it there too. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6e3169834bf7..104af675f6fb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2064,7 +2064,7 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) while (!list_empty(&ctx->poll_list)) { unsigned int nr_events = 0; - io_iopoll_getevents(ctx, &nr_events, 1); + io_do_iopoll(ctx, &nr_events, 1); /* * Ensure we allow local-to-the-cpu processing to take place, @@ -6318,8 +6318,8 @@ static int io_sq_thread(void *data) unsigned nr_events = 0; mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->poll_list)) - io_iopoll_getevents(ctx, &nr_events, 0); + if (!list_empty(&ctx->poll_list) && !need_resched()) + io_do_iopoll(ctx, &nr_events, 0); else timeout = jiffies + ctx->sq_thread_idle; mutex_unlock(&ctx->uring_lock); -- cgit From 7668b92a69b8201e2dd16a47a08efb93e909f419 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 7 Jul 2020 16:36:21 +0300 Subject: io_uring: remove nr_events arg from iopoll_check() Nobody checks io_iopoll_check()'s output parameter @nr_events. Remove the parameter and declare it further down the stack. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 104af675f6fb..38bf42320f56 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2080,9 +2080,9 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, - long min) +static int io_iopoll_check(struct io_ring_ctx *ctx, long min) { + unsigned int nr_events = 0; int iters = 0, ret = 0; /* @@ -2116,11 +2116,11 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, mutex_lock(&ctx->uring_lock); } - ret = io_iopoll_getevents(ctx, nr_events, min); + ret = io_iopoll_getevents(ctx, &nr_events, min); if (ret <= 0) break; ret = 0; - } while (min && !*nr_events && !need_resched()); + } while (min && !nr_events && !need_resched()); mutex_unlock(&ctx->uring_lock); return ret; @@ -7977,8 +7977,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, goto out; } if (flags & IORING_ENTER_GETEVENTS) { - unsigned nr_events = 0; - min_complete = min(min_complete, ctx->cq_entries); /* @@ -7989,7 +7987,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ if (ctx->flags & IORING_SETUP_IOPOLL && !(ctx->flags & IORING_SETUP_SQPOLL)) { - ret = io_iopoll_check(ctx, &nr_events, min_complete); + ret = io_iopoll_check(ctx, min_complete); } else { ret = io_cqring_wait(ctx, min_complete, sig, sigsz); } -- cgit From b2edc0a77fac19bbdef63cedb2ea34aec1a9a499 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 7 Jul 2020 16:36:22 +0300 Subject: io_uring: don't burn CPU for iopoll on exit First of all don't spin in io_ring_ctx_wait_and_kill() on iopoll. Requests won't complete faster because of that, but only lengthen io_uring_release(). The same goes for offloaded cleanup in io_ring_exit_work() -- it already has waiting loop, don't do blocking active spinning. For that, pass min=0 into io_iopoll_[try_]reap_events(), so it won't actively spin. Leave the function if io_do_iopoll() there can't complete a request to sleep in io_ring_exit_work(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 38bf42320f56..4c9a494c9f9f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2055,7 +2055,7 @@ static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, * We can't just wait for polled events to come to us, we have to actively * find and complete them. */ -static void io_iopoll_reap_events(struct io_ring_ctx *ctx) +static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_IOPOLL)) return; @@ -2064,8 +2064,11 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) while (!list_empty(&ctx->poll_list)) { unsigned int nr_events = 0; - io_do_iopoll(ctx, &nr_events, 1); + io_do_iopoll(ctx, &nr_events, 0); + /* let it sleep and repeat later if can't complete a request */ + if (nr_events == 0) + break; /* * Ensure we allow local-to-the-cpu processing to take place, * in this case we need to ensure that we reap all events. @@ -7648,7 +7651,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) ctx->sqo_mm = NULL; } - io_iopoll_reap_events(ctx); io_sqe_buffer_unregister(ctx); io_sqe_files_unregister(ctx); io_eventfd_unregister(ctx); @@ -7715,11 +7717,8 @@ static int io_remove_personalities(int id, void *p, void *data) static void io_ring_exit_work(struct work_struct *work) { - struct io_ring_ctx *ctx; - - ctx = container_of(work, struct io_ring_ctx, exit_work); - if (ctx->rings) - io_cqring_overflow_flush(ctx, true); + struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, + exit_work); /* * If we're doing polled IO and end up having requests being @@ -7727,11 +7726,11 @@ static void io_ring_exit_work(struct work_struct *work) * we're waiting for refs to drop. We need to reap these manually, * as nobody else will be looking for them. */ - while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) { - io_iopoll_reap_events(ctx); + do { if (ctx->rings) io_cqring_overflow_flush(ctx, true); - } + io_iopoll_try_reap_events(ctx); + } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); io_ring_ctx_free(ctx); } @@ -7747,10 +7746,10 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) if (ctx->io_wq) io_wq_cancel_all(ctx->io_wq); - io_iopoll_reap_events(ctx); /* if we failed setting up the ctx, we might not have any rings */ if (ctx->rings) io_cqring_overflow_flush(ctx, true); + io_iopoll_try_reap_events(ctx); idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); INIT_WORK(&ctx->exit_work, io_ring_exit_work); queue_work(system_wq, &ctx->exit_work); -- cgit From aa340845ae6f019e0a12321a1741c14679bb0664 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 8 Jul 2020 21:47:11 +0300 Subject: io_uring: fix a use after free in io_async_task_func() The "apoll" variable is freed and then used on the next line. We need to move the free down a few lines. Fixes: 0be0b0e33b0b ("io_uring: simplify io_async_task_func()") Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4c9a494c9f9f..14168fbc7d79 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4655,12 +4655,13 @@ static void io_async_task_func(struct callback_head *cb) /* restore ->work in case we need to retry again */ if (req->flags & REQ_F_WORK_INITIALIZED) memcpy(&req->work, &apoll->work, sizeof(req->work)); - kfree(apoll); if (!READ_ONCE(apoll->poll.canceled)) __io_req_task_submit(req); else __io_req_task_cancel(req, -ECANCELED); + + kfree(apoll); } static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, -- cgit From 5acbbc8ed3a9aef71c6eb5f19ba24f7321200220 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jul 2020 15:15:26 -0600 Subject: io_uring: only call kfree() for a non-zero pointer It's safe to call kfree() with a NULL pointer, but it's also pointless. Most of the time we don't have any data to free, and at millions of requests per second, the redundant function call adds noticeable overhead (about 1.3% of the runtime). Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 14168fbc7d79..51ff88330f9a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1519,7 +1519,8 @@ static void io_dismantle_req(struct io_kiocb *req) if (req->flags & REQ_F_NEED_CLEANUP) io_cleanup_req(req); - kfree(req->io); + if (req->io) + kfree(req->io); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); __io_put_req_task(req); -- cgit From 6d5f904904608a9cd32854d7d0a4dd65b27f9935 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Thu, 9 Jul 2020 09:15:29 +0800 Subject: io_uring: export cq overflow status to userspace For those applications which are not willing to use io_uring_enter() to reap and handle cqes, they may completely rely on liburing's io_uring_peek_cqe(), but if cq ring has overflowed, currently because io_uring_peek_cqe() is not aware of this overflow, it won't enter kernel to flush cqes, below test program can reveal this bug: static void test_cq_overflow(struct io_uring *ring) { struct io_uring_cqe *cqe; struct io_uring_sqe *sqe; int issued = 0; int ret = 0; do { sqe = io_uring_get_sqe(ring); if (!sqe) { fprintf(stderr, "get sqe failed\n"); break;; } ret = io_uring_submit(ring); if (ret <= 0) { if (ret != -EBUSY) fprintf(stderr, "sqe submit failed: %d\n", ret); break; } issued++; } while (ret > 0); assert(ret == -EBUSY); printf("issued requests: %d\n", issued); while (issued) { ret = io_uring_peek_cqe(ring, &cqe); if (ret) { if (ret != -EAGAIN) { fprintf(stderr, "peek completion failed: %s\n", strerror(ret)); break; } printf("left requets: %d\n", issued); continue; } io_uring_cqe_seen(ring, cqe); issued--; printf("left requets: %d\n", issued); } } int main(int argc, char *argv[]) { int ret; struct io_uring ring; ret = io_uring_queue_init(16, &ring, 0); if (ret) { fprintf(stderr, "ring setup failed: %d\n", ret); return 1; } test_cq_overflow(&ring); return 0; } To fix this issue, export cq overflow status to userspace by adding new IORING_SQ_CQ_OVERFLOW flag, then helper functions() in liburing, such as io_uring_peek_cqe, can be aware of this cq overflow and do flush accordingly. Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index d37d7ea5ebe5..32e37c38f274 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1274,6 +1274,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (cqe) { clear_bit(0, &ctx->sq_check_overflow); clear_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; } spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); @@ -1311,6 +1312,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) if (list_empty(&ctx->cq_overflow_list)) { set_bit(0, &ctx->sq_check_overflow); set_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; } req->flags |= REQ_F_OVERFLOW; refcount_inc(&req->refs); @@ -6080,9 +6082,9 @@ static int io_sq_thread(void *data) } /* Tell userspace we may need a wakeup call */ + spin_lock_irq(&ctx->completion_lock); ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; - /* make sure to read SQ tail after writing flags */ - smp_mb(); + spin_unlock_irq(&ctx->completion_lock); to_submit = io_sqring_entries(ctx); if (!to_submit || ret == -EBUSY) { @@ -6100,13 +6102,17 @@ static int io_sq_thread(void *data) schedule(); finish_wait(&ctx->sqo_wait, &wait); + spin_lock_irq(&ctx->completion_lock); ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); ret = 0; continue; } finish_wait(&ctx->sqo_wait, &wait); + spin_lock_irq(&ctx->completion_lock); ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); } mutex_lock(&ctx->uring_lock); @@ -7488,6 +7494,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, if (list_empty(&ctx->cq_overflow_list)) { clear_bit(0, &ctx->sq_check_overflow); clear_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; } spin_unlock_irq(&ctx->completion_lock); -- cgit From f3bd9dae3708a0ff6b067e766073ffeb853301f9 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 9 Jul 2020 10:11:41 +0000 Subject: io_uring: fix memleak in __io_sqe_files_update() I got a memleak report when doing some fuzz test: BUG: memory leak unreferenced object 0xffff888113e02300 (size 488): comm "syz-executor401", pid 356, jiffies 4294809529 (age 11.954s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ a0 a4 ce 19 81 88 ff ff 60 ce 09 0d 81 88 ff ff ........`....... backtrace: [<00000000129a84ec>] kmem_cache_zalloc include/linux/slab.h:659 [inline] [<00000000129a84ec>] __alloc_file+0x25/0x310 fs/file_table.c:101 [<000000003050ad84>] alloc_empty_file+0x4f/0x120 fs/file_table.c:151 [<000000004d0a41a3>] alloc_file+0x5e/0x550 fs/file_table.c:193 [<000000002cb242f0>] alloc_file_pseudo+0x16a/0x240 fs/file_table.c:233 [<00000000046a4baa>] anon_inode_getfile fs/anon_inodes.c:91 [inline] [<00000000046a4baa>] anon_inode_getfile+0xac/0x1c0 fs/anon_inodes.c:74 [<0000000035beb745>] __do_sys_perf_event_open+0xd4a/0x2680 kernel/events/core.c:11720 [<0000000049009dc7>] do_syscall_64+0x56/0xa0 arch/x86/entry/common.c:359 [<00000000353731ca>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 BUG: memory leak unreferenced object 0xffff8881152dd5e0 (size 16): comm "syz-executor401", pid 356, jiffies 4294809529 (age 11.954s) hex dump (first 16 bytes): 01 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000074caa794>] kmem_cache_zalloc include/linux/slab.h:659 [inline] [<0000000074caa794>] lsm_file_alloc security/security.c:567 [inline] [<0000000074caa794>] security_file_alloc+0x32/0x160 security/security.c:1440 [<00000000c6745ea3>] __alloc_file+0xba/0x310 fs/file_table.c:106 [<000000003050ad84>] alloc_empty_file+0x4f/0x120 fs/file_table.c:151 [<000000004d0a41a3>] alloc_file+0x5e/0x550 fs/file_table.c:193 [<000000002cb242f0>] alloc_file_pseudo+0x16a/0x240 fs/file_table.c:233 [<00000000046a4baa>] anon_inode_getfile fs/anon_inodes.c:91 [inline] [<00000000046a4baa>] anon_inode_getfile+0xac/0x1c0 fs/anon_inodes.c:74 [<0000000035beb745>] __do_sys_perf_event_open+0xd4a/0x2680 kernel/events/core.c:11720 [<0000000049009dc7>] do_syscall_64+0x56/0xa0 arch/x86/entry/common.c:359 [<00000000353731ca>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 If io_sqe_file_register() failed, we need put the file that get by fget() to avoid the memleak. Fixes: c3a31e605620 ("io_uring: add support for IORING_REGISTER_FILES_UPDATE") Cc: stable@vger.kernel.org Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 32e37c38f274..a9ce2e6f03dd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6851,8 +6851,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, } table->files[index] = file; err = io_sqe_file_register(ctx, file, i); - if (err) + if (err) { + fput(file); break; + } } nr_args--; done++; -- cgit From 2bc9930e78fe0cb3e7b7e3169de0a40baee38d29 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Jul 2020 09:43:27 -0600 Subject: io_uring: get rid of __req_need_defer() We just have one caller of this, req_need_defer(), just inline the code in there instead. Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 51ff88330f9a..7f2a2cb5c056 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1069,18 +1069,14 @@ err: return NULL; } -static inline bool __req_need_defer(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - - return req->sequence != ctx->cached_cq_tail - + atomic_read(&ctx->cached_cq_overflow); -} - static inline bool req_need_defer(struct io_kiocb *req) { - if (unlikely(req->flags & REQ_F_IO_DRAIN)) - return __req_need_defer(req); + if (unlikely(req->flags & REQ_F_IO_DRAIN)) { + struct io_ring_ctx *ctx = req->ctx; + + return req->sequence != ctx->cached_cq_tail + + atomic_read(&ctx->cached_cq_overflow); + } return false; } -- cgit From 4349f30ecb8068d146a1e57bb12f46e745323b4c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Jul 2020 15:07:01 -0600 Subject: io_uring: remove dead 'ctx' argument and move forward declaration We don't use 'ctx' at all in io_sq_thread_drop_mm(), it just works on the mm of the current task. Drop the argument. Move io_file_put_work() to where we have the other forward declarations of functions. Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7f2a2cb5c056..3ce02a1613cc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -902,6 +902,7 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_comp_state *cs); +static void io_file_put_work(struct work_struct *work); static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter, @@ -942,7 +943,7 @@ static void __io_put_req_task(struct io_kiocb *req) put_task_struct(req->task); } -static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) +static void io_sq_thread_drop_mm(void) { struct mm_struct *mm = current->mm; @@ -977,8 +978,6 @@ static inline void req_set_fail_links(struct io_kiocb *req) req->flags |= REQ_F_FAIL_LINK; } -static void io_file_put_work(struct work_struct *work); - /* * Note: must call io_req_init_async() for the first time you * touch any members of io_wq_work. @@ -6339,7 +6338,7 @@ static int io_sq_thread(void *data) * adding ourselves to the waitqueue, as the unuse/drop * may sleep. */ - io_sq_thread_drop_mm(ctx); + io_sq_thread_drop_mm(); /* * We're polling. If we're within the defined idle @@ -6410,7 +6409,7 @@ static int io_sq_thread(void *data) io_run_task_work(); - io_sq_thread_drop_mm(ctx); + io_sq_thread_drop_mm(); revert_creds(old_cred); kthread_parkme(); -- cgit From 667e57da358f61b6966e12e925a69e42d912e8bb Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 10 Jul 2020 14:14:20 +0000 Subject: io_uring: fix memleak in io_sqe_files_register() I got a memleak report when doing some fuzz test: BUG: memory leak unreferenced object 0x607eeac06e78 (size 8): comm "test", pid 295, jiffies 4294735835 (age 31.745s) hex dump (first 8 bytes): 00 00 00 00 00 00 00 00 ........ backtrace: [<00000000932632e6>] percpu_ref_init+0x2a/0x1b0 [<0000000092ddb796>] __io_uring_register+0x111d/0x22a0 [<00000000eadd6c77>] __x64_sys_io_uring_register+0x17b/0x480 [<00000000591b89a6>] do_syscall_64+0x56/0xa0 [<00000000864a281d>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Call percpu_ref_exit() on error path to avoid refcount memleak. Fixes: 05f3fb3c5397 ("io_uring: avoid ring quiesce for fixed file set unregister and update") Cc: stable@vger.kernel.org Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index a9ce2e6f03dd..fc07baf4392a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6699,6 +6699,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, for (i = 0; i < nr_tables; i++) kfree(ctx->file_data->table[i].files); + percpu_ref_exit(&ctx->file_data->refs); kfree(ctx->file_data->table); kfree(ctx->file_data); ctx->file_data = NULL; -- cgit From 309fc03a3284af62eb6082fb60327045a1dabf57 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 10 Jul 2020 09:13:34 -0600 Subject: io_uring: account user memory freed when exit has been queued We currently account the memory after the exit work has been run, but that leaves a gap where a process has closed its ring and until the memory has been accounted as freed. If the memlocked ulimit is borderline, then that can introduce spurious setup errors returning -ENOMEM because the free work hasn't been run yet. Account this as freed when we close the ring, as not to expose a tiny gap where setting up a new ring can fail. Fixes: 85faa7b8346e ("io_uring: punt final io_ring_ctx wait-and-free to workqueue") Cc: stable@vger.kernel.org # v5.7 Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index fc07baf4392a..ca8abde48b6c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7351,9 +7351,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_mem_free(ctx->sq_sqes); percpu_ref_exit(&ctx->refs); - if (ctx->account_mem) - io_unaccount_mem(ctx->user, - ring_pages(ctx->sq_entries, ctx->cq_entries)); free_uid(ctx->user); put_cred(ctx->creds); kfree(ctx->cancel_hash); @@ -7438,6 +7435,16 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) if (ctx->rings) io_cqring_overflow_flush(ctx, true); idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); + + /* + * Do this upfront, so we won't have a grace period where the ring + * is closed but resources aren't reaped yet. This can cause + * spurious failure in setting up a new ring. + */ + if (ctx->account_mem) + io_unaccount_mem(ctx->user, + ring_pages(ctx->sq_entries, ctx->cq_entries)); + INIT_WORK(&ctx->exit_work, io_ring_exit_work); queue_work(system_wq, &ctx->exit_work); } -- cgit From dd821e0c95a64b5923a0c57f07d3f7563553e756 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 12 Jul 2020 13:23:08 +0300 Subject: io_uring: fix missing msg_name assignment Ensure to set msg.msg_name for the async portion of send/recvmsg, as the header copy will copy to/from it. Cc: stable@vger.kernel.org # v5.5+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index ca8abde48b6c..5570d6aeaff8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3553,6 +3553,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->flags & REQ_F_NEED_CLEANUP) return 0; + io->msg.msg.msg_name = &io->msg.addr; io->msg.iov = io->msg.fast_iov; ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, &io->msg.iov); @@ -3734,6 +3735,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) { + io->msg.msg.msg_name = &io->msg.addr; io->msg.iov = io->msg.fast_iov; #ifdef CONFIG_COMPAT -- cgit From 16d598030a37853a7a6b4384cad19c9c0af2f021 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 12 Jul 2020 16:16:47 +0300 Subject: io_uring: fix not initialised work->flags 59960b9deb535 ("io_uring: fix lazy work init") tried to fix missing io_req_init_async(), but left out work.flags and hash. Do it earlier. Fixes: 7cdaf587de7c ("io_uring: avoid whole io_wq_work copy for requests completed inline") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5570d6aeaff8..9fd7e69696c3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1096,6 +1096,8 @@ static inline void io_prep_async_work(struct io_kiocb *req, { const struct io_op_def *def = &io_op_defs[req->opcode]; + io_req_init_async(req); + if (req->flags & REQ_F_ISREG) { if (def->hash_reg_file) io_wq_hash_work(&req->work, file_inode(req->file)); @@ -1104,7 +1106,6 @@ static inline void io_prep_async_work(struct io_kiocb *req, req->work.flags |= IO_WQ_WORK_UNBOUND; } - io_req_init_async(req); io_req_work_grab_env(req, def); *link = io_prep_linked_timeout(req); -- cgit From 681fda8d27a66f7e65ff7f2d200d7635e64a8d05 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jul 2020 22:20:45 +0300 Subject: io_uring: fix recvmsg memory leak with buffer selection io_recvmsg() doesn't free memory allocated for struct io_buffer. This can causes a leak when used with automatic buffer selection. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9fd7e69696c3..74bc4a04befa 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3845,10 +3845,16 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg, kmsg->uaddr, flags); - if (force_nonblock && ret == -EAGAIN) - return io_setup_async_msg(req, kmsg); + if (force_nonblock && ret == -EAGAIN) { + ret = io_setup_async_msg(req, kmsg); + if (ret != -EAGAIN) + kfree(kbuf); + return ret; + } if (ret == -ERESTARTSYS) ret = -EINTR; + if (kbuf) + kfree(kbuf); } if (kmsg && kmsg->iov != kmsg->fast_iov) -- cgit From 807abcb0883439af5ead73f3308310453b97b624 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Jul 2020 17:09:27 -0600 Subject: io_uring: ensure double poll additions work with both request types The double poll additions were centered around doing POLL_ADD on file descriptors that use more than one waitqueue (typically one for read, one for write) when being polled. However, it can also end up being triggered for when we use poll triggered retry. For that case, we cannot safely use req->io, as that could be used by the request type itself. Add a second io_poll_iocb pointer in the structure we allocate for poll based retry, and ensure we use the right one from the two paths. Fixes: 18bceab101ad ("io_uring: allow POLL_ADD with double poll_wait() users") Signed-off-by: Jens Axboe --- fs/io_uring.c | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 74bc4a04befa..53232ac3da17 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -605,6 +605,7 @@ enum { struct async_poll { struct io_poll_iocb poll; + struct io_poll_iocb *double_poll; struct io_wq_work work; }; @@ -4159,9 +4160,9 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) return false; } -static void io_poll_remove_double(struct io_kiocb *req) +static void io_poll_remove_double(struct io_kiocb *req, void *data) { - struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io; + struct io_poll_iocb *poll = data; lockdep_assert_held(&req->ctx->completion_lock); @@ -4181,7 +4182,7 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) { struct io_ring_ctx *ctx = req->ctx; - io_poll_remove_double(req); + io_poll_remove_double(req, req->io); req->poll.done = true; io_cqring_fill_event(req, error ? error : mangle_poll(mask)); io_commit_cqring(ctx); @@ -4224,21 +4225,21 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct io_kiocb *req = wait->private; - struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io; + struct io_poll_iocb *poll = req->apoll->double_poll; __poll_t mask = key_to_poll(key); /* for instances that support it check for an event match first: */ if (mask && !(mask & poll->events)) return 0; - if (req->poll.head) { + if (poll && poll->head) { bool done; - spin_lock(&req->poll.head->lock); - done = list_empty(&req->poll.wait.entry); + spin_lock(&poll->head->lock); + done = list_empty(&poll->wait.entry); if (!done) - list_del_init(&req->poll.wait.entry); - spin_unlock(&req->poll.head->lock); + list_del_init(&poll->wait.entry); + spin_unlock(&poll->head->lock); if (!done) __io_async_wake(req, poll, mask, io_poll_task_func); } @@ -4258,7 +4259,8 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, } static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, - struct wait_queue_head *head) + struct wait_queue_head *head, + struct io_poll_iocb **poll_ptr) { struct io_kiocb *req = pt->req; @@ -4269,7 +4271,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, */ if (unlikely(poll->head)) { /* already have a 2nd entry, fail a third attempt */ - if (req->io) { + if (*poll_ptr) { pt->error = -EINVAL; return; } @@ -4281,7 +4283,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake); refcount_inc(&req->refs); poll->wait.private = req; - req->io = (void *) poll; + *poll_ptr = poll; } pt->error = 0; @@ -4293,8 +4295,9 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + struct async_poll *apoll = pt->req->apoll; - __io_queue_proc(&pt->req->apoll->poll, pt, head); + __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); } static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) @@ -4344,11 +4347,13 @@ static void io_async_task_func(struct callback_head *cb) } } + io_poll_remove_double(req, apoll->double_poll); spin_unlock_irq(&ctx->completion_lock); /* restore ->work in case we need to retry again */ if (req->flags & REQ_F_WORK_INITIALIZED) memcpy(&req->work, &apoll->work, sizeof(req->work)); + kfree(apoll->double_poll); kfree(apoll); if (!canceled) { @@ -4436,7 +4441,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req) struct async_poll *apoll; struct io_poll_table ipt; __poll_t mask, ret; - bool had_io; if (!req->file || !file_can_poll(req->file)) return false; @@ -4448,11 +4452,11 @@ static bool io_arm_poll_handler(struct io_kiocb *req) apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); if (unlikely(!apoll)) return false; + apoll->double_poll = NULL; req->flags |= REQ_F_POLLED; if (req->flags & REQ_F_WORK_INITIALIZED) memcpy(&apoll->work, &req->work, sizeof(req->work)); - had_io = req->io != NULL; io_get_req_task(req); req->apoll = apoll; @@ -4470,13 +4474,11 @@ static bool io_arm_poll_handler(struct io_kiocb *req) ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, io_async_wake); if (ret) { - ipt.error = 0; - /* only remove double add if we did it here */ - if (!had_io) - io_poll_remove_double(req); + io_poll_remove_double(req, apoll->double_poll); spin_unlock_irq(&ctx->completion_lock); if (req->flags & REQ_F_WORK_INITIALIZED) memcpy(&req->work, &apoll->work, sizeof(req->work)); + kfree(apoll->double_poll); kfree(apoll); return false; } @@ -4507,11 +4509,13 @@ static bool io_poll_remove_one(struct io_kiocb *req) bool do_complete; if (req->opcode == IORING_OP_POLL_ADD) { - io_poll_remove_double(req); + io_poll_remove_double(req, req->io); do_complete = __io_poll_remove_one(req, &req->poll); } else { struct async_poll *apoll = req->apoll; + io_poll_remove_double(req, apoll->double_poll); + /* non-poll requests have submit ref still */ do_complete = __io_poll_remove_one(req, &apoll->poll); if (do_complete) { @@ -4524,6 +4528,7 @@ static bool io_poll_remove_one(struct io_kiocb *req) if (req->flags & REQ_F_WORK_INITIALIZED) memcpy(&req->work, &apoll->work, sizeof(req->work)); + kfree(apoll->double_poll); kfree(apoll); } } @@ -4624,7 +4629,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - __io_queue_proc(&pt->req->poll, pt, head); + __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->io); } static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -- cgit From 61710e437f2807e26a3402543bdbb7217a9c8620 Mon Sep 17 00:00:00 2001 From: Daniele Albano Date: Sat, 18 Jul 2020 14:15:16 -0600 Subject: io_uring: always allow drain/link/hardlink/async sqe flags We currently filter these for timeout_remove/async_cancel/files_update, but we only should be filtering for fixed file and buffer select. This also causes a second read of sqe->flags, which isn't needed. Just check req->flags for the relevant bits. This then allows these commands to be used in links, for example, like everything else. Signed-off-by: Daniele Albano Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 53232ac3da17..d99802ac166f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4737,7 +4737,9 @@ static int io_timeout_remove_prep(struct io_kiocb *req, { if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len) + if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) + return -EINVAL; + if (sqe->ioprio || sqe->buf_index || sqe->len) return -EINVAL; req->timeout.addr = READ_ONCE(sqe->addr); @@ -4915,8 +4917,9 @@ static int io_async_cancel_prep(struct io_kiocb *req, { if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->flags || sqe->ioprio || sqe->off || sqe->len || - sqe->cancel_flags) + if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags) return -EINVAL; req->cancel.addr = READ_ONCE(sqe->addr); @@ -4934,7 +4937,9 @@ static int io_async_cancel(struct io_kiocb *req) static int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->flags || sqe->ioprio || sqe->rw_flags) + if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) + return -EINVAL; + if (sqe->ioprio || sqe->rw_flags) return -EINVAL; req->files_update.offset = READ_ONCE(sqe->off); -- cgit From 3e863ea3bb1a2203ae648eb272db0ce6a1a2072c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jul 2020 20:17:20 +0300 Subject: io_uring: missed req_init_async() for IOSQE_ASYNC IOSQE_ASYNC branch of io_queue_sqe() is another place where an unitialised req->work can be accessed (i.e. prior io_req_init_async()). Nothing really bad though, it just looses IO_WQ_WORK_CONCURRENT flag. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index d99802ac166f..32b0064f806e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5730,6 +5730,7 @@ fail_req: * Never try inline submit of IOSQE_ASYNC is set, go straight * to async execution. */ + io_req_init_async(req); req->work.flags |= IO_WQ_WORK_CONCURRENT; io_queue_async_work(req); } else { -- cgit From d5e16d8e23825304c6a9945116cc6b6f8d51f28c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 24 Jul 2020 20:07:20 +0300 Subject: io_uring: fix ->work corruption with poll_add req->work might be already initialised by the time it gets into __io_arm_poll_handler(), which will corrupt it by using fields that are in an union with req->work. Luckily, the only side effect is missing put_creds(). Clean req->work before going there. Suggested-by: Jens Axboe Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 32b0064f806e..98e8079e67e7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4658,6 +4658,10 @@ static int io_poll_add(struct io_kiocb *req) struct io_poll_table ipt; __poll_t mask; + /* ->work is in union with hash_node and others */ + io_req_work_drop_env(req); + req->flags &= ~REQ_F_WORK_INITIALIZED; + INIT_HLIST_NODE(&req->hash_node); INIT_LIST_HEAD(&req->list); ipt.pt._qproc = io_poll_queue_proc; -- cgit From 4ae6dbd683860b9edc254ea8acf5e04b5ae242e5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 24 Jul 2020 20:07:21 +0300 Subject: io_uring: fix lockup in io_fail_links() io_fail_links() doesn't consider REQ_F_COMP_LOCKED leading to nested spin_lock(completion_lock) and lockup. [ 197.680409] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 6-... } 18239 jiffies s: 1421 root: 0x40/. [ 197.680411] rcu: blocking rcu_node structures: [ 197.680412] Task dump for CPU 6: [ 197.680413] link-timeout R running task 0 1669 1 0x8000008a [ 197.680414] Call Trace: [ 197.680420] ? io_req_find_next+0xa0/0x200 [ 197.680422] ? io_put_req_find_next+0x2a/0x50 [ 197.680423] ? io_poll_task_func+0xcf/0x140 [ 197.680425] ? task_work_run+0x67/0xa0 [ 197.680426] ? do_exit+0x35d/0xb70 [ 197.680429] ? syscall_trace_enter+0x187/0x2c0 [ 197.680430] ? do_group_exit+0x43/0xa0 [ 197.680448] ? __x64_sys_exit_group+0x18/0x20 [ 197.680450] ? do_syscall_64+0x52/0xa0 [ 197.680452] ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 98e8079e67e7..493e5047e67c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4199,10 +4199,9 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) hash_del(&req->hash_node); io_poll_complete(req, req->result, 0); - req->flags |= REQ_F_COMP_LOCKED; - io_put_req_find_next(req, nxt); spin_unlock_irq(&ctx->completion_lock); + io_put_req_find_next(req, nxt); io_cqring_ev_posted(ctx); } -- cgit From b36200f543ff07a1cb346aa582349141df2c8068 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Sat, 11 Jul 2020 11:31:11 +0200 Subject: io_uring: fix sq array offset calculation rings_size() sets sq_offset to the total size of the rings (the returned value which is used for memory allocation). This is wrong: sq array should be located within the rings, not after them. Set sq_offset to where it should be. Fixes: 75b28affdd6a ("io_uring: allocate the two rings together") Signed-off-by: Dmitry Vyukov Acked-by: Hristo Venev Cc: io-uring@vger.kernel.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index ff3851d40df4..ca932fb3c67d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7416,6 +7416,9 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, return SIZE_MAX; #endif + if (sq_offset) + *sq_offset = off; + sq_array_size = array_size(sizeof(u32), sq_entries); if (sq_array_size == SIZE_MAX) return SIZE_MAX; @@ -7423,9 +7426,6 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, if (check_add_overflow(off, sq_array_size, &off)) return SIZE_MAX; - if (sq_offset) - *sq_offset = off; - return off; } -- cgit From 270a5940700bb6cf9abf36ea10cf1fa0d453aa7a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 12 Jul 2020 20:41:04 +0300 Subject: io_uring: rename sr->msg into umsg Every second field in send/recv is called msg, make it a bit more understandable by renaming ->msg, which is a user provided ptr, to ->umsg. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index ca932fb3c67d..f17f098c403a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -414,7 +414,7 @@ struct io_connect { struct io_sr_msg { struct file *file; union { - struct user_msghdr __user *msg; + struct user_msghdr __user *umsg; void __user *buf; }; int msg_flags; @@ -3903,7 +3903,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags); - sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); #ifdef CONFIG_COMPAT @@ -3919,7 +3919,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) io->msg.msg.msg_name = &io->msg.addr; io->msg.iov = io->msg.fast_iov; - ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + ret = sendmsg_copy_msghdr(&io->msg.msg, sr->umsg, sr->msg_flags, &io->msg.iov); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; @@ -3952,7 +3952,7 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, kmsg->msg.msg_name = &io.msg.addr; io.msg.iov = io.msg.fast_iov; - ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg, + ret = sendmsg_copy_msghdr(&io.msg.msg, sr->umsg, sr->msg_flags, &io.msg.iov); if (ret) return ret; @@ -4030,8 +4030,8 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) size_t iov_len; int ret; - ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr, - &uiov, &iov_len); + ret = __copy_msghdr_from_user(&io->msg.msg, sr->umsg, + &io->msg.uaddr, &uiov, &iov_len); if (ret) return ret; @@ -4065,7 +4065,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, compat_size_t len; int ret; - msg_compat = (struct compat_msghdr __user *) sr->msg; + msg_compat = (struct compat_msghdr __user *) sr->umsg; ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr, &ptr, &len); if (ret) @@ -4142,7 +4142,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags); - sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->bgid = READ_ONCE(sqe->buf_group); @@ -4207,7 +4207,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, else if (force_nonblock) flags |= MSG_DONTWAIT; - ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg, + ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, kmsg->uaddr, flags); if (force_nonblock && ret == -EAGAIN) { ret = io_setup_async_msg(req, kmsg); -- cgit From 1400e69705baf98d1c9cb73b592a3a68aab1d852 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 12 Jul 2020 20:41:05 +0300 Subject: io_uring: use more specific type in rcv/snd msg cp send/recv msghdr initialisation works with struct io_async_msghdr, but pulls the whole struct io_async_ctx for no reason. That complicates it with composite accessing, e.g. io->msg. Use and pass the most specific type, which is struct io_async_msghdr. It is the larget field in union io_async_ctx and doesn't save stack space, but looks clearer. The most of the changes are replacing "io->msg." with "iomsg->" Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 63 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 31 insertions(+), 32 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index f17f098c403a..8acbaddaebb7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3935,7 +3935,7 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, sock = sock_from_file(req->file, &ret); if (sock) { - struct io_async_ctx io; + struct io_async_msghdr iomsg; unsigned flags; if (req->io) { @@ -3948,14 +3948,13 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, } else { struct io_sr_msg *sr = &req->sr_msg; - kmsg = &io.msg; - kmsg->msg.msg_name = &io.msg.addr; - - io.msg.iov = io.msg.fast_iov; - ret = sendmsg_copy_msghdr(&io.msg.msg, sr->umsg, - sr->msg_flags, &io.msg.iov); + iomsg.msg.msg_name = &iomsg.addr; + iomsg.iov = iomsg.fast_iov; + ret = sendmsg_copy_msghdr(&iomsg.msg, sr->umsg, + sr->msg_flags, &iomsg.iov); if (ret) return ret; + kmsg = &iomsg; } flags = req->sr_msg.msg_flags; @@ -4023,30 +4022,31 @@ static int io_send(struct io_kiocb *req, bool force_nonblock, return 0; } -static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) +static int __io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) { struct io_sr_msg *sr = &req->sr_msg; struct iovec __user *uiov; size_t iov_len; int ret; - ret = __copy_msghdr_from_user(&io->msg.msg, sr->umsg, - &io->msg.uaddr, &uiov, &iov_len); + ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, + &iomsg->uaddr, &uiov, &iov_len); if (ret) return ret; if (req->flags & REQ_F_BUFFER_SELECT) { if (iov_len > 1) return -EINVAL; - if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov))) + if (copy_from_user(iomsg->iov, uiov, sizeof(*uiov))) return -EFAULT; - sr->len = io->msg.iov[0].iov_len; - iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1, + sr->len = iomsg->iov[0].iov_len; + iov_iter_init(&iomsg->msg.msg_iter, READ, iomsg->iov, 1, sr->len); - io->msg.iov = NULL; + iomsg->iov = NULL; } else { ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV, - &io->msg.iov, &io->msg.msg.msg_iter); + &iomsg->iov, &iomsg->msg.msg_iter); if (ret > 0) ret = 0; } @@ -4056,7 +4056,7 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) #ifdef CONFIG_COMPAT static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_ctx *io) + struct io_async_msghdr *iomsg) { struct compat_msghdr __user *msg_compat; struct io_sr_msg *sr = &req->sr_msg; @@ -4066,7 +4066,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, int ret; msg_compat = (struct compat_msghdr __user *) sr->umsg; - ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr, + ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr, &ptr, &len); if (ret) return ret; @@ -4083,12 +4083,12 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, return -EFAULT; if (clen < 0) return -EINVAL; - sr->len = io->msg.iov[0].iov_len; - io->msg.iov = NULL; + sr->len = iomsg->iov[0].iov_len; + iomsg->iov = NULL; } else { ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV, - &io->msg.iov, - &io->msg.msg.msg_iter); + &iomsg->iov, + &iomsg->msg.msg_iter); if (ret < 0) return ret; } @@ -4097,17 +4097,18 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, } #endif -static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) +static int io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) { - io->msg.msg.msg_name = &io->msg.addr; - io->msg.iov = io->msg.fast_iov; + iomsg->msg.msg_name = &iomsg->addr; + iomsg->iov = iomsg->fast_iov; #ifdef CONFIG_COMPAT if (req->ctx->compat) - return __io_compat_recvmsg_copy_hdr(req, io); + return __io_compat_recvmsg_copy_hdr(req, iomsg); #endif - return __io_recvmsg_copy_hdr(req, io); + return __io_recvmsg_copy_hdr(req, iomsg); } static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, @@ -4157,7 +4158,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, if (req->flags & REQ_F_NEED_CLEANUP) return 0; - ret = io_recvmsg_copy_hdr(req, io); + ret = io_recvmsg_copy_hdr(req, &io->msg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; @@ -4173,7 +4174,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, sock = sock_from_file(req->file, &ret); if (sock) { struct io_buffer *kbuf; - struct io_async_ctx io; + struct io_async_msghdr iomsg; unsigned flags; if (req->io) { @@ -4184,12 +4185,10 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, kmsg->iov = kmsg->fast_iov; kmsg->msg.msg_iter.iov = kmsg->iov; } else { - kmsg = &io.msg; - kmsg->msg.msg_name = &io.msg.addr; - - ret = io_recvmsg_copy_hdr(req, &io); + ret = io_recvmsg_copy_hdr(req, &iomsg); if (ret) return ret; + kmsg = &iomsg; } kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); -- cgit From 2ae523ed07f14391d685651f671a7858fe8c368a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 12 Jul 2020 20:41:06 +0300 Subject: io_uring: extract io_sendmsg_copy_hdr() Don't repeat send msg initialisation code, it's error prone. Extract and use a helper function. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8acbaddaebb7..a198466544e7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3893,6 +3893,15 @@ static int io_setup_async_msg(struct io_kiocb *req, return -EAGAIN; } +static int io_sendmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + iomsg->iov = iomsg->fast_iov; + iomsg->msg.msg_name = &iomsg->addr; + return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg, + req->sr_msg.msg_flags, &iomsg->iov); +} + static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; @@ -3917,10 +3926,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->flags & REQ_F_NEED_CLEANUP) return 0; - io->msg.msg.msg_name = &io->msg.addr; - io->msg.iov = io->msg.fast_iov; - ret = sendmsg_copy_msghdr(&io->msg.msg, sr->umsg, sr->msg_flags, - &io->msg.iov); + ret = io_sendmsg_copy_hdr(req, &io->msg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; @@ -3946,12 +3952,7 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, kmsg->iov = kmsg->fast_iov; kmsg->msg.msg_iter.iov = kmsg->iov; } else { - struct io_sr_msg *sr = &req->sr_msg; - - iomsg.msg.msg_name = &iomsg.addr; - iomsg.iov = iomsg.fast_iov; - ret = sendmsg_copy_msghdr(&iomsg.msg, sr->umsg, - sr->msg_flags, &iomsg.iov); + ret = io_sendmsg_copy_hdr(req, &iomsg); if (ret) return ret; kmsg = &iomsg; -- cgit From e73751225bae1e9b67e957afb273366fbb6ca136 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 12 Jul 2020 20:42:04 +0300 Subject: io_uring: replace rw->task_work with rq->task_work io_kiocb::task_work was de-unionised, and is not planned to be shared back, because it's too useful and commonly used. Hence, instead of keeping a separate task_work in struct io_async_rw just reuse req->task_work. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 31 ++++--------------------------- 1 file changed, 4 insertions(+), 27 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index a198466544e7..ddff3abff363 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -505,7 +505,6 @@ struct io_async_rw { ssize_t nr_segs; ssize_t size; struct wait_page_queue wpq; - struct callback_head task_work; }; struct io_async_ctx { @@ -2901,33 +2900,11 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static void io_async_buf_cancel(struct callback_head *cb) -{ - struct io_async_rw *rw; - struct io_kiocb *req; - - rw = container_of(cb, struct io_async_rw, task_work); - req = rw->wpq.wait.private; - __io_req_task_cancel(req, -ECANCELED); -} - -static void io_async_buf_retry(struct callback_head *cb) -{ - struct io_async_rw *rw; - struct io_kiocb *req; - - rw = container_of(cb, struct io_async_rw, task_work); - req = rw->wpq.wait.private; - - __io_req_task_submit(req); -} - static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, int sync, void *arg) { struct wait_page_queue *wpq; struct io_kiocb *req = wait->private; - struct io_async_rw *rw = &req->io->rw; struct wait_page_key *key = arg; int ret; @@ -2939,17 +2916,17 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, list_del_init(&wait->entry); - init_task_work(&rw->task_work, io_async_buf_retry); + init_task_work(&req->task_work, io_req_task_submit); /* submit ref gets dropped, acquire a new one */ refcount_inc(&req->refs); - ret = io_req_task_work_add(req, &rw->task_work); + ret = io_req_task_work_add(req, &req->task_work); if (unlikely(ret)) { struct task_struct *tsk; /* queue just for cancelation */ - init_task_work(&rw->task_work, io_async_buf_cancel); + init_task_work(&req->task_work, io_req_task_cancel); tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &rw->task_work, 0); + task_work_add(tsk, &req->task_work, 0); wake_up_process(tsk); } return 1; -- cgit From b64e3444d4e1c71fe148a4f4535395b1fdd73200 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 22:59:18 +0300 Subject: io_uring: simplify io_req_map_rw() Don't deref req->io->rw every time, but put it in a local variable. This looks prettier, generates less instructions, and doesn't break alias analysis. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index ddff3abff363..3b8465dd0214 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2828,15 +2828,17 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, struct iovec *iovec, struct iovec *fast_iov, struct iov_iter *iter) { - req->io->rw.nr_segs = iter->nr_segs; - req->io->rw.size = io_size; - req->io->rw.iov = iovec; - if (!req->io->rw.iov) { - req->io->rw.iov = req->io->rw.fast_iov; - if (req->io->rw.iov != fast_iov) - memcpy(req->io->rw.iov, fast_iov, + struct io_async_rw *rw = &req->io->rw; + + rw->nr_segs = iter->nr_segs; + rw->size = io_size; + if (!iovec) { + rw->iov = rw->fast_iov; + if (rw->iov != fast_iov) + memcpy(rw->iov, fast_iov, sizeof(struct iovec) * iter->nr_segs); } else { + rw->iov = iovec; req->flags |= REQ_F_NEED_CLEANUP; } } -- cgit From c3e330a493740a2a8312dcb7b1cffceaec7f619a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 22:59:19 +0300 Subject: io_uring: add a helper for async rw iovec prep Preparing reads/writes for async is a bit tricky. Extract a helper to not repeat it twice. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3b8465dd0214..31466bcd833e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2872,11 +2872,27 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, return 0; } +static inline int io_rw_prep_async(struct io_kiocb *req, int rw, + bool force_nonblock) +{ + struct io_async_ctx *io = req->io; + struct iov_iter iter; + ssize_t ret; + + io->rw.iov = io->rw.fast_iov; + req->io = NULL; + ret = io_import_iovec(rw, req, &io->rw.iov, &iter, !force_nonblock); + req->io = io; + if (unlikely(ret < 0)) + return ret; + + io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); + return 0; +} + static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool force_nonblock) { - struct io_async_ctx *io; - struct iov_iter iter; ssize_t ret; ret = io_prep_rw(req, sqe, force_nonblock); @@ -2889,17 +2905,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, /* either don't need iovec imported or already have it */ if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; - - io = req->io; - io->rw.iov = io->rw.fast_iov; - req->io = NULL; - ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock); - req->io = io; - if (ret < 0) - return ret; - - io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); - return 0; + return io_rw_prep_async(req, READ, force_nonblock); } static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, @@ -3043,8 +3049,6 @@ out_free: static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool force_nonblock) { - struct io_async_ctx *io; - struct iov_iter iter; ssize_t ret; ret = io_prep_rw(req, sqe, force_nonblock); @@ -3059,17 +3063,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, /* either don't need iovec imported or already have it */ if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; - - io = req->io; - io->rw.iov = io->rw.fast_iov; - req->io = NULL; - ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock); - req->io = io; - if (ret < 0) - return ret; - - io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); - return 0; + return io_rw_prep_async(req, WRITE, force_nonblock); } static int io_write(struct io_kiocb *req, bool force_nonblock, -- cgit From 252917c30f551e8e4377faac81d7fcf8e9629df1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 22:59:20 +0300 Subject: io_uring: follow **iovec idiom in io_import_iovec As for import_iovec(), return !=NULL iovec from io_import_iovec() only when it should be freed. That includes returning NULL when iovec is already in req->io, because it should be deallocated by other means, e.g. inside op handler. After io_setup_async_rw() local iovec to ->io, just mark it NULL, to follow the idea in io_{read,write} as well. That's easier to follow, and especially useful if we want to reuse per-op space for completion data. Signed-off-by: Pavel Begunkov [axboe: only call kfree() on non-NULL pointer] Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 31466bcd833e..64ae5b681c62 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2740,10 +2740,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, if (req->io) { struct io_async_rw *iorw = &req->io->rw; - *iovec = iorw->iov; - iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size); - if (iorw->iov == iorw->fast_iov) - *iovec = NULL; + iov_iter_init(iter, rw, iorw->iov, iorw->nr_segs, iorw->size); + *iovec = NULL; return iorw->size; } @@ -3026,6 +3024,8 @@ copy_iov: inline_vecs, &iter); if (ret) goto out_free; + /* it's copied and will be cleaned with ->io */ + iovec = NULL; /* if we can retry, do so with the callbacks armed */ if (io_rw_should_retry(req)) { ret2 = io_iter_do_read(req, &iter); @@ -3041,7 +3041,7 @@ copy_iov: } } out_free: - if (!(req->flags & REQ_F_NEED_CLEANUP)) + if (iovec) kfree(iovec); return ret; } @@ -3143,11 +3143,13 @@ copy_iov: inline_vecs, &iter); if (ret) goto out_free; + /* it's copied and will be cleaned with ->io */ + iovec = NULL; return -EAGAIN; } } out_free: - if (!(req->flags & REQ_F_NEED_CLEANUP)) + if (iovec) kfree(iovec); return ret; } -- cgit From 3ca405ebfc1c3445b049dd25ca3338cbc99837d1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:08 +0300 Subject: io_uring: share completion list w/ per-op space Calling io_req_complete(req) means that the request is done, and there is nothing left but to clean it up. That also means that per-op data after that should not be used, so we're free to reuse it in completion path, e.g. to store overflow_list as done in this patch. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 64ae5b681c62..3cadd5f963b7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -487,6 +487,11 @@ struct io_statx { struct statx __user *buffer; }; +struct io_completion { + struct file *file; + struct list_head list; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -622,6 +627,8 @@ struct io_kiocb { struct io_splice splice; struct io_provide_buf pbuf; struct io_statx statx; + /* use only after cleaning per-op data, see io_clean_op() */ + struct io_completion compl; }; struct io_async_ctx *io; @@ -896,7 +903,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, static int io_grab_files(struct io_kiocb *req); static void io_complete_rw_common(struct kiocb *kiocb, long res, struct io_comp_state *cs); -static void io_cleanup_req(struct io_kiocb *req); +static void __io_clean_op(struct io_kiocb *req); static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, int fd, struct file **out_file, bool fixed); static void __io_queue_sqe(struct io_kiocb *req, @@ -936,6 +943,12 @@ static void io_get_req_task(struct io_kiocb *req) req->flags |= REQ_F_TASK_PINNED; } +static inline void io_clean_op(struct io_kiocb *req) +{ + if (req->flags & REQ_F_NEED_CLEANUP) + __io_clean_op(req); +} + /* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */ static void __io_put_req_task(struct io_kiocb *req) { @@ -1413,8 +1426,8 @@ static void io_submit_flush_completions(struct io_comp_state *cs) while (!list_empty(&cs->list)) { struct io_kiocb *req; - req = list_first_entry(&cs->list, struct io_kiocb, list); - list_del(&req->list); + req = list_first_entry(&cs->list, struct io_kiocb, compl.list); + list_del(&req->compl.list); __io_cqring_fill_event(req, req->result, req->cflags); if (!(req->flags & REQ_F_LINK_HEAD)) { req->flags |= REQ_F_COMP_LOCKED; @@ -1439,9 +1452,10 @@ static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags, io_cqring_add_event(req, res, cflags); io_put_req(req); } else { + io_clean_op(req); req->result = res; req->cflags = cflags; - list_add_tail(&req->list, &cs->list); + list_add_tail(&req->compl.list, &cs->list); if (++cs->nr >= 32) io_submit_flush_completions(cs); } @@ -1515,8 +1529,7 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file, static void io_dismantle_req(struct io_kiocb *req) { - if (req->flags & REQ_F_NEED_CLEANUP) - io_cleanup_req(req); + io_clean_op(req); if (req->io) kfree(req->io); @@ -5402,7 +5415,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EIOCBQUEUED; } -static void io_cleanup_req(struct io_kiocb *req) +static void __io_clean_op(struct io_kiocb *req) { struct io_async_ctx *io = req->io; -- cgit From 540e32a0855e700affa29b1112bf2dbb1fa7702a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:09 +0300 Subject: io_uring: rename ctx->poll into ctx->iopoll It supports both polling and I/O polling. Rename ctx->poll to clearly show that it's only in I/O poll case. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3cadd5f963b7..c8ebd227c837 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -320,12 +320,12 @@ struct io_ring_ctx { spinlock_t completion_lock; /* - * ->poll_list is protected by the ctx->uring_lock for + * ->iopoll_list is protected by the ctx->uring_lock for * io_uring instances that don't use IORING_SETUP_SQPOLL. * For SQPOLL, only the single threaded io_sq_thread() will * manipulate the list, hence no extra locking is needed there. */ - struct list_head poll_list; + struct list_head iopoll_list; struct hlist_head *cancel_hash; unsigned cancel_hash_bits; bool poll_multi_file; @@ -1064,7 +1064,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); - INIT_LIST_HEAD(&ctx->poll_list); + INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); init_waitqueue_head(&ctx->inflight_wait); @@ -2009,7 +2009,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, spin = !ctx->poll_multi_file && *nr_events < min; ret = 0; - list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) { + list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, list) { struct kiocb *kiocb = &req->rw.kiocb; /* @@ -2051,7 +2051,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, long min) { - while (!list_empty(&ctx->poll_list) && !need_resched()) { + while (!list_empty(&ctx->iopoll_list) && !need_resched()) { int ret; ret = io_do_iopoll(ctx, nr_events, min); @@ -2074,7 +2074,7 @@ static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) return; mutex_lock(&ctx->uring_lock); - while (!list_empty(&ctx->poll_list)) { + while (!list_empty(&ctx->iopoll_list)) { unsigned int nr_events = 0; io_do_iopoll(ctx, &nr_events, 0); @@ -2291,12 +2291,12 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * how we do polling eventually, not spinning if we're on potentially * different devices. */ - if (list_empty(&ctx->poll_list)) { + if (list_empty(&ctx->iopoll_list)) { ctx->poll_multi_file = false; } else if (!ctx->poll_multi_file) { struct io_kiocb *list_req; - list_req = list_first_entry(&ctx->poll_list, struct io_kiocb, + list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, list); if (list_req->file != req->file) ctx->poll_multi_file = true; @@ -2307,9 +2307,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * it to the front so we find it first. */ if (READ_ONCE(req->iopoll_completed)) - list_add(&req->list, &ctx->poll_list); + list_add(&req->list, &ctx->iopoll_list); else - list_add_tail(&req->list, &ctx->poll_list); + list_add_tail(&req->list, &ctx->iopoll_list); if ((ctx->flags & IORING_SETUP_SQPOLL) && wq_has_sleeper(&ctx->sqo_wait)) @@ -6329,11 +6329,11 @@ static int io_sq_thread(void *data) while (!kthread_should_park()) { unsigned int to_submit; - if (!list_empty(&ctx->poll_list)) { + if (!list_empty(&ctx->iopoll_list)) { unsigned nr_events = 0; mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->poll_list) && !need_resched()) + if (!list_empty(&ctx->iopoll_list) && !need_resched()) io_do_iopoll(ctx, &nr_events, 0); else timeout = jiffies + ctx->sq_thread_idle; @@ -6362,7 +6362,7 @@ static int io_sq_thread(void *data) * more IO, we should wait for the application to * reap events and wake us up. */ - if (!list_empty(&ctx->poll_list) || need_resched() || + if (!list_empty(&ctx->iopoll_list) || need_resched() || (!time_after(jiffies, timeout) && ret != -EBUSY && !percpu_ref_is_dying(&ctx->refs))) { io_run_task_work(); @@ -6375,13 +6375,13 @@ static int io_sq_thread(void *data) /* * While doing polled IO, before going to sleep, we need - * to check if there are new reqs added to poll_list, it - * is because reqs may have been punted to io worker and - * will be added to poll_list later, hence check the - * poll_list again. + * to check if there are new reqs added to iopoll_list, + * it is because reqs may have been punted to io worker + * and will be added to iopoll_list later, hence check + * the iopoll_list again. */ if ((ctx->flags & IORING_SETUP_IOPOLL) && - !list_empty_careful(&ctx->poll_list)) { + !list_empty_careful(&ctx->iopoll_list)) { finish_wait(&ctx->sqo_wait, &wait); continue; } -- cgit From d21ffe7eca82d47b489760899912f81e30456e2e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:10 +0300 Subject: io_uring: use inflight_entry list for iopoll'ing req->inflight_entry is used to track requests that grabbed files_struct. Let's share it with iopoll list, because the only iopoll'ed ops are reads and writes, which don't need a file table. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index c8ebd227c837..8a89480a57ec 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -651,6 +651,10 @@ struct io_kiocb { struct list_head link_list; + /* + * 1. used with ctx->iopoll_list with reads/writes + * 2. to track reqs with ->files (see io_op_def::file_table) + */ struct list_head inflight_entry; struct percpu_ref *fixed_file_refs; @@ -1943,8 +1947,8 @@ static void io_iopoll_queue(struct list_head *again) struct io_kiocb *req; do { - req = list_first_entry(again, struct io_kiocb, list); - list_del(&req->list); + req = list_first_entry(again, struct io_kiocb, inflight_entry); + list_del(&req->inflight_entry); if (!io_rw_reissue(req, -EAGAIN)) io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL); } while (!list_empty(again)); @@ -1967,13 +1971,13 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, while (!list_empty(done)) { int cflags = 0; - req = list_first_entry(done, struct io_kiocb, list); + req = list_first_entry(done, struct io_kiocb, inflight_entry); if (READ_ONCE(req->result) == -EAGAIN) { req->iopoll_completed = 0; - list_move_tail(&req->list, &again); + list_move_tail(&req->inflight_entry, &again); continue; } - list_del(&req->list); + list_del(&req->inflight_entry); if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_kbuf(req); @@ -2009,7 +2013,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, spin = !ctx->poll_multi_file && *nr_events < min; ret = 0; - list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, list) { + list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) { struct kiocb *kiocb = &req->rw.kiocb; /* @@ -2018,7 +2022,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, * and complete those lists first, if we have entries there. */ if (READ_ONCE(req->iopoll_completed)) { - list_move_tail(&req->list, &done); + list_move_tail(&req->inflight_entry, &done); continue; } if (!list_empty(&done)) @@ -2030,7 +2034,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, /* iopoll may have completed current req */ if (READ_ONCE(req->iopoll_completed)) - list_move_tail(&req->list, &done); + list_move_tail(&req->inflight_entry, &done); if (ret && spin) spin = false; @@ -2297,7 +2301,7 @@ static void io_iopoll_req_issued(struct io_kiocb *req) struct io_kiocb *list_req; list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, - list); + inflight_entry); if (list_req->file != req->file) ctx->poll_multi_file = true; } @@ -2307,9 +2311,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * it to the front so we find it first. */ if (READ_ONCE(req->iopoll_completed)) - list_add(&req->list, &ctx->iopoll_list); + list_add(&req->inflight_entry, &ctx->iopoll_list); else - list_add_tail(&req->list, &ctx->iopoll_list); + list_add_tail(&req->inflight_entry, &ctx->iopoll_list); if ((ctx->flags & IORING_SETUP_SQPOLL) && wq_has_sleeper(&ctx->sqo_wait)) -- cgit From 40d8ddd4facb80760d5a0c61a7cf026d5ff73ff0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:11 +0300 Subject: io_uring: use completion list for CQ overflow As with the completion path, also use compl.list for overflowed requests. If cleaned up properly, nobody needs per-op data there anymore. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8a89480a57ec..2122b37e68e3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1339,8 +1339,8 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) break; req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb, - list); - list_move(&req->list, &list); + compl.list); + list_move(&req->compl.list, &list); req->flags &= ~REQ_F_OVERFLOW; if (cqe) { WRITE_ONCE(cqe->user_data, req->user_data); @@ -1362,8 +1362,8 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) io_cqring_ev_posted(ctx); while (!list_empty(&list)) { - req = list_first_entry(&list, struct io_kiocb, list); - list_del(&req->list); + req = list_first_entry(&list, struct io_kiocb, compl.list); + list_del(&req->compl.list); io_put_req(req); } @@ -1396,11 +1396,12 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) set_bit(0, &ctx->cq_check_overflow); ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; } + io_clean_op(req); req->flags |= REQ_F_OVERFLOW; - refcount_inc(&req->refs); req->result = res; req->cflags = cflags; - list_add_tail(&req->list, &ctx->cq_overflow_list); + refcount_inc(&req->refs); + list_add_tail(&req->compl.list, &ctx->cq_overflow_list); } } @@ -7835,7 +7836,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, if (cancel_req->flags & REQ_F_OVERFLOW) { spin_lock_irq(&ctx->completion_lock); - list_del(&cancel_req->list); + list_del(&cancel_req->compl.list); cancel_req->flags &= ~REQ_F_OVERFLOW; if (list_empty(&ctx->cq_overflow_list)) { clear_bit(0, &ctx->sq_check_overflow); -- cgit From 135fcde8496b03d31648171dbc038990112e41d5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:12 +0300 Subject: io_uring: add req->timeout.list Instead of using shared req->list, hang timeouts up on their own list entry. struct io_timeout have enough extra space for it, but if that will be a problem ->inflight_entry can reused for that. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2122b37e68e3..2544795cfd30 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -396,6 +396,7 @@ struct io_timeout { int flags; u32 off; u32 target_seq; + struct list_head list; }; struct io_rw { @@ -1213,7 +1214,7 @@ static void io_kill_timeout(struct io_kiocb *req) ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { atomic_inc(&req->ctx->cq_timeouts); - list_del_init(&req->list); + list_del_init(&req->timeout.list); req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, 0); io_put_req(req); @@ -1225,7 +1226,7 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx) struct io_kiocb *req, *tmp; spin_lock_irq(&ctx->completion_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list) + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) io_kill_timeout(req); spin_unlock_irq(&ctx->completion_lock); } @@ -1248,7 +1249,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) { while (!list_empty(&ctx->timeout_list)) { struct io_kiocb *req = list_first_entry(&ctx->timeout_list, - struct io_kiocb, list); + struct io_kiocb, timeout.list); if (io_is_timeout_noseq(req)) break; @@ -1256,7 +1257,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) - atomic_read(&ctx->cq_timeouts)) break; - list_del_init(&req->list); + list_del_init(&req->timeout.list); io_kill_timeout(req); } } @@ -4997,8 +4998,8 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) * We could be racing with timeout deletion. If the list is empty, * then timeout lookup already found it and will be handling it. */ - if (!list_empty(&req->list)) - list_del_init(&req->list); + if (!list_empty(&req->timeout.list)) + list_del_init(&req->timeout.list); io_cqring_fill_event(req, -ETIME); io_commit_cqring(ctx); @@ -5015,9 +5016,9 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) struct io_kiocb *req; int ret = -ENOENT; - list_for_each_entry(req, &ctx->timeout_list, list) { + list_for_each_entry(req, &ctx->timeout_list, timeout.list) { if (user_data == req->user_data) { - list_del_init(&req->list); + list_del_init(&req->timeout.list); ret = 0; break; } @@ -5139,7 +5140,8 @@ static int io_timeout(struct io_kiocb *req) * the one we need first. */ list_for_each_prev(entry, &ctx->timeout_list) { - struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); + struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, + timeout.list); if (io_is_timeout_noseq(nxt)) continue; @@ -5148,7 +5150,7 @@ static int io_timeout(struct io_kiocb *req) break; } add: - list_add(&req->list, entry); + list_add(&req->timeout.list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->completion_lock); -- cgit From 7d6ddea6beaf6639cf3a2b291dcdac6fe1edc584 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:13 +0300 Subject: io_uring: remove init for unused list poll*() doesn't use req->list, don't init it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2544795cfd30..1e4ac48b1557 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4964,7 +4964,6 @@ static int io_poll_add(struct io_kiocb *req) req->flags &= ~REQ_F_WORK_INITIALIZED; INIT_HLIST_NODE(&req->hash_node); - INIT_LIST_HEAD(&req->list); ipt.pt._qproc = io_poll_queue_proc; mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, -- cgit From 27dc8338e5fb0e0ed5b272e792f4ffad7f3bc03e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:14 +0300 Subject: io_uring: use non-intrusive list for defer The only left user of req->list is DRAIN, hence instead of keeping a separate per request list for it, do that with old fashion non-intrusive lists allocated on demand. That's a really slow path, so that's OK. This removes req->list and so sheds 16 bytes from io_kiocb. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1e4ac48b1557..6e6e71310785 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -641,7 +641,6 @@ struct io_kiocb { u16 buf_index; struct io_ring_ctx *ctx; - struct list_head list; unsigned int flags; refcount_t refs; struct task_struct *task; @@ -676,6 +675,11 @@ struct io_kiocb { struct callback_head task_work; }; +struct io_defer_entry { + struct list_head list; + struct io_kiocb *req; +}; + #define IO_IOPOLL_BATCH 8 struct io_comp_state { @@ -1234,14 +1238,15 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx) static void __io_queue_deferred(struct io_ring_ctx *ctx) { do { - struct io_kiocb *req = list_first_entry(&ctx->defer_list, - struct io_kiocb, list); + struct io_defer_entry *de = list_first_entry(&ctx->defer_list, + struct io_defer_entry, list); - if (req_need_defer(req)) + if (req_need_defer(de->req)) break; - list_del_init(&req->list); + list_del_init(&de->list); /* punt-init is done before queueing for defer */ - __io_queue_async_work(req); + __io_queue_async_work(de->req); + kfree(de); } while (!list_empty(&ctx->defer_list)); } @@ -5394,6 +5399,7 @@ static int io_req_defer_prep(struct io_kiocb *req, static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; + struct io_defer_entry *de; int ret; /* Still need defer if there is pending req in defer list. */ @@ -5408,15 +5414,20 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ret; } io_prep_async_link(req); + de = kmalloc(sizeof(*de), GFP_KERNEL); + if (!de) + return -ENOMEM; spin_lock_irq(&ctx->completion_lock); if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); + kfree(de); return 0; } trace_io_uring_defer(ctx, req, req->user_data); - list_add_tail(&req->list, &ctx->defer_list); + de->req = req; + list_add_tail(&de->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); return -EIOCBQUEUED; } -- cgit From 9cf7c104deaef52d6fd7c103a716e31d9815ede8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:15 +0300 Subject: io_uring: remove sequence from io_kiocb req->sequence is used only for deferred (i.e. DRAIN) requests, but initialised for every request. Remove req->sequence from io_kiocb together with its initialisation in io_init_req(). Replace it with a new field in struct io_defer_entry, that will be calculated only when needed in io_req_defer(), which is a slow path. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6e6e71310785..efa132831f3d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -639,6 +639,7 @@ struct io_kiocb { u8 iopoll_completed; u16 buf_index; + u32 result; struct io_ring_ctx *ctx; unsigned int flags; @@ -646,8 +647,6 @@ struct io_kiocb { struct task_struct *task; unsigned long fsize; u64 user_data; - u32 result; - u32 sequence; struct list_head link_list; @@ -678,6 +677,7 @@ struct io_kiocb { struct io_defer_entry { struct list_head list; struct io_kiocb *req; + u32 seq; }; #define IO_IOPOLL_BATCH 8 @@ -1090,13 +1090,13 @@ err: return NULL; } -static inline bool req_need_defer(struct io_kiocb *req) +static bool req_need_defer(struct io_kiocb *req, u32 seq) { if (unlikely(req->flags & REQ_F_IO_DRAIN)) { struct io_ring_ctx *ctx = req->ctx; - return req->sequence != ctx->cached_cq_tail - + atomic_read(&ctx->cached_cq_overflow); + return seq != ctx->cached_cq_tail + + atomic_read(&ctx->cached_cq_overflow); } return false; @@ -1241,7 +1241,7 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx) struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); - if (req_need_defer(de->req)) + if (req_need_defer(de->req, de->seq)) break; list_del_init(&de->list); /* punt-init is done before queueing for defer */ @@ -5396,14 +5396,35 @@ static int io_req_defer_prep(struct io_kiocb *req, return ret; } +static u32 io_get_sequence(struct io_kiocb *req) +{ + struct io_kiocb *pos; + struct io_ring_ctx *ctx = req->ctx; + u32 total_submitted, nr_reqs = 1; + + if (req->flags & REQ_F_LINK_HEAD) + list_for_each_entry(pos, &req->link_list, link_list) + nr_reqs++; + + total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped; + return total_submitted - nr_reqs; +} + static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; struct io_defer_entry *de; int ret; + u32 seq; /* Still need defer if there is pending req in defer list. */ - if (!req_need_defer(req) && list_empty_careful(&ctx->defer_list)) + if (likely(list_empty_careful(&ctx->defer_list) && + !(req->flags & REQ_F_IO_DRAIN))) + return 0; + + seq = io_get_sequence(req); + /* Still a chance to pass the sequence check */ + if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) return 0; if (!req->io) { @@ -5419,7 +5440,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -ENOMEM; spin_lock_irq(&ctx->completion_lock); - if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { + if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); kfree(de); return 0; @@ -5427,6 +5448,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) trace_io_uring_defer(ctx, req, req->user_data); de->req = req; + de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); return -EIOCBQUEUED; @@ -6204,12 +6226,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags; int id; - /* - * All io need record the previous position, if LINK vs DARIN, - * it can be used to mark the position of the first IO in the - * link list. - */ - req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped; req->opcode = READ_ONCE(sqe->opcode); req->user_data = READ_ONCE(sqe->user_data); req->io = NULL; -- cgit From 0f7e466b393abab86be96ffcf00af383afddc0d1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:16 +0300 Subject: io_uring: place cflags into completion data req->cflags is used only for defer-completion path, just use completion data to store it. With the 4 bytes from the ->sequence patch and compacting io_kiocb, this frees 8 bytes. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index efa132831f3d..4d0fd9ddd3dc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -491,6 +491,7 @@ struct io_statx { struct io_completion { struct file *file; struct list_head list; + int cflags; }; struct io_async_connect { @@ -633,7 +634,6 @@ struct io_kiocb { }; struct io_async_ctx *io; - int cflags; u8 opcode; /* polled IO has completed */ u8 iopoll_completed; @@ -1351,7 +1351,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (cqe) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, req->result); - WRITE_ONCE(cqe->flags, req->cflags); + WRITE_ONCE(cqe->flags, req->compl.cflags); } else { WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); @@ -1405,7 +1405,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) io_clean_op(req); req->flags |= REQ_F_OVERFLOW; req->result = res; - req->cflags = cflags; + req->compl.cflags = cflags; refcount_inc(&req->refs); list_add_tail(&req->compl.list, &ctx->cq_overflow_list); } @@ -1439,7 +1439,7 @@ static void io_submit_flush_completions(struct io_comp_state *cs) req = list_first_entry(&cs->list, struct io_kiocb, compl.list); list_del(&req->compl.list); - __io_cqring_fill_event(req, req->result, req->cflags); + __io_cqring_fill_event(req, req->result, req->compl.cflags); if (!(req->flags & REQ_F_LINK_HEAD)) { req->flags |= REQ_F_COMP_LOCKED; io_put_req(req); @@ -1465,7 +1465,7 @@ static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags, } else { io_clean_op(req); req->result = res; - req->cflags = cflags; + req->compl.cflags = cflags; list_add_tail(&req->compl.list, &cs->list); if (++cs->nr >= 32) io_submit_flush_completions(cs); -- cgit From dca9cf8b87f55c96f072c1fc6bc90e2b97a8e19f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jul 2020 12:46:49 +0300 Subject: io_uring: inline io_req_work_grab_env() The only caller of io_req_work_grab_env() is io_prep_async_work(), and they are both initialising req->work. Inline grab_env(), it's easier to keep this way, moreover there already were bugs with misplacing io_req_init_async(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 50 ++++++++++++++++++++------------------------------ 1 file changed, 20 insertions(+), 30 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4d0fd9ddd3dc..a06d5b9cc046 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1115,31 +1115,7 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static void io_req_work_grab_env(struct io_kiocb *req) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - - io_req_init_async(req); - - if (!req->work.mm && def->needs_mm) { - mmgrab(current->mm); - req->work.mm = current->mm; - } - if (!req->work.creds) - req->work.creds = get_current_cred(); - if (!req->work.fs && def->needs_fs) { - spin_lock(¤t->fs->lock); - if (!current->fs->in_exec) { - req->work.fs = current->fs; - req->work.fs->users++; - } else { - req->work.flags |= IO_WQ_WORK_CANCEL; - } - spin_unlock(¤t->fs->lock); - } -} - -static inline void io_req_work_drop_env(struct io_kiocb *req) +static void io_req_clean_work(struct io_kiocb *req) { if (!(req->flags & REQ_F_WORK_INITIALIZED)) return; @@ -1177,8 +1153,22 @@ static void io_prep_async_work(struct io_kiocb *req) if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } - - io_req_work_grab_env(req); + if (!req->work.mm && def->needs_mm) { + mmgrab(current->mm); + req->work.mm = current->mm; + } + if (!req->work.creds) + req->work.creds = get_current_cred(); + if (!req->work.fs && def->needs_fs) { + spin_lock(¤t->fs->lock); + if (!current->fs->in_exec) { + req->work.fs = current->fs; + req->work.fs->users++; + } else { + req->work.flags |= IO_WQ_WORK_CANCEL; + } + spin_unlock(¤t->fs->lock); + } } static void io_prep_async_link(struct io_kiocb *req) @@ -1547,7 +1537,7 @@ static void io_dismantle_req(struct io_kiocb *req) if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); __io_put_req_task(req); - io_req_work_drop_env(req); + io_req_clean_work(req); if (req->flags & REQ_F_INFLIGHT) { struct io_ring_ctx *ctx = req->ctx; @@ -4825,7 +4815,7 @@ static bool io_poll_remove_one(struct io_kiocb *req) io_put_req(req); /* * restore ->work because we will call - * io_req_work_drop_env below when dropping the + * io_req_clean_work below when dropping the * final reference. */ if (req->flags & REQ_F_WORK_INITIALIZED) @@ -4965,7 +4955,7 @@ static int io_poll_add(struct io_kiocb *req) __poll_t mask; /* ->work is in union with hash_node and others */ - io_req_work_drop_env(req); + io_req_clean_work(req); req->flags &= ~REQ_F_WORK_INITIALIZED; INIT_HLIST_NODE(&req->hash_node); -- cgit From 1c2da9e8839d6437b43f2c805411d1a0cbd70165 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jul 2020 12:46:50 +0300 Subject: io_uring: remove empty cleanup of OP_OPEN* reqs A switch in __io_clean_op() doesn't have default, it's pointless to list opcodes that doesn't do any cleanup. Remove IORING_OP_OPEN* from there. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index a06d5b9cc046..8d6f1c4e8dac 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5473,9 +5473,6 @@ static void __io_clean_op(struct io_kiocb *req) if (req->flags & REQ_F_BUFFER_SELECTED) kfree(req->sr_msg.kbuf); break; - case IORING_OP_OPENAT: - case IORING_OP_OPENAT2: - break; case IORING_OP_SPLICE: case IORING_OP_TEE: io_put_file(req, req->splice.file_in, -- cgit From 327d6d968b195cfc48ff97c49b56520aac922f65 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jul 2020 12:46:51 +0300 Subject: io_uring: alloc ->io in io_req_defer_prep() Every call to io_req_defer_prep() is prepended with allocating ->io, just do that in the function. And while we're at it, mark error paths with unlikey and replace "if (ret < 0)" with "if (ret)". There is only one change in the observable behaviour, that's instead of killing the head request right away on error, it postpones it until the link is assembled, that looks more preferable. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8d6f1c4e8dac..6a1cd2aea018 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5279,6 +5279,9 @@ static int io_req_defer_prep(struct io_kiocb *req, if (!sqe) return 0; + if (io_alloc_async_ctx(req)) + return -EAGAIN; + if (io_op_defs[req->opcode].file_table) { io_req_init_async(req); ret = io_grab_files(req); @@ -5418,10 +5421,8 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; if (!req->io) { - if (io_alloc_async_ctx(req)) - return -EAGAIN; ret = io_req_defer_prep(req, sqe); - if (ret < 0) + if (ret) return ret; } io_prep_async_link(req); @@ -6024,11 +6025,8 @@ fail_req: } } else if (req->flags & REQ_F_FORCE_ASYNC) { if (!req->io) { - ret = -EAGAIN; - if (io_alloc_async_ctx(req)) - goto fail_req; ret = io_req_defer_prep(req, sqe); - if (unlikely(ret < 0)) + if (unlikely(ret)) goto fail_req; } @@ -6081,11 +6079,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, head->flags |= REQ_F_IO_DRAIN; ctx->drain_next = 1; } - if (io_alloc_async_ctx(req)) - return -EAGAIN; - ret = io_req_defer_prep(req, sqe); - if (ret) { + if (unlikely(ret)) { /* fail even hard links since we don't submit */ head->flags |= REQ_F_FAIL_LINK; return ret; @@ -6108,11 +6103,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->flags |= REQ_F_LINK_HEAD; INIT_LIST_HEAD(&req->link_list); - if (io_alloc_async_ctx(req)) - return -EAGAIN; - ret = io_req_defer_prep(req, sqe); - if (ret) + if (unlikely(ret)) req->flags |= REQ_F_FAIL_LINK; *link = req; } else { -- cgit From 57f1a64958543fe18a7fe0addbfb31bb2ceeaea2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jul 2020 12:46:52 +0300 Subject: io_uring/io-wq: move RLIMIT_FSIZE to io-wq RLIMIT_SIZE in needed only for execution from an io-wq context, hence move all preparations from hot path to io-wq work setup. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6a1cd2aea018..8b2f7a1bbd06 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -645,7 +645,6 @@ struct io_kiocb { unsigned int flags; refcount_t refs; struct task_struct *task; - unsigned long fsize; u64 user_data; struct list_head link_list; @@ -736,6 +735,7 @@ struct io_op_def { unsigned pollout : 1; /* op supports buffer selection */ unsigned buffer_select : 1; + unsigned needs_fsize : 1; }; static const struct io_op_def io_op_defs[] = { @@ -755,6 +755,7 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .needs_fsize = 1, }, [IORING_OP_FSYNC] = { .needs_file = 1, @@ -769,6 +770,7 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .needs_fsize = 1, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -821,6 +823,7 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_FALLOCATE] = { .needs_file = 1, + .needs_fsize = 1, }, [IORING_OP_OPENAT] = { .file_table = 1, @@ -852,6 +855,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .needs_fsize = 1, }, [IORING_OP_FADVISE] = { .needs_file = 1, @@ -1169,6 +1173,10 @@ static void io_prep_async_work(struct io_kiocb *req) } spin_unlock(¤t->fs->lock); } + if (def->needs_fsize) + req->work.fsize = rlimit(RLIMIT_FSIZE); + else + req->work.fsize = RLIM_INFINITY; } static void io_prep_async_link(struct io_kiocb *req) @@ -3072,8 +3080,6 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(!(req->file->f_mode & FMODE_WRITE))) return -EBADF; - req->fsize = rlimit(RLIMIT_FSIZE); - /* either don't need iovec imported or already have it */ if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; @@ -3130,17 +3136,11 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, } kiocb->ki_flags |= IOCB_WRITE; - if (!force_nonblock) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; - if (req->file->f_op->write_iter) ret2 = call_write_iter(req->file, kiocb, &iter); else ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); - if (!force_nonblock) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just * retry them without IOCB_NOWAIT. @@ -3335,7 +3335,6 @@ static int io_fallocate_prep(struct io_kiocb *req, req->sync.off = READ_ONCE(sqe->off); req->sync.len = READ_ONCE(sqe->addr); req->sync.mode = READ_ONCE(sqe->len); - req->fsize = rlimit(RLIMIT_FSIZE); return 0; } @@ -3346,11 +3345,8 @@ static int io_fallocate(struct io_kiocb *req, bool force_nonblock) /* fallocate always requiring blocking context */ if (force_nonblock) return -EAGAIN; - - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, req->sync.len); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; if (ret < 0) req_set_fail_links(req); io_req_complete(req, ret); -- cgit From 06ef3608b0eed673fcbc62cf74c8d3ad0007a337 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:33 +0300 Subject: io_uring: simplify file ref tracking in submission state Currently, file refs in struct io_submit_state are tracked with 2 vars: @has_refs -- how many refs were initially taken @used_refs -- number of refs used Replace it with a single variable counting how many refs left at the current moment. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8b2f7a1bbd06..28b47533454a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -707,7 +707,6 @@ struct io_submit_state { struct file *file; unsigned int fd; unsigned int has_refs; - unsigned int used_refs; unsigned int ios_left; }; @@ -2327,10 +2326,8 @@ static void io_iopoll_req_issued(struct io_kiocb *req) static void __io_state_file_put(struct io_submit_state *state) { - int diff = state->has_refs - state->used_refs; - - if (diff) - fput_many(state->file, diff); + if (state->has_refs) + fput_many(state->file, state->has_refs); state->file = NULL; } @@ -2352,7 +2349,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) if (state->file) { if (state->fd == fd) { - state->used_refs++; + state->has_refs--; state->ios_left--; return state->file; } @@ -2363,9 +2360,8 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) return NULL; state->fd = fd; - state->has_refs = state->ios_left; - state->used_refs = 1; state->ios_left--; + state->has_refs = state->ios_left; return state->file; } -- cgit From 7a7cacba8b4560403615b04d57bdcd1f93f90f10 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:27:59 +0300 Subject: io_uring: indent left {send,recv}[msg]() Flip over "if (sock)" condition with return on error, the upper layer will take care. That change will be handy later, but already removes an extra jump from hot path. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 265 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 131 insertions(+), 134 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 28b47533454a..264b1e5e2d54 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3916,42 +3916,41 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { - struct io_async_msghdr *kmsg = NULL; + struct io_async_msghdr iomsg, *kmsg = NULL; struct socket *sock; + unsigned flags; int ret; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_async_msghdr iomsg; - unsigned flags; - - if (req->io) { - kmsg = &req->io->msg; - kmsg->msg.msg_name = &req->io->msg.addr; - /* if iov is set, it's allocated already */ - if (!kmsg->iov) - kmsg->iov = kmsg->fast_iov; - kmsg->msg.msg_iter.iov = kmsg->iov; - } else { - ret = io_sendmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; + if (unlikely(!sock)) + return ret; - ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); - if (force_nonblock && ret == -EAGAIN) - return io_setup_async_msg(req, kmsg); - if (ret == -ERESTARTSYS) - ret = -EINTR; + if (req->io) { + kmsg = &req->io->msg; + kmsg->msg.msg_name = &req->io->msg.addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; + } else { + ret = io_sendmsg_copy_hdr(req, &iomsg); + if (ret) + return ret; + kmsg = &iomsg; } + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); + if (force_nonblock && ret == -EAGAIN) + return io_setup_async_msg(req, kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; @@ -3964,39 +3963,38 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, static int io_send(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; struct socket *sock; + unsigned flags; int ret; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_sr_msg *sr = &req->sr_msg; - struct msghdr msg; - struct iovec iov; - unsigned flags; + if (unlikely(!sock)) + return ret; - ret = import_single_range(WRITE, sr->buf, sr->len, &iov, - &msg.msg_iter); - if (ret) - return ret; + ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); + if (unlikely(ret)) + return ret; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; - msg.msg_flags = flags; - ret = sock_sendmsg(sock, &msg); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - } + msg.msg_flags = flags; + ret = sock_sendmsg(sock, &msg); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; if (ret < 0) req_set_fail_links(req); @@ -4149,62 +4147,62 @@ static int io_recvmsg_prep(struct io_kiocb *req, static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { - struct io_async_msghdr *kmsg = NULL; + struct io_async_msghdr iomsg, *kmsg = NULL; struct socket *sock; + struct io_buffer *kbuf; + unsigned flags; int ret, cflags = 0; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_buffer *kbuf; - struct io_async_msghdr iomsg; - unsigned flags; - - if (req->io) { - kmsg = &req->io->msg; - kmsg->msg.msg_name = &req->io->msg.addr; - /* if iov is set, it's allocated already */ - if (!kmsg->iov) - kmsg->iov = kmsg->fast_iov; - kmsg->msg.msg_iter.iov = kmsg->iov; - } else { - ret = io_recvmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } + if (unlikely(!sock)) + return ret; - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); - if (IS_ERR(kbuf)) { - return PTR_ERR(kbuf); - } else if (kbuf) { - kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); - iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov, - 1, req->sr_msg.len); - } + if (req->io) { + kmsg = &req->io->msg; + kmsg->msg.msg_name = &req->io->msg.addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; + } else { + ret = io_recvmsg_copy_hdr(req, &iomsg); + if (ret) + return ret; + kmsg = &iomsg; + } - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; + kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + if (IS_ERR(kbuf)) { + return PTR_ERR(kbuf); + } else if (kbuf) { + kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); + iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov, + 1, req->sr_msg.len); + } - ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, - kmsg->uaddr, flags); - if (force_nonblock && ret == -EAGAIN) { - ret = io_setup_async_msg(req, kmsg); - if (ret != -EAGAIN) - kfree(kbuf); - return ret; - } - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (kbuf) + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, + kmsg->uaddr, flags); + if (force_nonblock && ret == -EAGAIN) { + ret = io_setup_async_msg(req, kmsg); + if (ret != -EAGAIN) kfree(kbuf); + return ret; } + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (kbuf) + kfree(kbuf); if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret < 0) req_set_fail_links(req); __io_req_complete(req, ret, cflags, cs); @@ -4215,51 +4213,50 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { struct io_buffer *kbuf = NULL; + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + void __user *buf = sr->buf; struct socket *sock; + struct iovec iov; + unsigned flags; int ret, cflags = 0; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_sr_msg *sr = &req->sr_msg; - void __user *buf = sr->buf; - struct msghdr msg; - struct iovec iov; - unsigned flags; - - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - else if (kbuf) - buf = u64_to_user_ptr(kbuf->addr); + if (unlikely(!sock)) + return ret; - ret = import_single_range(READ, buf, sr->len, &iov, - &msg.msg_iter); - if (ret) { - kfree(kbuf); - return ret; - } + kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + else if (kbuf) + buf = u64_to_user_ptr(kbuf->addr); - req->flags |= REQ_F_NEED_CLEANUP; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - msg.msg_iocb = NULL; - msg.msg_flags = 0; - - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; - - ret = sock_recvmsg(sock, &msg, flags); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; + ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); + if (unlikely(ret)) { + kfree(kbuf); + return ret; } + req->flags |= REQ_F_NEED_CLEANUP; + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + msg.msg_iocb = NULL; + msg.msg_flags = 0; + + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = sock_recvmsg(sock, &msg, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + kfree(kbuf); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) -- cgit From 6b754c8b912a164fbb15b7b839d51709c3d9ee6f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:00 +0300 Subject: io_uring: remove extra checks in send/recv With the return on a bad socket, kmsg is always non-null by the end of the function, prune left extra checks and initialisations. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 264b1e5e2d54..ac3c16ea7d23 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3916,7 +3916,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { - struct io_async_msghdr iomsg, *kmsg = NULL; + struct io_async_msghdr iomsg, *kmsg; struct socket *sock; unsigned flags; int ret; @@ -3951,7 +3951,7 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, if (ret == -ERESTARTSYS) ret = -EINTR; - if (kmsg && kmsg->iov != kmsg->fast_iov) + if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) @@ -4147,7 +4147,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { - struct io_async_msghdr iomsg, *kmsg = NULL; + struct io_async_msghdr iomsg, *kmsg; struct socket *sock; struct io_buffer *kbuf; unsigned flags; @@ -4199,7 +4199,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, if (kbuf) kfree(kbuf); - if (kmsg && kmsg->iov != kmsg->fast_iov) + if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; @@ -4212,7 +4212,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, static int io_recv(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { - struct io_buffer *kbuf = NULL; + struct io_buffer *kbuf; struct io_sr_msg *sr = &req->sr_msg; struct msghdr msg; void __user *buf = sr->buf; -- cgit From 14c32eee9286621dd437b53460e44bd11e5bc08d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:01 +0300 Subject: io_uring: don't forget cflags in io_recv() Instead of returning error from io_recv(), go through generic cleanup path, because it'll retain cflags for userspace. Do the same for io_send() for consistency. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index ac3c16ea7d23..2ffacfbf9094 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3976,7 +3976,7 @@ static int io_send(struct io_kiocb *req, bool force_nonblock, ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); if (unlikely(ret)) - return ret; + return ret;; msg.msg_name = NULL; msg.msg_control = NULL; @@ -4232,10 +4232,8 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, buf = u64_to_user_ptr(kbuf->addr); ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); - if (unlikely(ret)) { - kfree(kbuf); - return ret; - } + if (unlikely(ret)) + goto out_free; req->flags |= REQ_F_NEED_CLEANUP; msg.msg_name = NULL; @@ -4256,7 +4254,7 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; - +out_free: kfree(kbuf); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) -- cgit From 0e1b6fe3d1e5f1b79c5bec37881c98febfba7718 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:02 +0300 Subject: io_uring: free selected-bufs if error'ed io_clean_op() may be skipped even if there is a selected io_buffer, that's because *select_buffer() funcions never set REQ_F_NEED_CLEANUP. Trigger io_clean_op() when REQ_F_BUFFER_SELECTED is set as well, and and clear the flag if was freed out of it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 85 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 41 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2ffacfbf9094..4448b1e9a754 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -957,7 +957,7 @@ static void io_get_req_task(struct io_kiocb *req) static inline void io_clean_op(struct io_kiocb *req) { - if (req->flags & REQ_F_NEED_CLEANUP) + if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED)) __io_clean_op(req); } @@ -1931,6 +1931,7 @@ static int io_put_kbuf(struct io_kiocb *req) cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; cflags |= IORING_CQE_F_BUFFER; req->rw.addr = 0; + req->flags &= ~REQ_F_BUFFER_SELECTED; kfree(kbuf); return cflags; } @@ -4188,20 +4189,16 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, kmsg->uaddr, flags); - if (force_nonblock && ret == -EAGAIN) { - ret = io_setup_async_msg(req, kmsg); - if (ret != -EAGAIN) - kfree(kbuf); - return ret; - } + if (force_nonblock && ret == -EAGAIN) + return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; + if (kbuf) kfree(kbuf); - if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); - req->flags &= ~REQ_F_NEED_CLEANUP; + req->flags &= ~(REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED); if (ret < 0) req_set_fail_links(req); @@ -4235,7 +4232,6 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, if (unlikely(ret)) goto out_free; - req->flags |= REQ_F_NEED_CLEANUP; msg.msg_name = NULL; msg.msg_control = NULL; msg.msg_controllen = 0; @@ -4255,7 +4251,8 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, if (ret == -ERESTARTSYS) ret = -EINTR; out_free: - kfree(kbuf); + if (kbuf) + kfree(kbuf); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); @@ -5436,39 +5433,45 @@ static void __io_clean_op(struct io_kiocb *req) { struct io_async_ctx *io = req->io; - switch (req->opcode) { - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - if (req->flags & REQ_F_BUFFER_SELECTED) + if (req->flags & REQ_F_BUFFER_SELECTED) { + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: kfree((void *)(unsigned long)req->rw.addr); - /* fallthrough */ - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - if (io->rw.iov != io->rw.fast_iov) - kfree(io->rw.iov); - break; - case IORING_OP_RECVMSG: - if (req->flags & REQ_F_BUFFER_SELECTED) - kfree(req->sr_msg.kbuf); - /* fallthrough */ - case IORING_OP_SENDMSG: - if (io->msg.iov != io->msg.fast_iov) - kfree(io->msg.iov); - break; - case IORING_OP_RECV: - if (req->flags & REQ_F_BUFFER_SELECTED) + break; + case IORING_OP_RECVMSG: + case IORING_OP_RECV: kfree(req->sr_msg.kbuf); - break; - case IORING_OP_SPLICE: - case IORING_OP_TEE: - io_put_file(req, req->splice.file_in, - (req->splice.flags & SPLICE_F_FD_IN_FIXED)); - break; + break; + } + req->flags &= ~REQ_F_BUFFER_SELECTED; + } + + if (req->flags & REQ_F_NEED_CLEANUP) { + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: + if (io->rw.iov != io->rw.fast_iov) + kfree(io->rw.iov); + break; + case IORING_OP_RECVMSG: + case IORING_OP_SENDMSG: + if (io->msg.iov != io->msg.fast_iov) + kfree(io->msg.iov); + break; + case IORING_OP_SPLICE: + case IORING_OP_TEE: + io_put_file(req, req->splice.file_in, + (req->splice.flags & SPLICE_F_FD_IN_FIXED)); + break; + } + req->flags &= ~REQ_F_NEED_CLEANUP; } - - req->flags &= ~REQ_F_NEED_CLEANUP; } static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, -- cgit From bc02ef3325e3ef524ef29b65681ca4207b781224 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:03 +0300 Subject: io_uring: move BUFFER_SELECT check into *recv[msg] Move REQ_F_BUFFER_SELECT flag check out of io_recv_buffer_select(), and do that in its call sites That saves us from double error checking and possibly an extra function call. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4448b1e9a754..8dd9037e332e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4098,9 +4098,6 @@ static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, struct io_sr_msg *sr = &req->sr_msg; struct io_buffer *kbuf; - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return NULL; - kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock); if (IS_ERR(kbuf)) return kbuf; @@ -4150,7 +4147,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, { struct io_async_msghdr iomsg, *kmsg; struct socket *sock; - struct io_buffer *kbuf; + struct io_buffer *kbuf = NULL; unsigned flags; int ret, cflags = 0; @@ -4172,10 +4169,10 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, kmsg = &iomsg; } - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); - if (IS_ERR(kbuf)) { - return PTR_ERR(kbuf); - } else if (kbuf) { + if (req->flags & REQ_F_BUFFER_SELECT) { + kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov, 1, req->sr_msg.len); @@ -4222,11 +4219,12 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, if (unlikely(!sock)) return ret; - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - else if (kbuf) + if (req->flags & REQ_F_BUFFER_SELECT) { + kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); buf = u64_to_user_ptr(kbuf->addr); + } ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); if (unlikely(ret)) -- cgit From 8ff069bf2efd7b7aeb90b56ea8edc165c93d8940 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:04 +0300 Subject: io_uring: extract io_put_kbuf() helper Extract a common helper for cleaning up a selected buffer, this will be used shortly. By the way, correct cflags types to unsigned and, as kbufs are anyway tracked by a flag, remove useless zeroing req->rw.addr. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8dd9037e332e..871ada2a29c3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1922,20 +1922,25 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } -static int io_put_kbuf(struct io_kiocb *req) +static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf) { - struct io_buffer *kbuf; - int cflags; + unsigned int cflags; - kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; cflags |= IORING_CQE_F_BUFFER; - req->rw.addr = 0; req->flags &= ~REQ_F_BUFFER_SELECTED; kfree(kbuf); return cflags; } +static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) +{ + struct io_buffer *kbuf; + + kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; + return io_put_kbuf(req, kbuf); +} + static inline bool io_run_task_work(void) { if (current->task_works) { @@ -1985,7 +1990,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, list_del(&req->inflight_entry); if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_kbuf(req); + cflags = io_put_rw_kbuf(req); __io_cqring_fill_event(req, req->result, cflags); (*nr_events)++; @@ -2177,7 +2182,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res, if (res != req->result) req_set_fail_links(req); if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_kbuf(req); + cflags = io_put_rw_kbuf(req); __io_req_complete(req, res, cflags, cs); } -- cgit From 7fbb1b541f4286cc337b9bca1e5bad0ce4ee978c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:05 +0300 Subject: io_uring: don't open-code recv kbuf managment Don't implement fast path of kbuf freeing and management inlined into io_recv{,msg}(), that's error prone and duplicates handling. Replace it with a helper io_put_recv_kbuf(), which mimics io_put_rw_kbuf() in the io_read/write(). This also keeps cflags calculation in one place, removing duplication between rw and recv/send. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 871ada2a29c3..6e5ea7991c08 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4098,7 +4098,7 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, } static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, - int *cflags, bool needs_lock) + bool needs_lock) { struct io_sr_msg *sr = &req->sr_msg; struct io_buffer *kbuf; @@ -4109,12 +4109,14 @@ static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, sr->kbuf = kbuf; req->flags |= REQ_F_BUFFER_SELECTED; - - *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; - *cflags |= IORING_CQE_F_BUFFER; return kbuf; } +static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) +{ + return io_put_kbuf(req, req->sr_msg.kbuf); +} + static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -4152,7 +4154,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, { struct io_async_msghdr iomsg, *kmsg; struct socket *sock; - struct io_buffer *kbuf = NULL; + struct io_buffer *kbuf; unsigned flags; int ret, cflags = 0; @@ -4175,7 +4177,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, } if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + kbuf = io_recv_buffer_select(req, !force_nonblock); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); @@ -4196,12 +4198,11 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, if (ret == -ERESTARTSYS) ret = -EINTR; - if (kbuf) - kfree(kbuf); + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_recv_kbuf(req); if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); - req->flags &= ~(REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED); - + req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); __io_req_complete(req, ret, cflags, cs); @@ -4225,7 +4226,7 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, return ret; if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + kbuf = io_recv_buffer_select(req, !force_nonblock); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); buf = u64_to_user_ptr(kbuf->addr); @@ -4254,9 +4255,8 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, if (ret == -ERESTARTSYS) ret = -EINTR; out_free: - if (kbuf) - kfree(kbuf); - req->flags &= ~REQ_F_NEED_CLEANUP; + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_recv_kbuf(req); if (ret < 0) req_set_fail_links(req); __io_req_complete(req, ret, cflags, cs); -- cgit From 5dbcad51f78434e782d0470b8b5fc4380700c35f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 18 Jul 2020 11:31:20 +0300 Subject: io_uring: don't miscount pinned memory io_sqe_buffer_unregister() uses cxt->sqo_mm for memory accounting, but io_ring_ctx_free() drops ->sqo_mm before leaving pinned_vm over-accounted. Postpone mm cleanup for when it's not needed anymore. Fixes: 309758254ea62 ("io_uring: report pinned memory usage") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6e5ea7991c08..ba7ce103667b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7670,12 +7670,12 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) static void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_finish_async(ctx); + io_sqe_buffer_unregister(ctx); if (ctx->sqo_mm) { mmdrop(ctx->sqo_mm); ctx->sqo_mm = NULL; } - io_sqe_buffer_unregister(ctx); io_sqe_files_unregister(ctx); io_eventfd_unregister(ctx); io_destroy_buffers(ctx); -- cgit From cbcf72148da4af55ea81cfb351ea7c026ff1014f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 18 Jul 2020 11:31:21 +0300 Subject: io_uring: return locked and pinned page accounting Locked and pinned memory accounting in io_{,un}account_mem() depends on having ->sqo_mm, which is NULL after a recent change for non SQPOLL'ed io_ring. That disables the accounting. Return ->sqo_mm initialisation back, and do __io_sq_thread_acquire_mm() based on IORING_SETUP_SQPOLL flag. Fixes: 8eb06d7e8dd85 ("io_uring: fix missing ->mm on exit") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index ba7ce103667b..680b16f71a03 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -981,7 +981,8 @@ static void io_sq_thread_drop_mm(void) static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) { if (!current->mm) { - if (unlikely(!ctx->sqo_mm || !mmget_not_zero(ctx->sqo_mm))) + if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) || + !mmget_not_zero(ctx->sqo_mm))) return -EFAULT; kthread_use_mm(ctx->sqo_mm); } @@ -7259,10 +7260,10 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, { int ret; - if (ctx->flags & IORING_SETUP_SQPOLL) { - mmgrab(current->mm); - ctx->sqo_mm = current->mm; + mmgrab(current->mm); + ctx->sqo_mm = current->mm; + if (ctx->flags & IORING_SETUP_SQPOLL) { ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto err; -- cgit From 5af1d13e8f0d8839db04a71ec786f369b0e67234 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 18 Jul 2020 11:32:52 +0300 Subject: io_uring: batch put_task_struct() As every iopoll request have a task ref, it becomes expensive to put them one by one, instead we can put several at once integrating that into io_req_free_batch(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 680b16f71a03..3a415d924b93 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1544,7 +1544,6 @@ static void io_dismantle_req(struct io_kiocb *req) kfree(req->io); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - __io_put_req_task(req); io_req_clean_work(req); if (req->flags & REQ_F_INFLIGHT) { @@ -1564,6 +1563,7 @@ static void __io_free_req(struct io_kiocb *req) struct io_ring_ctx *ctx; io_dismantle_req(req); + __io_put_req_task(req); ctx = req->ctx; if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); @@ -1807,8 +1807,18 @@ static void io_free_req(struct io_kiocb *req) struct req_batch { void *reqs[IO_IOPOLL_BATCH]; int to_free; + + struct task_struct *task; + int task_refs; }; +static inline void io_init_req_batch(struct req_batch *rb) +{ + rb->to_free = 0; + rb->task_refs = 0; + rb->task = NULL; +} + static void __io_req_free_batch_flush(struct io_ring_ctx *ctx, struct req_batch *rb) { @@ -1822,6 +1832,10 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx, { if (rb->to_free) __io_req_free_batch_flush(ctx, rb); + if (rb->task) { + put_task_struct_many(rb->task, rb->task_refs); + rb->task = NULL; + } } static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) @@ -1833,6 +1847,17 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) if (req->flags & REQ_F_LINK_HEAD) io_queue_next(req); + if (req->flags & REQ_F_TASK_PINNED) { + if (req->task != rb->task) { + if (rb->task) + put_task_struct_many(rb->task, rb->task_refs); + rb->task = req->task; + rb->task_refs = 0; + } + rb->task_refs++; + req->flags &= ~REQ_F_TASK_PINNED; + } + io_dismantle_req(req); rb->reqs[rb->to_free++] = req; if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) @@ -1978,7 +2003,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, /* order with ->result store in io_complete_rw_iopoll() */ smp_rmb(); - rb.to_free = 0; + io_init_req_batch(&rb); while (!list_empty(done)) { int cflags = 0; -- cgit From 23b3628e45924419399da48c2b3a522b05557c91 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Thu, 23 Jul 2020 20:57:24 +0800 Subject: io_uring: clear IORING_SQ_NEED_WAKEUP after executing task works In io_sq_thread(), if there are task works to handle, current codes will skip schedule() and go on polling sq again, but forget to clear IORING_SQ_NEED_WAKEUP flag, fix this issue. Also add two helpers to set and clear IORING_SQ_NEED_WAKEUP flag, Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3a415d924b93..6f3f18a99f4f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6344,6 +6344,21 @@ fail_req: return submitted; } +static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) +{ + /* Tell userspace we may need a wakeup call */ + spin_lock_irq(&ctx->completion_lock); + ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); +} + +static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) +{ + spin_lock_irq(&ctx->completion_lock); + ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); +} + static int io_sq_thread(void *data) { struct io_ring_ctx *ctx = data; @@ -6417,10 +6432,7 @@ static int io_sq_thread(void *data) continue; } - /* Tell userspace we may need a wakeup call */ - spin_lock_irq(&ctx->completion_lock); - ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; - spin_unlock_irq(&ctx->completion_lock); + io_ring_set_wakeup_flag(ctx); to_submit = io_sqring_entries(ctx); if (!to_submit || ret == -EBUSY) { @@ -6430,6 +6442,7 @@ static int io_sq_thread(void *data) } if (io_run_task_work()) { finish_wait(&ctx->sqo_wait, &wait); + io_ring_clear_wakeup_flag(ctx); continue; } if (signal_pending(current)) @@ -6437,17 +6450,13 @@ static int io_sq_thread(void *data) schedule(); finish_wait(&ctx->sqo_wait, &wait); - spin_lock_irq(&ctx->completion_lock); - ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; - spin_unlock_irq(&ctx->completion_lock); + io_ring_clear_wakeup_flag(ctx); ret = 0; continue; } finish_wait(&ctx->sqo_wait, &wait); - spin_lock_irq(&ctx->completion_lock); - ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; - spin_unlock_irq(&ctx->completion_lock); + io_ring_clear_wakeup_flag(ctx); } mutex_lock(&ctx->uring_lock); -- cgit From ae34817bd93e373a03203a4c6892735c430a14e1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jul 2020 20:25:20 +0300 Subject: io_uring: don't do opcode prep twice Calling into opcode prep handlers may be dangerous, as they re-read SQE but might not re-initialise requests completely. If io_req_defer() passed fast checks and is done with preparations, punt it async. As all other cases are covered with nulling @sqe, this guarantees that io_[opcode]_prep() are visited only once per request. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6f3f18a99f4f..38e4c3902963 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5447,7 +5447,8 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); kfree(de); - return 0; + io_queue_async_work(req); + return -EIOCBQUEUED; } trace_io_uring_defer(ctx, req, req->user_data); -- cgit From f56040b81999871973d21f334b4657957422c90e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jul 2020 20:25:21 +0300 Subject: io_uring: deduplicate io_grab_files() calls Move io_req_init_async() into io_grab_files(), it's safer this way. Note that io_queue_async_work() does *init_async(), so it's valid to move out of __io_queue_sqe() punt path. Also, add a helper around io_grab_files(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 38e4c3902963..c7e8e9a1b27b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -912,7 +912,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); -static int io_grab_files(struct io_kiocb *req); +static int io_prep_work_files(struct io_kiocb *req); static void io_complete_rw_common(struct kiocb *kiocb, long res, struct io_comp_state *cs); static void __io_clean_op(struct io_kiocb *req); @@ -5294,13 +5294,9 @@ static int io_req_defer_prep(struct io_kiocb *req, if (io_alloc_async_ctx(req)) return -EAGAIN; - - if (io_op_defs[req->opcode].file_table) { - io_req_init_async(req); - ret = io_grab_files(req); - if (unlikely(ret)) - return ret; - } + ret = io_prep_work_files(req); + if (unlikely(ret)) + return ret; switch (req->opcode) { case IORING_OP_NOP: @@ -5851,6 +5847,8 @@ static int io_grab_files(struct io_kiocb *req) int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; + io_req_init_async(req); + if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE)) return 0; if (!ctx->ring_file) @@ -5876,6 +5874,13 @@ static int io_grab_files(struct io_kiocb *req) return ret; } +static inline int io_prep_work_files(struct io_kiocb *req) +{ + if (!io_op_defs[req->opcode].file_table) + return 0; + return io_grab_files(req); +} + static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, @@ -5987,14 +5992,9 @@ again: goto exit; } punt: - io_req_init_async(req); - - if (io_op_defs[req->opcode].file_table) { - ret = io_grab_files(req); - if (ret) - goto err; - } - + ret = io_prep_work_files(req); + if (unlikely(ret)) + goto err; /* * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. -- cgit From b65e0dd6a2de050d3fc4c0db4969a245f4e7273e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jul 2020 14:41:58 +0300 Subject: io_uring: mark ->work uninitialised after cleanup Remove REQ_F_WORK_INITIALIZED after io_req_clean_work(). That's a cold path but is safer for those using io_req_clean_work() out of *dismantle_req()/*io_free(). And for the same reason zero work.fs Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index c7e8e9a1b27b..59f1f473ffc7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1141,7 +1141,9 @@ static void io_req_clean_work(struct io_kiocb *req) spin_unlock(&req->work.fs->lock); if (fs) free_fs_struct(fs); + req->work.fs = NULL; } + req->flags &= ~REQ_F_WORK_INITIALIZED; } static void io_prep_async_work(struct io_kiocb *req) @@ -4969,7 +4971,6 @@ static int io_poll_add(struct io_kiocb *req) /* ->work is in union with hash_node and others */ io_req_clean_work(req); - req->flags &= ~REQ_F_WORK_INITIALIZED; INIT_HLIST_NODE(&req->hash_node); ipt.pt._qproc = io_poll_queue_proc; -- cgit From f063c5477eb392c315aa25ad538b4920b367ea05 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jul 2020 14:41:59 +0300 Subject: io_uring: fix missing io_queue_linked_timeout() Whoever called io_prep_linked_timeout() should also do io_queue_linked_timeout(). __io_queue_sqe() doesn't follow that for the punting path leaving linked timeouts prepared but never queued. Fixes: 6df1db6b54243 ("io_uring: fix mis-refcounting linked timeouts") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 59f1f473ffc7..3e406bc1f855 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5987,20 +5987,20 @@ again: * doesn't support non-blocking read/write attempts */ if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { - if (io_arm_poll_handler(req)) { - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); - goto exit; - } + if (!io_arm_poll_handler(req)) { punt: - ret = io_prep_work_files(req); - if (unlikely(ret)) - goto err; - /* - * Queued up for async execution, worker will release - * submit reference when the iocb is actually submitted. - */ - io_queue_async_work(req); + ret = io_prep_work_files(req); + if (unlikely(ret)) + goto err; + /* + * Queued up for async execution, worker will release + * submit reference when the iocb is actually submitted. + */ + io_queue_async_work(req); + } + + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); goto exit; } -- cgit From 010e8e6be2194678f7e4bb3044c088bbee779f57 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:45 +0300 Subject: io_uring: de-unionise io_kiocb As io_kiocb have enough space, move ->work out of a union. It's safer this way and removes ->work memcpy bouncing. By the way make tabulation in struct io_kiocb consistent. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 59 ++++++++++++++--------------------------------------------- 1 file changed, 14 insertions(+), 45 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3e406bc1f855..86ec5669fe50 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -600,7 +600,6 @@ enum { struct async_poll { struct io_poll_iocb poll; struct io_poll_iocb *double_poll; - struct io_wq_work work; }; /* @@ -641,36 +640,26 @@ struct io_kiocb { u16 buf_index; u32 result; - struct io_ring_ctx *ctx; - unsigned int flags; - refcount_t refs; - struct task_struct *task; - u64 user_data; + struct io_ring_ctx *ctx; + unsigned int flags; + refcount_t refs; + struct task_struct *task; + u64 user_data; - struct list_head link_list; + struct list_head link_list; /* * 1. used with ctx->iopoll_list with reads/writes * 2. to track reqs with ->files (see io_op_def::file_table) */ - struct list_head inflight_entry; - - struct percpu_ref *fixed_file_refs; - - union { - /* - * Only commands that never go async can use the below fields, - * obviously. Right now only IORING_OP_POLL_ADD uses them, and - * async armed poll handlers for regular commands. The latter - * restore the work, if needed. - */ - struct { - struct hlist_node hash_node; - struct async_poll *apoll; - }; - struct io_wq_work work; - }; - struct callback_head task_work; + struct list_head inflight_entry; + + struct percpu_ref *fixed_file_refs; + struct callback_head task_work; + /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ + struct hlist_node hash_node; + struct async_poll *apoll; + struct io_wq_work work; }; struct io_defer_entry { @@ -4668,10 +4657,6 @@ static void io_async_task_func(struct callback_head *cb) io_poll_remove_double(req, apoll->double_poll); spin_unlock_irq(&ctx->completion_lock); - /* restore ->work in case we need to retry again */ - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&req->work, &apoll->work, sizeof(req->work)); - if (!READ_ONCE(apoll->poll.canceled)) __io_req_task_submit(req); else @@ -4763,9 +4748,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req) apoll->double_poll = NULL; req->flags |= REQ_F_POLLED; - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&apoll->work, &req->work, sizeof(req->work)); - io_get_req_task(req); req->apoll = apoll; INIT_HLIST_NODE(&req->hash_node); @@ -4784,8 +4766,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req) if (ret) { io_poll_remove_double(req, apoll->double_poll); spin_unlock_irq(&ctx->completion_lock); - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&req->work, &apoll->work, sizeof(req->work)); kfree(apoll->double_poll); kfree(apoll); return false; @@ -4828,14 +4808,6 @@ static bool io_poll_remove_one(struct io_kiocb *req) do_complete = __io_poll_remove_one(req, &apoll->poll); if (do_complete) { io_put_req(req); - /* - * restore ->work because we will call - * io_req_clean_work below when dropping the - * final reference. - */ - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&req->work, &apoll->work, - sizeof(req->work)); kfree(apoll->double_poll); kfree(apoll); } @@ -4969,9 +4941,6 @@ static int io_poll_add(struct io_kiocb *req) struct io_poll_table ipt; __poll_t mask; - /* ->work is in union with hash_node and others */ - io_req_clean_work(req); - INIT_HLIST_NODE(&req->hash_node); ipt.pt._qproc = io_poll_queue_proc; -- cgit From 81b68a5ca0ab5d92229a7b76332b9ce88bd6dbd1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:46 +0300 Subject: io_uring: deduplicate __io_complete_rw() Call __io_complete_rw() in io_iopoll_queue() instead of hand coding it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 86ec5669fe50..11f4ab87e08f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -891,7 +891,8 @@ enum io_mem_account { ACCT_PINNED, }; -static bool io_rw_reissue(struct io_kiocb *req, long res); +static void __io_complete_rw(struct io_kiocb *req, long res, long res2, + struct io_comp_state *cs); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); static void io_double_put_req(struct io_kiocb *req); @@ -902,8 +903,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); static int io_prep_work_files(struct io_kiocb *req); -static void io_complete_rw_common(struct kiocb *kiocb, long res, - struct io_comp_state *cs); static void __io_clean_op(struct io_kiocb *req); static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, int fd, struct file **out_file, bool fixed); @@ -1976,8 +1975,7 @@ static void io_iopoll_queue(struct list_head *again) do { req = list_first_entry(again, struct io_kiocb, inflight_entry); list_del(&req->inflight_entry); - if (!io_rw_reissue(req, -EAGAIN)) - io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL); + __io_complete_rw(req, -EAGAIN, 0, NULL); } while (!list_empty(again)); } -- cgit From b2bd1cf99f3e7c8fbf12ea07af2c6998e1209e25 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:47 +0300 Subject: io_uring: fix racy overflow count reporting All ->cq_overflow modifications should be under completion_lock, otherwise it can report a wrong number to the userspace. Fix it in io_uring_cancel_files(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 11f4ab87e08f..6e2322525da6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7847,10 +7847,9 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, clear_bit(0, &ctx->cq_check_overflow); ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; } - spin_unlock_irq(&ctx->completion_lock); - WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); + spin_unlock_irq(&ctx->completion_lock); /* * Put inflight ref and overflow ref. If that's -- cgit From dd9dfcdf5a603680458f5e7b0d2273c66e5417db Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:48 +0300 Subject: io_uring: fix stalled deferred requests Always do io_commit_cqring() after completing a request, even if it was accounted as overflowed on the CQ side. Failing to do that may lead to not to pushing deferred requests when needed, and so stalling the whole ring. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6e2322525da6..11c1abe8bd1a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7849,6 +7849,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, } WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); + io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); /* -- cgit From 4693014340808e7f099e302c1dc40e9d79ff7667 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:49 +0300 Subject: io_uring: consolidate *_check_overflow accounting Add a helper to mark ctx->{cq,sq}_check_overflow to get rid of duplicates, and it's clearer to check cq_overflow_list directly anyway. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 11c1abe8bd1a..efec290c6b08 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1303,6 +1303,15 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) eventfd_signal(ctx->cq_ev_fd, 1); } +static void io_cqring_mark_overflow(struct io_ring_ctx *ctx) +{ + if (list_empty(&ctx->cq_overflow_list)) { + clear_bit(0, &ctx->sq_check_overflow); + clear_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; + } +} + /* Returns true if there are no backlogged entries after the flush */ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { @@ -1347,11 +1356,8 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) } io_commit_cqring(ctx); - if (cqe) { - clear_bit(0, &ctx->sq_check_overflow); - clear_bit(0, &ctx->cq_check_overflow); - ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; - } + io_cqring_mark_overflow(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); @@ -7842,11 +7848,8 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, spin_lock_irq(&ctx->completion_lock); list_del(&cancel_req->compl.list); cancel_req->flags &= ~REQ_F_OVERFLOW; - if (list_empty(&ctx->cq_overflow_list)) { - clear_bit(0, &ctx->sq_check_overflow); - clear_bit(0, &ctx->cq_check_overflow); - ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; - } + + io_cqring_mark_overflow(ctx); WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); io_commit_cqring(ctx); -- cgit From 01cec8c18f5ad9c27eee9f21439072832181039e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:50 +0300 Subject: io_uring: get rid of atomic FAA for cq_timeouts If ->cq_timeouts modifications are done under ->completion_lock, we don't really nee any fetch-and-add and other complex atomics. Replace it with non-atomic FAA, that saves an implicit full memory barrier. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index efec290c6b08..fabf0b692384 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1205,7 +1205,8 @@ static void io_kill_timeout(struct io_kiocb *req) ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { - atomic_inc(&req->ctx->cq_timeouts); + atomic_set(&req->ctx->cq_timeouts, + atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, 0); @@ -4972,9 +4973,10 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - atomic_inc(&ctx->cq_timeouts); - spin_lock_irqsave(&ctx->completion_lock, flags); + atomic_set(&req->ctx->cq_timeouts, + atomic_read(&req->ctx->cq_timeouts) + 1); + /* * We could be racing with timeout deletion. If the list is empty, * then timeout lookup already found it and will be handling it. -- cgit From d1719f70d0a5b83b12786a7dbc5b9fe396469016 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 30 Jul 2020 13:43:53 -0600 Subject: io_uring: don't touch 'ctx' after installing file descriptor As soon as we install the file descriptor, we have to assume that it can get arbitrarily closed. We currently account memory (and note that we did) after installing the ring fd, which means that it could be a potential use-after-free condition if the fd is closed right after being installed, but before we fiddle with the ctx. In fact, syzbot reported this exact scenario: BUG: KASAN: use-after-free in io_account_mem fs/io_uring.c:7397 [inline] BUG: KASAN: use-after-free in io_uring_create fs/io_uring.c:8369 [inline] BUG: KASAN: use-after-free in io_uring_setup+0x2797/0x2910 fs/io_uring.c:8400 Read of size 1 at addr ffff888087a41044 by task syz-executor.5/18145 CPU: 0 PID: 18145 Comm: syz-executor.5 Not tainted 5.8.0-rc7-next-20200729-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x18f/0x20d lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xae/0x497 mm/kasan/report.c:383 __kasan_report mm/kasan/report.c:513 [inline] kasan_report.cold+0x1f/0x37 mm/kasan/report.c:530 io_account_mem fs/io_uring.c:7397 [inline] io_uring_create fs/io_uring.c:8369 [inline] io_uring_setup+0x2797/0x2910 fs/io_uring.c:8400 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x45c429 Code: 8d b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 5b b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f8f121d0c78 EFLAGS: 00000246 ORIG_RAX: 00000000000001a9 RAX: ffffffffffffffda RBX: 0000000000008540 RCX: 000000000045c429 RDX: 0000000000000000 RSI: 0000000020000040 RDI: 0000000000000196 RBP: 000000000078bf38 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 000000000078bf0c R13: 00007fff86698cff R14: 00007f8f121d19c0 R15: 000000000078bf0c Move the accounting of the ring used locked memory before we get and install the ring file descriptor. Cc: stable@vger.kernel.org Reported-by: syzbot+9d46305e76057f30c74e@syzkaller.appspotmail.com Fixes: 309758254ea6 ("io_uring: report pinned memory usage") Reviewed-by: Stefano Garzarella Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index fabf0b692384..33702f3b5af8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8329,6 +8329,15 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ret = -EFAULT; goto err; } + + /* + * Account memory _before_ installing the file descriptor. Once + * the descriptor is installed, it can get closed at any time. + */ + io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries), + ACCT_LOCKED); + ctx->limit_mem = limit_mem; + /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup @@ -8338,9 +8347,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); - io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries), - ACCT_LOCKED); - ctx->limit_mem = limit_mem; return ret; err: io_ring_ctx_wait_and_kill(ctx); -- cgit From fa15bafb71fd7a4d6018dae87cfaf890fd4ab47f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 1 Aug 2020 13:50:02 +0300 Subject: io_uring: flip if handling after io_setup_async_rw As recently done with with send/recv, flip the if after rw_verify_aread() in io_{read,write}() and tabulise left bits left. This removes mispredicted by a compiler jump on the success/fast path. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 146 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 72 insertions(+), 74 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 33702f3b5af8..6fd0b0f5df68 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3034,57 +3034,56 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter iter; size_t iov_count; - ssize_t io_size, ret; + ssize_t io_size, ret, ret2; + unsigned long nr_segs; ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock); if (ret < 0) return ret; + io_size = ret; + req->result = io_size; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) kiocb->ki_flags &= ~IOCB_NOWAIT; - io_size = ret; - req->result = io_size; - /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, READ)) goto copy_iov; iov_count = iov_iter_count(&iter); + nr_segs = iter.nr_segs; ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); - if (!ret) { - unsigned long nr_segs = iter.nr_segs; - ssize_t ret2 = 0; + if (unlikely(ret)) + goto out_free; - ret2 = io_iter_do_read(req, &iter); + ret2 = io_iter_do_read(req, &iter); - /* Catch -EAGAIN return for forced non-blocking submission */ - if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { - kiocb_done(kiocb, ret2, cs); - } else { - iter.count = iov_count; - iter.nr_segs = nr_segs; + /* Catch -EAGAIN return for forced non-blocking submission */ + if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { + kiocb_done(kiocb, ret2, cs); + } else { + iter.count = iov_count; + iter.nr_segs = nr_segs; copy_iov: - ret = io_setup_async_rw(req, io_size, iovec, - inline_vecs, &iter); - if (ret) + ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, + &iter); + if (ret) + goto out_free; + /* it's copied and will be cleaned with ->io */ + iovec = NULL; + /* if we can retry, do so with the callbacks armed */ + if (io_rw_should_retry(req)) { + ret2 = io_iter_do_read(req, &iter); + if (ret2 == -EIOCBQUEUED) { + goto out_free; + } else if (ret2 != -EAGAIN) { + kiocb_done(kiocb, ret2, cs); goto out_free; - /* it's copied and will be cleaned with ->io */ - iovec = NULL; - /* if we can retry, do so with the callbacks armed */ - if (io_rw_should_retry(req)) { - ret2 = io_iter_do_read(req, &iter); - if (ret2 == -EIOCBQUEUED) { - goto out_free; - } else if (ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, cs); - goto out_free; - } } - kiocb->ki_flags &= ~IOCB_WAITQ; - return -EAGAIN; } + kiocb->ki_flags &= ~IOCB_WAITQ; + return -EAGAIN; } out_free: if (iovec) @@ -3117,19 +3116,19 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter iter; size_t iov_count; - ssize_t ret, io_size; + ssize_t ret, ret2, io_size; + unsigned long nr_segs; ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock); if (ret < 0) return ret; + io_size = ret; + req->result = io_size; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; - io_size = ret; - req->result = io_size; - /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, WRITE)) goto copy_iov; @@ -3140,51 +3139,50 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, goto copy_iov; iov_count = iov_iter_count(&iter); + nr_segs = iter.nr_segs; ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); - if (!ret) { - unsigned long nr_segs = iter.nr_segs; - ssize_t ret2; + if (unlikely(ret)) + goto out_free; - /* - * Open-code file_start_write here to grab freeze protection, - * which will be released by another thread in - * io_complete_rw(). Fool lockdep by telling it the lock got - * released so that it doesn't complain about the held lock when - * we return to userspace. - */ - if (req->flags & REQ_F_ISREG) { - __sb_start_write(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE, true); - __sb_writers_release(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE); - } - kiocb->ki_flags |= IOCB_WRITE; + /* + * Open-code file_start_write here to grab freeze protection, + * which will be released by another thread in + * io_complete_rw(). Fool lockdep by telling it the lock got + * released so that it doesn't complain about the held lock when + * we return to userspace. + */ + if (req->flags & REQ_F_ISREG) { + __sb_start_write(file_inode(req->file)->i_sb, + SB_FREEZE_WRITE, true); + __sb_writers_release(file_inode(req->file)->i_sb, + SB_FREEZE_WRITE); + } + kiocb->ki_flags |= IOCB_WRITE; - if (req->file->f_op->write_iter) - ret2 = call_write_iter(req->file, kiocb, &iter); - else - ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); + if (req->file->f_op->write_iter) + ret2 = call_write_iter(req->file, kiocb, &iter); + else + ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); - /* - * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just - * retry them without IOCB_NOWAIT. - */ - if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) - ret2 = -EAGAIN; - if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, cs); - } else { - iter.count = iov_count; - iter.nr_segs = nr_segs; + /* + * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just + * retry them without IOCB_NOWAIT. + */ + if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) + ret2 = -EAGAIN; + if (!force_nonblock || ret2 != -EAGAIN) { + kiocb_done(kiocb, ret2, cs); + } else { + iter.count = iov_count; + iter.nr_segs = nr_segs; copy_iov: - ret = io_setup_async_rw(req, io_size, iovec, - inline_vecs, &iter); - if (ret) - goto out_free; - /* it's copied and will be cleaned with ->io */ - iovec = NULL; - return -EAGAIN; - } + ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, + &iter); + if (ret) + goto out_free; + /* it's copied and will be cleaned with ->io */ + iovec = NULL; + return -EAGAIN; } out_free: if (iovec) -- cgit From cbd287c09351f1d3a4b3cb9167a2616a11390d32 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 Aug 2020 17:06:21 -0600 Subject: io_uring: io_async_buf_func() need not test page bit Since we don't do exclusive waits or wakeups, we know that the bit is always going to be set. Kill the test. Also see commit: 2a9127fcf229 ("mm: rewrite wait_on_page_bit_common() logic") Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2a3af95be4ca..bb4f0b2d5138 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2965,10 +2965,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, if (!wake_page_match(wpq, key)) return 0; - /* Stop waking things up if the page is locked again */ - if (test_bit(key->bit_nr, &key->page->flags)) - return -1; - list_del_init(&wait->entry); init_task_work(&req->task_work, io_req_task_submit); -- cgit From c1dd91d16246b168b80af9b64c5cc35a66410455 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 Aug 2020 16:43:59 -0600 Subject: io_uring: add comments on how the async buffered read retry works The retry based logic here isn't easy to follow unless you're already familiar with how io_uring does task_work based retries. Add some comments explaining the flow a little better. Suggested-by: Linus Torvalds Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index bb4f0b2d5138..5e1c08e22990 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2952,6 +2952,16 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return io_rw_prep_async(req, READ, force_nonblock); } +/* + * This is our waitqueue callback handler, registered through lock_page_async() + * when we initially tried to do the IO with the iocb armed our waitqueue. + * This gets called when the page is unlocked, and we generally expect that to + * happen when the page IO is completed and the page is now uptodate. This will + * queue a task_work based retry of the operation, attempting to copy the data + * again. If the latter fails because the page was NOT uptodate, then we will + * do a thread based blocking retry of the operation. That's the unexpected + * slow path. + */ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, int sync, void *arg) { @@ -3004,7 +3014,18 @@ static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb, return -EOPNOTSUPP; } - +/* + * This controls whether a given IO request should be armed for async page + * based retry. If we return false here, the request is handed to the async + * worker threads for retry. If we're doing buffered reads on a regular file, + * we prepare a private wait_page_queue entry and retry the operation. This + * will either succeed because the page is now uptodate and unlocked, or it + * will register a callback when the page is unlocked at IO completion. Through + * that callback, io_uring uses task_work to setup a retry of the operation. + * That retry will attempt the buffered read again. The retry will generally + * succeed, or in rare cases where it fails, we then fall back to using the + * async worker threads for a blocking retry. + */ static bool io_rw_should_retry(struct io_kiocb *req) { struct kiocb *kiocb = &req->rw.kiocb; -- cgit From 2dd2111d0d383df104b144e0d1f6b5a00cb7cd88 Mon Sep 17 00:00:00 2001 From: Guoyu Huang Date: Wed, 5 Aug 2020 03:53:50 -0700 Subject: io_uring: Fix NULL pointer dereference in loop_rw_iter() loop_rw_iter() does not check whether the file has a read or write function. This can lead to NULL pointer dereference when the user passes in a file descriptor that does not have read or write function. The crash log looks like this: [ 99.834071] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 99.835364] #PF: supervisor instruction fetch in kernel mode [ 99.836522] #PF: error_code(0x0010) - not-present page [ 99.837771] PGD 8000000079d62067 P4D 8000000079d62067 PUD 79d8c067 PMD 0 [ 99.839649] Oops: 0010 [#2] SMP PTI [ 99.840591] CPU: 1 PID: 333 Comm: io_wqe_worker-0 Tainted: G D 5.8.0 #2 [ 99.842622] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1 04/01/2014 [ 99.845140] RIP: 0010:0x0 [ 99.845840] Code: Bad RIP value. [ 99.846672] RSP: 0018:ffffa1c7c01ebc08 EFLAGS: 00010202 [ 99.848018] RAX: 0000000000000000 RBX: ffff92363bd67300 RCX: ffff92363d461208 [ 99.849854] RDX: 0000000000000010 RSI: 00007ffdbf696bb0 RDI: ffff92363bd67300 [ 99.851743] RBP: ffffa1c7c01ebc40 R08: 0000000000000000 R09: 0000000000000000 [ 99.853394] R10: ffffffff9ec692a0 R11: 0000000000000000 R12: 0000000000000010 [ 99.855148] R13: 0000000000000000 R14: ffff92363d461208 R15: ffffa1c7c01ebc68 [ 99.856914] FS: 0000000000000000(0000) GS:ffff92363dd00000(0000) knlGS:0000000000000000 [ 99.858651] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 99.860032] CR2: ffffffffffffffd6 CR3: 000000007ac66000 CR4: 00000000000006e0 [ 99.861979] Call Trace: [ 99.862617] loop_rw_iter.part.0+0xad/0x110 [ 99.863838] io_write+0x2ae/0x380 [ 99.864644] ? kvm_sched_clock_read+0x11/0x20 [ 99.865595] ? sched_clock+0x9/0x10 [ 99.866453] ? sched_clock_cpu+0x11/0xb0 [ 99.867326] ? newidle_balance+0x1d4/0x3c0 [ 99.868283] io_issue_sqe+0xd8f/0x1340 [ 99.869216] ? __switch_to+0x7f/0x450 [ 99.870280] ? __switch_to_asm+0x42/0x70 [ 99.871254] ? __switch_to_asm+0x36/0x70 [ 99.872133] ? lock_timer_base+0x72/0xa0 [ 99.873155] ? switch_mm_irqs_off+0x1bf/0x420 [ 99.874152] io_wq_submit_work+0x64/0x180 [ 99.875192] ? kthread_use_mm+0x71/0x100 [ 99.876132] io_worker_handle_work+0x267/0x440 [ 99.877233] io_wqe_worker+0x297/0x350 [ 99.878145] kthread+0x112/0x150 [ 99.878849] ? __io_worker_unuse+0x100/0x100 [ 99.879935] ? kthread_park+0x90/0x90 [ 99.880874] ret_from_fork+0x22/0x30 [ 99.881679] Modules linked in: [ 99.882493] CR2: 0000000000000000 [ 99.883324] ---[ end trace 4453745f4673190b ]--- [ 99.884289] RIP: 0010:0x0 [ 99.884837] Code: Bad RIP value. [ 99.885492] RSP: 0018:ffffa1c7c01ebc08 EFLAGS: 00010202 [ 99.886851] RAX: 0000000000000000 RBX: ffff92363acd7f00 RCX: ffff92363d461608 [ 99.888561] RDX: 0000000000000010 RSI: 00007ffe040d9e10 RDI: ffff92363acd7f00 [ 99.890203] RBP: ffffa1c7c01ebc40 R08: 0000000000000000 R09: 0000000000000000 [ 99.891907] R10: ffffffff9ec692a0 R11: 0000000000000000 R12: 0000000000000010 [ 99.894106] R13: 0000000000000000 R14: ffff92363d461608 R15: ffffa1c7c01ebc68 [ 99.896079] FS: 0000000000000000(0000) GS:ffff92363dd00000(0000) knlGS:0000000000000000 [ 99.898017] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 99.899197] CR2: ffffffffffffffd6 CR3: 000000007ac66000 CR4: 00000000000006e0 Fixes: 32960613b7c3 ("io_uring: correctly handle non ->{read,write}_iter() file_operations") Cc: stable@vger.kernel.org Signed-off-by: Guoyu Huang Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5e1c08e22990..8f96566603f3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3066,7 +3066,10 @@ static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) { if (req->file->f_op->read_iter) return call_read_iter(req->file, &req->rw.kiocb, iter); - return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter); + else if (req->file->f_op->read) + return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter); + else + return -EINVAL; } static int io_read(struct io_kiocb *req, bool force_nonblock, @@ -3203,8 +3206,10 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, if (req->file->f_op->write_iter) ret2 = call_write_iter(req->file, kiocb, &iter); - else + else if (req->file->f_op->write) ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); + else + ret2 = -EINVAL; /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just -- cgit From bd74048108c179cea0ff52979506164c80f29da7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 5 Aug 2020 12:58:23 -0600 Subject: io_uring: set ctx sq/cq entry count earlier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we hit an earlier error path in io_uring_create(), then we will have accounted memory, but not set ctx->{sq,cq}_entries yet. Then when the ring is torn down in error, we use those values to unaccount the memory. Ensure we set the ctx entries before we're able to hit a potential error path. Cc: stable@vger.kernel.org Reported-by: Tomáš Chaloupka Tested-by: Tomáš Chaloupka Reviewed-by: Stefano Garzarella Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8f96566603f3..0d857f7ca507 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8193,6 +8193,10 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_rings *rings; size_t size, sq_array_offset; + /* make sure these are sane, as we already accounted them */ + ctx->sq_entries = p->sq_entries; + ctx->cq_entries = p->cq_entries; + size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); if (size == SIZE_MAX) return -EOVERFLOW; @@ -8209,8 +8213,6 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, rings->cq_ring_entries = p->cq_entries; ctx->sq_mask = rings->sq_ring_mask; ctx->cq_mask = rings->cq_ring_mask; - ctx->sq_entries = rings->sq_ring_entries; - ctx->cq_entries = rings->cq_ring_entries; size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { -- cgit From f74441e6311a28f0ee89b9c8e296a33730f812fc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 5 Aug 2020 13:00:44 -0600 Subject: io_uring: account locked memory before potential error case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The tear down path will always unaccount the memory, so ensure that we have accounted it before hitting any of them. Reported-by: Tomáš Chaloupka Reviewed-by: Stefano Garzarella Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0d857f7ca507..e9b27cdaa735 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8341,6 +8341,16 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->user = user; ctx->creds = get_current_cred(); + /* + * Account memory _before_ installing the file descriptor. Once + * the descriptor is installed, it can get closed at any time. Also + * do this before hitting the general error path, as ring freeing + * will un-account as well. + */ + io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries), + ACCT_LOCKED); + ctx->limit_mem = limit_mem; + ret = io_allocate_scq_urings(ctx, p); if (ret) goto err; @@ -8377,14 +8387,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; } - /* - * Account memory _before_ installing the file descriptor. Once - * the descriptor is installed, it can get closed at any time. - */ - io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries), - ACCT_LOCKED); - ctx->limit_mem = limit_mem; - /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup -- cgit From 0ba9c9edcd152158a0e321a4c13ac1dfc571ff3d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 6 Aug 2020 19:41:50 -0600 Subject: io_uring: use TWA_SIGNAL for task_work uncondtionally An earlier commit: b7db41c9e03b ("io_uring: fix regression with always ignoring signals in io_cqring_wait()") ensured that we didn't get stuck waiting for eventfd reads when it's registered with the io_uring ring for event notification, but we still have cases where the task can be waiting on other events in the kernel and need a bigger nudge to make forward progress. Or the task could be in the kernel and running, but on its way to blocking. This means that TWA_RESUME cannot reliably be used to ensure we make progress. Use TWA_SIGNAL unconditionally. Cc: stable@vger.kernel.org # v5.7+ Reported-by: Josef Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index e9b27cdaa735..af6811ddcfbd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1710,22 +1710,22 @@ static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) { struct task_struct *tsk = req->task; struct io_ring_ctx *ctx = req->ctx; - int ret, notify = TWA_RESUME; + int ret, notify; /* - * SQPOLL kernel thread doesn't need notification, just a wakeup. - * If we're not using an eventfd, then TWA_RESUME is always fine, - * as we won't have dependencies between request completions for - * other kernel wait conditions. + * SQPOLL kernel thread doesn't need notification, just a wakeup. For + * all other cases, use TWA_SIGNAL unconditionally to ensure we're + * processing task_work. There's no reliable way to tell if TWA_RESUME + * will do the job. */ - if (ctx->flags & IORING_SETUP_SQPOLL) - notify = 0; - else if (ctx->cq_ev_fd) + notify = 0; + if (!(ctx->flags & IORING_SETUP_SQPOLL)) notify = TWA_SIGNAL; ret = task_work_add(tsk, cb, notify); if (!ret) wake_up_process(tsk); + return ret; } -- cgit From 7271ef3a93a832180068c7aade3f130b7f39b17e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Aug 2020 09:55:22 -0600 Subject: io_uring: fix recursive completion locking on oveflow flush syszbot reports a scenario where we recurse on the completion lock when flushing an overflow: 1 lock held by syz-executor287/6816: #0: ffff888093cdb4d8 (&ctx->completion_lock){....}-{2:2}, at: io_cqring_overflow_flush+0xc6/0xab0 fs/io_uring.c:1333 stack backtrace: CPU: 1 PID: 6816 Comm: syz-executor287 Not tainted 5.8.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1f0/0x31e lib/dump_stack.c:118 print_deadlock_bug kernel/locking/lockdep.c:2391 [inline] check_deadlock kernel/locking/lockdep.c:2432 [inline] validate_chain+0x69a4/0x88a0 kernel/locking/lockdep.c:3202 __lock_acquire+0x1161/0x2ab0 kernel/locking/lockdep.c:4426 lock_acquire+0x160/0x730 kernel/locking/lockdep.c:5005 __raw_spin_lock_irq include/linux/spinlock_api_smp.h:128 [inline] _raw_spin_lock_irq+0x67/0x80 kernel/locking/spinlock.c:167 spin_lock_irq include/linux/spinlock.h:379 [inline] io_queue_linked_timeout fs/io_uring.c:5928 [inline] __io_queue_async_work fs/io_uring.c:1192 [inline] __io_queue_deferred+0x36a/0x790 fs/io_uring.c:1237 io_cqring_overflow_flush+0x774/0xab0 fs/io_uring.c:1359 io_ring_ctx_wait_and_kill+0x2a1/0x570 fs/io_uring.c:7808 io_uring_release+0x59/0x70 fs/io_uring.c:7829 __fput+0x34f/0x7b0 fs/file_table.c:281 task_work_run+0x137/0x1c0 kernel/task_work.c:135 exit_task_work include/linux/task_work.h:25 [inline] do_exit+0x5f3/0x1f20 kernel/exit.c:806 do_group_exit+0x161/0x2d0 kernel/exit.c:903 __do_sys_exit_group+0x13/0x20 kernel/exit.c:914 __se_sys_exit_group+0x10/0x10 kernel/exit.c:912 __x64_sys_exit_group+0x37/0x40 kernel/exit.c:912 do_syscall_64+0x31/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fix this by passing back the link from __io_queue_async_work(), and then let the caller handle the queueing of the link. Take care to also punt the submission reference put to the caller, as we're holding the completion lock for the __io_queue_defer() case. Hence we need to mark the io_kiocb appropriately for that case. Reported-by: syzbot+996f91b6ec3812c48042@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index af6811ddcfbd..360649041bfa 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -898,6 +898,7 @@ static void io_put_req(struct io_kiocb *req); static void io_double_put_req(struct io_kiocb *req); static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); +static void __io_queue_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, @@ -1179,7 +1180,7 @@ static void io_prep_async_link(struct io_kiocb *req) io_prep_async_work(cur); } -static void __io_queue_async_work(struct io_kiocb *req) +static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link = io_prep_linked_timeout(req); @@ -1187,16 +1188,19 @@ static void __io_queue_async_work(struct io_kiocb *req) trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, &req->work, req->flags); io_wq_enqueue(ctx->io_wq, &req->work); - - if (link) - io_queue_linked_timeout(link); + return link; } static void io_queue_async_work(struct io_kiocb *req) { + struct io_kiocb *link; + /* init ->work of the whole link before punting */ io_prep_async_link(req); - __io_queue_async_work(req); + link = __io_queue_async_work(req); + + if (link) + io_queue_linked_timeout(link); } static void io_kill_timeout(struct io_kiocb *req) @@ -1229,12 +1233,19 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx) do { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); + struct io_kiocb *link; if (req_need_defer(de->req, de->seq)) break; list_del_init(&de->list); /* punt-init is done before queueing for defer */ - __io_queue_async_work(de->req); + link = __io_queue_async_work(de->req); + if (link) { + __io_queue_linked_timeout(link); + /* drop submission reference */ + link->flags |= REQ_F_COMP_LOCKED; + io_put_req(link); + } kfree(de); } while (!list_empty(&ctx->defer_list)); } @@ -5939,15 +5950,12 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void io_queue_linked_timeout(struct io_kiocb *req) +static void __io_queue_linked_timeout(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - /* * If the list is now empty, then our linked request finished before * we got a chance to setup the timer */ - spin_lock_irq(&ctx->completion_lock); if (!list_empty(&req->link_list)) { struct io_timeout_data *data = &req->io->timeout; @@ -5955,6 +5963,14 @@ static void io_queue_linked_timeout(struct io_kiocb *req) hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); } +} + +static void io_queue_linked_timeout(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->completion_lock); + __io_queue_linked_timeout(req); spin_unlock_irq(&ctx->completion_lock); /* drop submission reference */ -- cgit From 9b7adba9eaec28e0e4343c96d0dbeb9578802f5f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Aug 2020 10:54:02 -0600 Subject: io_uring: add missing REQ_F_COMP_LOCKED for nested requests When we traverse into failing links or timeouts, we need to ensure we propagate the REQ_F_COMP_LOCKED flag to ensure that we correctly signal to the completion side that we already hold the completion lock. Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 360649041bfa..56115cb4b9fa 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1609,6 +1609,7 @@ static bool __io_kill_linked_timeout(struct io_kiocb *req) return false; list_del_init(&link->link_list); + link->flags |= REQ_F_COMP_LOCKED; wake_ev = io_link_cancel_timeout(link); req->flags &= ~REQ_F_LINK_TIMEOUT; return wake_ev; @@ -1667,6 +1668,7 @@ static void __io_fail_links(struct io_kiocb *req) trace_io_uring_fail_link(req, link); io_cqring_fill_event(link, -ECANCELED); + link->flags |= REQ_F_COMP_LOCKED; __io_double_put_req(link); req->flags &= ~REQ_F_LINK_TIMEOUT; } @@ -5071,6 +5073,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) return -EALREADY; req_set_fail_links(req); + req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, -ECANCELED); io_put_req(req); return 0; -- cgit From 51a4cc112c7a42b62a91bcccdfac42e7c4561729 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Aug 2020 10:55:56 -0600 Subject: io_uring: defer file table grabbing request cleanup for locked requests If we're in the error path failing links and we have a link that has grabbed a reference to the fs_struct, then we cannot safely drop our reference to the table if we already hold the completion lock. This adds a hardirq dependency to the fs_struct->lock, which it currently doesn't have. Defer the final cleanup and free of such requests to avoid adding this dependency. Reported-by: syzbot+ef4b654b49ed7ff049bf@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 10 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 56115cb4b9fa..5488698189da 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1108,10 +1108,16 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static void io_req_clean_work(struct io_kiocb *req) +/* + * Returns true if we need to defer file table putting. This can only happen + * from the error path with REQ_F_COMP_LOCKED set. + */ +static bool io_req_clean_work(struct io_kiocb *req) { if (!(req->flags & REQ_F_WORK_INITIALIZED)) - return; + return false; + + req->flags &= ~REQ_F_WORK_INITIALIZED; if (req->work.mm) { mmdrop(req->work.mm); @@ -1124,6 +1130,9 @@ static void io_req_clean_work(struct io_kiocb *req) if (req->work.fs) { struct fs_struct *fs = req->work.fs; + if (req->flags & REQ_F_COMP_LOCKED) + return true; + spin_lock(&req->work.fs->lock); if (--fs->users) fs = NULL; @@ -1132,7 +1141,8 @@ static void io_req_clean_work(struct io_kiocb *req) free_fs_struct(fs); req->work.fs = NULL; } - req->flags &= ~REQ_F_WORK_INITIALIZED; + + return false; } static void io_prep_async_work(struct io_kiocb *req) @@ -1544,7 +1554,7 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file, fput(file); } -static void io_dismantle_req(struct io_kiocb *req) +static bool io_dismantle_req(struct io_kiocb *req) { io_clean_op(req); @@ -1552,7 +1562,6 @@ static void io_dismantle_req(struct io_kiocb *req) kfree(req->io); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - io_req_clean_work(req); if (req->flags & REQ_F_INFLIGHT) { struct io_ring_ctx *ctx = req->ctx; @@ -1564,15 +1573,15 @@ static void io_dismantle_req(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); } + + return io_req_clean_work(req); } -static void __io_free_req(struct io_kiocb *req) +static void __io_free_req_finish(struct io_kiocb *req) { - struct io_ring_ctx *ctx; + struct io_ring_ctx *ctx = req->ctx; - io_dismantle_req(req); __io_put_req_task(req); - ctx = req->ctx; if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); else @@ -1580,6 +1589,39 @@ static void __io_free_req(struct io_kiocb *req) percpu_ref_put(&ctx->refs); } +static void io_req_task_file_table_put(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct fs_struct *fs = req->work.fs; + + spin_lock(&req->work.fs->lock); + if (--fs->users) + fs = NULL; + spin_unlock(&req->work.fs->lock); + if (fs) + free_fs_struct(fs); + req->work.fs = NULL; + __io_free_req_finish(req); +} + +static void __io_free_req(struct io_kiocb *req) +{ + if (!io_dismantle_req(req)) { + __io_free_req_finish(req); + } else { + int ret; + + init_task_work(&req->task_work, io_req_task_file_table_put); + ret = task_work_add(req->task, &req->task_work, TWA_RESUME); + if (unlikely(ret)) { + struct task_struct *tsk; + + tsk = io_wq_get_task(req->ctx->io_wq); + task_work_add(tsk, &req->task_work, 0); + } + } +} + static bool io_link_cancel_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1868,7 +1910,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) req->flags &= ~REQ_F_TASK_PINNED; } - io_dismantle_req(req); + WARN_ON_ONCE(io_dismantle_req(req)); rb->reqs[rb->to_free++] = req; if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) __io_req_free_batch_flush(req->ctx, rb); -- cgit From 6d816e088c359866f9867057e04f244c608c42fe Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Aug 2020 08:04:14 -0600 Subject: io_uring: hold 'ctx' reference around task_work queue + execute We're holding the request reference, but we need to go one higher to ensure that the ctx remains valid after the request has finished. If the ring is closed with pending task_work inflight, and the given io_kiocb finishes sync during issue, then we need a reference to the ring itself around the task_work execution cycle. Cc: stable@vger.kernel.org # v5.7+ Reported-by: syzbot+9b260fc33297966f5a8e@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5488698189da..99582cf5106b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1821,8 +1821,10 @@ static void __io_req_task_submit(struct io_kiocb *req) static void io_req_task_submit(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_ring_ctx *ctx = req->ctx; __io_req_task_submit(req); + percpu_ref_put(&ctx->refs); } static void io_req_task_queue(struct io_kiocb *req) @@ -1830,6 +1832,7 @@ static void io_req_task_queue(struct io_kiocb *req) int ret; init_task_work(&req->task_work, io_req_task_submit); + percpu_ref_get(&req->ctx->refs); ret = io_req_task_work_add(req, &req->task_work); if (unlikely(ret)) { @@ -2318,6 +2321,8 @@ static void io_rw_resubmit(struct callback_head *cb) refcount_inc(&req->refs); io_queue_async_work(req); } + + percpu_ref_put(&ctx->refs); } #endif @@ -2330,6 +2335,8 @@ static bool io_rw_reissue(struct io_kiocb *req, long res) return false; init_task_work(&req->task_work, io_rw_resubmit); + percpu_ref_get(&req->ctx->refs); + ret = io_req_task_work_add(req, &req->task_work); if (!ret) return true; @@ -3033,6 +3040,8 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, list_del_init(&wait->entry); init_task_work(&req->task_work, io_req_task_submit); + percpu_ref_get(&req->ctx->refs); + /* submit ref gets dropped, acquire a new one */ refcount_inc(&req->refs); ret = io_req_task_work_add(req, &req->task_work); @@ -4565,6 +4574,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, req->result = mask; init_task_work(&req->task_work, func); + percpu_ref_get(&req->ctx->refs); + /* * If this fails, then the task is exiting. When a task exits, the * work gets canceled, so just cancel this request as well instead @@ -4652,11 +4663,13 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) static void io_poll_task_func(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *nxt = NULL; io_poll_task_handler(req, &nxt); if (nxt) __io_req_task_submit(nxt); + percpu_ref_put(&ctx->refs); } static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, @@ -4752,6 +4765,7 @@ static void io_async_task_func(struct callback_head *cb) if (io_poll_rewait(req, &apoll->poll)) { spin_unlock_irq(&ctx->completion_lock); + percpu_ref_put(&ctx->refs); return; } @@ -4767,6 +4781,7 @@ static void io_async_task_func(struct callback_head *cb) else __io_req_task_cancel(req, -ECANCELED); + percpu_ref_put(&ctx->refs); kfree(apoll->double_poll); kfree(apoll); } -- cgit From a36da65c46565d2527eec3efdb546251e38253fd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Aug 2020 09:50:19 -0600 Subject: io_uring: fail poll arm on queue proc failure Check the ipt.error value, it must have been either cleared to zero or set to another error than the default -EINVAL if we don't go through the waitqueue proc addition. Just give up on poll at that point and return failure, this will fallback to async work. io_poll_add() doesn't suffer from this failure case, as it returns the error value directly. Cc: stable@vger.kernel.org # v5.7+ Reported-by: syzbot+a730016dc0bdce4f6ff5@syzkaller.appspotmail.com Reviewed-by: Stefano Garzarella Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 99582cf5106b..8a2afd8c33c9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4883,7 +4883,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, io_async_wake); - if (ret) { + if (ret || ipt.error) { io_poll_remove_double(req, apoll->double_poll); spin_unlock_irq(&ctx->completion_lock); kfree(apoll->double_poll); -- cgit From f254ac04c8744cf7bfed012717eac34eacc65dfb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 12 Aug 2020 17:33:30 -0600 Subject: io_uring: enable lookup of links holding inflight files When a process exits, we cancel whatever requests it has pending that are referencing the file table. However, if a link is holding a reference, then we cannot find it by simply looking at the inflight list. Enable checking of the poll and timeout list to find the link, and cancel it appropriately. Cc: stable@vger.kernel.org Reported-by: Josef Signed-off-by: Jens Axboe --- fs/io_uring.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 87 insertions(+), 10 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8a2afd8c33c9..1ec25ee71372 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4937,6 +4937,7 @@ static bool io_poll_remove_one(struct io_kiocb *req) io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(req->ctx); req->flags |= REQ_F_COMP_LOCKED; + req_set_fail_links(req); io_put_req(req); } @@ -5109,6 +5110,23 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +static int __io_timeout_cancel(struct io_kiocb *req) +{ + int ret; + + list_del_init(&req->timeout.list); + + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); + if (ret == -1) + return -EALREADY; + + req_set_fail_links(req); + req->flags |= REQ_F_COMP_LOCKED; + io_cqring_fill_event(req, -ECANCELED); + io_put_req(req); + return 0; +} + static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) { struct io_kiocb *req; @@ -5116,7 +5134,6 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) list_for_each_entry(req, &ctx->timeout_list, timeout.list) { if (user_data == req->user_data) { - list_del_init(&req->timeout.list); ret = 0; break; } @@ -5125,15 +5142,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (ret == -ENOENT) return ret; - ret = hrtimer_try_to_cancel(&req->io->timeout.timer); - if (ret == -1) - return -EALREADY; - - req_set_fail_links(req); - req->flags |= REQ_F_COMP_LOCKED; - io_cqring_fill_event(req, -ECANCELED); - io_put_req(req); - return 0; + return __io_timeout_cancel(req); } static int io_timeout_remove_prep(struct io_kiocb *req, @@ -7935,6 +7944,71 @@ static bool io_wq_files_match(struct io_wq_work *work, void *data) return work->files == files; } +/* + * Returns true if 'preq' is the link parent of 'req' + */ +static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) +{ + struct io_kiocb *link; + + if (!(preq->flags & REQ_F_LINK_HEAD)) + return false; + + list_for_each_entry(link, &preq->link_list, link_list) { + if (link == req) + return true; + } + + return false; +} + +/* + * We're looking to cancel 'req' because it's holding on to our files, but + * 'req' could be a link to another request. See if it is, and cancel that + * parent request if so. + */ +static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req) +{ + struct hlist_node *tmp; + struct io_kiocb *preq; + bool found = false; + int i; + + spin_lock_irq(&ctx->completion_lock); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry_safe(preq, tmp, list, hash_node) { + found = io_match_link(preq, req); + if (found) { + io_poll_remove_one(preq); + break; + } + } + } + spin_unlock_irq(&ctx->completion_lock); + return found; +} + +static bool io_timeout_remove_link(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + struct io_kiocb *preq; + bool found = false; + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry(preq, &ctx->timeout_list, timeout.list) { + found = io_match_link(preq, req); + if (found) { + __io_timeout_cancel(preq); + break; + } + } + spin_unlock_irq(&ctx->completion_lock); + return found; +} + static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { @@ -7989,6 +8063,9 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, } } else { io_wq_cancel_work(ctx->io_wq, &cancel_req->work); + /* could be a link, check and remove if it is */ + if (!io_poll_remove_link(ctx, cancel_req)) + io_timeout_remove_link(ctx, cancel_req); io_put_req(cancel_req); } -- cgit From ff6165b2d7f66fccb7095a60ccc7a80813858665 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 13 Aug 2020 09:47:43 -0600 Subject: io_uring: retain iov_iter state over io_read/io_write calls Instead of maintaining (and setting/remembering) iov_iter size and segment counts, just put the iov_iter in the async part of the IO structure. This is mostly a preparation patch for doing appropriate internal retries for short reads, but it also cleans up the state handling nicely and simplifies it quite a bit. Signed-off-by: Jens Axboe --- fs/io_uring.c | 136 ++++++++++++++++++++++++++++++---------------------------- 1 file changed, 70 insertions(+), 66 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1ec25ee71372..584f86178671 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -508,9 +508,8 @@ struct io_async_msghdr { struct io_async_rw { struct iovec fast_iov[UIO_FASTIOV]; - struct iovec *iov; - ssize_t nr_segs; - ssize_t size; + const struct iovec *free_iovec; + struct iov_iter iter; struct wait_page_queue wpq; }; @@ -915,8 +914,8 @@ static void io_file_put_work(struct work_struct *work); static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter, bool needs_lock); -static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, - struct iovec *iovec, struct iovec *fast_iov, +static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, + const struct iovec *fast_iov, struct iov_iter *iter); static struct kmem_cache *req_cachep; @@ -2299,7 +2298,7 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error) ret = io_import_iovec(rw, req, &iovec, &iter, false); if (ret < 0) goto end_req; - ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter); + ret = io_setup_async_rw(req, iovec, inline_vecs, &iter); if (!ret) return true; kfree(iovec); @@ -2820,6 +2819,13 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, ssize_t ret; u8 opcode; + if (req->io) { + struct io_async_rw *iorw = &req->io->rw; + + *iovec = NULL; + return iov_iter_count(&iorw->iter); + } + opcode = req->opcode; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { *iovec = NULL; @@ -2845,14 +2851,6 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return ret < 0 ? ret : sqe_len; } - if (req->io) { - struct io_async_rw *iorw = &req->io->rw; - - iov_iter_init(iter, rw, iorw->iov, iorw->nr_segs, iorw->size); - *iovec = NULL; - return iorw->size; - } - if (req->flags & REQ_F_BUFFER_SELECT) { ret = io_iov_buffer_select(req, *iovec, needs_lock); if (!ret) { @@ -2930,21 +2928,29 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, return ret; } -static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, - struct iovec *iovec, struct iovec *fast_iov, - struct iov_iter *iter) +static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, + const struct iovec *fast_iov, struct iov_iter *iter) { struct io_async_rw *rw = &req->io->rw; - rw->nr_segs = iter->nr_segs; - rw->size = io_size; + memcpy(&rw->iter, iter, sizeof(*iter)); + rw->free_iovec = NULL; + /* can only be fixed buffers, no need to do anything */ + if (iter->type == ITER_BVEC) + return; if (!iovec) { - rw->iov = rw->fast_iov; - if (rw->iov != fast_iov) - memcpy(rw->iov, fast_iov, + unsigned iov_off = 0; + + rw->iter.iov = rw->fast_iov; + if (iter->iov != fast_iov) { + iov_off = iter->iov - fast_iov; + rw->iter.iov += iov_off; + } + if (rw->fast_iov != fast_iov) + memcpy(rw->fast_iov + iov_off, fast_iov + iov_off, sizeof(struct iovec) * iter->nr_segs); } else { - rw->iov = iovec; + rw->free_iovec = iovec; req->flags |= REQ_F_NEED_CLEANUP; } } @@ -2963,8 +2969,8 @@ static int io_alloc_async_ctx(struct io_kiocb *req) return __io_alloc_async_ctx(req); } -static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, - struct iovec *iovec, struct iovec *fast_iov, +static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, + const struct iovec *fast_iov, struct iov_iter *iter) { if (!io_op_defs[req->opcode].async_ctx) @@ -2973,7 +2979,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, if (__io_alloc_async_ctx(req)) return -ENOMEM; - io_req_map_rw(req, io_size, iovec, fast_iov, iter); + io_req_map_rw(req, iovec, fast_iov, iter); } return 0; } @@ -2981,18 +2987,19 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, static inline int io_rw_prep_async(struct io_kiocb *req, int rw, bool force_nonblock) { - struct io_async_ctx *io = req->io; - struct iov_iter iter; + struct io_async_rw *iorw = &req->io->rw; ssize_t ret; - io->rw.iov = io->rw.fast_iov; + iorw->iter.iov = iorw->fast_iov; + /* reset ->io around the iovec import, we don't want to use it */ req->io = NULL; - ret = io_import_iovec(rw, req, &io->rw.iov, &iter, !force_nonblock); - req->io = io; + ret = io_import_iovec(rw, req, (struct iovec **) &iorw->iter.iov, + &iorw->iter, !force_nonblock); + req->io = container_of(iorw, struct io_async_ctx, rw); if (unlikely(ret < 0)) return ret; - io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); + io_req_map_rw(req, iorw->iter.iov, iorw->fast_iov, &iorw->iter); return 0; } @@ -3090,7 +3097,8 @@ static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb, * succeed, or in rare cases where it fails, we then fall back to using the * async worker threads for a blocking retry. */ -static bool io_rw_should_retry(struct io_kiocb *req) +static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec, + struct iovec *fast_iov, struct iov_iter *iter) { struct kiocb *kiocb = &req->rw.kiocb; int ret; @@ -3113,8 +3121,11 @@ static bool io_rw_should_retry(struct io_kiocb *req) * If request type doesn't require req->io to defer in general, * we need to allocate it here */ - if (!req->io && __io_alloc_async_ctx(req)) - return false; + if (!req->io) { + if (__io_alloc_async_ctx(req)) + return false; + io_req_map_rw(req, iovec, fast_iov, iter); + } ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq, io_async_buf_func, req); @@ -3141,12 +3152,14 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; - struct iov_iter iter; + struct iov_iter __iter, *iter = &__iter; size_t iov_count; - ssize_t io_size, ret, ret2; - unsigned long nr_segs; + ssize_t io_size, ret, ret2 = 0; + + if (req->io) + iter = &req->io->rw.iter; - ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock); + ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); if (ret < 0) return ret; io_size = ret; @@ -3160,30 +3173,26 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, if (force_nonblock && !io_file_supports_async(req->file, READ)) goto copy_iov; - iov_count = iov_iter_count(&iter); - nr_segs = iter.nr_segs; + iov_count = iov_iter_count(iter); ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); if (unlikely(ret)) goto out_free; - ret2 = io_iter_do_read(req, &iter); + ret2 = io_iter_do_read(req, iter); /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { kiocb_done(kiocb, ret2, cs); } else { - iter.count = iov_count; - iter.nr_segs = nr_segs; copy_iov: - ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, - &iter); + ret = io_setup_async_rw(req, iovec, inline_vecs, iter); if (ret) goto out_free; /* it's copied and will be cleaned with ->io */ iovec = NULL; /* if we can retry, do so with the callbacks armed */ - if (io_rw_should_retry(req)) { - ret2 = io_iter_do_read(req, &iter); + if (io_rw_should_retry(req, iovec, inline_vecs, iter)) { + ret2 = io_iter_do_read(req, iter); if (ret2 == -EIOCBQUEUED) { goto out_free; } else if (ret2 != -EAGAIN) { @@ -3223,12 +3232,14 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; - struct iov_iter iter; + struct iov_iter __iter, *iter = &__iter; size_t iov_count; ssize_t ret, ret2, io_size; - unsigned long nr_segs; - ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock); + if (req->io) + iter = &req->io->rw.iter; + + ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); if (ret < 0) return ret; io_size = ret; @@ -3247,8 +3258,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, (req->flags & REQ_F_ISREG)) goto copy_iov; - iov_count = iov_iter_count(&iter); - nr_segs = iter.nr_segs; + iov_count = iov_iter_count(iter); ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); if (unlikely(ret)) goto out_free; @@ -3269,9 +3279,9 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, kiocb->ki_flags |= IOCB_WRITE; if (req->file->f_op->write_iter) - ret2 = call_write_iter(req->file, kiocb, &iter); + ret2 = call_write_iter(req->file, kiocb, iter); else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); + ret2 = loop_rw_iter(WRITE, req->file, kiocb, iter); else ret2 = -EINVAL; @@ -3284,16 +3294,10 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, cs); } else { - iter.count = iov_count; - iter.nr_segs = nr_segs; copy_iov: - ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, - &iter); - if (ret) - goto out_free; - /* it's copied and will be cleaned with ->io */ - iovec = NULL; - return -EAGAIN; + ret = io_setup_async_rw(req, iovec, inline_vecs, iter); + if (!ret) + return -EAGAIN; } out_free: if (iovec) @@ -5583,8 +5587,8 @@ static void __io_clean_op(struct io_kiocb *req) case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: - if (io->rw.iov != io->rw.fast_iov) - kfree(io->rw.iov); + if (io->rw.free_iovec) + kfree(io->rw.free_iovec); break; case IORING_OP_RECVMSG: case IORING_OP_SENDMSG: -- cgit From 227c0c9673d86732995474d277f84e08ee763e46 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 13 Aug 2020 11:51:40 -0600 Subject: io_uring: internally retry short reads We've had a few application cases of not handling short reads properly, and it is understandable as short reads aren't really expected if the application isn't doing non-blocking IO. Now that we retain the iov_iter over retries, we can implement internal retry pretty trivially. This ensures that we don't return a short read, even for buffered reads on page cache conflicts. Cleanup the deep nesting and hard to read nature of io_read() as well, it's much more straight forward now to read and understand. Added a few comments explaining the logic as well. Signed-off-by: Jens Axboe --- fs/io_uring.c | 109 +++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 70 insertions(+), 39 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 584f86178671..7dd6df15bc49 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -510,6 +510,7 @@ struct io_async_rw { struct iovec fast_iov[UIO_FASTIOV]; const struct iovec *free_iovec; struct iov_iter iter; + size_t bytes_done; struct wait_page_queue wpq; }; @@ -916,7 +917,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, bool needs_lock); static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, - struct iov_iter *iter); + struct iov_iter *iter, bool force); static struct kmem_cache *req_cachep; @@ -2298,7 +2299,7 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error) ret = io_import_iovec(rw, req, &iovec, &iter, false); if (ret < 0) goto end_req; - ret = io_setup_async_rw(req, iovec, inline_vecs, &iter); + ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false); if (!ret) return true; kfree(iovec); @@ -2588,6 +2589,14 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + /* add previously done IO, if any */ + if (req->io && req->io->rw.bytes_done > 0) { + if (ret < 0) + ret = req->io->rw.bytes_done; + else + ret += req->io->rw.bytes_done; + } + if (req->flags & REQ_F_CUR_POS) req->file->f_pos = kiocb->ki_pos; if (ret >= 0 && kiocb->ki_complete == io_complete_rw) @@ -2935,6 +2944,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, memcpy(&rw->iter, iter, sizeof(*iter)); rw->free_iovec = NULL; + rw->bytes_done = 0; /* can only be fixed buffers, no need to do anything */ if (iter->type == ITER_BVEC) return; @@ -2971,9 +2981,9 @@ static int io_alloc_async_ctx(struct io_kiocb *req) static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, - struct iov_iter *iter) + struct iov_iter *iter, bool force) { - if (!io_op_defs[req->opcode].async_ctx) + if (!force && !io_op_defs[req->opcode].async_ctx) return 0; if (!req->io) { if (__io_alloc_async_ctx(req)) @@ -3097,8 +3107,7 @@ static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb, * succeed, or in rare cases where it fails, we then fall back to using the * async worker threads for a blocking retry. */ -static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec, - struct iovec *fast_iov, struct iov_iter *iter) +static bool io_rw_should_retry(struct io_kiocb *req) { struct kiocb *kiocb = &req->rw.kiocb; int ret; @@ -3107,8 +3116,8 @@ static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec, if (req->flags & REQ_F_NOWAIT) return false; - /* already tried, or we're doing O_DIRECT */ - if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ)) + /* Only for buffered IO */ + if (kiocb->ki_flags & IOCB_DIRECT) return false; /* * just use poll if we can, and don't attempt if the fs doesn't @@ -3117,16 +3126,6 @@ static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec, if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) return false; - /* - * If request type doesn't require req->io to defer in general, - * we need to allocate it here - */ - if (!req->io) { - if (__io_alloc_async_ctx(req)) - return false; - io_req_map_rw(req, iovec, fast_iov, iter); - } - ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq, io_async_buf_func, req); if (!ret) { @@ -3153,8 +3152,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter __iter, *iter = &__iter; + ssize_t io_size, ret, ret2; size_t iov_count; - ssize_t io_size, ret, ret2 = 0; if (req->io) iter = &req->io->rw.iter; @@ -3164,6 +3163,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, return ret; io_size = ret; req->result = io_size; + ret = 0; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) @@ -3178,31 +3178,62 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, if (unlikely(ret)) goto out_free; - ret2 = io_iter_do_read(req, iter); + ret = io_iter_do_read(req, iter); - /* Catch -EAGAIN return for forced non-blocking submission */ - if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { - kiocb_done(kiocb, ret2, cs); - } else { -copy_iov: - ret = io_setup_async_rw(req, iovec, inline_vecs, iter); + if (!ret) { + goto done; + } else if (ret == -EIOCBQUEUED) { + ret = 0; + goto out_free; + } else if (ret == -EAGAIN) { + ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); if (ret) goto out_free; - /* it's copied and will be cleaned with ->io */ - iovec = NULL; - /* if we can retry, do so with the callbacks armed */ - if (io_rw_should_retry(req, iovec, inline_vecs, iter)) { - ret2 = io_iter_do_read(req, iter); - if (ret2 == -EIOCBQUEUED) { - goto out_free; - } else if (ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, cs); - goto out_free; - } - } + return -EAGAIN; + } else if (ret < 0) { + goto out_free; + } + + /* read it all, or we did blocking attempt. no retry. */ + if (!iov_iter_count(iter) || !force_nonblock) + goto done; + + io_size -= ret; +copy_iov: + ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true); + if (ret2) { + ret = ret2; + goto out_free; + } + /* it's copied and will be cleaned with ->io */ + iovec = NULL; + /* now use our persistent iterator, if we aren't already */ + iter = &req->io->rw.iter; +retry: + req->io->rw.bytes_done += ret; + /* if we can retry, do so with the callbacks armed */ + if (!io_rw_should_retry(req)) { kiocb->ki_flags &= ~IOCB_WAITQ; return -EAGAIN; } + + /* + * Now retry read with the IOCB_WAITQ parts set in the iocb. If we + * get -EIOCBQUEUED, then we'll get a notification when the desired + * page gets unlocked. We can also get a partial read here, and if we + * do, then just retry at the new offset. + */ + ret = io_iter_do_read(req, iter); + if (ret == -EIOCBQUEUED) { + ret = 0; + goto out_free; + } else if (ret > 0 && ret < io_size) { + /* we got some bytes, but not all. retry. */ + goto retry; + } +done: + kiocb_done(kiocb, ret, cs); + ret = 0; out_free: if (iovec) kfree(iovec); @@ -3295,7 +3326,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, kiocb_done(kiocb, ret2, cs); } else { copy_iov: - ret = io_setup_async_rw(req, iovec, inline_vecs, iter); + ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); if (!ret) return -EAGAIN; } -- cgit From d4e7cd36a90e38e0276d6ce0c20f5ccef17ec38c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 15 Aug 2020 11:44:50 -0700 Subject: io_uring: sanitize double poll handling There's a bit of confusion on the matching pairs of poll vs double poll, depending on if the request is a pure poll (IORING_OP_POLL_ADD) or poll driven retry. Add io_poll_get_double() that returns the double poll waitqueue, if any, and io_poll_get_single() that returns the original poll waitqueue. With that, remove the argument to io_poll_remove_double(). Finally ensure that wait->private is cleared once the double poll handler has run, so that remove knows it's already been seen. Cc: stable@vger.kernel.org # v5.8 Reported-by: syzbot+7f617d4a9369028b8a2c@syzkaller.appspotmail.com Fixes: 18bceab101ad ("io_uring: allow POLL_ADD with double poll_wait() users") Signed-off-by: Jens Axboe --- fs/io_uring.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7dd6df15bc49..cb030912bf5e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4649,9 +4649,24 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) return false; } -static void io_poll_remove_double(struct io_kiocb *req, void *data) +static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) { - struct io_poll_iocb *poll = data; + /* pure poll stashes this in ->io, poll driven retry elsewhere */ + if (req->opcode == IORING_OP_POLL_ADD) + return (struct io_poll_iocb *) req->io; + return req->apoll->double_poll; +} + +static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) +{ + if (req->opcode == IORING_OP_POLL_ADD) + return &req->poll; + return &req->apoll->poll; +} + +static void io_poll_remove_double(struct io_kiocb *req) +{ + struct io_poll_iocb *poll = io_poll_get_double(req); lockdep_assert_held(&req->ctx->completion_lock); @@ -4671,7 +4686,7 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) { struct io_ring_ctx *ctx = req->ctx; - io_poll_remove_double(req, req->io); + io_poll_remove_double(req); req->poll.done = true; io_cqring_fill_event(req, error ? error : mangle_poll(mask)); io_commit_cqring(ctx); @@ -4711,7 +4726,7 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct io_kiocb *req = wait->private; - struct io_poll_iocb *poll = req->apoll->double_poll; + struct io_poll_iocb *poll = io_poll_get_single(req); __poll_t mask = key_to_poll(key); /* for instances that support it check for an event match first: */ @@ -4725,6 +4740,8 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, done = list_empty(&poll->wait.entry); if (!done) list_del_init(&poll->wait.entry); + /* make sure double remove sees this as being gone */ + wait->private = NULL; spin_unlock(&poll->head->lock); if (!done) __io_async_wake(req, poll, mask, io_poll_task_func); @@ -4808,7 +4825,7 @@ static void io_async_task_func(struct callback_head *cb) if (hash_hashed(&req->hash_node)) hash_del(&req->hash_node); - io_poll_remove_double(req, apoll->double_poll); + io_poll_remove_double(req); spin_unlock_irq(&ctx->completion_lock); if (!READ_ONCE(apoll->poll.canceled)) @@ -4919,7 +4936,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, io_async_wake); if (ret || ipt.error) { - io_poll_remove_double(req, apoll->double_poll); + io_poll_remove_double(req); spin_unlock_irq(&ctx->completion_lock); kfree(apoll->double_poll); kfree(apoll); @@ -4951,14 +4968,13 @@ static bool io_poll_remove_one(struct io_kiocb *req) { bool do_complete; + io_poll_remove_double(req); + if (req->opcode == IORING_OP_POLL_ADD) { - io_poll_remove_double(req, req->io); do_complete = __io_poll_remove_one(req, &req->poll); } else { struct async_poll *apoll = req->apoll; - io_poll_remove_double(req, apoll->double_poll); - /* non-poll requests have submit ref still */ do_complete = __io_poll_remove_one(req, &apoll->poll); if (do_complete) { -- cgit From f91daf565b0e272a33bd3fcd19eaebd331c5cffd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 15 Aug 2020 15:58:42 -0700 Subject: io_uring: short circuit -EAGAIN for blocking read attempt One case was missed in the short IO retry handling, and that's hitting -EAGAIN on a blocking attempt read (eg from io-wq context). This is a problem on sockets that are marked as non-blocking when created, they don't carry any REQ_F_NOWAIT information to help us terminate them instead of perpetually retrying. Fixes: 227c0c9673d8 ("io_uring: internally retry short reads") Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index cb030912bf5e..dc506b75659c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3186,6 +3186,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, ret = 0; goto out_free; } else if (ret == -EAGAIN) { + if (!force_nonblock) + goto done; ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); if (ret) goto out_free; @@ -3195,7 +3197,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, } /* read it all, or we did blocking attempt. no retry. */ - if (!iov_iter_count(iter) || !force_nonblock) + if (!iov_iter_count(iter) || !force_nonblock || + (req->file->f_flags & O_NONBLOCK)) goto done; io_size -= ret; -- cgit From b711d4eaf0c408a811311ee3e94d6e9e5a230a9a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 16 Aug 2020 08:23:05 -0700 Subject: io_uring: find and cancel head link async work on files exit Commit f254ac04c874 ("io_uring: enable lookup of links holding inflight files") only handled 2 out of the three head link cases we have, we also need to lookup and cancel work that is blocked in io-wq if that work has a link that's holding a reference to the files structure. Put the "cancel head links that hold this request pending" logic into io_attempt_cancel(), which will to through the motions of finding and canceling head links that hold the current inflight files stable request pending. Cc: stable@vger.kernel.org Reported-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index dc506b75659c..346a3eb84785 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8063,6 +8063,33 @@ static bool io_timeout_remove_link(struct io_ring_ctx *ctx, return found; } +static bool io_cancel_link_cb(struct io_wq_work *work, void *data) +{ + return io_match_link(container_of(work, struct io_kiocb, work), data); +} + +static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) +{ + enum io_wq_cancel cret; + + /* cancel this particular work, if it's running */ + cret = io_wq_cancel_work(ctx->io_wq, &req->work); + if (cret != IO_WQ_CANCEL_NOTFOUND) + return; + + /* find links that hold this pending, cancel those */ + cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true); + if (cret != IO_WQ_CANCEL_NOTFOUND) + return; + + /* if we have a poll link holding this pending, cancel that */ + if (io_poll_remove_link(ctx, req)) + return; + + /* final option, timeout link is holding this req pending */ + io_timeout_remove_link(ctx, req); +} + static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { @@ -8116,10 +8143,8 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, continue; } } else { - io_wq_cancel_work(ctx->io_wq, &cancel_req->work); - /* could be a link, check and remove if it is */ - if (!io_poll_remove_link(ctx, cancel_req)) - io_timeout_remove_link(ctx, cancel_req); + /* cancel this request, or head link requests */ + io_attempt_cancel(ctx, cancel_req); io_put_req(cancel_req); } -- cgit From 3b2a4439e0ae1732f90877a7160bbf42e1beb4b6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 16 Aug 2020 10:58:43 -0700 Subject: io_uring: get rid of kiocb_wait_page_queue_init() The 5.9 merge moved this function io_uring, which means that we don't need to retain the generic nature of it. Clean up this part by removing redundant checks, and just inlining the small remainder in io_rw_should_retry(). No functional changes in this patch. Signed-off-by: Jens Axboe --- fs/io_uring.c | 41 +++++++++++------------------------------ 1 file changed, 11 insertions(+), 30 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 346a3eb84785..4b102d9ad846 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3074,27 +3074,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, return 1; } -static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb, - struct wait_page_queue *wait, - wait_queue_func_t func, - void *data) -{ - /* Can't support async wakeup with polled IO */ - if (kiocb->ki_flags & IOCB_HIPRI) - return -EINVAL; - if (kiocb->ki_filp->f_mode & FMODE_BUF_RASYNC) { - wait->wait.func = func; - wait->wait.private = data; - wait->wait.flags = 0; - INIT_LIST_HEAD(&wait->wait.entry); - kiocb->ki_flags |= IOCB_WAITQ; - kiocb->ki_waitq = wait; - return 0; - } - - return -EOPNOTSUPP; -} - /* * This controls whether a given IO request should be armed for async page * based retry. If we return false here, the request is handed to the async @@ -3109,16 +3088,17 @@ static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb, */ static bool io_rw_should_retry(struct io_kiocb *req) { + struct wait_page_queue *wait = &req->io->rw.wpq; struct kiocb *kiocb = &req->rw.kiocb; - int ret; /* never retry for NOWAIT, we just complete with -EAGAIN */ if (req->flags & REQ_F_NOWAIT) return false; /* Only for buffered IO */ - if (kiocb->ki_flags & IOCB_DIRECT) + if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) return false; + /* * just use poll if we can, and don't attempt if the fs doesn't * support callback based unlocks @@ -3126,14 +3106,15 @@ static bool io_rw_should_retry(struct io_kiocb *req) if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) return false; - ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq, - io_async_buf_func, req); - if (!ret) { - io_get_req_task(req); - return true; - } + wait->wait.func = io_async_buf_func; + wait->wait.private = req; + wait->wait.flags = 0; + INIT_LIST_HEAD(&wait->wait.entry); + kiocb->ki_flags |= IOCB_WAITQ; + kiocb->ki_waitq = wait; - return false; + io_get_req_task(req); + return true; } static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) -- cgit From 8452fd0ce657a4313dfa784f11320971b67727fd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 18 Aug 2020 13:58:33 -0700 Subject: io_uring: cleanup io_import_iovec() of pre-mapped request io_rw_prep_async() goes through a dance of clearing req->io, calling the iovec import, then re-setting req->io. Provide an internal helper that does the right thing without needing state tweaked to get there. This enables further cleanups in io_read, io_write, and io_resubmit_prep(), but that's left for another time. Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4b102d9ad846..e325895d681b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2819,22 +2819,15 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, return __io_iov_buffer_select(req, iov, needs_lock); } -static ssize_t io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct iov_iter *iter, - bool needs_lock) +static ssize_t __io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct iov_iter *iter, + bool needs_lock) { void __user *buf = u64_to_user_ptr(req->rw.addr); size_t sqe_len = req->rw.len; ssize_t ret; u8 opcode; - if (req->io) { - struct io_async_rw *iorw = &req->io->rw; - - *iovec = NULL; - return iov_iter_count(&iorw->iter); - } - opcode = req->opcode; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { *iovec = NULL; @@ -2879,6 +2872,16 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); } +static ssize_t io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct iov_iter *iter, + bool needs_lock) +{ + if (!req->io) + return __io_import_iovec(rw, req, iovec, iter, needs_lock); + *iovec = NULL; + return iov_iter_count(&req->io->rw.iter); +} + /* * For files that don't have ->read_iter() and ->write_iter(), handle them * by looping over ->read() or ->write() manually. @@ -3001,11 +3004,8 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw, ssize_t ret; iorw->iter.iov = iorw->fast_iov; - /* reset ->io around the iovec import, we don't want to use it */ - req->io = NULL; - ret = io_import_iovec(rw, req, (struct iovec **) &iorw->iter.iov, + ret = __io_import_iovec(rw, req, (struct iovec **) &iorw->iter.iov, &iorw->iter, !force_nonblock); - req->io = container_of(iorw, struct io_async_ctx, rw); if (unlikely(ret < 0)) return ret; -- cgit From fc666777da9da387354b1a142a6ffc5d43bc4f7e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Aug 2020 11:10:51 -0600 Subject: io_uring: use system_unbound_wq for ring exit work We currently use system_wq, which is unbounded in terms of number of workers. This means that if we're exiting tons of rings at the same time, then we'll briefly spawn tons of event kworkers just for a very short blocking time as the rings exit. Use system_unbound_wq instead, which has a sane cap on the concurrency level. Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index e325895d681b..1f2f31d93686 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7960,7 +7960,13 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) ACCT_LOCKED); INIT_WORK(&ctx->exit_work, io_ring_exit_work); - queue_work(system_wq, &ctx->exit_work); + /* + * Use system_unbound_wq to avoid spawning tons of event kworkers + * if we're exiting a ton of rings at the same time. It just adds + * noise and overhead, there's no discernable change in runtime + * over using system_wq. + */ + queue_work(system_unbound_wq, &ctx->exit_work); } static int io_uring_release(struct inode *inode, struct file *file) -- cgit From bb175342aa64e6c6f1d04f5235502121d6ff0247 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 20 Aug 2020 11:33:35 +0300 Subject: io_uring: fix racy req->flags modification Setting and clearing REQ_F_OVERFLOW in io_uring_cancel_files() and io_cqring_overflow_flush() are racy, because they might be called asynchronously. REQ_F_OVERFLOW flag in only needed for files cancellation, so if it can be guaranteed that requests _currently_ marked inflight can't be overflown, the problem will be solved with removing the flag altogether. That's how the patch works, it removes inflight status of a request in io_cqring_fill_event() whenever it should be thrown into CQ-overflow list. That's Ok to do, because no opcode specific handling can be done after io_cqring_fill_event(), the same assumption as with "struct io_completion" patches. And it already have a good place for such cleanups, which is io_clean_op(). A nice side effect of this is removing this inflight check from the hot path. note on synchronisation: now __io_cqring_fill_event() may be taking two spinlocks simultaneously, completion_lock and inflight_lock. It's fine, because we never do that in reverse order, and CQ-overflow of inflight requests shouldn't happen often. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 61 +++++++++++++++++------------------------------------------ 1 file changed, 17 insertions(+), 44 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1f2f31d93686..d43a20a554e4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -540,7 +540,6 @@ enum { REQ_F_ISREG_BIT, REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, - REQ_F_OVERFLOW_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, REQ_F_NO_FILE_TABLE_BIT, @@ -583,8 +582,6 @@ enum { REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), /* needs cleanup */ REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), - /* in overflow list */ - REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT), /* already went through poll handler */ REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), /* buffer already selected */ @@ -946,7 +943,8 @@ static void io_get_req_task(struct io_kiocb *req) static inline void io_clean_op(struct io_kiocb *req) { - if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED)) + if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | + REQ_F_INFLIGHT)) __io_clean_op(req); } @@ -1366,7 +1364,6 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb, compl.list); list_move(&req->compl.list, &list); - req->flags &= ~REQ_F_OVERFLOW; if (cqe) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, req->result); @@ -1419,7 +1416,6 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; } io_clean_op(req); - req->flags |= REQ_F_OVERFLOW; req->result = res; req->compl.cflags = cflags; refcount_inc(&req->refs); @@ -1563,17 +1559,6 @@ static bool io_dismantle_req(struct io_kiocb *req) if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - if (req->flags & REQ_F_INFLIGHT) { - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->inflight_lock, flags); - list_del(&req->inflight_entry); - if (waitqueue_active(&ctx->inflight_wait)) - wake_up(&ctx->inflight_wait); - spin_unlock_irqrestore(&ctx->inflight_lock, flags); - } - return io_req_clean_work(req); } @@ -5634,6 +5619,18 @@ static void __io_clean_op(struct io_kiocb *req) } req->flags &= ~REQ_F_NEED_CLEANUP; } + + if (req->flags & REQ_F_INFLIGHT) { + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->inflight_lock, flags); + list_del(&req->inflight_entry); + if (waitqueue_active(&ctx->inflight_wait)) + wake_up(&ctx->inflight_wait); + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + req->flags &= ~REQ_F_INFLIGHT; + } } static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, @@ -8108,33 +8105,9 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, /* We need to keep going until we don't find a matching req */ if (!cancel_req) break; - - if (cancel_req->flags & REQ_F_OVERFLOW) { - spin_lock_irq(&ctx->completion_lock); - list_del(&cancel_req->compl.list); - cancel_req->flags &= ~REQ_F_OVERFLOW; - - io_cqring_mark_overflow(ctx); - WRITE_ONCE(ctx->rings->cq_overflow, - atomic_inc_return(&ctx->cached_cq_overflow)); - io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); - - /* - * Put inflight ref and overflow ref. If that's - * all we had, then we're done with this request. - */ - if (refcount_sub_and_test(2, &cancel_req->refs)) { - io_free_req(cancel_req); - finish_wait(&ctx->inflight_wait, &wait); - continue; - } - } else { - /* cancel this request, or head link requests */ - io_attempt_cancel(ctx, cancel_req); - io_put_req(cancel_req); - } - + /* cancel this request, or head link requests */ + io_attempt_cancel(ctx, cancel_req); + io_put_req(cancel_req); schedule(); finish_wait(&ctx->inflight_wait, &wait); } -- cgit From f261c16861b82951bd2125706660ce4602930563 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 20 Aug 2020 11:34:10 +0300 Subject: io_uring: comment on kfree(iovec) checks kfree() handles NULL pointers well, but io_{read,write}() checks it because of performance reasons. Leave a comment there for those who are tempted to patch it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index d43a20a554e4..b9ca5a54dc20 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3204,6 +3204,7 @@ done: kiocb_done(kiocb, ret, cs); ret = 0; out_free: + /* it's reportedly faster than delegating the null check to kfree() */ if (iovec) kfree(iovec); return ret; @@ -3300,6 +3301,7 @@ copy_iov: return -EAGAIN; } out_free: + /* it's reportedly faster than delegating the null check to kfree() */ if (iovec) kfree(iovec); return ret; -- cgit From 867a23eab52847d41a0a6eae41a64d76de7782a8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 20 Aug 2020 11:34:39 +0300 Subject: io_uring: kill extra iovec=NULL in import_iovec() If io_import_iovec() returns an error, return iovec is undefined and must not be used, so don't set it to NULL when failing. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index b9ca5a54dc20..91e2cc8414f9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2826,10 +2826,8 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req, if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { if (req->flags & REQ_F_BUFFER_SELECT) { buf = io_rw_buffer_select(req, &sqe_len, needs_lock); - if (IS_ERR(buf)) { - *iovec = NULL; + if (IS_ERR(buf)) return PTR_ERR(buf); - } req->rw.len = sqe_len; } -- cgit