From cbd287c09351f1d3a4b3cb9167a2616a11390d32 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 Aug 2020 17:06:21 -0600 Subject: io_uring: io_async_buf_func() need not test page bit Since we don't do exclusive waits or wakeups, we know that the bit is always going to be set. Kill the test. Also see commit: 2a9127fcf229 ("mm: rewrite wait_on_page_bit_common() logic") Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2a3af95be4ca..bb4f0b2d5138 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2965,10 +2965,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, if (!wake_page_match(wpq, key)) return 0; - /* Stop waking things up if the page is locked again */ - if (test_bit(key->bit_nr, &key->page->flags)) - return -1; - list_del_init(&wait->entry); init_task_work(&req->task_work, io_req_task_submit); -- cgit From c1dd91d16246b168b80af9b64c5cc35a66410455 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 Aug 2020 16:43:59 -0600 Subject: io_uring: add comments on how the async buffered read retry works The retry based logic here isn't easy to follow unless you're already familiar with how io_uring does task_work based retries. Add some comments explaining the flow a little better. Suggested-by: Linus Torvalds Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index bb4f0b2d5138..5e1c08e22990 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2952,6 +2952,16 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return io_rw_prep_async(req, READ, force_nonblock); } +/* + * This is our waitqueue callback handler, registered through lock_page_async() + * when we initially tried to do the IO with the iocb armed our waitqueue. + * This gets called when the page is unlocked, and we generally expect that to + * happen when the page IO is completed and the page is now uptodate. This will + * queue a task_work based retry of the operation, attempting to copy the data + * again. If the latter fails because the page was NOT uptodate, then we will + * do a thread based blocking retry of the operation. That's the unexpected + * slow path. + */ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, int sync, void *arg) { @@ -3004,7 +3014,18 @@ static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb, return -EOPNOTSUPP; } - +/* + * This controls whether a given IO request should be armed for async page + * based retry. If we return false here, the request is handed to the async + * worker threads for retry. If we're doing buffered reads on a regular file, + * we prepare a private wait_page_queue entry and retry the operation. This + * will either succeed because the page is now uptodate and unlocked, or it + * will register a callback when the page is unlocked at IO completion. Through + * that callback, io_uring uses task_work to setup a retry of the operation. + * That retry will attempt the buffered read again. The retry will generally + * succeed, or in rare cases where it fails, we then fall back to using the + * async worker threads for a blocking retry. 
+ */ static bool io_rw_should_retry(struct io_kiocb *req) { struct kiocb *kiocb = &req->rw.kiocb; -- cgit From 2dd2111d0d383df104b144e0d1f6b5a00cb7cd88 Mon Sep 17 00:00:00 2001 From: Guoyu Huang Date: Wed, 5 Aug 2020 03:53:50 -0700 Subject: io_uring: Fix NULL pointer dereference in loop_rw_iter() loop_rw_iter() does not check whether the file has a read or write function. This can lead to NULL pointer dereference when the user passes in a file descriptor that does not have read or write function. The crash log looks like this: [ 99.834071] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 99.835364] #PF: supervisor instruction fetch in kernel mode [ 99.836522] #PF: error_code(0x0010) - not-present page [ 99.837771] PGD 8000000079d62067 P4D 8000000079d62067 PUD 79d8c067 PMD 0 [ 99.839649] Oops: 0010 [#2] SMP PTI [ 99.840591] CPU: 1 PID: 333 Comm: io_wqe_worker-0 Tainted: G D 5.8.0 #2 [ 99.842622] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1 04/01/2014 [ 99.845140] RIP: 0010:0x0 [ 99.845840] Code: Bad RIP value. [ 99.846672] RSP: 0018:ffffa1c7c01ebc08 EFLAGS: 00010202 [ 99.848018] RAX: 0000000000000000 RBX: ffff92363bd67300 RCX: ffff92363d461208 [ 99.849854] RDX: 0000000000000010 RSI: 00007ffdbf696bb0 RDI: ffff92363bd67300 [ 99.851743] RBP: ffffa1c7c01ebc40 R08: 0000000000000000 R09: 0000000000000000 [ 99.853394] R10: ffffffff9ec692a0 R11: 0000000000000000 R12: 0000000000000010 [ 99.855148] R13: 0000000000000000 R14: ffff92363d461208 R15: ffffa1c7c01ebc68 [ 99.856914] FS: 0000000000000000(0000) GS:ffff92363dd00000(0000) knlGS:0000000000000000 [ 99.858651] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 99.860032] CR2: ffffffffffffffd6 CR3: 000000007ac66000 CR4: 00000000000006e0 [ 99.861979] Call Trace: [ 99.862617] loop_rw_iter.part.0+0xad/0x110 [ 99.863838] io_write+0x2ae/0x380 [ 99.864644] ? kvm_sched_clock_read+0x11/0x20 [ 99.865595] ? sched_clock+0x9/0x10 [ 99.866453] ? sched_clock_cpu+0x11/0xb0 [ 99.867326] ? newidle_balance+0x1d4/0x3c0 [ 99.868283] io_issue_sqe+0xd8f/0x1340 [ 99.869216] ? __switch_to+0x7f/0x450 [ 99.870280] ? __switch_to_asm+0x42/0x70 [ 99.871254] ? __switch_to_asm+0x36/0x70 [ 99.872133] ? lock_timer_base+0x72/0xa0 [ 99.873155] ? switch_mm_irqs_off+0x1bf/0x420 [ 99.874152] io_wq_submit_work+0x64/0x180 [ 99.875192] ? kthread_use_mm+0x71/0x100 [ 99.876132] io_worker_handle_work+0x267/0x440 [ 99.877233] io_wqe_worker+0x297/0x350 [ 99.878145] kthread+0x112/0x150 [ 99.878849] ? __io_worker_unuse+0x100/0x100 [ 99.879935] ? kthread_park+0x90/0x90 [ 99.880874] ret_from_fork+0x22/0x30 [ 99.881679] Modules linked in: [ 99.882493] CR2: 0000000000000000 [ 99.883324] ---[ end trace 4453745f4673190b ]--- [ 99.884289] RIP: 0010:0x0 [ 99.884837] Code: Bad RIP value. 
[ 99.885492] RSP: 0018:ffffa1c7c01ebc08 EFLAGS: 00010202 [ 99.886851] RAX: 0000000000000000 RBX: ffff92363acd7f00 RCX: ffff92363d461608 [ 99.888561] RDX: 0000000000000010 RSI: 00007ffe040d9e10 RDI: ffff92363acd7f00 [ 99.890203] RBP: ffffa1c7c01ebc40 R08: 0000000000000000 R09: 0000000000000000 [ 99.891907] R10: ffffffff9ec692a0 R11: 0000000000000000 R12: 0000000000000010 [ 99.894106] R13: 0000000000000000 R14: ffff92363d461608 R15: ffffa1c7c01ebc68 [ 99.896079] FS: 0000000000000000(0000) GS:ffff92363dd00000(0000) knlGS:0000000000000000 [ 99.898017] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 99.899197] CR2: ffffffffffffffd6 CR3: 000000007ac66000 CR4: 00000000000006e0 Fixes: 32960613b7c3 ("io_uring: correctly handle non ->{read,write}_iter() file_operations") Cc: stable@vger.kernel.org Signed-off-by: Guoyu Huang Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5e1c08e22990..8f96566603f3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3066,7 +3066,10 @@ static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) { if (req->file->f_op->read_iter) return call_read_iter(req->file, &req->rw.kiocb, iter); - return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter); + else if (req->file->f_op->read) + return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter); + else + return -EINVAL; } static int io_read(struct io_kiocb *req, bool force_nonblock, @@ -3203,8 +3206,10 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, if (req->file->f_op->write_iter) ret2 = call_write_iter(req->file, kiocb, &iter); - else + else if (req->file->f_op->write) ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); + else + ret2 = -EINVAL; /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just -- cgit From bd74048108c179cea0ff52979506164c80f29da7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 5 Aug 2020 12:58:23 -0600 Subject: io_uring: set ctx sq/cq entry count earlier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we hit an earlier error path in io_uring_create(), then we will have accounted memory, but not set ctx->{sq,cq}_entries yet. Then when the ring is torn down in error, we use those values to unaccount the memory. Ensure we set the ctx entries before we're able to hit a potential error path. 
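To make the ordering concrete, here is a minimal userspace sketch of the same rule. None of these names (struct ring, ring_create, ring_teardown, accounted) exist in the kernel; they only illustrate that any value the teardown path consumes must already be valid before the first error exit that can reach teardown.

#include <stdio.h>
#include <stdlib.h>

struct ring {
    unsigned sq_entries;
    unsigned cq_entries;
    void *mem;
};

static long accounted;

static void ring_teardown(struct ring *r)
{
    /* un-account based on the entry counts, whether or not setup finished */
    accounted -= r->sq_entries + r->cq_entries;
    free(r->mem);
}

static int ring_create(struct ring *r, unsigned sq, unsigned cq)
{
    accounted += sq + cq;       /* accounted up front, before allocation */

    /* set the counts before any error exit, so teardown stays balanced */
    r->sq_entries = sq;
    r->cq_entries = cq;

    r->mem = calloc(sq + cq, 64);
    if (!r->mem) {
        ring_teardown(r);       /* error path still un-accounts correctly */
        return -1;
    }
    return 0;
}

int main(void)
{
    struct ring r;

    if (ring_create(&r, 128, 256) == 0)
        ring_teardown(&r);
    printf("accounted after teardown: %ld\n", accounted);
    return 0;
}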
Cc: stable@vger.kernel.org Reported-by: Tomáš Chaloupka Tested-by: Tomáš Chaloupka Reviewed-by: Stefano Garzarella Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8f96566603f3..0d857f7ca507 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8193,6 +8193,10 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_rings *rings; size_t size, sq_array_offset; + /* make sure these are sane, as we already accounted them */ + ctx->sq_entries = p->sq_entries; + ctx->cq_entries = p->cq_entries; + size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); if (size == SIZE_MAX) return -EOVERFLOW; @@ -8209,8 +8213,6 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, rings->cq_ring_entries = p->cq_entries; ctx->sq_mask = rings->sq_ring_mask; ctx->cq_mask = rings->cq_ring_mask; - ctx->sq_entries = rings->sq_ring_entries; - ctx->cq_entries = rings->cq_ring_entries; size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { -- cgit From f74441e6311a28f0ee89b9c8e296a33730f812fc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 5 Aug 2020 13:00:44 -0600 Subject: io_uring: account locked memory before potential error case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The tear down path will always unaccount the memory, so ensure that we have accounted it before hitting any of them. Reported-by: Tomáš Chaloupka Reviewed-by: Stefano Garzarella Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0d857f7ca507..e9b27cdaa735 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8341,6 +8341,16 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->user = user; ctx->creds = get_current_cred(); + /* + * Account memory _before_ installing the file descriptor. Once + * the descriptor is installed, it can get closed at any time. Also + * do this before hitting the general error path, as ring freeing + * will un-account as well. + */ + io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries), + ACCT_LOCKED); + ctx->limit_mem = limit_mem; + ret = io_allocate_scq_urings(ctx, p); if (ret) goto err; @@ -8377,14 +8387,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; } - /* - * Account memory _before_ installing the file descriptor. Once - * the descriptor is installed, it can get closed at any time. - */ - io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries), - ACCT_LOCKED); - ctx->limit_mem = limit_mem; - /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup -- cgit From 0ba9c9edcd152158a0e321a4c13ac1dfc571ff3d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 6 Aug 2020 19:41:50 -0600 Subject: io_uring: use TWA_SIGNAL for task_work uncondtionally An earlier commit: b7db41c9e03b ("io_uring: fix regression with always ignoring signals in io_cqring_wait()") ensured that we didn't get stuck waiting for eventfd reads when it's registered with the io_uring ring for event notification, but we still have cases where the task can be waiting on other events in the kernel and need a bigger nudge to make forward progress. Or the task could be in the kernel and running, but on its way to blocking. This means that TWA_RESUME cannot reliably be used to ensure we make progress. 
Use TWA_SIGNAL unconditionally. Cc: stable@vger.kernel.org # v5.7+ Reported-by: Josef Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e9b27cdaa735..af6811ddcfbd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1710,22 +1710,22 @@ static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) { struct task_struct *tsk = req->task; struct io_ring_ctx *ctx = req->ctx; - int ret, notify = TWA_RESUME; + int ret, notify; /* - * SQPOLL kernel thread doesn't need notification, just a wakeup. - * If we're not using an eventfd, then TWA_RESUME is always fine, - * as we won't have dependencies between request completions for - * other kernel wait conditions. + * SQPOLL kernel thread doesn't need notification, just a wakeup. For + * all other cases, use TWA_SIGNAL unconditionally to ensure we're + * processing task_work. There's no reliable way to tell if TWA_RESUME + * will do the job. */ - if (ctx->flags & IORING_SETUP_SQPOLL) - notify = 0; - else if (ctx->cq_ev_fd) + notify = 0; + if (!(ctx->flags & IORING_SETUP_SQPOLL)) notify = TWA_SIGNAL; ret = task_work_add(tsk, cb, notify); if (!ret) wake_up_process(tsk); + return ret; } -- cgit From 7271ef3a93a832180068c7aade3f130b7f39b17e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Aug 2020 09:55:22 -0600 Subject: io_uring: fix recursive completion locking on oveflow flush syszbot reports a scenario where we recurse on the completion lock when flushing an overflow: 1 lock held by syz-executor287/6816: #0: ffff888093cdb4d8 (&ctx->completion_lock){....}-{2:2}, at: io_cqring_overflow_flush+0xc6/0xab0 fs/io_uring.c:1333 stack backtrace: CPU: 1 PID: 6816 Comm: syz-executor287 Not tainted 5.8.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1f0/0x31e lib/dump_stack.c:118 print_deadlock_bug kernel/locking/lockdep.c:2391 [inline] check_deadlock kernel/locking/lockdep.c:2432 [inline] validate_chain+0x69a4/0x88a0 kernel/locking/lockdep.c:3202 __lock_acquire+0x1161/0x2ab0 kernel/locking/lockdep.c:4426 lock_acquire+0x160/0x730 kernel/locking/lockdep.c:5005 __raw_spin_lock_irq include/linux/spinlock_api_smp.h:128 [inline] _raw_spin_lock_irq+0x67/0x80 kernel/locking/spinlock.c:167 spin_lock_irq include/linux/spinlock.h:379 [inline] io_queue_linked_timeout fs/io_uring.c:5928 [inline] __io_queue_async_work fs/io_uring.c:1192 [inline] __io_queue_deferred+0x36a/0x790 fs/io_uring.c:1237 io_cqring_overflow_flush+0x774/0xab0 fs/io_uring.c:1359 io_ring_ctx_wait_and_kill+0x2a1/0x570 fs/io_uring.c:7808 io_uring_release+0x59/0x70 fs/io_uring.c:7829 __fput+0x34f/0x7b0 fs/file_table.c:281 task_work_run+0x137/0x1c0 kernel/task_work.c:135 exit_task_work include/linux/task_work.h:25 [inline] do_exit+0x5f3/0x1f20 kernel/exit.c:806 do_group_exit+0x161/0x2d0 kernel/exit.c:903 __do_sys_exit_group+0x13/0x20 kernel/exit.c:914 __se_sys_exit_group+0x10/0x10 kernel/exit.c:912 __x64_sys_exit_group+0x37/0x40 kernel/exit.c:912 do_syscall_64+0x31/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fix this by passing back the link from __io_queue_async_work(), and then let the caller handle the queueing of the link. Take care to also punt the submission reference put to the caller, as we're holding the completion lock for the __io_queue_defer() case. Hence we need to mark the io_kiocb appropriately for that case. 
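As a rough pthreads sketch of the locking pattern (illustrative only; queue_async_work, queue_linked_timeout and the other names are stand-ins, not the kernel functions): a helper that can be reached with the completion lock already held must not take that lock itself, so it hands the follow-up work back to its caller, and the caller picks the locked or unlocked variant.

#include <pthread.h>
#include <stddef.h>

static pthread_mutex_t completion_lock = PTHREAD_MUTEX_INITIALIZER;

struct req {
    struct req *link;           /* optional linked timeout */
};

/* enqueue for async execution; return any link instead of queueing it here */
static struct req *queue_async_work(struct req *r)
{
    /* ... hand r to the worker pool ... */
    return r->link;
}

static void queue_linked_timeout_locked(struct req *link)
{
    /* caller already holds completion_lock; just arm the timer */
    (void)link;
}

static void queue_linked_timeout(struct req *link)
{
    pthread_mutex_lock(&completion_lock);
    queue_linked_timeout_locked(link);
    pthread_mutex_unlock(&completion_lock);
}

/* normal submission path: the lock is not held, take it around the link */
static void queue_work(struct req *r)
{
    struct req *link = queue_async_work(r);

    if (link)
        queue_linked_timeout(link);
}

/* deferred-flush path: completion_lock is already held by the caller */
static void queue_deferred_locked(struct req *r)
{
    struct req *link = queue_async_work(r);

    if (link)
        queue_linked_timeout_locked(link);
}

int main(void)
{
    struct req timeout = { .link = NULL };
    struct req r = { .link = &timeout };

    queue_work(&r);

    pthread_mutex_lock(&completion_lock);
    queue_deferred_locked(&r);
    pthread_mutex_unlock(&completion_lock);
    return 0;
}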
Reported-by: syzbot+996f91b6ec3812c48042@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index af6811ddcfbd..360649041bfa 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -898,6 +898,7 @@ static void io_put_req(struct io_kiocb *req); static void io_double_put_req(struct io_kiocb *req); static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); +static void __io_queue_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, @@ -1179,7 +1180,7 @@ static void io_prep_async_link(struct io_kiocb *req) io_prep_async_work(cur); } -static void __io_queue_async_work(struct io_kiocb *req) +static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link = io_prep_linked_timeout(req); @@ -1187,16 +1188,19 @@ static void __io_queue_async_work(struct io_kiocb *req) trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, &req->work, req->flags); io_wq_enqueue(ctx->io_wq, &req->work); - - if (link) - io_queue_linked_timeout(link); + return link; } static void io_queue_async_work(struct io_kiocb *req) { + struct io_kiocb *link; + /* init ->work of the whole link before punting */ io_prep_async_link(req); - __io_queue_async_work(req); + link = __io_queue_async_work(req); + + if (link) + io_queue_linked_timeout(link); } static void io_kill_timeout(struct io_kiocb *req) @@ -1229,12 +1233,19 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx) do { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); + struct io_kiocb *link; if (req_need_defer(de->req, de->seq)) break; list_del_init(&de->list); /* punt-init is done before queueing for defer */ - __io_queue_async_work(de->req); + link = __io_queue_async_work(de->req); + if (link) { + __io_queue_linked_timeout(link); + /* drop submission reference */ + link->flags |= REQ_F_COMP_LOCKED; + io_put_req(link); + } kfree(de); } while (!list_empty(&ctx->defer_list)); } @@ -5939,15 +5950,12 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void io_queue_linked_timeout(struct io_kiocb *req) +static void __io_queue_linked_timeout(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - /* * If the list is now empty, then our linked request finished before * we got a chance to setup the timer */ - spin_lock_irq(&ctx->completion_lock); if (!list_empty(&req->link_list)) { struct io_timeout_data *data = &req->io->timeout; @@ -5955,6 +5963,14 @@ static void io_queue_linked_timeout(struct io_kiocb *req) hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); } +} + +static void io_queue_linked_timeout(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->completion_lock); + __io_queue_linked_timeout(req); spin_unlock_irq(&ctx->completion_lock); /* drop submission reference */ -- cgit From 9b7adba9eaec28e0e4343c96d0dbeb9578802f5f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Aug 2020 10:54:02 -0600 Subject: io_uring: add missing REQ_F_COMP_LOCKED for nested requests When we traverse into failing links or timeouts, we need to ensure we propagate the REQ_F_COMP_LOCKED flag to ensure 
that we correctly signal to the completion side that we already hold the completion lock. Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 360649041bfa..56115cb4b9fa 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1609,6 +1609,7 @@ static bool __io_kill_linked_timeout(struct io_kiocb *req) return false; list_del_init(&link->link_list); + link->flags |= REQ_F_COMP_LOCKED; wake_ev = io_link_cancel_timeout(link); req->flags &= ~REQ_F_LINK_TIMEOUT; return wake_ev; @@ -1667,6 +1668,7 @@ static void __io_fail_links(struct io_kiocb *req) trace_io_uring_fail_link(req, link); io_cqring_fill_event(link, -ECANCELED); + link->flags |= REQ_F_COMP_LOCKED; __io_double_put_req(link); req->flags &= ~REQ_F_LINK_TIMEOUT; } @@ -5071,6 +5073,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) return -EALREADY; req_set_fail_links(req); + req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, -ECANCELED); io_put_req(req); return 0; -- cgit From 51a4cc112c7a42b62a91bcccdfac42e7c4561729 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Aug 2020 10:55:56 -0600 Subject: io_uring: defer file table grabbing request cleanup for locked requests If we're in the error path failing links and we have a link that has grabbed a reference to the fs_struct, then we cannot safely drop our reference to the table if we already hold the completion lock. This adds a hardirq dependency to the fs_struct->lock, which it currently doesn't have. Defer the final cleanup and free of such requests to avoid adding this dependency. Reported-by: syzbot+ef4b654b49ed7ff049bf@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 56115cb4b9fa..5488698189da 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1108,10 +1108,16 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static void io_req_clean_work(struct io_kiocb *req) +/* + * Returns true if we need to defer file table putting. This can only happen + * from the error path with REQ_F_COMP_LOCKED set. 
+ */ +static bool io_req_clean_work(struct io_kiocb *req) { if (!(req->flags & REQ_F_WORK_INITIALIZED)) - return; + return false; + + req->flags &= ~REQ_F_WORK_INITIALIZED; if (req->work.mm) { mmdrop(req->work.mm); @@ -1124,6 +1130,9 @@ static void io_req_clean_work(struct io_kiocb *req) if (req->work.fs) { struct fs_struct *fs = req->work.fs; + if (req->flags & REQ_F_COMP_LOCKED) + return true; + spin_lock(&req->work.fs->lock); if (--fs->users) fs = NULL; @@ -1132,7 +1141,8 @@ static void io_req_clean_work(struct io_kiocb *req) free_fs_struct(fs); req->work.fs = NULL; } - req->flags &= ~REQ_F_WORK_INITIALIZED; + + return false; } static void io_prep_async_work(struct io_kiocb *req) @@ -1544,7 +1554,7 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file, fput(file); } -static void io_dismantle_req(struct io_kiocb *req) +static bool io_dismantle_req(struct io_kiocb *req) { io_clean_op(req); @@ -1552,7 +1562,6 @@ static void io_dismantle_req(struct io_kiocb *req) kfree(req->io); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - io_req_clean_work(req); if (req->flags & REQ_F_INFLIGHT) { struct io_ring_ctx *ctx = req->ctx; @@ -1564,15 +1573,15 @@ static void io_dismantle_req(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); } + + return io_req_clean_work(req); } -static void __io_free_req(struct io_kiocb *req) +static void __io_free_req_finish(struct io_kiocb *req) { - struct io_ring_ctx *ctx; + struct io_ring_ctx *ctx = req->ctx; - io_dismantle_req(req); __io_put_req_task(req); - ctx = req->ctx; if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); else @@ -1580,6 +1589,39 @@ static void __io_free_req(struct io_kiocb *req) percpu_ref_put(&ctx->refs); } +static void io_req_task_file_table_put(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct fs_struct *fs = req->work.fs; + + spin_lock(&req->work.fs->lock); + if (--fs->users) + fs = NULL; + spin_unlock(&req->work.fs->lock); + if (fs) + free_fs_struct(fs); + req->work.fs = NULL; + __io_free_req_finish(req); +} + +static void __io_free_req(struct io_kiocb *req) +{ + if (!io_dismantle_req(req)) { + __io_free_req_finish(req); + } else { + int ret; + + init_task_work(&req->task_work, io_req_task_file_table_put); + ret = task_work_add(req->task, &req->task_work, TWA_RESUME); + if (unlikely(ret)) { + struct task_struct *tsk; + + tsk = io_wq_get_task(req->ctx->io_wq); + task_work_add(tsk, &req->task_work, 0); + } + } +} + static bool io_link_cancel_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1868,7 +1910,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) req->flags &= ~REQ_F_TASK_PINNED; } - io_dismantle_req(req); + WARN_ON_ONCE(io_dismantle_req(req)); rb->reqs[rb->to_free++] = req; if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) __io_req_free_batch_flush(req->ctx, rb); -- cgit From efa8480a831673bb52400df9dbe5da0aacda97bf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Aug 2020 18:44:24 -0600 Subject: fs: RWF_NOWAIT should imply IOCB_NOIO With the change allowing read-ahead for IOCB_NOWAIT, we changed the RWF_NOWAIT semantics of only doing cached reads. Since we know have IOCB_NOIO to manage that specific side of it, just make RWF_NOWAIT imply IOCB_NOIO as well to restore the previous behavior. 
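Seen from userspace, the restored semantics can be sketched with preadv2() and RWF_NOWAIT (assumes a glibc recent enough to expose both): a read that cannot be served from the page cache fails with EAGAIN instead of starting any I/O, read-ahead included.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    char buf[4096];
    struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
    int fd = open(argc > 1 ? argv[1] : "/etc/hostname", O_RDONLY);
    ssize_t ret;

    if (fd < 0)
        return 1;

    ret = preadv2(fd, &iov, 1, 0, RWF_NOWAIT);
    if (ret < 0 && errno == EAGAIN)
        printf("data not cached; a blocking retry would be needed\n");
    else if (ret >= 0)
        printf("served %zd bytes from the page cache\n", ret);

    close(fd);
    return 0;
}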
Fixes: 2e85abf053b9 ("mm: allow read-ahead with IOCB_NOWAIT set") Reported-by: Dave Chinner Reviewed-by: Dave Chinner Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index bd7ec3eaeed0..f1cca4bfdd7b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3293,7 +3293,7 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) if (flags & RWF_NOWAIT) { if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) return -EOPNOTSUPP; - kiocb_flags |= IOCB_NOWAIT; + kiocb_flags |= IOCB_NOWAIT | IOCB_NOIO; } if (flags & RWF_HIPRI) kiocb_flags |= IOCB_HIPRI; -- cgit From 6d816e088c359866f9867057e04f244c608c42fe Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Aug 2020 08:04:14 -0600 Subject: io_uring: hold 'ctx' reference around task_work queue + execute We're holding the request reference, but we need to go one higher to ensure that the ctx remains valid after the request has finished. If the ring is closed with pending task_work inflight, and the given io_kiocb finishes sync during issue, then we need a reference to the ring itself around the task_work execution cycle. Cc: stable@vger.kernel.org # v5.7+ Reported-by: syzbot+9b260fc33297966f5a8e@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5488698189da..99582cf5106b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1821,8 +1821,10 @@ static void __io_req_task_submit(struct io_kiocb *req) static void io_req_task_submit(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_ring_ctx *ctx = req->ctx; __io_req_task_submit(req); + percpu_ref_put(&ctx->refs); } static void io_req_task_queue(struct io_kiocb *req) @@ -1830,6 +1832,7 @@ static void io_req_task_queue(struct io_kiocb *req) int ret; init_task_work(&req->task_work, io_req_task_submit); + percpu_ref_get(&req->ctx->refs); ret = io_req_task_work_add(req, &req->task_work); if (unlikely(ret)) { @@ -2318,6 +2321,8 @@ static void io_rw_resubmit(struct callback_head *cb) refcount_inc(&req->refs); io_queue_async_work(req); } + + percpu_ref_put(&ctx->refs); } #endif @@ -2330,6 +2335,8 @@ static bool io_rw_reissue(struct io_kiocb *req, long res) return false; init_task_work(&req->task_work, io_rw_resubmit); + percpu_ref_get(&req->ctx->refs); + ret = io_req_task_work_add(req, &req->task_work); if (!ret) return true; @@ -3033,6 +3040,8 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, list_del_init(&wait->entry); init_task_work(&req->task_work, io_req_task_submit); + percpu_ref_get(&req->ctx->refs); + /* submit ref gets dropped, acquire a new one */ refcount_inc(&req->refs); ret = io_req_task_work_add(req, &req->task_work); @@ -4565,6 +4574,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, req->result = mask; init_task_work(&req->task_work, func); + percpu_ref_get(&req->ctx->refs); + /* * If this fails, then the task is exiting. 
When a task exits, the * work gets canceled, so just cancel this request as well instead @@ -4652,11 +4663,13 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) static void io_poll_task_func(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *nxt = NULL; io_poll_task_handler(req, &nxt); if (nxt) __io_req_task_submit(nxt); + percpu_ref_put(&ctx->refs); } static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, @@ -4752,6 +4765,7 @@ static void io_async_task_func(struct callback_head *cb) if (io_poll_rewait(req, &apoll->poll)) { spin_unlock_irq(&ctx->completion_lock); + percpu_ref_put(&ctx->refs); return; } @@ -4767,6 +4781,7 @@ static void io_async_task_func(struct callback_head *cb) else __io_req_task_cancel(req, -ECANCELED); + percpu_ref_put(&ctx->refs); kfree(apoll->double_poll); kfree(apoll); } -- cgit From a36da65c46565d2527eec3efdb546251e38253fd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Aug 2020 09:50:19 -0600 Subject: io_uring: fail poll arm on queue proc failure Check the ipt.error value, it must have been either cleared to zero or set to another error than the default -EINVAL if we don't go through the waitqueue proc addition. Just give up on poll at that point and return failure, this will fallback to async work. io_poll_add() doesn't suffer from this failure case, as it returns the error value directly. Cc: stable@vger.kernel.org # v5.7+ Reported-by: syzbot+a730016dc0bdce4f6ff5@syzkaller.appspotmail.com Reviewed-by: Stefano Garzarella Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 99582cf5106b..8a2afd8c33c9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4883,7 +4883,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, io_async_wake); - if (ret) { + if (ret || ipt.error) { io_poll_remove_double(req, apoll->double_poll); spin_unlock_irq(&ctx->completion_lock); kfree(apoll->double_poll); -- cgit From f254ac04c8744cf7bfed012717eac34eacc65dfb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 12 Aug 2020 17:33:30 -0600 Subject: io_uring: enable lookup of links holding inflight files When a process exits, we cancel whatever requests it has pending that are referencing the file table. However, if a link is holding a reference, then we cannot find it by simply looking at the inflight list. Enable checking of the poll and timeout list to find the link, and cancel it appropriately. 
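The request shape in question can be built from userspace with liburing, sketched below with error handling mostly trimmed: a poll request at the head of a link, with a dependent read queued behind it. Cancelling the dependent request on exit then requires walking the poll and timeout lists to find and cancel the link head, not just the inflight list.

#include <liburing.h>
#include <poll.h>
#include <unistd.h>

int main(void)
{
    struct io_uring ring;
    struct io_uring_sqe *sqe;
    char buf[256];
    int fds[2];

    if (pipe(fds) < 0 || io_uring_queue_init(8, &ring, 0) < 0)
        return 1;

    /* head of the link: wait for readability */
    sqe = io_uring_get_sqe(&ring);
    io_uring_prep_poll_add(sqe, fds[0], POLLIN);
    io_uring_sqe_set_flags(sqe, IOSQE_IO_LINK);

    /* linked request: only runs once the poll completes */
    sqe = io_uring_get_sqe(&ring);
    io_uring_prep_read(sqe, fds[0], buf, sizeof(buf), 0);

    io_uring_submit(&ring);

    /* exiting here leaves the link pending; the kernel must cancel both */
    io_uring_queue_exit(&ring);
    return 0;
}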
Cc: stable@vger.kernel.org Reported-by: Josef Signed-off-by: Jens Axboe --- fs/io_uring.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 87 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8a2afd8c33c9..1ec25ee71372 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4937,6 +4937,7 @@ static bool io_poll_remove_one(struct io_kiocb *req) io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(req->ctx); req->flags |= REQ_F_COMP_LOCKED; + req_set_fail_links(req); io_put_req(req); } @@ -5109,6 +5110,23 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +static int __io_timeout_cancel(struct io_kiocb *req) +{ + int ret; + + list_del_init(&req->timeout.list); + + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); + if (ret == -1) + return -EALREADY; + + req_set_fail_links(req); + req->flags |= REQ_F_COMP_LOCKED; + io_cqring_fill_event(req, -ECANCELED); + io_put_req(req); + return 0; +} + static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) { struct io_kiocb *req; @@ -5116,7 +5134,6 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) list_for_each_entry(req, &ctx->timeout_list, timeout.list) { if (user_data == req->user_data) { - list_del_init(&req->timeout.list); ret = 0; break; } @@ -5125,15 +5142,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (ret == -ENOENT) return ret; - ret = hrtimer_try_to_cancel(&req->io->timeout.timer); - if (ret == -1) - return -EALREADY; - - req_set_fail_links(req); - req->flags |= REQ_F_COMP_LOCKED; - io_cqring_fill_event(req, -ECANCELED); - io_put_req(req); - return 0; + return __io_timeout_cancel(req); } static int io_timeout_remove_prep(struct io_kiocb *req, @@ -7935,6 +7944,71 @@ static bool io_wq_files_match(struct io_wq_work *work, void *data) return work->files == files; } +/* + * Returns true if 'preq' is the link parent of 'req' + */ +static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) +{ + struct io_kiocb *link; + + if (!(preq->flags & REQ_F_LINK_HEAD)) + return false; + + list_for_each_entry(link, &preq->link_list, link_list) { + if (link == req) + return true; + } + + return false; +} + +/* + * We're looking to cancel 'req' because it's holding on to our files, but + * 'req' could be a link to another request. See if it is, and cancel that + * parent request if so. 
+ */ +static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req) +{ + struct hlist_node *tmp; + struct io_kiocb *preq; + bool found = false; + int i; + + spin_lock_irq(&ctx->completion_lock); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry_safe(preq, tmp, list, hash_node) { + found = io_match_link(preq, req); + if (found) { + io_poll_remove_one(preq); + break; + } + } + } + spin_unlock_irq(&ctx->completion_lock); + return found; +} + +static bool io_timeout_remove_link(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + struct io_kiocb *preq; + bool found = false; + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry(preq, &ctx->timeout_list, timeout.list) { + found = io_match_link(preq, req); + if (found) { + __io_timeout_cancel(preq); + break; + } + } + spin_unlock_irq(&ctx->completion_lock); + return found; +} + static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { @@ -7989,6 +8063,9 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, } } else { io_wq_cancel_work(ctx->io_wq, &cancel_req->work); + /* could be a link, check and remove if it is */ + if (!io_poll_remove_link(ctx, cancel_req)) + io_timeout_remove_link(ctx, cancel_req); io_put_req(cancel_req); } -- cgit From ebf0d100df0731901c16632f78d78d35f4123bc4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 13 Aug 2020 09:01:38 -0600 Subject: task_work: only grab task signal lock when needed If JOBCTL_TASK_WORK is already set on the targeted task, then we need not go through {lock,unlock}_task_sighand() to set it again and queue a signal wakeup. This is safe as we're checking it _after_ adding the new task_work with cmpxchg(). The ordering is as follows: task_work_add() get_signal() -------------------------------------------------------------- STORE(task->task_works, new_work); STORE(task->jobctl); mb(); mb(); LOAD(task->jobctl); LOAD(task->task_works); This speeds up TWA_SIGNAL handling quite a bit, which is important now that io_uring is relying on it for all task_work deliveries. Cc: Peter Zijlstra Cc: Jann Horn Acked-by: Oleg Nesterov Signed-off-by: Jens Axboe --- kernel/signal.c | 16 +++++++++++++++- kernel/task_work.c | 8 +++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index 6f16f7c5d375..42b67d2cea37 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2541,7 +2541,21 @@ bool get_signal(struct ksignal *ksig) relock: spin_lock_irq(&sighand->siglock); - current->jobctl &= ~JOBCTL_TASK_WORK; + /* + * Make sure we can safely read ->jobctl() in task_work add. As Oleg + * states: + * + * It pairs with mb (implied by cmpxchg) before READ_ONCE. So we + * roughly have + * + * task_work_add: get_signal: + * STORE(task->task_works, new_work); STORE(task->jobctl); + * mb(); mb(); + * LOAD(task->jobctl); LOAD(task->task_works); + * + * and we can rely on STORE-MB-LOAD [ in task_work_add]. 
+ */ + smp_store_mb(current->jobctl, current->jobctl & ~JOBCTL_TASK_WORK); if (unlikely(current->task_works)) { spin_unlock_irq(&sighand->siglock); task_work_run(); diff --git a/kernel/task_work.c b/kernel/task_work.c index 5c0848ca1287..613b2d634af8 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -42,7 +42,13 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify) set_notify_resume(task); break; case TWA_SIGNAL: - if (lock_task_sighand(task, &flags)) { + /* + * Only grab the sighand lock if we don't already have some + * task_work pending. This pairs with the smp_store_mb() + * in get_signal(), see comment there. + */ + if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) && + lock_task_sighand(task, &flags)) { task->jobctl |= JOBCTL_TASK_WORK; signal_wake_up(task, 0); unlock_task_sighand(task, &flags); -- cgit From ff6165b2d7f66fccb7095a60ccc7a80813858665 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 13 Aug 2020 09:47:43 -0600 Subject: io_uring: retain iov_iter state over io_read/io_write calls Instead of maintaining (and setting/remembering) iov_iter size and segment counts, just put the iov_iter in the async part of the IO structure. This is mostly a preparation patch for doing appropriate internal retries for short reads, but it also cleans up the state handling nicely and simplifies it quite a bit. Signed-off-by: Jens Axboe --- fs/io_uring.c | 136 ++++++++++++++++++++++++++++++---------------------------- 1 file changed, 70 insertions(+), 66 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1ec25ee71372..584f86178671 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -508,9 +508,8 @@ struct io_async_msghdr { struct io_async_rw { struct iovec fast_iov[UIO_FASTIOV]; - struct iovec *iov; - ssize_t nr_segs; - ssize_t size; + const struct iovec *free_iovec; + struct iov_iter iter; struct wait_page_queue wpq; }; @@ -915,8 +914,8 @@ static void io_file_put_work(struct work_struct *work); static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter, bool needs_lock); -static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, - struct iovec *iovec, struct iovec *fast_iov, +static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, + const struct iovec *fast_iov, struct iov_iter *iter); static struct kmem_cache *req_cachep; @@ -2299,7 +2298,7 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error) ret = io_import_iovec(rw, req, &iovec, &iter, false); if (ret < 0) goto end_req; - ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter); + ret = io_setup_async_rw(req, iovec, inline_vecs, &iter); if (!ret) return true; kfree(iovec); @@ -2820,6 +2819,13 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, ssize_t ret; u8 opcode; + if (req->io) { + struct io_async_rw *iorw = &req->io->rw; + + *iovec = NULL; + return iov_iter_count(&iorw->iter); + } + opcode = req->opcode; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { *iovec = NULL; @@ -2845,14 +2851,6 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return ret < 0 ? 
ret : sqe_len; } - if (req->io) { - struct io_async_rw *iorw = &req->io->rw; - - iov_iter_init(iter, rw, iorw->iov, iorw->nr_segs, iorw->size); - *iovec = NULL; - return iorw->size; - } - if (req->flags & REQ_F_BUFFER_SELECT) { ret = io_iov_buffer_select(req, *iovec, needs_lock); if (!ret) { @@ -2930,21 +2928,29 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, return ret; } -static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, - struct iovec *iovec, struct iovec *fast_iov, - struct iov_iter *iter) +static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, + const struct iovec *fast_iov, struct iov_iter *iter) { struct io_async_rw *rw = &req->io->rw; - rw->nr_segs = iter->nr_segs; - rw->size = io_size; + memcpy(&rw->iter, iter, sizeof(*iter)); + rw->free_iovec = NULL; + /* can only be fixed buffers, no need to do anything */ + if (iter->type == ITER_BVEC) + return; if (!iovec) { - rw->iov = rw->fast_iov; - if (rw->iov != fast_iov) - memcpy(rw->iov, fast_iov, + unsigned iov_off = 0; + + rw->iter.iov = rw->fast_iov; + if (iter->iov != fast_iov) { + iov_off = iter->iov - fast_iov; + rw->iter.iov += iov_off; + } + if (rw->fast_iov != fast_iov) + memcpy(rw->fast_iov + iov_off, fast_iov + iov_off, sizeof(struct iovec) * iter->nr_segs); } else { - rw->iov = iovec; + rw->free_iovec = iovec; req->flags |= REQ_F_NEED_CLEANUP; } } @@ -2963,8 +2969,8 @@ static int io_alloc_async_ctx(struct io_kiocb *req) return __io_alloc_async_ctx(req); } -static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, - struct iovec *iovec, struct iovec *fast_iov, +static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, + const struct iovec *fast_iov, struct iov_iter *iter) { if (!io_op_defs[req->opcode].async_ctx) @@ -2973,7 +2979,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, if (__io_alloc_async_ctx(req)) return -ENOMEM; - io_req_map_rw(req, io_size, iovec, fast_iov, iter); + io_req_map_rw(req, iovec, fast_iov, iter); } return 0; } @@ -2981,18 +2987,19 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, static inline int io_rw_prep_async(struct io_kiocb *req, int rw, bool force_nonblock) { - struct io_async_ctx *io = req->io; - struct iov_iter iter; + struct io_async_rw *iorw = &req->io->rw; ssize_t ret; - io->rw.iov = io->rw.fast_iov; + iorw->iter.iov = iorw->fast_iov; + /* reset ->io around the iovec import, we don't want to use it */ req->io = NULL; - ret = io_import_iovec(rw, req, &io->rw.iov, &iter, !force_nonblock); - req->io = io; + ret = io_import_iovec(rw, req, (struct iovec **) &iorw->iter.iov, + &iorw->iter, !force_nonblock); + req->io = container_of(iorw, struct io_async_ctx, rw); if (unlikely(ret < 0)) return ret; - io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); + io_req_map_rw(req, iorw->iter.iov, iorw->fast_iov, &iorw->iter); return 0; } @@ -3090,7 +3097,8 @@ static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb, * succeed, or in rare cases where it fails, we then fall back to using the * async worker threads for a blocking retry. 
*/ -static bool io_rw_should_retry(struct io_kiocb *req) +static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec, + struct iovec *fast_iov, struct iov_iter *iter) { struct kiocb *kiocb = &req->rw.kiocb; int ret; @@ -3113,8 +3121,11 @@ static bool io_rw_should_retry(struct io_kiocb *req) * If request type doesn't require req->io to defer in general, * we need to allocate it here */ - if (!req->io && __io_alloc_async_ctx(req)) - return false; + if (!req->io) { + if (__io_alloc_async_ctx(req)) + return false; + io_req_map_rw(req, iovec, fast_iov, iter); + } ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq, io_async_buf_func, req); @@ -3141,12 +3152,14 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; - struct iov_iter iter; + struct iov_iter __iter, *iter = &__iter; size_t iov_count; - ssize_t io_size, ret, ret2; - unsigned long nr_segs; + ssize_t io_size, ret, ret2 = 0; + + if (req->io) + iter = &req->io->rw.iter; - ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock); + ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); if (ret < 0) return ret; io_size = ret; @@ -3160,30 +3173,26 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, if (force_nonblock && !io_file_supports_async(req->file, READ)) goto copy_iov; - iov_count = iov_iter_count(&iter); - nr_segs = iter.nr_segs; + iov_count = iov_iter_count(iter); ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); if (unlikely(ret)) goto out_free; - ret2 = io_iter_do_read(req, &iter); + ret2 = io_iter_do_read(req, iter); /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { kiocb_done(kiocb, ret2, cs); } else { - iter.count = iov_count; - iter.nr_segs = nr_segs; copy_iov: - ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, - &iter); + ret = io_setup_async_rw(req, iovec, inline_vecs, iter); if (ret) goto out_free; /* it's copied and will be cleaned with ->io */ iovec = NULL; /* if we can retry, do so with the callbacks armed */ - if (io_rw_should_retry(req)) { - ret2 = io_iter_do_read(req, &iter); + if (io_rw_should_retry(req, iovec, inline_vecs, iter)) { + ret2 = io_iter_do_read(req, iter); if (ret2 == -EIOCBQUEUED) { goto out_free; } else if (ret2 != -EAGAIN) { @@ -3223,12 +3232,14 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; - struct iov_iter iter; + struct iov_iter __iter, *iter = &__iter; size_t iov_count; ssize_t ret, ret2, io_size; - unsigned long nr_segs; - ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock); + if (req->io) + iter = &req->io->rw.iter; + + ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); if (ret < 0) return ret; io_size = ret; @@ -3247,8 +3258,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, (req->flags & REQ_F_ISREG)) goto copy_iov; - iov_count = iov_iter_count(&iter); - nr_segs = iter.nr_segs; + iov_count = iov_iter_count(iter); ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); if (unlikely(ret)) goto out_free; @@ -3269,9 +3279,9 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, kiocb->ki_flags |= IOCB_WRITE; if (req->file->f_op->write_iter) - ret2 = call_write_iter(req->file, kiocb, &iter); + ret2 = call_write_iter(req->file, kiocb, iter); else if 
(req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); + ret2 = loop_rw_iter(WRITE, req->file, kiocb, iter); else ret2 = -EINVAL; @@ -3284,16 +3294,10 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, cs); } else { - iter.count = iov_count; - iter.nr_segs = nr_segs; copy_iov: - ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, - &iter); - if (ret) - goto out_free; - /* it's copied and will be cleaned with ->io */ - iovec = NULL; - return -EAGAIN; + ret = io_setup_async_rw(req, iovec, inline_vecs, iter); + if (!ret) + return -EAGAIN; } out_free: if (iovec) @@ -5583,8 +5587,8 @@ static void __io_clean_op(struct io_kiocb *req) case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: - if (io->rw.iov != io->rw.fast_iov) - kfree(io->rw.iov); + if (io->rw.free_iovec) + kfree(io->rw.free_iovec); break; case IORING_OP_RECVMSG: case IORING_OP_SENDMSG: -- cgit From 227c0c9673d86732995474d277f84e08ee763e46 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 13 Aug 2020 11:51:40 -0600 Subject: io_uring: internally retry short reads We've had a few application cases of not handling short reads properly, and it is understandable as short reads aren't really expected if the application isn't doing non-blocking IO. Now that we retain the iov_iter over retries, we can implement internal retry pretty trivially. This ensures that we don't return a short read, even for buffered reads on page cache conflicts. Cleanup the deep nesting and hard to read nature of io_read() as well, it's much more straight forward now to read and understand. Added a few comments explaining the logic as well. Signed-off-by: Jens Axboe --- fs/io_uring.c | 109 +++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 70 insertions(+), 39 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 584f86178671..7dd6df15bc49 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -510,6 +510,7 @@ struct io_async_rw { struct iovec fast_iov[UIO_FASTIOV]; const struct iovec *free_iovec; struct iov_iter iter; + size_t bytes_done; struct wait_page_queue wpq; }; @@ -916,7 +917,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, bool needs_lock); static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, - struct iov_iter *iter); + struct iov_iter *iter, bool force); static struct kmem_cache *req_cachep; @@ -2298,7 +2299,7 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error) ret = io_import_iovec(rw, req, &iovec, &iter, false); if (ret < 0) goto end_req; - ret = io_setup_async_rw(req, iovec, inline_vecs, &iter); + ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false); if (!ret) return true; kfree(iovec); @@ -2588,6 +2589,14 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + /* add previously done IO, if any */ + if (req->io && req->io->rw.bytes_done > 0) { + if (ret < 0) + ret = req->io->rw.bytes_done; + else + ret += req->io->rw.bytes_done; + } + if (req->flags & REQ_F_CUR_POS) req->file->f_pos = kiocb->ki_pos; if (ret >= 0 && kiocb->ki_complete == io_complete_rw) @@ -2935,6 +2944,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, memcpy(&rw->iter, iter, sizeof(*iter)); rw->free_iovec = NULL; + rw->bytes_done = 0; /* can only be fixed buffers, no need to do anything */ if (iter->type == ITER_BVEC) return; 
@@ -2971,9 +2981,9 @@ static int io_alloc_async_ctx(struct io_kiocb *req) static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, - struct iov_iter *iter) + struct iov_iter *iter, bool force) { - if (!io_op_defs[req->opcode].async_ctx) + if (!force && !io_op_defs[req->opcode].async_ctx) return 0; if (!req->io) { if (__io_alloc_async_ctx(req)) @@ -3097,8 +3107,7 @@ static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb, * succeed, or in rare cases where it fails, we then fall back to using the * async worker threads for a blocking retry. */ -static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec, - struct iovec *fast_iov, struct iov_iter *iter) +static bool io_rw_should_retry(struct io_kiocb *req) { struct kiocb *kiocb = &req->rw.kiocb; int ret; @@ -3107,8 +3116,8 @@ static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec, if (req->flags & REQ_F_NOWAIT) return false; - /* already tried, or we're doing O_DIRECT */ - if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ)) + /* Only for buffered IO */ + if (kiocb->ki_flags & IOCB_DIRECT) return false; /* * just use poll if we can, and don't attempt if the fs doesn't @@ -3117,16 +3126,6 @@ static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec, if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) return false; - /* - * If request type doesn't require req->io to defer in general, - * we need to allocate it here - */ - if (!req->io) { - if (__io_alloc_async_ctx(req)) - return false; - io_req_map_rw(req, iovec, fast_iov, iter); - } - ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq, io_async_buf_func, req); if (!ret) { @@ -3153,8 +3152,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter __iter, *iter = &__iter; + ssize_t io_size, ret, ret2; size_t iov_count; - ssize_t io_size, ret, ret2 = 0; if (req->io) iter = &req->io->rw.iter; @@ -3164,6 +3163,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, return ret; io_size = ret; req->result = io_size; + ret = 0; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) @@ -3178,31 +3178,62 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, if (unlikely(ret)) goto out_free; - ret2 = io_iter_do_read(req, iter); + ret = io_iter_do_read(req, iter); - /* Catch -EAGAIN return for forced non-blocking submission */ - if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { - kiocb_done(kiocb, ret2, cs); - } else { -copy_iov: - ret = io_setup_async_rw(req, iovec, inline_vecs, iter); + if (!ret) { + goto done; + } else if (ret == -EIOCBQUEUED) { + ret = 0; + goto out_free; + } else if (ret == -EAGAIN) { + ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); if (ret) goto out_free; - /* it's copied and will be cleaned with ->io */ - iovec = NULL; - /* if we can retry, do so with the callbacks armed */ - if (io_rw_should_retry(req, iovec, inline_vecs, iter)) { - ret2 = io_iter_do_read(req, iter); - if (ret2 == -EIOCBQUEUED) { - goto out_free; - } else if (ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, cs); - goto out_free; - } - } + return -EAGAIN; + } else if (ret < 0) { + goto out_free; + } + + /* read it all, or we did blocking attempt. no retry. 
*/ + if (!iov_iter_count(iter) || !force_nonblock) + goto done; + + io_size -= ret; +copy_iov: + ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true); + if (ret2) { + ret = ret2; + goto out_free; + } + /* it's copied and will be cleaned with ->io */ + iovec = NULL; + /* now use our persistent iterator, if we aren't already */ + iter = &req->io->rw.iter; +retry: + req->io->rw.bytes_done += ret; + /* if we can retry, do so with the callbacks armed */ + if (!io_rw_should_retry(req)) { kiocb->ki_flags &= ~IOCB_WAITQ; return -EAGAIN; } + + /* + * Now retry read with the IOCB_WAITQ parts set in the iocb. If we + * get -EIOCBQUEUED, then we'll get a notification when the desired + * page gets unlocked. We can also get a partial read here, and if we + * do, then just retry at the new offset. + */ + ret = io_iter_do_read(req, iter); + if (ret == -EIOCBQUEUED) { + ret = 0; + goto out_free; + } else if (ret > 0 && ret < io_size) { + /* we got some bytes, but not all. retry. */ + goto retry; + } +done: + kiocb_done(kiocb, ret, cs); + ret = 0; out_free: if (iovec) kfree(iovec); @@ -3295,7 +3326,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, kiocb_done(kiocb, ret2, cs); } else { copy_iov: - ret = io_setup_async_rw(req, iovec, inline_vecs, iter); + ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); if (!ret) return -EAGAIN; } -- cgit From d4e7cd36a90e38e0276d6ce0c20f5ccef17ec38c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 15 Aug 2020 11:44:50 -0700 Subject: io_uring: sanitize double poll handling There's a bit of confusion on the matching pairs of poll vs double poll, depending on if the request is a pure poll (IORING_OP_POLL_ADD) or poll driven retry. Add io_poll_get_double() that returns the double poll waitqueue, if any, and io_poll_get_single() that returns the original poll waitqueue. With that, remove the argument to io_poll_remove_double(). Finally ensure that wait->private is cleared once the double poll handler has run, so that remove knows it's already been seen. 
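The wait->private detail can be pictured with a small standalone sketch (illustrative types only, not the kernel's wait_queue_entry): the wakeup side clears the back-pointer once it has consumed the entry, so a later remove sees NULL and knows the entry is already gone.

#include <stdio.h>

struct wait_entry {
    void *private;              /* owner of the wait; NULL once woken */
};

static void wake(struct wait_entry *w)
{
    /* ... deliver the event ... */
    w->private = NULL;          /* removal can now tell this entry is done */
}

static void remove_entry(struct wait_entry *w)
{
    if (!w->private) {
        printf("already woken, nothing to remove\n");
        return;
    }
    /* ... detach from the waitqueue ... */
}

int main(void)
{
    int owner;
    struct wait_entry w = { .private = &owner };

    wake(&w);
    remove_entry(&w);           /* sees private == NULL and backs off */
    return 0;
}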
Cc: stable@vger.kernel.org # v5.8 Reported-by: syzbot+7f617d4a9369028b8a2c@syzkaller.appspotmail.com Fixes: 18bceab101ad ("io_uring: allow POLL_ADD with double poll_wait() users") Signed-off-by: Jens Axboe --- fs/io_uring.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7dd6df15bc49..cb030912bf5e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4649,9 +4649,24 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) return false; } -static void io_poll_remove_double(struct io_kiocb *req, void *data) +static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) { - struct io_poll_iocb *poll = data; + /* pure poll stashes this in ->io, poll driven retry elsewhere */ + if (req->opcode == IORING_OP_POLL_ADD) + return (struct io_poll_iocb *) req->io; + return req->apoll->double_poll; +} + +static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) +{ + if (req->opcode == IORING_OP_POLL_ADD) + return &req->poll; + return &req->apoll->poll; +} + +static void io_poll_remove_double(struct io_kiocb *req) +{ + struct io_poll_iocb *poll = io_poll_get_double(req); lockdep_assert_held(&req->ctx->completion_lock); @@ -4671,7 +4686,7 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) { struct io_ring_ctx *ctx = req->ctx; - io_poll_remove_double(req, req->io); + io_poll_remove_double(req); req->poll.done = true; io_cqring_fill_event(req, error ? error : mangle_poll(mask)); io_commit_cqring(ctx); @@ -4711,7 +4726,7 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct io_kiocb *req = wait->private; - struct io_poll_iocb *poll = req->apoll->double_poll; + struct io_poll_iocb *poll = io_poll_get_single(req); __poll_t mask = key_to_poll(key); /* for instances that support it check for an event match first: */ @@ -4725,6 +4740,8 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, done = list_empty(&poll->wait.entry); if (!done) list_del_init(&poll->wait.entry); + /* make sure double remove sees this as being gone */ + wait->private = NULL; spin_unlock(&poll->head->lock); if (!done) __io_async_wake(req, poll, mask, io_poll_task_func); @@ -4808,7 +4825,7 @@ static void io_async_task_func(struct callback_head *cb) if (hash_hashed(&req->hash_node)) hash_del(&req->hash_node); - io_poll_remove_double(req, apoll->double_poll); + io_poll_remove_double(req); spin_unlock_irq(&ctx->completion_lock); if (!READ_ONCE(apoll->poll.canceled)) @@ -4919,7 +4936,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, io_async_wake); if (ret || ipt.error) { - io_poll_remove_double(req, apoll->double_poll); + io_poll_remove_double(req); spin_unlock_irq(&ctx->completion_lock); kfree(apoll->double_poll); kfree(apoll); @@ -4951,14 +4968,13 @@ static bool io_poll_remove_one(struct io_kiocb *req) { bool do_complete; + io_poll_remove_double(req); + if (req->opcode == IORING_OP_POLL_ADD) { - io_poll_remove_double(req, req->io); do_complete = __io_poll_remove_one(req, &req->poll); } else { struct async_poll *apoll = req->apoll; - io_poll_remove_double(req, apoll->double_poll); - /* non-poll requests have submit ref still */ do_complete = __io_poll_remove_one(req, &apoll->poll); if (do_complete) { -- cgit From f91daf565b0e272a33bd3fcd19eaebd331c5cffd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 15 Aug 2020 15:58:42 -0700 Subject: 
io_uring: short circuit -EAGAIN for blocking read attempt One case was missed in the short IO retry handling, and that's hitting -EAGAIN on a blocking attempt read (eg from io-wq context). This is a problem on sockets that are marked as non-blocking when created, they don't carry any REQ_F_NOWAIT information to help us terminate them instead of perpetually retrying. Fixes: 227c0c9673d8 ("io_uring: internally retry short reads") Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index cb030912bf5e..dc506b75659c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3186,6 +3186,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, ret = 0; goto out_free; } else if (ret == -EAGAIN) { + if (!force_nonblock) + goto done; ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); if (ret) goto out_free; @@ -3195,7 +3197,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, } /* read it all, or we did blocking attempt. no retry. */ - if (!iov_iter_count(iter) || !force_nonblock) + if (!iov_iter_count(iter) || !force_nonblock || + (req->file->f_flags & O_NONBLOCK)) goto done; io_size -= ret; -- cgit
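For applications consuming completions, the practical upshot of these last two changes can be sketched as a liburing fragment (handle_read_cqe is a hypothetical helper meant to slot into an existing completion loop): short reads on buffered regular files are now retried in-kernel, while a read on a file that is itself non-blocking, such as a socket created with SOCK_NONBLOCK, can complete with -EAGAIN, which the application then has to handle.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <liburing.h>

/* decide what to do with the result of a previously submitted read */
static void handle_read_cqe(struct io_uring *ring, struct io_uring_cqe *cqe,
                            size_t expected)
{
    if (cqe->res == -EAGAIN) {
        /* non-blocking file with no data: resubmit, or poll first */
        fprintf(stderr, "no data yet, resubmit the read\n");
    } else if (cqe->res >= 0 && (size_t)cqe->res < expected) {
        /* short read: typically EOF or a stream socket, now that buffered
         * file reads are retried in-kernel */
        fprintf(stderr, "short read: %d of %zu bytes\n",
                cqe->res, expected);
    } else if (cqe->res < 0) {
        fprintf(stderr, "read failed: %s\n", strerror(-cqe->res));
    }
    io_uring_cqe_seen(ring, cqe);
}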