From 06058632464845abb1af91521122fd04dd3daaec Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 13 Apr 2019 09:26:03 -0600 Subject: io_uring: park SQPOLL thread if it's percpu kthread expects this, or we can throw a warning on exit: WARNING: CPU: 0 PID: 7822 at kernel/kthread.c:399 __kthread_bind_mask+0x3b/0xc0 kernel/kthread.c:399 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 7822 Comm: syz-executor030 Not tainted 5.1.0-rc4-next-20190412 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 panic+0x2cb/0x72b kernel/panic.c:214 __warn.cold+0x20/0x46 kernel/panic.c:576 report_bug+0x263/0x2b0 lib/bug.c:186 fixup_bug arch/x86/kernel/traps.c:179 [inline] fixup_bug arch/x86/kernel/traps.c:174 [inline] do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:272 do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:291 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:973 RIP: 0010:__kthread_bind_mask+0x3b/0xc0 kernel/kthread.c:399 Code: 48 89 fb e8 f7 ab 24 00 4c 89 e6 48 89 df e8 ac e1 02 00 31 ff 49 89 c4 48 89 c6 e8 7f ad 24 00 4d 85 e4 75 15 e8 d5 ab 24 00 <0f> 0b e8 ce ab 24 00 5b 41 5c 41 5d 41 5e 5d c3 e8 c0 ab 24 00 4c RSP: 0018:ffff8880a89bfbb8 EFLAGS: 00010293 RAX: ffff88808ca7a280 RBX: ffff8880a98e4380 RCX: ffffffff814bdd11 RDX: 0000000000000000 RSI: ffffffff814bdd1b RDI: 0000000000000007 RBP: ffff8880a89bfbd8 R08: ffff88808ca7a280 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: ffffffff87691148 R14: ffff8880a98e43a0 R15: ffffffff81c91e10 __kthread_bind kernel/kthread.c:412 [inline] kthread_unpark+0x123/0x160 kernel/kthread.c:480 kthread_stop+0xfa/0x6c0 kernel/kthread.c:556 io_sq_thread_stop fs/io_uring.c:2057 [inline] io_sq_thread_stop fs/io_uring.c:2052 [inline] io_finish_async+0xab/0x180 fs/io_uring.c:2064 io_ring_ctx_free fs/io_uring.c:2534 [inline] io_ring_ctx_wait_and_kill+0x133/0x510 fs/io_uring.c:2591 io_uring_release+0x42/0x50 fs/io_uring.c:2599 __fput+0x2e5/0x8d0 fs/file_table.c:278 ____fput+0x16/0x20 fs/file_table.c:309 task_work_run+0x14a/0x1c0 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x90a/0x2fa0 kernel/exit.c:876 do_group_exit+0x135/0x370 kernel/exit.c:980 __do_sys_exit_group kernel/exit.c:991 [inline] __se_sys_exit_group kernel/exit.c:989 [inline] __x64_sys_exit_group+0x44/0x50 kernel/exit.c:989 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Reported-by: syzbot+6d4a92619eb0ad08602b@syzkaller.appspotmail.com Fixes: 6c271ce2f1d5 ("io_uring: add submission polling") Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 89aa8412b5f5..e5008c1b82be 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1920,6 +1920,10 @@ static int io_sq_thread(void *data) unuse_mm(cur_mm); mmput(cur_mm); } + + if (kthread_should_park()) + kthread_parkme(); + return 0; } @@ -2054,6 +2058,7 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx) if (ctx->sqo_thread) { ctx->sqo_stop = 1; mb(); + kthread_park(ctx->sqo_thread); kthread_stop(ctx->sqo_thread); ctx->sqo_thread = NULL; } -- cgit From 917257daa0fea7a007102691c0e27d9216a96768 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 13 Apr 2019 09:28:55 -0600 Subject: io_uring: only test SQPOLL cpu after we've verified it We currently call cpu_possible() even if we don't use the CPU. Move the test under the SQ_AFF branch, which is the only place where we'll use the value. Do the cpu_possible() test AFTER we've limited it to a max of NR_CPUS. This avoids triggering the following warning: WARNING: CPU: 1 PID: 7600 at include/linux/cpumask.h:121 cpu_max_bits_warn if CONFIG_DEBUG_PER_CPU_MAPS is enabled. While in there, also move the SQ thread idle period assignment inside SETUP_SQPOLL, as we don't use it otherwise either. Reported-by: syzbot+cd714a07c6de2bc34293@syzkaller.appspotmail.com Fixes: 6c271ce2f1d5 ("io_uring: add submission polling") Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index e5008c1b82be..24355e0c47f0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2241,10 +2241,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, mmgrab(current->mm); ctx->sqo_mm = current->mm; - ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); - if (!ctx->sq_thread_idle) - ctx->sq_thread_idle = HZ; - ret = -EINVAL; if (!cpu_possible(p->sq_thread_cpu)) goto err; @@ -2254,10 +2250,18 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, if (!capable(CAP_SYS_ADMIN)) goto err; + ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); + if (!ctx->sq_thread_idle) + ctx->sq_thread_idle = HZ; + if (p->flags & IORING_SETUP_SQ_AFF) { int cpu; cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS); + ret = -EINVAL; + if (!cpu_possible(p->sq_thread_cpu)) + goto err; + ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread, ctx, cpu, "io_uring-sq"); -- cgit From 3d6770fbd9353988839611bab107e4e891506aad Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 13 Apr 2019 11:50:54 -0600 Subject: io_uring: drop io_file_put() 'file' argument Since the fget/fput handling was reworked in commit 09bb839434bd, we never call io_file_put() with state == NULL (and hence file != NULL) anymore. Remove that case. Reported-by: Al Viro Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 24355e0c47f0..f4ddb9d23241 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -682,11 +682,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req) list_add_tail(&req->list, &ctx->poll_list); } -static void io_file_put(struct io_submit_state *state, struct file *file) +static void io_file_put(struct io_submit_state *state) { - if (!state) { - fput(file); - } else if (state->file) { + if (state->file) { int diff = state->has_refs - state->used_refs; if (diff) @@ -711,7 +709,7 @@ static struct file *io_file_get(struct io_submit_state *state, int fd) state->ios_left--; return state->file; } - io_file_put(state, NULL); + io_file_put(state); } state->file = fget_many(fd, state->ios_left); if (!state->file) @@ -1671,7 +1669,7 @@ out: static void io_submit_state_end(struct io_submit_state *state) { blk_finish_plug(&state->plug); - io_file_put(state, NULL); + io_file_put(state); if (state->free_reqs) kmem_cache_free_bulk(req_cachep, state->free_reqs, &state->reqs[state->cur_req]); -- cgit From b19062a567266ee1f10f6709325f766bbcc07d1c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 15 Apr 2019 10:49:38 -0600 Subject: io_uring: fix possible deadlock between io_uring_{enter,register} If we have multiple threads, one doing io_uring_enter() while the other is doing io_uring_register(), we can run into a deadlock between the two. io_uring_register() must wait for existing users of the io_uring instance to exit. But it does so while holding the io_uring mutex. Callers of io_uring_enter() may need this mutex to make progress (and eventually exit). If we wait for users to exit in io_uring_register(), we can't do so with the io_uring mutex held without potentially risking a deadlock. Drop the io_uring mutex while waiting for existing callers to exit. This is safe and guaranteed to make forward progress, since we already killed the percpu ref before doing so. Hence later callers of io_uring_enter() will be rejected. Reported-by: syzbot+16dc03452dee970a0c3e@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index f4ddb9d23241..b35300e4c9a7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2929,11 +2929,23 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) + __releases(ctx->uring_lock) + __acquires(ctx->uring_lock) { int ret; percpu_ref_kill(&ctx->refs); + + /* + * Drop uring mutex before waiting for references to exit. If another + * thread is currently inside io_uring_enter() it might need to grab + * the uring_lock to make progress. If we hold it here across the drain + * wait, then we can deadlock. It's safe to drop the mutex here, since + * no new references will come in after we've killed the percpu ref. + */ + mutex_unlock(&ctx->uring_lock); wait_for_completion(&ctx->ctx_done); + mutex_lock(&ctx->uring_lock); switch (opcode) { case IORING_REGISTER_BUFFERS: -- cgit From 74f464e97044da33b25aaed00213914b0edf1f2e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 17 Apr 2019 08:57:48 -0600 Subject: io_uring: fix CQ overflow condition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a leftover from when the rings initially were not free flowing, and hence a test for tail + 1 == head would indicate full. Since we now let them wrap instead of mask them with the size, we need to check if they drift more than the ring size from each other. This fixes a case where we'd overwrite CQ ring entries, if the user failed to reap completions. Both cases would ultimately result in lost completions as the application violated the depth it asked for. The only difference is that before this fix we'd return invalid entries for the overflowed completions, instead of properly flagging it in the cq_ring->overflow variable. Reported-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index b35300e4c9a7..f65f85d89217 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -338,7 +338,7 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) tail = ctx->cached_cq_tail; /* See comment at the top of the file */ smp_rmb(); - if (tail + 1 == READ_ONCE(ring->r.head)) + if (tail - READ_ONCE(ring->r.head) == ring->ring_entries) return NULL; ctx->cached_cq_tail++; -- cgit From 35fa71a030caa50458a043560d4814ea9bcd639f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Apr 2019 10:23:23 -0600 Subject: io_uring: fail io_uring_register(2) on a dying io_uring instance If we have multiple threads doing io_uring_register(2) on an io_uring fd, then we can potentially try and kill the percpu reference while someone else has already killed it. Prevent this race by failing io_uring_register(2) if the ref is marked dying. This is safe since we're inside the io_uring mutex. Fixes: b19062a56726 ("io_uring: fix possible deadlock between io_uring_{enter,register}") Reported-by: syzbot Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index f65f85d89217..a2f39faed6a7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2934,6 +2934,14 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, { int ret; + /* + * We're inside the ring mutex, if the ref is already dying, then + * someone else killed the ctx or is already going through + * io_uring_register(). + */ + if (percpu_ref_is_dying(&ctx->refs)) + return -ENXIO; + percpu_ref_kill(&ctx->refs); /* -- cgit From e523a29c4f2703bdb98f68ce1bb256e259fd8d5f Mon Sep 17 00:00:00 2001 From: Stefan Bühler Date: Fri, 19 Apr 2019 11:57:44 +0200 Subject: io_uring: fix race condition reading SQ entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A read memory barrier is required between reading SQ tail and reading the actual data belonging to the SQ entry. Userspace needs a matching write barrier between writing SQ entries and updating SQ tail (using smp_store_release to update tail will do). Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index a2f39faed6a7..41e3a6f6a096 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1739,7 +1739,8 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) head = ctx->cached_sq_head; /* See comment at the top of this file */ smp_rmb(); - if (head == READ_ONCE(ring->r.tail)) + /* make sure SQ entry isn't read before tail */ + if (head == smp_load_acquire(&ring->r.tail)) return false; head = READ_ONCE(ring->array[head & ctx->sq_mask]); -- cgit From 0d7bae69c574c5f25802f8a71252e7d66933a3ab Mon Sep 17 00:00:00 2001 From: Stefan Bühler Date: Fri, 19 Apr 2019 11:57:45 +0200 Subject: io_uring: fix race condition when sq threads goes sleeping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reading the SQ tail needs to come after setting IORING_SQ_NEED_WAKEUP in flags; there is no cheap barrier for ordering a store before a load, a full memory barrier is required. Userspace needs a full memory barrier between updating SQ tail and checking for the IORING_SQ_NEED_WAKEUP too. Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 41e3a6f6a096..69910fd9ccca 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1865,7 +1865,8 @@ static int io_sq_thread(void *data) /* Tell userspace we may need a wakeup call */ ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP; - smp_wmb(); + /* make sure to read SQ tail after writing flags */ + smp_mb(); if (!io_get_sqring(ctx, &sqes[0])) { if (kthread_should_stop()) { -- cgit From fb775faa9e46ff481e4ced11116c9bd45359cb43 Mon Sep 17 00:00:00 2001 From: Stefan Bühler Date: Fri, 19 Apr 2019 11:57:46 +0200 Subject: io_uring: fix poll full SQ detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit io_uring_poll shouldn't signal EPOLLOUT | EPOLLWRNORM if the queue is full; the old check would always signal EPOLLOUT | EPOLLWRNORM (unless there were U32_MAX - 1 entries in the SQ queue). Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 69910fd9ccca..b998e98acd01 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2576,7 +2576,8 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) poll_wait(file, &ctx->cq_wait, wait); /* See comment at the top of this file */ smp_rmb(); - if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head) + if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head != + ctx->sq_ring->ring_entries) mask |= EPOLLOUT | EPOLLWRNORM; if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail) mask |= EPOLLIN | EPOLLRDNORM; -- cgit From 8358e3a8264a228cf2dfb6f3a05c0328f4118f12 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 23 Apr 2019 08:17:58 -0600 Subject: io_uring: remove 'state' argument from io_{read,write} path Since commit 09bb839434b we don't use the state argument for any sort of on-stack caching in the io read and write path. Remove the stale and unused argument from them, and bubble it up to __io_submit_sqe() and down to io_prep_rw(). Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index b998e98acd01..0e9fb2cb1984 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -740,7 +740,7 @@ static bool io_file_supports_async(struct file *file) } static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock, struct io_submit_state *state) + bool force_nonblock) { const struct io_uring_sqe *sqe = s->sqe; struct io_ring_ctx *ctx = req->ctx; @@ -938,7 +938,7 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) } static int io_read(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock, struct io_submit_state *state) + bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw; @@ -947,7 +947,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, size_t iov_count; int ret; - ret = io_prep_rw(req, s, force_nonblock, state); + ret = io_prep_rw(req, s, force_nonblock); if (ret) return ret; file = kiocb->ki_filp; @@ -985,7 +985,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, } static int io_write(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock, struct io_submit_state *state) + bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw; @@ -994,7 +994,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, size_t iov_count; int ret; - ret = io_prep_rw(req, s, force_nonblock, state); + ret = io_prep_rw(req, s, force_nonblock); if (ret) return ret; @@ -1336,8 +1336,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) } static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct sqe_submit *s, bool force_nonblock, - struct io_submit_state *state) + const struct sqe_submit *s, bool force_nonblock) { int ret, opcode; @@ -1353,18 +1352,18 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_READV: if (unlikely(s->sqe->buf_index)) return -EINVAL; - ret = io_read(req, s, force_nonblock, state); + ret = io_read(req, s, force_nonblock); break; case IORING_OP_WRITEV: if (unlikely(s->sqe->buf_index)) return -EINVAL; - ret = io_write(req, s, force_nonblock, state); + ret = io_write(req, s, force_nonblock); break; case IORING_OP_READ_FIXED: - ret = io_read(req, s, force_nonblock, state); + ret = io_read(req, s, force_nonblock); break; case IORING_OP_WRITE_FIXED: - ret = io_write(req, s, force_nonblock, state); + ret = io_write(req, s, force_nonblock); break; case IORING_OP_FSYNC: ret = io_fsync(req, s->sqe, force_nonblock); @@ -1457,7 +1456,7 @@ restart: s->has_user = cur_mm != NULL; s->needs_lock = true; do { - ret = __io_submit_sqe(ctx, req, s, false, NULL); + ret = __io_submit_sqe(ctx, req, s, false); /* * We can get EAGAIN for polled IO even though * we're forcing a sync submission from here, @@ -1623,7 +1622,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, if (unlikely(ret)) goto out; - ret = __io_submit_sqe(ctx, req, s, true, state); + ret = __io_submit_sqe(ctx, req, s, true); if (ret == -EAGAIN) { struct io_uring_sqe *sqe_copy; -- cgit From 8449eedaa1da6a51d67190c905b1b54243e095f6 Mon Sep 17 00:00:00 2001 From: Stefan Bühler Date: Sat, 27 Apr 2019 20:34:19 +0200 Subject: io_uring: fix handling SQEs requesting NOWAIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Not all request types set REQ_F_FORCE_NONBLOCK when they needed async punting; reverse logic instead and set REQ_F_NOWAIT if request mustn't be punted. Signed-off-by: Stefan Bühler Merged with my previous patch for this. Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0e9fb2cb1984..d5e23a6dd6aa 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -221,7 +221,7 @@ struct io_kiocb { struct list_head list; unsigned int flags; refcount_t refs; -#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */ +#define REQ_F_NOWAIT 1 /* must not punt to workers */ #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ #define REQ_F_FIXED_FILE 4 /* ctx owns file */ #define REQ_F_SEQ_PREV 8 /* sequential with previous */ @@ -774,10 +774,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); if (unlikely(ret)) return ret; - if (force_nonblock) { + + /* don't allow async punt if RWF_NOWAIT was requested */ + if (kiocb->ki_flags & IOCB_NOWAIT) + req->flags |= REQ_F_NOWAIT; + + if (force_nonblock) kiocb->ki_flags |= IOCB_NOWAIT; - req->flags |= REQ_F_FORCE_NONBLOCK; - } + if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !kiocb->ki_filp->f_op->iopoll) @@ -1436,8 +1440,7 @@ restart: struct sqe_submit *s = &req->submit; const struct io_uring_sqe *sqe = s->sqe; - /* Ensure we clear previously set forced non-block flag */ - req->flags &= ~REQ_F_FORCE_NONBLOCK; + /* Ensure we clear previously set non-block flag */ req->rw.ki_flags &= ~IOCB_NOWAIT; ret = 0; @@ -1623,7 +1626,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, goto out; ret = __io_submit_sqe(ctx, req, s, true); - if (ret == -EAGAIN) { + if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { struct io_uring_sqe *sqe_copy; sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); -- cgit From 1e84b97b7377bd0198f87b49ad3e396e84bf0458 Mon Sep 17 00:00:00 2001 From: Stefan Bühler Date: Wed, 24 Apr 2019 23:54:16 +0200 Subject: io_uring: fix notes on barriers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The application reading the CQ ring needs a barrier to pair with the smp_store_release in io_commit_cqring, not the barrier after it. Also a write barrier *after* writing something (but not *before* writing anything interesting) doesn't order anything, so an smp_wmb() after writing SQ tail is not needed. Additionally consider reading SQ head and writing CQ tail in the notes. Also add some clarifications how the various other fields in the ring buffers are used. Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 110 insertions(+), 9 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index d5e23a6dd6aa..7ab93e854eb2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4,15 +4,28 @@ * supporting fast/efficient IO. * * A note on the read/write ordering memory barriers that are matched between - * the application and kernel side. When the application reads the CQ ring - * tail, it must use an appropriate smp_rmb() to order with the smp_wmb() - * the kernel uses after writing the tail. Failure to do so could cause a - * delay in when the application notices that completion events available. - * This isn't a fatal condition. Likewise, the application must use an - * appropriate smp_wmb() both before writing the SQ tail, and after writing - * the SQ tail. The first one orders the sqe writes with the tail write, and - * the latter is paired with the smp_rmb() the kernel will issue before - * reading the SQ tail on submission. + * the application and kernel side. + * + * After the application reads the CQ ring tail, it must use an + * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses + * before writing the tail (using smp_load_acquire to read the tail will + * do). It also needs a smp_mb() before updating CQ head (ordering the + * entry load(s) with the head store), pairing with an implicit barrier + * through a control-dependency in io_get_cqring (smp_store_release to + * store head will do). Failure to do so could lead to reading invalid + * CQ entries. + * + * Likewise, the application must use an appropriate smp_wmb() before + * writing the SQ tail (ordering SQ entry stores with the tail store), + * which pairs with smp_load_acquire in io_get_sqring (smp_store_release + * to store the tail will do). And it needs a barrier ordering the SQ + * head load before writing new SQ entries (smp_load_acquire to read + * head will do). + * + * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application + * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after* + * updating the SQ tail; a full memory barrier smp_mb() is needed + * between. * * Also see the examples in the liburing library: * @@ -70,20 +83,108 @@ struct io_uring { u32 tail ____cacheline_aligned_in_smp; }; +/* + * This data is shared with the application through the mmap at offset + * IORING_OFF_SQ_RING. + * + * The offsets to the member fields are published through struct + * io_sqring_offsets when calling io_uring_setup. + */ struct io_sq_ring { + /* + * Head and tail offsets into the ring; the offsets need to be + * masked to get valid indices. + * + * The kernel controls head and the application controls tail. + */ struct io_uring r; + /* + * Bitmask to apply to head and tail offsets (constant, equals + * ring_entries - 1) + */ u32 ring_mask; + /* Ring size (constant, power of 2) */ u32 ring_entries; + /* + * Number of invalid entries dropped by the kernel due to + * invalid index stored in array + * + * Written by the kernel, shouldn't be modified by the + * application (i.e. get number of "new events" by comparing to + * cached value). + * + * After a new SQ head value was read by the application this + * counter includes all submissions that were dropped reaching + * the new SQ head (and possibly more). + */ u32 dropped; + /* + * Runtime flags + * + * Written by the kernel, shouldn't be modified by the + * application. + * + * The application needs a full memory barrier before checking + * for IORING_SQ_NEED_WAKEUP after updating the sq tail. + */ u32 flags; + /* + * Ring buffer of indices into array of io_uring_sqe, which is + * mmapped by the application using the IORING_OFF_SQES offset. + * + * This indirection could e.g. be used to assign fixed + * io_uring_sqe entries to operations and only submit them to + * the queue when needed. + * + * The kernel modifies neither the indices array nor the entries + * array. + */ u32 array[]; }; +/* + * This data is shared with the application through the mmap at offset + * IORING_OFF_CQ_RING. + * + * The offsets to the member fields are published through struct + * io_cqring_offsets when calling io_uring_setup. + */ struct io_cq_ring { + /* + * Head and tail offsets into the ring; the offsets need to be + * masked to get valid indices. + * + * The application controls head and the kernel tail. + */ struct io_uring r; + /* + * Bitmask to apply to head and tail offsets (constant, equals + * ring_entries - 1) + */ u32 ring_mask; + /* Ring size (constant, power of 2) */ u32 ring_entries; + /* + * Number of completion events lost because the queue was full; + * this should be avoided by the application by making sure + * there are not more requests pending thatn there is space in + * the completion queue. + * + * Written by the kernel, shouldn't be modified by the + * application (i.e. get number of "new events" by comparing to + * cached value). + * + * As completion events come in out of order this counter is not + * ordered with any other data. + */ u32 overflow; + /* + * Ring buffer of completion events. + * + * The kernel writes completion events fresh every time they are + * produced, so the application is allowed to modify pending + * entries. + */ struct io_uring_cqe cqes[]; }; -- cgit From 4f7067c3fb7f2974363a28c597a41949d971af02 Mon Sep 17 00:00:00 2001 From: Stefan Bühler Date: Wed, 24 Apr 2019 23:54:17 +0200 Subject: io_uring: remove unnecessary barrier before wq_has_sleeper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit wq_has_sleeper has a full barrier internally. The smp_rmb barrier in io_uring_poll synchronizes with it. Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7ab93e854eb2..bb71b7f00bb3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -418,12 +418,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) /* order cqe stores with ring update */ smp_store_release(&ring->r.tail, ctx->cached_cq_tail); - /* - * Write sider barrier of tail update, app has read side. See - * comment at the top of this file. - */ - smp_wmb(); - if (wq_has_sleeper(&ctx->cq_wait)) { wake_up_interruptible(&ctx->cq_wait); kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); @@ -2677,7 +2671,10 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) __poll_t mask = 0; poll_wait(file, &ctx->cq_wait, wait); - /* See comment at the top of this file */ + /* + * synchronizes with barrier from wq_has_sleeper call in + * io_commit_cqring + */ smp_rmb(); if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head != ctx->sq_ring->ring_entries) -- cgit From 115e12e58dbc055e98c965e3255aed7b20214f95 Mon Sep 17 00:00:00 2001 From: Stefan Bühler Date: Wed, 24 Apr 2019 23:54:18 +0200 Subject: io_uring: remove unnecessary barrier before reading cq head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The memory operations before reading cq head are unrelated and we don't care about their order. Document that the control dependency in combination with READ_ONCE and WRITE_ONCE forms a barrier we need. Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index bb71b7f00bb3..3671a654a146 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -431,8 +431,11 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) unsigned tail; tail = ctx->cached_cq_tail; - /* See comment at the top of the file */ - smp_rmb(); + /* + * writes to the cq entry need to come after reading head; the + * control dependency is enough as we're using WRITE_ONCE to + * fill the cq entry + */ if (tail - READ_ONCE(ring->r.head) == ring->ring_entries) return NULL; -- cgit From 9e4c15a3939448d2ea9b9bf59561183bbe3fdc49 Mon Sep 17 00:00:00 2001 From: Stefan Bühler Date: Wed, 24 Apr 2019 23:54:19 +0200 Subject: io_uring: remove unnecessary barrier after updating SQ head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no operation afterwards to order with. Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3671a654a146..d3c57ee233fe 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1798,12 +1798,6 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) * write new data to them. */ smp_store_release(&ring->r.head, ctx->cached_sq_head); - - /* - * write side barrier of head update, app has read side. See - * comment at the top of this file - */ - smp_wmb(); } } -- cgit From 82ab082c0e2f8592c2ff6b2ab99a92d8406c8c2c Mon Sep 17 00:00:00 2001 From: Stefan Bühler Date: Wed, 24 Apr 2019 23:54:20 +0200 Subject: io_uring: remove unnecessary barrier before reading SQ tail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no operation before to order with. Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index d3c57ee233fe..662f1c070c8c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1831,8 +1831,6 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) * though the application is the one updating it. */ head = ctx->cached_sq_head; - /* See comment at the top of this file */ - smp_rmb(); /* make sure SQ entry isn't read before tail */ if (head == smp_load_acquire(&ring->r.tail)) return false; -- cgit From b841f19524a16cd93a39f9306191f85c549a2bc2 Mon Sep 17 00:00:00 2001 From: Stefan Bühler Date: Wed, 24 Apr 2019 23:54:21 +0200 Subject: io_uring: remove unnecessary barrier after incrementing dropped counter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit smp_store_release in io_commit_sqring already orders the store to dropped before the update to SQ head. Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 662f1c070c8c..2ebc33cc907b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1846,8 +1846,6 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) /* drop invalid entries */ ctx->cached_sq_head++; ring->dropped++; - /* See comment at the top of this file */ - smp_wmb(); return false; } -- cgit From 62977281a6384d3904c02272a638cc3ac3bac54d Mon Sep 17 00:00:00 2001 From: Stefan Bühler Date: Wed, 24 Apr 2019 23:54:22 +0200 Subject: io_uring: remove unnecessary barrier after unsetting IORING_SQ_NEED_WAKEUP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no operation to order with afterwards, and removing the flag is not critical in any way. There will always be a "race condition" where the application will trigger IORING_ENTER_SQ_WAKEUP when it isn't actually needed. Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2ebc33cc907b..77b247b5d10b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1969,13 +1969,11 @@ static int io_sq_thread(void *data) finish_wait(&ctx->sqo_wait, &wait); ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; - smp_wmb(); continue; } finish_wait(&ctx->sqo_wait, &wait); ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; - smp_wmb(); } i = 0; -- cgit From 5c8b0b54db22c54f2aec991b388f550d3a927f26 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 30 Apr 2019 10:16:07 -0600 Subject: io_uring: have submission side sqe errors post a cqe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we only post a cqe if we get an error OUTSIDE of submission. For submission, we return the error directly through io_uring_enter(). This is a bit awkward for applications, and it makes more sense to always post a cqe with an error, if the error happens on behalf of an sqe. This changes submission behavior a bit. io_uring_enter() returns -ERROR for an error, and > 0 for number of sqes submitted. Before this change, if you wanted to submit 8 entries and had an error on the 5th entry, io_uring_enter() would return 4 (for number of entries successfully submitted) and rewind the sqring. The application would then have to peek at the sqring and figure out what was wrong with the head sqe, and then skip it itself. With this change, we'll return 5 since we did consume 5 sqes, and the last sqe (with the error) will result in a cqe being posted with the error. This makes the logic easier to handle in the application, and it cleans up the submission part. Suggested-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 77b247b5d10b..0a894d7baceb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1801,14 +1801,6 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) } } -/* - * Undo last io_get_sqring() - */ -static void io_drop_sqring(struct io_ring_ctx *ctx) -{ - ctx->cached_sq_head--; -} - /* * Fetch an sqe, if one is available. Note that s->sqe will point to memory * that is mapped by userspace. This means that care needs to be taken to @@ -2018,7 +2010,7 @@ static int io_sq_thread(void *data) static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) { struct io_submit_state state, *statep = NULL; - int i, ret = 0, submit = 0; + int i, submit = 0; if (to_submit > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, ctx, to_submit); @@ -2027,6 +2019,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) for (i = 0; i < to_submit; i++) { struct sqe_submit s; + int ret; if (!io_get_sqring(ctx, &s)) break; @@ -2034,21 +2027,18 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) s.has_user = true; s.needs_lock = false; s.needs_fixed_file = false; + submit++; ret = io_submit_sqe(ctx, &s, statep); - if (ret) { - io_drop_sqring(ctx); - break; - } - - submit++; + if (ret) + io_cqring_add_event(ctx, s.sqe->user_data, ret, 0); } io_commit_sqring(ctx); if (statep) io_submit_state_end(statep); - return submit ? submit : ret; + return submit; } static unsigned io_cqring_events(struct io_cq_ring *ring) @@ -2779,24 +2769,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, mutex_lock(&ctx->uring_lock); submitted = io_ring_submit(ctx, to_submit); mutex_unlock(&ctx->uring_lock); - - if (submitted < 0) - goto out_ctx; } if (flags & IORING_ENTER_GETEVENTS) { unsigned nr_events = 0; min_complete = min(min_complete, ctx->cq_entries); - /* - * The application could have included the 'to_submit' count - * in how many events it wanted to wait for. If we failed to - * submit the desired count, we may need to adjust the number - * of events to poll/wait for. - */ - if (submitted < to_submit) - min_complete = min_t(unsigned, submitted, min_complete); - if (ctx->flags & IORING_SETUP_IOPOLL) { mutex_lock(&ctx->uring_lock); ret = io_iopoll_check(ctx, &nr_events, min_complete); -- cgit From 975554b03eddc1df73bda3a764a09e18cadd5f1c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 30 Apr 2019 13:34:51 +0100 Subject: io_uring: fix SQPOLL cpu validation In io_sq_offload_start(), we call cpu_possible() on an unbounded cpu value from userspace. On v5.1-rc7 on arm64 with CONFIG_DEBUG_PER_CPU_MAPS, this results in a splat: WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 cpu_max_bits_warn include/linux/cpumask.h:121 [inline] There was an attempt to fix this in commit: 917257daa0fea7a0 ("io_uring: only test SQPOLL cpu after we've verified it") ... by adding a check after the cpu value had been limited to NR_CPU_IDS using array_index_nospec(). However, this left an unbound check at the start of the function, for which the warning still fires. Let's fix this correctly by checking that the cpu value is bound by nr_cpu_ids before passing it to cpu_possible(). Note that only nr_cpu_ids of a cpumask are guaranteed to exist at runtime, and nr_cpu_ids can be significantly smaller than NR_CPUs. For example, an arm64 defconfig has NR_CPUS=256, while my test VM has 4 vCPUs. Following the intent from the commit message for 917257daa0fea7a0, the check is moved under the SQ_AFF branch, which is the only branch where the cpu values is consumed. The check is performed before bounding the value with array_index_nospec() so that we don't silently accept bogus cpu values from userspace, where array_index_nospec() would force these values to 0. I suspect we can remove the array_index_nospec() call entirely, but I've conservatively left that in place, updated to use nr_cpu_ids to match the prior check. Tested on arm64 with the Syzkaller reproducer: https://syzkaller.appspot.com/bug?extid=cd714a07c6de2bc34293 https://syzkaller.appspot.com/x/repro.syz?x=15d8b397200000 Full splat from before this patch: WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 cpu_max_bits_warn include/linux/cpumask.h:121 [inline] WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 cpumask_check include/linux/cpumask.h:128 [inline] WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 cpumask_test_cpu include/linux/cpumask.h:344 [inline] WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 io_sq_offload_start fs/io_uring.c:2244 [inline] WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 io_uring_create fs/io_uring.c:2864 [inline] WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 io_uring_setup+0x1108/0x15a0 fs/io_uring.c:2916 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 27601 Comm: syz-executor.0 Not tainted 5.1.0-rc7 #3 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x2f0 include/linux/compiler.h:193 show_stack+0x20/0x30 arch/arm64/kernel/traps.c:158 __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x110/0x190 lib/dump_stack.c:113 panic+0x384/0x68c kernel/panic.c:214 __warn+0x2bc/0x2c0 kernel/panic.c:571 report_bug+0x228/0x2d8 lib/bug.c:186 bug_handler+0xa0/0x1a0 arch/arm64/kernel/traps.c:956 call_break_hook arch/arm64/kernel/debug-monitors.c:301 [inline] brk_handler+0x1d4/0x388 arch/arm64/kernel/debug-monitors.c:316 do_debug_exception+0x1a0/0x468 arch/arm64/mm/fault.c:831 el1_dbg+0x18/0x8c cpu_max_bits_warn include/linux/cpumask.h:121 [inline] cpumask_check include/linux/cpumask.h:128 [inline] cpumask_test_cpu include/linux/cpumask.h:344 [inline] io_sq_offload_start fs/io_uring.c:2244 [inline] io_uring_create fs/io_uring.c:2864 [inline] io_uring_setup+0x1108/0x15a0 fs/io_uring.c:2916 __do_sys_io_uring_setup fs/io_uring.c:2929 [inline] __se_sys_io_uring_setup fs/io_uring.c:2926 [inline] __arm64_sys_io_uring_setup+0x50/0x70 fs/io_uring.c:2926 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall arch/arm64/kernel/syscall.c:47 [inline] el0_svc_common.constprop.0+0x148/0x2e0 arch/arm64/kernel/syscall.c:83 el0_svc_handler+0xdc/0x100 arch/arm64/kernel/syscall.c:129 el0_svc+0x8/0xc arch/arm64/kernel/entry.S:948 SMP: stopping secondary CPUs Dumping ftrace buffer: (ftrace buffer empty) Kernel Offset: disabled CPU features: 0x002,23000438 Memory Limit: none Rebooting in 1 seconds.. Fixes: 917257daa0fea7a0 ("io_uring: only test SQPOLL cpu after we've verified it") Signed-off-by: Mark Rutland Cc: Jens Axboe Cc: Alexander Viro Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Simplied the logic Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0a894d7baceb..5954047ee96d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2319,10 +2319,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, mmgrab(current->mm); ctx->sqo_mm = current->mm; - ret = -EINVAL; - if (!cpu_possible(p->sq_thread_cpu)) - goto err; - if (ctx->flags & IORING_SETUP_SQPOLL) { ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) @@ -2333,11 +2329,11 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, ctx->sq_thread_idle = HZ; if (p->flags & IORING_SETUP_SQ_AFF) { - int cpu; + int cpu = array_index_nospec(p->sq_thread_cpu, + nr_cpu_ids); - cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS); ret = -EINVAL; - if (!cpu_possible(p->sq_thread_cpu)) + if (!cpu_possible(cpu)) goto err; ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread, -- cgit From 52e04ef4c9d459cba3afd86ec335a411b40b7fd2 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 30 Apr 2019 17:30:21 +0100 Subject: io_uring: free allocated io_memory once If io_allocate_scq_urings() fails to allocate an sq_* region, it will call io_mem_free() for any previously allocated regions, but leave dangling pointers to these regions in the ctx. Any regions which have not yet been allocated are left NULL. Note that when returning -EOVERFLOW, the previously allocated sq_ring is not freed, which appears to be an unintentional leak. When io_allocate_scq_urings() fails, io_uring_create() will call io_ring_ctx_wait_and_kill(), which calls io_mem_free() on all the sq_* regions, assuming the pointers are valid and not NULL. This can result in pages being freed multiple times, which has been observed to corrupt the page state, leading to subsequent fun. This can also result in virt_to_page() on NULL, resulting in the use of bogus page addresses, and yet more subsequent fun. The latter can be detected with CONFIG_DEBUG_VIRTUAL on arm64. Adding a cleanup path to io_allocate_scq_urings() complicates the logic, so let's leave it to io_ring_ctx_free() to consistently free these pointers, and simplify the io_allocate_scq_urings() error paths. Full splats from before this patch below. Note that the pointer logged by the DEBUG_VIRTUAL "non-linear address" warning has been hashed, and is actually NULL. [ 26.098129] page:ffff80000e949a00 count:0 mapcount:-128 mapping:0000000000000000 index:0x0 [ 26.102976] flags: 0x63fffc000000() [ 26.104373] raw: 000063fffc000000 ffff80000e86c188 ffff80000ea3df08 0000000000000000 [ 26.108917] raw: 0000000000000000 0000000000000001 00000000ffffff7f 0000000000000000 [ 26.137235] page dumped because: VM_BUG_ON_PAGE(page_ref_count(page) == 0) [ 26.143960] ------------[ cut here ]------------ [ 26.146020] kernel BUG at include/linux/mm.h:547! [ 26.147586] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP [ 26.149163] Modules linked in: [ 26.150287] Process syz-executor.21 (pid: 20204, stack limit = 0x000000000e9cefeb) [ 26.153307] CPU: 2 PID: 20204 Comm: syz-executor.21 Not tainted 5.1.0-rc7-00004-g7d30b2ea43d6 #18 [ 26.156566] Hardware name: linux,dummy-virt (DT) [ 26.158089] pstate: 40400005 (nZcv daif +PAN -UAO) [ 26.159869] pc : io_mem_free+0x9c/0xa8 [ 26.161436] lr : io_mem_free+0x9c/0xa8 [ 26.162720] sp : ffff000013003d60 [ 26.164048] x29: ffff000013003d60 x28: ffff800025048040 [ 26.165804] x27: 0000000000000000 x26: ffff800025048040 [ 26.167352] x25: 00000000000000c0 x24: ffff0000112c2820 [ 26.169682] x23: 0000000000000000 x22: 0000000020000080 [ 26.171899] x21: ffff80002143b418 x20: ffff80002143b400 [ 26.174236] x19: ffff80002143b280 x18: 0000000000000000 [ 26.176607] x17: 0000000000000000 x16: 0000000000000000 [ 26.178997] x15: 0000000000000000 x14: 0000000000000000 [ 26.181508] x13: 00009178a5e077b2 x12: 0000000000000001 [ 26.183863] x11: 0000000000000000 x10: 0000000000000980 [ 26.186437] x9 : ffff000013003a80 x8 : ffff800025048a20 [ 26.189006] x7 : ffff8000250481c0 x6 : ffff80002ffe9118 [ 26.191359] x5 : ffff80002ffe9118 x4 : 0000000000000000 [ 26.193863] x3 : ffff80002ffefe98 x2 : 44c06ddd107d1f00 [ 26.196642] x1 : 0000000000000000 x0 : 000000000000003e [ 26.198892] Call trace: [ 26.199893] io_mem_free+0x9c/0xa8 [ 26.201155] io_ring_ctx_wait_and_kill+0xec/0x180 [ 26.202688] io_uring_setup+0x6c4/0x6f0 [ 26.204091] __arm64_sys_io_uring_setup+0x18/0x20 [ 26.205576] el0_svc_common.constprop.0+0x7c/0xe8 [ 26.207186] el0_svc_handler+0x28/0x78 [ 26.208389] el0_svc+0x8/0xc [ 26.209408] Code: aa0203e0 d0006861 9133a021 97fcdc3c (d4210000) [ 26.211995] ---[ end trace bdb81cd43a21e50d ]--- [ 81.770626] ------------[ cut here ]------------ [ 81.825015] virt_to_phys used for non-linear address: 000000000d42f2c7 ( (null)) [ 81.827860] WARNING: CPU: 1 PID: 30171 at arch/arm64/mm/physaddr.c:15 __virt_to_phys+0x48/0x68 [ 81.831202] Modules linked in: [ 81.832212] CPU: 1 PID: 30171 Comm: syz-executor.20 Not tainted 5.1.0-rc7-00004-g7d30b2ea43d6 #19 [ 81.835616] Hardware name: linux,dummy-virt (DT) [ 81.836863] pstate: 60400005 (nZCv daif +PAN -UAO) [ 81.838727] pc : __virt_to_phys+0x48/0x68 [ 81.840572] lr : __virt_to_phys+0x48/0x68 [ 81.842264] sp : ffff80002cf67c70 [ 81.843858] x29: ffff80002cf67c70 x28: ffff800014358e18 [ 81.846463] x27: 0000000000000000 x26: 0000000020000080 [ 81.849148] x25: 0000000000000000 x24: ffff80001bb01f40 [ 81.851986] x23: ffff200011db06c8 x22: ffff2000127e3c60 [ 81.854351] x21: ffff800014358cc0 x20: ffff800014358d98 [ 81.856711] x19: 0000000000000000 x18: 0000000000000000 [ 81.859132] x17: 0000000000000000 x16: 0000000000000000 [ 81.861586] x15: 0000000000000000 x14: 0000000000000000 [ 81.863905] x13: 0000000000000000 x12: ffff1000037603e9 [ 81.866226] x11: 1ffff000037603e8 x10: 0000000000000980 [ 81.868776] x9 : ffff80002cf67840 x8 : ffff80001bb02920 [ 81.873272] x7 : ffff1000037603e9 x6 : ffff80001bb01f47 [ 81.875266] x5 : ffff1000037603e9 x4 : dfff200000000000 [ 81.876875] x3 : ffff200010087528 x2 : ffff1000059ecf58 [ 81.878751] x1 : 44c06ddd107d1f00 x0 : 0000000000000000 [ 81.880453] Call trace: [ 81.881164] __virt_to_phys+0x48/0x68 [ 81.882919] io_mem_free+0x18/0x110 [ 81.886585] io_ring_ctx_wait_and_kill+0x13c/0x1f0 [ 81.891212] io_uring_setup+0xa60/0xad0 [ 81.892881] __arm64_sys_io_uring_setup+0x2c/0x38 [ 81.894398] el0_svc_common.constprop.0+0xac/0x150 [ 81.896306] el0_svc_handler+0x34/0x88 [ 81.897744] el0_svc+0x8/0xc [ 81.898715] ---[ end trace b4a703802243cbba ]--- Fixes: 2b188cc1bb857a9d ("Add io_uring IO interface") Signed-off-by: Mark Rutland Cc: Jens Axboe Cc: Alexander Viro Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5954047ee96d..046fc4e1e155 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2396,8 +2396,12 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages) static void io_mem_free(void *ptr) { - struct page *page = virt_to_head_page(ptr); + struct page *page; + + if (!ptr) + return; + page = virt_to_head_page(ptr); if (put_page_testzero(page)) free_compound_page(page); } @@ -2816,17 +2820,12 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; ctx->sq_sqes = io_mem_alloc(size); - if (!ctx->sq_sqes) { - io_mem_free(ctx->sq_ring); + if (!ctx->sq_sqes) return -ENOMEM; - } cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries)); - if (!cq_ring) { - io_mem_free(ctx->sq_ring); - io_mem_free(ctx->sq_sqes); + if (!cq_ring) return -ENOMEM; - } ctx->cq_ring = cq_ring; cq_ring->ring_mask = p->cq_entries - 1; -- cgit From 817869d2519f0cb7be5b3482129dadc806dfb747 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 30 Apr 2019 14:44:05 -0600 Subject: io_uring: drop req submit reference always in async punt If we don't end up actually calling submit in io_sq_wq_submit_work(), we still need to drop the submit reference to the request. If we don't, then we can leak the request. This can happen if we race with ring shutdown while flushing the workqueue for requests that require use of the mm_struct. Fixes: e65ef56db494 ("io_uring: use regular request ref counts") Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 046fc4e1e155..18cecb6a0151 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1568,10 +1568,11 @@ restart: break; cond_resched(); } while (1); - - /* drop submission reference */ - io_put_req(req); } + + /* drop submission reference */ + io_put_req(req); + if (ret) { io_cqring_add_event(ctx, sqe->user_data, ret, 0); io_put_req(req); -- cgit From d4ef647510b1200fe1c996ff1cbf5ac47eb930cc Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 1 May 2019 16:59:16 +0100 Subject: io_uring: avoid page allocation warnings In io_sqe_buffer_register() we allocate a number of arrays based on the iov_len from the user-provided iov. While we limit iov_len to SZ_1G, we can still attempt to allocate arrays exceeding MAX_ORDER. On a 64-bit system with 4KiB pages, for an iov where iov_base = 0x10 and iov_len = SZ_1G, we'll calculate that nr_pages = 262145. When we try to allocate a corresponding array of (16-byte) bio_vecs, requiring 4194320 bytes, which is greater than 4MiB. This results in SLUB warning that we're trying to allocate greater than MAX_ORDER, and failing the allocation. Avoid this by using kvmalloc() for allocations dependent on the user-provided iov_len. At the same time, fix a leak of imu->bvec when registration fails. Full splat from before this patch: WARNING: CPU: 1 PID: 2314 at mm/page_alloc.c:4595 __alloc_pages_nodemask+0x7ac/0x2938 mm/page_alloc.c:4595 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 2314 Comm: syz-executor326 Not tainted 5.1.0-rc7-dirty #4 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x2f0 include/linux/compiler.h:193 show_stack+0x20/0x30 arch/arm64/kernel/traps.c:158 __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x110/0x190 lib/dump_stack.c:113 panic+0x384/0x68c kernel/panic.c:214 __warn+0x2bc/0x2c0 kernel/panic.c:571 report_bug+0x228/0x2d8 lib/bug.c:186 bug_handler+0xa0/0x1a0 arch/arm64/kernel/traps.c:956 call_break_hook arch/arm64/kernel/debug-monitors.c:301 [inline] brk_handler+0x1d4/0x388 arch/arm64/kernel/debug-monitors.c:316 do_debug_exception+0x1a0/0x468 arch/arm64/mm/fault.c:831 el1_dbg+0x18/0x8c __alloc_pages_nodemask+0x7ac/0x2938 mm/page_alloc.c:4595 alloc_pages_current+0x164/0x278 mm/mempolicy.c:2132 alloc_pages include/linux/gfp.h:509 [inline] kmalloc_order+0x20/0x50 mm/slab_common.c:1231 kmalloc_order_trace+0x30/0x2b0 mm/slab_common.c:1243 kmalloc_large include/linux/slab.h:480 [inline] __kmalloc+0x3dc/0x4f0 mm/slub.c:3791 kmalloc_array include/linux/slab.h:670 [inline] io_sqe_buffer_register fs/io_uring.c:2472 [inline] __io_uring_register fs/io_uring.c:2962 [inline] __do_sys_io_uring_register fs/io_uring.c:3008 [inline] __se_sys_io_uring_register fs/io_uring.c:2990 [inline] __arm64_sys_io_uring_register+0x9e0/0x1bc8 fs/io_uring.c:2990 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall arch/arm64/kernel/syscall.c:47 [inline] el0_svc_common.constprop.0+0x148/0x2e0 arch/arm64/kernel/syscall.c:83 el0_svc_handler+0xdc/0x100 arch/arm64/kernel/syscall.c:129 el0_svc+0x8/0xc arch/arm64/kernel/entry.S:948 SMP: stopping secondary CPUs Dumping ftrace buffer: (ftrace buffer empty) Kernel Offset: disabled CPU features: 0x002,23000438 Memory Limit: none Rebooting in 1 seconds.. Fixes: edafccee56ff3167 ("io_uring: add support for pre-mapped user IO buffers") Signed-off-by: Mark Rutland Cc: Alexander Viro Cc: Jens Axboe Cc: linux-fsdevel@vger.kernel.org Cc: linux-block@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 18cecb6a0151..84efb8956734 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2443,7 +2443,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) if (ctx->account_mem) io_unaccount_mem(ctx->user, imu->nr_bvecs); - kfree(imu->bvec); + kvfree(imu->bvec); imu->nr_bvecs = 0; } @@ -2535,9 +2535,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, if (!pages || nr_pages > got_pages) { kfree(vmas); kfree(pages); - pages = kmalloc_array(nr_pages, sizeof(struct page *), + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); - vmas = kmalloc_array(nr_pages, + vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), GFP_KERNEL); if (!pages || !vmas) { @@ -2549,7 +2549,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, got_pages = nr_pages; } - imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec), + imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec), GFP_KERNEL); ret = -ENOMEM; if (!imu->bvec) { @@ -2588,6 +2588,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, } if (ctx->account_mem) io_unaccount_mem(ctx->user, nr_pages); + kvfree(imu->bvec); goto err; } @@ -2610,12 +2611,12 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, ctx->nr_user_bufs++; } - kfree(pages); - kfree(vmas); + kvfree(pages); + kvfree(vmas); return 0; err: - kfree(pages); - kfree(vmas); + kvfree(pages); + kvfree(vmas); io_sqe_buffer_unregister(ctx); return ret; } -- cgit From de0617e467171ba44c73efd1ba63f101b164a035 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 6 Apr 2019 21:51:27 -0600 Subject: io_uring: add support for marking commands as draining There are no ordering constraints between the submission and completion side of io_uring. But sometimes that would be useful to have. One common example is doing an fsync, for instance, and have it ordered with previous writes. Without support for that, the application must do this tracking itself. This adds a general SQE flag, IOSQE_IO_DRAIN. If a command is marked with this flag, then it will not be issued before previous commands have completed, and subsequent commands submitted after the drain will not be issued before the drain is started.. If there are no pending commands, setting this flag will not change the behavior of the issue of the command. Signed-off-by: Jens Axboe --- fs/io_uring.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 3 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 84efb8956734..b3333fec349a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -222,6 +222,8 @@ struct io_ring_ctx { unsigned sq_mask; unsigned sq_thread_idle; struct io_uring_sqe *sq_sqes; + + struct list_head defer_list; } ____cacheline_aligned_in_smp; /* IO offload */ @@ -327,8 +329,11 @@ struct io_kiocb { #define REQ_F_FIXED_FILE 4 /* ctx owns file */ #define REQ_F_SEQ_PREV 8 /* sequential with previous */ #define REQ_F_PREPPED 16 /* prep already done */ +#define REQ_F_IO_DRAIN 32 /* drain existing IO first */ +#define REQ_F_IO_DRAINED 64 /* drain done */ u64 user_data; - u64 error; + u32 error; + u32 sequence; struct work_struct work; }; @@ -356,6 +361,8 @@ struct io_submit_state { unsigned int ios_left; }; +static void io_sq_wq_submit_work(struct work_struct *work); + static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; @@ -407,10 +414,36 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) spin_lock_init(&ctx->completion_lock); INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->cancel_list); + INIT_LIST_HEAD(&ctx->defer_list); return ctx; } -static void io_commit_cqring(struct io_ring_ctx *ctx) +static inline bool io_sequence_defer(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) + return false; + + return req->sequence > ctx->cached_cq_tail + ctx->sq_ring->dropped; +} + +static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req; + + if (list_empty(&ctx->defer_list)) + return NULL; + + req = list_first_entry(&ctx->defer_list, struct io_kiocb, list); + if (!io_sequence_defer(ctx, req)) { + list_del_init(&req->list); + return req; + } + + return NULL; +} + +static void __io_commit_cqring(struct io_ring_ctx *ctx) { struct io_cq_ring *ring = ctx->cq_ring; @@ -425,6 +458,18 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) } } +static void io_commit_cqring(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req; + + __io_commit_cqring(ctx); + + while ((req = io_get_deferred_req(ctx)) != NULL) { + req->flags |= REQ_F_IO_DRAINED; + queue_work(ctx->sqo_wq, &req->work); + } +} + static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) { struct io_cq_ring *ring = ctx->cq_ring; @@ -1437,6 +1482,34 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ipt.error; } +static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_uring_sqe *sqe_copy; + + if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) + return 0; + + sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); + if (!sqe_copy) + return -EAGAIN; + + spin_lock_irq(&ctx->completion_lock); + if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) { + spin_unlock_irq(&ctx->completion_lock); + kfree(sqe_copy); + return 0; + } + + memcpy(sqe_copy, sqe, sizeof(*sqe_copy)); + req->submit.sqe = sqe_copy; + + INIT_WORK(&req->work, io_sq_wq_submit_work); + list_add_tail(&req->list, &ctx->defer_list); + spin_unlock_irq(&ctx->completion_lock); + return -EIOCBQUEUED; +} + static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct sqe_submit *s, bool force_nonblock) { @@ -1684,6 +1757,11 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, flags = READ_ONCE(s->sqe->flags); fd = READ_ONCE(s->sqe->fd); + if (flags & IOSQE_IO_DRAIN) { + req->flags |= REQ_F_IO_DRAIN; + req->sequence = ctx->cached_sq_head - 1; + } + if (!io_op_needs_file(s->sqe)) { req->file = NULL; return 0; @@ -1713,7 +1791,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, int ret; /* enforce forwards compatibility on users */ - if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE)) + if (unlikely(s->sqe->flags & ~(IOSQE_FIXED_FILE | IOSQE_IO_DRAIN))) return -EINVAL; req = io_get_req(ctx, state); @@ -1724,6 +1802,13 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, if (unlikely(ret)) goto out; + ret = io_req_defer(ctx, req, s->sqe); + if (ret) { + if (ret == -EIOCBQUEUED) + ret = 0; + return ret; + } + ret = __io_submit_sqe(ctx, req, s, true); if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { struct io_uring_sqe *sqe_copy; -- cgit From 5d17b4a4b7fa172b205be8a05051ae705d1dc3bb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Apr 2019 14:56:44 -0600 Subject: io_uring: add support for IORING_OP_SYNC_FILE_RANGE This behaves just like sync_file_range(2) does. Signed-off-by: Jens Axboe --- fs/io_uring.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index b3333fec349a..468f9da472b2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1267,6 +1267,54 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } +static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret = 0; + + if (!req->file) + return -EBADF; + /* Prep already done (EAGAIN retry) */ + if (req->flags & REQ_F_PREPPED) + return 0; + + if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) + return -EINVAL; + + req->flags |= REQ_F_PREPPED; + return ret; +} + +static int io_sync_file_range(struct io_kiocb *req, + const struct io_uring_sqe *sqe, + bool force_nonblock) +{ + loff_t sqe_off; + loff_t sqe_len; + unsigned flags; + int ret; + + ret = io_prep_sfr(req, sqe); + if (ret) + return ret; + + /* sync_file_range always requires a blocking context */ + if (force_nonblock) + return -EAGAIN; + + sqe_off = READ_ONCE(sqe->off); + sqe_len = READ_ONCE(sqe->len); + flags = READ_ONCE(sqe->sync_range_flags); + + ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags); + + io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); + io_put_req(req); + return 0; +} + static void io_poll_remove_one(struct io_kiocb *req) { struct io_poll_iocb *poll = &req->poll; @@ -1549,6 +1597,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_POLL_REMOVE: ret = io_poll_remove(req, s->sqe); break; + case IORING_OP_SYNC_FILE_RANGE: + ret = io_sync_file_range(req, s->sqe, force_nonblock); + break; default: ret = -EINVAL; break; -- cgit From 9b402849e80c85eee10bbd341aab3f1a0f942d4f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Apr 2019 11:45:41 -0600 Subject: io_uring: add support for eventfd notifications Allow registration of an eventfd, which will trigger an event every time a completion event happens for this io_uring instance. Signed-off-by: Jens Axboe --- fs/io_uring.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 468f9da472b2..2a46de56d05c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -241,6 +241,7 @@ struct io_ring_ctx { unsigned cq_mask; struct wait_queue_head cq_wait; struct fasync_struct *cq_fasync; + struct eventfd_ctx *cq_ev_fd; } ____cacheline_aligned_in_smp; /* @@ -516,6 +517,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) wake_up(&ctx->wait); if (waitqueue_active(&ctx->sqo_wait)) wake_up(&ctx->sqo_wait); + if (ctx->cq_ev_fd) + eventfd_signal(ctx->cq_ev_fd, 1); } static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data, @@ -2757,6 +2760,38 @@ err: return ret; } +static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg) +{ + __s32 __user *fds = arg; + int fd; + + if (ctx->cq_ev_fd) + return -EBUSY; + + if (copy_from_user(&fd, fds, sizeof(*fds))) + return -EFAULT; + + ctx->cq_ev_fd = eventfd_ctx_fdget(fd); + if (IS_ERR(ctx->cq_ev_fd)) { + int ret = PTR_ERR(ctx->cq_ev_fd); + ctx->cq_ev_fd = NULL; + return ret; + } + + return 0; +} + +static int io_eventfd_unregister(struct io_ring_ctx *ctx) +{ + if (ctx->cq_ev_fd) { + eventfd_ctx_put(ctx->cq_ev_fd); + ctx->cq_ev_fd = NULL; + return 0; + } + + return -ENXIO; +} + static void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_finish_async(ctx); @@ -2766,6 +2801,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_iopoll_reap_events(ctx); io_sqe_buffer_unregister(ctx); io_sqe_files_unregister(ctx); + io_eventfd_unregister(ctx); #if defined(CONFIG_UNIX) if (ctx->ring_sock) @@ -3179,6 +3215,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_sqe_files_unregister(ctx); break; + case IORING_REGISTER_EVENTFD: + ret = -EINVAL; + if (nr_args != 1) + break; + ret = io_eventfd_register(ctx, arg); + break; + case IORING_UNREGISTER_EVENTFD: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_eventfd_unregister(ctx); + break; default: ret = -EINVAL; break; -- cgit From 5dcf877fb13f3c6a8ba0777ef766c4af32df725d Mon Sep 17 00:00:00 2001 From: Stefan Bühler Date: Wed, 1 May 2019 13:53:36 +0200 Subject: req->error only used for iopoll MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No need to set it in io_poll_add; io_poll_complete doesn't use it to set the result in the CQE. Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2a46de56d05c..d91cbd53d3ca 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -333,7 +333,7 @@ struct io_kiocb { #define REQ_F_IO_DRAIN 32 /* drain existing IO first */ #define REQ_F_IO_DRAINED 64 /* drain done */ u64 user_data; - u32 error; + u32 error; /* iopoll result from callback */ u32 sequence; struct work_struct work; @@ -1520,7 +1520,6 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) spin_unlock(&poll->head->lock); } if (mask) { /* no async, we'd stolen it */ - req->error = mangle_poll(mask); ipt.error = 0; io_poll_complete(ctx, req, mask); } -- cgit From efeb862bd5bc001636e690debf6f9fbba98e5bfd Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 5 May 2019 23:01:22 +0100 Subject: io_uring: fix shadowed variable ret return code being not checked Currently variable ret is declared in a while-loop code block that shadows another variable ret. When an error occurs in the while-loop the error return in ret is not being set in the outer code block and so the error check on ret is always going to be checking on the wrong ret variable resulting in check that is always going to be true and a premature return occurs. Fix this by removing the declaration of the inner while-loop variable ret so that shadowing does not occur. Addresses-Coverity: ("'Constant' variable guards dead code") Fixes: 6b06314c47e1 ("io_uring: add file set registration") Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index d91cbd53d3ca..1157a068c253 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2363,7 +2363,6 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx) left = ctx->nr_user_files; while (left) { unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); - int ret; ret = __io_sqe_files_scm(ctx, this_files, total); if (ret) -- cgit From 7889f44dd9cee15aff1c3f7daf81ca4dfed48fc7 Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Tue, 7 May 2019 16:03:19 +0800 Subject: io_uring: use cpu_online() to check p->sq_thread_cpu instead of cpu_possible() This issue is found by running liburing/test/io_uring_setup test. When test run, the testcase "attempt to bind to invalid cpu" would not pass with messages like: io_uring_setup(1, 0xbfc2f7c8), \ flags: IORING_SETUP_SQPOLL|IORING_SETUP_SQ_AFF, \ resv: 0x00000000 0x00000000 0x00000000 0x00000000 0x00000000, \ sq_thread_cpu: 2 expected -1, got 3 FAIL On my system, there is: CPU(s) possible : 0-3 CPU(s) online : 0-1 CPU(s) offline : 2-3 CPU(s) present : 0-1 The sq_thread_cpu 2 is offline on my system, so the bind should fail. But cpu_possible() will pass the check. We shouldn't be able to bind to an offline cpu. Use cpu_online() to do the check. After the change, the testcase run as expected: EINVAL will be returned for cpu offlined. Reviewed-by: Jeff Moyer Signed-off-by: Shenghui Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1157a068c253..48ea3977012a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2471,7 +2471,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, nr_cpu_ids); ret = -EINVAL; - if (!cpu_possible(cpu)) + if (!cpu_online(cpu)) goto err; ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread, -- cgit From e2033e33cb3821c26d4f9e70677910827d3b7885 Mon Sep 17 00:00:00 2001 From: Stefan Bühler Date: Sat, 11 May 2019 19:08:01 +0200 Subject: io_uring: fix race condition reading SQE data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When punting to workers the SQE gets copied after the initial try. There is a race condition between reading SQE data for the initial try and copying it for punting it to the workers. For example io_rw_done calls kiocb->ki_complete even if it was prepared for IORING_OP_FSYNC (and would be NULL). The easiest solution for now is to alway prepare again in the worker. req->file is safe to prepare though as long as it is checked before use. Signed-off-by: Stefan Bühler Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 48ea3977012a..576d9c652b4c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -329,9 +329,8 @@ struct io_kiocb { #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ #define REQ_F_FIXED_FILE 4 /* ctx owns file */ #define REQ_F_SEQ_PREV 8 /* sequential with previous */ -#define REQ_F_PREPPED 16 /* prep already done */ -#define REQ_F_IO_DRAIN 32 /* drain existing IO first */ -#define REQ_F_IO_DRAINED 64 /* drain done */ +#define REQ_F_IO_DRAIN 16 /* drain existing IO first */ +#define REQ_F_IO_DRAINED 32 /* drain done */ u64 user_data; u32 error; /* iopoll result from callback */ u32 sequence; @@ -896,9 +895,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, if (!req->file) return -EBADF; - /* For -EAGAIN retry, everything is already prepped */ - if (req->flags & REQ_F_PREPPED) - return 0; if (force_nonblock && !io_file_supports_async(req->file)) force_nonblock = false; @@ -941,7 +937,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, return -EINVAL; kiocb->ki_complete = io_complete_rw; } - req->flags |= REQ_F_PREPPED; return 0; } @@ -1227,16 +1222,12 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!req->file) return -EBADF; - /* Prep already done (EAGAIN retry) */ - if (req->flags & REQ_F_PREPPED) - return 0; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) return -EINVAL; - req->flags |= REQ_F_PREPPED; return 0; } @@ -1277,16 +1268,12 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!req->file) return -EBADF; - /* Prep already done (EAGAIN retry) */ - if (req->flags & REQ_F_PREPPED) - return 0; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) return -EINVAL; - req->flags |= REQ_F_PREPPED; return ret; } -- cgit From 932f4a630a695212bdc7379b05f9bd0dafc5d968 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 13 May 2019 17:17:03 -0700 Subject: mm/gup: replace get_user_pages_longterm() with FOLL_LONGTERM Pach series "Add FOLL_LONGTERM to GUP fast and use it". HFI1, qib, and mthca, use get_user_pages_fast() due to its performance advantages. These pages can be held for a significant time. But get_user_pages_fast() does not protect against mapping FS DAX pages. Introduce FOLL_LONGTERM and use this flag in get_user_pages_fast() which retains the performance while also adding the FS DAX checks. XDP has also shown interest in using this functionality.[1] In addition we change get_user_pages() to use the new FOLL_LONGTERM flag and remove the specialized get_user_pages_longterm call. [1] https://lkml.org/lkml/2019/3/19/939 "longterm" is a relative thing and at this point is probably a misnomer. This is really flagging a pin which is going to be given to hardware and can't move. I've thought of a couple of alternative names but I think we have to settle on if we are going to use FL_LAYOUT or something else to solve the "longterm" problem. Then I think we can change the flag to a better name. Secondly, it depends on how often you are registering memory. I have spoken with some RDMA users who consider MR in the performance path... For the overall application performance. I don't have the numbers as the tests for HFI1 were done a long time ago. But there was a significant advantage. Some of which is probably due to the fact that you don't have to hold mmap_sem. Finally, architecturally I think it would be good for everyone to use *_fast. There are patches submitted to the RDMA list which would allow the use of *_fast (they reworking the use of mmap_sem) and as soon as they are accepted I'll submit a patch to convert the RDMA core as well. Also to this point others are looking to use *_fast. As an aside, Jasons pointed out in my previous submission that *_fast and *_unlocked look very much the same. I agree and I think further cleanup will be coming. But I'm focused on getting the final solution for DAX at the moment. This patch (of 7): This patch starts a series which aims to support FOLL_LONGTERM in get_user_pages_fast(). Some callers who would like to do a longterm (user controlled pin) of pages with the fast variant of GUP for performance purposes. Rather than have a separate get_user_pages_longterm() call, introduce FOLL_LONGTERM and change the longterm callers to use it. This patch does not change any functionality. In the short term "longterm" or user controlled pins are unsafe for Filesystems and FS DAX in particular has been blocked. However, callers of get_user_pages_fast() were not "protected". FOLL_LONGTERM can _only_ be supported with get_user_pages[_fast]() as it requires vmas to determine if DAX is in use. NOTE: In merging with the CMA changes we opt to change the get_user_pages() call in check_and_migrate_cma_pages() to a call of __get_user_pages_locked() on the newly migrated pages. This makes the code read better in that we are calling __get_user_pages_locked() on the pages before and after a potential migration. As a side affect some of the interfaces are cleaned up but this is not the primary purpose of the series. In review[1] it was asked: > This I don't get - if you do lock down long term mappings performance > of the actual get_user_pages call shouldn't matter to start with. > > What do I miss? A couple of points. First "longterm" is a relative thing and at this point is probably a misnomer. This is really flagging a pin which is going to be given to hardware and can't move. I've thought of a couple of alternative names but I think we have to settle on if we are going to use FL_LAYOUT or something else to solve the "longterm" problem. Then I think we can change the flag to a better name. Second, It depends on how often you are registering memory. I have spoken with some RDMA users who consider MR in the performance path... For the overall application performance. I don't have the numbers as the tests for HFI1 were done a long time ago. But there was a significant advantage. Some of which is probably due to the fact that you don't have to hold mmap_sem. Finally, architecturally I think it would be good for everyone to use *_fast. There are patches submitted to the RDMA list which would allow the use of *_fast (they reworking the use of mmap_sem) and as soon as they are accepted I'll submit a patch to convert the RDMA core as well. Also to this point others are looking to use *_fast. As an asside, Jasons pointed out in my previous submission that *_fast and *_unlocked look very much the same. I agree and I think further cleanup will be coming. But I'm focused on getting the final solution for DAX at the moment. [1] https://lore.kernel.org/lkml/20190220180255.GA12020@iweiny-DESK2.sc.intel.com/T/#md6abad2569f3bf6c1f03686c8097ab6563e94965 [ira.weiny@intel.com: v3] Link: http://lkml.kernel.org/r/20190328084422.29911-2-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190328084422.29911-2-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190317183438.2057-2-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: Andrew Morton Cc: Aneesh Kumar K.V Cc: Michal Hocko Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Peter Zijlstra Cc: Jason Gunthorpe Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S. Miller" Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Rich Felker Cc: Yoshinori Sato Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Ralf Baechle Cc: James Hogan Cc: Dan Williams Cc: Mike Marshall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/io_uring.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 48ea3977012a..fdc18321d70c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2697,8 +2697,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, ret = 0; down_read(¤t->mm->mmap_sem); - pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE, - pages, vmas); + pret = get_user_pages(ubuf, nr_pages, + FOLL_WRITE | FOLL_LONGTERM, + pages, vmas); if (pret == nr_pages) { /* don't support file backed memory */ for (j = 0; j < nr_pages; j++) { -- cgit From 44a9bd18a0f06bba19d155aeaa11e2edce898293 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 May 2019 20:00:30 -0600 Subject: io_uring: fix failure to verify SQ_AFF cpu The test case we have is rightfully failing with the current kernel: io_uring_setup(1, 0x7ffe2cafebe0), flags: IORING_SETUP_SQPOLL|IORING_SETUP_SQ_AFF, resv: 0x00000000 0x00000000 0x00000000 0x00000000 0x00000000, sq_thread_cpu: 4 expected -1, got 3 This is in a vm, and CPU3 is the last valid one, hence asking for 4 should fail the setup with -EINVAL, not succeed. The problem is that we're using array_index_nospec() with nr_cpu_ids as the index, hence we wrap and end up using CPU0 instead of CPU4. This makes the setup succeed where it should be failing. We don't need to use array_index_nospec() as we're not indexing any array with this. Instead just compare with nr_cpu_ids directly. This is fine as we're checking with cpu_online() afterwards. Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 576d9c652b4c..249a1e4e60e6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2454,10 +2454,11 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, ctx->sq_thread_idle = HZ; if (p->flags & IORING_SETUP_SQ_AFF) { - int cpu = array_index_nospec(p->sq_thread_cpu, - nr_cpu_ids); + int cpu = p->sq_thread_cpu; ret = -EINVAL; + if (cpu >= nr_cpu_ids) + goto err; if (!cpu_online(cpu)) goto err; -- cgit From c71ffb673cd9bb2ddc575ede9055f265b2535690 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 May 2019 20:58:29 -0600 Subject: io_uring: remove 'ev_flags' argument We always pass in 0 for the cqe flags argument, since the support for "this read hit page cache" hint was dropped. Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 249a1e4e60e6..ac0407693834 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -489,7 +489,7 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) } static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, - long res, unsigned ev_flags) + long res) { struct io_uring_cqe *cqe; @@ -502,7 +502,7 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, if (cqe) { WRITE_ONCE(cqe->user_data, ki_user_data); WRITE_ONCE(cqe->res, res); - WRITE_ONCE(cqe->flags, ev_flags); + WRITE_ONCE(cqe->flags, 0); } else { unsigned overflow = READ_ONCE(ctx->cq_ring->overflow); @@ -521,12 +521,12 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) } static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned ev_flags) + long res) { unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - io_cqring_fill_event(ctx, user_data, res, ev_flags); + io_cqring_fill_event(ctx, user_data, res); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -628,7 +628,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, req = list_first_entry(done, struct io_kiocb, list); list_del(&req->list); - io_cqring_fill_event(ctx, req->user_data, req->error, 0); + io_cqring_fill_event(ctx, req->user_data, req->error); (*nr_events)++; if (refcount_dec_and_test(&req->refs)) { @@ -776,7 +776,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2) kiocb_end_write(kiocb); - io_cqring_add_event(req->ctx, req->user_data, res, 0); + io_cqring_add_event(req->ctx, req->user_data, res); io_put_req(req); } @@ -1211,7 +1211,7 @@ static int io_nop(struct io_kiocb *req, u64 user_data) if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - io_cqring_add_event(ctx, user_data, err, 0); + io_cqring_add_event(ctx, user_data, err); io_put_req(req); return 0; } @@ -1256,7 +1256,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, end > 0 ? end : LLONG_MAX, fsync_flags & IORING_FSYNC_DATASYNC); - io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); + io_cqring_add_event(req->ctx, sqe->user_data, ret); io_put_req(req); return 0; } @@ -1300,7 +1300,7 @@ static int io_sync_file_range(struct io_kiocb *req, ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags); - io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); + io_cqring_add_event(req->ctx, sqe->user_data, ret); io_put_req(req); return 0; } @@ -1358,7 +1358,7 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) } spin_unlock_irq(&ctx->completion_lock); - io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); + io_cqring_add_event(req->ctx, sqe->user_data, ret); io_put_req(req); return 0; } @@ -1367,7 +1367,7 @@ static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req, __poll_t mask) { req->poll.done = true; - io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask), 0); + io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask)); io_commit_cqring(ctx); } @@ -1687,7 +1687,7 @@ restart: io_put_req(req); if (ret) { - io_cqring_add_event(ctx, sqe->user_data, ret, 0); + io_cqring_add_event(ctx, sqe->user_data, ret); io_put_req(req); } @@ -1992,7 +1992,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, continue; } - io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret, 0); + io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret); } if (statep) @@ -2157,7 +2157,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) ret = io_submit_sqe(ctx, &s, statep); if (ret) - io_cqring_add_event(ctx, s.sqe->user_data, ret, 0); + io_cqring_add_event(ctx, s.sqe->user_data, ret); } io_commit_sqring(ctx); -- cgit From 2bbcd6d3b36a75a19be4917807f54ae32dd26aba Mon Sep 17 00:00:00 2001 From: Roman Penyaev Date: Thu, 16 May 2019 10:53:57 +0200 Subject: io_uring: fix infinite wait in khread_park() on io_finish_async() This fixes couple of races which lead to infinite wait of park completion with the following backtraces: [20801.303319] Call Trace: [20801.303321] ? __schedule+0x284/0x650 [20801.303323] schedule+0x33/0xc0 [20801.303324] schedule_timeout+0x1bc/0x210 [20801.303326] ? schedule+0x3d/0xc0 [20801.303327] ? schedule_timeout+0x1bc/0x210 [20801.303329] ? preempt_count_add+0x79/0xb0 [20801.303330] wait_for_completion+0xa5/0x120 [20801.303331] ? wake_up_q+0x70/0x70 [20801.303333] kthread_park+0x48/0x80 [20801.303335] io_finish_async+0x2c/0x70 [20801.303336] io_ring_ctx_wait_and_kill+0x95/0x180 [20801.303338] io_uring_release+0x1c/0x20 [20801.303339] __fput+0xad/0x210 [20801.303341] task_work_run+0x8f/0xb0 [20801.303342] exit_to_usermode_loop+0xa0/0xb0 [20801.303343] do_syscall_64+0xe0/0x100 [20801.303349] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [20801.303380] Call Trace: [20801.303383] ? __schedule+0x284/0x650 [20801.303384] schedule+0x33/0xc0 [20801.303386] io_sq_thread+0x38a/0x410 [20801.303388] ? __switch_to_asm+0x40/0x70 [20801.303390] ? wait_woken+0x80/0x80 [20801.303392] ? _raw_spin_lock_irqsave+0x17/0x40 [20801.303394] ? io_submit_sqes+0x120/0x120 [20801.303395] kthread+0x112/0x130 [20801.303396] ? kthread_create_on_node+0x60/0x60 [20801.303398] ret_from_fork+0x35/0x40 o kthread_park() waits for park completion, so io_sq_thread() loop should check kthread_should_park() along with khread_should_stop(), otherwise if kthread_park() is called before prepare_to_wait() the following schedule() never returns: CPU#0 CPU#1 io_sq_thread_stop(): io_sq_thread(): while(!kthread_should_stop() && !ctx->sqo_stop) { ctx->sqo_stop = 1; kthread_park() prepare_to_wait(); if (kthread_should_stop() { } schedule(); <<< nobody checks park flag, <<< so schedule and never return o if the flag ctx->sqo_stop is observed by the io_sq_thread() loop it is quite possible, that kthread_should_park() check and the following kthread_parkme() is never called, because kthread_park() has not been yet called, but few moments later is is called and waits there for park completion, which never happens, because kthread has already exited: CPU#0 CPU#1 io_sq_thread_stop(): io_sq_thread(): ctx->sqo_stop = 1; while(!kthread_should_stop() && !ctx->sqo_stop) { <<< observe sqo_stop and exit the loop } if (kthread_should_park()) kthread_parkme(); <<< never called, since was <<< never parked kthread_park() <<< waits forever for park completion In the current patch we quit the loop by only kthread_should_park() check (kthread_park() is synchronous, so kthread_should_stop() is never observed), and we abandon ->sqo_stop flag, since it is racy. At the end of the io_sq_thread() we unconditionally call parmke(), since we've exited the loop by the park flag. Signed-off-by: Roman Penyaev Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index ac0407693834..67d1aae349d7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -231,7 +231,6 @@ struct io_ring_ctx { struct task_struct *sqo_thread; /* if using sq thread polling */ struct mm_struct *sqo_mm; wait_queue_head_t sqo_wait; - unsigned sqo_stop; struct { /* CQ ring */ @@ -2015,7 +2014,7 @@ static int io_sq_thread(void *data) set_fs(USER_DS); timeout = inflight = 0; - while (!kthread_should_stop() && !ctx->sqo_stop) { + while (!kthread_should_park()) { bool all_fixed, mm_fault = false; int i; @@ -2077,7 +2076,7 @@ static int io_sq_thread(void *data) smp_mb(); if (!io_get_sqring(ctx, &sqes[0])) { - if (kthread_should_stop()) { + if (kthread_should_park()) { finish_wait(&ctx->sqo_wait, &wait); break; } @@ -2127,8 +2126,7 @@ static int io_sq_thread(void *data) mmput(cur_mm); } - if (kthread_should_park()) - kthread_parkme(); + kthread_parkme(); return 0; } @@ -2260,8 +2258,11 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) static void io_sq_thread_stop(struct io_ring_ctx *ctx) { if (ctx->sqo_thread) { - ctx->sqo_stop = 1; - mb(); + /* + * The park is a bit of a work-around, without it we get + * warning spews on shutdown with SQPOLL set and affinity + * set to a single CPU. + */ kthread_park(ctx->sqo_thread); kthread_stop(ctx->sqo_thread); ctx->sqo_thread = NULL; -- cgit From dc6ce4bc2b355a47f225a0205046b3ebf29a7f72 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Thu, 16 May 2019 11:46:30 +0800 Subject: io_uring: adjust smp_rmb inside io_cqring_events Whenever smp_rmb is required to use io_cqring_events, keep smp_rmb inside the function io_cqring_events. Signed-off-by: Jackie Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 67d1aae349d7..9cc7a101ef2a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2167,6 +2167,8 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) static unsigned io_cqring_events(struct io_cq_ring *ring) { + /* See comment at the top of this file */ + smp_rmb(); return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head); } @@ -2182,8 +2184,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, DEFINE_WAIT(wait); int ret; - /* See comment at the top of this file */ - smp_rmb(); if (io_cqring_events(ring) >= min_events) return 0; @@ -2205,8 +2205,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE); ret = 0; - /* See comment at the top of this file */ - smp_rmb(); if (io_cqring_events(ring) >= min_events) break; -- cgit From fdb288a679cdf6a71f3c1ae6f348ba4dae742681 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Thu, 16 May 2019 11:46:31 +0800 Subject: io_uring: use wait_event_interruptible for cq_wait conditional wait The previous patch has ensured that io_cqring_events contain smp_rmb memory barriers, Now we can use wait_event_interruptible to keep the code simple. Signed-off-by: Jackie Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9cc7a101ef2a..383d208ca0d2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2181,7 +2181,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, { struct io_cq_ring *ring = ctx->cq_ring; sigset_t ksigmask, sigsaved; - DEFINE_WAIT(wait); int ret; if (io_cqring_events(ring) >= min_events) @@ -2201,21 +2200,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } - do { - prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE); - - ret = 0; - if (io_cqring_events(ring) >= min_events) - break; - - schedule(); - + ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events); + if (ret == -ERESTARTSYS) ret = -EINTR; - if (signal_pending(current)) - break; - } while (1); - - finish_wait(&ctx->wait, &wait); if (sig) restore_user_sigmask(sig, &sigsaved); -- cgit