From d63d1b5edb7b832210bfde587ba9e7549fa064eb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 10 Dec 2019 10:38:56 -0700 Subject: io_uring: add support for fallocate() This exposes fallocate(2) through io_uring. Signed-off-by: Jens Axboe --- fs/io_uring.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5953d7f13690..7a4e00ef02be 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -319,6 +319,7 @@ struct io_sync { loff_t len; loff_t off; int flags; + int mode; }; struct io_cancel { @@ -2101,6 +2102,54 @@ static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, return 0; } +static void io_fallocate_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + int ret; + + ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, + req->sync.len); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); +} + +static int io_fallocate_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (sqe->ioprio || sqe->buf_index || sqe->rw_flags) + return -EINVAL; + + req->sync.off = READ_ONCE(sqe->off); + req->sync.len = READ_ONCE(sqe->addr); + req->sync.mode = READ_ONCE(sqe->len); + return 0; +} + +static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_wq_work *work, *old_work; + + /* fallocate always requiring blocking context */ + if (force_nonblock) { + io_put_req(req); + req->work.func = io_fallocate_finish; + return -EAGAIN; + } + + work = old_work = &req->work; + io_fallocate_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + + return 0; +} + static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -3123,6 +3172,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_ACCEPT: ret = io_accept_prep(req, sqe); break; + case IORING_OP_FALLOCATE: + ret = io_fallocate_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3277,6 +3329,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_async_cancel(req, nxt); break; + case IORING_OP_FALLOCATE: + if (sqe) { + ret = io_fallocate_prep(req, sqe); + if (ret) + break; + } + ret = io_fallocate(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; -- cgit From 15b71abe7b52df214785dde0de9f581cc0216d17 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Dec 2019 11:20:36 -0700 Subject: io_uring: add support for IORING_OP_OPENAT This works just like openat(2), except it can be performed async. For the normal case of a non-blocking path lookup this will complete inline. If we have to do IO to perform the open, it'll be done from async context. 
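Before the openat diff below, a quick userspace illustration of the IORING_OP_FALLOCATE opcode added above. Per io_fallocate_prep(), the file offset travels in sqe->off, the byte count in sqe->addr and the fallocate mode in sqe->len; the kernel side always punts to io-wq since vfs_fallocate() may block. This is only a sketch under assumptions: the helper name is made up, a free SQE from a mapped submission ring is assumed to be available, and the uapi header is assumed to already carry the new opcode.

#include <string.h>
#include <linux/io_uring.h>

/* Illustrative helper: describe an async fallocate(2) in an SQE. */
static void prep_fallocate(struct io_uring_sqe *sqe, int fd, int mode,
			   __u64 offset, __u64 nbytes, __u64 user_data)
{
	memset(sqe, 0, sizeof(*sqe));	/* ioprio/buf_index/rw_flags must be 0 */
	sqe->opcode = IORING_OP_FALLOCATE;
	sqe->fd = fd;
	sqe->off = offset;	/* becomes req->sync.off */
	sqe->addr = nbytes;	/* becomes req->sync.len */
	sqe->len = mode;	/* becomes req->sync.mode, e.g. FALLOC_FL_KEEP_SIZE */
	sqe->user_data = user_data;
}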
Signed-off-by: Jens Axboe --- fs/io_uring.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 93 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7a4e00ef02be..34cbce622fcd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -70,6 +70,8 @@ #include #include #include +#include +#include #define CREATE_TRACE_POINTS #include @@ -353,6 +355,15 @@ struct io_sr_msg { int msg_flags; }; +struct io_open { + struct file *file; + int dfd; + umode_t mode; + const char __user *fname; + struct filename *filename; + int flags; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -371,12 +382,17 @@ struct io_async_rw { ssize_t size; }; +struct io_async_open { + struct filename *filename; +}; + struct io_async_ctx { union { struct io_async_rw rw; struct io_async_msghdr msg; struct io_async_connect connect; struct io_timeout_data timeout; + struct io_async_open open; }; }; @@ -397,6 +413,7 @@ struct io_kiocb { struct io_timeout timeout; struct io_connect connect; struct io_sr_msg sr_msg; + struct io_open open; }; struct io_async_ctx *io; @@ -2150,6 +2167,67 @@ static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt, return 0; } +static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + req->open.mode = READ_ONCE(sqe->len); + req->open.fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + req->open.flags = READ_ONCE(sqe->open_flags); + + req->open.filename = getname(req->open.fname); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct open_flags op; + struct open_how how; + struct file *file; + int ret; + + if (force_nonblock) { + req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + return -EAGAIN; + } + + how = build_open_how(req->open.flags, req->open.mode); + ret = build_open_flags(&how, &op); + if (ret) + goto err; + + ret = get_unused_fd_flags(how.flags); + if (ret < 0) + goto err; + + file = do_filp_open(req->open.dfd, req->open.filename, &op); + if (IS_ERR(file)) { + put_unused_fd(ret); + ret = PTR_ERR(file); + } else { + fsnotify_open(file); + fd_install(ret, file); + } +err: + putname(req->open.filename); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -3175,6 +3253,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_FALLOCATE: ret = io_fallocate_prep(req, sqe); break; + case IORING_OP_OPENAT: + ret = io_openat_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3337,6 +3418,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_fallocate(req, nxt, force_nonblock); break; + case IORING_OP_OPENAT: + if (sqe) { + ret = io_openat_prep(req, sqe); + if (ret) + break; + } + ret = io_openat(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; @@ -3409,7 +3498,7 @@ static bool io_req_op_valid(int op) return op >= IORING_OP_NOP && op < IORING_OP_LAST; } -static int io_req_needs_file(struct io_kiocb *req) +static int io_req_needs_file(struct io_kiocb *req, int fd) { 
switch (req->opcode) { case IORING_OP_NOP: @@ -3419,6 +3508,8 @@ static int io_req_needs_file(struct io_kiocb *req) case IORING_OP_ASYNC_CANCEL: case IORING_OP_LINK_TIMEOUT: return 0; + case IORING_OP_OPENAT: + return fd != -1; default: if (io_req_op_valid(req->opcode)) return 1; @@ -3448,7 +3539,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, if (flags & IOSQE_IO_DRAIN) req->flags |= REQ_F_IO_DRAIN; - ret = io_req_needs_file(req); + ret = io_req_needs_file(req, fd); if (ret <= 0) return ret; -- cgit From 0c9d5ccd26a004f59333c06fbbb98f9cb1eed93d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Dec 2019 19:29:43 -0700 Subject: io-wq: add support for uncancellable work Not all work can be cancelled, some of it we may need to guarantee that it runs to completion. Allow the caller to set IO_WQ_WORK_NO_CANCEL on work that must not be cancelled. Note that the caller work function must also check for IO_WQ_WORK_NO_CANCEL on work that is marked IO_WQ_WORK_CANCEL. Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 34cbce622fcd..fe227650efd6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3460,8 +3460,11 @@ static void io_wq_submit_work(struct io_wq_work **workptr) struct io_kiocb *nxt = NULL; int ret = 0; - if (work->flags & IO_WQ_WORK_CANCEL) + /* if NO_CANCEL is set, we must still run the work */ + if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) == + IO_WQ_WORK_CANCEL) { ret = -ECANCELED; + } if (!ret) { req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; -- cgit From b5dba59e0cf7e2cc4d3b3b1ac5fe81ddf21959eb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Dec 2019 14:02:38 -0700 Subject: io_uring: add support for IORING_OP_CLOSE This works just like close(2), unsurprisingly. We remove the file descriptor and post the completion inline, then offload the actual (potential) last file put to async context. Mark the async part of this work as uncancellable, as we really must guarantee that the latter part of the close is run. Signed-off-by: Jens Axboe --- fs/io_uring.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index fe227650efd6..6aaff7bfe8b5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -301,6 +301,12 @@ struct io_poll_iocb { struct wait_queue_entry wait; }; +struct io_close { + struct file *file; + struct file *put_file; + int fd; +}; + struct io_timeout_data { struct io_kiocb *req; struct hrtimer timer; @@ -414,6 +420,7 @@ struct io_kiocb { struct io_connect connect; struct io_sr_msg sr_msg; struct io_open open; + struct io_close close; }; struct io_async_ctx *io; @@ -2228,6 +2235,94 @@ err: return 0; } +static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + /* + * If we queue this for async, it must not be cancellable. That would + * leave the 'file' in an undeterminate state. 
+ */ + req->work.flags |= IO_WQ_WORK_NO_CANCEL; + + if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || + sqe->rw_flags || sqe->buf_index) + return -EINVAL; + if (sqe->flags & IOSQE_FIXED_FILE) + return -EINVAL; + + req->close.fd = READ_ONCE(sqe->fd); + if (req->file->f_op == &io_uring_fops || + req->close.fd == req->ring_fd) + return -EBADF; + + return 0; +} + +static void io_close_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + + /* Invoked with files, we need to do the close */ + if (req->work.files) { + int ret; + + ret = filp_close(req->close.put_file, req->work.files); + if (ret < 0) { + req_set_fail_links(req); + } + io_cqring_add_event(req, ret); + } + + fput(req->close.put_file); + + /* we bypassed the re-issue, drop the submission reference */ + io_put_req(req); + io_put_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); +} + +static int io_close(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + int ret; + + req->close.put_file = NULL; + ret = __close_fd_get_file(req->close.fd, &req->close.put_file); + if (ret < 0) + return ret; + + /* if the file has a flush method, be safe and punt to async */ + if (req->close.put_file->f_op->flush && !io_wq_current_is_worker()) { + req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + goto eagain; + } + + /* + * No ->flush(), safely close from here and just punt the + * fput() to async context. + */ + ret = filp_close(req->close.put_file, current->files); + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + + if (io_wq_current_is_worker()) { + struct io_wq_work *old_work, *work; + + old_work = work = &req->work; + io_close_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + return 0; + } + +eagain: + req->work.func = io_close_finish; + return -EAGAIN; +} + static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -3256,6 +3351,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_OPENAT: ret = io_openat_prep(req, sqe); break; + case IORING_OP_CLOSE: + ret = io_close_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3426,6 +3524,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_openat(req, nxt, force_nonblock); break; + case IORING_OP_CLOSE: + if (sqe) { + ret = io_close_prep(req, sqe); + if (ret) + break; + } + ret = io_close(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; @@ -3572,6 +3678,9 @@ static int io_grab_files(struct io_kiocb *req) int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; + if (!req->ring_file) + return -EBADF; + rcu_read_lock(); spin_lock_irq(&ctx->inflight_lock); /* -- cgit From 05f3fb3c5397524feae2e73ee8e150a9090a7da2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 9 Dec 2019 11:22:50 -0700 Subject: io_uring: avoid ring quiesce for fixed file set unregister and update We currently fully quiesce the ring before an unregister or update of the fixed fileset. This is very expensive, and we can be a bit smarter about this. Add a percpu refcount for the file tables as a whole. Grab a percpu ref when we use a registered file, and put it on completion. This is cheap to do. Upon removal of a file from a set, switch the ref count to atomic mode. 
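Before going further with the fixed-file refcount description, a userspace sketch of the IORING_OP_CLOSE opcode added above. Per io_close_prep(), only sqe->fd is consumed, IOSQE_FIXED_FILE is rejected, and closing the ring fd itself returns -EBADF; if the file has a ->flush method the final put is punted to io-wq marked IO_WQ_WORK_NO_CANCEL. The helper below is illustrative only and assumes a uapi header that already defines the opcode.

#include <string.h>
#include <linux/io_uring.h>

/* Illustrative helper: async close(2) of a regular (non-fixed) fd. */
static void prep_close(struct io_uring_sqe *sqe, int fd, __u64 user_data)
{
	memset(sqe, 0, sizeof(*sqe));	/* all unused fields must be zero */
	sqe->opcode = IORING_OP_CLOSE;
	sqe->fd = fd;			/* must not be the io_uring fd itself */
	sqe->user_data = user_data;
}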
When we hit zero ref on the completion side, then we know we can drop the previously registered files. When the old files have been dropped, switch the ref back to percpu mode for normal operation. Since there's a period between doing the update and the kernel being done with it, add a IORING_OP_FILES_UPDATE opcode that can perform the same action. The application knows the update has completed when it gets the CQE for it. Between doing the update and receiving this completion, the application must continue to use the unregistered fd if submitting IO on this particular file. This takes the runtime of test/file-register from liburing from 14s to about 0.7s. Signed-off-by: Jens Axboe --- fs/io_uring.c | 485 ++++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 350 insertions(+), 135 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6aaff7bfe8b5..4325068324b7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -179,6 +179,21 @@ struct fixed_file_table { struct file **files; }; +enum { + FFD_F_ATOMIC, +}; + +struct fixed_file_data { + struct fixed_file_table *table; + struct io_ring_ctx *ctx; + + struct percpu_ref refs; + struct llist_head put_llist; + unsigned long state; + struct work_struct ref_work; + struct completion done; +}; + struct io_ring_ctx { struct { struct percpu_ref refs; @@ -231,7 +246,7 @@ struct io_ring_ctx { * readers must ensure that ->refs is alive as long as the file* is * used. Only updated through io_uring_register(2). */ - struct fixed_file_table *file_table; + struct fixed_file_data *file_data; unsigned nr_user_files; /* if used, fixed mapped user buffers */ @@ -370,6 +385,13 @@ struct io_open { int flags; }; +struct io_files_update { + struct file *file; + u64 arg; + u32 nr_args; + u32 offset; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -421,6 +443,7 @@ struct io_kiocb { struct io_sr_msg sr_msg; struct io_open open; struct io_close close; + struct io_files_update files_update; }; struct io_async_ctx *io; @@ -496,6 +519,9 @@ static void io_double_put_req(struct io_kiocb *req); static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); +static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_files_update *ip, + unsigned nr_args); static struct kmem_cache *req_cachep; @@ -945,6 +971,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) if (*nr) { kmem_cache_free_bulk(req_cachep, *nr, reqs); percpu_ref_put_many(&ctx->refs, *nr); + percpu_ref_put_many(&ctx->file_data->refs, *nr); *nr = 0; } } @@ -955,8 +982,12 @@ static void __io_free_req(struct io_kiocb *req) if (req->io) kfree(req->io); - if (req->file && !(req->flags & REQ_F_FIXED_FILE)) - fput(req->file); + if (req->file) { + if (req->flags & REQ_F_FIXED_FILE) + percpu_ref_put(&ctx->file_data->refs); + else + fput(req->file); + } if (req->flags & REQ_F_INFLIGHT) { unsigned long flags; @@ -3293,6 +3324,45 @@ static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) return 0; } +static int io_files_update_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (sqe->flags || sqe->ioprio || sqe->rw_flags) + return -EINVAL; + + req->files_update.offset = READ_ONCE(sqe->off); + req->files_update.nr_args = READ_ONCE(sqe->len); + if (!req->files_update.nr_args) + return -EINVAL; + req->files_update.arg = READ_ONCE(sqe->addr); + return 0; +} + 
+static int io_files_update(struct io_kiocb *req, bool force_nonblock) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_uring_files_update up; + int ret; + + if (force_nonblock) { + req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + return -EAGAIN; + } + + up.offset = req->files_update.offset; + up.fds = req->files_update.arg; + + mutex_lock(&ctx->uring_lock); + ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args); + mutex_unlock(&ctx->uring_lock); + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req(req); + return 0; +} + static int io_req_defer_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -3354,6 +3424,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_CLOSE: ret = io_close_prep(req, sqe); break; + case IORING_OP_FILES_UPDATE: + ret = io_files_update_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3532,6 +3605,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_close(req, nxt, force_nonblock); break; + case IORING_OP_FILES_UPDATE: + if (sqe) { + ret = io_files_update_prep(req, sqe); + if (ret) + break; + } + ret = io_files_update(req, force_nonblock); + break; default: ret = -EINVAL; break; @@ -3631,8 +3712,8 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, { struct fixed_file_table *table; - table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT]; - return table->files[index & IORING_FILE_TABLE_MASK]; + table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT]; + return table->files[index & IORING_FILE_TABLE_MASK];; } static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, @@ -3653,7 +3734,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, return ret; if (flags & IOSQE_FIXED_FILE) { - if (unlikely(!ctx->file_table || + if (unlikely(!ctx->file_data || (unsigned) fd >= ctx->nr_user_files)) return -EBADF; fd = array_index_nospec(fd, ctx->nr_user_files); @@ -3661,6 +3742,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, if (!req->file) return -EBADF; req->flags |= REQ_F_FIXED_FILE; + percpu_ref_get(&ctx->file_data->refs); } else { if (req->needs_fixed_file) return -EBADF; @@ -4338,19 +4420,37 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) #endif } +static void io_file_ref_kill(struct percpu_ref *ref) +{ + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + complete(&data->done); +} + static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { + struct fixed_file_data *data = ctx->file_data; unsigned nr_tables, i; - if (!ctx->file_table) + if (!data) return -ENXIO; + /* protect against inflight atomic switch, which drops the ref */ + flush_work(&data->ref_work); + percpu_ref_get(&data->refs); + percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill); + wait_for_completion(&data->done); + percpu_ref_put(&data->refs); + percpu_ref_exit(&data->refs); + __io_sqe_files_unregister(ctx); nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); for (i = 0; i < nr_tables; i++) - kfree(ctx->file_table[i].files); - kfree(ctx->file_table); - ctx->file_table = NULL; + kfree(data->table[i].files); + kfree(data->table); + kfree(data); + ctx->file_data = NULL; ctx->nr_user_files = 0; return 0; } @@ -4381,16 +4481,6 @@ static void io_finish_async(struct io_ring_ctx *ctx) } #if defined(CONFIG_UNIX) -static void 
io_destruct_skb(struct sk_buff *skb) -{ - struct io_ring_ctx *ctx = skb->sk->sk_user_data; - - if (ctx->io_wq) - io_wq_flush(ctx->io_wq); - - unix_destruct_scm(skb); -} - /* * Ensure the UNIX gc is aware of our file set, so we are certain that * the io_uring can be safely unregistered on process exit, even if we have @@ -4438,7 +4528,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) fpl->max = SCM_MAX_FD; fpl->count = nr_files; UNIXCB(skb).fp = fpl; - skb->destructor = io_destruct_skb; + skb->destructor = unix_destruct_scm; refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_queue_head(&sk->sk_receive_queue, skb); @@ -4500,7 +4590,7 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, int i; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_table[i]; + struct fixed_file_table *table = &ctx->file_data->table[i]; unsigned this_files; this_files = min(nr_files, IORING_MAX_FILES_TABLE); @@ -4515,36 +4605,159 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, return 0; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_table[i]; + struct fixed_file_table *table = &ctx->file_data->table[i]; kfree(table->files); } return 1; } +static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file) +{ +#if defined(CONFIG_UNIX) + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff_head list, *head = &sock->sk_receive_queue; + struct sk_buff *skb; + int i; + + __skb_queue_head_init(&list); + + /* + * Find the skb that holds this file in its SCM_RIGHTS. When found, + * remove this entry and rearrange the file array. + */ + skb = skb_dequeue(head); + while (skb) { + struct scm_fp_list *fp; + + fp = UNIXCB(skb).fp; + for (i = 0; i < fp->count; i++) { + int left; + + if (fp->fp[i] != file) + continue; + + unix_notinflight(fp->user, fp->fp[i]); + left = fp->count - 1 - i; + if (left) { + memmove(&fp->fp[i], &fp->fp[i + 1], + left * sizeof(struct file *)); + } + fp->count--; + if (!fp->count) { + kfree_skb(skb); + skb = NULL; + } else { + __skb_queue_tail(&list, skb); + } + fput(file); + file = NULL; + break; + } + + if (!file) + break; + + __skb_queue_tail(&list, skb); + + skb = skb_dequeue(head); + } + + if (skb_peek(&list)) { + spin_lock_irq(&head->lock); + while ((skb = __skb_dequeue(&list)) != NULL) + __skb_queue_tail(head, skb); + spin_unlock_irq(&head->lock); + } +#else + fput(file); +#endif +} + +struct io_file_put { + struct llist_node llist; + struct file *file; + struct completion *done; +}; + +static void io_ring_file_ref_switch(struct work_struct *work) +{ + struct io_file_put *pfile, *tmp; + struct fixed_file_data *data; + struct llist_node *node; + + data = container_of(work, struct fixed_file_data, ref_work); + + while ((node = llist_del_all(&data->put_llist)) != NULL) { + llist_for_each_entry_safe(pfile, tmp, node, llist) { + io_ring_file_put(data->ctx, pfile->file); + if (pfile->done) + complete(pfile->done); + else + kfree(pfile); + } + } + + percpu_ref_get(&data->refs); + percpu_ref_switch_to_percpu(&data->refs); +} + +static void io_file_data_ref_zero(struct percpu_ref *ref) +{ + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + + /* we can't safely switch from inside this context, punt to wq */ + queue_work(system_wq, &data->ref_work); +} + static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { __s32 __user *fds = (__s32 __user *) arg; unsigned 
nr_tables; + struct file *file; int fd, ret = 0; unsigned i; - if (ctx->file_table) + if (ctx->file_data) return -EBUSY; if (!nr_args) return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; + ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL); + if (!ctx->file_data) + return -ENOMEM; + ctx->file_data->ctx = ctx; + init_completion(&ctx->file_data->done); + nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); - ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table), + ctx->file_data->table = kcalloc(nr_tables, + sizeof(struct fixed_file_table), GFP_KERNEL); - if (!ctx->file_table) + if (!ctx->file_data->table) { + kfree(ctx->file_data); + ctx->file_data = NULL; return -ENOMEM; + } + + if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; + return -ENOMEM; + } + ctx->file_data->put_llist.first = NULL; + INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch); if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) { - kfree(ctx->file_table); - ctx->file_table = NULL; + percpu_ref_exit(&ctx->file_data->refs); + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; return -ENOMEM; } @@ -4561,13 +4774,14 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, continue; } - table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; index = i & IORING_FILE_TABLE_MASK; - table->files[index] = fget(fd); + file = fget(fd); ret = -EBADF; - if (!table->files[index]) + if (!file) break; + /* * Don't allow io_uring instances to be registered. If UNIX * isn't enabled, then this causes a reference cycle and this @@ -4575,26 +4789,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, * handle it just fine, but there's still no point in allowing * a ring fd as it doesn't support regular read/write anyway. */ - if (table->files[index]->f_op == &io_uring_fops) { - fput(table->files[index]); + if (file->f_op == &io_uring_fops) { + fput(file); break; } ret = 0; + table->files[index] = file; } if (ret) { for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file; - file = io_file_from_index(ctx, i); if (file) fput(file); } for (i = 0; i < nr_tables; i++) - kfree(ctx->file_table[i].files); + kfree(ctx->file_data->table[i].files); - kfree(ctx->file_table); - ctx->file_table = NULL; + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; ctx->nr_user_files = 0; return ret; } @@ -4606,69 +4820,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } -static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index) -{ -#if defined(CONFIG_UNIX) - struct file *file = io_file_from_index(ctx, index); - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff_head list, *head = &sock->sk_receive_queue; - struct sk_buff *skb; - int i; - - __skb_queue_head_init(&list); - - /* - * Find the skb that holds this file in its SCM_RIGHTS. When found, - * remove this entry and rearrange the file array. 
- */ - skb = skb_dequeue(head); - while (skb) { - struct scm_fp_list *fp; - - fp = UNIXCB(skb).fp; - for (i = 0; i < fp->count; i++) { - int left; - - if (fp->fp[i] != file) - continue; - - unix_notinflight(fp->user, fp->fp[i]); - left = fp->count - 1 - i; - if (left) { - memmove(&fp->fp[i], &fp->fp[i + 1], - left * sizeof(struct file *)); - } - fp->count--; - if (!fp->count) { - kfree_skb(skb); - skb = NULL; - } else { - __skb_queue_tail(&list, skb); - } - fput(file); - file = NULL; - break; - } - - if (!file) - break; - - __skb_queue_tail(&list, skb); - - skb = skb_dequeue(head); - } - - if (skb_peek(&list)) { - spin_lock_irq(&head->lock); - while ((skb = __skb_dequeue(&list)) != NULL) - __skb_queue_tail(head, skb); - spin_unlock_irq(&head->lock); - } -#else - fput(io_file_from_index(ctx, index)); -#endif -} - static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, int index) { @@ -4712,29 +4863,65 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } -static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) +static void io_atomic_switch(struct percpu_ref *ref) { - struct io_uring_files_update up; + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + clear_bit(FFD_F_ATOMIC, &data->state); +} + +static bool io_queue_file_removal(struct fixed_file_data *data, + struct file *file) +{ + struct io_file_put *pfile, pfile_stack; + DECLARE_COMPLETION_ONSTACK(done); + + /* + * If we fail allocating the struct we need for doing async reomval + * of this file, just punt to sync and wait for it. + */ + pfile = kzalloc(sizeof(*pfile), GFP_KERNEL); + if (!pfile) { + pfile = &pfile_stack; + pfile->done = &done; + } + + pfile->file = file; + llist_add(&pfile->llist, &data->put_llist); + + if (pfile == &pfile_stack) { + if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) { + percpu_ref_put(&data->refs); + percpu_ref_switch_to_atomic(&data->refs, + io_atomic_switch); + } + wait_for_completion(&done); + flush_work(&data->ref_work); + return false; + } + + return true; +} + +static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_files_update *up, + unsigned nr_args) +{ + struct fixed_file_data *data = ctx->file_data; + bool ref_switch = false; + struct file *file; __s32 __user *fds; int fd, i, err; __u32 done; - if (!ctx->file_table) - return -ENXIO; - if (!nr_args) - return -EINVAL; - if (copy_from_user(&up, arg, sizeof(up))) - return -EFAULT; - if (up.resv) - return -EINVAL; - if (check_add_overflow(up.offset, nr_args, &done)) + if (check_add_overflow(up->offset, nr_args, &done)) return -EOVERFLOW; if (done > ctx->nr_user_files) return -EINVAL; done = 0; - fds = u64_to_user_ptr(up.fds); + fds = u64_to_user_ptr(up->fds); while (nr_args) { struct fixed_file_table *table; unsigned index; @@ -4744,16 +4931,16 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, err = -EFAULT; break; } - i = array_index_nospec(up.offset, ctx->nr_user_files); - table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; + i = array_index_nospec(up->offset, ctx->nr_user_files); + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; index = i & IORING_FILE_TABLE_MASK; if (table->files[index]) { - io_sqe_file_unregister(ctx, i); + file = io_file_from_index(ctx, index); table->files[index] = NULL; + if (io_queue_file_removal(data, file)) + ref_switch = true; } if (fd != -1) { - struct file *file; - file = fget(fd); if (!file) { err = -EBADF; @@ -4779,11 
+4966,32 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, } nr_args--; done++; - up.offset++; + up->offset++; + } + + if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) { + percpu_ref_put(&data->refs); + percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); } return done ? done : err; } +static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + struct io_uring_files_update up; + + if (!ctx->file_data) + return -ENXIO; + if (!nr_args) + return -EINVAL; + if (copy_from_user(&up, arg, sizeof(up))) + return -EFAULT; + if (up.resv) + return -EINVAL; + + return __io_sqe_files_update(ctx, &up, nr_args); +} static void io_put_work(struct io_wq_work *work) { @@ -5546,7 +5754,6 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; - ctx->ring_sock->sk->sk_user_data = ctx; #endif fd_install(ret, file); return ret; @@ -5710,18 +5917,22 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (percpu_ref_is_dying(&ctx->refs)) return -ENXIO; - percpu_ref_kill(&ctx->refs); + if (opcode != IORING_UNREGISTER_FILES && + opcode != IORING_REGISTER_FILES_UPDATE) { + percpu_ref_kill(&ctx->refs); - /* - * Drop uring mutex before waiting for references to exit. If another - * thread is currently inside io_uring_enter() it might need to grab - * the uring_lock to make progress. If we hold it here across the drain - * wait, then we can deadlock. It's safe to drop the mutex here, since - * no new references will come in after we've killed the percpu ref. - */ - mutex_unlock(&ctx->uring_lock); - wait_for_completion(&ctx->completions[0]); - mutex_lock(&ctx->uring_lock); + /* + * Drop uring mutex before waiting for references to exit. If + * another thread is currently inside io_uring_enter() it might + * need to grab the uring_lock to make progress. If we hold it + * here across the drain wait, then we can deadlock. It's safe + * to drop the mutex here, since no new references will come in + * after we've killed the percpu ref. + */ + mutex_unlock(&ctx->uring_lock); + wait_for_completion(&ctx->completions[0]); + mutex_lock(&ctx->uring_lock); + } switch (opcode) { case IORING_REGISTER_BUFFERS: @@ -5762,9 +5973,13 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; } - /* bring the ctx back to life */ - reinit_completion(&ctx->completions[0]); - percpu_ref_reinit(&ctx->refs); + + if (opcode != IORING_UNREGISTER_FILES && + opcode != IORING_REGISTER_FILES_UPDATE) { + /* bring the ctx back to life */ + reinit_completion(&ctx->completions[0]); + percpu_ref_reinit(&ctx->refs); + } return ret; } -- cgit From eddc7ef52a6b37b7ba3d1c8a8fbb63d5d9914f8a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Dec 2019 21:18:10 -0700 Subject: io_uring: add support for IORING_OP_STATX This provides support for async statx(2) through io_uring. 
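Before the statx diff, a userspace sketch of the IORING_OP_FILES_UPDATE opcode introduced in the patch above. Per io_files_update_prep(), sqe->off is the starting index in the registered file table, sqe->len the number of descriptors and sqe->addr a pointer to an array of __s32 fds, where -1 drops a slot. The helper is illustrative, not part of the patch, and assumes a uapi header that already has the opcode.

#include <string.h>
#include <linux/io_uring.h>

/*
 * Illustrative helper: swap 'nr' slots of the registered file set starting
 * at 'offset' without quiescing the ring. The CQE result mirrors the 'done'
 * count returned by __io_sqe_files_update(); only after that completion may
 * userspace assume the kernel is finished with the old descriptors.
 */
static void prep_files_update(struct io_uring_sqe *sqe, __s32 *fds,
			      unsigned nr, unsigned offset, __u64 user_data)
{
	memset(sqe, 0, sizeof(*sqe));	/* flags/ioprio/rw_flags must be 0 */
	sqe->opcode = IORING_OP_FILES_UPDATE;
	sqe->fd = -1;			/* no file involved; later op table marks this needs_mm only */
	sqe->addr = (unsigned long) fds; /* array of fds, -1 removes a slot */
	sqe->len = nr;			/* must be non-zero */
	sqe->off = offset;		/* first index in the fixed file table */
	sqe->user_data = user_data;
}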
Signed-off-by: Jens Axboe --- fs/io_uring.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 85 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4325068324b7..115c7d413372 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -379,9 +379,13 @@ struct io_sr_msg { struct io_open { struct file *file; int dfd; - umode_t mode; + union { + umode_t mode; + unsigned mask; + }; const char __user *fname; struct filename *filename; + struct statx __user *buffer; int flags; }; @@ -2266,6 +2270,74 @@ err: return 0; } +static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + unsigned lookup_flags; + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + req->open.mask = READ_ONCE(sqe->len); + req->open.fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + req->open.flags = READ_ONCE(sqe->statx_flags); + + if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.flags)) + return -EINVAL; + + req->open.filename = getname_flags(req->open.fname, lookup_flags, NULL); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_open *ctx = &req->open; + unsigned lookup_flags; + struct path path; + struct kstat stat; + int ret; + + if (force_nonblock) + return -EAGAIN; + + if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->flags)) + return -EINVAL; + +retry: + /* filename_lookup() drops it, keep a reference */ + ctx->filename->refcnt++; + + ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path, + NULL); + if (ret) + goto err; + + ret = vfs_getattr(&path, &stat, ctx->mask, ctx->flags); + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + if (!ret) + ret = cp_statx(&stat, ctx->buffer); +err: + putname(ctx->filename); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { /* @@ -3427,6 +3499,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_FILES_UPDATE: ret = io_files_update_prep(req, sqe); break; + case IORING_OP_STATX: + ret = io_statx_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3613,6 +3688,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_files_update(req, force_nonblock); break; + case IORING_OP_STATX: + if (sqe) { + ret = io_statx_prep(req, sqe); + if (ret) + break; + } + ret = io_statx(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; @@ -3699,6 +3782,7 @@ static int io_req_needs_file(struct io_kiocb *req, int fd) case IORING_OP_LINK_TIMEOUT: return 0; case IORING_OP_OPENAT: + case IORING_OP_STATX: return fd != -1; default: if (io_req_op_valid(req->opcode)) -- cgit From ce35a47a3a0208a77b4d31b7f2e8ed57d624093d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Dec 2019 08:04:44 -0700 Subject: io_uring: add IOSQE_ASYNC io_uring defaults to always doing inline submissions, if at all possible. But for larger copies, even if the data is fully cached, that can take a long time. 
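Before continuing with the IOSQE_ASYNC change, a userspace sketch for the IORING_OP_STATX opcode just added above. Per io_statx_prep(), sqe->fd carries the directory fd, sqe->addr the pathname, sqe->addr2 the struct statx result buffer, sqe->len the STATX_* mask and sqe->statx_flags the AT_* lookup flags. As with the earlier sketches this is illustrative only, assuming a uapi header that carries the opcode and the statx_flags union member.

#include <string.h>
#include <linux/stat.h>
#include <linux/io_uring.h>

/* Illustrative helper: queue an asynchronous statx(2) equivalent. */
static void prep_statx(struct io_uring_sqe *sqe, int dfd, const char *path,
		       int flags, unsigned mask, struct statx *stxbuf,
		       __u64 user_data)
{
	memset(sqe, 0, sizeof(*sqe));		/* ioprio/buf_index must be 0 */
	sqe->opcode = IORING_OP_STATX;
	sqe->fd = dfd;				/* e.g. AT_FDCWD */
	sqe->addr = (unsigned long) path;	/* resolved via getname_flags() */
	sqe->addr2 = (unsigned long) stxbuf;	/* filled by cp_statx() on success */
	sqe->len = mask;			/* STATX_* field mask */
	sqe->statx_flags = flags;		/* AT_* flags, e.g. AT_SYMLINK_NOFOLLOW */
	sqe->user_data = user_data;
}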
Add an IOSQE_ASYNC flag that the application can set on the SQE - if set, it'll ensure that we always go async for those kinds of requests. Use the io-wq IO_WQ_WORK_CONCURRENT flag to ensure we get the concurrency we desire for this case. Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 115c7d413372..9ffcfdc6382b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -483,6 +483,7 @@ struct io_kiocb { #define REQ_F_INFLIGHT 16384 /* on inflight list */ #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ #define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ +#define REQ_F_FORCE_ASYNC 131072 /* IOSQE_ASYNC */ u64 user_data; u32 result; u32 sequence; @@ -4017,8 +4018,17 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) req_set_fail_links(req); io_double_put_req(req); } - } else + } else if ((req->flags & REQ_F_FORCE_ASYNC) && + !io_wq_current_is_worker()) { + /* + * Never try inline submit of IOSQE_ASYNC is set, go straight + * to async execution. + */ + req->work.flags |= IO_WQ_WORK_CONCURRENT; + io_queue_async_work(req); + } else { __io_queue_sqe(req, sqe); + } } static inline void io_queue_link_head(struct io_kiocb *req) @@ -4031,7 +4041,7 @@ static inline void io_queue_link_head(struct io_kiocb *req) } #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ - IOSQE_IO_HARDLINK) + IOSQE_IO_HARDLINK | IOSQE_ASYNC) static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_submit_state *state, struct io_kiocb **link) @@ -4044,6 +4054,8 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = -EINVAL; goto err_req; } + if (sqe->flags & IOSQE_ASYNC) + req->flags |= REQ_F_FORCE_ASYNC; ret = io_req_set_file(state, req, sqe); if (unlikely(ret)) { -- cgit From 9d76377f7e13c19441fdd066033345289f89b5fe Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 17 Dec 2019 02:22:07 +0300 Subject: io_uring: rename prev to head Calling "prev" a head of a link is a bit misleading. Rename it Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9ffcfdc6382b..a463402e9067 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4073,10 +4073,10 @@ err_req: * conditions are true (normal request), then just queue it. 
*/ if (*link) { - struct io_kiocb *prev = *link; + struct io_kiocb *head = *link; if (sqe->flags & IOSQE_IO_DRAIN) - (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; + head->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; if (sqe->flags & IOSQE_IO_HARDLINK) req->flags |= REQ_F_HARDLINK; @@ -4089,11 +4089,11 @@ err_req: ret = io_req_defer_prep(req, sqe); if (ret) { /* fail even hard links since we don't submit */ - prev->flags |= REQ_F_FAIL_LINK; + head->flags |= REQ_F_FAIL_LINK; goto err_req; } - trace_io_uring_link(ctx, req, prev); - list_add_tail(&req->link_list, &prev->link_list); + trace_io_uring_link(ctx, req, head); + list_add_tail(&req->link_list, &head->link_list); } else if (sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { req->flags |= REQ_F_LINK; if (sqe->flags & IOSQE_IO_HARDLINK) -- cgit From 32fe525b6d10fec956cfe68f0db76839cd7f0ea5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 17 Dec 2019 22:26:58 +0300 Subject: io_uring: move *queue_link_head() from common path Move io_queue_link_head() to links handling code in io_submit_sqe(), so it wouldn't need extra checks and would have better data locality. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index a463402e9067..165f77cfc6cb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4047,14 +4047,17 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_submit_state *state, struct io_kiocb **link) { struct io_ring_ctx *ctx = req->ctx; + unsigned int sqe_flags; int ret; + sqe_flags = READ_ONCE(sqe->flags); + /* enforce forwards compatibility on users */ - if (unlikely(sqe->flags & ~SQE_VALID_FLAGS)) { + if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) { ret = -EINVAL; goto err_req; } - if (sqe->flags & IOSQE_ASYNC) + if (sqe_flags & IOSQE_ASYNC) req->flags |= REQ_F_FORCE_ASYNC; ret = io_req_set_file(state, req, sqe); @@ -4075,10 +4078,10 @@ err_req: if (*link) { struct io_kiocb *head = *link; - if (sqe->flags & IOSQE_IO_DRAIN) + if (sqe_flags & IOSQE_IO_DRAIN) head->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; - if (sqe->flags & IOSQE_IO_HARDLINK) + if (sqe_flags & IOSQE_IO_HARDLINK) req->flags |= REQ_F_HARDLINK; if (io_alloc_async_ctx(req)) { @@ -4094,9 +4097,15 @@ err_req: } trace_io_uring_link(ctx, req, head); list_add_tail(&req->link_list, &head->link_list); - } else if (sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { + + /* last request of a link, enqueue the link */ + if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) { + io_queue_link_head(head); + *link = NULL; + } + } else if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { req->flags |= REQ_F_LINK; - if (sqe->flags & IOSQE_IO_HARDLINK) + if (sqe_flags & IOSQE_IO_HARDLINK) req->flags |= REQ_F_HARDLINK; INIT_LIST_HEAD(&req->link_list); @@ -4221,7 +4230,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, for (i = 0; i < nr; i++) { const struct io_uring_sqe *sqe; struct io_kiocb *req; - unsigned int sqe_flags; req = io_get_req(ctx, statep); if (unlikely(!req)) { @@ -4243,8 +4251,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } submitted++; - sqe_flags = sqe->flags; - req->ring_file = ring_file; req->ring_fd = ring_fd; req->has_user = *mm != NULL; @@ -4253,14 +4259,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, trace_io_uring_submit_sqe(ctx, req->user_data, true, async); if 
(!io_submit_sqe(req, sqe, statep, &link)) break; - /* - * If previous wasn't linked and we have a linked command, - * that's the end of the chain. Submit the previous link. - */ - if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) { - io_queue_link_head(link); - link = NULL; - } } if (link) -- cgit From add7b6b85a4dfa89283834d181e87ea2144b9028 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 18 Dec 2019 08:54:21 -0700 Subject: io_uring: remove two unnecessary function declarations __io_free_req() and io_double_put_req() aren't used before they are defined, so we can kill these two forwards. Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 165f77cfc6cb..b086a2dc47e2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -518,9 +518,7 @@ struct io_submit_state { static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_kiocb *req, long res); -static void __io_free_req(struct io_kiocb *req); static void io_put_req(struct io_kiocb *req); -static void io_double_put_req(struct io_kiocb *req); static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); -- cgit From d3656344fea0339fb0365c8df4d2beba4e0089cd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 18 Dec 2019 09:50:26 -0700 Subject: io_uring: add lookup table for various opcode needs We currently have various switch statements that check if an opcode needs a file, mm, etc. These are hard to keep in sync as opcodes are added. Add a struct io_op_def that holds all of this information, so we have just one spot to update when opcodes are added. This also enables us to NOT allocate req->io if a deferred command doesn't need it, and corrects some mistakes we had in terms of what commands need mm context. 
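To make the new table's conventions concrete before the diff below, here is a stand-alone, purely illustrative copy of the idea: each opcode gets one entry of single-bit flags describing what it needs, and a hypothetical socket-style opcode would be declared roughly as follows. None of this is part of the patch; the real struct io_op_def and io_op_defs[] live in fs/io_uring.c.

/* Stand-alone copy of the descriptor, for illustration only. */
struct io_op_def {
	unsigned async_ctx : 1;		/* needs req->io allocated for deferral/async */
	unsigned needs_mm : 1;		/* needs current->mm setup, does mm access */
	unsigned needs_file : 1;	/* needs req->file assigned */
	unsigned fd_non_neg : 1;	/* needs req->file only if fd >= 0 */
	unsigned hash_reg_file : 1;	/* hash wq insertion if file is a regular file */
	unsigned unbound_nonreg_file : 1; /* unbound wq insertion for non-regular files */
};

/*
 * Hypothetical opcode that punts via req->io, touches user memory and may
 * operate on sockets or pipes, so it goes to the unbound workqueue.
 */
static const struct io_op_def example_op_def = {
	.async_ctx		= 1,
	.needs_mm		= 1,
	.needs_file		= 1,
	.unbound_nonreg_file	= 1,
};

The BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST) added to io_uring_init() in the diff then keeps the table in lockstep with the opcode enum.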
Signed-off-by: Jens Axboe --- fs/io_uring.c | 208 +++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 155 insertions(+), 53 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index b086a2dc47e2..244aaccbc82b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -516,6 +516,135 @@ struct io_submit_state { unsigned int ios_left; }; +struct io_op_def { + /* needs req->io allocated for deferral/async */ + unsigned async_ctx : 1; + /* needs current->mm setup, does mm access */ + unsigned needs_mm : 1; + /* needs req->file assigned */ + unsigned needs_file : 1; + /* needs req->file assigned IFF fd is >= 0 */ + unsigned fd_non_neg : 1; + /* hash wq insertion if file is a regular file */ + unsigned hash_reg_file : 1; + /* unbound wq insertion if file is a non-regular file */ + unsigned unbound_nonreg_file : 1; +}; + +static const struct io_op_def io_op_defs[] = { + { + /* IORING_OP_NOP */ + }, + { + /* IORING_OP_READV */ + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_WRITEV */ + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_FSYNC */ + .needs_file = 1, + }, + { + /* IORING_OP_READ_FIXED */ + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_WRITE_FIXED */ + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_POLL_ADD */ + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_POLL_REMOVE */ + }, + { + /* IORING_OP_SYNC_FILE_RANGE */ + .needs_file = 1, + }, + { + /* IORING_OP_SENDMSG */ + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_RECVMSG */ + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_TIMEOUT */ + .async_ctx = 1, + .needs_mm = 1, + }, + { + /* IORING_OP_TIMEOUT_REMOVE */ + }, + { + /* IORING_OP_ACCEPT */ + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_ASYNC_CANCEL */ + }, + { + /* IORING_OP_LINK_TIMEOUT */ + .async_ctx = 1, + .needs_mm = 1, + }, + { + /* IORING_OP_CONNECT */ + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_FALLOCATE */ + .needs_file = 1, + }, + { + /* IORING_OP_OPENAT */ + .needs_file = 1, + .fd_non_neg = 1, + }, + { + /* IORING_OP_CLOSE */ + .needs_file = 1, + }, + { + /* IORING_OP_FILES_UPDATE */ + .needs_mm = 1, + }, + { + /* IORING_OP_STATX */ + .needs_mm = 1, + .needs_file = 1, + .fd_non_neg = 1, + }, +}; + static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); @@ -671,41 +800,20 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static inline bool io_req_needs_user(struct io_kiocb *req) -{ - return !(req->opcode == IORING_OP_READ_FIXED || - req->opcode == IORING_OP_WRITE_FIXED); -} - static inline bool io_prep_async_work(struct io_kiocb *req, struct io_kiocb **link) { + const struct io_op_def *def = &io_op_defs[req->opcode]; bool do_hashed = false; - switch (req->opcode) { - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - /* only regular files should be hashed for writes */ - if (req->flags & REQ_F_ISREG) + if (req->flags & REQ_F_ISREG) { + if (def->hash_reg_file) do_hashed = true; - /* fall-through */ - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case 
IORING_OP_SENDMSG: - case IORING_OP_RECVMSG: - case IORING_OP_ACCEPT: - case IORING_OP_POLL_ADD: - case IORING_OP_CONNECT: - /* - * We know REQ_F_ISREG is not set on some of these - * opcodes, but this enables us to keep the check in - * just one place. - */ - if (!(req->flags & REQ_F_ISREG)) + } else { + if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; - break; } - if (io_req_needs_user(req)) + if (def->needs_mm) req->work.flags |= IO_WQ_WORK_NEEDS_USER; *link = io_prep_linked_timeout(req); @@ -1826,6 +1934,8 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, static int io_alloc_async_ctx(struct io_kiocb *req) { + if (!io_op_defs[req->opcode].async_ctx) + return 0; req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); return req->io == NULL; } @@ -3765,29 +3875,13 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_wq_assign_next(workptr, nxt); } -static bool io_req_op_valid(int op) -{ - return op >= IORING_OP_NOP && op < IORING_OP_LAST; -} - static int io_req_needs_file(struct io_kiocb *req, int fd) { - switch (req->opcode) { - case IORING_OP_NOP: - case IORING_OP_POLL_REMOVE: - case IORING_OP_TIMEOUT: - case IORING_OP_TIMEOUT_REMOVE: - case IORING_OP_ASYNC_CANCEL: - case IORING_OP_LINK_TIMEOUT: + if (!io_op_defs[req->opcode].needs_file) return 0; - case IORING_OP_OPENAT: - case IORING_OP_STATX: - return fd != -1; - default: - if (io_req_op_valid(req->opcode)) - return 1; - return -EINVAL; - } + if (fd == -1 && io_op_defs[req->opcode].fd_non_neg) + return 0; + return 1; } static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, @@ -3804,7 +3898,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, { struct io_ring_ctx *ctx = req->ctx; unsigned flags; - int fd, ret; + int fd; flags = READ_ONCE(sqe->flags); fd = READ_ONCE(sqe->fd); @@ -3812,9 +3906,8 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, if (flags & IOSQE_IO_DRAIN) req->flags |= REQ_F_IO_DRAIN; - ret = io_req_needs_file(req, fd); - if (ret <= 0) - return ret; + if (!io_req_needs_file(req, fd)) + return 0; if (flags & IOSQE_FIXED_FILE) { if (unlikely(!ctx->file_data || @@ -4240,7 +4333,16 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, break; } - if (io_req_needs_user(req) && !*mm) { + /* will complete beyond this point, count as submitted */ + submitted++; + + if (unlikely(req->opcode >= IORING_OP_LAST)) { + io_cqring_add_event(req, -EINVAL); + io_double_put_req(req); + break; + } + + if (io_op_defs[req->opcode].needs_mm && !*mm) { mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); if (!mm_fault) { use_mm(ctx->sqo_mm); @@ -4248,7 +4350,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } } - submitted++; req->ring_file = ring_file; req->ring_fd = ring_fd; req->has_user = *mm != NULL; @@ -6104,6 +6205,7 @@ out_fput: static int __init io_uring_init(void) { + BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); return 0; }; -- cgit From ad3eb2c89fb24d14ac81f43eff8e85fece2c934d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 18 Dec 2019 17:12:20 -0700 Subject: io_uring: split overflow state into SQ and CQ side We currently check ->cq_overflow_list from both SQ and CQ context, which causes some bouncing of that cache line. Add separate bits of state for this instead, so that the SQ side can check using its own state, and likewise for the CQ side. 
This adds ->sq_check_overflow with the SQ state, and ->cq_check_overflow with the CQ state. If we hit an overflow condition, both of these bits are set. Likewise for overflow flush clear, we clear both bits. For the fast path of just checking if there's an overflow condition on either the SQ or CQ side, we can use our own private bit for this. Signed-off-by: Jens Axboe --- fs/io_uring.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 244aaccbc82b..dfa99da8b2d1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -224,13 +224,14 @@ struct io_ring_ctx { unsigned sq_thread_idle; unsigned cached_sq_dropped; atomic_t cached_cq_overflow; - struct io_uring_sqe *sq_sqes; + unsigned long sq_check_overflow; struct list_head defer_list; struct list_head timeout_list; struct list_head cq_overflow_list; wait_queue_head_t inflight_wait; + struct io_uring_sqe *sq_sqes; } ____cacheline_aligned_in_smp; struct io_rings *rings; @@ -272,6 +273,7 @@ struct io_ring_ctx { unsigned cq_entries; unsigned cq_mask; atomic_t cq_timeouts; + unsigned long cq_check_overflow; struct wait_queue_head cq_wait; struct fasync_struct *cq_fasync; struct eventfd_ctx *cq_ev_fd; @@ -950,6 +952,10 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) } io_commit_cqring(ctx); + if (cqe) { + clear_bit(0, &ctx->sq_check_overflow); + clear_bit(0, &ctx->cq_check_overflow); + } spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); @@ -983,6 +989,10 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); } else { + if (list_empty(&ctx->cq_overflow_list)) { + set_bit(0, &ctx->sq_check_overflow); + set_bit(0, &ctx->cq_check_overflow); + } refcount_inc(&req->refs); req->result = res; list_add_tail(&req->list, &ctx->cq_overflow_list); @@ -1285,19 +1295,21 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) { struct io_rings *rings = ctx->rings; - /* - * noflush == true is from the waitqueue handler, just ensure we wake - * up the task, and the next invocation will flush the entries. We - * cannot safely to it from here. - */ - if (noflush && !list_empty(&ctx->cq_overflow_list)) - return -1U; + if (test_bit(0, &ctx->cq_check_overflow)) { + /* + * noflush == true is from the waitqueue handler, just ensure + * we wake up the task, and the next invocation will flush the + * entries. We cannot safely to it from here. 
+ */ + if (noflush && !list_empty(&ctx->cq_overflow_list)) + return -1U; - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx, false); + } /* See comment at the top of this file */ smp_rmb(); - return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head); + return ctx->cached_cq_tail - READ_ONCE(rings->cq.head); } static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) @@ -4309,9 +4321,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, bool mm_fault = false; /* if we have a backlog and couldn't flush it all, return BUSY */ - if (!list_empty(&ctx->cq_overflow_list) && - !io_cqring_overflow_flush(ctx, false)) - return -EBUSY; + if (test_bit(0, &ctx->sq_check_overflow)) { + if (!list_empty(&ctx->cq_overflow_list) && + !io_cqring_overflow_flush(ctx, false)) + return -EBUSY; + } if (nr > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, nr); -- cgit From e94f141bd248ebdadcb7351f1e70b31cee5add53 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Dec 2019 12:06:02 -0700 Subject: io_uring: improve poll completion performance For busy IORING_OP_POLL_ADD workloads, we can have enough contention on the completion lock that we fail the inline completion path quite often as we fail the trylock on that lock. Add a list for deferred completions that we can use in that case. This helps reduce the number of async offloads we have to do, as if we get multiple completions in a row, we'll piggy back on to the poll_llist instead of having to queue our own offload. Signed-off-by: Jens Axboe --- fs/io_uring.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 88 insertions(+), 20 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index dfa99da8b2d1..c54a8bd37b54 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -286,7 +286,8 @@ struct io_ring_ctx { struct { spinlock_t completion_lock; - bool poll_multi_file; + struct llist_head poll_llist; + /* * ->poll_list is protected by the ctx->uring_lock for * io_uring instances that don't use IORING_SETUP_SQPOLL. @@ -296,6 +297,7 @@ struct io_ring_ctx { struct list_head poll_list; struct hlist_head *cancel_hash; unsigned cancel_hash_bits; + bool poll_multi_file; spinlock_t inflight_lock; struct list_head inflight_list; @@ -453,7 +455,14 @@ struct io_kiocb { }; struct io_async_ctx *io; - struct file *ring_file; + union { + /* + * ring_file is only used in the submission path, and + * llist_node is only used for poll deferred completions + */ + struct file *ring_file; + struct llist_node llist_node; + }; int ring_fd; bool has_user; bool in_async; @@ -725,6 +734,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); + init_llist_head(&ctx->poll_llist); INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); @@ -1320,6 +1330,20 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } +static inline bool io_req_multi_free(struct io_kiocb *req) +{ + /* + * If we're not using fixed files, we have to pair the completion part + * with the file put. Use regular completions for those, only batch + * free for fixed file and non-linked commands. 
+ */ + if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == REQ_F_FIXED_FILE) + && !io_is_fallback_req(req) && !req->io) + return true; + + return false; +} + /* * Find and free completed poll iocbs */ @@ -1339,14 +1363,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, (*nr_events)++; if (refcount_dec_and_test(&req->refs)) { - /* If we're not using fixed files, we have to pair the - * completion part with the file put. Use regular - * completions for those, only batch free for fixed - * file and non-linked commands. - */ - if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == - REQ_F_FIXED_FILE) && !io_is_fallback_req(req) && - !req->io) { + if (io_req_multi_free(req)) { reqs[to_free++] = req; if (to_free == ARRAY_SIZE(reqs)) io_free_req_many(ctx, reqs, &to_free); @@ -3081,6 +3098,44 @@ static void io_poll_complete_work(struct io_wq_work **workptr) io_wq_assign_next(workptr, nxt); } +static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes) +{ + void *reqs[IO_IOPOLL_BATCH]; + struct io_kiocb *req, *tmp; + int to_free = 0; + + spin_lock_irq(&ctx->completion_lock); + llist_for_each_entry_safe(req, tmp, nodes, llist_node) { + hash_del(&req->hash_node); + io_poll_complete(req, req->result, 0); + + if (refcount_dec_and_test(&req->refs)) { + if (io_req_multi_free(req)) { + reqs[to_free++] = req; + if (to_free == ARRAY_SIZE(reqs)) + io_free_req_many(ctx, reqs, &to_free); + } else { + req->flags |= REQ_F_COMP_LOCKED; + io_free_req(req); + } + } + } + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + io_free_req_many(ctx, reqs, &to_free); +} + +static void io_poll_flush(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct llist_node *nodes; + + nodes = llist_del_all(&req->ctx->poll_llist); + if (nodes) + __io_poll_flush(req->ctx, nodes); +} + static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { @@ -3088,7 +3143,6 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); struct io_ring_ctx *ctx = req->ctx; __poll_t mask = key_to_poll(key); - unsigned long flags; /* for instances that support it check for an event match first: */ if (mask && !(mask & poll->events)) @@ -3102,17 +3156,31 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, * If we have a link timeout we're going to need the completion_lock * for finalizing the request, mark us as having grabbed that already. 
*/ - if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { - hash_del(&req->hash_node); - io_poll_complete(req, mask, 0); - req->flags |= REQ_F_COMP_LOCKED; - io_put_req(req); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + if (mask) { + unsigned long flags; - io_cqring_ev_posted(ctx); - } else { - io_queue_async_work(req); + if (llist_empty(&ctx->poll_llist) && + spin_trylock_irqsave(&ctx->completion_lock, flags)) { + hash_del(&req->hash_node); + io_poll_complete(req, mask, 0); + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + io_cqring_ev_posted(ctx); + req = NULL; + } else { + req->result = mask; + req->llist_node.next = NULL; + /* if the list wasn't empty, we're done */ + if (!llist_add(&req->llist_node, &ctx->poll_llist)) + req = NULL; + else + req->work.func = io_poll_flush; + } } + if (req) + io_queue_async_work(req); return 1; } -- cgit From 3a6820f2bb8a079975109c25a5d1f29f46bce5d2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 22 Dec 2019 15:19:35 -0700 Subject: io_uring: add non-vectored read/write commands For uses cases that don't already naturally have an iovec, it's easier (or more convenient) to just use a buffer address + length. This is particular true if the use case is from languages that want to create a memory safe abstraction on top of io_uring, and where introducing the need for the iovec may impose an ownership issue. For those cases, they currently need an indirection buffer, which means allocating data just for this purpose. Add basic read/write that don't require the iovec. Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index c54a8bd37b54..407ba3388e14 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -654,6 +654,18 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .fd_non_neg = 1, }, + { + /* IORING_OP_READ */ + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_WRITE */ + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -1867,6 +1879,13 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, if (req->rw.kiocb.private) return -EINVAL; + if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { + ssize_t ret; + ret = import_single_range(rw, buf, sqe_len, *iovec, iter); + *iovec = NULL; + return ret; + } + if (req->io) { struct io_async_rw *iorw = &req->io->rw; @@ -3634,10 +3653,12 @@ static int io_req_defer_prep(struct io_kiocb *req, break; case IORING_OP_READV: case IORING_OP_READ_FIXED: + case IORING_OP_READ: ret = io_read_prep(req, sqe, true); break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: ret = io_write_prep(req, sqe, true); break; case IORING_OP_POLL_ADD: @@ -3741,6 +3762,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; case IORING_OP_READV: case IORING_OP_READ_FIXED: + case IORING_OP_READ: if (sqe) { ret = io_read_prep(req, sqe, force_nonblock); if (ret < 0) @@ -3750,6 +3772,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: if (sqe) { ret = io_write_prep(req, sqe, force_nonblock); if (ret < 0) -- cgit From ba04291eb66ed895f194ae5abd3748d72bf8aaea Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Dec 
2019 16:33:42 -0700 Subject: io_uring: allow use of offset == -1 to mean file position MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This behaves like preadv2/pwritev2 with offset == -1, it'll use (and update) the current file position. This obviously comes with the caveat that if the application has multiple read/writes in flight, then the end result will not be as expected. This is similar to threads sharing a file descriptor and doing IO using the current file position. Since this feature isn't easily detectable by doing a read or write, add a feature flags, IORING_FEAT_RW_CUR_POS, to allow applications to detect presence of this feature. Reported-by: 李通洲 Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 407ba3388e14..5286620e4e46 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -495,6 +495,7 @@ struct io_kiocb { #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ #define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ #define REQ_F_FORCE_ASYNC 131072 /* IOSQE_ASYNC */ +#define REQ_F_CUR_POS 262144 /* read/write uses file position */ u64 user_data; u32 result; u32 sequence; @@ -1711,6 +1712,10 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->flags |= REQ_F_ISREG; kiocb->ki_pos = READ_ONCE(sqe->off); + if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) { + req->flags |= REQ_F_CUR_POS; + kiocb->ki_pos = req->file->f_pos; + } kiocb->ki_flags = iocb_flags(kiocb->ki_filp); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); @@ -1782,6 +1787,10 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, bool in_async) { + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + + if (req->flags & REQ_F_CUR_POS) + req->file->f_pos = kiocb->ki_pos; if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw) *nxt = __io_complete_rw(kiocb, ret); else @@ -6154,7 +6163,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) goto err; p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE; + IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: -- cgit From 4840e418c2fc533d55ff6caa5b9313eed1d26cfd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Dec 2019 22:03:45 -0700 Subject: io_uring: add IORING_OP_FADVISE This adds support for doing fadvise through io_uring. We assume that WILLNEED doesn't block, but that DONTNEED may block. 
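As a rough illustration of the userspace side, here is a minimal sketch of filling a raw SQE for this opcode, assuming a uapi io_uring.h new enough to define IORING_OP_FADVISE and the fadvise_advice field; the field mapping mirrors what io_fadvise_prep() reads in the hunk below, and the user_data value is an arbitrary placeholder.

#include <linux/io_uring.h>
#include <string.h>

/*
 * Fill an already-acquired SQE slot with an IORING_OP_FADVISE request.
 * Only fd, off, len and fadvise_advice are consumed by the prep code;
 * ioprio, buf_index and addr must stay zero or the request is -EINVAL.
 */
static void prep_fadvise(struct io_uring_sqe *sqe, int fd,
                         __u64 offset, __u32 len, int advice)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_FADVISE;
        sqe->fd = fd;
        sqe->off = offset;
        sqe->len = len;
        sqe->fadvise_advice = advice;
        sqe->user_data = 1;     /* echoed back in the CQE */
}

The CQE res field then carries the vfs_fadvise() return value; as noted above, POSIX_FADV_DONTNEED may be punted to async context while the other advice values are expected to complete inline.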
Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5286620e4e46..9ca12b900b42 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -72,6 +72,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -400,6 +401,13 @@ struct io_files_update { u32 offset; }; +struct io_fadvise { + struct file *file; + u64 offset; + u32 len; + u32 advice; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -452,6 +460,7 @@ struct io_kiocb { struct io_open open; struct io_close close; struct io_files_update files_update; + struct io_fadvise fadvise; }; struct io_async_ctx *io; @@ -667,6 +676,10 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, }, + { + /* IORING_OP_FADVISE */ + .needs_file = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -2436,6 +2449,35 @@ err: return 0; } +static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + if (sqe->ioprio || sqe->buf_index || sqe->addr) + return -EINVAL; + + req->fadvise.offset = READ_ONCE(sqe->off); + req->fadvise.len = READ_ONCE(sqe->len); + req->fadvise.advice = READ_ONCE(sqe->fadvise_advice); + return 0; +} + +static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_fadvise *fa = &req->fadvise; + int ret; + + /* DONTNEED may block, others _should_ not */ + if (fa->advice == POSIX_FADV_DONTNEED && force_nonblock) + return -EAGAIN; + + ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { unsigned lookup_flags; @@ -3721,6 +3763,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_STATX: ret = io_statx_prep(req, sqe); break; + case IORING_OP_FADVISE: + ret = io_fadvise_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3917,6 +3962,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_statx(req, nxt, force_nonblock); break; + case IORING_OP_FADVISE: + if (sqe) { + ret = io_fadvise_prep(req, sqe); + if (ret) + break; + } + ret = io_fadvise(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; -- cgit From c1ca757bd6f4632c510714631ddcc2d13030fe1e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Dec 2019 22:18:28 -0700 Subject: io_uring: add IORING_OP_MADVISE This adds support for doing madvise(2) through io_uring. We assume that any operation can block, and hence punt everything async. This could be improved, but hard to make bullet proof. The async punt ensures it's safe. 
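Userspace wiring is analogous to fadvise, except the target is a memory range rather than a file; a sketch under the same header assumptions, with the advice value reused from the fadvise_advice SQE field and fd left at -1 purely as a convention since no file is involved.

#include <linux/io_uring.h>
#include <string.h>

/*
 * IORING_OP_MADVISE: addr/len describe the mapping, the advice is
 * carried in fadvise_advice; off, ioprio and buf_index must be zero.
 */
static void prep_madvise(struct io_uring_sqe *sqe, void *addr,
                         __u32 len, int advice)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_MADVISE;
        sqe->fd = -1;                   /* no file backs this request */
        sqe->addr = (unsigned long) addr;
        sqe->len = len;
        sqe->fadvise_advice = advice;
}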
Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9ca12b900b42..17a199c99c7c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -408,6 +408,13 @@ struct io_fadvise { u32 advice; }; +struct io_madvise { + struct file *file; + u64 addr; + u32 len; + u32 advice; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -461,6 +468,7 @@ struct io_kiocb { struct io_close close; struct io_files_update files_update; struct io_fadvise fadvise; + struct io_madvise madvise; }; struct io_async_ctx *io; @@ -680,6 +688,10 @@ static const struct io_op_def io_op_defs[] = { /* IORING_OP_FADVISE */ .needs_file = 1, }, + { + /* IORING_OP_MADVISE */ + .needs_mm = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -2449,6 +2461,42 @@ err: return 0; } +static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + if (sqe->ioprio || sqe->buf_index || sqe->off) + return -EINVAL; + + req->madvise.addr = READ_ONCE(sqe->addr); + req->madvise.len = READ_ONCE(sqe->len); + req->madvise.advice = READ_ONCE(sqe->fadvise_advice); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + struct io_madvise *ma = &req->madvise; + int ret; + + if (force_nonblock) + return -EAGAIN; + + ret = do_madvise(ma->addr, ma->len, ma->advice); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (sqe->ioprio || sqe->buf_index || sqe->addr) @@ -3766,6 +3814,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_FADVISE: ret = io_fadvise_prep(req, sqe); break; + case IORING_OP_MADVISE: + ret = io_madvise_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3970,6 +4021,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_fadvise(req, nxt, force_nonblock); break; + case IORING_OP_MADVISE: + if (sqe) { + ret = io_madvise_prep(req, sqe); + if (ret) + break; + } + ret = io_madvise(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; -- cgit From 2b85edfc0c90efc68dea3d665bb4111bf0694e05 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 28 Dec 2019 14:13:03 +0300 Subject: io_uring: batch getting pcpu references percpu_ref_tryget() has its own overhead. Instead getting a reference for each request, grab a bunch once per io_submit_sqes(). ~5% throughput boost for a "submit and wait 128 nops" benchmark. 
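The same amortisation idea in miniature, as a userspace analogy rather than the io_uring code itself: take the whole batch of references with one atomic operation up front and return only the unused remainder afterwards.

#include <stdatomic.h>

static atomic_long live_refs;

/* roughly what taking one reference per request costs: nr atomic RMWs */
static void get_refs_one_by_one(int nr)
{
        for (int i = 0; i < nr; i++)
                atomic_fetch_add(&live_refs, 1);
}

/* one RMW for the batch, plus one more only if part of it goes unused */
static void get_refs_batched(int nr, int submitted)
{
        atomic_fetch_add(&live_refs, nr);
        /* ... try to submit nr requests, 'submitted' of them succeed ... */
        if (submitted != nr)
                atomic_fetch_sub(&live_refs, nr - submitted);
}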
Signed-off-by: Pavel Begunkov __io_req_free_empty() -> __io_req_do_free() Signed-off-by: Jens Axboe --- fs/io_uring.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 17a199c99c7c..9b869fb6c635 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1083,9 +1083,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct io_kiocb *req; - if (!percpu_ref_tryget(&ctx->refs)) - return NULL; - if (!state) { req = kmem_cache_alloc(req_cachep, gfp); if (unlikely(!req)) @@ -1145,6 +1142,14 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) } } +static void __io_req_do_free(struct io_kiocb *req) +{ + if (likely(!io_is_fallback_req(req))) + kmem_cache_free(req_cachep, req); + else + clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req); +} + static void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1166,11 +1171,9 @@ static void __io_free_req(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); } - percpu_ref_put(&ctx->refs); - if (likely(!io_is_fallback_req(req))) - kmem_cache_free(req_cachep, req); - else - clear_bit_unlock(0, (unsigned long *) ctx->fallback_req); + + percpu_ref_put(&req->ctx->refs); + __io_req_do_free(req); } static bool io_link_cancel_timeout(struct io_kiocb *req) @@ -4539,6 +4542,9 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, return -EBUSY; } + if (!percpu_ref_tryget_many(&ctx->refs, nr)) + return -EAGAIN; + if (nr > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, nr); statep = &state; @@ -4555,7 +4561,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, break; } if (!io_get_sqring(ctx, req, &sqe)) { - __io_free_req(req); + __io_req_do_free(req); break; } @@ -4586,6 +4592,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, break; } + if (submitted != nr) + percpu_ref_put_many(&ctx->refs, nr - submitted); if (link) io_queue_link_head(link); if (statep) -- cgit From 8237e045983d82ba78eaab5f60b9300927fc6796 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 28 Dec 2019 10:48:22 -0700 Subject: io_uring: wrap multi-req freeing in struct req_batch This cleans up the code a bit, and it allows us to build on top of the multi-req freeing. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 65 ++++++++++++++++++++++++++++------------------------------- 1 file changed, 31 insertions(+), 34 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9b869fb6c635..0d02987abf40 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1132,14 +1132,19 @@ fallback: return NULL; } -static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) +struct req_batch { + void *reqs[IO_IOPOLL_BATCH]; + int to_free; +}; + +static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) { - if (*nr) { - kmem_cache_free_bulk(req_cachep, *nr, reqs); - percpu_ref_put_many(&ctx->refs, *nr); - percpu_ref_put_many(&ctx->file_data->refs, *nr); - *nr = 0; - } + if (!rb->to_free) + return; + kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); + percpu_ref_put_many(&ctx->refs, rb->to_free); + percpu_ref_put_many(&ctx->file_data->refs, rb->to_free); + rb->to_free = 0; } static void __io_req_do_free(struct io_kiocb *req) @@ -1371,7 +1376,7 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } -static inline bool io_req_multi_free(struct io_kiocb *req) +static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) { /* * If we're not using fixed files, we have to pair the completion part @@ -1379,8 +1384,12 @@ static inline bool io_req_multi_free(struct io_kiocb *req) * free for fixed file and non-linked commands. */ if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == REQ_F_FIXED_FILE) - && !io_is_fallback_req(req) && !req->io) + && !io_is_fallback_req(req) && !req->io) { + rb->reqs[rb->to_free++] = req; + if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) + io_free_req_many(req->ctx, rb); return true; + } return false; } @@ -1391,11 +1400,10 @@ static inline bool io_req_multi_free(struct io_kiocb *req) static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, struct list_head *done) { - void *reqs[IO_IOPOLL_BATCH]; + struct req_batch rb; struct io_kiocb *req; - int to_free; - to_free = 0; + rb.to_free = 0; while (!list_empty(done)) { req = list_first_entry(done, struct io_kiocb, list); list_del(&req->list); @@ -1403,19 +1411,13 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, io_cqring_fill_event(req, req->result); (*nr_events)++; - if (refcount_dec_and_test(&req->refs)) { - if (io_req_multi_free(req)) { - reqs[to_free++] = req; - if (to_free == ARRAY_SIZE(reqs)) - io_free_req_many(ctx, reqs, &to_free); - } else { - io_free_req(req); - } - } + if (refcount_dec_and_test(&req->refs) && + !io_req_multi_free(&rb, req)) + io_free_req(req); } io_commit_cqring(ctx); - io_free_req_many(ctx, reqs, &to_free); + io_free_req_many(ctx, &rb); } static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, @@ -3221,30 +3223,25 @@ static void io_poll_complete_work(struct io_wq_work **workptr) static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes) { - void *reqs[IO_IOPOLL_BATCH]; struct io_kiocb *req, *tmp; - int to_free = 0; + struct req_batch rb; + rb.to_free = 0; spin_lock_irq(&ctx->completion_lock); llist_for_each_entry_safe(req, tmp, nodes, llist_node) { hash_del(&req->hash_node); io_poll_complete(req, req->result, 0); - if (refcount_dec_and_test(&req->refs)) { - if (io_req_multi_free(req)) { - reqs[to_free++] = req; - if (to_free == ARRAY_SIZE(reqs)) - io_free_req_many(ctx, reqs, &to_free); - } else { - req->flags |= 
REQ_F_COMP_LOCKED; - io_free_req(req); - } + if (refcount_dec_and_test(&req->refs) && + !io_req_multi_free(&rb, req)) { + req->flags |= REQ_F_COMP_LOCKED; + io_free_req(req); } } spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); - io_free_req_many(ctx, reqs, &to_free); + io_free_req_many(ctx, &rb); } static void io_poll_flush(struct io_wq_work **workptr) -- cgit From c6ca97b30c47c7ad36107d3764bb4dc37026d171 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 28 Dec 2019 12:11:08 -0700 Subject: io_uring: extend batch freeing to cover more cases Currently we only batch free if fixed files are used, no links, no aux data, etc. This extends the batch freeing to only exclude the linked case and fallback case, and make io_free_req_many() handle the other cases just fine. Signed-off-by: Jens Axboe --- fs/io_uring.c | 100 ++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 69 insertions(+), 31 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0d02987abf40..1fbe3eb78d08 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1132,21 +1132,6 @@ fallback: return NULL; } -struct req_batch { - void *reqs[IO_IOPOLL_BATCH]; - int to_free; -}; - -static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) -{ - if (!rb->to_free) - return; - kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); - percpu_ref_put_many(&ctx->refs, rb->to_free); - percpu_ref_put_many(&ctx->file_data->refs, rb->to_free); - rb->to_free = 0; -} - static void __io_req_do_free(struct io_kiocb *req) { if (likely(!io_is_fallback_req(req))) @@ -1155,7 +1140,7 @@ static void __io_req_do_free(struct io_kiocb *req) clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req); } -static void __io_free_req(struct io_kiocb *req) +static void __io_req_aux_free(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1167,7 +1152,14 @@ static void __io_free_req(struct io_kiocb *req) else fput(req->file); } +} + +static void __io_free_req(struct io_kiocb *req) +{ + __io_req_aux_free(req); + if (req->flags & REQ_F_INFLIGHT) { + struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->inflight_lock, flags); @@ -1181,6 +1173,56 @@ static void __io_free_req(struct io_kiocb *req) __io_req_do_free(req); } +struct req_batch { + void *reqs[IO_IOPOLL_BATCH]; + int to_free; + int need_iter; +}; + +static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) +{ + if (!rb->to_free) + return; + if (rb->need_iter) { + int i, inflight = 0; + unsigned long flags; + + for (i = 0; i < rb->to_free; i++) { + struct io_kiocb *req = rb->reqs[i]; + + if (req->flags & REQ_F_FIXED_FILE) + req->file = NULL; + if (req->flags & REQ_F_INFLIGHT) + inflight++; + else + rb->reqs[i] = NULL; + __io_req_aux_free(req); + } + if (!inflight) + goto do_free; + + spin_lock_irqsave(&ctx->inflight_lock, flags); + for (i = 0; i < rb->to_free; i++) { + struct io_kiocb *req = rb->reqs[i]; + + if (req) { + list_del(&req->inflight_entry); + if (!--inflight) + break; + } + } + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + + if (waitqueue_active(&ctx->inflight_wait)) + wake_up(&ctx->inflight_wait); + } +do_free: + kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); + percpu_ref_put_many(&ctx->refs, rb->to_free); + percpu_ref_put_many(&ctx->file_data->refs, rb->to_free); + rb->to_free = rb->need_iter = 0; +} + static bool io_link_cancel_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1378,20 +1420,16 @@ static inline 
unsigned int io_sqring_entries(struct io_ring_ctx *ctx) static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) { - /* - * If we're not using fixed files, we have to pair the completion part - * with the file put. Use regular completions for those, only batch - * free for fixed file and non-linked commands. - */ - if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == REQ_F_FIXED_FILE) - && !io_is_fallback_req(req) && !req->io) { - rb->reqs[rb->to_free++] = req; - if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) - io_free_req_many(req->ctx, rb); - return true; - } + if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req)) + return false; - return false; + if (!(req->flags & REQ_F_FIXED_FILE) || req->io) + rb->need_iter++; + + rb->reqs[rb->to_free++] = req; + if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) + io_free_req_many(req->ctx, rb); + return true; } /* @@ -1403,7 +1441,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, struct req_batch rb; struct io_kiocb *req; - rb.to_free = 0; + rb.to_free = rb.need_iter = 0; while (!list_empty(done)) { req = list_first_entry(done, struct io_kiocb, list); list_del(&req->list); @@ -3226,7 +3264,7 @@ static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes) struct io_kiocb *req, *tmp; struct req_batch rb; - rb.to_free = 0; + rb.to_free = rb.need_iter = 0; spin_lock_irq(&ctx->completion_lock); llist_for_each_entry_safe(req, tmp, nodes, llist_node) { hash_del(&req->hash_node); -- cgit From 8110c1a6212e430a84edd2b83fe9043def8b743e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 28 Dec 2019 15:39:54 -0700 Subject: io_uring: add support for IORING_SETUP_CLAMP Some applications like to start small in terms of ring size, and then ramp up as needed. This is a bit tricky to do currently, since we don't advertise the max ring size. This adds IORING_SETUP_CLAMP. If set, and the values for SQ or CQ ring size exceed what we support, then clamp them at the max values instead of returning -EINVAL. Since we return the chosen ring sizes after setup, no further changes are needed on the application side. io_uring already changes the ring sizes if the application doesn't ask for power-of-two sizes, for example. Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1fbe3eb78d08..7c44b0ef10d7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6234,8 +6234,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) bool account_mem; int ret; - if (!entries || entries > IORING_MAX_ENTRIES) + if (!entries) return -EINVAL; + if (entries > IORING_MAX_ENTRIES) { + if (!(p->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + entries = IORING_MAX_ENTRIES; + } /* * Use twice as many entries for the CQ ring. It's possible for the @@ -6252,8 +6257,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) * to a power-of-two, if it isn't already. We do NOT impose * any cq vs sq ring sizing. 
*/ - if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES) + if (p->cq_entries < p->sq_entries) return -EINVAL; + if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { + if (!(p->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + p->cq_entries = IORING_MAX_CQ_ENTRIES; + } p->cq_entries = roundup_pow_of_two(p->cq_entries); } else { p->cq_entries = 2 * p->sq_entries; @@ -6345,7 +6355,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) } if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | - IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE)) + IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | + IORING_SETUP_CLAMP)) return -EINVAL; ret = io_uring_create(entries, &p); -- cgit From 9ef4f124894b7b9241a3cf5f9b40db0812783d66 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Dec 2019 21:24:44 +0300 Subject: io_uring: clamp to_submit in io_submit_sqes() Make io_submit_sqes() to clamp @to_submit itself. It removes duplicated code and prepares for following changes. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7c44b0ef10d7..497ed610e8b2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4577,6 +4577,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, return -EBUSY; } + nr = min(nr, ctx->sq_entries); + if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; @@ -4751,7 +4753,6 @@ static int io_sq_thread(void *data) ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; } - to_submit = min(to_submit, ctx->sq_entries); mutex_lock(&ctx->uring_lock); ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true); mutex_unlock(&ctx->uring_lock); @@ -6100,7 +6101,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, goto out; } - to_submit = min(to_submit, ctx->sq_entries); mutex_lock(&ctx->uring_lock); /* already have mm, so io_submit_sqes() won't try to grab it */ cur_mm = ctx->sqo_mm; -- cgit From ee7d46d9db19ded7b7222af95add63606318a480 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Dec 2019 21:24:45 +0300 Subject: io_uring: optimise head checks in io_get_sqring() A user may ask to submit more than there is in the ring, and then io_uring will submit as much as it can. However, in the last iteration it will allocate an io_kiocb and immediately free it. It could do better and adjust @to_submit to what is in the ring. And since the ring's head is already checked here, there is no need to do it in the loop, spamming with smp_load_acquire()'s barriers Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 497ed610e8b2..3398f4052ec0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4522,7 +4522,6 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe **sqe_ptr) { - struct io_rings *rings = ctx->rings; u32 *sq_array = ctx->sq_array; unsigned head; @@ -4534,12 +4533,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, * 2) allows the kernel side to track the head on its own, even * though the application is the one updating it. 
*/ - head = ctx->cached_sq_head; - /* make sure SQ entry isn't read before tail */ - if (unlikely(head == smp_load_acquire(&rings->sq.tail))) - return false; - - head = READ_ONCE(sq_array[head & ctx->sq_mask]); + head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]); if (likely(head < ctx->sq_entries)) { /* * All io need record the previous position, if LINK vs DARIN, @@ -4557,7 +4551,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, /* drop invalid entries */ ctx->cached_sq_head++; ctx->cached_sq_dropped++; - WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped); + WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped); return false; } @@ -4577,7 +4571,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, return -EBUSY; } - nr = min(nr, ctx->sq_entries); + /* make sure SQ entry isn't read before tail */ + nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; -- cgit From caf582c652feccd42c50923f0467c4f2dcef279e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Dec 2019 21:24:46 +0300 Subject: io_uring: optimise commit_sqring() for common case It should be pretty rare to not submitting anything when there is something in the ring. No need to keep heuristics for this case. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3398f4052ec0..8613eae31f4c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4501,14 +4501,12 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) { - /* - * Ensure any loads from the SQEs are done at this point, - * since once we write the new head, the application could - * write new data to them. - */ - smp_store_release(&rings->sq.head, ctx->cached_sq_head); - } + /* + * Ensure any loads from the SQEs are done at this point, + * since once we write the new head, the application could + * write new data to them. + */ + smp_store_release(&rings->sq.head, ctx->cached_sq_head); } /* -- cgit From 2550878f8421f7912fdd56b38c630b797f95c749 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Dec 2019 21:24:47 +0300 Subject: io_uring: remove extra io_wq_current_is_worker() io_wq workers use io_issue_sqe() to forward sqes and never io_queue_sqe(). Remove extra check for io_wq_current_is_worker() Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8613eae31f4c..0d083811ccb9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4371,8 +4371,7 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) req_set_fail_links(req); io_double_put_req(req); } - } else if ((req->flags & REQ_F_FORCE_ASYNC) && - !io_wq_current_is_worker()) { + } else if (req->flags & REQ_F_FORCE_ASYNC) { /* * Never try inline submit of IOSQE_ASYNC is set, go straight * to async execution. -- cgit From fddafacee287b3140212c92464077e971401f860 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 4 Jan 2020 20:19:44 -0700 Subject: io_uring: add support for send(2) and recv(2) This adds IORING_OP_SEND for send(2) support, and IORING_OP_RECV for recv(2) support. 
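From userspace these map naturally onto a buffer plus length; a minimal sketch assuming a uapi header that already defines IORING_OP_SEND and IORING_OP_RECV, with the field mapping taken from the prep/issue code below: addr is the buffer, len its size, msg_flags the usual MSG_* flags.

#include <linux/io_uring.h>
#include <string.h>

/* IORING_OP_SEND/RECV take a plain buffer instead of a user_msghdr */
static void prep_send(struct io_uring_sqe *sqe, int sockfd,
                      const void *buf, unsigned len, int flags)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_SEND;
        sqe->fd = sockfd;
        sqe->addr = (unsigned long) buf;
        sqe->len = len;
        sqe->msg_flags = flags;
}

static void prep_recv(struct io_uring_sqe *sqe, int sockfd,
                      void *buf, unsigned len, int flags)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_RECV;
        sqe->fd = sockfd;
        sqe->addr = (unsigned long) buf;
        sqe->len = len;
        sqe->msg_flags = flags;
}

The CQE res then holds the transferred byte count or a negative errno, exactly as send(2) or recv(2) would return.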
Signed-off-by: Jens Axboe --- fs/io_uring.c | 140 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 135 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0d083811ccb9..edf072b6cb8f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -377,8 +377,12 @@ struct io_connect { struct io_sr_msg { struct file *file; - struct user_msghdr __user *msg; + union { + struct user_msghdr __user *msg; + void __user *buf; + }; int msg_flags; + size_t len; }; struct io_open { @@ -692,6 +696,18 @@ static const struct io_op_def io_op_defs[] = { /* IORING_OP_MADVISE */ .needs_mm = 1, }, + { + /* IORING_OP_SEND */ + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_RECV */ + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -2802,8 +2818,9 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->len = READ_ONCE(sqe->len); - if (!io) + if (!io || req->opcode == IORING_OP_SEND) return 0; io->msg.iov = io->msg.fast_iov; @@ -2883,6 +2900,56 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, #endif } +static int io_send(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; + unsigned flags; + + ret = import_single_range(WRITE, sr->buf, sr->len, &iov, + &msg.msg_iter); + if (ret) + return ret; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_sendmsg_sock(sock, &msg, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + + io_cqring_add_event(req, ret); + if (ret < 0) + req_set_fail_links(req); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -2893,7 +2960,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - if (!io) + if (!io || req->opcode == IORING_OP_RECV) return 0; io->msg.iov = io->msg.fast_iov; @@ -2975,6 +3042,59 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, #endif } +static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; + unsigned flags; + + ret = import_single_range(READ, sr->buf, sr->len, &iov, + &msg.msg_iter); + if (ret) + return ret; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + msg.msg_iocb = NULL; + msg.msg_flags = 0; + + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if 
(force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_recvmsg_sock(sock, &msg, NULL, NULL, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + + io_cqring_add_event(req, ret); + if (ret < 0) + req_set_fail_links(req); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + + static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) @@ -3811,9 +3931,11 @@ static int io_req_defer_prep(struct io_kiocb *req, ret = io_prep_sfr(req, sqe); break; case IORING_OP_SENDMSG: + case IORING_OP_SEND: ret = io_sendmsg_prep(req, sqe); break; case IORING_OP_RECVMSG: + case IORING_OP_RECV: ret = io_recvmsg_prep(req, sqe); break; case IORING_OP_CONNECT: @@ -3956,20 +4078,28 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = io_sync_file_range(req, nxt, force_nonblock); break; case IORING_OP_SENDMSG: + case IORING_OP_SEND: if (sqe) { ret = io_sendmsg_prep(req, sqe); if (ret < 0) break; } - ret = io_sendmsg(req, nxt, force_nonblock); + if (req->opcode == IORING_OP_SENDMSG) + ret = io_sendmsg(req, nxt, force_nonblock); + else + ret = io_send(req, nxt, force_nonblock); break; case IORING_OP_RECVMSG: + case IORING_OP_RECV: if (sqe) { ret = io_recvmsg_prep(req, sqe); if (ret) break; } - ret = io_recvmsg(req, nxt, force_nonblock); + if (req->opcode == IORING_OP_RECVMSG) + ret = io_recvmsg(req, nxt, force_nonblock); + else + ret = io_recv(req, nxt, force_nonblock); break; case IORING_OP_TIMEOUT: if (sqe) { -- cgit From 96fd84d83a778450ffae737d9efa546ac3983b1f Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 7 Jan 2020 22:22:44 +0800 Subject: io_uring: Remove unnecessary null check Null check kfree is redundant, so remove it. This is detected by coccinelle. Signed-off-by: YueHaibing Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index edf072b6cb8f..00426b686092 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1160,8 +1160,7 @@ static void __io_req_aux_free(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - if (req->io) - kfree(req->io); + kfree(req->io); if (req->file) { if (req->flags & REQ_F_FIXED_FILE) percpu_ref_put(&ctx->file_data->refs); -- cgit From c150368b496837cb207712e78f903ccfd7633b93 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2020 08:26:07 -0700 Subject: io_uring: file set registration should use interruptible waits If an application attempts to register a set with unbounded requests pending, we can be stuck here forever if they don't complete. We can make this wait interruptible, and just abort if we get signaled. Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 00426b686092..2c036972930f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6524,8 +6524,13 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, * after we've killed the percpu ref. 
*/ mutex_unlock(&ctx->uring_lock); - wait_for_completion(&ctx->completions[0]); + ret = wait_for_completion_interruptible(&ctx->completions[0]); mutex_lock(&ctx->uring_lock); + if (ret) { + percpu_ref_resurrect(&ctx->refs); + ret = -EINTR; + goto out; + } } switch (opcode) { @@ -6571,8 +6576,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (opcode != IORING_UNREGISTER_FILES && opcode != IORING_REGISTER_FILES_UPDATE) { /* bring the ctx back to life */ - reinit_completion(&ctx->completions[0]); percpu_ref_reinit(&ctx->refs); +out: + reinit_completion(&ctx->completions[0]); } return ret; } -- cgit From 69b3e546139a21b3046b6bf0cb79d5e8c9a3fa75 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2020 11:01:46 -0700 Subject: io_uring: change io_ring_ctx bool fields into bit fields In preparation for adding another one, which would make us spill into another long (and hence bump the size of the ctx), change them to bit fields. Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2c036972930f..42bf83b3fbd5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -202,10 +202,10 @@ struct io_ring_ctx { struct { unsigned int flags; - bool compat; - bool account_mem; - bool cq_overflow_flushed; - bool drain_next; + int compat: 1; + int account_mem: 1; + int cq_overflow_flushed: 1; + int drain_next: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -994,7 +994,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) /* if force is set, the ring is going away. always drop after that */ if (force) - ctx->cq_overflow_flushed = true; + ctx->cq_overflow_flushed = 1; cqe = NULL; while (!list_empty(&ctx->cq_overflow_list)) { @@ -4489,9 +4489,9 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->drain_next)) { req->flags |= REQ_F_IO_DRAIN; - req->ctx->drain_next = false; + req->ctx->drain_next = 0; } - req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK); + req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK) != 0; ret = io_req_defer(req, sqe); if (ret) { -- cgit From f2842ab5b72d7ee5f7f8385c2d4f32c133f5837b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2020 11:04:00 -0700 Subject: io_uring: enable option to only trigger eventfd for async completions If an application is using eventfd notifications with poll to know when new SQEs can be issued, it's expecting the following read/writes to complete inline. And with that, it knows that there are events available, and don't want spurious wakeups on the eventfd for those requests. This adds IORING_REGISTER_EVENTFD_ASYNC, which works just like IORING_REGISTER_EVENTFD, except it only triggers notifications for events that happen from async completions (IRQ, or io-wq worker completions). Any completions inline from the submission itself will not trigger notifications. 
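A sketch of the registration from userspace, assuming __NR_io_uring_register is available from sys/syscall.h and that the installed io_uring.h already carries the new IORING_REGISTER_EVENTFD_ASYNC constant; ring_fd is whatever io_uring_setup(2) returned, and the eventfd flags are just a reasonable default.

#include <linux/io_uring.h>
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Register an eventfd that is only signalled for completions generated
 * from async context (io-wq or IRQ), never for inline completions.
 * Returns the eventfd on success, -1 on failure.
 */
static int register_evfd_async(int ring_fd)
{
        int efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);

        if (efd < 0)
                return -1;
        if (syscall(__NR_io_uring_register, ring_fd,
                    IORING_REGISTER_EVENTFD_ASYNC, &efd, 1) < 0) {
                close(efd);
                return -1;
        }
        return efd;
}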
Suggested-by: Mark Papadakis Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 42bf83b3fbd5..70656762244f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -206,6 +206,7 @@ struct io_ring_ctx { int account_mem: 1; int cq_overflow_flushed: 1; int drain_next: 1; + int eventfd_async: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -963,13 +964,20 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) return &rings->cqes[tail & ctx->cq_mask]; } +static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) +{ + if (!ctx->eventfd_async) + return true; + return io_wq_current_is_worker() || in_interrupt(); +} + static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); if (waitqueue_active(&ctx->sqo_wait)) wake_up(&ctx->sqo_wait); - if (ctx->cq_ev_fd) + if (ctx->cq_ev_fd && io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); } @@ -6556,10 +6564,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_sqe_files_update(ctx, arg, nr_args); break; case IORING_REGISTER_EVENTFD: + case IORING_REGISTER_EVENTFD_ASYNC: ret = -EINVAL; if (nr_args != 1) break; ret = io_eventfd_register(ctx, arg); + if (ret) + break; + if (opcode == IORING_REGISTER_EVENTFD_ASYNC) + ctx->eventfd_async = 1; + else + ctx->eventfd_async = 0; break; case IORING_UNREGISTER_EVENTFD: ret = -EINVAL; -- cgit From c12cedf24e786509de031a832e6b0e5f8b3ca37b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2020 17:41:21 -0700 Subject: io_uring: add 'struct open_how' to the openat request context We'll need this for openat2(2) support, remove flags and mode from the existing io_open struct. 
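For reference, the structure being embedded is the openat2(2) open_how; a sketch of its layout as defined by that series (later exported as linux/openat2.h), shown here only to make clear what replaces the flags and mode members of io_open.

#include <linux/types.h>

/* openat2(2) argument block; all fields are 64-bit for extensibility */
struct open_how {
        __u64 flags;    /* O_* open flags */
        __u64 mode;     /* file mode for O_CREAT / O_TMPFILE */
        __u64 resolve;  /* RESOLVE_* path resolution restrictions */
};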
Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 70656762244f..d8d252b4c9be 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -390,13 +390,12 @@ struct io_open { struct file *file; int dfd; union { - umode_t mode; unsigned mask; }; const char __user *fname; struct filename *filename; struct statx __user *buffer; - int flags; + struct open_how how; }; struct io_files_update { @@ -2474,9 +2473,9 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; req->open.dfd = READ_ONCE(sqe->fd); - req->open.mode = READ_ONCE(sqe->len); + req->open.how.mode = READ_ONCE(sqe->len); req->open.fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - req->open.flags = READ_ONCE(sqe->open_flags); + req->open.how.flags = READ_ONCE(sqe->open_flags); req->open.filename = getname(req->open.fname); if (IS_ERR(req->open.filename)) { @@ -2501,7 +2500,7 @@ static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, return -EAGAIN; } - how = build_open_how(req->open.flags, req->open.mode); + how = build_open_how(req->open.how.flags, req->open.how.mode); ret = build_open_flags(&how, &op); if (ret) goto err; @@ -2604,9 +2603,9 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->open.mask = READ_ONCE(sqe->len); req->open.fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - req->open.flags = READ_ONCE(sqe->statx_flags); + req->open.how.flags = READ_ONCE(sqe->statx_flags); - if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.flags)) + if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags)) return -EINVAL; req->open.filename = getname_flags(req->open.fname, lookup_flags, NULL); @@ -2631,7 +2630,7 @@ static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt, if (force_nonblock) return -EAGAIN; - if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->flags)) + if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags)) return -EINVAL; retry: @@ -2643,7 +2642,7 @@ retry: if (ret) goto err; - ret = vfs_getattr(&path, &stat, ctx->mask, ctx->flags); + ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags); path_put(&path); if (retry_estale(ret, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; -- cgit From f8748881b17dc56b3faa1d30c823f071c56593e5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2020 17:47:02 -0700 Subject: io_uring: remove 'fname' from io_open structure We only use it internally in the prep functions for both statx and openat, so we don't need it to be persistent across the request. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index d8d252b4c9be..3a57ea98fe3a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -392,7 +392,6 @@ struct io_open { union { unsigned mask; }; - const char __user *fname; struct filename *filename; struct statx __user *buffer; struct open_how how; @@ -2467,6 +2466,7 @@ static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt, static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + const char __user *fname; int ret; if (sqe->ioprio || sqe->buf_index) @@ -2474,10 +2474,10 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->open.dfd = READ_ONCE(sqe->fd); req->open.how.mode = READ_ONCE(sqe->len); - req->open.fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); req->open.how.flags = READ_ONCE(sqe->open_flags); - req->open.filename = getname(req->open.fname); + req->open.filename = getname(fname); if (IS_ERR(req->open.filename)) { ret = PTR_ERR(req->open.filename); req->open.filename = NULL; @@ -2593,6 +2593,7 @@ static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt, static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + const char __user *fname; unsigned lookup_flags; int ret; @@ -2601,14 +2602,14 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->open.dfd = READ_ONCE(sqe->fd); req->open.mask = READ_ONCE(sqe->len); - req->open.fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); req->open.how.flags = READ_ONCE(sqe->statx_flags); if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags)) return -EINVAL; - req->open.filename = getname_flags(req->open.fname, lookup_flags, NULL); + req->open.filename = getname_flags(fname, lookup_flags, NULL); if (IS_ERR(req->open.filename)) { ret = PTR_ERR(req->open.filename); req->open.filename = NULL; -- cgit From cebdb98617ae3e842c81c73758a185248b37cfd6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2020 17:59:24 -0700 Subject: io_uring: add support for IORING_OP_OPENAT2 Add support for the new openat2(2) system call. It's trivial to do, as we can have openat(2) just be wrapped around it. 
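From userspace the wiring follows io_openat2_prep() in the hunk below: fd is the directory fd, addr the pathname, addr2 a pointer to the open_how, and len its size (at least OPEN_HOW_SIZE_VER0). A sketch assuming struct open_how and the RESOLVE_* constants are available from linux/openat2.h; the path, the RESOLVE flag choice and the helper names are placeholders.

#include <linux/io_uring.h>
#include <linux/openat2.h>
#include <fcntl.h>
#include <string.h>

/* Queue an openat2(2)-style open; ioprio and buf_index must stay zero. */
static void prep_openat2(struct io_uring_sqe *sqe, int dirfd,
                         const char *path, struct open_how *how)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_OPENAT2;
        sqe->fd = dirfd;
        sqe->addr = (unsigned long) path;
        sqe->addr2 = (unsigned long) how;
        sqe->len = sizeof(*how);
}

/* example: open read-only, refusing to cross mount points */
static void example(struct io_uring_sqe *sqe)
{
        static struct open_how how = {
                .flags = O_RDONLY,
                .resolve = RESOLVE_NO_XDEV,
        };

        prep_openat2(sqe, AT_FDCWD, "data.log", &how);
}

Since the open_how is copied with copy_struct_from_user() at prep time, it only needs to stay valid until the SQE is submitted.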
Suggested-by: Stefan Metzmacher Signed-off-by: Jens Axboe --- fs/io_uring.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 6 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3a57ea98fe3a..0b30b0cf8af5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -707,6 +707,11 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, }, + { + /* IORING_OP_OPENAT2 */ + .needs_file = 1, + .fd_non_neg = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -2487,11 +2492,46 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct open_how __user *how; + const char __user *fname; + size_t len; + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + len = READ_ONCE(sqe->len); + + if (len < OPEN_HOW_SIZE_VER0) + return -EINVAL; + + ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how, + len); + if (ret) + return ret; + + if (!(req->open.how.flags & O_PATH) && force_o_largefile()) + req->open.how.flags |= O_LARGEFILE; + + req->open.filename = getname(fname); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { struct open_flags op; - struct open_how how; struct file *file; int ret; @@ -2500,12 +2540,11 @@ static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, return -EAGAIN; } - how = build_open_how(req->open.how.flags, req->open.how.mode); - ret = build_open_flags(&how, &op); + ret = build_open_flags(&req->open.how, &op); if (ret) goto err; - ret = get_unused_fd_flags(how.flags); + ret = get_unused_fd_flags(req->open.how.flags); if (ret < 0) goto err; @@ -2526,6 +2565,13 @@ err: return 0; } +static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + req->open.how = build_open_how(req->open.how.flags, req->open.how.mode); + return io_openat2(req, nxt, force_nonblock); +} + static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) @@ -3984,6 +4030,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_MADVISE: ret = io_madvise_prep(req, sqe); break; + case IORING_OP_OPENAT2: + ret = io_openat2_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -4204,6 +4253,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_madvise(req, nxt, force_nonblock); break; + case IORING_OP_OPENAT2: + if (sqe) { + ret = io_openat2_prep(req, sqe); + if (ret) + break; + } + ret = io_openat2(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; -- cgit From 354420f705ccd0aa2d41249f3bb55b4afbed1873 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2020 18:55:15 -0700 Subject: io_uring: add opcode to issue trace event For some test apps at least, user_data is just zeroes. So it's not a good way to tell what the command actually is. 
Add the opcode to the issue trace point. Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0b30b0cf8af5..50233efd9445 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4810,7 +4810,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, req->has_user = *mm != NULL; req->in_async = async; req->needs_fixed_file = async; - trace_io_uring_submit_sqe(ctx, req->user_data, true, async); + trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, + true, async); if (!io_submit_sqe(req, sqe, statep, &link)) break; } -- cgit From 10fef4bebf979bb705feed087611293d5864adfe Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Jan 2020 07:52:28 -0700 Subject: io_uring: account fixed file references correctly in batch We can't assume that the whole batch has fixed files in it. If it's a mix, or none at all, then we can end up doing a ref put that either messes up accounting, or causes an oops if we have no fixed files at all. Also ensure we free requests properly between inflight accounted and normal requests. Fixes: 82c721577011 ("io_uring: extend batch freeing to cover more cases") Reported-by: Dmitrii Dolgov <9erthalion6@gmail.com> Reported-by: Pavel Begunkov Tested-by: Dmitrii Dolgov <9erthalion6@gmail.com> Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 50233efd9445..8a645a37b4c7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1207,21 +1207,24 @@ struct req_batch { static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) { + int fixed_refs = rb->to_free; + if (!rb->to_free) return; if (rb->need_iter) { int i, inflight = 0; unsigned long flags; + fixed_refs = 0; for (i = 0; i < rb->to_free; i++) { struct io_kiocb *req = rb->reqs[i]; - if (req->flags & REQ_F_FIXED_FILE) + if (req->flags & REQ_F_FIXED_FILE) { req->file = NULL; + fixed_refs++; + } if (req->flags & REQ_F_INFLIGHT) inflight++; - else - rb->reqs[i] = NULL; __io_req_aux_free(req); } if (!inflight) @@ -1231,7 +1234,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) for (i = 0; i < rb->to_free; i++) { struct io_kiocb *req = rb->reqs[i]; - if (req) { + if (req->flags & REQ_F_INFLIGHT) { list_del(&req->inflight_entry); if (!--inflight) break; @@ -1244,8 +1247,9 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) } do_free: kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); + if (fixed_refs) + percpu_ref_put_many(&ctx->file_data->refs, fixed_refs); percpu_ref_put_many(&ctx->refs, rb->to_free); - percpu_ref_put_many(&ctx->file_data->refs, rb->to_free); rb->to_free = rb->need_iter = 0; } -- cgit From 66f4af93da5761d2fa05c0dc673a47003cdb9cfe Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 16 Jan 2020 15:36:52 -0700 Subject: io_uring: add support for probing opcodes The application currently has no way of knowing if a given opcode is supported or not without having to try and issue one and see if we get -EINVAL or not. And even this approach is fraught with peril, as maybe we're getting -EINVAL due to some fields being missing, or maybe it's just not that easy to issue that particular command without doing some other leg work in terms of setup first. This adds IORING_REGISTER_PROBE, which fills in a structure with info on what it supported or not. 
This will work even with sparse opcode fields, which may happen in the future or even today if someone backports specific features to older kernels. Signed-off-by: Jens Axboe --- fs/io_uring.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8a645a37b4c7..7715a729271a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -561,6 +561,8 @@ struct io_op_def { unsigned hash_reg_file : 1; /* unbound wq insertion if file is a non-regular file */ unsigned unbound_nonreg_file : 1; + /* opcode is not supported by this kernel */ + unsigned not_supported : 1; }; static const struct io_op_def io_op_defs[] = { @@ -6566,6 +6568,45 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, return io_uring_setup(entries, params); } +static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) +{ + struct io_uring_probe *p; + size_t size; + int i, ret; + + size = struct_size(p, ops, nr_args); + if (size == SIZE_MAX) + return -EOVERFLOW; + p = kzalloc(size, GFP_KERNEL); + if (!p) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(p, arg, size)) + goto out; + ret = -EINVAL; + if (memchr_inv(p, 0, size)) + goto out; + + p->last_op = IORING_OP_LAST - 1; + if (nr_args > IORING_OP_LAST) + nr_args = IORING_OP_LAST; + + for (i = 0; i < nr_args; i++) { + p->ops[i].op = i; + if (!io_op_defs[i].not_supported) + p->ops[i].flags = IO_URING_OP_SUPPORTED; + } + p->ops_len = i; + + ret = 0; + if (copy_to_user(arg, p, size)) + ret = -EFAULT; +out: + kfree(p); + return ret; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -6582,7 +6623,8 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, return -ENXIO; if (opcode != IORING_UNREGISTER_FILES && - opcode != IORING_REGISTER_FILES_UPDATE) { + opcode != IORING_REGISTER_FILES_UPDATE && + opcode != IORING_REGISTER_PROBE) { percpu_ref_kill(&ctx->refs); /* @@ -6644,6 +6686,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_eventfd_unregister(ctx); break; + case IORING_REGISTER_PROBE: + ret = -EINVAL; + if (!arg || nr_args > 256) + break; + ret = io_probe(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; @@ -6651,7 +6699,8 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (opcode != IORING_UNREGISTER_FILES && - opcode != IORING_REGISTER_FILES_UPDATE) { + opcode != IORING_REGISTER_FILES_UPDATE && + opcode != IORING_REGISTER_PROBE) { /* bring the ctx back to life */ percpu_ref_reinit(&ctx->refs); out: -- cgit From 711be0312df4d350fb5bf1671c132cccae5aaf9a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jan 2020 03:57:59 +0300 Subject: io_uring: optimise use of ctx->drain_next Move setting ctx->drain_next to the only place it could be set, when it got linked non-head requests. The same for checking it, it's interesting only for a head of a link or a non-linked request. No functional changes here. This removes some code from the common path and also removes REQ_F_DRAIN_LINK flag, as it doesn't need it anymore. 
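For reference, the userspace side of these drain semantics is just a flag on the SQE. A hedged fragment (ring setup and submission elided), queueing an fsync that is held back until everything submitted before it has completed:

#include <string.h>
#include <linux/io_uring.h>

static void prep_drained_fsync(struct io_uring_sqe *sqe, int fd)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_FSYNC;
        sqe->fd = fd;
        sqe->flags = IOSQE_IO_DRAIN;    /* ordered after everything already submitted */
}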
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7715a729271a..68843a123000 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -506,7 +506,6 @@ struct io_kiocb { #define REQ_F_LINK 64 /* linked sqes */ #define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */ #define REQ_F_FAIL_LINK 256 /* fail rest of links */ -#define REQ_F_DRAIN_LINK 512 /* link should be fully drained */ #define REQ_F_TIMEOUT 1024 /* timeout request */ #define REQ_F_ISREG 2048 /* regular file */ #define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ @@ -4558,12 +4557,6 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) { int ret; - if (unlikely(req->ctx->drain_next)) { - req->flags |= REQ_F_IO_DRAIN; - req->ctx->drain_next = 0; - } - req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK) != 0; - ret = io_req_defer(req, sqe); if (ret) { if (ret != -EIOCBQUEUED) { @@ -4630,8 +4623,10 @@ err_req: if (*link) { struct io_kiocb *head = *link; - if (sqe_flags & IOSQE_IO_DRAIN) - head->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; + if (sqe_flags & IOSQE_IO_DRAIN) { + head->flags |= REQ_F_IO_DRAIN; + ctx->drain_next = 1; + } if (sqe_flags & IOSQE_IO_HARDLINK) req->flags |= REQ_F_HARDLINK; @@ -4655,18 +4650,24 @@ err_req: io_queue_link_head(head); *link = NULL; } - } else if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { - req->flags |= REQ_F_LINK; - if (sqe_flags & IOSQE_IO_HARDLINK) - req->flags |= REQ_F_HARDLINK; - - INIT_LIST_HEAD(&req->link_list); - ret = io_req_defer_prep(req, sqe); - if (ret) - req->flags |= REQ_F_FAIL_LINK; - *link = req; } else { - io_queue_sqe(req, sqe); + if (unlikely(ctx->drain_next)) { + req->flags |= REQ_F_IO_DRAIN; + req->ctx->drain_next = 0; + } + if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { + req->flags |= REQ_F_LINK; + if (sqe_flags & IOSQE_IO_HARDLINK) + req->flags |= REQ_F_HARDLINK; + + INIT_LIST_HEAD(&req->link_list); + ret = io_req_defer_prep(req, sqe); + if (ret) + req->flags |= REQ_F_FAIL_LINK; + *link = req; + } else { + io_queue_sqe(req, sqe); + } } return true; -- cgit From 0791015837f1520dd72918355dcb1f1e79175255 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jan 2020 03:52:46 +0300 Subject: io_uring: remove extra check in __io_commit_cqring __io_commit_cqring() is almost always called when there is a change in the rings, so the check is rather pessimising. 
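For context, the smp_store_release() on cq.tail kept here pairs with an acquire load on the userspace side when reaping CQEs. A sketch, assuming the khead/ktail/mask/cqes pointers were resolved from the mmap'ed CQ ring using the offsets returned by io_uring_setup(2):

#include <linux/io_uring.h>

struct cq_view {                        /* pointers into the mapped CQ ring */
        unsigned *khead;
        unsigned *ktail;
        unsigned *kring_mask;
        struct io_uring_cqe *cqes;
};

static void reap_cqes(struct cq_view *cq)
{
        unsigned head = *cq->khead;
        /* pairs with the kernel's smp_store_release() on cq.tail */
        unsigned tail = __atomic_load_n(cq->ktail, __ATOMIC_ACQUIRE);

        while (head != tail) {
                struct io_uring_cqe *cqe = &cq->cqes[head & *cq->kring_mask];

                /* consume cqe->user_data and cqe->res here */
                (void)cqe;
                head++;
        }
        /* publish the new head so the kernel can reuse the entries */
        __atomic_store_n(cq->khead, head, __ATOMIC_RELEASE);
}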
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 68843a123000..771954bb4c38 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -860,14 +860,12 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) { - /* order cqe stores with ring update */ - smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); + /* order cqe stores with ring update */ + smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); - if (wq_has_sleeper(&ctx->cq_wait)) { - wake_up_interruptible(&ctx->cq_wait); - kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); - } + if (wq_has_sleeper(&ctx->cq_wait)) { + wake_up_interruptible(&ctx->cq_wait); + kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); } } -- cgit From b14cca0c84c760fbd39ad6bb7e1181e2df103d25 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jan 2020 04:45:59 +0300 Subject: io_uring: hide uring_fd in ctx req->ring_fd and req->ring_file are used only during the prep stage of submission, which is protected by mutex. There is no need to store them per-request, place them in ctx. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 771954bb4c38..46cc1bc48062 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -251,6 +251,8 @@ struct io_ring_ctx { */ struct fixed_file_data *file_data; unsigned nr_user_files; + int ring_fd; + struct file *ring_file; /* if used, fixed mapped user buffers */ unsigned nr_user_bufs; @@ -475,15 +477,10 @@ struct io_kiocb { }; struct io_async_ctx *io; - union { - /* - * ring_file is only used in the submission path, and - * llist_node is only used for poll deferred completions - */ - struct file *ring_file; - struct llist_node llist_node; - }; - int ring_fd; + /* + * llist_node is only used for poll deferred completions + */ + struct llist_node llist_node; bool has_user; bool in_async; bool needs_fixed_file; @@ -1141,7 +1138,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, got_it: req->io = NULL; - req->ring_file = NULL; req->file = NULL; req->ctx = ctx; req->flags = 0; @@ -2725,7 +2721,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->close.fd = READ_ONCE(sqe->fd); if (req->file->f_op == &io_uring_fops || - req->close.fd == req->ring_fd) + req->close.fd == req->ctx->ring_fd) return -EBADF; return 0; @@ -4395,7 +4391,7 @@ static int io_grab_files(struct io_kiocb *req) int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; - if (!req->ring_file) + if (!ctx->ring_file) return -EBADF; rcu_read_lock(); @@ -4406,7 +4402,7 @@ static int io_grab_files(struct io_kiocb *req) * the fd has changed since we started down this path, and disallow * this operation if it has.
*/ - if (fcheck(req->ring_fd) == req->ring_file) { + if (fcheck(ctx->ring_fd) == ctx->ring_file) { list_add(&req->inflight_entry, &ctx->inflight_list); req->flags |= REQ_F_INFLIGHT; req->work.files = current->files; @@ -4778,6 +4774,9 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, statep = &state; } + ctx->ring_fd = ring_fd; + ctx->ring_file = ring_file; + for (i = 0; i < nr; i++) { const struct io_uring_sqe *sqe; struct io_kiocb *req; @@ -4810,8 +4809,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } } - req->ring_file = ring_file; - req->ring_fd = ring_fd; req->has_user = *mm != NULL; req->in_async = async; req->needs_fixed_file = async; -- cgit From e46a7950d362231a4d0b078af5f4c109b8e5ac9e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Jan 2020 11:15:34 -0700 Subject: io_uring: file switch work needs to get flushed on exit We currently flush early, but if we have something in progress and a new switch is scheduled, we need to ensure to flush after our teardown as well. Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 46cc1bc48062..a0a58c181eaf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5089,11 +5089,14 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) return -ENXIO; /* protect against inflight atomic switch, which drops the ref */ - flush_work(&data->ref_work); percpu_ref_get(&data->refs); + /* wait for existing switches */ + flush_work(&data->ref_work); percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill); wait_for_completion(&data->done); percpu_ref_put(&data->refs); + /* flush potential new switch */ + flush_work(&data->ref_work); percpu_ref_exit(&data->refs); __io_sqe_files_unregister(ctx); -- cgit From 87987898a1dbc69b1138f7c10eb9abd655c03396 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 18 Jan 2020 01:22:30 +0300 Subject: io_uring: remove REQ_F_IO_DRAINED A request can get into the defer list only once, there is no need for marking it as drained, so remove it. This probably was left after extracting __need_defer() for use in timeouts. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index a0a58c181eaf..ed1adeda370e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -499,7 +499,6 @@ struct io_kiocb { #define REQ_F_FIXED_FILE 4 /* ctx owns file */ #define REQ_F_LINK_NEXT 8 /* already grabbed next link */ #define REQ_F_IO_DRAIN 16 /* drain existing IO first */ -#define REQ_F_IO_DRAINED 32 /* drain done */ #define REQ_F_LINK 64 /* linked sqes */ #define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */ #define REQ_F_FAIL_LINK 256 /* fail rest of links */ @@ -817,7 +816,7 @@ static inline bool __req_need_defer(struct io_kiocb *req) static inline bool req_need_defer(struct io_kiocb *req) { - if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN) + if (unlikely(req->flags & REQ_F_IO_DRAIN)) return __req_need_defer(req); return false; @@ -939,10 +938,8 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) __io_commit_cqring(ctx); - while ((req = io_get_deferred_req(ctx)) != NULL) { - req->flags |= REQ_F_IO_DRAINED; + while ((req = io_get_deferred_req(ctx)) != NULL) io_queue_async_work(req); - } } static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) -- cgit From 6b47ee6ecab142f938a40bf3b297abac74218ee2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 18 Jan 2020 20:22:41 +0300 Subject: io_uring: optimise sqe-to-req flags translation For each IOSQE_* flag there is a corresponding REQ_F_* flag. And there is a repetitive pattern of their translation: e.g. if (sqe->flags & SQE_FLAG*) req->flags |= REQ_F_FLAG* Use same numeric values/bits for them and copy instead of manual handling. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 92 ++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 63 insertions(+), 29 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index ed1adeda370e..cf5bad51f752 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -452,6 +453,65 @@ struct io_async_ctx { }; }; +enum { + REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, + REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, + REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, + REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, + REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, + + REQ_F_LINK_NEXT_BIT, + REQ_F_FAIL_LINK_BIT, + REQ_F_INFLIGHT_BIT, + REQ_F_CUR_POS_BIT, + REQ_F_NOWAIT_BIT, + REQ_F_IOPOLL_COMPLETED_BIT, + REQ_F_LINK_TIMEOUT_BIT, + REQ_F_TIMEOUT_BIT, + REQ_F_ISREG_BIT, + REQ_F_MUST_PUNT_BIT, + REQ_F_TIMEOUT_NOSEQ_BIT, + REQ_F_COMP_LOCKED_BIT, +}; + +enum { + /* ctx owns file */ + REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), + /* drain existing IO first */ + REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), + /* linked sqes */ + REQ_F_LINK = BIT(REQ_F_LINK_BIT), + /* doesn't sever on completion < 0 */ + REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), + /* IOSQE_ASYNC */ + REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), + + /* already grabbed next link */ + REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT), + /* fail rest of links */ + REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), + /* on inflight list */ + REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), + /* read/write uses file position */ + REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), + /* must not punt to workers */ + REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), + /* polled IO has completed */ + REQ_F_IOPOLL_COMPLETED = 
BIT(REQ_F_IOPOLL_COMPLETED_BIT), + /* has linked timeout */ + REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), + /* timeout request */ + REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT), + /* regular file */ + REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), + /* must be punted even for NONBLOCK */ + REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT), + /* no timeout sequence */ + REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT), + /* completion under lock */ + REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can @@ -494,23 +554,6 @@ struct io_kiocb { struct list_head link_list; unsigned int flags; refcount_t refs; -#define REQ_F_NOWAIT 1 /* must not punt to workers */ -#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ -#define REQ_F_FIXED_FILE 4 /* ctx owns file */ -#define REQ_F_LINK_NEXT 8 /* already grabbed next link */ -#define REQ_F_IO_DRAIN 16 /* drain existing IO first */ -#define REQ_F_LINK 64 /* linked sqes */ -#define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */ -#define REQ_F_FAIL_LINK 256 /* fail rest of links */ -#define REQ_F_TIMEOUT 1024 /* timeout request */ -#define REQ_F_ISREG 2048 /* regular file */ -#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ -#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ -#define REQ_F_INFLIGHT 16384 /* on inflight list */ -#define REQ_F_COMP_LOCKED 32768 /* completion under lock */ -#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ -#define REQ_F_FORCE_ASYNC 131072 /* IOSQE_ASYNC */ -#define REQ_F_CUR_POS 262144 /* read/write uses file position */ u64 user_data; u32 result; u32 sequence; @@ -4355,9 +4398,6 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, flags = READ_ONCE(sqe->flags); fd = READ_ONCE(sqe->fd); - if (flags & IOSQE_IO_DRAIN) - req->flags |= REQ_F_IO_DRAIN; - if (!io_req_needs_file(req, fd)) return 0; @@ -4593,8 +4633,9 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = -EINVAL; goto err_req; } - if (sqe_flags & IOSQE_ASYNC) - req->flags |= REQ_F_FORCE_ASYNC; + /* same numerical values with corresponding REQ_F_*, safe to copy */ + req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK| + IOSQE_ASYNC); ret = io_req_set_file(state, req, sqe); if (unlikely(ret)) { @@ -4618,10 +4659,6 @@ err_req: head->flags |= REQ_F_IO_DRAIN; ctx->drain_next = 1; } - - if (sqe_flags & IOSQE_IO_HARDLINK) - req->flags |= REQ_F_HARDLINK; - if (io_alloc_async_ctx(req)) { ret = -EAGAIN; goto err_req; @@ -4648,9 +4685,6 @@ err_req: } if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { req->flags |= REQ_F_LINK; - if (sqe_flags & IOSQE_IO_HARDLINK) - req->flags |= REQ_F_HARDLINK; - INIT_LIST_HEAD(&req->link_list); ret = io_req_defer_prep(req, sqe); if (ret) -- cgit From 0463b6c58e557118d602b2f225fa3bbe9b6f3560 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 18 Jan 2020 21:35:38 +0300 Subject: io_uring: use labeled array init in io_op_defs Don't rely on implicit ordering of IORING_OP_ and explicitly place them at a right place in io_op_defs. Now former comments are now a part of the code and won't ever outdate. 
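The pattern being adopted is plain C99 designated array initializers keyed by an enum. A tiny standalone illustration (names invented for the example) of why each entry stays attached to its opcode even if the enum is reordered or grows gaps:

#include <stdio.h>

enum my_op { OP_NOP, OP_READ, OP_WRITE, OP_LAST };

struct my_op_def {
        unsigned needs_file : 1;
};

/* entries not listed explicitly default to all-zero */
static const struct my_op_def my_op_defs[] = {
        [OP_NOP]   = {},
        [OP_READ]  = { .needs_file = 1 },
        [OP_WRITE] = { .needs_file = 1 },
};

int main(void)
{
        printf("OP_WRITE needs_file=%u\n", my_op_defs[OP_WRITE].needs_file);
        return 0;
}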
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 91 +++++++++++++++++++---------------------------------------- 1 file changed, 29 insertions(+), 62 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index cf5bad51f752..09503d1e9e45 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -604,151 +604,118 @@ struct io_op_def { }; static const struct io_op_def io_op_defs[] = { - { - /* IORING_OP_NOP */ - }, - { - /* IORING_OP_READV */ + [IORING_OP_NOP] = {}, + [IORING_OP_READV] = { .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_WRITEV */ + [IORING_OP_WRITEV] = { .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_FSYNC */ + [IORING_OP_FSYNC] = { .needs_file = 1, }, - { - /* IORING_OP_READ_FIXED */ + [IORING_OP_READ_FIXED] = { .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_WRITE_FIXED */ + [IORING_OP_WRITE_FIXED] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_POLL_ADD */ + [IORING_OP_POLL_ADD] = { .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_POLL_REMOVE */ - }, - { - /* IORING_OP_SYNC_FILE_RANGE */ + [IORING_OP_POLL_REMOVE] = {}, + [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, }, - { - /* IORING_OP_SENDMSG */ + [IORING_OP_SENDMSG] = { .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_RECVMSG */ + [IORING_OP_RECVMSG] = { .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_TIMEOUT */ + [IORING_OP_TIMEOUT] = { .async_ctx = 1, .needs_mm = 1, }, - { - /* IORING_OP_TIMEOUT_REMOVE */ - }, - { - /* IORING_OP_ACCEPT */ + [IORING_OP_TIMEOUT_REMOVE] = {}, + [IORING_OP_ACCEPT] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_ASYNC_CANCEL */ - }, - { - /* IORING_OP_LINK_TIMEOUT */ + [IORING_OP_ASYNC_CANCEL] = {}, + [IORING_OP_LINK_TIMEOUT] = { .async_ctx = 1, .needs_mm = 1, }, - { - /* IORING_OP_CONNECT */ + [IORING_OP_CONNECT] = { .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_FALLOCATE */ + [IORING_OP_FALLOCATE] = { .needs_file = 1, }, - { - /* IORING_OP_OPENAT */ + [IORING_OP_OPENAT] = { .needs_file = 1, .fd_non_neg = 1, }, - { - /* IORING_OP_CLOSE */ + [IORING_OP_CLOSE] = { .needs_file = 1, }, - { - /* IORING_OP_FILES_UPDATE */ + [IORING_OP_FILES_UPDATE] = { .needs_mm = 1, }, - { - /* IORING_OP_STATX */ + [IORING_OP_STATX] = { .needs_mm = 1, .needs_file = 1, .fd_non_neg = 1, }, - { - /* IORING_OP_READ */ + [IORING_OP_READ] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_WRITE */ + [IORING_OP_WRITE] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_FADVISE */ + [IORING_OP_FADVISE] = { .needs_file = 1, }, - { - /* IORING_OP_MADVISE */ + [IORING_OP_MADVISE] = { .needs_mm = 1, }, - { - /* IORING_OP_SEND */ + [IORING_OP_SEND] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_RECV */ + [IORING_OP_RECV] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_OPENAT2 */ + [IORING_OP_OPENAT2] = { .needs_file = 1, .fd_non_neg = 1, }, -- cgit From 1118591ab883f46df4ab614cc976bc4c8e04a464 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 22 Jan 2020 23:09:35 +0300 Subject: io_uring: prep req when do IOSQE_ASYNC Whenever IOSQE_ASYNC is set, requests will be 
punted to async without getting into io_issue_req() and without proper preparation done (e.g. io_req_defer_prep()). Hence they will be left uninitialised. Prepare them before punting. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 09503d1e9e45..cdbc711ae5fd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4558,11 +4558,15 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) ret = io_req_defer(req, sqe); if (ret) { if (ret != -EIOCBQUEUED) { +fail_req: io_cqring_add_event(req, ret); req_set_fail_links(req); io_double_put_req(req); } } else if (req->flags & REQ_F_FORCE_ASYNC) { + ret = io_req_defer_prep(req, sqe); + if (unlikely(ret < 0)) + goto fail_req; /* * Never try inline submit of IOSQE_ASYNC is set, go straight * to async execution. -- cgit From 86a761f81ec87a96572214f5db606f60d36aaf08 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 22 Jan 2020 23:09:36 +0300 Subject: io_uring: honor IOSQE_ASYNC for linked reqs REQ_F_FORCE_ASYNC is checked only for the head of a link. Fix it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index cdbc711ae5fd..9f73586dcfb8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4512,6 +4512,7 @@ again: */ if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || (req->flags & REQ_F_MUST_PUNT))) { +punt: if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { ret = io_grab_files(req); if (ret) @@ -4547,6 +4548,9 @@ done_req: if (nxt) { req = nxt; nxt = NULL; + + if (req->flags & REQ_F_FORCE_ASYNC) + goto punt; goto again; } } -- cgit From 980ad26304abf11e78caaa68023411b9c088b848 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 24 Jan 2020 23:08:54 -0700 Subject: io_uring: don't attempt to copy iovec for READ/WRITE For the non-vectored variant of READV/WRITEV, we don't need to setup an async io context, and we flag that appropriately in the io_op_defs array. However, in fixing this for the 5.5 kernel in commit 74566df3a71c we didn't have these opcodes, so the check there was added just for the READ_FIXED and WRITE_FIXED opcodes. Replace that check with just a single check for needing async context, that covers all four of these read/write variants that don't use an iovec. 
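For context on "non-vectored": IORING_OP_READ/WRITE describe the buffer directly in addr/len, while READV/WRITEV pass an iovec array, which is what the async context would otherwise have to copy. Hedged SQE-filling fragments (ring setup elided, helper names ours):

#include <string.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <linux/io_uring.h>

/* Non-vectored read: addr/len are the buffer itself. */
static void prep_read(struct io_uring_sqe *sqe, int fd, void *buf,
                      unsigned nbytes, off_t offset)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_READ;
        sqe->fd = fd;
        sqe->addr = (unsigned long) buf;
        sqe->len = nbytes;
        sqe->off = offset;
}

/* Vectored read: addr/len describe an iovec array, as with readv(2). */
static void prep_readv(struct io_uring_sqe *sqe, int fd,
                       const struct iovec *iov, unsigned nr_vecs, off_t offset)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_READV;
        sqe->fd = fd;
        sqe->addr = (unsigned long) iov;
        sqe->len = nr_vecs;
        sqe->off = offset;
}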
Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9f73586dcfb8..4bde5da2c2f5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2112,8 +2112,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, struct iovec *iovec, struct iovec *fast_iov, struct iov_iter *iter) { - if (req->opcode == IORING_OP_READ_FIXED || - req->opcode == IORING_OP_WRITE_FIXED) + if (!io_op_defs[req->opcode].async_ctx) return 0; if (!req->io && io_alloc_async_ctx(req)) return -ENOMEM; -- cgit From 8cdf2193a3335b4cfb6e023b41ac293d0843d287 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jan 2020 00:40:24 +0300 Subject: io_uring: add comment for drain_next Draining the middle of a link is tricky, so leave a comment there. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4bde5da2c2f5..a700ee5fc89d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4629,6 +4629,13 @@ err_req: if (*link) { struct io_kiocb *head = *link; + /* + * Taking sequential execution of a link, draining both sides + * of the link also fullfils IOSQE_IO_DRAIN semantics for all + * requests in the link. So, it drains the head and the + * next after the link request. The last one is done via + * drain_next flag to persist the effect across calls. + */ if (sqe_flags & IOSQE_IO_DRAIN) { head->flags |= REQ_F_IO_DRAIN; ctx->drain_next = 1; } -- cgit From 9466f43741bc08edd7b1bee642dd6f5561091634 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jan 2020 22:34:01 +0300 Subject: io_uring: fix refcounting with batched allocations at OOM In case of out of memory the second argument of percpu_ref_put_many() in io_submit_sqes() may evaluate to "nr - (-EAGAIN)", which is clearly wrong. Fixes: 2b85edfc0c90 ("io_uring: batch getting pcpu references") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index a700ee5fc89d..1dd20305c664 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4830,8 +4830,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, break; } - if (submitted != nr) - percpu_ref_put_many(&ctx->refs, nr - submitted); + if (unlikely(submitted != nr)) { + int ref_used = (submitted == -EAGAIN) ? 0 : submitted; + + percpu_ref_put_many(&ctx->refs, nr - ref_used); + } if (link) io_queue_link_head(link); if (statep) -- cgit From cccf0ee834559ae0b327b40290e14f6a2a017177 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 27 Jan 2020 16:34:48 -0700 Subject: io_uring/io-wq: don't use static creds/mm assignments We currently set up the io_wq with a static set of mm and creds. Even for a single-use io-wq per io_uring, this is suboptimal as we may have multiple enters of the ring. For sharing the io-wq backend, it doesn't work at all. Switch to passing in the creds and mm when the work item is set up. This means that async work is no longer deferred to the io_uring mm and creds, it is done with the current mm and creds. Flag this behavior with IORING_FEAT_CUR_PERSONALITY, so applications know they can rely on the current personality (mm and creds) being the same for direct issue and async issue.
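Applications can detect the new behavior at setup time by checking the features mask. A minimal sketch, assuming 5.6-era headers and the raw io_uring_setup(2) syscall number:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

int main(void)
{
        struct io_uring_params p;
        int fd;

        memset(&p, 0, sizeof(p));
        fd = syscall(__NR_io_uring_setup, 8, &p);
        if (fd < 0)
                return 1;

        if (p.features & IORING_FEAT_CUR_PERSONALITY)
                printf("async work uses the submitter's creds/mm\n");

        close(fd);
        return 0;
}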
Reviewed-by: Stefan Metzmacher Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1dd20305c664..0ea36911745d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -875,6 +875,29 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } +static inline void io_req_work_grab_env(struct io_kiocb *req, + const struct io_op_def *def) +{ + if (!req->work.mm && def->needs_mm) { + mmgrab(current->mm); + req->work.mm = current->mm; + } + if (!req->work.creds) + req->work.creds = get_current_cred(); +} + +static inline void io_req_work_drop_env(struct io_kiocb *req) +{ + if (req->work.mm) { + mmdrop(req->work.mm); + req->work.mm = NULL; + } + if (req->work.creds) { + put_cred(req->work.creds); + req->work.creds = NULL; + } +} + static inline bool io_prep_async_work(struct io_kiocb *req, struct io_kiocb **link) { @@ -888,8 +911,8 @@ static inline bool io_prep_async_work(struct io_kiocb *req, if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } - if (def->needs_mm) - req->work.flags |= IO_WQ_WORK_NEEDS_USER; + + io_req_work_grab_env(req, def); *link = io_prep_linked_timeout(req); return do_hashed; @@ -1180,6 +1203,8 @@ static void __io_req_aux_free(struct io_kiocb *req) else fput(req->file); } + + io_req_work_drop_env(req); } static void __io_free_req(struct io_kiocb *req) @@ -3963,6 +3988,8 @@ static int io_req_defer_prep(struct io_kiocb *req, { ssize_t ret = 0; + io_req_work_grab_env(req, &io_op_defs[req->opcode]); + switch (req->opcode) { case IORING_OP_NOP: break; @@ -5725,9 +5752,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, goto err; } - data.mm = ctx->sqo_mm; data.user = ctx->user; - data.creds = ctx->creds; data.get_work = io_get_work; data.put_work = io_put_work; @@ -6535,7 +6560,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) goto err; p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS; + IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | + IORING_FEAT_CUR_PERSONALITY; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: -- cgit From 24369c2e3bb06d8c4e71fd6ceaf4f8a01ae79b7c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 28 Jan 2020 03:15:48 +0300 Subject: io_uring: add io-wq workqueue sharing If IORING_SETUP_ATTACH_WQ is set, it expects wq_fd in io_uring_params to be a valid io_uring fd io-wq of which will be shared with the newly created io_uring instance. If the flag is set but it can't share io-wq, it fails. This allows creation of "sibling" io_urings, where we prefer to keep the SQ/CQ private, but want to share the async backend to minimize the amount of overhead associated with having multiple rings that belong to the same backend. 
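From userspace the sharing is requested at ring creation time. A hedged sketch (raw syscall wrapper assumed, error handling trimmed), where wq_fd is the io_uring_params field this patch reads:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* Create a sibling ring that borrows the async backend of existing_fd. */
static int setup_sibling_ring(int existing_fd, struct io_uring_params *p)
{
        memset(p, 0, sizeof(*p));
        p->flags = IORING_SETUP_ATTACH_WQ;
        p->wq_fd = existing_fd;         /* must be a live io_uring fd */

        return syscall(__NR_io_uring_setup, 64, p);
}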
Reported-by: Jens Axboe Reported-by: Daurnimator Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 50 insertions(+), 14 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0ea36911745d..275355bd3a64 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5704,11 +5704,56 @@ static void io_get_work(struct io_wq_work *work) refcount_inc(&req->refs); } +static int io_init_wq_offload(struct io_ring_ctx *ctx, + struct io_uring_params *p) +{ + struct io_wq_data data; + struct fd f; + struct io_ring_ctx *ctx_attach; + unsigned int concurrency; + int ret = 0; + + data.user = ctx->user; + data.get_work = io_get_work; + data.put_work = io_put_work; + + if (!(p->flags & IORING_SETUP_ATTACH_WQ)) { + /* Do QD, or 4 * CPUS, whatever is smallest */ + concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); + + ctx->io_wq = io_wq_create(concurrency, &data); + if (IS_ERR(ctx->io_wq)) { + ret = PTR_ERR(ctx->io_wq); + ctx->io_wq = NULL; + } + return ret; + } + + f = fdget(p->wq_fd); + if (!f.file) + return -EBADF; + + if (f.file->f_op != &io_uring_fops) { + ret = -EINVAL; + goto out_fput; + } + + ctx_attach = f.file->private_data; + /* @io_wq is protected by holding the fd */ + if (!io_wq_get(ctx_attach->io_wq, &data)) { + ret = -EINVAL; + goto out_fput; + } + + ctx->io_wq = ctx_attach->io_wq; +out_fput: + fdput(f); + return ret; +} + static int io_sq_offload_start(struct io_ring_ctx *ctx, struct io_uring_params *p) { - struct io_wq_data data; - unsigned concurrency; int ret; init_waitqueue_head(&ctx->sqo_wait); @@ -5752,18 +5797,9 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, goto err; } - data.user = ctx->user; - data.get_work = io_get_work; - data.put_work = io_put_work; - - /* Do QD, or 4 * CPUS, whatever is smallest */ - concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - ctx->io_wq = io_wq_create(concurrency, &data); - if (IS_ERR(ctx->io_wq)) { - ret = PTR_ERR(ctx->io_wq); - ctx->io_wq = NULL; + ret = io_init_wq_offload(ctx, p); + if (ret) goto err; - } return 0; err: @@ -6589,7 +6625,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | - IORING_SETUP_CLAMP)) + IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ)) return -EINVAL; ret = io_uring_create(entries, &p); -- cgit From 071698e13ac6ba786dfa22349a7b62deb5a9464d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 28 Jan 2020 10:04:42 -0700 Subject: io_uring: allow registering credentials If an application wants to use a ring with different kinds of credentials, it can register them upfront. We don't lookup credentials, the credentials of the task calling IORING_REGISTER_PERSONALITY is used. An 'id' is returned for the application to use in subsequent personality support. 
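A minimal sketch of the registration calls from userspace (raw io_uring_register(2) assumed; the returned id is what later patches let an individual request reference):

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* Capture the caller's current creds; returns a personality id on success. */
static int register_personality(int ring_fd)
{
        return syscall(__NR_io_uring_register, ring_fd,
                       IORING_REGISTER_PERSONALITY, NULL, 0);
}

/* Drop a previously registered personality; the id is passed as nr_args. */
static int unregister_personality(int ring_fd, int id)
{
        return syscall(__NR_io_uring_register, ring_fd,
                       IORING_UNREGISTER_PERSONALITY, NULL, id);
}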
Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 68 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 275355bd3a64..d74567fc9628 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -273,6 +273,8 @@ struct io_ring_ctx { struct socket *ring_sock; #endif + struct idr personality_idr; + struct { unsigned cached_cq_tail; unsigned cq_entries; @@ -796,6 +798,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->completions[0]); init_completion(&ctx->completions[1]); + idr_init(&ctx->personality_idr); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); @@ -6177,6 +6180,17 @@ static int io_uring_fasync(int fd, struct file *file, int on) return fasync_helper(fd, file, on, &ctx->cq_fasync); } +static int io_remove_personalities(int id, void *p, void *data) +{ + struct io_ring_ctx *ctx = data; + const struct cred *cred; + + cred = idr_remove(&ctx->personality_idr, id); + if (cred) + put_cred(cred); + return 0; +} + static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); @@ -6193,6 +6207,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) /* if we failed setting up the ctx, we might not have any rings */ if (ctx->rings) io_cqring_overflow_flush(ctx, true); + idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); wait_for_completion(&ctx->completions[0]); io_ring_ctx_free(ctx); } @@ -6683,6 +6698,45 @@ out: return ret; } +static int io_register_personality(struct io_ring_ctx *ctx) +{ + const struct cred *creds = get_current_cred(); + int id; + + id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1, + USHRT_MAX, GFP_KERNEL); + if (id < 0) + put_cred(creds); + return id; +} + +static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) +{ + const struct cred *old_creds; + + old_creds = idr_remove(&ctx->personality_idr, id); + if (old_creds) { + put_cred(old_creds); + return 0; + } + + return -EINVAL; +} + +static bool io_register_op_must_quiesce(int op) +{ + switch (op) { + case IORING_UNREGISTER_FILES: + case IORING_REGISTER_FILES_UPDATE: + case IORING_REGISTER_PROBE: + case IORING_REGISTER_PERSONALITY: + case IORING_UNREGISTER_PERSONALITY: + return false; + default: + return true; + } +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -6698,9 +6752,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (percpu_ref_is_dying(&ctx->refs)) return -ENXIO; - if (opcode != IORING_UNREGISTER_FILES && - opcode != IORING_REGISTER_FILES_UPDATE && - opcode != IORING_REGISTER_PROBE) { + if (io_register_op_must_quiesce(opcode)) { percpu_ref_kill(&ctx->refs); /* @@ -6768,15 +6820,24 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_probe(ctx, arg, nr_args); break; + case IORING_REGISTER_PERSONALITY: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_register_personality(ctx); + break; + case IORING_UNREGISTER_PERSONALITY: + ret = -EINVAL; + if (arg) + break; + ret = io_unregister_personality(ctx, nr_args); + break; default: ret = -EINVAL; break; } - - if (opcode != IORING_UNREGISTER_FILES && - opcode != IORING_REGISTER_FILES_UPDATE && - opcode != IORING_REGISTER_PROBE) { 
+ if (io_register_op_must_quiesce(opcode)) { /* bring the ctx back to life */ percpu_ref_reinit(&ctx->refs); out: -- cgit From 75c6a03904e0dd414a4d99a3072075cb5117e5bc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 28 Jan 2020 10:15:23 -0700 Subject: io_uring: support using a registered personality for commands For personalities previously registered via IORING_REGISTER_PERSONALITY, allow any command to select them. This is done through setting sqe->personality to the id returned from registration, and then flagging sqe->flags with IOSQE_PERSONALITY. Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index d74567fc9628..8bcf0538e2e1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4626,9 +4626,10 @@ static inline void io_queue_link_head(struct io_kiocb *req) static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_submit_state *state, struct io_kiocb **link) { + const struct cred *old_creds = NULL; struct io_ring_ctx *ctx = req->ctx; unsigned int sqe_flags; - int ret; + int ret, id; sqe_flags = READ_ONCE(sqe->flags); @@ -4637,6 +4638,19 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = -EINVAL; goto err_req; } + + id = READ_ONCE(sqe->personality); + if (id) { + const struct cred *personality_creds; + + personality_creds = idr_find(&ctx->personality_idr, id); + if (unlikely(!personality_creds)) { + ret = -EINVAL; + goto err_req; + } + old_creds = override_creds(personality_creds); + } + /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK| IOSQE_ASYNC); @@ -4646,6 +4660,8 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, err_req: io_cqring_add_event(req, ret); io_double_put_req(req); + if (old_creds) + revert_creds(old_creds); return false; } @@ -4706,6 +4722,8 @@ err_req: } } + if (old_creds) + revert_creds(old_creds); return true; } -- cgit From f86cd20c9454847a524ddbdcdec32c0380ed7c9b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 29 Jan 2020 13:46:44 -0700 Subject: io_uring: fix linked command file table usage We're not consistent in how the file table is grabbed and assigned if we have a command linked that requires the use of it. Add ->file_table to the io_op_defs[] array, and use that to determine when to grab the table instead of having the handlers set it if they need to defer. This also means we can kill the IO_WQ_WORK_NEEDS_FILES flag. We always initialize work->files, so io-wq can just check for that. 
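Tying the two personality patches above together on the userspace side: the fs/io_uring.c diff keys off a non-zero sqe->personality, so, assuming the matching uapi change adds a __u16 personality field to the SQE, a request opts into a registered identity roughly like this (illustrative fragment only):

#include <linux/io_uring.h>

static void use_personality(struct io_uring_sqe *sqe, unsigned short id)
{
        sqe->personality = id;  /* 0 keeps the submitter's current creds */
}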
Signed-off-by: Jens Axboe --- fs/io_uring.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8bcf0538e2e1..0d8d0e217847 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -603,6 +603,8 @@ struct io_op_def { unsigned unbound_nonreg_file : 1; /* opcode is not supported by this kernel */ unsigned not_supported : 1; + /* needs file table */ + unsigned file_table : 1; }; static const struct io_op_def io_op_defs[] = { @@ -661,6 +663,7 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .file_table = 1, }, [IORING_OP_ASYNC_CANCEL] = {}, [IORING_OP_LINK_TIMEOUT] = { @@ -679,12 +682,15 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_OPENAT] = { .needs_file = 1, .fd_non_neg = 1, + .file_table = 1, }, [IORING_OP_CLOSE] = { .needs_file = 1, + .file_table = 1, }, [IORING_OP_FILES_UPDATE] = { .needs_mm = 1, + .file_table = 1, }, [IORING_OP_STATX] = { .needs_mm = 1, @@ -720,6 +726,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_OPENAT2] = { .needs_file = 1, .fd_non_neg = 1, + .file_table = 1, }, }; @@ -732,6 +739,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); +static int io_grab_files(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -2568,10 +2576,8 @@ static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt, struct file *file; int ret; - if (force_nonblock) { - req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + if (force_nonblock) return -EAGAIN; - } ret = build_open_flags(&req->open.how, &op); if (ret) @@ -2797,10 +2803,8 @@ static int io_close(struct io_kiocb *req, struct io_kiocb **nxt, return ret; /* if the file has a flush method, be safe and punt to async */ - if (req->close.put_file->f_op->flush && !io_wq_current_is_worker()) { - req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + if (req->close.put_file->f_op->flush && !io_wq_current_is_worker()) goto eagain; - } /* * No ->flush(), safely close from here and just punt the @@ -3244,7 +3248,6 @@ static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, ret = __io_accept(req, nxt, force_nonblock); if (ret == -EAGAIN && force_nonblock) { req->work.func = io_accept_finish; - req->work.flags |= IO_WQ_WORK_NEEDS_FILES; io_put_req(req); return -EAGAIN; } @@ -3967,10 +3970,8 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock) struct io_uring_files_update up; int ret; - if (force_nonblock) { - req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + if (force_nonblock) return -EAGAIN; - } up.offset = req->files_update.offset; up.fds = req->files_update.arg; @@ -3991,6 +3992,12 @@ static int io_req_defer_prep(struct io_kiocb *req, { ssize_t ret = 0; + if (io_op_defs[req->opcode].file_table) { + ret = io_grab_files(req); + if (unlikely(ret)) + return ret; + } + io_req_work_grab_env(req, &io_op_defs[req->opcode]); switch (req->opcode) { @@ -4424,6 +4431,8 @@ static int io_grab_files(struct io_kiocb *req) int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; + if (req->work.files) + return 0; if (!ctx->ring_file) return -EBADF; @@ -4542,7 +4551,7 @@ again: if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || (req->flags & REQ_F_MUST_PUNT))) { punt: - if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { + if (io_op_defs[req->opcode].file_table) { ret = io_grab_files(req); if (ret) goto err; -- cgit From 
3e4827b05d2ac2d377ed136a52829ec46787bf4b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2020 15:18:09 -0700 Subject: io_uring: add support for epoll_ctl(2) This adds IORING_OP_EPOLL_CTL, which can perform the same work as the epoll_ctl(2) system call. Signed-off-by: Jens Axboe --- fs/io_uring.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0d8d0e217847..c5ca84a305d3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -74,6 +74,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -423,6 +424,14 @@ struct io_madvise { u32 advice; }; +struct io_epoll { + struct file *file; + int epfd; + int op; + int fd; + struct epoll_event event; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -536,6 +545,7 @@ struct io_kiocb { struct io_files_update files_update; struct io_fadvise fadvise; struct io_madvise madvise; + struct io_epoll epoll; }; struct io_async_ctx *io; @@ -728,6 +738,10 @@ static const struct io_op_def io_op_defs[] = { .fd_non_neg = 1, .file_table = 1, }, + [IORING_OP_EPOLL_CTL] = { + .unbound_nonreg_file = 1, + .file_table = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -2611,6 +2625,52 @@ static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, return io_openat2(req, nxt, force_nonblock); } +static int io_epoll_ctl_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_EPOLL) + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->epoll.epfd = READ_ONCE(sqe->fd); + req->epoll.op = READ_ONCE(sqe->len); + req->epoll.fd = READ_ONCE(sqe->off); + + if (ep_op_has_event(req->epoll.op)) { + struct epoll_event __user *ev; + + ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); + if (copy_from_user(&req->epoll.event, ev, sizeof(*ev))) + return -EFAULT; + } + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_EPOLL) + struct io_epoll *ie = &req->epoll; + int ret; + + ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) @@ -4075,6 +4135,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_OPENAT2: ret = io_openat2_prep(req, sqe); break; + case IORING_OP_EPOLL_CTL: + ret = io_epoll_ctl_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -4303,6 +4366,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_openat2(req, nxt, force_nonblock); break; + case IORING_OP_EPOLL_CTL: + if (sqe) { + ret = io_epoll_ctl_prep(req, sqe); + if (ret) + break; + } + ret = io_epoll_ctl(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; -- cgit
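Closing out the section, the userspace mapping for the new opcode mirrors io_epoll_ctl_prep() above: sqe->fd carries the epoll instance, sqe->len the op, sqe->off the target fd and sqe->addr the event pointer. A hedged helper (ring setup elided, name ours):

#include <string.h>
#include <sys/epoll.h>
#include <linux/io_uring.h>

/* Async equivalent of epoll_ctl(epfd, op, fd, ev). */
static void prep_epoll_ctl(struct io_uring_sqe *sqe, int epfd, int op,
                           int fd, struct epoll_event *ev)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_EPOLL_CTL;
        sqe->fd = epfd;                         /* epoll instance */
        sqe->len = op;                          /* EPOLL_CTL_ADD/MOD/DEL */
        sqe->off = fd;                          /* fd being added/modified */
        sqe->addr = (unsigned long) ev;         /* NULL is fine for DEL */
}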