From e307e6698165ca6508ed42c69cb1be76c8eb6a3c Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 27 Oct 2022 20:34:45 +0200 Subject: io_uring/net: introduce IORING_SEND_ZC_REPORT_USAGE flag It might be useful for applications to detect if a zero copy transfer with SEND[MSG]_ZC was actually possible or not. The application can fallback to plain SEND[MSG] in order to avoid the overhead of two cqes per request. Or it can generate a log message that could indicate to an administrator that no zero copy was possible and could explain degraded performance. Cc: stable@vger.kernel.org # 6.1 Link: https://lore.kernel.org/io-uring/fb6a7599-8a9b-15e5-9b64-6cd9d01c6ff4@gmail.com/T/#m2b0d9df94ce43b0e69e6c089bdff0ce6babbdfaa Signed-off-by: Stefan Metzmacher Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/8945b01756d902f5d5b0667f20b957ad3f742e5e.1666895626.git.metze@samba.org Signed-off-by: Jens Axboe --- io_uring/net.c | 6 +++++- io_uring/notif.c | 12 ++++++++++++ io_uring/notif.h | 3 +++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index ab83da7e80f0..03825b043afe 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -937,7 +937,8 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) zc->flags = READ_ONCE(sqe->ioprio); if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | - IORING_RECVSEND_FIXED_BUF)) + IORING_RECVSEND_FIXED_BUF | + IORING_SEND_ZC_REPORT_USAGE)) return -EINVAL; notif = zc->notif = io_alloc_notif(ctx); if (!notif) @@ -955,6 +956,9 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->imu = READ_ONCE(ctx->user_bufs[idx]); io_req_set_rsrc_node(notif, ctx, 0); } + if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) { + io_notif_to_data(notif)->zc_report = true; + } if (req->opcode == IORING_OP_SEND_ZC) { if (READ_ONCE(sqe->__pad3[0])) diff --git a/io_uring/notif.c b/io_uring/notif.c index e37c6569d82e..4bfef10161fa 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -18,6 +18,10 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) __io_unaccount_mem(ctx->user, nd->account_pages); nd->account_pages = 0; } + + if (nd->zc_report && (nd->zc_copied || !nd->zc_used)) + notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; + io_req_task_complete(notif, locked); } @@ -28,6 +32,13 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); struct io_kiocb *notif = cmd_to_io_kiocb(nd); + if (nd->zc_report) { + if (success && !nd->zc_used && skb) + WRITE_ONCE(nd->zc_used, true); + else if (!success && !nd->zc_copied) + WRITE_ONCE(nd->zc_copied, true); + } + if (refcount_dec_and_test(&uarg->refcnt)) { notif->io_task_work.func = __io_notif_complete_tw; io_req_task_work_add(notif); @@ -55,6 +66,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) nd->account_pages = 0; nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; nd->uarg.callback = io_uring_tx_zerocopy_callback; + nd->zc_report = nd->zc_used = nd->zc_copied = false; refcount_set(&nd->uarg.refcnt, 1); return notif; } diff --git a/io_uring/notif.h b/io_uring/notif.h index 5b4d710c8ca5..4ae696273c78 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -13,6 +13,9 @@ struct io_notif_data { struct file *file; struct ubuf_info uarg; unsigned long account_pages; + bool zc_report; + bool zc_used; + bool zc_copied; }; void io_notif_flush(struct io_kiocb *notif); -- cgit From 
3671163beb633fbe3297b8e30369b640ce4bd690 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 4 Nov 2022 10:59:40 +0000 Subject: io_uring: move kbuf put out of generic tw complete There are multiple users of io_req_task_complete() including zc notifications, but only read requests use selected buffers. As we already have an rw specific tw function, move io_put_kbuf() in there. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/94374c7649aaefc3a17808dc4701f25ccd457e25.1667557923.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 ------ io_uring/rw.c | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8840cf3e20f2..bdb7e15f1c48 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1476,12 +1476,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) void io_req_task_complete(struct io_kiocb *req, bool *locked) { - if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; - - req->cqe.flags |= io_put_kbuf(req, issue_flags); - } - if (*locked) io_req_complete_defer(req); else diff --git a/io_uring/rw.c b/io_uring/rw.c index bb47cc4da713..1ce065709724 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -286,6 +286,12 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res) static void io_req_rw_complete(struct io_kiocb *req, bool *locked) { io_req_io_end(req); + + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { + unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + + req->cqe.flags |= io_put_kbuf(req, issue_flags); + } io_req_task_complete(req, locked); } -- cgit From 5bc8e8884b4e9579ca57e33d42d60090b7288050 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 4 Nov 2022 10:59:41 +0000 Subject: io_uring/net: remove extra notif rsrc setup io_send_zc_prep() sets up notification's rsrc_node when needed, don't unconditionally install it on notif alloc. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/dbe4875ac33e180b9799d8537a5e27935e82aac4.1667557923.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.c | 1 - 1 file changed, 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/notif.c b/io_uring/notif.c index 4bfef10161fa..59dafc42b8e0 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -60,7 +60,6 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) notif->task = current; io_get_task_refs(1); notif->rsrc_node = NULL; - io_req_set_rsrc_node(notif, ctx, 0); nd = io_notif_to_data(notif); nd->account_pages = 0; -- cgit From fc1dd0d4fa523916529ddf7c56d7b866312c4262 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 4 Nov 2022 10:59:42 +0000 Subject: io_uring/net: preset notif tw handler We're going to have multiple notification tw functions. In preparation for future changes default the tw callback in advance so later we can replace it with other versions. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7acdbea5e20eadd844513320cd454af14ba50f64.1667557923.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'io_uring') diff --git a/io_uring/notif.c b/io_uring/notif.c index 59dafc42b8e0..6afb58b94297 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -39,10 +39,8 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, WRITE_ONCE(nd->zc_copied, true); } - if (refcount_dec_and_test(&uarg->refcnt)) { - notif->io_task_work.func = __io_notif_complete_tw; + if (refcount_dec_and_test(&uarg->refcnt)) io_req_task_work_add(notif); - } } struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) @@ -60,6 +58,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) notif->task = current; io_get_task_refs(1); notif->rsrc_node = NULL; + notif->io_task_work.func = __io_notif_complete_tw; nd = io_notif_to_data(notif); nd->account_pages = 0; @@ -76,8 +75,6 @@ void io_notif_flush(struct io_kiocb *notif) struct io_notif_data *nd = io_notif_to_data(notif); /* drop slot's master ref */ - if (refcount_dec_and_test(&nd->uarg.refcnt)) { - notif->io_task_work.func = __io_notif_complete_tw; + if (refcount_dec_and_test(&nd->uarg.refcnt)) io_req_task_work_add(notif); - } } -- cgit From 7fa8e84192fd8dbc97b4c8c1acfd10017c3dd7b6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 4 Nov 2022 10:59:43 +0000 Subject: io_uring/net: rename io_uring_tx_zerocopy_callback Just a simple renaming patch, io_uring_tx_zerocopy_callback() is too bulky and doesn't follow usual naming style. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/24d78325403ca6dcb1ec4bced1e33cacc9b832a5.1667557923.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/notif.c b/io_uring/notif.c index 6afb58b94297..5b0e7bb1198f 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -25,9 +25,8 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) io_req_task_complete(notif, locked); } -static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, - struct ubuf_info *uarg, - bool success) +static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, + bool success) { struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); struct io_kiocb *notif = cmd_to_io_kiocb(nd); @@ -63,7 +62,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) nd = io_notif_to_data(notif); nd->account_pages = 0; nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; - nd->uarg.callback = io_uring_tx_zerocopy_callback; + nd->uarg.callback = io_tx_ubuf_callback; nd->zc_report = nd->zc_used = nd->zc_copied = false; refcount_set(&nd->uarg.refcnt, 1); return notif; -- cgit From bedd20bcf3b08c5d2f03f30a83a10022bde5e596 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 4 Nov 2022 10:59:44 +0000 Subject: io_uring/net: inline io_notif_flush() io_notif_flush() is pretty simple, we can inline it. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/332359e7bd124138dfe51340bbec829c9b265c18.1667557923.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.c | 10 ---------- io_uring/notif.h | 11 ++++++++++- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'io_uring') diff --git a/io_uring/notif.c b/io_uring/notif.c index 5b0e7bb1198f..c287adf24e66 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -67,13 +67,3 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) refcount_set(&nd->uarg.refcnt, 1); return notif; } - -void io_notif_flush(struct io_kiocb *notif) - __must_hold(&slot->notif->ctx->uring_lock) -{ - struct io_notif_data *nd = io_notif_to_data(notif); - - /* drop slot's master ref */ - if (refcount_dec_and_test(&nd->uarg.refcnt)) - io_req_task_work_add(notif); -} diff --git a/io_uring/notif.h b/io_uring/notif.h index 4ae696273c78..7f00176020d3 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -18,7 +18,6 @@ struct io_notif_data { bool zc_copied; }; -void io_notif_flush(struct io_kiocb *notif); struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx); static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif) @@ -26,6 +25,16 @@ static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif) return io_kiocb_to_cmd(notif, struct io_notif_data); } +static inline void io_notif_flush(struct io_kiocb *notif) + __must_hold(¬if->ctx->uring_lock) +{ + struct io_notif_data *nd = io_notif_to_data(notif); + + /* drop slot's master ref */ + if (refcount_dec_and_test(&nd->uarg.refcnt)) + io_req_task_work_add(notif); +} + static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len) { struct io_ring_ctx *ctx = notif->ctx; -- cgit From 40725d1b960f19a11a1ebd1ab537844ebf39347c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 4 Nov 2022 10:59:45 +0000 Subject: io_uring: move zc reporting from the hot path Add custom tw and notif callbacks on top of usual bits also handling zc reporting. That moves it from the hot path. 
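For illustration only (an editorial sketch, not part of the patch): how an application might consume the zc usage report handled above. It assumes liburing >= 2.3 (io_uring_prep_send_zc()) and uapi definitions of IORING_SEND_ZC_REPORT_USAGE and IORING_NOTIF_USAGE_ZC_COPIED; error handling and user_data matching are elided.

#include <liburing.h>

/* Sketch: returns 1 if the kernel reported a copy fallback. */
static int send_zc_was_copied(struct io_uring *ring, int sockfd,
			      const void *buf, size_t len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int copied = 0;

	io_uring_prep_send_zc(sqe, sockfd, buf, len, 0,
			      IORING_SEND_ZC_REPORT_USAGE);
	io_uring_submit(ring);

	/* Two completions: the send result (IORING_CQE_F_MORE set),
	 * then the notification (IORING_CQE_F_NOTIF set) once the
	 * kernel is done with the buffer; its res carries the report. */
	for (int i = 0; i < 2; i++) {
		io_uring_wait_cqe(ring, &cqe);
		if (cqe->flags & IORING_CQE_F_NOTIF)
			copied = !!(cqe->res & IORING_NOTIF_USAGE_ZC_COPIED);
		io_uring_cqe_seen(ring, cqe);
	}
	return copied;
}

A caller can use the result to fall back to plain SEND[MSG] or to log degraded performance, per the motivation at the top of this series.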
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/40de4a6409042478e1f35adc4912e23226cb1b5c.1667557923.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 22 ++++++++++++++-------- io_uring/notif.c | 31 +++++++++++++++++++++++++++---- io_uring/notif.h | 1 + 3 files changed, 42 insertions(+), 12 deletions(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index 03825b043afe..9e3da845f906 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -923,6 +923,9 @@ void io_send_zc_cleanup(struct io_kiocb *req) } } +#define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF) +#define IO_ZC_FLAGS_VALID (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE) + int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); @@ -935,11 +938,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->flags & REQ_F_CQE_SKIP) return -EINVAL; - zc->flags = READ_ONCE(sqe->ioprio); - if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | - IORING_RECVSEND_FIXED_BUF | - IORING_SEND_ZC_REPORT_USAGE)) - return -EINVAL; notif = zc->notif = io_alloc_notif(ctx); if (!notif) return -ENOMEM; @@ -947,6 +945,17 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) notif->cqe.res = 0; notif->cqe.flags = IORING_CQE_F_NOTIF; req->flags |= REQ_F_NEED_CLEANUP; + + zc->flags = READ_ONCE(sqe->ioprio); + if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) { + if (zc->flags & ~IO_ZC_FLAGS_VALID) + return -EINVAL; + if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) { + io_notif_set_extended(notif); + io_notif_to_data(notif)->zc_report = true; + } + } + if (zc->flags & IORING_RECVSEND_FIXED_BUF) { unsigned idx = READ_ONCE(sqe->buf_index); @@ -956,9 +965,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->imu = READ_ONCE(ctx->user_bufs[idx]); io_req_set_rsrc_node(notif, ctx, 0); } - if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) { - io_notif_to_data(notif)->zc_report = true; - } if (req->opcode == IORING_OP_SEND_ZC) { if (READ_ONCE(sqe->__pad3[0])) diff --git a/io_uring/notif.c b/io_uring/notif.c index c287adf24e66..9864bde3e2ef 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -18,11 +18,17 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) __io_unaccount_mem(ctx->user, nd->account_pages); nd->account_pages = 0; } + io_req_task_complete(notif, locked); +} + +static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked) +{ + struct io_notif_data *nd = io_notif_to_data(notif); if (nd->zc_report && (nd->zc_copied || !nd->zc_used)) notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; - io_req_task_complete(notif, locked); + __io_notif_complete_tw(notif, locked); } static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, @@ -31,15 +37,33 @@ static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); struct io_kiocb *notif = cmd_to_io_kiocb(nd); + if (refcount_dec_and_test(&uarg->refcnt)) + io_req_task_work_add(notif); +} + +static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg, + bool success) +{ + struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); + if (nd->zc_report) { if (success && !nd->zc_used && skb) WRITE_ONCE(nd->zc_used, true); else if (!success && !nd->zc_copied) WRITE_ONCE(nd->zc_copied, true); } + 
io_tx_ubuf_callback(skb, uarg, success); +} - if (refcount_dec_and_test(&uarg->refcnt)) - io_req_task_work_add(notif); +void io_notif_set_extended(struct io_kiocb *notif) +{ + struct io_notif_data *nd = io_notif_to_data(notif); + + nd->zc_report = false; + nd->zc_used = false; + nd->zc_copied = false; + notif->io_task_work.func = io_notif_complete_tw_ext; + io_notif_to_data(notif)->uarg.callback = io_tx_ubuf_callback_ext; } struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) @@ -63,7 +87,6 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) nd->account_pages = 0; nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; nd->uarg.callback = io_tx_ubuf_callback; - nd->zc_report = nd->zc_used = nd->zc_copied = false; refcount_set(&nd->uarg.refcnt, 1); return notif; } diff --git a/io_uring/notif.h b/io_uring/notif.h index 7f00176020d3..c88c800cd89d 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -19,6 +19,7 @@ struct io_notif_data { }; struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx); +void io_notif_set_extended(struct io_kiocb *notif); static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif) { -- cgit From 42385b02baad0df55474b7f36dc13e0d4ffd0cc0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 4 Nov 2022 10:59:46 +0000 Subject: io_uring/net: move mm accounting to a slower path We can also move mm accounting to the extended callbacks. It removes a few cycles from the hot path including skipping one function call and setting io_req_task_complete as a callback directly. For user-backed I/O it shouldn't make any difference taking into consideration atomic mm accounting and page pinning. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1062f270273ad11c1b7b45ec59a6a317533d5e64.1667557923.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 3 +++ io_uring/notif.c | 31 +++++++++++++------------------ 2 files changed, 16 insertions(+), 18 deletions(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index 9e3da845f906..966019fcbe8c 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1097,6 +1097,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) return ret; msg.sg_from_iter = io_sg_from_iter; } else { + io_notif_set_extended(zc->notif); ret = import_single_range(WRITE, zc->buf, zc->len, &iov, &msg.msg_iter); if (unlikely(ret)) @@ -1158,6 +1159,8 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; + io_notif_set_extended(sr->notif); + sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; diff --git a/io_uring/notif.c b/io_uring/notif.c index 9864bde3e2ef..c4bb793ebf0e 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -9,11 +9,14 @@ #include "notif.h" #include "rsrc.h" -static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) +static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked) { struct io_notif_data *nd = io_notif_to_data(notif); struct io_ring_ctx *ctx = notif->ctx; + if (nd->zc_report && (nd->zc_copied || !nd->zc_used)) + notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; + if (nd->account_pages && ctx->user) { __io_unaccount_mem(ctx->user, nd->account_pages); nd->account_pages = 0; @@ -21,16 +24,6 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) io_req_task_complete(notif, locked); } -static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked) -{ - struct io_notif_data *nd = io_notif_to_data(notif); - - if
(nd->zc_report && (nd->zc_copied || !nd->zc_used)) - notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; - - __io_notif_complete_tw(notif, locked); -} - static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, bool success) { @@ -59,11 +52,14 @@ void io_notif_set_extended(struct io_kiocb *notif) { struct io_notif_data *nd = io_notif_to_data(notif); - nd->zc_report = false; - nd->zc_used = false; - nd->zc_copied = false; - notif->io_task_work.func = io_notif_complete_tw_ext; - io_notif_to_data(notif)->uarg.callback = io_tx_ubuf_callback_ext; + if (nd->uarg.callback != io_tx_ubuf_callback_ext) { + nd->account_pages = 0; + nd->zc_report = false; + nd->zc_used = false; + nd->zc_copied = false; + nd->uarg.callback = io_tx_ubuf_callback_ext; + notif->io_task_work.func = io_notif_complete_tw_ext; + } } struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) @@ -81,10 +77,9 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) notif->task = current; io_get_task_refs(1); notif->rsrc_node = NULL; - notif->io_task_work.func = __io_notif_complete_tw; + notif->io_task_work.func = io_req_task_complete; nd = io_notif_to_data(notif); - nd->account_pages = 0; nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; nd->uarg.callback = io_tx_ubuf_callback; refcount_set(&nd->uarg.refcnt, 1); -- cgit From df730ec21f7ba395b1b22e7f93a3a85b1d1b7882 Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Wed, 2 Nov 2022 16:25:03 +0800 Subject: io_uring: fix two assignments in if conditions Fixes two errors: "ERROR: do not use assignment in if condition 130: FILE: io_uring/net.c:130: + if (!(issue_flags & IO_URING_F_UNLOCKED) && ERROR: do not use assignment in if condition 599: FILE: io_uring/poll.c:599: + } else if (!(issue_flags & IO_URING_F_UNLOCKED) &&" reported by checkpatch.pl in net.c and poll.c . 
Signed-off-by: Xinghui Li Reported-by: kernel test robot Link: https://lore.kernel.org/r/20221102082503.32236-1-korantwork@gmail.com [axboe: style tweaks] Signed-off-by: Jens Axboe --- io_uring/net.c | 16 +++++++++------- io_uring/poll.c | 7 +++++-- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index 966019fcbe8c..f32e0daeedd8 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -125,13 +125,15 @@ static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req, struct io_cache_entry *entry; struct io_async_msghdr *hdr; - if (!(issue_flags & IO_URING_F_UNLOCKED) && - (entry = io_alloc_cache_get(&ctx->netmsg_cache)) != NULL) { - hdr = container_of(entry, struct io_async_msghdr, cache); - hdr->free_iov = NULL; - req->flags |= REQ_F_ASYNC_DATA; - req->async_data = hdr; - return hdr; + if (!(issue_flags & IO_URING_F_UNLOCKED)) { + entry = io_alloc_cache_get(&ctx->netmsg_cache); + if (entry) { + hdr = container_of(entry, struct io_async_msghdr, cache); + hdr->free_iov = NULL; + req->flags |= REQ_F_ASYNC_DATA; + req->async_data = hdr; + return hdr; + } } if (!io_alloc_async_data(req)) { diff --git a/io_uring/poll.c b/io_uring/poll.c index 055632e9092a..58e02d963961 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -615,10 +615,13 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, if (req->flags & REQ_F_POLLED) { apoll = req->apoll; kfree(apoll->double_poll); - } else if (!(issue_flags & IO_URING_F_UNLOCKED) && - (entry = io_alloc_cache_get(&ctx->apoll_cache)) != NULL) { + } else if (!(issue_flags & IO_URING_F_UNLOCKED)) { + entry = io_alloc_cache_get(&ctx->apoll_cache); + if (entry == NULL) + goto alloc_apoll; apoll = container_of(entry, struct async_poll, cache); } else { +alloc_apoll: apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); if (unlikely(!apoll)) return NULL; -- cgit From ef67fcb41de6d3d5bbb16aaa66d4c706c4cacf54 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Mon, 7 Nov 2022 04:33:49 -0800 Subject: io_uring: do not always force run task_work in io_uring_register Running task work when not needed can unnecessarily delay operations. Specifically, IORING_SETUP_DEFER_TASKRUN tries to avoid running task work until the user requests it. Therefore do not run it in io_uring_register any more. The one catch is that io_rsrc_ref_quiesce expects it to have run in order to process all outstanding references, and so reorder its loop to do this.
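For context (an illustrative sketch, not part of the patch; assumes a liburing exposing io_uring_queue_init_params() and a 6.1+ kernel), a ring whose task work stays deferred until the submitter reaps completions:

#include <liburing.h>

/* Sketch: DEFER_TASKRUN requires SINGLE_ISSUER, and the task that
 * creates the ring must be the one waiting on completions. */
static int setup_deferred_ring(struct io_uring *ring)
{
	struct io_uring_params p = { };

	p.flags = IORING_SETUP_SINGLE_ISSUER |
		  IORING_SETUP_DEFER_TASKRUN;
	return io_uring_queue_init_params(64, ring, &p);
}

With such a ring, task work runs when the task itself enters the kernel to reap CQEs, which is exactly why forcing a run inside io_uring_register() was counterproductive.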
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221107123349.4106213-1-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 -- io_uring/rsrc.c | 7 ++++++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bdb7e15f1c48..cf68d16255a0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -4056,8 +4056,6 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, ctx = f.file->private_data; - io_run_task_work_ctx(ctx); - mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); mutex_unlock(&ctx->uring_lock); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 55d4ab96fb92..187f1c83e779 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -321,6 +321,11 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, if (atomic_dec_and_test(&data->refs)) break; mutex_unlock(&ctx->uring_lock); + + ret = io_run_task_work_sig(ctx); + if (ret < 0) + goto reinit; + flush_delayed_work(&ctx->rsrc_put_work); ret = wait_for_completion_interruptible(&data->done); if (!ret) { @@ -336,12 +341,12 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, } } +reinit: atomic_inc(&data->refs); /* wait for all works potentially completing data->done */ flush_delayed_work(&ctx->rsrc_put_work); reinit_completion(&data->done); - ret = io_run_task_work_sig(ctx); mutex_lock(&ctx->uring_lock); } while (ret >= 0); data->quiesce = false; -- cgit From 515e26961295bee9da5e26916c27739dca6c10e1 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Mon, 21 Nov 2022 07:43:42 -0700 Subject: io_uring: revert "io_uring fix multishot accept ordering" This is no longer needed after commit aa1df3a360a0 ("io_uring: fix CQE reordering"), since all reordering is now taken care of. This reverts commit cbd25748545c ("io_uring: fix multishot accept ordering"). Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221107125236.260132-2-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/net.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index f32e0daeedd8..f7b2b068464d 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1322,12 +1322,12 @@ retry: return IOU_OK; } - if (ret >= 0 && - io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, false)) + if (ret < 0) + return ret; + if (io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, true)) goto retry; - io_req_set_res(req, ret, 0); - return (issue_flags & IO_URING_F_MULTISHOT) ? IOU_STOP_MULTISHOT : IOU_OK; + return -ECANCELED; } int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -- cgit From e2ad599d1ed38fe743106f10d58a0cbfc00b51e2 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Mon, 7 Nov 2022 04:52:35 -0800 Subject: io_uring: allow multishot recv CQEs to overflow With commit aa1df3a360a0 ("io_uring: fix CQE reordering"), there are stronger guarantees for overflow ordering. Specifically ensuring that userspace will not receive out of order receive CQEs. Therefore this is not needed any more for recv/recvmsg. 
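For context (an illustrative sketch, not part of the patch; assumes liburing and a pending IORING_RECV_MULTISHOT request, with consume() a hypothetical callback), the ordering guarantee matters to consumers that treat each CQE as the next chunk of the stream:

#include <liburing.h>

extern void consume(const struct io_uring_cqe *cqe);	/* hypothetical */

static void drain_multishot_recv(struct io_uring *ring)
{
	struct io_uring_cqe *cqe;

	while (io_uring_wait_cqe(ring, &cqe) == 0) {
		int done = cqe->res <= 0 ||
			   !(cqe->flags & IORING_CQE_F_MORE);

		if (cqe->res > 0)
			consume(cqe);	/* one receive per CQE, in order */
		io_uring_cqe_seen(ring, cqe);
		if (done)
			break;	/* multishot ended; re-arm the recv */
	}
}

Since overflowed CQEs are now flushed in order, a loop like this stays correct even when completions go through the overflow path.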
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221107125236.260132-4-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/net.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index f7b2b068464d..0de6f78ad978 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -602,15 +602,11 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, if (!mshot_finished) { if (io_post_aux_cqe(req->ctx, req->cqe.user_data, *ret, - cflags | IORING_CQE_F_MORE, false)) { + cflags | IORING_CQE_F_MORE, true)) { io_recv_prep_retry(req); return false; } - /* - * Otherwise stop multishot but use the current result. - * Probably will end up going into overflow, but this means - * we cannot trust the ordering anymore - */ + /* Otherwise stop multishot but use the current result. */ } io_req_set_res(req, *ret, cflags); -- cgit From cd42a53d25d489317b9ae5213da721cde8cb7071 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Thu, 10 Nov 2022 14:03:13 +0800 Subject: io_uring/poll: remove outdated comments of caching Previous commit 13a99017ff19 ("io_uring: remove events caching atavisms") entirely removed the events caching optimization introduced by commit 81459350d581 ("io_uring: cache req->apoll->events in req->cflags"). Hence the related comment should also be removed to avoid misunderstanding. Fixes: 13a99017ff19 ("io_uring: remove events caching atavisms") Signed-off-by: Lin Ma Link: https://lore.kernel.org/r/20221110060313.16303-1-linma@zju.edu.cn Signed-off-by: Jens Axboe --- io_uring/poll.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'io_uring') diff --git a/io_uring/poll.c b/io_uring/poll.c index 58e02d963961..8fb8e781c02d 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -324,12 +324,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) static void __io_poll_execute(struct io_kiocb *req, int mask) { io_req_set_res(req, mask, 0); - /* - * This is useful for poll that is armed on behalf of another - * request, and where the wakeup path could be on a different - * CPU. We want to avoid pulling in req->apoll->events for that - * case. - */ + if (req->opcode == IORING_OP_POLL_ADD) req->io_task_work.func = io_poll_task_func; else -- cgit From 23a6c9ac4dbd7cccf5b909e78aa84192b65f2833 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Thu, 10 Nov 2022 20:21:03 +0800 Subject: io_uring: update outdated comment of callbacks Previous commit ebc11b6c6b87 ("io_uring: clean io-wq callbacks") renamed io_free_work() to io_wq_free_work() for consistency. This patch also updates the relevant comment to avoid misunderstanding.
Fixes: ebc11b6c6b87 ("io_uring: clean io-wq callbacks") Signed-off-by: Lin Ma Link: https://lore.kernel.org/r/20221110122103.20120-1-linma@zju.edu.cn Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cf68d16255a0..c770eed4d717 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1781,7 +1781,7 @@ void io_wq_submit_work(struct io_wq_work *work) bool needs_poll = false; int ret = 0, err = -ECANCELED; - /* one will be dropped by ->io_free_work() after returning to io-wq */ + /* one will be dropped by ->io_wq_free_work() after returning to io-wq */ if (!(req->flags & REQ_F_REFCOUNT)) __io_req_set_refcount(req, 2); else -- cgit From e52d2e583e4ad1d5d0b804d79c2b8752eb0e5ceb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 11 Nov 2022 16:54:08 +0000 Subject: io_uring: inline io_req_task_work_add() __io_req_task_work_add() is huge but marked inline, which makes compilers generate lots of garbage. Inline the wrapper caller io_req_task_work_add() instead. before and after: text data bss dec hex filename 47347 16248 8 63603 f873 io_uring/io_uring.o text data bss dec hex filename 45303 16248 8 61559 f077 io_uring/io_uring.o Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/26dc8c28ca0160e3269ef3e55c5a8b917c4d4450.1668162751.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 7 +------ io_uring/io_uring.h | 7 ++++++- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c770eed4d717..8f452dfb4f1c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1127,7 +1127,7 @@ static void io_req_local_work_add(struct io_kiocb *req) __io_cqring_wake(ctx); } -static inline void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) +void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; @@ -1159,11 +1159,6 @@ static inline void __io_req_task_work_add(struct io_kiocb *req, bool allow_local } } -void io_req_task_work_add(struct io_kiocb *req) -{ - __io_req_task_work_add(req, true); -} - static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) { struct llist_node *node; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index cef5ff924e63..38d9e149d2db 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -50,9 +50,9 @@ static inline bool io_req_ffs_set(struct io_kiocb *req) return req->flags & REQ_F_FIXED_FILE; } +void __io_req_task_work_add(struct io_kiocb *req, bool allow_local); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); -void io_req_task_work_add(struct io_kiocb *req); void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); void io_req_task_queue(struct io_kiocb *req); void io_queue_iowq(struct io_kiocb *req, bool *dont_use); @@ -82,6 +82,11 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); +static inline void io_req_task_work_add(struct io_kiocb *req) +{ + __io_req_task_work_add(req, true); +} + #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) -- cgit From d75936062049522172a107c994242b76c89777f9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 11 Nov 2022 16:54:09 +0000 Subject: io_uring: split tw
fallback into a function When the target process is dying and so task_work_add() is not allowed, we push all task_work items to the fallback workqueue. Move the part responsible for moving tw items out of __io_req_task_work_add() into a separate function. Makes it a bit cleaner and gives the compiler a bit of extra info. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e503dab9d7af95470ca6b214c6de17715ae4e748.1668162751.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8f452dfb4f1c..9925ac08c398 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1105,6 +1105,20 @@ void tctx_task_work(struct callback_head *cb) trace_io_uring_task_work_run(tctx, count, loops); } +static __cold void io_fallback_tw(struct io_uring_task *tctx) +{ + struct llist_node *node = llist_del_all(&tctx->task_list); + struct io_kiocb *req; + + while (node) { + req = container_of(node, struct io_kiocb, io_task_work.node); + node = node->next; + if (llist_add(&req->io_task_work.node, + &req->ctx->fallback_llist)) + schedule_delayed_work(&req->ctx->fallback_work, 1); + } +} + static void io_req_local_work_add(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1131,7 +1145,6 @@ void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; - struct llist_node *node; if (allow_local && ctx->flags & IORING_SETUP_DEFER_TASKRUN) { io_req_local_work_add(req); @@ -1148,15 +1161,7 @@ void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) return; - node = llist_del_all(&tctx->task_list); - - while (node) { - req = container_of(node, struct io_kiocb, io_task_work.node); - node = node->next; - if (llist_add(&req->io_task_work.node, - &req->ctx->fallback_llist)) - schedule_delayed_work(&req->ctx->fallback_work, 1); - } + io_fallback_tw(tctx); } static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) -- cgit From f9d567c75ec216447f36da6e855500023504fa04 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Nov 2022 18:41:06 +0000 Subject: io_uring: inline __io_req_complete_post() There is only one user of __io_req_complete_post(); inline it.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ef4c9059950a3da5cf68df00f977f1fd13bd9306.1668597569.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 +++-------- io_uring/io_uring.h | 1 - 2 files changed, 3 insertions(+), 9 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9925ac08c398..1299f9c8567a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -854,19 +854,14 @@ static void __io_req_complete_put(struct io_kiocb *req) } } -void __io_req_complete_post(struct io_kiocb *req) -{ - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(req->ctx, req); - __io_req_complete_put(req); -} - void io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; io_cq_lock(ctx); - __io_req_complete_post(req); + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe_req(ctx, req); + __io_req_complete_put(req); io_cq_unlock_post(ctx); } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 38d9e149d2db..69fbd27c7577 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -32,7 +32,6 @@ int io_run_local_work(struct io_ring_ctx *ctx); void io_req_complete_failed(struct io_kiocb *req, s32 res); void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); -void __io_req_complete_post(struct io_kiocb *req); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, -- cgit From 4464853277d0ccdb9914608dd1332f0fa2f9846f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 20 Nov 2022 10:18:45 -0700 Subject: io_uring: pass in EPOLL_URING_WAKE for eventfd signaling and wakeups Pass in EPOLL_URING_WAKE when signaling eventfd or doing poll-related wakeups, so that we can check for a circular event dependency between eventfd and epoll. If this flag is set when our wakeup handlers are called, then we know we have a dependency that needs to terminate multishot requests. eventfd and epoll are the only such possible dependencies.
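The circular dependency in question, seen from userspace (an illustrative sketch, not part of the patch; assumes liburing): the ring signals its registered eventfd whenever a CQE is posted, while a multishot poll on that same eventfd posts a CQE on every signal, so each completion would re-trigger itself:

#include <liburing.h>
#include <poll.h>
#include <sys/eventfd.h>

static void circular_setup(struct io_uring *ring)
{
	int efd = eventfd(0, EFD_CLOEXEC);
	struct io_uring_sqe *sqe;

	io_uring_register_eventfd(ring, efd);	/* CQEs signal efd... */
	sqe = io_uring_get_sqe(ring);
	io_uring_prep_poll_multishot(sqe, efd, POLLIN);
	io_uring_submit(ring);			/* ...and the ring polls efd */
}

With EPOLL_URING_WAKE passed through the wakeup key, io_poll_wake() can spot this case and downgrade the poll to oneshot instead of looping (see the poll.c hunk below).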
Cc: stable@vger.kernel.org # 6.0 Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- io_uring/io_uring.h | 15 +++++++++++---- io_uring/poll.c | 8 ++++++++ 3 files changed, 21 insertions(+), 6 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1299f9c8567a..762ecab801f2 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -495,7 +495,7 @@ static void io_eventfd_ops(struct rcu_head *rcu) int ops = atomic_xchg(&ev_fd->ops, 0); if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT)) - eventfd_signal(ev_fd->cq_ev_fd, 1); + eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE); /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback * ordering in a race but if references are 0 we know we have to free @@ -531,7 +531,7 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx) goto out; if (likely(eventfd_signal_allowed())) { - eventfd_signal(ev_fd->cq_ev_fd, 1); + eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE); } else { atomic_inc(&ev_fd->refs); if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 69fbd27c7577..83013ee584d6 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -4,6 +4,7 @@ #include <linux/errno.h> #include <linux/lockdep.h> #include <linux/io_uring_types.h> +#include <uapi/linux/eventpoll.h> #include "io-wq.h" #include "slist.h" #include "filetable.h" @@ -211,12 +212,18 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) static inline void __io_cqring_wake(struct io_ring_ctx *ctx) { /* - * wake_up_all() may seem excessive, but io_wake_function() and - * io_should_wake() handle the termination of the loop and only - * wake as many waiters as we need to. + * Trigger waitqueue handler on all waiters on our waitqueue. This + * won't necessarily wake up all the tasks, io_should_wake() will make + * that decision. + * + * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter + * set in the mask so that if we recurse back into our own poll + * waitqueue handlers, we know we have a dependency between eventfd or + * epoll and should terminate multishot poll at that point. */ if (waitqueue_active(&ctx->cq_wait)) - wake_up_all(&ctx->cq_wait); + __wake_up(&ctx->cq_wait, TASK_NORMAL, 0, + poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); } static inline void io_cqring_wake(struct io_ring_ctx *ctx) diff --git a/io_uring/poll.c b/io_uring/poll.c index 8fb8e781c02d..22c9b2e0944a 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -389,6 +389,14 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, return 0; if (io_poll_get_ownership(req)) { + /* + * If we trigger a multishot poll off our own wakeup path, + * disable multishot as there is a circular dependency between + * CQ posting and triggering the event. + */ + if (mask & EPOLL_URING_WAKE) + poll->events |= EPOLLONESHOT; + /* optional, saves extra locking for removal in tw handler */ if (mask && poll->events & EPOLLONESHOT) { list_del_init(&poll->wait.entry); -- cgit From 4061f0ef730cca5171351b7018b34a45b76df9c2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 20 Nov 2022 10:20:46 -0700 Subject: Revert "io_uring: disallow self-propelled ring polling" This reverts commit 7fdbc5f014c3f71bc44673a2d6c5bb2d12d45f25. This patch dealt with a subset of the real problem, which is a potential circular dependency on the wakeup path for io_uring itself. Outside of io_uring, eventfd can also trigger this (see details in 03e02acda8e2) and so can epoll (see details in caf1aeaffc3b).
Now that we have a generic solution to this problem, get rid of the io_uring-specific workaround. Signed-off-by: Jens Axboe --- io_uring/poll.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/poll.c b/io_uring/poll.c index 22c9b2e0944a..cd4d98d622d2 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -246,8 +246,6 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) continue; if (req->apoll_events & EPOLLONESHOT) return IOU_POLL_DONE; - if (io_is_uring_fops(req->file)) - return IOU_POLL_DONE; /* multishot, just fill a CQE and proceed */ if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { -- cgit From 6c16fe3c16bdc420719768f7ea97b82bd6303eec Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 21 Nov 2022 07:51:15 -0700 Subject: io_uring: kill io_cqring_ev_posted() and __io_cq_unlock_post() __io_cq_unlock_post() is identical to io_cq_unlock_post(), and io_cqring_ev_posted() has a single caller so might as well just inline it there. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 762ecab801f2..2260fb7aa7f2 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -581,23 +581,14 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_eventfd_flush_signal(ctx); } -static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) -{ - io_commit_cqring_flush(ctx); - io_cqring_wake(ctx); -} - -static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) +void io_cq_unlock_post(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); -} -void io_cq_unlock_post(struct io_ring_ctx *ctx) -{ - __io_cq_unlock_post(ctx); + io_commit_cqring_flush(ctx); + io_cqring_wake(ctx); } /* Returns true if there are no backlogged entries after the flush */ @@ -1346,7 +1337,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) if (!(req->flags & REQ_F_CQE_SKIP)) __io_fill_cqe_req(ctx, req); } - __io_cq_unlock_post(ctx); + io_cq_unlock_post(ctx); io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); -- cgit From 2ccc92f4effcfa1c51c4fcf1e34d769099d3cad4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 23 Nov 2022 11:33:36 +0000 Subject: io_uring: add completion locking for iopoll There are pieces of code that may allow iopoll to race filling cqes; temporarily add spinlocking around posting events.
Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/84d86b5c117feda075471c5c9e65208e0dccf5d0.1669203009.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rw.c b/io_uring/rw.c index 1ce065709724..61c326831949 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1049,6 +1049,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) else if (!pos) return 0; + spin_lock(&ctx->completion_lock); prev = start; wq_list_for_each_resume(pos, prev) { struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); @@ -1063,11 +1064,11 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) req->cqe.flags = io_put_kbuf(req, 0); __io_fill_cqe_req(req->ctx, req); } - + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); if (unlikely(!nr_events)) return 0; - io_commit_cqring(ctx); io_cqring_ev_posted_iopoll(ctx); pos = start ? start->next : ctx->iopoll_list.first; wq_list_cut(&ctx->iopoll_list, prev, start); -- cgit From e276ae344a770f91912a81c6a338d92efd319be2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 23 Nov 2022 11:33:37 +0000 Subject: io_uring: hold locks for io_req_complete_failed A preparation patch, make sure we always hold uring_lock around io_req_complete_failed(). The only place deviating from the rule is io_cancel_defer_files(), queue a tw instead. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/70760344eadaecf2939287084b9d4ba5c05a6984.1669203009.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2260fb7aa7f2..4d16f3b1ee11 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -862,9 +862,12 @@ inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) } void io_req_complete_failed(struct io_kiocb *req, s32 res) + __must_hold(&ctx->uring_lock) { const struct io_op_def *def = &io_op_defs[req->opcode]; + lockdep_assert_held(&req->ctx->uring_lock); + req_set_fail(req); io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); if (def->fail) @@ -1615,6 +1618,7 @@ static u32 io_get_sequence(struct io_kiocb *req) } static __cold void io_drain_req(struct io_kiocb *req) + __must_hold(&ctx->uring_lock) { struct io_ring_ctx *ctx = req->ctx; struct io_defer_entry *de; @@ -2849,7 +2853,7 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, while (!list_empty(&list)) { de = list_first_entry(&list, struct io_defer_entry, list); list_del_init(&de->list); - io_req_complete_failed(de->req, -ECANCELED); + io_req_task_queue_fail(de->req, -ECANCELED); kfree(de); } return true; -- cgit From 624fd779fd869bdcb2c0ccca0f09456eed71ed52 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 23 Nov 2022 11:33:38 +0000 Subject: io_uring: use io_req_task_complete() in timeout Use a more generic io_req_task_complete() in timeout completion task_work instead of io_req_complete_post(). 
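For context on these timeout completions (an illustrative sketch, not part of the patch; assumes liburing), a linked timeout is armed against the preceding SQE and finishes with -ETIME if it fires first, or -ECANCELED if the linked op completes in time; those are the results filled in by the task_work below:

#include <liburing.h>

static void read_with_timeout(struct io_uring *ring, int fd,
			      void *buf, unsigned int len)
{
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_read(sqe, fd, buf, len, 0);
	sqe->flags |= IOSQE_IO_LINK;	/* next SQE is the timeout */

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_link_timeout(sqe, &ts, 0);
	io_uring_submit(ring);
}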
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bda1710b58c07bf06107421c2a65c529ea9cdcac.1669203009.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/timeout.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/timeout.c b/io_uring/timeout.c index e8a8c2099480..a819818df7b3 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -282,11 +282,11 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) ret = io_try_cancel(req->task->io_uring, &cd, issue_flags); } io_req_set_res(req, ret ?: -ETIME, 0); - io_req_complete_post(req); + io_req_task_complete(req, locked); io_put_req(prev); } else { io_req_set_res(req, -ETIME, 0); - io_req_complete_post(req); + io_req_task_complete(req, locked); } } -- cgit From 833b5dfffc26c81835ce38e2a5df9ac5fa142735 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 23 Nov 2022 11:33:39 +0000 Subject: io_uring: remove io_req_tw_post_queue Remove io_req_tw_post() and io_req_tw_post_queue(), we can use io_req_task_complete() instead. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b9b73c08022c7f1457023ac841f35c0100e70345.1669203009.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 12 ------------ io_uring/io_uring.h | 8 +++++++- io_uring/timeout.c | 6 +++--- 3 files changed, 10 insertions(+), 16 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4d16f3b1ee11..e445344f6f07 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1226,18 +1226,6 @@ int io_run_local_work(struct io_ring_ctx *ctx) return ret; } -static void io_req_tw_post(struct io_kiocb *req, bool *locked) -{ - io_req_complete_post(req); -} - -void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) -{ - io_req_set_res(req, res, cflags); - req->io_task_work.func = io_req_tw_post; - io_req_task_work_add(req); -} - static void io_req_task_cancel(struct io_kiocb *req, bool *locked) { /* not needed for normal modes, but SQPOLL depends on it */ diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 83013ee584d6..222af88df10f 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -53,7 +53,6 @@ static inline bool io_req_ffs_set(struct io_kiocb *req) void __io_req_task_work_add(struct io_kiocb *req, bool allow_local); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); -void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); void io_req_task_queue(struct io_kiocb *req); void io_queue_iowq(struct io_kiocb *req, bool *dont_use); void io_req_task_complete(struct io_kiocb *req, bool *locked); @@ -375,4 +374,11 @@ static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) ctx->submitter_task == current); } +static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) +{ + io_req_set_res(req, res, 0); + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); +} + #endif diff --git a/io_uring/timeout.c b/io_uring/timeout.c index a819818df7b3..5b4bc93fd6e0 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -63,7 +63,7 @@ static bool io_kill_timeout(struct io_kiocb *req, int status) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&timeout->list); - io_req_tw_post_queue(req, status, 0); + io_req_queue_tw_complete(req, status); return true; } return false; @@ -159,7 +159,7 @@ void io_disarm_next(struct io_kiocb *req) req->flags &= ~REQ_F_ARM_LTIMEOUT; 
if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { io_remove_next_linked(req); - io_req_tw_post_queue(link, -ECANCELED, 0); + io_req_queue_tw_complete(link, -ECANCELED); } } else if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; @@ -168,7 +168,7 @@ void io_disarm_next(struct io_kiocb *req) link = io_disarm_linked_timeout(req); spin_unlock_irq(&ctx->timeout_lock); if (link) - io_req_tw_post_queue(link, -ECANCELED, 0); + io_req_queue_tw_complete(link, -ECANCELED); } if (unlikely((req->flags & REQ_F_FAIL) && !(req->flags & REQ_F_HARDLINK))) -- cgit From fa18fa2272c7469e470dcb7bf838ea50a25494ca Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 23 Nov 2022 11:33:40 +0000 Subject: io_uring: inline __io_req_complete_put() Inline __io_req_complete_put() into io_req_complete_post(); there are no other users. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1923a4dfe80fa877f859a22ed3df2d5fc8ecf02b.1669203009.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e445344f6f07..a0c71a2dce19 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -814,15 +814,19 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, return filled; } -static void __io_req_complete_put(struct io_kiocb *req) +void io_req_complete_post(struct io_kiocb *req) { + struct io_ring_ctx *ctx = req->ctx; + + io_cq_lock(ctx); + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe_req(ctx, req); + /* * If we're the last reference to this request, add to our locked * free_list cache. */ if (req_ref_put_and_test(req)) { - struct io_ring_ctx *ctx = req->ctx; - if (req->flags & IO_REQ_LINK_FLAGS) { if (req->flags & IO_DISARM_MASK) io_disarm_next(req); @@ -843,16 +847,6 @@ static void __io_req_complete_put(struct io_kiocb *req) wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; } -} - -void io_req_complete_post(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - - io_cq_lock(ctx); - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(ctx, req); - __io_req_complete_put(req); io_cq_unlock_post(ctx); } -- cgit From 1bec951c3809051f64a6957fe86d1b4786cc0313 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 23 Nov 2022 11:33:41 +0000 Subject: io_uring: iopoll protect complete_post io_req_complete_post() may be used by iopoll-enabled rings; grab locks in this case. That requires passing issue_flags to propagate the locking state.
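The iopoll-enabled rings referred to here, from the userspace side (an illustrative sketch, not part of the patch; assumes liburing): IORING_SETUP_IOPOLL rings reap completions by actively polling the device rather than via interrupts, which is why posting a completion needs uring_lock as above:

#include <liburing.h>

static int setup_iopoll_ring(struct io_uring *ring)
{
	/* I/O must target files that support polled completions,
	 * e.g. block devices opened O_DIRECT; CQEs are harvested by
	 * polling from io_uring_enter() rather than by interrupts. */
	return io_uring_queue_init(64, ring, IORING_SETUP_IOPOLL);
}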
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cc6d854065c57c838ca8e8806f707a226b70fd2d.1669203009.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 21 +++++++++++++++------ io_uring/io_uring.h | 10 ++++++++-- io_uring/kbuf.c | 4 ++-- io_uring/poll.c | 2 +- io_uring/uring_cmd.c | 2 +- 5 files changed, 27 insertions(+), 12 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a0c71a2dce19..cc27413129fc 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -814,7 +814,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, return filled; } -void io_req_complete_post(struct io_kiocb *req) +static void __io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -850,9 +850,18 @@ void io_req_complete_post(struct io_kiocb *req) io_cq_unlock_post(ctx); } -inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) +void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { - io_req_complete_post(req); + if (!(issue_flags & IO_URING_F_UNLOCKED) || + !(req->ctx->flags & IORING_SETUP_IOPOLL)) { + __io_req_complete_post(req); + } else { + struct io_ring_ctx *ctx = req->ctx; + + mutex_lock(&ctx->uring_lock); + __io_req_complete_post(req); + mutex_unlock(&ctx->uring_lock); + } } void io_req_complete_failed(struct io_kiocb *req, s32 res) @@ -866,7 +875,7 @@ void io_req_complete_failed(struct io_kiocb *req, s32 res) io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); if (def->fail) def->fail(req); - io_req_complete_post(req); + io_req_complete_post(req, 0); } /* @@ -1450,7 +1459,7 @@ void io_req_task_complete(struct io_kiocb *req, bool *locked) if (*locked) io_req_complete_defer(req); else - io_req_complete_post(req); + io_req_complete_post_tw(req, locked); } /* @@ -1718,7 +1727,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_defer(req); else - io_req_complete_post(req); + io_req_complete_post(req, issue_flags); } else if (ret != IOU_ISSUE_SKIP_COMPLETE) return ret; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 222af88df10f..b5b80bf03385 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -31,14 +31,20 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx); int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked); int io_run_local_work(struct io_ring_ctx *ctx); void io_req_complete_failed(struct io_kiocb *req, s32 res); -void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); -void io_req_complete_post(struct io_kiocb *req); +void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); +static inline void io_req_complete_post_tw(struct io_kiocb *req, bool *locked) +{ + unsigned flags = *locked ? 
0 : IO_URING_F_UNLOCKED; + + io_req_complete_post(req, flags); +} + struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); struct file *io_file_get_normal(struct io_kiocb *req, int fd); diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index e2c46889d5fa..e8150ed637d8 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -311,7 +311,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) /* complete before unlock, IOPOLL may need the lock */ io_req_set_res(req, ret, 0); - __io_req_complete(req, issue_flags); + io_req_complete_post(req, 0); io_ring_submit_unlock(ctx, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } @@ -462,7 +462,7 @@ err: req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ io_req_set_res(req, ret, 0); - __io_req_complete(req, issue_flags); + io_req_complete_post(req, 0); io_ring_submit_unlock(ctx, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } diff --git a/io_uring/poll.c b/io_uring/poll.c index cd4d98d622d2..4624e5eba63e 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -312,7 +312,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) io_poll_tw_hash_eject(req, locked); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) - io_req_complete_post(req); + io_req_complete_post_tw(req, locked); else if (ret == IOU_POLL_DONE) io_req_task_submit(req, locked); else diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index e50de0b6b9f8..446a189b78b0 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -56,7 +56,7 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); else - __io_req_complete(req, 0); + io_req_complete_post(req, 0); } EXPORT_SYMBOL_GPL(io_uring_cmd_done); -- cgit From 2dac1a159216b39ced8d78dba590c5d2f4249586 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 23 Nov 2022 11:33:42 +0000 Subject: io_uring: remove iopoll spinlock This reverts commit 2ccc92f4effcfa1c51c4fcf1e34d769099d3cad4 io_req_complete_post() should now behave well even in case of IOPOLL, we can remove completion_lock locking. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7e171c8b530656b14a671c59100ca260e46e7f2a.1669203009.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rw.c b/io_uring/rw.c index 61c326831949..1ce065709724 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1049,7 +1049,6 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) else if (!pos) return 0; - spin_lock(&ctx->completion_lock); prev = start; wq_list_for_each_resume(pos, prev) { struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); @@ -1064,11 +1063,11 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) req->cqe.flags = io_put_kbuf(req, 0); __io_fill_cqe_req(req->ctx, req); } - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); + if (unlikely(!nr_events)) return 0; + io_commit_cqring(ctx); io_cqring_ev_posted_iopoll(ctx); pos = start ? start->next : ctx->iopoll_list.first; wq_list_cut(&ctx->iopoll_list, prev, start); -- cgit From c06c6c5d276707e04cedbcc55625e984922118aa Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:52 -0800 Subject: io_uring: always lock in io_apoll_task_func This is required for the failure case (io_req_complete_failed) and is missing. 
The alternative would be to lock only in the failure path; however, all of the non-error paths in io_poll_check_events that do not return IOU_POLL_NO_ACTION end up locking anyway. The only extraneous lock would be for a multishot poll overflowing the CQE ring; however, multishot poll would probably benefit from being locked, as that will allow completions to be batched. So it seems reasonable to always lock. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-3-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/poll.c b/io_uring/poll.c index 4624e5eba63e..42aa10b50f6c 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -308,11 +308,12 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) if (ret == IOU_POLL_NO_ACTION) return; + io_tw_lock(req->ctx, locked); io_poll_remove_entries(req); io_poll_tw_hash_eject(req, locked); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) - io_req_complete_post_tw(req, locked); + io_req_task_complete(req, locked); else if (ret == IOU_POLL_DONE) io_req_task_submit(req, locked); else -- cgit From 973fc83f3a94bdffcacf482641db38f57c7c8609 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:53 -0800 Subject: io_uring: defer all io_req_complete_failed All failures happen under lock now, and can be deferred. To stay consistent (and to keep ordering) when a failure happens after some multishot CQEs have already been deferred, always defer failures. To make this obvious at the caller (and to help prevent a future bug), rename io_req_complete_failed to io_req_defer_failed. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-4-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 17 ++++++++--------- io_uring/io_uring.h | 2 +- io_uring/poll.c | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cc27413129fc..4888fe834920 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -864,7 +864,7 @@ void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) } } -void io_req_complete_failed(struct io_kiocb *req, s32 res) +void io_req_defer_failed(struct io_kiocb *req, s32 res) __must_hold(&ctx->uring_lock) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -875,7 +875,7 @@ void io_req_complete_failed(struct io_kiocb *req, s32 res) io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); if (def->fail) def->fail(req); - io_req_complete_post(req, 0); + io_req_complete_defer(req); } /* @@ -1231,9 +1231,8 @@ int io_run_local_work(struct io_ring_ctx *ctx) static void io_req_task_cancel(struct io_kiocb *req, bool *locked) { - /* not needed for normal modes, but SQPOLL depends on it */ io_tw_lock(req->ctx, locked); - io_req_complete_failed(req, req->cqe.res); + io_req_defer_failed(req, req->cqe.res); } void io_req_task_submit(struct io_kiocb *req, bool *locked) { @@ -1243,7 +1242,7 @@ if (likely(!(req->task->flags & PF_EXITING))) io_queue_sqe(req); else - io_req_complete_failed(req, -EFAULT); + io_req_defer_failed(req, -EFAULT); } void io_req_task_queue_fail(struct io_kiocb *req, int ret) @@ -1630,7 +1629,7 @@ queue: ret = io_req_prep_async(req); if (ret) { fail: - io_req_complete_failed(req, ret); + io_req_defer_failed(req, ret); return; } io_prep_async_link(req); @@ -1860,7 +1859,7 @@
static void io_queue_async(struct io_kiocb *req, int ret) struct io_kiocb *linked_timeout; if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { - io_req_complete_failed(req, ret); + io_req_defer_failed(req, ret); return; } @@ -1910,14 +1909,14 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) */ req->flags &= ~REQ_F_HARDLINK; req->flags |= REQ_F_LINK; - io_req_complete_failed(req, req->cqe.res); + io_req_defer_failed(req, req->cqe.res); } else if (unlikely(req->ctx->drain_active)) { io_drain_req(req); } else { int ret = io_req_prep_async(req); if (unlikely(ret)) - io_req_complete_failed(req, ret); + io_req_defer_failed(req, ret); else io_queue_iowq(req, NULL); } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index b5b80bf03385..a26d5aa7f3f3 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -30,7 +30,7 @@ bool io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked); int io_run_local_work(struct io_ring_ctx *ctx); -void io_req_complete_failed(struct io_kiocb *req, s32 res); +void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); diff --git a/io_uring/poll.c b/io_uring/poll.c index 42aa10b50f6c..4bd43e6f5b72 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -317,7 +317,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) else if (ret == IOU_POLL_DONE) io_req_task_submit(req, locked); else - io_req_complete_failed(req, ret); + io_req_defer_failed(req, ret); } static void __io_poll_execute(struct io_kiocb *req, int mask) -- cgit From 931147ddfa6e9ffd814272f1c0370c4740acbe17 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:54 -0800 Subject: io_uring: allow defer completion for aux posted cqes Multishot ops cannot use the compl_reqs list as the request must stay in the poll list, but that means they need to run each completion without benefiting from batching. Introduce batching infrastructure for small (i.e. 16-byte) CQEs only. This restriction is fine because there are no use cases posting 32-byte CQEs. Keep a batch of up to 16 posted results in the ring, and flush it in the same way as compl_reqs. 16 was chosen through experimentation on a microbenchmark ([1]), as well as by trying not to increase the size of the ring too much; the change grows the ring from 1216 to 1472 bytes.
[1]: https://github.com/DylanZA/liburing/commit/9ac66b36bcf4477bfafeff1c5f107896b7ae31cf Run with $ make -j && ./benchmark/reg.b -s 1 -t 2000 -r 10, which gives: baseline 8309 k/s; batch of 8: 18807 k/s; batch of 16: 19338 k/s; batch of 32: 20134 k/s. Suggested-by: Pavel Begunkov Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-5-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4888fe834920..28635e3e578a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -167,7 +167,8 @@ EXPORT_SYMBOL(io_uring_get_socket); static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) { - if (!wq_list_empty(&ctx->submit_state.compl_reqs)) + if (!wq_list_empty(&ctx->submit_state.compl_reqs) || + ctx->submit_state.cqes_count) __io_submit_flush_completions(ctx); } @@ -802,6 +803,21 @@ bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags return false; } +static void __io_flush_post_cqes(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + struct io_submit_state *state = &ctx->submit_state; + unsigned int i; + + lockdep_assert_held(&ctx->uring_lock); + for (i = 0; i < state->cqes_count; i++) { + struct io_uring_cqe *cqe = &state->cqes[i]; + + io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags, true); + } + state->cqes_count = 0; +} + bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow) @@ -1323,6 +1339,9 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_submit_state *state = &ctx->submit_state; io_cq_lock(ctx); + /* must come first to preserve CQE ordering in failure cases */ + if (state->cqes_count) + __io_flush_post_cqes(ctx); wq_list_for_each(node, prev, &state->compl_reqs) { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); @@ -1332,8 +1351,10 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) } io_cq_unlock_post(ctx); - io_free_batch_list(ctx, state->compl_reqs.first); - INIT_WQ_LIST(&state->compl_reqs); + if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { + io_free_batch_list(ctx, state->compl_reqs.first); + INIT_WQ_LIST(&state->compl_reqs); + } } /* -- cgit From 9b8c54755a2b16d4f23c0ea184b75e2edf77d906 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:55 -0800 Subject: io_uring: add io_aux_cqe which allows deferred completion Use the just introduced deferred post cqe completion state when possible in io_aux_cqe. If not possible, fall back to io_post_aux_cqe. This introduces a complication because of allow_overflow. For deferred completions we cannot know without locking the completion_lock if it will overflow (and even if we locked it, another post could sneak in and cause this cqe to be in overflow). However, since overflow protection is mostly a best-effort defence in depth to prevent infinite loops of CQEs for poll, just checking the overflow bit is going to be good enough and will result in at most 16 (the array size of the deferred cqes) overflows.
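To make the deferral logic described above concrete, here is a minimal userspace model of it. Everything in it is an illustrative stand-in rather than the kernel's actual types or functions; only the control flow is meant to mirror the io_aux_cqe added below.

/* Minimal model of io_aux_cqe's deferral path. All names and types
 * are illustrative stand-ins; only the control flow mirrors the
 * kernel code added in this patch. */
#include <stdbool.h>
#include <stdint.h>

#define CQE_BATCH 16	/* matches the 16-entry batch in submit_state */

struct model_cqe { uint64_t user_data; int32_t res; uint32_t flags; };

struct model_ctx {
	struct model_cqe cqes[CQE_BATCH];
	unsigned int cqes_count;
	bool cq_overflowed;	/* stands in for IO_CHECK_CQ_OVERFLOW_BIT */
};

/* Stand-in for the locked, immediate posting path (io_post_aux_cqe). */
static bool model_post_aux_cqe(struct model_ctx *ctx, struct model_cqe cqe)
{
	/* would lock the CQ, fill the CQE (possibly overflowing), unlock */
	return true;
}

/* Stand-in for __io_flush_post_cqes: drain the batch under the CQ lock. */
static void model_flush_post_cqes(struct model_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->cqes_count; i++)
		model_post_aux_cqe(ctx, ctx->cqes[i]);
	ctx->cqes_count = 0;
}

static bool model_aux_cqe(struct model_ctx *ctx, bool defer,
			  struct model_cqe cqe, bool allow_overflow)
{
	if (!defer)
		return model_post_aux_cqe(ctx, cqe);

	/* Batch full: flush it, then keep deferring into the emptied batch. */
	if (ctx->cqes_count == CQE_BATCH)
		model_flush_post_cqes(ctx);

	/* Best-effort overflow check: read without the completion lock it
	 * may be stale, so up to CQE_BATCH entries can still overflow,
	 * which is exactly the compromise described above. */
	if (!allow_overflow && ctx->cq_overflowed)
		return false;

	ctx->cqes[ctx->cqes_count++] = cqe;
	return true;
}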
Suggested-by: Pavel Begunkov Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-6-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 34 ++++++++++++++++++++++++++++++++++ io_uring/io_uring.h | 2 ++ io_uring/net.c | 7 ++++--- io_uring/poll.c | 4 ++-- 4 files changed, 42 insertions(+), 5 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 28635e3e578a..056aea917cd6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -830,6 +830,40 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, return filled; } +bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, + bool allow_overflow) +{ + struct io_uring_cqe *cqe; + unsigned int length; + + if (!defer) + return io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow); + + length = ARRAY_SIZE(ctx->submit_state.cqes); + + lockdep_assert_held(&ctx->uring_lock); + + if (ctx->submit_state.cqes_count == length) { + io_cq_lock(ctx); + __io_flush_post_cqes(ctx); + /* no need to flush - flush is deferred */ + spin_unlock(&ctx->completion_lock); + } + + /* For deferred completions this is not as strict as it is otherwise, + * however its main job is to prevent unbounded posted completions, + * and in that it works just as well. + */ + if (!allow_overflow && test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) + return false; + + cqe = &ctx->submit_state.cqes[ctx->submit_state.cqes_count++]; + cqe->user_data = user_data; + cqe->res = res; + cqe->flags = cflags; + return true; +} + static void __io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index a26d5aa7f3f3..dd02adf3d0df 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -36,6 +36,8 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags bool allow_overflow); bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); +bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, + bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); static inline void io_req_complete_post_tw(struct io_kiocb *req, bool *locked) diff --git a/io_uring/net.c b/io_uring/net.c index 0de6f78ad978..90342dcb6b1d 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -601,8 +601,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, } if (!mshot_finished) { - if (io_post_aux_cqe(req->ctx, req->cqe.user_data, *ret, - cflags | IORING_CQE_F_MORE, true)) { + if (io_aux_cqe(req->ctx, issue_flags & IO_URING_F_COMPLETE_DEFER, + req->cqe.user_data, *ret, cflags | IORING_CQE_F_MORE, true)) { io_recv_prep_retry(req); return false; } @@ -1320,7 +1320,8 @@ retry: if (ret < 0) return ret; - if (io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, true)) + if (io_aux_cqe(ctx, issue_flags & IO_URING_F_COMPLETE_DEFER, + req->cqe.user_data, ret, IORING_CQE_F_MORE, true)) goto retry; return -ECANCELED; diff --git a/io_uring/poll.c b/io_uring/poll.c index 4bd43e6f5b72..922c1a366c41 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -252,8 +252,8 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_post_aux_cqe(ctx, req->cqe.user_data, - mask, IORING_CQE_F_MORE, false)) { + if (!io_aux_cqe(ctx, *locked, req->cqe.user_data, + mask, IORING_CQE_F_MORE, false)) { io_req_set_res(req,
mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } -- cgit From a77ab745f28d5ab2ce51d0e44e85af942bb77d47 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:56 -0800 Subject: io_uring: make io_fill_cqe_aux static This is only used in io_uring.c. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-7-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- io_uring/io_uring.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 056aea917cd6..fea84e51e56f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -770,8 +770,8 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) return &rings->cqes[off]; } -bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, - bool allow_overflow) +static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, + bool allow_overflow) { struct io_uring_cqe *cqe; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index dd02adf3d0df..46694f40bf72 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -34,8 +34,6 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); -bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, - bool allow_overflow); bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); -- cgit From 2e2ef4a1dab980d88a1ab45bf0e28c8851999e33 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:57 -0800 Subject: io_uring: add lockdep assertion in io_fill_cqe_aux Add an assertion for the completion lock to io_fill_cqe_aux. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-8-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fea84e51e56f..03051e1fa02e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -775,6 +775,8 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 { struct io_uring_cqe *cqe; + lockdep_assert_held(&ctx->completion_lock); + ctx->cq_extra++; /* -- cgit From b529c96a896b7bea8464a58d350836cc106d70bd Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:58 -0800 Subject: io_uring: remove overflow param from io_post_aux_cqe The only call sites which would not allow overflow are also the call sites which would use io_aux_cqe, as they care about ordering. So remove this parameter from io_post_aux_cqe.
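For context on the msg_ring call sites touched below: from userspace, an IORING_OP_MSG_RING that cannot deliver its CQE still completes with -EOVERFLOW, and redelivery remains the sender's job. A hedged sketch of that contract using liburing's io_uring_prep_msg_ring (available since liburing 2.2); the retry-immediately policy is purely illustrative, not something the patch prescribes.

/* Sketch: post a message CQE to another ring, retrying on -EOVERFLOW.
 * Real code would back off or wait for the target ring to drain
 * instead of retrying in a tight loop. */
#include <errno.h>
#include <liburing.h>

static int msg_other_ring(struct io_uring *src, int target_ring_fd,
			  __u64 data, unsigned int len)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret;

	for (;;) {
		sqe = io_uring_get_sqe(src);
		if (!sqe)
			return -EBUSY;
		io_uring_prep_msg_ring(sqe, target_ring_fd, len, data, 0);

		io_uring_submit(src);
		ret = io_uring_wait_cqe(src, &cqe);
		if (ret < 0)
			return ret;
		ret = cqe->res;
		io_uring_cqe_seen(src, cqe);

		if (ret != -EOVERFLOW)
			return ret;
		/* Target CQ could not take the message; the sender must
		 * deliver it again, as the comment in the diff notes. */
	}
}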
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-9-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 12 ++++++++---- io_uring/io_uring.h | 3 +-- io_uring/msg_ring.c | 4 ++-- io_uring/rsrc.c | 4 ++-- 4 files changed, 13 insertions(+), 10 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 03051e1fa02e..7ed9cbeb573f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -820,9 +820,8 @@ static void __io_flush_post_cqes(struct io_ring_ctx *ctx) state->cqes_count = 0; } -bool io_post_aux_cqe(struct io_ring_ctx *ctx, - u64 user_data, s32 res, u32 cflags, - bool allow_overflow) +static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, + bool allow_overflow) { bool filled; @@ -832,6 +831,11 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, return filled; } +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) +{ + return __io_post_aux_cqe(ctx, user_data, res, cflags, true); +} + bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, bool allow_overflow) { @@ -839,7 +843,7 @@ bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 unsigned int length; if (!defer) - return io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow); + return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow); length = ARRAY_SIZE(ctx->submit_state.cqes); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 46694f40bf72..dcb8e3468f1d 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -32,8 +32,7 @@ int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked); int io_run_local_work(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, - bool allow_overflow); +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 90d2fc6fd80e..afb543aab9f6 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -31,7 +31,7 @@ static int io_msg_ring_data(struct io_kiocb *req) if (msg->src_fd || msg->dst_fd || msg->flags) return -EINVAL; - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true)) + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) return 0; return -EOVERFLOW; @@ -116,7 +116,7 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) * completes with -EOVERFLOW, then the sender must ensure that a * later IORING_OP_MSG_RING delivers the message. 
*/ - if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true)) + if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) ret = -EOVERFLOW; out_unlock: io_double_unlock_ctx(ctx, target_ctx, issue_flags); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 187f1c83e779..133608200769 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -170,10 +170,10 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) if (prsrc->tag) { if (ctx->flags & IORING_SETUP_IOPOLL) { mutex_lock(&ctx->uring_lock); - io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true); + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); mutex_unlock(&ctx->uring_lock); } else { - io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true); + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); } } -- cgit From 9a6924519e5e882631a7fff429facca838207e45 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:59 -0800 Subject: io_uring: allow multishot polled reqs to defer completion Until now there was no reason for multishot polled requests to defer completions as there was no functional difference. However now this will actually defer the completions, for a performance win. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-10-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7ed9cbeb573f..72c97af4f292 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1803,7 +1803,8 @@ int io_poll_issue(struct io_kiocb *req, bool *locked) io_tw_lock(req->ctx, locked); if (unlikely(req->task->flags & PF_EXITING)) return -EFAULT; - return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT); + return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| + IO_URING_F_COMPLETE_DEFER); } struct io_wq_work *io_wq_free_work(struct io_wq_work *work) -- cgit From 27f35fe9096b183d45ff6f22ad277ddf107d8428 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Fri, 25 Nov 2022 02:34:10 -0800 Subject: io_uring: remove io_req_complete_post_tw It's only used in one place. Inline it. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221125103412.1425305-2-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/io_uring.h | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 72c97af4f292..24aa049fe7e1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1519,7 +1519,7 @@ void io_req_task_complete(struct io_kiocb *req, bool *locked) if (*locked) io_req_complete_defer(req); else - io_req_complete_post_tw(req, locked); + io_req_complete_post(req, IO_URING_F_UNLOCKED); } /* diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index dcb8e3468f1d..76659d2fc90c 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -37,13 +37,6 @@ bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); -static inline void io_req_complete_post_tw(struct io_kiocb *req, bool *locked) -{ - unsigned flags = *locked ? 
0 : IO_URING_F_UNLOCKED; - - io_req_complete_post(req, flags); -} - struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); struct file *io_file_get_normal(struct io_kiocb *req, int fd); -- cgit From 10d8bc35416d9e83ffe9644478756281c7bd4f52 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Fri, 25 Nov 2022 02:34:11 -0800 Subject: io_uring: spelling fix s/pushs/pushes/ Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221125103412.1425305-3-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 24aa049fe7e1..d9c9e347346d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2706,7 +2706,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * lock(&ep->mtx); * * Users may get EPOLLIN meanwhile seeing nothing in cqring, this - * pushs them to do the flush. + * pushes them to do the flush. */ if (io_cqring_events(ctx) || io_has_work(ctx)) -- cgit From c3b490930dbe6a6c98d3820f445757ddec1efb08 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Nov 2022 19:46:40 +0000 Subject: io_uring: don't use complete_post in kbuf Now that we're handling IOPOLL completions more generically, get rid of the uses of _post() and send requests through the normal path. It may have some extra merits performance-wise, but we don't care much as there is a better interface for selected buffers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4deded706587f55b006dc33adf0c13cfc3b2319f.1669310258.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'io_uring') diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index e8150ed637d8..4a6401080c1f 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -306,14 +306,11 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) if (!bl->buf_nr_pages) ret = __io_remove_buffers(ctx, bl, p->nbufs); } + io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) req_set_fail(req); - - /* complete before unlock, IOPOLL may need the lock */ io_req_set_res(req, ret, 0); - io_req_complete_post(req, 0); - io_ring_submit_unlock(ctx, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; + return IOU_OK; } int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -458,13 +455,12 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) ret = io_add_buffers(ctx, p, bl); err: + io_ring_submit_unlock(ctx, issue_flags); + if (ret < 0) req_set_fail(req); - /* complete before unlock, IOPOLL may need the lock */ io_req_set_res(req, ret, 0); - io_req_complete_post(req, 0); - io_ring_submit_unlock(ctx, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; + return IOU_OK; } int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) -- cgit From 5d772916855f593672de55c437925daccc8ecd73 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Nov 2022 19:46:41 +0000 Subject: io_uring: keep unlock_post inlined in hot path This partially reverts 6c16fe3c16bdc ("io_uring: kill io_cqring_ev_posted() and __io_cq_unlock_post()"). The point of the redundant __io_cq_unlock_post() was always to keep it inlined into __io_submit_flush_completions(). Inline it back and rename it, in the hope of clarifying the intention behind it.
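The pattern being restored here is a generic one: keep a static inline copy for the single hot caller and an out-of-line wrapper for everyone else. A compilable toy sketch of just that shape, with stand-in names and a pthread mutex in place of the kernel's completion lock:

/* Toy illustration of the inline-in-hot-path pattern below. The names
 * and locking are stand-ins, not the kernel's. */
#include <pthread.h>

struct toy_ring { pthread_mutex_t completion_lock; };

static void toy_commit_cqring(struct toy_ring *r) { /* publish new CQEs */ }
static void toy_wake_waiters(struct toy_ring *r) { /* wake CQ waiters */ }

/* Inlined into the one performance-critical caller. */
static inline void toy_cq_unlock_post_inline(struct toy_ring *r)
{
	toy_commit_cqring(r);
	pthread_mutex_unlock(&r->completion_lock);
	toy_wake_waiters(r);
}

/* Out-of-line copy for colder callers, so the inline body is not
 * duplicated at every call site. */
void toy_cq_unlock_post(struct toy_ring *r)
{
	toy_cq_unlock_post_inline(r);
}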
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/372a16c485fca44c069be2e92fc5e7332a1d7fd7.1669310258.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d9c9e347346d..adecdf65b130 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -582,7 +582,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_eventfd_flush_signal(ctx); } -void io_cq_unlock_post(struct io_ring_ctx *ctx) +/* keep it inlined for io_submit_flush_completions() */ +static inline void io_cq_unlock_post_inline(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { io_commit_cqring(ctx); @@ -592,6 +593,12 @@ void io_cq_unlock_post(struct io_ring_ctx *ctx) io_cqring_wake(ctx); } +void io_cq_unlock_post(struct io_ring_ctx *ctx) + __releases(ctx->completion_lock) +{ + io_cq_unlock_post_inline(ctx); +} + /* Returns true if there are no backlogged entries after the flush */ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { @@ -1389,7 +1396,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) if (!(req->flags & REQ_F_CQE_SKIP)) __io_fill_cqe_req(ctx, req); } - io_cq_unlock_post(ctx); + io_cq_unlock_post_inline(ctx); if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); -- cgit
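Taken together, the main beneficiaries of this series are multishot requests, which can now post their extra CQEs through the deferred batch instead of one locked post at a time. A hedged userspace sketch of the kind of consumer involved, using liburing's multishot accept (liburing 2.2 and kernel 5.19 or newer assumed); the port number and the reap-16-at-a-time loop are illustrative choices, not requirements.

/* Multishot accept: one SQE, one CQE per incoming connection. The
 * 16-wide reap mirrors the kernel's deferred-CQE batch size, but any
 * width works. Error handling is trimmed for brevity. */
#include <liburing.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqes[16];
	struct sockaddr_in addr;
	unsigned int n, i;
	int fd;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	addr.sin_port = htons(12345);	/* illustrative port */
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	listen(fd, SOMAXCONN);

	io_uring_queue_init(64, &ring, 0);
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_multishot_accept(sqe, fd, NULL, NULL, 0);
	io_uring_submit(&ring);

	for (;;) {
		io_uring_wait_cqe(&ring, &cqes[0]);
		n = io_uring_peek_batch_cqe(&ring, cqes, 16);
		for (i = 0; i < n; i++) {
			if (cqes[i]->res >= 0)
				close(cqes[i]->res);	/* the accepted fd */
			if (!(cqes[i]->flags & IORING_CQE_F_MORE)) {
				io_uring_cq_advance(&ring, n);
				return 0;	/* multishot terminated */
			}
		}
		io_uring_cq_advance(&ring, n);
	}
}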