From e307e6698165ca6508ed42c69cb1be76c8eb6a3c Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 27 Oct 2022 20:34:45 +0200 Subject: io_uring/net: introduce IORING_SEND_ZC_REPORT_USAGE flag It might be useful for applications to detect if a zero copy transfer with SEND[MSG]_ZC was actually possible or not. The application can fallback to plain SEND[MSG] in order to avoid the overhead of two cqes per request. Or it can generate a log message that could indicate to an administrator that no zero copy was possible and could explain degraded performance. Cc: stable@vger.kernel.org # 6.1 Link: https://lore.kernel.org/io-uring/fb6a7599-8a9b-15e5-9b64-6cd9d01c6ff4@gmail.com/T/#m2b0d9df94ce43b0e69e6c089bdff0ce6babbdfaa Signed-off-by: Stefan Metzmacher Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/8945b01756d902f5d5b0667f20b957ad3f742e5e.1666895626.git.metze@samba.org Signed-off-by: Jens Axboe --- io_uring/net.c | 6 +++++- io_uring/notif.c | 12 ++++++++++++ io_uring/notif.h | 3 +++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index ab83da7e80f0..03825b043afe 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -937,7 +937,8 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) zc->flags = READ_ONCE(sqe->ioprio); if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | - IORING_RECVSEND_FIXED_BUF)) + IORING_RECVSEND_FIXED_BUF | + IORING_SEND_ZC_REPORT_USAGE)) return -EINVAL; notif = zc->notif = io_alloc_notif(ctx); if (!notif) @@ -955,6 +956,9 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->imu = READ_ONCE(ctx->user_bufs[idx]); io_req_set_rsrc_node(notif, ctx, 0); } + if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) { + io_notif_to_data(notif)->zc_report = true; + } if (req->opcode == IORING_OP_SEND_ZC) { if (READ_ONCE(sqe->__pad3[0])) diff --git a/io_uring/notif.c b/io_uring/notif.c index e37c6569d82e..4bfef10161fa 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -18,6 +18,10 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) __io_unaccount_mem(ctx->user, nd->account_pages); nd->account_pages = 0; } + + if (nd->zc_report && (nd->zc_copied || !nd->zc_used)) + notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; + io_req_task_complete(notif, locked); } @@ -28,6 +32,13 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); struct io_kiocb *notif = cmd_to_io_kiocb(nd); + if (nd->zc_report) { + if (success && !nd->zc_used && skb) + WRITE_ONCE(nd->zc_used, true); + else if (!success && !nd->zc_copied) + WRITE_ONCE(nd->zc_copied, true); + } + if (refcount_dec_and_test(&uarg->refcnt)) { notif->io_task_work.func = __io_notif_complete_tw; io_req_task_work_add(notif); @@ -55,6 +66,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) nd->account_pages = 0; nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; nd->uarg.callback = io_uring_tx_zerocopy_callback; + nd->zc_report = nd->zc_used = nd->zc_copied = false; refcount_set(&nd->uarg.refcnt, 1); return notif; } diff --git a/io_uring/notif.h b/io_uring/notif.h index 5b4d710c8ca5..4ae696273c78 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -13,6 +13,9 @@ struct io_notif_data { struct file *file; struct ubuf_info uarg; unsigned long account_pages; + bool zc_report; + bool zc_used; + bool zc_copied; }; void io_notif_flush(struct io_kiocb *notif); -- cgit From 
3671163beb633fbe3297b8e30369b640ce4bd690 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 4 Nov 2022 10:59:40 +0000 Subject: io_uring: move kbuf put out of generic tw complete There are multiple users of io_req_task_complete() including zc notifications, but only read requests use selected buffers. As we already have an rw specific tw function, move io_put_kbuf() in there. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/94374c7649aaefc3a17808dc4701f25ccd457e25.1667557923.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 ------ io_uring/rw.c | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8840cf3e20f2..bdb7e15f1c48 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1476,12 +1476,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) void io_req_task_complete(struct io_kiocb *req, bool *locked) { - if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; - - req->cqe.flags |= io_put_kbuf(req, issue_flags); - } - if (*locked) io_req_complete_defer(req); else diff --git a/io_uring/rw.c b/io_uring/rw.c index bb47cc4da713..1ce065709724 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -286,6 +286,12 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res) static void io_req_rw_complete(struct io_kiocb *req, bool *locked) { io_req_io_end(req); + + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { + unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + + req->cqe.flags |= io_put_kbuf(req, issue_flags); + } io_req_task_complete(req, locked); } -- cgit From 5bc8e8884b4e9579ca57e33d42d60090b7288050 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 4 Nov 2022 10:59:41 +0000 Subject: io_uring/net: remove extra notif rsrc setup io_send_zc_prep() sets up notification's rsrc_node when needed, don't unconditionally install it on notif alloc. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/dbe4875ac33e180b9799d8537a5e27935e82aac4.1667557923.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.c | 1 - 1 file changed, 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/notif.c b/io_uring/notif.c index 4bfef10161fa..59dafc42b8e0 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -60,7 +60,6 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) notif->task = current; io_get_task_refs(1); notif->rsrc_node = NULL; - io_req_set_rsrc_node(notif, ctx, 0); nd = io_notif_to_data(notif); nd->account_pages = 0; -- cgit From fc1dd0d4fa523916529ddf7c56d7b866312c4262 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 4 Nov 2022 10:59:42 +0000 Subject: io_uring/net: preset notif tw handler We're going to have multiple notification tw functions. In preparation for future changes default the tw callback in advance so later we can replace it with other versions. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7acdbea5e20eadd844513320cd454af14ba50f64.1667557923.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'io_uring') diff --git a/io_uring/notif.c b/io_uring/notif.c index 59dafc42b8e0..6afb58b94297 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -39,10 +39,8 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, WRITE_ONCE(nd->zc_copied, true); } - if (refcount_dec_and_test(&uarg->refcnt)) { - notif->io_task_work.func = __io_notif_complete_tw; + if (refcount_dec_and_test(&uarg->refcnt)) io_req_task_work_add(notif); - } } struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) @@ -60,6 +58,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) notif->task = current; io_get_task_refs(1); notif->rsrc_node = NULL; + notif->io_task_work.func = __io_notif_complete_tw; nd = io_notif_to_data(notif); nd->account_pages = 0; @@ -76,8 +75,6 @@ void io_notif_flush(struct io_kiocb *notif) struct io_notif_data *nd = io_notif_to_data(notif); /* drop slot's master ref */ - if (refcount_dec_and_test(&nd->uarg.refcnt)) { - notif->io_task_work.func = __io_notif_complete_tw; + if (refcount_dec_and_test(&nd->uarg.refcnt)) io_req_task_work_add(notif); - } } -- cgit From 7fa8e84192fd8dbc97b4c8c1acfd10017c3dd7b6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 4 Nov 2022 10:59:43 +0000 Subject: io_uring/net: rename io_uring_tx_zerocopy_callback Just a simple renaming patch, io_uring_tx_zerocopy_callback() is too bulky and doesn't follow usual naming style. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/24d78325403ca6dcb1ec4bced1e33cacc9b832a5.1667557923.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/notif.c b/io_uring/notif.c index 6afb58b94297..5b0e7bb1198f 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -25,9 +25,8 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) io_req_task_complete(notif, locked); } -static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, - struct ubuf_info *uarg, - bool success) +static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, + bool success) { struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); struct io_kiocb *notif = cmd_to_io_kiocb(nd); @@ -63,7 +62,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) nd = io_notif_to_data(notif); nd->account_pages = 0; nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; - nd->uarg.callback = io_uring_tx_zerocopy_callback; + nd->uarg.callback = io_tx_ubuf_callback; nd->zc_report = nd->zc_used = nd->zc_copied = false; refcount_set(&nd->uarg.refcnt, 1); return notif; -- cgit From bedd20bcf3b08c5d2f03f30a83a10022bde5e596 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 4 Nov 2022 10:59:44 +0000 Subject: io_uring/net: inline io_notif_flush() io_notif_flush() is pretty simple, we can inline it. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/332359e7bd124138dfe51340bbec829c9b265c18.1667557923.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.c | 10 ---------- io_uring/notif.h | 11 ++++++++++- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'io_uring') diff --git a/io_uring/notif.c b/io_uring/notif.c index 5b0e7bb1198f..c287adf24e66 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -67,13 +67,3 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) refcount_set(&nd->uarg.refcnt, 1); return notif; } - -void io_notif_flush(struct io_kiocb *notif) - __must_hold(&slot->notif->ctx->uring_lock) -{ - struct io_notif_data *nd = io_notif_to_data(notif); - - /* drop slot's master ref */ - if (refcount_dec_and_test(&nd->uarg.refcnt)) - io_req_task_work_add(notif); -} diff --git a/io_uring/notif.h b/io_uring/notif.h index 4ae696273c78..7f00176020d3 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -18,7 +18,6 @@ struct io_notif_data { bool zc_copied; }; -void io_notif_flush(struct io_kiocb *notif); struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx); static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif) @@ -26,6 +25,16 @@ static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif) return io_kiocb_to_cmd(notif, struct io_notif_data); } +static inline void io_notif_flush(struct io_kiocb *notif) + __must_hold(¬if->ctx->uring_lock) +{ + struct io_notif_data *nd = io_notif_to_data(notif); + + /* drop slot's master ref */ + if (refcount_dec_and_test(&nd->uarg.refcnt)) + io_req_task_work_add(notif); +} + static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len) { struct io_ring_ctx *ctx = notif->ctx; -- cgit From 40725d1b960f19a11a1ebd1ab537844ebf39347c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 4 Nov 2022 10:59:45 +0000 Subject: io_uring: move zc reporting from the hot path Add custom tw and notif callbacks on top of usual bits also handling zc reporting. That moves it from the hot path. 
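For illustration only (an editorial sketch, not part of the patch): how an application might consume the zc usage report handled above. It assumes liburing >= 2.3 (io_uring_prep_send_zc()) and uapi definitions of IORING_SEND_ZC_REPORT_USAGE and IORING_NOTIF_USAGE_ZC_COPIED; error handling and user_data matching are elided.

#include <liburing.h>

/* Sketch: returns 1 if the kernel reported a copy fallback. */
static int send_zc_was_copied(struct io_uring *ring, int sockfd,
			      const void *buf, size_t len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int copied = 0;

	io_uring_prep_send_zc(sqe, sockfd, buf, len, 0,
			      IORING_SEND_ZC_REPORT_USAGE);
	io_uring_submit(ring);

	/* Two completions: the send result (IORING_CQE_F_MORE set),
	 * then the notification (IORING_CQE_F_NOTIF set) once the
	 * kernel is done with the buffer; its res carries the report. */
	for (int i = 0; i < 2; i++) {
		io_uring_wait_cqe(ring, &cqe);
		if (cqe->flags & IORING_CQE_F_NOTIF)
			copied = !!(cqe->res & IORING_NOTIF_USAGE_ZC_COPIED);
		io_uring_cqe_seen(ring, cqe);
	}
	return copied;
}

A caller can use the result to fall back to plain SEND[MSG] or to log degraded performance, per the motivation at the top of this series.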
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/40de4a6409042478e1f35adc4912e23226cb1b5c.1667557923.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 22 ++++++++++++++-------- io_uring/notif.c | 31 +++++++++++++++++++++++++++---- io_uring/notif.h | 1 + 3 files changed, 42 insertions(+), 12 deletions(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index 03825b043afe..9e3da845f906 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -923,6 +923,9 @@ void io_send_zc_cleanup(struct io_kiocb *req) } } +#define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF) +#define IO_ZC_FLAGS_VALID (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE) + int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); @@ -935,11 +938,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->flags & REQ_F_CQE_SKIP) return -EINVAL; - zc->flags = READ_ONCE(sqe->ioprio); - if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | - IORING_RECVSEND_FIXED_BUF | - IORING_SEND_ZC_REPORT_USAGE)) - return -EINVAL; notif = zc->notif = io_alloc_notif(ctx); if (!notif) return -ENOMEM; @@ -947,6 +945,17 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) notif->cqe.res = 0; notif->cqe.flags = IORING_CQE_F_NOTIF; req->flags |= REQ_F_NEED_CLEANUP; + + zc->flags = READ_ONCE(sqe->ioprio); + if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) { + if (zc->flags & ~IO_ZC_FLAGS_VALID) + return -EINVAL; + if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) { + io_notif_set_extended(notif); + io_notif_to_data(notif)->zc_report = true; + } + } + if (zc->flags & IORING_RECVSEND_FIXED_BUF) { unsigned idx = READ_ONCE(sqe->buf_index); @@ -956,9 +965,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->imu = READ_ONCE(ctx->user_bufs[idx]); io_req_set_rsrc_node(notif, ctx, 0); } - if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) { - io_notif_to_data(notif)->zc_report = true; - } if (req->opcode == IORING_OP_SEND_ZC) { if (READ_ONCE(sqe->__pad3[0])) diff --git a/io_uring/notif.c b/io_uring/notif.c index c287adf24e66..9864bde3e2ef 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -18,11 +18,17 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) __io_unaccount_mem(ctx->user, nd->account_pages); nd->account_pages = 0; } + io_req_task_complete(notif, locked); +} + +static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked) +{ + struct io_notif_data *nd = io_notif_to_data(notif); if (nd->zc_report && (nd->zc_copied || !nd->zc_used)) notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; - io_req_task_complete(notif, locked); + __io_notif_complete_tw(notif, locked); } static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, @@ -31,15 +37,33 @@ static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); struct io_kiocb *notif = cmd_to_io_kiocb(nd); + if (refcount_dec_and_test(&uarg->refcnt)) + io_req_task_work_add(notif); +} + +static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg, + bool success) +{ + struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); + if (nd->zc_report) { if (success && !nd->zc_used && skb) WRITE_ONCE(nd->zc_used, true); else if (!success && !nd->zc_copied) WRITE_ONCE(nd->zc_copied, true); } + 
io_tx_ubuf_callback(skb, uarg, success); +} - if (refcount_dec_and_test(&uarg->refcnt)) - io_req_task_work_add(notif); +void io_notif_set_extended(struct io_kiocb *notif) +{ + struct io_notif_data *nd = io_notif_to_data(notif); + + nd->zc_report = false; + nd->zc_used = false; + nd->zc_copied = false; + notif->io_task_work.func = io_notif_complete_tw_ext; + io_notif_to_data(notif)->uarg.callback = io_tx_ubuf_callback_ext; } struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) @@ -63,7 +87,6 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) nd->account_pages = 0; nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; nd->uarg.callback = io_tx_ubuf_callback; - nd->zc_report = nd->zc_used = nd->zc_copied = false; refcount_set(&nd->uarg.refcnt, 1); return notif; } diff --git a/io_uring/notif.h b/io_uring/notif.h index 7f00176020d3..c88c800cd89d 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -19,6 +19,7 @@ struct io_notif_data { }; struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx); +void io_notif_set_extended(struct io_kiocb *notif); static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif) { -- cgit From 42385b02baad0df55474b7f36dc13e0d4ffd0cc0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 4 Nov 2022 10:59:46 +0000 Subject: io_uring/net: move mm accounting to a slower path We can also move mm accounting to the extended callbacks. It removes a few cycles from the hot path including skipping one function call and setting io_req_task_complete as a callback directly. For user-backed I/O it shouldn't make any difference taking into consideration atomic mm accounting and page pinning. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1062f270273ad11c1b7b45ec59a6a317533d5e64.1667557923.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 3 +++ io_uring/notif.c | 31 +++++++++++++------------------ 2 files changed, 16 insertions(+), 18 deletions(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index 9e3da845f906..966019fcbe8c 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1097,6 +1097,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) return ret; msg.sg_from_iter = io_sg_from_iter; } else { + io_notif_set_extended(zc->notif); ret = import_single_range(WRITE, zc->buf, zc->len, &iov, &msg.msg_iter); if (unlikely(ret)) @@ -1158,6 +1159,8 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; + io_notif_set_extended(sr->notif); + sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; diff --git a/io_uring/notif.c b/io_uring/notif.c index 9864bde3e2ef..c4bb793ebf0e 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -9,11 +9,14 @@ #include "notif.h" #include "rsrc.h" -static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) +static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked) { struct io_notif_data *nd = io_notif_to_data(notif); struct io_ring_ctx *ctx = notif->ctx; + if (nd->zc_report && (nd->zc_copied || !nd->zc_used)) + notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; + if (nd->account_pages && ctx->user) { __io_unaccount_mem(ctx->user, nd->account_pages); nd->account_pages = 0; @@ -21,16 +24,6 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) io_req_task_complete(notif, locked); } -static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked) -{ - struct io_notif_data *nd = io_notif_to_data(notif); - - if
(nd->zc_report && (nd->zc_copied || !nd->zc_used)) - notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; - - __io_notif_complete_tw(notif, locked); -} - static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, bool success) { @@ -59,11 +52,14 @@ void io_notif_set_extended(struct io_kiocb *notif) { struct io_notif_data *nd = io_notif_to_data(notif); - nd->zc_report = false; - nd->zc_used = false; - nd->zc_copied = false; - notif->io_task_work.func = io_notif_complete_tw_ext; - io_notif_to_data(notif)->uarg.callback = io_tx_ubuf_callback_ext; + if (nd->uarg.callback != io_tx_ubuf_callback_ext) { + nd->account_pages = 0; + nd->zc_report = false; + nd->zc_used = false; + nd->zc_copied = false; + nd->uarg.callback = io_tx_ubuf_callback_ext; + notif->io_task_work.func = io_notif_complete_tw_ext; + } } struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) @@ -81,10 +77,9 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) notif->task = current; io_get_task_refs(1); notif->rsrc_node = NULL; - notif->io_task_work.func = __io_notif_complete_tw; + notif->io_task_work.func = io_req_task_complete; nd = io_notif_to_data(notif); - nd->account_pages = 0; nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; nd->uarg.callback = io_tx_ubuf_callback; refcount_set(&nd->uarg.refcnt, 1); -- cgit From df730ec21f7ba395b1b22e7f93a3a85b1d1b7882 Mon Sep 17 00:00:00 2001 From: Xinghui Li Date: Wed, 2 Nov 2022 16:25:03 +0800 Subject: io_uring: fix two assignments in if conditions Fixes two errors: "ERROR: do not use assignment in if condition 130: FILE: io_uring/net.c:130: + if (!(issue_flags & IO_URING_F_UNLOCKED) && ERROR: do not use assignment in if condition 599: FILE: io_uring/poll.c:599: + } else if (!(issue_flags & IO_URING_F_UNLOCKED) &&" reported by checkpatch.pl in net.c and poll.c . 
Signed-off-by: Xinghui Li Reported-by: kernel test robot Link: https://lore.kernel.org/r/20221102082503.32236-1-korantwork@gmail.com [axboe: style tweaks] Signed-off-by: Jens Axboe --- io_uring/net.c | 16 +++++++++------- io_uring/poll.c | 7 +++++-- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index 966019fcbe8c..f32e0daeedd8 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -125,13 +125,15 @@ static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req, struct io_cache_entry *entry; struct io_async_msghdr *hdr; - if (!(issue_flags & IO_URING_F_UNLOCKED) && - (entry = io_alloc_cache_get(&ctx->netmsg_cache)) != NULL) { - hdr = container_of(entry, struct io_async_msghdr, cache); - hdr->free_iov = NULL; - req->flags |= REQ_F_ASYNC_DATA; - req->async_data = hdr; - return hdr; + if (!(issue_flags & IO_URING_F_UNLOCKED)) { + entry = io_alloc_cache_get(&ctx->netmsg_cache); + if (entry) { + hdr = container_of(entry, struct io_async_msghdr, cache); + hdr->free_iov = NULL; + req->flags |= REQ_F_ASYNC_DATA; + req->async_data = hdr; + return hdr; + } } if (!io_alloc_async_data(req)) { diff --git a/io_uring/poll.c b/io_uring/poll.c index 055632e9092a..58e02d963961 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -615,10 +615,13 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, if (req->flags & REQ_F_POLLED) { apoll = req->apoll; kfree(apoll->double_poll); - } else if (!(issue_flags & IO_URING_F_UNLOCKED) && - (entry = io_alloc_cache_get(&ctx->apoll_cache)) != NULL) { + } else if (!(issue_flags & IO_URING_F_UNLOCKED)) { + entry = io_alloc_cache_get(&ctx->apoll_cache); + if (entry == NULL) + goto alloc_apoll; apoll = container_of(entry, struct async_poll, cache); } else { +alloc_apoll: apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); if (unlikely(!apoll)) return NULL; -- cgit From ef67fcb41de6d3d5bbb16aaa66d4c706c4cacf54 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Mon, 7 Nov 2022 04:33:49 -0800 Subject: io_uring: do not always force run task_work in io_uring_register Running task work when not needed can unnecessarily delay operations. Specifically, IORING_SETUP_DEFER_TASKRUN tries to avoid running task work until the user requests it. Therefore do not run it in io_uring_register any more. The one catch is that io_rsrc_ref_quiesce expects it to have run in order to process all outstanding references, and so reorder its loop to do this.
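For context (an illustrative sketch, not part of the patch; assumes a liburing exposing io_uring_queue_init_params() and a 6.1+ kernel), a ring whose task work stays deferred until the submitter reaps completions:

#include <liburing.h>

/* Sketch: DEFER_TASKRUN requires SINGLE_ISSUER, and the task that
 * creates the ring must be the one waiting on completions. */
static int setup_deferred_ring(struct io_uring *ring)
{
	struct io_uring_params p = { };

	p.flags = IORING_SETUP_SINGLE_ISSUER |
		  IORING_SETUP_DEFER_TASKRUN;
	return io_uring_queue_init_params(64, ring, &p);
}

With such a ring, task work runs when the task itself enters the kernel to reap CQEs, which is exactly why forcing a run inside io_uring_register() was counterproductive.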
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221107123349.4106213-1-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 -- io_uring/rsrc.c | 7 ++++++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bdb7e15f1c48..cf68d16255a0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -4056,8 +4056,6 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, ctx = f.file->private_data; - io_run_task_work_ctx(ctx); - mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); mutex_unlock(&ctx->uring_lock); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 55d4ab96fb92..187f1c83e779 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -321,6 +321,11 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, if (atomic_dec_and_test(&data->refs)) break; mutex_unlock(&ctx->uring_lock); + + ret = io_run_task_work_sig(ctx); + if (ret < 0) + goto reinit; + flush_delayed_work(&ctx->rsrc_put_work); ret = wait_for_completion_interruptible(&data->done); if (!ret) { @@ -336,12 +341,12 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, } } +reinit: atomic_inc(&data->refs); /* wait for all works potentially completing data->done */ flush_delayed_work(&ctx->rsrc_put_work); reinit_completion(&data->done); - ret = io_run_task_work_sig(ctx); mutex_lock(&ctx->uring_lock); } while (ret >= 0); data->quiesce = false; -- cgit From 515e26961295bee9da5e26916c27739dca6c10e1 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Mon, 21 Nov 2022 07:43:42 -0700 Subject: io_uring: revert "io_uring fix multishot accept ordering" This is no longer needed after commit aa1df3a360a0 ("io_uring: fix CQE reordering"), since all reordering is now taken care of. This reverts commit cbd25748545c ("io_uring: fix multishot accept ordering"). Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221107125236.260132-2-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/net.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index f32e0daeedd8..f7b2b068464d 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1322,12 +1322,12 @@ retry: return IOU_OK; } - if (ret >= 0 && - io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, false)) + if (ret < 0) + return ret; + if (io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, true)) goto retry; - io_req_set_res(req, ret, 0); - return (issue_flags & IO_URING_F_MULTISHOT) ? IOU_STOP_MULTISHOT : IOU_OK; + return -ECANCELED; } int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -- cgit From e2ad599d1ed38fe743106f10d58a0cbfc00b51e2 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Mon, 7 Nov 2022 04:52:35 -0800 Subject: io_uring: allow multishot recv CQEs to overflow With commit aa1df3a360a0 ("io_uring: fix CQE reordering"), there are stronger guarantees for overflow ordering. Specifically ensuring that userspace will not receive out of order receive CQEs. Therefore this is not needed any more for recv/recvmsg. 
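For context (an illustrative sketch, not part of the patch; assumes liburing and a pending IORING_RECV_MULTISHOT request, with consume() a hypothetical callback), the ordering guarantee matters to consumers that treat each CQE as the next chunk of the stream:

#include <liburing.h>

extern void consume(const struct io_uring_cqe *cqe);	/* hypothetical */

static void drain_multishot_recv(struct io_uring *ring)
{
	struct io_uring_cqe *cqe;

	while (io_uring_wait_cqe(ring, &cqe) == 0) {
		int done = cqe->res <= 0 ||
			   !(cqe->flags & IORING_CQE_F_MORE);

		if (cqe->res > 0)
			consume(cqe);	/* one receive per CQE, in order */
		io_uring_cqe_seen(ring, cqe);
		if (done)
			break;	/* multishot ended; re-arm the recv */
	}
}

Since overflowed CQEs are now flushed in order, a loop like this stays correct even when completions go through the overflow path.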
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221107125236.260132-4-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/net.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index f7b2b068464d..0de6f78ad978 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -602,15 +602,11 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, if (!mshot_finished) { if (io_post_aux_cqe(req->ctx, req->cqe.user_data, *ret, - cflags | IORING_CQE_F_MORE, false)) { + cflags | IORING_CQE_F_MORE, true)) { io_recv_prep_retry(req); return false; } - /* - * Otherwise stop multishot but use the current result. - * Probably will end up going into overflow, but this means - * we cannot trust the ordering anymore - */ + /* Otherwise stop multishot but use the current result. */ } io_req_set_res(req, *ret, cflags); -- cgit From cd42a53d25d489317b9ae5213da721cde8cb7071 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Thu, 10 Nov 2022 14:03:13 +0800 Subject: io_uring/poll: remove outdated comments of caching Previous commit 13a99017ff19 ("io_uring: remove events caching atavisms") entirely removed the events caching optimization introduced by commit 81459350d581 ("io_uring: cache req->apoll->events in req->cflags"). Hence the related comment should also be removed to avoid misunderstanding. Fixes: 13a99017ff19 ("io_uring: remove events caching atavisms") Signed-off-by: Lin Ma Link: https://lore.kernel.org/r/20221110060313.16303-1-linma@zju.edu.cn Signed-off-by: Jens Axboe --- io_uring/poll.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'io_uring') diff --git a/io_uring/poll.c b/io_uring/poll.c index 58e02d963961..8fb8e781c02d 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -324,12 +324,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) static void __io_poll_execute(struct io_kiocb *req, int mask) { io_req_set_res(req, mask, 0); - /* - * This is useful for poll that is armed on behalf of another - * request, and where the wakeup path could be on a different - * CPU. We want to avoid pulling in req->apoll->events for that - * case. - */ + if (req->opcode == IORING_OP_POLL_ADD) req->io_task_work.func = io_poll_task_func; else -- cgit From 23a6c9ac4dbd7cccf5b909e78aa84192b65f2833 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Thu, 10 Nov 2022 20:21:03 +0800 Subject: io_uring: update outdated comment of callbacks Previous commit ebc11b6c6b87 ("io_uring: clean io-wq callbacks") renamed io_free_work() to io_wq_free_work() for consistency. This patch also updates the relevant comment to avoid misunderstanding.
Fixes: ebc11b6c6b87 ("io_uring: clean io-wq callbacks") Signed-off-by: Lin Ma Link: https://lore.kernel.org/r/20221110122103.20120-1-linma@zju.edu.cn Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cf68d16255a0..c770eed4d717 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1781,7 +1781,7 @@ void io_wq_submit_work(struct io_wq_work *work) bool needs_poll = false; int ret = 0, err = -ECANCELED; - /* one will be dropped by ->io_free_work() after returning to io-wq */ + /* one will be dropped by ->io_wq_free_work() after returning to io-wq */ if (!(req->flags & REQ_F_REFCOUNT)) __io_req_set_refcount(req, 2); else -- cgit From e52d2e583e4ad1d5d0b804d79c2b8752eb0e5ceb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 11 Nov 2022 16:54:08 +0000 Subject: io_uring: inline io_req_task_work_add() __io_req_task_work_add() is huge but marked inline, which makes compilers generate lots of garbage. Inline the wrapper caller io_req_task_work_add() instead. before and after: text data bss dec hex filename 47347 16248 8 63603 f873 io_uring/io_uring.o text data bss dec hex filename 45303 16248 8 61559 f077 io_uring/io_uring.o Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/26dc8c28ca0160e3269ef3e55c5a8b917c4d4450.1668162751.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 7 +------ io_uring/io_uring.h | 7 ++++++- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c770eed4d717..8f452dfb4f1c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1127,7 +1127,7 @@ static void io_req_local_work_add(struct io_kiocb *req) __io_cqring_wake(ctx); } -static inline void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) +void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; @@ -1159,11 +1159,6 @@ static inline void __io_req_task_work_add(struct io_kiocb *req, bool allow_local } } -void io_req_task_work_add(struct io_kiocb *req) -{ - __io_req_task_work_add(req, true); -} - static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) { struct llist_node *node; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index cef5ff924e63..38d9e149d2db 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -50,9 +50,9 @@ static inline bool io_req_ffs_set(struct io_kiocb *req) return req->flags & REQ_F_FIXED_FILE; } +void __io_req_task_work_add(struct io_kiocb *req, bool allow_local); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); -void io_req_task_work_add(struct io_kiocb *req); void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); void io_req_task_queue(struct io_kiocb *req); void io_queue_iowq(struct io_kiocb *req, bool *dont_use); @@ -82,6 +82,11 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); +static inline void io_req_task_work_add(struct io_kiocb *req) +{ + __io_req_task_work_add(req, true); +} + #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) -- cgit From d75936062049522172a107c994242b76c89777f9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 11 Nov 2022 16:54:09 +0000 Subject: io_uring: split tw
fallback into a function When the target process is dying and so task_work_add() is not allowed, we push all task_work items to the fallback workqueue. Move the part responsible for moving tw items out of __io_req_task_work_add() into a separate function. Makes it a bit cleaner and gives the compiler a bit of extra info. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e503dab9d7af95470ca6b214c6de17715ae4e748.1668162751.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8f452dfb4f1c..9925ac08c398 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1105,6 +1105,20 @@ void tctx_task_work(struct callback_head *cb) trace_io_uring_task_work_run(tctx, count, loops); } +static __cold void io_fallback_tw(struct io_uring_task *tctx) +{ + struct llist_node *node = llist_del_all(&tctx->task_list); + struct io_kiocb *req; + + while (node) { + req = container_of(node, struct io_kiocb, io_task_work.node); + node = node->next; + if (llist_add(&req->io_task_work.node, + &req->ctx->fallback_llist)) + schedule_delayed_work(&req->ctx->fallback_work, 1); + } +} + static void io_req_local_work_add(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1131,7 +1145,6 @@ void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; - struct llist_node *node; if (allow_local && ctx->flags & IORING_SETUP_DEFER_TASKRUN) { io_req_local_work_add(req); @@ -1148,15 +1161,7 @@ void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) return; - node = llist_del_all(&tctx->task_list); - - while (node) { - req = container_of(node, struct io_kiocb, io_task_work.node); - node = node->next; - if (llist_add(&req->io_task_work.node, - &req->ctx->fallback_llist)) - schedule_delayed_work(&req->ctx->fallback_work, 1); - } + io_fallback_tw(tctx); } static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) -- cgit From f9d567c75ec216447f36da6e855500023504fa04 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Nov 2022 18:41:06 +0000 Subject: io_uring: inline __io_req_complete_post() There is only one user of __io_req_complete_post(); inline it.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ef4c9059950a3da5cf68df00f977f1fd13bd9306.1668597569.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 +++-------- io_uring/io_uring.h | 1 - 2 files changed, 3 insertions(+), 9 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9925ac08c398..1299f9c8567a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -854,19 +854,14 @@ static void __io_req_complete_put(struct io_kiocb *req) } } -void __io_req_complete_post(struct io_kiocb *req) -{ - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(req->ctx, req); - __io_req_complete_put(req); -} - void io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; io_cq_lock(ctx); - __io_req_complete_post(req); + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe_req(ctx, req); + __io_req_complete_put(req); io_cq_unlock_post(ctx); } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 38d9e149d2db..69fbd27c7577 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -32,7 +32,6 @@ int io_run_local_work(struct io_ring_ctx *ctx); void io_req_complete_failed(struct io_kiocb *req, s32 res); void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); -void __io_req_complete_post(struct io_kiocb *req); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, -- cgit From 4464853277d0ccdb9914608dd1332f0fa2f9846f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 20 Nov 2022 10:18:45 -0700 Subject: io_uring: pass in EPOLL_URING_WAKE for eventfd signaling and wakeups Pass in EPOLL_URING_WAKE when signaling eventfd or doing poll-related wakeups, so that we can check for a circular event dependency between eventfd and epoll. If this flag is set when our wakeup handlers are called, then we know we have a dependency that needs to terminate multishot requests. eventfd and epoll are the only such possible dependencies.
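The circular dependency in question, seen from userspace (an illustrative sketch, not part of the patch; assumes liburing): the ring signals its registered eventfd whenever a CQE is posted, while a multishot poll on that same eventfd posts a CQE on every signal, so each completion would re-trigger itself:

#include <liburing.h>
#include <poll.h>
#include <sys/eventfd.h>

static void circular_setup(struct io_uring *ring)
{
	int efd = eventfd(0, EFD_CLOEXEC);
	struct io_uring_sqe *sqe;

	io_uring_register_eventfd(ring, efd);	/* CQEs signal efd... */
	sqe = io_uring_get_sqe(ring);
	io_uring_prep_poll_multishot(sqe, efd, POLLIN);
	io_uring_submit(ring);			/* ...and the ring polls efd */
}

With EPOLL_URING_WAKE passed through the wakeup key, io_poll_wake() can spot this case and downgrade the poll to oneshot instead of looping (see the poll.c hunk below).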
Cc: stable@vger.kernel.org # 6.0 Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- io_uring/io_uring.h | 15 +++++++++++---- io_uring/poll.c | 8 ++++++++ 3 files changed, 21 insertions(+), 6 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1299f9c8567a..762ecab801f2 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -495,7 +495,7 @@ static void io_eventfd_ops(struct rcu_head *rcu) int ops = atomic_xchg(&ev_fd->ops, 0); if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT)) - eventfd_signal(ev_fd->cq_ev_fd, 1); + eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE); /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback * ordering in a race but if references are 0 we know we have to free @@ -531,7 +531,7 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx) goto out; if (likely(eventfd_signal_allowed())) { - eventfd_signal(ev_fd->cq_ev_fd, 1); + eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE); } else { atomic_inc(&ev_fd->refs); if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 69fbd27c7577..83013ee584d6 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -4,6 +4,7 @@ #include <linux/errno.h> #include <linux/lockdep.h> #include <linux/io_uring_types.h> +#include <uapi/linux/eventpoll.h> #include "io-wq.h" #include "slist.h" #include "filetable.h" @@ -211,12 +212,18 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) static inline void __io_cqring_wake(struct io_ring_ctx *ctx) { /* - * wake_up_all() may seem excessive, but io_wake_function() and - * io_should_wake() handle the termination of the loop and only - * wake as many waiters as we need to. + * Trigger waitqueue handler on all waiters on our waitqueue. This + * won't necessarily wake up all the tasks, io_should_wake() will make + * that decision. + * + * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter + * set in the mask so that if we recurse back into our own poll + * waitqueue handlers, we know we have a dependency between eventfd or + * epoll and should terminate multishot poll at that point. */ if (waitqueue_active(&ctx->cq_wait)) - wake_up_all(&ctx->cq_wait); + __wake_up(&ctx->cq_wait, TASK_NORMAL, 0, + poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); } static inline void io_cqring_wake(struct io_ring_ctx *ctx) diff --git a/io_uring/poll.c b/io_uring/poll.c index 8fb8e781c02d..22c9b2e0944a 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -389,6 +389,14 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, return 0; if (io_poll_get_ownership(req)) { + /* + * If we trigger a multishot poll off our own wakeup path, + * disable multishot as there is a circular dependency between + * CQ posting and triggering the event. + */ + if (mask & EPOLL_URING_WAKE) + poll->events |= EPOLLONESHOT; + /* optional, saves extra locking for removal in tw handler */ if (mask && poll->events & EPOLLONESHOT) { list_del_init(&poll->wait.entry); -- cgit From 4061f0ef730cca5171351b7018b34a45b76df9c2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 20 Nov 2022 10:20:46 -0700 Subject: Revert "io_uring: disallow self-propelled ring polling" This reverts commit 7fdbc5f014c3f71bc44673a2d6c5bb2d12d45f25. This patch dealt with a subset of the real problem, which is a potential circular dependency on the wakeup path for io_uring itself. Outside of io_uring, eventfd can also trigger this (see details in 03e02acda8e2) and so can epoll (see details in caf1aeaffc3b).
Now that we have a generic solution to this problem, get rid of the io_uring-specific workaround. Signed-off-by: Jens Axboe --- io_uring/poll.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/poll.c b/io_uring/poll.c index 22c9b2e0944a..cd4d98d622d2 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -246,8 +246,6 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) continue; if (req->apoll_events & EPOLLONESHOT) return IOU_POLL_DONE; - if (io_is_uring_fops(req->file)) - return IOU_POLL_DONE; /* multishot, just fill a CQE and proceed */ if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { -- cgit From 6c16fe3c16bdc420719768f7ea97b82bd6303eec Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 21 Nov 2022 07:51:15 -0700 Subject: io_uring: kill io_cqring_ev_posted() and __io_cq_unlock_post() __io_cq_unlock_post() is identical to io_cq_unlock_post(), and io_cqring_ev_posted() has a single caller so might as well just inline it there. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 762ecab801f2..2260fb7aa7f2 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -581,23 +581,14 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_eventfd_flush_signal(ctx); } -static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) -{ - io_commit_cqring_flush(ctx); - io_cqring_wake(ctx); -} - -static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) +void io_cq_unlock_post(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); -} -void io_cq_unlock_post(struct io_ring_ctx *ctx) -{ - __io_cq_unlock_post(ctx); + io_commit_cqring_flush(ctx); + io_cqring_wake(ctx); } /* Returns true if there are no backlogged entries after the flush */ @@ -1346,7 +1337,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) if (!(req->flags & REQ_F_CQE_SKIP)) __io_fill_cqe_req(ctx, req); } - __io_cq_unlock_post(ctx); + io_cq_unlock_post(ctx); io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); -- cgit From 2ccc92f4effcfa1c51c4fcf1e34d769099d3cad4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 23 Nov 2022 11:33:36 +0000 Subject: io_uring: add completion locking for iopoll There are pieces of code that may allow iopoll to race filling cqes; temporarily add spinlocking around posting events.
Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/84d86b5c117feda075471c5c9e65208e0dccf5d0.1669203009.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rw.c b/io_uring/rw.c index 1ce065709724..61c326831949 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1049,6 +1049,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) else if (!pos) return 0; + spin_lock(&ctx->completion_lock); prev = start; wq_list_for_each_resume(pos, prev) { struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); @@ -1063,11 +1064,11 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) req->cqe.flags = io_put_kbuf(req, 0); __io_fill_cqe_req(req->ctx, req); } - + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); if (unlikely(!nr_events)) return 0; - io_commit_cqring(ctx); io_cqring_ev_posted_iopoll(ctx); pos = start ? start->next : ctx->iopoll_list.first; wq_list_cut(&ctx->iopoll_list, prev, start); -- cgit From e276ae344a770f91912a81c6a338d92efd319be2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 23 Nov 2022 11:33:37 +0000 Subject: io_uring: hold locks for io_req_complete_failed A preparation patch, make sure we always hold uring_lock around io_req_complete_failed(). The only place deviating from the rule is io_cancel_defer_files(), queue a tw instead. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/70760344eadaecf2939287084b9d4ba5c05a6984.1669203009.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2260fb7aa7f2..4d16f3b1ee11 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -862,9 +862,12 @@ inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) } void io_req_complete_failed(struct io_kiocb *req, s32 res) + __must_hold(&ctx->uring_lock) { const struct io_op_def *def = &io_op_defs[req->opcode]; + lockdep_assert_held(&req->ctx->uring_lock); + req_set_fail(req); io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); if (def->fail) @@ -1615,6 +1618,7 @@ static u32 io_get_sequence(struct io_kiocb *req) } static __cold void io_drain_req(struct io_kiocb *req) + __must_hold(&ctx->uring_lock) { struct io_ring_ctx *ctx = req->ctx; struct io_defer_entry *de; @@ -2849,7 +2853,7 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, while (!list_empty(&list)) { de = list_first_entry(&list, struct io_defer_entry, list); list_del_init(&de->list); - io_req_complete_failed(de->req, -ECANCELED); + io_req_task_queue_fail(de->req, -ECANCELED); kfree(de); } return true; -- cgit From 624fd779fd869bdcb2c0ccca0f09456eed71ed52 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 23 Nov 2022 11:33:38 +0000 Subject: io_uring: use io_req_task_complete() in timeout Use a more generic io_req_task_complete() in timeout completion task_work instead of io_req_complete_post(). 
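For context on these timeout completions (an illustrative sketch, not part of the patch; assumes liburing), a linked timeout is armed against the preceding SQE and finishes with -ETIME if it fires first, or -ECANCELED if the linked op completes in time; those are the results filled in by the task_work below:

#include <liburing.h>

static void read_with_timeout(struct io_uring *ring, int fd,
			      void *buf, unsigned int len)
{
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_read(sqe, fd, buf, len, 0);
	sqe->flags |= IOSQE_IO_LINK;	/* next SQE is the timeout */

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_link_timeout(sqe, &ts, 0);
	io_uring_submit(ring);
}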
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bda1710b58c07bf06107421c2a65c529ea9cdcac.1669203009.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/timeout.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/timeout.c b/io_uring/timeout.c index e8a8c2099480..a819818df7b3 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -282,11 +282,11 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) ret = io_try_cancel(req->task->io_uring, &cd, issue_flags); } io_req_set_res(req, ret ?: -ETIME, 0); - io_req_complete_post(req); + io_req_task_complete(req, locked); io_put_req(prev); } else { io_req_set_res(req, -ETIME, 0); - io_req_complete_post(req); + io_req_task_complete(req, locked); } } -- cgit From 833b5dfffc26c81835ce38e2a5df9ac5fa142735 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 23 Nov 2022 11:33:39 +0000 Subject: io_uring: remove io_req_tw_post_queue Remove io_req_tw_post() and io_req_tw_post_queue(), we can use io_req_task_complete() instead. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b9b73c08022c7f1457023ac841f35c0100e70345.1669203009.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 12 ------------ io_uring/io_uring.h | 8 +++++++- io_uring/timeout.c | 6 +++--- 3 files changed, 10 insertions(+), 16 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4d16f3b1ee11..e445344f6f07 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1226,18 +1226,6 @@ int io_run_local_work(struct io_ring_ctx *ctx) return ret; } -static void io_req_tw_post(struct io_kiocb *req, bool *locked) -{ - io_req_complete_post(req); -} - -void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) -{ - io_req_set_res(req, res, cflags); - req->io_task_work.func = io_req_tw_post; - io_req_task_work_add(req); -} - static void io_req_task_cancel(struct io_kiocb *req, bool *locked) { /* not needed for normal modes, but SQPOLL depends on it */ diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 83013ee584d6..222af88df10f 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -53,7 +53,6 @@ static inline bool io_req_ffs_set(struct io_kiocb *req) void __io_req_task_work_add(struct io_kiocb *req, bool allow_local); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); -void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); void io_req_task_queue(struct io_kiocb *req); void io_queue_iowq(struct io_kiocb *req, bool *dont_use); void io_req_task_complete(struct io_kiocb *req, bool *locked); @@ -375,4 +374,11 @@ static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) ctx->submitter_task == current); } +static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) +{ + io_req_set_res(req, res, 0); + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); +} + #endif diff --git a/io_uring/timeout.c b/io_uring/timeout.c index a819818df7b3..5b4bc93fd6e0 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -63,7 +63,7 @@ static bool io_kill_timeout(struct io_kiocb *req, int status) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&timeout->list); - io_req_tw_post_queue(req, status, 0); + io_req_queue_tw_complete(req, status); return true; } return false; @@ -159,7 +159,7 @@ void io_disarm_next(struct io_kiocb *req) req->flags &= ~REQ_F_ARM_LTIMEOUT; 
if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { io_remove_next_linked(req); - io_req_tw_post_queue(link, -ECANCELED, 0); + io_req_queue_tw_complete(link, -ECANCELED); } } else if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; @@ -168,7 +168,7 @@ void io_disarm_next(struct io_kiocb *req) link = io_disarm_linked_timeout(req); spin_unlock_irq(&ctx->timeout_lock); if (link) - io_req_tw_post_queue(link, -ECANCELED, 0); + io_req_queue_tw_complete(link, -ECANCELED); } if (unlikely((req->flags & REQ_F_FAIL) && !(req->flags & REQ_F_HARDLINK))) -- cgit From fa18fa2272c7469e470dcb7bf838ea50a25494ca Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 23 Nov 2022 11:33:40 +0000 Subject: io_uring: inline __io_req_complete_put() Inline __io_req_complete_put() into io_req_complete_post(); there are no other users. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1923a4dfe80fa877f859a22ed3df2d5fc8ecf02b.1669203009.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e445344f6f07..a0c71a2dce19 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -814,15 +814,19 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, return filled; } -static void __io_req_complete_put(struct io_kiocb *req) +void io_req_complete_post(struct io_kiocb *req) { + struct io_ring_ctx *ctx = req->ctx; + + io_cq_lock(ctx); + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe_req(ctx, req); + /* * If we're the last reference to this request, add to our locked * free_list cache. */ if (req_ref_put_and_test(req)) { - struct io_ring_ctx *ctx = req->ctx; - if (req->flags & IO_REQ_LINK_FLAGS) { if (req->flags & IO_DISARM_MASK) io_disarm_next(req); @@ -843,16 +847,6 @@ static void __io_req_complete_put(struct io_kiocb *req) wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; } -} - -void io_req_complete_post(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - - io_cq_lock(ctx); - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(ctx, req); - __io_req_complete_put(req); io_cq_unlock_post(ctx); } -- cgit From 1bec951c3809051f64a6957fe86d1b4786cc0313 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 23 Nov 2022 11:33:41 +0000 Subject: io_uring: iopoll protect complete_post io_req_complete_post() may be used by iopoll-enabled rings; grab locks in this case. That requires passing issue_flags to propagate the locking state.
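The iopoll-enabled rings referred to here, from the userspace side (an illustrative sketch, not part of the patch; assumes liburing): IORING_SETUP_IOPOLL rings reap completions by actively polling the device rather than via interrupts, which is why posting a completion needs uring_lock as above:

#include <liburing.h>

static int setup_iopoll_ring(struct io_uring *ring)
{
	/* I/O must target files that support polled completions,
	 * e.g. block devices opened O_DIRECT; CQEs are harvested by
	 * polling from io_uring_enter() rather than by interrupts. */
	return io_uring_queue_init(64, ring, IORING_SETUP_IOPOLL);
}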
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cc6d854065c57c838ca8e8806f707a226b70fd2d.1669203009.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 21 +++++++++++++++------ io_uring/io_uring.h | 10 ++++++++-- io_uring/kbuf.c | 4 ++-- io_uring/poll.c | 2 +- io_uring/uring_cmd.c | 2 +- 5 files changed, 27 insertions(+), 12 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a0c71a2dce19..cc27413129fc 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -814,7 +814,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, return filled; } -void io_req_complete_post(struct io_kiocb *req) +static void __io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -850,9 +850,18 @@ void io_req_complete_post(struct io_kiocb *req) io_cq_unlock_post(ctx); } -inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) +void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { - io_req_complete_post(req); + if (!(issue_flags & IO_URING_F_UNLOCKED) || + !(req->ctx->flags & IORING_SETUP_IOPOLL)) { + __io_req_complete_post(req); + } else { + struct io_ring_ctx *ctx = req->ctx; + + mutex_lock(&ctx->uring_lock); + __io_req_complete_post(req); + mutex_unlock(&ctx->uring_lock); + } } void io_req_complete_failed(struct io_kiocb *req, s32 res) @@ -866,7 +875,7 @@ void io_req_complete_failed(struct io_kiocb *req, s32 res) io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); if (def->fail) def->fail(req); - io_req_complete_post(req); + io_req_complete_post(req, 0); } /* @@ -1450,7 +1459,7 @@ void io_req_task_complete(struct io_kiocb *req, bool *locked) if (*locked) io_req_complete_defer(req); else - io_req_complete_post(req); + io_req_complete_post_tw(req, locked); } /* @@ -1718,7 +1727,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_defer(req); else - io_req_complete_post(req); + io_req_complete_post(req, issue_flags); } else if (ret != IOU_ISSUE_SKIP_COMPLETE) return ret; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 222af88df10f..b5b80bf03385 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -31,14 +31,20 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx); int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked); int io_run_local_work(struct io_ring_ctx *ctx); void io_req_complete_failed(struct io_kiocb *req, s32 res); -void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); -void io_req_complete_post(struct io_kiocb *req); +void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); +static inline void io_req_complete_post_tw(struct io_kiocb *req, bool *locked) +{ + unsigned flags = *locked ? 
0 : IO_URING_F_UNLOCKED; + + io_req_complete_post(req, flags); +} + struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); struct file *io_file_get_normal(struct io_kiocb *req, int fd); diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index e2c46889d5fa..e8150ed637d8 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -311,7 +311,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) /* complete before unlock, IOPOLL may need the lock */ io_req_set_res(req, ret, 0); - __io_req_complete(req, issue_flags); + io_req_complete_post(req, 0); io_ring_submit_unlock(ctx, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } @@ -462,7 +462,7 @@ err: req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ io_req_set_res(req, ret, 0); - __io_req_complete(req, issue_flags); + io_req_complete_post(req, 0); io_ring_submit_unlock(ctx, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } diff --git a/io_uring/poll.c b/io_uring/poll.c index cd4d98d622d2..4624e5eba63e 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -312,7 +312,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) io_poll_tw_hash_eject(req, locked); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) - io_req_complete_post(req); + io_req_complete_post_tw(req, locked); else if (ret == IOU_POLL_DONE) io_req_task_submit(req, locked); else diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index e50de0b6b9f8..446a189b78b0 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -56,7 +56,7 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); else - __io_req_complete(req, 0); + io_req_complete_post(req, 0); } EXPORT_SYMBOL_GPL(io_uring_cmd_done); -- cgit From 2dac1a159216b39ced8d78dba590c5d2f4249586 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 23 Nov 2022 11:33:42 +0000 Subject: io_uring: remove iopoll spinlock This reverts commit 2ccc92f4effcfa1c51c4fcf1e34d769099d3cad4 io_req_complete_post() should now behave well even in case of IOPOLL, we can remove completion_lock locking. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7e171c8b530656b14a671c59100ca260e46e7f2a.1669203009.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rw.c b/io_uring/rw.c index 61c326831949..1ce065709724 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1049,7 +1049,6 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) else if (!pos) return 0; - spin_lock(&ctx->completion_lock); prev = start; wq_list_for_each_resume(pos, prev) { struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); @@ -1064,11 +1063,11 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) req->cqe.flags = io_put_kbuf(req, 0); __io_fill_cqe_req(req->ctx, req); } - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); + if (unlikely(!nr_events)) return 0; + io_commit_cqring(ctx); io_cqring_ev_posted_iopoll(ctx); pos = start ? start->next : ctx->iopoll_list.first; wq_list_cut(&ctx->iopoll_list, prev, start); -- cgit From c06c6c5d276707e04cedbcc55625e984922118aa Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:52 -0800 Subject: io_uring: always lock in io_apoll_task_func This is required for the failure case (io_req_complete_failed) and is missing. 
The alternative would be to lock only in the failure path; however, all of the non-error paths in io_poll_check_events that do not return IOU_POLL_NO_ACTION end up locking anyway. The only extraneous lock would be for a multishot poll overflowing the CQE ring; however, multishot poll would probably benefit from being locked, as that will allow completions to be batched. So it seems reasonable to always lock. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-3-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/poll.c b/io_uring/poll.c index 4624e5eba63e..42aa10b50f6c 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -308,11 +308,12 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) if (ret == IOU_POLL_NO_ACTION) return; + io_tw_lock(req->ctx, locked); io_poll_remove_entries(req); io_poll_tw_hash_eject(req, locked); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) - io_req_complete_post_tw(req, locked); + io_req_task_complete(req, locked); else if (ret == IOU_POLL_DONE) io_req_task_submit(req, locked); else -- cgit From 973fc83f3a94bdffcacf482641db38f57c7c8609 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:53 -0800 Subject: io_uring: defer all io_req_complete_failed All failures happen under lock now, and can be deferred. To stay consistent (and to keep ordering) when a failure happens after some multishot CQEs have already been deferred, always defer failures. To make this obvious at the caller (and to help prevent a future bug), rename io_req_complete_failed to io_req_defer_failed. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-4-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 17 ++++++++--------- io_uring/io_uring.h | 2 +- io_uring/poll.c | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cc27413129fc..4888fe834920 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -864,7 +864,7 @@ void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) } } -void io_req_complete_failed(struct io_kiocb *req, s32 res) +void io_req_defer_failed(struct io_kiocb *req, s32 res) __must_hold(&ctx->uring_lock) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -875,7 +875,7 @@ void io_req_complete_failed(struct io_kiocb *req, s32 res) io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); if (def->fail) def->fail(req); - io_req_complete_post(req, 0); + io_req_complete_defer(req); } /* @@ -1231,9 +1231,8 @@ int io_run_local_work(struct io_ring_ctx *ctx) static void io_req_task_cancel(struct io_kiocb *req, bool *locked) { - /* not needed for normal modes, but SQPOLL depends on it */ io_tw_lock(req->ctx, locked); - io_req_complete_failed(req, req->cqe.res); + io_req_defer_failed(req, req->cqe.res); } void io_req_task_submit(struct io_kiocb *req, bool *locked) { @@ -1243,7 +1242,7 @@ if (likely(!(req->task->flags & PF_EXITING))) io_queue_sqe(req); else - io_req_complete_failed(req, -EFAULT); + io_req_defer_failed(req, -EFAULT); } void io_req_task_queue_fail(struct io_kiocb *req, int ret) @@ -1630,7 +1629,7 @@ queue: ret = io_req_prep_async(req); if (ret) { fail: - io_req_complete_failed(req, ret); + io_req_defer_failed(req, ret); return; } io_prep_async_link(req); @@ -1860,7 +1859,7 @@
static void io_queue_async(struct io_kiocb *req, int ret) struct io_kiocb *linked_timeout; if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { - io_req_complete_failed(req, ret); + io_req_defer_failed(req, ret); return; } @@ -1910,14 +1909,14 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) */ req->flags &= ~REQ_F_HARDLINK; req->flags |= REQ_F_LINK; - io_req_complete_failed(req, req->cqe.res); + io_req_defer_failed(req, req->cqe.res); } else if (unlikely(req->ctx->drain_active)) { io_drain_req(req); } else { int ret = io_req_prep_async(req); if (unlikely(ret)) - io_req_complete_failed(req, ret); + io_req_defer_failed(req, ret); else io_queue_iowq(req, NULL); } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index b5b80bf03385..a26d5aa7f3f3 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -30,7 +30,7 @@ bool io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked); int io_run_local_work(struct io_ring_ctx *ctx); -void io_req_complete_failed(struct io_kiocb *req, s32 res); +void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); diff --git a/io_uring/poll.c b/io_uring/poll.c index 42aa10b50f6c..4bd43e6f5b72 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -317,7 +317,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) else if (ret == IOU_POLL_DONE) io_req_task_submit(req, locked); else - io_req_complete_failed(req, ret); + io_req_defer_failed(req, ret); } static void __io_poll_execute(struct io_kiocb *req, int mask) -- cgit From 931147ddfa6e9ffd814272f1c0370c4740acbe17 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:54 -0800 Subject: io_uring: allow defer completion for aux posted cqes Multishot ops cannot use the compl_reqs list as the request must stay in the poll list, but that means they need to run each completion without benefiting from batching. Introduce batching infrastructure for small (i.e. 16-byte) CQEs only. This restriction is fine because there are no use cases posting 32-byte CQEs. Keep a batch of up to 16 posted results in the ring, and flush it in the same way as compl_reqs. 16 was chosen through experimentation on a microbenchmark ([1]), as well as by trying not to increase the size of the ring too much; the change grows the ring from 1216 to 1472 bytes.
[1]: https://github.com/DylanZA/liburing/commit/9ac66b36bcf4477bfafeff1c5f107896b7ae31cf Run with $ make -j && ./benchmark/reg.b -s 1 -t 2000 -r 10, which gives: baseline 8309 k/s; batch of 8: 18807 k/s; batch of 16: 19338 k/s; batch of 32: 20134 k/s. Suggested-by: Pavel Begunkov Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-5-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4888fe834920..28635e3e578a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -167,7 +167,8 @@ EXPORT_SYMBOL(io_uring_get_socket); static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) { - if (!wq_list_empty(&ctx->submit_state.compl_reqs)) + if (!wq_list_empty(&ctx->submit_state.compl_reqs) || + ctx->submit_state.cqes_count) __io_submit_flush_completions(ctx); } @@ -802,6 +803,21 @@ bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags return false; } +static void __io_flush_post_cqes(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + struct io_submit_state *state = &ctx->submit_state; + unsigned int i; + + lockdep_assert_held(&ctx->uring_lock); + for (i = 0; i < state->cqes_count; i++) { + struct io_uring_cqe *cqe = &state->cqes[i]; + + io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags, true); + } + state->cqes_count = 0; +} + bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow) @@ -1323,6 +1339,9 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_submit_state *state = &ctx->submit_state; io_cq_lock(ctx); + /* must come first to preserve CQE ordering in failure cases */ + if (state->cqes_count) + __io_flush_post_cqes(ctx); wq_list_for_each(node, prev, &state->compl_reqs) { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); @@ -1332,8 +1351,10 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) } io_cq_unlock_post(ctx); - io_free_batch_list(ctx, state->compl_reqs.first); - INIT_WQ_LIST(&state->compl_reqs); + if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { + io_free_batch_list(ctx, state->compl_reqs.first); + INIT_WQ_LIST(&state->compl_reqs); + } } /* -- cgit From 9b8c54755a2b16d4f23c0ea184b75e2edf77d906 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:55 -0800 Subject: io_uring: add io_aux_cqe which allows deferred completion Use the just introduced deferred post cqe completion state when possible in io_aux_cqe. If not possible, fall back to io_post_aux_cqe. This introduces a complication because of allow_overflow. For deferred completions we cannot know without locking the completion_lock if it will overflow (and even if we locked it, another post could sneak in and cause this cqe to be in overflow). However, since overflow protection is mostly a best-effort defence in depth to prevent infinite loops of CQEs for poll, just checking the overflow bit is going to be good enough and will result in at most 16 (the array size of the deferred cqes) overflows.
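To make the deferral logic described above concrete, here is a minimal userspace model of it. Everything in it is an illustrative stand-in rather than the kernel's actual types or functions; only the control flow is meant to mirror the io_aux_cqe added below.

/* Minimal model of io_aux_cqe's deferral path. All names and types
 * are illustrative stand-ins; only the control flow mirrors the
 * kernel code added in this patch. */
#include <stdbool.h>
#include <stdint.h>

#define CQE_BATCH 16	/* matches the 16-entry batch in submit_state */

struct model_cqe { uint64_t user_data; int32_t res; uint32_t flags; };

struct model_ctx {
	struct model_cqe cqes[CQE_BATCH];
	unsigned int cqes_count;
	bool cq_overflowed;	/* stands in for IO_CHECK_CQ_OVERFLOW_BIT */
};

/* Stand-in for the locked, immediate posting path (io_post_aux_cqe). */
static bool model_post_aux_cqe(struct model_ctx *ctx, struct model_cqe cqe)
{
	/* would lock the CQ, fill the CQE (possibly overflowing), unlock */
	return true;
}

/* Stand-in for __io_flush_post_cqes: drain the batch under the CQ lock. */
static void model_flush_post_cqes(struct model_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->cqes_count; i++)
		model_post_aux_cqe(ctx, ctx->cqes[i]);
	ctx->cqes_count = 0;
}

static bool model_aux_cqe(struct model_ctx *ctx, bool defer,
			  struct model_cqe cqe, bool allow_overflow)
{
	if (!defer)
		return model_post_aux_cqe(ctx, cqe);

	/* Batch full: flush it, then keep deferring into the emptied batch. */
	if (ctx->cqes_count == CQE_BATCH)
		model_flush_post_cqes(ctx);

	/* Best-effort overflow check: read without the completion lock it
	 * may be stale, so up to CQE_BATCH entries can still overflow,
	 * which is exactly the compromise described above. */
	if (!allow_overflow && ctx->cq_overflowed)
		return false;

	ctx->cqes[ctx->cqes_count++] = cqe;
	return true;
}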
Suggested-by: Pavel Begunkov Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-6-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 34 ++++++++++++++++++++++++++++++++++ io_uring/io_uring.h | 2 ++ io_uring/net.c | 7 ++++--- io_uring/poll.c | 4 ++-- 4 files changed, 42 insertions(+), 5 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 28635e3e578a..056aea917cd6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -830,6 +830,40 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, return filled; } +bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, + bool allow_overflow) +{ + struct io_uring_cqe *cqe; + unsigned int length; + + if (!defer) + return io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow); + + length = ARRAY_SIZE(ctx->submit_state.cqes); + + lockdep_assert_held(&ctx->uring_lock); + + if (ctx->submit_state.cqes_count == length) { + io_cq_lock(ctx); + __io_flush_post_cqes(ctx); + /* no need to flush - flush is deferred */ + spin_unlock(&ctx->completion_lock); + } + + /* For deferred completions this is not as strict as it is otherwise, + * however its main job is to prevent unbounded posted completions, + * and in that it works just as well. + */ + if (!allow_overflow && test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) + return false; + + cqe = &ctx->submit_state.cqes[ctx->submit_state.cqes_count++]; + cqe->user_data = user_data; + cqe->res = res; + cqe->flags = cflags; + return true; +} + static void __io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index a26d5aa7f3f3..dd02adf3d0df 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -36,6 +36,8 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags bool allow_overflow); bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); +bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, + bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); static inline void io_req_complete_post_tw(struct io_kiocb *req, bool *locked) diff --git a/io_uring/net.c b/io_uring/net.c index 0de6f78ad978..90342dcb6b1d 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -601,8 +601,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, } if (!mshot_finished) { - if (io_post_aux_cqe(req->ctx, req->cqe.user_data, *ret, - cflags | IORING_CQE_F_MORE, true)) { + if (io_aux_cqe(req->ctx, issue_flags & IO_URING_F_COMPLETE_DEFER, + req->cqe.user_data, *ret, cflags | IORING_CQE_F_MORE, true)) { io_recv_prep_retry(req); return false; } @@ -1320,7 +1320,8 @@ retry: if (ret < 0) return ret; - if (io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, true)) + if (io_aux_cqe(ctx, issue_flags & IO_URING_F_COMPLETE_DEFER, + req->cqe.user_data, ret, IORING_CQE_F_MORE, true)) goto retry; return -ECANCELED; diff --git a/io_uring/poll.c b/io_uring/poll.c index 4bd43e6f5b72..922c1a366c41 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -252,8 +252,8 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_post_aux_cqe(ctx, req->cqe.user_data, - mask, IORING_CQE_F_MORE, false)) { + if (!io_aux_cqe(ctx, *locked, req->cqe.user_data, + mask, IORING_CQE_F_MORE, false)) { io_req_set_res(req,
mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } -- cgit From a77ab745f28d5ab2ce51d0e44e85af942bb77d47 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:56 -0800 Subject: io_uring: make io_fill_cqe_aux static This is only used in io_uring.c. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-7-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- io_uring/io_uring.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 056aea917cd6..fea84e51e56f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -770,8 +770,8 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) return &rings->cqes[off]; } -bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, - bool allow_overflow) +static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, + bool allow_overflow) { struct io_uring_cqe *cqe; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index dd02adf3d0df..46694f40bf72 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -34,8 +34,6 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); -bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, - bool allow_overflow); bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); -- cgit From 2e2ef4a1dab980d88a1ab45bf0e28c8851999e33 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:57 -0800 Subject: io_uring: add lockdep assertion in io_fill_cqe_aux Add an assertion for the completion lock to io_fill_cqe_aux. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-8-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fea84e51e56f..03051e1fa02e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -775,6 +775,8 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 { struct io_uring_cqe *cqe; + lockdep_assert_held(&ctx->completion_lock); + ctx->cq_extra++; /* -- cgit From b529c96a896b7bea8464a58d350836cc106d70bd Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:58 -0800 Subject: io_uring: remove overflow param from io_post_aux_cqe The only call sites which would not allow overflow are also the call sites which would use io_aux_cqe, as they care about ordering. So remove this parameter from io_post_aux_cqe.
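For context on the msg_ring call sites touched below: from userspace, an IORING_OP_MSG_RING that cannot deliver its CQE still completes with -EOVERFLOW, and redelivery remains the sender's job. A hedged sketch of that contract using liburing's io_uring_prep_msg_ring (available since liburing 2.2); the retry-immediately policy is purely illustrative, not something the patch prescribes.

/* Sketch: post a message CQE to another ring, retrying on -EOVERFLOW.
 * Real code would back off or wait for the target ring to drain
 * instead of retrying in a tight loop. */
#include <errno.h>
#include <liburing.h>

static int msg_other_ring(struct io_uring *src, int target_ring_fd,
			  __u64 data, unsigned int len)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret;

	for (;;) {
		sqe = io_uring_get_sqe(src);
		if (!sqe)
			return -EBUSY;
		io_uring_prep_msg_ring(sqe, target_ring_fd, len, data, 0);

		io_uring_submit(src);
		ret = io_uring_wait_cqe(src, &cqe);
		if (ret < 0)
			return ret;
		ret = cqe->res;
		io_uring_cqe_seen(src, cqe);

		if (ret != -EOVERFLOW)
			return ret;
		/* Target CQ could not take the message; the sender must
		 * deliver it again, as the comment in the diff notes. */
	}
}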
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-9-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 12 ++++++++---- io_uring/io_uring.h | 3 +-- io_uring/msg_ring.c | 4 ++-- io_uring/rsrc.c | 4 ++-- 4 files changed, 13 insertions(+), 10 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 03051e1fa02e..7ed9cbeb573f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -820,9 +820,8 @@ static void __io_flush_post_cqes(struct io_ring_ctx *ctx) state->cqes_count = 0; } -bool io_post_aux_cqe(struct io_ring_ctx *ctx, - u64 user_data, s32 res, u32 cflags, - bool allow_overflow) +static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, + bool allow_overflow) { bool filled; @@ -832,6 +831,11 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, return filled; } +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) +{ + return __io_post_aux_cqe(ctx, user_data, res, cflags, true); +} + bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, bool allow_overflow) { @@ -839,7 +843,7 @@ bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 unsigned int length; if (!defer) - return io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow); + return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow); length = ARRAY_SIZE(ctx->submit_state.cqes); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 46694f40bf72..dcb8e3468f1d 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -32,8 +32,7 @@ int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked); int io_run_local_work(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, - bool allow_overflow); +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 90d2fc6fd80e..afb543aab9f6 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -31,7 +31,7 @@ static int io_msg_ring_data(struct io_kiocb *req) if (msg->src_fd || msg->dst_fd || msg->flags) return -EINVAL; - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true)) + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) return 0; return -EOVERFLOW; @@ -116,7 +116,7 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) * completes with -EOVERFLOW, then the sender must ensure that a * later IORING_OP_MSG_RING delivers the message. 
*/ - if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true)) + if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) ret = -EOVERFLOW; out_unlock: io_double_unlock_ctx(ctx, target_ctx, issue_flags); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 187f1c83e779..133608200769 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -170,10 +170,10 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) if (prsrc->tag) { if (ctx->flags & IORING_SETUP_IOPOLL) { mutex_lock(&ctx->uring_lock); - io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true); + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); mutex_unlock(&ctx->uring_lock); } else { - io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true); + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); } } -- cgit From 9a6924519e5e882631a7fff429facca838207e45 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:59 -0800 Subject: io_uring: allow multishot polled reqs to defer completion Until now there was no reason for multishot polled requests to defer completions as there was no functional difference. However now this will actually defer the completions, for a performance win. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-10-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7ed9cbeb573f..72c97af4f292 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1803,7 +1803,8 @@ int io_poll_issue(struct io_kiocb *req, bool *locked) io_tw_lock(req->ctx, locked); if (unlikely(req->task->flags & PF_EXITING)) return -EFAULT; - return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT); + return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| + IO_URING_F_COMPLETE_DEFER); } struct io_wq_work *io_wq_free_work(struct io_wq_work *work) -- cgit From 27f35fe9096b183d45ff6f22ad277ddf107d8428 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Fri, 25 Nov 2022 02:34:10 -0800 Subject: io_uring: remove io_req_complete_post_tw It's only used in one place. Inline it. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221125103412.1425305-2-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/io_uring.h | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 72c97af4f292..24aa049fe7e1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1519,7 +1519,7 @@ void io_req_task_complete(struct io_kiocb *req, bool *locked) if (*locked) io_req_complete_defer(req); else - io_req_complete_post_tw(req, locked); + io_req_complete_post(req, IO_URING_F_UNLOCKED); } /* diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index dcb8e3468f1d..76659d2fc90c 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -37,13 +37,6 @@ bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); -static inline void io_req_complete_post_tw(struct io_kiocb *req, bool *locked) -{ - unsigned flags = *locked ? 
0 : IO_URING_F_UNLOCKED; - - io_req_complete_post(req, flags); -} - struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); struct file *io_file_get_normal(struct io_kiocb *req, int fd); -- cgit From 10d8bc35416d9e83ffe9644478756281c7bd4f52 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Fri, 25 Nov 2022 02:34:11 -0800 Subject: io_uring: spelling fix s/pushs/pushes/ Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221125103412.1425305-3-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 24aa049fe7e1..d9c9e347346d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2706,7 +2706,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * lock(&ep->mtx); * * Users may get EPOLLIN meanwhile seeing nothing in cqring, this - * pushs them to do the flush. + * pushes them to do the flush. */ if (io_cqring_events(ctx) || io_has_work(ctx)) -- cgit From c3b490930dbe6a6c98d3820f445757ddec1efb08 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Nov 2022 19:46:40 +0000 Subject: io_uring: don't use complete_post in kbuf Now that we're handling IOPOLL completions more generically, get rid of the uses of _post() and send requests through the normal path. It may have some extra merits performance-wise, but we don't care much as there is a better interface for selected buffers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4deded706587f55b006dc33adf0c13cfc3b2319f.1669310258.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'io_uring') diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index e8150ed637d8..4a6401080c1f 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -306,14 +306,11 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) if (!bl->buf_nr_pages) ret = __io_remove_buffers(ctx, bl, p->nbufs); } + io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) req_set_fail(req); - - /* complete before unlock, IOPOLL may need the lock */ io_req_set_res(req, ret, 0); - io_req_complete_post(req, 0); - io_ring_submit_unlock(ctx, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; + return IOU_OK; } int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -458,13 +455,12 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) ret = io_add_buffers(ctx, p, bl); err: + io_ring_submit_unlock(ctx, issue_flags); + if (ret < 0) req_set_fail(req); - /* complete before unlock, IOPOLL may need the lock */ io_req_set_res(req, ret, 0); - io_req_complete_post(req, 0); - io_ring_submit_unlock(ctx, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; + return IOU_OK; } int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) -- cgit From 5d772916855f593672de55c437925daccc8ecd73 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Nov 2022 19:46:41 +0000 Subject: io_uring: keep unlock_post inlined in hot path This partially reverts 6c16fe3c16bdc ("io_uring: kill io_cqring_ev_posted() and __io_cq_unlock_post()"). The point of the redundant __io_cq_unlock_post() was always to keep it inlined into __io_submit_flush_completions(). Inline it back and rename it, in the hope of clarifying the intention behind it.
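The pattern being restored here is a generic one: keep a static inline copy for the single hot caller and an out-of-line wrapper for everyone else. A compilable toy sketch of just that shape, with stand-in names and a pthread mutex in place of the kernel's completion lock:

/* Toy illustration of the inline-in-hot-path pattern below. The names
 * and locking are stand-ins, not the kernel's. */
#include <pthread.h>

struct toy_ring { pthread_mutex_t completion_lock; };

static void toy_commit_cqring(struct toy_ring *r) { /* publish new CQEs */ }
static void toy_wake_waiters(struct toy_ring *r) { /* wake CQ waiters */ }

/* Inlined into the one performance-critical caller. */
static inline void toy_cq_unlock_post_inline(struct toy_ring *r)
{
	toy_commit_cqring(r);
	pthread_mutex_unlock(&r->completion_lock);
	toy_wake_waiters(r);
}

/* Out-of-line copy for colder callers, so the inline body is not
 * duplicated at every call site. */
void toy_cq_unlock_post(struct toy_ring *r)
{
	toy_cq_unlock_post_inline(r);
}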
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/372a16c485fca44c069be2e92fc5e7332a1d7fd7.1669310258.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d9c9e347346d..adecdf65b130 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -582,7 +582,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_eventfd_flush_signal(ctx); } -void io_cq_unlock_post(struct io_ring_ctx *ctx) +/* keep it inlined for io_submit_flush_completions() */ +static inline void io_cq_unlock_post_inline(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { io_commit_cqring(ctx); @@ -592,6 +593,12 @@ void io_cq_unlock_post(struct io_ring_ctx *ctx) io_cqring_wake(ctx); } +void io_cq_unlock_post(struct io_ring_ctx *ctx) + __releases(ctx->completion_lock) +{ + io_cq_unlock_post_inline(ctx); +} + /* Returns true if there are no backlogged entries after the flush */ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { @@ -1389,7 +1396,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) if (!(req->flags & REQ_F_CQE_SKIP)) __io_fill_cqe_req(ctx, req); } - io_cq_unlock_post(ctx); + io_cq_unlock_post_inline(ctx); if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); -- cgit
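Taken together, the main beneficiaries of this series are multishot requests, which can now post their extra CQEs through the deferred batch instead of one locked post at a time. A hedged userspace sketch of the kind of consumer involved, using liburing's multishot accept (liburing 2.2 and kernel 5.19 or newer assumed); the port number and the reap-16-at-a-time loop are illustrative choices, not requirements.

/* Multishot accept: one SQE, one CQE per incoming connection. The
 * 16-wide reap mirrors the kernel's deferred-CQE batch size, but any
 * width works. Error handling is trimmed for brevity. */
#include <liburing.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqes[16];
	struct sockaddr_in addr;
	unsigned int n, i;
	int fd;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	addr.sin_port = htons(12345);	/* illustrative port */
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	listen(fd, SOMAXCONN);

	io_uring_queue_init(64, &ring, 0);
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_multishot_accept(sqe, fd, NULL, NULL, 0);
	io_uring_submit(&ring);

	for (;;) {
		io_uring_wait_cqe(&ring, &cqes[0]);
		n = io_uring_peek_batch_cqe(&ring, cqes, 16);
		for (i = 0; i < n; i++) {
			if (cqes[i]->res >= 0)
				close(cqes[i]->res);	/* the accepted fd */
			if (!(cqes[i]->flags & IORING_CQE_F_MORE)) {
				io_uring_cq_advance(&ring, n);
				return 0;	/* multishot terminated */
			}
		}
		io_uring_cq_advance(&ring, n);
	}
}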