From e307e6698165ca6508ed42c69cb1be76c8eb6a3c Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze@samba.org>
Date: Thu, 27 Oct 2022 20:34:45 +0200
Subject: io_uring/net: introduce IORING_SEND_ZC_REPORT_USAGE flag

It might be useful for applications to detect if a zero copy transfer with
SEND[MSG]_ZC was actually possible or not. The application can fallback to
plain SEND[MSG] in order to avoid the overhead of two cqes per request. Or
it can generate a log message that could indicate to an administrator that
no zero copy was possible and could explain degraded performance.

Cc: stable@vger.kernel.org # 6.1
Link: https://lore.kernel.org/io-uring/fb6a7599-8a9b-15e5-9b64-6cd9d01c6ff4@gmail.com/T/#m2b0d9df94ce43b0e69e6c089bdff0ce6babbdfaa
Signed-off-by: Stefan Metzmacher <metze@samba.org>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/8945b01756d902f5d5b0667f20b957ad3f742e5e.1666895626.git.metze@samba.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/net.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'io_uring/net.c')

diff --git a/io_uring/net.c b/io_uring/net.c
index ab83da7e80f0..03825b043afe 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -937,7 +937,8 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	zc->flags = READ_ONCE(sqe->ioprio);
 	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST |
-			  IORING_RECVSEND_FIXED_BUF))
+			  IORING_RECVSEND_FIXED_BUF |
+			  IORING_SEND_ZC_REPORT_USAGE))
 		return -EINVAL;
 	notif = zc->notif = io_alloc_notif(ctx);
 	if (!notif)
@@ -955,6 +956,9 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		req->imu = READ_ONCE(ctx->user_bufs[idx]);
 		io_req_set_rsrc_node(notif, ctx, 0);
 	}
+	if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
+		io_notif_to_data(notif)->zc_report = true;
+	}
 
 	if (req->opcode == IORING_OP_SEND_ZC) {
 		if (READ_ONCE(sqe->__pad3[0]))
-- 
cgit 


From 40725d1b960f19a11a1ebd1ab537844ebf39347c Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 4 Nov 2022 10:59:45 +0000
Subject: io_uring: move zc reporting from the hot path

Add custom tw and notif callbacks on top of usual bits also handling zc
reporting. That moves it from the hot path.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/40de4a6409042478e1f35adc4912e23226cb1b5c.1667557923.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/net.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

(limited to 'io_uring/net.c')

diff --git a/io_uring/net.c b/io_uring/net.c
index 03825b043afe..9e3da845f906 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -923,6 +923,9 @@ void io_send_zc_cleanup(struct io_kiocb *req)
 	}
 }
 
+#define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
+#define IO_ZC_FLAGS_VALID  (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE)
+
 int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
@@ -935,11 +938,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (req->flags & REQ_F_CQE_SKIP)
 		return -EINVAL;
 
-	zc->flags = READ_ONCE(sqe->ioprio);
-	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST |
-			  IORING_RECVSEND_FIXED_BUF |
-			  IORING_SEND_ZC_REPORT_USAGE))
-		return -EINVAL;
 	notif = zc->notif = io_alloc_notif(ctx);
 	if (!notif)
 		return -ENOMEM;
@@ -947,6 +945,17 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	notif->cqe.res = 0;
 	notif->cqe.flags = IORING_CQE_F_NOTIF;
 	req->flags |= REQ_F_NEED_CLEANUP;
+
+	zc->flags = READ_ONCE(sqe->ioprio);
+	if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
+		if (zc->flags & ~IO_ZC_FLAGS_VALID)
+			return -EINVAL;
+		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
+			io_notif_set_extended(notif);
+			io_notif_to_data(notif)->zc_report = true;
+		}
+	}
+
 	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
 		unsigned idx = READ_ONCE(sqe->buf_index);
 
@@ -956,9 +965,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		req->imu = READ_ONCE(ctx->user_bufs[idx]);
 		io_req_set_rsrc_node(notif, ctx, 0);
 	}
-	if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
-		io_notif_to_data(notif)->zc_report = true;
-	}
 
 	if (req->opcode == IORING_OP_SEND_ZC) {
 		if (READ_ONCE(sqe->__pad3[0]))
-- 
cgit 


From 42385b02baad0df55474b7f36dc13e0d4ffd0cc0 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 4 Nov 2022 10:59:46 +0000
Subject: io_uring/net: move mm accounting to a slower path

We can also move mm accounting to the extended callbacks. It removes a
few cycles from the hot path including skipping one function call and
setting io_req_task_complete as a callback directly. For user backed I/O
it shouldn't make any difference taking into considering atomic mm
accounting and page pinning.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/1062f270273ad11c1b7b45ec59a6a317533d5e64.1667557923.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/net.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'io_uring/net.c')

diff --git a/io_uring/net.c b/io_uring/net.c
index 9e3da845f906..966019fcbe8c 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1097,6 +1097,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
 			return ret;
 		msg.sg_from_iter = io_sg_from_iter;
 	} else {
+		io_notif_set_extended(zc->notif);
 		ret = import_single_range(WRITE, zc->buf, zc->len, &iov,
 					  &msg.msg_iter);
 		if (unlikely(ret))
@@ -1158,6 +1159,8 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
 	unsigned flags;
 	int ret, min_ret = 0;
 
+	io_notif_set_extended(sr->notif);
+
 	sock = sock_from_file(req->file);
 	if (unlikely(!sock))
 		return -ENOTSOCK;
-- 
cgit 


From df730ec21f7ba395b1b22e7f93a3a85b1d1b7882 Mon Sep 17 00:00:00 2001
From: Xinghui Li <korantli@tencent.com>
Date: Wed, 2 Nov 2022 16:25:03 +0800
Subject: io_uring: fix two assignments in if conditions

Fixes two errors:

"ERROR: do not use assignment in if condition
130: FILE: io_uring/net.c:130:
+       if (!(issue_flags & IO_URING_F_UNLOCKED) &&

ERROR: do not use assignment in if condition
599: FILE: io_uring/poll.c:599:
+       } else if (!(issue_flags & IO_URING_F_UNLOCKED) &&"
reported by checkpatch.pl in net.c and poll.c .

Signed-off-by: Xinghui Li <korantli@tencent.com>
Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/r/20221102082503.32236-1-korantwork@gmail.com
[axboe: style tweaks]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/net.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'io_uring/net.c')

diff --git a/io_uring/net.c b/io_uring/net.c
index 966019fcbe8c..f32e0daeedd8 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -125,13 +125,15 @@ static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req,
 	struct io_cache_entry *entry;
 	struct io_async_msghdr *hdr;
 
-	if (!(issue_flags & IO_URING_F_UNLOCKED) &&
-	    (entry = io_alloc_cache_get(&ctx->netmsg_cache)) != NULL) {
-		hdr = container_of(entry, struct io_async_msghdr, cache);
-		hdr->free_iov = NULL;
-		req->flags |= REQ_F_ASYNC_DATA;
-		req->async_data = hdr;
-		return hdr;
+	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
+		entry = io_alloc_cache_get(&ctx->netmsg_cache);
+		if (entry) {
+			hdr = container_of(entry, struct io_async_msghdr, cache);
+			hdr->free_iov = NULL;
+			req->flags |= REQ_F_ASYNC_DATA;
+			req->async_data = hdr;
+			return hdr;
+		}
 	}
 
 	if (!io_alloc_async_data(req)) {
-- 
cgit 


From 515e26961295bee9da5e26916c27739dca6c10e1 Mon Sep 17 00:00:00 2001
From: Dylan Yudaken <dylany@meta.com>
Date: Mon, 21 Nov 2022 07:43:42 -0700
Subject: io_uring: revert "io_uring fix multishot accept ordering"

This is no longer needed after commit aa1df3a360a0 ("io_uring: fix CQE
reordering"), since all reordering is now taken care of.

This reverts commit cbd25748545c ("io_uring: fix multishot accept
ordering").

Signed-off-by: Dylan Yudaken <dylany@meta.com>
Link: https://lore.kernel.org/r/20221107125236.260132-2-dylany@meta.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/net.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'io_uring/net.c')

diff --git a/io_uring/net.c b/io_uring/net.c
index f32e0daeedd8..f7b2b068464d 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1322,12 +1322,12 @@ retry:
 		return IOU_OK;
 	}
 
-	if (ret >= 0 &&
-	    io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, false))
+	if (ret < 0)
+		return ret;
+	if (io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, true))
 		goto retry;
 
-	io_req_set_res(req, ret, 0);
-	return (issue_flags & IO_URING_F_MULTISHOT) ? IOU_STOP_MULTISHOT : IOU_OK;
+	return -ECANCELED;
 }
 
 int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-- 
cgit 


From e2ad599d1ed38fe743106f10d58a0cbfc00b51e2 Mon Sep 17 00:00:00 2001
From: Dylan Yudaken <dylany@meta.com>
Date: Mon, 7 Nov 2022 04:52:35 -0800
Subject: io_uring: allow multishot recv CQEs to overflow

With commit aa1df3a360a0 ("io_uring: fix CQE reordering"), there are
stronger guarantees for overflow ordering. Specifically ensuring that
userspace will not receive out of order receive CQEs. Therefore this is
not needed any more for recv/recvmsg.

Signed-off-by: Dylan Yudaken <dylany@meta.com>
Link: https://lore.kernel.org/r/20221107125236.260132-4-dylany@meta.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/net.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'io_uring/net.c')

diff --git a/io_uring/net.c b/io_uring/net.c
index f7b2b068464d..0de6f78ad978 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -602,15 +602,11 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
 
 	if (!mshot_finished) {
 		if (io_post_aux_cqe(req->ctx, req->cqe.user_data, *ret,
-				    cflags | IORING_CQE_F_MORE, false)) {
+				    cflags | IORING_CQE_F_MORE, true)) {
 			io_recv_prep_retry(req);
 			return false;
 		}
-		/*
-		 * Otherwise stop multishot but use the current result.
-		 * Probably will end up going into overflow, but this means
-		 * we cannot trust the ordering anymore
-		 */
+		/* Otherwise stop multishot but use the current result. */
 	}
 
 	io_req_set_res(req, *ret, cflags);
-- 
cgit 


From 9b8c54755a2b16d4f23c0ea184b75e2edf77d906 Mon Sep 17 00:00:00 2001
From: Dylan Yudaken <dylany@meta.com>
Date: Thu, 24 Nov 2022 01:35:55 -0800
Subject: io_uring: add io_aux_cqe which allows deferred completion

Use the just introduced deferred post cqe completion state when possible
in io_aux_cqe. If not possible fallback to io_post_aux_cqe.

This introduces a complication because of allow_overflow. For deferred
completions we cannot know without locking the completion_lock if it will
overflow (and even if we locked it, another post could sneak in and cause
this cqe to be in overflow).
However since overflow protection is mostly a best effort defence in depth
to prevent infinite loops of CQEs for poll, just checking the overflow bit
is going to be good enough and will result in at most 16 (array size of
deferred cqes) overflows.

Suggested-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Dylan Yudaken <dylany@meta.com>
Link: https://lore.kernel.org/r/20221124093559.3780686-6-dylany@meta.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/net.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'io_uring/net.c')

diff --git a/io_uring/net.c b/io_uring/net.c
index 0de6f78ad978..90342dcb6b1d 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -601,8 +601,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
 	}
 
 	if (!mshot_finished) {
-		if (io_post_aux_cqe(req->ctx, req->cqe.user_data, *ret,
-				    cflags | IORING_CQE_F_MORE, true)) {
+		if (io_aux_cqe(req->ctx, issue_flags & IO_URING_F_COMPLETE_DEFER,
+			       req->cqe.user_data, *ret, cflags | IORING_CQE_F_MORE, true)) {
 			io_recv_prep_retry(req);
 			return false;
 		}
@@ -1320,7 +1320,8 @@ retry:
 
 	if (ret < 0)
 		return ret;
-	if (io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, true))
+	if (io_aux_cqe(ctx, issue_flags & IO_URING_F_COMPLETE_DEFER,
+		       req->cqe.user_data, ret, IORING_CQE_F_MORE, true))
 		goto retry;
 
 	return -ECANCELED;
-- 
cgit