From d4755e15386c38e4ae532ace5acc29fbfaee42e7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 7 Mar 2023 09:47:20 -0700 Subject: io_uring: avoid hashing O_DIRECT writes if the filesystem doesn't need it io_uring hashes writes to a given file/inode so that it can serialize them. This is useful if the file system needs exclusive access to the file to perform the write, as otherwise we end up with a ton of io-wq threads trying to lock the inode at the same time. This can cause excessive system time. But if the file system has flagged that it supports parallel O_DIRECT writes, then there's no need to serialize the writes. Check for that through FMODE_DIO_PARALLEL_WRITE and don't hash it if we don't need to. In a basic test of 8 threads writing to a file on XFS on a gen2 Optane, with each thread writing in 4k chunks, it improves performance from ~1350K IOPS (or ~5290MiB/sec) to ~1410K IOPS (or ~5500MiB/sec). Reviewed-by: Darrick J. Wong Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 722624b6d0dc..1ed96caa586a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -425,7 +425,13 @@ static void io_prep_async_work(struct io_kiocb *req) req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT; if (req->flags & REQ_F_ISREG) { - if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) + bool should_hash = def->hash_reg_file; + + /* don't serialize this request if the fs doesn't need it */ + if (should_hash && (req->file->f_flags & O_DIRECT) && + (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE)) + should_hash = false; + if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) -- cgit From d808459b2e31bd5123a14258a7a529995db974c8 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 16 Feb 2023 09:09:38 +0100 Subject: io_uring: Adjust mapping wrt architecture aliasing requirements Some architectures have memory cache aliasing requirements (e.g. parisc) if memory is shared between userspace and kernel. This patch fixes the kernel to return an aliased address when asked by userspace via mmap(). Signed-off-by: Helge Deller Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1ed96caa586a..b49b7ee12d60 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -72,6 +72,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -3323,6 +3324,54 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); } +static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); + struct vm_unmapped_area_info info; + void *ptr; + + /* + * Do not allow to map to user-provided address to avoid breaking the + * aliasing rules. Userspace is not able to guess the offset address of + * kernel kmalloc()ed memory area. 
+ */ + if (addr) + return -EINVAL; + + ptr = io_uring_validate_mmap_request(filp, pgoff, len); + if (IS_ERR(ptr)) + return -ENOMEM; + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); +#ifdef SHM_COLOUR + info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL); +#else + info.align_mask = PAGE_MASK & (SHMLBA - 1UL); +#endif + info.align_offset = (unsigned long) ptr; + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + addr = vm_unmapped_area(&info); + if (offset_in_page(addr)) { + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = mmap_end; + addr = vm_unmapped_area(&info); + } + + return addr; +} + #else /* !CONFIG_MMU */ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) @@ -3535,6 +3584,8 @@ static const struct file_operations io_uring_fops = { #ifndef CONFIG_MMU .get_unmapped_area = io_uring_nommu_get_unmapped_area, .mmap_capabilities = io_uring_nommu_mmap_capabilities, +#else + .get_unmapped_area = io_uring_mmu_get_unmapped_area, #endif .poll = io_uring_poll, #ifdef CONFIG_PROC_FS -- cgit From ba56b63242d12df088ed9a701cad320e6b306dfe Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Mar 2023 10:55:50 -0600 Subject: io_uring/kbuf: move pinning of provided buffer ring into helper In preparation for allowing the kernel to allocate the provided buffer rings and have the application mmap it instead, abstract out the current method of pinning and mapping the user allocated ring. No functional changes intended in this patch. 
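For reference, the user allocated setup that io_pin_pbuf_ring() now services looks roughly like this from the application side (a minimal sketch using the raw register syscall; error handling trimmed, and the helper name and ring_fd are illustrative, with ring_fd assumed to be an already initialized io_uring instance):

#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static struct io_uring_buf_ring *setup_user_pbuf_ring(int ring_fd,
						       unsigned int entries,
						       unsigned short bgid)
{
	struct io_uring_buf_reg reg;
	void *ring;

	/* ring must be page aligned, as the kernel pins and maps it */
	if (posix_memalign(&ring, sysconf(_SC_PAGESIZE),
			   entries * sizeof(struct io_uring_buf)))
		return NULL;
	/* tail starts at 0 */
	memset(ring, 0, entries * sizeof(struct io_uring_buf));

	memset(&reg, 0, sizeof(reg));
	reg.ring_addr = (unsigned long) ring;
	reg.ring_entries = entries;	/* must be a power of 2 */
	reg.bgid = bgid;

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_PBUF_RING, &reg, 1) < 0) {
		free(ring);
		return NULL;
	}
	return ring;
}
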
Acked-by: Helge Deller Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) (limited to 'io_uring') diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 3002dc827195..3adc08f90e41 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -463,14 +463,32 @@ err: return IOU_OK; } -int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, + struct io_buffer_list *bl) { struct io_uring_buf_ring *br; - struct io_uring_buf_reg reg; - struct io_buffer_list *bl, *free_bl = NULL; struct page **pages; int nr_pages; + pages = io_pin_pages(reg->ring_addr, + flex_array_size(br, bufs, reg->ring_entries), + &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + br = page_address(pages[0]); + bl->buf_pages = pages; + bl->buf_nr_pages = nr_pages; + bl->buf_ring = br; + return 0; +} + +int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_reg reg; + struct io_buffer_list *bl, *free_bl = NULL; + int ret; + if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; @@ -504,20 +522,15 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return -ENOMEM; } - pages = io_pin_pages(reg.ring_addr, - flex_array_size(br, bufs, reg.ring_entries), - &nr_pages); - if (IS_ERR(pages)) { + ret = io_pin_pbuf_ring(®, bl); + if (ret) { kfree(free_bl); - return PTR_ERR(pages); + return ret; } - br = page_address(pages[0]); - bl->buf_pages = pages; - bl->buf_nr_pages = nr_pages; bl->nr_entries = reg.ring_entries; - bl->buf_ring = br; bl->mask = reg.ring_entries - 1; + io_buffer_add_list(ctx, bl, reg.bgid); return 0; } -- cgit From 25a2c188a0a00b3d9f2057798aa86fe6b04377bf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Mar 2023 10:59:46 -0600 Subject: io_uring/kbuf: add buffer_list->is_mapped member Rather than rely on checking buffer_list->buf_pages or ->buf_nr_pages, add a separate member that tracks if this is a ring mapped provided buffer list or not. 
Acked-by: Helge Deller Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 14 ++++++++------ io_uring/kbuf.h | 3 +++ 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'io_uring') diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 3adc08f90e41..db5f189267b7 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -179,7 +179,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, bl = io_buffer_get_list(ctx, req->buf_index); if (likely(bl)) { - if (bl->buf_nr_pages) + if (bl->is_mapped) ret = io_ring_buffer_select(req, len, bl, issue_flags); else ret = io_provided_buffer_select(req, len, bl); @@ -214,7 +214,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (!nbufs) return 0; - if (bl->buf_nr_pages) { + if (bl->is_mapped && bl->buf_nr_pages) { int j; i = bl->buf_ring->tail - bl->head; @@ -225,6 +225,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, bl->buf_nr_pages = 0; /* make sure it's seen as empty */ INIT_LIST_HEAD(&bl->buf_list); + bl->is_mapped = 0; return i; } @@ -303,7 +304,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) if (bl) { ret = -EINVAL; /* can't use provide/remove buffers command on mapped buffers */ - if (!bl->buf_nr_pages) + if (!bl->is_mapped) ret = __io_remove_buffers(ctx, bl, p->nbufs); } io_ring_submit_unlock(ctx, issue_flags); @@ -448,7 +449,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) } } /* can't add buffers via this command for a mapped buffer ring */ - if (bl->buf_nr_pages) { + if (bl->is_mapped) { ret = -EINVAL; goto err; } @@ -480,6 +481,7 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, bl->buf_pages = pages; bl->buf_nr_pages = nr_pages; bl->buf_ring = br; + bl->is_mapped = 1; return 0; } @@ -514,7 +516,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, reg.bgid); if (bl) { /* if mapped buffer ring OR classic exists, don't allow */ - if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) + if (bl->is_mapped || !list_empty(&bl->buf_list)) return -EEXIST; } else { free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); @@ -548,7 +550,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, reg.bgid); if (!bl) return -ENOENT; - if (!bl->buf_nr_pages) + if (!bl->is_mapped) return -EINVAL; __io_remove_buffers(ctx, bl, -1U); diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index c23e15d7d3ca..61b9c7dade9d 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -23,6 +23,9 @@ struct io_buffer_list { __u16 nr_entries; __u16 head; __u16 mask; + + /* ring mapped provided buffers */ + __u8 is_mapped; }; struct io_buffer { -- cgit From 81cf17cd3ab3e5441e876a8e9e9c38ae9920cecb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Mar 2023 11:01:45 -0600 Subject: io_uring/kbuf: rename struct io_uring_buf_reg 'pad' to'flags' In preparation for allowing flags to be set for registration, rename the padding and use it for that. 
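The matching layout change to struct io_uring_buf_reg lives in include/uapi/linux/io_uring.h and hence is not part of the io_uring/ diff below; after the rename the registration descriptor is roughly:

struct io_uring_buf_reg {
	__u64	ring_addr;
	__u32	ring_entries;
	__u16	bgid;
	__u16	flags;		/* was 'pad', must still be zero for now */
	__u64	resv[3];
};
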
Acked-by: Helge Deller Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index db5f189267b7..4b2f4a0ee962 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -494,7 +494,9 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + if (reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + if (reg.flags) return -EINVAL; if (!reg.ring_addr) return -EFAULT; @@ -544,7 +546,9 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + if (reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + if (reg.flags) return -EINVAL; bl = io_buffer_get_list(ctx, reg.bgid); -- cgit From c56e022c0a27142b7b59ae6bdf45f86bf4b298a1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Mar 2023 11:07:19 -0600 Subject: io_uring: add support for user mapped provided buffer ring The ring mapped provided buffer rings rely on the application allocating the memory for the ring, and then the kernel will map it. This generally works fine, but runs into issues on some architectures where we need to be able to ensure that the kernel and application virtual address for the ring play nicely together. This at least impacts architectures that set SHM_COLOUR, but potentially also anyone setting SHMLBA. To use this variant of ring provided buffers, the application need not allocate any memory for the ring. Instead the kernel will do so, and the allocation must subsequently call mmap(2) on the ring with the offset set to: IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) to get a virtual address for the buffer ring. Normally the application would allocate a suitable piece of memory (and correctly aligned) and simply pass that in via io_uring_buf_reg.ring_addr and the kernel would map it. Outside of the setup differences, the kernel allocate + user mapped provided buffer ring works exactly the same. 
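A rough userspace sketch of the new flow (raw syscalls, error handling trimmed; the helper name and ring_fd are illustrative, with ring_fd assumed to be an already initialized io_uring instance):

#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static struct io_uring_buf_ring *setup_mmap_pbuf_ring(int ring_fd,
						       unsigned int entries,
						       unsigned short bgid)
{
	struct io_uring_buf_reg reg;
	void *br;

	/* the kernel allocates the ring memory, so ring_addr stays 0 */
	memset(&reg, 0, sizeof(reg));
	reg.ring_entries = entries;	/* must be a power of 2 */
	reg.bgid = bgid;
	reg.flags = IOU_PBUF_RING_MMAP;

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_PBUF_RING, &reg, 1) < 0)
		return NULL;

	/* map the kernel allocated ring through the io_uring fd */
	br = mmap(NULL, entries * sizeof(struct io_uring_buf),
		  PROT_READ | PROT_WRITE, MAP_SHARED, ring_fd,
		  IORING_OFF_PBUF_RING |
		  ((__u64) bgid << IORING_OFF_PBUF_SHIFT));
	if (br == MAP_FAILED)
		return NULL;

	return br;
}

Once mapped, buffers are added and consumed exactly as with an application allocated ring.
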
Acked-by: Helge Deller Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 13 ++++++- io_uring/kbuf.c | 99 ++++++++++++++++++++++++++++++++++++++++------------- io_uring/kbuf.h | 4 +++ 3 files changed, 92 insertions(+), 24 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b49b7ee12d60..d72aa92ce2d6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3289,7 +3289,7 @@ static void *io_uring_validate_mmap_request(struct file *file, struct page *page; void *ptr; - switch (offset) { + switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: ptr = ctx->rings; @@ -3297,6 +3297,17 @@ static void *io_uring_validate_mmap_request(struct file *file, case IORING_OFF_SQES: ptr = ctx->sq_sqes; break; + case IORING_OFF_PBUF_RING: { + unsigned int bgid; + + bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + mutex_lock(&ctx->uring_lock); + ptr = io_pbuf_get_address(ctx, bgid); + mutex_unlock(&ctx->uring_lock); + if (!ptr) + return ERR_PTR(-EINVAL); + break; + } default: return ERR_PTR(-EINVAL); } diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 4b2f4a0ee962..cd1d9dddf58e 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -137,7 +137,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, return NULL; head &= bl->mask; - if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { + /* mmaped buffers are always contig */ + if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) { buf = &br->bufs[head]; } else { int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); @@ -214,15 +215,27 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (!nbufs) return 0; - if (bl->is_mapped && bl->buf_nr_pages) { - int j; - + if (bl->is_mapped) { i = bl->buf_ring->tail - bl->head; - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - kvfree(bl->buf_pages); - bl->buf_pages = NULL; - bl->buf_nr_pages = 0; + if (bl->is_mmap) { + if (bl->buf_ring) { + struct page *page; + + page = virt_to_head_page(bl->buf_ring); + if (put_page_testzero(page)) + free_compound_page(page); + bl->buf_ring = NULL; + } + bl->is_mmap = 0; + } else if (bl->buf_nr_pages) { + int j; + + for (j = 0; j < bl->buf_nr_pages; j++) + unpin_user_page(bl->buf_pages[j]); + kvfree(bl->buf_pages); + bl->buf_pages = NULL; + bl->buf_nr_pages = 0; + } /* make sure it's seen as empty */ INIT_LIST_HEAD(&bl->buf_list); bl->is_mapped = 0; @@ -482,6 +495,25 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, bl->buf_nr_pages = nr_pages; bl->buf_ring = br; bl->is_mapped = 1; + bl->is_mmap = 0; + return 0; +} + +static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg, + struct io_buffer_list *bl) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; + size_t ring_size; + void *ptr; + + ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); + ptr = (void *) __get_free_pages(gfp, get_order(ring_size)); + if (!ptr) + return -ENOMEM; + + bl->buf_ring = ptr; + bl->is_mapped = 1; + bl->is_mmap = 1; return 0; } @@ -496,12 +528,18 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (reg.resv[0] || reg.resv[1] || reg.resv[2]) return -EINVAL; - if (reg.flags) - return -EINVAL; - if (!reg.ring_addr) - return -EFAULT; - if (reg.ring_addr & ~PAGE_MASK) + if (reg.flags & ~IOU_PBUF_RING_MMAP) return -EINVAL; + if (!(reg.flags & IOU_PBUF_RING_MMAP)) { + if (!reg.ring_addr) + return -EFAULT; + if (reg.ring_addr & ~PAGE_MASK) + return -EINVAL; + } else { + if 
(reg.ring_addr) + return -EINVAL; + } + if (!is_power_of_2(reg.ring_entries)) return -EINVAL; @@ -526,17 +564,21 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return -ENOMEM; } - ret = io_pin_pbuf_ring(®, bl); - if (ret) { - kfree(free_bl); - return ret; - } + if (!(reg.flags & IOU_PBUF_RING_MMAP)) + ret = io_pin_pbuf_ring(®, bl); + else + ret = io_alloc_pbuf_ring(®, bl); - bl->nr_entries = reg.ring_entries; - bl->mask = reg.ring_entries - 1; + if (!ret) { + bl->nr_entries = reg.ring_entries; + bl->mask = reg.ring_entries - 1; - io_buffer_add_list(ctx, bl, reg.bgid); - return 0; + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; + } + + kfree(free_bl); + return ret; } int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) @@ -564,3 +606,14 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) } return 0; } + +void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) +{ + struct io_buffer_list *bl; + + bl = io_buffer_get_list(ctx, bgid); + if (!bl || !bl->is_mmap) + return NULL; + + return bl->buf_ring; +} diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 61b9c7dade9d..d14345ef61fc 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -26,6 +26,8 @@ struct io_buffer_list { /* ring mapped provided buffers */ __u8 is_mapped; + /* ring mapped provided buffers, but mmap'ed by application */ + __u8 is_mmap; }; struct io_buffer { @@ -53,6 +55,8 @@ unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); +void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid); + static inline void io_kbuf_recycle_ring(struct io_kiocb *req) { /* -- cgit From da64d6db3bd304d44d7ac1eb7f319a1cc7efd611 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 10 Mar 2023 12:11:07 -0800 Subject: io_uring: One wqe per wq Right now io_wq allocates one io_wqe per NUMA node. As io_wq is now bound to a task, the task basically uses only the NUMA local io_wqe, and almost never changes NUMA nodes, thus, the other wqes are mostly unused. Allocate just one io_wqe embedded into io_wq, and uses all possible cpus (cpu_possible_mask) in the io_wqe->cpumask. 
Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20230310201107.4020580-1-leitao@debian.org Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 180 ++++++++++++++++++++++--------------------------------- 1 file changed, 70 insertions(+), 110 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index f81c0a7136a5..44d522c5d36f 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "io-wq.h" @@ -96,8 +97,6 @@ struct io_wqe { raw_spinlock_t lock; struct io_wqe_acct acct[IO_WQ_ACCT_NR]; - int node; - struct hlist_nulls_head free_list; struct list_head all_list; @@ -127,7 +126,7 @@ struct io_wq { struct task_struct *task; - struct io_wqe *wqes[]; + struct io_wqe wqe; }; static enum cpuhp_state io_wq_online; @@ -754,7 +753,7 @@ static void create_worker_cont(struct callback_head *cb) worker = container_of(cb, struct io_worker, create_work); clear_bit_unlock(0, &worker->create_state); wqe = worker->wqe; - tsk = create_io_thread(io_wqe_worker, worker, wqe->node); + tsk = create_io_thread(io_wqe_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { io_init_new_worker(wqe, worker, tsk); io_worker_release(worker); @@ -804,7 +803,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) __set_current_state(TASK_RUNNING); - worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); + worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) { fail: atomic_dec(&acct->nr_running); @@ -823,7 +822,7 @@ fail: if (index == IO_WQ_ACCT_BOUND) worker->flags |= IO_WORKER_F_BOUND; - tsk = create_io_thread(io_wqe_worker, worker, wqe->node); + tsk = create_io_thread(io_wqe_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { io_init_new_worker(wqe, worker, tsk); } else if (!io_should_retry_thread(PTR_ERR(tsk))) { @@ -961,7 +960,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { - struct io_wqe *wqe = wq->wqes[numa_node_id()]; + struct io_wqe *wqe = &wq->wqe; io_wqe_enqueue(wqe, work); } @@ -1083,7 +1082,7 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, .data = data, .cancel_all = cancel_all, }; - int node; + struct io_wqe *wqe = &wq->wqe; /* * First check pending list, if we're lucky we can just remove it @@ -1098,19 +1097,15 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, * Do both of these while holding the wqe->lock, to ensure that * we'll find a work item regardless of state. 
*/ - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - io_wqe_cancel_pending_work(wqe, &match); - if (match.nr_pending && !match.cancel_all) - return IO_WQ_CANCEL_OK; + io_wqe_cancel_pending_work(wqe, &match); + if (match.nr_pending && !match.cancel_all) + return IO_WQ_CANCEL_OK; - raw_spin_lock(&wqe->lock); - io_wqe_cancel_running_work(wqe, &match); - raw_spin_unlock(&wqe->lock); - if (match.nr_running && !match.cancel_all) - return IO_WQ_CANCEL_RUNNING; - } + raw_spin_lock(&wqe->lock); + io_wqe_cancel_running_work(wqe, &match); + raw_spin_unlock(&wqe->lock); + if (match.nr_running && !match.cancel_all) + return IO_WQ_CANCEL_RUNNING; if (match.nr_running) return IO_WQ_CANCEL_RUNNING; @@ -1140,15 +1135,16 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { - int ret, node, i; + int ret, i; struct io_wq *wq; + struct io_wqe *wqe; if (WARN_ON_ONCE(!data->free_work || !data->do_work)) return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(!bounded)) return ERR_PTR(-EINVAL); - wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL); + wq = kzalloc(sizeof(struct io_wq), GFP_KERNEL); if (!wq) return ERR_PTR(-ENOMEM); ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node); @@ -1159,40 +1155,30 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) wq->hash = data->hash; wq->free_work = data->free_work; wq->do_work = data->do_work; + wqe = &wq->wqe; ret = -ENOMEM; - for_each_node(node) { - struct io_wqe *wqe; - int alloc_node = node; - - if (!node_online(alloc_node)) - alloc_node = NUMA_NO_NODE; - wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node); - if (!wqe) - goto err; - wq->wqes[node] = wqe; - if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL)) - goto err; - cpumask_copy(wqe->cpu_mask, cpumask_of_node(node)); - wqe->node = alloc_node; - wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; - wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = - task_rlimit(current, RLIMIT_NPROC); - INIT_LIST_HEAD(&wqe->wait.entry); - wqe->wait.func = io_wqe_hash_wake; - for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = &wqe->acct[i]; - - acct->index = i; - atomic_set(&acct->nr_running, 0); - INIT_WQ_LIST(&acct->work_list); - raw_spin_lock_init(&acct->lock); - } - wqe->wq = wq; - raw_spin_lock_init(&wqe->lock); - INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); - INIT_LIST_HEAD(&wqe->all_list); + + if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL)) + goto err; + cpumask_copy(wqe->cpu_mask, cpu_possible_mask); + wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; + wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = + task_rlimit(current, RLIMIT_NPROC); + INIT_LIST_HEAD(&wqe->wait.entry); + wqe->wait.func = io_wqe_hash_wake; + for (i = 0; i < IO_WQ_ACCT_NR; i++) { + struct io_wqe_acct *acct = &wqe->acct[i]; + + acct->index = i; + atomic_set(&acct->nr_running, 0); + INIT_WQ_LIST(&acct->work_list); + raw_spin_lock_init(&acct->lock); } + wqe->wq = wq; + raw_spin_lock_init(&wqe->lock); + INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); + INIT_LIST_HEAD(&wqe->all_list); wq->task = get_task_struct(data->task); atomic_set(&wq->worker_refs, 1); @@ -1201,12 +1187,8 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) err: io_wq_put_hash(data->hash); cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - for_each_node(node) { - if (!wq->wqes[node]) - continue; - free_cpumask_var(wq->wqes[node]->cpu_mask); - kfree(wq->wqes[node]); - } + + 
free_cpumask_var(wq->wqe.cpu_mask); err_wq: kfree(wq); return ERR_PTR(ret); @@ -1247,48 +1229,36 @@ static void io_wq_cancel_tw_create(struct io_wq *wq) static void io_wq_exit_workers(struct io_wq *wq) { - int node; - if (!wq->task) return; io_wq_cancel_tw_create(wq); rcu_read_lock(); - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL); - } + io_wq_for_each_worker(&wq->wqe, io_wq_worker_wake, NULL); rcu_read_unlock(); io_worker_ref_put(wq); wait_for_completion(&wq->worker_done); - for_each_node(node) { - spin_lock_irq(&wq->hash->wait.lock); - list_del_init(&wq->wqes[node]->wait.entry); - spin_unlock_irq(&wq->hash->wait.lock); - } + spin_lock_irq(&wq->hash->wait.lock); + list_del_init(&wq->wqe.wait.entry); + spin_unlock_irq(&wq->hash->wait.lock); + put_task_struct(wq->task); wq->task = NULL; } static void io_wq_destroy(struct io_wq *wq) { - int node; + struct io_cb_cancel_data match = { + .fn = io_wq_work_match_all, + .cancel_all = true, + }; + struct io_wqe *wqe = &wq->wqe; cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - struct io_cb_cancel_data match = { - .fn = io_wq_work_match_all, - .cancel_all = true, - }; - io_wqe_cancel_pending_work(wqe, &match); - free_cpumask_var(wqe->cpu_mask); - kfree(wqe); - } + io_wqe_cancel_pending_work(wqe, &match); + free_cpumask_var(wqe->cpu_mask); io_wq_put_hash(wq->hash); kfree(wq); } @@ -1323,11 +1293,9 @@ static int __io_wq_cpu_online(struct io_wq *wq, unsigned int cpu, bool online) .cpu = cpu, .online = online }; - int i; rcu_read_lock(); - for_each_node(i) - io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, &od); + io_wq_for_each_worker(&wq->wqe, io_wq_worker_affinity, &od); rcu_read_unlock(); return 0; } @@ -1348,18 +1316,15 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node) int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) { - int i; + struct io_wqe *wqe = &wq->wqe; rcu_read_lock(); - for_each_node(i) { - struct io_wqe *wqe = wq->wqes[i]; - - if (mask) - cpumask_copy(wqe->cpu_mask, mask); - else - cpumask_copy(wqe->cpu_mask, cpumask_of_node(i)); - } + if (mask) + cpumask_copy(wqe->cpu_mask, mask); + else + cpumask_copy(wqe->cpu_mask, cpu_possible_mask); rcu_read_unlock(); + return 0; } @@ -1369,9 +1334,10 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) */ int io_wq_max_workers(struct io_wq *wq, int *new_count) { + struct io_wqe *wqe = &wq->wqe; + struct io_wqe_acct *acct; int prev[IO_WQ_ACCT_NR]; - bool first_node = true; - int i, node; + int i; BUILD_BUG_ON((int) IO_WQ_ACCT_BOUND != (int) IO_WQ_BOUND); BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND); @@ -1386,21 +1352,15 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) prev[i] = 0; rcu_read_lock(); - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - struct io_wqe_acct *acct; - raw_spin_lock(&wqe->lock); - for (i = 0; i < IO_WQ_ACCT_NR; i++) { - acct = &wqe->acct[i]; - if (first_node) - prev[i] = max_t(int, acct->max_workers, prev[i]); - if (new_count[i]) - acct->max_workers = new_count[i]; - } - raw_spin_unlock(&wqe->lock); - first_node = false; + raw_spin_lock(&wqe->lock); + for (i = 0; i < IO_WQ_ACCT_NR; i++) { + acct = &wqe->acct[i]; + prev[i] = max_t(int, acct->max_workers, prev[i]); + if (new_count[i]) + acct->max_workers = new_count[i]; } + raw_spin_unlock(&wqe->lock); rcu_read_unlock(); for (i = 0; i < IO_WQ_ACCT_NR; i++) -- cgit From 
efba1a9e653e107577a48157b5424878c46f2285 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 23 Feb 2023 08:43:52 -0800 Subject: io_uring: Move from hlist to io_wq_work_node Having cache entries linked using the hlist format brings no benefit, and also requires an unnecessary extra pointer address per cache entry. Use the internal io_wq_work_node single-linked list for the internal alloc caches (async_msghdr and async_poll) This is required to be able to use KASAN on cache entries, since we do not need to touch unused (and poisoned) cache entries when adding more entries to the list. Suggested-by: Pavel Begunkov Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20230223164353.2839177-2-leitao@debian.org Signed-off-by: Jens Axboe --- io_uring/alloc_cache.h | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'io_uring') diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index c2cde88aeed5..aaa838c31d92 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -7,7 +7,7 @@ #define IO_ALLOC_CACHE_MAX 512 struct io_cache_entry { - struct hlist_node node; + struct io_wq_work_node node; }; static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, @@ -15,7 +15,7 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, { if (cache->nr_cached < IO_ALLOC_CACHE_MAX) { cache->nr_cached++; - hlist_add_head(&entry->node, &cache->list); + wq_stack_add_head(&entry->node, &cache->list); return true; } return false; @@ -23,12 +23,13 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) { - if (!hlist_empty(&cache->list)) { - struct hlist_node *node = cache->list.first; + if (cache->list.next) { + struct io_cache_entry *entry; - hlist_del(node); + entry = container_of(cache->list.next, struct io_cache_entry, node); + cache->list.next = cache->list.next->next; cache->nr_cached--; - return container_of(node, struct io_cache_entry, node); + return entry; } return NULL; @@ -36,18 +37,19 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c static inline void io_alloc_cache_init(struct io_alloc_cache *cache) { - INIT_HLIST_HEAD(&cache->list); + cache->list.next = NULL; cache->nr_cached = 0; } static inline void io_alloc_cache_free(struct io_alloc_cache *cache, void (*free)(struct io_cache_entry *)) { - while (!hlist_empty(&cache->list)) { - struct hlist_node *node = cache->list.first; + while (1) { + struct io_cache_entry *entry = io_alloc_cache_get(cache); - hlist_del(node); - free(container_of(node, struct io_cache_entry, node)); + if (!entry) + break; + free(entry); } cache->nr_cached = 0; } -- cgit From e1fe7ee885dc0712e982ee465d9f8b96254c30c1 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 23 Feb 2023 08:43:53 -0800 Subject: io_uring: Add KASAN support for alloc_caches Add support for KASAN in the alloc_caches (apoll and netmsg_cache). Thus, if something touches the unused caches, it will raise a KASAN warning/exception. It poisons the object when the object is put to the cache, and unpoisons it when the object is gotten or freed. 
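The corresponding struct io_alloc_cache update is in include/linux/io_uring_types.h and therefore not shown in the io_uring/ diff below; after this patch the cache is roughly:

struct io_alloc_cache {
	struct io_wq_work_node	list;		/* single linked LIFO stack */
	unsigned int		nr_cached;
	/* recorded at init so the whole object can be unpoisoned on get */
	size_t			elem_size;
};
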
Signed-off-by: Breno Leitao Reviewed-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20230223164353.2839177-2-leitao@debian.org Signed-off-by: Jens Axboe --- io_uring/alloc_cache.h | 6 +++++- io_uring/io_uring.c | 4 ++-- io_uring/net.h | 5 ++++- 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index aaa838c31d92..2fbecaa3a1ba 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -16,6 +16,8 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, if (cache->nr_cached < IO_ALLOC_CACHE_MAX) { cache->nr_cached++; wq_stack_add_head(&entry->node, &cache->list); + /* KASAN poisons object */ + kasan_slab_free_mempool(entry); return true; } return false; @@ -27,6 +29,7 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c struct io_cache_entry *entry; entry = container_of(cache->list.next, struct io_cache_entry, node); + kasan_unpoison_range(entry, cache->elem_size); cache->list.next = cache->list.next->next; cache->nr_cached--; return entry; @@ -35,10 +38,11 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c return NULL; } -static inline void io_alloc_cache_init(struct io_alloc_cache *cache) +static inline void io_alloc_cache_init(struct io_alloc_cache *cache, size_t size) { cache->list.next = NULL; cache->nr_cached = 0; + cache->elem_size = size; } static inline void io_alloc_cache_free(struct io_alloc_cache *cache, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d72aa92ce2d6..24be4992821b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -310,8 +310,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); - io_alloc_cache_init(&ctx->apoll_cache); - io_alloc_cache_init(&ctx->netmsg_cache); + io_alloc_cache_init(&ctx->apoll_cache, sizeof(struct async_poll)); + io_alloc_cache_init(&ctx->netmsg_cache, sizeof(struct io_async_msghdr)); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); diff --git a/io_uring/net.h b/io_uring/net.h index 5ffa11bf5d2e..191009979bcb 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -5,8 +5,8 @@ #include "alloc_cache.h" -#if defined(CONFIG_NET) struct io_async_msghdr { +#if defined(CONFIG_NET) union { struct iovec fast_iov[UIO_FASTIOV]; struct { @@ -22,8 +22,11 @@ struct io_async_msghdr { struct sockaddr __user *uaddr; struct msghdr msg; struct sockaddr_storage addr; +#endif }; +#if defined(CONFIG_NET) + struct io_async_connect { struct sockaddr_storage address; }; -- cgit From fcb46c0ccc7c07af54f818fd498e461353ea50e7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Mar 2023 10:42:08 -0600 Subject: io_uring/kbuf: disallow mapping a badly aligned provided ring buffer On at least parisc, we have strict requirements on how we virtually map an address that is shared between the application and the kernel. On these platforms, IOU_PBUF_RING_MMAP should be used when setting up a shared ring buffer for provided buffers. If the application is mapping these pages and asking the kernel to pin+map them as well, then we have no control over what virtual address we get in the kernel. For that case, do a sanity check if SHM_COLOUR is defined, and disallow the mapping request. 
The application must fall back to using IOU_PBUF_RING_MMAP for this case, and liburing will do that transparently with the set of helpers that it has. Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'io_uring') diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index cd1d9dddf58e..79c25459e8de 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -491,6 +491,24 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, return PTR_ERR(pages); br = page_address(pages[0]); +#ifdef SHM_COLOUR + /* + * On platforms that have specific aliasing requirements, SHM_COLOUR + * is set and we must guarantee that the kernel and user side align + * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and + * the application mmap's the provided ring buffer. Fail the request + * if we, by chance, don't end up with aligned addresses. The app + * should use IOU_PBUF_RING_MMAP instead, and liburing will handle + * this transparently. + */ + if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) { + int i; + + for (i = 0; i < nr_pages; i++) + unpin_user_page(pages[i]); + return -EINVAL; + } +#endif bl->buf_pages = pages; bl->buf_nr_pages = nr_pages; bl->buf_ring = br; -- cgit From dfd63baf892c016dd0c8c6e99d0973459aabe554 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 21 Mar 2023 22:16:27 -0300 Subject: io-wq: Move wq accounting to io_wq Since we now have a single io_wqe per io_wq instead of per-node, and in preparation to its removal, move the accounting into the parent structure. Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20230322011628.23359-2-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 78 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 38 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 44d522c5d36f..da7c6e00b690 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -74,7 +74,7 @@ struct io_worker { #define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER) -struct io_wqe_acct { +struct io_wq_acct { unsigned nr_workers; unsigned max_workers; int index; @@ -95,7 +95,6 @@ enum { */ struct io_wqe { raw_spinlock_t lock; - struct io_wqe_acct acct[IO_WQ_ACCT_NR]; struct hlist_nulls_head free_list; struct list_head all_list; @@ -126,6 +125,8 @@ struct io_wq { struct task_struct *task; + struct io_wq_acct acct[IO_WQ_ACCT_NR]; + struct io_wqe wqe; }; @@ -142,7 +143,7 @@ struct io_cb_cancel_data { static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index); static void io_wqe_dec_running(struct io_worker *worker); static bool io_acct_cancel_pending_work(struct io_wqe *wqe, - struct io_wqe_acct *acct, + struct io_wq_acct *acct, struct io_cb_cancel_data *match); static void create_worker_cb(struct callback_head *cb); static void io_wq_cancel_tw_create(struct io_wq *wq); @@ -158,20 +159,20 @@ static void io_worker_release(struct io_worker *worker) complete(&worker->ref_done); } -static inline struct io_wqe_acct *io_get_acct(struct io_wqe *wqe, bool bound) +static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound) { - return &wqe->acct[bound ? IO_WQ_ACCT_BOUND : IO_WQ_ACCT_UNBOUND]; + return &wq->acct[bound ? 
IO_WQ_ACCT_BOUND : IO_WQ_ACCT_UNBOUND]; } -static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe, - struct io_wq_work *work) +static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, + struct io_wq_work *work) { - return io_get_acct(wqe, !(work->flags & IO_WQ_WORK_UNBOUND)); + return io_get_acct(wq, !(work->flags & IO_WQ_WORK_UNBOUND)); } -static inline struct io_wqe_acct *io_wqe_get_acct(struct io_worker *worker) +static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) { - return io_get_acct(worker->wqe, worker->flags & IO_WORKER_F_BOUND); + return io_get_acct(worker->wqe->wq, worker->flags & IO_WORKER_F_BOUND); } static void io_worker_ref_put(struct io_wq *wq) @@ -182,7 +183,7 @@ static void io_worker_ref_put(struct io_wq *wq) static void io_worker_cancel_cb(struct io_worker *worker) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq_acct *acct = io_wq_get_acct(worker); struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; @@ -238,7 +239,7 @@ static void io_worker_exit(struct io_worker *worker) do_exit(0); } -static inline bool io_acct_run_queue(struct io_wqe_acct *acct) +static inline bool io_acct_run_queue(struct io_wq_acct *acct) { bool ret = false; @@ -256,7 +257,7 @@ static inline bool io_acct_run_queue(struct io_wqe_acct *acct) * caller must create one. */ static bool io_wqe_activate_free_worker(struct io_wqe *wqe, - struct io_wqe_acct *acct) + struct io_wq_acct *acct) __must_hold(RCU) { struct hlist_nulls_node *n; @@ -270,7 +271,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe, hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) { if (!io_worker_get(worker)) continue; - if (io_wqe_get_acct(worker) != acct) { + if (io_wq_get_acct(worker) != acct) { io_worker_release(worker); continue; } @@ -288,7 +289,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe, * We need a worker. If we find a free one, we're good. If not, and we're * below the max number of workers, create one. 
*/ -static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) +static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wq_acct *acct) { /* * Most likely an attempt to queue unbounded work on an io_wq that @@ -311,7 +312,7 @@ static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) static void io_wqe_inc_running(struct io_worker *worker) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq_acct *acct = io_wq_get_acct(worker); atomic_inc(&acct->nr_running); } @@ -321,13 +322,13 @@ static void create_worker_cb(struct callback_head *cb) struct io_worker *worker; struct io_wq *wq; struct io_wqe *wqe; - struct io_wqe_acct *acct; + struct io_wq_acct *acct; bool do_create = false; worker = container_of(cb, struct io_worker, create_work); wqe = worker->wqe; wq = wqe->wq; - acct = &wqe->acct[worker->create_index]; + acct = &wq->acct[worker->create_index]; raw_spin_lock(&wqe->lock); if (acct->nr_workers < acct->max_workers) { acct->nr_workers++; @@ -345,7 +346,7 @@ static void create_worker_cb(struct callback_head *cb) } static bool io_queue_worker_create(struct io_worker *worker, - struct io_wqe_acct *acct, + struct io_wq_acct *acct, task_work_func_t func) { struct io_wqe *wqe = worker->wqe; @@ -393,7 +394,7 @@ fail: static void io_wqe_dec_running(struct io_worker *worker) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq_acct *acct = io_wq_get_acct(worker); struct io_wqe *wqe = worker->wqe; if (!(worker->flags & IO_WORKER_F_UP)) @@ -462,7 +463,7 @@ static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) return ret; } -static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, +static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct, struct io_worker *worker) __must_hold(acct->lock) { @@ -537,7 +538,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); static void io_worker_handle_work(struct io_worker *worker) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq_acct *acct = io_wq_get_acct(worker); struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); @@ -612,7 +613,7 @@ static void io_worker_handle_work(struct io_worker *worker) static int io_wqe_worker(void *data) { struct io_worker *worker = data; - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq_acct *acct = io_wq_get_acct(worker); struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; bool exit_mask = false, last_timeout = false; @@ -759,7 +760,7 @@ static void create_worker_cont(struct callback_head *cb) io_worker_release(worker); return; } else if (!io_should_retry_thread(PTR_ERR(tsk))) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq_acct *acct = io_wq_get_acct(worker); atomic_dec(&acct->nr_running); raw_spin_lock(&wqe->lock); @@ -789,7 +790,7 @@ static void create_worker_cont(struct callback_head *cb) static void io_workqueue_create(struct work_struct *work) { struct io_worker *worker = container_of(work, struct io_worker, work); - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq_acct *acct = io_wq_get_acct(worker); if (!io_queue_worker_create(worker, acct, create_worker_cont)) kfree(worker); @@ -797,7 +798,7 @@ static void io_workqueue_create(struct work_struct *work) static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) { - struct io_wqe_acct *acct = &wqe->acct[index]; + struct io_wq_acct *acct = &wq->acct[index]; struct 
io_worker *worker; struct task_struct *tsk; @@ -881,7 +882,7 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work) { - struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + struct io_wq_acct *acct = io_work_get_acct(wqe->wq, work); unsigned int hash; struct io_wq_work *tail; @@ -907,7 +908,7 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { - struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + struct io_wq_acct *acct = io_work_get_acct(wqe->wq, work); struct io_cb_cancel_data match; unsigned work_flags = work->flags; bool do_create; @@ -1011,7 +1012,7 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe, struct io_wq_work *work, struct io_wq_work_node *prev) { - struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + struct io_wq_acct *acct = io_work_get_acct(wqe->wq, work); unsigned int hash = io_get_work_hash(work); struct io_wq_work *prev_work = NULL; @@ -1027,7 +1028,7 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe, } static bool io_acct_cancel_pending_work(struct io_wqe *wqe, - struct io_wqe_acct *acct, + struct io_wq_acct *acct, struct io_cb_cancel_data *match) { struct io_wq_work_node *node, *prev; @@ -1051,12 +1052,12 @@ static bool io_acct_cancel_pending_work(struct io_wqe *wqe, } static void io_wqe_cancel_pending_work(struct io_wqe *wqe, - struct io_cb_cancel_data *match) + struct io_cb_cancel_data *match) { int i; retry: for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = io_get_acct(wqe, i == 0); + struct io_wq_acct *acct = io_get_acct(wqe->wq, i == 0); if (io_acct_cancel_pending_work(wqe, acct, match)) { if (match->cancel_all) @@ -1118,13 +1119,14 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct io_wqe *wqe = container_of(wait, struct io_wqe, wait); + struct io_wq *wq = wqe->wq; int i; list_del_init(&wait->entry); rcu_read_lock(); for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = &wqe->acct[i]; + struct io_wq_acct *acct = &wq->acct[i]; if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags)) io_wqe_activate_free_worker(wqe, acct); @@ -1162,13 +1164,13 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL)) goto err; cpumask_copy(wqe->cpu_mask, cpu_possible_mask); - wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; - wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = + wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; + wq->acct[IO_WQ_ACCT_UNBOUND].max_workers = task_rlimit(current, RLIMIT_NPROC); INIT_LIST_HEAD(&wqe->wait.entry); wqe->wait.func = io_wqe_hash_wake; for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = &wqe->acct[i]; + struct io_wq_acct *acct = &wq->acct[i]; acct->index = i; atomic_set(&acct->nr_running, 0); @@ -1335,7 +1337,7 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) int io_wq_max_workers(struct io_wq *wq, int *new_count) { struct io_wqe *wqe = &wq->wqe; - struct io_wqe_acct *acct; + struct io_wq_acct *acct; int prev[IO_WQ_ACCT_NR]; int i; @@ -1355,7 +1357,7 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) raw_spin_lock(&wqe->lock); for (i = 0; i < IO_WQ_ACCT_NR; i++) { - acct = &wqe->acct[i]; + acct = &wq->acct[i]; prev[i] = max_t(int, acct->max_workers, prev[i]); if (new_count[i]) acct->max_workers = new_count[i]; -- cgit From 
eb47943f2238bf3a002128d897f18abb143612d3 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 21 Mar 2023 22:16:28 -0300 Subject: io-wq: Drop struct io_wqe Since commit 0654b05e7e65 ("io_uring: One wqe per wq"), we have just a single io_wqe instance embedded per io_wq. Drop the extra structure in favor of accessing struct io_wq directly, cleaning up quite a bit of dereferences and backpointers. No functional changes intended. Tested with liburing's testsuite and mmtests performance microbenchmarks. I didn't observe any performance regressions. Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20230322011628.23359-2-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 344 +++++++++++++++++++++++++------------------------------ 1 file changed, 156 insertions(+), 188 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index da7c6e00b690..2b0b2e33cd71 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -40,7 +40,7 @@ enum { }; /* - * One for each thread in a wqe pool + * One for each thread in a wq pool */ struct io_worker { refcount_t ref; @@ -48,7 +48,7 @@ struct io_worker { struct hlist_nulls_node nulls_node; struct list_head all_list; struct task_struct *task; - struct io_wqe *wqe; + struct io_wq *wq; struct io_wq_work *cur_work; struct io_wq_work *next_work; @@ -90,23 +90,6 @@ enum { IO_WQ_ACCT_NR, }; -/* - * Per-node worker thread pool - */ -struct io_wqe { - raw_spinlock_t lock; - - struct hlist_nulls_head free_list; - struct list_head all_list; - - struct wait_queue_entry wait; - - struct io_wq *wq; - struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; - - cpumask_var_t cpu_mask; -}; - /* * Per io_wq state */ @@ -127,7 +110,17 @@ struct io_wq { struct io_wq_acct acct[IO_WQ_ACCT_NR]; - struct io_wqe wqe; + /* lock protects access to elements below */ + raw_spinlock_t lock; + + struct hlist_nulls_head free_list; + struct list_head all_list; + + struct wait_queue_entry wait; + + struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; + + cpumask_var_t cpu_mask; }; static enum cpuhp_state io_wq_online; @@ -140,9 +133,9 @@ struct io_cb_cancel_data { bool cancel_all; }; -static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index); -static void io_wqe_dec_running(struct io_worker *worker); -static bool io_acct_cancel_pending_work(struct io_wqe *wqe, +static bool create_io_worker(struct io_wq *wq, int index); +static void io_wq_dec_running(struct io_worker *worker); +static bool io_acct_cancel_pending_work(struct io_wq *wq, struct io_wq_acct *acct, struct io_cb_cancel_data *match); static void create_worker_cb(struct callback_head *cb); @@ -172,7 +165,7 @@ static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) { - return io_get_acct(worker->wqe->wq, worker->flags & IO_WORKER_F_BOUND); + return io_get_acct(worker->wq, worker->flags & IO_WORKER_F_BOUND); } static void io_worker_ref_put(struct io_wq *wq) @@ -184,13 +177,12 @@ static void io_worker_ref_put(struct io_wq *wq) static void io_worker_cancel_cb(struct io_worker *worker) { struct io_wq_acct *acct = io_wq_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq *wq = worker->wq; atomic_dec(&acct->nr_running); - raw_spin_lock(&worker->wqe->lock); + raw_spin_lock(&wq->lock); acct->nr_workers--; - raw_spin_unlock(&worker->wqe->lock); + raw_spin_unlock(&wq->lock); io_worker_ref_put(wq); clear_bit_unlock(0, 
&worker->create_state); io_worker_release(worker); @@ -208,8 +200,7 @@ static bool io_task_worker_match(struct callback_head *cb, void *data) static void io_worker_exit(struct io_worker *worker) { - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq *wq = worker->wq; while (1) { struct callback_head *cb = task_work_cancel_match(wq->task, @@ -223,19 +214,19 @@ static void io_worker_exit(struct io_worker *worker) io_worker_release(worker); wait_for_completion(&worker->ref_done); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); if (worker->flags & IO_WORKER_F_FREE) hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); - raw_spin_unlock(&wqe->lock); - io_wqe_dec_running(worker); + raw_spin_unlock(&wq->lock); + io_wq_dec_running(worker); worker->flags = 0; preempt_disable(); current->flags &= ~PF_IO_WORKER; preempt_enable(); kfree_rcu(worker, rcu); - io_worker_ref_put(wqe->wq); + io_worker_ref_put(wq); do_exit(0); } @@ -256,7 +247,7 @@ static inline bool io_acct_run_queue(struct io_wq_acct *acct) * Check head of free list for an available worker. If one isn't available, * caller must create one. */ -static bool io_wqe_activate_free_worker(struct io_wqe *wqe, +static bool io_wq_activate_free_worker(struct io_wq *wq, struct io_wq_acct *acct) __must_hold(RCU) { @@ -268,7 +259,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe, * activate. If a given worker is on the free_list but in the process * of exiting, keep trying. */ - hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) { + hlist_nulls_for_each_entry_rcu(worker, n, &wq->free_list, nulls_node) { if (!io_worker_get(worker)) continue; if (io_wq_get_acct(worker) != acct) { @@ -289,7 +280,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe, * We need a worker. If we find a free one, we're good. If not, and we're * below the max number of workers, create one. 
*/ -static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wq_acct *acct) +static bool io_wq_create_worker(struct io_wq *wq, struct io_wq_acct *acct) { /* * Most likely an attempt to queue unbounded work on an io_wq that @@ -298,19 +289,19 @@ static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wq_acct *acct) if (unlikely(!acct->max_workers)) pr_warn_once("io-wq is not configured for unbound workers"); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); if (acct->nr_workers >= acct->max_workers) { - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); return true; } acct->nr_workers++; - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); - return create_io_worker(wqe->wq, wqe, acct->index); + atomic_inc(&wq->worker_refs); + return create_io_worker(wq, acct->index); } -static void io_wqe_inc_running(struct io_worker *worker) +static void io_wq_inc_running(struct io_worker *worker) { struct io_wq_acct *acct = io_wq_get_acct(worker); @@ -321,22 +312,22 @@ static void create_worker_cb(struct callback_head *cb) { struct io_worker *worker; struct io_wq *wq; - struct io_wqe *wqe; + struct io_wq_acct *acct; bool do_create = false; worker = container_of(cb, struct io_worker, create_work); - wqe = worker->wqe; - wq = wqe->wq; + wq = worker->wq; acct = &wq->acct[worker->create_index]; - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); + if (acct->nr_workers < acct->max_workers) { acct->nr_workers++; do_create = true; } - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); if (do_create) { - create_io_worker(wq, wqe, worker->create_index); + create_io_worker(wq, worker->create_index); } else { atomic_dec(&acct->nr_running); io_worker_ref_put(wq); @@ -349,8 +340,7 @@ static bool io_queue_worker_create(struct io_worker *worker, struct io_wq_acct *acct, task_work_func_t func) { - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq *wq = worker->wq; /* raced with exit, just ignore create call */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) @@ -392,10 +382,10 @@ fail: return false; } -static void io_wqe_dec_running(struct io_worker *worker) +static void io_wq_dec_running(struct io_worker *worker) { struct io_wq_acct *acct = io_wq_get_acct(worker); - struct io_wqe *wqe = worker->wqe; + struct io_wq *wq = worker->wq; if (!(worker->flags & IO_WORKER_F_UP)) return; @@ -406,7 +396,7 @@ static void io_wqe_dec_running(struct io_worker *worker) return; atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); + atomic_inc(&wq->worker_refs); io_queue_worker_create(worker, acct, create_worker_cb); } @@ -414,13 +404,13 @@ static void io_wqe_dec_running(struct io_worker *worker) * Worker will start processing some work. Move it to the busy list, if * it's currently on the freelist */ -static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker) +static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) { if (worker->flags & IO_WORKER_F_FREE) { worker->flags &= ~IO_WORKER_F_FREE; - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); hlist_nulls_del_init_rcu(&worker->nulls_node); - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); } } @@ -431,12 +421,12 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker) * retry the loop in that case (we changed task state), we don't regrab * the lock if we return success. 
*/ -static void __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) - __must_hold(wqe->lock) +static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) + __must_hold(wq->lock) { if (!(worker->flags & IO_WORKER_F_FREE)) { worker->flags |= IO_WORKER_F_FREE; - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); + hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); } } @@ -445,17 +435,16 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work) return work->flags >> IO_WQ_HASH_SHIFT; } -static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) +static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) { - struct io_wq *wq = wqe->wq; bool ret = false; spin_lock_irq(&wq->hash->wait.lock); - if (list_empty(&wqe->wait.entry)) { - __add_wait_queue(&wq->hash->wait, &wqe->wait); + if (list_empty(&wq->wait.entry)) { + __add_wait_queue(&wq->hash->wait, &wq->wait); if (!test_bit(hash, &wq->hash->map)) { __set_current_state(TASK_RUNNING); - list_del_init(&wqe->wait.entry); + list_del_init(&wq->wait.entry); ret = true; } } @@ -470,7 +459,7 @@ static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct, struct io_wq_work_node *node, *prev; struct io_wq_work *work, *tail; unsigned int stall_hash = -1U; - struct io_wqe *wqe = worker->wqe; + struct io_wq *wq = worker->wq; wq_list_for_each(node, prev, &acct->work_list) { unsigned int hash; @@ -485,11 +474,11 @@ static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct, hash = io_get_work_hash(work); /* all items with this hash lie in [work, tail] */ - tail = wqe->hash_tail[hash]; + tail = wq->hash_tail[hash]; /* hashed, can run if not already running */ - if (!test_and_set_bit(hash, &wqe->wq->hash->map)) { - wqe->hash_tail[hash] = NULL; + if (!test_and_set_bit(hash, &wq->hash->map)) { + wq->hash_tail[hash] = NULL; wq_list_cut(&acct->work_list, &tail->list, prev); return work; } @@ -508,12 +497,12 @@ static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct, */ set_bit(IO_ACCT_STALLED_BIT, &acct->flags); raw_spin_unlock(&acct->lock); - unstalled = io_wait_on_hash(wqe, stall_hash); + unstalled = io_wait_on_hash(wq, stall_hash); raw_spin_lock(&acct->lock); if (unstalled) { clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); - if (wq_has_sleeper(&wqe->wq->hash->wait)) - wake_up(&wqe->wq->hash->wait); + if (wq_has_sleeper(&wq->hash->wait)) + wake_up(&wq->hash->wait); } } @@ -534,13 +523,10 @@ static void io_assign_current_work(struct io_worker *worker, raw_spin_unlock(&worker->lock); } -static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); - static void io_worker_handle_work(struct io_worker *worker) { struct io_wq_acct *acct = io_wq_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq *wq = worker->wq; bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); do { @@ -557,7 +543,7 @@ static void io_worker_handle_work(struct io_worker *worker) work = io_get_next_work(acct, worker); raw_spin_unlock(&acct->lock); if (work) { - __io_worker_busy(wqe, worker); + __io_worker_busy(wq, worker); /* * Make sure cancelation can find this, even before @@ -595,7 +581,7 @@ static void io_worker_handle_work(struct io_worker *worker) } io_assign_current_work(worker, work); if (linked) - io_wqe_enqueue(wqe, linked); + io_wq_enqueue(wq, linked); if (hash != -1U && !next_hashed) { /* serialize hash clear with wake_up() */ @@ -610,12 +596,11 @@ static void io_worker_handle_work(struct io_worker *worker) } while (1); } -static int 
io_wqe_worker(void *data) +static int io_wq_worker(void *data) { struct io_worker *worker = data; struct io_wq_acct *acct = io_wq_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq *wq = worker->wq; bool exit_mask = false, last_timeout = false; char buf[TASK_COMM_LEN]; @@ -631,20 +616,20 @@ static int io_wqe_worker(void *data) while (io_acct_run_queue(acct)) io_worker_handle_work(worker); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); /* * Last sleep timed out. Exit if we're not the last worker, * or if someone modified our affinity. */ if (last_timeout && (exit_mask || acct->nr_workers > 1)) { acct->nr_workers--; - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); __set_current_state(TASK_RUNNING); break; } last_timeout = false; - __io_worker_idle(wqe, worker); - raw_spin_unlock(&wqe->lock); + __io_worker_idle(wq, worker); + raw_spin_unlock(&wq->lock); if (io_run_task_work()) continue; ret = schedule_timeout(WORKER_IDLE_TIMEOUT); @@ -658,7 +643,7 @@ static int io_wqe_worker(void *data) if (!ret) { last_timeout = true; exit_mask = !cpumask_test_cpu(raw_smp_processor_id(), - wqe->cpu_mask); + wq->cpu_mask); } } @@ -683,7 +668,7 @@ void io_wq_worker_running(struct task_struct *tsk) if (worker->flags & IO_WORKER_F_RUNNING) return; worker->flags |= IO_WORKER_F_RUNNING; - io_wqe_inc_running(worker); + io_wq_inc_running(worker); } /* @@ -702,21 +687,21 @@ void io_wq_worker_sleeping(struct task_struct *tsk) return; worker->flags &= ~IO_WORKER_F_RUNNING; - io_wqe_dec_running(worker); + io_wq_dec_running(worker); } -static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker, +static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker, struct task_struct *tsk) { tsk->worker_private = worker; worker->task = tsk; - set_cpus_allowed_ptr(tsk, wqe->cpu_mask); + set_cpus_allowed_ptr(tsk, wq->cpu_mask); - raw_spin_lock(&wqe->lock); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); - list_add_tail_rcu(&worker->all_list, &wqe->all_list); + raw_spin_lock(&wq->lock); + hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); + list_add_tail_rcu(&worker->all_list, &wq->all_list); worker->flags |= IO_WORKER_F_FREE; - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); wake_up_new_task(tsk); } @@ -749,21 +734,21 @@ static void create_worker_cont(struct callback_head *cb) { struct io_worker *worker; struct task_struct *tsk; - struct io_wqe *wqe; + struct io_wq *wq; worker = container_of(cb, struct io_worker, create_work); clear_bit_unlock(0, &worker->create_state); - wqe = worker->wqe; - tsk = create_io_thread(io_wqe_worker, worker, NUMA_NO_NODE); + wq = worker->wq; + tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { - io_init_new_worker(wqe, worker, tsk); + io_init_new_worker(wq, worker, tsk); io_worker_release(worker); return; } else if (!io_should_retry_thread(PTR_ERR(tsk))) { struct io_wq_acct *acct = io_wq_get_acct(worker); atomic_dec(&acct->nr_running); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); acct->nr_workers--; if (!acct->nr_workers) { struct io_cb_cancel_data match = { @@ -771,13 +756,13 @@ static void create_worker_cont(struct callback_head *cb) .cancel_all = true, }; - raw_spin_unlock(&wqe->lock); - while (io_acct_cancel_pending_work(wqe, acct, &match)) + raw_spin_unlock(&wq->lock); + while (io_acct_cancel_pending_work(wq, acct, &match)) ; } else { - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); } - 
io_worker_ref_put(wqe->wq); + io_worker_ref_put(wq); kfree(worker); return; } @@ -796,7 +781,7 @@ static void io_workqueue_create(struct work_struct *work) kfree(worker); } -static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) +static bool create_io_worker(struct io_wq *wq, int index) { struct io_wq_acct *acct = &wq->acct[index]; struct io_worker *worker; @@ -808,24 +793,24 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) if (!worker) { fail: atomic_dec(&acct->nr_running); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); acct->nr_workers--; - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); io_worker_ref_put(wq); return false; } refcount_set(&worker->ref, 1); - worker->wqe = wqe; + worker->wq = wq; raw_spin_lock_init(&worker->lock); init_completion(&worker->ref_done); if (index == IO_WQ_ACCT_BOUND) worker->flags |= IO_WORKER_F_BOUND; - tsk = create_io_thread(io_wqe_worker, worker, NUMA_NO_NODE); + tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { - io_init_new_worker(wqe, worker, tsk); + io_init_new_worker(wq, worker, tsk); } else if (!io_should_retry_thread(PTR_ERR(tsk))) { kfree(worker); goto fail; @@ -841,14 +826,14 @@ fail: * Iterate the passed in list and call the specific function for each * worker that isn't exiting */ -static bool io_wq_for_each_worker(struct io_wqe *wqe, +static bool io_wq_for_each_worker(struct io_wq *wq, bool (*func)(struct io_worker *, void *), void *data) { struct io_worker *worker; bool ret = false; - list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { + list_for_each_entry_rcu(worker, &wq->all_list, all_list) { if (io_worker_get(worker)) { /* no task if node is/was offline */ if (worker->task) @@ -869,10 +854,8 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) return false; } -static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) +static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) { - struct io_wq *wq = wqe->wq; - do { work->flags |= IO_WQ_WORK_CANCEL; wq->do_work(work); @@ -880,9 +863,9 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) } while (work); } -static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work) +static void io_wq_insert_work(struct io_wq *wq, struct io_wq_work *work) { - struct io_wq_acct *acct = io_work_get_acct(wqe->wq, work); + struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int hash; struct io_wq_work *tail; @@ -893,8 +876,8 @@ append: } hash = io_get_work_hash(work); - tail = wqe->hash_tail[hash]; - wqe->hash_tail[hash] = work; + tail = wq->hash_tail[hash]; + wq->hash_tail[hash] = work; if (!tail) goto append; @@ -906,9 +889,9 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) return work == data; } -static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) +void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { - struct io_wq_acct *acct = io_work_get_acct(wqe->wq, work); + struct io_wq_acct *acct = io_work_get_acct(wq, work); struct io_cb_cancel_data match; unsigned work_flags = work->flags; bool do_create; @@ -917,55 +900,48 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) * If io-wq is exiting for this task, or if the request has explicitly * been marked as one that should not get executed, cancel it here. 
*/ - if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state) || + if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || (work->flags & IO_WQ_WORK_CANCEL)) { - io_run_cancel(work, wqe); + io_run_cancel(work, wq); return; } raw_spin_lock(&acct->lock); - io_wqe_insert_work(wqe, work); + io_wq_insert_work(wq, work); clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); raw_spin_unlock(&acct->lock); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); rcu_read_lock(); - do_create = !io_wqe_activate_free_worker(wqe, acct); + do_create = !io_wq_activate_free_worker(wq, acct); rcu_read_unlock(); - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) || !atomic_read(&acct->nr_running))) { bool did_create; - did_create = io_wqe_create_worker(wqe, acct); + did_create = io_wq_create_worker(wq, acct); if (likely(did_create)) return; - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); if (acct->nr_workers) { - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); return; } - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); /* fatal condition, failed to create the first worker */ match.fn = io_wq_work_match_item, match.data = work, match.cancel_all = false, - io_acct_cancel_pending_work(wqe, acct, &match); + io_acct_cancel_pending_work(wq, acct, &match); } } -void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) -{ - struct io_wqe *wqe = &wq->wqe; - - io_wqe_enqueue(wqe, work); -} - /* * Work items that hash to the same value will not be done in parallel. * Used to limit concurrent writes, generally hashed by inode. @@ -1008,26 +984,26 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) return match->nr_running && !match->cancel_all; } -static inline void io_wqe_remove_pending(struct io_wqe *wqe, +static inline void io_wq_remove_pending(struct io_wq *wq, struct io_wq_work *work, struct io_wq_work_node *prev) { - struct io_wq_acct *acct = io_work_get_acct(wqe->wq, work); + struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int hash = io_get_work_hash(work); struct io_wq_work *prev_work = NULL; - if (io_wq_is_hashed(work) && work == wqe->hash_tail[hash]) { + if (io_wq_is_hashed(work) && work == wq->hash_tail[hash]) { if (prev) prev_work = container_of(prev, struct io_wq_work, list); if (prev_work && io_get_work_hash(prev_work) == hash) - wqe->hash_tail[hash] = prev_work; + wq->hash_tail[hash] = prev_work; else - wqe->hash_tail[hash] = NULL; + wq->hash_tail[hash] = NULL; } wq_list_del(&acct->work_list, &work->list, prev); } -static bool io_acct_cancel_pending_work(struct io_wqe *wqe, +static bool io_acct_cancel_pending_work(struct io_wq *wq, struct io_wq_acct *acct, struct io_cb_cancel_data *match) { @@ -1039,9 +1015,9 @@ static bool io_acct_cancel_pending_work(struct io_wqe *wqe, work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; - io_wqe_remove_pending(wqe, work, prev); + io_wq_remove_pending(wq, work, prev); raw_spin_unlock(&acct->lock); - io_run_cancel(work, wqe); + io_run_cancel(work, wq); match->nr_pending++; /* not safe to continue after unlock */ return true; @@ -1051,15 +1027,15 @@ static bool io_acct_cancel_pending_work(struct io_wqe *wqe, return false; } -static void io_wqe_cancel_pending_work(struct io_wqe *wqe, +static void io_wq_cancel_pending_work(struct io_wq *wq, struct io_cb_cancel_data *match) { int i; retry: for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wq_acct *acct = io_get_acct(wqe->wq, i == 0); + struct io_wq_acct *acct = io_get_acct(wq, 
i == 0); - if (io_acct_cancel_pending_work(wqe, acct, match)) { + if (io_acct_cancel_pending_work(wq, acct, match)) { if (match->cancel_all) goto retry; break; @@ -1067,11 +1043,11 @@ retry: } } -static void io_wqe_cancel_running_work(struct io_wqe *wqe, +static void io_wq_cancel_running_work(struct io_wq *wq, struct io_cb_cancel_data *match) { rcu_read_lock(); - io_wq_for_each_worker(wqe, io_wq_worker_cancel, match); + io_wq_for_each_worker(wq, io_wq_worker_cancel, match); rcu_read_unlock(); } @@ -1083,7 +1059,6 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, .data = data, .cancel_all = cancel_all, }; - struct io_wqe *wqe = &wq->wqe; /* * First check pending list, if we're lucky we can just remove it @@ -1095,16 +1070,16 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, * as an indication that we attempt to signal cancellation. The * completion will run normally in this case. * - * Do both of these while holding the wqe->lock, to ensure that + * Do both of these while holding the wq->lock, to ensure that * we'll find a work item regardless of state. */ - io_wqe_cancel_pending_work(wqe, &match); + io_wq_cancel_pending_work(wq, &match); if (match.nr_pending && !match.cancel_all) return IO_WQ_CANCEL_OK; - raw_spin_lock(&wqe->lock); - io_wqe_cancel_running_work(wqe, &match); - raw_spin_unlock(&wqe->lock); + raw_spin_lock(&wq->lock); + io_wq_cancel_running_work(wq, &match); + raw_spin_unlock(&wq->lock); if (match.nr_running && !match.cancel_all) return IO_WQ_CANCEL_RUNNING; @@ -1115,11 +1090,10 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, return IO_WQ_CANCEL_NOTFOUND; } -static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, +static int io_wq_hash_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { - struct io_wqe *wqe = container_of(wait, struct io_wqe, wait); - struct io_wq *wq = wqe->wq; + struct io_wq *wq = container_of(wait, struct io_wq, wait); int i; list_del_init(&wait->entry); @@ -1129,7 +1103,7 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, struct io_wq_acct *acct = &wq->acct[i]; if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags)) - io_wqe_activate_free_worker(wqe, acct); + io_wq_activate_free_worker(wq, acct); } rcu_read_unlock(); return 1; @@ -1139,7 +1113,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { int ret, i; struct io_wq *wq; - struct io_wqe *wqe; if (WARN_ON_ONCE(!data->free_work || !data->do_work)) return ERR_PTR(-EINVAL); @@ -1157,18 +1130,17 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) wq->hash = data->hash; wq->free_work = data->free_work; wq->do_work = data->do_work; - wqe = &wq->wqe; ret = -ENOMEM; - if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL)) + if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL)) goto err; - cpumask_copy(wqe->cpu_mask, cpu_possible_mask); + cpumask_copy(wq->cpu_mask, cpu_possible_mask); wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; wq->acct[IO_WQ_ACCT_UNBOUND].max_workers = task_rlimit(current, RLIMIT_NPROC); - INIT_LIST_HEAD(&wqe->wait.entry); - wqe->wait.func = io_wqe_hash_wake; + INIT_LIST_HEAD(&wq->wait.entry); + wq->wait.func = io_wq_hash_wake; for (i = 0; i < IO_WQ_ACCT_NR; i++) { struct io_wq_acct *acct = &wq->acct[i]; @@ -1177,10 +1149,10 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) INIT_WQ_LIST(&acct->work_list); raw_spin_lock_init(&acct->lock); } - wqe->wq = wq; - 
raw_spin_lock_init(&wqe->lock); - INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); - INIT_LIST_HEAD(&wqe->all_list); + + raw_spin_lock_init(&wq->lock); + INIT_HLIST_NULLS_HEAD(&wq->free_list, 0); + INIT_LIST_HEAD(&wq->all_list); wq->task = get_task_struct(data->task); atomic_set(&wq->worker_refs, 1); @@ -1190,7 +1162,7 @@ err: io_wq_put_hash(data->hash); cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - free_cpumask_var(wq->wqe.cpu_mask); + free_cpumask_var(wq->cpu_mask); err_wq: kfree(wq); return ERR_PTR(ret); @@ -1203,7 +1175,7 @@ static bool io_task_work_match(struct callback_head *cb, void *data) if (cb->func != create_worker_cb && cb->func != create_worker_cont) return false; worker = container_of(cb, struct io_worker, create_work); - return worker->wqe->wq == data; + return worker->wq == data; } void io_wq_exit_start(struct io_wq *wq) @@ -1237,13 +1209,13 @@ static void io_wq_exit_workers(struct io_wq *wq) io_wq_cancel_tw_create(wq); rcu_read_lock(); - io_wq_for_each_worker(&wq->wqe, io_wq_worker_wake, NULL); + io_wq_for_each_worker(wq, io_wq_worker_wake, NULL); rcu_read_unlock(); io_worker_ref_put(wq); wait_for_completion(&wq->worker_done); spin_lock_irq(&wq->hash->wait.lock); - list_del_init(&wq->wqe.wait.entry); + list_del_init(&wq->wait.entry); spin_unlock_irq(&wq->hash->wait.lock); put_task_struct(wq->task); @@ -1256,11 +1228,10 @@ static void io_wq_destroy(struct io_wq *wq) .fn = io_wq_work_match_all, .cancel_all = true, }; - struct io_wqe *wqe = &wq->wqe; cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - io_wqe_cancel_pending_work(wqe, &match); - free_cpumask_var(wqe->cpu_mask); + io_wq_cancel_pending_work(wq, &match); + free_cpumask_var(wq->cpu_mask); io_wq_put_hash(wq->hash); kfree(wq); } @@ -1283,9 +1254,9 @@ static bool io_wq_worker_affinity(struct io_worker *worker, void *data) struct online_data *od = data; if (od->online) - cpumask_set_cpu(od->cpu, worker->wqe->cpu_mask); + cpumask_set_cpu(od->cpu, worker->wq->cpu_mask); else - cpumask_clear_cpu(od->cpu, worker->wqe->cpu_mask); + cpumask_clear_cpu(od->cpu, worker->wq->cpu_mask); return false; } @@ -1297,7 +1268,7 @@ static int __io_wq_cpu_online(struct io_wq *wq, unsigned int cpu, bool online) }; rcu_read_lock(); - io_wq_for_each_worker(&wq->wqe, io_wq_worker_affinity, &od); + io_wq_for_each_worker(wq, io_wq_worker_affinity, &od); rcu_read_unlock(); return 0; } @@ -1318,13 +1289,11 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node) int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) { - struct io_wqe *wqe = &wq->wqe; - rcu_read_lock(); if (mask) - cpumask_copy(wqe->cpu_mask, mask); + cpumask_copy(wq->cpu_mask, mask); else - cpumask_copy(wqe->cpu_mask, cpu_possible_mask); + cpumask_copy(wq->cpu_mask, cpu_possible_mask); rcu_read_unlock(); return 0; @@ -1336,7 +1305,6 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) */ int io_wq_max_workers(struct io_wq *wq, int *new_count) { - struct io_wqe *wqe = &wq->wqe; struct io_wq_acct *acct; int prev[IO_WQ_ACCT_NR]; int i; @@ -1355,14 +1323,14 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) rcu_read_lock(); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); for (i = 0; i < IO_WQ_ACCT_NR; i++) { acct = &wq->acct[i]; prev[i] = max_t(int, acct->max_workers, prev[i]); if (new_count[i]) acct->max_workers = new_count[i]; } - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); rcu_read_unlock(); for (i = 0; i < IO_WQ_ACCT_NR; i++) -- cgit From 07d99096e1635805fb7c60382dc12554886a39b8 
Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 27 Mar 2023 13:10:21 -0600 Subject: io_uring/io-wq: drop outdated comment Since the move to PF_IO_WORKER, we don't juggle memory context manually anymore. Remove that outdated part of the comment for __io_worker_idle(). Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 2b0b2e33cd71..b2715988791e 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -415,11 +415,7 @@ static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) } /* - * No work, worker going to sleep. Move to freelist, and unuse mm if we - * have one attached. Dropping the mm may potentially sleep, so we drop - * the lock in that case and return success. Since the caller has to - * retry the loop in that case (we changed task state), we don't regrab - * the lock if we return success. + * No work, worker going to sleep. Move to freelist. */ static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) __must_hold(wq->lock) -- cgit From 13bfa6f15d0b39254937076ab0557da6875bb455 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 27 Mar 2023 16:38:14 +0100 Subject: io_uring: remove extra tw trylocks Before cond_resched()'ing in handle_tw_list() we also drop the current ring context, and so the next loop iteration will need to pick/pin a new context and do trylock. The chunk removed by this patch was intended to be an optimisation covering exactly this case, i.e. retaking the lock after reschedule, but in reality it's skipped for the first iteration after resched as described and will keep hammering the lock if it's contended. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1ecec9483d58696e248d1bfd52cf62b04442df1d.1679931367.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 24be4992821b..2669aca0ba39 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1186,8 +1186,7 @@ static unsigned int handle_tw_list(struct llist_node *node, /* if not contended, grab and improve batching */ *locked = mutex_trylock(&(*ctx)->uring_lock); percpu_ref_get(&(*ctx)->refs); - } else if (!*locked) - *locked = mutex_trylock(&(*ctx)->uring_lock); + } req->io_task_work.func(req, locked); node = next; count++; -- cgit From a282967c848fb1d92c28334430c472da9c334e54 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 27 Mar 2023 16:38:15 +0100 Subject: io_uring: encapsulate task_work state For task works we're passing around a bool pointer for whether the current ring is locked or not, let's wrap it in a structure, that will make it more opaque preventing abuse and will also help us to pass more info in the future if needed. 
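A note on the new type: struct io_tw_state itself lives in include/linux/io_uring_types.h and is not part of the hunks below. Judging from how it is used here, it is presumably little more than a named wrapper around the old bool, roughly:

struct io_tw_state {
	/* set when ->uring_lock is held for this task_work run */
	bool locked;
};

Wrapping the flag in a struct keeps the locked state opaque to the task_work callbacks and leaves room to carry extra per-run state later without changing every callback signature again.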
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1ecec9483d58696e248d1bfd52cf62b04442df1d.1679931367.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 71 ++++++++++++++++++++++++++-------------------------- io_uring/io_uring.h | 14 +++++------ io_uring/notif.c | 4 +-- io_uring/poll.c | 32 +++++++++++------------ io_uring/rw.c | 6 ++--- io_uring/timeout.c | 14 +++++------ io_uring/uring_cmd.c | 4 +-- 7 files changed, 73 insertions(+), 72 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2669aca0ba39..536940675c67 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -247,12 +247,12 @@ static __cold void io_fallback_req_func(struct work_struct *work) fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; - bool locked = true; + struct io_tw_state ts = { .locked = true, }; mutex_lock(&ctx->uring_lock); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) - req->io_task_work.func(req, &locked); - if (WARN_ON_ONCE(!locked)) + req->io_task_work.func(req, &ts); + if (WARN_ON_ONCE(!ts.locked)) return; io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); @@ -457,7 +457,7 @@ static void io_prep_async_link(struct io_kiocb *req) } } -void io_queue_iowq(struct io_kiocb *req, bool *dont_use) +void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use) { struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; @@ -1153,22 +1153,23 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) return nxt; } -static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) +static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) { if (!ctx) return; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (*locked) { + if (ts->locked) { io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); - *locked = false; + ts->locked = false; } percpu_ref_put(&ctx->refs); } static unsigned int handle_tw_list(struct llist_node *node, - struct io_ring_ctx **ctx, bool *locked, + struct io_ring_ctx **ctx, + struct io_tw_state *ts, struct llist_node *last) { unsigned int count = 0; @@ -1181,17 +1182,17 @@ static unsigned int handle_tw_list(struct llist_node *node, prefetch(container_of(next, struct io_kiocb, io_task_work.node)); if (req->ctx != *ctx) { - ctx_flush_and_put(*ctx, locked); + ctx_flush_and_put(*ctx, ts); *ctx = req->ctx; /* if not contended, grab and improve batching */ - *locked = mutex_trylock(&(*ctx)->uring_lock); + ts->locked = mutex_trylock(&(*ctx)->uring_lock); percpu_ref_get(&(*ctx)->refs); } - req->io_task_work.func(req, locked); + req->io_task_work.func(req, ts); node = next; count++; if (unlikely(need_resched())) { - ctx_flush_and_put(*ctx, locked); + ctx_flush_and_put(*ctx, ts); *ctx = NULL; cond_resched(); } @@ -1232,7 +1233,7 @@ static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head, void tctx_task_work(struct callback_head *cb) { - bool uring_locked = false; + struct io_tw_state ts = {}; struct io_ring_ctx *ctx = NULL; struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); @@ -1249,12 +1250,12 @@ void tctx_task_work(struct callback_head *cb) do { loops++; node = io_llist_xchg(&tctx->task_list, &fake); - count += handle_tw_list(node, &ctx, &uring_locked, &fake); + count += handle_tw_list(node, &ctx, &ts, &fake); /* skip 
expensive cmpxchg if there are items in the list */ if (READ_ONCE(tctx->task_list.first) != &fake) continue; - if (uring_locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) { + if (ts.locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) { io_submit_flush_completions(ctx); if (READ_ONCE(tctx->task_list.first) != &fake) continue; @@ -1262,7 +1263,7 @@ void tctx_task_work(struct callback_head *cb) node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); } while (node != &fake); - ctx_flush_and_put(ctx, &uring_locked); + ctx_flush_and_put(ctx, &ts); /* relaxed read is enough as only the task itself sets ->in_cancel */ if (unlikely(atomic_read(&tctx->in_cancel))) @@ -1351,7 +1352,7 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) } } -static int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked) +static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts) { struct llist_node *node; unsigned int loops = 0; @@ -1368,7 +1369,7 @@ again: struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - req->io_task_work.func(req, locked); + req->io_task_work.func(req, ts); ret++; node = next; } @@ -1376,7 +1377,7 @@ again: if (!llist_empty(&ctx->work_llist)) goto again; - if (*locked) { + if (ts->locked) { io_submit_flush_completions(ctx); if (!llist_empty(&ctx->work_llist)) goto again; @@ -1387,46 +1388,46 @@ again: static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) { - bool locked; + struct io_tw_state ts = { .locked = true, }; int ret; if (llist_empty(&ctx->work_llist)) return 0; - locked = true; - ret = __io_run_local_work(ctx, &locked); + ret = __io_run_local_work(ctx, &ts); /* shouldn't happen! 
*/ - if (WARN_ON_ONCE(!locked)) + if (WARN_ON_ONCE(!ts.locked)) mutex_lock(&ctx->uring_lock); return ret; } static int io_run_local_work(struct io_ring_ctx *ctx) { - bool locked = mutex_trylock(&ctx->uring_lock); + struct io_tw_state ts = {}; int ret; - ret = __io_run_local_work(ctx, &locked); - if (locked) + ts.locked = mutex_trylock(&ctx->uring_lock); + ret = __io_run_local_work(ctx, &ts); + if (ts.locked) mutex_unlock(&ctx->uring_lock); return ret; } -static void io_req_task_cancel(struct io_kiocb *req, bool *locked) +static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts) { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); io_req_defer_failed(req, req->cqe.res); } -void io_req_task_submit(struct io_kiocb *req, bool *locked) +void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); /* req->task == current here, checking PF_EXITING is safe */ if (unlikely(req->task->flags & PF_EXITING)) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) - io_queue_iowq(req, locked); + io_queue_iowq(req, ts); else io_queue_sqe(req); } @@ -1652,9 +1653,9 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) return ret; } -void io_req_task_complete(struct io_kiocb *req, bool *locked) +void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) { - if (*locked) + if (ts->locked) io_req_complete_defer(req); else io_req_complete_post(req, IO_URING_F_UNLOCKED); @@ -1933,9 +1934,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return 0; } -int io_poll_issue(struct io_kiocb *req, bool *locked) +int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| IO_URING_F_COMPLETE_DEFER); } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 2711865f1e19..c33f719731ac 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -52,16 +52,16 @@ void __io_req_task_work_add(struct io_kiocb *req, bool allow_local); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); -void io_queue_iowq(struct io_kiocb *req, bool *dont_use); -void io_req_task_complete(struct io_kiocb *req, bool *locked); +void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use); +void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); void io_req_task_queue_fail(struct io_kiocb *req, int ret); -void io_req_task_submit(struct io_kiocb *req, bool *locked); +void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); void tctx_task_work(struct callback_head *cb); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); -int io_poll_issue(struct io_kiocb *req, bool *locked); +int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node); @@ -299,11 +299,11 @@ static inline bool io_task_work_pending(struct io_ring_ctx *ctx) return task_work_pending(current) || !wq_list_empty(&ctx->work_llist); } -static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) +static inline void io_tw_lock(struct 
io_ring_ctx *ctx, struct io_tw_state *ts) { - if (!*locked) { + if (!ts->locked) { mutex_lock(&ctx->uring_lock); - *locked = true; + ts->locked = true; } } diff --git a/io_uring/notif.c b/io_uring/notif.c index 09dfd0832d19..172105eb347d 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -9,7 +9,7 @@ #include "notif.h" #include "rsrc.h" -static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked) +static void io_notif_complete_tw_ext(struct io_kiocb *notif, struct io_tw_state *ts) { struct io_notif_data *nd = io_notif_to_data(notif); struct io_ring_ctx *ctx = notif->ctx; @@ -21,7 +21,7 @@ static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked) __io_unaccount_mem(ctx->user, nd->account_pages); nd->account_pages = 0; } - io_req_task_complete(notif, locked); + io_req_task_complete(notif, ts); } static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, diff --git a/io_uring/poll.c b/io_uring/poll.c index 55306e801081..c90e47dc1e29 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -148,7 +148,7 @@ static void io_poll_req_insert_locked(struct io_kiocb *req) hlist_add_head(&req->hash_node, &table->hbs[index].list); } -static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked) +static void io_poll_tw_hash_eject(struct io_kiocb *req, struct io_tw_state *ts) { struct io_ring_ctx *ctx = req->ctx; @@ -159,7 +159,7 @@ static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked) * already grabbed the mutex for us, but there is a chance it * failed. */ - io_tw_lock(ctx, locked); + io_tw_lock(ctx, ts); hash_del(&req->hash_node); req->flags &= ~REQ_F_HASH_LOCKED; } else { @@ -238,7 +238,7 @@ enum { * req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot * poll and that the result is stored in req->cqe. 
*/ -static int io_poll_check_events(struct io_kiocb *req, bool *locked) +static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) { int v; @@ -300,13 +300,13 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_aux_cqe(req->ctx, *locked, req->cqe.user_data, + if (!io_aux_cqe(req->ctx, ts->locked, req->cqe.user_data, mask, IORING_CQE_F_MORE, false)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } } else { - int ret = io_poll_issue(req, locked); + int ret = io_poll_issue(req, ts); if (ret == IOU_STOP_MULTISHOT) return IOU_POLL_REMOVE_POLL_USE_RES; if (ret < 0) @@ -326,15 +326,15 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) return IOU_POLL_NO_ACTION; } -static void io_poll_task_func(struct io_kiocb *req, bool *locked) +static void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) { int ret; - ret = io_poll_check_events(req, locked); + ret = io_poll_check_events(req, ts); if (ret == IOU_POLL_NO_ACTION) return; io_poll_remove_entries(req); - io_poll_tw_hash_eject(req, locked); + io_poll_tw_hash_eject(req, ts); if (req->opcode == IORING_OP_POLL_ADD) { if (ret == IOU_POLL_DONE) { @@ -343,7 +343,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) poll = io_kiocb_to_cmd(req, struct io_poll); req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else if (ret == IOU_POLL_REISSUE) { - io_req_task_submit(req, locked); + io_req_task_submit(req, ts); return; } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { req->cqe.res = ret; @@ -351,14 +351,14 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) } io_req_set_res(req, req->cqe.res, 0); - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); } else { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE) - io_req_task_submit(req, locked); + io_req_task_submit(req, ts); else io_req_defer_failed(req, ret); } @@ -977,7 +977,7 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_hash_bucket *bucket; struct io_kiocb *preq; int ret2, ret = 0; - bool locked; + struct io_tw_state ts = {}; preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); ret2 = io_poll_disarm(preq); @@ -1027,8 +1027,8 @@ found: req_set_fail(preq); io_req_set_res(preq, -ECANCELED, 0); - locked = !(issue_flags & IO_URING_F_UNLOCKED); - io_req_task_complete(preq, &locked); + ts.locked = !(issue_flags & IO_URING_F_UNLOCKED); + io_req_task_complete(preq, &ts); out: if (ret < 0) { req_set_fail(req); diff --git a/io_uring/rw.c b/io_uring/rw.c index 4c233910e200..f14868624f41 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -283,16 +283,16 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res) return res; } -static void io_req_rw_complete(struct io_kiocb *req, bool *locked) +static void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) { io_req_io_end(req); if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + unsigned issue_flags = ts->locked ? 
0 : IO_URING_F_UNLOCKED; req->cqe.flags |= io_put_kbuf(req, issue_flags); } - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); } static void io_complete_rw(struct kiocb *kiocb, long res) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 826a51bca3e4..5c6c6f720809 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -101,9 +101,9 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->timeout_lock); } -static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked) +static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) { - io_tw_lock(link->ctx, locked); + io_tw_lock(link->ctx, ts); while (link) { struct io_kiocb *nxt = link->link; long res = -ECANCELED; @@ -112,7 +112,7 @@ static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked) res = link->cqe.res; link->link = NULL; io_req_set_res(link, res, 0); - io_req_task_complete(link, locked); + io_req_task_complete(link, ts); link = nxt; } } @@ -265,9 +265,9 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) return 0; } -static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) +static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts) { - unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; int ret = -ENOENT; @@ -282,11 +282,11 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) ret = io_try_cancel(req->task->io_uring, &cd, issue_flags); } io_req_set_res(req, ret ?: -ETIME, 0); - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); io_put_req(prev); } else { io_req_set_res(req, -ETIME, 0); - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); } } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 9a1dee571872..3d825d939b13 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -12,10 +12,10 @@ #include "rsrc.h" #include "uring_cmd.h" -static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) +static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; ioucmd->task_work_cb(ioucmd, issue_flags); } -- cgit From 2ad57931db641f3de627023afb8147a8ec0b41dc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 30 Mar 2023 10:03:41 -0600 Subject: io_uring: rename trace_io_uring_submit_sqe() tracepoint It has nothing to do with the SQE at this point, it's a request submission. While in there, get rid of the 'force_nonblock' argument which is also dead, as we only pass in true. 
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 536940675c67..775b53730c2f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2305,8 +2305,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(ret)) return io_submit_fail_init(sqe, req, ret); - /* don't need @sqe from now on */ - trace_io_uring_submit_sqe(req, true); + trace_io_uring_submit_req(req); /* * If we already have a head request, queue this one for async -- cgit From e3ef728ff07b42668e7e12f49cd2f9055e064ec1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 30 Mar 2023 10:05:31 -0600 Subject: io_uring: cap io_sqring_entries() at SQ ring size We already do this manually for the !SQPOLL case, do it in general and we can also dump the ugly min3() in io_submit_sqes(). Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/io_uring.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 775b53730c2f..a0b64831c455 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2434,7 +2434,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) if (unlikely(!entries)) return 0; /* make sure SQ entry isn't read before tail */ - ret = left = min3(nr, ctx->sq_entries, entries); + ret = left = min(nr, entries); io_get_task_refs(left); io_submit_state_start(&ctx->submit_state, left); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index c33f719731ac..193b2db39fe8 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -262,9 +262,11 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx) static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; + unsigned int entries; /* make sure SQ entry isn't read before tail */ - return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; + entries = smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; + return min(entries, ctx->sq_entries); } static inline int io_run_task_work(void) -- cgit From b8fb5b4fdd67f9d18109c5d21d44a8bd4ddb608b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:45 +0100 Subject: io_uring/rsrc: use non-pcpu refcounts for nodes One problem with the current rsrc infra is that updates often generate lots of rsrc nodes, each carrying pcpu refs. That takes quite a lot of memory, especially if there is a stall, and takes lots of CPU cycles. The pcpu allocations alone take >50% of the CPU time with a naive benchmark updating files in a loop. Replace pcpu refs with normal refcounting. There is already a hot path avoiding atomics / refs, but following patches will further improve it. 
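To make the per-node cost concrete, the fragments below simply juxtapose removed and added lines from the diffs that follow: before, every node paid for a per-CPU counter allocation plus an explicit kill/exit teardown; afterwards the count is just a field in the node.

	/* before: one per-CPU counter array allocated per node */
	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
			    0, GFP_KERNEL)) {
		kfree(ref_node);
		return NULL;
	}
	...
	percpu_ref_kill(&rsrc_node->refs);	/* retire the current node */
	percpu_ref_exit(&ref_node->refs);	/* free the per-CPU counters */

	/* after: plain refcount, release runs when it drops to zero */
	refcount_set(&ref_node->refs, 1);
	...
	if (refcount_sub_and_test(nr, &node->refs))
		io_rsrc_node_ref_zero(node);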
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e9ed8a9457b331a26555ff9443afc64cdaab7247.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 15 +++++---------- io_uring/rsrc.h | 6 ++++-- 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 7a43aed8e395..f2da9e251e3f 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -155,7 +155,7 @@ void io_rsrc_refs_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; - percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); + refcount_add(IO_RSRC_REF_BATCH, &ctx->rsrc_node->refs); } static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) @@ -220,13 +220,11 @@ void io_wait_rsrc_data(struct io_rsrc_data *data) void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) { - percpu_ref_exit(&ref_node->refs); kfree(ref_node); } -static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) +__cold void io_rsrc_node_ref_zero(struct io_rsrc_node *node) { - struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); struct io_ring_ctx *ctx = node->rsrc_data->ctx; unsigned long flags; bool first_add = false; @@ -269,11 +267,7 @@ static struct io_rsrc_node *io_rsrc_node_alloc(void) if (!ref_node) return NULL; - if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, - 0, GFP_KERNEL)) { - kfree(ref_node); - return NULL; - } + refcount_set(&ref_node->refs, 1); INIT_LIST_HEAD(&ref_node->node); INIT_LIST_HEAD(&ref_node->rsrc_list); ref_node->done = false; @@ -298,7 +292,8 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, spin_unlock_irq(&ctx->rsrc_ref_lock); atomic_inc(&data_to_kill->refs); - percpu_ref_kill(&rsrc_node->refs); + /* put master ref */ + io_rsrc_put_node(rsrc_node, 1); ctx->rsrc_node = NULL; } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index f27f4975217d..1467b31843bc 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -37,7 +37,7 @@ struct io_rsrc_data { }; struct io_rsrc_node { - struct percpu_ref refs; + refcount_t refs; struct list_head node; struct list_head rsrc_list; struct io_rsrc_data *rsrc_data; @@ -54,6 +54,7 @@ struct io_mapped_ubuf { }; void io_rsrc_put_tw(struct callback_head *cb); +void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_put_work(struct work_struct *work); void io_rsrc_refs_refill(struct io_ring_ctx *ctx); void io_wait_rsrc_data(struct io_rsrc_data *data); @@ -109,7 +110,8 @@ int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, static inline void io_rsrc_put_node(struct io_rsrc_node *node, int nr) { - percpu_ref_put_many(&node->refs, nr); + if (refcount_sub_and_test(nr, &node->refs)) + io_rsrc_node_ref_zero(node); } static inline void io_req_put_rsrc(struct io_kiocb *req) -- cgit From 8e15c0e71b8ae64fb7163532860f8d608165281f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:46 +0100 Subject: io_uring/rsrc: keep cached refs per node We cache refs of the current node (i.e. ctx->rsrc_node) in ctx->rsrc_cached_refs. We'll be moving away from atomics, so move the cached refs in struct io_rsrc_node for now. It's a prep patch and shouldn't change anything in practise. 
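For scale: the cache is refilled in batches of IO_RSRC_REF_BATCH (defined as 100 in rsrc.c, visible where a later patch in this series deletes it), so the atomic refcount is touched roughly once per hundred charged requests rather than once per request. This patch only moves that cache from the ctx into the node itself.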
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9edc3669c1d71b06c2dca78b2b2b8bb9292738b9.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 15 +++++++++------ io_uring/rsrc.h | 16 +++++++++------- 2 files changed, 18 insertions(+), 13 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f2da9e251e3f..1e7c960737fd 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -36,9 +36,11 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, void io_rsrc_refs_drop(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - if (ctx->rsrc_cached_refs) { - io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); - ctx->rsrc_cached_refs = 0; + struct io_rsrc_node *node = ctx->rsrc_node; + + if (node && node->cached_refs) { + io_rsrc_put_node(node, node->cached_refs); + node->cached_refs = 0; } } @@ -151,11 +153,11 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo *slot = NULL; } -void io_rsrc_refs_refill(struct io_ring_ctx *ctx) +void io_rsrc_refs_refill(struct io_ring_ctx *ctx, struct io_rsrc_node *node) __must_hold(&ctx->uring_lock) { - ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; - refcount_add(IO_RSRC_REF_BATCH, &ctx->rsrc_node->refs); + node->cached_refs += IO_RSRC_REF_BATCH; + refcount_add(IO_RSRC_REF_BATCH, &node->refs); } static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) @@ -300,6 +302,7 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, if (!ctx->rsrc_node) { ctx->rsrc_node = ctx->rsrc_backup_node; ctx->rsrc_backup_node = NULL; + ctx->rsrc_node->cached_refs = 0; } } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 1467b31843bc..950535e2b9f4 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -43,6 +43,7 @@ struct io_rsrc_node { struct io_rsrc_data *rsrc_data; struct llist_node llist; bool done; + int cached_refs; }; struct io_mapped_ubuf { @@ -56,7 +57,7 @@ struct io_mapped_ubuf { void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_put_work(struct work_struct *work); -void io_rsrc_refs_refill(struct io_ring_ctx *ctx); +void io_rsrc_refs_refill(struct io_ring_ctx *ctx, struct io_rsrc_node *node); void io_wait_rsrc_data(struct io_rsrc_data *data); void io_rsrc_node_destroy(struct io_rsrc_node *ref_node); void io_rsrc_refs_drop(struct io_ring_ctx *ctx); @@ -128,17 +129,18 @@ static inline void io_req_put_rsrc_locked(struct io_kiocb *req, if (node) { if (node == ctx->rsrc_node) - ctx->rsrc_cached_refs++; + node->cached_refs++; else io_rsrc_put_node(node, 1); } } -static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx) +static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx, + struct io_rsrc_node *node) { - ctx->rsrc_cached_refs--; - if (unlikely(ctx->rsrc_cached_refs < 0)) - io_rsrc_refs_refill(ctx); + node->cached_refs--; + if (unlikely(node->cached_refs < 0)) + io_rsrc_refs_refill(ctx, node); } static inline void io_req_set_rsrc_node(struct io_kiocb *req, @@ -151,7 +153,7 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, lockdep_assert_held(&ctx->uring_lock); req->rsrc_node = ctx->rsrc_node; - io_charge_rsrc_node(ctx); + io_charge_rsrc_node(ctx, ctx->rsrc_node); io_ring_submit_unlock(ctx, issue_flags); } } -- cgit From 2ad4c6d08018e4eec130c29992028dc356ab2181 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:47 +0100 Subject: io_uring: don't put nodes under spinlocks io_req_put_rsrc() doesn't need any locking, so move 
it out of a spinlock section in __io_req_complete_post() and adjust helpers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d5b87a5f31270dade6805f7acafc4cc34b84b241.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 7 +++++-- io_uring/rsrc.h | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a0b64831c455..596af20cddb4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -970,6 +970,7 @@ bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 static void __io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + struct io_rsrc_node *rsrc_node = NULL; io_cq_lock(ctx); if (!(req->flags & REQ_F_CQE_SKIP)) @@ -990,7 +991,7 @@ static void __io_req_complete_post(struct io_kiocb *req) } io_put_kbuf_comp(req); io_dismantle_req(req); - io_req_put_rsrc(req); + rsrc_node = req->rsrc_node; /* * Selected buffer deallocation in io_clean_op() assumes that * we don't hold ->completion_lock. Clean them here to avoid @@ -1001,6 +1002,8 @@ static void __io_req_complete_post(struct io_kiocb *req) ctx->locked_free_nr++; } io_cq_unlock_post(ctx); + + io_put_rsrc_node(rsrc_node); } void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) @@ -1117,7 +1120,7 @@ __cold void io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - io_req_put_rsrc(req); + io_put_rsrc_node(req->rsrc_node); io_dismantle_req(req); io_put_task_remote(req->task, 1); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 950535e2b9f4..8164777279ba 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -115,10 +115,10 @@ static inline void io_rsrc_put_node(struct io_rsrc_node *node, int nr) io_rsrc_node_ref_zero(node); } -static inline void io_req_put_rsrc(struct io_kiocb *req) +static inline void io_put_rsrc_node(struct io_rsrc_node *node) { - if (req->rsrc_node) - io_rsrc_put_node(req->rsrc_node, 1); + if (node) + io_rsrc_put_node(node, 1); } static inline void io_req_put_rsrc_locked(struct io_kiocb *req, -- cgit From 03adabe81abb20221079b48343783b4327bd1186 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:48 +0100 Subject: io_uring: io_free_req() via tw io_free_req() is not often used but nevertheless problematic as there is no way to know the current context, it may be used from the submission path or even by an irq handler. Push it to a fresh context using task_work. 
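For background, here is a rough sketch of the generic task_work primitive that io_uring's io_req_task_work_add() path builds on; the function names below are illustrative, not from the patch. The point is that the callback runs later in the task's own process context, where it is safe to sleep and take ->uring_lock, which an irq handler cannot guarantee.

#include <linux/task_work.h>

/* runs later in the target task's context; may sleep, may take mutexes */
static void deferred_free_fn(struct callback_head *cb)
{
	/* e.g. container_of(cb, ...) back to the object and free it */
}

static int queue_deferred_free(struct task_struct *task,
			       struct callback_head *cb)
{
	init_task_work(cb, deferred_free_fn);
	/*
	 * Fails only if the task is exiting; io_uring handles that case by
	 * punting to a fallback workqueue (see io_fallback_req_func earlier
	 * in this log).
	 */
	return task_work_add(task, cb, TWA_SIGNAL);
}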
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3a92fe80bb068757e51aaa0b105cfbe8f5dfee9e.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 596af20cddb4..98320f4b0bca 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1116,7 +1116,7 @@ static inline void io_dismantle_req(struct io_kiocb *req) io_put_file(req->file); } -__cold void io_free_req(struct io_kiocb *req) +static __cold void io_free_req_tw(struct io_kiocb *req, struct io_tw_state *ts) { struct io_ring_ctx *ctx = req->ctx; @@ -1130,6 +1130,12 @@ __cold void io_free_req(struct io_kiocb *req) spin_unlock(&ctx->completion_lock); } +__cold void io_free_req(struct io_kiocb *req) +{ + req->io_task_work.func = io_free_req_tw; + io_req_task_work_add(req); +} + static void __io_req_find_next_prep(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; -- cgit From ef8ae64ffa9578c12e44de42604004c2cc3e9c27 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:49 +0100 Subject: io_uring/rsrc: protect node refs with uring_lock Currently, for nodes we have an atomic counter and some cached (non-atomic) refs protected by uring_lock. Let's put all ref manipulations under uring_lock and get rid of the atomic part. It's free as in all cases we care about we already hold the lock. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/25b142feed7d831008257d90c8b17c0115d4fc15.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 18 ++++++++++++------ io_uring/rsrc.c | 30 ++++-------------------------- io_uring/rsrc.h | 29 +++++------------------------ 3 files changed, 21 insertions(+), 56 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 98320f4b0bca..36a76c7b34f0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -967,7 +967,7 @@ bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 return true; } -static void __io_req_complete_post(struct io_kiocb *req) +static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_rsrc_node *rsrc_node = NULL; @@ -1003,7 +1003,11 @@ static void __io_req_complete_post(struct io_kiocb *req) } io_cq_unlock_post(ctx); - io_put_rsrc_node(rsrc_node); + if (rsrc_node) { + io_ring_submit_lock(ctx, issue_flags); + io_put_rsrc_node(rsrc_node); + io_ring_submit_unlock(ctx, issue_flags); + } } void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) @@ -1013,12 +1017,12 @@ void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) io_req_task_work_add(req); } else if (!(issue_flags & IO_URING_F_UNLOCKED) || !(req->ctx->flags & IORING_SETUP_IOPOLL)) { - __io_req_complete_post(req); + __io_req_complete_post(req, issue_flags); } else { struct io_ring_ctx *ctx = req->ctx; mutex_lock(&ctx->uring_lock); - __io_req_complete_post(req); + __io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED); mutex_unlock(&ctx->uring_lock); } } @@ -1120,7 +1124,10 @@ static __cold void io_free_req_tw(struct io_kiocb *req, struct io_tw_state *ts) { struct io_ring_ctx *ctx = req->ctx; - io_put_rsrc_node(req->rsrc_node); + if (req->rsrc_node) { + io_tw_lock(ctx, ts); + io_put_rsrc_node(req->rsrc_node); + } io_dismantle_req(req); io_put_task_remote(req->task, 1); @@ -2790,7 +2797,6 @@ static void 
io_req_caches_free(struct io_ring_ctx *ctx) static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); - io_rsrc_refs_drop(ctx); /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ io_wait_rsrc_data(ctx->buf_data); io_wait_rsrc_data(ctx->file_data); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 1e7c960737fd..1237fc77c250 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -27,23 +27,10 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage); -#define IO_RSRC_REF_BATCH 100 - /* only define max */ #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) -void io_rsrc_refs_drop(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - struct io_rsrc_node *node = ctx->rsrc_node; - - if (node && node->cached_refs) { - io_rsrc_put_node(node, node->cached_refs); - node->cached_refs = 0; - } -} - int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -153,13 +140,6 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo *slot = NULL; } -void io_rsrc_refs_refill(struct io_ring_ctx *ctx, struct io_rsrc_node *node) - __must_hold(&ctx->uring_lock) -{ - node->cached_refs += IO_RSRC_REF_BATCH; - refcount_add(IO_RSRC_REF_BATCH, &node->refs); -} - static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) { struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; @@ -225,7 +205,8 @@ void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) kfree(ref_node); } -__cold void io_rsrc_node_ref_zero(struct io_rsrc_node *node) +void io_rsrc_node_ref_zero(struct io_rsrc_node *node) + __must_hold(&node->rsrc_data->ctx->uring_lock) { struct io_ring_ctx *ctx = node->rsrc_data->ctx; unsigned long flags; @@ -269,7 +250,7 @@ static struct io_rsrc_node *io_rsrc_node_alloc(void) if (!ref_node) return NULL; - refcount_set(&ref_node->refs, 1); + ref_node->refs = 1; INIT_LIST_HEAD(&ref_node->node); INIT_LIST_HEAD(&ref_node->rsrc_list); ref_node->done = false; @@ -283,8 +264,6 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, WARN_ON_ONCE(!ctx->rsrc_backup_node); WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); - io_rsrc_refs_drop(ctx); - if (data_to_kill) { struct io_rsrc_node *rsrc_node = ctx->rsrc_node; @@ -295,14 +274,13 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, atomic_inc(&data_to_kill->refs); /* put master ref */ - io_rsrc_put_node(rsrc_node, 1); + io_put_rsrc_node(rsrc_node); ctx->rsrc_node = NULL; } if (!ctx->rsrc_node) { ctx->rsrc_node = ctx->rsrc_backup_node; ctx->rsrc_backup_node = NULL; - ctx->rsrc_node->cached_refs = 0; } } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 8164777279ba..a96103095f0f 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -37,13 +37,12 @@ struct io_rsrc_data { }; struct io_rsrc_node { - refcount_t refs; struct list_head node; struct list_head rsrc_list; struct io_rsrc_data *rsrc_data; struct llist_node llist; + int refs; bool done; - int cached_refs; }; struct io_mapped_ubuf { @@ -57,10 +56,8 @@ struct io_mapped_ubuf { void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_put_work(struct work_struct *work); -void io_rsrc_refs_refill(struct io_ring_ctx *ctx, struct io_rsrc_node *node); void io_wait_rsrc_data(struct io_rsrc_data *data); void io_rsrc_node_destroy(struct io_rsrc_node *ref_node); -void io_rsrc_refs_drop(struct io_ring_ctx *ctx); int 
io_rsrc_node_switch_start(struct io_ring_ctx *ctx); int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc); @@ -109,38 +106,22 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); -static inline void io_rsrc_put_node(struct io_rsrc_node *node, int nr) -{ - if (refcount_sub_and_test(nr, &node->refs)) - io_rsrc_node_ref_zero(node); -} - static inline void io_put_rsrc_node(struct io_rsrc_node *node) { - if (node) - io_rsrc_put_node(node, 1); + if (node && !--node->refs) + io_rsrc_node_ref_zero(node); } static inline void io_req_put_rsrc_locked(struct io_kiocb *req, struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) { - struct io_rsrc_node *node = req->rsrc_node; - - if (node) { - if (node == ctx->rsrc_node) - node->cached_refs++; - else - io_rsrc_put_node(node, 1); - } + io_put_rsrc_node(req->rsrc_node); } static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - node->cached_refs--; - if (unlikely(node->cached_refs < 0)) - io_rsrc_refs_refill(ctx, node); + node->refs++; } static inline void io_req_set_rsrc_node(struct io_kiocb *req, -- cgit From 0a4813b1abdf06e44ce60cdebfd374cfd27c46bf Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:50 +0100 Subject: io_uring/rsrc: kill rsrc_ref_lock We use ->rsrc_ref_lock spinlock to protect ->rsrc_ref_list in io_rsrc_node_ref_zero(). Now we removed pcpu refcounting, which means io_rsrc_node_ref_zero() is not executed from the irq context as an RCU callback anymore, and we also put it under ->uring_lock. io_rsrc_node_switch(), which queues up nodes into the list, is also protected by ->uring_lock, so we can safely get rid of ->rsrc_ref_lock. 
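With the spinlock gone, the rule is simply that all node ref manipulation happens under ->uring_lock. The previous patch already encodes that for sparse via the __must_hold() annotation on io_rsrc_node_ref_zero(); the runtime counterpart, were one to add it, would be a lockdep assertion at the top of that function (illustrative only, not part of the patch):

	lockdep_assert_held(&node->rsrc_data->ctx->uring_lock);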
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6b60af883c263551190b526a55ff2c9d5ae07141.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 1 - io_uring/rsrc.c | 5 ----- 2 files changed, 6 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 36a76c7b34f0..764df5694d73 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -325,7 +325,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); - spin_lock_init(&ctx->rsrc_ref_lock); INIT_LIST_HEAD(&ctx->rsrc_ref_list); INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); init_task_work(&ctx->rsrc_put_tw, io_rsrc_put_tw); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 1237fc77c250..e122b6e5f9c5 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -209,11 +209,9 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) __must_hold(&node->rsrc_data->ctx->uring_lock) { struct io_ring_ctx *ctx = node->rsrc_data->ctx; - unsigned long flags; bool first_add = false; unsigned long delay = HZ; - spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); node->done = true; /* if we are mid-quiesce then do not delay */ @@ -229,7 +227,6 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) list_del(&node->node); first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); } - spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); if (!first_add) return; @@ -268,9 +265,7 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_node *rsrc_node = ctx->rsrc_node; rsrc_node->rsrc_data = data_to_kill; - spin_lock_irq(&ctx->rsrc_ref_lock); list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); - spin_unlock_irq(&ctx->rsrc_ref_lock); atomic_inc(&data_to_kill->refs); /* put master ref */ -- cgit From c824986c113f15e2ef2c00da9a226c09ecaac74c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:51 +0100 Subject: io_uring/rsrc: rename rsrc_list We have too many "rsrc" around which makes the name of struct io_rsrc_node::rsrc_list confusing. The field is responsible for keeping a list of files or buffers, so call it item_list and add comments around. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3e34d4dfc1fdbb6b520f904ee6187c2ccf680efe.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 6 +++--- io_uring/rsrc.h | 8 +++++++- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index e122b6e5f9c5..10006fb169d2 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -146,7 +146,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) struct io_ring_ctx *ctx = rsrc_data->ctx; struct io_rsrc_put *prsrc, *tmp; - list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { + list_for_each_entry_safe(prsrc, tmp, &ref_node->item_list, list) { list_del(&prsrc->list); if (prsrc->tag) { @@ -249,7 +249,7 @@ static struct io_rsrc_node *io_rsrc_node_alloc(void) ref_node->refs = 1; INIT_LIST_HEAD(&ref_node->node); - INIT_LIST_HEAD(&ref_node->rsrc_list); + INIT_LIST_HEAD(&ref_node->item_list); ref_node->done = false; return ref_node; } @@ -737,7 +737,7 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, prsrc->tag = *tag_slot; *tag_slot = 0; prsrc->rsrc = rsrc; - list_add(&prsrc->list, &node->rsrc_list); + list_add(&prsrc->list, &node->item_list); return 0; } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index a96103095f0f..509a5ea7eabf 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -38,11 +38,17 @@ struct io_rsrc_data { struct io_rsrc_node { struct list_head node; - struct list_head rsrc_list; struct io_rsrc_data *rsrc_data; struct llist_node llist; int refs; bool done; + + /* + * Keeps a list of struct io_rsrc_put to be completed. Each entry + * represents one rsrc (e.g. file or buffer), but all of them should've + * came from the same table and so are of the same type. + */ + struct list_head item_list; }; struct io_mapped_ubuf { -- cgit From ff7c75ecaa9e6b251f76c24e289d4bfe413ffe31 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:52 +0100 Subject: io_uring/rsrc: optimise io_rsrc_put allocation Every io_rsrc_node keeps a list of items to put, and all entries are kmalloc()'ed. However, it's quite often to queue up only one entry per node, so let's add an inline entry there to avoid extra allocations. 
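The shape of the optimisation is generic: keep storage for the common single entry inside the owning object and only heap-allocate when a second entry shows up. A simplified, self-contained C sketch of that pattern (invented names, not the actual io_uring structures):

#include <errno.h>
#include <stdlib.h>

struct put_item {
        void *rsrc;
        struct put_item *next;
};

struct node {
        struct put_item inline_item;    /* covers the common one-entry case */
        int inline_used;
        struct put_item *overflow;      /* each extra entry is malloc()ed */
};

static int node_add_item(struct node *node, void *rsrc)
{
        struct put_item *item;

        if (!node->inline_used) {
                item = &node->inline_item;
                node->inline_used = 1;
        } else {
                item = malloc(sizeof(*item));
                if (!item)
                        return -ENOMEM;
                item->next = node->overflow;
                node->overflow = item;
        }
        item->rsrc = rsrc;
        return 0;
}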
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c482c1c652c45c85ac52e67c974bc758a50fed5f.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 51 ++++++++++++++++++++++++++++++++++----------------- io_uring/rsrc.h | 2 ++ 2 files changed, 36 insertions(+), 17 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 10006fb169d2..95e71300bb35 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -140,26 +140,34 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo *slot = NULL; } +static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data, + struct io_rsrc_put *prsrc) +{ + struct io_ring_ctx *ctx = rsrc_data->ctx; + + if (prsrc->tag) { + if (ctx->flags & IORING_SETUP_IOPOLL) { + mutex_lock(&ctx->uring_lock); + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); + mutex_unlock(&ctx->uring_lock); + } else { + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); + } + } + rsrc_data->do_put(ctx, prsrc); +} + static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) { struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; - struct io_ring_ctx *ctx = rsrc_data->ctx; struct io_rsrc_put *prsrc, *tmp; + if (ref_node->inline_items) + io_rsrc_put_work_one(rsrc_data, &ref_node->item); + list_for_each_entry_safe(prsrc, tmp, &ref_node->item_list, list) { list_del(&prsrc->list); - - if (prsrc->tag) { - if (ctx->flags & IORING_SETUP_IOPOLL) { - mutex_lock(&ctx->uring_lock); - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); - mutex_unlock(&ctx->uring_lock); - } else { - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); - } - } - - rsrc_data->do_put(ctx, prsrc); + io_rsrc_put_work_one(rsrc_data, prsrc); kfree(prsrc); } @@ -251,6 +259,7 @@ static struct io_rsrc_node *io_rsrc_node_alloc(void) INIT_LIST_HEAD(&ref_node->node); INIT_LIST_HEAD(&ref_node->item_list); ref_node->done = false; + ref_node->inline_items = 0; return ref_node; } @@ -729,15 +738,23 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, { u64 *tag_slot = io_get_tag_slot(data, idx); struct io_rsrc_put *prsrc; + bool inline_item = true; - prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); - if (!prsrc) - return -ENOMEM; + if (!node->inline_items) { + prsrc = &node->item; + node->inline_items++; + } else { + prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); + if (!prsrc) + return -ENOMEM; + inline_item = false; + } prsrc->tag = *tag_slot; *tag_slot = 0; prsrc->rsrc = rsrc; - list_add(&prsrc->list, &node->item_list); + if (!inline_item) + list_add(&prsrc->list, &node->item_list); return 0; } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 509a5ea7eabf..11703082d125 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -49,6 +49,8 @@ struct io_rsrc_node { * came from the same table and so are of the same type. */ struct list_head item_list; + struct io_rsrc_put item; + int inline_items; }; struct io_mapped_ubuf { -- cgit From 36b9818a5a84cb7c977fb723babca1c8d74f288f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:53 +0100 Subject: io_uring/rsrc: don't offload node free struct delayed_work rsrc_put_work was previously used to offload node freeing because io_rsrc_node_ref_zero() was previously called by RCU in the IRQ context. Now, as percpu refcounting is gone, we can do it eagerly at the spot without pushing it to a worker. 
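Conceptually the change is from "push the dead node onto an llist and let deferred work free it" to "reclaim it right where the reference count hits zero, since we now run in process context under the lock". A toy version of the new shape, with hypothetical names and an ordinary singly linked list standing in for the kernel structures:

#include <stdlib.h>

struct node {
        struct node *next;
        int done;
};

/* Called with the outer mutex held; 'node' is already linked somewhere on
 * *list. Nodes must be recycled in queue order, so only free from the head
 * while completed nodes sit at the front. */
static void node_ref_zero(struct node **list, struct node *node)
{
        node->done = 1;

        while (*list && (*list)->done) {
                struct node *head = *list;

                *list = head->next;
                free(head);     /* eager: no llist_add() + worker round trip */
        }
}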
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/13fb1aac1e8d068ad8fd4a0c6d0d157ab61b90c0.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 ------ io_uring/rsrc.c | 59 ++++------------------------------------------------- 2 files changed, 4 insertions(+), 61 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 764df5694d73..d6a0025afc31 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -326,9 +326,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); INIT_LIST_HEAD(&ctx->rsrc_ref_list); - INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); - init_task_work(&ctx->rsrc_put_tw, io_rsrc_put_tw); - init_llist_head(&ctx->rsrc_put_llist); init_llist_head(&ctx->work_llist); INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; @@ -2821,11 +2818,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_rsrc_node_destroy(ctx->rsrc_node); if (ctx->rsrc_backup_node) io_rsrc_node_destroy(ctx->rsrc_backup_node); - flush_delayed_work(&ctx->rsrc_put_work); - flush_delayed_work(&ctx->fallback_work); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); - WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist)); #if defined(CONFIG_UNIX) if (ctx->ring_sock) { diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 95e71300bb35..0f4e245dee1b 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -145,15 +145,8 @@ static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data, { struct io_ring_ctx *ctx = rsrc_data->ctx; - if (prsrc->tag) { - if (ctx->flags & IORING_SETUP_IOPOLL) { - mutex_lock(&ctx->uring_lock); - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); - mutex_unlock(&ctx->uring_lock); - } else { - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); - } - } + if (prsrc->tag) + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); rsrc_data->do_put(ctx, prsrc); } @@ -176,32 +169,6 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) complete(&rsrc_data->done); } -void io_rsrc_put_work(struct work_struct *work) -{ - struct io_ring_ctx *ctx; - struct llist_node *node; - - ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work); - node = llist_del_all(&ctx->rsrc_put_llist); - - while (node) { - struct io_rsrc_node *ref_node; - struct llist_node *next = node->next; - - ref_node = llist_entry(node, struct io_rsrc_node, llist); - __io_rsrc_put_work(ref_node); - node = next; - } -} - -void io_rsrc_put_tw(struct callback_head *cb) -{ - struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx, - rsrc_put_tw); - - io_rsrc_put_work(&ctx->rsrc_put_work.work); -} - void io_wait_rsrc_data(struct io_rsrc_data *data) { if (data && !atomic_dec_and_test(&data->refs)) @@ -217,34 +184,18 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) __must_hold(&node->rsrc_data->ctx->uring_lock) { struct io_ring_ctx *ctx = node->rsrc_data->ctx; - bool first_add = false; - unsigned long delay = HZ; node->done = true; - - /* if we are mid-quiesce then do not delay */ - if (node->rsrc_data->quiesce) - delay = 0; - while (!list_empty(&ctx->rsrc_ref_list)) { node = list_first_entry(&ctx->rsrc_ref_list, struct io_rsrc_node, node); /* recycle ref nodes in order */ if (!node->done) break; - list_del(&node->node); - first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); - } - if (!first_add) - return; - - if (ctx->submitter_task) { - if (!task_work_add(ctx->submitter_task, &ctx->rsrc_put_tw, - 
ctx->notify_method)) - return; + list_del(&node->node); + __io_rsrc_put_work(node); } - mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); } static struct io_rsrc_node *io_rsrc_node_alloc(void) @@ -320,13 +271,11 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, if (ret < 0) { atomic_inc(&data->refs); /* wait for all works potentially completing data->done */ - flush_delayed_work(&ctx->rsrc_put_work); reinit_completion(&data->done); mutex_lock(&ctx->uring_lock); break; } - flush_delayed_work(&ctx->rsrc_put_work); ret = wait_for_completion_interruptible(&data->done); if (!ret) { mutex_lock(&ctx->uring_lock); -- cgit From 9eae8655f9cd2eeed99fb7a0d2bb22816c17e497 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:54 +0100 Subject: io_uring/rsrc: cache struct io_rsrc_node Add allocation cache for struct io_rsrc_node, it's always allocated and put under ->uring_lock, so it doesn't need any extra synchronisation around caches. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/252a9d9ef9654e6467af30fdc02f57c0118fb76e.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 +++++++++-- io_uring/rsrc.c | 23 +++++++++++++++-------- io_uring/rsrc.h | 9 +++++++-- 3 files changed, 31 insertions(+), 12 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d6a0025afc31..419d6f42935f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -310,6 +310,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); + io_alloc_cache_init(&ctx->rsrc_node_cache, sizeof(struct io_rsrc_node)); io_alloc_cache_init(&ctx->apoll_cache, sizeof(struct async_poll)); io_alloc_cache_init(&ctx->netmsg_cache, sizeof(struct io_async_msghdr)); init_completion(&ctx->ref_comp); @@ -2790,6 +2791,11 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } +static void io_rsrc_node_cache_free(struct io_cache_entry *entry) +{ + kfree(container_of(entry, struct io_rsrc_node, cache)); +} + static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); @@ -2815,9 +2821,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) /* there are no registered resources left, nobody uses it */ if (ctx->rsrc_node) - io_rsrc_node_destroy(ctx->rsrc_node); + io_rsrc_node_destroy(ctx, ctx->rsrc_node); if (ctx->rsrc_backup_node) - io_rsrc_node_destroy(ctx->rsrc_backup_node); + io_rsrc_node_destroy(ctx, ctx->rsrc_backup_node); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); @@ -2829,6 +2835,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) #endif WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); + io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free); if (ctx->mm_account) { mmdrop(ctx->mm_account); ctx->mm_account = NULL; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 0f4e245dee1b..345631091d80 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -164,7 +164,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) kfree(prsrc); } - io_rsrc_node_destroy(ref_node); + io_rsrc_node_destroy(rsrc_data->ctx, ref_node); if (atomic_dec_and_test(&rsrc_data->refs)) complete(&rsrc_data->done); } @@ -175,9 +175,10 @@ void io_wait_rsrc_data(struct io_rsrc_data *data) wait_for_completion(&data->done); } -void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) +void 
io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - kfree(ref_node); + if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache)) + kfree(node); } void io_rsrc_node_ref_zero(struct io_rsrc_node *node) @@ -198,13 +199,19 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) } } -static struct io_rsrc_node *io_rsrc_node_alloc(void) +static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) { struct io_rsrc_node *ref_node; + struct io_cache_entry *entry; - ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); - if (!ref_node) - return NULL; + entry = io_alloc_cache_get(&ctx->rsrc_node_cache); + if (entry) { + ref_node = container_of(entry, struct io_rsrc_node, cache); + } else { + ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); + if (!ref_node) + return NULL; + } ref_node->refs = 1; INIT_LIST_HEAD(&ref_node->node); @@ -243,7 +250,7 @@ int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) { if (ctx->rsrc_backup_node) return 0; - ctx->rsrc_backup_node = io_rsrc_node_alloc(); + ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx); return ctx->rsrc_backup_node ? 0 : -ENOMEM; } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 11703082d125..3b9f4c57c47c 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -4,6 +4,8 @@ #include +#include "alloc_cache.h" + #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) #define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) #define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) @@ -37,8 +39,11 @@ struct io_rsrc_data { }; struct io_rsrc_node { + union { + struct io_cache_entry cache; + struct io_rsrc_data *rsrc_data; + }; struct list_head node; - struct io_rsrc_data *rsrc_data; struct llist_node llist; int refs; bool done; @@ -65,7 +70,7 @@ void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_put_work(struct work_struct *work); void io_wait_rsrc_data(struct io_rsrc_data *data); -void io_rsrc_node_destroy(struct io_rsrc_node *ref_node); +void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc); -- cgit From 1f2c8f610aa6c6a3dc3523f93eaf28c25051df6f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:55 +0100 Subject: io_uring/rsrc: add lockdep sanity checks We should hold ->uring_lock while putting nodes with io_put_rsrc_node(), add a lockdep check for that. 
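Outside the kernel the same kind of guard can be approximated with an owner-tracking mutex wrapper. The following is only a debug-aid sketch with made-up names, not lockdep, and its unlocked reads are fine for a best-effort assertion but not race-free; it just shows the intent of asserting "the caller holds the lock" inside a put helper:

#include <assert.h>
#include <pthread.h>

struct dbg_mutex {
        pthread_mutex_t mutex;
        pthread_t owner;
        int held;               /* written only by the owner, debug use only */
};

static void dbg_lock(struct dbg_mutex *lock)
{
        pthread_mutex_lock(&lock->mutex);
        lock->owner = pthread_self();
        lock->held = 1;
}

static void dbg_unlock(struct dbg_mutex *lock)
{
        lock->held = 0;
        pthread_mutex_unlock(&lock->mutex);
}

static void dbg_assert_held(struct dbg_mutex *lock)
{
        assert(lock->held && pthread_equal(lock->owner, pthread_self()));
}

/* The put path then documents and enforces its locking rule. */
static void put_node(struct dbg_mutex *uring_lock, int *refs)
{
        dbg_assert_held(uring_lock);
        (*refs)--;
}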
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b50d5f156ac41450029796738c1dfd22a521df7a.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- io_uring/rsrc.c | 2 +- io_uring/rsrc.h | 6 ++++-- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'io_uring')
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 419d6f42935f..da36fa1eeac9 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1002,7 +1002,7 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) if (rsrc_node) { io_ring_submit_lock(ctx, issue_flags); - io_put_rsrc_node(rsrc_node); + io_put_rsrc_node(ctx, rsrc_node); io_ring_submit_unlock(ctx, issue_flags); } } @@ -1123,7 +1123,7 @@ static __cold void io_free_req_tw(struct io_kiocb *req, struct io_tw_state *ts) if (req->rsrc_node) { io_tw_lock(ctx, ts); - io_put_rsrc_node(req->rsrc_node); + io_put_rsrc_node(ctx, req->rsrc_node); } io_dismantle_req(req); io_put_task_remote(req->task, 1);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 345631091d80..d4bca5e18434 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -236,7 +236,7 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, atomic_inc(&data_to_kill->refs); /* put master ref */ - io_put_rsrc_node(rsrc_node); + io_put_rsrc_node(ctx, rsrc_node); ctx->rsrc_node = NULL; }
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 3b9f4c57c47c..cf24c3fd701f 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -119,8 +119,10 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); -static inline void io_put_rsrc_node(struct io_rsrc_node *node) +static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { + lockdep_assert_held(&ctx->uring_lock); + if (node && !--node->refs) io_rsrc_node_ref_zero(node); } @@ -128,7 +130,7 @@ static inline void io_put_rsrc_node(struct io_rsrc_node *node) static inline void io_req_put_rsrc_locked(struct io_kiocb *req, struct io_ring_ctx *ctx) { - io_put_rsrc_node(req->rsrc_node); + io_put_rsrc_node(ctx, req->rsrc_node); } static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx,
-- cgit From 757ef4682b6aa29fdf752ad47f0d63eb48b261cf Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:56 +0100 Subject: io_uring/rsrc: optimise io_rsrc_data refcounting Every struct io_rsrc_node takes a struct io_rsrc_data reference, which means all rsrc updates do 2 extra atomics. Replace the atomic refcounting with a plain int, as it's all done under ->uring_lock.
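When every get and put already happens under one mutex, an atomic counter buys nothing. A minimal sketch of the resulting pattern (simplified C, invented names, not the io_uring types):

#include <pthread.h>
#include <stdbool.h>

struct data {
        pthread_mutex_t *uring_lock;    /* all refcount ops run under this */
        int refs;                       /* plain int, no atomics needed */
};

/* Both helpers must be called with *uring_lock held. */
static void data_get(struct data *data)
{
        data->refs++;
}

static bool data_put(struct data *data)
{
        return --data->refs == 0;       /* true: caller may complete/free */
}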
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e73c3d6820cf679532696d790b5b8fae23537213.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 30 ++++++++++++++++++------------ io_uring/rsrc.h | 2 +- 2 files changed, 19 insertions(+), 13 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d4bca5e18434..603a783a0383 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -31,6 +31,11 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) +static inline bool io_put_rsrc_data_ref(struct io_rsrc_data *rsrc_data) +{ + return !--rsrc_data->refs; +} + int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -165,13 +170,13 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) } io_rsrc_node_destroy(rsrc_data->ctx, ref_node); - if (atomic_dec_and_test(&rsrc_data->refs)) + if (io_put_rsrc_data_ref(rsrc_data)) complete(&rsrc_data->done); } void io_wait_rsrc_data(struct io_rsrc_data *data) { - if (data && !atomic_dec_and_test(&data->refs)) + if (data && !io_put_rsrc_data_ref(data)) wait_for_completion(&data->done); } @@ -234,7 +239,7 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, rsrc_node->rsrc_data = data_to_kill; list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); - atomic_inc(&data_to_kill->refs); + data_to_kill->refs++; /* put master ref */ io_put_rsrc_node(ctx, rsrc_node); ctx->rsrc_node = NULL; @@ -267,8 +272,8 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, return ret; io_rsrc_node_switch(ctx, data); - /* kill initial ref, already quiesced if zero */ - if (atomic_dec_and_test(&data->refs)) + /* kill initial ref */ + if (io_put_rsrc_data_ref(data)) return 0; data->quiesce = true; @@ -276,17 +281,19 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, do { ret = io_run_task_work_sig(ctx); if (ret < 0) { - atomic_inc(&data->refs); - /* wait for all works potentially completing data->done */ - reinit_completion(&data->done); mutex_lock(&ctx->uring_lock); + if (!data->refs) { + ret = 0; + } else { + /* restore the master reference */ + data->refs++; + } break; } - ret = wait_for_completion_interruptible(&data->done); if (!ret) { mutex_lock(&ctx->uring_lock); - if (atomic_read(&data->refs) <= 0) + if (!data->refs) break; /* * it has been revived by another thread while @@ -361,6 +368,7 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, data->nr = nr; data->ctx = ctx; data->do_put = do_put; + data->refs = 1; if (utags) { ret = -EFAULT; for (i = 0; i < nr; i++) { @@ -371,8 +379,6 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, goto fail; } } - - atomic_set(&data->refs, 1); init_completion(&data->done); *pdata = data; return 0; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index cf24c3fd701f..7ab9b2b2e757 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -33,8 +33,8 @@ struct io_rsrc_data { u64 **tags; unsigned int nr; rsrc_put_fn *do_put; - atomic_t refs; struct completion done; + int refs; bool quiesce; }; -- cgit From 69bbc6ade9d9d4e3c556cb83e77b6f3cd9ad3d18 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:57 +0100 Subject: io_uring/rsrc: add custom limit for node caching The number of entries in the rsrc node cache is limited to 512, which still seems unnecessarily large. 
Add per-cache thresholds and set it to 32 for the rsrc node cache. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d0cd538b944dac0bf878e276fc0199f21e6bccea.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/alloc_cache.h | 6 ++++-- io_uring/io_uring.c | 9 ++++++--- io_uring/rsrc.h | 2 ++ 3 files changed, 12 insertions(+), 5 deletions(-) (limited to 'io_uring')
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 2fbecaa3a1ba..851a527afb5e 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -13,7 +13,7 @@ struct io_cache_entry { static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, struct io_cache_entry *entry) { - if (cache->nr_cached < IO_ALLOC_CACHE_MAX) { + if (cache->nr_cached < cache->max_cached) { cache->nr_cached++; wq_stack_add_head(&entry->node, &cache->list); /* KASAN poisons object */ @@ -38,10 +38,12 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c return NULL; } -static inline void io_alloc_cache_init(struct io_alloc_cache *cache, size_t size) +static inline void io_alloc_cache_init(struct io_alloc_cache *cache, + unsigned max_nr, size_t size) { cache->list.next = NULL; cache->nr_cached = 0; + cache->max_cached = max_nr; cache->elem_size = size; }
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index da36fa1eeac9..ae90d2753e0d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -310,9 +310,12 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); - io_alloc_cache_init(&ctx->rsrc_node_cache, sizeof(struct io_rsrc_node)); - io_alloc_cache_init(&ctx->apoll_cache, sizeof(struct async_poll)); - io_alloc_cache_init(&ctx->netmsg_cache, sizeof(struct io_async_msghdr)); + io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, + sizeof(struct io_rsrc_node)); + io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct async_poll)); + io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_async_msghdr)); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock);
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 7ab9b2b2e757..8729f2fee256 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -6,6 +6,8 @@ #include "alloc_cache.h" +#define IO_NODE_ALLOC_CACHE_MAX 32 + #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) #define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) #define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
-- cgit From 758d5d64b619ddbbf96a5605d8d5a919aafaafab Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 5 Apr 2023 08:21:45 -0600 Subject: io_uring/uring_cmd: assign ioucmd->cmd at async prep time Rather than check this in the issue fast path, it makes more sense to just assign the copy of the data when we're setting it up anyway. This makes the code a bit cleaner, and removes the need for this check in the issue path.
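The pattern generalises: copy submission state into request-owned storage once, at prep time, and repoint the pointer at the stable copy, so the issue path never has to branch on whether async data exists. A hedged sketch with made-up names, not the uring_cmd code:

#include <errno.h>
#include <stdlib.h>
#include <string.h>

struct cmd {
        const void *payload;    /* initially points at transient SQE memory */
        void *async_data;       /* request-owned copy, valid for the cmd lifetime */
};

static int cmd_prep_async(struct cmd *cmd, size_t size)
{
        cmd->async_data = malloc(size);
        if (!cmd->async_data)
                return -ENOMEM;
        memcpy(cmd->async_data, cmd->payload, size);
        cmd->payload = cmd->async_data; /* issue path just uses ->payload */
        return 0;
}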
Reviewed-by: Gabriel Krisman Bertazi Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 3d825d939b13..f7a96bc76ea1 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -73,6 +73,7 @@ int io_uring_cmd_prep_async(struct io_kiocb *req) cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); memcpy(req->async_data, ioucmd->cmd, cmd_size); + ioucmd->cmd = req->async_data; return 0; } @@ -129,9 +130,6 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) WRITE_ONCE(ioucmd->cookie, NULL); } - if (req_has_async_data(req)) - ioucmd->cmd = req->async_data; - ret = file->f_op->uring_cmd(ioucmd, issue_flags); if (ret == -EAGAIN) { if (!req_has_async_data(req)) { -- cgit From ab1c590f5c9b96d8d8843d351aed72469f8f2ef0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Apr 2023 14:20:07 +0100 Subject: io_uring: move pinning out of io_req_local_work_add Move ctx pinning from io_req_local_work_add() to the caller, looks better and makes working with the code a bit easier. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/49c0dbed390b0d6d04cb942dd3592879fd5bfb1b.1680782017.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ae90d2753e0d..29a0516ee5ce 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1306,17 +1306,15 @@ static void io_req_local_work_add(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - percpu_ref_get(&ctx->refs); - if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) - goto put_ref; + return; /* needed for the following wake up */ smp_mb__after_atomic(); if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) { io_move_task_work_from_local(ctx); - goto put_ref; + return; } if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) @@ -1326,9 +1324,6 @@ static void io_req_local_work_add(struct io_kiocb *req) if (READ_ONCE(ctx->cq_waiting)) wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); - -put_ref: - percpu_ref_put(&ctx->refs); } void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) @@ -1337,7 +1332,9 @@ void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) struct io_ring_ctx *ctx = req->ctx; if (allow_local && ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + percpu_ref_get(&ctx->refs); io_req_local_work_add(req); + percpu_ref_put(&ctx->refs); return; } -- cgit From d73a572df24661851465c821d33c03e70e4b68e5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Apr 2023 14:20:08 +0100 Subject: io_uring: optimize local tw add ctx pinning We currently pin the ctx for io_req_local_work_add() with percpu_ref_get/put, which implies two rcu_read_lock/unlock pairs and some extra overhead on top in the fast path. Replace it with a pure rcu read and let io_ring_exit_work() synchronise against it. 
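Real RCU keeps the read side essentially free, which a few lines of portable C cannot reproduce; what can be sketched is the synchronisation contract, where fast-path users mark themselves visible and teardown waits them out instead of each user taking a reference. The sketch below (invented names, deliberately not RCU) still pays two atomics on the read side, i.e. exactly the overhead the patch removes, so it only illustrates the waiting contract, not the performance win:

#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int active;       /* in-flight fast-path users */
static atomic_bool dying;       /* set once teardown starts */

static bool work_add_enter(void)
{
        atomic_fetch_add(&active, 1);
        if (atomic_load(&dying)) {      /* teardown already started, back off */
                atomic_fetch_sub(&active, 1);
                return false;
        }
        return true;
}

static void work_add_exit(void)
{
        atomic_fetch_sub(&active, 1);
}

/* Teardown: forbid new entries, then wait for stragglers to drain; this
 * plays the role synchronize_rcu() plays in the patch. */
static void exit_work(void)
{
        atomic_store(&dying, true);
        while (atomic_load(&active))
                sched_yield();
}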
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cbdfcb6b232627f30e9e50ef91f13c4f05910247.1680782017.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 29a0516ee5ce..fb7215b543cd 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1332,9 +1332,9 @@ void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) struct io_ring_ctx *ctx = req->ctx; if (allow_local && ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - percpu_ref_get(&ctx->refs); + rcu_read_lock(); io_req_local_work_add(req); - percpu_ref_put(&ctx->refs); + rcu_read_unlock(); return; } @@ -3052,6 +3052,10 @@ static __cold void io_ring_exit_work(struct work_struct *work) spin_lock(&ctx->completion_lock); spin_unlock(&ctx->completion_lock); + /* pairs with RCU read section in io_req_local_work_add() */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + synchronize_rcu(); + io_ring_ctx_free(ctx); } -- cgit From 6e7248adf8f7adb5e36ec1e91efcc85a83bf8aeb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Apr 2023 14:20:09 +0100 Subject: io_uring: refactor io_cqring_wake() Instead of smp_mb() + __io_cqring_wake() in __io_cq_unlock_post_flush() use equivalent io_cqring_wake(). With that we can clean it up further and remove __io_cqring_wake(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/662ee5d898168ac206be06038525e97b64072a46.1680782017.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 ++---- io_uring/io_uring.h | 11 ++--------- 2 files changed, 4 insertions(+), 13 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fb7215b543cd..d4ac62de2113 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -640,10 +640,8 @@ static inline void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) * it will re-check the wakeup conditions once we return we can safely * skip waking it up. */ - if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) { - smp_mb(); - __io_cqring_wake(ctx); - } + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + io_cqring_wake(ctx); } void io_cq_unlock_post(struct io_ring_ctx *ctx) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 193b2db39fe8..24d8196bbca3 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -228,8 +228,7 @@ static inline void io_poll_wq_wake(struct io_ring_ctx *ctx) poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); } -/* requires smb_mb() prior, see wq_has_sleeper() */ -static inline void __io_cqring_wake(struct io_ring_ctx *ctx) +static inline void io_cqring_wake(struct io_ring_ctx *ctx) { /* * Trigger waitqueue handler on all waiters on our waitqueue. This @@ -241,17 +240,11 @@ static inline void __io_cqring_wake(struct io_ring_ctx *ctx) * waitqueue handlers, we know we have a dependency between eventfd or * epoll and should terminate multishot poll at that point. 
*/ - if (waitqueue_active(&ctx->cq_wait)) + if (wq_has_sleeper(&ctx->cq_wait)) __wake_up(&ctx->cq_wait, TASK_NORMAL, 0, poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); } -static inline void io_cqring_wake(struct io_ring_ctx *ctx) -{ - smp_mb(); - __io_cqring_wake(ctx); -} - static inline bool io_sqring_full(struct io_ring_ctx *ctx) { struct io_rings *r = ctx->rings; -- cgit From 8501fe70ae9855076ffb03a3670e02a7b3437304 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Apr 2023 14:20:10 +0100 Subject: io_uring: add tw add flags We pass 'allow_local' into io_req_task_work_add() but will need more flags. Replace it with a flags bit field and name this allow_local flag. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4c0f01e7ef4e6feebfb199093cc995af7a19befa.1680782017.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 7 ++++--- io_uring/io_uring.h | 9 +++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d4ac62de2113..6f175fe682e4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1324,12 +1324,13 @@ static void io_req_local_work_add(struct io_kiocb *req) wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); } -void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) +void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; - if (allow_local && ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + if (!(flags & IOU_F_TWQ_FORCE_NORMAL) && + (ctx->flags & IORING_SETUP_DEFER_TASKRUN)) { rcu_read_lock(); io_req_local_work_add(req); rcu_read_unlock(); @@ -1359,7 +1360,7 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) io_task_work.node); node = node->next; - __io_req_task_work_add(req, false); + __io_req_task_work_add(req, IOU_F_TWQ_FORCE_NORMAL); } } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 24d8196bbca3..cb4309a2acdc 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -15,6 +15,11 @@ #include #endif +enum { + /* don't use deferred task_work */ + IOU_F_TWQ_FORCE_NORMAL = 1, +}; + enum { IOU_OK = 0, IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, @@ -48,7 +53,7 @@ static inline bool io_req_ffs_set(struct io_kiocb *req) return req->flags & REQ_F_FIXED_FILE; } -void __io_req_task_work_add(struct io_kiocb *req, bool allow_local); +void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); @@ -93,7 +98,7 @@ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, static inline void io_req_task_work_add(struct io_kiocb *req) { - __io_req_task_work_add(req, true); + __io_req_task_work_add(req, 0); } #define io_for_each_link(pos, head) \ -- cgit From 5150940079a3ce94d7474f6f5b0d6276569dc1de Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Apr 2023 14:20:11 +0100 Subject: io_uring: inline llist_add() We'll need to grab some information from the previous request in the tw list, inline llist_add(), it'll be used in the following patch. 
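Open-coding the push is what exposes the previous head to the caller. In portable C11 the same lock-free push looks like the following sketch (generic names, not the kernel's llist implementation):

#include <stdatomic.h>
#include <stddef.h>

struct lnode {
        struct lnode *next;
};

/* Push onto a lock-free singly linked list and return the previous head,
 * so the caller can both detect "was empty" and inspect the old first
 * entry, which is what the follow-up patch needs. */
static struct lnode *list_push(_Atomic(struct lnode *) *head, struct lnode *node)
{
        struct lnode *first = atomic_load(head);

        do {
                node->next = first;
        } while (!atomic_compare_exchange_weak(head, &first, node));

        return first;   /* NULL means the list was previously empty */
}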
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f0165493af7b379943c792114b972f331e7d7d10.1680782017.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6f175fe682e4..786ecfa01c54 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1303,8 +1303,15 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx) static void io_req_local_work_add(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + struct llist_node *first; - if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) + first = READ_ONCE(ctx->work_llist.first); + do { + req->io_task_work.node.next = first; + } while (!try_cmpxchg(&ctx->work_llist.first, &first, + &req->io_task_work.node)); + + if (first) return; /* needed for the following wake up */ -- cgit From 8751d15426a31baaf40f7570263c27c3e5d1dc44 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Apr 2023 14:20:12 +0100 Subject: io_uring: reduce scheduling due to tw Every task_work will try to wake the task to be executed, which causes excessive scheduling and additional overhead. For some tw it's justified, but others won't do much but post a single CQE. When a task waits for multiple cqes, every such task_work will wake it up. Instead, the task may give a hint about how many cqes it waits for, io_req_local_work_add() will compare against it and skip wake ups if #cqes + #tw is not enough to satisfy the waiting condition. Task_work that uses the optimisation should be simple enough and never post more than one CQE. It's also ignored for non DEFER_TASKRUN rings. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d2b77e99d1e86624d8a69f7037d764b739dcd225.1680782017.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 68 ++++++++++++++++++++++++++++++++++++----------------- io_uring/io_uring.h | 9 +++++++ io_uring/notif.c | 2 +- io_uring/notif.h | 2 +- io_uring/rw.c | 2 +- 5 files changed, 59 insertions(+), 24 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 786ecfa01c54..8a327a81beaf 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1300,35 +1300,59 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx) } } -static void io_req_local_work_add(struct io_kiocb *req) +static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { struct io_ring_ctx *ctx = req->ctx; + unsigned nr_wait, nr_tw, nr_tw_prev; struct llist_node *first; + if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) + flags &= ~IOU_F_TWQ_LAZY_WAKE; + first = READ_ONCE(ctx->work_llist.first); do { + nr_tw_prev = 0; + if (first) { + struct io_kiocb *first_req = container_of(first, + struct io_kiocb, + io_task_work.node); + /* + * Might be executed at any moment, rely on + * SLAB_TYPESAFE_BY_RCU to keep it alive. 
+ */ + nr_tw_prev = READ_ONCE(first_req->nr_tw); + } + nr_tw = nr_tw_prev + 1; + /* Large enough to fail the nr_wait comparison below */ + if (!(flags & IOU_F_TWQ_LAZY_WAKE)) + nr_tw = -1U; + + req->nr_tw = nr_tw; req->io_task_work.node.next = first; } while (!try_cmpxchg(&ctx->work_llist.first, &first, &req->io_task_work.node)); - if (first) - return; - - /* needed for the following wake up */ - smp_mb__after_atomic(); - - if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) { - io_move_task_work_from_local(ctx); - return; + if (!first) { + if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) { + io_move_task_work_from_local(ctx); + return; + } + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + if (ctx->has_evfd) + io_eventfd_signal(ctx); } - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (ctx->has_evfd) - io_eventfd_signal(ctx); - - if (READ_ONCE(ctx->cq_waiting)) - wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); + nr_wait = atomic_read(&ctx->cq_wait_nr); + /* no one is waiting */ + if (!nr_wait) + return; + /* either not enough or the previous add has already woken it up */ + if (nr_wait > nr_tw || nr_tw_prev >= nr_wait) + return; + /* pairs with set_current_state() in io_cqring_wait() */ + smp_mb__after_atomic(); + wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); } void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) @@ -1339,7 +1363,7 @@ void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) if (!(flags & IOU_F_TWQ_FORCE_NORMAL) && (ctx->flags & IORING_SETUP_DEFER_TASKRUN)) { rcu_read_lock(); - io_req_local_work_add(req); + io_req_local_work_add(req, flags); rcu_read_unlock(); return; } @@ -2625,7 +2649,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, unsigned long check_cq; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - WRITE_ONCE(ctx->cq_waiting, 1); + int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); + + atomic_set(&ctx->cq_wait_nr, nr_wait); set_current_state(TASK_INTERRUPTIBLE); } else { prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, @@ -2634,7 +2660,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, ret = io_cqring_wait_schedule(ctx, &iowq); __set_current_state(TASK_RUNNING); - WRITE_ONCE(ctx->cq_waiting, 0); + atomic_set(&ctx->cq_wait_nr, 0); if (ret < 0) break; @@ -4517,7 +4543,7 @@ static int __init io_uring_init(void) io_uring_optable_init(); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | - SLAB_ACCOUNT); + SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); return 0; }; __initcall(io_uring_init); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index cb4309a2acdc..ef449e43d493 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -18,6 +18,15 @@ enum { /* don't use deferred task_work */ IOU_F_TWQ_FORCE_NORMAL = 1, + + /* + * A hint to not wake right away but delay until there are enough of + * tw's queued to match the number of CQEs the task is waiting for. + * + * Must not be used wirh requests generating more than one CQE. + * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. 
+ */ + IOU_F_TWQ_LAZY_WAKE = 2, }; enum { diff --git a/io_uring/notif.c b/io_uring/notif.c index 172105eb347d..e1846a25dde1 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -31,7 +31,7 @@ static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, struct io_kiocb *notif = cmd_to_io_kiocb(nd); if (refcount_dec_and_test(&uarg->refcnt)) - io_req_task_work_add(notif); + __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); } static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg, diff --git a/io_uring/notif.h b/io_uring/notif.h index c88c800cd89d..6dd1b30a468f 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -33,7 +33,7 @@ static inline void io_notif_flush(struct io_kiocb *notif) /* drop slot's master ref */ if (refcount_dec_and_test(&nd->uarg.refcnt)) - io_req_task_work_add(notif); + __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); } static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len) diff --git a/io_uring/rw.c b/io_uring/rw.c index f14868624f41..6c7d2654770e 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -304,7 +304,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res) return; io_req_set_res(req, io_fixup_rw_res(req, res), 0); req->io_task_work.func = io_req_rw_complete; - io_req_task_work_add(req); + __io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) -- cgit From c66ae3ec38f946edb1776d25c1c8cd63803b8ec3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Apr 2023 14:20:13 +0100 Subject: io_uring: refactor __io_cq_unlock_post_flush() Separate ->task_complete path in __io_cq_unlock_post_flush(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/baa9b8d822f024e4ee01c40209dbbe38d9c8c11d.1680782017.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8a327a81beaf..0ea50c46f27f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -627,21 +627,23 @@ static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) io_cqring_wake(ctx); } -static inline void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) +static void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { io_commit_cqring(ctx); - __io_cq_unlock(ctx); - io_commit_cqring_flush(ctx); - /* - * As ->task_complete implies that the ring is single tasked, cq_wait - * may only be waited on by the current in io_cqring_wait(), but since - * it will re-check the wakeup conditions once we return we can safely - * skip waking it up. - */ - if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + if (ctx->task_complete) { + /* + * ->task_complete implies that only current might be waiting + * for CQEs, and obviously, we currently don't. No one is + * waiting, wakeups are futile, skip them. + */ + io_commit_cqring_flush(ctx); + } else { + __io_cq_unlock(ctx); + io_commit_cqring_flush(ctx); io_cqring_wake(ctx); + } } void io_cq_unlock_post(struct io_ring_ctx *ctx) -- cgit From 360cd42c4e95ff06d8d7b0a54e42236c7e7c187f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Apr 2023 14:20:14 +0100 Subject: io_uring: optimise io_req_local_work_add Chains of memory accesses are never good for performance. 
The req->task->io_uring->in_cancel in io_req_local_work_add() is there so that when a task is exiting via io_uring_try_cancel_requests() and starts waiting for completions, it gets woken up by every new task_work item queued. Do a little trick by announcing waiting in io_uring_try_cancel_requests(), making io_req_local_work_add() wake us up. We also need to check for deferred tw items after prepare_to_wait(TASK_INTERRUPTIBLE); Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/fb11597e9bbcb365901824f8c5c2cf0d6ee100d0.1680782017.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0ea50c46f27f..9bbf58297a0e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1335,10 +1335,6 @@ static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) &req->io_task_work.node)); if (!first) { - if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) { - io_move_task_work_from_local(ctx); - return; - } if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (ctx->has_evfd) @@ -3205,6 +3201,12 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, enum io_wq_cancel cret; bool ret = false; + /* set it so io_req_local_work_add() would wake us up */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 1); + smp_mb(); + } + /* failed during ring init, it couldn't have issued any requests */ if (!ctx->rings) return false; @@ -3259,6 +3261,8 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) { struct io_uring_task *tctx = current->io_uring; struct io_ring_ctx *ctx; + struct io_tctx_node *node; + unsigned long index; s64 inflight; DEFINE_WAIT(wait); @@ -3280,9 +3284,6 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) break; if (!sqd) { - struct io_tctx_node *node; - unsigned long index; - xa_for_each(&tctx->xa, index, node) { /* sqpoll task will cancel all its requests */ if (node->ctx->sq_data) @@ -3305,7 +3306,13 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); io_run_task_work(); io_uring_drop_tctx_refs(current); - + xa_for_each(&tctx->xa, index, node) { + if (!llist_empty(&node->ctx->work_llist)) { + WARN_ON_ONCE(node->ctx->submitter_task && + node->ctx->submitter_task != current); + goto end_wait; + } + } /* * If we've seen completions, retry without waiting. This * avoids a race where a completion comes in before we did @@ -3313,6 +3320,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) */ if (inflight == tctx_inflight(tctx, !cancel_all)) schedule(); +end_wait: finish_wait(&tctx->wait, &wait); } while (1); -- cgit From 27a67079c0e548d5c3232c40951517cfa630fe51 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 12 Apr 2023 12:07:36 -0600 Subject: io_uring/uring_cmd: take advantage of completion batching We know now what the completion context is for the uring_cmd completion handling, so use that to have io_req_task_complete() decide what the best way to complete the request is. This allows batching of the posted completions if we have multiple pending, rather than always doing them one-by-one. 
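The win comes from turning "post one CQE immediately" into "append to the batch that will be flushed once the current locked section ends". A rough standalone sketch of that shape, with hypothetical names and a printf standing in for writing a real CQE:

#include <stdbool.h>
#include <stdio.h>

struct batch {
        int results[64];
        int nr;
};

static void post_cqe(int res)
{
        printf("cqe: %d\n", res);       /* stand-in for a real CQE write */
}

static void batch_flush(struct batch *batch)
{
        for (int i = 0; i < batch->nr; i++)
                post_cqe(batch->results[i]);
        batch->nr = 0;
}

/* If the completion runs in the locked/task context, defer into the batch;
 * otherwise fall back to posting right away. */
static void complete(struct batch *batch, int res, bool locked)
{
        if (locked && batch->nr < 64)
                batch->results[batch->nr++] = res;
        else
                post_cqe(res);
}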
Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index f7a96bc76ea1..5113c9a48583 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -54,11 +54,15 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2, io_req_set_res(req, ret, 0); if (req->ctx->flags & IORING_SETUP_CQE32) io_req_set_cqe32_extra(req, res2, 0); - if (req->ctx->flags & IORING_SETUP_IOPOLL) + if (req->ctx->flags & IORING_SETUP_IOPOLL) { /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); - else - io_req_complete_post(req, issue_flags); + } else { + struct io_tw_state ts = { + .locked = !(issue_flags & IO_URING_F_UNLOCKED), + }; + io_req_task_complete(req, &ts); + } } EXPORT_SYMBOL_GPL(io_uring_cmd_done); -- cgit From 8b1df11f97333d6d8647f1c6c0554eb2d9774396 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Apr 2023 12:06:01 +0100 Subject: io_uring: shut io_prep_async_work warning io_uring/io_uring.c:432 io_prep_async_work() error: we previously assumed 'req->file' could be null (see line 425). Even though it's a false positive as there will not be REQ_F_ISREG set without a file, let's add a simple check to make the kernel test robot happy. We don't care about performance here, but assumingly it'll be optimised out by the compiler. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a6cfbe92c74b789c0b4f046f7f98d19b1ca2e5b7.1681210788.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9bbf58297a0e..b171c26d331d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -425,7 +425,7 @@ static void io_prep_async_work(struct io_kiocb *req) if (req->file && !io_req_ffs_set(req)) req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT; - if (req->flags & REQ_F_ISREG) { + if (req->file && (req->flags & REQ_F_ISREG)) { bool should_hash = def->hash_reg_file; /* don't serialize this request if the fs doesn't need it */ -- cgit From ceac766a5581e4e671ec8e5236b8fdaed8e4c8c9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Apr 2023 12:06:02 +0100 Subject: io_uring/kbuf: remove extra ->buf_ring null check The kernel test robot complains about __io_remove_buffers(). io_uring/kbuf.c:221 __io_remove_buffers() warn: variable dereferenced before check 'bl->buf_ring' (see line 219) That check is not needed as ->buf_ring will always be set, so we can remove it and so silence the warning. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9a632bbf749d9d911e605255652ce08d18e7d2c6.1681210788.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'io_uring') diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 79c25459e8de..0905c1761fba 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -218,14 +218,12 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (bl->is_mapped) { i = bl->buf_ring->tail - bl->head; if (bl->is_mmap) { - if (bl->buf_ring) { - struct page *page; - - page = virt_to_head_page(bl->buf_ring); - if (put_page_testzero(page)) - free_compound_page(page); - bl->buf_ring = NULL; - } + struct page *page; + + page = virt_to_head_page(bl->buf_ring); + if (put_page_testzero(page)) + free_compound_page(page); + bl->buf_ring = NULL; bl->is_mmap = 0; } else if (bl->buf_nr_pages) { int j; -- cgit From 8ce4269eeedc5b31f5817f610b42cba8be8fa9de Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Apr 2023 12:06:03 +0100 Subject: io_uring: add irq lockdep checks We don't post CQEs from the IRQ context, add a check catching that. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f23f7a24dbe8027b3d37873fece2b6488f878b31.1681210788.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'io_uring') diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index ef449e43d493..25515d69d205 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -94,6 +94,8 @@ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, #define io_lockdep_assert_cq_locked(ctx) \ do { \ + lockdep_assert(in_task()); \ + \ if (ctx->flags & IORING_SETUP_IOPOLL) { \ lockdep_assert_held(&ctx->uring_lock); \ } else if (!ctx->task_complete) { \ -- cgit From 786788a8cfe03056e9c7b1c6e418c1db92a0ce80 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Apr 2023 12:06:04 +0100 Subject: io_uring/rsrc: add lockdep checks Add a lockdep chek to make sure that file and buffer updates hold ->uring_lock. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/961bbe6e433ec9bc0375127f23468b37b729df99.1681210788.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 603a783a0383..24e4e2109549 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -534,6 +534,8 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, __u32 tmp; int err; + lockdep_assert_held(&ctx->uring_lock); + if (check_add_overflow(up->offset, nr_args, &tmp)) return -EOVERFLOW; err = io_rsrc_node_switch_start(ctx); -- cgit From 528407b1e0ea51260fff2cc8b669c632a65d7a09 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Apr 2023 12:06:05 +0100 Subject: io_uring/rsrc: consolidate node caching We store one pre-allocated rsrc node in ->rsrc_backup_node, merge it with ->rsrc_node_cache. 
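With the backup pointer gone, the "can't fail later" guarantee comes from topping the cache up in advance. A compact sketch of that reserve-then-pop pattern (generic C, invented names, not the io_uring alloc cache):

#include <errno.h>
#include <stdlib.h>

struct entry {
        struct entry *next;
};

struct cache {
        struct entry *free_list;
};

/* May fail; called at a point where returning -ENOMEM is still easy. */
static int cache_reserve(struct cache *cache)
{
        struct entry *entry;

        if (cache->free_list)
                return 0;
        entry = calloc(1, sizeof(*entry));
        if (!entry)
                return -ENOMEM;
        entry->next = cache->free_list;
        cache->free_list = entry;
        return 0;
}

/* Cannot fail if cache_reserve() succeeded earlier. */
static struct entry *cache_get(struct cache *cache)
{
        struct entry *entry = cache->free_list;

        cache->free_list = entry->next;
        return entry;
}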
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6d5410e51ccd29be7a716be045b51d6b371baef6.1681210788.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/alloc_cache.h | 5 +++++ io_uring/io_uring.c | 2 -- io_uring/rsrc.c | 20 +++++++++++--------- 3 files changed, 16 insertions(+), 11 deletions(-) (limited to 'io_uring') diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 851a527afb5e..241245cb54a6 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -23,6 +23,11 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, return false; } +static inline bool io_alloc_cache_empty(struct io_alloc_cache *cache) +{ + return !cache->list.next; +} + static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) { if (cache->list.next) { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b171c26d331d..075bae8a2bb1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2852,8 +2852,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) /* there are no registered resources left, nobody uses it */ if (ctx->rsrc_node) io_rsrc_node_destroy(ctx, ctx->rsrc_node); - if (ctx->rsrc_backup_node) - io_rsrc_node_destroy(ctx, ctx->rsrc_backup_node); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 24e4e2109549..73f9e10d9bf0 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -230,7 +230,7 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill) __must_hold(&ctx->uring_lock) { - WARN_ON_ONCE(!ctx->rsrc_backup_node); + WARN_ON_ONCE(io_alloc_cache_empty(&ctx->rsrc_node_cache)); WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); if (data_to_kill) { @@ -245,18 +245,20 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, ctx->rsrc_node = NULL; } - if (!ctx->rsrc_node) { - ctx->rsrc_node = ctx->rsrc_backup_node; - ctx->rsrc_backup_node = NULL; - } + if (!ctx->rsrc_node) + ctx->rsrc_node = io_rsrc_node_alloc(ctx); } int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) { - if (ctx->rsrc_backup_node) - return 0; - ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx); - return ctx->rsrc_backup_node ? 0 : -ENOMEM; + if (io_alloc_cache_empty(&ctx->rsrc_node_cache)) { + struct io_rsrc_node *node = kzalloc(sizeof(*node), GFP_KERNEL); + + if (!node) + return -ENOMEM; + io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache); + } + return 0; } __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, -- cgit From 13c223962eac16f161cf9b6355209774c609af28 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Apr 2023 12:06:06 +0100 Subject: io_uring/rsrc: zero node's rsrc data on alloc struct io_rsrc_node::rsrc_data field is initialised on rsrc removal and shouldn't be used before that, still let's play safe and zero the field on alloc. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/09bd03cedc8da8a7974c5e6e4bf0489fd16593ab.1681210788.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 73f9e10d9bf0..329cc3851dfd 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -218,6 +218,7 @@ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) return NULL; } + ref_node->rsrc_data = NULL; ref_node->refs = 1; INIT_LIST_HEAD(&ref_node->node); INIT_LIST_HEAD(&ref_node->item_list); -- cgit From 2933ae6eaa05e8db6ad33a3ca12af18d2a25358c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Apr 2023 12:06:07 +0100 Subject: io_uring/rsrc: refactor io_rsrc_node_switch We use io_rsrc_node_switch() coupled with io_rsrc_node_switch_start() for a bunch of cases including initialising ctx->rsrc_node, i.e. by passing NULL instead of rsrc_data. Leave it to only deal with actual node changing. For that, first remove it from io_uring_create() and add a function allocating the first node. Then also remove all calls to io_rsrc_node_switch() from files/buffers register as we already have a node installed and it does essentially nothing. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d146fe306ff98b1a5a60c997c252534f03d423d7.1681210788.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 ++--- io_uring/rsrc.c | 36 +++++++++++------------------------- io_uring/rsrc.h | 7 +++++++ 3 files changed, 20 insertions(+), 28 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 075bae8a2bb1..9083a8466ebf 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3881,11 +3881,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ret = io_sq_offload_create(ctx, p); if (ret) goto err; - /* always set a rsrc node */ - ret = io_rsrc_node_switch_start(ctx); + + ret = io_rsrc_init(ctx); if (ret) goto err; - io_rsrc_node_switch(ctx, NULL); memset(&p->sq_off, 0, sizeof(p->sq_off)); p->sq_off.head = offsetof(struct io_rings, sq.head); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 329cc3851dfd..f2c660ffea74 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -204,7 +204,7 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) } } -static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) { struct io_rsrc_node *ref_node; struct io_cache_entry *entry; @@ -231,23 +231,18 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill) __must_hold(&ctx->uring_lock) { - WARN_ON_ONCE(io_alloc_cache_empty(&ctx->rsrc_node_cache)); - WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); + struct io_rsrc_node *node = ctx->rsrc_node; + struct io_rsrc_node *backup = io_rsrc_node_alloc(ctx); - if (data_to_kill) { - struct io_rsrc_node *rsrc_node = ctx->rsrc_node; - - rsrc_node->rsrc_data = data_to_kill; - list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); - - data_to_kill->refs++; - /* put master ref */ - io_put_rsrc_node(ctx, rsrc_node); - ctx->rsrc_node = NULL; - } + if (WARN_ON_ONCE(!backup)) + return; - if (!ctx->rsrc_node) - ctx->rsrc_node = io_rsrc_node_alloc(ctx); + data_to_kill->refs++; + node->rsrc_data = data_to_kill; + list_add_tail(&node->node, &ctx->rsrc_ref_list); + /* put master ref */ + io_put_rsrc_node(ctx, node); + ctx->rsrc_node = backup; } int 
io_rsrc_node_switch_start(struct io_ring_ctx *ctx) @@ -921,9 +916,6 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -EMFILE; if (nr_args > rlimit(RLIMIT_NOFILE)) return -EMFILE; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, &ctx->file_data); if (ret) @@ -978,7 +970,6 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, /* default it to the whole table */ io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files); - io_rsrc_node_switch(ctx, NULL); return 0; fail: __io_sqe_files_unregister(ctx); @@ -1260,9 +1251,6 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return -EBUSY; if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) return -EINVAL; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); if (ret) return ret; @@ -1300,8 +1288,6 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, ctx->buf_data = data; if (ret) __io_sqe_buffers_unregister(ctx); - else - io_rsrc_node_switch(ctx, NULL); return ret; } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 8729f2fee256..17dfe180208f 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -74,6 +74,7 @@ void io_rsrc_put_work(struct work_struct *work); void io_wait_rsrc_data(struct io_rsrc_data *data); void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc); void io_rsrc_node_switch(struct io_ring_ctx *ctx, @@ -164,6 +165,12 @@ static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) return &data->tags[table_idx][off]; } +static inline int io_rsrc_init(struct io_ring_ctx *ctx) +{ + ctx->rsrc_node = io_rsrc_node_alloc(ctx); + return ctx->rsrc_node ? 0 : -ENOMEM; +} + int io_files_update(struct io_kiocb *req, unsigned int issue_flags); int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -- cgit From d581076b6a85c6f8308a4ba2bdcd82651f5183df Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Apr 2023 12:06:08 +0100 Subject: io_uring/rsrc: extract SCM file put helper SCM file accounting is a slow path and is only used for UNIX files. Extract a helper out of io_rsrc_file_put() that does the SCM unaccounting. 
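Condensed from the hunk below (the diff is authoritative): the common case stays a likely() branch plus fput(), while the rare SCM bookkeeping for files accounted to the ring's UNIX socket moves into a __cold out-of-line helper.

static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
        struct file *file = prsrc->file;

        if (likely(!io_file_need_scm(file)))
                fput(file);                      /* hot path, stays inline */
        else
                io_rsrc_file_scm_put(ctx, file); /* rare UNIX-socket case, __cold */
}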
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/58cc7bffc2ee96bec8c2b89274a51febcbfa5556.1681210788.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f2c660ffea74..11058e20bdcc 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -832,20 +832,14 @@ int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) return 0; } -static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) +static __cold void io_rsrc_file_scm_put(struct io_ring_ctx *ctx, struct file *file) { - struct file *file = prsrc->file; #if defined(CONFIG_UNIX) struct sock *sock = ctx->ring_sock->sk; struct sk_buff_head list, *head = &sock->sk_receive_queue; struct sk_buff *skb; int i; - if (!io_file_need_scm(file)) { - fput(file); - return; - } - __skb_queue_head_init(&list); /* @@ -895,11 +889,19 @@ static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) __skb_queue_tail(head, skb); spin_unlock_irq(&head->lock); } -#else - fput(file); #endif } +static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) +{ + struct file *file = prsrc->file; + + if (likely(!io_file_need_scm(file))) + fput(file); + else + io_rsrc_file_scm_put(ctx, file); +} + int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags) { -- cgit From 519760df251bf2dcafb0af23df0229096537e78a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 15 Apr 2023 14:20:08 +0100 Subject: io_uring/notif: add constant for ubuf_info flags Add a constant IO_NOTIF_UBUF_FLAGS for struct ubuf_info flags that notifications use. That should minimise merge conflicts for planned changes touching both io_uring and net at the same time. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/notif.c | 2 +- io_uring/notif.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/notif.c b/io_uring/notif.c index e1846a25dde1..d3e703c37aba 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -79,7 +79,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) notif->io_task_work.func = io_req_task_complete; nd = io_notif_to_data(notif); - nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; + nd->uarg.flags = IO_NOTIF_UBUF_FLAGS; nd->uarg.callback = io_tx_ubuf_callback; refcount_set(&nd->uarg.refcnt, 1); return notif; diff --git a/io_uring/notif.h b/io_uring/notif.h index 6dd1b30a468f..86d32bd9f856 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -7,6 +7,7 @@ #include "rsrc.h" +#define IO_NOTIF_UBUF_FLAGS (SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN) #define IO_NOTIF_SPLICE_BATCH 32 struct io_notif_data { -- cgit From 953c37e066f05a3dca2d74643574b8dfe8a83983 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:05 +0100 Subject: io_uring/rsrc: use nospec'ed indexes We use array_index_nospec() for registered buffer indexes, but don't use it while poking into rsrc tags, fix that. 
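As background, array_index_nospec() clamps an index to [0, size) even under branch misspeculation, so a user-controlled index cannot be turned into a Spectre-v1 gadget. A generic usage sketch with placeholder names (not the io_uring code itself):

#include <linux/nospec.h>

/* 'idx' is user-controlled, 'nr' is the number of valid entries */
static u64 read_tag(u64 *tags, unsigned int nr, unsigned int idx)
{
        if (idx >= nr)
                return 0;
        idx = array_index_nospec(idx, nr);      /* clamp before the dependent load */
        return tags[idx];
}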
Fixes: 634d00df5e1cf ("io_uring: add full-fledged dynamic buffers support") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f02fafc5a9c0dd69be2b0618c38831c078232ff0.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 11058e20bdcc..3c1538b8c8f4 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -517,7 +517,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, } ctx->user_bufs[i] = imu; - *io_get_tag_slot(ctx->buf_data, offset) = tag; + *io_get_tag_slot(ctx->buf_data, i) = tag; } if (needs_switch) -- cgit From c732ea242d565c8281c4b017929fc62a246d81b9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:06 +0100 Subject: io_uring/rsrc: remove io_rsrc_node::done Kill io_rsrc_node::done and check refs instead, it's set when the node's refcount hits zero, and it won't change afterwards. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bbde361f4010f7e8bf196f1ecca27a763b79926f.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 5 +---- io_uring/rsrc.h | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 3c1538b8c8f4..5fc9d10743e0 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -191,14 +191,12 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) { struct io_ring_ctx *ctx = node->rsrc_data->ctx; - node->done = true; while (!list_empty(&ctx->rsrc_ref_list)) { node = list_first_entry(&ctx->rsrc_ref_list, struct io_rsrc_node, node); /* recycle ref nodes in order */ - if (!node->done) + if (node->refs) break; - list_del(&node->node); __io_rsrc_put_work(node); } @@ -222,7 +220,6 @@ struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) ref_node->refs = 1; INIT_LIST_HEAD(&ref_node->node); INIT_LIST_HEAD(&ref_node->item_list); - ref_node->done = false; ref_node->inline_items = 0; return ref_node; } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 17dfe180208f..88adcb0b7963 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -48,7 +48,6 @@ struct io_rsrc_node { struct list_head node; struct llist_node llist; int refs; - bool done; /* * Keeps a list of struct io_rsrc_put to be completed. Each entry -- cgit From eef81fcaa61e1bc6b7735be65f41bbf1a8efd133 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:07 +0100 Subject: io_uring/rsrc: refactor io_rsrc_ref_quiesce Refactor io_rsrc_ref_quiesce() by moving the first mutex_unlock(), so we don't have to have a second mutex_unlock() further in the loop. It prepares us for the next patch.
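Condensed shape of the loop after this refactor (the full hunk follows; the error path is elided): the lock is dropped at the top of every iteration and retaken before the exit condition is rechecked, so a single mutex_unlock() suffices.

data->quiesce = true;
do {
        mutex_unlock(&ctx->uring_lock);
        ret = io_run_task_work_sig(ctx);
        if (ret < 0) {
                mutex_lock(&ctx->uring_lock);
                /* ... error handling unchanged ... */
                break;
        }
        wait_for_completion_interruptible(&data->done);
        mutex_lock(&ctx->uring_lock);
        ret = 0;
} while (data->refs);
data->quiesce = false;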
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/65bc876271fb16bf550a53a4c76c91aacd94e52e.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 5fc9d10743e0..d7e7528f7159 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -272,8 +272,8 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, return 0; data->quiesce = true; - mutex_unlock(&ctx->uring_lock); do { + mutex_unlock(&ctx->uring_lock); ret = io_run_task_work_sig(ctx); if (ret < 0) { mutex_lock(&ctx->uring_lock); @@ -285,18 +285,10 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, } break; } - ret = wait_for_completion_interruptible(&data->done); - if (!ret) { - mutex_lock(&ctx->uring_lock); - if (!data->refs) - break; - /* - * it has been revived by another thread while - * we were unlocked - */ - mutex_unlock(&ctx->uring_lock); - } - } while (1); + wait_for_completion_interruptible(&data->done); + mutex_lock(&ctx->uring_lock); + ret = 0; + } while (data->refs); data->quiesce = false; return ret; -- cgit From 4ea15b56f0810f0d8795d475db1bb74b3a7c1b2f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:08 +0100 Subject: io_uring/rsrc: use wq for quiescing Replace completions with waitqueues for rsrc data quiesce, the main wakeup condition is when data refs hit zero. Note that data refs are only changes under ->uring_lock, so we prepare before mutex_unlock() reacquire it after taking the lock back. This change will be needed in the next patch. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1d0dbc74b3b4fd67c8f01819e680c5e0da252956.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 1 + io_uring/rsrc.c | 18 ++++++++++++------ io_uring/rsrc.h | 1 - 3 files changed, 13 insertions(+), 7 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9083a8466ebf..3c1c8c788b7b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -321,6 +321,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); init_waitqueue_head(&ctx->poll_wq); + init_waitqueue_head(&ctx->rsrc_quiesce_wq); spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d7e7528f7159..f9ce4076c73d 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -158,6 +158,7 @@ static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data, static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) { struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; + struct io_ring_ctx *ctx = rsrc_data->ctx; struct io_rsrc_put *prsrc, *tmp; if (ref_node->inline_items) @@ -171,13 +172,13 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) io_rsrc_node_destroy(rsrc_data->ctx, ref_node); if (io_put_rsrc_data_ref(rsrc_data)) - complete(&rsrc_data->done); + wake_up_all(&ctx->rsrc_quiesce_wq); } void io_wait_rsrc_data(struct io_rsrc_data *data) { - if (data && !io_put_rsrc_data_ref(data)) - wait_for_completion(&data->done); + if (data) + WARN_ON_ONCE(!io_put_rsrc_data_ref(data)); } void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) @@ -257,6 +258,7 @@ int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) __cold static int 
io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx) { + DEFINE_WAIT(we); int ret; /* As we may drop ->uring_lock, other task may have started quiesce */ @@ -273,7 +275,9 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, data->quiesce = true; do { + prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE); mutex_unlock(&ctx->uring_lock); + ret = io_run_task_work_sig(ctx); if (ret < 0) { mutex_lock(&ctx->uring_lock); @@ -285,12 +289,15 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, } break; } - wait_for_completion_interruptible(&data->done); + + schedule(); + __set_current_state(TASK_RUNNING); mutex_lock(&ctx->uring_lock); ret = 0; } while (data->refs); - data->quiesce = false; + finish_wait(&ctx->rsrc_quiesce_wq, &we); + data->quiesce = false; return ret; } @@ -366,7 +373,6 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, goto fail; } } - init_completion(&data->done); *pdata = data; return 0; fail: diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 88adcb0b7963..d93ba4e9742a 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -35,7 +35,6 @@ struct io_rsrc_data { u64 **tags; unsigned int nr; rsrc_put_fn *do_put; - struct completion done; int refs; bool quiesce; }; -- cgit From 7d481e0356334eb2de254414769b4bed4b2a8827 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:09 +0100 Subject: io_uring/rsrc: fix DEFER_TASKRUN rsrc quiesce For io_rsrc_ref_quiesce() to progress it should execute all task_work items, including deferred ones. However, currently nobody would wake us, and so let's set ctx->cq_wait_nr, so io_req_local_work_add() would wake us up. Fixes: c0e0d6ba25f18 ("io_uring: add IORING_SETUP_DEFER_TASKRUN") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f1a90d1bc5ebf096475b018fed52e54f3b89d4af.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f9ce4076c73d..e634ef384724 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -273,6 +273,11 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, if (io_put_rsrc_data_ref(data)) return 0; + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 1); + smp_mb(); + } + data->quiesce = true; do { prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE); @@ -298,6 +303,10 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, finish_wait(&ctx->rsrc_quiesce_wq, &we); data->quiesce = false; + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 0); + smp_mb(); + } return ret; } -- cgit From 0b222eeb6514ba6c3457b667fa4f3645032e1fc9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:10 +0100 Subject: io_uring/rsrc: remove rsrc_data refs Instead of waiting for rsrc_data->refs to be downed to zero, check whether there are rsrc nodes queued for completion, that's easier then maintaining references. 
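Taken together with the waitqueue conversion two commits up, the quiesce path is now an instance of the classic open-coded wait loop. A generic sketch of that idiom, with placeholder wq/condition names and signal handling elided (not the exact io_uring code):

DEFINE_WAIT(wait);

for (;;) {
        prepare_to_wait(&wq, &wait, TASK_INTERRUPTIBLE);
        if (condition)          /* here: list_empty(&ctx->rsrc_ref_list) */
                break;
        schedule();             /* woken by wake_up_all(&wq) on the put side */
}
finish_wait(&wq, &wait);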
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8e33fd143d83e11af3e386aea28eb6d6c6a1be10.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- io_uring/rsrc.c | 32 ++++++++------------------------ io_uring/rsrc.h | 2 -- 3 files changed, 10 insertions(+), 28 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3c1c8c788b7b..3d43df8f1e4e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2831,8 +2831,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ - io_wait_rsrc_data(ctx->buf_data); - io_wait_rsrc_data(ctx->file_data); + if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list))) + return; mutex_lock(&ctx->uring_lock); if (ctx->buf_data) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index e634ef384724..5415a18844e0 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -31,11 +31,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) -static inline bool io_put_rsrc_data_ref(struct io_rsrc_data *rsrc_data) -{ - return !--rsrc_data->refs; -} - int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -158,7 +153,6 @@ static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data, static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) { struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; - struct io_ring_ctx *ctx = rsrc_data->ctx; struct io_rsrc_put *prsrc, *tmp; if (ref_node->inline_items) @@ -171,14 +165,6 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) } io_rsrc_node_destroy(rsrc_data->ctx, ref_node); - if (io_put_rsrc_data_ref(rsrc_data)) - wake_up_all(&ctx->rsrc_quiesce_wq); -} - -void io_wait_rsrc_data(struct io_rsrc_data *data) -{ - if (data) - WARN_ON_ONCE(!io_put_rsrc_data_ref(data)); } void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) @@ -201,6 +187,8 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) list_del(&node->node); __io_rsrc_put_work(node); } + if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce)) + wake_up_all(&ctx->rsrc_quiesce_wq); } struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) @@ -235,7 +223,6 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, if (WARN_ON_ONCE(!backup)) return; - data_to_kill->refs++; node->rsrc_data = data_to_kill; list_add_tail(&node->node, &ctx->rsrc_ref_list); /* put master ref */ @@ -269,8 +256,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, return ret; io_rsrc_node_switch(ctx, data); - /* kill initial ref */ - if (io_put_rsrc_data_ref(data)) + if (list_empty(&ctx->rsrc_ref_list)) return 0; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { @@ -278,6 +264,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, smp_mb(); } + ctx->rsrc_quiesce++; data->quiesce = true; do { prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE); @@ -286,12 +273,8 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, ret = io_run_task_work_sig(ctx); if (ret < 0) { mutex_lock(&ctx->uring_lock); - if (!data->refs) { + if (list_empty(&ctx->rsrc_ref_list)) ret = 0; - } else { - /* restore the master reference */ - data->refs++; - } break; } @@ -299,10 +282,12 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, 
__set_current_state(TASK_RUNNING); mutex_lock(&ctx->uring_lock); ret = 0; - } while (data->refs); + } while (!list_empty(&ctx->rsrc_ref_list)); finish_wait(&ctx->rsrc_quiesce_wq, &we); data->quiesce = false; + ctx->rsrc_quiesce--; + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { atomic_set(&ctx->cq_wait_nr, 0); smp_mb(); @@ -371,7 +356,6 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, data->nr = nr; data->ctx = ctx; data->do_put = do_put; - data->refs = 1; if (utags) { ret = -EFAULT; for (i = 0; i < nr; i++) { diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index d93ba4e9742a..5dd2fcb28069 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -35,7 +35,6 @@ struct io_rsrc_data { u64 **tags; unsigned int nr; rsrc_put_fn *do_put; - int refs; bool quiesce; }; @@ -69,7 +68,6 @@ struct io_mapped_ubuf { void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_put_work(struct work_struct *work); -void io_wait_rsrc_data(struct io_rsrc_data *data); void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); -- cgit From 2f2af35f8e5a1ed552ed02e47277d50092a2b9f6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:11 +0100 Subject: io_uring/rsrc: inline switch_start fast path Inline the part of io_rsrc_node_switch_start() that checks whether the cache is empty or not, as most of the times it will have some number of entries in there. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9619c1717a0e01f22c5fce2f1ba2735f804da0f2.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 12 +++++------- io_uring/rsrc.h | 9 ++++++++- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 5415a18844e0..bfa0b382c6c6 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -230,15 +230,13 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, ctx->rsrc_node = backup; } -int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) +int __io_rsrc_node_switch_start(struct io_ring_ctx *ctx) { - if (io_alloc_cache_empty(&ctx->rsrc_node_cache)) { - struct io_rsrc_node *node = kzalloc(sizeof(*node), GFP_KERNEL); + struct io_rsrc_node *node = kzalloc(sizeof(*node), GFP_KERNEL); - if (!node) - return -ENOMEM; - io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache); - } + if (!node) + return -ENOMEM; + io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache); return 0; } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 5dd2fcb28069..732496afed4c 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -69,7 +69,7 @@ void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_put_work(struct work_struct *work); void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); -int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); +int __io_rsrc_node_switch_start(struct io_ring_ctx *ctx); struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc); @@ -111,6 +111,13 @@ static inline int io_scm_file_account(struct io_ring_ctx *ctx, return __io_scm_file_account(ctx, file); } +static inline int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) +{ + if (unlikely(io_alloc_cache_empty(&ctx->rsrc_node_cache))) 
+ return __io_rsrc_node_switch_start(ctx); + return 0; +} + int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args); int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, -- cgit From 9a57fffedc0ee078418a7793ab29cd3864205340 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:12 +0100 Subject: io_uring/rsrc: clean up __io_sqe_buffers_update() Inline offset variable, so we don't use it without subjecting it to array_index_nospec() first. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/77936d9ed23755588810c5eafcea7e1c3b90e3cd.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index bfa0b382c6c6..38f0c9ce67a7 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -469,7 +469,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, for (done = 0; done < nr_args; done++) { struct io_mapped_ubuf *imu; - int offset = up->offset + done; u64 tag = 0; err = io_copy_iov(ctx, &iov, iovs, done); @@ -490,7 +489,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, if (err) break; - i = array_index_nospec(offset, ctx->nr_user_bufs); + i = array_index_nospec(up->offset + done, ctx->nr_user_bufs); if (ctx->user_bufs[i] != ctx->dummy_ubuf) { err = io_queue_rsrc_removal(ctx->buf_data, i, ctx->rsrc_node, ctx->user_bufs[i]); -- cgit From c87fd583f3b5ef770af33893394ea37c7a10b5b8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:13 +0100 Subject: io_uring/rsrc: simplify single file node switching At maximum io_install_fixed_file() removes only one file, so no need to keep needs_switch state and we can call io_rsrc_node_switch() right after removal. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/37cfb46f46160f81dced79f646e97db608994574.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/filetable.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'io_uring') diff --git a/io_uring/filetable.c b/io_uring/filetable.c index b80614e7d605..6255fa255ae2 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -64,7 +64,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, u32 slot_index) __must_hold(&req->ctx->uring_lock) { - bool needs_switch = false; struct io_fixed_file *file_slot; int ret; @@ -83,16 +82,17 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, ret = io_rsrc_node_switch_start(ctx); if (ret) - goto err; + return ret; old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); ret = io_queue_rsrc_removal(ctx->file_data, slot_index, ctx->rsrc_node, old_file); if (ret) - goto err; + return ret; + file_slot->file_ptr = 0; io_file_bitmap_clear(&ctx->file_table, slot_index); - needs_switch = true; + io_rsrc_node_switch(ctx, ctx->file_data); } ret = io_scm_file_account(ctx, file); @@ -101,9 +101,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, io_fixed_file_set(file_slot, file); io_file_bitmap_set(&ctx->file_table, slot_index); } -err: - if (needs_switch) - io_rsrc_node_switch(ctx, ctx->file_data); return ret; } -- cgit From c899a5d7d0eca054546b63e95c94b1e609516f84 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:14 +0100 Subject: io_uring/rsrc: refactor io_queue_rsrc_removal We can queue up a rsrc into a list in io_queue_rsrc_removal() while allocating io_rsrc_put and so simplify the function. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/36bd708ee25c0e2e7992dc19b17db166eea9ac40.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 38f0c9ce67a7..db58a51d19da 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -685,7 +685,6 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, { u64 *tag_slot = io_get_tag_slot(data, idx); struct io_rsrc_put *prsrc; - bool inline_item = true; if (!node->inline_items) { prsrc = &node->item; @@ -694,14 +693,12 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); if (!prsrc) return -ENOMEM; - inline_item = false; + list_add(&prsrc->list, &node->item_list); } prsrc->tag = *tag_slot; *tag_slot = 0; prsrc->rsrc = rsrc; - if (!inline_item) - list_add(&prsrc->list, &node->item_list); return 0; } -- cgit From 2e6f45ac0e640bbd49296adfa0982c84f85fa342 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 18 Apr 2023 14:06:34 +0100 Subject: io_uring/rsrc: remove unused io_rsrc_node::llist ->llist was needed for rsrc node destruction offload, which is removed now. Get rid of the unused field. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8e7d764c3f947489fde88d0927c3060d2e1bb599.1681822823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 732496afed4c..525905a30a55 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -44,7 +44,6 @@ struct io_rsrc_node { struct io_rsrc_data *rsrc_data; }; struct list_head node; - struct llist_node llist; int refs; /* -- cgit From 63fea89027ff4fd4f350b471ad5b9220d373eec5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 18 Apr 2023 14:06:35 +0100 Subject: io_uring/rsrc: infer node from ctx on io_queue_rsrc_removal For io_queue_rsrc_removal() we should always use the current active rsrc node, don't pass it directly but let the function grab it from the context. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d15939b4afea730978b4925685c2577538b823bb.1681822823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/filetable.c | 5 ++--- io_uring/rsrc.c | 9 +++++---- io_uring/rsrc.h | 3 +-- 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'io_uring') diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 6255fa255ae2..367a44a6c8c5 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -85,8 +85,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, return ret; old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, slot_index, - ctx->rsrc_node, old_file); + ret = io_queue_rsrc_removal(ctx->file_data, slot_index, old_file); if (ret) return ret; @@ -163,7 +162,7 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) return -EBADF; file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); + ret = io_queue_rsrc_removal(ctx->file_data, offset, file); if (ret) return ret; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index db58a51d19da..3be483de613e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -409,7 +409,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (file_slot->file_ptr) { file = (struct file *)(file_slot->file_ptr & FFS_MASK); - err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file); + err = io_queue_rsrc_removal(data, i, file); if (err) break; file_slot->file_ptr = 0; @@ -492,7 +492,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, i = array_index_nospec(up->offset + done, ctx->nr_user_bufs); if (ctx->user_bufs[i] != ctx->dummy_ubuf) { err = io_queue_rsrc_removal(ctx->buf_data, i, - ctx->rsrc_node, ctx->user_bufs[i]); + ctx->user_bufs[i]); if (unlikely(err)) { io_buffer_unmap(ctx, &imu); break; @@ -680,9 +680,10 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, - struct io_rsrc_node *node, void *rsrc) +int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc) { + struct io_ring_ctx *ctx = data->ctx; + struct io_rsrc_node *node = ctx->rsrc_node; u64 *tag_slot = io_get_tag_slot(data, idx); struct io_rsrc_put *prsrc; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 525905a30a55..8ed3e6a65cf6 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -70,8 +70,7 @@ void io_rsrc_put_work(struct work_struct *work); void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); int 
__io_rsrc_node_switch_start(struct io_ring_ctx *ctx); struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); -int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, - struct io_rsrc_node *node, void *rsrc); +int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc); void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill); -- cgit From c376644fb915fbdea8c4a04f859d032a8be352fd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 18 Apr 2023 14:06:36 +0100 Subject: io_uring/rsrc: merge nodes and io_rsrc_put struct io_rsrc_node carries a number of resources represented by struct io_rsrc_put. That was handy before for sync overhead ammortisation, but all complexity is gone and nodes are simple and lightweight. Let's allocate a separate node for each resource. Nodes and io_rsrc_put and not much different in size, and former are cached, so node allocation should work better. That also removes some overhead for nested iteration in io_rsrc_node_ref_zero() / __io_rsrc_put_work(). Another reason for the patch is that it greatly reduces complexity by moving io_rsrc_node_switch[_start]() inside io_queue_rsrc_removal(), so users don't have to care about it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c7d3a45b30cc14cd93700a710dd112edc703db98.1681822823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/filetable.c | 9 ------ io_uring/rsrc.c | 91 +++++++++++++--------------------------------------- io_uring/rsrc.h | 22 ++----------- 3 files changed, 25 insertions(+), 97 deletions(-) (limited to 'io_uring') diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 367a44a6c8c5..0f6fa791a47d 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -80,10 +80,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, if (file_slot->file_ptr) { struct file *old_file; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); ret = io_queue_rsrc_removal(ctx->file_data, slot_index, old_file); if (ret) @@ -91,7 +87,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, file_slot->file_ptr = 0; io_file_bitmap_clear(&ctx->file_table, slot_index); - io_rsrc_node_switch(ctx, ctx->file_data); } ret = io_scm_file_account(ctx, file); @@ -152,9 +147,6 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) return -ENXIO; if (offset >= ctx->nr_user_files) return -EINVAL; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; offset = array_index_nospec(offset, ctx->nr_user_files); file_slot = io_fixed_file_slot(&ctx->file_table, offset); @@ -168,7 +160,6 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) file_slot->file_ptr = 0; io_file_bitmap_clear(&ctx->file_table, offset); - io_rsrc_node_switch(ctx, ctx->file_data); return 0; } diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 3be483de613e..a54a222a20b8 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -153,17 +153,10 @@ static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data, static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) { struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; - struct io_rsrc_put *prsrc, *tmp; - if (ref_node->inline_items) + if (likely(ref_node->inline_items)) io_rsrc_put_work_one(rsrc_data, &ref_node->item); - list_for_each_entry_safe(prsrc, tmp, &ref_node->item_list, list) { - list_del(&prsrc->list); - io_rsrc_put_work_one(rsrc_data, 
prsrc); - kfree(prsrc); - } - io_rsrc_node_destroy(rsrc_data->ctx, ref_node); } @@ -206,53 +199,29 @@ struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) } ref_node->rsrc_data = NULL; - ref_node->refs = 1; - INIT_LIST_HEAD(&ref_node->node); - INIT_LIST_HEAD(&ref_node->item_list); ref_node->inline_items = 0; + ref_node->refs = 1; return ref_node; } -void io_rsrc_node_switch(struct io_ring_ctx *ctx, - struct io_rsrc_data *data_to_kill) - __must_hold(&ctx->uring_lock) -{ - struct io_rsrc_node *node = ctx->rsrc_node; - struct io_rsrc_node *backup = io_rsrc_node_alloc(ctx); - - if (WARN_ON_ONCE(!backup)) - return; - - node->rsrc_data = data_to_kill; - list_add_tail(&node->node, &ctx->rsrc_ref_list); - /* put master ref */ - io_put_rsrc_node(ctx, node); - ctx->rsrc_node = backup; -} - -int __io_rsrc_node_switch_start(struct io_ring_ctx *ctx) -{ - struct io_rsrc_node *node = kzalloc(sizeof(*node), GFP_KERNEL); - - if (!node) - return -ENOMEM; - io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache); - return 0; -} - __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx) { + struct io_rsrc_node *backup; DEFINE_WAIT(we); int ret; - /* As we may drop ->uring_lock, other task may have started quiesce */ + /* As We may drop ->uring_lock, other task may have started quiesce */ if (data->quiesce) return -ENXIO; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - io_rsrc_node_switch(ctx, data); + + backup = io_rsrc_node_alloc(ctx); + if (!backup) + return -ENOMEM; + ctx->rsrc_node->rsrc_data = data; + list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list); + io_put_rsrc_node(ctx, ctx->rsrc_node); + ctx->rsrc_node = backup; if (list_empty(&ctx->rsrc_ref_list)) return 0; @@ -382,7 +351,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct file *file; int fd, i, err = 0; unsigned int done; - bool needs_switch = false; if (!ctx->file_data) return -ENXIO; @@ -414,7 +382,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, break; file_slot->file_ptr = 0; io_file_bitmap_clear(&ctx->file_table, i); - needs_switch = true; } if (fd != -1) { file = fget(fd); @@ -445,9 +412,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, io_file_bitmap_set(&ctx->file_table, i); } } - - if (needs_switch) - io_rsrc_node_switch(ctx, data); return done ? done : err; } @@ -458,7 +422,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, u64 __user *tags = u64_to_user_ptr(up->tags); struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); struct page *last_hpage = NULL; - bool needs_switch = false; __u32 done; int i, err; @@ -498,15 +461,11 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, break; } ctx->user_bufs[i] = ctx->dummy_ubuf; - needs_switch = true; } ctx->user_bufs[i] = imu; *io_get_tag_slot(ctx->buf_data, i) = tag; } - - if (needs_switch) - io_rsrc_node_switch(ctx, ctx->buf_data); return done ? 
done : err; } @@ -515,15 +474,11 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, unsigned nr_args) { __u32 tmp; - int err; lockdep_assert_held(&ctx->uring_lock); if (check_add_overflow(up->offset, nr_args, &tmp)) return -EOVERFLOW; - err = io_rsrc_node_switch_start(ctx); - if (err) - return err; switch (type) { case IORING_RSRC_FILE: @@ -685,21 +640,21 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc) struct io_ring_ctx *ctx = data->ctx; struct io_rsrc_node *node = ctx->rsrc_node; u64 *tag_slot = io_get_tag_slot(data, idx); - struct io_rsrc_put *prsrc; - if (!node->inline_items) { - prsrc = &node->item; - node->inline_items++; - } else { - prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); - if (!prsrc) - return -ENOMEM; - list_add(&prsrc->list, &node->item_list); + ctx->rsrc_node = io_rsrc_node_alloc(ctx); + if (unlikely(!ctx->rsrc_node)) { + ctx->rsrc_node = node; + return -ENOMEM; } - prsrc->tag = *tag_slot; + node->item.rsrc = rsrc; + node->item.tag = *tag_slot; + node->inline_items = 1; *tag_slot = 0; - prsrc->rsrc = rsrc; + + node->rsrc_data = data; + list_add_tail(&node->node, &ctx->rsrc_ref_list); + io_put_rsrc_node(ctx, node); return 0; } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 8ed3e6a65cf6..bad7103f5033 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -18,7 +18,6 @@ enum { }; struct io_rsrc_put { - struct list_head list; u64 tag; union { void *rsrc; @@ -43,17 +42,10 @@ struct io_rsrc_node { struct io_cache_entry cache; struct io_rsrc_data *rsrc_data; }; - struct list_head node; int refs; - - /* - * Keeps a list of struct io_rsrc_put to be completed. Each entry - * represents one rsrc (e.g. file or buffer), but all of them should've - * came from the same table and so are of the same type. - */ - struct list_head item_list; - struct io_rsrc_put item; int inline_items; + struct list_head node; + struct io_rsrc_put item; }; struct io_mapped_ubuf { @@ -68,11 +60,8 @@ void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_put_work(struct work_struct *work); void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); -int __io_rsrc_node_switch_start(struct io_ring_ctx *ctx); struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc); -void io_rsrc_node_switch(struct io_ring_ctx *ctx, - struct io_rsrc_data *data_to_kill); int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, @@ -109,13 +98,6 @@ static inline int io_scm_file_account(struct io_ring_ctx *ctx, return __io_scm_file_account(ctx, file); } -static inline int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) -{ - if (unlikely(io_alloc_cache_empty(&ctx->rsrc_node_cache))) - return __io_rsrc_node_switch_start(ctx); - return 0; -} - int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args); int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, -- cgit From 26147da37f3e52041d9deba189d39f27ce78a84f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 18 Apr 2023 14:06:37 +0100 Subject: io_uring/rsrc: add empty flag in rsrc_node Unless a node was flushed by io_rsrc_ref_quiesce(), it'll carry a resource. Replace ->inline_items with an empty flag, which is initialised to false and only raised in io_rsrc_ref_quiesce(). 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/75d384c9d2252e12af73b9cf8a44e1699106aeb1.1681822823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 6 +++--- io_uring/rsrc.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index a54a222a20b8..127bd602131e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -154,7 +154,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) { struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; - if (likely(ref_node->inline_items)) + if (likely(!ref_node->empty)) io_rsrc_put_work_one(rsrc_data, &ref_node->item); io_rsrc_node_destroy(rsrc_data->ctx, ref_node); @@ -199,7 +199,7 @@ struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) } ref_node->rsrc_data = NULL; - ref_node->inline_items = 0; + ref_node->empty = 0; ref_node->refs = 1; return ref_node; } @@ -218,6 +218,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, backup = io_rsrc_node_alloc(ctx); if (!backup) return -ENOMEM; + ctx->rsrc_node->empty = true; ctx->rsrc_node->rsrc_data = data; list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list); io_put_rsrc_node(ctx, ctx->rsrc_node); @@ -649,7 +650,6 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc) node->item.rsrc = rsrc; node->item.tag = *tag_slot; - node->inline_items = 1; *tag_slot = 0; node->rsrc_data = data; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index bad7103f5033..f3fe455c6c71 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -43,7 +43,7 @@ struct io_rsrc_node { struct io_rsrc_data *rsrc_data; }; int refs; - int inline_items; + bool empty; struct list_head node; struct io_rsrc_put item; }; -- cgit From 4130b49991d6b8ca0ea44cb256e710c4e48d7f01 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 18 Apr 2023 14:06:38 +0100 Subject: io_uring/rsrc: inline io_rsrc_put_work() io_rsrc_put_work() is simple enough to be open coded into its only caller. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1b36dd46766ced39a9b160767babfa2fce07b8f8.1681822823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 19 ++++++------------- io_uring/rsrc.h | 1 - 2 files changed, 6 insertions(+), 14 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 127bd602131e..d1167b0643b7 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -140,8 +140,8 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo *slot = NULL; } -static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data, - struct io_rsrc_put *prsrc) +static void io_rsrc_put_work(struct io_rsrc_data *rsrc_data, + struct io_rsrc_put *prsrc) { struct io_ring_ctx *ctx = rsrc_data->ctx; @@ -150,16 +150,6 @@ static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data, rsrc_data->do_put(ctx, prsrc); } -static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) -{ - struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; - - if (likely(!ref_node->empty)) - io_rsrc_put_work_one(rsrc_data, &ref_node->item); - - io_rsrc_node_destroy(rsrc_data->ctx, ref_node); -} - void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache)) @@ -178,7 +168,10 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) if (node->refs) break; list_del(&node->node); - __io_rsrc_put_work(node); + + if (likely(!node->empty)) + io_rsrc_put_work(node->rsrc_data, &node->item); + io_rsrc_node_destroy(ctx, node); } if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce)) wake_up_all(&ctx->rsrc_quiesce_wq); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index f3fe455c6c71..232079363f6a 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -58,7 +58,6 @@ struct io_mapped_ubuf { void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node); -void io_rsrc_put_work(struct work_struct *work); void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc); -- cgit From 29b26c556e7439b1370ac6a59fce83a9d1521de1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 18 Apr 2023 14:06:39 +0100 Subject: io_uring/rsrc: pass node to io_rsrc_put_work() Instead of passing rsrc_data and a resource to io_rsrc_put_work() just forward node, that's all the function needs. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/791e8edd28d78797240b74d34e99facbaad62f3b.1681822823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d1167b0643b7..9378691d49f5 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -140,14 +140,14 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo *slot = NULL; } -static void io_rsrc_put_work(struct io_rsrc_data *rsrc_data, - struct io_rsrc_put *prsrc) +static void io_rsrc_put_work(struct io_rsrc_node *node) { - struct io_ring_ctx *ctx = rsrc_data->ctx; + struct io_rsrc_data *data = node->rsrc_data; + struct io_rsrc_put *prsrc = &node->item; if (prsrc->tag) - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); - rsrc_data->do_put(ctx, prsrc); + io_post_aux_cqe(data->ctx, prsrc->tag, 0, 0); + data->do_put(data->ctx, prsrc); } void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) @@ -170,7 +170,7 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) list_del(&node->node); if (likely(!node->empty)) - io_rsrc_put_work(node->rsrc_data, &node->item); + io_rsrc_put_work(node); io_rsrc_node_destroy(ctx, node); } if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce)) -- cgit From fc7f3a8d3a78503c4f3e108155fb9a233dc307a4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 18 Apr 2023 14:06:40 +0100 Subject: io_uring/rsrc: devirtualise rsrc put callbacks We only have two rsrc types, buffers and files, replace virtual callbacks for putting resources down with a switch..case. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/02ca727bf8e5f7f820c2f404e95ae88c8f472930.1681822823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 25 +++++++++++++++++++------ io_uring/rsrc.h | 2 +- 2 files changed, 20 insertions(+), 7 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 9378691d49f5..62988b3aa927 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -23,6 +23,8 @@ struct io_rsrc_update { u32 offset; }; +static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); +static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage); @@ -147,7 +149,18 @@ static void io_rsrc_put_work(struct io_rsrc_node *node) if (prsrc->tag) io_post_aux_cqe(data->ctx, prsrc->tag, 0, 0); - data->do_put(data->ctx, prsrc); + + switch (data->rsrc_type) { + case IORING_RSRC_FILE: + io_rsrc_file_put(data->ctx, prsrc); + break; + case IORING_RSRC_BUFFER: + io_rsrc_buf_put(data->ctx, prsrc); + break; + default: + WARN_ON_ONCE(1); + break; + } } void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) @@ -297,8 +310,8 @@ static __cold void **io_alloc_page_table(size_t size) return table; } -__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, - rsrc_put_fn *do_put, u64 __user *utags, +__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type, + u64 __user *utags, unsigned nr, struct io_rsrc_data **pdata) { struct io_rsrc_data *data; @@ -316,7 +329,7 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, data->nr = nr; data->ctx = ctx; - data->do_put = do_put; + data->rsrc_type = type; if (utags) { ret = -EFAULT; for (i = 0; i < nr; i++) { @@ -849,7 +862,7 
@@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -EMFILE; if (nr_args > rlimit(RLIMIT_NOFILE)) return -EMFILE; - ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, + ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args, &ctx->file_data); if (ret) return ret; @@ -1184,7 +1197,7 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return -EBUSY; if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) return -EINVAL; - ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); + ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data); if (ret) return ret; ret = io_buffers_map_alloc(ctx, nr_args); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 232079363f6a..5d0733c4c08d 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -33,7 +33,7 @@ struct io_rsrc_data { u64 **tags; unsigned int nr; - rsrc_put_fn *do_put; + u16 rsrc_type; bool quiesce; }; -- cgit From 2236b3905b4d4e9cd4d149ab35767858c02bb79b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 18 Apr 2023 14:06:41 +0100 Subject: io_uring/rsrc: disassociate nodes and rsrc_data Make rsrc nodes independent from rsrd_data, for that we keep ctx and rsrc type in nodes. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4f259abe9cd4eea6a3b4ed83508635218acd3c3f.1681822823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 20 +++++++++----------- io_uring/rsrc.h | 3 ++- 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 62988b3aa927..20dcc7668cb0 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -144,18 +144,17 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo static void io_rsrc_put_work(struct io_rsrc_node *node) { - struct io_rsrc_data *data = node->rsrc_data; struct io_rsrc_put *prsrc = &node->item; if (prsrc->tag) - io_post_aux_cqe(data->ctx, prsrc->tag, 0, 0); + io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0); - switch (data->rsrc_type) { + switch (node->type) { case IORING_RSRC_FILE: - io_rsrc_file_put(data->ctx, prsrc); + io_rsrc_file_put(node->ctx, prsrc); break; case IORING_RSRC_BUFFER: - io_rsrc_buf_put(data->ctx, prsrc); + io_rsrc_buf_put(node->ctx, prsrc); break; default: WARN_ON_ONCE(1); @@ -170,9 +169,9 @@ void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) } void io_rsrc_node_ref_zero(struct io_rsrc_node *node) - __must_hold(&node->rsrc_data->ctx->uring_lock) + __must_hold(&node->ctx->uring_lock) { - struct io_ring_ctx *ctx = node->rsrc_data->ctx; + struct io_ring_ctx *ctx = node->ctx; while (!list_empty(&ctx->rsrc_ref_list)) { node = list_first_entry(&ctx->rsrc_ref_list, @@ -204,7 +203,7 @@ struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) return NULL; } - ref_node->rsrc_data = NULL; + ref_node->ctx = ctx; ref_node->empty = 0; ref_node->refs = 1; return ref_node; @@ -225,7 +224,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, if (!backup) return -ENOMEM; ctx->rsrc_node->empty = true; - ctx->rsrc_node->rsrc_data = data; + ctx->rsrc_node->type = -1; list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list); io_put_rsrc_node(ctx, ctx->rsrc_node); ctx->rsrc_node = backup; @@ -655,10 +654,9 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc) } node->item.rsrc = rsrc; + node->type = data->rsrc_type; node->item.tag = *tag_slot; *tag_slot = 0; - - node->rsrc_data = data; list_add_tail(&node->node, 
&ctx->rsrc_ref_list); io_put_rsrc_node(ctx, node); return 0; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 5d0733c4c08d..0a8a95e9b99e 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -40,10 +40,11 @@ struct io_rsrc_data { struct io_rsrc_node { union { struct io_cache_entry cache; - struct io_rsrc_data *rsrc_data; + struct io_ring_ctx *ctx; }; int refs; bool empty; + u16 type; struct list_head node; struct io_rsrc_put item; }; -- cgit From ea97f6c8558e83cb457c3b5f53351e4fd8519ab1 Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 18 Apr 2023 15:58:18 -0700 Subject: io_uring: add support for multishot timeouts A multishot timeout submission will repeatedly generate completions with the IORING_CQE_F_MORE cflag set. Depending on the value of the `off' field in the submission, these timeouts can either repeat indefinitely until cancelled (`off' = 0) or for a fixed number of times (`off' > 0). Only noseq timeouts (i.e. not dependent on the number of I/O completions) are supported. An indefinite timer will be cancelled if the CQ ever overflows. Signed-off-by: David Wei Link: https://lore.kernel.org/r/20230418225817.1905027-1-davidhwei@meta.com Signed-off-by: Jens Axboe --- io_uring/timeout.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 5c6c6f720809..fc950177e2e1 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -17,6 +17,7 @@ struct io_timeout { struct file *file; u32 off; u32 target_seq; + u32 repeats; struct list_head list; /* head of the link, used by linked timeouts only */ struct io_kiocb *head; @@ -37,8 +38,9 @@ struct io_timeout_rem { static inline bool io_is_timeout_noseq(struct io_kiocb *req) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); + struct io_timeout_data *data = req->async_data; - return !timeout->off; + return !timeout->off || data->flags & IORING_TIMEOUT_MULTISHOT; } static inline void io_put_req(struct io_kiocb *req) @@ -49,6 +51,44 @@ static inline void io_put_req(struct io_kiocb *req) } } +static inline bool io_timeout_finish(struct io_timeout *timeout, + struct io_timeout_data *data) +{ + if (!(data->flags & IORING_TIMEOUT_MULTISHOT)) + return true; + + if (!timeout->off || (timeout->repeats && --timeout->repeats)) + return false; + + return true; +} + +static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer); + +static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) +{ + struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); + struct io_timeout_data *data = req->async_data; + struct io_ring_ctx *ctx = req->ctx; + + if (!io_timeout_finish(timeout, data)) { + bool filled; + filled = io_aux_cqe(ctx, ts->locked, req->cqe.user_data, -ETIME, + IORING_CQE_F_MORE, false); + if (filled) { + /* re-arm timer */ + spin_lock_irq(&ctx->timeout_lock); + list_add(&timeout->list, ctx->timeout_list.prev); + data->timer.function = io_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); + spin_unlock_irq(&ctx->timeout_lock); + return; + } + } + + io_req_task_complete(req, ts); +} + static bool io_kill_timeout(struct io_kiocb *req, int status) __must_hold(&req->ctx->timeout_lock) { @@ -212,7 +252,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) req_set_fail(req); io_req_set_res(req, -ETIME, 0); - req->io_task_work.func = io_req_task_complete; + req->io_task_work.func = io_timeout_complete; 
io_req_task_work_add(req); return HRTIMER_NORESTART; } @@ -470,16 +510,27 @@ static int __io_timeout_prep(struct io_kiocb *req, return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | - IORING_TIMEOUT_ETIME_SUCCESS)) + IORING_TIMEOUT_ETIME_SUCCESS | + IORING_TIMEOUT_MULTISHOT)) return -EINVAL; /* more than one clock specified is invalid, obviously */ if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) return -EINVAL; + /* multishot requests only make sense with rel values */ + if (!(~flags & (IORING_TIMEOUT_MULTISHOT | IORING_TIMEOUT_ABS))) + return -EINVAL; INIT_LIST_HEAD(&timeout->list); timeout->off = off; if (unlikely(off && !req->ctx->off_timeout_used)) req->ctx->off_timeout_used = true; + /* + * for multishot reqs w/ fixed nr of repeats, repeats tracks the + * remaining nr + */ + timeout->repeats = 0; + if ((flags & IORING_TIMEOUT_MULTISHOT) && off > 0) + timeout->repeats = off; if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; -- cgit From 3c85cc43c8e7855d202da184baf00c7b8eeacf71 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2023 14:16:03 -0600 Subject: Revert "io_uring/rsrc: disallow multi-source reg buffers" This reverts commit edd478269640b360c6f301f2baa04abdda563ef3. There's really no specific need to disallow multiple sources of buffers, and io_uring really should not be mandating this by itself. We should be able to solely rely on GUP making these decisions. As this also stands in the way of a cleanup where io_uring is the odd one out, kill it. Link: https://lore.kernel.org/all/61ded378-51a8-1dcb-b631-fda1903248a9@gmail.com/ Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 20dcc7668cb0..ddee7adb4006 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1053,17 +1053,14 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages, vmas); if (pret == nr_pages) { - struct file *file = vmas[0]->vm_file; - /* don't support file backed memory */ for (i = 0; i < nr_pages; i++) { - if (vmas[i]->vm_file != file) { - ret = -EINVAL; - break; - } - if (!file) + struct vm_area_struct *vma = vmas[i]; + + if (vma_is_shmem(vma)) continue; - if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) { + if (vma->vm_file && + !is_file_hugepages(vma->vm_file)) { ret = -EOPNOTSUPP; break; } -- cgit