From d4755e15386c38e4ae532ace5acc29fbfaee42e7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 7 Mar 2023 09:47:20 -0700 Subject: io_uring: avoid hashing O_DIRECT writes if the filesystem doesn't need it io_uring hashes writes to a given file/inode so that it can serialize them. This is useful if the file system needs exclusive access to the file to perform the write, as otherwise we end up with a ton of io-wq threads trying to lock the inode at the same time. This can cause excessive system time. But if the file system has flagged that it supports parallel O_DIRECT writes, then there's no need to serialize the writes. Check for that through FMODE_DIO_PARALLEL_WRITE and don't hash it if we don't need to. In a basic test of 8 threads writing to a file on XFS on a gen2 Optane, with each thread writing in 4k chunks, it improves performance from ~1350K IOPS (or ~5290MiB/sec) to ~1410K IOPS (or ~5500MiB/sec). Reviewed-by: Darrick J. Wong Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 722624b6d0dc..1ed96caa586a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -425,7 +425,13 @@ static void io_prep_async_work(struct io_kiocb *req) req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT; if (req->flags & REQ_F_ISREG) { - if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) + bool should_hash = def->hash_reg_file; + + /* don't serialize this request if the fs doesn't need it */ + if (should_hash && (req->file->f_flags & O_DIRECT) && + (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE)) + should_hash = false; + if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) -- cgit From d808459b2e31bd5123a14258a7a529995db974c8 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 16 Feb 2023 09:09:38 +0100 Subject: io_uring: Adjust mapping wrt architecture aliasing requirements Some architectures have memory cache aliasing requirements (e.g. parisc) if memory is shared between userspace and kernel. This patch fixes the kernel to return an aliased address when asked by userspace via mmap(). Signed-off-by: Helge Deller Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1ed96caa586a..b49b7ee12d60 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -72,6 +72,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -3323,6 +3324,54 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); } +static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); + struct vm_unmapped_area_info info; + void *ptr; + + /* + * Do not allow to map to user-provided address to avoid breaking the + * aliasing rules. Userspace is not able to guess the offset address of + * kernel kmalloc()ed memory area. 
+ */ + if (addr) + return -EINVAL; + + ptr = io_uring_validate_mmap_request(filp, pgoff, len); + if (IS_ERR(ptr)) + return -ENOMEM; + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); +#ifdef SHM_COLOUR + info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL); +#else + info.align_mask = PAGE_MASK & (SHMLBA - 1UL); +#endif + info.align_offset = (unsigned long) ptr; + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + addr = vm_unmapped_area(&info); + if (offset_in_page(addr)) { + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = mmap_end; + addr = vm_unmapped_area(&info); + } + + return addr; +} + #else /* !CONFIG_MMU */ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) @@ -3535,6 +3584,8 @@ static const struct file_operations io_uring_fops = { #ifndef CONFIG_MMU .get_unmapped_area = io_uring_nommu_get_unmapped_area, .mmap_capabilities = io_uring_nommu_mmap_capabilities, +#else + .get_unmapped_area = io_uring_mmu_get_unmapped_area, #endif .poll = io_uring_poll, #ifdef CONFIG_PROC_FS -- cgit From ba56b63242d12df088ed9a701cad320e6b306dfe Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Mar 2023 10:55:50 -0600 Subject: io_uring/kbuf: move pinning of provided buffer ring into helper In preparation for allowing the kernel to allocate the provided buffer rings and have the application mmap it instead, abstract out the current method of pinning and mapping the user allocated ring. No functional changes intended in this patch. 
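For reference, the user allocated setup that io_pin_pbuf_ring() now services looks roughly like this from the application side (a minimal sketch using the raw register syscall; error handling trimmed, and the helper name and ring_fd are illustrative, with ring_fd assumed to be an already initialized io_uring instance):

#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static struct io_uring_buf_ring *setup_user_pbuf_ring(int ring_fd,
						       unsigned int entries,
						       unsigned short bgid)
{
	struct io_uring_buf_reg reg;
	void *ring;

	/* ring must be page aligned, as the kernel pins and maps it */
	if (posix_memalign(&ring, sysconf(_SC_PAGESIZE),
			   entries * sizeof(struct io_uring_buf)))
		return NULL;
	/* tail starts at 0 */
	memset(ring, 0, entries * sizeof(struct io_uring_buf));

	memset(&reg, 0, sizeof(reg));
	reg.ring_addr = (unsigned long) ring;
	reg.ring_entries = entries;	/* must be a power of 2 */
	reg.bgid = bgid;

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_PBUF_RING, &reg, 1) < 0) {
		free(ring);
		return NULL;
	}
	return ring;
}
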
Acked-by: Helge Deller Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) (limited to 'io_uring') diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 3002dc827195..3adc08f90e41 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -463,14 +463,32 @@ err: return IOU_OK; } -int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, + struct io_buffer_list *bl) { struct io_uring_buf_ring *br; - struct io_uring_buf_reg reg; - struct io_buffer_list *bl, *free_bl = NULL; struct page **pages; int nr_pages; + pages = io_pin_pages(reg->ring_addr, + flex_array_size(br, bufs, reg->ring_entries), + &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + br = page_address(pages[0]); + bl->buf_pages = pages; + bl->buf_nr_pages = nr_pages; + bl->buf_ring = br; + return 0; +} + +int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_reg reg; + struct io_buffer_list *bl, *free_bl = NULL; + int ret; + if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; @@ -504,20 +522,15 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return -ENOMEM; } - pages = io_pin_pages(reg.ring_addr, - flex_array_size(br, bufs, reg.ring_entries), - &nr_pages); - if (IS_ERR(pages)) { + ret = io_pin_pbuf_ring(®, bl); + if (ret) { kfree(free_bl); - return PTR_ERR(pages); + return ret; } - br = page_address(pages[0]); - bl->buf_pages = pages; - bl->buf_nr_pages = nr_pages; bl->nr_entries = reg.ring_entries; - bl->buf_ring = br; bl->mask = reg.ring_entries - 1; + io_buffer_add_list(ctx, bl, reg.bgid); return 0; } -- cgit From 25a2c188a0a00b3d9f2057798aa86fe6b04377bf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Mar 2023 10:59:46 -0600 Subject: io_uring/kbuf: add buffer_list->is_mapped member Rather than rely on checking buffer_list->buf_pages or ->buf_nr_pages, add a separate member that tracks if this is a ring mapped provided buffer list or not. 
Acked-by: Helge Deller Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 14 ++++++++------ io_uring/kbuf.h | 3 +++ 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'io_uring') diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 3adc08f90e41..db5f189267b7 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -179,7 +179,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, bl = io_buffer_get_list(ctx, req->buf_index); if (likely(bl)) { - if (bl->buf_nr_pages) + if (bl->is_mapped) ret = io_ring_buffer_select(req, len, bl, issue_flags); else ret = io_provided_buffer_select(req, len, bl); @@ -214,7 +214,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (!nbufs) return 0; - if (bl->buf_nr_pages) { + if (bl->is_mapped && bl->buf_nr_pages) { int j; i = bl->buf_ring->tail - bl->head; @@ -225,6 +225,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, bl->buf_nr_pages = 0; /* make sure it's seen as empty */ INIT_LIST_HEAD(&bl->buf_list); + bl->is_mapped = 0; return i; } @@ -303,7 +304,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) if (bl) { ret = -EINVAL; /* can't use provide/remove buffers command on mapped buffers */ - if (!bl->buf_nr_pages) + if (!bl->is_mapped) ret = __io_remove_buffers(ctx, bl, p->nbufs); } io_ring_submit_unlock(ctx, issue_flags); @@ -448,7 +449,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) } } /* can't add buffers via this command for a mapped buffer ring */ - if (bl->buf_nr_pages) { + if (bl->is_mapped) { ret = -EINVAL; goto err; } @@ -480,6 +481,7 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, bl->buf_pages = pages; bl->buf_nr_pages = nr_pages; bl->buf_ring = br; + bl->is_mapped = 1; return 0; } @@ -514,7 +516,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, reg.bgid); if (bl) { /* if mapped buffer ring OR classic exists, don't allow */ - if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) + if (bl->is_mapped || !list_empty(&bl->buf_list)) return -EEXIST; } else { free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); @@ -548,7 +550,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, reg.bgid); if (!bl) return -ENOENT; - if (!bl->buf_nr_pages) + if (!bl->is_mapped) return -EINVAL; __io_remove_buffers(ctx, bl, -1U); diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index c23e15d7d3ca..61b9c7dade9d 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -23,6 +23,9 @@ struct io_buffer_list { __u16 nr_entries; __u16 head; __u16 mask; + + /* ring mapped provided buffers */ + __u8 is_mapped; }; struct io_buffer { -- cgit From 81cf17cd3ab3e5441e876a8e9e9c38ae9920cecb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Mar 2023 11:01:45 -0600 Subject: io_uring/kbuf: rename struct io_uring_buf_reg 'pad' to'flags' In preparation for allowing flags to be set for registration, rename the padding and use it for that. 
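The matching layout change to struct io_uring_buf_reg lives in include/uapi/linux/io_uring.h and hence is not part of the io_uring/ diff below; after the rename the registration descriptor is roughly:

struct io_uring_buf_reg {
	__u64	ring_addr;
	__u32	ring_entries;
	__u16	bgid;
	__u16	flags;		/* was 'pad', must still be zero for now */
	__u64	resv[3];
};
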
Acked-by: Helge Deller Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index db5f189267b7..4b2f4a0ee962 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -494,7 +494,9 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + if (reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + if (reg.flags) return -EINVAL; if (!reg.ring_addr) return -EFAULT; @@ -544,7 +546,9 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + if (reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + if (reg.flags) return -EINVAL; bl = io_buffer_get_list(ctx, reg.bgid); -- cgit From c56e022c0a27142b7b59ae6bdf45f86bf4b298a1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Mar 2023 11:07:19 -0600 Subject: io_uring: add support for user mapped provided buffer ring The ring mapped provided buffer rings rely on the application allocating the memory for the ring, and then the kernel will map it. This generally works fine, but runs into issues on some architectures where we need to be able to ensure that the kernel and application virtual address for the ring play nicely together. This at least impacts architectures that set SHM_COLOUR, but potentially also anyone setting SHMLBA. To use this variant of ring provided buffers, the application need not allocate any memory for the ring. Instead the kernel will do so, and the allocation must subsequently call mmap(2) on the ring with the offset set to: IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) to get a virtual address for the buffer ring. Normally the application would allocate a suitable piece of memory (and correctly aligned) and simply pass that in via io_uring_buf_reg.ring_addr and the kernel would map it. Outside of the setup differences, the kernel allocate + user mapped provided buffer ring works exactly the same. 
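A rough userspace sketch of the new flow (raw syscalls, error handling trimmed; the helper name and ring_fd are illustrative, with ring_fd assumed to be an already initialized io_uring instance):

#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static struct io_uring_buf_ring *setup_mmap_pbuf_ring(int ring_fd,
						       unsigned int entries,
						       unsigned short bgid)
{
	struct io_uring_buf_reg reg;
	void *br;

	/* the kernel allocates the ring memory, so ring_addr stays 0 */
	memset(&reg, 0, sizeof(reg));
	reg.ring_entries = entries;	/* must be a power of 2 */
	reg.bgid = bgid;
	reg.flags = IOU_PBUF_RING_MMAP;

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_PBUF_RING, &reg, 1) < 0)
		return NULL;

	/* map the kernel allocated ring through the io_uring fd */
	br = mmap(NULL, entries * sizeof(struct io_uring_buf),
		  PROT_READ | PROT_WRITE, MAP_SHARED, ring_fd,
		  IORING_OFF_PBUF_RING |
		  ((__u64) bgid << IORING_OFF_PBUF_SHIFT));
	if (br == MAP_FAILED)
		return NULL;

	return br;
}

Once mapped, buffers are added and consumed exactly as with an application allocated ring.
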
Acked-by: Helge Deller Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 13 ++++++- io_uring/kbuf.c | 99 ++++++++++++++++++++++++++++++++++++++++------------- io_uring/kbuf.h | 4 +++ 3 files changed, 92 insertions(+), 24 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b49b7ee12d60..d72aa92ce2d6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3289,7 +3289,7 @@ static void *io_uring_validate_mmap_request(struct file *file, struct page *page; void *ptr; - switch (offset) { + switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: ptr = ctx->rings; @@ -3297,6 +3297,17 @@ static void *io_uring_validate_mmap_request(struct file *file, case IORING_OFF_SQES: ptr = ctx->sq_sqes; break; + case IORING_OFF_PBUF_RING: { + unsigned int bgid; + + bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + mutex_lock(&ctx->uring_lock); + ptr = io_pbuf_get_address(ctx, bgid); + mutex_unlock(&ctx->uring_lock); + if (!ptr) + return ERR_PTR(-EINVAL); + break; + } default: return ERR_PTR(-EINVAL); } diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 4b2f4a0ee962..cd1d9dddf58e 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -137,7 +137,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, return NULL; head &= bl->mask; - if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { + /* mmaped buffers are always contig */ + if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) { buf = &br->bufs[head]; } else { int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); @@ -214,15 +215,27 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (!nbufs) return 0; - if (bl->is_mapped && bl->buf_nr_pages) { - int j; - + if (bl->is_mapped) { i = bl->buf_ring->tail - bl->head; - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - kvfree(bl->buf_pages); - bl->buf_pages = NULL; - bl->buf_nr_pages = 0; + if (bl->is_mmap) { + if (bl->buf_ring) { + struct page *page; + + page = virt_to_head_page(bl->buf_ring); + if (put_page_testzero(page)) + free_compound_page(page); + bl->buf_ring = NULL; + } + bl->is_mmap = 0; + } else if (bl->buf_nr_pages) { + int j; + + for (j = 0; j < bl->buf_nr_pages; j++) + unpin_user_page(bl->buf_pages[j]); + kvfree(bl->buf_pages); + bl->buf_pages = NULL; + bl->buf_nr_pages = 0; + } /* make sure it's seen as empty */ INIT_LIST_HEAD(&bl->buf_list); bl->is_mapped = 0; @@ -482,6 +495,25 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, bl->buf_nr_pages = nr_pages; bl->buf_ring = br; bl->is_mapped = 1; + bl->is_mmap = 0; + return 0; +} + +static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg, + struct io_buffer_list *bl) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; + size_t ring_size; + void *ptr; + + ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); + ptr = (void *) __get_free_pages(gfp, get_order(ring_size)); + if (!ptr) + return -ENOMEM; + + bl->buf_ring = ptr; + bl->is_mapped = 1; + bl->is_mmap = 1; return 0; } @@ -496,12 +528,18 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (reg.resv[0] || reg.resv[1] || reg.resv[2]) return -EINVAL; - if (reg.flags) - return -EINVAL; - if (!reg.ring_addr) - return -EFAULT; - if (reg.ring_addr & ~PAGE_MASK) + if (reg.flags & ~IOU_PBUF_RING_MMAP) return -EINVAL; + if (!(reg.flags & IOU_PBUF_RING_MMAP)) { + if (!reg.ring_addr) + return -EFAULT; + if (reg.ring_addr & ~PAGE_MASK) + return -EINVAL; + } else { + if 
(reg.ring_addr) + return -EINVAL; + } + if (!is_power_of_2(reg.ring_entries)) return -EINVAL; @@ -526,17 +564,21 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return -ENOMEM; } - ret = io_pin_pbuf_ring(®, bl); - if (ret) { - kfree(free_bl); - return ret; - } + if (!(reg.flags & IOU_PBUF_RING_MMAP)) + ret = io_pin_pbuf_ring(®, bl); + else + ret = io_alloc_pbuf_ring(®, bl); - bl->nr_entries = reg.ring_entries; - bl->mask = reg.ring_entries - 1; + if (!ret) { + bl->nr_entries = reg.ring_entries; + bl->mask = reg.ring_entries - 1; - io_buffer_add_list(ctx, bl, reg.bgid); - return 0; + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; + } + + kfree(free_bl); + return ret; } int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) @@ -564,3 +606,14 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) } return 0; } + +void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) +{ + struct io_buffer_list *bl; + + bl = io_buffer_get_list(ctx, bgid); + if (!bl || !bl->is_mmap) + return NULL; + + return bl->buf_ring; +} diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 61b9c7dade9d..d14345ef61fc 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -26,6 +26,8 @@ struct io_buffer_list { /* ring mapped provided buffers */ __u8 is_mapped; + /* ring mapped provided buffers, but mmap'ed by application */ + __u8 is_mmap; }; struct io_buffer { @@ -53,6 +55,8 @@ unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); +void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid); + static inline void io_kbuf_recycle_ring(struct io_kiocb *req) { /* -- cgit From da64d6db3bd304d44d7ac1eb7f319a1cc7efd611 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 10 Mar 2023 12:11:07 -0800 Subject: io_uring: One wqe per wq Right now io_wq allocates one io_wqe per NUMA node. As io_wq is now bound to a task, the task basically uses only the NUMA local io_wqe, and almost never changes NUMA nodes, thus, the other wqes are mostly unused. Allocate just one io_wqe embedded into io_wq, and uses all possible cpus (cpu_possible_mask) in the io_wqe->cpumask. 
Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20230310201107.4020580-1-leitao@debian.org Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 180 ++++++++++++++++++++++--------------------------------- 1 file changed, 70 insertions(+), 110 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index f81c0a7136a5..44d522c5d36f 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "io-wq.h" @@ -96,8 +97,6 @@ struct io_wqe { raw_spinlock_t lock; struct io_wqe_acct acct[IO_WQ_ACCT_NR]; - int node; - struct hlist_nulls_head free_list; struct list_head all_list; @@ -127,7 +126,7 @@ struct io_wq { struct task_struct *task; - struct io_wqe *wqes[]; + struct io_wqe wqe; }; static enum cpuhp_state io_wq_online; @@ -754,7 +753,7 @@ static void create_worker_cont(struct callback_head *cb) worker = container_of(cb, struct io_worker, create_work); clear_bit_unlock(0, &worker->create_state); wqe = worker->wqe; - tsk = create_io_thread(io_wqe_worker, worker, wqe->node); + tsk = create_io_thread(io_wqe_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { io_init_new_worker(wqe, worker, tsk); io_worker_release(worker); @@ -804,7 +803,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) __set_current_state(TASK_RUNNING); - worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); + worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) { fail: atomic_dec(&acct->nr_running); @@ -823,7 +822,7 @@ fail: if (index == IO_WQ_ACCT_BOUND) worker->flags |= IO_WORKER_F_BOUND; - tsk = create_io_thread(io_wqe_worker, worker, wqe->node); + tsk = create_io_thread(io_wqe_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { io_init_new_worker(wqe, worker, tsk); } else if (!io_should_retry_thread(PTR_ERR(tsk))) { @@ -961,7 +960,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { - struct io_wqe *wqe = wq->wqes[numa_node_id()]; + struct io_wqe *wqe = &wq->wqe; io_wqe_enqueue(wqe, work); } @@ -1083,7 +1082,7 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, .data = data, .cancel_all = cancel_all, }; - int node; + struct io_wqe *wqe = &wq->wqe; /* * First check pending list, if we're lucky we can just remove it @@ -1098,19 +1097,15 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, * Do both of these while holding the wqe->lock, to ensure that * we'll find a work item regardless of state. 
*/ - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - io_wqe_cancel_pending_work(wqe, &match); - if (match.nr_pending && !match.cancel_all) - return IO_WQ_CANCEL_OK; + io_wqe_cancel_pending_work(wqe, &match); + if (match.nr_pending && !match.cancel_all) + return IO_WQ_CANCEL_OK; - raw_spin_lock(&wqe->lock); - io_wqe_cancel_running_work(wqe, &match); - raw_spin_unlock(&wqe->lock); - if (match.nr_running && !match.cancel_all) - return IO_WQ_CANCEL_RUNNING; - } + raw_spin_lock(&wqe->lock); + io_wqe_cancel_running_work(wqe, &match); + raw_spin_unlock(&wqe->lock); + if (match.nr_running && !match.cancel_all) + return IO_WQ_CANCEL_RUNNING; if (match.nr_running) return IO_WQ_CANCEL_RUNNING; @@ -1140,15 +1135,16 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { - int ret, node, i; + int ret, i; struct io_wq *wq; + struct io_wqe *wqe; if (WARN_ON_ONCE(!data->free_work || !data->do_work)) return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(!bounded)) return ERR_PTR(-EINVAL); - wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL); + wq = kzalloc(sizeof(struct io_wq), GFP_KERNEL); if (!wq) return ERR_PTR(-ENOMEM); ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node); @@ -1159,40 +1155,30 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) wq->hash = data->hash; wq->free_work = data->free_work; wq->do_work = data->do_work; + wqe = &wq->wqe; ret = -ENOMEM; - for_each_node(node) { - struct io_wqe *wqe; - int alloc_node = node; - - if (!node_online(alloc_node)) - alloc_node = NUMA_NO_NODE; - wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node); - if (!wqe) - goto err; - wq->wqes[node] = wqe; - if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL)) - goto err; - cpumask_copy(wqe->cpu_mask, cpumask_of_node(node)); - wqe->node = alloc_node; - wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; - wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = - task_rlimit(current, RLIMIT_NPROC); - INIT_LIST_HEAD(&wqe->wait.entry); - wqe->wait.func = io_wqe_hash_wake; - for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = &wqe->acct[i]; - - acct->index = i; - atomic_set(&acct->nr_running, 0); - INIT_WQ_LIST(&acct->work_list); - raw_spin_lock_init(&acct->lock); - } - wqe->wq = wq; - raw_spin_lock_init(&wqe->lock); - INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); - INIT_LIST_HEAD(&wqe->all_list); + + if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL)) + goto err; + cpumask_copy(wqe->cpu_mask, cpu_possible_mask); + wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; + wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = + task_rlimit(current, RLIMIT_NPROC); + INIT_LIST_HEAD(&wqe->wait.entry); + wqe->wait.func = io_wqe_hash_wake; + for (i = 0; i < IO_WQ_ACCT_NR; i++) { + struct io_wqe_acct *acct = &wqe->acct[i]; + + acct->index = i; + atomic_set(&acct->nr_running, 0); + INIT_WQ_LIST(&acct->work_list); + raw_spin_lock_init(&acct->lock); } + wqe->wq = wq; + raw_spin_lock_init(&wqe->lock); + INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); + INIT_LIST_HEAD(&wqe->all_list); wq->task = get_task_struct(data->task); atomic_set(&wq->worker_refs, 1); @@ -1201,12 +1187,8 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) err: io_wq_put_hash(data->hash); cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - for_each_node(node) { - if (!wq->wqes[node]) - continue; - free_cpumask_var(wq->wqes[node]->cpu_mask); - kfree(wq->wqes[node]); - } + + 
free_cpumask_var(wq->wqe.cpu_mask); err_wq: kfree(wq); return ERR_PTR(ret); @@ -1247,48 +1229,36 @@ static void io_wq_cancel_tw_create(struct io_wq *wq) static void io_wq_exit_workers(struct io_wq *wq) { - int node; - if (!wq->task) return; io_wq_cancel_tw_create(wq); rcu_read_lock(); - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL); - } + io_wq_for_each_worker(&wq->wqe, io_wq_worker_wake, NULL); rcu_read_unlock(); io_worker_ref_put(wq); wait_for_completion(&wq->worker_done); - for_each_node(node) { - spin_lock_irq(&wq->hash->wait.lock); - list_del_init(&wq->wqes[node]->wait.entry); - spin_unlock_irq(&wq->hash->wait.lock); - } + spin_lock_irq(&wq->hash->wait.lock); + list_del_init(&wq->wqe.wait.entry); + spin_unlock_irq(&wq->hash->wait.lock); + put_task_struct(wq->task); wq->task = NULL; } static void io_wq_destroy(struct io_wq *wq) { - int node; + struct io_cb_cancel_data match = { + .fn = io_wq_work_match_all, + .cancel_all = true, + }; + struct io_wqe *wqe = &wq->wqe; cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - struct io_cb_cancel_data match = { - .fn = io_wq_work_match_all, - .cancel_all = true, - }; - io_wqe_cancel_pending_work(wqe, &match); - free_cpumask_var(wqe->cpu_mask); - kfree(wqe); - } + io_wqe_cancel_pending_work(wqe, &match); + free_cpumask_var(wqe->cpu_mask); io_wq_put_hash(wq->hash); kfree(wq); } @@ -1323,11 +1293,9 @@ static int __io_wq_cpu_online(struct io_wq *wq, unsigned int cpu, bool online) .cpu = cpu, .online = online }; - int i; rcu_read_lock(); - for_each_node(i) - io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, &od); + io_wq_for_each_worker(&wq->wqe, io_wq_worker_affinity, &od); rcu_read_unlock(); return 0; } @@ -1348,18 +1316,15 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node) int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) { - int i; + struct io_wqe *wqe = &wq->wqe; rcu_read_lock(); - for_each_node(i) { - struct io_wqe *wqe = wq->wqes[i]; - - if (mask) - cpumask_copy(wqe->cpu_mask, mask); - else - cpumask_copy(wqe->cpu_mask, cpumask_of_node(i)); - } + if (mask) + cpumask_copy(wqe->cpu_mask, mask); + else + cpumask_copy(wqe->cpu_mask, cpu_possible_mask); rcu_read_unlock(); + return 0; } @@ -1369,9 +1334,10 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) */ int io_wq_max_workers(struct io_wq *wq, int *new_count) { + struct io_wqe *wqe = &wq->wqe; + struct io_wqe_acct *acct; int prev[IO_WQ_ACCT_NR]; - bool first_node = true; - int i, node; + int i; BUILD_BUG_ON((int) IO_WQ_ACCT_BOUND != (int) IO_WQ_BOUND); BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND); @@ -1386,21 +1352,15 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) prev[i] = 0; rcu_read_lock(); - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - struct io_wqe_acct *acct; - raw_spin_lock(&wqe->lock); - for (i = 0; i < IO_WQ_ACCT_NR; i++) { - acct = &wqe->acct[i]; - if (first_node) - prev[i] = max_t(int, acct->max_workers, prev[i]); - if (new_count[i]) - acct->max_workers = new_count[i]; - } - raw_spin_unlock(&wqe->lock); - first_node = false; + raw_spin_lock(&wqe->lock); + for (i = 0; i < IO_WQ_ACCT_NR; i++) { + acct = &wqe->acct[i]; + prev[i] = max_t(int, acct->max_workers, prev[i]); + if (new_count[i]) + acct->max_workers = new_count[i]; } + raw_spin_unlock(&wqe->lock); rcu_read_unlock(); for (i = 0; i < IO_WQ_ACCT_NR; i++) -- cgit From 
efba1a9e653e107577a48157b5424878c46f2285 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 23 Feb 2023 08:43:52 -0800 Subject: io_uring: Move from hlist to io_wq_work_node Having cache entries linked using the hlist format brings no benefit, and also requires an unnecessary extra pointer address per cache entry. Use the internal io_wq_work_node single-linked list for the internal alloc caches (async_msghdr and async_poll) This is required to be able to use KASAN on cache entries, since we do not need to touch unused (and poisoned) cache entries when adding more entries to the list. Suggested-by: Pavel Begunkov Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20230223164353.2839177-2-leitao@debian.org Signed-off-by: Jens Axboe --- io_uring/alloc_cache.h | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'io_uring') diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index c2cde88aeed5..aaa838c31d92 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -7,7 +7,7 @@ #define IO_ALLOC_CACHE_MAX 512 struct io_cache_entry { - struct hlist_node node; + struct io_wq_work_node node; }; static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, @@ -15,7 +15,7 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, { if (cache->nr_cached < IO_ALLOC_CACHE_MAX) { cache->nr_cached++; - hlist_add_head(&entry->node, &cache->list); + wq_stack_add_head(&entry->node, &cache->list); return true; } return false; @@ -23,12 +23,13 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) { - if (!hlist_empty(&cache->list)) { - struct hlist_node *node = cache->list.first; + if (cache->list.next) { + struct io_cache_entry *entry; - hlist_del(node); + entry = container_of(cache->list.next, struct io_cache_entry, node); + cache->list.next = cache->list.next->next; cache->nr_cached--; - return container_of(node, struct io_cache_entry, node); + return entry; } return NULL; @@ -36,18 +37,19 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c static inline void io_alloc_cache_init(struct io_alloc_cache *cache) { - INIT_HLIST_HEAD(&cache->list); + cache->list.next = NULL; cache->nr_cached = 0; } static inline void io_alloc_cache_free(struct io_alloc_cache *cache, void (*free)(struct io_cache_entry *)) { - while (!hlist_empty(&cache->list)) { - struct hlist_node *node = cache->list.first; + while (1) { + struct io_cache_entry *entry = io_alloc_cache_get(cache); - hlist_del(node); - free(container_of(node, struct io_cache_entry, node)); + if (!entry) + break; + free(entry); } cache->nr_cached = 0; } -- cgit From e1fe7ee885dc0712e982ee465d9f8b96254c30c1 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 23 Feb 2023 08:43:53 -0800 Subject: io_uring: Add KASAN support for alloc_caches Add support for KASAN in the alloc_caches (apoll and netmsg_cache). Thus, if something touches the unused caches, it will raise a KASAN warning/exception. It poisons the object when the object is put to the cache, and unpoisons it when the object is gotten or freed. 
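The corresponding struct io_alloc_cache update is in include/linux/io_uring_types.h and therefore not shown in the io_uring/ diff below; after this patch the cache is roughly:

struct io_alloc_cache {
	struct io_wq_work_node	list;		/* single linked LIFO stack */
	unsigned int		nr_cached;
	/* recorded at init so the whole object can be unpoisoned on get */
	size_t			elem_size;
};
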
Signed-off-by: Breno Leitao Reviewed-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20230223164353.2839177-2-leitao@debian.org Signed-off-by: Jens Axboe --- io_uring/alloc_cache.h | 6 +++++- io_uring/io_uring.c | 4 ++-- io_uring/net.h | 5 ++++- 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index aaa838c31d92..2fbecaa3a1ba 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -16,6 +16,8 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, if (cache->nr_cached < IO_ALLOC_CACHE_MAX) { cache->nr_cached++; wq_stack_add_head(&entry->node, &cache->list); + /* KASAN poisons object */ + kasan_slab_free_mempool(entry); return true; } return false; @@ -27,6 +29,7 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c struct io_cache_entry *entry; entry = container_of(cache->list.next, struct io_cache_entry, node); + kasan_unpoison_range(entry, cache->elem_size); cache->list.next = cache->list.next->next; cache->nr_cached--; return entry; @@ -35,10 +38,11 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c return NULL; } -static inline void io_alloc_cache_init(struct io_alloc_cache *cache) +static inline void io_alloc_cache_init(struct io_alloc_cache *cache, size_t size) { cache->list.next = NULL; cache->nr_cached = 0; + cache->elem_size = size; } static inline void io_alloc_cache_free(struct io_alloc_cache *cache, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d72aa92ce2d6..24be4992821b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -310,8 +310,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); - io_alloc_cache_init(&ctx->apoll_cache); - io_alloc_cache_init(&ctx->netmsg_cache); + io_alloc_cache_init(&ctx->apoll_cache, sizeof(struct async_poll)); + io_alloc_cache_init(&ctx->netmsg_cache, sizeof(struct io_async_msghdr)); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); diff --git a/io_uring/net.h b/io_uring/net.h index 5ffa11bf5d2e..191009979bcb 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -5,8 +5,8 @@ #include "alloc_cache.h" -#if defined(CONFIG_NET) struct io_async_msghdr { +#if defined(CONFIG_NET) union { struct iovec fast_iov[UIO_FASTIOV]; struct { @@ -22,8 +22,11 @@ struct io_async_msghdr { struct sockaddr __user *uaddr; struct msghdr msg; struct sockaddr_storage addr; +#endif }; +#if defined(CONFIG_NET) + struct io_async_connect { struct sockaddr_storage address; }; -- cgit From fcb46c0ccc7c07af54f818fd498e461353ea50e7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Mar 2023 10:42:08 -0600 Subject: io_uring/kbuf: disallow mapping a badly aligned provided ring buffer On at least parisc, we have strict requirements on how we virtually map an address that is shared between the application and the kernel. On these platforms, IOU_PBUF_RING_MMAP should be used when setting up a shared ring buffer for provided buffers. If the application is mapping these pages and asking the kernel to pin+map them as well, then we have no control over what virtual address we get in the kernel. For that case, do a sanity check if SHM_COLOUR is defined, and disallow the mapping request. 
The application must fall back to using IOU_PBUF_RING_MMAP for this case, and liburing will do that transparently with the set of helpers that it has. Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'io_uring') diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index cd1d9dddf58e..79c25459e8de 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -491,6 +491,24 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, return PTR_ERR(pages); br = page_address(pages[0]); +#ifdef SHM_COLOUR + /* + * On platforms that have specific aliasing requirements, SHM_COLOUR + * is set and we must guarantee that the kernel and user side align + * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and + * the application mmap's the provided ring buffer. Fail the request + * if we, by chance, don't end up with aligned addresses. The app + * should use IOU_PBUF_RING_MMAP instead, and liburing will handle + * this transparently. + */ + if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) { + int i; + + for (i = 0; i < nr_pages; i++) + unpin_user_page(pages[i]); + return -EINVAL; + } +#endif bl->buf_pages = pages; bl->buf_nr_pages = nr_pages; bl->buf_ring = br; -- cgit From dfd63baf892c016dd0c8c6e99d0973459aabe554 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 21 Mar 2023 22:16:27 -0300 Subject: io-wq: Move wq accounting to io_wq Since we now have a single io_wqe per io_wq instead of per-node, and in preparation to its removal, move the accounting into the parent structure. Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20230322011628.23359-2-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 78 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 38 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 44d522c5d36f..da7c6e00b690 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -74,7 +74,7 @@ struct io_worker { #define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER) -struct io_wqe_acct { +struct io_wq_acct { unsigned nr_workers; unsigned max_workers; int index; @@ -95,7 +95,6 @@ enum { */ struct io_wqe { raw_spinlock_t lock; - struct io_wqe_acct acct[IO_WQ_ACCT_NR]; struct hlist_nulls_head free_list; struct list_head all_list; @@ -126,6 +125,8 @@ struct io_wq { struct task_struct *task; + struct io_wq_acct acct[IO_WQ_ACCT_NR]; + struct io_wqe wqe; }; @@ -142,7 +143,7 @@ struct io_cb_cancel_data { static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index); static void io_wqe_dec_running(struct io_worker *worker); static bool io_acct_cancel_pending_work(struct io_wqe *wqe, - struct io_wqe_acct *acct, + struct io_wq_acct *acct, struct io_cb_cancel_data *match); static void create_worker_cb(struct callback_head *cb); static void io_wq_cancel_tw_create(struct io_wq *wq); @@ -158,20 +159,20 @@ static void io_worker_release(struct io_worker *worker) complete(&worker->ref_done); } -static inline struct io_wqe_acct *io_get_acct(struct io_wqe *wqe, bool bound) +static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound) { - return &wqe->acct[bound ? IO_WQ_ACCT_BOUND : IO_WQ_ACCT_UNBOUND]; + return &wq->acct[bound ? 
IO_WQ_ACCT_BOUND : IO_WQ_ACCT_UNBOUND]; } -static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe, - struct io_wq_work *work) +static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, + struct io_wq_work *work) { - return io_get_acct(wqe, !(work->flags & IO_WQ_WORK_UNBOUND)); + return io_get_acct(wq, !(work->flags & IO_WQ_WORK_UNBOUND)); } -static inline struct io_wqe_acct *io_wqe_get_acct(struct io_worker *worker) +static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) { - return io_get_acct(worker->wqe, worker->flags & IO_WORKER_F_BOUND); + return io_get_acct(worker->wqe->wq, worker->flags & IO_WORKER_F_BOUND); } static void io_worker_ref_put(struct io_wq *wq) @@ -182,7 +183,7 @@ static void io_worker_ref_put(struct io_wq *wq) static void io_worker_cancel_cb(struct io_worker *worker) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq_acct *acct = io_wq_get_acct(worker); struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; @@ -238,7 +239,7 @@ static void io_worker_exit(struct io_worker *worker) do_exit(0); } -static inline bool io_acct_run_queue(struct io_wqe_acct *acct) +static inline bool io_acct_run_queue(struct io_wq_acct *acct) { bool ret = false; @@ -256,7 +257,7 @@ static inline bool io_acct_run_queue(struct io_wqe_acct *acct) * caller must create one. */ static bool io_wqe_activate_free_worker(struct io_wqe *wqe, - struct io_wqe_acct *acct) + struct io_wq_acct *acct) __must_hold(RCU) { struct hlist_nulls_node *n; @@ -270,7 +271,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe, hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) { if (!io_worker_get(worker)) continue; - if (io_wqe_get_acct(worker) != acct) { + if (io_wq_get_acct(worker) != acct) { io_worker_release(worker); continue; } @@ -288,7 +289,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe, * We need a worker. If we find a free one, we're good. If not, and we're * below the max number of workers, create one. 
*/ -static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) +static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wq_acct *acct) { /* * Most likely an attempt to queue unbounded work on an io_wq that @@ -311,7 +312,7 @@ static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) static void io_wqe_inc_running(struct io_worker *worker) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq_acct *acct = io_wq_get_acct(worker); atomic_inc(&acct->nr_running); } @@ -321,13 +322,13 @@ static void create_worker_cb(struct callback_head *cb) struct io_worker *worker; struct io_wq *wq; struct io_wqe *wqe; - struct io_wqe_acct *acct; + struct io_wq_acct *acct; bool do_create = false; worker = container_of(cb, struct io_worker, create_work); wqe = worker->wqe; wq = wqe->wq; - acct = &wqe->acct[worker->create_index]; + acct = &wq->acct[worker->create_index]; raw_spin_lock(&wqe->lock); if (acct->nr_workers < acct->max_workers) { acct->nr_workers++; @@ -345,7 +346,7 @@ static void create_worker_cb(struct callback_head *cb) } static bool io_queue_worker_create(struct io_worker *worker, - struct io_wqe_acct *acct, + struct io_wq_acct *acct, task_work_func_t func) { struct io_wqe *wqe = worker->wqe; @@ -393,7 +394,7 @@ fail: static void io_wqe_dec_running(struct io_worker *worker) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq_acct *acct = io_wq_get_acct(worker); struct io_wqe *wqe = worker->wqe; if (!(worker->flags & IO_WORKER_F_UP)) @@ -462,7 +463,7 @@ static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) return ret; } -static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, +static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct, struct io_worker *worker) __must_hold(acct->lock) { @@ -537,7 +538,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); static void io_worker_handle_work(struct io_worker *worker) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq_acct *acct = io_wq_get_acct(worker); struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); @@ -612,7 +613,7 @@ static void io_worker_handle_work(struct io_worker *worker) static int io_wqe_worker(void *data) { struct io_worker *worker = data; - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq_acct *acct = io_wq_get_acct(worker); struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; bool exit_mask = false, last_timeout = false; @@ -759,7 +760,7 @@ static void create_worker_cont(struct callback_head *cb) io_worker_release(worker); return; } else if (!io_should_retry_thread(PTR_ERR(tsk))) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq_acct *acct = io_wq_get_acct(worker); atomic_dec(&acct->nr_running); raw_spin_lock(&wqe->lock); @@ -789,7 +790,7 @@ static void create_worker_cont(struct callback_head *cb) static void io_workqueue_create(struct work_struct *work) { struct io_worker *worker = container_of(work, struct io_worker, work); - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq_acct *acct = io_wq_get_acct(worker); if (!io_queue_worker_create(worker, acct, create_worker_cont)) kfree(worker); @@ -797,7 +798,7 @@ static void io_workqueue_create(struct work_struct *work) static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) { - struct io_wqe_acct *acct = &wqe->acct[index]; + struct io_wq_acct *acct = &wq->acct[index]; struct 
io_worker *worker; struct task_struct *tsk; @@ -881,7 +882,7 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work) { - struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + struct io_wq_acct *acct = io_work_get_acct(wqe->wq, work); unsigned int hash; struct io_wq_work *tail; @@ -907,7 +908,7 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { - struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + struct io_wq_acct *acct = io_work_get_acct(wqe->wq, work); struct io_cb_cancel_data match; unsigned work_flags = work->flags; bool do_create; @@ -1011,7 +1012,7 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe, struct io_wq_work *work, struct io_wq_work_node *prev) { - struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + struct io_wq_acct *acct = io_work_get_acct(wqe->wq, work); unsigned int hash = io_get_work_hash(work); struct io_wq_work *prev_work = NULL; @@ -1027,7 +1028,7 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe, } static bool io_acct_cancel_pending_work(struct io_wqe *wqe, - struct io_wqe_acct *acct, + struct io_wq_acct *acct, struct io_cb_cancel_data *match) { struct io_wq_work_node *node, *prev; @@ -1051,12 +1052,12 @@ static bool io_acct_cancel_pending_work(struct io_wqe *wqe, } static void io_wqe_cancel_pending_work(struct io_wqe *wqe, - struct io_cb_cancel_data *match) + struct io_cb_cancel_data *match) { int i; retry: for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = io_get_acct(wqe, i == 0); + struct io_wq_acct *acct = io_get_acct(wqe->wq, i == 0); if (io_acct_cancel_pending_work(wqe, acct, match)) { if (match->cancel_all) @@ -1118,13 +1119,14 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct io_wqe *wqe = container_of(wait, struct io_wqe, wait); + struct io_wq *wq = wqe->wq; int i; list_del_init(&wait->entry); rcu_read_lock(); for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = &wqe->acct[i]; + struct io_wq_acct *acct = &wq->acct[i]; if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags)) io_wqe_activate_free_worker(wqe, acct); @@ -1162,13 +1164,13 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL)) goto err; cpumask_copy(wqe->cpu_mask, cpu_possible_mask); - wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; - wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = + wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; + wq->acct[IO_WQ_ACCT_UNBOUND].max_workers = task_rlimit(current, RLIMIT_NPROC); INIT_LIST_HEAD(&wqe->wait.entry); wqe->wait.func = io_wqe_hash_wake; for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = &wqe->acct[i]; + struct io_wq_acct *acct = &wq->acct[i]; acct->index = i; atomic_set(&acct->nr_running, 0); @@ -1335,7 +1337,7 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) int io_wq_max_workers(struct io_wq *wq, int *new_count) { struct io_wqe *wqe = &wq->wqe; - struct io_wqe_acct *acct; + struct io_wq_acct *acct; int prev[IO_WQ_ACCT_NR]; int i; @@ -1355,7 +1357,7 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) raw_spin_lock(&wqe->lock); for (i = 0; i < IO_WQ_ACCT_NR; i++) { - acct = &wqe->acct[i]; + acct = &wq->acct[i]; prev[i] = max_t(int, acct->max_workers, prev[i]); if (new_count[i]) acct->max_workers = new_count[i]; -- cgit From 
eb47943f2238bf3a002128d897f18abb143612d3 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 21 Mar 2023 22:16:28 -0300 Subject: io-wq: Drop struct io_wqe Since commit 0654b05e7e65 ("io_uring: One wqe per wq"), we have just a single io_wqe instance embedded per io_wq. Drop the extra structure in favor of accessing struct io_wq directly, cleaning up quite a bit of dereferences and backpointers. No functional changes intended. Tested with liburing's testsuite and mmtests performance microbenchmarks. I didn't observe any performance regressions. Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20230322011628.23359-2-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 344 +++++++++++++++++++++++++------------------------------ 1 file changed, 156 insertions(+), 188 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index da7c6e00b690..2b0b2e33cd71 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -40,7 +40,7 @@ enum { }; /* - * One for each thread in a wqe pool + * One for each thread in a wq pool */ struct io_worker { refcount_t ref; @@ -48,7 +48,7 @@ struct io_worker { struct hlist_nulls_node nulls_node; struct list_head all_list; struct task_struct *task; - struct io_wqe *wqe; + struct io_wq *wq; struct io_wq_work *cur_work; struct io_wq_work *next_work; @@ -90,23 +90,6 @@ enum { IO_WQ_ACCT_NR, }; -/* - * Per-node worker thread pool - */ -struct io_wqe { - raw_spinlock_t lock; - - struct hlist_nulls_head free_list; - struct list_head all_list; - - struct wait_queue_entry wait; - - struct io_wq *wq; - struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; - - cpumask_var_t cpu_mask; -}; - /* * Per io_wq state */ @@ -127,7 +110,17 @@ struct io_wq { struct io_wq_acct acct[IO_WQ_ACCT_NR]; - struct io_wqe wqe; + /* lock protects access to elements below */ + raw_spinlock_t lock; + + struct hlist_nulls_head free_list; + struct list_head all_list; + + struct wait_queue_entry wait; + + struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; + + cpumask_var_t cpu_mask; }; static enum cpuhp_state io_wq_online; @@ -140,9 +133,9 @@ struct io_cb_cancel_data { bool cancel_all; }; -static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index); -static void io_wqe_dec_running(struct io_worker *worker); -static bool io_acct_cancel_pending_work(struct io_wqe *wqe, +static bool create_io_worker(struct io_wq *wq, int index); +static void io_wq_dec_running(struct io_worker *worker); +static bool io_acct_cancel_pending_work(struct io_wq *wq, struct io_wq_acct *acct, struct io_cb_cancel_data *match); static void create_worker_cb(struct callback_head *cb); @@ -172,7 +165,7 @@ static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) { - return io_get_acct(worker->wqe->wq, worker->flags & IO_WORKER_F_BOUND); + return io_get_acct(worker->wq, worker->flags & IO_WORKER_F_BOUND); } static void io_worker_ref_put(struct io_wq *wq) @@ -184,13 +177,12 @@ static void io_worker_ref_put(struct io_wq *wq) static void io_worker_cancel_cb(struct io_worker *worker) { struct io_wq_acct *acct = io_wq_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq *wq = worker->wq; atomic_dec(&acct->nr_running); - raw_spin_lock(&worker->wqe->lock); + raw_spin_lock(&wq->lock); acct->nr_workers--; - raw_spin_unlock(&worker->wqe->lock); + raw_spin_unlock(&wq->lock); io_worker_ref_put(wq); clear_bit_unlock(0, 
&worker->create_state); io_worker_release(worker); @@ -208,8 +200,7 @@ static bool io_task_worker_match(struct callback_head *cb, void *data) static void io_worker_exit(struct io_worker *worker) { - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq *wq = worker->wq; while (1) { struct callback_head *cb = task_work_cancel_match(wq->task, @@ -223,19 +214,19 @@ static void io_worker_exit(struct io_worker *worker) io_worker_release(worker); wait_for_completion(&worker->ref_done); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); if (worker->flags & IO_WORKER_F_FREE) hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); - raw_spin_unlock(&wqe->lock); - io_wqe_dec_running(worker); + raw_spin_unlock(&wq->lock); + io_wq_dec_running(worker); worker->flags = 0; preempt_disable(); current->flags &= ~PF_IO_WORKER; preempt_enable(); kfree_rcu(worker, rcu); - io_worker_ref_put(wqe->wq); + io_worker_ref_put(wq); do_exit(0); } @@ -256,7 +247,7 @@ static inline bool io_acct_run_queue(struct io_wq_acct *acct) * Check head of free list for an available worker. If one isn't available, * caller must create one. */ -static bool io_wqe_activate_free_worker(struct io_wqe *wqe, +static bool io_wq_activate_free_worker(struct io_wq *wq, struct io_wq_acct *acct) __must_hold(RCU) { @@ -268,7 +259,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe, * activate. If a given worker is on the free_list but in the process * of exiting, keep trying. */ - hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) { + hlist_nulls_for_each_entry_rcu(worker, n, &wq->free_list, nulls_node) { if (!io_worker_get(worker)) continue; if (io_wq_get_acct(worker) != acct) { @@ -289,7 +280,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe, * We need a worker. If we find a free one, we're good. If not, and we're * below the max number of workers, create one. 
*/ -static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wq_acct *acct) +static bool io_wq_create_worker(struct io_wq *wq, struct io_wq_acct *acct) { /* * Most likely an attempt to queue unbounded work on an io_wq that @@ -298,19 +289,19 @@ static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wq_acct *acct) if (unlikely(!acct->max_workers)) pr_warn_once("io-wq is not configured for unbound workers"); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); if (acct->nr_workers >= acct->max_workers) { - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); return true; } acct->nr_workers++; - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); - return create_io_worker(wqe->wq, wqe, acct->index); + atomic_inc(&wq->worker_refs); + return create_io_worker(wq, acct->index); } -static void io_wqe_inc_running(struct io_worker *worker) +static void io_wq_inc_running(struct io_worker *worker) { struct io_wq_acct *acct = io_wq_get_acct(worker); @@ -321,22 +312,22 @@ static void create_worker_cb(struct callback_head *cb) { struct io_worker *worker; struct io_wq *wq; - struct io_wqe *wqe; + struct io_wq_acct *acct; bool do_create = false; worker = container_of(cb, struct io_worker, create_work); - wqe = worker->wqe; - wq = wqe->wq; + wq = worker->wq; acct = &wq->acct[worker->create_index]; - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); + if (acct->nr_workers < acct->max_workers) { acct->nr_workers++; do_create = true; } - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); if (do_create) { - create_io_worker(wq, wqe, worker->create_index); + create_io_worker(wq, worker->create_index); } else { atomic_dec(&acct->nr_running); io_worker_ref_put(wq); @@ -349,8 +340,7 @@ static bool io_queue_worker_create(struct io_worker *worker, struct io_wq_acct *acct, task_work_func_t func) { - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq *wq = worker->wq; /* raced with exit, just ignore create call */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) @@ -392,10 +382,10 @@ fail: return false; } -static void io_wqe_dec_running(struct io_worker *worker) +static void io_wq_dec_running(struct io_worker *worker) { struct io_wq_acct *acct = io_wq_get_acct(worker); - struct io_wqe *wqe = worker->wqe; + struct io_wq *wq = worker->wq; if (!(worker->flags & IO_WORKER_F_UP)) return; @@ -406,7 +396,7 @@ static void io_wqe_dec_running(struct io_worker *worker) return; atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); + atomic_inc(&wq->worker_refs); io_queue_worker_create(worker, acct, create_worker_cb); } @@ -414,13 +404,13 @@ static void io_wqe_dec_running(struct io_worker *worker) * Worker will start processing some work. Move it to the busy list, if * it's currently on the freelist */ -static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker) +static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) { if (worker->flags & IO_WORKER_F_FREE) { worker->flags &= ~IO_WORKER_F_FREE; - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); hlist_nulls_del_init_rcu(&worker->nulls_node); - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); } } @@ -431,12 +421,12 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker) * retry the loop in that case (we changed task state), we don't regrab * the lock if we return success. 
*/ -static void __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) - __must_hold(wqe->lock) +static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) + __must_hold(wq->lock) { if (!(worker->flags & IO_WORKER_F_FREE)) { worker->flags |= IO_WORKER_F_FREE; - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); + hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); } } @@ -445,17 +435,16 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work) return work->flags >> IO_WQ_HASH_SHIFT; } -static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) +static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) { - struct io_wq *wq = wqe->wq; bool ret = false; spin_lock_irq(&wq->hash->wait.lock); - if (list_empty(&wqe->wait.entry)) { - __add_wait_queue(&wq->hash->wait, &wqe->wait); + if (list_empty(&wq->wait.entry)) { + __add_wait_queue(&wq->hash->wait, &wq->wait); if (!test_bit(hash, &wq->hash->map)) { __set_current_state(TASK_RUNNING); - list_del_init(&wqe->wait.entry); + list_del_init(&wq->wait.entry); ret = true; } } @@ -470,7 +459,7 @@ static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct, struct io_wq_work_node *node, *prev; struct io_wq_work *work, *tail; unsigned int stall_hash = -1U; - struct io_wqe *wqe = worker->wqe; + struct io_wq *wq = worker->wq; wq_list_for_each(node, prev, &acct->work_list) { unsigned int hash; @@ -485,11 +474,11 @@ static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct, hash = io_get_work_hash(work); /* all items with this hash lie in [work, tail] */ - tail = wqe->hash_tail[hash]; + tail = wq->hash_tail[hash]; /* hashed, can run if not already running */ - if (!test_and_set_bit(hash, &wqe->wq->hash->map)) { - wqe->hash_tail[hash] = NULL; + if (!test_and_set_bit(hash, &wq->hash->map)) { + wq->hash_tail[hash] = NULL; wq_list_cut(&acct->work_list, &tail->list, prev); return work; } @@ -508,12 +497,12 @@ static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct, */ set_bit(IO_ACCT_STALLED_BIT, &acct->flags); raw_spin_unlock(&acct->lock); - unstalled = io_wait_on_hash(wqe, stall_hash); + unstalled = io_wait_on_hash(wq, stall_hash); raw_spin_lock(&acct->lock); if (unstalled) { clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); - if (wq_has_sleeper(&wqe->wq->hash->wait)) - wake_up(&wqe->wq->hash->wait); + if (wq_has_sleeper(&wq->hash->wait)) + wake_up(&wq->hash->wait); } } @@ -534,13 +523,10 @@ static void io_assign_current_work(struct io_worker *worker, raw_spin_unlock(&worker->lock); } -static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); - static void io_worker_handle_work(struct io_worker *worker) { struct io_wq_acct *acct = io_wq_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq *wq = worker->wq; bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); do { @@ -557,7 +543,7 @@ static void io_worker_handle_work(struct io_worker *worker) work = io_get_next_work(acct, worker); raw_spin_unlock(&acct->lock); if (work) { - __io_worker_busy(wqe, worker); + __io_worker_busy(wq, worker); /* * Make sure cancelation can find this, even before @@ -595,7 +581,7 @@ static void io_worker_handle_work(struct io_worker *worker) } io_assign_current_work(worker, work); if (linked) - io_wqe_enqueue(wqe, linked); + io_wq_enqueue(wq, linked); if (hash != -1U && !next_hashed) { /* serialize hash clear with wake_up() */ @@ -610,12 +596,11 @@ static void io_worker_handle_work(struct io_worker *worker) } while (1); } -static int 
io_wqe_worker(void *data) +static int io_wq_worker(void *data) { struct io_worker *worker = data; struct io_wq_acct *acct = io_wq_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq *wq = worker->wq; bool exit_mask = false, last_timeout = false; char buf[TASK_COMM_LEN]; @@ -631,20 +616,20 @@ static int io_wqe_worker(void *data) while (io_acct_run_queue(acct)) io_worker_handle_work(worker); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); /* * Last sleep timed out. Exit if we're not the last worker, * or if someone modified our affinity. */ if (last_timeout && (exit_mask || acct->nr_workers > 1)) { acct->nr_workers--; - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); __set_current_state(TASK_RUNNING); break; } last_timeout = false; - __io_worker_idle(wqe, worker); - raw_spin_unlock(&wqe->lock); + __io_worker_idle(wq, worker); + raw_spin_unlock(&wq->lock); if (io_run_task_work()) continue; ret = schedule_timeout(WORKER_IDLE_TIMEOUT); @@ -658,7 +643,7 @@ static int io_wqe_worker(void *data) if (!ret) { last_timeout = true; exit_mask = !cpumask_test_cpu(raw_smp_processor_id(), - wqe->cpu_mask); + wq->cpu_mask); } } @@ -683,7 +668,7 @@ void io_wq_worker_running(struct task_struct *tsk) if (worker->flags & IO_WORKER_F_RUNNING) return; worker->flags |= IO_WORKER_F_RUNNING; - io_wqe_inc_running(worker); + io_wq_inc_running(worker); } /* @@ -702,21 +687,21 @@ void io_wq_worker_sleeping(struct task_struct *tsk) return; worker->flags &= ~IO_WORKER_F_RUNNING; - io_wqe_dec_running(worker); + io_wq_dec_running(worker); } -static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker, +static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker, struct task_struct *tsk) { tsk->worker_private = worker; worker->task = tsk; - set_cpus_allowed_ptr(tsk, wqe->cpu_mask); + set_cpus_allowed_ptr(tsk, wq->cpu_mask); - raw_spin_lock(&wqe->lock); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); - list_add_tail_rcu(&worker->all_list, &wqe->all_list); + raw_spin_lock(&wq->lock); + hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); + list_add_tail_rcu(&worker->all_list, &wq->all_list); worker->flags |= IO_WORKER_F_FREE; - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); wake_up_new_task(tsk); } @@ -749,21 +734,21 @@ static void create_worker_cont(struct callback_head *cb) { struct io_worker *worker; struct task_struct *tsk; - struct io_wqe *wqe; + struct io_wq *wq; worker = container_of(cb, struct io_worker, create_work); clear_bit_unlock(0, &worker->create_state); - wqe = worker->wqe; - tsk = create_io_thread(io_wqe_worker, worker, NUMA_NO_NODE); + wq = worker->wq; + tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { - io_init_new_worker(wqe, worker, tsk); + io_init_new_worker(wq, worker, tsk); io_worker_release(worker); return; } else if (!io_should_retry_thread(PTR_ERR(tsk))) { struct io_wq_acct *acct = io_wq_get_acct(worker); atomic_dec(&acct->nr_running); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); acct->nr_workers--; if (!acct->nr_workers) { struct io_cb_cancel_data match = { @@ -771,13 +756,13 @@ static void create_worker_cont(struct callback_head *cb) .cancel_all = true, }; - raw_spin_unlock(&wqe->lock); - while (io_acct_cancel_pending_work(wqe, acct, &match)) + raw_spin_unlock(&wq->lock); + while (io_acct_cancel_pending_work(wq, acct, &match)) ; } else { - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); } - 
io_worker_ref_put(wqe->wq); + io_worker_ref_put(wq); kfree(worker); return; } @@ -796,7 +781,7 @@ static void io_workqueue_create(struct work_struct *work) kfree(worker); } -static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) +static bool create_io_worker(struct io_wq *wq, int index) { struct io_wq_acct *acct = &wq->acct[index]; struct io_worker *worker; @@ -808,24 +793,24 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) if (!worker) { fail: atomic_dec(&acct->nr_running); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); acct->nr_workers--; - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); io_worker_ref_put(wq); return false; } refcount_set(&worker->ref, 1); - worker->wqe = wqe; + worker->wq = wq; raw_spin_lock_init(&worker->lock); init_completion(&worker->ref_done); if (index == IO_WQ_ACCT_BOUND) worker->flags |= IO_WORKER_F_BOUND; - tsk = create_io_thread(io_wqe_worker, worker, NUMA_NO_NODE); + tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { - io_init_new_worker(wqe, worker, tsk); + io_init_new_worker(wq, worker, tsk); } else if (!io_should_retry_thread(PTR_ERR(tsk))) { kfree(worker); goto fail; @@ -841,14 +826,14 @@ fail: * Iterate the passed in list and call the specific function for each * worker that isn't exiting */ -static bool io_wq_for_each_worker(struct io_wqe *wqe, +static bool io_wq_for_each_worker(struct io_wq *wq, bool (*func)(struct io_worker *, void *), void *data) { struct io_worker *worker; bool ret = false; - list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { + list_for_each_entry_rcu(worker, &wq->all_list, all_list) { if (io_worker_get(worker)) { /* no task if node is/was offline */ if (worker->task) @@ -869,10 +854,8 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) return false; } -static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) +static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) { - struct io_wq *wq = wqe->wq; - do { work->flags |= IO_WQ_WORK_CANCEL; wq->do_work(work); @@ -880,9 +863,9 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) } while (work); } -static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work) +static void io_wq_insert_work(struct io_wq *wq, struct io_wq_work *work) { - struct io_wq_acct *acct = io_work_get_acct(wqe->wq, work); + struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int hash; struct io_wq_work *tail; @@ -893,8 +876,8 @@ append: } hash = io_get_work_hash(work); - tail = wqe->hash_tail[hash]; - wqe->hash_tail[hash] = work; + tail = wq->hash_tail[hash]; + wq->hash_tail[hash] = work; if (!tail) goto append; @@ -906,9 +889,9 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) return work == data; } -static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) +void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { - struct io_wq_acct *acct = io_work_get_acct(wqe->wq, work); + struct io_wq_acct *acct = io_work_get_acct(wq, work); struct io_cb_cancel_data match; unsigned work_flags = work->flags; bool do_create; @@ -917,55 +900,48 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) * If io-wq is exiting for this task, or if the request has explicitly * been marked as one that should not get executed, cancel it here. 
*/ - if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state) || + if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || (work->flags & IO_WQ_WORK_CANCEL)) { - io_run_cancel(work, wqe); + io_run_cancel(work, wq); return; } raw_spin_lock(&acct->lock); - io_wqe_insert_work(wqe, work); + io_wq_insert_work(wq, work); clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); raw_spin_unlock(&acct->lock); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); rcu_read_lock(); - do_create = !io_wqe_activate_free_worker(wqe, acct); + do_create = !io_wq_activate_free_worker(wq, acct); rcu_read_unlock(); - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) || !atomic_read(&acct->nr_running))) { bool did_create; - did_create = io_wqe_create_worker(wqe, acct); + did_create = io_wq_create_worker(wq, acct); if (likely(did_create)) return; - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); if (acct->nr_workers) { - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); return; } - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); /* fatal condition, failed to create the first worker */ match.fn = io_wq_work_match_item, match.data = work, match.cancel_all = false, - io_acct_cancel_pending_work(wqe, acct, &match); + io_acct_cancel_pending_work(wq, acct, &match); } } -void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) -{ - struct io_wqe *wqe = &wq->wqe; - - io_wqe_enqueue(wqe, work); -} - /* * Work items that hash to the same value will not be done in parallel. * Used to limit concurrent writes, generally hashed by inode. @@ -1008,26 +984,26 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) return match->nr_running && !match->cancel_all; } -static inline void io_wqe_remove_pending(struct io_wqe *wqe, +static inline void io_wq_remove_pending(struct io_wq *wq, struct io_wq_work *work, struct io_wq_work_node *prev) { - struct io_wq_acct *acct = io_work_get_acct(wqe->wq, work); + struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int hash = io_get_work_hash(work); struct io_wq_work *prev_work = NULL; - if (io_wq_is_hashed(work) && work == wqe->hash_tail[hash]) { + if (io_wq_is_hashed(work) && work == wq->hash_tail[hash]) { if (prev) prev_work = container_of(prev, struct io_wq_work, list); if (prev_work && io_get_work_hash(prev_work) == hash) - wqe->hash_tail[hash] = prev_work; + wq->hash_tail[hash] = prev_work; else - wqe->hash_tail[hash] = NULL; + wq->hash_tail[hash] = NULL; } wq_list_del(&acct->work_list, &work->list, prev); } -static bool io_acct_cancel_pending_work(struct io_wqe *wqe, +static bool io_acct_cancel_pending_work(struct io_wq *wq, struct io_wq_acct *acct, struct io_cb_cancel_data *match) { @@ -1039,9 +1015,9 @@ static bool io_acct_cancel_pending_work(struct io_wqe *wqe, work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; - io_wqe_remove_pending(wqe, work, prev); + io_wq_remove_pending(wq, work, prev); raw_spin_unlock(&acct->lock); - io_run_cancel(work, wqe); + io_run_cancel(work, wq); match->nr_pending++; /* not safe to continue after unlock */ return true; @@ -1051,15 +1027,15 @@ static bool io_acct_cancel_pending_work(struct io_wqe *wqe, return false; } -static void io_wqe_cancel_pending_work(struct io_wqe *wqe, +static void io_wq_cancel_pending_work(struct io_wq *wq, struct io_cb_cancel_data *match) { int i; retry: for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wq_acct *acct = io_get_acct(wqe->wq, i == 0); + struct io_wq_acct *acct = io_get_acct(wq, 
i == 0); - if (io_acct_cancel_pending_work(wqe, acct, match)) { + if (io_acct_cancel_pending_work(wq, acct, match)) { if (match->cancel_all) goto retry; break; @@ -1067,11 +1043,11 @@ retry: } } -static void io_wqe_cancel_running_work(struct io_wqe *wqe, +static void io_wq_cancel_running_work(struct io_wq *wq, struct io_cb_cancel_data *match) { rcu_read_lock(); - io_wq_for_each_worker(wqe, io_wq_worker_cancel, match); + io_wq_for_each_worker(wq, io_wq_worker_cancel, match); rcu_read_unlock(); } @@ -1083,7 +1059,6 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, .data = data, .cancel_all = cancel_all, }; - struct io_wqe *wqe = &wq->wqe; /* * First check pending list, if we're lucky we can just remove it @@ -1095,16 +1070,16 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, * as an indication that we attempt to signal cancellation. The * completion will run normally in this case. * - * Do both of these while holding the wqe->lock, to ensure that + * Do both of these while holding the wq->lock, to ensure that * we'll find a work item regardless of state. */ - io_wqe_cancel_pending_work(wqe, &match); + io_wq_cancel_pending_work(wq, &match); if (match.nr_pending && !match.cancel_all) return IO_WQ_CANCEL_OK; - raw_spin_lock(&wqe->lock); - io_wqe_cancel_running_work(wqe, &match); - raw_spin_unlock(&wqe->lock); + raw_spin_lock(&wq->lock); + io_wq_cancel_running_work(wq, &match); + raw_spin_unlock(&wq->lock); if (match.nr_running && !match.cancel_all) return IO_WQ_CANCEL_RUNNING; @@ -1115,11 +1090,10 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, return IO_WQ_CANCEL_NOTFOUND; } -static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, +static int io_wq_hash_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { - struct io_wqe *wqe = container_of(wait, struct io_wqe, wait); - struct io_wq *wq = wqe->wq; + struct io_wq *wq = container_of(wait, struct io_wq, wait); int i; list_del_init(&wait->entry); @@ -1129,7 +1103,7 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, struct io_wq_acct *acct = &wq->acct[i]; if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags)) - io_wqe_activate_free_worker(wqe, acct); + io_wq_activate_free_worker(wq, acct); } rcu_read_unlock(); return 1; @@ -1139,7 +1113,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { int ret, i; struct io_wq *wq; - struct io_wqe *wqe; if (WARN_ON_ONCE(!data->free_work || !data->do_work)) return ERR_PTR(-EINVAL); @@ -1157,18 +1130,17 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) wq->hash = data->hash; wq->free_work = data->free_work; wq->do_work = data->do_work; - wqe = &wq->wqe; ret = -ENOMEM; - if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL)) + if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL)) goto err; - cpumask_copy(wqe->cpu_mask, cpu_possible_mask); + cpumask_copy(wq->cpu_mask, cpu_possible_mask); wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; wq->acct[IO_WQ_ACCT_UNBOUND].max_workers = task_rlimit(current, RLIMIT_NPROC); - INIT_LIST_HEAD(&wqe->wait.entry); - wqe->wait.func = io_wqe_hash_wake; + INIT_LIST_HEAD(&wq->wait.entry); + wq->wait.func = io_wq_hash_wake; for (i = 0; i < IO_WQ_ACCT_NR; i++) { struct io_wq_acct *acct = &wq->acct[i]; @@ -1177,10 +1149,10 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) INIT_WQ_LIST(&acct->work_list); raw_spin_lock_init(&acct->lock); } - wqe->wq = wq; - 
raw_spin_lock_init(&wqe->lock); - INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); - INIT_LIST_HEAD(&wqe->all_list); + + raw_spin_lock_init(&wq->lock); + INIT_HLIST_NULLS_HEAD(&wq->free_list, 0); + INIT_LIST_HEAD(&wq->all_list); wq->task = get_task_struct(data->task); atomic_set(&wq->worker_refs, 1); @@ -1190,7 +1162,7 @@ err: io_wq_put_hash(data->hash); cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - free_cpumask_var(wq->wqe.cpu_mask); + free_cpumask_var(wq->cpu_mask); err_wq: kfree(wq); return ERR_PTR(ret); @@ -1203,7 +1175,7 @@ static bool io_task_work_match(struct callback_head *cb, void *data) if (cb->func != create_worker_cb && cb->func != create_worker_cont) return false; worker = container_of(cb, struct io_worker, create_work); - return worker->wqe->wq == data; + return worker->wq == data; } void io_wq_exit_start(struct io_wq *wq) @@ -1237,13 +1209,13 @@ static void io_wq_exit_workers(struct io_wq *wq) io_wq_cancel_tw_create(wq); rcu_read_lock(); - io_wq_for_each_worker(&wq->wqe, io_wq_worker_wake, NULL); + io_wq_for_each_worker(wq, io_wq_worker_wake, NULL); rcu_read_unlock(); io_worker_ref_put(wq); wait_for_completion(&wq->worker_done); spin_lock_irq(&wq->hash->wait.lock); - list_del_init(&wq->wqe.wait.entry); + list_del_init(&wq->wait.entry); spin_unlock_irq(&wq->hash->wait.lock); put_task_struct(wq->task); @@ -1256,11 +1228,10 @@ static void io_wq_destroy(struct io_wq *wq) .fn = io_wq_work_match_all, .cancel_all = true, }; - struct io_wqe *wqe = &wq->wqe; cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - io_wqe_cancel_pending_work(wqe, &match); - free_cpumask_var(wqe->cpu_mask); + io_wq_cancel_pending_work(wq, &match); + free_cpumask_var(wq->cpu_mask); io_wq_put_hash(wq->hash); kfree(wq); } @@ -1283,9 +1254,9 @@ static bool io_wq_worker_affinity(struct io_worker *worker, void *data) struct online_data *od = data; if (od->online) - cpumask_set_cpu(od->cpu, worker->wqe->cpu_mask); + cpumask_set_cpu(od->cpu, worker->wq->cpu_mask); else - cpumask_clear_cpu(od->cpu, worker->wqe->cpu_mask); + cpumask_clear_cpu(od->cpu, worker->wq->cpu_mask); return false; } @@ -1297,7 +1268,7 @@ static int __io_wq_cpu_online(struct io_wq *wq, unsigned int cpu, bool online) }; rcu_read_lock(); - io_wq_for_each_worker(&wq->wqe, io_wq_worker_affinity, &od); + io_wq_for_each_worker(wq, io_wq_worker_affinity, &od); rcu_read_unlock(); return 0; } @@ -1318,13 +1289,11 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node) int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) { - struct io_wqe *wqe = &wq->wqe; - rcu_read_lock(); if (mask) - cpumask_copy(wqe->cpu_mask, mask); + cpumask_copy(wq->cpu_mask, mask); else - cpumask_copy(wqe->cpu_mask, cpu_possible_mask); + cpumask_copy(wq->cpu_mask, cpu_possible_mask); rcu_read_unlock(); return 0; @@ -1336,7 +1305,6 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) */ int io_wq_max_workers(struct io_wq *wq, int *new_count) { - struct io_wqe *wqe = &wq->wqe; struct io_wq_acct *acct; int prev[IO_WQ_ACCT_NR]; int i; @@ -1355,14 +1323,14 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) rcu_read_lock(); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); for (i = 0; i < IO_WQ_ACCT_NR; i++) { acct = &wq->acct[i]; prev[i] = max_t(int, acct->max_workers, prev[i]); if (new_count[i]) acct->max_workers = new_count[i]; } - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); rcu_read_unlock(); for (i = 0; i < IO_WQ_ACCT_NR; i++) -- cgit From 07d99096e1635805fb7c60382dc12554886a39b8 
Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 27 Mar 2023 13:10:21 -0600 Subject: io_uring/io-wq: drop outdated comment Since the move to PF_IO_WORKER, we don't juggle memory context manually anymore. Remove that outdated part of the comment for __io_worker_idle(). Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 2b0b2e33cd71..b2715988791e 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -415,11 +415,7 @@ static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) } /* - * No work, worker going to sleep. Move to freelist, and unuse mm if we - * have one attached. Dropping the mm may potentially sleep, so we drop - * the lock in that case and return success. Since the caller has to - * retry the loop in that case (we changed task state), we don't regrab - * the lock if we return success. + * No work, worker going to sleep. Move to freelist. */ static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) __must_hold(wq->lock) -- cgit From 13bfa6f15d0b39254937076ab0557da6875bb455 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 27 Mar 2023 16:38:14 +0100 Subject: io_uring: remove extra tw trylocks Before cond_resched()'ing in handle_tw_list() we also drop the current ring context, and so the next loop iteration will need to pick/pin a new context and do trylock. The chunk removed by this patch was intended to be an optimisation covering exactly this case, i.e. retaking the lock after reschedule, but in reality it's skipped for the first iteration after resched as described and will keep hammering the lock if it's contended. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1ecec9483d58696e248d1bfd52cf62b04442df1d.1679931367.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 24be4992821b..2669aca0ba39 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1186,8 +1186,7 @@ static unsigned int handle_tw_list(struct llist_node *node, /* if not contended, grab and improve batching */ *locked = mutex_trylock(&(*ctx)->uring_lock); percpu_ref_get(&(*ctx)->refs); - } else if (!*locked) - *locked = mutex_trylock(&(*ctx)->uring_lock); + } req->io_task_work.func(req, locked); node = next; count++; -- cgit From a282967c848fb1d92c28334430c472da9c334e54 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 27 Mar 2023 16:38:15 +0100 Subject: io_uring: encapsulate task_work state For task works we're passing around a bool pointer for whether the current ring is locked or not, let's wrap it in a structure, that will make it more opaque preventing abuse and will also help us to pass more info in the future if needed. 
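A note on the new type: struct io_tw_state itself lives in include/linux/io_uring_types.h and is not part of the hunks below. Judging from how it is used here, it is presumably little more than a named wrapper around the old bool, roughly:

struct io_tw_state {
	/* set when ->uring_lock is held for this task_work run */
	bool locked;
};

Wrapping the flag in a struct keeps the locked state opaque to the task_work callbacks and leaves room to carry extra per-run state later without changing every callback signature again.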
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1ecec9483d58696e248d1bfd52cf62b04442df1d.1679931367.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 71 ++++++++++++++++++++++++++-------------------------- io_uring/io_uring.h | 14 +++++------ io_uring/notif.c | 4 +-- io_uring/poll.c | 32 +++++++++++------------ io_uring/rw.c | 6 ++--- io_uring/timeout.c | 14 +++++------ io_uring/uring_cmd.c | 4 +-- 7 files changed, 73 insertions(+), 72 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2669aca0ba39..536940675c67 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -247,12 +247,12 @@ static __cold void io_fallback_req_func(struct work_struct *work) fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; - bool locked = true; + struct io_tw_state ts = { .locked = true, }; mutex_lock(&ctx->uring_lock); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) - req->io_task_work.func(req, &locked); - if (WARN_ON_ONCE(!locked)) + req->io_task_work.func(req, &ts); + if (WARN_ON_ONCE(!ts.locked)) return; io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); @@ -457,7 +457,7 @@ static void io_prep_async_link(struct io_kiocb *req) } } -void io_queue_iowq(struct io_kiocb *req, bool *dont_use) +void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use) { struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; @@ -1153,22 +1153,23 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) return nxt; } -static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) +static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) { if (!ctx) return; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (*locked) { + if (ts->locked) { io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); - *locked = false; + ts->locked = false; } percpu_ref_put(&ctx->refs); } static unsigned int handle_tw_list(struct llist_node *node, - struct io_ring_ctx **ctx, bool *locked, + struct io_ring_ctx **ctx, + struct io_tw_state *ts, struct llist_node *last) { unsigned int count = 0; @@ -1181,17 +1182,17 @@ static unsigned int handle_tw_list(struct llist_node *node, prefetch(container_of(next, struct io_kiocb, io_task_work.node)); if (req->ctx != *ctx) { - ctx_flush_and_put(*ctx, locked); + ctx_flush_and_put(*ctx, ts); *ctx = req->ctx; /* if not contended, grab and improve batching */ - *locked = mutex_trylock(&(*ctx)->uring_lock); + ts->locked = mutex_trylock(&(*ctx)->uring_lock); percpu_ref_get(&(*ctx)->refs); } - req->io_task_work.func(req, locked); + req->io_task_work.func(req, ts); node = next; count++; if (unlikely(need_resched())) { - ctx_flush_and_put(*ctx, locked); + ctx_flush_and_put(*ctx, ts); *ctx = NULL; cond_resched(); } @@ -1232,7 +1233,7 @@ static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head, void tctx_task_work(struct callback_head *cb) { - bool uring_locked = false; + struct io_tw_state ts = {}; struct io_ring_ctx *ctx = NULL; struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); @@ -1249,12 +1250,12 @@ void tctx_task_work(struct callback_head *cb) do { loops++; node = io_llist_xchg(&tctx->task_list, &fake); - count += handle_tw_list(node, &ctx, &uring_locked, &fake); + count += handle_tw_list(node, &ctx, &ts, &fake); /* skip 
expensive cmpxchg if there are items in the list */ if (READ_ONCE(tctx->task_list.first) != &fake) continue; - if (uring_locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) { + if (ts.locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) { io_submit_flush_completions(ctx); if (READ_ONCE(tctx->task_list.first) != &fake) continue; @@ -1262,7 +1263,7 @@ void tctx_task_work(struct callback_head *cb) node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); } while (node != &fake); - ctx_flush_and_put(ctx, &uring_locked); + ctx_flush_and_put(ctx, &ts); /* relaxed read is enough as only the task itself sets ->in_cancel */ if (unlikely(atomic_read(&tctx->in_cancel))) @@ -1351,7 +1352,7 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) } } -static int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked) +static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts) { struct llist_node *node; unsigned int loops = 0; @@ -1368,7 +1369,7 @@ again: struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - req->io_task_work.func(req, locked); + req->io_task_work.func(req, ts); ret++; node = next; } @@ -1376,7 +1377,7 @@ again: if (!llist_empty(&ctx->work_llist)) goto again; - if (*locked) { + if (ts->locked) { io_submit_flush_completions(ctx); if (!llist_empty(&ctx->work_llist)) goto again; @@ -1387,46 +1388,46 @@ again: static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) { - bool locked; + struct io_tw_state ts = { .locked = true, }; int ret; if (llist_empty(&ctx->work_llist)) return 0; - locked = true; - ret = __io_run_local_work(ctx, &locked); + ret = __io_run_local_work(ctx, &ts); /* shouldn't happen! 
*/ - if (WARN_ON_ONCE(!locked)) + if (WARN_ON_ONCE(!ts.locked)) mutex_lock(&ctx->uring_lock); return ret; } static int io_run_local_work(struct io_ring_ctx *ctx) { - bool locked = mutex_trylock(&ctx->uring_lock); + struct io_tw_state ts = {}; int ret; - ret = __io_run_local_work(ctx, &locked); - if (locked) + ts.locked = mutex_trylock(&ctx->uring_lock); + ret = __io_run_local_work(ctx, &ts); + if (ts.locked) mutex_unlock(&ctx->uring_lock); return ret; } -static void io_req_task_cancel(struct io_kiocb *req, bool *locked) +static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts) { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); io_req_defer_failed(req, req->cqe.res); } -void io_req_task_submit(struct io_kiocb *req, bool *locked) +void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); /* req->task == current here, checking PF_EXITING is safe */ if (unlikely(req->task->flags & PF_EXITING)) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) - io_queue_iowq(req, locked); + io_queue_iowq(req, ts); else io_queue_sqe(req); } @@ -1652,9 +1653,9 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) return ret; } -void io_req_task_complete(struct io_kiocb *req, bool *locked) +void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) { - if (*locked) + if (ts->locked) io_req_complete_defer(req); else io_req_complete_post(req, IO_URING_F_UNLOCKED); @@ -1933,9 +1934,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return 0; } -int io_poll_issue(struct io_kiocb *req, bool *locked) +int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| IO_URING_F_COMPLETE_DEFER); } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 2711865f1e19..c33f719731ac 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -52,16 +52,16 @@ void __io_req_task_work_add(struct io_kiocb *req, bool allow_local); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); -void io_queue_iowq(struct io_kiocb *req, bool *dont_use); -void io_req_task_complete(struct io_kiocb *req, bool *locked); +void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use); +void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); void io_req_task_queue_fail(struct io_kiocb *req, int ret); -void io_req_task_submit(struct io_kiocb *req, bool *locked); +void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); void tctx_task_work(struct callback_head *cb); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); -int io_poll_issue(struct io_kiocb *req, bool *locked); +int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node); @@ -299,11 +299,11 @@ static inline bool io_task_work_pending(struct io_ring_ctx *ctx) return task_work_pending(current) || !wq_list_empty(&ctx->work_llist); } -static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) +static inline void io_tw_lock(struct 
io_ring_ctx *ctx, struct io_tw_state *ts) { - if (!*locked) { + if (!ts->locked) { mutex_lock(&ctx->uring_lock); - *locked = true; + ts->locked = true; } } diff --git a/io_uring/notif.c b/io_uring/notif.c index 09dfd0832d19..172105eb347d 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -9,7 +9,7 @@ #include "notif.h" #include "rsrc.h" -static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked) +static void io_notif_complete_tw_ext(struct io_kiocb *notif, struct io_tw_state *ts) { struct io_notif_data *nd = io_notif_to_data(notif); struct io_ring_ctx *ctx = notif->ctx; @@ -21,7 +21,7 @@ static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked) __io_unaccount_mem(ctx->user, nd->account_pages); nd->account_pages = 0; } - io_req_task_complete(notif, locked); + io_req_task_complete(notif, ts); } static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, diff --git a/io_uring/poll.c b/io_uring/poll.c index 55306e801081..c90e47dc1e29 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -148,7 +148,7 @@ static void io_poll_req_insert_locked(struct io_kiocb *req) hlist_add_head(&req->hash_node, &table->hbs[index].list); } -static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked) +static void io_poll_tw_hash_eject(struct io_kiocb *req, struct io_tw_state *ts) { struct io_ring_ctx *ctx = req->ctx; @@ -159,7 +159,7 @@ static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked) * already grabbed the mutex for us, but there is a chance it * failed. */ - io_tw_lock(ctx, locked); + io_tw_lock(ctx, ts); hash_del(&req->hash_node); req->flags &= ~REQ_F_HASH_LOCKED; } else { @@ -238,7 +238,7 @@ enum { * req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot * poll and that the result is stored in req->cqe. 
*/ -static int io_poll_check_events(struct io_kiocb *req, bool *locked) +static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) { int v; @@ -300,13 +300,13 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_aux_cqe(req->ctx, *locked, req->cqe.user_data, + if (!io_aux_cqe(req->ctx, ts->locked, req->cqe.user_data, mask, IORING_CQE_F_MORE, false)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } } else { - int ret = io_poll_issue(req, locked); + int ret = io_poll_issue(req, ts); if (ret == IOU_STOP_MULTISHOT) return IOU_POLL_REMOVE_POLL_USE_RES; if (ret < 0) @@ -326,15 +326,15 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) return IOU_POLL_NO_ACTION; } -static void io_poll_task_func(struct io_kiocb *req, bool *locked) +static void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) { int ret; - ret = io_poll_check_events(req, locked); + ret = io_poll_check_events(req, ts); if (ret == IOU_POLL_NO_ACTION) return; io_poll_remove_entries(req); - io_poll_tw_hash_eject(req, locked); + io_poll_tw_hash_eject(req, ts); if (req->opcode == IORING_OP_POLL_ADD) { if (ret == IOU_POLL_DONE) { @@ -343,7 +343,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) poll = io_kiocb_to_cmd(req, struct io_poll); req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else if (ret == IOU_POLL_REISSUE) { - io_req_task_submit(req, locked); + io_req_task_submit(req, ts); return; } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { req->cqe.res = ret; @@ -351,14 +351,14 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) } io_req_set_res(req, req->cqe.res, 0); - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); } else { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE) - io_req_task_submit(req, locked); + io_req_task_submit(req, ts); else io_req_defer_failed(req, ret); } @@ -977,7 +977,7 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_hash_bucket *bucket; struct io_kiocb *preq; int ret2, ret = 0; - bool locked; + struct io_tw_state ts = {}; preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); ret2 = io_poll_disarm(preq); @@ -1027,8 +1027,8 @@ found: req_set_fail(preq); io_req_set_res(preq, -ECANCELED, 0); - locked = !(issue_flags & IO_URING_F_UNLOCKED); - io_req_task_complete(preq, &locked); + ts.locked = !(issue_flags & IO_URING_F_UNLOCKED); + io_req_task_complete(preq, &ts); out: if (ret < 0) { req_set_fail(req); diff --git a/io_uring/rw.c b/io_uring/rw.c index 4c233910e200..f14868624f41 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -283,16 +283,16 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res) return res; } -static void io_req_rw_complete(struct io_kiocb *req, bool *locked) +static void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) { io_req_io_end(req); if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + unsigned issue_flags = ts->locked ? 
0 : IO_URING_F_UNLOCKED; req->cqe.flags |= io_put_kbuf(req, issue_flags); } - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); } static void io_complete_rw(struct kiocb *kiocb, long res) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 826a51bca3e4..5c6c6f720809 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -101,9 +101,9 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->timeout_lock); } -static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked) +static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) { - io_tw_lock(link->ctx, locked); + io_tw_lock(link->ctx, ts); while (link) { struct io_kiocb *nxt = link->link; long res = -ECANCELED; @@ -112,7 +112,7 @@ static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked) res = link->cqe.res; link->link = NULL; io_req_set_res(link, res, 0); - io_req_task_complete(link, locked); + io_req_task_complete(link, ts); link = nxt; } } @@ -265,9 +265,9 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) return 0; } -static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) +static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts) { - unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; int ret = -ENOENT; @@ -282,11 +282,11 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) ret = io_try_cancel(req->task->io_uring, &cd, issue_flags); } io_req_set_res(req, ret ?: -ETIME, 0); - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); io_put_req(prev); } else { io_req_set_res(req, -ETIME, 0); - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); } } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 9a1dee571872..3d825d939b13 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -12,10 +12,10 @@ #include "rsrc.h" #include "uring_cmd.h" -static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) +static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; ioucmd->task_work_cb(ioucmd, issue_flags); } -- cgit From 2ad57931db641f3de627023afb8147a8ec0b41dc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 30 Mar 2023 10:03:41 -0600 Subject: io_uring: rename trace_io_uring_submit_sqe() tracepoint It has nothing to do with the SQE at this point, it's a request submission. While in there, get rid of the 'force_nonblock' argument which is also dead, as we only pass in true. 
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 536940675c67..775b53730c2f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2305,8 +2305,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(ret)) return io_submit_fail_init(sqe, req, ret); - /* don't need @sqe from now on */ - trace_io_uring_submit_sqe(req, true); + trace_io_uring_submit_req(req); /* * If we already have a head request, queue this one for async -- cgit From e3ef728ff07b42668e7e12f49cd2f9055e064ec1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 30 Mar 2023 10:05:31 -0600 Subject: io_uring: cap io_sqring_entries() at SQ ring size We already do this manually for the !SQPOLL case, do it in general and we can also dump the ugly min3() in io_submit_sqes(). Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/io_uring.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 775b53730c2f..a0b64831c455 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2434,7 +2434,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) if (unlikely(!entries)) return 0; /* make sure SQ entry isn't read before tail */ - ret = left = min3(nr, ctx->sq_entries, entries); + ret = left = min(nr, entries); io_get_task_refs(left); io_submit_state_start(&ctx->submit_state, left); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index c33f719731ac..193b2db39fe8 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -262,9 +262,11 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx) static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; + unsigned int entries; /* make sure SQ entry isn't read before tail */ - return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; + entries = smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; + return min(entries, ctx->sq_entries); } static inline int io_run_task_work(void) -- cgit From b8fb5b4fdd67f9d18109c5d21d44a8bd4ddb608b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:45 +0100 Subject: io_uring/rsrc: use non-pcpu refcounts for nodes One problem with the current rsrc infra is that updates often generate lots of rsrc nodes, each carrying pcpu refs. That takes quite a lot of memory, especially if there is a stall, and takes lots of CPU cycles. The pcpu allocations alone take >50% of the CPU time with a naive benchmark updating files in a loop. Replace pcpu refs with normal refcounting. There is already a hot path avoiding atomics / refs, but following patches will further improve it. 
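To make the per-node cost concrete, the fragments below simply juxtapose removed and added lines from the diffs that follow: before, every node paid for a per-CPU counter allocation plus an explicit kill/exit teardown; afterwards the count is just a field in the node.

	/* before: one per-CPU counter array allocated per node */
	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
			    0, GFP_KERNEL)) {
		kfree(ref_node);
		return NULL;
	}
	...
	percpu_ref_kill(&rsrc_node->refs);	/* retire the current node */
	percpu_ref_exit(&ref_node->refs);	/* free the per-CPU counters */

	/* after: plain refcount, release runs when it drops to zero */
	refcount_set(&ref_node->refs, 1);
	...
	if (refcount_sub_and_test(nr, &node->refs))
		io_rsrc_node_ref_zero(node);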
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e9ed8a9457b331a26555ff9443afc64cdaab7247.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 15 +++++---------- io_uring/rsrc.h | 6 ++++-- 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 7a43aed8e395..f2da9e251e3f 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -155,7 +155,7 @@ void io_rsrc_refs_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; - percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); + refcount_add(IO_RSRC_REF_BATCH, &ctx->rsrc_node->refs); } static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) @@ -220,13 +220,11 @@ void io_wait_rsrc_data(struct io_rsrc_data *data) void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) { - percpu_ref_exit(&ref_node->refs); kfree(ref_node); } -static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) +__cold void io_rsrc_node_ref_zero(struct io_rsrc_node *node) { - struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); struct io_ring_ctx *ctx = node->rsrc_data->ctx; unsigned long flags; bool first_add = false; @@ -269,11 +267,7 @@ static struct io_rsrc_node *io_rsrc_node_alloc(void) if (!ref_node) return NULL; - if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, - 0, GFP_KERNEL)) { - kfree(ref_node); - return NULL; - } + refcount_set(&ref_node->refs, 1); INIT_LIST_HEAD(&ref_node->node); INIT_LIST_HEAD(&ref_node->rsrc_list); ref_node->done = false; @@ -298,7 +292,8 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, spin_unlock_irq(&ctx->rsrc_ref_lock); atomic_inc(&data_to_kill->refs); - percpu_ref_kill(&rsrc_node->refs); + /* put master ref */ + io_rsrc_put_node(rsrc_node, 1); ctx->rsrc_node = NULL; } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index f27f4975217d..1467b31843bc 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -37,7 +37,7 @@ struct io_rsrc_data { }; struct io_rsrc_node { - struct percpu_ref refs; + refcount_t refs; struct list_head node; struct list_head rsrc_list; struct io_rsrc_data *rsrc_data; @@ -54,6 +54,7 @@ struct io_mapped_ubuf { }; void io_rsrc_put_tw(struct callback_head *cb); +void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_put_work(struct work_struct *work); void io_rsrc_refs_refill(struct io_ring_ctx *ctx); void io_wait_rsrc_data(struct io_rsrc_data *data); @@ -109,7 +110,8 @@ int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, static inline void io_rsrc_put_node(struct io_rsrc_node *node, int nr) { - percpu_ref_put_many(&node->refs, nr); + if (refcount_sub_and_test(nr, &node->refs)) + io_rsrc_node_ref_zero(node); } static inline void io_req_put_rsrc(struct io_kiocb *req) -- cgit From 8e15c0e71b8ae64fb7163532860f8d608165281f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:46 +0100 Subject: io_uring/rsrc: keep cached refs per node We cache refs of the current node (i.e. ctx->rsrc_node) in ctx->rsrc_cached_refs. We'll be moving away from atomics, so move the cached refs in struct io_rsrc_node for now. It's a prep patch and shouldn't change anything in practise. 
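For scale: the cache is refilled in batches of IO_RSRC_REF_BATCH (defined as 100 in rsrc.c, visible where a later patch in this series deletes it), so the atomic refcount is touched roughly once per hundred charged requests rather than once per request. This patch only moves that cache from the ctx into the node itself.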
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9edc3669c1d71b06c2dca78b2b2b8bb9292738b9.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 15 +++++++++------ io_uring/rsrc.h | 16 +++++++++------- 2 files changed, 18 insertions(+), 13 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f2da9e251e3f..1e7c960737fd 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -36,9 +36,11 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, void io_rsrc_refs_drop(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - if (ctx->rsrc_cached_refs) { - io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); - ctx->rsrc_cached_refs = 0; + struct io_rsrc_node *node = ctx->rsrc_node; + + if (node && node->cached_refs) { + io_rsrc_put_node(node, node->cached_refs); + node->cached_refs = 0; } } @@ -151,11 +153,11 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo *slot = NULL; } -void io_rsrc_refs_refill(struct io_ring_ctx *ctx) +void io_rsrc_refs_refill(struct io_ring_ctx *ctx, struct io_rsrc_node *node) __must_hold(&ctx->uring_lock) { - ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; - refcount_add(IO_RSRC_REF_BATCH, &ctx->rsrc_node->refs); + node->cached_refs += IO_RSRC_REF_BATCH; + refcount_add(IO_RSRC_REF_BATCH, &node->refs); } static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) @@ -300,6 +302,7 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, if (!ctx->rsrc_node) { ctx->rsrc_node = ctx->rsrc_backup_node; ctx->rsrc_backup_node = NULL; + ctx->rsrc_node->cached_refs = 0; } } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 1467b31843bc..950535e2b9f4 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -43,6 +43,7 @@ struct io_rsrc_node { struct io_rsrc_data *rsrc_data; struct llist_node llist; bool done; + int cached_refs; }; struct io_mapped_ubuf { @@ -56,7 +57,7 @@ struct io_mapped_ubuf { void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_put_work(struct work_struct *work); -void io_rsrc_refs_refill(struct io_ring_ctx *ctx); +void io_rsrc_refs_refill(struct io_ring_ctx *ctx, struct io_rsrc_node *node); void io_wait_rsrc_data(struct io_rsrc_data *data); void io_rsrc_node_destroy(struct io_rsrc_node *ref_node); void io_rsrc_refs_drop(struct io_ring_ctx *ctx); @@ -128,17 +129,18 @@ static inline void io_req_put_rsrc_locked(struct io_kiocb *req, if (node) { if (node == ctx->rsrc_node) - ctx->rsrc_cached_refs++; + node->cached_refs++; else io_rsrc_put_node(node, 1); } } -static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx) +static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx, + struct io_rsrc_node *node) { - ctx->rsrc_cached_refs--; - if (unlikely(ctx->rsrc_cached_refs < 0)) - io_rsrc_refs_refill(ctx); + node->cached_refs--; + if (unlikely(node->cached_refs < 0)) + io_rsrc_refs_refill(ctx, node); } static inline void io_req_set_rsrc_node(struct io_kiocb *req, @@ -151,7 +153,7 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, lockdep_assert_held(&ctx->uring_lock); req->rsrc_node = ctx->rsrc_node; - io_charge_rsrc_node(ctx); + io_charge_rsrc_node(ctx, ctx->rsrc_node); io_ring_submit_unlock(ctx, issue_flags); } } -- cgit From 2ad4c6d08018e4eec130c29992028dc356ab2181 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:47 +0100 Subject: io_uring: don't put nodes under spinlocks io_req_put_rsrc() doesn't need any locking, so move 
it out of a spinlock section in __io_req_complete_post() and adjust helpers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d5b87a5f31270dade6805f7acafc4cc34b84b241.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 7 +++++-- io_uring/rsrc.h | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a0b64831c455..596af20cddb4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -970,6 +970,7 @@ bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 static void __io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + struct io_rsrc_node *rsrc_node = NULL; io_cq_lock(ctx); if (!(req->flags & REQ_F_CQE_SKIP)) @@ -990,7 +991,7 @@ static void __io_req_complete_post(struct io_kiocb *req) } io_put_kbuf_comp(req); io_dismantle_req(req); - io_req_put_rsrc(req); + rsrc_node = req->rsrc_node; /* * Selected buffer deallocation in io_clean_op() assumes that * we don't hold ->completion_lock. Clean them here to avoid @@ -1001,6 +1002,8 @@ static void __io_req_complete_post(struct io_kiocb *req) ctx->locked_free_nr++; } io_cq_unlock_post(ctx); + + io_put_rsrc_node(rsrc_node); } void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) @@ -1117,7 +1120,7 @@ __cold void io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - io_req_put_rsrc(req); + io_put_rsrc_node(req->rsrc_node); io_dismantle_req(req); io_put_task_remote(req->task, 1); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 950535e2b9f4..8164777279ba 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -115,10 +115,10 @@ static inline void io_rsrc_put_node(struct io_rsrc_node *node, int nr) io_rsrc_node_ref_zero(node); } -static inline void io_req_put_rsrc(struct io_kiocb *req) +static inline void io_put_rsrc_node(struct io_rsrc_node *node) { - if (req->rsrc_node) - io_rsrc_put_node(req->rsrc_node, 1); + if (node) + io_rsrc_put_node(node, 1); } static inline void io_req_put_rsrc_locked(struct io_kiocb *req, -- cgit From 03adabe81abb20221079b48343783b4327bd1186 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:48 +0100 Subject: io_uring: io_free_req() via tw io_free_req() is not often used but nevertheless problematic as there is no way to know the current context, it may be used from the submission path or even by an irq handler. Push it to a fresh context using task_work. 
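For background, here is a rough sketch of the generic task_work primitive that io_uring's io_req_task_work_add() path builds on; the function names below are illustrative, not from the patch. The point is that the callback runs later in the task's own process context, where it is safe to sleep and take ->uring_lock, which an irq handler cannot guarantee.

#include <linux/task_work.h>

/* runs later in the target task's context; may sleep, may take mutexes */
static void deferred_free_fn(struct callback_head *cb)
{
	/* e.g. container_of(cb, ...) back to the object and free it */
}

static int queue_deferred_free(struct task_struct *task,
			       struct callback_head *cb)
{
	init_task_work(cb, deferred_free_fn);
	/*
	 * Fails only if the task is exiting; io_uring handles that case by
	 * punting to a fallback workqueue (see io_fallback_req_func earlier
	 * in this log).
	 */
	return task_work_add(task, cb, TWA_SIGNAL);
}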
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3a92fe80bb068757e51aaa0b105cfbe8f5dfee9e.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 596af20cddb4..98320f4b0bca 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1116,7 +1116,7 @@ static inline void io_dismantle_req(struct io_kiocb *req) io_put_file(req->file); } -__cold void io_free_req(struct io_kiocb *req) +static __cold void io_free_req_tw(struct io_kiocb *req, struct io_tw_state *ts) { struct io_ring_ctx *ctx = req->ctx; @@ -1130,6 +1130,12 @@ __cold void io_free_req(struct io_kiocb *req) spin_unlock(&ctx->completion_lock); } +__cold void io_free_req(struct io_kiocb *req) +{ + req->io_task_work.func = io_free_req_tw; + io_req_task_work_add(req); +} + static void __io_req_find_next_prep(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; -- cgit From ef8ae64ffa9578c12e44de42604004c2cc3e9c27 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:49 +0100 Subject: io_uring/rsrc: protect node refs with uring_lock Currently, for nodes we have an atomic counter and some cached (non-atomic) refs protected by uring_lock. Let's put all ref manipulations under uring_lock and get rid of the atomic part. It's free as in all cases we care about we already hold the lock. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/25b142feed7d831008257d90c8b17c0115d4fc15.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 18 ++++++++++++------ io_uring/rsrc.c | 30 ++++-------------------------- io_uring/rsrc.h | 29 +++++------------------------ 3 files changed, 21 insertions(+), 56 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 98320f4b0bca..36a76c7b34f0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -967,7 +967,7 @@ bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 return true; } -static void __io_req_complete_post(struct io_kiocb *req) +static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_rsrc_node *rsrc_node = NULL; @@ -1003,7 +1003,11 @@ static void __io_req_complete_post(struct io_kiocb *req) } io_cq_unlock_post(ctx); - io_put_rsrc_node(rsrc_node); + if (rsrc_node) { + io_ring_submit_lock(ctx, issue_flags); + io_put_rsrc_node(rsrc_node); + io_ring_submit_unlock(ctx, issue_flags); + } } void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) @@ -1013,12 +1017,12 @@ void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) io_req_task_work_add(req); } else if (!(issue_flags & IO_URING_F_UNLOCKED) || !(req->ctx->flags & IORING_SETUP_IOPOLL)) { - __io_req_complete_post(req); + __io_req_complete_post(req, issue_flags); } else { struct io_ring_ctx *ctx = req->ctx; mutex_lock(&ctx->uring_lock); - __io_req_complete_post(req); + __io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED); mutex_unlock(&ctx->uring_lock); } } @@ -1120,7 +1124,10 @@ static __cold void io_free_req_tw(struct io_kiocb *req, struct io_tw_state *ts) { struct io_ring_ctx *ctx = req->ctx; - io_put_rsrc_node(req->rsrc_node); + if (req->rsrc_node) { + io_tw_lock(ctx, ts); + io_put_rsrc_node(req->rsrc_node); + } io_dismantle_req(req); io_put_task_remote(req->task, 1); @@ -2790,7 +2797,6 @@ static void 
io_req_caches_free(struct io_ring_ctx *ctx) static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); - io_rsrc_refs_drop(ctx); /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ io_wait_rsrc_data(ctx->buf_data); io_wait_rsrc_data(ctx->file_data); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 1e7c960737fd..1237fc77c250 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -27,23 +27,10 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage); -#define IO_RSRC_REF_BATCH 100 - /* only define max */ #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) -void io_rsrc_refs_drop(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - struct io_rsrc_node *node = ctx->rsrc_node; - - if (node && node->cached_refs) { - io_rsrc_put_node(node, node->cached_refs); - node->cached_refs = 0; - } -} - int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -153,13 +140,6 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo *slot = NULL; } -void io_rsrc_refs_refill(struct io_ring_ctx *ctx, struct io_rsrc_node *node) - __must_hold(&ctx->uring_lock) -{ - node->cached_refs += IO_RSRC_REF_BATCH; - refcount_add(IO_RSRC_REF_BATCH, &node->refs); -} - static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) { struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; @@ -225,7 +205,8 @@ void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) kfree(ref_node); } -__cold void io_rsrc_node_ref_zero(struct io_rsrc_node *node) +void io_rsrc_node_ref_zero(struct io_rsrc_node *node) + __must_hold(&node->rsrc_data->ctx->uring_lock) { struct io_ring_ctx *ctx = node->rsrc_data->ctx; unsigned long flags; @@ -269,7 +250,7 @@ static struct io_rsrc_node *io_rsrc_node_alloc(void) if (!ref_node) return NULL; - refcount_set(&ref_node->refs, 1); + ref_node->refs = 1; INIT_LIST_HEAD(&ref_node->node); INIT_LIST_HEAD(&ref_node->rsrc_list); ref_node->done = false; @@ -283,8 +264,6 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, WARN_ON_ONCE(!ctx->rsrc_backup_node); WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); - io_rsrc_refs_drop(ctx); - if (data_to_kill) { struct io_rsrc_node *rsrc_node = ctx->rsrc_node; @@ -295,14 +274,13 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, atomic_inc(&data_to_kill->refs); /* put master ref */ - io_rsrc_put_node(rsrc_node, 1); + io_put_rsrc_node(rsrc_node); ctx->rsrc_node = NULL; } if (!ctx->rsrc_node) { ctx->rsrc_node = ctx->rsrc_backup_node; ctx->rsrc_backup_node = NULL; - ctx->rsrc_node->cached_refs = 0; } } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 8164777279ba..a96103095f0f 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -37,13 +37,12 @@ struct io_rsrc_data { }; struct io_rsrc_node { - refcount_t refs; struct list_head node; struct list_head rsrc_list; struct io_rsrc_data *rsrc_data; struct llist_node llist; + int refs; bool done; - int cached_refs; }; struct io_mapped_ubuf { @@ -57,10 +56,8 @@ struct io_mapped_ubuf { void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_put_work(struct work_struct *work); -void io_rsrc_refs_refill(struct io_ring_ctx *ctx, struct io_rsrc_node *node); void io_wait_rsrc_data(struct io_rsrc_data *data); void io_rsrc_node_destroy(struct io_rsrc_node *ref_node); -void io_rsrc_refs_drop(struct io_ring_ctx *ctx); int 
io_rsrc_node_switch_start(struct io_ring_ctx *ctx); int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc); @@ -109,38 +106,22 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); -static inline void io_rsrc_put_node(struct io_rsrc_node *node, int nr) -{ - if (refcount_sub_and_test(nr, &node->refs)) - io_rsrc_node_ref_zero(node); -} - static inline void io_put_rsrc_node(struct io_rsrc_node *node) { - if (node) - io_rsrc_put_node(node, 1); + if (node && !--node->refs) + io_rsrc_node_ref_zero(node); } static inline void io_req_put_rsrc_locked(struct io_kiocb *req, struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) { - struct io_rsrc_node *node = req->rsrc_node; - - if (node) { - if (node == ctx->rsrc_node) - node->cached_refs++; - else - io_rsrc_put_node(node, 1); - } + io_put_rsrc_node(req->rsrc_node); } static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - node->cached_refs--; - if (unlikely(node->cached_refs < 0)) - io_rsrc_refs_refill(ctx, node); + node->refs++; } static inline void io_req_set_rsrc_node(struct io_kiocb *req, -- cgit From 0a4813b1abdf06e44ce60cdebfd374cfd27c46bf Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:50 +0100 Subject: io_uring/rsrc: kill rsrc_ref_lock We use ->rsrc_ref_lock spinlock to protect ->rsrc_ref_list in io_rsrc_node_ref_zero(). Now we removed pcpu refcounting, which means io_rsrc_node_ref_zero() is not executed from the irq context as an RCU callback anymore, and we also put it under ->uring_lock. io_rsrc_node_switch(), which queues up nodes into the list, is also protected by ->uring_lock, so we can safely get rid of ->rsrc_ref_lock. 
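With the spinlock gone, the rule is simply that all node ref manipulation happens under ->uring_lock. The previous patch already encodes that for sparse via the __must_hold() annotation on io_rsrc_node_ref_zero(); the runtime counterpart, were one to add it, would be a lockdep assertion at the top of that function (illustrative only, not part of the patch):

	lockdep_assert_held(&node->rsrc_data->ctx->uring_lock);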
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6b60af883c263551190b526a55ff2c9d5ae07141.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 1 - io_uring/rsrc.c | 5 ----- 2 files changed, 6 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 36a76c7b34f0..764df5694d73 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -325,7 +325,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); - spin_lock_init(&ctx->rsrc_ref_lock); INIT_LIST_HEAD(&ctx->rsrc_ref_list); INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); init_task_work(&ctx->rsrc_put_tw, io_rsrc_put_tw); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 1237fc77c250..e122b6e5f9c5 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -209,11 +209,9 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) __must_hold(&node->rsrc_data->ctx->uring_lock) { struct io_ring_ctx *ctx = node->rsrc_data->ctx; - unsigned long flags; bool first_add = false; unsigned long delay = HZ; - spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); node->done = true; /* if we are mid-quiesce then do not delay */ @@ -229,7 +227,6 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) list_del(&node->node); first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); } - spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); if (!first_add) return; @@ -268,9 +265,7 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_node *rsrc_node = ctx->rsrc_node; rsrc_node->rsrc_data = data_to_kill; - spin_lock_irq(&ctx->rsrc_ref_lock); list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); - spin_unlock_irq(&ctx->rsrc_ref_lock); atomic_inc(&data_to_kill->refs); /* put master ref */ -- cgit From c824986c113f15e2ef2c00da9a226c09ecaac74c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:51 +0100 Subject: io_uring/rsrc: rename rsrc_list We have too many "rsrc" around which makes the name of struct io_rsrc_node::rsrc_list confusing. The field is responsible for keeping a list of files or buffers, so call it item_list and add comments around. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3e34d4dfc1fdbb6b520f904ee6187c2ccf680efe.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 6 +++--- io_uring/rsrc.h | 8 +++++++- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index e122b6e5f9c5..10006fb169d2 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -146,7 +146,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) struct io_ring_ctx *ctx = rsrc_data->ctx; struct io_rsrc_put *prsrc, *tmp; - list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { + list_for_each_entry_safe(prsrc, tmp, &ref_node->item_list, list) { list_del(&prsrc->list); if (prsrc->tag) { @@ -249,7 +249,7 @@ static struct io_rsrc_node *io_rsrc_node_alloc(void) ref_node->refs = 1; INIT_LIST_HEAD(&ref_node->node); - INIT_LIST_HEAD(&ref_node->rsrc_list); + INIT_LIST_HEAD(&ref_node->item_list); ref_node->done = false; return ref_node; } @@ -737,7 +737,7 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, prsrc->tag = *tag_slot; *tag_slot = 0; prsrc->rsrc = rsrc; - list_add(&prsrc->list, &node->rsrc_list); + list_add(&prsrc->list, &node->item_list); return 0; } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index a96103095f0f..509a5ea7eabf 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -38,11 +38,17 @@ struct io_rsrc_data { struct io_rsrc_node { struct list_head node; - struct list_head rsrc_list; struct io_rsrc_data *rsrc_data; struct llist_node llist; int refs; bool done; + + /* + * Keeps a list of struct io_rsrc_put to be completed. Each entry + * represents one rsrc (e.g. file or buffer), but all of them should've + * came from the same table and so are of the same type. + */ + struct list_head item_list; }; struct io_mapped_ubuf { -- cgit From ff7c75ecaa9e6b251f76c24e289d4bfe413ffe31 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:52 +0100 Subject: io_uring/rsrc: optimise io_rsrc_put allocation Every io_rsrc_node keeps a list of items to put, and all entries are kmalloc()'ed. However, it's quite often to queue up only one entry per node, so let's add an inline entry there to avoid extra allocations. 
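The shape of the optimisation is generic: keep storage for the common single entry inside the owning object and only heap-allocate when a second entry shows up. A simplified, self-contained C sketch of that pattern (invented names, not the actual io_uring structures):

#include <errno.h>
#include <stdlib.h>

struct put_item {
        void *rsrc;
        struct put_item *next;
};

struct node {
        struct put_item inline_item;    /* covers the common one-entry case */
        int inline_used;
        struct put_item *overflow;      /* each extra entry is malloc()ed */
};

static int node_add_item(struct node *node, void *rsrc)
{
        struct put_item *item;

        if (!node->inline_used) {
                item = &node->inline_item;
                node->inline_used = 1;
        } else {
                item = malloc(sizeof(*item));
                if (!item)
                        return -ENOMEM;
                item->next = node->overflow;
                node->overflow = item;
        }
        item->rsrc = rsrc;
        return 0;
}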
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c482c1c652c45c85ac52e67c974bc758a50fed5f.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 51 ++++++++++++++++++++++++++++++++++----------------- io_uring/rsrc.h | 2 ++ 2 files changed, 36 insertions(+), 17 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 10006fb169d2..95e71300bb35 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -140,26 +140,34 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo *slot = NULL; } +static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data, + struct io_rsrc_put *prsrc) +{ + struct io_ring_ctx *ctx = rsrc_data->ctx; + + if (prsrc->tag) { + if (ctx->flags & IORING_SETUP_IOPOLL) { + mutex_lock(&ctx->uring_lock); + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); + mutex_unlock(&ctx->uring_lock); + } else { + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); + } + } + rsrc_data->do_put(ctx, prsrc); +} + static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) { struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; - struct io_ring_ctx *ctx = rsrc_data->ctx; struct io_rsrc_put *prsrc, *tmp; + if (ref_node->inline_items) + io_rsrc_put_work_one(rsrc_data, &ref_node->item); + list_for_each_entry_safe(prsrc, tmp, &ref_node->item_list, list) { list_del(&prsrc->list); - - if (prsrc->tag) { - if (ctx->flags & IORING_SETUP_IOPOLL) { - mutex_lock(&ctx->uring_lock); - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); - mutex_unlock(&ctx->uring_lock); - } else { - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); - } - } - - rsrc_data->do_put(ctx, prsrc); + io_rsrc_put_work_one(rsrc_data, prsrc); kfree(prsrc); } @@ -251,6 +259,7 @@ static struct io_rsrc_node *io_rsrc_node_alloc(void) INIT_LIST_HEAD(&ref_node->node); INIT_LIST_HEAD(&ref_node->item_list); ref_node->done = false; + ref_node->inline_items = 0; return ref_node; } @@ -729,15 +738,23 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, { u64 *tag_slot = io_get_tag_slot(data, idx); struct io_rsrc_put *prsrc; + bool inline_item = true; - prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); - if (!prsrc) - return -ENOMEM; + if (!node->inline_items) { + prsrc = &node->item; + node->inline_items++; + } else { + prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); + if (!prsrc) + return -ENOMEM; + inline_item = false; + } prsrc->tag = *tag_slot; *tag_slot = 0; prsrc->rsrc = rsrc; - list_add(&prsrc->list, &node->item_list); + if (!inline_item) + list_add(&prsrc->list, &node->item_list); return 0; } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 509a5ea7eabf..11703082d125 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -49,6 +49,8 @@ struct io_rsrc_node { * came from the same table and so are of the same type. */ struct list_head item_list; + struct io_rsrc_put item; + int inline_items; }; struct io_mapped_ubuf { -- cgit From 36b9818a5a84cb7c977fb723babca1c8d74f288f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:53 +0100 Subject: io_uring/rsrc: don't offload node free struct delayed_work rsrc_put_work was previously used to offload node freeing because io_rsrc_node_ref_zero() was previously called by RCU in the IRQ context. Now, as percpu refcounting is gone, we can do it eagerly at the spot without pushing it to a worker. 
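Conceptually the change is from "push the dead node onto an llist and let deferred work free it" to "reclaim it right where the reference count hits zero, since we now run in process context under the lock". A toy version of the new shape, with hypothetical names and an ordinary singly linked list standing in for the kernel structures:

#include <stdlib.h>

struct node {
        struct node *next;
        int done;
};

/* Called with the outer mutex held; 'node' is already linked somewhere on
 * *list. Nodes must be recycled in queue order, so only free from the head
 * while completed nodes sit at the front. */
static void node_ref_zero(struct node **list, struct node *node)
{
        node->done = 1;

        while (*list && (*list)->done) {
                struct node *head = *list;

                *list = head->next;
                free(head);     /* eager: no llist_add() + worker round trip */
        }
}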
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/13fb1aac1e8d068ad8fd4a0c6d0d157ab61b90c0.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 ------ io_uring/rsrc.c | 59 ++++------------------------------------------------- 2 files changed, 4 insertions(+), 61 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 764df5694d73..d6a0025afc31 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -326,9 +326,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); INIT_LIST_HEAD(&ctx->rsrc_ref_list); - INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); - init_task_work(&ctx->rsrc_put_tw, io_rsrc_put_tw); - init_llist_head(&ctx->rsrc_put_llist); init_llist_head(&ctx->work_llist); INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; @@ -2821,11 +2818,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_rsrc_node_destroy(ctx->rsrc_node); if (ctx->rsrc_backup_node) io_rsrc_node_destroy(ctx->rsrc_backup_node); - flush_delayed_work(&ctx->rsrc_put_work); - flush_delayed_work(&ctx->fallback_work); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); - WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist)); #if defined(CONFIG_UNIX) if (ctx->ring_sock) { diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 95e71300bb35..0f4e245dee1b 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -145,15 +145,8 @@ static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data, { struct io_ring_ctx *ctx = rsrc_data->ctx; - if (prsrc->tag) { - if (ctx->flags & IORING_SETUP_IOPOLL) { - mutex_lock(&ctx->uring_lock); - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); - mutex_unlock(&ctx->uring_lock); - } else { - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); - } - } + if (prsrc->tag) + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); rsrc_data->do_put(ctx, prsrc); } @@ -176,32 +169,6 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) complete(&rsrc_data->done); } -void io_rsrc_put_work(struct work_struct *work) -{ - struct io_ring_ctx *ctx; - struct llist_node *node; - - ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work); - node = llist_del_all(&ctx->rsrc_put_llist); - - while (node) { - struct io_rsrc_node *ref_node; - struct llist_node *next = node->next; - - ref_node = llist_entry(node, struct io_rsrc_node, llist); - __io_rsrc_put_work(ref_node); - node = next; - } -} - -void io_rsrc_put_tw(struct callback_head *cb) -{ - struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx, - rsrc_put_tw); - - io_rsrc_put_work(&ctx->rsrc_put_work.work); -} - void io_wait_rsrc_data(struct io_rsrc_data *data) { if (data && !atomic_dec_and_test(&data->refs)) @@ -217,34 +184,18 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) __must_hold(&node->rsrc_data->ctx->uring_lock) { struct io_ring_ctx *ctx = node->rsrc_data->ctx; - bool first_add = false; - unsigned long delay = HZ; node->done = true; - - /* if we are mid-quiesce then do not delay */ - if (node->rsrc_data->quiesce) - delay = 0; - while (!list_empty(&ctx->rsrc_ref_list)) { node = list_first_entry(&ctx->rsrc_ref_list, struct io_rsrc_node, node); /* recycle ref nodes in order */ if (!node->done) break; - list_del(&node->node); - first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); - } - if (!first_add) - return; - - if (ctx->submitter_task) { - if (!task_work_add(ctx->submitter_task, &ctx->rsrc_put_tw, - 
ctx->notify_method)) - return; + list_del(&node->node); + __io_rsrc_put_work(node); } - mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); } static struct io_rsrc_node *io_rsrc_node_alloc(void) @@ -320,13 +271,11 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, if (ret < 0) { atomic_inc(&data->refs); /* wait for all works potentially completing data->done */ - flush_delayed_work(&ctx->rsrc_put_work); reinit_completion(&data->done); mutex_lock(&ctx->uring_lock); break; } - flush_delayed_work(&ctx->rsrc_put_work); ret = wait_for_completion_interruptible(&data->done); if (!ret) { mutex_lock(&ctx->uring_lock); -- cgit From 9eae8655f9cd2eeed99fb7a0d2bb22816c17e497 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:54 +0100 Subject: io_uring/rsrc: cache struct io_rsrc_node Add allocation cache for struct io_rsrc_node, it's always allocated and put under ->uring_lock, so it doesn't need any extra synchronisation around caches. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/252a9d9ef9654e6467af30fdc02f57c0118fb76e.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 +++++++++-- io_uring/rsrc.c | 23 +++++++++++++++-------- io_uring/rsrc.h | 9 +++++++-- 3 files changed, 31 insertions(+), 12 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d6a0025afc31..419d6f42935f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -310,6 +310,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); + io_alloc_cache_init(&ctx->rsrc_node_cache, sizeof(struct io_rsrc_node)); io_alloc_cache_init(&ctx->apoll_cache, sizeof(struct async_poll)); io_alloc_cache_init(&ctx->netmsg_cache, sizeof(struct io_async_msghdr)); init_completion(&ctx->ref_comp); @@ -2790,6 +2791,11 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } +static void io_rsrc_node_cache_free(struct io_cache_entry *entry) +{ + kfree(container_of(entry, struct io_rsrc_node, cache)); +} + static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); @@ -2815,9 +2821,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) /* there are no registered resources left, nobody uses it */ if (ctx->rsrc_node) - io_rsrc_node_destroy(ctx->rsrc_node); + io_rsrc_node_destroy(ctx, ctx->rsrc_node); if (ctx->rsrc_backup_node) - io_rsrc_node_destroy(ctx->rsrc_backup_node); + io_rsrc_node_destroy(ctx, ctx->rsrc_backup_node); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); @@ -2829,6 +2835,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) #endif WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); + io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free); if (ctx->mm_account) { mmdrop(ctx->mm_account); ctx->mm_account = NULL; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 0f4e245dee1b..345631091d80 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -164,7 +164,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) kfree(prsrc); } - io_rsrc_node_destroy(ref_node); + io_rsrc_node_destroy(rsrc_data->ctx, ref_node); if (atomic_dec_and_test(&rsrc_data->refs)) complete(&rsrc_data->done); } @@ -175,9 +175,10 @@ void io_wait_rsrc_data(struct io_rsrc_data *data) wait_for_completion(&data->done); } -void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) +void 
io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - kfree(ref_node); + if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache)) + kfree(node); } void io_rsrc_node_ref_zero(struct io_rsrc_node *node) @@ -198,13 +199,19 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) } } -static struct io_rsrc_node *io_rsrc_node_alloc(void) +static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) { struct io_rsrc_node *ref_node; + struct io_cache_entry *entry; - ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); - if (!ref_node) - return NULL; + entry = io_alloc_cache_get(&ctx->rsrc_node_cache); + if (entry) { + ref_node = container_of(entry, struct io_rsrc_node, cache); + } else { + ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); + if (!ref_node) + return NULL; + } ref_node->refs = 1; INIT_LIST_HEAD(&ref_node->node); @@ -243,7 +250,7 @@ int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) { if (ctx->rsrc_backup_node) return 0; - ctx->rsrc_backup_node = io_rsrc_node_alloc(); + ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx); return ctx->rsrc_backup_node ? 0 : -ENOMEM; } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 11703082d125..3b9f4c57c47c 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -4,6 +4,8 @@ #include +#include "alloc_cache.h" + #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) #define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) #define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) @@ -37,8 +39,11 @@ struct io_rsrc_data { }; struct io_rsrc_node { + union { + struct io_cache_entry cache; + struct io_rsrc_data *rsrc_data; + }; struct list_head node; - struct io_rsrc_data *rsrc_data; struct llist_node llist; int refs; bool done; @@ -65,7 +70,7 @@ void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_put_work(struct work_struct *work); void io_wait_rsrc_data(struct io_rsrc_data *data); -void io_rsrc_node_destroy(struct io_rsrc_node *ref_node); +void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc); -- cgit From 1f2c8f610aa6c6a3dc3523f93eaf28c25051df6f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:55 +0100 Subject: io_uring/rsrc: add lockdep sanity checks We should hold ->uring_lock while putting nodes with io_put_rsrc_node(), add a lockdep check for that. 
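Outside the kernel the same kind of guard can be approximated with an owner-tracking mutex wrapper. The following is only a debug-aid sketch with made-up names, not lockdep, and its unlocked reads are fine for a best-effort assertion but not race-free; it just shows the intent of asserting "the caller holds the lock" inside a put helper:

#include <assert.h>
#include <pthread.h>

struct dbg_mutex {
        pthread_mutex_t mutex;
        pthread_t owner;
        int held;               /* written only by the owner, debug use only */
};

static void dbg_lock(struct dbg_mutex *lock)
{
        pthread_mutex_lock(&lock->mutex);
        lock->owner = pthread_self();
        lock->held = 1;
}

static void dbg_unlock(struct dbg_mutex *lock)
{
        lock->held = 0;
        pthread_mutex_unlock(&lock->mutex);
}

static void dbg_assert_held(struct dbg_mutex *lock)
{
        assert(lock->held && pthread_equal(lock->owner, pthread_self()));
}

/* The put path then documents and enforces its locking rule. */
static void put_node(struct dbg_mutex *uring_lock, int *refs)
{
        dbg_assert_held(uring_lock);
        (*refs)--;
}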
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b50d5f156ac41450029796738c1dfd22a521df7a.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- io_uring/rsrc.c | 2 +- io_uring/rsrc.h | 6 ++++-- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'io_uring')
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 419d6f42935f..da36fa1eeac9 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1002,7 +1002,7 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) if (rsrc_node) { io_ring_submit_lock(ctx, issue_flags); - io_put_rsrc_node(rsrc_node); + io_put_rsrc_node(ctx, rsrc_node); io_ring_submit_unlock(ctx, issue_flags); } } @@ -1123,7 +1123,7 @@ static __cold void io_free_req_tw(struct io_kiocb *req, struct io_tw_state *ts) if (req->rsrc_node) { io_tw_lock(ctx, ts); - io_put_rsrc_node(req->rsrc_node); + io_put_rsrc_node(ctx, req->rsrc_node); } io_dismantle_req(req); io_put_task_remote(req->task, 1);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 345631091d80..d4bca5e18434 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -236,7 +236,7 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, atomic_inc(&data_to_kill->refs); /* put master ref */ - io_put_rsrc_node(rsrc_node); + io_put_rsrc_node(ctx, rsrc_node); ctx->rsrc_node = NULL; }
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 3b9f4c57c47c..cf24c3fd701f 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -119,8 +119,10 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); -static inline void io_put_rsrc_node(struct io_rsrc_node *node) +static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { + lockdep_assert_held(&ctx->uring_lock); + if (node && !--node->refs) io_rsrc_node_ref_zero(node); } @@ -128,7 +130,7 @@ static inline void io_put_rsrc_node(struct io_rsrc_node *node) static inline void io_req_put_rsrc_locked(struct io_kiocb *req, struct io_ring_ctx *ctx) { - io_put_rsrc_node(req->rsrc_node); + io_put_rsrc_node(ctx, req->rsrc_node); } static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx,
-- cgit From 757ef4682b6aa29fdf752ad47f0d63eb48b261cf Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:56 +0100 Subject: io_uring/rsrc: optimise io_rsrc_data refcounting Every struct io_rsrc_node takes a struct io_rsrc_data reference, which means all rsrc updates do 2 extra atomics. Replace the atomic refcounting with a plain int, as it's all done under ->uring_lock.
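When every get and put already happens under one mutex, an atomic counter buys nothing. A minimal sketch of the resulting pattern (simplified C, invented names, not the io_uring types):

#include <pthread.h>
#include <stdbool.h>

struct data {
        pthread_mutex_t *uring_lock;    /* all refcount ops run under this */
        int refs;                       /* plain int, no atomics needed */
};

/* Both helpers must be called with *uring_lock held. */
static void data_get(struct data *data)
{
        data->refs++;
}

static bool data_put(struct data *data)
{
        return --data->refs == 0;       /* true: caller may complete/free */
}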
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e73c3d6820cf679532696d790b5b8fae23537213.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 30 ++++++++++++++++++------------ io_uring/rsrc.h | 2 +- 2 files changed, 19 insertions(+), 13 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d4bca5e18434..603a783a0383 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -31,6 +31,11 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) +static inline bool io_put_rsrc_data_ref(struct io_rsrc_data *rsrc_data) +{ + return !--rsrc_data->refs; +} + int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -165,13 +170,13 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) } io_rsrc_node_destroy(rsrc_data->ctx, ref_node); - if (atomic_dec_and_test(&rsrc_data->refs)) + if (io_put_rsrc_data_ref(rsrc_data)) complete(&rsrc_data->done); } void io_wait_rsrc_data(struct io_rsrc_data *data) { - if (data && !atomic_dec_and_test(&data->refs)) + if (data && !io_put_rsrc_data_ref(data)) wait_for_completion(&data->done); } @@ -234,7 +239,7 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, rsrc_node->rsrc_data = data_to_kill; list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); - atomic_inc(&data_to_kill->refs); + data_to_kill->refs++; /* put master ref */ io_put_rsrc_node(ctx, rsrc_node); ctx->rsrc_node = NULL; @@ -267,8 +272,8 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, return ret; io_rsrc_node_switch(ctx, data); - /* kill initial ref, already quiesced if zero */ - if (atomic_dec_and_test(&data->refs)) + /* kill initial ref */ + if (io_put_rsrc_data_ref(data)) return 0; data->quiesce = true; @@ -276,17 +281,19 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, do { ret = io_run_task_work_sig(ctx); if (ret < 0) { - atomic_inc(&data->refs); - /* wait for all works potentially completing data->done */ - reinit_completion(&data->done); mutex_lock(&ctx->uring_lock); + if (!data->refs) { + ret = 0; + } else { + /* restore the master reference */ + data->refs++; + } break; } - ret = wait_for_completion_interruptible(&data->done); if (!ret) { mutex_lock(&ctx->uring_lock); - if (atomic_read(&data->refs) <= 0) + if (!data->refs) break; /* * it has been revived by another thread while @@ -361,6 +368,7 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, data->nr = nr; data->ctx = ctx; data->do_put = do_put; + data->refs = 1; if (utags) { ret = -EFAULT; for (i = 0; i < nr; i++) { @@ -371,8 +379,6 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, goto fail; } } - - atomic_set(&data->refs, 1); init_completion(&data->done); *pdata = data; return 0; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index cf24c3fd701f..7ab9b2b2e757 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -33,8 +33,8 @@ struct io_rsrc_data { u64 **tags; unsigned int nr; rsrc_put_fn *do_put; - atomic_t refs; struct completion done; + int refs; bool quiesce; }; -- cgit From 69bbc6ade9d9d4e3c556cb83e77b6f3cd9ad3d18 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:57 +0100 Subject: io_uring/rsrc: add custom limit for node caching The number of entries in the rsrc node cache is limited to 512, which still seems unnecessarily large. 
Add per-cache thresholds and set it to 32 for the rsrc node cache. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d0cd538b944dac0bf878e276fc0199f21e6bccea.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/alloc_cache.h | 6 ++++-- io_uring/io_uring.c | 9 ++++++--- io_uring/rsrc.h | 2 ++ 3 files changed, 12 insertions(+), 5 deletions(-) (limited to 'io_uring')
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 2fbecaa3a1ba..851a527afb5e 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -13,7 +13,7 @@ struct io_cache_entry { static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, struct io_cache_entry *entry) { - if (cache->nr_cached < IO_ALLOC_CACHE_MAX) { + if (cache->nr_cached < cache->max_cached) { cache->nr_cached++; wq_stack_add_head(&entry->node, &cache->list); /* KASAN poisons object */ @@ -38,10 +38,12 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c return NULL; } -static inline void io_alloc_cache_init(struct io_alloc_cache *cache, size_t size) +static inline void io_alloc_cache_init(struct io_alloc_cache *cache, + unsigned max_nr, size_t size) { cache->list.next = NULL; cache->nr_cached = 0; + cache->max_cached = max_nr; cache->elem_size = size; }
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index da36fa1eeac9..ae90d2753e0d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -310,9 +310,12 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); - io_alloc_cache_init(&ctx->rsrc_node_cache, sizeof(struct io_rsrc_node)); - io_alloc_cache_init(&ctx->apoll_cache, sizeof(struct async_poll)); - io_alloc_cache_init(&ctx->netmsg_cache, sizeof(struct io_async_msghdr)); + io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, + sizeof(struct io_rsrc_node)); + io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct async_poll)); + io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_async_msghdr)); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock);
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 7ab9b2b2e757..8729f2fee256 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -6,6 +6,8 @@ #include "alloc_cache.h" +#define IO_NODE_ALLOC_CACHE_MAX 32 + #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) #define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) #define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
-- cgit From 758d5d64b619ddbbf96a5605d8d5a919aafaafab Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 5 Apr 2023 08:21:45 -0600 Subject: io_uring/uring_cmd: assign ioucmd->cmd at async prep time Rather than check this in the issue fast path, it makes more sense to just assign the copy of the data when we're setting it up anyway. This makes the code a bit cleaner, and removes the need for this check in the issue path.
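The pattern generalises: copy submission state into request-owned storage once, at prep time, and repoint the pointer at the stable copy, so the issue path never has to branch on whether async data exists. A hedged sketch with made-up names, not the uring_cmd code:

#include <errno.h>
#include <stdlib.h>
#include <string.h>

struct cmd {
        const void *payload;    /* initially points at transient SQE memory */
        void *async_data;       /* request-owned copy, valid for the cmd lifetime */
};

static int cmd_prep_async(struct cmd *cmd, size_t size)
{
        cmd->async_data = malloc(size);
        if (!cmd->async_data)
                return -ENOMEM;
        memcpy(cmd->async_data, cmd->payload, size);
        cmd->payload = cmd->async_data; /* issue path just uses ->payload */
        return 0;
}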
Reviewed-by: Gabriel Krisman Bertazi Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 3d825d939b13..f7a96bc76ea1 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -73,6 +73,7 @@ int io_uring_cmd_prep_async(struct io_kiocb *req) cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); memcpy(req->async_data, ioucmd->cmd, cmd_size); + ioucmd->cmd = req->async_data; return 0; } @@ -129,9 +130,6 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) WRITE_ONCE(ioucmd->cookie, NULL); } - if (req_has_async_data(req)) - ioucmd->cmd = req->async_data; - ret = file->f_op->uring_cmd(ioucmd, issue_flags); if (ret == -EAGAIN) { if (!req_has_async_data(req)) { -- cgit From ab1c590f5c9b96d8d8843d351aed72469f8f2ef0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Apr 2023 14:20:07 +0100 Subject: io_uring: move pinning out of io_req_local_work_add Move ctx pinning from io_req_local_work_add() to the caller, looks better and makes working with the code a bit easier. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/49c0dbed390b0d6d04cb942dd3592879fd5bfb1b.1680782017.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ae90d2753e0d..29a0516ee5ce 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1306,17 +1306,15 @@ static void io_req_local_work_add(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - percpu_ref_get(&ctx->refs); - if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) - goto put_ref; + return; /* needed for the following wake up */ smp_mb__after_atomic(); if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) { io_move_task_work_from_local(ctx); - goto put_ref; + return; } if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) @@ -1326,9 +1324,6 @@ static void io_req_local_work_add(struct io_kiocb *req) if (READ_ONCE(ctx->cq_waiting)) wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); - -put_ref: - percpu_ref_put(&ctx->refs); } void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) @@ -1337,7 +1332,9 @@ void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) struct io_ring_ctx *ctx = req->ctx; if (allow_local && ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + percpu_ref_get(&ctx->refs); io_req_local_work_add(req); + percpu_ref_put(&ctx->refs); return; } -- cgit From d73a572df24661851465c821d33c03e70e4b68e5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Apr 2023 14:20:08 +0100 Subject: io_uring: optimize local tw add ctx pinning We currently pin the ctx for io_req_local_work_add() with percpu_ref_get/put, which implies two rcu_read_lock/unlock pairs and some extra overhead on top in the fast path. Replace it with a pure rcu read and let io_ring_exit_work() synchronise against it. 
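Real RCU keeps the read side essentially free, which a few lines of portable C cannot reproduce; what can be sketched is the synchronisation contract, where fast-path users mark themselves visible and teardown waits them out instead of each user taking a reference. The sketch below (invented names, deliberately not RCU) still pays two atomics on the read side, i.e. exactly the overhead the patch removes, so it only illustrates the waiting contract, not the performance win:

#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int active;       /* in-flight fast-path users */
static atomic_bool dying;       /* set once teardown starts */

static bool work_add_enter(void)
{
        atomic_fetch_add(&active, 1);
        if (atomic_load(&dying)) {      /* teardown already started, back off */
                atomic_fetch_sub(&active, 1);
                return false;
        }
        return true;
}

static void work_add_exit(void)
{
        atomic_fetch_sub(&active, 1);
}

/* Teardown: forbid new entries, then wait for stragglers to drain; this
 * plays the role synchronize_rcu() plays in the patch. */
static void exit_work(void)
{
        atomic_store(&dying, true);
        while (atomic_load(&active))
                sched_yield();
}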
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cbdfcb6b232627f30e9e50ef91f13c4f05910247.1680782017.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 29a0516ee5ce..fb7215b543cd 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1332,9 +1332,9 @@ void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) struct io_ring_ctx *ctx = req->ctx; if (allow_local && ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - percpu_ref_get(&ctx->refs); + rcu_read_lock(); io_req_local_work_add(req); - percpu_ref_put(&ctx->refs); + rcu_read_unlock(); return; } @@ -3052,6 +3052,10 @@ static __cold void io_ring_exit_work(struct work_struct *work) spin_lock(&ctx->completion_lock); spin_unlock(&ctx->completion_lock); + /* pairs with RCU read section in io_req_local_work_add() */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + synchronize_rcu(); + io_ring_ctx_free(ctx); } -- cgit From 6e7248adf8f7adb5e36ec1e91efcc85a83bf8aeb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Apr 2023 14:20:09 +0100 Subject: io_uring: refactor io_cqring_wake() Instead of smp_mb() + __io_cqring_wake() in __io_cq_unlock_post_flush() use equivalent io_cqring_wake(). With that we can clean it up further and remove __io_cqring_wake(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/662ee5d898168ac206be06038525e97b64072a46.1680782017.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 ++---- io_uring/io_uring.h | 11 ++--------- 2 files changed, 4 insertions(+), 13 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fb7215b543cd..d4ac62de2113 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -640,10 +640,8 @@ static inline void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) * it will re-check the wakeup conditions once we return we can safely * skip waking it up. */ - if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) { - smp_mb(); - __io_cqring_wake(ctx); - } + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + io_cqring_wake(ctx); } void io_cq_unlock_post(struct io_ring_ctx *ctx) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 193b2db39fe8..24d8196bbca3 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -228,8 +228,7 @@ static inline void io_poll_wq_wake(struct io_ring_ctx *ctx) poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); } -/* requires smb_mb() prior, see wq_has_sleeper() */ -static inline void __io_cqring_wake(struct io_ring_ctx *ctx) +static inline void io_cqring_wake(struct io_ring_ctx *ctx) { /* * Trigger waitqueue handler on all waiters on our waitqueue. This @@ -241,17 +240,11 @@ static inline void __io_cqring_wake(struct io_ring_ctx *ctx) * waitqueue handlers, we know we have a dependency between eventfd or * epoll and should terminate multishot poll at that point. 
*/ - if (waitqueue_active(&ctx->cq_wait)) + if (wq_has_sleeper(&ctx->cq_wait)) __wake_up(&ctx->cq_wait, TASK_NORMAL, 0, poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); } -static inline void io_cqring_wake(struct io_ring_ctx *ctx) -{ - smp_mb(); - __io_cqring_wake(ctx); -} - static inline bool io_sqring_full(struct io_ring_ctx *ctx) { struct io_rings *r = ctx->rings; -- cgit From 8501fe70ae9855076ffb03a3670e02a7b3437304 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Apr 2023 14:20:10 +0100 Subject: io_uring: add tw add flags We pass 'allow_local' into io_req_task_work_add() but will need more flags. Replace it with a flags bit field and name this allow_local flag. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4c0f01e7ef4e6feebfb199093cc995af7a19befa.1680782017.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 7 ++++--- io_uring/io_uring.h | 9 +++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d4ac62de2113..6f175fe682e4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1324,12 +1324,13 @@ static void io_req_local_work_add(struct io_kiocb *req) wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); } -void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) +void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; - if (allow_local && ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + if (!(flags & IOU_F_TWQ_FORCE_NORMAL) && + (ctx->flags & IORING_SETUP_DEFER_TASKRUN)) { rcu_read_lock(); io_req_local_work_add(req); rcu_read_unlock(); @@ -1359,7 +1360,7 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) io_task_work.node); node = node->next; - __io_req_task_work_add(req, false); + __io_req_task_work_add(req, IOU_F_TWQ_FORCE_NORMAL); } } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 24d8196bbca3..cb4309a2acdc 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -15,6 +15,11 @@ #include #endif +enum { + /* don't use deferred task_work */ + IOU_F_TWQ_FORCE_NORMAL = 1, +}; + enum { IOU_OK = 0, IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, @@ -48,7 +53,7 @@ static inline bool io_req_ffs_set(struct io_kiocb *req) return req->flags & REQ_F_FIXED_FILE; } -void __io_req_task_work_add(struct io_kiocb *req, bool allow_local); +void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); @@ -93,7 +98,7 @@ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, static inline void io_req_task_work_add(struct io_kiocb *req) { - __io_req_task_work_add(req, true); + __io_req_task_work_add(req, 0); } #define io_for_each_link(pos, head) \ -- cgit From 5150940079a3ce94d7474f6f5b0d6276569dc1de Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Apr 2023 14:20:11 +0100 Subject: io_uring: inline llist_add() We'll need to grab some information from the previous request in the tw list, inline llist_add(), it'll be used in the following patch. 
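Open-coding the push is what exposes the previous head to the caller. In portable C11 the same lock-free push looks like the following sketch (generic names, not the kernel's llist implementation):

#include <stdatomic.h>
#include <stddef.h>

struct lnode {
        struct lnode *next;
};

/* Push onto a lock-free singly linked list and return the previous head,
 * so the caller can both detect "was empty" and inspect the old first
 * entry, which is what the follow-up patch needs. */
static struct lnode *list_push(_Atomic(struct lnode *) *head, struct lnode *node)
{
        struct lnode *first = atomic_load(head);

        do {
                node->next = first;
        } while (!atomic_compare_exchange_weak(head, &first, node));

        return first;   /* NULL means the list was previously empty */
}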
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f0165493af7b379943c792114b972f331e7d7d10.1680782017.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6f175fe682e4..786ecfa01c54 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1303,8 +1303,15 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx) static void io_req_local_work_add(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + struct llist_node *first; - if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) + first = READ_ONCE(ctx->work_llist.first); + do { + req->io_task_work.node.next = first; + } while (!try_cmpxchg(&ctx->work_llist.first, &first, + &req->io_task_work.node)); + + if (first) return; /* needed for the following wake up */ -- cgit From 8751d15426a31baaf40f7570263c27c3e5d1dc44 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Apr 2023 14:20:12 +0100 Subject: io_uring: reduce scheduling due to tw Every task_work will try to wake the task to be executed, which causes excessive scheduling and additional overhead. For some tw it's justified, but others won't do much but post a single CQE. When a task waits for multiple cqes, every such task_work will wake it up. Instead, the task may give a hint about how many cqes it waits for, io_req_local_work_add() will compare against it and skip wake ups if #cqes + #tw is not enough to satisfy the waiting condition. Task_work that uses the optimisation should be simple enough and never post more than one CQE. It's also ignored for non DEFER_TASKRUN rings. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d2b77e99d1e86624d8a69f7037d764b739dcd225.1680782017.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 68 ++++++++++++++++++++++++++++++++++++----------------- io_uring/io_uring.h | 9 +++++++ io_uring/notif.c | 2 +- io_uring/notif.h | 2 +- io_uring/rw.c | 2 +- 5 files changed, 59 insertions(+), 24 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 786ecfa01c54..8a327a81beaf 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1300,35 +1300,59 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx) } } -static void io_req_local_work_add(struct io_kiocb *req) +static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { struct io_ring_ctx *ctx = req->ctx; + unsigned nr_wait, nr_tw, nr_tw_prev; struct llist_node *first; + if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) + flags &= ~IOU_F_TWQ_LAZY_WAKE; + first = READ_ONCE(ctx->work_llist.first); do { + nr_tw_prev = 0; + if (first) { + struct io_kiocb *first_req = container_of(first, + struct io_kiocb, + io_task_work.node); + /* + * Might be executed at any moment, rely on + * SLAB_TYPESAFE_BY_RCU to keep it alive. 
+ */ + nr_tw_prev = READ_ONCE(first_req->nr_tw); + } + nr_tw = nr_tw_prev + 1; + /* Large enough to fail the nr_wait comparison below */ + if (!(flags & IOU_F_TWQ_LAZY_WAKE)) + nr_tw = -1U; + + req->nr_tw = nr_tw; req->io_task_work.node.next = first; } while (!try_cmpxchg(&ctx->work_llist.first, &first, &req->io_task_work.node)); - if (first) - return; - - /* needed for the following wake up */ - smp_mb__after_atomic(); - - if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) { - io_move_task_work_from_local(ctx); - return; + if (!first) { + if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) { + io_move_task_work_from_local(ctx); + return; + } + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + if (ctx->has_evfd) + io_eventfd_signal(ctx); } - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (ctx->has_evfd) - io_eventfd_signal(ctx); - - if (READ_ONCE(ctx->cq_waiting)) - wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); + nr_wait = atomic_read(&ctx->cq_wait_nr); + /* no one is waiting */ + if (!nr_wait) + return; + /* either not enough or the previous add has already woken it up */ + if (nr_wait > nr_tw || nr_tw_prev >= nr_wait) + return; + /* pairs with set_current_state() in io_cqring_wait() */ + smp_mb__after_atomic(); + wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); } void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) @@ -1339,7 +1363,7 @@ void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) if (!(flags & IOU_F_TWQ_FORCE_NORMAL) && (ctx->flags & IORING_SETUP_DEFER_TASKRUN)) { rcu_read_lock(); - io_req_local_work_add(req); + io_req_local_work_add(req, flags); rcu_read_unlock(); return; } @@ -2625,7 +2649,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, unsigned long check_cq; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - WRITE_ONCE(ctx->cq_waiting, 1); + int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); + + atomic_set(&ctx->cq_wait_nr, nr_wait); set_current_state(TASK_INTERRUPTIBLE); } else { prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, @@ -2634,7 +2660,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, ret = io_cqring_wait_schedule(ctx, &iowq); __set_current_state(TASK_RUNNING); - WRITE_ONCE(ctx->cq_waiting, 0); + atomic_set(&ctx->cq_wait_nr, 0); if (ret < 0) break; @@ -4517,7 +4543,7 @@ static int __init io_uring_init(void) io_uring_optable_init(); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | - SLAB_ACCOUNT); + SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); return 0; }; __initcall(io_uring_init); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index cb4309a2acdc..ef449e43d493 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -18,6 +18,15 @@ enum { /* don't use deferred task_work */ IOU_F_TWQ_FORCE_NORMAL = 1, + + /* + * A hint to not wake right away but delay until there are enough of + * tw's queued to match the number of CQEs the task is waiting for. + * + * Must not be used wirh requests generating more than one CQE. + * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. 
+ */ + IOU_F_TWQ_LAZY_WAKE = 2, }; enum { diff --git a/io_uring/notif.c b/io_uring/notif.c index 172105eb347d..e1846a25dde1 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -31,7 +31,7 @@ static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, struct io_kiocb *notif = cmd_to_io_kiocb(nd); if (refcount_dec_and_test(&uarg->refcnt)) - io_req_task_work_add(notif); + __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); } static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg, diff --git a/io_uring/notif.h b/io_uring/notif.h index c88c800cd89d..6dd1b30a468f 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -33,7 +33,7 @@ static inline void io_notif_flush(struct io_kiocb *notif) /* drop slot's master ref */ if (refcount_dec_and_test(&nd->uarg.refcnt)) - io_req_task_work_add(notif); + __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); } static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len) diff --git a/io_uring/rw.c b/io_uring/rw.c index f14868624f41..6c7d2654770e 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -304,7 +304,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res) return; io_req_set_res(req, io_fixup_rw_res(req, res), 0); req->io_task_work.func = io_req_rw_complete; - io_req_task_work_add(req); + __io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) -- cgit From c66ae3ec38f946edb1776d25c1c8cd63803b8ec3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Apr 2023 14:20:13 +0100 Subject: io_uring: refactor __io_cq_unlock_post_flush() Separate ->task_complete path in __io_cq_unlock_post_flush(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/baa9b8d822f024e4ee01c40209dbbe38d9c8c11d.1680782017.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8a327a81beaf..0ea50c46f27f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -627,21 +627,23 @@ static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) io_cqring_wake(ctx); } -static inline void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) +static void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { io_commit_cqring(ctx); - __io_cq_unlock(ctx); - io_commit_cqring_flush(ctx); - /* - * As ->task_complete implies that the ring is single tasked, cq_wait - * may only be waited on by the current in io_cqring_wait(), but since - * it will re-check the wakeup conditions once we return we can safely - * skip waking it up. - */ - if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + if (ctx->task_complete) { + /* + * ->task_complete implies that only current might be waiting + * for CQEs, and obviously, we currently don't. No one is + * waiting, wakeups are futile, skip them. + */ + io_commit_cqring_flush(ctx); + } else { + __io_cq_unlock(ctx); + io_commit_cqring_flush(ctx); io_cqring_wake(ctx); + } } void io_cq_unlock_post(struct io_ring_ctx *ctx) -- cgit From 360cd42c4e95ff06d8d7b0a54e42236c7e7c187f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Apr 2023 14:20:14 +0100 Subject: io_uring: optimise io_req_local_work_add Chains of memory accesses are never good for performance. 
The req->task->io_uring->in_cancel in io_req_local_work_add() is there so that when a task is exiting via io_uring_try_cancel_requests() and starts waiting for completions, it gets woken up by every new task_work item queued. Do a little trick by announcing waiting in io_uring_try_cancel_requests(), making io_req_local_work_add() wake us up. We also need to check for deferred tw items after prepare_to_wait(TASK_INTERRUPTIBLE); Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/fb11597e9bbcb365901824f8c5c2cf0d6ee100d0.1680782017.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0ea50c46f27f..9bbf58297a0e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1335,10 +1335,6 @@ static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) &req->io_task_work.node)); if (!first) { - if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) { - io_move_task_work_from_local(ctx); - return; - } if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (ctx->has_evfd) @@ -3205,6 +3201,12 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, enum io_wq_cancel cret; bool ret = false; + /* set it so io_req_local_work_add() would wake us up */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 1); + smp_mb(); + } + /* failed during ring init, it couldn't have issued any requests */ if (!ctx->rings) return false; @@ -3259,6 +3261,8 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) { struct io_uring_task *tctx = current->io_uring; struct io_ring_ctx *ctx; + struct io_tctx_node *node; + unsigned long index; s64 inflight; DEFINE_WAIT(wait); @@ -3280,9 +3284,6 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) break; if (!sqd) { - struct io_tctx_node *node; - unsigned long index; - xa_for_each(&tctx->xa, index, node) { /* sqpoll task will cancel all its requests */ if (node->ctx->sq_data) @@ -3305,7 +3306,13 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); io_run_task_work(); io_uring_drop_tctx_refs(current); - + xa_for_each(&tctx->xa, index, node) { + if (!llist_empty(&node->ctx->work_llist)) { + WARN_ON_ONCE(node->ctx->submitter_task && + node->ctx->submitter_task != current); + goto end_wait; + } + } /* * If we've seen completions, retry without waiting. This * avoids a race where a completion comes in before we did @@ -3313,6 +3320,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) */ if (inflight == tctx_inflight(tctx, !cancel_all)) schedule(); +end_wait: finish_wait(&tctx->wait, &wait); } while (1); -- cgit From 27a67079c0e548d5c3232c40951517cfa630fe51 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 12 Apr 2023 12:07:36 -0600 Subject: io_uring/uring_cmd: take advantage of completion batching We know now what the completion context is for the uring_cmd completion handling, so use that to have io_req_task_complete() decide what the best way to complete the request is. This allows batching of the posted completions if we have multiple pending, rather than always doing them one-by-one. 
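The win comes from turning "post one CQE immediately" into "append to the batch that will be flushed once the current locked section ends". A rough standalone sketch of that shape, with hypothetical names and a printf standing in for writing a real CQE:

#include <stdbool.h>
#include <stdio.h>

struct batch {
        int results[64];
        int nr;
};

static void post_cqe(int res)
{
        printf("cqe: %d\n", res);       /* stand-in for a real CQE write */
}

static void batch_flush(struct batch *batch)
{
        for (int i = 0; i < batch->nr; i++)
                post_cqe(batch->results[i]);
        batch->nr = 0;
}

/* If the completion runs in the locked/task context, defer into the batch;
 * otherwise fall back to posting right away. */
static void complete(struct batch *batch, int res, bool locked)
{
        if (locked && batch->nr < 64)
                batch->results[batch->nr++] = res;
        else
                post_cqe(res);
}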
Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index f7a96bc76ea1..5113c9a48583 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -54,11 +54,15 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2, io_req_set_res(req, ret, 0); if (req->ctx->flags & IORING_SETUP_CQE32) io_req_set_cqe32_extra(req, res2, 0); - if (req->ctx->flags & IORING_SETUP_IOPOLL) + if (req->ctx->flags & IORING_SETUP_IOPOLL) { /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); - else - io_req_complete_post(req, issue_flags); + } else { + struct io_tw_state ts = { + .locked = !(issue_flags & IO_URING_F_UNLOCKED), + }; + io_req_task_complete(req, &ts); + } } EXPORT_SYMBOL_GPL(io_uring_cmd_done); -- cgit From 8b1df11f97333d6d8647f1c6c0554eb2d9774396 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Apr 2023 12:06:01 +0100 Subject: io_uring: shut io_prep_async_work warning io_uring/io_uring.c:432 io_prep_async_work() error: we previously assumed 'req->file' could be null (see line 425). Even though it's a false positive as there will not be REQ_F_ISREG set without a file, let's add a simple check to make the kernel test robot happy. We don't care about performance here, but assumingly it'll be optimised out by the compiler. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a6cfbe92c74b789c0b4f046f7f98d19b1ca2e5b7.1681210788.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9bbf58297a0e..b171c26d331d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -425,7 +425,7 @@ static void io_prep_async_work(struct io_kiocb *req) if (req->file && !io_req_ffs_set(req)) req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT; - if (req->flags & REQ_F_ISREG) { + if (req->file && (req->flags & REQ_F_ISREG)) { bool should_hash = def->hash_reg_file; /* don't serialize this request if the fs doesn't need it */ -- cgit From ceac766a5581e4e671ec8e5236b8fdaed8e4c8c9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Apr 2023 12:06:02 +0100 Subject: io_uring/kbuf: remove extra ->buf_ring null check The kernel test robot complains about __io_remove_buffers(). io_uring/kbuf.c:221 __io_remove_buffers() warn: variable dereferenced before check 'bl->buf_ring' (see line 219) That check is not needed as ->buf_ring will always be set, so we can remove it and so silence the warning. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9a632bbf749d9d911e605255652ce08d18e7d2c6.1681210788.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'io_uring') diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 79c25459e8de..0905c1761fba 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -218,14 +218,12 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (bl->is_mapped) { i = bl->buf_ring->tail - bl->head; if (bl->is_mmap) { - if (bl->buf_ring) { - struct page *page; - - page = virt_to_head_page(bl->buf_ring); - if (put_page_testzero(page)) - free_compound_page(page); - bl->buf_ring = NULL; - } + struct page *page; + + page = virt_to_head_page(bl->buf_ring); + if (put_page_testzero(page)) + free_compound_page(page); + bl->buf_ring = NULL; bl->is_mmap = 0; } else if (bl->buf_nr_pages) { int j; -- cgit From 8ce4269eeedc5b31f5817f610b42cba8be8fa9de Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Apr 2023 12:06:03 +0100 Subject: io_uring: add irq lockdep checks We don't post CQEs from the IRQ context, add a check catching that. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f23f7a24dbe8027b3d37873fece2b6488f878b31.1681210788.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'io_uring') diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index ef449e43d493..25515d69d205 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -94,6 +94,8 @@ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, #define io_lockdep_assert_cq_locked(ctx) \ do { \ + lockdep_assert(in_task()); \ + \ if (ctx->flags & IORING_SETUP_IOPOLL) { \ lockdep_assert_held(&ctx->uring_lock); \ } else if (!ctx->task_complete) { \ -- cgit From 786788a8cfe03056e9c7b1c6e418c1db92a0ce80 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Apr 2023 12:06:04 +0100 Subject: io_uring/rsrc: add lockdep checks Add a lockdep chek to make sure that file and buffer updates hold ->uring_lock. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/961bbe6e433ec9bc0375127f23468b37b729df99.1681210788.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 603a783a0383..24e4e2109549 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -534,6 +534,8 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, __u32 tmp; int err; + lockdep_assert_held(&ctx->uring_lock); + if (check_add_overflow(up->offset, nr_args, &tmp)) return -EOVERFLOW; err = io_rsrc_node_switch_start(ctx); -- cgit From 528407b1e0ea51260fff2cc8b669c632a65d7a09 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Apr 2023 12:06:05 +0100 Subject: io_uring/rsrc: consolidate node caching We store one pre-allocated rsrc node in ->rsrc_backup_node, merge it with ->rsrc_node_cache. 
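With the backup pointer gone, the "can't fail later" guarantee comes from topping the cache up in advance. A compact sketch of that reserve-then-pop pattern (generic C, invented names, not the io_uring alloc cache):

#include <errno.h>
#include <stdlib.h>

struct entry {
        struct entry *next;
};

struct cache {
        struct entry *free_list;
};

/* May fail; called at a point where returning -ENOMEM is still easy. */
static int cache_reserve(struct cache *cache)
{
        struct entry *entry;

        if (cache->free_list)
                return 0;
        entry = calloc(1, sizeof(*entry));
        if (!entry)
                return -ENOMEM;
        entry->next = cache->free_list;
        cache->free_list = entry;
        return 0;
}

/* Cannot fail if cache_reserve() succeeded earlier. */
static struct entry *cache_get(struct cache *cache)
{
        struct entry *entry = cache->free_list;

        cache->free_list = entry->next;
        return entry;
}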
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6d5410e51ccd29be7a716be045b51d6b371baef6.1681210788.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/alloc_cache.h | 5 +++++ io_uring/io_uring.c | 2 -- io_uring/rsrc.c | 20 +++++++++++--------- 3 files changed, 16 insertions(+), 11 deletions(-) (limited to 'io_uring') diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 851a527afb5e..241245cb54a6 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -23,6 +23,11 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, return false; } +static inline bool io_alloc_cache_empty(struct io_alloc_cache *cache) +{ + return !cache->list.next; +} + static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) { if (cache->list.next) { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b171c26d331d..075bae8a2bb1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2852,8 +2852,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) /* there are no registered resources left, nobody uses it */ if (ctx->rsrc_node) io_rsrc_node_destroy(ctx, ctx->rsrc_node); - if (ctx->rsrc_backup_node) - io_rsrc_node_destroy(ctx, ctx->rsrc_backup_node); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 24e4e2109549..73f9e10d9bf0 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -230,7 +230,7 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill) __must_hold(&ctx->uring_lock) { - WARN_ON_ONCE(!ctx->rsrc_backup_node); + WARN_ON_ONCE(io_alloc_cache_empty(&ctx->rsrc_node_cache)); WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); if (data_to_kill) { @@ -245,18 +245,20 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, ctx->rsrc_node = NULL; } - if (!ctx->rsrc_node) { - ctx->rsrc_node = ctx->rsrc_backup_node; - ctx->rsrc_backup_node = NULL; - } + if (!ctx->rsrc_node) + ctx->rsrc_node = io_rsrc_node_alloc(ctx); } int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) { - if (ctx->rsrc_backup_node) - return 0; - ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx); - return ctx->rsrc_backup_node ? 0 : -ENOMEM; + if (io_alloc_cache_empty(&ctx->rsrc_node_cache)) { + struct io_rsrc_node *node = kzalloc(sizeof(*node), GFP_KERNEL); + + if (!node) + return -ENOMEM; + io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache); + } + return 0; } __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, -- cgit From 13c223962eac16f161cf9b6355209774c609af28 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Apr 2023 12:06:06 +0100 Subject: io_uring/rsrc: zero node's rsrc data on alloc struct io_rsrc_node::rsrc_data field is initialised on rsrc removal and shouldn't be used before that, still let's play safe and zero the field on alloc. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/09bd03cedc8da8a7974c5e6e4bf0489fd16593ab.1681210788.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 73f9e10d9bf0..329cc3851dfd 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -218,6 +218,7 @@ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) return NULL; } + ref_node->rsrc_data = NULL; ref_node->refs = 1; INIT_LIST_HEAD(&ref_node->node); INIT_LIST_HEAD(&ref_node->item_list); -- cgit From 2933ae6eaa05e8db6ad33a3ca12af18d2a25358c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Apr 2023 12:06:07 +0100 Subject: io_uring/rsrc: refactor io_rsrc_node_switch We use io_rsrc_node_switch() coupled with io_rsrc_node_switch_start() for a bunch of cases including initialising ctx->rsrc_node, i.e. by passing NULL instead of rsrc_data. Leave it to only deal with actual node changing. For that, first remove it from io_uring_create() and add a function allocating the first node. Then also remove all calls to io_rsrc_node_switch() from files/buffers register as we already have a node installed and it does essentially nothing. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d146fe306ff98b1a5a60c997c252534f03d423d7.1681210788.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 ++--- io_uring/rsrc.c | 36 +++++++++++------------------------- io_uring/rsrc.h | 7 +++++++ 3 files changed, 20 insertions(+), 28 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 075bae8a2bb1..9083a8466ebf 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3881,11 +3881,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ret = io_sq_offload_create(ctx, p); if (ret) goto err; - /* always set a rsrc node */ - ret = io_rsrc_node_switch_start(ctx); + + ret = io_rsrc_init(ctx); if (ret) goto err; - io_rsrc_node_switch(ctx, NULL); memset(&p->sq_off, 0, sizeof(p->sq_off)); p->sq_off.head = offsetof(struct io_rings, sq.head); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 329cc3851dfd..f2c660ffea74 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -204,7 +204,7 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) } } -static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) { struct io_rsrc_node *ref_node; struct io_cache_entry *entry; @@ -231,23 +231,18 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill) __must_hold(&ctx->uring_lock) { - WARN_ON_ONCE(io_alloc_cache_empty(&ctx->rsrc_node_cache)); - WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); + struct io_rsrc_node *node = ctx->rsrc_node; + struct io_rsrc_node *backup = io_rsrc_node_alloc(ctx); - if (data_to_kill) { - struct io_rsrc_node *rsrc_node = ctx->rsrc_node; - - rsrc_node->rsrc_data = data_to_kill; - list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); - - data_to_kill->refs++; - /* put master ref */ - io_put_rsrc_node(ctx, rsrc_node); - ctx->rsrc_node = NULL; - } + if (WARN_ON_ONCE(!backup)) + return; - if (!ctx->rsrc_node) - ctx->rsrc_node = io_rsrc_node_alloc(ctx); + data_to_kill->refs++; + node->rsrc_data = data_to_kill; + list_add_tail(&node->node, &ctx->rsrc_ref_list); + /* put master ref */ + io_put_rsrc_node(ctx, node); + ctx->rsrc_node = backup; } int 
io_rsrc_node_switch_start(struct io_ring_ctx *ctx) @@ -921,9 +916,6 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -EMFILE; if (nr_args > rlimit(RLIMIT_NOFILE)) return -EMFILE; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, &ctx->file_data); if (ret) @@ -978,7 +970,6 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, /* default it to the whole table */ io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files); - io_rsrc_node_switch(ctx, NULL); return 0; fail: __io_sqe_files_unregister(ctx); @@ -1260,9 +1251,6 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return -EBUSY; if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) return -EINVAL; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); if (ret) return ret; @@ -1300,8 +1288,6 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, ctx->buf_data = data; if (ret) __io_sqe_buffers_unregister(ctx); - else - io_rsrc_node_switch(ctx, NULL); return ret; } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 8729f2fee256..17dfe180208f 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -74,6 +74,7 @@ void io_rsrc_put_work(struct work_struct *work); void io_wait_rsrc_data(struct io_rsrc_data *data); void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc); void io_rsrc_node_switch(struct io_ring_ctx *ctx, @@ -164,6 +165,12 @@ static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) return &data->tags[table_idx][off]; } +static inline int io_rsrc_init(struct io_ring_ctx *ctx) +{ + ctx->rsrc_node = io_rsrc_node_alloc(ctx); + return ctx->rsrc_node ? 0 : -ENOMEM; +} + int io_files_update(struct io_kiocb *req, unsigned int issue_flags); int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -- cgit From d581076b6a85c6f8308a4ba2bdcd82651f5183df Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Apr 2023 12:06:08 +0100 Subject: io_uring/rsrc: extract SCM file put helper SCM file accounting is a slow path and is only used for UNIX files. Extract a helper out of io_rsrc_file_put() that does the SCM unaccounting. 
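Condensed from the hunk below (the diff is authoritative): the common case stays a likely() branch plus fput(), while the rare SCM bookkeeping for files accounted to the ring's UNIX socket moves into a __cold out-of-line helper.

static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
        struct file *file = prsrc->file;

        if (likely(!io_file_need_scm(file)))
                fput(file);                      /* hot path, stays inline */
        else
                io_rsrc_file_scm_put(ctx, file); /* rare UNIX-socket case, __cold */
}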
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/58cc7bffc2ee96bec8c2b89274a51febcbfa5556.1681210788.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f2c660ffea74..11058e20bdcc 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -832,20 +832,14 @@ int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) return 0; } -static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) +static __cold void io_rsrc_file_scm_put(struct io_ring_ctx *ctx, struct file *file) { - struct file *file = prsrc->file; #if defined(CONFIG_UNIX) struct sock *sock = ctx->ring_sock->sk; struct sk_buff_head list, *head = &sock->sk_receive_queue; struct sk_buff *skb; int i; - if (!io_file_need_scm(file)) { - fput(file); - return; - } - __skb_queue_head_init(&list); /* @@ -895,11 +889,19 @@ static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) __skb_queue_tail(head, skb); spin_unlock_irq(&head->lock); } -#else - fput(file); #endif } +static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) +{ + struct file *file = prsrc->file; + + if (likely(!io_file_need_scm(file))) + fput(file); + else + io_rsrc_file_scm_put(ctx, file); +} + int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags) { -- cgit From 519760df251bf2dcafb0af23df0229096537e78a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 15 Apr 2023 14:20:08 +0100 Subject: io_uring/notif: add constant for ubuf_info flags Add a constant IO_NOTIF_UBUF_FLAGS for struct ubuf_info flags that notifications use. That should minimise merge conflicts for planned changes touching both io_uring and net at the same time. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/notif.c | 2 +- io_uring/notif.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/notif.c b/io_uring/notif.c index e1846a25dde1..d3e703c37aba 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -79,7 +79,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) notif->io_task_work.func = io_req_task_complete; nd = io_notif_to_data(notif); - nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; + nd->uarg.flags = IO_NOTIF_UBUF_FLAGS; nd->uarg.callback = io_tx_ubuf_callback; refcount_set(&nd->uarg.refcnt, 1); return notif; diff --git a/io_uring/notif.h b/io_uring/notif.h index 6dd1b30a468f..86d32bd9f856 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -7,6 +7,7 @@ #include "rsrc.h" +#define IO_NOTIF_UBUF_FLAGS (SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN) #define IO_NOTIF_SPLICE_BATCH 32 struct io_notif_data { -- cgit From 953c37e066f05a3dca2d74643574b8dfe8a83983 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:05 +0100 Subject: io_uring/rsrc: use nospec'ed indexes We use array_index_nospec() for registered buffer indexes, but don't use it while poking into rsrc tags, fix that. 
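As background, array_index_nospec() clamps an index to [0, size) even under branch misspeculation, so a user-controlled index cannot be turned into a Spectre-v1 gadget. A generic usage sketch with placeholder names (not the io_uring code itself):

#include <linux/nospec.h>

/* 'idx' is user-controlled, 'nr' is the number of valid entries */
static u64 read_tag(u64 *tags, unsigned int nr, unsigned int idx)
{
        if (idx >= nr)
                return 0;
        idx = array_index_nospec(idx, nr);      /* clamp before the dependent load */
        return tags[idx];
}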
Fixes: 634d00df5e1cf ("io_uring: add full-fledged dynamic buffers support") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f02fafc5a9c0dd69be2b0618c38831c078232ff0.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 11058e20bdcc..3c1538b8c8f4 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -517,7 +517,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, } ctx->user_bufs[i] = imu; - *io_get_tag_slot(ctx->buf_data, offset) = tag; + *io_get_tag_slot(ctx->buf_data, i) = tag; } if (needs_switch) -- cgit From c732ea242d565c8281c4b017929fc62a246d81b9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:06 +0100 Subject: io_uring/rsrc: remove io_rsrc_node::done Kill io_rsrc_node::done and check refs instead, it's set when the node's refcount hits zero, and it won't change afterwards. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bbde361f4010f7e8bf196f1ecca27a763b79926f.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 5 +---- io_uring/rsrc.h | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 3c1538b8c8f4..5fc9d10743e0 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -191,14 +191,12 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) { struct io_ring_ctx *ctx = node->rsrc_data->ctx; - node->done = true; while (!list_empty(&ctx->rsrc_ref_list)) { node = list_first_entry(&ctx->rsrc_ref_list, struct io_rsrc_node, node); /* recycle ref nodes in order */ - if (!node->done) + if (node->refs) break; - list_del(&node->node); __io_rsrc_put_work(node); } @@ -222,7 +220,6 @@ struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) ref_node->refs = 1; INIT_LIST_HEAD(&ref_node->node); INIT_LIST_HEAD(&ref_node->item_list); - ref_node->done = false; ref_node->inline_items = 0; return ref_node; } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 17dfe180208f..88adcb0b7963 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -48,7 +48,6 @@ struct io_rsrc_node { struct list_head node; struct llist_node llist; int refs; - bool done; /* * Keeps a list of struct io_rsrc_put to be completed. Each entry -- cgit From eef81fcaa61e1bc6b7735be65f41bbf1a8efd133 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:07 +0100 Subject: io_uring/rsrc: refactor io_rsrc_ref_quiesce Refactor io_rsrc_ref_quiesce() by moving the first mutex_unlock(), so we don't have to have a second mutex_unlock() further in the loop. It prepares us for the next patch.
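Condensed shape of the loop after this refactor (the full hunk follows; the error path is elided): the lock is dropped at the top of every iteration and retaken before the exit condition is rechecked, so a single mutex_unlock() suffices.

data->quiesce = true;
do {
        mutex_unlock(&ctx->uring_lock);
        ret = io_run_task_work_sig(ctx);
        if (ret < 0) {
                mutex_lock(&ctx->uring_lock);
                /* ... error handling unchanged ... */
                break;
        }
        wait_for_completion_interruptible(&data->done);
        mutex_lock(&ctx->uring_lock);
        ret = 0;
} while (data->refs);
data->quiesce = false;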
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/65bc876271fb16bf550a53a4c76c91aacd94e52e.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 5fc9d10743e0..d7e7528f7159 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -272,8 +272,8 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, return 0; data->quiesce = true; - mutex_unlock(&ctx->uring_lock); do { + mutex_unlock(&ctx->uring_lock); ret = io_run_task_work_sig(ctx); if (ret < 0) { mutex_lock(&ctx->uring_lock); @@ -285,18 +285,10 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, } break; } - ret = wait_for_completion_interruptible(&data->done); - if (!ret) { - mutex_lock(&ctx->uring_lock); - if (!data->refs) - break; - /* - * it has been revived by another thread while - * we were unlocked - */ - mutex_unlock(&ctx->uring_lock); - } - } while (1); + wait_for_completion_interruptible(&data->done); + mutex_lock(&ctx->uring_lock); + ret = 0; + } while (data->refs); data->quiesce = false; return ret; -- cgit From 4ea15b56f0810f0d8795d475db1bb74b3a7c1b2f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:08 +0100 Subject: io_uring/rsrc: use wq for quiescing Replace completions with waitqueues for rsrc data quiesce, the main wakeup condition is when data refs hit zero. Note that data refs are only changes under ->uring_lock, so we prepare before mutex_unlock() reacquire it after taking the lock back. This change will be needed in the next patch. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1d0dbc74b3b4fd67c8f01819e680c5e0da252956.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 1 + io_uring/rsrc.c | 18 ++++++++++++------ io_uring/rsrc.h | 1 - 3 files changed, 13 insertions(+), 7 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9083a8466ebf..3c1c8c788b7b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -321,6 +321,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); init_waitqueue_head(&ctx->poll_wq); + init_waitqueue_head(&ctx->rsrc_quiesce_wq); spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d7e7528f7159..f9ce4076c73d 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -158,6 +158,7 @@ static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data, static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) { struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; + struct io_ring_ctx *ctx = rsrc_data->ctx; struct io_rsrc_put *prsrc, *tmp; if (ref_node->inline_items) @@ -171,13 +172,13 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) io_rsrc_node_destroy(rsrc_data->ctx, ref_node); if (io_put_rsrc_data_ref(rsrc_data)) - complete(&rsrc_data->done); + wake_up_all(&ctx->rsrc_quiesce_wq); } void io_wait_rsrc_data(struct io_rsrc_data *data) { - if (data && !io_put_rsrc_data_ref(data)) - wait_for_completion(&data->done); + if (data) + WARN_ON_ONCE(!io_put_rsrc_data_ref(data)); } void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) @@ -257,6 +258,7 @@ int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) __cold static int 
io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx) { + DEFINE_WAIT(we); int ret; /* As we may drop ->uring_lock, other task may have started quiesce */ @@ -273,7 +275,9 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, data->quiesce = true; do { + prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE); mutex_unlock(&ctx->uring_lock); + ret = io_run_task_work_sig(ctx); if (ret < 0) { mutex_lock(&ctx->uring_lock); @@ -285,12 +289,15 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, } break; } - wait_for_completion_interruptible(&data->done); + + schedule(); + __set_current_state(TASK_RUNNING); mutex_lock(&ctx->uring_lock); ret = 0; } while (data->refs); - data->quiesce = false; + finish_wait(&ctx->rsrc_quiesce_wq, &we); + data->quiesce = false; return ret; } @@ -366,7 +373,6 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, goto fail; } } - init_completion(&data->done); *pdata = data; return 0; fail: diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 88adcb0b7963..d93ba4e9742a 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -35,7 +35,6 @@ struct io_rsrc_data { u64 **tags; unsigned int nr; rsrc_put_fn *do_put; - struct completion done; int refs; bool quiesce; }; -- cgit From 7d481e0356334eb2de254414769b4bed4b2a8827 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:09 +0100 Subject: io_uring/rsrc: fix DEFER_TASKRUN rsrc quiesce For io_rsrc_ref_quiesce() to progress it should execute all task_work items, including deferred ones. However, currently nobody would wake us, and so let's set ctx->cq_wait_nr, so io_req_local_work_add() would wake us up. Fixes: c0e0d6ba25f18 ("io_uring: add IORING_SETUP_DEFER_TASKRUN") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f1a90d1bc5ebf096475b018fed52e54f3b89d4af.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f9ce4076c73d..e634ef384724 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -273,6 +273,11 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, if (io_put_rsrc_data_ref(data)) return 0; + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 1); + smp_mb(); + } + data->quiesce = true; do { prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE); @@ -298,6 +303,10 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, finish_wait(&ctx->rsrc_quiesce_wq, &we); data->quiesce = false; + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 0); + smp_mb(); + } return ret; } -- cgit From 0b222eeb6514ba6c3457b667fa4f3645032e1fc9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:10 +0100 Subject: io_uring/rsrc: remove rsrc_data refs Instead of waiting for rsrc_data->refs to be downed to zero, check whether there are rsrc nodes queued for completion, that's easier then maintaining references. 
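Taken together with the waitqueue conversion two commits up, the quiesce path is now an instance of the classic open-coded wait loop. A generic sketch of that idiom, with placeholder wq/condition names and signal handling elided (not the exact io_uring code):

DEFINE_WAIT(wait);

for (;;) {
        prepare_to_wait(&wq, &wait, TASK_INTERRUPTIBLE);
        if (condition)          /* here: list_empty(&ctx->rsrc_ref_list) */
                break;
        schedule();             /* woken by wake_up_all(&wq) on the put side */
}
finish_wait(&wq, &wait);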
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8e33fd143d83e11af3e386aea28eb6d6c6a1be10.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- io_uring/rsrc.c | 32 ++++++++------------------------ io_uring/rsrc.h | 2 -- 3 files changed, 10 insertions(+), 28 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3c1c8c788b7b..3d43df8f1e4e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2831,8 +2831,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ - io_wait_rsrc_data(ctx->buf_data); - io_wait_rsrc_data(ctx->file_data); + if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list))) + return; mutex_lock(&ctx->uring_lock); if (ctx->buf_data) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index e634ef384724..5415a18844e0 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -31,11 +31,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) -static inline bool io_put_rsrc_data_ref(struct io_rsrc_data *rsrc_data) -{ - return !--rsrc_data->refs; -} - int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -158,7 +153,6 @@ static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data, static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) { struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; - struct io_ring_ctx *ctx = rsrc_data->ctx; struct io_rsrc_put *prsrc, *tmp; if (ref_node->inline_items) @@ -171,14 +165,6 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) } io_rsrc_node_destroy(rsrc_data->ctx, ref_node); - if (io_put_rsrc_data_ref(rsrc_data)) - wake_up_all(&ctx->rsrc_quiesce_wq); -} - -void io_wait_rsrc_data(struct io_rsrc_data *data) -{ - if (data) - WARN_ON_ONCE(!io_put_rsrc_data_ref(data)); } void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) @@ -201,6 +187,8 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) list_del(&node->node); __io_rsrc_put_work(node); } + if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce)) + wake_up_all(&ctx->rsrc_quiesce_wq); } struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) @@ -235,7 +223,6 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, if (WARN_ON_ONCE(!backup)) return; - data_to_kill->refs++; node->rsrc_data = data_to_kill; list_add_tail(&node->node, &ctx->rsrc_ref_list); /* put master ref */ @@ -269,8 +256,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, return ret; io_rsrc_node_switch(ctx, data); - /* kill initial ref */ - if (io_put_rsrc_data_ref(data)) + if (list_empty(&ctx->rsrc_ref_list)) return 0; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { @@ -278,6 +264,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, smp_mb(); } + ctx->rsrc_quiesce++; data->quiesce = true; do { prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE); @@ -286,12 +273,8 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, ret = io_run_task_work_sig(ctx); if (ret < 0) { mutex_lock(&ctx->uring_lock); - if (!data->refs) { + if (list_empty(&ctx->rsrc_ref_list)) ret = 0; - } else { - /* restore the master reference */ - data->refs++; - } break; } @@ -299,10 +282,12 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, 
__set_current_state(TASK_RUNNING); mutex_lock(&ctx->uring_lock); ret = 0; - } while (data->refs); + } while (!list_empty(&ctx->rsrc_ref_list)); finish_wait(&ctx->rsrc_quiesce_wq, &we); data->quiesce = false; + ctx->rsrc_quiesce--; + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { atomic_set(&ctx->cq_wait_nr, 0); smp_mb(); @@ -371,7 +356,6 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, data->nr = nr; data->ctx = ctx; data->do_put = do_put; - data->refs = 1; if (utags) { ret = -EFAULT; for (i = 0; i < nr; i++) { diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index d93ba4e9742a..5dd2fcb28069 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -35,7 +35,6 @@ struct io_rsrc_data { u64 **tags; unsigned int nr; rsrc_put_fn *do_put; - int refs; bool quiesce; }; @@ -69,7 +68,6 @@ struct io_mapped_ubuf { void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_put_work(struct work_struct *work); -void io_wait_rsrc_data(struct io_rsrc_data *data); void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); -- cgit From 2f2af35f8e5a1ed552ed02e47277d50092a2b9f6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:11 +0100 Subject: io_uring/rsrc: inline switch_start fast path Inline the part of io_rsrc_node_switch_start() that checks whether the cache is empty or not, as most of the times it will have some number of entries in there. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9619c1717a0e01f22c5fce2f1ba2735f804da0f2.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 12 +++++------- io_uring/rsrc.h | 9 ++++++++- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 5415a18844e0..bfa0b382c6c6 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -230,15 +230,13 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, ctx->rsrc_node = backup; } -int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) +int __io_rsrc_node_switch_start(struct io_ring_ctx *ctx) { - if (io_alloc_cache_empty(&ctx->rsrc_node_cache)) { - struct io_rsrc_node *node = kzalloc(sizeof(*node), GFP_KERNEL); + struct io_rsrc_node *node = kzalloc(sizeof(*node), GFP_KERNEL); - if (!node) - return -ENOMEM; - io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache); - } + if (!node) + return -ENOMEM; + io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache); return 0; } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 5dd2fcb28069..732496afed4c 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -69,7 +69,7 @@ void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_put_work(struct work_struct *work); void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); -int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); +int __io_rsrc_node_switch_start(struct io_ring_ctx *ctx); struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc); @@ -111,6 +111,13 @@ static inline int io_scm_file_account(struct io_ring_ctx *ctx, return __io_scm_file_account(ctx, file); } +static inline int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) +{ + if (unlikely(io_alloc_cache_empty(&ctx->rsrc_node_cache))) 
+ return __io_rsrc_node_switch_start(ctx); + return 0; +} + int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args); int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, -- cgit From 9a57fffedc0ee078418a7793ab29cd3864205340 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:12 +0100 Subject: io_uring/rsrc: clean up __io_sqe_buffers_update() Inline offset variable, so we don't use it without subjecting it to array_index_nospec() first. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/77936d9ed23755588810c5eafcea7e1c3b90e3cd.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index bfa0b382c6c6..38f0c9ce67a7 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -469,7 +469,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, for (done = 0; done < nr_args; done++) { struct io_mapped_ubuf *imu; - int offset = up->offset + done; u64 tag = 0; err = io_copy_iov(ctx, &iov, iovs, done); @@ -490,7 +489,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, if (err) break; - i = array_index_nospec(offset, ctx->nr_user_bufs); + i = array_index_nospec(up->offset + done, ctx->nr_user_bufs); if (ctx->user_bufs[i] != ctx->dummy_ubuf) { err = io_queue_rsrc_removal(ctx->buf_data, i, ctx->rsrc_node, ctx->user_bufs[i]); -- cgit From c87fd583f3b5ef770af33893394ea37c7a10b5b8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:13 +0100 Subject: io_uring/rsrc: simplify single file node switching At maximum io_install_fixed_file() removes only one file, so no need to keep needs_switch state and we can call io_rsrc_node_switch() right after removal. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/37cfb46f46160f81dced79f646e97db608994574.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/filetable.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'io_uring') diff --git a/io_uring/filetable.c b/io_uring/filetable.c index b80614e7d605..6255fa255ae2 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -64,7 +64,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, u32 slot_index) __must_hold(&req->ctx->uring_lock) { - bool needs_switch = false; struct io_fixed_file *file_slot; int ret; @@ -83,16 +82,17 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, ret = io_rsrc_node_switch_start(ctx); if (ret) - goto err; + return ret; old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); ret = io_queue_rsrc_removal(ctx->file_data, slot_index, ctx->rsrc_node, old_file); if (ret) - goto err; + return ret; + file_slot->file_ptr = 0; io_file_bitmap_clear(&ctx->file_table, slot_index); - needs_switch = true; + io_rsrc_node_switch(ctx, ctx->file_data); } ret = io_scm_file_account(ctx, file); @@ -101,9 +101,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, io_fixed_file_set(file_slot, file); io_file_bitmap_set(&ctx->file_table, slot_index); } -err: - if (needs_switch) - io_rsrc_node_switch(ctx, ctx->file_data); return ret; } -- cgit From c899a5d7d0eca054546b63e95c94b1e609516f84 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:14 +0100 Subject: io_uring/rsrc: refactor io_queue_rsrc_removal We can queue up a rsrc into a list in io_queue_rsrc_removal() while allocating io_rsrc_put and so simplify the function. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/36bd708ee25c0e2e7992dc19b17db166eea9ac40.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 38f0c9ce67a7..db58a51d19da 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -685,7 +685,6 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, { u64 *tag_slot = io_get_tag_slot(data, idx); struct io_rsrc_put *prsrc; - bool inline_item = true; if (!node->inline_items) { prsrc = &node->item; @@ -694,14 +693,12 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); if (!prsrc) return -ENOMEM; - inline_item = false; + list_add(&prsrc->list, &node->item_list); } prsrc->tag = *tag_slot; *tag_slot = 0; prsrc->rsrc = rsrc; - if (!inline_item) - list_add(&prsrc->list, &node->item_list); return 0; } -- cgit From 2e6f45ac0e640bbd49296adfa0982c84f85fa342 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 18 Apr 2023 14:06:34 +0100 Subject: io_uring/rsrc: remove unused io_rsrc_node::llist ->llist was needed for rsrc node destruction offload, which is removed now. Get rid of the unused field. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8e7d764c3f947489fde88d0927c3060d2e1bb599.1681822823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 732496afed4c..525905a30a55 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -44,7 +44,6 @@ struct io_rsrc_node { struct io_rsrc_data *rsrc_data; }; struct list_head node; - struct llist_node llist; int refs; /* -- cgit From 63fea89027ff4fd4f350b471ad5b9220d373eec5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 18 Apr 2023 14:06:35 +0100 Subject: io_uring/rsrc: infer node from ctx on io_queue_rsrc_removal For io_queue_rsrc_removal() we should always use the current active rsrc node, don't pass it directly but let the function grab it from the context. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d15939b4afea730978b4925685c2577538b823bb.1681822823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/filetable.c | 5 ++--- io_uring/rsrc.c | 9 +++++---- io_uring/rsrc.h | 3 +-- 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'io_uring') diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 6255fa255ae2..367a44a6c8c5 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -85,8 +85,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, return ret; old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, slot_index, - ctx->rsrc_node, old_file); + ret = io_queue_rsrc_removal(ctx->file_data, slot_index, old_file); if (ret) return ret; @@ -163,7 +162,7 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) return -EBADF; file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); + ret = io_queue_rsrc_removal(ctx->file_data, offset, file); if (ret) return ret; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index db58a51d19da..3be483de613e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -409,7 +409,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (file_slot->file_ptr) { file = (struct file *)(file_slot->file_ptr & FFS_MASK); - err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file); + err = io_queue_rsrc_removal(data, i, file); if (err) break; file_slot->file_ptr = 0; @@ -492,7 +492,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, i = array_index_nospec(up->offset + done, ctx->nr_user_bufs); if (ctx->user_bufs[i] != ctx->dummy_ubuf) { err = io_queue_rsrc_removal(ctx->buf_data, i, - ctx->rsrc_node, ctx->user_bufs[i]); + ctx->user_bufs[i]); if (unlikely(err)) { io_buffer_unmap(ctx, &imu); break; @@ -680,9 +680,10 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, - struct io_rsrc_node *node, void *rsrc) +int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc) { + struct io_ring_ctx *ctx = data->ctx; + struct io_rsrc_node *node = ctx->rsrc_node; u64 *tag_slot = io_get_tag_slot(data, idx); struct io_rsrc_put *prsrc; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 525905a30a55..8ed3e6a65cf6 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -70,8 +70,7 @@ void io_rsrc_put_work(struct work_struct *work); void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); int 
__io_rsrc_node_switch_start(struct io_ring_ctx *ctx); struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); -int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, - struct io_rsrc_node *node, void *rsrc); +int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc); void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill); -- cgit From c376644fb915fbdea8c4a04f859d032a8be352fd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 18 Apr 2023 14:06:36 +0100 Subject: io_uring/rsrc: merge nodes and io_rsrc_put struct io_rsrc_node carries a number of resources represented by struct io_rsrc_put. That was handy before for sync overhead ammortisation, but all complexity is gone and nodes are simple and lightweight. Let's allocate a separate node for each resource. Nodes and io_rsrc_put and not much different in size, and former are cached, so node allocation should work better. That also removes some overhead for nested iteration in io_rsrc_node_ref_zero() / __io_rsrc_put_work(). Another reason for the patch is that it greatly reduces complexity by moving io_rsrc_node_switch[_start]() inside io_queue_rsrc_removal(), so users don't have to care about it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c7d3a45b30cc14cd93700a710dd112edc703db98.1681822823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/filetable.c | 9 ------ io_uring/rsrc.c | 91 +++++++++++++--------------------------------------- io_uring/rsrc.h | 22 ++----------- 3 files changed, 25 insertions(+), 97 deletions(-) (limited to 'io_uring') diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 367a44a6c8c5..0f6fa791a47d 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -80,10 +80,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, if (file_slot->file_ptr) { struct file *old_file; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); ret = io_queue_rsrc_removal(ctx->file_data, slot_index, old_file); if (ret) @@ -91,7 +87,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, file_slot->file_ptr = 0; io_file_bitmap_clear(&ctx->file_table, slot_index); - io_rsrc_node_switch(ctx, ctx->file_data); } ret = io_scm_file_account(ctx, file); @@ -152,9 +147,6 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) return -ENXIO; if (offset >= ctx->nr_user_files) return -EINVAL; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; offset = array_index_nospec(offset, ctx->nr_user_files); file_slot = io_fixed_file_slot(&ctx->file_table, offset); @@ -168,7 +160,6 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) file_slot->file_ptr = 0; io_file_bitmap_clear(&ctx->file_table, offset); - io_rsrc_node_switch(ctx, ctx->file_data); return 0; } diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 3be483de613e..a54a222a20b8 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -153,17 +153,10 @@ static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data, static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) { struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; - struct io_rsrc_put *prsrc, *tmp; - if (ref_node->inline_items) + if (likely(ref_node->inline_items)) io_rsrc_put_work_one(rsrc_data, &ref_node->item); - list_for_each_entry_safe(prsrc, tmp, &ref_node->item_list, list) { - list_del(&prsrc->list); - io_rsrc_put_work_one(rsrc_data, 
prsrc); - kfree(prsrc); - } - io_rsrc_node_destroy(rsrc_data->ctx, ref_node); } @@ -206,53 +199,29 @@ struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) } ref_node->rsrc_data = NULL; - ref_node->refs = 1; - INIT_LIST_HEAD(&ref_node->node); - INIT_LIST_HEAD(&ref_node->item_list); ref_node->inline_items = 0; + ref_node->refs = 1; return ref_node; } -void io_rsrc_node_switch(struct io_ring_ctx *ctx, - struct io_rsrc_data *data_to_kill) - __must_hold(&ctx->uring_lock) -{ - struct io_rsrc_node *node = ctx->rsrc_node; - struct io_rsrc_node *backup = io_rsrc_node_alloc(ctx); - - if (WARN_ON_ONCE(!backup)) - return; - - node->rsrc_data = data_to_kill; - list_add_tail(&node->node, &ctx->rsrc_ref_list); - /* put master ref */ - io_put_rsrc_node(ctx, node); - ctx->rsrc_node = backup; -} - -int __io_rsrc_node_switch_start(struct io_ring_ctx *ctx) -{ - struct io_rsrc_node *node = kzalloc(sizeof(*node), GFP_KERNEL); - - if (!node) - return -ENOMEM; - io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache); - return 0; -} - __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx) { + struct io_rsrc_node *backup; DEFINE_WAIT(we); int ret; - /* As we may drop ->uring_lock, other task may have started quiesce */ + /* As We may drop ->uring_lock, other task may have started quiesce */ if (data->quiesce) return -ENXIO; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - io_rsrc_node_switch(ctx, data); + + backup = io_rsrc_node_alloc(ctx); + if (!backup) + return -ENOMEM; + ctx->rsrc_node->rsrc_data = data; + list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list); + io_put_rsrc_node(ctx, ctx->rsrc_node); + ctx->rsrc_node = backup; if (list_empty(&ctx->rsrc_ref_list)) return 0; @@ -382,7 +351,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct file *file; int fd, i, err = 0; unsigned int done; - bool needs_switch = false; if (!ctx->file_data) return -ENXIO; @@ -414,7 +382,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, break; file_slot->file_ptr = 0; io_file_bitmap_clear(&ctx->file_table, i); - needs_switch = true; } if (fd != -1) { file = fget(fd); @@ -445,9 +412,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, io_file_bitmap_set(&ctx->file_table, i); } } - - if (needs_switch) - io_rsrc_node_switch(ctx, data); return done ? done : err; } @@ -458,7 +422,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, u64 __user *tags = u64_to_user_ptr(up->tags); struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); struct page *last_hpage = NULL; - bool needs_switch = false; __u32 done; int i, err; @@ -498,15 +461,11 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, break; } ctx->user_bufs[i] = ctx->dummy_ubuf; - needs_switch = true; } ctx->user_bufs[i] = imu; *io_get_tag_slot(ctx->buf_data, i) = tag; } - - if (needs_switch) - io_rsrc_node_switch(ctx, ctx->buf_data); return done ? 
done : err; } @@ -515,15 +474,11 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, unsigned nr_args) { __u32 tmp; - int err; lockdep_assert_held(&ctx->uring_lock); if (check_add_overflow(up->offset, nr_args, &tmp)) return -EOVERFLOW; - err = io_rsrc_node_switch_start(ctx); - if (err) - return err; switch (type) { case IORING_RSRC_FILE: @@ -685,21 +640,21 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc) struct io_ring_ctx *ctx = data->ctx; struct io_rsrc_node *node = ctx->rsrc_node; u64 *tag_slot = io_get_tag_slot(data, idx); - struct io_rsrc_put *prsrc; - if (!node->inline_items) { - prsrc = &node->item; - node->inline_items++; - } else { - prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); - if (!prsrc) - return -ENOMEM; - list_add(&prsrc->list, &node->item_list); + ctx->rsrc_node = io_rsrc_node_alloc(ctx); + if (unlikely(!ctx->rsrc_node)) { + ctx->rsrc_node = node; + return -ENOMEM; } - prsrc->tag = *tag_slot; + node->item.rsrc = rsrc; + node->item.tag = *tag_slot; + node->inline_items = 1; *tag_slot = 0; - prsrc->rsrc = rsrc; + + node->rsrc_data = data; + list_add_tail(&node->node, &ctx->rsrc_ref_list); + io_put_rsrc_node(ctx, node); return 0; } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 8ed3e6a65cf6..bad7103f5033 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -18,7 +18,6 @@ enum { }; struct io_rsrc_put { - struct list_head list; u64 tag; union { void *rsrc; @@ -43,17 +42,10 @@ struct io_rsrc_node { struct io_cache_entry cache; struct io_rsrc_data *rsrc_data; }; - struct list_head node; int refs; - - /* - * Keeps a list of struct io_rsrc_put to be completed. Each entry - * represents one rsrc (e.g. file or buffer), but all of them should've - * came from the same table and so are of the same type. - */ - struct list_head item_list; - struct io_rsrc_put item; int inline_items; + struct list_head node; + struct io_rsrc_put item; }; struct io_mapped_ubuf { @@ -68,11 +60,8 @@ void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_put_work(struct work_struct *work); void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); -int __io_rsrc_node_switch_start(struct io_ring_ctx *ctx); struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc); -void io_rsrc_node_switch(struct io_ring_ctx *ctx, - struct io_rsrc_data *data_to_kill); int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, @@ -109,13 +98,6 @@ static inline int io_scm_file_account(struct io_ring_ctx *ctx, return __io_scm_file_account(ctx, file); } -static inline int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) -{ - if (unlikely(io_alloc_cache_empty(&ctx->rsrc_node_cache))) - return __io_rsrc_node_switch_start(ctx); - return 0; -} - int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args); int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, -- cgit From 26147da37f3e52041d9deba189d39f27ce78a84f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 18 Apr 2023 14:06:37 +0100 Subject: io_uring/rsrc: add empty flag in rsrc_node Unless a node was flushed by io_rsrc_ref_quiesce(), it'll carry a resource. Replace ->inline_items with an empty flag, which is initialised to false and only raised in io_rsrc_ref_quiesce(). 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/75d384c9d2252e12af73b9cf8a44e1699106aeb1.1681822823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 6 +++--- io_uring/rsrc.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index a54a222a20b8..127bd602131e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -154,7 +154,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) { struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; - if (likely(ref_node->inline_items)) + if (likely(!ref_node->empty)) io_rsrc_put_work_one(rsrc_data, &ref_node->item); io_rsrc_node_destroy(rsrc_data->ctx, ref_node); @@ -199,7 +199,7 @@ struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) } ref_node->rsrc_data = NULL; - ref_node->inline_items = 0; + ref_node->empty = 0; ref_node->refs = 1; return ref_node; } @@ -218,6 +218,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, backup = io_rsrc_node_alloc(ctx); if (!backup) return -ENOMEM; + ctx->rsrc_node->empty = true; ctx->rsrc_node->rsrc_data = data; list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list); io_put_rsrc_node(ctx, ctx->rsrc_node); @@ -649,7 +650,6 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc) node->item.rsrc = rsrc; node->item.tag = *tag_slot; - node->inline_items = 1; *tag_slot = 0; node->rsrc_data = data; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index bad7103f5033..f3fe455c6c71 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -43,7 +43,7 @@ struct io_rsrc_node { struct io_rsrc_data *rsrc_data; }; int refs; - int inline_items; + bool empty; struct list_head node; struct io_rsrc_put item; }; -- cgit From 4130b49991d6b8ca0ea44cb256e710c4e48d7f01 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 18 Apr 2023 14:06:38 +0100 Subject: io_uring/rsrc: inline io_rsrc_put_work() io_rsrc_put_work() is simple enough to be open coded into its only caller. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1b36dd46766ced39a9b160767babfa2fce07b8f8.1681822823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 19 ++++++------------- io_uring/rsrc.h | 1 - 2 files changed, 6 insertions(+), 14 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 127bd602131e..d1167b0643b7 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -140,8 +140,8 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo *slot = NULL; } -static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data, - struct io_rsrc_put *prsrc) +static void io_rsrc_put_work(struct io_rsrc_data *rsrc_data, + struct io_rsrc_put *prsrc) { struct io_ring_ctx *ctx = rsrc_data->ctx; @@ -150,16 +150,6 @@ static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data, rsrc_data->do_put(ctx, prsrc); } -static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) -{ - struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; - - if (likely(!ref_node->empty)) - io_rsrc_put_work_one(rsrc_data, &ref_node->item); - - io_rsrc_node_destroy(rsrc_data->ctx, ref_node); -} - void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache)) @@ -178,7 +168,10 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) if (node->refs) break; list_del(&node->node); - __io_rsrc_put_work(node); + + if (likely(!node->empty)) + io_rsrc_put_work(node->rsrc_data, &node->item); + io_rsrc_node_destroy(ctx, node); } if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce)) wake_up_all(&ctx->rsrc_quiesce_wq); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index f3fe455c6c71..232079363f6a 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -58,7 +58,6 @@ struct io_mapped_ubuf { void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node); -void io_rsrc_put_work(struct work_struct *work); void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc); -- cgit From 29b26c556e7439b1370ac6a59fce83a9d1521de1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 18 Apr 2023 14:06:39 +0100 Subject: io_uring/rsrc: pass node to io_rsrc_put_work() Instead of passing rsrc_data and a resource to io_rsrc_put_work() just forward node, that's all the function needs. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/791e8edd28d78797240b74d34e99facbaad62f3b.1681822823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d1167b0643b7..9378691d49f5 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -140,14 +140,14 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo *slot = NULL; } -static void io_rsrc_put_work(struct io_rsrc_data *rsrc_data, - struct io_rsrc_put *prsrc) +static void io_rsrc_put_work(struct io_rsrc_node *node) { - struct io_ring_ctx *ctx = rsrc_data->ctx; + struct io_rsrc_data *data = node->rsrc_data; + struct io_rsrc_put *prsrc = &node->item; if (prsrc->tag) - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); - rsrc_data->do_put(ctx, prsrc); + io_post_aux_cqe(data->ctx, prsrc->tag, 0, 0); + data->do_put(data->ctx, prsrc); } void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) @@ -170,7 +170,7 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) list_del(&node->node); if (likely(!node->empty)) - io_rsrc_put_work(node->rsrc_data, &node->item); + io_rsrc_put_work(node); io_rsrc_node_destroy(ctx, node); } if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce)) -- cgit From fc7f3a8d3a78503c4f3e108155fb9a233dc307a4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 18 Apr 2023 14:06:40 +0100 Subject: io_uring/rsrc: devirtualise rsrc put callbacks We only have two rsrc types, buffers and files, replace virtual callbacks for putting resources down with a switch..case. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/02ca727bf8e5f7f820c2f404e95ae88c8f472930.1681822823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 25 +++++++++++++++++++------ io_uring/rsrc.h | 2 +- 2 files changed, 20 insertions(+), 7 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 9378691d49f5..62988b3aa927 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -23,6 +23,8 @@ struct io_rsrc_update { u32 offset; }; +static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); +static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage); @@ -147,7 +149,18 @@ static void io_rsrc_put_work(struct io_rsrc_node *node) if (prsrc->tag) io_post_aux_cqe(data->ctx, prsrc->tag, 0, 0); - data->do_put(data->ctx, prsrc); + + switch (data->rsrc_type) { + case IORING_RSRC_FILE: + io_rsrc_file_put(data->ctx, prsrc); + break; + case IORING_RSRC_BUFFER: + io_rsrc_buf_put(data->ctx, prsrc); + break; + default: + WARN_ON_ONCE(1); + break; + } } void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) @@ -297,8 +310,8 @@ static __cold void **io_alloc_page_table(size_t size) return table; } -__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, - rsrc_put_fn *do_put, u64 __user *utags, +__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type, + u64 __user *utags, unsigned nr, struct io_rsrc_data **pdata) { struct io_rsrc_data *data; @@ -316,7 +329,7 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, data->nr = nr; data->ctx = ctx; - data->do_put = do_put; + data->rsrc_type = type; if (utags) { ret = -EFAULT; for (i = 0; i < nr; i++) { @@ -849,7 +862,7 
@@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -EMFILE; if (nr_args > rlimit(RLIMIT_NOFILE)) return -EMFILE; - ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, + ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args, &ctx->file_data); if (ret) return ret; @@ -1184,7 +1197,7 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return -EBUSY; if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) return -EINVAL; - ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); + ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data); if (ret) return ret; ret = io_buffers_map_alloc(ctx, nr_args); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 232079363f6a..5d0733c4c08d 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -33,7 +33,7 @@ struct io_rsrc_data { u64 **tags; unsigned int nr; - rsrc_put_fn *do_put; + u16 rsrc_type; bool quiesce; }; -- cgit From 2236b3905b4d4e9cd4d149ab35767858c02bb79b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 18 Apr 2023 14:06:41 +0100 Subject: io_uring/rsrc: disassociate nodes and rsrc_data Make rsrc nodes independent from rsrd_data, for that we keep ctx and rsrc type in nodes. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4f259abe9cd4eea6a3b4ed83508635218acd3c3f.1681822823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 20 +++++++++----------- io_uring/rsrc.h | 3 ++- 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 62988b3aa927..20dcc7668cb0 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -144,18 +144,17 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo static void io_rsrc_put_work(struct io_rsrc_node *node) { - struct io_rsrc_data *data = node->rsrc_data; struct io_rsrc_put *prsrc = &node->item; if (prsrc->tag) - io_post_aux_cqe(data->ctx, prsrc->tag, 0, 0); + io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0); - switch (data->rsrc_type) { + switch (node->type) { case IORING_RSRC_FILE: - io_rsrc_file_put(data->ctx, prsrc); + io_rsrc_file_put(node->ctx, prsrc); break; case IORING_RSRC_BUFFER: - io_rsrc_buf_put(data->ctx, prsrc); + io_rsrc_buf_put(node->ctx, prsrc); break; default: WARN_ON_ONCE(1); @@ -170,9 +169,9 @@ void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) } void io_rsrc_node_ref_zero(struct io_rsrc_node *node) - __must_hold(&node->rsrc_data->ctx->uring_lock) + __must_hold(&node->ctx->uring_lock) { - struct io_ring_ctx *ctx = node->rsrc_data->ctx; + struct io_ring_ctx *ctx = node->ctx; while (!list_empty(&ctx->rsrc_ref_list)) { node = list_first_entry(&ctx->rsrc_ref_list, @@ -204,7 +203,7 @@ struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) return NULL; } - ref_node->rsrc_data = NULL; + ref_node->ctx = ctx; ref_node->empty = 0; ref_node->refs = 1; return ref_node; @@ -225,7 +224,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, if (!backup) return -ENOMEM; ctx->rsrc_node->empty = true; - ctx->rsrc_node->rsrc_data = data; + ctx->rsrc_node->type = -1; list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list); io_put_rsrc_node(ctx, ctx->rsrc_node); ctx->rsrc_node = backup; @@ -655,10 +654,9 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc) } node->item.rsrc = rsrc; + node->type = data->rsrc_type; node->item.tag = *tag_slot; *tag_slot = 0; - - node->rsrc_data = data; list_add_tail(&node->node, 
&ctx->rsrc_ref_list); io_put_rsrc_node(ctx, node); return 0; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 5d0733c4c08d..0a8a95e9b99e 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -40,10 +40,11 @@ struct io_rsrc_data { struct io_rsrc_node { union { struct io_cache_entry cache; - struct io_rsrc_data *rsrc_data; + struct io_ring_ctx *ctx; }; int refs; bool empty; + u16 type; struct list_head node; struct io_rsrc_put item; }; -- cgit From ea97f6c8558e83cb457c3b5f53351e4fd8519ab1 Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 18 Apr 2023 15:58:18 -0700 Subject: io_uring: add support for multishot timeouts A multishot timeout submission will repeatedly generate completions with the IORING_CQE_F_MORE cflag set. Depending on the value of the `off' field in the submission, these timeouts can either repeat indefinitely until cancelled (`off' = 0) or for a fixed number of times (`off' > 0). Only noseq timeouts (i.e. not dependent on the number of I/O completions) are supported. An indefinite timer will be cancelled if the CQ ever overflows. Signed-off-by: David Wei Link: https://lore.kernel.org/r/20230418225817.1905027-1-davidhwei@meta.com Signed-off-by: Jens Axboe --- io_uring/timeout.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 5c6c6f720809..fc950177e2e1 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -17,6 +17,7 @@ struct io_timeout { struct file *file; u32 off; u32 target_seq; + u32 repeats; struct list_head list; /* head of the link, used by linked timeouts only */ struct io_kiocb *head; @@ -37,8 +38,9 @@ struct io_timeout_rem { static inline bool io_is_timeout_noseq(struct io_kiocb *req) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); + struct io_timeout_data *data = req->async_data; - return !timeout->off; + return !timeout->off || data->flags & IORING_TIMEOUT_MULTISHOT; } static inline void io_put_req(struct io_kiocb *req) @@ -49,6 +51,44 @@ static inline void io_put_req(struct io_kiocb *req) } } +static inline bool io_timeout_finish(struct io_timeout *timeout, + struct io_timeout_data *data) +{ + if (!(data->flags & IORING_TIMEOUT_MULTISHOT)) + return true; + + if (!timeout->off || (timeout->repeats && --timeout->repeats)) + return false; + + return true; +} + +static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer); + +static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) +{ + struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); + struct io_timeout_data *data = req->async_data; + struct io_ring_ctx *ctx = req->ctx; + + if (!io_timeout_finish(timeout, data)) { + bool filled; + filled = io_aux_cqe(ctx, ts->locked, req->cqe.user_data, -ETIME, + IORING_CQE_F_MORE, false); + if (filled) { + /* re-arm timer */ + spin_lock_irq(&ctx->timeout_lock); + list_add(&timeout->list, ctx->timeout_list.prev); + data->timer.function = io_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); + spin_unlock_irq(&ctx->timeout_lock); + return; + } + } + + io_req_task_complete(req, ts); +} + static bool io_kill_timeout(struct io_kiocb *req, int status) __must_hold(&req->ctx->timeout_lock) { @@ -212,7 +252,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) req_set_fail(req); io_req_set_res(req, -ETIME, 0); - req->io_task_work.func = io_req_task_complete; + req->io_task_work.func = io_timeout_complete; 
io_req_task_work_add(req); return HRTIMER_NORESTART; } @@ -470,16 +510,27 @@ static int __io_timeout_prep(struct io_kiocb *req, return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | - IORING_TIMEOUT_ETIME_SUCCESS)) + IORING_TIMEOUT_ETIME_SUCCESS | + IORING_TIMEOUT_MULTISHOT)) return -EINVAL; /* more than one clock specified is invalid, obviously */ if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) return -EINVAL; + /* multishot requests only make sense with rel values */ + if (!(~flags & (IORING_TIMEOUT_MULTISHOT | IORING_TIMEOUT_ABS))) + return -EINVAL; INIT_LIST_HEAD(&timeout->list); timeout->off = off; if (unlikely(off && !req->ctx->off_timeout_used)) req->ctx->off_timeout_used = true; + /* + * for multishot reqs w/ fixed nr of repeats, repeats tracks the + * remaining nr + */ + timeout->repeats = 0; + if ((flags & IORING_TIMEOUT_MULTISHOT) && off > 0) + timeout->repeats = off; if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; -- cgit From 3c85cc43c8e7855d202da184baf00c7b8eeacf71 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2023 14:16:03 -0600 Subject: Revert "io_uring/rsrc: disallow multi-source reg buffers" This reverts commit edd478269640b360c6f301f2baa04abdda563ef3. There's really no specific need to disallow multiple sources of buffers, and io_uring really should not be mandating this by itself. We should be able to solely rely on GUP making these decisions. As this also stands in the way of a cleanup where io_uring is the odd one out, kill it. Link: https://lore.kernel.org/all/61ded378-51a8-1dcb-b631-fda1903248a9@gmail.com/ Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 20dcc7668cb0..ddee7adb4006 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1053,17 +1053,14 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages, vmas); if (pret == nr_pages) { - struct file *file = vmas[0]->vm_file; - /* don't support file backed memory */ for (i = 0; i < nr_pages; i++) { - if (vmas[i]->vm_file != file) { - ret = -EINVAL; - break; - } - if (!file) + struct vm_area_struct *vma = vmas[i]; + + if (vma_is_shmem(vma)) continue; - if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) { + if (vma->vm_file && + !is_file_hugepages(vma->vm_file)) { ret = -EOPNOTSUPP; break; } -- cgit