summaryrefslogtreecommitdiff
path: root/io_uring
diff options
context:
space:
mode:
authorPaolo Bonzini <pbonzini@redhat.com>2024-01-02 13:16:29 -0500
committerPaolo Bonzini <pbonzini@redhat.com>2024-01-02 13:16:29 -0500
commit136292522e43da46bee4c0fef80b2602f79525a2 (patch)
tree47c892c46e01fa4a3ef014f3737ecee3776969ee /io_uring
parent8ed26ab8d59111c2f7b86d200d1eb97d2a458fd1 (diff)
parent118e10cd893d57df55b3302dfd188a981b6e6d1c (diff)
Merge tag 'loongarch-kvm-6.8' of git://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson into HEAD
LoongArch KVM changes for v6.8 1. Optimization for memslot hugepage checking. 2. Cleanup and fix some HW/SW timer issues. 3. Add LSX/LASX (128bit/256bit SIMD) support.
Diffstat (limited to 'io_uring')
-rw-r--r--io_uring/cancel.c11
-rw-r--r--io_uring/fdinfo.c9
-rw-r--r--io_uring/fs.c2
-rw-r--r--io_uring/io_uring.c104
-rw-r--r--io_uring/io_uring.h3
-rw-r--r--io_uring/kbuf.c177
-rw-r--r--io_uring/kbuf.h5
-rw-r--r--io_uring/poll.c20
-rw-r--r--io_uring/rsrc.c2
-rw-r--r--io_uring/rsrc.h7
-rw-r--r--io_uring/sqpoll.c12
-rw-r--r--io_uring/uring_cmd.c2
12 files changed, 256 insertions, 98 deletions
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 3c19cccb1aec..8a8b07dfc444 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -273,7 +273,7 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
};
ktime_t timeout = KTIME_MAX;
struct io_uring_sync_cancel_reg sc;
- struct fd f = { };
+ struct file *file = NULL;
DEFINE_WAIT(wait);
int ret, i;
@@ -295,10 +295,10 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
/* we can grab a normal file descriptor upfront */
if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&
!(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
- f = fdget(sc.fd);
- if (!f.file)
+ file = fget(sc.fd);
+ if (!file)
return -EBADF;
- cd.file = f.file;
+ cd.file = file;
}
ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);
@@ -348,6 +348,7 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
if (ret == -ENOENT || ret > 0)
ret = 0;
out:
- fdput(f);
+ if (file)
+ fput(file);
return ret;
}
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index f04a43044d91..976e9500f651 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -145,13 +145,8 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
struct io_sq_data *sq = ctx->sq_data;
- if (mutex_trylock(&sq->lock)) {
- if (sq->thread) {
- sq_pid = task_pid_nr(sq->thread);
- sq_cpu = task_cpu(sq->thread);
- }
- mutex_unlock(&sq->lock);
- }
+ sq_pid = sq->task_pid;
+ sq_cpu = sq->sq_cpu;
}
seq_printf(m, "SqThread:\t%d\n", sq_pid);
diff --git a/io_uring/fs.c b/io_uring/fs.c
index 08e3b175469c..eccea851dd5a 100644
--- a/io_uring/fs.c
+++ b/io_uring/fs.c
@@ -254,7 +254,7 @@ int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
lnk->flags = READ_ONCE(sqe->hardlink_flags);
- lnk->oldpath = getname(oldf);
+ lnk->oldpath = getname_uflags(oldf, lnk->flags);
if (IS_ERR(lnk->oldpath))
return PTR_ERR(lnk->oldpath);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index db3f545ddcac..0efd775cb3ad 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -271,6 +271,7 @@ static __cold void io_fallback_req_func(struct work_struct *work)
struct io_kiocb *req, *tmp;
struct io_tw_state ts = { .locked = true, };
+ percpu_ref_get(&ctx->refs);
mutex_lock(&ctx->uring_lock);
llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
req->io_task_work.func(req, &ts);
@@ -278,6 +279,7 @@ static __cold void io_fallback_req_func(struct work_struct *work)
return;
io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
+ percpu_ref_put(&ctx->refs);
}
static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
@@ -325,6 +327,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
INIT_LIST_HEAD(&ctx->io_buffers_cache);
+ INIT_HLIST_HEAD(&ctx->io_buf_list);
io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
sizeof(struct io_rsrc_node));
io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
@@ -2666,7 +2669,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
-static void io_mem_free(void *ptr)
+void io_mem_free(void *ptr)
{
if (!ptr)
return;
@@ -2697,6 +2700,7 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
{
struct page **page_array;
unsigned int nr_pages;
+ void *page_addr;
int ret, i;
*npages = 0;
@@ -2718,27 +2722,29 @@ err:
io_pages_free(&page_array, ret > 0 ? ret : 0);
return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
}
- /*
- * Should be a single page. If the ring is small enough that we can
- * use a normal page, that is fine. If we need multiple pages, then
- * userspace should use a huge page. That's the only way to guarantee
- * that we get contigious memory, outside of just being lucky or
- * (currently) having low memory fragmentation.
- */
- if (page_array[0] != page_array[ret - 1])
- goto err;
- /*
- * Can't support mapping user allocated ring memory on 32-bit archs
- * where it could potentially reside in highmem. Just fail those with
- * -EINVAL, just like we did on kernels that didn't support this
- * feature.
- */
+ page_addr = page_address(page_array[0]);
for (i = 0; i < nr_pages; i++) {
- if (PageHighMem(page_array[i])) {
- ret = -EINVAL;
+ ret = -EINVAL;
+
+ /*
+ * Can't support mapping user allocated ring memory on 32-bit
+ * archs where it could potentially reside in highmem. Just
+ * fail those with -EINVAL, just like we did on kernels that
+ * didn't support this feature.
+ */
+ if (PageHighMem(page_array[i]))
goto err;
- }
+
+ /*
+ * No support for discontig pages for now, should either be a
+ * single normal page, or a huge page. Later on we can add
+ * support for remapping discontig pages, for now we will
+ * just fail them with EINVAL.
+ */
+ if (page_address(page_array[i]) != page_addr)
+ goto err;
+ page_addr += PAGE_SIZE;
}
*pages = page_array;
@@ -2775,7 +2781,7 @@ static void io_rings_free(struct io_ring_ctx *ctx)
}
}
-static void *io_mem_alloc(size_t size)
+void *io_mem_alloc(size_t size)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
void *ret;
@@ -2947,6 +2953,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
ctx->mm_account = NULL;
}
io_rings_free(ctx);
+ io_kbuf_mmap_list_free(ctx);
percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
@@ -3141,12 +3148,7 @@ static __cold void io_ring_exit_work(struct work_struct *work)
init_completion(&exit.completion);
init_task_work(&exit.task_work, io_tctx_exit_cb);
exit.ctx = ctx;
- /*
- * Some may use context even when all refs and requests have been put,
- * and they are free to do so while still holding uring_lock or
- * completion_lock, see io_req_task_submit(). Apart from other work,
- * this lock/unlock section also waits them to finish.
- */
+
mutex_lock(&ctx->uring_lock);
while (!list_empty(&ctx->tctx_list)) {
WARN_ON_ONCE(time_after(jiffies, timeout));
@@ -3475,25 +3477,27 @@ static void *io_uring_validate_mmap_request(struct file *file,
struct page *page;
void *ptr;
- /* Don't allow mmap if the ring was setup without it */
- if (ctx->flags & IORING_SETUP_NO_MMAP)
- return ERR_PTR(-EINVAL);
-
switch (offset & IORING_OFF_MMAP_MASK) {
case IORING_OFF_SQ_RING:
case IORING_OFF_CQ_RING:
+ /* Don't allow mmap if the ring was setup without it */
+ if (ctx->flags & IORING_SETUP_NO_MMAP)
+ return ERR_PTR(-EINVAL);
ptr = ctx->rings;
break;
case IORING_OFF_SQES:
+ /* Don't allow mmap if the ring was setup without it */
+ if (ctx->flags & IORING_SETUP_NO_MMAP)
+ return ERR_PTR(-EINVAL);
ptr = ctx->sq_sqes;
break;
case IORING_OFF_PBUF_RING: {
unsigned int bgid;
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
- mutex_lock(&ctx->uring_lock);
+ rcu_read_lock();
ptr = io_pbuf_get_address(ctx, bgid);
- mutex_unlock(&ctx->uring_lock);
+ rcu_read_unlock();
if (!ptr)
return ERR_PTR(-EINVAL);
break;
@@ -3645,7 +3649,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
size_t, argsz)
{
struct io_ring_ctx *ctx;
- struct fd f;
+ struct file *file;
long ret;
if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
@@ -3663,20 +3667,19 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
return -EINVAL;
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
- f.file = tctx->registered_rings[fd];
- f.flags = 0;
- if (unlikely(!f.file))
+ file = tctx->registered_rings[fd];
+ if (unlikely(!file))
return -EBADF;
} else {
- f = fdget(fd);
- if (unlikely(!f.file))
+ file = fget(fd);
+ if (unlikely(!file))
return -EBADF;
ret = -EOPNOTSUPP;
- if (unlikely(!io_is_uring_fops(f.file)))
+ if (unlikely(!io_is_uring_fops(file)))
goto out;
}
- ctx = f.file->private_data;
+ ctx = file->private_data;
ret = -EBADFD;
if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
goto out;
@@ -3770,7 +3773,8 @@ iopoll_locked:
}
}
out:
- fdput(f);
+ if (!(flags & IORING_ENTER_REGISTERED_RING))
+ fput(file);
return ret;
}
@@ -4612,7 +4616,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
{
struct io_ring_ctx *ctx;
long ret = -EBADF;
- struct fd f;
+ struct file *file;
bool use_registered_ring;
use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
@@ -4631,27 +4635,27 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
return -EINVAL;
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
- f.file = tctx->registered_rings[fd];
- f.flags = 0;
- if (unlikely(!f.file))
+ file = tctx->registered_rings[fd];
+ if (unlikely(!file))
return -EBADF;
} else {
- f = fdget(fd);
- if (unlikely(!f.file))
+ file = fget(fd);
+ if (unlikely(!file))
return -EBADF;
ret = -EOPNOTSUPP;
- if (!io_is_uring_fops(f.file))
+ if (!io_is_uring_fops(file))
goto out_fput;
}
- ctx = f.file->private_data;
+ ctx = file->private_data;
mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args);
mutex_unlock(&ctx->uring_lock);
trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
- fdput(f);
+ if (!use_registered_ring)
+ fput(file);
return ret;
}
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index dc6d779b452b..ed84f2737b3a 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -86,6 +86,9 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all);
+void *io_mem_alloc(size_t size);
+void io_mem_free(void *ptr);
+
#if defined(CONFIG_PROVE_LOCKING)
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index a1e4239c7d75..72b6af1d2ed3 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -33,19 +33,42 @@ struct io_provide_buf {
__u16 bid;
};
+struct io_buf_free {
+ struct hlist_node list;
+ void *mem;
+ size_t size;
+ int inuse;
+};
+
+static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl,
+ unsigned int bgid)
+{
+ if (bl && bgid < BGID_ARRAY)
+ return &bl[bgid];
+
+ return xa_load(&ctx->io_bl_xa, bgid);
+}
+
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
unsigned int bgid)
{
- if (ctx->io_bl && bgid < BGID_ARRAY)
- return &ctx->io_bl[bgid];
+ lockdep_assert_held(&ctx->uring_lock);
- return xa_load(&ctx->io_bl_xa, bgid);
+ return __io_buffer_get_list(ctx, ctx->io_bl, bgid);
}
static int io_buffer_add_list(struct io_ring_ctx *ctx,
struct io_buffer_list *bl, unsigned int bgid)
{
+ /*
+ * Store buffer group ID and finally mark the list as visible.
+ * The normal lookup doesn't care about the visibility as we're
+ * always under the ->uring_lock, but the RCU lookup from mmap does.
+ */
bl->bgid = bgid;
+ smp_store_release(&bl->is_ready, 1);
+
if (bgid < BGID_ARRAY)
return 0;
@@ -196,21 +219,40 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{
+ struct io_buffer_list *bl;
int i;
- ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
- GFP_KERNEL);
- if (!ctx->io_bl)
+ bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL);
+ if (!bl)
return -ENOMEM;
for (i = 0; i < BGID_ARRAY; i++) {
- INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
- ctx->io_bl[i].bgid = i;
+ INIT_LIST_HEAD(&bl[i].buf_list);
+ bl[i].bgid = i;
}
+ smp_store_release(&ctx->io_bl, bl);
return 0;
}
+/*
+ * Mark the given mapped range as free for reuse
+ */
+static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
+{
+ struct io_buf_free *ibf;
+
+ hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
+ if (bl->buf_ring == ibf->mem) {
+ ibf->inuse = 0;
+ return;
+ }
+ }
+
+ /* can't happen... */
+ WARN_ON_ONCE(1);
+}
+
static int __io_remove_buffers(struct io_ring_ctx *ctx,
struct io_buffer_list *bl, unsigned nbufs)
{
@@ -223,7 +265,11 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (bl->is_mapped) {
i = bl->buf_ring->tail - bl->head;
if (bl->is_mmap) {
- folio_put(virt_to_folio(bl->buf_ring));
+ /*
+ * io_kbuf_list_free() will free the page(s) at
+ * ->release() time.
+ */
+ io_kbuf_mark_free(ctx, bl);
bl->buf_ring = NULL;
bl->is_mmap = 0;
} else if (bl->buf_nr_pages) {
@@ -274,9 +320,17 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
xa_for_each(&ctx->io_bl_xa, index, bl) {
xa_erase(&ctx->io_bl_xa, bl->bgid);
__io_remove_buffers(ctx, bl, -1U);
- kfree(bl);
+ kfree_rcu(bl, rcu);
}
+ /*
+ * Move deferred locked entries to cache before pruning
+ */
+ spin_lock(&ctx->completion_lock);
+ if (!list_empty(&ctx->io_buffers_comp))
+ list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache);
+ spin_unlock(&ctx->completion_lock);
+
list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
buf = list_entry(item, struct io_buffer, list);
kmem_cache_free(io_buf_cachep, buf);
@@ -460,7 +514,16 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
INIT_LIST_HEAD(&bl->buf_list);
ret = io_buffer_add_list(ctx, bl, p->bgid);
if (ret) {
- kfree(bl);
+ /*
+ * Doesn't need rcu free as it was never visible, but
+ * let's keep it consistent throughout. Also can't
+ * be a lower indexed array group, as adding one
+ * where lookup failed cannot happen.
+ */
+ if (p->bgid >= BGID_ARRAY)
+ kfree_rcu(bl, rcu);
+ else
+ WARN_ON_ONCE(1);
goto err;
}
}
@@ -531,19 +594,63 @@ error_unpin:
return -EINVAL;
}
-static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
+/*
+ * See if we have a suitable region that we can reuse, rather than allocate
+ * both a new io_buf_free and mem region again. We leave it on the list as
+ * even a reused entry will need freeing at ring release.
+ */
+static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx,
+ size_t ring_size)
+{
+ struct io_buf_free *ibf, *best = NULL;
+ size_t best_dist;
+
+ hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
+ size_t dist;
+
+ if (ibf->inuse || ibf->size < ring_size)
+ continue;
+ dist = ibf->size - ring_size;
+ if (!best || dist < best_dist) {
+ best = ibf;
+ if (!dist)
+ break;
+ best_dist = dist;
+ }
+ }
+
+ return best;
+}
+
+static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
+ struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
- gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
+ struct io_buf_free *ibf;
size_t ring_size;
void *ptr;
ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
- ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
- if (!ptr)
- return -ENOMEM;
- bl->buf_ring = ptr;
+ /* Reuse existing entry, if we can */
+ ibf = io_lookup_buf_free_entry(ctx, ring_size);
+ if (!ibf) {
+ ptr = io_mem_alloc(ring_size);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ /* Allocate and store deferred free entry */
+ ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
+ if (!ibf) {
+ io_mem_free(ptr);
+ return -ENOMEM;
+ }
+ ibf->mem = ptr;
+ ibf->size = ring_size;
+ hlist_add_head(&ibf->list, &ctx->io_buf_list);
+ }
+ ibf->inuse = 1;
+ bl->buf_ring = ibf->mem;
bl->is_mapped = 1;
bl->is_mmap = 1;
return 0;
@@ -555,6 +662,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
struct io_buffer_list *bl, *free_bl = NULL;
int ret;
+ lockdep_assert_held(&ctx->uring_lock);
+
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
@@ -599,7 +708,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (!(reg.flags & IOU_PBUF_RING_MMAP))
ret = io_pin_pbuf_ring(&reg, bl);
else
- ret = io_alloc_pbuf_ring(&reg, bl);
+ ret = io_alloc_pbuf_ring(ctx, &reg, bl);
if (!ret) {
bl->nr_entries = reg.ring_entries;
@@ -609,7 +718,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return 0;
}
- kfree(free_bl);
+ kfree_rcu(free_bl, rcu);
return ret;
}
@@ -618,6 +727,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
struct io_uring_buf_reg reg;
struct io_buffer_list *bl;
+ lockdep_assert_held(&ctx->uring_lock);
+
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (reg.resv[0] || reg.resv[1] || reg.resv[2])
@@ -634,7 +745,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
__io_remove_buffers(ctx, bl, -1U);
if (bl->bgid >= BGID_ARRAY) {
xa_erase(&ctx->io_bl_xa, bl->bgid);
- kfree(bl);
+ kfree_rcu(bl, rcu);
}
return 0;
}
@@ -643,9 +754,33 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{
struct io_buffer_list *bl;
- bl = io_buffer_get_list(ctx, bgid);
+ bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid);
+
if (!bl || !bl->is_mmap)
return NULL;
+ /*
+ * Ensure the list is fully setup. Only strictly needed for RCU lookup
+ * via mmap, and in that case only for the array indexed groups. For
+ * the xarray lookups, it's either visible and ready, or not at all.
+ */
+ if (!smp_load_acquire(&bl->is_ready))
+ return NULL;
return bl->buf_ring;
}
+
+/*
+ * Called at or after ->release(), free the mmap'ed buffers that we used
+ * for memory mapped provided buffer rings.
+ */
+void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
+{
+ struct io_buf_free *ibf;
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
+ hlist_del(&ibf->list);
+ io_mem_free(ibf->mem);
+ kfree(ibf);
+ }
+}
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index f2d615236b2c..9be5960817ea 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -15,6 +15,7 @@ struct io_buffer_list {
struct page **buf_pages;
struct io_uring_buf_ring *buf_ring;
};
+ struct rcu_head rcu;
};
__u16 bgid;
@@ -28,6 +29,8 @@ struct io_buffer_list {
__u8 is_mapped;
/* ring mapped provided buffers, but mmap'ed by application */
__u8 is_mmap;
+ /* bl is visible from an RCU point of view for lookup */
+ __u8 is_ready;
};
struct io_buffer {
@@ -51,6 +54,8 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
+void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
+
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
diff --git a/io_uring/poll.c b/io_uring/poll.c
index d38d05edb4fa..d59b74a99d4e 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -366,11 +366,16 @@ void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts)
static void __io_poll_execute(struct io_kiocb *req, int mask)
{
+ unsigned flags = 0;
+
io_req_set_res(req, mask, 0);
req->io_task_work.func = io_poll_task_func;
trace_io_uring_task_add(req, mask);
- __io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
+
+ if (!(req->flags & REQ_F_POLL_NO_LAZY))
+ flags = IOU_F_TWQ_LAZY_WAKE;
+ __io_req_task_work_add(req, flags);
}
static inline void io_poll_execute(struct io_kiocb *req, int res)
@@ -526,10 +531,19 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
poll->head = head;
poll->wait.private = (void *) wqe_private;
- if (poll->events & EPOLLEXCLUSIVE)
+ if (poll->events & EPOLLEXCLUSIVE) {
+ /*
+ * Exclusive waits may only wake a limited amount of entries
+ * rather than all of them, this may interfere with lazy
+ * wake if someone does wait(events > 1). Ensure we don't do
+ * lazy wake for those, as we need to process each one as they
+ * come in.
+ */
+ req->flags |= REQ_F_POLL_NO_LAZY;
add_wait_queue_exclusive(head, &poll->wait);
- else
+ } else {
add_wait_queue(head, &poll->wait);
+ }
}
static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 7034be555334..f521c5965a93 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1258,7 +1258,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
*/
const struct bio_vec *bvec = imu->bvec;
- if (offset <= bvec->bv_len) {
+ if (offset < bvec->bv_len) {
/*
* Note, huge pages buffers consists of one large
* bvec entry and should always go this way. The other
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 8625181fb87a..08ac0d8e07ef 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -77,17 +77,10 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file);
-#if defined(CONFIG_UNIX)
-static inline bool io_file_need_scm(struct file *filp)
-{
- return !!unix_get_socket(filp);
-}
-#else
static inline bool io_file_need_scm(struct file *filp)
{
return false;
}
-#endif
static inline int io_scm_file_account(struct io_ring_ctx *ctx,
struct file *file)
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index bd6c2c7959a5..65b5dbe3c850 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -214,6 +214,7 @@ static bool io_sqd_handle_event(struct io_sq_data *sqd)
did_sig = get_signal(&ksig);
cond_resched();
mutex_lock(&sqd->lock);
+ sqd->sq_cpu = raw_smp_processor_id();
}
return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
}
@@ -229,10 +230,15 @@ static int io_sq_thread(void *data)
snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
set_task_comm(current, buf);
- if (sqd->sq_cpu != -1)
+ /* reset to our pid after we've set task_comm, for fdinfo */
+ sqd->task_pid = current->pid;
+
+ if (sqd->sq_cpu != -1) {
set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
- else
+ } else {
set_cpus_allowed_ptr(current, cpu_online_mask);
+ sqd->sq_cpu = raw_smp_processor_id();
+ }
mutex_lock(&sqd->lock);
while (1) {
@@ -261,6 +267,7 @@ static int io_sq_thread(void *data)
mutex_unlock(&sqd->lock);
cond_resched();
mutex_lock(&sqd->lock);
+ sqd->sq_cpu = raw_smp_processor_id();
}
continue;
}
@@ -294,6 +301,7 @@ static int io_sq_thread(void *data)
mutex_unlock(&sqd->lock);
schedule();
mutex_lock(&sqd->lock);
+ sqd->sq_cpu = raw_smp_processor_id();
}
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
atomic_andnot(IORING_SQ_NEED_WAKEUP,
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index acbc2924ecd2..7d3ef62e620a 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -7,7 +7,7 @@
#include <linux/nospec.h>
#include <uapi/linux/io_uring.h>
-#include <uapi/asm-generic/ioctls.h>
+#include <asm/ioctls.h>
#include "io_uring.h"
#include "rsrc.h"