Diffstat (limited to 'io_uring/register.c')
-rw-r--r--  io_uring/register.c  335
1 file changed, 174 insertions, 161 deletions
diff --git a/io_uring/register.c b/io_uring/register.c
index 1a60f4916649..62d39b3ff317 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -18,6 +18,7 @@
 #include <linux/io_uring.h>
 #include <linux/io_uring_types.h>
 
+#include "filetable.h"
 #include "io_uring.h"
 #include "opdef.h"
 #include "tctx.h"
@@ -30,6 +31,8 @@
 #include "eventfd.h"
 #include "msg_ring.h"
 #include "memmap.h"
+#include "zcrx.h"
+#include "query.h"
 
 #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
 				 IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -45,13 +48,9 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
 		nr_args = IORING_OP_LAST;
 
 	size = struct_size(p, ops, nr_args);
-	p = kzalloc(size, GFP_KERNEL);
-	if (!p)
-		return -ENOMEM;
-
-	ret = -EFAULT;
-	if (copy_from_user(p, arg, size))
-		goto out;
+	p = memdup_user(arg, size);
+	if (IS_ERR(p))
+		return PTR_ERR(p);
 	ret = -EINVAL;
 	if (memchr_inv(p, 0, size))
 		goto out;
@@ -104,21 +103,13 @@ static int io_register_personality(struct io_ring_ctx *ctx)
 	return id;
 }
 
-static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
-					   void __user *arg, unsigned int nr_args)
+static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
+					struct io_restriction *restrictions)
 {
 	struct io_uring_restriction *res;
 	size_t size;
 	int i, ret;
 
-	/* Restrictions allowed only if rings started disabled */
-	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
-		return -EBADFD;
-
-	/* We allow only a single restrictions registration */
-	if (ctx->restrictions.registered)
-		return -EBUSY;
-
 	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
 		return -EINVAL;
 
@@ -130,47 +121,57 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
 	if (IS_ERR(res))
 		return PTR_ERR(res);
 
-	ret = 0;
+	ret = -EINVAL;
 	for (i = 0; i < nr_args; i++) {
 		switch (res[i].opcode) {
 		case IORING_RESTRICTION_REGISTER_OP:
-			if (res[i].register_op >= IORING_REGISTER_LAST) {
-				ret = -EINVAL;
-				goto out;
-			}
-
-			__set_bit(res[i].register_op,
-				  ctx->restrictions.register_op);
+			if (res[i].register_op >= IORING_REGISTER_LAST)
+				goto err;
+			__set_bit(res[i].register_op, restrictions->register_op);
 			break;
 		case IORING_RESTRICTION_SQE_OP:
-			if (res[i].sqe_op >= IORING_OP_LAST) {
-				ret = -EINVAL;
-				goto out;
-			}
-
-			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
+			if (res[i].sqe_op >= IORING_OP_LAST)
+				goto err;
+			__set_bit(res[i].sqe_op, restrictions->sqe_op);
 			break;
 		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
-			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
+			restrictions->sqe_flags_allowed = res[i].sqe_flags;
 			break;
 		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
-			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
+			restrictions->sqe_flags_required = res[i].sqe_flags;
 			break;
 		default:
-			ret = -EINVAL;
-			goto out;
+			goto err;
 		}
 	}
 
-out:
+	ret = 0;
+
+err:
+	kfree(res);
+	return ret;
+}
+
+static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
+					   void __user *arg, unsigned int nr_args)
+{
+	int ret;
+
+	/* Restrictions allowed only if rings started disabled */
+	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
+		return -EBADFD;
+
+	/* We allow only a single restrictions registration */
+	if (ctx->restrictions.registered)
+		return -EBUSY;
+
+	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
 	/* Reset all restrictions if an error happened */
 	if (ret != 0)
 		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
 	else
 		ctx->restrictions.registered = true;
-
-	kfree(res);
 	return ret;
 }
 
@@ -270,6 +271,8 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
 	if (ctx->flags & IORING_SETUP_SQPOLL) {
 		sqd = ctx->sq_data;
 		if (sqd) {
+			struct task_struct *tsk;
+
 			/*
 			 * Observe the correct sqd->lock -> ctx->uring_lock
 			 * ordering. Fine to drop uring_lock here, we hold
@@ -279,8 +282,9 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
 			mutex_unlock(&ctx->uring_lock);
 			mutex_lock(&sqd->lock);
 			mutex_lock(&ctx->uring_lock);
-			if (sqd->thread)
-				tctx = sqd->thread->io_uring;
+			tsk = sqpoll_task_locked(sqd);
+			if (tsk)
+				tctx = tsk->io_uring;
 		}
 	} else {
 		tctx = current->io_uring;
@@ -367,28 +371,18 @@ static int io_register_clock(struct io_ring_ctx *ctx,
  * either mapping or freeing.
  */
 struct io_ring_ctx_rings {
-	unsigned short n_ring_pages;
-	unsigned short n_sqe_pages;
-	struct page **ring_pages;
-	struct page **sqe_pages;
-	struct io_uring_sqe *sq_sqes;
 	struct io_rings *rings;
+	struct io_uring_sqe *sq_sqes;
+
+	struct io_mapped_region sq_region;
+	struct io_mapped_region ring_region;
 };
 
-static void io_register_free_rings(struct io_uring_params *p,
+static void io_register_free_rings(struct io_ring_ctx *ctx,
 				   struct io_ring_ctx_rings *r)
 {
-	if (!(p->flags & IORING_SETUP_NO_MMAP)) {
-		io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages,
-				true);
-		io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages,
-				true);
-	} else {
-		io_pages_free(&r->ring_pages, r->n_ring_pages);
-		io_pages_free(&r->sqe_pages, r->n_sqe_pages);
-		vunmap(r->rings);
-		vunmap(r->sq_sqes);
-	}
+	io_free_region(ctx->user, &r->sq_region);
+	io_free_region(ctx->user, &r->ring_region);
 }
 
 #define swap_old(ctx, o, n, field)					\
@@ -399,82 +393,78 @@ static void io_register_free_rings(struct io_uring_params *p,
 
 #define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
 #define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
-			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)
+			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
+			 IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED)
 
 static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 {
+	struct io_ctx_config config;
+	struct io_uring_region_desc rd;
 	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
-	size_t size, sq_array_offset;
-	struct io_uring_params p;
-	unsigned i, tail;
-	void *ptr;
+	unsigned i, tail, old_head;
+	struct io_uring_params *p = &config.p;
+	struct io_rings_layout *rl = &config.layout;
 	int ret;
 
-	/* for single issuer, must be owner resizing */
-	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
-	    current != ctx->submitter_task)
-		return -EEXIST;
-	if (copy_from_user(&p, arg, sizeof(p)))
+	memset(&config, 0, sizeof(config));
+
+	/* limited to DEFER_TASKRUN for now */
+	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+		return -EINVAL;
+	if (copy_from_user(p, arg, sizeof(*p)))
 		return -EFAULT;
-	if (p.flags & ~RESIZE_FLAGS)
+	if (p->flags & ~RESIZE_FLAGS)
 		return -EINVAL;
 
 	/* properties that are always inherited */
-	p.flags |= (ctx->flags & COPY_FLAGS);
+	p->flags |= (ctx->flags & COPY_FLAGS);
 
-	ret = io_uring_fill_params(p.sq_entries, &p);
+	ret = io_prepare_config(&config);
 	if (unlikely(ret))
 		return ret;
 
-	/* nothing to do, but copy params back */
-	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
-		if (copy_to_user(arg, &p, sizeof(p)))
-			return -EFAULT;
-		return 0;
+	memset(&rd, 0, sizeof(rd));
+	rd.size = PAGE_ALIGN(rl->rings_size);
+	if (p->flags & IORING_SETUP_NO_MMAP) {
+		rd.user_addr = p->cq_off.user_addr;
+		rd.flags |= IORING_MEM_REGION_TYPE_USER;
 	}
+	ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
+	if (ret)
+		return ret;
 
-	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
-			  &sq_array_offset);
-	if (size == SIZE_MAX)
-		return -EOVERFLOW;
+	n.rings = io_region_get_ptr(&n.ring_region);
 
-	if (!(p.flags & IORING_SETUP_NO_MMAP))
-		n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
-	else
-		n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
-					 p.cq_off.user_addr, size);
-	if (IS_ERR(n.rings))
-		return PTR_ERR(n.rings);
-
-	n.rings->sq_ring_mask = p.sq_entries - 1;
-	n.rings->cq_ring_mask = p.cq_entries - 1;
-	n.rings->sq_ring_entries = p.sq_entries;
-	n.rings->cq_ring_entries = p.cq_entries;
-
-	if (copy_to_user(arg, &p, sizeof(p))) {
-		io_register_free_rings(&p, &n);
+	/*
+	 * At this point n.rings is shared with userspace, just like o.rings
+	 * is as well. While we don't expect userspace to modify it while
+	 * a resize is in progress, and it's most likely that userspace will
+	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
+	 * shared nature of it.
+	 */
+	WRITE_ONCE(n.rings->sq_ring_mask, p->sq_entries - 1);
+	WRITE_ONCE(n.rings->cq_ring_mask, p->cq_entries - 1);
+	WRITE_ONCE(n.rings->sq_ring_entries, p->sq_entries);
+	WRITE_ONCE(n.rings->cq_ring_entries, p->cq_entries);
+
+	if (copy_to_user(arg, p, sizeof(*p))) {
+		io_register_free_rings(ctx, &n);
 		return -EFAULT;
 	}
 
-	if (p.flags & IORING_SETUP_SQE128)
-		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
-	else
-		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
-	if (size == SIZE_MAX) {
-		io_register_free_rings(&p, &n);
-		return -EOVERFLOW;
+	memset(&rd, 0, sizeof(rd));
+	rd.size = PAGE_ALIGN(rl->sq_size);
+	if (p->flags & IORING_SETUP_NO_MMAP) {
+		rd.user_addr = p->sq_off.user_addr;
+		rd.flags |= IORING_MEM_REGION_TYPE_USER;
 	}
-
-	if (!(p.flags & IORING_SETUP_NO_MMAP))
-		ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size);
-	else
-		ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages,
-				     p.sq_off.user_addr,
-				     size);
-	if (IS_ERR(ptr)) {
-		io_register_free_rings(&p, &n);
-		return PTR_ERR(ptr);
+	ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
+	if (ret) {
+		io_register_free_rings(ctx, &n);
+		return ret;
 	}
+	n.sq_sqes = io_region_get_ptr(&n.sq_region);
 
 	/*
 	 * If using SQPOLL, park the thread
@@ -486,15 +476,15 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 	}
 
 	/*
-	 * We'll do the swap. Grab the ctx->resize_lock, which will exclude
+	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
 	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
 	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
 	 * existing rings beyond this point will fail. Not that it could proceed
 	 * at this point anyway, as the io_uring mmap side needs go grab the
-	 * ctx->resize_lock as well. Likewise, hold the completion lock over the
+	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
 	 * duration of the actual swap.
 	 */
-	mutex_lock(&ctx->resize_lock);
+	mutex_lock(&ctx->mmap_lock);
 	spin_lock(&ctx->completion_lock);
 	o.rings = ctx->rings;
 	ctx->rings = NULL;
@@ -505,21 +495,22 @@
 	 * Now copy SQ and CQ entries, if any. If either of the destination
 	 * rings can't hold what is already there, then fail the operation.
 	 */
-	n.sq_sqes = ptr;
-	tail = o.rings->sq.tail;
-	if (tail - o.rings->sq.head > p.sq_entries)
+	tail = READ_ONCE(o.rings->sq.tail);
+	old_head = READ_ONCE(o.rings->sq.head);
+	if (tail - old_head > p->sq_entries)
 		goto overflow;
-	for (i = o.rings->sq.head; i < tail; i++) {
+	for (i = old_head; i < tail; i++) {
 		unsigned src_head = i & (ctx->sq_entries - 1);
-		unsigned dst_head = i & n.rings->sq_ring_mask;
+		unsigned dst_head = i & (p->sq_entries - 1);
 
 		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
 	}
-	n.rings->sq.head = o.rings->sq.head;
-	n.rings->sq.tail = o.rings->sq.tail;
+	WRITE_ONCE(n.rings->sq.head, old_head);
+	WRITE_ONCE(n.rings->sq.tail, tail);
 
-	tail = o.rings->cq.tail;
-	if (tail - o.rings->cq.head > p.cq_entries) {
+	tail = READ_ONCE(o.rings->cq.tail);
+	old_head = READ_ONCE(o.rings->cq.head);
+	if (tail - old_head > p->cq_entries) {
 overflow:
 		/* restore old rings, and return -EOVERFLOW via cleanup path */
 		ctx->rings = o.rings;
@@ -528,41 +519,39 @@ overflow:
 		ret = -EOVERFLOW;
 		goto out;
 	}
-	for (i = o.rings->cq.head; i < tail; i++) {
+	for (i = old_head; i < tail; i++) {
 		unsigned src_head = i & (ctx->cq_entries - 1);
-		unsigned dst_head = i & n.rings->cq_ring_mask;
+		unsigned dst_head = i & (p->cq_entries - 1);
 
 		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
 	}
-	n.rings->cq.head = o.rings->cq.head;
-	n.rings->cq.tail = o.rings->cq.tail;
+	WRITE_ONCE(n.rings->cq.head, old_head);
+	WRITE_ONCE(n.rings->cq.tail, tail);
 	/* invalidate cached cqe refill */
 	ctx->cqe_cached = ctx->cqe_sentinel = NULL;
 
-	n.rings->sq_dropped = o.rings->sq_dropped;
-	n.rings->sq_flags = o.rings->sq_flags;
-	n.rings->cq_flags = o.rings->cq_flags;
-	n.rings->cq_overflow = o.rings->cq_overflow;
+	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
+	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
+	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
+	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));
 
 	/* all done, store old pointers and assign new ones */
 	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
-		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);
+		ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset);
 
-	ctx->sq_entries = p.sq_entries;
-	ctx->cq_entries = p.cq_entries;
+	ctx->sq_entries = p->sq_entries;
+	ctx->cq_entries = p->cq_entries;
 
 	ctx->rings = n.rings;
 	ctx->sq_sqes = n.sq_sqes;
 
-	swap_old(ctx, o, n, n_ring_pages);
-	swap_old(ctx, o, n, n_sqe_pages);
-	swap_old(ctx, o, n, ring_pages);
-	swap_old(ctx, o, n, sqe_pages);
+	swap_old(ctx, o, n, ring_region);
+	swap_old(ctx, o, n, sq_region);
 	to_free = &o;
 	ret = 0;
 out:
 	spin_unlock(&ctx->completion_lock);
-	mutex_unlock(&ctx->resize_lock);
-	io_register_free_rings(&p, to_free);
+	mutex_unlock(&ctx->mmap_lock);
+	io_register_free_rings(ctx, to_free);
 
 	if (ctx->sq_data)
 		io_sq_thread_unpark(ctx->sq_data);
@@ -576,6 +565,7 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
 	struct io_uring_mem_region_reg reg;
 	struct io_uring_region_desc __user *rd_uptr;
 	struct io_uring_region_desc rd;
+	struct io_mapped_region region = {};
 	int ret;
 
 	if (io_region_is_set(&ctx->param_region))
@@ -585,7 +575,6 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
 	rd_uptr = u64_to_user_ptr(reg.region_uptr);
 	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
 		return -EFAULT;
-
 	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
 		return -EINVAL;
 	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
@@ -600,18 +589,20 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
 	    !(ctx->flags & IORING_SETUP_R_DISABLED))
 		return -EINVAL;
 
-	ret = io_create_region(ctx, &ctx->param_region, &rd);
+	ret = io_create_region(ctx, &region, &rd, IORING_MAP_OFF_PARAM_REGION);
 	if (ret)
 		return ret;
 	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
-		io_free_region(ctx, &ctx->param_region);
+		io_free_region(ctx->user, &region);
 		return -EFAULT;
 	}
 
 	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
-		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
+		ctx->cq_wait_arg = io_region_get_ptr(&region);
 		ctx->cq_wait_size = rd.size;
 	}
+
+	io_region_publish(ctx, &region, &ctx->param_region);
 	return 0;
 }
@@ -803,6 +794,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_clone_buffers(ctx, arg);
 		break;
+	case IORING_REGISTER_ZCRX_IFQ:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_zcrx_ifq(ctx, arg);
+		break;
 	case IORING_REGISTER_RESIZE_RINGS:
 		ret = -EINVAL;
 		if (!arg || nr_args != 1)
@@ -815,6 +812,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_mem_region(ctx, arg);
 		break;
+	case IORING_REGISTER_QUERY:
+		ret = io_query(arg, nr_args);
+		break;
+	case IORING_REGISTER_ZCRX_CTRL:
+		ret = io_zcrx_ctrl(ctx, arg, nr_args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -843,6 +846,8 @@ struct file *io_uring_register_get_file(unsigned int fd, bool registered)
 			return ERR_PTR(-EINVAL);
 		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
 		file = tctx->registered_rings[fd];
+		if (file)
+			get_file(file);
 	} else {
 		file = fget(fd);
 	}
@@ -855,6 +860,23 @@ struct file *io_uring_register_get_file(unsigned int fd, bool registered)
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
+static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
+{
+	struct io_uring_sqe sqe;
+
+	if (!arg || nr_args != 1)
+		return -EINVAL;
+	if (copy_from_user(&sqe, arg, sizeof(sqe)))
+		return -EFAULT;
+	/* no flags supported */
+	if (sqe.flags)
+		return -EINVAL;
+	if (sqe.opcode != IORING_OP_MSG_RING)
+		return -EINVAL;
+
+	return io_uring_sync_msg_ring(&sqe);
+}
+
 /*
  * "blind" registration opcodes are ones where there's no ring given, and
  * hence the source fd must be -1.
@@ -863,21 +885,11 @@ static int io_uring_register_blind(unsigned int opcode, void __user *arg,
 				   unsigned int nr_args)
 {
 	switch (opcode) {
-	case IORING_REGISTER_SEND_MSG_RING: {
-		struct io_uring_sqe sqe;
-
-		if (!arg || nr_args != 1)
-			return -EINVAL;
-		if (copy_from_user(&sqe, arg, sizeof(sqe)))
-			return -EFAULT;
-		/* no flags supported */
-		if (sqe.flags)
-			return -EINVAL;
-		if (sqe.opcode == IORING_OP_MSG_RING)
-			return io_uring_sync_msg_ring(&sqe);
-	}
+	case IORING_REGISTER_SEND_MSG_RING:
+		return io_uring_register_send_msg_ring(arg, nr_args);
+	case IORING_REGISTER_QUERY:
+		return io_query(arg, nr_args);
 	}
-
 	return -EINVAL;
 }
@@ -905,10 +917,11 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 
 	mutex_lock(&ctx->uring_lock);
 	ret = __io_uring_register(ctx, opcode, arg, nr_args);
-	mutex_unlock(&ctx->uring_lock);
+
 	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
 				ctx->buf_table.nr, ret);
-	if (!use_registered_ring)
-		fput(file);
+	mutex_unlock(&ctx->uring_lock);
+
+	fput(file);
 	return ret;
 }
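
Note on the io_probe() hunk above: it swaps an open-coded kzalloc() + copy_from_user() pair for memdup_user(), which allocates and copies in one call and reports failure as an ERR_PTR, so -ENOMEM and -EFAULT no longer need separate unwind paths. A minimal standalone sketch of the same pattern, assuming a hypothetical handle_user_blob() helper (not part of this patch):

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/uaccess.h>

/* Hypothetical helper mirroring the converted io_probe() flow. */
static int handle_user_blob(const void __user *arg, size_t size)
{
	void *p;
	int ret;

	p = memdup_user(arg, size);	/* alloc + copy in one call */
	if (IS_ERR(p))
		return PTR_ERR(p);	/* -ENOMEM or -EFAULT, as before */

	/* as in io_probe(): reject input that userspace didn't zero */
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	ret = 0;	/* ... real work on 'p' would go here ... */
out:
	kfree(p);
	return ret;
}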
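The restrictions refactor is likewise internal only: io_parse_restrictions() now fills a caller-supplied struct io_restriction, while io_register_restrictions() keeps the IORING_SETUP_R_DISABLED and single-registration (-EBUSY) gating, and any parse error still resets the whole restriction set. The userspace ABI is unchanged; for context, a hedged sketch of how a ring created with IORING_SETUP_R_DISABLED might register a restriction set via the raw syscall (restrict_to_nop() is illustrative only, error handling trimmed):

#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* Sketch: allow only IORING_OP_NOP SQEs on 'ring_fd', which is assumed
 * to have been set up with IORING_SETUP_R_DISABLED. */
static int restrict_to_nop(int ring_fd)
{
	struct io_uring_restriction res[1];
	long ret;

	memset(res, 0, sizeof(res));
	res[0].opcode = IORING_RESTRICTION_SQE_OP;
	res[0].sqe_op = IORING_OP_NOP;

	/* only one restrictions registration is allowed (-EBUSY after) */
	ret = syscall(__NR_io_uring_register, ring_fd,
		      IORING_REGISTER_RESTRICTIONS, res, 1);
	if (ret < 0)
		return -errno;

	/* start the ring; the restrictions now apply */
	ret = syscall(__NR_io_uring_register, ring_fd,
		      IORING_REGISTER_ENABLE_RINGS, NULL, 0);
	return ret < 0 ? -errno : 0;
}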
