Diffstat (limited to 'io_uring/io_uring.c')
| -rw-r--r-- | io_uring/io_uring.c | 1621 |
1 file changed, 784 insertions, 837 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 801293399883..5d130c578435 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -29,7 +29,7 @@ * * Also see the examples in the liburing library: * - * git://git.kernel.dk/liburing + * git://git.kernel.org/pub/scm/linux/kernel/git/axboe/liburing.git * * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens * from data shared between the kernel and application. This is done both @@ -79,6 +79,7 @@ #include "io-wq.h" +#include "filetable.h" #include "io_uring.h" #include "opdef.h" #include "refs.h" @@ -97,6 +98,7 @@ #include "uring_cmd.h" #include "msg_ring.h" #include "memmap.h" +#include "zcrx.h" #include "timeout.h" #include "poll.h" @@ -107,30 +109,23 @@ #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) -#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ - IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) +#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ - REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ - REQ_F_ASYNC_DATA) + REQ_F_INFLIGHT | REQ_F_CREDS | REQ_F_ASYNC_DATA) -#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ +#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | IO_REQ_LINK_FLAGS | \ + REQ_F_REISSUE | REQ_F_POLLED | \ IO_REQ_CLEAN_FLAGS) #define IO_TCTX_REFS_CACHE_NR (1U << 10) #define IO_COMPL_BATCH 32 #define IO_REQ_ALLOC_BATCH 8 - -struct io_defer_entry { - struct list_head list; - struct io_kiocb *req; - u32 seq; -}; +#define IO_LOCAL_TW_DEFAULT_MAX 20 /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) -#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) /* * No waiters. 
It's larger than any valid value of the tw counter @@ -140,11 +135,8 @@ struct io_defer_entry { /* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ #define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) -static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct io_uring_task *tctx, - bool cancel_all); - -static void io_queue_sqe(struct io_kiocb *req); +static void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags); +static void __io_req_caches_free(struct io_ring_ctx *ctx); static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray); @@ -155,7 +147,7 @@ static int __read_mostly sysctl_io_uring_disabled; static int __read_mostly sysctl_io_uring_group = -1; #ifdef CONFIG_SYSCTL -static struct ctl_table kernel_io_uring_disabled_table[] = { +static const struct ctl_table kernel_io_uring_disabled_table[] = { { .procname = "io_uring_disabled", .data = &sysctl_io_uring_disabled, @@ -175,52 +167,34 @@ static struct ctl_table kernel_io_uring_disabled_table[] = { }; #endif -static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) +static void io_poison_cached_req(struct io_kiocb *req) { - return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); + req->ctx = IO_URING_PTR_POISON; + req->tctx = IO_URING_PTR_POISON; + req->file = IO_URING_PTR_POISON; + req->creds = IO_URING_PTR_POISON; + req->io_task_work.func = IO_URING_PTR_POISON; + req->apoll = IO_URING_PTR_POISON; } -static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx) +static void io_poison_req(struct io_kiocb *req) { - return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head); + io_poison_cached_req(req); + req->async_data = IO_URING_PTR_POISON; + req->kbuf = IO_URING_PTR_POISON; + req->comp_list.next = IO_URING_PTR_POISON; + req->file_node = IO_URING_PTR_POISON; + req->link = IO_URING_PTR_POISON; } -static bool io_match_linked(struct io_kiocb *head) +static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) { - struct io_kiocb *req; - - io_for_each_link(req, head) { - if (req->flags & REQ_F_INFLIGHT) - return true; - } - return false; + return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); } -/* - * As io_match_task() but protected against racing with linked timeouts. - * User must not hold timeout_lock. 
- */ -bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, - bool cancel_all) +static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx) { - bool matched; - - if (tctx && head->tctx != tctx) - return false; - if (cancel_all) - return true; - - if (head->flags & REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = head->ctx; - - /* protect against races with linked timeouts */ - spin_lock_irq(&ctx->timeout_lock); - matched = io_match_linked(head); - spin_unlock_irq(&ctx->timeout_lock); - } else { - matched = io_match_linked(head); - } - return matched; + return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head); } static inline void req_fail_link_node(struct io_kiocb *req, int res) @@ -231,6 +205,8 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res) static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) { + if (IS_ENABLED(CONFIG_KASAN)) + io_poison_cached_req(req); wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); } @@ -241,6 +217,20 @@ static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) complete(&ctx->ref_comp); } +/* + * Terminate the request if either of these conditions are true: + * + * 1) It's being executed by the original task, but that task is marked + * with PF_EXITING as it's exiting. + * 2) PF_KTHREAD is set, in which case the invoker of the task_work is + * our fallback task_work. + * 3) The ring has been closed and is going away. + */ +static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) +{ + return (current->flags & (PF_EXITING | PF_KTHREAD)) || percpu_ref_is_dying(&ctx->refs); +} + static __cold void io_fallback_req_func(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, @@ -251,8 +241,9 @@ static __cold void io_fallback_req_func(struct work_struct *work) percpu_ref_get(&ctx->refs); mutex_lock(&ctx->uring_lock); + ts.cancel = io_should_terminate_tw(ctx); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) - req->io_task_work.func(req, &ts); + req->io_task_work.func((struct io_tw_req){req}, ts); io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); @@ -280,6 +271,16 @@ static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) return 0; } +static void io_free_alloc_caches(struct io_ring_ctx *ctx) +{ + io_alloc_cache_free(&ctx->apoll_cache, kfree); + io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); + io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free); + io_futex_cache_free(ctx); + io_rsrc_cache_free(ctx); +} + static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; @@ -311,19 +312,19 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); - INIT_LIST_HEAD(&ctx->io_buffers_cache); ret = io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, - sizeof(struct async_poll)); + sizeof(struct async_poll), 0); ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_async_msghdr)); + sizeof(struct io_async_msghdr), + offsetof(struct io_async_msghdr, clear)); ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_async_rw)); - ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct 
uring_cache)); - spin_lock_init(&ctx->msg_lock); - ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_kiocb)); + sizeof(struct io_async_rw), + offsetof(struct io_async_rw, clear)); + ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_async_cmd), + sizeof(struct io_async_cmd)); ret |= io_futex_cache_init(ctx); + ret |= io_rsrc_cache_init(ctx); if (ret) goto free_ref; init_completion(&ctx->ref_comp); @@ -332,9 +333,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->cq_wait); init_waitqueue_head(&ctx->poll_wq); spin_lock_init(&ctx->completion_lock); - spin_lock_init(&ctx->timeout_lock); + raw_spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); - INIT_LIST_HEAD(&ctx->io_buffers_comp); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); @@ -342,6 +342,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; INIT_HLIST_HEAD(&ctx->waitid_list); + xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC); #ifdef CONFIG_FUTEX INIT_HLIST_HEAD(&ctx->futex_list); #endif @@ -349,51 +350,24 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_WQ_LIST(&ctx->submit_state.compl_reqs); INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); io_napi_init(ctx); - mutex_init(&ctx->resize_lock); + mutex_init(&ctx->mmap_lock); return ctx; free_ref: percpu_ref_exit(&ctx->refs); err: - io_alloc_cache_free(&ctx->apoll_cache, kfree); - io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); - io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); - io_alloc_cache_free(&ctx->uring_cache, kfree); - io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); - io_futex_cache_free(ctx); + io_free_alloc_caches(ctx); kvfree(ctx->cancel_table.hbs); xa_destroy(&ctx->io_bl_xa); kfree(ctx); return NULL; } -static void io_account_cq_overflow(struct io_ring_ctx *ctx) -{ - struct io_rings *r = ctx->rings; - - WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); - ctx->cq_extra--; -} - -static bool req_need_defer(struct io_kiocb *req, u32 seq) -{ - if (unlikely(req->flags & REQ_F_IO_DRAIN)) { - struct io_ring_ctx *ctx = req->ctx; - - return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; - } - - return false; -} - static void io_clean_op(struct io_kiocb *req) { - if (req->flags & REQ_F_BUFFER_SELECTED) { - spin_lock(&req->ctx->completion_lock); - io_kbuf_drop(req); - spin_unlock(&req->ctx->completion_lock); - } + if (unlikely(req->flags & REQ_F_BUFFER_SELECTED)) + io_kbuf_drop_legacy(req); if (req->flags & REQ_F_NEED_CLEANUP) { const struct io_cold_def *def = &io_cold_defs[req->opcode]; @@ -401,11 +375,6 @@ static void io_clean_op(struct io_kiocb *req) if (def->cleanup) def->cleanup(req); } - if ((req->flags & REQ_F_POLLED) && req->apoll) { - kfree(req->apoll->double_poll); - kfree(req->apoll); - req->apoll = NULL; - } if (req->flags & REQ_F_INFLIGHT) atomic_dec(&req->tctx->inflight_tracked); if (req->flags & REQ_F_CREDS) @@ -417,7 +386,12 @@ static void io_clean_op(struct io_kiocb *req) req->flags &= ~IO_REQ_CLEAN_FLAGS; } -static inline void io_req_track_inflight(struct io_kiocb *req) +/* + * Mark the request as inflight, so that file cancelation will find it. + * Can be used if the file is an io_uring instance, or if the request itself + * relies on ->mm being alive for the duration of the request. 
+ */ +inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { req->flags |= REQ_F_INFLIGHT; @@ -439,24 +413,6 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) return req->link; } -static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) -{ - if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) - return NULL; - return __io_prep_linked_timeout(req); -} - -static noinline void __io_arm_ltimeout(struct io_kiocb *req) -{ - io_queue_linked_timeout(__io_prep_linked_timeout(req)); -} - -static inline void io_arm_ltimeout(struct io_kiocb *req) -{ - if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) - __io_arm_ltimeout(req); -} - static void io_prep_async_work(struct io_kiocb *req) { const struct io_issue_def *def = &io_issue_defs[req->opcode]; @@ -497,10 +453,10 @@ static void io_prep_async_link(struct io_kiocb *req) if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); io_for_each_link(cur, req) io_prep_async_work(cur); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } else { io_for_each_link(cur, req) io_prep_async_work(cur); @@ -509,11 +465,14 @@ static void io_prep_async_link(struct io_kiocb *req) static void io_queue_iowq(struct io_kiocb *req) { - struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->tctx; BUG_ON(!tctx); - BUG_ON(!tctx->io_wq); + + if ((current->flags & PF_KTHREAD) || !tctx->io_wq) { + io_req_task_queue_fail(req, -ECANCELED); + return; + } /* init ->work of the whole link before punting */ io_prep_async_link(req); @@ -530,13 +489,11 @@ static void io_queue_iowq(struct io_kiocb *req) trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); - if (link) - io_queue_linked_timeout(link); } -static void io_req_queue_iowq_tw(struct io_kiocb *req, struct io_tw_state *ts) +static void io_req_queue_iowq_tw(struct io_tw_req tw_req, io_tw_token_t tw) { - io_queue_iowq(req); + io_queue_iowq(tw_req.req); } void io_req_queue_iowq(struct io_kiocb *req) @@ -545,17 +502,36 @@ void io_req_queue_iowq(struct io_kiocb *req) io_req_task_work_add(req); } -static __cold void io_queue_deferred(struct io_ring_ctx *ctx) +unsigned io_linked_nr(struct io_kiocb *req) +{ + struct io_kiocb *tmp; + unsigned nr = 0; + + io_for_each_link(tmp, req) + nr++; + return nr; +} + +static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) { + bool drain_seen = false, first = true; + + lockdep_assert_held(&ctx->uring_lock); + __io_req_caches_free(ctx); + while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); - if (req_need_defer(de->req, de->seq)) - break; + drain_seen |= de->req->flags & REQ_F_IO_DRAIN; + if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained) + return; + list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); io_req_task_queue(de->req); kfree(de); + first = false; } } @@ -565,13 +541,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); - if (ctx->drain_active) { - spin_lock(&ctx->completion_lock); - io_queue_deferred(ctx); - spin_unlock(&ctx->completion_lock); - } if (ctx->has_evfd) - io_eventfd_flush_signal(ctx); + io_eventfd_signal(ctx, true); } static inline void __io_cq_lock(struct io_ring_ctx *ctx) @@ -610,27 
+581,31 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx) static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) { - size_t cqe_size = sizeof(struct io_uring_cqe); - lockdep_assert_held(&ctx->uring_lock); /* don't abort if we're dying, entries must get freed */ if (!dying && __io_cqring_events(ctx) == ctx->cq_entries) return; - if (ctx->flags & IORING_SETUP_CQE32) - cqe_size <<= 1; - io_cq_lock(ctx); while (!list_empty(&ctx->cq_overflow_list)) { + size_t cqe_size = sizeof(struct io_uring_cqe); struct io_uring_cqe *cqe; struct io_overflow_cqe *ocqe; + bool is_cqe32 = false; ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); + if (ocqe->cqe.flags & IORING_CQE_F_32 || + ctx->flags & IORING_SETUP_CQE32) { + is_cqe32 = true; + cqe_size <<= 1; + } + if (ctx->flags & IORING_SETUP_CQE32) + is_cqe32 = false; if (!dying) { - if (!io_get_cqe_overflow(ctx, &cqe, true)) + if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32)) break; memcpy(cqe, &ocqe->cqe, cqe_size); } @@ -645,6 +620,7 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) * to care for a non-real case. */ if (need_resched()) { + ctx->cqe_sentinel = ctx->cqe_cached; io_cq_unlock_post(ctx); mutex_unlock(&ctx->uring_lock); cond_resched(); @@ -697,7 +673,7 @@ void io_task_refs_refill(struct io_uring_task *tctx) tctx->cached_refs += refill; } -static __cold void io_uring_drop_tctx_refs(struct task_struct *task) +__cold void io_uring_drop_tctx_refs(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; unsigned int refs = tctx->cached_refs; @@ -709,27 +685,20 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } } -static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags, u64 extra1, u64 extra2) +static __cold bool io_cqring_add_overflow(struct io_ring_ctx *ctx, + struct io_overflow_cqe *ocqe) { - struct io_overflow_cqe *ocqe; - size_t ocq_size = sizeof(struct io_overflow_cqe); - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); - lockdep_assert_held(&ctx->completion_lock); - if (is_cqe32) - ocq_size += sizeof(struct io_uring_cqe); - - ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); - trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); if (!ocqe) { + struct io_rings *r = ctx->rings; + /* * If we're in ring overflow flush mode, or in task cancel mode, * or cannot allocate an overflow entry, then we need to drop it * on the floor. 
*/ - io_account_cq_overflow(ctx); + WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); return false; } @@ -738,23 +707,55 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } - ocqe->cqe.user_data = user_data; - ocqe->cqe.res = res; - ocqe->cqe.flags = cflags; - if (is_cqe32) { - ocqe->cqe.big_cqe[0] = extra1; - ocqe->cqe.big_cqe[1] = extra2; - } list_add_tail(&ocqe->list, &ctx->cq_overflow_list); return true; } -static void io_req_cqe_overflow(struct io_kiocb *req) +static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, + struct io_cqe *cqe, + struct io_big_cqe *big_cqe, gfp_t gfp) { - io_cqring_event_overflow(req->ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->big_cqe.extra1, req->big_cqe.extra2); - memset(&req->big_cqe, 0, sizeof(req->big_cqe)); + struct io_overflow_cqe *ocqe; + size_t ocq_size = sizeof(struct io_overflow_cqe); + bool is_cqe32 = false; + + if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) { + is_cqe32 = true; + ocq_size += sizeof(struct io_uring_cqe); + } + + ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT); + trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe); + if (ocqe) { + ocqe->cqe.user_data = cqe->user_data; + ocqe->cqe.res = cqe->res; + ocqe->cqe.flags = cqe->flags; + if (is_cqe32 && big_cqe) { + ocqe->cqe.big_cqe[0] = big_cqe->extra1; + ocqe->cqe.big_cqe[1] = big_cqe->extra2; + } + } + if (big_cqe) + big_cqe->extra1 = big_cqe->extra2 = 0; + return ocqe; +} + +/* + * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE + * because the ring is a single 16b entry away from wrapping. + */ +static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off) +{ + if (__io_cqring_events(ctx) < ctx->cq_entries) { + struct io_uring_cqe *cqe = &ctx->rings->cqes[off]; + + cqe->user_data = 0; + cqe->res = 0; + cqe->flags = IORING_CQE_F_SKIP; + ctx->cached_cq_tail++; + return true; + } + return false; } /* @@ -762,7 +763,7 @@ static void io_req_cqe_overflow(struct io_kiocb *req) * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ -bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) +bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); @@ -776,12 +777,22 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))) return false; + /* + * Post dummy CQE if a 32b CQE is needed and there's only room for a + * 16b CQE before the ring wraps. 
+ */ + if (cqe32 && off + 1 == ctx->cq_entries) { + if (!io_fill_nop_cqe(ctx, off)) + return false; + off = 0; + } + /* userspace may cheat modifying the tail, be safe and do min */ queued = min(__io_cqring_events(ctx), ctx->cq_entries); free = ctx->cq_entries - queued; /* we need a contiguous range, limit based on the current array offset */ len = min(free, ctx->cq_entries - off); - if (!len) + if (len < (cqe32 + 1)) return false; if (ctx->flags & IORING_SETUP_CQE32) { @@ -794,24 +805,33 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) return true; } +static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx, + struct io_uring_cqe src_cqe[2]) +{ + struct io_uring_cqe *cqe; + + if (WARN_ON_ONCE(!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))) + return false; + if (unlikely(!io_get_cqe(ctx, &cqe, true))) + return false; + + memcpy(cqe, src_cqe, 2 * sizeof(*cqe)); + trace_io_uring_complete(ctx, NULL, cqe); + return true; +} + static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { + bool cqe32 = cflags & IORING_CQE_F_32; struct io_uring_cqe *cqe; - ctx->cq_extra++; - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ - if (likely(io_get_cqe(ctx, &cqe))) { + if (likely(io_get_cqe(ctx, &cqe, cqe32))) { WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - if (ctx->flags & IORING_SETUP_CQE32) { + if (cqe32) { WRITE_ONCE(cqe->big_cqe[0], 0); WRITE_ONCE(cqe->big_cqe[1], 0); } @@ -822,16 +842,30 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, return false; } -static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, - u32 cflags) +static inline struct io_cqe io_init_cqe(u64 user_data, s32 res, u32 cflags) { - bool filled; + return (struct io_cqe) { .user_data = user_data, .res = res, .flags = cflags }; +} - filled = io_fill_cqe_aux(ctx, user_data, res, cflags); - if (!filled) - filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); +static __cold void io_cqe_overflow(struct io_ring_ctx *ctx, struct io_cqe *cqe, + struct io_big_cqe *big_cqe) +{ + struct io_overflow_cqe *ocqe; - return filled; + ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_KERNEL); + spin_lock(&ctx->completion_lock); + io_cqring_add_overflow(ctx, ocqe); + spin_unlock(&ctx->completion_lock); +} + +static __cold bool io_cqe_overflow_locked(struct io_ring_ctx *ctx, + struct io_cqe *cqe, + struct io_big_cqe *big_cqe) +{ + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_ATOMIC); + return io_cqring_add_overflow(ctx, ocqe); } bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) @@ -839,30 +873,70 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags bool filled; io_cq_lock(ctx); - filled = __io_post_aux_cqe(ctx, user_data, res, cflags); + filled = io_fill_cqe_aux(ctx, user_data, res, cflags); + if (unlikely(!filled)) { + struct io_cqe cqe = io_init_cqe(user_data, res, cflags); + + filled = io_cqe_overflow_locked(ctx, &cqe, NULL); + } io_cq_unlock_post(ctx); return filled; } /* - * Must be called from inline task_work so we now a flush will happen later, + * Must be called from inline task_work so we know a flush will happen later, * and obviously with ctx->uring_lock held (tw always has that). 
*/ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { + lockdep_assert_held(&ctx->uring_lock); + lockdep_assert(ctx->lockless_cq); + if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { + struct io_cqe cqe = io_init_cqe(user_data, res, cflags); + + io_cqe_overflow(ctx, &cqe, NULL); + } + ctx->submit_state.cq_flush = true; +} + +/* + * A helper for multishot requests posting additional CQEs. + * Should only be used from a task_work including IO_URING_F_MULTISHOT. + */ +bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) +{ + struct io_ring_ctx *ctx = req->ctx; + bool posted; + + /* + * If multishot has already posted deferred completions, ensure that + * those are flushed first before posting this one. If not, CQEs + * could get reordered. + */ + if (!wq_list_empty(&ctx->submit_state.compl_reqs)) + __io_submit_flush_completions(ctx); + + lockdep_assert(!io_wq_current_is_worker()); + lockdep_assert_held(&ctx->uring_lock); + + if (!ctx->lockless_cq) { spin_lock(&ctx->completion_lock); - io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); spin_unlock(&ctx->completion_lock); + } else { + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); } + ctx->submit_state.cq_flush = true; + return posted; } /* * A helper for multishot requests posting additional CQEs. * Should only be used from a task_work including IO_URING_F_MULTISHOT. */ -bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) +bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe cqe[2]) { struct io_ring_ctx *ctx = req->ctx; bool posted; @@ -870,16 +944,23 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) lockdep_assert(!io_wq_current_is_worker()); lockdep_assert_held(&ctx->uring_lock); - __io_cq_lock(ctx); - posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); + cqe[0].user_data = req->cqe.user_data; + if (!ctx->lockless_cq) { + spin_lock(&ctx->completion_lock); + posted = io_fill_cqe_aux32(ctx, cqe); + spin_unlock(&ctx->completion_lock); + } else { + posted = io_fill_cqe_aux32(ctx, cqe); + } + ctx->submit_state.cq_flush = true; - __io_cq_unlock_post(ctx); return posted; } static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; + bool completed = true; /* * All execution paths but io-wq use the deferred completions by @@ -892,19 +973,21 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires * the submitter task context, IOPOLL protects with uring_lock. */ - if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) { + if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) { +defer_complete: req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); return; } io_cq_lock(ctx); - if (!(req->flags & REQ_F_CQE_SKIP)) { - if (!io_fill_cqe_req(ctx, req)) - io_req_cqe_overflow(req); - } + if (!(req->flags & REQ_F_CQE_SKIP)) + completed = io_fill_cqe_req(ctx, req); io_cq_unlock_post(ctx); + if (!completed) + goto defer_complete; + /* * We don't free the request here because we know it's called from * io-wq only, which holds a reference, so it cannot be the last put. 
@@ -920,29 +1003,13 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res) lockdep_assert_held(&req->ctx->uring_lock); req_set_fail(req); - io_req_set_res(req, res, io_put_kbuf(req, res, IO_URING_F_UNLOCKED)); + io_req_set_res(req, res, io_put_kbuf(req, res, NULL)); if (def->fail) def->fail(req); io_req_complete_defer(req); } /* - * Don't initialise the fields below on every allocation, but do that in - * advance and keep them valid across allocations. - */ -static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) -{ - req->ctx = ctx; - req->buf_node = NULL; - req->file_node = NULL; - req->link = NULL; - req->async_data = NULL; - /* not necessary, but safer to zero */ - memset(&req->cqe, 0, sizeof(req->cqe)); - memset(&req->big_cqe, 0, sizeof(req->big_cqe)); -} - -/* * A request might get retired back into the request caches even before opcode * handlers and io_issue_sqe() are done with it, e.g. inline completion path. * Because of that, io_alloc_req() should be called only under ->uring_lock @@ -951,7 +1018,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO; void *reqs[IO_REQ_ALLOC_BATCH]; int ret; @@ -969,10 +1036,11 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) } percpu_ref_get_many(&ctx->refs, ret); + ctx->nr_req_allocated += ret; + while (ret--) { struct io_kiocb *req = reqs[ret]; - io_preinit_req(req, ctx); io_req_add_to_cache(req, ctx); } return true; @@ -1014,7 +1082,7 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) return nxt; } -static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) +static void ctx_flush_and_put(struct io_ring_ctx *ctx, io_tw_token_t tw) { if (!ctx) return; @@ -1044,24 +1112,25 @@ struct llist_node *io_handle_tw_list(struct llist_node *node, io_task_work.node); if (req->ctx != ctx) { - ctx_flush_and_put(ctx, &ts); + ctx_flush_and_put(ctx, ts); ctx = req->ctx; mutex_lock(&ctx->uring_lock); percpu_ref_get(&ctx->refs); + ts.cancel = io_should_terminate_tw(ctx); } INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, - req, &ts); + (struct io_tw_req){req}, ts); node = next; (*count)++; if (unlikely(need_resched())) { - ctx_flush_and_put(ctx, &ts); + ctx_flush_and_put(ctx, ts); ctx = NULL; cond_resched(); } } while (node && *count < max_entries); - ctx_flush_and_put(ctx, &ts); + ctx_flush_and_put(ctx, ts); return node; } @@ -1073,21 +1142,22 @@ static __cold void __io_fallback_tw(struct llist_node *node, bool sync) while (node) { req = container_of(node, struct io_kiocb, io_task_work.node); node = node->next; - if (sync && last_ctx != req->ctx) { + if (last_ctx != req->ctx) { if (last_ctx) { - flush_delayed_work(&last_ctx->fallback_work); + if (sync) + flush_delayed_work(&last_ctx->fallback_work); percpu_ref_put(&last_ctx->refs); } last_ctx = req->ctx; percpu_ref_get(&last_ctx->refs); } - if (llist_add(&req->io_task_work.node, - &req->ctx->fallback_llist)) - schedule_delayed_work(&req->ctx->fallback_work, 1); + if (llist_add(&req->io_task_work.node, &last_ctx->fallback_llist)) + schedule_delayed_work(&last_ctx->fallback_work, 1); } if (last_ctx) { - flush_delayed_work(&last_ctx->fallback_work); + if (sync) + flush_delayed_work(&last_ctx->fallback_work); percpu_ref_put(&last_ctx->refs); } } @@ -1105,11 +1175,6 @@ struct llist_node 
*tctx_task_work_run(struct io_uring_task *tctx, { struct llist_node *node; - if (unlikely(current->flags & PF_EXITING)) { - io_fallback_tw(tctx, true); - return NULL; - } - node = llist_del_all(&tctx->task_list); if (node) { node = llist_reverse_order(node); @@ -1136,10 +1201,9 @@ void tctx_task_work(struct callback_head *cb) WARN_ON_ONCE(ret); } -static inline void io_req_local_work_add(struct io_kiocb *req, - struct io_ring_ctx *ctx, - unsigned flags) +static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { + struct io_ring_ctx *ctx = req->ctx; unsigned nr_wait, nr_tw, nr_tw_prev; struct llist_node *head; @@ -1147,10 +1211,10 @@ static inline void io_req_local_work_add(struct io_kiocb *req, BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES); /* - * We don't know how many reuqests is there in the link and whether + * We don't know how many requests there are in the link and whether * they can even be queued lazily, fall back to non-lazy. */ - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) + if (req->flags & IO_REQ_LINK_FLAGS) flags &= ~IOU_F_TWQ_LAZY_WAKE; guard(rcu)(); @@ -1194,7 +1258,7 @@ static inline void io_req_local_work_add(struct io_kiocb *req, if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (ctx->has_evfd) - io_eventfd_signal(ctx); + io_eventfd_signal(ctx, false); } nr_wait = atomic_read(&ctx->cq_wait_nr); @@ -1221,10 +1285,7 @@ static void io_req_normal_work_add(struct io_kiocb *req) /* SQPOLL doesn't need the task_work added, it'll run it itself */ if (ctx->flags & IORING_SETUP_SQPOLL) { - struct io_sq_data *sqd = ctx->sq_data; - - if (sqd->thread) - __set_notify_signal(sqd->thread); + __set_notify_signal(tctx->task); return; } @@ -1237,17 +1298,16 @@ static void io_req_normal_work_add(struct io_kiocb *req) void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) { if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) - io_req_local_work_add(req, req->ctx, flags); + io_req_local_work_add(req, flags); else io_req_normal_work_add(req); } -void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, - unsigned flags) +void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags) { - if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))) + if (WARN_ON_ONCE(!(req->ctx->flags & IORING_SETUP_DEFER_TASKRUN))) return; - io_req_local_work_add(req, ctx, flags); + __io_req_task_work_add(req, flags); } static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) @@ -1255,12 +1315,14 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) struct llist_node *node = llist_del_all(&ctx->work_llist); __io_fallback_tw(node, false); + node = llist_del_all(&ctx->retry_llist); + __io_fallback_tw(node, false); } static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, int min_events) { - if (llist_empty(&ctx->work_llist)) + if (!io_local_work_pending(ctx)) return false; if (events < min_events) return true; @@ -1269,8 +1331,29 @@ static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, return false; } -static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, - int min_events) +static int __io_run_local_work_loop(struct llist_node **node, + io_tw_token_t tw, + int events) +{ + int ret = 0; + + while (*node) { + struct llist_node *next = (*node)->next; + struct io_kiocb *req = container_of(*node, struct io_kiocb, + io_task_work.node); + INDIRECT_CALL_2(req->io_task_work.func, + io_poll_task_func, 
io_req_rw_complete, + (struct io_tw_req){req}, tw); + *node = next; + if (++ret >= events) + break; + } + + return ret; +} + +static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw, + int min_events, int max_events) { struct llist_node *node; unsigned int loops = 0; @@ -1281,25 +1364,24 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); again: + tw.cancel = io_should_terminate_tw(ctx); + min_events -= ret; + ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); + if (ctx->retry_llist.first) + goto retry_done; + /* * llists are in reverse order, flip it back the right way before * running the pending items. */ node = llist_reverse_order(llist_del_all(&ctx->work_llist)); - while (node) { - struct llist_node *next = node->next; - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); - INDIRECT_CALL_2(req->io_task_work.func, - io_poll_task_func, io_req_rw_complete, - req, ts); - ret++; - node = next; - } + ret += __io_run_local_work_loop(&node, tw, max_events - ret); + ctx->retry_llist.first = node; loops++; if (io_run_local_work_continue(ctx, ret, min_events)) goto again; +retry_done: io_submit_flush_completions(ctx); if (io_run_local_work_continue(ctx, ret, min_events)) goto again; @@ -1313,37 +1395,43 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, { struct io_tw_state ts = {}; - if (llist_empty(&ctx->work_llist)) + if (!io_local_work_pending(ctx)) return 0; - return __io_run_local_work(ctx, &ts, min_events); + return __io_run_local_work(ctx, ts, min_events, + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); } -static int io_run_local_work(struct io_ring_ctx *ctx, int min_events) +int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events) { struct io_tw_state ts = {}; int ret; mutex_lock(&ctx->uring_lock); - ret = __io_run_local_work(ctx, &ts, min_events); + ret = __io_run_local_work(ctx, ts, min_events, max_events); mutex_unlock(&ctx->uring_lock); return ret; } -static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts) +static void io_req_task_cancel(struct io_tw_req tw_req, io_tw_token_t tw) { - io_tw_lock(req->ctx, ts); + struct io_kiocb *req = tw_req.req; + + io_tw_lock(req->ctx, tw); io_req_defer_failed(req, req->cqe.res); } -void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) +void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw) { - io_tw_lock(req->ctx, ts); - if (unlikely(io_should_terminate_tw())) + struct io_kiocb *req = tw_req.req; + struct io_ring_ctx *ctx = req->ctx; + + io_tw_lock(ctx, tw); + if (unlikely(tw.cancel)) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) io_queue_iowq(req); else - io_queue_sqe(req); + io_queue_sqe(req, 0); } void io_req_task_queue_fail(struct io_kiocb *req, int ret) @@ -1367,6 +1455,16 @@ void io_queue_next(struct io_kiocb *req) io_req_task_queue(nxt); } +static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) +{ + if (req->file_node) { + io_put_rsrc_node(req->ctx, req->file_node); + req->file_node = NULL; + } + if (req->flags & REQ_F_BUF_NODE) + io_put_rsrc_node(req->ctx, req->buf_node); +} + static void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) @@ -1376,6 +1474,12 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, comp_list); if (unlikely(req->flags & 
IO_REQ_CLEAN_SLOW_FLAGS)) { + if (req->flags & REQ_F_REISSUE) { + node = req->comp_list.next; + req->flags &= ~REQ_F_REISSUE; + io_queue_iowq(req); + continue; + } if (req->flags & REQ_F_REFCOUNT) { node = req->comp_list.next; if (!req_ref_put_and_test(req)) @@ -1386,8 +1490,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, if (apoll->double_poll) kfree(apoll->double_poll); - if (!io_alloc_cache_put(&ctx->apoll_cache, apoll)) - kfree(apoll); + io_cache_free(&ctx->apoll_cache, apoll); req->flags &= ~REQ_F_POLLED; } if (req->flags & IO_REQ_LINK_FLAGS) @@ -1415,15 +1518,17 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - if (!(req->flags & REQ_F_CQE_SKIP) && + /* + * Requests marked with REQUEUE should not post a CQE, they + * will go through the io-wq retry machinery and post one + * later. + */ + if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && unlikely(!io_fill_cqe_req(ctx, req))) { - if (ctx->lockless_cq) { - spin_lock(&ctx->completion_lock); - io_req_cqe_overflow(req); - spin_unlock(&ctx->completion_lock); - } else { - io_req_cqe_overflow(req); - } + if (ctx->lockless_cq) + io_cqe_overflow(ctx, &req->cqe, &req->big_cqe); + else + io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe); } } __io_cq_unlock_post(ctx); @@ -1432,6 +1537,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } + + if (unlikely(ctx->drain_active)) + io_queue_deferred(ctx); + ctx->submit_state.cq_flush = false; } @@ -1446,7 +1555,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx) * We can't just wait for polled events to come to us, we have to actively * find and complete them. 
*/ -static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) +__cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_IOPOLL)) return; @@ -1468,13 +1577,18 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) } } mutex_unlock(&ctx->uring_lock); + + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + io_move_task_work_from_local(ctx); } -static int io_iopoll_check(struct io_ring_ctx *ctx, long min) +static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) { unsigned int nr_events = 0; unsigned long check_cq; + min_events = min(min_events, ctx->cq_entries); + lockdep_assert_held(&ctx->uring_lock); if (!io_allowed_run_tw(ctx)) @@ -1516,7 +1630,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) io_task_work_pending(ctx)) { u32 tail = ctx->cached_cq_tail; - (void) io_run_local_work_locked(ctx, min); + (void) io_run_local_work_locked(ctx, min_events); if (task_work_pending(current) || wq_list_empty(&ctx->iopoll_list)) { @@ -1529,7 +1643,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) wq_list_empty(&ctx->iopoll_list)) break; } - ret = io_do_iopoll(ctx, !min); + ret = io_do_iopoll(ctx, !min_events); if (unlikely(ret < 0)) return ret; @@ -1539,14 +1653,14 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) break; nr_events += ret; - } while (nr_events < min); + } while (nr_events < min_events); return 0; } -void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) +void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw) { - io_req_complete_defer(req); + io_req_complete_defer(tw_req.req); } /* @@ -1608,6 +1722,8 @@ io_req_flags_t io_file_get_flags(struct file *file) { io_req_flags_t res = 0; + BUILD_BUG_ON(REQ_F_ISREG_BIT != REQ_F_SUPPORT_NOWAIT_BIT + 1); + if (S_ISREG(file_inode(file)->i_mode)) res |= REQ_F_ISREG; if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT)) @@ -1615,69 +1731,28 @@ io_req_flags_t io_file_get_flags(struct file *file) return res; } -bool io_alloc_async_data(struct io_kiocb *req) -{ - const struct io_issue_def *def = &io_issue_defs[req->opcode]; - - WARN_ON_ONCE(!def->async_size); - req->async_data = kmalloc(def->async_size, GFP_KERNEL); - if (req->async_data) { - req->flags |= REQ_F_ASYNC_DATA; - return false; - } - return true; -} - -static u32 io_get_sequence(struct io_kiocb *req) -{ - u32 seq = req->ctx->cached_sq_head; - struct io_kiocb *cur; - - /* need original cached_sq_head, but it was increased for each req */ - io_for_each_link(cur, req) - seq--; - return seq; -} - static __cold void io_drain_req(struct io_kiocb *req) __must_hold(&ctx->uring_lock) { struct io_ring_ctx *ctx = req->ctx; + bool drain = req->flags & IOSQE_IO_DRAIN; struct io_defer_entry *de; - int ret; - u32 seq = io_get_sequence(req); - /* Still need defer if there is pending req in defer list. 
*/ - spin_lock(&ctx->completion_lock); - if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { - spin_unlock(&ctx->completion_lock); -queue: - ctx->drain_active = false; - io_req_task_queue(req); - return; - } - spin_unlock(&ctx->completion_lock); - - io_prep_async_link(req); - de = kmalloc(sizeof(*de), GFP_KERNEL); + de = kmalloc(sizeof(*de), GFP_KERNEL_ACCOUNT); if (!de) { - ret = -ENOMEM; - io_req_defer_failed(req, ret); + io_req_defer_failed(req, -ENOMEM); return; } - spin_lock(&ctx->completion_lock); - if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { - spin_unlock(&ctx->completion_lock); - kfree(de); - goto queue; - } - + io_prep_async_link(req); trace_io_uring_defer(req); de->req = req; - de->seq = seq; + + ctx->nr_drained += io_linked_nr(req); list_add_tail(&de->list, &ctx->defer_list); - spin_unlock(&ctx->completion_lock); + io_queue_deferred(ctx); + if (!drain && list_empty(&ctx->defer_list)) + ctx->drain_active = false; } static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, @@ -1694,17 +1769,22 @@ static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, return !!req->file; } -static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) +#define REQ_ISSUE_SLOW_FLAGS (REQ_F_CREDS | REQ_F_ARM_LTIMEOUT) + +static inline int __io_issue_sqe(struct io_kiocb *req, + unsigned int issue_flags, + const struct io_issue_def *def) { - const struct io_issue_def *def = &io_issue_defs[req->opcode]; const struct cred *creds = NULL; + struct io_kiocb *link = NULL; int ret; - if (unlikely(!io_assign_file(req, def, issue_flags))) - return -EBADF; - - if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) - creds = override_creds(req->creds); + if (unlikely(req->flags & REQ_ISSUE_SLOW_FLAGS)) { + if ((req->flags & REQ_F_CREDS) && req->creds != current_cred()) + creds = override_creds(req->creds); + if (req->flags & REQ_F_ARM_LTIMEOUT) + link = __io_prep_linked_timeout(req); + } if (!def->audit_skip) audit_uring_entry(req->opcode); @@ -1714,10 +1794,27 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (!def->audit_skip) audit_uring_exit(!ret, ret); - if (creds) - revert_creds(creds); + if (unlikely(creds || link)) { + if (creds) + revert_creds(creds); + if (link) + io_queue_linked_timeout(link); + } - if (ret == IOU_OK) { + return ret; +} + +static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + int ret; + + if (unlikely(!io_assign_file(req, def, issue_flags))) + return -EBADF; + + ret = __io_issue_sqe(req, issue_flags, def); + + if (ret == IOU_COMPLETE) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_defer(req); else @@ -1728,7 +1825,6 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (ret == IOU_ISSUE_SKIP_COMPLETE) { ret = 0; - io_arm_ltimeout(req); /* If the op doesn't have a file, we're not polling for it */ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) @@ -1737,11 +1833,23 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return ret; } -int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) +int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw) { - io_tw_lock(req->ctx, ts); - return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| - IO_URING_F_COMPLETE_DEFER); + const unsigned int issue_flags = IO_URING_F_NONBLOCK | + IO_URING_F_MULTISHOT | + 
IO_URING_F_COMPLETE_DEFER; + int ret; + + io_tw_lock(req->ctx, tw); + + WARN_ON_ONCE(!req->file); + if (WARN_ON_ONCE(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EFAULT; + + ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]); + + WARN_ON_ONCE(ret == IOU_ISSUE_SKIP_COMPLETE); + return ret; } struct io_wq_work *io_wq_free_work(struct io_wq_work *work) @@ -1749,7 +1857,7 @@ struct io_wq_work *io_wq_free_work(struct io_wq_work *work) struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_kiocb *nxt = NULL; - if (req_ref_put_and_test(req)) { + if (req_ref_put_and_test_atomic(req)) { if (req->flags & IO_REQ_LINK_FLAGS) nxt = io_req_find_next(req); io_free_req(req); @@ -1765,14 +1873,12 @@ void io_wq_submit_work(struct io_wq_work *work) bool needs_poll = false; int ret = 0, err = -ECANCELED; - /* one will be dropped by ->io_wq_free_work() after returning to io-wq */ + /* one will be dropped by io_wq_free_work() after returning to io-wq */ if (!(req->flags & REQ_F_REFCOUNT)) __io_req_set_refcount(req, 2); else req_ref_get(req); - io_arm_ltimeout(req); - /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) { fail: @@ -1793,7 +1899,7 @@ fail: * Don't allow any multishot execution from io-wq. It's more restrictive * than necessary and also cleaner. */ - if (req->flags & REQ_F_APOLL_MULTISHOT) { + if (req->flags & (REQ_F_MULTISHOT|REQ_F_APOLL_MULTISHOT)) { err = -EBADFD; if (!io_file_can_poll(req)) goto fail; @@ -1804,7 +1910,7 @@ fail: goto fail; return; } else { - req->flags &= ~REQ_F_APOLL_MULTISHOT; + req->flags &= ~(REQ_F_APOLL_MULTISHOT|REQ_F_MULTISHOT); } } @@ -1865,7 +1971,8 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, io_ring_submit_lock(ctx, issue_flags); node = io_rsrc_node_lookup(&ctx->file_table.data, fd); if (node) { - io_req_assign_rsrc_node(&req->file_node, node); + node->refs++; + req->file_node = node; req->flags |= io_slot_flags(node); file = io_slot_file(node); } @@ -1885,48 +1992,61 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd) return file; } -static void io_queue_async(struct io_kiocb *req, int ret) - __must_hold(&req->ctx->uring_lock) +static int io_req_sqe_copy(struct io_kiocb *req, unsigned int issue_flags) { - struct io_kiocb *linked_timeout; + const struct io_cold_def *def = &io_cold_defs[req->opcode]; + + if (req->flags & REQ_F_SQE_COPIED) + return 0; + req->flags |= REQ_F_SQE_COPIED; + if (!def->sqe_copy) + return 0; + if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_INLINE))) + return -EFAULT; + def->sqe_copy(req); + return 0; +} +static void io_queue_async(struct io_kiocb *req, unsigned int issue_flags, int ret) + __must_hold(&req->ctx->uring_lock) +{ if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { +fail: io_req_defer_failed(req, ret); return; } - linked_timeout = io_prep_linked_timeout(req); + ret = io_req_sqe_copy(req, issue_flags); + if (unlikely(ret)) + goto fail; switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: - io_kbuf_recycle(req, 0); io_req_task_queue(req); break; case IO_APOLL_ABORTED: - io_kbuf_recycle(req, 0); io_queue_iowq(req); break; case IO_APOLL_OK: break; } - - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); } -static inline void io_queue_sqe(struct io_kiocb *req) +static inline void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags) __must_hold(&req->ctx->uring_lock) { + unsigned int issue_flags = IO_URING_F_NONBLOCK | + IO_URING_F_COMPLETE_DEFER | extra_flags; int 
ret; - ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); + ret = io_issue_sqe(req, issue_flags); /* * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ if (unlikely(ret)) - io_queue_async(req, ret); + io_queue_async(req, issue_flags, ret); } static void io_queue_sqe_fallback(struct io_kiocb *req) @@ -1941,6 +2061,8 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) req->flags |= REQ_F_LINK; io_req_defer_failed(req, req->cqe.res); } else { + /* can't fail with IO_URING_F_INLINE */ + io_req_sqe_copy(req, IO_URING_F_INLINE); if (unlikely(req->ctx->drain_active)) io_drain_req(req); else @@ -1971,9 +2093,8 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, return true; } -static void io_init_req_drain(struct io_kiocb *req) +static void io_init_drain(struct io_ring_ctx *ctx) { - struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *head = ctx->submit_state.link.head; ctx->drain_active = true; @@ -1998,7 +2119,7 @@ static __cold int io_init_fail_req(struct io_kiocb *req, int err) } static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe *sqe) + const struct io_uring_sqe *sqe, unsigned int *left) __must_hold(&ctx->uring_lock) { const struct io_issue_def *def; @@ -2006,7 +2127,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, int personality; u8 opcode; - /* req is partially pre-initialised, see io_preinit_req() */ + req->ctx = ctx; req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ sqe_flags = READ_ONCE(sqe->flags); @@ -2015,12 +2136,33 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->file = NULL; req->tctx = current->io_uring; req->cancel_seq_set = false; + req->async_data = NULL; if (unlikely(opcode >= IORING_OP_LAST)) { req->opcode = 0; return io_init_fail_req(req, -EINVAL); } + opcode = array_index_nospec(opcode, IORING_OP_LAST); + def = &io_issue_defs[opcode]; + if (def->is_128 && !(ctx->flags & IORING_SETUP_SQE128)) { + /* + * A 128b op on a non-128b SQ requires mixed SQE support as + * well as 2 contiguous entries. + */ + if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 || + !(ctx->cached_sq_head & (ctx->sq_entries - 1))) + return io_init_fail_req(req, -EINVAL); + /* + * A 128b operation on a mixed SQ uses two entries, so we have + * to increment the head and cached refs, and decrement what's + * left. 
+ */ + current->io_uring->cached_refs++; + ctx->cached_sq_head++; + (*left)--; + } + if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) @@ -2035,7 +2177,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (sqe_flags & IOSQE_IO_DRAIN) { if (ctx->drain_disabled) return io_init_fail_req(req, -EOPNOTSUPP); - io_init_req_drain(req); + io_init_drain(ctx); } } if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { @@ -2130,13 +2272,13 @@ static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, } static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe *sqe) + const struct io_uring_sqe *sqe, unsigned int *left) __must_hold(&ctx->uring_lock) { struct io_submit_link *link = &ctx->submit_state.link; int ret; - ret = io_init_req(ctx, req, sqe); + ret = io_init_req(ctx, req, sqe, left); if (unlikely(ret)) return io_submit_fail_init(sqe, req, ret); @@ -2151,6 +2293,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, */ if (unlikely(link->head)) { trace_io_uring_link(req, link->last); + io_req_sqe_copy(req, IO_URING_F_INLINE); link->last->link = req; link->last = req; @@ -2174,7 +2317,7 @@ fallback: return 0; } - io_queue_sqe(req); + io_queue_sqe(req, IO_URING_F_INLINE); return 0; } @@ -2235,10 +2378,6 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) { head = READ_ONCE(ctx->sq_array[head]); if (unlikely(head >= ctx->sq_entries)) { - /* drop invalid entries */ - spin_lock(&ctx->completion_lock); - ctx->cq_extra--; - spin_unlock(&ctx->completion_lock); WRITE_ONCE(ctx->rings->sq_dropped, READ_ONCE(ctx->rings->sq_dropped) + 1); return false; @@ -2269,10 +2408,11 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) unsigned int left; int ret; + entries = min(nr, entries); if (unlikely(!entries)) return 0; - /* make sure SQ entry isn't read before tail */ - ret = left = min(nr, entries); + + ret = left = entries; io_get_task_refs(left); io_submit_state_start(&ctx->submit_state, left); @@ -2291,7 +2431,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) * Continue submitting even for sqe failure if the * ring was setup with IORING_SETUP_SUBMIT_ALL */ - if (unlikely(io_submit_sqe(ctx, req, sqe)) && + if (unlikely(io_submit_sqe(ctx, req, sqe, &left)) && !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) { left--; break; @@ -2328,9 +2468,9 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, int io_run_task_work_sig(struct io_ring_ctx *ctx) { - if (!llist_empty(&ctx->work_llist)) { + if (io_local_work_pending(ctx)) { __set_current_state(TASK_RUNNING); - if (io_run_local_work(ctx, INT_MAX) > 0) + if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0) return 0; } if (io_run_task_work() > 0) @@ -2396,7 +2536,7 @@ static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) goto out_wake; } - iowq->t.function = io_cqring_timer_wakeup; + hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup); hrtimer_set_expires(timer, iowq->timeout); return HRTIMER_RESTART; out_wake: @@ -2431,8 +2571,18 @@ static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, return READ_ONCE(iowq->hit_timeout) ? 
-ETIME : 0; } +struct ext_arg { + size_t argsz; + struct timespec64 ts; + const sigset_t __user *sig; + ktime_t min_time; + bool ts_set; + bool iowait; +}; + static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, + struct ext_arg *ext_arg, ktime_t start_time) { int ret = 0; @@ -2442,7 +2592,7 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, * can take into account that the task is waiting for IO - turns out * to be important for low QD IO. */ - if (current_pending_io()) + if (ext_arg->iowait && current_pending_io()) current->in_iowait = 1; if (iowq->timeout != KTIME_MAX || iowq->min_timeout) ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); @@ -2455,11 +2605,12 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, /* If this returns > 0, the caller should retry */ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, + struct ext_arg *ext_arg, ktime_t start_time) { if (unlikely(READ_ONCE(ctx->check_cq))) return 1; - if (unlikely(!llist_empty(&ctx->work_llist))) + if (unlikely(io_local_work_pending(ctx))) return 1; if (unlikely(task_work_pending(current))) return 1; @@ -2468,17 +2619,9 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, if (unlikely(io_should_wake(iowq))) return 0; - return __io_cqring_wait_schedule(ctx, iowq, start_time); + return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time); } -struct ext_arg { - size_t argsz; - struct timespec64 ts; - const sigset_t __user *sig; - ktime_t min_time; - bool ts_set; -}; - /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. @@ -2491,10 +2634,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, ktime_t start_time; int ret; + min_events = min_t(int, min_events, ctx->cq_entries); + if (!io_allowed_run_tw(ctx)) return -EEXIST; - if (!llist_empty(&ctx->work_llist)) - io_run_local_work(ctx, min_events); + if (io_local_work_pending(ctx)) + io_run_local_work(ctx, min_events, + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); io_run_task_work(); if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) @@ -2555,7 +2701,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, TASK_INTERRUPTIBLE); } - ret = io_cqring_wait_schedule(ctx, &iowq, start_time); + ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time); __set_current_state(TASK_RUNNING); atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); @@ -2564,8 +2710,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, * If we got woken because of task_work being processed, run it * now rather than let the caller do another wait loop. */ - if (!llist_empty(&ctx->work_llist)) - io_run_local_work(ctx, nr_wait); + if (io_local_work_pending(ctx)) + io_run_local_work(ctx, nr_wait, nr_wait); io_run_task_work(); /* @@ -2605,92 +2751,88 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; } -static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, - size_t size) -{ - return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr, - size); -} - -static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, - size_t size) -{ - return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr, - size); -} - static void io_rings_free(struct io_ring_ctx *ctx) { - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { - io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages, - true); - io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages, - true); - } else { - io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); - ctx->n_ring_pages = 0; - io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); - ctx->n_sqe_pages = 0; - vunmap(ctx->rings); - vunmap(ctx->sq_sqes); - } - + io_free_region(ctx->user, &ctx->sq_region); + io_free_region(ctx->user, &ctx->ring_region); ctx->rings = NULL; ctx->sq_sqes = NULL; } -unsigned long rings_size(unsigned int flags, unsigned int sq_entries, - unsigned int cq_entries, size_t *sq_offset) +static int rings_size(unsigned int flags, unsigned int sq_entries, + unsigned int cq_entries, struct io_rings_layout *rl) { struct io_rings *rings; - size_t off, sq_array_size; + size_t sqe_size; + size_t off; + + if (flags & IORING_SETUP_CQE_MIXED) { + if (cq_entries < 2) + return -EOVERFLOW; + } + if (flags & IORING_SETUP_SQE_MIXED) { + if (sq_entries < 2) + return -EOVERFLOW; + } + + rl->sq_array_offset = SIZE_MAX; + + sqe_size = sizeof(struct io_uring_sqe); + if (flags & IORING_SETUP_SQE128) + sqe_size *= 2; + + rl->sq_size = array_size(sqe_size, sq_entries); + if (rl->sq_size == SIZE_MAX) + return -EOVERFLOW; off = struct_size(rings, cqes, cq_entries); + if (flags & IORING_SETUP_CQE32) + off = size_mul(off, 2); if (off == SIZE_MAX) - return SIZE_MAX; - if (flags & IORING_SETUP_CQE32) { - if (check_shl_overflow(off, 1, &off)) - return SIZE_MAX; - } + return -EOVERFLOW; #ifdef CONFIG_SMP off = ALIGN(off, SMP_CACHE_BYTES); if (off == 0) - return SIZE_MAX; + return -EOVERFLOW; #endif - if (flags & IORING_SETUP_NO_SQARRAY) { - *sq_offset = SIZE_MAX; - return off; - } - - *sq_offset = off; + if (!(flags & IORING_SETUP_NO_SQARRAY)) { + size_t sq_array_size; - sq_array_size = array_size(sizeof(u32), sq_entries); - if (sq_array_size == SIZE_MAX) - return SIZE_MAX; + rl->sq_array_offset = off; - if (check_add_overflow(off, sq_array_size, &off)) - return SIZE_MAX; + sq_array_size = array_size(sizeof(u32), sq_entries); + off = size_add(off, sq_array_size); + if (off == SIZE_MAX) + return -EOVERFLOW; + } - return off; + rl->rings_size = off; + return 0; } -static void io_req_caches_free(struct io_ring_ctx *ctx) +static __cold void __io_req_caches_free(struct io_ring_ctx *ctx) { struct io_kiocb *req; int nr = 0; - mutex_lock(&ctx->uring_lock); - while (!io_req_cache_empty(ctx)) { req = io_extract_req(ctx); + io_poison_req(req); kmem_cache_free(req_cachep, req); nr++; } - if (nr) + if (nr) { + ctx->nr_req_allocated -= nr; percpu_ref_put_many(&ctx->refs, nr); - mutex_unlock(&ctx->uring_lock); + } +} + +static __cold void io_req_caches_free(struct io_ring_ctx *ctx) +{ + guard(mutex)(&ctx->uring_lock); + __io_req_caches_free(ctx); } static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) @@ -2700,16 +2842,12 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) mutex_lock(&ctx->uring_lock); io_sqe_buffers_unregister(ctx); io_sqe_files_unregister(ctx); + io_unregister_zcrx_ifqs(ctx); io_cqring_overflow_kill(ctx); 
io_eventfd_unregister(ctx); - io_alloc_cache_free(&ctx->apoll_cache, kfree); - io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); - io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); - io_alloc_cache_free(&ctx->uring_cache, kfree); - io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); - io_futex_cache_free(ctx); + io_free_alloc_caches(ctx); io_destroy_buffers(ctx); - io_free_region(ctx, &ctx->param_region); + io_free_region(ctx->user, &ctx->param_region); mutex_unlock(&ctx->uring_lock); if (ctx->sq_creds) put_cred(ctx->sq_creds); @@ -2730,6 +2868,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) percpu_ref_exit(&ctx->refs); free_uid(ctx->user); io_req_caches_free(ctx); + + WARN_ON_ONCE(ctx->nr_req_allocated); + if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); io_napi_free(ctx); @@ -2784,13 +2925,12 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) if (unlikely(!ctx->poll_activated)) io_activate_pollwq(ctx); - - poll_wait(file, &ctx->poll_wq, wait); /* - * synchronizes with barrier from wq_has_sleeper call in - * io_commit_cqring + * provides mb() which pairs with barrier from wq_has_sleeper + * call in io_commit_cqring */ - smp_rmb(); + poll_wait(file, &ctx->poll_wq, wait); + if (!io_sqring_full(ctx)) mask |= EPOLLOUT | EPOLLWRNORM; @@ -2837,13 +2977,6 @@ static __cold void io_tctx_exit_cb(struct callback_head *cb) complete(&work->completion); } -static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - - return req->ctx == data; -} - static __cold void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); @@ -2869,7 +3002,8 @@ static __cold void io_ring_exit_work(struct work_struct *work) if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) io_move_task_work_from_local(ctx); - while (io_uring_try_cancel_requests(ctx, NULL, true)) + /* The SQPOLL thread never reaches this path */ + while (io_uring_try_cancel_requests(ctx, NULL, true, false)) cond_resched(); if (ctx->sq_data) { @@ -2877,7 +3011,7 @@ static __cold void io_ring_exit_work(struct work_struct *work) struct task_struct *tsk; io_sq_thread_park(sqd); - tsk = sqd->thread; + tsk = sqpoll_task_locked(sqd); if (tsk && tsk->io_uring && tsk->io_uring->io_wq) io_wq_cancel_cb(tsk->io_uring->io_wq, io_cancel_ctx_cb, ctx, true); @@ -2954,10 +3088,10 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) INIT_WORK(&ctx->exit_work, io_ring_exit_work); /* - * Use system_unbound_wq to avoid spawning tons of event kworkers + * Use system_dfl_wq to avoid spawning tons of event kworkers * if we're exiting a ton of rings at the same time. It just adds * noise and overhead, there's no discernable change in runtime - * over using system_wq. + * over using system_percpu_wq. 
	 */
	queue_work(iou_wq, &ctx->exit_work);
}
@@ -2971,227 +3105,6 @@ static int io_uring_release(struct inode *inode, struct file *file)
	return 0;
}
-struct io_task_cancel {
-	struct io_uring_task *tctx;
-	bool all;
-};
-
-static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
-{
-	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-	struct io_task_cancel *cancel = data;
-
-	return io_match_task_safe(req, cancel->tctx, cancel->all);
-}
-
-static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
-					 struct io_uring_task *tctx,
-					 bool cancel_all)
-{
-	struct io_defer_entry *de;
-	LIST_HEAD(list);
-
-	spin_lock(&ctx->completion_lock);
-	list_for_each_entry_reverse(de, &ctx->defer_list, list) {
-		if (io_match_task_safe(de->req, tctx, cancel_all)) {
-			list_cut_position(&list, &ctx->defer_list, &de->list);
-			break;
-		}
-	}
-	spin_unlock(&ctx->completion_lock);
-	if (list_empty(&list))
-		return false;
-
-	while (!list_empty(&list)) {
-		de = list_first_entry(&list, struct io_defer_entry, list);
-		list_del_init(&de->list);
-		io_req_task_queue_fail(de->req, -ECANCELED);
-		kfree(de);
-	}
-	return true;
-}
-
-static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
-{
-	struct io_tctx_node *node;
-	enum io_wq_cancel cret;
-	bool ret = false;
-
-	mutex_lock(&ctx->uring_lock);
-	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
-		struct io_uring_task *tctx = node->task->io_uring;
-
-		/*
-		 * io_wq will stay alive while we hold uring_lock, because it's
-		 * killed after ctx nodes, which requires to take the lock.
-		 */
-		if (!tctx || !tctx->io_wq)
-			continue;
-		cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
-		ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
-	}
-	mutex_unlock(&ctx->uring_lock);
-
-	return ret;
-}
-
-static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
-						struct io_uring_task *tctx,
-						bool cancel_all)
-{
-	struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, };
-	enum io_wq_cancel cret;
-	bool ret = false;
-
-	/* set it so io_req_local_work_add() would wake us up */
-	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
-		atomic_set(&ctx->cq_wait_nr, 1);
-		smp_mb();
-	}
-
-	/* failed during ring init, it couldn't have issued any requests */
-	if (!ctx->rings)
-		return false;
-
-	if (!tctx) {
-		ret |= io_uring_try_cancel_iowq(ctx);
-	} else if (tctx->io_wq) {
-		/*
-		 * Cancels requests of all rings, not only @ctx, but
-		 * it's fine as the task is in exit/exec.
-		 */
-		cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
-				       &cancel, true);
-		ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
-	}
-
-	/* SQPOLL thread does its own polling */
-	if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
-	    (ctx->sq_data && ctx->sq_data->thread == current)) {
-		while (!wq_list_empty(&ctx->iopoll_list)) {
-			io_iopoll_try_reap_events(ctx);
-			ret = true;
-			cond_resched();
-		}
-	}
-
-	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
-	    io_allowed_defer_tw_run(ctx))
-		ret |= io_run_local_work(ctx, INT_MAX) > 0;
-	ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
-	mutex_lock(&ctx->uring_lock);
-	ret |= io_poll_remove_all(ctx, tctx, cancel_all);
-	ret |= io_waitid_remove_all(ctx, tctx, cancel_all);
-	ret |= io_futex_remove_all(ctx, tctx, cancel_all);
-	ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all);
-	mutex_unlock(&ctx->uring_lock);
-	ret |= io_kill_timeouts(ctx, tctx, cancel_all);
-	if (tctx)
-		ret |= io_run_task_work() > 0;
-	else
-		ret |= flush_delayed_work(&ctx->fallback_work);
-	return ret;
-}
-
-static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
-{
-	if (tracked)
-		return atomic_read(&tctx->inflight_tracked);
-	return percpu_counter_sum(&tctx->inflight);
-}
-
-/*
- * Find any io_uring ctx that this task has registered or done IO on, and cancel
- * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
- */
-__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
-{
-	struct io_uring_task *tctx = current->io_uring;
-	struct io_ring_ctx *ctx;
-	struct io_tctx_node *node;
-	unsigned long index;
-	s64 inflight;
-	DEFINE_WAIT(wait);
-
-	WARN_ON_ONCE(sqd && sqd->thread != current);
-
-	if (!current->io_uring)
-		return;
-	if (tctx->io_wq)
-		io_wq_exit_start(tctx->io_wq);
-
-	atomic_inc(&tctx->in_cancel);
-	do {
-		bool loop = false;
-
-		io_uring_drop_tctx_refs(current);
-		if (!tctx_inflight(tctx, !cancel_all))
-			break;
-
-		/* read completions before cancelations */
-		inflight = tctx_inflight(tctx, false);
-		if (!inflight)
-			break;
-
-		if (!sqd) {
-			xa_for_each(&tctx->xa, index, node) {
-				/* sqpoll task will cancel all its requests */
-				if (node->ctx->sq_data)
-					continue;
-				loop |= io_uring_try_cancel_requests(node->ctx,
-							current->io_uring,
-							cancel_all);
-			}
-		} else {
-			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
-				loop |= io_uring_try_cancel_requests(ctx,
-							current->io_uring,
-							cancel_all);
-		}
-
-		if (loop) {
-			cond_resched();
-			continue;
-		}
-
-		prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
-		io_run_task_work();
-		io_uring_drop_tctx_refs(current);
-		xa_for_each(&tctx->xa, index, node) {
-			if (!llist_empty(&node->ctx->work_llist)) {
-				WARN_ON_ONCE(node->ctx->submitter_task &&
-					     node->ctx->submitter_task != current);
-				goto end_wait;
-			}
-		}
-		/*
-		 * If we've seen completions, retry without waiting. This
-		 * avoids a race where a completion comes in before we did
-		 * prepare_to_wait().
-		 */
-		if (inflight == tctx_inflight(tctx, !cancel_all))
-			schedule();
-end_wait:
-		finish_wait(&tctx->wait, &wait);
-	} while (1);
-
-	io_uring_clean_tctx(tctx);
-	if (cancel_all) {
-		/*
-		 * We shouldn't run task_works after cancel, so just leave
-		 * ->in_cancel set for normal exit.
- */ - atomic_dec(&tctx->in_cancel); - /* for exec all current's requests should be gone, kill tctx */ - __io_uring_free(current); - } -} - -void __io_uring_cancel(bool cancel_all) -{ - io_uring_cancel_generic(cancel_all, NULL); -} - static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx, const struct io_uring_getevents_arg __user *uarg) { @@ -3207,6 +3120,7 @@ static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx, end > ctx->cq_wait_size)) return ERR_PTR(-EFAULT); + offset = array_index_nospec(offset, ctx->cq_wait_size - size); return ctx->cq_wait_arg + offset; } @@ -3232,6 +3146,8 @@ static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags, const struct io_uring_getevents_arg __user *uarg = argp; struct io_uring_getevents_arg arg; + ext_arg->iowait = !(flags & IORING_ENTER_NO_IOWAIT); + /* * If EXT_ARG isn't set, then we have no timespec and the argp pointer * is just a pointer to the sigset_t. @@ -3305,11 +3221,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, struct file *file; long ret; - if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | - IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | - IORING_ENTER_REGISTERED_RING | - IORING_ENTER_ABS_TIMER | - IORING_ENTER_EXT_ARG_REG))) + if (unlikely(flags & ~IORING_ENTER_FLAGS)) return -EINVAL; /* @@ -3393,22 +3305,16 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, mutex_lock(&ctx->uring_lock); iopoll_locked: ret2 = io_validate_ext_arg(ctx, flags, argp, argsz); - if (likely(!ret2)) { - min_complete = min(min_complete, - ctx->cq_entries); + if (likely(!ret2)) ret2 = io_iopoll_check(ctx, min_complete); - } mutex_unlock(&ctx->uring_lock); } else { struct ext_arg ext_arg = { .argsz = argsz }; ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg); - if (likely(!ret2)) { - min_complete = min(min_complete, - ctx->cq_entries); + if (likely(!ret2)) ret2 = io_cqring_wait(ctx, min_complete, flags, &ext_arg); - } } if (!ret) { @@ -3449,57 +3355,49 @@ bool io_is_uring_fops(struct file *file) } static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, - struct io_uring_params *p) + struct io_ctx_config *config) { + struct io_uring_params *p = &config->p; + struct io_rings_layout *rl = &config->layout; + struct io_uring_region_desc rd; struct io_rings *rings; - size_t size, sq_array_offset; - void *ptr; + int ret; /* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; - size = rings_size(ctx->flags, p->sq_entries, p->cq_entries, - &sq_array_offset); - if (size == SIZE_MAX) - return -EOVERFLOW; - - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size); - else - rings = io_rings_map(ctx, p->cq_off.user_addr, size); - - if (IS_ERR(rings)) - return PTR_ERR(rings); - - ctx->rings = rings; - if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) - ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); - rings->sq_ring_mask = p->sq_entries - 1; - rings->cq_ring_mask = p->cq_entries - 1; - rings->sq_ring_entries = p->sq_entries; - rings->cq_ring_entries = p->cq_entries; - - if (p->flags & IORING_SETUP_SQE128) - size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries); - else - size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); - if (size == SIZE_MAX) { - io_rings_free(ctx); - return -EOVERFLOW; + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(rl->rings_size); + if (ctx->flags & IORING_SETUP_NO_MMAP) { + 
rd.user_addr = p->cq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; } + ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING); + if (ret) + return ret; + ctx->rings = rings = io_region_get_ptr(&ctx->ring_region); + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) + ctx->sq_array = (u32 *)((char *)rings + rl->sq_array_offset); - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size); - else - ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); - - if (IS_ERR(ptr)) { + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(rl->sq_size); + if (ctx->flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p->sq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region(ctx, &ctx->sq_region, &rd, IORING_OFF_SQES); + if (ret) { io_rings_free(ctx); - return PTR_ERR(ptr); + return ret; } + ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region); - ctx->sq_sqes = ptr; + memset(rings, 0, sizeof(*rings)); + WRITE_ONCE(rings->sq_ring_mask, ctx->sq_entries - 1); + WRITE_ONCE(rings->cq_ring_mask, ctx->cq_entries - 1); + WRITE_ONCE(rings->sq_ring_entries, ctx->sq_entries); + WRITE_ONCE(rings->cq_ring_entries, ctx->cq_entries); return 0; } @@ -3526,8 +3424,66 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) O_RDWR | O_CLOEXEC, NULL); } -int io_uring_fill_params(unsigned entries, struct io_uring_params *p) +static int io_uring_sanitise_params(struct io_uring_params *p) { + unsigned flags = p->flags; + + if (flags & ~IORING_SETUP_FLAGS) + return -EINVAL; + + /* There is no way to mmap rings without a real fd */ + if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) && + !(flags & IORING_SETUP_NO_MMAP)) + return -EINVAL; + + if (flags & IORING_SETUP_SQPOLL) { + /* IPI related flags don't make sense with SQPOLL */ + if (flags & (IORING_SETUP_COOP_TASKRUN | + IORING_SETUP_TASKRUN_FLAG | + IORING_SETUP_DEFER_TASKRUN)) + return -EINVAL; + } + + if (flags & IORING_SETUP_TASKRUN_FLAG) { + if (!(flags & (IORING_SETUP_COOP_TASKRUN | + IORING_SETUP_DEFER_TASKRUN))) + return -EINVAL; + } + + /* HYBRID_IOPOLL only valid with IOPOLL */ + if ((flags & IORING_SETUP_HYBRID_IOPOLL) && !(flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + /* + * For DEFER_TASKRUN we require the completion task to be the same as + * the submission task. This implies that there is only one submitter. + */ + if ((flags & IORING_SETUP_DEFER_TASKRUN) && + !(flags & IORING_SETUP_SINGLE_ISSUER)) + return -EINVAL; + + /* + * Nonsensical to ask for CQE32 and mixed CQE support, it's not + * supported to post 16b CQEs on a ring setup with CQE32. + */ + if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) == + (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) + return -EINVAL; + /* + * Nonsensical to ask for SQE128 and mixed SQE support, it's not + * supported to post 64b SQEs on a ring setup with SQE128. + */ + if ((flags & (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) == + (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) + return -EINVAL; + + return 0; +} + +static int io_uring_fill_params(struct io_uring_params *p) +{ + unsigned entries = p->sq_entries; + if (!entries) return -EINVAL; if (entries > IORING_MAX_ENTRIES) { @@ -3536,10 +3492,6 @@ int io_uring_fill_params(unsigned entries, struct io_uring_params *p) entries = IORING_MAX_ENTRIES; } - if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY) - && !(p->flags & IORING_SETUP_NO_MMAP)) - return -EINVAL; - /* * Use twice as many entries for the CQ ring. 
It's possible for the * application to drive a higher depth than the size of the SQ ring, @@ -3569,6 +3521,27 @@ int io_uring_fill_params(unsigned entries, struct io_uring_params *p) p->cq_entries = 2 * p->sq_entries; } + return 0; +} + +int io_prepare_config(struct io_ctx_config *config) +{ + struct io_uring_params *p = &config->p; + int ret; + + ret = io_uring_sanitise_params(p); + if (ret) + return ret; + + ret = io_uring_fill_params(p); + if (ret) + return ret; + + ret = rings_size(p->flags, p->sq_entries, p->cq_entries, + &config->layout); + if (ret) + return ret; + p->sq_off.head = offsetof(struct io_rings, sq.head); p->sq_off.tail = offsetof(struct io_rings, sq.tail); p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); @@ -3589,20 +3562,22 @@ int io_uring_fill_params(unsigned entries, struct io_uring_params *p) p->cq_off.resv1 = 0; if (!(p->flags & IORING_SETUP_NO_MMAP)) p->cq_off.user_addr = 0; + if (!(p->flags & IORING_SETUP_NO_SQARRAY)) + p->sq_off.array = config->layout.sq_array_offset; return 0; } -static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, - struct io_uring_params __user *params) +static __cold int io_uring_create(struct io_ctx_config *config) { + struct io_uring_params *p = &config->p; struct io_ring_ctx *ctx; struct io_uring_task *tctx; struct file *file; int ret; - ret = io_uring_fill_params(entries, p); - if (unlikely(ret)) + ret = io_prepare_config(config); + if (ret) return ret; ctx = io_ring_ctx_alloc(p); @@ -3648,37 +3623,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if * COOP_TASKRUN is set, then IPIs are never needed by the app. */ - ret = -EINVAL; - if (ctx->flags & IORING_SETUP_SQPOLL) { - /* IPI related flags don't make sense with SQPOLL */ - if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | - IORING_SETUP_TASKRUN_FLAG | - IORING_SETUP_DEFER_TASKRUN)) - goto err; + if (ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_COOP_TASKRUN)) ctx->notify_method = TWA_SIGNAL_NO_IPI; - } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { - ctx->notify_method = TWA_SIGNAL_NO_IPI; - } else { - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG && - !(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) - goto err; + else ctx->notify_method = TWA_SIGNAL; - } - - /* HYBRID_IOPOLL only valid with IOPOLL */ - if ((ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_HYBRID_IOPOLL)) == - IORING_SETUP_HYBRID_IOPOLL) - goto err; - - /* - * For DEFER_TASKRUN we require the completion task to be the same as the - * submission task. This implies that there is only one submitter, so enforce - * that. - */ - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN && - !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) { - goto err; - } /* * This is just grabbed for accounting purposes. 
When a process exits, @@ -3689,34 +3637,29 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, mmgrab(current->mm); ctx->mm_account = current->mm; - ret = io_allocate_scq_urings(ctx, p); + ret = io_allocate_scq_urings(ctx, config); if (ret) goto err; - if (!(p->flags & IORING_SETUP_NO_SQARRAY)) - p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; - ret = io_sq_offload_create(ctx, p); if (ret) goto err; - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | - IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | - IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | - IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | - IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | - IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | - IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT; + p->features = IORING_FEAT_FLAGS; - if (copy_to_user(params, p, sizeof(*p))) { + if (copy_to_user(config->uptr, p, sizeof(*p))) { ret = -EFAULT; goto err; } if (ctx->flags & IORING_SETUP_SINGLE_ISSUER - && !(ctx->flags & IORING_SETUP_R_DISABLED)) - WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); + && !(ctx->flags & IORING_SETUP_R_DISABLED)) { + /* + * Unlike io_register_enable_rings(), don't need WRITE_ONCE() + * since ctx isn't yet accessible from other tasks + */ + ctx->submitter_task = get_task_struct(current); + } file = io_uring_get_file(ctx); if (IS_ERR(file)) { @@ -3757,53 +3700,51 @@ err_fput: */ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) { - struct io_uring_params p; - int i; + struct io_ctx_config config; + + memset(&config, 0, sizeof(config)); - if (copy_from_user(&p, params, sizeof(p))) + if (copy_from_user(&config.p, params, sizeof(config.p))) return -EFAULT; - for (i = 0; i < ARRAY_SIZE(p.resv); i++) { - if (p.resv[i]) - return -EINVAL; - } - if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | - IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | - IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | - IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | - IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | - IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | - IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | - IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | - IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL)) + if (!mem_is_zero(&config.p.resv, sizeof(config.p.resv))) return -EINVAL; - return io_uring_create(entries, &p, params); + config.p.sq_entries = entries; + config.uptr = params; + return io_uring_create(&config); } -static inline bool io_uring_allowed(void) +static inline int io_uring_allowed(void) { int disabled = READ_ONCE(sysctl_io_uring_disabled); kgid_t io_uring_group; if (disabled == 2) - return false; + return -EPERM; if (disabled == 0 || capable(CAP_SYS_ADMIN)) - return true; + goto allowed_lsm; io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group); if (!gid_valid(io_uring_group)) - return false; + return -EPERM; + + if (!in_group_p(io_uring_group)) + return -EPERM; - return in_group_p(io_uring_group); +allowed_lsm: + return security_uring_allowed(); } SYSCALL_DEFINE2(io_uring_setup, u32, entries, struct io_uring_params __user *, params) { - if (!io_uring_allowed()) - return -EPERM; + int ret; + + ret = io_uring_allowed(); + if (ret) + return ret; return io_uring_setup(entries, params); } @@ -3865,9 +3806,13 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); BUILD_BUG_SQE_ELEM(44, 
__u32, file_index);
	BUILD_BUG_SQE_ELEM(44, __u16, addr_len);
+	BUILD_BUG_SQE_ELEM(44, __u8, write_stream);
+	BUILD_BUG_SQE_ELEM(45, __u8, __pad4[0]);
	BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
	BUILD_BUG_SQE_ELEM(48, __u64, addr3);
	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
+	BUILD_BUG_SQE_ELEM(48, __u64, attr_ptr);
+	BUILD_BUG_SQE_ELEM(56, __u64, attr_type_mask);
	BUILD_BUG_SQE_ELEM(56, __u64, __pad2);
	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
@@ -3894,6 +3839,9 @@ static int __init io_uring_init(void)
	io_uring_optable_init();
+	/* imu->dir is u8 */
+	BUILD_BUG_ON((IO_IMU_DEST | IO_IMU_SOURCE) > U8_MAX);
+
	/*
	 * Allow user copy in the per-command field, which starts after the
	 * file in io_kiocb and until the opcode field. The openat2 handling
@@ -3904,10 +3852,9 @@ static int __init io_uring_init(void)
	req_cachep = kmem_cache_create("io_kiocb", sizeof(struct io_kiocb), &kmem_args,
				SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
-	io_buf_cachep = KMEM_CACHE(io_buffer,
-			  SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
	iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64);
+	BUG_ON(!iou_wq);
#ifdef CONFIG_SYSCTL
	register_sysctl_init("kernel", kernel_io_uring_disabled_table);
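
Two illustrative userspace sketches follow; neither is part of the patch above. The first relates to the getevents path: the diff threads an iowait opt-out through ext_arg (ext_arg->iowait = !(flags & IORING_ENTER_NO_IOWAIT)), letting a waiter ask not to be accounted as iowait. A minimal raw-syscall probe, assuming a uapi header new enough to define IORING_ENTER_NO_IOWAIT:

/*
 * Hedged sketch, not from the patch: probe IORING_ENTER_NO_IOWAIT with raw
 * syscalls. Error handling is intentionally minimal.
 */
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

static int io_uring_setup_raw(unsigned entries, struct io_uring_params *p)
{
	return (int)syscall(__NR_io_uring_setup, entries, p);
}

static int io_uring_enter_raw(int fd, unsigned to_submit, unsigned min_complete,
			      unsigned flags)
{
	return (int)syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
			    flags, NULL, 0);
}

int main(void)
{
	struct io_uring_params p;
	int fd, ret;

	memset(&p, 0, sizeof(p));
	fd = io_uring_setup_raw(8, &p);
	if (fd < 0) {
		perror("io_uring_setup");
		return 1;
	}

#ifdef IORING_ENTER_NO_IOWAIT
	/* wait for nothing, but ask the kernel not to account iowait */
	ret = io_uring_enter_raw(fd, 0, 0,
				 IORING_ENTER_GETEVENTS | IORING_ENTER_NO_IOWAIT);
#else
	ret = io_uring_enter_raw(fd, 0, 0, IORING_ENTER_GETEVENTS);
#endif
	printf("io_uring_enter: %d\n", ret);
	close(fd);
	return 0;
}

On kernels that do not know the flag, the enter call fails with EINVAL (the diff rejects any bits outside IORING_ENTER_FLAGS), so the same probe doubles as a feature check.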

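The second sketch relates to the new rings_size() above, which folds the SQ/CQ layout math into one helper with explicit overflow checks and records the SQ-array offset in a layout struct. Below is a hedged userspace rendering of the same layout arithmetic; the header/CQE/SQE byte sizes are illustrative assumptions, not the real uapi structures, and the kernel's overflow-checked helpers are replaced by plain size_t math.

/*
 * Hedged sketch, not kernel code: mirror the ring layout arithmetic of
 * rings_size() with assumed structure sizes.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define CACHE_LINE	64	/* stand-in for SMP_CACHE_BYTES */
#define RING_HDR	64	/* assumed size of the shared ring header */

struct rings_layout {
	size_t rings_size;	/* header + CQE array (+ SQ index array) */
	size_t sq_size;		/* SQE array */
	size_t sq_array_offset;	/* SIZE_MAX for NO_SQARRAY-style setups */
};

static void compute_layout(unsigned int sq_entries, unsigned int cq_entries,
			   int cqe32, int sqe128, int no_sqarray,
			   struct rings_layout *rl)
{
	size_t cqe_size = cqe32 ? 32 : 16;	/* assumed CQE sizes */
	size_t sqe_size = sqe128 ? 128 : 64;	/* assumed SQE sizes */
	size_t off = RING_HDR + cqe_size * (size_t)cq_entries;

	rl->sq_size = sqe_size * (size_t)sq_entries;
	rl->sq_array_offset = SIZE_MAX;

	/* the SQ index array, if present, starts cache-line aligned */
	off = (off + CACHE_LINE - 1) & ~((size_t)CACHE_LINE - 1);
	if (!no_sqarray) {
		rl->sq_array_offset = off;
		off += sizeof(uint32_t) * (size_t)sq_entries;
	}
	rl->rings_size = off;
}

int main(void)
{
	struct rings_layout rl;

	compute_layout(128, 256, 0, 0, 0, &rl);
	printf("rings=%zu bytes, sqes=%zu bytes, sq_array at %zu\n",
	       rl.rings_size, rl.sq_size, rl.sq_array_offset);
	return 0;
}

The kernel version additionally returns -EOVERFLOW whenever any of these products or sums would wrap; the sketch omits that checking for brevity.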