diff options
Diffstat (limited to 'io_uring/io_uring.c')
| -rw-r--r-- | io_uring/io_uring.c | 4030 |
1 files changed, 1806 insertions, 2224 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index db623b3185c8..5d130c578435 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -29,7 +29,7 @@ * * Also see the examples in the liburing library: * - * git://git.kernel.dk/liburing + * git://git.kernel.org/pub/scm/linux/kernel/git/axboe/liburing.git * * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens * from data shared between the kernel and application. This is done both @@ -51,7 +51,6 @@ #include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/percpu.h> @@ -59,19 +58,19 @@ #include <linux/bvec.h> #include <linux/net.h> #include <net/sock.h> -#include <net/af_unix.h> -#include <net/scm.h> #include <linux/anon_inodes.h> #include <linux/sched/mm.h> #include <linux/uaccess.h> #include <linux/nospec.h> -#include <linux/highmem.h> #include <linux/fsnotify.h> #include <linux/fadvise.h> #include <linux/task_work.h> #include <linux/io_uring.h> +#include <linux/io_uring/cmd.h> #include <linux/audit.h> #include <linux/security.h> +#include <linux/jump_label.h> +#include <asm/shmparam.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -80,10 +79,12 @@ #include "io-wq.h" +#include "filetable.h" #include "io_uring.h" #include "opdef.h" #include "refs.h" #include "tctx.h" +#include "register.h" #include "sqpoll.h" #include "fdinfo.h" #include "kbuf.h" @@ -91,86 +92,99 @@ #include "cancel.h" #include "net.h" #include "notif.h" +#include "waitid.h" +#include "futex.h" +#include "napi.h" +#include "uring_cmd.h" +#include "msg_ring.h" +#include "memmap.h" +#include "zcrx.h" #include "timeout.h" #include "poll.h" +#include "rw.h" #include "alloc_cache.h" - -#define IORING_MAX_ENTRIES 32768 -#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) - -#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ - IORING_REGISTER_LAST + IORING_OP_LAST) +#include "eventfd.h" #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) -#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ - IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) +#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ - REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ - REQ_F_ASYNC_DATA) + REQ_F_INFLIGHT | REQ_F_CREDS | REQ_F_ASYNC_DATA) -#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ +#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | IO_REQ_LINK_FLAGS | \ + REQ_F_REISSUE | REQ_F_POLLED | \ IO_REQ_CLEAN_FLAGS) #define IO_TCTX_REFS_CACHE_NR (1U << 10) #define IO_COMPL_BATCH 32 #define IO_REQ_ALLOC_BATCH 8 - -enum { - IO_CHECK_CQ_OVERFLOW_BIT, - IO_CHECK_CQ_DROPPED_BIT, -}; - -enum { - IO_EVENTFD_OP_SIGNAL_BIT, - IO_EVENTFD_OP_FREE_BIT, -}; - -struct io_defer_entry { - struct list_head list; - struct io_kiocb *req; - u32 seq; -}; +#define IO_LOCAL_TW_DEFAULT_MAX 20 /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) -#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) - -static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct task_struct *task, - bool cancel_all); - -static void io_dismantle_req(struct io_kiocb *req); -static void io_clean_op(struct io_kiocb *req); -static void io_queue_sqe(struct io_kiocb *req); -static void io_move_task_work_from_local(struct io_ring_ctx *ctx); -static void __io_submit_flush_completions(struct io_ring_ctx *ctx); -static __cold void io_fallback_tw(struct io_uring_task *tctx); -static struct kmem_cache *req_cachep; +/* + * No waiters. It's larger than any valid value of the tw counter + * so that tests against ->cq_wait_nr would fail and skip wake_up(). + */ +#define IO_CQ_WAKE_INIT (-1U) +/* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ +#define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) + +static void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags); +static void __io_req_caches_free(struct io_ring_ctx *ctx); + +static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray); + +struct kmem_cache *req_cachep; +static struct workqueue_struct *iou_wq __ro_after_init; + +static int __read_mostly sysctl_io_uring_disabled; +static int __read_mostly sysctl_io_uring_group = -1; + +#ifdef CONFIG_SYSCTL +static const struct ctl_table kernel_io_uring_disabled_table[] = { + { + .procname = "io_uring_disabled", + .data = &sysctl_io_uring_disabled, + .maxlen = sizeof(sysctl_io_uring_disabled), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "io_uring_group", + .data = &sysctl_io_uring_group, + .maxlen = sizeof(gid_t), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; +#endif -struct sock *io_uring_get_socket(struct file *file) +static void io_poison_cached_req(struct io_kiocb *req) { -#if defined(CONFIG_UNIX) - if (io_is_uring_fops(file)) { - struct io_ring_ctx *ctx = file->private_data; - - return ctx->ring_sock->sk; - } -#endif - return NULL; + req->ctx = IO_URING_PTR_POISON; + req->tctx = IO_URING_PTR_POISON; + req->file = IO_URING_PTR_POISON; + req->creds = IO_URING_PTR_POISON; + req->io_task_work.func = IO_URING_PTR_POISON; + req->apoll = IO_URING_PTR_POISON; } -EXPORT_SYMBOL(io_uring_get_socket); -static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) +static void io_poison_req(struct io_kiocb *req) { - if (!wq_list_empty(&ctx->submit_state.compl_reqs) || - ctx->submit_state.cqes_count) - __io_submit_flush_completions(ctx); + io_poison_cached_req(req); + req->async_data = IO_URING_PTR_POISON; + req->kbuf = IO_URING_PTR_POISON; + req->comp_list.next = IO_URING_PTR_POISON; + req->file_node = IO_URING_PTR_POISON; + req->link = IO_URING_PTR_POISON; } static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) @@ -183,44 +197,6 @@ static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx) return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head); } -static bool io_match_linked(struct io_kiocb *head) -{ - struct io_kiocb *req; - - io_for_each_link(req, head) { - if (req->flags & REQ_F_INFLIGHT) - return true; - } - return false; -} - -/* - * As io_match_task() but protected against racing with linked timeouts. - * User must not hold timeout_lock. - */ -bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, - bool cancel_all) -{ - bool matched; - - if (task && head->task != task) - return false; - if (cancel_all) - return true; - - if (head->flags & REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = head->ctx; - - /* protect against races with linked timeouts */ - spin_lock_irq(&ctx->timeout_lock); - matched = io_match_linked(head); - spin_unlock_irq(&ctx->timeout_lock); - } else { - matched = io_match_linked(head); - } - return matched; -} - static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); @@ -229,6 +205,8 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res) static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) { + if (IS_ENABLED(CONFIG_KASAN)) + io_poison_cached_req(req); wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); } @@ -239,43 +217,75 @@ static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) complete(&ctx->ref_comp); } +/* + * Terminate the request if either of these conditions are true: + * + * 1) It's being executed by the original task, but that task is marked + * with PF_EXITING as it's exiting. + * 2) PF_KTHREAD is set, in which case the invoker of the task_work is + * our fallback task_work. + * 3) The ring has been closed and is going away. + */ +static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) +{ + return (current->flags & (PF_EXITING | PF_KTHREAD)) || percpu_ref_is_dying(&ctx->refs); +} + static __cold void io_fallback_req_func(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; - bool locked = false; + struct io_tw_state ts = {}; percpu_ref_get(&ctx->refs); + mutex_lock(&ctx->uring_lock); + ts.cancel = io_should_terminate_tw(ctx); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) - req->io_task_work.func(req, &locked); - - if (locked) { - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); - } + req->io_task_work.func((struct io_tw_req){req}, ts); + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); } static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) { - unsigned hash_buckets = 1U << bits; - size_t hash_size = hash_buckets * sizeof(table->hbs[0]); + unsigned int hash_buckets; + int i; - table->hbs = kmalloc(hash_size, GFP_KERNEL); - if (!table->hbs) - return -ENOMEM; + do { + hash_buckets = 1U << bits; + table->hbs = kvmalloc_array(hash_buckets, sizeof(table->hbs[0]), + GFP_KERNEL_ACCOUNT); + if (table->hbs) + break; + if (bits == 1) + return -ENOMEM; + bits--; + } while (1); table->hash_bits = bits; - init_hash_table(table, hash_buckets); + for (i = 0; i < hash_buckets; i++) + INIT_HLIST_HEAD(&table->hbs[i].list); return 0; } +static void io_free_alloc_caches(struct io_ring_ctx *ctx) +{ + io_alloc_cache_free(&ctx->apoll_cache, kfree); + io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); + io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free); + io_futex_cache_free(ctx); + io_rsrc_cache_free(ctx); +} + static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; int hash_bits; + bool ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -292,84 +302,100 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) hash_bits = clamp(hash_bits, 1, 8); if (io_alloc_hash_table(&ctx->cancel_table, hash_bits)) goto err; - if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits)) - goto err; - - ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); - if (!ctx->dummy_ubuf) - goto err; - /* set invalid range, so io_import_fixed() fails meeting it */ - ctx->dummy_ubuf->ubuf = -1UL; - if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) goto err; ctx->flags = p->flags; + ctx->hybrid_poll_time = LLONG_MAX; + atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); - INIT_LIST_HEAD(&ctx->io_buffers_cache); - io_alloc_cache_init(&ctx->apoll_cache); - io_alloc_cache_init(&ctx->netmsg_cache); + ret = io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, + sizeof(struct async_poll), 0); + ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_async_msghdr), + offsetof(struct io_async_msghdr, clear)); + ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_async_rw), + offsetof(struct io_async_rw, clear)); + ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_async_cmd), + sizeof(struct io_async_cmd)); + ret |= io_futex_cache_init(ctx); + ret |= io_rsrc_cache_init(ctx); + if (ret) + goto free_ref; init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); + init_waitqueue_head(&ctx->poll_wq); spin_lock_init(&ctx->completion_lock); - spin_lock_init(&ctx->timeout_lock); + raw_spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); - INIT_LIST_HEAD(&ctx->io_buffers_pages); - INIT_LIST_HEAD(&ctx->io_buffers_comp); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); - spin_lock_init(&ctx->rsrc_ref_lock); - INIT_LIST_HEAD(&ctx->rsrc_ref_list); - INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); - init_task_work(&ctx->rsrc_put_tw, io_rsrc_put_tw); - init_llist_head(&ctx->rsrc_put_llist); init_llist_head(&ctx->work_llist); INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; - INIT_WQ_LIST(&ctx->locked_free_list); + INIT_HLIST_HEAD(&ctx->waitid_list); + xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC); +#ifdef CONFIG_FUTEX + INIT_HLIST_HEAD(&ctx->futex_list); +#endif INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); INIT_WQ_LIST(&ctx->submit_state.compl_reqs); + INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); + io_napi_init(ctx); + mutex_init(&ctx->mmap_lock); + return ctx; + +free_ref: + percpu_ref_exit(&ctx->refs); err: - kfree(ctx->dummy_ubuf); - kfree(ctx->cancel_table.hbs); - kfree(ctx->cancel_table_locked.hbs); - kfree(ctx->io_bl); + io_free_alloc_caches(ctx); + kvfree(ctx->cancel_table.hbs); xa_destroy(&ctx->io_bl_xa); kfree(ctx); return NULL; } -static void io_account_cq_overflow(struct io_ring_ctx *ctx) +static void io_clean_op(struct io_kiocb *req) { - struct io_rings *r = ctx->rings; - - WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); - ctx->cq_extra--; -} + if (unlikely(req->flags & REQ_F_BUFFER_SELECTED)) + io_kbuf_drop_legacy(req); -static bool req_need_defer(struct io_kiocb *req, u32 seq) -{ - if (unlikely(req->flags & REQ_F_IO_DRAIN)) { - struct io_ring_ctx *ctx = req->ctx; + if (req->flags & REQ_F_NEED_CLEANUP) { + const struct io_cold_def *def = &io_cold_defs[req->opcode]; - return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; + if (def->cleanup) + def->cleanup(req); } - - return false; + if (req->flags & REQ_F_INFLIGHT) + atomic_dec(&req->tctx->inflight_tracked); + if (req->flags & REQ_F_CREDS) + put_cred(req->creds); + if (req->flags & REQ_F_ASYNC_DATA) { + kfree(req->async_data); + req->async_data = NULL; + } + req->flags &= ~IO_REQ_CLEAN_FLAGS; } -static inline void io_req_track_inflight(struct io_kiocb *req) +/* + * Mark the request as inflight, so that file cancelation will find it. + * Can be used if the file is an io_uring instance, or if the request itself + * relies on ->mm being alive for the duration of the request. + */ +inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { req->flags |= REQ_F_INFLIGHT; - atomic_inc(&req->task->io_uring->inflight_tracked); + atomic_inc(&req->tctx->inflight_tracked); } } @@ -387,27 +413,9 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) return req->link; } -static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) -{ - if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) - return NULL; - return __io_prep_linked_timeout(req); -} - -static noinline void __io_arm_ltimeout(struct io_kiocb *req) -{ - io_queue_linked_timeout(__io_prep_linked_timeout(req)); -} - -static inline void io_arm_ltimeout(struct io_kiocb *req) -{ - if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) - __io_arm_ltimeout(req); -} - static void io_prep_async_work(struct io_kiocb *req) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct io_ring_ctx *ctx = req->ctx; if (!(req->flags & REQ_F_CREDS)) { @@ -416,20 +424,25 @@ static void io_prep_async_work(struct io_kiocb *req) } req->work.list.next = NULL; - req->work.flags = 0; - req->work.cancel_seq = atomic_read(&ctx->cancel_seq); + atomic_set(&req->work.flags, 0); if (req->flags & REQ_F_FORCE_ASYNC) - req->work.flags |= IO_WQ_WORK_CONCURRENT; + atomic_or(IO_WQ_WORK_CONCURRENT, &req->work.flags); + + if (req->file && !(req->flags & REQ_F_FIXED_FILE)) + req->flags |= io_file_get_flags(req->file); - if (req->file && !io_req_ffs_set(req)) - req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT; + if (req->file && (req->flags & REQ_F_ISREG)) { + bool should_hash = def->hash_reg_file; - if (req->flags & REQ_F_ISREG) { - if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) + /* don't serialize this request if the fs doesn't need it */ + if (should_hash && (req->file->f_flags & O_DIRECT) && + (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE)) + should_hash = false; + if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) - req->work.flags |= IO_WQ_WORK_UNBOUND; + atomic_or(IO_WQ_WORK_UNBOUND, &req->work.flags); } } @@ -440,23 +453,26 @@ static void io_prep_async_link(struct io_kiocb *req) if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); io_for_each_link(cur, req) io_prep_async_work(cur); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } else { io_for_each_link(cur, req) io_prep_async_work(cur); } } -void io_queue_iowq(struct io_kiocb *req, bool *dont_use) +static void io_queue_iowq(struct io_kiocb *req) { - struct io_kiocb *link = io_prep_linked_timeout(req); - struct io_uring_task *tctx = req->task->io_uring; + struct io_uring_task *tctx = req->tctx; BUG_ON(!tctx); - BUG_ON(!tctx->io_wq); + + if ((current->flags & PF_KTHREAD) || !tctx->io_wq) { + io_req_task_queue_fail(req, -ECANCELED); + return; + } /* init ->work of the whole link before punting */ io_prep_async_link(req); @@ -468,206 +484,149 @@ void io_queue_iowq(struct io_kiocb *req, bool *dont_use) * procedure rather than attempt to run this request (or create a new * worker for it). */ - if (WARN_ON_ONCE(!same_thread_group(req->task, current))) - req->work.flags |= IO_WQ_WORK_CANCEL; + if (WARN_ON_ONCE(!same_thread_group(tctx->task, current))) + atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags); trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); - if (link) - io_queue_linked_timeout(link); } -static __cold void io_queue_deferred(struct io_ring_ctx *ctx) +static void io_req_queue_iowq_tw(struct io_tw_req tw_req, io_tw_token_t tw) { - while (!list_empty(&ctx->defer_list)) { - struct io_defer_entry *de = list_first_entry(&ctx->defer_list, - struct io_defer_entry, list); - - if (req_need_defer(de->req, de->seq)) - break; - list_del_init(&de->list); - io_req_task_queue(de->req); - kfree(de); - } + io_queue_iowq(tw_req.req); } - -static void io_eventfd_ops(struct rcu_head *rcu) +void io_req_queue_iowq(struct io_kiocb *req) { - struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - int ops = atomic_xchg(&ev_fd->ops, 0); - - if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT)) - eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE); - - /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback - * ordering in a race but if references are 0 we know we have to free - * it regardless. - */ - if (atomic_dec_and_test(&ev_fd->refs)) { - eventfd_ctx_put(ev_fd->cq_ev_fd); - kfree(ev_fd); - } + req->io_task_work.func = io_req_queue_iowq_tw; + io_req_task_work_add(req); } -static void io_eventfd_signal(struct io_ring_ctx *ctx) +unsigned io_linked_nr(struct io_kiocb *req) { - struct io_ev_fd *ev_fd = NULL; - - rcu_read_lock(); - /* - * rcu_dereference ctx->io_ev_fd once and use it for both for checking - * and eventfd_signal - */ - ev_fd = rcu_dereference(ctx->io_ev_fd); - - /* - * Check again if ev_fd exists incase an io_eventfd_unregister call - * completed between the NULL check of ctx->io_ev_fd at the start of - * the function and rcu_read_lock. - */ - if (unlikely(!ev_fd)) - goto out; - if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - goto out; - if (ev_fd->eventfd_async && !io_wq_current_is_worker()) - goto out; + struct io_kiocb *tmp; + unsigned nr = 0; - if (likely(eventfd_signal_allowed())) { - eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE); - } else { - atomic_inc(&ev_fd->refs); - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) - call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops); - else - atomic_dec(&ev_fd->refs); - } - -out: - rcu_read_unlock(); + io_for_each_link(tmp, req) + nr++; + return nr; } -static void io_eventfd_flush_signal(struct io_ring_ctx *ctx) +static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) { - bool skip; + bool drain_seen = false, first = true; - spin_lock(&ctx->completion_lock); + lockdep_assert_held(&ctx->uring_lock); + __io_req_caches_free(ctx); - /* - * Eventfd should only get triggered when at least one event has been - * posted. Some applications rely on the eventfd notification count - * only changing IFF a new CQE has been added to the CQ ring. There's - * no depedency on 1:1 relationship between how many times this - * function is called (and hence the eventfd count) and number of CQEs - * posted to the CQ ring. - */ - skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; - spin_unlock(&ctx->completion_lock); - if (skip) - return; + while (!list_empty(&ctx->defer_list)) { + struct io_defer_entry *de = list_first_entry(&ctx->defer_list, + struct io_defer_entry, list); + + drain_seen |= de->req->flags & REQ_F_IO_DRAIN; + if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained) + return; - io_eventfd_signal(ctx); + list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); + io_req_task_queue(de->req); + kfree(de); + first = false; + } } void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { + if (ctx->poll_activated) + io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); - if (ctx->drain_active) { - spin_lock(&ctx->completion_lock); - io_queue_deferred(ctx); - spin_unlock(&ctx->completion_lock); - } if (ctx->has_evfd) - io_eventfd_flush_signal(ctx); + io_eventfd_signal(ctx, true); } static inline void __io_cq_lock(struct io_ring_ctx *ctx) - __acquires(ctx->completion_lock) { - if (!ctx->task_complete) + if (!ctx->lockless_cq) spin_lock(&ctx->completion_lock); } -static inline void __io_cq_unlock(struct io_ring_ctx *ctx) -{ - if (!ctx->task_complete) - spin_unlock(&ctx->completion_lock); -} - static inline void io_cq_lock(struct io_ring_ctx *ctx) __acquires(ctx->completion_lock) { spin_lock(&ctx->completion_lock); } -static inline void io_cq_unlock(struct io_ring_ctx *ctx) - __releases(ctx->completion_lock) -{ - spin_unlock(&ctx->completion_lock); -} - -/* keep it inlined for io_submit_flush_completions() */ static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) - __releases(ctx->completion_lock) { io_commit_cqring(ctx); - __io_cq_unlock(ctx); + if (!ctx->task_complete) { + if (!ctx->lockless_cq) + spin_unlock(&ctx->completion_lock); + /* IOPOLL rings only need to wake up if it's also SQPOLL */ + if (!ctx->syscall_iopoll) + io_cqring_wake(ctx); + } io_commit_cqring_flush(ctx); - io_cqring_wake(ctx); } -void io_cq_unlock_post(struct io_ring_ctx *ctx) +static void io_cq_unlock_post(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - io_commit_cqring_flush(ctx); io_cqring_wake(ctx); + io_commit_cqring_flush(ctx); } -/* Returns true if there are no backlogged entries after the flush */ -static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) -{ - struct io_overflow_cqe *ocqe; - LIST_HEAD(list); - - io_cq_lock(ctx); - list_splice_init(&ctx->cq_overflow_list, &list); - clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); - io_cq_unlock(ctx); - - while (!list_empty(&list)) { - ocqe = list_first_entry(&list, struct io_overflow_cqe, list); - list_del(&ocqe->list); - kfree(ocqe); - } -} - -/* Returns true if there are no backlogged entries after the flush */ -static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) +static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) { - size_t cqe_size = sizeof(struct io_uring_cqe); + lockdep_assert_held(&ctx->uring_lock); - if (__io_cqring_events(ctx) == ctx->cq_entries) + /* don't abort if we're dying, entries must get freed */ + if (!dying && __io_cqring_events(ctx) == ctx->cq_entries) return; - if (ctx->flags & IORING_SETUP_CQE32) - cqe_size <<= 1; - io_cq_lock(ctx); while (!list_empty(&ctx->cq_overflow_list)) { - struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true); + size_t cqe_size = sizeof(struct io_uring_cqe); + struct io_uring_cqe *cqe; struct io_overflow_cqe *ocqe; + bool is_cqe32 = false; - if (!cqe) - break; ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); - memcpy(cqe, &ocqe->cqe, cqe_size); + if (ocqe->cqe.flags & IORING_CQE_F_32 || + ctx->flags & IORING_SETUP_CQE32) { + is_cqe32 = true; + cqe_size <<= 1; + } + if (ctx->flags & IORING_SETUP_CQE32) + is_cqe32 = false; + + if (!dying) { + if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32)) + break; + memcpy(cqe, &ocqe->cqe, cqe_size); + } list_del(&ocqe->list); kfree(ocqe); + + /* + * For silly syzbot cases that deliberately overflow by huge + * amounts, check if we need to resched and drop and + * reacquire the locks if so. Nothing real would ever hit this. + * Ideally we'd have a non-posting unlock for this, but hard + * to care for a non-real case. + */ + if (need_resched()) { + ctx->cqe_sentinel = ctx->cqe_cached; + io_cq_unlock_post(ctx); + mutex_unlock(&ctx->uring_lock); + cond_resched(); + mutex_lock(&ctx->uring_lock); + io_cq_lock(ctx); + } } if (list_empty(&ctx->cq_overflow_list)) { @@ -677,30 +636,32 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) io_cq_unlock_post(ctx); } -static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) +static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) { - /* iopoll syncs against uring_lock, not completion_lock */ - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_lock(&ctx->uring_lock); - __io_cqring_overflow_flush(ctx); - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_unlock(&ctx->uring_lock); + if (ctx->rings) + __io_cqring_overflow_flush(ctx, true); } -static void io_cqring_overflow_flush(struct io_ring_ctx *ctx) +static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) { - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) - io_cqring_do_overflow_flush(ctx); + mutex_lock(&ctx->uring_lock); + __io_cqring_overflow_flush(ctx, false); + mutex_unlock(&ctx->uring_lock); } -void __io_put_task(struct task_struct *task, int nr) +/* must to be called somewhat shortly after putting a request */ +static inline void io_put_task(struct io_kiocb *req) { - struct io_uring_task *tctx = task->io_uring; + struct io_uring_task *tctx = req->tctx; - percpu_counter_sub(&tctx->inflight, nr); - if (unlikely(atomic_read(&tctx->in_idle))) - wake_up(&tctx->wait); - put_task_struct_many(task, nr); + if (likely(tctx->task == current)) { + tctx->cached_refs++; + } else { + percpu_counter_sub(&tctx->inflight, 1); + if (unlikely(atomic_read(&tctx->in_cancel))) + wake_up(&tctx->wait); + put_task_struct(tctx->task); + } } void io_task_refs_refill(struct io_uring_task *tctx) @@ -712,7 +673,7 @@ void io_task_refs_refill(struct io_uring_task *tctx) tctx->cached_refs += refill; } -static __cold void io_uring_drop_tctx_refs(struct task_struct *task) +__cold void io_uring_drop_tctx_refs(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; unsigned int refs = tctx->cached_refs; @@ -724,27 +685,20 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } } -static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags, u64 extra1, u64 extra2) +static __cold bool io_cqring_add_overflow(struct io_ring_ctx *ctx, + struct io_overflow_cqe *ocqe) { - struct io_overflow_cqe *ocqe; - size_t ocq_size = sizeof(struct io_overflow_cqe); - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); - lockdep_assert_held(&ctx->completion_lock); - if (is_cqe32) - ocq_size += sizeof(struct io_uring_cqe); - - ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); - trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); if (!ocqe) { + struct io_rings *r = ctx->rings; + /* * If we're in ring overflow flush mode, or in task cancel mode, * or cannot allocate an overflow entry, then we need to drop it * on the floor. */ - io_account_cq_overflow(ctx); + WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); return false; } @@ -753,26 +707,55 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } - ocqe->cqe.user_data = user_data; - ocqe->cqe.res = res; - ocqe->cqe.flags = cflags; - if (is_cqe32) { - ocqe->cqe.big_cqe[0] = extra1; - ocqe->cqe.big_cqe[1] = extra2; - } list_add_tail(&ocqe->list, &ctx->cq_overflow_list); return true; } -bool io_req_cqe_overflow(struct io_kiocb *req) +static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, + struct io_cqe *cqe, + struct io_big_cqe *big_cqe, gfp_t gfp) { - if (!(req->flags & REQ_F_CQE32_INIT)) { - req->extra1 = 0; - req->extra2 = 0; + struct io_overflow_cqe *ocqe; + size_t ocq_size = sizeof(struct io_overflow_cqe); + bool is_cqe32 = false; + + if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) { + is_cqe32 = true; + ocq_size += sizeof(struct io_uring_cqe); + } + + ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT); + trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe); + if (ocqe) { + ocqe->cqe.user_data = cqe->user_data; + ocqe->cqe.res = cqe->res; + ocqe->cqe.flags = cqe->flags; + if (is_cqe32 && big_cqe) { + ocqe->cqe.big_cqe[0] = big_cqe->extra1; + ocqe->cqe.big_cqe[1] = big_cqe->extra2; + } + } + if (big_cqe) + big_cqe->extra1 = big_cqe->extra2 = 0; + return ocqe; +} + +/* + * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE + * because the ring is a single 16b entry away from wrapping. + */ +static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off) +{ + if (__io_cqring_events(ctx) < ctx->cq_entries) { + struct io_uring_cqe *cqe = &ctx->rings->cqes[off]; + + cqe->user_data = 0; + cqe->res = 0; + cqe->flags = IORING_CQE_F_SKIP; + ctx->cached_cq_tail++; + return true; } - return io_cqring_event_overflow(req->ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->extra1, req->extra2); + return false; } /* @@ -780,7 +763,7 @@ bool io_req_cqe_overflow(struct io_kiocb *req) * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ -struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) +bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); @@ -792,15 +775,25 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) * Force overflow the completion. */ if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))) - return NULL; + return false; + + /* + * Post dummy CQE if a 32b CQE is needed and there's only room for a + * 16b CQE before the ring wraps. + */ + if (cqe32 && off + 1 == ctx->cq_entries) { + if (!io_fill_nop_cqe(ctx, off)) + return false; + off = 0; + } /* userspace may cheat modifying the tail, be safe and do min */ queued = min(__io_cqring_events(ctx), ctx->cq_entries); free = ctx->cq_entries - queued; /* we need a contiguous range, limit based on the current array offset */ len = min(free, ctx->cq_entries - off); - if (!len) - return NULL; + if (len < (cqe32 + 1)) + return false; if (ctx->flags & IORING_SETUP_CQE32) { off <<= 1; @@ -809,211 +802,214 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) ctx->cqe_cached = &rings->cqes[off]; ctx->cqe_sentinel = ctx->cqe_cached + len; + return true; +} - ctx->cached_cq_tail++; - ctx->cqe_cached++; - if (ctx->flags & IORING_SETUP_CQE32) - ctx->cqe_cached++; - return &rings->cqes[off]; +static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx, + struct io_uring_cqe src_cqe[2]) +{ + struct io_uring_cqe *cqe; + + if (WARN_ON_ONCE(!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))) + return false; + if (unlikely(!io_get_cqe(ctx, &cqe, true))) + return false; + + memcpy(cqe, src_cqe, 2 * sizeof(*cqe)); + trace_io_uring_complete(ctx, NULL, cqe); + return true; } static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { + bool cqe32 = cflags & IORING_CQE_F_32; struct io_uring_cqe *cqe; - ctx->cq_extra++; - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); - + if (likely(io_get_cqe(ctx, &cqe, cqe32))) { WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - if (ctx->flags & IORING_SETUP_CQE32) { + if (cqe32) { WRITE_ONCE(cqe->big_cqe[0], 0); WRITE_ONCE(cqe->big_cqe[1], 0); } + + trace_io_uring_complete(ctx, NULL, cqe); return true; } return false; } -static void __io_flush_post_cqes(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) +static inline struct io_cqe io_init_cqe(u64 user_data, s32 res, u32 cflags) { - struct io_submit_state *state = &ctx->submit_state; - unsigned int i; + return (struct io_cqe) { .user_data = user_data, .res = res, .flags = cflags }; +} - lockdep_assert_held(&ctx->uring_lock); - for (i = 0; i < state->cqes_count; i++) { - struct io_uring_cqe *cqe = &state->cqes[i]; - - if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) { - if (ctx->task_complete) { - spin_lock(&ctx->completion_lock); - io_cqring_event_overflow(ctx, cqe->user_data, - cqe->res, cqe->flags, 0, 0); - spin_unlock(&ctx->completion_lock); - } else { - io_cqring_event_overflow(ctx, cqe->user_data, - cqe->res, cqe->flags, 0, 0); - } - } - } - state->cqes_count = 0; +static __cold void io_cqe_overflow(struct io_ring_ctx *ctx, struct io_cqe *cqe, + struct io_big_cqe *big_cqe) +{ + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_KERNEL); + spin_lock(&ctx->completion_lock); + io_cqring_add_overflow(ctx, ocqe); + spin_unlock(&ctx->completion_lock); } -static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, - bool allow_overflow) +static __cold bool io_cqe_overflow_locked(struct io_ring_ctx *ctx, + struct io_cqe *cqe, + struct io_big_cqe *big_cqe) +{ + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_ATOMIC); + return io_cqring_add_overflow(ctx, ocqe); +} + +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { bool filled; io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); - if (!filled && allow_overflow) - filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + if (unlikely(!filled)) { + struct io_cqe cqe = io_init_cqe(user_data, res, cflags); + filled = io_cqe_overflow_locked(ctx, &cqe, NULL); + } io_cq_unlock_post(ctx); return filled; } -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) +/* + * Must be called from inline task_work so we know a flush will happen later, + * and obviously with ctx->uring_lock held (tw always has that). + */ +void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { - return __io_post_aux_cqe(ctx, user_data, res, cflags, true); + lockdep_assert_held(&ctx->uring_lock); + lockdep_assert(ctx->lockless_cq); + + if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { + struct io_cqe cqe = io_init_cqe(user_data, res, cflags); + + io_cqe_overflow(ctx, &cqe, NULL); + } + ctx->submit_state.cq_flush = true; } -bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, - bool allow_overflow) +/* + * A helper for multishot requests posting additional CQEs. + * Should only be used from a task_work including IO_URING_F_MULTISHOT. + */ +bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) { - struct io_uring_cqe *cqe; - unsigned int length; - - if (!defer) - return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow); + struct io_ring_ctx *ctx = req->ctx; + bool posted; - length = ARRAY_SIZE(ctx->submit_state.cqes); + /* + * If multishot has already posted deferred completions, ensure that + * those are flushed first before posting this one. If not, CQEs + * could get reordered. + */ + if (!wq_list_empty(&ctx->submit_state.compl_reqs)) + __io_submit_flush_completions(ctx); + lockdep_assert(!io_wq_current_is_worker()); lockdep_assert_held(&ctx->uring_lock); - if (ctx->submit_state.cqes_count == length) { - __io_cq_lock(ctx); - __io_flush_post_cqes(ctx); - /* no need to flush - flush is deferred */ - __io_cq_unlock_post(ctx); + if (!ctx->lockless_cq) { + spin_lock(&ctx->completion_lock); + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); + spin_unlock(&ctx->completion_lock); + } else { + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); } - /* For defered completions this is not as strict as it is otherwise, - * however it's main job is to prevent unbounded posted completions, - * and in that it works just as well. - */ - if (!allow_overflow && test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) - return false; - - cqe = &ctx->submit_state.cqes[ctx->submit_state.cqes_count++]; - cqe->user_data = user_data; - cqe->res = res; - cqe->flags = cflags; - return true; + ctx->submit_state.cq_flush = true; + return posted; } -static void __io_req_complete_post(struct io_kiocb *req) +/* + * A helper for multishot requests posting additional CQEs. + * Should only be used from a task_work including IO_URING_F_MULTISHOT. + */ +bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe cqe[2]) { struct io_ring_ctx *ctx = req->ctx; + bool posted; - io_cq_lock(ctx); - if (!(req->flags & REQ_F_CQE_SKIP)) - io_fill_cqe_req(ctx, req); + lockdep_assert(!io_wq_current_is_worker()); + lockdep_assert_held(&ctx->uring_lock); - /* - * If we're the last reference to this request, add to our locked - * free_list cache. - */ - if (req_ref_put_and_test(req)) { - if (req->flags & IO_REQ_LINK_FLAGS) { - if (req->flags & IO_DISARM_MASK) - io_disarm_next(req); - if (req->link) { - io_req_task_queue(req->link); - req->link = NULL; - } - } - io_req_put_rsrc(req); - /* - * Selected buffer deallocation in io_clean_op() assumes that - * we don't hold ->completion_lock. Clean them here to avoid - * deadlocks. - */ - io_put_kbuf_comp(req); - io_dismantle_req(req); - io_put_task(req->task, 1); - wq_list_add_head(&req->comp_list, &ctx->locked_free_list); - ctx->locked_free_nr++; + cqe[0].user_data = req->cqe.user_data; + if (!ctx->lockless_cq) { + spin_lock(&ctx->completion_lock); + posted = io_fill_cqe_aux32(ctx, cqe); + spin_unlock(&ctx->completion_lock); + } else { + posted = io_fill_cqe_aux32(ctx, cqe); } - io_cq_unlock_post(ctx); + + ctx->submit_state.cq_flush = true; + return posted; } -void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) +static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { - if (req->ctx->task_complete && (issue_flags & IO_URING_F_IOWQ)) { + struct io_ring_ctx *ctx = req->ctx; + bool completed = true; + + /* + * All execution paths but io-wq use the deferred completions by + * passing IO_URING_F_COMPLETE_DEFER and thus should not end up here. + */ + if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_IOWQ))) + return; + + /* + * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires + * the submitter task context, IOPOLL protects with uring_lock. + */ + if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) { +defer_complete: req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); - } else if (!(issue_flags & IO_URING_F_UNLOCKED) || - !(req->ctx->flags & IORING_SETUP_IOPOLL)) { - __io_req_complete_post(req); - } else { - struct io_ring_ctx *ctx = req->ctx; - - mutex_lock(&ctx->uring_lock); - __io_req_complete_post(req); - mutex_unlock(&ctx->uring_lock); + return; } + + io_cq_lock(ctx); + if (!(req->flags & REQ_F_CQE_SKIP)) + completed = io_fill_cqe_req(ctx, req); + io_cq_unlock_post(ctx); + + if (!completed) + goto defer_complete; + + /* + * We don't free the request here because we know it's called from + * io-wq only, which holds a reference, so it cannot be the last put. + */ + req_ref_put(req); } void io_req_defer_failed(struct io_kiocb *req, s32 res) __must_hold(&ctx->uring_lock) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_cold_def *def = &io_cold_defs[req->opcode]; lockdep_assert_held(&req->ctx->uring_lock); req_set_fail(req); - io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); + io_req_set_res(req, res, io_put_kbuf(req, res, NULL)); if (def->fail) def->fail(req); io_req_complete_defer(req); } /* - * Don't initialise the fields below on every allocation, but do that in - * advance and keep them valid across allocations. - */ -static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) -{ - req->ctx = ctx; - req->link = NULL; - req->async_data = NULL; - /* not necessary, but safer to zero */ - req->cqe.res = 0; -} - -static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, - struct io_submit_state *state) -{ - spin_lock(&ctx->completion_lock); - wq_list_splice(&ctx->locked_free_list, &state->free_list); - ctx->locked_free_nr = 0; - spin_unlock(&ctx->completion_lock); -} - -/* * A request might get retired back into the request caches even before opcode * handlers and io_issue_sqe() are done with it, e.g. inline completion path. * Because of that, io_alloc_req() should be called only under ->uring_lock @@ -1022,20 +1018,9 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO; void *reqs[IO_REQ_ALLOC_BATCH]; - int ret, i; - - /* - * If we have more than a batch's worth of requests in our IRQ side - * locked cache, grab the lock and move them over to our submission - * side cache. - */ - if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) { - io_flush_cached_locked_reqs(ctx, &ctx->submit_state); - if (!io_req_cache_empty(ctx)) - return true; - } + int ret; ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); @@ -1051,37 +1036,24 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) } percpu_ref_get_many(&ctx->refs, ret); - for (i = 0; i < ret; i++) { - struct io_kiocb *req = reqs[i]; + ctx->nr_req_allocated += ret; + + while (ret--) { + struct io_kiocb *req = reqs[ret]; - io_preinit_req(req, ctx); io_req_add_to_cache(req, ctx); } return true; } -static inline void io_dismantle_req(struct io_kiocb *req) -{ - unsigned int flags = req->flags; - - if (unlikely(flags & IO_REQ_CLEAN_FLAGS)) - io_clean_op(req); - if (!(flags & REQ_F_FIXED_FILE)) - io_put_file(req->file); -} - __cold void io_free_req(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - - io_req_put_rsrc(req); - io_dismantle_req(req); - io_put_task(req->task, 1); - - spin_lock(&ctx->completion_lock); - wq_list_add_head(&req->comp_list, &ctx->locked_free_list); - ctx->locked_free_nr++; - spin_unlock(&ctx->completion_lock); + /* refs were already put, restore them for io_req_task_complete() */ + req->flags &= ~REQ_F_REFCOUNT; + /* we only want to free it, don't post CQEs */ + req->flags |= REQ_F_CQE_SKIP; + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); } static void __io_req_find_next_prep(struct io_kiocb *req) @@ -1110,165 +1082,200 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) return nxt; } -static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) +static void ctx_flush_and_put(struct io_ring_ctx *ctx, io_tw_token_t tw) { if (!ctx) return; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (*locked) { - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); - *locked = false; - } + + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); } -static unsigned int handle_tw_list(struct llist_node *node, - struct io_ring_ctx **ctx, bool *locked, - struct llist_node *last) +/* + * Run queued task_work, returning the number of entries processed in *count. + * If more entries than max_entries are available, stop processing once this + * is reached and return the rest of the list. + */ +struct llist_node *io_handle_tw_list(struct llist_node *node, + unsigned int *count, + unsigned int max_entries) { - unsigned int count = 0; + struct io_ring_ctx *ctx = NULL; + struct io_tw_state ts = { }; - while (node != last) { + do { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); - prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - - if (req->ctx != *ctx) { - ctx_flush_and_put(*ctx, locked); - *ctx = req->ctx; - /* if not contended, grab and improve batching */ - *locked = mutex_trylock(&(*ctx)->uring_lock); - percpu_ref_get(&(*ctx)->refs); + if (req->ctx != ctx) { + ctx_flush_and_put(ctx, ts); + ctx = req->ctx; + mutex_lock(&ctx->uring_lock); + percpu_ref_get(&ctx->refs); + ts.cancel = io_should_terminate_tw(ctx); } - req->io_task_work.func(req, locked); + INDIRECT_CALL_2(req->io_task_work.func, + io_poll_task_func, io_req_rw_complete, + (struct io_tw_req){req}, ts); node = next; - count++; - } + (*count)++; + if (unlikely(need_resched())) { + ctx_flush_and_put(ctx, ts); + ctx = NULL; + cond_resched(); + } + } while (node && *count < max_entries); - return count; + ctx_flush_and_put(ctx, ts); + return node; } -/** - * io_llist_xchg - swap all entries in a lock-less list - * @head: the head of lock-less list to delete all entries - * @new: new entry as the head of the list - * - * If list is empty, return NULL, otherwise, return the pointer to the first entry. - * The order of entries returned is from the newest to the oldest added one. - */ -static inline struct llist_node *io_llist_xchg(struct llist_head *head, - struct llist_node *new) +static __cold void __io_fallback_tw(struct llist_node *node, bool sync) { - return xchg(&head->first, new); -} + struct io_ring_ctx *last_ctx = NULL; + struct io_kiocb *req; -/** - * io_llist_cmpxchg - possibly swap all entries in a lock-less list - * @head: the head of lock-less list to delete all entries - * @old: expected old value of the first entry of the list - * @new: new entry as the head of the list - * - * perform a cmpxchg on the first entry of the list. - */ + while (node) { + req = container_of(node, struct io_kiocb, io_task_work.node); + node = node->next; + if (last_ctx != req->ctx) { + if (last_ctx) { + if (sync) + flush_delayed_work(&last_ctx->fallback_work); + percpu_ref_put(&last_ctx->refs); + } + last_ctx = req->ctx; + percpu_ref_get(&last_ctx->refs); + } + if (llist_add(&req->io_task_work.node, &last_ctx->fallback_llist)) + schedule_delayed_work(&last_ctx->fallback_work, 1); + } -static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head, - struct llist_node *old, - struct llist_node *new) + if (last_ctx) { + if (sync) + flush_delayed_work(&last_ctx->fallback_work); + percpu_ref_put(&last_ctx->refs); + } +} + +static void io_fallback_tw(struct io_uring_task *tctx, bool sync) { - return cmpxchg(&head->first, old, new); + struct llist_node *node = llist_del_all(&tctx->task_list); + + __io_fallback_tw(node, sync); } -void tctx_task_work(struct callback_head *cb) +struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, + unsigned int max_entries, + unsigned int *count) { - bool uring_locked = false; - struct io_ring_ctx *ctx = NULL; - struct io_uring_task *tctx = container_of(cb, struct io_uring_task, - task_work); - struct llist_node fake = {}; struct llist_node *node; - unsigned int loops = 1; - unsigned int count; - - if (unlikely(current->flags & PF_EXITING)) { - io_fallback_tw(tctx); - return; - } - node = io_llist_xchg(&tctx->task_list, &fake); - count = handle_tw_list(node, &ctx, &uring_locked, NULL); - node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); - while (node != &fake) { - loops++; - node = io_llist_xchg(&tctx->task_list, &fake); - count += handle_tw_list(node, &ctx, &uring_locked, &fake); - node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); + node = llist_del_all(&tctx->task_list); + if (node) { + node = llist_reverse_order(node); + node = io_handle_tw_list(node, count, max_entries); } - ctx_flush_and_put(ctx, &uring_locked); - - /* relaxed read is enough as only the task itself sets ->in_idle */ - if (unlikely(atomic_read(&tctx->in_idle))) + /* relaxed read is enough as only the task itself sets ->in_cancel */ + if (unlikely(atomic_read(&tctx->in_cancel))) io_uring_drop_tctx_refs(current); - trace_io_uring_task_work_run(tctx, count, loops); + trace_io_uring_task_work_run(tctx, *count); + return node; } -static __cold void io_fallback_tw(struct io_uring_task *tctx) +void tctx_task_work(struct callback_head *cb) { - struct llist_node *node = llist_del_all(&tctx->task_list); - struct io_kiocb *req; + struct io_uring_task *tctx; + struct llist_node *ret; + unsigned int count = 0; - while (node) { - req = container_of(node, struct io_kiocb, io_task_work.node); - node = node->next; - if (llist_add(&req->io_task_work.node, - &req->ctx->fallback_llist)) - schedule_delayed_work(&req->ctx->fallback_work, 1); - } + tctx = container_of(cb, struct io_uring_task, task_work); + ret = tctx_task_work_run(tctx, UINT_MAX, &count); + /* can't happen */ + WARN_ON_ONCE(ret); } -static void io_req_local_work_add(struct io_kiocb *req) +static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { struct io_ring_ctx *ctx = req->ctx; + unsigned nr_wait, nr_tw, nr_tw_prev; + struct llist_node *head; - percpu_ref_get(&ctx->refs); + /* See comment above IO_CQ_WAKE_INIT */ + BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES); - if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) { - percpu_ref_put(&ctx->refs); - return; - } - /* need it for the following io_cqring_wake() */ - smp_mb__after_atomic(); + /* + * We don't know how many requests there are in the link and whether + * they can even be queued lazily, fall back to non-lazy. + */ + if (req->flags & IO_REQ_LINK_FLAGS) + flags &= ~IOU_F_TWQ_LAZY_WAKE; - if (unlikely(atomic_read(&req->task->io_uring->in_idle))) { - io_move_task_work_from_local(ctx); - percpu_ref_put(&ctx->refs); - return; - } + guard(rcu)(); - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + head = READ_ONCE(ctx->work_llist.first); + do { + nr_tw_prev = 0; + if (head) { + struct io_kiocb *first_req = container_of(head, + struct io_kiocb, + io_task_work.node); + /* + * Might be executed at any moment, rely on + * SLAB_TYPESAFE_BY_RCU to keep it alive. + */ + nr_tw_prev = READ_ONCE(first_req->nr_tw); + } - if (ctx->has_evfd) - io_eventfd_signal(ctx); - __io_cqring_wake(ctx); - percpu_ref_put(&ctx->refs); + /* + * Theoretically, it can overflow, but that's fine as one of + * previous adds should've tried to wake the task. + */ + nr_tw = nr_tw_prev + 1; + if (!(flags & IOU_F_TWQ_LAZY_WAKE)) + nr_tw = IO_CQ_WAKE_FORCE; + + req->nr_tw = nr_tw; + req->io_task_work.node.next = head; + } while (!try_cmpxchg(&ctx->work_llist.first, &head, + &req->io_task_work.node)); + + /* + * cmpxchg implies a full barrier, which pairs with the barrier + * in set_current_state() on the io_cqring_wait() side. It's used + * to ensure that either we see updated ->cq_wait_nr, or waiters + * going to sleep will observe the work added to the list, which + * is similar to the wait/wawke task state sync. + */ + + if (!head) { + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + if (ctx->has_evfd) + io_eventfd_signal(ctx, false); + } + + nr_wait = atomic_read(&ctx->cq_wait_nr); + /* not enough or no one is waiting */ + if (nr_tw < nr_wait) + return; + /* the previous add has already woken it up */ + if (nr_tw_prev >= nr_wait) + return; + wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); } -void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) +static void io_req_normal_work_add(struct io_kiocb *req) { - struct io_uring_task *tctx = req->task->io_uring; + struct io_uring_task *tctx = req->tctx; struct io_ring_ctx *ctx = req->ctx; - if (allow_local && ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - io_req_local_work_add(req); - return; - } - /* task_work already pending, we're done */ if (!llist_add(&req->io_task_work.node, &tctx->task_list)) return; @@ -1276,99 +1283,155 @@ void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) + /* SQPOLL doesn't need the task_work added, it'll run it itself */ + if (ctx->flags & IORING_SETUP_SQPOLL) { + __set_notify_signal(tctx->task); + return; + } + + if (likely(!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method))) return; - io_fallback_tw(tctx); + io_fallback_tw(tctx, false); } -static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) +void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) { - struct llist_node *node; + if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) + io_req_local_work_add(req, flags); + else + io_req_normal_work_add(req); +} - node = llist_del_all(&ctx->work_llist); - while (node) { - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); +void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags) +{ + if (WARN_ON_ONCE(!(req->ctx->flags & IORING_SETUP_DEFER_TASKRUN))) + return; + __io_req_task_work_add(req, flags); +} - node = node->next; - __io_req_task_work_add(req, false); - } +static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) +{ + struct llist_node *node = llist_del_all(&ctx->work_llist); + + __io_fallback_tw(node, false); + node = llist_del_all(&ctx->retry_llist); + __io_fallback_tw(node, false); } -int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked) +static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, + int min_events) { - struct llist_node *node; - struct llist_node fake; - struct llist_node *current_final = NULL; - int ret; - unsigned int loops = 1; + if (!io_local_work_pending(ctx)) + return false; + if (events < min_events) + return true; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + return false; +} - if (unlikely(ctx->submitter_task != current)) - return -EEXIST; +static int __io_run_local_work_loop(struct llist_node **node, + io_tw_token_t tw, + int events) +{ + int ret = 0; - node = io_llist_xchg(&ctx->work_llist, &fake); - ret = 0; -again: - while (node != current_final) { - struct llist_node *next = node->next; - struct io_kiocb *req = container_of(node, struct io_kiocb, + while (*node) { + struct llist_node *next = (*node)->next; + struct io_kiocb *req = container_of(*node, struct io_kiocb, io_task_work.node); - prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - req->io_task_work.func(req, locked); - ret++; - node = next; + INDIRECT_CALL_2(req->io_task_work.func, + io_poll_task_func, io_req_rw_complete, + (struct io_tw_req){req}, tw); + *node = next; + if (++ret >= events) + break; } + return ret; +} + +static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw, + int min_events, int max_events) +{ + struct llist_node *node; + unsigned int loops = 0; + int ret = 0; + + if (WARN_ON_ONCE(ctx->submitter_task != current)) + return -EEXIST; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); +again: + tw.cancel = io_should_terminate_tw(ctx); + min_events -= ret; + ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); + if (ctx->retry_llist.first) + goto retry_done; + + /* + * llists are in reverse order, flip it back the right way before + * running the pending items. + */ + node = llist_reverse_order(llist_del_all(&ctx->work_llist)); + ret += __io_run_local_work_loop(&node, tw, max_events - ret); + ctx->retry_llist.first = node; + loops++; - node = io_llist_cmpxchg(&ctx->work_llist, &fake, NULL); - if (node != &fake) { - loops++; - current_final = &fake; - node = io_llist_xchg(&ctx->work_llist, &fake); + if (io_run_local_work_continue(ctx, ret, min_events)) + goto again; +retry_done: + io_submit_flush_completions(ctx); + if (io_run_local_work_continue(ctx, ret, min_events)) goto again; - } - if (*locked) - io_submit_flush_completions(ctx); trace_io_uring_local_work_run(ctx, ret, loops); return ret; - } -int io_run_local_work(struct io_ring_ctx *ctx) +static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, + int min_events) { - bool locked; - int ret; + struct io_tw_state ts = {}; - if (llist_empty(&ctx->work_llist)) + if (!io_local_work_pending(ctx)) return 0; + return __io_run_local_work(ctx, ts, min_events, + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); +} - __set_current_state(TASK_RUNNING); - locked = mutex_trylock(&ctx->uring_lock); - ret = __io_run_local_work(ctx, &locked); - if (locked) - mutex_unlock(&ctx->uring_lock); +int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events) +{ + struct io_tw_state ts = {}; + int ret; + mutex_lock(&ctx->uring_lock); + ret = __io_run_local_work(ctx, ts, min_events, max_events); + mutex_unlock(&ctx->uring_lock); return ret; } -static void io_req_task_cancel(struct io_kiocb *req, bool *locked) +static void io_req_task_cancel(struct io_tw_req tw_req, io_tw_token_t tw) { - io_tw_lock(req->ctx, locked); + struct io_kiocb *req = tw_req.req; + + io_tw_lock(req->ctx, tw); io_req_defer_failed(req, req->cqe.res); } -void io_req_task_submit(struct io_kiocb *req, bool *locked) +void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw) { - io_tw_lock(req->ctx, locked); - /* req->task == current here, checking PF_EXITING is safe */ - if (likely(!(req->task->flags & PF_EXITING))) - io_queue_sqe(req); - else + struct io_kiocb *req = tw_req.req; + struct io_ring_ctx *ctx = req->ctx; + + io_tw_lock(ctx, tw); + if (unlikely(tw.cancel)) io_req_defer_failed(req, -EFAULT); + else if (req->flags & REQ_F_FORCE_ASYNC) + io_queue_iowq(req); + else + io_queue_sqe(req, 0); } void io_req_task_queue_fail(struct io_kiocb *req, int ret) @@ -1392,17 +1455,31 @@ void io_queue_next(struct io_kiocb *req) io_req_task_queue(nxt); } -void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) - __must_hold(&ctx->uring_lock) +static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) { - struct task_struct *task = NULL; - int task_refs = 0; + if (req->file_node) { + io_put_rsrc_node(req->ctx, req->file_node); + req->file_node = NULL; + } + if (req->flags & REQ_F_BUF_NODE) + io_put_rsrc_node(req->ctx, req->buf_node); +} +static void io_free_batch_list(struct io_ring_ctx *ctx, + struct io_wq_work_node *node) + __must_hold(&ctx->uring_lock) +{ do { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { + if (req->flags & REQ_F_REISSUE) { + node = req->comp_list.next; + req->flags &= ~REQ_F_REISSUE; + io_queue_iowq(req); + continue; + } if (req->flags & REQ_F_REFCOUNT) { node = req->comp_list.next; if (!req_ref_put_and_test(req)) @@ -1413,8 +1490,7 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) if (apoll->double_poll) kfree(apoll->double_poll); - if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache)) - kfree(apoll); + io_cache_free(&ctx->apoll_cache, apoll); req->flags &= ~REQ_F_POLLED; } if (req->flags & IO_REQ_LINK_FLAGS) @@ -1422,73 +1498,50 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) io_clean_op(req); } - if (!(req->flags & REQ_F_FIXED_FILE)) - io_put_file(req->file); + io_put_file(req); + io_req_put_rsrc_nodes(req); + io_put_task(req); - io_req_put_rsrc_locked(req, ctx); - - if (req->task != task) { - if (task) - io_put_task(task, task_refs); - task = req->task; - task_refs = 0; - } - task_refs++; node = req->comp_list.next; io_req_add_to_cache(req, ctx); } while (node); - - if (task) - io_put_task(task, task_refs); } -static void __io_submit_flush_completions(struct io_ring_ctx *ctx) +void __io_submit_flush_completions(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - struct io_wq_work_node *node, *prev; struct io_submit_state *state = &ctx->submit_state; + struct io_wq_work_node *node; __io_cq_lock(ctx); - /* must come first to preserve CQE ordering in failure cases */ - if (state->cqes_count) - __io_flush_post_cqes(ctx); - wq_list_for_each(node, prev, &state->compl_reqs) { + __wq_list_for_each(node, &state->compl_reqs) { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - if (!(req->flags & REQ_F_CQE_SKIP) && - unlikely(!__io_fill_cqe_req(ctx, req))) { - if (ctx->task_complete) { - spin_lock(&ctx->completion_lock); - io_req_cqe_overflow(req); - spin_unlock(&ctx->completion_lock); - } else { - io_req_cqe_overflow(req); - } + /* + * Requests marked with REQUEUE should not post a CQE, they + * will go through the io-wq retry machinery and post one + * later. + */ + if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && + unlikely(!io_fill_cqe_req(ctx, req))) { + if (ctx->lockless_cq) + io_cqe_overflow(ctx, &req->cqe, &req->big_cqe); + else + io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe); } } __io_cq_unlock_post(ctx); - if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { + if (!wq_list_empty(&state->compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } -} -/* - * Drop reference to request, return next in chain (if there is one) if this - * was the last reference to this request. - */ -static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) -{ - struct io_kiocb *nxt = NULL; + if (unlikely(ctx->drain_active)) + io_queue_deferred(ctx); - if (req_ref_put_and_test(req)) { - if (unlikely(req->flags & IO_REQ_LINK_FLAGS)) - nxt = io_req_find_next(req); - io_free_req(req); - } - return nxt; + ctx->submit_state.cq_flush = false; } static unsigned io_cqring_events(struct io_ring_ctx *ctx) @@ -1502,7 +1555,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx) * We can't just wait for polled events to come to us, we have to actively * find and complete them. */ -static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) +__cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_IOPOLL)) return; @@ -1524,21 +1577,27 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) } } mutex_unlock(&ctx->uring_lock); + + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + io_move_task_work_from_local(ctx); } -static int io_iopoll_check(struct io_ring_ctx *ctx, long min) +static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) { unsigned int nr_events = 0; - int ret = 0; unsigned long check_cq; + min_events = min(min_events, ctx->cq_entries); + + lockdep_assert_held(&ctx->uring_lock); + if (!io_allowed_run_tw(ctx)) return -EEXIST; check_cq = READ_ONCE(ctx->check_cq); if (unlikely(check_cq)) { if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) - __io_cqring_overflow_flush(ctx); + __io_cqring_overflow_flush(ctx, false); /* * Similarly do not spin if we have not informed the user of any * dropped CQE. @@ -1555,6 +1614,8 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) return 0; do { + int ret = 0; + /* * If a submit got punted to a workqueue, we can have the * application entering polling for a command before it gets @@ -1569,7 +1630,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) io_task_work_pending(ctx)) { u32 tail = ctx->cached_cq_tail; - (void) io_run_local_work_locked(ctx); + (void) io_run_local_work_locked(ctx, min_events); if (task_work_pending(current) || wq_list_empty(&ctx->iopoll_list)) { @@ -1582,22 +1643,24 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) wq_list_empty(&ctx->iopoll_list)) break; } - ret = io_do_iopoll(ctx, !min); - if (ret < 0) + ret = io_do_iopoll(ctx, !min_events); + if (unlikely(ret < 0)) + return ret; + + if (task_sigpending(current)) + return -EINTR; + if (need_resched()) break; + nr_events += ret; - ret = 0; - } while (nr_events < min && !need_resched()); + } while (nr_events < min_events); - return ret; + return 0; } -void io_req_task_complete(struct io_kiocb *req, bool *locked) +void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw) { - if (*locked) - io_req_complete_defer(req); - else - io_req_complete_post(req, IO_URING_F_UNLOCKED); + io_req_complete_defer(tw_req.req); } /* @@ -1655,174 +1718,47 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) } } -static bool io_bdev_nowait(struct block_device *bdev) -{ - return !bdev || bdev_nowait(bdev); -} - -/* - * If we tracked the file through the SCM inflight mechanism, we could support - * any file. For now, just ensure that anything potentially problematic is done - * inline. - */ -static bool __io_file_supports_nowait(struct file *file, umode_t mode) +io_req_flags_t io_file_get_flags(struct file *file) { - if (S_ISBLK(mode)) { - if (IS_ENABLED(CONFIG_BLOCK) && - io_bdev_nowait(I_BDEV(file->f_mapping->host))) - return true; - return false; - } - if (S_ISSOCK(mode)) - return true; - if (S_ISREG(mode)) { - if (IS_ENABLED(CONFIG_BLOCK) && - io_bdev_nowait(file->f_inode->i_sb->s_bdev) && - !io_is_uring_fops(file)) - return true; - return false; - } + io_req_flags_t res = 0; - /* any ->read/write should understand O_NONBLOCK */ - if (file->f_flags & O_NONBLOCK) - return true; - return file->f_mode & FMODE_NOWAIT; -} - -/* - * If we tracked the file through the SCM inflight mechanism, we could support - * any file. For now, just ensure that anything potentially problematic is done - * inline. - */ -unsigned int io_file_get_flags(struct file *file) -{ - umode_t mode = file_inode(file)->i_mode; - unsigned int res = 0; + BUILD_BUG_ON(REQ_F_ISREG_BIT != REQ_F_SUPPORT_NOWAIT_BIT + 1); - if (S_ISREG(mode)) - res |= FFS_ISREG; - if (__io_file_supports_nowait(file, mode)) - res |= FFS_NOWAIT; + if (S_ISREG(file_inode(file)->i_mode)) + res |= REQ_F_ISREG; + if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT)) + res |= REQ_F_SUPPORT_NOWAIT; return res; } -bool io_alloc_async_data(struct io_kiocb *req) -{ - WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); - req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); - if (req->async_data) { - req->flags |= REQ_F_ASYNC_DATA; - return false; - } - return true; -} - -int io_req_prep_async(struct io_kiocb *req) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - - /* assign early for deferred execution for non-fixed file */ - if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) - req->file = io_file_get_normal(req, req->cqe.fd); - if (!def->prep_async) - return 0; - if (WARN_ON_ONCE(req_has_async_data(req))) - return -EFAULT; - if (!io_op_defs[req->opcode].manual_alloc) { - if (io_alloc_async_data(req)) - return -EAGAIN; - } - return def->prep_async(req); -} - -static u32 io_get_sequence(struct io_kiocb *req) -{ - u32 seq = req->ctx->cached_sq_head; - struct io_kiocb *cur; - - /* need original cached_sq_head, but it was increased for each req */ - io_for_each_link(cur, req) - seq--; - return seq; -} - static __cold void io_drain_req(struct io_kiocb *req) __must_hold(&ctx->uring_lock) { struct io_ring_ctx *ctx = req->ctx; + bool drain = req->flags & IOSQE_IO_DRAIN; struct io_defer_entry *de; - int ret; - u32 seq = io_get_sequence(req); - /* Still need defer if there is pending req in defer list. */ - spin_lock(&ctx->completion_lock); - if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { - spin_unlock(&ctx->completion_lock); -queue: - ctx->drain_active = false; - io_req_task_queue(req); - return; - } - spin_unlock(&ctx->completion_lock); - - io_prep_async_link(req); - de = kmalloc(sizeof(*de), GFP_KERNEL); + de = kmalloc(sizeof(*de), GFP_KERNEL_ACCOUNT); if (!de) { - ret = -ENOMEM; - io_req_defer_failed(req, ret); + io_req_defer_failed(req, -ENOMEM); return; } - spin_lock(&ctx->completion_lock); - if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { - spin_unlock(&ctx->completion_lock); - kfree(de); - goto queue; - } - + io_prep_async_link(req); trace_io_uring_defer(req); de->req = req; - de->seq = seq; - list_add_tail(&de->list, &ctx->defer_list); - spin_unlock(&ctx->completion_lock); -} - -static void io_clean_op(struct io_kiocb *req) -{ - if (req->flags & REQ_F_BUFFER_SELECTED) { - spin_lock(&req->ctx->completion_lock); - io_put_kbuf_comp(req); - spin_unlock(&req->ctx->completion_lock); - } - if (req->flags & REQ_F_NEED_CLEANUP) { - const struct io_op_def *def = &io_op_defs[req->opcode]; - - if (def->cleanup) - def->cleanup(req); - } - if ((req->flags & REQ_F_POLLED) && req->apoll) { - kfree(req->apoll->double_poll); - kfree(req->apoll); - req->apoll = NULL; - } - if (req->flags & REQ_F_INFLIGHT) { - struct io_uring_task *tctx = req->task->io_uring; - - atomic_dec(&tctx->inflight_tracked); - } - if (req->flags & REQ_F_CREDS) - put_cred(req->creds); - if (req->flags & REQ_F_ASYNC_DATA) { - kfree(req->async_data); - req->async_data = NULL; - } - req->flags &= ~IO_REQ_CLEAN_FLAGS; + ctx->nr_drained += io_linked_nr(req); + list_add_tail(&de->list, &ctx->defer_list); + io_queue_deferred(ctx); + if (!drain && list_empty(&ctx->defer_list)) + ctx->drain_active = false; } -static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) +static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, + unsigned int issue_flags) { - if (req->file || !io_op_defs[req->opcode].needs_file) + if (req->file || !def->needs_file) return true; if (req->flags & REQ_F_FIXED_FILE) @@ -1833,17 +1769,22 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) return !!req->file; } -static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) +#define REQ_ISSUE_SLOW_FLAGS (REQ_F_CREDS | REQ_F_ARM_LTIMEOUT) + +static inline int __io_issue_sqe(struct io_kiocb *req, + unsigned int issue_flags, + const struct io_issue_def *def) { - const struct io_op_def *def = &io_op_defs[req->opcode]; const struct cred *creds = NULL; + struct io_kiocb *link = NULL; int ret; - if (unlikely(!io_assign_file(req, issue_flags))) - return -EBADF; - - if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) - creds = override_creds(req->creds); + if (unlikely(req->flags & REQ_ISSUE_SLOW_FLAGS)) { + if ((req->flags & REQ_F_CREDS) && req->creds != current_cred()) + creds = override_creds(req->creds); + if (req->flags & REQ_F_ARM_LTIMEOUT) + link = __io_prep_linked_timeout(req); + } if (!def->audit_skip) audit_uring_entry(req->opcode); @@ -1853,71 +1794,130 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (!def->audit_skip) audit_uring_exit(!ret, ret); - if (creds) - revert_creds(creds); + if (unlikely(creds || link)) { + if (creds) + revert_creds(creds); + if (link) + io_queue_linked_timeout(link); + } + + return ret; +} - if (ret == IOU_OK) { +static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + int ret; + + if (unlikely(!io_assign_file(req, def, issue_flags))) + return -EBADF; + + ret = __io_issue_sqe(req, issue_flags, def); + + if (ret == IOU_COMPLETE) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_defer(req); else io_req_complete_post(req, issue_flags); - } else if (ret != IOU_ISSUE_SKIP_COMPLETE) - return ret; - /* If the op doesn't have a file, we're not polling for it */ - if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) - io_iopoll_req_issued(req, issue_flags); + return 0; + } - return 0; + if (ret == IOU_ISSUE_SKIP_COMPLETE) { + ret = 0; + + /* If the op doesn't have a file, we're not polling for it */ + if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) + io_iopoll_req_issued(req, issue_flags); + } + return ret; } -int io_poll_issue(struct io_kiocb *req, bool *locked) +int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw) { - io_tw_lock(req->ctx, locked); - return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| - IO_URING_F_COMPLETE_DEFER); + const unsigned int issue_flags = IO_URING_F_NONBLOCK | + IO_URING_F_MULTISHOT | + IO_URING_F_COMPLETE_DEFER; + int ret; + + io_tw_lock(req->ctx, tw); + + WARN_ON_ONCE(!req->file); + if (WARN_ON_ONCE(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EFAULT; + + ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]); + + WARN_ON_ONCE(ret == IOU_ISSUE_SKIP_COMPLETE); + return ret; } struct io_wq_work *io_wq_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; - req = io_put_req_find_next(req); - return req ? &req->work : NULL; + if (req_ref_put_and_test_atomic(req)) { + if (req->flags & IO_REQ_LINK_FLAGS) + nxt = io_req_find_next(req); + io_free_req(req); + } + return nxt ? &nxt->work : NULL; } void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ; bool needs_poll = false; int ret = 0, err = -ECANCELED; - /* one will be dropped by ->io_wq_free_work() after returning to io-wq */ + /* one will be dropped by io_wq_free_work() after returning to io-wq */ if (!(req->flags & REQ_F_REFCOUNT)) __io_req_set_refcount(req, 2); else req_ref_get(req); - io_arm_ltimeout(req); - /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ - if (work->flags & IO_WQ_WORK_CANCEL) { + if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) { fail: io_req_task_queue_fail(req, err); return; } - if (!io_assign_file(req, issue_flags)) { + if (!io_assign_file(req, def, issue_flags)) { err = -EBADF; - work->flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); goto fail; } + /* + * If DEFER_TASKRUN is set, it's only allowed to post CQEs from the + * submitter task context. Final request completions are handed to the + * right context, however this is not the case of auxiliary CQEs, + * which is the main mean of operation for multishot requests. + * Don't allow any multishot execution from io-wq. It's more restrictive + * than necessary and also cleaner. + */ + if (req->flags & (REQ_F_MULTISHOT|REQ_F_APOLL_MULTISHOT)) { + err = -EBADFD; + if (!io_file_can_poll(req)) + goto fail; + if (req->file->f_flags & O_NONBLOCK || + req->file->f_mode & FMODE_NOWAIT) { + err = -ECANCELED; + if (io_arm_poll_handler(req, issue_flags) != IO_APOLL_OK) + goto fail; + return; + } else { + req->flags &= ~(REQ_F_APOLL_MULTISHOT|REQ_F_MULTISHOT); + } + } + if (req->flags & REQ_F_FORCE_ASYNC) { bool opcode_poll = def->pollin || def->pollout; - if (opcode_poll && file_can_poll(req->file)) { + if (opcode_poll && io_file_can_poll(req)) { needs_poll = true; issue_flags |= IO_URING_F_NONBLOCK; } @@ -1927,6 +1927,14 @@ fail: ret = io_issue_sqe(req, issue_flags); if (ret != -EAGAIN) break; + + /* + * If REQ_F_NOWAIT is set, then don't wait or retry with + * poll. -EAGAIN is final for that case. + */ + if (req->flags & REQ_F_NOWAIT) + break; + /* * We can get EAGAIN for iopolled IO even though we're * forcing a sync submission from here, since we can't @@ -1935,6 +1943,8 @@ fail: if (!needs_poll) { if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) break; + if (io_wq_worker_stopped()) + break; cond_resched(); continue; } @@ -1947,7 +1957,7 @@ fail: } while (1); /* avoid locking problems by failing it from a clean context */ - if (ret < 0) + if (ret) io_req_task_queue_fail(req, ret); } @@ -1955,21 +1965,17 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; + struct io_rsrc_node *node; struct file *file = NULL; - unsigned long file_ptr; io_ring_submit_lock(ctx, issue_flags); - - if (unlikely((unsigned int)fd >= ctx->nr_user_files)) - goto out; - fd = array_index_nospec(fd, ctx->nr_user_files); - file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; - file = (struct file *) (file_ptr & FFS_MASK); - file_ptr &= ~FFS_MASK; - /* mask in overlapping REQ_F and FFS bits */ - req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); - io_req_set_rsrc_node(req, ctx, 0); -out: + node = io_rsrc_node_lookup(&ctx->file_table.data, fd); + if (node) { + node->refs++; + req->file_node = node; + req->flags |= io_slot_flags(node); + file = io_slot_file(node); + } io_ring_submit_unlock(ctx, issue_flags); return file; } @@ -1986,50 +1992,61 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd) return file; } -static void io_queue_async(struct io_kiocb *req, int ret) - __must_hold(&req->ctx->uring_lock) +static int io_req_sqe_copy(struct io_kiocb *req, unsigned int issue_flags) { - struct io_kiocb *linked_timeout; + const struct io_cold_def *def = &io_cold_defs[req->opcode]; + if (req->flags & REQ_F_SQE_COPIED) + return 0; + req->flags |= REQ_F_SQE_COPIED; + if (!def->sqe_copy) + return 0; + if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_INLINE))) + return -EFAULT; + def->sqe_copy(req); + return 0; +} + +static void io_queue_async(struct io_kiocb *req, unsigned int issue_flags, int ret) + __must_hold(&req->ctx->uring_lock) +{ if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { +fail: io_req_defer_failed(req, ret); return; } - linked_timeout = io_prep_linked_timeout(req); + ret = io_req_sqe_copy(req, issue_flags); + if (unlikely(ret)) + goto fail; switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: - io_kbuf_recycle(req, 0); io_req_task_queue(req); break; case IO_APOLL_ABORTED: - io_kbuf_recycle(req, 0); - io_queue_iowq(req, NULL); + io_queue_iowq(req); break; case IO_APOLL_OK: break; } - - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); } -static inline void io_queue_sqe(struct io_kiocb *req) +static inline void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags) __must_hold(&req->ctx->uring_lock) { + unsigned int issue_flags = IO_URING_F_NONBLOCK | + IO_URING_F_COMPLETE_DEFER | extra_flags; int ret; - ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); + ret = io_issue_sqe(req, issue_flags); /* * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ - if (likely(!ret)) - io_arm_ltimeout(req); - else - io_queue_async(req, ret); + if (unlikely(ret)) + io_queue_async(req, issue_flags, ret); } static void io_queue_sqe_fallback(struct io_kiocb *req) @@ -2044,17 +2061,12 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) req->flags |= REQ_F_LINK; io_req_defer_failed(req, req->cqe.res); } else { - int ret = io_req_prep_async(req); - - if (unlikely(ret)) { - io_req_defer_failed(req, ret); - return; - } - + /* can't fail with IO_URING_F_INLINE */ + io_req_sqe_copy(req, IO_URING_F_INLINE); if (unlikely(req->ctx->drain_active)) io_drain_req(req); else - io_queue_iowq(req, NULL); + io_queue_iowq(req); } } @@ -2081,9 +2093,8 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, return true; } -static void io_init_req_drain(struct io_kiocb *req) +static void io_init_drain(struct io_ring_ctx *ctx) { - struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *head = ctx->submit_state.link.head; ctx->drain_active = true; @@ -2100,49 +2111,78 @@ static void io_init_req_drain(struct io_kiocb *req) } } +static __cold int io_init_fail_req(struct io_kiocb *req, int err) +{ + /* ensure per-opcode data is cleared if we fail before prep */ + memset(&req->cmd.data, 0, sizeof(req->cmd.data)); + return err; +} + static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe *sqe) + const struct io_uring_sqe *sqe, unsigned int *left) __must_hold(&ctx->uring_lock) { - const struct io_op_def *def; + const struct io_issue_def *def; unsigned int sqe_flags; int personality; u8 opcode; - /* req is partially pre-initialised, see io_preinit_req() */ + req->ctx = ctx; req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ - req->flags = sqe_flags = READ_ONCE(sqe->flags); + sqe_flags = READ_ONCE(sqe->flags); + req->flags = (__force io_req_flags_t) sqe_flags; req->cqe.user_data = READ_ONCE(sqe->user_data); req->file = NULL; - req->rsrc_node = NULL; - req->task = current; + req->tctx = current->io_uring; + req->cancel_seq_set = false; + req->async_data = NULL; if (unlikely(opcode >= IORING_OP_LAST)) { req->opcode = 0; - return -EINVAL; + return io_init_fail_req(req, -EINVAL); } - def = &io_op_defs[opcode]; + opcode = array_index_nospec(opcode, IORING_OP_LAST); + + def = &io_issue_defs[opcode]; + if (def->is_128 && !(ctx->flags & IORING_SETUP_SQE128)) { + /* + * A 128b op on a non-128b SQ requires mixed SQE support as + * well as 2 contiguous entries. + */ + if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 || + !(ctx->cached_sq_head & (ctx->sq_entries - 1))) + return io_init_fail_req(req, -EINVAL); + /* + * A 128b operation on a mixed SQ uses two entries, so we have + * to increment the head and cached refs, and decrement what's + * left. + */ + current->io_uring->cached_refs++; + ctx->cached_sq_head++; + (*left)--; + } + if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) - return -EINVAL; + return io_init_fail_req(req, -EINVAL); if (sqe_flags & IOSQE_BUFFER_SELECT) { if (!def->buffer_select) - return -EOPNOTSUPP; + return io_init_fail_req(req, -EOPNOTSUPP); req->buf_index = READ_ONCE(sqe->buf_group); } if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) ctx->drain_disabled = true; if (sqe_flags & IOSQE_IO_DRAIN) { if (ctx->drain_disabled) - return -EOPNOTSUPP; - io_init_req_drain(req); + return io_init_fail_req(req, -EOPNOTSUPP); + io_init_drain(ctx); } } if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) - return -EACCES; + return io_init_fail_req(req, -EACCES); /* knock it to the slow queue path, will be drained there */ if (ctx->drain_active) req->flags |= REQ_F_FORCE_ASYNC; @@ -2155,9 +2195,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } if (!def->ioprio && sqe->ioprio) - return -EINVAL; + return io_init_fail_req(req, -EINVAL); if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; + return io_init_fail_req(req, -EINVAL); if (def->needs_file) { struct io_submit_state *state = &ctx->submit_state; @@ -2181,12 +2221,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->creds = xa_load(&ctx->personalities, personality); if (!req->creds) - return -EINVAL; + return io_init_fail_req(req, -EINVAL); get_cred(req->creds); ret = security_uring_override_creds(req->creds); if (ret) { put_cred(req->creds); - return ret; + return io_init_fail_req(req, ret); } req->flags |= REQ_F_CREDS; } @@ -2232,18 +2272,17 @@ static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, } static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe *sqe) + const struct io_uring_sqe *sqe, unsigned int *left) __must_hold(&ctx->uring_lock) { struct io_submit_link *link = &ctx->submit_state.link; int ret; - ret = io_init_req(ctx, req, sqe); + ret = io_init_req(ctx, req, sqe, left); if (unlikely(ret)) return io_submit_fail_init(sqe, req, ret); - /* don't need @sqe from now on */ - trace_io_uring_submit_sqe(req, true); + trace_io_uring_submit_req(req); /* * If we already have a head request, queue this one for async @@ -2253,11 +2292,8 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, * conditions are true (normal request), then just queue it. */ if (unlikely(link->head)) { - ret = io_req_prep_async(req); - if (unlikely(ret)) - return io_submit_fail_init(sqe, req, ret); - - trace_io_uring_link(req, link->head); + trace_io_uring_link(req, link->last); + io_req_sqe_copy(req, IO_URING_F_INLINE); link->last->link = req; link->last = req; @@ -2281,7 +2317,7 @@ fallback: return 0; } - io_queue_sqe(req); + io_queue_sqe(req, IO_URING_F_INLINE); return 0; } @@ -2333,10 +2369,21 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) * used, it's important that those reads are done through READ_ONCE() to * prevent a re-load down the line. */ -static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) +static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) { - unsigned head, mask = ctx->sq_entries - 1; - unsigned sq_idx = ctx->cached_sq_head++ & mask; + unsigned mask = ctx->sq_entries - 1; + unsigned head = ctx->cached_sq_head++ & mask; + + if (static_branch_unlikely(&io_key_has_sqarray) && + (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) { + head = READ_ONCE(ctx->sq_array[head]); + if (unlikely(head >= ctx->sq_entries)) { + WRITE_ONCE(ctx->rings->sq_dropped, + READ_ONCE(ctx->rings->sq_dropped) + 1); + return false; + } + head = array_index_nospec(head, ctx->sq_entries); + } /* * The cached sq head (or cq tail) serves two purposes: @@ -2346,19 +2393,12 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) * 2) allows the kernel side to track the head on its own, even * though the application is the one updating it. */ - head = READ_ONCE(ctx->sq_array[sq_idx]); - if (likely(head < ctx->sq_entries)) { - /* double index for 128-byte SQEs, twice as long */ - if (ctx->flags & IORING_SETUP_SQE128) - head <<= 1; - return &ctx->sq_sqes[head]; - } - /* drop invalid entries */ - ctx->cq_extra--; - WRITE_ONCE(ctx->rings->sq_dropped, - READ_ONCE(ctx->rings->sq_dropped) + 1); - return NULL; + /* double index for 128-byte SQEs, twice as long */ + if (ctx->flags & IORING_SETUP_SQE128) + head <<= 1; + *sqe = &ctx->sq_sqes[head]; + return true; } int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) @@ -2368,10 +2408,11 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) unsigned int left; int ret; + entries = min(nr, entries); if (unlikely(!entries)) return 0; - /* make sure SQ entry isn't read before tail */ - ret = left = min3(nr, ctx->sq_entries, entries); + + ret = left = entries; io_get_task_refs(left); io_submit_state_start(&ctx->submit_state, left); @@ -2379,11 +2420,9 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) const struct io_uring_sqe *sqe; struct io_kiocb *req; - if (unlikely(!io_alloc_req_refill(ctx))) + if (unlikely(!io_alloc_req(ctx, &req))) break; - req = io_alloc_req(ctx); - sqe = io_get_sqe(ctx); - if (unlikely(!sqe)) { + if (unlikely(!io_get_sqe(ctx, &sqe))) { io_req_add_to_cache(req, ctx); break; } @@ -2392,7 +2431,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) * Continue submitting even for sqe failure if the * ring was setup with IORING_SETUP_SUBMIT_ALL */ - if (unlikely(io_submit_sqe(ctx, req, sqe)) && + if (unlikely(io_submit_sqe(ctx, req, sqe, &left)) && !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) { left--; break; @@ -2413,366 +2452,485 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) return ret; } -struct io_wait_queue { - struct wait_queue_entry wq; - struct io_ring_ctx *ctx; - unsigned cq_tail; - unsigned nr_timeouts; -}; - -static inline bool io_has_work(struct io_ring_ctx *ctx) -{ - return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || - ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && - !llist_empty(&ctx->work_llist)); -} - -static inline bool io_should_wake(struct io_wait_queue *iowq) -{ - struct io_ring_ctx *ctx = iowq->ctx; - int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail; - - /* - * Wake up if we have enough events, or if a timeout occurred since we - * started waiting. For timeouts, we always want to return to userspace, - * regardless of event count. - */ - return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; -} - static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, int wake_flags, void *key) { - struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, - wq); - struct io_ring_ctx *ctx = iowq->ctx; + struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); /* * Cannot safely flush overflowed CQEs from here, ensure we wake up * the task, and the next invocation will do it. */ - if (io_should_wake(iowq) || io_has_work(ctx)) + if (io_should_wake(iowq) || io_has_work(iowq->ctx)) return autoremove_wake_function(curr, mode, wake_flags, key); return -1; } int io_run_task_work_sig(struct io_ring_ctx *ctx) { - if (io_run_task_work_ctx(ctx) > 0) - return 1; + if (io_local_work_pending(ctx)) { + __set_current_state(TASK_RUNNING); + if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0) + return 0; + } + if (io_run_task_work() > 0) + return 0; if (task_sigpending(current)) return -EINTR; return 0; } -/* when returns >0, the caller should retry */ -static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, - ktime_t *timeout) +static bool current_pending_io(void) { - int ret; - unsigned long check_cq; + struct io_uring_task *tctx = current->io_uring; - /* make sure we run task_work before checking for signals */ - ret = io_run_task_work_sig(ctx); - if (ret || io_should_wake(iowq)) - return ret; + if (!tctx) + return false; + return percpu_counter_read_positive(&tctx->inflight); +} - check_cq = READ_ONCE(ctx->check_cq); - if (unlikely(check_cq)) { - /* let the caller flush overflows, retry */ - if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) - return 1; - if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) - return -EBADR; +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) +{ + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); + + WRITE_ONCE(iowq->hit_timeout, 1); + iowq->min_timeout = 0; + wake_up_process(iowq->wq.private); + return HRTIMER_NORESTART; +} + +/* + * Doing min_timeout portion. If we saw any timeouts, events, or have work, + * wake up. If not, and we have a normal timeout, switch to that and keep + * sleeping. + */ +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) +{ + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); + struct io_ring_ctx *ctx = iowq->ctx; + + /* no general timeout, or shorter (or equal), we are done */ + if (iowq->timeout == KTIME_MAX || + ktime_compare(iowq->min_timeout, iowq->timeout) >= 0) + goto out_wake; + /* work we may need to run, wake function will see if we need to wake */ + if (io_has_work(ctx)) + goto out_wake; + /* got events since we started waiting, min timeout is done */ + if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail)) + goto out_wake; + /* if we have any events and min timeout expired, we're done */ + if (io_cqring_events(ctx)) + goto out_wake; + + /* + * If using deferred task_work running and application is waiting on + * more than one request, ensure we reset it now where we are switching + * to normal sleeps. Any request completion post min_wait should wake + * the task and return. + */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 1); + smp_mb(); + if (!llist_empty(&ctx->work_llist)) + goto out_wake; } - if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS)) - return -ETIME; + + hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup); + hrtimer_set_expires(timer, iowq->timeout); + return HRTIMER_RESTART; +out_wake: + return io_cqring_timer_wakeup(timer); +} + +static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, + clockid_t clock_id, ktime_t start_time) +{ + ktime_t timeout; + + if (iowq->min_timeout) { + timeout = ktime_add_ns(iowq->min_timeout, start_time); + hrtimer_setup_on_stack(&iowq->t, io_cqring_min_timer_wakeup, clock_id, + HRTIMER_MODE_ABS); + } else { + timeout = iowq->timeout; + hrtimer_setup_on_stack(&iowq->t, io_cqring_timer_wakeup, clock_id, + HRTIMER_MODE_ABS); + } + + hrtimer_set_expires_range_ns(&iowq->t, timeout, 0); + hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS); + + if (!READ_ONCE(iowq->hit_timeout)) + schedule(); + + hrtimer_cancel(&iowq->t); + destroy_hrtimer_on_stack(&iowq->t); + __set_current_state(TASK_RUNNING); + + return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; +} + +struct ext_arg { + size_t argsz; + struct timespec64 ts; + const sigset_t __user *sig; + ktime_t min_time; + bool ts_set; + bool iowait; +}; + +static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, + struct ext_arg *ext_arg, + ktime_t start_time) +{ + int ret = 0; /* - * Run task_work after scheduling. If we got woken because of - * task_work being processed, run it now rather than let the caller - * do another wait loop. + * Mark us as being in io_wait if we have pending requests, so cpufreq + * can take into account that the task is waiting for IO - turns out + * to be important for low QD IO. */ - ret = io_run_task_work_sig(ctx); - return ret < 0 ? ret : 1; + if (ext_arg->iowait && current_pending_io()) + current->in_iowait = 1; + if (iowq->timeout != KTIME_MAX || iowq->min_timeout) + ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); + else + schedule(); + current->in_iowait = 0; + return ret; +} + +/* If this returns > 0, the caller should retry */ +static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, + struct ext_arg *ext_arg, + ktime_t start_time) +{ + if (unlikely(READ_ONCE(ctx->check_cq))) + return 1; + if (unlikely(io_local_work_pending(ctx))) + return 1; + if (unlikely(task_work_pending(current))) + return 1; + if (unlikely(task_sigpending(current))) + return -EINTR; + if (unlikely(io_should_wake(iowq))) + return 0; + + return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time); } /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. */ -static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, - const sigset_t __user *sig, size_t sigsz, - struct __kernel_timespec __user *uts) +static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, + struct ext_arg *ext_arg) { struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; - ktime_t timeout = KTIME_MAX; + ktime_t start_time; int ret; + min_events = min_t(int, min_events, ctx->cq_entries); + if (!io_allowed_run_tw(ctx)) return -EEXIST; + if (io_local_work_pending(ctx)) + io_run_local_work(ctx, min_events, + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); + io_run_task_work(); - do { - /* always run at least 1 task work to process local work */ - ret = io_run_task_work_ctx(ctx); - if (ret < 0) - return ret; - io_cqring_overflow_flush(ctx); + if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) + io_cqring_do_overflow_flush(ctx); + if (__io_cqring_events_user(ctx) >= min_events) + return 0; - /* if user messes with these they will just get an early return */ - if (__io_cqring_events_user(ctx) >= min_events) - return 0; - } while (ret > 0); + init_waitqueue_func_entry(&iowq.wq, io_wake_function); + iowq.wq.private = current; + INIT_LIST_HEAD(&iowq.wq.entry); + iowq.ctx = ctx; + iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; + iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail); + iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); + iowq.hit_timeout = 0; + iowq.min_timeout = ext_arg->min_time; + iowq.timeout = KTIME_MAX; + start_time = io_get_time(ctx); - if (sig) { + if (ext_arg->ts_set) { + iowq.timeout = timespec64_to_ktime(ext_arg->ts); + if (!(flags & IORING_ENTER_ABS_TIMER)) + iowq.timeout = ktime_add(iowq.timeout, start_time); + } + + if (ext_arg->sig) { #ifdef CONFIG_COMPAT if (in_compat_syscall()) - ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, - sigsz); + ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig, + ext_arg->argsz); else #endif - ret = set_user_sigmask(sig, sigsz); + ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz); if (ret) return ret; } - if (uts) { - struct timespec64 ts; - - if (get_timespec64(&ts, uts)) - return -EFAULT; - timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); - } - - init_waitqueue_func_entry(&iowq.wq, io_wake_function); - iowq.wq.private = current; - INIT_LIST_HEAD(&iowq.wq.entry); - iowq.ctx = ctx; - iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); - iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; + io_napi_busy_loop(ctx, &iowq); trace_io_uring_cqring_wait(ctx, min_events); do { - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { - finish_wait(&ctx->cq_wait, &iowq.wq); - io_cqring_do_overflow_flush(ctx); + unsigned long check_cq; + int nr_wait; + + /* if min timeout has been hit, don't reset wait count */ + if (!iowq.hit_timeout) + nr_wait = (int) iowq.cq_tail - + READ_ONCE(ctx->rings->cq.tail); + else + nr_wait = 1; + + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, nr_wait); + set_current_state(TASK_INTERRUPTIBLE); + } else { + prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, + TASK_INTERRUPTIBLE); } - prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, - TASK_INTERRUPTIBLE); - ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); - if (__io_cqring_events_user(ctx) >= min_events) + + ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time); + __set_current_state(TASK_RUNNING); + atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); + + /* + * Run task_work after scheduling and before io_should_wake(). + * If we got woken because of task_work being processed, run it + * now rather than let the caller do another wait loop. + */ + if (io_local_work_pending(ctx)) + io_run_local_work(ctx, nr_wait, nr_wait); + io_run_task_work(); + + /* + * Non-local task_work will be run on exit to userspace, but + * if we're using DEFER_TASKRUN, then we could have waited + * with a timeout for a number of requests. If the timeout + * hits, we could have some requests ready to process. Ensure + * this break is _after_ we have run task_work, to avoid + * deferring running potentially pending requests until the + * next time we wait for events. + */ + if (ret < 0) break; + + check_cq = READ_ONCE(ctx->check_cq); + if (unlikely(check_cq)) { + /* let the caller flush overflows, retry */ + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) + io_cqring_do_overflow_flush(ctx); + if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) { + ret = -EBADR; + break; + } + } + + if (io_should_wake(&iowq)) { + ret = 0; + break; + } cond_resched(); - } while (ret > 0); + } while (1); - finish_wait(&ctx->cq_wait, &iowq.wq); + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + finish_wait(&ctx->cq_wait, &iowq.wq); restore_saved_sigmask_unless(ret == -EINTR); return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } -static void io_mem_free(void *ptr) +static void io_rings_free(struct io_ring_ctx *ctx) { - struct page *page; - - if (!ptr) - return; - - page = virt_to_head_page(ptr); - if (put_page_testzero(page)) - free_compound_page(page); + io_free_region(ctx->user, &ctx->sq_region); + io_free_region(ctx->user, &ctx->ring_region); + ctx->rings = NULL; + ctx->sq_sqes = NULL; } -static void *io_mem_alloc(size_t size) +static int rings_size(unsigned int flags, unsigned int sq_entries, + unsigned int cq_entries, struct io_rings_layout *rl) { - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; + struct io_rings *rings; + size_t sqe_size; + size_t off; - return (void *) __get_free_pages(gfp, get_order(size)); -} + if (flags & IORING_SETUP_CQE_MIXED) { + if (cq_entries < 2) + return -EOVERFLOW; + } + if (flags & IORING_SETUP_SQE_MIXED) { + if (sq_entries < 2) + return -EOVERFLOW; + } -static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, - unsigned int cq_entries, size_t *sq_offset) -{ - struct io_rings *rings; - size_t off, sq_array_size; + rl->sq_array_offset = SIZE_MAX; + + sqe_size = sizeof(struct io_uring_sqe); + if (flags & IORING_SETUP_SQE128) + sqe_size *= 2; + + rl->sq_size = array_size(sqe_size, sq_entries); + if (rl->sq_size == SIZE_MAX) + return -EOVERFLOW; off = struct_size(rings, cqes, cq_entries); + if (flags & IORING_SETUP_CQE32) + off = size_mul(off, 2); if (off == SIZE_MAX) - return SIZE_MAX; - if (ctx->flags & IORING_SETUP_CQE32) { - if (check_shl_overflow(off, 1, &off)) - return SIZE_MAX; - } + return -EOVERFLOW; #ifdef CONFIG_SMP off = ALIGN(off, SMP_CACHE_BYTES); if (off == 0) - return SIZE_MAX; + return -EOVERFLOW; #endif - if (sq_offset) - *sq_offset = off; - - sq_array_size = array_size(sizeof(u32), sq_entries); - if (sq_array_size == SIZE_MAX) - return SIZE_MAX; - - if (check_add_overflow(off, sq_array_size, &off)) - return SIZE_MAX; - - return off; -} - -static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned int eventfd_async) -{ - struct io_ev_fd *ev_fd; - __s32 __user *fds = arg; - int fd; + if (!(flags & IORING_SETUP_NO_SQARRAY)) { + size_t sq_array_size; - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) - return -EBUSY; + rl->sq_array_offset = off; - if (copy_from_user(&fd, fds, sizeof(*fds))) - return -EFAULT; - - ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); - if (!ev_fd) - return -ENOMEM; - - ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); - if (IS_ERR(ev_fd->cq_ev_fd)) { - int ret = PTR_ERR(ev_fd->cq_ev_fd); - kfree(ev_fd); - return ret; + sq_array_size = array_size(sizeof(u32), sq_entries); + off = size_add(off, sq_array_size); + if (off == SIZE_MAX) + return -EOVERFLOW; } - spin_lock(&ctx->completion_lock); - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; - spin_unlock(&ctx->completion_lock); - - ev_fd->eventfd_async = eventfd_async; - ctx->has_evfd = true; - rcu_assign_pointer(ctx->io_ev_fd, ev_fd); - atomic_set(&ev_fd->refs, 1); - atomic_set(&ev_fd->ops, 0); + rl->rings_size = off; return 0; } -static int io_eventfd_unregister(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) { - ctx->has_evfd = false; - rcu_assign_pointer(ctx->io_ev_fd, NULL); - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops)) - call_rcu(&ev_fd->rcu, io_eventfd_ops); - return 0; - } - - return -ENXIO; -} - -static void io_req_caches_free(struct io_ring_ctx *ctx) +static __cold void __io_req_caches_free(struct io_ring_ctx *ctx) { + struct io_kiocb *req; int nr = 0; - mutex_lock(&ctx->uring_lock); - io_flush_cached_locked_reqs(ctx, &ctx->submit_state); - while (!io_req_cache_empty(ctx)) { - struct io_kiocb *req = io_alloc_req(ctx); - + req = io_extract_req(ctx); + io_poison_req(req); kmem_cache_free(req_cachep, req); nr++; } - if (nr) + if (nr) { + ctx->nr_req_allocated -= nr; percpu_ref_put_many(&ctx->refs, nr); - mutex_unlock(&ctx->uring_lock); + } +} + +static __cold void io_req_caches_free(struct io_ring_ctx *ctx) +{ + guard(mutex)(&ctx->uring_lock); + __io_req_caches_free(ctx); } static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); - io_rsrc_refs_drop(ctx); - /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ - io_wait_rsrc_data(ctx->buf_data); - io_wait_rsrc_data(ctx->file_data); mutex_lock(&ctx->uring_lock); - if (ctx->buf_data) - __io_sqe_buffers_unregister(ctx); - if (ctx->file_data) - __io_sqe_files_unregister(ctx); + io_sqe_buffers_unregister(ctx); + io_sqe_files_unregister(ctx); + io_unregister_zcrx_ifqs(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); - io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free); - io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); - mutex_unlock(&ctx->uring_lock); + io_free_alloc_caches(ctx); io_destroy_buffers(ctx); + io_free_region(ctx->user, &ctx->param_region); + mutex_unlock(&ctx->uring_lock); if (ctx->sq_creds) put_cred(ctx->sq_creds); if (ctx->submitter_task) put_task_struct(ctx->submitter_task); - /* there are no registered resources left, nobody uses it */ - if (ctx->rsrc_node) - io_rsrc_node_destroy(ctx->rsrc_node); - if (ctx->rsrc_backup_node) - io_rsrc_node_destroy(ctx->rsrc_backup_node); - flush_delayed_work(&ctx->rsrc_put_work); - flush_delayed_work(&ctx->fallback_work); - - WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); - WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist)); - -#if defined(CONFIG_UNIX) - if (ctx->ring_sock) { - ctx->ring_sock->file = NULL; /* so that iput() is called */ - sock_release(ctx->ring_sock); - } -#endif WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); if (ctx->mm_account) { mmdrop(ctx->mm_account); ctx->mm_account = NULL; } - io_mem_free(ctx->rings); - io_mem_free(ctx->sq_sqes); + io_rings_free(ctx); + + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) + static_branch_dec(&io_key_has_sqarray); percpu_ref_exit(&ctx->refs); free_uid(ctx->user); io_req_caches_free(ctx); + + WARN_ON_ONCE(ctx->nr_req_allocated); + if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); - kfree(ctx->cancel_table.hbs); - kfree(ctx->cancel_table_locked.hbs); - kfree(ctx->dummy_ubuf); - kfree(ctx->io_bl); + io_napi_free(ctx); + kvfree(ctx->cancel_table.hbs); xa_destroy(&ctx->io_bl_xa); kfree(ctx); } +static __cold void io_activate_pollwq_cb(struct callback_head *cb) +{ + struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx, + poll_wq_task_work); + + mutex_lock(&ctx->uring_lock); + ctx->poll_activated = true; + mutex_unlock(&ctx->uring_lock); + + /* + * Wake ups for some events between start of polling and activation + * might've been lost due to loose synchronisation. + */ + wake_up_all(&ctx->poll_wq); + percpu_ref_put(&ctx->refs); +} + +__cold void io_activate_pollwq(struct io_ring_ctx *ctx) +{ + spin_lock(&ctx->completion_lock); + /* already activated or in progress */ + if (ctx->poll_activated || ctx->poll_wq_task_work.func) + goto out; + if (WARN_ON_ONCE(!ctx->task_complete)) + goto out; + if (!ctx->submitter_task) + goto out; + /* + * with ->submitter_task only the submitter task completes requests, we + * only need to sync with it, which is done by injecting a tw + */ + init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb); + percpu_ref_get(&ctx->refs); + if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL)) + percpu_ref_put(&ctx->refs); +out: + spin_unlock(&ctx->completion_lock); +} + static __poll_t io_uring_poll(struct file *file, poll_table *wait) { struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; - poll_wait(file, &ctx->cq_wait, wait); + if (unlikely(!ctx->poll_activated)) + io_activate_pollwq(ctx); /* - * synchronizes with barrier from wq_has_sleeper call in - * io_commit_cqring + * provides mb() which pairs with barrier from wq_has_sleeper + * call in io_commit_cqring */ - smp_rmb(); + poll_wait(file, &ctx->poll_wq, wait); + if (!io_sqring_full(ctx)) mask |= EPOLLOUT | EPOLLWRNORM; @@ -2790,25 +2948,12 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * pushes them to do the flush. */ - if (io_cqring_events(ctx) || io_has_work(ctx)) + if (__io_cqring_events_user(ctx) || io_has_work(ctx)) mask |= EPOLLIN | EPOLLRDNORM; return mask; } -static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) -{ - const struct cred *creds; - - creds = xa_erase(&ctx->personalities, id); - if (creds) { - put_cred(creds); - return 0; - } - - return -EINVAL; -} - struct io_tctx_exit { struct callback_head task_work; struct completion completion; @@ -2822,23 +2967,16 @@ static __cold void io_tctx_exit_cb(struct callback_head *cb) work = container_of(cb, struct io_tctx_exit, task_work); /* - * When @in_idle, we're in cancellation and it's racy to remove the + * When @in_cancel, we're in cancellation and it's racy to remove the * node. It'll be removed by the end of cancellation, just ignore it. * tctx can be NULL if the queueing of this task_work raced with * work cancelation off the exec path. */ - if (tctx && !atomic_read(&tctx->in_idle)) + if (tctx && !atomic_read(&tctx->in_cancel)) io_uring_del_tctx_node((unsigned long)work->ctx); complete(&work->completion); } -static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - - return req->ctx == data; -} - static __cold void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); @@ -2864,7 +3002,8 @@ static __cold void io_ring_exit_work(struct work_struct *work) if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) io_move_task_work_from_local(ctx); - while (io_uring_try_cancel_requests(ctx, NULL, true)) + /* The SQPOLL thread never reaches this path */ + while (io_uring_try_cancel_requests(ctx, NULL, true, false)) cond_resched(); if (ctx->sq_data) { @@ -2872,7 +3011,7 @@ static __cold void io_ring_exit_work(struct work_struct *work) struct task_struct *tsk; io_sq_thread_park(sqd); - tsk = sqd->thread; + tsk = sqpoll_task_locked(sqd); if (tsk && tsk->io_uring && tsk->io_uring->io_wq) io_wq_cancel_cb(tsk->io_uring->io_wq, io_cancel_ctx_cb, ctx, true); @@ -2885,17 +3024,23 @@ static __cold void io_ring_exit_work(struct work_struct *work) /* there is little hope left, don't run it too often */ interval = HZ * 60; } - } while (!wait_for_completion_timeout(&ctx->ref_comp, interval)); + /* + * This is really an uninterruptible wait, as it has to be + * complete. But it's also run from a kworker, which doesn't + * take signals, so it's fine to make it interruptible. This + * avoids scenarios where we knowingly can wait much longer + * on completions, for example if someone does a SIGSTOP on + * a task that needs to finish task_work to make this loop + * complete. That's a synthetic situation that should not + * cause a stuck task backtrace, and hence a potential panic + * on stuck tasks if that is enabled. + */ + } while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval)); init_completion(&exit.completion); init_task_work(&exit.task_work, io_tctx_exit_cb); exit.ctx = ctx; - /* - * Some may use context even when all refs and requests have been put, - * and they are free to do so while still holding uring_lock or - * completion_lock, see io_req_task_submit(). Apart from other work, - * this lock/unlock section also waits them to finish. - */ + mutex_lock(&ctx->uring_lock); while (!list_empty(&ctx->tctx_list)) { WARN_ON_ONCE(time_after(jiffies, timeout)); @@ -2909,13 +3054,22 @@ static __cold void io_ring_exit_work(struct work_struct *work) continue; mutex_unlock(&ctx->uring_lock); - wait_for_completion(&exit.completion); + /* + * See comment above for + * wait_for_completion_interruptible_timeout() on why this + * wait is marked as interruptible. + */ + wait_for_completion_interruptible(&exit.completion); mutex_lock(&ctx->uring_lock); } mutex_unlock(&ctx->uring_lock); spin_lock(&ctx->completion_lock); spin_unlock(&ctx->completion_lock); + /* pairs with RCU read section in io_req_local_work_add() */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + synchronize_rcu(); + io_ring_ctx_free(ctx); } @@ -2928,25 +3082,18 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); xa_for_each(&ctx->personalities, index, creds) io_unregister_personality(ctx, index); - if (ctx->rings) - io_poll_remove_all(ctx, NULL, true); mutex_unlock(&ctx->uring_lock); - /* - * If we failed setting up the ctx, we might not have any rings - * and therefore did not submit any requests - */ - if (ctx->rings) - io_kill_timeouts(ctx, NULL, true); + flush_delayed_work(&ctx->fallback_work); INIT_WORK(&ctx->exit_work, io_ring_exit_work); /* - * Use system_unbound_wq to avoid spawning tons of event kworkers + * Use system_dfl_wq to avoid spawning tons of event kworkers * if we're exiting a ton of rings at the same time. It just adds * noise and overhead, there's no discernable change in runtime - * over using system_wq. + * over using system_percpu_wq. */ - queue_work(system_unbound_wq, &ctx->exit_work); + queue_work(iou_wq, &ctx->exit_work); } static int io_uring_release(struct inode *inode, struct file *file) @@ -2958,301 +3105,77 @@ static int io_uring_release(struct inode *inode, struct file *file) return 0; } -struct io_task_cancel { - struct task_struct *task; - bool all; -}; - -static bool io_cancel_task_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_task_cancel *cancel = data; - - return io_match_task_safe(req, cancel->task, cancel->all); -} - -static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, - struct task_struct *task, - bool cancel_all) -{ - struct io_defer_entry *de; - LIST_HEAD(list); - - spin_lock(&ctx->completion_lock); - list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_match_task_safe(de->req, task, cancel_all)) { - list_cut_position(&list, &ctx->defer_list, &de->list); - break; - } - } - spin_unlock(&ctx->completion_lock); - if (list_empty(&list)) - return false; - - while (!list_empty(&list)) { - de = list_first_entry(&list, struct io_defer_entry, list); - list_del_init(&de->list); - io_req_task_queue_fail(de->req, -ECANCELED); - kfree(de); - } - return true; -} - -static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) -{ - struct io_tctx_node *node; - enum io_wq_cancel cret; - bool ret = false; - - mutex_lock(&ctx->uring_lock); - list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - - /* - * io_wq will stay alive while we hold uring_lock, because it's - * killed after ctx nodes, which requires to take the lock. - */ - if (!tctx || !tctx->io_wq) - continue; - cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); - ret |= (cret != IO_WQ_CANCEL_NOTFOUND); - } - mutex_unlock(&ctx->uring_lock); - - return ret; -} - -static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct task_struct *task, - bool cancel_all) +static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx, + const struct io_uring_getevents_arg __user *uarg) { - struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; - struct io_uring_task *tctx = task ? task->io_uring : NULL; - enum io_wq_cancel cret; - bool ret = false; + unsigned long size = sizeof(struct io_uring_reg_wait); + unsigned long offset = (uintptr_t)uarg; + unsigned long end; - /* failed during ring init, it couldn't have issued any requests */ - if (!ctx->rings) - return false; + if (unlikely(offset % sizeof(long))) + return ERR_PTR(-EFAULT); - if (!task) { - ret |= io_uring_try_cancel_iowq(ctx); - } else if (tctx && tctx->io_wq) { - /* - * Cancels requests of all rings, not only @ctx, but - * it's fine as the task is in exit/exec. - */ - cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, - &cancel, true); - ret |= (cret != IO_WQ_CANCEL_NOTFOUND); - } - - /* SQPOLL thread does its own polling */ - if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || - (ctx->sq_data && ctx->sq_data->thread == current)) { - while (!wq_list_empty(&ctx->iopoll_list)) { - io_iopoll_try_reap_events(ctx); - ret = true; - } - } + /* also protects from NULL ->cq_wait_arg as the size would be 0 */ + if (unlikely(check_add_overflow(offset, size, &end) || + end > ctx->cq_wait_size)) + return ERR_PTR(-EFAULT); - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) - ret |= io_run_local_work(ctx) > 0; - ret |= io_cancel_defer_files(ctx, task, cancel_all); - mutex_lock(&ctx->uring_lock); - ret |= io_poll_remove_all(ctx, task, cancel_all); - mutex_unlock(&ctx->uring_lock); - ret |= io_kill_timeouts(ctx, task, cancel_all); - if (task) - ret |= io_run_task_work() > 0; - return ret; + offset = array_index_nospec(offset, ctx->cq_wait_size - size); + return ctx->cq_wait_arg + offset; } -static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) +static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags, + const void __user *argp, size_t argsz) { - if (tracked) - return atomic_read(&tctx->inflight_tracked); - return percpu_counter_sum(&tctx->inflight); -} - -/* - * Find any io_uring ctx that this task has registered or done IO on, and cancel - * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. - */ -__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) -{ - struct io_uring_task *tctx = current->io_uring; - struct io_ring_ctx *ctx; - s64 inflight; - DEFINE_WAIT(wait); - - WARN_ON_ONCE(sqd && sqd->thread != current); - - if (!current->io_uring) - return; - if (tctx->io_wq) - io_wq_exit_start(tctx->io_wq); - - atomic_inc(&tctx->in_idle); - do { - bool loop = false; - - io_uring_drop_tctx_refs(current); - /* read completions before cancelations */ - inflight = tctx_inflight(tctx, !cancel_all); - if (!inflight) - break; - - if (!sqd) { - struct io_tctx_node *node; - unsigned long index; - - xa_for_each(&tctx->xa, index, node) { - /* sqpoll task will cancel all its requests */ - if (node->ctx->sq_data) - continue; - loop |= io_uring_try_cancel_requests(node->ctx, - current, cancel_all); - } - } else { - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - loop |= io_uring_try_cancel_requests(ctx, - current, - cancel_all); - } - - if (loop) { - cond_resched(); - continue; - } - - prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); - io_run_task_work(); - io_uring_drop_tctx_refs(current); - - /* - * If we've seen completions, retry without waiting. This - * avoids a race where a completion comes in before we did - * prepare_to_wait(). - */ - if (inflight == tctx_inflight(tctx, !cancel_all)) - schedule(); - finish_wait(&tctx->wait, &wait); - } while (1); - - io_uring_clean_tctx(tctx); - if (cancel_all) { - /* - * We shouldn't run task_works after cancel, so just leave - * ->in_idle set for normal exit. - */ - atomic_dec(&tctx->in_idle); - /* for exec all current's requests should be gone, kill tctx */ - __io_uring_free(current); - } -} - -void __io_uring_cancel(bool cancel_all) -{ - io_uring_cancel_generic(cancel_all, NULL); -} - -static void *io_uring_validate_mmap_request(struct file *file, - loff_t pgoff, size_t sz) -{ - struct io_ring_ctx *ctx = file->private_data; - loff_t offset = pgoff << PAGE_SHIFT; - struct page *page; - void *ptr; - - switch (offset) { - case IORING_OFF_SQ_RING: - case IORING_OFF_CQ_RING: - ptr = ctx->rings; - break; - case IORING_OFF_SQES: - ptr = ctx->sq_sqes; - break; - default: - return ERR_PTR(-EINVAL); - } - - page = virt_to_head_page(ptr); - if (sz > page_size(page)) - return ERR_PTR(-EINVAL); - - return ptr; -} - -#ifdef CONFIG_MMU - -static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) -{ - size_t sz = vma->vm_end - vma->vm_start; - unsigned long pfn; - void *ptr; - - ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - pfn = virt_to_phys(ptr) >> PAGE_SHIFT; - return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); -} - -#else /* !CONFIG_MMU */ - -static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) -{ - return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL; -} - -static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) -{ - return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; -} - -static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - void *ptr; - - ptr = io_uring_validate_mmap_request(file, pgoff, len); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - return (unsigned long) ptr; -} - -#endif /* !CONFIG_MMU */ - -static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) -{ - if (flags & IORING_ENTER_EXT_ARG) { - struct io_uring_getevents_arg arg; + struct io_uring_getevents_arg arg; - if (argsz != sizeof(arg)) - return -EINVAL; - if (copy_from_user(&arg, argp, sizeof(arg))) - return -EFAULT; - } + if (!(flags & IORING_ENTER_EXT_ARG)) + return 0; + if (flags & IORING_ENTER_EXT_ARG_REG) + return -EINVAL; + if (argsz != sizeof(arg)) + return -EINVAL; + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; return 0; } -static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, - struct __kernel_timespec __user **ts, - const sigset_t __user **sig) +static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags, + const void __user *argp, struct ext_arg *ext_arg) { + const struct io_uring_getevents_arg __user *uarg = argp; struct io_uring_getevents_arg arg; + ext_arg->iowait = !(flags & IORING_ENTER_NO_IOWAIT); + /* * If EXT_ARG isn't set, then we have no timespec and the argp pointer * is just a pointer to the sigset_t. */ if (!(flags & IORING_ENTER_EXT_ARG)) { - *sig = (const sigset_t __user *) argp; - *ts = NULL; + ext_arg->sig = (const sigset_t __user *) argp; + return 0; + } + + if (flags & IORING_ENTER_EXT_ARG_REG) { + struct io_uring_reg_wait *w; + + if (ext_arg->argsz != sizeof(struct io_uring_reg_wait)) + return -EINVAL; + w = io_get_ext_arg_reg(ctx, argp); + if (IS_ERR(w)) + return PTR_ERR(w); + + if (w->flags & ~IORING_REG_WAIT_TS) + return -EINVAL; + ext_arg->min_time = READ_ONCE(w->min_wait_usec) * NSEC_PER_USEC; + ext_arg->sig = u64_to_user_ptr(READ_ONCE(w->sigmask)); + ext_arg->argsz = READ_ONCE(w->sigmask_sz); + if (w->flags & IORING_REG_WAIT_TS) { + ext_arg->ts.tv_sec = READ_ONCE(w->ts.tv_sec); + ext_arg->ts.tv_nsec = READ_ONCE(w->ts.tv_nsec); + ext_arg->ts_set = true; + } return 0; } @@ -3260,16 +3183,34 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz * EXT_ARG is set - ensure we agree on the size of it and copy in our * timespec and sigset_t pointers if good. */ - if (*argsz != sizeof(arg)) + if (ext_arg->argsz != sizeof(arg)) return -EINVAL; - if (copy_from_user(&arg, argp, sizeof(arg))) +#ifdef CONFIG_64BIT + if (!user_access_begin(uarg, sizeof(*uarg))) return -EFAULT; - if (arg.pad) - return -EINVAL; - *sig = u64_to_user_ptr(arg.sigmask); - *argsz = arg.sigmask_sz; - *ts = u64_to_user_ptr(arg.ts); + unsafe_get_user(arg.sigmask, &uarg->sigmask, uaccess_end); + unsafe_get_user(arg.sigmask_sz, &uarg->sigmask_sz, uaccess_end); + unsafe_get_user(arg.min_wait_usec, &uarg->min_wait_usec, uaccess_end); + unsafe_get_user(arg.ts, &uarg->ts, uaccess_end); + user_access_end(); +#else + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; +#endif + ext_arg->min_time = arg.min_wait_usec * NSEC_PER_USEC; + ext_arg->sig = u64_to_user_ptr(arg.sigmask); + ext_arg->argsz = arg.sigmask_sz; + if (arg.ts) { + if (get_timespec64(&ext_arg->ts, u64_to_user_ptr(arg.ts))) + return -EFAULT; + ext_arg->ts_set = true; + } return 0; +#ifdef CONFIG_64BIT +uaccess_end: + user_access_end(); + return -EFAULT; +#endif } SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, @@ -3277,12 +3218,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, size_t, argsz) { struct io_ring_ctx *ctx; - struct fd f; + struct file *file; long ret; - if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | - IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | - IORING_ENTER_REGISTERED_RING))) + if (unlikely(flags & ~IORING_ENTER_FLAGS)) return -EINVAL; /* @@ -3295,20 +3234,19 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) return -EINVAL; fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); - f.file = tctx->registered_rings[fd]; - f.flags = 0; - if (unlikely(!f.file)) + file = tctx->registered_rings[fd]; + if (unlikely(!file)) return -EBADF; } else { - f = fdget(fd); - if (unlikely(!f.file)) + file = fget(fd); + if (unlikely(!file)) return -EBADF; ret = -EOPNOTSUPP; - if (unlikely(!io_is_uring_fops(f.file))) + if (unlikely(!io_is_uring_fops(file))) goto out; } - ctx = f.file->private_data; + ctx = file->private_data; ret = -EBADFD; if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) goto out; @@ -3320,19 +3258,15 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { - io_cqring_overflow_flush(ctx); - if (unlikely(ctx->sq_data->thread == NULL)) { ret = -EOWNERDEAD; goto out; } if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); - if (flags & IORING_ENTER_SQ_WAIT) { - ret = io_sqpoll_wait_sq(ctx); - if (ret) - goto out; - } + if (flags & IORING_ENTER_SQ_WAIT) + io_sqpoll_wait_sq(ctx); + ret = to_submit; } else if (to_submit) { ret = io_uring_add_tctx_node(ctx); @@ -3353,7 +3287,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, * it should handle ownership problems if any. */ if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) - (void)io_run_local_work_locked(ctx); + (void)io_run_local_work_locked(ctx, min_complete); } mutex_unlock(&ctx->uring_lock); } @@ -3370,24 +3304,17 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ mutex_lock(&ctx->uring_lock); iopoll_locked: - ret2 = io_validate_ext_arg(flags, argp, argsz); - if (likely(!ret2)) { - min_complete = min(min_complete, - ctx->cq_entries); + ret2 = io_validate_ext_arg(ctx, flags, argp, argsz); + if (likely(!ret2)) ret2 = io_iopoll_check(ctx, min_complete); - } mutex_unlock(&ctx->uring_lock); } else { - const sigset_t __user *sig; - struct __kernel_timespec __user *ts; - - ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); - if (likely(!ret2)) { - min_complete = min(min_complete, - ctx->cq_entries); - ret2 = io_cqring_wait(ctx, min_complete, sig, - argsz, ts); - } + struct ext_arg ext_arg = { .argsz = argsz }; + + ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg); + if (likely(!ret2)) + ret2 = io_cqring_wait(ctx, min_complete, flags, + &ext_arg); } if (!ret) { @@ -3404,15 +3331,16 @@ iopoll_locked: } } out: - fdput(f); + if (!(flags & IORING_ENTER_REGISTERED_RING)) + fput(file); return ret; } static const struct file_operations io_uring_fops = { .release = io_uring_release, .mmap = io_uring_mmap, + .get_unmapped_area = io_uring_get_unmapped_area, #ifndef CONFIG_MMU - .get_unmapped_area = io_uring_nommu_get_unmapped_area, .mmap_capabilities = io_uring_nommu_mmap_capabilities, #endif .poll = io_uring_poll, @@ -3427,63 +3355,59 @@ bool io_is_uring_fops(struct file *file) } static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, - struct io_uring_params *p) + struct io_ctx_config *config) { + struct io_uring_params *p = &config->p; + struct io_rings_layout *rl = &config->layout; + struct io_uring_region_desc rd; struct io_rings *rings; - size_t size, sq_array_offset; + int ret; /* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; - size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset); - if (size == SIZE_MAX) - return -EOVERFLOW; - - rings = io_mem_alloc(size); - if (!rings) - return -ENOMEM; - - ctx->rings = rings; - ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); - rings->sq_ring_mask = p->sq_entries - 1; - rings->cq_ring_mask = p->cq_entries - 1; - rings->sq_ring_entries = p->sq_entries; - rings->cq_ring_entries = p->cq_entries; - - if (p->flags & IORING_SETUP_SQE128) - size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries); - else - size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); - if (size == SIZE_MAX) { - io_mem_free(ctx->rings); - ctx->rings = NULL; - return -EOVERFLOW; + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(rl->rings_size); + if (ctx->flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p->cq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; } + ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING); + if (ret) + return ret; + ctx->rings = rings = io_region_get_ptr(&ctx->ring_region); + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) + ctx->sq_array = (u32 *)((char *)rings + rl->sq_array_offset); - ctx->sq_sqes = io_mem_alloc(size); - if (!ctx->sq_sqes) { - io_mem_free(ctx->rings); - ctx->rings = NULL; - return -ENOMEM; + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(rl->sq_size); + if (ctx->flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p->sq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region(ctx, &ctx->sq_region, &rd, IORING_OFF_SQES); + if (ret) { + io_rings_free(ctx); + return ret; } + ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region); + memset(rings, 0, sizeof(*rings)); + WRITE_ONCE(rings->sq_ring_mask, ctx->sq_entries - 1); + WRITE_ONCE(rings->cq_ring_mask, ctx->cq_entries - 1); + WRITE_ONCE(rings->sq_ring_entries, ctx->sq_entries); + WRITE_ONCE(rings->cq_ring_entries, ctx->cq_entries); return 0; } -static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) +static int io_uring_install_fd(struct file *file) { - int ret, fd; + int fd; fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); if (fd < 0) return fd; - - ret = __io_uring_add_tctx_node(ctx); - if (ret) { - put_unused_fd(fd); - return ret; - } fd_install(fd, file); return fd; } @@ -3491,40 +3415,74 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) /* * Allocate an anonymous fd, this is what constitutes the application * visible backing of an io_uring instance. The application mmaps this - * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, - * we have to tie this fd to a socket for file garbage collection purposes. + * fd to gain access to the SQ/CQ ring details. */ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) { - struct file *file; -#if defined(CONFIG_UNIX) - int ret; + /* Create a new inode so that the LSM can block the creation. */ + return anon_inode_create_getfile("[io_uring]", &io_uring_fops, ctx, + O_RDWR | O_CLOEXEC, NULL); +} - ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP, - &ctx->ring_sock); - if (ret) - return ERR_PTR(ret); -#endif +static int io_uring_sanitise_params(struct io_uring_params *p) +{ + unsigned flags = p->flags; - file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx, - O_RDWR | O_CLOEXEC, NULL); -#if defined(CONFIG_UNIX) - if (IS_ERR(file)) { - sock_release(ctx->ring_sock); - ctx->ring_sock = NULL; - } else { - ctx->ring_sock->file = file; + if (flags & ~IORING_SETUP_FLAGS) + return -EINVAL; + + /* There is no way to mmap rings without a real fd */ + if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) && + !(flags & IORING_SETUP_NO_MMAP)) + return -EINVAL; + + if (flags & IORING_SETUP_SQPOLL) { + /* IPI related flags don't make sense with SQPOLL */ + if (flags & (IORING_SETUP_COOP_TASKRUN | + IORING_SETUP_TASKRUN_FLAG | + IORING_SETUP_DEFER_TASKRUN)) + return -EINVAL; } -#endif - return file; + + if (flags & IORING_SETUP_TASKRUN_FLAG) { + if (!(flags & (IORING_SETUP_COOP_TASKRUN | + IORING_SETUP_DEFER_TASKRUN))) + return -EINVAL; + } + + /* HYBRID_IOPOLL only valid with IOPOLL */ + if ((flags & IORING_SETUP_HYBRID_IOPOLL) && !(flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + /* + * For DEFER_TASKRUN we require the completion task to be the same as + * the submission task. This implies that there is only one submitter. + */ + if ((flags & IORING_SETUP_DEFER_TASKRUN) && + !(flags & IORING_SETUP_SINGLE_ISSUER)) + return -EINVAL; + + /* + * Nonsensical to ask for CQE32 and mixed CQE support, it's not + * supported to post 16b CQEs on a ring setup with CQE32. + */ + if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) == + (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) + return -EINVAL; + /* + * Nonsensical to ask for SQE128 and mixed SQE support, it's not + * supported to post 64b SQEs on a ring setup with SQE128. + */ + if ((flags & (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) == + (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) + return -EINVAL; + + return 0; } -static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, - struct io_uring_params __user *params) +static int io_uring_fill_params(struct io_uring_params *p) { - struct io_ring_ctx *ctx; - struct file *file; - int ret; + unsigned entries = p->sq_entries; if (!entries) return -EINVAL; @@ -3563,15 +3521,90 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, p->cq_entries = 2 * p->sq_entries; } + return 0; +} + +int io_prepare_config(struct io_ctx_config *config) +{ + struct io_uring_params *p = &config->p; + int ret; + + ret = io_uring_sanitise_params(p); + if (ret) + return ret; + + ret = io_uring_fill_params(p); + if (ret) + return ret; + + ret = rings_size(p->flags, p->sq_entries, p->cq_entries, + &config->layout); + if (ret) + return ret; + + p->sq_off.head = offsetof(struct io_rings, sq.head); + p->sq_off.tail = offsetof(struct io_rings, sq.tail); + p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); + p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); + p->sq_off.flags = offsetof(struct io_rings, sq_flags); + p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); + p->sq_off.resv1 = 0; + if (!(p->flags & IORING_SETUP_NO_MMAP)) + p->sq_off.user_addr = 0; + + p->cq_off.head = offsetof(struct io_rings, cq.head); + p->cq_off.tail = offsetof(struct io_rings, cq.tail); + p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); + p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); + p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); + p->cq_off.cqes = offsetof(struct io_rings, cqes); + p->cq_off.flags = offsetof(struct io_rings, cq_flags); + p->cq_off.resv1 = 0; + if (!(p->flags & IORING_SETUP_NO_MMAP)) + p->cq_off.user_addr = 0; + if (!(p->flags & IORING_SETUP_NO_SQARRAY)) + p->sq_off.array = config->layout.sq_array_offset; + + return 0; +} + +static __cold int io_uring_create(struct io_ctx_config *config) +{ + struct io_uring_params *p = &config->p; + struct io_ring_ctx *ctx; + struct io_uring_task *tctx; + struct file *file; + int ret; + + ret = io_prepare_config(config); + if (ret) + return ret; + ctx = io_ring_ctx_alloc(p); if (!ctx) return -ENOMEM; + ctx->clockid = CLOCK_MONOTONIC; + ctx->clock_offset = 0; + + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) + static_branch_inc(&io_key_has_sqarray); + if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && !(ctx->flags & IORING_SETUP_IOPOLL) && !(ctx->flags & IORING_SETUP_SQPOLL)) ctx->task_complete = true; + if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) + ctx->lockless_cq = true; + + /* + * lazy poll_wq activation relies on ->task_complete for synchronisation + * purposes, see io_activate_pollwq() + */ + if (!ctx->task_complete) + ctx->poll_activated = true; + /* * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user * space applications don't need to do io completion events @@ -3583,39 +3616,17 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->syscall_iopoll = 1; ctx->compat = in_compat_syscall(); - if (!capable(CAP_IPC_LOCK)) + if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK)) ctx->user = get_uid(current_user()); /* * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if * COOP_TASKRUN is set, then IPIs are never needed by the app. */ - ret = -EINVAL; - if (ctx->flags & IORING_SETUP_SQPOLL) { - /* IPI related flags don't make sense with SQPOLL */ - if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | - IORING_SETUP_TASKRUN_FLAG | - IORING_SETUP_DEFER_TASKRUN)) - goto err; + if (ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_COOP_TASKRUN)) ctx->notify_method = TWA_SIGNAL_NO_IPI; - } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { - ctx->notify_method = TWA_SIGNAL_NO_IPI; - } else { - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG && - !(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) - goto err; + else ctx->notify_method = TWA_SIGNAL; - } - - /* - * For DEFER_TASKRUN we require the completion task to be the same as the - * submission task. This implies that there is only one submitter, so enforce - * that. - */ - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN && - !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) { - goto err; - } /* * This is just grabbed for accounting purposes. When a process exits, @@ -3626,53 +3637,29 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, mmgrab(current->mm); ctx->mm_account = current->mm; - ret = io_allocate_scq_urings(ctx, p); + ret = io_allocate_scq_urings(ctx, config); if (ret) goto err; ret = io_sq_offload_create(ctx, p); if (ret) goto err; - /* always set a rsrc node */ - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto err; - io_rsrc_node_switch(ctx, NULL); - memset(&p->sq_off, 0, sizeof(p->sq_off)); - p->sq_off.head = offsetof(struct io_rings, sq.head); - p->sq_off.tail = offsetof(struct io_rings, sq.tail); - p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); - p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); - p->sq_off.flags = offsetof(struct io_rings, sq_flags); - p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); - p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; + p->features = IORING_FEAT_FLAGS; - memset(&p->cq_off, 0, sizeof(p->cq_off)); - p->cq_off.head = offsetof(struct io_rings, cq.head); - p->cq_off.tail = offsetof(struct io_rings, cq.tail); - p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); - p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); - p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); - p->cq_off.cqes = offsetof(struct io_rings, cqes); - p->cq_off.flags = offsetof(struct io_rings, cq_flags); - - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | - IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | - IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | - IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | - IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | - IORING_FEAT_LINKED_FILE; - - if (copy_to_user(params, p, sizeof(*p))) { + if (copy_to_user(config->uptr, p, sizeof(*p))) { ret = -EFAULT; goto err; } if (ctx->flags & IORING_SETUP_SINGLE_ISSUER - && !(ctx->flags & IORING_SETUP_R_DISABLED)) - WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); + && !(ctx->flags & IORING_SETUP_R_DISABLED)) { + /* + * Unlike io_register_enable_rings(), don't need WRITE_ONCE() + * since ctx isn't yet accessible from other tasks + */ + ctx->submitter_task = get_task_struct(current); + } file = io_uring_get_file(ctx); if (IS_ERR(file)) { @@ -3680,22 +3667,30 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; } + ret = __io_uring_add_tctx_node(ctx); + if (ret) + goto err_fput; + tctx = current->io_uring; + /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup */ - ret = io_uring_install_fd(ctx, file); - if (ret < 0) { - /* fput will clean it up */ - fput(file); - return ret; - } + if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY) + ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX); + else + ret = io_uring_install_fd(file); + if (ret < 0) + goto err_fput; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: io_ring_ctx_wait_and_kill(ctx); return ret; +err_fput: + fput(file); + return ret; } /* @@ -3705,503 +3700,64 @@ err: */ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) { - struct io_uring_params p; - int i; + struct io_ctx_config config; - if (copy_from_user(&p, params, sizeof(p))) - return -EFAULT; - for (i = 0; i < ARRAY_SIZE(p.resv); i++) { - if (p.resv[i]) - return -EINVAL; - } + memset(&config, 0, sizeof(config)); - if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | - IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | - IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | - IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | - IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | - IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | - IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN)) - return -EINVAL; - - return io_uring_create(entries, &p, params); -} - -SYSCALL_DEFINE2(io_uring_setup, u32, entries, - struct io_uring_params __user *, params) -{ - return io_uring_setup(entries, params); -} - -static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) -{ - struct io_uring_probe *p; - size_t size; - int i, ret; - - size = struct_size(p, ops, nr_args); - if (size == SIZE_MAX) - return -EOVERFLOW; - p = kzalloc(size, GFP_KERNEL); - if (!p) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(p, arg, size)) - goto out; - ret = -EINVAL; - if (memchr_inv(p, 0, size)) - goto out; - - p->last_op = IORING_OP_LAST - 1; - if (nr_args > IORING_OP_LAST) - nr_args = IORING_OP_LAST; - - for (i = 0; i < nr_args; i++) { - p->ops[i].op = i; - if (!io_op_defs[i].not_supported) - p->ops[i].flags = IO_URING_OP_SUPPORTED; - } - p->ops_len = i; - - ret = 0; - if (copy_to_user(arg, p, size)) - ret = -EFAULT; -out: - kfree(p); - return ret; -} - -static int io_register_personality(struct io_ring_ctx *ctx) -{ - const struct cred *creds; - u32 id; - int ret; - - creds = get_current_cred(); - - ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, - XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); - if (ret < 0) { - put_cred(creds); - return ret; - } - return id; -} - -static __cold int io_register_restrictions(struct io_ring_ctx *ctx, - void __user *arg, unsigned int nr_args) -{ - struct io_uring_restriction *res; - size_t size; - int i, ret; - - /* Restrictions allowed only if rings started disabled */ - if (!(ctx->flags & IORING_SETUP_R_DISABLED)) - return -EBADFD; - - /* We allow only a single restrictions registration */ - if (ctx->restrictions.registered) - return -EBUSY; - - if (!arg || nr_args > IORING_MAX_RESTRICTIONS) - return -EINVAL; - - size = array_size(nr_args, sizeof(*res)); - if (size == SIZE_MAX) - return -EOVERFLOW; - - res = memdup_user(arg, size); - if (IS_ERR(res)) - return PTR_ERR(res); - - ret = 0; - - for (i = 0; i < nr_args; i++) { - switch (res[i].opcode) { - case IORING_RESTRICTION_REGISTER_OP: - if (res[i].register_op >= IORING_REGISTER_LAST) { - ret = -EINVAL; - goto out; - } - - __set_bit(res[i].register_op, - ctx->restrictions.register_op); - break; - case IORING_RESTRICTION_SQE_OP: - if (res[i].sqe_op >= IORING_OP_LAST) { - ret = -EINVAL; - goto out; - } - - __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); - break; - case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: - ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; - break; - case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: - ctx->restrictions.sqe_flags_required = res[i].sqe_flags; - break; - default: - ret = -EINVAL; - goto out; - } - } - -out: - /* Reset all restrictions if an error happened */ - if (ret != 0) - memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); - else - ctx->restrictions.registered = true; - - kfree(res); - return ret; -} - -static int io_register_enable_rings(struct io_ring_ctx *ctx) -{ - if (!(ctx->flags & IORING_SETUP_R_DISABLED)) - return -EBADFD; - - if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) - WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); - - if (ctx->restrictions.registered) - ctx->restricted = 1; - - ctx->flags &= ~IORING_SETUP_R_DISABLED; - if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) - wake_up(&ctx->sq_data->wait); - return 0; -} - -static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, - void __user *arg, unsigned len) -{ - struct io_uring_task *tctx = current->io_uring; - cpumask_var_t new_mask; - int ret; - - if (!tctx || !tctx->io_wq) - return -EINVAL; - - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) - return -ENOMEM; - - cpumask_clear(new_mask); - if (len > cpumask_size()) - len = cpumask_size(); - - if (in_compat_syscall()) { - ret = compat_get_bitmap(cpumask_bits(new_mask), - (const compat_ulong_t __user *)arg, - len * 8 /* CHAR_BIT */); - } else { - ret = copy_from_user(new_mask, arg, len); - } - - if (ret) { - free_cpumask_var(new_mask); + if (copy_from_user(&config.p, params, sizeof(config.p))) return -EFAULT; - } - ret = io_wq_cpu_affinity(tctx->io_wq, new_mask); - free_cpumask_var(new_mask); - return ret; -} - -static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) -{ - struct io_uring_task *tctx = current->io_uring; - - if (!tctx || !tctx->io_wq) + if (!mem_is_zero(&config.p.resv, sizeof(config.p.resv))) return -EINVAL; - return io_wq_cpu_affinity(tctx->io_wq, NULL); + config.p.sq_entries = entries; + config.uptr = params; + return io_uring_create(&config); } -static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, - void __user *arg) - __must_hold(&ctx->uring_lock) +static inline int io_uring_allowed(void) { - struct io_tctx_node *node; - struct io_uring_task *tctx = NULL; - struct io_sq_data *sqd = NULL; - __u32 new_count[2]; - int i, ret; - - if (copy_from_user(new_count, arg, sizeof(new_count))) - return -EFAULT; - for (i = 0; i < ARRAY_SIZE(new_count); i++) - if (new_count[i] > INT_MAX) - return -EINVAL; - - if (ctx->flags & IORING_SETUP_SQPOLL) { - sqd = ctx->sq_data; - if (sqd) { - /* - * Observe the correct sqd->lock -> ctx->uring_lock - * ordering. Fine to drop uring_lock here, we hold - * a ref to the ctx. - */ - refcount_inc(&sqd->refs); - mutex_unlock(&ctx->uring_lock); - mutex_lock(&sqd->lock); - mutex_lock(&ctx->uring_lock); - if (sqd->thread) - tctx = sqd->thread->io_uring; - } - } else { - tctx = current->io_uring; - } - - BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits)); + int disabled = READ_ONCE(sysctl_io_uring_disabled); + kgid_t io_uring_group; - for (i = 0; i < ARRAY_SIZE(new_count); i++) - if (new_count[i]) - ctx->iowq_limits[i] = new_count[i]; - ctx->iowq_limits_set = true; - - if (tctx && tctx->io_wq) { - ret = io_wq_max_workers(tctx->io_wq, new_count); - if (ret) - goto err; - } else { - memset(new_count, 0, sizeof(new_count)); - } - - if (sqd) { - mutex_unlock(&sqd->lock); - io_put_sq_data(sqd); - } - - if (copy_to_user(arg, new_count, sizeof(new_count))) - return -EFAULT; + if (disabled == 2) + return -EPERM; - /* that's it for SQPOLL, only the SQPOLL task creates requests */ - if (sqd) - return 0; + if (disabled == 0 || capable(CAP_SYS_ADMIN)) + goto allowed_lsm; - /* now propagate the restriction to all registered users */ - list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; + io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group); + if (!gid_valid(io_uring_group)) + return -EPERM; - if (WARN_ON_ONCE(!tctx->io_wq)) - continue; + if (!in_group_p(io_uring_group)) + return -EPERM; - for (i = 0; i < ARRAY_SIZE(new_count); i++) - new_count[i] = ctx->iowq_limits[i]; - /* ignore errors, it always returns zero anyway */ - (void)io_wq_max_workers(tctx->io_wq, new_count); - } - return 0; -err: - if (sqd) { - mutex_unlock(&sqd->lock); - io_put_sq_data(sqd); - } - return ret; +allowed_lsm: + return security_uring_allowed(); } -static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, - void __user *arg, unsigned nr_args) - __releases(ctx->uring_lock) - __acquires(ctx->uring_lock) +SYSCALL_DEFINE2(io_uring_setup, u32, entries, + struct io_uring_params __user *, params) { int ret; - /* - * We don't quiesce the refs for register anymore and so it can't be - * dying as we're holding a file ref here. - */ - if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs))) - return -ENXIO; - - if (ctx->submitter_task && ctx->submitter_task != current) - return -EEXIST; - - if (ctx->restricted) { - opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); - if (!test_bit(opcode, ctx->restrictions.register_op)) - return -EACCES; - } - - switch (opcode) { - case IORING_REGISTER_BUFFERS: - ret = -EFAULT; - if (!arg) - break; - ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); - break; - case IORING_UNREGISTER_BUFFERS: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_sqe_buffers_unregister(ctx); - break; - case IORING_REGISTER_FILES: - ret = -EFAULT; - if (!arg) - break; - ret = io_sqe_files_register(ctx, arg, nr_args, NULL); - break; - case IORING_UNREGISTER_FILES: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_sqe_files_unregister(ctx); - break; - case IORING_REGISTER_FILES_UPDATE: - ret = io_register_files_update(ctx, arg, nr_args); - break; - case IORING_REGISTER_EVENTFD: - ret = -EINVAL; - if (nr_args != 1) - break; - ret = io_eventfd_register(ctx, arg, 0); - break; - case IORING_REGISTER_EVENTFD_ASYNC: - ret = -EINVAL; - if (nr_args != 1) - break; - ret = io_eventfd_register(ctx, arg, 1); - break; - case IORING_UNREGISTER_EVENTFD: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_eventfd_unregister(ctx); - break; - case IORING_REGISTER_PROBE: - ret = -EINVAL; - if (!arg || nr_args > 256) - break; - ret = io_probe(ctx, arg, nr_args); - break; - case IORING_REGISTER_PERSONALITY: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_register_personality(ctx); - break; - case IORING_UNREGISTER_PERSONALITY: - ret = -EINVAL; - if (arg) - break; - ret = io_unregister_personality(ctx, nr_args); - break; - case IORING_REGISTER_ENABLE_RINGS: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_register_enable_rings(ctx); - break; - case IORING_REGISTER_RESTRICTIONS: - ret = io_register_restrictions(ctx, arg, nr_args); - break; - case IORING_REGISTER_FILES2: - ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE); - break; - case IORING_REGISTER_FILES_UPDATE2: - ret = io_register_rsrc_update(ctx, arg, nr_args, - IORING_RSRC_FILE); - break; - case IORING_REGISTER_BUFFERS2: - ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER); - break; - case IORING_REGISTER_BUFFERS_UPDATE: - ret = io_register_rsrc_update(ctx, arg, nr_args, - IORING_RSRC_BUFFER); - break; - case IORING_REGISTER_IOWQ_AFF: - ret = -EINVAL; - if (!arg || !nr_args) - break; - ret = io_register_iowq_aff(ctx, arg, nr_args); - break; - case IORING_UNREGISTER_IOWQ_AFF: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_unregister_iowq_aff(ctx); - break; - case IORING_REGISTER_IOWQ_MAX_WORKERS: - ret = -EINVAL; - if (!arg || nr_args != 2) - break; - ret = io_register_iowq_max_workers(ctx, arg); - break; - case IORING_REGISTER_RING_FDS: - ret = io_ringfd_register(ctx, arg, nr_args); - break; - case IORING_UNREGISTER_RING_FDS: - ret = io_ringfd_unregister(ctx, arg, nr_args); - break; - case IORING_REGISTER_PBUF_RING: - ret = -EINVAL; - if (!arg || nr_args != 1) - break; - ret = io_register_pbuf_ring(ctx, arg); - break; - case IORING_UNREGISTER_PBUF_RING: - ret = -EINVAL; - if (!arg || nr_args != 1) - break; - ret = io_unregister_pbuf_ring(ctx, arg); - break; - case IORING_REGISTER_SYNC_CANCEL: - ret = -EINVAL; - if (!arg || nr_args != 1) - break; - ret = io_sync_cancel(ctx, arg); - break; - case IORING_REGISTER_FILE_ALLOC_RANGE: - ret = -EINVAL; - if (!arg || nr_args) - break; - ret = io_register_file_alloc_range(ctx, arg); - break; - default: - ret = -EINVAL; - break; - } - - return ret; -} - -SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, - void __user *, arg, unsigned int, nr_args) -{ - struct io_ring_ctx *ctx; - long ret = -EBADF; - struct fd f; - - if (opcode >= IORING_REGISTER_LAST) - return -EINVAL; - - f = fdget(fd); - if (!f.file) - return -EBADF; - - ret = -EOPNOTSUPP; - if (!io_is_uring_fops(f.file)) - goto out_fput; - - ctx = f.file->private_data; + ret = io_uring_allowed(); + if (ret) + return ret; - mutex_lock(&ctx->uring_lock); - ret = __io_uring_register(ctx, opcode, arg, nr_args); - mutex_unlock(&ctx->uring_lock); - trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret); -out_fput: - fdput(f); - return ret; + return io_uring_setup(entries, params); } static int __init io_uring_init(void) { + struct kmem_cache_args kmem_args = { + .useroffset = offsetof(struct io_kiocb, cmd.data), + .usersize = sizeof_field(struct io_kiocb, cmd.data), + .freeptr_offset = offsetof(struct io_kiocb, work), + .use_freeptr_offset = true, + }; + #define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \ BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \ @@ -4250,9 +3806,13 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); BUILD_BUG_SQE_ELEM(44, __u32, file_index); BUILD_BUG_SQE_ELEM(44, __u16, addr_len); + BUILD_BUG_SQE_ELEM(44, __u8, write_stream); + BUILD_BUG_SQE_ELEM(45, __u8, __pad4[0]); BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]); BUILD_BUG_SQE_ELEM(48, __u64, addr3); BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd); + BUILD_BUG_SQE_ELEM(48, __u64, attr_ptr); + BUILD_BUG_SQE_ELEM(56, __u64, attr_type_mask); BUILD_BUG_SQE_ELEM(56, __u64, __pad2); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != @@ -4270,14 +3830,36 @@ static int __init io_uring_init(void) BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); - BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); + BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof_field(struct io_kiocb, flags)); BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); + /* top 8bits are for internal use */ + BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0); + io_uring_optable_init(); - req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | - SLAB_ACCOUNT); + /* imu->dir is u8 */ + BUILD_BUG_ON((IO_IMU_DEST | IO_IMU_SOURCE) > U8_MAX); + + /* + * Allow user copy in the per-command field, which starts after the + * file in io_kiocb and until the opcode field. The openat2 handling + * requires copying in user memory into the io_kiocb object in that + * range, and HARDENED_USERCOPY will complain if we haven't + * correctly annotated this range. + */ + req_cachep = kmem_cache_create("io_kiocb", sizeof(struct io_kiocb), &kmem_args, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | + SLAB_TYPESAFE_BY_RCU); + + iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64); + BUG_ON(!iou_wq); + +#ifdef CONFIG_SYSCTL + register_sysctl_init("kernel", kernel_io_uring_disabled_table); +#endif + return 0; }; __initcall(io_uring_init); |
