diff options
Diffstat (limited to 'io_uring/sqpoll.c')
-rw-r--r-- | io_uring/sqpoll.c | 138 |
1 files changed, 108 insertions, 30 deletions
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 65b5dbe3c850..d037cc68e9d3 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -10,14 +10,17 @@ #include <linux/slab.h> #include <linux/audit.h> #include <linux/security.h> +#include <linux/cpuset.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" +#include "napi.h" #include "sqpoll.h" #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 +#define IORING_TW_CAP_ENTRIES_VALUE 8 enum { IO_SQ_THREAD_SHOULD_STOP = 0, @@ -37,12 +40,13 @@ void io_sq_thread_unpark(struct io_sq_data *sqd) if (atomic_dec_return(&sqd->park_pending)) set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); mutex_unlock(&sqd->lock); + wake_up(&sqd->wait); } void io_sq_thread_park(struct io_sq_data *sqd) __acquires(&sqd->lock) { - WARN_ON_ONCE(sqd->thread == current); + WARN_ON_ONCE(data_race(sqd->thread) == current); atomic_inc(&sqd->park_pending); set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); @@ -103,29 +107,21 @@ static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) { struct io_ring_ctx *ctx_attach; struct io_sq_data *sqd; - struct fd f; + CLASS(fd, f)(p->wq_fd); - f = fdget(p->wq_fd); - if (!f.file) + if (fd_empty(f)) return ERR_PTR(-ENXIO); - if (!io_is_uring_fops(f.file)) { - fdput(f); + if (!io_is_uring_fops(fd_file(f))) return ERR_PTR(-EINVAL); - } - ctx_attach = f.file->private_data; + ctx_attach = fd_file(f)->private_data; sqd = ctx_attach->sq_data; - if (!sqd) { - fdput(f); + if (!sqd) return ERR_PTR(-EINVAL); - } - if (sqd->task_tgid != current->tgid) { - fdput(f); + if (sqd->task_tgid != current->tgid) return ERR_PTR(-EPERM); - } refcount_inc(&sqd->refs); - fdput(f); return sqd; } @@ -174,7 +170,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; - if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { + if (to_submit || !wq_list_empty(&ctx->iopoll_list)) { const struct cred *creds = NULL; if (ctx->sq_creds != current_cred()) @@ -212,21 +208,73 @@ static bool io_sqd_handle_event(struct io_sq_data *sqd) mutex_unlock(&sqd->lock); if (signal_pending(current)) did_sig = get_signal(&ksig); - cond_resched(); + wait_event(sqd->wait, !atomic_read(&sqd->park_pending)); mutex_lock(&sqd->lock); sqd->sq_cpu = raw_smp_processor_id(); } return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); } +/* + * Run task_work, processing the retry_list first. The retry_list holds + * entries that we passed on in the previous run, if we had more task_work + * than we were asked to process. Newly queued task_work isn't run until the + * retry list has been fully processed. + */ +static unsigned int io_sq_tw(struct llist_node **retry_list, int max_entries) +{ + struct io_uring_task *tctx = current->io_uring; + unsigned int count = 0; + + if (*retry_list) { + *retry_list = io_handle_tw_list(*retry_list, &count, max_entries); + if (count >= max_entries) + goto out; + max_entries -= count; + } + *retry_list = tctx_task_work_run(tctx, max_entries, &count); +out: + if (task_work_pending(current)) + task_work_run(); + return count; +} + +static bool io_sq_tw_pending(struct llist_node *retry_list) +{ + struct io_uring_task *tctx = current->io_uring; + + return retry_list || !llist_empty(&tctx->task_list); +} + +static void io_sq_update_worktime(struct io_sq_data *sqd, struct rusage *start) +{ + struct rusage end; + + getrusage(current, RUSAGE_SELF, &end); + end.ru_stime.tv_sec -= start->ru_stime.tv_sec; + end.ru_stime.tv_usec -= start->ru_stime.tv_usec; + + sqd->work_time += end.ru_stime.tv_usec + end.ru_stime.tv_sec * 1000000; +} + static int io_sq_thread(void *data) { + struct llist_node *retry_list = NULL; struct io_sq_data *sqd = data; struct io_ring_ctx *ctx; + struct rusage start; unsigned long timeout = 0; - char buf[TASK_COMM_LEN]; + char buf[TASK_COMM_LEN] = {}; DEFINE_WAIT(wait); + /* offload context creation failed, just exit */ + if (!current->io_uring) { + mutex_lock(&sqd->lock); + sqd->thread = NULL; + mutex_unlock(&sqd->lock); + goto err_out; + } + snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); set_task_comm(current, buf); @@ -240,6 +288,14 @@ static int io_sq_thread(void *data) sqd->sq_cpu = raw_smp_processor_id(); } + /* + * Force audit context to get setup, in case we do prep side async + * operations that would trigger an audit call before any issue side + * audit has been done. + */ + audit_uring_entry(IORING_OP_NOP); + audit_uring_exit(true, 0); + mutex_lock(&sqd->lock); while (1) { bool cap_entries, sqt_spin = false; @@ -251,18 +307,25 @@ static int io_sq_thread(void *data) } cap_entries = !list_is_singular(&sqd->ctx_list); + getrusage(current, RUSAGE_SELF, &start); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { int ret = __io_sq_thread(ctx, cap_entries); if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) sqt_spin = true; } - if (io_run_task_work()) + if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE)) sqt_spin = true; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + if (io_napi(ctx)) + io_napi_sqpoll_busy_poll(ctx); + if (sqt_spin || !time_after(jiffies, timeout)) { - if (sqt_spin) + if (sqt_spin) { + io_sq_update_worktime(sqd, &start); timeout = jiffies + sqd->sq_thread_idle; + } if (unlikely(need_resched())) { mutex_unlock(&sqd->lock); cond_resched(); @@ -273,7 +336,7 @@ static int io_sq_thread(void *data) } prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); - if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) { + if (!io_sqd_events_pending(sqd) && !io_sq_tw_pending(retry_list)) { bool needs_sched = true; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { @@ -312,13 +375,16 @@ static int io_sq_thread(void *data) timeout = jiffies + sqd->sq_thread_idle; } + if (retry_list) + io_sq_tw(&retry_list, UINT_MAX); + io_uring_cancel_generic(true, sqd); sqd->thread = NULL; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); io_run_task_work(); mutex_unlock(&sqd->lock); - +err_out: complete(&sqd->exited); do_exit(0); } @@ -343,21 +409,17 @@ void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) __cold int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p) { + struct task_struct *task_to_put = NULL; int ret; /* Retain compatibility with failing for an invalid attach attempt */ if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == IORING_SETUP_ATTACH_WQ) { - struct fd f; - - f = fdget(p->wq_fd); - if (!f.file) + CLASS(fd, f)(p->wq_fd); + if (fd_empty(f)) return -ENXIO; - if (!io_is_uring_fops(f.file)) { - fdput(f); + if (!io_is_uring_fops(fd_file(f))) return -EINVAL; - } - fdput(f); } if (ctx->flags & IORING_SETUP_SQPOLL) { struct task_struct *tsk; @@ -393,11 +455,22 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, return 0; if (p->flags & IORING_SETUP_SQ_AFF) { + cpumask_var_t allowed_mask; int cpu = p->sq_thread_cpu; ret = -EINVAL; if (cpu >= nr_cpu_ids || !cpu_online(cpu)) goto err_sqpoll; + ret = -ENOMEM; + if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL)) + goto err_sqpoll; + ret = -EINVAL; + cpuset_cpus_allowed(current, allowed_mask); + if (!cpumask_test_cpu(cpu, allowed_mask)) { + free_cpumask_var(allowed_mask); + goto err_sqpoll; + } + free_cpumask_var(allowed_mask); sqd->sq_cpu = cpu; } else { sqd->sq_cpu = -1; @@ -412,6 +485,7 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, } sqd->thread = tsk; + task_to_put = get_task_struct(tsk); ret = io_uring_alloc_task_context(tsk, ctx); wake_up_new_task(tsk); if (ret) @@ -422,11 +496,15 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, goto err; } + if (task_to_put) + put_task_struct(task_to_put); return 0; err_sqpoll: complete(&ctx->sq_data->exited); err: io_sq_thread_finish(ctx); + if (task_to_put) + put_task_struct(task_to_put); return ret; } |