Diffstat (limited to 'io_uring/io-wq.c')
-rw-r--r--	io_uring/io-wq.c	421
1 file changed, 254 insertions, 167 deletions
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 522196dfb0ff..cd1fcb115739 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/rculist_nulls.h>
 #include <linux/cpu.h>
+#include <linux/cpuset.h>
 #include <linux/task_work.h>
 #include <linux/audit.h>
 #include <linux/mmu_context.h>
@@ -23,12 +24,12 @@
 #include "io_uring.h"

 #define WORKER_IDLE_TIMEOUT	(5 * HZ)
+#define WORKER_INIT_LIMIT	3

 enum {
-	IO_WORKER_F_UP		= 1,	/* up and active */
-	IO_WORKER_F_RUNNING	= 2,	/* account as running */
-	IO_WORKER_F_FREE	= 4,	/* worker on free list */
-	IO_WORKER_F_BOUND	= 8,	/* is doing bounded work */
+	IO_WORKER_F_UP		= 0,	/* up and active */
+	IO_WORKER_F_RUNNING	= 1,	/* account as running */
+	IO_WORKER_F_FREE	= 2,	/* worker on free list */
 };

 enum {
@@ -44,25 +45,25 @@ enum {
  */
 struct io_worker {
 	refcount_t ref;
-	unsigned flags;
+	unsigned long flags;
 	struct hlist_nulls_node nulls_node;
 	struct list_head all_list;
 	struct task_struct *task;
 	struct io_wq *wq;
+	struct io_wq_acct *acct;

 	struct io_wq_work *cur_work;
-	struct io_wq_work *next_work;
 	raw_spinlock_t lock;

 	struct completion ref_done;

 	unsigned long create_state;
 	struct callback_head create_work;
-	int create_index;
+	int init_retries;

 	union {
 		struct rcu_head rcu;
-		struct work_struct work;
+		struct delayed_work work;
 	};
 };

@@ -75,10 +76,27 @@ struct io_worker {
 #define IO_WQ_NR_HASH_BUCKETS	(1u << IO_WQ_HASH_ORDER)

 struct io_wq_acct {
+	/**
+	 * Protects access to the worker lists.
+	 */
+	raw_spinlock_t workers_lock;
+
 	unsigned nr_workers;
 	unsigned max_workers;
-	int index;
 	atomic_t nr_running;
+
+	/**
+	 * The list of free workers. Protected by #workers_lock
+	 * (write) and RCU (read).
+	 */
+	struct hlist_nulls_head free_list;
+
+	/**
+	 * The list of all workers. Protected by #workers_lock
+	 * (write) and RCU (read).
+	 */
+	struct list_head all_list;
+
 	raw_spinlock_t lock;
 	struct io_wq_work_list work_list;
 	unsigned long flags;
@@ -96,9 +114,6 @@ enum {
 struct io_wq {
 	unsigned long state;

-	free_work_fn *free_work;
-	io_wq_work_fn *do_work;
-
 	struct io_wq_hash *hash;

 	atomic_t worker_refs;
@@ -110,12 +125,6 @@ struct io_wq {

 	struct io_wq_acct acct[IO_WQ_ACCT_NR];

-	/* lock protects access to elements below */
-	raw_spinlock_t lock;
-
-	struct hlist_nulls_head free_list;
-	struct list_head all_list;
-
 	struct wait_queue_entry wait;

 	struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
@@ -133,7 +142,7 @@ struct io_cb_cancel_data {
 	bool cancel_all;
 };

-static bool create_io_worker(struct io_wq *wq, int index);
+static bool create_io_worker(struct io_wq *wq, struct io_wq_acct *acct);
 static void io_wq_dec_running(struct io_worker *worker);
 static bool io_acct_cancel_pending_work(struct io_wq *wq,
 					struct io_wq_acct *acct,
@@ -141,6 +150,16 @@ static bool io_acct_cancel_pending_work(struct io_wq *wq,
 static void create_worker_cb(struct callback_head *cb);
 static void io_wq_cancel_tw_create(struct io_wq *wq);

+static inline unsigned int __io_get_work_hash(unsigned int work_flags)
+{
+	return work_flags >> IO_WQ_HASH_SHIFT;
+}
+
+static inline unsigned int io_get_work_hash(struct io_wq_work *work)
+{
+	return __io_get_work_hash(atomic_read(&work->flags));
+}
+
 static bool io_worker_get(struct io_worker *worker)
 {
 	return refcount_inc_not_zero(&worker->ref);
@@ -158,14 +177,14 @@ static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound)
 }

 static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq,
-						  struct io_wq_work *work)
+						  unsigned int work_flags)
 {
-	return io_get_acct(wq, !(work->flags & IO_WQ_WORK_UNBOUND));
+	return io_get_acct(wq, !(work_flags & IO_WQ_WORK_UNBOUND));
 }

 static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker)
 {
-	return io_get_acct(worker->wq, worker->flags & IO_WORKER_F_BOUND);
+	return worker->acct;
 }

 static void io_worker_ref_put(struct io_wq *wq)
@@ -190,9 +209,9 @@ static void io_worker_cancel_cb(struct io_worker *worker)
 	struct io_wq *wq = worker->wq;

 	atomic_dec(&acct->nr_running);
-	raw_spin_lock(&wq->lock);
+	raw_spin_lock(&acct->workers_lock);
 	acct->nr_workers--;
-	raw_spin_unlock(&wq->lock);
+	raw_spin_unlock(&acct->workers_lock);
 	io_worker_ref_put(wq);
 	clear_bit_unlock(0, &worker->create_state);
 	io_worker_release(worker);
@@ -211,6 +230,7 @@ static bool io_task_worker_match(struct callback_head *cb, void *data)
 static void io_worker_exit(struct io_worker *worker)
 {
 	struct io_wq *wq = worker->wq;
+	struct io_wq_acct *acct = io_wq_get_acct(worker);

 	while (1) {
 		struct callback_head *cb = task_work_cancel_match(wq->task,
@@ -224,11 +244,11 @@ static void io_worker_exit(struct io_worker *worker)
 	io_worker_release(worker);
 	wait_for_completion(&worker->ref_done);

-	raw_spin_lock(&wq->lock);
-	if (worker->flags & IO_WORKER_F_FREE)
+	raw_spin_lock(&acct->workers_lock);
+	if (test_bit(IO_WORKER_F_FREE, &worker->flags))
 		hlist_nulls_del_rcu(&worker->nulls_node);
 	list_del_rcu(&worker->all_list);
-	raw_spin_unlock(&wq->lock);
+	raw_spin_unlock(&acct->workers_lock);
 	io_wq_dec_running(worker);
 	/*
 	 * this worker is a goner, clear ->worker_private to avoid any
@@ -267,8 +287,7 @@ static inline bool io_acct_run_queue(struct io_wq_acct *acct)
  * Check head of free list for an available worker. If one isn't available,
  * caller must create one.
  */
-static bool io_wq_activate_free_worker(struct io_wq *wq,
-					struct io_wq_acct *acct)
+static bool io_acct_activate_free_worker(struct io_wq_acct *acct)
 	__must_hold(RCU)
 {
 	struct hlist_nulls_node *n;
@@ -279,13 +298,9 @@ static bool io_wq_activate_free_worker(struct io_wq *wq,
 	 * activate. If a given worker is on the free_list but in the process
 	 * of exiting, keep trying.
 	 */
-	hlist_nulls_for_each_entry_rcu(worker, n, &wq->free_list, nulls_node) {
+	hlist_nulls_for_each_entry_rcu(worker, n, &acct->free_list, nulls_node) {
 		if (!io_worker_get(worker))
 			continue;
-		if (io_wq_get_acct(worker) != acct) {
-			io_worker_release(worker);
-			continue;
-		}
 		/*
 		 * If the worker is already running, it's either already
 		 * starting work or finishing work. In either case, if it does
@@ -312,16 +327,16 @@ static bool io_wq_create_worker(struct io_wq *wq, struct io_wq_acct *acct)
 	if (unlikely(!acct->max_workers))
 		pr_warn_once("io-wq is not configured for unbound workers");

-	raw_spin_lock(&wq->lock);
+	raw_spin_lock(&acct->workers_lock);
 	if (acct->nr_workers >= acct->max_workers) {
-		raw_spin_unlock(&wq->lock);
+		raw_spin_unlock(&acct->workers_lock);
 		return true;
 	}
 	acct->nr_workers++;
-	raw_spin_unlock(&wq->lock);
+	raw_spin_unlock(&acct->workers_lock);
 	atomic_inc(&acct->nr_running);
 	atomic_inc(&wq->worker_refs);
-	return create_io_worker(wq, acct->index);
+	return create_io_worker(wq, acct);
 }

 static void io_wq_inc_running(struct io_worker *worker)
@@ -341,16 +356,16 @@ static void create_worker_cb(struct callback_head *cb)

 	worker = container_of(cb, struct io_worker, create_work);
 	wq = worker->wq;
-	acct = &wq->acct[worker->create_index];
-	raw_spin_lock(&wq->lock);
+	acct = worker->acct;
+	raw_spin_lock(&acct->workers_lock);
 	if (acct->nr_workers < acct->max_workers) {
 		acct->nr_workers++;
 		do_create = true;
 	}
-	raw_spin_unlock(&wq->lock);
+	raw_spin_unlock(&acct->workers_lock);
 	if (do_create) {
-		create_io_worker(wq, worker->create_index);
+		create_io_worker(wq, acct);
 	} else {
 		atomic_dec(&acct->nr_running);
 		io_worker_ref_put(wq);
@@ -382,7 +397,6 @@ static bool io_queue_worker_create(struct io_worker *worker,

 	atomic_inc(&wq->worker_refs);
 	init_task_work(&worker->create_work, func);
-	worker->create_index = acct->index;
 	if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) {
 		/*
 		 * EXIT may have been set after checking it above, check after
@@ -405,18 +419,48 @@ fail:
 	return false;
 }

+/* Defer if current and next work are both hashed to the same chain */
+static bool io_wq_hash_defer(struct io_wq_work *work, struct io_wq_acct *acct)
+{
+	unsigned int hash, work_flags;
+	struct io_wq_work *next;
+
+	lockdep_assert_held(&acct->lock);
+
+	work_flags = atomic_read(&work->flags);
+	if (!__io_wq_is_hashed(work_flags))
+		return false;
+
+	/* should not happen, io_acct_run_queue() said we had work */
+	if (wq_list_empty(&acct->work_list))
+		return true;
+
+	hash = __io_get_work_hash(work_flags);
+	next = container_of(acct->work_list.first, struct io_wq_work, list);
+	work_flags = atomic_read(&next->flags);
+	if (!__io_wq_is_hashed(work_flags))
+		return false;
+	return hash == __io_get_work_hash(work_flags);
+}
+
 static void io_wq_dec_running(struct io_worker *worker)
 {
 	struct io_wq_acct *acct = io_wq_get_acct(worker);
 	struct io_wq *wq = worker->wq;

-	if (!(worker->flags & IO_WORKER_F_UP))
+	if (!test_bit(IO_WORKER_F_UP, &worker->flags))
 		return;

 	if (!atomic_dec_and_test(&acct->nr_running))
 		return;
+	if (!worker->cur_work)
+		return;
 	if (!io_acct_run_queue(acct))
 		return;
+	if (io_wq_hash_defer(worker->cur_work, acct)) {
+		raw_spin_unlock(&acct->lock);
+		return;
+	}
 	raw_spin_unlock(&acct->lock);

 	atomic_inc(&acct->nr_running);
@@ -428,33 +472,28 @@ static void io_wq_dec_running(struct io_worker *worker)
  * Worker will start processing some work. Move it to the busy list, if
  * it's currently on the freelist
  */
-static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker)
+static void __io_worker_busy(struct io_wq_acct *acct, struct io_worker *worker)
 {
-	if (worker->flags & IO_WORKER_F_FREE) {
-		worker->flags &= ~IO_WORKER_F_FREE;
-		raw_spin_lock(&wq->lock);
+	if (test_bit(IO_WORKER_F_FREE, &worker->flags)) {
+		clear_bit(IO_WORKER_F_FREE, &worker->flags);
+		raw_spin_lock(&acct->workers_lock);
 		hlist_nulls_del_init_rcu(&worker->nulls_node);
-		raw_spin_unlock(&wq->lock);
+		raw_spin_unlock(&acct->workers_lock);
 	}
 }

 /*
  * No work, worker going to sleep. Move to freelist.
  */
-static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker)
-	__must_hold(wq->lock)
+static void __io_worker_idle(struct io_wq_acct *acct, struct io_worker *worker)
+	__must_hold(acct->workers_lock)
 {
-	if (!(worker->flags & IO_WORKER_F_FREE)) {
-		worker->flags |= IO_WORKER_F_FREE;
-		hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list);
+	if (!test_bit(IO_WORKER_F_FREE, &worker->flags)) {
+		set_bit(IO_WORKER_F_FREE, &worker->flags);
+		hlist_nulls_add_head_rcu(&worker->nulls_node, &acct->free_list);
 	}
 }

-static inline unsigned int io_get_work_hash(struct io_wq_work *work)
-{
-	return work->flags >> IO_WQ_HASH_SHIFT;
-}
-
 static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash)
 {
 	bool ret = false;
@@ -473,26 +512,27 @@ static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash)
 }

 static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct,
-					   struct io_worker *worker)
+					   struct io_wq *wq)
 	__must_hold(acct->lock)
 {
 	struct io_wq_work_node *node, *prev;
 	struct io_wq_work *work, *tail;
 	unsigned int stall_hash = -1U;
-	struct io_wq *wq = worker->wq;

 	wq_list_for_each(node, prev, &acct->work_list) {
+		unsigned int work_flags;
 		unsigned int hash;

 		work = container_of(node, struct io_wq_work, list);

 		/* not hashed, can run anytime */
-		if (!io_wq_is_hashed(work)) {
+		work_flags = atomic_read(&work->flags);
+		if (!__io_wq_is_hashed(work_flags)) {
 			wq_list_del(&acct->work_list, node, prev);
 			return work;
 		}

-		hash = io_get_work_hash(work);
+		hash = __io_get_work_hash(work_flags);
 		/* all items with this hash lie in [work, tail] */
 		tail = wq->hash_tail[hash];

@@ -539,7 +579,6 @@ static void io_assign_current_work(struct io_worker *worker,

 	raw_spin_lock(&worker->lock);
 	worker->cur_work = work;
-	worker->next_work = NULL;
 	raw_spin_unlock(&worker->lock);
 }

@@ -563,11 +602,8 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
 		 * can't make progress, any work completion or insertion will
 		 * clear the stalled flag.
 		 */
-		work = io_get_next_work(acct, worker);
-		raw_spin_unlock(&acct->lock);
+		work = io_get_next_work(acct, wq);
 		if (work) {
-			__io_worker_busy(wq, worker);
-
 			/*
 			 * Make sure cancelation can find this, even before
 			 * it becomes the active work. That avoids a window
@@ -576,27 +612,37 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
 			 * current work item for this worker.
 			 */
 			raw_spin_lock(&worker->lock);
-			worker->next_work = work;
+			worker->cur_work = work;
 			raw_spin_unlock(&worker->lock);
-		} else {
-			break;
 		}
+
+		raw_spin_unlock(&acct->lock);
+
+		if (!work)
+			break;
+
+		__io_worker_busy(acct, worker);
+
 		io_assign_current_work(worker, work);
 		__set_current_state(TASK_RUNNING);

 		/* handle a whole dependent link */
 		do {
 			struct io_wq_work *next_hashed, *linked;
-			unsigned int hash = io_get_work_hash(work);
+			unsigned int work_flags = atomic_read(&work->flags);
+			unsigned int hash = __io_wq_is_hashed(work_flags)
+				? __io_get_work_hash(work_flags)
+				: -1U;

 			next_hashed = wq_next_work(work);

-			if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND))
-				work->flags |= IO_WQ_WORK_CANCEL;
-			wq->do_work(work);
+			if (do_kill &&
+			    (work_flags & IO_WQ_WORK_UNBOUND))
+				atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
+			io_wq_submit_work(work);
 			io_assign_current_work(worker, NULL);

-			linked = wq->free_work(work);
+			linked = io_wq_free_work(work);
 			work = next_hashed;
 			if (!work && linked && !io_wq_is_hashed(linked)) {
 				work = linked;
@@ -629,9 +675,10 @@ static int io_wq_worker(void *data)
 	struct io_wq_acct *acct = io_wq_get_acct(worker);
 	struct io_wq *wq = worker->wq;
 	bool exit_mask = false, last_timeout = false;
-	char buf[TASK_COMM_LEN];
+	char buf[TASK_COMM_LEN] = {};

-	worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
+	set_mask_bits(&worker->flags, 0,
+		      BIT(IO_WORKER_F_UP) | BIT(IO_WORKER_F_RUNNING));

 	snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
 	set_task_comm(current, buf);
@@ -648,20 +695,20 @@ static int io_wq_worker(void *data)
 		while (io_acct_run_queue(acct))
 			io_worker_handle_work(acct, worker);

-		raw_spin_lock(&wq->lock);
+		raw_spin_lock(&acct->workers_lock);
 		/*
 		 * Last sleep timed out. Exit if we're not the last worker,
 		 * or if someone modified our affinity.
 		 */
 		if (last_timeout && (exit_mask || acct->nr_workers > 1)) {
 			acct->nr_workers--;
-			raw_spin_unlock(&wq->lock);
+			raw_spin_unlock(&acct->workers_lock);
 			__set_current_state(TASK_RUNNING);
 			break;
 		}
 		last_timeout = false;
-		__io_worker_idle(wq, worker);
-		raw_spin_unlock(&wq->lock);
+		__io_worker_idle(acct, worker);
+		raw_spin_unlock(&acct->workers_lock);
 		if (io_run_task_work())
 			continue;
 		ret = schedule_timeout(WORKER_IDLE_TIMEOUT);
@@ -695,11 +742,11 @@ void io_wq_worker_running(struct task_struct *tsk)
 	if (!worker)
 		return;

-	if (!(worker->flags & IO_WORKER_F_UP))
+	if (!test_bit(IO_WORKER_F_UP, &worker->flags))
 		return;
-	if (worker->flags & IO_WORKER_F_RUNNING)
+	if (test_bit(IO_WORKER_F_RUNNING, &worker->flags))
 		return;
-	worker->flags |= IO_WORKER_F_RUNNING;
+	set_bit(IO_WORKER_F_RUNNING, &worker->flags);
 	io_wq_inc_running(worker);
 }

@@ -713,27 +760,27 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
 	if (!worker)
 		return;

-	if (!(worker->flags & IO_WORKER_F_UP))
+	if (!test_bit(IO_WORKER_F_UP, &worker->flags))
 		return;
-	if (!(worker->flags & IO_WORKER_F_RUNNING))
+	if (!test_bit(IO_WORKER_F_RUNNING, &worker->flags))
 		return;

-	worker->flags &= ~IO_WORKER_F_RUNNING;
+	clear_bit(IO_WORKER_F_RUNNING, &worker->flags);
 	io_wq_dec_running(worker);
 }

-static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker,
+static void io_init_new_worker(struct io_wq *wq, struct io_wq_acct *acct, struct io_worker *worker,
 			       struct task_struct *tsk)
 {
 	tsk->worker_private = worker;
 	worker->task = tsk;
 	set_cpus_allowed_ptr(tsk, wq->cpu_mask);

-	raw_spin_lock(&wq->lock);
-	hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list);
-	list_add_tail_rcu(&worker->all_list, &wq->all_list);
-	worker->flags |= IO_WORKER_F_FREE;
-	raw_spin_unlock(&wq->lock);
+	raw_spin_lock(&acct->workers_lock);
+	hlist_nulls_add_head_rcu(&worker->nulls_node, &acct->free_list);
+	list_add_tail_rcu(&worker->all_list, &acct->all_list);
+	set_bit(IO_WORKER_F_FREE, &worker->flags);
+	raw_spin_unlock(&acct->workers_lock);

 	wake_up_new_task(tsk);
 }
@@ -742,7 +789,7 @@ static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
 	return true;
 }

-static inline bool io_should_retry_thread(long err)
+static inline bool io_should_retry_thread(struct io_worker *worker, long err)
 {
 	/*
 	 * Prevent perpetual task_work retry, if the task (or its group) is
@@ -750,6 +797,8 @@ static inline bool io_should_retry_thread(long err)
 	 */
 	if (fatal_signal_pending(current))
 		return false;
+	if (worker->init_retries++ >= WORKER_INIT_LIMIT)
+		return false;

 	switch (err) {
 	case -EAGAIN:
@@ -762,25 +811,37 @@ static inline bool io_should_retry_thread(long err)
 	}
 }

+static void queue_create_worker_retry(struct io_worker *worker)
+{
+	/*
+	 * We only bother retrying because there's a chance that the
+	 * failure to create a worker is due to some temporary condition
+	 * in the forking task (e.g. outstanding signal); give the task
+	 * some time to clear that condition.
+	 */
+	schedule_delayed_work(&worker->work,
+			      msecs_to_jiffies(worker->init_retries * 5));
+}
+
 static void create_worker_cont(struct callback_head *cb)
 {
 	struct io_worker *worker;
 	struct task_struct *tsk;
 	struct io_wq *wq;
+	struct io_wq_acct *acct;

 	worker = container_of(cb, struct io_worker, create_work);
 	clear_bit_unlock(0, &worker->create_state);
 	wq = worker->wq;
+	acct = io_wq_get_acct(worker);
 	tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE);
 	if (!IS_ERR(tsk)) {
-		io_init_new_worker(wq, worker, tsk);
+		io_init_new_worker(wq, acct, worker, tsk);
 		io_worker_release(worker);
 		return;
-	} else if (!io_should_retry_thread(PTR_ERR(tsk))) {
-		struct io_wq_acct *acct = io_wq_get_acct(worker);
-
+	} else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) {
 		atomic_dec(&acct->nr_running);
-		raw_spin_lock(&wq->lock);
+		raw_spin_lock(&acct->workers_lock);
 		acct->nr_workers--;
 		if (!acct->nr_workers) {
 			struct io_cb_cancel_data match = {
@@ -788,11 +849,11 @@ static void create_worker_cont(struct callback_head *cb)
 				.cancel_all	= true,
 			};

-			raw_spin_unlock(&wq->lock);
+			raw_spin_unlock(&acct->workers_lock);
 			while (io_acct_cancel_pending_work(wq, acct, &match))
 				;
 		} else {
-			raw_spin_unlock(&wq->lock);
+			raw_spin_unlock(&acct->workers_lock);
 		}
 		io_worker_ref_put(wq);
 		kfree(worker);
@@ -801,21 +862,21 @@ static void create_worker_cont(struct callback_head *cb)

 	/* re-create attempts grab a new worker ref, drop the existing one */
 	io_worker_release(worker);
-	schedule_work(&worker->work);
+	queue_create_worker_retry(worker);
 }

 static void io_workqueue_create(struct work_struct *work)
 {
-	struct io_worker *worker = container_of(work, struct io_worker, work);
+	struct io_worker *worker = container_of(work, struct io_worker,
+						work.work);
 	struct io_wq_acct *acct = io_wq_get_acct(worker);

 	if (!io_queue_worker_create(worker, acct, create_worker_cont))
 		kfree(worker);
 }

-static bool create_io_worker(struct io_wq *wq, int index)
+static bool create_io_worker(struct io_wq *wq, struct io_wq_acct *acct)
 {
-	struct io_wq_acct *acct = &wq->acct[index];
 	struct io_worker *worker;
 	struct task_struct *tsk;

@@ -825,30 +886,28 @@
 	if (!worker) {
fail:
 		atomic_dec(&acct->nr_running);
-		raw_spin_lock(&wq->lock);
+		raw_spin_lock(&acct->workers_lock);
 		acct->nr_workers--;
-		raw_spin_unlock(&wq->lock);
+		raw_spin_unlock(&acct->workers_lock);
 		io_worker_ref_put(wq);
 		return false;
 	}

 	refcount_set(&worker->ref, 1);
 	worker->wq = wq;
+	worker->acct = acct;
 	raw_spin_lock_init(&worker->lock);
 	init_completion(&worker->ref_done);

-	if (index == IO_WQ_ACCT_BOUND)
-		worker->flags |= IO_WORKER_F_BOUND;
-
 	tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE);
 	if (!IS_ERR(tsk)) {
-		io_init_new_worker(wq, worker, tsk);
-	} else if (!io_should_retry_thread(PTR_ERR(tsk))) {
+		io_init_new_worker(wq, acct, worker, tsk);
+	} else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) {
 		kfree(worker);
 		goto fail;
 	} else {
-		INIT_WORK(&worker->work, io_workqueue_create);
-		schedule_work(&worker->work);
+		INIT_DELAYED_WORK(&worker->work, io_workqueue_create);
+		queue_create_worker_retry(worker);
 	}

 	return true;
@@ -858,14 +917,14 @@ fail:
  * Iterate the passed in list and call the specific function for each
  * worker that isn't exiting
  */
-static bool io_wq_for_each_worker(struct io_wq *wq,
-				  bool (*func)(struct io_worker *, void *),
-				  void *data)
+static bool io_acct_for_each_worker(struct io_wq_acct *acct,
+				    bool (*func)(struct io_worker *, void *),
+				    void *data)
 {
 	struct io_worker *worker;
 	bool ret = false;

-	list_for_each_entry_rcu(worker, &wq->all_list, all_list) {
+	list_for_each_entry_rcu(worker, &acct->all_list, all_list) {
 		if (io_worker_get(worker)) {
 			/* no task if node is/was offline */
 			if (worker->task)
@@ -879,6 +938,18 @@ static bool io_wq_for_each_worker(struct io_wq *wq,
 	return ret;
 }

+static bool io_wq_for_each_worker(struct io_wq *wq,
+				  bool (*func)(struct io_worker *, void *),
+				  void *data)
+{
+	for (int i = 0; i < IO_WQ_ACCT_NR; i++) {
+		if (!io_acct_for_each_worker(&wq->acct[i], func, data))
+			return false;
+	}
+
+	return true;
+}
+
 static bool io_wq_worker_wake(struct io_worker *worker, void *data)
 {
 	__set_notify_signal(worker->task);
@@ -889,25 +960,25 @@
 static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq)
 {
 	do {
-		work->flags |= IO_WQ_WORK_CANCEL;
-		wq->do_work(work);
-		work = wq->free_work(work);
+		atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
+		io_wq_submit_work(work);
+		work = io_wq_free_work(work);
 	} while (work);
 }

-static void io_wq_insert_work(struct io_wq *wq, struct io_wq_work *work)
+static void io_wq_insert_work(struct io_wq *wq, struct io_wq_acct *acct,
+			      struct io_wq_work *work, unsigned int work_flags)
 {
-	struct io_wq_acct *acct = io_work_get_acct(wq, work);
 	unsigned int hash;
 	struct io_wq_work *tail;

-	if (!io_wq_is_hashed(work)) {
+	if (!__io_wq_is_hashed(work_flags)) {
append:
 		wq_list_add_tail(&work->list, &acct->work_list);
 		return;
 	}

-	hash = io_get_work_hash(work);
+	hash = __io_get_work_hash(work_flags);
 	tail = wq->hash_tail[hash];
 	wq->hash_tail[hash] = work;
 	if (!tail)
@@ -923,9 +994,13 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data)

 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
 {
-	struct io_wq_acct *acct = io_work_get_acct(wq, work);
-	struct io_cb_cancel_data match;
-	unsigned work_flags = work->flags;
+	unsigned int work_flags = atomic_read(&work->flags);
+	struct io_wq_acct *acct = io_work_get_acct(wq, work_flags);
+	struct io_cb_cancel_data match = {
+		.fn		= io_wq_work_match_item,
+		.data		= work,
+		.cancel_all	= false,
+	};
 	bool do_create;

 	/*
@@ -933,18 +1008,18 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
 	 * If io-wq is exiting for this task, or if the request has explicitly
 	 * been marked as one that should not get executed, cancel it here.
 	 */
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
-	    (work->flags & IO_WQ_WORK_CANCEL)) {
+	    (work_flags & IO_WQ_WORK_CANCEL)) {
 		io_run_cancel(work, wq);
 		return;
 	}

 	raw_spin_lock(&acct->lock);
-	io_wq_insert_work(wq, work);
+	io_wq_insert_work(wq, acct, work, work_flags);
 	clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
 	raw_spin_unlock(&acct->lock);

 	rcu_read_lock();
-	do_create = !io_wq_activate_free_worker(wq, acct);
+	do_create = !io_acct_activate_free_worker(acct);
 	rcu_read_unlock();

 	if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) ||
@@ -955,18 +1030,14 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
 		if (likely(did_create))
 			return;

-		raw_spin_lock(&wq->lock);
+		raw_spin_lock(&acct->workers_lock);
 		if (acct->nr_workers) {
-			raw_spin_unlock(&wq->lock);
+			raw_spin_unlock(&acct->workers_lock);
 			return;
 		}
-		raw_spin_unlock(&wq->lock);
+		raw_spin_unlock(&acct->workers_lock);

 		/* fatal condition, failed to create the first worker */
-		match.fn = io_wq_work_match_item,
-		match.data = work,
-		match.cancel_all = false,
-
 		io_acct_cancel_pending_work(wq, acct, &match);
 	}
 }
@@ -980,7 +1051,7 @@ void io_wq_hash_work(struct io_wq_work *work, void *val)
 	unsigned int bit;

 	bit = hash_ptr(val, IO_WQ_HASH_ORDER);
-	work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
+	atomic_or(IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT), &work->flags);
 }

 static bool __io_wq_worker_cancel(struct io_worker *worker,
@@ -988,7 +1059,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker,
 				  struct io_wq_work *work)
 {
 	if (work && match->fn(work, match->data)) {
-		work->flags |= IO_WQ_WORK_CANCEL;
+		atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
 		__set_notify_signal(worker->task);
 		return true;
 	}
@@ -1005,8 +1076,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 	 * may dereference the passed in work.
 	 */
 	raw_spin_lock(&worker->lock);
-	if (__io_wq_worker_cancel(worker, match, worker->cur_work) ||
-	    __io_wq_worker_cancel(worker, match, worker->next_work))
+	if (__io_wq_worker_cancel(worker, match, worker->cur_work))
 		match->nr_running++;
 	raw_spin_unlock(&worker->lock);

@@ -1014,10 +1084,10 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 }

 static inline void io_wq_remove_pending(struct io_wq *wq,
+					struct io_wq_acct *acct,
 					struct io_wq_work *work,
 					struct io_wq_work_node *prev)
 {
-	struct io_wq_acct *acct = io_work_get_acct(wq, work);
 	unsigned int hash = io_get_work_hash(work);
 	struct io_wq_work *prev_work = NULL;

@@ -1044,7 +1114,7 @@ static bool io_acct_cancel_pending_work(struct io_wq *wq,
 		work = container_of(node, struct io_wq_work, list);
 		if (!match->fn(work, match->data))
 			continue;
-		io_wq_remove_pending(wq, work, prev);
+		io_wq_remove_pending(wq, acct, work, prev);
 		raw_spin_unlock(&acct->lock);
 		io_run_cancel(work, wq);
 		match->nr_pending++;
@@ -1072,11 +1142,22 @@ retry:
 	}
 }

+static void io_acct_cancel_running_work(struct io_wq_acct *acct,
+					struct io_cb_cancel_data *match)
+{
+	raw_spin_lock(&acct->workers_lock);
+	io_acct_for_each_worker(acct, io_wq_worker_cancel, match);
+	raw_spin_unlock(&acct->workers_lock);
+}
+
 static void io_wq_cancel_running_work(struct io_wq *wq,
 				      struct io_cb_cancel_data *match)
 {
 	rcu_read_lock();
-	io_wq_for_each_worker(wq, io_wq_worker_cancel, match);
+
+	for (int i = 0; i < IO_WQ_ACCT_NR; i++)
+		io_acct_cancel_running_work(&wq->acct[i], match);
+
 	rcu_read_unlock();
 }

@@ -1099,16 +1180,14 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
 	 * as an indication that we attempt to signal cancellation. The
 	 * completion will run normally in this case.
 	 *
-	 * Do both of these while holding the wq->lock, to ensure that
+	 * Do both of these while holding the acct->workers_lock, to ensure that
 	 * we'll find a work item regardless of state.
 	 */
 	io_wq_cancel_pending_work(wq, &match);
 	if (match.nr_pending && !match.cancel_all)
 		return IO_WQ_CANCEL_OK;

-	raw_spin_lock(&wq->lock);
 	io_wq_cancel_running_work(wq, &match);
-	raw_spin_unlock(&wq->lock);
 	if (match.nr_running && !match.cancel_all)
 		return IO_WQ_CANCEL_RUNNING;

@@ -1132,7 +1211,7 @@ static int io_wq_hash_wake(struct wait_queue_entry *wait, unsigned mode,
 		struct io_wq_acct *acct = &wq->acct[i];

 		if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags))
-			io_wq_activate_free_worker(wq, acct);
+			io_acct_activate_free_worker(acct);
 	}
 	rcu_read_unlock();
 	return 1;
@@ -1143,8 +1222,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	int ret, i;
 	struct io_wq *wq;

-	if (WARN_ON_ONCE(!data->free_work || !data->do_work))
-		return ERR_PTR(-EINVAL);
 	if (WARN_ON_ONCE(!bounded))
 		return ERR_PTR(-EINVAL);

@@ -1154,14 +1231,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)

 	refcount_inc(&data->hash->refs);
 	wq->hash = data->hash;
-	wq->free_work = data->free_work;
-	wq->do_work = data->do_work;

 	ret = -ENOMEM;

 	if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL))
 		goto err;
-	cpumask_copy(wq->cpu_mask, cpu_possible_mask);
+	cpuset_cpus_allowed(data->task, wq->cpu_mask);
 	wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
 	wq->acct[IO_WQ_ACCT_UNBOUND].max_workers =
 				task_rlimit(current, RLIMIT_NPROC);
@@ -1170,16 +1245,16 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	for (i = 0; i < IO_WQ_ACCT_NR; i++) {
 		struct io_wq_acct *acct = &wq->acct[i];

-		acct->index = i;
 		atomic_set(&acct->nr_running, 0);
+
+		raw_spin_lock_init(&acct->workers_lock);
+		INIT_HLIST_NULLS_HEAD(&acct->free_list, 0);
+		INIT_LIST_HEAD(&acct->all_list);
+
 		INIT_WQ_LIST(&acct->work_list);
 		raw_spin_lock_init(&acct->lock);
 	}

-	raw_spin_lock_init(&wq->lock);
-	INIT_HLIST_NULLS_HEAD(&wq->free_list, 0);
-	INIT_LIST_HEAD(&wq->all_list);
-
 	wq->task = get_task_struct(data->task);
 	atomic_set(&wq->worker_refs, 1);
 	init_completion(&wq->worker_done);
@@ -1316,17 +1391,29 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)

 int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask)
 {
+	cpumask_var_t allowed_mask;
+	int ret = 0;
+
 	if (!tctx || !tctx->io_wq)
 		return -EINVAL;

+	if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
+		return -ENOMEM;
+
 	rcu_read_lock();
-	if (mask)
-		cpumask_copy(tctx->io_wq->cpu_mask, mask);
-	else
-		cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask);
+	cpuset_cpus_allowed(tctx->io_wq->task, allowed_mask);
+	if (mask) {
+		if (cpumask_subset(mask, allowed_mask))
+			cpumask_copy(tctx->io_wq->cpu_mask, mask);
+		else
+			ret = -EINVAL;
+	} else {
+		cpumask_copy(tctx->io_wq->cpu_mask, allowed_mask);
+	}
 	rcu_read_unlock();

-	return 0;
+	free_cpumask_var(allowed_mask);
+	return ret;
 }

 /*
@@ -1353,14 +1440,14 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count)

 	rcu_read_lock();
-	raw_spin_lock(&wq->lock);

 	for (i = 0; i < IO_WQ_ACCT_NR; i++) {
 		acct = &wq->acct[i];
+		raw_spin_lock(&acct->workers_lock);
 		prev[i] = max_t(int, acct->max_workers, prev[i]);
 		if (new_count[i])
 			acct->max_workers = new_count[i];
+		raw_spin_unlock(&acct->workers_lock);
 	}
-	raw_spin_unlock(&wq->lock);
 	rcu_read_unlock();

 	for (i = 0; i < IO_WQ_ACCT_NR; i++)
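
Note: the io_wq_hash_defer() helper added above keys its deferral decision off the hash bits packed into the upper part of the work flags word: creating a replacement worker is skipped only when the current work and the next queued work are both hashed and land on the same chain. The following is a minimal standalone userspace sketch of that comparison, not kernel code; the SKETCH_* constants are hypothetical stand-ins for the kernel's IO_WQ_HASH_SHIFT / IO_WQ_WORK_HASHED definitions.

#include <stdbool.h>
#include <stdio.h>

#define SKETCH_HASH_SHIFT	24		/* stand-in for IO_WQ_HASH_SHIFT */
#define SKETCH_WORK_HASHED	(1u << 23)	/* stand-in for IO_WQ_WORK_HASHED */

/* extract the hash bucket encoded in the upper bits of the flags word */
static unsigned int sketch_get_hash(unsigned int flags)
{
	return flags >> SKETCH_HASH_SHIFT;
}

/* defer only if both works are hashed and hash to the same chain */
static bool sketch_hash_defer(unsigned int cur_flags, unsigned int next_flags)
{
	if (!(cur_flags & SKETCH_WORK_HASHED) || !(next_flags & SKETCH_WORK_HASHED))
		return false;
	return sketch_get_hash(cur_flags) == sketch_get_hash(next_flags);
}

int main(void)
{
	unsigned int a = SKETCH_WORK_HASHED | (3u << SKETCH_HASH_SHIFT);
	unsigned int b = SKETCH_WORK_HASHED | (3u << SKETCH_HASH_SHIFT);
	unsigned int c = SKETCH_WORK_HASHED | (5u << SKETCH_HASH_SHIFT);

	printf("same chain, defer: %d\n", sketch_hash_defer(a, b));	/* 1 */
	printf("different chain, don't defer: %d\n", sketch_hash_defer(a, c));	/* 0 */
	return 0;
}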