diff options
Diffstat (limited to 'kernel/workqueue.c')
-rw-r--r-- | kernel/workqueue.c | 2945 |
1 files changed, 1893 insertions, 1052 deletions
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 894bb885b40b..7b482a26d741 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -49,7 +49,10 @@ #include <linux/moduleparam.h> #include <linux/uaccess.h> #include <linux/sched/isolation.h> +#include <linux/sched/debug.h> #include <linux/nmi.h> +#include <linux/kvm_para.h> +#include <linux/delay.h> #include "workqueue_internal.h" @@ -119,10 +122,11 @@ enum { * * L: pool->lock protected. Access with pool->lock held. * - * X: During normal operation, modification requires pool->lock and should - * be done only from local cpu. Either disabling preemption on local - * cpu or grabbing pool->lock is enough for read access. If - * POOL_DISASSOCIATED is set, it's identical to L. + * K: Only modified by worker while holding pool->lock. Can be safely read by + * self, while holding pool->lock or from IRQ context if %current is the + * kworker. + * + * S: Only modified by worker self. * * A: wq_pool_attach_mutex protected. * @@ -140,6 +144,8 @@ enum { * WR: wq->mutex protected for writes. RCU protected for reads. * * MD: wq_mayday_lock protected. + * + * WD: Used internally by the watchdog. */ /* struct worker is defined in workqueue_internal.h */ @@ -149,18 +155,29 @@ struct worker_pool { int cpu; /* I: the associated cpu */ int node; /* I: the associated node ID */ int id; /* I: pool ID */ - unsigned int flags; /* X: flags */ + unsigned int flags; /* L: flags */ unsigned long watchdog_ts; /* L: watchdog timestamp */ + bool cpu_stall; /* WD: stalled cpu bound pool */ + + /* + * The counter is incremented in a process context on the associated CPU + * w/ preemption disabled, and decremented or reset in the same context + * but w/ pool->lock held. The readers grab pool->lock and are + * guaranteed to see if the counter reached zero. + */ + int nr_running; struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ int nr_idle; /* L: currently idle workers */ - struct list_head idle_list; /* X: list of idle workers */ + struct list_head idle_list; /* L: list of idle workers */ struct timer_list idle_timer; /* L: worker idle timeout */ - struct timer_list mayday_timer; /* L: SOS timer for workers */ + struct work_struct idle_cull_work; /* L: worker idle cleanup */ + + struct timer_list mayday_timer; /* L: SOS timer for workers */ /* a workers is either on busy_hash or idle_list, or the manager */ DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); @@ -168,6 +185,7 @@ struct worker_pool { struct worker *manager; /* L: purely informational */ struct list_head workers; /* A: attached workers */ + struct list_head dying_workers; /* A: workers about to die */ struct completion *detach_completion; /* all workers detached */ struct ida worker_ida; /* worker IDs for task name */ @@ -177,18 +195,28 @@ struct worker_pool { int refcnt; /* PL: refcnt for unbound pools */ /* - * The current concurrency level. As it's likely to be accessed - * from other CPUs during try_to_wake_up(), put it in a separate - * cacheline. - */ - atomic_t nr_running ____cacheline_aligned_in_smp; - - /* * Destruction of pool is RCU protected to allow dereferences * from get_work_pool(). */ struct rcu_head rcu; -} ____cacheline_aligned_in_smp; +}; + +/* + * Per-pool_workqueue statistics. These can be monitored using + * tools/workqueue/wq_monitor.py. + */ +enum pool_workqueue_stats { + PWQ_STAT_STARTED, /* work items started execution */ + PWQ_STAT_COMPLETED, /* work items completed execution */ + PWQ_STAT_CPU_TIME, /* total CPU time consumed */ + PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */ + PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */ + PWQ_STAT_REPATRIATED, /* unbound workers brought back into scope */ + PWQ_STAT_MAYDAY, /* maydays to rescuer */ + PWQ_STAT_RESCUED, /* linked work items executed by rescuer */ + + PWQ_NR_STATS, +}; /* * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS @@ -204,19 +232,38 @@ struct pool_workqueue { int refcnt; /* L: reference count */ int nr_in_flight[WORK_NR_COLORS]; /* L: nr of in_flight works */ + + /* + * nr_active management and WORK_STRUCT_INACTIVE: + * + * When pwq->nr_active >= max_active, new work item is queued to + * pwq->inactive_works instead of pool->worklist and marked with + * WORK_STRUCT_INACTIVE. + * + * All work items marked with WORK_STRUCT_INACTIVE do not participate + * in pwq->nr_active and all work items in pwq->inactive_works are + * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE + * work items are in pwq->inactive_works. Some of them are ready to + * run in pool->worklist or worker->scheduled. Those work itmes are + * only struct wq_barrier which is used for flush_work() and should + * not participate in pwq->nr_active. For non-barrier work item, it + * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works. + */ int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ - struct list_head delayed_works; /* L: delayed works */ + struct list_head inactive_works; /* L: inactive works */ struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ + u64 stats[PWQ_NR_STATS]; + /* - * Release of unbound pwq is punted to system_wq. See put_pwq() - * and pwq_unbound_release_workfn() for details. pool_workqueue - * itself is also RCU protected so that the first pwq can be - * determined without grabbing wq->mutex. + * Release of unbound pwq is punted to a kthread_worker. See put_pwq() + * and pwq_release_workfn() for details. pool_workqueue itself is also + * RCU protected so that the first pwq can be determined without + * grabbing wq->mutex. */ - struct work_struct unbound_release_work; + struct kthread_work release_work; struct rcu_head rcu; } __aligned(1 << WORK_STRUCT_FLAG_BITS); @@ -275,17 +322,43 @@ struct workqueue_struct { /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ - struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ - struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */ + struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */ }; static struct kmem_cache *pwq_cache; -static cpumask_var_t *wq_numa_possible_cpumask; - /* possible CPUs of each node */ +/* + * Each pod type describes how CPUs should be grouped for unbound workqueues. + * See the comment above workqueue_attrs->affn_scope. + */ +struct wq_pod_type { + int nr_pods; /* number of pods */ + cpumask_var_t *pod_cpus; /* pod -> cpus */ + int *pod_node; /* pod -> node */ + int *cpu_pod; /* cpu -> pod */ +}; + +static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; +static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE; -static bool wq_disable_numa; -module_param_named(disable_numa, wq_disable_numa, bool, 0444); +static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { + [WQ_AFFN_DFL] = "default", + [WQ_AFFN_CPU] = "cpu", + [WQ_AFFN_SMT] = "smt", + [WQ_AFFN_CACHE] = "cache", + [WQ_AFFN_NUMA] = "numa", + [WQ_AFFN_SYSTEM] = "system", +}; + +/* + * Per-cpu work items which run for longer than the following threshold are + * automatically considered CPU intensive and excluded from concurrency + * management to prevent them from noticeably delaying other per-cpu work items. + * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter. + * The actual value is initialized in wq_cpu_intensive_thresh_init(). + */ +static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX; +module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644); /* see the comment above the definition of WQ_POWER_EFFICIENT */ static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); @@ -293,10 +366,8 @@ module_param_named(power_efficient, wq_power_efficient, bool, 0444); static bool wq_online; /* can kworkers be created yet? */ -static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ - -/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ -static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; +/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */ +static struct workqueue_attrs *wq_update_pod_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ @@ -307,9 +378,18 @@ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait); static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ -/* PL: allowable cpus for unbound wqs and work items */ +/* PL&A: allowable cpus for unbound wqs and work items */ static cpumask_var_t wq_unbound_cpumask; +/* PL: user requested unbound cpumask via sysfs */ +static cpumask_var_t wq_requested_unbound_cpumask; + +/* PL: isolated cpumask to be excluded from unbound cpumask */ +static cpumask_var_t wq_isolated_cpumask; + +/* for further constrain wq_unbound_cpumask by cmdline parameter*/ +static struct cpumask wq_cmdline_cpumask __initdata; + /* CPU where unbound work was last round robin scheduled from this CPU */ static DEFINE_PER_CPU(int, wq_rr_cpu_last); @@ -339,24 +419,32 @@ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; /* I: attributes used when instantiating ordered pools on demand */ static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; -struct workqueue_struct *system_wq __read_mostly; +/* + * I: kthread_worker to release pwq's. pwq release needs to be bounced to a + * process context while holding a pool lock. Bounce to a dedicated kthread + * worker to avoid A-A deadlocks. + */ +static struct kthread_worker *pwq_release_worker __ro_after_init; + +struct workqueue_struct *system_wq __ro_after_init; EXPORT_SYMBOL(system_wq); -struct workqueue_struct *system_highpri_wq __read_mostly; +struct workqueue_struct *system_highpri_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_highpri_wq); -struct workqueue_struct *system_long_wq __read_mostly; +struct workqueue_struct *system_long_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_long_wq); -struct workqueue_struct *system_unbound_wq __read_mostly; +struct workqueue_struct *system_unbound_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_unbound_wq); -struct workqueue_struct *system_freezable_wq __read_mostly; +struct workqueue_struct *system_freezable_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_freezable_wq); -struct workqueue_struct *system_power_efficient_wq __read_mostly; +struct workqueue_struct *system_power_efficient_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_power_efficient_wq); -struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly; +struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); static void workqueue_sysfs_unregister(struct workqueue_struct *wq); static void show_pwq(struct pool_workqueue *pwq); +static void show_one_worker_pool(struct worker_pool *pool); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> @@ -523,7 +611,7 @@ static inline void debug_work_deactivate(struct work_struct *work) { } #endif /** - * worker_pool_assign_id - allocate ID and assing it to @pool + * worker_pool_assign_id - allocate ID and assign it to @pool * @pool: the pool pointer of interest * * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned @@ -544,43 +632,14 @@ static int worker_pool_assign_id(struct worker_pool *pool) return ret; } -/** - * unbound_pwq_by_node - return the unbound pool_workqueue for the given node - * @wq: the target workqueue - * @node: the node ID - * - * This must be called with any of wq_pool_mutex, wq->mutex or RCU - * read locked. - * If the pwq needs to be used beyond the locking in effect, the caller is - * responsible for guaranteeing that the pwq stays online. - * - * Return: The unbound pool_workqueue for @node. - */ -static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, - int node) -{ - assert_rcu_or_wq_mutex_or_pool_mutex(wq); - - /* - * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a - * delayed item is pending. The plan is to keep CPU -> NODE - * mapping valid and stable across CPU on/offlines. Once that - * happens, this workaround can be removed. - */ - if (unlikely(node == NUMA_NO_NODE)) - return wq->dfl_pwq; - - return rcu_dereference_raw(wq->numa_pwq_tbl[node]); -} - static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; } -static int get_work_color(struct work_struct *work) +static int get_work_color(unsigned long work_data) { - return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) & + return (work_data >> WORK_STRUCT_COLOR_SHIFT) & ((1 << WORK_STRUCT_COLOR_BITS) - 1); } @@ -678,12 +737,17 @@ static void clear_work_data(struct work_struct *work) set_work_data(work, WORK_STRUCT_NO_POOL, 0); } +static inline struct pool_workqueue *work_struct_pwq(unsigned long data) +{ + return (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK); +} + static struct pool_workqueue *get_work_pwq(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); if (data & WORK_STRUCT_PWQ) - return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); + return work_struct_pwq(data); else return NULL; } @@ -711,8 +775,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work) assert_rcu_or_pool_mutex(); if (data & WORK_STRUCT_PWQ) - return ((struct pool_workqueue *) - (data & WORK_STRUCT_WQ_DATA_MASK))->pool; + return work_struct_pwq(data)->pool; pool_id = data >> WORK_OFFQ_POOL_SHIFT; if (pool_id == WORK_OFFQ_POOL_NONE) @@ -733,8 +796,7 @@ static int get_work_pool_id(struct work_struct *work) unsigned long data = atomic_long_read(&work->data); if (data & WORK_STRUCT_PWQ) - return ((struct pool_workqueue *) - (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id; + return work_struct_pwq(data)->pool->id; return data >> WORK_OFFQ_POOL_SHIFT; } @@ -760,11 +822,6 @@ static bool work_is_canceling(struct work_struct *work) * they're being called with pool->lock held. */ -static bool __need_more_worker(struct worker_pool *pool) -{ - return !atomic_read(&pool->nr_running); -} - /* * Need to wake up a worker? Called from anything but currently * running workers. @@ -775,7 +832,7 @@ static bool __need_more_worker(struct worker_pool *pool) */ static bool need_more_worker(struct worker_pool *pool) { - return !list_empty(&pool->worklist) && __need_more_worker(pool); + return !list_empty(&pool->worklist) && !pool->nr_running; } /* Can I start working? Called from busy but !running workers. */ @@ -787,8 +844,7 @@ static bool may_start_working(struct worker_pool *pool) /* Do I need to keep working? Called from currently running workers. */ static bool keep_working(struct worker_pool *pool) { - return !list_empty(&pool->worklist) && - atomic_read(&pool->nr_running) <= 1; + return !list_empty(&pool->worklist) && (pool->nr_running <= 1); } /* Do we need a new worker? Called from manager. */ @@ -807,154 +863,23 @@ static bool too_many_workers(struct worker_pool *pool) return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } -/* - * Wake up functions. - */ - -/* Return the first idle worker. Safe with preemption disabled */ -static struct worker *first_idle_worker(struct worker_pool *pool) -{ - if (unlikely(list_empty(&pool->idle_list))) - return NULL; - - return list_first_entry(&pool->idle_list, struct worker, entry); -} - -/** - * wake_up_worker - wake up an idle worker - * @pool: worker pool to wake worker from - * - * Wake up the first idle worker of @pool. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock). - */ -static void wake_up_worker(struct worker_pool *pool) -{ - struct worker *worker = first_idle_worker(pool); - - if (likely(worker)) - wake_up_process(worker->task); -} - -/** - * wq_worker_running - a worker is running again - * @task: task waking up - * - * This function is called when a worker returns from schedule() - */ -void wq_worker_running(struct task_struct *task) -{ - struct worker *worker = kthread_data(task); - - if (!worker->sleeping) - return; - if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(&worker->pool->nr_running); - worker->sleeping = 0; -} - -/** - * wq_worker_sleeping - a worker is going to sleep - * @task: task going to sleep - * - * This function is called from schedule() when a busy worker is - * going to sleep. Preemption needs to be disabled to protect ->sleeping - * assignment. - */ -void wq_worker_sleeping(struct task_struct *task) -{ - struct worker *next, *worker = kthread_data(task); - struct worker_pool *pool; - - /* - * Rescuers, which may not have all the fields set up like normal - * workers, also reach here, let's not access anything before - * checking NOT_RUNNING. - */ - if (worker->flags & WORKER_NOT_RUNNING) - return; - - pool = worker->pool; - - /* Return if preempted before wq_worker_running() was reached */ - if (worker->sleeping) - return; - - worker->sleeping = 1; - raw_spin_lock_irq(&pool->lock); - - /* - * The counterpart of the following dec_and_test, implied mb, - * worklist not empty test sequence is in insert_work(). - * Please read comment there. - * - * NOT_RUNNING is clear. This means that we're bound to and - * running on the local cpu w/ rq lock held and preemption - * disabled, which in turn means that none else could be - * manipulating idle_list, so dereferencing idle_list without pool - * lock is safe. - */ - if (atomic_dec_and_test(&pool->nr_running) && - !list_empty(&pool->worklist)) { - next = first_idle_worker(pool); - if (next) - wake_up_process(next->task); - } - raw_spin_unlock_irq(&pool->lock); -} - -/** - * wq_worker_last_func - retrieve worker's last work function - * @task: Task to retrieve last work function of. - * - * Determine the last function a worker executed. This is called from - * the scheduler to get a worker's last known identity. - * - * CONTEXT: - * raw_spin_lock_irq(rq->lock) - * - * This function is called during schedule() when a kworker is going - * to sleep. It's used by psi to identify aggregation workers during - * dequeuing, to allow periodic aggregation to shut-off when that - * worker is the last task in the system or cgroup to go to sleep. - * - * As this function doesn't involve any workqueue-related locking, it - * only returns stable values when called from inside the scheduler's - * queuing and dequeuing paths, when @task, which must be a kworker, - * is guaranteed to not be processing any works. - * - * Return: - * The last work function %current executed as a worker, NULL if it - * hasn't executed any work yet. - */ -work_func_t wq_worker_last_func(struct task_struct *task) -{ - struct worker *worker = kthread_data(task); - - return worker->last_func; -} - /** * worker_set_flags - set worker flags and adjust nr_running accordingly * @worker: self * @flags: flags to set * * Set @flags in @worker->flags and adjust nr_running accordingly. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock) */ static inline void worker_set_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; - WARN_ON_ONCE(worker->task != current); + lockdep_assert_held(&pool->lock); /* If transitioning into NOT_RUNNING, adjust nr_running. */ if ((flags & WORKER_NOT_RUNNING) && !(worker->flags & WORKER_NOT_RUNNING)) { - atomic_dec(&pool->nr_running); + pool->nr_running--; } worker->flags |= flags; @@ -966,16 +891,13 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags) * @flags: flags to clear * * Clear @flags in @worker->flags and adjust nr_running accordingly. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock) */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; unsigned int oflags = worker->flags; - WARN_ON_ONCE(worker->task != current); + lockdep_assert_held(&pool->lock); worker->flags &= ~flags; @@ -986,7 +908,70 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(&pool->nr_running); + pool->nr_running++; +} + +/* Return the first idle worker. Called with pool->lock held. */ +static struct worker *first_idle_worker(struct worker_pool *pool) +{ + if (unlikely(list_empty(&pool->idle_list))) + return NULL; + + return list_first_entry(&pool->idle_list, struct worker, entry); +} + +/** + * worker_enter_idle - enter idle state + * @worker: worker which is entering idle state + * + * @worker is entering idle state. Update stats and idle timer if + * necessary. + * + * LOCKING: + * raw_spin_lock_irq(pool->lock). + */ +static void worker_enter_idle(struct worker *worker) +{ + struct worker_pool *pool = worker->pool; + + if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || + WARN_ON_ONCE(!list_empty(&worker->entry) && + (worker->hentry.next || worker->hentry.pprev))) + return; + + /* can't use worker_set_flags(), also called from create_worker() */ + worker->flags |= WORKER_IDLE; + pool->nr_idle++; + worker->last_active = jiffies; + + /* idle_list is LIFO */ + list_add(&worker->entry, &pool->idle_list); + + if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) + mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); + + /* Sanity check nr_running. */ + WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); +} + +/** + * worker_leave_idle - leave idle state + * @worker: worker which is leaving idle state + * + * @worker is leaving idle state. Update stats. + * + * LOCKING: + * raw_spin_lock_irq(pool->lock). + */ +static void worker_leave_idle(struct worker *worker) +{ + struct worker_pool *pool = worker->pool; + + if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) + return; + worker_clr_flags(worker, WORKER_IDLE); + pool->nr_idle--; + list_del_init(&worker->entry); } /** @@ -1042,13 +1027,10 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool, * @head: target list to append @work to * @nextp: out parameter for nested worklist walking * - * Schedule linked works starting from @work to @head. Work series to - * be scheduled starts at @work and includes any consecutive work with - * WORK_STRUCT_LINKED set in its predecessor. - * - * If @nextp is not NULL, it's updated to point to the next work of - * the last scheduled work. This allows move_linked_works() to be - * nested inside outer list_for_each_entry_safe(). + * Schedule linked works starting from @work to @head. Work series to be + * scheduled starts at @work and includes any consecutive work with + * WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on + * @nextp. * * CONTEXT: * raw_spin_lock_irq(pool->lock). @@ -1078,6 +1060,348 @@ static void move_linked_works(struct work_struct *work, struct list_head *head, } /** + * assign_work - assign a work item and its linked work items to a worker + * @work: work to assign + * @worker: worker to assign to + * @nextp: out parameter for nested worklist walking + * + * Assign @work and its linked work items to @worker. If @work is already being + * executed by another worker in the same pool, it'll be punted there. + * + * If @nextp is not NULL, it's updated to point to the next work of the last + * scheduled work. This allows assign_work() to be nested inside + * list_for_each_entry_safe(). + * + * Returns %true if @work was successfully assigned to @worker. %false if @work + * was punted to another worker already executing it. + */ +static bool assign_work(struct work_struct *work, struct worker *worker, + struct work_struct **nextp) +{ + struct worker_pool *pool = worker->pool; + struct worker *collision; + + lockdep_assert_held(&pool->lock); + + /* + * A single work shouldn't be executed concurrently by multiple workers. + * __queue_work() ensures that @work doesn't jump to a different pool + * while still running in the previous pool. Here, we should ensure that + * @work is not executed concurrently by multiple workers from the same + * pool. Check whether anyone is already processing the work. If so, + * defer the work to the currently executing one. + */ + collision = find_worker_executing_work(pool, work); + if (unlikely(collision)) { + move_linked_works(work, &collision->scheduled, nextp); + return false; + } + + move_linked_works(work, &worker->scheduled, nextp); + return true; +} + +/** + * kick_pool - wake up an idle worker if necessary + * @pool: pool to kick + * + * @pool may have pending work items. Wake up worker if necessary. Returns + * whether a worker was woken up. + */ +static bool kick_pool(struct worker_pool *pool) +{ + struct worker *worker = first_idle_worker(pool); + struct task_struct *p; + + lockdep_assert_held(&pool->lock); + + if (!need_more_worker(pool) || !worker) + return false; + + p = worker->task; + +#ifdef CONFIG_SMP + /* + * Idle @worker is about to execute @work and waking up provides an + * opportunity to migrate @worker at a lower cost by setting the task's + * wake_cpu field. Let's see if we want to move @worker to improve + * execution locality. + * + * We're waking the worker that went idle the latest and there's some + * chance that @worker is marked idle but hasn't gone off CPU yet. If + * so, setting the wake_cpu won't do anything. As this is a best-effort + * optimization and the race window is narrow, let's leave as-is for + * now. If this becomes pronounced, we can skip over workers which are + * still on cpu when picking an idle worker. + * + * If @pool has non-strict affinity, @worker might have ended up outside + * its affinity scope. Repatriate. + */ + if (!pool->attrs->affn_strict && + !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) { + struct work_struct *work = list_first_entry(&pool->worklist, + struct work_struct, entry); + p->wake_cpu = cpumask_any_distribute(pool->attrs->__pod_cpumask); + get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++; + } +#endif + wake_up_process(p); + return true; +} + +#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT + +/* + * Concurrency-managed per-cpu work items that hog CPU for longer than + * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism, + * which prevents them from stalling other concurrency-managed work items. If a + * work function keeps triggering this mechanism, it's likely that the work item + * should be using an unbound workqueue instead. + * + * wq_cpu_intensive_report() tracks work functions which trigger such conditions + * and report them so that they can be examined and converted to use unbound + * workqueues as appropriate. To avoid flooding the console, each violating work + * function is tracked and reported with exponential backoff. + */ +#define WCI_MAX_ENTS 128 + +struct wci_ent { + work_func_t func; + atomic64_t cnt; + struct hlist_node hash_node; +}; + +static struct wci_ent wci_ents[WCI_MAX_ENTS]; +static int wci_nr_ents; +static DEFINE_RAW_SPINLOCK(wci_lock); +static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS)); + +static struct wci_ent *wci_find_ent(work_func_t func) +{ + struct wci_ent *ent; + + hash_for_each_possible_rcu(wci_hash, ent, hash_node, + (unsigned long)func) { + if (ent->func == func) + return ent; + } + return NULL; +} + +static void wq_cpu_intensive_report(work_func_t func) +{ + struct wci_ent *ent; + +restart: + ent = wci_find_ent(func); + if (ent) { + u64 cnt; + + /* + * Start reporting from the fourth time and back off + * exponentially. + */ + cnt = atomic64_inc_return_relaxed(&ent->cnt); + if (cnt >= 4 && is_power_of_2(cnt)) + printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n", + ent->func, wq_cpu_intensive_thresh_us, + atomic64_read(&ent->cnt)); + return; + } + + /* + * @func is a new violation. Allocate a new entry for it. If wcn_ents[] + * is exhausted, something went really wrong and we probably made enough + * noise already. + */ + if (wci_nr_ents >= WCI_MAX_ENTS) + return; + + raw_spin_lock(&wci_lock); + + if (wci_nr_ents >= WCI_MAX_ENTS) { + raw_spin_unlock(&wci_lock); + return; + } + + if (wci_find_ent(func)) { + raw_spin_unlock(&wci_lock); + goto restart; + } + + ent = &wci_ents[wci_nr_ents++]; + ent->func = func; + atomic64_set(&ent->cnt, 1); + hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func); + + raw_spin_unlock(&wci_lock); +} + +#else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ +static void wq_cpu_intensive_report(work_func_t func) {} +#endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ + +/** + * wq_worker_running - a worker is running again + * @task: task waking up + * + * This function is called when a worker returns from schedule() + */ +void wq_worker_running(struct task_struct *task) +{ + struct worker *worker = kthread_data(task); + + if (!READ_ONCE(worker->sleeping)) + return; + + /* + * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check + * and the nr_running increment below, we may ruin the nr_running reset + * and leave with an unexpected pool->nr_running == 1 on the newly unbound + * pool. Protect against such race. + */ + preempt_disable(); + if (!(worker->flags & WORKER_NOT_RUNNING)) + worker->pool->nr_running++; + preempt_enable(); + + /* + * CPU intensive auto-detection cares about how long a work item hogged + * CPU without sleeping. Reset the starting timestamp on wakeup. + */ + worker->current_at = worker->task->se.sum_exec_runtime; + + WRITE_ONCE(worker->sleeping, 0); +} + +/** + * wq_worker_sleeping - a worker is going to sleep + * @task: task going to sleep + * + * This function is called from schedule() when a busy worker is + * going to sleep. + */ +void wq_worker_sleeping(struct task_struct *task) +{ + struct worker *worker = kthread_data(task); + struct worker_pool *pool; + + /* + * Rescuers, which may not have all the fields set up like normal + * workers, also reach here, let's not access anything before + * checking NOT_RUNNING. + */ + if (worker->flags & WORKER_NOT_RUNNING) + return; + + pool = worker->pool; + + /* Return if preempted before wq_worker_running() was reached */ + if (READ_ONCE(worker->sleeping)) + return; + + WRITE_ONCE(worker->sleeping, 1); + raw_spin_lock_irq(&pool->lock); + + /* + * Recheck in case unbind_workers() preempted us. We don't + * want to decrement nr_running after the worker is unbound + * and nr_running has been reset. + */ + if (worker->flags & WORKER_NOT_RUNNING) { + raw_spin_unlock_irq(&pool->lock); + return; + } + + pool->nr_running--; + if (kick_pool(pool)) + worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++; + + raw_spin_unlock_irq(&pool->lock); +} + +/** + * wq_worker_tick - a scheduler tick occurred while a kworker is running + * @task: task currently running + * + * Called from scheduler_tick(). We're in the IRQ context and the current + * worker's fields which follow the 'K' locking rule can be accessed safely. + */ +void wq_worker_tick(struct task_struct *task) +{ + struct worker *worker = kthread_data(task); + struct pool_workqueue *pwq = worker->current_pwq; + struct worker_pool *pool = worker->pool; + + if (!pwq) + return; + + pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC; + + if (!wq_cpu_intensive_thresh_us) + return; + + /* + * If the current worker is concurrency managed and hogged the CPU for + * longer than wq_cpu_intensive_thresh_us, it's automatically marked + * CPU_INTENSIVE to avoid stalling other concurrency-managed work items. + * + * Set @worker->sleeping means that @worker is in the process of + * switching out voluntarily and won't be contributing to + * @pool->nr_running until it wakes up. As wq_worker_sleeping() also + * decrements ->nr_running, setting CPU_INTENSIVE here can lead to + * double decrements. The task is releasing the CPU anyway. Let's skip. + * We probably want to make this prettier in the future. + */ + if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) || + worker->task->se.sum_exec_runtime - worker->current_at < + wq_cpu_intensive_thresh_us * NSEC_PER_USEC) + return; + + raw_spin_lock(&pool->lock); + + worker_set_flags(worker, WORKER_CPU_INTENSIVE); + wq_cpu_intensive_report(worker->current_func); + pwq->stats[PWQ_STAT_CPU_INTENSIVE]++; + + if (kick_pool(pool)) + pwq->stats[PWQ_STAT_CM_WAKEUP]++; + + raw_spin_unlock(&pool->lock); +} + +/** + * wq_worker_last_func - retrieve worker's last work function + * @task: Task to retrieve last work function of. + * + * Determine the last function a worker executed. This is called from + * the scheduler to get a worker's last known identity. + * + * CONTEXT: + * raw_spin_lock_irq(rq->lock) + * + * This function is called during schedule() when a kworker is going + * to sleep. It's used by psi to identify aggregation workers during + * dequeuing, to allow periodic aggregation to shut-off when that + * worker is the last task in the system or cgroup to go to sleep. + * + * As this function doesn't involve any workqueue-related locking, it + * only returns stable values when called from inside the scheduler's + * queuing and dequeuing paths, when @task, which must be a kworker, + * is guaranteed to not be processing any works. + * + * Return: + * The last work function %current executed as a worker, NULL if it + * hasn't executed any work yet. + */ +work_func_t wq_worker_last_func(struct task_struct *task) +{ + struct worker *worker = kthread_data(task); + + return worker->last_func; +} + +/** * get_pwq - get an extra reference on the specified pool_workqueue * @pwq: pool_workqueue to get * @@ -1103,17 +1427,11 @@ static void put_pwq(struct pool_workqueue *pwq) lockdep_assert_held(&pwq->pool->lock); if (likely(--pwq->refcnt)) return; - if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND))) - return; /* - * @pwq can't be released under pool->lock, bounce to - * pwq_unbound_release_workfn(). This never recurses on the same - * pool->lock as this path is taken only for unbound workqueues and - * the release work item is scheduled on a per-cpu workqueue. To - * avoid lockdep warning, unbound pool->locks are given lockdep - * subclass of 1 in get_unbound_pool(). + * @pwq can't be released under pool->lock, bounce to a dedicated + * kthread_worker to avoid A-A deadlocks. */ - schedule_work(&pwq->unbound_release_work); + kthread_queue_work(pwq_release_worker, &pwq->release_work); } /** @@ -1135,7 +1453,7 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) } } -static void pwq_activate_delayed_work(struct work_struct *work) +static void pwq_activate_inactive_work(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); @@ -1143,22 +1461,22 @@ static void pwq_activate_delayed_work(struct work_struct *work) if (list_empty(&pwq->pool->worklist)) pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); - __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); + __clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work)); pwq->nr_active++; } -static void pwq_activate_first_delayed(struct pool_workqueue *pwq) +static void pwq_activate_first_inactive(struct pool_workqueue *pwq) { - struct work_struct *work = list_first_entry(&pwq->delayed_works, + struct work_struct *work = list_first_entry(&pwq->inactive_works, struct work_struct, entry); - pwq_activate_delayed_work(work); + pwq_activate_inactive_work(work); } /** * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight * @pwq: pwq of interest - * @color: color of work which left the queue + * @work_data: work_data of work which left the queue * * A work either has completed or is removed from pending queue, * decrement nr_in_flight of its pwq and handle workqueue flushing. @@ -1166,21 +1484,21 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq) * CONTEXT: * raw_spin_lock_irq(pool->lock). */ -static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) +static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data) { - /* uncolored work items don't participate in flushing or nr_active */ - if (color == WORK_NO_COLOR) - goto out_put; + int color = get_work_color(work_data); - pwq->nr_in_flight[color]--; - - pwq->nr_active--; - if (!list_empty(&pwq->delayed_works)) { - /* one down, submit a delayed one */ - if (pwq->nr_active < pwq->max_active) - pwq_activate_first_delayed(pwq); + if (!(work_data & WORK_STRUCT_INACTIVE)) { + pwq->nr_active--; + if (!list_empty(&pwq->inactive_works)) { + /* one down, submit an inactive one */ + if (pwq->nr_active < pwq->max_active) + pwq_activate_first_inactive(pwq); + } } + pwq->nr_in_flight[color]--; + /* is flush in progress and are we at the flushing tip? */ if (likely(pwq->flush_color != color)) goto out_put; @@ -1280,17 +1598,21 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, debug_work_deactivate(work); /* - * A delayed work item cannot be grabbed directly because - * it might have linked NO_COLOR work items which, if left - * on the delayed_list, will confuse pwq->nr_active + * A cancelable inactive work item must be in the + * pwq->inactive_works since a queued barrier can't be + * canceled (see the comments in insert_wq_barrier()). + * + * An inactive work item cannot be grabbed directly because + * it might have linked barrier work items which, if left + * on the inactive_works list, will confuse pwq->nr_active * management later on and cause stall. Make sure the work * item is activated before grabbing. */ - if (*work_data_bits(work) & WORK_STRUCT_DELAYED) - pwq_activate_delayed_work(work); + if (*work_data_bits(work) & WORK_STRUCT_INACTIVE) + pwq_activate_inactive_work(work); list_del_init(&work->entry); - pwq_dec_nr_in_flight(pwq, get_work_color(work)); + pwq_dec_nr_in_flight(pwq, *work_data_bits(work)); /* work->data points to pwq iff queued, point to pool */ set_work_pool_and_keep_pending(work, pool->id); @@ -1325,25 +1647,15 @@ fail: static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) { - struct worker_pool *pool = pwq->pool; + debug_work_activate(work); /* record the work call stack in order to print it in KASAN reports */ - kasan_record_aux_stack(work); + kasan_record_aux_stack_noalloc(work); /* we own @work, set data and link */ set_work_pwq(work, pwq, extra_flags); list_add_tail(&work->entry, head); get_pwq(pwq); - - /* - * Ensure either wq_worker_sleeping() sees the above - * list_add_tail() or we see zero nr_running to avoid workers lying - * around lazily while there are works to be processed. - */ - smp_mb(); - - if (__need_more_worker(pool)) - wake_up_worker(pool); } /* @@ -1369,20 +1681,15 @@ static bool is_chained_work(struct workqueue_struct *wq) */ static int wq_select_unbound_cpu(int cpu) { - static bool printed_dbg_warning; int new_cpu; if (likely(!wq_debug_force_rr_cpu)) { if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) return cpu; - } else if (!printed_dbg_warning) { - pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n"); - printed_dbg_warning = true; + } else { + pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n"); } - if (cpumask_empty(wq_unbound_cpumask)) - return cpu; - new_cpu = __this_cpu_read(wq_rr_cpu_last); new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask); if (unlikely(new_cpu >= nr_cpu_ids)) { @@ -1399,8 +1706,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct pool_workqueue *pwq; - struct worker_pool *last_pool; - struct list_head *worklist; + struct worker_pool *last_pool, *pool; unsigned int work_flags; unsigned int req_cpu = cpu; @@ -1412,32 +1718,35 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, */ lockdep_assert_irqs_disabled(); - debug_work_activate(work); - /* if draining, only works from the same workqueue are allowed */ - if (unlikely(wq->flags & __WQ_DRAINING) && - WARN_ON_ONCE(!is_chained_work(wq))) + /* + * For a draining wq, only works from the same workqueue are + * allowed. The __WQ_DESTROYING helps to spot the issue that + * queues a new work item to a wq after destroy_workqueue(wq). + */ + if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && + WARN_ON_ONCE(!is_chained_work(wq)))) return; rcu_read_lock(); retry: /* pwq which will be used unless @work is executing elsewhere */ - if (wq->flags & WQ_UNBOUND) { - if (req_cpu == WORK_CPU_UNBOUND) + if (req_cpu == WORK_CPU_UNBOUND) { + if (wq->flags & WQ_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); - pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); - } else { - if (req_cpu == WORK_CPU_UNBOUND) + else cpu = raw_smp_processor_id(); - pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); } + pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu)); + pool = pwq->pool; + /* * If @work was previously on a different pool, it might still be * running there, in which case the work needs to be queued on that * pool to guarantee non-reentrancy. */ last_pool = get_work_pool(work); - if (last_pool && last_pool != pwq->pool) { + if (last_pool && last_pool != pool) { struct worker *worker; raw_spin_lock(&last_pool->lock); @@ -1446,26 +1755,27 @@ retry: if (worker && worker->current_pwq->wq == wq) { pwq = worker->current_pwq; + pool = pwq->pool; + WARN_ON_ONCE(pool != last_pool); } else { /* meh... not running there, queue here */ raw_spin_unlock(&last_pool->lock); - raw_spin_lock(&pwq->pool->lock); + raw_spin_lock(&pool->lock); } } else { - raw_spin_lock(&pwq->pool->lock); + raw_spin_lock(&pool->lock); } /* - * pwq is determined and locked. For unbound pools, we could have - * raced with pwq release and it could already be dead. If its - * refcnt is zero, repeat pwq selection. Note that pwqs never die - * without another pwq replacing it in the numa_pwq_tbl or while - * work items are executing on it, so the retrying is guaranteed to - * make forward-progress. + * pwq is determined and locked. For unbound pools, we could have raced + * with pwq release and it could already be dead. If its refcnt is zero, + * repeat pwq selection. Note that unbound pwqs never die without + * another pwq replacing it in cpu_pwq or while work items are executing + * on it, so the retrying is guaranteed to make forward-progress. */ if (unlikely(!pwq->refcnt)) { if (wq->flags & WQ_UNBOUND) { - raw_spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pool->lock); cpu_relax(); goto retry; } @@ -1484,20 +1794,20 @@ retry: work_flags = work_color_to_flags(pwq->work_color); if (likely(pwq->nr_active < pwq->max_active)) { + if (list_empty(&pool->worklist)) + pool->watchdog_ts = jiffies; + trace_workqueue_activate_work(work); pwq->nr_active++; - worklist = &pwq->pool->worklist; - if (list_empty(worklist)) - pwq->pool->watchdog_ts = jiffies; + insert_work(pwq, work, &pool->worklist, work_flags); + kick_pool(pool); } else { - work_flags |= WORK_STRUCT_DELAYED; - worklist = &pwq->delayed_works; + work_flags |= WORK_STRUCT_INACTIVE; + insert_work(pwq, work, &pwq->inactive_works, work_flags); } - insert_work(pwq, work, worklist, work_flags); - out: - raw_spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pool->lock); rcu_read_unlock(); } @@ -1508,7 +1818,10 @@ out: * @work: work to queue * * We queue the work to a specific CPU, the caller must ensure it - * can't go away. + * can't go away. Callers that fail to ensure that the specified + * CPU cannot go away will execute on a randomly chosen CPU. + * But note well that callers specifying a CPU that never has been + * online will get a splat. * * Return: %false if @work was already on a queue, %true otherwise. */ @@ -1531,7 +1844,7 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, EXPORT_SYMBOL(queue_work_on); /** - * workqueue_select_cpu_near - Select a CPU based on NUMA node + * select_numa_node_cpu - Select a CPU based on NUMA node * @node: NUMA node ID that we want to select a CPU from * * This function will attempt to find a "random" cpu available on a given @@ -1539,14 +1852,10 @@ EXPORT_SYMBOL(queue_work_on); * WORK_CPU_UNBOUND indicating that we should just schedule to any * available CPU if we need to schedule this work. */ -static int workqueue_select_cpu_near(int node) +static int select_numa_node_cpu(int node) { int cpu; - /* No point in doing this if NUMA isn't enabled for workqueues */ - if (!wq_numa_enabled) - return WORK_CPU_UNBOUND; - /* Delay binding to CPU if node is not valid or online */ if (node < 0 || node >= MAX_NUMNODES || !node_online(node)) return WORK_CPU_UNBOUND; @@ -1603,7 +1912,7 @@ bool queue_work_node(int node, struct workqueue_struct *wq, local_irq_save(flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { - int cpu = workqueue_select_cpu_near(node); + int cpu = select_numa_node_cpu(node); __queue_work(cpu, wq, work); ret = true; @@ -1750,7 +2059,7 @@ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { rwork->wq = wq; - call_rcu(&rwork->rcu, rcu_work_rcufn); + call_rcu_hurry(&rwork->rcu, rcu_work_rcufn); return true; } @@ -1758,67 +2067,6 @@ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) } EXPORT_SYMBOL(queue_rcu_work); -/** - * worker_enter_idle - enter idle state - * @worker: worker which is entering idle state - * - * @worker is entering idle state. Update stats and idle timer if - * necessary. - * - * LOCKING: - * raw_spin_lock_irq(pool->lock). - */ -static void worker_enter_idle(struct worker *worker) -{ - struct worker_pool *pool = worker->pool; - - if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || - WARN_ON_ONCE(!list_empty(&worker->entry) && - (worker->hentry.next || worker->hentry.pprev))) - return; - - /* can't use worker_set_flags(), also called from create_worker() */ - worker->flags |= WORKER_IDLE; - pool->nr_idle++; - worker->last_active = jiffies; - - /* idle_list is LIFO */ - list_add(&worker->entry, &pool->idle_list); - - if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) - mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); - - /* - * Sanity check nr_running. Because unbind_workers() releases - * pool->lock between setting %WORKER_UNBOUND and zapping - * nr_running, the warning may trigger spuriously. Check iff - * unbind is not in progress. - */ - WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && - pool->nr_workers == pool->nr_idle && - atomic_read(&pool->nr_running)); -} - -/** - * worker_leave_idle - leave idle state - * @worker: worker which is leaving idle state - * - * @worker is leaving idle state. Update stats. - * - * LOCKING: - * raw_spin_lock_irq(pool->lock). - */ -static void worker_leave_idle(struct worker *worker) -{ - struct worker_pool *pool = worker->pool; - - if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) - return; - worker_clr_flags(worker, WORKER_IDLE); - pool->nr_idle--; - list_del_init(&worker->entry); -} - static struct worker *alloc_worker(int node) { struct worker *worker; @@ -1834,6 +2082,14 @@ static struct worker *alloc_worker(int node) return worker; } +static cpumask_t *pool_allowed_cpus(struct worker_pool *pool) +{ + if (pool->cpu < 0 && pool->attrs->affn_strict) + return pool->attrs->__pod_cpumask; + else + return pool->attrs->cpumask; +} + /** * worker_attach_to_pool() - attach a worker to a pool * @worker: worker to be attached @@ -1859,7 +2115,7 @@ static void worker_attach_to_pool(struct worker *worker, kthread_set_per_cpu(worker->task, pool->cpu); if (worker->rescue_wq) - set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); + set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool)); list_add_tail(&worker->node, &pool->workers); worker->pool = pool; @@ -1886,7 +2142,7 @@ static void worker_detach_from_pool(struct worker *worker) list_del(&worker->node); worker->pool = NULL; - if (list_empty(&pool->workers)) + if (list_empty(&pool->workers) && list_empty(&pool->dying_workers)) detach_completion = pool->detach_completion; mutex_unlock(&wq_pool_attach_mutex); @@ -1911,18 +2167,23 @@ static void worker_detach_from_pool(struct worker *worker) */ static struct worker *create_worker(struct worker_pool *pool) { - struct worker *worker = NULL; - int id = -1; - char id_buf[16]; + struct worker *worker; + int id; + char id_buf[23]; /* ID is needed to determine kthread name */ - id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL); - if (id < 0) - goto fail; + id = ida_alloc(&pool->worker_ida, GFP_KERNEL); + if (id < 0) { + pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n", + ERR_PTR(id)); + return NULL; + } worker = alloc_worker(pool->node); - if (!worker) + if (!worker) { + pr_err_once("workqueue: Failed to allocate a worker\n"); goto fail; + } worker->id = id; @@ -1934,46 +2195,96 @@ static struct worker *create_worker(struct worker_pool *pool) worker->task = kthread_create_on_node(worker_thread, worker, pool->node, "kworker/%s", id_buf); - if (IS_ERR(worker->task)) + if (IS_ERR(worker->task)) { + if (PTR_ERR(worker->task) == -EINTR) { + pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n", + id_buf); + } else { + pr_err_once("workqueue: Failed to create a worker thread: %pe", + worker->task); + } goto fail; + } set_user_nice(worker->task, pool->attrs->nice); - kthread_bind_mask(worker->task, pool->attrs->cpumask); + kthread_bind_mask(worker->task, pool_allowed_cpus(pool)); /* successful, attach the worker to the pool */ worker_attach_to_pool(worker, pool); /* start the newly created worker */ raw_spin_lock_irq(&pool->lock); + worker->pool->nr_workers++; worker_enter_idle(worker); + kick_pool(pool); + + /* + * @worker is waiting on a completion in kthread() and will trigger hung + * check if not woken up soon. As kick_pool() might not have waken it + * up, wake it up explicitly once more. + */ wake_up_process(worker->task); + raw_spin_unlock_irq(&pool->lock); return worker; fail: - if (id >= 0) - ida_simple_remove(&pool->worker_ida, id); + ida_free(&pool->worker_ida, id); kfree(worker); return NULL; } +static void unbind_worker(struct worker *worker) +{ + lockdep_assert_held(&wq_pool_attach_mutex); + + kthread_set_per_cpu(worker->task, -1); + if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); + else + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); +} + +static void wake_dying_workers(struct list_head *cull_list) +{ + struct worker *worker, *tmp; + + list_for_each_entry_safe(worker, tmp, cull_list, entry) { + list_del_init(&worker->entry); + unbind_worker(worker); + /* + * If the worker was somehow already running, then it had to be + * in pool->idle_list when set_worker_dying() happened or we + * wouldn't have gotten here. + * + * Thus, the worker must either have observed the WORKER_DIE + * flag, or have set its state to TASK_IDLE. Either way, the + * below will be observed by the worker and is safe to do + * outside of pool->lock. + */ + wake_up_process(worker->task); + } +} + /** - * destroy_worker - destroy a workqueue worker + * set_worker_dying - Tag a worker for destruction * @worker: worker to be destroyed + * @list: transfer worker away from its pool->idle_list and into list * - * Destroy @worker and adjust @pool stats accordingly. The worker should - * be idle. + * Tag @worker for destruction and adjust @pool stats accordingly. The worker + * should be idle. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ -static void destroy_worker(struct worker *worker) +static void set_worker_dying(struct worker *worker, struct list_head *list) { struct worker_pool *pool = worker->pool; lockdep_assert_held(&pool->lock); + lockdep_assert_held(&wq_pool_attach_mutex); /* sanity check frenzy */ if (WARN_ON(worker->current_work) || @@ -1984,34 +2295,93 @@ static void destroy_worker(struct worker *worker) pool->nr_workers--; pool->nr_idle--; - list_del_init(&worker->entry); worker->flags |= WORKER_DIE; - wake_up_process(worker->task); + + list_move(&worker->entry, list); + list_move(&worker->node, &pool->dying_workers); } +/** + * idle_worker_timeout - check if some idle workers can now be deleted. + * @t: The pool's idle_timer that just expired + * + * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in + * worker_leave_idle(), as a worker flicking between idle and active while its + * pool is at the too_many_workers() tipping point would cause too much timer + * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let + * it expire and re-evaluate things from there. + */ static void idle_worker_timeout(struct timer_list *t) { struct worker_pool *pool = from_timer(pool, t, idle_timer); + bool do_cull = false; + + if (work_pending(&pool->idle_cull_work)) + return; raw_spin_lock_irq(&pool->lock); - while (too_many_workers(pool)) { + if (too_many_workers(pool)) { struct worker *worker; unsigned long expires; /* idle_list is kept in LIFO order, check the last one */ worker = list_entry(pool->idle_list.prev, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; + do_cull = !time_before(jiffies, expires); + + if (!do_cull) + mod_timer(&pool->idle_timer, expires); + } + raw_spin_unlock_irq(&pool->lock); + + if (do_cull) + queue_work(system_unbound_wq, &pool->idle_cull_work); +} + +/** + * idle_cull_fn - cull workers that have been idle for too long. + * @work: the pool's work for handling these idle workers + * + * This goes through a pool's idle workers and gets rid of those that have been + * idle for at least IDLE_WORKER_TIMEOUT seconds. + * + * We don't want to disturb isolated CPUs because of a pcpu kworker being + * culled, so this also resets worker affinity. This requires a sleepable + * context, hence the split between timer callback and work item. + */ +static void idle_cull_fn(struct work_struct *work) +{ + struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work); + LIST_HEAD(cull_list); + + /* + * Grabbing wq_pool_attach_mutex here ensures an already-running worker + * cannot proceed beyong worker_detach_from_pool() in its self-destruct + * path. This is required as a previously-preempted worker could run after + * set_worker_dying() has happened but before wake_dying_workers() did. + */ + mutex_lock(&wq_pool_attach_mutex); + raw_spin_lock_irq(&pool->lock); + + while (too_many_workers(pool)) { + struct worker *worker; + unsigned long expires; + + worker = list_entry(pool->idle_list.prev, struct worker, entry); + expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) { mod_timer(&pool->idle_timer, expires); break; } - destroy_worker(worker); + set_worker_dying(worker, &cull_list); } raw_spin_unlock_irq(&pool->lock); + wake_dying_workers(&cull_list); + mutex_unlock(&wq_pool_attach_mutex); } static void send_mayday(struct work_struct *work) @@ -2034,6 +2404,7 @@ static void send_mayday(struct work_struct *work) get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); wake_up_process(wq->rescuer->task); + pwq->stats[PWQ_STAT_MAYDAY]++; } } @@ -2171,9 +2542,7 @@ __acquires(&pool->lock) { struct pool_workqueue *pwq = get_work_pwq(work); struct worker_pool *pool = worker->pool; - bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE; - int work_color; - struct worker *collision; + unsigned long work_data; #ifdef CONFIG_LOCKDEP /* * It is permissible to free the struct work_struct from @@ -2190,25 +2559,15 @@ __acquires(&pool->lock) WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && raw_smp_processor_id() != pool->cpu); - /* - * A single work shouldn't be executed concurrently by - * multiple workers on a single cpu. Check whether anyone is - * already processing the work. If so, defer the work to the - * currently executing one. - */ - collision = find_worker_executing_work(pool, work); - if (unlikely(collision)) { - move_linked_works(work, &collision->scheduled, NULL); - return; - } - /* claim and dequeue */ debug_work_deactivate(work); hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work); worker->current_work = work; worker->current_func = work->func; worker->current_pwq = pwq; - work_color = get_work_color(work); + worker->current_at = worker->task->se.sum_exec_runtime; + work_data = *work_data_bits(work); + worker->current_color = get_work_color(work_data); /* * Record wq name for cmdline and debug reporting, may get @@ -2224,18 +2583,16 @@ __acquires(&pool->lock) * of concurrency management and the next code block will chain * execution of the pending work items. */ - if (unlikely(cpu_intensive)) + if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE)) worker_set_flags(worker, WORKER_CPU_INTENSIVE); /* - * Wake up another worker if necessary. The condition is always - * false for normal per-cpu workers since nr_running would always - * be >= 1 at this point. This is used to chain execution of the - * pending work items for WORKER_NOT_RUNNING workers such as the - * UNBOUND and CPU_INTENSIVE ones. + * Kick @pool if necessary. It's always noop for per-cpu worker pools + * since nr_running would always be >= 1 at this point. This is used to + * chain execution of the pending work items for WORKER_NOT_RUNNING + * workers such as the UNBOUND and CPU_INTENSIVE ones. */ - if (need_more_worker(pool)) - wake_up_worker(pool); + kick_pool(pool); /* * Record the last pool and clear PENDING which should be the last @@ -2245,6 +2602,7 @@ __acquires(&pool->lock) */ set_work_pool_and_clear_pending(work, pool->id); + pwq->stats[PWQ_STAT_STARTED]++; raw_spin_unlock_irq(&pool->lock); lock_map_acquire(&pwq->wq->lockdep_map); @@ -2278,6 +2636,7 @@ __acquires(&pool->lock) * point will only record its address. */ trace_workqueue_execute_end(work, worker->current_func); + pwq->stats[PWQ_STAT_COMPLETED]++; lock_map_release(&lockdep_map); lock_map_release(&pwq->wq->lockdep_map); @@ -2302,9 +2661,12 @@ __acquires(&pool->lock) raw_spin_lock_irq(&pool->lock); - /* clear cpu intensive status */ - if (unlikely(cpu_intensive)) - worker_clr_flags(worker, WORKER_CPU_INTENSIVE); + /* + * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked + * CPU intensive by wq_worker_tick() if @work hogged CPU longer than + * wq_cpu_intensive_thresh_us. Clear it. + */ + worker_clr_flags(worker, WORKER_CPU_INTENSIVE); /* tag the worker for identification in schedule() */ worker->last_func = worker->current_func; @@ -2314,7 +2676,8 @@ __acquires(&pool->lock) worker->current_work = NULL; worker->current_func = NULL; worker->current_pwq = NULL; - pwq_dec_nr_in_flight(pwq, work_color); + worker->current_color = INT_MAX; + pwq_dec_nr_in_flight(pwq, work_data); } /** @@ -2331,9 +2694,15 @@ __acquires(&pool->lock) */ static void process_scheduled_works(struct worker *worker) { - while (!list_empty(&worker->scheduled)) { - struct work_struct *work = list_first_entry(&worker->scheduled, - struct work_struct, entry); + struct work_struct *work; + bool first = true; + + while ((work = list_first_entry_or_null(&worker->scheduled, + struct work_struct, entry))) { + if (first) { + worker->pool->watchdog_ts = jiffies; + first = false; + } process_one_work(worker, work); } } @@ -2373,12 +2742,12 @@ woke_up: /* am I supposed to die? */ if (unlikely(worker->flags & WORKER_DIE)) { raw_spin_unlock_irq(&pool->lock); - WARN_ON_ONCE(!list_empty(&worker->entry)); set_pf_worker(false); set_task_comm(worker->task, "kworker/dying"); - ida_simple_remove(&pool->worker_ida, worker->id); + ida_free(&pool->worker_ida, worker->id); worker_detach_from_pool(worker); + WARN_ON_ONCE(!list_empty(&worker->entry)); kfree(worker); return 0; } @@ -2414,17 +2783,8 @@ recheck: list_first_entry(&pool->worklist, struct work_struct, entry); - pool->watchdog_ts = jiffies; - - if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { - /* optimization path, not strictly necessary */ - process_one_work(worker, work); - if (unlikely(!list_empty(&worker->scheduled))) - process_scheduled_works(worker); - } else { - move_linked_works(work, &worker->scheduled, NULL); + if (assign_work(work, worker, NULL)) process_scheduled_works(worker); - } } while (keep_working(pool)); worker_set_flags(worker, WORKER_PREP); @@ -2468,7 +2828,6 @@ static int rescuer_thread(void *__rescuer) { struct worker *rescuer = __rescuer; struct workqueue_struct *wq = rescuer->rescue_wq; - struct list_head *scheduled = &rescuer->scheduled; bool should_stop; set_user_nice(current, RESCUER_NICE_LEVEL); @@ -2499,7 +2858,6 @@ repeat: struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; - bool first = true; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); @@ -2514,23 +2872,20 @@ repeat: * Slurp in all works issued via this workqueue and * process'em. */ - WARN_ON_ONCE(!list_empty(scheduled)); + WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); list_for_each_entry_safe(work, n, &pool->worklist, entry) { - if (get_work_pwq(work) == pwq) { - if (first) - pool->watchdog_ts = jiffies; - move_linked_works(work, scheduled, &n); - } - first = false; + if (get_work_pwq(work) == pwq && + assign_work(work, rescuer, &n)) + pwq->stats[PWQ_STAT_RESCUED]++; } - if (!list_empty(scheduled)) { + if (!list_empty(&rescuer->scheduled)) { process_scheduled_works(rescuer); /* * The above execution of rescued work items could * have created more to rescue through - * pwq_activate_first_delayed() or chained + * pwq_activate_first_inactive() or chained * queueing. Let's put @pwq back on mayday list so * that such back-to-back work items, which may be * being used to relieve memory pressure, don't @@ -2557,12 +2912,10 @@ repeat: put_pwq(pwq); /* - * Leave this pool. If need_more_worker() is %true, notify a - * regular worker; otherwise, we end up with 0 concurrency - * and stalling the execution. + * Leave this pool. Notify regular workers; otherwise, we end up + * with 0 concurrency and stalling the execution. */ - if (need_more_worker(pool)) - wake_up_worker(pool); + kick_pool(pool); raw_spin_unlock_irq(&pool->lock); @@ -2657,8 +3010,9 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, struct work_struct *target, struct worker *worker) { + unsigned int work_flags = 0; + unsigned int work_color; struct list_head *head; - unsigned int linked = 0; /* * debugobject calls are safe here even with pool->lock locked @@ -2673,24 +3027,30 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, barr->task = current; + /* The barrier work item does not participate in pwq->nr_active. */ + work_flags |= WORK_STRUCT_INACTIVE; + /* * If @target is currently being executed, schedule the * barrier to the worker; otherwise, put it after @target. */ - if (worker) + if (worker) { head = worker->scheduled.next; - else { + work_color = worker->current_color; + } else { unsigned long *bits = work_data_bits(target); head = target->entry.next; /* there can already be other linked works, inherit and set */ - linked = *bits & WORK_STRUCT_LINKED; + work_flags |= *bits & WORK_STRUCT_LINKED; + work_color = get_work_color(*bits); __set_bit(WORK_STRUCT_LINKED_BIT, bits); } - debug_work_activate(&barr->work); - insert_work(pwq, &barr->work, head, - work_color_to_flags(WORK_NO_COLOR) | linked); + pwq->nr_in_flight[work_color]++; + work_flags |= work_color_to_flags(work_color); + + insert_work(pwq, &barr->work, head, work_flags); } /** @@ -2765,13 +3125,13 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, } /** - * flush_workqueue - ensure that any scheduled work has run to completion. + * __flush_workqueue - ensure that any scheduled work has run to completion. * @wq: workqueue to flush * * This function sleeps until all work items which were queued on entry * have finished execution, but it is not livelocked by new incoming ones. */ -void flush_workqueue(struct workqueue_struct *wq) +void __flush_workqueue(struct workqueue_struct *wq) { struct wq_flusher this_flusher = { .list = LIST_HEAD_INIT(this_flusher.list), @@ -2920,7 +3280,7 @@ void flush_workqueue(struct workqueue_struct *wq) out_unlock: mutex_unlock(&wq->mutex); } -EXPORT_SYMBOL(flush_workqueue); +EXPORT_SYMBOL(__flush_workqueue); /** * drain_workqueue - drain a workqueue @@ -2948,7 +3308,7 @@ void drain_workqueue(struct workqueue_struct *wq) wq->flags |= __WQ_DRAINING; mutex_unlock(&wq->mutex); reflush: - flush_workqueue(wq); + __flush_workqueue(wq); mutex_lock(&wq->mutex); @@ -2956,7 +3316,7 @@ reflush: bool drained; raw_spin_lock_irq(&pwq->pool->lock); - drained = !pwq->nr_active && list_empty(&pwq->delayed_works); + drained = !pwq->nr_active && list_empty(&pwq->inactive_works); raw_spin_unlock_irq(&pwq->pool->lock); if (drained) @@ -2964,8 +3324,8 @@ reflush: if (++flush_cnt == 10 || (flush_cnt % 100 == 0 && flush_cnt <= 1000)) - pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n", - wq->name, flush_cnt); + pr_warn("workqueue %s: %s() isn't complete after %u tries\n", + wq->name, __func__, flush_cnt); mutex_unlock(&wq->mutex); goto reflush; @@ -3043,10 +3403,8 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) if (WARN_ON(!work->func)) return false; - if (!from_cancel) { - lock_map_acquire(&work->lockdep_map); - lock_map_release(&work->lockdep_map); - } + lock_map_acquire(&work->lockdep_map); + lock_map_release(&work->lockdep_map); if (start_flush_work(work, &barr, from_cancel)) { wait_for_completion(&barr.done); @@ -3235,6 +3593,15 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork) return ret; } +/* + * See cancel_delayed_work() + */ +bool cancel_work(struct work_struct *work) +{ + return __cancel_work(work, false); +} +EXPORT_SYMBOL(cancel_work); + /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel @@ -3292,7 +3659,7 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); @@ -3304,7 +3671,7 @@ int schedule_on_each_cpu(work_func_t func) for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); - put_online_cpus(); + cpus_read_unlock(); free_percpu(works); return 0; } @@ -3345,6 +3712,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs) { if (attrs) { free_cpumask_var(attrs->cpumask); + free_cpumask_var(attrs->__pod_cpumask); kfree(attrs); } } @@ -3366,8 +3734,11 @@ struct workqueue_attrs *alloc_workqueue_attrs(void) goto fail; if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL)) goto fail; + if (!alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL)) + goto fail; cpumask_copy(attrs->cpumask, cpu_possible_mask); + attrs->affn_scope = WQ_AFFN_DFL; return attrs; fail: free_workqueue_attrs(attrs); @@ -3379,12 +3750,26 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, { to->nice = from->nice; cpumask_copy(to->cpumask, from->cpumask); + cpumask_copy(to->__pod_cpumask, from->__pod_cpumask); + to->affn_strict = from->affn_strict; + /* - * Unlike hash and equality test, this function doesn't ignore - * ->no_numa as it is used for both pool and wq attrs. Instead, - * get_unbound_pool() explicitly clears ->no_numa after copying. + * Unlike hash and equality test, copying shouldn't ignore wq-only + * fields as copying is used for both pool and wq attrs. Instead, + * get_unbound_pool() explicitly clears the fields. */ - to->no_numa = from->no_numa; + to->affn_scope = from->affn_scope; + to->ordered = from->ordered; +} + +/* + * Some attrs fields are workqueue-only. Clear them for worker_pool's. See the + * comments in 'struct workqueue_attrs' definition. + */ +static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs) +{ + attrs->affn_scope = WQ_AFFN_NR_TYPES; + attrs->ordered = false; } /* hash value of the content of @attr */ @@ -3395,6 +3780,9 @@ static u32 wqattrs_hash(const struct workqueue_attrs *attrs) hash = jhash_1word(attrs->nice, hash); hash = jhash(cpumask_bits(attrs->cpumask), BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); + hash = jhash(cpumask_bits(attrs->__pod_cpumask), + BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); + hash = jhash_1word(attrs->affn_strict, hash); return hash; } @@ -3406,9 +3794,57 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, return false; if (!cpumask_equal(a->cpumask, b->cpumask)) return false; + if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask)) + return false; + if (a->affn_strict != b->affn_strict) + return false; return true; } +/* Update @attrs with actually available CPUs */ +static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs, + const cpumask_t *unbound_cpumask) +{ + /* + * Calculate the effective CPU mask of @attrs given @unbound_cpumask. If + * @attrs->cpumask doesn't overlap with @unbound_cpumask, we fallback to + * @unbound_cpumask. + */ + cpumask_and(attrs->cpumask, attrs->cpumask, unbound_cpumask); + if (unlikely(cpumask_empty(attrs->cpumask))) + cpumask_copy(attrs->cpumask, unbound_cpumask); +} + +/* find wq_pod_type to use for @attrs */ +static const struct wq_pod_type * +wqattrs_pod_type(const struct workqueue_attrs *attrs) +{ + enum wq_affn_scope scope; + struct wq_pod_type *pt; + + /* to synchronize access to wq_affn_dfl */ + lockdep_assert_held(&wq_pool_mutex); + + if (attrs->affn_scope == WQ_AFFN_DFL) + scope = wq_affn_dfl; + else + scope = attrs->affn_scope; + + pt = &wq_pod_types[scope]; + + if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) && + likely(pt->nr_pods)) + return pt; + + /* + * Before workqueue_init_topology(), only SYSTEM is available which is + * initialized in workqueue_init_early(). + */ + pt = &wq_pod_types[WQ_AFFN_SYSTEM]; + BUG_ON(!pt->nr_pods); + return pt; +} + /** * init_worker_pool - initialize a newly zalloc'd worker_pool * @pool: worker_pool to initialize @@ -3432,10 +3868,12 @@ static int init_worker_pool(struct worker_pool *pool) hash_init(pool->busy_hash); timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE); + INIT_WORK(&pool->idle_cull_work, idle_cull_fn); timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0); INIT_LIST_HEAD(&pool->workers); + INIT_LIST_HEAD(&pool->dying_workers); ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); @@ -3445,6 +3883,9 @@ static int init_worker_pool(struct worker_pool *pool) pool->attrs = alloc_workqueue_attrs(); if (!pool->attrs) return -ENOMEM; + + wqattrs_clear_for_pool(pool->attrs); + return 0; } @@ -3492,12 +3933,8 @@ static void rcu_free_wq(struct rcu_head *rcu) container_of(rcu, struct workqueue_struct, rcu); wq_free_lockdep(wq); - - if (!(wq->flags & WQ_UNBOUND)) - free_percpu(wq->cpu_pwqs); - else - free_workqueue_attrs(wq->unbound_attrs); - + free_percpu(wq->cpu_pwq); + free_workqueue_attrs(wq->unbound_attrs); kfree(wq); } @@ -3510,18 +3947,6 @@ static void rcu_free_pool(struct rcu_head *rcu) kfree(pool); } -/* This returns with the lock held on success (pool manager is inactive). */ -static bool wq_manager_inactive(struct worker_pool *pool) -{ - raw_spin_lock_irq(&pool->lock); - - if (pool->flags & POOL_MANAGER_ACTIVE) { - raw_spin_unlock_irq(&pool->lock); - return false; - } - return true; -} - /** * put_unbound_pool - put a worker_pool * @pool: worker_pool to put @@ -3537,6 +3962,7 @@ static void put_unbound_pool(struct worker_pool *pool) { DECLARE_COMPLETION_ONSTACK(detach_completion); struct worker *worker; + LIST_HEAD(cull_list); lockdep_assert_held(&wq_pool_mutex); @@ -3557,20 +3983,38 @@ static void put_unbound_pool(struct worker_pool *pool) * Become the manager and destroy all workers. This prevents * @pool's workers from blocking on attach_mutex. We're the last * manager and @pool gets freed with the flag set. - * Because of how wq_manager_inactive() works, we will hold the - * spinlock after a successful wait. + * + * Having a concurrent manager is quite unlikely to happen as we can + * only get here with + * pwq->refcnt == pool->refcnt == 0 + * which implies no work queued to the pool, which implies no worker can + * become the manager. However a worker could have taken the role of + * manager before the refcnts dropped to 0, since maybe_create_worker() + * drops pool->lock */ - rcuwait_wait_event(&manager_wait, wq_manager_inactive(pool), - TASK_UNINTERRUPTIBLE); - pool->flags |= POOL_MANAGER_ACTIVE; + while (true) { + rcuwait_wait_event(&manager_wait, + !(pool->flags & POOL_MANAGER_ACTIVE), + TASK_UNINTERRUPTIBLE); + + mutex_lock(&wq_pool_attach_mutex); + raw_spin_lock_irq(&pool->lock); + if (!(pool->flags & POOL_MANAGER_ACTIVE)) { + pool->flags |= POOL_MANAGER_ACTIVE; + break; + } + raw_spin_unlock_irq(&pool->lock); + mutex_unlock(&wq_pool_attach_mutex); + } while ((worker = first_idle_worker(pool))) - destroy_worker(worker); + set_worker_dying(worker, &cull_list); WARN_ON(pool->nr_workers || pool->nr_idle); raw_spin_unlock_irq(&pool->lock); - mutex_lock(&wq_pool_attach_mutex); - if (!list_empty(&pool->workers)) + wake_dying_workers(&cull_list); + + if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers)) pool->detach_completion = &detach_completion; mutex_unlock(&wq_pool_attach_mutex); @@ -3579,6 +4023,7 @@ static void put_unbound_pool(struct worker_pool *pool) /* shut down the timers */ del_timer_sync(&pool->idle_timer); + cancel_work_sync(&pool->idle_cull_work); del_timer_sync(&pool->mayday_timer); /* RCU protected to allow dereferences from get_work_pool() */ @@ -3601,10 +4046,10 @@ static void put_unbound_pool(struct worker_pool *pool) */ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) { + struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA]; u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; - int node; - int target_node = NUMA_NO_NODE; + int pod, node = NUMA_NO_NODE; lockdep_assert_held(&wq_pool_mutex); @@ -3616,31 +4061,22 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) } } - /* if cpumask is contained inside a NUMA node, we belong to that node */ - if (wq_numa_enabled) { - for_each_node(node) { - if (cpumask_subset(attrs->cpumask, - wq_numa_possible_cpumask[node])) { - target_node = node; - break; - } + /* If __pod_cpumask is contained inside a NUMA pod, that's our node */ + for (pod = 0; pod < pt->nr_pods; pod++) { + if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) { + node = pt->pod_node[pod]; + break; } } /* nope, create a new one */ - pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node); + pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node); if (!pool || init_worker_pool(pool) < 0) goto fail; - lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ + pool->node = node; copy_workqueue_attrs(pool->attrs, attrs); - pool->node = target_node; - - /* - * no_numa isn't a worker_pool attribute, always clear it. See - * 'struct workqueue_attrs' comments for detail. - */ - pool->attrs->no_numa = false; + wqattrs_clear_for_pool(pool->attrs); if (worker_pool_assign_id(pool) < 0) goto fail; @@ -3666,28 +4102,33 @@ static void rcu_free_pwq(struct rcu_head *rcu) } /* - * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt - * and needs to be destroyed. + * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero + * refcnt and needs to be destroyed. */ -static void pwq_unbound_release_workfn(struct work_struct *work) +static void pwq_release_workfn(struct kthread_work *work) { struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, - unbound_release_work); + release_work); struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; - bool is_last; + bool is_last = false; - if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) - return; - - mutex_lock(&wq->mutex); - list_del_rcu(&pwq->pwqs_node); - is_last = list_empty(&wq->pwqs); - mutex_unlock(&wq->mutex); + /* + * When @pwq is not linked, it doesn't hold any reference to the + * @wq, and @wq is invalid to access. + */ + if (!list_empty(&pwq->pwqs_node)) { + mutex_lock(&wq->mutex); + list_del_rcu(&pwq->pwqs_node); + is_last = list_empty(&wq->pwqs); + mutex_unlock(&wq->mutex); + } - mutex_lock(&wq_pool_mutex); - put_unbound_pool(pool); - mutex_unlock(&wq_pool_mutex); + if (wq->flags & WQ_UNBOUND) { + mutex_lock(&wq_pool_mutex); + put_unbound_pool(pool); + mutex_unlock(&wq_pool_mutex); + } call_rcu(&pwq->rcu, rcu_free_pwq); @@ -3706,7 +4147,7 @@ static void pwq_unbound_release_workfn(struct work_struct *work) * @pwq: target pool_workqueue * * If @pwq isn't freezing, set @pwq->max_active to the associated - * workqueue's saved_max_active and activate delayed work items + * workqueue's saved_max_active and activate inactive work items * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. */ static void pwq_adjust_max_active(struct pool_workqueue *pwq) @@ -3731,24 +4172,13 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) * is updated and visible. */ if (!freezable || !workqueue_freezing) { - bool kick = false; - pwq->max_active = wq->saved_max_active; - while (!list_empty(&pwq->delayed_works) && - pwq->nr_active < pwq->max_active) { - pwq_activate_first_delayed(pwq); - kick = true; - } + while (!list_empty(&pwq->inactive_works) && + pwq->nr_active < pwq->max_active) + pwq_activate_first_inactive(pwq); - /* - * Need to kick a worker after thawed or an unbound wq's - * max_active is bumped. In realtime scenarios, always kicking a - * worker will cause interference on the isolated cpu cores, so - * let's kick iff work items were activated. - */ - if (kick) - wake_up_worker(pwq->pool); + kick_pool(pwq->pool); } else { pwq->max_active = 0; } @@ -3756,7 +4186,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); } -/* initialize newly alloced @pwq which is associated with @wq and @pool */ +/* initialize newly allocated @pwq which is associated with @wq and @pool */ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, struct worker_pool *pool) { @@ -3768,10 +4198,10 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, pwq->wq = wq; pwq->flush_color = -1; pwq->refcnt = 1; - INIT_LIST_HEAD(&pwq->delayed_works); + INIT_LIST_HEAD(&pwq->inactive_works); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); - INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); + kthread_init_work(&pwq->release_work, pwq_release_workfn); } /* sync @pwq with the current state of its associated wq and link it */ @@ -3819,61 +4249,49 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, } /** - * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node + * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod * @attrs: the wq_attrs of the default pwq of the target workqueue - * @node: the target NUMA node + * @cpu: the target CPU * @cpu_going_down: if >= 0, the CPU to consider as offline - * @cpumask: outarg, the resulting cpumask - * - * Calculate the cpumask a workqueue with @attrs should use on @node. If - * @cpu_going_down is >= 0, that cpu is considered offline during - * calculation. The result is stored in @cpumask. * - * If NUMA affinity is not enabled, @attrs->cpumask is always used. If - * enabled and @node has online CPUs requested by @attrs, the returned - * cpumask is the intersection of the possible CPUs of @node and - * @attrs->cpumask. + * Calculate the cpumask a workqueue with @attrs should use on @pod. If + * @cpu_going_down is >= 0, that cpu is considered offline during calculation. + * The result is stored in @attrs->__pod_cpumask. * - * The caller is responsible for ensuring that the cpumask of @node stays - * stable. + * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled + * and @pod has online CPUs requested by @attrs, the returned cpumask is the + * intersection of the possible CPUs of @pod and @attrs->cpumask. * - * Return: %true if the resulting @cpumask is different from @attrs->cpumask, - * %false if equal. + * The caller is responsible for ensuring that the cpumask of @pod stays stable. */ -static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, - int cpu_going_down, cpumask_t *cpumask) +static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu, + int cpu_going_down) { - if (!wq_numa_enabled || attrs->no_numa) - goto use_dfl; + const struct wq_pod_type *pt = wqattrs_pod_type(attrs); + int pod = pt->cpu_pod[cpu]; - /* does @node have any online CPUs @attrs wants? */ - cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask); + /* does @pod have any online CPUs @attrs wants? */ + cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask); + cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask); if (cpu_going_down >= 0) - cpumask_clear_cpu(cpu_going_down, cpumask); + cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask); - if (cpumask_empty(cpumask)) - goto use_dfl; + if (cpumask_empty(attrs->__pod_cpumask)) { + cpumask_copy(attrs->__pod_cpumask, attrs->cpumask); + return; + } - /* yeap, return possible CPUs in @node that @attrs wants */ - cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); + /* yeap, return possible CPUs in @pod that @attrs wants */ + cpumask_and(attrs->__pod_cpumask, attrs->cpumask, pt->pod_cpus[pod]); - if (cpumask_empty(cpumask)) { + if (cpumask_empty(attrs->__pod_cpumask)) pr_warn_once("WARNING: workqueue cpumask: online intersect > " "possible intersect\n"); - return false; - } - - return !cpumask_equal(cpumask, attrs->cpumask); - -use_dfl: - cpumask_copy(cpumask, attrs->cpumask); - return false; } -/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */ -static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, - int node, - struct pool_workqueue *pwq) +/* install @pwq into @wq's cpu_pwq and return the old pwq */ +static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq, + int cpu, struct pool_workqueue *pwq) { struct pool_workqueue *old_pwq; @@ -3883,8 +4301,8 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, /* link_pwq() can handle duplicate calls */ link_pwq(pwq); - old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); - rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); + old_pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu)); + rcu_assign_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu), pwq); return old_pwq; } @@ -3901,10 +4319,10 @@ struct apply_wqattrs_ctx { static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx) { if (ctx) { - int node; + int cpu; - for_each_node(node) - put_pwq_unlocked(ctx->pwq_tbl[node]); + for_each_possible_cpu(cpu) + put_pwq_unlocked(ctx->pwq_tbl[cpu]); put_pwq_unlocked(ctx->dfl_pwq); free_workqueue_attrs(ctx->attrs); @@ -3916,78 +4334,68 @@ static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx) /* allocate the attrs and pwqs for later installation */ static struct apply_wqattrs_ctx * apply_wqattrs_prepare(struct workqueue_struct *wq, - const struct workqueue_attrs *attrs) + const struct workqueue_attrs *attrs, + const cpumask_var_t unbound_cpumask) { struct apply_wqattrs_ctx *ctx; - struct workqueue_attrs *new_attrs, *tmp_attrs; - int node; + struct workqueue_attrs *new_attrs; + int cpu; lockdep_assert_held(&wq_pool_mutex); - ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL); + if (WARN_ON(attrs->affn_scope < 0 || + attrs->affn_scope >= WQ_AFFN_NR_TYPES)) + return ERR_PTR(-EINVAL); + + ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(); - tmp_attrs = alloc_workqueue_attrs(); - if (!ctx || !new_attrs || !tmp_attrs) + if (!ctx || !new_attrs) goto out_free; /* - * Calculate the attrs of the default pwq. - * If the user configured cpumask doesn't overlap with the - * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask. - */ - copy_workqueue_attrs(new_attrs, attrs); - cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask); - if (unlikely(cpumask_empty(new_attrs->cpumask))) - cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask); - - /* - * We may create multiple pwqs with differing cpumasks. Make a - * copy of @new_attrs which will be modified and used to obtain - * pools. - */ - copy_workqueue_attrs(tmp_attrs, new_attrs); - - /* * If something goes wrong during CPU up/down, we'll fall back to * the default pwq covering whole @attrs->cpumask. Always create * it even if we don't use it immediately. */ + copy_workqueue_attrs(new_attrs, attrs); + wqattrs_actualize_cpumask(new_attrs, unbound_cpumask); + cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs); if (!ctx->dfl_pwq) goto out_free; - for_each_node(node) { - if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) { - ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs); - if (!ctx->pwq_tbl[node]) - goto out_free; - } else { + for_each_possible_cpu(cpu) { + if (new_attrs->ordered) { ctx->dfl_pwq->refcnt++; - ctx->pwq_tbl[node] = ctx->dfl_pwq; + ctx->pwq_tbl[cpu] = ctx->dfl_pwq; + } else { + wq_calc_pod_cpumask(new_attrs, cpu, -1); + ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs); + if (!ctx->pwq_tbl[cpu]) + goto out_free; } } /* save the user configured attrs and sanitize it. */ copy_workqueue_attrs(new_attrs, attrs); cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); + cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); ctx->attrs = new_attrs; ctx->wq = wq; - free_workqueue_attrs(tmp_attrs); return ctx; out_free: - free_workqueue_attrs(tmp_attrs); free_workqueue_attrs(new_attrs); apply_wqattrs_cleanup(ctx); - return NULL; + return ERR_PTR(-ENOMEM); } /* set attrs and install prepared pwqs, @ctx points to old pwqs on return */ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) { - int node; + int cpu; /* all pwqs have been created successfully, let's install'em */ mutex_lock(&ctx->wq->mutex); @@ -3995,9 +4403,9 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs); /* save the previous pwq and install the new one */ - for_each_node(node) - ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node, - ctx->pwq_tbl[node]); + for_each_possible_cpu(cpu) + ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu, + ctx->pwq_tbl[cpu]); /* @dfl_pwq might not have been used, ensure it's linked */ link_pwq(ctx->dfl_pwq); @@ -4006,19 +4414,6 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) mutex_unlock(&ctx->wq->mutex); } -static void apply_wqattrs_lock(void) -{ - /* CPUs should stay stable across pwq creations and installations */ - get_online_cpus(); - mutex_lock(&wq_pool_mutex); -} - -static void apply_wqattrs_unlock(void) -{ - mutex_unlock(&wq_pool_mutex); - put_online_cpus(); -} - static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { @@ -4036,9 +4431,9 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, wq->flags &= ~__WQ_ORDERED; } - ctx = apply_wqattrs_prepare(wq, attrs); - if (!ctx) - return -ENOMEM; + ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); /* the ctx has been prepared successfully, let's commit it */ apply_wqattrs_commit(ctx); @@ -4052,16 +4447,15 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, * @wq: the target workqueue * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() * - * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA - * machines, this function maps a separate pwq to each NUMA node with - * possibles CPUs in @attrs->cpumask so that work items are affine to the - * NUMA node it was issued on. Older pwqs are released as in-flight work - * items finish. Note that a work item which repeatedly requeues itself - * back-to-back will stay on its current pwq. + * Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps + * a separate pwq to each CPU pod with possibles CPUs in @attrs->cpumask so that + * work items are affine to the pod it was issued on. Older pwqs are released as + * in-flight work items finish. Note that a work item which repeatedly requeues + * itself back-to-back will stay on its current pwq. * * Performs GFP_KERNEL allocations. * - * Assumes caller has CPU hotplug read exclusion, i.e. get_online_cpus(). + * Assumes caller has CPU hotplug read exclusion, i.e. cpus_read_lock(). * * Return: 0 on success and -errno on failure. */ @@ -4080,40 +4474,37 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, } /** - * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug + * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug * @wq: the target workqueue - * @cpu: the CPU coming up or going down + * @cpu: the CPU to update pool association for + * @hotplug_cpu: the CPU coming up or going down * @online: whether @cpu is coming up or going down * * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and - * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of + * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update pod affinity of * @wq accordingly. * - * If NUMA affinity can't be adjusted due to memory allocation failure, it - * falls back to @wq->dfl_pwq which may not be optimal but is always - * correct. - * - * Note that when the last allowed CPU of a NUMA node goes offline for a - * workqueue with a cpumask spanning multiple nodes, the workers which were - * already executing the work items for the workqueue will lose their CPU - * affinity and may execute on any CPU. This is similar to how per-cpu - * workqueues behave on CPU_DOWN. If a workqueue user wants strict - * affinity, it's the user's responsibility to flush the work item from - * CPU_DOWN_PREPARE. + * + * If pod affinity can't be adjusted due to memory allocation failure, it falls + * back to @wq->dfl_pwq which may not be optimal but is always correct. + * + * Note that when the last allowed CPU of a pod goes offline for a workqueue + * with a cpumask spanning multiple pods, the workers which were already + * executing the work items for the workqueue will lose their CPU affinity and + * may execute on any CPU. This is similar to how per-cpu workqueues behave on + * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's + * responsibility to flush the work item from CPU_DOWN_PREPARE. */ -static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, - bool online) +static void wq_update_pod(struct workqueue_struct *wq, int cpu, + int hotplug_cpu, bool online) { - int node = cpu_to_node(cpu); - int cpu_off = online ? -1 : cpu; + int off_cpu = online ? -1 : hotplug_cpu; struct pool_workqueue *old_pwq = NULL, *pwq; struct workqueue_attrs *target_attrs; - cpumask_t *cpumask; lockdep_assert_held(&wq_pool_mutex); - if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) || - wq->unbound_attrs->no_numa) + if (!(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered) return; /* @@ -4121,36 +4512,29 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, * Let's use a preallocated one. The following buf is protected by * CPU hotplug exclusion. */ - target_attrs = wq_update_unbound_numa_attrs_buf; - cpumask = target_attrs->cpumask; + target_attrs = wq_update_pod_attrs_buf; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); - pwq = unbound_pwq_by_node(wq, node); + wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask); - /* - * Let's determine what needs to be done. If the target cpumask is - * different from the default pwq's, we need to compare it to @pwq's - * and create a new one if they don't match. If the target cpumask - * equals the default pwq's, the default pwq should be used. - */ - if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) { - if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) - return; - } else { - goto use_dfl_pwq; - } + /* nothing to do if the target cpumask matches the current pwq */ + wq_calc_pod_cpumask(target_attrs, cpu, off_cpu); + pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu), + lockdep_is_held(&wq_pool_mutex)); + if (wqattrs_equal(target_attrs, pwq->pool->attrs)) + return; /* create a new pwq */ pwq = alloc_unbound_pwq(wq, target_attrs); if (!pwq) { - pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", + pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n", wq->name); goto use_dfl_pwq; } /* Install the new pwq. */ mutex_lock(&wq->mutex); - old_pwq = numa_pwq_tbl_install(wq, node, pwq); + old_pwq = install_unbound_pwq(wq, cpu, pwq); goto out_unlock; use_dfl_pwq: @@ -4158,7 +4542,7 @@ use_dfl_pwq: raw_spin_lock_irq(&wq->dfl_pwq->pool->lock); get_pwq(wq->dfl_pwq); raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock); - old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq); + old_pwq = install_unbound_pwq(wq, cpu, wq->dfl_pwq); out_unlock: mutex_unlock(&wq->mutex); put_pwq_unlocked(old_pwq); @@ -4169,27 +4553,32 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) bool highpri = wq->flags & WQ_HIGHPRI; int cpu, ret; - if (!(wq->flags & WQ_UNBOUND)) { - wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); - if (!wq->cpu_pwqs) - return -ENOMEM; + wq->cpu_pwq = alloc_percpu(struct pool_workqueue *); + if (!wq->cpu_pwq) + goto enomem; + if (!(wq->flags & WQ_UNBOUND)) { for_each_possible_cpu(cpu) { - struct pool_workqueue *pwq = - per_cpu_ptr(wq->cpu_pwqs, cpu); - struct worker_pool *cpu_pools = - per_cpu(cpu_worker_pools, cpu); + struct pool_workqueue **pwq_p = + per_cpu_ptr(wq->cpu_pwq, cpu); + struct worker_pool *pool = + &(per_cpu_ptr(cpu_worker_pools, cpu)[highpri]); + + *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, + pool->node); + if (!*pwq_p) + goto enomem; - init_pwq(pwq, wq, &cpu_pools[highpri]); + init_pwq(*pwq_p, wq, pool); mutex_lock(&wq->mutex); - link_pwq(pwq); + link_pwq(*pwq_p); mutex_unlock(&wq->mutex); } return 0; } - get_online_cpus(); + cpus_read_lock(); if (wq->flags & __WQ_ORDERED) { ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); /* there should only be single pwq for ordering guarantee */ @@ -4199,21 +4588,38 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) } else { ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } - put_online_cpus(); + cpus_read_unlock(); + + /* for unbound pwq, flush the pwq_release_worker ensures that the + * pwq_release_workfn() completes before calling kfree(wq). + */ + if (ret) + kthread_flush_worker(pwq_release_worker); return ret; + +enomem: + if (wq->cpu_pwq) { + for_each_possible_cpu(cpu) { + struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); + + if (pwq) + kmem_cache_free(pwq_cache, pwq); + } + free_percpu(wq->cpu_pwq); + wq->cpu_pwq = NULL; + } + return -ENOMEM; } static int wq_clamp_max_active(int max_active, unsigned int flags, const char *name) { - int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; - - if (max_active < 1 || max_active > lim) + if (max_active < 1 || max_active > WQ_MAX_ACTIVE) pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", - max_active, name, 1, lim); + max_active, name, 1, WQ_MAX_ACTIVE); - return clamp_val(max_active, 1, lim); + return clamp_val(max_active, 1, WQ_MAX_ACTIVE); } /* @@ -4229,13 +4635,18 @@ static int init_rescuer(struct workqueue_struct *wq) return 0; rescuer = alloc_worker(NUMA_NO_NODE); - if (!rescuer) + if (!rescuer) { + pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n", + wq->name); return -ENOMEM; + } rescuer->rescue_wq = wq; - rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); + rescuer->task = kthread_create(rescuer_thread, rescuer, "kworker/R-%s", wq->name); if (IS_ERR(rescuer->task)) { ret = PTR_ERR(rescuer->task); + pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe", + wq->name, ERR_PTR(ret)); kfree(rescuer); return ret; } @@ -4252,17 +4663,15 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...) { - size_t tbl_size = 0; va_list args; struct workqueue_struct *wq; struct pool_workqueue *pwq; /* - * Unbound && max_active == 1 used to imply ordered, which is no - * longer the case on NUMA machines due to per-node pools. While + * Unbound && max_active == 1 used to imply ordered, which is no longer + * the case on many machines due to per-pod pools. While * alloc_ordered_workqueue() is the right way to create an ordered - * workqueue, keep the previous behavior to avoid subtle breakages - * on NUMA. + * workqueue, keep the previous behavior to avoid subtle breakages. */ if ((flags & WQ_UNBOUND) && max_active == 1) flags |= __WQ_ORDERED; @@ -4272,10 +4681,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, flags |= WQ_UNBOUND; /* allocate wq and format name */ - if (flags & WQ_UNBOUND) - tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]); - - wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL); + wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) return NULL; @@ -4355,7 +4761,7 @@ static bool pwq_busy(struct pool_workqueue *pwq) if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1)) return true; - if (pwq->nr_active || !list_empty(&pwq->delayed_works)) + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) return true; return false; @@ -4370,7 +4776,7 @@ static bool pwq_busy(struct pool_workqueue *pwq) void destroy_workqueue(struct workqueue_struct *wq) { struct pool_workqueue *pwq; - int node; + int cpu; /* * Remove it from sysfs first so that sanity check failure doesn't @@ -4378,6 +4784,11 @@ void destroy_workqueue(struct workqueue_struct *wq) */ workqueue_sysfs_unregister(wq); + /* mark the workqueue destruction is in progress */ + mutex_lock(&wq->mutex); + wq->flags |= __WQ_DESTROYING; + mutex_unlock(&wq->mutex); + /* drain it before proceeding with destruction */ drain_workqueue(wq); @@ -4410,7 +4821,7 @@ void destroy_workqueue(struct workqueue_struct *wq) raw_spin_unlock_irq(&pwq->pool->lock); mutex_unlock(&wq->mutex); mutex_unlock(&wq_pool_mutex); - show_workqueue_state(); + show_one_workqueue(wq); return; } raw_spin_unlock_irq(&pwq->pool->lock); @@ -4424,33 +4835,23 @@ void destroy_workqueue(struct workqueue_struct *wq) list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); - if (!(wq->flags & WQ_UNBOUND)) { - wq_unregister_lockdep(wq); - /* - * The base ref is never dropped on per-cpu pwqs. Directly - * schedule RCU free. - */ - call_rcu(&wq->rcu, rcu_free_wq); - } else { - /* - * We're the sole accessor of @wq at this point. Directly - * access numa_pwq_tbl[] and dfl_pwq to put the base refs. - * @wq will be freed when the last pwq is released. - */ - for_each_node(node) { - pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); - RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL); - put_pwq_unlocked(pwq); - } + /* + * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq + * to put the base refs. @wq will be auto-destroyed from the last + * pwq_put. RCU read lock prevents @wq from going away from under us. + */ + rcu_read_lock(); - /* - * Put dfl_pwq. @wq may be freed any time after dfl_pwq is - * put. Don't access it afterwards. - */ - pwq = wq->dfl_pwq; - wq->dfl_pwq = NULL; + for_each_possible_cpu(cpu) { + pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu)); + RCU_INIT_POINTER(*per_cpu_ptr(wq->cpu_pwq, cpu), NULL); put_pwq_unlocked(pwq); } + + put_pwq_unlocked(wq->dfl_pwq); + wq->dfl_pwq = NULL; + + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(destroy_workqueue); @@ -4527,10 +4928,11 @@ bool current_is_workqueue_rescuer(void) * unreliable and only useful as advisory hints or for debugging. * * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU. - * Note that both per-cpu and unbound workqueues may be associated with - * multiple pool_workqueues which have separate congested states. A - * workqueue being congested on one CPU doesn't mean the workqueue is also - * contested on other CPUs / NUMA nodes. + * + * With the exception of ordered workqueues, all workqueues have per-cpu + * pool_workqueues, each with its own congested state. A workqueue being + * congested on one CPU doesn't mean that the workqueue is contested on any + * other CPUs. * * Return: * %true if congested, %false otherwise. @@ -4546,12 +4948,9 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) if (cpu == WORK_CPU_UNBOUND) cpu = smp_processor_id(); - if (!(wq->flags & WQ_UNBOUND)) - pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); - else - pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); + pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); + ret = !list_empty(&pwq->inactive_works); - ret = !list_empty(&pwq->delayed_works); preempt_enable(); rcu_read_unlock(); @@ -4673,22 +5072,53 @@ static void pr_cont_pool_info(struct worker_pool *pool) pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice); } -static void pr_cont_work(bool comma, struct work_struct *work) +struct pr_cont_work_struct { + bool comma; + work_func_t func; + long ctr; +}; + +static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp) +{ + if (!pcwsp->ctr) + goto out_record; + if (func == pcwsp->func) { + pcwsp->ctr++; + return; + } + if (pcwsp->ctr == 1) + pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func); + else + pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func); + pcwsp->ctr = 0; +out_record: + if ((long)func == -1L) + return; + pcwsp->comma = comma; + pcwsp->func = func; + pcwsp->ctr = 1; +} + +static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp) { if (work->func == wq_barrier_func) { struct wq_barrier *barr; barr = container_of(work, struct wq_barrier, work); + pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); pr_cont("%s BAR(%d)", comma ? "," : "", task_pid_nr(barr->task)); } else { - pr_cont("%s %ps", comma ? "," : "", work->func); + if (!comma) + pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); + pr_cont_work_flush(comma, work->func, pcwsp); } } static void show_pwq(struct pool_workqueue *pwq) { + struct pr_cont_work_struct pcws = { .ctr = 0, }; struct worker_pool *pool = pwq->pool; struct work_struct *work; struct worker *worker; @@ -4721,7 +5151,8 @@ static void show_pwq(struct pool_workqueue *pwq) worker->rescue_wq ? "(RESCUER)" : "", worker->current_func); list_for_each_entry(work, &worker->scheduled, entry) - pr_cont_work(false, work); + pr_cont_work(false, work, &pcws); + pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); comma = true; } pr_cont("\n"); @@ -4741,100 +5172,161 @@ static void show_pwq(struct pool_workqueue *pwq) if (get_work_pwq(work) != pwq) continue; - pr_cont_work(comma, work); + pr_cont_work(comma, work, &pcws); comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); } + pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); pr_cont("\n"); } - if (!list_empty(&pwq->delayed_works)) { + if (!list_empty(&pwq->inactive_works)) { bool comma = false; - pr_info(" delayed:"); - list_for_each_entry(work, &pwq->delayed_works, entry) { - pr_cont_work(comma, work); + pr_info(" inactive:"); + list_for_each_entry(work, &pwq->inactive_works, entry) { + pr_cont_work(comma, work, &pcws); comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); } + pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); pr_cont("\n"); } } /** - * show_workqueue_state - dump workqueue state + * show_one_workqueue - dump state of specified workqueue + * @wq: workqueue whose state will be printed + */ +void show_one_workqueue(struct workqueue_struct *wq) +{ + struct pool_workqueue *pwq; + bool idle = true; + unsigned long flags; + + for_each_pwq(pwq, wq) { + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { + idle = false; + break; + } + } + if (idle) /* Nothing to print for idle workqueue */ + return; + + pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); + + for_each_pwq(pwq, wq) { + raw_spin_lock_irqsave(&pwq->pool->lock, flags); + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { + /* + * Defer printing to avoid deadlocks in console + * drivers that queue work while holding locks + * also taken in their write paths. + */ + printk_deferred_enter(); + show_pwq(pwq); + printk_deferred_exit(); + } + raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); + /* + * We could be printing a lot from atomic context, e.g. + * sysrq-t -> show_all_workqueues(). Avoid triggering + * hard lockup. + */ + touch_nmi_watchdog(); + } + +} + +/** + * show_one_worker_pool - dump state of specified worker pool + * @pool: worker pool whose state will be printed + */ +static void show_one_worker_pool(struct worker_pool *pool) +{ + struct worker *worker; + bool first = true; + unsigned long flags; + unsigned long hung = 0; + + raw_spin_lock_irqsave(&pool->lock, flags); + if (pool->nr_workers == pool->nr_idle) + goto next_pool; + + /* How long the first pending work is waiting for a worker. */ + if (!list_empty(&pool->worklist)) + hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000; + + /* + * Defer printing to avoid deadlocks in console drivers that + * queue work while holding locks also taken in their write + * paths. + */ + printk_deferred_enter(); + pr_info("pool %d:", pool->id); + pr_cont_pool_info(pool); + pr_cont(" hung=%lus workers=%d", hung, pool->nr_workers); + if (pool->manager) + pr_cont(" manager: %d", + task_pid_nr(pool->manager->task)); + list_for_each_entry(worker, &pool->idle_list, entry) { + pr_cont(" %s%d", first ? "idle: " : "", + task_pid_nr(worker->task)); + first = false; + } + pr_cont("\n"); + printk_deferred_exit(); +next_pool: + raw_spin_unlock_irqrestore(&pool->lock, flags); + /* + * We could be printing a lot from atomic context, e.g. + * sysrq-t -> show_all_workqueues(). Avoid triggering + * hard lockup. + */ + touch_nmi_watchdog(); + +} + +/** + * show_all_workqueues - dump workqueue state * - * Called from a sysrq handler or try_to_freeze_tasks() and prints out - * all busy workqueues and pools. + * Called from a sysrq handler and prints out all busy workqueues and pools. */ -void show_workqueue_state(void) +void show_all_workqueues(void) { struct workqueue_struct *wq; struct worker_pool *pool; - unsigned long flags; int pi; rcu_read_lock(); pr_info("Showing busy workqueues and worker pools:\n"); - list_for_each_entry_rcu(wq, &workqueues, list) { - struct pool_workqueue *pwq; - bool idle = true; + list_for_each_entry_rcu(wq, &workqueues, list) + show_one_workqueue(wq); - for_each_pwq(pwq, wq) { - if (pwq->nr_active || !list_empty(&pwq->delayed_works)) { - idle = false; - break; - } - } - if (idle) - continue; + for_each_pool(pool, pi) + show_one_worker_pool(pool); - pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); + rcu_read_unlock(); +} - for_each_pwq(pwq, wq) { - raw_spin_lock_irqsave(&pwq->pool->lock, flags); - if (pwq->nr_active || !list_empty(&pwq->delayed_works)) - show_pwq(pwq); - raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); - /* - * We could be printing a lot from atomic context, e.g. - * sysrq-t -> show_workqueue_state(). Avoid triggering - * hard lockup. - */ - touch_nmi_watchdog(); - } - } +/** + * show_freezable_workqueues - dump freezable workqueue state + * + * Called from try_to_freeze_tasks() and prints out all freezable workqueues + * still busy. + */ +void show_freezable_workqueues(void) +{ + struct workqueue_struct *wq; - for_each_pool(pool, pi) { - struct worker *worker; - bool first = true; + rcu_read_lock(); - raw_spin_lock_irqsave(&pool->lock, flags); - if (pool->nr_workers == pool->nr_idle) - goto next_pool; - - pr_info("pool %d:", pool->id); - pr_cont_pool_info(pool); - pr_cont(" hung=%us workers=%d", - jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000, - pool->nr_workers); - if (pool->manager) - pr_cont(" manager: %d", - task_pid_nr(pool->manager->task)); - list_for_each_entry(worker, &pool->idle_list, entry) { - pr_cont(" %s%d", first ? "idle: " : "", - task_pid_nr(worker->task)); - first = false; - } - pr_cont("\n"); - next_pool: - raw_spin_unlock_irqrestore(&pool->lock, flags); - /* - * We could be printing a lot from atomic context, e.g. - * sysrq-t -> show_workqueue_state(). Avoid triggering - * hard lockup. - */ - touch_nmi_watchdog(); + pr_info("Showing freezable workqueues that are still busy:\n"); + + list_for_each_entry_rcu(wq, &workqueues, list) { + if (!(wq->flags & WQ_FREEZABLE)) + continue; + show_one_workqueue(wq); } rcu_read_unlock(); @@ -4908,50 +5400,39 @@ static void unbind_workers(int cpu) /* * We've blocked all attach/detach operations. Make all workers * unbound and set DISASSOCIATED. Before this, all workers - * except for the ones which are still executing works from - * before the last CPU down must be on the cpu. After - * this, they may become diasporas. + * must be on the cpu. After this, they may become diasporas. + * And the preemption disabled section in their sched callbacks + * are guaranteed to see WORKER_UNBOUND since the code here + * is on the same cpu. */ for_each_pool_worker(worker, pool) worker->flags |= WORKER_UNBOUND; pool->flags |= POOL_DISASSOCIATED; - raw_spin_unlock_irq(&pool->lock); - - for_each_pool_worker(worker, pool) { - kthread_set_per_cpu(worker->task, -1); - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); - } - - mutex_unlock(&wq_pool_attach_mutex); - - /* - * Call schedule() so that we cross rq->lock and thus can - * guarantee sched callbacks see the %WORKER_UNBOUND flag. - * This is necessary as scheduler callbacks may be invoked - * from other cpus. - */ - schedule(); - /* - * Sched callbacks are disabled now. Zap nr_running. - * After this, nr_running stays zero and need_more_worker() - * and keep_working() are always true as long as the - * worklist is not empty. This pool now behaves as an - * unbound (in terms of concurrency management) pool which + * The handling of nr_running in sched callbacks are disabled + * now. Zap nr_running. After this, nr_running stays zero and + * need_more_worker() and keep_working() are always true as + * long as the worklist is not empty. This pool now behaves as + * an unbound (in terms of concurrency management) pool which * are served by workers tied to the pool. */ - atomic_set(&pool->nr_running, 0); + pool->nr_running = 0; /* * With concurrency management just turned off, a busy * worker blocking could lead to lengthy stalls. Kick off * unbound chain execution of currently pending work items. */ - raw_spin_lock_irq(&pool->lock); - wake_up_worker(pool); + kick_pool(pool); + raw_spin_unlock_irq(&pool->lock); + + for_each_pool_worker(worker, pool) + unbind_worker(worker); + + mutex_unlock(&wq_pool_attach_mutex); } } @@ -4977,7 +5458,7 @@ static void rebind_workers(struct worker_pool *pool) for_each_pool_worker(worker, pool) { kthread_set_per_cpu(worker->task, pool->cpu); WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, - pool->attrs->cpumask) < 0); + pool_allowed_cpus(pool)) < 0); } raw_spin_lock_irq(&pool->lock); @@ -4988,17 +5469,6 @@ static void rebind_workers(struct worker_pool *pool) unsigned int worker_flags = worker->flags; /* - * A bound idle worker should actually be on the runqueue - * of the associated CPU for local wake-ups targeting it to - * work. Kick all idle workers so that they migrate to the - * associated CPU. Doing this in the same loop as - * replacing UNBOUND with REBOUND is safe as no worker will - * be bound before @pool->lock is released. - */ - if (worker_flags & WORKER_IDLE) - wake_up_process(worker->task); - - /* * We want to clear UNBOUND but can't directly call * worker_clr_flags() or adjust nr_running. Atomically * replace UNBOUND with another NOT_RUNNING flag REBOUND. @@ -5082,9 +5552,18 @@ int workqueue_online_cpu(unsigned int cpu) mutex_unlock(&wq_pool_attach_mutex); } - /* update NUMA affinity of unbound workqueues */ - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, true); + /* update pod affinity of unbound workqueues */ + list_for_each_entry(wq, &workqueues, list) { + struct workqueue_attrs *attrs = wq->unbound_attrs; + + if (attrs) { + const struct wq_pod_type *pt = wqattrs_pod_type(attrs); + int tcpu; + + for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) + wq_update_pod(wq, tcpu, cpu, true); + } + } mutex_unlock(&wq_pool_mutex); return 0; @@ -5100,10 +5579,19 @@ int workqueue_offline_cpu(unsigned int cpu) unbind_workers(cpu); - /* update NUMA affinity of unbound workqueues */ + /* update pod affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, false); + list_for_each_entry(wq, &workqueues, list) { + struct workqueue_attrs *attrs = wq->unbound_attrs; + + if (attrs) { + const struct wq_pod_type *pt = wqattrs_pod_type(attrs); + int tcpu; + + for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) + wq_update_pod(wq, tcpu, cpu, false); + } + } mutex_unlock(&wq_pool_mutex); return 0; @@ -5124,50 +5612,54 @@ static void work_for_cpu_fn(struct work_struct *work) } /** - * work_on_cpu - run a function in thread context on a particular cpu + * work_on_cpu_key - run a function in thread context on a particular cpu * @cpu: the cpu to run on * @fn: the function to run * @arg: the function arg + * @key: The lock class key for lock debugging purposes * * It is up to the caller to ensure that the cpu doesn't go offline. * The caller must not hold any locks which would prevent @fn from completing. * * Return: The value @fn returns. */ -long work_on_cpu(int cpu, long (*fn)(void *), void *arg) +long work_on_cpu_key(int cpu, long (*fn)(void *), + void *arg, struct lock_class_key *key) { struct work_for_cpu wfc = { .fn = fn, .arg = arg }; - INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); + INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key); schedule_work_on(cpu, &wfc.work); flush_work(&wfc.work); destroy_work_on_stack(&wfc.work); return wfc.ret; } -EXPORT_SYMBOL_GPL(work_on_cpu); +EXPORT_SYMBOL_GPL(work_on_cpu_key); /** - * work_on_cpu_safe - run a function in thread context on a particular cpu + * work_on_cpu_safe_key - run a function in thread context on a particular cpu * @cpu: the cpu to run on * @fn: the function to run * @arg: the function argument + * @key: The lock class key for lock debugging purposes * * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold * any locks which would prevent @fn from completing. * * Return: The value @fn returns. */ -long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) +long work_on_cpu_safe_key(int cpu, long (*fn)(void *), + void *arg, struct lock_class_key *key) { long ret = -ENODEV; - get_online_cpus(); + cpus_read_lock(); if (cpu_online(cpu)) - ret = work_on_cpu(cpu, fn, arg); - put_online_cpus(); + ret = work_on_cpu_key(cpu, fn, arg, key); + cpus_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(work_on_cpu_safe); +EXPORT_SYMBOL_GPL(work_on_cpu_safe_key); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER @@ -5176,7 +5668,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu_safe); * freeze_workqueues_begin - begin freezing workqueues * * Start freezing workqueues. After this function returns, all freezable - * workqueues will queue new works to their delayed_works list instead of + * workqueues will queue new works to their inactive_works list instead of * pool->worklist. * * CONTEXT: @@ -5282,7 +5774,7 @@ out_unlock: } #endif /* CONFIG_FREEZER */ -static int workqueue_apply_unbound_cpumask(void) +static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) { LIST_HEAD(ctxs); int ret = 0; @@ -5298,9 +5790,9 @@ static int workqueue_apply_unbound_cpumask(void) if (wq->flags & __WQ_ORDERED) continue; - ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs); - if (!ctx) { - ret = -ENOMEM; + ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask); + if (IS_ERR(ctx)) { + ret = PTR_ERR(ctx); break; } @@ -5313,70 +5805,118 @@ static int workqueue_apply_unbound_cpumask(void) apply_wqattrs_cleanup(ctx); } + if (!ret) { + mutex_lock(&wq_pool_attach_mutex); + cpumask_copy(wq_unbound_cpumask, unbound_cpumask); + mutex_unlock(&wq_pool_attach_mutex); + } return ret; } /** - * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask - * @cpumask: the cpumask to set + * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask + * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask * - * The low-level workqueues cpumask is a global cpumask that limits - * the affinity of all unbound workqueues. This function check the @cpumask - * and apply it to all unbound workqueues and updates all pwqs of them. - * - * Retun: 0 - Success - * -EINVAL - Invalid @cpumask - * -ENOMEM - Failed to allocate memory for attrs or pwqs. + * This function can be called from cpuset code to provide a set of isolated + * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold + * either cpus_read_lock or cpus_write_lock. */ -int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) +int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask) { - int ret = -EINVAL; - cpumask_var_t saved_cpumask; + cpumask_var_t cpumask; + int ret = 0; - if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) return -ENOMEM; + lockdep_assert_cpus_held(); + mutex_lock(&wq_pool_mutex); + + /* Save the current isolated cpumask & export it via sysfs */ + cpumask_copy(wq_isolated_cpumask, exclude_cpumask); + /* - * Not excluding isolated cpus on purpose. - * If the user wishes to include them, we allow that. + * If the operation fails, it will fall back to + * wq_requested_unbound_cpumask which is initially set to + * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten + * by any subsequent write to workqueue/cpumask sysfs file. */ - cpumask_and(cpumask, cpumask, cpu_possible_mask); - if (!cpumask_empty(cpumask)) { - apply_wqattrs_lock(); + if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask)) + cpumask_copy(cpumask, wq_requested_unbound_cpumask); + if (!cpumask_equal(cpumask, wq_unbound_cpumask)) + ret = workqueue_apply_unbound_cpumask(cpumask); + + mutex_unlock(&wq_pool_mutex); + free_cpumask_var(cpumask); + return ret; +} - /* save the old wq_unbound_cpumask. */ - cpumask_copy(saved_cpumask, wq_unbound_cpumask); +static int parse_affn_scope(const char *val) +{ + int i; - /* update wq_unbound_cpumask at first and apply it to wqs. */ - cpumask_copy(wq_unbound_cpumask, cpumask); - ret = workqueue_apply_unbound_cpumask(); + for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) { + if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i]))) + return i; + } + return -EINVAL; +} - /* restore the wq_unbound_cpumask when failed. */ - if (ret < 0) - cpumask_copy(wq_unbound_cpumask, saved_cpumask); +static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp) +{ + struct workqueue_struct *wq; + int affn, cpu; - apply_wqattrs_unlock(); + affn = parse_affn_scope(val); + if (affn < 0) + return affn; + if (affn == WQ_AFFN_DFL) + return -EINVAL; + + cpus_read_lock(); + mutex_lock(&wq_pool_mutex); + + wq_affn_dfl = affn; + + list_for_each_entry(wq, &workqueues, list) { + for_each_online_cpu(cpu) { + wq_update_pod(wq, cpu, cpu, true); + } } - free_cpumask_var(saved_cpumask); - return ret; + mutex_unlock(&wq_pool_mutex); + cpus_read_unlock(); + + return 0; } +static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp) +{ + return scnprintf(buffer, PAGE_SIZE, "%s\n", wq_affn_names[wq_affn_dfl]); +} + +static const struct kernel_param_ops wq_affn_dfl_ops = { + .set = wq_affn_dfl_set, + .get = wq_affn_dfl_get, +}; + +module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644); + #ifdef CONFIG_SYSFS /* * Workqueues with WQ_SYSFS flag set is visible to userland via * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the * following attributes. * - * per_cpu RO bool : whether the workqueue is per-cpu or unbound - * max_active RW int : maximum number of in-flight work items + * per_cpu RO bool : whether the workqueue is per-cpu or unbound + * max_active RW int : maximum number of in-flight work items * * Unbound workqueues have the following extra attributes. * - * pool_ids RO int : the associated pool IDs for each node - * nice RW int : nice value of the workers - * cpumask RW mask : bitmask of allowed CPUs for the workers - * numa RW bool : whether enable NUMA affinity + * nice RW int : nice value of the workers + * cpumask RW mask : bitmask of allowed CPUs for the workers + * affinity_scope RW str : worker CPU affinity scope (cache, numa, none) + * affinity_strict RW bool : worker CPU affinity is strict */ struct wq_device { struct workqueue_struct *wq; @@ -5429,26 +5969,17 @@ static struct attribute *wq_sysfs_attrs[] = { }; ATTRIBUTE_GROUPS(wq_sysfs); -static ssize_t wq_pool_ids_show(struct device *dev, - struct device_attribute *attr, char *buf) +static void apply_wqattrs_lock(void) { - struct workqueue_struct *wq = dev_to_wq(dev); - const char *delim = ""; - int node, written = 0; - - get_online_cpus(); - rcu_read_lock(); - for_each_node(node) { - written += scnprintf(buf + written, PAGE_SIZE - written, - "%s%d:%d", delim, node, - unbound_pwq_by_node(wq, node)->pool->id); - delim = " "; - } - written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); - rcu_read_unlock(); - put_online_cpus(); + /* CPUs should stay stable across pwq creations and installations */ + cpus_read_lock(); + mutex_lock(&wq_pool_mutex); +} - return written; +static void apply_wqattrs_unlock(void) +{ + mutex_unlock(&wq_pool_mutex); + cpus_read_unlock(); } static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, @@ -5541,50 +6072,84 @@ out_unlock: return ret ?: count; } -static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t wq_affn_scope_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); int written; mutex_lock(&wq->mutex); - written = scnprintf(buf, PAGE_SIZE, "%d\n", - !wq->unbound_attrs->no_numa); + if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL) + written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n", + wq_affn_names[WQ_AFFN_DFL], + wq_affn_names[wq_affn_dfl]); + else + written = scnprintf(buf, PAGE_SIZE, "%s\n", + wq_affn_names[wq->unbound_attrs->affn_scope]); mutex_unlock(&wq->mutex); return written; } -static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t wq_affn_scope_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; - int v, ret = -ENOMEM; + int affn, ret = -ENOMEM; - apply_wqattrs_lock(); + affn = parse_affn_scope(buf); + if (affn < 0) + return affn; + apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); - if (!attrs) - goto out_unlock; - - ret = -EINVAL; - if (sscanf(buf, "%d", &v) == 1) { - attrs->no_numa = !v; + if (attrs) { + attrs->affn_scope = affn; ret = apply_workqueue_attrs_locked(wq, attrs); } + apply_wqattrs_unlock(); + free_workqueue_attrs(attrs); + return ret ?: count; +} -out_unlock: +static ssize_t wq_affinity_strict_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", + wq->unbound_attrs->affn_strict); +} + +static ssize_t wq_affinity_strict_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int v, ret = -ENOMEM; + + if (sscanf(buf, "%d", &v) != 1) + return -EINVAL; + + apply_wqattrs_lock(); + attrs = wq_sysfs_prep_attrs(wq); + if (attrs) { + attrs->affn_strict = (bool)v; + ret = apply_workqueue_attrs_locked(wq, attrs); + } apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } static struct device_attribute wq_sysfs_unbound_attrs[] = { - __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), __ATTR(nice, 0644, wq_nice_show, wq_nice_store), __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), - __ATTR(numa, 0644, wq_numa_show, wq_numa_store), + __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store), + __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store), __ATTR_NULL, }; @@ -5593,19 +6158,74 @@ static struct bus_type wq_subsys = { .dev_groups = wq_sysfs_groups, }; -static ssize_t wq_unbound_cpumask_show(struct device *dev, - struct device_attribute *attr, char *buf) +/** + * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask + * @cpumask: the cpumask to set + * + * The low-level workqueues cpumask is a global cpumask that limits + * the affinity of all unbound workqueues. This function check the @cpumask + * and apply it to all unbound workqueues and updates all pwqs of them. + * + * Return: 0 - Success + * -EINVAL - Invalid @cpumask + * -ENOMEM - Failed to allocate memory for attrs or pwqs. + */ +static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) +{ + int ret = -EINVAL; + + /* + * Not excluding isolated cpus on purpose. + * If the user wishes to include them, we allow that. + */ + cpumask_and(cpumask, cpumask, cpu_possible_mask); + if (!cpumask_empty(cpumask)) { + apply_wqattrs_lock(); + cpumask_copy(wq_requested_unbound_cpumask, cpumask); + if (cpumask_equal(cpumask, wq_unbound_cpumask)) { + ret = 0; + goto out_unlock; + } + + ret = workqueue_apply_unbound_cpumask(cpumask); + +out_unlock: + apply_wqattrs_unlock(); + } + + return ret; +} + +static ssize_t __wq_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf, cpumask_var_t mask) { int written; mutex_lock(&wq_pool_mutex); - written = scnprintf(buf, PAGE_SIZE, "%*pb\n", - cpumask_pr_args(wq_unbound_cpumask)); + written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); mutex_unlock(&wq_pool_mutex); return written; } +static ssize_t wq_unbound_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask); +} + +static ssize_t wq_requested_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask); +} + +static ssize_t wq_isolated_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask); +} + static ssize_t wq_unbound_cpumask_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -5623,19 +6243,35 @@ static ssize_t wq_unbound_cpumask_store(struct device *dev, return ret ? ret : count; } -static struct device_attribute wq_sysfs_cpumask_attr = +static struct device_attribute wq_sysfs_cpumask_attrs[] = { __ATTR(cpumask, 0644, wq_unbound_cpumask_show, - wq_unbound_cpumask_store); + wq_unbound_cpumask_store), + __ATTR(cpumask_requested, 0444, wq_requested_cpumask_show, NULL), + __ATTR(cpumask_isolated, 0444, wq_isolated_cpumask_show, NULL), + __ATTR_NULL, +}; static int __init wq_sysfs_init(void) { + struct device *dev_root; int err; err = subsys_virtual_register(&wq_subsys, NULL); if (err) return err; - return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr); + dev_root = bus_get_dev_root(&wq_subsys); + if (dev_root) { + struct device_attribute *attr; + + for (attr = wq_sysfs_cpumask_attrs; attr->attr.name; attr++) { + err = device_create_file(dev_root, attr); + if (err) + break; + } + put_device(dev_root); + } + return err; } core_initcall(wq_sysfs_init); @@ -5759,6 +6395,57 @@ static struct timer_list wq_watchdog_timer; static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; +/* + * Show workers that might prevent the processing of pending work items. + * The only candidates are CPU-bound workers in the running state. + * Pending work items should be handled by another idle worker + * in all other situations. + */ +static void show_cpu_pool_hog(struct worker_pool *pool) +{ + struct worker *worker; + unsigned long flags; + int bkt; + + raw_spin_lock_irqsave(&pool->lock, flags); + + hash_for_each(pool->busy_hash, bkt, worker, hentry) { + if (task_is_running(worker->task)) { + /* + * Defer printing to avoid deadlocks in console + * drivers that queue work while holding locks + * also taken in their write paths. + */ + printk_deferred_enter(); + + pr_info("pool %d:\n", pool->id); + sched_show_task(worker->task); + + printk_deferred_exit(); + } + } + + raw_spin_unlock_irqrestore(&pool->lock, flags); +} + +static void show_cpu_pools_hogs(void) +{ + struct worker_pool *pool; + int pi; + + pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n"); + + rcu_read_lock(); + + for_each_pool(pool, pi) { + if (pool->cpu_stall) + show_cpu_pool_hog(pool); + + } + + rcu_read_unlock(); +} + static void wq_watchdog_reset_touched(void) { int cpu; @@ -5772,6 +6459,8 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) { unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; bool lockup_detected = false; + bool cpu_pool_stall = false; + unsigned long now = jiffies; struct worker_pool *pool; int pi; @@ -5783,40 +6472,51 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) for_each_pool(pool, pi) { unsigned long pool_ts, touched, ts; + pool->cpu_stall = false; if (list_empty(&pool->worklist)) continue; + /* + * If a virtual machine is stopped by the host it can look to + * the watchdog like a stall. + */ + kvm_check_and_clear_guest_paused(); + /* get the latest of pool and touched timestamps */ + if (pool->cpu >= 0) + touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu)); + else + touched = READ_ONCE(wq_watchdog_touched); pool_ts = READ_ONCE(pool->watchdog_ts); - touched = READ_ONCE(wq_watchdog_touched); if (time_after(pool_ts, touched)) ts = pool_ts; else ts = touched; - if (pool->cpu >= 0) { - unsigned long cpu_touched = - READ_ONCE(per_cpu(wq_watchdog_touched_cpu, - pool->cpu)); - if (time_after(cpu_touched, ts)) - ts = cpu_touched; - } - /* did we stall? */ - if (time_after(jiffies, ts + thresh)) { + if (time_after(now, ts + thresh)) { lockup_detected = true; + if (pool->cpu >= 0) { + pool->cpu_stall = true; + cpu_pool_stall = true; + } pr_emerg("BUG: workqueue lockup - pool"); pr_cont_pool_info(pool); pr_cont(" stuck for %us!\n", - jiffies_to_msecs(jiffies - pool_ts) / 1000); + jiffies_to_msecs(now - pool_ts) / 1000); } + + } rcu_read_unlock(); if (lockup_detected) - show_workqueue_state(); + show_all_workqueues(); + + if (cpu_pool_stall) + show_cpu_pools_hogs(); wq_watchdog_reset_touched(); mod_timer(&wq_watchdog_timer, jiffies + thresh); @@ -5826,8 +6526,8 @@ notrace void wq_watchdog_touch(int cpu) { if (cpu >= 0) per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; - else - wq_watchdog_touched = jiffies; + + wq_watchdog_touched = jiffies; } static void wq_watchdog_set_thresh(unsigned long thresh) @@ -5880,71 +6580,65 @@ static inline void wq_watchdog_init(void) { } #endif /* CONFIG_WQ_WATCHDOG */ -static void __init wq_numa_init(void) +static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask) { - cpumask_var_t *tbl; - int node, cpu; - - if (num_possible_nodes() <= 1) - return; - - if (wq_disable_numa) { - pr_info("workqueue: NUMA affinity support disabled\n"); + if (!cpumask_intersects(wq_unbound_cpumask, mask)) { + pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n", + cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask)); return; } - wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(); - BUG_ON(!wq_update_unbound_numa_attrs_buf); - - /* - * We want masks of possible CPUs of each node which isn't readily - * available. Build one from cpu_to_node() which should have been - * fully initialized by now. - */ - tbl = kcalloc(nr_node_ids, sizeof(tbl[0]), GFP_KERNEL); - BUG_ON(!tbl); - - for_each_node(node) - BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL, - node_online(node) ? node : NUMA_NO_NODE)); - - for_each_possible_cpu(cpu) { - node = cpu_to_node(cpu); - if (WARN_ON(node == NUMA_NO_NODE)) { - pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); - /* happens iff arch is bonkers, let's just proceed */ - return; - } - cpumask_set_cpu(cpu, tbl[node]); - } - - wq_numa_possible_cpumask = tbl; - wq_numa_enabled = true; + cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask); } /** * workqueue_init_early - early init for workqueue subsystem * - * This is the first half of two-staged workqueue subsystem initialization - * and invoked as soon as the bare basics - memory allocation, cpumasks and - * idr are up. It sets up all the data structures and system workqueues - * and allows early boot code to create workqueues and queue/cancel work - * items. Actual work item execution starts only after kthreads can be - * created and scheduled right before early initcalls. + * This is the first step of three-staged workqueue subsystem initialization and + * invoked as soon as the bare basics - memory allocation, cpumasks and idr are + * up. It sets up all the data structures and system workqueues and allows early + * boot code to create workqueues and queue/cancel work items. Actual work item + * execution starts only after kthreads can be created and scheduled right + * before early initcalls. */ void __init workqueue_init_early(void) { + struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM]; int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; - int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; int i, cpu; BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags)); + BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL)); + + cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); + restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ)); + restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN)); + if (!cpumask_empty(&wq_cmdline_cpumask)) + restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask); + + cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); + wq_update_pod_attrs_buf = alloc_workqueue_attrs(); + BUG_ON(!wq_update_pod_attrs_buf); + + /* initialize WQ_AFFN_SYSTEM pods */ + pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL); + pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL); + pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL); + BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod); + + BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE)); + + pt->nr_pods = 1; + cpumask_copy(pt->pod_cpus[0], cpu_possible_mask); + pt->pod_node[0] = NUMA_NO_NODE; + pt->cpu_pod[0] = 0; + /* initialize CPU pools */ for_each_possible_cpu(cpu) { struct worker_pool *pool; @@ -5954,7 +6648,9 @@ void __init workqueue_init_early(void) BUG_ON(init_worker_pool(pool)); pool->cpu = cpu; cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); + cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu)); pool->attrs->nice = std_nice[i++]; + pool->attrs->affn_strict = true; pool->node = cpu_to_node(cpu); /* alloc pool ID */ @@ -5975,11 +6671,10 @@ void __init workqueue_init_early(void) /* * An ordered wq should have only one pwq as ordering is * guaranteed by max_active which is enforced by pwqs. - * Turn off NUMA so that dfl_pwq is used for all nodes. */ BUG_ON(!(attrs = alloc_workqueue_attrs())); attrs->nice = std_nice[i]; - attrs->no_numa = true; + attrs->ordered = true; ordered_wq_attrs[i] = attrs; } @@ -5987,7 +6682,7 @@ void __init workqueue_init_early(void) system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); system_long_wq = alloc_workqueue("events_long", 0, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, - WQ_UNBOUND_MAX_ACTIVE); + WQ_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE, 0); system_power_efficient_wq = alloc_workqueue("events_power_efficient", @@ -6001,14 +6696,53 @@ void __init workqueue_init_early(void) !system_freezable_power_efficient_wq); } +static void __init wq_cpu_intensive_thresh_init(void) +{ + unsigned long thresh; + unsigned long bogo; + + pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release"); + BUG_ON(IS_ERR(pwq_release_worker)); + + /* if the user set it to a specific value, keep it */ + if (wq_cpu_intensive_thresh_us != ULONG_MAX) + return; + + /* + * The default of 10ms is derived from the fact that most modern (as of + * 2023) processors can do a lot in 10ms and that it's just below what + * most consider human-perceivable. However, the kernel also runs on a + * lot slower CPUs including microcontrollers where the threshold is way + * too low. + * + * Let's scale up the threshold upto 1 second if BogoMips is below 4000. + * This is by no means accurate but it doesn't have to be. The mechanism + * is still useful even when the threshold is fully scaled up. Also, as + * the reports would usually be applicable to everyone, some machines + * operating on longer thresholds won't significantly diminish their + * usefulness. + */ + thresh = 10 * USEC_PER_MSEC; + + /* see init/calibrate.c for lpj -> BogoMIPS calculation */ + bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1); + if (bogo < 4000) + thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC); + + pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n", + loops_per_jiffy, bogo, thresh); + + wq_cpu_intensive_thresh_us = thresh; +} + /** * workqueue_init - bring workqueue subsystem fully online * - * This is the latter half of two-staged workqueue subsystem initialization - * and invoked as soon as kthreads can be created and scheduled. - * Workqueues have been created and work items queued on them, but there - * are no kworkers executing the work items yet. Populate the worker pools - * with the initial workers and enable future kworker creations. + * This is the second step of three-staged workqueue subsystem initialization + * and invoked as soon as kthreads can be created and scheduled. Workqueues have + * been created and work items queued on them, but there are no kworkers + * executing the work items yet. Populate the worker pools with the initial + * workers and enable future kworker creations. */ void __init workqueue_init(void) { @@ -6016,19 +6750,14 @@ void __init workqueue_init(void) struct worker_pool *pool; int cpu, bkt; - /* - * It'd be simpler to initialize NUMA in workqueue_init_early() but - * CPU to node mapping may not be available that early on some - * archs such as power and arm64. As per-cpu pools created - * previously could be missing node hint and unbound pools NUMA - * affinity, fix them up. - * - * Also, while iterating workqueues, create rescuers if requested. - */ - wq_numa_init(); + wq_cpu_intensive_thresh_init(); mutex_lock(&wq_pool_mutex); + /* + * Per-cpu pools created earlier could be missing node hint. Fix them + * up. Also, create a rescuer for workqueues that requested it. + */ for_each_possible_cpu(cpu) { for_each_cpu_worker_pool(pool, cpu) { pool->node = cpu_to_node(cpu); @@ -6036,7 +6765,6 @@ void __init workqueue_init(void) } list_for_each_entry(wq, &workqueues, list) { - wq_update_unbound_numa(wq, smp_processor_id(), true); WARN(init_rescuer(wq), "workqueue: failed to create early rescuer for %s", wq->name); @@ -6058,3 +6786,116 @@ void __init workqueue_init(void) wq_online = true; wq_watchdog_init(); } + +/* + * Initialize @pt by first initializing @pt->cpu_pod[] with pod IDs according to + * @cpu_shares_pod(). Each subset of CPUs that share a pod is assigned a unique + * and consecutive pod ID. The rest of @pt is initialized accordingly. + */ +static void __init init_pod_type(struct wq_pod_type *pt, + bool (*cpus_share_pod)(int, int)) +{ + int cur, pre, cpu, pod; + + pt->nr_pods = 0; + + /* init @pt->cpu_pod[] according to @cpus_share_pod() */ + pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL); + BUG_ON(!pt->cpu_pod); + + for_each_possible_cpu(cur) { + for_each_possible_cpu(pre) { + if (pre >= cur) { + pt->cpu_pod[cur] = pt->nr_pods++; + break; + } + if (cpus_share_pod(cur, pre)) { + pt->cpu_pod[cur] = pt->cpu_pod[pre]; + break; + } + } + } + + /* init the rest to match @pt->cpu_pod[] */ + pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[0]), GFP_KERNEL); + pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[0]), GFP_KERNEL); + BUG_ON(!pt->pod_cpus || !pt->pod_node); + + for (pod = 0; pod < pt->nr_pods; pod++) + BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL)); + + for_each_possible_cpu(cpu) { + cpumask_set_cpu(cpu, pt->pod_cpus[pt->cpu_pod[cpu]]); + pt->pod_node[pt->cpu_pod[cpu]] = cpu_to_node(cpu); + } +} + +static bool __init cpus_dont_share(int cpu0, int cpu1) +{ + return false; +} + +static bool __init cpus_share_smt(int cpu0, int cpu1) +{ +#ifdef CONFIG_SCHED_SMT + return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1)); +#else + return false; +#endif +} + +static bool __init cpus_share_numa(int cpu0, int cpu1) +{ + return cpu_to_node(cpu0) == cpu_to_node(cpu1); +} + +/** + * workqueue_init_topology - initialize CPU pods for unbound workqueues + * + * This is the third step of there-staged workqueue subsystem initialization and + * invoked after SMP and topology information are fully initialized. It + * initializes the unbound CPU pods accordingly. + */ +void __init workqueue_init_topology(void) +{ + struct workqueue_struct *wq; + int cpu; + + init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share); + init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt); + init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache); + init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa); + + mutex_lock(&wq_pool_mutex); + + /* + * Workqueues allocated earlier would have all CPUs sharing the default + * worker pool. Explicitly call wq_update_pod() on all workqueue and CPU + * combinations to apply per-pod sharing. + */ + list_for_each_entry(wq, &workqueues, list) { + for_each_online_cpu(cpu) { + wq_update_pod(wq, cpu, cpu, true); + } + } + + mutex_unlock(&wq_pool_mutex); +} + +void __warn_flushing_systemwide_wq(void) +{ + pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n"); + dump_stack(); +} +EXPORT_SYMBOL(__warn_flushing_systemwide_wq); + +static int __init workqueue_unbound_cpus_setup(char *str) +{ + if (cpulist_parse(str, &wq_cmdline_cpumask) < 0) { + cpumask_clear(&wq_cmdline_cpumask); + pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n"); + } + + return 1; +} +__setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup); |