summaryrefslogtreecommitdiff
path: root/kernel/workqueue.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/workqueue.c')
-rw-r--r--kernel/workqueue.c837
1 files changed, 459 insertions, 378 deletions
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3fbaecfc88c2..253311af47c6 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -216,15 +216,15 @@ struct worker_pool {
struct worker *manager; /* L: purely informational */
struct list_head workers; /* A: attached workers */
- struct list_head dying_workers; /* A: workers about to die */
- struct completion *detach_completion; /* all workers detached */
struct ida worker_ida; /* worker IDs for task name */
struct workqueue_attrs *attrs; /* I: worker attributes */
struct hlist_node hash_node; /* PL: unbound_pool_hash node */
int refcnt; /* PL: refcnt for unbound pools */
-
+#ifdef CONFIG_PREEMPT_RT
+ spinlock_t cb_lock; /* BH worker cancel lock */
+#endif
/*
* Destruction of pool is RCU protected to allow dereferences
* from get_work_pool().
@@ -366,7 +366,8 @@ struct workqueue_struct {
#ifdef CONFIG_LOCKDEP
char *lock_name;
struct lock_class_key key;
- struct lockdep_map lockdep_map;
+ struct lockdep_map __lockdep_map;
+ struct lockdep_map *lockdep_map;
#endif
char name[WQ_NAME_LEN]; /* I: workqueue name */
@@ -379,7 +380,7 @@ struct workqueue_struct {
/* hot fields used during command issue, aligned to cacheline */
unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
- struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
+ struct pool_workqueue __rcu * __percpu *cpu_pwq; /* I: per-cpu pwqs */
struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */
};
@@ -436,7 +437,7 @@ static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;
/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
-static struct workqueue_attrs *wq_update_pod_attrs_buf;
+static struct workqueue_attrs *unbound_wq_update_pwq_attrs_buf;
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
@@ -447,6 +448,9 @@ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
+/* PL: mirror the cpu_online_mask excluding the CPU in the midst of hotplugging */
+static cpumask_var_t wq_online_cpumask;
+
/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;
@@ -475,16 +479,13 @@ static bool wq_debug_force_rr_cpu = false;
module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
/* to raise softirq for the BH worker pools on other CPUs */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS],
- bh_pool_irq_works);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS], bh_pool_irq_works);
/* the BH worker pools */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
- bh_worker_pools);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], bh_worker_pools);
/* the per-cpu worker pools */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
- cpu_worker_pools);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);
static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */
@@ -506,12 +507,16 @@ static struct kthread_worker *pwq_release_worker __ro_after_init;
struct workqueue_struct *system_wq __ro_after_init;
EXPORT_SYMBOL(system_wq);
+struct workqueue_struct *system_percpu_wq __ro_after_init;
+EXPORT_SYMBOL(system_percpu_wq);
struct workqueue_struct *system_highpri_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_unbound_wq);
+struct workqueue_struct *system_dfl_wq __ro_after_init;
+EXPORT_SYMBOL_GPL(system_dfl_wq);
struct workqueue_struct *system_freezable_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_wq);
struct workqueue_struct *system_power_efficient_wq __ro_after_init;
@@ -536,12 +541,6 @@ static void show_one_worker_pool(struct worker_pool *pool);
!lockdep_is_held(&wq_pool_mutex), \
"RCU or wq_pool_mutex should be held")
-#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
- RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() && \
- !lockdep_is_held(&wq->mutex) && \
- !lockdep_is_held(&wq_pool_mutex), \
- "RCU, wq->mutex or wq_pool_mutex should be held")
-
#define for_each_bh_worker_pool(pool, cpu) \
for ((pool) = &per_cpu(bh_worker_pools, cpu)[0]; \
(pool) < &per_cpu(bh_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
@@ -687,7 +686,7 @@ EXPORT_SYMBOL_GPL(destroy_work_on_stack);
void destroy_delayed_work_on_stack(struct delayed_work *work)
{
- destroy_timer_on_stack(&work->timer);
+ timer_destroy_on_stack(&work->timer);
debug_object_free(&work->work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);
@@ -896,7 +895,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work)
static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits)
{
- return (v >> shift) & ((1 << bits) - 1);
+ return (v >> shift) & ((1U << bits) - 1);
}
static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data)
@@ -1684,47 +1683,17 @@ static void __pwq_activate_work(struct pool_workqueue *pwq,
__clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);
}
-/**
- * pwq_activate_work - Activate a work item if inactive
- * @pwq: pool_workqueue @work belongs to
- * @work: work item to activate
- *
- * Returns %true if activated. %false if already active.
- */
-static bool pwq_activate_work(struct pool_workqueue *pwq,
- struct work_struct *work)
-{
- struct worker_pool *pool = pwq->pool;
- struct wq_node_nr_active *nna;
-
- lockdep_assert_held(&pool->lock);
-
- if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE))
- return false;
-
- nna = wq_node_nr_active(pwq->wq, pool->node);
- if (nna)
- atomic_inc(&nna->nr);
-
- pwq->nr_active++;
- __pwq_activate_work(pwq, work);
- return true;
-}
-
static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
{
int max = READ_ONCE(nna->max);
+ int old = atomic_read(&nna->nr);
- while (true) {
- int old, tmp;
-
- old = atomic_read(&nna->nr);
+ do {
if (old >= max)
return false;
- tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1);
- if (tmp == old)
- return true;
- }
+ } while (!atomic_try_cmpxchg_relaxed(&nna->nr, &old, old + 1));
+
+ return true;
}
/**
@@ -2085,11 +2054,11 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags,
struct delayed_work *dwork = to_delayed_work(work);
/*
- * dwork->timer is irqsafe. If del_timer() fails, it's
+ * dwork->timer is irqsafe. If timer_delete() fails, it's
* guaranteed that the timer is not queued anywhere and not
* running on the local CPU.
*/
- if (likely(del_timer(&dwork->timer)))
+ if (likely(timer_delete(&dwork->timer)))
return 1;
}
@@ -2117,7 +2086,7 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags,
*/
pwq = get_work_pwq(work);
if (pwq && pwq->pool == pool) {
- unsigned long work_data;
+ unsigned long work_data = *work_data_bits(work);
debug_work_deactivate(work);
@@ -2126,13 +2095,17 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags,
* pwq->inactive_works since a queued barrier can't be
* canceled (see the comments in insert_wq_barrier()).
*
- * An inactive work item cannot be grabbed directly because
+ * An inactive work item cannot be deleted directly because
* it might have linked barrier work items which, if left
* on the inactive_works list, will confuse pwq->nr_active
- * management later on and cause stall. Make sure the work
- * item is activated before grabbing.
+ * management later on and cause stall. Move the linked
+ * barrier work items to the worklist when deleting the grabbed
+ * item. Also keep WORK_STRUCT_INACTIVE in work_data, so that
+ * it doesn't participate in nr_active management in later
+ * pwq_dec_nr_in_flight().
*/
- pwq_activate_work(pwq, work);
+ if (work_data & WORK_STRUCT_INACTIVE)
+ move_linked_works(work, &pwq->pool->worklist, NULL);
list_del_init(&work->entry);
@@ -2140,7 +2113,6 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags,
* work->data points to pwq iff queued. Let's point to pool. As
* this destroys work->data needed by the next step, stash it.
*/
- work_data = *work_data_bits(work);
set_work_pool_and_keep_pending(work, pool->id,
pool_offq_flags(pool));
@@ -2205,7 +2177,7 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
debug_work_activate(work);
/* record the work call stack in order to print it in KASAN reports */
- kasan_record_aux_stack_noalloc(work);
+ kasan_record_aux_stack(work);
/* we own @work, set data and link */
set_work_pwq(work, pwq, extra_flags);
@@ -2246,12 +2218,9 @@ static int wq_select_unbound_cpu(int cpu)
}
new_cpu = __this_cpu_read(wq_rr_cpu_last);
- new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
- if (unlikely(new_cpu >= nr_cpu_ids)) {
- new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
- if (unlikely(new_cpu >= nr_cpu_ids))
- return cpu;
- }
+ new_cpu = cpumask_next_and_wrap(new_cpu, wq_unbound_cpumask, cpu_online_mask);
+ if (unlikely(new_cpu >= nr_cpu_ids))
+ return cpu;
__this_cpu_write(wq_rr_cpu_last, new_cpu);
return new_cpu;
@@ -2279,8 +2248,10 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
* queues a new work item to a wq after destroy_workqueue(wq).
*/
if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
- WARN_ON_ONCE(!is_chained_work(wq))))
+ WARN_ONCE(!is_chained_work(wq), "workqueue: cannot queue %ps on wq %s\n",
+ work->func, wq->name))) {
return;
+ }
rcu_read_lock();
retry:
/* pwq which will be used unless @work is executing elsewhere */
@@ -2298,9 +2269,13 @@ retry:
* If @work was previously on a different pool, it might still be
* running there, in which case the work needs to be queued on that
* pool to guarantee non-reentrancy.
+ *
+ * For ordered workqueue, work items must be queued on the newest pwq
+ * for accurate order management. Guaranteed order also guarantees
+ * non-reentrancy. See the comments above unplug_oldest_pwq().
*/
last_pool = get_work_pool(work);
- if (last_pool && last_pool != pool) {
+ if (last_pool && last_pool != pool && !(wq->flags & __WQ_ORDERED)) {
struct worker *worker;
raw_spin_lock(&last_pool->lock);
@@ -2500,7 +2475,7 @@ EXPORT_SYMBOL_GPL(queue_work_node);
void delayed_work_timer_fn(struct timer_list *t)
{
- struct delayed_work *dwork = from_timer(dwork, t, timer);
+ struct delayed_work *dwork = timer_container_of(dwork, t, timer);
/* should have been called from irqsafe timer with irq already off */
__queue_work(dwork->cpu, dwork->wq, &dwork->work);
@@ -2529,6 +2504,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
return;
}
+ WARN_ON_ONCE(cpu != WORK_CPU_UNBOUND && !cpu_online(cpu));
dwork->wq = wq;
dwork->cpu = cpu;
timer->expires = jiffies + delay;
@@ -2554,6 +2530,12 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
* @dwork: work to queue
* @delay: number of jiffies to wait before queueing
*
+ * We queue the delayed_work to a specific CPU, for non-zero delays the
+ * caller must ensure it is online and can't go away. Callers that fail
+ * to ensure this, may get @dwork->timer queued to an offlined CPU and
+ * this will prevent queueing of @dwork->work unless the offlined CPU
+ * becomes online again.
+ *
* Return: %false if @work was already on a queue, %true otherwise. If
* @delay is zero and @dwork is idle, it will be scheduled for immediate
* execution.
@@ -2710,6 +2692,26 @@ static void worker_attach_to_pool(struct worker *worker,
mutex_unlock(&wq_pool_attach_mutex);
}
+static void unbind_worker(struct worker *worker)
+{
+ lockdep_assert_held(&wq_pool_attach_mutex);
+
+ kthread_set_per_cpu(worker->task, -1);
+ if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
+ else
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+}
+
+
+static void detach_worker(struct worker *worker)
+{
+ lockdep_assert_held(&wq_pool_attach_mutex);
+
+ unbind_worker(worker);
+ list_del(&worker->node);
+}
+
/**
* worker_detach_from_pool() - detach a worker from its pool
* @worker: worker which is attached to its pool
@@ -2721,26 +2723,17 @@ static void worker_attach_to_pool(struct worker *worker,
static void worker_detach_from_pool(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- struct completion *detach_completion = NULL;
/* there is one permanent BH worker per CPU which should never detach */
WARN_ON_ONCE(pool->flags & POOL_BH);
mutex_lock(&wq_pool_attach_mutex);
-
- kthread_set_per_cpu(worker->task, -1);
- list_del(&worker->node);
+ detach_worker(worker);
worker->pool = NULL;
-
- if (list_empty(&pool->workers) && list_empty(&pool->dying_workers))
- detach_completion = pool->detach_completion;
mutex_unlock(&wq_pool_attach_mutex);
/* clear leftover flags without pool->lock after it is detached */
worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
-
- if (detach_completion)
- complete(detach_completion);
}
static int format_worker_id(char *buf, size_t size, struct worker *worker,
@@ -2844,35 +2837,22 @@ fail:
return NULL;
}
-static void unbind_worker(struct worker *worker)
+static void detach_dying_workers(struct list_head *cull_list)
{
- lockdep_assert_held(&wq_pool_attach_mutex);
+ struct worker *worker;
- kthread_set_per_cpu(worker->task, -1);
- if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
- else
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+ list_for_each_entry(worker, cull_list, entry)
+ detach_worker(worker);
}
-static void wake_dying_workers(struct list_head *cull_list)
+static void reap_dying_workers(struct list_head *cull_list)
{
struct worker *worker, *tmp;
list_for_each_entry_safe(worker, tmp, cull_list, entry) {
list_del_init(&worker->entry);
- unbind_worker(worker);
- /*
- * If the worker was somehow already running, then it had to be
- * in pool->idle_list when set_worker_dying() happened or we
- * wouldn't have gotten here.
- *
- * Thus, the worker must either have observed the WORKER_DIE
- * flag, or have set its state to TASK_IDLE. Either way, the
- * below will be observed by the worker and is safe to do
- * outside of pool->lock.
- */
- wake_up_process(worker->task);
+ kthread_stop_put(worker->task);
+ kfree(worker);
}
}
@@ -2906,7 +2886,9 @@ static void set_worker_dying(struct worker *worker, struct list_head *list)
worker->flags |= WORKER_DIE;
list_move(&worker->entry, list);
- list_move(&worker->node, &pool->dying_workers);
+
+ /* get an extra task struct reference for later kthread_stop_put() */
+ get_task_struct(worker->task);
}
/**
@@ -2921,7 +2903,7 @@ static void set_worker_dying(struct worker *worker, struct list_head *list)
*/
static void idle_worker_timeout(struct timer_list *t)
{
- struct worker_pool *pool = from_timer(pool, t, idle_timer);
+ struct worker_pool *pool = timer_container_of(pool, t, idle_timer);
bool do_cull = false;
if (work_pending(&pool->idle_cull_work))
@@ -2944,7 +2926,7 @@ static void idle_worker_timeout(struct timer_list *t)
raw_spin_unlock_irq(&pool->lock);
if (do_cull)
- queue_work(system_unbound_wq, &pool->idle_cull_work);
+ queue_work(system_dfl_wq, &pool->idle_cull_work);
}
/**
@@ -2965,9 +2947,9 @@ static void idle_cull_fn(struct work_struct *work)
/*
* Grabbing wq_pool_attach_mutex here ensures an already-running worker
- * cannot proceed beyong worker_detach_from_pool() in its self-destruct
- * path. This is required as a previously-preempted worker could run after
- * set_worker_dying() has happened but before wake_dying_workers() did.
+ * cannot proceed beyong set_pf_worker() in its self-destruct path.
+ * This is required as a previously-preempted worker could run after
+ * set_worker_dying() has happened but before detach_dying_workers() did.
*/
mutex_lock(&wq_pool_attach_mutex);
raw_spin_lock_irq(&pool->lock);
@@ -2988,8 +2970,10 @@ static void idle_cull_fn(struct work_struct *work)
}
raw_spin_unlock_irq(&pool->lock);
- wake_dying_workers(&cull_list);
+ detach_dying_workers(&cull_list);
mutex_unlock(&wq_pool_attach_mutex);
+
+ reap_dying_workers(&cull_list);
}
static void send_mayday(struct work_struct *work)
@@ -3018,7 +3002,7 @@ static void send_mayday(struct work_struct *work)
static void pool_mayday_timeout(struct timer_list *t)
{
- struct worker_pool *pool = from_timer(pool, t, mayday_timer);
+ struct worker_pool *pool = timer_container_of(pool, t, mayday_timer);
struct work_struct *work;
raw_spin_lock_irq(&pool->lock);
@@ -3079,7 +3063,7 @@ restart:
break;
}
- del_timer_sync(&pool->mayday_timer);
+ timer_delete_sync(&pool->mayday_timer);
raw_spin_lock_irq(&pool->lock);
/*
* This is necessary even after a new worker was just successfully
@@ -3090,6 +3074,31 @@ restart:
goto restart;
}
+#ifdef CONFIG_PREEMPT_RT
+static void worker_lock_callback(struct worker_pool *pool)
+{
+ spin_lock(&pool->cb_lock);
+}
+
+static void worker_unlock_callback(struct worker_pool *pool)
+{
+ spin_unlock(&pool->cb_lock);
+}
+
+static void workqueue_callback_cancel_wait_running(struct worker_pool *pool)
+{
+ spin_lock(&pool->cb_lock);
+ spin_unlock(&pool->cb_lock);
+}
+
+#else
+
+static void worker_lock_callback(struct worker_pool *pool) { }
+static void worker_unlock_callback(struct worker_pool *pool) { }
+static void workqueue_callback_cancel_wait_running(struct worker_pool *pool) { }
+
+#endif
+
/**
* manage_workers - manage worker pool
* @worker: self
@@ -3220,7 +3229,7 @@ __acquires(&pool->lock)
lockdep_start_depth = lockdep_depth(current);
/* see drain_dead_softirq_workfn() */
if (!bh_draining)
- lock_map_acquire(&pwq->wq->lockdep_map);
+ lock_map_acquire(pwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
/*
* Strictly speaking we should mark the invariant state without holding
@@ -3251,10 +3260,10 @@ __acquires(&pool->lock)
* point will only record its address.
*/
trace_workqueue_execute_end(work, worker->current_func);
- pwq->stats[PWQ_STAT_COMPLETED]++;
+
lock_map_release(&lockdep_map);
if (!bh_draining)
- lock_map_release(&pwq->wq->lockdep_map);
+ lock_map_release(pwq->wq->lockdep_map);
if (unlikely((worker->task && in_atomic()) ||
lockdep_depth(current) != lockdep_start_depth ||
@@ -3282,6 +3291,8 @@ __acquires(&pool->lock)
raw_spin_lock_irq(&pool->lock);
+ pwq->stats[PWQ_STAT_COMPLETED]++;
+
/*
* In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
* CPU intensive by wq_worker_tick() if @work hogged CPU longer than
@@ -3366,11 +3377,12 @@ woke_up:
if (unlikely(worker->flags & WORKER_DIE)) {
raw_spin_unlock_irq(&pool->lock);
set_pf_worker(false);
-
+ /*
+ * The worker is dead and PF_WQ_WORKER is cleared, worker->pool
+ * shouldn't be accessed, reset it to NULL in case otherwise.
+ */
+ worker->pool = NULL;
ida_free(&pool->worker_ida, worker->id);
- worker_detach_from_pool(worker);
- WARN_ON_ONCE(!list_empty(&worker->entry));
- kfree(worker);
return 0;
}
@@ -3425,6 +3437,27 @@ sleep:
goto woke_up;
}
+static bool assign_rescuer_work(struct pool_workqueue *pwq, struct worker *rescuer)
+{
+ struct worker_pool *pool = pwq->pool;
+ struct work_struct *work, *n;
+
+ /* need rescue? */
+ if (!pwq->nr_active || !need_to_create_worker(pool))
+ return false;
+
+ /*
+ * Slurp in all works issued via this workqueue and
+ * process'em.
+ */
+ list_for_each_entry_safe(work, n, &pool->worklist, entry) {
+ if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n))
+ pwq->stats[PWQ_STAT_RESCUED]++;
+ }
+
+ return !list_empty(&rescuer->scheduled);
+}
+
/**
* rescuer_thread - the rescuer thread function
* @__rescuer: self
@@ -3479,7 +3512,6 @@ repeat:
struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
struct pool_workqueue, mayday_node);
struct worker_pool *pool = pwq->pool;
- struct work_struct *work, *n;
__set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node);
@@ -3490,18 +3522,9 @@ repeat:
raw_spin_lock_irq(&pool->lock);
- /*
- * Slurp in all works issued via this workqueue and
- * process'em.
- */
WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
- list_for_each_entry_safe(work, n, &pool->worklist, entry) {
- if (get_work_pwq(work) == pwq &&
- assign_work(work, rescuer, &n))
- pwq->stats[PWQ_STAT_RESCUED]++;
- }
- if (!list_empty(&rescuer->scheduled)) {
+ if (assign_rescuer_work(pwq, rescuer)) {
process_scheduled_works(rescuer);
/*
@@ -3516,10 +3539,9 @@ repeat:
if (pwq->nr_active && need_to_create_worker(pool)) {
raw_spin_lock(&wq_mayday_lock);
/*
- * Queue iff we aren't racing destruction
- * and somebody else hasn't queued it already.
+ * Queue iff somebody else hasn't queued it already.
*/
- if (wq->rescuer && list_empty(&pwq->mayday_node)) {
+ if (list_empty(&pwq->mayday_node)) {
get_pwq(pwq);
list_add_tail(&pwq->mayday_node, &wq->maydays);
}
@@ -3528,12 +3550,6 @@ repeat:
}
/*
- * Put the reference grabbed by send_mayday(). @pool won't
- * go away while we're still attached to it.
- */
- put_pwq(pwq);
-
- /*
* Leave this pool. Notify regular workers; otherwise, we end up
* with 0 concurrency and stalling the execution.
*/
@@ -3543,6 +3559,12 @@ repeat:
worker_detach_from_pool(rescuer);
+ /*
+ * Put the reference grabbed by send_mayday(). @pool might
+ * go away any time after it.
+ */
+ put_pwq_unlocked(pwq);
+
raw_spin_lock_irq(&wq_mayday_lock);
}
@@ -3566,6 +3588,7 @@ static void bh_worker(struct worker *worker)
int nr_restarts = BH_WORKER_RESTARTS;
unsigned long end = jiffies + BH_WORKER_JIFFIES;
+ worker_lock_callback(pool);
raw_spin_lock_irq(&pool->lock);
worker_leave_idle(worker);
@@ -3594,6 +3617,7 @@ done:
worker_enter_idle(worker);
kick_pool(pool);
raw_spin_unlock_irq(&pool->lock);
+ worker_unlock_callback(pool);
}
/*
@@ -3698,23 +3722,27 @@ void workqueue_softirq_dead(unsigned int cpu)
* check_flush_dependency - check for flush dependency sanity
* @target_wq: workqueue being flushed
* @target_work: work item being flushed (NULL for workqueue flushes)
+ * @from_cancel: are we called from the work cancel path
*
* %current is trying to flush the whole @target_wq or @target_work on it.
- * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not
- * reclaiming memory or running on a workqueue which doesn't have
- * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to
- * a deadlock.
+ * If this is not the cancel path (which implies work being flushed is either
+ * already running, or will not be at all), check if @target_wq doesn't have
+ * %WQ_MEM_RECLAIM and verify that %current is not reclaiming memory or running
+ * on a workqueue which doesn't have %WQ_MEM_RECLAIM as that can break forward-
+ * progress guarantee leading to a deadlock.
*/
static void check_flush_dependency(struct workqueue_struct *target_wq,
- struct work_struct *target_work)
+ struct work_struct *target_work,
+ bool from_cancel)
{
- work_func_t target_func = target_work ? target_work->func : NULL;
+ work_func_t target_func;
struct worker *worker;
- if (target_wq->flags & WQ_MEM_RECLAIM)
+ if (from_cancel || target_wq->flags & WQ_MEM_RECLAIM)
return;
worker = current_wq_worker();
+ target_func = target_work ? target_work->func : NULL;
WARN_ONCE(current->flags & PF_MEMALLOC,
"workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps",
@@ -3851,16 +3879,28 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
{
bool wait = false;
struct pool_workqueue *pwq;
+ struct worker_pool *current_pool = NULL;
if (flush_color >= 0) {
WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
atomic_set(&wq->nr_pwqs_to_flush, 1);
}
+ /*
+ * For unbound workqueue, pwqs will map to only a few pools.
+ * Most of the time, pwqs within the same pool will be linked
+ * sequentially to wq->pwqs by cpu index. So in the majority
+ * of pwq iters, the pool is the same, only doing lock/unlock
+ * if the pool has changed. This can largely reduce expensive
+ * lock operations.
+ */
for_each_pwq(pwq, wq) {
- struct worker_pool *pool = pwq->pool;
-
- raw_spin_lock_irq(&pool->lock);
+ if (current_pool != pwq->pool) {
+ if (likely(current_pool))
+ raw_spin_unlock_irq(&current_pool->lock);
+ current_pool = pwq->pool;
+ raw_spin_lock_irq(&current_pool->lock);
+ }
if (flush_color >= 0) {
WARN_ON_ONCE(pwq->flush_color != -1);
@@ -3877,9 +3917,11 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
pwq->work_color = work_color;
}
- raw_spin_unlock_irq(&pool->lock);
}
+ if (current_pool)
+ raw_spin_unlock_irq(&current_pool->lock);
+
if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
complete(&wq->first_flusher->done);
@@ -3889,11 +3931,14 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
static void touch_wq_lockdep_map(struct workqueue_struct *wq)
{
#ifdef CONFIG_LOCKDEP
+ if (unlikely(!wq->lockdep_map))
+ return;
+
if (wq->flags & WQ_BH)
local_bh_disable();
- lock_map_acquire(&wq->lockdep_map);
- lock_map_release(&wq->lockdep_map);
+ lock_map_acquire(wq->lockdep_map);
+ lock_map_release(wq->lockdep_map);
if (wq->flags & WQ_BH)
local_bh_enable();
@@ -3927,7 +3972,7 @@ void __flush_workqueue(struct workqueue_struct *wq)
struct wq_flusher this_flusher = {
.list = LIST_HEAD_INIT(this_flusher.list),
.flush_color = -1,
- .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map),
+ .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, (*wq->lockdep_map)),
};
int next_color;
@@ -3981,7 +4026,7 @@ void __flush_workqueue(struct workqueue_struct *wq)
list_add_tail(&this_flusher.list, &wq->flusher_overflow);
}
- check_flush_dependency(wq, NULL);
+ check_flush_dependency(wq, NULL, false);
mutex_unlock(&wq->mutex);
@@ -4156,7 +4201,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
}
wq = pwq->wq;
- check_flush_dependency(wq, work);
+ check_flush_dependency(wq, work, from_cancel);
insert_wq_barrier(pwq, barr, work, worker);
raw_spin_unlock_irq(&pool->lock);
@@ -4186,7 +4231,6 @@ already_gone:
static bool __flush_work(struct work_struct *work, bool from_cancel)
{
struct wq_barrier barr;
- unsigned long data;
if (WARN_ON(!wq_online))
return false;
@@ -4204,29 +4248,35 @@ static bool __flush_work(struct work_struct *work, bool from_cancel)
* was queued on a BH workqueue, we also know that it was running in the
* BH context and thus can be busy-waited.
*/
- data = *work_data_bits(work);
- if (from_cancel &&
- !WARN_ON_ONCE(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_BH)) {
- /*
- * On RT, prevent a live lock when %current preempted soft
- * interrupt processing or prevents ksoftirqd from running by
- * keeping flipping BH. If the BH work item runs on a different
- * CPU then this has no effect other than doing the BH
- * disable/enable dance for nothing. This is copied from
- * kernel/softirq.c::tasklet_unlock_spin_wait().
- */
- while (!try_wait_for_completion(&barr.done)) {
- if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
- local_bh_disable();
- local_bh_enable();
- } else {
- cpu_relax();
+ if (from_cancel) {
+ unsigned long data = *work_data_bits(work);
+
+ if (!WARN_ON_ONCE(data & WORK_STRUCT_PWQ) &&
+ (data & WORK_OFFQ_BH)) {
+ /*
+ * On RT, prevent a live lock when %current preempted
+ * soft interrupt processing by blocking on lock which
+ * is owned by the thread invoking the callback.
+ */
+ while (!try_wait_for_completion(&barr.done)) {
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ struct worker_pool *pool;
+
+ guard(rcu)();
+ pool = get_work_pool(work);
+ if (pool)
+ workqueue_callback_cancel_wait_running(pool);
+ } else {
+ cpu_relax();
+ }
}
+ goto out_destroy;
}
- } else {
- wait_for_completion(&barr.done);
}
+ wait_for_completion(&barr.done);
+
+out_destroy:
destroy_work_on_stack(&barr.work);
return true;
}
@@ -4264,7 +4314,7 @@ EXPORT_SYMBOL_GPL(flush_work);
bool flush_delayed_work(struct delayed_work *dwork)
{
local_irq_disable();
- if (del_timer_sync(&dwork->timer))
+ if (timer_delete_sync(&dwork->timer))
__queue_work(dwork->cpu, dwork->wq, &dwork->work);
local_irq_enable();
return flush_work(&dwork->work);
@@ -4610,7 +4660,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
*
* Return: The allocated new workqueue_attr on success. %NULL on failure.
*/
-struct workqueue_attrs *alloc_workqueue_attrs(void)
+struct workqueue_attrs *alloc_workqueue_attrs_noprof(void)
{
struct workqueue_attrs *attrs;
@@ -4761,11 +4811,13 @@ static int init_worker_pool(struct worker_pool *pool)
timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
INIT_LIST_HEAD(&pool->workers);
- INIT_LIST_HEAD(&pool->dying_workers);
ida_init(&pool->worker_ida);
INIT_HLIST_NODE(&pool->hash_node);
pool->refcnt = 1;
+#ifdef CONFIG_PREEMPT_RT
+ spin_lock_init(&pool->cb_lock);
+#endif
/* shouldn't fail above this point */
pool->attrs = alloc_workqueue_attrs();
@@ -4788,16 +4840,23 @@ static void wq_init_lockdep(struct workqueue_struct *wq)
lock_name = wq->name;
wq->lock_name = lock_name;
- lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0);
+ wq->lockdep_map = &wq->__lockdep_map;
+ lockdep_init_map(wq->lockdep_map, lock_name, &wq->key, 0);
}
static void wq_unregister_lockdep(struct workqueue_struct *wq)
{
+ if (wq->lockdep_map != &wq->__lockdep_map)
+ return;
+
lockdep_unregister_key(&wq->key);
}
static void wq_free_lockdep(struct workqueue_struct *wq)
{
+ if (wq->lockdep_map != &wq->__lockdep_map)
+ return;
+
if (wq->lock_name != wq->name)
kfree(wq->lock_name);
}
@@ -4903,7 +4962,6 @@ static void rcu_free_pool(struct rcu_head *rcu)
*/
static void put_unbound_pool(struct worker_pool *pool)
{
- DECLARE_COMPLETION_ONSTACK(detach_completion);
struct worker *worker;
LIST_HEAD(cull_list);
@@ -4955,19 +5013,16 @@ static void put_unbound_pool(struct worker_pool *pool)
WARN_ON(pool->nr_workers || pool->nr_idle);
raw_spin_unlock_irq(&pool->lock);
- wake_dying_workers(&cull_list);
+ detach_dying_workers(&cull_list);
- if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers))
- pool->detach_completion = &detach_completion;
mutex_unlock(&wq_pool_attach_mutex);
- if (pool->detach_completion)
- wait_for_completion(pool->detach_completion);
+ reap_dying_workers(&cull_list);
/* shut down the timers */
- del_timer_sync(&pool->idle_timer);
+ timer_delete_sync(&pool->idle_timer);
cancel_work_sync(&pool->idle_cull_work);
- del_timer_sync(&pool->mayday_timer);
+ timer_delete_sync(&pool->mayday_timer);
/* RCU protected to allow dereferences from get_work_pool() */
call_rcu(&pool->rcu, rcu_free_pool);
@@ -5038,12 +5093,6 @@ fail:
return NULL;
}
-static void rcu_free_pwq(struct rcu_head *rcu)
-{
- kmem_cache_free(pwq_cache,
- container_of(rcu, struct pool_workqueue, rcu));
-}
-
/*
* Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero
* refcnt and needs to be destroyed.
@@ -5089,7 +5138,7 @@ static void pwq_release_workfn(struct kthread_work *work)
raw_spin_unlock_irq(&nna->lock);
}
- call_rcu(&pwq->rcu, rcu_free_pwq);
+ kfree_rcu(pwq, rcu);
/*
* If we're the last pwq going away, @wq is already dead and no one
@@ -5161,14 +5210,22 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
return pwq;
}
+static void apply_wqattrs_lock(void)
+{
+ mutex_lock(&wq_pool_mutex);
+}
+
+static void apply_wqattrs_unlock(void)
+{
+ mutex_unlock(&wq_pool_mutex);
+}
+
/**
* wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod
* @attrs: the wq_attrs of the default pwq of the target workqueue
* @cpu: the target CPU
- * @cpu_going_down: if >= 0, the CPU to consider as offline
*
- * Calculate the cpumask a workqueue with @attrs should use on @pod. If
- * @cpu_going_down is >= 0, that cpu is considered offline during calculation.
+ * Calculate the cpumask a workqueue with @attrs should use on @pod.
* The result is stored in @attrs->__pod_cpumask.
*
* If pod affinity is not enabled, @attrs->cpumask is always used. If enabled
@@ -5177,29 +5234,18 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
*
* The caller is responsible for ensuring that the cpumask of @pod stays stable.
*/
-static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu,
- int cpu_going_down)
+static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu)
{
const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
int pod = pt->cpu_pod[cpu];
- /* does @pod have any online CPUs @attrs wants? */
+ /* calculate possible CPUs in @pod that @attrs wants */
cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask);
- cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask);
- if (cpu_going_down >= 0)
- cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask);
-
- if (cpumask_empty(attrs->__pod_cpumask)) {
+ /* does @pod have any online CPUs @attrs wants? */
+ if (!cpumask_intersects(attrs->__pod_cpumask, wq_online_cpumask)) {
cpumask_copy(attrs->__pod_cpumask, attrs->cpumask);
return;
}
-
- /* yeap, return possible CPUs in @pod that @attrs wants */
- cpumask_and(attrs->__pod_cpumask, attrs->cpumask, pt->pod_cpus[pod]);
-
- if (cpumask_empty(attrs->__pod_cpumask))
- pr_warn_once("WARNING: workqueue cpumask: online intersect > "
- "possible intersect\n");
}
/* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */
@@ -5284,7 +5330,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
ctx->dfl_pwq->refcnt++;
ctx->pwq_tbl[cpu] = ctx->dfl_pwq;
} else {
- wq_calc_pod_cpumask(new_attrs, cpu, -1);
+ wq_calc_pod_cpumask(new_attrs, cpu);
ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs);
if (!ctx->pwq_tbl[cpu])
goto out_free;
@@ -5334,11 +5380,6 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
/* update node_nr_active->max */
wq_update_node_max_active(ctx->wq, -1);
- /* rescuer needs to respect wq cpumask changes */
- if (ctx->wq->rescuer)
- set_cpus_allowed_ptr(ctx->wq->rescuer->task,
- unbound_effective_cpumask(ctx->wq));
-
mutex_unlock(&ctx->wq->mutex);
}
@@ -5375,8 +5416,6 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
*
* Performs GFP_KERNEL allocations.
*
- * Assumes caller has CPU hotplug read exclusion, i.e. cpus_read_lock().
- *
* Return: 0 on success and -errno on failure.
*/
int apply_workqueue_attrs(struct workqueue_struct *wq,
@@ -5384,8 +5423,6 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
{
int ret;
- lockdep_assert_cpus_held();
-
mutex_lock(&wq_pool_mutex);
ret = apply_workqueue_attrs_locked(wq, attrs);
mutex_unlock(&wq_pool_mutex);
@@ -5394,15 +5431,12 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
}
/**
- * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug
+ * unbound_wq_update_pwq - update a pwq slot for CPU hot[un]plug
* @wq: the target workqueue
- * @cpu: the CPU to update pool association for
- * @hotplug_cpu: the CPU coming up or going down
- * @online: whether @cpu is coming up or going down
+ * @cpu: the CPU to update the pwq slot for
*
* This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
- * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update pod affinity of
- * @wq accordingly.
+ * %CPU_DOWN_FAILED. @cpu is in the same pod of the CPU being hot[un]plugged.
*
*
* If pod affinity can't be adjusted due to memory allocation failure, it falls
@@ -5415,10 +5449,8 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
* CPU_DOWN. If a workqueue user wants strict affinity, it's the user's
* responsibility to flush the work item from CPU_DOWN_PREPARE.
*/
-static void wq_update_pod(struct workqueue_struct *wq, int cpu,
- int hotplug_cpu, bool online)
+static void unbound_wq_update_pwq(struct workqueue_struct *wq, int cpu)
{
- int off_cpu = online ? -1 : hotplug_cpu;
struct pool_workqueue *old_pwq = NULL, *pwq;
struct workqueue_attrs *target_attrs;
@@ -5432,13 +5464,13 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu,
* Let's use a preallocated one. The following buf is protected by
* CPU hotplug exclusion.
*/
- target_attrs = wq_update_pod_attrs_buf;
+ target_attrs = unbound_wq_update_pwq_attrs_buf;
copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask);
/* nothing to do if the target cpumask matches the current pwq */
- wq_calc_pod_cpumask(target_attrs, cpu, off_cpu);
+ wq_calc_pod_cpumask(target_attrs, cpu);
if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs))
return;
@@ -5472,21 +5504,24 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
bool highpri = wq->flags & WQ_HIGHPRI;
int cpu, ret;
+ lockdep_assert_held(&wq_pool_mutex);
+
wq->cpu_pwq = alloc_percpu(struct pool_workqueue *);
if (!wq->cpu_pwq)
goto enomem;
if (!(wq->flags & WQ_UNBOUND)) {
+ struct worker_pool __percpu *pools;
+
+ if (wq->flags & WQ_BH)
+ pools = bh_worker_pools;
+ else
+ pools = cpu_worker_pools;
+
for_each_possible_cpu(cpu) {
struct pool_workqueue **pwq_p;
- struct worker_pool __percpu *pools;
struct worker_pool *pool;
- if (wq->flags & WQ_BH)
- pools = bh_worker_pools;
- else
- pools = cpu_worker_pools;
-
pool = &(per_cpu_ptr(pools, cpu)[highpri]);
pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu);
@@ -5504,26 +5539,18 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
return 0;
}
- cpus_read_lock();
if (wq->flags & __WQ_ORDERED) {
struct pool_workqueue *dfl_pwq;
- ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
+ ret = apply_workqueue_attrs_locked(wq, ordered_wq_attrs[highpri]);
/* there should only be single pwq for ordering guarantee */
dfl_pwq = rcu_access_pointer(wq->dfl_pwq);
WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node ||
wq->pwqs.prev != &dfl_pwq->pwqs_node),
"ordering guarantee broken for workqueue %s\n", wq->name);
} else {
- ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
+ ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]);
}
- cpus_read_unlock();
-
- /* for unbound pwq, flush the pwq_release_worker ensures that the
- * pwq_release_workfn() completes before calling kfree(wq).
- */
- if (ret)
- kthread_flush_worker(pwq_release_worker);
return ret;
@@ -5561,6 +5588,8 @@ static int init_rescuer(struct workqueue_struct *wq)
char id_buf[WORKER_ID_LEN];
int ret;
+ lockdep_assert_held(&wq_pool_mutex);
+
if (!(wq->flags & WQ_MEM_RECLAIM))
return 0;
@@ -5584,10 +5613,13 @@ static int init_rescuer(struct workqueue_struct *wq)
}
wq->rescuer = rescuer;
- if (wq->flags & WQ_UNBOUND)
+
+ /* initial cpumask is consistent with the detached rescuer and unbind_worker() */
+ if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
kthread_bind_mask(rescuer->task, wq_unbound_cpumask);
else
kthread_bind_mask(rescuer->task, cpu_possible_mask);
+
wake_up_process(rescuer->task);
return 0;
@@ -5656,12 +5688,11 @@ static void wq_adjust_max_active(struct workqueue_struct *wq)
} while (activated);
}
-__printf(1, 4)
-struct workqueue_struct *alloc_workqueue(const char *fmt,
- unsigned int flags,
- int max_active, ...)
+__printf(1, 0)
+static struct workqueue_struct *__alloc_workqueue(const char *fmt,
+ unsigned int flags,
+ int max_active, va_list args)
{
- va_list args;
struct workqueue_struct *wq;
size_t wq_size;
int name_len;
@@ -5683,19 +5714,17 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
else
wq_size = sizeof(*wq);
- wq = kzalloc(wq_size, GFP_KERNEL);
+ wq = kzalloc_noprof(wq_size, GFP_KERNEL);
if (!wq)
return NULL;
if (flags & WQ_UNBOUND) {
- wq->unbound_attrs = alloc_workqueue_attrs();
+ wq->unbound_attrs = alloc_workqueue_attrs_noprof();
if (!wq->unbound_attrs)
goto err_free_wq;
}
- va_start(args, max_active);
name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
- va_end(args);
if (name_len >= WQ_NAME_LEN)
pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
@@ -5725,29 +5754,21 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
INIT_LIST_HEAD(&wq->flusher_overflow);
INIT_LIST_HEAD(&wq->maydays);
- wq_init_lockdep(wq);
INIT_LIST_HEAD(&wq->list);
if (flags & WQ_UNBOUND) {
if (alloc_node_nr_active(wq->node_nr_active) < 0)
- goto err_unreg_lockdep;
+ goto err_free_wq;
}
- if (alloc_and_link_pwqs(wq) < 0)
- goto err_free_node_nr_active;
-
- if (wq_online && init_rescuer(wq) < 0)
- goto err_destroy;
-
- if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
- goto err_destroy;
-
/*
- * wq_pool_mutex protects global freeze state and workqueues list.
- * Grab it, adjust max_active and add the new @wq to workqueues
- * list.
+ * wq_pool_mutex protects the workqueues list, allocations of PWQs,
+ * and the global freeze state.
*/
- mutex_lock(&wq_pool_mutex);
+ apply_wqattrs_lock();
+
+ if (alloc_and_link_pwqs(wq) < 0)
+ goto err_unlock_free_node_nr_active;
mutex_lock(&wq->mutex);
wq_adjust_max_active(wq);
@@ -5755,25 +5776,79 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
list_add_tail_rcu(&wq->list, &workqueues);
- mutex_unlock(&wq_pool_mutex);
+ if (wq_online && init_rescuer(wq) < 0)
+ goto err_unlock_destroy;
+
+ apply_wqattrs_unlock();
+
+ if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
+ goto err_destroy;
return wq;
-err_free_node_nr_active:
- if (wq->flags & WQ_UNBOUND)
+err_unlock_free_node_nr_active:
+ apply_wqattrs_unlock();
+ /*
+ * Failed alloc_and_link_pwqs() may leave pending pwq->release_work,
+ * flushing the pwq_release_worker ensures that the pwq_release_workfn()
+ * completes before calling kfree(wq).
+ */
+ if (wq->flags & WQ_UNBOUND) {
+ kthread_flush_worker(pwq_release_worker);
free_node_nr_active(wq->node_nr_active);
-err_unreg_lockdep:
- wq_unregister_lockdep(wq);
- wq_free_lockdep(wq);
+ }
err_free_wq:
free_workqueue_attrs(wq->unbound_attrs);
kfree(wq);
return NULL;
+err_unlock_destroy:
+ apply_wqattrs_unlock();
err_destroy:
destroy_workqueue(wq);
return NULL;
}
-EXPORT_SYMBOL_GPL(alloc_workqueue);
+
+__printf(1, 4)
+struct workqueue_struct *alloc_workqueue_noprof(const char *fmt,
+ unsigned int flags,
+ int max_active, ...)
+{
+ struct workqueue_struct *wq;
+ va_list args;
+
+ va_start(args, max_active);
+ wq = __alloc_workqueue(fmt, flags, max_active, args);
+ va_end(args);
+ if (!wq)
+ return NULL;
+
+ wq_init_lockdep(wq);
+
+ return wq;
+}
+EXPORT_SYMBOL_GPL(alloc_workqueue_noprof);
+
+#ifdef CONFIG_LOCKDEP
+__printf(1, 5)
+struct workqueue_struct *
+alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags,
+ int max_active, struct lockdep_map *lockdep_map, ...)
+{
+ struct workqueue_struct *wq;
+ va_list args;
+
+ va_start(args, lockdep_map);
+ wq = __alloc_workqueue(fmt, flags, max_active, args);
+ va_end(args);
+ if (!wq)
+ return NULL;
+
+ wq->lockdep_map = lockdep_map;
+
+ return wq;
+}
+EXPORT_SYMBOL_GPL(alloc_workqueue_lockdep_map);
+#endif
static bool pwq_busy(struct pool_workqueue *pwq)
{
@@ -5796,6 +5871,17 @@ static bool pwq_busy(struct pool_workqueue *pwq)
* @wq: target workqueue
*
* Safely destroy a workqueue. All work currently pending will be done first.
+ *
+ * This function does NOT guarantee that non-pending work that has been
+ * submitted with queue_delayed_work() and similar functions will be done
+ * before destroying the workqueue. The fundamental problem is that, currently,
+ * the workqueue has no way of accessing non-pending delayed_work. delayed_work
+ * is only linked on the timer-side. All delayed_work must, therefore, be
+ * canceled before calling this function.
+ *
+ * TODO: It would be better if the problem described above wouldn't exist and
+ * destroy_workqueue() would cleanly cancel all pending and non-pending
+ * delayed_work.
*/
void destroy_workqueue(struct workqueue_struct *wq)
{
@@ -5818,16 +5904,10 @@ void destroy_workqueue(struct workqueue_struct *wq)
/* kill rescuer, if sanity checks fail, leave it w/o rescuer */
if (wq->rescuer) {
- struct worker *rescuer = wq->rescuer;
-
- /* this prevents new queueing */
- raw_spin_lock_irq(&wq_mayday_lock);
- wq->rescuer = NULL;
- raw_spin_unlock_irq(&wq_mayday_lock);
-
/* rescuer will empty maydays list before exiting */
- kthread_stop(rescuer->task);
- kfree(rescuer);
+ kthread_stop(wq->rescuer->task);
+ kfree(wq->rescuer);
+ wq->rescuer = NULL;
}
/*
@@ -5994,7 +6074,6 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
struct pool_workqueue *pwq;
bool ret;
- rcu_read_lock();
preempt_disable();
if (cpu == WORK_CPU_UNBOUND)
@@ -6004,7 +6083,6 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
ret = !list_empty(&pwq->inactive_works);
preempt_enable();
- rcu_read_unlock();
return ret;
}
@@ -6607,6 +6685,8 @@ int workqueue_online_cpu(unsigned int cpu)
mutex_lock(&wq_pool_mutex);
+ cpumask_set_cpu(cpu, wq_online_cpumask);
+
for_each_pool(pool, pi) {
/* BH pools aren't affected by hotplug */
if (pool->flags & POOL_BH)
@@ -6629,7 +6709,7 @@ int workqueue_online_cpu(unsigned int cpu)
int tcpu;
for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
- wq_update_pod(wq, tcpu, cpu, true);
+ unbound_wq_update_pwq(wq, tcpu);
mutex_lock(&wq->mutex);
wq_update_node_max_active(wq, -1);
@@ -6653,6 +6733,9 @@ int workqueue_offline_cpu(unsigned int cpu)
/* update pod affinity of unbound workqueues */
mutex_lock(&wq_pool_mutex);
+
+ cpumask_clear_cpu(cpu, wq_online_cpumask);
+
list_for_each_entry(wq, &workqueues, list) {
struct workqueue_attrs *attrs = wq->unbound_attrs;
@@ -6661,7 +6744,7 @@ int workqueue_offline_cpu(unsigned int cpu)
int tcpu;
for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
- wq_update_pod(wq, tcpu, cpu, false);
+ unbound_wq_update_pwq(wq, tcpu);
mutex_lock(&wq->mutex);
wq_update_node_max_active(wq, cpu);
@@ -6711,31 +6794,6 @@ long work_on_cpu_key(int cpu, long (*fn)(void *),
return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu_key);
-
-/**
- * work_on_cpu_safe_key - run a function in thread context on a particular cpu
- * @cpu: the cpu to run on
- * @fn: the function to run
- * @arg: the function argument
- * @key: The lock class key for lock debugging purposes
- *
- * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
- * any locks which would prevent @fn from completing.
- *
- * Return: The value @fn returns.
- */
-long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
- void *arg, struct lock_class_key *key)
-{
- long ret = -ENODEV;
-
- cpus_read_lock();
- if (cpu_online(cpu))
- ret = work_on_cpu_key(cpu, fn, arg, key);
- cpus_read_unlock();
- return ret;
-}
-EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
#endif /* CONFIG_SMP */
#ifdef CONFIG_FREEZER
@@ -6875,8 +6933,26 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
}
if (!ret) {
+ int cpu;
+ struct worker_pool *pool;
+ struct worker *worker;
+
mutex_lock(&wq_pool_attach_mutex);
cpumask_copy(wq_unbound_cpumask, unbound_cpumask);
+ /* rescuer needs to respect cpumask changes when it is not attached */
+ list_for_each_entry(wq, &workqueues, list) {
+ if (wq->rescuer && !wq->rescuer->pool)
+ unbind_worker(wq->rescuer);
+ }
+ /* DISASSOCIATED worker needs to respect wq_unbound_cpumask */
+ for_each_possible_cpu(cpu) {
+ for_each_cpu_worker_pool(pool, cpu) {
+ if (!(pool->flags & POOL_DISASSOCIATED))
+ continue;
+ for_each_pool_worker(worker, pool)
+ unbind_worker(worker);
+ }
+ }
mutex_unlock(&wq_pool_attach_mutex);
}
return ret;
@@ -6887,8 +6963,7 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
* @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
*
* This function can be called from cpuset code to provide a set of isolated
- * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold
- * either cpus_read_lock or cpus_write_lock.
+ * CPUs that should be excluded from wq_unbound_cpumask.
*/
int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
{
@@ -6898,12 +6973,8 @@ int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
return -ENOMEM;
- lockdep_assert_cpus_held();
mutex_lock(&wq_pool_mutex);
- /* Save the current isolated cpumask & export it via sysfs */
- cpumask_copy(wq_isolated_cpumask, exclude_cpumask);
-
/*
* If the operation fails, it will fall back to
* wq_requested_unbound_cpumask which is initially set to
@@ -6915,6 +6986,10 @@ int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
if (!cpumask_equal(cpumask, wq_unbound_cpumask))
ret = workqueue_apply_unbound_cpumask(cpumask);
+ /* Save the current isolated cpumask & export it via sysfs */
+ if (!ret)
+ cpumask_copy(wq_isolated_cpumask, exclude_cpumask);
+
mutex_unlock(&wq_pool_mutex);
free_cpumask_var(cpumask);
return ret;
@@ -6948,9 +7023,8 @@ static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp)
wq_affn_dfl = affn;
list_for_each_entry(wq, &workqueues, list) {
- for_each_online_cpu(cpu) {
- wq_update_pod(wq, cpu, cpu, true);
- }
+ for_each_online_cpu(cpu)
+ unbound_wq_update_pwq(wq, cpu);
}
mutex_unlock(&wq_pool_mutex);
@@ -7038,19 +7112,6 @@ static struct attribute *wq_sysfs_attrs[] = {
};
ATTRIBUTE_GROUPS(wq_sysfs);
-static void apply_wqattrs_lock(void)
-{
- /* CPUs should stay stable across pwq creations and installations */
- cpus_read_lock();
- mutex_lock(&wq_pool_mutex);
-}
-
-static void apply_wqattrs_unlock(void)
-{
- mutex_unlock(&wq_pool_mutex);
- cpus_read_unlock();
-}
-
static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
@@ -7249,16 +7310,12 @@ static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
*/
cpumask_and(cpumask, cpumask, cpu_possible_mask);
if (!cpumask_empty(cpumask)) {
+ ret = 0;
apply_wqattrs_lock();
- cpumask_copy(wq_requested_unbound_cpumask, cpumask);
- if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
- ret = 0;
- goto out_unlock;
- }
-
- ret = workqueue_apply_unbound_cpumask(cpumask);
-
-out_unlock:
+ if (!cpumask_equal(cpumask, wq_unbound_cpumask))
+ ret = workqueue_apply_unbound_cpumask(cpumask);
+ if (!ret)
+ cpumask_copy(wq_requested_unbound_cpumask, cpumask);
apply_wqattrs_unlock();
}
@@ -7448,6 +7505,9 @@ static struct timer_list wq_watchdog_timer;
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
+static unsigned int wq_panic_on_stall;
+module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644);
+
/*
* Show workers that might prevent the processing of pending work items.
* The only candidates are CPU-bound workers in the running state.
@@ -7499,6 +7559,16 @@ static void show_cpu_pools_hogs(void)
rcu_read_unlock();
}
+static void panic_on_wq_watchdog(void)
+{
+ static unsigned int wq_stall;
+
+ if (wq_panic_on_stall) {
+ wq_stall++;
+ BUG_ON(wq_stall >= wq_panic_on_stall);
+ }
+}
+
static void wq_watchdog_reset_touched(void)
{
int cpu;
@@ -7520,8 +7590,6 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
if (!thresh)
return;
- rcu_read_lock();
-
for_each_pool(pool, pi) {
unsigned long pool_ts, touched, ts;
@@ -7563,30 +7631,39 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
}
- rcu_read_unlock();
-
if (lockup_detected)
show_all_workqueues();
if (cpu_pool_stall)
show_cpu_pools_hogs();
+ if (lockup_detected)
+ panic_on_wq_watchdog();
+
wq_watchdog_reset_touched();
mod_timer(&wq_watchdog_timer, jiffies + thresh);
}
notrace void wq_watchdog_touch(int cpu)
{
+ unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
+ unsigned long touch_ts = READ_ONCE(wq_watchdog_touched);
+ unsigned long now = jiffies;
+
if (cpu >= 0)
- per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
+ per_cpu(wq_watchdog_touched_cpu, cpu) = now;
+ else
+ WARN_ONCE(1, "%s should be called with valid CPU", __func__);
- wq_watchdog_touched = jiffies;
+ /* Don't unnecessarily store to global cacheline */
+ if (time_after(now, touch_ts + thresh / 4))
+ WRITE_ONCE(wq_watchdog_touched, jiffies);
}
static void wq_watchdog_set_thresh(unsigned long thresh)
{
wq_watchdog_thresh = 0;
- del_timer_sync(&wq_watchdog_timer);
+ timer_delete_sync(&wq_watchdog_timer);
if (thresh) {
wq_watchdog_thresh = thresh;
@@ -7605,7 +7682,7 @@ static int wq_watchdog_param_set_thresh(const char *val,
if (ret)
return ret;
- if (system_wq)
+ if (system_percpu_wq)
wq_watchdog_set_thresh(thresh);
else
wq_watchdog_thresh = thresh;
@@ -7690,10 +7767,12 @@ void __init workqueue_init_early(void)
BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
+ BUG_ON(!alloc_cpumask_var(&wq_online_cpumask, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));
+ cpumask_copy(wq_online_cpumask, cpu_online_mask);
cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
@@ -7701,11 +7780,12 @@ void __init workqueue_init_early(void)
restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);
cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);
-
+ cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
- wq_update_pod_attrs_buf = alloc_workqueue_attrs();
- BUG_ON(!wq_update_pod_attrs_buf);
+ unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs();
+ BUG_ON(!unbound_wq_update_pwq_attrs_buf);
/*
* If nohz_full is enabled, set power efficient workqueue as unbound.
@@ -7762,23 +7842,24 @@ void __init workqueue_init_early(void)
ordered_wq_attrs[i] = attrs;
}
- system_wq = alloc_workqueue("events", 0, 0);
- system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
- system_long_wq = alloc_workqueue("events_long", 0, 0);
- system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
- WQ_MAX_ACTIVE);
+ system_wq = alloc_workqueue("events", WQ_PERCPU, 0);
+ system_percpu_wq = alloc_workqueue("events", WQ_PERCPU, 0);
+ system_highpri_wq = alloc_workqueue("events_highpri",
+ WQ_HIGHPRI | WQ_PERCPU, 0);
+ system_long_wq = alloc_workqueue("events_long", WQ_PERCPU, 0);
+ system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
+ system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
system_freezable_wq = alloc_workqueue("events_freezable",
- WQ_FREEZABLE, 0);
+ WQ_FREEZABLE | WQ_PERCPU, 0);
system_power_efficient_wq = alloc_workqueue("events_power_efficient",
- WQ_POWER_EFFICIENT, 0);
+ WQ_POWER_EFFICIENT | WQ_PERCPU, 0);
system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient",
- WQ_FREEZABLE | WQ_POWER_EFFICIENT,
- 0);
- system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
+ WQ_FREEZABLE | WQ_POWER_EFFICIENT | WQ_PERCPU, 0);
+ system_bh_wq = alloc_workqueue("events_bh", WQ_BH | WQ_PERCPU, 0);
system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
- WQ_BH | WQ_HIGHPRI, 0);
- BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
- !system_unbound_wq || !system_freezable_wq ||
+ WQ_BH | WQ_HIGHPRI | WQ_PERCPU, 0);
+ BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq ||
+ !system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
!system_power_efficient_wq ||
!system_freezable_power_efficient_wq ||
!system_bh_wq || !system_bh_highpri_wq);
@@ -7789,7 +7870,7 @@ static void __init wq_cpu_intensive_thresh_init(void)
unsigned long thresh;
unsigned long bogo;
- pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");
+ pwq_release_worker = kthread_run_worker(0, "pool_workqueue_release");
BUG_ON(IS_ERR(pwq_release_worker));
/* if the user set it to a specific value, keep it */
@@ -7970,12 +8051,12 @@ void __init workqueue_init_topology(void)
/*
* Workqueues allocated earlier would have all CPUs sharing the default
- * worker pool. Explicitly call wq_update_pod() on all workqueue and CPU
- * combinations to apply per-pod sharing.
+ * worker pool. Explicitly call unbound_wq_update_pwq() on all workqueue
+ * and CPU combinations to apply per-pod sharing.
*/
list_for_each_entry(wq, &workqueues, list) {
for_each_online_cpu(cpu)
- wq_update_pod(wq, cpu, cpu, true);
+ unbound_wq_update_pwq(wq, cpu);
if (wq->flags & WQ_UNBOUND) {
mutex_lock(&wq->mutex);
wq_update_node_max_active(wq, -1);