From 686f669780276da534e93ba769e02bdcf1f89f8d Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Mon, 13 Mar 2023 19:28:50 +0100
Subject: workqueue: move to use bus_get_dev_root()

Direct access to the struct bus_type dev_root pointer is going away soon,
so replace it with a call to bus_get_dev_root() instead, which is what
that helper is there for.

Cc: Lai Jiangshan
Acked-by: Tejun Heo
Link: https://lore.kernel.org/r/20230313182918.1312597-8-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman
---
 kernel/workqueue.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b8b541caed48..209917792fa4 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5826,13 +5826,19 @@ static struct device_attribute wq_sysfs_cpumask_attr =
 
 static int __init wq_sysfs_init(void)
 {
+	struct device *dev_root;
 	int err;
 
 	err = subsys_virtual_register(&wq_subsys, NULL);
 	if (err)
 		return err;
 
-	return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr);
+	dev_root = bus_get_dev_root(&wq_subsys);
+	if (dev_root) {
+		err = device_create_file(dev_root, &wq_sysfs_cpumask_attr);
+		put_device(dev_root);
+	}
+	return err;
 }
 core_initcall(wq_sysfs_init);
-- cgit

From a8ec5880bd82b834717770cba4596381ffd50545 Mon Sep 17 00:00:00 2001
From: Ammar Faizi
Date: Sun, 26 Feb 2023 23:53:20 +0700
Subject: workqueue: Simplify a pr_warn() call in wq_select_unbound_cpu()

Use pr_warn_once() to achieve the same thing. It's simpler.

Signed-off-by: Ammar Faizi
Reviewed-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b8b541caed48..3f1fabea000f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1392,15 +1392,13 @@ static bool is_chained_work(struct workqueue_struct *wq)
  */
 static int wq_select_unbound_cpu(int cpu)
 {
-	static bool printed_dbg_warning;
 	int new_cpu;
 
 	if (likely(!wq_debug_force_rr_cpu)) {
 		if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
 			return cpu;
-	} else if (!printed_dbg_warning) {
-		pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n");
-		printed_dbg_warning = true;
+	} else {
+		pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
 	}
 
 	if (cpumask_empty(wq_unbound_cpumask))
-- cgit

From 335a42ebb0ca8ee9997a1731aaaae6dcd704c113 Mon Sep 17 00:00:00 2001
From: Petr Mladek
Date: Tue, 7 Mar 2023 13:53:31 +0100
Subject: workqueue: Fix hung time report of worker pools

The workqueue watchdog prints a warning when there is no progress in a
worker pool, where progress means that the pool has started processing a
pending work item. Note that it is perfectly fine to process work items
for much longer; progress should be guaranteed by waking up or creating
idle workers.

show_one_worker_pool() prints the state of a non-idle worker pool. It
shows the delay since the last pool->watchdog_ts. The timestamp is
updated when the first pending work item is queued in __queue_work(). It
is also updated when a work item is dequeued for processing in
worker_thread() and rescuer_thread().

The delay is misleading when there is no pending work item. In this case
it shows how long the last work item has been processed. Show zero
instead: there is no stall if there is no pending work.
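For illustration, the fixed calculation can be modeled in stand-alone C
(the HZ value and names below are assumptions of this sketch, not the
kernel's code):

  /* Model of the fixed hung-time report: show how long the first pending
   * work item has waited, and 0 when nothing is pending. */
  #include <stdio.h>
  #include <stdbool.h>

  #define HZ 1000UL /* assumed tick rate for this sketch */

  static unsigned long hung_seconds(unsigned long now_jiffies,
                                    unsigned long watchdog_ts,
                                    bool worklist_empty)
  {
          if (worklist_empty)
                  return 0; /* no pending work -> no stall to report */
          /* unsigned subtraction is wraparound-safe, like jiffies math */
          return (now_jiffies - watchdog_ts) / HZ;
  }

  int main(void)
  {
          printf("hung=%lus\n", hung_seconds(10 * HZ, 4 * HZ, false)); /* 6s */
          printf("hung=%lus\n", hung_seconds(10 * HZ, 4 * HZ, true));  /* 0s */
          return 0;
  }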
Fixes: 82607adcf9cdf40fb7b ("workqueue: implement lockup detector")
Signed-off-by: Petr Mladek
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3f1fabea000f..8c0ec21a86a2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5000,10 +5000,16 @@ static void show_one_worker_pool(struct worker_pool *pool)
 	struct worker *worker;
 	bool first = true;
 	unsigned long flags;
+	unsigned long hung = 0;
 
 	raw_spin_lock_irqsave(&pool->lock, flags);
 	if (pool->nr_workers == pool->nr_idle)
 		goto next_pool;
+
+	/* How long the first pending work is waiting for a worker. */
+	if (!list_empty(&pool->worklist))
+		hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000;
+
 	/*
 	 * Defer printing to avoid deadlocks in console drivers that
 	 * queue work while holding locks also taken in their write
@@ -5012,9 +5018,7 @@ static void show_one_worker_pool(struct worker_pool *pool)
 	printk_deferred_enter();
 	pr_info("pool %d:", pool->id);
 	pr_cont_pool_info(pool);
-	pr_cont(" hung=%us workers=%d",
-		jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000,
-		pool->nr_workers);
+	pr_cont(" hung=%lus workers=%d", hung, pool->nr_workers);
 	if (pool->manager)
 		pr_cont(" manager: %d",
 			task_pid_nr(pool->manager->task));
-- cgit

From 3f0ea0b864562c6bd1cee892026067eaea7be242 Mon Sep 17 00:00:00 2001
From: Petr Mladek
Date: Tue, 7 Mar 2023 13:53:32 +0100
Subject: workqueue: Warn when a new worker could not be created

The workqueue watchdog reports a lockup when there has been no progress
in the worker pool for a long time, where progress means that a pending
work item has started being processed. Progress is guaranteed by using
idle workers or creating new workers for pending work items.

There are several reasons why a new worker might not be created:

+ there is not enough memory
+ there is no free pool ID (IDR API)
+ the system reached the PID limit
+ the process creating the new worker was interrupted
+ the last idle worker (the manager) has not been scheduled for a long
  time and was not able to even start creating the kthread

None of these failures is reported at the moment. The only clue is that
show_one_worker_pool() prints that there is a manager — the last idle
worker, responsible for creating a new one. But it is not clear whether
create_worker() is failing, or why.

Make debugging easier by printing errors in create_worker(). The error
code is important, especially from kthread_create_on_node(), because it
helps to distinguish the various reasons: for example, reaching the
memory limit (-ENOMEM), other system limits (-EAGAIN), or an interrupted
process (-EINTR).

Use pr_err_once() to avoid repeating the same error every CREATE_COOLDOWN
for each stuck worker pool. A ratelimited printk() might be better: it
would help to know whether the problem remains, it would make it clearer
whether the create_worker() errors and workqueue stalls are related, and
old messages might otherwise get lost when the internal log buffer is
full.

The problem is that printk() might touch the watchdog. For example, see
touch_nmi_watchdog() in serial8250_console_write(). It would require
synchronizing the beginning and length of the ratelimit interval with the
workqueue watchdog; otherwise, the error messages might break the
watchdog. This does not look worth the complexity.
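As an aside, the once-only semantics used here can be modeled in user
space; a rough sketch of how the pr_*_once() helpers behave (a static
flag per call site — simplified, not the kernel implementation verbatim):

  #include <stdio.h>

  /* Rough model of pr_err_once(): a static flag per macro expansion
   * site suppresses every emission after the first one. */
  #define pr_err_once(...)                                \
          do {                                            \
                  static int printed;                     \
                  if (!printed) {                         \
                          printed = 1;                    \
                          fprintf(stderr, __VA_ARGS__);   \
                  }                                       \
          } while (0)

  int main(void)
  {
          for (int i = 0; i < 3; i++)
                  pr_err_once("workqueue: Failed to allocate a worker ID: %d\n", -12);
          return 0; /* the message appears exactly once */
  }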
Signed-off-by: Petr Mladek
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8c0ec21a86a2..9760f0fca82d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1936,12 +1936,17 @@ static struct worker *create_worker(struct worker_pool *pool)
 
 	/* ID is needed to determine kthread name */
 	id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
-	if (id < 0)
+	if (id < 0) {
+		pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n",
+			    ERR_PTR(id));
 		return NULL;
+	}
 
 	worker = alloc_worker(pool->node);
-	if (!worker)
+	if (!worker) {
+		pr_err_once("workqueue: Failed to allocate a worker\n");
 		goto fail;
+	}
 
 	worker->id = id;
 
@@ -1953,8 +1958,11 @@ static struct worker *create_worker(struct worker_pool *pool)
 
 	worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
 					      "kworker/%s", id_buf);
-	if (IS_ERR(worker->task))
+	if (IS_ERR(worker->task)) {
+		pr_err_once("workqueue: Failed to create a worker thread: %pe",
+			    worker->task);
 		goto fail;
+	}
 
 	set_user_nice(worker->task, pool->attrs->nice);
 	kthread_bind_mask(worker->task, pool->attrs->cpumask);
-- cgit

From 60f540389a5d2df25ddc7ad511b4fa2880dea521 Mon Sep 17 00:00:00 2001
From: Petr Mladek
Date: Tue, 7 Mar 2023 13:53:33 +0100
Subject: workqueue: Interrupted create_worker() is not a repeated event

kthread_create_on_node() might get interrupted. It is rare but
realistic: for example, when an unbound workqueue is allocated in a
module_init() callback, it runs in the context of the "modprobe"
process, and systemd might kill pending processes when switching root
from the initrd to the booted system.

The interrupt is a one-off event and the race might be hard to
reproduce, so it is always worth printing.

Signed-off-by: Petr Mladek
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9760f0fca82d..5f0ecaaaf997 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1959,8 +1959,13 @@ static struct worker *create_worker(struct worker_pool *pool)
 	worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
 					      "kworker/%s", id_buf);
 	if (IS_ERR(worker->task)) {
-		pr_err_once("workqueue: Failed to create a worker thread: %pe",
-			    worker->task);
+		if (PTR_ERR(worker->task) == -EINTR) {
+			pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n",
+			       id_buf);
+		} else {
+			pr_err_once("workqueue: Failed to create a worker thread: %pe",
+				    worker->task);
+		}
 		goto fail;
 	}
 
-- cgit

From 4c0736a76a186e5df2cd2afda3e7a04d2a427d1b Mon Sep 17 00:00:00 2001
From: Petr Mladek
Date: Tue, 7 Mar 2023 13:53:34 +0100
Subject: workqueue: Warn when a rescuer could not be created

Rescuers are created when a workqueue with WQ_MEM_RECLAIM is allocated.
This typically happens during system boot.

systemd switches the root filesystem from the initrd to the booted
system during boot, and it kills processes that block the switch for too
long. One of those processes might be a modprobe trying to create a
workqueue.

These problems are hard to reproduce. Also, alloc_workqueue() does not
pass on the error code. Make debugging easier by printing an error,
similar to create_worker().
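The %pe printing above relies on the kernel's pointer/errno convention;
a self-contained sketch of that convention (simplified from what
include/linux/err.h provides, for illustration only):

  #include <stdio.h>

  /* Simplified model of ERR_PTR()/PTR_ERR()/IS_ERR(): error codes ride
   * in the topmost page of pointer space, so a pointer return can carry
   * either a valid address or a negative errno. */
  #define MAX_ERRNO 4095UL

  static inline void *ERR_PTR(long error) { return (void *)error; }
  static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
  static inline int IS_ERR(const void *ptr)
  {
          return (unsigned long)ptr >= -MAX_ERRNO;
  }

  int main(void)
  {
          void *task = ERR_PTR(-4); /* e.g. -EINTR from kthread_create() */

          if (IS_ERR(task))
                  printf("kthread creation failed: errno=%ld\n", PTR_ERR(task));
          return 0;
  }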
Signed-off-by: Petr Mladek
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5f0ecaaaf997..fb1eb7a3f49b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4391,13 +4391,18 @@ static int init_rescuer(struct workqueue_struct *wq)
 		return 0;
 
 	rescuer = alloc_worker(NUMA_NO_NODE);
-	if (!rescuer)
+	if (!rescuer) {
+		pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n",
+		       wq->name);
 		return -ENOMEM;
+	}
 
 	rescuer->rescue_wq = wq;
 	rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name);
 	if (IS_ERR(rescuer->task)) {
 		ret = PTR_ERR(rescuer->task);
+		pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe",
+		       wq->name, ERR_PTR(ret));
 		kfree(rescuer);
 		return ret;
 	}
-- cgit

From cd2440d66fec7d1bdb4f605b64c27c63c9141989 Mon Sep 17 00:00:00 2001
From: Petr Mladek
Date: Tue, 7 Mar 2023 13:53:35 +0100
Subject: workqueue: Print backtraces from CPUs with hung CPU bound workqueues

The workqueue watchdog reports a lockup when there has been no progress
in the worker pool for a long time, where progress means that a pending
work item has started being processed.

Worker pools for unbound workqueues always wake up an idle worker and
try to process the work immediately. The last idle worker has to create
a new worker first. A stall can happen only when the new worker could
not be created, in which case an error should get printed. Another cause
might be too high a load, in which case the workers are victims of a
global system problem.

Worker pools for CPU bound workqueues are designed for lightweight work
items that do not need much CPU time. Work items are processed one by
one by a single worker; a new worker is used only when a work item
sleeps. This creates one additional scenario: the stall can happen when
the CPU-bound workqueue is used for CPU-intensive work.

More precisely, the stall is detected when a CPU-bound worker is in the
TASK_RUNNING state for too long. In this case, it might be useful to see
the backtrace from the problematic worker.

Information about how long a worker has been in the running state is not
available. But CPU-bound worker pools do not have many workers in the
running state by definition, and only a few pools are typically blocked.
It should therefore be acceptable to print backtraces from all workers
in TASK_RUNNING state in the stalled worker pools. The number of false
positives should be very low.

Signed-off-by: Petr Mladek
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index fb1eb7a3f49b..044b4eee760b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -49,6 +49,7 @@
 #include <linux/moduleparam.h>
 #include <linux/uaccess.h>
 #include <linux/sched/isolation.h>
+#include <linux/sched/debug.h>
 #include <linux/nmi.h>
 #include <linux/kvm_para.h>
 
@@ -141,6 +142,8 @@ enum {
  * WR: wq->mutex protected for writes. RCU protected for reads.
  *
  * MD: wq_mayday_lock protected.
+ *
+ * WD: Used internally by the watchdog.
  */
 
 /* struct worker is defined in workqueue_internal.h */
 
@@ -153,6 +156,7 @@ struct worker_pool {
 	unsigned int		flags;		/* X: flags */
 
 	unsigned long		watchdog_ts;	/* L: watchdog timestamp */
+	bool			cpu_stall;	/* WD: stalled cpu bound pool */
 
 	/*
 	 * The counter is incremented in a process context on the associated CPU
@@ -5976,6 +5980,57 @@ static struct timer_list wq_watchdog_timer;
 static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
 static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
 
+/*
+ * Show workers that might prevent the processing of pending work items.
+ * The only candidates are CPU-bound workers in the running state.
+ * Pending work items should be handled by another idle worker
+ * in all other situations.
+ */
+static void show_cpu_pool_hog(struct worker_pool *pool)
+{
+	struct worker *worker;
+	unsigned long flags;
+	int bkt;
+
+	raw_spin_lock_irqsave(&pool->lock, flags);
+
+	hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+		if (task_is_running(worker->task)) {
+			/*
+			 * Defer printing to avoid deadlocks in console
+			 * drivers that queue work while holding locks
+			 * also taken in their write paths.
+			 */
+			printk_deferred_enter();
+
+			pr_info("pool %d:\n", pool->id);
+			sched_show_task(worker->task);
+
+			printk_deferred_exit();
+		}
+	}
+
+	raw_spin_unlock_irqrestore(&pool->lock, flags);
+}
+
+static void show_cpu_pools_hogs(void)
+{
+	struct worker_pool *pool;
+	int pi;
+
+	pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");
+
+	rcu_read_lock();
+
+	for_each_pool(pool, pi) {
+		if (pool->cpu_stall)
+			show_cpu_pool_hog(pool);
+
+	}
+
+	rcu_read_unlock();
+}
+
 static void wq_watchdog_reset_touched(void)
 {
 	int cpu;
@@ -5989,6 +6044,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
 {
 	unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
 	bool lockup_detected = false;
+	bool cpu_pool_stall = false;
 	unsigned long now = jiffies;
 	struct worker_pool *pool;
 	int pi;
@@ -6001,6 +6057,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
 
 	for_each_pool(pool, pi) {
 		unsigned long pool_ts, touched, ts;
 
+		pool->cpu_stall = false;
 		if (list_empty(&pool->worklist))
 			continue;
 
@@ -6025,11 +6082,17 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
 		/* did we stall? */
 		if (time_after(now, ts + thresh)) {
 			lockup_detected = true;
+			if (pool->cpu >= 0) {
+				pool->cpu_stall = true;
+				cpu_pool_stall = true;
+			}
 			pr_emerg("BUG: workqueue lockup - pool");
 			pr_cont_pool_info(pool);
 			pr_cont(" stuck for %us!\n",
 				jiffies_to_msecs(now - pool_ts) / 1000);
 		}
+
+
 	}
 
 	rcu_read_unlock();
@@ -6037,6 +6100,9 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
 	if (lockup_detected)
 		show_all_workqueues();
 
+	if (cpu_pool_stall)
+		show_cpu_pools_hogs();
+
 	wq_watchdog_reset_touched();
 	mod_timer(&wq_watchdog_timer, jiffies + thresh);
 }
-- cgit

From 704bc669e1dda3eb8f6d5cb462b21e85558a3912 Mon Sep 17 00:00:00 2001
From: Jungseung Lee
Date: Mon, 20 Mar 2023 12:29:05 +0900
Subject: workqueue: Introduce show_freezable_workqueues

Currently, show_all_workqueues() is called when freezing the workqueues
fails, and it shows the status of all workqueues and of all worker
pools. In this case we only need to dump the state of the workqueues
that are freezable and busy.

This patch defines show_freezable_workqueues(), which uses
show_one_workqueue(), a granular function that shows the state of an
individual workqueue, so that only the state of freezable workqueues is
dumped at that time.

tj: Minor message adjustment.
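A small stand-alone sketch of the filter-and-dump pattern the patch
introduces (types, flag value, and list representation here are
illustrative, not the kernel's):

  #include <stdio.h>

  #define WQ_FREEZABLE (1 << 2) /* illustrative flag value */

  struct wq {
          const char *name;
          unsigned int flags;
  };

  static void show_one_workqueue(const struct wq *wq)
  {
          printf("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
  }

  /* Walk the collection and dump only freezable entries; the kernel
   * walks the global workqueues list under rcu_read_lock() instead. */
  static void show_freezable_workqueues(const struct wq *wqs, int n)
  {
          printf("Showing freezable workqueues that are still busy:\n");
          for (int i = 0; i < n; i++) {
                  if (!(wqs[i].flags & WQ_FREEZABLE))
                          continue;
                  show_one_workqueue(&wqs[i]);
          }
  }

  int main(void)
  {
          struct wq wqs[] = {
                  { "events", 0 },
                  { "usb_hub_wq", WQ_FREEZABLE },
          };
          show_freezable_workqueues(wqs, 2);
          return 0;
  }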
Signed-off-by: Jungseung Lee
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 044b4eee760b..d78935e4fb5d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5065,8 +5065,7 @@ next_pool:
 /**
  * show_all_workqueues - dump workqueue state
  *
- * Called from a sysrq handler or try_to_freeze_tasks() and prints out
- * all busy workqueues and pools.
+ * Called from a sysrq handler and prints out all busy workqueues and pools.
  */
 void show_all_workqueues(void)
 {
@@ -5087,6 +5086,29 @@ void show_all_workqueues(void)
 	rcu_read_unlock();
 }
 
+/**
+ * show_freezable_workqueues - dump freezable workqueue state
+ *
+ * Called from try_to_freeze_tasks() and prints out all freezable workqueues
+ * still busy.
+ */
+void show_freezable_workqueues(void)
+{
+	struct workqueue_struct *wq;
+
+	rcu_read_lock();
+
+	pr_info("Showing freezable workqueues that are still busy:\n");
+
+	list_for_each_entry_rcu(wq, &workqueues, list) {
+		if (!(wq->flags & WQ_FREEZABLE))
+			continue;
+		show_one_workqueue(wq);
+	}
+
+	rcu_read_unlock();
+}
+
 /* used to show worker information through /proc/PID/{comm,stat,status} */
 void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
 {
-- cgit

From afa4bb778e48d79e4a642ed41e3b4e0de7489a6c Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Fri, 23 Jun 2023 12:08:14 -0700
Subject: workqueue: clean up WORK_* constant types, clarify masking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Dave Airlie reports that gcc-13.1.1 has started complaining about some
of the workqueue code in 32-bit arm builds:

  kernel/workqueue.c: In function ‘get_work_pwq’:
  kernel/workqueue.c:713:24: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast]
    713 |         return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
        |                        ^

  [ ... a couple of other cases ... ]

and while it's not immediately clear exactly why gcc started complaining
about it now, I suspect some C23-induced enum type handling fixup in
gcc-13 is the cause.

Whatever the reason for starting to complain, the code and data types
are indeed disgusting enough that the complaint is warranted.

The wq code ends up creating various "helper constants" (like that
WORK_STRUCT_WQ_DATA_MASK) using an enum type, which is all kinds of
confused. The mask needs to be 'unsigned long', not some unspecified
enum type.

To make matters worse, the actual "mask and cast to a pointer" is
repeated a couple of times, and the cast isn't even always done to the
right pointer, but - as the error case above - to a 'void *' with then
the compiler finishing the job. That's not how we roll in the kernel.

So create the masks using the proper types rather than some ambiguous
enumeration, and use a nice helper that actually does the type
conversion in one well-defined place.

Incidentally, this magically makes clang generate better code. That,
admittedly, is really just a sign of clang having been seriously
confused before, and cleaning up the typing unconfuses the compiler too.
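To see the bug class concretely, a compilable sketch (the constants are
illustrative; the kernel's actual WORK_* values differ):

  /* An enum constant whose value does not fit in 'int' can give the
   * enum a 64-bit compatible type (a GCC extension, standardized in
   * C23); on a 32-bit target, masking 'data' with it widens the
   * result, and the cast back to a 32-bit pointer then warns. */
  enum { BAD_MASK = ~0xffULL };              /* unspecified, possibly 64-bit */
  #define GOOD_MASK (~((unsigned long)0xff)) /* always pointer-sized */

  struct pool_workqueue { int dummy; };

  /* The fix: a typed mask and one helper doing the well-defined cast. */
  static inline struct pool_workqueue *work_struct_pwq(unsigned long data)
  {
          return (struct pool_workqueue *)(data & GOOD_MASK);
  }

  int main(void)
  {
          static _Alignas(256) struct pool_workqueue pwq;
          unsigned long data = (unsigned long)&pwq | 0x3; /* low bits = flags */

          return work_struct_pwq(data) == &pwq ? 0 : 1;
  }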
Reported-by: Dave Airlie
Link: https://lore.kernel.org/lkml/CAPM=9twNnV4zMCvrPkw3H-ajZOH-01JVh_kDrxdPYQErz8ZTdA@mail.gmail.com/
Cc: Arnd Bergmann
Cc: Tejun Heo
Cc: Nick Desaulniers
Cc: Nathan Chancellor
Signed-off-by: Linus Torvalds
---
 kernel/workqueue.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4666a1a92a31..c913e333cce8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -705,12 +705,17 @@ static void clear_work_data(struct work_struct *work)
 	set_work_data(work, WORK_STRUCT_NO_POOL, 0);
 }
 
+static inline struct pool_workqueue *work_struct_pwq(unsigned long data)
+{
+	return (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK);
+}
+
 static struct pool_workqueue *get_work_pwq(struct work_struct *work)
 {
 	unsigned long data = atomic_long_read(&work->data);
 
 	if (data & WORK_STRUCT_PWQ)
-		return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
+		return work_struct_pwq(data);
 	else
 		return NULL;
 }
@@ -738,8 +743,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work)
 	assert_rcu_or_pool_mutex();
 
 	if (data & WORK_STRUCT_PWQ)
-		return ((struct pool_workqueue *)
-			(data & WORK_STRUCT_WQ_DATA_MASK))->pool;
+		return work_struct_pwq(data)->pool;
 
 	pool_id = data >> WORK_OFFQ_POOL_SHIFT;
 	if (pool_id == WORK_OFFQ_POOL_NONE)
@@ -760,8 +764,7 @@ static int get_work_pool_id(struct work_struct *work)
 	unsigned long data = atomic_long_read(&work->data);
 
 	if (data & WORK_STRUCT_PWQ)
-		return ((struct pool_workqueue *)
-			(data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
+		return work_struct_pwq(data)->pool->id;
 
 	return data >> WORK_OFFQ_POOL_SHIFT;
 }
-- cgit
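For context, a stand-alone sketch of the work->data encoding that these
helpers decode (flag bit, shift, and mask values here are illustrative
stand-ins, not the kernel's exact layout):

  #include <stdio.h>

  /* Illustrative encoding: one flag bit says the high bits hold a
   * pool_workqueue pointer; otherwise they hold an off-queue pool ID. */
  #define WORK_STRUCT_PWQ          (1UL << 2)
  #define WORK_OFFQ_POOL_SHIFT     5
  #define WORK_STRUCT_WQ_DATA_MASK (~0x1fUL)

  struct pool_workqueue { int pool_id; };

  static struct pool_workqueue *work_struct_pwq(unsigned long data)
  {
          return (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK);
  }

  int main(void)
  {
          /* 32-byte alignment keeps the low flag bits of the address free. */
          static _Alignas(32) struct pool_workqueue pwq = { .pool_id = 7 };

          /* On-queue: pointer plus flag bits packed into one word. */
          unsigned long data = (unsigned long)&pwq | WORK_STRUCT_PWQ;
          if (data & WORK_STRUCT_PWQ)
                  printf("pwq pool id: %d\n", work_struct_pwq(data)->pool_id);

          /* Off-queue: just a pool ID shifted past the flag bits. */
          data = 42UL << WORK_OFFQ_POOL_SHIFT;
          if (!(data & WORK_STRUCT_PWQ))
                  printf("off-queue pool id: %lu\n", data >> WORK_OFFQ_POOL_SHIFT);
          return 0;
  }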