Diffstat (limited to 'kernel/workqueue.c')
-rw-r--r--  kernel/workqueue.c | 280
1 file changed, 209 insertions(+), 71 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 07895deca271..b8b541caed48 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -169,7 +169,9 @@ struct worker_pool {  	struct list_head	idle_list;	/* L: list of idle workers */  	struct timer_list	idle_timer;	/* L: worker idle timeout */ -	struct timer_list	mayday_timer;	/* L: SOS timer for workers */ +	struct work_struct      idle_cull_work; /* L: worker idle cleanup */ + +	struct timer_list	mayday_timer;	  /* L: SOS timer for workers */  	/* a workers is either on busy_hash or idle_list, or the manager */  	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); @@ -177,6 +179,7 @@ struct worker_pool {  	struct worker		*manager;	/* L: purely informational */  	struct list_head	workers;	/* A: attached workers */ +	struct list_head        dying_workers;  /* A: workers about to die */  	struct completion	*detach_completion; /* all workers detached */  	struct ida		worker_ida;	/* worker IDs for task name */ @@ -326,7 +329,7 @@ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);  static LIST_HEAD(workqueues);		/* PR: list of all workqueues */  static bool workqueue_freezing;		/* PL: have wqs started freezing? */ -/* PL: allowable cpus for unbound wqs and work items */ +/* PL&A: allowable cpus for unbound wqs and work items */  static cpumask_var_t wq_unbound_cpumask;  /* CPU where unbound work was last round robin scheduled from this CPU */ @@ -1433,9 +1436,13 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,  	lockdep_assert_irqs_disabled(); -	/* if draining, only works from the same workqueue are allowed */ -	if (unlikely(wq->flags & __WQ_DRAINING) && -	    WARN_ON_ONCE(!is_chained_work(wq))) +	/* +	 * For a draining wq, only works from the same workqueue are +	 * allowed. The __WQ_DESTROYING helps to spot the issue that +	 * queues a new work item to a wq after destroy_workqueue(wq). +	 */ +	if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && +		     WARN_ON_ONCE(!is_chained_work(wq))))  		return;  	rcu_read_lock();  retry: @@ -1900,7 +1907,7 @@ static void worker_detach_from_pool(struct worker *worker)  	list_del(&worker->node);  	worker->pool = NULL; -	if (list_empty(&pool->workers)) +	if (list_empty(&pool->workers) && list_empty(&pool->dying_workers))  		detach_completion = pool->detach_completion;  	mutex_unlock(&wq_pool_attach_mutex); @@ -1972,21 +1979,55 @@ fail:  	return NULL;  } +static void unbind_worker(struct worker *worker) +{ +	lockdep_assert_held(&wq_pool_attach_mutex); + +	kthread_set_per_cpu(worker->task, -1); +	if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) +		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); +	else +		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); +} + +static void wake_dying_workers(struct list_head *cull_list) +{ +	struct worker *worker, *tmp; + +	list_for_each_entry_safe(worker, tmp, cull_list, entry) { +		list_del_init(&worker->entry); +		unbind_worker(worker); +		/* +		 * If the worker was somehow already running, then it had to be +		 * in pool->idle_list when set_worker_dying() happened or we +		 * wouldn't have gotten here. +		 * +		 * Thus, the worker must either have observed the WORKER_DIE +		 * flag, or have set its state to TASK_IDLE. Either way, the +		 * below will be observed by the worker and is safe to do +		 * outside of pool->lock. 
+		 */ +		wake_up_process(worker->task); +	} +} +  /** - * destroy_worker - destroy a workqueue worker + * set_worker_dying - Tag a worker for destruction   * @worker: worker to be destroyed + * @list: transfer worker away from its pool->idle_list and into list   * - * Destroy @worker and adjust @pool stats accordingly.  The worker should - * be idle. + * Tag @worker for destruction and adjust @pool stats accordingly.  The worker + * should be idle.   *   * CONTEXT:   * raw_spin_lock_irq(pool->lock).   */ -static void destroy_worker(struct worker *worker) +static void set_worker_dying(struct worker *worker, struct list_head *list)  {  	struct worker_pool *pool = worker->pool;  	lockdep_assert_held(&pool->lock); +	lockdep_assert_held(&wq_pool_attach_mutex);  	/* sanity check frenzy */  	if (WARN_ON(worker->current_work) || @@ -1997,34 +2038,94 @@ static void destroy_worker(struct worker *worker)  	pool->nr_workers--;  	pool->nr_idle--; -	list_del_init(&worker->entry);  	worker->flags |= WORKER_DIE; -	wake_up_process(worker->task); + +	list_move(&worker->entry, list); +	list_move(&worker->node, &pool->dying_workers);  } +/** + * idle_worker_timeout - check if some idle workers can now be deleted. + * @t: The pool's idle_timer that just expired + * + * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in + * worker_leave_idle(), as a worker flicking between idle and active while its + * pool is at the too_many_workers() tipping point would cause too much timer + * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let + * it expire and re-evaluate things from there. + */  static void idle_worker_timeout(struct timer_list *t)  {  	struct worker_pool *pool = from_timer(pool, t, idle_timer); +	bool do_cull = false; + +	if (work_pending(&pool->idle_cull_work)) +		return;  	raw_spin_lock_irq(&pool->lock); -	while (too_many_workers(pool)) { +	if (too_many_workers(pool)) {  		struct worker *worker;  		unsigned long expires;  		/* idle_list is kept in LIFO order, check the last one */  		worker = list_entry(pool->idle_list.prev, struct worker, entry);  		expires = worker->last_active + IDLE_WORKER_TIMEOUT; +		do_cull = !time_before(jiffies, expires); + +		if (!do_cull) +			mod_timer(&pool->idle_timer, expires); +	} +	raw_spin_unlock_irq(&pool->lock); + +	if (do_cull) +		queue_work(system_unbound_wq, &pool->idle_cull_work); +} + +/** + * idle_cull_fn - cull workers that have been idle for too long. + * @work: the pool's work for handling these idle workers + * + * This goes through a pool's idle workers and gets rid of those that have been + * idle for at least IDLE_WORKER_TIMEOUT seconds. + * + * We don't want to disturb isolated CPUs because of a pcpu kworker being + * culled, so this also resets worker affinity. This requires a sleepable + * context, hence the split between timer callback and work item. + */ +static void idle_cull_fn(struct work_struct *work) +{ +	struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work); +	struct list_head cull_list; + +	INIT_LIST_HEAD(&cull_list); +	/* +	 * Grabbing wq_pool_attach_mutex here ensures an already-running worker +	 * cannot proceed beyong worker_detach_from_pool() in its self-destruct +	 * path. This is required as a previously-preempted worker could run after +	 * set_worker_dying() has happened but before wake_dying_workers() did. 
+	 */ +	mutex_lock(&wq_pool_attach_mutex); +	raw_spin_lock_irq(&pool->lock); + +	while (too_many_workers(pool)) { +		struct worker *worker; +		unsigned long expires; + +		worker = list_entry(pool->idle_list.prev, struct worker, entry); +		expires = worker->last_active + IDLE_WORKER_TIMEOUT;  		if (time_before(jiffies, expires)) {  			mod_timer(&pool->idle_timer, expires);  			break;  		} -		destroy_worker(worker); +		set_worker_dying(worker, &cull_list);  	}  	raw_spin_unlock_irq(&pool->lock); +	wake_dying_workers(&cull_list); +	mutex_unlock(&wq_pool_attach_mutex);  }  static void send_mayday(struct work_struct *work) @@ -2388,12 +2489,12 @@ woke_up:  	/* am I supposed to die? */  	if (unlikely(worker->flags & WORKER_DIE)) {  		raw_spin_unlock_irq(&pool->lock); -		WARN_ON_ONCE(!list_empty(&worker->entry));  		set_pf_worker(false);  		set_task_comm(worker->task, "kworker/dying");  		ida_free(&pool->worker_ida, worker->id);  		worker_detach_from_pool(worker); +		WARN_ON_ONCE(!list_empty(&worker->entry));  		kfree(worker);  		return 0;  	} @@ -3462,10 +3563,12 @@ static int init_worker_pool(struct worker_pool *pool)  	hash_init(pool->busy_hash);  	timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE); +	INIT_WORK(&pool->idle_cull_work, idle_cull_fn);  	timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);  	INIT_LIST_HEAD(&pool->workers); +	INIT_LIST_HEAD(&pool->dying_workers);  	ida_init(&pool->worker_ida);  	INIT_HLIST_NODE(&pool->hash_node); @@ -3540,18 +3643,6 @@ static void rcu_free_pool(struct rcu_head *rcu)  	kfree(pool);  } -/* This returns with the lock held on success (pool manager is inactive). */ -static bool wq_manager_inactive(struct worker_pool *pool) -{ -	raw_spin_lock_irq(&pool->lock); - -	if (pool->flags & POOL_MANAGER_ACTIVE) { -		raw_spin_unlock_irq(&pool->lock); -		return false; -	} -	return true; -} -  /**   * put_unbound_pool - put a worker_pool   * @pool: worker_pool to put @@ -3566,8 +3657,11 @@ static bool wq_manager_inactive(struct worker_pool *pool)  static void put_unbound_pool(struct worker_pool *pool)  {  	DECLARE_COMPLETION_ONSTACK(detach_completion); +	struct list_head cull_list;  	struct worker *worker; +	INIT_LIST_HEAD(&cull_list); +  	lockdep_assert_held(&wq_pool_mutex);  	if (--pool->refcnt) @@ -3587,20 +3681,38 @@ static void put_unbound_pool(struct worker_pool *pool)  	 * Become the manager and destroy all workers.  This prevents  	 * @pool's workers from blocking on attach_mutex.  We're the last  	 * manager and @pool gets freed with the flag set. -	 * Because of how wq_manager_inactive() works, we will hold the -	 * spinlock after a successful wait. +	 * +	 * Having a concurrent manager is quite unlikely to happen as we can +	 * only get here with +	 *   pwq->refcnt == pool->refcnt == 0 +	 * which implies no work queued to the pool, which implies no worker can +	 * become the manager. 
However a worker could have taken the role of +	 * manager before the refcnts dropped to 0, since maybe_create_worker() +	 * drops pool->lock  	 */ -	rcuwait_wait_event(&manager_wait, wq_manager_inactive(pool), -			   TASK_UNINTERRUPTIBLE); -	pool->flags |= POOL_MANAGER_ACTIVE; +	while (true) { +		rcuwait_wait_event(&manager_wait, +				   !(pool->flags & POOL_MANAGER_ACTIVE), +				   TASK_UNINTERRUPTIBLE); + +		mutex_lock(&wq_pool_attach_mutex); +		raw_spin_lock_irq(&pool->lock); +		if (!(pool->flags & POOL_MANAGER_ACTIVE)) { +			pool->flags |= POOL_MANAGER_ACTIVE; +			break; +		} +		raw_spin_unlock_irq(&pool->lock); +		mutex_unlock(&wq_pool_attach_mutex); +	}  	while ((worker = first_idle_worker(pool))) -		destroy_worker(worker); +		set_worker_dying(worker, &cull_list);  	WARN_ON(pool->nr_workers || pool->nr_idle);  	raw_spin_unlock_irq(&pool->lock); -	mutex_lock(&wq_pool_attach_mutex); -	if (!list_empty(&pool->workers)) +	wake_dying_workers(&cull_list); + +	if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers))  		pool->detach_completion = &detach_completion;  	mutex_unlock(&wq_pool_attach_mutex); @@ -3609,6 +3721,7 @@ static void put_unbound_pool(struct worker_pool *pool)  	/* shut down the timers */  	del_timer_sync(&pool->idle_timer); +	cancel_work_sync(&pool->idle_cull_work);  	del_timer_sync(&pool->mayday_timer);  	/* RCU protected to allow dereferences from get_work_pool() */ @@ -3952,7 +4065,8 @@ static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)  /* allocate the attrs and pwqs for later installation */  static struct apply_wqattrs_ctx *  apply_wqattrs_prepare(struct workqueue_struct *wq, -		      const struct workqueue_attrs *attrs) +		      const struct workqueue_attrs *attrs, +		      const cpumask_var_t unbound_cpumask)  {  	struct apply_wqattrs_ctx *ctx;  	struct workqueue_attrs *new_attrs, *tmp_attrs; @@ -3968,14 +4082,15 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,  		goto out_free;  	/* -	 * Calculate the attrs of the default pwq. +	 * Calculate the attrs of the default pwq with unbound_cpumask +	 * which is wq_unbound_cpumask or to set to wq_unbound_cpumask.  	 * If the user configured cpumask doesn't overlap with the  	 * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.  	 */  	copy_workqueue_attrs(new_attrs, attrs); -	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask); +	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, unbound_cpumask);  	if (unlikely(cpumask_empty(new_attrs->cpumask))) -		cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask); +		cpumask_copy(new_attrs->cpumask, unbound_cpumask);  	/*  	 * We may create multiple pwqs with differing cpumasks.  
Make a @@ -4072,7 +4187,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,  		wq->flags &= ~__WQ_ORDERED;  	} -	ctx = apply_wqattrs_prepare(wq, attrs); +	ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask);  	if (!ctx)  		return -ENOMEM; @@ -4414,6 +4529,11 @@ void destroy_workqueue(struct workqueue_struct *wq)  	 */  	workqueue_sysfs_unregister(wq); +	/* mark the workqueue destruction is in progress */ +	mutex_lock(&wq->mutex); +	wq->flags |= __WQ_DESTROYING; +	mutex_unlock(&wq->mutex); +  	/* drain it before proceeding with destruction */  	drain_workqueue(wq); @@ -4709,22 +4829,53 @@ static void pr_cont_pool_info(struct worker_pool *pool)  	pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);  } -static void pr_cont_work(bool comma, struct work_struct *work) +struct pr_cont_work_struct { +	bool comma; +	work_func_t func; +	long ctr; +}; + +static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp) +{ +	if (!pcwsp->ctr) +		goto out_record; +	if (func == pcwsp->func) { +		pcwsp->ctr++; +		return; +	} +	if (pcwsp->ctr == 1) +		pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func); +	else +		pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func); +	pcwsp->ctr = 0; +out_record: +	if ((long)func == -1L) +		return; +	pcwsp->comma = comma; +	pcwsp->func = func; +	pcwsp->ctr = 1; +} + +static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp)  {  	if (work->func == wq_barrier_func) {  		struct wq_barrier *barr;  		barr = container_of(work, struct wq_barrier, work); +		pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);  		pr_cont("%s BAR(%d)", comma ? "," : "",  			task_pid_nr(barr->task));  	} else { -		pr_cont("%s %ps", comma ? "," : "", work->func); +		if (!comma) +			pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); +		pr_cont_work_flush(comma, work->func, pcwsp);  	}  }  static void show_pwq(struct pool_workqueue *pwq)  { +	struct pr_cont_work_struct pcws = { .ctr = 0, };  	struct worker_pool *pool = pwq->pool;  	struct work_struct *work;  	struct worker *worker; @@ -4757,7 +4908,8 @@ static void show_pwq(struct pool_workqueue *pwq)  				worker->rescue_wq ? 
"(RESCUER)" : "",  				worker->current_func);  			list_for_each_entry(work, &worker->scheduled, entry) -				pr_cont_work(false, work); +				pr_cont_work(false, work, &pcws); +			pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);  			comma = true;  		}  		pr_cont("\n"); @@ -4777,9 +4929,10 @@ static void show_pwq(struct pool_workqueue *pwq)  			if (get_work_pwq(work) != pwq)  				continue; -			pr_cont_work(comma, work); +			pr_cont_work(comma, work, &pcws);  			comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);  		} +		pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);  		pr_cont("\n");  	} @@ -4788,9 +4941,10 @@ static void show_pwq(struct pool_workqueue *pwq)  		pr_info("    inactive:");  		list_for_each_entry(work, &pwq->inactive_works, entry) { -			pr_cont_work(comma, work); +			pr_cont_work(comma, work, &pcws);  			comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);  		} +		pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);  		pr_cont("\n");  	}  } @@ -5006,13 +5160,8 @@ static void unbind_workers(int cpu)  		raw_spin_unlock_irq(&pool->lock); -		for_each_pool_worker(worker, pool) { -			kthread_set_per_cpu(worker->task, -1); -			if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) -				WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); -			else -				WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); -		} +		for_each_pool_worker(worker, pool) +			unbind_worker(worker);  		mutex_unlock(&wq_pool_attach_mutex);  	} @@ -5334,7 +5483,7 @@ out_unlock:  }  #endif /* CONFIG_FREEZER */ -static int workqueue_apply_unbound_cpumask(void) +static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)  {  	LIST_HEAD(ctxs);  	int ret = 0; @@ -5350,7 +5499,7 @@ static int workqueue_apply_unbound_cpumask(void)  		if (wq->flags & __WQ_ORDERED)  			continue; -		ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs); +		ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);  		if (!ctx) {  			ret = -ENOMEM;  			break; @@ -5365,6 +5514,11 @@ static int workqueue_apply_unbound_cpumask(void)  		apply_wqattrs_cleanup(ctx);  	} +	if (!ret) { +		mutex_lock(&wq_pool_attach_mutex); +		cpumask_copy(wq_unbound_cpumask, unbound_cpumask); +		mutex_unlock(&wq_pool_attach_mutex); +	}  	return ret;  } @@ -5383,7 +5537,6 @@ static int workqueue_apply_unbound_cpumask(void)  int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)  {  	int ret = -EINVAL; -	cpumask_var_t saved_cpumask;  	/*  	 * Not excluding isolated cpus on purpose. @@ -5397,23 +5550,8 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)  			goto out_unlock;  		} -		if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) { -			ret = -ENOMEM; -			goto out_unlock; -		} - -		/* save the old wq_unbound_cpumask. */ -		cpumask_copy(saved_cpumask, wq_unbound_cpumask); - -		/* update wq_unbound_cpumask at first and apply it to wqs. */ -		cpumask_copy(wq_unbound_cpumask, cpumask); -		ret = workqueue_apply_unbound_cpumask(); - -		/* restore the wq_unbound_cpumask when failed. */ -		if (ret < 0) -			cpumask_copy(wq_unbound_cpumask, saved_cpumask); +		ret = workqueue_apply_unbound_cpumask(cpumask); -		free_cpumask_var(saved_cpumask);  out_unlock:  		apply_wqattrs_unlock();  	}  | 
