 Documentation/core-api/workqueue.rst | 30
 include/linux/workqueue.h            | 11
 kernel/workqueue.c                   | 74
 tools/workqueue/wq_dump.py           | 16
 tools/workqueue/wq_monitor.py        | 21
 5 files changed, 132 insertions(+), 20 deletions(-)
diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst
index 56af317508c9..c73a6df6a118 100644
--- a/Documentation/core-api/workqueue.rst
+++ b/Documentation/core-api/workqueue.rst
@@ -353,9 +353,10 @@ Affinity Scopes
An unbound workqueue groups CPUs according to its affinity scope to improve
cache locality. For example, if a workqueue is using the default affinity
scope of "cache", it will group CPUs according to last level cache
-boundaries. A work item queued on the workqueue will be processed by a
-worker running on one of the CPUs which share the last level cache with the
-issuing CPU.
+boundaries. A work item queued on the workqueue will be assigned to a worker
+on one of the CPUs which share the last level cache with the issuing CPU.
+Once started, the worker may or may not be allowed to move outside the scope
+depending on the ``affinity_strict`` setting of the scope.
Workqueue currently supports the following five affinity scopes.
@@ -391,6 +392,21 @@ directory.
``affinity_scope``
Read to see the current affinity scope. Write to change.
+``affinity_strict``
+ 0 by default indicating that affinity scopes are not strict. When a work
+ item starts execution, workqueue makes a best-effort attempt to ensure
+ that the worker is inside its affinity scope, which is called
+ repatriation. Once started, the scheduler is free to move the worker
+ anywhere in the system as it sees fit. This enables benefiting from scope
+ locality while still being able to utilize other CPUs if necessary and
+ available.
+
+ If set to 1, all workers of the scope are guaranteed always to be in the
+ scope. This may be useful when crossing affinity scopes has other
+ implications, for example, in terms of power consumption or workload
+ isolation. Strict NUMA scope can also be used to match the workqueue
+ behavior of older kernels.
+
Examining Configuration
=======================
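
For a workqueue created with ``WQ_SYSFS``, the attribute described above can
also be flipped from userspace. A minimal sketch, not part of this patch,
assuming the usual /sys/devices/virtual/workqueue/<name>/ layout and using
"writeback" as a stand-in workqueue name::

	/* Illustrative only: make an unbound workqueue's affinity scope strict. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		const char *path =
			"/sys/devices/virtual/workqueue/writeback/affinity_strict";
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* "1" => strict, "0" (default) => best-effort repatriation only */
		if (write(fd, "1", 1) != 1)
			perror("write");
		close(fd);
		return 0;
	}
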
@@ -475,21 +491,21 @@ Monitoring
Use tools/workqueue/wq_monitor.py to monitor workqueue operations: ::
$ tools/workqueue/wq_monitor.py events
- total infl CPUtime CPUhog CMwake mayday rescued
+ total infl CPUtime CPUhog CMW/RPR mayday rescued
events 18545 0 6.1 0 5 - -
events_highpri 8 0 0.0 0 0 - -
events_long 3 0 0.0 0 0 - -
- events_unbound 38306 0 0.1 - - - -
+ events_unbound 38306 0 0.1 - 7 - -
events_freezable 0 0 0.0 0 0 - -
events_power_efficient 29598 0 0.2 0 0 - -
events_freezable_power_ 10 0 0.0 0 0 - -
sock_diag_events 0 0 0.0 0 0 - -
- total infl CPUtime CPUhog CMwake mayday rescued
+ total infl CPUtime CPUhog CMW/RPR mayday rescued
events 18548 0 6.1 0 5 - -
events_highpri 8 0 0.0 0 0 - -
events_long 3 0 0.0 0 0 - -
- events_unbound 38322 0 0.1 - - - -
+ events_unbound 38322 0 0.1 - 7 - -
events_freezable 0 0 0.0 0 0 - -
events_power_efficient 29603 0 0.2 0 0 - -
events_freezable_power_ 10 0 0.0 0 0 - -
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index fe53976e088e..0c1cad38f9db 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -169,6 +169,17 @@ struct workqueue_attrs {
*/
cpumask_var_t __pod_cpumask;
+ /**
+ * @affn_strict: affinity scope is strict
+ *
+ * If clear, workqueue will make a best-effort attempt at starting the
+ * worker inside @__pod_cpumask but the scheduler is free to migrate it
+ * outside.
+ *
+ * If set, workers are only allowed to run inside @__pod_cpumask.
+ */
+ bool affn_strict;
+
/*
* Below fields aren't properties of a worker_pool. They only modify how
* :c:func:`apply_workqueue_attrs` select pools and thus don't
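
As a usage sketch only (not part of this patch): an in-kernel caller could mark
an unbound workqueue strict by applying attrs with this bit set. Whether the
attrs helpers are reachable from a given context is an assumption (they are
declared in this header but not necessarily exported to modules), and the
locking shown follows the convention that apply_workqueue_attrs() is called
with the CPU hotplug lock read-held, which may differ across kernel versions.
A real user would also start from the workqueue's current unbound attrs rather
than freshly allocated ones, so as not to reset nice/cpumask to defaults.

	static int make_wq_strict(struct workqueue_struct *wq)
	{
		struct workqueue_attrs *attrs;
		int ret;

		attrs = alloc_workqueue_attrs();
		if (!attrs)
			return -ENOMEM;

		/* keep workers inside their pod's @__pod_cpumask */
		attrs->affn_strict = true;

		cpus_read_lock();	/* hotplug read lock held across apply */
		ret = apply_workqueue_attrs(wq, attrs);
		cpus_read_unlock();

		free_workqueue_attrs(attrs);
		return ret;
	}
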
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e61b4291bec8..6f6f4f37ceb3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -211,6 +211,7 @@ enum pool_workqueue_stats {
PWQ_STAT_CPU_TIME, /* total CPU time consumed */
PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */
PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */
+ PWQ_STAT_REPATRIATED, /* unbound workers brought back into scope */
PWQ_STAT_MAYDAY, /* maydays to rescuer */
PWQ_STAT_RESCUED, /* linked work items executed by rescuer */
@@ -1103,13 +1104,41 @@ static bool assign_work(struct work_struct *work, struct worker *worker,
static bool kick_pool(struct worker_pool *pool)
{
struct worker *worker = first_idle_worker(pool);
+ struct task_struct *p;
lockdep_assert_held(&pool->lock);
if (!need_more_worker(pool) || !worker)
return false;
- wake_up_process(worker->task);
+ p = worker->task;
+
+#ifdef CONFIG_SMP
+ /*
+ * Idle @worker is about to execute @work and waking up provides an
+ * opportunity to migrate @worker at a lower cost by setting the task's
+ * wake_cpu field. Let's see if we want to move @worker to improve
+ * execution locality.
+ *
+ * We're waking the worker that went idle the latest and there's some
+ * chance that @worker is marked idle but hasn't gone off CPU yet. If
+ * so, setting the wake_cpu won't do anything. As this is a best-effort
+ * optimization and the race window is narrow, let's leave as-is for
+ * now. If this becomes pronounced, we can skip over workers which are
+ * still on cpu when picking an idle worker.
+ *
+ * If @pool has non-strict affinity, @worker might have ended up outside
+ * its affinity scope. Repatriate.
+ */
+ if (!pool->attrs->affn_strict &&
+ !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) {
+ struct work_struct *work = list_first_entry(&pool->worklist,
+ struct work_struct, entry);
+ p->wake_cpu = cpumask_any_distribute(pool->attrs->__pod_cpumask);
+ get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;
+ }
+#endif
+ wake_up_process(p);
return true;
}
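
The wake_cpu nudge above amounts to: if the idle worker last ran outside the
pod, point its next wakeup at a CPU inside the pod, rotating through the pod
the way cpumask_any_distribute() does. A self-contained userspace model,
illustrative only; plain bitmasks stand in for cpumasks and an 8-CPU system
is assumed:

	#include <stdio.h>

	/* Pick where to wake an idle worker: stay put if the last CPU is
	 * already inside the pod, otherwise "repatriate" to a pod CPU,
	 * advancing a rotor so successive wakeups spread across the pod. */
	static int pick_wake_cpu(unsigned int pod_mask, int last_cpu,
				 unsigned int *rotor)
	{
		if (pod_mask & (1u << last_cpu))
			return last_cpu;

		for (int i = 0; i < 8; i++) {
			int cpu = (*rotor + i) % 8;

			if (pod_mask & (1u << cpu)) {
				*rotor = cpu + 1;
				return cpu;
			}
		}
		return last_cpu;	/* empty pod: leave wake_cpu alone */
	}

	int main(void)
	{
		unsigned int pod = 0x0f;	/* CPUs 0-3 share the scope */
		unsigned int rotor = 0;

		printf("%d\n", pick_wake_cpu(pod, 6, &rotor));	/* off-scope -> 0 */
		printf("%d\n", pick_wake_cpu(pod, 2, &rotor));	/* in-scope  -> 2 */
		return 0;
	}
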
@@ -2051,7 +2080,10 @@ static struct worker *alloc_worker(int node)
static cpumask_t *pool_allowed_cpus(struct worker_pool *pool)
{
- return pool->attrs->__pod_cpumask;
+ if (pool->cpu < 0 && pool->attrs->affn_strict)
+ return pool->attrs->__pod_cpumask;
+ else
+ return pool->attrs->cpumask;
}
/**
@@ -3715,6 +3747,7 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
to->nice = from->nice;
cpumask_copy(to->cpumask, from->cpumask);
cpumask_copy(to->__pod_cpumask, from->__pod_cpumask);
+ to->affn_strict = from->affn_strict;
/*
* Unlike hash and equality test, copying shouldn't ignore wq-only
@@ -3745,6 +3778,7 @@ static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
hash = jhash(cpumask_bits(attrs->__pod_cpumask),
BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
+ hash = jhash_1word(attrs->affn_strict, hash);
return hash;
}
@@ -3758,6 +3792,8 @@ static bool wqattrs_equal(const struct workqueue_attrs *a,
return false;
if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask))
return false;
+ if (a->affn_strict != b->affn_strict)
+ return false;
return true;
}
@@ -5847,6 +5883,7 @@ module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644);
* nice RW int : nice value of the workers
* cpumask RW mask : bitmask of allowed CPUs for the workers
* affinity_scope RW str : worker CPU affinity scope (cache, numa, none)
+ * affinity_strict RW bool : worker CPU affinity is strict
*/
struct wq_device {
struct workqueue_struct *wq;
@@ -6026,10 +6063,42 @@ static ssize_t wq_affn_scope_store(struct device *dev,
return ret ?: count;
}
+static ssize_t wq_affinity_strict_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n",
+ wq->unbound_attrs->affn_strict);
+}
+
+static ssize_t wq_affinity_strict_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int v, ret = -ENOMEM;
+
+ if (sscanf(buf, "%d", &v) != 1)
+ return -EINVAL;
+
+ apply_wqattrs_lock();
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (attrs) {
+ attrs->affn_strict = (bool)v;
+ ret = apply_workqueue_attrs_locked(wq, attrs);
+ }
+ apply_wqattrs_unlock();
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
static struct device_attribute wq_sysfs_unbound_attrs[] = {
__ATTR(nice, 0644, wq_nice_show, wq_nice_store),
__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
__ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store),
+ __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store),
__ATTR_NULL,
};
@@ -6452,6 +6521,7 @@ void __init workqueue_init_early(void)
cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu));
pool->attrs->nice = std_nice[i++];
+ pool->attrs->affn_strict = true;
pool->node = cpu_to_node(cpu);
/* alloc pool ID */
diff --git a/tools/workqueue/wq_dump.py b/tools/workqueue/wq_dump.py
index 43ab71a193b8..d0df5833f2c1 100644
--- a/tools/workqueue/wq_dump.py
+++ b/tools/workqueue/wq_dump.py
@@ -36,10 +36,11 @@ Workqueue CPU -> pool
Lists all workqueues along with their type and worker pool association. For
each workqueue:
- NAME TYPE POOL_ID...
+ NAME TYPE[,FLAGS] POOL_ID...
NAME name of the workqueue
TYPE percpu, unbound or ordered
+ FLAGS S: strict affinity scope
POOL_ID worker pool ID associated with each possible CPU
"""
@@ -138,13 +139,16 @@ for pi, pool in idr_for_each(worker_pool_idr):
print(f'cpu={pool.cpu.value_():3}', end='')
else:
print(f'cpus={cpumask_str(pool.attrs.cpumask)}', end='')
+ print(f' pod_cpus={cpumask_str(pool.attrs.__pod_cpumask)}', end='')
+ if pool.attrs.affn_strict:
+ print(' strict', end='')
print('')
print('')
print('Workqueue CPU -> pool')
print('=====================')
-print('[ workqueue \ CPU ', end='')
+print('[ workqueue \ type CPU', end='')
for cpu in for_each_possible_cpu(prog):
print(f' {cpu:{max_pool_id_len}}', end='')
print(' dfl]')
@@ -153,11 +157,15 @@ for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_(
print(f'{wq.name.string_().decode()[-24:]:24}', end='')
if wq.flags & WQ_UNBOUND:
if wq.flags & WQ_ORDERED:
- print(' ordered', end='')
+ print(' ordered ', end='')
else:
print(' unbound', end='')
+ if wq.unbound_attrs.affn_strict:
+ print(',S ', end='')
+ else:
+ print(' ', end='')
else:
- print(' percpu ', end='')
+ print(' percpu ', end='')
for cpu in for_each_possible_cpu(prog):
pool_id = per_cpu_ptr(wq.cpu_pwq, cpu)[0].pool.id.value_()
diff --git a/tools/workqueue/wq_monitor.py b/tools/workqueue/wq_monitor.py
index 6e258d123e8c..a8856a9c45dc 100644
--- a/tools/workqueue/wq_monitor.py
+++ b/tools/workqueue/wq_monitor.py
@@ -20,8 +20,11 @@ https://github.com/osandov/drgn.
and got excluded from concurrency management to avoid stalling
other work items.
- CMwake The number of concurrency-management wake-ups while executing a
- work item of the workqueue.
+ CMW/RPR For per-cpu workqueues, the number of concurrency-management
+ wake-ups while executing a work item of the workqueue. For
+ unbound workqueues, the number of times a worker was repatriated
+ to its affinity scope after being migrated to an off-scope CPU by
+ the scheduler.
mayday The number of times the rescuer was requested while waiting for
new worker creation.
@@ -65,6 +68,7 @@ PWQ_STAT_COMPLETED = prog['PWQ_STAT_COMPLETED'] # work items completed exec
PWQ_STAT_CPU_TIME = prog['PWQ_STAT_CPU_TIME'] # total CPU time consumed
PWQ_STAT_CPU_INTENSIVE = prog['PWQ_STAT_CPU_INTENSIVE'] # wq_cpu_intensive_thresh_us violations
PWQ_STAT_CM_WAKEUP = prog['PWQ_STAT_CM_WAKEUP'] # concurrency-management worker wakeups
+PWQ_STAT_REPATRIATED = prog['PWQ_STAT_REPATRIATED'] # unbound workers brought back into scope
PWQ_STAT_MAYDAY = prog['PWQ_STAT_MAYDAY'] # maydays to rescuer
PWQ_STAT_RESCUED = prog['PWQ_STAT_RESCUED'] # linked work items executed by rescuer
PWQ_NR_STATS = prog['PWQ_NR_STATS']
@@ -89,22 +93,25 @@ class WqStats:
'cpu_time' : self.stats[PWQ_STAT_CPU_TIME],
'cpu_intensive' : self.stats[PWQ_STAT_CPU_INTENSIVE],
'cm_wakeup' : self.stats[PWQ_STAT_CM_WAKEUP],
+ 'repatriated' : self.stats[PWQ_STAT_REPATRIATED],
'mayday' : self.stats[PWQ_STAT_MAYDAY],
'rescued' : self.stats[PWQ_STAT_RESCUED], }
def table_header_str():
return f'{"":>24} {"total":>8} {"infl":>5} {"CPUtime":>8} '\
- f'{"CPUitsv":>7} {"CMwake":>7} {"mayday":>7} {"rescued":>7}'
+ f'{"CPUitsv":>7} {"CMW/RPR":>7} {"mayday":>7} {"rescued":>7}'
def table_row_str(self):
cpu_intensive = '-'
- cm_wakeup = '-'
+ cmw_rpr = '-'
mayday = '-'
rescued = '-'
- if not self.unbound:
+ if self.unbound:
+ cmw_rpr = str(self.stats[PWQ_STAT_REPATRIATED]);
+ else:
cpu_intensive = str(self.stats[PWQ_STAT_CPU_INTENSIVE])
- cm_wakeup = str(self.stats[PWQ_STAT_CM_WAKEUP])
+ cmw_rpr = str(self.stats[PWQ_STAT_CM_WAKEUP])
if self.mem_reclaim:
mayday = str(self.stats[PWQ_STAT_MAYDAY])
@@ -115,7 +122,7 @@ class WqStats:
f'{max(self.stats[PWQ_STAT_STARTED] - self.stats[PWQ_STAT_COMPLETED], 0):5} ' \
f'{self.stats[PWQ_STAT_CPU_TIME] / 1000000:8.1f} ' \
f'{cpu_intensive:>7} ' \
- f'{cm_wakeup:>7} ' \
+ f'{cmw_rpr:>7} ' \
f'{mayday:>7} ' \
f'{rescued:>7} '
return out.rstrip(':')