workqueue: Add multiple affinity scopes and interface to select them

Add three more affinity scopes - WQ_AFFN_CPU, SMT and CACHE - and make CACHE the default. The code changes to actually add the additional scopes are trivial. Also add module parameter "workqueue.default_affinity_scope" to override the default scope and "affinity_scope" sysfs file to configure it per workqueue. wq_dump.py and documentations are updated accordingly. This enables significant flexibility in configuring how unbound workqueues behave. If affinity scope is set to "cpu", it'll behave close to a per-cpu workqueue. On the other hand, "system" removes all locality boundaries. Many modern machines have multiple L3 caches often while being mostly uniform in terms of memory access. Thus, workqueue's previous behavior of spreading work items in each NUMA node had negative performance implications from unncessarily crossing L3 boundaries between issue and execution. However, picking a finer grained affinity scope also has a downside in that an issuer in one group can't utilize CPUs in other groups. While dependent on the specifics of workload, there's usually a noticeable penalty in crossing L3 boundaries, so let's default to CACHE. This issue will be further addressed and documented with examples in future patches. Signed-off-by: Tejun Heo <tj@kernel.org>
author: Tejun Heo <tj@kernel.org> 2023-08-07 15:57:24 -1000
committer: Tejun Heo <tj@kernel.org> 2023-08-07 15:57:24 -1000
commit: 63c5484e74952f60f5810256bd69814d167b8d22 (patch)
tree: e045e2d922b5f462856ffb2058621d8e63e8a59e /kernel/workqueue.c
parent: 025e16845877e80cb169274b330c236056ba553c (diff)
1 files changed, 105 insertions, 5 deletions
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a2cc0432abd5..8e3a499c3f00 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -338,6 +338,15 @@ struct wq_pod_type {
 };
 
 static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
+static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_DFL;
+
+static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
+	[WQ_AFFN_CPU]			= "cpu",
+	[WQ_AFFN_SMT]			= "smt",
+	[WQ_AFFN_CACHE]			= "cache",
+	[WQ_AFFN_NUMA]			= "numa",
+	[WQ_AFFN_SYSTEM]		= "system",
+};
 
 /*
  * Per-cpu work items which run for longer than the following threshold are
@@ -3664,7 +3673,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(void)
 		goto fail;
 
 	cpumask_copy(attrs->cpumask, cpu_possible_mask);
-	attrs->affn_scope = WQ_AFFN_DFL;
+	attrs->affn_scope = wq_affn_dfl;
 	return attrs;
 fail:
 	free_workqueue_attrs(attrs);
@@ -5777,19 +5786,55 @@ out_unlock:
 	return ret;
 }
 
+static int parse_affn_scope(const char *val)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) {
+		if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i])))
+			return i;
+	}
+	return -EINVAL;
+}
+
+static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp)
+{
+	int affn;
+
+	affn = parse_affn_scope(val);
+	if (affn < 0)
+		return affn;
+
+	wq_affn_dfl = affn;
+	return 0;
+}
+
+static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp)
+{
+	return scnprintf(buffer, PAGE_SIZE, "%s\n", wq_affn_names[wq_affn_dfl]);
+}
+
+static const struct kernel_param_ops wq_affn_dfl_ops = {
+	.set	= wq_affn_dfl_set,
+	.get	= wq_affn_dfl_get,
+};
+
+module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644);
+
 #ifdef CONFIG_SYSFS
 /*
  * Workqueues with WQ_SYSFS flag set is visible to userland via
  * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
  * following attributes.
  *
- *  per_cpu	RO bool	: whether the workqueue is per-cpu or unbound
- *  max_active	RW int	: maximum number of in-flight work items
+ *  per_cpu		RO bool	: whether the workqueue is per-cpu or unbound
+ *  max_active		RW int	: maximum number of in-flight work items
  *
  * Unbound workqueues have the following extra attributes.
  *
- *  nice	RW int	: nice value of the workers
- *  cpumask	RW mask	: bitmask of allowed CPUs for the workers
+ *  nice		RW int	: nice value of the workers
+ *  cpumask		RW mask	: bitmask of allowed CPUs for the workers
+ *  affinity_scope	RW str  : worker CPU affinity scope (cache, numa, none)
  */
 struct wq_device {
 	struct workqueue_struct		*wq;
@@ -5932,9 +5977,47 @@ out_unlock:
 	return ret ?: count;
 }
 
+static ssize_t wq_affn_scope_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int written;
+
+	mutex_lock(&wq->mutex);
+	written = scnprintf(buf, PAGE_SIZE, "%s\n",
+			    wq_affn_names[wq->unbound_attrs->affn_scope]);
+	mutex_unlock(&wq->mutex);
+
+	return written;
+}
+
+static ssize_t wq_affn_scope_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t count)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	struct workqueue_attrs *attrs;
+	int affn, ret = -ENOMEM;
+
+	affn = parse_affn_scope(buf);
+	if (affn < 0)
+		return affn;
+
+	apply_wqattrs_lock();
+	attrs = wq_sysfs_prep_attrs(wq);
+	if (attrs) {
+		attrs->affn_scope = affn;
+		ret = apply_workqueue_attrs_locked(wq, attrs);
+	}
+	apply_wqattrs_unlock();
+	free_workqueue_attrs(attrs);
+	return ret ?: count;
+}
+
 static struct device_attribute wq_sysfs_unbound_attrs[] = {
 	__ATTR(nice, 0644, wq_nice_show, wq_nice_store),
 	__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
+	__ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store),
 	__ATTR_NULL,
 };
 
@@ -6537,6 +6620,20 @@ static void __init init_pod_type(struct wq_pod_type *pt,
 	}
 }
 
+static bool __init cpus_dont_share(int cpu0, int cpu1)
+{
+	return false;
+}
+
+static bool __init cpus_share_smt(int cpu0, int cpu1)
+{
+#ifdef CONFIG_SCHED_SMT
+	return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1));
+#else
+	return false;
+#endif
+}
+
 static bool __init cpus_share_numa(int cpu0, int cpu1)
 {
 	return cpu_to_node(cpu0) == cpu_to_node(cpu1);
@@ -6554,6 +6651,9 @@ void __init workqueue_init_topology(void)
 	struct workqueue_struct *wq;
 	int cpu;
 
+	init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share);
+	init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt);
+	init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache);
 	init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa);
 
 	mutex_lock(&wq_pool_mutex);
author	Tejun Heo <tj@kernel.org>	2023-08-07 15:57:24 -1000
committer	Tejun Heo <tj@kernel.org>	2023-08-07 15:57:24 -1000
commit	63c5484e74952f60f5810256bd69814d167b8d22 (patch)
tree	e045e2d922b5f462856ffb2058621d8e63e8a59e /kernel/workqueue.c
parent	025e16845877e80cb169274b330c236056ba553c (diff)