summaryrefslogtreecommitdiff
path: root/include/linux/topology.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/topology.h')
-rw-r--r--include/linux/topology.h287
1 files changed, 168 insertions, 119 deletions
diff --git a/include/linux/topology.h b/include/linux/topology.h
index d3cf0d6e7712..6575af39fd10 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -27,30 +27,25 @@
#ifndef _LINUX_TOPOLOGY_H
#define _LINUX_TOPOLOGY_H
+#include <linux/arch_topology.h>
#include <linux/cpumask.h>
+#include <linux/nodemask.h>
#include <linux/bitops.h>
#include <linux/mmzone.h>
#include <linux/smp.h>
#include <linux/percpu.h>
#include <asm/topology.h>
-#ifndef node_has_online_mem
-#define node_has_online_mem(nid) (1)
-#endif
-
#ifndef nr_cpus_node
#define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node))
#endif
-#define for_each_node_with_cpus(node) \
- for_each_online_node(node) \
- if (nr_cpus_node(node))
-
int arch_update_cpu_topology(void);
/* Conform to ACPI 2.0 SLIT distance definitions */
#define LOCAL_DISTANCE 10
#define REMOTE_DISTANCE 20
+#define DISTANCE_BITS 8
#ifndef node_distance
#define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
#endif
@@ -58,122 +53,28 @@ int arch_update_cpu_topology(void);
/*
* If the distance between nodes in a system is larger than RECLAIM_DISTANCE
* (in whatever arch specific measurement units returned by node_distance())
- * then switch on zone reclaim on boot.
+ * and node_reclaim_mode is enabled then the VM will only call node_reclaim()
+ * on nodes within this distance.
*/
#define RECLAIM_DISTANCE 30
#endif
-#ifndef PENALTY_FOR_NODE_WITH_CPUS
-#define PENALTY_FOR_NODE_WITH_CPUS (1)
-#endif
/*
- * Below are the 3 major initializers used in building sched_domains:
- * SD_SIBLING_INIT, for SMT domains
- * SD_CPU_INIT, for SMP domains
+ * The following tunable allows platforms to override the default node
+ * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are
+ * sufficiently fast that the default value actually hurts
+ * performance.
*
- * Any architecture that cares to do any tuning to these values should do so
- * by defining their own arch-specific initializer in include/asm/topology.h.
- * A definition there will automagically override these default initializers
- * and allow arch-specific performance tuning of sched_domains.
- * (Only non-zero and non-null fields need be specified.)
+ * AMD EPYC machines use this because even though the 2-hop distance
+ * is 32 (3.2x slower than a local memory access) performance actually
+ * *improves* if allowed to reclaim memory and load balance tasks
+ * between NUMA nodes 2-hops apart.
*/
+extern int __read_mostly node_reclaim_distance;
-#ifdef CONFIG_SCHED_SMT
-/* MCD - Do we really need this? It is always on if CONFIG_SCHED_SMT is,
- * so can't we drop this in favor of CONFIG_SCHED_SMT?
- */
-#define ARCH_HAS_SCHED_WAKE_IDLE
-/* Common values for SMT siblings */
-#ifndef SD_SIBLING_INIT
-#define SD_SIBLING_INIT (struct sched_domain) { \
- .min_interval = 1, \
- .max_interval = 2, \
- .busy_factor = 64, \
- .imbalance_pct = 110, \
- \
- .flags = 1*SD_LOAD_BALANCE \
- | 1*SD_BALANCE_NEWIDLE \
- | 1*SD_BALANCE_EXEC \
- | 1*SD_BALANCE_FORK \
- | 0*SD_BALANCE_WAKE \
- | 1*SD_WAKE_AFFINE \
- | 1*SD_SHARE_CPUPOWER \
- | 1*SD_SHARE_PKG_RESOURCES \
- | 0*SD_SERIALIZE \
- | 0*SD_PREFER_SIBLING \
- | arch_sd_sibling_asym_packing() \
- , \
- .last_balance = jiffies, \
- .balance_interval = 1, \
- .smt_gain = 1178, /* 15% */ \
-}
-#endif
-#endif /* CONFIG_SCHED_SMT */
-
-#ifdef CONFIG_SCHED_MC
-/* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */
-#ifndef SD_MC_INIT
-#define SD_MC_INIT (struct sched_domain) { \
- .min_interval = 1, \
- .max_interval = 4, \
- .busy_factor = 64, \
- .imbalance_pct = 125, \
- .cache_nice_tries = 1, \
- .busy_idx = 2, \
- .wake_idx = 0, \
- .forkexec_idx = 0, \
- \
- .flags = 1*SD_LOAD_BALANCE \
- | 1*SD_BALANCE_NEWIDLE \
- | 1*SD_BALANCE_EXEC \
- | 1*SD_BALANCE_FORK \
- | 0*SD_BALANCE_WAKE \
- | 1*SD_WAKE_AFFINE \
- | 0*SD_SHARE_CPUPOWER \
- | 1*SD_SHARE_PKG_RESOURCES \
- | 0*SD_SERIALIZE \
- , \
- .last_balance = jiffies, \
- .balance_interval = 1, \
-}
-#endif
-#endif /* CONFIG_SCHED_MC */
-
-/* Common values for CPUs */
-#ifndef SD_CPU_INIT
-#define SD_CPU_INIT (struct sched_domain) { \
- .min_interval = 1, \
- .max_interval = 4, \
- .busy_factor = 64, \
- .imbalance_pct = 125, \
- .cache_nice_tries = 1, \
- .busy_idx = 2, \
- .idle_idx = 1, \
- .newidle_idx = 0, \
- .wake_idx = 0, \
- .forkexec_idx = 0, \
- \
- .flags = 1*SD_LOAD_BALANCE \
- | 1*SD_BALANCE_NEWIDLE \
- | 1*SD_BALANCE_EXEC \
- | 1*SD_BALANCE_FORK \
- | 0*SD_BALANCE_WAKE \
- | 1*SD_WAKE_AFFINE \
- | 0*SD_SHARE_CPUPOWER \
- | 0*SD_SHARE_PKG_RESOURCES \
- | 0*SD_SERIALIZE \
- | 1*SD_PREFER_SIBLING \
- , \
- .last_balance = jiffies, \
- .balance_interval = 1, \
-}
-#endif
-
-#ifdef CONFIG_SCHED_BOOK
-#ifndef SD_BOOK_INIT
-#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
+#ifndef PENALTY_FOR_NODE_WITH_CPUS
+#define PENALTY_FOR_NODE_WITH_CPUS (1)
#endif
-#endif /* CONFIG_SCHED_BOOK */
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DECLARE_PER_CPU(int, numa_node);
@@ -182,7 +83,7 @@ DECLARE_PER_CPU(int, numa_node);
/* Returns the number of the current Node. */
static inline int numa_node_id(void)
{
- return __this_cpu_read(numa_node);
+ return raw_cpu_read(numa_node);
}
#endif
@@ -239,7 +140,7 @@ static inline void set_numa_mem(int node)
/* Returns the number of the nearest Node with memory */
static inline int numa_mem_id(void)
{
- return __this_cpu_read(_numa_mem_);
+ return raw_cpu_read(_numa_mem_);
}
#endif
@@ -276,17 +177,165 @@ static inline int cpu_to_mem(int cpu)
#endif /* [!]CONFIG_HAVE_MEMORYLESS_NODES */
+#if defined(topology_die_id) && defined(topology_die_cpumask)
+#define TOPOLOGY_DIE_SYSFS
+#endif
+#if defined(topology_cluster_id) && defined(topology_cluster_cpumask)
+#define TOPOLOGY_CLUSTER_SYSFS
+#endif
+#if defined(topology_book_id) && defined(topology_book_cpumask)
+#define TOPOLOGY_BOOK_SYSFS
+#endif
+#if defined(topology_drawer_id) && defined(topology_drawer_cpumask)
+#define TOPOLOGY_DRAWER_SYSFS
+#endif
+
#ifndef topology_physical_package_id
#define topology_physical_package_id(cpu) ((void)(cpu), -1)
#endif
+#ifndef topology_die_id
+#define topology_die_id(cpu) ((void)(cpu), -1)
+#endif
+#ifndef topology_cluster_id
+#define topology_cluster_id(cpu) ((void)(cpu), -1)
+#endif
#ifndef topology_core_id
#define topology_core_id(cpu) ((void)(cpu), 0)
#endif
-#ifndef topology_thread_cpumask
-#define topology_thread_cpumask(cpu) cpumask_of(cpu)
+#ifndef topology_book_id
+#define topology_book_id(cpu) ((void)(cpu), -1)
+#endif
+#ifndef topology_drawer_id
+#define topology_drawer_id(cpu) ((void)(cpu), -1)
+#endif
+#ifndef topology_ppin
+#define topology_ppin(cpu) ((void)(cpu), 0ull)
+#endif
+#ifndef topology_sibling_cpumask
+#define topology_sibling_cpumask(cpu) cpumask_of(cpu)
#endif
#ifndef topology_core_cpumask
#define topology_core_cpumask(cpu) cpumask_of(cpu)
#endif
+#ifndef topology_cluster_cpumask
+#define topology_cluster_cpumask(cpu) cpumask_of(cpu)
+#endif
+#ifndef topology_die_cpumask
+#define topology_die_cpumask(cpu) cpumask_of(cpu)
+#endif
+#ifndef topology_book_cpumask
+#define topology_book_cpumask(cpu) cpumask_of(cpu)
+#endif
+#ifndef topology_drawer_cpumask
+#define topology_drawer_cpumask(cpu) cpumask_of(cpu)
+#endif
+
+#if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask)
+static inline const struct cpumask *cpu_smt_mask(int cpu)
+{
+ return topology_sibling_cpumask(cpu);
+}
+#endif
+
+#ifndef topology_is_primary_thread
+
+static inline bool topology_is_primary_thread(unsigned int cpu)
+{
+ /*
+ * When disabling SMT, the primary thread of the SMT will remain
+ * enabled/active. Architectures that have a special primary thread
+ * (e.g. x86) need to override this function. Otherwise the first
+ * thread in the SMT can be made the primary thread.
+ *
+ * The sibling cpumask of an offline CPU always contains the CPU
+ * itself on architectures using the implementation of
+ * CONFIG_GENERIC_ARCH_TOPOLOGY for building their topology.
+ * Other architectures not using CONFIG_GENERIC_ARCH_TOPOLOGY for
+ * building their topology have to check whether to use this default
+ * implementation or to override it.
+ */
+ return cpu == cpumask_first(topology_sibling_cpumask(cpu));
+}
+#define topology_is_primary_thread topology_is_primary_thread
+
+#endif
+
+static inline const struct cpumask *cpu_node_mask(int cpu)
+{
+ return cpumask_of_node(cpu_to_node(cpu));
+}
+
+#ifdef CONFIG_NUMA
+int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node);
+extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops);
+#else
+static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
+{
+ return cpumask_nth_and(cpu, cpus, cpu_online_mask);
+}
+
+static inline const struct cpumask *
+sched_numa_hop_mask(unsigned int node, unsigned int hops)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+#endif /* CONFIG_NUMA */
+
+/**
+ * for_each_node_numadist() - iterate over nodes in increasing distance
+ * order, starting from a given node
+ * @node: the iteration variable and the starting node.
+ * @unvisited: a nodemask to keep track of the unvisited nodes.
+ *
+ * This macro iterates over NUMA node IDs in increasing distance from the
+ * starting @node and yields MAX_NUMNODES when all the nodes have been
+ * visited.
+ *
+ * Note that by the time the loop completes, the @unvisited nodemask will
+ * be fully cleared, unless the loop exits early.
+ *
+ * The difference between for_each_node() and for_each_node_numadist() is
+ * that the former allows to iterate over nodes in numerical order, whereas
+ * the latter iterates over nodes in increasing order of distance.
+ *
+ * This complexity of this iterator is O(N^2), where N represents the
+ * number of nodes, as each iteration involves scanning all nodes to
+ * find the one with the shortest distance.
+ *
+ * Requires rcu_lock to be held.
+ */
+#define for_each_node_numadist(node, unvisited) \
+ for (int __start = (node), \
+ (node) = nearest_node_nodemask((__start), &(unvisited)); \
+ (node) < MAX_NUMNODES; \
+ node_clear((node), (unvisited)), \
+ (node) = nearest_node_nodemask((__start), &(unvisited)))
+
+/**
+ * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
+ * from a given node.
+ * @mask: the iteration variable.
+ * @node: the NUMA node to start the search from.
+ *
+ * Requires rcu_lock to be held.
+ *
+ * Yields cpu_online_mask for @node == NUMA_NO_NODE.
+ */
+#define for_each_numa_hop_mask(mask, node) \
+ for (unsigned int __hops = 0; \
+ mask = (node != NUMA_NO_NODE || __hops) ? \
+ sched_numa_hop_mask(node, __hops) : \
+ cpu_online_mask, \
+ !IS_ERR_OR_NULL(mask); \
+ __hops++)
+
+DECLARE_PER_CPU(unsigned long, cpu_scale);
+
+static inline unsigned long topology_get_cpu_scale(int cpu)
+{
+ return per_cpu(cpu_scale, cpu);
+}
+
+void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity);
#endif /* _LINUX_TOPOLOGY_H */