Diffstat (limited to 'mm/memory-tiers.c')
 mm/memory-tiers.c | 384 ++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 326 insertions(+), 58 deletions(-)
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index c734658c6242..864811fff409 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -5,6 +5,8 @@
#include <linux/kobject.h>
#include <linux/memory.h>
#include <linux/memory-tiers.h>
+#include <linux/notifier.h>
+#include <linux/sched/sysctl.h>
#include "internal.h"
@@ -35,14 +37,38 @@ struct node_memory_type_map {
static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
+/*
+ * The list is used to store all memory types that are not created
+ * by a device driver.
+ */
+static LIST_HEAD(default_memory_types);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
-static struct memory_dev_type *default_dram_type;
+struct memory_dev_type *default_dram_type;
+nodemask_t default_dram_nodes __initdata = NODE_MASK_NONE;
-static struct bus_type memory_tier_subsys = {
+static const struct bus_type memory_tier_subsys = {
.name = "memory_tiering",
.dev_name = "memory_tier",
};
+#ifdef CONFIG_NUMA_BALANCING
+/**
+ * folio_use_access_time - check if a folio reuses cpupid for page access time
+ * @folio: folio to check
+ *
+ * The folio's _last_cpupid field is repurposed by memory tiering. In memory
+ * tiering mode, the cpupid of a slow memory (i.e. non-top-tier) folio is
+ * used to record the page access time instead.
+ *
+ * Return: true if the folio's _last_cpupid is used to record page access
+ * time, false otherwise.
+ */
+bool folio_use_access_time(struct folio *folio)
+{
+ return (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ !node_is_toptier(folio_nid(folio));
+}
+#endif
+
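As a minimal sketch (not part of this patch) of how the helper above is meant to be consumed: when it returns true, the value in the folio's _last_cpupid field is an access timestamp rather than a cpupid, so callers must branch on it before interpreting the field. folio_last_cpupid() is the existing accessor for that field; the example_* name is invented.

static inline int example_folio_access_time(struct folio *folio)
{
	/* Slow-memory folio under memory tiering: field holds an access time. */
	if (folio_use_access_time(folio))
		return folio_last_cpupid(folio);

	return -1;	/* field holds a cpupid, not a timestamp */
}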
#ifdef CONFIG_MIGRATION
static int top_tier_adistance;
/*
@@ -105,6 +131,15 @@ static int top_tier_adistance;
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */
+static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
+
+/* The lock is used to protect `default_dram_perf*` info and nid. */
+static DEFINE_MUTEX(default_dram_perf_lock);
+static bool default_dram_perf_error;
+static struct access_coordinate default_dram_perf;
+static int default_dram_perf_ref_nid = NUMA_NO_NODE;
+static const char *default_dram_perf_ref_source;
+
static inline struct memory_tier *to_memory_tier(struct device *device)
{
return container_of(device, struct memory_tier, dev);
@@ -115,7 +150,7 @@ static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memti
nodemask_t nodes = NODE_MASK_NONE;
struct memory_dev_type *memtype;
- list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
+ list_for_each_entry(memtype, &memtier->memory_types, tier_sibling)
nodes_or(nodes, nodes, memtype->nodes);
return nodes;
@@ -174,7 +209,7 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
* If the memtype is already part of a memory tier,
* just return that.
*/
- if (!list_empty(&memtype->tier_sibiling)) {
+ if (!list_empty(&memtype->tier_sibling)) {
list_for_each_entry(memtier, &memory_tiers, list) {
if (adistance == memtier->adistance_start)
return memtier;
@@ -211,14 +246,14 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
ret = device_register(&new_memtier->dev);
if (ret) {
- list_del(&memtier->list);
- put_device(&memtier->dev);
+ list_del(&new_memtier->list);
+ put_device(&new_memtier->dev);
return ERR_PTR(ret);
}
memtier = new_memtier;
link_memtype:
- list_add(&memtype->tier_sibiling, &memtier->memory_types);
+ list_add(&memtype->tier_sibling, &memtier->memory_types);
return memtier;
}
@@ -351,6 +386,26 @@ static void disable_all_demotion_targets(void)
synchronize_rcu();
}
+static void dump_demotion_targets(void)
+{
+ int node;
+
+ for_each_node_state(node, N_MEMORY) {
+ struct memory_tier *memtier = __node_get_memory_tier(node);
+ nodemask_t preferred = node_demotion[node].preferred;
+
+ if (!memtier)
+ continue;
+
+ if (nodes_empty(preferred))
+ pr_info("Demotion targets for Node %d: null\n", node);
+ else
+ pr_info("Demotion targets for Node %d: preferred: %*pbl, fallback: %*pbl\n",
+ node, nodemask_pr_args(&preferred),
+ nodemask_pr_args(&memtier->lower_tier_mask));
+ }
+}
+
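For reference, on a hypothetical machine with DRAM on nodes 0-1 and slower (e.g. CXL) memory on nodes 2-3, the dump above would emit lines of the form:

  Demotion targets for Node 0: preferred: 2, fallback: 2-3
  Demotion targets for Node 2: null

(node numbers and masks are invented for illustration; the actual values depend on the tier layout).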
/*
* Find an automatic demotion target for all memory
* nodes. Failing here is OK. It might just indicate
@@ -366,7 +421,7 @@ static void establish_demotion_targets(void)
lockdep_assert_held_once(&memory_tier_lock);
- if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION))
+ if (!node_demotion)
return;
disable_all_demotion_targets();
@@ -435,7 +490,7 @@ static void establish_demotion_targets(void)
* Now build the lower_tier mask for each node collecting node mask from
* all memory tier below it. This allows us to fallback demotion page
* allocation to a set of nodes that is closer the above selected
- * perferred node.
+ * preferred node.
*/
lower_tier = node_states[N_MEMORY];
list_for_each_entry(memtier, &memory_tiers, list) {
@@ -448,10 +503,11 @@ static void establish_demotion_targets(void)
nodes_andnot(lower_tier, lower_tier, tier_nodes);
memtier->lower_tier_mask = lower_tier;
}
+
+ dump_demotion_targets();
}
#else
-static inline void disable_all_demotion_targets(void) {}
static inline void establish_demotion_targets(void) {}
#endif /* CONFIG_MIGRATION */
@@ -463,7 +519,7 @@ static inline void __init_node_memory_type(int node, struct memory_dev_type *mem
* for each device getting added in the same NUMA node
* with this specific memtype, bump the map count. We
* only take the memtype device reference once, so that
- * changing a node memtype can be done by droping the
+ * changing a node memtype can be done by dropping the
* only reference count taken here.
*/
@@ -476,7 +532,8 @@ static inline void __init_node_memory_type(int node, struct memory_dev_type *mem
static struct memory_tier *set_node_memory_tier(int node)
{
struct memory_tier *memtier;
- struct memory_dev_type *memtype;
+ struct memory_dev_type *memtype = default_dram_type;
+ int adist = MEMTIER_ADISTANCE_DRAM;
pg_data_t *pgdat = NODE_DATA(node);
@@ -485,7 +542,16 @@ static struct memory_tier *set_node_memory_tier(int node)
if (!node_state(node, N_MEMORY))
return ERR_PTR(-EINVAL);
- __init_node_memory_type(node, default_dram_type);
+ mt_calc_adistance(node, &adist);
+ if (!node_memory_types[node].memtype) {
+ memtype = mt_find_alloc_memory_type(adist, &default_memory_types);
+ if (IS_ERR(memtype)) {
+ memtype = default_dram_type;
+ pr_info("Failed to allocate a memory type. Fall back.\n");
+ }
+ }
+
+ __init_node_memory_type(node, memtype);
memtype = node_memory_types[node].memtype;
node_set(node, memtype->nodes);
@@ -528,7 +594,7 @@ static bool clear_node_memory_tier(int node)
memtype = node_memory_types[node].memtype;
node_clear(node, memtype->nodes);
if (nodes_empty(memtype->nodes)) {
- list_del_init(&memtype->tier_sibiling);
+ list_del_init(&memtype->tier_sibling);
if (list_empty(&memtier->memory_types))
destroy_memory_tier(memtier);
}
@@ -554,18 +620,18 @@ struct memory_dev_type *alloc_memory_type(int adistance)
return ERR_PTR(-ENOMEM);
memtype->adistance = adistance;
- INIT_LIST_HEAD(&memtype->tier_sibiling);
+ INIT_LIST_HEAD(&memtype->tier_sibling);
memtype->nodes = NODE_MASK_NONE;
kref_init(&memtype->kref);
return memtype;
}
EXPORT_SYMBOL_GPL(alloc_memory_type);
-void destroy_memory_type(struct memory_dev_type *memtype)
+void put_memory_type(struct memory_dev_type *memtype)
{
kref_put(&memtype->kref, release_memtype);
}
-EXPORT_SYMBOL_GPL(destroy_memory_type);
+EXPORT_SYMBOL_GPL(put_memory_type);
void init_node_memory_type(int node, struct memory_dev_type *memtype)
{
@@ -579,43 +645,245 @@ EXPORT_SYMBOL_GPL(init_node_memory_type);
void clear_node_memory_type(int node, struct memory_dev_type *memtype)
{
mutex_lock(&memory_tier_lock);
- if (node_memory_types[node].memtype == memtype)
+ if (node_memory_types[node].memtype == memtype || !memtype)
node_memory_types[node].map_count--;
/*
* If we unmapped all the devices attached to this node,
* clear the node memory type.
*/
if (!node_memory_types[node].map_count) {
+ memtype = node_memory_types[node].memtype;
node_memory_types[node].memtype = NULL;
- kref_put(&memtype->kref, release_memtype);
+ put_memory_type(memtype);
}
mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);
-static int __meminit memtier_hotplug_callback(struct notifier_block *self,
- unsigned long action, void *_arg)
+struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types)
{
+ struct memory_dev_type *mtype;
+
+ list_for_each_entry(mtype, memory_types, list)
+ if (mtype->adistance == adist)
+ return mtype;
+
+ mtype = alloc_memory_type(adist);
+ if (IS_ERR(mtype))
+ return mtype;
+
+ list_add(&mtype->list, memory_types);
+
+ return mtype;
+}
+EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type);
+
+void mt_put_memory_types(struct list_head *memory_types)
+{
+ struct memory_dev_type *mtype, *mtn;
+
+ list_for_each_entry_safe(mtype, mtn, memory_types, list) {
+ list_del(&mtype->list);
+ put_memory_type(mtype);
+ }
+}
+EXPORT_SYMBOL_GPL(mt_put_memory_types);
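A rough driver-side sketch (not part of this patch) of how the two helpers above are meant to be paired: keep driver-created memory types on a private list so they can all be dropped at teardown. The example_* names and the list are invented; a real driver would also call clear_node_memory_type() for each node before tearing the types down.

static LIST_HEAD(example_memory_types);

static int example_attach_node(int nid, int adist)
{
	struct memory_dev_type *mtype;

	/* Reuse an existing type with the same adistance, or allocate one. */
	mtype = mt_find_alloc_memory_type(adist, &example_memory_types);
	if (IS_ERR(mtype))
		return PTR_ERR(mtype);

	init_node_memory_type(nid, mtype);
	return 0;
}

static void example_teardown(void)
{
	/* Drops one reference per type created above. */
	mt_put_memory_types(&example_memory_types);
}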
+
+/*
+ * This is invoked via `late_initcall()` to initialize memory tiers for
+ * memory nodes, both with and without CPUs. By the time it runs, firmware
+ * and device initialization have completed, so adistance algorithms are
+ * expected to be available.
+ */
+static int __init memory_tier_late_init(void)
+{
+ int nid;
struct memory_tier *memtier;
- struct memory_notify *arg = _arg;
+
+ get_online_mems();
+ guard(mutex)(&memory_tier_lock);
+
+ /* Assign each uninitialized N_MEMORY node to a memory tier. */
+ for_each_node_state(nid, N_MEMORY) {
+ /*
+ * Some device drivers may have already brought memory
+ * nodes online and configured their memory tiers.
+ * Skip those nodes here.
+ */
+ if (node_memory_types[nid].memtype)
+ continue;
+
+ memtier = set_node_memory_tier(nid);
+ if (IS_ERR(memtier))
+ continue;
+ }
+
+ establish_demotion_targets();
+ put_online_mems();
+
+ return 0;
+}
+late_initcall(memory_tier_late_init);
+
+static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix)
+{
+ pr_info(
+"%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n",
+ prefix, coord->read_latency, coord->write_latency,
+ coord->read_bandwidth, coord->write_bandwidth);
+}
+
+int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
+ const char *source)
+{
+ guard(mutex)(&default_dram_perf_lock);
+ if (default_dram_perf_error)
+ return -EIO;
+
+ if (perf->read_latency + perf->write_latency == 0 ||
+ perf->read_bandwidth + perf->write_bandwidth == 0)
+ return -EINVAL;
+
+ if (default_dram_perf_ref_nid == NUMA_NO_NODE) {
+ default_dram_perf = *perf;
+ default_dram_perf_ref_nid = nid;
+ default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL);
+ return 0;
+ }
+
+ /*
+ * The performance of all default DRAM nodes is expected to be the
+ * same (that is, to vary by less than 10%), and it will be used as
+ * the base for calculating the abstract distance of other memory
+ * nodes.
+ */
+ if (abs(perf->read_latency - default_dram_perf.read_latency) * 10 >
+ default_dram_perf.read_latency ||
+ abs(perf->write_latency - default_dram_perf.write_latency) * 10 >
+ default_dram_perf.write_latency ||
+ abs(perf->read_bandwidth - default_dram_perf.read_bandwidth) * 10 >
+ default_dram_perf.read_bandwidth ||
+ abs(perf->write_bandwidth - default_dram_perf.write_bandwidth) * 10 >
+ default_dram_perf.write_bandwidth) {
+ pr_info(
+"memory-tiers: the performance of DRAM node %d mismatches that of the reference\n"
+"DRAM node %d.\n", nid, default_dram_perf_ref_nid);
+ pr_info(" performance of reference DRAM node %d from %s:\n",
+ default_dram_perf_ref_nid, default_dram_perf_ref_source);
+ dump_hmem_attrs(&default_dram_perf, " ");
+ pr_info(" performance of DRAM node %d from %s:\n", nid, source);
+ dump_hmem_attrs(perf, " ");
+ pr_info(
+" disable default DRAM node performance based abstract distance algorithm.\n");
+ default_dram_perf_error = true;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
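A hedged sketch of the expected caller side of mt_set_default_dram_perf() (the numbers, node id, and the "example_hmat" source string are all invented; in practice the platform firmware parsing code reports the measured coordinates of each node that has both CPUs and DRAM):

	struct access_coordinate perf = {
		.read_latency	 = 100,		/* latency, e.g. in ns */
		.write_latency	 = 100,
		.read_bandwidth	 = 20000,	/* bandwidth, e.g. in MB/s */
		.write_bandwidth = 20000,
	};
	int rc;

	rc = mt_set_default_dram_perf(0, &perf, "example_hmat");
	if (rc)
		pr_debug("node 0 not used as default DRAM reference: %d\n", rc);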
+int mt_perf_to_adistance(struct access_coordinate *perf, int *adist)
+{
+ guard(mutex)(&default_dram_perf_lock);
+ if (default_dram_perf_error)
+ return -EIO;
+
+ if (perf->read_latency + perf->write_latency == 0 ||
+ perf->read_bandwidth + perf->write_bandwidth == 0)
+ return -EINVAL;
+
+ if (default_dram_perf_ref_nid == NUMA_NO_NODE)
+ return -ENOENT;
/*
- * Only update the node migration order when a node is
- * changing status, like online->offline.
+ * The abstract distance of a memory node is in direct proportion to
+ * its memory latency (read + write) and inversely proportional to its
+ * memory bandwidth (read + write). The abstract distance, memory
+ * latency, and memory bandwidth of the default DRAM nodes are used as
+ * the base.
*/
- if (arg->status_change_nid < 0)
- return notifier_from_errno(0);
+ *adist = MEMTIER_ADISTANCE_DRAM *
+ (perf->read_latency + perf->write_latency) /
+ (default_dram_perf.read_latency + default_dram_perf.write_latency) *
+ (default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) /
+ (perf->read_bandwidth + perf->write_bandwidth);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mt_perf_to_adistance);
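To make the proportionality above concrete, a worked hypothetical example: a memory node with twice the (read + write) latency and half the (read + write) bandwidth of the reference DRAM node gets an abstract distance of roughly 2 * 2 = 4 times MEMTIER_ADISTANCE_DRAM, i.e. it lands in a slower tier than default DRAM. The values below are invented and assume the reference coordinates from the previous sketch:

	struct access_coordinate slow = {
		.read_latency	 = 200,		/* 2x the reference latency */
		.write_latency	 = 200,
		.read_bandwidth	 = 10000,	/* half the reference bandwidth */
		.write_bandwidth = 10000,
	};
	int adist = MEMTIER_ADISTANCE_DRAM;

	if (!mt_perf_to_adistance(&slow, &adist))
		pr_debug("adist = %d (about 4 * MEMTIER_ADISTANCE_DRAM)\n", adist);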
+
+/**
+ * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm
+ * @nb: The notifier block which describe the algorithm
+ *
+ * Return: 0 on success, errno on error.
+ *
+ * Every memory tiering abstract distance algorithm provider needs to
+ * register the algorithm with register_mt_adistance_algorithm(). To
+ * calculate the abstract distance for a specified memory node, the
+ * notifier function will be called unless some higher priority
+ * algorithm has already provided the result. The prototype of the notifier
+ * function is as follows,
+ *
+ * int (*algorithm_notifier)(struct notifier_block *nb,
+ * unsigned long nid, void *data);
+ *
+ * Where "nid" specifies the memory node, "data" is the pointer to the
+ * returned abstract distance (that is, "int *adist"). If the
+ * algorithm provides the result, NOTIFY_STOP should be returned.
+ * Otherwise, return a value with %NOTIFY_STOP_MASK cleared to allow the next
+ * algorithm in the chain to provide the result.
+ */
+int register_mt_adistance_algorithm(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&mt_adistance_algorithms, nb);
+}
+EXPORT_SYMBOL_GPL(register_mt_adistance_algorithm);
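A minimal provider sketch following the prototype documented above. The example_* names and the fixed adistance value are invented; a real provider would typically derive the value from measured performance, e.g. via mt_perf_to_adistance().

static int example_adist_callback(struct notifier_block *nb,
				  unsigned long nid, void *data)
{
	int *adist = data;

	/* example_driver_manages_node() is an assumed helper. */
	if (!example_driver_manages_node(nid))
		return NOTIFY_DONE;	/* let the next algorithm try */

	*adist = MEMTIER_ADISTANCE_DRAM * 2;	/* invented value */
	return NOTIFY_STOP;		/* result provided, stop the chain */
}

static struct notifier_block example_adist_nb = {
	.notifier_call = example_adist_callback,
};

/* At driver init: register_mt_adistance_algorithm(&example_adist_nb); */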
+
+/**
+ * unregister_mt_adistance_algorithm() - Unregister memory tiering abstract distance algorithm
+ * @nb: the notifier block which describe the algorithm
+ *
+ * Return: 0 on success, errno on error.
+ */
+int unregister_mt_adistance_algorithm(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&mt_adistance_algorithms, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_mt_adistance_algorithm);
+
+/**
+ * mt_calc_adistance() - Calculate abstract distance with registered algorithms
+ * @node: the node to calculate abstract distance for
+ * @adist: the returned abstract distance
+ *
+ * Return: if return_value & %NOTIFY_STOP_MASK != 0, then some abstract
+ * distance algorithm has provided the result, which is returned via
+ * @adist. Otherwise, no algorithm could provide the result and @adist
+ * is left unchanged.
+ */
+int mt_calc_adistance(int node, int *adist)
+{
+ return blocking_notifier_call_chain(&mt_adistance_algorithms, node, adist);
+}
+EXPORT_SYMBOL_GPL(mt_calc_adistance);
+
+static int __meminit memtier_hotplug_callback(struct notifier_block *self,
+ unsigned long action, void *_arg)
+{
+ struct memory_tier *memtier;
+ struct node_notify *nn = _arg;
switch (action) {
- case MEM_OFFLINE:
+ case NODE_REMOVED_LAST_MEMORY:
mutex_lock(&memory_tier_lock);
- if (clear_node_memory_tier(arg->status_change_nid))
+ if (clear_node_memory_tier(nn->nid))
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);
break;
- case MEM_ONLINE:
+ case NODE_ADDED_FIRST_MEMORY:
mutex_lock(&memory_tier_lock);
- memtier = set_node_memory_tier(arg->status_change_nid);
+ memtier = set_node_memory_tier(nn->nid);
if (!IS_ERR(memtier))
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);
@@ -627,8 +895,7 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self,
static int __init memory_tier_init(void)
{
- int ret, node;
- struct memory_tier *memtier;
+ int ret;
ret = subsys_virtual_register(&memory_tier_subsys, NULL);
if (ret)
@@ -639,32 +906,23 @@ static int __init memory_tier_init(void)
GFP_KERNEL);
WARN_ON(!node_demotion);
#endif
+
mutex_lock(&memory_tier_lock);
/*
* For now we can have 4 faster memory tiers with smaller adistance
* than default DRAM tier.
*/
- default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
+ default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM,
+ &default_memory_types);
+ mutex_unlock(&memory_tier_lock);
if (IS_ERR(default_dram_type))
panic("%s() failed to allocate default DRAM tier\n", __func__);
- /*
- * Look at all the existing N_MEMORY nodes and add them to
- * default memory tier or to a tier if we already have memory
- * types assigned.
- */
- for_each_node_state(node, N_MEMORY) {
- memtier = set_node_memory_tier(node);
- if (IS_ERR(memtier))
- /*
- * Continue with memtiers we are able to setup
- */
- break;
- }
- establish_demotion_targets();
- mutex_unlock(&memory_tier_lock);
+ /* Record nodes with memory and CPU to set default DRAM performance. */
+ nodes_and(default_dram_nodes, node_states[N_MEMORY],
+ node_states[N_CPU]);
- hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
+ hotplug_node_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
return 0;
}
subsys_initcall(memory_tier_init);
@@ -673,29 +931,39 @@ bool numa_demotion_enabled = false;
#ifdef CONFIG_MIGRATION
#ifdef CONFIG_SYSFS
-static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t demotion_enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
{
- return sysfs_emit(buf, "%s\n",
- numa_demotion_enabled ? "true" : "false");
+ return sysfs_emit(buf, "%s\n", str_true_false(numa_demotion_enabled));
}
-static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
+static ssize_t demotion_enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
{
ssize_t ret;
+ bool before = numa_demotion_enabled;
ret = kstrtobool(buf, &numa_demotion_enabled);
if (ret)
return ret;
+ /*
+ * Reset kswapd_failures statistics. They may no longer be
+ * valid since the policy for kswapd has changed.
+ */
+ if (!before && numa_demotion_enabled) {
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat)
+ atomic_set(&pgdat->kswapd_failures, 0);
+ }
+
return count;
}
static struct kobj_attribute numa_demotion_enabled_attr =
- __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
- numa_demotion_enabled_store);
+ __ATTR_RW(demotion_enabled);
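With the attribute switched to __ATTR_RW(), the runtime knob itself is unchanged: in mainline kernels it is exposed at /sys/kernel/mm/numa/demotion_enabled, and writing "1" (or "y"/"on", per kstrtobool()) enables demotion and now also clears the per-node kswapd_failures counters, as shown in the store handler above.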
static struct attribute *numa_attrs[] = {
&numa_demotion_enabled_attr.attr,