Diffstat (limited to 'drivers/base/memory.c')
-rw-r--r--  drivers/base/memory.c  378
1 file changed, 276 insertions(+), 102 deletions(-)
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 365cd4a7f239..751f248ca4a8 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -22,6 +22,7 @@
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>
+#include <linux/export.h>
#include <linux/atomic.h>
#include <linux/uaccess.h>
@@ -48,27 +49,13 @@ int mhp_online_type_from_str(const char *str)
#define to_memory_block(dev) container_of(dev, struct memory_block, dev)
-static int sections_per_block;
-
-static inline unsigned long memory_block_id(unsigned long section_nr)
-{
- return section_nr / sections_per_block;
-}
-
-static inline unsigned long pfn_to_block_id(unsigned long pfn)
-{
- return memory_block_id(pfn_to_section_nr(pfn));
-}
-
-static inline unsigned long phys_to_block_id(unsigned long phys)
-{
- return pfn_to_block_id(PFN_DOWN(phys));
-}
+int sections_per_block;
+EXPORT_SYMBOL(sections_per_block);
static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);
-static struct bus_type memory_subsys = {
+static const struct bus_type memory_subsys = {
.name = MEMORY_CLASS_NAME,
.dev_name = MEMORY_CLASS_NAME,
.online = memory_subsys_online,
@@ -105,28 +92,75 @@ EXPORT_SYMBOL(unregister_memory_notifier);
static void memory_block_release(struct device *dev)
{
struct memory_block *mem = to_memory_block(dev);
-
+ /* Verify that the altmap is freed */
+ WARN_ON(mem->altmap);
kfree(mem);
}
+
+/* Max block size to be set by memory_block_advise_max_size */
+static unsigned long memory_block_advised_size;
+static bool memory_block_advised_size_queried;
+
+/**
+ * memory_block_advise_max_size() - advise memory hotplug on the max suggested
+ * block size, usually for alignment.
+ * @size: suggestion for maximum block size; must be a power of 2.
+ *
+ * Early boot software (pre-allocator init) may advise archs on the max block
+ * size. This value can only decrease after initialization, as the intent is
+ * to identify the largest supported alignment for all sources.
+ *
+ * Use of this value is arch-defined, as is min/max block size.
+ *
+ * Return: 0 on success
+ * -EINVAL if size is 0 or not pow2 aligned
+ * -EBUSY if value has already been probed
+ */
+int __init memory_block_advise_max_size(unsigned long size)
+{
+ if (!size || !is_power_of_2(size))
+ return -EINVAL;
+
+ if (memory_block_advised_size_queried)
+ return -EBUSY;
+
+ if (memory_block_advised_size)
+ memory_block_advised_size = min(memory_block_advised_size, size);
+ else
+ memory_block_advised_size = size;
+
+ return 0;
+}
+
+/**
+ * memory_block_advised_max_size() - query advised max hotplug block size.
+ *
+ * After the first call, the value can never change. Callers looking for the
+ * actual block size should use memory_block_size_bytes(). This interface is
+ * intended for use by arch init code when initializing the hotplug block size.
+ *
+ * Return: advised size in bytes, or 0 if never set.
+ */
+unsigned long memory_block_advised_max_size(void)
+{
+ memory_block_advised_size_queried = true;
+ return memory_block_advised_size;
+}
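
The advise/query pair above forms a one-way latch: advice is only accepted before the first query, and repeated advice can only shrink the value. A minimal usage sketch, assuming a hypothetical early driver and arch hook (the example_* names are illustrative, not part of this patch, and the declarations are assumed to land in <linux/memory.h>):

    #include <linux/memory.h>
    #include <linux/minmax.h>
    #include <linux/printk.h>
    #include <linux/sizes.h>

    /* Pre-allocator-init code advises the alignment it would like. */
    void __init example_early_driver_init(void)
    {
            /* e.g. a 256 MiB host-bridge window alignment */
            if (memory_block_advise_max_size(SZ_256M))
                    pr_warn("memory block size advice rejected\n");
    }

    /* Arch init later clamps its block size; this query latches the value. */
    unsigned long __init example_arch_memory_block_size(void)
    {
            unsigned long size = SZ_1G;     /* arch default */
            unsigned long advised = memory_block_advised_max_size();

            /* Any memory_block_advise_max_size() after this gets -EBUSY. */
            if (advised)
                    size = min(size, advised);
            return size;
    }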
+
unsigned long __weak memory_block_size_bytes(void)
{
return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);
-/*
- * Show the first physical section index (number) of this memory block.
- */
+/* Show the memory block ID, relative to the memory block size */
static ssize_t phys_index_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct memory_block *mem = to_memory_block(dev);
- unsigned long phys_index;
-
- phys_index = mem->start_section_nr / sections_per_block;
- return sysfs_emit(buf, "%08lx\n", phys_index);
+ return sysfs_emit(buf, "%08lx\n", memory_block_id(mem->start_section_nr));
}
/*
@@ -164,25 +198,40 @@ static ssize_t state_show(struct device *dev, struct device_attribute *attr,
break;
default:
WARN_ON(1);
- return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
+ return sysfs_emit(buf, "ERROR-UNKNOWN-%d\n", mem->state);
}
return sysfs_emit(buf, "%s\n", output);
}
-int memory_notify(unsigned long val, void *v)
+int memory_notify(enum memory_block_state state, void *v)
{
- return blocking_notifier_call_chain(&memory_chain, val, v);
+ return blocking_notifier_call_chain(&memory_chain, state, v);
}
+#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
+static unsigned long memblk_nr_poison(struct memory_block *mem);
+#else
+static inline unsigned long memblk_nr_poison(struct memory_block *mem)
+{
+ return 0;
+}
+#endif
+
+/*
+ * Must acquire mem_hotplug_lock in write mode.
+ */
static int memory_block_online(struct memory_block *mem)
{
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
- unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+ unsigned long nr_vmemmap_pages = 0;
struct zone *zone;
int ret;
+ if (memblk_nr_poison(mem))
+ return -EHWPOISON;
+
zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
start_pfn, nr_pages);
@@ -193,10 +242,14 @@ static int memory_block_online(struct memory_block *mem)
* stage helps to keep accounting easier to follow - e.g vmemmaps
* belong to the same zone as the memory they backed.
*/
+ if (mem->altmap)
+ nr_vmemmap_pages = mem->altmap->free;
+
+ mem_hotplug_begin();
if (nr_vmemmap_pages) {
ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
if (ret)
- return ret;
+ goto out;
}
ret = online_pages(start_pfn + nr_vmemmap_pages,
@@ -204,7 +257,7 @@ static int memory_block_online(struct memory_block *mem)
if (ret) {
if (nr_vmemmap_pages)
mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
- return ret;
+ goto out;
}
/*
@@ -215,37 +268,53 @@ static int memory_block_online(struct memory_block *mem)
adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
nr_vmemmap_pages);
+ mem->zone = zone;
+out:
+ mem_hotplug_done();
return ret;
}
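
For context, a hedged sketch of how this function is reached, simplified from the memory_subsys_online() callback earlier in this file (the real path goes through memory_block_change_state(); error handling is abbreviated). The sysfs "state" write arrives under device_hotplug_lock, while mem_hotplug_lock is now taken inside memory_block_online()/memory_block_offline() themselves:

    static int example_subsys_online(struct device *dev)
    {
            struct memory_block *mem = to_memory_block(dev);
            int ret;

            if (mem->state == MEM_ONLINE)
                    return 0;
            /* Writes via device_online() default to MMOP_ONLINE. */
            if (mem->online_type == MMOP_OFFLINE)
                    mem->online_type = MMOP_ONLINE;

            ret = memory_block_online(mem);     /* takes mem_hotplug_lock */
            mem->online_type = MMOP_OFFLINE;
            return ret;
    }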
+/*
+ * Must acquire mem_hotplug_lock in write mode.
+ */
static int memory_block_offline(struct memory_block *mem)
{
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
- unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+ unsigned long nr_vmemmap_pages = 0;
int ret;
+ if (!mem->zone)
+ return -EINVAL;
+
/*
* Unaccount before offlining, such that unpopulated zone and kthreads
* can properly be torn down in offline_pages().
*/
+ if (mem->altmap)
+ nr_vmemmap_pages = mem->altmap->free;
+
+ mem_hotplug_begin();
if (nr_vmemmap_pages)
adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
-nr_vmemmap_pages);
ret = offline_pages(start_pfn + nr_vmemmap_pages,
- nr_pages - nr_vmemmap_pages, mem->group);
+ nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
if (ret) {
/* offline_pages() failed. Account back. */
if (nr_vmemmap_pages)
adjust_present_page_count(pfn_to_page(start_pfn),
mem->group, nr_vmemmap_pages);
- return ret;
+ goto out;
}
if (nr_vmemmap_pages)
mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
+ mem->zone = NULL;
+out:
+ mem_hotplug_done();
return ret;
}
@@ -403,7 +472,7 @@ static ssize_t valid_zones_show(struct device *dev,
struct memory_group *group = mem->group;
struct zone *default_zone;
int nid = mem->nid;
- int len = 0;
+ int len;
/*
* Check the existing zone. Make sure that we do that only on the
@@ -411,26 +480,21 @@ static ssize_t valid_zones_show(struct device *dev,
*/
if (mem->state == MEM_ONLINE) {
/*
- * The block contains more than one zone can not be offlined.
- * This can happen e.g. for ZONE_DMA and ZONE_DMA32
+ * If !mem->zone, the memory block spans multiple zones and
+ * cannot get offlined.
*/
- default_zone = test_pages_in_a_zone(start_pfn,
- start_pfn + nr_pages);
- if (!default_zone)
- return sysfs_emit(buf, "%s\n", "none");
- len += sysfs_emit_at(buf, len, "%s", default_zone->name);
- goto out;
+ return sysfs_emit(buf, "%s\n",
+ mem->zone ? mem->zone->name : "none");
}
default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
start_pfn, nr_pages);
- len += sysfs_emit_at(buf, len, "%s", default_zone->name);
+ len = sysfs_emit(buf, "%s", default_zone->name);
len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
MMOP_ONLINE_KERNEL, default_zone);
len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
MMOP_ONLINE_MOVABLE, default_zone);
-out:
len += sysfs_emit_at(buf, len, "\n");
return len;
}
@@ -461,7 +525,7 @@ static ssize_t auto_online_blocks_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%s\n",
- online_type_to_str[mhp_default_online_type]);
+ online_type_to_str[mhp_get_default_online_type()]);
}
static ssize_t auto_online_blocks_store(struct device *dev,
@@ -473,12 +537,22 @@ static ssize_t auto_online_blocks_store(struct device *dev,
if (online_type < 0)
return -EINVAL;
- mhp_default_online_type = online_type;
+ mhp_set_default_online_type(online_type);
return count;
}
static DEVICE_ATTR_RW(auto_online_blocks);
+#ifdef CONFIG_CRASH_HOTPLUG
+#include <linux/kexec.h>
+static ssize_t crash_hotplug_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", crash_check_hotplug_support());
+}
+static DEVICE_ATTR_RO(crash_hotplug);
+#endif
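
The new attribute reports whether the kernel updates the crash image itself on memory hotplug, in which case userspace need not reload kdump. A small userspace sketch (illustrative; it assumes the documented ABI path /sys/devices/system/memory/crash_hotplug):

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/devices/system/memory/crash_hotplug", "r");
            int supported = 0;

            if (f) {
                    if (fscanf(f, "%d", &supported) != 1)
                            supported = 0;
                    fclose(f);
            }
            /* 1: kernel handles it; 0: userspace must reload the crash kernel */
            printf("crash hotplug support: %d\n", supported);
            return 0;
    }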
+
/*
* Some architectures will have custom drivers to do this, and
* will not need to do it from userspace. The fake hot-add code
@@ -554,7 +628,9 @@ static ssize_t hard_offline_page_store(struct device *dev,
if (kstrtoull(buf, 0, &pfn) < 0)
return -EINVAL;
pfn >>= PAGE_SHIFT;
- ret = memory_failure(pfn, 0);
+ ret = memory_failure(pfn, MF_SW_SIMULATED);
+ if (ret == -EOPNOTSUPP)
+ ret = 0;
return ret ? ret : count;
}
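
With this change, a software-simulated injection that returns -EOPNOTSUPP (e.g. when the page is filtered out by hwpoison_filter()) is reported to userspace as success rather than an error. A usage sketch for the sysfs knob (illustrative; note the input is a physical address, which the handler above shifts down to a PFN):

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            const char addr[] = "0x100000000";  /* example physical address */
            int fd = open("/sys/devices/system/memory/hard_offline_page",
                          O_WRONLY);

            if (fd < 0)
                    return 1;
            if (write(fd, addr, strlen(addr)) < 0) {
                    close(fd);      /* e.g. -EINVAL for a malformed address */
                    return 1;
            }
            close(fd);
            return 0;
    }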
@@ -573,7 +649,7 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn)
*
* Called under device_hotplug_lock.
*/
-static struct memory_block *find_memory_block_by_id(unsigned long block_id)
+struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
struct memory_block *mem;
@@ -613,11 +689,7 @@ static const struct attribute_group *memory_memblk_attr_groups[] = {
NULL,
};
-/*
- * register_memory - Setup a sysfs device for a memory block
- */
-static
-int register_memory(struct memory_block *memory)
+static int __add_memory_block(struct memory_block *memory)
{
int ret;
@@ -634,16 +706,91 @@ int register_memory(struct memory_block *memory)
}
ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
GFP_KERNEL));
- if (ret) {
- put_device(&memory->dev);
+ if (ret)
device_unregister(&memory->dev);
- }
+
return ret;
}
-static int init_memory_block(unsigned long block_id, unsigned long state,
- unsigned long nr_vmemmap_pages,
- struct memory_group *group)
+static struct zone *early_node_zone_for_memory_block(struct memory_block *mem,
+ int nid)
+{
+ const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
+ const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
+ struct zone *zone, *matching_zone = NULL;
+ pg_data_t *pgdat = NODE_DATA(nid);
+ int i;
+
+ /*
+ * This logic only works for early memory, when the applicable zones
+ * already span the memory block. We don't expect overlapping zones on
+ * a single node for early memory. So if we're told that some PFNs
+ * of a node fall into this memory block, we can assume that all node
+ * zones that intersect with the memory block are actually applicable.
+ * No need to look at the memmap.
+ */
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ zone = pgdat->node_zones + i;
+ if (!populated_zone(zone))
+ continue;
+ if (!zone_intersects(zone, start_pfn, nr_pages))
+ continue;
+ if (!matching_zone) {
+ matching_zone = zone;
+ continue;
+ }
+ /* Spans multiple zones ... */
+ matching_zone = NULL;
+ break;
+ }
+ return matching_zone;
+}
+
+#ifdef CONFIG_NUMA
+/**
+ * memory_block_add_nid_early() - Indicate that early system RAM falling into
+ * this memory block device (partially) belongs
+ * to the given node.
+ * @mem: The memory block device.
+ * @nid: The node id.
+ *
+ * Indicate that early system RAM falling into this memory block (partially)
+ * belongs to the given node. This will also properly set/adjust mem->zone based
+ * on the zone ranges of the given node.
+ *
+ * Memory hotplug handles this on memory block creation, where we can only have
+ * a single nid span a memory block.
+ */
+void memory_block_add_nid_early(struct memory_block *mem, int nid)
+{
+ if (mem->nid != nid) {
+ /*
+ * For early memory we have to determine the zone when setting
+ * the node id and handle multiple nodes spanning a single
+ * memory block by indicating via zone == NULL that we're not
+ * dealing with a single zone. So if we're setting the node id
+ * the first time, determine if there is a single zone. If we're
+ * setting the node id a second time to a different node,
+ * invalidate the single detected zone.
+ */
+ if (mem->nid == NUMA_NO_NODE)
+ mem->zone = early_node_zone_for_memory_block(mem, nid);
+ else
+ mem->zone = NULL;
+ /*
+ * If this memory block spans multiple nodes, we only indicate
+ * the last processed node. If we span multiple nodes (not applicable
+ * to hotplugged memory), zone == NULL will prohibit memory offlining
+ * and consequently unplug.
+ */
+ mem->nid = nid;
+ }
+}
+#endif
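
A hedged sketch of the intended caller, modeled on the early node registration walk in drivers/base/node.c (the example_* wrapper is illustrative; the callback signature matches walk_memory_blocks_func_t):

    /* Invoked for each memory block overlapping the node's early RAM. */
    static int example_register_under_node_early(struct memory_block *mem_blk,
                                                 void *arg)
    {
            int nid = *(int *)arg;

            memory_block_add_nid_early(mem_blk, nid);
            /* ... create the node <-> memory block sysfs links ... */
            return 0;       /* continue the walk */
    }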
+
+static int add_memory_block(unsigned long block_id, int nid, unsigned long state,
+ struct vmem_altmap *altmap,
+ struct memory_group *group)
{
struct memory_block *mem;
int ret = 0;
@@ -659,37 +806,34 @@ static int init_memory_block(unsigned long block_id, unsigned long state,
mem->start_section_nr = block_id * sections_per_block;
mem->state = state;
- mem->nid = NUMA_NO_NODE;
- mem->nr_vmemmap_pages = nr_vmemmap_pages;
+ mem->nid = nid;
+ mem->altmap = altmap;
INIT_LIST_HEAD(&mem->group_next);
+#ifndef CONFIG_NUMA
+ if (state == MEM_ONLINE)
+ /*
+ * MEM_ONLINE at this point implies early memory. With NUMA,
+ * we'll determine the zone when setting the node id via
+ * memory_block_add_nid_early(). Memory hotplug updates the zone
+ * manually when memory onlining/offlining succeeds.
+ */
+ mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE);
+#endif /* CONFIG_NUMA */
+
+ ret = __add_memory_block(mem);
+ if (ret)
+ return ret;
+
if (group) {
mem->group = group;
list_add(&mem->group_next, &group->memory_blocks);
}
- ret = register_memory(mem);
-
- return ret;
-}
-
-static int add_memory_block(unsigned long base_section_nr)
-{
- int section_count = 0;
- unsigned long nr;
-
- for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
- nr++)
- if (present_section_nr(nr))
- section_count++;
-
- if (section_count == 0)
- return 0;
- return init_memory_block(memory_block_id(base_section_nr),
- MEM_ONLINE, 0, NULL);
+ return 0;
}
-static void unregister_memory(struct memory_block *memory)
+static void remove_memory_block(struct memory_block *memory)
{
if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
return;
@@ -714,7 +858,7 @@ static void unregister_memory(struct memory_block *memory)
* Called under device_hotplug_lock.
*/
int create_memory_block_devices(unsigned long start, unsigned long size,
- unsigned long vmemmap_pages,
+ int nid, struct vmem_altmap *altmap,
struct memory_group *group)
{
const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
@@ -728,8 +872,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size,
return -EINVAL;
for (block_id = start_block_id; block_id != end_block_id; block_id++) {
- ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages,
- group);
+ ret = add_memory_block(block_id, nid, MEM_OFFLINE, altmap, group);
if (ret)
break;
}
@@ -740,7 +883,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size,
mem = find_memory_block_by_id(block_id);
if (WARN_ON_ONCE(!mem))
continue;
- unregister_memory(mem);
+ remove_memory_block(mem);
}
}
return ret;
@@ -768,17 +911,12 @@ void remove_memory_block_devices(unsigned long start, unsigned long size)
mem = find_memory_block_by_id(block_id);
if (WARN_ON_ONCE(!mem))
continue;
+ num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem));
unregister_memory_block_under_nodes(mem);
- unregister_memory(mem);
+ remove_memory_block(mem);
}
}
-/* return true if the memory block is offlined, otherwise, return false */
-bool is_memblock_offlined(struct memory_block *mem)
-{
- return mem->state == MEM_OFFLINE;
-}
-
static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
&dev_attr_probe.attr,
@@ -791,6 +929,9 @@ static struct attribute *memory_root_attrs[] = {
&dev_attr_block_size_bytes.attr,
&dev_attr_auto_online_blocks.attr,
+#ifdef CONFIG_CRASH_HOTPLUG
+ &dev_attr_crash_hotplug.attr,
+#endif
NULL
};
@@ -811,7 +952,7 @@ static const struct attribute_group *memory_root_attr_groups[] = {
void __init memory_dev_init(void)
{
int ret;
- unsigned long block_sz, nr;
+ unsigned long block_sz, block_id, nr;
/* Validate the configured memory block size */
block_sz = memory_block_size_bytes();
@@ -824,15 +965,23 @@ void __init memory_dev_init(void)
panic("%s() failed to register subsystem: %d\n", __func__, ret);
/*
- * Create entries for memory sections that were found
- * during boot and have been initialized
+ * Create entries for memory sections that were found during boot
+ * and have been initialized. Use @block_id to track the last
+ * handled block; it starts at an invalid value (ULONG_MAX) so the
+ * block ID comparison cannot match for the first present section,
+ * ensuring its memory block is always created.
*/
- for (nr = 0; nr <= __highest_present_section_nr;
- nr += sections_per_block) {
- ret = add_memory_block(nr);
- if (ret)
- panic("%s() failed to add memory block: %d\n", __func__,
- ret);
+ block_id = ULONG_MAX;
+ for_each_present_section_nr(0, nr) {
+ if (block_id != ULONG_MAX && memory_block_id(nr) == block_id)
+ continue;
+
+ block_id = memory_block_id(nr);
+ ret = add_memory_block(block_id, NUMA_NO_NODE, MEM_ONLINE, NULL, NULL);
+ if (ret) {
+ panic("%s() failed to add memory block: %d\n",
+ __func__, ret);
+ }
}
}
@@ -1074,3 +1223,28 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
}
return ret;
}
+
+#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
+void memblk_nr_poison_inc(unsigned long pfn)
+{
+ const unsigned long block_id = pfn_to_block_id(pfn);
+ struct memory_block *mem = find_memory_block_by_id(block_id);
+
+ if (mem)
+ atomic_long_inc(&mem->nr_hwpoison);
+}
+
+void memblk_nr_poison_sub(unsigned long pfn, long i)
+{
+ const unsigned long block_id = pfn_to_block_id(pfn);
+ struct memory_block *mem = find_memory_block_by_id(block_id);
+
+ if (mem)
+ atomic_long_sub(i, &mem->nr_hwpoison);
+}
+
+static unsigned long memblk_nr_poison(struct memory_block *mem)
+{
+ return atomic_long_read(&mem->nr_hwpoison);
+}
+#endif
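
These per-block counters are meant to mirror the global poison accounting, so that memory_block_online() can refuse to online a block that still contains hwpoisoned pages. A hedged sketch of the pairing (names modeled on mm/memory-failure.c; simplified):

    static atomic_long_t example_num_poisoned_pages = ATOMIC_LONG_INIT(0);

    static void example_num_poisoned_pages_inc(unsigned long pfn)
    {
            atomic_long_inc(&example_num_poisoned_pages);   /* global */
            memblk_nr_poison_inc(pfn);                      /* per block */
    }

    static void example_num_poisoned_pages_sub(unsigned long pfn, long i)
    {
            atomic_long_sub(i, &example_num_poisoned_pages);
            if (pfn != -1UL)        /* -1UL: block is going away, skip it */
                    memblk_nr_poison_sub(pfn, i);
    }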