Diffstat (limited to 'mm/memory_hotplug.c')
-rw-r--r--  mm/memory_hotplug.c  456
1 file changed, 213 insertions, 243 deletions
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 431b1f6753c0..a63ec679d861 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -35,6 +35,7 @@
#include <linux/compaction.h>
#include <linux/rmap.h>
#include <linux/module.h>
+#include <linux/node.h>
#include <asm/tlbflush.h>
@@ -219,11 +220,30 @@ void put_online_mems(void)
bool movable_node_enabled = false;
-#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
-int mhp_default_online_type = MMOP_OFFLINE;
-#else
-int mhp_default_online_type = MMOP_ONLINE;
-#endif
+static int mhp_default_online_type = -1;
+int mhp_get_default_online_type(void)
+{
+ if (mhp_default_online_type >= 0)
+ return mhp_default_online_type;
+
+ if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE))
+ mhp_default_online_type = MMOP_OFFLINE;
+ else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO))
+ mhp_default_online_type = MMOP_ONLINE;
+ else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL))
+ mhp_default_online_type = MMOP_ONLINE_KERNEL;
+ else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE))
+ mhp_default_online_type = MMOP_ONLINE_MOVABLE;
+ else
+ mhp_default_online_type = MMOP_OFFLINE;
+
+ return mhp_default_online_type;
+}
+
+void mhp_set_default_online_type(int online_type)
+{
+ mhp_default_online_type = online_type;
+}
static int __init setup_memhp_default_state(char *str)
{
@@ -355,7 +375,7 @@ struct page *pfn_to_online_page(unsigned long pfn)
* the section may be 'offline' but 'valid'. Only
* get_dev_pagemap() can determine sub-section online status.
*/
- pgmap = get_dev_pagemap(pfn, NULL);
+ pgmap = get_dev_pagemap(pfn);
put_dev_pagemap(pgmap);
/* The presence of a pgmap indicates ZONE_DEVICE offline pfn */
@@ -366,7 +386,7 @@ struct page *pfn_to_online_page(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(pfn_to_online_page);
-int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
+int __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
struct mhp_params *params)
{
const unsigned long end_pfn = pfn + nr_pages;
@@ -524,7 +544,7 @@ static void update_pgdat_span(struct pglist_data *pgdat)
pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
}
-void __ref remove_pfn_range_from_zone(struct zone *zone,
+void remove_pfn_range_from_zone(struct zone *zone,
unsigned long start_pfn,
unsigned long nr_pages)
{
@@ -628,16 +648,10 @@ int restore_online_page_callback(online_page_callback_t callback)
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);
+/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
void generic_online_page(struct page *page, unsigned int order)
{
- /*
- * Freeing the page with debug_pagealloc enabled will try to unmap it,
- * so we should map it first. This is better than introducing a special
- * case in page freeing fast path.
- */
- debug_pagealloc_map_pages(page, 1 << order);
- __free_pages_core(page, order);
- totalram_pages_add(1UL << order);
+ __free_pages_core(page, order, MEMINIT_HOTPLUG);
}
EXPORT_SYMBOL_GPL(generic_online_page);
@@ -656,6 +670,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
* this and the first chunk to online will be pageblock_nr_pages.
*/
for (pfn = start_pfn; pfn < end_pfn;) {
+ struct page *page = pfn_to_page(pfn);
int order;
/*
@@ -670,7 +685,14 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
else
order = MAX_PAGE_ORDER;
- (*online_page_callback)(pfn_to_page(pfn), order);
+ /*
+ * Exposing the page to the buddy by freeing can cause
+ * issues with debug_pagealloc enabled: some archs don't
+ * like double-unmappings. So treat them like any pages that
+ * were allocated from the buddy.
+ */
+ debug_pagealloc_map_pages(page, 1 << order);
+ (*online_page_callback)(page, order);
pfn += (1UL << order);
}
@@ -678,30 +700,6 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
online_mem_sections(start_pfn, end_pfn);
}
-/* check which state of node_states will be changed when online memory */
-static void node_states_check_changes_online(unsigned long nr_pages,
- struct zone *zone, struct memory_notify *arg)
-{
- int nid = zone_to_nid(zone);
-
- arg->status_change_nid = NUMA_NO_NODE;
- arg->status_change_nid_normal = NUMA_NO_NODE;
-
- if (!node_state(nid, N_MEMORY))
- arg->status_change_nid = nid;
- if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
- arg->status_change_nid_normal = nid;
-}
-
-static void node_states_set_node(int node, struct memory_notify *arg)
-{
- if (arg->status_change_nid_normal >= 0)
- node_set_state(node, N_NORMAL_MEMORY);
-
- if (arg->status_change_nid >= 0)
- node_set_state(node, N_MEMORY);
-}
-
static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages)
{
@@ -741,15 +739,16 @@ static inline void section_taint_zone_device(unsigned long pfn)
/*
* Associate the pfn range with the given zone, initializing the memmaps
* and resizing the pgdat/zone data to span the added pages. After this
- * call, all affected pages are PG_reserved.
+ * call, all affected pages are PageOffline().
*
* All aligned pageblocks are initialized to the specified migratetype
* (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
* zone stats (e.g., nr_isolate_pageblock) are touched.
*/
-void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
+void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages,
- struct vmem_altmap *altmap, int migratetype)
+ struct vmem_altmap *altmap, int migratetype,
+ bool isolate_pageblock)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nid = pgdat->node_id;
@@ -776,12 +775,13 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
/*
* TODO now we have a visible range of pages which are not associated
- * with their zone properly. Not nice but set_pfnblock_flags_mask
+ * with their zone properly. Not nice but set_pfnblock_migratetype()
* expects the zone spans the pfn range. All the pages in the range
* are reserved so nobody should be touching them so we should be safe
*/
memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0,
- MEMINIT_HOTPLUG, altmap, migratetype);
+ MEMINIT_HOTPLUG, altmap, migratetype,
+ isolate_pageblock);
set_zone_contiguous(zone);
}
@@ -846,7 +846,6 @@ static bool auto_movable_can_online_movable(int nid, struct memory_group *group,
unsigned long kernel_early_pages, movable_pages;
struct auto_movable_group_stats group_stats = {};
struct auto_movable_stats stats = {};
- pg_data_t *pgdat = NODE_DATA(nid);
struct zone *zone;
int i;
@@ -857,6 +856,8 @@ static bool auto_movable_can_online_movable(int nid, struct memory_group *group,
auto_movable_stats_account_zone(&stats, zone);
} else {
for (i = 0; i < MAX_NR_ZONES; i++) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+
zone = pgdat->node_zones + i;
if (populated_zone(zone))
auto_movable_stats_account_zone(&stats, zone);
@@ -954,7 +955,7 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn
* effectively unused by the kernel, yet they account to "present pages".
* Fortunately, these allocations are comparatively small in relevant setups
* (e.g., fraction of system memory).
- * b) Some hotplugged memory blocks in virtualized environments, esecially
+ * b) Some hotplugged memory blocks in virtualized environments, especially
* hotplugged by virtio-mem, look like they are completely present, however,
* only parts of the memory block are actually currently usable.
* "present pages" is an upper limit that can get reached at runtime. As
@@ -1087,7 +1088,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
}
int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
- struct zone *zone, bool mhp_off_inaccessible)
+ struct zone *zone)
{
unsigned long end_pfn = pfn + nr_pages;
int ret, i;
@@ -1096,19 +1097,15 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
if (ret)
return ret;
- /*
- * Memory block is accessible at this stage and hence poison the struct
- * pages now. If the memory block is accessible during memory hotplug
- * addition phase, then page poisining is already performed in
- * sparse_add_section().
- */
- if (mhp_off_inaccessible)
- page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE,
+ false);
- move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pfn_to_page(pfn + i);
- for (i = 0; i < nr_pages; i++)
- SetPageVmemmapSelfHosted(pfn_to_page(pfn + i));
+ __ClearPageOffline(page);
+ SetPageVmemmapSelfHosted(page);
+ }
/*
* It might be that the vmemmap_pages fully span sections. If that is
@@ -1144,14 +1141,20 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
/*
* Must be called with mem_hotplug_lock in write mode.
*/
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
+int online_pages(unsigned long pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group)
{
- unsigned long flags;
- int need_zonelists_rebuild = 0;
+ struct memory_notify mem_arg = {
+ .start_pfn = pfn,
+ .nr_pages = nr_pages,
+ };
+ struct node_notify node_arg = {
+ .nid = NUMA_NO_NODE,
+ };
const int nid = zone_to_nid(zone);
+ int need_zonelists_rebuild = 0;
+ unsigned long flags;
int ret;
- struct memory_notify arg;
/*
* {on,off}lining is constrained to full memory sections (or more
@@ -1166,13 +1169,19 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
/* associate pfn range with the zone */
- move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
-
- arg.start_pfn = pfn;
- arg.nr_pages = nr_pages;
- node_states_check_changes_online(nr_pages, zone, &arg);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE,
+ true);
+
+ if (!node_state(nid, N_MEMORY)) {
+ /* Adding memory to the node for the first time */
+ node_arg.nid = nid;
+ ret = node_notify(NODE_ADDING_FIRST_MEMORY, &node_arg);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto failed_addition;
+ }
- ret = memory_notify(MEM_GOING_ONLINE, &arg);
+ ret = memory_notify(MEM_GOING_ONLINE, &mem_arg);
ret = notifier_to_errno(ret);
if (ret)
goto failed_addition;
@@ -1198,12 +1207,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
online_pages_range(pfn, nr_pages);
adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
- node_states_set_node(nid, &arg);
+ if (node_arg.nid >= 0)
+ node_set_state(nid, N_MEMORY);
if (need_zonelists_rebuild)
build_all_zonelists(NULL);
/* Basic onlining is complete, allow allocation of onlined pages. */
- undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
+ undo_isolate_page_range(pfn, pfn + nr_pages);
/*
* Freshly onlined pages aren't shuffled (e.g., all pages are placed to
@@ -1219,22 +1229,28 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
kswapd_run(nid);
kcompactd_run(nid);
+ if (node_arg.nid >= 0)
+ /* First memory added successfully. Notify consumers. */
+ node_notify(NODE_ADDED_FIRST_MEMORY, &node_arg);
+
writeback_set_ratelimit();
- memory_notify(MEM_ONLINE, &arg);
+ memory_notify(MEM_ONLINE, &mem_arg);
return 0;
failed_addition:
pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
(unsigned long long) pfn << PAGE_SHIFT,
(((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
- memory_notify(MEM_CANCEL_ONLINE, &arg);
+ memory_notify(MEM_CANCEL_ONLINE, &mem_arg);
+ if (node_arg.nid != NUMA_NO_NODE)
+ node_notify(NODE_CANCEL_ADDING_FIRST_MEMORY, &node_arg);
remove_pfn_range_from_zone(zone, pfn, nr_pages);
return ret;
}
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-static pg_data_t __ref *hotadd_init_pgdat(int nid)
+static pg_data_t *hotadd_init_pgdat(int nid)
{
struct pglist_data *pgdat;
@@ -1286,7 +1302,7 @@ static int __try_online_node(int nid, bool set_node_online)
if (set_node_online) {
node_set_online(nid);
- ret = register_one_node(nid);
+ ret = register_node(nid);
BUG_ON(ret);
}
out:
@@ -1321,7 +1337,7 @@ static int check_hotplug_memory_range(u64 start, u64 size)
static int online_memory_block(struct memory_block *mem, void *arg)
{
- mem->online_type = mhp_default_online_type;
+ mem->online_type = mhp_get_default_online_type();
return device_online(&mem->dev);
}
@@ -1387,7 +1403,7 @@ bool mhp_supports_memmap_on_memory(void)
}
EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory);
-static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)
+static void remove_memory_blocks_and_altmaps(u64 start, u64 size)
{
unsigned long memblock_size = memory_block_size_bytes();
u64 cur_start;
@@ -1419,7 +1435,7 @@ static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)
}
static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
- u64 start, u64 size, mhp_t mhp_flags)
+ u64 start, u64 size)
{
unsigned long memblock_size = memory_block_size_bytes();
u64 cur_start;
@@ -1435,8 +1451,6 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
};
mhp_altmap.free = memory_block_memmap_on_memory_pages();
- if (mhp_flags & MHP_OFFLINE_INACCESSIBLE)
- mhp_altmap.inaccessible = true;
params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap),
GFP_KERNEL);
if (!params.altmap) {
@@ -1452,7 +1466,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
}
/* create memory block devices after memory was added */
- ret = create_memory_block_devices(cur_start, memblock_size,
+ ret = create_memory_block_devices(cur_start, memblock_size, nid,
params.altmap, group);
if (ret) {
arch_remove_memory(cur_start, memblock_size, NULL);
@@ -1474,7 +1488,7 @@ out:
*
* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
*/
-int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
+int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
enum memblock_flags memblock_flags = MEMBLOCK_NONE;
@@ -1514,15 +1528,23 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
ret = __try_online_node(nid, false);
if (ret < 0)
- goto error;
- new_node = ret;
+ goto error_memblock_remove;
+ if (ret) {
+ node_set_online(nid);
+ ret = register_node(nid);
+ if (WARN_ON(ret)) {
+ node_set_offline(nid);
+ goto error_memblock_remove;
+ }
+ new_node = true;
+ }
/*
* Self hosted memmap array
*/
if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) &&
mhp_supports_memmap_on_memory()) {
- ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags);
+ ret = create_altmaps_and_memory_blocks(nid, group, start, size);
if (ret)
goto error;
} else {
@@ -1531,27 +1553,15 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
goto error;
/* create memory block devices after memory was added */
- ret = create_memory_block_devices(start, size, NULL, group);
+ ret = create_memory_block_devices(start, size, nid, NULL, group);
if (ret) {
arch_remove_memory(start, size, params.altmap);
goto error;
}
}
- if (new_node) {
- /* If sysfs file of new node can't be created, cpu on the node
- * can't be hot-added. There is no rollback way now.
- * So, check by BUG_ON() to catch it reluctantly..
- * We online node here. We can't roll back from here.
- */
- node_set_online(nid);
- ret = __register_one_node(nid);
- BUG_ON(ret);
- }
-
- register_memory_blocks_under_node(nid, PFN_DOWN(start),
- PFN_UP(start + size - 1),
- MEMINIT_HOTPLUG);
+ register_memory_blocks_under_node_hotplug(nid, PFN_DOWN(start),
+ PFN_UP(start + size - 1));
/* create new memmap entry */
if (!strcmp(res->name, "System RAM"))
@@ -1568,11 +1578,16 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
merge_system_ram_resource(res);
/* online pages if requested */
- if (mhp_default_online_type != MMOP_OFFLINE)
+ if (mhp_get_default_online_type() != MMOP_OFFLINE)
walk_memory_blocks(start, size, NULL, online_memory_block);
return ret;
error:
+ if (new_node) {
+ node_set_offline(nid);
+ unregister_node(nid);
+ }
+error_memblock_remove:
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
memblock_remove(start, size);
error_mem_hotplug_end:
@@ -1581,7 +1596,7 @@ error_mem_hotplug_end:
}
/* requires device_hotplug_lock, see add_memory_resource() */
-int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
+int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
struct resource *res;
int ret;
@@ -1682,7 +1697,7 @@ struct range __weak arch_get_mappable_range(void)
struct range mhp_get_pluggable_range(bool need_mapping)
{
- const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
+ const u64 max_phys = DIRECT_MAP_PHYSMEM_END;
struct range mhp_range;
if (need_mapping) {
@@ -1715,8 +1730,8 @@ bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
- * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
- * non-lru movable pages and hugepages). Will skip over most unmovable
+ * Scan pfn range [start,end) to find movable/migratable pages (LRU and
+ * hugetlb folio, movable_ops pages). Will skip over most unmovable
* pages (esp., pages that can be skipped when offlining), but bail out on
* definitely unmovable pages.
*
@@ -1730,20 +1745,16 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
{
unsigned long pfn;
- for (pfn = start; pfn < end; pfn++) {
- struct page *page, *head;
- unsigned long skip;
+ for_each_valid_pfn(pfn, start, end) {
+ struct page *page;
+ struct folio *folio;
- if (!pfn_valid(pfn))
- continue;
page = pfn_to_page(pfn);
- if (PageLRU(page))
- goto found;
- if (__PageMovable(page))
+ if (PageLRU(page) || page_has_movable_ops(page))
goto found;
/*
- * PageOffline() pages that are not marked __PageMovable() and
+ * PageOffline() pages that do not have movable_ops and
* have a reference count > 0 (after MEM_GOING_OFFLINE) are
* definitely unmovable. If their reference count would be 0,
* they could at least be skipped when offlining memory.
@@ -1753,7 +1764,7 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
if (!PageHuge(page))
continue;
- head = compound_head(page);
+ folio = page_folio(page);
/*
* This test is racy as we hold no reference or lock. The
* hugetlb page could have been free'ed and head is no longer
@@ -1761,10 +1772,9 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
* cases false positives and negatives are possible. Calling
* code must deal with these scenarios.
*/
- if (HPageMigratable(head))
+ if (folio_test_hugetlb_migratable(folio))
goto found;
- skip = compound_nr(head) - (pfn - page_to_pfn(head));
- pfn += skip - 1;
+ pfn |= folio_nr_pages(folio) - 1;
}
return -ENOENT;
found:
@@ -1774,73 +1784,60 @@ found:
static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
+ struct folio *folio;
unsigned long pfn;
- struct page *page, *head;
LIST_HEAD(source);
static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- struct folio *folio;
- bool isolated;
+ for_each_valid_pfn(pfn, start_pfn, end_pfn) {
+ struct page *page;
- if (!pfn_valid(pfn))
- continue;
page = pfn_to_page(pfn);
folio = page_folio(page);
- head = &folio->page;
- if (PageHuge(page)) {
- pfn = page_to_pfn(head) + compound_nr(head) - 1;
- isolate_hugetlb(folio, &source);
+ if (!folio_try_get(folio))
continue;
- } else if (PageTransHuge(page))
- pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;
- /*
- * HWPoison pages have elevated reference counts so the migration would
- * fail on them. It also doesn't make any sense to migrate them in the
- * first place. Still try to unmap such a page in case it is still mapped
- * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep
- * the unmap as the catch all safety net).
- */
- if (PageHWPoison(page)) {
- if (WARN_ON(folio_test_lru(folio)))
- folio_isolate_lru(folio);
- if (folio_mapped(folio))
- try_to_unmap(folio, TTU_IGNORE_MLOCK);
- continue;
- }
+ if (unlikely(page_folio(page) != folio))
+ goto put_folio;
- if (!get_page_unless_zero(page))
- continue;
- /*
- * We can skip free pages. And we can deal with pages on
- * LRU and non-lru movable pages.
- */
- if (PageLRU(page))
- isolated = isolate_lru_page(page);
- else
- isolated = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
- if (isolated) {
- list_add_tail(&page->lru, &source);
- if (!__PageMovable(page))
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_lru(page));
+ if (folio_test_large(folio))
+ pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
- } else {
+ if (folio_contain_hwpoisoned_page(folio)) {
+ /*
+ * unmap_poisoned_folio() cannot handle large folios
+ * in all cases yet.
+ */
+ if (folio_test_large(folio) && !folio_test_hugetlb(folio))
+ goto put_folio;
+ if (folio_test_lru(folio) && !folio_isolate_lru(folio))
+ goto put_folio;
+ if (folio_mapped(folio)) {
+ folio_lock(folio);
+ unmap_poisoned_folio(folio, pfn, false);
+ folio_unlock(folio);
+ }
+
+ goto put_folio;
+ }
+
+ if (!isolate_folio_to_list(folio, &source)) {
if (__ratelimit(&migrate_rs)) {
- pr_warn("failed to isolate pfn %lx\n", pfn);
+ pr_warn("failed to isolate pfn %lx\n",
+ page_to_pfn(page));
dump_page(page, "isolation failed");
}
}
- put_page(page);
+put_folio:
+ folio_put(folio);
}
if (!list_empty(&source)) {
nodemask_t nmask = node_states[N_MEMORY];
struct migration_target_control mtc = {
.nmask = &nmask,
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ .gfp_mask = GFP_KERNEL | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
.reason = MR_MEMORY_HOTPLUG,
};
int ret;
@@ -1849,7 +1846,7 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
* We have checked that migration range is on a single zone so
* we can use the nid of the first page to all the others.
*/
- mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
+ mtc.nid = folio_nid(list_first_entry(&source, struct folio, lru));
/*
* try to allocate from a different node but reuse this node
@@ -1862,11 +1859,12 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
ret = migrate_pages(&source, alloc_migration_target, NULL,
(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL);
if (ret) {
- list_for_each_entry(page, &source, lru) {
+ list_for_each_entry(folio, &source, lru) {
if (__ratelimit(&migrate_rs)) {
pr_warn("migrating pfn %lx failed ret:%d\n",
- page_to_pfn(page), ret);
- dump_page(page, "migration failure");
+ folio_pfn(folio), ret);
+ dump_page(&folio->page,
+ "migration failure");
}
}
putback_movable_pages(&source);
@@ -1881,54 +1879,6 @@ static int __init cmdline_parse_movable_node(char *p)
}
early_param("movable_node", cmdline_parse_movable_node);
-/* check which state of node_states will be changed when offline memory */
-static void node_states_check_changes_offline(unsigned long nr_pages,
- struct zone *zone, struct memory_notify *arg)
-{
- struct pglist_data *pgdat = zone->zone_pgdat;
- unsigned long present_pages = 0;
- enum zone_type zt;
-
- arg->status_change_nid = NUMA_NO_NODE;
- arg->status_change_nid_normal = NUMA_NO_NODE;
-
- /*
- * Check whether node_states[N_NORMAL_MEMORY] will be changed.
- * If the memory to be offline is within the range
- * [0..ZONE_NORMAL], and it is the last present memory there,
- * the zones in that range will become empty after the offlining,
- * thus we can determine that we need to clear the node from
- * node_states[N_NORMAL_MEMORY].
- */
- for (zt = 0; zt <= ZONE_NORMAL; zt++)
- present_pages += pgdat->node_zones[zt].present_pages;
- if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
- arg->status_change_nid_normal = zone_to_nid(zone);
-
- /*
- * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM
- * does not apply as we don't support 32bit.
- * Here we count the possible pages from ZONE_MOVABLE.
- * If after having accounted all the pages, we see that the nr_pages
- * to be offlined is over or equal to the accounted pages,
- * we know that the node will become empty, and so, we can clear
- * it for N_MEMORY as well.
- */
- present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
-
- if (nr_pages >= present_pages)
- arg->status_change_nid = zone_to_nid(zone);
-}
-
-static void node_states_clear_node(int node, struct memory_notify *arg)
-{
- if (arg->status_change_nid_normal >= 0)
- node_clear_state(node, N_NORMAL_MEMORY);
-
- if (arg->status_change_nid >= 0)
- node_clear_state(node, N_MEMORY);
-}
-
static int count_system_ram_pages_cb(unsigned long start_pfn,
unsigned long nr_pages, void *data)
{
@@ -1941,14 +1891,21 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
/*
* Must be called with mem_hotplug_lock in write mode.
*/
-int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
+int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group)
{
+ unsigned long pfn, managed_pages, system_ram_pages = 0;
const unsigned long end_pfn = start_pfn + nr_pages;
- unsigned long pfn, system_ram_pages = 0;
+ struct pglist_data *pgdat = zone->zone_pgdat;
const int node = zone_to_nid(zone);
+ struct memory_notify mem_arg = {
+ .start_pfn = start_pfn,
+ .nr_pages = nr_pages,
+ };
+ struct node_notify node_arg = {
+ .nid = NUMA_NO_NODE,
+ };
unsigned long flags;
- struct memory_notify arg;
char *reason;
int ret;
@@ -1967,9 +1924,9 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
* Don't allow to offline memory blocks that contain holes.
* Consequently, memory blocks with holes can never get onlined
* via the hotplug path - online_pages() - as hotplugged memory has
- * no holes. This way, we e.g., don't have to worry about marking
- * memory holes PG_reserved, don't need pfn_valid() checks, and can
- * avoid using walk_system_ram_range() later.
+ * no holes. This way, we don't have to worry about memory holes,
+ * don't need pfn_valid() checks, and can avoid using
+ * walk_system_ram_range() later.
*/
walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
count_system_ram_pages_cb);
@@ -2000,19 +1957,28 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
- MIGRATE_MOVABLE,
- MEMORY_OFFLINE | REPORT_FAILURE,
- GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
+ PB_ISOLATE_MODE_MEM_OFFLINE);
if (ret) {
reason = "failure to isolate range";
goto failed_removal_pcplists_disabled;
}
- arg.start_pfn = start_pfn;
- arg.nr_pages = nr_pages;
- node_states_check_changes_offline(nr_pages, zone, &arg);
+ /*
+ * Check whether the node will have no present pages after we offline
+ * 'nr_pages' more. If so, we know that the node will become empty, and
+ * so we will clear N_MEMORY for it.
+ */
+ if (nr_pages >= pgdat->node_present_pages) {
+ node_arg.nid = node;
+ ret = node_notify(NODE_REMOVING_LAST_MEMORY, &node_arg);
+ ret = notifier_to_errno(ret);
+ if (ret) {
+ reason = "node notifier failure";
+ goto failed_removal_isolated;
+ }
+ }
- ret = memory_notify(MEM_GOING_OFFLINE, &arg);
+ ret = memory_notify(MEM_GOING_OFFLINE, &mem_arg);
ret = notifier_to_errno(ret);
if (ret) {
reason = "notifier failure";
@@ -2061,12 +2027,13 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
goto failed_removal_isolated;
}
- ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
+ ret = test_pages_isolated(start_pfn, end_pfn,
+ PB_ISOLATE_MODE_MEM_OFFLINE);
} while (ret);
/* Mark all sections offline and remove free pages from the buddy. */
- __offline_isolated_pages(start_pfn, end_pfn);
+ managed_pages = __offline_isolated_pages(start_pfn, end_pfn);
pr_debug("Offlined Pages %ld\n", nr_pages);
/*
@@ -2082,7 +2049,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
zone_pcp_enable(zone);
/* removal success */
- adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
+ adjust_managed_page_count(pfn_to_page(start_pfn), -managed_pages);
adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages);
/* reinitialise watermarks and update pcp limits */
@@ -2092,27 +2059,32 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
* Make sure to mark the node as memory-less before rebuilding the zone
* list. Otherwise this node would still appear in the fallback lists.
*/
- node_states_clear_node(node, &arg);
+ if (node_arg.nid >= 0)
+ node_clear_state(node, N_MEMORY);
if (!populated_zone(zone)) {
zone_pcp_reset(zone);
build_all_zonelists(NULL);
}
- if (arg.status_change_nid >= 0) {
+ if (node_arg.nid >= 0) {
kcompactd_stop(node);
kswapd_stop(node);
+ /* Node went memoryless. Notify consumers */
+ node_notify(NODE_REMOVED_LAST_MEMORY, &node_arg);
}
writeback_set_ratelimit();
- memory_notify(MEM_OFFLINE, &arg);
+ memory_notify(MEM_OFFLINE, &mem_arg);
remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
return 0;
failed_removal_isolated:
/* pushback to free area */
- undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
- memory_notify(MEM_CANCEL_OFFLINE, &arg);
+ undo_isolate_page_range(start_pfn, end_pfn);
+ memory_notify(MEM_CANCEL_OFFLINE, &mem_arg);
+ if (node_arg.nid != NUMA_NO_NODE)
+ node_notify(NODE_CANCEL_REMOVING_LAST_MEMORY, &node_arg);
failed_removal_pcplists_disabled:
lru_cache_enable();
zone_pcp_enable(zone);
@@ -2218,7 +2190,7 @@ void try_offline_node(int nid)
* node now.
*/
node_set_offline(nid);
- unregister_one_node(nid);
+ unregister_node(nid);
}
EXPORT_SYMBOL(try_offline_node);
@@ -2242,7 +2214,7 @@ static int memory_blocks_have_altmaps(u64 start, u64 size)
return 1;
}
-static int __ref try_remove_memory(u64 start, u64 size)
+static int try_remove_memory(u64 start, u64 size)
{
int rc, nid = NUMA_NO_NODE;
@@ -2283,10 +2255,8 @@ static int __ref try_remove_memory(u64 start, u64 size)
remove_memory_blocks_and_altmaps(start, size);
}
- if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
- memblock_phys_free(start, size);
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
memblock_remove(start, size);
- }
release_mem_region_adjustable(start, size);
@@ -2346,7 +2316,7 @@ static int try_offline_memory_block(struct memory_block *mem, void *arg)
* by offlining code ... so we don't care about that.
*/
page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
- if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
+ if (page && page_zonenum(page) == ZONE_MOVABLE)
online_type = MMOP_ONLINE_MOVABLE;
rc = device_offline(&mem->dev);