diff options
Diffstat (limited to 'mm/memory_hotplug.c')
| -rw-r--r-- | mm/memory_hotplug.c | 456 |
1 files changed, 213 insertions, 243 deletions
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 431b1f6753c0..a63ec679d861 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -35,6 +35,7 @@ #include <linux/compaction.h> #include <linux/rmap.h> #include <linux/module.h> +#include <linux/node.h> #include <asm/tlbflush.h> @@ -219,11 +220,30 @@ void put_online_mems(void) bool movable_node_enabled = false; -#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE -int mhp_default_online_type = MMOP_OFFLINE; -#else -int mhp_default_online_type = MMOP_ONLINE; -#endif +static int mhp_default_online_type = -1; +int mhp_get_default_online_type(void) +{ + if (mhp_default_online_type >= 0) + return mhp_default_online_type; + + if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE)) + mhp_default_online_type = MMOP_OFFLINE; + else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO)) + mhp_default_online_type = MMOP_ONLINE; + else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL)) + mhp_default_online_type = MMOP_ONLINE_KERNEL; + else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE)) + mhp_default_online_type = MMOP_ONLINE_MOVABLE; + else + mhp_default_online_type = MMOP_OFFLINE; + + return mhp_default_online_type; +} + +void mhp_set_default_online_type(int online_type) +{ + mhp_default_online_type = online_type; +} static int __init setup_memhp_default_state(char *str) { @@ -355,7 +375,7 @@ struct page *pfn_to_online_page(unsigned long pfn) * the section may be 'offline' but 'valid'. Only * get_dev_pagemap() can determine sub-section online status. */ - pgmap = get_dev_pagemap(pfn, NULL); + pgmap = get_dev_pagemap(pfn); put_dev_pagemap(pgmap); /* The presence of a pgmap indicates ZONE_DEVICE offline pfn */ @@ -366,7 +386,7 @@ struct page *pfn_to_online_page(unsigned long pfn) } EXPORT_SYMBOL_GPL(pfn_to_online_page); -int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, +int __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, struct mhp_params *params) { const unsigned long end_pfn = pfn + nr_pages; @@ -524,7 +544,7 @@ static void update_pgdat_span(struct pglist_data *pgdat) pgdat->node_spanned_pages = node_end_pfn - node_start_pfn; } -void __ref remove_pfn_range_from_zone(struct zone *zone, +void remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { @@ -628,16 +648,10 @@ int restore_online_page_callback(online_page_callback_t callback) } EXPORT_SYMBOL_GPL(restore_online_page_callback); +/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ void generic_online_page(struct page *page, unsigned int order) { - /* - * Freeing the page with debug_pagealloc enabled will try to unmap it, - * so we should map it first. This is better than introducing a special - * case in page freeing fast path. - */ - debug_pagealloc_map_pages(page, 1 << order); - __free_pages_core(page, order); - totalram_pages_add(1UL << order); + __free_pages_core(page, order, MEMINIT_HOTPLUG); } EXPORT_SYMBOL_GPL(generic_online_page); @@ -656,6 +670,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) * this and the first chunk to online will be pageblock_nr_pages. */ for (pfn = start_pfn; pfn < end_pfn;) { + struct page *page = pfn_to_page(pfn); int order; /* @@ -670,7 +685,14 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) else order = MAX_PAGE_ORDER; - (*online_page_callback)(pfn_to_page(pfn), order); + /* + * Exposing the page to the buddy by freeing can cause + * issues with debug_pagealloc enabled: some archs don't + * like double-unmappings. So treat them like any pages that + * were allocated from the buddy. + */ + debug_pagealloc_map_pages(page, 1 << order); + (*online_page_callback)(page, order); pfn += (1UL << order); } @@ -678,30 +700,6 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) online_mem_sections(start_pfn, end_pfn); } -/* check which state of node_states will be changed when online memory */ -static void node_states_check_changes_online(unsigned long nr_pages, - struct zone *zone, struct memory_notify *arg) -{ - int nid = zone_to_nid(zone); - - arg->status_change_nid = NUMA_NO_NODE; - arg->status_change_nid_normal = NUMA_NO_NODE; - - if (!node_state(nid, N_MEMORY)) - arg->status_change_nid = nid; - if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) - arg->status_change_nid_normal = nid; -} - -static void node_states_set_node(int node, struct memory_notify *arg) -{ - if (arg->status_change_nid_normal >= 0) - node_set_state(node, N_NORMAL_MEMORY); - - if (arg->status_change_nid >= 0) - node_set_state(node, N_MEMORY); -} - static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { @@ -741,15 +739,16 @@ static inline void section_taint_zone_device(unsigned long pfn) /* * Associate the pfn range with the given zone, initializing the memmaps * and resizing the pgdat/zone data to span the added pages. After this - * call, all affected pages are PG_reserved. + * call, all affected pages are PageOffline(). * * All aligned pageblocks are initialized to the specified migratetype * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related * zone stats (e.g., nr_isolate_pageblock) are touched. */ -void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, +void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, int migratetype) + struct vmem_altmap *altmap, int migratetype, + bool isolate_pageblock) { struct pglist_data *pgdat = zone->zone_pgdat; int nid = pgdat->node_id; @@ -776,12 +775,13 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, /* * TODO now we have a visible range of pages which are not associated - * with their zone properly. Not nice but set_pfnblock_flags_mask + * with their zone properly. Not nice but set_pfnblock_migratetype() * expects the zone spans the pfn range. All the pages in the range * are reserved so nobody should be touching them so we should be safe */ memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0, - MEMINIT_HOTPLUG, altmap, migratetype); + MEMINIT_HOTPLUG, altmap, migratetype, + isolate_pageblock); set_zone_contiguous(zone); } @@ -846,7 +846,6 @@ static bool auto_movable_can_online_movable(int nid, struct memory_group *group, unsigned long kernel_early_pages, movable_pages; struct auto_movable_group_stats group_stats = {}; struct auto_movable_stats stats = {}; - pg_data_t *pgdat = NODE_DATA(nid); struct zone *zone; int i; @@ -857,6 +856,8 @@ static bool auto_movable_can_online_movable(int nid, struct memory_group *group, auto_movable_stats_account_zone(&stats, zone); } else { for (i = 0; i < MAX_NR_ZONES; i++) { + pg_data_t *pgdat = NODE_DATA(nid); + zone = pgdat->node_zones + i; if (populated_zone(zone)) auto_movable_stats_account_zone(&stats, zone); @@ -954,7 +955,7 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn * effectively unused by the kernel, yet they account to "present pages". * Fortunately, these allocations are comparatively small in relevant setups * (e.g., fraction of system memory). - * b) Some hotplugged memory blocks in virtualized environments, esecially + * b) Some hotplugged memory blocks in virtualized environments, especially * hotplugged by virtio-mem, look like they are completely present, however, * only parts of the memory block are actually currently usable. * "present pages" is an upper limit that can get reached at runtime. As @@ -1087,7 +1088,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group, } int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone, bool mhp_off_inaccessible) + struct zone *zone) { unsigned long end_pfn = pfn + nr_pages; int ret, i; @@ -1096,19 +1097,15 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, if (ret) return ret; - /* - * Memory block is accessible at this stage and hence poison the struct - * pages now. If the memory block is accessible during memory hotplug - * addition phase, then page poisining is already performed in - * sparse_add_section(). - */ - if (mhp_off_inaccessible) - page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE, + false); - move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); + for (i = 0; i < nr_pages; i++) { + struct page *page = pfn_to_page(pfn + i); - for (i = 0; i < nr_pages; i++) - SetPageVmemmapSelfHosted(pfn_to_page(pfn + i)); + __ClearPageOffline(page); + SetPageVmemmapSelfHosted(page); + } /* * It might be that the vmemmap_pages fully span sections. If that is @@ -1144,14 +1141,20 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) /* * Must be called with mem_hotplug_lock in write mode. */ -int __ref online_pages(unsigned long pfn, unsigned long nr_pages, +int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group) { - unsigned long flags; - int need_zonelists_rebuild = 0; + struct memory_notify mem_arg = { + .start_pfn = pfn, + .nr_pages = nr_pages, + }; + struct node_notify node_arg = { + .nid = NUMA_NO_NODE, + }; const int nid = zone_to_nid(zone); + int need_zonelists_rebuild = 0; + unsigned long flags; int ret; - struct memory_notify arg; /* * {on,off}lining is constrained to full memory sections (or more @@ -1166,13 +1169,19 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, /* associate pfn range with the zone */ - move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); - - arg.start_pfn = pfn; - arg.nr_pages = nr_pages; - node_states_check_changes_online(nr_pages, zone, &arg); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE, + true); + + if (!node_state(nid, N_MEMORY)) { + /* Adding memory to the node for the first time */ + node_arg.nid = nid; + ret = node_notify(NODE_ADDING_FIRST_MEMORY, &node_arg); + ret = notifier_to_errno(ret); + if (ret) + goto failed_addition; + } - ret = memory_notify(MEM_GOING_ONLINE, &arg); + ret = memory_notify(MEM_GOING_ONLINE, &mem_arg); ret = notifier_to_errno(ret); if (ret) goto failed_addition; @@ -1198,12 +1207,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, online_pages_range(pfn, nr_pages); adjust_present_page_count(pfn_to_page(pfn), group, nr_pages); - node_states_set_node(nid, &arg); + if (node_arg.nid >= 0) + node_set_state(nid, N_MEMORY); if (need_zonelists_rebuild) build_all_zonelists(NULL); /* Basic onlining is complete, allow allocation of onlined pages. */ - undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE); + undo_isolate_page_range(pfn, pfn + nr_pages); /* * Freshly onlined pages aren't shuffled (e.g., all pages are placed to @@ -1219,22 +1229,28 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, kswapd_run(nid); kcompactd_run(nid); + if (node_arg.nid >= 0) + /* First memory added successfully. Notify consumers. */ + node_notify(NODE_ADDED_FIRST_MEMORY, &node_arg); + writeback_set_ratelimit(); - memory_notify(MEM_ONLINE, &arg); + memory_notify(MEM_ONLINE, &mem_arg); return 0; failed_addition: pr_debug("online_pages [mem %#010llx-%#010llx] failed\n", (unsigned long long) pfn << PAGE_SHIFT, (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); - memory_notify(MEM_CANCEL_ONLINE, &arg); + memory_notify(MEM_CANCEL_ONLINE, &mem_arg); + if (node_arg.nid != NUMA_NO_NODE) + node_notify(NODE_CANCEL_ADDING_FIRST_MEMORY, &node_arg); remove_pfn_range_from_zone(zone, pfn, nr_pages); return ret; } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -static pg_data_t __ref *hotadd_init_pgdat(int nid) +static pg_data_t *hotadd_init_pgdat(int nid) { struct pglist_data *pgdat; @@ -1286,7 +1302,7 @@ static int __try_online_node(int nid, bool set_node_online) if (set_node_online) { node_set_online(nid); - ret = register_one_node(nid); + ret = register_node(nid); BUG_ON(ret); } out: @@ -1321,7 +1337,7 @@ static int check_hotplug_memory_range(u64 start, u64 size) static int online_memory_block(struct memory_block *mem, void *arg) { - mem->online_type = mhp_default_online_type; + mem->online_type = mhp_get_default_online_type(); return device_online(&mem->dev); } @@ -1387,7 +1403,7 @@ bool mhp_supports_memmap_on_memory(void) } EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory); -static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size) +static void remove_memory_blocks_and_altmaps(u64 start, u64 size) { unsigned long memblock_size = memory_block_size_bytes(); u64 cur_start; @@ -1419,7 +1435,7 @@ static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size) } static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, - u64 start, u64 size, mhp_t mhp_flags) + u64 start, u64 size) { unsigned long memblock_size = memory_block_size_bytes(); u64 cur_start; @@ -1435,8 +1451,6 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, }; mhp_altmap.free = memory_block_memmap_on_memory_pages(); - if (mhp_flags & MHP_OFFLINE_INACCESSIBLE) - mhp_altmap.inaccessible = true; params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap), GFP_KERNEL); if (!params.altmap) { @@ -1452,7 +1466,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, } /* create memory block devices after memory was added */ - ret = create_memory_block_devices(cur_start, memblock_size, + ret = create_memory_block_devices(cur_start, memblock_size, nid, params.altmap, group); if (ret) { arch_remove_memory(cur_start, memblock_size, NULL); @@ -1474,7 +1488,7 @@ out: * * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) +int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; enum memblock_flags memblock_flags = MEMBLOCK_NONE; @@ -1514,15 +1528,23 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) ret = __try_online_node(nid, false); if (ret < 0) - goto error; - new_node = ret; + goto error_memblock_remove; + if (ret) { + node_set_online(nid); + ret = register_node(nid); + if (WARN_ON(ret)) { + node_set_offline(nid); + goto error_memblock_remove; + } + new_node = true; + } /* * Self hosted memmap array */ if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) && mhp_supports_memmap_on_memory()) { - ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags); + ret = create_altmaps_and_memory_blocks(nid, group, start, size); if (ret) goto error; } else { @@ -1531,27 +1553,15 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error; /* create memory block devices after memory was added */ - ret = create_memory_block_devices(start, size, NULL, group); + ret = create_memory_block_devices(start, size, nid, NULL, group); if (ret) { arch_remove_memory(start, size, params.altmap); goto error; } } - if (new_node) { - /* If sysfs file of new node can't be created, cpu on the node - * can't be hot-added. There is no rollback way now. - * So, check by BUG_ON() to catch it reluctantly.. - * We online node here. We can't roll back from here. - */ - node_set_online(nid); - ret = __register_one_node(nid); - BUG_ON(ret); - } - - register_memory_blocks_under_node(nid, PFN_DOWN(start), - PFN_UP(start + size - 1), - MEMINIT_HOTPLUG); + register_memory_blocks_under_node_hotplug(nid, PFN_DOWN(start), + PFN_UP(start + size - 1)); /* create new memmap entry */ if (!strcmp(res->name, "System RAM")) @@ -1568,11 +1578,16 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) merge_system_ram_resource(res); /* online pages if requested */ - if (mhp_default_online_type != MMOP_OFFLINE) + if (mhp_get_default_online_type() != MMOP_OFFLINE) walk_memory_blocks(start, size, NULL, online_memory_block); return ret; error: + if (new_node) { + node_set_offline(nid); + unregister_node(nid); + } +error_memblock_remove: if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) memblock_remove(start, size); error_mem_hotplug_end: @@ -1581,7 +1596,7 @@ error_mem_hotplug_end: } /* requires device_hotplug_lock, see add_memory_resource() */ -int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) +int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) { struct resource *res; int ret; @@ -1682,7 +1697,7 @@ struct range __weak arch_get_mappable_range(void) struct range mhp_get_pluggable_range(bool need_mapping) { - const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1; + const u64 max_phys = DIRECT_MAP_PHYSMEM_END; struct range mhp_range; if (need_mapping) { @@ -1715,8 +1730,8 @@ bool mhp_range_allowed(u64 start, u64 size, bool need_mapping) #ifdef CONFIG_MEMORY_HOTREMOVE /* - * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, - * non-lru movable pages and hugepages). Will skip over most unmovable + * Scan pfn range [start,end) to find movable/migratable pages (LRU and + * hugetlb folio, movable_ops pages). Will skip over most unmovable * pages (esp., pages that can be skipped when offlining), but bail out on * definitely unmovable pages. * @@ -1730,20 +1745,16 @@ static int scan_movable_pages(unsigned long start, unsigned long end, { unsigned long pfn; - for (pfn = start; pfn < end; pfn++) { - struct page *page, *head; - unsigned long skip; + for_each_valid_pfn(pfn, start, end) { + struct page *page; + struct folio *folio; - if (!pfn_valid(pfn)) - continue; page = pfn_to_page(pfn); - if (PageLRU(page)) - goto found; - if (__PageMovable(page)) + if (PageLRU(page) || page_has_movable_ops(page)) goto found; /* - * PageOffline() pages that are not marked __PageMovable() and + * PageOffline() pages that do not have movable_ops and * have a reference count > 0 (after MEM_GOING_OFFLINE) are * definitely unmovable. If their reference count would be 0, * they could at least be skipped when offlining memory. @@ -1753,7 +1764,7 @@ static int scan_movable_pages(unsigned long start, unsigned long end, if (!PageHuge(page)) continue; - head = compound_head(page); + folio = page_folio(page); /* * This test is racy as we hold no reference or lock. The * hugetlb page could have been free'ed and head is no longer @@ -1761,10 +1772,9 @@ static int scan_movable_pages(unsigned long start, unsigned long end, * cases false positives and negatives are possible. Calling * code must deal with these scenarios. */ - if (HPageMigratable(head)) + if (folio_test_hugetlb_migratable(folio)) goto found; - skip = compound_nr(head) - (pfn - page_to_pfn(head)); - pfn += skip - 1; + pfn |= folio_nr_pages(folio) - 1; } return -ENOENT; found: @@ -1774,73 +1784,60 @@ found: static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) { + struct folio *folio; unsigned long pfn; - struct page *page, *head; LIST_HEAD(source); static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - for (pfn = start_pfn; pfn < end_pfn; pfn++) { - struct folio *folio; - bool isolated; + for_each_valid_pfn(pfn, start_pfn, end_pfn) { + struct page *page; - if (!pfn_valid(pfn)) - continue; page = pfn_to_page(pfn); folio = page_folio(page); - head = &folio->page; - if (PageHuge(page)) { - pfn = page_to_pfn(head) + compound_nr(head) - 1; - isolate_hugetlb(folio, &source); + if (!folio_try_get(folio)) continue; - } else if (PageTransHuge(page)) - pfn = page_to_pfn(head) + thp_nr_pages(page) - 1; - /* - * HWPoison pages have elevated reference counts so the migration would - * fail on them. It also doesn't make any sense to migrate them in the - * first place. Still try to unmap such a page in case it is still mapped - * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep - * the unmap as the catch all safety net). - */ - if (PageHWPoison(page)) { - if (WARN_ON(folio_test_lru(folio))) - folio_isolate_lru(folio); - if (folio_mapped(folio)) - try_to_unmap(folio, TTU_IGNORE_MLOCK); - continue; - } + if (unlikely(page_folio(page) != folio)) + goto put_folio; - if (!get_page_unless_zero(page)) - continue; - /* - * We can skip free pages. And we can deal with pages on - * LRU and non-lru movable pages. - */ - if (PageLRU(page)) - isolated = isolate_lru_page(page); - else - isolated = isolate_movable_page(page, ISOLATE_UNEVICTABLE); - if (isolated) { - list_add_tail(&page->lru, &source); - if (!__PageMovable(page)) - inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_lru(page)); + if (folio_test_large(folio)) + pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; - } else { + if (folio_contain_hwpoisoned_page(folio)) { + /* + * unmap_poisoned_folio() cannot handle large folios + * in all cases yet. + */ + if (folio_test_large(folio) && !folio_test_hugetlb(folio)) + goto put_folio; + if (folio_test_lru(folio) && !folio_isolate_lru(folio)) + goto put_folio; + if (folio_mapped(folio)) { + folio_lock(folio); + unmap_poisoned_folio(folio, pfn, false); + folio_unlock(folio); + } + + goto put_folio; + } + + if (!isolate_folio_to_list(folio, &source)) { if (__ratelimit(&migrate_rs)) { - pr_warn("failed to isolate pfn %lx\n", pfn); + pr_warn("failed to isolate pfn %lx\n", + page_to_pfn(page)); dump_page(page, "isolation failed"); } } - put_page(page); +put_folio: + folio_put(folio); } if (!list_empty(&source)) { nodemask_t nmask = node_states[N_MEMORY]; struct migration_target_control mtc = { .nmask = &nmask, - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .gfp_mask = GFP_KERNEL | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, .reason = MR_MEMORY_HOTPLUG, }; int ret; @@ -1849,7 +1846,7 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * We have checked that migration range is on a single zone so * we can use the nid of the first page to all the others. */ - mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru)); + mtc.nid = folio_nid(list_first_entry(&source, struct folio, lru)); /* * try to allocate from a different node but reuse this node @@ -1862,11 +1859,12 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) ret = migrate_pages(&source, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL); if (ret) { - list_for_each_entry(page, &source, lru) { + list_for_each_entry(folio, &source, lru) { if (__ratelimit(&migrate_rs)) { pr_warn("migrating pfn %lx failed ret:%d\n", - page_to_pfn(page), ret); - dump_page(page, "migration failure"); + folio_pfn(folio), ret); + dump_page(&folio->page, + "migration failure"); } } putback_movable_pages(&source); @@ -1881,54 +1879,6 @@ static int __init cmdline_parse_movable_node(char *p) } early_param("movable_node", cmdline_parse_movable_node); -/* check which state of node_states will be changed when offline memory */ -static void node_states_check_changes_offline(unsigned long nr_pages, - struct zone *zone, struct memory_notify *arg) -{ - struct pglist_data *pgdat = zone->zone_pgdat; - unsigned long present_pages = 0; - enum zone_type zt; - - arg->status_change_nid = NUMA_NO_NODE; - arg->status_change_nid_normal = NUMA_NO_NODE; - - /* - * Check whether node_states[N_NORMAL_MEMORY] will be changed. - * If the memory to be offline is within the range - * [0..ZONE_NORMAL], and it is the last present memory there, - * the zones in that range will become empty after the offlining, - * thus we can determine that we need to clear the node from - * node_states[N_NORMAL_MEMORY]. - */ - for (zt = 0; zt <= ZONE_NORMAL; zt++) - present_pages += pgdat->node_zones[zt].present_pages; - if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages) - arg->status_change_nid_normal = zone_to_nid(zone); - - /* - * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM - * does not apply as we don't support 32bit. - * Here we count the possible pages from ZONE_MOVABLE. - * If after having accounted all the pages, we see that the nr_pages - * to be offlined is over or equal to the accounted pages, - * we know that the node will become empty, and so, we can clear - * it for N_MEMORY as well. - */ - present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages; - - if (nr_pages >= present_pages) - arg->status_change_nid = zone_to_nid(zone); -} - -static void node_states_clear_node(int node, struct memory_notify *arg) -{ - if (arg->status_change_nid_normal >= 0) - node_clear_state(node, N_NORMAL_MEMORY); - - if (arg->status_change_nid >= 0) - node_clear_state(node, N_MEMORY); -} - static int count_system_ram_pages_cb(unsigned long start_pfn, unsigned long nr_pages, void *data) { @@ -1941,14 +1891,21 @@ static int count_system_ram_pages_cb(unsigned long start_pfn, /* * Must be called with mem_hotplug_lock in write mode. */ -int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, +int offline_pages(unsigned long start_pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group) { + unsigned long pfn, managed_pages, system_ram_pages = 0; const unsigned long end_pfn = start_pfn + nr_pages; - unsigned long pfn, system_ram_pages = 0; + struct pglist_data *pgdat = zone->zone_pgdat; const int node = zone_to_nid(zone); + struct memory_notify mem_arg = { + .start_pfn = start_pfn, + .nr_pages = nr_pages, + }; + struct node_notify node_arg = { + .nid = NUMA_NO_NODE, + }; unsigned long flags; - struct memory_notify arg; char *reason; int ret; @@ -1967,9 +1924,9 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, * Don't allow to offline memory blocks that contain holes. * Consequently, memory blocks with holes can never get onlined * via the hotplug path - online_pages() - as hotplugged memory has - * no holes. This way, we e.g., don't have to worry about marking - * memory holes PG_reserved, don't need pfn_valid() checks, and can - * avoid using walk_system_ram_range() later. + * no holes. This way, we don't have to worry about memory holes, + * don't need pfn_valid() checks, and can avoid using + * walk_system_ram_range() later. */ walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages, count_system_ram_pages_cb); @@ -2000,19 +1957,28 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, - MIGRATE_MOVABLE, - MEMORY_OFFLINE | REPORT_FAILURE, - GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL); + PB_ISOLATE_MODE_MEM_OFFLINE); if (ret) { reason = "failure to isolate range"; goto failed_removal_pcplists_disabled; } - arg.start_pfn = start_pfn; - arg.nr_pages = nr_pages; - node_states_check_changes_offline(nr_pages, zone, &arg); + /* + * Check whether the node will have no present pages after we offline + * 'nr_pages' more. If so, we know that the node will become empty, and + * so we will clear N_MEMORY for it. + */ + if (nr_pages >= pgdat->node_present_pages) { + node_arg.nid = node; + ret = node_notify(NODE_REMOVING_LAST_MEMORY, &node_arg); + ret = notifier_to_errno(ret); + if (ret) { + reason = "node notifier failure"; + goto failed_removal_isolated; + } + } - ret = memory_notify(MEM_GOING_OFFLINE, &arg); + ret = memory_notify(MEM_GOING_OFFLINE, &mem_arg); ret = notifier_to_errno(ret); if (ret) { reason = "notifier failure"; @@ -2061,12 +2027,13 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, goto failed_removal_isolated; } - ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE); + ret = test_pages_isolated(start_pfn, end_pfn, + PB_ISOLATE_MODE_MEM_OFFLINE); } while (ret); /* Mark all sections offline and remove free pages from the buddy. */ - __offline_isolated_pages(start_pfn, end_pfn); + managed_pages = __offline_isolated_pages(start_pfn, end_pfn); pr_debug("Offlined Pages %ld\n", nr_pages); /* @@ -2082,7 +2049,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, zone_pcp_enable(zone); /* removal success */ - adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); + adjust_managed_page_count(pfn_to_page(start_pfn), -managed_pages); adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages); /* reinitialise watermarks and update pcp limits */ @@ -2092,27 +2059,32 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, * Make sure to mark the node as memory-less before rebuilding the zone * list. Otherwise this node would still appear in the fallback lists. */ - node_states_clear_node(node, &arg); + if (node_arg.nid >= 0) + node_clear_state(node, N_MEMORY); if (!populated_zone(zone)) { zone_pcp_reset(zone); build_all_zonelists(NULL); } - if (arg.status_change_nid >= 0) { + if (node_arg.nid >= 0) { kcompactd_stop(node); kswapd_stop(node); + /* Node went memoryless. Notify consumers */ + node_notify(NODE_REMOVED_LAST_MEMORY, &node_arg); } writeback_set_ratelimit(); - memory_notify(MEM_OFFLINE, &arg); + memory_notify(MEM_OFFLINE, &mem_arg); remove_pfn_range_from_zone(zone, start_pfn, nr_pages); return 0; failed_removal_isolated: /* pushback to free area */ - undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); - memory_notify(MEM_CANCEL_OFFLINE, &arg); + undo_isolate_page_range(start_pfn, end_pfn); + memory_notify(MEM_CANCEL_OFFLINE, &mem_arg); + if (node_arg.nid != NUMA_NO_NODE) + node_notify(NODE_CANCEL_REMOVING_LAST_MEMORY, &node_arg); failed_removal_pcplists_disabled: lru_cache_enable(); zone_pcp_enable(zone); @@ -2218,7 +2190,7 @@ void try_offline_node(int nid) * node now. */ node_set_offline(nid); - unregister_one_node(nid); + unregister_node(nid); } EXPORT_SYMBOL(try_offline_node); @@ -2242,7 +2214,7 @@ static int memory_blocks_have_altmaps(u64 start, u64 size) return 1; } -static int __ref try_remove_memory(u64 start, u64 size) +static int try_remove_memory(u64 start, u64 size) { int rc, nid = NUMA_NO_NODE; @@ -2283,10 +2255,8 @@ static int __ref try_remove_memory(u64 start, u64 size) remove_memory_blocks_and_altmaps(start, size); } - if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { - memblock_phys_free(start, size); + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) memblock_remove(start, size); - } release_mem_region_adjustable(start, size); @@ -2346,7 +2316,7 @@ static int try_offline_memory_block(struct memory_block *mem, void *arg) * by offlining code ... so we don't care about that. */ page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr)); - if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE) + if (page && page_zonenum(page) == ZONE_MOVABLE) online_type = MMOP_ONLINE_MOVABLE; rc = device_offline(&mem->dev); |
