Diffstat (limited to 'mm/memory_hotplug.c')
| -rw-r--r-- | mm/memory_hotplug.c | 801 |
1 file changed, 480 insertions(+), 321 deletions(-)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3f231cf1b410..a63ec679d861 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -35,23 +35,92 @@
 #include <linux/compaction.h>
 #include <linux/rmap.h>
 #include <linux/module.h>
+#include <linux/node.h>
 
 #include <asm/tlbflush.h>
 
 #include "internal.h"
 #include "shuffle.h"
 
+enum {
+	MEMMAP_ON_MEMORY_DISABLE = 0,
+	MEMMAP_ON_MEMORY_ENABLE,
+	MEMMAP_ON_MEMORY_FORCE,
+};
+
+static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
+
+static inline unsigned long memory_block_memmap_size(void)
+{
+	return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
+}
+
+static inline unsigned long memory_block_memmap_on_memory_pages(void)
+{
+	unsigned long nr_pages = PFN_UP(memory_block_memmap_size());
+
+	/*
+	 * In "forced" memmap_on_memory mode, we add extra pages to align the
+	 * vmemmap size to cover full pageblocks. That way, we can add memory
+	 * even if the vmemmap size is not properly aligned, however, we might waste
+	 * memory.
+	 */
+	if (memmap_mode == MEMMAP_ON_MEMORY_FORCE)
+		return pageblock_align(nr_pages);
+	return nr_pages;
+}
+
 #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
 /*
  * memory_hotplug.memmap_on_memory parameter
  */
-static bool memmap_on_memory __ro_after_init;
-module_param(memmap_on_memory, bool, 0444);
-MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
+static int set_memmap_mode(const char *val, const struct kernel_param *kp)
+{
+	int ret, mode;
+	bool enabled;
+
+	if (sysfs_streq(val, "force") || sysfs_streq(val, "FORCE")) {
+		mode = MEMMAP_ON_MEMORY_FORCE;
+	} else {
+		ret = kstrtobool(val, &enabled);
+		if (ret < 0)
+			return ret;
+		if (enabled)
+			mode = MEMMAP_ON_MEMORY_ENABLE;
+		else
+			mode = MEMMAP_ON_MEMORY_DISABLE;
+	}
+	*((int *)kp->arg) = mode;
+	if (mode == MEMMAP_ON_MEMORY_FORCE) {
+		unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
+
+		pr_info_once("Memory hotplug will waste %ld pages in each memory block\n",
+			     memmap_pages - PFN_UP(memory_block_memmap_size()));
+	}
+	return 0;
+}
+
+static int get_memmap_mode(char *buffer, const struct kernel_param *kp)
+{
+	int mode = *((int *)kp->arg);
+
+	if (mode == MEMMAP_ON_MEMORY_FORCE)
+		return sprintf(buffer, "force\n");
+	return sprintf(buffer, "%c\n", mode ? 'Y' : 'N');
+}
+
+static const struct kernel_param_ops memmap_mode_ops = {
+	.set = set_memmap_mode,
+	.get = get_memmap_mode,
+};
+module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444);
+MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n"
+		 "With value \"force\" it could result in memory wastage due "
+		 "to memmap size limitations (Y/N/force)");
 
 static inline bool mhp_memmap_on_memory(void)
 {
-	return memmap_on_memory;
+	return memmap_mode != MEMMAP_ON_MEMORY_DISABLE;
 }
 #else
 static inline bool mhp_memmap_on_memory(void)
@@ -151,11 +220,30 @@ void put_online_mems(void)
 
 bool movable_node_enabled = false;
 
-#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
-int mhp_default_online_type = MMOP_OFFLINE;
-#else
-int mhp_default_online_type = MMOP_ONLINE;
-#endif
+static int mhp_default_online_type = -1;
+int mhp_get_default_online_type(void)
+{
+	if (mhp_default_online_type >= 0)
+		return mhp_default_online_type;
+
+	if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE))
+		mhp_default_online_type = MMOP_OFFLINE;
+	else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO))
+		mhp_default_online_type = MMOP_ONLINE;
+	else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL))
+		mhp_default_online_type = MMOP_ONLINE_KERNEL;
+	else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE))
+		mhp_default_online_type = MMOP_ONLINE_MOVABLE;
+	else
+		mhp_default_online_type = MMOP_OFFLINE;
+
+	return mhp_default_online_type;
+}
+
+void mhp_set_default_online_type(int online_type)
+{
+	mhp_default_online_type = online_type;
+}
 
 static int __init setup_memhp_default_state(char *str)
 {
@@ -287,7 +375,7 @@ struct page *pfn_to_online_page(unsigned long pfn)
 	 * the section may be 'offline' but 'valid'. Only
 	 * get_dev_pagemap() can determine sub-section online status.
 	 */
-	pgmap = get_dev_pagemap(pfn, NULL);
+	pgmap = get_dev_pagemap(pfn);
 	put_dev_pagemap(pgmap);
 
 	/* The presence of a pgmap indicates ZONE_DEVICE offline pfn */
@@ -298,7 +386,7 @@ struct page *pfn_to_online_page(unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(pfn_to_online_page);
 
-int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
+int __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
 		struct mhp_params *params)
 {
 	const unsigned long end_pfn = pfn + nr_pages;
@@ -456,7 +544,7 @@ static void update_pgdat_span(struct pglist_data *pgdat)
 	pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
 }
 
-void __ref remove_pfn_range_from_zone(struct zone *zone,
+void remove_pfn_range_from_zone(struct zone *zone,
 				      unsigned long start_pfn,
 				      unsigned long nr_pages)
 {
@@ -560,16 +648,10 @@ int restore_online_page_callback(online_page_callback_t callback)
 }
 EXPORT_SYMBOL_GPL(restore_online_page_callback);
 
+/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
 void generic_online_page(struct page *page, unsigned int order)
 {
-	/*
-	 * Freeing the page with debug_pagealloc enabled will try to unmap it,
-	 * so we should map it first. This is better than introducing a special
-	 * case in page freeing fast path.
-	 */
-	debug_pagealloc_map_pages(page, 1 << order);
-	__free_pages_core(page, order);
-	totalram_pages_add(1UL << order);
+	__free_pages_core(page, order, MEMINIT_HOTPLUG);
 }
 EXPORT_SYMBOL_GPL(generic_online_page);
 
@@ -579,7 +661,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
 	unsigned long pfn;
 
 	/*
-	 * Online the pages in MAX_ORDER aligned chunks. The callback might
+	 * Online the pages in MAX_PAGE_ORDER aligned chunks. The callback might
 	 * decide to not expose all pages to the buddy (e.g., expose them
 	 * later). We account all pages as being online and belonging to this
 	 * zone ("present").
@@ -588,20 +670,29 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
 	 * this and the first chunk to online will be pageblock_nr_pages.
 	 */
 	for (pfn = start_pfn; pfn < end_pfn;) {
+		struct page *page = pfn_to_page(pfn);
 		int order;
 
 		/*
 		 * Free to online pages in the largest chunks alignment allows.
 		 *
 		 * __ffs() behaviour is undefined for 0. start == 0 is
-		 * MAX_ORDER-aligned, Set order to MAX_ORDER for the case.
+		 * MAX_PAGE_ORDER-aligned, Set order to MAX_PAGE_ORDER for
+		 * the case.
		 */
 		if (pfn)
-			order = min_t(int, MAX_ORDER, __ffs(pfn));
+			order = min_t(int, MAX_PAGE_ORDER, __ffs(pfn));
 		else
-			order = MAX_ORDER;
+			order = MAX_PAGE_ORDER;
 
-		(*online_page_callback)(pfn_to_page(pfn), order);
+		/*
+		 * Exposing the page to the buddy by freeing can cause
+		 * issues with debug_pagealloc enabled: some archs don't
+		 * like double-unmappings. So treat them like any pages that
+		 * were allocated from the buddy.
+		 */
+		debug_pagealloc_map_pages(page, 1 << order);
+		(*online_page_callback)(page, order);
 		pfn += (1UL << order);
 	}
 
@@ -609,30 +700,6 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
 	online_mem_sections(start_pfn, end_pfn);
 }
 
-/* check which state of node_states will be changed when online memory */
-static void node_states_check_changes_online(unsigned long nr_pages,
-	struct zone *zone, struct memory_notify *arg)
-{
-	int nid = zone_to_nid(zone);
-
-	arg->status_change_nid = NUMA_NO_NODE;
-	arg->status_change_nid_normal = NUMA_NO_NODE;
-
-	if (!node_state(nid, N_MEMORY))
-		arg->status_change_nid = nid;
-	if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
-		arg->status_change_nid_normal = nid;
-}
-
-static void node_states_set_node(int node, struct memory_notify *arg)
-{
-	if (arg->status_change_nid_normal >= 0)
-		node_set_state(node, N_NORMAL_MEMORY);
-
-	if (arg->status_change_nid >= 0)
-		node_set_state(node, N_MEMORY);
-}
-
 static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
 		unsigned long nr_pages)
 {
@@ -672,15 +739,16 @@ static inline void section_taint_zone_device(unsigned long pfn)
 /*
  * Associate the pfn range with the given zone, initializing the memmaps
  * and resizing the pgdat/zone data to span the added pages. After this
- * call, all affected pages are PG_reserved.
+ * call, all affected pages are PageOffline().
  *
 * All aligned pageblocks are initialized to the specified migratetype
 * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
 * zone stats (e.g., nr_isolate_pageblock) are touched.
 */
-void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
+void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 				  unsigned long nr_pages,
-				  struct vmem_altmap *altmap, int migratetype)
+				  struct vmem_altmap *altmap, int migratetype,
+				  bool isolate_pageblock)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
 	int nid = pgdat->node_id;
@@ -707,12 +775,13 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 
 	/*
 	 * TODO now we have a visible range of pages which are not associated
-	 * with their zone properly. Not nice but set_pfnblock_flags_mask
+	 * with their zone properly. Not nice but set_pfnblock_migratetype()
 	 * expects the zone spans the pfn range. All the pages in the range
 	 * are reserved so nobody should be touching them so we should be safe
 	 */
 	memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0,
-			 MEMINIT_HOTPLUG, altmap, migratetype);
+			 MEMINIT_HOTPLUG, altmap, migratetype,
+			 isolate_pageblock);
 
 	set_zone_contiguous(zone);
 }
@@ -777,7 +846,6 @@ static bool auto_movable_can_online_movable(int nid, struct memory_group *group,
 	unsigned long kernel_early_pages, movable_pages;
 	struct auto_movable_group_stats group_stats = {};
 	struct auto_movable_stats stats = {};
-	pg_data_t *pgdat = NODE_DATA(nid);
 	struct zone *zone;
 	int i;
 
@@ -788,6 +856,8 @@ static bool auto_movable_can_online_movable(int nid, struct memory_group *group,
 		auto_movable_stats_account_zone(&stats, zone);
 	} else {
 		for (i = 0; i < MAX_NR_ZONES; i++) {
+			pg_data_t *pgdat = NODE_DATA(nid);
+
 			zone = pgdat->node_zones + i;
 			if (populated_zone(zone))
 				auto_movable_stats_account_zone(&stats, zone);
@@ -885,7 +955,7 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn
  *    effectively unused by the kernel, yet they account to "present pages".
 *    Fortunately, these allocations are comparatively small in relevant setups
 *    (e.g., fraction of system memory).
- * b) Some hotplugged memory blocks in virtualized environments, esecially
+ * b) Some hotplugged memory blocks in virtualized environments, especially
 *    hotplugged by virtio-mem, look like they are completely present, however,
 *    only parts of the memory block are actually currently usable.
 *    "present pages" is an upper limit that can get reached at runtime. As
@@ -1027,10 +1097,15 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
 	if (ret)
 		return ret;
 
-	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE,
+			       false);
 
-	for (i = 0; i < nr_pages; i++)
-		SetPageVmemmapSelfHosted(pfn_to_page(pfn + i));
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page = pfn_to_page(pfn + i);
+
+		__ClearPageOffline(page);
+		SetPageVmemmapSelfHosted(page);
+	}
 
 	/*
 	 * It might be that the vmemmap_pages fully span sections. If that is
@@ -1063,14 +1138,23 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
 		kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
 }
 
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
+/*
+ * Must be called with mem_hotplug_lock in write mode.
+ */
+int online_pages(unsigned long pfn, unsigned long nr_pages,
 		       struct zone *zone, struct memory_group *group)
 {
-	unsigned long flags;
-	int need_zonelists_rebuild = 0;
+	struct memory_notify mem_arg = {
+		.start_pfn = pfn,
+		.nr_pages = nr_pages,
+	};
+	struct node_notify node_arg = {
+		.nid = NUMA_NO_NODE,
+	};
 	const int nid = zone_to_nid(zone);
+	int need_zonelists_rebuild = 0;
+	unsigned long flags;
 	int ret;
-	struct memory_notify arg;
 
 	/*
 	 * {on,off}lining is constrained to full memory sections (or more
@@ -1083,16 +1167,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 			 !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
 		return -EINVAL;
 
-	mem_hotplug_begin();
 
 	/* associate pfn range with the zone */
-	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
-
-	arg.start_pfn = pfn;
-	arg.nr_pages = nr_pages;
-	node_states_check_changes_online(nr_pages, zone, &arg);
+	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE,
+			       true);
+
+	if (!node_state(nid, N_MEMORY)) {
+		/* Adding memory to the node for the first time */
+		node_arg.nid = nid;
+		ret = node_notify(NODE_ADDING_FIRST_MEMORY, &node_arg);
+		ret = notifier_to_errno(ret);
+		if (ret)
+			goto failed_addition;
+	}
 
-	ret = memory_notify(MEM_GOING_ONLINE, &arg);
+	ret = memory_notify(MEM_GOING_ONLINE, &mem_arg);
 	ret = notifier_to_errno(ret);
 	if (ret)
 		goto failed_addition;
@@ -1118,12 +1207,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 	online_pages_range(pfn, nr_pages);
 	adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
 
-	node_states_set_node(nid, &arg);
+	if (node_arg.nid >= 0)
+		node_set_state(nid, N_MEMORY);
 	if (need_zonelists_rebuild)
 		build_all_zonelists(NULL);
 
 	/* Basic onlining is complete, allow allocation of onlined pages. */
-	undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
+	undo_isolate_page_range(pfn, pfn + nr_pages);
 
 	/*
 	 * Freshly onlined pages aren't shuffled (e.g., all pages are placed to
@@ -1139,24 +1229,28 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 	kswapd_run(nid);
 	kcompactd_run(nid);
 
+	if (node_arg.nid >= 0)
+		/* First memory added successfully. Notify consumers. */
+		node_notify(NODE_ADDED_FIRST_MEMORY, &node_arg);
+
 	writeback_set_ratelimit();
 
-	memory_notify(MEM_ONLINE, &arg);
-	mem_hotplug_done();
+	memory_notify(MEM_ONLINE, &mem_arg);
 	return 0;
 
 failed_addition:
 	pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
 		 (unsigned long long) pfn << PAGE_SHIFT,
 		 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
-	memory_notify(MEM_CANCEL_ONLINE, &arg);
+	memory_notify(MEM_CANCEL_ONLINE, &mem_arg);
+	if (node_arg.nid != NUMA_NO_NODE)
+		node_notify(NODE_CANCEL_ADDING_FIRST_MEMORY, &node_arg);
 	remove_pfn_range_from_zone(zone, pfn, nr_pages);
-	mem_hotplug_done();
 	return ret;
 }
 
 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-static pg_data_t __ref *hotadd_init_pgdat(int nid)
+static pg_data_t *hotadd_init_pgdat(int nid)
 {
 	struct pglist_data *pgdat;
 
@@ -1208,7 +1302,7 @@ static int __try_online_node(int nid, bool set_node_online)
 
 	if (set_node_online) {
 		node_set_online(nid);
-		ret = register_one_node(nid);
+		ret = register_node(nid);
 		BUG_ON(ret);
 	}
 out:
@@ -1243,31 +1337,36 @@ static int check_hotplug_memory_range(u64 start, u64 size)
 
 static int online_memory_block(struct memory_block *mem, void *arg)
 {
-	mem->online_type = mhp_default_online_type;
+	mem->online_type = mhp_get_default_online_type();
 	return device_online(&mem->dev);
 }
 
-bool mhp_supports_memmap_on_memory(unsigned long size)
+#ifndef arch_supports_memmap_on_memory
+static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
+{
+	/*
+	 * As default, we want the vmemmap to span a complete PMD such that we
+	 * can map the vmemmap using a single PMD if supported by the
+	 * architecture.
+	 */
+	return IS_ALIGNED(vmemmap_size, PMD_SIZE);
+}
+#endif
+
+bool mhp_supports_memmap_on_memory(void)
 {
-	unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
-	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
-	unsigned long remaining_size = size - vmemmap_size;
+	unsigned long vmemmap_size = memory_block_memmap_size();
+	unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
 
 	/*
 	 * Besides having arch support and the feature enabled at runtime, we
 	 * need a few more assumptions to hold true:
 	 *
-	 * a) We span a single memory block: memory onlining/offlining happens
-	 *    in memory block granularity. We don't want the vmemmap of online
-	 *    memory blocks to reside on offline memory blocks. In the future,
-	 *    we might want to support variable-sized memory blocks to make the
-	 *    feature more versatile.
-	 *
-	 * b) The vmemmap pages span complete PMDs: We don't want vmemmap code
+	 * a) The vmemmap pages span complete PMDs: We don't want vmemmap code
 	 *    to populate memory from the altmap for unrelated parts (i.e.,
 	 *    other memory blocks)
 	 *
-	 * c) The vmemmap pages (and thereby the pages that will be exposed to
+	 * b) The vmemmap pages (and thereby the pages that will be exposed to
 	 *    the buddy) have to cover full pageblocks: memory onlining/offlining
 	 *    code requires applicable ranges to be page-aligned, for example, to
 	 *    set the migratetypes properly.
@@ -1279,10 +1378,108 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
 	 *       altmap as an alternative source of memory, and we do not exactly
 	 *       populate a single PMD.
	 */
-	return mhp_memmap_on_memory() &&
-	       size == memory_block_size_bytes() &&
-	       IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
-	       IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
+	if (!mhp_memmap_on_memory())
+		return false;
+
+	/*
+	 * Make sure the vmemmap allocation is fully contained
+	 * so that we always allocate vmemmap memory from altmap area.
+	 */
+	if (!IS_ALIGNED(vmemmap_size, PAGE_SIZE))
+		return false;
+
+	/*
+	 * start pfn should be pageblock_nr_pages aligned for correctly
+	 * setting migrate types
+	 */
+	if (!pageblock_aligned(memmap_pages))
+		return false;
+
+	if (memmap_pages == PHYS_PFN(memory_block_size_bytes()))
+		/* No effective hotplugged memory doesn't make sense. */
+		return false;
+
+	return arch_supports_memmap_on_memory(vmemmap_size);
+}
+EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory);
+
+static void remove_memory_blocks_and_altmaps(u64 start, u64 size)
+{
+	unsigned long memblock_size = memory_block_size_bytes();
+	u64 cur_start;
+
+	/*
+	 * For memmap_on_memory, the altmaps were added on a per-memblock
+	 * basis; we have to process each individual memory block.
+	 */
+	for (cur_start = start; cur_start < start + size;
+	     cur_start += memblock_size) {
+		struct vmem_altmap *altmap = NULL;
+		struct memory_block *mem;
+
+		mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(cur_start)));
+		if (WARN_ON_ONCE(!mem))
+			continue;
+
+		altmap = mem->altmap;
+		mem->altmap = NULL;
+
+		remove_memory_block_devices(cur_start, memblock_size);
+
+		arch_remove_memory(cur_start, memblock_size, altmap);
+
+		/* Verify that all vmemmap pages have actually been freed. */
+		WARN(altmap->alloc, "Altmap not fully unmapped");
+		kfree(altmap);
+	}
+}
+
+static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
+					    u64 start, u64 size)
+{
+	unsigned long memblock_size = memory_block_size_bytes();
+	u64 cur_start;
+	int ret;
+
+	for (cur_start = start; cur_start < start + size;
+	     cur_start += memblock_size) {
+		struct mhp_params params = { .pgprot =
+						     pgprot_mhp(PAGE_KERNEL) };
+		struct vmem_altmap mhp_altmap = {
+			.base_pfn = PHYS_PFN(cur_start),
+			.end_pfn = PHYS_PFN(cur_start + memblock_size - 1),
+		};
+
+		mhp_altmap.free = memory_block_memmap_on_memory_pages();
+		params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap),
+					GFP_KERNEL);
+		if (!params.altmap) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		/* call arch's memory hotadd */
+		ret = arch_add_memory(nid, cur_start, memblock_size, &params);
+		if (ret < 0) {
+			kfree(params.altmap);
+			goto out;
+		}
+
+		/* create memory block devices after memory was added */
+		ret = create_memory_block_devices(cur_start, memblock_size, nid,
+						  params.altmap, group);
+		if (ret) {
+			arch_remove_memory(cur_start, memblock_size, NULL);
+			kfree(params.altmap);
+			goto out;
+		}
+	}
+
+	return 0;
+out:
+	if (ret && cur_start != start)
+		remove_memory_blocks_and_altmaps(start, cur_start - start);
+	return ret;
 }
 
 /*
@@ -1291,11 +1488,10 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
 *
 * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
 */
-int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
+int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 {
 	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
 	enum memblock_flags memblock_flags = MEMBLOCK_NONE;
-	struct vmem_altmap mhp_altmap = {};
 	struct memory_group *group = NULL;
 	u64 start, size;
 	bool new_node = false;
@@ -1332,49 +1528,40 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 
 	ret = __try_online_node(nid, false);
 	if (ret < 0)
-		goto error;
-	new_node = ret;
+		goto error_memblock_remove;
+	if (ret) {
+		node_set_online(nid);
+		ret = register_node(nid);
+		if (WARN_ON(ret)) {
+			node_set_offline(nid);
+			goto error_memblock_remove;
+		}
+		new_node = true;
+	}
 
 	/*
 	 * Self hosted memmap array
 	 */
-	if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
-		if (!mhp_supports_memmap_on_memory(size)) {
-			ret = -EINVAL;
+	if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) &&
+	    mhp_supports_memmap_on_memory()) {
+		ret = create_altmaps_and_memory_blocks(nid, group, start, size);
+		if (ret)
+			goto error;
+	} else {
+		ret = arch_add_memory(nid, start, size, &params);
+		if (ret < 0)
 			goto error;
-		}
-		mhp_altmap.free = PHYS_PFN(size);
-		mhp_altmap.base_pfn = PHYS_PFN(start);
-		params.altmap = &mhp_altmap;
-	}
-
-	/* call arch's memory hotadd */
-	ret = arch_add_memory(nid, start, size, &params);
-	if (ret < 0)
-		goto error;
-
-	/* create memory block devices after memory was added */
-	ret = create_memory_block_devices(start, size, mhp_altmap.alloc,
-					  group);
-	if (ret) {
-		arch_remove_memory(start, size, NULL);
-		goto error;
-	}
 
-	if (new_node) {
-		/* If sysfs file of new node can't be created, cpu on the node
-		 * can't be hot-added. There is no rollback way now.
-		 * So, check by BUG_ON() to catch it reluctantly..
-		 * We online node here. We can't roll back from here.
-		 */
-		node_set_online(nid);
-		ret = __register_one_node(nid);
-		BUG_ON(ret);
+		/* create memory block devices after memory was added */
+		ret = create_memory_block_devices(start, size, nid, NULL, group);
+		if (ret) {
+			arch_remove_memory(start, size, params.altmap);
+			goto error;
+		}
 	}
 
-	register_memory_blocks_under_node(nid, PFN_DOWN(start),
-					  PFN_UP(start + size - 1),
-					  MEMINIT_HOTPLUG);
+	register_memory_blocks_under_node_hotplug(nid, PFN_DOWN(start),
						  PFN_UP(start + size - 1));
 
 	/* create new memmap entry */
 	if (!strcmp(res->name, "System RAM"))
@@ -1391,11 +1578,16 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 		merge_system_ram_resource(res);
 
 	/* online pages if requested */
-	if (mhp_default_online_type != MMOP_OFFLINE)
+	if (mhp_get_default_online_type() != MMOP_OFFLINE)
 		walk_memory_blocks(start, size, NULL, online_memory_block);
 
 	return ret;
 error:
+	if (new_node) {
+		node_set_offline(nid);
+		unregister_node(nid);
+	}
+error_memblock_remove:
 	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
 		memblock_remove(start, size);
 error_mem_hotplug_end:
@@ -1404,7 +1596,7 @@ error_mem_hotplug_end:
 }
 
 /* requires device_hotplug_lock, see add_memory_resource() */
-int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
+int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
 {
 	struct resource *res;
 	int ret;
@@ -1505,7 +1697,7 @@ struct range __weak arch_get_mappable_range(void)
 
 struct range mhp_get_pluggable_range(bool need_mapping)
 {
-	const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
+	const u64 max_phys = DIRECT_MAP_PHYSMEM_END;
 	struct range mhp_range;
 
 	if (need_mapping) {
@@ -1538,8 +1730,8 @@ bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /*
- * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
- * non-lru movable pages and hugepages). Will skip over most unmovable
+ * Scan pfn range [start,end) to find movable/migratable pages (LRU and
+ * hugetlb folio, movable_ops pages). Will skip over most unmovable
 * pages (esp., pages that can be skipped when offlining), but bail out on
 * definitely unmovable pages.
 *
@@ -1553,20 +1745,16 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
 {
 	unsigned long pfn;
 
-	for (pfn = start; pfn < end; pfn++) {
-		struct page *page, *head;
-		unsigned long skip;
+	for_each_valid_pfn(pfn, start, end) {
+		struct page *page;
+		struct folio *folio;
 
-		if (!pfn_valid(pfn))
-			continue;
 		page = pfn_to_page(pfn);
-		if (PageLRU(page))
-			goto found;
-		if (__PageMovable(page))
+		if (PageLRU(page) || page_has_movable_ops(page))
 			goto found;
 
 		/*
-		 * PageOffline() pages that are not marked __PageMovable() and
+		 * PageOffline() pages that do not have movable_ops and
		 * have a reference count > 0 (after MEM_GOING_OFFLINE) are
		 * definitely unmovable. If their reference count would be 0,
		 * they could at least be skipped when offlining memory.
@@ -1576,7 +1764,7 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
 
 		if (!PageHuge(page))
 			continue;
-		head = compound_head(page);
+		folio = page_folio(page);
 		/*
		 * This test is racy as we hold no reference or lock.  The
		 * hugetlb page could have been free'ed and head is no longer
@@ -1584,10 +1772,9 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
		 * cases false positives and negatives are possible.  Calling
		 * code must deal with these scenarios.
		 */
-		if (HPageMigratable(head))
+		if (folio_test_hugetlb_migratable(folio))
 			goto found;
-		skip = compound_nr(head) - (page - head);
-		pfn += skip - 1;
+		pfn |= folio_nr_pages(folio) - 1;
 	}
 	return -ENOENT;
 found:
@@ -1597,73 +1784,61 @@ found:
 
 static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 {
+	struct folio *folio;
 	unsigned long pfn;
-	struct page *page, *head;
 	LIST_HEAD(source);
 	static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
 
-	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-		struct folio *folio;
-		bool isolated;
+	for_each_valid_pfn(pfn, start_pfn, end_pfn) {
+		struct page *page;
 
-		if (!pfn_valid(pfn))
-			continue;
 		page = pfn_to_page(pfn);
 		folio = page_folio(page);
-		head = &folio->page;
 
-		if (PageHuge(page)) {
-			pfn = page_to_pfn(head) + compound_nr(head) - 1;
-			isolate_hugetlb(folio, &source);
+		if (!folio_try_get(folio))
 			continue;
-		} else if (PageTransHuge(page))
-			pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;
 
-		/*
-		 * HWPoison pages have elevated reference counts so the migration would
-		 * fail on them. It also doesn't make any sense to migrate them in the
-		 * first place. Still try to unmap such a page in case it is still mapped
-		 * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep
-		 * the unmap as the catch all safety net).
-		 */
-		if (PageHWPoison(page)) {
-			if (WARN_ON(folio_test_lru(folio)))
-				folio_isolate_lru(folio);
-			if (folio_mapped(folio))
-				try_to_unmap(folio, TTU_IGNORE_MLOCK);
-			continue;
-		}
+		if (unlikely(page_folio(page) != folio))
+			goto put_folio;
 
-		if (!get_page_unless_zero(page))
-			continue;
-		/*
-		 * We can skip free pages. And we can deal with pages on
-		 * LRU and non-lru movable pages.
-		 */
-		if (PageLRU(page))
-			isolated = isolate_lru_page(page);
-		else
-			isolated = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
-		if (isolated) {
-			list_add_tail(&page->lru, &source);
-			if (!__PageMovable(page))
-				inc_node_page_state(page, NR_ISOLATED_ANON +
-						    page_is_file_lru(page));
+		if (folio_test_large(folio))
+			pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
 
-		} else {
+		if (folio_contain_hwpoisoned_page(folio)) {
+			/*
+			 * unmap_poisoned_folio() cannot handle large folios
+			 * in all cases yet.
+			 */
+			if (folio_test_large(folio) && !folio_test_hugetlb(folio))
+				goto put_folio;
+			if (folio_test_lru(folio) && !folio_isolate_lru(folio))
+				goto put_folio;
+			if (folio_mapped(folio)) {
+				folio_lock(folio);
+				unmap_poisoned_folio(folio, pfn, false);
+				folio_unlock(folio);
+			}
+
+			goto put_folio;
+		}
+
+		if (!isolate_folio_to_list(folio, &source)) {
 			if (__ratelimit(&migrate_rs)) {
-				pr_warn("failed to isolate pfn %lx\n", pfn);
+				pr_warn("failed to isolate pfn %lx\n",
+					page_to_pfn(page));
 				dump_page(page, "isolation failed");
 			}
 		}
-		put_page(page);
+put_folio:
+		folio_put(folio);
 	}
 	if (!list_empty(&source)) {
 		nodemask_t nmask = node_states[N_MEMORY];
 		struct migration_target_control mtc = {
 			.nmask = &nmask,
-			.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+			.gfp_mask = GFP_KERNEL | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+			.reason = MR_MEMORY_HOTPLUG,
 		};
 		int ret;
 
@@ -1671,7 +1846,7 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		 * We have checked that migration range is on a single zone so
		 * we can use the nid of the first page to all the others.
		 */
-		mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
+		mtc.nid = folio_nid(list_first_entry(&source, struct folio, lru));
 
 		/*
		 * try to allocate from a different node but reuse this node
@@ -1684,11 +1859,12 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		ret = migrate_pages(&source, alloc_migration_target, NULL,
 			(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL);
 		if (ret) {
-			list_for_each_entry(page, &source, lru) {
+			list_for_each_entry(folio, &source, lru) {
 				if (__ratelimit(&migrate_rs)) {
 					pr_warn("migrating pfn %lx failed ret:%d\n",
-						page_to_pfn(page), ret);
-					dump_page(page, "migration failure");
+						folio_pfn(folio), ret);
+					dump_page(&folio->page,
+						  "migration failure");
 				}
 			}
 			putback_movable_pages(&source);
@@ -1703,54 +1879,6 @@ static int __init cmdline_parse_movable_node(char *p)
 }
 early_param("movable_node", cmdline_parse_movable_node);
 
-/* check which state of node_states will be changed when offline memory */
-static void node_states_check_changes_offline(unsigned long nr_pages,
-		struct zone *zone, struct memory_notify *arg)
-{
-	struct pglist_data *pgdat = zone->zone_pgdat;
-	unsigned long present_pages = 0;
-	enum zone_type zt;
-
-	arg->status_change_nid = NUMA_NO_NODE;
-	arg->status_change_nid_normal = NUMA_NO_NODE;
-
-	/*
-	 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
-	 * If the memory to be offline is within the range
-	 * [0..ZONE_NORMAL], and it is the last present memory there,
-	 * the zones in that range will become empty after the offlining,
-	 * thus we can determine that we need to clear the node from
-	 * node_states[N_NORMAL_MEMORY].
-	 */
-	for (zt = 0; zt <= ZONE_NORMAL; zt++)
-		present_pages += pgdat->node_zones[zt].present_pages;
-	if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
-		arg->status_change_nid_normal = zone_to_nid(zone);
-
-	/*
-	 * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM
-	 * does not apply as we don't support 32bit.
-	 * Here we count the possible pages from ZONE_MOVABLE.
-	 * If after having accounted all the pages, we see that the nr_pages
-	 * to be offlined is over or equal to the accounted pages,
-	 * we know that the node will become empty, and so, we can clear
-	 * it for N_MEMORY as well.
-	 */
-	present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
-
-	if (nr_pages >= present_pages)
-		arg->status_change_nid = zone_to_nid(zone);
-}
-
-static void node_states_clear_node(int node, struct memory_notify *arg)
-{
-	if (arg->status_change_nid_normal >= 0)
-		node_clear_state(node, N_NORMAL_MEMORY);
-
-	if (arg->status_change_nid >= 0)
-		node_clear_state(node, N_MEMORY);
-}
-
 static int count_system_ram_pages_cb(unsigned long start_pfn,
 				     unsigned long nr_pages, void *data)
 {
@@ -1760,14 +1888,24 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
 	return 0;
 }
 
-int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
+/*
+ * Must be called with mem_hotplug_lock in write mode.
+ */
+int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 			struct zone *zone, struct memory_group *group)
 {
+	unsigned long pfn, managed_pages, system_ram_pages = 0;
 	const unsigned long end_pfn = start_pfn + nr_pages;
-	unsigned long pfn, system_ram_pages = 0;
+	struct pglist_data *pgdat = zone->zone_pgdat;
 	const int node = zone_to_nid(zone);
+	struct memory_notify mem_arg = {
+		.start_pfn = start_pfn,
+		.nr_pages = nr_pages,
+	};
+	struct node_notify node_arg = {
+		.nid = NUMA_NO_NODE,
+	};
 	unsigned long flags;
-	struct memory_notify arg;
 	char *reason;
 	int ret;
 
@@ -1782,15 +1920,13 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 			 !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)))
 		return -EINVAL;
 
-	mem_hotplug_begin();
-
 	/*
	 * Don't allow to offline memory blocks that contain holes.
	 * Consequently, memory blocks with holes can never get onlined
	 * via the hotplug path - online_pages() - as hotplugged memory has
-	 * no holes. This way, we e.g., don't have to worry about marking
-	 * memory holes PG_reserved, don't need pfn_valid() checks, and can
-	 * avoid using walk_system_ram_range() later.
+	 * no holes. This way, we don't have to worry about memory holes,
+	 * don't need pfn_valid() checks, and can avoid using
+	 * walk_system_ram_range() later.
	 */
 	walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
 			      count_system_ram_pages_cb);
@@ -1821,19 +1957,28 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 
 	/* set above range as isolated */
 	ret = start_isolate_page_range(start_pfn, end_pfn,
-				       MIGRATE_MOVABLE,
-				       MEMORY_OFFLINE | REPORT_FAILURE,
-				       GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
+				       PB_ISOLATE_MODE_MEM_OFFLINE);
 	if (ret) {
 		reason = "failure to isolate range";
 		goto failed_removal_pcplists_disabled;
 	}
 
-	arg.start_pfn = start_pfn;
-	arg.nr_pages = nr_pages;
-	node_states_check_changes_offline(nr_pages, zone, &arg);
+	/*
+	 * Check whether the node will have no present pages after we offline
+	 * 'nr_pages' more. If so, we know that the node will become empty, and
+	 * so we will clear N_MEMORY for it.
+	 */
+	if (nr_pages >= pgdat->node_present_pages) {
+		node_arg.nid = node;
+		ret = node_notify(NODE_REMOVING_LAST_MEMORY, &node_arg);
+		ret = notifier_to_errno(ret);
+		if (ret) {
+			reason = "node notifier failure";
+			goto failed_removal_isolated;
+		}
+	}
 
-	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
+	ret = memory_notify(MEM_GOING_OFFLINE, &mem_arg);
 	ret = notifier_to_errno(ret);
 	if (ret) {
 		reason = "notifier failure";
@@ -1843,6 +1988,11 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	do {
 		pfn = start_pfn;
 		do {
+			/*
			 * Historically we always checked for any signal and
			 * can't limit it to fatal signals without eventually
			 * breaking user space.
			 */
 			if (signal_pending(current)) {
 				ret = -EINTR;
 				reason = "signal backoff";
@@ -1867,22 +2017,23 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 		}
 
 		/*
-		 * Dissolve free hugepages in the memory block before doing
+		 * Dissolve free hugetlb folios in the memory block before doing
		 * offlining actually in order to make hugetlbfs's object
		 * counting consistent.
		 */
-		ret = dissolve_free_huge_pages(start_pfn, end_pfn);
+		ret = dissolve_free_hugetlb_folios(start_pfn, end_pfn);
 		if (ret) {
 			reason = "failure to dissolve huge pages";
 			goto failed_removal_isolated;
 		}
 
-		ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
+		ret = test_pages_isolated(start_pfn, end_pfn,
+					  PB_ISOLATE_MODE_MEM_OFFLINE);
 
 	} while (ret);
 
 	/* Mark all sections offline and remove free pages from the buddy. */
-	__offline_isolated_pages(start_pfn, end_pfn);
+	managed_pages = __offline_isolated_pages(start_pfn, end_pfn);
 	pr_debug("Offlined Pages %ld\n", nr_pages);
 
 	/*
@@ -1898,34 +2049,42 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	zone_pcp_enable(zone);
 
 	/* removal success */
-	adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
+	adjust_managed_page_count(pfn_to_page(start_pfn), -managed_pages);
 	adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages);
 
 	/* reinitialise watermarks and update pcp limits */
 	init_per_zone_wmark_min();
 
+	/*
	 * Make sure to mark the node as memory-less before rebuilding the zone
	 * list. Otherwise this node would still appear in the fallback lists.
	 */
+	if (node_arg.nid >= 0)
+		node_clear_state(node, N_MEMORY);
 	if (!populated_zone(zone)) {
 		zone_pcp_reset(zone);
 		build_all_zonelists(NULL);
 	}
 
-	node_states_clear_node(node, &arg);
-	if (arg.status_change_nid >= 0) {
+	if (node_arg.nid >= 0) {
 		kcompactd_stop(node);
 		kswapd_stop(node);
+		/* Node went memoryless. Notify consumers */
+		node_notify(NODE_REMOVED_LAST_MEMORY, &node_arg);
 	}
 
 	writeback_set_ratelimit();
 
-	memory_notify(MEM_OFFLINE, &arg);
+	memory_notify(MEM_OFFLINE, &mem_arg);
 	remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
-	mem_hotplug_done();
 	return 0;
 
 failed_removal_isolated:
 	/* pushback to free area */
-	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
-	memory_notify(MEM_CANCEL_OFFLINE, &arg);
+	undo_isolate_page_range(start_pfn, end_pfn);
+	memory_notify(MEM_CANCEL_OFFLINE, &mem_arg);
+	if (node_arg.nid != NUMA_NO_NODE)
+		node_notify(NODE_CANCEL_REMOVING_LAST_MEMORY, &node_arg);
failed_removal_pcplists_disabled:
 	lru_cache_enable();
 	zone_pcp_enable(zone);
@@ -1934,7 +2093,6 @@ failed_removal:
 		 (unsigned long long) start_pfn << PAGE_SHIFT,
 		 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
 		 reason);
-	mem_hotplug_done();
 	return ret;
 }
 
@@ -1956,12 +2114,14 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
 	return 0;
 }
 
-static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
+static int count_memory_range_altmaps_cb(struct memory_block *mem, void *arg)
 {
-	/*
-	 * If not set, continue with the next block.
-	 */
-	return mem->nr_vmemmap_pages;
+	u64 *num_altmaps = (u64 *)arg;
+
+	if (mem->altmap)
+		*num_altmaps += 1;
+
+	return 0;
 }
 
 static int check_cpu_on_node(int nid)
@@ -2030,16 +2190,33 @@ void try_offline_node(int nid)
	 * node now.
	 */
 	node_set_offline(nid);
-	unregister_one_node(nid);
+	unregister_node(nid);
 }
 EXPORT_SYMBOL(try_offline_node);
 
-static int __ref try_remove_memory(u64 start, u64 size)
+static int memory_blocks_have_altmaps(u64 start, u64 size)
+{
+	u64 num_memblocks = size / memory_block_size_bytes();
+	u64 num_altmaps = 0;
+
+	if (!mhp_memmap_on_memory())
+		return 0;
+
+	walk_memory_blocks(start, size, &num_altmaps,
+			   count_memory_range_altmaps_cb);
+
+	if (num_altmaps == 0)
+		return 0;
+
+	if (WARN_ON_ONCE(num_memblocks != num_altmaps))
+		return -EINVAL;
+
+	return 1;
+}
+
+static int try_remove_memory(u64 start, u64 size)
 {
-	struct vmem_altmap mhp_altmap = {};
-	struct vmem_altmap *altmap = NULL;
-	unsigned long nr_vmemmap_pages;
-	int rc = 0, nid = NUMA_NO_NODE;
+	int rc, nid = NUMA_NO_NODE;
 
 	BUG_ON(check_hotplug_memory_range(start, size));
 
@@ -2056,48 +2233,30 @@ static int __ref try_remove_memory(u64 start, u64 size)
 	if (rc)
 		return rc;
 
-	/*
-	 * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
-	 * the same granularity it was added - a single memory block.
-	 */
-	if (mhp_memmap_on_memory()) {
-		nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
-						      get_nr_vmemmap_pages_cb);
-		if (nr_vmemmap_pages) {
-			if (size != memory_block_size_bytes()) {
-				pr_warn("Refuse to remove %#llx - %#llx,"
-					"wrong granularity\n",
-					start, start + size);
-				return -EINVAL;
-			}
-
-			/*
-			 * Let remove_pmd_table->free_hugepage_table do the
-			 * right thing if we used vmem_altmap when hot-adding
-			 * the range.
-			 */
-			mhp_altmap.alloc = nr_vmemmap_pages;
-			altmap = &mhp_altmap;
-		}
-	}
-
 	/* remove memmap entry */
 	firmware_map_remove(start, start + size, "System RAM");
 
-	/*
-	 * Memory block device removal under the device_hotplug_lock is
-	 * a barrier against racing online attempts.
-	 */
-	remove_memory_block_devices(start, size);
-
 	mem_hotplug_begin();
 
-	arch_remove_memory(start, size, altmap);
+	rc = memory_blocks_have_altmaps(start, size);
+	if (rc < 0) {
+		mem_hotplug_done();
+		return rc;
+	} else if (!rc) {
+		/*
		 * Memory block device removal under the device_hotplug_lock is
		 * a barrier against racing online attempts.
		 * No altmaps present, do the removal directly
		 */
+		remove_memory_block_devices(start, size);
+		arch_remove_memory(start, size, NULL);
+	} else {
+		/* all memblocks in the range have altmaps */
+		remove_memory_blocks_and_altmaps(start, size);
+	}
 
-	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
-		memblock_phys_free(start, size);
+	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
 		memblock_remove(start, size);
-	}
 
 	release_mem_region_adjustable(start, size);
 
@@ -2157,7 +2316,7 @@ static int try_offline_memory_block(struct memory_block *mem, void *arg)
	 * by offlining code ... so we don't care about that.
	 */
 	page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
-	if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
+	if (page && page_zonenum(page) == ZONE_MOVABLE)
 		online_type = MMOP_ONLINE_MOVABLE;
 
 	rc = device_offline(&mem->dev);
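Editor's note: the reworked memory_hotplug.memmap_on_memory parameter above accepts Y, N, or "force" (see set_memmap_mode() and the MODULE_PARM_DESC); "force" pads the self-hosted memmap up to whole pageblocks via memory_block_memmap_on_memory_pages(), so the feature can be used even when the memmap size is not naturally aligned, at the cost of the per-block waste reported by pr_info_once(). The following is a minimal, standalone userspace sketch of that size arithmetic, not kernel code; the constants are illustrative assumptions (typical x86-64 values), not values taken from the patch.

#include <stdio.h>

#define PAGE_SIZE		4096UL		/* assumed base page size */
#define STRUCT_PAGE_SIZE	64UL		/* assumed sizeof(struct page) */
#define MEMORY_BLOCK_BYTES	(128UL << 20)	/* assumed memory block size */
#define PAGEBLOCK_NR_PAGES	512UL		/* assumed pageblock size (2 MiB) */

/* round a byte count up to full pages, like PFN_UP() */
static unsigned long pfn_up(unsigned long bytes)
{
	return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
}

/* round a page count up to a whole pageblock, like pageblock_align() */
static unsigned long pageblock_align_pages(unsigned long nr_pages)
{
	return ((nr_pages + PAGEBLOCK_NR_PAGES - 1) / PAGEBLOCK_NR_PAGES) *
	       PAGEBLOCK_NR_PAGES;
}

int main(void)
{
	unsigned long block_pages = MEMORY_BLOCK_BYTES / PAGE_SIZE;
	/* mirrors memory_block_memmap_size(): one struct page per page */
	unsigned long memmap_bytes = block_pages * STRUCT_PAGE_SIZE;
	unsigned long memmap_pages = pfn_up(memmap_bytes);
	/* mirrors the MEMMAP_ON_MEMORY_FORCE branch */
	unsigned long forced_pages = pageblock_align_pages(memmap_pages);

	printf("memmap per block: %lu pages\n", memmap_pages);
	printf("force mode uses:  %lu pages (waste: %lu pages per block)\n",
	       forced_pages, forced_pages - memmap_pages);
	return 0;
}

With these assumed defaults the memmap of a 128 MiB block is exactly 512 pages (2 MiB), already pageblock-aligned, so the computed waste is zero and plain memmap_on_memory=Y would already pass mhp_supports_memmap_on_memory(); "force" only adds padding (and waste) on configurations where the memmap does not naturally fill whole pageblocks.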
