Diffstat (limited to 'mm/memory_hotplug.c')
-rw-r--r--	mm/memory_hotplug.c	236
1 file changed, 128 insertions(+), 108 deletions(-)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 21890994c1d3..16cf9e17077e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -219,11 +219,30 @@ void put_online_mems(void)
bool movable_node_enabled = false;
-#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
-int mhp_default_online_type = MMOP_OFFLINE;
-#else
-int mhp_default_online_type = MMOP_ONLINE;
-#endif
+static int mhp_default_online_type = -1;
+int mhp_get_default_online_type(void)
+{
+ if (mhp_default_online_type >= 0)
+ return mhp_default_online_type;
+
+ if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE))
+ mhp_default_online_type = MMOP_OFFLINE;
+ else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO))
+ mhp_default_online_type = MMOP_ONLINE;
+ else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL))
+ mhp_default_online_type = MMOP_ONLINE_KERNEL;
+ else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE))
+ mhp_default_online_type = MMOP_ONLINE_MOVABLE;
+ else
+ mhp_default_online_type = MMOP_OFFLINE;
+
+ return mhp_default_online_type;
+}
+
+void mhp_set_default_online_type(int online_type)
+{
+ mhp_default_online_type = online_type;
+}
static int __init setup_memhp_default_state(char *str)
{
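Note on the hunk above: the default online type is now resolved lazily from the MHP_DEFAULT_ONLINE_TYPE_* Kconfig choice on first use, and mhp_default_online_type itself becomes static, so other code must go through the accessor pair. A minimal sketch of how the truncated setup_memhp_default_state() handler can drive the setter, assuming mhp_online_type_from_str() from drivers/base/memory.c (not shown in this diff) is available:

/* Sketch only, not part of this diff. */
static int __init setup_memhp_default_state(char *str)
{
	const int online_type = mhp_online_type_from_str(str);

	if (online_type >= 0)
		mhp_set_default_online_type(online_type);

	return 1;
}
__setup("memhp_default_state=", setup_memhp_default_state);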
@@ -366,7 +385,7 @@ struct page *pfn_to_online_page(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(pfn_to_online_page);
-int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
+int __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
struct mhp_params *params)
{
const unsigned long end_pfn = pfn + nr_pages;
@@ -524,7 +543,7 @@ static void update_pgdat_span(struct pglist_data *pgdat)
pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
}
-void __ref remove_pfn_range_from_zone(struct zone *zone,
+void remove_pfn_range_from_zone(struct zone *zone,
unsigned long start_pfn,
unsigned long nr_pages)
{
@@ -628,16 +647,10 @@ int restore_online_page_callback(online_page_callback_t callback)
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);
+/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
void generic_online_page(struct page *page, unsigned int order)
{
- /*
- * Freeing the page with debug_pagealloc enabled will try to unmap it,
- * so we should map it first. This is better than introducing a special
- * case in page freeing fast path.
- */
- debug_pagealloc_map_pages(page, 1 << order);
- __free_pages_core(page, order);
- totalram_pages_add(1UL << order);
+ __free_pages_core(page, order, MEMINIT_HOTPLUG);
}
EXPORT_SYMBOL_GPL(generic_online_page);
@@ -656,6 +669,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
* this and the first chunk to online will be pageblock_nr_pages.
*/
for (pfn = start_pfn; pfn < end_pfn;) {
+ struct page *page = pfn_to_page(pfn);
int order;
/*
@@ -670,7 +684,14 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
else
order = MAX_PAGE_ORDER;
- (*online_page_callback)(pfn_to_page(pfn), order);
+ /*
+ * Exposing the page to the buddy by freeing can cause
+ * issues with debug_pagealloc enabled: some archs don't
+ * like double-unmappings. So treat them like any pages that
+ * were allocated from the buddy.
+ */
+ debug_pagealloc_map_pages(page, 1 << order);
+ (*online_page_callback)(page, order);
pfn += (1UL << order);
}
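With the debug_pagealloc mapping moved in front of the callback invocation, every registered online_page_callback_t now receives pages that are already mapped. A short sketch of a driver-side callback (the function name is hypothetical; balloon drivers register callbacks like this through set_online_page_callback()):

/*
 * Sketch only: a hypothetical callback. Pages arrive already mapped for
 * debug_pagealloc, so the driver can hand them straight to the buddy via
 * generic_online_page() or keep them back for its own use.
 */
static void my_driver_online_page(struct page *page, unsigned int order)
{
	generic_online_page(page, order);
}

/* during driver initialisation */
int rc = set_online_page_callback(&my_driver_online_page);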
@@ -741,13 +762,13 @@ static inline void section_taint_zone_device(unsigned long pfn)
/*
* Associate the pfn range with the given zone, initializing the memmaps
* and resizing the pgdat/zone data to span the added pages. After this
- * call, all affected pages are PG_reserved.
+ * call, all affected pages are PageOffline().
*
* All aligned pageblocks are initialized to the specified migratetype
* (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
* zone stats (e.g., nr_isolate_pageblock) are touched.
*/
-void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
+void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages,
struct vmem_altmap *altmap, int migratetype)
{
@@ -846,7 +867,6 @@ static bool auto_movable_can_online_movable(int nid, struct memory_group *group,
unsigned long kernel_early_pages, movable_pages;
struct auto_movable_group_stats group_stats = {};
struct auto_movable_stats stats = {};
- pg_data_t *pgdat = NODE_DATA(nid);
struct zone *zone;
int i;
@@ -857,6 +877,8 @@ static bool auto_movable_can_online_movable(int nid, struct memory_group *group,
auto_movable_stats_account_zone(&stats, zone);
} else {
for (i = 0; i < MAX_NR_ZONES; i++) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+
zone = pgdat->node_zones + i;
if (populated_zone(zone))
auto_movable_stats_account_zone(&stats, zone);
@@ -1087,7 +1109,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
}
int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
- struct zone *zone)
+ struct zone *zone, bool mhp_off_inaccessible)
{
unsigned long end_pfn = pfn + nr_pages;
int ret, i;
@@ -1096,10 +1118,23 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
if (ret)
return ret;
+ /*
+ * Memory block is accessible at this stage and hence poison the struct
+ * pages now. If the memory block is accessible during memory hotplug
+ * addition phase, then page poisoning is already performed in
+ * sparse_add_section().
+ */
+ if (mhp_off_inaccessible)
+ page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages);
+
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
- for (i = 0; i < nr_pages; i++)
- SetPageVmemmapSelfHosted(pfn_to_page(pfn + i));
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pfn_to_page(pfn + i);
+
+ __ClearPageOffline(page);
+ SetPageVmemmapSelfHosted(page);
+ }
/*
* It might be that the vmemmap_pages fully span sections. If that is
@@ -1135,7 +1170,7 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
/*
* Must be called with mem_hotplug_lock in write mode.
*/
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
+int online_pages(unsigned long pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group)
{
unsigned long flags;
@@ -1225,7 +1260,7 @@ failed_addition:
}
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-static pg_data_t __ref *hotadd_init_pgdat(int nid)
+static pg_data_t *hotadd_init_pgdat(int nid)
{
struct pglist_data *pgdat;
@@ -1312,7 +1347,7 @@ static int check_hotplug_memory_range(u64 start, u64 size)
static int online_memory_block(struct memory_block *mem, void *arg)
{
- mem->online_type = mhp_default_online_type;
+ mem->online_type = mhp_get_default_online_type();
return device_online(&mem->dev);
}
@@ -1328,7 +1363,7 @@ static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
}
#endif
-static bool mhp_supports_memmap_on_memory(unsigned long size)
+bool mhp_supports_memmap_on_memory(void)
{
unsigned long vmemmap_size = memory_block_memmap_size();
unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
@@ -1337,17 +1372,11 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
* Besides having arch support and the feature enabled at runtime, we
* need a few more assumptions to hold true:
*
- * a) We span a single memory block: memory onlining/offlining happens
- * in memory block granularity. We don't want the vmemmap of online
- * memory blocks to reside on offline memory blocks. In the future,
- * we might want to support variable-sized memory blocks to make the
- * feature more versatile.
- *
- * b) The vmemmap pages span complete PMDs: We don't want vmemmap code
+ * a) The vmemmap pages span complete PMDs: We don't want vmemmap code
* to populate memory from the altmap for unrelated parts (i.e.,
* other memory blocks)
*
- * c) The vmemmap pages (and thereby the pages that will be exposed to
+ * b) The vmemmap pages (and thereby the pages that will be exposed to
* the buddy) have to cover full pageblocks: memory onlining/offlining
* code requires applicable ranges to be page-aligned, for example, to
* set the migratetypes properly.
@@ -1359,7 +1388,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
* altmap as an alternative source of memory, and we do not exactly
* populate a single PMD.
*/
- if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
+ if (!mhp_memmap_on_memory())
return false;
/*
@@ -1382,8 +1411,9 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
return arch_supports_memmap_on_memory(vmemmap_size);
}
+EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory);
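The export above goes together with dropping the single-memory-block size check: callers now verify support themselves before requesting a self-hosted memmap. A rough sketch of such a call site, loosely modelled on virtio-mem (group_id and addr are illustrative placeholders):

/* Sketch only: driver-side decision to place the memmap on the new range. */
mhp_t mhp_flags = MHP_NID_IS_MGID;
int rc;

if (mhp_supports_memmap_on_memory())
	mhp_flags |= MHP_MEMMAP_ON_MEMORY;

rc = add_memory_driver_managed(group_id, addr, memory_block_size_bytes(),
			       "System RAM (driver managed)", mhp_flags);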
-static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)
+static void remove_memory_blocks_and_altmaps(u64 start, u64 size)
{
unsigned long memblock_size = memory_block_size_bytes();
u64 cur_start;
@@ -1415,7 +1445,7 @@ static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)
}
static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
- u64 start, u64 size)
+ u64 start, u64 size, mhp_t mhp_flags)
{
unsigned long memblock_size = memory_block_size_bytes();
u64 cur_start;
@@ -1431,6 +1461,8 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
};
mhp_altmap.free = memory_block_memmap_on_memory_pages();
+ if (mhp_flags & MHP_OFFLINE_INACCESSIBLE)
+ mhp_altmap.inaccessible = true;
params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap),
GFP_KERNEL);
if (!params.altmap) {
@@ -1468,7 +1500,7 @@ out:
*
* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
*/
-int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
+int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
enum memblock_flags memblock_flags = MEMBLOCK_NONE;
@@ -1515,8 +1547,8 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
* Self hosted memmap array
*/
if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) &&
- mhp_supports_memmap_on_memory(memory_block_size_bytes())) {
- ret = create_altmaps_and_memory_blocks(nid, group, start, size);
+ mhp_supports_memmap_on_memory()) {
+ ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags);
if (ret)
goto error;
} else {
@@ -1562,7 +1594,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
merge_system_ram_resource(res);
/* online pages if requested */
- if (mhp_default_online_type != MMOP_OFFLINE)
+ if (mhp_get_default_online_type() != MMOP_OFFLINE)
walk_memory_blocks(start, size, NULL, online_memory_block);
return ret;
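MHP_OFFLINE_INACCESSIBLE is forwarded from here into create_altmaps_and_memory_blocks(), which marks the altmap inaccessible so that struct-page poisoning is deferred to mhp_init_memmap_on_memory() (see the earlier hunk). A hedged sketch of a caller adding such not-yet-accessible (e.g. standby) memory; nid and start are placeholders:

/*
 * Sketch only: the flag is meant to be combined with MHP_MEMMAP_ON_MEMORY,
 * since the memmap then lives on the range that only becomes accessible at
 * online time.
 */
int rc = add_memory(nid, start, memory_block_size_bytes(),
		    MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE);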
@@ -1575,7 +1607,7 @@ error_mem_hotplug_end:
}
/* requires device_hotplug_lock, see add_memory_resource() */
-int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
+int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
struct resource *res;
int ret;
@@ -1676,7 +1708,7 @@ struct range __weak arch_get_mappable_range(void)
struct range mhp_get_pluggable_range(bool need_mapping)
{
- const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
+ const u64 max_phys = DIRECT_MAP_PHYSMEM_END;
struct range mhp_range;
if (need_mapping) {
@@ -1725,8 +1757,8 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
unsigned long pfn;
for (pfn = start; pfn < end; pfn++) {
- struct page *page, *head;
- unsigned long skip;
+ struct page *page;
+ struct folio *folio;
if (!pfn_valid(pfn))
continue;
@@ -1747,7 +1779,7 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
if (!PageHuge(page))
continue;
- head = compound_head(page);
+ folio = page_folio(page);
/*
* This test is racy as we hold no reference or lock. The
* hugetlb page could have been free'ed and head is no longer
* a hugetlb page by the time we obtain the lock. But in such
@@ -1755,10 +1787,9 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
* cases false positives and negatives are possible. Calling
* code must deal with these scenarios.
*/
- if (HPageMigratable(head))
+ if (folio_test_hugetlb_migratable(folio))
goto found;
- skip = compound_nr(head) - (pfn - page_to_pfn(head));
- pfn += skip - 1;
+ pfn |= folio_nr_pages(folio) - 1;
}
return -ENOENT;
found:
@@ -1768,73 +1799,64 @@ found:
static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
+ struct folio *folio;
unsigned long pfn;
- struct page *page, *head;
LIST_HEAD(source);
static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- struct folio *folio;
- bool isolated;
+ struct page *page;
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
folio = page_folio(page);
- head = &folio->page;
-
- if (PageHuge(page)) {
- pfn = page_to_pfn(head) + compound_nr(head) - 1;
- isolate_hugetlb(folio, &source);
- continue;
- } else if (PageTransHuge(page))
- pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;
/*
- * HWPoison pages have elevated reference counts so the migration would
- * fail on them. It also doesn't make any sense to migrate them in the
- * first place. Still try to unmap such a page in case it is still mapped
- * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep
- * the unmap as the catch all safety net).
+ * No reference or lock is held on the folio, so it might
+ * be modified concurrently (e.g. split). As such,
+ * folio_nr_pages() may read garbage. This is fine as the outer
+ * loop will revisit the split folio later.
*/
- if (PageHWPoison(page)) {
+ if (folio_test_large(folio))
+ pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
+
+ if (!folio_try_get(folio))
+ continue;
+
+ if (unlikely(page_folio(page) != folio))
+ goto put_folio;
+
+ if (folio_test_hwpoison(folio) ||
+ (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
if (WARN_ON(folio_test_lru(folio)))
folio_isolate_lru(folio);
- if (folio_mapped(folio))
- try_to_unmap(folio, TTU_IGNORE_MLOCK);
- continue;
- }
+ if (folio_mapped(folio)) {
+ folio_lock(folio);
+ unmap_poisoned_folio(folio, pfn, false);
+ folio_unlock(folio);
+ }
- if (!get_page_unless_zero(page))
- continue;
- /*
- * We can skip free pages. And we can deal with pages on
- * LRU and non-lru movable pages.
- */
- if (PageLRU(page))
- isolated = isolate_lru_page(page);
- else
- isolated = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
- if (isolated) {
- list_add_tail(&page->lru, &source);
- if (!__PageMovable(page))
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_lru(page));
+ goto put_folio;
+ }
- } else {
+ if (!isolate_folio_to_list(folio, &source)) {
if (__ratelimit(&migrate_rs)) {
- pr_warn("failed to isolate pfn %lx\n", pfn);
+ pr_warn("failed to isolate pfn %lx\n",
+ page_to_pfn(page));
dump_page(page, "isolation failed");
}
}
- put_page(page);
+put_folio:
+ folio_put(folio);
}
if (!list_empty(&source)) {
nodemask_t nmask = node_states[N_MEMORY];
struct migration_target_control mtc = {
.nmask = &nmask,
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ .gfp_mask = GFP_KERNEL | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ .reason = MR_MEMORY_HOTPLUG,
};
int ret;
@@ -1842,7 +1864,7 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
* We have checked that migration range is on a single zone so
* we can use the nid of the first page to all the others.
*/
- mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
+ mtc.nid = folio_nid(list_first_entry(&source, struct folio, lru));
/*
* try to allocate from a different node but reuse this node
@@ -1855,11 +1877,12 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
ret = migrate_pages(&source, alloc_migration_target, NULL,
(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL);
if (ret) {
- list_for_each_entry(page, &source, lru) {
+ list_for_each_entry(folio, &source, lru) {
if (__ratelimit(&migrate_rs)) {
pr_warn("migrating pfn %lx failed ret:%d\n",
- page_to_pfn(page), ret);
- dump_page(page, "migration failure");
+ folio_pfn(folio), ret);
+ dump_page(&folio->page,
+ "migration failure");
}
}
putback_movable_pages(&source);
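The rewritten loop leans on the usual speculative folio reference pattern: a folio looked up without holding a reference may be split or freed at any point, so it can only be trusted after folio_try_get() plus a re-check. In isolation the pattern looks like this (generic sketch, not code from this patch):

/* Sketch of the speculative folio lookup used above (void-function context). */
struct folio *folio = page_folio(page);

if (!folio_try_get(folio))
	return;				/* folio was freed concurrently */

if (unlikely(page_folio(page) != folio)) {
	folio_put(folio);		/* page was split or reused meanwhile */
	return;
}

/* ... folio state can be inspected here ... */
folio_put(folio);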
@@ -1934,11 +1957,11 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
/*
* Must be called with mem_hotplug_lock in write mode.
*/
-int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
+int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group)
{
const unsigned long end_pfn = start_pfn + nr_pages;
- unsigned long pfn, system_ram_pages = 0;
+ unsigned long pfn, managed_pages, system_ram_pages = 0;
const int node = zone_to_nid(zone);
unsigned long flags;
struct memory_notify arg;
@@ -1960,9 +1983,9 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
* Don't allow to offline memory blocks that contain holes.
* Consequently, memory blocks with holes can never get onlined
* via the hotplug path - online_pages() - as hotplugged memory has
- * no holes. This way, we e.g., don't have to worry about marking
- * memory holes PG_reserved, don't need pfn_valid() checks, and can
- * avoid using walk_system_ram_range() later.
+ * no holes. This way, we don't have to worry about memory holes,
+ * don't need pfn_valid() checks, and can avoid using
+ * walk_system_ram_range() later.
*/
walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
count_system_ram_pages_cb);
@@ -1994,8 +2017,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE,
- MEMORY_OFFLINE | REPORT_FAILURE,
- GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
+ MEMORY_OFFLINE | REPORT_FAILURE);
if (ret) {
reason = "failure to isolate range";
goto failed_removal_pcplists_disabled;
@@ -2044,11 +2066,11 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
}
/*
- * Dissolve free hugepages in the memory block before doing
+ * Dissolve free hugetlb folios in the memory block before doing
* offlining actually in order to make hugetlbfs's object
* counting consistent.
*/
- ret = dissolve_free_huge_pages(start_pfn, end_pfn);
+ ret = dissolve_free_hugetlb_folios(start_pfn, end_pfn);
if (ret) {
reason = "failure to dissolve huge pages";
goto failed_removal_isolated;
@@ -2059,7 +2081,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
} while (ret);
/* Mark all sections offline and remove free pages from the buddy. */
- __offline_isolated_pages(start_pfn, end_pfn);
+ managed_pages = __offline_isolated_pages(start_pfn, end_pfn);
pr_debug("Offlined Pages %ld\n", nr_pages);
/*
@@ -2075,7 +2097,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
zone_pcp_enable(zone);
/* removal success */
- adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
+ adjust_managed_page_count(pfn_to_page(start_pfn), -managed_pages);
adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages);
/* reinitialise watermarks and update pcp limits */
@@ -2235,7 +2257,7 @@ static int memory_blocks_have_altmaps(u64 start, u64 size)
return 1;
}
-static int __ref try_remove_memory(u64 start, u64 size)
+static int try_remove_memory(u64 start, u64 size)
{
int rc, nid = NUMA_NO_NODE;
@@ -2276,10 +2298,8 @@ static int __ref try_remove_memory(u64 start, u64 size)
remove_memory_blocks_and_altmaps(start, size);
}
- if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
- memblock_phys_free(start, size);
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
memblock_remove(start, size);
- }
release_mem_region_adjustable(start, size);
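For completeness, the removal path above is normally reached from a driver via offline_and_remove_memory(), which offlines the blocks and then ends up in try_remove_memory(), including the simplified memblock_remove() cleanup. A hedged sketch of such a teardown call (addr is a placeholder):

/* Sketch only, not part of this diff. */
int rc = offline_and_remove_memory(addr, memory_block_size_bytes());

if (rc)
	pr_warn("failed to offline and remove memory: %d\n", rc);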