author		Linus Torvalds <torvalds@linux-foundation.org>	2023-08-29 14:25:26 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2023-08-29 14:25:26 -0700
commit		b96a3e9142fdf346b05b20e867b4f0dfca119e96 (patch)
tree		b338a8f8930abc24888fc3871c6627f6ad46e23b /mm/memory_hotplug.c
parent		651a00bc56403161351090a9d7ddbd7095975324 (diff)
parent		52ae298e3e5c9be5bb95e1c6d9199e5210f2a156 (diff)
Merge tag 'mm-stable-2023-08-28-18-26' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:

 - Some swap cleanups from Ma Wupeng ("fix WARN_ON in add_to_avail_list")

 - Peter Xu has a series ("mm/gup: Unify hugetlb, speed up thp") which reduces the special-case code for handling hugetlb pages in GUP. It also speeds up GUP handling of transparent hugepages.

 - Peng Zhang provides some maple tree speedups ("Optimize the fast path of mas_store()").

 - Sergey Senozhatsky has improved the performance of zsmalloc during compaction ("zsmalloc: small compaction improvements").

 - Domenico Cerasuolo has developed additional selftest code for zswap ("selftests: cgroup: add zswap test program").

 - xu xin has done some work on KSM's handling of zero pages. These changes are mainly to enable the user to better understand the effectiveness of KSM's treatment of zero pages ("ksm: support tracking KSM-placed zero-pages").

 - Jeff Xu has fixed the behaviour of memfd's MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED sysctl ("mm/memfd: fix sysctl MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED").

 - David Howells has fixed an fscache optimization ("mm, netfs, fscache: Stop read optimisation when folio removed from pagecache").

 - Axel Rasmussen has given userfaultfd the ability to simulate memory poisoning ("add UFFDIO_POISON to simulate memory poisoning with UFFD").

 - Miaohe Lin has contributed some routine maintenance work on the memory-failure code ("mm: memory-failure: remove unneeded PageHuge() check").

 - Peng Zhang has contributed some maintenance work on the maple tree code ("Improve the validation for maple tree and some cleanup").

 - Hugh Dickins has optimized the collapsing of shmem or file pages into THPs ("mm: free retracted page table by RCU").

 - Jiaqi Yan has a patch series which permits us to use the healthy subpages within a hardware poisoned huge page for general purposes ("Improve hugetlbfs read on HWPOISON hugepages").

 - Kemeng Shi has done some maintenance work on the pagetable-check code ("Remove unused parameters in page_table_check").

 - More folioification work from Matthew Wilcox ("More filesystem folio conversions for 6.6"), ("Followup folio conversions for zswap"). And from ZhangPeng ("Convert several functions in page_io.c to use a folio").

 - page_ext cleanups from Kemeng Shi ("minor cleanups for page_ext").

 - Baoquan He has converted some architectures to use the GENERIC_IOREMAP ioremap()/iounmap() code ("mm: ioremap: Convert architectures to take GENERIC_IOREMAP way").

 - Anshuman Khandual has optimized arm64 tlb shootdown ("arm64: support batched/deferred tlb shootdown during page reclamation/migration").

 - Better maple tree lockdep checking from Liam Howlett ("More strict maple tree lockdep"). Liam also developed some efficiency improvements ("Reduce preallocations for maple tree").

 - Cleanup and optimization to the secondary IOMMU TLB invalidation, from Alistair Popple ("Invalidate secondary IOMMU TLB on permission upgrade").

 - Ryan Roberts fixes some arm64 MM selftest issues ("selftests/mm fixes for arm64").

 - Kemeng Shi provides some maintenance work on the compaction code ("Two minor cleanups for compaction").

 - Some reduction in mmap_lock pressure from Matthew Wilcox ("Handle most file-backed faults under the VMA lock").

 - Aneesh Kumar contributes code to use the vmemmap optimization for DAX on ppc64, under some circumstances ("Add support for DAX vmemmap optimization for ppc64").

 - page-ext cleanups from Kemeng Shi ("add page_ext_data to get client data in page_ext"), ("minor cleanups to page_ext header").

 - Some zswap cleanups from Johannes Weiner ("mm: zswap: three cleanups").

 - kmsan cleanups from ZhangPeng ("minor cleanups for kmsan").

 - VMA handling cleanups from Kefeng Wang ("mm: convert to vma_is_initial_heap/stack()").

 - DAMON feature work from SeongJae Park ("mm/damon/sysfs-schemes: implement DAMOS tried total bytes file"), ("Extend DAMOS filters for address ranges and DAMON monitoring targets").

 - Compaction work from Kemeng Shi ("Fixes and cleanups to compaction").

 - Liam Howlett has improved the maple tree node replacement code ("maple_tree: Change replacement strategy").

 - ZhangPeng has a general code cleanup - use the K() macro more widely ("cleanup with helper macro K()").

 - Aneesh Kumar brings memmap-on-memory to ppc64 ("Add support for memmap on memory feature on ppc64").

 - pagealloc cleanups from Kemeng Shi ("Two minor cleanups for pcp list in page_alloc"), ("Two minor cleanups for get pageblock migratetype").

 - Vishal Moola introduces a memory descriptor for page table tracking, "struct ptdesc" ("Split ptdesc from struct page").

 - memfd selftest maintenance work from Aleksa Sarai ("memfd: cleanups for vm.memfd_noexec").

 - MM include file rationalization from Hugh Dickins ("arch: include asm/cacheflush.h in asm/hugetlb.h").

 - THP debug output fixes from Hugh Dickins ("mm,thp: fix sloppy text output").

 - kmemleak improvements from Xiaolei Wang ("mm/kmemleak: use object_cache instead of kmemleak_initialized").

 - More folio-related cleanups from Matthew Wilcox ("Remove _folio_dtor and _folio_order").

 - A VMA locking scalability improvement from Suren Baghdasaryan ("Per-VMA lock support for swap and userfaults").

 - pagetable handling cleanups from Matthew Wilcox ("New page table range API").

 - A batch of swap/thp cleanups from David Hildenbrand ("mm/swap: stop using page->private on tail pages for THP_SWAP + cleanups").

 - Cleanups and speedups to the hugetlb fault handling from Matthew Wilcox ("Change calling convention for ->huge_fault").

 - Matthew Wilcox has also done some maintenance work on the MM subsystem documentation ("Improve mm documentation").

* tag 'mm-stable-2023-08-28-18-26' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (489 commits)
  maple_tree: shrink struct maple_tree
  maple_tree: clean up mas_wr_append()
  secretmem: convert page_is_secretmem() to folio_is_secretmem()
  nios2: fix flush_dcache_page() for usage from irq context
  hugetlb: add documentation for vma_kernel_pagesize()
  mm: add orphaned kernel-doc to the rst files.
  mm: fix clean_record_shared_mapping_range kernel-doc
  mm: fix get_mctgt_type() kernel-doc
  mm: fix kernel-doc warning from tlb_flush_rmaps()
  mm: remove enum page_entry_size
  mm: allow ->huge_fault() to be called without the mmap_lock held
  mm: move PMD_ORDER to pgtable.h
  mm: remove checks for pte_index
  memcg: remove duplication detection for mem_cgroup_uncharge_swap
  mm/huge_memory: work on folio->swap instead of page->private when splitting folio
  mm/swap: inline folio_set_swap_entry() and folio_swap_entry()
  mm/swap: use dedicated entry for swap in folio
  mm/swap: stop using page->private on tail pages for THP_SWAP
  selftests/mm: fix WARNING comparing pointer to 0
  selftests: cgroup: fix test_kmem_memcg_deletion kernel mem check
  ...
Diffstat (limited to 'mm/memory_hotplug.c')
-rw-r--r--	mm/memory_hotplug.c	192
1 file changed, 154 insertions(+), 38 deletions(-)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3f231cf1b410..1b03f4ec6fd2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -41,17 +41,83 @@
#include "internal.h"
#include "shuffle.h"
+enum {
+ MEMMAP_ON_MEMORY_DISABLE = 0,
+ MEMMAP_ON_MEMORY_ENABLE,
+ MEMMAP_ON_MEMORY_FORCE,
+};
+
+static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
+
+static inline unsigned long memory_block_memmap_size(void)
+{
+ return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
+}
+
+static inline unsigned long memory_block_memmap_on_memory_pages(void)
+{
+ unsigned long nr_pages = PFN_UP(memory_block_memmap_size());
+
+ /*
+ * In "forced" memmap_on_memory mode, we add extra pages to align the
+ * vmemmap size to cover full pageblocks. That way, we can add memory
+ * even if the vmemmap size is not properly aligned, however, we might waste
+ * memory.
+ */
+ if (memmap_mode == MEMMAP_ON_MEMORY_FORCE)
+ return pageblock_align(nr_pages);
+ return nr_pages;
+}
+
#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
/*
* memory_hotplug.memmap_on_memory parameter
*/
-static bool memmap_on_memory __ro_after_init;
-module_param(memmap_on_memory, bool, 0444);
-MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
+static int set_memmap_mode(const char *val, const struct kernel_param *kp)
+{
+ int ret, mode;
+ bool enabled;
+
+ if (sysfs_streq(val, "force") || sysfs_streq(val, "FORCE")) {
+ mode = MEMMAP_ON_MEMORY_FORCE;
+ } else {
+ ret = kstrtobool(val, &enabled);
+ if (ret < 0)
+ return ret;
+ if (enabled)
+ mode = MEMMAP_ON_MEMORY_ENABLE;
+ else
+ mode = MEMMAP_ON_MEMORY_DISABLE;
+ }
+ *((int *)kp->arg) = mode;
+ if (mode == MEMMAP_ON_MEMORY_FORCE) {
+ unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
+
+ pr_info_once("Memory hotplug will waste %ld pages in each memory block\n",
+ memmap_pages - PFN_UP(memory_block_memmap_size()));
+ }
+ return 0;
+}
+
+static int get_memmap_mode(char *buffer, const struct kernel_param *kp)
+{
+ if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_FORCE)
+ return sprintf(buffer, "force\n");
+ return param_get_bool(buffer, kp);
+}
+
+static const struct kernel_param_ops memmap_mode_ops = {
+ .set = set_memmap_mode,
+ .get = get_memmap_mode,
+};
+module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444);
+MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n"
+ "With value \"force\" it could result in memory wastage due "
+ "to memmap size limitations (Y/N/force)");
static inline bool mhp_memmap_on_memory(void)
{
- return memmap_on_memory;
+ return memmap_mode != MEMMAP_ON_MEMORY_DISABLE;
}
#else
static inline bool mhp_memmap_on_memory(void)
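To make the arithmetic of the two new helpers concrete, here is a minimal user-space sketch of the same computation. The constants (128 MiB memory block, 4 KiB pages, 64-byte struct page, 512-page pageblock) are illustrative assumptions, not values taken from this patch:

#include <stdio.h>

#define PAGE_SIZE	4096UL			/* assumed */
#define BLOCK_SIZE	(128UL << 20)		/* assumed 128 MiB memory block */
#define STRUCT_PAGE	64UL			/* assumed sizeof(struct page) */
#define PAGEBLOCK	512UL			/* assumed pageblock_nr_pages */

int main(void)
{
	/* memory_block_memmap_size(): one struct page per PFN in the block */
	unsigned long memmap_size = (BLOCK_SIZE / PAGE_SIZE) * STRUCT_PAGE;
	/* PFN_UP(): round the memmap up to whole pages */
	unsigned long nr_pages = (memmap_size + PAGE_SIZE - 1) / PAGE_SIZE;
	/* pageblock_align(): "force" mode rounds up to a full pageblock */
	unsigned long forced = (nr_pages + PAGEBLOCK - 1) / PAGEBLOCK * PAGEBLOCK;

	/* prints: memmap 2097152 bytes, 512 pages, 512 pages in force mode */
	printf("memmap %lu bytes, %lu pages, %lu pages in force mode\n",
	       memmap_size, nr_pages, forced);
	return 0;
}

With these assumed numbers the vmemmap is exactly one pageblock, so force mode pads nothing; with, say, a 56-byte struct page it would round 448 pages up to 512, which is the waste the pr_info_once() above reports.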
@@ -1247,11 +1313,22 @@ static int online_memory_block(struct memory_block *mem, void *arg)
 	return device_online(&mem->dev);
 }
 
-bool mhp_supports_memmap_on_memory(unsigned long size)
+#ifndef arch_supports_memmap_on_memory
+static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
+{
+	/*
+	 * As default, we want the vmemmap to span a complete PMD such that we
+	 * can map the vmemmap using a single PMD if supported by the
+	 * architecture.
+	 */
+	return IS_ALIGNED(vmemmap_size, PMD_SIZE);
+}
+#endif
+
+static bool mhp_supports_memmap_on_memory(unsigned long size)
 {
-	unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
-	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
-	unsigned long remaining_size = size - vmemmap_size;
+	unsigned long vmemmap_size = memory_block_memmap_size();
+	unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
 
 	/*
 	 * Besides having arch support and the feature enabled at runtime, we
@@ -1279,10 +1356,28 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
 	 * altmap as an alternative source of memory, and we do not exactly
 	 * populate a single PMD.
 	 */
-	return mhp_memmap_on_memory() &&
-	       size == memory_block_size_bytes() &&
-	       IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
-	       IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
+	if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
+		return false;
+
+	/*
+	 * Make sure the vmemmap allocation is fully contained
+	 * so that we always allocate vmemmap memory from altmap area.
+	 */
+	if (!IS_ALIGNED(vmemmap_size, PAGE_SIZE))
+		return false;
+
+	/*
+	 * start pfn should be pageblock_nr_pages aligned for correctly
+	 * setting migrate types
+	 */
+	if (!pageblock_aligned(memmap_pages))
+		return false;
+
+	if (memmap_pages == PHYS_PFN(memory_block_size_bytes()))
+		/* No effective hotplugged memory doesn't make sense. */
+		return false;
+
+	return arch_supports_memmap_on_memory(vmemmap_size);
 }
 
 /*
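The rewritten predicate replaces one compound return with four separate rejection tests, and delegates the PMD test to the overridable arch_supports_memmap_on_memory(). Under the same illustrative constants as before (4 KiB pages, 2 MiB PMD, 512-page pageblock, 128 MiB block), a sketch of those tests; aligned() here is a stand-in for the kernel's IS_ALIGNED()/pageblock_aligned():

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL		/* assumed */
#define PMD_SIZE	(2UL << 20)	/* assumed: the default arch check */
#define PAGEBLOCK	512UL		/* assumed pageblock_nr_pages */
#define BLOCK_PFNS	32768UL		/* assumed PFNs per memory block */

static bool aligned(unsigned long x, unsigned long a)
{
	return (x & (a - 1)) == 0;	/* a must be a power of two */
}

int main(void)
{
	unsigned long vmemmap_size = BLOCK_PFNS * 64;		/* 2 MiB */
	unsigned long memmap_pages = vmemmap_size / PAGE_SIZE;	/* 512 */

	printf("fully page-contained: %d\n", aligned(vmemmap_size, PAGE_SIZE));
	printf("pageblock aligned:    %d\n", aligned(memmap_pages, PAGEBLOCK));
	printf("leaves usable memory: %d\n", memmap_pages != BLOCK_PFNS);
	printf("default arch check:   %d\n", aligned(vmemmap_size, PMD_SIZE));
	return 0;
}

All four tests print 1 for this configuration, so such a block would qualify for memmap-on-memory.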
@@ -1295,7 +1390,10 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 {
 	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
 	enum memblock_flags memblock_flags = MEMBLOCK_NONE;
-	struct vmem_altmap mhp_altmap = {};
+	struct vmem_altmap mhp_altmap = {
+		.base_pfn = PHYS_PFN(res->start),
+		.end_pfn = PHYS_PFN(res->end),
+	};
 	struct memory_group *group = NULL;
 	u64 start, size;
 	bool new_node = false;
@@ -1339,26 +1437,29 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 	 * Self hosted memmap array
 	 */
 	if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
-		if (!mhp_supports_memmap_on_memory(size)) {
-			ret = -EINVAL;
-			goto error;
+		if (mhp_supports_memmap_on_memory(size)) {
+			mhp_altmap.free = memory_block_memmap_on_memory_pages();
+			params.altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL);
+			if (!params.altmap) {
+				ret = -ENOMEM;
+				goto error;
+			}
+
+			memcpy(params.altmap, &mhp_altmap, sizeof(mhp_altmap));
 		}
-		mhp_altmap.free = PHYS_PFN(size);
-		mhp_altmap.base_pfn = PHYS_PFN(start);
-		params.altmap = &mhp_altmap;
+		/* fallback to not using altmap */
 	}
 
 	/* call arch's memory hotadd */
 	ret = arch_add_memory(nid, start, size, &params);
 	if (ret < 0)
-		goto error;
+		goto error_free;
 
 	/* create memory block devices after memory was added */
-	ret = create_memory_block_devices(start, size, mhp_altmap.alloc,
-					  group);
+	ret = create_memory_block_devices(start, size, params.altmap, group);
 	if (ret) {
 		arch_remove_memory(start, size, NULL);
-		goto error;
+		goto error_free;
 	}
 
 	if (new_node) {
@@ -1395,6 +1496,8 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 	walk_memory_blocks(start, size, NULL, online_memory_block);
 
 	return ret;
+error_free:
+	kfree(params.altmap);
 error:
 	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
 		memblock_remove(start, size);
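The new error_free label is reached even when no altmap was allocated (the fallback path leaves params.altmap NULL), which is safe because kfree(NULL) is a no-op. A small stand-alone sketch of that unwind pattern, using user-space malloc()/free() and hypothetical _demo types:

#include <stdlib.h>

struct params_demo { void *altmap; };	/* stand-in for struct mhp_params */

static int add_memory_demo(int want_altmap, int arch_add_fails)
{
	struct params_demo params = { .altmap = NULL };

	if (want_altmap) {
		params.altmap = malloc(64);
		if (!params.altmap)
			return -1;
	}
	if (arch_add_fails)
		goto error_free;
	/* In the kernel, ownership moves to the memory block device here;
	 * the demo simply frees to stay leak-free. */
	free(params.altmap);
	return 0;

error_free:
	free(params.altmap);	/* free(NULL) is a no-op, like kfree(NULL) */
	return -1;
}

int main(void)
{
	add_memory_demo(1, 1);	/* exercises the error_free path */
	add_memory_demo(0, 1);	/* error_free with altmap == NULL: still safe */
	return 0;
}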
@@ -1843,6 +1946,11 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	do {
 		pfn = start_pfn;
 		do {
+			/*
+			 * Historically we always checked for any signal and
+			 * can't limit it to fatal signals without eventually
+			 * breaking user space.
+			 */
 			if (signal_pending(current)) {
 				ret = -EINTR;
 				reason = "signal backoff";
@@ -1956,12 +2064,18 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
 	return 0;
 }
 
-static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
+static int test_has_altmap_cb(struct memory_block *mem, void *arg)
 {
+	struct memory_block **mem_ptr = (struct memory_block **)arg;
 	/*
-	 * If not set, continue with the next block.
+	 * return the memblock if we have altmap
+	 * and break callback.
 	 */
-	return mem->nr_vmemmap_pages;
+	if (mem->altmap) {
+		*mem_ptr = mem;
+		return 1;
+	}
+	return 0;
 }
 
 static int check_cpu_on_node(int nid)
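The rewritten callback relies on a property of walk_memory_blocks(): the walk stops at the first callback that returns non-zero and propagates that value, which is how a single matching block can be handed back through the opaque arg pointer. A minimal sketch of that contract, with simplified stand-in types:

#include <stdio.h>

struct memory_block_demo { void *altmap; };	/* stand-in type */

typedef int (*walk_cb)(struct memory_block_demo *mem, void *arg);

/* Visits blocks in order; stops at and propagates the first non-zero
 * return, mirroring the walk_memory_blocks() contract assumed above. */
static int walk_blocks_demo(struct memory_block_demo *blocks, int n,
			    void *arg, walk_cb fn)
{
	for (int i = 0; i < n; i++) {
		int ret = fn(&blocks[i], arg);
		if (ret)
			return ret;
	}
	return 0;
}

static int has_altmap_cb(struct memory_block_demo *mem, void *arg)
{
	struct memory_block_demo **out = arg;

	if (mem->altmap) {
		*out = mem;	/* report the matching block to the caller */
		return 1;	/* break the walk */
	}
	return 0;
}

int main(void)
{
	int dummy;
	struct memory_block_demo blocks[3] = { { 0 }, { &dummy }, { 0 } };
	struct memory_block_demo *mem = NULL;

	if (walk_blocks_demo(blocks, 3, &mem, has_altmap_cb))
		printf("found block with altmap: #%ld\n", (long)(mem - blocks));
	return 0;
}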
@@ -2036,10 +2150,9 @@ EXPORT_SYMBOL(try_offline_node);
 static int __ref try_remove_memory(u64 start, u64 size)
 {
-	struct vmem_altmap mhp_altmap = {};
-	struct vmem_altmap *altmap = NULL;
-	unsigned long nr_vmemmap_pages;
+	struct memory_block *mem;
 	int rc = 0, nid = NUMA_NO_NODE;
+	struct vmem_altmap *altmap = NULL;
 
 	BUG_ON(check_hotplug_memory_range(start, size));
@@ -2061,23 +2174,20 @@ static int __ref try_remove_memory(u64 start, u64 size)
 	 * the same granularity it was added - a single memory block.
 	 */
 	if (mhp_memmap_on_memory()) {
-		nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
-						      get_nr_vmemmap_pages_cb);
-		if (nr_vmemmap_pages) {
+		rc = walk_memory_blocks(start, size, &mem, test_has_altmap_cb);
+		if (rc) {
 			if (size != memory_block_size_bytes()) {
 				pr_warn("Refuse to remove %#llx - %#llx,"
 					"wrong granularity\n",
 					start, start + size);
 				return -EINVAL;
 			}
-
+			altmap = mem->altmap;
 			/*
-			 * Let remove_pmd_table->free_hugepage_table do the
-			 * right thing if we used vmem_altmap when hot-adding
-			 * the range.
+			 * Mark altmap NULL so that we can add a debug
+			 * check on memblock free.
 			 */
-			mhp_altmap.alloc = nr_vmemmap_pages;
-			altmap = &mhp_altmap;
+			mem->altmap = NULL;
 		}
 	}
@@ -2094,6 +2204,12 @@ static int __ref try_remove_memory(u64 start, u64 size)
 	arch_remove_memory(start, size, altmap);
 
+	/* Verify that all vmemmap pages have actually been freed. */
+	if (altmap) {
+		WARN(altmap->alloc, "Altmap not fully unmapped");
+		kfree(altmap);
+	}
+
 	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
 		memblock_phys_free(start, size);
 		memblock_remove(start, size);
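Taken together, these hunks turn the altmap from a stack value copied field-by-field into a heap object whose ownership follows the memory block: add_memory_resource() allocates it, the block device keeps it in mem->altmap, and try_remove_memory() detaches, sanity-checks and frees it. A compressed user-space sketch of that lifecycle (all _demo names are simplified stand-ins for the kernel code above):

#include <stdio.h>
#include <stdlib.h>

struct altmap_demo { unsigned long free, alloc; };
struct block_demo { struct altmap_demo *altmap; };

/* add_memory_resource(): heap-allocate and hand ownership to the block */
static int add_block_demo(struct block_demo *mem, unsigned long memmap_pages)
{
	mem->altmap = malloc(sizeof(*mem->altmap));
	if (!mem->altmap)
		return -1;
	mem->altmap->free = memmap_pages;
	mem->altmap->alloc = 0;
	return 0;
}

/* try_remove_memory(): detach the altmap, verify it, then free it */
static void remove_block_demo(struct block_demo *mem)
{
	struct altmap_demo *altmap = mem->altmap;

	mem->altmap = NULL;	/* the "Mark altmap NULL" debug-check step */
	if (altmap) {
		if (altmap->alloc)	/* mirrors WARN(altmap->alloc, ...) */
			fprintf(stderr, "Altmap not fully unmapped\n");
		free(altmap);
	}
}

int main(void)
{
	struct block_demo mem = { 0 };

	if (!add_block_demo(&mem, 512))
		remove_block_demo(&mem);
	return 0;
}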