Diffstat (limited to 'mm')
127 files changed, 10294 insertions(+), 6072 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig index d3fb3762887b..d5d4eca947a6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -201,7 +201,7 @@ config KVFREE_RCU_BATCHED config SLUB_TINY bool "Configure for minimal memory footprint" - depends on EXPERT + depends on EXPERT && !COMPILE_TEST select SLAB_MERGE_DEFAULT help Configures the slab allocator in a way to achieve minimal memory @@ -469,6 +469,10 @@ config HAVE_GUP_FAST depends on MMU bool +# Enable memblock support for scratch memory which is needed for kexec handover +config MEMBLOCK_KHO_SCRATCH + bool + # Don't discard allocated memory used to track "memory" and "reserved" memblocks # after early boot, so it can still be used to test for validity of memory. # Also, memblocks are updated with memory hot(un)plug. @@ -882,7 +886,7 @@ config THP_SWAP config READ_ONLY_THP_FOR_FS bool "Read-only THP for filesystems (EXPERIMENTAL)" - depends on TRANSPARENT_HUGEPAGE && SHMEM + depends on TRANSPARENT_HUGEPAGE help Allow khugepaged to put read-only file-backed pages in THP. @@ -930,6 +934,13 @@ config ARCH_SUPPORTS_PUD_PFNMAP depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD # +# Architectures that always use weak definitions for percpu +# variables in modules should set this. +# +config ARCH_MODULE_NEEDS_WEAK_PER_CPU + bool + +# # UP and nommu archs use km based percpu allocator # config NEED_PER_CPU_KM @@ -989,6 +1000,41 @@ config CMA_AREAS If unsure, leave the default value "8" in UMA and "20" in NUMA. +# +# Select this config option from the architecture Kconfig, if available, to set +# the max page order for physically contiguous allocations. +# +config ARCH_FORCE_MAX_ORDER + int + +# +# When ARCH_FORCE_MAX_ORDER is not defined, +# the default page block order is MAX_PAGE_ORDER (10) as per +# include/linux/mmzone.h. +# +config PAGE_BLOCK_MAX_ORDER + int "Page Block Order Upper Limit" + range 1 10 if ARCH_FORCE_MAX_ORDER = 0 + default 10 if ARCH_FORCE_MAX_ORDER = 0 + range 1 ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0 + default ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0 + help + The page block order refers to the power of two number of pages that + are physically contiguous and can have a migrate type associated to + them. The maximum size of the page block order is at least limited by + ARCH_FORCE_MAX_ORDER/MAX_PAGE_ORDER. + + This config adds a new upper limit of default page block + order when the page block order is required to be smaller than + ARCH_FORCE_MAX_ORDER/MAX_PAGE_ORDER or other limits + (see include/linux/pageblock-flags.h for details). + + Reducing pageblock order can negatively impact THP generation + success rate. If your workloads use THP heavily, please use this + option with caution. + + Don't change if unsure. + config MEM_SOFT_DIRTY bool "Track memory changes" depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS @@ -1071,9 +1117,6 @@ config ARCH_HAS_CURRENT_STACK_POINTER register alias named "current_stack_pointer", this config can be selected. -config ARCH_HAS_PTE_DEVMAP - bool - config ARCH_HAS_ZONE_DMA_SET bool @@ -1091,7 +1134,6 @@ config ZONE_DEVICE depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP - depends on ARCH_HAS_PTE_DEVMAP select XARRAY_MULTI help @@ -1317,6 +1359,7 @@ config NUMA_MEMBLKS config NUMA_EMU bool "NUMA emulation" depends on NUMA_MEMBLKS + depends on X86 || GENERIC_ARCH_NUMA help Enable NUMA emulation. 
A flat machine will be split into virtual nodes when booted with "numa=fake=N", where N is the diff --git a/mm/Makefile b/mm/Makefile index e7f6bbf8ae5f..1a7a11d4933d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -37,7 +37,7 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ - pgtable-generic.o rmap.o vmalloc.o vma.o + pgtable-generic.o rmap.o vmalloc.o vma.o vma_exec.o ifdef CONFIG_CROSS_MEMORY_ATTACH @@ -55,7 +55,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ mm_init.o percpu.o slab_common.o \ compaction.o show_mem.o \ interval_tree.o list_lru.o workingset.o \ - debug.o gup.o mmap_lock.o $(mmu-y) + debug.o gup.o mmap_lock.o vma_init.o $(mmu-y) # Give 'page_alloc' its own module-parameter namespace page-alloc-y := page_alloc.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e61bbb1bd622..783904d8c5ef 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1151,7 +1151,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { - del_timer_sync(&bdi->laptop_mode_wb_timer); + timer_delete_sync(&bdi->laptop_mode_wb_timer); /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index d3e00731e262..2a4a649805c1 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -94,13 +94,8 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, if (!trylock_page(page)) continue; - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION) && - PageIsolated(page)) { - /* raced with isolation */ - unlock_page(page); - continue; - } - balloon_page_delete(page); + list_del(&page->lru); + balloon_page_finalize(page); __count_vm_event(BALLOON_DEFLATE); list_add(&page->lru, pages); unlock_page(page); @@ -211,6 +206,9 @@ static bool balloon_page_isolate(struct page *page, isolate_mode_t mode) struct balloon_dev_info *b_dev_info = balloon_page_device(page); unsigned long flags; + if (!b_dev_info) + return false; + spin_lock_irqsave(&b_dev_info->pages_lock, flags); list_del(&page->lru); b_dev_info->isolated_pages++; @@ -224,6 +222,10 @@ static void balloon_page_putback(struct page *page) struct balloon_dev_info *b_dev_info = balloon_page_device(page); unsigned long flags; + /* Isolated balloon pages cannot get deflated. */ + if (WARN_ON_ONCE(!b_dev_info)) + return; + spin_lock_irqsave(&b_dev_info->pages_lock, flags); list_add(&page->lru, &b_dev_info->pages); b_dev_info->isolated_pages--; @@ -239,6 +241,10 @@ static int balloon_page_migrate(struct page *newpage, struct page *page, VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + /* Isolated balloon pages cannot get deflated. 
*/ + if (WARN_ON_ONCE(!balloon)) + return -EAGAIN; + return balloon->migratepage(balloon, newpage, page, mode); } @@ -247,6 +253,5 @@ const struct movable_operations balloon_mops = { .isolate_page = balloon_page_isolate, .putback_page = balloon_page_putback, }; -EXPORT_SYMBOL_GPL(balloon_mops); #endif /* CONFIG_BALLOON_COMPACTION */ @@ -22,6 +22,7 @@ #include <linux/mm.h> #include <linux/sizes.h> #include <linux/slab.h> +#include <linux/string_choices.h> #include <linux/log2.h> #include <linux/cma.h> #include <linux/highmem.h> @@ -35,12 +36,6 @@ struct cma cma_areas[MAX_CMA_AREAS]; unsigned int cma_area_count; -static int __init __cma_declare_contiguous_nid(phys_addr_t base, - phys_addr_t size, phys_addr_t limit, - phys_addr_t alignment, unsigned int order_per_bit, - bool fixed, const char *name, struct cma **res_cma, - int nid); - phys_addr_t cma_get_base(const struct cma *cma) { WARN_ON_ONCE(cma->nranges != 1); @@ -143,13 +138,14 @@ bool cma_validate_zones(struct cma *cma) static void __init cma_activate_area(struct cma *cma) { - unsigned long pfn, end_pfn; + unsigned long pfn, end_pfn, early_pfn[CMA_MAX_RANGES]; int allocrange, r; struct cma_memrange *cmr; unsigned long bitmap_count, count; for (allocrange = 0; allocrange < cma->nranges; allocrange++) { cmr = &cma->ranges[allocrange]; + early_pfn[allocrange] = cmr->early_pfn; cmr->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma, cmr), GFP_KERNEL); if (!cmr->bitmap) @@ -161,13 +157,13 @@ static void __init cma_activate_area(struct cma *cma) for (r = 0; r < cma->nranges; r++) { cmr = &cma->ranges[r]; - if (cmr->early_pfn != cmr->base_pfn) { - count = cmr->early_pfn - cmr->base_pfn; + if (early_pfn[r] != cmr->base_pfn) { + count = early_pfn[r] - cmr->base_pfn; bitmap_count = cma_bitmap_pages_to_bits(cma, count); bitmap_set(cmr->bitmap, 0, bitmap_count); } - for (pfn = cmr->early_pfn; pfn < cmr->base_pfn + cmr->count; + for (pfn = early_pfn[r]; pfn < cmr->base_pfn + cmr->count; pfn += pageblock_nr_pages) init_cma_reserved_pageblock(pfn_to_page(pfn)); } @@ -193,7 +189,7 @@ cleanup: for (r = 0; r < allocrange; r++) { cmr = &cma->ranges[r]; end_pfn = cmr->base_pfn + cmr->count; - for (pfn = cmr->early_pfn; pfn < end_pfn; pfn++) + for (pfn = early_pfn[r]; pfn < end_pfn; pfn++) free_reserved_page(pfn_to_page(pfn)); } } @@ -357,6 +353,168 @@ static void __init list_insert_sorted( } } +static int __init cma_fixed_reserve(phys_addr_t base, phys_addr_t size) +{ + if (IS_ENABLED(CONFIG_HIGHMEM)) { + phys_addr_t highmem_start = __pa(high_memory - 1) + 1; + + /* + * If allocating at a fixed base the request region must not + * cross the low/high memory boundary. + */ + if (base < highmem_start && base + size > highmem_start) { + pr_err("Region at %pa defined on low/high memory boundary (%pa)\n", + &base, &highmem_start); + return -EINVAL; + } + } + + if (memblock_is_region_reserved(base, size) || + memblock_reserve(base, size) < 0) { + return -EBUSY; + } + + return 0; +} + +static phys_addr_t __init cma_alloc_mem(phys_addr_t base, phys_addr_t size, + phys_addr_t align, phys_addr_t limit, int nid) +{ + phys_addr_t addr = 0; + + /* + * If there is enough memory, try a bottom-up allocation first. + * It will place the new cma area close to the start of the node + * and guarantee that the compaction is moving pages out of the + * cma area and not into it. + * Avoid using first 4GB to not interfere with constrained zones + * like DMA/DMA32. 
+ */ +#ifdef CONFIG_PHYS_ADDR_T_64BIT + if (!memblock_bottom_up() && limit >= SZ_4G + size) { + memblock_set_bottom_up(true); + addr = memblock_alloc_range_nid(size, align, SZ_4G, limit, + nid, true); + memblock_set_bottom_up(false); + } +#endif + + /* + * On systems with HIGHMEM try allocating from there before consuming + * memory in lower zones. + */ + if (!addr && IS_ENABLED(CONFIG_HIGHMEM)) { + phys_addr_t highmem = __pa(high_memory - 1) + 1; + + /* + * All pages in the reserved area must come from the same zone. + * If the requested region crosses the low/high memory boundary, + * try allocating from high memory first and fall back to low + * memory in case of failure. + */ + if (base < highmem && limit > highmem) { + addr = memblock_alloc_range_nid(size, align, highmem, + limit, nid, true); + limit = highmem; + } + } + + if (!addr) + addr = memblock_alloc_range_nid(size, align, base, limit, nid, + true); + + return addr; +} + +static int __init __cma_declare_contiguous_nid(phys_addr_t *basep, + phys_addr_t size, phys_addr_t limit, + phys_addr_t alignment, unsigned int order_per_bit, + bool fixed, const char *name, struct cma **res_cma, + int nid) +{ + phys_addr_t memblock_end = memblock_end_of_DRAM(); + phys_addr_t base = *basep; + int ret; + + pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", + __func__, &size, &base, &limit, &alignment); + + if (cma_area_count == ARRAY_SIZE(cma_areas)) { + pr_err("Not enough slots for CMA reserved regions!\n"); + return -ENOSPC; + } + + if (!size) + return -EINVAL; + + if (alignment && !is_power_of_2(alignment)) + return -EINVAL; + + if (!IS_ENABLED(CONFIG_NUMA)) + nid = NUMA_NO_NODE; + + /* Sanitise input arguments. */ + alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES); + if (fixed && base & (alignment - 1)) { + pr_err("Region at %pa must be aligned to %pa bytes\n", + &base, &alignment); + return -EINVAL; + } + base = ALIGN(base, alignment); + size = ALIGN(size, alignment); + limit &= ~(alignment - 1); + + if (!base) + fixed = false; + + /* size should be aligned with order_per_bit */ + if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) + return -EINVAL; + + + /* + * If the limit is unspecified or above the memblock end, its effective + * value will be the memblock end. Set it explicitly to simplify further + * checks. + */ + if (limit == 0 || limit > memblock_end) + limit = memblock_end; + + if (base + size > limit) { + pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n", + &size, &base, &limit); + return -EINVAL; + } + + /* Reserve memory */ + if (fixed) { + ret = cma_fixed_reserve(base, size); + if (ret) + return ret; + } else { + base = cma_alloc_mem(base, size, alignment, limit, nid); + if (!base) + return -ENOMEM; + + /* + * kmemleak scans/reads tracked objects for pointers to other + * objects but this address isn't mapped and accessible + */ + kmemleak_ignore_phys(base); + } + + ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); + if (ret) { + memblock_phys_free(base, size); + return ret; + } + + (*res_cma)->nid = nid; + *basep = base; + + return 0; +} + /* * Create CMA areas with a total size of @total_size. A normal allocation * for one area is tried first. 
If that fails, the biggest memblock @@ -370,7 +528,7 @@ int __init cma_declare_contiguous_multi(phys_addr_t total_size, phys_addr_t align, unsigned int order_per_bit, const char *name, struct cma **res_cma, int nid) { - phys_addr_t start, end; + phys_addr_t start = 0, end; phys_addr_t size, sizesum, sizeleft; struct cma_init_memrange *mrp, *mlp, *failed; struct cma_memrange *cmrp; @@ -384,7 +542,7 @@ int __init cma_declare_contiguous_multi(phys_addr_t total_size, /* * First, try it the normal way, producing just one range. */ - ret = __cma_declare_contiguous_nid(0, total_size, 0, align, + ret = __cma_declare_contiguous_nid(&start, total_size, 0, align, order_per_bit, false, name, res_cma, nid); if (ret != -ENOMEM) goto out; @@ -547,8 +705,7 @@ out: (unsigned long)total_size / SZ_1M); else pr_info("Reserved %lu MiB in %d range%s\n", - (unsigned long)total_size / SZ_1M, nr, - nr > 1 ? "s" : ""); + (unsigned long)total_size / SZ_1M, nr, str_plural(nr)); return ret; } @@ -580,7 +737,7 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, { int ret; - ret = __cma_declare_contiguous_nid(base, size, limit, alignment, + ret = __cma_declare_contiguous_nid(&base, size, limit, alignment, order_per_bit, fixed, name, res_cma, nid); if (ret != 0) pr_err("Failed to reserve %ld MiB\n", @@ -592,148 +749,9 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, return ret; } -static int __init __cma_declare_contiguous_nid(phys_addr_t base, - phys_addr_t size, phys_addr_t limit, - phys_addr_t alignment, unsigned int order_per_bit, - bool fixed, const char *name, struct cma **res_cma, - int nid) -{ - phys_addr_t memblock_end = memblock_end_of_DRAM(); - phys_addr_t highmem_start; - int ret; - - /* - * We can't use __pa(high_memory) directly, since high_memory - * isn't a valid direct map VA, and DEBUG_VIRTUAL will (validly) - * complain. Find the boundary by adding one to the last valid - * address. - */ - highmem_start = __pa(high_memory - 1) + 1; - pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", - __func__, &size, &base, &limit, &alignment); - - if (cma_area_count == ARRAY_SIZE(cma_areas)) { - pr_err("Not enough slots for CMA reserved regions!\n"); - return -ENOSPC; - } - - if (!size) - return -EINVAL; - - if (alignment && !is_power_of_2(alignment)) - return -EINVAL; - - if (!IS_ENABLED(CONFIG_NUMA)) - nid = NUMA_NO_NODE; - - /* Sanitise input arguments. */ - alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES); - if (fixed && base & (alignment - 1)) { - pr_err("Region at %pa must be aligned to %pa bytes\n", - &base, &alignment); - return -EINVAL; - } - base = ALIGN(base, alignment); - size = ALIGN(size, alignment); - limit &= ~(alignment - 1); - - if (!base) - fixed = false; - - /* size should be aligned with order_per_bit */ - if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) - return -EINVAL; - - /* - * If allocating at a fixed base the request region must not cross the - * low/high memory boundary. - */ - if (fixed && base < highmem_start && base + size > highmem_start) { - pr_err("Region at %pa defined on low/high memory boundary (%pa)\n", - &base, &highmem_start); - return -EINVAL; - } - - /* - * If the limit is unspecified or above the memblock end, its effective - * value will be the memblock end. Set it explicitly to simplify further - * checks. 
- */ - if (limit == 0 || limit > memblock_end) - limit = memblock_end; - - if (base + size > limit) { - pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n", - &size, &base, &limit); - return -EINVAL; - } - - /* Reserve memory */ - if (fixed) { - if (memblock_is_region_reserved(base, size) || - memblock_reserve(base, size) < 0) { - return -EBUSY; - } - } else { - phys_addr_t addr = 0; - - /* - * If there is enough memory, try a bottom-up allocation first. - * It will place the new cma area close to the start of the node - * and guarantee that the compaction is moving pages out of the - * cma area and not into it. - * Avoid using first 4GB to not interfere with constrained zones - * like DMA/DMA32. - */ -#ifdef CONFIG_PHYS_ADDR_T_64BIT - if (!memblock_bottom_up() && memblock_end >= SZ_4G + size) { - memblock_set_bottom_up(true); - addr = memblock_alloc_range_nid(size, alignment, SZ_4G, - limit, nid, true); - memblock_set_bottom_up(false); - } -#endif - - /* - * All pages in the reserved area must come from the same zone. - * If the requested region crosses the low/high memory boundary, - * try allocating from high memory first and fall back to low - * memory in case of failure. - */ - if (!addr && base < highmem_start && limit > highmem_start) { - addr = memblock_alloc_range_nid(size, alignment, - highmem_start, limit, nid, true); - limit = highmem_start; - } - - if (!addr) { - addr = memblock_alloc_range_nid(size, alignment, base, - limit, nid, true); - if (!addr) - return -ENOMEM; - } - - /* - * kmemleak scans/reads tracked objects for pointers to other - * objects but this address isn't mapped and accessible - */ - kmemleak_ignore_phys(addr); - base = addr; - } - - ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); - if (ret) - memblock_phys_free(base, size); - - (*res_cma)->nid = nid; - - return ret; -} - static void cma_debug_show_areas(struct cma *cma) { - unsigned long next_zero_bit, next_set_bit, nr_zero; - unsigned long start; + unsigned long start, end; unsigned long nr_part; unsigned long nbits; int r; @@ -744,22 +762,12 @@ static void cma_debug_show_areas(struct cma *cma) for (r = 0; r < cma->nranges; r++) { cmr = &cma->ranges[r]; - start = 0; nbits = cma_bitmap_maxno(cma, cmr); pr_info("range %d: ", r); - for (;;) { - next_zero_bit = find_next_zero_bit(cmr->bitmap, - nbits, start); - if (next_zero_bit >= nbits) - break; - next_set_bit = find_next_bit(cmr->bitmap, nbits, - next_zero_bit); - nr_zero = next_set_bit - next_zero_bit; - nr_part = nr_zero << cma->order_per_bit; - pr_cont("%s%lu@%lu", start ? "+" : "", nr_part, - next_zero_bit); - start = next_zero_bit + nr_zero; + for_each_clear_bitrange(start, end, cmr->bitmap, nbits) { + nr_part = (end - start) << cma->order_per_bit; + pr_cont("%s%lu@%lu", start ? "+" : "", nr_part, start); } pr_info("\n"); } @@ -815,7 +823,7 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma->alloc_mutex); - ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, gfp); + ret = alloc_contig_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp); mutex_unlock(&cma->alloc_mutex); if (ret == 0) { page = pfn_to_page(pfn); @@ -847,8 +855,6 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, unsigned long i; const char *name = cma ? 
cma->name : NULL; - trace_cma_alloc_start(name, count, align); - if (!cma || !cma->count) return page; @@ -858,6 +864,8 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, if (!count) return page; + trace_cma_alloc_start(name, count, align); + for (r = 0; r < cma->nranges; r++) { page = NULL; @@ -25,9 +25,11 @@ struct cma_kobject { */ struct cma_memrange { unsigned long base_pfn; - unsigned long early_pfn; unsigned long count; - unsigned long *bitmap; + union { + unsigned long early_pfn; + unsigned long *bitmap; + }; #ifdef CONFIG_CMA_DEBUGFS struct debugfs_u32_array dfs_bitmap; #endif diff --git a/mm/cma_debug.c b/mm/cma_debug.c index fdf899532ca0..8c7d7f8e8fbd 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -56,16 +56,8 @@ static int cma_maxchunk_get(void *data, u64 *val) for (r = 0; r < cma->nranges; r++) { cmr = &cma->ranges[r]; bitmap_maxno = cma_bitmap_maxno(cma, cmr); - end = 0; - for (;;) { - start = find_next_zero_bit(cmr->bitmap, - bitmap_maxno, end); - if (start >= bitmap_maxno) - break; - end = find_next_bit(cmr->bitmap, bitmap_maxno, - start); + for_each_clear_bitrange(start, end, cmr->bitmap, bitmap_maxno) maxchunk = max(end - start, maxchunk); - } } spin_unlock_irq(&cma->lock); *val = (u64)maxchunk << cma->order_per_bit; diff --git a/mm/compaction.c b/mm/compaction.c index 139f00c0308a..bf021b31c7ec 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -114,39 +114,6 @@ static unsigned long release_free_list(struct list_head *freepages) } #ifdef CONFIG_COMPACTION -bool PageMovable(struct page *page) -{ - const struct movable_operations *mops; - - VM_BUG_ON_PAGE(!PageLocked(page), page); - if (!__PageMovable(page)) - return false; - - mops = page_movable_ops(page); - if (mops) - return true; - - return false; -} - -void __SetPageMovable(struct page *page, const struct movable_operations *mops) -{ - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE((unsigned long)mops & PAGE_MAPPING_MOVABLE, page); - page->mapping = (void *)((unsigned long)mops | PAGE_MAPPING_MOVABLE); -} -EXPORT_SYMBOL(__SetPageMovable); - -void __ClearPageMovable(struct page *page) -{ - VM_BUG_ON_PAGE(!PageMovable(page), page); - /* - * This page still has the type of a movable page, but it's - * actually not movable any more. - */ - page->mapping = (void *)PAGE_MAPPING_MOVABLE; -} -EXPORT_SYMBOL(__ClearPageMovable); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -981,13 +948,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } if (PageHuge(page)) { + const unsigned int order = compound_order(page); /* * skip hugetlbfs if we are not compacting for pages * bigger than its order. THPs and other compound pages * are handled below. */ if (!cc->alloc_contig) { - const unsigned int order = compound_order(page); if (order <= MAX_PAGE_ORDER) { low_pfn += (1UL << order) - 1; @@ -1001,27 +968,27 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, locked = NULL; } - ret = isolate_or_dissolve_huge_page(page, &cc->migratepages); + folio = page_folio(page); + ret = isolate_or_dissolve_huge_folio(folio, &cc->migratepages); /* - * Fail isolation in case isolate_or_dissolve_huge_page() + * Fail isolation in case isolate_or_dissolve_huge_folio() * reports an error. In case of -ENOMEM, abort right away. 
*/ if (ret < 0) { /* Do not report -EBUSY down the chain */ if (ret == -EBUSY) ret = 0; - low_pfn += compound_nr(page) - 1; - nr_scanned += compound_nr(page) - 1; + low_pfn += (1UL << order) - 1; + nr_scanned += (1UL << order) - 1; goto isolate_fail; } - if (PageHuge(page)) { + if (folio_test_hugetlb(folio)) { /* * Hugepage was successfully isolated and placed * on the cc->migratepages list. */ - folio = page_folio(page); low_pfn += folio_nr_pages(folio) - 1; goto isolate_success_no_list; } @@ -1082,18 +1049,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * Skip any other type of page */ if (!PageLRU(page)) { - /* - * __PageMovable can return false positive so we need - * to verify it under page_lock. - */ - if (unlikely(__PageMovable(page)) && - !PageIsolated(page)) { + /* Isolation code will deal with any races. */ + if (unlikely(page_has_movable_ops(page)) && + !PageMovableOpsIsolated(page)) { if (locked) { unlock_page_lruvec_irqrestore(locked, flags); locked = NULL; } - if (isolate_movable_page(page, mode)) { + if (isolate_movable_ops_page(page, mode)) { folio = page_folio(page); goto isolate_success; } @@ -2249,15 +2213,11 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat) static unsigned int fragmentation_score_wmark(bool low) { - unsigned int wmark_low; + unsigned int wmark_low, leeway; - /* - * Cap the low watermark to avoid excessive compaction - * activity in case a user sets the proactiveness tunable - * close to 100 (maximum). - */ - wmark_low = max(100U - sysctl_compaction_proactiveness, 5U); - return low ? wmark_low : min(wmark_low + 10, 100U); + wmark_low = 100U - sysctl_compaction_proactiveness; + leeway = min(10U, wmark_low / 2); + return low ? wmark_low : min(wmark_low + leeway, 100U); } static bool should_proactive_compact_node(pg_data_t *pgdat) @@ -2348,7 +2308,6 @@ static enum compact_result __compact_finished(struct compact_control *cc) ret = COMPACT_NO_SUITABLE_PAGE; for (order = cc->order; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &cc->zone->free_area[order]; - bool claim_block; /* Job done if page is free of the right migratetype */ if (!free_area_empty(area, migratetype)) @@ -2364,8 +2323,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) * Job done if allocation would steal freepages from * other migratetype buddy lists. */ - if (find_suitable_fallback(area, order, migratetype, - true, &claim_block) != -1) + if (find_suitable_fallback(area, order, migratetype, true) >= 0) /* * Movable pages are OK in any pageblock. If we are * stealing for a non-movable allocation, make sure diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index c213cf8b5638..b3171f9406c1 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -28,6 +28,7 @@ config DAMON_VADDR bool "Data access monitoring operations for virtual address spaces" depends on DAMON && MMU select PAGE_IDLE_FLAG + default DAMON help This builds the default data access monitoring operations for DAMON that work for virtual address spaces. @@ -36,6 +37,7 @@ config DAMON_PADDR bool "Data access monitoring operations for the physical address space" depends on DAMON && MMU select PAGE_IDLE_FLAG + default DAMON help This builds the default data access monitoring operations for DAMON that works for the physical address space. @@ -55,6 +57,7 @@ config DAMON_VADDR_KUNIT_TEST config DAMON_SYSFS bool "DAMON sysfs interface" depends on DAMON && SYSFS + default DAMON help This builds the sysfs interface for DAMON. 
The user space can use the interface for arbitrary data access monitoring. @@ -91,4 +94,20 @@ config DAMON_LRU_SORT protect frequently accessed (hot) pages while rarely accessed (cold) pages reclaimed first under memory pressure. +config DAMON_STAT + bool "Build data access monitoring stat (DAMON_STAT)" + depends on DAMON_PADDR + help + This builds the DAMON-based access monitoring statistics subsystem. + It runs DAMON and expose access monitoring results in simple stat + metrics. + +config DAMON_STAT_ENABLED_DEFAULT + bool "Enable DAMON_STAT by default" + depends on DAMON_PADDR + default DAMON_STAT + help + Whether to enable DAMON_STAT by default. Users can disable it in + boot or runtime using its 'enabled' parameter. + endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 8b49012ba8c3..d8d6bf5f8bff 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o +obj-$(CONFIG_DAMON_STAT) += modules-common.o stat.o diff --git a/mm/damon/core.c b/mm/damon/core.c index fc1eba3da419..52a48c9316bc 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -76,14 +76,13 @@ int damon_register_ops(struct damon_operations *ops) if (ops->id >= NR_DAMON_OPS) return -EINVAL; + mutex_lock(&damon_ops_lock); /* Fail for already registered ops */ - if (__damon_is_registered_ops(ops->id)) { + if (__damon_is_registered_ops(ops->id)) err = -EINVAL; - goto out; - } - damon_registered_ops[ops->id] = *ops; -out: + else + damon_registered_ops[ops->id] = *ops; mutex_unlock(&damon_ops_lock); return err; } @@ -408,6 +407,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, scheme->wmarks = *wmarks; scheme->wmarks.activated = true; + scheme->migrate_dests = (struct damos_migrate_dests){}; scheme->target_nid = target_nid; return scheme; @@ -450,6 +450,9 @@ void damon_destroy_scheme(struct damos *s) damos_for_each_filter_safe(f, next, s) damos_destroy_filter(f); + + kfree(s->migrate_dests.node_id_arr); + kfree(s->migrate_dests.weight_arr); damon_del_scheme(s); damon_free_scheme(s); } @@ -499,8 +502,12 @@ void damon_free_target(struct damon_target *t) kfree(t); } -void damon_destroy_target(struct damon_target *t) +void damon_destroy_target(struct damon_target *t, struct damon_ctx *ctx) { + + if (ctx && ctx->ops.cleanup_target) + ctx->ops.cleanup_target(t); + damon_del_target(t); damon_free_target(t); } @@ -530,7 +537,8 @@ struct damon_ctx *damon_new_ctx(void) ctx->next_ops_update_sis = 0; mutex_init(&ctx->kdamond_lock); - mutex_init(&ctx->call_control_lock); + INIT_LIST_HEAD(&ctx->call_controls); + mutex_init(&ctx->call_controls_lock); mutex_init(&ctx->walk_control_lock); ctx->attrs.min_nr_regions = 10; @@ -546,13 +554,8 @@ static void damon_destroy_targets(struct damon_ctx *ctx) { struct damon_target *t, *next_t; - if (ctx->ops.cleanup) { - ctx->ops.cleanup(ctx); - return; - } - damon_for_each_target_safe(t, next_t, ctx) - damon_destroy_target(t); + damon_destroy_target(t, ctx); } void damon_destroy_ctx(struct damon_ctx *ctx) @@ -677,9 +680,7 @@ static bool damon_valid_intervals_goal(struct damon_attrs *attrs) * @attrs: monitoring attributes * * This function should be called while the kdamond is not running, an access - * check results aggregation is not ongoing (e.g., from &struct - * damon_callback->after_aggregation or &struct - * 
damon_callback->after_wmarks_check callbacks), or from damon_call(). + * check results aggregation is not ongoing (e.g., from damon_call(). * * Every time interval is in micro-seconds. * @@ -755,6 +756,19 @@ static struct damos_quota_goal *damos_nth_quota_goal( return NULL; } +static void damos_commit_quota_goal_union( + struct damos_quota_goal *dst, struct damos_quota_goal *src) +{ + switch (dst->metric) { + case DAMOS_QUOTA_NODE_MEM_USED_BP: + case DAMOS_QUOTA_NODE_MEM_FREE_BP: + dst->nid = src->nid; + break; + default: + break; + } +} + static void damos_commit_quota_goal( struct damos_quota_goal *dst, struct damos_quota_goal *src) { @@ -763,6 +777,7 @@ static void damos_commit_quota_goal( if (dst->metric == DAMOS_QUOTA_USER_INPUT) dst->current_value = src->current_value; /* keep last_psi_total as is, since it will be updated in next cycle */ + damos_commit_quota_goal_union(dst, src); } /** @@ -775,7 +790,7 @@ static void damos_commit_quota_goal( * DAMON contexts, instead of manual in-place updates. * * This function should be called from parameters-update safe context, like - * DAMON callbacks. + * damon_call(). */ int damos_commit_quota_goals(struct damos_quota *dst, struct damos_quota *src) { @@ -796,6 +811,7 @@ int damos_commit_quota_goals(struct damos_quota *dst, struct damos_quota *src) src_goal->metric, src_goal->target_value); if (!new_goal) return -ENOMEM; + damos_commit_quota_goal_union(new_goal, src_goal); damos_add_quota_goal(dst, new_goal); } return 0; @@ -940,6 +956,41 @@ static void damos_set_filters_default_reject(struct damos *s) damos_filters_default_reject(&s->ops_filters); } +static int damos_commit_dests(struct damos *dst, struct damos *src) +{ + struct damos_migrate_dests *dst_dests, *src_dests; + + dst_dests = &dst->migrate_dests; + src_dests = &src->migrate_dests; + + if (dst_dests->nr_dests != src_dests->nr_dests) { + kfree(dst_dests->node_id_arr); + kfree(dst_dests->weight_arr); + + dst_dests->node_id_arr = kmalloc_array(src_dests->nr_dests, + sizeof(*dst_dests->node_id_arr), GFP_KERNEL); + if (!dst_dests->node_id_arr) { + dst_dests->weight_arr = NULL; + return -ENOMEM; + } + + dst_dests->weight_arr = kmalloc_array(src_dests->nr_dests, + sizeof(*dst_dests->weight_arr), GFP_KERNEL); + if (!dst_dests->weight_arr) { + /* ->node_id_arr will be freed by scheme destruction */ + return -ENOMEM; + } + } + + dst_dests->nr_dests = src_dests->nr_dests; + for (int i = 0; i < src_dests->nr_dests; i++) { + dst_dests->node_id_arr[i] = src_dests->node_id_arr[i]; + dst_dests->weight_arr[i] = src_dests->weight_arr[i]; + } + + return 0; +} + static int damos_commit_filters(struct damos *dst, struct damos *src) { int err; @@ -979,6 +1030,11 @@ static int damos_commit(struct damos *dst, struct damos *src) return err; dst->wmarks = src->wmarks; + dst->target_nid = src->target_nid; + + err = damos_commit_dests(dst, src); + if (err) + return err; err = damos_commit_filters(dst, src); return err; @@ -1094,9 +1150,15 @@ static int damon_commit_targets( if (err) return err; } else { - if (damon_target_has_pid(dst)) - put_pid(dst_target->pid); - damon_destroy_target(dst_target); + struct damos *s; + + damon_destroy_target(dst_target, dst); + damon_for_each_scheme(s, dst) { + if (s->quota.charge_target_from == dst_target) { + s->quota.charge_target_from = NULL; + s->quota.charge_addr_from = 0; + } + } } } @@ -1109,7 +1171,7 @@ static int damon_commit_targets( err = damon_commit_target(new_target, false, src_target, damon_target_has_pid(src)); if (err) { - damon_destroy_target(new_target); 
+ damon_destroy_target(new_target, NULL); return err; } damon_add_target(dst, new_target); @@ -1128,7 +1190,7 @@ static int damon_commit_targets( * in-place updates. * * This function should be called from parameters-update safe context, like - * DAMON callbacks. + * damon_call(). */ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) { @@ -1304,7 +1366,13 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs) return err; } -static bool damon_is_running(struct damon_ctx *ctx) +/** + * damon_is_running() - Returns if a given DAMON context is running. + * @ctx: The DAMON context to see if running. + * + * Return: true if @ctx is running, false otherwise. + */ +bool damon_is_running(struct damon_ctx *ctx) { bool running; @@ -1321,8 +1389,9 @@ static bool damon_is_running(struct damon_ctx *ctx) * * Ask DAMON worker thread (kdamond) of @ctx to call a function with an * argument data that respectively passed via &damon_call_control->fn and - * &damon_call_control->data of @control, and wait until the kdamond finishes - * handling of the request. + * &damon_call_control->data of @control. If &damon_call_control->repeat of + * @control is set, further wait until the kdamond finishes handling of the + * request. Otherwise, return as soon as the request is made. * * The kdamond executes the function with the argument in the main loop, just * after a sampling of the iteration is finished. The function can hence @@ -1334,18 +1403,18 @@ static bool damon_is_running(struct damon_ctx *ctx) */ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) { - init_completion(&control->completion); + if (!control->repeat) + init_completion(&control->completion); control->canceled = false; + INIT_LIST_HEAD(&control->list); - mutex_lock(&ctx->call_control_lock); - if (ctx->call_control) { - mutex_unlock(&ctx->call_control_lock); - return -EBUSY; - } - ctx->call_control = control; - mutex_unlock(&ctx->call_control_lock); + mutex_lock(&ctx->call_controls_lock); + list_add_tail(&ctx->call_controls, &control->list); + mutex_unlock(&ctx->call_controls_lock); if (!damon_is_running(ctx)) return -EINVAL; + if (control->repeat) + return 0; wait_for_completion(&control->completion); if (control->canceled) return -ECANCELED; @@ -1393,6 +1462,19 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control) } /* + * Warn and fix corrupted ->nr_accesses[_bp] for investigations and preventing + * the problem being propagated. + */ +static void damon_warn_fix_nr_accesses_corruption(struct damon_region *r) +{ + if (r->nr_accesses_bp == r->nr_accesses * 10000) + return; + WARN_ONCE(true, "invalid nr_accesses_bp at reset: %u %u\n", + r->nr_accesses_bp, r->nr_accesses); + r->nr_accesses_bp = r->nr_accesses * 10000; +} + +/* * Reset the aggregated monitoring results ('nr_accesses' of each region). */ static void kdamond_reset_aggregated(struct damon_ctx *c) @@ -1405,6 +1487,7 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) damon_for_each_region(r, t) { trace_damon_aggregated(ti, r, damon_nr_regions(t)); + damon_warn_fix_nr_accesses_corruption(r); r->last_nr_accesses = r->nr_accesses; r->nr_accesses = 0; } @@ -1428,6 +1511,7 @@ static unsigned long damon_get_intervals_score(struct damon_ctx *c) } } target_access_events = max_access_events * goal_bp / 10000; + target_access_events = target_access_events ? 
: 1; return access_events * 10000 / target_access_events; } @@ -1468,6 +1552,7 @@ static void kdamond_tune_intervals(struct damon_ctx *c) new_attrs.sample_interval); new_attrs.aggr_interval = new_attrs.sample_interval * c->attrs.aggr_samples; + trace_damon_monitor_intervals_tune(new_attrs.sample_interval); damon_set_attrs(c, &new_attrs); } @@ -1890,6 +1975,29 @@ static inline u64 damos_get_some_mem_psi_total(void) #endif /* CONFIG_PSI */ +#ifdef CONFIG_NUMA +static __kernel_ulong_t damos_get_node_mem_bp( + struct damos_quota_goal *goal) +{ + struct sysinfo i; + __kernel_ulong_t numerator; + + si_meminfo_node(&i, goal->nid); + if (goal->metric == DAMOS_QUOTA_NODE_MEM_USED_BP) + numerator = i.totalram - i.freeram; + else /* DAMOS_QUOTA_NODE_MEM_FREE_BP */ + numerator = i.freeram; + return numerator * 10000 / i.totalram; +} +#else +static __kernel_ulong_t damos_get_node_mem_bp( + struct damos_quota_goal *goal) +{ + return 0; +} +#endif + + static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) { u64 now_psi_total; @@ -1903,6 +2011,10 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) goal->current_value = now_psi_total - goal->last_psi_total; goal->last_psi_total = now_psi_total; break; + case DAMOS_QUOTA_NODE_MEM_USED_BP: + case DAMOS_QUOTA_NODE_MEM_FREE_BP: + goal->current_value = damos_get_node_mem_bp(goal); + break; default: break; } @@ -1961,12 +2073,26 @@ static void damos_set_effective_quota(struct damos_quota *quota) quota->esz = esz; } +static void damos_trace_esz(struct damon_ctx *c, struct damos *s, + struct damos_quota *quota) +{ + unsigned int cidx = 0, sidx = 0; + struct damos *siter; + + damon_for_each_scheme(siter, c) { + if (siter == s) + break; + sidx++; + } + trace_damos_esz(cidx, sidx, quota->esz); +} + static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) { struct damos_quota *quota = &s->quota; struct damon_target *t; struct damon_region *r; - unsigned long cumulated_sz; + unsigned long cumulated_sz, cached_esz; unsigned int score, max_score = 0; if (!quota->ms && !quota->sz && list_empty("a->goals)) @@ -1980,7 +2106,11 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) quota->total_charged_sz += quota->charged_sz; quota->charged_from = jiffies; quota->charged_sz = 0; + if (trace_damos_esz_enabled()) + cached_esz = quota->esz; damos_set_effective_quota(quota); + if (trace_damos_esz_enabled() && quota->esz != cached_esz) + damos_trace_esz(c, s, quota); } if (!c->ops.get_scheme_score) @@ -2301,36 +2431,49 @@ static void kdamond_usleep(unsigned long usecs) } /* - * kdamond_call() - handle damon_call_control. + * kdamond_call() - handle damon_call_control objects. * @ctx: The &struct damon_ctx of the kdamond. * @cancel: Whether to cancel the invocation of the function. * - * If there is a &struct damon_call_control request that registered via + * If there are &struct damon_call_control requests that registered via * &damon_call() on @ctx, do or cancel the invocation of the function depending - * on @cancel. @cancel is set when the kdamond is deactivated by DAMOS - * watermarks, or the kdamond is already out of the main loop and therefore - * will be terminated. + * on @cancel. @cancel is set when the kdamond is already out of the main loop + * and therefore will be terminated. 
*/ static void kdamond_call(struct damon_ctx *ctx, bool cancel) { struct damon_call_control *control; + LIST_HEAD(repeat_controls); int ret = 0; - mutex_lock(&ctx->call_control_lock); - control = ctx->call_control; - mutex_unlock(&ctx->call_control_lock); - if (!control) - return; - if (cancel) { - control->canceled = true; - } else { - ret = control->fn(control->data); - control->return_code = ret; + while (true) { + mutex_lock(&ctx->call_controls_lock); + control = list_first_entry_or_null(&ctx->call_controls, + struct damon_call_control, list); + mutex_unlock(&ctx->call_controls_lock); + if (!control) + break; + if (cancel) { + control->canceled = true; + } else { + ret = control->fn(control->data); + control->return_code = ret; + } + mutex_lock(&ctx->call_controls_lock); + list_del(&control->list); + mutex_unlock(&ctx->call_controls_lock); + if (!control->repeat) + complete(&control->completion); + else + list_add(&control->list, &repeat_controls); } - complete(&control->completion); - mutex_lock(&ctx->call_control_lock); - ctx->call_control = NULL; - mutex_unlock(&ctx->call_control_lock); + control = list_first_entry_or_null(&repeat_controls, + struct damon_call_control, list); + if (!control || cancel) + return; + mutex_lock(&ctx->call_controls_lock); + list_add_tail(&control->list, &ctx->call_controls); + mutex_unlock(&ctx->call_controls_lock); } /* Returns negative error code if it's not activated but should return */ @@ -2354,10 +2497,7 @@ static int kdamond_wait_activation(struct damon_ctx *ctx) kdamond_usleep(min_wait_time); - if (ctx->callback.after_wmarks_check && - ctx->callback.after_wmarks_check(ctx)) - break; - kdamond_call(ctx, true); + kdamond_call(ctx, false); damos_walk_cancel(ctx); } return -EBUSY; @@ -2413,10 +2553,9 @@ static int kdamond_fn(void *data) while (!kdamond_need_stop(ctx)) { /* * ctx->attrs and ctx->next_{aggregation,ops_update}_sis could - * be changed from after_wmarks_check() or after_aggregation() - * callbacks. Read the values here, and use those for this - * iteration. That is, damon_set_attrs() updated new values - * are respected from next iteration. + * be changed from kdamond_call(). Read the values here, and + * use those for this iteration. That is, damon_set_attrs() + * updated new values are respected from next iteration. 
*/ unsigned long next_aggregation_sis = ctx->next_aggregation_sis; unsigned long next_ops_update_sis = ctx->next_ops_update_sis; @@ -2434,14 +2573,10 @@ static int kdamond_fn(void *data) if (ctx->ops.check_accesses) max_nr_accesses = ctx->ops.check_accesses(ctx); - if (ctx->passed_sample_intervals >= next_aggregation_sis) { + if (ctx->passed_sample_intervals >= next_aggregation_sis) kdamond_merge_regions(ctx, max_nr_accesses / 10, sz_limit); - if (ctx->callback.after_aggregation && - ctx->callback.after_aggregation(ctx)) - break; - } /* * do kdamond_call() and kdamond_apply_schemes() after @@ -2507,8 +2642,6 @@ done: damon_destroy_region(r, t); } - if (ctx->callback.before_terminate) - ctx->callback.before_terminate(ctx); if (ctx->ops.cleanup) ctx->ops.cleanup(ctx); kfree(ctx->regions_score_histogram); @@ -2527,6 +2660,7 @@ done: running_exclusive_ctxs = false; mutex_unlock(&damon_lock); + damon_destroy_targets(ctx); return 0; } diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 4af8fd4a390b..151a9de5ad8b 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -230,6 +230,39 @@ out: return err; } +static int damon_lru_sort_handle_commit_inputs(void) +{ + int err; + + if (!commit_inputs) + return 0; + + err = damon_lru_sort_apply_parameters(); + commit_inputs = false; + return err; +} + +static int damon_lru_sort_damon_call_fn(void *arg) +{ + struct damon_ctx *c = arg; + struct damos *s; + + /* update the stats parameter */ + damon_for_each_scheme(s, c) { + if (s->action == DAMOS_LRU_PRIO) + damon_lru_sort_hot_stat = s->stat; + else if (s->action == DAMOS_LRU_DEPRIO) + damon_lru_sort_cold_stat = s->stat; + } + + return damon_lru_sort_handle_commit_inputs(); +} + +static struct damon_call_control call_control = { + .fn = damon_lru_sort_damon_call_fn, + .repeat = true, +}; + static int damon_lru_sort_turn(bool on) { int err; @@ -249,7 +282,7 @@ static int damon_lru_sort_turn(bool on) if (err) return err; kdamond_pid = ctx->kdamond->pid; - return 0; + return damon_call(ctx, &call_control); } static int damon_lru_sort_enabled_store(const char *val, @@ -288,52 +321,22 @@ module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); MODULE_PARM_DESC(enabled, "Enable or disable DAMON_LRU_SORT (default: disabled)"); -static int damon_lru_sort_handle_commit_inputs(void) -{ - int err; - - if (!commit_inputs) - return 0; - - err = damon_lru_sort_apply_parameters(); - commit_inputs = false; - return err; -} - -static int damon_lru_sort_after_aggregation(struct damon_ctx *c) -{ - struct damos *s; - - /* update the stats parameter */ - damon_for_each_scheme(s, c) { - if (s->action == DAMOS_LRU_PRIO) - damon_lru_sort_hot_stat = s->stat; - else if (s->action == DAMOS_LRU_DEPRIO) - damon_lru_sort_cold_stat = s->stat; - } - - return damon_lru_sort_handle_commit_inputs(); -} - -static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c) -{ - return damon_lru_sort_handle_commit_inputs(); -} - static int __init damon_lru_sort_init(void) { int err = damon_modules_new_paddr_ctx_target(&ctx, &target); if (err) - return err; + goto out; - ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check; - ctx->callback.after_aggregation = damon_lru_sort_after_aggregation; + call_control.data = ctx; /* 'enabled' has set before this function, probably via command line */ if (enabled) err = damon_lru_sort_turn(true); +out: + if (err && enabled) + enabled = false; return err; } diff --git a/mm/damon/modules-common.c b/mm/damon/modules-common.c index 7cf96574cde7..86d58f8c4f63 100644 --- 
a/mm/damon/modules-common.c +++ b/mm/damon/modules-common.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Common Primitives for DAMON Modules + * Common Code for DAMON Modules * * Author: SeongJae Park <sj@kernel.org> */ diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index f49cdb417005..f103ad556368 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Common Primitives for DAMON Modules + * Common Code for DAMON Modules * * Author: SeongJae Park <sj@kernel.org> */ diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 0db1fc70c84d..99321ff5cb92 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -1,10 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Common Primitives for Data Access Monitoring + * Common Code for Data Access Monitoring * * Author: SeongJae Park <sj@kernel.org> */ +#include <linux/migrate.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> #include <linux/pagemap.h> @@ -12,6 +13,7 @@ #include <linux/swap.h> #include <linux/swapops.h> +#include "../internal.h" #include "ops-common.h" /* @@ -138,3 +140,275 @@ int damon_cold_score(struct damon_ctx *c, struct damon_region *r, /* Return coldness of the region */ return DAMOS_MAX_SCORE - hotness; } + +static bool damon_folio_mkold_one(struct folio *folio, + struct vm_area_struct *vma, unsigned long addr, void *arg) +{ + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); + + while (page_vma_mapped_walk(&pvmw)) { + addr = pvmw.address; + if (pvmw.pte) + damon_ptep_mkold(pvmw.pte, vma, addr); + else + damon_pmdp_mkold(pvmw.pmd, vma, addr); + } + return true; +} + +void damon_folio_mkold(struct folio *folio) +{ + struct rmap_walk_control rwc = { + .rmap_one = damon_folio_mkold_one, + .anon_lock = folio_lock_anon_vma_read, + }; + bool need_lock; + + if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { + folio_set_idle(folio); + return; + } + + need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); + if (need_lock && !folio_trylock(folio)) + return; + + rmap_walk(folio, &rwc); + + if (need_lock) + folio_unlock(folio); + +} + +static bool damon_folio_young_one(struct folio *folio, + struct vm_area_struct *vma, unsigned long addr, void *arg) +{ + bool *accessed = arg; + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); + pte_t pte; + + *accessed = false; + while (page_vma_mapped_walk(&pvmw)) { + addr = pvmw.address; + if (pvmw.pte) { + pte = ptep_get(pvmw.pte); + + /* + * PFN swap PTEs, such as device-exclusive ones, that + * actually map pages are "old" from a CPU perspective. + * The MMU notifier takes care of any device aspects. 
+ */ + *accessed = (pte_present(pte) && pte_young(pte)) || + !folio_test_idle(folio) || + mmu_notifier_test_young(vma->vm_mm, addr); + } else { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + *accessed = pmd_young(pmdp_get(pvmw.pmd)) || + !folio_test_idle(folio) || + mmu_notifier_test_young(vma->vm_mm, addr); +#else + WARN_ON_ONCE(1); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + } + if (*accessed) { + page_vma_mapped_walk_done(&pvmw); + break; + } + } + + /* If accessed, stop walking */ + return *accessed == false; +} + +bool damon_folio_young(struct folio *folio) +{ + bool accessed = false; + struct rmap_walk_control rwc = { + .arg = &accessed, + .rmap_one = damon_folio_young_one, + .anon_lock = folio_lock_anon_vma_read, + }; + bool need_lock; + + if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { + if (folio_test_idle(folio)) + return false; + else + return true; + } + + need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); + if (need_lock && !folio_trylock(folio)) + return false; + + rmap_walk(folio, &rwc); + + if (need_lock) + folio_unlock(folio); + + return accessed; +} + +bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio) +{ + bool matched = false; + struct mem_cgroup *memcg; + size_t folio_sz; + + switch (filter->type) { + case DAMOS_FILTER_TYPE_ANON: + matched = folio_test_anon(folio); + break; + case DAMOS_FILTER_TYPE_ACTIVE: + matched = folio_test_active(folio); + break; + case DAMOS_FILTER_TYPE_MEMCG: + rcu_read_lock(); + memcg = folio_memcg_check(folio); + if (!memcg) + matched = false; + else + matched = filter->memcg_id == mem_cgroup_id(memcg); + rcu_read_unlock(); + break; + case DAMOS_FILTER_TYPE_YOUNG: + matched = damon_folio_young(folio); + if (matched) + damon_folio_mkold(folio); + break; + case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE: + folio_sz = folio_size(folio); + matched = filter->sz_range.min <= folio_sz && + folio_sz <= filter->sz_range.max; + break; + case DAMOS_FILTER_TYPE_UNMAPPED: + matched = !folio_mapped(folio) || !folio_raw_mapping(folio); + break; + default: + break; + } + + return matched == filter->matching; +} + +static unsigned int __damon_migrate_folio_list( + struct list_head *migrate_folios, struct pglist_data *pgdat, + int target_nid) +{ + unsigned int nr_succeeded = 0; + struct migration_target_control mtc = { + /* + * Allocate from 'node', or fail quickly and quietly. + * When this happens, 'page' will likely just be discarded + * instead of migrated. + */ + .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | + __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT, + .nid = target_nid, + }; + + if (pgdat->node_id == target_nid || target_nid == NUMA_NO_NODE) + return 0; + + if (list_empty(migrate_folios)) + return 0; + + /* Migration ignores all cpuset and mempolicy settings */ + migrate_pages(migrate_folios, alloc_migration_target, NULL, + (unsigned long)&mtc, MIGRATE_ASYNC, MR_DAMON, + &nr_succeeded); + + return nr_succeeded; +} + +static unsigned int damon_migrate_folio_list(struct list_head *folio_list, + struct pglist_data *pgdat, + int target_nid) +{ + unsigned int nr_migrated = 0; + struct folio *folio; + LIST_HEAD(ret_folios); + LIST_HEAD(migrate_folios); + + while (!list_empty(folio_list)) { + struct folio *folio; + + cond_resched(); + + folio = lru_to_folio(folio_list); + list_del(&folio->lru); + + if (!folio_trylock(folio)) + goto keep; + + /* Relocate its contents to another node. 
*/ + list_add(&folio->lru, &migrate_folios); + folio_unlock(folio); + continue; +keep: + list_add(&folio->lru, &ret_folios); + } + /* 'folio_list' is always empty here */ + + /* Migrate folios selected for migration */ + nr_migrated += __damon_migrate_folio_list( + &migrate_folios, pgdat, target_nid); + /* + * Folios that could not be migrated are still in @migrate_folios. Add + * those back on @folio_list + */ + if (!list_empty(&migrate_folios)) + list_splice_init(&migrate_folios, folio_list); + + try_to_unmap_flush(); + + list_splice(&ret_folios, folio_list); + + while (!list_empty(folio_list)) { + folio = lru_to_folio(folio_list); + list_del(&folio->lru); + folio_putback_lru(folio); + } + + return nr_migrated; +} + +unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid) +{ + int nid; + unsigned long nr_migrated = 0; + LIST_HEAD(node_folio_list); + unsigned int noreclaim_flag; + + if (list_empty(folio_list)) + return nr_migrated; + + if (target_nid < 0 || target_nid >= MAX_NUMNODES || + !node_state(target_nid, N_MEMORY)) + return nr_migrated; + + noreclaim_flag = memalloc_noreclaim_save(); + + nid = folio_nid(lru_to_folio(folio_list)); + do { + struct folio *folio = lru_to_folio(folio_list); + + if (nid == folio_nid(folio)) { + list_move(&folio->lru, &node_folio_list); + continue; + } + + nr_migrated += damon_migrate_folio_list(&node_folio_list, + NODE_DATA(nid), + target_nid); + nid = folio_nid(lru_to_folio(folio_list)); + } while (!list_empty(folio_list)); + + nr_migrated += damon_migrate_folio_list(&node_folio_list, + NODE_DATA(nid), + target_nid); + + memalloc_noreclaim_restore(noreclaim_flag); + + return nr_migrated; +} diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index 18d837d11bce..61ad54aaf256 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Common Primitives for Data Access Monitoring + * Common Code for Data Access Monitoring * * Author: SeongJae Park <sj@kernel.org> */ @@ -11,8 +11,13 @@ struct folio *damon_get_folio(unsigned long pfn); void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr); void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr); +void damon_folio_mkold(struct folio *folio); +bool damon_folio_young(struct folio *folio); int damon_cold_score(struct damon_ctx *c, struct damon_region *r, struct damos *s); int damon_hot_score(struct damon_ctx *c, struct damon_region *r, struct damos *s); + +bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio); +unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 1b70d3f36046..53a55c5114fb 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * DAMON Primitives for The Physical Address Space + * DAMON Code for The Physical Address Space * * Author: SeongJae Park <sj@kernel.org> */ @@ -13,51 +13,11 @@ #include <linux/rmap.h> #include <linux/swap.h> #include <linux/memory-tiers.h> -#include <linux/migrate.h> #include <linux/mm_inline.h> #include "../internal.h" #include "ops-common.h" -static bool damon_folio_mkold_one(struct folio *folio, - struct vm_area_struct *vma, unsigned long addr, void *arg) -{ - DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); - - while (page_vma_mapped_walk(&pvmw)) { - addr = pvmw.address; - if (pvmw.pte) - damon_ptep_mkold(pvmw.pte, vma, addr); - else - 
damon_pmdp_mkold(pvmw.pmd, vma, addr); - } - return true; -} - -static void damon_folio_mkold(struct folio *folio) -{ - struct rmap_walk_control rwc = { - .rmap_one = damon_folio_mkold_one, - .anon_lock = folio_lock_anon_vma_read, - }; - bool need_lock; - - if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { - folio_set_idle(folio); - return; - } - - need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); - if (need_lock && !folio_trylock(folio)) - return; - - rmap_walk(folio, &rwc); - - if (need_lock) - folio_unlock(folio); - -} - static void damon_pa_mkold(unsigned long paddr) { struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); @@ -87,75 +47,6 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) } } -static bool damon_folio_young_one(struct folio *folio, - struct vm_area_struct *vma, unsigned long addr, void *arg) -{ - bool *accessed = arg; - DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); - pte_t pte; - - *accessed = false; - while (page_vma_mapped_walk(&pvmw)) { - addr = pvmw.address; - if (pvmw.pte) { - pte = ptep_get(pvmw.pte); - - /* - * PFN swap PTEs, such as device-exclusive ones, that - * actually map pages are "old" from a CPU perspective. - * The MMU notifier takes care of any device aspects. - */ - *accessed = (pte_present(pte) && pte_young(pte)) || - !folio_test_idle(folio) || - mmu_notifier_test_young(vma->vm_mm, addr); - } else { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - *accessed = pmd_young(pmdp_get(pvmw.pmd)) || - !folio_test_idle(folio) || - mmu_notifier_test_young(vma->vm_mm, addr); -#else - WARN_ON_ONCE(1); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - } - if (*accessed) { - page_vma_mapped_walk_done(&pvmw); - break; - } - } - - /* If accessed, stop walking */ - return *accessed == false; -} - -static bool damon_folio_young(struct folio *folio) -{ - bool accessed = false; - struct rmap_walk_control rwc = { - .arg = &accessed, - .rmap_one = damon_folio_young_one, - .anon_lock = folio_lock_anon_vma_read, - }; - bool need_lock; - - if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { - if (folio_test_idle(folio)) - return false; - else - return true; - } - - need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); - if (need_lock && !folio_trylock(folio)) - return false; - - rmap_walk(folio, &rwc); - - if (need_lock) - folio_unlock(folio); - - return accessed; -} - static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) { struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); @@ -206,49 +97,6 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) return max_nr_accesses; } -static bool damos_pa_filter_match(struct damos_filter *filter, - struct folio *folio) -{ - bool matched = false; - struct mem_cgroup *memcg; - size_t folio_sz; - - switch (filter->type) { - case DAMOS_FILTER_TYPE_ANON: - matched = folio_test_anon(folio); - break; - case DAMOS_FILTER_TYPE_ACTIVE: - matched = folio_test_active(folio); - break; - case DAMOS_FILTER_TYPE_MEMCG: - rcu_read_lock(); - memcg = folio_memcg_check(folio); - if (!memcg) - matched = false; - else - matched = filter->memcg_id == mem_cgroup_id(memcg); - rcu_read_unlock(); - break; - case DAMOS_FILTER_TYPE_YOUNG: - matched = damon_folio_young(folio); - if (matched) - damon_folio_mkold(folio); - break; - case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE: - folio_sz = folio_size(folio); - matched = filter->sz_range.min <= folio_sz && - folio_sz <= filter->sz_range.max; - break; - case DAMOS_FILTER_TYPE_UNMAPPED: - matched = !folio_mapped(folio) || !folio_raw_mapping(folio); - 
break; - default: - break; - } - - return matched == filter->matching; -} - /* * damos_pa_filter_out - Return true if the page should be filtered out. */ @@ -260,7 +108,7 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio) return false; damos_for_each_ops_filter(filter, scheme) { - if (damos_pa_filter_match(filter, folio)) + if (damos_folio_filter_match(filter, folio)) return !filter->allow; } return scheme->ops_filters_default_reject; @@ -381,127 +229,6 @@ static unsigned long damon_pa_deactivate_pages(struct damon_region *r, sz_filter_passed); } -static unsigned int __damon_pa_migrate_folio_list( - struct list_head *migrate_folios, struct pglist_data *pgdat, - int target_nid) -{ - unsigned int nr_succeeded = 0; - nodemask_t allowed_mask = NODE_MASK_NONE; - struct migration_target_control mtc = { - /* - * Allocate from 'node', or fail quickly and quietly. - * When this happens, 'page' will likely just be discarded - * instead of migrated. - */ - .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | - __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT, - .nid = target_nid, - .nmask = &allowed_mask - }; - - if (pgdat->node_id == target_nid || target_nid == NUMA_NO_NODE) - return 0; - - if (list_empty(migrate_folios)) - return 0; - - /* Migration ignores all cpuset and mempolicy settings */ - migrate_pages(migrate_folios, alloc_migrate_folio, NULL, - (unsigned long)&mtc, MIGRATE_ASYNC, MR_DAMON, - &nr_succeeded); - - return nr_succeeded; -} - -static unsigned int damon_pa_migrate_folio_list(struct list_head *folio_list, - struct pglist_data *pgdat, - int target_nid) -{ - unsigned int nr_migrated = 0; - struct folio *folio; - LIST_HEAD(ret_folios); - LIST_HEAD(migrate_folios); - - while (!list_empty(folio_list)) { - struct folio *folio; - - cond_resched(); - - folio = lru_to_folio(folio_list); - list_del(&folio->lru); - - if (!folio_trylock(folio)) - goto keep; - - /* Relocate its contents to another node. */ - list_add(&folio->lru, &migrate_folios); - folio_unlock(folio); - continue; -keep: - list_add(&folio->lru, &ret_folios); - } - /* 'folio_list' is always empty here */ - - /* Migrate folios selected for migration */ - nr_migrated += __damon_pa_migrate_folio_list( - &migrate_folios, pgdat, target_nid); - /* - * Folios that could not be migrated are still in @migrate_folios. 
Add - * those back on @folio_list - */ - if (!list_empty(&migrate_folios)) - list_splice_init(&migrate_folios, folio_list); - - try_to_unmap_flush(); - - list_splice(&ret_folios, folio_list); - - while (!list_empty(folio_list)) { - folio = lru_to_folio(folio_list); - list_del(&folio->lru); - folio_putback_lru(folio); - } - - return nr_migrated; -} - -static unsigned long damon_pa_migrate_pages(struct list_head *folio_list, - int target_nid) -{ - int nid; - unsigned long nr_migrated = 0; - LIST_HEAD(node_folio_list); - unsigned int noreclaim_flag; - - if (list_empty(folio_list)) - return nr_migrated; - - noreclaim_flag = memalloc_noreclaim_save(); - - nid = folio_nid(lru_to_folio(folio_list)); - do { - struct folio *folio = lru_to_folio(folio_list); - - if (nid == folio_nid(folio)) { - list_move(&folio->lru, &node_folio_list); - continue; - } - - nr_migrated += damon_pa_migrate_folio_list(&node_folio_list, - NODE_DATA(nid), - target_nid); - nid = folio_nid(lru_to_folio(folio_list)); - } while (!list_empty(folio_list)); - - nr_migrated += damon_pa_migrate_folio_list(&node_folio_list, - NODE_DATA(nid), - target_nid); - - memalloc_noreclaim_restore(noreclaim_flag); - - return nr_migrated; -} - static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s, unsigned long *sz_filter_passed) { @@ -529,7 +256,7 @@ put_folio: addr += folio_size(folio); folio_put(folio); } - applied = damon_pa_migrate_pages(&folio_list, s->target_nid); + applied = damon_migrate_pages(&folio_list, s->target_nid); cond_resched(); s->last_applied = folio; return applied * PAGE_SIZE; @@ -548,7 +275,6 @@ static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s, unsigned long *sz_filter_passed) { unsigned long addr; - LIST_HEAD(folio_list); struct folio *folio; if (!damon_pa_scheme_has_filter(s)) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index a675150965e0..3c71b4596676 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -194,7 +194,7 @@ static int damon_reclaim_apply_parameters(void) if (err) return err; - err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs); + err = damon_set_attrs(param_ctx, &damon_reclaim_mon_attrs); if (err) goto out; @@ -202,7 +202,7 @@ static int damon_reclaim_apply_parameters(void) scheme = damon_reclaim_new_scheme(); if (!scheme) goto out; - damon_set_schemes(ctx, &scheme, 1); + damon_set_schemes(param_ctx, &scheme, 1); if (quota_mem_pressure_us) { goal = damos_new_quota_goal(DAMOS_QUOTA_SOME_MEM_PSI_US, @@ -238,6 +238,35 @@ out: return err; } +static int damon_reclaim_handle_commit_inputs(void) +{ + int err; + + if (!commit_inputs) + return 0; + + err = damon_reclaim_apply_parameters(); + commit_inputs = false; + return err; +} + +static int damon_reclaim_damon_call_fn(void *arg) +{ + struct damon_ctx *c = arg; + struct damos *s; + + /* update the stats parameter */ + damon_for_each_scheme(s, c) + damon_reclaim_stat = s->stat; + + return damon_reclaim_handle_commit_inputs(); +} + +static struct damon_call_control call_control = { + .fn = damon_reclaim_damon_call_fn, + .repeat = true, +}; + static int damon_reclaim_turn(bool on) { int err; @@ -257,7 +286,7 @@ static int damon_reclaim_turn(bool on) if (err) return err; kdamond_pid = ctx->kdamond->pid; - return 0; + return damon_call(ctx, &call_control); } static int damon_reclaim_enabled_store(const char *val, @@ -296,48 +325,22 @@ module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); MODULE_PARM_DESC(enabled, "Enable or disable DAMON_RECLAIM (default: disabled)"); -static int 
damon_reclaim_handle_commit_inputs(void) -{ - int err; - - if (!commit_inputs) - return 0; - - err = damon_reclaim_apply_parameters(); - commit_inputs = false; - return err; -} - -static int damon_reclaim_after_aggregation(struct damon_ctx *c) -{ - struct damos *s; - - /* update the stats parameter */ - damon_for_each_scheme(s, c) - damon_reclaim_stat = s->stat; - - return damon_reclaim_handle_commit_inputs(); -} - -static int damon_reclaim_after_wmarks_check(struct damon_ctx *c) -{ - return damon_reclaim_handle_commit_inputs(); -} - static int __init damon_reclaim_init(void) { int err = damon_modules_new_paddr_ctx_target(&ctx, &target); if (err) - return err; + goto out; - ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check; - ctx->callback.after_aggregation = damon_reclaim_after_aggregation; + call_control.data = ctx; /* 'enabled' has set before this function, probably via command line */ if (enabled) err = damon_reclaim_turn(true); +out: + if (err && enabled) + enabled = false; return err; } diff --git a/mm/damon/stat.c b/mm/damon/stat.c new file mode 100644 index 000000000000..87bcd8866d4b --- /dev/null +++ b/mm/damon/stat.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shows data access monitoring resutls in simple metrics. + */ + +#define pr_fmt(fmt) "damon-stat: " fmt + +#include <linux/damon.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sort.h> + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "damon_stat." + +static int damon_stat_enabled_store( + const char *val, const struct kernel_param *kp); + +static const struct kernel_param_ops enabled_param_ops = { + .set = damon_stat_enabled_store, + .get = param_get_bool, +}; + +static bool enabled __read_mostly = IS_ENABLED( + CONFIG_DAMON_STAT_ENABLED_DEFAULT); +module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); +MODULE_PARM_DESC(enabled, "Enable of disable DAMON_STAT"); + +static unsigned long estimated_memory_bandwidth __read_mostly; +module_param(estimated_memory_bandwidth, ulong, 0400); +MODULE_PARM_DESC(estimated_memory_bandwidth, + "Estimated memory bandwidth usage in bytes per second"); + +static unsigned long memory_idle_ms_percentiles[101] __read_mostly = {0,}; +module_param_array(memory_idle_ms_percentiles, ulong, NULL, 0400); +MODULE_PARM_DESC(memory_idle_ms_percentiles, + "Memory idle time percentiles in milliseconds"); + +static struct damon_ctx *damon_stat_context; + +static void damon_stat_set_estimated_memory_bandwidth(struct damon_ctx *c) +{ + struct damon_target *t; + struct damon_region *r; + unsigned long access_bytes = 0; + + damon_for_each_target(t, c) { + damon_for_each_region(r, t) + access_bytes += (r->ar.end - r->ar.start) * + r->nr_accesses; + } + estimated_memory_bandwidth = access_bytes * USEC_PER_MSEC * + MSEC_PER_SEC / c->attrs.aggr_interval; +} + +static unsigned int damon_stat_idletime(const struct damon_region *r) +{ + if (r->nr_accesses) + return 0; + return r->age + 1; +} + +static int damon_stat_cmp_regions(const void *a, const void *b) +{ + const struct damon_region *ra = *(const struct damon_region **)a; + const struct damon_region *rb = *(const struct damon_region **)b; + + return damon_stat_idletime(ra) - damon_stat_idletime(rb); +} + +static int damon_stat_sort_regions(struct damon_ctx *c, + struct damon_region ***sorted_ptr, int *nr_regions_ptr, + unsigned long *total_sz_ptr) +{ + struct damon_target *t; + struct damon_region *r; + struct damon_region 
**region_pointers; + unsigned int nr_regions = 0; + unsigned long total_sz = 0; + + damon_for_each_target(t, c) { + /* there is only one target */ + region_pointers = kmalloc_array(damon_nr_regions(t), + sizeof(*region_pointers), GFP_KERNEL); + if (!region_pointers) + return -ENOMEM; + damon_for_each_region(r, t) { + region_pointers[nr_regions++] = r; + total_sz += r->ar.end - r->ar.start; + } + } + sort(region_pointers, nr_regions, sizeof(*region_pointers), + damon_stat_cmp_regions, NULL); + *sorted_ptr = region_pointers; + *nr_regions_ptr = nr_regions; + *total_sz_ptr = total_sz; + return 0; +} + +static void damon_stat_set_idletime_percentiles(struct damon_ctx *c) +{ + struct damon_region **sorted_regions, *region; + int nr_regions; + unsigned long total_sz, accounted_bytes = 0; + int err, i, next_percentile = 0; + + err = damon_stat_sort_regions(c, &sorted_regions, &nr_regions, + &total_sz); + if (err) + return; + for (i = 0; i < nr_regions; i++) { + region = sorted_regions[i]; + accounted_bytes += region->ar.end - region->ar.start; + while (next_percentile <= accounted_bytes * 100 / total_sz) + memory_idle_ms_percentiles[next_percentile++] = + damon_stat_idletime(region) * + c->attrs.aggr_interval / USEC_PER_MSEC; + } + kfree(sorted_regions); +} + +static int damon_stat_damon_call_fn(void *data) +{ + struct damon_ctx *c = data; + static unsigned long last_refresh_jiffies; + + /* avoid unnecessarily frequent stat update */ + if (time_before_eq(jiffies, last_refresh_jiffies + + msecs_to_jiffies(5 * MSEC_PER_SEC))) + return 0; + last_refresh_jiffies = jiffies; + + damon_stat_set_estimated_memory_bandwidth(c); + damon_stat_set_idletime_percentiles(c); + return 0; +} + +static struct damon_ctx *damon_stat_build_ctx(void) +{ + struct damon_ctx *ctx; + struct damon_attrs attrs; + struct damon_target *target; + unsigned long start = 0, end = 0; + + ctx = damon_new_ctx(); + if (!ctx) + return NULL; + attrs = (struct damon_attrs) { + .sample_interval = 5 * USEC_PER_MSEC, + .aggr_interval = 100 * USEC_PER_MSEC, + .ops_update_interval = 60 * USEC_PER_MSEC * MSEC_PER_SEC, + .min_nr_regions = 10, + .max_nr_regions = 1000, + }; + /* + * auto-tune sampling and aggregation interval aiming 4% DAMON-observed + * accesses ratio, keeping sampling interval in [5ms, 10s] range. + */ + attrs.intervals_goal = (struct damon_intervals_goal) { + .access_bp = 400, .aggrs = 3, + .min_sample_us = 5000, .max_sample_us = 10000000, + }; + if (damon_set_attrs(ctx, &attrs)) + goto free_out; + + /* + * auto-tune sampling and aggregation interval aiming 4% DAMON-observed + * accesses ratio, keeping sampling interval in [5ms, 10s] range. 
+ */ + ctx->attrs.intervals_goal = (struct damon_intervals_goal) { + .access_bp = 400, .aggrs = 3, + .min_sample_us = 5000, .max_sample_us = 10000000, + }; + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) + goto free_out; + + target = damon_new_target(); + if (!target) + goto free_out; + damon_add_target(ctx, target); + if (damon_set_region_biggest_system_ram_default(target, &start, &end)) + goto free_out; + return ctx; +free_out: + damon_destroy_ctx(ctx); + return NULL; +} + +static struct damon_call_control call_control = { + .fn = damon_stat_damon_call_fn, + .repeat = true, +}; + +static int damon_stat_start(void) +{ + int err; + + damon_stat_context = damon_stat_build_ctx(); + if (!damon_stat_context) + return -ENOMEM; + err = damon_start(&damon_stat_context, 1, true); + if (err) + return err; + call_control.data = damon_stat_context; + return damon_call(damon_stat_context, &call_control); +} + +static void damon_stat_stop(void) +{ + damon_stop(&damon_stat_context, 1); + damon_destroy_ctx(damon_stat_context); +} + +static bool damon_stat_init_called; + +static int damon_stat_enabled_store( + const char *val, const struct kernel_param *kp) +{ + bool is_enabled = enabled; + int err; + + err = kstrtobool(val, &enabled); + if (err) + return err; + + if (is_enabled == enabled) + return 0; + + if (!damon_stat_init_called) + /* + * probably called from command line parsing (parse_args()). + * Cannot call damon_new_ctx(). Let damon_stat_init() handle. + */ + return 0; + + if (enabled) { + err = damon_stat_start(); + if (err) + enabled = false; + return err; + } + damon_stat_stop(); + return 0; +} + +static int __init damon_stat_init(void) +{ + int err = 0; + + damon_stat_init_called = true; + + /* probably set via command line */ + if (enabled) + err = damon_stat_start(); + + if (err && enabled) + enabled = false; + return err; +} + +module_init(damon_stat_init); diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c index 70edf45c2174..ffaf285e241a 100644 --- a/mm/damon/sysfs-common.c +++ b/mm/damon/sysfs-common.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Common Primitives for DAMON Sysfs Interface + * Common Code for DAMON Sysfs Interface * * Author: SeongJae Park <sj@kernel.org> */ diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 70d84bdc9f5f..2099adee11d0 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Common Primitives for DAMON Sysfs Interface + * Common Code for DAMON Sysfs Interface * * Author: SeongJae Park <sj@kernel.org> */ diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 23b562df0839..74056bcd6a2c 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -341,16 +341,45 @@ static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc( return filter; } -/* Should match with enum damos_filter_type */ -static const char * const damon_sysfs_scheme_filter_type_strs[] = { - "anon", - "active", - "memcg", - "young", - "hugepage_size", - "unmapped", - "addr", - "target", +struct damos_sysfs_filter_type_name { + enum damos_filter_type type; + char *name; +}; + +static const struct damos_sysfs_filter_type_name +damos_sysfs_filter_type_names[] = { + { + .type = DAMOS_FILTER_TYPE_ANON, + .name = "anon", + }, + { + .type = DAMOS_FILTER_TYPE_ACTIVE, + .name = "active", + }, + { + .type = DAMOS_FILTER_TYPE_MEMCG, + .name = "memcg", + }, + { + .type = DAMOS_FILTER_TYPE_YOUNG, + .name = "young", + }, + { + .type = 
DAMOS_FILTER_TYPE_HUGEPAGE_SIZE, + .name = "hugepage_size", + }, + { + .type = DAMOS_FILTER_TYPE_UNMAPPED, + .name = "unmapped", + }, + { + .type = DAMOS_FILTER_TYPE_ADDR, + .name = "addr", + }, + { + .type = DAMOS_FILTER_TYPE_TARGET, + .name = "target", + }, }; static ssize_t type_show(struct kobject *kobj, @@ -358,9 +387,16 @@ static ssize_t type_show(struct kobject *kobj, { struct damon_sysfs_scheme_filter *filter = container_of(kobj, struct damon_sysfs_scheme_filter, kobj); + int i; - return sysfs_emit(buf, "%s\n", - damon_sysfs_scheme_filter_type_strs[filter->type]); + for (i = 0; i < ARRAY_SIZE(damos_sysfs_filter_type_names); i++) { + const struct damos_sysfs_filter_type_name *type_name; + + type_name = &damos_sysfs_filter_type_names[i]; + if (type_name->type == filter->type) + return sysfs_emit(buf, "%s\n", type_name->name); + } + return -EINVAL; } static bool damos_sysfs_scheme_filter_valid_type( @@ -385,16 +421,19 @@ static ssize_t type_store(struct kobject *kobj, { struct damon_sysfs_scheme_filter *filter = container_of(kobj, struct damon_sysfs_scheme_filter, kobj); - enum damos_filter_type type; ssize_t ret = -EINVAL; + int i; - for (type = 0; type < NR_DAMOS_FILTER_TYPES; type++) { - if (sysfs_streq(buf, damon_sysfs_scheme_filter_type_strs[ - type])) { + for (i = 0; i < ARRAY_SIZE(damos_sysfs_filter_type_names); i++) { + const struct damos_sysfs_filter_type_name *type_name; + + type_name = &damos_sysfs_filter_type_names[i]; + if (sysfs_streq(buf, type_name->name)) { if (!damos_sysfs_scheme_filter_valid_type( - filter->handle_layer, type)) + filter->handle_layer, + type_name->type)) break; - filter->type = type; + filter->type = type_name->type; ret = count; break; } @@ -465,12 +504,14 @@ static ssize_t memcg_path_store(struct kobject *kobj, { struct damon_sysfs_scheme_filter *filter = container_of(kobj, struct damon_sysfs_scheme_filter, kobj); - char *path = kmalloc(sizeof(*path) * (count + 1), GFP_KERNEL); + char *path = kmalloc_array(size_add(count, 1), sizeof(*path), + GFP_KERNEL); if (!path) return -ENOMEM; strscpy(path, buf, count + 1); + kfree(filter->memcg_path); filter->memcg_path = path; return count; } @@ -783,10 +824,21 @@ static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc( return watermarks; } -/* Should match with enum damos_wmark_metric */ -static const char * const damon_sysfs_wmark_metric_strs[] = { - "none", - "free_mem_rate", +struct damos_sysfs_wmark_metric_name { + enum damos_wmark_metric metric; + char *name; +}; + +static const struct damos_sysfs_wmark_metric_name +damos_sysfs_wmark_metric_names[] = { + { + .metric = DAMOS_WMARK_NONE, + .name = "none", + }, + { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .name = "free_mem_rate", + }, }; static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -794,9 +846,16 @@ static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_watermarks *watermarks = container_of(kobj, struct damon_sysfs_watermarks, kobj); + int i; - return sysfs_emit(buf, "%s\n", - damon_sysfs_wmark_metric_strs[watermarks->metric]); + for (i = 0; i < ARRAY_SIZE(damos_sysfs_wmark_metric_names); i++) { + const struct damos_sysfs_wmark_metric_name *metric_name; + + metric_name = &damos_sysfs_wmark_metric_names[i]; + if (metric_name->metric == watermarks->metric) + return sysfs_emit(buf, "%s\n", metric_name->name); + } + return -EINVAL; } static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -804,11 +863,14 @@ static ssize_t metric_store(struct 
kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_watermarks *watermarks = container_of(kobj, struct damon_sysfs_watermarks, kobj); - enum damos_wmark_metric metric; + int i; + + for (i = 0; i < ARRAY_SIZE(damos_sysfs_wmark_metric_names); i++) { + const struct damos_sysfs_wmark_metric_name *metric_name; - for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) { - if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) { - watermarks->metric = metric; + metric_name = &damos_sysfs_wmark_metric_names[i]; + if (sysfs_streq(buf, metric_name->name)) { + watermarks->metric = metric_name->metric; return count; } } @@ -936,12 +998,7 @@ struct damos_sysfs_quota_goal { enum damos_quota_goal_metric metric; unsigned long target_value; unsigned long current_value; -}; - -/* This should match with enum damos_action */ -static const char * const damos_sysfs_quota_goal_metric_strs[] = { - "user_input", - "some_mem_psi_us", + int nid; }; static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void) @@ -949,14 +1006,46 @@ static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void) return kzalloc(sizeof(struct damos_sysfs_quota_goal), GFP_KERNEL); } +struct damos_sysfs_qgoal_metric_name { + enum damos_quota_goal_metric metric; + char *name; +}; + +static +struct damos_sysfs_qgoal_metric_name damos_sysfs_qgoal_metric_names[] = { + { + .metric = DAMOS_QUOTA_USER_INPUT, + .name = "user_input", + }, + { + .metric = DAMOS_QUOTA_SOME_MEM_PSI_US, + .name = "some_mem_psi_us", + }, + { + .metric = DAMOS_QUOTA_NODE_MEM_USED_BP, + .name = "node_mem_used_bp", + }, + { + .metric = DAMOS_QUOTA_NODE_MEM_FREE_BP, + .name = "node_mem_free_bp", + }, +}; + static ssize_t target_metric_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct damos_sysfs_quota_goal *goal = container_of(kobj, struct damos_sysfs_quota_goal, kobj); + int i; - return sysfs_emit(buf, "%s\n", - damos_sysfs_quota_goal_metric_strs[goal->metric]); + for (i = 0; i < ARRAY_SIZE(damos_sysfs_qgoal_metric_names); i++) { + struct damos_sysfs_qgoal_metric_name *metric_name; + + metric_name = &damos_sysfs_qgoal_metric_names[i]; + if (metric_name->metric == goal->metric) + return sysfs_emit(buf, "%s\n", metric_name->name); + } + return -EINVAL; } static ssize_t target_metric_store(struct kobject *kobj, @@ -964,11 +1053,14 @@ static ssize_t target_metric_store(struct kobject *kobj, { struct damos_sysfs_quota_goal *goal = container_of(kobj, struct damos_sysfs_quota_goal, kobj); - enum damos_quota_goal_metric m; + int i; + + for (i = 0; i < ARRAY_SIZE(damos_sysfs_qgoal_metric_names); i++) { + struct damos_sysfs_qgoal_metric_name *metric_name; - for (m = 0; m < NR_DAMOS_QUOTA_GOAL_METRICS; m++) { - if (sysfs_streq(buf, damos_sysfs_quota_goal_metric_strs[m])) { - goal->metric = m; + metric_name = &damos_sysfs_qgoal_metric_names[i]; + if (sysfs_streq(buf, metric_name->name)) { + goal->metric = metric_name->metric; return count; } } @@ -1014,6 +1106,28 @@ static ssize_t current_value_store(struct kobject *kobj, return err ? 
err : count; } +static ssize_t nid_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damos_sysfs_quota_goal *goal = container_of(kobj, struct + damos_sysfs_quota_goal, kobj); + + /* todo: return error if the goal is not using nid */ + + return sysfs_emit(buf, "%d\n", goal->nid); +} + +static ssize_t nid_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damos_sysfs_quota_goal *goal = container_of(kobj, struct + damos_sysfs_quota_goal, kobj); + int err = kstrtoint(buf, 0, &goal->nid); + + /* feed callback should check existence of this file and read value */ + return err ? err : count; +} + static void damos_sysfs_quota_goal_release(struct kobject *kobj) { /* or, notify this release to the feed callback */ @@ -1029,10 +1143,14 @@ static struct kobj_attribute damos_sysfs_quota_goal_target_value_attr = static struct kobj_attribute damos_sysfs_quota_goal_current_value_attr = __ATTR_RW_MODE(current_value, 0600); +static struct kobj_attribute damos_sysfs_quota_goal_nid_attr = + __ATTR_RW_MODE(nid, 0600); + static struct attribute *damos_sysfs_quota_goal_attrs[] = { &damos_sysfs_quota_goal_target_metric_attr.attr, &damos_sysfs_quota_goal_target_value_attr.attr, &damos_sysfs_quota_goal_current_value_attr.attr, + &damos_sysfs_quota_goal_nid_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damos_sysfs_quota_goal); @@ -1538,6 +1656,204 @@ static const struct kobj_type damon_sysfs_access_pattern_ktype = { }; /* + * dest (action destination) directory + */ + +struct damos_sysfs_dest { + struct kobject kobj; + unsigned int id; + unsigned int weight; +}; + +static struct damos_sysfs_dest *damos_sysfs_dest_alloc(void) +{ + return kzalloc(sizeof(struct damos_sysfs_dest), GFP_KERNEL); +} + +static ssize_t id_show( + struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + struct damos_sysfs_dest *dest = container_of(kobj, + struct damos_sysfs_dest, kobj); + + return sysfs_emit(buf, "%u\n", dest->id); +} + +static ssize_t id_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damos_sysfs_dest *dest = container_of(kobj, + struct damos_sysfs_dest, kobj); + int err = kstrtouint(buf, 0, &dest->id); + + return err ? err : count; +} + +static ssize_t weight_show( + struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + struct damos_sysfs_dest *dest = container_of(kobj, + struct damos_sysfs_dest, kobj); + + return sysfs_emit(buf, "%u\n", dest->weight); +} + +static ssize_t weight_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damos_sysfs_dest *dest = container_of(kobj, + struct damos_sysfs_dest, kobj); + int err = kstrtouint(buf, 0, &dest->weight); + + return err ? 
err : count; +} + +static void damos_sysfs_dest_release(struct kobject *kobj) +{ + struct damos_sysfs_dest *dest = container_of(kobj, + struct damos_sysfs_dest, kobj); + kfree(dest); +} + +static struct kobj_attribute damos_sysfs_dest_id_attr = + __ATTR_RW_MODE(id, 0600); + +static struct kobj_attribute damos_sysfs_dest_weight_attr = + __ATTR_RW_MODE(weight, 0600); + +static struct attribute *damos_sysfs_dest_attrs[] = { + &damos_sysfs_dest_id_attr.attr, + &damos_sysfs_dest_weight_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damos_sysfs_dest); + +static const struct kobj_type damos_sysfs_dest_ktype = { + .release = damos_sysfs_dest_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damos_sysfs_dest_groups, +}; + +/* + * dests (action destinations) directory + */ + +struct damos_sysfs_dests { + struct kobject kobj; + struct damos_sysfs_dest **dests_arr; + int nr; +}; + +static struct damos_sysfs_dests * +damos_sysfs_dests_alloc(void) +{ + return kzalloc(sizeof(struct damos_sysfs_dests), GFP_KERNEL); +} + +static void damos_sysfs_dests_rm_dirs( + struct damos_sysfs_dests *dests) +{ + struct damos_sysfs_dest **dests_arr = dests->dests_arr; + int i; + + for (i = 0; i < dests->nr; i++) + kobject_put(&dests_arr[i]->kobj); + dests->nr = 0; + kfree(dests_arr); + dests->dests_arr = NULL; +} + +static int damos_sysfs_dests_add_dirs( + struct damos_sysfs_dests *dests, int nr_dests) +{ + struct damos_sysfs_dest **dests_arr, *dest; + int err, i; + + damos_sysfs_dests_rm_dirs(dests); + if (!nr_dests) + return 0; + + dests_arr = kmalloc_array(nr_dests, sizeof(*dests_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!dests_arr) + return -ENOMEM; + dests->dests_arr = dests_arr; + + for (i = 0; i < nr_dests; i++) { + dest = damos_sysfs_dest_alloc(); + if (!dest) { + damos_sysfs_dests_rm_dirs(dests); + return -ENOMEM; + } + + err = kobject_init_and_add(&dest->kobj, + &damos_sysfs_dest_ktype, + &dests->kobj, "%d", i); + if (err) { + kobject_put(&dest->kobj); + damos_sysfs_dests_rm_dirs(dests); + return err; + } + + dests_arr[i] = dest; + dests->nr++; + } + return 0; +} + +static ssize_t nr_dests_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damos_sysfs_dests *dests = container_of(kobj, + struct damos_sysfs_dests, kobj); + + return sysfs_emit(buf, "%d\n", dests->nr); +} + +static ssize_t nr_dests_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damos_sysfs_dests *dests; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + dests = container_of(kobj, struct damos_sysfs_dests, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damos_sysfs_dests_add_dirs(dests, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damos_sysfs_dests_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damos_sysfs_dests, kobj)); +} + +static struct kobj_attribute damos_sysfs_dests_nr_attr = + __ATTR_RW_MODE(nr_dests, 0600); + +static struct attribute *damos_sysfs_dests_attrs[] = { + &damos_sysfs_dests_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damos_sysfs_dests); + +static const struct kobj_type damos_sysfs_dests_ktype = { + .release = damos_sysfs_dests_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damos_sysfs_dests_groups, +}; + +/* * scheme directory */ @@ -1554,20 +1870,55 @@ struct damon_sysfs_scheme { struct damon_sysfs_stats *stats; struct damon_sysfs_scheme_regions *tried_regions; int 
target_nid; + struct damos_sysfs_dests *dests; +}; + +struct damos_sysfs_action_name { + enum damos_action action; + char *name; }; -/* This should match with enum damos_action */ -static const char * const damon_sysfs_damos_action_strs[] = { - "willneed", - "cold", - "pageout", - "hugepage", - "nohugepage", - "lru_prio", - "lru_deprio", - "migrate_hot", - "migrate_cold", - "stat", +static struct damos_sysfs_action_name damos_sysfs_action_names[] = { + { + .action = DAMOS_WILLNEED, + .name = "willneed", + }, + { + .action = DAMOS_COLD, + .name = "cold", + }, + { + .action = DAMOS_PAGEOUT, + .name = "pageout", + }, + { + .action = DAMOS_HUGEPAGE, + .name = "hugepage", + }, + { + .action = DAMOS_NOHUGEPAGE, + .name = "nohugepage", + }, + { + .action = DAMOS_LRU_PRIO, + .name = "lru_prio", + }, + { + .action = DAMOS_LRU_DEPRIO, + .name = "lru_deprio", + }, + { + .action = DAMOS_MIGRATE_HOT, + .name = "migrate_hot", + }, + { + .action = DAMOS_MIGRATE_COLD, + .name = "migrate_cold", + }, + { + .action = DAMOS_STAT, + .name = "stat", + }, }; static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( @@ -1610,6 +1961,22 @@ out: return err; } +static int damos_sysfs_set_dests(struct damon_sysfs_scheme *scheme) +{ + struct damos_sysfs_dests *dests = damos_sysfs_dests_alloc(); + int err; + + if (!dests) + return -ENOMEM; + err = kobject_init_and_add(&dests->kobj, &damos_sysfs_dests_ktype, + &scheme->kobj, "dests"); + if (err) + kobject_put(&dests->kobj); + else + scheme->dests = dests; + return err; +} + static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme) { struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc(); @@ -1742,9 +2109,12 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) err = damon_sysfs_scheme_set_access_pattern(scheme); if (err) return err; - err = damon_sysfs_scheme_set_quotas(scheme); + err = damos_sysfs_set_dests(scheme); if (err) goto put_access_pattern_out; + err = damon_sysfs_scheme_set_quotas(scheme); + if (err) + goto put_dests_out; err = damon_sysfs_scheme_set_watermarks(scheme); if (err) goto put_quotas_access_pattern_out; @@ -1775,6 +2145,9 @@ put_watermarks_quotas_access_pattern_out: put_quotas_access_pattern_out: kobject_put(&scheme->quotas->kobj); scheme->quotas = NULL; +put_dests_out: + kobject_put(&scheme->dests->kobj); + scheme->dests = NULL; put_access_pattern_out: kobject_put(&scheme->access_pattern->kobj); scheme->access_pattern = NULL; @@ -1785,6 +2158,8 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) { damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); kobject_put(&scheme->access_pattern->kobj); + kobject_put(&scheme->dests->kobj); + damos_sysfs_dests_rm_dirs(scheme->dests); damon_sysfs_quotas_rm_dirs(scheme->quotas); kobject_put(&scheme->quotas->kobj); kobject_put(&scheme->watermarks->kobj); @@ -1804,9 +2179,16 @@ static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_scheme *scheme = container_of(kobj, struct damon_sysfs_scheme, kobj); + int i; - return sysfs_emit(buf, "%s\n", - damon_sysfs_damos_action_strs[scheme->action]); + for (i = 0; i < ARRAY_SIZE(damos_sysfs_action_names); i++) { + struct damos_sysfs_action_name *action_name; + + action_name = &damos_sysfs_action_names[i]; + if (action_name->action == scheme->action) + return sysfs_emit(buf, "%s\n", action_name->name); + } + return -EINVAL; } static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1814,11 +2196,14 @@ static ssize_t 
action_store(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_scheme *scheme = container_of(kobj, struct damon_sysfs_scheme, kobj); - enum damos_action action; + int i; - for (action = 0; action < NR_DAMOS_ACTIONS; action++) { - if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) { - scheme->action = action; + for (i = 0; i < ARRAY_SIZE(damos_sysfs_action_names); i++) { + struct damos_sysfs_action_name *action_name; + + action_name = &damos_sysfs_action_names[i]; + if (sysfs_streq(buf, action_name->name)) { + scheme->action = action_name->action; return count; } } @@ -2035,7 +2420,7 @@ static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id) if (!memcg_path) return -EINVAL; - path = kmalloc(sizeof(*path) * PATH_MAX, GFP_KERNEL); + path = kmalloc_array(PATH_MAX, sizeof(*path), GFP_KERNEL); if (!path) return -ENOMEM; @@ -2120,8 +2505,17 @@ static int damos_sysfs_add_quota_score( sysfs_goal->target_value); if (!goal) return -ENOMEM; - if (sysfs_goal->metric == DAMOS_QUOTA_USER_INPUT) + switch (sysfs_goal->metric) { + case DAMOS_QUOTA_USER_INPUT: goal->current_value = sysfs_goal->current_value; + break; + case DAMOS_QUOTA_NODE_MEM_USED_BP: + case DAMOS_QUOTA_NODE_MEM_FREE_BP: + goal->nid = sysfs_goal->nid; + break; + default: + break; + } damos_add_quota_goal(quota, goal); } return 0; @@ -2182,6 +2576,29 @@ void damos_sysfs_update_effective_quotas( } } +static int damos_sysfs_add_migrate_dest(struct damos *scheme, + struct damos_sysfs_dests *sysfs_dests) +{ + struct damos_migrate_dests *dests = &scheme->migrate_dests; + int i; + + dests->node_id_arr = kmalloc_array(sysfs_dests->nr, + sizeof(*dests->node_id_arr), GFP_KERNEL); + if (!dests->node_id_arr) + return -ENOMEM; + dests->weight_arr = kmalloc_array(sysfs_dests->nr, + sizeof(*dests->weight_arr), GFP_KERNEL); + if (!dests->weight_arr) + /* ->node_id_arr will be freed by scheme destruction */ + return -ENOMEM; + for (i = 0; i < sysfs_dests->nr; i++) { + dests->node_id_arr[i] = sysfs_dests->dests_arr[i]->id; + dests->weight_arr[i] = sysfs_dests->dests_arr[i]->weight; + } + dests->nr_dests = sysfs_dests->nr; + return 0; +} + static struct damos *damon_sysfs_mk_scheme( struct damon_sysfs_scheme *sysfs_scheme) { @@ -2244,6 +2661,11 @@ static struct damos *damon_sysfs_mk_scheme( damon_destroy_scheme(scheme); return NULL; } + err = damos_sysfs_add_migrate_dest(scheme, sysfs_scheme->dests); + if (err) { + damon_destroy_scheme(scheme); + return NULL; + } return scheme; } diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 1af6aff35d84..6d2b0dab50cb 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -811,11 +811,24 @@ static const struct kobj_type damon_sysfs_attrs_ktype = { * context directory */ -/* This should match with enum damon_ops_id */ -static const char * const damon_sysfs_ops_strs[] = { - "vaddr", - "fvaddr", - "paddr", +struct damon_sysfs_ops_name { + enum damon_ops_id ops_id; + char *name; +}; + +static const struct damon_sysfs_ops_name damon_sysfs_ops_names[] = { + { + .ops_id = DAMON_OPS_VADDR, + .name = "vaddr", + }, + { + .ops_id = DAMON_OPS_FVADDR, + .name = "fvaddr", + }, + { + .ops_id = DAMON_OPS_PADDR, + .name = "paddr", + }, }; struct damon_sysfs_context { @@ -934,14 +947,16 @@ static void damon_sysfs_context_rm_dirs(struct damon_sysfs_context *context) static ssize_t avail_operations_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - enum damon_ops_id id; int len = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(damon_sysfs_ops_names); i++) { + const 
struct damon_sysfs_ops_name *ops_name; - for (id = 0; id < NR_DAMON_OPS; id++) { - if (!damon_is_registered_ops(id)) + ops_name = &damon_sysfs_ops_names[i]; + if (!damon_is_registered_ops(ops_name->ops_id)) continue; - len += sysfs_emit_at(buf, len, "%s\n", - damon_sysfs_ops_strs[id]); + len += sysfs_emit_at(buf, len, "%s\n", ops_name->name); } return len; } @@ -951,8 +966,16 @@ static ssize_t operations_show(struct kobject *kobj, { struct damon_sysfs_context *context = container_of(kobj, struct damon_sysfs_context, kobj); + int i; + + for (i = 0; i < ARRAY_SIZE(damon_sysfs_ops_names); i++) { + const struct damon_sysfs_ops_name *ops_name; - return sysfs_emit(buf, "%s\n", damon_sysfs_ops_strs[context->ops_id]); + ops_name = &damon_sysfs_ops_names[i]; + if (ops_name->ops_id == context->ops_id) + return sysfs_emit(buf, "%s\n", ops_name->name); + } + return -EINVAL; } static ssize_t operations_store(struct kobject *kobj, @@ -960,11 +983,14 @@ static ssize_t operations_store(struct kobject *kobj, { struct damon_sysfs_context *context = container_of(kobj, struct damon_sysfs_context, kobj); - enum damon_ops_id id; + int i; + + for (i = 0; i < ARRAY_SIZE(damon_sysfs_ops_names); i++) { + const struct damon_sysfs_ops_name *ops_name; - for (id = 0; id < NR_DAMON_OPS; id++) { - if (sysfs_streq(buf, damon_sysfs_ops_strs[id])) { - context->ops_id = id; + ops_name = &damon_sysfs_ops_names[i]; + if (sysfs_streq(buf, ops_name->name)) { + context->ops_id = ops_name->ops_id; return count; } } @@ -1129,6 +1155,7 @@ struct damon_sysfs_kdamond { struct kobject kobj; struct damon_sysfs_contexts *contexts; struct damon_ctx *damon_ctx; + unsigned int refresh_ms; }; static struct damon_sysfs_kdamond *damon_sysfs_kdamond_alloc(void) @@ -1163,16 +1190,6 @@ static void damon_sysfs_kdamond_rm_dirs(struct damon_sysfs_kdamond *kdamond) kobject_put(&kdamond->contexts->kobj); } -static bool damon_sysfs_ctx_running(struct damon_ctx *ctx) -{ - bool running; - - mutex_lock(&ctx->kdamond_lock); - running = ctx->kdamond != NULL; - mutex_unlock(&ctx->kdamond_lock); - return running; -} - /* * enum damon_sysfs_cmd - Commands for a specific kdamond. */ @@ -1249,7 +1266,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, if (!ctx) running = false; else - running = damon_sysfs_ctx_running(ctx); + running = damon_is_running(ctx); return sysfs_emit(buf, "%s\n", running ? 
damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] : @@ -1279,18 +1296,6 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, return damon_set_attrs(ctx, &attrs); } -static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) -{ - struct damon_target *t, *next; - bool has_pid = damon_target_has_pid(ctx); - - damon_for_each_target_safe(t, next, ctx) { - if (has_pid) - put_pid(t->pid); - damon_destroy_target(t); - } -} - static int damon_sysfs_set_regions(struct damon_target *t, struct damon_sysfs_regions *sysfs_regions) { @@ -1325,7 +1330,6 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, struct damon_ctx *ctx) { struct damon_target *t = damon_new_target(); - int err = -EINVAL; if (!t) return -ENOMEM; @@ -1333,16 +1337,10 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, if (damon_target_has_pid(ctx)) { t->pid = find_get_pid(sys_target->pid); if (!t->pid) - goto destroy_targets_out; + /* caller will destroy targets */ + return -EINVAL; } - err = damon_sysfs_set_regions(t, sys_target->regions); - if (err) - goto destroy_targets_out; - return 0; - -destroy_targets_out: - damon_sysfs_destroy_targets(ctx); - return err; + return damon_sysfs_set_regions(t, sys_target->regions); } static int damon_sysfs_add_targets(struct damon_ctx *ctx, @@ -1364,21 +1362,6 @@ static int damon_sysfs_add_targets(struct damon_ctx *ctx, return 0; } -static void damon_sysfs_before_terminate(struct damon_ctx *ctx) -{ - struct damon_target *t, *next; - - if (!damon_target_has_pid(ctx)) - return; - - mutex_lock(&ctx->kdamond_lock); - damon_for_each_target_safe(t, next, ctx) { - put_pid(t->pid); - damon_destroy_target(t); - } - mutex_unlock(&ctx->kdamond_lock); -} - /* * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files. * @data: The kobject wrapper that associated to the kdamond thread. 
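Throughout the sysfs changes above, index-keyed string arrays whose order had to mirror the backing enum (for example damon_sysfs_ops_strs[]) are replaced by explicit {enum, name} tables that both the show() and store() handlers scan. The standalone userspace sketch below illustrates that lookup pattern in isolation; the demo_* identifiers are illustrative rather than names added by the patch, and strcmp() stands in for the kernel's sysfs_streq().

```c
#include <stdio.h>
#include <string.h>

enum demo_ops_id { DEMO_OPS_VADDR, DEMO_OPS_FVADDR, DEMO_OPS_PADDR };

struct demo_ops_name {
	enum demo_ops_id ops_id;
	const char *name;
};

static const struct demo_ops_name demo_ops_names[] = {
	{ .ops_id = DEMO_OPS_VADDR, .name = "vaddr" },
	{ .ops_id = DEMO_OPS_FVADDR, .name = "fvaddr" },
	{ .ops_id = DEMO_OPS_PADDR, .name = "paddr" },
};

/* enum -> name: scan the table instead of indexing a string array */
static const char *demo_ops_name_of(enum demo_ops_id id)
{
	size_t i;

	for (i = 0; i < sizeof(demo_ops_names) / sizeof(demo_ops_names[0]); i++)
		if (demo_ops_names[i].ops_id == id)
			return demo_ops_names[i].name;
	return NULL;	/* unknown id; the kernel side returns -EINVAL */
}

/* name -> enum: the store()-side direction */
static int demo_ops_id_of(const char *buf, enum demo_ops_id *id)
{
	size_t i;

	for (i = 0; i < sizeof(demo_ops_names) / sizeof(demo_ops_names[0]); i++) {
		if (!strcmp(buf, demo_ops_names[i].name)) {
			*id = demo_ops_names[i].ops_id;
			return 0;
		}
	}
	return -1;
}

int main(void)
{
	enum demo_ops_id id;

	printf("%s\n", demo_ops_name_of(DEMO_OPS_PADDR));	/* "paddr" */
	if (!demo_ops_id_of("vaddr", &id))
		printf("%d\n", id);				/* 0 */
	return 0;
}
```

The benefit of the table form is that the string no longer depends on the enum's numeric order, so new enum values (or reordered ones) cannot silently shift which name is shown or parsed.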
@@ -1403,7 +1386,7 @@ static inline bool damon_sysfs_kdamond_running( struct damon_sysfs_kdamond *kdamond) { return kdamond->damon_ctx && - damon_sysfs_ctx_running(kdamond->damon_ctx); + damon_is_running(kdamond->damon_ctx); } static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, @@ -1450,13 +1433,11 @@ static int damon_sysfs_commit_input(void *data) test_ctx = damon_new_ctx(); err = damon_commit_ctx(test_ctx, param_ctx); if (err) { - damon_sysfs_destroy_targets(test_ctx); damon_destroy_ctx(test_ctx); goto out; } err = damon_commit_ctx(kdamond->damon_ctx, param_ctx); out: - damon_sysfs_destroy_targets(param_ctx); damon_destroy_ctx(param_ctx); return err; } @@ -1525,10 +1506,35 @@ static struct damon_ctx *damon_sysfs_build_ctx( return ERR_PTR(err); } - ctx->callback.before_terminate = damon_sysfs_before_terminate; return ctx; } +static int damon_sysfs_repeat_call_fn(void *data) +{ + struct damon_sysfs_kdamond *sysfs_kdamond = data; + static unsigned long next_update_jiffies; + + if (!sysfs_kdamond->refresh_ms) + return 0; + if (time_before(jiffies, next_update_jiffies)) + return 0; + next_update_jiffies = jiffies + + msecs_to_jiffies(sysfs_kdamond->refresh_ms); + + if (!mutex_trylock(&damon_sysfs_lock)) + return 0; + damon_sysfs_upd_tuned_intervals(sysfs_kdamond); + damon_sysfs_upd_schemes_stats(sysfs_kdamond); + damon_sysfs_upd_schemes_effective_quotas(sysfs_kdamond); + mutex_unlock(&damon_sysfs_lock); + return 0; +} + +static struct damon_call_control damon_sysfs_repeat_call_control = { + .fn = damon_sysfs_repeat_call_fn, + .repeat = true, +}; + static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) { struct damon_ctx *ctx; @@ -1553,6 +1559,9 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) return err; } kdamond->damon_ctx = ctx; + + damon_sysfs_repeat_call_control.data = kdamond; + damon_call(ctx, &damon_sysfs_repeat_call_control); return err; } @@ -1711,6 +1720,30 @@ out: return sysfs_emit(buf, "%d\n", pid); } +static ssize_t refresh_ms_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + + return sysfs_emit(buf, "%u\n", kdamond->refresh_ms); +} + +static ssize_t refresh_ms_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + unsigned int nr; + int err = kstrtouint(buf, 0, &nr); + + if (err) + return err; + + kdamond->refresh_ms = nr; + return count; +} + static void damon_sysfs_kdamond_release(struct kobject *kobj) { struct damon_sysfs_kdamond *kdamond = container_of(kobj, @@ -1727,9 +1760,13 @@ static struct kobj_attribute damon_sysfs_kdamond_state_attr = static struct kobj_attribute damon_sysfs_kdamond_pid_attr = __ATTR_RO_MODE(pid, 0400); +static struct kobj_attribute damon_sysfs_kdamond_refresh_ms_attr = + __ATTR_RW_MODE(refresh_ms, 0600); + static struct attribute *damon_sysfs_kdamond_attrs[] = { &damon_sysfs_kdamond_state_attr.attr, &damon_sysfs_kdamond_pid_attr.attr, + &damon_sysfs_kdamond_refresh_ms_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_kdamond); diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index be0fea9ee5fc..dfedfff19940 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -58,7 +58,7 @@ static void damon_test_target(struct kunit *test) damon_add_target(c, t); KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(c)); - 
damon_destroy_target(t); + damon_destroy_target(t, c); KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); damon_destroy_ctx(c); @@ -310,7 +310,7 @@ static void damon_test_set_regions(struct kunit *test) KUNIT_EXPECT_EQ(test, r->ar.start, expects[expect_idx++]); KUNIT_EXPECT_EQ(test, r->ar.end, expects[expect_idx++]); } - damon_destroy_target(t); + damon_destroy_target(t, NULL); } static void damon_test_nr_accesses_to_accesses_bp(struct kunit *test) @@ -510,6 +510,75 @@ static void damon_test_feed_loop_next_input(struct kunit *test) damon_feed_loop_next_input(last_input, 2000)); } +static void damon_test_set_filters_default_reject(struct kunit *test) +{ + struct damos scheme; + struct damos_filter *target_filter, *anon_filter; + + INIT_LIST_HEAD(&scheme.filters); + INIT_LIST_HEAD(&scheme.ops_filters); + + damos_set_filters_default_reject(&scheme); + /* + * No filter is installed. Allow by default on both core and ops layer + * filtering stages, since there are no filters at all. + */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false); + + target_filter = damos_new_filter(DAMOS_FILTER_TYPE_TARGET, true, true); + damos_add_filter(&scheme, target_filter); + damos_set_filters_default_reject(&scheme); + /* + * A core-handled allow-filter is installed. + * Rejct by default on core layer filtering stage due to the last + * core-layer-filter's behavior. + * Allow by default on ops layer filtering stage due to the absence of + * ops layer filters. + */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, true); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false); + + target_filter->allow = false; + damos_set_filters_default_reject(&scheme); + /* + * A core-handled reject-filter is installed. + * Allow by default on core layer filtering stage due to the last + * core-layer-filter's behavior. + * Allow by default on ops layer filtering stage due to the absence of + * ops layer filters. + */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false); + + anon_filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, true); + damos_add_filter(&scheme, anon_filter); + + damos_set_filters_default_reject(&scheme); + /* + * A core-handled reject-filter and ops-handled allow-filter are installed. + * Allow by default on core layer filtering stage due to the existence + * of the ops-handled filter. + * Reject by default on ops layer filtering stage due to the last + * ops-layer-filter's behavior. + */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true); + + target_filter->allow = true; + damos_set_filters_default_reject(&scheme); + /* + * A core-handled allow-filter and ops-handled allow-filter are + * installed. + * Allow by default on core layer filtering stage due to the existence + * of the ops-handled filter. + * Reject by default on ops layer filtering stage due to the last + * ops-layer-filter's behavior. 
+ */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -527,6 +596,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damos_test_new_filter), KUNIT_CASE(damos_test_filter_out), KUNIT_CASE(damon_test_feed_loop_next_input), + KUNIT_CASE(damon_test_set_filters_default_reject), {}, }; diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index 7cd944266a92..d2b37ccf2cc0 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -149,7 +149,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test, KUNIT_EXPECT_EQ(test, r->ar.end, expected[i * 2 + 1]); } - damon_destroy_target(t); + damon_destroy_target(t, NULL); } /* diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index e6d99106a7f9..94af19c4dfed 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * DAMON Primitives for Virtual Address Spaces + * DAMON Code for Virtual Address Spaces * * Author: SeongJae Park <sj@kernel.org> */ @@ -15,6 +15,7 @@ #include <linux/pagewalk.h> #include <linux/sched/mm.h> +#include "../internal.h" #include "ops-common.h" #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST @@ -610,6 +611,183 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) return max_nr_accesses; } +static bool damos_va_filter_young_match(struct damos_filter *filter, + struct folio *folio, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, pmd_t *pmdp) +{ + bool young = false; + + if (ptep) + young = pte_young(ptep_get(ptep)); + else if (pmdp) + young = pmd_young(pmdp_get(pmdp)); + + young = young || !folio_test_idle(folio) || + mmu_notifier_test_young(vma->vm_mm, addr); + + if (young && ptep) + damon_ptep_mkold(ptep, vma, addr); + else if (young && pmdp) + damon_pmdp_mkold(pmdp, vma, addr); + + return young == filter->matching; +} + +static bool damos_va_filter_out(struct damos *scheme, struct folio *folio, + struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pmd_t *pmdp) +{ + struct damos_filter *filter; + bool matched; + + if (scheme->core_filters_allowed) + return false; + + damos_for_each_ops_filter(filter, scheme) { + /* + * damos_folio_filter_match checks the young filter by doing an + * rmap on the folio to find its page table. However, being the + * vaddr scheme, we have direct access to the page tables, so + * use that instead. + */ + if (filter->type == DAMOS_FILTER_TYPE_YOUNG) + matched = damos_va_filter_young_match(filter, folio, + vma, addr, ptep, pmdp); + else + matched = damos_folio_filter_match(filter, folio); + + if (matched) + return !filter->allow; + } + return scheme->ops_filters_default_reject; +} + +struct damos_va_migrate_private { + struct list_head *migration_lists; + struct damos *scheme; +}; + +/* + * Place the given folio in the migration_list corresponding to where the folio + * should be migrated. + * + * The algorithm used here is similar to weighted_interleave_nid() + */ +static void damos_va_migrate_dests_add(struct folio *folio, + struct vm_area_struct *vma, unsigned long addr, + struct damos_migrate_dests *dests, + struct list_head *migration_lists) +{ + pgoff_t ilx; + int order; + unsigned int target; + unsigned int weight_total = 0; + int i; + + /* + * If dests is empty, there is only one migration list corresponding + * to s->target_nid. 
+ */ + if (!dests->nr_dests) { + i = 0; + goto isolate; + } + + order = folio_order(folio); + ilx = vma->vm_pgoff >> order; + ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); + + for (i = 0; i < dests->nr_dests; i++) + weight_total += dests->weight_arr[i]; + + /* If the total weights are somehow 0, don't migrate at all */ + if (!weight_total) + return; + + target = ilx % weight_total; + for (i = 0; i < dests->nr_dests; i++) { + if (target < dests->weight_arr[i]) + break; + target -= dests->weight_arr[i]; + } + +isolate: + if (!folio_isolate_lru(folio)) + return; + + list_add(&folio->lru, &migration_lists[i]); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int damos_va_migrate_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct damos_va_migrate_private *priv = walk->private; + struct list_head *migration_lists = priv->migration_lists; + struct damos *s = priv->scheme; + struct damos_migrate_dests *dests = &s->migrate_dests; + struct folio *folio; + spinlock_t *ptl; + pmd_t pmde; + + ptl = pmd_lock(walk->mm, pmd); + pmde = pmdp_get(pmd); + + if (!pmd_present(pmde) || !pmd_trans_huge(pmde)) + goto unlock; + + /* Tell page walk code to not split the PMD */ + walk->action = ACTION_CONTINUE; + + folio = damon_get_folio(pmd_pfn(pmde)); + if (!folio) + goto unlock; + + if (damos_va_filter_out(s, folio, walk->vma, addr, NULL, pmd)) + goto put_folio; + + damos_va_migrate_dests_add(folio, walk->vma, addr, dests, + migration_lists); + +put_folio: + folio_put(folio); +unlock: + spin_unlock(ptl); + return 0; +} +#else +#define damos_va_migrate_pmd_entry NULL +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static int damos_va_migrate_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct damos_va_migrate_private *priv = walk->private; + struct list_head *migration_lists = priv->migration_lists; + struct damos *s = priv->scheme; + struct damos_migrate_dests *dests = &s->migrate_dests; + struct folio *folio; + pte_t ptent; + + ptent = ptep_get(pte); + if (pte_none(ptent) || !pte_present(ptent)) + return 0; + + folio = damon_get_folio(pte_pfn(ptent)); + if (!folio) + return 0; + + if (damos_va_filter_out(s, folio, walk->vma, addr, pte, NULL)) + goto put_folio; + + damos_va_migrate_dests_add(folio, walk->vma, addr, dests, + migration_lists); + +put_folio: + folio_put(folio); + return 0; +} + /* * Functions for the target validity check and cleanup */ @@ -627,6 +805,11 @@ static bool damon_va_target_valid(struct damon_target *t) return false; } +static void damon_va_cleanup_target(struct damon_target *t) +{ + put_pid(t->pid); +} + #ifndef CONFIG_ADVISE_SYSCALLS static unsigned long damos_madvise(struct damon_target *target, struct damon_region *r, int behavior) @@ -653,6 +836,56 @@ static unsigned long damos_madvise(struct damon_target *target, } #endif /* CONFIG_ADVISE_SYSCALLS */ +static unsigned long damos_va_migrate(struct damon_target *target, + struct damon_region *r, struct damos *s, + unsigned long *sz_filter_passed) +{ + LIST_HEAD(folio_list); + struct damos_va_migrate_private priv; + struct mm_struct *mm; + int nr_dests; + int nid; + bool use_target_nid; + unsigned long applied = 0; + struct damos_migrate_dests *dests = &s->migrate_dests; + struct mm_walk_ops walk_ops = { + .pmd_entry = damos_va_migrate_pmd_entry, + .pte_entry = damos_va_migrate_pte_entry, + .walk_lock = PGWALK_RDLOCK, + }; + + use_target_nid = dests->nr_dests == 0; + nr_dests = use_target_nid ? 
1 : dests->nr_dests; + priv.scheme = s; + priv.migration_lists = kmalloc_array(nr_dests, + sizeof(*priv.migration_lists), GFP_KERNEL); + if (!priv.migration_lists) + return 0; + + for (int i = 0; i < nr_dests; i++) + INIT_LIST_HEAD(&priv.migration_lists[i]); + + + mm = damon_get_mm(target); + if (!mm) + goto free_lists; + + mmap_read_lock(mm); + walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv); + mmap_read_unlock(mm); + mmput(mm); + + for (int i = 0; i < nr_dests; i++) { + nid = use_target_nid ? s->target_nid : dests->node_id_arr[i]; + applied += damon_migrate_pages(&priv.migration_lists[i], nid); + cond_resched(); + } + +free_lists: + kfree(priv.migration_lists); + return applied * PAGE_SIZE; +} + static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme, unsigned long *sz_filter_passed) @@ -675,6 +908,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, case DAMOS_NOHUGEPAGE: madv_action = MADV_NOHUGEPAGE; break; + case DAMOS_MIGRATE_HOT: + case DAMOS_MIGRATE_COLD: + return damos_va_migrate(t, r, scheme, sz_filter_passed); case DAMOS_STAT: return 0; default: @@ -695,6 +931,10 @@ static int damon_va_scheme_score(struct damon_ctx *context, switch (scheme->action) { case DAMOS_PAGEOUT: return damon_cold_score(context, r, scheme); + case DAMOS_MIGRATE_HOT: + return damon_hot_score(context, r, scheme); + case DAMOS_MIGRATE_COLD: + return damon_cold_score(context, r, scheme); default: break; } @@ -711,6 +951,7 @@ static int __init damon_va_initcall(void) .prepare_access_checks = damon_va_prepare_access_checks, .check_accesses = damon_va_check_accesses, .target_valid = damon_va_target_valid, + .cleanup_target = damon_va_cleanup_target, .cleanup = NULL, .apply_scheme = damon_va_apply_scheme, .get_scheme_score = damon_va_scheme_score, diff --git a/mm/debug.c b/mm/debug.c index db83e381a8ae..b4388f4dcd4d 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -71,10 +71,12 @@ static void __dump_folio(struct folio *folio, struct page *page, unsigned long pfn, unsigned long idx) { struct address_space *mapping = folio_mapping(folio); - int mapcount = atomic_read(&page->_mapcount); + int mapcount = atomic_read(&page->_mapcount) + 1; char *type = ""; - mapcount = page_mapcount_is_type(mapcount) ? 
0 : mapcount + 1; + if (page_mapcount_is_type(mapcount)) + mapcount = 0; + pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n", folio_ref_count(folio), mapcount, mapping, folio->index + idx, pfn); @@ -127,47 +129,13 @@ static void __dump_folio(struct folio *folio, struct page *page, static void __dump_page(const struct page *page) { - struct folio *foliop, folio; - struct page precise; - unsigned long head; - unsigned long pfn = page_to_pfn(page); - unsigned long idx, nr_pages = 1; - int loops = 5; - -again: - memcpy(&precise, page, sizeof(*page)); - head = precise.compound_head; - if ((head & 1) == 0) { - foliop = (struct folio *)&precise; - idx = 0; - if (!folio_test_large(foliop)) - goto dump; - foliop = (struct folio *)page; - } else { - foliop = (struct folio *)(head - 1); - idx = folio_page_idx(foliop, page); - } + struct page_snapshot ps; - if (idx < MAX_FOLIO_NR_PAGES) { - memcpy(&folio, foliop, 2 * sizeof(struct page)); - nr_pages = folio_nr_pages(&folio); - if (nr_pages > 1) - memcpy(&folio.__page_2, &foliop->__page_2, - sizeof(struct page)); - foliop = &folio; - } - - if (idx > nr_pages) { - if (loops-- > 0) - goto again; + snapshot_page(&ps, page); + if (!snapshot_page_is_faithful(&ps)) pr_warn("page does not match folio\n"); - precise.compound_head &= ~1UL; - foliop = (struct folio *)&precise; - idx = 0; - } -dump: - __dump_folio(foliop, &precise, pfn, idx); + __dump_folio(&ps.folio_snapshot, &ps.page_snapshot, ps.pfn, ps.idx); } void dump_page(const struct page *page, const char *reason) @@ -288,7 +256,7 @@ void dump_vmg(const struct vma_merge_struct *vmg, const char *reason) vmg->vmi, vmg->vmi ? vma_iter_addr(vmg->vmi) : 0, vmg->vmi ? vma_iter_end(vmg->vmi) : 0, vmg->prev, vmg->middle, vmg->next, vmg->target, - vmg->start, vmg->end, vmg->flags, + vmg->start, vmg->end, vmg->vm_flags, vmg->file, vmg->anon_vma, vmg->policy, #ifdef CONFIG_USERFAULTFD vmg->uffd_ctx.ctx, diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c index d46acf989dde..6a26eca546c3 100644 --- a/mm/debug_page_alloc.c +++ b/mm/debug_page_alloc.c @@ -23,7 +23,7 @@ static int __init debug_guardpage_minorder_setup(char *buf) unsigned long res; if (kstrtoul(buf, 10, &res) < 0 || res > MAX_PAGE_ORDER / 2) { - pr_err("Bad debug_guardpage_minorder value\n"); + pr_err("Bad debug_guardpage_minorder value: %s\n", buf); return 0; } _debug_guardpage_minorder = res; diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index bc748f700a9e..d19031f275a3 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -20,7 +20,6 @@ #include <linux/mman.h> #include <linux/mm_types.h> #include <linux/module.h> -#include <linux/pfn_t.h> #include <linux/printk.h> #include <linux/pgtable.h> #include <linux/random.h> @@ -73,6 +72,8 @@ struct pgtable_debug_args { unsigned long fixed_pud_pfn; unsigned long fixed_pmd_pfn; unsigned long fixed_pte_pfn; + + swp_entry_t swp_entry; }; static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx) @@ -348,12 +349,6 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) vaddr &= HPAGE_PUD_MASK; pud = pfn_pud(args->pud_pfn, args->page_prot); - /* - * Some architectures have debug checks to make sure - * huge pud mapping are only found with devmap entries - * For now test with only devmap entries. 
- */ - pud = pud_mkdevmap(pud); set_pud_at(args->mm, vaddr, args->pudp, pud); flush_dcache_page(page); pudp_set_wrprotect(args->mm, vaddr, args->pudp); @@ -366,7 +361,6 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ pud = pfn_pud(args->pud_pfn, args->page_prot); - pud = pud_mkdevmap(pud); pud = pud_wrprotect(pud); pud = pud_mkclean(pud); set_pud_at(args->mm, vaddr, args->pudp, pud); @@ -384,7 +378,6 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) #endif /* __PAGETABLE_PMD_FOLDED */ pud = pfn_pud(args->pud_pfn, args->page_prot); - pud = pud_mkdevmap(pud); pud = pud_mkyoung(pud); set_pud_at(args->mm, vaddr, args->pudp, pud); flush_dcache_page(page); @@ -693,53 +686,6 @@ static void __init pmd_protnone_tests(struct pgtable_debug_args *args) static void __init pmd_protnone_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP -static void __init pte_devmap_tests(struct pgtable_debug_args *args) -{ - pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); - - pr_debug("Validating PTE devmap\n"); - WARN_ON(!pte_devmap(pte_mkdevmap(pte))); -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void __init pmd_devmap_tests(struct pgtable_debug_args *args) -{ - pmd_t pmd; - - if (!has_transparent_hugepage()) - return; - - pr_debug("Validating PMD devmap\n"); - pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); - WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd))); -} - -#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static void __init pud_devmap_tests(struct pgtable_debug_args *args) -{ - pud_t pud; - - if (!has_transparent_pud_hugepage()) - return; - - pr_debug("Validating PUD devmap\n"); - pud = pfn_pud(args->fixed_pud_pfn, args->page_prot); - WARN_ON(!pud_devmap(pud_mkdevmap(pud))); -} -#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } -#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -#else /* CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { } -static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -#else -static void __init pte_devmap_tests(struct pgtable_debug_args *args) { } -static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { } -static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } -#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */ - static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args) { pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); @@ -754,12 +700,15 @@ static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args) static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); + pte_t pte; if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) return; pr_debug("Validating PTE swap soft dirty\n"); + pte = swp_entry_to_pte(args->swp_entry); + WARN_ON(!is_swap_pte(pte)); + WARN_ON(!pte_swp_soft_dirty(pte_swp_mksoft_dirty(pte))); WARN_ON(pte_swp_soft_dirty(pte_swp_clear_soft_dirty(pte))); } @@ -793,7 +742,9 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) return; pr_debug("Validating PMD swap soft dirty\n"); - pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); + pmd = swp_entry_to_pmd(args->swp_entry); + WARN_ON(!is_swap_pmd(pmd)); + 
WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd))); WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd))); } @@ -804,17 +755,11 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args) { - unsigned long max_swap_offset; swp_entry_t entry, entry2; pte_t pte; pr_debug("Validating PTE swap exclusive\n"); - - /* See generic_max_swapfile_size(): probe the maximum offset */ - max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL)))); - - /* Create a swp entry with all possible bits set */ - entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset); + entry = args->swp_entry; pte = swp_entry_to_pte(entry); WARN_ON(pte_swp_exclusive(pte)); @@ -838,30 +783,34 @@ static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args) static void __init pte_swap_tests(struct pgtable_debug_args *args) { - swp_entry_t swp; - pte_t pte; + swp_entry_t arch_entry; + pte_t pte1, pte2; pr_debug("Validating PTE swap\n"); - pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); - swp = __pte_to_swp_entry(pte); - pte = __swp_entry_to_pte(swp); - WARN_ON(args->fixed_pte_pfn != pte_pfn(pte)); + pte1 = swp_entry_to_pte(args->swp_entry); + WARN_ON(!is_swap_pte(pte1)); + + arch_entry = __pte_to_swp_entry(pte1); + pte2 = __swp_entry_to_pte(arch_entry); + WARN_ON(memcmp(&pte1, &pte2, sizeof(pte1))); } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION static void __init pmd_swap_tests(struct pgtable_debug_args *args) { - swp_entry_t swp; - pmd_t pmd; + swp_entry_t arch_entry; + pmd_t pmd1, pmd2; if (!has_transparent_hugepage()) return; pr_debug("Validating PMD swap\n"); - pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); - swp = __pmd_to_swp_entry(pmd); - pmd = __swp_entry_to_pmd(swp); - WARN_ON(args->fixed_pmd_pfn != pmd_pfn(pmd)); + pmd1 = swp_entry_to_pmd(args->swp_entry); + WARN_ON(!is_swap_pmd(pmd1)); + + arch_entry = __pmd_to_swp_entry(pmd1); + pmd2 = __swp_entry_to_pmd(arch_entry); + WARN_ON(memcmp(&pmd1, &pmd2, sizeof(pmd1))); } #else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */ static void __init pmd_swap_tests(struct pgtable_debug_args *args) { } @@ -910,26 +859,18 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args) #ifdef CONFIG_HUGETLB_PAGE static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { - struct page *page; pte_t pte; pr_debug("Validating HugeTLB basic\n"); - /* - * Accessing the page associated with the pfn is safe here, - * as it was previously derived from a real kernel symbol. 
- */ - page = pfn_to_page(args->fixed_pmd_pfn); - pte = mk_huge_pte(page, args->page_prot); + pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot); + pte = arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS); +#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB + WARN_ON(!pte_huge(pte)); +#endif WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte))); WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte)))); WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte)))); - -#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB - pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot); - - WARN_ON(!pte_huge(arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS))); -#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ } #else /* !CONFIG_HUGETLB_PAGE */ static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { } @@ -1174,6 +1115,7 @@ static void __init init_fixed_pfns(struct pgtable_debug_args *args) static int __init init_args(struct pgtable_debug_args *args) { + unsigned long max_swap_offset; struct page *page = NULL; int ret = 0; @@ -1256,6 +1198,11 @@ static int __init init_args(struct pgtable_debug_args *args) init_fixed_pfns(args); + /* See generic_max_swapfile_size(): probe the maximum offset */ + max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL)))); + /* Create a swp entry with all possible bits set */ + args->swp_entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset); + /* * Allocate (huge) pages because some of the tests need to access * the data in the pages. The corresponding tests will be skipped @@ -1341,10 +1288,6 @@ static int __init debug_vm_pgtable(void) pte_protnone_tests(&args); pmd_protnone_tests(&args); - pte_devmap_tests(&args); - pmd_devmap_tests(&args); - pud_devmap_tests(&args); - pte_soft_dirty_tests(&args); pmd_soft_dirty_tests(&args); pte_swap_soft_dirty_tests(&args); diff --git a/mm/dmapool.c b/mm/dmapool.c index f0bfc6c490f4..5d8af6e29127 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -56,6 +56,7 @@ struct dma_pool { /* the pool */ unsigned int size; unsigned int allocation; unsigned int boundary; + int node; char name[32]; struct list_head pools; }; @@ -199,16 +200,17 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block, /** - * dma_pool_create - Creates a pool of consistent memory blocks, for dma. + * dma_pool_create_node - Creates a pool of coherent DMA memory blocks. * @name: name of pool, for diagnostics * @dev: device that will be doing the DMA * @size: size of the blocks in this pool. * @align: alignment requirement for blocks; must be a power of two * @boundary: returned blocks won't cross this power of two boundary + * @node: optional NUMA node to allocate structs 'dma_pool' and 'dma_page' on * Context: not in_interrupt() * * Given one of these pools, dma_pool_alloc() - * may be used to allocate memory. Such memory will all have "consistent" + * may be used to allocate memory. Such memory will all have coherent * DMA mappings, accessible by the device and its driver without using * cache flushing primitives. The actual size of blocks allocated may be * larger than requested because of alignment. @@ -221,8 +223,8 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block, * Return: a dma allocation pool with the requested characteristics, or * %NULL if one can't be created. 
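
The dmapool hunks above extend dma_pool_create() into dma_pool_create_node(), threading a NUMA node through to the kzalloc_node()/kmalloc_node() calls that back the pool's own bookkeeping. A minimal, hypothetical driver-side sketch of the new entry point (the mydrv_* names, block size and alignment are invented for illustration; dma_pool_create() presumably remains available as a NUMA_NO_NODE wrapper):

#include <linux/device.h>
#include <linux/dmapool.h>

/* Sketch only: keep descriptor-pool metadata on the device's NUMA node. */
static int mydrv_create_desc_pool(struct device *dev, struct dma_pool **poolp)
{
	struct dma_pool *pool;

	pool = dma_pool_create_node("mydrv_rx_desc", dev,
				    256,   /* block size */
				    64,    /* alignment, power of two */
				    0,     /* no boundary restriction */
				    dev_to_node(dev));
	if (!pool)
		return -ENOMEM;

	*poolp = pool;
	return 0;
}

Per the updated kernel-doc, the node argument only affects where struct dma_pool and struct dma_page are allocated; the coherent blocks handed out by dma_pool_alloc() are unchanged.
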
*/ -struct dma_pool *dma_pool_create(const char *name, struct device *dev, - size_t size, size_t align, size_t boundary) +struct dma_pool *dma_pool_create_node(const char *name, struct device *dev, + size_t size, size_t align, size_t boundary, int node) { struct dma_pool *retval; size_t allocation; @@ -251,7 +253,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, boundary = min(boundary, allocation); - retval = kzalloc(sizeof(*retval), GFP_KERNEL); + retval = kzalloc_node(sizeof(*retval), GFP_KERNEL, node); if (!retval) return retval; @@ -264,6 +266,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, retval->size = size; retval->boundary = boundary; retval->allocation = allocation; + retval->node = node; INIT_LIST_HEAD(&retval->pools); /* @@ -295,7 +298,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, mutex_unlock(&pools_reg_lock); return retval; } -EXPORT_SYMBOL(dma_pool_create); +EXPORT_SYMBOL(dma_pool_create_node); static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page) { @@ -335,7 +338,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) { struct dma_page *page; - page = kmalloc(sizeof(*page), mem_flags); + page = kmalloc_node(sizeof(*page), mem_flags, pool->node); if (!page) return NULL; @@ -392,7 +395,7 @@ void dma_pool_destroy(struct dma_pool *pool) EXPORT_SYMBOL(dma_pool_destroy); /** - * dma_pool_alloc - get a block of consistent memory + * dma_pool_alloc - get a block of coherent memory * @pool: dma pool that will produce the block * @mem_flags: GFP_* bitmask * @handle: pointer to dma address of block diff --git a/mm/execmem.c b/mm/execmem.c index e6c4f5076ca8..627e6cf64f4f 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -26,7 +26,7 @@ static struct execmem_info default_execmem_info __ro_after_init; #ifdef CONFIG_MMU static void *execmem_vmalloc(struct execmem_range *range, size_t size, - pgprot_t pgprot, unsigned long vm_flags) + pgprot_t pgprot, vm_flags_t vm_flags) { bool kasan = range->flags & EXECMEM_KASAN_SHADOW; gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN; @@ -82,7 +82,7 @@ struct vm_struct *execmem_vmap(size_t size) } #else static void *execmem_vmalloc(struct execmem_range *range, size_t size, - pgprot_t pgprot, unsigned long vm_flags) + pgprot_t pgprot, vm_flags_t vm_flags) { return vmalloc(size); } @@ -256,7 +256,7 @@ out_unlock: static int execmem_cache_populate(struct execmem_range *range, size_t size) { - unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; + vm_flags_t vm_flags = VM_ALLOW_HUGE_VMAP; struct vm_struct *vm; size_t alloc_size; int err = -ENOMEM; @@ -373,10 +373,12 @@ void *execmem_alloc(enum execmem_type type, size_t size) { struct execmem_range *range = &execmem_info->ranges[type]; bool use_cache = range->flags & EXECMEM_ROX_CACHE; - unsigned long vm_flags = VM_FLUSH_RESET_PERMS; + vm_flags_t vm_flags = VM_FLUSH_RESET_PERMS; pgprot_t pgprot = range->pgprot; void *p; + size = PAGE_ALIGN(size); + if (use_cache) p = execmem_cache_alloc(range, size); else diff --git a/mm/filemap.c b/mm/filemap.c index b5e784f34d98..751838ef05e5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -142,7 +142,7 @@ static void page_cache_delete(struct address_space *mapping, xas_init_marks(&xas); folio->mapping = NULL; - /* Leave page->index set: truncation lookup relies upon it */ + /* Leave folio->index set: truncation lookup relies upon it */ mapping->nrpages -= nr; } @@ -949,7 +949,7 @@ unlock: return 0; error: folio->mapping = NULL; - /* Leave 
page->index set: truncation relies upon it */ + /* Leave folio->index set: truncation relies upon it */ folio_put_refs(folio, nr); return xas_error(&xas); } @@ -1589,13 +1589,30 @@ int folio_wait_private_2_killable(struct folio *folio) } EXPORT_SYMBOL(folio_wait_private_2_killable); +static void filemap_end_dropbehind(struct folio *folio) +{ + struct address_space *mapping = folio->mapping; + + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return; + if (!folio_test_clear_dropbehind(folio)) + return; + if (mapping) + folio_unmap_invalidate(mapping, folio, 0); +} + /* * If folio was marked as dropbehind, then pages should be dropped when writeback * completes. Do that now. If we fail, it's likely because of a big folio - * just reset dropbehind for that case and latter completions should invalidate. */ -static void folio_end_dropbehind_write(struct folio *folio) +static void filemap_end_dropbehind_write(struct folio *folio) { + if (!folio_test_dropbehind(folio)) + return; + /* * Hitting !in_task() should not happen off RWF_DONTCACHE writeback, * but can happen if normal writeback just happens to find dirty folios @@ -1604,8 +1621,7 @@ static void folio_end_dropbehind_write(struct folio *folio) * invalidation in that case. */ if (in_task() && folio_trylock(folio)) { - if (folio->mapping) - folio_unmap_invalidate(folio->mapping, folio, 0); + filemap_end_dropbehind(folio); folio_unlock(folio); } } @@ -1620,8 +1636,6 @@ static void folio_end_dropbehind_write(struct folio *folio) */ void folio_end_writeback(struct folio *folio) { - bool folio_dropbehind = false; - VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); /* @@ -1643,14 +1657,11 @@ void folio_end_writeback(struct folio *folio) * reused before the folio_wake_bit(). 
*/ folio_get(folio); - if (!folio_test_dirty(folio)) - folio_dropbehind = folio_test_clear_dropbehind(folio); if (__folio_end_writeback(folio)) folio_wake_bit(folio, PG_writeback); - acct_reclaim_writeback(folio); - if (folio_dropbehind) - folio_end_dropbehind_write(folio); + filemap_end_dropbehind_write(folio); + acct_reclaim_writeback(folio); folio_put(folio); } EXPORT_SYMBOL(folio_end_writeback); @@ -1767,8 +1778,9 @@ pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { XA_STATE(xas, &mapping->i_pages, index); + unsigned long nr = max_scan; - while (max_scan--) { + while (nr--) { void *entry = xas_next(&xas); if (!entry || xa_is_value(entry)) return xas.xa_index; @@ -2244,6 +2256,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, *start = folio->index + nr; goto out; } + xas_advance(&xas, folio_next_index(folio) - 1); continue; put_folio: folio_put(folio); @@ -2634,16 +2647,14 @@ static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio) return (pos1 >> shift == pos2 >> shift); } -static void filemap_end_dropbehind_read(struct address_space *mapping, - struct folio *folio) +static void filemap_end_dropbehind_read(struct folio *folio) { if (!folio_test_dropbehind(folio)) return; if (folio_test_writeback(folio) || folio_test_dirty(folio)) return; if (folio_trylock(folio)) { - if (folio_test_clear_dropbehind(folio)) - folio_unmap_invalidate(mapping, folio, 0); + filemap_end_dropbehind(folio); folio_unlock(folio); } } @@ -2764,7 +2775,7 @@ put_folios: for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; - filemap_end_dropbehind_read(mapping, folio); + filemap_end_dropbehind_read(folio); folio_put(folio); } folio_batch_init(&fbatch); @@ -3205,8 +3216,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) struct address_space *mapping = file->f_mapping; DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff); struct file *fpin = NULL; - unsigned long vm_flags = vmf->vma->vm_flags; - unsigned int mmap_miss; + vm_flags_t vm_flags = vmf->vma->vm_flags; + unsigned short mmap_miss; #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Use the readahead code, even if readahead is disabled */ @@ -3221,13 +3232,17 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) if (!(vm_flags & VM_RAND_READ)) ra->size *= 2; ra->async_size = HPAGE_PMD_NR; - page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER); + ra->order = HPAGE_PMD_ORDER; + page_cache_ra_order(&ractl, ra); return fpin; } #endif - /* If we don't want any read-ahead, don't bother */ - if (vm_flags & VM_RAND_READ) + /* + * If we don't want any read-ahead, don't bother. VM_EXEC case below is + * already intended for random access. + */ + if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ) return fpin; if (!ra->ra_pages) return fpin; @@ -3250,15 +3265,43 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) if (mmap_miss > MMAP_LOTSAMISS) return fpin; - /* - * mmap read-around - */ + if (vm_flags & VM_EXEC) { + /* + * Allow arch to request a preferred minimum folio order for + * executable memory. This can often be beneficial to + * performance if (e.g.) arm64 can contpte-map the folio. + * Executable memory rarely benefits from readahead, due to its + * random access nature, so set async_size to 0. + * + * Limit to the boundaries of the VMA to avoid reading in any + * pad that might exist between sections, which would be a waste + * of memory. 
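
To make the new VM_EXEC read-ahead sizing above concrete, a worked example under assumed values: exec_folio_order() == 4 (16-page, 64 KiB folios with 4 KiB pages), ra_pages == 32, a fault at pgoff 100, and a VMA spanning pgoff 96 to 160. Then ra->start = round_down(100, 16) = 96, which max() leaves at 96 since the VMA starts there; ra_end = round_up(96 + 32, 16) = 128, which min() leaves at 128 since the VMA ends at 160; so ra->size = 32 pages and ra->async_size = 0, i.e. the request covers exactly two naturally aligned 64 KiB folios with no asynchronous read-ahead tail.
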
+ */ + struct vm_area_struct *vma = vmf->vma; + unsigned long start = vma->vm_pgoff; + unsigned long end = start + vma_pages(vma); + unsigned long ra_end; + + ra->order = exec_folio_order(); + ra->start = round_down(vmf->pgoff, 1UL << ra->order); + ra->start = max(ra->start, start); + ra_end = round_up(ra->start + ra->ra_pages, 1UL << ra->order); + ra_end = min(ra_end, end); + ra->size = ra_end - ra->start; + ra->async_size = 0; + } else { + /* + * mmap read-around + */ + ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); + ra->size = ra->ra_pages; + ra->async_size = ra->ra_pages / 4; + ra->order = 0; + } + fpin = maybe_unlock_mmap_for_io(vmf, fpin); - ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); - ra->size = ra->ra_pages; - ra->async_size = ra->ra_pages / 4; ractl._index = ra->start; - page_cache_ra_order(&ractl, ra, 0); + page_cache_ra_order(&ractl, ra); return fpin; } @@ -3274,7 +3317,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, struct file_ra_state *ra = &file->f_ra; DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff); struct file *fpin = NULL; - unsigned int mmap_miss; + unsigned short mmap_miss; /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) @@ -3532,7 +3575,7 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) { struct page *page = folio_file_page(folio, start); - vm_fault_t ret = do_set_pmd(vmf, page); + vm_fault_t ret = do_set_pmd(vmf, folio, page); if (!ret) { /* The page is mapped successfully, reference consumed. */ folio_unlock(folio); @@ -3594,7 +3637,7 @@ skip: static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, unsigned long addr, unsigned int nr_pages, - unsigned long *rss, unsigned int *mmap_miss) + unsigned long *rss, unsigned short *mmap_miss) { vm_fault_t ret = 0; struct page *page = folio_page(folio, start); @@ -3656,7 +3699,7 @@ skip: static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, struct folio *folio, unsigned long addr, - unsigned long *rss, unsigned int *mmap_miss) + unsigned long *rss, unsigned short *mmap_miss) { vm_fault_t ret = 0; struct page *page = &folio->page; @@ -3698,7 +3741,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, struct folio *folio; vm_fault_t ret = 0; unsigned long rss = 0; - unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type; + unsigned int nr_pages = 0, folio_type; + unsigned short mmap_miss = 0, mmap_miss_saved; rcu_read_lock(); folio = next_uptodate_folio(&xas, mapping, end_pgoff); @@ -3804,6 +3848,18 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } +int generic_file_mmap_prepare(struct vm_area_desc *desc) +{ + struct file *file = desc->file; + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->read_folio) + return -ENOEXEC; + file_accessed(file); + desc->vm_ops = &generic_file_vm_ops; + return 0; +} + /* * This is for filesystems which do not implement ->writepage. 
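
generic_file_mmap_prepare() and generic_file_readonly_mmap_prepare() mirror the legacy generic_file_mmap()/generic_file_readonly_mmap() pair but operate on a struct vm_area_desc. A hypothetical filesystem converting to the new hook might wire it up roughly as below (myfs_* is invented, and this assumes the ->mmap_prepare member of struct file_operations introduced by the wider mmap_prepare work, which is not itself part of this hunk):

#include <linux/fs.h>

/* Sketch: a read-mostly filesystem adopting the mmap_prepare-based helper. */
static const struct file_operations myfs_file_operations = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.mmap_prepare	= generic_file_readonly_mmap_prepare,
};

Whether and when a given filesystem should convert from ->mmap is a separate question; the helpers above just make such a conversion mechanical.
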
*/ @@ -3813,6 +3869,13 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; return generic_file_mmap(file, vma); } + +int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc) +{ + if (is_shared_maywrite(desc->vm_flags)) + return -EINVAL; + return generic_file_mmap_prepare(desc); +} #else vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { @@ -3822,15 +3885,25 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } +int generic_file_mmap_prepare(struct vm_area_desc *desc) +{ + return -ENOSYS; +} int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } +int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc) +{ + return -ENOSYS; +} #endif /* CONFIG_MMU */ EXPORT_SYMBOL(filemap_page_mkwrite); EXPORT_SYMBOL(generic_file_mmap); +EXPORT_SYMBOL(generic_file_mmap_prepare); EXPORT_SYMBOL(generic_file_readonly_mmap); +EXPORT_SYMBOL(generic_file_readonly_mmap_prepare); static struct folio *do_read_cache_folio(struct address_space *mapping, pgoff_t index, filler_t filler, struct file *file, gfp_t gfp) @@ -4099,7 +4172,7 @@ retry: break; } - status = a_ops->write_begin(file, mapping, pos, bytes, + status = a_ops->write_begin(iocb, mapping, pos, bytes, &folio, &fsdata); if (unlikely(status < 0)) break; @@ -4120,7 +4193,7 @@ retry: copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); flush_dcache_folio(folio); - status = a_ops->write_end(file, mapping, pos, bytes, copied, + status = a_ops->write_end(iocb, mapping, pos, bytes, copied, folio, fsdata); if (unlikely(status != copied)) { iov_iter_revert(i, copied - max(status, 0L)); @@ -26,6 +26,7 @@ #include <asm/tlbflush.h> #include "internal.h" +#include "swap.h" struct follow_page_context { struct dev_pagemap *pgmap; @@ -63,11 +64,11 @@ static inline void sanity_check_pinned_pages(struct page **pages, !folio_test_anon(folio)) continue; if (!folio_test_large(folio) || folio_test_hugetlb(folio)) - VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page); + VM_WARN_ON_ONCE_FOLIO(!PageAnonExclusive(&folio->page), folio); else /* Either a PTE-mapped or a PMD-mapped THP. */ - VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) && - !PageAnonExclusive(page), page); + VM_WARN_ON_ONCE_PAGE(!PageAnonExclusive(&folio->page) && + !PageAnonExclusive(page), page); } } @@ -678,31 +679,9 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma, return NULL; pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; - - if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) && - pud_devmap(pud)) { - /* - * device mapped pages can only be returned if the caller - * will manage the page reference count. 
- * - * At least one of FOLL_GET | FOLL_PIN must be set, so - * assert that here: - */ - if (!(flags & (FOLL_GET | FOLL_PIN))) - return ERR_PTR(-EEXIST); - - if (flags & FOLL_TOUCH) - touch_pud(vma, addr, pudp, flags & FOLL_WRITE); - - ctx->pgmap = get_dev_pagemap(pfn, ctx->pgmap); - if (!ctx->pgmap) - return ERR_PTR(-EFAULT); - } - page = pfn_to_page(pfn); - if (!pud_devmap(pud) && !pud_write(pud) && - gup_must_unshare(vma, flags, page)) + if (!pud_write(pud) && gup_must_unshare(vma, flags, page)) return ERR_PTR(-EMLINK); ret = try_grab_folio(page_folio(page), 1, flags); @@ -759,8 +738,8 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma, if (!pmd_write(pmdval) && gup_must_unshare(vma, flags, page)) return ERR_PTR(-EMLINK); - VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && - !PageAnonExclusive(page), page); + VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) && + !PageAnonExclusive(page), page); ret = try_grab_folio(page_folio(page), 1, flags); if (ret) @@ -844,11 +823,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, pte_t *ptep, pte; int ret; - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == - (FOLL_PIN | FOLL_GET))) - return ERR_PTR(-EINVAL); - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return no_page_table(vma, flags, address); @@ -861,8 +835,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, page = vm_normal_page(vma, address, pte); /* - * We only care about anon pages in can_follow_write_pte() and don't - * have to worry about pte_devmap() because they are never anon. + * We only care about anon pages in can_follow_write_pte(). */ if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, page, vma, flags)) { @@ -870,18 +843,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, goto out; } - if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { - /* - * Only return device mapping pages in the FOLL_GET or FOLL_PIN - * case since they are only valid while holding the pgmap - * reference. - */ - *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); - if (*pgmap) - page = pte_page(pte); - else - goto no_page; - } else if (unlikely(!page)) { + if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); @@ -903,8 +865,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, goto out; } - VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && - !PageAnonExclusive(page), page); + VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) && + !PageAnonExclusive(page), page); /* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. 
*/ ret = try_grab_folio(folio, 1, flags); @@ -963,14 +925,6 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, return no_page_table(vma, flags, address); if (!pmd_present(pmdval)) return no_page_table(vma, flags, address); - if (pmd_devmap(pmdval)) { - ptl = pmd_lock(mm, pmd); - page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); - spin_unlock(ptl); - if (page) - return page; - return no_page_table(vma, flags, address); - } if (likely(!pmd_leaf(pmdval))) return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); @@ -1106,10 +1060,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, /* user gate pages are read-only */ if (gup_flags & FOLL_WRITE) return -EFAULT; - if (address > TASK_SIZE) - pgd = pgd_offset_k(address); - else - pgd = pgd_offset_gate(mm, address); + pgd = pgd_offset(mm, address); if (pgd_none(*pgd)) return -EFAULT; p4d = p4d_offset(pgd, address); @@ -1187,7 +1138,7 @@ static int faultin_page(struct vm_area_struct *vma, if (unshare) { fault_flags |= FAULT_FLAG_UNSHARE; /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */ - VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE); + VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_WRITE); } ret = handle_mm_fault(vma, address, fault_flags, NULL); @@ -1432,7 +1383,11 @@ static long __get_user_pages(struct mm_struct *mm, start = untagged_addr_remote(mm, start); - VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); + VM_WARN_ON_ONCE(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); + + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + VM_WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) == + (FOLL_PIN | FOLL_GET)); do { struct page *page; @@ -1763,10 +1718,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, } /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */ - if (!*locked) { - BUG_ON(ret < 0); - BUG_ON(ret >= nr_pages); - } + VM_WARN_ON_ONCE(!*locked && (ret < 0 || ret >= nr_pages)); if (ret > 0) { nr_pages -= ret; @@ -1811,7 +1763,6 @@ retry: ret = mmap_read_lock_killable(mm); if (ret) { - BUG_ON(ret > 0); if (!pages_done) pages_done = ret; break; @@ -1822,11 +1773,11 @@ retry: pages, locked); if (!*locked) { /* Continue to retry until we succeeded */ - BUG_ON(ret != 0); + VM_WARN_ON_ONCE(ret != 0); goto retry; } if (ret != 1) { - BUG_ON(ret > 1); + VM_WARN_ON_ONCE(ret > 1); if (!pages_done) pages_done = ret; break; @@ -1888,10 +1839,10 @@ long populate_vma_page_range(struct vm_area_struct *vma, int gup_flags; long ret; - VM_BUG_ON(!PAGE_ALIGNED(start)); - VM_BUG_ON(!PAGE_ALIGNED(end)); - VM_BUG_ON_VMA(start < vma->vm_start, vma); - VM_BUG_ON_VMA(end > vma->vm_end, vma); + VM_WARN_ON_ONCE(!PAGE_ALIGNED(start)); + VM_WARN_ON_ONCE(!PAGE_ALIGNED(end)); + VM_WARN_ON_ONCE_VMA(start < vma->vm_start, vma); + VM_WARN_ON_ONCE_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); /* @@ -1960,8 +1911,8 @@ long faultin_page_range(struct mm_struct *mm, unsigned long start, int gup_flags; long ret; - VM_BUG_ON(!PAGE_ALIGNED(start)); - VM_BUG_ON(!PAGE_ALIGNED(end)); + VM_WARN_ON_ONCE(!PAGE_ALIGNED(start)); + VM_WARN_ON_ONCE(!PAGE_ALIGNED(end)); mmap_assert_locked(mm); /* @@ -2051,7 +2002,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, { struct vm_area_struct *vma; bool must_unlock = false; - unsigned long vm_flags; + vm_flags_t vm_flags; long i; if (!nr_pages) @@ -2114,28 +2065,22 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, */ size_t fault_in_writeable(char __user *uaddr, size_t 
size) { - char __user *start = uaddr, *end; + const unsigned long start = (unsigned long)uaddr; + const unsigned long end = start + size; + unsigned long cur; if (unlikely(size == 0)) return 0; if (!user_write_access_begin(uaddr, size)) return size; - if (!PAGE_ALIGNED(uaddr)) { - unsafe_put_user(0, uaddr, out); - uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr); - } - end = (char __user *)PAGE_ALIGN((unsigned long)start + size); - if (unlikely(end < start)) - end = NULL; - while (uaddr != end) { - unsafe_put_user(0, uaddr, out); - uaddr += PAGE_SIZE; - } + /* Stop once we overflow to 0. */ + for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) + unsafe_put_user(0, (char __user *)cur, out); out: user_write_access_end(); - if (size > uaddr - start) - return size - (uaddr - start); + if (size > cur - start) + return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_writeable); @@ -2189,26 +2134,24 @@ EXPORT_SYMBOL(fault_in_subpage_writeable); */ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) { - unsigned long start = (unsigned long)uaddr, end; + const unsigned long start = (unsigned long)uaddr; + const unsigned long end = start + size; + unsigned long cur; struct mm_struct *mm = current->mm; bool unlocked = false; if (unlikely(size == 0)) return 0; - end = PAGE_ALIGN(start + size); - if (end < start) - end = 0; mmap_read_lock(mm); - do { - if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked)) + /* Stop once we overflow to 0. */ + for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) + if (fixup_user_fault(mm, cur, FAULT_FLAG_WRITE, &unlocked)) break; - start = (start + PAGE_SIZE) & PAGE_MASK; - } while (start != end); mmap_read_unlock(mm); - if (size > (unsigned long)uaddr - start) - return size - ((unsigned long)uaddr - start); + if (size > cur - start) + return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_safe_writeable); @@ -2223,30 +2166,24 @@ EXPORT_SYMBOL(fault_in_safe_writeable); */ size_t fault_in_readable(const char __user *uaddr, size_t size) { - const char __user *start = uaddr, *end; + const unsigned long start = (unsigned long)uaddr; + const unsigned long end = start + size; + unsigned long cur; volatile char c; if (unlikely(size == 0)) return 0; if (!user_read_access_begin(uaddr, size)) return size; - if (!PAGE_ALIGNED(uaddr)) { - unsafe_get_user(c, uaddr, out); - uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr); - } - end = (const char __user *)PAGE_ALIGN((unsigned long)start + size); - if (unlikely(end < start)) - end = NULL; - while (uaddr != end) { - unsafe_get_user(c, uaddr, out); - uaddr += PAGE_SIZE; - } + /* Stop once we overflow to 0. 
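
The rewritten fault-in helpers all share one stepping idiom: start at the possibly unaligned user address, advance with PAGE_ALIGN_DOWN(cur + PAGE_SIZE) so that every subsequent probe lands on a page boundary, and terminate the walk if the address wraps to zero. A stand-alone restatement of just that walk, for illustration only:

#include <linux/mm.h>

/*
 * Not part of the patch: count how many pages the fault-in loops above
 * would probe for a given user range, using the same overflow-safe step.
 */
static unsigned long fault_in_pages_spanned(unsigned long start, size_t size)
{
	const unsigned long end = start + size;
	unsigned long cur, n = 0;

	/* Stop once we overflow to 0, exactly as fault_in_writeable() does. */
	for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
		n++;

	return n;
}

For start == 0x1ff8 and size == 16 with 4 KiB pages this yields 2 (pages at 0x1000 and 0x2000), which is the behaviour the old code needed the explicit !PAGE_ALIGNED() head case for.
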
*/ + for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) + unsafe_get_user(c, (const char __user *)cur, out); out: user_read_access_end(); (void)c; - if (size > uaddr - start) - return size - (uaddr - start); + if (size > cur - start) + return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_readable); @@ -2317,27 +2254,51 @@ static void pofs_unpin(struct pages_or_folios *pofs) unpin_user_pages(pofs->pages, pofs->nr_entries); } +static struct folio *pofs_next_folio(struct folio *folio, + struct pages_or_folios *pofs, long *index_ptr) +{ + long i = *index_ptr + 1; + + if (!pofs->has_folios && folio_test_large(folio)) { + const unsigned long start_pfn = folio_pfn(folio); + const unsigned long end_pfn = start_pfn + folio_nr_pages(folio); + + for (; i < pofs->nr_entries; i++) { + unsigned long pfn = page_to_pfn(pofs->pages[i]); + + /* Is this page part of this folio? */ + if (pfn < start_pfn || pfn >= end_pfn) + break; + } + } + + if (unlikely(i == pofs->nr_entries)) + return NULL; + *index_ptr = i; + + return pofs_get_folio(pofs, i); +} + /* * Returns the number of collected folios. Return value is always >= 0. */ -static void collect_longterm_unpinnable_folios( +static unsigned long collect_longterm_unpinnable_folios( struct list_head *movable_folio_list, struct pages_or_folios *pofs) { - struct folio *prev_folio = NULL; + unsigned long collected = 0; bool drain_allow = true; - unsigned long i; - - for (i = 0; i < pofs->nr_entries; i++) { - struct folio *folio = pofs_get_folio(pofs, i); + struct folio *folio; + long i = 0; - if (folio == prev_folio) - continue; - prev_folio = folio; + for (folio = pofs_get_folio(pofs, i); folio; + folio = pofs_next_folio(folio, pofs, &i)) { if (folio_is_longterm_pinnable(folio)) continue; + collected++; + if (folio_is_device_coherent(folio)) continue; @@ -2359,6 +2320,8 @@ static void collect_longterm_unpinnable_folios( NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } + + return collected; } /* @@ -2435,9 +2398,11 @@ static long check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs) { LIST_HEAD(movable_folio_list); + unsigned long collected; - collect_longterm_unpinnable_folios(&movable_folio_list, pofs); - if (list_empty(&movable_folio_list)) + collected = collect_longterm_unpinnable_folios(&movable_folio_list, + pofs); + if (!collected) return 0; return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs); @@ -2839,9 +2804,9 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags) return false; /* Anonymous folios pose no problem. 
*/ - mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS; + mapping_flags = (unsigned long)mapping & FOLIO_MAPPING_FLAGS; if (mapping_flags) - return mapping_flags & PAGE_MAPPING_ANON; + return mapping_flags & FOLIO_MAPPING_ANON; /* * At this point, we know the mapping is non-null and points to an @@ -2889,7 +2854,7 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, int *nr) { struct dev_pagemap *pgmap = NULL; - int nr_start = *nr, ret = 0; + int ret = 0; pte_t *ptep, *ptem; ptem = ptep = pte_offset_map(&pmd, addr); @@ -2913,19 +2878,11 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, if (!pte_access_permitted(pte, flags & FOLL_WRITE)) goto pte_unmap; - if (pte_devmap(pte)) { - if (unlikely(flags & FOLL_LONGTERM)) - goto pte_unmap; - - pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); - if (unlikely(!pgmap)) { - gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); - goto pte_unmap; - } - } else if (pte_special(pte)) + if (pte_special(pte)) goto pte_unmap; - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + /* If it's not marked as special it must have a valid memmap. */ + VM_WARN_ON_ONCE(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); folio = try_grab_folio_fast(page, 1, flags); @@ -2993,91 +2950,6 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, } #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ -#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, int *nr) -{ - int nr_start = *nr; - struct dev_pagemap *pgmap = NULL; - - do { - struct folio *folio; - struct page *page = pfn_to_page(pfn); - - pgmap = get_dev_pagemap(pfn, pgmap); - if (unlikely(!pgmap)) { - gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); - break; - } - - folio = try_grab_folio_fast(page, 1, flags); - if (!folio) { - gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); - break; - } - folio_set_referenced(folio); - pages[*nr] = page; - (*nr)++; - pfn++; - } while (addr += PAGE_SIZE, addr != end); - - put_dev_pagemap(pgmap); - return addr == end; -} - -static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, - int *nr) -{ - unsigned long fault_pfn; - int nr_start = *nr; - - fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr)) - return 0; - - if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { - gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); - return 0; - } - return 1; -} - -static int gup_fast_devmap_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, - int *nr) -{ - unsigned long fault_pfn; - int nr_start = *nr; - - fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr)) - return 0; - - if (unlikely(pud_val(orig) != pud_val(*pudp))) { - gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); - return 0; - } - return 1; -} -#else -static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, - int *nr) -{ - BUILD_BUG(); - return 0; -} - -static int gup_fast_devmap_pud_leaf(pud_t pud, pud_t *pudp, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, - int *nr) -{ - BUILD_BUG(); - 
return 0; -} -#endif - static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) @@ -3092,13 +2964,6 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, if (pmd_special(orig)) return 0; - if (pmd_devmap(orig)) { - if (unlikely(flags & FOLL_LONGTERM)) - return 0; - return gup_fast_devmap_pmd_leaf(orig, pmdp, addr, end, flags, - pages, nr); - } - page = pmd_page(orig); refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr); @@ -3139,13 +3004,6 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, if (pud_special(orig)) return 0; - if (pud_devmap(orig)) { - if (unlikely(flags & FOLL_LONGTERM)) - return 0; - return gup_fast_devmap_pud_leaf(orig, pudp, addr, end, flags, - pages, nr); - } - page = pud_page(orig); refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr); @@ -3173,46 +3031,6 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, return 1; } -static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, - int *nr) -{ - int refs; - struct page *page; - struct folio *folio; - - if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) - return 0; - - BUILD_BUG_ON(pgd_devmap(orig)); - - page = pgd_page(orig); - refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr); - - folio = try_grab_folio_fast(page, refs, flags); - if (!folio) - return 0; - - if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { - gup_put_folio(folio, refs, flags); - return 0; - } - - if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { - gup_put_folio(folio, refs, flags); - return 0; - } - - if (!gup_fast_folio_allowed(folio, flags)) { - gup_put_folio(folio, refs, flags); - return 0; - } - - *nr += refs; - folio_set_referenced(folio); - return 1; -} - static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) @@ -3307,12 +3125,9 @@ static void gup_fast_pgd_range(unsigned long addr, unsigned long end, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) return; - if (unlikely(pgd_leaf(pgd))) { - if (!gup_fast_pgd_leaf(pgd, pgdp, addr, next, flags, - pages, nr)) - return; - } else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags, - pages, nr)) + BUILD_BUG_ON(pgd_leaf(pgd)); + if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags, + pages, nr)) return; } while (pgdp++, addr = next, addr != end); } @@ -3359,7 +3174,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end, * include/asm-generic/tlb.h for more details. * * We do not adopt an rcu_read_lock() here as we also want to block IPIs - * that come from THPs splitting. + * that come from callers of tlb_remove_table_sync_one(). 
*/ local_irq_save(flags); gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned); @@ -3647,7 +3462,7 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, { unsigned int flags, nr_folios, nr_found; unsigned int i, pgshift = PAGE_SHIFT; - pgoff_t start_idx, end_idx, next_idx; + pgoff_t start_idx, end_idx; struct folio *folio = NULL; struct folio_batch fbatch; struct hstate *h; @@ -3697,20 +3512,8 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, folio = NULL; } - next_idx = 0; for (i = 0; i < nr_found; i++) { - /* - * As there can be multiple entries for a - * given folio in the batch returned by - * filemap_get_folios_contig(), the below - * check is to ensure that we pin and return a - * unique set of folios between start and end. - */ - if (next_idx && - next_idx != folio_index(fbatch.folios[i])) - continue; - - folio = page_folio(&fbatch.folios[i]->page); + folio = fbatch.folios[i]; if (try_grab_folio(folio, 1, FOLL_PIN)) { folio_batch_release(&fbatch); @@ -3722,7 +3525,6 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, *offset = offset_in_folio(folio, start); folios[nr_folios] = folio; - next_idx = folio_next_index(folio); if (++nr_folios == max_folios) break; } @@ -10,6 +10,7 @@ */ #include <linux/pagewalk.h> #include <linux/hmm.h> +#include <linux/hmm-dma.h> #include <linux/init.h> #include <linux/rmap.h> #include <linux/swap.h> @@ -23,6 +24,7 @@ #include <linux/sched/mm.h> #include <linux/jump_label.h> #include <linux/dma-mapping.h> +#include <linux/pci-p2pdma.h> #include <linux/mmu_notifier.h> #include <linux/memory_hotplug.h> @@ -39,13 +41,21 @@ enum { HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT, }; +enum { + /* These flags are carried from input-to-output */ + HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | + HMM_PFN_P2PDMA_BUS, +}; + static int hmm_pfns_fill(unsigned long addr, unsigned long end, struct hmm_range *range, unsigned long cpu_flags) { unsigned long i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) - range->hmm_pfns[i] = cpu_flags; + for (; addr < end; addr += PAGE_SIZE, i++) { + range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + range->hmm_pfns[i] |= cpu_flags; + } return 0; } @@ -173,6 +183,7 @@ static inline unsigned long hmm_pfn_flags_order(unsigned long order) return order << HMM_PFN_ORDER_SHIFT; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) { @@ -183,7 +194,6 @@ static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range, hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, unsigned long end, unsigned long hmm_pfns[], pmd_t pmd) @@ -202,8 +212,10 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, return hmm_vma_fault(addr, end, required_fault, walk); pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) - hmm_pfns[i] = pfn | cpu_flags; + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { + hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + hmm_pfns[i] |= pfn | cpu_flags; + } return 0; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -230,14 +242,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, unsigned long cpu_flags; pte_t pte = ptep_get(ptep); uint64_t pfn_req_flags = *hmm_pfn; + uint64_t new_pfn_flags = 0; if (pte_none_mostly(pte)) { required_fault = 
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (required_fault) goto fault; - *hmm_pfn = 0; - return 0; + goto out; } if (!pte_present(pte)) { @@ -253,16 +265,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, cpu_flags = HMM_PFN_VALID; if (is_writable_device_private_entry(entry)) cpu_flags |= HMM_PFN_WRITE; - *hmm_pfn = swp_offset_pfn(entry) | cpu_flags; - return 0; + new_pfn_flags = swp_offset_pfn(entry) | cpu_flags; + goto out; } required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); - if (!required_fault) { - *hmm_pfn = 0; - return 0; - } + if (!required_fault) + goto out; if (!non_swap_entry(entry)) goto fault; @@ -292,23 +302,22 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, goto fault; /* - * Bypass devmap pte such as DAX page when all pfn requested - * flags(pfn_req_flags) are fulfilled. * Since each architecture defines a struct page for the zero page, just * fall through and treat it like a normal page. */ if (!vm_normal_page(walk->vma, addr, pte) && - !pte_devmap(pte) && !is_zero_pfn(pte_pfn(pte))) { if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) { pte_unmap(ptep); return -EFAULT; } - *hmm_pfn = HMM_PFN_ERROR; - return 0; + new_pfn_flags = HMM_PFN_ERROR; + goto out; } - *hmm_pfn = pte_pfn(pte) | cpu_flags; + new_pfn_flags = pte_pfn(pte) | cpu_flags; +out: + *hmm_pfn = (*hmm_pfn & HMM_PFN_INOUT_FLAGS) | new_pfn_flags; return 0; fault: @@ -351,7 +360,7 @@ again: return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); } - if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { + if (pmd_trans_huge(pmd)) { /* * No need to take pmd_lock here, even if some other thread * is splitting the huge pmd we will get that event through @@ -362,7 +371,7 @@ again: * values. */ pmd = pmdp_get_lockless(pmdp); - if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) + if (!pmd_trans_huge(pmd)) goto again; return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd); @@ -396,8 +405,7 @@ again: return 0; } -#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \ - defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) +#if defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) { @@ -429,7 +437,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, return hmm_vma_walk_hole(start, end, -1, walk); } - if (pud_leaf(pud) && pud_devmap(pud)) { + if (pud_leaf(pud)) { unsigned long i, npages, pfn; unsigned int required_fault; unsigned long *hmm_pfns; @@ -448,8 +456,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, } pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - for (i = 0; i < npages; ++i, ++pfn) - hmm_pfns[i] = pfn | cpu_flags; + for (i = 0; i < npages; ++i, ++pfn) { + hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + hmm_pfns[i] |= pfn | cpu_flags; + } goto out_unlock; } @@ -507,8 +517,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, } pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); - for (; addr < end; addr += PAGE_SIZE, i++, pfn++) - range->hmm_pfns[i] = pfn | cpu_flags; + for (; addr < end; addr += PAGE_SIZE, i++, pfn++) { + range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + range->hmm_pfns[i] |= pfn | cpu_flags; + } spin_unlock(ptl); return 0; @@ -607,3 +619,211 @@ int hmm_range_fault(struct hmm_range *range) return ret; } EXPORT_SYMBOL(hmm_range_fault); + +/** + * hmm_dma_map_alloc - Allocate HMM map structure + * @dev: device to allocate structure for + * @map: HMM map to 
allocate + * @nr_entries: number of entries in the map + * @dma_entry_size: size of the DMA entry in the map + * + * Allocate the HMM map structure and all the lists it contains. + * Return 0 on success, -ENOMEM on failure. + */ +int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map, + size_t nr_entries, size_t dma_entry_size) +{ + bool dma_need_sync = false; + bool use_iova; + + WARN_ON_ONCE(!(nr_entries * PAGE_SIZE / dma_entry_size)); + + /* + * The HMM API violates our normal DMA buffer ownership rules and can't + * transfer buffer ownership. The dma_addressing_limited() check is a + * best approximation to ensure no swiotlb buffering happens. + */ +#ifdef CONFIG_DMA_NEED_SYNC + dma_need_sync = !dev->dma_skip_sync; +#endif /* CONFIG_DMA_NEED_SYNC */ + if (dma_need_sync || dma_addressing_limited(dev)) + return -EOPNOTSUPP; + + map->dma_entry_size = dma_entry_size; + map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list), + GFP_KERNEL | __GFP_NOWARN); + if (!map->pfn_list) + return -ENOMEM; + + use_iova = dma_iova_try_alloc(dev, &map->state, 0, + nr_entries * PAGE_SIZE); + if (!use_iova && dma_need_unmap(dev)) { + map->dma_list = kvcalloc(nr_entries, sizeof(*map->dma_list), + GFP_KERNEL | __GFP_NOWARN); + if (!map->dma_list) + goto err_dma; + } + return 0; + +err_dma: + kvfree(map->pfn_list); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(hmm_dma_map_alloc); + +/** + * hmm_dma_map_free - iFree HMM map structure + * @dev: device to free structure from + * @map: HMM map containing the various lists and state + * + * Free the HMM map structure and all the lists it contains. + */ +void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map) +{ + if (dma_use_iova(&map->state)) + dma_iova_free(dev, &map->state); + kvfree(map->pfn_list); + kvfree(map->dma_list); +} +EXPORT_SYMBOL_GPL(hmm_dma_map_free); + +/** + * hmm_dma_map_pfn - Map a physical HMM page to DMA address + * @dev: Device to map the page for + * @map: HMM map + * @idx: Index into the PFN and dma address arrays + * @p2pdma_state: PCI P2P state. + * + * dma_alloc_iova() allocates IOVA based on the size specified by their use in + * iova->size. Call this function after IOVA allocation to link whole @page + * to get the DMA address. Note that very first call to this function + * will have @offset set to 0 in the IOVA space allocated from + * dma_alloc_iova(). For subsequent calls to this function on same @iova, + * @offset needs to be advanced by the caller with the size of previous + * page that was linked + DMA address returned for the previous page that was + * linked by this function. + */ +dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map, + size_t idx, + struct pci_p2pdma_map_state *p2pdma_state) +{ + struct dma_iova_state *state = &map->state; + dma_addr_t *dma_addrs = map->dma_list; + unsigned long *pfns = map->pfn_list; + struct page *page = hmm_pfn_to_page(pfns[idx]); + phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]); + size_t offset = idx * map->dma_entry_size; + unsigned long attrs = 0; + dma_addr_t dma_addr; + int ret; + + if ((pfns[idx] & HMM_PFN_DMA_MAPPED) && + !(pfns[idx] & HMM_PFN_P2PDMA_BUS)) { + /* + * We are in this flow when there is a need to resync flags, + * for example when page was already linked in prefetch call + * with READ flag and now we need to add WRITE flag + * + * This page was already programmed to HW and we don't want/need + * to unlink and link it again just to resync flags. 
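
Taken together, the hmm_dma_* helpers are meant to be driven from a pager-style loop: allocate the map once, let hmm_range_fault() populate map->pfn_list, then translate each index into a DMA address. A condensed, hypothetical driver sketch (mmap locking, mmu-notifier retry and mydrv_program_pte() are omitted or invented; real users would also split ranges and handle partial failures more carefully):

#include <linux/hmm.h>
#include <linux/hmm-dma.h>
#include <linux/pci-p2pdma.h>

/* Sketch only: DMA-map npages of faulted user memory for one device. */
static int mydrv_map_range(struct device *dev, struct hmm_range *range,
			   struct hmm_dma_map *map, size_t npages)
{
	struct pci_p2pdma_map_state p2pdma_state = {};
	size_t i;
	int ret;

	ret = hmm_dma_map_alloc(dev, map, npages, PAGE_SIZE);
	if (ret)
		return ret;

	range->hmm_pfns = map->pfn_list;
	ret = hmm_range_fault(range);
	if (ret)
		goto err_free;

	for (i = 0; i < npages; i++) {
		dma_addr_t dma = hmm_dma_map_pfn(dev, map, i, &p2pdma_state);

		if (dma == DMA_MAPPING_ERROR) {
			ret = -EIO;
			goto err_unmap;
		}
		mydrv_program_pte(i, dma);	/* hypothetical device hook */
	}
	return 0;

err_unmap:
	while (i--)
		hmm_dma_unmap_pfn(dev, map, i);
err_free:
	hmm_dma_map_free(dev, map);
	return ret;
}
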
+ */ + if (dma_use_iova(state)) + return state->addr + offset; + + /* + * Without dma_need_unmap, the dma_addrs array is NULL, thus we + * need to regenerate the address below even if there already + * was a mapping. But !dma_need_unmap implies that the + * mapping stateless, so this is fine. + */ + if (dma_need_unmap(dev)) + return dma_addrs[idx]; + + /* Continue to remapping */ + } + + switch (pci_p2pdma_state(p2pdma_state, dev, page)) { + case PCI_P2PDMA_MAP_NONE: + break; + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + attrs |= DMA_ATTR_SKIP_CPU_SYNC; + pfns[idx] |= HMM_PFN_P2PDMA; + break; + case PCI_P2PDMA_MAP_BUS_ADDR: + pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED; + return pci_p2pdma_bus_addr_map(p2pdma_state, paddr); + default: + return DMA_MAPPING_ERROR; + } + + if (dma_use_iova(state)) { + ret = dma_iova_link(dev, state, paddr, offset, + map->dma_entry_size, DMA_BIDIRECTIONAL, + attrs); + if (ret) + goto error; + + ret = dma_iova_sync(dev, state, offset, map->dma_entry_size); + if (ret) { + dma_iova_unlink(dev, state, offset, map->dma_entry_size, + DMA_BIDIRECTIONAL, attrs); + goto error; + } + + dma_addr = state->addr + offset; + } else { + if (WARN_ON_ONCE(dma_need_unmap(dev) && !dma_addrs)) + goto error; + + dma_addr = dma_map_page(dev, page, 0, map->dma_entry_size, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(dev, dma_addr)) + goto error; + + if (dma_need_unmap(dev)) + dma_addrs[idx] = dma_addr; + } + pfns[idx] |= HMM_PFN_DMA_MAPPED; + return dma_addr; +error: + pfns[idx] &= ~HMM_PFN_P2PDMA; + return DMA_MAPPING_ERROR; + +} +EXPORT_SYMBOL_GPL(hmm_dma_map_pfn); + +/** + * hmm_dma_unmap_pfn - Unmap a physical HMM page from DMA address + * @dev: Device to unmap the page from + * @map: HMM map + * @idx: Index of the PFN to unmap + * + * Returns true if the PFN was mapped and has been unmapped, false otherwise. + */ +bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx) +{ + const unsigned long valid_dma = HMM_PFN_VALID | HMM_PFN_DMA_MAPPED; + struct dma_iova_state *state = &map->state; + dma_addr_t *dma_addrs = map->dma_list; + unsigned long *pfns = map->pfn_list; + unsigned long attrs = 0; + + if ((pfns[idx] & valid_dma) != valid_dma) + return false; + + if (pfns[idx] & HMM_PFN_P2PDMA_BUS) + ; /* no need to unmap bus address P2P mappings */ + else if (dma_use_iova(state)) { + if (pfns[idx] & HMM_PFN_P2PDMA) + attrs |= DMA_ATTR_SKIP_CPU_SYNC; + dma_iova_unlink(dev, state, idx * map->dma_entry_size, + map->dma_entry_size, DMA_BIDIRECTIONAL, attrs); + } else if (dma_need_unmap(dev)) + dma_unmap_page(dev, dma_addrs[idx], map->dma_entry_size, + DMA_BIDIRECTIONAL); + + pfns[idx] &= + ~(HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | HMM_PFN_P2PDMA_BUS); + return true; +} +EXPORT_SYMBOL_GPL(hmm_dma_unmap_pfn); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2a47682d1ab7..9c38a95e9f09 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -22,7 +22,6 @@ #include <linux/mm_types.h> #include <linux/khugepaged.h> #include <linux/freezer.h> -#include <linux/pfn_t.h> #include <linux/mman.h> #include <linux/memremap.h> #include <linux/pagemap.h> @@ -99,7 +98,7 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) } unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, + vm_flags_t vm_flags, unsigned long tva_flags, unsigned long orders) { @@ -166,7 +165,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, * own flags. 
*/ if (!in_pf && shmem_file(vma->vm_file)) - return shmem_allowable_huge_orders(file_inode(vma->vm_file), + return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file), vma, vma->vm_pgoff, 0, !enforce_sysfs); @@ -1203,7 +1202,7 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, { pmd_t entry; - entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); + entry = folio_mk_pmd(folio, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); @@ -1309,8 +1308,7 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm, struct folio *zero_folio) { pmd_t entry; - entry = mk_pmd(&zero_folio->page, vma->vm_page_prot); - entry = pmd_mkhuge(entry); + entry = folio_mk_pmd(zero_folio, vma->vm_page_prot); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); mm_inc_nr_ptes(mm); @@ -1373,9 +1371,17 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return __do_huge_pmd_anonymous_page(vmf); } -static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write, - pgtable_t pgtable) +struct folio_or_pfn { + union { + struct folio *folio; + unsigned long pfn; + }; + bool is_folio; +}; + +static int insert_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot, + bool write, pgtable_t pgtable) { struct mm_struct *mm = vma->vm_mm; pmd_t entry; @@ -1383,8 +1389,11 @@ static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, lockdep_assert_held(pmd_lockptr(mm, pmd)); if (!pmd_none(*pmd)) { + const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) : + fop.pfn; + if (write) { - if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) { + if (pmd_pfn(*pmd) != pfn) { WARN_ON_ONCE(!is_huge_zero_pmd(*pmd)); return -EEXIST; } @@ -1397,11 +1406,16 @@ static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, return -EEXIST; } - entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); - if (pfn_t_devmap(pfn)) - entry = pmd_mkdevmap(entry); - else + if (fop.is_folio) { + entry = folio_mk_pmd(fop.folio, vma->vm_page_prot); + + folio_get(fop.folio); + folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma); + add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR); + } else { + entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot)); entry = pmd_mkspecial(entry); + } if (write) { entry = pmd_mkyoung(pmd_mkdirty(entry)); entry = maybe_pmd_mkwrite(entry, vma); @@ -1427,11 +1441,15 @@ static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, * * Return: vm_fault_t value. */ -vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) +vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn, + bool write) { unsigned long addr = vmf->address & PMD_MASK; struct vm_area_struct *vma = vmf->vma; pgprot_t pgprot = vma->vm_page_prot; + struct folio_or_pfn fop = { + .pfn = pfn, + }; pgtable_t pgtable = NULL; spinlock_t *ptl; int error; @@ -1441,8 +1459,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) * but we need to be consistent with PTEs and architectures that * can't support a 'special' bit. 
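
With pfn_t removed, vmf_insert_pfn_pmd() takes a plain unsigned long PFN plus a write flag. A hypothetical ->huge_fault handler showing the updated calling convention (mydrv_vaddr_to_pfn() is invented; real callers such as DAX also deal with zero pages, pgtable deposit and the PUD case):

#include <linux/mm.h>
#include <linux/huge_mm.h>

/* Sketch only: PMD-sized fault handler for a PFN-mapped region. */
static vm_fault_t mydrv_huge_fault(struct vm_fault *vmf, unsigned int order)
{
	unsigned long pfn;

	if (order != PMD_ORDER)
		return VM_FAULT_FALLBACK;

	pfn = mydrv_vaddr_to_pfn(vmf->vma, vmf->address);	/* hypothetical */
	return vmf_insert_pfn_pmd(vmf, pfn,
				  !!(vmf->flags & FAULT_FLAG_WRITE));
}

The folio-backed variant keeps its own entry point, vmf_insert_folio_pmd(), which now does its refcount and rmap accounting inside the shared insert_pmd() helper.
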
*/ - BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && - !pfn_t_devmap(pfn)); + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); @@ -1456,10 +1473,11 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) return VM_FAULT_OOM; } - track_pfn_insert(vma, &pgprot, pfn); + pfnmap_setup_cachemode_pfn(pfn, &pgprot); + ptl = pmd_lock(vma->vm_mm, vmf->pmd); - error = insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, - pgtable); + error = insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write, + pgtable); spin_unlock(ptl); if (error && pgtable) pte_free(vma->vm_mm, pgtable); @@ -1474,6 +1492,10 @@ vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address & PMD_MASK; struct mm_struct *mm = vma->vm_mm; + struct folio_or_pfn fop = { + .folio = folio, + .is_folio = true, + }; spinlock_t *ptl; pgtable_t pgtable = NULL; int error; @@ -1491,14 +1513,8 @@ vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, } ptl = pmd_lock(mm, vmf->pmd); - if (pmd_none(*vmf->pmd)) { - folio_get(folio); - folio_add_file_rmap_pmd(folio, &folio->page, vma); - add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR); - } - error = insert_pfn_pmd(vma, addr, vmf->pmd, - pfn_to_pfn_t(folio_pfn(folio)), vma->vm_page_prot, - write, pgtable); + error = insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot, + write, pgtable); spin_unlock(ptl); if (error && pgtable) pte_free(mm, pgtable); @@ -1515,16 +1531,18 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) return pud; } -static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, pfn_t pfn, bool write) +static void insert_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write) { struct mm_struct *mm = vma->vm_mm; - pgprot_t prot = vma->vm_page_prot; pud_t entry; if (!pud_none(*pud)) { + const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) : + fop.pfn; + if (write) { - if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn))) + if (WARN_ON_ONCE(pud_pfn(*pud) != pfn)) return; entry = pud_mkyoung(*pud); entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); @@ -1534,11 +1552,16 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, return; } - entry = pud_mkhuge(pfn_t_pud(pfn, prot)); - if (pfn_t_devmap(pfn)) - entry = pud_mkdevmap(entry); - else + if (fop.is_folio) { + entry = folio_mk_pud(fop.folio, vma->vm_page_prot); + + folio_get(fop.folio); + folio_add_file_rmap_pud(fop.folio, &fop.folio->page, vma); + add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PUD_NR); + } else { + entry = pud_mkhuge(pfn_pud(fop.pfn, prot)); entry = pud_mkspecial(entry); + } if (write) { entry = pud_mkyoung(pud_mkdirty(entry)); entry = maybe_pud_mkwrite(entry, vma); @@ -1557,11 +1580,15 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, * * Return: vm_fault_t value. 
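
With pfn_t gone from this path, vmf_insert_pfn_pmd() now takes a bare unsigned long PFN. A hypothetical PFNMAP driver's huge-fault handler would look roughly like the sketch below; drv_base_pfn() and the handler are made up, the ->huge_fault(vmf, order) prototype is assumed to match current kernels, and only the vmf_insert_pfn_pmd() signature follows this patch.

static unsigned long drv_base_pfn(struct vm_area_struct *vma); /* hypothetical */

static vm_fault_t drv_huge_fault(struct vm_fault *vmf, unsigned int order)
{
	unsigned long pfn = drv_base_pfn(vmf->vma) + vmf->pgoff;

	if (order != PMD_ORDER)
		return VM_FAULT_FALLBACK;

	return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
}
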
*/ -vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) +vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn, + bool write) { unsigned long addr = vmf->address & PUD_MASK; struct vm_area_struct *vma = vmf->vma; pgprot_t pgprot = vma->vm_page_prot; + struct folio_or_pfn fop = { + .pfn = pfn, + }; spinlock_t *ptl; /* @@ -1569,8 +1596,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) * but we need to be consistent with PTEs and architectures that * can't support a 'special' bit. */ - BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && - !pfn_t_devmap(pfn)); + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); @@ -1578,10 +1604,10 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; - track_pfn_insert(vma, &pgprot, pfn); + pfnmap_setup_cachemode_pfn(pfn, &pgprot); ptl = pud_lock(vma->vm_mm, vmf->pud); - insert_pfn_pud(vma, addr, vmf->pud, pfn, write); + insert_pud(vma, addr, vmf->pud, fop, pgprot, write); spin_unlock(ptl); return VM_FAULT_NOPAGE; @@ -1603,6 +1629,10 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, unsigned long addr = vmf->address & PUD_MASK; pud_t *pud = vmf->pud; struct mm_struct *mm = vma->vm_mm; + struct folio_or_pfn fop = { + .folio = folio, + .is_folio = true, + }; spinlock_t *ptl; if (addr < vma->vm_start || addr >= vma->vm_end) @@ -1612,20 +1642,7 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, return VM_FAULT_SIGBUS; ptl = pud_lock(mm, pud); - - /* - * If there is already an entry present we assume the folio is - * already mapped, hence no need to take another reference. We - * still call insert_pfn_pud() though in case the mapping needs - * upgrading to writeable. - */ - if (pud_none(*vmf->pud)) { - folio_get(folio); - folio_add_file_rmap_pud(folio, &folio->page, vma); - add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR); - } - insert_pfn_pud(vma, addr, vmf->pud, pfn_to_pfn_t(folio_pfn(folio)), - write); + insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write); spin_unlock(ptl); return VM_FAULT_NOPAGE; @@ -1646,46 +1663,6 @@ void touch_pmd(struct vm_area_struct *vma, unsigned long addr, update_mmu_cache_pmd(vma, addr, pmd); } -struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, int flags, struct dev_pagemap **pgmap) -{ - unsigned long pfn = pmd_pfn(*pmd); - struct mm_struct *mm = vma->vm_mm; - struct page *page; - int ret; - - assert_spin_locked(pmd_lockptr(mm, pmd)); - - if (flags & FOLL_WRITE && !pmd_write(*pmd)) - return NULL; - - if (pmd_present(*pmd) && pmd_devmap(*pmd)) - /* pass */; - else - return NULL; - - if (flags & FOLL_TOUCH) - touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); - - /* - * device mapped pages can only be returned if the - * caller will manage the page reference count. 
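
The PUD level gets the same treatment: insert_pud() now handles both folios and raw PFNs, and vmf_insert_folio_pud() becomes a thin wrapper around it. A sketch of a fault path handing it a PUD-sized folio; the lookup helper is hypothetical and the folio is assumed to already be of the right order.

static struct folio *drv_lookup_pud_folio(struct vm_fault *vmf); /* hypothetical */

static vm_fault_t drv_pud_fault(struct vm_fault *vmf)
{
	struct folio *folio = drv_lookup_pud_folio(vmf);

	if (!folio)
		return VM_FAULT_FALLBACK;

	/* folio is assumed to be PUD-sized and suitably aligned. */
	return vmf_insert_folio_pud(vmf, folio,
				    vmf->flags & FAULT_FLAG_WRITE);
}
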
- */ - if (!(flags & (FOLL_GET | FOLL_PIN))) - return ERR_PTR(-EEXIST); - - pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; - *pgmap = get_dev_pagemap(pfn, *pgmap); - if (!*pgmap) - return ERR_PTR(-EFAULT); - page = pfn_to_page(pfn); - ret = try_grab_folio(page_folio(page), 1, flags); - if (ret) - page = ERR_PTR(ret); - - return page; -} - int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) @@ -1786,7 +1763,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_free(dst_mm, pgtable); spin_unlock(src_ptl); spin_unlock(dst_ptl); - __split_huge_pmd(src_vma, src_pmd, addr, false, NULL); + __split_huge_pmd(src_vma, src_pmd, addr, false); return -EAGAIN; } add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); @@ -1837,7 +1814,7 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pud = *src_pud; - if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud))) + if (unlikely(!pud_trans_huge(pud))) goto out_unlock; /* @@ -2008,7 +1985,7 @@ unlock_fallback: folio_unlock(folio); spin_unlock(vmf->ptl); fallback: - __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); + __split_huge_pmd(vma, vmf->pmd, vmf->address, false); return VM_FAULT_FALLBACK; } @@ -2260,6 +2237,14 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PMD_NR); + + /* + * Use flush_needed to indicate whether the PMD entry + * is present, instead of checking pmd_present() again. + */ + if (flush_needed && pmd_young(orig_pmd) && + likely(vma_has_recency(vma))) + folio_mark_accessed(folio); } spin_unlock(ptl); @@ -2653,12 +2638,12 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm folio_move_anon_rmap(src_folio, dst_vma); src_folio->index = linear_page_index(dst_vma, dst_addr); - _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); + _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot); /* Follow mremap() behavior and treat the entry dirty after the move */ _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); } else { src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); - _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot); + _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot); } set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd); @@ -2691,8 +2676,7 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { spinlock_t *ptl; ptl = pmd_lock(vma->vm_mm, pmd); - if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || - pmd_devmap(*pmd))) + if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd))) return ptl; spin_unlock(ptl); return NULL; @@ -2709,7 +2693,7 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) spinlock_t *ptl; ptl = pud_lock(vma->vm_mm, pud); - if (likely(pud_trans_huge(*pud) || pud_devmap(*pud))) + if (likely(pud_trans_huge(*pud))) return ptl; spin_unlock(ptl); return NULL; @@ -2761,7 +2745,7 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); - VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud)); + VM_BUG_ON(!pud_trans_huge(*pud)); count_vm_event(THP_SPLIT_PUD); @@ -2794,7 +2778,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, (address & 
HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(&range); ptl = pud_lock(vma->vm_mm, pud); - if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) + if (unlikely(!pud_trans_huge(*pud))) goto out; __split_huge_pud_locked(vma, pud, range.start); @@ -2867,8 +2851,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); - VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) - && !pmd_devmap(*pmd)); + VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)); count_vm_event(THP_SPLIT_PMD); @@ -3073,28 +3056,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, bool freeze, struct folio *folio) + pmd_t *pmd, bool freeze) { - VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio)); VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); - VM_WARN_ON_ONCE(folio && !folio_test_locked(folio)); - VM_BUG_ON(freeze && !folio); - - /* - * When the caller requests to set up a migration entry, we - * require a folio to check the PMD against. Otherwise, there - * is a risk of replacing the wrong folio. - */ - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || - is_pmd_migration_entry(*pmd)) { - if (folio && folio != pmd_folio(*pmd)) - return; + if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd)) __split_huge_pmd_locked(vma, pmd, address, freeze); - } } void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address, bool freeze, struct folio *folio) + unsigned long address, bool freeze) { spinlock_t *ptl; struct mmu_notifier_range range; @@ -3104,20 +3074,20 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); ptl = pmd_lock(vma->vm_mm, pmd); - split_huge_pmd_locked(vma, range.start, pmd, freeze, folio); + split_huge_pmd_locked(vma, range.start, pmd, freeze); spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); } void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, - bool freeze, struct folio *folio) + bool freeze) { pmd_t *pmd = mm_find_pmd(vma->vm_mm, address); if (!pmd) return; - __split_huge_pmd(vma, pmd, address, freeze, folio); + __split_huge_pmd(vma, pmd, address, freeze); } static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address) @@ -3129,7 +3099,7 @@ static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) && range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE), ALIGN(address, HPAGE_PMD_SIZE))) - split_huge_pmd_address(vma, address, false, NULL); + split_huge_pmd_address(vma, address, false); } void vma_adjust_trans_huge(struct vm_area_struct *vma, @@ -3415,10 +3385,6 @@ static void __split_folio_to_order(struct folio *folio, int old_order, * order - 1 to new_order). * @split_at: in buddy allocator like split, the folio containing @split_at * will be split until its order becomes @new_order. - * @lock_at: the folio containing @lock_at is left locked for caller. - * @list: the after split folios will be added to @list if it is not NULL, - * otherwise to LRU lists. - * @end: the end of the file @folio maps to. -1 if @folio is anonymous memory. 
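
Since split_huge_pmd_locked() and __split_huge_pmd() no longer take a folio to check against, callers that used to pass NULL simply drop the argument. Caller-side, roughly (illustrative wrapper only):

/* Before this change: __split_huge_pmd(vma, pmd, addr, false, NULL); */
static void split_examples(struct vm_area_struct *vma, pmd_t *pmd,
			   unsigned long addr)
{
	/* Split a PMD the caller already located ... */
	__split_huge_pmd(vma, pmd, addr, /* freeze = */ false);
	/* ... or have the PMD looked up by address first. */
	split_huge_pmd_address(vma, addr, /* freeze = */ false);
}
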
* @xas: xa_state pointing to folio->mapping->i_pages and locked by caller * @mapping: @folio->mapping * @uniform_split: if the split is uniform or not (buddy allocator like split) @@ -3444,52 +3410,26 @@ static void __split_folio_to_order(struct folio *folio, int old_order, * @page, which is split in next for loop. * * After splitting, the caller's folio reference will be transferred to the - * folio containing @page. The other folios may be freed if they are not mapped. - * - * In terms of locking, after splitting, - * 1. uniform split leaves @page (or the folio contains it) locked; - * 2. buddy allocator like (non-uniform) split leaves @folio locked. - * + * folio containing @page. The caller needs to unlock and/or free after-split + * folios if necessary. * * For !uniform_split, when -ENOMEM is returned, the original folio might be * split. The caller needs to check the input folio. */ static int __split_unmapped_folio(struct folio *folio, int new_order, - struct page *split_at, struct page *lock_at, - struct list_head *list, pgoff_t end, - struct xa_state *xas, struct address_space *mapping, - bool uniform_split) + struct page *split_at, struct xa_state *xas, + struct address_space *mapping, bool uniform_split) { - struct lruvec *lruvec; - struct address_space *swap_cache = NULL; - struct folio *origin_folio = folio; - struct folio *next_folio = folio_next(folio); - struct folio *new_folio; - struct folio *next; int order = folio_order(folio); - int split_order; int start_order = uniform_split ? new_order : order - 1; - int nr_dropped = 0; - int ret = 0; bool stop_split = false; - - if (folio_test_swapcache(folio)) { - VM_BUG_ON(mapping); - - /* a swapcache folio can only be uniformly split to order-0 */ - if (!uniform_split || new_order != 0) - return -EINVAL; - - swap_cache = swap_address_space(folio->swap); - xa_lock(&swap_cache->i_pages); - } + struct folio *next; + int split_order; + int ret = 0; if (folio_test_anon(folio)) mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); - /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ - lruvec = folio_lruvec_lock(folio); - folio_clear_has_hwpoisoned(folio); /* @@ -3499,9 +3439,9 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, for (split_order = start_order; split_order >= new_order && !stop_split; split_order--) { - int old_order = folio_order(folio); - struct folio *release; struct folio *end_folio = folio_next(folio); + int old_order = folio_order(folio); + struct folio *new_folio; /* order-1 anonymous folio is not supported */ if (folio_test_anon(folio) && split_order == 1) @@ -3523,126 +3463,45 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, if (xas_error(xas)) { ret = xas_error(xas); stop_split = true; - goto after_split; } } } - folio_split_memcg_refs(folio, old_order, split_order); - split_page_owner(&folio->page, old_order, split_order); - pgalloc_tag_split(folio, old_order, split_order); + if (!stop_split) { + folio_split_memcg_refs(folio, old_order, split_order); + split_page_owner(&folio->page, old_order, split_order); + pgalloc_tag_split(folio, old_order, split_order); - __split_folio_to_order(folio, old_order, split_order); + __split_folio_to_order(folio, old_order, split_order); + } -after_split: /* - * Iterate through after-split folios and perform related - * operations. But in buddy allocator like split, the folio + * Iterate through after-split folios and update folio stats. 
+ * But in buddy allocator like split, the folio * containing the specified page is skipped until its order * is new_order, since the folio will be worked on in next * iteration. */ - for (release = folio; release != end_folio; release = next) { - next = folio_next(release); + for (new_folio = folio; new_folio != end_folio; new_folio = next) { + next = folio_next(new_folio); /* - * for buddy allocator like split, the folio containing - * page will be split next and should not be released, - * until the folio's order is new_order or stop_split - * is set to true by the above xas_split() failure. + * for buddy allocator like split, new_folio containing + * @split_at page could be split again, thus do not + * change stats yet. Wait until new_folio's order is + * @new_order or stop_split is set to true by the above + * xas_split() failure. */ - if (release == page_folio(split_at)) { - folio = release; + if (new_folio == page_folio(split_at)) { + folio = new_folio; if (split_order != new_order && !stop_split) continue; } - if (folio_test_anon(release)) { - mod_mthp_stat(folio_order(release), - MTHP_STAT_NR_ANON, 1); - } - - /* - * origin_folio should be kept frozon until page cache - * entries are updated with all the other after-split - * folios to prevent others seeing stale page cache - * entries. - */ - if (release == origin_folio) - continue; - - folio_ref_unfreeze(release, 1 + - ((mapping || swap_cache) ? - folio_nr_pages(release) : 0)); - - lru_add_split_folio(origin_folio, release, lruvec, - list); - - /* Some pages can be beyond EOF: drop them from cache */ - if (release->index >= end) { - if (shmem_mapping(mapping)) - nr_dropped += folio_nr_pages(release); - else if (folio_test_clear_dirty(release)) - folio_account_cleaned(release, - inode_to_wb(mapping->host)); - __filemap_remove_folio(release, NULL); - folio_put_refs(release, folio_nr_pages(release)); - } else if (mapping) { - __xa_store(&mapping->i_pages, - release->index, release, 0); - } else if (swap_cache) { - __xa_store(&swap_cache->i_pages, - swap_cache_index(release->swap), - release, 0); - } + if (folio_test_anon(new_folio)) + mod_mthp_stat(folio_order(new_folio), + MTHP_STAT_NR_ANON, 1); } } - /* - * Unfreeze origin_folio only after all page cache entries, which used - * to point to it, have been updated with new folios. Otherwise, - * a parallel folio_try_get() can grab origin_folio and its caller can - * see stale page cache entries. - */ - folio_ref_unfreeze(origin_folio, 1 + - ((mapping || swap_cache) ? folio_nr_pages(origin_folio) : 0)); - - unlock_page_lruvec(lruvec); - - if (swap_cache) - xa_unlock(&swap_cache->i_pages); - if (mapping) - xa_unlock(&mapping->i_pages); - - /* Caller disabled irqs, so they are still disabled here */ - local_irq_enable(); - - if (nr_dropped) - shmem_uncharge(mapping->host, nr_dropped); - - remap_page(origin_folio, 1 << order, - folio_test_anon(origin_folio) ? - RMP_USE_SHARED_ZEROPAGE : 0); - - /* - * At this point, folio should contain the specified page. - * For uniform split, it is left for caller to unlock. - * For buddy allocator like split, the first after-split folio is left - * for caller to unlock. - */ - for (new_folio = origin_folio; new_folio != next_folio; new_folio = next) { - next = folio_next(new_folio); - if (new_folio == page_folio(lock_at)) - continue; - - folio_unlock(new_folio); - /* - * Subpages may be freed if there wasn't any mapping - * like if add_to_swap() is running on a lru page that - * had its mapping zapped. 
And freeing these pages - * requires taking the lru_lock so we do the put_page - * of the tail pages after the split is complete. - */ - free_page_and_swap_cache(&new_folio->page); - } return ret; } @@ -3716,6 +3575,11 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order, * It is in charge of checking whether the split is supported or not and * preparing @folio for __split_unmapped_folio(). * + * After splitting, the after-split folio containing @lock_at remains locked + * and others are unlocked: + * 1. for uniform split, @lock_at points to one of @folio's subpages; + * 2. for buddy allocator like (non-uniform) split, @lock_at points to @folio. + * * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be * split but not to @new_order, the caller needs to check) */ @@ -3725,16 +3589,20 @@ static int __folio_split(struct folio *folio, unsigned int new_order, { struct deferred_split *ds_queue = get_deferred_split_queue(folio); XA_STATE(xas, &folio->mapping->i_pages, folio->index); + struct folio *end_folio = folio_next(folio); bool is_anon = folio_test_anon(folio); struct address_space *mapping = NULL; struct anon_vma *anon_vma = NULL; int order = folio_order(folio); + struct folio *new_folio, *next; + int nr_shmem_dropped = 0; + int remap_flags = 0; int extra_pins, ret; pgoff_t end; bool is_hzp; - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio); if (folio != page_folio(split_at) || folio != page_folio(lock_at)) return -EINVAL; @@ -3772,7 +3640,6 @@ static int __folio_split(struct folio *folio, unsigned int new_order, ret = -EBUSY; goto out; } - end = -1; mapping = NULL; anon_vma_lock_write(anon_vma); } else { @@ -3852,13 +3719,19 @@ static int __folio_split(struct folio *folio, unsigned int new_order, */ xas_lock(&xas); xas_reset(&xas); - if (xas_load(&xas) != folio) + if (xas_load(&xas) != folio) { + ret = -EAGAIN; goto fail; + } } /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); if (folio_ref_freeze(folio, 1 + extra_pins)) { + struct address_space *swap_cache = NULL; + struct lruvec *lruvec; + int expected_refs; + if (folio_order(folio) > 1 && !list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; @@ -3892,18 +3765,122 @@ static int __folio_split(struct folio *folio, unsigned int new_order, } } - ret = __split_unmapped_folio(folio, new_order, - split_at, lock_at, list, end, &xas, mapping, - uniform_split); + if (folio_test_swapcache(folio)) { + if (mapping) { + VM_WARN_ON_ONCE_FOLIO(mapping, folio); + ret = -EINVAL; + goto fail; + } + + swap_cache = swap_address_space(folio->swap); + xa_lock(&swap_cache->i_pages); + } + + /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ + lruvec = folio_lruvec_lock(folio); + + ret = __split_unmapped_folio(folio, new_order, split_at, &xas, + mapping, uniform_split); + + /* + * Unfreeze after-split folios and put them back to the right + * list. @folio should be kept frozon until page cache + * entries are updated with all the other after-split folios + * to prevent others seeing stale page cache entries. + * As a result, new_folio starts from the next folio of + * @folio. 
+ */ + for (new_folio = folio_next(folio); new_folio != end_folio; + new_folio = next) { + unsigned long nr_pages = folio_nr_pages(new_folio); + + next = folio_next(new_folio); + + expected_refs = folio_expected_ref_count(new_folio) + 1; + folio_ref_unfreeze(new_folio, expected_refs); + + lru_add_split_folio(folio, new_folio, lruvec, list); + + /* + * Anonymous folio with swap cache. + * NOTE: shmem in swap cache is not supported yet. + */ + if (swap_cache) { + __xa_store(&swap_cache->i_pages, + swap_cache_index(new_folio->swap), + new_folio, 0); + continue; + } + + /* Anonymous folio without swap cache */ + if (!mapping) + continue; + + /* Add the new folio to the page cache. */ + if (new_folio->index < end) { + __xa_store(&mapping->i_pages, new_folio->index, + new_folio, 0); + continue; + } + + /* Drop folio beyond EOF: ->index >= end */ + if (shmem_mapping(mapping)) + nr_shmem_dropped += nr_pages; + else if (folio_test_clear_dirty(new_folio)) + folio_account_cleaned( + new_folio, inode_to_wb(mapping->host)); + __filemap_remove_folio(new_folio, NULL); + folio_put_refs(new_folio, nr_pages); + } + /* + * Unfreeze @folio only after all page cache entries, which + * used to point to it, have been updated with new folios. + * Otherwise, a parallel folio_try_get() can grab @folio + * and its caller can see stale page cache entries. + */ + expected_refs = folio_expected_ref_count(folio) + 1; + folio_ref_unfreeze(folio, expected_refs); + + unlock_page_lruvec(lruvec); + + if (swap_cache) + xa_unlock(&swap_cache->i_pages); } else { spin_unlock(&ds_queue->split_queue_lock); -fail: - if (mapping) - xas_unlock(&xas); - local_irq_enable(); - remap_page(folio, folio_nr_pages(folio), 0); ret = -EAGAIN; } +fail: + if (mapping) + xas_unlock(&xas); + + local_irq_enable(); + + if (nr_shmem_dropped) + shmem_uncharge(mapping->host, nr_shmem_dropped); + + if (!ret && is_anon) + remap_flags = RMP_USE_SHARED_ZEROPAGE; + remap_page(folio, 1 << order, remap_flags); + + /* + * Unlock all after-split folios except the one containing + * @lock_at page. If @folio is not split, it will be kept locked. + */ + for (new_folio = folio; new_folio != end_folio; new_folio = next) { + next = folio_next(new_folio); + if (new_folio == page_folio(lock_at)) + continue; + + folio_unlock(new_folio); + /* + * Subpages may be freed if there wasn't any mapping + * like if add_to_swap() is running on a lru page that + * had its mapping zapped. And freeing these pages + * requires taking the lru_lock so we do the put_page + * of the tail pages after the split is complete. 
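
The per-folio unfreeze in the loop above boils down to one idiom: the new reference count is whatever the folio is expected to hold (mappings, page cache or swap cache entries, private data) plus the single reference the splitting code keeps. In isolation:

static void unfreeze_after_split(struct folio *new_folio)
{
	int expected_refs = folio_expected_ref_count(new_folio) + 1;

	folio_ref_unfreeze(new_folio, expected_refs);
}
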
+ */ + free_folio_and_swap_cache(new_folio); + } out_unlock: if (anon_vma) { @@ -4675,7 +4652,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) entry = pmd_to_swp_entry(*pvmw->pmd); folio_get(folio); - pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)); + pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot)); if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); if (is_writable_migration_entry(entry)) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6fccfe6d046c..753f99b4c718 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -25,6 +25,7 @@ #include <linux/mmdebug.h> #include <linux/sched/signal.h> #include <linux/rmap.h> +#include <linux/string_choices.h> #include <linux/string_helpers.h> #include <linux/swap.h> #include <linux/swapops.h> @@ -58,6 +59,7 @@ int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; +__initdata nodemask_t hugetlb_bootmem_nodes; __initdata struct list_head huge_boot_pages[MAX_NUMNODES]; static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata; @@ -120,7 +122,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma); static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, bool take_locks); static struct resv_map *vma_resv_map(struct vm_area_struct *vma); static void hugetlb_free_folio(struct folio *folio) @@ -283,11 +285,6 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, return ret; } -static inline struct hugepage_subpool *subpool_inode(struct inode *inode) -{ - return HUGETLBFS_SB(inode->i_sb)->spool; -} - static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) { return subpool_inode(file_inode(vma->vm_file)); @@ -1250,7 +1247,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma) /* * Reset and decrement one ref on hugepage private reservation. * Called with mm->mmap_lock writer semaphore held. - * This function should be only used by move_vma() and operate on + * This function should be only used by mremap and operate on * same sized vma. It should never come here with last ref on the * reservation. */ @@ -1950,7 +1947,6 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, int order = huge_page_order(h); struct folio *folio; bool alloc_try_hard = true; - bool retry = true; /* * By default we always try hard to allocate the folio with @@ -1965,22 +1961,8 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); -retry: - folio = __folio_alloc(gfp_mask, order, nid, nmask); - /* Ensure hugetlb folio won't have large_rmappable flag set. */ - if (folio) - folio_clear_large_rmappable(folio); - if (folio && !folio_ref_freeze(folio, 1)) { - folio_put(folio); - if (retry) { /* retry once */ - retry = false; - goto retry; - } - /* WOW! twice in a row. */ - pr_warn("HugeTLB unexpected inflated folio ref count\n"); - folio = NULL; - } + folio = (struct folio *)__alloc_frozen_pages(gfp_mask, order, nid, nmask); /* * If we did not specify __GFP_RETRY_MAYFAIL, but still got a @@ -2271,7 +2253,7 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, * as surplus_pages, otherwise it might confuse * persistent_huge_pages() momentarily. 
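
The mk_huge_pmd() conversion that recurs throughout this patch (map_anon_folio_pmd(), move_pages_huge_pmd(), remove_migration_pmd()) is mechanical; shown once in isolation:

static pmd_t example_huge_pmd(struct folio *folio, struct vm_area_struct *vma)
{
	/* Before: mk_huge_pmd(&folio->page, vma->vm_page_prot) */
	return folio_mk_pmd(folio, vma->vm_page_prot);
}
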
*/ - __prep_account_new_huge_page(h, nid); + __prep_account_new_huge_page(h, folio_nid(folio)); /* * We could have raced with the pool size change. @@ -2354,12 +2336,15 @@ struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid, struct folio *folio; spin_lock_irq(&hugetlb_lock); + if (!h->resv_huge_pages) { + spin_unlock_irq(&hugetlb_lock); + return NULL; + } + folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid, nmask); - if (folio) { - VM_BUG_ON(!h->resv_huge_pages); + if (folio) h->resv_huge_pages--; - } spin_unlock_irq(&hugetlb_lock); return folio; @@ -2419,7 +2404,6 @@ static int gather_surplus_pages(struct hstate *h, long delta) long i; long needed, allocated; bool alloc_ok = true; - int node; nodemask_t *mbind_nodemask, alloc_nodemask; mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); @@ -2443,21 +2427,12 @@ retry: for (i = 0; i < needed; i++) { folio = NULL; - /* Prioritize current node */ - if (node_isset(numa_mem_id(), alloc_nodemask)) - folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), - numa_mem_id(), NULL); - - if (!folio) { - for_each_node_mask(node, alloc_nodemask) { - if (node == numa_mem_id()) - continue; - folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), - node, NULL); - if (folio) - break; - } - } + /* + * It is okay to use NUMA_NO_NODE because we use numa_mem_id() + * down the road to pick the current node if that is the case. + */ + folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), + NUMA_NO_NODE, &alloc_nodemask); if (!folio) { alloc_ok = false; break; @@ -2811,20 +2786,24 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, /* * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve * the old one - * @h: struct hstate old page belongs to * @old_folio: Old folio to dissolve * @list: List to isolate the page in case we need to * Returns 0 on success, otherwise negated error. */ -static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, - struct folio *old_folio, struct list_head *list) +static int alloc_and_dissolve_hugetlb_folio(struct folio *old_folio, + struct list_head *list) { - gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + gfp_t gfp_mask; + struct hstate *h; int nid = folio_nid(old_folio); struct folio *new_folio = NULL; int ret = 0; retry: + /* + * The old_folio might have been dissolved from under our feet, so make sure + * to carefully check the state under the lock. + */ spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(old_folio)) { /* @@ -2853,8 +2832,10 @@ retry: cond_resched(); goto retry; } else { + h = folio_hstate(old_folio); if (!new_folio) { spin_unlock_irq(&hugetlb_lock); + gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL); if (!new_folio) @@ -2896,38 +2877,26 @@ free_new: return ret; } -int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) +int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) { - struct hstate *h; - struct folio *folio = page_folio(page); int ret = -EBUSY; - /* - * The page might have been dissolved from under our feet, so make sure - * to carefully check the state under the lock. - * Return success when racing as if we dissolved the page ourselves. 
- */ - spin_lock_irq(&hugetlb_lock); - if (folio_test_hugetlb(folio)) { - h = folio_hstate(folio); - } else { - spin_unlock_irq(&hugetlb_lock); + /* Not to disrupt normal path by vainly holding hugetlb_lock */ + if (!folio_test_hugetlb(folio)) return 0; - } - spin_unlock_irq(&hugetlb_lock); /* * Fence off gigantic pages as there is a cyclic dependency between * alloc_contig_range and them. Return -ENOMEM as this has the effect * of bailing out right away without further retrying. */ - if (hstate_is_gigantic(h)) + if (folio_order(folio) > MAX_PAGE_ORDER) return -ENOMEM; if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) - ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); + ret = alloc_and_dissolve_hugetlb_folio(folio, list); return ret; } @@ -2941,7 +2910,6 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) */ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) { - struct hstate *h; struct folio *folio; int ret = 0; @@ -2949,16 +2917,10 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) while (start_pfn < end_pfn) { folio = pfn_folio(start_pfn); - if (folio_test_hugetlb(folio)) { - h = folio_hstate(folio); - } else { - start_pfn++; - continue; - } - if (!folio_ref_count(folio)) { - ret = alloc_and_dissolve_hugetlb_folio(h, folio, - &isolate_list); + /* Not to disrupt normal path by vainly holding hugetlb_lock */ + if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) { + ret = alloc_and_dissolve_hugetlb_folio(folio, &isolate_list); if (ret) break; @@ -3010,7 +2972,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; - long retval, gbl_chg; + long retval, gbl_chg, gbl_reserve; map_chg_state map_chg; int ret, idx; struct hugetlb_cgroup *h_cg = NULL; @@ -3163,8 +3125,16 @@ out_uncharge_cgroup_reservation: hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), h_cg); out_subpool_put: - if (map_chg) - hugepage_subpool_put_pages(spool, 1); + /* + * put page to subpool iff the quota of subpool's rsv_hpages is used + * during hugepage_subpool_get_pages. + */ + if (map_chg && !gbl_chg) { + gbl_reserve = hugepage_subpool_put_pages(spool, 1); + hugetlb_acct_memory(h, -gbl_reserve); + } + + out_end_reservation: if (map_chg != MAP_CHG_ENFORCED) vma_end_reservation(h, vma, addr); @@ -3237,7 +3207,8 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) } /* allocate from next node when distributing huge pages */ - for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_ONLINE]) { + for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, + &hugetlb_bootmem_nodes) { m = alloc_bootmem(h, node, false); if (!m) return 0; @@ -3327,8 +3298,8 @@ static void __init hugetlb_bootmem_init_migratetype(struct folio *folio, if (folio_test_hugetlb_cma(folio)) init_cma_pageblock(folio_page(folio, i)); else - set_pageblock_migratetype(folio_page(folio, i), - MIGRATE_MOVABLE); + init_pageblock_migratetype(folio_page(folio, i), + MIGRATE_MOVABLE, false); } } @@ -3701,6 +3672,15 @@ static void __init hugetlb_init_hstates(void) struct hstate *h, *h2; for_each_hstate(h) { + /* + * Always reset to first_memory_node here, even if + * next_nid_to_alloc was set before - we can't + * reference hugetlb_bootmem_nodes after init, and + * first_memory_node is right for all further allocations. 
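
isolate_or_dissolve_huge_page() becomes isolate_or_dissolve_huge_folio() and drops the early hugetlb_lock round-trip, so callers in the contiguous-allocation path hand over the folio directly. Caller-side, roughly (the wrapper itself is illustrative):

static int drv_isolate_one(struct page *page, struct list_head *pagelist)
{
	struct folio *folio = page_folio(page);

	/* Returns 0 on success or when the folio is no longer hugetlb. */
	return isolate_or_dissolve_huge_folio(folio, pagelist);
}
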
+ */ + h->next_nid_to_alloc = first_memory_node; + h->next_nid_to_free = first_memory_node; + /* oversize hugepages were init'ed in early boot */ if (!hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); @@ -3740,10 +3720,10 @@ static void __init report_hugepages(void) string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", - buf, h->free_huge_pages); + buf, h->nr_huge_pages); if (nrinvalid) pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n", - buf, nrinvalid, nrinvalid > 1 ? "s" : ""); + buf, nrinvalid, str_plural(nrinvalid)); pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf); } @@ -3825,6 +3805,7 @@ found: static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { + unsigned long persistent_free_count; unsigned long min_count; unsigned long allocated; struct folio *folio; @@ -3959,8 +3940,24 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. + * + * min_count is the expected number of persistent pages, we + * shouldn't calculate min_count by using + * resv_huge_pages + persistent_huge_pages() - free_huge_pages, + * because there may exist free surplus huge pages, and this will + * lead to subtracting twice. Free surplus huge pages come from HVO + * failing to restore vmemmap, see comments in the callers of + * hugetlb_vmemmap_restore_folio(). Thus, we should calculate + * persistent free count first. */ - min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; + persistent_free_count = h->free_huge_pages; + if (h->free_huge_pages > persistent_huge_pages(h)) { + if (h->free_huge_pages > h->surplus_huge_pages) + persistent_free_count -= h->surplus_huge_pages; + else + persistent_free_count = 0; + } + min_count = h->resv_huge_pages + persistent_huge_pages(h) - persistent_free_count; min_count = max(count, min_count); try_to_free_low(h, min_count, nodes_allowed); @@ -4017,10 +4014,13 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, list_for_each_entry_safe(folio, next, src_list, lru) { int i; + bool cma; if (folio_test_hugetlb_vmemmap_optimized(folio)) continue; + cma = folio_test_hugetlb_cma(folio); + list_del(&folio->lru); split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst)); @@ -4036,6 +4036,9 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, new_folio->mapping = NULL; init_new_hugetlb_folio(dst, new_folio); + /* Copy the CMA flag so that it is freed correctly */ + if (cma) + folio_set_hugetlb_cma(new_folio); list_add(&new_folio->lru, &dst_list); } } @@ -4630,7 +4633,7 @@ static void __init hugetlb_sysfs_init(void) err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, hstate_kobjs, &hstate_attr_group); if (err) - pr_err("HugeTLB: Unable to add hstate %s", h->name); + pr_err("HugeTLB: Unable to add hstate %s\n", h->name); } #ifdef CONFIG_NUMA @@ -4990,6 +4993,20 @@ static int __init default_hugepagesz_setup(char *s) } hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup); +void __init hugetlb_bootmem_set_nodes(void) +{ + int i, nid; + unsigned long start_pfn, end_pfn; + + if (!nodes_empty(hugetlb_bootmem_nodes)) + return; + + for_each_mem_pfn_range(i, MAX_NUMNODES, 
&start_pfn, &end_pfn, &nid) { + if (end_pfn > start_pfn) + node_set(nid, hugetlb_bootmem_nodes); + } +} + static bool __hugetlb_bootmem_allocated __initdata; bool __init hugetlb_bootmem_allocated(void) @@ -5005,6 +5022,8 @@ void __init hugetlb_bootmem_alloc(void) if (__hugetlb_bootmem_allocated) return; + hugetlb_bootmem_set_nodes(); + for (i = 0; i < MAX_NUMNODES; i++) INIT_LIST_HEAD(&huge_boot_pages[i]); @@ -5012,7 +5031,6 @@ void __init hugetlb_bootmem_alloc(void) for_each_hstate(h) { h->next_nid_to_alloc = first_online_node; - h->next_nid_to_free = first_online_node; if (hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); @@ -5179,7 +5197,7 @@ static const struct ctl_table hugetlb_table[] = { }, }; -static void hugetlb_sysctl_init(void) +static void __init hugetlb_sysctl_init(void) { register_sysctl_init("vm", hugetlb_table); } @@ -5387,26 +5405,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) { if (addr & ~(huge_page_mask(hstate_vma(vma)))) return -EINVAL; + return 0; +} +void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) +{ /* * PMD sharing is only possible for PUD_SIZE-aligned address ranges * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this * split, unshare PMDs in the PUD_SIZE interval surrounding addr now. + * This function is called in the middle of a VMA split operation, with + * MM, VMA and rmap all write-locked to prevent concurrent page table + * walks (except hardware and gup_fast()). */ + vma_assert_write_locked(vma); + i_mmap_assert_write_locked(vma->vm_file->f_mapping); + if (addr & ~PUD_MASK) { - /* - * hugetlb_vm_op_split is called right before we attempt to - * split the VMA. We will need to unshare PMDs in the old and - * new VMAs, so let's unshare before we split. - */ unsigned long floor = addr & PUD_MASK; unsigned long ceil = floor + PUD_SIZE; - if (floor >= vma->vm_start && ceil <= vma->vm_end) - hugetlb_unshare_pmds(vma, floor, ceil); + if (floor >= vma->vm_start && ceil <= vma->vm_end) { + /* + * Locking: + * Use take_locks=false here. + * The file rmap lock is already held. + * The hugetlb VMA lock can't be taken when we already + * hold the file rmap lock, and we don't need it because + * its purpose is to synchronize against concurrent page + * table walks, which are not possible thanks to the + * locks held by our caller. 
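
The persistent_free_count computation added to set_max_huge_pages() above is easiest to see with made-up numbers: with 3 persistent pages, 4 surplus pages, 6 free pages and 1 reserved page, all 4 surplus pages are treated as free, leaving 2 persistent free pages, so min_count = 1 + 3 - 2 = 2. The same arithmetic as a stand-alone helper, purely for illustration:

static unsigned long example_min_count(unsigned long resv, unsigned long persistent,
				       unsigned long free, unsigned long surplus)
{
	unsigned long persistent_free = free;

	/* free > persistent implies some of the free pages are surplus. */
	if (free > persistent)
		persistent_free = free > surplus ? free - surplus : 0;

	/* set_max_huge_pages() then clamps this with max(count, min_count). */
	return resv + persistent - persistent_free;
}
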
+ */ + hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false); + } } - - return 0; } static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) @@ -5441,18 +5473,16 @@ const struct vm_operations_struct hugetlb_vm_ops = { .pagesize = hugetlb_vm_op_pagesize, }; -static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, +static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio, bool try_mkwrite) { - pte_t entry; + pte_t entry = folio_mk_pte(folio, vma->vm_page_prot); unsigned int shift = huge_page_shift(hstate_vma(vma)); if (try_mkwrite && (vma->vm_flags & VM_WRITE)) { - entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, - vma->vm_page_prot))); + entry = pte_mkwrite_novma(pte_mkdirty(entry)); } else { - entry = huge_pte_wrprotect(mk_huge_pte(page, - vma->vm_page_prot)); + entry = pte_wrprotect(entry); } entry = pte_mkyoung(entry); entry = arch_make_huge_pte(entry, shift, vma->vm_flags); @@ -5507,7 +5537,7 @@ static void hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, struct folio *new_folio, pte_t old, unsigned long sz) { - pte_t newpte = make_huge_pte(vma, &new_folio->page, true); + pte_t newpte = make_huge_pte(vma, new_folio, true); __folio_mark_uptodate(new_folio); hugetlb_add_new_anon_rmap(new_folio, vma, addr); @@ -5811,14 +5841,14 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, - struct page *ref_page, zap_flags_t zap_flags) + struct folio *folio, zap_flags_t zap_flags) { struct mm_struct *mm = vma->vm_mm; + const bool folio_provided = !!folio; unsigned long address; pte_t *ptep; pte_t pte; spinlock_t *ptl; - struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); bool adjust_reservation = false; @@ -5882,14 +5912,13 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, continue; } - page = pte_page(pte); /* - * If a reference page is supplied, it is because a specific - * page is being unmapped, not a range. Ensure the page we - * are about to unmap is the actual page of interest. + * If a folio is supplied, it is because a specific + * folio is being unmapped, not a range. Ensure the folio we + * are about to unmap is the actual folio of interest. */ - if (ref_page) { - if (page != ref_page) { + if (folio_provided) { + if (folio != page_folio(pte_page(pte))) { spin_unlock(ptl); continue; } @@ -5899,12 +5928,14 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * looking like data was lost */ set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); + } else { + folio = page_folio(pte_page(pte)); } pte = huge_ptep_get_and_clear(mm, address, ptep, sz); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) - set_page_dirty(page); + folio_mark_dirty(folio); /* Leave a uffd-wp pte marker if needed */ if (huge_pte_uffd_wp(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) @@ -5912,7 +5943,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, make_pte_marker(PTE_MARKER_UFFD_WP), sz); hugetlb_count_sub(pages_per_huge_page(h), mm); - hugetlb_remove_rmap(page_folio(page)); + hugetlb_remove_rmap(folio); /* * Restore the reservation for anonymous page, otherwise the @@ -5921,8 +5952,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * reservation bit. 
*/ if (!h->surplus_huge_pages && __vma_private_lock(vma) && - folio_test_anon(page_folio(page))) { - folio_set_hugetlb_restore_reserve(page_folio(page)); + folio_test_anon(folio)) { + folio_set_hugetlb_restore_reserve(folio); /* Reservation to be adjusted after the spin lock */ adjust_reservation = true; } @@ -5946,16 +5977,17 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * count will not be incremented by free_huge_folio. * Act as if we consumed the reservation. */ - folio_clear_hugetlb_restore_reserve(page_folio(page)); + folio_clear_hugetlb_restore_reserve(folio); else if (rc) vma_add_reservation(h, vma, address); } - tlb_remove_page_size(tlb, page, huge_page_size(h)); + tlb_remove_page_size(tlb, folio_page(folio, 0), + folio_size(folio)); /* - * Bail out after unmapping reference page if supplied + * If we were instructed to unmap a specific folio, we're done. */ - if (ref_page) + if (folio_provided) break; } tlb_end_vma(tlb, vma); @@ -6017,7 +6049,7 @@ void __hugetlb_zap_end(struct vm_area_struct *vma, } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct page *ref_page, + unsigned long end, struct folio *folio, zap_flags_t zap_flags) { struct mmu_notifier_range range; @@ -6029,7 +6061,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, mmu_notifier_invalidate_range_start(&range); tlb_gather_mmu(&tlb, vma->vm_mm); - __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags); + __unmap_hugepage_range(&tlb, vma, start, end, + folio, zap_flags); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); @@ -6042,7 +6075,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, * same region. */ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, - struct page *page, unsigned long address) + struct folio *folio, unsigned long address) { struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; @@ -6086,7 +6119,8 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) unmap_hugepage_range(iter_vma, address, - address + huge_page_size(h), page, 0); + address + huge_page_size(h), + folio, 0); } i_mmap_unlock_write(mapping); } @@ -6097,8 +6131,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. */ -static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, - struct vm_fault *vmf) +static vm_fault_t hugetlb_wp(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; @@ -6160,16 +6193,17 @@ retry_avoidcopy: PageAnonExclusive(&old_folio->page), &old_folio->page); /* - * If the process that created a MAP_PRIVATE mapping is about to - * perform a COW due to a shared page count, attempt to satisfy - * the allocation without using the existing reserves. The pagecache - * page is used to determine if the reserve at this address was - * consumed or not. If reserves were used, a partial faulted mapping - * at the time of fork() could consume its reserves on COW instead - * of the full address range. + * If the process that created a MAP_PRIVATE mapping is about to perform + * a COW due to a shared page count, attempt to satisfy the allocation + * without using the existing reserves. 
+ * In order to determine where this is a COW on a MAP_PRIVATE mapping it + * is enough to check whether the old_folio is anonymous. This means that + * the reserve for this address was consumed. If reserves were used, a + * partial faulted mapping at the fime of fork() could consume its reserves + * on COW instead of the full address range. */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && - old_folio != pagecache_folio) + folio_test_anon(old_folio)) cow_from_owner = true; folio_get(old_folio); @@ -6209,8 +6243,7 @@ retry_avoidcopy: hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - unmap_ref_private(mm, vma, &old_folio->page, - vmf->address); + unmap_ref_private(mm, vma, old_folio, vmf->address); mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(vma); @@ -6257,7 +6290,7 @@ retry_avoidcopy: spin_lock(vmf->ptl); vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) { - pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare); + pte_t newpte = make_huge_pte(vma, new_folio, !unshare); /* Break COW or unshare */ huge_ptep_clear_flush(vma, vmf->address, vmf->pte); @@ -6373,16 +6406,16 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned static vm_fault_t hugetlb_no_page(struct address_space *mapping, struct vm_fault *vmf) { + u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); + bool new_folio, new_anon_folio = false; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; struct hstate *h = hstate_vma(vma); vm_fault_t ret = VM_FAULT_SIGBUS; - int anon_rmap = 0; - unsigned long size; + bool folio_locked = true; struct folio *folio; + unsigned long size; pte_t new_pte; - bool new_folio, new_pagecache_folio = false; - u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); /* * Currently, we are forced to kill the process in the event the @@ -6481,10 +6514,9 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, ret = VM_FAULT_SIGBUS; goto out; } - new_pagecache_folio = true; } else { + new_anon_folio = true; folio_lock(folio); - anon_rmap = 1; } } else { /* @@ -6533,11 +6565,11 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, if (!pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), vmf->orig_pte)) goto backout; - if (anon_rmap) + if (new_anon_folio) hugetlb_add_new_anon_rmap(folio, vma, vmf->address); else hugetlb_add_file_rmap(folio); - new_pte = make_huge_pte(vma, &folio->page, vma->vm_flags & VM_SHARED); + new_pte = make_huge_pte(vma, folio, vma->vm_flags & VM_SHARED); /* * If this pte was previously wr-protected, keep it wr-protected even * if populated. @@ -6548,8 +6580,16 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, hugetlb_count_add(pages_per_huge_page(h), mm); if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + /* + * No need to keep file folios locked. See comment in + * hugetlb_fault(). 
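
The reworked cow_from_owner test in hugetlb_wp() above replaces the old pagecache-folio comparison: on a MAP_PRIVATE mapping, the reserve for this address has already been consumed exactly when the folio being copied from is anonymous. Restated as a predicate (illustrative name):

static bool cow_from_owner_test(struct vm_area_struct *vma,
				struct folio *old_folio)
{
	return is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
	       folio_test_anon(old_folio);
}
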
+ */ + if (!new_anon_folio) { + folio_locked = false; + folio_unlock(folio); + } /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(folio, vmf); + ret = hugetlb_wp(vmf); } spin_unlock(vmf->ptl); @@ -6562,7 +6602,8 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, if (new_folio) folio_set_hugetlb_migratable(folio); - folio_unlock(folio); + if (folio_locked) + folio_unlock(folio); out: hugetlb_vma_unlock_read(vma); @@ -6579,7 +6620,8 @@ out: backout: spin_unlock(vmf->ptl); backout_unlocked: - if (new_folio && !new_pagecache_folio) + /* We only need to restore reservations for private mappings */ + if (new_anon_folio) restore_reserve_on_error(h, vma, vmf->address, folio); folio_unlock(folio); @@ -6617,10 +6659,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vm_fault_t ret; u32 hash; struct folio *folio = NULL; - struct folio *pagecache_folio = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; - int need_wait_lock = 0; + bool need_wait_lock = false; struct vm_fault vmf = { .vma = vma, .address = address & huge_page_mask(h), @@ -6686,15 +6727,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = 0; - /* - * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this - * point, so this check prevents the kernel from going below assuming - * that we have an active hugepage in pagecache. This goto expects - * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned) - * check will properly handle it. - */ + /* Not present, either a migration or a hwpoisoned entry */ if (!pte_present(vmf.orig_pte)) { - if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) { + if (is_hugetlb_entry_migration(vmf.orig_pte)) { /* * Release the hugetlb fault lock now, but retain * the vma lock, because it is needed to guard the @@ -6705,7 +6740,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, mutex_unlock(&hugetlb_fault_mutex_table[hash]); migration_entry_wait_huge(vma, vmf.address, vmf.pte); return 0; - } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte))) + } else if (is_hugetlb_entry_hwpoisoned(vmf.orig_pte)) ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; @@ -6715,8 +6750,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * If we are going to COW/unshare the mapping later, we examine the * pending reservations for this page now. This will ensure that any * allocations necessary to record that reservation occur outside the - * spinlock. Also lookup the pagecache page now as it is used to - * determine if a reservation has been consumed. + * spinlock. 
*/ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) { @@ -6726,11 +6760,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, vmf.address); - - pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, - vmf.pgoff); - if (IS_ERR(pagecache_folio)) - pagecache_folio = NULL; } vmf.ptl = huge_pte_lock(h, mm, vmf.pte); @@ -6744,10 +6773,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) { if (!userfaultfd_wp_async(vma)) { spin_unlock(vmf.ptl); - if (pagecache_folio) { - folio_unlock(pagecache_folio); - folio_put(pagecache_folio); - } hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return handle_userfault(&vmf, VM_UFFD_WP); @@ -6759,24 +6784,24 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Fallthrough to CoW */ } - /* - * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and - * pagecache_folio, so here we need take the former one - * when folio != pagecache_folio or !pagecache_folio. - */ - folio = page_folio(pte_page(vmf.orig_pte)); - if (folio != pagecache_folio) - if (!folio_trylock(folio)) { - need_wait_lock = 1; - goto out_ptl; - } - - folio_get(folio); - if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(vmf.orig_pte)) { - ret = hugetlb_wp(pagecache_folio, &vmf); - goto out_put_page; + /* + * Anonymous folios need to be lock since hugetlb_wp() + * checks whether we can re-use the folio exclusively + * for us in case we are the only user of it. + */ + folio = page_folio(pte_page(vmf.orig_pte)); + if (folio_test_anon(folio) && !folio_trylock(folio)) { + need_wait_lock = true; + goto out_ptl; + } + folio_get(folio); + ret = hugetlb_wp(&vmf); + if (folio_test_anon(folio)) + folio_unlock(folio); + folio_put(folio); + goto out_ptl; } else if (likely(flags & FAULT_FLAG_WRITE)) { vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte); } @@ -6785,17 +6810,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, vmf.address, vmf.pte); -out_put_page: - if (folio != pagecache_folio) - folio_unlock(folio); - folio_put(folio); out_ptl: spin_unlock(vmf.ptl); - - if (pagecache_folio) { - folio_unlock(pagecache_folio); - folio_put(pagecache_folio); - } out_mutex: hugetlb_vma_unlock_read(vma); @@ -6808,11 +6824,16 @@ out_mutex: mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* - * Generally it's safe to hold refcount during waiting page lock. But - * here we just wait to defer the next page fault to avoid busy loop and - * the page is not used after unlocked before returning from the current - * page fault. So we are safe from accessing freed page, even if we wait - * here without taking refcount. + * hugetlb_wp drops all the locks, but the folio lock, before trying to + * unmap the folio from other processes. During that window, if another + * process mapping that folio faults in, it will take the mutex and then + * it will wait on folio_lock, causing an ABBA deadlock. + * Use trylock instead and bail out if we fail. 
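
The trylock handling above now applies only to anonymous folios, and the failure path defers rather than spins: drop the locks, wait for the folio lock to clear without holding a reference, and let the fault retry. The shape of it, stripped of the hugetlb specifics (illustrative helper):

static bool lock_folio_or_defer(struct folio *folio, bool *need_wait_lock)
{
	if (folio_trylock(folio))
		return true;	/* proceed to hugetlb_wp() */

	/* Caller unwinds its locks, then does folio_wait_locked(folio). */
	*need_wait_lock = true;
	return false;
}
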
+ * + * Ideally, we should hold a refcount on the folio we wait for, but we do + * not want to use the folio after it becomes unlocked, only to wait for + * it to become unlocked, so hopefully the next fault succeeds on + * the trylock. */ if (need_wait_lock) folio_wait_locked(folio);
@@ -7022,7 +7043,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY * with wp flag set, don't set pte write bit. */ - _dst_pte = make_huge_pte(dst_vma, &folio->page, + _dst_pte = make_huge_pte(dst_vma, folio, !wp_enabled && !(is_continue && !vm_shared)); /* * Always mark UFFDIO_COPY page dirty; note that this may not be
@@ -7132,11 +7153,11 @@ long hugetlb_change_protection(struct vm_area_struct *vma, /* Nothing to do. */ } else if (unlikely(is_hugetlb_entry_migration(pte))) { swp_entry_t entry = pte_to_swp_entry(pte); - struct page *page = pfn_swap_entry_to_page(entry); + struct folio *folio = pfn_swap_entry_folio(entry); pte_t newpte = pte; if (is_writable_migration_entry(entry)) { - if (PageAnon(page)) + if (folio_test_anon(folio)) entry = make_readable_exclusive_migration_entry( swp_offset(entry)); else
@@ -7210,13 +7231,20 @@ long hugetlb_change_protection(struct vm_area_struct *vma, return pages > 0 ? (pages << h->order) : pages; } -/* Return true if reservation was successful, false otherwise. */ -bool hugetlb_reserve_pages(struct inode *inode, +/* + * Update the reservation map for the range [from, to]. + * + * Returns the number of entries that would be added to the reservation map + * associated with the range [from, to]. This number is greater than or + * equal to zero. -EINVAL or -ENOMEM is returned in case of any errors. + */ + +long hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags) { - long chg = -1, add = -1; + long chg = -1, add = -1, spool_resv, gbl_resv; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map;
@@ -7226,7 +7254,7 @@ bool hugetlb_reserve_pages(struct inode *inode, /* This should never happen */ if (from > to) { VM_WARN(1, "%s called with a negative range\n", __func__); - return false; + return -EINVAL; } /*
@@ -7241,7 +7269,7 @@ bool hugetlb_reserve_pages(struct inode *inode, * without using reserves */ if (vm_flags & VM_NORESERVE) - return true; + return 0; /* * Shared mappings base their reservation on the number of pages that
@@ -7348,11 +7376,19 @@ bool hugetlb_reserve_pages(struct inode *inode, hugetlb_cgroup_put_rsvd_cgroup(h_cg); } } - return true; + return chg; out_put_pages: - /* put back original number of pages, chg */ - (void)hugepage_subpool_put_pages(spool, chg); + spool_resv = chg - gbl_reserve; + if (spool_resv) { + /* put sub pool's reservation back, chg - gbl_reserve */ + gbl_resv = hugepage_subpool_put_pages(spool, spool_resv); + /* + * subpool's reserved pages cannot be put back due to a race, + * return to hstate. + */ + hugetlb_acct_memory(h, -gbl_resv); + } out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg);
@@ -7368,7 +7404,7 @@ out_err: kref_put(&resv_map->refs, resv_map_release); set_vma_resv_map(vma, NULL); } - return false; + return chg < 0 ? chg : add < 0 ?
add : -EINVAL; } long hugetlb_unreserve_pages(struct inode *inode, long start, long end, @@ -7423,8 +7459,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, unsigned long s_end = sbase + PUD_SIZE; /* Allow segments to share if only one is marked locked */ - unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; - unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; + vm_flags_t vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; + vm_flags_t svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; /* * match the virtual addresses, permission and the alignment of the @@ -7567,6 +7603,13 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, return 0; pud_clear(pud); + /* + * Once our caller drops the rmap lock, some other process might be + * using this page table as a normal, non-hugetlb page table. + * Wait for pending gup_fast() in other threads to finish before letting + * that happen. + */ + tlb_remove_table_sync_one(); ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep)); mm_dec_nr_pmds(mm); return 1; @@ -7792,7 +7835,7 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re struct hstate *h = folio_hstate(old_folio); hugetlb_cgroup_migrate(old_folio, new_folio); - set_page_owner_migrate_reason(&new_folio->page, reason); + folio_set_owner_migrate_reason(new_folio, reason); /* * transfer temporary state of the new hugetlb folio. This is @@ -7837,9 +7880,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re spin_unlock_irq(&hugetlb_lock); } +/* + * If @take_locks is false, the caller must ensure that no concurrent page table + * access can happen (except for gup_fast() and hardware page walks). + * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like + * concurrent page fault handling) and the file rmap lock. + */ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, - unsigned long end) + unsigned long end, + bool take_locks) { struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); @@ -7863,8 +7913,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start, end); mmu_notifier_invalidate_range_start(&range); - hugetlb_vma_lock_write(vma); - i_mmap_lock_write(vma->vm_file->f_mapping); + if (take_locks) { + hugetlb_vma_lock_write(vma); + i_mmap_lock_write(vma->vm_file->f_mapping); + } else { + i_mmap_assert_write_locked(vma->vm_file->f_mapping); + } for (address = start; address < end; address += PUD_SIZE) { ptep = hugetlb_walk(vma, address, sz); if (!ptep) @@ -7874,8 +7928,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, spin_unlock(ptl); } flush_hugetlb_tlb_range(vma, start, end); - i_mmap_unlock_write(vma->vm_file->f_mapping); - hugetlb_vma_unlock_write(vma); + if (take_locks) { + i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); + } /* * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see * Documentation/mm/mmu_notifier.rst. @@ -7890,5 +7946,20 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE), - ALIGN_DOWN(vma->vm_end, PUD_SIZE)); + ALIGN_DOWN(vma->vm_end, PUD_SIZE), + /* take_locks = */ true); +} + +/* + * For hugetlb, mremap() is an odd edge case - while the VMA copying is + * performed, we permit both the old and new VMAs to reference the same + * reservation. 
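The new return convention for hugetlb_reserve_pages() documented above can be consumed as in the following minimal sketch; example_reserve_range() is a hypothetical caller, not part of the patch:

static long example_reserve_range(struct inode *inode, long from, long to,
				  struct vm_area_struct *vma, vm_flags_t vm_flags)
{
	long chg = hugetlb_reserve_pages(inode, from, to, vma, vm_flags);

	if (chg < 0)
		return chg;	/* -EINVAL or -ENOMEM */

	/* chg >= 0: entries that would be added to the reservation map. */
	return 0;
}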
+ * + * We fix this up after the operation succeeds, or if a newly allocated VMA + * is closed as a result of a failure to allocate memory. + */ +void fixup_hugetlb_reservations(struct vm_area_struct *vma) +{ + if (is_vm_hugetlb_page(vma)) + clear_vma_resv_huge_pages(vma); } diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index e0f2d5c3a84c..f58ef4969e7a 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -66,7 +66,7 @@ hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, bool node_exact) if (node_exact) return NULL; - for_each_online_node(node) { + for_each_node_mask(node, hugetlb_bootmem_nodes) { cma = hugetlb_cma[node]; if (!cma || node == *nid) continue; @@ -153,11 +153,13 @@ void __init hugetlb_cma_reserve(int order) if (!hugetlb_cma_size) return; + hugetlb_bootmem_set_nodes(); + for (nid = 0; nid < MAX_NUMNODES; nid++) { if (hugetlb_cma_size_in_node[nid] == 0) continue; - if (!node_online(nid)) { + if (!node_isset(nid, hugetlb_bootmem_nodes)) { pr_warn("hugetlb_cma: invalid node %d specified\n", nid); hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; hugetlb_cma_size_in_node[nid] = 0; @@ -190,13 +192,14 @@ void __init hugetlb_cma_reserve(int order) * If 3 GB area is requested on a machine with 4 numa nodes, * let's allocate 1 GB on first three nodes and ignore the last one. */ - per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes); + per_node = DIV_ROUND_UP(hugetlb_cma_size, + nodes_weight(hugetlb_bootmem_nodes)); pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", hugetlb_cma_size / SZ_1M, per_node / SZ_1M); } reserved = 0; - for_each_online_node(nid) { + for_each_node_mask(nid, hugetlb_bootmem_nodes) { int res; char name[CMA_MAX_NAME]; diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 9a99dfa3c495..ba0fb1b6a5a8 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -166,7 +166,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end, VM_BUG_ON(!PAGE_ALIGNED(start | end)); mmap_read_lock(&init_mm); - ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops, + ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops, NULL, walk); mmap_read_unlock(&init_mm); if (ret) @@ -238,11 +238,11 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, * struct page, the special metadata (e.g. page->flags or page->mapping) * cannot copy to the tail struct page structs. The invalid value will be * checked in the free_tail_page_prepare(). In order to avoid the message - * of "corrupted mapping in tail page". We need to reset at least 3 (one - * head struct page struct and two tail struct page structs) struct page + * of "corrupted mapping in tail page". We need to reset at least 4 (one + * head struct page struct and three tail struct page structs) struct page * structs. 
*/ -#define NR_RESET_STRUCT_PAGE 3 +#define NR_RESET_STRUCT_PAGE 4 static inline void reset_struct_pages(struct page *start) { diff --git a/mm/internal.h b/mm/internal.h index 50c2f590b2d0..1da16d550a45 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -149,7 +149,7 @@ static inline void *folio_raw_mapping(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; - return (void *)(mapping & ~PAGE_MAPPING_FLAGS); + return (void *)(mapping & ~FOLIO_MAPPING_FLAGS); } /* @@ -164,7 +164,7 @@ static inline void *folio_raw_mapping(const struct folio *folio) */ static inline int mmap_file(struct file *file, struct vm_area_struct *vma) { - int err = call_mmap(file, vma); + int err = vfs_mmap(file, vma); if (likely(!err)) return 0; @@ -202,109 +202,126 @@ static inline void vma_close(struct vm_area_struct *vma) /* Flags for folio_pte_batch(). */ typedef int __bitwise fpb_t; -/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */ -#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0)) +/* Compare PTEs respecting the dirty bit. */ +#define FPB_RESPECT_DIRTY ((__force fpb_t)BIT(0)) -/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */ -#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1)) +/* Compare PTEs respecting the soft-dirty bit. */ +#define FPB_RESPECT_SOFT_DIRTY ((__force fpb_t)BIT(1)) + +/* Compare PTEs respecting the writable bit. */ +#define FPB_RESPECT_WRITE ((__force fpb_t)BIT(2)) + +/* + * Merge PTE write bits: if any PTE in the batch is writable, modify the + * PTE at @ptentp to be writable. + */ +#define FPB_MERGE_WRITE ((__force fpb_t)BIT(3)) + +/* + * Merge PTE young and dirty bits: if any PTE in the batch is young or dirty, + * modify the PTE at @ptentp to be young or dirty, respectively. + */ +#define FPB_MERGE_YOUNG_DIRTY ((__force fpb_t)BIT(4)) static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) { - if (flags & FPB_IGNORE_DIRTY) + if (!(flags & FPB_RESPECT_DIRTY)) pte = pte_mkclean(pte); - if (likely(flags & FPB_IGNORE_SOFT_DIRTY)) + if (likely(!(flags & FPB_RESPECT_SOFT_DIRTY))) pte = pte_clear_soft_dirty(pte); - return pte_wrprotect(pte_mkold(pte)); + if (likely(!(flags & FPB_RESPECT_WRITE))) + pte = pte_wrprotect(pte); + return pte_mkold(pte); } /** - * folio_pte_batch - detect a PTE batch for a large folio + * folio_pte_batch_flags - detect a PTE batch for a large folio * @folio: The large folio to detect a PTE batch for. - * @addr: The user virtual address the first page is mapped at. - * @start_ptep: Page table pointer for the first entry. - * @pte: Page table entry for the first page. + * @vma: The VMA. Only relevant with FPB_MERGE_WRITE, otherwise can be NULL. + * @ptep: Page table pointer for the first entry. + * @ptentp: Pointer to a COPY of the first page table entry whose flags this + * function updates based on @flags if appropriate. * @max_nr: The maximum number of table entries to consider. * @flags: Flags to modify the PTE batch semantics. - * @any_writable: Optional pointer to indicate whether any entry except the - * first one is writable. - * @any_young: Optional pointer to indicate whether any entry except the - * first one is young. - * @any_dirty: Optional pointer to indicate whether any entry except the - * first one is dirty. * * Detect a PTE batch: consecutive (present) PTEs that map consecutive - * pages of the same large folio. + * pages of the same large folio in a single VMA and a single page table. 
* * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, - * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and - * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY). + * the accessed bit, writable bit, dirty bit (unless FPB_RESPECT_DIRTY is set) + * and soft-dirty bit (unless FPB_RESPECT_SOFT_DIRTY is set). + * + * @ptep must map any page of the folio. max_nr must be at least one and + * must be limited by the caller so scanning cannot exceed a single VMA and + * a single page table. + * + * Depending on the FPB_MERGE_* flags, the pte stored at @ptentp will + * be updated: it's crucial that a pointer to a COPY of the first + * page table entry, obtained through ptep_get(), is provided as @ptentp. * - * start_ptep must map any page of the folio. max_nr must be at least one and - * must be limited by the caller so scanning cannot exceed a single page table. + * This function will be inlined to optimize based on the input parameters; + * consider using folio_pte_batch() instead if applicable. * * Return: the number of table entries in the batch. */ -static inline int folio_pte_batch(struct folio *folio, unsigned long addr, - pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, - bool *any_writable, bool *any_young, bool *any_dirty) +static inline unsigned int folio_pte_batch_flags(struct folio *folio, + struct vm_area_struct *vma, pte_t *ptep, pte_t *ptentp, + unsigned int max_nr, fpb_t flags) { - unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); - const pte_t *end_ptep = start_ptep + max_nr; - pte_t expected_pte, *ptep; - bool writable, young, dirty; - int nr; - - if (any_writable) - *any_writable = false; - if (any_young) - *any_young = false; - if (any_dirty) - *any_dirty = false; + bool any_writable = false, any_young = false, any_dirty = false; + pte_t expected_pte, pte = *ptentp; + unsigned int nr, cur_nr; VM_WARN_ON_FOLIO(!pte_present(pte), folio); VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio); + /* + * Ensure this is a pointer to a copy not a pointer into a page table. + * If this is a stack value, it won't be a valid virtual address, but + * that's fine because it also cannot be pointing into the page table. + */ + VM_WARN_ON(virt_addr_valid(ptentp) && PageTable(virt_to_page(ptentp))); - nr = pte_batch_hint(start_ptep, pte); + /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */ + max_nr = min_t(unsigned long, max_nr, + folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte)); + + nr = pte_batch_hint(ptep, pte); expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); - ptep = start_ptep + nr; + ptep = ptep + nr; - while (ptep < end_ptep) { + while (nr < max_nr) { pte = ptep_get(ptep); - if (any_writable) - writable = !!pte_write(pte); - if (any_young) - young = !!pte_young(pte); - if (any_dirty) - dirty = !!pte_dirty(pte); - pte = __pte_batch_clear_ignored(pte, flags); - if (!pte_same(pte, expected_pte)) - break; - - /* - * Stop immediately once we reached the end of the folio. In - * corner cases the next PFN might fall into a different - * folio. 
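A minimal usage sketch for folio_pte_batch_flags() as documented above, assuming ptep and max_nr come from the caller's walk of a single VMA and a single page table; example_batch_step() is hypothetical, not part of the patch:

static unsigned int example_batch_step(struct folio *folio,
				       struct vm_area_struct *vma,
				       pte_t *ptep, unsigned int max_nr)
{
	/* Must be a COPY of the first entry, not a pointer into the table. */
	pte_t ptent = ptep_get(ptep);

	/* Fold the young/dirty bits of the whole batch into the local copy. */
	return folio_pte_batch_flags(folio, vma, ptep, &ptent, max_nr,
				     FPB_MERGE_YOUNG_DIRTY);
}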
- */ - if (pte_pfn(pte) >= folio_end_pfn) + if (!pte_same(__pte_batch_clear_ignored(pte, flags), expected_pte)) break; - if (any_writable) - *any_writable |= writable; - if (any_young) - *any_young |= young; - if (any_dirty) - *any_dirty |= dirty; - - nr = pte_batch_hint(ptep, pte); - expected_pte = pte_advance_pfn(expected_pte, nr); - ptep += nr; + if (flags & FPB_MERGE_WRITE) + any_writable |= pte_write(pte); + if (flags & FPB_MERGE_YOUNG_DIRTY) { + any_young |= pte_young(pte); + any_dirty |= pte_dirty(pte); + } + + cur_nr = pte_batch_hint(ptep, pte); + expected_pte = pte_advance_pfn(expected_pte, cur_nr); + ptep += cur_nr; + nr += cur_nr; } - return min(ptep - start_ptep, max_nr); + if (any_writable) + *ptentp = pte_mkwrite(*ptentp, vma); + if (any_young) + *ptentp = pte_mkyoung(*ptentp); + if (any_dirty) + *ptentp = pte_mkdirty(*ptentp); + + return min(nr, max_nr); } +unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, + unsigned int max_nr); + /** * pte_move_swp_offset - Move the swap entry offset field of a swap pte * forward or backward by delta @@ -435,11 +452,13 @@ void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details); +void zap_page_range_single_batched(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, + unsigned long size, struct zap_details *details); int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, gfp_t gfp); -void page_cache_ra_order(struct readahead_control *, struct file_ra_state *, - unsigned int order); +void page_cache_ra_order(struct readahead_control *, struct file_ra_state *); void force_page_cache_ra(struct readahead_control *, unsigned long nr); static inline void force_page_cache_readahead(struct address_space *mapping, struct file *file, pgoff_t index, unsigned long nr_to_read) @@ -519,6 +538,16 @@ extern unsigned long highest_memmap_pfn; bool folio_isolate_lru(struct folio *folio); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); +#ifdef CONFIG_NUMA +int user_proactive_reclaim(char *buf, + struct mem_cgroup *memcg, pg_data_t *pgdat); +#else +static inline int user_proactive_reclaim(char *buf, + struct mem_cgroup *memcg, pg_data_t *pgdat) +{ + return 0; +} +#endif /* * in mm/rmap.c: @@ -823,7 +852,8 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, int nid, bool exact_nid); void memmap_init_range(unsigned long, int, unsigned long, unsigned long, - unsigned long, enum meminit_context, struct vmem_altmap *, int); + unsigned long, enum meminit_context, struct vmem_altmap *, int, + bool); #if defined CONFIG_COMPACTION || defined CONFIG_CMA @@ -915,7 +945,7 @@ static inline void init_cma_pageblock(struct page *page) int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool claim_only, bool *claim_block); + int migratetype, bool claimable); static inline bool free_area_empty(struct free_area *area, int migratetype) { @@ -931,7 +961,7 @@ extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool write, int *locked); -extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, +extern bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags, unsigned long bytes); /* @@ -1121,6 +1151,8 @@ 
DECLARE_STATIC_KEY_TRUE(deferred_pages); bool __init deferred_grow_zone(struct zone *zone, unsigned int order); #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ +void init_deferred_page(unsigned long pfn, int nid); + enum mminit_level { MMINIT_WARNING, MMINIT_VERIFY, @@ -1227,7 +1259,6 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); -struct folio *alloc_migrate_folio(struct folio *src, unsigned long private); unsigned long reclaim_pages(struct list_head *folio_list); unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list); @@ -1360,7 +1391,7 @@ int migrate_device_coherent_folio(struct folio *folio); struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, - unsigned long flags, unsigned long start, + vm_flags_t vm_flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller); @@ -1605,6 +1636,9 @@ static inline void accept_page(struct page *page) int walk_page_range_mm(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); +int walk_page_range_debug(struct mm_struct *mm, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + pgd_t *pgd, void *private); /* pt_reclaim.c */ bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval); @@ -1624,5 +1658,7 @@ static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, } #endif /* CONFIG_PT_RECLAIM */ +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm); +int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); #endif /* __MM_INTERNAL_H */ diff --git a/mm/io-mapping.c b/mm/io-mapping.c index 01b362799930..d3586e95c12c 100644 --- a/mm/io-mapping.c +++ b/mm/io-mapping.c @@ -21,9 +21,10 @@ int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags)) return -EINVAL; - /* We rely on prevalidation of the io-mapping to skip track_pfn(). */ - return remap_pfn_range_notrack(vma, addr, pfn, size, - __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) | - (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK))); + pgprot_t remap_prot = __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) | + (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)); + + /* We rely on prevalidation of the io-mapping to skip pfnmap tracking. 
*/ + return remap_pfn_range_notrack(vma, addr, pfn, size, remap_prot); } EXPORT_SYMBOL_GPL(io_mapping_map_user); diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 1a958e7c8a46..dd93ae8a6beb 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -35,7 +35,7 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) -CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) $(call cc-disable-warning, vla) +CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX # If compiler instruments memintrinsics by prefixing them with __asan/__hwasan, # we need to treat them normally (as builtins), otherwise the compiler won't @@ -44,6 +44,7 @@ ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX CFLAGS_KASAN_TEST += -fno-builtin endif +CFLAGS_REMOVE_kasan_test_c.o += $(call cc-option, -Wvla-larger-than=1) CFLAGS_kasan_test_c.o := $(CFLAGS_KASAN_TEST) RUSTFLAGS_kasan_test_rust.o := $(RUSTFLAGS_KASAN) diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index 59d673400085..2aa12dfa427a 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -1073,14 +1073,11 @@ static void kmem_cache_rcu_uaf(struct kunit *test) kmem_cache_destroy(cache); } -static void empty_cache_ctor(void *object) { } - static void kmem_cache_double_destroy(struct kunit *test) { struct kmem_cache *cache; - /* Provide a constructor to prevent cache merging. */ - cache = kmem_cache_create("test_cache", 200, 0, 0, empty_cache_ctor); + cache = kmem_cache_create("test_cache", 200, 0, SLAB_NO_MERGE, NULL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); kmem_cache_destroy(cache); KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_destroy(cache)); @@ -1570,6 +1567,7 @@ static void kasan_memcmp(struct kunit *test) static void kasan_strings(struct kunit *test) { char *ptr; + char *src; size_t size = 24; /* @@ -1581,6 +1579,25 @@ static void kasan_strings(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + src = kmalloc(KASAN_GRANULE_SIZE, GFP_KERNEL | __GFP_ZERO); + strscpy(src, "f0cacc1a0000000", KASAN_GRANULE_SIZE); + + /* + * Make sure that strscpy() does not trigger KASAN if it overreads into + * poisoned memory. + * + * The expected size does not include the terminator '\0' + * so it is (KASAN_GRANULE_SIZE - 2) == + * KASAN_GRANULE_SIZE - ("initial removed character" + "\0"). + */ + KUNIT_EXPECT_EQ(test, KASAN_GRANULE_SIZE - 2, + strscpy(ptr, src + 1, KASAN_GRANULE_SIZE)); + + /* strscpy should fail if the first byte is unreadable. */ + KUNIT_EXPECT_KASAN_FAIL(test, strscpy(ptr, src + KASAN_GRANULE_SIZE, + KASAN_GRANULE_SIZE)); + + kfree(src); kfree(ptr); /* @@ -1960,6 +1977,11 @@ static void rust_uaf(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, kasan_test_rust_uaf()); } +/* + * copy_to_kernel_nofault() is an internal helper available when + * kasan_test is built-in, so it must not be visible to loadable modules. 
+ */ +#ifndef MODULE static void copy_to_kernel_nofault_oob(struct kunit *test) { char *ptr; @@ -1994,6 +2016,7 @@ static void copy_to_kernel_nofault_oob(struct kunit *test) kfree(ptr); } +#endif /* !MODULE */ static void copy_user_test_oob(struct kunit *test) { @@ -2114,7 +2137,9 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(match_all_not_assigned), KUNIT_CASE(match_all_ptr_tag), KUNIT_CASE(match_all_mem_tag), +#ifndef MODULE KUNIT_CASE(copy_to_kernel_nofault_oob), +#endif KUNIT_CASE(rust_uaf), KUNIT_CASE(copy_user_test_oob), {} @@ -2130,4 +2155,5 @@ static struct kunit_suite kasan_kunit_test_suite = { kunit_test_suite(kasan_kunit_test_suite); +MODULE_DESCRIPTION("KUnit tests for checking KASAN bug-detection capabilities"); MODULE_LICENSE("GPL"); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 8357e1a33699..62c01b4527eb 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -370,36 +370,6 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } -/* - * This function is invoked with report_lock (a raw_spinlock) held. A - * PREEMPT_RT kernel cannot call find_vm_area() as it will acquire a sleeping - * rt_spinlock. - * - * For !RT kernel, the PROVE_RAW_LOCK_NESTING config option will print a - * lockdep warning for this raw_spinlock -> spinlock dependency. This config - * option is enabled by default to ensure better test coverage to expose this - * kind of RT kernel problem. This lockdep splat, however, can be suppressed - * by using DEFINE_WAIT_OVERRIDE_MAP() if it serves a useful purpose and the - * invalid PREEMPT_RT case has been taken care of. - */ -static inline struct vm_struct *kasan_find_vm_area(void *addr) -{ - static DEFINE_WAIT_OVERRIDE_MAP(vmalloc_map, LD_WAIT_SLEEP); - struct vm_struct *va; - - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - return NULL; - - /* - * Suppress lockdep warning and fetch vmalloc area of the - * offending address. 
- */ - lock_map_acquire_try(&vmalloc_map); - va = find_vm_area(addr); - lock_map_release(&vmalloc_map); - return va; -} - static void print_address_description(void *addr, u8 tag, struct kasan_report_info *info) { @@ -429,19 +399,10 @@ static void print_address_description(void *addr, u8 tag, } if (is_vmalloc_addr(addr)) { - struct vm_struct *va = kasan_find_vm_area(addr); - - if (va) { - pr_err("The buggy address belongs to the virtual mapping at\n" - " [%px, %px) created by:\n" - " %pS\n", - va->addr, va->addr + va->size, va->caller); - pr_err("\n"); - - page = vmalloc_to_page(addr); - } else { - pr_err("The buggy address %px belongs to a vmalloc virtual mapping\n", addr); - } + pr_err("The buggy address belongs to a"); + if (!vmalloc_dump_obj(addr)) + pr_cont(" vmalloc virtual mapping\n"); + page = vmalloc_to_page(addr); } if (page) { diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 88d1c9dcb507..d2c70cd2afb1 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -292,33 +292,99 @@ void __init __weak kasan_populate_early_vm_area_shadow(void *start, { } +struct vmalloc_populate_data { + unsigned long start; + struct page **pages; +}; + static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, - void *unused) + void *_data) { - unsigned long page; + struct vmalloc_populate_data *data = _data; + struct page *page; pte_t pte; + int index; if (likely(!pte_none(ptep_get(ptep)))) return 0; - page = __get_free_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - - __memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE); - pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL); + index = PFN_DOWN(addr - data->start); + page = data->pages[index]; + __memset(page_to_virt(page), KASAN_VMALLOC_INVALID, PAGE_SIZE); + pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL); spin_lock(&init_mm.page_table_lock); if (likely(pte_none(ptep_get(ptep)))) { set_pte_at(&init_mm, addr, ptep, pte); - page = 0; + data->pages[index] = NULL; } spin_unlock(&init_mm.page_table_lock); - if (page) - free_page(page); + + return 0; +} + +static void ___free_pages_bulk(struct page **pages, int nr_pages) +{ + int i; + + for (i = 0; i < nr_pages; i++) { + if (pages[i]) { + __free_pages(pages[i], 0); + pages[i] = NULL; + } + } +} + +static int ___alloc_pages_bulk(struct page **pages, int nr_pages) +{ + unsigned long nr_populated, nr_total = nr_pages; + struct page **page_array = pages; + + while (nr_pages) { + nr_populated = alloc_pages_bulk(GFP_KERNEL, nr_pages, pages); + if (!nr_populated) { + ___free_pages_bulk(page_array, nr_total - nr_pages); + return -ENOMEM; + } + pages += nr_populated; + nr_pages -= nr_populated; + } + return 0; } +static int __kasan_populate_vmalloc(unsigned long start, unsigned long end) +{ + unsigned long nr_pages, nr_total = PFN_UP(end - start); + struct vmalloc_populate_data data; + int ret = 0; + + data.pages = (struct page **)__get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!data.pages) + return -ENOMEM; + + while (nr_total) { + nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0])); + ret = ___alloc_pages_bulk(data.pages, nr_pages); + if (ret) + break; + + data.start = start; + ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE, + kasan_populate_vmalloc_pte, &data); + ___free_pages_bulk(data.pages, nr_pages); + if (ret) + break; + + start += nr_pages * PAGE_SIZE; + nr_total -= nr_pages; + } + + free_page((unsigned long)data.pages); + + return ret; +} + int kasan_populate_vmalloc(unsigned long addr, unsigned long size) { unsigned long shadow_start, shadow_end; @@ -348,9 
+414,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) shadow_start = PAGE_ALIGN_DOWN(shadow_start); shadow_end = PAGE_ALIGN(shadow_end); - ret = apply_to_page_range(&init_mm, shadow_start, - shadow_end - shadow_start, - kasan_populate_vmalloc_pte, NULL); + ret = __kasan_populate_vmalloc(shadow_start, shadow_end); if (ret) return ret; diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 102048821c22..0ed3be100963 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -605,8 +605,8 @@ static unsigned long kfence_init_pool(void) pages = virt_to_page(__kfence_pool); /* - * Set up object pages: they must have PG_slab set, to avoid freeing - * these as real pages. + * Set up object pages: they must have PGTY_slab set to avoid freeing + * them as real pages. * * We also want to avoid inserting kfence_free() in the kfree() * fast-path in SLUB, and therefore need to ensure kfree() correctly diff --git a/mm/khugepaged.c b/mm/khugepaged.c index cc945c6ab3bd..a55fb1dcd224 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -347,7 +347,7 @@ struct attribute_group khugepaged_attr_group = { #endif /* CONFIG_SYSFS */ int hugepage_madvise(struct vm_area_struct *vma, - unsigned long *vm_flags, int advice) + vm_flags_t *vm_flags, int advice) { switch (advice) { case MADV_HUGEPAGE: @@ -470,7 +470,7 @@ void __khugepaged_enter(struct mm_struct *mm) } void khugepaged_enter_vma(struct vm_area_struct *vma, - unsigned long vm_flags) + vm_flags_t vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_pmd_enabled()) { @@ -548,19 +548,6 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, } } -static bool is_refcount_suitable(struct folio *folio) -{ - int expected_refcount = folio_mapcount(folio); - - if (!folio_test_anon(folio) || folio_test_swapcache(folio)) - expected_refcount += folio_nr_pages(folio); - - if (folio_test_private(folio)) - expected_refcount++; - - return folio_ref_count(folio) == expected_refcount; -} - static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte, @@ -652,7 +639,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * but not from this process. The other process cannot write to * the page, only trigger CoW. */ - if (!is_refcount_suitable(folio)) { + if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { folio_unlock(folio); result = SCAN_PAGE_COUNT; goto out; @@ -696,13 +683,13 @@ next: result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; - trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero, + trace_mm_collapse_huge_page_isolate(folio, none_or_zero, referenced, writable, result); return result; } out: release_pte_pages(pte, _pte, compound_pagelist); - trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero, + trace_mm_collapse_huge_page_isolate(folio, none_or_zero, referenced, writable, result); return result; } @@ -746,7 +733,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, ptep_clear(vma->vm_mm, address, _pte); folio_remove_rmap_pte(src, src_page, vma); spin_unlock(ptl); - free_page_and_swap_cache(src_page); + free_folio_and_swap_cache(src); } } @@ -954,12 +941,18 @@ static inline int check_pmd_state(pmd_t *pmd) if (pmd_none(pmde)) return SCAN_PMD_NONE; + + /* + * The folio may be under migration when khugepaged is trying to + * collapse it. Migration success or failure will eventually end + * up with a present PMD mapping a folio again. 
+ */ + if (is_pmd_migration_entry(pmde)) + return SCAN_PMD_MAPPED; if (!pmd_present(pmde)) return SCAN_PMD_NULL; if (pmd_trans_huge(pmde)) return SCAN_PMD_MAPPED; - if (pmd_devmap(pmde)) - return SCAN_PMD_NULL; if (pmd_bad(pmde)) return SCAN_PMD_NULL; return SCAN_SUCCEED; @@ -1239,7 +1232,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, __folio_mark_uptodate(folio); pgtable = pmd_pgtable(_pmd); - _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot); + _pmd = folio_mk_pmd(folio, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); spin_lock(pmd_ptl); @@ -1402,7 +1395,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, * has excessive GUP pins (i.e. 512). Anyway the same check * will be done again later the risk seems low. */ - if (!is_refcount_suitable(folio)) { + if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { result = SCAN_PAGE_COUNT; goto out_unmap; } @@ -1435,7 +1428,7 @@ out_unmap: *mmap_locked = false; } out: - trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced, + trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced, none_or_zero, result, unmapped); return result; } @@ -1464,10 +1457,9 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) } } -#ifdef CONFIG_SHMEM -/* hpage must be locked, and mmap_lock must be held */ +/* folio must be locked, and mmap_lock must be held */ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmdp, struct page *hpage) + pmd_t *pmdp, struct folio *folio, struct page *page) { struct vm_fault vmf = { .vma = vma, @@ -1476,13 +1468,12 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, .pmd = pmdp, }; - VM_BUG_ON(!PageTransHuge(hpage)); mmap_assert_locked(vma->vm_mm); - if (do_set_pmd(&vmf, hpage)) + if (do_set_pmd(&vmf, folio, page)) return SCAN_FAIL; - get_page(hpage); + folio_get(folio); return SCAN_SUCCEED; } @@ -1689,7 +1680,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, maybe_install_pmd: /* step 5: install pmd entry */ result = install_pmd - ? set_huge_pmd(vma, haddr, pmd, &folio->page) + ? set_huge_pmd(vma, haddr, pmd, folio, &folio->page) : SCAN_SUCCEED; goto drop_folio; abort: @@ -2295,6 +2286,17 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, continue; } + if (!folio_try_get(folio)) { + xas_reset(&xas); + continue; + } + + if (unlikely(folio != xas_reload(&xas))) { + folio_put(folio); + xas_reset(&xas); + continue; + } + if (folio_order(folio) == HPAGE_PMD_ORDER && folio->index == start) { /* Maybe PMD-mapped */ @@ -2305,23 +2307,27 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, * it's safe to skip LRU and refcount checks before * returning. 
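The refcount test these scans rely on boils down to the sketch below: the expected count, plus however many references the caller itself holds (one in the file-scan path further down, after folio_try_get()), must equal the actual count; example_has_extra_refs() is a hypothetical helper:

static bool example_has_extra_refs(struct folio *folio, int refs_held_by_caller)
{
	return folio_expected_ref_count(folio) + refs_held_by_caller !=
	       folio_ref_count(folio);
}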
*/ + folio_put(folio); break; } node = folio_nid(folio); if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; + folio_put(folio); break; } cc->node_load[node]++; if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; + folio_put(folio); break; } - if (!is_refcount_suitable(folio)) { + if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) { result = SCAN_PAGE_COUNT; + folio_put(folio); break; } @@ -2333,6 +2339,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, */ present += folio_nr_pages(folio); + folio_put(folio); if (need_resched()) { xas_pause(&xas); @@ -2354,14 +2361,6 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result); return result; } -#else -static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, - struct file *file, pgoff_t start, - struct collapse_control *cc) -{ - BUILD_BUG(); -} -#endif static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, struct collapse_control *cc) @@ -2437,7 +2436,7 @@ skip: VM_BUG_ON(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > hend); - if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) { + if (!vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, khugepaged_scan.address); @@ -2736,8 +2735,8 @@ static int madvise_collapse_errno(enum scan_result r) } } -int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end) +int madvise_collapse(struct vm_area_struct *vma, unsigned long start, + unsigned long end, bool *lock_dropped) { struct collapse_control *cc; struct mm_struct *mm = vma->vm_mm; @@ -2748,8 +2747,6 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, BUG_ON(vma->vm_start > start); BUG_ON(vma->vm_end < end); - *prev = vma; - if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) return -EINVAL; @@ -2783,7 +2780,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, mmap_assert_locked(mm); memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); - if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) { + if (!vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, addr); @@ -2797,7 +2794,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, &mmap_locked, cc); } if (!mmap_locked) - *prev = NULL; /* Tell caller we dropped mmap_lock */ + *lock_dropped = true; handle_result: switch (result) { @@ -2807,7 +2804,6 @@ handle_result: break; case SCAN_PTE_MAPPED_HUGEPAGE: BUG_ON(mmap_locked); - BUG_ON(*prev); mmap_read_lock(mm); result = collapse_pte_mapped_thp(mm, addr, true); mmap_read_unlock(mm); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index c12cef3eeb32..8d588e685311 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -210,13 +210,11 @@ static struct kmem_cache *object_cache; static struct kmem_cache *scan_area_cache; /* set if tracing memory operations is enabled */ -static int kmemleak_enabled = 1; +static int kmemleak_enabled __read_mostly = 1; /* same as above but only for the kmemleak_free() callback */ -static int kmemleak_free_enabled = 1; +static int kmemleak_free_enabled __read_mostly = 1; /* set in the late_initcall if there were no errors */ static int kmemleak_late_initialized; -/* set if a kmemleak warning was issued */ 
-static int kmemleak_warning; /* set if a fatal kmemleak error has occurred */ static int kmemleak_error; @@ -254,7 +252,6 @@ static void kmemleak_disable(void); #define kmemleak_warn(x...) do { \ pr_warn(x); \ dump_stack(); \ - kmemleak_warning = 1; \ } while (0) /* @@ -325,8 +322,6 @@ static void hex_dump_object(struct seq_file *seq, * sufficient references to it (count >= min_count) * - black - ignore, it doesn't contain references (e.g. text section) * (min_count == -1). No function defined for this color. - * Newly created objects don't have any color assigned (object->count == -1) - * before the next memory scan when they become white. */ static bool color_white(const struct kmemleak_object *object) { @@ -1252,6 +1247,20 @@ void __ref kmemleak_transient_leak(const void *ptr) EXPORT_SYMBOL(kmemleak_transient_leak); /** + * kmemleak_ignore_percpu - similar to kmemleak_ignore but taking a percpu + * address argument + * @ptr: percpu address of the object + */ +void __ref kmemleak_ignore_percpu(const void __percpu *ptr) +{ + pr_debug("%s(0x%px)\n", __func__, ptr); + + if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr)) + make_black_object((unsigned long)ptr, OBJECT_PERCPU); +} +EXPORT_SYMBOL_GPL(kmemleak_ignore_percpu); + +/** * kmemleak_ignore - ignore an allocated object * @ptr: pointer to beginning of the object * diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index a495debf1436..1ea711786c52 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -159,8 +159,8 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id) * Make sure we have enough spare bits in @id to hold the UAF bit and * the chain depth. */ - BUILD_BUG_ON( - (1 << STACK_DEPOT_EXTRA_BITS) <= (KMSAN_MAX_ORIGIN_DEPTH << 1)); + BUILD_BUG_ON((1 << STACK_DEPOT_EXTRA_BITS) <= + (KMSAN_MAX_ORIGIN_DEPTH << 1)); extra_bits = stack_depot_get_extra_bits(id); depth = kmsan_depth_from_eb(extra_bits); @@ -274,11 +274,9 @@ void kmsan_internal_check_memory(void *addr, size_t size, * bytes before, report them. */ if (cur_origin) { - kmsan_enter_runtime(); kmsan_report(cur_origin, addr, size, cur_off_start, pos - 1, user_addr, reason); - kmsan_leave_runtime(); } cur_origin = 0; cur_off_start = -1; @@ -292,11 +290,9 @@ void kmsan_internal_check_memory(void *addr, size_t size, * poisoned bytes before, report them. 
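The kmemleak_ignore_percpu() hook added in the kmemleak hunk above is meant for percpu objects that should be neither scanned nor reported; a minimal sketch with a hypothetical percpu counter (assumes <linux/percpu.h> and <linux/kmemleak.h>):

static u64 __percpu *example_counters;

static int example_setup(void)
{
	example_counters = alloc_percpu(u64);
	if (!example_counters)
		return -ENOMEM;
	/* Treat as a black object: never reported, not scanned for pointers. */
	kmemleak_ignore_percpu(example_counters);
	return 0;
}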
*/ if (cur_origin) { - kmsan_enter_runtime(); kmsan_report(cur_origin, addr, size, cur_off_start, pos + i - 1, user_addr, reason); - kmsan_leave_runtime(); } cur_origin = 0; cur_off_start = -1; @@ -312,11 +308,9 @@ void kmsan_internal_check_memory(void *addr, size_t size, */ if (cur_origin != new_origin) { if (cur_origin) { - kmsan_enter_runtime(); kmsan_report(cur_origin, addr, size, cur_off_start, pos + i - 1, user_addr, reason); - kmsan_leave_runtime(); } cur_origin = new_origin; cur_off_start = pos + i; @@ -326,10 +320,8 @@ void kmsan_internal_check_memory(void *addr, size_t size, } KMSAN_WARN_ON(pos != size); if (cur_origin) { - kmsan_enter_runtime(); kmsan_report(cur_origin, addr, size, cur_off_start, pos - 1, user_addr, reason); - kmsan_leave_runtime(); } } diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 3df45c25c1f6..97de3d6194f0 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -114,9 +114,7 @@ void kmsan_kfree_large(const void *ptr) kmsan_enter_runtime(); page = virt_to_head_page((void *)ptr); KMSAN_WARN_ON(ptr != page_address(page)); - kmsan_internal_poison_memory((void *)ptr, - page_size(page), - GFP_KERNEL, + kmsan_internal_poison_memory((void *)ptr, page_size(page), GFP_KERNEL, KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); } @@ -277,8 +275,10 @@ void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, * Don't check anything, just copy the shadow of the copied * bytes. */ + kmsan_enter_runtime(); kmsan_internal_memmove_metadata((void *)to, (void *)from, to_copy - left); + kmsan_leave_runtime(); } user_access_restore(ua_flags); } diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c index 10f52c085e6c..b14ce3417e65 100644 --- a/mm/kmsan/init.c +++ b/mm/kmsan/init.c @@ -35,8 +35,7 @@ static void __init kmsan_record_future_shadow_range(void *start, void *end) KMSAN_WARN_ON(future_index == NUM_FUTURE_RANGES); KMSAN_WARN_ON((nstart >= nend) || /* Virtual address 0 is valid on s390. 
*/ - (!IS_ENABLED(CONFIG_S390) && !nstart) || - !nend); + (!IS_ENABLED(CONFIG_S390) && !nstart) || !nend); nstart = ALIGN_DOWN(nstart, PAGE_SIZE); nend = ALIGN(nend, PAGE_SIZE); diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c index 02a405e55d6c..69f0a57a401c 100644 --- a/mm/kmsan/instrumentation.c +++ b/mm/kmsan/instrumentation.c @@ -312,13 +312,9 @@ EXPORT_SYMBOL(__msan_unpoison_alloca); void __msan_warning(u32 origin); void __msan_warning(u32 origin) { - if (!kmsan_enabled || kmsan_in_runtime()) - return; - kmsan_enter_runtime(); kmsan_report(origin, /*address*/ NULL, /*size*/ 0, /*off_first*/ 0, /*off_last*/ 0, /*user_addr*/ NULL, REASON_ANY); - kmsan_leave_runtime(); } EXPORT_SYMBOL(__msan_warning); diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h index 29555a8bc315..bc3d1810f352 100644 --- a/mm/kmsan/kmsan.h +++ b/mm/kmsan/kmsan.h @@ -121,7 +121,6 @@ static __always_inline void kmsan_leave_runtime(void) KMSAN_WARN_ON(--ctx->kmsan_in_runtime); } -depot_stack_handle_t kmsan_save_stack(void); depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, unsigned int extra_bits); diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c index 9733a22c46c1..c6c5b2bbede0 100644 --- a/mm/kmsan/kmsan_test.c +++ b/mm/kmsan/kmsan_test.c @@ -732,3 +732,4 @@ kunit_test_suites(&kmsan_test_suite); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Alexander Potapenko <glider@google.com>"); +MODULE_DESCRIPTION("Test cases for KMSAN"); diff --git a/mm/kmsan/report.c b/mm/kmsan/report.c index 94a3303fb65e..d6853ce08954 100644 --- a/mm/kmsan/report.c +++ b/mm/kmsan/report.c @@ -157,14 +157,14 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size, unsigned long ua_flags; bool is_uaf; - if (!kmsan_enabled) + if (!kmsan_enabled || kmsan_in_runtime()) return; if (current->kmsan_ctx.depth) return; if (!origin) return; - kmsan_disable_current(); + kmsan_enter_runtime(); ua_flags = user_access_save(); raw_spin_lock(&kmsan_report_lock); pr_err("=====================================================\n"); @@ -217,5 +217,5 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size, if (panic_on_kmsan) panic("kmsan.panic set ...\n"); user_access_restore(ua_flags); - kmsan_enable_current(); + kmsan_leave_runtime(); } diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 1bb505a08415..54f3c3c962f0 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -207,8 +207,7 @@ void kmsan_free_page(struct page *page, unsigned int order) if (!kmsan_enabled || kmsan_in_runtime()) return; kmsan_enter_runtime(); - kmsan_internal_poison_memory(page_address(page), - page_size(page), + kmsan_internal_poison_memory(page_address(page), page_size(page), GFP_KERNEL, KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); @@ -248,17 +247,19 @@ int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, kmsan_enter_runtime(); mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot, s_pages, page_shift); + kmsan_leave_runtime(); if (mapped) { err = mapped; goto ret; } + kmsan_enter_runtime(); mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot, o_pages, page_shift); + kmsan_leave_runtime(); if (mapped) { err = mapped; goto ret; } - kmsan_leave_runtime(); flush_tlb_kernel_range(shadow_start, shadow_end); flush_tlb_kernel_range(origin_start, origin_end); flush_cache_vmap(shadow_start, shadow_end); @@ -677,28 +677,32 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_v return (ret & VM_FAULT_OOM) ? 
-ENOMEM : 0; } -static bool vma_ksm_compatible(struct vm_area_struct *vma) +static bool ksm_compatible(const struct file *file, vm_flags_t vm_flags) { - if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP | - VM_IO | VM_DONTEXPAND | VM_HUGETLB | - VM_MIXEDMAP| VM_DROPPABLE)) + if (vm_flags & (VM_SHARED | VM_MAYSHARE | VM_SPECIAL | + VM_HUGETLB | VM_DROPPABLE)) return false; /* just ignore the advice */ - if (vma_is_dax(vma)) + if (file_is_dax(file)) return false; #ifdef VM_SAO - if (vma->vm_flags & VM_SAO) + if (vm_flags & VM_SAO) return false; #endif #ifdef VM_SPARC_ADI - if (vma->vm_flags & VM_SPARC_ADI) + if (vm_flags & VM_SPARC_ADI) return false; #endif return true; } +static bool vma_ksm_compatible(struct vm_area_struct *vma) +{ + return ksm_compatible(vma->vm_file, vma->vm_flags); +} + static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, unsigned long addr) { @@ -889,7 +893,7 @@ static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node, unsigned long kpfn; expected_mapping = (void *)((unsigned long)stable_node | - PAGE_MAPPING_KSM); + FOLIO_MAPPING_KSM); again: kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */ folio = pfn_folio(kpfn); @@ -1066,7 +1070,7 @@ static inline void folio_set_stable_node(struct folio *folio, struct ksm_stable_node *stable_node) { VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio); - folio->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); + folio->mapping = (void *)((unsigned long)stable_node | FOLIO_MAPPING_KSM); } #ifdef CONFIG_SYSFS @@ -2696,14 +2700,17 @@ static int ksm_scan_thread(void *nothing) return 0; } -static void __ksm_add_vma(struct vm_area_struct *vma) +static bool __ksm_should_add_vma(const struct file *file, vm_flags_t vm_flags) { - unsigned long vm_flags = vma->vm_flags; - if (vm_flags & VM_MERGEABLE) - return; + return false; + + return ksm_compatible(file, vm_flags); +} - if (vma_ksm_compatible(vma)) +static void __ksm_add_vma(struct vm_area_struct *vma) +{ + if (__ksm_should_add_vma(vma->vm_file, vma->vm_flags)) vm_flags_set(vma, VM_MERGEABLE); } @@ -2724,16 +2731,22 @@ static int __ksm_del_vma(struct vm_area_struct *vma) return 0; } /** - * ksm_add_vma - Mark vma as mergeable if compatible + * ksm_vma_flags - Update VMA flags to mark as mergeable if compatible + * + * @mm: Proposed VMA's mm_struct + * @file: Proposed VMA's file-backed mapping, if any. + * @vm_flags: Proposed VMA"s flags. * - * @vma: Pointer to vma + * Returns: @vm_flags possibly updated to mark mergeable. 
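ksm_vma_flags() replaces the VMA-based ksm_add_vma() with a flags-in, flags-out helper that the mmap path can call before the VMA exists; a minimal sketch, with example_compute_mmap_flags() hypothetical:

static vm_flags_t example_compute_mmap_flags(struct mm_struct *mm,
					     struct file *file,
					     vm_flags_t vm_flags)
{
	/*
	 * Adds VM_MERGEABLE iff MMF_VM_MERGE_ANY is set on the mm and the
	 * proposed (file, vm_flags) pair is KSM-compatible.
	 */
	return ksm_vma_flags(mm, file, vm_flags);
}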
*/ -void ksm_add_vma(struct vm_area_struct *vma) +vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, + vm_flags_t vm_flags) { - struct mm_struct *mm = vma->vm_mm; + if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) && + __ksm_should_add_vma(file, vm_flags)) + vm_flags |= VM_MERGEABLE; - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) - __ksm_add_vma(vma); + return vm_flags; } static void ksm_add_vmas(struct mm_struct *mm) @@ -2827,7 +2840,7 @@ int ksm_disable(struct mm_struct *mm) } int ksm_madvise(struct vm_area_struct *vma, unsigned long start, - unsigned long end, int advice, unsigned long *vm_flags) + unsigned long end, int advice, vm_flags_t *vm_flags) { struct mm_struct *mm = vma->vm_mm; int err; @@ -3669,10 +3682,10 @@ static ssize_t advisor_mode_show(struct kobject *kobj, { const char *output; - if (ksm_advisor == KSM_ADVISOR_NONE) - output = "[none] scan-time"; - else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) + if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) output = "none [scan-time]"; + else + output = "[none] scan-time"; return sysfs_emit(buf, "%s\n", output); } diff --git a/mm/list_lru.c b/mm/list_lru.c index 490473af3122..ec48b5dadf51 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -60,30 +60,34 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) return &lru->node[nid].lru; } +static inline bool lock_list_lru(struct list_lru_one *l, bool irq) +{ + if (irq) + spin_lock_irq(&l->lock); + else + spin_lock(&l->lock); + if (unlikely(READ_ONCE(l->nr_items) == LONG_MIN)) { + if (irq) + spin_unlock_irq(&l->lock); + else + spin_unlock(&l->lock); + return false; + } + return true; +} + static inline struct list_lru_one * lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, bool irq, bool skip_empty) { struct list_lru_one *l; - long nr_items; rcu_read_lock(); again: l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); - if (likely(l)) { - if (irq) - spin_lock_irq(&l->lock); - else - spin_lock(&l->lock); - nr_items = READ_ONCE(l->nr_items); - if (likely(nr_items != LONG_MIN)) { - rcu_read_unlock(); - return l; - } - if (irq) - spin_unlock_irq(&l->lock); - else - spin_unlock(&l->lock); + if (likely(l) && lock_list_lru(l, irq)) { + rcu_read_unlock(); + return l; } /* * Caller may simply bail out if raced with reparenting or diff --git a/mm/maccess.c b/mm/maccess.c index 8f0906180a94..486559d68858 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -82,7 +82,6 @@ Efault: pagefault_enable(); return -EFAULT; } -EXPORT_SYMBOL_GPL(copy_to_kernel_nofault); long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) { @@ -196,7 +195,7 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, if (ret >= count) { ret = count; dst[ret - 1] = '\0'; - } else if (ret > 0) { + } else if (ret >= 0) { ret++; } diff --git a/mm/madvise.c b/mm/madvise.c index b17f684322ad..bb80fc5ea08f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -37,6 +37,8 @@ #include "internal.h" #include "swap.h" +#define __MADV_SET_ANON_VMA_NAME (-1) + /* * Maximum number of attempts we make to install guard pages before we give up * and return -ERESTARTNOINTR to have userspace try again. @@ -48,34 +50,39 @@ struct madvise_walk_private { bool pageout; }; -/* - * Any behaviour which results in changes to the vma->vm_flags needs to - * take mmap_lock for writing. Others, which simply traverse vmas, need - * to only take it for reading. 
- */ -static int madvise_need_mmap_write(int behavior) -{ - switch (behavior) { - case MADV_REMOVE: - case MADV_WILLNEED: - case MADV_DONTNEED: - case MADV_DONTNEED_LOCKED: - case MADV_COLD: - case MADV_PAGEOUT: - case MADV_FREE: - case MADV_POPULATE_READ: - case MADV_POPULATE_WRITE: - case MADV_COLLAPSE: - case MADV_GUARD_INSTALL: - case MADV_GUARD_REMOVE: - return 0; - default: - /* be safe, default to 1. list exceptions explicitly */ - return 1; - } -} +enum madvise_lock_mode { + MADVISE_NO_LOCK, + MADVISE_MMAP_READ_LOCK, + MADVISE_MMAP_WRITE_LOCK, + MADVISE_VMA_READ_LOCK, +}; + +struct madvise_behavior_range { + unsigned long start; + unsigned long end; +}; + +struct madvise_behavior { + struct mm_struct *mm; + int behavior; + struct mmu_gather *tlb; + enum madvise_lock_mode lock_mode; + struct anon_vma_name *anon_name; + + /* + * The range over which the behaviour is currently being applied. If + * traversing multiple VMAs, this is updated for each. + */ + struct madvise_behavior_range range; + /* The VMA and VMA preceding it (if applicable) currently targeted. */ + struct vm_area_struct *prev; + struct vm_area_struct *vma; + bool lock_dropped; +}; #ifdef CONFIG_ANON_VMA_NAME +static int madvise_walk_vmas(struct madvise_behavior *madv_behavior); + struct anon_vma_name *anon_vma_name_alloc(const char *name) { struct anon_vma_name *anon_name; @@ -101,7 +108,8 @@ void anon_vma_name_free(struct kref *kref) struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { - mmap_assert_locked(vma->vm_mm); + if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) + vma_assert_locked(vma); return vma->anon_name; } @@ -137,40 +145,39 @@ static int replace_anon_vma_name(struct vm_area_struct *vma, } #endif /* CONFIG_ANON_VMA_NAME */ /* - * Update the vm_flags on region of a vma, splitting it or merging it as - * necessary. Must be called with mmap_lock held for writing; - * Caller should ensure anon_name stability by raising its refcount even when - * anon_name belongs to a valid vma because this function might free that vma. + * Update the vm_flags or anon_name on region of a vma, splitting it or merging + * it as necessary. Must be called with mmap_lock held for writing. 
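The enum above replaces the old madvise_need_mmap_write() yes/no answer with a per-behavior lock mode; a sketch of the kind of mapping involved (the behaviors grouped under each mode here are illustrative, not the patch's exact table):

static enum madvise_lock_mode example_lock_mode(int behavior)
{
	switch (behavior) {
	case MADV_DONTNEED:
	case MADV_FREE:
		/* Can operate under a per-VMA read lock. */
		return MADVISE_VMA_READ_LOCK;
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_WILLNEED:
		/* Only traverses VMAs; the mmap read lock suffices. */
		return MADVISE_MMAP_READ_LOCK;
	default:
		/* Anything changing vm_flags needs the mmap write lock. */
		return MADVISE_MMAP_WRITE_LOCK;
	}
}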
*/ -static int madvise_update_vma(struct vm_area_struct *vma, - struct vm_area_struct **prev, unsigned long start, - unsigned long end, unsigned long new_flags, - struct anon_vma_name *anon_name) +static int madvise_update_vma(vm_flags_t new_flags, + struct madvise_behavior *madv_behavior) { - struct mm_struct *mm = vma->vm_mm; - int error; - VMA_ITERATOR(vmi, mm, start); - - if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { - *prev = vma; + struct vm_area_struct *vma = madv_behavior->vma; + struct madvise_behavior_range *range = &madv_behavior->range; + struct anon_vma_name *anon_name = madv_behavior->anon_name; + bool set_new_anon_name = madv_behavior->behavior == __MADV_SET_ANON_VMA_NAME; + VMA_ITERATOR(vmi, madv_behavior->mm, range->start); + + if (new_flags == vma->vm_flags && (!set_new_anon_name || + anon_vma_name_eq(anon_vma_name(vma), anon_name))) return 0; - } - vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags, - anon_name); + if (set_new_anon_name) + vma = vma_modify_name(&vmi, madv_behavior->prev, vma, + range->start, range->end, anon_name); + else + vma = vma_modify_flags(&vmi, madv_behavior->prev, vma, + range->start, range->end, new_flags); + if (IS_ERR(vma)) return PTR_ERR(vma); - *prev = vma; + madv_behavior->vma = vma; /* vm_flags is protected by the mmap_lock held in write mode. */ vma_start_write(vma); vm_flags_reset(vma, new_flags); - if (!vma->vm_file || vma_is_anon_shmem(vma)) { - error = replace_anon_vma_name(vma, anon_name); - if (error) - return error; - } + if (set_new_anon_name) + return replace_anon_vma_name(vma, anon_name); return 0; } @@ -263,21 +270,27 @@ static void shmem_swapin_range(struct vm_area_struct *vma, } #endif /* CONFIG_SWAP */ +static void mark_mmap_lock_dropped(struct madvise_behavior *madv_behavior) +{ + VM_WARN_ON_ONCE(madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK); + madv_behavior->lock_dropped = true; +} + /* * Schedule all required I/O operations. Do not wait for completion. */ -static long madvise_willneed(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end) +static long madvise_willneed(struct madvise_behavior *madv_behavior) { - struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *vma = madv_behavior->vma; + struct mm_struct *mm = madv_behavior->mm; struct file *file = vma->vm_file; + unsigned long start = madv_behavior->range.start; + unsigned long end = madv_behavior->range.end; loff_t offset; - *prev = vma; #ifdef CONFIG_SWAP if (!file) { - walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma); + walk_page_range_vma(vma, start, end, &swapin_walk_ops, vma); lru_add_drain(); /* Push any new pages onto the LRU now */ return 0; } @@ -303,7 +316,7 @@ static long madvise_willneed(struct vm_area_struct *vma, * vma's reference to the file) can go away as soon as we drop * mmap_lock. 
*/ - *prev = NULL; /* tell sys_madvise we drop mmap_lock */ + mark_mmap_lock_dropped(madv_behavior); get_file(file); offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); @@ -331,14 +344,12 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma) static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, struct folio *folio, pte_t *ptep, - pte_t pte, bool *any_young, - bool *any_dirty) + pte_t *ptentp) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; int max_nr = (end - addr) / PAGE_SIZE; - return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL, - any_young, any_dirty); + return folio_pte_batch_flags(folio, NULL, ptep, ptentp, max_nr, + FPB_MERGE_YOUNG_DIRTY); } static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, @@ -476,13 +487,7 @@ restart: * next pte in the range. */ if (folio_test_large(folio)) { - bool any_young; - - nr = madvise_folio_pte_batch(addr, end, folio, pte, - ptent, &any_young, NULL); - if (any_young) - ptent = pte_mkyoung(ptent); - + nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent); if (nr < folio_nr_pages(folio)) { int err; @@ -503,6 +508,7 @@ restart: pte_offset_map_lock(mm, pmd, addr, &ptl); if (!start_pte) break; + flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); if (!err) nr = 0; @@ -567,16 +573,19 @@ static const struct mm_walk_ops cold_walk_ops = { }; static void madvise_cold_page_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, - unsigned long addr, unsigned long end) + struct madvise_behavior *madv_behavior) + { + struct vm_area_struct *vma = madv_behavior->vma; + struct madvise_behavior_range *range = &madv_behavior->range; struct madvise_walk_private walk_private = { .pageout = false, .tlb = tlb, }; tlb_start_vma(tlb, vma); - walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); + walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops, + &walk_private); tlb_end_vma(tlb, vma); } @@ -585,28 +594,25 @@ static inline bool can_madv_lru_vma(struct vm_area_struct *vma) return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB)); } -static long madvise_cold(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start_addr, unsigned long end_addr) +static long madvise_cold(struct madvise_behavior *madv_behavior) { - struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *vma = madv_behavior->vma; struct mmu_gather tlb; - *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; lru_add_drain(); - tlb_gather_mmu(&tlb, mm); - madvise_cold_page_range(&tlb, vma, start_addr, end_addr); + tlb_gather_mmu(&tlb, madv_behavior->mm); + madvise_cold_page_range(&tlb, madv_behavior); tlb_finish_mmu(&tlb); return 0; } static void madvise_pageout_page_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, - unsigned long addr, unsigned long end) + struct vm_area_struct *vma, + struct madvise_behavior_range *range) { struct madvise_walk_private walk_private = { .pageout = true, @@ -614,18 +620,16 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb, }; tlb_start_vma(tlb, vma); - walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); + walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops, + &walk_private); tlb_end_vma(tlb, vma); } -static long madvise_pageout(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start_addr, unsigned long end_addr) +static long madvise_pageout(struct madvise_behavior *madv_behavior) { - struct mm_struct 
*mm = vma->vm_mm; struct mmu_gather tlb; + struct vm_area_struct *vma = madv_behavior->vma; - *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; @@ -640,8 +644,8 @@ static long madvise_pageout(struct vm_area_struct *vma, return 0; lru_add_drain(); - tlb_gather_mmu(&tlb, mm); - madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); + tlb_gather_mmu(&tlb, madv_behavior->mm); + madvise_pageout_page_range(&tlb, vma, &madv_behavior->range); tlb_finish_mmu(&tlb); return 0; @@ -713,11 +717,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, * next pte in the range. */ if (folio_test_large(folio)) { - bool any_young, any_dirty; - - nr = madvise_folio_pte_batch(addr, end, folio, pte, - ptent, &any_young, &any_dirty); - + nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent); if (nr < folio_nr_pages(folio)) { int err; @@ -736,16 +736,12 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, start_pte = pte; if (!start_pte) break; + flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); if (!err) nr = 0; continue; } - - if (any_young) - ptent = pte_mkyoung(ptent); - if (any_dirty) - ptent = pte_mkdirty(ptent); } if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { @@ -789,17 +785,31 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -static const struct mm_walk_ops madvise_free_walk_ops = { - .pmd_entry = madvise_free_pte_range, - .walk_lock = PGWALK_RDLOCK, -}; +static inline enum page_walk_lock get_walk_lock(enum madvise_lock_mode mode) +{ + switch (mode) { + case MADVISE_VMA_READ_LOCK: + return PGWALK_VMA_RDLOCK_VERIFY; + case MADVISE_MMAP_READ_LOCK: + return PGWALK_RDLOCK; + default: + /* Other modes don't require fixing up the walk_lock */ + WARN_ON_ONCE(1); + return PGWALK_RDLOCK; + } +} -static int madvise_free_single_vma(struct vm_area_struct *vma, - unsigned long start_addr, unsigned long end_addr) +static int madvise_free_single_vma(struct madvise_behavior *madv_behavior) { - struct mm_struct *mm = vma->vm_mm; + struct mm_struct *mm = madv_behavior->mm; + struct vm_area_struct *vma = madv_behavior->vma; + unsigned long start_addr = madv_behavior->range.start; + unsigned long end_addr = madv_behavior->range.end; struct mmu_notifier_range range; - struct mmu_gather tlb; + struct mmu_gather *tlb = madv_behavior->tlb; + struct mm_walk_ops walk_ops = { + .pmd_entry = madvise_free_pte_range, + }; /* MADV_FREE works for only anon vma at the moment */ if (!vma_is_anonymous(vma)) @@ -815,17 +825,15 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, range.start, range.end); lru_add_drain(); - tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(&range); - tlb_start_vma(&tlb, vma); - walk_page_range(vma->vm_mm, range.start, range.end, - &madvise_free_walk_ops, &tlb); - tlb_end_vma(&tlb, vma); + tlb_start_vma(tlb, vma); + walk_ops.walk_lock = get_walk_lock(madv_behavior->lock_mode); + walk_page_range_vma(vma, range.start, range.end, + &walk_ops, tlb); + tlb_end_vma(tlb, vma); mmu_notifier_invalidate_range_end(&range); - tlb_finish_mmu(&tlb); - return 0; } @@ -848,23 +856,28 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). 
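For context, here is a hedged userspace sketch of the two behaviours that madvise_dontneed_free() dispatches (not part of this patch): MADV_FREE applies only to private anonymous memory and defers the actual freeing until reclaim, while MADV_DONTNEED drops the pages immediately.

```c
/* Illustrative userspace example, not part of the patch. */
#include <sys/mman.h>

#ifndef MADV_FREE
#define MADV_FREE 8	/* fallback for older uapi headers */
#endif

int drop_scratch(void *buf, size_t len, int lazy)
{
	/* buf must be page-aligned anonymous memory, e.g. from mmap(). */
	return madvise(buf, len, lazy ? MADV_FREE : MADV_DONTNEED);
}
```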
*/ -static long madvise_dontneed_single_vma(struct vm_area_struct *vma, - unsigned long start, unsigned long end) +static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior) + { + struct madvise_behavior_range *range = &madv_behavior->range; struct zap_details details = { .reclaim_pt = true, .even_cows = true, }; - zap_page_range_single(vma, start, end - start, &details); + zap_page_range_single_batched( + madv_behavior->tlb, madv_behavior->vma, range->start, + range->end - range->start, &details); return 0; } -static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, - unsigned long start, - unsigned long *end, - int behavior) +static +bool madvise_dontneed_free_valid_vma(struct madvise_behavior *madv_behavior) { + struct vm_area_struct *vma = madv_behavior->vma; + int behavior = madv_behavior->behavior; + struct madvise_behavior_range *range = &madv_behavior->range; + if (!is_vm_hugetlb_page(vma)) { unsigned int forbidden = VM_PFNMAP; @@ -876,7 +889,7 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED) return false; - if (start & ~huge_page_mask(hstate_vma(vma))) + if (range->start & ~huge_page_mask(hstate_vma(vma))) return false; /* @@ -885,40 +898,38 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, * Avoid unexpected data loss by rounding down the number of * huge pages freed. */ - *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma))); + range->end = ALIGN_DOWN(range->end, huge_page_size(hstate_vma(vma))); return true; } -static long madvise_dontneed_free(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end, - int behavior) +static long madvise_dontneed_free(struct madvise_behavior *madv_behavior) { - struct mm_struct *mm = vma->vm_mm; + struct mm_struct *mm = madv_behavior->mm; + struct madvise_behavior_range *range = &madv_behavior->range; + int behavior = madv_behavior->behavior; - *prev = vma; - if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) + if (!madvise_dontneed_free_valid_vma(madv_behavior)) return -EINVAL; - if (start == end) + if (range->start == range->end) return 0; - if (!userfaultfd_remove(vma, start, end)) { - *prev = NULL; /* mmap_lock has been dropped, prev is stale */ + if (!userfaultfd_remove(madv_behavior->vma, range->start, range->end)) { + struct vm_area_struct *vma; + mark_mmap_lock_dropped(madv_behavior); mmap_read_lock(mm); - vma = vma_lookup(mm, start); + madv_behavior->vma = vma = vma_lookup(mm, range->start); if (!vma) return -ENOMEM; /* * Potential end adjustment for hugetlb vma is OK as * the check below keeps end within vma. */ - if (!madvise_dontneed_free_valid_vma(vma, start, &end, - behavior)) + if (!madvise_dontneed_free_valid_vma(madv_behavior)) return -EINVAL; - if (end > vma->vm_end) { + if (range->end > vma->vm_end) { /* * Don't fail if end > vma->vm_end. If the old * vma was split while the mmap_lock was @@ -931,7 +942,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, * end-vma->vm_end range, but the manager can * handle a repetition fine. */ - end = vma->vm_end; + range->end = vma->vm_end; } /* * If the memory region between start and end was @@ -940,24 +951,26 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, * the adjustment for hugetlb vma above may have rounded * end down to the start address. 
*/ - if (start == end) + if (range->start == range->end) return 0; - VM_WARN_ON(start > end); + VM_WARN_ON(range->start > range->end); } if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) - return madvise_dontneed_single_vma(vma, start, end); + return madvise_dontneed_single_vma(madv_behavior); else if (behavior == MADV_FREE) - return madvise_free_single_vma(vma, start, end); + return madvise_free_single_vma(madv_behavior); else return -EINVAL; } -static long madvise_populate(struct mm_struct *mm, unsigned long start, - unsigned long end, int behavior) +static long madvise_populate(struct madvise_behavior *madv_behavior) { - const bool write = behavior == MADV_POPULATE_WRITE; + struct mm_struct *mm = madv_behavior->mm; + const bool write = madv_behavior->behavior == MADV_POPULATE_WRITE; int locked = 1; + unsigned long start = madv_behavior->range.start; + unsigned long end = madv_behavior->range.end; long pages; while (start < end) { @@ -994,16 +1007,17 @@ static long madvise_populate(struct mm_struct *mm, unsigned long start, * Application wants to free up the pages and associated backing store. * This is effectively punching a hole into the middle of a file. */ -static long madvise_remove(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end) +static long madvise_remove(struct madvise_behavior *madv_behavior) { loff_t offset; int error; struct file *f; - struct mm_struct *mm = vma->vm_mm; + struct mm_struct *mm = madv_behavior->mm; + struct vm_area_struct *vma = madv_behavior->vma; + unsigned long start = madv_behavior->range.start; + unsigned long end = madv_behavior->range.end; - *prev = NULL; /* tell sys_madvise we drop mmap_lock */ + mark_mmap_lock_dropped(madv_behavior); if (vma->vm_flags & VM_LOCKED) return -EINVAL; @@ -1066,7 +1080,7 @@ static int guard_install_pud_entry(pud_t *pud, unsigned long addr, pud_t pudval = pudp_get(pud); /* If huge return >0 so we abort the operation + zap. */ - return pud_trans_huge(pudval) || pud_devmap(pudval); + return pud_trans_huge(pudval); } static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr, @@ -1075,7 +1089,7 @@ static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr, pmd_t pmdval = pmdp_get(pmd); /* If huge return >0 so we abort the operation + zap. */ - return pmd_trans_huge(pmdval) || pmd_devmap(pmdval); + return pmd_trans_huge(pmdval); } static int guard_install_pte_entry(pte_t *pte, unsigned long addr, @@ -1115,14 +1129,13 @@ static const struct mm_walk_ops guard_install_walk_ops = { .walk_lock = PGWALK_RDLOCK, }; -static long madvise_guard_install(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end) +static long madvise_guard_install(struct madvise_behavior *madv_behavior) { + struct vm_area_struct *vma = madv_behavior->vma; + struct madvise_behavior_range *range = &madv_behavior->range; long err; int i; - *prev = vma; if (!is_valid_guard_vma(vma, /* allow_locked = */false)) return -EINVAL; @@ -1153,13 +1166,14 @@ static long madvise_guard_install(struct vm_area_struct *vma, unsigned long nr_pages = 0; /* Returns < 0 on error, == 0 if success, > 0 if zap needed. 
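A small illustrative use of the populate behaviours handled by madvise_populate() (not part of this patch; MADV_POPULATE_READ/WRITE have been available since Linux 5.14, and the fallback defines are only for older uapi headers):

```c
/* Illustrative userspace example, not part of the patch. */
#include <sys/mman.h>

#ifndef MADV_POPULATE_WRITE
#define MADV_POPULATE_READ	22
#define MADV_POPULATE_WRITE	23
#endif

void *alloc_prefaulted(size_t len)
{
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return NULL;

	/* Best effort: fall back to lazy faulting if unsupported. */
	(void)madvise(p, len, MADV_POPULATE_WRITE);
	return p;
}
```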
*/ - err = walk_page_range_mm(vma->vm_mm, start, end, + err = walk_page_range_mm(vma->vm_mm, range->start, range->end, &guard_install_walk_ops, &nr_pages); if (err < 0) return err; if (err == 0) { - unsigned long nr_expected_pages = PHYS_PFN(end - start); + unsigned long nr_expected_pages = + PHYS_PFN(range->end - range->start); VM_WARN_ON(nr_pages != nr_expected_pages); return 0; @@ -1169,7 +1183,8 @@ static long madvise_guard_install(struct vm_area_struct *vma, * OK some of the range have non-guard pages mapped, zap * them. This leaves existing guard pages in place. */ - zap_page_range_single(vma, start, end - start, NULL); + zap_page_range_single(vma, range->start, + range->end - range->start, NULL); } /* @@ -1186,7 +1201,7 @@ static int guard_remove_pud_entry(pud_t *pud, unsigned long addr, pud_t pudval = pudp_get(pud); /* If huge, cannot have guard pages present, so no-op - skip. */ - if (pud_trans_huge(pudval) || pud_devmap(pudval)) + if (pud_trans_huge(pudval)) walk->action = ACTION_CONTINUE; return 0; @@ -1198,7 +1213,7 @@ static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr, pmd_t pmdval = pmdp_get(pmd); /* If huge, cannot have guard pages present, so no-op - skip. */ - if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) + if (pmd_trans_huge(pmdval)) walk->action = ACTION_CONTINUE; return 0; @@ -1225,11 +1240,11 @@ static const struct mm_walk_ops guard_remove_walk_ops = { .walk_lock = PGWALK_RDLOCK, }; -static long madvise_guard_remove(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end) +static long madvise_guard_remove(struct madvise_behavior *madv_behavior) { - *prev = vma; + struct vm_area_struct *vma = madv_behavior->vma; + struct madvise_behavior_range *range = &madv_behavior->range; + /* * We're ok with removing guards in mlock()'d ranges, as this is a * non-destructive action. @@ -1237,7 +1252,7 @@ static long madvise_guard_remove(struct vm_area_struct *vma, if (!is_valid_guard_vma(vma, /* allow_locked = */true)) return -EINVAL; - return walk_page_range(vma->vm_mm, start, end, + return walk_page_range_vma(vma, range->start, range->end, &guard_remove_walk_ops, NULL); } @@ -1246,31 +1261,40 @@ static long madvise_guard_remove(struct vm_area_struct *vma, * will handle splitting a vm area into separate areas, each area with its own * behavior. 
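For context, the guard regions installed and removed above are exercised from userspace roughly as follows (illustrative only, not part of this patch; the numeric fallbacks assume the MADV_GUARD_* values from recent asm-generic/mman-common.h, introduced in Linux 6.13):

```c
/* Illustrative userspace example, not part of the patch. */
#include <sys/mman.h>

#ifndef MADV_GUARD_INSTALL
#define MADV_GUARD_INSTALL	102
#define MADV_GUARD_REMOVE	103
#endif

int guard_first_page(void *region, size_t page_size)
{
	/* Accesses to this page now fault, without splitting the VMA. */
	return madvise(region, page_size, MADV_GUARD_INSTALL);
}

int unguard_first_page(void *region, size_t page_size)
{
	return madvise(region, page_size, MADV_GUARD_REMOVE);
}
```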
*/ -static int madvise_vma_behavior(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end, - unsigned long behavior) +static int madvise_vma_behavior(struct madvise_behavior *madv_behavior) { + int behavior = madv_behavior->behavior; + struct vm_area_struct *vma = madv_behavior->vma; + vm_flags_t new_flags = vma->vm_flags; + struct madvise_behavior_range *range = &madv_behavior->range; int error; - struct anon_vma_name *anon_name; - unsigned long new_flags = vma->vm_flags; - if (unlikely(!can_modify_vma_madv(vma, behavior))) + if (unlikely(!can_modify_vma_madv(madv_behavior->vma, behavior))) return -EPERM; switch (behavior) { case MADV_REMOVE: - return madvise_remove(vma, prev, start, end); + return madvise_remove(madv_behavior); case MADV_WILLNEED: - return madvise_willneed(vma, prev, start, end); + return madvise_willneed(madv_behavior); case MADV_COLD: - return madvise_cold(vma, prev, start, end); + return madvise_cold(madv_behavior); case MADV_PAGEOUT: - return madvise_pageout(vma, prev, start, end); + return madvise_pageout(madv_behavior); case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: - return madvise_dontneed_free(vma, prev, start, end, behavior); + return madvise_dontneed_free(madv_behavior); + case MADV_COLLAPSE: + return madvise_collapse(vma, range->start, range->end, + &madv_behavior->lock_dropped); + case MADV_GUARD_INSTALL: + return madvise_guard_install(madv_behavior); + case MADV_GUARD_REMOVE: + return madvise_guard_remove(madv_behavior); + + /* The below behaviours update VMAs via madvise_update_vma(). */ + case MADV_NORMAL: new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; break; @@ -1284,18 +1308,18 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, new_flags |= VM_DONTCOPY; break; case MADV_DOFORK: - if (vma->vm_flags & VM_IO) + if (new_flags & VM_IO) return -EINVAL; new_flags &= ~VM_DONTCOPY; break; case MADV_WIPEONFORK: /* MADV_WIPEONFORK is only supported on anonymous memory. 
*/ - if (vma->vm_file || vma->vm_flags & VM_SHARED) + if (vma->vm_file || new_flags & VM_SHARED) return -EINVAL; new_flags |= VM_WIPEONFORK; break; case MADV_KEEPONFORK: - if (vma->vm_flags & VM_DROPPABLE) + if (new_flags & VM_DROPPABLE) return -EINVAL; new_flags &= ~VM_WIPEONFORK; break; @@ -1303,14 +1327,15 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: - if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) || - (vma->vm_flags & VM_DROPPABLE)) + if ((!is_vm_hugetlb_page(vma) && (new_flags & VM_SPECIAL)) || + (new_flags & VM_DROPPABLE)) return -EINVAL; new_flags &= ~VM_DONTDUMP; break; case MADV_MERGEABLE: case MADV_UNMERGEABLE: - error = ksm_madvise(vma, start, end, behavior, &new_flags); + error = ksm_madvise(vma, range->start, range->end, + behavior, &new_flags); if (error) goto out; break; @@ -1320,20 +1345,17 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, if (error) goto out; break; - case MADV_COLLAPSE: - return madvise_collapse(vma, prev, start, end); - case MADV_GUARD_INSTALL: - return madvise_guard_install(vma, prev, start, end); - case MADV_GUARD_REMOVE: - return madvise_guard_remove(vma, prev, start, end); + case __MADV_SET_ANON_VMA_NAME: + /* Only anonymous mappings can be named */ + if (vma->vm_file && !vma_is_anon_shmem(vma)) + return -EBADF; + break; } - anon_name = anon_vma_name(vma); - anon_vma_name_get(anon_name); - error = madvise_update_vma(vma, prev, start, end, new_flags, - anon_name); - anon_vma_name_put(anon_name); + /* This is a write operation.*/ + VM_WARN_ON_ONCE(madv_behavior->lock_mode != MADVISE_MMAP_WRITE_LOCK); + error = madvise_update_vma(new_flags, madv_behavior); out: /* * madvise() returns EAGAIN if kernel resources, such as @@ -1348,15 +1370,15 @@ out: /* * Error injection support for memory error handling. */ -static int madvise_inject_error(int behavior, - unsigned long start, unsigned long end) +static int madvise_inject_error(struct madvise_behavior *madv_behavior) { unsigned long size; + unsigned long start = madv_behavior->range.start; + unsigned long end = madv_behavior->range.end; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - for (; start < end; start += size) { unsigned long pfn; struct page *page; @@ -1374,7 +1396,7 @@ static int madvise_inject_error(int behavior, */ size = page_size(compound_head(page)); - if (behavior == MADV_SOFT_OFFLINE) { + if (madv_behavior->behavior == MADV_SOFT_OFFLINE) { pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", pfn, start); ret = soft_offline_page(pfn, MF_COUNT_INCREASED); @@ -1393,9 +1415,9 @@ static int madvise_inject_error(int behavior, return 0; } -static bool is_memory_failure(int behavior) +static bool is_memory_failure(struct madvise_behavior *madv_behavior) { - switch (behavior) { + switch (madv_behavior->behavior) { case MADV_HWPOISON: case MADV_SOFT_OFFLINE: return true; @@ -1406,13 +1428,12 @@ static bool is_memory_failure(int behavior) #else -static int madvise_inject_error(int behavior, - unsigned long start, unsigned long end) +static int madvise_inject_error(struct madvise_behavior *madv_behavior) { return 0; } -static bool is_memory_failure(int behavior) +static bool is_memory_failure(struct madvise_behavior *madv_behavior) { return false; } @@ -1478,145 +1499,226 @@ static bool process_madvise_remote_valid(int behavior) } /* - * Walk the vmas in range [start,end), and call the visit function on each one. 
- * The visit function will get start and end parameters that cover the overlap - * between the current vma and the original range. Any unmapped regions in the - * original range will result in this function returning -ENOMEM while still - * calling the visit function on all of the existing vmas in the range. - * Must be called with the mmap_lock held for reading or writing. + * Try to acquire a VMA read lock if possible. + * + * We only support this lock over a single VMA, which the input range must + * span either partially or fully. + * + * This function always returns with an appropriate lock held. If a VMA read + * lock could be acquired, we return true and set madv_behavior state + * accordingly. + * + * If a VMA read lock could not be acquired, we return false and expect caller to + * fallback to mmap lock behaviour. */ -static -int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned long arg, - int (*visit)(struct vm_area_struct *vma, - struct vm_area_struct **prev, unsigned long start, - unsigned long end, unsigned long arg)) +static bool try_vma_read_lock(struct madvise_behavior *madv_behavior) { + struct mm_struct *mm = madv_behavior->mm; struct vm_area_struct *vma; - struct vm_area_struct *prev; - unsigned long tmp; + + vma = lock_vma_under_rcu(mm, madv_behavior->range.start); + if (!vma) + goto take_mmap_read_lock; + /* + * Must span only a single VMA; uffd and remote processes are + * unsupported. + */ + if (madv_behavior->range.end > vma->vm_end || current->mm != mm || + userfaultfd_armed(vma)) { + vma_end_read(vma); + goto take_mmap_read_lock; + } + madv_behavior->vma = vma; + return true; + +take_mmap_read_lock: + mmap_read_lock(mm); + madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK; + return false; +} + +/* + * Walk the vmas in range [start,end), and call the madvise_vma_behavior + * function on each one. The function will get start and end parameters that + * cover the overlap between the current vma and the original range. Any + * unmapped regions in the original range will result in this function returning + * -ENOMEM while still calling the madvise_vma_behavior function on all of the + * existing vmas in the range. Must be called with the mmap_lock held for + * reading or writing. + */ +static +int madvise_walk_vmas(struct madvise_behavior *madv_behavior) +{ + struct mm_struct *mm = madv_behavior->mm; + struct madvise_behavior_range *range = &madv_behavior->range; + /* range is updated to span each VMA, so store end of entire range. */ + unsigned long last_end = range->end; int unmapped_error = 0; + int error; + struct vm_area_struct *prev, *vma; /* - * If the interval [start,end) covers some unmapped address - * ranges, just ignore them, but return -ENOMEM at the end. - * - different from the way of handling in mlock etc. + * If VMA read lock is supported, apply madvise to a single VMA + * tentatively, avoiding walking VMAs. */ - vma = find_vma_prev(mm, start, &prev); - if (vma && start > vma->vm_start) + if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK && + try_vma_read_lock(madv_behavior)) { + error = madvise_vma_behavior(madv_behavior); + vma_end_read(madv_behavior->vma); + return error; + } + + vma = find_vma_prev(mm, range->start, &prev); + if (vma && range->start > vma->vm_start) prev = vma; for (;;) { - int error; - /* Still start < end. */ if (!vma) return -ENOMEM; - /* Here start < (end|vma->vm_end). */ - if (start < vma->vm_start) { + /* Here start < (last_end|vma->vm_end). 
*/ + if (range->start < vma->vm_start) { + /* + * This indicates a gap between VMAs in the input + * range. This does not cause the operation to abort, + * rather we simply return -ENOMEM to indicate that this + * has happened, but carry on. + */ unmapped_error = -ENOMEM; - start = vma->vm_start; - if (start >= end) + range->start = vma->vm_start; + if (range->start >= last_end) break; } - /* Here vma->vm_start <= start < (end|vma->vm_end) */ - tmp = vma->vm_end; - if (end < tmp) - tmp = end; + /* Here vma->vm_start <= range->start < (last_end|vma->vm_end) */ + range->end = min(vma->vm_end, last_end); - /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ - error = visit(vma, &prev, start, tmp, arg); + /* Here vma->vm_start <= range->start < range->end <= (last_end|vma->vm_end). */ + madv_behavior->prev = prev; + madv_behavior->vma = vma; + error = madvise_vma_behavior(madv_behavior); if (error) return error; - start = tmp; - if (prev && start < prev->vm_end) - start = prev->vm_end; - if (start >= end) + if (madv_behavior->lock_dropped) { + /* We dropped the mmap lock, we can't ref the VMA. */ + prev = NULL; + vma = NULL; + madv_behavior->lock_dropped = false; + } else { + vma = madv_behavior->vma; + prev = vma; + } + + if (vma && range->end < vma->vm_end) + range->end = vma->vm_end; + if (range->end >= last_end) break; - if (prev) - vma = find_vma(mm, prev->vm_end); - else /* madvise_remove dropped mmap_lock */ - vma = find_vma(mm, start); + + vma = find_vma(mm, vma ? vma->vm_end : range->end); + range->start = range->end; } return unmapped_error; } -#ifdef CONFIG_ANON_VMA_NAME -static int madvise_vma_anon_name(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end, - unsigned long anon_name) +/* + * Any behaviour which results in changes to the vma->vm_flags needs to + * take mmap_lock for writing. Others, which simply traverse vmas, need + * to only take it for reading. + */ +static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior) { - int error; - - /* Only anonymous mappings can be named */ - if (vma->vm_file && !vma_is_anon_shmem(vma)) - return -EBADF; - - error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, - (struct anon_vma_name *)anon_name); + if (is_memory_failure(madv_behavior)) + return MADVISE_NO_LOCK; - /* - * madvise() returns EAGAIN if kernel resources, such as - * slab, are temporarily unavailable. 
- */ - if (error == -ENOMEM) - error = -EAGAIN; - return error; + switch (madv_behavior->behavior) { + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_COLD: + case MADV_PAGEOUT: + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + case MADV_COLLAPSE: + case MADV_GUARD_INSTALL: + case MADV_GUARD_REMOVE: + return MADVISE_MMAP_READ_LOCK; + case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: + case MADV_FREE: + return MADVISE_VMA_READ_LOCK; + default: + return MADVISE_MMAP_WRITE_LOCK; + } } -int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, - unsigned long len_in, struct anon_vma_name *anon_name) +static int madvise_lock(struct madvise_behavior *madv_behavior) { - unsigned long end; - unsigned long len; + struct mm_struct *mm = madv_behavior->mm; + enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior); - if (start & ~PAGE_MASK) - return -EINVAL; - len = (len_in + ~PAGE_MASK) & PAGE_MASK; + switch (lock_mode) { + case MADVISE_NO_LOCK: + break; + case MADVISE_MMAP_WRITE_LOCK: + if (mmap_write_lock_killable(mm)) + return -EINTR; + break; + case MADVISE_MMAP_READ_LOCK: + mmap_read_lock(mm); + break; + case MADVISE_VMA_READ_LOCK: + /* We will acquire the lock per-VMA in madvise_walk_vmas(). */ + break; + } - /* Check to see whether len was rounded up from small -ve to zero */ - if (len_in && !len) - return -EINVAL; + madv_behavior->lock_mode = lock_mode; + return 0; +} - end = start + len; - if (end < start) - return -EINVAL; +static void madvise_unlock(struct madvise_behavior *madv_behavior) +{ + struct mm_struct *mm = madv_behavior->mm; - if (end == start) - return 0; + switch (madv_behavior->lock_mode) { + case MADVISE_NO_LOCK: + return; + case MADVISE_MMAP_WRITE_LOCK: + mmap_write_unlock(mm); + break; + case MADVISE_MMAP_READ_LOCK: + mmap_read_unlock(mm); + break; + case MADVISE_VMA_READ_LOCK: + /* We will drop the lock per-VMA in madvise_walk_vmas(). 
*/ + break; + } - return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, - madvise_vma_anon_name); + madv_behavior->lock_mode = MADVISE_NO_LOCK; } -#endif /* CONFIG_ANON_VMA_NAME */ -static int madvise_lock(struct mm_struct *mm, int behavior) +static bool madvise_batch_tlb_flush(int behavior) { - if (is_memory_failure(behavior)) - return 0; - - if (madvise_need_mmap_write(behavior)) { - if (mmap_write_lock_killable(mm)) - return -EINTR; - } else { - mmap_read_lock(mm); + switch (behavior) { + case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: + case MADV_FREE: + return true; + default: + return false; } - return 0; } -static void madvise_unlock(struct mm_struct *mm, int behavior) +static void madvise_init_tlb(struct madvise_behavior *madv_behavior) { - if (is_memory_failure(behavior)) - return; + if (madvise_batch_tlb_flush(madv_behavior->behavior)) + tlb_gather_mmu(madv_behavior->tlb, madv_behavior->mm); +} - if (madvise_need_mmap_write(behavior)) - mmap_write_unlock(mm); - else - mmap_read_unlock(mm); +static void madvise_finish_tlb(struct madvise_behavior *madv_behavior) +{ + if (madvise_batch_tlb_flush(madv_behavior->behavior)) + tlb_finish_mmu(madv_behavior->tlb); } static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) @@ -1665,9 +1767,9 @@ static bool madvise_should_skip(unsigned long start, size_t len_in, return false; } -static bool is_madvise_populate(int behavior) +static bool is_madvise_populate(struct madvise_behavior *madv_behavior) { - switch (behavior) { + switch (madv_behavior->behavior) { case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: return true; @@ -1676,24 +1778,42 @@ static bool is_madvise_populate(int behavior) } } -static int madvise_do_behavior(struct mm_struct *mm, - unsigned long start, size_t len_in, int behavior) +/* + * untagged_addr_remote() assumes mmap_lock is already held. On + * architectures like x86 and RISC-V, tagging is tricky because each + * mm may have a different tagging mask. However, we might only hold + * the per-VMA lock (currently only local processes are supported), + * so untagged_addr is used to avoid the mmap_lock assertion for + * local processes. + */ +static inline unsigned long get_untagged_addr(struct mm_struct *mm, + unsigned long start) +{ + return current->mm == mm ? 
untagged_addr(start) : + untagged_addr_remote(mm, start); +} + +static int madvise_do_behavior(unsigned long start, size_t len_in, + struct madvise_behavior *madv_behavior) { struct blk_plug plug; - unsigned long end; int error; + struct madvise_behavior_range *range = &madv_behavior->range; + + if (is_memory_failure(madv_behavior)) { + range->start = start; + range->end = start + len_in; + return madvise_inject_error(madv_behavior); + } - if (is_memory_failure(behavior)) - return madvise_inject_error(behavior, start, start + len_in); - start = untagged_addr_remote(mm, start); - end = start + PAGE_ALIGN(len_in); + range->start = get_untagged_addr(madv_behavior->mm, start); + range->end = range->start + PAGE_ALIGN(len_in); blk_start_plug(&plug); - if (is_madvise_populate(behavior)) - error = madvise_populate(mm, start, end, behavior); + if (is_madvise_populate(madv_behavior)) + error = madvise_populate(madv_behavior); else - error = madvise_walk_vmas(mm, start, end, behavior, - madvise_vma_behavior); + error = madvise_walk_vmas(madv_behavior); blk_finish_plug(&plug); return error; } @@ -1773,14 +1893,22 @@ static int madvise_do_behavior(struct mm_struct *mm, int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { int error; + struct mmu_gather tlb; + struct madvise_behavior madv_behavior = { + .mm = mm, + .behavior = behavior, + .tlb = &tlb, + }; if (madvise_should_skip(start, len_in, behavior, &error)) return error; - error = madvise_lock(mm, behavior); + error = madvise_lock(&madv_behavior); if (error) return error; - error = madvise_do_behavior(mm, start, len_in, behavior); - madvise_unlock(mm, behavior); + madvise_init_tlb(&madv_behavior); + error = madvise_do_behavior(start, len_in, &madv_behavior); + madvise_finish_tlb(&madv_behavior); + madvise_unlock(&madv_behavior); return error; } @@ -1796,12 +1924,19 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, { ssize_t ret = 0; size_t total_len; + struct mmu_gather tlb; + struct madvise_behavior madv_behavior = { + .mm = mm, + .behavior = behavior, + .tlb = &tlb, + }; total_len = iov_iter_count(iter); - ret = madvise_lock(mm, behavior); + ret = madvise_lock(&madv_behavior); if (ret) return ret; + madvise_init_tlb(&madv_behavior); while (iov_iter_count(iter)) { unsigned long start = (unsigned long)iter_iov_addr(iter); @@ -1811,7 +1946,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, if (madvise_should_skip(start, len_in, behavior, &error)) ret = error; else - ret = madvise_do_behavior(mm, start, len_in, behavior); + ret = madvise_do_behavior(start, len_in, &madv_behavior); /* * An madvise operation is attempting to restart the syscall, * but we cannot proceed as it would not be correct to repeat @@ -1829,16 +1964,22 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, } /* Drop and reacquire lock to unwind race. */ - madvise_unlock(mm, behavior); - madvise_lock(mm, behavior); + madvise_finish_tlb(&madv_behavior); + madvise_unlock(&madv_behavior); + ret = madvise_lock(&madv_behavior); + if (ret) + goto out; + madvise_init_tlb(&madv_behavior); continue; } if (ret < 0) break; iov_iter_advance(iter, iter_iov_len(iter)); } - madvise_unlock(mm, behavior); + madvise_finish_tlb(&madv_behavior); + madvise_unlock(&madv_behavior); +out: ret = (total_len - iov_iter_count(iter)) ? 
: ret; return ret; @@ -1907,3 +2048,88 @@ free_iov: out: return ret; } + +#ifdef CONFIG_ANON_VMA_NAME + +#define ANON_VMA_NAME_MAX_LEN 80 +#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]" + +static inline bool is_valid_name_char(char ch) +{ + /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */ + return ch > 0x1f && ch < 0x7f && + !strchr(ANON_VMA_NAME_INVALID_CHARS, ch); +} + +static int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, struct anon_vma_name *anon_name) +{ + unsigned long end; + unsigned long len; + int error; + struct madvise_behavior madv_behavior = { + .mm = mm, + .behavior = __MADV_SET_ANON_VMA_NAME, + .anon_name = anon_name, + }; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = (len_in + ~PAGE_MASK) & PAGE_MASK; + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + return -EINVAL; + + end = start + len; + if (end < start) + return -EINVAL; + + if (end == start) + return 0; + + madv_behavior.range.start = start; + madv_behavior.range.end = end; + + error = madvise_lock(&madv_behavior); + if (error) + return error; + error = madvise_walk_vmas(&madv_behavior); + madvise_unlock(&madv_behavior); + + return error; +} + +int set_anon_vma_name(unsigned long addr, unsigned long size, + const char __user *uname) +{ + struct anon_vma_name *anon_name = NULL; + struct mm_struct *mm = current->mm; + int error; + + if (uname) { + char *name, *pch; + + name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); + if (IS_ERR(name)) + return PTR_ERR(name); + + for (pch = name; *pch != '\0'; pch++) { + if (!is_valid_name_char(*pch)) { + kfree(name); + return -EINVAL; + } + } + /* anon_vma has its own copy */ + anon_name = anon_vma_name_alloc(name); + kfree(name); + if (!anon_name) + return -ENOMEM; + } + + error = madvise_set_anon_name(mm, addr, size, anon_name); + anon_vma_name_put(anon_name); + + return error; +} +#endif diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index 2f8829b3541a..c193de6cb23a 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -129,7 +129,7 @@ static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end, pmd_t pmdval = pmdp_get_lockless(pmd); /* Do not split a huge pmd, present or migrated */ - if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) { + if (pmd_trans_huge(pmdval)) { WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval)); walk->action = ACTION_CONTINUE; } @@ -152,7 +152,7 @@ static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end, pud_t pudval = READ_ONCE(*pud); /* Do not split a huge pud */ - if (pud_trans_huge(pudval) || pud_devmap(pudval)) { + if (pud_trans_huge(pudval)) { WARN_ON(pud_write(pudval) || pud_dirty(pudval)); walk->action = ACTION_CONTINUE; } @@ -218,7 +218,7 @@ static void wp_clean_post_vma(struct mm_walk *walk) static int wp_clean_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { - unsigned long vm_flags = READ_ONCE(walk->vma->vm_flags); + vm_flags_t vm_flags = READ_ONCE(walk->vma->vm_flags); /* Skip non-applicable VMAs */ if ((vm_flags & (VM_SHARED | VM_MAYWRITE | VM_HUGETLB)) != diff --git a/mm/memblock.c b/mm/memblock.c index 284154445409..154f1d73b61f 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -18,6 +18,11 @@ #include <linux/memblock.h> #include <linux/mutex.h> +#ifdef CONFIG_KEXEC_HANDOVER +#include <linux/libfdt.h> +#include <linux/kexec_handover.h> +#endif /* CONFIG_KEXEC_HANDOVER */ + #include <asm/sections.h> #include 
<linux/io.h> @@ -107,6 +112,13 @@ unsigned long min_low_pfn; unsigned long max_pfn; unsigned long long max_possible_pfn; +#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH +/* When set to true, only allocate from MEMBLOCK_KHO_SCRATCH ranges */ +static bool kho_scratch_only; +#else +#define kho_scratch_only false +#endif + static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock; #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP @@ -166,6 +178,10 @@ bool __init_memblock memblock_has_mirror(void) static enum memblock_flags __init_memblock choose_memblock_flags(void) { + /* skip non-scratch memory for kho early boot allocations */ + if (kho_scratch_only) + return MEMBLOCK_KHO_SCRATCH; + return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; } @@ -457,7 +473,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, min(new_area_start, memblock.current_limit), new_alloc_size, PAGE_SIZE); - new_array = addr ? __va(addr) : NULL; + if (addr) { + /* The memory may not have been accepted, yet. */ + accept_memory(addr, new_alloc_size); + + new_array = __va(addr); + } else { + new_array = NULL; + } } if (!addr) { pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", @@ -492,7 +515,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, * needn't do it */ if (!use_slab) - BUG_ON(memblock_reserve(addr, new_alloc_size)); + BUG_ON(memblock_reserve_kern(addr, new_alloc_size)); /* Update slab flag */ *in_slab = use_slab; @@ -642,7 +665,7 @@ repeat: #ifdef CONFIG_NUMA WARN_ON(nid != memblock_get_region_node(rgn)); #endif - WARN_ON(flags != rgn->flags); + WARN_ON(flags != MEMBLOCK_NONE && flags != rgn->flags); nr_new++; if (insert) { if (start_rgn == -1) @@ -902,14 +925,15 @@ int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size) return memblock_remove_range(&memblock.reserved, base, size); } -int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +int __init_memblock __memblock_reserve(phys_addr_t base, phys_addr_t size, + int nid, enum memblock_flags flags) { phys_addr_t end = base + size - 1; - memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, - &base, &end, (void *)_RET_IP_); + memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__, + &base, &end, nid, flags, (void *)_RET_IP_); - return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0); + return memblock_add_range(&memblock.reserved, base, size, nid, flags); } #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP @@ -924,6 +948,40 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size) } #endif +#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH +__init void memblock_set_kho_scratch_only(void) +{ + kho_scratch_only = true; +} + +__init void memblock_clear_kho_scratch_only(void) +{ + kho_scratch_only = false; +} + +__init void memmap_init_kho_scratch_pages(void) +{ + phys_addr_t start, end; + unsigned long pfn; + int nid; + u64 i; + + if (!IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) + return; + + /* + * Initialize struct pages for free scratch memory. 
+ * The struct pages for reserved scratch memory will be set up in + * reserve_bootmem_region() + */ + __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, + MEMBLOCK_KHO_SCRATCH, &start, &end, &nid) { + for (pfn = PFN_UP(start); pfn < PFN_DOWN(end); pfn++) + init_deferred_page(pfn, nid); + } +} +#endif + /** * memblock_setclr_flag - set or clear flag for a memory region * @type: memblock type to set/clear flag for @@ -1049,6 +1107,36 @@ int __init_memblock memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t MEMBLOCK_RSRV_NOINIT); } +/** + * memblock_mark_kho_scratch - Mark a memory region as MEMBLOCK_KHO_SCRATCH. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Only memory regions marked with %MEMBLOCK_KHO_SCRATCH will be considered + * for allocations during early boot with kexec handover. + * + * Return: 0 on success, -errno on failure. + */ +__init int memblock_mark_kho_scratch(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(&memblock.memory, base, size, 1, + MEMBLOCK_KHO_SCRATCH); +} + +/** + * memblock_clear_kho_scratch - Clear MEMBLOCK_KHO_SCRATCH flag for a + * specified region. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return: 0 on success, -errno on failure. + */ +__init int memblock_clear_kho_scratch(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(&memblock.memory, base, size, 0, + MEMBLOCK_KHO_SCRATCH); +} + static bool should_skip_region(struct memblock_type *type, struct memblock_region *m, int nid, int flags) @@ -1080,6 +1168,13 @@ static bool should_skip_region(struct memblock_type *type, if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m)) return true; + /* + * In early alloc during kexec handover, we can only consider + * MEMBLOCK_KHO_SCRATCH regions for the allocations + */ + if ((flags & MEMBLOCK_KHO_SCRATCH) && !memblock_is_kho_scratch(m)) + return true; + return false; } @@ -1460,14 +1555,14 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, again: found = memblock_find_in_range_node(size, align, start, end, nid, flags); - if (found && !memblock_reserve(found, size)) + if (found && !__memblock_reserve(found, size, nid, MEMBLOCK_RSRV_KERN)) goto done; if (numa_valid_node(nid) && !exact_nid) { found = memblock_find_in_range_node(size, align, start, end, NUMA_NO_NODE, flags); - if (found && !memblock_reserve(found, size)) + if (found && !memblock_reserve_kern(found, size)) goto done; } @@ -1752,6 +1847,28 @@ phys_addr_t __init_memblock memblock_reserved_size(void) return memblock.reserved.total_size; } +phys_addr_t __init_memblock memblock_reserved_kern_size(phys_addr_t limit, int nid) +{ + struct memblock_region *r; + phys_addr_t total = 0; + + for_each_reserved_mem_region(r) { + phys_addr_t size = r->size; + + if (r->base > limit) + break; + + if (r->base + r->size > limit) + size = limit - r->base; + + if (nid == memblock_get_region_node(r) || !numa_valid_node(nid)) + if (r->flags & MEMBLOCK_RSRV_KERN) + total += size; + } + + return total; +} + /** * memblock_estimated_nr_free_pages - return estimated number of free pages * from memblock point of view @@ -2167,6 +2284,9 @@ static unsigned long __init __free_memory_core(phys_addr_t start, unsigned long start_pfn = PFN_UP(start); unsigned long end_pfn = PFN_DOWN(end); + if (!IS_ENABLED(CONFIG_HIGHMEM) && end_pfn > max_low_pfn) + end_pfn = max_low_pfn; + if (start_pfn >= end_pfn) return 0; @@ -2180,11 +2300,14 @@ static void __init 
memmap_init_reserved_pages(void) struct memblock_region *region; phys_addr_t start, end; int nid; + unsigned long max_reserved; /* * set nid on all reserved pages and also treat struct * pages for the NOMAP regions as PageReserved */ +repeat: + max_reserved = memblock.reserved.max; for_each_mem_region(region) { nid = memblock_get_region_node(region); start = region->base; @@ -2193,8 +2316,15 @@ static void __init memmap_init_reserved_pages(void) if (memblock_is_nomap(region)) reserve_bootmem_region(start, end, nid); - memblock_set_node(start, end, &memblock.reserved, nid); + memblock_set_node(start, region->size, &memblock.reserved, nid); } + /* + * 'max' is changed means memblock.reserved has been doubled its + * array, which may result a new reserved region before current + * 'start'. Now we should repeat the procedure to set its node id. + */ + if (max_reserved != memblock.reserved.max) + goto repeat; /* * initialize struct pages for reserved regions that don't have @@ -2269,6 +2399,7 @@ void __init memblock_free_all(void) free_unused_memmap(); reset_all_zones_managed_pages(); + memblock_clear_kho_scratch_only(); pages = free_low_memory_core_early(); totalram_pages_add(pages); } @@ -2366,6 +2497,189 @@ int reserve_mem_release_by_name(const char *name) return 1; } +#ifdef CONFIG_KEXEC_HANDOVER +#define MEMBLOCK_KHO_FDT "memblock" +#define MEMBLOCK_KHO_NODE_COMPATIBLE "memblock-v1" +#define RESERVE_MEM_KHO_NODE_COMPATIBLE "reserve-mem-v1" +static struct page *kho_fdt; + +static int reserve_mem_kho_finalize(struct kho_serialization *ser) +{ + int err = 0, i; + + for (i = 0; i < reserved_mem_count; i++) { + struct reserve_mem_table *map = &reserved_mem_table[i]; + + err |= kho_preserve_phys(map->start, map->size); + } + + err |= kho_preserve_folio(page_folio(kho_fdt)); + err |= kho_add_subtree(ser, MEMBLOCK_KHO_FDT, page_to_virt(kho_fdt)); + + return notifier_from_errno(err); +} + +static int reserve_mem_kho_notifier(struct notifier_block *self, + unsigned long cmd, void *v) +{ + switch (cmd) { + case KEXEC_KHO_FINALIZE: + return reserve_mem_kho_finalize((struct kho_serialization *)v); + case KEXEC_KHO_ABORT: + return NOTIFY_DONE; + default: + return NOTIFY_BAD; + } +} + +static struct notifier_block reserve_mem_kho_nb = { + .notifier_call = reserve_mem_kho_notifier, +}; + +static int __init prepare_kho_fdt(void) +{ + int err = 0, i; + void *fdt; + + kho_fdt = alloc_page(GFP_KERNEL); + if (!kho_fdt) + return -ENOMEM; + + fdt = page_to_virt(kho_fdt); + + err |= fdt_create(fdt, PAGE_SIZE); + err |= fdt_finish_reservemap(fdt); + + err |= fdt_begin_node(fdt, ""); + err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE); + for (i = 0; i < reserved_mem_count; i++) { + struct reserve_mem_table *map = &reserved_mem_table[i]; + + err |= fdt_begin_node(fdt, map->name); + err |= fdt_property_string(fdt, "compatible", RESERVE_MEM_KHO_NODE_COMPATIBLE); + err |= fdt_property(fdt, "start", &map->start, sizeof(map->start)); + err |= fdt_property(fdt, "size", &map->size, sizeof(map->size)); + err |= fdt_end_node(fdt); + } + err |= fdt_end_node(fdt); + + err |= fdt_finish(fdt); + + if (err) { + pr_err("failed to prepare memblock FDT for KHO: %d\n", err); + put_page(kho_fdt); + kho_fdt = NULL; + } + + return err; +} + +static int __init reserve_mem_init(void) +{ + int err; + + if (!kho_is_enabled() || !reserved_mem_count) + return 0; + + err = prepare_kho_fdt(); + if (err) + return err; + + err = register_kho_notifier(&reserve_mem_kho_nb); + if (err) { + put_page(kho_fdt); + kho_fdt = 
NULL; + } + + return err; +} +late_initcall(reserve_mem_init); + +static void *__init reserve_mem_kho_retrieve_fdt(void) +{ + phys_addr_t fdt_phys; + static void *fdt; + int err; + + if (fdt) + return fdt; + + err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys); + if (err) { + if (err != -ENOENT) + pr_warn("failed to retrieve FDT '%s' from KHO: %d\n", + MEMBLOCK_KHO_FDT, err); + return NULL; + } + + fdt = phys_to_virt(fdt_phys); + + err = fdt_node_check_compatible(fdt, 0, MEMBLOCK_KHO_NODE_COMPATIBLE); + if (err) { + pr_warn("FDT '%s' is incompatible with '%s': %d\n", + MEMBLOCK_KHO_FDT, MEMBLOCK_KHO_NODE_COMPATIBLE, err); + fdt = NULL; + } + + return fdt; +} + +static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size, + phys_addr_t align) +{ + int err, len_start, len_size, offset; + const phys_addr_t *p_start, *p_size; + const void *fdt; + + fdt = reserve_mem_kho_retrieve_fdt(); + if (!fdt) + return false; + + offset = fdt_subnode_offset(fdt, 0, name); + if (offset < 0) { + pr_warn("FDT '%s' has no child '%s': %d\n", + MEMBLOCK_KHO_FDT, name, offset); + return false; + } + err = fdt_node_check_compatible(fdt, offset, RESERVE_MEM_KHO_NODE_COMPATIBLE); + if (err) { + pr_warn("Node '%s' is incompatible with '%s': %d\n", + name, RESERVE_MEM_KHO_NODE_COMPATIBLE, err); + return false; + } + + p_start = fdt_getprop(fdt, offset, "start", &len_start); + p_size = fdt_getprop(fdt, offset, "size", &len_size); + if (!p_start || len_start != sizeof(*p_start) || !p_size || + len_size != sizeof(*p_size)) { + return false; + } + + if (*p_start & (align - 1)) { + pr_warn("KHO reserve-mem '%s' has wrong alignment (0x%lx, 0x%lx)\n", + name, (long)align, (long)*p_start); + return false; + } + + if (*p_size != size) { + pr_warn("KHO reserve-mem '%s' has wrong size (0x%lx != 0x%lx)\n", + name, (long)*p_size, (long)size); + return false; + } + + reserved_mem_add(*p_start, size, name); + pr_info("Revived memory reservation '%s' from KHO\n", name); + + return true; +} +#else +static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size, + phys_addr_t align) +{ + return false; +} +#endif /* CONFIG_KEXEC_HANDOVER */ + /* * Parse reserve_mem=nn:align:name */ @@ -2421,6 +2735,11 @@ static int __init reserve_mem(char *p) if (reserve_mem_find_by_name(name, &start, &tmp)) return -EBUSY; + /* Pick previous allocations up from KHO if available */ + if (reserve_mem_kho_revive(name, size, align)) + return 1; + + /* TODO: Allocation must be outside of scratch region */ start = memblock_phys_alloc(size, align); if (!start) return -ENOMEM; @@ -2438,6 +2757,8 @@ static const char * const flagname[] = { [ilog2(MEMBLOCK_NOMAP)] = "NOMAP", [ilog2(MEMBLOCK_DRIVER_MANAGED)] = "DRV_MNG", [ilog2(MEMBLOCK_RSRV_NOINIT)] = "RSV_NIT", + [ilog2(MEMBLOCK_RSRV_KERN)] = "RSV_KERN", + [ilog2(MEMBLOCK_KHO_SCRATCH)] = "KHO_SCRATCH", }; static int memblock_debug_show(struct seq_file *m, void *private) diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 8660908850dc..4b94731305b9 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -512,9 +512,9 @@ static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages) { /* pagein of a big page is an event. 
So, ignore page size */ if (nr_pages > 0) - __count_memcg_events(memcg, PGPGIN, 1); + count_memcg_events(memcg, PGPGIN, 1); else { - __count_memcg_events(memcg, PGPGOUT, 1); + count_memcg_events(memcg, PGPGOUT, 1); nr_pages = -nr_pages; /* for event */ } @@ -620,7 +620,7 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - swap_cgroup_record(folio, mem_cgroup_id(memcg), entry); + swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry); folio_unqueue_deferred_split(folio); folio->memcg_data = 0; @@ -689,7 +689,7 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long flags; local_irq_save(flags); - __count_memcg_events(memcg, PGPGOUT, pgpgout); + count_memcg_events(memcg, PGPGOUT, pgpgout); __this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory); memcg1_check_events(memcg, nid); local_irq_restore(flags); @@ -2198,8 +2198,7 @@ bool memcg1_alloc_events(struct mem_cgroup *memcg) void memcg1_free_events(struct mem_cgroup *memcg) { - if (memcg->events_percpu) - free_percpu(memcg->events_percpu); + free_percpu(memcg->events_percpu); } static int __init memcg1_init(void) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 421740f1bcdc..8dd7fbed5a94 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -29,6 +29,7 @@ #include <linux/page_counter.h> #include <linux/memcontrol.h> #include <linux/cgroup.h> +#include <linux/cpuset.h> #include <linux/sched/mm.h> #include <linux/shmem_fs.h> #include <linux/hugetlb.h> @@ -50,7 +51,6 @@ #include <linux/spinlock.h> #include <linux/fs.h> #include <linux/seq_file.h> -#include <linux/parser.h> #include <linux/vmpressure.h> #include <linux/memremap.h> #include <linux/mm_inline.h> @@ -95,6 +95,9 @@ static bool cgroup_memory_nokmem __ro_after_init; /* BPF memory accounting disabled? 
*/ static bool cgroup_memory_nobpf __ro_after_init; +static struct kmem_cache *memcg_cachep; +static struct kmem_cache *memcg_pn_cachep; + #ifdef CONFIG_CGROUP_WRITEBACK static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif @@ -129,8 +132,7 @@ bool mem_cgroup_kmem_disabled(void) return cgroup_memory_nokmem; } -static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, - unsigned int nr_pages); +static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages); static void obj_cgroup_release(struct percpu_ref *ref) { @@ -163,8 +165,16 @@ static void obj_cgroup_release(struct percpu_ref *ref) WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1)); nr_pages = nr_bytes >> PAGE_SHIFT; - if (nr_pages) - obj_cgroup_uncharge_pages(objcg, nr_pages); + if (nr_pages) { + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_objcg(objcg); + mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); + memcg1_account_kmem(memcg, -nr_pages); + if (!mem_cgroup_is_root(memcg)) + memcg_uncharge(memcg, nr_pages); + mem_cgroup_put(memcg); + } spin_lock_irqsave(&objcg_lock, flags); list_del(&objcg->list); @@ -492,8 +502,8 @@ struct memcg_vmstats_percpu { unsigned int stats_updates; /* Cached pointers for fast iteration in memcg_rstat_updated() */ - struct memcg_vmstats_percpu *parent; - struct memcg_vmstats *vmstats; + struct memcg_vmstats_percpu __percpu *parent_pcpu; + struct memcg_vmstats *vmstats; /* The above should fit a single cacheline for memcg_rstat_updated() */ @@ -520,7 +530,7 @@ struct memcg_vmstats { unsigned long events_pending[NR_MEMCG_EVENTS]; /* Stats updates since the last flush */ - atomic64_t stats_updates; + atomic_t stats_updates; }; /* @@ -544,60 +554,41 @@ static u64 flush_last_time; #define FLUSH_TIME (2UL*HZ) -/* - * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can - * not rely on this as part of an acquired spinlock_t lock. These functions are - * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion - * is sufficient. - */ -static void memcg_stats_lock(void) -{ - preempt_disable_nested(); - VM_WARN_ON_IRQS_ENABLED(); -} - -static void __memcg_stats_lock(void) -{ - preempt_disable_nested(); -} - -static void memcg_stats_unlock(void) -{ - preempt_enable_nested(); -} - - static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats) { - return atomic64_read(&vmstats->stats_updates) > + return atomic_read(&vmstats->stats_updates) > MEMCG_CHARGE_BATCH * num_online_cpus(); } -static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) +static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val, + int cpu) { + struct memcg_vmstats_percpu __percpu *statc_pcpu; struct memcg_vmstats_percpu *statc; - int cpu = smp_processor_id(); unsigned int stats_updates; if (!val) return; - cgroup_rstat_updated(memcg->css.cgroup, cpu); - statc = this_cpu_ptr(memcg->vmstats_percpu); - for (; statc; statc = statc->parent) { - stats_updates = READ_ONCE(statc->stats_updates) + abs(val); - WRITE_ONCE(statc->stats_updates, stats_updates); + css_rstat_updated(&memcg->css, cpu); + statc_pcpu = memcg->vmstats_percpu; + for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) { + statc = this_cpu_ptr(statc_pcpu); + /* + * If @memcg is already flushable then all its ancestors are + * flushable as well and also there is no need to increase + * stats_updates. 
+ */ + if (memcg_vmstats_needs_flush(statc->vmstats)) + break; + + stats_updates = this_cpu_add_return(statc_pcpu->stats_updates, + abs(val)); if (stats_updates < MEMCG_CHARGE_BATCH) continue; - /* - * If @memcg is already flush-able, increasing stats_updates is - * redundant. Avoid the overhead of the atomic update. - */ - if (!memcg_vmstats_needs_flush(statc->vmstats)) - atomic64_add(stats_updates, - &statc->vmstats->stats_updates); - WRITE_ONCE(statc->stats_updates, 0); + stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0); + atomic_add(stats_updates, &statc->vmstats->stats_updates); } } @@ -605,7 +596,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force) { bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats); - trace_memcg_flush_stats(memcg, atomic64_read(&memcg->vmstats->stats_updates), + trace_memcg_flush_stats(memcg, atomic_read(&memcg->vmstats->stats_updates), force, needs_flush); if (!force && !needs_flush) @@ -614,7 +605,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force) if (mem_cgroup_is_root(memcg)) WRITE_ONCE(flush_last_time, jiffies_64); - cgroup_rstat_flush(memcg->css.cgroup); + css_rstat_flush(&memcg->css); } /* @@ -687,15 +678,16 @@ static int memcg_state_val_in_pages(int idx, int val) } /** - * __mod_memcg_state - update cgroup memory statistics + * mod_memcg_state - update cgroup memory statistics * @memcg: the memory cgroup * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item * @val: delta to add to the counter, can be negative */ -void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, +void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val) { int i = memcg_stats_index(idx); + int cpu; if (mem_cgroup_disabled()) return; @@ -703,10 +695,14 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; - __this_cpu_add(memcg->vmstats_percpu->state[i], val); + cpu = get_cpu(); + + this_cpu_add(memcg->vmstats_percpu->state[i], val); val = memcg_state_val_in_pages(idx, val); - memcg_rstat_updated(memcg, val); + memcg_rstat_updated(memcg, val, cpu); trace_mod_memcg_state(memcg, idx, val); + + put_cpu(); } #ifdef CONFIG_MEMCG_V1 @@ -728,13 +724,14 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) } #endif -static void __mod_memcg_lruvec_state(struct lruvec *lruvec, +static void mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { struct mem_cgroup_per_node *pn; struct mem_cgroup *memcg; int i = memcg_stats_index(idx); + int cpu; if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; @@ -742,35 +739,19 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec, pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); memcg = pn->memcg; - /* - * The caller from rmap relies on disabled preemption because they never - * update their counter from in-interrupt context. For these two - * counters we check that the update is never performed from an - * interrupt context while other caller need to have disabled interrupt. 
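The reworked memcg_rstat_updated() above batches per-CPU update counts with this_cpu_add_return() and only folds a full batch into the shared atomic counter, stopping early once the hierarchy is already flushable. As a rough illustration of that batching idea, here is a self-contained userspace sketch; the BATCH constant, structure names and main() are this example's own, not the kernel's:

/*
 * Userspace sketch of "batch locally, publish atomically": each CPU
 * (here: one thread) accumulates its own update count and only folds
 * it into the shared counter once a batch threshold is crossed.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define BATCH 64

struct vmstats {
	_Atomic int stats_updates;	/* shared, read by the flusher */
};

struct vmstats_percpu {
	int stats_updates;		/* private to one CPU/thread */
	struct vmstats *vmstats;
};

static int needs_flush(struct vmstats *v, int nr_cpus)
{
	return atomic_load(&v->stats_updates) > BATCH * nr_cpus;
}

static void rstat_updated(struct vmstats_percpu *pc, int val)
{
	if (!val)
		return;
	/* stop early once the shared counter already demands a flush */
	if (needs_flush(pc->vmstats, 1))
		return;
	pc->stats_updates += abs(val);
	if (pc->stats_updates < BATCH)
		return;			/* cheap local path, no shared write */
	atomic_fetch_add(&pc->vmstats->stats_updates, pc->stats_updates);
	pc->stats_updates = 0;
}

int main(void)
{
	struct vmstats shared = { 0 };
	struct vmstats_percpu pc = { .vmstats = &shared };

	for (int i = 0; i < 200; i++)
		rstat_updated(&pc, 1);
	printf("shared=%d needs_flush=%d\n",
	       atomic_load(&shared.stats_updates), needs_flush(&shared, 1));
	return 0;
}

The kernel version repeats this for every ancestor via the new parent_pcpu pointer and hands the local batch over with this_cpu_xchg(), so a child's activity also makes its parents flushable.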
- */ - __memcg_stats_lock(); - if (IS_ENABLED(CONFIG_DEBUG_VM)) { - switch (idx) { - case NR_ANON_MAPPED: - case NR_FILE_MAPPED: - case NR_ANON_THPS: - WARN_ON_ONCE(!in_task()); - break; - default: - VM_WARN_ON_IRQS_ENABLED(); - } - } + cpu = get_cpu(); /* Update memcg */ - __this_cpu_add(memcg->vmstats_percpu->state[i], val); + this_cpu_add(memcg->vmstats_percpu->state[i], val); /* Update lruvec */ - __this_cpu_add(pn->lruvec_stats_percpu->state[i], val); + this_cpu_add(pn->lruvec_stats_percpu->state[i], val); val = memcg_state_val_in_pages(idx, val); - memcg_rstat_updated(memcg, val); + memcg_rstat_updated(memcg, val, cpu); trace_mod_memcg_lruvec_state(memcg, idx, val); - memcg_stats_unlock(); + + put_cpu(); } /** @@ -791,7 +772,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, /* Update memcg and lruvec */ if (!mem_cgroup_disabled()) - __mod_memcg_lruvec_state(lruvec, idx, val); + mod_memcg_lruvec_state(lruvec, idx, val); } void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, @@ -841,15 +822,16 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) } /** - * __count_memcg_events - account VM events in a cgroup + * count_memcg_events - account VM events in a cgroup * @memcg: the memory cgroup * @idx: the event item * @count: the number of events that occurred */ -void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, +void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { int i = memcg_events_index(idx); + int cpu; if (mem_cgroup_disabled()) return; @@ -857,11 +839,13 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; - memcg_stats_lock(); - __this_cpu_add(memcg->vmstats_percpu->events[i], count); - memcg_rstat_updated(memcg, count); + cpu = get_cpu(); + + this_cpu_add(memcg->vmstats_percpu->events[i], count); + memcg_rstat_updated(memcg, count, cpu); trace_count_memcg_events(memcg, idx, count); - memcg_stats_unlock(); + + put_cpu(); } unsigned long memcg_events(struct mem_cgroup *memcg, int event) @@ -1168,7 +1152,6 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, { struct mem_cgroup *iter; int ret = 0; - int i = 0; BUG_ON(mem_cgroup_is_root(memcg)); @@ -1178,10 +1161,9 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); while (!ret && (task = css_task_iter_next(&it))) { - /* Avoid potential softlockup warning */ - if ((++i & 1023) == 0) - cond_resched(); ret = fn(task, arg); + /* Avoid potential softlockup warning */ + cond_resched(); } css_task_iter_end(&it); if (ret) { @@ -1664,7 +1646,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * A few threads which were not waiting at mutex_lock_killable() can * fail to bail out. Therefore, check again after holding oom_lock. */ - ret = task_is_dying() || out_of_memory(&oc); + ret = out_of_memory(&oc); unlock: mutex_unlock(&oom_lock); @@ -1758,156 +1740,234 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) pr_cont(" are going to be killed due to memory.oom.group set\n"); } +/* + * The value of NR_MEMCG_STOCK is selected to keep the cached memcgs and their + * nr_pages in a single cacheline. This may change in future. 
+ */ +#define NR_MEMCG_STOCK 7 +#define FLUSHING_CACHED_CHARGE 0 struct memcg_stock_pcp { - localtry_lock_t stock_lock; - struct mem_cgroup *cached; /* this never be root cgroup */ - unsigned int nr_pages; + local_trylock_t lock; + uint8_t nr_pages[NR_MEMCG_STOCK]; + struct mem_cgroup *cached[NR_MEMCG_STOCK]; + struct work_struct work; + unsigned long flags; +}; + +static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = { + .lock = INIT_LOCAL_TRYLOCK(lock), +}; + +struct obj_stock_pcp { + local_trylock_t lock; + unsigned int nr_bytes; struct obj_cgroup *cached_objcg; struct pglist_data *cached_pgdat; - unsigned int nr_bytes; int nr_slab_reclaimable_b; int nr_slab_unreclaimable_b; struct work_struct work; unsigned long flags; -#define FLUSHING_CACHED_CHARGE 0 }; -static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { - .stock_lock = INIT_LOCALTRY_LOCK(stock_lock), + +static DEFINE_PER_CPU_ALIGNED(struct obj_stock_pcp, obj_stock) = { + .lock = INIT_LOCAL_TRYLOCK(lock), }; + static DEFINE_MUTEX(percpu_charge_mutex); -static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock); -static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, +static void drain_obj_stock(struct obj_stock_pcp *stock); +static bool obj_stock_flush_required(struct obj_stock_pcp *stock, struct mem_cgroup *root_memcg); /** * consume_stock: Try to consume stocked charge on this cpu. * @memcg: memcg to consume from. * @nr_pages: how many pages to charge. - * @gfp_mask: allocation mask. * - * The charges will only happen if @memcg matches the current cpu's memcg - * stock, and at least @nr_pages are available in that stock. Failure to - * service an allocation will refill the stock. + * Consume the cached charge if enough nr_pages are present otherwise return + * failure. Also return failure for charge request larger than + * MEMCG_CHARGE_BATCH or if the local lock is already taken. * * returns true if successful, false otherwise. */ -static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages, - gfp_t gfp_mask) +static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; - unsigned int stock_pages; - unsigned long flags; + uint8_t stock_pages; bool ret = false; + int i; - if (nr_pages > MEMCG_CHARGE_BATCH) + if (nr_pages > MEMCG_CHARGE_BATCH || + !local_trylock(&memcg_stock.lock)) return ret; - if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) { - if (!gfpflags_allow_spinning(gfp_mask)) - return ret; - localtry_lock_irqsave(&memcg_stock.stock_lock, flags); - } - stock = this_cpu_ptr(&memcg_stock); - stock_pages = READ_ONCE(stock->nr_pages); - if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) { - WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages); - ret = true; + + for (i = 0; i < NR_MEMCG_STOCK; ++i) { + if (memcg != READ_ONCE(stock->cached[i])) + continue; + + stock_pages = READ_ONCE(stock->nr_pages[i]); + if (stock_pages >= nr_pages) { + WRITE_ONCE(stock->nr_pages[i], stock_pages - nr_pages); + ret = true; + } + break; } - localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); + local_unlock(&memcg_stock.lock); return ret; } +static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_memsw_account()) + page_counter_uncharge(&memcg->memsw, nr_pages); +} + /* * Returns stocks cached in percpu and reset cached information. 
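The new memcg_stock_pcp above caches up to NR_MEMCG_STOCK (7) memcgs with uint8_t page counts so the cached memcgs and their counts share a cacheline, and consume_stock() now simply gives up when local_trylock() fails instead of spinning with IRQs disabled. A minimal userspace model of the slot lookup and refill follows; there is no locking here, and the slot count and names are invented for the example:

/*
 * Sketch of a small per-CPU charge cache: a fixed array of
 * (owner, nr_pages) slots consulted before touching shared counters.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_SLOTS 7

struct stock {
	void *cached[NR_SLOTS];		/* owning group for each slot */
	uint8_t nr_pages[NR_SLOTS];	/* cached, pre-charged pages */
};

/* Try to satisfy a charge from the cache; false means "go slow path". */
static bool consume_stock(struct stock *s, void *group, unsigned int nr)
{
	for (int i = 0; i < NR_SLOTS; i++) {
		if (s->cached[i] != group)
			continue;
		if (s->nr_pages[i] >= nr) {
			s->nr_pages[i] -= nr;
			return true;
		}
		break;			/* found the slot but not enough stock */
	}
	return false;
}

/* Park leftover charge in a matching or empty slot (eviction omitted). */
static void refill_stock(struct stock *s, void *group, unsigned int nr)
{
	for (int i = 0; i < NR_SLOTS; i++) {
		if (s->cached[i] == group || !s->cached[i]) {
			s->cached[i] = group;
			s->nr_pages[i] += nr;
			return;
		}
	}
}

int main(void)
{
	struct stock s = { 0 };
	int group_a;			/* stand-in for a memcg */

	refill_stock(&s, &group_a, 32);
	printf("hit=%d\n", consume_stock(&s, &group_a, 8));	/* 1 */
	printf("hit=%d\n", consume_stock(&s, &group_a, 100));	/* 0 */
	return 0;
}

In the patch itself the refill path, shown later, evicts a random occupied slot with get_random_u32_below(NR_MEMCG_STOCK) when all seven are taken, and anything that cannot be stocked is uncharged directly.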
*/ -static void drain_stock(struct memcg_stock_pcp *stock) +static void drain_stock(struct memcg_stock_pcp *stock, int i) { - unsigned int stock_pages = READ_ONCE(stock->nr_pages); - struct mem_cgroup *old = READ_ONCE(stock->cached); + struct mem_cgroup *old = READ_ONCE(stock->cached[i]); + uint8_t stock_pages; if (!old) return; + stock_pages = READ_ONCE(stock->nr_pages[i]); if (stock_pages) { - page_counter_uncharge(&old->memory, stock_pages); - if (do_memsw_account()) - page_counter_uncharge(&old->memsw, stock_pages); - - WRITE_ONCE(stock->nr_pages, 0); + memcg_uncharge(old, stock_pages); + WRITE_ONCE(stock->nr_pages[i], 0); } css_put(&old->css); - WRITE_ONCE(stock->cached, NULL); + WRITE_ONCE(stock->cached[i], NULL); +} + +static void drain_stock_fully(struct memcg_stock_pcp *stock) +{ + int i; + + for (i = 0; i < NR_MEMCG_STOCK; ++i) + drain_stock(stock, i); } -static void drain_local_stock(struct work_struct *dummy) +static void drain_local_memcg_stock(struct work_struct *dummy) { struct memcg_stock_pcp *stock; - struct obj_cgroup *old = NULL; - unsigned long flags; - /* - * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs. - * drain_stock races is that we always operate on local CPU stock - * here with IRQ disabled - */ - localtry_lock_irqsave(&memcg_stock.stock_lock, flags); + if (WARN_ONCE(!in_task(), "drain in non-task context")) + return; + + local_lock(&memcg_stock.lock); stock = this_cpu_ptr(&memcg_stock); - old = drain_obj_stock(stock); - drain_stock(stock); + drain_stock_fully(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); - localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); - obj_cgroup_put(old); + local_unlock(&memcg_stock.lock); } -/* - * Cache charges(val) to local per_cpu area. - * This will be consumed by consume_stock() function, later. - */ -static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +static void drain_local_obj_stock(struct work_struct *dummy) { - struct memcg_stock_pcp *stock; - unsigned int stock_pages; + struct obj_stock_pcp *stock; - stock = this_cpu_ptr(&memcg_stock); - if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */ - drain_stock(stock); - css_get(&memcg->css); - WRITE_ONCE(stock->cached, memcg); - } - stock_pages = READ_ONCE(stock->nr_pages) + nr_pages; - WRITE_ONCE(stock->nr_pages, stock_pages); + if (WARN_ONCE(!in_task(), "drain in non-task context")) + return; - if (stock_pages > MEMCG_CHARGE_BATCH) - drain_stock(stock); + local_lock(&obj_stock.lock); + + stock = this_cpu_ptr(&obj_stock); + drain_obj_stock(stock); + clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); + + local_unlock(&obj_stock.lock); } static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { - unsigned long flags; + struct memcg_stock_pcp *stock; + struct mem_cgroup *cached; + uint8_t stock_pages; + bool success = false; + int empty_slot = -1; + int i; - if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) { + /* + * For now limit MEMCG_CHARGE_BATCH to 127 and less. In future if we + * decide to increase it more than 127 then we will need more careful + * handling of nr_pages[] in struct memcg_stock_pcp. + */ + BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S8_MAX); + + VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg)); + + if (nr_pages > MEMCG_CHARGE_BATCH || + !local_trylock(&memcg_stock.lock)) { /* - * In case of unlikely failure to lock percpu stock_lock - * uncharge memcg directly. 
+ * In case of larger than batch refill or unlikely failure to + * lock the percpu memcg_stock.lock, uncharge memcg directly. */ - if (mem_cgroup_is_root(memcg)) - return; - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_memsw_account()) - page_counter_uncharge(&memcg->memsw, nr_pages); + memcg_uncharge(memcg, nr_pages); return; } - __refill_stock(memcg, nr_pages); - localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); + + stock = this_cpu_ptr(&memcg_stock); + for (i = 0; i < NR_MEMCG_STOCK; ++i) { + cached = READ_ONCE(stock->cached[i]); + if (!cached && empty_slot == -1) + empty_slot = i; + if (memcg == READ_ONCE(stock->cached[i])) { + stock_pages = READ_ONCE(stock->nr_pages[i]) + nr_pages; + WRITE_ONCE(stock->nr_pages[i], stock_pages); + if (stock_pages > MEMCG_CHARGE_BATCH) + drain_stock(stock, i); + success = true; + break; + } + } + + if (!success) { + i = empty_slot; + if (i == -1) { + i = get_random_u32_below(NR_MEMCG_STOCK); + drain_stock(stock, i); + } + css_get(&memcg->css); + WRITE_ONCE(stock->cached[i], memcg); + WRITE_ONCE(stock->nr_pages[i], nr_pages); + } + + local_unlock(&memcg_stock.lock); +} + +static bool is_memcg_drain_needed(struct memcg_stock_pcp *stock, + struct mem_cgroup *root_memcg) +{ + struct mem_cgroup *memcg; + bool flush = false; + int i; + + rcu_read_lock(); + for (i = 0; i < NR_MEMCG_STOCK; ++i) { + memcg = READ_ONCE(stock->cached[i]); + if (!memcg) + continue; + + if (READ_ONCE(stock->nr_pages[i]) && + mem_cgroup_is_descendant(memcg, root_memcg)) { + flush = true; + break; + } + } + rcu_read_unlock(); + return flush; } /* @@ -1930,25 +1990,27 @@ void drain_all_stock(struct mem_cgroup *root_memcg) migrate_disable(); curcpu = smp_processor_id(); for_each_online_cpu(cpu) { - struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); - struct mem_cgroup *memcg; - bool flush = false; + struct memcg_stock_pcp *memcg_st = &per_cpu(memcg_stock, cpu); + struct obj_stock_pcp *obj_st = &per_cpu(obj_stock, cpu); - rcu_read_lock(); - memcg = READ_ONCE(stock->cached); - if (memcg && READ_ONCE(stock->nr_pages) && - mem_cgroup_is_descendant(memcg, root_memcg)) - flush = true; - else if (obj_stock_flush_required(stock, root_memcg)) - flush = true; - rcu_read_unlock(); + if (!test_bit(FLUSHING_CACHED_CHARGE, &memcg_st->flags) && + is_memcg_drain_needed(memcg_st, root_memcg) && + !test_and_set_bit(FLUSHING_CACHED_CHARGE, + &memcg_st->flags)) { + if (cpu == curcpu) + drain_local_memcg_stock(&memcg_st->work); + else if (!cpu_is_isolated(cpu)) + schedule_work_on(cpu, &memcg_st->work); + } - if (flush && - !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { + if (!test_bit(FLUSHING_CACHED_CHARGE, &obj_st->flags) && + obj_stock_flush_required(obj_st, root_memcg) && + !test_and_set_bit(FLUSHING_CACHED_CHARGE, + &obj_st->flags)) { if (cpu == curcpu) - drain_local_stock(&stock->work); + drain_local_obj_stock(&obj_st->work); else if (!cpu_is_isolated(cpu)) - schedule_work_on(cpu, &stock->work); + schedule_work_on(cpu, &obj_st->work); } } migrate_enable(); @@ -1957,19 +2019,9 @@ void drain_all_stock(struct mem_cgroup *root_memcg) static int memcg_hotplug_cpu_dead(unsigned int cpu) { - struct memcg_stock_pcp *stock; - struct obj_cgroup *old; - unsigned long flags; - - stock = &per_cpu(memcg_stock, cpu); - - /* drain_obj_stock requires stock_lock */ - localtry_lock_irqsave(&memcg_stock.stock_lock, flags); - old = drain_obj_stock(stock); - localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); - - drain_stock(stock); - obj_cgroup_put(old); + /* no need for 
the local lock */ + drain_obj_stock(&per_cpu(obj_stock, cpu)); + drain_stock_fully(&per_cpu(memcg_stock, cpu)); return 0; } @@ -2259,7 +2311,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long pflags; retry: - if (consume_stock(memcg, nr_pages, gfp_mask)) + if (consume_stock(memcg, nr_pages)) return 0; if (!gfpflags_allow_spinning(gfp_mask)) @@ -2460,17 +2512,48 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) folio->memcg_data = (unsigned long)memcg; } -static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg, +#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC +static inline void account_slab_nmi_safe(struct mem_cgroup *memcg, + struct pglist_data *pgdat, + enum node_stat_item idx, int nr) +{ + struct lruvec *lruvec; + + if (likely(!in_nmi())) { + lruvec = mem_cgroup_lruvec(memcg, pgdat); + mod_memcg_lruvec_state(lruvec, idx, nr); + } else { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id]; + + /* preemption is disabled in_nmi(). */ + css_rstat_updated(&memcg->css, smp_processor_id()); + if (idx == NR_SLAB_RECLAIMABLE_B) + atomic_add(nr, &pn->slab_reclaimable); + else + atomic_add(nr, &pn->slab_unreclaimable); + } +} +#else +static inline void account_slab_nmi_safe(struct mem_cgroup *memcg, + struct pglist_data *pgdat, + enum node_stat_item idx, int nr) +{ + struct lruvec *lruvec; + + lruvec = mem_cgroup_lruvec(memcg, pgdat); + mod_memcg_lruvec_state(lruvec, idx, nr); +} +#endif + +static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) { struct mem_cgroup *memcg; - struct lruvec *lruvec; rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); - lruvec = mem_cgroup_lruvec(memcg, pgdat); - __mod_memcg_lruvec_state(lruvec, idx, nr); + account_slab_nmi_safe(memcg, pgdat, idx, nr); rcu_read_unlock(); } @@ -2595,6 +2678,9 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void) struct mem_cgroup *memcg; struct obj_cgroup *objcg; + if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi()) + return NULL; + if (in_task()) { memcg = current->active_memcg; if (unlikely(memcg)) @@ -2657,6 +2743,24 @@ struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) return objcg; } +#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC +static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val) +{ + if (likely(!in_nmi())) { + mod_memcg_state(memcg, MEMCG_KMEM, val); + } else { + /* preemption is disabled in_nmi(). 
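The account_*_nmi_safe() helpers added in this hunk keep the usual per-CPU path outside NMI context, but from an NMI they only do what is safe there: mark the css as updated and park the delta in a plain atomic that a later flush folds back in (flush_nmi_stats(), further down in the patch). Below is a freestanding sketch of that defer-to-an-atomic idea; the unsafe_ctx flag stands in for in_nmi() and all names are the example's own:

/*
 * Sketch: update a cheap unsynchronized counter when it is safe to do
 * so, otherwise park the delta in an atomic side counter and let a
 * later flush fold it back.  "Unsafe context" is simulated by a flag.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct counters {
	long state;			/* normally updated locklessly per CPU */
	_Atomic long nmi_side;		/* deltas deferred from unsafe context */
};

static void account(struct counters *c, long delta, bool unsafe_ctx)
{
	if (!unsafe_ctx)
		c->state += delta;	/* fast path, like mod_memcg_state() */
	else
		atomic_fetch_add(&c->nmi_side, delta);
}

/* Called from the periodic flush: merge deferred deltas exactly once. */
static void flush_side(struct counters *c)
{
	long pending = atomic_exchange(&c->nmi_side, 0);

	c->state += pending;
}

int main(void)
{
	struct counters c = { 0 };

	account(&c, 4, false);
	account(&c, 2, true);		/* e.g. an allocation from NMI */
	flush_side(&c);
	printf("state=%ld side=%ld\n", c.state, atomic_load(&c.nmi_side));
	return 0;
}

The atomic_exchange() on flush mirrors how flush_nmi_stats() drains kmem_stat and the per-node slab counters with atomic_xchg().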
*/ + css_rstat_updated(&memcg->css, smp_processor_id()); + atomic_add(val, &memcg->kmem_stat); + } +} +#else +static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val) +{ + mod_memcg_state(memcg, MEMCG_KMEM, val); +} +#endif + /* * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg * @objcg: object cgroup to uncharge @@ -2669,7 +2773,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, memcg = get_mem_cgroup_from_objcg(objcg); - mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); + account_kmem_nmi_safe(memcg, -nr_pages); memcg1_account_kmem(memcg, -nr_pages); if (!mem_cgroup_is_root(memcg)) refill_stock(memcg, nr_pages); @@ -2697,7 +2801,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, if (ret) goto out; - mod_memcg_state(memcg, MEMCG_KMEM, nr_pages); + account_kmem_nmi_safe(memcg, nr_pages); memcg1_account_kmem(memcg, nr_pages); out: css_put(&memcg->css); @@ -2765,50 +2869,27 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) obj_cgroup_put(objcg); } -/* Replace the stock objcg with objcg, return the old objcg */ -static struct obj_cgroup *replace_stock_objcg(struct memcg_stock_pcp *stock, - struct obj_cgroup *objcg) -{ - struct obj_cgroup *old = NULL; - - old = drain_obj_stock(stock); - obj_cgroup_get(objcg); - stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) - ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; - WRITE_ONCE(stock->cached_objcg, objcg); - return old; -} - -static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, - enum node_stat_item idx, int nr) +static void __account_obj_stock(struct obj_cgroup *objcg, + struct obj_stock_pcp *stock, int nr, + struct pglist_data *pgdat, enum node_stat_item idx) { - struct memcg_stock_pcp *stock; - struct obj_cgroup *old = NULL; - unsigned long flags; int *bytes; - localtry_lock_irqsave(&memcg_stock.stock_lock, flags); - stock = this_cpu_ptr(&memcg_stock); - /* * Save vmstat data in stock and skip vmstat array update unless - * accumulating over a page of vmstat data or when pgdat or idx - * changes. + * accumulating over a page of vmstat data or when pgdat changes. 
*/ - if (READ_ONCE(stock->cached_objcg) != objcg) { - old = replace_stock_objcg(stock, objcg); - stock->cached_pgdat = pgdat; - } else if (stock->cached_pgdat != pgdat) { + if (stock->cached_pgdat != pgdat) { /* Flush the existing cached vmstat data */ struct pglist_data *oldpg = stock->cached_pgdat; if (stock->nr_slab_reclaimable_b) { - __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, + mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, stock->nr_slab_reclaimable_b); stock->nr_slab_reclaimable_b = 0; } if (stock->nr_slab_unreclaimable_b) { - __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, + mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, stock->nr_slab_unreclaimable_b); stock->nr_slab_unreclaimable_b = 0; } @@ -2834,37 +2915,38 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, } } if (nr) - __mod_objcg_mlstate(objcg, pgdat, idx, nr); - - localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); - obj_cgroup_put(old); + mod_objcg_mlstate(objcg, pgdat, idx, nr); } -static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) +static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, + struct pglist_data *pgdat, enum node_stat_item idx) { - struct memcg_stock_pcp *stock; - unsigned long flags; + struct obj_stock_pcp *stock; bool ret = false; - localtry_lock_irqsave(&memcg_stock.stock_lock, flags); + if (!local_trylock(&obj_stock.lock)) + return ret; - stock = this_cpu_ptr(&memcg_stock); + stock = this_cpu_ptr(&obj_stock); if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { stock->nr_bytes -= nr_bytes; ret = true; + + if (pgdat) + __account_obj_stock(objcg, stock, nr_bytes, pgdat, idx); } - localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); + local_unlock(&obj_stock.lock); return ret; } -static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) +static void drain_obj_stock(struct obj_stock_pcp *stock) { struct obj_cgroup *old = READ_ONCE(stock->cached_objcg); if (!old) - return NULL; + return; if (stock->nr_bytes) { unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; @@ -2877,7 +2959,8 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); memcg1_account_kmem(memcg, -nr_pages); - __refill_stock(memcg, nr_pages); + if (!mem_cgroup_is_root(memcg)) + memcg_uncharge(memcg, nr_pages); css_put(&memcg->css); } @@ -2901,13 +2984,13 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) */ if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) { if (stock->nr_slab_reclaimable_b) { - __mod_objcg_mlstate(old, stock->cached_pgdat, + mod_objcg_mlstate(old, stock->cached_pgdat, NR_SLAB_RECLAIMABLE_B, stock->nr_slab_reclaimable_b); stock->nr_slab_reclaimable_b = 0; } if (stock->nr_slab_unreclaimable_b) { - __mod_objcg_mlstate(old, stock->cached_pgdat, + mod_objcg_mlstate(old, stock->cached_pgdat, NR_SLAB_UNRECLAIMABLE_B, stock->nr_slab_unreclaimable_b); stock->nr_slab_unreclaimable_b = 0; @@ -2916,63 +2999,76 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) } WRITE_ONCE(stock->cached_objcg, NULL); - /* - * The `old' objects needs to be released by the caller via - * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. 
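consume_obj_stock() above and refill_obj_stock() just below keep a byte-granular reserve on top of the page-granular charge: sub-page requests are served from nr_bytes when possible, otherwise whole pages are charged and the remainder is stocked for the next allocation. A tiny userspace model of that bytes-on-pages accounting follows; the global counters and names are illustrative, and the real code also tracks the owning objcg and cached vmstat deltas:

/*
 * Sketch of byte-level charging layered on page-level charging: charge
 * whole pages upward, keep the unused remainder as a local byte stock.
 */
#include <stdio.h>

#define PAGE_SIZE 4096u

static unsigned int charged_pages;	/* what the page counter would see */
static unsigned int stock_bytes;	/* local sub-page reserve */

static void charge_bytes(unsigned int size)
{
	unsigned int nr_pages = size / PAGE_SIZE;
	unsigned int nr_bytes = size % PAGE_SIZE;

	if (stock_bytes >= size) {	/* consume_obj_stock() equivalent */
		stock_bytes -= size;
		return;
	}
	if (nr_bytes)			/* round up to whole pages */
		nr_pages++;
	charged_pages += nr_pages;	/* page-level charge goes upward */
	if (nr_bytes)			/* remainder becomes local stock */
		stock_bytes += PAGE_SIZE - nr_bytes;
}

int main(void)
{
	charge_bytes(100);	/* charges 1 page, stocks 3996 bytes */
	charge_bytes(200);	/* served entirely from the byte stock */
	printf("pages=%u stock=%u\n", charged_pages, stock_bytes);
	return 0;
}

When the trylock on obj_stock.lock fails, the patch instead flushes whole pages immediately and parks the leftover bytes in objcg->nr_charged_bytes with atomic_add().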
- */ - return old; + obj_cgroup_put(old); } -static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, +static bool obj_stock_flush_required(struct obj_stock_pcp *stock, struct mem_cgroup *root_memcg) { struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg); struct mem_cgroup *memcg; + bool flush = false; + rcu_read_lock(); if (objcg) { memcg = obj_cgroup_memcg(objcg); if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) - return true; + flush = true; } + rcu_read_unlock(); - return false; + return flush; } static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, - bool allow_uncharge) + bool allow_uncharge, int nr_acct, struct pglist_data *pgdat, + enum node_stat_item idx) { - struct memcg_stock_pcp *stock; - struct obj_cgroup *old = NULL; - unsigned long flags; + struct obj_stock_pcp *stock; unsigned int nr_pages = 0; - localtry_lock_irqsave(&memcg_stock.stock_lock, flags); + if (!local_trylock(&obj_stock.lock)) { + if (pgdat) + mod_objcg_mlstate(objcg, pgdat, idx, nr_bytes); + nr_pages = nr_bytes >> PAGE_SHIFT; + nr_bytes = nr_bytes & (PAGE_SIZE - 1); + atomic_add(nr_bytes, &objcg->nr_charged_bytes); + goto out; + } - stock = this_cpu_ptr(&memcg_stock); + stock = this_cpu_ptr(&obj_stock); if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ - old = replace_stock_objcg(stock, objcg); + drain_obj_stock(stock); + obj_cgroup_get(objcg); + stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) + ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; + WRITE_ONCE(stock->cached_objcg, objcg); + allow_uncharge = true; /* Allow uncharge when objcg changes */ } stock->nr_bytes += nr_bytes; + if (pgdat) + __account_obj_stock(objcg, stock, nr_acct, pgdat, idx); + if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) { nr_pages = stock->nr_bytes >> PAGE_SHIFT; stock->nr_bytes &= (PAGE_SIZE - 1); } - localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); - obj_cgroup_put(old); - + local_unlock(&obj_stock.lock); +out: if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); } -int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) +static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t size, + struct pglist_data *pgdat, enum node_stat_item idx) { unsigned int nr_pages, nr_bytes; int ret; - if (consume_obj_stock(objcg, size)) + if (likely(consume_obj_stock(objcg, size, pgdat, idx))) return 0; /* @@ -3005,15 +3101,21 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) nr_pages += 1; ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); - if (!ret && nr_bytes) - refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false); + if (!ret && (nr_bytes || pgdat)) + refill_obj_stock(objcg, nr_bytes ? 
PAGE_SIZE - nr_bytes : 0, + false, size, pgdat, idx); return ret; } +int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) +{ + return obj_cgroup_charge_account(objcg, gfp, size, NULL, 0); +} + void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) { - refill_obj_stock(objcg, size, true); + refill_obj_stock(objcg, size, true, 0, NULL, 0); } static inline size_t obj_full_size(struct kmem_cache *s) @@ -3065,23 +3167,32 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, return false; } - if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s))) - return false; - for (i = 0; i < size; i++) { slab = virt_to_slab(p[i]); if (!slab_obj_exts(slab) && alloc_slab_obj_exts(slab, s, flags, false)) { - obj_cgroup_uncharge(objcg, obj_full_size(s)); continue; } + /* + * if we fail and size is 1, memcg_alloc_abort_single() will + * just free the object, which is ok as we have not assigned + * objcg to its obj_ext yet + * + * for larger sizes, kmem_cache_free_bulk() will uncharge + * any objects that were already charged and obj_ext assigned + * + * TODO: we could batch this until slab_pgdat(slab) changes + * between iterations, with a more complicated undo + */ + if (obj_cgroup_charge_account(objcg, flags, obj_full_size(s), + slab_pgdat(slab), cache_vmstat_idx(s))) + return false; + off = obj_to_index(s, slab, p[i]); obj_cgroup_get(objcg); slab_obj_exts(slab)[off].objcg = objcg; - mod_objcg_state(objcg, slab_pgdat(slab), - cache_vmstat_idx(s), obj_full_size(s)); } return true; @@ -3090,6 +3201,8 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects, struct slabobj_ext *obj_exts) { + size_t obj_size = obj_full_size(s); + for (int i = 0; i < objects; i++) { struct obj_cgroup *objcg; unsigned int off; @@ -3100,9 +3213,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, continue; obj_exts[off].objcg = NULL; - obj_cgroup_uncharge(objcg, obj_full_size(s)); - mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), - -obj_full_size(s)); + refill_obj_stock(objcg, obj_size, true, -obj_size, + slab_pgdat(slab), cache_vmstat_idx(s)); obj_cgroup_put(objcg); } } @@ -3544,7 +3656,8 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; - pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node); + pn = kmem_cache_alloc_node(memcg_pn_cachep, GFP_KERNEL | __GFP_ZERO, + node); if (!pn) return false; @@ -3591,13 +3704,14 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) { - struct memcg_vmstats_percpu *statc, *pstatc; + struct memcg_vmstats_percpu *statc; + struct memcg_vmstats_percpu __percpu *pstatc_pcpu; struct mem_cgroup *memcg; int node, cpu; int __maybe_unused i; long error; - memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); + memcg = kmem_cache_zalloc(memcg_cachep, GFP_KERNEL); if (!memcg) return ERR_PTR(-ENOMEM); @@ -3622,9 +3736,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) for_each_possible_cpu(cpu) { if (parent) - pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu); + pstatc_pcpu = parent->vmstats_percpu; statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); - statc->parent = parent ? pstatc : NULL; + statc->parent_pcpu = parent ? 
pstatc_pcpu : NULL; statc->vmstats = memcg->vmstats; } @@ -3640,7 +3754,10 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) INIT_LIST_HEAD(&memcg->memory_peaks); INIT_LIST_HEAD(&memcg->swap_peaks); spin_lock_init(&memcg->peaks_lock); - memcg->socket_pressure = jiffies; + memcg->socket_pressure = get_jiffies_64(); +#if BITS_PER_LONG < 64 + seqlock_init(&memcg->socket_pressure_seqlock); +#endif memcg1_memcg_init(memcg); memcg->kmemcg_id = -1; INIT_LIST_HEAD(&memcg->objcg_list); @@ -3898,6 +4015,53 @@ static void mem_cgroup_stat_aggregate(struct aggregate_control *ac) } } +#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC +static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent, + int cpu) +{ + int nid; + + if (atomic_read(&memcg->kmem_stat)) { + int kmem = atomic_xchg(&memcg->kmem_stat, 0); + int index = memcg_stats_index(MEMCG_KMEM); + + memcg->vmstats->state[index] += kmem; + if (parent) + parent->vmstats->state_pending[index] += kmem; + } + + for_each_node_state(nid, N_MEMORY) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + struct lruvec_stats *lstats = pn->lruvec_stats; + struct lruvec_stats *plstats = NULL; + + if (parent) + plstats = parent->nodeinfo[nid]->lruvec_stats; + + if (atomic_read(&pn->slab_reclaimable)) { + int slab = atomic_xchg(&pn->slab_reclaimable, 0); + int index = memcg_stats_index(NR_SLAB_RECLAIMABLE_B); + + lstats->state[index] += slab; + if (plstats) + plstats->state_pending[index] += slab; + } + if (atomic_read(&pn->slab_unreclaimable)) { + int slab = atomic_xchg(&pn->slab_unreclaimable, 0); + int index = memcg_stats_index(NR_SLAB_UNRECLAIMABLE_B); + + lstats->state[index] += slab; + if (plstats) + plstats->state_pending[index] += slab; + } + } +} +#else +static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent, + int cpu) +{} +#endif + static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); @@ -3906,6 +4070,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) struct aggregate_control ac; int nid; + flush_nmi_stats(memcg, parent, cpu); + statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); ac = (struct aggregate_control) { @@ -3955,8 +4121,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) } WRITE_ONCE(statc->stats_updates, 0); /* We are in a per-cpu loop here, only do the atomic write once */ - if (atomic64_read(&memcg->vmstats->stats_updates)) - atomic64_set(&memcg->vmstats->stats_updates, 0); + if (atomic_read(&memcg->vmstats->stats_updates)) + atomic_set(&memcg->vmstats->stats_updates, 0); } static void mem_cgroup_fork(struct task_struct *task) @@ -4197,6 +4363,9 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, page_counter_set_high(&memcg->memory, high); + if (of->file->f_flags & O_NONBLOCK) + goto out; + for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); unsigned long reclaimed; @@ -4219,7 +4388,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, if (!reclaimed && !nr_retries--) break; } - +out: memcg_wb_domain_size_changed(memcg); return nbytes; } @@ -4246,6 +4415,9 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, xchg(&memcg->memory.max, max); + if (of->file->f_flags & O_NONBLOCK) + goto out; + for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); @@ -4273,7 +4445,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, break; cond_resched(); } - 
+out: memcg_wb_domain_size_changed(memcg); return nbytes; } @@ -4394,78 +4566,15 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, return nbytes; } -enum { - MEMORY_RECLAIM_SWAPPINESS = 0, - MEMORY_RECLAIM_NULL, -}; - -static const match_table_t tokens = { - { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"}, - { MEMORY_RECLAIM_NULL, NULL }, -}; - static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned int nr_retries = MAX_RECLAIM_RETRIES; - unsigned long nr_to_reclaim, nr_reclaimed = 0; - int swappiness = -1; - unsigned int reclaim_options; - char *old_buf, *start; - substring_t args[MAX_OPT_ARGS]; - - buf = strstrip(buf); - - old_buf = buf; - nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; - if (buf == old_buf) - return -EINVAL; - - buf = strstrip(buf); - - while ((start = strsep(&buf, " ")) != NULL) { - if (!strlen(start)) - continue; - switch (match_token(start, tokens, args)) { - case MEMORY_RECLAIM_SWAPPINESS: - if (match_int(&args[0], &swappiness)) - return -EINVAL; - if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS) - return -EINVAL; - break; - default: - return -EINVAL; - } - } - - reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; - while (nr_reclaimed < nr_to_reclaim) { - /* Will converge on zero, but reclaim enforces a minimum */ - unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4; - unsigned long reclaimed; - - if (signal_pending(current)) - return -EINTR; - - /* - * This is the final attempt, drain percpu lru caches in the - * hope of introducing more evictable pages for - * try_to_free_mem_cgroup_pages(). - */ - if (!nr_retries) - lru_add_drain_all(); - - reclaimed = try_to_free_mem_cgroup_pages(memcg, - batch_size, GFP_KERNEL, - reclaim_options, - swappiness == -1 ? NULL : &swappiness); - - if (!reclaimed && !nr_retries--) - return -EAGAIN; + int ret; - nr_reclaimed += reclaimed; - } + ret = user_proactive_reclaim(buf, memcg, NULL); + if (ret) + return ret; return nbytes; } @@ -4698,9 +4807,7 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug) static void uncharge_batch(const struct uncharge_gather *ug) { if (ug->nr_memory) { - page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); - if (do_memsw_account()) - page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); + memcg_uncharge(ug->memcg, ug->nr_memory); if (ug->nr_kmem) { mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem); memcg1_account_kmem(ug->memcg, -ug->nr_kmem); @@ -4976,15 +5083,16 @@ static int __init cgroup_memory(char *s) __setup("cgroup.memory=", cgroup_memory); /* - * subsys_initcall() for memory controller. + * Memory controller init before cgroup_init() initialize root_mem_cgroup. * * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this * context because of lock dependencies (cgroup_lock -> cpu hotplug) but * basically everything that doesn't depend on a specific mem_cgroup structure * should be initialized from here. 
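memory_reclaim() now hands the argument parsing and the retry loop to user_proactive_reclaim(), but the loop it previously carried (and which the shared helper presumably still follows) is worth seeing in one piece: reclaim in batches of a quarter of what is still missing, and give up with EAGAIN after enough fruitless passes. Below is a condensed standalone model of that convergence loop; try_reclaim(), the clamp value and MAX_RETRIES are stand-ins, and the lru_add_drain_all() step on the final retry is omitted:

/*
 * Model of the proactive-reclaim retry loop: the batch size converges
 * on zero as progress is made; fruitless passes eventually abort.
 * try_reclaim() stands in for try_to_free_mem_cgroup_pages().
 */
#include <stdio.h>

#define MAX_RETRIES 16

static unsigned long try_reclaim(unsigned long batch)
{
	if (batch < 32)
		batch = 32;		/* "reclaim enforces a minimum" */
	return batch < 100 ? batch : 100;	/* pretend 100 pages max/pass */
}

static int proactive_reclaim(unsigned long nr_to_reclaim)
{
	unsigned long nr_reclaimed = 0;
	int nr_retries = MAX_RETRIES;

	while (nr_reclaimed < nr_to_reclaim) {
		/* quarter of the remainder: large at first, small near the end */
		unsigned long batch = (nr_to_reclaim - nr_reclaimed) / 4;
		unsigned long got = try_reclaim(batch);

		/* in this toy model got is never 0; the real path can be */
		if (!got && !nr_retries--)
			return -1;	/* -EAGAIN in the kernel */
		nr_reclaimed += got;
	}
	return 0;
}

int main(void)
{
	printf("%d\n", proactive_reclaim(1000));
	return 0;
}

Batching a quarter of the remainder keeps each reclaim pass bounded while still converging quickly on the requested amount.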
*/ -static int __init mem_cgroup_init(void) +int __init mem_cgroup_init(void) { + unsigned int memcg_size; int cpu; /* @@ -4998,13 +5106,22 @@ static int __init mem_cgroup_init(void) cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, memcg_hotplug_cpu_dead); - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, - drain_local_stock); + drain_local_memcg_stock); + INIT_WORK(&per_cpu_ptr(&obj_stock, cpu)->work, + drain_local_obj_stock); + } + + memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids); + memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0, + SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL); + + memcg_pn_cachep = KMEM_CACHE(mem_cgroup_per_node, + SLAB_PANIC | SLAB_HWCACHE_ALIGN); return 0; } -subsys_initcall(mem_cgroup_init); #ifdef CONFIG_SWAP /** @@ -5458,3 +5575,8 @@ static int __init mem_cgroup_swap_init(void) subsys_initcall(mem_cgroup_swap_init); #endif /* CONFIG_SWAP */ + +bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) +{ + return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true; +} diff --git a/mm/memfd.c b/mm/memfd.c index c64df1343059..bbe679895ef6 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -20,6 +20,7 @@ #include <linux/memfd.h> #include <linux/pid_namespace.h> #include <uapi/linux/memfd.h> +#include "swap.h" /* * We need a tag: a new tag would expand every xa_node by 8 bytes, @@ -31,8 +32,7 @@ static bool memfd_folio_has_extra_refs(struct folio *folio) { - return folio_ref_count(folio) - folio_mapcount(folio) != - folio_nr_pages(folio); + return folio_ref_count(folio) != folio_expected_ref_count(folio); } static void memfd_tag_pins(struct xa_state *xas) @@ -70,7 +70,6 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) #ifdef CONFIG_HUGETLB_PAGE struct folio *folio; gfp_t gfp_mask; - int err; if (is_file_hugepages(memfd)) { /* @@ -79,12 +78,19 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) * alloc from. Also, the folio will be pinned for an indefinite * amount of time, so it is not expected to be migrated away. */ + struct inode *inode = file_inode(memfd); struct hstate *h = hstate_file(memfd); + int err = -ENOMEM; + long nr_resv; gfp_mask = htlb_alloc_mask(h); gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE); idx >>= huge_page_order(h); + nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, 0); + if (nr_resv < 0) + return ERR_PTR(nr_resv); + folio = alloc_hugetlb_folio_reserve(h, numa_node_id(), NULL, @@ -95,12 +101,17 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) idx); if (err) { folio_put(folio); - return ERR_PTR(err); + goto err_unresv; } + + hugetlb_set_folio_subpool(folio, subpool_inode(inode)); folio_unlock(folio); return folio; } - return ERR_PTR(-ENOMEM); +err_unresv: + if (nr_resv > 0) + hugetlb_unreserve_pages(inode, idx, idx + 1, 0); + return ERR_PTR(err); } #endif return shmem_read_folio(memfd->f_mapping, idx); @@ -332,10 +343,10 @@ static inline bool is_write_sealed(unsigned int seals) return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); } -static int check_write_seal(unsigned long *vm_flags_ptr) +static int check_write_seal(vm_flags_t *vm_flags_ptr) { - unsigned long vm_flags = *vm_flags_ptr; - unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE); + vm_flags_t vm_flags = *vm_flags_ptr; + vm_flags_t mask = vm_flags & (VM_SHARED | VM_WRITE); /* If a private mapping then writability is irrelevant. 
*/ if (!(mask & VM_SHARED)) @@ -357,7 +368,7 @@ static int check_write_seal(unsigned long *vm_flags_ptr) return 0; } -int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr) +int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr) { int err = 0; unsigned int *seals_ptr = memfd_file_seals_ptr(file); @@ -400,7 +411,7 @@ static char *alloc_name(const char __user *uname) if (!name) return ERR_PTR(-ENOMEM); - strcpy(name, MFD_NAME_PREFIX); + memcpy(name, MFD_NAME_PREFIX, MFD_NAME_PREFIX_LEN); /* returned length does not include terminating zero */ len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1); if (len < 0) { @@ -474,22 +485,22 @@ SYSCALL_DEFINE2(memfd_create, fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); if (fd < 0) { error = fd; - goto err_name; + goto err_free_name; } file = alloc_file(name, flags); if (IS_ERR(file)) { error = PTR_ERR(file); - goto err_fd; + goto err_free_fd; } fd_install(fd, file); kfree(name); return fd; -err_fd: +err_free_fd: put_unused_fd(fd); -err_name: +err_free_name: kfree(name); return error; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b91a33fb6c69..3047b9ac667e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1388,8 +1388,8 @@ static inline bool HWPoisonHandlable(struct page *page, unsigned long flags) if (PageSlab(page)) return false; - /* Soft offline could migrate non-LRU movable pages */ - if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page)) + /* Soft offline could migrate movable_ops pages */ + if ((flags & MF_SOFT_OFFLINE) && page_has_movable_ops(page)) return true; return PageLRU(page) || is_free_buddy_page(page); @@ -1561,6 +1561,10 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) return ret; } +/* + * The caller must guarantee the folio isn't large folio, except hugetlb. + * try_to_unmap() can't handle it. + */ int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill) { enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON; @@ -2503,19 +2507,6 @@ static void memory_failure_work_func(struct work_struct *work) } } -/* - * Process memory_failure work queued on the specified CPU. - * Used to avoid return-to-userspace racing with the memory_failure workqueue. - */ -void memory_failure_queue_kick(int cpu) -{ - struct memory_failure_cpu *mf_cpu; - - mf_cpu = &per_cpu(memory_failure_cpu, cpu); - cancel_work_sync(&mf_cpu->work); - memory_failure_work_func(&mf_cpu->work); -} - static int __init memory_failure_init(void) { struct memory_failure_cpu *mf_cpu; diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index fc14fe53e9b7..0382b6942b8b 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -872,25 +872,18 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self, unsigned long action, void *_arg) { struct memory_tier *memtier; - struct memory_notify *arg = _arg; - - /* - * Only update the node migration order when a node is - * changing status, like online->offline. 
- */ - if (arg->status_change_nid < 0) - return notifier_from_errno(0); + struct node_notify *nn = _arg; switch (action) { - case MEM_OFFLINE: + case NODE_REMOVED_LAST_MEMORY: mutex_lock(&memory_tier_lock); - if (clear_node_memory_tier(arg->status_change_nid)) + if (clear_node_memory_tier(nn->nid)) establish_demotion_targets(); mutex_unlock(&memory_tier_lock); break; - case MEM_ONLINE: + case NODE_ADDED_FIRST_MEMORY: mutex_lock(&memory_tier_lock); - memtier = set_node_memory_tier(arg->status_change_nid); + memtier = set_node_memory_tier(nn->nid); if (!IS_ERR(memtier)) establish_demotion_targets(); mutex_unlock(&memory_tier_lock); @@ -929,7 +922,7 @@ static int __init memory_tier_init(void) nodes_and(default_dram_nodes, node_states[N_MEMORY], node_states[N_CPU]); - hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI); + hotplug_node_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI); return 0; } subsys_initcall(memory_tier_init); diff --git a/mm/memory.c b/mm/memory.c index 2d8c265fc7d6..0ba4f6b71847 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -57,7 +57,6 @@ #include <linux/export.h> #include <linux/delayacct.h> #include <linux/init.h> -#include <linux/pfn_t.h> #include <linux/writeback.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> @@ -125,6 +124,24 @@ int randomize_va_space __read_mostly = 2; #endif +static const struct ctl_table mmu_sysctl_table[] = { + { + .procname = "randomize_va_space", + .data = &randomize_va_space, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int __init init_mm_sysctl(void) +{ + register_sysctl_init("kernel", mmu_sysctl_table); + return 0; +} + +subsys_initcall(init_mm_sysctl); + #ifndef arch_wants_old_prefaulted_pte static inline bool arch_wants_old_prefaulted_pte(void) { @@ -278,8 +295,17 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, p4d_free_tlb(tlb, p4d, start); } -/* - * This function frees user-level page tables of a process. +/** + * free_pgd_range - Unmap and free page tables in the range + * @tlb: the mmu_gather containing pending TLB flush info + * @addr: virtual address start + * @end: virtual address end + * @floor: lowest address boundary + * @ceiling: highest address boundary + * + * This function tears down all user-level page tables in the + * specified virtual address range [@addr..@end). It is part of + * the memory unmap flow. */ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, @@ -349,6 +375,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, { struct unlink_vma_file_batch vb; + tlb_free_vmas(tlb); + do { unsigned long addr = vma->vm_start; struct vm_area_struct *next; @@ -369,32 +397,26 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, vma_start_write(vma); unlink_anon_vmas(vma); - if (is_vm_hugetlb_page(vma)) { - unlink_file_vma(vma); - hugetlb_free_pgd_range(tlb, addr, vma->vm_end, - floor, next ? 
next->vm_start : ceiling); - } else { - unlink_file_vma_batch_init(&vb); - unlink_file_vma_batch_add(&vb, vma); + unlink_file_vma_batch_init(&vb); + unlink_file_vma_batch_add(&vb, vma); - /* - * Optimization: gather nearby vmas into one call down - */ - while (next && next->vm_start <= vma->vm_end + PMD_SIZE - && !is_vm_hugetlb_page(next)) { - vma = next; - next = mas_find(mas, ceiling - 1); - if (unlikely(xa_is_zero(next))) - next = NULL; - if (mm_wr_locked) - vma_start_write(vma); - unlink_anon_vmas(vma); - unlink_file_vma_batch_add(&vb, vma); - } - unlink_file_vma_batch_final(&vb); - free_pgd_range(tlb, addr, vma->vm_end, - floor, next ? next->vm_start : ceiling); + /* + * Optimization: gather nearby vmas into one call down + */ + while (next && next->vm_start <= vma->vm_end + PMD_SIZE) { + vma = next; + next = mas_find(mas, ceiling - 1); + if (unlikely(xa_is_zero(next))) + next = NULL; + if (mm_wr_locked) + vma_start_write(vma); + unlink_anon_vmas(vma); + unlink_file_vma_batch_add(&vb, vma); } + unlink_file_vma_batch_final(&vb); + + free_pgd_range(tlb, addr, vma->vm_end, + floor, next ? next->vm_start : ceiling); vma = next; } while (vma); } @@ -518,10 +540,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, dump_page(page, "bad pte"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); - pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n", + pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, vma->vm_file ? vma->vm_file->f_op->mmap : NULL, + vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL, mapping ? mapping->a_ops->read_folio : NULL); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); @@ -586,16 +609,6 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return NULL; if (is_zero_pfn(pfn)) return NULL; - if (pte_devmap(pte)) - /* - * NOTE: New users of ZONE_DEVICE will not set pte_devmap() - * and will have refcounts incremented on their struct pages - * when they are inserted into PTEs, thus they are safe to - * return here. Legacy ZONE_DEVICE pages that set pte_devmap() - * do not have refcounts. Example of legacy ZONE_DEVICE is - * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers. 
- */ - return NULL; print_bad_pte(vma, addr, pte, NULL); return NULL; @@ -673,9 +686,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, } } - if (pmd_devmap(pmd)) - return NULL; - if (is_huge_zero_pmd(pmd)) + if (is_huge_zero_pfn(pfn)) return NULL; if (unlikely(pfn > highest_memmap_pfn)) return NULL; @@ -785,7 +796,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long addr, int *rss) { - unsigned long vm_flags = dst_vma->vm_flags; + vm_flags_t vm_flags = dst_vma->vm_flags; pte_t orig_pte = ptep_get(src_pte); pte_t pte = orig_pte; struct folio *folio; @@ -929,7 +940,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma rss[MM_ANONPAGES]++; /* All done, just insert the new page copy in the child */ - pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); + pte = folio_mk_pte(new_folio, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) /* Uffd-wp needs to be delivered to dest pte as well */ @@ -973,10 +984,9 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, int max_nr, int *rss, struct folio **prealloc) { + fpb_t flags = FPB_MERGE_WRITE; struct page *page; struct folio *folio; - bool any_writable; - fpb_t flags = 0; int err, nr; page = vm_normal_page(src_vma, addr, pte); @@ -991,13 +1001,12 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * by keeping the batching logic separate. */ if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { - if (src_vma->vm_flags & VM_SHARED) - flags |= FPB_IGNORE_DIRTY; - if (!vma_soft_dirty_enabled(src_vma)) - flags |= FPB_IGNORE_SOFT_DIRTY; + if (!(src_vma->vm_flags & VM_SHARED)) + flags |= FPB_RESPECT_DIRTY; + if (vma_soft_dirty_enabled(src_vma)) + flags |= FPB_RESPECT_SOFT_DIRTY; - nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, - &any_writable, NULL, NULL); + nr = folio_pte_batch_flags(folio, src_vma, src_pte, &pte, max_nr, flags); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, @@ -1011,8 +1020,6 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma folio_dup_file_rmap_ptes(folio, page, nr, dst_vma); rss[mm_counter_file(folio)] += nr; } - if (any_writable) - pte = pte_mkwrite(pte, src_vma); __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, nr); return nr; @@ -1238,8 +1245,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); - if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) - || pmd_devmap(*src_pmd)) { + if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, @@ -1275,7 +1281,7 @@ copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, src_pud = pud_offset(src_p4d, addr); do { next = pud_addr_end(addr, end); - if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { + if (pud_trans_huge(*src_pud)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma); @@ -1361,7 +1367,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) struct mm_struct 
*dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; - unsigned long next, pfn; + unsigned long next; bool is_cow; int ret; @@ -1371,12 +1377,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) if (is_vm_hugetlb_page(src_vma)) return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma); - if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { - ret = track_pfn_copy(dst_vma, src_vma, &pfn); - if (ret) - return ret; - } - /* * We need to invalidate the secondary MMU mappings only when * there could be a permission downgrade on the ptes of the @@ -1418,8 +1418,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) raw_write_seqcount_end(&src_mm->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } - if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP)) - untrack_pfn_copy(dst_vma, pfn); return ret; } @@ -1545,7 +1543,6 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, struct zap_details *details, int *rss, bool *force_flush, bool *force_break, bool *any_skipped) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm = tlb->mm; struct folio *folio; struct page *page; @@ -1575,9 +1572,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, * by keeping the batching logic separate. */ if (unlikely(folio_test_large(folio) && max_nr != 1)) { - nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, - NULL, NULL, NULL); - + nr = folio_pte_batch(folio, pte, ptent, max_nr); zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush, force_break, any_skipped); @@ -1797,9 +1792,9 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) - __split_huge_pmd(vma, pmd, addr, false, NULL); + __split_huge_pmd(vma, pmd, addr, false); else if (zap_huge_pmd(tlb, vma, pmd, addr)) { addr = next; continue; @@ -1839,7 +1834,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); - if (pud_trans_huge(*pud) || pud_devmap(*pud)) { + if (pud_trans_huge(*pud)) { if (next - addr != HPAGE_PUD_SIZE) { mmap_assert_locked(tlb->mm); split_huge_pud(vma, pud, addr); @@ -1914,9 +1909,6 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (vma->vm_file) uprobe_munmap(vma, start, end); - if (unlikely(vma->vm_flags & VM_PFNMAP)) - untrack_pfn(vma, 0, 0, mm_wr_locked); - if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { /* @@ -1990,35 +1982,64 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, } /** - * zap_page_range_single - remove user pages in a given range + * zap_page_range_single_batched - remove user pages in a given range + * @tlb: pointer to the caller's struct mmu_gather * @vma: vm_area_struct holding the applicable pages - * @address: starting address of pages to zap - * @size: number of bytes to zap + * @address: starting address of pages to remove + * @size: number of bytes to remove * @details: details of shared cache invalidation * - * The range must fit into one VMA. + * @tlb shouldn't be NULL. The range must fit into one VMA. If @vma is for + * hugetlb, @tlb is flushed and re-initialized by this function. 
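zap_page_range_single_batched() takes the mmu_gather from its caller, so a caller that zaps several ranges can reuse one gather context and share a single flush cycle; the plain zap_page_range_single() just below remains the one-shot wrapper. A generic userspace sketch of that wrapper-around-a-batched-helper shape, where struct gather and the printed flush stand in for struct mmu_gather and tlb_finish_mmu():

/*
 * Sketch: the *_batched() variant works on a caller-provided context
 * so several operations can share one flush; the plain variant is a
 * convenience wrapper that creates and finishes the context itself.
 */
#include <stdio.h>

struct gather {
	unsigned long pending;	/* work accumulated since the last flush */
};

static void gather_init(struct gather *g)
{
	g->pending = 0;
}

static void gather_flush(struct gather *g)
{
	printf("flush %lu units\n", g->pending);
	g->pending = 0;
}

static void zap_range_batched(struct gather *g, unsigned long len)
{
	g->pending += len;	/* defer the expensive flush */
}

static void zap_range(unsigned long len)
{
	struct gather g;

	gather_init(&g);
	zap_range_batched(&g, len);
	gather_flush(&g);	/* one-shot: flush immediately */
}

int main(void)
{
	struct gather g;

	gather_init(&g);	/* batch several ranges under one flush */
	zap_range_batched(&g, 0x1000);
	zap_range_batched(&g, 0x2000);
	gather_flush(&g);

	zap_range(0x1000);
	return 0;
}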
*/ -void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, +void zap_page_range_single_batched(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { const unsigned long end = address + size; struct mmu_notifier_range range; - struct mmu_gather tlb; + + VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); hugetlb_zap_begin(vma, &range.start, &range.end); - tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); /* * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. */ - unmap_single_vma(&tlb, vma, address, end, details, false); + unmap_single_vma(tlb, vma, address, end, details, false); mmu_notifier_invalidate_range_end(&range); + if (is_vm_hugetlb_page(vma)) { + /* + * flush tlb and free resources before hugetlb_zap_end(), to + * avoid concurrent page faults' allocation failure. + */ + tlb_finish_mmu(tlb); + hugetlb_zap_end(vma, details); + tlb_gather_mmu(tlb, vma->vm_mm); + } +} + +/** + * zap_page_range_single - remove user pages in a given range + * @vma: vm_area_struct holding the applicable pages + * @address: starting address of pages to zap + * @size: number of bytes to zap + * @details: details of shared cache invalidation + * + * The range must fit into one VMA. + */ +void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, + unsigned long size, struct zap_details *details) +{ + struct mmu_gather tlb; + + tlb_gather_mmu(&tlb, vma->vm_mm); + zap_page_range_single_batched(&tlb, vma, address, size, details); tlb_finish_mmu(&tlb); - hugetlb_zap_end(vma, details); } /** @@ -2418,7 +2439,7 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, EXPORT_SYMBOL(vm_map_pages_zero); static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn, pgprot_t prot, bool mkwrite) + unsigned long pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; pte_t *pte, entry; @@ -2440,7 +2461,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, * allocation and mapping invalidation so just skip the * update. */ - if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) { + if (pte_pfn(entry) != pfn) { WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry))); goto out_unlock; } @@ -2453,10 +2474,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, } /* Ok, finally just insert the thing.. 
*/ - if (pfn_t_devmap(pfn)) - entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); - else - entry = pte_mkspecial(pfn_t_pte(pfn, prot)); + entry = pte_mkspecial(pfn_pte(pfn, prot)); if (mkwrite) { entry = pte_mkyoung(entry); @@ -2525,10 +2543,9 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, if (!pfn_modify_allowed(pfn, pgprot)) return VM_FAULT_SIGBUS; - track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); + pfnmap_setup_cachemode_pfn(pfn, &pgprot); - return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, - false); + return insert_pfn(vma, addr, pfn, pgprot, false); } EXPORT_SYMBOL(vmf_insert_pfn_prot); @@ -2559,25 +2576,22 @@ vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vmf_insert_pfn); -static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite) +static bool vm_mixed_ok(struct vm_area_struct *vma, unsigned long pfn, + bool mkwrite) { - if (unlikely(is_zero_pfn(pfn_t_to_pfn(pfn))) && + if (unlikely(is_zero_pfn(pfn)) && (mkwrite || !vm_mixed_zeropage_allowed(vma))) return false; /* these checks mirror the abort conditions in vm_normal_page */ if (vma->vm_flags & VM_MIXEDMAP) return true; - if (pfn_t_devmap(pfn)) - return true; - if (pfn_t_special(pfn)) - return true; - if (is_zero_pfn(pfn_t_to_pfn(pfn))) + if (is_zero_pfn(pfn)) return true; return false; } static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, - unsigned long addr, pfn_t pfn, bool mkwrite) + unsigned long addr, unsigned long pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; int err; @@ -2588,9 +2602,9 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; - track_pfn_insert(vma, &pgprot, pfn); + pfnmap_setup_cachemode_pfn(pfn, &pgprot); - if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) + if (!pfn_modify_allowed(pfn, pgprot)) return VM_FAULT_SIGBUS; /* @@ -2600,8 +2614,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. */ - if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && - !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pfn_valid(pfn)) { struct page *page; /* @@ -2609,7 +2622,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, * regardless of whether the caller specified flags that * result in pfn_t_has_page() == false. */ - page = pfn_to_page(pfn_t_to_pfn(pfn)); + page = pfn_to_page(pfn); err = insert_page(vma, addr, page, pgprot, mkwrite); } else { return insert_pfn(vma, addr, pfn, pgprot, mkwrite); @@ -2644,7 +2657,7 @@ vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page, EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite); vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) + unsigned long pfn) { return __vm_insert_mixed(vma, addr, pfn, false); } @@ -2656,7 +2669,7 @@ EXPORT_SYMBOL(vmf_insert_mixed); * the same entry was actually inserted. 
*/ vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, - unsigned long addr, pfn_t pfn) + unsigned long addr, unsigned long pfn) { return __vm_insert_mixed(vma, addr, pfn, true); } @@ -2833,6 +2846,36 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, return error; } +#ifdef __HAVE_PFNMAP_TRACKING +static inline struct pfnmap_track_ctx *pfnmap_track_ctx_alloc(unsigned long pfn, + unsigned long size, pgprot_t *prot) +{ + struct pfnmap_track_ctx *ctx; + + if (pfnmap_track(pfn, size, prot)) + return ERR_PTR(-EINVAL); + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (unlikely(!ctx)) { + pfnmap_untrack(pfn, size); + return ERR_PTR(-ENOMEM); + } + + ctx->pfn = pfn; + ctx->size = size; + kref_init(&ctx->kref); + return ctx; +} + +void pfnmap_track_ctx_release(struct kref *ref) +{ + struct pfnmap_track_ctx *ctx = container_of(ref, struct pfnmap_track_ctx, kref); + + pfnmap_untrack(ctx->pfn, ctx->size); + kfree(ctx); +} +#endif /* __HAVE_PFNMAP_TRACKING */ + /** * remap_pfn_range - remap kernel memory to userspace * @vma: user vma to map to @@ -2845,20 +2888,51 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, * * Return: %0 on success, negative error code otherwise. */ +#ifdef __HAVE_PFNMAP_TRACKING int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { + struct pfnmap_track_ctx *ctx = NULL; int err; - err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); - if (err) + size = PAGE_ALIGN(size); + + /* + * If we cover the full VMA, we'll perform actual tracking, and + * remember to untrack when the last reference to our tracking + * context from a VMA goes away. We'll keep tracking the whole pfn + * range even during VMA splits and partial unmapping. + * + * If we only cover parts of the VMA, we'll only setup the cachemode + * in the pgprot for the pfn range. 
+ */ + if (addr == vma->vm_start && addr + size == vma->vm_end) { + if (vma->pfnmap_track_ctx) + return -EINVAL; + ctx = pfnmap_track_ctx_alloc(pfn, size, &prot); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + } else if (pfnmap_setup_cachemode(pfn, size, &prot)) { return -EINVAL; + } err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); - if (err) - untrack_pfn(vma, pfn, PAGE_ALIGN(size), true); + if (ctx) { + if (err) + kref_put(&ctx->kref, pfnmap_track_ctx_release); + else + vma->pfnmap_track_ctx = ctx; + } return err; } + +#else +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return remap_pfn_range_notrack(vma, addr, pfn, size, prot); +} +#endif EXPORT_SYMBOL(remap_pfn_range); /** @@ -2938,11 +3012,11 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, if (fn) { do { if (create || !pte_none(ptep_get(pte))) { - err = fn(pte++, addr, data); + err = fn(pte, addr, data); if (err) break; } - } while (addr += PAGE_SIZE, addr != end); + } while (pte++, addr += PAGE_SIZE, addr != end); } *mask |= PGTBL_PTE_MODIFIED; @@ -3523,7 +3597,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); - entry = mk_pte(&new_folio->page, vma->vm_page_prot); + entry = folio_mk_pte(new_folio, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (unlikely(unshare)) { if (pte_soft_dirty(vmf->orig_pte)) @@ -3730,12 +3804,10 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio, * If all folio references are from mappings, and all mappings are in * the page tables of this MM, then this folio is exclusive to this MM. */ - if (folio_test_large_maybe_mapped_shared(folio)) + if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)) return false; VM_WARN_ON_ONCE(folio_test_ksm(folio)); - VM_WARN_ON_ONCE(folio_mapcount(folio) > folio_nr_pages(folio)); - VM_WARN_ON_ONCE(folio_entire_mapcount(folio)); if (unlikely(folio_test_swapcache(folio))) { /* @@ -3753,13 +3825,15 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio, /* Stabilize the mapcount vs. refcount and recheck. */ folio_lock_large_mapcount(folio); - VM_WARN_ON_ONCE(folio_large_mapcount(folio) < folio_ref_count(folio)); + VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio); - if (folio_test_large_maybe_mapped_shared(folio)) + if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)) goto unlock; if (folio_large_mapcount(folio) != folio_ref_count(folio)) goto unlock; + VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio); VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id && folio_mm_id(folio, 1) != vma->vm_mm->mm_id); @@ -4224,26 +4298,6 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) -{ - struct swap_info_struct *si = swp_swap_info(entry); - pgoff_t offset = swp_offset(entry); - int i; - - /* - * While allocating a large folio and doing swap_read_folio, which is - * the case the being faulted pte doesn't have swapcache. We need to - * ensure all PTEs have no cache as well, otherwise, we might go to - * swap devices while the content is in swapcache. 
- */ - for (i = 0; i < max_nr; i++) { - if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) - return i; - } - - return i; -} - /* * Check if the PTEs within a range are contiguous swap entries * and have consistent swapcache, zeromap. @@ -4579,8 +4633,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* * KSM sometimes has to copy on read faults, for example, if - * page->index of !PageKSM() pages would be nonlinear inside the - * anon VMA -- PageKSM() is lost on actual swapout. + * folio->index of non-ksm folios would be nonlinear inside the + * anon VMA -- the ksm flag is lost on actual swapout. */ folio = ksm_might_need_to_copy(folio, vma, vmf->address); if (unlikely(!folio)) { @@ -5013,7 +5067,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) */ __folio_mark_uptodate(folio); - entry = mk_pte(&folio->page, vma->vm_page_prot); + entry = folio_mk_pte(folio, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry), vma); @@ -5138,9 +5192,8 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) vmf->prealloc_pte = NULL; } -vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page) { - struct folio *folio = page_folio(page); struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; @@ -5188,7 +5241,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) flush_icache_pages(vma, page, HPAGE_PMD_NR); - entry = mk_huge_pmd(page, vma->vm_page_prot); + entry = folio_mk_pmd(folio, vma->vm_page_prot); if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -5213,7 +5266,7 @@ out: return ret; } #else -vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page) { return VM_FAULT_FALLBACK; } @@ -5245,6 +5298,8 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + else if (pte_write(entry) && folio_test_dirty(folio)) + entry = pte_mkdirty(entry); if (unlikely(vmf_orig_pte_uffd_wp(vmf))) entry = pte_mkuffd_wp(entry); /* copy-on-write page */ @@ -5305,6 +5360,7 @@ fallback: else page = vmf->page; + folio = page_folio(page); /* * check even for read faults because we might have lost our CoWed * page @@ -5316,8 +5372,8 @@ fallback: } if (pmd_none(*vmf->pmd)) { - if (PageTransCompound(page)) { - ret = do_set_pmd(vmf, page); + if (folio_test_pmd_mappable(folio)) { + ret = do_set_pmd(vmf, folio, page); if (ret != VM_FAULT_FALLBACK) return ret; } @@ -5328,15 +5384,14 @@ fallback: return VM_FAULT_OOM; } - folio = page_folio(page); nr_pages = folio_nr_pages(folio); /* * Using per-page fault to maintain the uffd semantics, and same - * approach also applies to non-anonymous-shmem faults to avoid + * approach also applies to non shmem/tmpfs faults to avoid * inflating the RSS of the process. */ - if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) || + if (!vma_is_shmem(vma) || unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) { nr_pages = 1; } else if (nr_pages > 1) { @@ -5892,7 +5947,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) split: /* COW or write-notify handled on pte level: split pmd. 
*/ - __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); + __split_huge_pmd(vma, vmf->pmd, vmf->address, false); return VM_FAULT_FALLBACK; } @@ -6056,7 +6111,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; - unsigned long vm_flags = vma->vm_flags; + vm_flags_t vm_flags = vma->vm_flags; pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; @@ -6080,7 +6135,7 @@ retry_pud: pud_t orig_pud = *vmf.pud; barrier(); - if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { + if (pud_trans_huge(orig_pud)) { /* * TODO once we support anonymous PUDs: NUMA case and @@ -6121,7 +6176,7 @@ retry_pud: pmd_migration_entry_wait(mm, vmf.pmd); return 0; } - if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) { + if (pmd_trans_huge(vmf.orig_pmd)) { if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf); @@ -6338,258 +6393,6 @@ out: } EXPORT_SYMBOL_GPL(handle_mm_fault); -#ifdef CONFIG_LOCK_MM_AND_FIND_VMA -#include <linux/extable.h> - -static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) -{ - if (likely(mmap_read_trylock(mm))) - return true; - - if (regs && !user_mode(regs)) { - unsigned long ip = exception_ip(regs); - if (!search_exception_tables(ip)) - return false; - } - - return !mmap_read_lock_killable(mm); -} - -static inline bool mmap_upgrade_trylock(struct mm_struct *mm) -{ - /* - * We don't have this operation yet. - * - * It should be easy enough to do: it's basically a - * atomic_long_try_cmpxchg_acquire() - * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but - * it also needs the proper lockdep magic etc. - */ - return false; -} - -static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) -{ - mmap_read_unlock(mm); - if (regs && !user_mode(regs)) { - unsigned long ip = exception_ip(regs); - if (!search_exception_tables(ip)) - return false; - } - return !mmap_write_lock_killable(mm); -} - -/* - * Helper for page fault handling. - * - * This is kind of equivalent to "mmap_read_lock()" followed - * by "find_extend_vma()", except it's a lot more careful about - * the locking (and will drop the lock on failure). - * - * For example, if we have a kernel bug that causes a page - * fault, we don't want to just use mmap_read_lock() to get - * the mm lock, because that would deadlock if the bug were - * to happen while we're holding the mm lock for writing. - * - * So this checks the exception tables on kernel faults in - * order to only do this all for instructions that are actually - * expected to fault. - * - * We can also actually take the mm lock for writing if we - * need to extend the vma, which helps the VM layer a lot. - */ -struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, - unsigned long addr, struct pt_regs *regs) -{ - struct vm_area_struct *vma; - - if (!get_mmap_lock_carefully(mm, regs)) - return NULL; - - vma = find_vma(mm, addr); - if (likely(vma && (vma->vm_start <= addr))) - return vma; - - /* - * Well, dang. We might still be successful, but only - * if we can extend a vma to do so. - */ - if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { - mmap_read_unlock(mm); - return NULL; - } - - /* - * We can try to upgrade the mmap lock atomically, - * in which case we can continue to use the vma - * we already looked up. - * - * Otherwise we'll have to drop the mmap lock and - * re-take it, and also look up the vma again, - * re-checking it. 
- */ - if (!mmap_upgrade_trylock(mm)) { - if (!upgrade_mmap_lock_carefully(mm, regs)) - return NULL; - - vma = find_vma(mm, addr); - if (!vma) - goto fail; - if (vma->vm_start <= addr) - goto success; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto fail; - } - - if (expand_stack_locked(vma, addr)) - goto fail; - -success: - mmap_write_downgrade(mm); - return vma; - -fail: - mmap_write_unlock(mm); - return NULL; -} -#endif - -#ifdef CONFIG_PER_VMA_LOCK -static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching) -{ - unsigned int tgt_refcnt = VMA_LOCK_OFFSET; - - /* Additional refcnt if the vma is attached. */ - if (!detaching) - tgt_refcnt++; - - /* - * If vma is detached then only vma_mark_attached() can raise the - * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). - */ - if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) - return false; - - rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); - rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, - refcount_read(&vma->vm_refcnt) == tgt_refcnt, - TASK_UNINTERRUPTIBLE); - lock_acquired(&vma->vmlock_dep_map, _RET_IP_); - - return true; -} - -static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) -{ - *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt); - rwsem_release(&vma->vmlock_dep_map, _RET_IP_); -} - -void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) -{ - bool locked; - - /* - * __vma_enter_locked() returns false immediately if the vma is not - * attached, otherwise it waits until refcnt is indicating that vma - * is attached with no readers. - */ - locked = __vma_enter_locked(vma, false); - - /* - * We should use WRITE_ONCE() here because we can have concurrent reads - * from the early lockless pessimistic check in vma_start_read(). - * We don't really care about the correctness of that early check, but - * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. - */ - WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - - if (locked) { - bool detached; - - __vma_exit_locked(vma, &detached); - WARN_ON_ONCE(detached); /* vma should remain attached */ - } -} -EXPORT_SYMBOL_GPL(__vma_start_write); - -void vma_mark_detached(struct vm_area_struct *vma) -{ - vma_assert_write_locked(vma); - vma_assert_attached(vma); - - /* - * We are the only writer, so no need to use vma_refcount_put(). - * The condition below is unlikely because the vma has been already - * write-locked and readers can increment vm_refcnt only temporarily - * before they check vm_lock_seq, realize the vma is locked and drop - * back the vm_refcnt. That is a narrow window for observing a raised - * vm_refcnt. - */ - if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { - /* Wait until vma is detached with no readers. */ - if (__vma_enter_locked(vma, true)) { - bool detached; - - __vma_exit_locked(vma, &detached); - WARN_ON_ONCE(!detached); - } - } -} - -/* - * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be - * stable and not isolated. If the VMA is not found or is being modified the - * function returns NULL. 
- */ -struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, - unsigned long address) -{ - MA_STATE(mas, &mm->mm_mt, address, address); - struct vm_area_struct *vma; - - rcu_read_lock(); -retry: - vma = mas_walk(&mas); - if (!vma) - goto inval; - - vma = vma_start_read(mm, vma); - if (IS_ERR_OR_NULL(vma)) { - /* Check if the VMA got isolated after we found it */ - if (PTR_ERR(vma) == -EAGAIN) { - count_vm_vma_lock_event(VMA_LOCK_MISS); - /* The area was replaced with another one */ - goto retry; - } - - /* Failed to lock the VMA */ - goto inval; - } - /* - * At this point, we have a stable reference to a VMA: The VMA is - * locked and we know it hasn't already been isolated. - * From here on, we can access the VMA without worrying about which - * fields are accessible for RCU readers. - */ - - /* Check if the vma we locked is the right one. */ - if (unlikely(vma->vm_mm != mm || - address < vma->vm_start || address >= vma->vm_end)) - goto inval_end_read; - - rcu_read_unlock(); - return vma; - -inval_end_read: - vma_end_read(vma); -inval: - rcu_read_unlock(); - count_vm_vma_lock_event(VMA_LOCK_ABORT); - return NULL; -} -#endif /* CONFIG_PER_VMA_LOCK */ - #ifndef __PAGETABLE_P4D_FOLDED /* * Allocate p4d page table. @@ -6900,6 +6703,7 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, while (len) { int bytes, offset; void *maddr; + struct folio *folio; struct vm_area_struct *vma = NULL; struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); @@ -6931,21 +6735,22 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, if (bytes <= 0) break; } else { + folio = page_folio(page); bytes = len; offset = addr & (PAGE_SIZE-1); if (bytes > PAGE_SIZE-offset) bytes = PAGE_SIZE-offset; - maddr = kmap_local_page(page); + maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE); if (write) { copy_to_user_page(vma, page, addr, maddr + offset, buf, bytes); - set_page_dirty_lock(page); + folio_mark_dirty_lock(folio); } else { copy_from_user_page(vma, page, addr, buf, maddr + offset, bytes); } - unmap_and_put_page(page, maddr); + folio_release_kmap(folio, maddr); } len -= bytes; buf += bytes; @@ -7024,6 +6829,7 @@ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, while (len) { int bytes, offset, retval; void *maddr; + struct folio *folio; struct page *page; struct vm_area_struct *vma = NULL; @@ -7039,17 +6845,18 @@ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, goto out; } + folio = page_folio(page); bytes = len; offset = addr & (PAGE_SIZE - 1); if (bytes > PAGE_SIZE - offset) bytes = PAGE_SIZE - offset; - maddr = kmap_local_page(page); + maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE); retval = strscpy(buf, maddr + offset, bytes); if (retval >= 0) { /* Found the end of the string */ buf += retval; - unmap_and_put_page(page, maddr); + folio_release_kmap(folio, maddr); break; } @@ -7067,7 +6874,7 @@ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, } len -= bytes; - unmap_and_put_page(page, maddr); + folio_release_kmap(folio, maddr); } out: diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 75401866fb76..1f15af712bc3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -35,6 +35,7 @@ #include <linux/compaction.h> #include <linux/rmap.h> #include <linux/module.h> +#include <linux/node.h> #include <asm/tlbflush.h> @@ -699,30 +700,6 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) 
online_mem_sections(start_pfn, end_pfn); } -/* check which state of node_states will be changed when online memory */ -static void node_states_check_changes_online(unsigned long nr_pages, - struct zone *zone, struct memory_notify *arg) -{ - int nid = zone_to_nid(zone); - - arg->status_change_nid = NUMA_NO_NODE; - arg->status_change_nid_normal = NUMA_NO_NODE; - - if (!node_state(nid, N_MEMORY)) - arg->status_change_nid = nid; - if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) - arg->status_change_nid_normal = nid; -} - -static void node_states_set_node(int node, struct memory_notify *arg) -{ - if (arg->status_change_nid_normal >= 0) - node_set_state(node, N_NORMAL_MEMORY); - - if (arg->status_change_nid >= 0) - node_set_state(node, N_MEMORY); -} - static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { @@ -770,7 +747,8 @@ static inline void section_taint_zone_device(unsigned long pfn) */ void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, int migratetype) + struct vmem_altmap *altmap, int migratetype, + bool isolate_pageblock) { struct pglist_data *pgdat = zone->zone_pgdat; int nid = pgdat->node_id; @@ -797,12 +775,13 @@ void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, /* * TODO now we have a visible range of pages which are not associated - * with their zone properly. Not nice but set_pfnblock_flags_mask + * with their zone properly. Not nice but set_pfnblock_migratetype() * expects the zone spans the pfn range. All the pages in the range * are reserved so nobody should be touching them so we should be safe */ memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0, - MEMINIT_HOTPLUG, altmap, migratetype); + MEMINIT_HOTPLUG, altmap, migratetype, + isolate_pageblock); set_zone_contiguous(zone); } @@ -1127,7 +1106,8 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, if (mhp_off_inaccessible) page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages); - move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE, + false); for (i = 0; i < nr_pages; i++) { struct page *page = pfn_to_page(pfn + i); @@ -1173,11 +1153,17 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group) { - unsigned long flags; - int need_zonelists_rebuild = 0; + struct memory_notify mem_arg = { + .start_pfn = pfn, + .nr_pages = nr_pages, + }; + struct node_notify node_arg = { + .nid = NUMA_NO_NODE, + }; const int nid = zone_to_nid(zone); + int need_zonelists_rebuild = 0; + unsigned long flags; int ret; - struct memory_notify arg; /* * {on,off}lining is constrained to full memory sections (or more @@ -1192,13 +1178,19 @@ int online_pages(unsigned long pfn, unsigned long nr_pages, /* associate pfn range with the zone */ - move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); - - arg.start_pfn = pfn; - arg.nr_pages = nr_pages; - node_states_check_changes_online(nr_pages, zone, &arg); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE, + true); + + if (!node_state(nid, N_MEMORY)) { + /* Adding memory to the node for the first time */ + node_arg.nid = nid; + ret = node_notify(NODE_ADDING_FIRST_MEMORY, &node_arg); + ret = notifier_to_errno(ret); + if (ret) + goto failed_addition; + } 
- ret = memory_notify(MEM_GOING_ONLINE, &arg); + ret = memory_notify(MEM_GOING_ONLINE, &mem_arg); ret = notifier_to_errno(ret); if (ret) goto failed_addition; @@ -1224,12 +1216,13 @@ int online_pages(unsigned long pfn, unsigned long nr_pages, online_pages_range(pfn, nr_pages); adjust_present_page_count(pfn_to_page(pfn), group, nr_pages); - node_states_set_node(nid, &arg); + if (node_arg.nid >= 0) + node_set_state(nid, N_MEMORY); if (need_zonelists_rebuild) build_all_zonelists(NULL); /* Basic onlining is complete, allow allocation of onlined pages. */ - undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE); + undo_isolate_page_range(pfn, pfn + nr_pages); /* * Freshly onlined pages aren't shuffled (e.g., all pages are placed to @@ -1245,16 +1238,22 @@ int online_pages(unsigned long pfn, unsigned long nr_pages, kswapd_run(nid); kcompactd_run(nid); + if (node_arg.nid >= 0) + /* First memory added successfully. Notify consumers. */ + node_notify(NODE_ADDED_FIRST_MEMORY, &node_arg); + writeback_set_ratelimit(); - memory_notify(MEM_ONLINE, &arg); + memory_notify(MEM_ONLINE, &mem_arg); return 0; failed_addition: pr_debug("online_pages [mem %#010llx-%#010llx] failed\n", (unsigned long long) pfn << PAGE_SHIFT, (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); - memory_notify(MEM_CANCEL_ONLINE, &arg); + memory_notify(MEM_CANCEL_ONLINE, &mem_arg); + if (node_arg.nid != NUMA_NO_NODE) + node_notify(NODE_CANCEL_ADDING_FIRST_MEMORY, &node_arg); remove_pfn_range_from_zone(zone, pfn, nr_pages); return ret; } @@ -1571,13 +1570,12 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) * We online node here. We can't roll back from here. */ node_set_online(nid); - ret = __register_one_node(nid); + ret = register_one_node(nid); BUG_ON(ret); } - register_memory_blocks_under_node(nid, PFN_DOWN(start), - PFN_UP(start + size - 1), - MEMINIT_HOTPLUG); + register_memory_blocks_under_node_hotplug(nid, PFN_DOWN(start), + PFN_UP(start + size - 1)); /* create new memmap entry */ if (!strcmp(res->name, "System RAM")) @@ -1741,8 +1739,8 @@ bool mhp_range_allowed(u64 start, u64 size, bool need_mapping) #ifdef CONFIG_MEMORY_HOTREMOVE /* - * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, - * non-lru movable pages and hugepages). Will skip over most unmovable + * Scan pfn range [start,end) to find movable/migratable pages (LRU and + * hugetlb folio, movable_ops pages). Will skip over most unmovable * pages (esp., pages that can be skipped when offlining), but bail out on * definitely unmovable pages. * @@ -1756,20 +1754,16 @@ static int scan_movable_pages(unsigned long start, unsigned long end, { unsigned long pfn; - for (pfn = start; pfn < end; pfn++) { + for_each_valid_pfn(pfn, start, end) { struct page *page; struct folio *folio; - if (!pfn_valid(pfn)) - continue; page = pfn_to_page(pfn); - if (PageLRU(page)) - goto found; - if (__PageMovable(page)) + if (PageLRU(page) || page_has_movable_ops(page)) goto found; /* - * PageOffline() pages that are not marked __PageMovable() and + * PageOffline() pages that do not have movable_ops and * have a reference count > 0 (after MEM_GOING_OFFLINE) are * definitely unmovable. If their reference count would be 0, * they could at least be skipped when offlining memory. 
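For illustration, the scan_movable_pages() hunk above and the do_migrate_range() hunk that follows switch from an open-coded pfn_valid() filter to the for_each_valid_pfn() iterator. A minimal sketch of the caller-side pattern, with inspect_page() as a hypothetical stand-in for the per-page work:

    /* Old pattern: every pfn in the range is tested individually. */
    static void walk_range_old(unsigned long start, unsigned long end)
    {
            unsigned long pfn;

            for (pfn = start; pfn < end; pfn++) {
                    if (!pfn_valid(pfn))
                            continue;
                    inspect_page(pfn_to_page(pfn));
            }
    }

    /* New pattern: the iterator yields only valid pfns and may skip holes in bulk. */
    static void walk_range_new(unsigned long start, unsigned long end)
    {
            unsigned long pfn;

            for_each_valid_pfn(pfn, start, end)
                    inspect_page(pfn_to_page(pfn));
    }

The behaviour for fully populated ranges is intended to be identical; sparse ranges simply avoid the per-pfn validity test.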
@@ -1805,29 +1799,21 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - for (pfn = start_pfn; pfn < end_pfn; pfn++) { + for_each_valid_pfn(pfn, start_pfn, end_pfn) { struct page *page; - if (!pfn_valid(pfn)) - continue; page = pfn_to_page(pfn); folio = page_folio(page); - /* - * No reference or lock is held on the folio, so it might - * be modified concurrently (e.g. split). As such, - * folio_nr_pages() may read garbage. This is fine as the outer - * loop will revisit the split folio later. - */ - if (folio_test_large(folio)) - pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; - if (!folio_try_get(folio)) continue; if (unlikely(page_folio(page) != folio)) goto put_folio; + if (folio_test_large(folio)) + pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; + if (folio_contain_hwpoisoned_page(folio)) { if (WARN_ON(folio_test_lru(folio))) folio_isolate_lru(folio); @@ -1896,54 +1882,6 @@ static int __init cmdline_parse_movable_node(char *p) } early_param("movable_node", cmdline_parse_movable_node); -/* check which state of node_states will be changed when offline memory */ -static void node_states_check_changes_offline(unsigned long nr_pages, - struct zone *zone, struct memory_notify *arg) -{ - struct pglist_data *pgdat = zone->zone_pgdat; - unsigned long present_pages = 0; - enum zone_type zt; - - arg->status_change_nid = NUMA_NO_NODE; - arg->status_change_nid_normal = NUMA_NO_NODE; - - /* - * Check whether node_states[N_NORMAL_MEMORY] will be changed. - * If the memory to be offline is within the range - * [0..ZONE_NORMAL], and it is the last present memory there, - * the zones in that range will become empty after the offlining, - * thus we can determine that we need to clear the node from - * node_states[N_NORMAL_MEMORY]. - */ - for (zt = 0; zt <= ZONE_NORMAL; zt++) - present_pages += pgdat->node_zones[zt].present_pages; - if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages) - arg->status_change_nid_normal = zone_to_nid(zone); - - /* - * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM - * does not apply as we don't support 32bit. - * Here we count the possible pages from ZONE_MOVABLE. - * If after having accounted all the pages, we see that the nr_pages - * to be offlined is over or equal to the accounted pages, - * we know that the node will become empty, and so, we can clear - * it for N_MEMORY as well. 
- */ - present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages; - - if (nr_pages >= present_pages) - arg->status_change_nid = zone_to_nid(zone); -} - -static void node_states_clear_node(int node, struct memory_notify *arg) -{ - if (arg->status_change_nid_normal >= 0) - node_clear_state(node, N_NORMAL_MEMORY); - - if (arg->status_change_nid >= 0) - node_clear_state(node, N_MEMORY); -} - static int count_system_ram_pages_cb(unsigned long start_pfn, unsigned long nr_pages, void *data) { @@ -1959,11 +1897,18 @@ static int count_system_ram_pages_cb(unsigned long start_pfn, int offline_pages(unsigned long start_pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group) { - const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn, managed_pages, system_ram_pages = 0; + const unsigned long end_pfn = start_pfn + nr_pages; + struct pglist_data *pgdat = zone->zone_pgdat; const int node = zone_to_nid(zone); + struct memory_notify mem_arg = { + .start_pfn = start_pfn, + .nr_pages = nr_pages, + }; + struct node_notify node_arg = { + .nid = NUMA_NO_NODE, + }; unsigned long flags; - struct memory_notify arg; char *reason; int ret; @@ -2015,18 +1960,28 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages, /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, - MIGRATE_MOVABLE, - MEMORY_OFFLINE | REPORT_FAILURE); + PB_ISOLATE_MODE_MEM_OFFLINE); if (ret) { reason = "failure to isolate range"; goto failed_removal_pcplists_disabled; } - arg.start_pfn = start_pfn; - arg.nr_pages = nr_pages; - node_states_check_changes_offline(nr_pages, zone, &arg); + /* + * Check whether the node will have no present pages after we offline + * 'nr_pages' more. If so, we know that the node will become empty, and + * so we will clear N_MEMORY for it. + */ + if (nr_pages >= pgdat->node_present_pages) { + node_arg.nid = node; + ret = node_notify(NODE_REMOVING_LAST_MEMORY, &node_arg); + ret = notifier_to_errno(ret); + if (ret) { + reason = "node notifier failure"; + goto failed_removal_isolated; + } + } - ret = memory_notify(MEM_GOING_OFFLINE, &arg); + ret = memory_notify(MEM_GOING_OFFLINE, &mem_arg); ret = notifier_to_errno(ret); if (ret) { reason = "notifier failure"; @@ -2075,7 +2030,8 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages, goto failed_removal_isolated; } - ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE); + ret = test_pages_isolated(start_pfn, end_pfn, + PB_ISOLATE_MODE_MEM_OFFLINE); } while (ret); @@ -2106,27 +2062,32 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages, * Make sure to mark the node as memory-less before rebuilding the zone * list. Otherwise this node would still appear in the fallback lists. */ - node_states_clear_node(node, &arg); + if (node_arg.nid >= 0) + node_clear_state(node, N_MEMORY); if (!populated_zone(zone)) { zone_pcp_reset(zone); build_all_zonelists(NULL); } - if (arg.status_change_nid >= 0) { + if (node_arg.nid >= 0) { kcompactd_stop(node); kswapd_stop(node); + /* Node went memoryless. 
Notify consumers */ + node_notify(NODE_REMOVED_LAST_MEMORY, &node_arg); } writeback_set_ratelimit(); - memory_notify(MEM_OFFLINE, &arg); + memory_notify(MEM_OFFLINE, &mem_arg); remove_pfn_range_from_zone(zone, start_pfn, nr_pages); return 0; failed_removal_isolated: /* pushback to free area */ - undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); - memory_notify(MEM_CANCEL_OFFLINE, &arg); + undo_isolate_page_range(start_pfn, end_pfn); + memory_notify(MEM_CANCEL_OFFLINE, &mem_arg); + if (node_arg.nid != NUMA_NO_NODE) + node_notify(NODE_CANCEL_REMOVING_LAST_MEMORY, &node_arg); failed_removal_pcplists_disabled: lru_cache_enable(); zone_pcp_enable(zone); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b28a1e6ae096..eb83cff7db8c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -109,10 +109,12 @@ #include <linux/mmu_notifier.h> #include <linux/printk.h> #include <linux/swapops.h> +#include <linux/gcd.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <linux/uaccess.h> +#include <linux/memory.h> #include "internal.h" @@ -139,31 +141,138 @@ static struct mempolicy default_policy = { static struct mempolicy preferred_node_policy[MAX_NUMNODES]; /* - * iw_table is the sysfs-set interleave weight table, a value of 0 denotes - * system-default value should be used. A NULL iw_table also denotes that - * system-default values should be used. Until the system-default table - * is implemented, the system-default is always 1. - * - * iw_table is RCU protected + * weightiness balances the tradeoff between small weights (cycles through nodes + * faster, more fair/even distribution) and large weights (smaller errors + * between actual bandwidth ratios and weight ratios). 32 is a number that has + * been found to perform at a reasonable compromise between the two goals. + */ +static const int weightiness = 32; + +/* + * A null weighted_interleave_state is interpreted as having .mode="auto", + * and .iw_table is interpreted as an array of 1s with length nr_node_ids. */ -static u8 __rcu *iw_table; -static DEFINE_MUTEX(iw_table_lock); +struct weighted_interleave_state { + bool mode_auto; + u8 iw_table[]; +}; +static struct weighted_interleave_state __rcu *wi_state; +static unsigned int *node_bw_table; + +/* + * wi_state_lock protects both wi_state and node_bw_table. + * node_bw_table is only used by writers to update wi_state. + */ +static DEFINE_MUTEX(wi_state_lock); static u8 get_il_weight(int node) { - u8 *table; - u8 weight; + struct weighted_interleave_state *state; + u8 weight = 1; rcu_read_lock(); - table = rcu_dereference(iw_table); - /* if no iw_table, use system default */ - weight = table ? table[node] : 1; - /* if value in iw_table is 0, use system default */ - weight = weight ? weight : 1; + state = rcu_dereference(wi_state); + if (state) + weight = state->iw_table[node]; rcu_read_unlock(); return weight; } +/* + * Convert bandwidth values into weighted interleave weights. + * Call with wi_state_lock. + */ +static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw) +{ + u64 sum_bw = 0; + unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0; + int nid; + + for_each_node_state(nid, N_MEMORY) + sum_bw += bw[nid]; + + /* Scale bandwidths to whole numbers in the range [1, weightiness] */ + for_each_node_state(nid, N_MEMORY) { + /* + * Try not to perform 64-bit division. + * If sum_bw < scaling_factor, then sum_bw < U32_MAX. + * If sum_bw > scaling_factor, then round the weight up to 1. 
+ */ + scaling_factor = weightiness * bw[nid]; + if (bw[nid] && sum_bw < scaling_factor) { + cast_sum_bw = (unsigned int)sum_bw; + new_iw[nid] = scaling_factor / cast_sum_bw; + } else { + new_iw[nid] = 1; + } + if (!iw_gcd) + iw_gcd = new_iw[nid]; + iw_gcd = gcd(iw_gcd, new_iw[nid]); + } + + /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */ + for_each_node_state(nid, N_MEMORY) + new_iw[nid] /= iw_gcd; +} + +int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords) +{ + struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; + unsigned int *old_bw, *new_bw; + unsigned int bw_val; + int i; + + bw_val = min(coords->read_bandwidth, coords->write_bandwidth); + new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL); + if (!new_bw) + return -ENOMEM; + + new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids), + GFP_KERNEL); + if (!new_wi_state) { + kfree(new_bw); + return -ENOMEM; + } + new_wi_state->mode_auto = true; + for (i = 0; i < nr_node_ids; i++) + new_wi_state->iw_table[i] = 1; + + /* + * Update bandwidth info, even in manual mode. That way, when switching + * to auto mode in the future, iw_table can be overwritten using + * accurate bw data. + */ + mutex_lock(&wi_state_lock); + + old_bw = node_bw_table; + if (old_bw) + memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw)); + new_bw[node] = bw_val; + node_bw_table = new_bw; + + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); + if (old_wi_state && !old_wi_state->mode_auto) { + /* Manual mode; skip reducing weights and updating wi_state */ + mutex_unlock(&wi_state_lock); + kfree(new_wi_state); + goto out; + } + + /* NULL wi_state assumes auto=true; reduce weights and update wi_state*/ + reduce_interleave_weights(new_bw, new_wi_state->iw_table); + rcu_assign_pointer(wi_state, new_wi_state); + + mutex_unlock(&wi_state_lock); + if (old_wi_state) { + synchronize_rcu(); + kfree(old_wi_state); + } +out: + kfree(old_bw); + return 0; +} + /** * numa_nearest_node - Find nearest node by state * @node: Node id to start the search @@ -573,6 +682,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, pte_t *pte, *mapped_pte; pte_t ptent; spinlock_t *ptl; + int max_nr, nr; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -586,7 +696,9 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, walk->action = ACTION_AGAIN; return 0; } - for (; addr != end; pte++, addr += PAGE_SIZE) { + for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) { + max_nr = (end - addr) >> PAGE_SHIFT; + nr = 1; ptent = ptep_get(pte); if (pte_none(ptent)) continue; @@ -598,6 +710,8 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; + if (folio_test_large(folio) && max_nr != 1) + nr = folio_pte_batch(folio, pte, ptent, max_nr); /* * vm_normal_folio() filters out zero pages, but there might * still be reserved folios to skip, perhaps in a VDSO. 
@@ -630,7 +744,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(vma) || !migrate_folio_add(folio, qp->pagelist, flags)) { - qp->nr_failed++; + qp->nr_failed += nr; if (strictly_unmovable(flags)) break; } @@ -2014,26 +2128,28 @@ static unsigned int read_once_policy_nodemask(struct mempolicy *pol, static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) { + struct weighted_interleave_state *state; nodemask_t nodemask; unsigned int target, nr_nodes; - u8 *table; + u8 *table = NULL; unsigned int weight_total = 0; u8 weight; - int nid; + int nid = 0; nr_nodes = read_once_policy_nodemask(pol, &nodemask); if (!nr_nodes) return numa_node_id(); rcu_read_lock(); - table = rcu_dereference(iw_table); + + state = rcu_dereference(wi_state); + /* Uninitialized wi_state means we should assume all weights are 1 */ + if (state) + table = state->iw_table; + /* calculate the total weight */ - for_each_node_mask(nid, nodemask) { - /* detect system default usage */ - weight = table ? table[nid] : 1; - weight = weight ? weight : 1; - weight_total += weight; - } + for_each_node_mask(nid, nodemask) + weight_total += table ? table[nid] : 1; /* Calculate the node offset based on totals */ target = ilx % weight_total; @@ -2041,7 +2157,6 @@ static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) while (target) { /* detect system default usage */ weight = table ? table[nid] : 1; - weight = weight ? weight : 1; if (target < weight) break; target -= weight; @@ -2442,13 +2557,14 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { + struct weighted_interleave_state *state; struct task_struct *me = current; unsigned int cpuset_mems_cookie; unsigned long total_allocated = 0; unsigned long nr_allocated = 0; unsigned long rounds; unsigned long node_pages, delta; - u8 *table, *weights, weight; + u8 *weights, weight; unsigned int weight_total = 0; unsigned long rem_pages = nr_pages; nodemask_t nodes; @@ -2498,17 +2614,19 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, return total_allocated; rcu_read_lock(); - table = rcu_dereference(iw_table); - if (table) - memcpy(weights, table, nr_node_ids); - rcu_read_unlock(); + state = rcu_dereference(wi_state); + if (state) { + memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8)); + rcu_read_unlock(); + } else { + rcu_read_unlock(); + for (i = 0; i < nr_node_ids; i++) + weights[i] = 1; + } /* calculate total, detect system default usage */ - for_each_node_mask(node, nodes) { - if (!weights[node]) - weights[node] = 1; + for_each_node_mask(node, nodes) weight_total += weights[node]; - } /* * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. 
@@ -3419,6 +3537,14 @@ struct iw_node_attr { int nid; }; +struct sysfs_wi_group { + struct kobject wi_kobj; + struct mutex kobj_lock; + struct iw_node_attr *nattrs[]; +}; + +static struct sysfs_wi_group *wi_group; + static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -3433,177 +3559,310 @@ static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; struct iw_node_attr *node_attr; - u8 *new; - u8 *old; u8 weight = 0; + int i; node_attr = container_of(attr, struct iw_node_attr, kobj_attr); - if (count == 0 || sysfs_streq(buf, "")) - weight = 0; - else if (kstrtou8(buf, 0, &weight)) + if (count == 0 || sysfs_streq(buf, "") || + kstrtou8(buf, 0, &weight) || weight == 0) return -EINVAL; - new = kzalloc(nr_node_ids, GFP_KERNEL); - if (!new) + new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids), + GFP_KERNEL); + if (!new_wi_state) return -ENOMEM; - mutex_lock(&iw_table_lock); - old = rcu_dereference_protected(iw_table, - lockdep_is_held(&iw_table_lock)); - if (old) - memcpy(new, old, nr_node_ids); - new[node_attr->nid] = weight; - rcu_assign_pointer(iw_table, new); - mutex_unlock(&iw_table_lock); - synchronize_rcu(); - kfree(old); + mutex_lock(&wi_state_lock); + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); + if (old_wi_state) { + memcpy(new_wi_state->iw_table, old_wi_state->iw_table, + nr_node_ids * sizeof(u8)); + } else { + for (i = 0; i < nr_node_ids; i++) + new_wi_state->iw_table[i] = 1; + } + new_wi_state->iw_table[node_attr->nid] = weight; + new_wi_state->mode_auto = false; + + rcu_assign_pointer(wi_state, new_wi_state); + mutex_unlock(&wi_state_lock); + if (old_wi_state) { + synchronize_rcu(); + kfree(old_wi_state); + } return count; } -static struct iw_node_attr **node_attrs; - -static void sysfs_wi_node_release(struct iw_node_attr *node_attr, - struct kobject *parent) +static ssize_t weighted_interleave_auto_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) { - if (!node_attr) - return; - sysfs_remove_file(parent, &node_attr->kobj_attr.attr); - kfree(node_attr->kobj_attr.attr.name); - kfree(node_attr); + struct weighted_interleave_state *state; + bool wi_auto = true; + + rcu_read_lock(); + state = rcu_dereference(wi_state); + if (state) + wi_auto = state->mode_auto; + rcu_read_unlock(); + + return sysfs_emit(buf, "%s\n", str_true_false(wi_auto)); } -static void sysfs_wi_release(struct kobject *wi_kobj) +static ssize_t weighted_interleave_auto_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) { + struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; + unsigned int *bw; + bool input; int i; + if (kstrtobool(buf, &input)) + return -EINVAL; + + new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids), + GFP_KERNEL); + if (!new_wi_state) + return -ENOMEM; for (i = 0; i < nr_node_ids; i++) - sysfs_wi_node_release(node_attrs[i], wi_kobj); - kobject_put(wi_kobj); + new_wi_state->iw_table[i] = 1; + + mutex_lock(&wi_state_lock); + if (!input) { + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); + if (!old_wi_state) + goto update_wi_state; + if (input == old_wi_state->mode_auto) { + mutex_unlock(&wi_state_lock); + return count; + } + + memcpy(new_wi_state->iw_table, 
old_wi_state->iw_table, + nr_node_ids * sizeof(u8)); + goto update_wi_state; + } + + bw = node_bw_table; + if (!bw) { + mutex_unlock(&wi_state_lock); + kfree(new_wi_state); + return -ENODEV; + } + + new_wi_state->mode_auto = true; + reduce_interleave_weights(bw, new_wi_state->iw_table); + +update_wi_state: + rcu_assign_pointer(wi_state, new_wi_state); + mutex_unlock(&wi_state_lock); + if (old_wi_state) { + synchronize_rcu(); + kfree(old_wi_state); + } + return count; +} + +static void sysfs_wi_node_delete(int nid) +{ + struct iw_node_attr *attr; + + if (nid < 0 || nid >= nr_node_ids) + return; + + mutex_lock(&wi_group->kobj_lock); + attr = wi_group->nattrs[nid]; + if (!attr) { + mutex_unlock(&wi_group->kobj_lock); + return; + } + + wi_group->nattrs[nid] = NULL; + mutex_unlock(&wi_group->kobj_lock); + + sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr); + kfree(attr->kobj_attr.attr.name); + kfree(attr); +} + +static void sysfs_wi_node_delete_all(void) +{ + int nid; + + for (nid = 0; nid < nr_node_ids; nid++) + sysfs_wi_node_delete(nid); +} + +static void wi_state_free(void) +{ + struct weighted_interleave_state *old_wi_state; + + mutex_lock(&wi_state_lock); + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); + rcu_assign_pointer(wi_state, NULL); + mutex_unlock(&wi_state_lock); + + if (old_wi_state) { + synchronize_rcu(); + kfree(old_wi_state); + } +} + +static struct kobj_attribute wi_auto_attr = + __ATTR(auto, 0664, weighted_interleave_auto_show, + weighted_interleave_auto_store); + +static void wi_cleanup(void) { + sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr); + sysfs_wi_node_delete_all(); + wi_state_free(); +} + +static void wi_kobj_release(struct kobject *wi_kobj) +{ + kfree(wi_group); } static const struct kobj_type wi_ktype = { .sysfs_ops = &kobj_sysfs_ops, - .release = sysfs_wi_release, + .release = wi_kobj_release, }; -static int add_weight_node(int nid, struct kobject *wi_kobj) +static int sysfs_wi_node_add(int nid) { - struct iw_node_attr *node_attr; + int ret; char *name; + struct iw_node_attr *new_attr; - node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL); - if (!node_attr) + if (nid < 0 || nid >= nr_node_ids) { + pr_err("invalid node id: %d\n", nid); + return -EINVAL; + } + + new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL); + if (!new_attr) return -ENOMEM; name = kasprintf(GFP_KERNEL, "node%d", nid); if (!name) { - kfree(node_attr); + kfree(new_attr); return -ENOMEM; } - sysfs_attr_init(&node_attr->kobj_attr.attr); - node_attr->kobj_attr.attr.name = name; - node_attr->kobj_attr.attr.mode = 0644; - node_attr->kobj_attr.show = node_show; - node_attr->kobj_attr.store = node_store; - node_attr->nid = nid; + sysfs_attr_init(&new_attr->kobj_attr.attr); + new_attr->kobj_attr.attr.name = name; + new_attr->kobj_attr.attr.mode = 0644; + new_attr->kobj_attr.show = node_show; + new_attr->kobj_attr.store = node_store; + new_attr->nid = nid; - if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) { - kfree(node_attr->kobj_attr.attr.name); - kfree(node_attr); - pr_err("failed to add attribute to weighted_interleave\n"); - return -ENOMEM; + mutex_lock(&wi_group->kobj_lock); + if (wi_group->nattrs[nid]) { + mutex_unlock(&wi_group->kobj_lock); + ret = -EEXIST; + goto out; } - node_attrs[nid] = node_attr; + ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr); + if (ret) { + mutex_unlock(&wi_group->kobj_lock); + goto out; + } + wi_group->nattrs[nid] = new_attr; + mutex_unlock(&wi_group->kobj_lock); return 0; + 
+out: + kfree(new_attr->kobj_attr.attr.name); + kfree(new_attr); + return ret; +} + +static int wi_node_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + int err; + struct node_notify *nn = data; + int nid = nn->nid; + + switch (action) { + case NODE_ADDED_FIRST_MEMORY: + err = sysfs_wi_node_add(nid); + if (err) + pr_err("failed to add sysfs for node%d during hotplug: %d\n", + nid, err); + break; + case NODE_REMOVED_LAST_MEMORY: + sysfs_wi_node_delete(nid); + break; + } + + return NOTIFY_OK; } -static int add_weighted_interleave_group(struct kobject *root_kobj) +static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj) { - struct kobject *wi_kobj; int nid, err; - wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); - if (!wi_kobj) + wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids), + GFP_KERNEL); + if (!wi_group) return -ENOMEM; + mutex_init(&wi_group->kobj_lock); - err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj, + err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj, "weighted_interleave"); - if (err) { - kfree(wi_kobj); - return err; - } + if (err) + goto err_put_kobj; - for_each_node_state(nid, N_POSSIBLE) { - err = add_weight_node(nid, wi_kobj); + err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr); + if (err) + goto err_put_kobj; + + for_each_online_node(nid) { + if (!node_state(nid, N_MEMORY)) + continue; + + err = sysfs_wi_node_add(nid); if (err) { - pr_err("failed to add sysfs [node%d]\n", nid); - break; + pr_err("failed to add sysfs for node%d during init: %d\n", + nid, err); + goto err_cleanup_kobj; } } - if (err) - kobject_put(wi_kobj); - return 0; -} -static void mempolicy_kobj_release(struct kobject *kobj) -{ - u8 *old; + hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI); + return 0; - mutex_lock(&iw_table_lock); - old = rcu_dereference_protected(iw_table, - lockdep_is_held(&iw_table_lock)); - rcu_assign_pointer(iw_table, NULL); - mutex_unlock(&iw_table_lock); - synchronize_rcu(); - kfree(old); - kfree(node_attrs); - kfree(kobj); +err_cleanup_kobj: + wi_cleanup(); + kobject_del(&wi_group->wi_kobj); +err_put_kobj: + kobject_put(&wi_group->wi_kobj); + return err; } -static const struct kobj_type mempolicy_ktype = { - .release = mempolicy_kobj_release -}; - static int __init mempolicy_sysfs_init(void) { int err; static struct kobject *mempolicy_kobj; - mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL); - if (!mempolicy_kobj) { - err = -ENOMEM; - goto err_out; - } - - node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *), - GFP_KERNEL); - if (!node_attrs) { - err = -ENOMEM; - goto mempol_out; - } + mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj); + if (!mempolicy_kobj) + return -ENOMEM; - err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj, - "mempolicy"); + err = add_weighted_interleave_group(mempolicy_kobj); if (err) - goto node_out; + goto err_kobj; - err = add_weighted_interleave_group(mempolicy_kobj); - if (err) { - pr_err("mempolicy sysfs structure failed to initialize\n"); - kobject_put(mempolicy_kobj); - return err; - } + return 0; - return err; -node_out: - kfree(node_attrs); -mempol_out: - kfree(mempolicy_kobj); -err_out: - pr_err("failed to add mempolicy kobject to the system\n"); +err_kobj: + kobject_del(mempolicy_kobj); + kobject_put(mempolicy_kobj); return err; } diff --git a/mm/mempool.c b/mm/mempool.c index 3223337135d0..204a216b6418 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -540,11 +540,43 
@@ void mempool_free(void *element, mempool_t *pool) if (likely(pool->curr_nr < pool->min_nr)) { add_element(pool, element); spin_unlock_irqrestore(&pool->lock, flags); - wake_up(&pool->wait); + if (wq_has_sleeper(&pool->wait)) + wake_up(&pool->wait); return; } spin_unlock_irqrestore(&pool->lock, flags); } + + /* + * Handle the min_nr = 0 edge case: + * + * For zero-minimum pools, curr_nr < min_nr (0 < 0) never succeeds, + * so waiters sleeping on pool->wait would never be woken by the + * wake-up path of previous test. This explicit check ensures the + * allocation of element when both min_nr and curr_nr are 0, and + * any active waiters are properly awakened. + * + * Inline the same logic as previous test, add_element() cannot be + * directly used here since it has BUG_ON to deny if min_nr equals + * curr_nr, so here picked rest of add_element() to use without + * BUG_ON check. + */ + if (unlikely(pool->min_nr == 0 && + READ_ONCE(pool->curr_nr) == 0)) { + spin_lock_irqsave(&pool->lock, flags); + if (likely(pool->curr_nr == 0)) { + /* Inline the logic of add_element() */ + poison_element(pool, element); + if (kasan_poison_element(pool, element)) + pool->elements[pool->curr_nr++] = element; + spin_unlock_irqrestore(&pool->lock, flags); + if (wq_has_sleeper(&pool->wait)) + wake_up(&pool->wait); + return; + } + spin_unlock_irqrestore(&pool->lock, flags); + } + pool->free(element, pool->pool_data); } EXPORT_SYMBOL(mempool_free); diff --git a/mm/memremap.c b/mm/memremap.c index 2aebc1b192da..b0ce0d8254bd 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -5,7 +5,6 @@ #include <linux/kasan.h> #include <linux/memory_hotplug.h> #include <linux/memremap.h> -#include <linux/pfn_t.h> #include <linux/swap.h> #include <linux/mm.h> #include <linux/mmzone.h> @@ -39,30 +38,6 @@ unsigned long memremap_compat_align(void) EXPORT_SYMBOL_GPL(memremap_compat_align); #endif -#ifdef CONFIG_FS_DAX -DEFINE_STATIC_KEY_FALSE(devmap_managed_key); -EXPORT_SYMBOL(devmap_managed_key); - -static void devmap_managed_enable_put(struct dev_pagemap *pgmap) -{ - if (pgmap->type == MEMORY_DEVICE_FS_DAX) - static_branch_dec(&devmap_managed_key); -} - -static void devmap_managed_enable_get(struct dev_pagemap *pgmap) -{ - if (pgmap->type == MEMORY_DEVICE_FS_DAX) - static_branch_inc(&devmap_managed_key); -} -#else -static void devmap_managed_enable_get(struct dev_pagemap *pgmap) -{ -} -static void devmap_managed_enable_put(struct dev_pagemap *pgmap) -{ -} -#endif /* CONFIG_FS_DAX */ - static void pgmap_array_delete(struct range *range) { xa_store_range(&pgmap_array, PHYS_PFN(range->start), PHYS_PFN(range->end), @@ -130,7 +105,7 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) } mem_hotplug_done(); - untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true); + pfnmap_untrack(PHYS_PFN(range->start), range_len(range)); pgmap_array_delete(range); } @@ -151,7 +126,6 @@ void memunmap_pages(struct dev_pagemap *pgmap) percpu_ref_exit(&pgmap->ref); WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n"); - devmap_managed_enable_put(pgmap); } EXPORT_SYMBOL_GPL(memunmap_pages); @@ -211,8 +185,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, if (nid < 0) nid = numa_mem_id(); - error = track_pfn_remap(NULL, &params->pgprot, PHYS_PFN(range->start), 0, - range_len(range)); + error = pfnmap_track(PHYS_PFN(range->start), range_len(range), + &params->pgprot); if (error) goto err_pfn_remap; @@ -254,7 +228,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params 
*params, zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; move_pfn_range_to_zone(zone, PHYS_PFN(range->start), PHYS_PFN(range_len(range)), params->altmap, - MIGRATE_MOVABLE); + MIGRATE_MOVABLE, false); } mem_hotplug_done(); @@ -277,7 +251,7 @@ err_add_memory: if (!is_private) kasan_remove_zero_shadow(__va(range->start), range_len(range)); err_kasan: - untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true); + pfnmap_untrack(PHYS_PFN(range->start), range_len(range)); err_pfn_remap: pgmap_array_delete(range); return error; @@ -332,10 +306,6 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) } break; case MEMORY_DEVICE_FS_DAX: - if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { - WARN(1, "File system DAX not supported\n"); - return ERR_PTR(-EINVAL); - } params.pgprot = pgprot_decrypted(params.pgprot); break; case MEMORY_DEVICE_GENERIC: @@ -354,8 +324,6 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) if (error) return ERR_PTR(error); - devmap_managed_enable_get(pgmap); - /* * Clear the pgmap nr_range as it will be incremented for each * successfully processed range. This communicates how many diff --git a/mm/migrate.c b/mm/migrate.c index f3ee6d8d5e2e..425401b2d4e1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -35,7 +35,6 @@ #include <linux/compat.h> #include <linux/hugetlb.h> #include <linux/gfp.h> -#include <linux/pfn_t.h> #include <linux/page_idle.h> #include <linux/page_owner.h> #include <linux/sched/mm.h> @@ -44,15 +43,57 @@ #include <linux/sched/sysctl.h> #include <linux/memory-tiers.h> #include <linux/pagewalk.h> +#include <linux/balloon_compaction.h> +#include <linux/zsmalloc.h> #include <asm/tlbflush.h> #include <trace/events/migrate.h> #include "internal.h" +#include "swap.h" -bool isolate_movable_page(struct page *page, isolate_mode_t mode) +static const struct movable_operations *page_movable_ops(struct page *page) { + VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page); + + /* + * If we enable page migration for a page of a certain type by marking + * it as movable, the page type must be sticky until the page gets freed + * back to the buddy. + */ +#ifdef CONFIG_BALLOON_COMPACTION + if (PageOffline(page)) + /* Only balloon compaction sets PageOffline pages movable. */ + return &balloon_mops; +#endif /* CONFIG_BALLOON_COMPACTION */ +#if defined(CONFIG_ZSMALLOC) && defined(CONFIG_COMPACTION) + if (PageZsmalloc(page)) + return &zsmalloc_mops; +#endif /* defined(CONFIG_ZSMALLOC) && defined(CONFIG_COMPACTION) */ + return NULL; +} + +/** + * isolate_movable_ops_page - isolate a movable_ops page for migration + * @page: The page. + * @mode: The isolation mode. + * + * Try to isolate a movable_ops page for migration. Will fail if the page is + * not a movable_ops page, if the page is already isolated for migration + * or if the page was just was released by its owner. + * + * Once isolated, the page cannot get freed until it is either putback + * or migrated. + * + * Returns true if isolation succeeded, otherwise false. + */ +bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode) +{ + /* + * TODO: these pages will not be folios in the future. All + * folio dependencies will have to be removed. 
+ */ struct folio *folio = folio_get_nontail_page(page); const struct movable_operations *mops; @@ -69,11 +110,14 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) goto out; /* - * Check movable flag before taking the page lock because + * Check for movable_ops pages before taking the page lock because * we use non-atomic bitops on newly allocated page flags so * unconditionally grabbing the lock ruins page's owner side. + * + * Note that once a page has movable_ops, it will stay that way + * until the page was freed. */ - if (unlikely(!__folio_test_movable(folio))) + if (unlikely(!page_has_movable_ops(page))) goto out_putfolio; /* @@ -90,18 +134,20 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) if (unlikely(!folio_trylock(folio))) goto out_putfolio; - if (!folio_test_movable(folio) || folio_test_isolated(folio)) + VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page); + if (PageMovableOpsIsolated(page)) goto out_no_isolated; - mops = folio_movable_ops(folio); - VM_BUG_ON_FOLIO(!mops, folio); + mops = page_movable_ops(page); + if (WARN_ON_ONCE(!mops)) + goto out_no_isolated; - if (!mops->isolate_page(&folio->page, mode)) + if (!mops->isolate_page(page, mode)) goto out_no_isolated; /* Driver shouldn't use the isolated flag */ - WARN_ON_ONCE(folio_test_isolated(folio)); - folio_set_isolated(folio); + VM_WARN_ON_ONCE_PAGE(PageMovableOpsIsolated(page), page); + SetPageMovableOpsIsolated(page); folio_unlock(folio); return true; @@ -114,12 +160,69 @@ out: return false; } -static void putback_movable_folio(struct folio *folio) +/** + * putback_movable_ops_page - putback an isolated movable_ops page + * @page: The isolated page. + * + * Putback an isolated movable_ops page. + * + * After the page was putback, it might get freed instantly. + */ +static void putback_movable_ops_page(struct page *page) { - const struct movable_operations *mops = folio_movable_ops(folio); + /* + * TODO: these pages will not be folios in the future. All + * folio dependencies will have to be removed. + */ + struct folio *folio = page_folio(page); - mops->putback_page(&folio->page); - folio_clear_isolated(folio); + VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page); + VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(page), page); + folio_lock(folio); + page_movable_ops(page)->putback_page(page); + ClearPageMovableOpsIsolated(page); + folio_unlock(folio); + folio_put(folio); +} + +/** + * migrate_movable_ops_page - migrate an isolated movable_ops page + * @dst: The destination page. + * @src: The source page. + * @mode: The migration mode. + * + * Migrate an isolated movable_ops page. + * + * If the src page was already released by its owner, the src page is + * un-isolated (putback) and migration succeeds; the migration core will be the + * owner of both pages. + * + * If the src page was not released by its owner and the migration was + * successful, the owner of the src page and the dst page are swapped and + * the src page is un-isolated. + * + * If migration fails, the ownership stays unmodified and the src page + * remains isolated: migration may be retried later or the page can be putback. + * + * TODO: migration core will treat both pages as folios and lock them before + * this call to unlock them after this call. Further, the folio refcounts on + * src and dst are also released by migration core. These pages will not be + * folios in the future, so that must be reworked. + * + * Returns MIGRATEPAGE_SUCCESS on success, otherwise a negative error + * code. 
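The isolate/putback/migrate callbacks documented above are dispatched purely from the page type, and the "isolated" marker is owned by the migration core rather than the driver. A stand-alone model of that dispatch and of the isolated-flag handshake (all names invented, only a balloon-like case wired up) might look like this:

/*
 * Stand-alone sketch, not kernel code: movable_ops-style dispatch with
 * a core-owned "isolated" flag.
 */
#include <stdio.h>
#include <stdbool.h>

enum page_type { PAGE_NORMAL, PAGE_BALLOON };

struct page;

struct movable_ops {
	bool (*isolate)(struct page *page);
	int  (*migrate)(struct page *dst, struct page *src);
	void (*putback)(struct page *page);
};

struct page {
	enum page_type type;
	bool isolated;          /* owned by the migration core, not the driver */
};

static bool balloon_isolate(struct page *p) { (void)p; return true; }
static int  balloon_migrate(struct page *dst, struct page *src)
{ (void)dst; (void)src; return 0; }
static void balloon_putback(struct page *p) { (void)p; }

static const struct movable_ops balloon_ops = {
	.isolate = balloon_isolate,
	.migrate = balloon_migrate,
	.putback = balloon_putback,
};

/* Type -> ops lookup; NULL means "not a movable_ops page". */
static const struct movable_ops *page_movable_ops(const struct page *p)
{
	return p->type == PAGE_BALLOON ? &balloon_ops : NULL;
}

static bool isolate_page(struct page *p)
{
	const struct movable_ops *ops = page_movable_ops(p);

	if (!ops || p->isolated || !ops->isolate(p))
		return false;
	p->isolated = true;     /* only the core flips the flag */
	return true;
}

static int migrate_page(struct page *dst, struct page *src)
{
	int rc = page_movable_ops(src)->migrate(dst, src);

	if (rc == 0)
		src->isolated = false;   /* success: no longer isolated */
	return rc;                       /* failure: stays isolated    */
}

int main(void)
{
	struct page src = { .type = PAGE_BALLOON }, dst = { 0 };

	if (isolate_page(&src)) {
		int rc = migrate_page(&dst, &src);

		printf("migrate rc=%d, src still isolated=%d\n", rc, src.isolated);
	}
	return 0;
}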
+ */ +static int migrate_movable_ops_page(struct page *dst, struct page *src, + enum migrate_mode mode) +{ + int rc = MIGRATEPAGE_SUCCESS; + + VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(src), src); + VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(src), src); + rc = page_movable_ops(src)->migrate_page(dst, src, mode); + if (rc == MIGRATEPAGE_SUCCESS) + ClearPageMovableOpsIsolated(src); + return rc; } /* @@ -141,20 +244,8 @@ void putback_movable_pages(struct list_head *l) continue; } list_del(&folio->lru); - /* - * We isolated non-lru movable folio so here we can use - * __folio_test_movable because LRU folio's mapping cannot - * have PAGE_MAPPING_MOVABLE. - */ - if (unlikely(__folio_test_movable(folio))) { - VM_BUG_ON_FOLIO(!folio_test_isolated(folio), folio); - folio_lock(folio); - if (folio_test_movable(folio)) - putback_movable_folio(folio); - else - folio_clear_isolated(folio); - folio_unlock(folio); - folio_put(folio); + if (unlikely(page_has_movable_ops(&folio->page))) { + putback_movable_ops_page(&folio->page); } else { node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), -folio_nr_pages(folio)); @@ -166,26 +257,20 @@ void putback_movable_pages(struct list_head *l) /* Must be called with an elevated refcount on the non-hugetlb folio */ bool isolate_folio_to_list(struct folio *folio, struct list_head *list) { - bool isolated, lru; - if (folio_test_hugetlb(folio)) return folio_isolate_hugetlb(folio, list); - lru = !__folio_test_movable(folio); - if (lru) - isolated = folio_isolate_lru(folio); - else - isolated = isolate_movable_page(&folio->page, - ISOLATE_UNEVICTABLE); - - if (!isolated) - return false; - - list_add(&folio->lru, list); - if (lru) + if (page_has_movable_ops(&folio->page)) { + if (!isolate_movable_ops_page(&folio->page, + ISOLATE_UNEVICTABLE)) + return false; + } else { + if (!folio_isolate_lru(folio)) + return false; node_stat_add_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio)); - + } + list_add(&folio->lru, list); return true; } @@ -445,20 +530,6 @@ unlock: } #endif -static int folio_expected_refs(struct address_space *mapping, - struct folio *folio) -{ - int refs = 1; - if (!mapping) - return refs; - - refs += folio_nr_pages(folio); - if (folio_test_private(folio)) - refs++; - - return refs; -} - /* * Replace the folio in the mapping. 
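The removed folio_expected_refs() helper encoded a small piece of arithmetic that the later hunks express as folio_expected_ref_count() + 1, the +1 being the reference the migration caller itself holds. A stand-alone model of that expectation, with a toy folio struct invented here, is:

/*
 * Stand-alone sketch, not kernel code: the reference-count expectation
 * that the removed folio_expected_refs() encoded.
 */
#include <stdio.h>
#include <stdbool.h>

struct folio {
	int  refcount;
	int  nr_pages;      /* folio_nr_pages()            */
	bool has_mapping;   /* belongs to an address_space */
	bool has_private;   /* buffers at folio->private   */
};

static int expected_refs(const struct folio *f)
{
	int refs = 1;                   /* the caller's own reference      */

	if (!f->has_mapping)
		return refs;            /* anonymous-like: nothing else    */
	refs += f->nr_pages;            /* one per page in the page cache  */
	if (f->has_private)
		refs += 1;              /* one for attached buffers        */
	return refs;
}

int main(void)
{
	struct folio f = { .refcount = 6, .nr_pages = 4,
			   .has_mapping = true, .has_private = true };

	/* Migration bails out with -EAGAIN when someone holds extra refs. */
	printf("expected %d, actual %d -> %s\n", expected_refs(&f), f.refcount,
	       expected_refs(&f) == f.refcount ? "migrate" : "-EAGAIN");
	return 0;
}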
* @@ -601,7 +672,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, int folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int extra_count) { - int expected_count = folio_expected_refs(mapping, folio) + extra_count; + int expected_count = folio_expected_ref_count(folio) + extra_count + 1; if (folio_ref_count(folio) != expected_count) return -EAGAIN; @@ -618,7 +689,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src) { XA_STATE(xas, &mapping->i_pages, folio_index(src)); - int rc, expected_count = folio_expected_refs(mapping, src); + int rc, expected_count = folio_expected_ref_count(src) + 1; if (folio_ref_count(src) != expected_count) return -EAGAIN; @@ -749,7 +820,7 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, void *src_private, enum migrate_mode mode) { - int rc, expected_count = folio_expected_refs(mapping, src); + int rc, expected_count = folio_expected_ref_count(src) + 1; /* Check whether src does not have extra refs before we do more work */ if (folio_ref_count(src) != expected_count) @@ -837,7 +908,7 @@ static int __buffer_migrate_folio(struct address_space *mapping, return migrate_folio(mapping, dst, src, mode); /* Check whether page does not have extra refs before we do more work */ - expected_count = folio_expected_refs(mapping, src); + expected_count = folio_expected_ref_count(src) + 1; if (folio_ref_count(src) != expected_count) return -EAGAIN; @@ -845,9 +916,11 @@ static int __buffer_migrate_folio(struct address_space *mapping, return -EAGAIN; if (check_refs) { - bool busy; + bool busy, migrating; bool invalidated = false; + migrating = test_and_set_bit_lock(BH_Migrate, &head->b_state); + VM_WARN_ON_ONCE(migrating); recheck_buffers: busy = false; spin_lock(&mapping->i_private_lock); @@ -859,12 +932,12 @@ recheck_buffers: } bh = bh->b_this_page; } while (bh != head); + spin_unlock(&mapping->i_private_lock); if (busy) { if (invalidated) { rc = -EAGAIN; goto unlock_buffers; } - spin_unlock(&mapping->i_private_lock); invalidate_bh_lrus(); invalidated = true; goto recheck_buffers; @@ -883,7 +956,7 @@ recheck_buffers: unlock_buffers: if (check_refs) - spin_unlock(&mapping->i_private_lock); + clear_bit_unlock(BH_Migrate, &head->b_state); bh = head; do { unlock_buffer(bh); @@ -945,66 +1018,20 @@ int filemap_migrate_folio(struct address_space *mapping, EXPORT_SYMBOL_GPL(filemap_migrate_folio); /* - * Writeback a folio to clean the dirty state - */ -static int writeout(struct address_space *mapping, struct folio *folio) -{ - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .nr_to_write = 1, - .range_start = 0, - .range_end = LLONG_MAX, - .for_reclaim = 1 - }; - int rc; - - if (!mapping->a_ops->writepage) - /* No write method for the address space */ - return -EINVAL; - - if (!folio_clear_dirty_for_io(folio)) - /* Someone else already triggered a write */ - return -EAGAIN; - - /* - * A dirty folio may imply that the underlying filesystem has - * the folio on some queue. So the folio must be clean for - * migration. Writeout may mean we lose the lock and the - * folio state is no longer what we checked for earlier. - * At this point we know that the migration attempt cannot - * be successful. - */ - remove_migration_ptes(folio, folio, 0); - - rc = mapping->a_ops->writepage(&folio->page, &wbc); - - if (rc != AOP_WRITEPAGE_ACTIVATE) - /* unlocked. Relock */ - folio_lock(folio); - - return (rc < 0) ? 
-EIO : -EAGAIN; -} - -/* * Default handling if a filesystem does not provide a migration function. */ static int fallback_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { - if (folio_test_dirty(src)) { - /* Only writeback folios in full synchronous migration */ - switch (mode) { - case MIGRATE_SYNC: - break; - default: - return -EBUSY; - } - return writeout(mapping, src); - } + WARN_ONCE(mapping->a_ops->writepages, + "%ps does not implement migrate_folio\n", + mapping->a_ops); + if (folio_test_dirty(src)) + return -EBUSY; /* - * Buffers may be managed in a filesystem specific way. - * We must have no buffers or drop them. + * Filesystem may have private data at folio->private that we + * can't migrate automatically. */ if (!filemap_release_folio(src, GFP_KERNEL)) return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; @@ -1013,11 +1040,12 @@ static int fallback_migrate_folio(struct address_space *mapping, } /* - * Move a page to a newly allocated page - * The page is locked and all ptes have been successfully removed. + * Move a src folio to a newly allocated dst folio. + * + * The src and dst folios are locked and the src folios was unmapped from + * the page tables. * - * The new page will have replaced the old page if this function - * is successful. + * On success, the src folio was replaced by the dst folio. * * Return value: * < 0 - error code @@ -1026,78 +1054,40 @@ static int fallback_migrate_folio(struct address_space *mapping, static int move_to_new_folio(struct folio *dst, struct folio *src, enum migrate_mode mode) { + struct address_space *mapping = folio_mapping(src); int rc = -EAGAIN; - bool is_lru = !__folio_test_movable(src); VM_BUG_ON_FOLIO(!folio_test_locked(src), src); VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst); - if (likely(is_lru)) { - struct address_space *mapping = folio_mapping(src); - - if (!mapping) - rc = migrate_folio(mapping, dst, src, mode); - else if (mapping_inaccessible(mapping)) - rc = -EOPNOTSUPP; - else if (mapping->a_ops->migrate_folio) - /* - * Most folios have a mapping and most filesystems - * provide a migrate_folio callback. Anonymous folios - * are part of swap space which also has its own - * migrate_folio callback. This is the most common path - * for page migration. - */ - rc = mapping->a_ops->migrate_folio(mapping, dst, src, - mode); - else - rc = fallback_migrate_folio(mapping, dst, src, mode); - } else { - const struct movable_operations *mops; - + if (!mapping) + rc = migrate_folio(mapping, dst, src, mode); + else if (mapping_inaccessible(mapping)) + rc = -EOPNOTSUPP; + else if (mapping->a_ops->migrate_folio) /* - * In case of non-lru page, it could be released after - * isolation step. In that case, we shouldn't try migration. + * Most folios have a mapping and most filesystems + * provide a migrate_folio callback. Anonymous folios + * are part of swap space which also has its own + * migrate_folio callback. This is the most common path + * for page migration. 
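With writeout() gone, fallback_migrate_folio() above refuses dirty folios outright, and the rewritten move_to_new_folio() picks the migration callback in a fixed order. Reduced to a plain decision table (toy struct invented here, errno values matching the hunk above), that order is roughly:

/*
 * Stand-alone sketch, not kernel code: callback-selection order of the
 * reworked move_to_new_folio()/fallback path.
 */
#include <stdio.h>
#include <stdbool.h>
#include <errno.h>

struct mapping_model {
	bool inaccessible;       /* mapping_inaccessible()        */
	bool has_migrate_folio;  /* a_ops->migrate_folio provided */
	bool dirty;              /* folio_test_dirty()            */
};

/* Returns 0 for "would migrate", otherwise a negative errno-style code. */
static int pick_migration_path(const struct mapping_model *m)
{
	if (!m)
		return 0;                 /* no mapping: generic migrate_folio() */
	if (m->inaccessible)
		return -EOPNOTSUPP;       /* mappings the core may not touch    */
	if (m->has_migrate_folio)
		return 0;                 /* filesystem-provided callback       */
	/* Fallback path: dirty folios are no longer written out here. */
	return m->dirty ? -EBUSY : 0;
}

int main(void)
{
	struct mapping_model no_cb_dirty = { .dirty = true };

	printf("anon: %d\n", pick_migration_path(NULL));
	printf("no callback + dirty: %d\n", pick_migration_path(&no_cb_dirty));
	return 0;
}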
*/ - VM_BUG_ON_FOLIO(!folio_test_isolated(src), src); - if (!folio_test_movable(src)) { - rc = MIGRATEPAGE_SUCCESS; - folio_clear_isolated(src); - goto out; - } - - mops = folio_movable_ops(src); - rc = mops->migrate_page(&dst->page, &src->page, mode); - WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && - !folio_test_isolated(src)); - } + rc = mapping->a_ops->migrate_folio(mapping, dst, src, + mode); + else + rc = fallback_migrate_folio(mapping, dst, src, mode); - /* - * When successful, old pagecache src->mapping must be cleared before - * src is freed; but stats require that PageAnon be left as PageAnon. - */ if (rc == MIGRATEPAGE_SUCCESS) { - if (__folio_test_movable(src)) { - VM_BUG_ON_FOLIO(!folio_test_isolated(src), src); - - /* - * We clear PG_movable under page_lock so any compactor - * cannot try to migrate this page. - */ - folio_clear_isolated(src); - } - /* - * Anonymous and movable src->mapping will be cleared by - * free_pages_prepare so don't reset it here for keeping - * the type to work PageAnon, for example. + * For pagecache folios, src->mapping must be cleared before src + * is freed. Anonymous folios must stay anonymous until freed. */ - if (!folio_mapping_flags(src)) + if (!folio_test_anon(src)) src->mapping = NULL; if (likely(!folio_is_zone_device(dst))) flush_dcache_folio(dst); } -out: return rc; } @@ -1164,12 +1154,7 @@ static void migrate_folio_undo_dst(struct folio *dst, bool locked, static void migrate_folio_done(struct folio *src, enum migrate_reason reason) { - /* - * Compaction can migrate also non-LRU pages which are - * not accounted to NR_ISOLATED_*. They can be recognized - * as __folio_test_movable - */ - if (likely(!__folio_test_movable(src)) && reason != MR_DEMOTION) + if (likely(!page_has_movable_ops(&src->page)) && reason != MR_DEMOTION) mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + folio_is_file_lru(src), -folio_nr_pages(src)); @@ -1188,7 +1173,6 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, int rc = -EAGAIN; int old_page_state = 0; struct anon_vma *anon_vma = NULL; - bool is_lru = data_race(!__folio_test_movable(src)); bool locked = false; bool dst_locked = false; @@ -1289,7 +1273,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, goto out; dst_locked = true; - if (unlikely(!is_lru)) { + if (unlikely(page_has_movable_ops(&src->page))) { __migrate_folio_record(dst, old_page_state, anon_vma); return MIGRATEPAGE_UNMAP; } @@ -1348,20 +1332,23 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, int rc; int old_page_state = 0; struct anon_vma *anon_vma = NULL; - bool is_lru = !__folio_test_movable(src); struct list_head *prev; __migrate_folio_extract(dst, &old_page_state, &anon_vma); prev = dst->lru.prev; list_del(&dst->lru); + if (unlikely(page_has_movable_ops(&src->page))) { + rc = migrate_movable_ops_page(&dst->page, &src->page, mode); + if (rc) + goto out; + goto out_unlock_both; + } + rc = move_to_new_folio(dst, src, mode); if (rc) goto out; - if (unlikely(!is_lru)) - goto out_unlock_both; - /* * When successful, push dst to LRU immediately: so that if it * turns out to be an mlocked page, remove_migration_ptes() will @@ -1380,7 +1367,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, out_unlock_both: folio_unlock(dst); - set_page_owner_migrate_reason(&dst->page, reason); + folio_set_owner_migrate_reason(dst, reason); /* * If migration is successful, decrease refcount of dst, * which will not free the page because new page owner increased @@ -2376,13 
+2363,6 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, } /* - * The move_pages() man page does not have an -EEXIST choice, so - * use -EFAULT instead. - */ - if (err == -EEXIST) - err = -EFAULT; - - /* * If the page is already on the target node (!err), store the * node, otherwise, store the err. */ @@ -2456,6 +2436,7 @@ set_status: static int get_compat_pages_array(const void __user *chunk_pages[], const void __user * __user *pages, + unsigned long chunk_offset, unsigned long chunk_nr) { compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages; @@ -2463,7 +2444,7 @@ static int get_compat_pages_array(const void __user *chunk_pages[], int i; for (i = 0; i < chunk_nr; i++) { - if (get_user(p, pages32 + i)) + if (get_user(p, pages32 + chunk_offset + i)) return -EFAULT; chunk_pages[i] = compat_ptr(p); } @@ -2482,27 +2463,28 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, #define DO_PAGES_STAT_CHUNK_NR 16UL const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; int chunk_status[DO_PAGES_STAT_CHUNK_NR]; + unsigned long chunk_offset = 0; while (nr_pages) { unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR); if (in_compat_syscall()) { if (get_compat_pages_array(chunk_pages, pages, - chunk_nr)) + chunk_offset, chunk_nr)) break; } else { - if (copy_from_user(chunk_pages, pages, + if (copy_from_user(chunk_pages, pages + chunk_offset, chunk_nr * sizeof(*chunk_pages))) break; } do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); - if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status))) + if (copy_to_user(status + chunk_offset, chunk_status, + chunk_nr * sizeof(*status))) break; - pages += chunk_nr; - status += chunk_nr; + chunk_offset += chunk_nr; nr_pages -= chunk_nr; } return nr_pages ? 
-EFAULT : 0; diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 3158afe7eb23..e05e14d6eacd 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -615,7 +615,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, pmdp = pmd_alloc(mm, pudp, addr); if (!pmdp) goto abort; - if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) + if (pmd_trans_huge(*pmdp)) goto abort; if (pte_alloc(mm, pmdp)) goto abort; diff --git a/mm/mincore.c b/mm/mincore.c index 832f29f46767..42d6c9c8da86 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -21,6 +21,7 @@ #include <linux/uaccess.h> #include "swap.h" +#include "internal.h" static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -105,6 +106,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *ptep; unsigned char *vec = walk->private; int nr = (end - addr) >> PAGE_SHIFT; + int step, i; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -118,16 +120,26 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, walk->action = ACTION_AGAIN; return 0; } - for (; addr != end; ptep++, addr += PAGE_SIZE) { + for (; addr != end; ptep += step, addr += step * PAGE_SIZE) { pte_t pte = ptep_get(ptep); + step = 1; /* We need to do cache lookup too for pte markers */ if (pte_none_mostly(pte)) __mincore_unmapped_range(addr, addr + PAGE_SIZE, vma, vec); - else if (pte_present(pte)) - *vec = 1; - else { /* pte is a swap entry */ + else if (pte_present(pte)) { + unsigned int batch = pte_batch_hint(ptep, pte); + + if (batch > 1) { + unsigned int max_nr = (end - addr) >> PAGE_SHIFT; + + step = min_t(unsigned int, batch, max_nr); + } + + for (i = 0; i < step; i++) + vec[i] = 1; + } else { /* pte is a swap entry */ swp_entry_t entry = pte_to_swp_entry(pte); if (non_swap_entry(entry)) { @@ -146,7 +158,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, #endif } } - vec++; + vec += step; } pte_unmap_unlock(ptep - 1, ptl); out: diff --git a/mm/mlock.c b/mm/mlock.c index 3cb72b579ffd..a1d93ad33c6d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -307,15 +307,13 @@ void munlock_folio(struct folio *folio) static inline unsigned int folio_mlock_step(struct folio *folio, pte_t *pte, unsigned long addr, unsigned long end) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; unsigned int count = (end - addr) >> PAGE_SHIFT; pte_t ptent = ptep_get(pte); if (!folio_test_large(folio)) return 1; - return folio_pte_batch(folio, addr, pte, ptent, count, fpb_flags, NULL, - NULL, NULL); + return folio_pte_batch(folio, pte, ptent, count); } static inline bool allow_mlock_munlock(struct folio *folio, diff --git a/mm/mm_init.c b/mm/mm_init.c index a38a1909b407..5c21b3af216b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -30,6 +30,7 @@ #include <linux/crash_dump.h> #include <linux/execmem.h> #include <linux/vmstat.h> +#include <linux/kexec_handover.h> #include <linux/hugetlb.h> #include "internal.h" #include "slab.h" @@ -684,7 +685,8 @@ void __meminit __init_page_from_nid(unsigned long pfn, int nid) __init_single_page(pfn_to_page(pfn), pfn, zid, nid); if (pageblock_aligned(pfn)) - set_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE); + init_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE, + false); } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -743,7 +745,7 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } -static void __meminit init_deferred_page(unsigned long 
pfn, int nid) +static void __meminit __init_deferred_page(unsigned long pfn, int nid) { if (early_page_initialised(pfn, nid)) return; @@ -763,11 +765,16 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } -static inline void init_deferred_page(unsigned long pfn, int nid) +static inline void __init_deferred_page(unsigned long pfn, int nid) { } #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ +void __meminit init_deferred_page(unsigned long pfn, int nid) +{ + __init_deferred_page(pfn, nid); +} + /* * Initialised pages do not have PageReserved set. This function is * called for each range allocated by the bootmem allocator and @@ -777,22 +784,19 @@ static inline void init_deferred_page(unsigned long pfn, int nid) void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end, int nid) { - unsigned long start_pfn = PFN_DOWN(start); - unsigned long end_pfn = PFN_UP(end); + unsigned long pfn; - for (; start_pfn < end_pfn; start_pfn++) { - if (pfn_valid(start_pfn)) { - struct page *page = pfn_to_page(start_pfn); + for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) { + struct page *page = pfn_to_page(pfn); - init_deferred_page(start_pfn, nid); + __init_deferred_page(pfn, nid); - /* - * no need for atomic set_bit because the struct - * page is not visible yet so nobody should - * access it yet. - */ - __SetPageReserved(page); - } + /* + * no need for atomic set_bit because the struct + * page is not visible yet so nobody should + * access it yet. + */ + __SetPageReserved(page); } } @@ -828,7 +832,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) * - physical memory bank size is not necessarily the exact multiple of the * arbitrary section size * - early reserved memory may not be listed in memblock.memory - * - non-memory regions covered by the contigious flatmem mapping + * - non-memory regions covered by the contiguous flatmem mapping * - memory layouts defined with memmap= kernel parameter may not align * nicely with memmap sections * @@ -848,11 +852,7 @@ static void __init init_unavailable_range(unsigned long spfn, unsigned long pfn; u64 pgcnt = 0; - for (pfn = spfn; pfn < epfn; pfn++) { - if (!pfn_valid(pageblock_start_pfn(pfn))) { - pfn = pageblock_end_pfn(pfn) - 1; - continue; - } + for_each_valid_pfn(pfn, spfn, epfn) { __init_single_page(pfn_to_page(pfn), pfn, zone, node); __SetPageReserved(pfn_to_page(pfn)); pgcnt++; @@ -875,7 +875,8 @@ static void __init init_unavailable_range(unsigned long spfn, void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, unsigned long zone_end_pfn, enum meminit_context context, - struct vmem_altmap *altmap, int migratetype) + struct vmem_altmap *altmap, int migratetype, + bool isolate_pageblock) { unsigned long pfn, end_pfn = start_pfn + size; struct page *page; @@ -932,7 +933,8 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone * over the place during system boot. 
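Looking back at the do_pages_stat() fix a few hunks earlier: instead of advancing some user pointers while leaving the compat array un-offset, every array is now indexed with one running chunk_offset. The pattern, with plain arrays standing in for the user-space buffers, is simply:

/*
 * Stand-alone sketch, not kernel code: process a long array in fixed-size
 * chunks, indexing input and output with the same running offset.
 */
#include <stdio.h>

#define CHUNK 16UL

static void process_chunk(const int *in, int *out, unsigned long n)
{
	for (unsigned long i = 0; i < n; i++)
		out[i] = in[i] * 2;      /* stand-in for the per-page status lookup */
}

int main(void)
{
	int pages[40], status[40];
	unsigned long nr = 40, offset = 0;

	for (unsigned long i = 0; i < nr; i++)
		pages[i] = (int)i;

	while (nr) {
		unsigned long chunk = nr < CHUNK ? nr : CHUNK;

		/* both arrays are indexed with the same offset */
		process_chunk(pages + offset, status + offset, chunk);
		offset += chunk;
		nr -= chunk;
	}
	printf("status[39] = %d\n", status[39]);   /* 78 */
	return 0;
}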
*/ if (pageblock_aligned(pfn)) { - set_pageblock_migratetype(page, migratetype); + init_pageblock_migratetype(page, migratetype, + isolate_pageblock); cond_resched(); } pfn++; @@ -955,7 +957,8 @@ static void __init memmap_init_zone_range(struct zone *zone, return; memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn, - zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); + zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE, + false); if (*hole_pfn < start_pfn) init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid); @@ -984,19 +987,19 @@ static void __init memmap_init(void) } } -#ifdef CONFIG_SPARSEMEM /* * Initialize the memory map for hole in the range [memory_end, - * section_end]. + * section_end] for SPARSEMEM and in the range [memory_end, memmap_end] + * for FLATMEM. * Append the pages in this hole to the highest zone in the last * node. - * The call to init_unavailable_range() is outside the ifdef to - * silence the compiler warining about zone_id set but not used; - * for FLATMEM it is a nop anyway */ +#ifdef CONFIG_SPARSEMEM end_pfn = round_up(end_pfn, PAGES_PER_SECTION); - if (hole_pfn < end_pfn) +#else + end_pfn = round_up(end_pfn, MAX_ORDER_NR_PAGES); #endif + if (hole_pfn < end_pfn) init_unavailable_range(hole_pfn, end_pfn, zone_id, nid); } @@ -1036,7 +1039,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, * because this is done early in section_activate() */ if (pageblock_aligned(pfn)) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + init_pageblock_migratetype(page, MIGRATE_MOVABLE, false); cond_resched(); } @@ -1510,7 +1513,7 @@ static inline void setup_usemap(struct zone *zone) {} /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ void __init set_pageblock_order(void) { - unsigned int order = MAX_PAGE_ORDER; + unsigned int order = PAGE_BLOCK_MAX_ORDER; /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) @@ -1785,7 +1788,7 @@ static bool arch_has_descending_max_zone_pfns(void) return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40); } -static void set_high_memory(void) +static void __init set_high_memory(void) { phys_addr_t highmem = memblock_end_of_DRAM(); @@ -1907,7 +1910,7 @@ void __init free_area_init(unsigned long *max_zone_pfn) free_area_init_node(nid); /* - * No sysfs hierarcy will be created via register_one_node() + * No sysfs hierarchy will be created via register_one_node() *for memory-less node because here it's not marked as N_MEMORY *and won't be set online later. 
The benefit is userspace *program won't be confused by sysfs files/directories of @@ -1997,7 +2000,8 @@ static void __init deferred_free_pages(unsigned long pfn, /* Free a large naturally-aligned chunk if possible */ if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) { for (i = 0; i < nr_pages; i += pageblock_nr_pages) - set_pageblock_migratetype(page + i, MIGRATE_MOVABLE); + init_pageblock_migratetype(page + i, MIGRATE_MOVABLE, + false); __free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY); return; } @@ -2007,7 +2011,8 @@ static void __init deferred_free_pages(unsigned long pfn, for (i = 0; i < nr_pages; i++, page++, pfn++) { if (pageblock_aligned(pfn)) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + init_pageblock_migratetype(page, MIGRATE_MOVABLE, + false); __free_pages_core(page, 0, MEMINIT_EARLY); } } @@ -2306,7 +2311,7 @@ void __init init_cma_reserved_pageblock(struct page *page) set_page_count(p, 0); } while (++p, --i); - set_pageblock_migratetype(page, MIGRATE_CMA); + init_pageblock_migratetype(page, MIGRATE_CMA, false); set_page_refcounted(page); /* pages were reserved and not allocated */ clear_page_tag_ref(page); @@ -2320,7 +2325,7 @@ void __init init_cma_reserved_pageblock(struct page *page) */ void __init init_cma_pageblock(struct page *page) { - set_pageblock_migratetype(page, MIGRATE_CMA); + init_pageblock_migratetype(page, MIGRATE_CMA, false); adjust_managed_page_count(page, pageblock_nr_pages); page_zone(page)->cma_pages += pageblock_nr_pages; } @@ -2667,12 +2672,6 @@ static void __init report_meminit(void) stack = "all(pattern)"; else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO)) stack = "all(zero)"; - else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL)) - stack = "byref_all(zero)"; - else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF)) - stack = "byref(zero)"; - else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER)) - stack = "__user(zero)"; else stack = "off"; @@ -2765,6 +2764,13 @@ void __init mm_core_init(void) report_meminit(); kmsan_init_shadow(); stack_depot_early_init(); + + /* + * KHO memory setup must happen while memblock is still active, but + * as close as possible to buddy initialization + */ + kho_memory_init(); + memblock_free_all(); mem_init(); kmem_cache_init(); diff --git a/mm/mmap.c b/mm/mmap.c index bd210aaf7ebd..7306253cc3b5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -80,7 +80,7 @@ core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); /* Update vma->vm_page_prot to reflect vma->vm_flags. 
*/ void vma_set_page_prot(struct vm_area_struct *vma) { - unsigned long vm_flags = vma->vm_flags; + vm_flags_t vm_flags = vma->vm_flags; pgprot_t vm_page_prot; vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); @@ -127,18 +127,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) origbrk = mm->brk; + min_brk = mm->start_brk; #ifdef CONFIG_COMPAT_BRK /* * CONFIG_COMPAT_BRK can still be overridden by setting * randomize_va_space to 2, which will still cause mm->start_brk * to be arbitrarily shifted */ - if (current->brk_randomized) - min_brk = mm->start_brk; - else + if (!current->brk_randomized) min_brk = mm->end_data; -#else - min_brk = mm->start_brk; #endif if (brk < min_brk) goto out; @@ -228,12 +225,12 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return hint; } -bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, +bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags, unsigned long bytes) { unsigned long locked_pages, limit_pages; - if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) + if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) return true; locked_pages = bytes >> PAGE_SHIFT; @@ -475,7 +472,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags &= ~VM_MAYEXEC; } - if (!file->f_op->mmap) + if (!can_mmap_file(file)) return -ENODEV; if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; @@ -871,9 +868,8 @@ mm_get_unmapped_area(struct mm_struct *mm, struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - if (test_bit(MMF_TOPDOWN, &mm->flags)) - return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags, 0); - return arch_get_unmapped_area(file, addr, len, pgoff, flags, 0); + return mm_get_unmapped_area_vmflags(mm, file, addr, len, + pgoff, flags, 0); } EXPORT_SYMBOL(mm_get_unmapped_area); @@ -1207,7 +1203,7 @@ out: return ret; } -int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) +int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; @@ -1224,7 +1220,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) return 0; /* Until we need other flags, refuse anything except VM_EXEC. */ - if ((flags & (~VM_EXEC)) != 0) + if ((vm_flags & (~VM_EXEC)) != 0) return -EINVAL; if (mmap_write_lock_killable(mm)) @@ -1239,7 +1235,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) goto munmap_failed; vma = vma_prev(&vmi); - ret = do_brk_flags(&vmi, vma, addr, len, flags); + ret = do_brk_flags(&vmi, vma, addr, len, vm_flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); @@ -1321,48 +1317,6 @@ destroy: vm_unacct_memory(nr_accounted); } -/* Insert vm structure into process list sorted by address - * and into the inode's i_mmap tree. If vm_file is non-NULL - * then i_mmap_rwsem is taken here. - */ -int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) -{ - unsigned long charged = vma_pages(vma); - - - if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) - return -ENOMEM; - - if ((vma->vm_flags & VM_ACCOUNT) && - security_vm_enough_memory_mm(mm, charged)) - return -ENOMEM; - - /* - * The vm_pgoff of a purely anonymous vma should be irrelevant - * until its first write fault, when page's anon_vma and index - * are set. 
But now set the vm_pgoff it will almost certainly - * end up with (unless mremap moves it elsewhere before that - * first wfault), so /proc/pid/maps tells a consistent story. - * - * By setting it to reflect the virtual start address of the - * vma, merges and splits can happen in a seamless way, just - * using the existing file pgoff checks and manipulations. - * Similarly in do_mmap and in do_brk_flags. - */ - if (vma_is_anonymous(vma)) { - BUG_ON(vma->anon_vma); - vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; - } - - if (vma_link(mm, vma)) { - if (vma->vm_flags & VM_ACCOUNT) - vm_unacct_memory(charged); - return -ENOMEM; - } - - return 0; -} - /* * Return true if the calling process may expand its vm space by the passed * number of pages @@ -1486,7 +1440,7 @@ static vm_fault_t special_mapping_fault(struct vm_fault *vmf) static struct vm_area_struct *__install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, - unsigned long vm_flags, void *priv, + vm_flags_t vm_flags, void *priv, const struct vm_operations_struct *ops) { int ret; @@ -1538,7 +1492,7 @@ bool vma_is_special_mapping(const struct vm_area_struct *vma, struct vm_area_struct *_install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, - unsigned long vm_flags, const struct vm_special_mapping *spec) + vm_flags_t vm_flags, const struct vm_special_mapping *spec) { return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec, &special_mapping_vmops); @@ -1596,7 +1550,7 @@ static const struct ctl_table mmap_table[] = { #endif /* CONFIG_SYSCTL */ /* - * initialise the percpu counter for VM + * initialise the percpu counter for VM, initialise VMA state. */ void __init mmap_init(void) { @@ -1607,6 +1561,7 @@ void __init mmap_init(void) #ifdef CONFIG_SYSCTL register_sysctl_init("vm", mmap_table); #endif + vma_state_init(); } /* @@ -1718,90 +1673,6 @@ static int __meminit init_reserve_notifier(void) subsys_initcall(init_reserve_notifier); /* - * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between - * this VMA and its relocated range, which will now reside at [vma->vm_start - - * shift, vma->vm_end - shift). - * - * This function is almost certainly NOT what you want for anything other than - * early executable temporary stack relocation. - */ -int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) -{ - /* - * The process proceeds as follows: - * - * 1) Use shift to calculate the new vma endpoints. - * 2) Extend vma to cover both the old and new ranges. This ensures the - * arguments passed to subsequent functions are consistent. - * 3) Move vma's page tables to the new range. - * 4) Free up any cleared pgd range. - * 5) Shrink the vma to cover only the new range. 
- */ - - struct mm_struct *mm = vma->vm_mm; - unsigned long old_start = vma->vm_start; - unsigned long old_end = vma->vm_end; - unsigned long length = old_end - old_start; - unsigned long new_start = old_start - shift; - unsigned long new_end = old_end - shift; - VMA_ITERATOR(vmi, mm, new_start); - VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff); - struct vm_area_struct *next; - struct mmu_gather tlb; - PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length); - - BUG_ON(new_start > new_end); - - /* - * ensure there are no vmas between where we want to go - * and where we are - */ - if (vma != vma_next(&vmi)) - return -EFAULT; - - vma_iter_prev_range(&vmi); - /* - * cover the whole range: [new_start, old_end) - */ - vmg.middle = vma; - if (vma_expand(&vmg)) - return -ENOMEM; - - /* - * move the page tables downwards, on failure we rely on - * process cleanup to remove whatever mess we made. - */ - pmc.for_stack = true; - if (length != move_page_tables(&pmc)) - return -ENOMEM; - - tlb_gather_mmu(&tlb, mm); - next = vma_next(&vmi); - if (new_end > old_start) { - /* - * when the old and new regions overlap clear from new_end. - */ - free_pgd_range(&tlb, new_end, old_end, new_end, - next ? next->vm_start : USER_PGTABLES_CEILING); - } else { - /* - * otherwise, clean from old_start; this is done to not touch - * the address space in [new_end, old_start) some architectures - * have constraints on va-space that make this illegal (IA64) - - * for the others its just a little faster. - */ - free_pgd_range(&tlb, old_start, old_end, new_end, - next ? next->vm_start : USER_PGTABLES_CEILING); - } - tlb_finish_mmu(&tlb); - - vma_prev(&vmi); - /* Shrink the vma to just the new range */ - return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); -} - -#ifdef CONFIG_MMU -/* * Obtain a read lock on mm->mmap_lock, if the specified address is below the * start of the VMA, the intent is to perform a write, and it is a * downward-growing stack, then attempt to expand the stack to contain it. @@ -1844,10 +1715,175 @@ bool mmap_read_lock_maybe_expand(struct mm_struct *mm, mmap_write_downgrade(mm); return true; } -#else -bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, bool write) + +__latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - return false; + struct vm_area_struct *mpnt, *tmp; + int retval; + unsigned long charge = 0; + LIST_HEAD(uf); + VMA_ITERATOR(vmi, mm, 0); + + if (mmap_write_lock_killable(oldmm)) + return -EINTR; + flush_cache_dup_mm(oldmm); + uprobe_dup_mmap(oldmm, mm); + /* + * Not linked in yet - no deadlock potential: + */ + mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); + + /* No ordering required: file already has been exposed. */ + dup_mm_exe_file(mm, oldmm); + + mm->total_vm = oldmm->total_vm; + mm->data_vm = oldmm->data_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + + /* Use __mt_dup() to efficiently build an identical maple tree. 
*/ + retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); + if (unlikely(retval)) + goto out; + + mt_clear_in_rcu(vmi.mas.tree); + for_each_vma(vmi, mpnt) { + struct file *file; + + vma_start_write(mpnt); + if (mpnt->vm_flags & VM_DONTCOPY) { + retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, + mpnt->vm_end, GFP_KERNEL); + if (retval) + goto loop_out; + + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); + continue; + } + charge = 0; + /* + * Don't duplicate many vmas if we've been oom-killed (for + * example) + */ + if (fatal_signal_pending(current)) { + retval = -EINTR; + goto loop_out; + } + if (mpnt->vm_flags & VM_ACCOUNT) { + unsigned long len = vma_pages(mpnt); + + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ + goto fail_nomem; + charge = len; + } + + tmp = vm_area_dup(mpnt); + if (!tmp) + goto fail_nomem; + retval = vma_dup_policy(mpnt, tmp); + if (retval) + goto fail_nomem_policy; + tmp->vm_mm = mm; + retval = dup_userfaultfd(tmp, &uf); + if (retval) + goto fail_nomem_anon_vma_fork; + if (tmp->vm_flags & VM_WIPEONFORK) { + /* + * VM_WIPEONFORK gets a clean slate in the child. + * Don't prepare anon_vma until fault since we don't + * copy page for current vma. + */ + tmp->anon_vma = NULL; + } else if (anon_vma_fork(tmp, mpnt)) + goto fail_nomem_anon_vma_fork; + vm_flags_clear(tmp, VM_LOCKED_MASK); + /* + * Copy/update hugetlb private vma information. + */ + if (is_vm_hugetlb_page(tmp)) + hugetlb_dup_vma_private(tmp); + + /* + * Link the vma into the MT. After using __mt_dup(), memory + * allocation is not necessary here, so it cannot fail. + */ + vma_iter_bulk_store(&vmi, tmp); + + mm->map_count++; + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); + + file = tmp->vm_file; + if (file) { + struct address_space *mapping = file->f_mapping; + + get_file(file); + i_mmap_lock_write(mapping); + if (vma_is_shared_maywrite(tmp)) + mapping_allow_writable(mapping); + flush_dcache_mmap_lock(mapping); + /* insert tmp into the share list, just after mpnt */ + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + i_mmap_unlock_write(mapping); + } + + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(tmp, mpnt); + + if (retval) { + mpnt = vma_next(&vmi); + goto loop_out; + } + } + /* a new mm has just been created */ + retval = arch_dup_mmap(oldmm, mm); +loop_out: + vma_iter_free(&vmi); + if (!retval) { + mt_set_in_rcu(vmi.mas.tree); + ksm_fork(mm, oldmm); + khugepaged_fork(mm, oldmm); + } else { + + /* + * The entire maple tree has already been duplicated. If the + * mmap duplication fails, mark the failure point with + * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, + * stop releasing VMAs that have not been duplicated after this + * point. + */ + if (mpnt) { + mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); + mas_store(&vmi.mas, XA_ZERO_ENTRY); + /* Avoid OOM iterating a broken tree */ + set_bit(MMF_OOM_SKIP, &mm->flags); + } + /* + * The mm_struct is going to exit, but the locks will be dropped + * first. Set the mm_struct as unstable is advisable as it is + * not fully initialised. 
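When dup_mmap() above fails part-way, the failure point is marked with XA_ZERO_ENTRY so that teardown later stops exactly where duplication stopped. The same idea in a stand-alone toy, with an array copy and a sentinel pointer standing in for the maple tree and XA_ZERO_ENTRY:

/*
 * Stand-alone sketch, not kernel code: mark the failure point so the
 * cleanup pass only touches entries that were actually duplicated.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char sentinel;
#define SENTINEL (&sentinel)        /* stands in for XA_ZERO_ENTRY */

static int dup_all(char *dst[], char *const src[], int n, int fail_at)
{
	for (int i = 0; i < n; i++) {
		if (i == fail_at) {      /* simulated allocation failure */
			dst[i] = SENTINEL;
			return -1;
		}
		dst[i] = strdup(src[i]);
	}
	return 0;
}

static void cleanup(char *dst[], int n)
{
	/* stop at the marker: entries past it were never copied */
	for (int i = 0; i < n && dst[i] != SENTINEL; i++)
		free(dst[i]);
}

int main(void)
{
	char *src[] = { "a", "b", "c", "d" };
	char *dst[4] = { 0 };

	if (dup_all(dst, src, 4, 2))
		fprintf(stderr, "duplication failed, cleaning up partial copy\n");
	cleanup(dst, 4);
	return 0;
}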
+ */ + set_bit(MMF_UNSTABLE, &mm->flags); + } +out: + mmap_write_unlock(mm); + flush_tlb_mm(oldmm); + mmap_write_unlock(oldmm); + if (!retval) + dup_userfaultfd_complete(&uf); + else + dup_userfaultfd_fail(&uf); + return retval; + +fail_nomem_anon_vma_fork: + mpol_put(vma_policy(tmp)); +fail_nomem_policy: + vm_area_free(tmp); +fail_nomem: + retval = -ENOMEM; + vm_unacct_memory(charge); + goto loop_out; } -#endif diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index e7dbaf96aa17..729fb7d0dd59 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -42,3 +42,369 @@ void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write) } EXPORT_SYMBOL(__mmap_lock_do_trace_released); #endif /* CONFIG_TRACING */ + +#ifdef CONFIG_MMU +#ifdef CONFIG_PER_VMA_LOCK +static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching) +{ + unsigned int tgt_refcnt = VMA_LOCK_OFFSET; + + /* Additional refcnt if the vma is attached. */ + if (!detaching) + tgt_refcnt++; + + /* + * If vma is detached then only vma_mark_attached() can raise the + * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). + */ + if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) + return false; + + rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); + rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, + refcount_read(&vma->vm_refcnt) == tgt_refcnt, + TASK_UNINTERRUPTIBLE); + lock_acquired(&vma->vmlock_dep_map, _RET_IP_); + + return true; +} + +static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) +{ + *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt); + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); +} + +void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) +{ + bool locked; + + /* + * __vma_enter_locked() returns false immediately if the vma is not + * attached, otherwise it waits until refcnt is indicating that vma + * is attached with no readers. + */ + locked = __vma_enter_locked(vma, false); + + /* + * We should use WRITE_ONCE() here because we can have concurrent reads + * from the early lockless pessimistic check in vma_start_read(). + * We don't really care about the correctness of that early check, but + * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. + */ + WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); + + if (locked) { + bool detached; + + __vma_exit_locked(vma, &detached); + WARN_ON_ONCE(detached); /* vma should remain attached */ + } +} +EXPORT_SYMBOL_GPL(__vma_start_write); + +void vma_mark_detached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_attached(vma); + + /* + * We are the only writer, so no need to use vma_refcount_put(). + * The condition below is unlikely because the vma has been already + * write-locked and readers can increment vm_refcnt only temporarily + * before they check vm_lock_seq, realize the vma is locked and drop + * back the vm_refcnt. That is a narrow window for observing a raised + * vm_refcnt. + */ + if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { + /* Wait until vma is detached with no readers. */ + if (__vma_enter_locked(vma, true)) { + bool detached; + + __vma_exit_locked(vma, &detached); + WARN_ON_ONCE(!detached); + } + } +} + +/* + * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be + * stable and not isolated. If the VMA is not found or is being modified the + * function returns NULL. 
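The writer-side handshake in __vma_enter_locked() above is reference-count arithmetic: an attached VMA contributes one count, every lockless reader one more, and the writer parks a large bias on top and waits for the count to fall back to exactly that bias (plus one if the VMA stays attached). A single-threaded sketch of the target value, with an illustrative bias constant rather than the real one:

/*
 * Stand-alone sketch, not kernel code: the vm_refcnt target a writer
 * waits for.  The bias value below is illustrative only.
 */
#include <stdio.h>

#define WRITER_BIAS 0x40000000      /* large writer bias, illustrative */

static int writer_target(int attached)
{
	return WRITER_BIAS + (attached ? 1 : 0);
}

int main(void)
{
	int refcnt = 1;                    /* attached, no readers        */

	refcnt += 2;                       /* two lockless readers arrive */
	refcnt += WRITER_BIAS;             /* writer announces itself     */

	printf("writer waits while %d != %d\n", refcnt, writer_target(1));
	refcnt -= 2;                       /* readers see the writer and back off */
	printf("writer proceeds when %d == %d\n", refcnt, writer_target(1));
	return 0;
}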
+ */ +struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, + unsigned long address) +{ + MA_STATE(mas, &mm->mm_mt, address, address); + struct vm_area_struct *vma; + + rcu_read_lock(); +retry: + vma = mas_walk(&mas); + if (!vma) + goto inval; + + vma = vma_start_read(mm, vma); + if (IS_ERR_OR_NULL(vma)) { + /* Check if the VMA got isolated after we found it */ + if (PTR_ERR(vma) == -EAGAIN) { + count_vm_vma_lock_event(VMA_LOCK_MISS); + /* The area was replaced with another one */ + goto retry; + } + + /* Failed to lock the VMA */ + goto inval; + } + /* + * At this point, we have a stable reference to a VMA: The VMA is + * locked and we know it hasn't already been isolated. + * From here on, we can access the VMA without worrying about which + * fields are accessible for RCU readers. + */ + + /* Check if the vma we locked is the right one. */ + if (unlikely(vma->vm_mm != mm || + address < vma->vm_start || address >= vma->vm_end)) + goto inval_end_read; + + rcu_read_unlock(); + return vma; + +inval_end_read: + vma_end_read(vma); +inval: + rcu_read_unlock(); + count_vm_vma_lock_event(VMA_LOCK_ABORT); + return NULL; +} + +static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm, + struct vma_iterator *vmi, + unsigned long from_addr) +{ + struct vm_area_struct *vma; + int ret; + + ret = mmap_read_lock_killable(mm); + if (ret) + return ERR_PTR(ret); + + /* Lookup the vma at the last position again under mmap_read_lock */ + vma_iter_set(vmi, from_addr); + vma = vma_next(vmi); + if (vma) { + /* Very unlikely vma->vm_refcnt overflow case */ + if (unlikely(!vma_start_read_locked(vma))) + vma = ERR_PTR(-EAGAIN); + } + + mmap_read_unlock(mm); + + return vma; +} + +struct vm_area_struct *lock_next_vma(struct mm_struct *mm, + struct vma_iterator *vmi, + unsigned long from_addr) +{ + struct vm_area_struct *vma; + unsigned int mm_wr_seq; + bool mmap_unlocked; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held"); +retry: + /* Start mmap_lock speculation in case we need to verify the vma later */ + mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq); + vma = vma_next(vmi); + if (!vma) + return NULL; + + vma = vma_start_read(mm, vma); + if (IS_ERR_OR_NULL(vma)) { + /* + * Retry immediately if the vma gets detached from under us. + * Infinite loop should not happen because the vma we find will + * have to be constantly knocked out from under us. + */ + if (PTR_ERR(vma) == -EAGAIN) { + /* reset to search from the last address */ + vma_iter_set(vmi, from_addr); + goto retry; + } + + goto fallback; + } + + /* + * Verify the vma we locked belongs to the same address space and it's + * not behind of the last search position. + */ + if (unlikely(vma->vm_mm != mm || from_addr >= vma->vm_end)) + goto fallback_unlock; + + /* + * vma can be ahead of the last search position but we need to verify + * it was not shrunk after we found it and another vma has not been + * installed ahead of it. Otherwise we might observe a gap that should + * not be there. + */ + if (from_addr < vma->vm_start) { + /* Verify only if the address space might have changed since vma lookup. 
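mmap_lock_speculate_try_begin()/mmap_lock_speculate_retry() in lock_next_vma() above follow the familiar sequence-count pattern: sample a counter before the lockless walk and re-validate the result only if a writer may have bumped it in the meantime. Stripped of the mm specifics:

/*
 * Stand-alone sketch, not kernel code: sequence-count speculation with
 * a retry check.  No real concurrency here.
 */
#include <stdio.h>
#include <stdbool.h>

static unsigned int mm_seq;            /* bumped by every "writer"      */

static bool speculate_begin(unsigned int *seq)
{
	*seq = mm_seq;
	return (*seq & 1) == 0;         /* odd would mean writer in progress */
}

static bool speculate_retry(unsigned int seq)
{
	return mm_seq != seq;           /* changed: result must be re-checked */
}

int main(void)
{
	unsigned int seq;
	bool ok = speculate_begin(&seq);

	/* ... lockless VMA walk would happen here ... */
	mm_seq += 2;                    /* a concurrent writer slipped in */

	if (!ok || speculate_retry(seq))
		printf("speculation failed, fall back to the locked lookup\n");
	return 0;
}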
*/ + if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) { + vma_iter_set(vmi, from_addr); + if (vma != vma_next(vmi)) + goto fallback_unlock; + } + } + + return vma; + +fallback_unlock: + vma_end_read(vma); +fallback: + rcu_read_unlock(); + vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr); + rcu_read_lock(); + /* Reinitialize the iterator after re-entering rcu read section */ + vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end); + + return vma; +} +#endif /* CONFIG_PER_VMA_LOCK */ + +#ifdef CONFIG_LOCK_MM_AND_FIND_VMA +#include <linux/extable.h> + +static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) +{ + if (likely(mmap_read_trylock(mm))) + return true; + + if (regs && !user_mode(regs)) { + unsigned long ip = exception_ip(regs); + if (!search_exception_tables(ip)) + return false; + } + + return !mmap_read_lock_killable(mm); +} + +static inline bool mmap_upgrade_trylock(struct mm_struct *mm) +{ + /* + * We don't have this operation yet. + * + * It should be easy enough to do: it's basically a + * atomic_long_try_cmpxchg_acquire() + * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but + * it also needs the proper lockdep magic etc. + */ + return false; +} + +static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) +{ + mmap_read_unlock(mm); + if (regs && !user_mode(regs)) { + unsigned long ip = exception_ip(regs); + if (!search_exception_tables(ip)) + return false; + } + return !mmap_write_lock_killable(mm); +} + +/* + * Helper for page fault handling. + * + * This is kind of equivalent to "mmap_read_lock()" followed + * by "find_extend_vma()", except it's a lot more careful about + * the locking (and will drop the lock on failure). + * + * For example, if we have a kernel bug that causes a page + * fault, we don't want to just use mmap_read_lock() to get + * the mm lock, because that would deadlock if the bug were + * to happen while we're holding the mm lock for writing. + * + * So this checks the exception tables on kernel faults in + * order to only do this all for instructions that are actually + * expected to fault. + * + * We can also actually take the mm lock for writing if we + * need to extend the vma, which helps the VM layer a lot. + */ +struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, + unsigned long addr, struct pt_regs *regs) +{ + struct vm_area_struct *vma; + + if (!get_mmap_lock_carefully(mm, regs)) + return NULL; + + vma = find_vma(mm, addr); + if (likely(vma && (vma->vm_start <= addr))) + return vma; + + /* + * Well, dang. We might still be successful, but only + * if we can extend a vma to do so. + */ + if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { + mmap_read_unlock(mm); + return NULL; + } + + /* + * We can try to upgrade the mmap lock atomically, + * in which case we can continue to use the vma + * we already looked up. + * + * Otherwise we'll have to drop the mmap lock and + * re-take it, and also look up the vma again, + * re-checking it. 
+ */ + if (!mmap_upgrade_trylock(mm)) { + if (!upgrade_mmap_lock_carefully(mm, regs)) + return NULL; + + vma = find_vma(mm, addr); + if (!vma) + goto fail; + if (vma->vm_start <= addr) + goto success; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto fail; + } + + if (expand_stack_locked(vma, addr)) + goto fail; + +success: + mmap_write_downgrade(mm); + return vma; + +fail: + mmap_write_unlock(mm); + return NULL; +} +#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */ + +#else /* CONFIG_MMU */ + +/* + * At least xtensa ends up having protection faults even with no + * MMU.. No stack expansion, at least. + */ +struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, + unsigned long addr, struct pt_regs *regs) +{ + struct vm_area_struct *vma; + + mmap_read_lock(mm); + vma = vma_lookup(mm, addr); + if (!vma) + mmap_read_unlock(mm); + return vma; +} + +#endif /* CONFIG_MMU */ diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index db7ba4a725d6..b49cc6385f1f 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -424,6 +424,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, #ifdef CONFIG_MMU_GATHER_PAGE_SIZE tlb->page_size = 0; #endif + tlb->vma_pfn = 0; __tlb_reset_range(tlb); inc_tlb_flush_pending(tlb->mm); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index fc18fe274505..8e0125dc0522 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -4,7 +4,7 @@ * * Copyright (C) 2008 Qumranet, Inc. * Copyright (C) 2008 SGI - * Christoph Lameter <cl@linux.com> + * Christoph Lameter <cl@gentwo.org> */ #include <linux/rculist.h> diff --git a/mm/mprotect.c b/mm/mprotect.c index 62c1f7945741..2ddd37b2f462 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -40,11 +40,8 @@ #include "internal.h" -bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, - pte_t pte) +static bool maybe_change_pte_writable(struct vm_area_struct *vma, pte_t pte) { - struct page *page; - if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE))) return false; @@ -60,16 +57,32 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, if (userfaultfd_pte_wp(vma, pte)) return false; - if (!(vma->vm_flags & VM_SHARED)) { - /* - * Writable MAP_PRIVATE mapping: We can only special-case on - * exclusive anonymous pages, because we know that our - * write-fault handler similarly would map them writable without - * any additional checks while holding the PT lock. - */ - page = vm_normal_page(vma, addr, pte); - return page && PageAnon(page) && PageAnonExclusive(page); - } + return true; +} + +static bool can_change_private_pte_writable(struct vm_area_struct *vma, + unsigned long addr, pte_t pte) +{ + struct page *page; + + if (!maybe_change_pte_writable(vma, pte)) + return false; + + /* + * Writable MAP_PRIVATE mapping: We can only special-case on + * exclusive anonymous pages, because we know that our + * write-fault handler similarly would map them writable without + * any additional checks while holding the PT lock. 
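The split of can_change_pte_writable() above separates the PTE-only checks from the page-level check that only private mappings need: shared mappings can rely on the dirty bit, while private mappings may only upgrade exclusively-owned anonymous pages. Boiled down to booleans (field names invented here):

/*
 * Stand-alone sketch, not kernel code: the shared/private decision split
 * for upgrading a PTE to writable.
 */
#include <stdio.h>
#include <stdbool.h>

struct pte_model {
	bool vm_write;        /* VM_WRITE set on the VMA           */
	bool shared;          /* VM_SHARED mapping                 */
	bool no_wp_traps;     /* no soft-dirty/uffd-wp work pending */
	bool dirty;           /* pte_dirty()                       */
	bool anon_exclusive;  /* PageAnon() && PageAnonExclusive() */
};

static bool maybe_writable(const struct pte_model *p)
{
	return p->vm_write && p->no_wp_traps;
}

static bool can_make_writable(const struct pte_model *p)
{
	if (!maybe_writable(p))
		return false;
	if (p->shared)
		return p->dirty;          /* shared: writeback already tracked */
	return p->anon_exclusive;         /* private: must own the anon page  */
}

int main(void)
{
	struct pte_model priv = { .vm_write = true, .no_wp_traps = true,
				  .anon_exclusive = true };

	printf("private exclusive anon -> %s\n",
	       can_make_writable(&priv) ? "writable" : "keep read-only");
	return 0;
}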
+ */ + page = vm_normal_page(vma, addr, pte); + return page && PageAnon(page) && PageAnonExclusive(page); +} + +static bool can_change_shared_pte_writable(struct vm_area_struct *vma, + pte_t pte) +{ + if (!maybe_change_pte_writable(vma, pte)) + return false; VM_WARN_ON_ONCE(is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte)); @@ -83,6 +96,183 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, return pte_dirty(pte); } +bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) +{ + if (!(vma->vm_flags & VM_SHARED)) + return can_change_private_pte_writable(vma, addr, pte); + + return can_change_shared_pte_writable(vma, pte); +} + +static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep, + pte_t pte, int max_nr_ptes, fpb_t flags) +{ + /* No underlying folio, so cannot batch */ + if (!folio) + return 1; + + if (!folio_test_large(folio)) + return 1; + + return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr_ptes, flags); +} + +static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr, + pte_t oldpte, pte_t *pte, int target_node, + struct folio **foliop) +{ + struct folio *folio = NULL; + bool ret = true; + bool toptier; + int nid; + + /* Avoid TLB flush if possible */ + if (pte_protnone(oldpte)) + goto skip; + + folio = vm_normal_folio(vma, addr, oldpte); + if (!folio) + goto skip; + + if (folio_is_zone_device(folio) || folio_test_ksm(folio)) + goto skip; + + /* Also skip shared copy-on-write pages */ + if (is_cow_mapping(vma->vm_flags) && + (folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio))) + goto skip; + + /* + * While migration can move some dirty pages, + * it cannot move them all from MIGRATE_ASYNC + * context. + */ + if (folio_is_file_lru(folio) && folio_test_dirty(folio)) + goto skip; + + /* + * Don't mess with PTEs if page is already on the node + * a single-threaded process is running on. + */ + nid = folio_nid(folio); + if (target_node == nid) + goto skip; + + toptier = node_is_toptier(nid); + + /* + * Skip scanning top tier node if normal numa + * balancing is disabled + */ + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier) + goto skip; + + ret = false; + if (folio_use_access_time(folio)) + folio_xchg_access_time(folio, jiffies_to_msecs(jiffies)); + +skip: + *foliop = folio; + return ret; +} + +/* Set nr_ptes number of ptes, starting from idx */ +static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes, + int idx, bool set_write, struct mmu_gather *tlb) +{ + /* + * Advance the position in the batch by idx; note that if idx > 0, + * then the nr_ptes passed here is <= batch size - idx. + */ + addr += idx * PAGE_SIZE; + ptep += idx; + oldpte = pte_advance_pfn(oldpte, idx); + ptent = pte_advance_pfn(ptent, idx); + + if (set_write) + ptent = pte_mkwrite(ptent, vma); + + modify_prot_commit_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes); + if (pte_needs_flush(oldpte, ptent)) + tlb_flush_pte_range(tlb, addr, nr_ptes * PAGE_SIZE); +} + +/* + * Get max length of consecutive ptes pointing to PageAnonExclusive() pages or + * !PageAnonExclusive() pages, starting from start_idx. Caller must enforce + * that the ptes point to consecutive pages of the same anon large folio. 
+ */ +static int page_anon_exclusive_sub_batch(int start_idx, int max_len, + struct page *first_page, bool expected_anon_exclusive) +{ + int idx; + + for (idx = start_idx + 1; idx < start_idx + max_len; ++idx) { + if (expected_anon_exclusive != PageAnonExclusive(first_page + idx)) + break; + } + return idx - start_idx; +} + +/* + * This function is a result of trying our very best to retain the + * "avoid the write-fault handler" optimization. In can_change_pte_writable(), + * if the vma is a private vma, and we cannot determine whether to change + * the pte to writable just from the vma and the pte, we then need to look + * at the actual page pointed to by the pte. Unfortunately, if we have a + * batch of ptes pointing to consecutive pages of the same anon large folio, + * the anon-exclusivity (or the negation) of the first page does not guarantee + * the anon-exclusivity (or the negation) of the other pages corresponding to + * the pte batch; hence in this case it is incorrect to decide to change or + * not change the ptes to writable just by using information from the first + * pte of the batch. Therefore, we must individually check all pages and + * retrieve sub-batches. + */ +static void commit_anon_folio_batch(struct vm_area_struct *vma, + struct folio *folio, unsigned long addr, pte_t *ptep, + pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb) +{ + struct page *first_page = folio_page(folio, 0); + bool expected_anon_exclusive; + int sub_batch_idx = 0; + int len; + + while (nr_ptes) { + expected_anon_exclusive = PageAnonExclusive(first_page + sub_batch_idx); + len = page_anon_exclusive_sub_batch(sub_batch_idx, nr_ptes, + first_page, expected_anon_exclusive); + prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, len, + sub_batch_idx, expected_anon_exclusive, tlb); + sub_batch_idx += len; + nr_ptes -= len; + } +} + +static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma, + struct folio *folio, unsigned long addr, pte_t *ptep, + pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb) +{ + bool set_write; + + if (vma->vm_flags & VM_SHARED) { + set_write = can_change_shared_pte_writable(vma, ptent); + prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes, + /* idx = */ 0, set_write, tlb); + return; + } + + set_write = maybe_change_pte_writable(vma, ptent) && + (folio && folio_test_anon(folio)); + if (!set_write) { + prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes, + /* idx = */ 0, set_write, tlb); + return; + } + commit_anon_folio_batch(vma, folio, addr, ptep, oldpte, ptent, nr_ptes, tlb); +} + static long change_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) @@ -94,6 +284,7 @@ static long change_pte_range(struct mmu_gather *tlb, bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + int nr_ptes; tlb_change_page_size(tlb, PAGE_SIZE); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); @@ -108,8 +299,12 @@ static long change_pte_range(struct mmu_gather *tlb, flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); do { + nr_ptes = 1; oldpte = ptep_get(pte); if (pte_present(oldpte)) { + const fpb_t flags = FPB_RESPECT_SOFT_DIRTY | FPB_RESPECT_WRITE; + int max_nr_ptes = (end - addr) >> PAGE_SHIFT; + struct folio *folio = NULL; pte_t ptent; /* @@ -117,56 +312,23 @@ static long change_pte_range(struct mmu_gather *tlb, * 
pages. See similar comment in change_huge_pmd. */ if (prot_numa) { - struct folio *folio; - int nid; - bool toptier; + int ret = prot_numa_skip(vma, addr, oldpte, pte, + target_node, &folio); + if (ret) { - /* Avoid TLB flush if possible */ - if (pte_protnone(oldpte)) + /* determine batch to skip */ + nr_ptes = mprotect_folio_pte_batch(folio, + pte, oldpte, max_nr_ptes, /* flags = */ 0); continue; + } + } + if (!folio) folio = vm_normal_folio(vma, addr, oldpte); - if (!folio || folio_is_zone_device(folio) || - folio_test_ksm(folio)) - continue; - - /* Also skip shared copy-on-write pages */ - if (is_cow_mapping(vma->vm_flags) && - (folio_maybe_dma_pinned(folio) || - folio_maybe_mapped_shared(folio))) - continue; - /* - * While migration can move some dirty pages, - * it cannot move them all from MIGRATE_ASYNC - * context. - */ - if (folio_is_file_lru(folio) && - folio_test_dirty(folio)) - continue; - - /* - * Don't mess with PTEs if page is already on the node - * a single-threaded process is running on. - */ - nid = folio_nid(folio); - if (target_node == nid) - continue; - toptier = node_is_toptier(nid); - - /* - * Skip scanning top tier node if normal numa - * balancing is disabled - */ - if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && - toptier) - continue; - if (folio_use_access_time(folio)) - folio_xchg_access_time(folio, - jiffies_to_msecs(jiffies)); - } + nr_ptes = mprotect_folio_pte_batch(folio, pte, oldpte, max_nr_ptes, flags); - oldpte = ptep_modify_prot_start(vma, addr, pte); + oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes); ptent = pte_modify(oldpte, newprot); if (uffd_wp) @@ -188,14 +350,13 @@ static long change_pte_range(struct mmu_gather *tlb, * COW or special handling is required. */ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && - !pte_write(ptent) && - can_change_pte_writable(vma, addr, ptent)) - ptent = pte_mkwrite(ptent, vma); - - ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); - if (pte_needs_flush(oldpte, ptent)) - tlb_flush_pte_range(tlb, addr, PAGE_SIZE); - pages++; + !pte_write(ptent)) + set_write_prot_commit_flush_ptes(vma, folio, + addr, pte, oldpte, ptent, nr_ptes, tlb); + else + prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent, + nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb); + pages += nr_ptes; } else if (is_swap_pte(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); pte_t newpte; @@ -280,7 +441,7 @@ static long change_pte_range(struct mmu_gather *tlb, pages++; } } - } while (pte++, addr += PAGE_SIZE, addr != end); + } while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); @@ -376,10 +537,10 @@ again: goto next; _pmd = pmdp_get_lockless(pmd); - if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) { + if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd)) { if ((next - addr != HPAGE_PMD_SIZE) || pgtable_split_needed(vma, cp_flags)) { - __split_huge_pmd(vma, pmd, addr, false, NULL); + __split_huge_pmd(vma, pmd, addr, false); /* * For file-backed, the pmd could have been * cleared; make sure pmd populated if @@ -596,10 +757,10 @@ static const struct mm_walk_ops prot_none_walk_ops = { int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, struct vm_area_struct *vma, struct vm_area_struct **pprev, - unsigned long start, unsigned long end, unsigned long newflags) + unsigned long start, unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; - unsigned long oldflags = READ_ONCE(vma->vm_flags); + vm_flags_t 
oldflags = READ_ONCE(vma->vm_flags); long nrpages = (end - start) >> PAGE_SHIFT; unsigned int mm_cp_flags = 0; unsigned long charged = 0; @@ -774,8 +935,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len, nstart = start; tmp = vma->vm_start; for_each_vma_range(vmi, vma, end) { - unsigned long mask_off_old_flags; - unsigned long newflags; + vm_flags_t mask_off_old_flags; + vm_flags_t newflags; int new_vma_pkey; if (vma->vm_start != tmp) { diff --git a/mm/mremap.c b/mm/mremap.c index 0865387531ed..e15cf2e444c7 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -52,7 +52,7 @@ struct vma_remap_struct { unsigned long addr; /* User-specified address from which we remap. */ unsigned long old_len; /* Length of range being remapped. */ unsigned long new_len; /* Desired new length of mapping. */ - unsigned long flags; /* user-specified MREMAP_* flags. */ + const unsigned long flags; /* user-specified MREMAP_* flags. */ unsigned long new_addr; /* Optionally, desired new address. */ /* uffd state. */ @@ -65,10 +65,11 @@ struct vma_remap_struct { /* Internal state, determined in do_mremap(). */ unsigned long delta; /* Absolute delta of old_len,new_len. */ - bool mlocked; /* Was the VMA mlock()'d? */ + bool populate_expand; /* mlock()'d expanded, must populate. */ enum mremap_type remap_type; /* expand, shrink, etc. */ bool mmap_locked; /* Is mm currently write-locked? */ unsigned long charged; /* If VM_ACCOUNT, # pages to account. */ + bool vmi_needs_invalidate; /* Is the VMA iterator invalidated? */ }; static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr) @@ -170,13 +171,29 @@ static pte_t move_soft_dirty_pte(pte_t pte) return pte; } +static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t pte, int max_nr) +{ + struct folio *folio; + + if (max_nr == 1) + return 1; + + folio = vm_normal_folio(vma, addr, pte); + if (!folio || !folio_test_large(folio)) + return 1; + + return folio_pte_batch(folio, ptep, pte, max_nr); +} + static int move_ptes(struct pagetable_move_control *pmc, unsigned long extent, pmd_t *old_pmd, pmd_t *new_pmd) { struct vm_area_struct *vma = pmc->old; bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma); struct mm_struct *mm = vma->vm_mm; - pte_t *old_pte, *new_pte, pte; + pte_t *old_ptep, *new_ptep; + pte_t old_pte, pte; pmd_t dummy_pmdval; spinlock_t *old_ptl, *new_ptl; bool force_flush = false; @@ -184,6 +201,8 @@ static int move_ptes(struct pagetable_move_control *pmc, unsigned long new_addr = pmc->new_addr; unsigned long old_end = old_addr + extent; unsigned long len = old_end - old_addr; + int max_nr_ptes; + int nr_ptes; int err = 0; /* @@ -211,8 +230,8 @@ static int move_ptes(struct pagetable_move_control *pmc, * We don't have to worry about the ordering of src and dst * pte locks because exclusive mmap_lock prevents deadlock. */ - old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); - if (!old_pte) { + old_ptep = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); + if (!old_ptep) { err = -EAGAIN; goto out; } @@ -223,10 +242,10 @@ static int move_ptes(struct pagetable_move_control *pmc, * mmap_lock, so this new_pte page is stable, so there is no need to get * pmdval and do pmd_same() check. 
*/ - new_pte = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval, + new_ptep = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval, &new_ptl); - if (!new_pte) { - pte_unmap_unlock(old_pte, old_ptl); + if (!new_ptep) { + pte_unmap_unlock(old_ptep, old_ptl); err = -EAGAIN; goto out; } @@ -235,12 +254,16 @@ static int move_ptes(struct pagetable_move_control *pmc, flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); - for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, - new_pte++, new_addr += PAGE_SIZE) { - if (pte_none(ptep_get(old_pte))) + for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE, + new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) { + VM_WARN_ON_ONCE(!pte_none(*new_ptep)); + + nr_ptes = 1; + max_nr_ptes = (old_end - old_addr) >> PAGE_SHIFT; + old_pte = ptep_get(old_ptep); + if (pte_none(old_pte)) continue; - pte = ptep_get_and_clear(mm, old_addr, old_pte); /* * If we are remapping a valid PTE, make sure * to flush TLB before we drop the PTL for the @@ -252,13 +275,17 @@ static int move_ptes(struct pagetable_move_control *pmc, * the TLB entry for the old mapping has been * flushed. */ - if (pte_present(pte)) + if (pte_present(old_pte)) { + nr_ptes = mremap_folio_pte_batch(vma, old_addr, old_ptep, + old_pte, max_nr_ptes); force_flush = true; + } + pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr_ptes, 0); pte = move_pte(pte, old_addr, new_addr); pte = move_soft_dirty_pte(pte); if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) - pte_clear(mm, new_addr, new_pte); + pte_clear(mm, new_addr, new_ptep); else { if (need_clear_uffd_wp) { if (pte_present(pte)) @@ -266,7 +293,7 @@ static int move_ptes(struct pagetable_move_control *pmc, else if (is_swap_pte(pte)) pte = pte_swp_clear_uffd_wp(pte); } - set_pte_at(mm, new_addr, new_pte, pte); + set_ptes(mm, new_addr, new_ptep, pte, nr_ptes); } } @@ -275,8 +302,8 @@ static int move_ptes(struct pagetable_move_control *pmc, flush_tlb_range(vma, old_end - len, old_end); if (new_ptl != old_ptl) spin_unlock(new_ptl); - pte_unmap(new_pte - 1); - pte_unmap_unlock(old_pte - 1, old_ptl); + pte_unmap(new_ptep - 1); + pte_unmap_unlock(old_ptep - 1, old_ptl); out: if (pmc->need_rmap_locks) drop_rmap_locks(vma); @@ -792,7 +819,7 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc) new_pud = alloc_new_pud(mm, pmc->new_addr); if (!new_pud) break; - if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) { + if (pud_trans_huge(*old_pud)) { if (extent == HPAGE_PUD_SIZE) { move_pgt_entry(pmc, HPAGE_PUD, old_pud, new_pud); /* We ignore and continue on error? */ @@ -811,8 +838,7 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc) if (!new_pmd) break; again: - if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || - pmd_devmap(*old_pmd)) { + if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) { if (extent == HPAGE_PMD_SIZE && move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd)) continue; @@ -884,7 +910,11 @@ static bool vrm_overlaps(struct vma_remap_struct *vrm) return false; } -/* Do the mremap() flags require that the new_addr parameter be specified? */ +/* + * Will a new address definitely be assigned? This is the case either if the user + * specifies it via MREMAP_FIXED, or if MREMAP_DONTUNMAP is used, indicating we will + * always determine a target address. 
+ */ static bool vrm_implies_new_addr(struct vma_remap_struct *vrm) { return vrm->flags & (MREMAP_FIXED | MREMAP_DONTUNMAP); @@ -930,7 +960,7 @@ static unsigned long vrm_set_new_addr(struct vma_remap_struct *vrm) * * Returns true on success, false if insufficient memory to charge. */ -static bool vrm_charge(struct vma_remap_struct *vrm) +static bool vrm_calc_charge(struct vma_remap_struct *vrm) { unsigned long charged; @@ -981,10 +1011,8 @@ static void vrm_stat_account(struct vma_remap_struct *vrm, struct vm_area_struct *vma = vrm->vma; vm_stat_account(mm, vma->vm_flags, pages); - if (vma->vm_flags & VM_LOCKED) { + if (vma->vm_flags & VM_LOCKED) mm->locked_vm += pages; - vrm->mlocked = true; - } } /* @@ -997,7 +1025,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) struct vm_area_struct *vma = vrm->vma; unsigned long old_addr = vrm->addr; unsigned long old_len = vrm->old_len; - unsigned long dummy = vma->vm_flags; + vm_flags_t dummy = vma->vm_flags; /* * We'd prefer to avoid failure later on in do_munmap: @@ -1084,6 +1112,7 @@ static void unmap_source_vma(struct vma_remap_struct *vrm) err = do_vmi_munmap(&vmi, mm, addr, len, vrm->uf_unmap, /* unlock= */false); vrm->vma = NULL; /* Invalidated. */ + vrm->vmi_needs_invalidate = true; if (err) { /* OOM: unable to split vma, just get accounts right */ vm_acct_memory(len >> PAGE_SHIFT); @@ -1159,6 +1188,10 @@ static int copy_vma_and_data(struct vma_remap_struct *vrm, *new_vma_ptr = NULL; return -ENOMEM; } + /* By merging, we may have invalidated any iterator in use. */ + if (vma != vrm->vma) + vrm->vmi_needs_invalidate = true; + vrm->vma = vma; pmc.old = vma; pmc.new = new_vma; @@ -1188,12 +1221,7 @@ static int copy_vma_and_data(struct vma_remap_struct *vrm, mremap_userfaultfd_prep(new_vma, vrm->uf); } - if (is_vm_hugetlb_page(vma)) - clear_vma_resv_huge_pages(vma); - - /* Tell pfnmap has moved from this vma */ - if (unlikely(vma->vm_flags & VM_PFNMAP)) - untrack_pfn_clear(vma); + fixup_hugetlb_reservations(vma); *new_vma_ptr = new_vma; return err; @@ -1240,8 +1268,11 @@ static unsigned long move_vma(struct vma_remap_struct *vrm) if (err) return err; - /* If accounted, charge the number of bytes the operation will use. */ - if (!vrm_charge(vrm)) + /* + * If accounted, determine the number of bytes the operation will + * charge. + */ + if (!vrm_calc_charge(vrm)) return -ENOMEM; /* We don't want racing faults. */ @@ -1280,64 +1311,6 @@ static unsigned long move_vma(struct vma_remap_struct *vrm) } /* - * resize_is_valid() - Ensure the vma can be resized to the new length at the give - * address. - * - * Return 0 on success, error otherwise. - */ -static int resize_is_valid(struct vma_remap_struct *vrm) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma = vrm->vma; - unsigned long addr = vrm->addr; - unsigned long old_len = vrm->old_len; - unsigned long new_len = vrm->new_len; - unsigned long pgoff; - - /* - * !old_len is a special case where an attempt is made to 'duplicate' - * a mapping. This makes no sense for private mappings as it will - * instead create a fresh/new mapping unrelated to the original. This - * is contrary to the basic idea of mremap which creates new mappings - * based on the original. There are no known use cases for this - * behavior. As a result, fail such attempts. - */ - if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) { - pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. 
This is not supported.\n", - current->comm, current->pid); - return -EINVAL; - } - - if ((vrm->flags & MREMAP_DONTUNMAP) && - (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))) - return -EINVAL; - - /* We can't remap across vm area boundaries */ - if (old_len > vma->vm_end - addr) - return -EFAULT; - - if (new_len == old_len) - return 0; - - /* Need to be careful about a growing mapping */ - pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; - pgoff += vma->vm_pgoff; - if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) - return -EINVAL; - - if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) - return -EFAULT; - - if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta)) - return -EAGAIN; - - if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT)) - return -ENOMEM; - - return 0; -} - -/* * The user has requested that the VMA be shrunk (i.e., old_len > new_len), so * execute this, optionally dropping the mmap lock when we do so. * @@ -1386,14 +1359,6 @@ static unsigned long mremap_to(struct vma_remap_struct *vrm) struct mm_struct *mm = current->mm; unsigned long err; - /* Is the new length or address silly? */ - if (vrm->new_len > TASK_SIZE || - vrm->new_addr > TASK_SIZE - vrm->new_len) - return -EINVAL; - - if (vrm_overlaps(vrm)) - return -EINVAL; - if (vrm->flags & MREMAP_FIXED) { /* * In mremap_to(). @@ -1403,6 +1368,7 @@ static unsigned long mremap_to(struct vma_remap_struct *vrm) err = do_munmap(mm, vrm->new_addr, vrm->new_len, vrm->uf_unmap_early); vrm->vma = NULL; /* Invalidated. */ + vrm->vmi_needs_invalidate = true; if (err) return err; @@ -1424,10 +1390,6 @@ static unsigned long mremap_to(struct vma_remap_struct *vrm) vrm->old_len = vrm->new_len; } - err = resize_is_valid(vrm); - if (err) - return err; - /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */ if (vrm->flags & MREMAP_DONTUNMAP) { vm_flags_t vm_flags = vrm->vma->vm_flags; @@ -1476,68 +1438,6 @@ static bool vrm_can_expand_in_place(struct vma_remap_struct *vrm) } /* - * Are the parameters passed to mremap() valid? If so return 0, otherwise return - * error. - */ -static unsigned long check_mremap_params(struct vma_remap_struct *vrm) - -{ - unsigned long addr = vrm->addr; - unsigned long flags = vrm->flags; - - /* Ensure no unexpected flag values. */ - if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP)) - return -EINVAL; - - /* Start address must be page-aligned. */ - if (offset_in_page(addr)) - return -EINVAL; - - /* - * We allow a zero old-len as a special case - * for DOS-emu "duplicate shm area" thing. But - * a zero new-len is nonsensical. - */ - if (!PAGE_ALIGN(vrm->new_len)) - return -EINVAL; - - /* Remainder of checks are for cases with specific new_addr. */ - if (!vrm_implies_new_addr(vrm)) - return 0; - - /* The new address must be page-aligned. */ - if (offset_in_page(vrm->new_addr)) - return -EINVAL; - - /* A fixed address implies a move. */ - if (!(flags & MREMAP_MAYMOVE)) - return -EINVAL; - - /* MREMAP_DONTUNMAP does not allow resizing in the process. */ - if (flags & MREMAP_DONTUNMAP && vrm->old_len != vrm->new_len) - return -EINVAL; - - /* - * move_vma() need us to stay 4 maps below the threshold, otherwise - * it will bail out at the very beginning. - * That is a problem if we have already unmaped the regions here - * (new_addr, and old_addr), because userspace will not know the - * state of the vma's after it gets -ENOMEM. - * So, to avoid such scenario we can pre-compute if the whole - * operation has high chances to success map-wise. 
- * Worst-scenario case is when both vma's (new_addr and old_addr) get - * split in 3 before unmapping it. - * That means 2 more maps (1 for each) to the ones we already hold. - * Check whether current map count plus 2 still leads us to 4 maps below - * the threshold, otherwise return -ENOMEM here to be more safe. - */ - if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3) - return -ENOMEM; - - return 0; -} - -/* * We know we can expand the VMA in-place by delta pages, so do so. * * If we discover the VMA is locked, update mm_struct statistics accordingly and @@ -1549,7 +1449,7 @@ static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm) struct vm_area_struct *vma = vrm->vma; VMA_ITERATOR(vmi, mm, vma->vm_end); - if (!vrm_charge(vrm)) + if (!vrm_calc_charge(vrm)) return -ENOMEM; /* @@ -1561,11 +1461,12 @@ static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm) * adjacent to the expanded vma and otherwise * compatible. */ - vma = vrm->vma = vma_merge_extend(&vmi, vma, vrm->delta); + vma = vma_merge_extend(&vmi, vma, vrm->delta); if (!vma) { vrm_uncharge(vrm); return -ENOMEM; } + vrm->vma = vma; vrm_stat_account(vrm, vrm->delta); @@ -1592,8 +1493,6 @@ static bool align_hugetlb(struct vma_remap_struct *vrm) if (vrm->new_len > vrm->old_len) return false; - vrm_set_delta(vrm); - return true; } @@ -1607,11 +1506,6 @@ static bool align_hugetlb(struct vma_remap_struct *vrm) static unsigned long expand_vma(struct vma_remap_struct *vrm) { unsigned long err; - unsigned long addr = vrm->addr; - - err = resize_is_valid(vrm); - if (err) - return err; /* * [addr, old_len) spans precisely to the end of the VMA, so try to @@ -1622,16 +1516,8 @@ static unsigned long expand_vma(struct vma_remap_struct *vrm) if (err) return err; - /* - * We want to populate the newly expanded portion of the VMA to - * satisfy the expectation that mlock()'ing a VMA maintains all - * of its pages in memory. - */ - if (vrm->mlocked) - vrm->new_addr = addr; - /* OK we're done! */ - return addr; + return vrm->addr; } /* @@ -1682,64 +1568,371 @@ static unsigned long mremap_at(struct vma_remap_struct *vrm) return expand_vma(vrm); } - BUG(); + /* Should not be possible. */ + WARN_ON_ONCE(1); + return -EINVAL; } -static unsigned long do_mremap(struct vma_remap_struct *vrm) +/* + * Will this operation result in the VMA being expanded or moved and thus need + * to map a new portion of virtual address space? + */ +static bool vrm_will_map_new(struct vma_remap_struct *vrm) +{ + if (vrm->remap_type == MREMAP_EXPAND) + return true; + + if (vrm_implies_new_addr(vrm)) + return true; + + return false; +} + +/* Does this remap ONLY move mappings? */ +static bool vrm_move_only(struct vma_remap_struct *vrm) +{ + if (!(vrm->flags & MREMAP_FIXED)) + return false; + + if (vrm->old_len != vrm->new_len) + return false; + + return true; +} + +static void notify_uffd(struct vma_remap_struct *vrm, bool failed) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long ret; - ret = check_mremap_params(vrm); - if (ret) - return ret; + /* Regardless of success/failure, we always notify of any unmaps. 
*/ + userfaultfd_unmap_complete(mm, vrm->uf_unmap_early); + if (failed) + mremap_userfaultfd_fail(vrm->uf); + else + mremap_userfaultfd_complete(vrm->uf, vrm->addr, + vrm->new_addr, vrm->old_len); + userfaultfd_unmap_complete(mm, vrm->uf_unmap); +} - vrm->old_len = PAGE_ALIGN(vrm->old_len); - vrm->new_len = PAGE_ALIGN(vrm->new_len); - vrm_set_delta(vrm); +static bool vma_multi_allowed(struct vm_area_struct *vma) +{ + struct file *file; - if (mmap_write_lock_killable(mm)) - return -EINTR; - vrm->mmap_locked = true; + /* + * We can't support moving multiple uffd VMAs as notify requires + * mmap lock to be dropped. + */ + if (userfaultfd_armed(vma)) + return false; - vma = vrm->vma = vma_lookup(mm, vrm->addr); - if (!vma) { - ret = -EFAULT; - goto out; + /* + * Custom get unmapped area might result in MREMAP_FIXED not + * being obeyed. + */ + file = vma->vm_file; + if (file && !vma_is_shmem(vma) && !is_vm_hugetlb_page(vma)) { + const struct file_operations *fop = file->f_op; + + if (fop->get_unmapped_area) + return false; } + return true; +} + +static int check_prep_vma(struct vma_remap_struct *vrm) +{ + struct vm_area_struct *vma = vrm->vma; + struct mm_struct *mm = current->mm; + unsigned long addr = vrm->addr; + unsigned long old_len, new_len, pgoff; + + if (!vma) + return -EFAULT; + /* If mseal()'d, mremap() is prohibited. */ - if (!can_modify_vma(vma)) { - ret = -EPERM; - goto out; - } + if (!can_modify_vma(vma)) + return -EPERM; /* Align to hugetlb page size, if required. */ - if (is_vm_hugetlb_page(vma) && !align_hugetlb(vrm)) { - ret = -EINVAL; - goto out; - } + if (is_vm_hugetlb_page(vma) && !align_hugetlb(vrm)) + return -EINVAL; + vrm_set_delta(vrm); vrm->remap_type = vrm_remap_type(vrm); + /* For convenience, we set new_addr even if VMA won't move. */ + if (!vrm_implies_new_addr(vrm)) + vrm->new_addr = addr; - /* Actually execute mremap. */ - ret = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm); + /* Below only meaningful if we expand or move a VMA. */ + if (!vrm_will_map_new(vrm)) + return 0; -out: - if (vrm->mmap_locked) { - mmap_write_unlock(mm); - vrm->mmap_locked = false; + old_len = vrm->old_len; + new_len = vrm->new_len; - if (!offset_in_page(ret) && vrm->mlocked && vrm->new_len > vrm->old_len) - mm_populate(vrm->new_addr + vrm->old_len, vrm->delta); + /* + * !old_len is a special case where an attempt is made to 'duplicate' + * a mapping. This makes no sense for private mappings as it will + * instead create a fresh/new mapping unrelated to the original. This + * is contrary to the basic idea of mremap which creates new mappings + * based on the original. There are no known use cases for this + * behavior. As a result, fail such attempts. + */ + if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) { + pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", + current->comm, current->pid); + return -EINVAL; } - userfaultfd_unmap_complete(mm, vrm->uf_unmap_early); - mremap_userfaultfd_complete(vrm->uf, vrm->addr, ret, vrm->old_len); - userfaultfd_unmap_complete(mm, vrm->uf_unmap); + if ((vrm->flags & MREMAP_DONTUNMAP) && + (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))) + return -EINVAL; + + /* + * We permit crossing of boundaries for the range being unmapped due to + * a shrink. + */ + if (vrm->remap_type == MREMAP_SHRINK) + old_len = new_len; + + /* + * We can't remap across the end of VMAs, as another VMA may be + * adjacent: + * + * addr vma->vm_end + * |-----.----------| + * | . 
| + * |-----.----------| + * .<--------->xxx> + * old_len + * + * We also require that vma->vm_start <= addr < vma->vm_end. + */ + if (old_len > vma->vm_end - addr) + return -EFAULT; + + if (new_len == old_len) + return 0; + + /* We are expanding and the VMA is mlock()'d so we need to populate. */ + if (vma->vm_flags & VM_LOCKED) + vrm->populate_expand = true; + + /* Need to be careful about a growing mapping */ + pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + pgoff += vma->vm_pgoff; + if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) + return -EINVAL; + + if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) + return -EFAULT; + + if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta)) + return -EAGAIN; + + if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT)) + return -ENOMEM; + + return 0; +} + +/* + * Are the parameters passed to mremap() valid? If so return 0, otherwise return + * error. + */ +static unsigned long check_mremap_params(struct vma_remap_struct *vrm) + +{ + unsigned long addr = vrm->addr; + unsigned long flags = vrm->flags; + + /* Ensure no unexpected flag values. */ + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP)) + return -EINVAL; + + /* Start address must be page-aligned. */ + if (offset_in_page(addr)) + return -EINVAL; + + /* + * We allow a zero old-len as a special case + * for DOS-emu "duplicate shm area" thing. But + * a zero new-len is nonsensical. + */ + if (!vrm->new_len) + return -EINVAL; + + /* Is the new length or address silly? */ + if (vrm->new_len > TASK_SIZE || + vrm->new_addr > TASK_SIZE - vrm->new_len) + return -EINVAL; + + /* Remainder of checks are for cases with specific new_addr. */ + if (!vrm_implies_new_addr(vrm)) + return 0; + + /* The new address must be page-aligned. */ + if (offset_in_page(vrm->new_addr)) + return -EINVAL; + + /* A fixed address implies a move. */ + if (!(flags & MREMAP_MAYMOVE)) + return -EINVAL; + + /* MREMAP_DONTUNMAP does not allow resizing in the process. */ + if (flags & MREMAP_DONTUNMAP && vrm->old_len != vrm->new_len) + return -EINVAL; + + /* Target VMA must not overlap source VMA. */ + if (vrm_overlaps(vrm)) + return -EINVAL; + + /* + * move_vma() needs us to stay 4 maps below the threshold, otherwise + * it will bail out at the very beginning. + * That is a problem if we have already unmapped the regions here + * (new_addr, and old_addr), because userspace will not know the + * state of the vma's after it gets -ENOMEM. + * So, to avoid such a scenario we can pre-compute whether the whole + * operation has a high chance of success map-wise. + * The worst-case scenario is when both vma's (new_addr and old_addr) get + * split in 3 before unmapping it. + * That means 2 more maps (1 for each) to the ones we already hold. + * Check whether current map count plus 2 still leads us to 4 maps below + * the threshold, otherwise return -ENOMEM here to be safe. 
+ */ + if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3) + return -ENOMEM; + + return 0; +} + +static unsigned long remap_move(struct vma_remap_struct *vrm) +{ + struct vm_area_struct *vma; + unsigned long start = vrm->addr; + unsigned long end = vrm->addr + vrm->old_len; + unsigned long new_addr = vrm->new_addr; + bool allowed = true, seen_vma = false; + unsigned long target_addr = new_addr; + unsigned long res = -EFAULT; + unsigned long last_end; + VMA_ITERATOR(vmi, current->mm, start); + + /* + * When moving VMAs we allow for batched moves across multiple VMAs, + * with all VMAs in the input range [addr, addr + old_len) being moved + * (and split as necessary). + */ + for_each_vma_range(vmi, vma, end) { + /* Account for start, end not aligned with VMA start, end. */ + unsigned long addr = max(vma->vm_start, start); + unsigned long len = min(end, vma->vm_end) - addr; + unsigned long offset, res_vma; - return ret; + if (!allowed) + return -EFAULT; + + /* No gap permitted at the start of the range. */ + if (!seen_vma && start < vma->vm_start) + return -EFAULT; + + /* + * To sensibly move multiple VMAs, accounting for the fact that + * get_unmapped_area() may align even MAP_FIXED moves, we simply + * attempt to move such that the gaps between source VMAs remain + * consistent in destination VMAs, e.g.: + * + * X Y X Y + * <---> <-> <---> <-> + * |-------| |-----| |-----| |-------| |-----| |-----| + * | A | | B | | C | ---> | A' | | B' | | C' | + * |-------| |-----| |-----| |-------| |-----| |-----| + * new_addr + * + * So we map B' at A'->vm_end + X, and C' at B'->vm_end + Y. + */ + offset = seen_vma ? vma->vm_start - last_end : 0; + last_end = vma->vm_end; + + vrm->vma = vma; + vrm->addr = addr; + vrm->new_addr = target_addr + offset; + vrm->old_len = vrm->new_len = len; + + allowed = vma_multi_allowed(vma); + if (seen_vma && !allowed) + return -EFAULT; + + res_vma = check_prep_vma(vrm); + if (!res_vma) + res_vma = mremap_to(vrm); + if (IS_ERR_VALUE(res_vma)) + return res_vma; + + if (!seen_vma) { + VM_WARN_ON_ONCE(allowed && res_vma != new_addr); + res = res_vma; + } + + /* mmap lock is only dropped on shrink. */ + VM_WARN_ON_ONCE(!vrm->mmap_locked); + /* This is a move, no expand should occur. */ + VM_WARN_ON_ONCE(vrm->populate_expand); + + if (vrm->vmi_needs_invalidate) { + vma_iter_invalidate(&vmi); + vrm->vmi_needs_invalidate = false; + } + seen_vma = true; + target_addr = res_vma + vrm->new_len; + } + + return res; +} + +static unsigned long do_mremap(struct vma_remap_struct *vrm) +{ + struct mm_struct *mm = current->mm; + unsigned long res; + bool failed; + + vrm->old_len = PAGE_ALIGN(vrm->old_len); + vrm->new_len = PAGE_ALIGN(vrm->new_len); + + res = check_mremap_params(vrm); + if (res) + return res; + + if (mmap_write_lock_killable(mm)) + return -EINTR; + vrm->mmap_locked = true; + + if (vrm_move_only(vrm)) { + res = remap_move(vrm); + } else { + vrm->vma = vma_lookup(current->mm, vrm->addr); + res = check_prep_vma(vrm); + if (res) + goto out; + + /* Actually execute mremap. */ + res = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm); + } + +out: + failed = IS_ERR_VALUE(res); + + if (vrm->mmap_locked) + mmap_write_unlock(mm); + + /* VMA mlock'd + was expanded, so populated expanded region. 
*/ + if (!failed && vrm->populate_expand) + mm_populate(vrm->new_addr + vrm->old_len, vrm->delta); + + notify_uffd(vrm, failed); + return res; } /* diff --git a/mm/nommu.c b/mm/nommu.c index 617e7ba8022f..736d0e0f0618 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -126,7 +126,7 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, unsigned long vm_flags, int node, + pgprot_t prot, vm_flags_t vm_flags, int node, const void *caller) { return __vmalloc_noprof(size, gfp_mask); @@ -200,7 +200,23 @@ void *vmalloc_noprof(unsigned long size) } EXPORT_SYMBOL(vmalloc_noprof); -void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof); +/* + * vmalloc_huge_node - allocate virtually contiguous memory, on a node + * + * @size: allocation size + * @gfp_mask: flags for the page level allocator + * @node: node to use for allocation or NUMA_NO_NODE + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * Due to NOMMU implications the node argument and HUGE page attribute is + * ignored. + */ +void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) +{ + return __vmalloc_noprof(size, gfp_mask); +} /* * vzalloc - allocate virtually contiguous memory with zero fill @@ -399,7 +415,8 @@ static const struct ctl_table nommu_table[] = { }; /* - * initialise the percpu counter for VM and region record slabs + * initialise the percpu counter for VM and region record slabs, initialise VMA + * state. */ void __init mmap_init(void) { @@ -409,6 +426,7 @@ void __init mmap_init(void) VM_BUG_ON(ret); vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT); register_sysctl_init("vm", nommu_table); + vma_state_init(); } /* @@ -627,22 +645,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL(find_vma); /* - * At least xtensa ends up having protection faults even with no - * MMU.. No stack expansion, at least. 
- */ -struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, - unsigned long addr, struct pt_regs *regs) -{ - struct vm_area_struct *vma; - - mmap_read_lock(mm); - vma = vma_lookup(mm, addr); - if (!vma) - mmap_read_unlock(mm); - return vma; -} - -/* * expand a stack to a given address * - not supported under NOMMU conditions */ @@ -717,7 +719,7 @@ static int validate_mmap_request(struct file *file, if (file) { /* files must support mmap */ - if (!file->f_op->mmap) + if (!can_mmap_file(file)) return -ENODEV; /* work out if what we've got could possibly be shared @@ -842,12 +844,12 @@ static int validate_mmap_request(struct file *file, * we've determined that we can make the mapping, now translate what we * now know into VMA flags */ -static unsigned long determine_vm_flags(struct file *file, - unsigned long prot, - unsigned long flags, - unsigned long capabilities) +static vm_flags_t determine_vm_flags(struct file *file, + unsigned long prot, + unsigned long flags, + unsigned long capabilities) { - unsigned long vm_flags; + vm_flags_t vm_flags; vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(file, flags); @@ -1890,3 +1892,11 @@ static int __meminit init_admin_reserve(void) return 0; } subsys_initcall(init_admin_reserve); + +int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +{ + mmap_write_lock(oldmm); + dup_mm_exe_file(mm, oldmm); + mmap_write_unlock(oldmm); + return 0; +} diff --git a/mm/numa.c b/mm/numa.c index f1787d7713a6..7d5e06fe5bd4 100644 --- a/mm/numa.c +++ b/mm/numa.c @@ -13,7 +13,6 @@ void __init alloc_node_data(int nid) { const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES); u64 nd_pa; - void *nd; int tnid; /* Allocate node data. Try node-local memory and then any node. */ @@ -21,7 +20,6 @@ void __init alloc_node_data(int nid) if (!nd_pa) panic("Cannot allocate %zu bytes for node %d data\n", nd_size, nid); - nd = __va(nd_pa); /* report and initialize */ pr_info("NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid, @@ -30,7 +28,7 @@ void __init alloc_node_data(int nid) if (tnid != nid) pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid); - node_data[nid] = nd; + node_data[nid] = __va(nd_pa); memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); } diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c index ff4054f4334d..541a99c4071a 100644 --- a/mm/numa_memblks.c +++ b/mm/numa_memblks.c @@ -201,6 +201,28 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) } /** + * numa_add_reserved_memblk - Add one numa_memblk to numa_reserved_meminfo + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @end: End address of the new memblk + * + * Add a new memblk to the numa_reserved_meminfo. + * + * Usage Case: numa_cleanup_meminfo() reconciles all numa_memblk instances + * against memblock_type information and moves any that intersect reserved + * ranges to numa_reserved_meminfo. However, when that information is known + * ahead of time, we use numa_add_reserved_memblk() to add the numa_memblk + * to numa_reserved_meminfo directly. + * + * RETURNS: + * 0 on success, -errno on failure. 
+ */ +int __init numa_add_reserved_memblk(int nid, u64 start, u64 end) +{ + return numa_add_memblk_to(nid, start, end, &numa_reserved_meminfo); +} + +/** * numa_cleanup_meminfo - Cleanup a numa_meminfo * @mi: numa_meminfo to clean up * diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 18456ddd463b..3e248d1c3969 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -41,6 +41,7 @@ #include <trace/events/writeback.h> #include "internal.h" +#include "swap.h" /* * Sleep at most 200ms at a time in balance_dirty_pages(). @@ -520,8 +521,8 @@ static int dirty_ratio_handler(const struct ctl_table *table, int write, void *b ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { - writeback_set_ratelimit(); vm_dirty_bytes = 0; + writeback_set_ratelimit(); } return ret; } @@ -607,7 +608,7 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc); */ static void writeout_period(struct timer_list *t) { - struct wb_domain *dom = from_timer(dom, t, period_timer); + struct wb_domain *dom = timer_container_of(dom, t, period_timer); int miss_periods = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN; @@ -640,7 +641,7 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp) #ifdef CONFIG_CGROUP_WRITEBACK void wb_domain_exit(struct wb_domain *dom) { - del_timer_sync(&dom->period_timer); + timer_delete_sync(&dom->period_timer); fprop_global_destroy(&dom->completions); } #endif @@ -1100,9 +1101,7 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc) * such filesystems balance_dirty_pages always checks wb counters * against wb limits. Even if global "nr_dirty" is under "freerun". * This is especially important for fuse which sets bdi->max_ratio to - * 1% by default. Without strictlimit feature, fuse writeback may - * consume arbitrary amount of RAM because it is accounted in - * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". + * 1% by default. * * Here, in wb_position_ratio(), we calculate pos_ratio based on * two values: wb_dirty and wb_thresh. Let's consider an example: @@ -2202,7 +2201,7 @@ static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int void laptop_mode_timer_fn(struct timer_list *t) { struct backing_dev_info *backing_dev_info = - from_timer(backing_dev_info, t, laptop_mode_wb_timer); + timer_container_of(backing_dev_info, t, laptop_mode_wb_timer); wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER); } @@ -2229,7 +2228,7 @@ void laptop_sync_completion(void) rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) - del_timer(&bdi->laptop_mode_wb_timer); + timer_delete(&bdi->laptop_mode_wb_timer); rcu_read_unlock(); } @@ -2564,11 +2563,11 @@ struct folio *writeback_iter(struct address_space *mapping, if (!folio) { /* * To avoid deadlocks between range_cyclic writeback and callers - * that hold pages in PageWriteback to aggregate I/O until + * that hold folios in writeback to aggregate I/O until * the writeback iteration finishes, we do not loop back to the - * start of the file. Doing so causes a page lock/page + * start of the file. Doing so causes a folio lock/folio * writeback access order inversion - we should only ever lock - * multiple pages in ascending page->index order, and looping + * multiple folios in ascending folio->index order, and looping * back to the start of the file violates that rule and causes * deadlocks. 
*/ @@ -2621,27 +2620,6 @@ int write_cache_pages(struct address_space *mapping, } EXPORT_SYMBOL(write_cache_pages); -static int writeback_use_writepage(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct folio *folio = NULL; - struct blk_plug plug; - int err; - - blk_start_plug(&plug); - while ((folio = writeback_iter(mapping, wbc, folio, &err))) { - err = mapping->a_ops->writepage(&folio->page, wbc); - if (err == AOP_WRITEPAGE_ACTIVATE) { - folio_unlock(folio); - err = 0; - } - mapping_set_error(mapping, err); - } - blk_finish_plug(&plug); - - return err; -} - int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; @@ -2652,14 +2630,11 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) wb = inode_to_wb_wbc(mapping->host, wbc); wb_bandwidth_estimate_start(wb); while (1) { - if (mapping->a_ops->writepages) { + if (mapping->a_ops->writepages) ret = mapping->a_ops->writepages(mapping, wbc); - } else if (mapping->a_ops->writepage) { - ret = writeback_use_writepage(mapping, wbc); - } else { + else /* deal with chardevs and other special files */ ret = 0; - } if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL) break; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5b173c2da641..d1d037f97c5f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -290,7 +290,8 @@ EXPORT_SYMBOL(nr_online_nodes); #endif static bool page_contains_unaccepted(struct page *page, unsigned int order); -static bool cond_accept_memory(struct zone *zone, unsigned int order); +static bool cond_accept_memory(struct zone *zone, unsigned int order, + int alloc_flags); static bool __free_unaccepted(struct page *page); int page_group_by_mobility_disabled __read_mostly; @@ -352,81 +353,225 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; } +static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit) +{ + return pb_bit > PB_migrate_end && pb_bit < __NR_PAGEBLOCK_BITS; +} + +static __always_inline void +get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn, + unsigned long **bitmap_word, unsigned long *bitidx) +{ + unsigned long *bitmap; + unsigned long word_bitidx; + +#ifdef CONFIG_MEMORY_ISOLATION + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 8); +#else + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); +#endif + BUILD_BUG_ON(__MIGRATE_TYPE_END >= (1 << PB_migratetype_bits)); + VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); + + bitmap = get_pageblock_bitmap(page, pfn); + *bitidx = pfn_to_bitidx(page, pfn); + word_bitidx = *bitidx / BITS_PER_LONG; + *bitidx &= (BITS_PER_LONG - 1); + *bitmap_word = &bitmap[word_bitidx]; +} + + /** - * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages + * __get_pfnblock_flags_mask - Return the requested group of flags for + * a pageblock_nr_pages block of pages * @page: The page within the block of interest * @pfn: The target page frame number * @mask: mask of bits that the caller is interested in * * Return: pageblock_bits flags */ -unsigned long get_pfnblock_flags_mask(const struct page *page, - unsigned long pfn, unsigned long mask) +static unsigned long __get_pfnblock_flags_mask(const struct page *page, + unsigned long pfn, + unsigned long mask) { - unsigned long *bitmap; - unsigned long bitidx, word_bitidx; + unsigned long *bitmap_word; + unsigned long bitidx; unsigned long word; - bitmap = get_pageblock_bitmap(page, pfn); - bitidx = 
pfn_to_bitidx(page, pfn); - word_bitidx = bitidx / BITS_PER_LONG; - bitidx &= (BITS_PER_LONG-1); + get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx); /* - * This races, without locks, with set_pfnblock_flags_mask(). Ensure + * This races, without locks, with set_pfnblock_migratetype(). Ensure * a consistent read of the memory array, so that results, even though * racy, are not corrupted. */ - word = READ_ONCE(bitmap[word_bitidx]); + word = READ_ONCE(*bitmap_word); return (word >> bitidx) & mask; } -static __always_inline int get_pfnblock_migratetype(const struct page *page, - unsigned long pfn) +/** + * get_pfnblock_bit - Check if a standalone bit of a pageblock is set + * @page: The page within the block of interest + * @pfn: The target page frame number + * @pb_bit: pageblock bit to check + * + * Return: true if the bit is set, otherwise false + */ +bool get_pfnblock_bit(const struct page *page, unsigned long pfn, + enum pageblock_bits pb_bit) { - return get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK); + unsigned long *bitmap_word; + unsigned long bitidx; + + if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit))) + return false; + + get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx); + + return test_bit(bitidx + pb_bit, bitmap_word); } /** - * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages + * get_pfnblock_migratetype - Return the migratetype of a pageblock * @page: The page within the block of interest - * @flags: The flags to set * @pfn: The target page frame number - * @mask: mask of bits that the caller is interested in + * + * Return: The migratetype of the pageblock + * + * Use get_pfnblock_migratetype() if caller already has both @page and @pfn + * to save a call to page_to_pfn(). 
*/ -void set_pfnblock_flags_mask(struct page *page, unsigned long flags, - unsigned long pfn, - unsigned long mask) +__always_inline enum migratetype +get_pfnblock_migratetype(const struct page *page, unsigned long pfn) { - unsigned long *bitmap; - unsigned long bitidx, word_bitidx; - unsigned long word; + unsigned long mask = MIGRATETYPE_AND_ISO_MASK; + unsigned long flags; - BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); - BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits)); + flags = __get_pfnblock_flags_mask(page, pfn, mask); - bitmap = get_pageblock_bitmap(page, pfn); - bitidx = pfn_to_bitidx(page, pfn); - word_bitidx = bitidx / BITS_PER_LONG; - bitidx &= (BITS_PER_LONG-1); +#ifdef CONFIG_MEMORY_ISOLATION + if (flags & BIT(PB_migrate_isolate)) + return MIGRATE_ISOLATE; +#endif + return flags & MIGRATETYPE_MASK; +} - VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); +/** + * __set_pfnblock_flags_mask - Set the requested group of flags for + * a pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @pfn: The target page frame number + * @flags: The flags to set + * @mask: mask of bits that the caller is interested in + */ +static void __set_pfnblock_flags_mask(struct page *page, unsigned long pfn, + unsigned long flags, unsigned long mask) +{ + unsigned long *bitmap_word; + unsigned long bitidx; + unsigned long word; + + get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx); mask <<= bitidx; flags <<= bitidx; - word = READ_ONCE(bitmap[word_bitidx]); + word = READ_ONCE(*bitmap_word); do { - } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags)); + } while (!try_cmpxchg(bitmap_word, &word, (word & ~mask) | flags)); } -void set_pageblock_migratetype(struct page *page, int migratetype) +/** + * set_pfnblock_bit - Set a standalone bit of a pageblock + * @page: The page within the block of interest + * @pfn: The target page frame number + * @pb_bit: pageblock bit to set + */ +void set_pfnblock_bit(const struct page *page, unsigned long pfn, + enum pageblock_bits pb_bit) +{ + unsigned long *bitmap_word; + unsigned long bitidx; + + if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit))) + return; + + get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx); + + set_bit(bitidx + pb_bit, bitmap_word); +} + +/** + * clear_pfnblock_bit - Clear a standalone bit of a pageblock + * @page: The page within the block of interest + * @pfn: The target page frame number + * @pb_bit: pageblock bit to clear + */ +void clear_pfnblock_bit(const struct page *page, unsigned long pfn, + enum pageblock_bits pb_bit) +{ + unsigned long *bitmap_word; + unsigned long bitidx; + + if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit))) + return; + + get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx); + + clear_bit(bitidx + pb_bit, bitmap_word); +} + +/** + * set_pageblock_migratetype - Set the migratetype of a pageblock + * @page: The page within the block of interest + * @migratetype: migratetype to set + */ +static void set_pageblock_migratetype(struct page *page, + enum migratetype migratetype) { if (unlikely(page_group_by_mobility_disabled && migratetype < MIGRATE_PCPTYPES)) migratetype = MIGRATE_UNMOVABLE; - set_pfnblock_flags_mask(page, (unsigned long)migratetype, - page_to_pfn(page), MIGRATETYPE_MASK); +#ifdef CONFIG_MEMORY_ISOLATION + if (migratetype == MIGRATE_ISOLATE) { + VM_WARN_ONCE(1, + "Use set_pageblock_isolate() for pageblock isolation"); + return; + } + VM_WARN_ONCE(get_pfnblock_bit(page, page_to_pfn(page), + PB_migrate_isolate), + "Use 
clear_pageblock_isolate() to unisolate pageblock"); + /* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */ +#endif + __set_pfnblock_flags_mask(page, page_to_pfn(page), + (unsigned long)migratetype, + MIGRATETYPE_AND_ISO_MASK); +} + +void __meminit init_pageblock_migratetype(struct page *page, + enum migratetype migratetype, + bool isolate) +{ + unsigned long flags; + + if (unlikely(page_group_by_mobility_disabled && + migratetype < MIGRATE_PCPTYPES)) + migratetype = MIGRATE_UNMOVABLE; + + flags = migratetype; + +#ifdef CONFIG_MEMORY_ISOLATION + if (migratetype == MIGRATE_ISOLATE) { + VM_WARN_ONCE( + 1, + "Set isolate=true to isolate pageblock with a migratetype"); + return; + } + if (isolate) + flags |= BIT(PB_migrate_isolate); +#endif + __set_pfnblock_flags_mask(page, page_to_pfn(page), flags, + MIGRATETYPE_AND_ISO_MASK); } #ifdef CONFIG_DEBUG_VM @@ -666,7 +811,7 @@ static inline void __add_to_free_list(struct page *page, struct zone *zone, int nr_pages = 1 << order; VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, - "page type is %lu, passed migratetype is %d (nr=%d)\n", + "page type is %d, passed migratetype is %d (nr=%d)\n", get_pageblock_migratetype(page), migratetype, nr_pages); if (tail) @@ -692,7 +837,7 @@ static inline void move_to_free_list(struct page *page, struct zone *zone, /* Free page moving can fail, so it happens before the type update */ VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt, - "page type is %lu, passed migratetype is %d (nr=%d)\n", + "page type is %d, passed migratetype is %d (nr=%d)\n", get_pageblock_migratetype(page), old_mt, nr_pages); list_move_tail(&page->buddy_list, &area->free_list[new_mt]); @@ -714,7 +859,7 @@ static inline void __del_page_from_free_list(struct page *page, struct zone *zon int nr_pages = 1 << order; VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, - "page type is %lu, passed migratetype is %d (nr=%d)\n", + "page type is %d, passed migratetype is %d (nr=%d)\n", get_pageblock_migratetype(page), migratetype, nr_pages); /* clear reported state and update reported page count */ @@ -897,9 +1042,7 @@ static inline bool page_expected_state(struct page *page, #ifdef CONFIG_MEMCG page->memcg_data | #endif -#ifdef CONFIG_PAGE_POOL - ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) | -#endif + page_pool_page_is_pp(page) | (page->flags & check_flags))) return false; @@ -926,26 +1069,18 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; #endif -#ifdef CONFIG_PAGE_POOL - if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE)) + if (unlikely(page_pool_page_is_pp(page))) bad_reason = "page_pool leak"; -#endif return bad_reason; } -static void free_page_is_bad_report(struct page *page) -{ - bad_page(page, - page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); -} - static inline bool free_page_is_bad(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) return false; /* Something has gone sideways, find it */ - free_page_is_bad_report(page); + bad_page(page, page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); return true; } @@ -1151,14 +1286,9 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) __pgalloc_tag_sub(page, nr); } -static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) +/* When tag is not NULL, assuming mem_alloc_profiling_enabled */ +static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) { - struct 
alloc_tag *tag; - - if (!mem_alloc_profiling_enabled()) - return; - - tag = __pgalloc_tag_get(page); if (tag) this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); } @@ -1168,7 +1298,7 @@ static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} -static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {} +static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} #endif /* CONFIG_MEM_ALLOC_PROFILING */ @@ -1245,11 +1375,14 @@ __always_inline bool free_pages_prepare(struct page *page, (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; } } - if (PageMappingFlags(page)) { - if (PageAnon(page)) - mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); - page->mapping = NULL; + if (folio_test_anon(folio)) { + mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); + folio->mapping = NULL; } + if (unlikely(page_has_type(page))) + /* Reset the page_type (which overlays _mapcount) */ + page->page_type = UINT_MAX; + if (is_check_pages_enabled()) { if (free_page_is_bad(page)) bad++; @@ -1400,11 +1533,12 @@ static void free_one_page(struct zone *zone, struct page *page, struct llist_head *llhead; unsigned long flags; - if (!spin_trylock_irqsave(&zone->lock, flags)) { - if (unlikely(fpi_flags & FPI_TRYLOCK)) { + if (unlikely(fpi_flags & FPI_TRYLOCK)) { + if (!spin_trylock_irqsave(&zone->lock, flags)) { add_page_to_zone_llist(zone, page, order); return; } + } else { spin_lock_irqsave(&zone->lock, flags); } @@ -1593,7 +1727,7 @@ static __always_inline void page_del_and_expand(struct zone *zone, static void check_new_page_bad(struct page *page) { - if (unlikely(page->flags & __PG_HWPOISON)) { + if (unlikely(PageHWPoison(page))) { /* Don't complain about hwpoisoned pages */ if (PageBuddy(page)) __ClearPageBuddy(page); @@ -1794,8 +1928,8 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, #endif /* - * Change the type of a block and move all its free pages to that - * type's freelist. + * Move all free pages of a block to new type's freelist. Caller needs to + * change the block type. */ static int __move_freepages_block(struct zone *zone, unsigned long start_pfn, int old_mt, int new_mt) @@ -1827,8 +1961,6 @@ static int __move_freepages_block(struct zone *zone, unsigned long start_pfn, pages_moved += 1 << order; } - set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt); - return pages_moved; } @@ -1873,7 +2005,7 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page, * migration are movable. But we don't actually try * isolating, as that would be expensive. 
*/ - if (PageLRU(page) || __PageMovable(page)) + if (PageLRU(page) || page_has_movable_ops(page)) (*num_movable)++; pfn++; } @@ -1886,11 +2018,16 @@ static int move_freepages_block(struct zone *zone, struct page *page, int old_mt, int new_mt) { unsigned long start_pfn; + int res; if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) return -1; - return __move_freepages_block(zone, start_pfn, old_mt, new_mt); + res = __move_freepages_block(zone, start_pfn, old_mt, new_mt); + set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt); + + return res; + } #ifdef CONFIG_MEMORY_ISOLATION @@ -1918,11 +2055,19 @@ static unsigned long find_large_buddy(unsigned long start_pfn) return start_pfn; } +static inline void toggle_pageblock_isolate(struct page *page, bool isolate) +{ + if (isolate) + set_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate); + else + clear_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate); +} + /** - * move_freepages_block_isolate - move free pages in block for page isolation + * __move_freepages_block_isolate - move free pages in block for page isolation * @zone: the zone * @page: the pageblock page - * @migratetype: migratetype to set on the pageblock + * @isolate: to isolate the given pageblock or unisolate it * * This is similar to move_freepages_block(), but handles the special * case encountered in page isolation, where the block of interest @@ -1937,10 +2082,18 @@ static unsigned long find_large_buddy(unsigned long start_pfn) * * Returns %true if pages could be moved, %false otherwise. */ -bool move_freepages_block_isolate(struct zone *zone, struct page *page, - int migratetype) +static bool __move_freepages_block_isolate(struct zone *zone, + struct page *page, bool isolate) { unsigned long start_pfn, pfn; + int from_mt; + int to_mt; + + if (isolate == get_pageblock_isolate(page)) { + VM_WARN_ONCE(1, "%s a pageblock that is already in that state", + isolate ? 
"Isolate" : "Unisolate"); + return false; + } if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) return false; @@ -1957,7 +2110,7 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, del_page_from_free_list(buddy, zone, order, get_pfnblock_migratetype(buddy, pfn)); - set_pageblock_migratetype(page, migratetype); + toggle_pageblock_isolate(page, isolate); split_large_buddy(zone, buddy, pfn, order, FPI_NONE); return true; } @@ -1968,16 +2121,38 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, del_page_from_free_list(page, zone, order, get_pfnblock_migratetype(page, pfn)); - set_pageblock_migratetype(page, migratetype); + toggle_pageblock_isolate(page, isolate); split_large_buddy(zone, page, pfn, order, FPI_NONE); return true; } move: - __move_freepages_block(zone, start_pfn, - get_pfnblock_migratetype(page, start_pfn), - migratetype); + /* Use MIGRATETYPE_MASK to get non-isolate migratetype */ + if (isolate) { + from_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page), + MIGRATETYPE_MASK); + to_mt = MIGRATE_ISOLATE; + } else { + from_mt = MIGRATE_ISOLATE; + to_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page), + MIGRATETYPE_MASK); + } + + __move_freepages_block(zone, start_pfn, from_mt, to_mt); + toggle_pageblock_isolate(pfn_to_page(start_pfn), isolate); + return true; } + +bool pageblock_isolate_and_move_free_pages(struct zone *zone, struct page *page) +{ + return __move_freepages_block_isolate(zone, page, true); +} + +bool pageblock_unisolate_and_move_free_pages(struct zone *zone, struct page *page) +{ + return __move_freepages_block_isolate(zone, page, false); +} + #endif /* CONFIG_MEMORY_ISOLATION */ static void change_pageblock_range(struct page *pageblock_page, @@ -2077,31 +2252,25 @@ static bool should_try_claim_block(unsigned int order, int start_mt) /* * Check whether there is a suitable fallback freepage with requested order. - * Sets *claim_block to instruct the caller whether it should convert a whole - * pageblock to the returned migratetype. - * If only_claim is true, this function returns fallback_mt only if + * If claimable is true, this function returns fallback_mt only if * we would do this whole-block claiming. This would help to reduce * fragmentation due to mixed migratetype pages in one pageblock. */ int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool only_claim, bool *claim_block) + int migratetype, bool claimable) { int i; - int fallback_mt; + + if (claimable && !should_try_claim_block(order, migratetype)) + return -2; if (area->nr_free == 0) return -1; - *claim_block = false; for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { - fallback_mt = fallbacks[migratetype][i]; - if (free_area_empty(area, fallback_mt)) - continue; + int fallback_mt = fallbacks[migratetype][i]; - if (should_try_claim_block(order, migratetype)) - *claim_block = true; - - if (*claim_block || !only_claim) + if (!free_area_empty(area, fallback_mt)) return fallback_mt; } @@ -2175,6 +2344,7 @@ try_to_claim_block(struct zone *zone, struct page *page, if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) { __move_freepages_block(zone, start_pfn, block_type, start_type); + set_pageblock_migratetype(pfn_to_page(start_pfn), start_type); return __rmqueue_smallest(zone, order, start_type); } @@ -2182,23 +2352,15 @@ try_to_claim_block(struct zone *zone, struct page *page, } /* - * Try finding a free buddy page on the fallback list. 
- * - * This will attempt to claim a whole pageblock for the requested type - * to ensure grouping of such requests in the future. - * - * If a whole block cannot be claimed, steal an individual page, regressing to - * __rmqueue_smallest() logic to at least break up as little contiguity as - * possible. + * Try to allocate from some fallback migratetype by claiming the entire block, + * i.e. converting it to the allocation's start migratetype. * * The use of signed ints for order and current_order is a deliberate * deviation from the rest of this file, to make the for loop * condition simpler. - * - * Return the stolen page, or NULL if none can be found. */ static __always_inline struct page * -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype, +__rmqueue_claim(struct zone *zone, int order, int start_migratetype, unsigned int alloc_flags) { struct free_area *area; @@ -2206,7 +2368,6 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, int min_order = order; struct page *page; int fallback_mt; - bool claim_block; /* * Do not steal pages from freelists belonging to other pageblocks @@ -2225,53 +2386,73 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, --current_order) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, false, &claim_block); + start_migratetype, true); + + /* No block in that order */ if (fallback_mt == -1) continue; - if (!claim_block) + /* Advanced into orders too low to claim, abort */ + if (fallback_mt == -2) break; page = get_page_from_free_area(area, fallback_mt); page = try_to_claim_block(zone, page, current_order, order, start_migratetype, fallback_mt, alloc_flags); - if (page) - goto got_one; + if (page) { + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); + return page; + } } - if (alloc_flags & ALLOC_NOFRAGMENT) - return NULL; + return NULL; +} + +/* + * Try to steal a single page from some fallback migratetype. Leave the rest of + * the block as its current migratetype, potentially causing fragmentation. + */ +static __always_inline struct page * +__rmqueue_steal(struct zone *zone, int order, int start_migratetype) +{ + struct free_area *area; + int current_order; + struct page *page; + int fallback_mt; - /* No luck claiming pageblock. Find the smallest fallback page */ for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, false, &claim_block); + start_migratetype, false); if (fallback_mt == -1) continue; page = get_page_from_free_area(area, fallback_mt); page_del_and_expand(zone, page, order, current_order, fallback_mt); - goto got_one; + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); + return page; } return NULL; - -got_one: - trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, fallback_mt); - - return page; } +enum rmqueue_mode { + RMQUEUE_NORMAL, + RMQUEUE_CMA, + RMQUEUE_CLAIM, + RMQUEUE_STEAL, +}; + /* * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. 
*/ static __always_inline struct page * __rmqueue(struct zone *zone, unsigned int order, int migratetype, - unsigned int alloc_flags) + unsigned int alloc_flags, enum rmqueue_mode *mode) { struct page *page; @@ -2290,16 +2471,48 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, } } - page = __rmqueue_smallest(zone, order, migratetype); - if (unlikely(!page)) { - if (alloc_flags & ALLOC_CMA) + /* + * First try the freelists of the requested migratetype, then try + * fallbacks modes with increasing levels of fragmentation risk. + * + * The fallback logic is expensive and rmqueue_bulk() calls in + * a loop with the zone->lock held, meaning the freelists are + * not subject to any outside changes. Remember in *mode where + * we found pay dirt, to save us the search on the next call. + */ + switch (*mode) { + case RMQUEUE_NORMAL: + page = __rmqueue_smallest(zone, order, migratetype); + if (page) + return page; + fallthrough; + case RMQUEUE_CMA: + if (alloc_flags & ALLOC_CMA) { page = __rmqueue_cma_fallback(zone, order); - - if (!page) - page = __rmqueue_fallback(zone, order, migratetype, - alloc_flags); + if (page) { + *mode = RMQUEUE_CMA; + return page; + } + } + fallthrough; + case RMQUEUE_CLAIM: + page = __rmqueue_claim(zone, order, migratetype, alloc_flags); + if (page) { + /* Replenished preferred freelist, back to normal mode. */ + *mode = RMQUEUE_NORMAL; + return page; + } + fallthrough; + case RMQUEUE_STEAL: + if (!(alloc_flags & ALLOC_NOFRAGMENT)) { + page = __rmqueue_steal(zone, order, migratetype); + if (page) { + *mode = RMQUEUE_STEAL; + return page; + } + } } - return page; + return NULL; } /* @@ -2311,17 +2524,19 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { + enum rmqueue_mode rmqm = RMQUEUE_NORMAL; unsigned long flags; int i; - if (!spin_trylock_irqsave(&zone->lock, flags)) { - if (unlikely(alloc_flags & ALLOC_TRYLOCK)) + if (unlikely(alloc_flags & ALLOC_TRYLOCK)) { + if (!spin_trylock_irqsave(&zone->lock, flags)) return 0; + } else { spin_lock_irqsave(&zone->lock, flags); } for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, - alloc_flags); + alloc_flags, &rmqm); if (unlikely(page == NULL)) break; @@ -2631,10 +2846,10 @@ static void free_frozen_page_commit(struct zone *zone, * stops will be drained from vmstat refresh context. 
*/ if (order && order <= PAGE_ALLOC_COSTLY_ORDER) { - free_high = (pcp->free_count >= batch && + free_high = (pcp->free_count >= (batch + pcp->high_min / 2) && (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) && (!(pcp->flags & PCPF_FREE_HIGH_BATCH) || - pcp->count >= READ_ONCE(batch))); + pcp->count >= batch)); pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER; } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) { pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER; @@ -2937,15 +3152,18 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, do { page = NULL; - if (!spin_trylock_irqsave(&zone->lock, flags)) { - if (unlikely(alloc_flags & ALLOC_TRYLOCK)) + if (unlikely(alloc_flags & ALLOC_TRYLOCK)) { + if (!spin_trylock_irqsave(&zone->lock, flags)) return NULL; + } else { spin_lock_irqsave(&zone->lock, flags); } if (alloc_flags & ALLOC_HIGHATOMIC) page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); if (!page) { - page = __rmqueue(zone, order, migratetype, alloc_flags); + enum rmqueue_mode rmqm = RMQUEUE_NORMAL; + + page = __rmqueue(zone, order, migratetype, alloc_flags, &rmqm); /* * If the allocation fails, allow OOM handling and @@ -3094,7 +3312,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, /* * Do not instrument rmqueue() with KMSAN. This function may call - * __msan_poison_alloca() through a call to set_pfnblock_flags_mask(). + * __msan_poison_alloca() through a call to set_pfnblock_migratetype(). * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it * may call rmqueue() again, which will result in a deadlock. */ @@ -3422,18 +3640,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, return false; } -bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int highest_zoneidx) -{ - long free_pages = zone_page_state(z, NR_FREE_PAGES); - - if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) - free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - - return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0, - free_pages); -} - #ifdef CONFIG_NUMA int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; @@ -3522,7 +3728,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, retry: /* * Scan zonelist, looking for a zone with enough free. - * See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c. + * See also cpuset_current_node_allowed() comment in kernel/cgroup/cpuset.c. */ no_fallback = alloc_flags & ALLOC_NOFRAGMENT; z = ac->preferred_zoneref; @@ -3580,7 +3786,7 @@ retry: } } - cond_accept_memory(zone, order); + cond_accept_memory(zone, order, alloc_flags); /* * Detect whether the number of free pages is below high @@ -3607,7 +3813,7 @@ check_alloc_wmark: gfp_mask)) { int ret; - if (cond_accept_memory(zone, order)) + if (cond_accept_memory(zone, order, alloc_flags)) goto try_this_zone; /* @@ -3660,7 +3866,7 @@ try_this_zone: return page; } else { - if (cond_accept_memory(zone, order)) + if (cond_accept_memory(zone, order, alloc_flags)) goto try_this_zone; /* Try again if zone has deferred pages */ @@ -4209,7 +4415,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) /* * Ignore cpuset mems for non-blocking __GFP_HIGH (probably * GFP_ATOMIC) rather than fail, see the comment for - * cpuset_node_allowed(). + * cpuset_current_node_allowed(). 
*/ if (alloc_flags & ALLOC_MIN_RESERVE) alloc_flags &= ~ALLOC_CPUSET; @@ -4530,6 +4736,14 @@ restart: } retry: + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * infinite retries. + */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; + /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); @@ -4604,8 +4818,8 @@ retry: goto retry; /* Reclaim/compaction failed to prevent the fallback */ - if (defrag_mode) { - alloc_flags &= ALLOC_NOFRAGMENT; + if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT)) { + alloc_flags &= ~ALLOC_NOFRAGMENT; goto retry; } @@ -4813,7 +5027,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, goto failed; } - cond_accept_memory(zone, 0); + cond_accept_memory(zone, 0, alloc_flags); retry_this_zone: mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; if (zone_watermark_fast(zone, 0, mark, @@ -4822,7 +5036,7 @@ retry_this_zone: break; } - if (cond_accept_memory(zone, 0)) + if (cond_accept_memory(zone, 0, alloc_flags)) goto retry_this_zone; /* Try again if zone has deferred pages */ @@ -5003,11 +5217,28 @@ unsigned long get_zeroed_page_noprof(gfp_t gfp_mask) } EXPORT_SYMBOL(get_zeroed_page_noprof); +static void ___free_pages(struct page *page, unsigned int order, + fpi_t fpi_flags) +{ + /* get PageHead before we drop reference */ + int head = PageHead(page); + /* get alloc tag in case the page is released by others */ + struct alloc_tag *tag = pgalloc_tag_get(page); + + if (put_page_testzero(page)) + __free_frozen_pages(page, order, fpi_flags); + else if (!head) { + pgalloc_tag_sub_pages(tag, (1 << order) - 1); + while (order-- > 0) + __free_frozen_pages(page + (1 << order), order, + fpi_flags); + } +} + /** - * ___free_pages - Free pages allocated with alloc_pages(). + * __free_pages - Free pages allocated with alloc_pages(). * @page: The page pointer returned from alloc_pages(). * @order: The order of the allocation. - * @fpi_flags: Free Page Internal flags. * * This function can free multi-page allocations that are not compound * pages. It does not check that the @order passed in matches that of @@ -5024,21 +5255,6 @@ EXPORT_SYMBOL(get_zeroed_page_noprof); * Context: May be called in interrupt context or while holding a normal * spinlock, but not in NMI context or while holding a raw spinlock. */ -static void ___free_pages(struct page *page, unsigned int order, - fpi_t fpi_flags) -{ - /* get PageHead before we drop reference */ - int head = PageHead(page); - - if (put_page_testzero(page)) - __free_frozen_pages(page, order, fpi_flags); - else if (!head) { - pgalloc_tag_sub_pages(page, (1 << order) - 1); - while (order-- > 0) - __free_frozen_pages(page + (1 << order), order, - fpi_flags); - } -} void __free_pages(struct page *page, unsigned int order) { ___free_pages(page, order, FPI_NONE); @@ -5047,7 +5263,7 @@ EXPORT_SYMBOL(__free_pages); /* * Can be called while holding raw_spin_lock or from IRQ and NMI for any - * page type (not only those that came from try_alloc_pages) + * page type (not only those that came from alloc_pages_nolock) */ void free_pages_nolock(struct page *page, unsigned int order) { @@ -6478,13 +6694,9 @@ static void alloc_contig_dump_pages(struct list_head *page_list) } } -/* - * [start, end) must belong to a single zone. 
- * @migratetype: using migratetype to filter the type of migration in - * trace_mm_alloc_contig_migrate_range_info. - */ +/* [start, end) must belong to a single zone. */ static int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end, int migratetype) + unsigned long start, unsigned long end) { /* This function is based on compact_zone() from compaction.c. */ unsigned int nr_reclaimed; @@ -6496,10 +6708,6 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, .gfp_mask = cc->gfp_mask, .reason = MR_CONTIG_RANGE, }; - struct page *page; - unsigned long total_mapped = 0; - unsigned long total_migrated = 0; - unsigned long total_reclaimed = 0; lru_cache_disable(); @@ -6525,22 +6733,9 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, &cc->migratepages); cc->nr_migratepages -= nr_reclaimed; - if (trace_mm_alloc_contig_migrate_range_info_enabled()) { - total_reclaimed += nr_reclaimed; - list_for_each_entry(page, &cc->migratepages, lru) { - struct folio *folio = page_folio(page); - - total_mapped += folio_mapped(folio) * - folio_nr_pages(folio); - } - } - ret = migrate_pages(&cc->migratepages, alloc_migration_target, NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL); - if (trace_mm_alloc_contig_migrate_range_info_enabled() && !ret) - total_migrated += cc->nr_migratepages; - /* * On -ENOMEM, migrate_pages() bails out right away. It is pointless * to retry again over this error, so do the same here. @@ -6556,10 +6751,6 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, putback_movable_pages(&cc->migratepages); } - trace_mm_alloc_contig_migrate_range_info(start, end, migratetype, - total_migrated, - total_reclaimed, - total_mapped); return (ret < 0) ? ret : 0; } @@ -6627,10 +6818,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) * alloc_contig_range() -- tries to allocate given range of pages * @start: start PFN to allocate * @end: one-past-the-last PFN to allocate - * @migratetype: migratetype of the underlying pageblocks (either - * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks - * in range must have the same migratetype and it must - * be either of the two. + * @alloc_flags: allocation information * @gfp_mask: GFP mask. Node/zone/placement hints are ignored; only some * action and reclaim modifiers are supported. Reclaim modifiers * control allocation behavior during compaction/migration/reclaim. @@ -6647,7 +6835,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) * need to be freed with free_contig_range(). */ int alloc_contig_range_noprof(unsigned long start, unsigned long end, - unsigned migratetype, gfp_t gfp_mask) + acr_flags_t alloc_flags, gfp_t gfp_mask) { unsigned long outer_start, outer_end; int ret = 0; @@ -6662,6 +6850,9 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, .alloc_contig = true, }; INIT_LIST_HEAD(&cc.migratepages); + enum pb_isolate_mode mode = (alloc_flags & ACR_FLAGS_CMA) ? + PB_ISOLATE_MODE_CMA_ALLOC : + PB_ISOLATE_MODE_OTHER; gfp_mask = current_gfp_context(gfp_mask); if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask)) @@ -6688,7 +6879,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, * put back to page allocator so that buddy can use them. 
*/ - ret = start_isolate_page_range(start, end, migratetype, 0); + ret = start_isolate_page_range(start, end, mode); if (ret) goto done; @@ -6704,7 +6895,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, * allocated. So, if we fall through be sure to clear ret so that * -EBUSY is not accidentally used or returned to caller. */ - ret = __alloc_contig_migrate_range(&cc, start, end, migratetype); + ret = __alloc_contig_migrate_range(&cc, start, end); if (ret && ret != -EBUSY) goto done; @@ -6738,7 +6929,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, outer_start = find_large_buddy(start); /* Make sure the range is really isolated. */ - if (test_pages_isolated(outer_start, end, 0)) { + if (test_pages_isolated(outer_start, end, mode)) { ret = -EBUSY; goto done; } @@ -6771,7 +6962,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, start, end, outer_start, outer_end); } done: - undo_isolate_page_range(start, end, migratetype); + undo_isolate_page_range(start, end); return ret; } EXPORT_SYMBOL(alloc_contig_range_noprof); @@ -6781,8 +6972,8 @@ static int __alloc_contig_pages(unsigned long start_pfn, { unsigned long end_pfn = start_pfn + nr_pages; - return alloc_contig_range_noprof(start_pfn, end_pfn, MIGRATE_MOVABLE, - gfp_mask); + return alloc_contig_range_noprof(start_pfn, end_pfn, ACR_FLAGS_NONE, + gfp_mask); } static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, @@ -7138,9 +7329,6 @@ bool has_managed_dma(void) #ifdef CONFIG_UNACCEPTED_MEMORY -/* Counts number of zones with unaccepted pages. */ -static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages); - static bool lazy_accept = true; static int __init accept_memory_parse(char *p) @@ -7167,11 +7355,7 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order) static void __accept_page(struct zone *zone, unsigned long *flags, struct page *page) { - bool last; - list_del(&page->lru); - last = list_empty(&zone->unaccepted_pages); - account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); __ClearPageUnaccepted(page); @@ -7180,9 +7364,6 @@ static void __accept_page(struct zone *zone, unsigned long *flags, accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER); __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); - - if (last) - static_branch_dec(&zones_with_unaccepted_pages); } void accept_page(struct page *page) @@ -7219,20 +7400,17 @@ static bool try_to_accept_memory_one(struct zone *zone) return true; } -static inline bool has_unaccepted_memory(void) -{ - return static_branch_unlikely(&zones_with_unaccepted_pages); -} - -static bool cond_accept_memory(struct zone *zone, unsigned int order) +static bool cond_accept_memory(struct zone *zone, unsigned int order, + int alloc_flags) { long to_accept, wmark; bool ret = false; - if (!has_unaccepted_memory()) + if (list_empty(&zone->unaccepted_pages)) return false; - if (list_empty(&zone->unaccepted_pages)) + /* Bailout, since try_to_accept_memory_one() needs to take a lock */ + if (alloc_flags & ALLOC_TRYLOCK) return false; wmark = promo_wmark_pages(zone); @@ -7265,22 +7443,17 @@ static bool __free_unaccepted(struct page *page) { struct zone *zone = page_zone(page); unsigned long flags; - bool first = false; if (!lazy_accept) return false; spin_lock_irqsave(&zone->lock, flags); - first = list_empty(&zone->unaccepted_pages); list_add_tail(&page->lru, &zone->unaccepted_pages); account_freepages(zone, 
MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); __SetPageUnaccepted(page); spin_unlock_irqrestore(&zone->lock, flags); - if (first) - static_branch_inc(&zones_with_unaccepted_pages); - return true; } @@ -7291,7 +7464,8 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order) return false; } -static bool cond_accept_memory(struct zone *zone, unsigned int order) +static bool cond_accept_memory(struct zone *zone, unsigned int order, + int alloc_flags) { return false; } @@ -7305,20 +7479,21 @@ static bool __free_unaccepted(struct page *page) #endif /* CONFIG_UNACCEPTED_MEMORY */ /** - * try_alloc_pages - opportunistic reentrant allocation from any context + * alloc_pages_nolock - opportunistic reentrant allocation from any context * @nid: node to allocate from * @order: allocation order size * * Allocates pages of a given order from the given node. This is safe to * call from any context (from atomic, NMI, and also reentrant - * allocator -> tracepoint -> try_alloc_pages_noprof). + * allocator -> tracepoint -> alloc_pages_nolock_noprof). * Allocation is best effort and to be expected to fail easily so nobody should * rely on the success. Failures are not reported via warn_alloc(). * See always fail conditions below. * - * Return: allocated page or NULL on failure. + * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN. + * It means ENOMEM. There is no reason to call it again and expect !NULL. */ -struct page *try_alloc_pages_noprof(int nid, unsigned int order) +struct page *alloc_pages_nolock_noprof(int nid, unsigned int order) { /* * Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed. @@ -7327,7 +7502,7 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order) * * These two are the conditions for gfpflags_allow_spinning() being true. * - * Specify __GFP_NOWARN since failing try_alloc_pages() is not a reason + * Specify __GFP_NOWARN since failing alloc_pages_nolock() is not a reason * to warn. Also warn would trigger printk() which is unsafe from * various contexts. We cannot use printk_deferred_enter() to mitigate, * since the running context is unknown. @@ -7337,7 +7512,7 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order) * BPF use cases. * * Though __GFP_NOMEMALLOC is not checked in the code path below, - * specify it here to highlight that try_alloc_pages() + * specify it here to highlight that alloc_pages_nolock() * doesn't want to deplete reserves. 
*/ gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC @@ -7362,11 +7537,6 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order) if (!pcp_allowed_order(order)) return NULL; -#ifdef CONFIG_UNACCEPTED_MEMORY - /* Bailout, since try_to_accept_memory_one() needs to take a lock */ - if (has_unaccepted_memory()) - return NULL; -#endif /* Bailout, since _deferred_grow_zone() needs to take a lock */ if (deferred_pages_enabled()) return NULL; diff --git a/mm/page_ext.c b/mm/page_ext.c index c351fdfe9e9a..d7396a8970e5 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -369,25 +369,15 @@ static void __invalidate_page_ext(unsigned long pfn) } static int __meminit online_page_ext(unsigned long start_pfn, - unsigned long nr_pages, - int nid) + unsigned long nr_pages) { + int nid = pfn_to_nid(start_pfn); unsigned long start, end, pfn; int fail = 0; start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); - if (nid == NUMA_NO_NODE) { - /* - * In this case, "nid" already exists and contains valid memory. - * "start_pfn" passed to us is a pfn which is an arg for - * online__pages(), and start_pfn should exist. - */ - nid = pfn_to_nid(start_pfn); - VM_BUG_ON(!node_online(nid)); - } - for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) fail = init_section_page_ext(pfn, nid); if (!fail) @@ -435,8 +425,7 @@ static int __meminit page_ext_callback(struct notifier_block *self, switch (action) { case MEM_GOING_ONLINE: - ret = online_page_ext(mn->start_pfn, - mn->nr_pages, mn->status_change_nid); + ret = online_page_ext(mn->start_pfn, mn->nr_pages); break; case MEM_OFFLINE: offline_page_ext(mn->start_pfn, diff --git a/mm/page_idle.c b/mm/page_idle.c index 408aaf29a3ea..a82b340dc204 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -208,7 +208,7 @@ static const struct bin_attribute *const page_idle_bin_attrs[] = { }; static const struct attribute_group page_idle_attr_group = { - .bin_attrs_new = page_idle_bin_attrs, + .bin_attrs = page_idle_bin_attrs, .name = "page_idle", }; diff --git a/mm/page_io.c b/mm/page_io.c index 4bce19df557b..a2056a5ecb13 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -237,15 +237,13 @@ static void swap_zeromap_folio_clear(struct folio *folio) * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. */ -int swap_writepage(struct page *page, struct writeback_control *wbc) +int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug) { - struct folio *folio = page_folio(page); - int ret; + int ret = 0; + + if (folio_free_swap(folio)) + goto out_unlock; - if (folio_free_swap(folio)) { - folio_unlock(folio); - return 0; - } /* * Arch code may have to preserve more data than just the page * contents, e.g. memory tags. @@ -253,8 +251,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) ret = arch_prepare_to_swap(folio); if (ret) { folio_mark_dirty(folio); - folio_unlock(folio); - return ret; + goto out_unlock; } /* @@ -265,28 +262,30 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) */ if (is_folio_zero_filled(folio)) { swap_zeromap_folio_set(folio); - folio_unlock(folio); - return 0; - } else { - /* - * Clear bits this folio occupies in the zeromap to prevent - * zero data being read in from any previous zero writes that - * occupied the same swap entries. 
- */ - swap_zeromap_folio_clear(folio); + goto out_unlock; } + + /* + * Clear bits this folio occupies in the zeromap to prevent zero data + * being read in from any previous zero writes that occupied the same + * swap entries. + */ + swap_zeromap_folio_clear(folio); + if (zswap_store(folio)) { count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT); - folio_unlock(folio); - return 0; + goto out_unlock; } if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio))) { folio_mark_dirty(folio); return AOP_WRITEPAGE_ACTIVATE; } - __swap_writepage(folio, wbc); + __swap_writepage(folio, swap_plug); return 0; +out_unlock: + folio_unlock(folio); + return ret; } static inline void count_swpout_vm_event(struct folio *folio) @@ -372,9 +371,9 @@ static void sio_write_complete(struct kiocb *iocb, long ret) mempool_free(sio, sio_pool); } -static void swap_writepage_fs(struct folio *folio, struct writeback_control *wbc) +static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug) { - struct swap_iocb *sio = NULL; + struct swap_iocb *sio = swap_plug ? *swap_plug : NULL; struct swap_info_struct *sis = swp_swap_info(folio->swap); struct file *swap_file = sis->swap_file; loff_t pos = swap_dev_pos(folio->swap); @@ -382,8 +381,6 @@ static void swap_writepage_fs(struct folio *folio, struct writeback_control *wbc count_swpout_vm_event(folio); folio_start_writeback(folio); folio_unlock(folio); - if (wbc->swap_plug) - sio = *wbc->swap_plug; if (sio) { if (sio->iocb.ki_filp != swap_file || sio->iocb.ki_pos + sio->len != pos) { @@ -402,22 +399,21 @@ static void swap_writepage_fs(struct folio *folio, struct writeback_control *wbc bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0); sio->len += folio_size(folio); sio->pages += 1; - if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) { + if (sio->pages == ARRAY_SIZE(sio->bvec) || !swap_plug) { swap_write_unplug(sio); sio = NULL; } - if (wbc->swap_plug) - *wbc->swap_plug = sio; + if (swap_plug) + *swap_plug = sio; } static void swap_writepage_bdev_sync(struct folio *folio, - struct writeback_control *wbc, struct swap_info_struct *sis) + struct swap_info_struct *sis) { struct bio_vec bv; struct bio bio; - bio_init(&bio, sis->bdev, &bv, 1, - REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc)); + bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP); bio.bi_iter.bi_sector = swap_folio_sector(folio); bio_add_folio_nofail(&bio, folio, folio_size(folio), 0); @@ -432,13 +428,11 @@ static void swap_writepage_bdev_sync(struct folio *folio, } static void swap_writepage_bdev_async(struct folio *folio, - struct writeback_control *wbc, struct swap_info_struct *sis) + struct swap_info_struct *sis) { struct bio *bio; - bio = bio_alloc(sis->bdev, 1, - REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc), - GFP_NOIO); + bio = bio_alloc(sis->bdev, 1, REQ_OP_WRITE | REQ_SWAP, GFP_NOIO); bio->bi_iter.bi_sector = swap_folio_sector(folio); bio->bi_end_io = end_swap_bio_write; bio_add_folio_nofail(bio, folio, folio_size(folio), 0); @@ -450,7 +444,7 @@ static void swap_writepage_bdev_async(struct folio *folio, submit_bio(bio); } -void __swap_writepage(struct folio *folio, struct writeback_control *wbc) +void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug) { struct swap_info_struct *sis = swp_swap_info(folio->swap); @@ -461,16 +455,16 @@ void __swap_writepage(struct folio *folio, struct writeback_control *wbc) * is safe. 
*/ if (data_race(sis->flags & SWP_FS_OPS)) - swap_writepage_fs(folio, wbc); + swap_writepage_fs(folio, swap_plug); /* * ->flags can be updated non-atomicially (scan_swap_map_slots), * but that will never affect SWP_SYNCHRONOUS_IO, so the data_race * is safe. */ else if (data_race(sis->flags & SWP_SYNCHRONOUS_IO)) - swap_writepage_bdev_sync(folio, wbc, sis); + swap_writepage_bdev_sync(folio, sis); else - swap_writepage_bdev_async(folio, wbc, sis); + swap_writepage_bdev_async(folio, sis); } void swap_write_unplug(struct swap_iocb *sio) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index a051a29e95ad..f72b6cd38b95 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -21,9 +21,9 @@ * consequently belong to a single zone. * * PageLRU check without isolation or lru_lock could race so that - * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable - * check without lock_page also may miss some movable non-lru pages at - * race condition. So you can't expect this function should be exact. + * MIGRATE_MOVABLE block might include unmovable pages. Similarly, pages + * with movable_ops can only be identified some time after they were + * allocated. So you can't expect this function should be exact. * * Returns a page without holding a reference. If the caller wants to * dereference that page (e.g., dumping), it has to make sure that it @@ -31,7 +31,7 @@ * */ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int flags) + enum pb_isolate_mode mode) { struct page *page = pfn_to_page(start_pfn); struct zone *zone = page_zone(page); @@ -46,7 +46,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e * isolate CMA pageblocks even when they are not movable in fact * so consider them movable here. */ - if (is_migrate_cma(migratetype)) + if (mode == PB_ISOLATE_MODE_CMA_ALLOC) return NULL; return page; @@ -83,9 +83,16 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e unsigned int skip_pages; if (PageHuge(page)) { - if (!hugepage_migration_supported(folio_hstate(folio))) + struct hstate *h; + + /* + * The huge page may be freed so can not + * use folio_hstate() directly. + */ + h = size_to_hstate(folio_size(folio)); + if (h && !hugepage_migration_supported(h)) return page; - } else if (!folio_test_lru(folio) && !__folio_test_movable(folio)) { + } else if (!folio_test_lru(folio)) { return page; } @@ -110,7 +117,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e * The HWPoisoned page may be not in buddy system, and * page_count() is not 0. */ - if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) + if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageHWPoison(page)) continue; /* @@ -123,10 +130,10 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e * move these pages that still have a reference count > 0. * (false negatives in this function only) */ - if ((flags & MEMORY_OFFLINE) && PageOffline(page)) + if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageOffline(page)) continue; - if (__PageMovable(page) || PageLRU(page)) + if (PageLRU(page) || page_has_movable_ops(page)) continue; /* @@ -144,7 +151,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e * present in [start_pfn, end_pfn). The pageblock must intersect with * [start_pfn, end_pfn). 
*/ -static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags, +static int set_migratetype_isolate(struct page *page, enum pb_isolate_mode mode, unsigned long start_pfn, unsigned long end_pfn) { struct zone *zone = page_zone(page); @@ -179,9 +186,9 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ end_pfn); unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, - migratetype, isol_flags); + mode); if (!unmovable) { - if (!move_freepages_block_isolate(zone, page, MIGRATE_ISOLATE)) { + if (!pageblock_isolate_and_move_free_pages(zone, page)) { spin_unlock_irqrestore(&zone->lock, flags); return -EBUSY; } @@ -191,7 +198,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ } spin_unlock_irqrestore(&zone->lock, flags); - if (isol_flags & REPORT_FAILURE) { + if (mode == PB_ISOLATE_MODE_MEM_OFFLINE) { /* * printk() with zone->lock held will likely trigger a * lockdep splat, so defer it here. @@ -202,7 +209,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ return -EBUSY; } -static void unset_migratetype_isolate(struct page *page, int migratetype) +static void unset_migratetype_isolate(struct page *page) { struct zone *zone; unsigned long flags; @@ -255,10 +262,10 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) * Isolating this block already succeeded, so this * should not fail on zone boundaries. */ - WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype)); + WARN_ON_ONCE(!pageblock_unisolate_and_move_free_pages(zone, page)); } else { - set_pageblock_migratetype(page, migratetype); - __putback_isolated_page(page, order, migratetype); + clear_pageblock_isolate(page); + __putback_isolated_page(page, order, get_pageblock_migratetype(page)); } zone->nr_isolate_pageblock--; out: @@ -285,11 +292,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * isolate_single_pageblock() -- tries to isolate a pageblock that might be * within a free or in-use page. * @boundary_pfn: pageblock-aligned pfn that a page might cross - * @flags: isolation flags + * @mode: isolation mode * @isolate_before: isolate the pageblock before the boundary_pfn * @skip_isolation: the flag to skip the pageblock isolation in second * isolate_single_pageblock() - * @migratetype: migrate type to set in error recovery. * * Free and in-use pages can be as big as MAX_PAGE_ORDER and contain more than one * pageblock. When not all pageblocks within a page are isolated at the same @@ -304,8 +310,9 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * either. The function handles this by splitting the free page or migrating * the in-use page then splitting the free page. 
*/ -static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, - bool isolate_before, bool skip_isolation, int migratetype) +static int isolate_single_pageblock(unsigned long boundary_pfn, + enum pb_isolate_mode mode, bool isolate_before, + bool skip_isolation) { unsigned long start_pfn; unsigned long isolate_pageblock; @@ -331,12 +338,11 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, zone->zone_start_pfn); if (skip_isolation) { - int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); - - VM_BUG_ON(!is_migrate_isolate(mt)); + VM_BUG_ON(!get_pageblock_isolate(pfn_to_page(isolate_pageblock))); } else { - ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype, - flags, isolate_pageblock, isolate_pageblock + pageblock_nr_pages); + ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), + mode, isolate_pageblock, + isolate_pageblock + pageblock_nr_pages); if (ret) return ret; @@ -376,7 +382,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, if (PageBuddy(page)) { int order = buddy_order(page); - /* move_freepages_block_isolate() handled this */ + /* pageblock_isolate_and_move_free_pages() handled this */ VM_WARN_ON_ONCE(pfn + (1 << order) > boundary_pfn); pfn += 1UL << order; @@ -415,7 +421,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, * proper free and split handling for them. */ VM_WARN_ON_ONCE_PAGE(PageLRU(page), page); - VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page); + VM_WARN_ON_ONCE_PAGE(page_has_movable_ops(page), page); goto failed; } @@ -426,7 +432,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, failed: /* restore the original migratetype */ if (!skip_isolation) - unset_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype); + unset_migratetype_isolate(pfn_to_page(isolate_pageblock)); return -EBUSY; } @@ -434,14 +440,7 @@ failed: * start_isolate_page_range() - mark page range MIGRATE_ISOLATE * @start_pfn: The first PFN of the range to be isolated. * @end_pfn: The last PFN of the range to be isolated. - * @migratetype: Migrate type to set in error recovery. - * @flags: The following flags are allowed (they can be combined in - * a bit mask) - * MEMORY_OFFLINE - isolate to offline (!allocate) memory - * e.g., skip over PageHWPoison() pages - * and PageOffline() pages. - * REPORT_FAILURE - report details about the failure to - * isolate the range + * @mode: isolation mode * * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in * the range will never be allocated. Any free pages and pages freed in the @@ -474,7 +473,7 @@ failed: * Return: 0 on success and -EBUSY if any part of range cannot be isolated. 
*/ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int flags) + enum pb_isolate_mode mode) { unsigned long pfn; struct page *page; @@ -485,8 +484,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, bool skip_isolation = false; /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */ - ret = isolate_single_pageblock(isolate_start, flags, false, - skip_isolation, migratetype); + ret = isolate_single_pageblock(isolate_start, mode, false, + skip_isolation); if (ret) return ret; @@ -494,10 +493,9 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, skip_isolation = true; /* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */ - ret = isolate_single_pageblock(isolate_end, flags, true, - skip_isolation, migratetype); + ret = isolate_single_pageblock(isolate_end, mode, true, skip_isolation); if (ret) { - unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype); + unset_migratetype_isolate(pfn_to_page(isolate_start)); return ret; } @@ -506,12 +504,11 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn < isolate_end - pageblock_nr_pages; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (page && set_migratetype_isolate(page, migratetype, flags, - start_pfn, end_pfn)) { - undo_isolate_page_range(isolate_start, pfn, migratetype); + if (page && set_migratetype_isolate(page, mode, start_pfn, + end_pfn)) { + undo_isolate_page_range(isolate_start, pfn); unset_migratetype_isolate( - pfn_to_page(isolate_end - pageblock_nr_pages), - migratetype); + pfn_to_page(isolate_end - pageblock_nr_pages)); return -EBUSY; } } @@ -522,13 +519,10 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * undo_isolate_page_range - undo effects of start_isolate_page_range() * @start_pfn: The first PFN of the isolated range * @end_pfn: The last PFN of the isolated range - * @migratetype: New migrate type to set on the range * - * This finds every MIGRATE_ISOLATE page block in the given range - * and switches it to @migratetype. + * This finds and unsets every MIGRATE_ISOLATE page block in the given range */ -void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - int migratetype) +void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; struct page *page; @@ -541,7 +535,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, page = __first_valid_page(pfn, pageblock_nr_pages); if (!page || !is_migrate_isolate_page(page)) continue; - unset_migratetype_isolate(page, migratetype); + unset_migratetype_isolate(page); } } /* @@ -553,7 +547,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, */ static unsigned long __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, - int flags) + enum pb_isolate_mode mode) { struct page *page; @@ -566,11 +560,12 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, * simple way to verify that as VM_BUG_ON(), though. 
*/ pfn += 1 << buddy_order(page); - else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) + else if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && + PageHWPoison(page)) /* A HWPoisoned page cannot be also PageBuddy */ pfn++; - else if ((flags & MEMORY_OFFLINE) && PageOffline(page) && - !page_count(page)) + else if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && + PageOffline(page) && !page_count(page)) /* * The responsible driver agreed to skip PageOffline() * pages when offlining memory by dropping its @@ -588,11 +583,11 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, * test_pages_isolated - check if pageblocks in range are isolated * @start_pfn: The first PFN of the isolated range * @end_pfn: The first PFN *after* the isolated range - * @isol_flags: Testing mode flags + * @mode: Testing mode * * This tests if all in the specified range are free. * - * If %MEMORY_OFFLINE is specified in @flags, it will consider + * If %PB_ISOLATE_MODE_MEM_OFFLINE specified in @mode, it will consider * poisoned and offlined pages free as well. * * Caller must ensure the requested range doesn't span zones. @@ -600,7 +595,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, * Returns 0 if true, -EBUSY if one or more pages are in use. */ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, - int isol_flags) + enum pb_isolate_mode mode) { unsigned long pfn, flags; struct page *page; @@ -636,7 +631,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, /* Check all pages are free or marked as ISOLATED */ zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); - pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, isol_flags); + pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, mode); spin_unlock_irqrestore(&zone->lock, flags); ret = pfn < end_pfn ? -EBUSY : 0; diff --git a/mm/page_owner.c b/mm/page_owner.c index cc4a6916eec6..c3ca21132c2c 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -302,7 +302,7 @@ void __reset_page_owner(struct page *page, unsigned short order) /* * Do not specify GFP_NOWAIT to make gfpflags_allow_spinning() == false * to prevent issues in stack_depot_save(). - * This is similar to try_alloc_pages() gfp flags, but only used + * This is similar to alloc_pages_nolock() gfp flags, but only used * to signal stack_depot to avoid spin_locks. 
*/ handle = save_stack(__GFP_NOWARN); @@ -333,9 +333,9 @@ noinline void __set_page_owner(struct page *page, unsigned short order, inc_stack_record_count(handle, gfp_mask, 1 << order); } -void __set_page_owner_migrate_reason(struct page *page, int reason) +void __folio_set_owner_migrate_reason(struct folio *folio, int reason) { - struct page_ext *page_ext = page_ext_get(page); + struct page_ext *page_ext = page_ext_get(&folio->page); struct page_owner *page_owner; if (unlikely(!page_ext)) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 68109ee93841..4eeca782b888 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -218,33 +218,39 @@ static inline void page_table_check_pmd_flags(pmd_t pmd) WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd))); } -void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) +void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd, + unsigned int nr) { + unsigned long stride = PMD_SIZE >> PAGE_SHIFT; + unsigned int i; + if (&init_mm == mm) return; page_table_check_pmd_flags(pmd); - __page_table_check_pmd_clear(mm, *pmdp); - if (pmd_user_accessible_page(pmd)) { - page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT, - pmd_write(pmd)); - } + for (i = 0; i < nr; i++) + __page_table_check_pmd_clear(mm, *(pmdp + i)); + if (pmd_user_accessible_page(pmd)) + page_table_check_set(pmd_pfn(pmd), stride * nr, pmd_write(pmd)); } -EXPORT_SYMBOL(__page_table_check_pmd_set); +EXPORT_SYMBOL(__page_table_check_pmds_set); -void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) +void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud, + unsigned int nr) { + unsigned long stride = PUD_SIZE >> PAGE_SHIFT; + unsigned int i; + if (&init_mm == mm) return; - __page_table_check_pud_clear(mm, *pudp); - if (pud_user_accessible_page(pud)) { - page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT, - pud_write(pud)); - } + for (i = 0; i < nr; i++) + __page_table_check_pud_clear(mm, *(pudp + i)); + if (pud_user_accessible_page(pud)) + page_table_check_set(pud_pfn(pud), stride * nr, pud_write(pud)); } -EXPORT_SYMBOL(__page_table_check_pud_set); +EXPORT_SYMBOL(__page_table_check_puds_set); void __page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index e463c3be934a..e981a1a292d2 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -246,8 +246,7 @@ restart: */ pmde = pmdp_get_lockless(pvmw->pmd); - if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde) || - (pmd_present(pmde) && pmd_devmap(pmde))) { + if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { pvmw->ptl = pmd_lock(mm, pvmw->pmd); pmde = *pvmw->pmd; if (!pmd_present(pmde)) { @@ -262,7 +261,7 @@ restart: return not_found(pvmw); return true; } - if (likely(pmd_trans_huge(pmde) || pmd_devmap(pmde))) { + if (likely(pmd_trans_huge(pmde))) { if (pvmw->flags & PVMW_MIGRATION) return not_found(pvmw); if (!check_pmd(pmd_pfn(pmde), pvmw)) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index e478777c86e1..648038247a8d 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -143,8 +143,7 @@ again: * We are ONLY installing, so avoid unnecessarily * splitting a present huge page. 
*/ - if (pmd_present(*pmd) && - (pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) + if (pmd_present(*pmd) && pmd_trans_huge(*pmd)) continue; } @@ -210,8 +209,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, * We are ONLY installing, so avoid unnecessarily * splitting a present huge page. */ - if (pud_present(*pud) && - (pud_trans_huge(*pud) || pud_devmap(*pud))) + if (pud_present(*pud) && pud_trans_huge(*pud)) continue; } @@ -422,7 +420,7 @@ static inline void process_mm_walk_lock(struct mm_struct *mm, { if (walk_lock == PGWALK_RDLOCK) mmap_assert_locked(mm); - else + else if (walk_lock != PGWALK_VMA_RDLOCK_VERIFY) mmap_assert_write_locked(mm); } @@ -437,6 +435,9 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma, case PGWALK_WRLOCK_VERIFY: vma_assert_write_locked(vma); break; + case PGWALK_VMA_RDLOCK_VERIFY: + vma_assert_locked(vma); + break; case PGWALK_RDLOCK: /* PGWALK_RDLOCK is handled by process_mm_walk_lock */ break; @@ -585,8 +586,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, } /** - * walk_page_range_novma - walk a range of pagetables not backed by a vma - * @mm: mm_struct representing the target process of page table walk + * walk_kernel_page_table_range - walk a range of kernel pagetables. * @start: start address of the virtual address range * @end: end address of the virtual address range * @ops: operation to call during the walk @@ -596,17 +596,61 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, * Similar to walk_page_range() but can walk any page tables even if they are * not backed by VMAs. Because 'unusual' entries may be walked this function * will also not lock the PTEs for the pte_entry() callback. This is useful for - * walking the kernel pages tables or page tables for firmware. + * walking kernel pages tables or page tables for firmware. * * Note: Be careful to walk the kernel pages tables, the caller may be need to * take other effective approaches (mmap lock may be insufficient) to prevent * the intermediate kernel page tables belonging to the specified address range * from being freed (e.g. memory hot-remove). */ -int walk_page_range_novma(struct mm_struct *mm, unsigned long start, +int walk_kernel_page_table_range(unsigned long start, unsigned long end, + const struct mm_walk_ops *ops, pgd_t *pgd, void *private) +{ + struct mm_struct *mm = &init_mm; + struct mm_walk walk = { + .ops = ops, + .mm = mm, + .pgd = pgd, + .private = private, + .no_vma = true + }; + + if (start >= end) + return -EINVAL; + if (!check_ops_valid(ops)) + return -EINVAL; + + /* + * Kernel intermediate page tables are usually not freed, so the mmap + * read lock is sufficient. But there are some exceptions. + * E.g. memory hot-remove. In which case, the mmap lock is insufficient + * to prevent the intermediate kernel pages tables belonging to the + * specified address range from being freed. The caller should take + * other actions to prevent this race. 
+ */ + mmap_assert_locked(mm); + + return walk_pgd_range(start, end, &walk); +} + +/** + * walk_page_range_debug - walk a range of pagetables not backed by a vma + * @mm: mm_struct representing the target process of page table walk + * @start: start address of the virtual address range + * @end: end address of the virtual address range + * @ops: operation to call during the walk + * @pgd: pgd to walk if different from mm->pgd + * @private: private data for callbacks' usage + * + * Similar to walk_page_range() but can walk any page tables even if they are + * not backed by VMAs. Because 'unusual' entries may be walked this function + * will also not lock the PTEs for the pte_entry() callback. + * + * This is for debugging purposes ONLY. + */ +int walk_page_range_debug(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, - pgd_t *pgd, - void *private) + pgd_t *pgd, void *private) { struct mm_walk walk = { .ops = ops, @@ -616,34 +660,24 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, .no_vma = true }; + /* For convenience, we allow traversal of kernel mappings. */ + if (mm == &init_mm) + return walk_kernel_page_table_range(start, end, ops, + pgd, private); if (start >= end || !walk.mm) return -EINVAL; if (!check_ops_valid(ops)) return -EINVAL; /* - * 1) For walking the user virtual address space: - * * The mmap lock protects the page walker from changes to the page * tables during the walk. However a read lock is insufficient to * protect those areas which don't have a VMA as munmap() detaches * the VMAs before downgrading to a read lock and actually tearing * down PTEs/page tables. In which case, the mmap write lock should - * be hold. - * - * 2) For walking the kernel virtual address space: - * - * The kernel intermediate page tables usually do not be freed, so - * the mmap map read lock is sufficient. But there are some exceptions. - * E.g. memory hot-remove. In which case, the mmap lock is insufficient - * to prevent the intermediate kernel pages tables belonging to the - * specified address range from being freed. The caller should take - * other actions to prevent this race. + * be held. */ - if (mm == &init_mm) - mmap_assert_locked(walk.mm); - else - mmap_assert_write_locked(walk.mm); + mmap_assert_write_locked(mm); return walk_pgd_range(start, end, &walk); } @@ -872,7 +906,7 @@ struct folio *folio_walk_start(struct folio_walk *fw, * TODO: FW_MIGRATION support for PUD migration entries * once there are relevant users. */ - if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) { + if (!pud_present(pud) || pud_special(pud)) { spin_unlock(ptl); goto not_found; } else if (!pud_leaf(pud)) { diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index dd3590dfc23d..9b9d5d6accae 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * mm/percpu-debug.c * * Copyright (C) 2017 Facebook Inc. 
* Copyright (C) 2017 Dennis Zhou <dennis@kernel.org> diff --git a/mm/percpu.c b/mm/percpu.c index b35494c8ede2..d9cbaee92b60 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -3355,7 +3355,7 @@ void __init setup_per_cpu_areas(void) */ unsigned long pcpu_nr_pages(void) { - return pcpu_nr_populated * pcpu_nr_units; + return data_race(READ_ONCE(pcpu_nr_populated)) * pcpu_nr_units; } /* diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 5a882f2b10f9..567e2d084071 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -139,8 +139,7 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && - !pmd_devmap(*pmdp)); + VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; @@ -153,7 +152,7 @@ pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pud_t pud; VM_BUG_ON(address & ~HPAGE_PUD_MASK); - VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp)); + VM_BUG_ON(!pud_trans_huge(*pudp)); pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp); flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); return pud; @@ -293,7 +292,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) *pmdvalp = pmdval; if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval))) goto nomap; - if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval))) + if (unlikely(pmd_trans_huge(pmdval))) goto nomap; if (unlikely(pmd_bad(pmdval))) { pmd_clear_bad(pmd); diff --git a/mm/ptdump.c b/mm/ptdump.c index 106e1d66e9f9..b600c7f864b8 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -4,6 +4,7 @@ #include <linux/debugfs.h> #include <linux/ptdump.h> #include <linux/kasan.h> +#include "internal.h" #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) /* @@ -18,7 +19,7 @@ static inline int note_kasan_page_table(struct mm_walk *walk, { struct ptdump_state *st = walk->private; - st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0])); + st->note_page_pte(st, addr, kasan_early_shadow_pte[0]); walk->action = ACTION_CONTINUE; @@ -38,11 +39,11 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, return note_kasan_page_table(walk, addr); #endif - if (st->effective_prot) - st->effective_prot(st, 0, pgd_val(val)); + if (st->effective_prot_pgd) + st->effective_prot_pgd(st, val); if (pgd_leaf(val)) { - st->note_page(st, addr, 0, pgd_val(val)); + st->note_page_pgd(st, addr, val); walk->action = ACTION_CONTINUE; } @@ -61,11 +62,11 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, return note_kasan_page_table(walk, addr); #endif - if (st->effective_prot) - st->effective_prot(st, 1, p4d_val(val)); + if (st->effective_prot_p4d) + st->effective_prot_p4d(st, val); if (p4d_leaf(val)) { - st->note_page(st, addr, 1, p4d_val(val)); + st->note_page_p4d(st, addr, val); walk->action = ACTION_CONTINUE; } @@ -84,11 +85,11 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, return note_kasan_page_table(walk, addr); #endif - if (st->effective_prot) - st->effective_prot(st, 2, pud_val(val)); + if (st->effective_prot_pud) + st->effective_prot_pud(st, val); if (pud_leaf(val)) { - st->note_page(st, addr, 2, pud_val(val)); + st->note_page_pud(st, addr, val); walk->action = ACTION_CONTINUE; } @@ -106,10 +107,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, return 
note_kasan_page_table(walk, addr); #endif - if (st->effective_prot) - st->effective_prot(st, 3, pmd_val(val)); + if (st->effective_prot_pmd) + st->effective_prot_pmd(st, val); if (pmd_leaf(val)) { - st->note_page(st, addr, 3, pmd_val(val)); + st->note_page_pmd(st, addr, val); walk->action = ACTION_CONTINUE; } @@ -122,10 +123,10 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr, struct ptdump_state *st = walk->private; pte_t val = ptep_get_lockless(pte); - if (st->effective_prot) - st->effective_prot(st, 4, pte_val(val)); + if (st->effective_prot_pte) + st->effective_prot_pte(st, val); - st->note_page(st, addr, 4, pte_val(val)); + st->note_page_pte(st, addr, val); return 0; } @@ -134,9 +135,31 @@ static int ptdump_hole(unsigned long addr, unsigned long next, int depth, struct mm_walk *walk) { struct ptdump_state *st = walk->private; - - st->note_page(st, addr, depth, 0); - + pte_t pte_zero = {0}; + pmd_t pmd_zero = {0}; + pud_t pud_zero = {0}; + p4d_t p4d_zero = {0}; + pgd_t pgd_zero = {0}; + + switch (depth) { + case 4: + st->note_page_pte(st, addr, pte_zero); + break; + case 3: + st->note_page_pmd(st, addr, pmd_zero); + break; + case 2: + st->note_page_pud(st, addr, pud_zero); + break; + case 1: + st->note_page_p4d(st, addr, p4d_zero); + break; + case 0: + st->note_page_pgd(st, addr, pgd_zero); + break; + default: + break; + } return 0; } @@ -153,16 +176,18 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd) { const struct ptdump_range *range = st->range; + get_online_mems(); mmap_write_lock(mm); while (range->start != range->end) { - walk_page_range_novma(mm, range->start, range->end, + walk_page_range_debug(mm, range->start, range->end, &ptdump_ops, pgd, st); range++; } mmap_write_unlock(mm); + put_online_mems(); /* Flush out the last page */ - st->note_page(st, 0, -1, 0); + st->note_page_flush(st); } static int check_wx_show(struct seq_file *m, void *v) diff --git a/mm/readahead.c b/mm/readahead.c index 6a4e96b69702..406756d34309 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -457,7 +457,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, } void page_cache_ra_order(struct readahead_control *ractl, - struct file_ra_state *ra, unsigned int new_order) + struct file_ra_state *ra) { struct address_space *mapping = ractl->mapping; pgoff_t start = readahead_index(ractl); @@ -468,24 +468,21 @@ void page_cache_ra_order(struct readahead_control *ractl, unsigned int nofs; int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); - unsigned int min_ra_size = max(4, mapping_min_folio_nrpages(mapping)); + unsigned int new_order = ra->order; - /* - * Fallback when size < min_nrpages as each folio should be - * at least min_nrpages anyway. 
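Note on the ptdump rework above: the single note_page(st, addr, level, val) callback is replaced by typed per-level hooks, and holes are reported through the same hooks with a zeroed entry, so callers no longer interpret a numeric depth. A minimal sketch of what a user's state now looks like; the callback names and argument types are taken from the call sites in the hunk, while the my_* helpers are hypothetical:

	static void my_note_pte(struct ptdump_state *st, unsigned long addr, pte_t pte)
	{
		/* decode pte_val(pte) and emit one line for this leaf entry */
	}

	static void my_note_flush(struct ptdump_state *st)
	{
		/* called once after the walk to flush the last pending range */
	}

	/* note_page_pmd/pud/p4d/pgd take the same arguments with pmd_t/pud_t/p4d_t/pgd_t */
	static struct ptdump_state my_st = {
		.note_page_pte   = my_note_pte,
		.note_page_flush = my_note_flush,
	};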
- */ - if (!mapping_large_folio_support(mapping) || ra->size < min_ra_size) + if (!mapping_large_folio_support(mapping)) { + ra->order = 0; goto fallback; + } limit = min(limit, index + ra->size - 1); - if (new_order < mapping_max_folio_order(mapping)) - new_order += 2; - new_order = min(mapping_max_folio_order(mapping), new_order); new_order = min_t(unsigned int, new_order, ilog2(ra->size)); new_order = max(new_order, min_order); + ra->order = new_order; + /* See comment in page_cache_ra_unbounded() */ nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); @@ -617,8 +614,9 @@ void page_cache_sync_ra(struct readahead_control *ractl, ra->size = min(contig_count + req_count, max_pages); ra->async_size = 1; readit: + ra->order = 0; ractl->_index = ra->start; - page_cache_ra_order(ractl, ra, 0); + page_cache_ra_order(ractl, ra); } EXPORT_SYMBOL_GPL(page_cache_sync_ra); @@ -628,8 +626,7 @@ void page_cache_async_ra(struct readahead_control *ractl, unsigned long max_pages; struct file_ra_state *ra = ractl->ra; pgoff_t index = readahead_index(ractl); - pgoff_t expected, start; - unsigned int order = folio_order(folio); + pgoff_t expected, start, end, aligned_end, align; /* no readahead */ if (!ra->ra_pages) @@ -652,7 +649,7 @@ void page_cache_async_ra(struct readahead_control *ractl, * Ramp up sizes, and push forward the readahead window. */ expected = round_down(ra->start + ra->size - ra->async_size, - 1UL << order); + folio_nr_pages(folio)); if (index == expected) { ra->start += ra->size; /* @@ -660,7 +657,6 @@ void page_cache_async_ra(struct readahead_control *ractl, * the readahead window. */ ra->size = max(ra->size, get_next_ra_size(ra, max_pages)); - ra->async_size = ra->size; goto readit; } @@ -681,18 +677,30 @@ void page_cache_async_ra(struct readahead_control *ractl, ra->size = start - index; /* old async_size */ ra->size += req_count; ra->size = get_next_ra_size(ra, max_pages); - ra->async_size = ra->size; readit: + ra->order += 2; + align = 1UL << min(ra->order, ffs(max_pages) - 1); + end = ra->start + ra->size; + aligned_end = round_down(end, align); + if (aligned_end > ra->start) + ra->size -= end - aligned_end; + ra->async_size = ra->size; ractl->_index = ra->start; - page_cache_ra_order(ractl, ra, order); + page_cache_ra_order(ractl, ra); } EXPORT_SYMBOL_GPL(page_cache_async_ra); ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { + struct file *file; + const struct inode *inode; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; - if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ)) + file = fd_file(f); + if (!(file->f_mode & FMODE_READ)) return -EBADF; /* @@ -700,9 +708,15 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count) * that can execute readahead. If readahead is not possible * on this file, then we must return -EINVAL. 
*/ - if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops || - (!S_ISREG(file_inode(fd_file(f))->i_mode) && - !S_ISBLK(file_inode(fd_file(f))->i_mode))) + if (!file->f_mapping) + return -EINVAL; + if (!file->f_mapping->a_ops) + return -EINVAL; + + inode = file_inode(file); + if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) + return -EINVAL; + if (IS_ANON_FILE(inode)) return -EINVAL; return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED); diff --git a/mm/rmap.c b/mm/rmap.c index 67bb273dfb80..f93ce27132ab 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -503,12 +503,12 @@ struct anon_vma *folio_get_anon_vma(const struct folio *folio) rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); - if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) + if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) goto out; if (!folio_mapped(folio)) goto out; - anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); + anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON); if (!atomic_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; @@ -550,12 +550,12 @@ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, retry: rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); - if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) + if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) goto out; if (!folio_mapped(folio)) goto out; - anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); + anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON); root_anon_vma = READ_ONCE(anon_vma->root); if (down_read_trylock(&root_anon_vma->rwsem)) { /* @@ -746,7 +746,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm) int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT; if (pending != flushed) { - arch_flush_tlb_batched_pending(mm); + flush_tlb_mm(mm); /* * If the new TLB flushing is pending during flushing, leave * mm->tlb_flush_batched as is, to avoid losing flushing. @@ -774,7 +774,7 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) * @vma: The VMA we need to know the address in. * * Calculates the user virtual address of this page in the specified VMA. - * It is the caller's responsibililty to check the page is actually + * It is the caller's responsibility to check the page is actually * within the VMA. There may not currently be a PTE pointing at this * page, but if a page fault occurs at this address, this is the page * which will be accessed. @@ -789,13 +789,13 @@ unsigned long page_address_in_vma(const struct folio *folio, const struct page *page, const struct vm_area_struct *vma) { if (folio_test_anon(folio)) { - struct anon_vma *page__anon_vma = folio_anon_vma(folio); + struct anon_vma *anon_vma = folio_anon_vma(folio); /* * Note: swapoff's unuse_vma() is more efficient with this * check, and needs it to match anon_vma when KSM is active. 
*/ - if (!vma->anon_vma || !page__anon_vma || - vma->anon_vma->root != page__anon_vma->root) + if (!vma->anon_vma || !anon_vma || + vma->anon_vma->root != anon_vma->root) return -EFAULT; } else if (!vma->vm_file) { return -EFAULT; @@ -803,7 +803,7 @@ unsigned long page_address_in_vma(const struct folio *folio, return -EFAULT; } - /* KSM folios don't reach here because of the !page__anon_vma check */ + /* KSM folios don't reach here because of the !anon_vma check */ return vma_address(vma, page_pgoff(folio, page), 1); } @@ -839,7 +839,7 @@ out: struct folio_referenced_arg { int mapcount; int referenced; - unsigned long vm_flags; + vm_flags_t vm_flags; struct mem_cgroup *memcg; }; @@ -984,7 +984,7 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) * the function bailed out due to rmap lock contention. */ int folio_referenced(struct folio *folio, int is_locked, - struct mem_cgroup *memcg, unsigned long *vm_flags) + struct mem_cgroup *memcg, vm_flags_t *vm_flags) { bool we_locked = false; struct folio_referenced_arg pra = { @@ -1334,9 +1334,9 @@ void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma) VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_VMA(!anon_vma, vma); - anon_vma += PAGE_MAPPING_ANON; + anon_vma += FOLIO_MAPPING_ANON; /* - * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written + * Ensure that anon_vma and the FOLIO_MAPPING_ANON bit are written * simultaneously, so a concurrent reader (eg folio_referenced()'s * folio_test_anon()) will not see one without the other. */ @@ -1367,10 +1367,10 @@ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma, /* * page_idle does a lockless/optimistic rmap scan on folio->mapping. * Make sure the compiler doesn't split the stores of anon_vma and - * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code + * the FOLIO_MAPPING_ANON type identifier, otherwise the rmap code * could mistake the mapping for a struct address_space and crash. */ - anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + anon_vma = (void *) anon_vma + FOLIO_MAPPING_ANON; WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma); folio->index = linear_page_index(vma, address); } @@ -1845,23 +1845,30 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page, #endif } -/* We support batch unmapping of PTEs for lazyfree large folios */ -static inline bool can_batch_unmap_folio_ptes(unsigned long addr, - struct folio *folio, pte_t *ptep) +static inline unsigned int folio_unmap_pte_batch(struct folio *folio, + struct page_vma_mapped_walk *pvmw, + enum ttu_flags flags, pte_t pte) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; - int max_nr = folio_nr_pages(folio); - pte_t pte = ptep_get(ptep); + unsigned long end_addr, addr = pvmw->address; + struct vm_area_struct *vma = pvmw->vma; + unsigned int max_nr; + + if (flags & TTU_HWPOISON) + return 1; + if (!folio_test_large(folio)) + return 1; + /* We may only batch within a single VMA and a single page table. */ + end_addr = pmd_addr_end(addr, vma->vm_end); + max_nr = (end_addr - addr) >> PAGE_SHIFT; + + /* We only support lazyfree batching for now ... 
*/ if (!folio_test_anon(folio) || folio_test_swapbacked(folio)) - return false; + return 1; if (pte_unused(pte)) - return false; - if (pte_pfn(pte) != folio_pfn(folio)) - return false; + return 1; - return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL, - NULL, NULL) == max_nr; + return folio_pte_batch(folio, pvmw->pte, pte, max_nr); } /* @@ -1944,7 +1951,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * restart so we can process the PTE-mapped THP. */ split_huge_pmd_locked(vma, pvmw.address, - pvmw.pmd, false, folio); + pvmw.pmd, false); flags &= ~TTU_SPLIT_HUGE_PMD; page_vma_mapped_walk_restart(&pvmw); continue; @@ -2024,9 +2031,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, if (pte_dirty(pteval)) folio_mark_dirty(folio); } else if (likely(pte_present(pteval))) { - if (folio_test_large(folio) && !(flags & TTU_HWPOISON) && - can_batch_unmap_folio_ptes(address, folio, pvmw.pte)) - nr_pages = folio_nr_pages(folio); + nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval); end_addr = address + nr_pages * PAGE_SIZE; flush_cache_range(vma, address, end_addr); @@ -2206,13 +2211,16 @@ discard: hugetlb_remove_rmap(folio); } else { folio_remove_rmap_ptes(folio, subpage, nr_pages, vma); - folio_ref_sub(folio, nr_pages - 1); } if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); - folio_put(folio); - /* We have already batched the entire folio */ - if (nr_pages > 1) + folio_put_refs(folio, nr_pages); + + /* + * If we are sure that we batched the entire folio and cleared + * all PTEs, we can just optimize and stop right here. + */ + if (nr_pages == folio_nr_pages(folio)) goto walk_done; continue; walk_abort: @@ -2292,13 +2300,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, pvmw.flags = PVMW_SYNC; /* - * unmap_page() in mm/huge_memory.c is the only user of migration with - * TTU_SPLIT_HUGE_PMD and it wants to freeze. - */ - if (flags & TTU_SPLIT_HUGE_PMD) - split_huge_pmd_address(vma, address, true, folio); - - /* * For THP, we have to assume the worse case ie pmd for invalidation. * For hugetlb, it could be much worse if we need to do pud * invalidation in the case of pmd sharing. @@ -2323,9 +2324,16 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { -#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION /* PMD-mapped THP migration entry */ if (!pvmw.pte) { + if (flags & TTU_SPLIT_HUGE_PMD) { + split_huge_pmd_locked(vma, pvmw.address, + pvmw.pmd, true); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION subpage = folio_page(folio, pmd_pfn(*pvmw.pmd) - folio_pfn(folio)); VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || @@ -2337,8 +2345,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, break; } continue; - } #endif + } /* Unexpected PMD-mapped THP? 
*/ VM_BUG_ON_FOLIO(!pvmw.pte, folio); diff --git a/mm/secretmem.c b/mm/secretmem.c index 1b0a214ee558..60137305bc20 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -54,7 +54,6 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf) pgoff_t offset = vmf->pgoff; gfp_t gfp = vmf->gfp_mask; unsigned long addr; - struct page *page; struct folio *folio; vm_fault_t ret; int err; @@ -65,16 +64,15 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf) filemap_invalidate_lock_shared(mapping); retry: - page = find_lock_page(mapping, offset); - if (!page) { + folio = filemap_lock_folio(mapping, offset); + if (IS_ERR(folio)) { folio = folio_alloc(gfp | __GFP_ZERO, 0); if (!folio) { ret = VM_FAULT_OOM; goto out; } - page = &folio->page; - err = set_direct_map_invalid_noflush(page); + err = set_direct_map_invalid_noflush(folio_page(folio, 0)); if (err) { folio_put(folio); ret = vmf_error(err); @@ -90,7 +88,7 @@ retry: * already happened when we marked the page invalid * which guarantees that this call won't fail */ - set_direct_map_default_noflush(page); + set_direct_map_default_noflush(folio_page(folio, 0)); if (err == -EEXIST) goto retry; @@ -98,11 +96,11 @@ retry: goto out; } - addr = (unsigned long)page_address(page); + addr = (unsigned long)folio_address(folio); flush_tlb_kernel_range(addr, addr + PAGE_SIZE); } - vmf->page = page; + vmf->page = folio_file_page(folio, vmf->pgoff); ret = VM_FAULT_LOCKED; out: @@ -120,18 +118,18 @@ static int secretmem_release(struct inode *inode, struct file *file) return 0; } -static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) +static int secretmem_mmap_prepare(struct vm_area_desc *desc) { - unsigned long len = vma->vm_end - vma->vm_start; + const unsigned long len = desc->end - desc->start; - if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) + if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; - if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) + if (!mlock_future_ok(desc->mm, desc->vm_flags | VM_LOCKED, len)) return -EAGAIN; - vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP); - vma->vm_ops = &secretmem_vm_ops; + desc->vm_flags |= VM_LOCKED | VM_DONTDUMP; + desc->vm_ops = &secretmem_vm_ops; return 0; } @@ -143,7 +141,7 @@ bool vma_is_secretmem(struct vm_area_struct *vma) static const struct file_operations secretmem_fops = { .release = secretmem_release, - .mmap = secretmem_mmap, + .mmap_prepare = secretmem_mmap_prepare, }; static int secretmem_migrate_folio(struct address_space *mapping, @@ -154,7 +152,7 @@ static int secretmem_migrate_folio(struct address_space *mapping, static void secretmem_free_folio(struct folio *folio) { - set_direct_map_default_noflush(&folio->page); + set_direct_map_default_noflush(folio_page(folio, 0)); folio_zero_segment(folio, 0, folio_size(folio)); } @@ -195,20 +193,13 @@ static struct file *secretmem_file_create(unsigned long flags) struct file *file; struct inode *inode; const char *anon_name = "[secretmem]"; - int err; - inode = alloc_anon_inode(secretmem_mnt->mnt_sb); + inode = anon_inode_make_secure_inode(secretmem_mnt->mnt_sb, anon_name, NULL); if (IS_ERR(inode)) return ERR_CAST(inode); - err = security_inode_init_security_anon(inode, &QSTR(anon_name), NULL); - if (err) { - file = ERR_PTR(err); - goto err_free_inode; - } - file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", - O_RDWR, &secretmem_fops); + O_RDWR | O_LARGEFILE, &secretmem_fops); if (IS_ERR(file)) goto err_free_inode; @@ -222,6 +213,8 @@ static struct file *secretmem_file_create(unsigned long 
flags) inode->i_mode |= S_IFREG; inode->i_size = 0; + atomic_inc(&secretmem_users); + return file; err_free_inode: @@ -255,9 +248,6 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) goto err_put_fd; } - file->f_flags |= O_LARGEFILE; - - atomic_inc(&secretmem_users); fd_install(fd, file); return fd; @@ -268,7 +258,15 @@ err_put_fd: static int secretmem_init_fs_context(struct fs_context *fc) { - return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM; + struct pseudo_fs_context *ctx; + + ctx = init_pseudo(fc, SECRETMEM_MAGIC); + if (!ctx) + return -ENOMEM; + + fc->s_iflags |= SB_I_NOEXEC; + fc->s_iflags |= SB_I_NODEV; + return 0; } static struct file_system_type secretmem_fs = { @@ -286,9 +284,6 @@ static int __init secretmem_init(void) if (IS_ERR(secretmem_mnt)) return PTR_ERR(secretmem_mnt); - /* prevent secretmem mappings from ever getting PROT_EXEC */ - secretmem_mnt->mnt_flags |= MNT_NOEXEC; - return 0; } fs_initcall(secretmem_init); diff --git a/mm/shmem.c b/mm/shmem.c index 99327c30507c..7fdd707ac1ac 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -98,7 +98,7 @@ static struct vfsmount *shm_mnt __ro_after_init; #define SHORT_SYMLINK_LEN 128 /* - * shmem_fallocate communicates with shmem_fault or shmem_writepage via + * shmem_fallocate communicates with shmem_fault or shmem_writeout via * inode->i_private (with i_rwsem making sure that it has only one user at * a time): we would prefer not to enlarge the shmem inode just for that. */ @@ -107,7 +107,7 @@ struct shmem_falloc { pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ - pgoff_t nr_unswapped; /* how often writepage refused to swap out */ + pgoff_t nr_unswapped; /* how often writeout refused to swap out */ }; struct shmem_options { @@ -292,7 +292,7 @@ bool vma_is_shmem(struct vm_area_struct *vma) } static LIST_HEAD(shmem_swaplist); -static DEFINE_MUTEX(shmem_swaplist_mutex); +static DEFINE_SPINLOCK(shmem_swaplist_lock); #ifdef CONFIG_TMPFS_QUOTA @@ -432,10 +432,13 @@ static void shmem_free_inode(struct super_block *sb, size_t freed_ispace) * * But normally info->alloced == inode->i_mapping->nrpages + info->swapped * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) + * + * Return: true if swapped was incremented from 0, for shmem_writeout(). */ -static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) +static bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped) { struct shmem_inode_info *info = SHMEM_I(inode); + bool first_swapped = false; long freed; spin_lock(&info->lock); @@ -446,12 +449,15 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) /* * Special case: whereas normally shmem_recalc_inode() is called * after i_mapping->nrpages has already been adjusted (up or down), - * shmem_writepage() has to raise swapped before nrpages is lowered - + * shmem_writeout() has to raise swapped before nrpages is lowered - * to stop a racing shmem_recalc_inode() from thinking that a page has * been freed. Compensate here, to avoid the need for a followup call. 
*/ - if (swapped > 0) + if (swapped > 0) { + if (info->swapped == swapped) + first_swapped = true; freed += swapped; + } if (freed > 0) info->alloced -= freed; spin_unlock(&info->lock); @@ -459,6 +465,7 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) /* The quota case may block */ if (freed > 0) shmem_inode_unacct_blocks(inode, freed); + return first_swapped; } bool shmem_charge(struct inode *inode, long pages) @@ -615,7 +622,7 @@ static unsigned int shmem_get_orders_within_size(struct inode *inode, static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, - unsigned long vm_flags) + vm_flags_t vm_flags) { unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ? 0 : BIT(HPAGE_PMD_ORDER); @@ -862,7 +869,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, - unsigned long vm_flags) + vm_flags_t vm_flags) { return 0; } @@ -1375,11 +1382,11 @@ static void shmem_evict_inode(struct inode *inode) /* Wait while shmem_unuse() is scanning this inode... */ wait_var_event(&info->stop_eviction, !atomic_read(&info->stop_eviction)); - mutex_lock(&shmem_swaplist_mutex); + spin_lock(&shmem_swaplist_lock); /* ...but beware of the race if we peeked too early */ if (!atomic_read(&info->stop_eviction)) list_del_init(&info->swaplist); - mutex_unlock(&shmem_swaplist_mutex); + spin_unlock(&shmem_swaplist_lock); } } @@ -1446,8 +1453,6 @@ static int shmem_unuse_swap_entries(struct inode *inode, for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; - if (!xa_is_value(folio)) - continue; error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE, mapping_gfp_mask(mapping), NULL, NULL); if (error == 0) { @@ -1504,7 +1509,8 @@ int shmem_unuse(unsigned int type) if (list_empty(&shmem_swaplist)) return 0; - mutex_lock(&shmem_swaplist_mutex); + spin_lock(&shmem_swaplist_lock); +start_over: list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { list_del_init(&info->swaplist); @@ -1517,31 +1523,38 @@ int shmem_unuse(unsigned int type) * (igrab() would protect from unlink, but not from unmount). */ atomic_inc(&info->stop_eviction); - mutex_unlock(&shmem_swaplist_mutex); + spin_unlock(&shmem_swaplist_lock); error = shmem_unuse_inode(&info->vfs_inode, type); cond_resched(); - mutex_lock(&shmem_swaplist_mutex); - next = list_next_entry(info, swaplist); - if (!info->swapped) - list_del_init(&info->swaplist); + spin_lock(&shmem_swaplist_lock); if (atomic_dec_and_test(&info->stop_eviction)) wake_up_var(&info->stop_eviction); if (error) break; + if (list_empty(&info->swaplist)) + goto start_over; + next = list_next_entry(info, swaplist); + if (!info->swapped) + list_del_init(&info->swaplist); } - mutex_unlock(&shmem_swaplist_mutex); + spin_unlock(&shmem_swaplist_lock); return error; } -/* - * Move the page from the page cache to the swap cache. +/** + * shmem_writeout - Write the folio to swap + * @folio: The folio to write + * @plug: swap plug + * @folio_list: list to put back folios on split + * + * Move the folio from the page cache to the swap cache. 
*/ -static int shmem_writepage(struct page *page, struct writeback_control *wbc) +int shmem_writeout(struct folio *folio, struct swap_iocb **plug, + struct list_head *folio_list) { - struct folio *folio = page_folio(page); struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); @@ -1550,16 +1563,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) int nr_pages; bool split = false; - /* - * Our capabilities prevent regular writeback or sync from ever calling - * shmem_writepage; but a stacking filesystem might use ->writepage of - * its underlying filesystem, in which case tmpfs should write out to - * swap only in response to memory pressure, and not for the writeback - * threads or sync. - */ - if (WARN_ON_ONCE(!wbc->for_reclaim)) - goto redirty; - if ((info->flags & VM_LOCKED) || sbinfo->noswap) goto redirty; @@ -1586,9 +1589,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) try_split: /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); - if (split_huge_page_to_list_to_order(page, wbc->list, 0)) + if (split_folio_to_list(folio, folio_list)) goto redirty; - folio = page_folio(page); folio_clear_dirty(folio); } @@ -1627,39 +1629,66 @@ try_split: folio_mark_uptodate(folio); } - /* - * Add inode to shmem_unuse()'s list of swapped-out inodes, - * if it's not already there. Do it now before the folio is - * moved to swap cache, when its pagelock no longer protects - * the inode from eviction. But don't unlock the mutex until - * we've incremented swapped, because shmem_unuse_inode() will - * prune a !swapped inode from the swaplist under this mutex. - */ - mutex_lock(&shmem_swaplist_mutex); - if (list_empty(&info->swaplist)) - list_add(&info->swaplist, &shmem_swaplist); - if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) { - shmem_recalc_inode(inode, 0, nr_pages); + bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); + int error; + + /* + * Add inode to shmem_unuse()'s list of swapped-out inodes, + * if it's not already there. Do it now before the folio is + * removed from page cache, when its pagelock no longer + * protects the inode from eviction. And do it now, after + * we've incremented swapped, because shmem_unuse() will + * prune a !swapped inode from the swaplist. + */ + if (first_swapped) { + spin_lock(&shmem_swaplist_lock); + if (list_empty(&info->swaplist)) + list_add(&info->swaplist, &shmem_swaplist); + spin_unlock(&shmem_swaplist_lock); + } + swap_shmem_alloc(folio->swap, nr_pages); shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); - mutex_unlock(&shmem_swaplist_mutex); BUG_ON(folio_mapped(folio)); - return swap_writepage(&folio->page, wbc); - } + error = swap_writeout(folio, plug); + if (error != AOP_WRITEPAGE_ACTIVATE) { + /* folio has been unlocked */ + return error; + } + + /* + * The intention here is to avoid holding on to the swap when + * zswap was unable to compress and unable to writeback; but + * it will be appropriate if other reactivate cases are added. 
+ */ + error = shmem_add_to_page_cache(folio, mapping, index, + swp_to_radix_entry(folio->swap), + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + /* Swap entry might be erased by racing shmem_free_swap() */ + if (!error) { + shmem_recalc_inode(inode, 0, -nr_pages); + swap_free_nr(folio->swap, nr_pages); + } - list_del_init(&info->swaplist); - mutex_unlock(&shmem_swaplist_mutex); + /* + * The delete_from_swap_cache() below could be left for + * shrink_folio_list()'s folio_free_swap() to dispose of; + * but I'm a little nervous about letting this folio out of + * shmem_writeout() in a hybrid half-tmpfs-half-swap state + * e.g. folio_mapping(folio) might give an unexpected answer. + */ + delete_from_swap_cache(folio); + goto redirty; + } if (nr_pages > 1) goto try_split; redirty: folio_mark_dirty(folio); - if (wbc->for_reclaim) - return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ - folio_unlock(folio); - return 0; + return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ } +EXPORT_SYMBOL_GPL(shmem_writeout); #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) @@ -1760,7 +1789,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, { unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); - unsigned long vm_flags = vma ? vma->vm_flags : 0; + vm_flags_t vm_flags = vma ? vma->vm_flags : 0; unsigned int global_orders; if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) @@ -2262,6 +2291,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, folio = swap_cache_get_folio(swap, NULL, 0); order = xa_get_order(&mapping->i_pages, index); if (!folio) { + int nr_pages = 1 << order; bool fallback_order0 = false; /* Or update major stats only when swapin succeeds?? */ @@ -2275,9 +2305,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * If uffd is active for the vma, we need per-page fault * fidelity to maintain the uffd semantics, then fallback * to swapin order-0 folio, as well as for zswap case. + * Any existing sub folio in the swap cache also blocks + * mTHP swapin. */ if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) || - !zswap_never_enabled())) + !zswap_never_enabled() || + non_swapcache_batch(swap, nr_pages) != nr_pages)) fallback_order0 = true; /* Skip swapcache for synchronous device. 
*/ @@ -2335,6 +2368,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, */ split_order = shmem_split_large_entry(inode, index, swap, gfp); if (split_order < 0) { + folio_put(folio); + folio = NULL; error = split_order; goto failed; } @@ -3267,9 +3302,9 @@ static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; static int -shmem_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); @@ -3301,9 +3336,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping, } static int -shmem_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +shmem_write_end(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; @@ -3768,7 +3803,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, index--; /* - * Inform shmem_writepage() how far we have reached. + * Inform shmem_writeout() how far we have reached. * No need for lock or barrier: we have the page lock. */ if (!folio_test_uptodate(folio)) @@ -4184,7 +4219,7 @@ static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, #ifdef CONFIG_TMPFS_XATTR -static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) +static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); @@ -4194,7 +4229,7 @@ static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) } static int shmem_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); @@ -4981,7 +5016,6 @@ static void shmem_put_super(struct super_block *sb) static const struct dentry_operations shmem_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, - .d_delete = always_delete_dentry, }; #endif @@ -5029,7 +5063,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) if (ctx->encoding) { sb->s_encoding = ctx->encoding; - sb->s_d_op = &shmem_ci_dentry_ops; + set_default_d_op(sb, &shmem_ci_dentry_ops); if (ctx->strict_encoding) sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL; } @@ -5038,6 +5072,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) #else sb->s_flags |= SB_NOUSER; #endif /* CONFIG_TMPFS */ + sb->s_d_flags |= DCACHE_DONTCACHE; sbinfo->max_blocks = ctx->blocks; sbinfo->max_inodes = ctx->inodes; sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE; @@ -5191,7 +5226,6 @@ static int shmem_error_remove_folio(struct address_space *mapping, } static const struct address_space_operations shmem_aops = { - .writepage = shmem_writepage, .dirty_folio = noop_dirty_folio, #ifdef CONFIG_TMPFS .write_begin = shmem_write_begin, @@ -5810,12 +5844,12 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); - if 
(shmem_acct_size(flags, size)) - return ERR_PTR(-ENOMEM); - if (is_idmapped_mnt(mnt)) return ERR_PTR(-EINVAL); + if (shmem_acct_size(flags, size)) + return ERR_PTR(-ENOMEM); + inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (IS_ERR(inode)) { diff --git a/mm/show_mem.c b/mm/show_mem.c index 6af13bcd2ab3..41999e94a56d 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -94,26 +94,20 @@ void si_meminfo_node(struct sysinfo *val, int nid) unsigned long free_highpages = 0; pg_data_t *pgdat = NODE_DATA(nid); - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); - val->totalram = managed_pages; - val->sharedram = node_page_state(pgdat, NR_SHMEM); - val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); -#ifdef CONFIG_HIGHMEM for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; - + managed_pages += zone_managed_pages(zone); if (is_highmem(zone)) { managed_highpages += zone_managed_pages(zone); free_highpages += zone_page_state(zone, NR_FREE_PAGES); } } + + val->totalram = managed_pages; + val->sharedram = node_page_state(pgdat, NR_SHMEM); + val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); val->totalhigh = managed_highpages; val->freehigh = free_highpages; -#else - val->totalhigh = managed_highpages; - val->freehigh = free_highpages; -#endif val->mem_unit = PAGE_SIZE; } #endif @@ -223,7 +217,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z global_node_page_state(NR_SHMEM), global_node_page_state(NR_PAGETABLE), global_node_page_state(NR_SECONDARY_PAGETABLE), - global_zone_page_state(NR_BOUNCE), + 0UL, global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), global_zone_page_state(NR_FREE_PAGES), free_pcp, @@ -252,7 +246,6 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z " shmem_pmdmapped:%lukB" " anon_thp:%lukB" #endif - " writeback_tmp:%lukB" " kernel_stack:%lukB" #ifdef CONFIG_SHADOW_CALL_STACK " shadow_call_stack:%lukB" @@ -279,7 +272,6 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), K(node_page_state(pgdat, NR_ANON_THPS)), #endif - K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), node_page_state(pgdat, NR_KERNEL_STACK_KB), #ifdef CONFIG_SHADOW_CALL_STACK node_page_state(pgdat, NR_KERNEL_SCS_KB), @@ -311,6 +303,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z " low:%lukB" " high:%lukB" " reserved_highatomic:%luKB" + " free_highatomic:%luKB" " active_anon:%lukB" " inactive_anon:%lukB" " active_file:%lukB" @@ -332,6 +325,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), K(zone->nr_reserved_highatomic), + K(zone->nr_free_highatomic), K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), @@ -341,7 +335,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z K(zone->present_pages), K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_BOUNCE)), + 0UL, K(free_pcp), K(this_cpu_read(zone->per_cpu_pageset->count)), K(zone_page_state(zone, NR_FREE_CMA_PAGES))); diff --git a/mm/slab.h b/mm/slab.h index 05a21dc796e0..248b34c839b7 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -50,7 +50,7 @@ 
typedef union { /* Reuses the bits in struct page */ struct slab { - unsigned long __page_flags; + unsigned long flags; struct kmem_cache *slab_cache; union { @@ -99,7 +99,7 @@ struct slab { #define SLAB_MATCH(pg, sl) \ static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl)) -SLAB_MATCH(flags, __page_flags); +SLAB_MATCH(flags, flags); SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */ SLAB_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG @@ -167,30 +167,6 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t) */ #define slab_page(s) folio_page(slab_folio(s), 0) -/* - * If network-based swap is enabled, sl*b must keep track of whether pages - * were allocated from pfmemalloc reserves. - */ -static inline bool slab_test_pfmemalloc(const struct slab *slab) -{ - return folio_test_active(slab_folio(slab)); -} - -static inline void slab_set_pfmemalloc(struct slab *slab) -{ - folio_set_active(slab_folio(slab)); -} - -static inline void slab_clear_pfmemalloc(struct slab *slab) -{ - folio_clear_active(slab_folio(slab)); -} - -static inline void __slab_clear_pfmemalloc(struct slab *slab) -{ - __folio_clear_active(slab_folio(slab)); -} - static inline void *slab_address(const struct slab *slab) { return folio_address(slab_folio(slab)); diff --git a/mm/slab_common.c b/mm/slab_common.c index 5be257e03c7c..bfe7c40eeee1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -2,7 +2,7 @@ /* * Slab allocator functions that are independent of the allocator strategy * - * (C) 2012 Christoph Lameter <cl@linux.com> + * (C) 2012 Christoph Lameter <cl@gentwo.org> */ #include <linux/slab.h> diff --git a/mm/slub.c b/mm/slub.c index f3d61b330a76..30003763d224 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -23,6 +23,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kasan.h> +#include <linux/node.h> #include <linux/kmsan.h> #include <linux/cpu.h> #include <linux/cpuset.h> @@ -91,14 +92,14 @@ * The partially empty slabs cached on the CPU partial list are used * for performance reasons, which speeds up the allocation process. * These slabs are not frozen, but are also exempt from list management, - * by clearing the PG_workingset flag when moving out of the node + * by clearing the SL_partial flag when moving out of the node * partial list. Please see __slab_free() for more details. * * To sum up, the current scheme is: - * - node partial slab: PG_Workingset && !frozen - * - cpu partial slab: !PG_Workingset && !frozen - * - cpu slab: !PG_Workingset && frozen - * - full slab: !PG_Workingset && !frozen + * - node partial slab: SL_partial && !frozen + * - cpu partial slab: !SL_partial && !frozen + * - cpu slab: !SL_partial && frozen + * - full slab: !SL_partial && !frozen * * list_lock * @@ -183,6 +184,22 @@ * the fast path and disables lockless freelists. */ +/** + * enum slab_flags - How the slab flags bits are used. + * @SL_locked: Is locked with slab_lock() + * @SL_partial: On the per-node partial list + * @SL_pfmemalloc: Was allocated from PF_MEMALLOC reserves + * + * The slab flags share space with the page flags but some bits have + * different interpretations. The high bits are used for information + * like zone/node/section. 
+ */ +enum slab_flags { + SL_locked = PG_locked, + SL_partial = PG_workingset, /* Historical reasons for this bit */ + SL_pfmemalloc = PG_active, /* Historical reasons for this bit */ +}; + /* * We could simply use migrate_disable()/enable() but as long as it's a * function call even on !PREEMPT_RT, use inline preempt_disable() there. @@ -447,7 +464,7 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) /* * Tracks for which NUMA nodes we have kmem_cache_nodes allocated. - * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily + * Corresponds to node_state[N_MEMORY], but can temporarily * differ during memory hotplug/hotremove operations. * Protected by slab_mutex. */ @@ -635,16 +652,35 @@ static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) #endif /* CONFIG_SLUB_CPU_PARTIAL */ /* + * If network-based swap is enabled, slub must keep track of whether memory + * were allocated from pfmemalloc reserves. + */ +static inline bool slab_test_pfmemalloc(const struct slab *slab) +{ + return test_bit(SL_pfmemalloc, &slab->flags); +} + +static inline void slab_set_pfmemalloc(struct slab *slab) +{ + set_bit(SL_pfmemalloc, &slab->flags); +} + +static inline void __slab_clear_pfmemalloc(struct slab *slab) +{ + __clear_bit(SL_pfmemalloc, &slab->flags); +} + +/* * Per slab locking using the pagelock */ static __always_inline void slab_lock(struct slab *slab) { - bit_spin_lock(PG_locked, &slab->__page_flags); + bit_spin_lock(SL_locked, &slab->flags); } static __always_inline void slab_unlock(struct slab *slab) { - bit_spin_unlock(PG_locked, &slab->__page_flags); + bit_spin_unlock(SL_locked, &slab->flags); } static inline bool @@ -1010,7 +1046,7 @@ static void print_slab_info(const struct slab *slab) { pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n", slab, slab->objects, slab->inuse, slab->freelist, - &slab->__page_flags); + &slab->flags); } void skip_orig_size_check(struct kmem_cache *s, const void *object) @@ -1973,6 +2009,11 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts, #define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ __GFP_ACCOUNT | __GFP_NOFAIL) +static inline void init_slab_obj_exts(struct slab *slab) +{ + slab->obj_exts = 0; +} + int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab) { @@ -2023,8 +2064,7 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, return 0; } -/* Should be called only if mem_alloc_profiling_enabled() */ -static noinline void free_slab_obj_exts(struct slab *slab) +static inline void free_slab_obj_exts(struct slab *slab) { struct slabobj_ext *obj_exts; @@ -2044,20 +2084,12 @@ static noinline void free_slab_obj_exts(struct slab *slab) slab->obj_exts = 0; } -static inline bool need_slab_obj_ext(void) -{ - if (mem_alloc_profiling_enabled()) - return true; +#else /* CONFIG_SLAB_OBJ_EXT */ - /* - * CONFIG_MEMCG creates vector of obj_cgroup objects conditionally - * inside memcg_slab_post_alloc_hook. No other users for now. 
- */ - return false; +static inline void init_slab_obj_exts(struct slab *slab) +{ } -#else /* CONFIG_SLAB_OBJ_EXT */ - static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab) { @@ -2068,11 +2100,6 @@ static inline void free_slab_obj_exts(struct slab *slab) { } -static inline bool need_slab_obj_ext(void) -{ - return false; -} - #endif /* CONFIG_SLAB_OBJ_EXT */ #ifdef CONFIG_MEM_ALLOC_PROFILING @@ -2093,10 +2120,11 @@ prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) slab = virt_to_slab(p); if (!slab_obj_exts(slab) && - WARN(alloc_slab_obj_exts(slab, s, flags, false), - "%s, %s: Failed to create slab extension vector!\n", - __func__, s->name)) + alloc_slab_obj_exts(slab, s, flags, false)) { + pr_warn_once("%s, %s: Failed to create slab extension vector!\n", + __func__, s->name); return NULL; + } return slab_obj_exts(slab) + obj_to_index(s, slab, p); } @@ -2120,7 +2148,7 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) static inline void alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) { - if (need_slab_obj_ext()) + if (mem_alloc_profiling_enabled()) __alloc_tagging_slab_alloc_hook(s, object, flags); } @@ -2592,8 +2620,12 @@ static __always_inline void account_slab(struct slab *slab, int order, static __always_inline void unaccount_slab(struct slab *slab, int order, struct kmem_cache *s) { - if (memcg_kmem_online() || need_slab_obj_ext()) - free_slab_obj_exts(slab); + /* + * The slab object extensions should now be freed regardless of + * whether mem_alloc_profiling_enabled() or not because profiling + * might have been disabled after slab->obj_exts got allocated. + */ + free_slab_obj_exts(slab); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), -(PAGE_SIZE << order)); @@ -2637,6 +2669,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) slab->objects = oo_objects(oo); slab->inuse = 0; slab->frozen = 0; + init_slab_obj_exts(slab); account_slab(slab, oo_order(oo), s, flags); @@ -2720,23 +2753,19 @@ static void discard_slab(struct kmem_cache *s, struct slab *slab) free_slab(s, slab); } -/* - * SLUB reuses PG_workingset bit to keep track of whether it's on - * the per-node partial list. 
- */ static inline bool slab_test_node_partial(const struct slab *slab) { - return folio_test_workingset(slab_folio(slab)); + return test_bit(SL_partial, &slab->flags); } static inline void slab_set_node_partial(struct slab *slab) { - set_bit(PG_workingset, folio_flags(slab_folio(slab), 0)); + set_bit(SL_partial, &slab->flags); } static inline void slab_clear_node_partial(struct slab *slab) { - clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0)); + clear_bit(SL_partial, &slab->flags); } /* @@ -4272,7 +4301,12 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) flags = kmalloc_fix_flags(flags); flags |= __GFP_COMP; - folio = (struct folio *)alloc_pages_node_noprof(node, flags, order); + + if (node == NUMA_NO_NODE) + folio = (struct folio *)alloc_frozen_pages_noprof(flags, order); + else + folio = (struct folio *)__alloc_frozen_pages_noprof(flags, order, node, NULL); + if (folio) { ptr = folio_address(folio); lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, @@ -4768,7 +4802,7 @@ static void free_large_kmalloc(struct folio *folio, void *object) lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); __folio_clear_large_kmalloc(folio); - folio_put(folio); + free_frozen_pages(&folio->page, order); } /* @@ -4933,12 +4967,12 @@ alloc_new: * When slub_debug_orig_size() is off, krealloc() only knows about the bucket * size of an allocation (but not the exact size it was allocated with) and * hence implements the following semantics for shrinking and growing buffers - * with __GFP_ZERO. + * with __GFP_ZERO:: * - * new bucket - * 0 size size - * |--------|----------------| - * | keep | zero | + * new bucket + * 0 size size + * |--------|----------------| + * | keep | zero | * * Otherwise, the original allocation size 'orig_size' could be used to * precisely clear the requested size, and the new size will also be stored @@ -4972,14 +5006,16 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) * We want to attempt a large physically contiguous block first because * it is less likely to fragment multiple larger blocks and therefore * contribute to a long term fragmentation less than vmalloc fallback. - * However make sure that larger requests are not too disruptive - no - * OOM killer and no allocation failure warnings as we have a fallback. + * However make sure that larger requests are not too disruptive - i.e. + * do not direct reclaim unless physically continuous memory is preferred + * (__GFP_RETRY_MAYFAIL mode). We still kick in kswapd/kcompactd to + * start working in the background */ if (size > PAGE_SIZE) { flags |= __GFP_NOWARN; if (!(flags & __GFP_RETRY_MAYFAIL)) - flags |= __GFP_NORETRY; + flags &= ~__GFP_DIRECT_RECLAIM; /* nofail semantic is implemented by the vmalloc fallback */ flags &= ~__GFP_NOFAIL; @@ -6150,7 +6186,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) return __kmem_cache_do_shrink(s); } -static int slab_mem_going_offline_callback(void *arg) +static int slab_mem_going_offline_callback(void) { struct kmem_cache *s; @@ -6164,46 +6200,13 @@ static int slab_mem_going_offline_callback(void *arg) return 0; } -static void slab_mem_offline_callback(void *arg) -{ - struct memory_notify *marg = arg; - int offline_node; - - offline_node = marg->status_change_nid_normal; - - /* - * If the node still has available memory. we need kmem_cache_node - * for it yet. 
- */ - if (offline_node < 0) - return; - - mutex_lock(&slab_mutex); - node_clear(offline_node, slab_nodes); - /* - * We no longer free kmem_cache_node structures here, as it would be - * racy with all get_node() users, and infeasible to protect them with - * slab_mutex. - */ - mutex_unlock(&slab_mutex); -} - -static int slab_mem_going_online_callback(void *arg) +static int slab_mem_going_online_callback(int nid) { struct kmem_cache_node *n; struct kmem_cache *s; - struct memory_notify *marg = arg; - int nid = marg->status_change_nid_normal; int ret = 0; /* - * If the node's memory is already available, then kmem_cache_node is - * already created. Nothing to do. - */ - if (nid < 0) - return 0; - - /* * We are bringing a node online. No memory is available yet. We must * allocate a kmem_cache_node structure in order to bring the node * online. @@ -6242,21 +6245,16 @@ out: static int slab_memory_callback(struct notifier_block *self, unsigned long action, void *arg) { + struct node_notify *nn = arg; + int nid = nn->nid; int ret = 0; switch (action) { - case MEM_GOING_ONLINE: - ret = slab_mem_going_online_callback(arg); - break; - case MEM_GOING_OFFLINE: - ret = slab_mem_going_offline_callback(arg); - break; - case MEM_OFFLINE: - case MEM_CANCEL_ONLINE: - slab_mem_offline_callback(arg); + case NODE_ADDING_FIRST_MEMORY: + ret = slab_mem_going_online_callback(nid); break; - case MEM_ONLINE: - case MEM_CANCEL_OFFLINE: + case NODE_REMOVING_LAST_MEMORY: + ret = slab_mem_going_offline_callback(); break; } if (ret) @@ -6324,14 +6322,14 @@ void __init kmem_cache_init(void) * Initialize the nodemask for which we will allocate per node * structures. Here we don't need taking slab_mutex yet. */ - for_each_node_state(node, N_NORMAL_MEMORY) + for_each_node_state(node, N_MEMORY) node_set(node, slab_nodes); create_boot_cache(kmem_cache_node, "kmem_cache_node", sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); - hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); + hotplug_node_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); /* Able to allocate the per node structures */ slab_state = PARTIAL; diff --git a/mm/swap.c b/mm/swap.c index 77b2d5997873..3632dd061beb 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -237,8 +237,9 @@ void folio_rotate_reclaimable(struct folio *folio) folio_batch_add_and_move(folio, lru_move_tail, true); } -void lru_note_cost(struct lruvec *lruvec, bool file, - unsigned int nr_io, unsigned int nr_rotated) +void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, + unsigned int nr_io, unsigned int nr_rotated) + __releases(lruvec->lru_lock) { unsigned long cost; @@ -250,18 +251,14 @@ void lru_note_cost(struct lruvec *lruvec, bool file, * different between them, adjust scan balance for CPU work. */ cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; + if (!cost) { + spin_unlock_irq(&lruvec->lru_lock); + return; + } - do { + for (;;) { unsigned long lrusize; - /* - * Hold lruvec->lru_lock is safe here, since - * 1) The pinned lruvec in reclaim, or - * 2) From a pre-LRU page during refault (which also holds the - * rcu lock, so would be safe even if the page was on the LRU - * and could move simultaneously to a new lruvec). 
- */ - spin_lock_irq(&lruvec->lru_lock); /* Record cost event */ if (file) lruvec->file_cost += cost; @@ -285,14 +282,22 @@ void lru_note_cost(struct lruvec *lruvec, bool file, lruvec->file_cost /= 2; lruvec->anon_cost /= 2; } + spin_unlock_irq(&lruvec->lru_lock); - } while ((lruvec = parent_lruvec(lruvec))); + lruvec = parent_lruvec(lruvec); + if (!lruvec) + break; + spin_lock_irq(&lruvec->lru_lock); + } } void lru_note_cost_refault(struct folio *folio) { - lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio), - folio_nr_pages(folio), 0); + struct lruvec *lruvec; + + lruvec = folio_lruvec_lock_irq(folio); + lru_note_cost_unlock_irq(lruvec, folio_is_file_lru(folio), + folio_nr_pages(folio), 0); } static void lru_activate(struct lruvec *lruvec, struct folio *folio) @@ -309,7 +314,7 @@ static void lru_activate(struct lruvec *lruvec, struct folio *folio) trace_mm_lru_activate(folio); __count_vm_events(PGACTIVATE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); + count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); } #ifdef CONFIG_SMP @@ -581,7 +586,7 @@ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio) if (active) { __count_vm_events(PGDEACTIVATE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } @@ -599,7 +604,7 @@ static void lru_deactivate(struct lruvec *lruvec, struct folio *folio) lruvec_add_folio(lruvec, folio); __count_vm_events(PGDEACTIVATE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) @@ -625,7 +630,7 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) lruvec_add_folio(lruvec, folio); __count_vm_events(PGLAZYFREE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); + count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); } /* diff --git a/mm/swap.h b/mm/swap.h index 6f4a3f927edb..911ad5ff0f89 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -3,6 +3,8 @@ #define _MM_SWAP_H struct mempolicy; +struct swap_iocb; + extern int page_cluster; #ifdef CONFIG_SWAP @@ -20,8 +22,8 @@ static inline void swap_read_unplug(struct swap_iocb *plug) __swap_read_unplug(plug); } void swap_write_unplug(struct swap_iocb *sio); -int swap_writepage(struct page *page, struct writeback_control *wbc); -void __swap_writepage(struct folio *folio, struct writeback_control *wbc); +int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug); +void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug); /* linux/mm/swap_state.c */ /* One swap address space for each 64M swap space */ @@ -106,6 +108,25 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, return find_next_bit(sis->zeromap, end, start) - start; } +static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) +{ + struct swap_info_struct *si = swp_swap_info(entry); + pgoff_t offset = swp_offset(entry); + int i; + + /* + * While allocating a large folio and doing mTHP swapin, we need to + * ensure all entries are not cached, otherwise, the mTHP folio will + * be in conflict with the folio in swap cache. 
+ */ + for (i = 0; i < max_nr; i++) { + if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) + return i; + } + + return i; +} + #else /* CONFIG_SWAP */ struct swap_iocb; static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) @@ -141,7 +162,8 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, return NULL; } -static inline int swap_writepage(struct page *p, struct writeback_control *wbc) +static inline int swap_writeout(struct folio *folio, + struct swap_iocb **swap_plug) { return 0; } @@ -199,6 +221,28 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, return 0; } +static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) +{ + return 0; +} #endif /* CONFIG_SWAP */ +/** + * folio_index - File index of a folio. + * @folio: The folio. + * + * For a folio which is either in the page cache or the swap cache, + * return its index within the address_space it belongs to. If you know + * the folio is definitely in the page cache, you can look at the folio's + * index directly. + * + * Return: The index (offset in units of pages) of a folio in its file. + */ +static inline pgoff_t folio_index(struct folio *folio) +{ + if (unlikely(folio_test_swapcache(folio))) + return swap_cache_index(folio->swap); + return folio->index; +} + #endif /* _MM_SWAP_H */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 68fd981b514f..c354435a0923 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -30,7 +30,6 @@ * vmscan's shrink_folio_list. */ static const struct address_space_operations swap_aops = { - .writepage = swap_writepage, .dirty_folio = noop_dirty_folio, #ifdef CONFIG_MIGRATION .migrate_folio = migrate_folio, @@ -232,13 +231,11 @@ void free_swap_cache(struct folio *folio) } /* - * Perform a free_page(), also freeing any swap cache associated with - * this page if it is the last user of the page. + * Freeing a folio and also freeing any swap cache associated with + * this folio if it is the last user. */ -void free_page_and_swap_cache(struct page *page) +void free_folio_and_swap_cache(struct folio *folio) { - struct folio *folio = page_folio(page); - free_swap_cache(folio); if (!is_huge_zero_folio(folio)) folio_put(folio); diff --git a/mm/swapfile.c b/mm/swapfile.c index 2eff8b51a945..b4f3cc712580 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -52,9 +52,9 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -static void swap_entry_range_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, unsigned int nr_pages); +static void swap_entries_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages); static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); @@ -192,7 +192,7 @@ static bool swap_is_last_map(struct swap_info_struct *si, unsigned char *map_end = map + nr_pages; unsigned char count = *map; - if (swap_count(count) != 1) + if (swap_count(count) != 1 && swap_count(count) != SWAP_MAP_SHMEM) return false; while (++map < map_end) { @@ -956,9 +956,8 @@ new_cluster: } /* - * We don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them, then - * reread cluster_next_cpu since we dropped si->lock + * We don't have free cluster but have some clusters in discarding, + * do discard now and reclaim them. 
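An illustrative sketch, not part of the diff itself: possible callers of the two new helpers above. The function names example_can_swapin_large() and example_folio_is_at() are assumptions made for the example.

/* A large (mTHP) swapin is only safe if none of the entries is cached. */
static bool example_can_swapin_large(swp_entry_t entry, int nr)
{
	/* non_swapcache_batch() returns nr only if no entry has SWAP_HAS_CACHE. */
	return non_swapcache_batch(entry, nr) == nr;
}

/* folio_index() hides the page-cache vs. swap-cache distinction. */
static bool example_folio_is_at(struct folio *folio, pgoff_t index)
{
	return folio_index(folio) == index;
}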
*/ if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si)) goto new_cluster; @@ -1115,6 +1114,7 @@ static void swap_range_alloc(struct swap_info_struct *si, if (vm_swap_full()) schedule_work(&si->reclaim_work); } + atomic_long_sub(nr_entries, &nr_swap_pages); } static void swap_range_free(struct swap_info_struct *si, unsigned long offset, @@ -1272,13 +1272,22 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp) VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio); - /* - * Should not even be attempting large allocations when huge - * page swap is disabled. Warn and fail the allocation. - */ - if (order && (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER)) { - VM_WARN_ON_ONCE(1); - return -EINVAL; + if (order) { + /* + * Reject large allocation when THP_SWAP is disabled, + * the caller should split the folio and try again. + */ + if (!IS_ENABLED(CONFIG_THP_SWAP)) + return -EAGAIN; + + /* + * Allocation size should never exceed cluster size + * (HPAGE_PMD_SIZE). + */ + if (size > SWAPFILE_CLUSTER) { + VM_WARN_ON_ONCE(1); + return -EINVAL; + } } local_lock(&percpu_swap_cluster.lock); @@ -1304,7 +1313,6 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp) if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL)) goto out_free; - atomic_long_sub(size, &nr_swap_pages); return 0; out_free: @@ -1346,10 +1354,12 @@ out: return NULL; } -static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, - unsigned long offset, - unsigned char usage) +static unsigned char swap_entry_put_locked(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, + unsigned char usage) { + unsigned long offset = swp_offset(entry); unsigned char count; unsigned char has_cache; @@ -1381,7 +1391,7 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, if (usage) WRITE_ONCE(si->swap_map[offset], usage); else - WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE); + swap_entries_free(si, ci, entry, 1); return usage; } @@ -1452,71 +1462,104 @@ put_out: return NULL; } -static unsigned char __swap_entry_free(struct swap_info_struct *si, - swp_entry_t entry) +static void swap_entries_put_cache(struct swap_info_struct *si, + swp_entry_t entry, int nr) { - struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); - unsigned char usage; + struct swap_cluster_info *ci; ci = lock_cluster(si, offset); - usage = __swap_entry_free_locked(si, offset, 1); - if (!usage) - swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); + if (swap_only_has_cache(si, offset, nr)) + swap_entries_free(si, ci, entry, nr); + else { + for (int i = 0; i < nr; i++, entry.val++) + swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); + } unlock_cluster(ci); - - return usage; } -static bool __swap_entries_free(struct swap_info_struct *si, - swp_entry_t entry, int nr) +static bool swap_entries_put_map(struct swap_info_struct *si, + swp_entry_t entry, int nr) { unsigned long offset = swp_offset(entry); - unsigned int type = swp_type(entry); struct swap_cluster_info *ci; bool has_cache = false; unsigned char count; int i; - if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1) + if (nr <= 1) goto fallback; - /* cross into another cluster */ - if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) + count = swap_count(data_race(si->swap_map[offset])); + if (count != 1 && count != SWAP_MAP_SHMEM) goto fallback; ci = lock_cluster(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) 
{ - unlock_cluster(ci); - goto fallback; + goto locked_fallback; } - for (i = 0; i < nr; i++) - WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); if (!has_cache) - swap_entry_range_free(si, ci, entry, nr); + swap_entries_free(si, ci, entry, nr); + else + for (i = 0; i < nr; i++) + WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); unlock_cluster(ci); return has_cache; fallback: - for (i = 0; i < nr; i++) { - if (data_race(si->swap_map[offset + i])) { - count = __swap_entry_free(si, swp_entry(type, offset + i)); - if (count == SWAP_HAS_CACHE) - has_cache = true; - } else { - WARN_ON_ONCE(1); - } + ci = lock_cluster(si, offset); +locked_fallback: + for (i = 0; i < nr; i++, entry.val++) { + count = swap_entry_put_locked(si, ci, entry, 1); + if (count == SWAP_HAS_CACHE) + has_cache = true; } + unlock_cluster(ci); return has_cache; + } /* - * Drop the last HAS_CACHE flag of swap entries, caller have to - * ensure all entries belong to the same cgroup. + * Only functions with "_nr" suffix are able to free entries spanning + * across multiple clusters, so ensure the range is within a single cluster + * when freeing entries with functions without "_nr" suffix. */ -static void swap_entry_range_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, unsigned int nr_pages) +static bool swap_entries_put_map_nr(struct swap_info_struct *si, + swp_entry_t entry, int nr) +{ + int cluster_nr, cluster_rest; + unsigned long offset = swp_offset(entry); + bool has_cache = false; + + cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER; + while (nr) { + cluster_nr = min(nr, cluster_rest); + has_cache |= swap_entries_put_map(si, entry, cluster_nr); + cluster_rest = SWAPFILE_CLUSTER; + nr -= cluster_nr; + entry.val += cluster_nr; + } + + return has_cache; +} + +/* + * Check if it's the last ref of a swap entry in the freeing path. + * Qualified values include 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM. + */ +static inline bool __maybe_unused swap_is_last_ref(unsigned char count) +{ + return (count == SWAP_HAS_CACHE) || (count == 1) || + (count == SWAP_MAP_SHMEM); +} + +/* + * Drop the last ref of swap entries; the caller has to ensure all entries + * belong to the same cgroup and cluster. + */ +static void swap_entries_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages) { unsigned long offset = swp_offset(entry); unsigned char *map = si->swap_map + offset; @@ -1529,7 +1572,7 @@ static void swap_entry_range_free(struct swap_info_struct *si, ci->count -= nr_pages; do { - VM_BUG_ON(*map != SWAP_HAS_CACHE); + VM_BUG_ON(!swap_is_last_ref(*map)); *map = 0; } while (++map < map_end); @@ -1542,21 +1585,6 @@ static void swap_entry_range_free(struct swap_info_struct *si, partial_free_cluster(si, ci); } -static void cluster_swap_free_nr(struct swap_info_struct *si, - unsigned long offset, int nr_pages, - unsigned char usage) -{ - struct swap_cluster_info *ci; - unsigned long end = offset + nr_pages; - - ci = lock_cluster(si, offset); - do { - if (!__swap_entry_free_locked(si, offset, usage)) - swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); - } while (++offset < end); - unlock_cluster(ci); -} - /* * Caller has made sure that the swap device corresponding to entry * is still around or has not been recycled.
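An illustrative sketch, not part of the diff itself: the per-cluster chunking that swap_entries_put_map_nr() above relies on, shown in isolation. The name example_for_each_cluster_chunk() is an assumption.

/*
 * Walk [offset, offset + nr) in chunks that never cross a cluster
 * boundary: the first chunk is capped at the distance to the next
 * boundary, every later chunk at SWAPFILE_CLUSTER.
 */
static void example_for_each_cluster_chunk(unsigned long offset, int nr)
{
	int chunk, rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER;

	while (nr) {
		chunk = min(nr, rest);
		/* ... operate on [offset, offset + chunk) here ... */
		offset += chunk;
		nr -= chunk;
		rest = SWAPFILE_CLUSTER;
	}
}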
@@ -1573,7 +1601,7 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) while (nr_pages) { nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - cluster_swap_free_nr(sis, offset, nr, 1); + swap_entries_put_map(sis, swp_entry(sis->type, offset), nr); offset += nr; nr_pages -= nr; } @@ -1584,8 +1612,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) */ void put_swap_folio(struct folio *folio, swp_entry_t entry) { - unsigned long offset = swp_offset(entry); - struct swap_cluster_info *ci; struct swap_info_struct *si; int size = 1 << swap_entry_order(folio_order(folio)); @@ -1593,16 +1619,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) if (!si) return; - ci = lock_cluster(si, offset); - if (swap_only_has_cache(si, offset, size)) - swap_entry_range_free(si, ci, entry, size); - else { - for (int i = 0; i < size; i++, entry.val++) { - if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) - swap_entry_range_free(si, ci, entry, 1); - } - } - unlock_cluster(ci); + swap_entries_put_cache(si, entry, size); } int __swap_count(swp_entry_t entry) @@ -1797,7 +1814,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) /* * First free all entries in the range. */ - any_only_cache = __swap_entries_free(si, entry, nr); + any_only_cache = swap_entries_put_map_nr(si, entry, nr); /* * Short-circuit the below loop if none of the entries had their @@ -1807,13 +1824,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) goto out; /* - * Now go back over the range trying to reclaim the swap cache. This is - * more efficient for large folios because we will only try to reclaim - * the swap once per folio in the common case. If we do - * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the - * latter will get a reference and lock the folio for every individual - * page but will only succeed once the swap slot for every subpage is - * zero. + * Now go back over the range trying to reclaim the swap cache. */ for (offset = start_offset; offset < end_offset; offset += nr) { nr = 1; @@ -2359,7 +2370,7 @@ retry: * Limit the number of retries? No: when mmget_not_zero() * above fails, that mm is likely to be freeing swap from * exit_mmap(), which proceeds at its own independent pace; - * and even shmem_writepage() could have been preempted after + * and even shmem_writeout() could have been preempted after * folio_alloc_swap(), temporarily hiding that swap. It's easy * and robust (though cpu-intensive) just to keep retrying. 
*/ @@ -3129,43 +3140,30 @@ static unsigned long read_swap_header(struct swap_info_struct *si, return maxpages; } -static int setup_swap_map_and_extents(struct swap_info_struct *si, - union swap_header *swap_header, - unsigned char *swap_map, - unsigned long maxpages, - sector_t *span) +static int setup_swap_map(struct swap_info_struct *si, + union swap_header *swap_header, + unsigned char *swap_map, + unsigned long maxpages) { - unsigned int nr_good_pages; unsigned long i; - int nr_extents; - - nr_good_pages = maxpages - 1; /* omit header page */ + swap_map[0] = SWAP_MAP_BAD; /* omit header page */ for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; if (page_nr == 0 || page_nr > swap_header->info.last_page) return -EINVAL; if (page_nr < maxpages) { swap_map[page_nr] = SWAP_MAP_BAD; - nr_good_pages--; + si->pages--; } } - if (nr_good_pages) { - swap_map[0] = SWAP_MAP_BAD; - si->max = maxpages; - si->pages = nr_good_pages; - nr_extents = setup_swap_extents(si, span); - if (nr_extents < 0) - return nr_extents; - nr_good_pages = si->pages; - } - if (!nr_good_pages) { + if (!si->pages) { pr_warn("Empty swap-file\n"); return -EINVAL; } - return nr_extents; + return 0; } #define SWAP_CLUSTER_INFO_COLS \ @@ -3205,13 +3203,17 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, * Mark unusable pages as unavailable. The clusters aren't * marked free yet, so no list operations are involved yet. * - * See setup_swap_map_and_extents(): header page, bad pages, + * See setup_swap_map(): header page, bad pages, * and the EOF part of the last cluster. */ inc_cluster_info_page(si, cluster_info, 0); - for (i = 0; i < swap_header->info.nr_badpages; i++) - inc_cluster_info_page(si, cluster_info, - swap_header->info.badpages[i]); + for (i = 0; i < swap_header->info.nr_badpages; i++) { + unsigned int page_nr = swap_header->info.badpages[i]; + + if (page_nr >= maxpages) + continue; + inc_cluster_info_page(si, cluster_info, page_nr); + } for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) inc_cluster_info_page(si, cluster_info, i); @@ -3323,6 +3325,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } /* + * The swap subsystem needs a major overhaul to support this. + * It doesn't work yet so just disable it for now. + */ + if (mapping_min_folio_order(mapping) > 0) { + error = -EINVAL; + goto bad_swap_unlock_inode; + } + + /* * Read the swap header. 
*/ if (!mapping->a_ops->read_folio) { @@ -3342,6 +3353,21 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap_unlock_inode; } + si->max = maxpages; + si->pages = maxpages - 1; + nr_extents = setup_swap_extents(si, &span); + if (nr_extents < 0) { + error = nr_extents; + goto bad_swap_unlock_inode; + } + if (si->pages != si->max - 1) { + pr_err("swap:%u != (max:%u - 1)\n", si->pages, si->max); + error = -EINVAL; + goto bad_swap_unlock_inode; + } + + maxpages = si->max; + /* OK, set up the swap map and apply the bad block list */ swap_map = vzalloc(maxpages); if (!swap_map) { @@ -3353,12 +3379,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (error) goto bad_swap_unlock_inode; - nr_extents = setup_swap_map_and_extents(si, swap_header, swap_map, - maxpages, &span); - if (unlikely(nr_extents < 0)) { - error = nr_extents; + error = setup_swap_map(si, swap_header, swap_map, maxpages); + if (error) goto bad_swap_unlock_inode; - } /* * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might @@ -3636,11 +3659,13 @@ int swapcache_prepare(swp_entry_t entry, int nr) return __swap_duplicate(entry, SWAP_HAS_CACHE, nr); } +/* + * Caller should ensure entries belong to the same folio so + * the entries won't span cross cluster boundary. + */ void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) { - unsigned long offset = swp_offset(entry); - - cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE); + swap_entries_put_cache(si, entry, nr); } struct swap_info_struct *swp_swap_info(swp_entry_t entry) @@ -3649,21 +3674,6 @@ struct swap_info_struct *swp_swap_info(swp_entry_t entry) } /* - * out-of-line methods to avoid include hell. - */ -struct address_space *swapcache_mapping(struct folio *folio) -{ - return swp_swap_info(folio->swap)->swap_file->f_mapping; -} -EXPORT_SYMBOL_GPL(swapcache_mapping); - -pgoff_t __folio_swap_cache_index(struct folio *folio) -{ - return swap_cache_index(folio->swap); -} -EXPORT_SYMBOL_GPL(__folio_swap_cache_index); - -/* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's * page of the original vmalloc'ed swap_map, to hold the continuation count @@ -3780,7 +3790,7 @@ outer: * into, carry if so, or else fail until a new continuation page is allocated; * when the original swap_map count is decremented from 0 with continuation, * borrow from the continuation and report whether it still holds more. - * Called while __swap_duplicate() or caller of __swap_entry_free_locked() + * Called while __swap_duplicate() or caller of swap_entry_put_locked() * holds cluster lock. 
*/ static bool swap_count_continued(struct swap_info_struct *si, diff --git a/mm/truncate.c b/mm/truncate.c index 5d98054094d1..91eb92a5ce4f 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -191,6 +191,7 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio) bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) { loff_t pos = folio_pos(folio); + size_t size = folio_size(folio); unsigned int offset, length; struct page *split_at, *split_at2; @@ -198,14 +199,13 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) offset = start - pos; else offset = 0; - length = folio_size(folio); - if (pos + length <= (u64)end) - length = length - offset; + if (pos + size <= (u64)end) + length = size - offset; else length = end + 1 - pos - offset; folio_wait_writeback(folio); - if (length == folio_size(folio)) { + if (length == size) { truncate_inode_folio(folio->mapping, folio); return true; } @@ -224,16 +224,20 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) return true; split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE); - split_at2 = folio_page(folio, - PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE); - if (!try_folio_split(folio, split_at, NULL)) { /* * try to split at offset + length to make sure folios within * the range can be dropped, especially to avoid memory waste * for shmem truncate */ - struct folio *folio2 = page_folio(split_at2); + struct folio *folio2; + + if (offset + length == size) + goto no_split; + + split_at2 = folio_page(folio, + PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE); + folio2 = page_folio(split_at2); if (!folio_try_get(folio2)) goto no_split; @@ -421,7 +425,7 @@ void truncate_inode_pages_range(struct address_space *mapping, for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; - /* We rely upon deletion not changing page->index */ + /* We rely upon deletion not changing folio->index */ if (xa_is_value(folio)) continue; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index fbf2cf62ab9f..cbed91b09640 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -561,7 +561,7 @@ retry: } while (src_addr < src_start + len) { - BUG_ON(dst_addr >= dst_start + len); + VM_WARN_ON_ONCE(dst_addr >= dst_start + len); /* * Serialize via vma_lock and hugetlb_fault_mutex. @@ -602,7 +602,7 @@ retry: if (unlikely(err == -ENOENT)) { up_read(&ctx->map_changing_lock); uffd_mfill_unlock(dst_vma); - BUG_ON(!folio); + VM_WARN_ON_ONCE(!folio); err = copy_folio_from_user(folio, (const void __user *)src_addr, true); @@ -614,7 +614,7 @@ retry: dst_vma = NULL; goto retry; } else - BUG_ON(folio); + VM_WARN_ON_ONCE(folio); if (!err) { dst_addr += vma_hpagesize; @@ -635,9 +635,9 @@ out_unlock_vma: out: if (folio) folio_put(folio); - BUG_ON(copied < 0); - BUG_ON(err > 0); - BUG_ON(!copied && !err); + VM_WARN_ON_ONCE(copied < 0); + VM_WARN_ON_ONCE(err > 0); + VM_WARN_ON_ONCE(!copied && !err); return copied ? copied : err; } #else /* !CONFIG_HUGETLB_PAGE */ @@ -711,12 +711,12 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, /* * Sanitize the command parameters: */ - BUG_ON(dst_start & ~PAGE_MASK); - BUG_ON(len & ~PAGE_MASK); + VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK); + VM_WARN_ON_ONCE(len & ~PAGE_MASK); /* Does the address range wrap, or is the span zero-sized? 
*/ - BUG_ON(src_start + len <= src_start); - BUG_ON(dst_start + len <= dst_start); + VM_WARN_ON_ONCE(src_start + len <= src_start); + VM_WARN_ON_ONCE(dst_start + len <= dst_start); src_addr = src_start; dst_addr = dst_start; @@ -775,7 +775,7 @@ retry: while (src_addr < src_start + len) { pmd_t dst_pmdval; - BUG_ON(dst_addr >= dst_start + len); + VM_WARN_ON_ONCE(dst_addr >= dst_start + len); dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); if (unlikely(!dst_pmd)) { @@ -795,8 +795,8 @@ retry: * (This includes the case where the PMD used to be THP and * changed back to none after __pte_alloc().) */ - if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) || - pmd_devmap(dst_pmdval))) { + if (unlikely(!pmd_present(dst_pmdval) || + pmd_trans_huge(dst_pmdval))) { err = -EEXIST; break; } @@ -818,7 +818,7 @@ retry: up_read(&ctx->map_changing_lock); uffd_mfill_unlock(dst_vma); - BUG_ON(!folio); + VM_WARN_ON_ONCE(!folio); kaddr = kmap_local_folio(folio, 0); err = copy_from_user(kaddr, @@ -832,7 +832,7 @@ retry: flush_dcache_folio(folio); goto retry; } else - BUG_ON(folio); + VM_WARN_ON_ONCE(folio); if (!err) { dst_addr += PAGE_SIZE; @@ -852,9 +852,9 @@ out_unlock: out: if (folio) folio_put(folio); - BUG_ON(copied < 0); - BUG_ON(err > 0); - BUG_ON(!copied && !err); + VM_WARN_ON_ONCE(copied < 0); + VM_WARN_ON_ONCE(err > 0); + VM_WARN_ON_ONCE(!copied && !err); return copied ? copied : err; } @@ -940,11 +940,11 @@ int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, /* * Sanitize the command parameters: */ - BUG_ON(start & ~PAGE_MASK); - BUG_ON(len & ~PAGE_MASK); + VM_WARN_ON_ONCE(start & ~PAGE_MASK); + VM_WARN_ON_ONCE(len & ~PAGE_MASK); /* Does the address range wrap, or is the span zero-sized? */ - BUG_ON(start + len <= start); + VM_WARN_ON_ONCE(start + len <= start); mmap_read_lock(dst_mm); @@ -1063,9 +1063,14 @@ static int move_present_pte(struct mm_struct *mm, folio_move_anon_rmap(src_folio, dst_vma); src_folio->index = linear_page_index(dst_vma, dst_addr); - orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot); - /* Follow mremap() behavior and treat the entry dirty after the move */ - orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma); + orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); + /* Set soft dirty bit so userspace can notice the pte was moved */ +#ifdef CONFIG_MEM_SOFT_DIRTY + orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); +#endif + if (pte_dirty(orig_src_pte)) + orig_dst_pte = pte_mkdirty(orig_dst_pte); + orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); out: @@ -1079,8 +1084,18 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, pte_t orig_dst_pte, pte_t orig_src_pte, pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl, - struct folio *src_folio) + struct folio *src_folio, + struct swap_info_struct *si, swp_entry_t entry) { + /* + * Check if the folio still belongs to the target swap entry after + * acquiring the lock. Folio can be freed in the swap cache while + * not locked. 
+ */ + if (src_folio && unlikely(!folio_test_swapcache(src_folio) || + entry.val != src_folio->swap.val)) + return -EAGAIN; + double_pt_lock(dst_ptl, src_ptl); if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, @@ -1097,9 +1112,31 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, if (src_folio) { folio_move_anon_rmap(src_folio, dst_vma); src_folio->index = linear_page_index(dst_vma, dst_addr); + } else { + /* + * Check if the swap entry is cached after acquiring the src_pte + * lock. Otherwise, we might miss a newly loaded swap cache folio. + * + * Check swap_map directly to minimize overhead, READ_ONCE is sufficient. + * We are trying to catch newly added swap cache, the only possible case is + * when a folio is swapped in and out again staying in swap cache, using the + * same entry before the PTE check above. The PTL is acquired and released + * twice, each time after updating the swap_map's flag. So holding + * the PTL here ensures we see the updated value. False positive is possible, + * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the + * cache, or during the tiny synchronization window between swap cache and + * swap_map, but it will be gone very quickly, worst result is retry jitters. + */ + if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) { + double_pt_unlock(dst_ptl, src_ptl); + return -EAGAIN; + } } orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); +#ifdef CONFIG_MEM_SOFT_DIRTY + orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte); +#endif set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); double_pt_unlock(dst_ptl, src_ptl); @@ -1404,7 +1441,7 @@ retry: } err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, - dst_ptl, src_ptl, src_folio); + dst_ptl, src_ptl, src_folio, si, entry); } out: @@ -1701,15 +1738,13 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, ssize_t moved = 0; /* Sanitize the command parameters. */ - if (WARN_ON_ONCE(src_start & ~PAGE_MASK) || - WARN_ON_ONCE(dst_start & ~PAGE_MASK) || - WARN_ON_ONCE(len & ~PAGE_MASK)) - goto out; + VM_WARN_ON_ONCE(src_start & ~PAGE_MASK); + VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK); + VM_WARN_ON_ONCE(len & ~PAGE_MASK); /* Does the address range wrap, or is the span zero-sized? */ - if (WARN_ON_ONCE(src_start + len <= src_start) || - WARN_ON_ONCE(dst_start + len <= dst_start)) - goto out; + VM_WARN_ON_ONCE(src_start + len < src_start); + VM_WARN_ON_ONCE(dst_start + len < dst_start); err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma); if (err) @@ -1783,12 +1818,6 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, ptl = pmd_trans_huge_lock(src_pmd, src_vma); if (ptl) { - if (pmd_devmap(*src_pmd)) { - spin_unlock(ptl); - err = -ENOENT; - break; - } - /* Check if we can move the pmd without splitting it. */ if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) || !pmd_none(dst_pmdval)) { @@ -1859,18 +1888,18 @@ out_unlock: up_read(&ctx->map_changing_lock); uffd_move_unlock(dst_vma, src_vma); out: - VM_WARN_ON(moved < 0); - VM_WARN_ON(err > 0); - VM_WARN_ON(!moved && !err); + VM_WARN_ON_ONCE(moved < 0); + VM_WARN_ON_ONCE(err > 0); + VM_WARN_ON_ONCE(!moved && !err); return moved ? 
moved : err; } static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, - vm_flags_t flags) + vm_flags_t vm_flags) { - const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; + const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP; - vm_flags_reset(vma, flags); + vm_flags_reset(vma, vm_flags); /* * For shared mappings, we want to enable writenotify while * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply @@ -1882,12 +1911,12 @@ static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, static void userfaultfd_set_ctx(struct vm_area_struct *vma, struct userfaultfd_ctx *ctx, - unsigned long flags) + vm_flags_t vm_flags) { vma_start_write(vma); vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx}; userfaultfd_set_vm_flags(vma, - (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags); + (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags); } void userfaultfd_reset_ctx(struct vm_area_struct *vma) @@ -1902,6 +1931,14 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, unsigned long end) { struct vm_area_struct *ret; + bool give_up_on_oom = false; + + /* + * If we are modifying only and not splitting, just give up on the merge + * if OOM prevents us from merging successfully. + */ + if (start == vma->vm_start && end == vma->vm_end) + give_up_on_oom = true; /* Reset ptes for the whole vma range if wr-protected */ if (userfaultfd_wp(vma)) @@ -1909,7 +1946,7 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, vma->vm_flags & ~__VM_UFFD_FLAGS, - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, give_up_on_oom); /* * In the vma_merge() successful mprotect-like case 8: @@ -1925,14 +1962,14 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, /* Assumes mmap write lock taken, and mm_struct pinned. 
*/ int userfaultfd_register_range(struct userfaultfd_ctx *ctx, struct vm_area_struct *vma, - unsigned long vm_flags, + vm_flags_t vm_flags, unsigned long start, unsigned long end, bool wp_async) { VMA_ITERATOR(vmi, ctx->mm, start); struct vm_area_struct *prev = vma_prev(&vmi); unsigned long vma_end; - unsigned long new_flags; + vm_flags_t new_flags; if (vma->vm_start < start) prev = vma; @@ -1940,10 +1977,10 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx, for_each_vma_range(vmi, vma, end) { cond_resched(); - BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); - BUG_ON(vma->vm_userfaultfd_ctx.ctx && - vma->vm_userfaultfd_ctx.ctx != ctx); - WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); + VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async)); + VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx && + vma->vm_userfaultfd_ctx.ctx != ctx); + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)); /* * Nothing to do: this vma is already registered into this @@ -1960,7 +1997,8 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx, new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, new_flags, - (struct vm_userfaultfd_ctx){ctx}); + (struct vm_userfaultfd_ctx){ctx}, + /* give_up_on_oom = */false); if (IS_ERR(vma)) return PTR_ERR(vma); @@ -2018,8 +2056,8 @@ void userfaultfd_release_all(struct mm_struct *mm, prev = NULL; for_each_vma(vmi, vma) { cond_resched(); - BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ - !!(vma->vm_flags & __VM_UFFD_FLAGS)); + VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^ + !!(vma->vm_flags & __VM_UFFD_FLAGS)); if (vma->vm_userfaultfd_ctx.ctx != ctx) { prev = vma; continue; diff --git a/mm/util.c b/mm/util.c index 448117da071f..f814e6a59ab1 100644 --- a/mm/util.c +++ b/mm/util.c @@ -25,6 +25,7 @@ #include <linux/sizes.h> #include <linux/compat.h> #include <linux/fsnotify.h> +#include <linux/page_idle.h> #include <linux/uaccess.h> @@ -670,9 +671,9 @@ struct anon_vma *folio_anon_vma(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; - if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) + if ((mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) return NULL; - return (void *)(mapping - PAGE_MAPPING_ANON); + return (void *)(mapping - FOLIO_MAPPING_ANON); } /** @@ -699,7 +700,7 @@ struct address_space *folio_mapping(struct folio *folio) return swap_address_space(folio->swap); mapping = folio->mapping; - if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) + if ((unsigned long)mapping & FOLIO_MAPPING_FLAGS) return NULL; return mapping; @@ -1131,3 +1132,152 @@ void flush_dcache_folio(struct folio *folio) } EXPORT_SYMBOL(flush_dcache_folio); #endif + +/** + * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an + * existing VMA + * @file: The file which possesses an f_op->mmap_prepare() hook + * @vma: The VMA to apply the .mmap_prepare() hook to. + * + * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain + * 'wrapper' file systems invoke a nested mmap hook of an underlying file. + * + * Until all filesystems are converted to use .mmap_prepare(), we must be + * conservative and continue to invoke these 'wrapper' filesystems using the + * deprecated .mmap() hook. + * + * However, we have a problem if the underlying file system possesses an + * .mmap_prepare() hook, as we are in a different context when we invoke the + * .mmap() hook, already having a VMA to deal with.
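An illustrative sketch, not part of the diff itself: how a 'wrapper' filesystem's legacy .mmap() hook might forward to an underlying file as described above. example_backing_file() is a hypothetical accessor for the stacked file.

static int example_wrapper_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct file *backing = example_backing_file(file);	/* hypothetical */

	/* Underlying fs already converted: apply its hook to the existing VMA. */
	if (backing->f_op->mmap_prepare)
		return compat_vma_mmap_prepare(backing, vma);

	/* Otherwise keep using the deprecated .mmap() path. */
	return call_mmap(backing, vma);
}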
+ * + * compat_vma_mmap_prepare() is a compatibility function that takes VMA state, + * establishes a struct vm_area_desc descriptor, passes to the underlying + * .mmap_prepare() hook and applies any changes performed by it. + * + * Once the conversion of filesystems is complete this function will no longer + * be required and will be removed. + * + * Returns: 0 on success or error. + */ +int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) +{ + struct vm_area_desc desc; + int err; + + err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc)); + if (err) + return err; + set_vma_from_desc(vma, &desc); + + return 0; +} +EXPORT_SYMBOL(compat_vma_mmap_prepare); + +static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, + const struct page *page) +{ + /* + * Only the first page of a high-order buddy page has PageBuddy() set. + * So we have to check manually whether this page is part of a high- + * order buddy page. + */ + if (PageBuddy(page)) + ps->flags |= PAGE_SNAPSHOT_PG_BUDDY; + else if (page_count(page) == 0 && is_free_buddy_page(page)) + ps->flags |= PAGE_SNAPSHOT_PG_BUDDY; + + if (folio_test_idle(folio)) + ps->flags |= PAGE_SNAPSHOT_PG_IDLE; +} + +/** + * snapshot_page() - Create a snapshot of a struct page + * @ps: Pointer to a struct page_snapshot to store the page snapshot + * @page: The page to snapshot + * + * Create a snapshot of the page and store both its struct page and struct + * folio representations in @ps. + * + * A snapshot is marked as "faithful" if the compound state of @page was + * stable and allowed safe reconstruction of the folio representation. In + * rare cases where this is not possible (e.g. due to folio splitting), + * snapshot_page() falls back to treating @page as a single page and the + * snapshot is marked as "unfaithful". The snapshot_page_is_faithful() + * helper can be used to check for this condition. + */ +void snapshot_page(struct page_snapshot *ps, const struct page *page) +{ + unsigned long head, nr_pages = 1; + struct folio *foliop; + int loops = 5; + + ps->pfn = page_to_pfn(page); + ps->flags = PAGE_SNAPSHOT_FAITHFUL; + +again: + memset(&ps->folio_snapshot, 0, sizeof(struct folio)); + memcpy(&ps->page_snapshot, page, sizeof(*page)); + head = ps->page_snapshot.compound_head; + if ((head & 1) == 0) { + ps->idx = 0; + foliop = (struct folio *)&ps->page_snapshot; + if (!folio_test_large(foliop)) { + set_ps_flags(ps, page_folio(page), page); + memcpy(&ps->folio_snapshot, foliop, + sizeof(struct page)); + return; + } + foliop = (struct folio *)page; + } else { + foliop = (struct folio *)(head - 1); + ps->idx = folio_page_idx(foliop, page); + } + + if (ps->idx < MAX_FOLIO_NR_PAGES) { + memcpy(&ps->folio_snapshot, foliop, 2 * sizeof(struct page)); + nr_pages = folio_nr_pages(&ps->folio_snapshot); + if (nr_pages > 1) + memcpy(&ps->folio_snapshot.__page_2, &foliop->__page_2, + sizeof(struct page)); + set_ps_flags(ps, foliop, page); + } + + if (ps->idx > nr_pages) { + if (loops-- > 0) + goto again; + clear_compound_head(&ps->page_snapshot); + foliop = (struct folio *)&ps->page_snapshot; + memcpy(&ps->folio_snapshot, foliop, sizeof(struct page)); + ps->flags = 0; + ps->idx = 0; + } +} + +#ifdef CONFIG_MMU +/** + * folio_pte_batch - detect a PTE batch for a large folio + * @folio: The large folio to detect a PTE batch for. + * @ptep: Page table pointer for the first entry. + * @pte: Page table entry for the first page. + * @max_nr: The maximum number of table entries to consider. 
+ * + * This is a simplified variant of folio_pte_batch_flags(). + * + * Detect a PTE batch: consecutive (present) PTEs that map consecutive + * pages of the same large folio in a single VMA and a single page table. + * + * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, + * the accessed bit, writable bit, dirt-bit and soft-dirty bit. + * + * ptep must map any page of the folio. max_nr must be at least one and + * must be limited by the caller so scanning cannot exceed a single VMA and + * a single page table. + * + * Return: the number of table entries in the batch. + */ +unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, + unsigned int max_nr) +{ + return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0); +} +#endif /* CONFIG_MMU */ @@ -15,11 +15,15 @@ struct mmap_state { unsigned long end; pgoff_t pgoff; unsigned long pglen; - unsigned long flags; + vm_flags_t vm_flags; struct file *file; + pgprot_t page_prot; + + /* User-defined fields, perhaps updated by .mmap_prepare(). */ + const struct vm_operations_struct *vm_ops; + void *vm_private_data; unsigned long charged; - bool retry_merge; struct vm_area_struct *prev; struct vm_area_struct *next; @@ -28,9 +32,12 @@ struct mmap_state { struct vma_munmap_struct vms; struct ma_state mas_detach; struct maple_tree mt_detach; + + /* Determine if we can check KSM flags early in mmap() logic. */ + bool check_ksm_early; }; -#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_) \ +#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \ struct mmap_state name = { \ .mm = mm_, \ .vmi = vmi_, \ @@ -38,8 +45,9 @@ struct mmap_state { .end = (addr_) + (len_), \ .pgoff = pgoff_, \ .pglen = PHYS_PFN(len_), \ - .flags = flags_, \ + .vm_flags = vm_flags_, \ .file = file_, \ + .page_prot = vm_get_page_prot(vm_flags_), \ } #define VMG_MMAP_STATE(name, map_, vma_) \ @@ -48,7 +56,7 @@ struct mmap_state { .vmi = (map_)->vmi, \ .start = (map_)->addr, \ .end = (map_)->end, \ - .flags = (map_)->flags, \ + .vm_flags = (map_)->vm_flags, \ .pgoff = (map_)->pgoff, \ .file = (map_)->file, \ .prev = (map_)->prev, \ @@ -57,6 +65,22 @@ struct mmap_state { .state = VMA_MERGE_START, \ } +/* + * If, at any point, the VMA had unCoW'd mappings from parents, it will maintain + * more than one anon_vma_chain connecting it to more than one anon_vma. A merge + * would mean a wider range of folios sharing the root anon_vma lock, and thus + * potential lock contention, we do not wish to encourage merging such that this + * scales to a problem. + */ +static bool vma_had_uncowed_parents(struct vm_area_struct *vma) +{ + /* + * The list_is_singular() test is to avoid merging VMA cloned from + * parents. This can improve scalability caused by anon_vma lock. + */ + return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain); +} + static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; @@ -71,7 +95,7 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex * the kernel to generate new VMAs when old one could be * extended instead. 
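An illustrative sketch, not part of the diff itself: the usual caller pattern for the folio_pte_batch() wrapper above, with example_walk_folio_ptes() as an assumed name. The caller is expected to have clamped max_nr to the end of the VMA and page table.

static unsigned int example_walk_folio_ptes(struct folio *folio, pte_t *ptep,
					    pte_t ptent, unsigned int max_nr)
{
	/* All batched PTEs map consecutive pages of 'folio'. */
	unsigned int nr = folio_pte_batch(folio, ptep, ptent, max_nr);

	/* ... process 'nr' pages in one step rather than one at a time ... */
	return nr;
}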
*/ - if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY) + if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_SOFTDIRTY) return false; if (vma->vm_file != vmg->file) return false; @@ -82,24 +106,28 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex return true; } -static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, - struct anon_vma *anon_vma2, struct vm_area_struct *vma) +static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next) { + struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev; + struct vm_area_struct *src = vmg->middle; /* existing merge case. */ + struct anon_vma *tgt_anon = tgt->anon_vma; + struct anon_vma *src_anon = vmg->anon_vma; + /* - * The list_is_singular() test is to avoid merging VMA cloned from - * parents. This can improve scalability caused by anon_vma lock. + * We _can_ have !src, vmg->anon_vma via copy_vma(). In this instance we + * will remove the existing VMA's anon_vma's so there are no scalability + * concerns. */ - if ((!anon_vma1 || !anon_vma2) && (!vma || - list_is_singular(&vma->anon_vma_chain))) - return true; - return anon_vma1 == anon_vma2; -} + VM_WARN_ON(src && src_anon != src->anon_vma); -/* Are the anon_vma's belonging to each VMA compatible with one another? */ -static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1, - struct vm_area_struct *vma2) -{ - return is_mergeable_anon_vma(vma1->anon_vma, vma2->anon_vma, NULL); + /* Case 1 - we will dup_anon_vma() from src into tgt. */ + if (!tgt_anon && src_anon) + return !vma_had_uncowed_parents(src); + /* Case 2 - we will simply use tgt's anon_vma. */ + if (tgt_anon && !src_anon) + return !vma_had_uncowed_parents(tgt); + /* Case 3 - the anon_vma's are already shared. */ + return src_anon == tgt_anon; } /* @@ -144,6 +172,9 @@ static void init_multi_vma_prep(struct vma_prepare *vp, vp->file = vma->vm_file; if (vp->file) vp->mapping = vma->vm_file->f_mapping; + + if (vmg && vmg->skip_vma_uprobe) + vp->skip_vma_uprobe = true; } /* @@ -164,7 +195,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg) pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); if (is_mergeable_vma(vmg, /* merge_next = */ true) && - is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) { + is_mergeable_anon_vma(vmg, /* merge_next = */ true)) { if (vmg->next->vm_pgoff == vmg->pgoff + pglen) return true; } @@ -184,7 +215,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg) static bool can_vma_merge_after(struct vma_merge_struct *vmg) { if (is_mergeable_vma(vmg, /* merge_next = */ false) && - is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) { + is_mergeable_anon_vma(vmg, /* merge_next = */ false)) { if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff) return true; } @@ -333,10 +364,13 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, if (vp->file) { i_mmap_unlock_write(vp->mapping); - uprobe_mmap(vp->vma); - if (vp->adj_next) - uprobe_mmap(vp->adj_next); + if (!vp->skip_vma_uprobe) { + uprobe_mmap(vp->vma); + + if (vp->adj_next) + uprobe_mmap(vp->adj_next); + } } if (vp->remove) { @@ -400,8 +434,10 @@ static bool can_vma_merge_left(struct vma_merge_struct *vmg) static bool can_vma_merge_right(struct vma_merge_struct *vmg, bool can_merge_left) { - if (!vmg->next || vmg->end != vmg->next->vm_start || - !can_vma_merge_before(vmg)) + struct vm_area_struct *next = vmg->next; + struct vm_area_struct *prev; + + if (!next || vmg->end != next->vm_start ||
!can_vma_merge_before(vmg)) return false; if (!can_merge_left) @@ -414,7 +450,9 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg, * * We therefore check this in addition to mergeability to either side. */ - return are_anon_vmas_compatible(vmg->prev, vmg->next); + prev = vmg->prev; + return !prev->anon_vma || !next->anon_vma || + prev->anon_vma == next->anon_vma; } /* @@ -510,7 +548,14 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, init_vma_prep(&vp, vma); vp.insert = new; vma_prepare(&vp); + + /* + * Get rid of huge pages and shared page tables straddling the split + * boundary. + */ vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL); + if (is_vm_hugetlb_page(vma)) + hugetlb_split(vma, addr); if (new_below) { vma->vm_start = addr; @@ -554,7 +599,9 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, } /* - * dup_anon_vma() - Helper function to duplicate anon_vma + * dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the + * instance that the destination VMA has no anon_vma but the source does. + * * @dst: The destination VMA * @src: The source VMA * @dup: Pointer to the destination VMA when successful. @@ -565,9 +612,18 @@ static int dup_anon_vma(struct vm_area_struct *dst, struct vm_area_struct *src, struct vm_area_struct **dup) { /* - * Easily overlooked: when mprotect shifts the boundary, make sure the - * expanding vma has anon_vma set if the shrinking vma had, to cover any - * anon pages imported. + * There are three cases to consider for correctly propagating + * anon_vma's on merge. + * + * The first is trivial - neither VMA has anon_vma, we need not do + * anything. + * + * The second where both have anon_vma is also a no-op, as they must + * then be the same, so there is simply nothing to copy. + * + * Here we cover the third - if the destination VMA has no anon_vma, + * that is it is unfaulted, we need to ensure that the newly merged + * range is referenced by the anon_vma's of the source. */ if (src->anon_vma && !dst->anon_vma) { int ret; @@ -666,6 +722,9 @@ static void vmg_adjust_set_range(struct vma_merge_struct *vmg) /* * Actually perform the VMA merge operation. * + * IMPORTANT: We guarantee that, should vmg->give_up_on_oom is set, to not + * modify any VMAs or cause inconsistent state should an OOM condition arise. + * * Returns 0 on success, or an error value on failure. */ static int commit_merge(struct vma_merge_struct *vmg) @@ -685,6 +744,12 @@ static int commit_merge(struct vma_merge_struct *vmg) init_multi_vma_prep(&vp, vma, vmg); + /* + * If vmg->give_up_on_oom is set, we're safe, because we don't actually + * manipulate any VMAs until we succeed at preallocation. + * + * Past this point, we will not return an error. + */ if (vma_iter_prealloc(vmg->vmi, vma)) return -ENOMEM; @@ -778,7 +843,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( * furthermost left or right side of the VMA, then we have no chance of * merging and should abort. 
*/ - if (vmg->flags & VM_SPECIAL || (!left_side && !right_side)) + if (vmg->vm_flags & VM_SPECIAL || (!left_side && !right_side)) return NULL; if (left_side) @@ -905,28 +970,28 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( err = dup_anon_vma(next, middle, &anon_dup); } - if (err) + if (err || commit_merge(vmg)) goto abort; - err = commit_merge(vmg); - if (err) { - VM_WARN_ON(err != -ENOMEM); - - if (anon_dup) - unlink_anon_vmas(anon_dup); - - vmg->state = VMA_MERGE_ERROR_NOMEM; - return NULL; - } - - khugepaged_enter_vma(vmg->target, vmg->flags); + khugepaged_enter_vma(vmg->target, vmg->vm_flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; abort: vma_iter_set(vmg->vmi, start); vma_iter_load(vmg->vmi); - vmg->state = VMA_MERGE_ERROR_NOMEM; + + if (anon_dup) + unlink_anon_vmas(anon_dup); + + /* + * This means we have failed to clone anon_vma's correctly, but no + * actual changes to VMAs have occurred, so no harm no foul - if the + * user doesn't want this reported and instead just wants to give up on + * the merge, allow it. + */ + if (!vmg->give_up_on_oom) + vmg->state = VMA_MERGE_ERROR_NOMEM; return NULL; } @@ -983,13 +1048,14 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) mmap_assert_write_locked(vmg->mm); VM_WARN_ON_VMG(vmg->middle, vmg); + VM_WARN_ON_VMG(vmg->target, vmg); /* vmi must point at or before the gap. */ VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg); vmg->state = VMA_MERGE_NOMERGE; /* Special VMAs are unmergeable, also if no prev/next. */ - if ((vmg->flags & VM_SPECIAL) || (!prev && !next)) + if ((vmg->vm_flags & VM_SPECIAL) || (!prev && !next)) return NULL; can_merge_left = can_vma_merge_left(vmg); @@ -998,13 +1064,13 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) /* If we can merge with the next VMA, adjust vmg accordingly. */ if (can_merge_right) { vmg->end = next->vm_end; - vmg->middle = next; + vmg->target = next; } /* If we can merge with the previous VMA, adjust vmg accordingly. */ if (can_merge_left) { vmg->start = prev->vm_start; - vmg->middle = prev; + vmg->target = prev; vmg->pgoff = prev->vm_pgoff; /* @@ -1026,10 +1092,10 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) * Now try to expand adjacent VMA(s). This takes care of removing the * following VMA if we have VMAs on both sides. */ - if (vmg->middle && !vma_expand(vmg)) { - khugepaged_enter_vma(vmg->middle, vmg->flags); + if (vmg->target && !vma_expand(vmg)) { + khugepaged_enter_vma(vmg->target, vmg->vm_flags); vmg->state = VMA_MERGE_SUCCESS; - return vmg->middle; + return vmg->target; } return NULL; @@ -1041,46 +1107,51 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) * @vmg: Describes a VMA expansion operation. * * Expand @vma to vmg->start and vmg->end. Can expand off the start and end. - * Will expand over vmg->next if it's different from vmg->middle and vmg->end == - * vmg->next->vm_end. Checking if the vmg->middle can expand and merge with + * Will expand over vmg->next if it's different from vmg->target and vmg->end == + * vmg->next->vm_end. Checking if the vmg->target can expand and merge with * vmg->next needs to be handled by the caller. * * Returns: 0 on success. * * ASSUMPTIONS: - * - The caller must hold a WRITE lock on vmg->middle->mm->mmap_lock. - * - The caller must have set @vmg->middle and @vmg->next. + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. + * - The caller must have set @vmg->target and @vmg->next. 
*/ int vma_expand(struct vma_merge_struct *vmg) { struct vm_area_struct *anon_dup = NULL; bool remove_next = false; - struct vm_area_struct *middle = vmg->middle; + struct vm_area_struct *target = vmg->target; struct vm_area_struct *next = vmg->next; + VM_WARN_ON_VMG(!target, vmg); + mmap_assert_write_locked(vmg->mm); - vma_start_write(middle); - if (next && (middle != next) && (vmg->end == next->vm_end)) { + vma_start_write(target); + if (next && (target != next) && (vmg->end == next->vm_end)) { int ret; remove_next = true; /* This should already have been checked by this point. */ VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg); vma_start_write(next); - ret = dup_anon_vma(middle, next, &anon_dup); + /* + * In this case we don't report OOM, so vmg->give_up_on_mm is + * safe. + */ + ret = dup_anon_vma(target, next, &anon_dup); if (ret) return ret; } /* Not merging but overwriting any part of next is not handled. */ VM_WARN_ON_VMG(next && !remove_next && - next != middle && vmg->end > next->vm_start, vmg); + next != target && vmg->end > next->vm_start, vmg); /* Only handles expanding */ - VM_WARN_ON_VMG(middle->vm_start < vmg->start || - middle->vm_end > vmg->end, vmg); + VM_WARN_ON_VMG(target->vm_start < vmg->start || + target->vm_end > vmg->end, vmg); - vmg->target = middle; if (remove_next) vmg->__remove_next = true; @@ -1090,9 +1161,15 @@ int vma_expand(struct vma_merge_struct *vmg) return 0; nomem: - vmg->state = VMA_MERGE_ERROR_NOMEM; if (anon_dup) unlink_anon_vmas(anon_dup); + /* + * If the user requests that we just give upon OOM, we are safe to do so + * here, as commit merge provides this contract to us. Nothing has been + * changed - no harm no foul, just don't report it. + */ + if (!vmg->give_up_on_oom) + vmg->state = VMA_MERGE_ERROR_NOMEM; return -ENOMEM; } @@ -1534,6 +1611,13 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) if (vmg_nomem(vmg)) return ERR_PTR(-ENOMEM); + /* + * Split can fail for reasons other than OOM, so if the user requests + * this it's probably a mistake. + */ + VM_WARN_ON(vmg->give_up_on_oom && + (vma->vm_start != start || vma->vm_end != end)); + /* Split any preceding portion of the VMA. 
*/ if (vma->vm_start < start) { int err = split_vma(vmg->vmi, vma, start, 1); @@ -1556,27 +1640,25 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) struct vm_area_struct *vma_modify_flags( struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long new_flags) + vm_flags_t vm_flags) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); - vmg.flags = new_flags; + vmg.vm_flags = vm_flags; return vma_modify(&vmg); } struct vm_area_struct -*vma_modify_flags_name(struct vma_iterator *vmi, +*vma_modify_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long new_flags, struct anon_vma_name *new_name) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); - vmg.flags = new_flags; vmg.anon_name = new_name; return vma_modify(&vmg); @@ -1601,13 +1683,16 @@ struct vm_area_struct struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long new_flags, - struct vm_userfaultfd_ctx new_ctx) + vm_flags_t vm_flags, + struct vm_userfaultfd_ctx new_ctx, + bool give_up_on_oom) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); - vmg.flags = new_flags; + vmg.vm_flags = vm_flags; vmg.uffd_ctx = new_ctx; + if (give_up_on_oom) + vmg.give_up_on_oom = true; return vma_modify(&vmg); } @@ -1740,6 +1825,14 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, faulted_in_anon_vma = false; } + /* + * If the VMA we are copying might contain a uprobe PTE, ensure + * that we do not establish one upon merge. Otherwise, when mremap() + * moves page tables, it will orphan the newly created PTE. + */ + if (vma->vm_file) + vmg.skip_vma_uprobe = true; + new_vma = find_vma_prev(mm, addr, &vmg.prev); if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ @@ -1791,6 +1884,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return new_vma; out_vma_link: + fixup_hugetlb_reservations(new_vma); vma_close(new_vma); if (new_vma->vm_file) @@ -2229,6 +2323,11 @@ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, vms_complete_munmap_vmas(vms, mas_detach); } +static void update_ksm_flags(struct mmap_state *map) +{ + map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags); +} + /* * __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be * unmapped once the map operation is completed, check limits, account mapping @@ -2271,11 +2370,11 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf) } /* Check against address space limit. */ - if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages)) + if (!may_expand_vm(map->mm, map->vm_flags, map->pglen - vms->nr_pages)) return -ENOMEM; /* Private writable mapping: check memory availability. 
*/ - if (accountable_mapping(map->file, map->flags)) { + if (accountable_mapping(map->file, map->vm_flags)) { map->charged = map->pglen; map->charged -= vms->nr_accounted; if (map->charged) { @@ -2285,7 +2384,7 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf) } vms->nr_accounted = 0; - map->flags |= VM_ACCOUNT; + map->vm_flags |= VM_ACCOUNT; } /* @@ -2307,6 +2406,10 @@ static int __mmap_new_file_vma(struct mmap_state *map, int error; vma->vm_file = get_file(map->file); + + if (!map->file->f_op->mmap) + return 0; + error = mmap_file(vma->vm_file, vma); if (error) { fput(vma->vm_file); @@ -2325,13 +2428,12 @@ static int __mmap_new_file_vma(struct mmap_state *map, * Drivers should not permit writability when previously it was * disallowed. */ - VM_WARN_ON_ONCE(map->flags != vma->vm_flags && - !(map->flags & VM_MAYWRITE) && + VM_WARN_ON_ONCE(map->vm_flags != vma->vm_flags && + !(map->vm_flags & VM_MAYWRITE) && (vma->vm_flags & VM_MAYWRITE)); - /* If the flags change (and are mergeable), let's retry later. */ - map->retry_merge = vma->vm_flags != map->flags && !(vma->vm_flags & VM_SPECIAL); - map->flags = vma->vm_flags; + map->file = vma->vm_file; + map->vm_flags = vma->vm_flags; return 0; } @@ -2362,8 +2464,8 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) vma_iter_config(vmi, map->addr, map->end); vma_set_range(vma, map->addr, map->end, map->pgoff); - vm_flags_init(vma, map->flags); - vma->vm_page_prot = vm_get_page_prot(map->flags); + vm_flags_init(vma, map->vm_flags); + vma->vm_page_prot = map->page_prot; if (vma_iter_prealloc(vmi, vma)) { error = -ENOMEM; @@ -2372,7 +2474,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) if (map->file) error = __mmap_new_file_vma(map, vma); - else if (map->flags & VM_SHARED) + else if (map->vm_flags & VM_SHARED) error = shmem_zero_setup(vma); else vma_set_anonymous(vma); @@ -2380,9 +2482,14 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) if (error) goto free_iter_vma; + if (!map->check_ksm_early) { + update_ksm_flags(map); + vm_flags_init(vma, map->vm_flags); + } + #ifdef CONFIG_SPARC64 /* TODO: Fix SPARC ADI! */ - WARN_ON_ONCE(!arch_validate_flags(map->flags)); + WARN_ON_ONCE(!arch_validate_flags(map->vm_flags)); #endif /* Lock the VMA since it is modified after insertion into VMA tree */ @@ -2396,8 +2503,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) * call covers the non-merge case. */ if (!vma_is_anonymous(vma)) - khugepaged_enter_vma(vma, map->flags); - ksm_add_vma(vma); + khugepaged_enter_vma(vma, map->vm_flags); *vmap = vma; return 0; @@ -2418,7 +2524,7 @@ free_vma: static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) { struct mm_struct *mm = map->mm; - unsigned long vm_flags = vma->vm_flags; + vm_flags_t vm_flags = vma->vm_flags; perf_event_mmap(vma); @@ -2450,6 +2556,85 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vma_set_page_prot(vma); } +/* + * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that + * specifies it. + * + * This is called prior to any merge attempt, and updates whitelisted fields + * that are permitted to be updated by the caller. + * + * All but user-defined fields will be pre-populated with original values. + * + * Returns 0 on success, or an error code otherwise. 
+ */ +static int call_mmap_prepare(struct mmap_state *map) +{ + int err; + struct vm_area_desc desc = { + .mm = map->mm, + .start = map->addr, + .end = map->end, + + .pgoff = map->pgoff, + .file = map->file, + .vm_flags = map->vm_flags, + .page_prot = map->page_prot, + }; + + /* Invoke the hook. */ + err = vfs_mmap_prepare(map->file, &desc); + if (err) + return err; + + /* Update fields permitted to be changed. */ + map->pgoff = desc.pgoff; + map->file = desc.file; + map->vm_flags = desc.vm_flags; + map->page_prot = desc.page_prot; + /* User-defined fields. */ + map->vm_ops = desc.vm_ops; + map->vm_private_data = desc.private_data; + + return 0; +} + +static void set_vma_user_defined_fields(struct vm_area_struct *vma, + struct mmap_state *map) +{ + if (map->vm_ops) + vma->vm_ops = map->vm_ops; + vma->vm_private_data = map->vm_private_data; +} + +/* + * Are we guaranteed no driver can change state such as to preclude KSM merging? + * If so, let's set the KSM mergeable flag early so we don't break VMA merging. + */ +static bool can_set_ksm_flags_early(struct mmap_state *map) +{ + struct file *file = map->file; + + /* Anonymous mappings have no driver which can change them. */ + if (!file) + return true; + + /* + * If .mmap_prepare() is specified, then the driver will have already + * manipulated state prior to updating KSM flags. So no need to worry + * about mmap callbacks modifying VMA flags after the KSM flag has been + * updated here, which could otherwise affect KSM eligibility. + */ + if (file->f_op->mmap_prepare) + return true; + + /* shmem is safe. */ + if (shmem_file(file)) + return true; + + /* Any other .mmap callback is not safe. */ + return false; +} + static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) @@ -2457,13 +2642,21 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; int error; + bool have_mmap_prepare = file && file->f_op->mmap_prepare; VMA_ITERATOR(vmi, mm, addr); MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); + map.check_ksm_early = can_set_ksm_flags_early(&map); + error = __mmap_prepare(&map, uf); + if (!error && have_mmap_prepare) + error = call_mmap_prepare(&map); if (error) goto abort_munmap; + if (map.check_ksm_early) + update_ksm_flags(&map); + /* Attempt to merge with adjacent VMAs... */ if (map.prev || map.next) { VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL); @@ -2478,16 +2671,8 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, goto unacct_error; } - /* If flags changed, we might be able to merge, so try again. */ - if (map.retry_merge) { - struct vm_area_struct *merged; - VMG_MMAP_STATE(vmg, &map, vma); - - vma_iter_config(map.vmi, map.addr, map.end); - merged = vma_merge_existing_range(&vmg); - if (merged) - vma = merged; - } + if (have_mmap_prepare) + set_vma_user_defined_fields(vma, &map); __mmap_complete(&map, vma); @@ -2567,14 +2752,14 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * @addr: The start address * @len: The length of the increase * @vma: The vma, - * @flags: The VMA Flags + * @vm_flags: The VMA Flags * * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags * do not match then create a new anonymous VMA. Eventually we may be able to * do some brk-specific accounting here. 
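call_mmap_prepare() above snapshots the mapping request into a descriptor, lets the hook edit only whitelisted fields, and copies the result back before any VMA exists. A standalone sketch of that snapshot/copy-back shape, with simplified local structs and a fictional hook standing in for a driver's mmap_prepare implementation.

#include <stdio.h>

/* Simplified stand-ins for struct vm_area_desc and the mapping request. */
struct desc {
	unsigned long start, end, pgoff;
	unsigned long vm_flags;
	const void *vm_ops;		/* driver-chosen, starts NULL */
	void *private_data;		/* driver-chosen, starts NULL */
};

struct request {
	unsigned long start, end, pgoff;
	unsigned long vm_flags;
	const void *vm_ops;
	void *private_data;
};

#define VM_MAYWRITE 0x1UL		/* illustrative flag bit */

/* A fictional hook: it may only edit the descriptor, never a VMA. */
static int fictional_mmap_prepare(struct desc *d)
{
	d->vm_flags &= ~VM_MAYWRITE;	/* e.g. refuse future writability */
	d->vm_ops = "fictional_vm_ops";
	return 0;
}

static int do_map(struct request *req)
{
	/* Snapshot the request, as call_mmap_prepare() does. */
	struct desc d = {
		.start = req->start, .end = req->end, .pgoff = req->pgoff,
		.vm_flags = req->vm_flags,
	};
	int err = fictional_mmap_prepare(&d);

	if (err)
		return err;

	/* Copy back only the whitelisted fields. */
	req->pgoff = d.pgoff;
	req->vm_flags = d.vm_flags;
	req->vm_ops = d.vm_ops;
	req->private_data = d.private_data;
	return 0;
}

int main(void)
{
	struct request req = { .start = 0x1000, .end = 0x3000,
			       .vm_flags = VM_MAYWRITE };

	do_map(&req);
	printf("flags=%#lx ops=%s\n", req.vm_flags, (const char *)req.vm_ops);
	return 0;
}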
*/ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, unsigned long len, unsigned long flags) + unsigned long addr, unsigned long len, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; @@ -2582,8 +2767,9 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. */ - flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) + vm_flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + vm_flags = ksm_vma_flags(mm, NULL, vm_flags); + if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) return -ENOMEM; if (mm->map_count > sysctl_max_map_count) @@ -2597,7 +2783,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, * occur after forking, so the expand will only happen on new VMAs. */ if (vma && vma->vm_end == addr) { - VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); + VMG_STATE(vmg, mm, vmi, addr, addr + len, vm_flags, PHYS_PFN(addr)); vmg.prev = vma; /* vmi is positioned at prev, which this mode expects. */ @@ -2618,20 +2804,19 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_set_anonymous(vma); vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); - vm_flags_init(vma, flags); - vma->vm_page_prot = vm_get_page_prot(flags); + vm_flags_init(vma, vm_flags); + vma->vm_page_prot = vm_get_page_prot(vm_flags); vma_start_write(vma); if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; mm->map_count++; validate_mm(mm); - ksm_add_vma(vma); out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; mm->data_vm += len >> PAGE_SHIFT; - if (flags & VM_LOCKED) + if (vm_flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vm_flags_set(vma, VM_SOFTDIRTY); return 0; @@ -2974,3 +3159,45 @@ int __vm_munmap(unsigned long start, size_t len, bool unlock) userfaultfd_unmap_complete(mm, &uf); return ret; } + +/* Insert vm structure into process list sorted by address + * and into the inode's i_mmap tree. If vm_file is non-NULL + * then i_mmap_rwsem is taken here. + */ +int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) +{ + unsigned long charged = vma_pages(vma); + + + if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) + return -ENOMEM; + + if ((vma->vm_flags & VM_ACCOUNT) && + security_vm_enough_memory_mm(mm, charged)) + return -ENOMEM; + + /* + * The vm_pgoff of a purely anonymous vma should be irrelevant + * until its first write fault, when page's anon_vma and index + * are set. But now set the vm_pgoff it will almost certainly + * end up with (unless mremap moves it elsewhere before that + * first wfault), so /proc/pid/maps tells a consistent story. + * + * By setting it to reflect the virtual start address of the + * vma, merges and splits can happen in a seamless way, just + * using the existing file pgoff checks and manipulations. + * Similarly in do_mmap and in do_brk_flags. 
+ */ + if (vma_is_anonymous(vma)) { + BUG_ON(vma->anon_vma); + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; + } + + if (vma_link(mm, vma)) { + if (vma->vm_flags & VM_ACCOUNT) + vm_unacct_memory(charged); + return -ENOMEM; + } + + return 0; +} @@ -19,6 +19,8 @@ struct vma_prepare { struct vm_area_struct *insert; struct vm_area_struct *remove; struct vm_area_struct *remove2; + + bool skip_vma_uprobe :1; }; struct unlink_vma_file_batch { @@ -96,7 +98,7 @@ struct vma_merge_struct { unsigned long end; pgoff_t pgoff; - unsigned long flags; + vm_flags_t vm_flags; struct file *file; struct anon_vma *anon_vma; struct mempolicy *policy; @@ -114,6 +116,17 @@ struct vma_merge_struct { */ bool just_expand :1; + /* + * If a merge is possible, but an OOM error occurs, give up and don't + * execute the merge, returning NULL. + */ + bool give_up_on_oom :1; + + /* + * If set, skip uprobe_mmap upon merged vma. + */ + bool skip_vma_uprobe :1; + /* Internal flags set during merge process: */ /* @@ -151,13 +164,13 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start); } -#define VMG_STATE(name, mm_, vmi_, start_, end_, flags_, pgoff_) \ +#define VMG_STATE(name, mm_, vmi_, start_, end_, vm_flags_, pgoff_) \ struct vma_merge_struct name = { \ .mm = mm_, \ .vmi = vmi_, \ .start = start_, \ .end = end_, \ - .flags = flags_, \ + .vm_flags = vm_flags_, \ .pgoff = pgoff_, \ .state = VMA_MERGE_START, \ } @@ -171,7 +184,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, .next = NULL, \ .start = start_, \ .end = end_, \ - .flags = vma_->vm_flags, \ + .vm_flags = vma_->vm_flags, \ .pgoff = vma_pgoff_offset(vma_, start_), \ .file = vma_->vm_file, \ .anon_vma = vma_->anon_vma, \ @@ -209,6 +222,53 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, return 0; } + +/* + * Temporary helper functions for file systems which wrap an invocation of + * f_op->mmap() but which might have an underlying file system which implements + * f_op->mmap_prepare(). + */ + +static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma, + struct vm_area_desc *desc) +{ + desc->mm = vma->vm_mm; + desc->start = vma->vm_start; + desc->end = vma->vm_end; + + desc->pgoff = vma->vm_pgoff; + desc->file = vma->vm_file; + desc->vm_flags = vma->vm_flags; + desc->page_prot = vma->vm_page_prot; + + desc->vm_ops = NULL; + desc->private_data = NULL; + + return desc; +} + +static inline void set_vma_from_desc(struct vm_area_struct *vma, + struct vm_area_desc *desc) +{ + /* + * Since we're invoking .mmap_prepare() despite having a partially + * established VMA, we must take care to handle setting fields + * correctly. + */ + + /* Mutable fields. Populated with initial state. */ + vma->vm_pgoff = desc->pgoff; + if (vma->vm_file != desc->file) + vma_set_file(vma, desc->file); + if (vma->vm_flags != desc->vm_flags) + vm_flags_set(vma, desc->vm_flags); + vma->vm_page_prot = desc->page_prot; + + /* User-defined fields. */ + vma->vm_ops = desc->vm_ops; + vma->vm_private_data = desc->private_data; +} + int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, @@ -228,17 +288,16 @@ __must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long new_flags); + vm_flags_t vm_flags); -/* We are about to modify the VMA's flags and/or anon_name. 
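insert_vm_struct() above sets an anonymous VMA's vm_pgoff to vm_start >> PAGE_SHIFT, so the same pgoff arithmetic used for file-backed adjacency also works for anonymous merges and splits. A small standalone illustration of that invariant; PAGE_SHIFT of 12 is assumed here and is architecture dependent.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12UL		/* assumed for illustration */

struct vma {
	unsigned long vm_start, vm_end, vm_pgoff;
};

/* The usual file-offset adjacency check, applied to anonymous VMAs. */
static bool pgoff_contiguous(const struct vma *prev, const struct vma *next)
{
	unsigned long prev_pages = (prev->vm_end - prev->vm_start) >> PAGE_SHIFT;

	return prev->vm_end == next->vm_start &&
	       next->vm_pgoff == prev->vm_pgoff + prev_pages;
}

int main(void)
{
	/* vm_pgoff mirrors the virtual start address, as in insert_vm_struct(). */
	struct vma a = { 0x400000, 0x402000, 0x400000UL >> PAGE_SHIFT };
	struct vma b = { 0x402000, 0x405000, 0x402000UL >> PAGE_SHIFT };

	printf("mergeable by pgoff: %d\n", pgoff_contiguous(&a, &b));
	return 0;
}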
*/ +/* We are about to modify the VMA's anon_name. */ __must_check struct vm_area_struct -*vma_modify_flags_name(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - unsigned long new_flags, - struct anon_vma_name *new_name); +*vma_modify_name(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + struct anon_vma_name *new_name); /* We are about to modify the VMA's memory policy. */ __must_check struct vm_area_struct @@ -254,8 +313,9 @@ __must_check struct vm_area_struct struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long new_flags, - struct vm_userfaultfd_ctx new_ctx); + vm_flags_t vm_flags, + struct vm_userfaultfd_ctx new_ctx, + bool give_up_on_oom); __must_check struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); @@ -314,7 +374,7 @@ static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma } #ifdef CONFIG_MMU -static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) +static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, vm_flags_t vm_flags) { return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); } @@ -541,4 +601,19 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address); int __vm_munmap(unsigned long start, size_t len, bool unlock); +int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma); + +/* vma_init.h, shared between CONFIG_MMU and nommu. */ +void __init vma_state_init(void); +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm); +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig); +void vm_area_free(struct vm_area_struct *vma); + +/* vma_exec.c */ +#ifdef CONFIG_MMU +int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, + unsigned long *top_mem_p); +int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); +#endif + #endif /* __MM_VMA_H */ diff --git a/mm/vma_exec.c b/mm/vma_exec.c new file mode 100644 index 000000000000..922ee51747a6 --- /dev/null +++ b/mm/vma_exec.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Functions explicitly implemented for exec functionality which however are + * explicitly VMA-only logic. + */ + +#include "vma_internal.h" +#include "vma.h" + +/* + * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between + * this VMA and its relocated range, which will now reside at [vma->vm_start - + * shift, vma->vm_end - shift). + * + * This function is almost certainly NOT what you want for anything other than + * early executable temporary stack relocation. + */ +int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) +{ + /* + * The process proceeds as follows: + * + * 1) Use shift to calculate the new vma endpoints. + * 2) Extend vma to cover both the old and new ranges. This ensures the + * arguments passed to subsequent functions are consistent. + * 3) Move vma's page tables to the new range. + * 4) Free up any cleared pgd range. + * 5) Shrink the vma to cover only the new range. 
+ */ + + struct mm_struct *mm = vma->vm_mm; + unsigned long old_start = vma->vm_start; + unsigned long old_end = vma->vm_end; + unsigned long length = old_end - old_start; + unsigned long new_start = old_start - shift; + unsigned long new_end = old_end - shift; + VMA_ITERATOR(vmi, mm, new_start); + VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff); + struct vm_area_struct *next; + struct mmu_gather tlb; + PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length); + + BUG_ON(new_start > new_end); + + /* + * ensure there are no vmas between where we want to go + * and where we are + */ + if (vma != vma_next(&vmi)) + return -EFAULT; + + vma_iter_prev_range(&vmi); + /* + * cover the whole range: [new_start, old_end) + */ + vmg.target = vma; + if (vma_expand(&vmg)) + return -ENOMEM; + + /* + * move the page tables downwards, on failure we rely on + * process cleanup to remove whatever mess we made. + */ + pmc.for_stack = true; + if (length != move_page_tables(&pmc)) + return -ENOMEM; + + tlb_gather_mmu(&tlb, mm); + next = vma_next(&vmi); + if (new_end > old_start) { + /* + * when the old and new regions overlap clear from new_end. + */ + free_pgd_range(&tlb, new_end, old_end, new_end, + next ? next->vm_start : USER_PGTABLES_CEILING); + } else { + /* + * otherwise, clean from old_start; this is done to not touch + * the address space in [new_end, old_start) some architectures + * have constraints on va-space that make this illegal (IA64) - + * for the others its just a little faster. + */ + free_pgd_range(&tlb, old_start, old_end, new_end, + next ? next->vm_start : USER_PGTABLES_CEILING); + } + tlb_finish_mmu(&tlb); + + vma_prev(&vmi); + /* Shrink the vma to just the new range */ + return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); +} + +/* + * Establish the stack VMA in an execve'd process, located temporarily at the + * maximum stack address provided by the architecture. + * + * We later relocate this downwards in relocate_vma_down(). + * + * This function is almost certainly NOT what you want for anything other than + * early executable initialisation. + * + * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the + * maximum addressable location in the stack (that is capable of storing a + * system word of data). + */ +int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, + unsigned long *top_mem_p) +{ + int err; + struct vm_area_struct *vma = vm_area_alloc(mm); + + if (!vma) + return -ENOMEM; + + vma_set_anonymous(vma); + + if (mmap_write_lock_killable(mm)) { + err = -EINTR; + goto err_free; + } + + /* + * Need to be called with mmap write lock + * held, to avoid race with ksmd. + */ + err = ksm_execve(mm); + if (err) + goto err_ksm; + + /* + * Place the stack at the largest stack address the architecture + * supports. Later, we'll move this to an appropriate place. We don't + * use STACK_TOP because that can depend on attributes which aren't + * configured yet. 
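create_init_stack_vma() above parks a single page at the highest supported stack address and reports the last word in it as the initial top of stack. The layout reduces to three lines of arithmetic; the STACK_TOP_MAX value below is purely illustrative, not any particular architecture's constant.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define FAKE_STACK_TOP_MAX 0x7ffffffff000UL	/* illustrative only */

int main(void)
{
	unsigned long vm_end = FAKE_STACK_TOP_MAX;
	unsigned long vm_start = vm_end - PAGE_SIZE;
	unsigned long top_mem = vm_end - sizeof(void *);

	printf("stack vma [%#lx, %#lx), first usable word at %#lx\n",
	       vm_start, vm_end, top_mem);
	return 0;
}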
+ */ + BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); + vma->vm_end = STACK_TOP_MAX; + vma->vm_start = vma->vm_end - PAGE_SIZE; + vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); + + err = insert_vm_struct(mm, vma); + if (err) + goto err; + + mm->stack_vm = mm->total_vm = 1; + mmap_write_unlock(mm); + *vmap = vma; + *top_mem_p = vma->vm_end - sizeof(void *); + return 0; + +err: + ksm_exit(mm); +err_ksm: + mmap_write_unlock(mm); +err_free: + *vmap = NULL; + vm_area_free(vma); + return err; +} diff --git a/mm/vma_init.c b/mm/vma_init.c new file mode 100644 index 000000000000..8e53c7943561 --- /dev/null +++ b/mm/vma_init.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* + * Functions for initialisaing, allocating, freeing and duplicating VMAs. Shared + * between CONFIG_MMU and non-CONFIG_MMU kernel configurations. + */ + +#include "vma_internal.h" +#include "vma.h" + +/* SLAB cache for vm_area_struct structures */ +static struct kmem_cache *vm_area_cachep; + +void __init vma_state_init(void) +{ + struct kmem_cache_args args = { + .use_freeptr_offset = true, + .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), + }; + + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), &args, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| + SLAB_ACCOUNT); +} + +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!vma) + return NULL; + + vma_init(vma, mm); + + return vma; +} + +static void vm_area_init_from(const struct vm_area_struct *src, + struct vm_area_struct *dest) +{ + dest->vm_mm = src->vm_mm; + dest->vm_ops = src->vm_ops; + dest->vm_start = src->vm_start; + dest->vm_end = src->vm_end; + dest->anon_vma = src->anon_vma; + dest->vm_pgoff = src->vm_pgoff; + dest->vm_file = src->vm_file; + dest->vm_private_data = src->vm_private_data; + vm_flags_init(dest, src->vm_flags); + memcpy(&dest->vm_page_prot, &src->vm_page_prot, + sizeof(dest->vm_page_prot)); + /* + * src->shared.rb may be modified concurrently when called from + * dup_mmap(), but the clone will reinitialize it. + */ + data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); + memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, + sizeof(dest->vm_userfaultfd_ctx)); +#ifdef CONFIG_ANON_VMA_NAME + dest->anon_name = src->anon_name; +#endif +#ifdef CONFIG_SWAP + memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, + sizeof(dest->swap_readahead_info)); +#endif +#ifndef CONFIG_MMU + dest->vm_region = src->vm_region; +#endif +#ifdef CONFIG_NUMA + dest->vm_policy = src->vm_policy; +#endif +#ifdef __HAVE_PFNMAP_TRACKING + dest->pfnmap_track_ctx = NULL; +#endif +} + +#ifdef __HAVE_PFNMAP_TRACKING +static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig, + struct vm_area_struct *new) +{ + struct pfnmap_track_ctx *ctx = orig->pfnmap_track_ctx; + + if (likely(!ctx)) + return 0; + + /* + * We don't expect to ever hit this. If ever required, we would have + * to duplicate the tracking. 
+ */ + if (unlikely(kref_read(&ctx->kref) >= REFCOUNT_MAX)) + return -ENOMEM; + kref_get(&ctx->kref); + new->pfnmap_track_ctx = ctx; + return 0; +} + +static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma) +{ + struct pfnmap_track_ctx *ctx = vma->pfnmap_track_ctx; + + if (likely(!ctx)) + return; + + kref_put(&ctx->kref, pfnmap_track_ctx_release); + vma->pfnmap_track_ctx = NULL; +} +#else +static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig, + struct vm_area_struct *new) +{ + return 0; +} +static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma) +{ +} +#endif + +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) +{ + struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + + if (!new) + return NULL; + + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); + ASSERT_EXCLUSIVE_WRITER(orig->vm_file); + vm_area_init_from(orig, new); + + if (vma_pfnmap_track_ctx_dup(orig, new)) { + kmem_cache_free(vm_area_cachep, new); + return NULL; + } + vma_lock_init(new, true); + INIT_LIST_HEAD(&new->anon_vma_chain); + vma_numab_state_init(new); + dup_anon_vma_name(orig, new); + + return new; +} + +void vm_area_free(struct vm_area_struct *vma) +{ + /* The vma should be detached while being destroyed. */ + vma_assert_detached(vma); + vma_numab_state_free(vma); + free_anon_vma_name(vma); + vma_pfnmap_track_ctx_release(vma); + kmem_cache_free(vm_area_cachep, vma); +} diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3ed720a787ec..6dbcdceecae1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -104,6 +104,9 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; + + arch_enter_lazy_mmu_mode(); + do { if (unlikely(!pte_none(ptep_get(pte)))) { if (pfn_valid(pfn)) { @@ -127,6 +130,8 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte += PFN_DOWN(size), addr += size, addr != end); + + arch_leave_lazy_mmu_mode(); *mask |= PGTBL_PTE_MODIFIED; return 0; } @@ -350,12 +355,30 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pte_t *pte; + pte_t ptent; + unsigned long size = PAGE_SIZE; pte = pte_offset_kernel(pmd, addr); + arch_enter_lazy_mmu_mode(); + do { - pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); +#ifdef CONFIG_HUGETLB_PAGE + size = arch_vmap_pte_range_unmap_size(addr, pte); + if (size != PAGE_SIZE) { + if (WARN_ON(!IS_ALIGNED(addr, size))) { + addr = ALIGN_DOWN(addr, size); + pte = PTR_ALIGN_DOWN(pte, sizeof(*pte) * (size >> PAGE_SHIFT)); + } + ptent = huge_ptep_get_and_clear(&init_mm, addr, pte, size); + if (WARN_ON(end - addr < size)) + size = end - addr; + } else +#endif + ptent = ptep_get_and_clear(&init_mm, addr, pte); WARN_ON(!pte_none(ptent) && !pte_present(ptent)); - } while (pte++, addr += PAGE_SIZE, addr != end); + } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end); + + arch_leave_lazy_mmu_mode(); *mask |= PGTBL_PTE_MODIFIED; } @@ -374,8 +397,10 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, if (cleared || pmd_bad(*pmd)) *mask |= PGTBL_PMD_MODIFIED; - if (cleared) + if (cleared) { + WARN_ON(next - addr < PMD_SIZE); continue; + } if (pmd_none_or_clear_bad(pmd)) continue; vunmap_pte_range(pmd, addr, next, mask); @@ -399,8 +424,10 @@ static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, if (cleared || 
pud_bad(*pud)) *mask |= PGTBL_PUD_MODIFIED; - if (cleared) + if (cleared) { + WARN_ON(next - addr < PUD_SIZE); continue; + } if (pud_none_or_clear_bad(pud)) continue; vunmap_pmd_range(pud, addr, next, mask); @@ -487,6 +514,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { + int err = 0; pte_t *pte; /* @@ -497,21 +525,33 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; + + arch_enter_lazy_mmu_mode(); + do { struct page *page = pages[*nr]; - if (WARN_ON(!pte_none(ptep_get(pte)))) - return -EBUSY; - if (WARN_ON(!page)) - return -ENOMEM; - if (WARN_ON(!pfn_valid(page_to_pfn(page)))) - return -EINVAL; + if (WARN_ON(!pte_none(ptep_get(pte)))) { + err = -EBUSY; + break; + } + if (WARN_ON(!page)) { + err = -ENOMEM; + break; + } + if (WARN_ON(!pfn_valid(page_to_pfn(page)))) { + err = -EINVAL; + break; + } set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); + + arch_leave_lazy_mmu_mode(); *mask |= PGTBL_PTE_MODIFIED; - return 0; + + return err; } static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr, @@ -900,6 +940,11 @@ static struct vmap_node *vmap_nodes = &single; static __read_mostly unsigned int nr_vmap_nodes = 1; static __read_mostly unsigned int vmap_zone_size = 1; +/* A simple iterator over all vmap-nodes. */ +#define for_each_vmap_node(vn) \ + for ((vn) = &vmap_nodes[0]; \ + (vn) < &vmap_nodes[nr_vmap_nodes]; (vn)++) + static inline unsigned int addr_to_node_id(unsigned long addr) { @@ -918,6 +963,19 @@ id_to_node(unsigned int id) return &vmap_nodes[id % nr_vmap_nodes]; } +static inline unsigned int +node_to_id(struct vmap_node *node) +{ + /* Pointer arithmetic. */ + unsigned int id = node - vmap_nodes; + + if (likely(id < nr_vmap_nodes)) + return id; + + WARN_ONCE(1, "An address 0x%p is out-of-bounds.\n", node); + return 0; +} + /* * We use the value 0 to represent "no node", that is why * an encoded value will be the node-id incremented by 1. @@ -990,7 +1048,8 @@ static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); static void drain_vmap_area_work(struct work_struct *work); static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work); -static atomic_long_t nr_vmalloc_pages; +static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages; +static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr; unsigned long vmalloc_nr_pages(void) { @@ -1056,12 +1115,11 @@ find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va) { unsigned long va_start_lowest; struct vmap_node *vn; - int i; repeat: - for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; + va_start_lowest = 0; + for_each_vmap_node(vn) { spin_lock(&vn->busy.lock); *va = __find_vmap_area_exceed_addr(addr, &vn->busy.root); @@ -1698,7 +1756,7 @@ va_clip(struct rb_root *root, struct list_head *head, */ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); if (!lva) - return -1; + return -ENOMEM; } /* @@ -1712,7 +1770,7 @@ va_clip(struct rb_root *root, struct list_head *head, */ va->va_start = nva_start_addr + size; } else { - return -1; + return -EINVAL; } if (type != FL_FIT_TYPE) { @@ -1741,19 +1799,19 @@ va_alloc(struct vmap_area *va, /* Check the "vend" restriction. */ if (nva_start_addr + size > vend) - return vend; + return -ERANGE; /* Update the free vmap_area. 
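The new for_each_vmap_node() iterator and node_to_id() helper above are plain pointer arithmetic over the node array. The same shape in a standalone form, with local names standing in for the vmap node machinery.

#include <stdio.h>

struct node { unsigned long len; };

static struct node nodes[4];
static unsigned int nr_nodes = 4;

/* Walk the array by pointer, like for_each_vmap_node(). */
#define for_each_node_ptr(n) \
	for ((n) = &nodes[0]; (n) < &nodes[nr_nodes]; (n)++)

static unsigned int node_index(struct node *n)
{
	unsigned int id = n - nodes;	/* pointer difference in elements */

	return id < nr_nodes ? id : 0;	/* out of bounds falls back to node 0 */
}

int main(void)
{
	struct node *n;
	unsigned long total = 0;

	for_each_node_ptr(n) {
		n->len = node_index(n) + 1;
		total += n->len;
	}
	printf("total = %lu\n", total);	/* 1 + 2 + 3 + 4 */
	return 0;
}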
*/ ret = va_clip(root, head, va, nva_start_addr, size); if (WARN_ON_ONCE(ret)) - return vend; + return ret; return nva_start_addr; } /* * Returns a start address of the newly allocated area, if success. - * Otherwise a vend is returned that indicates failure. + * Otherwise an error value is returned that indicates failure. */ static __always_inline unsigned long __alloc_vmap_area(struct rb_root *root, struct list_head *head, @@ -1778,14 +1836,13 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head, va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size); if (unlikely(!va)) - return vend; + return -ENOENT; nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend); - if (nva_start_addr == vend) - return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK - find_vmap_lowest_match_check(root, head, size, align); + if (!IS_ERR_VALUE(nva_start_addr)) + find_vmap_lowest_match_check(root, head, size, align); #endif return nva_start_addr; @@ -1915,7 +1972,7 @@ node_alloc(unsigned long size, unsigned long align, struct vmap_area *va; *vn_id = 0; - *addr = vend; + *addr = -EINVAL; /* * Fallback to a global heap if not vmalloc or there @@ -1940,7 +1997,7 @@ static inline void setup_vmalloc_vm(struct vm_struct *vm, { vm->flags = flags; vm->addr = (void *)va->va_start; - vm->size = va_size(va); + vm->size = vm->requested_size = va_size(va); vm->caller = caller; va->vm = vm; } @@ -1995,20 +2052,20 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, } retry: - if (addr == vend) { + if (IS_ERR_VALUE(addr)) { preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, size, align, vstart, vend); spin_unlock(&free_vmap_area_lock); } - trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend); + trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr)); /* - * If an allocation fails, the "vend" address is + * If an allocation fails, the error value is * returned. Therefore trigger the overflow path. */ - if (unlikely(addr == vend)) + if (IS_ERR_VALUE(addr)) goto overflow; va->va_start = addr; @@ -2100,8 +2157,6 @@ static unsigned long lazy_max_pages(void) return log * (32UL * 1024 * 1024 / PAGE_SIZE); } -static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); - /* * Serialize vmap purging. There is no actual critical section protected * by this lock, but we want to avoid concurrent calls for performance @@ -2111,7 +2166,6 @@ static DEFINE_MUTEX(vmap_purge_lock); /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); -static cpumask_t purge_nodes; static void reclaim_list_global(struct list_head *head) @@ -2134,7 +2188,7 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay) LIST_HEAD(decay_list); struct rb_root decay_root = RB_ROOT; struct vmap_area *va, *nva; - unsigned long n_decay; + unsigned long n_decay, pool_len; int i; for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { @@ -2148,22 +2202,20 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay) list_replace_init(&vn->pool[i].head, &tmp_list); spin_unlock(&vn->pool_lock); - if (full_decay) - WRITE_ONCE(vn->pool[i].len, 0); + pool_len = n_decay = vn->pool[i].len; + WRITE_ONCE(vn->pool[i].len, 0); /* Decay a pool by ~25% out of left objects. 
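The hunks above switch the vmap allocators from returning the sentinel vend to returning negative errno values folded into the address word, tested with IS_ERR_VALUE(). A standalone sketch of that encoding; the macro here is a simplified restatement, with MAX_ERRNO of 4095 as in the kernel.

#include <stdio.h>

#define MAX_ERRNO 4095UL
#define IS_ERR_VALUE(x) ((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

/* Either a valid address or a negative errno folded into the same word. */
static unsigned long alloc_range(int simulate_enomem)
{
	if (simulate_enomem)
		return (unsigned long)-12;	/* -ENOMEM */
	return 0xffffc90000000000UL;		/* some vmalloc-looking address */
}

int main(void)
{
	unsigned long addr = alloc_range(0);
	unsigned long err = alloc_range(1);

	printf("addr ok: %d, err ok: %d (errno %ld)\n",
	       !IS_ERR_VALUE(addr), !IS_ERR_VALUE(err),
	       IS_ERR_VALUE(err) ? -(long)err : 0L);
	return 0;
}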
*/ - n_decay = vn->pool[i].len >> 2; + if (!full_decay) + n_decay >>= 2; + pool_len -= n_decay; list_for_each_entry_safe(va, nva, &tmp_list, list) { + if (!n_decay--) + break; + list_del_init(&va->list); merge_or_add_vmap_area(va, &decay_root, &decay_list); - - if (!full_decay) { - WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1); - - if (!--n_decay) - break; - } } /* @@ -2172,9 +2224,10 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay) * can populate the pool therefore a simple list replace * operation takes place here. */ - if (!full_decay && !list_empty(&tmp_list)) { + if (!list_empty(&tmp_list)) { spin_lock(&vn->pool_lock); list_replace_init(&tmp_list, &vn->pool[i].head); + WRITE_ONCE(vn->pool[i].len, pool_len); spin_unlock(&vn->pool_lock); } } @@ -2244,6 +2297,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end, { unsigned long nr_purged_areas = 0; unsigned int nr_purge_helpers; + static cpumask_t purge_nodes; unsigned int nr_purge_nodes; struct vmap_node *vn; int i; @@ -2255,9 +2309,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end, */ purge_nodes = CPU_MASK_NONE; - for (i = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; - + for_each_vmap_node(vn) { INIT_LIST_HEAD(&vn->purge_list); vn->skip_populate = full_pool_decay; decay_va_pool_node(vn, full_pool_decay); @@ -2276,7 +2328,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end, end = max(end, list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end); - cpumask_set_cpu(i, &purge_nodes); + cpumask_set_cpu(node_to_id(vn), &purge_nodes); } nr_purge_nodes = cpumask_weight(&purge_nodes); @@ -2355,7 +2407,7 @@ static void free_vmap_area_noflush(struct vmap_area *va) if (WARN_ON_ONCE(!list_empty(&va->list))) return; - nr_lazy = atomic_long_add_return(va_size(va) >> PAGE_SHIFT, + nr_lazy = atomic_long_add_return_relaxed(va_size(va) >> PAGE_SHIFT, &vmap_lazy_nr); /* @@ -2421,7 +2473,7 @@ struct vmap_area *find_vmap_area(unsigned long addr) if (va) return va; - } while ((i = (i + 1) % nr_vmap_nodes) != j); + } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j); return NULL; } @@ -2447,7 +2499,7 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr) if (va) return va; - } while ((i = (i + 1) % nr_vmap_nodes) != j); + } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j); return NULL; } @@ -2916,10 +2968,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) */ void vm_unmap_aliases(void) { - unsigned long start = ULONG_MAX, end = 0; - int flush = 0; - - _vm_unmap_aliases(start, end, flush); + _vm_unmap_aliases(ULONG_MAX, 0, 0); } EXPORT_SYMBOL_GPL(vm_unmap_aliases); @@ -3100,7 +3149,7 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm) /* * Before removing VM_UNINITIALIZED, * we should make sure that vm has proper values. - * Pair with smp_rmb() in show_numa_info(). + * Pair with smp_rmb() in vread_iter() and vmalloc_info_show(). */ smp_wmb(); vm->flags &= ~VM_UNINITIALIZED; @@ -3133,6 +3182,7 @@ struct vm_struct *__get_vm_area_node(unsigned long size, area->flags = flags; area->caller = caller; + area->requested_size = requested_size; va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area); if (IS_ERR(va)) { @@ -3370,12 +3420,13 @@ void vfree(const void *addr) if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) vm_reset_perms(vm); + /* All pages of vm should be charged to same memcg, so use first one. 
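decay_va_pool_node() above now detaches the whole per-size pool, works out how many entries to drop (everything for a full decay, otherwise roughly a quarter), and reattaches the remainder with a single length update. The counting itself is trivial enough to show standalone.

#include <stdio.h>

static unsigned long pool_after_decay(unsigned long len, int full_decay)
{
	unsigned long n_decay = len;

	if (!full_decay)
		n_decay >>= 2;	/* drop ~25% of what is left */
	return len - n_decay;
}

int main(void)
{
	printf("partial: %lu -> %lu\n", 64UL, pool_after_decay(64, 0));
	printf("full:    %lu -> %lu\n", 64UL, pool_after_decay(64, 1));
	return 0;
}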
*/ + if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES)) + mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages); for (i = 0; i < vm->nr_pages; i++) { struct page *page = vm->pages[i]; BUG_ON(!page); - if (!(vm->flags & VM_MAP_PUT_PAGES)) - mod_memcg_page_state(page, MEMCG_VMALLOC, -1); /* * High-order allocs for huge vmallocs are split, so * can be freed as an array of order-0 allocations @@ -3671,12 +3722,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, node, page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (gfp_mask & __GFP_ACCOUNT) { - int i; - - for (i = 0; i < area->nr_pages; i++) - mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1); - } + /* All pages of vm should be charged to same memcg, so use first one. */ + if (gfp_mask & __GFP_ACCOUNT && area->nr_pages) + mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC, + area->nr_pages); /* * If not enough pages were obtained to accomplish an @@ -3943,9 +3992,10 @@ void *vmalloc_noprof(unsigned long size) EXPORT_SYMBOL(vmalloc_noprof); /** - * vmalloc_huge - allocate virtually contiguous memory, allow huge pages + * vmalloc_huge_node - allocate virtually contiguous memory, allow huge pages * @size: allocation size * @gfp_mask: flags for the page level allocator + * @node: node to use for allocation or NUMA_NO_NODE * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. @@ -3954,13 +4004,13 @@ EXPORT_SYMBOL(vmalloc_noprof); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) +void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) { return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, - gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, - NUMA_NO_NODE, __builtin_return_address(0)); + gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, + node, __builtin_return_address(0)); } -EXPORT_SYMBOL_GPL(vmalloc_huge_noprof); +EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof); /** * vzalloc - allocate virtually contiguous memory with zero fill @@ -4063,6 +4113,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof); */ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) { + struct vm_struct *vm = NULL; + size_t alloced_size = 0; size_t old_size = 0; void *n; @@ -4072,15 +4124,17 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) } if (p) { - struct vm_struct *vm; - vm = find_vm_area(p); if (unlikely(!vm)) { WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p); return NULL; } - old_size = get_vm_area_size(vm); + alloced_size = get_vm_area_size(vm); + old_size = vm->requested_size; + if (WARN(alloced_size < old_size, + "vrealloc() has mismatched area vs requested sizes (%p)\n", p)) + return NULL; } /* @@ -4088,11 +4142,26 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) * would be a good heuristic for when to shrink the vm_area? */ if (size <= old_size) { - /* Zero out spare memory. */ - if (want_init_on_alloc(flags)) + /* Zero out "freed" memory, potentially for future realloc. */ + if (want_init_on_free() || want_init_on_alloc(flags)) memset((void *)p + size, 0, old_size - size); + vm->requested_size = size; kasan_poison_vmalloc(p + size, old_size - size); - kasan_unpoison_vmalloc(p, size, KASAN_VMALLOC_PROT_NORMAL); + return (void *)p; + } + + /* + * We already have the bytes available in the allocation; use them. 
+ */ + if (size <= alloced_size) { + kasan_unpoison_vmalloc(p + old_size, size - old_size, + KASAN_VMALLOC_PROT_NORMAL); + /* + * No need to zero memory here, as unused memory will have + * already been zeroed at initial allocation time or during + * realloc shrink time. + */ + vm->requested_size = size; return (void *)p; } @@ -4914,39 +4983,37 @@ bool vmalloc_dump_obj(void *object) #endif #ifdef CONFIG_PROC_FS -static void show_numa_info(struct seq_file *m, struct vm_struct *v) -{ - if (IS_ENABLED(CONFIG_NUMA)) { - unsigned int nr, *counters = m->private; - unsigned int step = 1U << vm_area_page_order(v); - if (!counters) - return; +/* + * Print number of pages allocated on each memory node. + * + * This function can only be called if CONFIG_NUMA is enabled + * and VM_UNINITIALIZED bit in v->flags is disabled. + */ +static void show_numa_info(struct seq_file *m, struct vm_struct *v, + unsigned int *counters) +{ + unsigned int nr; + unsigned int step = 1U << vm_area_page_order(v); - if (v->flags & VM_UNINITIALIZED) - return; - /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ - smp_rmb(); + if (!counters) + return; - memset(counters, 0, nr_node_ids * sizeof(unsigned int)); + memset(counters, 0, nr_node_ids * sizeof(unsigned int)); - for (nr = 0; nr < v->nr_pages; nr += step) - counters[page_to_nid(v->pages[nr])] += step; - for_each_node_state(nr, N_HIGH_MEMORY) - if (counters[nr]) - seq_printf(m, " N%u=%u", nr, counters[nr]); - } + for (nr = 0; nr < v->nr_pages; nr += step) + counters[page_to_nid(v->pages[nr])] += step; + for_each_node_state(nr, N_HIGH_MEMORY) + if (counters[nr]) + seq_printf(m, " N%u=%u", nr, counters[nr]); } static void show_purge_info(struct seq_file *m) { struct vmap_node *vn; struct vmap_area *va; - int i; - - for (i = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; + for_each_vmap_node(vn) { spin_lock(&vn->lazy.lock); list_for_each_entry(va, &vn->lazy.head, list) { seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", @@ -4962,11 +5029,12 @@ static int vmalloc_info_show(struct seq_file *m, void *p) struct vmap_node *vn; struct vmap_area *va; struct vm_struct *v; - int i; + unsigned int *counters; - for (i = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; + if (IS_ENABLED(CONFIG_NUMA)) + counters = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); + for_each_vmap_node(vn) { spin_lock(&vn->busy.lock); list_for_each_entry(va, &vn->busy.head, list) { if (!va->vm) { @@ -4979,6 +5047,11 @@ static int vmalloc_info_show(struct seq_file *m, void *p) } v = va->vm; + if (v->flags & VM_UNINITIALIZED) + continue; + + /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ + smp_rmb(); seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); @@ -5013,7 +5086,9 @@ static int vmalloc_info_show(struct seq_file *m, void *p) if (is_vmalloc_addr(v->pages)) seq_puts(m, " vpages"); - show_numa_info(m, v); + if (IS_ENABLED(CONFIG_NUMA)) + show_numa_info(m, v, counters); + seq_putc(m, '\n'); } spin_unlock(&vn->busy.lock); @@ -5023,19 +5098,14 @@ static int vmalloc_info_show(struct seq_file *m, void *p) * As a final step, dump "unpurged" areas. 
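The vrealloc() hunks above track both the size the allocator actually reserved and the size the caller last asked for, so shrinks and in-range grows avoid a fresh allocation and copy. A standalone model of that decision, zeroing the reclaimed tail on shrink so a later regrow hands back zeroed bytes; all names are local stand-ins.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct varea {
	void *p;
	size_t alloced_size;	/* what the allocator actually reserved */
	size_t requested_size;	/* what the caller last asked for */
};

static void *model_vrealloc(struct varea *a, size_t size)
{
	if (size <= a->requested_size) {
		/* Shrink in place, zero the tail for a future regrow. */
		memset((char *)a->p + size, 0, a->requested_size - size);
		a->requested_size = size;
		return a->p;
	}
	if (size <= a->alloced_size) {
		/* Grow in place: the bytes are already there and zeroed. */
		a->requested_size = size;
		return a->p;
	}
	return NULL;	/* would need a fresh allocation plus copy */
}

int main(void)
{
	struct varea a = { calloc(1, 8192), 8192, 6000 };

	printf("shrink: %p\n", model_vrealloc(&a, 100));
	printf("grow in place: %p\n", model_vrealloc(&a, 7000));
	printf("needs new area: %p\n", model_vrealloc(&a, 20000));
	free(a.p);
	return 0;
}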
*/ show_purge_info(m); + if (IS_ENABLED(CONFIG_NUMA)) + kfree(counters); return 0; } static int __init proc_vmalloc_init(void) { - void *priv_data = NULL; - - if (IS_ENABLED(CONFIG_NUMA)) - priv_data = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); - - proc_create_single_data("vmallocinfo", - 0400, NULL, vmalloc_info_show, priv_data); - + proc_create_single("vmallocinfo", 0400, NULL, vmalloc_info_show); return 0; } module_init(proc_vmalloc_init); @@ -5087,7 +5157,7 @@ static void __init vmap_init_free_space(void) static void vmap_init_nodes(void) { struct vmap_node *vn; - int i, n; + int i; #if BITS_PER_LONG == 64 /* @@ -5104,7 +5174,7 @@ static void vmap_init_nodes(void) * set of cores. Therefore a per-domain purging is supposed to * be added as well as a per-domain balancing. */ - n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); + int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); if (n > 1) { vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN); @@ -5119,8 +5189,7 @@ static void vmap_init_nodes(void) } #endif - for (n = 0; n < nr_vmap_nodes; n++) { - vn = &vmap_nodes[n]; + for_each_vmap_node(vn) { vn->busy.root = RB_ROOT; INIT_LIST_HEAD(&vn->busy.head); spin_lock_init(&vn->busy.lock); @@ -5141,15 +5210,13 @@ static void vmap_init_nodes(void) static unsigned long vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - unsigned long count; + unsigned long count = 0; struct vmap_node *vn; - int i, j; - - for (count = 0, i = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; + int i; - for (j = 0; j < MAX_VA_SIZE_PAGES; j++) - count += READ_ONCE(vn->pool[j].len); + for_each_vmap_node(vn) { + for (i = 0; i < MAX_VA_SIZE_PAGES; i++) + count += READ_ONCE(vn->pool[i].len); } return count ? count : SHRINK_EMPTY; @@ -5158,10 +5225,10 @@ vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc) static unsigned long vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { - int i; + struct vmap_node *vn; - for (i = 0; i < nr_vmap_nodes; i++) - decay_va_pool_node(&vmap_nodes[i], true); + for_each_vmap_node(vn) + decay_va_pool_node(vn, true); return SHRINK_STOP; } diff --git a/mm/vmpressure.c b/mm/vmpressure.c index bd5183dfd879..c197ed47bcc4 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -316,7 +316,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, * asserted for a second in which subsequent * pressure events can occur. */ - WRITE_ONCE(memcg->socket_pressure, jiffies + HZ); + mem_cgroup_set_socket_pressure(memcg); } } } diff --git a/mm/vmscan.c b/mm/vmscan.c index b620d74b0f66..7de11524a936 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -57,6 +57,7 @@ #include <linux/rculist_nulls.h> #include <linux/random.h> #include <linux/mmu_notifier.h> +#include <linux/parser.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -93,10 +94,8 @@ struct scan_control { unsigned long anon_cost; unsigned long file_cost; -#ifdef CONFIG_MEMCG /* Swappiness value for proactive reclaim. Always use sc_swappiness()! */ int *proactive_swappiness; -#endif /* Can active folios be deactivated as part of reclaim? */ #define DEACTIVATE_ANON 1 @@ -120,7 +119,7 @@ struct scan_control { /* Has cache_trim_mode failed at least once? 
*/ unsigned int cache_trim_mode_failed:1; - /* Proactive reclaim invoked by userspace through memory.reclaim */ + /* Proactive reclaim invoked by userspace */ unsigned int proactive:1; /* @@ -342,16 +341,22 @@ static void flush_reclaim_state(struct scan_control *sc) } } -static bool can_demote(int nid, struct scan_control *sc) +static bool can_demote(int nid, struct scan_control *sc, + struct mem_cgroup *memcg) { + int demotion_nid; + if (!numa_demotion_enabled) return false; if (sc && sc->no_demotion) return false; - if (next_demotion_node(nid) == NUMA_NO_NODE) + + demotion_nid = next_demotion_node(nid); + if (demotion_nid == NUMA_NO_NODE) return false; - return true; + /* If demotion node isn't in the cgroup's mems_allowed, fall back */ + return mem_cgroup_node_allowed(memcg, demotion_nid); } static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, @@ -376,7 +381,7 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, * * Can it be reclaimed from this node via demotion? */ - return can_demote(nid, sc); + return can_demote(nid, sc, memcg); } /* @@ -646,23 +651,53 @@ typedef enum { PAGE_CLEAN, } pageout_t; +static pageout_t writeout(struct folio *folio, struct address_space *mapping, + struct swap_iocb **plug, struct list_head *folio_list) +{ + int res; + + folio_set_reclaim(folio); + + /* + * The large shmem folio can be split if CONFIG_THP_SWAP is not enabled + * or we failed to allocate contiguous swap entries, in which case + * the split out folios get added back to folio_list. + */ + if (shmem_mapping(mapping)) + res = shmem_writeout(folio, plug, folio_list); + else + res = swap_writeout(folio, plug); + + if (res < 0) + handle_write_error(mapping, folio, res); + if (res == AOP_WRITEPAGE_ACTIVATE) { + folio_clear_reclaim(folio); + return PAGE_ACTIVATE; + } + + /* synchronous write? */ + if (!folio_test_writeback(folio)) + folio_clear_reclaim(folio); + + trace_mm_vmscan_write_folio(folio); + node_stat_add_folio(folio, NR_VMSCAN_WRITE); + return PAGE_SUCCESS; +} + /* * pageout is called by shrink_folio_list() for each dirty folio. - * Calls ->writepage(). */ static pageout_t pageout(struct folio *folio, struct address_space *mapping, struct swap_iocb **plug, struct list_head *folio_list) { /* - * If the folio is dirty, only perform writeback if that write - * will be non-blocking. To prevent this allocation from being - * stalled by pagecache activity. But note that there may be - * stalls if we need to run get_block(). We could test - * PagePrivate for that. - * - * If this process is currently in __generic_file_write_iter() against - * this folio's queue, we can perform writeback even if that - * will block. + * We no longer attempt to writeback filesystem folios here, other + * than tmpfs/shmem. That's taken care of in page-writeback. + * If we find a dirty filesystem folio at the end of the LRU list, + * typically that means the filesystem is saturating the storage + * with contiguous writes and telling it to write a folio here + * would only make the situation worse by injecting an element + * of random access. * * If the folio is swapcache, write it back even if that would * block, for some throttling. 
This happens by accident, because @@ -685,47 +720,12 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, } return PAGE_KEEP; } - if (mapping->a_ops->writepage == NULL) - return PAGE_ACTIVATE; - - if (folio_clear_dirty_for_io(folio)) { - int res; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .nr_to_write = SWAP_CLUSTER_MAX, - .range_start = 0, - .range_end = LLONG_MAX, - .for_reclaim = 1, - .swap_plug = plug, - }; - - /* - * The large shmem folio can be split if CONFIG_THP_SWAP is - * not enabled or contiguous swap entries are failed to - * allocate. - */ - if (shmem_mapping(mapping) && folio_test_large(folio)) - wbc.list = folio_list; - - folio_set_reclaim(folio); - res = mapping->a_ops->writepage(&folio->page, &wbc); - if (res < 0) - handle_write_error(mapping, folio, res); - if (res == AOP_WRITEPAGE_ACTIVATE) { - folio_clear_reclaim(folio); - return PAGE_ACTIVATE; - } - - if (!folio_test_writeback(folio)) { - /* synchronous write or broken a_ops? */ - folio_clear_reclaim(folio); - } - trace_mm_vmscan_write_folio(folio); - node_stat_add_folio(folio, NR_VMSCAN_WRITE); - return PAGE_SUCCESS; - } - return PAGE_CLEAN; + if (!shmem_mapping(mapping) && !folio_test_anon(folio)) + return PAGE_ACTIVATE; + if (!folio_clear_dirty_for_io(folio)) + return PAGE_CLEAN; + return writeout(folio, mapping, plug, folio_list); } /* @@ -906,7 +906,7 @@ static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { int referenced_ptes, referenced_folio; - unsigned long vm_flags; + vm_flags_t vm_flags; referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, &vm_flags); @@ -1005,7 +1005,8 @@ static void folio_check_dirty_writeback(struct folio *folio, mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); } -struct folio *alloc_migrate_folio(struct folio *src, unsigned long private) +static struct folio *alloc_demote_folio(struct folio *src, + unsigned long private) { struct folio *dst; nodemask_t *allowed_mask; @@ -1068,7 +1069,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, node_get_allowed_targets(pgdat, &allowed_mask); /* Demotion ignores all cpuset and mempolicy settings */ - migrate_pages(demote_folios, alloc_migrate_folio, NULL, + migrate_pages(demote_folios, alloc_demote_folio, NULL, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); @@ -1096,7 +1097,8 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) */ static unsigned int shrink_folio_list(struct list_head *folio_list, struct pglist_data *pgdat, struct scan_control *sc, - struct reclaim_stat *stat, bool ignore_references) + struct reclaim_stat *stat, bool ignore_references, + struct mem_cgroup *memcg) { struct folio_batch free_folios; LIST_HEAD(ret_folios); @@ -1109,7 +1111,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, folio_batch_init(&free_folios); memset(stat, 0, sizeof(*stat)); cond_resched(); - do_demote_pass = can_demote(pgdat->node_id, sc); + do_demote_pass = can_demote(pgdat->node_id, sc, memcg); retry: while (!list_empty(folio_list)) { @@ -1128,6 +1130,14 @@ retry: goto keep; if (folio_contain_hwpoisoned_page(folio)) { + /* + * unmap_poisoned_folio() can't handle large + * folio, just skip it. memory_failure() will + * handle it if the UCE is triggered again. 
+ */ + if (folio_test_large(folio)) + goto keep_locked; + unmap_poisoned_folio(folio, folio_pfn(folio), false); folio_unlock(folio); folio_put(folio); @@ -1187,8 +1197,10 @@ retry: * 2) Global or new memcg reclaim encounters a folio that is * not marked for immediate reclaim, or the caller does not * have __GFP_FS (or __GFP_IO if it's simply going to swap, - * not to fs). In this case mark the folio for immediate - * reclaim and continue scanning. + * not to fs), or the folio belongs to a mapping where + * waiting on writeback during reclaim may lead to a deadlock. + * In this case mark the folio for immediate reclaim and + * continue scanning. * * Require may_enter_fs() because we would wait on fs, which * may not have submitted I/O yet. And the loop driver might @@ -1213,6 +1225,8 @@ retry: * takes to write them to disk. */ if (folio_test_writeback(folio)) { + mapping = folio_mapping(folio); + /* Case 1 above */ if (current_is_kswapd() && folio_test_reclaim(folio) && @@ -1223,7 +1237,9 @@ retry: /* Case 2 above */ } else if (writeback_throttling_sane(sc) || !folio_test_reclaim(folio) || - !may_enter_fs(folio, sc->gfp_mask)) { + !may_enter_fs(folio, sc->gfp_mask) || + (mapping && + mapping_writeback_may_deadlock_on_reclaim(mapping))) { /* * This is slightly racy - * folio_end_writeback() might have @@ -1642,9 +1658,11 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, unsigned int noreclaim_flag; list_for_each_entry_safe(folio, next, folio_list, lru) { + /* TODO: these pages should not even appear in this list. */ + if (page_has_movable_ops(&folio->page)) + continue; if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) && - !folio_test_dirty(folio) && !__folio_test_movable(folio) && - !folio_test_unevictable(folio)) { + !folio_test_dirty(folio) && !folio_test_unevictable(folio)) { folio_clear_active(folio); list_move(&folio->lru, &clean_folios); } @@ -1658,7 +1676,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, */ noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc, - &stat, true); + &stat, true, NULL); memalloc_noreclaim_restore(noreclaim_flag); list_splice(&clean_folios, folio_list); @@ -1725,13 +1743,11 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, unsigned long nr_taken = 0; unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; - unsigned long skipped = 0; - unsigned long scan, total_scan, nr_pages; + unsigned long skipped = 0, total_scan = 0, scan = 0; + unsigned long nr_pages; unsigned long max_nr_skipped = 0; LIST_HEAD(folios_skipped); - total_scan = 0; - scan = 0; while (scan < nr_to_scan && !list_empty(src)) { struct list_head *move_to = src; struct folio *folio; @@ -2023,7 +2039,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, item = PGSCAN_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); - __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); __count_vm_events(PGSCAN_ANON + file, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); @@ -2031,7 +2047,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); + nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false, + lruvec_memcg(lruvec)); spin_lock_irq(&lruvec->lru_lock); move_folios_to_lru(lruvec, 
&folio_list); @@ -2042,11 +2059,11 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, item = PGSTEAL_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); - __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); + count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); - spin_unlock_irq(&lruvec->lru_lock); - lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); + lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout, + nr_scanned - nr_reclaimed); /* * If dirty folios are scanned that are not queued for IO, it @@ -2112,7 +2129,7 @@ static void shrink_active_list(unsigned long nr_to_scan, { unsigned long nr_taken; unsigned long nr_scanned; - unsigned long vm_flags; + vm_flags_t vm_flags; LIST_HEAD(l_hold); /* The folios which were snipped off */ LIST_HEAD(l_active); LIST_HEAD(l_inactive); @@ -2132,7 +2149,7 @@ static void shrink_active_list(unsigned long nr_to_scan, if (!cgroup_reclaim(sc)) __count_vm_events(PGREFILL, nr_scanned); - __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); @@ -2189,13 +2206,11 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); __count_vm_events(PGDEACTIVATE, nr_deactivate); - __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - spin_unlock_irq(&lruvec->lru_lock); - if (nr_rotated) - lru_note_cost(lruvec, file, 0, nr_rotated); + lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } @@ -2214,7 +2229,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list, .no_demotion = 1, }; - nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true); + nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL); while (!list_empty(folio_list)) { folio = lru_to_folio(folio_list); list_del(&folio->lru); @@ -2467,6 +2482,69 @@ static inline void calculate_pressure_balance(struct scan_control *sc, *denominator = ap + fp; } +static unsigned long apply_proportional_protection(struct mem_cgroup *memcg, + struct scan_control *sc, unsigned long scan) +{ + unsigned long min, low; + + mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low); + + if (min || low) { + /* + * Scale a cgroup's reclaim pressure by proportioning + * its current usage to its memory.low or memory.min + * setting. + * + * This is important, as otherwise scanning aggression + * becomes extremely binary -- from nothing as we + * approach the memory protection threshold, to totally + * nominal as we exceed it. This results in requiring + * setting extremely liberal protection thresholds. It + * also means we simply get no protection at all if we + * set it too low, which is not ideal. + * + * If there is any protection in place, we reduce scan + * pressure by how much of the total memory used is + * within protection thresholds. + * + * There is one special case: in the first reclaim pass, + * we skip over all groups that are within their low + * protection. 
If that fails to reclaim enough pages to + * satisfy the reclaim goal, we come back and override + * the best-effort low protection. However, we still + * ideally want to honor how well-behaved groups are in + * that case instead of simply punishing them all + * equally. As such, we reclaim them based on how much + * memory they are using, reducing the scan pressure + * again by how much of the total memory used is under + * hard protection. + */ + unsigned long cgroup_size = mem_cgroup_size(memcg); + unsigned long protection; + + /* memory.low scaling, make sure we retry before OOM */ + if (!sc->memcg_low_reclaim && low > min) { + protection = low; + sc->memcg_low_skipped = 1; + } else { + protection = min; + } + + /* Avoid TOCTOU with earlier protection check */ + cgroup_size = max(cgroup_size, protection); + + scan -= scan * protection / (cgroup_size + 1); + + /* + * Minimally target SWAP_CLUSTER_MAX pages to keep + * reclaim moving forwards, avoiding decrementing + * sc->priority further than desirable. + */ + scan = max(scan, SWAP_CLUSTER_MAX); + } + return scan; +} + /* * Determine how aggressively the anon and file LRU lists should be * scanned. @@ -2503,6 +2581,13 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, goto out; } + /* Proactive reclaim initiated by userspace for anonymous memory only */ + if (swappiness == SWAPPINESS_ANON_ONLY) { + WARN_ON_ONCE(!sc->proactive); + scan_balance = SCAN_ANON; + goto out; + } + /* * Do not apply any pressure balancing cleverness when the * system is close to OOM, scan both anon and file equally @@ -2523,7 +2608,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, /* * If there is enough inactive page cache, we do not reclaim - * anything from the anonymous working right now. + * anything from the anonymous working right now to make sure + * a streaming file access pattern doesn't cause swapping. */ if (sc->cache_trim_mode) { scan_balance = SCAN_FILE; @@ -2537,70 +2623,10 @@ out: for_each_evictable_lru(lru) { bool file = is_file_lru(lru); unsigned long lruvec_size; - unsigned long low, min; unsigned long scan; lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - mem_cgroup_protection(sc->target_mem_cgroup, memcg, - &min, &low); - - if (min || low) { - /* - * Scale a cgroup's reclaim pressure by proportioning - * its current usage to its memory.low or memory.min - * setting. - * - * This is important, as otherwise scanning aggression - * becomes extremely binary -- from nothing as we - * approach the memory protection threshold, to totally - * nominal as we exceed it. This results in requiring - * setting extremely liberal protection thresholds. It - * also means we simply get no protection at all if we - * set it too low, which is not ideal. - * - * If there is any protection in place, we reduce scan - * pressure by how much of the total memory used is - * within protection thresholds. - * - * There is one special case: in the first reclaim pass, - * we skip over all groups that are within their low - * protection. If that fails to reclaim enough pages to - * satisfy the reclaim goal, we come back and override - * the best-effort low protection. However, we still - * ideally want to honor how well-behaved groups are in - * that case instead of simply punishing them all - * equally. As such, we reclaim them based on how much - * memory they are using, reducing the scan pressure - * again by how much of the total memory used is under - * hard protection. 
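The scaling this comment describes can be checked with a small standalone sketch (hypothetical numbers, plain userspace C rather than the kernel function): the scan target is reduced by the protected share of the cgroup's usage and then clamped so reclaim keeps making forward progress.

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

/* Mirror of the scaling above: scan -= scan * protection / (usage + 1). */
static unsigned long scale_scan(unsigned long scan, unsigned long usage,
				unsigned long protection)
{
	if (usage < protection)		/* avoid the TOCTOU underflow noted above */
		usage = protection;
	scan -= scan * protection / (usage + 1);
	return scan < SWAP_CLUSTER_MAX ? SWAP_CLUSTER_MAX : scan;
}

int main(void)
{
	/* 51200-page scan target, 51200 pages in use, 38400 pages protected. */
	printf("%lu\n", scale_scan(51200, 51200, 38400));	/* prints 12801 */
	return 0;
}

With three quarters of the usage protected, roughly three quarters of the pressure is waived; a fully protected group still sees SWAP_CLUSTER_MAX worth of scanning, so sc->priority does not have to drop further.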
- */ - unsigned long cgroup_size = mem_cgroup_size(memcg); - unsigned long protection; - - /* memory.low scaling, make sure we retry before OOM */ - if (!sc->memcg_low_reclaim && low > min) { - protection = low; - sc->memcg_low_skipped = 1; - } else { - protection = min; - } - - /* Avoid TOCTOU with earlier protection check */ - cgroup_size = max(cgroup_size, protection); - - scan = lruvec_size - lruvec_size * protection / - (cgroup_size + 1); - - /* - * Minimally target SWAP_CLUSTER_MAX pages to keep - * reclaim moving forwards, avoiding decrementing - * sc->priority further than desirable. - */ - scan = max(scan, SWAP_CLUSTER_MAX); - } else { - scan = lruvec_size; - } - + scan = apply_proportional_protection(memcg, sc, lruvec_size); scan >>= sc->priority; /* @@ -2646,7 +2672,7 @@ out: * Anonymous LRU management is a waste if there is * ultimately no way to reclaim the memory. */ -static bool can_age_anon_pages(struct pglist_data *pgdat, +static bool can_age_anon_pages(struct lruvec *lruvec, struct scan_control *sc) { /* Aging the anon LRU is valuable if swap is present: */ @@ -2654,7 +2680,8 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, return true; /* Also valuable if anon pages can be demoted: */ - return can_demote(pgdat->node_id, sc); + return can_demote(lruvec_pgdat(lruvec)->node_id, sc, + lruvec_memcg(lruvec)); } #ifdef CONFIG_LRU_GEN @@ -2690,8 +2717,12 @@ static bool should_clear_pmd_young(void) READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ } +/* Get the min/max evictable type based on swappiness */ +#define min_type(swappiness) (!(swappiness)) +#define max_type(swappiness) ((swappiness) < SWAPPINESS_ANON_ONLY) + #define evictable_min_seq(min_seq, swappiness) \ - min((min_seq)[!(swappiness)], (min_seq)[(swappiness) <= MAX_SWAPPINESS]) + min((min_seq)[min_type(swappiness)], (min_seq)[max_type(swappiness)]) #define for_each_gen_type_zone(gen, type, zone) \ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ @@ -2699,7 +2730,7 @@ static bool should_clear_pmd_young(void) for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) #define for_each_evictable_type(type, swappiness) \ - for ((type) = !(swappiness); (type) <= ((swappiness) <= MAX_SWAPPINESS); (type)++) + for ((type) = min_type(swappiness); (type) <= max_type(swappiness); (type)++) #define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) #define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) @@ -2732,7 +2763,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) if (!sc->may_swap) return 0; - if (!can_demote(pgdat->node_id, sc) && + if (!can_demote(pgdat->node_id, sc, memcg) && mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) return 0; @@ -3401,7 +3432,7 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned if (!pte_present(pte) || is_zero_pfn(pfn)) return -1; - if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) + if (WARN_ON_ONCE(pte_special(pte))) return -1; if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm)) @@ -3426,9 +3457,6 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) return -1; - if (WARN_ON_ONCE(pmd_devmap(pmd))) - return -1; - if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm)) return -1; @@ -3850,7 +3878,12 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness) int hist = lru_hist_from_seq(lrugen->min_seq[type]); int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); - if (type ? 
swappiness > MAX_SWAPPINESS : !swappiness) + /* For file type, skip the check if swappiness is anon only */ + if (type && (swappiness == SWAPPINESS_ANON_ONLY)) + goto done; + + /* For anon type, skip the check if swappiness is zero (file only) */ + if (!type && !swappiness) goto done; /* prevent cold/hot inversion if the type is evictable */ @@ -3894,6 +3927,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) { int gen, type, zone; bool success = false; + bool seq_inc_flag = false; struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); @@ -3910,11 +3944,20 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) } min_seq[type]++; + seq_inc_flag = true; } next: ; } + /* + * If min_seq[type] of both anonymous and file is not increased, + * we can directly return false to avoid unnecessary checking + * overhead later. + */ + if (!seq_inc_flag) + return success; + /* see the comment on lru_gen_folio */ if (swappiness && swappiness <= MAX_SWAPPINESS) { unsigned long seq = lrugen->max_seq - MIN_NR_GENS; @@ -4521,8 +4564,9 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca return true; } -static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, - int type, int tier, struct list_head *list) +static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, int type, int tier, + struct list_head *list) { int i; int gen; @@ -4531,7 +4575,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, int scanned = 0; int isolated = 0; int skipped = 0; - int remaining = MAX_LRU_BATCH; + int remaining = min(nr_to_scan, MAX_LRU_BATCH); struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); @@ -4588,8 +4632,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, __count_vm_events(item, isolated); __count_vm_events(PGREFILL, sorted); } - __count_memcg_events(memcg, item, isolated); - __count_memcg_events(memcg, PGREFILL, sorted); + count_memcg_events(memcg, item, isolated); + count_memcg_events(memcg, PGREFILL, sorted); __count_vm_events(PGSCAN_ANON + type, isolated); trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH, scanned, skipped, isolated, @@ -4642,7 +4686,8 @@ static int get_type_to_scan(struct lruvec *lruvec, int swappiness) return positive_ctrl_err(&sp, &pv); } -static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, +static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, int swappiness, int *type_scanned, struct list_head *list) { int i; @@ -4654,7 +4699,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw *type_scanned = type; - scanned = scan_folios(lruvec, sc, type, tier, list); + scanned = scan_folios(nr_to_scan, lruvec, sc, type, tier, list); if (scanned) return scanned; @@ -4664,7 +4709,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw return 0; } -static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) +static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, int swappiness) { int type; int scanned; @@ -4683,7 +4729,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap spin_lock_irq(&lruvec->lru_lock); - scanned = isolate_folios(lruvec, sc, swappiness, &type, &list); + scanned = isolate_folios(nr_to_scan, 
lruvec, sc, swappiness, &type, &list); scanned += try_to_inc_min_seq(lruvec, swappiness); @@ -4695,7 +4741,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap if (list_empty(&list)) return scanned; retry: - reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); + reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg); sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; sc->nr_reclaimed += reclaimed; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, @@ -4739,7 +4785,7 @@ retry: item = PGSTEAL_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) __count_vm_events(item, reclaimed); - __count_memcg_events(memcg, item, reclaimed); + count_memcg_events(memcg, item, reclaimed); __count_vm_events(PGSTEAL_ANON + type, reclaimed); spin_unlock_irq(&lruvec->lru_lock); @@ -4804,6 +4850,8 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int s if (nr_to_scan && !mem_cgroup_online(memcg)) return nr_to_scan; + nr_to_scan = apply_proportional_protection(memcg, sc, nr_to_scan); + /* try to get away with not aging at the default priority */ if (!success || sc->priority == DEF_PRIORITY) return nr_to_scan >> sc->priority; @@ -4856,7 +4904,7 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) if (nr_to_scan <= 0) break; - delta = evict_folios(lruvec, sc, swappiness); + delta = evict_folios(nr_to_scan, lruvec, sc, swappiness); if (!delta) break; @@ -5387,7 +5435,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, static int lru_gen_seq_show(struct seq_file *m, void *v) { unsigned long seq; - bool full = !debugfs_real_fops(m->file)->write; + bool full = debugfs_get_aux_num(m->file); struct lruvec *lruvec = v; struct lru_gen_folio *lrugen = &lruvec->lrugen; int nid = lruvec_pgdat(lruvec)->node_id; @@ -5477,7 +5525,8 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co if (sc->nr_reclaimed >= nr_to_reclaim) return 0; - if (!evict_folios(lruvec, sc, swappiness)) + if (!evict_folios(nr_to_reclaim - sc->nr_reclaimed, lruvec, sc, + swappiness)) return 0; cond_resched(); @@ -5516,7 +5565,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, if (swappiness < MIN_SWAPPINESS) swappiness = get_swappiness(lruvec, sc); - else if (swappiness > MAX_SWAPPINESS + 1) + else if (swappiness > SWAPPINESS_ANON_ONLY) goto done; switch (cmd) { @@ -5573,24 +5622,35 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, while ((cur = strsep(&next, ",;\n"))) { int n; int end; - char cmd; + char cmd, swap_string[5]; unsigned int memcg_id; unsigned int nid; unsigned long seq; - unsigned int swappiness = -1; + unsigned int swappiness; unsigned long opt = -1; cur = skip_spaces(cur); if (!*cur) continue; - n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, - &seq, &end, &swappiness, &end, &opt, &end); + n = sscanf(cur, "%c %u %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid, + &seq, &end, swap_string, &end, &opt, &end); if (n < 4 || cur[end]) { err = -EINVAL; break; } + if (n == 4) { + swappiness = -1; + } else if (!strcmp("max", swap_string)) { + /* set by userspace for anonymous memory only */ + swappiness = SWAPPINESS_ANON_ONLY; + } else { + err = kstrtouint(swap_string, 0, &swappiness); + if (err) + break; + } + err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); if (err) break; @@ -5712,8 +5772,10 @@ static int __init init_lru_gen(void) if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) 
pr_err("lru_gen: failed to create sysfs group\n"); - debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); - debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); + debugfs_create_file_aux_num("lru_gen", 0644, NULL, NULL, 1, + &lru_gen_rw_fops); + debugfs_create_file_aux_num("lru_gen_full", 0444, NULL, NULL, 0, + &lru_gen_ro_fops); return 0; }; @@ -5850,7 +5912,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) && + if (can_age_anon_pages(lruvec, sc) && inactive_is_low(lruvec, LRU_INACTIVE_ANON)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -6669,6 +6731,15 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, return nr_reclaimed; } +#else +unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + unsigned long nr_pages, + gfp_t gfp_mask, + unsigned int reclaim_options, + int *swappiness) +{ + return 0; +} #endif static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) @@ -6681,10 +6752,10 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) return; } - if (!can_age_anon_pages(pgdat, sc)) + lruvec = mem_cgroup_lruvec(NULL, pgdat); + if (!can_age_anon_pages(lruvec, sc)) return; - lruvec = mem_cgroup_lruvec(NULL, pgdat); if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON)) return; @@ -6736,6 +6807,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) * meet watermarks. */ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { + enum zone_stat_item item; unsigned long free_pages; if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) @@ -6746,11 +6818,33 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) /* * In defrag_mode, watermarks must be met in whole * blocks to avoid polluting allocator fallbacks. + * + * However, kswapd usually cannot accomplish this on + * its own and needs kcompactd support. Once it's + * reclaimed a compaction gap, and kswapd_shrink_node + * has dropped order, simply ensure there are enough + * base pages for compaction, wake kcompactd & sleep. */ - if (defrag_mode) - free_pages = zone_page_state(zone, NR_FREE_PAGES_BLOCKS); + if (defrag_mode && order) + item = NR_FREE_PAGES_BLOCKS; else - free_pages = zone_page_state(zone, NR_FREE_PAGES); + item = NR_FREE_PAGES; + + /* + * When there is a high number of CPUs in the system, + * the cumulative error from the vmstat per-cpu cache + * can blur the line between the watermarks. In that + * case, be safe and get an accurate snapshot. + * + * TODO: NR_FREE_PAGES_BLOCKS moves in steps of + * pageblock_nr_pages, while the vmstat pcp threshold + * is limited to 125. On many configurations that + * counter won't actually be per-cpu cached. But keep + * things simple for now; revisit when somebody cares. + */ + free_pages = zone_page_state(zone, item); + if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark) + free_pages = zone_page_state_snapshot(zone, item); if (__zone_watermark_ok(zone, order, mark, highest_zoneidx, 0, free_pages)) @@ -7540,36 +7634,26 @@ static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) /* * Try to free up some pages from this node through reclaim. 
*/ -static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) +static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, + unsigned long nr_pages, + struct scan_control *sc) { - /* Minimum pages needed in order to stay on node */ - const unsigned long nr_pages = 1 << order; struct task_struct *p = current; unsigned int noreclaim_flag; - struct scan_control sc = { - .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = current_gfp_context(gfp_mask), - .order = order, - .priority = NODE_RECLAIM_PRIORITY, - .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), - .may_swap = 1, - .reclaim_idx = gfp_zone(gfp_mask), - }; unsigned long pflags; - trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, - sc.gfp_mask); + trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, sc->order, + sc->gfp_mask); cond_resched(); psi_memstall_enter(&pflags); delayacct_freepages_start(); - fs_reclaim_acquire(sc.gfp_mask); + fs_reclaim_acquire(sc->gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP */ noreclaim_flag = memalloc_noreclaim_save(); - set_task_reclaim_state(p, &sc.reclaim_state); + set_task_reclaim_state(p, &sc->reclaim_state); if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages || node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) { @@ -7578,24 +7662,36 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in * priorities until we have enough memory freed. */ do { - shrink_node(pgdat, &sc); - } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); + shrink_node(pgdat, sc); + } while (sc->nr_reclaimed < nr_pages && --sc->priority >= 0); } set_task_reclaim_state(p, NULL); memalloc_noreclaim_restore(noreclaim_flag); - fs_reclaim_release(sc.gfp_mask); - psi_memstall_leave(&pflags); + fs_reclaim_release(sc->gfp_mask); delayacct_freepages_end(); + psi_memstall_leave(&pflags); - trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); + trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed); - return sc.nr_reclaimed >= nr_pages; + return sc->nr_reclaimed; } int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { int ret; + /* Minimum pages needed in order to stay on node */ + const unsigned long nr_pages = 1 << order; + struct scan_control sc = { + .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), + .gfp_mask = current_gfp_context(gfp_mask), + .order = order, + .priority = NODE_RECLAIM_PRIORITY, + .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), + .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), + .may_swap = 1, + .reclaim_idx = gfp_zone(gfp_mask), + }; /* * Node reclaim reclaims unmapped file backed pages and @@ -7630,7 +7726,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) return NODE_RECLAIM_NOSCAN; - ret = __node_reclaim(pgdat, gfp_mask, order); + ret = __node_reclaim(pgdat, gfp_mask, nr_pages, &sc) >= nr_pages; clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags); if (ret) @@ -7640,6 +7736,114 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) return ret; } + +enum { + MEMORY_RECLAIM_SWAPPINESS = 0, + MEMORY_RECLAIM_SWAPPINESS_MAX, + MEMORY_RECLAIM_NULL, +}; +static const match_table_t tokens = { + { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"}, + { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"}, + { MEMORY_RECLAIM_NULL, NULL }, 
+}; + +int user_proactive_reclaim(char *buf, + struct mem_cgroup *memcg, pg_data_t *pgdat) +{ + unsigned int nr_retries = MAX_RECLAIM_RETRIES; + unsigned long nr_to_reclaim, nr_reclaimed = 0; + int swappiness = -1; + char *old_buf, *start; + substring_t args[MAX_OPT_ARGS]; + gfp_t gfp_mask = GFP_KERNEL; + + if (!buf || (!memcg && !pgdat) || (memcg && pgdat)) + return -EINVAL; + + buf = strstrip(buf); + + old_buf = buf; + nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; + if (buf == old_buf) + return -EINVAL; + + buf = strstrip(buf); + + while ((start = strsep(&buf, " ")) != NULL) { + if (!strlen(start)) + continue; + switch (match_token(start, tokens, args)) { + case MEMORY_RECLAIM_SWAPPINESS: + if (match_int(&args[0], &swappiness)) + return -EINVAL; + if (swappiness < MIN_SWAPPINESS || + swappiness > MAX_SWAPPINESS) + return -EINVAL; + break; + case MEMORY_RECLAIM_SWAPPINESS_MAX: + swappiness = SWAPPINESS_ANON_ONLY; + break; + default: + return -EINVAL; + } + } + + while (nr_reclaimed < nr_to_reclaim) { + /* Will converge on zero, but reclaim enforces a minimum */ + unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4; + unsigned long reclaimed; + + if (signal_pending(current)) + return -EINTR; + + /* + * This is the final attempt, drain percpu lru caches in the + * hope of introducing more evictable pages. + */ + if (!nr_retries) + lru_add_drain_all(); + + if (memcg) { + unsigned int reclaim_options; + + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | + MEMCG_RECLAIM_PROACTIVE; + reclaimed = try_to_free_mem_cgroup_pages(memcg, + batch_size, gfp_mask, + reclaim_options, + swappiness == -1 ? NULL : &swappiness); + } else { + struct scan_control sc = { + .gfp_mask = current_gfp_context(gfp_mask), + .reclaim_idx = gfp_zone(gfp_mask), + .proactive_swappiness = swappiness == -1 ? NULL : &swappiness, + .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, + .nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX), + .may_unmap = 1, + .may_swap = 1, + .proactive = 1, + }; + + if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, + &pgdat->flags)) + return -EBUSY; + + reclaimed = __node_reclaim(pgdat, gfp_mask, + batch_size, &sc); + clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags); + } + + if (!reclaimed && !nr_retries--) + return -EAGAIN; + + nr_reclaimed += reclaimed; + } + + return 0; +} + #endif /** @@ -7687,3 +7891,26 @@ void check_move_unevictable_folios(struct folio_batch *fbatch) } } EXPORT_SYMBOL_GPL(check_move_unevictable_folios); + +#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) +static ssize_t reclaim_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret, nid = dev->id; + + ret = user_proactive_reclaim((char *)buf, NULL, NODE_DATA(nid)); + return ret ? 
-EAGAIN : count; +} + +static DEVICE_ATTR_WO(reclaim); +int reclaim_register_node(struct node *node) +{ + return device_create_file(&node->dev, &dev_attr_reclaim); +} + +void reclaim_unregister_node(struct node *node) +{ + return device_remove_file(&node->dev, &dev_attr_reclaim); +} +#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 4c268ce39ff2..71cd1ceba191 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -7,7 +7,7 @@ * * zoned VM statistics * Copyright (C) 2006 Silicon Graphics, Inc., - * Christoph Lameter <christoph@lameter.com> + * Christoph Lameter <cl@gentwo.org> * Copyright (C) 2008-2014 Christoph Lameter */ #include <linux/fs.h> @@ -1163,320 +1163,339 @@ int fragmentation_index(struct zone *zone, unsigned int order) #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \ defined(CONFIG_NUMA) || defined(CONFIG_MEMCG) #ifdef CONFIG_ZONE_DMA -#define TEXT_FOR_DMA(xx) xx "_dma", +#define TEXT_FOR_DMA(xx, yy) [xx##_DMA] = yy "_dma", #else -#define TEXT_FOR_DMA(xx) +#define TEXT_FOR_DMA(xx, yy) #endif #ifdef CONFIG_ZONE_DMA32 -#define TEXT_FOR_DMA32(xx) xx "_dma32", +#define TEXT_FOR_DMA32(xx, yy) [xx##_DMA32] = yy "_dma32", #else -#define TEXT_FOR_DMA32(xx) +#define TEXT_FOR_DMA32(xx, yy) #endif #ifdef CONFIG_HIGHMEM -#define TEXT_FOR_HIGHMEM(xx) xx "_high", +#define TEXT_FOR_HIGHMEM(xx, yy) [xx##_HIGH] = yy "_high", #else -#define TEXT_FOR_HIGHMEM(xx) +#define TEXT_FOR_HIGHMEM(xx, yy) #endif #ifdef CONFIG_ZONE_DEVICE -#define TEXT_FOR_DEVICE(xx) xx "_device", +#define TEXT_FOR_DEVICE(xx, yy) [xx##_DEVICE] = yy "_device", #else -#define TEXT_FOR_DEVICE(xx) +#define TEXT_FOR_DEVICE(xx, yy) #endif -#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \ - TEXT_FOR_HIGHMEM(xx) xx "_movable", \ - TEXT_FOR_DEVICE(xx) +#define TEXTS_FOR_ZONES(xx, yy) \ + TEXT_FOR_DMA(xx, yy) \ + TEXT_FOR_DMA32(xx, yy) \ + [xx##_NORMAL] = yy "_normal", \ + TEXT_FOR_HIGHMEM(xx, yy) \ + [xx##_MOVABLE] = yy "_movable", \ + TEXT_FOR_DEVICE(xx, yy) const char * const vmstat_text[] = { /* enum zone_stat_item counters */ - "nr_free_pages", - "nr_free_pages_blocks", - "nr_zone_inactive_anon", - "nr_zone_active_anon", - "nr_zone_inactive_file", - "nr_zone_active_file", - "nr_zone_unevictable", - "nr_zone_write_pending", - "nr_mlock", - "nr_bounce", +#define I(x) (x) + [I(NR_FREE_PAGES)] = "nr_free_pages", + [I(NR_FREE_PAGES_BLOCKS)] = "nr_free_pages_blocks", + [I(NR_ZONE_INACTIVE_ANON)] = "nr_zone_inactive_anon", + [I(NR_ZONE_ACTIVE_ANON)] = "nr_zone_active_anon", + [I(NR_ZONE_INACTIVE_FILE)] = "nr_zone_inactive_file", + [I(NR_ZONE_ACTIVE_FILE)] = "nr_zone_active_file", + [I(NR_ZONE_UNEVICTABLE)] = "nr_zone_unevictable", + [I(NR_ZONE_WRITE_PENDING)] = "nr_zone_write_pending", + [I(NR_MLOCK)] = "nr_mlock", #if IS_ENABLED(CONFIG_ZSMALLOC) - "nr_zspages", + [I(NR_ZSPAGES)] = "nr_zspages", #endif - "nr_free_cma", + [I(NR_FREE_CMA_PAGES)] = "nr_free_cma", #ifdef CONFIG_UNACCEPTED_MEMORY - "nr_unaccepted", + [I(NR_UNACCEPTED)] = "nr_unaccepted", #endif +#undef I /* enum numa_stat_item counters */ +#define I(x) (NR_VM_ZONE_STAT_ITEMS + x) #ifdef CONFIG_NUMA - "numa_hit", - "numa_miss", - "numa_foreign", - "numa_interleave", - "numa_local", - "numa_other", + [I(NUMA_HIT)] = "numa_hit", + [I(NUMA_MISS)] = "numa_miss", + [I(NUMA_FOREIGN)] = "numa_foreign", + [I(NUMA_INTERLEAVE_HIT)] = "numa_interleave", + [I(NUMA_LOCAL)] = "numa_local", + [I(NUMA_OTHER)] = "numa_other", #endif +#undef I /* enum node_stat_item counters */ - "nr_inactive_anon", - "nr_active_anon", - "nr_inactive_file", - "nr_active_file", 
- "nr_unevictable", - "nr_slab_reclaimable", - "nr_slab_unreclaimable", - "nr_isolated_anon", - "nr_isolated_file", - "workingset_nodes", - "workingset_refault_anon", - "workingset_refault_file", - "workingset_activate_anon", - "workingset_activate_file", - "workingset_restore_anon", - "workingset_restore_file", - "workingset_nodereclaim", - "nr_anon_pages", - "nr_mapped", - "nr_file_pages", - "nr_dirty", - "nr_writeback", - "nr_writeback_temp", - "nr_shmem", - "nr_shmem_hugepages", - "nr_shmem_pmdmapped", - "nr_file_hugepages", - "nr_file_pmdmapped", - "nr_anon_transparent_hugepages", - "nr_vmscan_write", - "nr_vmscan_immediate_reclaim", - "nr_dirtied", - "nr_written", - "nr_throttled_written", - "nr_kernel_misc_reclaimable", - "nr_foll_pin_acquired", - "nr_foll_pin_released", - "nr_kernel_stack", +#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + x) + [I(NR_INACTIVE_ANON)] = "nr_inactive_anon", + [I(NR_ACTIVE_ANON)] = "nr_active_anon", + [I(NR_INACTIVE_FILE)] = "nr_inactive_file", + [I(NR_ACTIVE_FILE)] = "nr_active_file", + [I(NR_UNEVICTABLE)] = "nr_unevictable", + [I(NR_SLAB_RECLAIMABLE_B)] = "nr_slab_reclaimable", + [I(NR_SLAB_UNRECLAIMABLE_B)] = "nr_slab_unreclaimable", + [I(NR_ISOLATED_ANON)] = "nr_isolated_anon", + [I(NR_ISOLATED_FILE)] = "nr_isolated_file", + [I(WORKINGSET_NODES)] = "workingset_nodes", + [I(WORKINGSET_REFAULT_ANON)] = "workingset_refault_anon", + [I(WORKINGSET_REFAULT_FILE)] = "workingset_refault_file", + [I(WORKINGSET_ACTIVATE_ANON)] = "workingset_activate_anon", + [I(WORKINGSET_ACTIVATE_FILE)] = "workingset_activate_file", + [I(WORKINGSET_RESTORE_ANON)] = "workingset_restore_anon", + [I(WORKINGSET_RESTORE_FILE)] = "workingset_restore_file", + [I(WORKINGSET_NODERECLAIM)] = "workingset_nodereclaim", + [I(NR_ANON_MAPPED)] = "nr_anon_pages", + [I(NR_FILE_MAPPED)] = "nr_mapped", + [I(NR_FILE_PAGES)] = "nr_file_pages", + [I(NR_FILE_DIRTY)] = "nr_dirty", + [I(NR_WRITEBACK)] = "nr_writeback", + [I(NR_SHMEM)] = "nr_shmem", + [I(NR_SHMEM_THPS)] = "nr_shmem_hugepages", + [I(NR_SHMEM_PMDMAPPED)] = "nr_shmem_pmdmapped", + [I(NR_FILE_THPS)] = "nr_file_hugepages", + [I(NR_FILE_PMDMAPPED)] = "nr_file_pmdmapped", + [I(NR_ANON_THPS)] = "nr_anon_transparent_hugepages", + [I(NR_VMSCAN_WRITE)] = "nr_vmscan_write", + [I(NR_VMSCAN_IMMEDIATE)] = "nr_vmscan_immediate_reclaim", + [I(NR_DIRTIED)] = "nr_dirtied", + [I(NR_WRITTEN)] = "nr_written", + [I(NR_THROTTLED_WRITTEN)] = "nr_throttled_written", + [I(NR_KERNEL_MISC_RECLAIMABLE)] = "nr_kernel_misc_reclaimable", + [I(NR_FOLL_PIN_ACQUIRED)] = "nr_foll_pin_acquired", + [I(NR_FOLL_PIN_RELEASED)] = "nr_foll_pin_released", + [I(NR_KERNEL_STACK_KB)] = "nr_kernel_stack", #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) - "nr_shadow_call_stack", + [I(NR_KERNEL_SCS_KB)] = "nr_shadow_call_stack", #endif - "nr_page_table_pages", - "nr_sec_page_table_pages", + [I(NR_PAGETABLE)] = "nr_page_table_pages", + [I(NR_SECONDARY_PAGETABLE)] = "nr_sec_page_table_pages", #ifdef CONFIG_IOMMU_SUPPORT - "nr_iommu_pages", + [I(NR_IOMMU_PAGES)] = "nr_iommu_pages", #endif #ifdef CONFIG_SWAP - "nr_swapcached", + [I(NR_SWAPCACHE)] = "nr_swapcached", #endif #ifdef CONFIG_NUMA_BALANCING - "pgpromote_success", - "pgpromote_candidate", + [I(PGPROMOTE_SUCCESS)] = "pgpromote_success", + [I(PGPROMOTE_CANDIDATE)] = "pgpromote_candidate", #endif - "pgdemote_kswapd", - "pgdemote_direct", - "pgdemote_khugepaged", - "pgdemote_proactive", + [I(PGDEMOTE_KSWAPD)] = "pgdemote_kswapd", + [I(PGDEMOTE_DIRECT)] = "pgdemote_direct", + [I(PGDEMOTE_KHUGEPAGED)] = "pgdemote_khugepaged", 
+ [I(PGDEMOTE_PROACTIVE)] = "pgdemote_proactive", #ifdef CONFIG_HUGETLB_PAGE - "nr_hugetlb", + [I(NR_HUGETLB)] = "nr_hugetlb", #endif - "nr_balloon_pages", - /* system-wide enum vm_stat_item counters */ - "nr_dirty_threshold", - "nr_dirty_background_threshold", - "nr_memmap_pages", - "nr_memmap_boot_pages", + [I(NR_BALLOON_PAGES)] = "nr_balloon_pages", +#undef I -#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) + /* system-wide enum vm_stat_item counters */ +#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \ + NR_VM_NODE_STAT_ITEMS + x) + [I(NR_DIRTY_THRESHOLD)] = "nr_dirty_threshold", + [I(NR_DIRTY_BG_THRESHOLD)] = "nr_dirty_background_threshold", + [I(NR_MEMMAP_PAGES)] = "nr_memmap_pages", + [I(NR_MEMMAP_BOOT_PAGES)] = "nr_memmap_boot_pages", +#undef I + +#if defined(CONFIG_VM_EVENT_COUNTERS) /* enum vm_event_item counters */ - "pgpgin", - "pgpgout", - "pswpin", - "pswpout", - - TEXTS_FOR_ZONES("pgalloc") - TEXTS_FOR_ZONES("allocstall") - TEXTS_FOR_ZONES("pgskip") - - "pgfree", - "pgactivate", - "pgdeactivate", - "pglazyfree", - - "pgfault", - "pgmajfault", - "pglazyfreed", - - "pgrefill", - "pgreuse", - "pgsteal_kswapd", - "pgsteal_direct", - "pgsteal_khugepaged", - "pgsteal_proactive", - "pgscan_kswapd", - "pgscan_direct", - "pgscan_khugepaged", - "pgscan_proactive", - "pgscan_direct_throttle", - "pgscan_anon", - "pgscan_file", - "pgsteal_anon", - "pgsteal_file", +#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \ + NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS + x) + + [I(PGPGIN)] = "pgpgin", + [I(PGPGOUT)] = "pgpgout", + [I(PSWPIN)] = "pswpin", + [I(PSWPOUT)] = "pswpout", + +#define OFF (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \ + NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS) + TEXTS_FOR_ZONES(OFF+PGALLOC, "pgalloc") + TEXTS_FOR_ZONES(OFF+ALLOCSTALL, "allocstall") + TEXTS_FOR_ZONES(OFF+PGSCAN_SKIP, "pgskip") +#undef OFF + + [I(PGFREE)] = "pgfree", + [I(PGACTIVATE)] = "pgactivate", + [I(PGDEACTIVATE)] = "pgdeactivate", + [I(PGLAZYFREE)] = "pglazyfree", + + [I(PGFAULT)] = "pgfault", + [I(PGMAJFAULT)] = "pgmajfault", + [I(PGLAZYFREED)] = "pglazyfreed", + + [I(PGREFILL)] = "pgrefill", + [I(PGREUSE)] = "pgreuse", + [I(PGSTEAL_KSWAPD)] = "pgsteal_kswapd", + [I(PGSTEAL_DIRECT)] = "pgsteal_direct", + [I(PGSTEAL_KHUGEPAGED)] = "pgsteal_khugepaged", + [I(PGSTEAL_PROACTIVE)] = "pgsteal_proactive", + [I(PGSCAN_KSWAPD)] = "pgscan_kswapd", + [I(PGSCAN_DIRECT)] = "pgscan_direct", + [I(PGSCAN_KHUGEPAGED)] = "pgscan_khugepaged", + [I(PGSCAN_PROACTIVE)] = "pgscan_proactive", + [I(PGSCAN_DIRECT_THROTTLE)] = "pgscan_direct_throttle", + [I(PGSCAN_ANON)] = "pgscan_anon", + [I(PGSCAN_FILE)] = "pgscan_file", + [I(PGSTEAL_ANON)] = "pgsteal_anon", + [I(PGSTEAL_FILE)] = "pgsteal_file", #ifdef CONFIG_NUMA - "zone_reclaim_success", - "zone_reclaim_failed", + [I(PGSCAN_ZONE_RECLAIM_SUCCESS)] = "zone_reclaim_success", + [I(PGSCAN_ZONE_RECLAIM_FAILED)] = "zone_reclaim_failed", #endif - "pginodesteal", - "slabs_scanned", - "kswapd_inodesteal", - "kswapd_low_wmark_hit_quickly", - "kswapd_high_wmark_hit_quickly", - "pageoutrun", + [I(PGINODESTEAL)] = "pginodesteal", + [I(SLABS_SCANNED)] = "slabs_scanned", + [I(KSWAPD_INODESTEAL)] = "kswapd_inodesteal", + [I(KSWAPD_LOW_WMARK_HIT_QUICKLY)] = "kswapd_low_wmark_hit_quickly", + [I(KSWAPD_HIGH_WMARK_HIT_QUICKLY)] = "kswapd_high_wmark_hit_quickly", + [I(PAGEOUTRUN)] = "pageoutrun", - "pgrotated", + [I(PGROTATED)] = "pgrotated", - "drop_pagecache", - "drop_slab", - "oom_kill", + [I(DROP_PAGECACHE)] = "drop_pagecache", + [I(DROP_SLAB)] 
= "drop_slab", + [I(OOM_KILL)] = "oom_kill", #ifdef CONFIG_NUMA_BALANCING - "numa_pte_updates", - "numa_huge_pte_updates", - "numa_hint_faults", - "numa_hint_faults_local", - "numa_pages_migrated", + [I(NUMA_PTE_UPDATES)] = "numa_pte_updates", + [I(NUMA_HUGE_PTE_UPDATES)] = "numa_huge_pte_updates", + [I(NUMA_HINT_FAULTS)] = "numa_hint_faults", + [I(NUMA_HINT_FAULTS_LOCAL)] = "numa_hint_faults_local", + [I(NUMA_PAGE_MIGRATE)] = "numa_pages_migrated", #endif #ifdef CONFIG_MIGRATION - "pgmigrate_success", - "pgmigrate_fail", - "thp_migration_success", - "thp_migration_fail", - "thp_migration_split", + [I(PGMIGRATE_SUCCESS)] = "pgmigrate_success", + [I(PGMIGRATE_FAIL)] = "pgmigrate_fail", + [I(THP_MIGRATION_SUCCESS)] = "thp_migration_success", + [I(THP_MIGRATION_FAIL)] = "thp_migration_fail", + [I(THP_MIGRATION_SPLIT)] = "thp_migration_split", #endif #ifdef CONFIG_COMPACTION - "compact_migrate_scanned", - "compact_free_scanned", - "compact_isolated", - "compact_stall", - "compact_fail", - "compact_success", - "compact_daemon_wake", - "compact_daemon_migrate_scanned", - "compact_daemon_free_scanned", + [I(COMPACTMIGRATE_SCANNED)] = "compact_migrate_scanned", + [I(COMPACTFREE_SCANNED)] = "compact_free_scanned", + [I(COMPACTISOLATED)] = "compact_isolated", + [I(COMPACTSTALL)] = "compact_stall", + [I(COMPACTFAIL)] = "compact_fail", + [I(COMPACTSUCCESS)] = "compact_success", + [I(KCOMPACTD_WAKE)] = "compact_daemon_wake", + [I(KCOMPACTD_MIGRATE_SCANNED)] = "compact_daemon_migrate_scanned", + [I(KCOMPACTD_FREE_SCANNED)] = "compact_daemon_free_scanned", #endif #ifdef CONFIG_HUGETLB_PAGE - "htlb_buddy_alloc_success", - "htlb_buddy_alloc_fail", + [I(HTLB_BUDDY_PGALLOC)] = "htlb_buddy_alloc_success", + [I(HTLB_BUDDY_PGALLOC_FAIL)] = "htlb_buddy_alloc_fail", #endif #ifdef CONFIG_CMA - "cma_alloc_success", - "cma_alloc_fail", + [I(CMA_ALLOC_SUCCESS)] = "cma_alloc_success", + [I(CMA_ALLOC_FAIL)] = "cma_alloc_fail", #endif - "unevictable_pgs_culled", - "unevictable_pgs_scanned", - "unevictable_pgs_rescued", - "unevictable_pgs_mlocked", - "unevictable_pgs_munlocked", - "unevictable_pgs_cleared", - "unevictable_pgs_stranded", + [I(UNEVICTABLE_PGCULLED)] = "unevictable_pgs_culled", + [I(UNEVICTABLE_PGSCANNED)] = "unevictable_pgs_scanned", + [I(UNEVICTABLE_PGRESCUED)] = "unevictable_pgs_rescued", + [I(UNEVICTABLE_PGMLOCKED)] = "unevictable_pgs_mlocked", + [I(UNEVICTABLE_PGMUNLOCKED)] = "unevictable_pgs_munlocked", + [I(UNEVICTABLE_PGCLEARED)] = "unevictable_pgs_cleared", + [I(UNEVICTABLE_PGSTRANDED)] = "unevictable_pgs_stranded", #ifdef CONFIG_TRANSPARENT_HUGEPAGE - "thp_fault_alloc", - "thp_fault_fallback", - "thp_fault_fallback_charge", - "thp_collapse_alloc", - "thp_collapse_alloc_failed", - "thp_file_alloc", - "thp_file_fallback", - "thp_file_fallback_charge", - "thp_file_mapped", - "thp_split_page", - "thp_split_page_failed", - "thp_deferred_split_page", - "thp_underused_split_page", - "thp_split_pmd", - "thp_scan_exceed_none_pte", - "thp_scan_exceed_swap_pte", - "thp_scan_exceed_share_pte", + [I(THP_FAULT_ALLOC)] = "thp_fault_alloc", + [I(THP_FAULT_FALLBACK)] = "thp_fault_fallback", + [I(THP_FAULT_FALLBACK_CHARGE)] = "thp_fault_fallback_charge", + [I(THP_COLLAPSE_ALLOC)] = "thp_collapse_alloc", + [I(THP_COLLAPSE_ALLOC_FAILED)] = "thp_collapse_alloc_failed", + [I(THP_FILE_ALLOC)] = "thp_file_alloc", + [I(THP_FILE_FALLBACK)] = "thp_file_fallback", + [I(THP_FILE_FALLBACK_CHARGE)] = "thp_file_fallback_charge", + [I(THP_FILE_MAPPED)] = "thp_file_mapped", + [I(THP_SPLIT_PAGE)] = "thp_split_page", + 
[I(THP_SPLIT_PAGE_FAILED)] = "thp_split_page_failed", + [I(THP_DEFERRED_SPLIT_PAGE)] = "thp_deferred_split_page", + [I(THP_UNDERUSED_SPLIT_PAGE)] = "thp_underused_split_page", + [I(THP_SPLIT_PMD)] = "thp_split_pmd", + [I(THP_SCAN_EXCEED_NONE_PTE)] = "thp_scan_exceed_none_pte", + [I(THP_SCAN_EXCEED_SWAP_PTE)] = "thp_scan_exceed_swap_pte", + [I(THP_SCAN_EXCEED_SHARED_PTE)] = "thp_scan_exceed_share_pte", #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD - "thp_split_pud", + [I(THP_SPLIT_PUD)] = "thp_split_pud", #endif - "thp_zero_page_alloc", - "thp_zero_page_alloc_failed", - "thp_swpout", - "thp_swpout_fallback", + [I(THP_ZERO_PAGE_ALLOC)] = "thp_zero_page_alloc", + [I(THP_ZERO_PAGE_ALLOC_FAILED)] = "thp_zero_page_alloc_failed", + [I(THP_SWPOUT)] = "thp_swpout", + [I(THP_SWPOUT_FALLBACK)] = "thp_swpout_fallback", #endif #ifdef CONFIG_MEMORY_BALLOON - "balloon_inflate", - "balloon_deflate", + [I(BALLOON_INFLATE)] = "balloon_inflate", + [I(BALLOON_DEFLATE)] = "balloon_deflate", #ifdef CONFIG_BALLOON_COMPACTION - "balloon_migrate", + [I(BALLOON_MIGRATE)] = "balloon_migrate", #endif #endif /* CONFIG_MEMORY_BALLOON */ #ifdef CONFIG_DEBUG_TLBFLUSH - "nr_tlb_remote_flush", - "nr_tlb_remote_flush_received", - "nr_tlb_local_flush_all", - "nr_tlb_local_flush_one", + [I(NR_TLB_REMOTE_FLUSH)] = "nr_tlb_remote_flush", + [I(NR_TLB_REMOTE_FLUSH_RECEIVED)] = "nr_tlb_remote_flush_received", + [I(NR_TLB_LOCAL_FLUSH_ALL)] = "nr_tlb_local_flush_all", + [I(NR_TLB_LOCAL_FLUSH_ONE)] = "nr_tlb_local_flush_one", #endif /* CONFIG_DEBUG_TLBFLUSH */ #ifdef CONFIG_SWAP - "swap_ra", - "swap_ra_hit", - "swpin_zero", - "swpout_zero", + [I(SWAP_RA)] = "swap_ra", + [I(SWAP_RA_HIT)] = "swap_ra_hit", + [I(SWPIN_ZERO)] = "swpin_zero", + [I(SWPOUT_ZERO)] = "swpout_zero", #ifdef CONFIG_KSM - "ksm_swpin_copy", + [I(KSM_SWPIN_COPY)] = "ksm_swpin_copy", #endif #endif #ifdef CONFIG_KSM - "cow_ksm", + [I(COW_KSM)] = "cow_ksm", #endif #ifdef CONFIG_ZSWAP - "zswpin", - "zswpout", - "zswpwb", + [I(ZSWPIN)] = "zswpin", + [I(ZSWPOUT)] = "zswpout", + [I(ZSWPWB)] = "zswpwb", #endif #ifdef CONFIG_X86 - "direct_map_level2_splits", - "direct_map_level3_splits", - "direct_map_level2_collapses", - "direct_map_level3_collapses", + [I(DIRECT_MAP_LEVEL2_SPLIT)] = "direct_map_level2_splits", + [I(DIRECT_MAP_LEVEL3_SPLIT)] = "direct_map_level3_splits", + [I(DIRECT_MAP_LEVEL2_COLLAPSE)] = "direct_map_level2_collapses", + [I(DIRECT_MAP_LEVEL3_COLLAPSE)] = "direct_map_level3_collapses", #endif #ifdef CONFIG_PER_VMA_LOCK_STATS - "vma_lock_success", - "vma_lock_abort", - "vma_lock_retry", - "vma_lock_miss", + [I(VMA_LOCK_SUCCESS)] = "vma_lock_success", + [I(VMA_LOCK_ABORT)] = "vma_lock_abort", + [I(VMA_LOCK_RETRY)] = "vma_lock_retry", + [I(VMA_LOCK_MISS)] = "vma_lock_miss", #endif #ifdef CONFIG_DEBUG_STACK_USAGE - "kstack_1k", + [I(KSTACK_1K)] = "kstack_1k", #if THREAD_SIZE > 1024 - "kstack_2k", + [I(KSTACK_2K)] = "kstack_2k", #endif #if THREAD_SIZE > 2048 - "kstack_4k", + [I(KSTACK_4K)] = "kstack_4k", #endif #if THREAD_SIZE > 4096 - "kstack_8k", + [I(KSTACK_8K)] = "kstack_8k", #endif #if THREAD_SIZE > 8192 - "kstack_16k", + [I(KSTACK_16K)] = "kstack_16k", #endif #if THREAD_SIZE > 16384 - "kstack_32k", + [I(KSTACK_32K)] = "kstack_32k", #endif #if THREAD_SIZE > 32768 - "kstack_64k", + [I(KSTACK_64K)] = "kstack_64k", #endif #if THREAD_SIZE > 65536 - "kstack_rest", + [I(KSTACK_REST)] = "kstack_rest", #endif #endif -#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ +#undef I +#endif /* CONFIG_VM_EVENT_COUNTERS */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS 
|| CONFIG_NUMA || CONFIG_MEMCG */ @@ -1868,7 +1887,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) if (*pos >= NR_VMSTAT_ITEMS) return NULL; - BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS); + BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) != NR_VMSTAT_ITEMS); fold_vm_numa_events(); v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL); m->private = v; diff --git a/mm/workingset.c b/mm/workingset.c index 4841ae8af411..6e7f4cb1b9a7 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -612,7 +612,6 @@ struct list_lru shadow_nodes; void workingset_update_node(struct xa_node *node) { - struct address_space *mapping; struct page *page = virt_to_page(node); /* @@ -623,8 +622,7 @@ void workingset_update_node(struct xa_node *node) * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ - mapping = container_of(node->array, struct address_space, i_pages); - lockdep_assert_held(&mapping->i_pages.xa_lock); + lockdep_assert_held(&node->array->xa_lock); if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { diff --git a/mm/zpdesc.h b/mm/zpdesc.h index fa47fece2237..25bf5ea0beb8 100644 --- a/mm/zpdesc.h +++ b/mm/zpdesc.h @@ -7,6 +7,9 @@ #ifndef __MM_ZPDESC_H__ #define __MM_ZPDESC_H__ +#include <linux/migrate.h> +#include <linux/pagemap.h> + /* * struct zpdesc - Memory descriptor for zpool memory. * @flags: Page flags, mostly unused by zsmalloc. @@ -51,8 +54,8 @@ struct zpdesc { ZPDESC_MATCH(flags, flags); ZPDESC_MATCH(lru, lru); ZPDESC_MATCH(mapping, movable_ops); -ZPDESC_MATCH(index, next); -ZPDESC_MATCH(index, handle); +ZPDESC_MATCH(__folio_index, next); +ZPDESC_MATCH(__folio_index, handle); ZPDESC_MATCH(private, zspage); ZPDESC_MATCH(page_type, first_obj_offset); ZPDESC_MATCH(_refcount, _refcount); @@ -149,10 +152,9 @@ static inline struct zpdesc *pfn_zpdesc(unsigned long pfn) return page_zpdesc(pfn_to_page(pfn)); } -static inline void __zpdesc_set_movable(struct zpdesc *zpdesc, - const struct movable_operations *mops) +static inline void __zpdesc_set_movable(struct zpdesc *zpdesc) { - __SetPageMovable(zpdesc_page(zpdesc), mops); + SetPageMovableOps(zpdesc_page(zpdesc)); } static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc) @@ -160,16 +162,6 @@ static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc) __SetPageZsmalloc(zpdesc_page(zpdesc)); } -static inline void __zpdesc_clear_zsmalloc(struct zpdesc *zpdesc) -{ - __ClearPageZsmalloc(zpdesc_page(zpdesc)); -} - -static inline bool zpdesc_is_isolated(struct zpdesc *zpdesc) -{ - return PageIsolated(zpdesc_page(zpdesc)); -} - static inline struct zone *zpdesc_zone(struct zpdesc *zpdesc) { return page_zone(zpdesc_page(zpdesc)); diff --git a/mm/zpool.c b/mm/zpool.c index 6d6d88930932..0a71d03369f1 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -226,20 +226,22 @@ const char *zpool_get_type(struct zpool *zpool) * @size: The amount of memory to allocate. * @gfp: The GFP flags to use when allocating memory. * @handle: Pointer to the handle to set + * @nid: The preferred node id. * * This allocates the requested amount of memory from the pool. * The gfp flags will be used when allocating memory, if the * implementation supports it. The provided @handle will be - * set to the allocated object handle. + * set to the allocated object handle. The allocation will + * prefer the NUMA node specified by @nid. * * Implementations must guarantee this to be thread-safe. * * Returns: 0 on success, negative value on error. 
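A hedged illustration of how a caller is expected to use the new @nid argument (a hypothetical helper; the real call site is the zswap_compress() hunk further down, which passes page_to_nid() so the compressed copy stays on the same node as the page being stored):

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/zpool.h>

/* Hypothetical wrapper mirroring the zswap call site below. */
static int store_compressed(struct zpool *zpool, struct page *page,
			    size_t dlen, unsigned long *handle)
{
	gfp_t gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;

	/* Prefer the node of the page whose contents are being stored. */
	return zpool_malloc(zpool, dlen, gfp, handle, page_to_nid(page));
}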
*/ int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, - unsigned long *handle) + unsigned long *handle, const int nid) { - return zpool->driver->malloc(zpool->pool, size, gfp, handle); + return zpool->driver->malloc(zpool->pool, size, gfp, handle, nid); } /** diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 961b270f023c..2c5e56a65354 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -26,17 +26,10 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> -#include <linux/bitops.h> #include <linux/errno.h> #include <linux/highmem.h> #include <linux/string.h> #include <linux/slab.h> -#include <linux/pgtable.h> -#include <asm/tlbflush.h> -#include <linux/cpumask.h> -#include <linux/cpu.h> -#include <linux/vmalloc.h> -#include <linux/preempt.h> #include <linux/spinlock.h> #include <linux/sprintf.h> #include <linux/shrinker.h> @@ -44,11 +37,8 @@ #include <linux/debugfs.h> #include <linux/zsmalloc.h> #include <linux/zpool.h> -#include <linux/migrate.h> -#include <linux/wait.h> -#include <linux/pagemap.h> #include <linux/fs.h> -#include <linux/local_lock.h> +#include <linux/workqueue.h> #include "zpdesc.h" #define ZSPAGE_MAGIC 0x58 @@ -243,9 +233,9 @@ static inline void zpdesc_dec_zone_page_state(struct zpdesc *zpdesc) dec_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES); } -static inline struct zpdesc *alloc_zpdesc(gfp_t gfp) +static inline struct zpdesc *alloc_zpdesc(gfp_t gfp, const int nid) { - struct page *page = alloc_page(gfp); + struct page *page = alloc_pages_node(nid, gfp, 0); return page_zpdesc(page); } @@ -254,6 +244,7 @@ static inline void free_zpdesc(struct zpdesc *zpdesc) { struct page *page = zpdesc_page(zpdesc); + /* PageZsmalloc is sticky until the page is freed to the buddy. */ __free_page(page); } @@ -462,9 +453,9 @@ static void zs_zpool_destroy(void *pool) } static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, - unsigned long *handle) + unsigned long *handle, const int nid) { - *handle = zs_malloc(pool, size, gfp); + *handle = zs_malloc(pool, size, gfp, nid); if (IS_ERR_VALUE(*handle)) return PTR_ERR((void *)*handle); @@ -886,11 +877,10 @@ static void reset_zpdesc(struct zpdesc *zpdesc) { struct page *page = zpdesc_page(zpdesc); - __ClearPageMovable(page); ClearPagePrivate(page); zpdesc->zspage = NULL; zpdesc->next = NULL; - __ClearPageZsmalloc(page); + /* PageZsmalloc is sticky until the page is freed to the buddy. 
*/ } static int trylock_zspage(struct zspage *zspage) @@ -1043,8 +1033,8 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage, * Allocate a zspage for the given size class */ static struct zspage *alloc_zspage(struct zs_pool *pool, - struct size_class *class, - gfp_t gfp) + struct size_class *class, + gfp_t gfp, const int nid) { int i; struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE]; @@ -1053,6 +1043,9 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, if (!zspage) return NULL; + if (!IS_ENABLED(CONFIG_COMPACTION)) + gfp &= ~__GFP_MOVABLE; + zspage->magic = ZSPAGE_MAGIC; zspage->pool = pool; zspage->class = class->index; @@ -1061,11 +1054,10 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, for (i = 0; i < class->pages_per_zspage; i++) { struct zpdesc *zpdesc; - zpdesc = alloc_zpdesc(gfp); + zpdesc = alloc_zpdesc(gfp, nid); if (!zpdesc) { while (--i >= 0) { zpdesc_dec_zone_page_state(zpdescs[i]); - __zpdesc_clear_zsmalloc(zpdescs[i]); free_zpdesc(zpdescs[i]); } cache_free_zspage(pool, zspage); @@ -1243,19 +1235,19 @@ void zs_obj_write(struct zs_pool *pool, unsigned long handle, class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); - if (off + class->size <= PAGE_SIZE) { + if (!ZsHugePage(zspage)) + off += ZS_HANDLE_SIZE; + + if (off + mem_len <= PAGE_SIZE) { /* this object is contained entirely within a page */ void *dst = kmap_local_zpdesc(zpdesc); - if (!ZsHugePage(zspage)) - off += ZS_HANDLE_SIZE; memcpy(dst + off, handle_mem, mem_len); kunmap_local(dst); } else { /* this object spans two pages */ size_t sizes[2]; - off += ZS_HANDLE_SIZE; sizes[0] = PAGE_SIZE - off; sizes[1] = mem_len - sizes[0]; @@ -1336,12 +1328,14 @@ static unsigned long obj_malloc(struct zs_pool *pool, * @pool: pool to allocate from * @size: size of block to allocate * @gfp: gfp flags when allocating object + * @nid: The preferred node id to allocate new zspage (if needed) * * On success, handle to the allocated object is returned, * otherwise an ERR_PTR(). * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. */ -unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp, + const int nid) { unsigned long handle; struct size_class *class; @@ -1376,7 +1370,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) spin_unlock(&class->lock); - zspage = alloc_zspage(pool, class, gfp); + zspage = alloc_zspage(pool, class, gfp, nid); if (!zspage) { cache_free_handle(pool, handle); return (unsigned long)ERR_PTR(-ENOMEM); @@ -1694,8 +1688,6 @@ static void lock_zspage(struct zspage *zspage) #ifdef CONFIG_COMPACTION -static const struct movable_operations zsmalloc_mops; - static void replace_sub_page(struct size_class *class, struct zspage *zspage, struct zpdesc *newzpdesc, struct zpdesc *oldzpdesc) { @@ -1718,18 +1710,17 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, set_first_obj_offset(newzpdesc, first_obj_offset); if (unlikely(ZsHugePage(zspage))) newzpdesc->handle = oldzpdesc->handle; - __zpdesc_set_movable(newzpdesc, &zsmalloc_mops); + __zpdesc_set_movable(newzpdesc); } static bool zs_page_isolate(struct page *page, isolate_mode_t mode) { /* - * Page is locked so zspage couldn't be destroyed. For detail, look at - * lock_zspage in free_zspage. + * Page is locked so zspage can't be destroyed concurrently + * (see free_zspage()). 
But if the page was already destroyed + * (see reset_zpdesc()), refuse isolation here. */ - VM_BUG_ON_PAGE(PageIsolated(page), page); - - return true; + return page_zpdesc(page)->zspage; } static int zs_page_migrate(struct page *newpage, struct page *page, @@ -1747,7 +1738,15 @@ static int zs_page_migrate(struct page *newpage, struct page *page, unsigned long old_obj, new_obj; unsigned int obj_idx; - VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc)); + /* + * TODO: nothing prevents a zspage from getting destroyed while + * it is isolated for migration, as the page lock is temporarily + * dropped after zs_page_isolate() succeeded: we should rework that + * and defer destroying such pages once they are un-isolated (putback) + * instead. + */ + if (!zpdesc->zspage) + return MIGRATEPAGE_SUCCESS; /* The page is locked, so this pointer must remain valid */ zspage = get_zspage(zpdesc); @@ -1819,10 +1818,9 @@ static int zs_page_migrate(struct page *newpage, struct page *page, static void zs_page_putback(struct page *page) { - VM_BUG_ON_PAGE(!PageIsolated(page), page); } -static const struct movable_operations zsmalloc_mops = { +const struct movable_operations zsmalloc_mops = { .isolate_page = zs_page_isolate, .migrate_page = zs_page_migrate, .putback_page = zs_page_putback, @@ -1885,7 +1883,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) do { WARN_ON(!zpdesc_trylock(zpdesc)); - __zpdesc_set_movable(zpdesc, &zsmalloc_mops); + __zpdesc_set_movable(zpdesc); zpdesc_unlock(zpdesc); } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL); } diff --git a/mm/zswap.c b/mm/zswap.c index 0dcc54eab58b..3c0fd8a13718 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -883,18 +883,32 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + struct acomp_req *req; + struct crypto_acomp *acomp; + u8 *buffer; + + if (IS_ERR_OR_NULL(acomp_ctx)) + return 0; mutex_lock(&acomp_ctx->mutex); - if (!IS_ERR_OR_NULL(acomp_ctx)) { - if (!IS_ERR_OR_NULL(acomp_ctx->req)) - acomp_request_free(acomp_ctx->req); - acomp_ctx->req = NULL; - if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) - crypto_free_acomp(acomp_ctx->acomp); - kfree(acomp_ctx->buffer); - } + req = acomp_ctx->req; + acomp = acomp_ctx->acomp; + buffer = acomp_ctx->buffer; + acomp_ctx->req = NULL; + acomp_ctx->acomp = NULL; + acomp_ctx->buffer = NULL; mutex_unlock(&acomp_ctx->mutex); + /* + * Do the actual freeing after releasing the mutex to avoid subtle + * locking dependencies causing deadlocks. 
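The detach-then-free pattern used here can be reduced to a minimal sketch with generic names (not the actual zswap context structure): take the mutex only to unhook the resources, then release them without any lock held.

#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/types.h>

struct comp_ctx {
	struct mutex lock;
	u8 *buffer;
};

static void comp_ctx_teardown(struct comp_ctx *ctx)
{
	u8 *buffer;

	/* Detach under the lock so concurrent users see a consistent state. */
	mutex_lock(&ctx->lock);
	buffer = ctx->buffer;
	ctx->buffer = NULL;
	mutex_unlock(&ctx->lock);

	/* Free outside the lock so kfree() never nests under ctx->lock. */
	kfree(buffer);
}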
+ */ + if (!IS_ERR_OR_NULL(req)) + acomp_request_free(req); + if (!IS_ERR_OR_NULL(acomp)) + crypto_free_acomp(acomp); + kfree(buffer); + return 0; } @@ -967,7 +981,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, zpool = pool->zpool; gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE; - alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle); + alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle, page_to_nid(page)); if (alloc_ret) goto unlock; @@ -1056,9 +1070,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry, struct mempolicy *mpol; bool folio_was_allocated; struct swap_info_struct *si; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - }; int ret = 0; /* try to allocate swap cache folio */ @@ -1120,7 +1131,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, folio_set_reclaim(folio); /* start writeback */ - __swap_writepage(folio, &wbc); + __swap_writepage(folio, NULL); out: if (ret && ret != -EEXIST) { |