Diffstat (limited to 'mm')
93 files changed, 5970 insertions, 4002 deletions
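Much of the DAMON portion of this diff replaces the old damon_callback hooks (after_wmarks_check, after_aggregation, before_terminate) with repeatable damon_call() requests handled by the kdamond main loop. Below is a minimal, illustrative sketch of that repeat-mode pattern, modeled on the damon_lru_sort and damon_reclaim conversions in this diff; the my_mod_* names are hypothetical and not part of the patch.

#include <linux/damon.h>

/* Hypothetical stat mirror updated from kdamond context. */
static struct damos_stat my_mod_stat;

static int my_mod_call_fn(void *arg)
{
	struct damon_ctx *c = arg;
	struct damos *s;

	/* Runs in the kdamond main loop, after each sampling iteration. */
	damon_for_each_scheme(s, c)
		my_mod_stat = s->stat;
	return 0;
}

static struct damon_call_control my_mod_call_control = {
	.fn = my_mod_call_fn,
	/* Re-queued by kdamond each iteration instead of completing once. */
	.repeat = true,
};

static int my_mod_start(struct damon_ctx *ctx)
{
	my_mod_call_control.data = ctx;
	/* With .repeat set, damon_call() returns right after queueing. */
	return damon_call(ctx, &my_mod_call_control);
}

This mirrors how the converted modules drop their ctx->callback assignments and instead register one repeating control right after damon_start(), as the lru_sort, reclaim, and new stat module hunks below show.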
diff --git a/mm/Kconfig b/mm/Kconfig index 781be3240e21..d5d4eca947a6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -934,6 +934,13 @@ config ARCH_SUPPORTS_PUD_PFNMAP depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD # +# Architectures that always use weak definitions for percpu +# variables in modules should set this. +# +config ARCH_MODULE_NEEDS_WEAK_PER_CPU + bool + +# # UP and nommu archs use km based percpu allocator # config NEED_PER_CPU_KM @@ -1005,8 +1012,8 @@ config ARCH_FORCE_MAX_ORDER # the default page block order is MAX_PAGE_ORDER (10) as per # include/linux/mmzone.h. # -config PAGE_BLOCK_ORDER - int "Page Block Order" +config PAGE_BLOCK_MAX_ORDER + int "Page Block Order Upper Limit" range 1 10 if ARCH_FORCE_MAX_ORDER = 0 default 10 if ARCH_FORCE_MAX_ORDER = 0 range 1 ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0 @@ -1014,15 +1021,16 @@ config PAGE_BLOCK_ORDER help The page block order refers to the power of two number of pages that are physically contiguous and can have a migrate type associated to - them. The maximum size of the page block order is limited by - ARCH_FORCE_MAX_ORDER. + them. The maximum size of the page block order is at least limited by + ARCH_FORCE_MAX_ORDER/MAX_PAGE_ORDER. - This config allows overriding the default page block order when the - page block order is required to be smaller than ARCH_FORCE_MAX_ORDER - or MAX_PAGE_ORDER. + This config adds a new upper limit of default page block + order when the page block order is required to be smaller than + ARCH_FORCE_MAX_ORDER/MAX_PAGE_ORDER or other limits + (see include/linux/pageblock-flags.h for details). Reducing pageblock order can negatively impact THP generation - success rate. If your workloads uses THP heavily, please use this + success rate. If your workloads use THP heavily, please use this option with caution. Don't change if unsure. @@ -1109,9 +1117,6 @@ config ARCH_HAS_CURRENT_STACK_POINTER register alias named "current_stack_pointer", this config can be selected. -config ARCH_HAS_PTE_DEVMAP - bool - config ARCH_HAS_ZONE_DMA_SET bool @@ -1129,7 +1134,6 @@ config ZONE_DEVICE depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP - depends on ARCH_HAS_PTE_DEVMAP select XARRAY_MULTI help diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index d3e00731e262..2a4a649805c1 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -94,13 +94,8 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, if (!trylock_page(page)) continue; - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION) && - PageIsolated(page)) { - /* raced with isolation */ - unlock_page(page); - continue; - } - balloon_page_delete(page); + list_del(&page->lru); + balloon_page_finalize(page); __count_vm_event(BALLOON_DEFLATE); list_add(&page->lru, pages); unlock_page(page); @@ -211,6 +206,9 @@ static bool balloon_page_isolate(struct page *page, isolate_mode_t mode) struct balloon_dev_info *b_dev_info = balloon_page_device(page); unsigned long flags; + if (!b_dev_info) + return false; + spin_lock_irqsave(&b_dev_info->pages_lock, flags); list_del(&page->lru); b_dev_info->isolated_pages++; @@ -224,6 +222,10 @@ static void balloon_page_putback(struct page *page) struct balloon_dev_info *b_dev_info = balloon_page_device(page); unsigned long flags; + /* Isolated balloon pages cannot get deflated. 
*/ + if (WARN_ON_ONCE(!b_dev_info)) + return; + spin_lock_irqsave(&b_dev_info->pages_lock, flags); list_add(&page->lru, &b_dev_info->pages); b_dev_info->isolated_pages--; @@ -239,6 +241,10 @@ static int balloon_page_migrate(struct page *newpage, struct page *page, VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + /* Isolated balloon pages cannot get deflated. */ + if (WARN_ON_ONCE(!balloon)) + return -EAGAIN; + return balloon->migratepage(balloon, newpage, page, mode); } @@ -247,6 +253,5 @@ const struct movable_operations balloon_mops = { .isolate_page = balloon_page_isolate, .putback_page = balloon_page_putback, }; -EXPORT_SYMBOL_GPL(balloon_mops); #endif /* CONFIG_BALLOON_COMPACTION */ @@ -22,6 +22,7 @@ #include <linux/mm.h> #include <linux/sizes.h> #include <linux/slab.h> +#include <linux/string_choices.h> #include <linux/log2.h> #include <linux/cma.h> #include <linux/highmem.h> @@ -35,12 +36,6 @@ struct cma cma_areas[MAX_CMA_AREAS]; unsigned int cma_area_count; -static int __init __cma_declare_contiguous_nid(phys_addr_t *basep, - phys_addr_t size, phys_addr_t limit, - phys_addr_t alignment, unsigned int order_per_bit, - bool fixed, const char *name, struct cma **res_cma, - int nid); - phys_addr_t cma_get_base(const struct cma *cma) { WARN_ON_ONCE(cma->nranges != 1); @@ -358,6 +353,168 @@ static void __init list_insert_sorted( } } +static int __init cma_fixed_reserve(phys_addr_t base, phys_addr_t size) +{ + if (IS_ENABLED(CONFIG_HIGHMEM)) { + phys_addr_t highmem_start = __pa(high_memory - 1) + 1; + + /* + * If allocating at a fixed base the request region must not + * cross the low/high memory boundary. + */ + if (base < highmem_start && base + size > highmem_start) { + pr_err("Region at %pa defined on low/high memory boundary (%pa)\n", + &base, &highmem_start); + return -EINVAL; + } + } + + if (memblock_is_region_reserved(base, size) || + memblock_reserve(base, size) < 0) { + return -EBUSY; + } + + return 0; +} + +static phys_addr_t __init cma_alloc_mem(phys_addr_t base, phys_addr_t size, + phys_addr_t align, phys_addr_t limit, int nid) +{ + phys_addr_t addr = 0; + + /* + * If there is enough memory, try a bottom-up allocation first. + * It will place the new cma area close to the start of the node + * and guarantee that the compaction is moving pages out of the + * cma area and not into it. + * Avoid using first 4GB to not interfere with constrained zones + * like DMA/DMA32. + */ +#ifdef CONFIG_PHYS_ADDR_T_64BIT + if (!memblock_bottom_up() && limit >= SZ_4G + size) { + memblock_set_bottom_up(true); + addr = memblock_alloc_range_nid(size, align, SZ_4G, limit, + nid, true); + memblock_set_bottom_up(false); + } +#endif + + /* + * On systems with HIGHMEM try allocating from there before consuming + * memory in lower zones. + */ + if (!addr && IS_ENABLED(CONFIG_HIGHMEM)) { + phys_addr_t highmem = __pa(high_memory - 1) + 1; + + /* + * All pages in the reserved area must come from the same zone. + * If the requested region crosses the low/high memory boundary, + * try allocating from high memory first and fall back to low + * memory in case of failure. 
+ */ + if (base < highmem && limit > highmem) { + addr = memblock_alloc_range_nid(size, align, highmem, + limit, nid, true); + limit = highmem; + } + } + + if (!addr) + addr = memblock_alloc_range_nid(size, align, base, limit, nid, + true); + + return addr; +} + +static int __init __cma_declare_contiguous_nid(phys_addr_t *basep, + phys_addr_t size, phys_addr_t limit, + phys_addr_t alignment, unsigned int order_per_bit, + bool fixed, const char *name, struct cma **res_cma, + int nid) +{ + phys_addr_t memblock_end = memblock_end_of_DRAM(); + phys_addr_t base = *basep; + int ret; + + pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", + __func__, &size, &base, &limit, &alignment); + + if (cma_area_count == ARRAY_SIZE(cma_areas)) { + pr_err("Not enough slots for CMA reserved regions!\n"); + return -ENOSPC; + } + + if (!size) + return -EINVAL; + + if (alignment && !is_power_of_2(alignment)) + return -EINVAL; + + if (!IS_ENABLED(CONFIG_NUMA)) + nid = NUMA_NO_NODE; + + /* Sanitise input arguments. */ + alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES); + if (fixed && base & (alignment - 1)) { + pr_err("Region at %pa must be aligned to %pa bytes\n", + &base, &alignment); + return -EINVAL; + } + base = ALIGN(base, alignment); + size = ALIGN(size, alignment); + limit &= ~(alignment - 1); + + if (!base) + fixed = false; + + /* size should be aligned with order_per_bit */ + if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) + return -EINVAL; + + + /* + * If the limit is unspecified or above the memblock end, its effective + * value will be the memblock end. Set it explicitly to simplify further + * checks. + */ + if (limit == 0 || limit > memblock_end) + limit = memblock_end; + + if (base + size > limit) { + pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n", + &size, &base, &limit); + return -EINVAL; + } + + /* Reserve memory */ + if (fixed) { + ret = cma_fixed_reserve(base, size); + if (ret) + return ret; + } else { + base = cma_alloc_mem(base, size, alignment, limit, nid); + if (!base) + return -ENOMEM; + + /* + * kmemleak scans/reads tracked objects for pointers to other + * objects but this address isn't mapped and accessible + */ + kmemleak_ignore_phys(base); + } + + ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); + if (ret) { + memblock_phys_free(base, size); + return ret; + } + + (*res_cma)->nid = nid; + *basep = base; + + return 0; +} + /* * Create CMA areas with a total size of @total_size. A normal allocation * for one area is tried first. If that fails, the biggest memblock @@ -548,8 +705,7 @@ out: (unsigned long)total_size / SZ_1M); else pr_info("Reserved %lu MiB in %d range%s\n", - (unsigned long)total_size / SZ_1M, nr, - nr > 1 ? "s" : ""); + (unsigned long)total_size / SZ_1M, nr, str_plural(nr)); return ret; } @@ -593,154 +749,9 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, return ret; } -static int __init __cma_declare_contiguous_nid(phys_addr_t *basep, - phys_addr_t size, phys_addr_t limit, - phys_addr_t alignment, unsigned int order_per_bit, - bool fixed, const char *name, struct cma **res_cma, - int nid) -{ - phys_addr_t memblock_end = memblock_end_of_DRAM(); - phys_addr_t highmem_start, base = *basep; - int ret; - - /* - * We can't use __pa(high_memory) directly, since high_memory - * isn't a valid direct map VA, and DEBUG_VIRTUAL will (validly) - * complain. Find the boundary by adding one to the last valid - * address. 
- */ - if (IS_ENABLED(CONFIG_HIGHMEM)) - highmem_start = __pa(high_memory - 1) + 1; - else - highmem_start = memblock_end_of_DRAM(); - pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", - __func__, &size, &base, &limit, &alignment); - - if (cma_area_count == ARRAY_SIZE(cma_areas)) { - pr_err("Not enough slots for CMA reserved regions!\n"); - return -ENOSPC; - } - - if (!size) - return -EINVAL; - - if (alignment && !is_power_of_2(alignment)) - return -EINVAL; - - if (!IS_ENABLED(CONFIG_NUMA)) - nid = NUMA_NO_NODE; - - /* Sanitise input arguments. */ - alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES); - if (fixed && base & (alignment - 1)) { - pr_err("Region at %pa must be aligned to %pa bytes\n", - &base, &alignment); - return -EINVAL; - } - base = ALIGN(base, alignment); - size = ALIGN(size, alignment); - limit &= ~(alignment - 1); - - if (!base) - fixed = false; - - /* size should be aligned with order_per_bit */ - if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) - return -EINVAL; - - /* - * If allocating at a fixed base the request region must not cross the - * low/high memory boundary. - */ - if (fixed && base < highmem_start && base + size > highmem_start) { - pr_err("Region at %pa defined on low/high memory boundary (%pa)\n", - &base, &highmem_start); - return -EINVAL; - } - - /* - * If the limit is unspecified or above the memblock end, its effective - * value will be the memblock end. Set it explicitly to simplify further - * checks. - */ - if (limit == 0 || limit > memblock_end) - limit = memblock_end; - - if (base + size > limit) { - pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n", - &size, &base, &limit); - return -EINVAL; - } - - /* Reserve memory */ - if (fixed) { - if (memblock_is_region_reserved(base, size) || - memblock_reserve(base, size) < 0) { - return -EBUSY; - } - } else { - phys_addr_t addr = 0; - - /* - * If there is enough memory, try a bottom-up allocation first. - * It will place the new cma area close to the start of the node - * and guarantee that the compaction is moving pages out of the - * cma area and not into it. - * Avoid using first 4GB to not interfere with constrained zones - * like DMA/DMA32. - */ -#ifdef CONFIG_PHYS_ADDR_T_64BIT - if (!memblock_bottom_up() && memblock_end >= SZ_4G + size) { - memblock_set_bottom_up(true); - addr = memblock_alloc_range_nid(size, alignment, SZ_4G, - limit, nid, true); - memblock_set_bottom_up(false); - } -#endif - - /* - * All pages in the reserved area must come from the same zone. - * If the requested region crosses the low/high memory boundary, - * try allocating from high memory first and fall back to low - * memory in case of failure. 
- */ - if (!addr && base < highmem_start && limit > highmem_start) { - addr = memblock_alloc_range_nid(size, alignment, - highmem_start, limit, nid, true); - limit = highmem_start; - } - - if (!addr) { - addr = memblock_alloc_range_nid(size, alignment, base, - limit, nid, true); - if (!addr) - return -ENOMEM; - } - - /* - * kmemleak scans/reads tracked objects for pointers to other - * objects but this address isn't mapped and accessible - */ - kmemleak_ignore_phys(addr); - base = addr; - } - - ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); - if (ret) { - memblock_phys_free(base, size); - return ret; - } - - (*res_cma)->nid = nid; - *basep = base; - - return 0; -} - static void cma_debug_show_areas(struct cma *cma) { - unsigned long next_zero_bit, next_set_bit, nr_zero; - unsigned long start; + unsigned long start, end; unsigned long nr_part; unsigned long nbits; int r; @@ -751,22 +762,12 @@ static void cma_debug_show_areas(struct cma *cma) for (r = 0; r < cma->nranges; r++) { cmr = &cma->ranges[r]; - start = 0; nbits = cma_bitmap_maxno(cma, cmr); pr_info("range %d: ", r); - for (;;) { - next_zero_bit = find_next_zero_bit(cmr->bitmap, - nbits, start); - if (next_zero_bit >= nbits) - break; - next_set_bit = find_next_bit(cmr->bitmap, nbits, - next_zero_bit); - nr_zero = next_set_bit - next_zero_bit; - nr_part = nr_zero << cma->order_per_bit; - pr_cont("%s%lu@%lu", start ? "+" : "", nr_part, - next_zero_bit); - start = next_zero_bit + nr_zero; + for_each_clear_bitrange(start, end, cmr->bitmap, nbits) { + nr_part = (end - start) << cma->order_per_bit; + pr_cont("%s%lu@%lu", start ? "+" : "", nr_part, start); } pr_info("\n"); } @@ -822,7 +823,7 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma->alloc_mutex); - ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, gfp); + ret = alloc_contig_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp); mutex_unlock(&cma->alloc_mutex); if (ret == 0) { page = pfn_to_page(pfn); @@ -854,8 +855,6 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, unsigned long i; const char *name = cma ? 
cma->name : NULL; - trace_cma_alloc_start(name, count, align); - if (!cma || !cma->count) return page; @@ -865,6 +864,8 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, if (!count) return page; + trace_cma_alloc_start(name, count, align); + for (r = 0; r < cma->nranges; r++) { page = NULL; diff --git a/mm/cma_debug.c b/mm/cma_debug.c index fdf899532ca0..8c7d7f8e8fbd 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -56,16 +56,8 @@ static int cma_maxchunk_get(void *data, u64 *val) for (r = 0; r < cma->nranges; r++) { cmr = &cma->ranges[r]; bitmap_maxno = cma_bitmap_maxno(cma, cmr); - end = 0; - for (;;) { - start = find_next_zero_bit(cmr->bitmap, - bitmap_maxno, end); - if (start >= bitmap_maxno) - break; - end = find_next_bit(cmr->bitmap, bitmap_maxno, - start); + for_each_clear_bitrange(start, end, cmr->bitmap, bitmap_maxno) maxchunk = max(end - start, maxchunk); - } } spin_unlock_irq(&cma->lock); *val = (u64)maxchunk << cma->order_per_bit; diff --git a/mm/compaction.c b/mm/compaction.c index 3925cb61dbb8..bf021b31c7ec 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -114,39 +114,6 @@ static unsigned long release_free_list(struct list_head *freepages) } #ifdef CONFIG_COMPACTION -bool PageMovable(struct page *page) -{ - const struct movable_operations *mops; - - VM_BUG_ON_PAGE(!PageLocked(page), page); - if (!__PageMovable(page)) - return false; - - mops = page_movable_ops(page); - if (mops) - return true; - - return false; -} - -void __SetPageMovable(struct page *page, const struct movable_operations *mops) -{ - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE((unsigned long)mops & PAGE_MAPPING_MOVABLE, page); - page->mapping = (void *)((unsigned long)mops | PAGE_MAPPING_MOVABLE); -} -EXPORT_SYMBOL(__SetPageMovable); - -void __ClearPageMovable(struct page *page) -{ - VM_BUG_ON_PAGE(!PageMovable(page), page); - /* - * This page still has the type of a movable page, but it's - * actually not movable any more. - */ - page->mapping = (void *)PAGE_MAPPING_MOVABLE; -} -EXPORT_SYMBOL(__ClearPageMovable); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -1082,18 +1049,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * Skip any other type of page */ if (!PageLRU(page)) { - /* - * __PageMovable can return false positive so we need - * to verify it under page_lock. - */ - if (unlikely(__PageMovable(page)) && - !PageIsolated(page)) { + /* Isolation code will deal with any races. */ + if (unlikely(page_has_movable_ops(page)) && + !PageMovableOpsIsolated(page)) { if (locked) { unlock_page_lruvec_irqrestore(locked, flags); locked = NULL; } - if (isolate_movable_page(page, mode)) { + if (isolate_movable_ops_page(page, mode)) { folio = page_folio(page); goto isolate_success; } diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 551745df011b..b3171f9406c1 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -4,7 +4,6 @@ menu "Data Access Monitoring" config DAMON bool "DAMON: Data Access Monitoring Framework" - default y help This builds a framework that allows kernel subsystems to monitor access frequency of each memory region. The information can be useful @@ -95,4 +94,20 @@ config DAMON_LRU_SORT protect frequently accessed (hot) pages while rarely accessed (cold) pages reclaimed first under memory pressure. +config DAMON_STAT + bool "Build data access monitoring stat (DAMON_STAT)" + depends on DAMON_PADDR + help + This builds the DAMON-based access monitoring statistics subsystem. 
+ It runs DAMON and expose access monitoring results in simple stat + metrics. + +config DAMON_STAT_ENABLED_DEFAULT + bool "Enable DAMON_STAT by default" + depends on DAMON_PADDR + default DAMON_STAT + help + Whether to enable DAMON_STAT by default. Users can disable it in + boot or runtime using its 'enabled' parameter. + endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 8b49012ba8c3..d8d6bf5f8bff 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o +obj-$(CONFIG_DAMON_STAT) += modules-common.o stat.o diff --git a/mm/damon/core.c b/mm/damon/core.c index b217e0120e09..52a48c9316bc 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -407,6 +407,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, scheme->wmarks = *wmarks; scheme->wmarks.activated = true; + scheme->migrate_dests = (struct damos_migrate_dests){}; scheme->target_nid = target_nid; return scheme; @@ -449,6 +450,9 @@ void damon_destroy_scheme(struct damos *s) damos_for_each_filter_safe(f, next, s) damos_destroy_filter(f); + + kfree(s->migrate_dests.node_id_arr); + kfree(s->migrate_dests.weight_arr); damon_del_scheme(s); damon_free_scheme(s); } @@ -498,8 +502,12 @@ void damon_free_target(struct damon_target *t) kfree(t); } -void damon_destroy_target(struct damon_target *t) +void damon_destroy_target(struct damon_target *t, struct damon_ctx *ctx) { + + if (ctx && ctx->ops.cleanup_target) + ctx->ops.cleanup_target(t); + damon_del_target(t); damon_free_target(t); } @@ -529,7 +537,8 @@ struct damon_ctx *damon_new_ctx(void) ctx->next_ops_update_sis = 0; mutex_init(&ctx->kdamond_lock); - mutex_init(&ctx->call_control_lock); + INIT_LIST_HEAD(&ctx->call_controls); + mutex_init(&ctx->call_controls_lock); mutex_init(&ctx->walk_control_lock); ctx->attrs.min_nr_regions = 10; @@ -545,13 +554,8 @@ static void damon_destroy_targets(struct damon_ctx *ctx) { struct damon_target *t, *next_t; - if (ctx->ops.cleanup) { - ctx->ops.cleanup(ctx); - return; - } - damon_for_each_target_safe(t, next_t, ctx) - damon_destroy_target(t); + damon_destroy_target(t, ctx); } void damon_destroy_ctx(struct damon_ctx *ctx) @@ -676,9 +680,7 @@ static bool damon_valid_intervals_goal(struct damon_attrs *attrs) * @attrs: monitoring attributes * * This function should be called while the kdamond is not running, an access - * check results aggregation is not ongoing (e.g., from &struct - * damon_callback->after_aggregation or &struct - * damon_callback->after_wmarks_check callbacks), or from damon_call(). + * check results aggregation is not ongoing (e.g., from damon_call(). * * Every time interval is in micro-seconds. 
* @@ -754,6 +756,19 @@ static struct damos_quota_goal *damos_nth_quota_goal( return NULL; } +static void damos_commit_quota_goal_union( + struct damos_quota_goal *dst, struct damos_quota_goal *src) +{ + switch (dst->metric) { + case DAMOS_QUOTA_NODE_MEM_USED_BP: + case DAMOS_QUOTA_NODE_MEM_FREE_BP: + dst->nid = src->nid; + break; + default: + break; + } +} + static void damos_commit_quota_goal( struct damos_quota_goal *dst, struct damos_quota_goal *src) { @@ -762,6 +777,7 @@ static void damos_commit_quota_goal( if (dst->metric == DAMOS_QUOTA_USER_INPUT) dst->current_value = src->current_value; /* keep last_psi_total as is, since it will be updated in next cycle */ + damos_commit_quota_goal_union(dst, src); } /** @@ -774,7 +790,7 @@ static void damos_commit_quota_goal( * DAMON contexts, instead of manual in-place updates. * * This function should be called from parameters-update safe context, like - * DAMON callbacks. + * damon_call(). */ int damos_commit_quota_goals(struct damos_quota *dst, struct damos_quota *src) { @@ -795,6 +811,7 @@ int damos_commit_quota_goals(struct damos_quota *dst, struct damos_quota *src) src_goal->metric, src_goal->target_value); if (!new_goal) return -ENOMEM; + damos_commit_quota_goal_union(new_goal, src_goal); damos_add_quota_goal(dst, new_goal); } return 0; @@ -939,6 +956,41 @@ static void damos_set_filters_default_reject(struct damos *s) damos_filters_default_reject(&s->ops_filters); } +static int damos_commit_dests(struct damos *dst, struct damos *src) +{ + struct damos_migrate_dests *dst_dests, *src_dests; + + dst_dests = &dst->migrate_dests; + src_dests = &src->migrate_dests; + + if (dst_dests->nr_dests != src_dests->nr_dests) { + kfree(dst_dests->node_id_arr); + kfree(dst_dests->weight_arr); + + dst_dests->node_id_arr = kmalloc_array(src_dests->nr_dests, + sizeof(*dst_dests->node_id_arr), GFP_KERNEL); + if (!dst_dests->node_id_arr) { + dst_dests->weight_arr = NULL; + return -ENOMEM; + } + + dst_dests->weight_arr = kmalloc_array(src_dests->nr_dests, + sizeof(*dst_dests->weight_arr), GFP_KERNEL); + if (!dst_dests->weight_arr) { + /* ->node_id_arr will be freed by scheme destruction */ + return -ENOMEM; + } + } + + dst_dests->nr_dests = src_dests->nr_dests; + for (int i = 0; i < src_dests->nr_dests; i++) { + dst_dests->node_id_arr[i] = src_dests->node_id_arr[i]; + dst_dests->weight_arr[i] = src_dests->weight_arr[i]; + } + + return 0; +} + static int damos_commit_filters(struct damos *dst, struct damos *src) { int err; @@ -978,6 +1030,11 @@ static int damos_commit(struct damos *dst, struct damos *src) return err; dst->wmarks = src->wmarks; + dst->target_nid = src->target_nid; + + err = damos_commit_dests(dst, src); + if (err) + return err; err = damos_commit_filters(dst, src); return err; @@ -1095,9 +1152,7 @@ static int damon_commit_targets( } else { struct damos *s; - if (damon_target_has_pid(dst)) - put_pid(dst_target->pid); - damon_destroy_target(dst_target); + damon_destroy_target(dst_target, dst); damon_for_each_scheme(s, dst) { if (s->quota.charge_target_from == dst_target) { s->quota.charge_target_from = NULL; @@ -1116,7 +1171,7 @@ static int damon_commit_targets( err = damon_commit_target(new_target, false, src_target, damon_target_has_pid(src)); if (err) { - damon_destroy_target(new_target); + damon_destroy_target(new_target, NULL); return err; } damon_add_target(dst, new_target); @@ -1135,7 +1190,7 @@ static int damon_commit_targets( * in-place updates. 
* * This function should be called from parameters-update safe context, like - * DAMON callbacks. + * damon_call(). */ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) { @@ -1311,7 +1366,13 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs) return err; } -static bool damon_is_running(struct damon_ctx *ctx) +/** + * damon_is_running() - Returns if a given DAMON context is running. + * @ctx: The DAMON context to see if running. + * + * Return: true if @ctx is running, false otherwise. + */ +bool damon_is_running(struct damon_ctx *ctx) { bool running; @@ -1328,8 +1389,9 @@ static bool damon_is_running(struct damon_ctx *ctx) * * Ask DAMON worker thread (kdamond) of @ctx to call a function with an * argument data that respectively passed via &damon_call_control->fn and - * &damon_call_control->data of @control, and wait until the kdamond finishes - * handling of the request. + * &damon_call_control->data of @control. If &damon_call_control->repeat of + * @control is set, further wait until the kdamond finishes handling of the + * request. Otherwise, return as soon as the request is made. * * The kdamond executes the function with the argument in the main loop, just * after a sampling of the iteration is finished. The function can hence @@ -1341,18 +1403,18 @@ static bool damon_is_running(struct damon_ctx *ctx) */ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) { - init_completion(&control->completion); + if (!control->repeat) + init_completion(&control->completion); control->canceled = false; + INIT_LIST_HEAD(&control->list); - mutex_lock(&ctx->call_control_lock); - if (ctx->call_control) { - mutex_unlock(&ctx->call_control_lock); - return -EBUSY; - } - ctx->call_control = control; - mutex_unlock(&ctx->call_control_lock); + mutex_lock(&ctx->call_controls_lock); + list_add_tail(&ctx->call_controls, &control->list); + mutex_unlock(&ctx->call_controls_lock); if (!damon_is_running(ctx)) return -EINVAL; + if (control->repeat) + return 0; wait_for_completion(&control->completion); if (control->canceled) return -ECANCELED; @@ -1449,6 +1511,7 @@ static unsigned long damon_get_intervals_score(struct damon_ctx *c) } } target_access_events = max_access_events * goal_bp / 10000; + target_access_events = target_access_events ? 
: 1; return access_events * 10000 / target_access_events; } @@ -1489,6 +1552,7 @@ static void kdamond_tune_intervals(struct damon_ctx *c) new_attrs.sample_interval); new_attrs.aggr_interval = new_attrs.sample_interval * c->attrs.aggr_samples; + trace_damon_monitor_intervals_tune(new_attrs.sample_interval); damon_set_attrs(c, &new_attrs); } @@ -2009,12 +2073,26 @@ static void damos_set_effective_quota(struct damos_quota *quota) quota->esz = esz; } +static void damos_trace_esz(struct damon_ctx *c, struct damos *s, + struct damos_quota *quota) +{ + unsigned int cidx = 0, sidx = 0; + struct damos *siter; + + damon_for_each_scheme(siter, c) { + if (siter == s) + break; + sidx++; + } + trace_damos_esz(cidx, sidx, quota->esz); +} + static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) { struct damos_quota *quota = &s->quota; struct damon_target *t; struct damon_region *r; - unsigned long cumulated_sz; + unsigned long cumulated_sz, cached_esz; unsigned int score, max_score = 0; if (!quota->ms && !quota->sz && list_empty("a->goals)) @@ -2028,7 +2106,11 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) quota->total_charged_sz += quota->charged_sz; quota->charged_from = jiffies; quota->charged_sz = 0; + if (trace_damos_esz_enabled()) + cached_esz = quota->esz; damos_set_effective_quota(quota); + if (trace_damos_esz_enabled() && quota->esz != cached_esz) + damos_trace_esz(c, s, quota); } if (!c->ops.get_scheme_score) @@ -2349,36 +2431,49 @@ static void kdamond_usleep(unsigned long usecs) } /* - * kdamond_call() - handle damon_call_control. + * kdamond_call() - handle damon_call_control objects. * @ctx: The &struct damon_ctx of the kdamond. * @cancel: Whether to cancel the invocation of the function. * - * If there is a &struct damon_call_control request that registered via + * If there are &struct damon_call_control requests that registered via * &damon_call() on @ctx, do or cancel the invocation of the function depending - * on @cancel. @cancel is set when the kdamond is deactivated by DAMOS - * watermarks, or the kdamond is already out of the main loop and therefore - * will be terminated. + * on @cancel. @cancel is set when the kdamond is already out of the main loop + * and therefore will be terminated. 
*/ static void kdamond_call(struct damon_ctx *ctx, bool cancel) { struct damon_call_control *control; + LIST_HEAD(repeat_controls); int ret = 0; - mutex_lock(&ctx->call_control_lock); - control = ctx->call_control; - mutex_unlock(&ctx->call_control_lock); - if (!control) - return; - if (cancel) { - control->canceled = true; - } else { - ret = control->fn(control->data); - control->return_code = ret; + while (true) { + mutex_lock(&ctx->call_controls_lock); + control = list_first_entry_or_null(&ctx->call_controls, + struct damon_call_control, list); + mutex_unlock(&ctx->call_controls_lock); + if (!control) + break; + if (cancel) { + control->canceled = true; + } else { + ret = control->fn(control->data); + control->return_code = ret; + } + mutex_lock(&ctx->call_controls_lock); + list_del(&control->list); + mutex_unlock(&ctx->call_controls_lock); + if (!control->repeat) + complete(&control->completion); + else + list_add(&control->list, &repeat_controls); } - complete(&control->completion); - mutex_lock(&ctx->call_control_lock); - ctx->call_control = NULL; - mutex_unlock(&ctx->call_control_lock); + control = list_first_entry_or_null(&repeat_controls, + struct damon_call_control, list); + if (!control || cancel) + return; + mutex_lock(&ctx->call_controls_lock); + list_add_tail(&control->list, &ctx->call_controls); + mutex_unlock(&ctx->call_controls_lock); } /* Returns negative error code if it's not activated but should return */ @@ -2402,10 +2497,7 @@ static int kdamond_wait_activation(struct damon_ctx *ctx) kdamond_usleep(min_wait_time); - if (ctx->callback.after_wmarks_check && - ctx->callback.after_wmarks_check(ctx)) - break; - kdamond_call(ctx, true); + kdamond_call(ctx, false); damos_walk_cancel(ctx); } return -EBUSY; @@ -2461,10 +2553,9 @@ static int kdamond_fn(void *data) while (!kdamond_need_stop(ctx)) { /* * ctx->attrs and ctx->next_{aggregation,ops_update}_sis could - * be changed from after_wmarks_check() or after_aggregation() - * callbacks. Read the values here, and use those for this - * iteration. That is, damon_set_attrs() updated new values - * are respected from next iteration. + * be changed from kdamond_call(). Read the values here, and + * use those for this iteration. That is, damon_set_attrs() + * updated new values are respected from next iteration. 
*/ unsigned long next_aggregation_sis = ctx->next_aggregation_sis; unsigned long next_ops_update_sis = ctx->next_ops_update_sis; @@ -2482,14 +2573,10 @@ static int kdamond_fn(void *data) if (ctx->ops.check_accesses) max_nr_accesses = ctx->ops.check_accesses(ctx); - if (ctx->passed_sample_intervals >= next_aggregation_sis) { + if (ctx->passed_sample_intervals >= next_aggregation_sis) kdamond_merge_regions(ctx, max_nr_accesses / 10, sz_limit); - if (ctx->callback.after_aggregation && - ctx->callback.after_aggregation(ctx)) - break; - } /* * do kdamond_call() and kdamond_apply_schemes() after @@ -2555,8 +2642,6 @@ done: damon_destroy_region(r, t); } - if (ctx->callback.before_terminate) - ctx->callback.before_terminate(ctx); if (ctx->ops.cleanup) ctx->ops.cleanup(ctx); kfree(ctx->regions_score_histogram); @@ -2575,6 +2660,7 @@ done: running_exclusive_ctxs = false; mutex_unlock(&damon_lock); + damon_destroy_targets(ctx); return 0; } diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 4af8fd4a390b..151a9de5ad8b 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -230,6 +230,39 @@ out: return err; } +static int damon_lru_sort_handle_commit_inputs(void) +{ + int err; + + if (!commit_inputs) + return 0; + + err = damon_lru_sort_apply_parameters(); + commit_inputs = false; + return err; +} + +static int damon_lru_sort_damon_call_fn(void *arg) +{ + struct damon_ctx *c = arg; + struct damos *s; + + /* update the stats parameter */ + damon_for_each_scheme(s, c) { + if (s->action == DAMOS_LRU_PRIO) + damon_lru_sort_hot_stat = s->stat; + else if (s->action == DAMOS_LRU_DEPRIO) + damon_lru_sort_cold_stat = s->stat; + } + + return damon_lru_sort_handle_commit_inputs(); +} + +static struct damon_call_control call_control = { + .fn = damon_lru_sort_damon_call_fn, + .repeat = true, +}; + static int damon_lru_sort_turn(bool on) { int err; @@ -249,7 +282,7 @@ static int damon_lru_sort_turn(bool on) if (err) return err; kdamond_pid = ctx->kdamond->pid; - return 0; + return damon_call(ctx, &call_control); } static int damon_lru_sort_enabled_store(const char *val, @@ -288,52 +321,22 @@ module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); MODULE_PARM_DESC(enabled, "Enable or disable DAMON_LRU_SORT (default: disabled)"); -static int damon_lru_sort_handle_commit_inputs(void) -{ - int err; - - if (!commit_inputs) - return 0; - - err = damon_lru_sort_apply_parameters(); - commit_inputs = false; - return err; -} - -static int damon_lru_sort_after_aggregation(struct damon_ctx *c) -{ - struct damos *s; - - /* update the stats parameter */ - damon_for_each_scheme(s, c) { - if (s->action == DAMOS_LRU_PRIO) - damon_lru_sort_hot_stat = s->stat; - else if (s->action == DAMOS_LRU_DEPRIO) - damon_lru_sort_cold_stat = s->stat; - } - - return damon_lru_sort_handle_commit_inputs(); -} - -static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c) -{ - return damon_lru_sort_handle_commit_inputs(); -} - static int __init damon_lru_sort_init(void) { int err = damon_modules_new_paddr_ctx_target(&ctx, &target); if (err) - return err; + goto out; - ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check; - ctx->callback.after_aggregation = damon_lru_sort_after_aggregation; + call_control.data = ctx; /* 'enabled' has set before this function, probably via command line */ if (enabled) err = damon_lru_sort_turn(true); +out: + if (err && enabled) + enabled = false; return err; } diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index b43620fee6bb..99321ff5cb92 100644 --- 
a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -5,6 +5,7 @@ * Author: SeongJae Park <sj@kernel.org> */ +#include <linux/migrate.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> #include <linux/pagemap.h> @@ -12,6 +13,7 @@ #include <linux/swap.h> #include <linux/swapops.h> +#include "../internal.h" #include "ops-common.h" /* @@ -138,3 +140,275 @@ int damon_cold_score(struct damon_ctx *c, struct damon_region *r, /* Return coldness of the region */ return DAMOS_MAX_SCORE - hotness; } + +static bool damon_folio_mkold_one(struct folio *folio, + struct vm_area_struct *vma, unsigned long addr, void *arg) +{ + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); + + while (page_vma_mapped_walk(&pvmw)) { + addr = pvmw.address; + if (pvmw.pte) + damon_ptep_mkold(pvmw.pte, vma, addr); + else + damon_pmdp_mkold(pvmw.pmd, vma, addr); + } + return true; +} + +void damon_folio_mkold(struct folio *folio) +{ + struct rmap_walk_control rwc = { + .rmap_one = damon_folio_mkold_one, + .anon_lock = folio_lock_anon_vma_read, + }; + bool need_lock; + + if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { + folio_set_idle(folio); + return; + } + + need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); + if (need_lock && !folio_trylock(folio)) + return; + + rmap_walk(folio, &rwc); + + if (need_lock) + folio_unlock(folio); + +} + +static bool damon_folio_young_one(struct folio *folio, + struct vm_area_struct *vma, unsigned long addr, void *arg) +{ + bool *accessed = arg; + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); + pte_t pte; + + *accessed = false; + while (page_vma_mapped_walk(&pvmw)) { + addr = pvmw.address; + if (pvmw.pte) { + pte = ptep_get(pvmw.pte); + + /* + * PFN swap PTEs, such as device-exclusive ones, that + * actually map pages are "old" from a CPU perspective. + * The MMU notifier takes care of any device aspects. 
+ */ + *accessed = (pte_present(pte) && pte_young(pte)) || + !folio_test_idle(folio) || + mmu_notifier_test_young(vma->vm_mm, addr); + } else { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + *accessed = pmd_young(pmdp_get(pvmw.pmd)) || + !folio_test_idle(folio) || + mmu_notifier_test_young(vma->vm_mm, addr); +#else + WARN_ON_ONCE(1); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + } + if (*accessed) { + page_vma_mapped_walk_done(&pvmw); + break; + } + } + + /* If accessed, stop walking */ + return *accessed == false; +} + +bool damon_folio_young(struct folio *folio) +{ + bool accessed = false; + struct rmap_walk_control rwc = { + .arg = &accessed, + .rmap_one = damon_folio_young_one, + .anon_lock = folio_lock_anon_vma_read, + }; + bool need_lock; + + if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { + if (folio_test_idle(folio)) + return false; + else + return true; + } + + need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); + if (need_lock && !folio_trylock(folio)) + return false; + + rmap_walk(folio, &rwc); + + if (need_lock) + folio_unlock(folio); + + return accessed; +} + +bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio) +{ + bool matched = false; + struct mem_cgroup *memcg; + size_t folio_sz; + + switch (filter->type) { + case DAMOS_FILTER_TYPE_ANON: + matched = folio_test_anon(folio); + break; + case DAMOS_FILTER_TYPE_ACTIVE: + matched = folio_test_active(folio); + break; + case DAMOS_FILTER_TYPE_MEMCG: + rcu_read_lock(); + memcg = folio_memcg_check(folio); + if (!memcg) + matched = false; + else + matched = filter->memcg_id == mem_cgroup_id(memcg); + rcu_read_unlock(); + break; + case DAMOS_FILTER_TYPE_YOUNG: + matched = damon_folio_young(folio); + if (matched) + damon_folio_mkold(folio); + break; + case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE: + folio_sz = folio_size(folio); + matched = filter->sz_range.min <= folio_sz && + folio_sz <= filter->sz_range.max; + break; + case DAMOS_FILTER_TYPE_UNMAPPED: + matched = !folio_mapped(folio) || !folio_raw_mapping(folio); + break; + default: + break; + } + + return matched == filter->matching; +} + +static unsigned int __damon_migrate_folio_list( + struct list_head *migrate_folios, struct pglist_data *pgdat, + int target_nid) +{ + unsigned int nr_succeeded = 0; + struct migration_target_control mtc = { + /* + * Allocate from 'node', or fail quickly and quietly. + * When this happens, 'page' will likely just be discarded + * instead of migrated. + */ + .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | + __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT, + .nid = target_nid, + }; + + if (pgdat->node_id == target_nid || target_nid == NUMA_NO_NODE) + return 0; + + if (list_empty(migrate_folios)) + return 0; + + /* Migration ignores all cpuset and mempolicy settings */ + migrate_pages(migrate_folios, alloc_migration_target, NULL, + (unsigned long)&mtc, MIGRATE_ASYNC, MR_DAMON, + &nr_succeeded); + + return nr_succeeded; +} + +static unsigned int damon_migrate_folio_list(struct list_head *folio_list, + struct pglist_data *pgdat, + int target_nid) +{ + unsigned int nr_migrated = 0; + struct folio *folio; + LIST_HEAD(ret_folios); + LIST_HEAD(migrate_folios); + + while (!list_empty(folio_list)) { + struct folio *folio; + + cond_resched(); + + folio = lru_to_folio(folio_list); + list_del(&folio->lru); + + if (!folio_trylock(folio)) + goto keep; + + /* Relocate its contents to another node. 
*/ + list_add(&folio->lru, &migrate_folios); + folio_unlock(folio); + continue; +keep: + list_add(&folio->lru, &ret_folios); + } + /* 'folio_list' is always empty here */ + + /* Migrate folios selected for migration */ + nr_migrated += __damon_migrate_folio_list( + &migrate_folios, pgdat, target_nid); + /* + * Folios that could not be migrated are still in @migrate_folios. Add + * those back on @folio_list + */ + if (!list_empty(&migrate_folios)) + list_splice_init(&migrate_folios, folio_list); + + try_to_unmap_flush(); + + list_splice(&ret_folios, folio_list); + + while (!list_empty(folio_list)) { + folio = lru_to_folio(folio_list); + list_del(&folio->lru); + folio_putback_lru(folio); + } + + return nr_migrated; +} + +unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid) +{ + int nid; + unsigned long nr_migrated = 0; + LIST_HEAD(node_folio_list); + unsigned int noreclaim_flag; + + if (list_empty(folio_list)) + return nr_migrated; + + if (target_nid < 0 || target_nid >= MAX_NUMNODES || + !node_state(target_nid, N_MEMORY)) + return nr_migrated; + + noreclaim_flag = memalloc_noreclaim_save(); + + nid = folio_nid(lru_to_folio(folio_list)); + do { + struct folio *folio = lru_to_folio(folio_list); + + if (nid == folio_nid(folio)) { + list_move(&folio->lru, &node_folio_list); + continue; + } + + nr_migrated += damon_migrate_folio_list(&node_folio_list, + NODE_DATA(nid), + target_nid); + nid = folio_nid(lru_to_folio(folio_list)); + } while (!list_empty(folio_list)); + + nr_migrated += damon_migrate_folio_list(&node_folio_list, + NODE_DATA(nid), + target_nid); + + memalloc_noreclaim_restore(noreclaim_flag); + + return nr_migrated; +} diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index cc9f5da9c012..61ad54aaf256 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -11,8 +11,13 @@ struct folio *damon_get_folio(unsigned long pfn); void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr); void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr); +void damon_folio_mkold(struct folio *folio); +bool damon_folio_young(struct folio *folio); int damon_cold_score(struct damon_ctx *c, struct damon_region *r, struct damos *s); int damon_hot_score(struct damon_ctx *c, struct damon_region *r, struct damos *s); + +bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio); +unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 4102a8c5f992..53a55c5114fb 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -13,51 +13,11 @@ #include <linux/rmap.h> #include <linux/swap.h> #include <linux/memory-tiers.h> -#include <linux/migrate.h> #include <linux/mm_inline.h> #include "../internal.h" #include "ops-common.h" -static bool damon_folio_mkold_one(struct folio *folio, - struct vm_area_struct *vma, unsigned long addr, void *arg) -{ - DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); - - while (page_vma_mapped_walk(&pvmw)) { - addr = pvmw.address; - if (pvmw.pte) - damon_ptep_mkold(pvmw.pte, vma, addr); - else - damon_pmdp_mkold(pvmw.pmd, vma, addr); - } - return true; -} - -static void damon_folio_mkold(struct folio *folio) -{ - struct rmap_walk_control rwc = { - .rmap_one = damon_folio_mkold_one, - .anon_lock = folio_lock_anon_vma_read, - }; - bool need_lock; - - if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { - folio_set_idle(folio); - return; - } - - need_lock = !folio_test_anon(folio) || 
folio_test_ksm(folio); - if (need_lock && !folio_trylock(folio)) - return; - - rmap_walk(folio, &rwc); - - if (need_lock) - folio_unlock(folio); - -} - static void damon_pa_mkold(unsigned long paddr) { struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); @@ -87,75 +47,6 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) } } -static bool damon_folio_young_one(struct folio *folio, - struct vm_area_struct *vma, unsigned long addr, void *arg) -{ - bool *accessed = arg; - DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); - pte_t pte; - - *accessed = false; - while (page_vma_mapped_walk(&pvmw)) { - addr = pvmw.address; - if (pvmw.pte) { - pte = ptep_get(pvmw.pte); - - /* - * PFN swap PTEs, such as device-exclusive ones, that - * actually map pages are "old" from a CPU perspective. - * The MMU notifier takes care of any device aspects. - */ - *accessed = (pte_present(pte) && pte_young(pte)) || - !folio_test_idle(folio) || - mmu_notifier_test_young(vma->vm_mm, addr); - } else { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - *accessed = pmd_young(pmdp_get(pvmw.pmd)) || - !folio_test_idle(folio) || - mmu_notifier_test_young(vma->vm_mm, addr); -#else - WARN_ON_ONCE(1); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - } - if (*accessed) { - page_vma_mapped_walk_done(&pvmw); - break; - } - } - - /* If accessed, stop walking */ - return *accessed == false; -} - -static bool damon_folio_young(struct folio *folio) -{ - bool accessed = false; - struct rmap_walk_control rwc = { - .arg = &accessed, - .rmap_one = damon_folio_young_one, - .anon_lock = folio_lock_anon_vma_read, - }; - bool need_lock; - - if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { - if (folio_test_idle(folio)) - return false; - else - return true; - } - - need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); - if (need_lock && !folio_trylock(folio)) - return false; - - rmap_walk(folio, &rwc); - - if (need_lock) - folio_unlock(folio); - - return accessed; -} - static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) { struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); @@ -206,49 +97,6 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) return max_nr_accesses; } -static bool damos_pa_filter_match(struct damos_filter *filter, - struct folio *folio) -{ - bool matched = false; - struct mem_cgroup *memcg; - size_t folio_sz; - - switch (filter->type) { - case DAMOS_FILTER_TYPE_ANON: - matched = folio_test_anon(folio); - break; - case DAMOS_FILTER_TYPE_ACTIVE: - matched = folio_test_active(folio); - break; - case DAMOS_FILTER_TYPE_MEMCG: - rcu_read_lock(); - memcg = folio_memcg_check(folio); - if (!memcg) - matched = false; - else - matched = filter->memcg_id == mem_cgroup_id(memcg); - rcu_read_unlock(); - break; - case DAMOS_FILTER_TYPE_YOUNG: - matched = damon_folio_young(folio); - if (matched) - damon_folio_mkold(folio); - break; - case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE: - folio_sz = folio_size(folio); - matched = filter->sz_range.min <= folio_sz && - folio_sz <= filter->sz_range.max; - break; - case DAMOS_FILTER_TYPE_UNMAPPED: - matched = !folio_mapped(folio) || !folio_raw_mapping(folio); - break; - default: - break; - } - - return matched == filter->matching; -} - /* * damos_pa_filter_out - Return true if the page should be filtered out. 
*/ @@ -260,7 +108,7 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio) return false; damos_for_each_ops_filter(filter, scheme) { - if (damos_pa_filter_match(filter, folio)) + if (damos_folio_filter_match(filter, folio)) return !filter->allow; } return scheme->ops_filters_default_reject; @@ -381,127 +229,6 @@ static unsigned long damon_pa_deactivate_pages(struct damon_region *r, sz_filter_passed); } -static unsigned int __damon_pa_migrate_folio_list( - struct list_head *migrate_folios, struct pglist_data *pgdat, - int target_nid) -{ - unsigned int nr_succeeded = 0; - nodemask_t allowed_mask = NODE_MASK_NONE; - struct migration_target_control mtc = { - /* - * Allocate from 'node', or fail quickly and quietly. - * When this happens, 'page' will likely just be discarded - * instead of migrated. - */ - .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | - __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT, - .nid = target_nid, - .nmask = &allowed_mask - }; - - if (pgdat->node_id == target_nid || target_nid == NUMA_NO_NODE) - return 0; - - if (list_empty(migrate_folios)) - return 0; - - /* Migration ignores all cpuset and mempolicy settings */ - migrate_pages(migrate_folios, alloc_migrate_folio, NULL, - (unsigned long)&mtc, MIGRATE_ASYNC, MR_DAMON, - &nr_succeeded); - - return nr_succeeded; -} - -static unsigned int damon_pa_migrate_folio_list(struct list_head *folio_list, - struct pglist_data *pgdat, - int target_nid) -{ - unsigned int nr_migrated = 0; - struct folio *folio; - LIST_HEAD(ret_folios); - LIST_HEAD(migrate_folios); - - while (!list_empty(folio_list)) { - struct folio *folio; - - cond_resched(); - - folio = lru_to_folio(folio_list); - list_del(&folio->lru); - - if (!folio_trylock(folio)) - goto keep; - - /* Relocate its contents to another node. */ - list_add(&folio->lru, &migrate_folios); - folio_unlock(folio); - continue; -keep: - list_add(&folio->lru, &ret_folios); - } - /* 'folio_list' is always empty here */ - - /* Migrate folios selected for migration */ - nr_migrated += __damon_pa_migrate_folio_list( - &migrate_folios, pgdat, target_nid); - /* - * Folios that could not be migrated are still in @migrate_folios. 
Add - * those back on @folio_list - */ - if (!list_empty(&migrate_folios)) - list_splice_init(&migrate_folios, folio_list); - - try_to_unmap_flush(); - - list_splice(&ret_folios, folio_list); - - while (!list_empty(folio_list)) { - folio = lru_to_folio(folio_list); - list_del(&folio->lru); - folio_putback_lru(folio); - } - - return nr_migrated; -} - -static unsigned long damon_pa_migrate_pages(struct list_head *folio_list, - int target_nid) -{ - int nid; - unsigned long nr_migrated = 0; - LIST_HEAD(node_folio_list); - unsigned int noreclaim_flag; - - if (list_empty(folio_list)) - return nr_migrated; - - noreclaim_flag = memalloc_noreclaim_save(); - - nid = folio_nid(lru_to_folio(folio_list)); - do { - struct folio *folio = lru_to_folio(folio_list); - - if (nid == folio_nid(folio)) { - list_move(&folio->lru, &node_folio_list); - continue; - } - - nr_migrated += damon_pa_migrate_folio_list(&node_folio_list, - NODE_DATA(nid), - target_nid); - nid = folio_nid(lru_to_folio(folio_list)); - } while (!list_empty(folio_list)); - - nr_migrated += damon_pa_migrate_folio_list(&node_folio_list, - NODE_DATA(nid), - target_nid); - - memalloc_noreclaim_restore(noreclaim_flag); - - return nr_migrated; -} - static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s, unsigned long *sz_filter_passed) { @@ -529,7 +256,7 @@ put_folio: addr += folio_size(folio); folio_put(folio); } - applied = damon_pa_migrate_pages(&folio_list, s->target_nid); + applied = damon_migrate_pages(&folio_list, s->target_nid); cond_resched(); s->last_applied = folio; return applied * PAGE_SIZE; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index a675150965e0..3c71b4596676 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -194,7 +194,7 @@ static int damon_reclaim_apply_parameters(void) if (err) return err; - err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs); + err = damon_set_attrs(param_ctx, &damon_reclaim_mon_attrs); if (err) goto out; @@ -202,7 +202,7 @@ static int damon_reclaim_apply_parameters(void) scheme = damon_reclaim_new_scheme(); if (!scheme) goto out; - damon_set_schemes(ctx, &scheme, 1); + damon_set_schemes(param_ctx, &scheme, 1); if (quota_mem_pressure_us) { goal = damos_new_quota_goal(DAMOS_QUOTA_SOME_MEM_PSI_US, @@ -238,6 +238,35 @@ out: return err; } +static int damon_reclaim_handle_commit_inputs(void) +{ + int err; + + if (!commit_inputs) + return 0; + + err = damon_reclaim_apply_parameters(); + commit_inputs = false; + return err; +} + +static int damon_reclaim_damon_call_fn(void *arg) +{ + struct damon_ctx *c = arg; + struct damos *s; + + /* update the stats parameter */ + damon_for_each_scheme(s, c) + damon_reclaim_stat = s->stat; + + return damon_reclaim_handle_commit_inputs(); +} + +static struct damon_call_control call_control = { + .fn = damon_reclaim_damon_call_fn, + .repeat = true, +}; + static int damon_reclaim_turn(bool on) { int err; @@ -257,7 +286,7 @@ static int damon_reclaim_turn(bool on) if (err) return err; kdamond_pid = ctx->kdamond->pid; - return 0; + return damon_call(ctx, &call_control); } static int damon_reclaim_enabled_store(const char *val, @@ -296,48 +325,22 @@ module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); MODULE_PARM_DESC(enabled, "Enable or disable DAMON_RECLAIM (default: disabled)"); -static int damon_reclaim_handle_commit_inputs(void) -{ - int err; - - if (!commit_inputs) - return 0; - - err = damon_reclaim_apply_parameters(); - commit_inputs = false; - return err; -} - -static int damon_reclaim_after_aggregation(struct damon_ctx *c) 
-{ - struct damos *s; - - /* update the stats parameter */ - damon_for_each_scheme(s, c) - damon_reclaim_stat = s->stat; - - return damon_reclaim_handle_commit_inputs(); -} - -static int damon_reclaim_after_wmarks_check(struct damon_ctx *c) -{ - return damon_reclaim_handle_commit_inputs(); -} - static int __init damon_reclaim_init(void) { int err = damon_modules_new_paddr_ctx_target(&ctx, &target); if (err) - return err; + goto out; - ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check; - ctx->callback.after_aggregation = damon_reclaim_after_aggregation; + call_control.data = ctx; /* 'enabled' has set before this function, probably via command line */ if (enabled) err = damon_reclaim_turn(true); +out: + if (err && enabled) + enabled = false; return err; } diff --git a/mm/damon/stat.c b/mm/damon/stat.c new file mode 100644 index 000000000000..87bcd8866d4b --- /dev/null +++ b/mm/damon/stat.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shows data access monitoring resutls in simple metrics. + */ + +#define pr_fmt(fmt) "damon-stat: " fmt + +#include <linux/damon.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sort.h> + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "damon_stat." + +static int damon_stat_enabled_store( + const char *val, const struct kernel_param *kp); + +static const struct kernel_param_ops enabled_param_ops = { + .set = damon_stat_enabled_store, + .get = param_get_bool, +}; + +static bool enabled __read_mostly = IS_ENABLED( + CONFIG_DAMON_STAT_ENABLED_DEFAULT); +module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); +MODULE_PARM_DESC(enabled, "Enable of disable DAMON_STAT"); + +static unsigned long estimated_memory_bandwidth __read_mostly; +module_param(estimated_memory_bandwidth, ulong, 0400); +MODULE_PARM_DESC(estimated_memory_bandwidth, + "Estimated memory bandwidth usage in bytes per second"); + +static unsigned long memory_idle_ms_percentiles[101] __read_mostly = {0,}; +module_param_array(memory_idle_ms_percentiles, ulong, NULL, 0400); +MODULE_PARM_DESC(memory_idle_ms_percentiles, + "Memory idle time percentiles in milliseconds"); + +static struct damon_ctx *damon_stat_context; + +static void damon_stat_set_estimated_memory_bandwidth(struct damon_ctx *c) +{ + struct damon_target *t; + struct damon_region *r; + unsigned long access_bytes = 0; + + damon_for_each_target(t, c) { + damon_for_each_region(r, t) + access_bytes += (r->ar.end - r->ar.start) * + r->nr_accesses; + } + estimated_memory_bandwidth = access_bytes * USEC_PER_MSEC * + MSEC_PER_SEC / c->attrs.aggr_interval; +} + +static unsigned int damon_stat_idletime(const struct damon_region *r) +{ + if (r->nr_accesses) + return 0; + return r->age + 1; +} + +static int damon_stat_cmp_regions(const void *a, const void *b) +{ + const struct damon_region *ra = *(const struct damon_region **)a; + const struct damon_region *rb = *(const struct damon_region **)b; + + return damon_stat_idletime(ra) - damon_stat_idletime(rb); +} + +static int damon_stat_sort_regions(struct damon_ctx *c, + struct damon_region ***sorted_ptr, int *nr_regions_ptr, + unsigned long *total_sz_ptr) +{ + struct damon_target *t; + struct damon_region *r; + struct damon_region **region_pointers; + unsigned int nr_regions = 0; + unsigned long total_sz = 0; + + damon_for_each_target(t, c) { + /* there is only one target */ + region_pointers = kmalloc_array(damon_nr_regions(t), + sizeof(*region_pointers), GFP_KERNEL); + 
if (!region_pointers) + return -ENOMEM; + damon_for_each_region(r, t) { + region_pointers[nr_regions++] = r; + total_sz += r->ar.end - r->ar.start; + } + } + sort(region_pointers, nr_regions, sizeof(*region_pointers), + damon_stat_cmp_regions, NULL); + *sorted_ptr = region_pointers; + *nr_regions_ptr = nr_regions; + *total_sz_ptr = total_sz; + return 0; +} + +static void damon_stat_set_idletime_percentiles(struct damon_ctx *c) +{ + struct damon_region **sorted_regions, *region; + int nr_regions; + unsigned long total_sz, accounted_bytes = 0; + int err, i, next_percentile = 0; + + err = damon_stat_sort_regions(c, &sorted_regions, &nr_regions, + &total_sz); + if (err) + return; + for (i = 0; i < nr_regions; i++) { + region = sorted_regions[i]; + accounted_bytes += region->ar.end - region->ar.start; + while (next_percentile <= accounted_bytes * 100 / total_sz) + memory_idle_ms_percentiles[next_percentile++] = + damon_stat_idletime(region) * + c->attrs.aggr_interval / USEC_PER_MSEC; + } + kfree(sorted_regions); +} + +static int damon_stat_damon_call_fn(void *data) +{ + struct damon_ctx *c = data; + static unsigned long last_refresh_jiffies; + + /* avoid unnecessarily frequent stat update */ + if (time_before_eq(jiffies, last_refresh_jiffies + + msecs_to_jiffies(5 * MSEC_PER_SEC))) + return 0; + last_refresh_jiffies = jiffies; + + damon_stat_set_estimated_memory_bandwidth(c); + damon_stat_set_idletime_percentiles(c); + return 0; +} + +static struct damon_ctx *damon_stat_build_ctx(void) +{ + struct damon_ctx *ctx; + struct damon_attrs attrs; + struct damon_target *target; + unsigned long start = 0, end = 0; + + ctx = damon_new_ctx(); + if (!ctx) + return NULL; + attrs = (struct damon_attrs) { + .sample_interval = 5 * USEC_PER_MSEC, + .aggr_interval = 100 * USEC_PER_MSEC, + .ops_update_interval = 60 * USEC_PER_MSEC * MSEC_PER_SEC, + .min_nr_regions = 10, + .max_nr_regions = 1000, + }; + /* + * auto-tune sampling and aggregation interval aiming 4% DAMON-observed + * accesses ratio, keeping sampling interval in [5ms, 10s] range. + */ + attrs.intervals_goal = (struct damon_intervals_goal) { + .access_bp = 400, .aggrs = 3, + .min_sample_us = 5000, .max_sample_us = 10000000, + }; + if (damon_set_attrs(ctx, &attrs)) + goto free_out; + + /* + * auto-tune sampling and aggregation interval aiming 4% DAMON-observed + * accesses ratio, keeping sampling interval in [5ms, 10s] range. 
+ */ + ctx->attrs.intervals_goal = (struct damon_intervals_goal) { + .access_bp = 400, .aggrs = 3, + .min_sample_us = 5000, .max_sample_us = 10000000, + }; + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) + goto free_out; + + target = damon_new_target(); + if (!target) + goto free_out; + damon_add_target(ctx, target); + if (damon_set_region_biggest_system_ram_default(target, &start, &end)) + goto free_out; + return ctx; +free_out: + damon_destroy_ctx(ctx); + return NULL; +} + +static struct damon_call_control call_control = { + .fn = damon_stat_damon_call_fn, + .repeat = true, +}; + +static int damon_stat_start(void) +{ + int err; + + damon_stat_context = damon_stat_build_ctx(); + if (!damon_stat_context) + return -ENOMEM; + err = damon_start(&damon_stat_context, 1, true); + if (err) + return err; + call_control.data = damon_stat_context; + return damon_call(damon_stat_context, &call_control); +} + +static void damon_stat_stop(void) +{ + damon_stop(&damon_stat_context, 1); + damon_destroy_ctx(damon_stat_context); +} + +static bool damon_stat_init_called; + +static int damon_stat_enabled_store( + const char *val, const struct kernel_param *kp) +{ + bool is_enabled = enabled; + int err; + + err = kstrtobool(val, &enabled); + if (err) + return err; + + if (is_enabled == enabled) + return 0; + + if (!damon_stat_init_called) + /* + * probably called from command line parsing (parse_args()). + * Cannot call damon_new_ctx(). Let damon_stat_init() handle. + */ + return 0; + + if (enabled) { + err = damon_stat_start(); + if (err) + enabled = false; + return err; + } + damon_stat_stop(); + return 0; +} + +static int __init damon_stat_init(void) +{ + int err = 0; + + damon_stat_init_called = true; + + /* probably set via command line */ + if (enabled) + err = damon_stat_start(); + + if (err && enabled) + enabled = false; + return err; +} + +module_init(damon_stat_init); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 0f6c9e1fec0b..74056bcd6a2c 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -341,16 +341,45 @@ static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc( return filter; } -/* Should match with enum damos_filter_type */ -static const char * const damon_sysfs_scheme_filter_type_strs[] = { - "anon", - "active", - "memcg", - "young", - "hugepage_size", - "unmapped", - "addr", - "target", +struct damos_sysfs_filter_type_name { + enum damos_filter_type type; + char *name; +}; + +static const struct damos_sysfs_filter_type_name +damos_sysfs_filter_type_names[] = { + { + .type = DAMOS_FILTER_TYPE_ANON, + .name = "anon", + }, + { + .type = DAMOS_FILTER_TYPE_ACTIVE, + .name = "active", + }, + { + .type = DAMOS_FILTER_TYPE_MEMCG, + .name = "memcg", + }, + { + .type = DAMOS_FILTER_TYPE_YOUNG, + .name = "young", + }, + { + .type = DAMOS_FILTER_TYPE_HUGEPAGE_SIZE, + .name = "hugepage_size", + }, + { + .type = DAMOS_FILTER_TYPE_UNMAPPED, + .name = "unmapped", + }, + { + .type = DAMOS_FILTER_TYPE_ADDR, + .name = "addr", + }, + { + .type = DAMOS_FILTER_TYPE_TARGET, + .name = "target", + }, }; static ssize_t type_show(struct kobject *kobj, @@ -358,9 +387,16 @@ static ssize_t type_show(struct kobject *kobj, { struct damon_sysfs_scheme_filter *filter = container_of(kobj, struct damon_sysfs_scheme_filter, kobj); + int i; - return sysfs_emit(buf, "%s\n", - damon_sysfs_scheme_filter_type_strs[filter->type]); + for (i = 0; i < ARRAY_SIZE(damos_sysfs_filter_type_names); i++) { + const struct damos_sysfs_filter_type_name *type_name; + + type_name = 
&damos_sysfs_filter_type_names[i]; + if (type_name->type == filter->type) + return sysfs_emit(buf, "%s\n", type_name->name); + } + return -EINVAL; } static bool damos_sysfs_scheme_filter_valid_type( @@ -385,16 +421,19 @@ static ssize_t type_store(struct kobject *kobj, { struct damon_sysfs_scheme_filter *filter = container_of(kobj, struct damon_sysfs_scheme_filter, kobj); - enum damos_filter_type type; ssize_t ret = -EINVAL; + int i; + + for (i = 0; i < ARRAY_SIZE(damos_sysfs_filter_type_names); i++) { + const struct damos_sysfs_filter_type_name *type_name; - for (type = 0; type < NR_DAMOS_FILTER_TYPES; type++) { - if (sysfs_streq(buf, damon_sysfs_scheme_filter_type_strs[ - type])) { + type_name = &damos_sysfs_filter_type_names[i]; + if (sysfs_streq(buf, type_name->name)) { if (!damos_sysfs_scheme_filter_valid_type( - filter->handle_layer, type)) + filter->handle_layer, + type_name->type)) break; - filter->type = type; + filter->type = type_name->type; ret = count; break; } @@ -472,6 +511,7 @@ static ssize_t memcg_path_store(struct kobject *kobj, return -ENOMEM; strscpy(path, buf, count + 1); + kfree(filter->memcg_path); filter->memcg_path = path; return count; } @@ -784,10 +824,21 @@ static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc( return watermarks; } -/* Should match with enum damos_wmark_metric */ -static const char * const damon_sysfs_wmark_metric_strs[] = { - "none", - "free_mem_rate", +struct damos_sysfs_wmark_metric_name { + enum damos_wmark_metric metric; + char *name; +}; + +static const struct damos_sysfs_wmark_metric_name +damos_sysfs_wmark_metric_names[] = { + { + .metric = DAMOS_WMARK_NONE, + .name = "none", + }, + { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .name = "free_mem_rate", + }, }; static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -795,9 +846,16 @@ static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_watermarks *watermarks = container_of(kobj, struct damon_sysfs_watermarks, kobj); + int i; - return sysfs_emit(buf, "%s\n", - damon_sysfs_wmark_metric_strs[watermarks->metric]); + for (i = 0; i < ARRAY_SIZE(damos_sysfs_wmark_metric_names); i++) { + const struct damos_sysfs_wmark_metric_name *metric_name; + + metric_name = &damos_sysfs_wmark_metric_names[i]; + if (metric_name->metric == watermarks->metric) + return sysfs_emit(buf, "%s\n", metric_name->name); + } + return -EINVAL; } static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -805,11 +863,14 @@ static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_watermarks *watermarks = container_of(kobj, struct damon_sysfs_watermarks, kobj); - enum damos_wmark_metric metric; + int i; + + for (i = 0; i < ARRAY_SIZE(damos_sysfs_wmark_metric_names); i++) { + const struct damos_sysfs_wmark_metric_name *metric_name; - for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) { - if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) { - watermarks->metric = metric; + metric_name = &damos_sysfs_wmark_metric_names[i]; + if (sysfs_streq(buf, metric_name->name)) { + watermarks->metric = metric_name->metric; return count; } } @@ -940,27 +1001,51 @@ struct damos_sysfs_quota_goal { int nid; }; -/* This should match with enum damos_quota_goal_metric */ -static const char * const damos_sysfs_quota_goal_metric_strs[] = { - "user_input", - "some_mem_psi_us", - "node_mem_used_bp", - "node_mem_free_bp", -}; - static struct damos_sysfs_quota_goal 
*damos_sysfs_quota_goal_alloc(void) { return kzalloc(sizeof(struct damos_sysfs_quota_goal), GFP_KERNEL); } +struct damos_sysfs_qgoal_metric_name { + enum damos_quota_goal_metric metric; + char *name; +}; + +static +struct damos_sysfs_qgoal_metric_name damos_sysfs_qgoal_metric_names[] = { + { + .metric = DAMOS_QUOTA_USER_INPUT, + .name = "user_input", + }, + { + .metric = DAMOS_QUOTA_SOME_MEM_PSI_US, + .name = "some_mem_psi_us", + }, + { + .metric = DAMOS_QUOTA_NODE_MEM_USED_BP, + .name = "node_mem_used_bp", + }, + { + .metric = DAMOS_QUOTA_NODE_MEM_FREE_BP, + .name = "node_mem_free_bp", + }, +}; + static ssize_t target_metric_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct damos_sysfs_quota_goal *goal = container_of(kobj, struct damos_sysfs_quota_goal, kobj); + int i; - return sysfs_emit(buf, "%s\n", - damos_sysfs_quota_goal_metric_strs[goal->metric]); + for (i = 0; i < ARRAY_SIZE(damos_sysfs_qgoal_metric_names); i++) { + struct damos_sysfs_qgoal_metric_name *metric_name; + + metric_name = &damos_sysfs_qgoal_metric_names[i]; + if (metric_name->metric == goal->metric) + return sysfs_emit(buf, "%s\n", metric_name->name); + } + return -EINVAL; } static ssize_t target_metric_store(struct kobject *kobj, @@ -968,11 +1053,14 @@ static ssize_t target_metric_store(struct kobject *kobj, { struct damos_sysfs_quota_goal *goal = container_of(kobj, struct damos_sysfs_quota_goal, kobj); - enum damos_quota_goal_metric m; + int i; + + for (i = 0; i < ARRAY_SIZE(damos_sysfs_qgoal_metric_names); i++) { + struct damos_sysfs_qgoal_metric_name *metric_name; - for (m = 0; m < NR_DAMOS_QUOTA_GOAL_METRICS; m++) { - if (sysfs_streq(buf, damos_sysfs_quota_goal_metric_strs[m])) { - goal->metric = m; + metric_name = &damos_sysfs_qgoal_metric_names[i]; + if (sysfs_streq(buf, metric_name->name)) { + goal->metric = metric_name->metric; return count; } } @@ -1568,6 +1656,204 @@ static const struct kobj_type damon_sysfs_access_pattern_ktype = { }; /* + * dest (action destination) directory + */ + +struct damos_sysfs_dest { + struct kobject kobj; + unsigned int id; + unsigned int weight; +}; + +static struct damos_sysfs_dest *damos_sysfs_dest_alloc(void) +{ + return kzalloc(sizeof(struct damos_sysfs_dest), GFP_KERNEL); +} + +static ssize_t id_show( + struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + struct damos_sysfs_dest *dest = container_of(kobj, + struct damos_sysfs_dest, kobj); + + return sysfs_emit(buf, "%u\n", dest->id); +} + +static ssize_t id_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damos_sysfs_dest *dest = container_of(kobj, + struct damos_sysfs_dest, kobj); + int err = kstrtouint(buf, 0, &dest->id); + + return err ? err : count; +} + +static ssize_t weight_show( + struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + struct damos_sysfs_dest *dest = container_of(kobj, + struct damos_sysfs_dest, kobj); + + return sysfs_emit(buf, "%u\n", dest->weight); +} + +static ssize_t weight_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damos_sysfs_dest *dest = container_of(kobj, + struct damos_sysfs_dest, kobj); + int err = kstrtouint(buf, 0, &dest->weight); + + return err ? 
err : count; +} + +static void damos_sysfs_dest_release(struct kobject *kobj) +{ + struct damos_sysfs_dest *dest = container_of(kobj, + struct damos_sysfs_dest, kobj); + kfree(dest); +} + +static struct kobj_attribute damos_sysfs_dest_id_attr = + __ATTR_RW_MODE(id, 0600); + +static struct kobj_attribute damos_sysfs_dest_weight_attr = + __ATTR_RW_MODE(weight, 0600); + +static struct attribute *damos_sysfs_dest_attrs[] = { + &damos_sysfs_dest_id_attr.attr, + &damos_sysfs_dest_weight_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damos_sysfs_dest); + +static const struct kobj_type damos_sysfs_dest_ktype = { + .release = damos_sysfs_dest_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damos_sysfs_dest_groups, +}; + +/* + * dests (action destinations) directory + */ + +struct damos_sysfs_dests { + struct kobject kobj; + struct damos_sysfs_dest **dests_arr; + int nr; +}; + +static struct damos_sysfs_dests * +damos_sysfs_dests_alloc(void) +{ + return kzalloc(sizeof(struct damos_sysfs_dests), GFP_KERNEL); +} + +static void damos_sysfs_dests_rm_dirs( + struct damos_sysfs_dests *dests) +{ + struct damos_sysfs_dest **dests_arr = dests->dests_arr; + int i; + + for (i = 0; i < dests->nr; i++) + kobject_put(&dests_arr[i]->kobj); + dests->nr = 0; + kfree(dests_arr); + dests->dests_arr = NULL; +} + +static int damos_sysfs_dests_add_dirs( + struct damos_sysfs_dests *dests, int nr_dests) +{ + struct damos_sysfs_dest **dests_arr, *dest; + int err, i; + + damos_sysfs_dests_rm_dirs(dests); + if (!nr_dests) + return 0; + + dests_arr = kmalloc_array(nr_dests, sizeof(*dests_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!dests_arr) + return -ENOMEM; + dests->dests_arr = dests_arr; + + for (i = 0; i < nr_dests; i++) { + dest = damos_sysfs_dest_alloc(); + if (!dest) { + damos_sysfs_dests_rm_dirs(dests); + return -ENOMEM; + } + + err = kobject_init_and_add(&dest->kobj, + &damos_sysfs_dest_ktype, + &dests->kobj, "%d", i); + if (err) { + kobject_put(&dest->kobj); + damos_sysfs_dests_rm_dirs(dests); + return err; + } + + dests_arr[i] = dest; + dests->nr++; + } + return 0; +} + +static ssize_t nr_dests_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damos_sysfs_dests *dests = container_of(kobj, + struct damos_sysfs_dests, kobj); + + return sysfs_emit(buf, "%d\n", dests->nr); +} + +static ssize_t nr_dests_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damos_sysfs_dests *dests; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + dests = container_of(kobj, struct damos_sysfs_dests, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damos_sysfs_dests_add_dirs(dests, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damos_sysfs_dests_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damos_sysfs_dests, kobj)); +} + +static struct kobj_attribute damos_sysfs_dests_nr_attr = + __ATTR_RW_MODE(nr_dests, 0600); + +static struct attribute *damos_sysfs_dests_attrs[] = { + &damos_sysfs_dests_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damos_sysfs_dests); + +static const struct kobj_type damos_sysfs_dests_ktype = { + .release = damos_sysfs_dests_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damos_sysfs_dests_groups, +}; + +/* * scheme directory */ @@ -1584,20 +1870,55 @@ struct damon_sysfs_scheme { struct damon_sysfs_stats *stats; struct damon_sysfs_scheme_regions *tried_regions; int 
target_nid; + struct damos_sysfs_dests *dests; +}; + +struct damos_sysfs_action_name { + enum damos_action action; + char *name; }; -/* This should match with enum damos_action */ -static const char * const damon_sysfs_damos_action_strs[] = { - "willneed", - "cold", - "pageout", - "hugepage", - "nohugepage", - "lru_prio", - "lru_deprio", - "migrate_hot", - "migrate_cold", - "stat", +static struct damos_sysfs_action_name damos_sysfs_action_names[] = { + { + .action = DAMOS_WILLNEED, + .name = "willneed", + }, + { + .action = DAMOS_COLD, + .name = "cold", + }, + { + .action = DAMOS_PAGEOUT, + .name = "pageout", + }, + { + .action = DAMOS_HUGEPAGE, + .name = "hugepage", + }, + { + .action = DAMOS_NOHUGEPAGE, + .name = "nohugepage", + }, + { + .action = DAMOS_LRU_PRIO, + .name = "lru_prio", + }, + { + .action = DAMOS_LRU_DEPRIO, + .name = "lru_deprio", + }, + { + .action = DAMOS_MIGRATE_HOT, + .name = "migrate_hot", + }, + { + .action = DAMOS_MIGRATE_COLD, + .name = "migrate_cold", + }, + { + .action = DAMOS_STAT, + .name = "stat", + }, }; static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( @@ -1640,6 +1961,22 @@ out: return err; } +static int damos_sysfs_set_dests(struct damon_sysfs_scheme *scheme) +{ + struct damos_sysfs_dests *dests = damos_sysfs_dests_alloc(); + int err; + + if (!dests) + return -ENOMEM; + err = kobject_init_and_add(&dests->kobj, &damos_sysfs_dests_ktype, + &scheme->kobj, "dests"); + if (err) + kobject_put(&dests->kobj); + else + scheme->dests = dests; + return err; +} + static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme) { struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc(); @@ -1772,9 +2109,12 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) err = damon_sysfs_scheme_set_access_pattern(scheme); if (err) return err; - err = damon_sysfs_scheme_set_quotas(scheme); + err = damos_sysfs_set_dests(scheme); if (err) goto put_access_pattern_out; + err = damon_sysfs_scheme_set_quotas(scheme); + if (err) + goto put_dests_out; err = damon_sysfs_scheme_set_watermarks(scheme); if (err) goto put_quotas_access_pattern_out; @@ -1805,6 +2145,9 @@ put_watermarks_quotas_access_pattern_out: put_quotas_access_pattern_out: kobject_put(&scheme->quotas->kobj); scheme->quotas = NULL; +put_dests_out: + kobject_put(&scheme->dests->kobj); + scheme->dests = NULL; put_access_pattern_out: kobject_put(&scheme->access_pattern->kobj); scheme->access_pattern = NULL; @@ -1815,6 +2158,8 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) { damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); kobject_put(&scheme->access_pattern->kobj); + kobject_put(&scheme->dests->kobj); + damos_sysfs_dests_rm_dirs(scheme->dests); damon_sysfs_quotas_rm_dirs(scheme->quotas); kobject_put(&scheme->quotas->kobj); kobject_put(&scheme->watermarks->kobj); @@ -1834,9 +2179,16 @@ static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_scheme *scheme = container_of(kobj, struct damon_sysfs_scheme, kobj); + int i; - return sysfs_emit(buf, "%s\n", - damon_sysfs_damos_action_strs[scheme->action]); + for (i = 0; i < ARRAY_SIZE(damos_sysfs_action_names); i++) { + struct damos_sysfs_action_name *action_name; + + action_name = &damos_sysfs_action_names[i]; + if (action_name->action == scheme->action) + return sysfs_emit(buf, "%s\n", action_name->name); + } + return -EINVAL; } static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1844,11 +2196,14 @@ static ssize_t 
action_store(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_scheme *scheme = container_of(kobj, struct damon_sysfs_scheme, kobj); - enum damos_action action; + int i; + + for (i = 0; i < ARRAY_SIZE(damos_sysfs_action_names); i++) { + struct damos_sysfs_action_name *action_name; - for (action = 0; action < NR_DAMOS_ACTIONS; action++) { - if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) { - scheme->action = action; + action_name = &damos_sysfs_action_names[i]; + if (sysfs_streq(buf, action_name->name)) { + scheme->action = action_name->action; return count; } } @@ -2221,6 +2576,29 @@ void damos_sysfs_update_effective_quotas( } } +static int damos_sysfs_add_migrate_dest(struct damos *scheme, + struct damos_sysfs_dests *sysfs_dests) +{ + struct damos_migrate_dests *dests = &scheme->migrate_dests; + int i; + + dests->node_id_arr = kmalloc_array(sysfs_dests->nr, + sizeof(*dests->node_id_arr), GFP_KERNEL); + if (!dests->node_id_arr) + return -ENOMEM; + dests->weight_arr = kmalloc_array(sysfs_dests->nr, + sizeof(*dests->weight_arr), GFP_KERNEL); + if (!dests->weight_arr) + /* ->node_id_arr will be freed by scheme destruction */ + return -ENOMEM; + for (i = 0; i < sysfs_dests->nr; i++) { + dests->node_id_arr[i] = sysfs_dests->dests_arr[i]->id; + dests->weight_arr[i] = sysfs_dests->dests_arr[i]->weight; + } + dests->nr_dests = sysfs_dests->nr; + return 0; +} + static struct damos *damon_sysfs_mk_scheme( struct damon_sysfs_scheme *sysfs_scheme) { @@ -2283,6 +2661,11 @@ static struct damos *damon_sysfs_mk_scheme( damon_destroy_scheme(scheme); return NULL; } + err = damos_sysfs_add_migrate_dest(scheme, sysfs_scheme->dests); + if (err) { + damon_destroy_scheme(scheme); + return NULL; + } return scheme; } diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 1af6aff35d84..6d2b0dab50cb 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -811,11 +811,24 @@ static const struct kobj_type damon_sysfs_attrs_ktype = { * context directory */ -/* This should match with enum damon_ops_id */ -static const char * const damon_sysfs_ops_strs[] = { - "vaddr", - "fvaddr", - "paddr", +struct damon_sysfs_ops_name { + enum damon_ops_id ops_id; + char *name; +}; + +static const struct damon_sysfs_ops_name damon_sysfs_ops_names[] = { + { + .ops_id = DAMON_OPS_VADDR, + .name = "vaddr", + }, + { + .ops_id = DAMON_OPS_FVADDR, + .name = "fvaddr", + }, + { + .ops_id = DAMON_OPS_PADDR, + .name = "paddr", + }, }; struct damon_sysfs_context { @@ -934,14 +947,16 @@ static void damon_sysfs_context_rm_dirs(struct damon_sysfs_context *context) static ssize_t avail_operations_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - enum damon_ops_id id; int len = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(damon_sysfs_ops_names); i++) { + const struct damon_sysfs_ops_name *ops_name; - for (id = 0; id < NR_DAMON_OPS; id++) { - if (!damon_is_registered_ops(id)) + ops_name = &damon_sysfs_ops_names[i]; + if (!damon_is_registered_ops(ops_name->ops_id)) continue; - len += sysfs_emit_at(buf, len, "%s\n", - damon_sysfs_ops_strs[id]); + len += sysfs_emit_at(buf, len, "%s\n", ops_name->name); } return len; } @@ -951,8 +966,16 @@ static ssize_t operations_show(struct kobject *kobj, { struct damon_sysfs_context *context = container_of(kobj, struct damon_sysfs_context, kobj); + int i; + + for (i = 0; i < ARRAY_SIZE(damon_sysfs_ops_names); i++) { + const struct damon_sysfs_ops_name *ops_name; - return sysfs_emit(buf, "%s\n", damon_sysfs_ops_strs[context->ops_id]); + ops_name = 
&damon_sysfs_ops_names[i]; + if (ops_name->ops_id == context->ops_id) + return sysfs_emit(buf, "%s\n", ops_name->name); + } + return -EINVAL; } static ssize_t operations_store(struct kobject *kobj, @@ -960,11 +983,14 @@ static ssize_t operations_store(struct kobject *kobj, { struct damon_sysfs_context *context = container_of(kobj, struct damon_sysfs_context, kobj); - enum damon_ops_id id; + int i; + + for (i = 0; i < ARRAY_SIZE(damon_sysfs_ops_names); i++) { + const struct damon_sysfs_ops_name *ops_name; - for (id = 0; id < NR_DAMON_OPS; id++) { - if (sysfs_streq(buf, damon_sysfs_ops_strs[id])) { - context->ops_id = id; + ops_name = &damon_sysfs_ops_names[i]; + if (sysfs_streq(buf, ops_name->name)) { + context->ops_id = ops_name->ops_id; return count; } } @@ -1129,6 +1155,7 @@ struct damon_sysfs_kdamond { struct kobject kobj; struct damon_sysfs_contexts *contexts; struct damon_ctx *damon_ctx; + unsigned int refresh_ms; }; static struct damon_sysfs_kdamond *damon_sysfs_kdamond_alloc(void) @@ -1163,16 +1190,6 @@ static void damon_sysfs_kdamond_rm_dirs(struct damon_sysfs_kdamond *kdamond) kobject_put(&kdamond->contexts->kobj); } -static bool damon_sysfs_ctx_running(struct damon_ctx *ctx) -{ - bool running; - - mutex_lock(&ctx->kdamond_lock); - running = ctx->kdamond != NULL; - mutex_unlock(&ctx->kdamond_lock); - return running; -} - /* * enum damon_sysfs_cmd - Commands for a specific kdamond. */ @@ -1249,7 +1266,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, if (!ctx) running = false; else - running = damon_sysfs_ctx_running(ctx); + running = damon_is_running(ctx); return sysfs_emit(buf, "%s\n", running ? damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] : @@ -1279,18 +1296,6 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, return damon_set_attrs(ctx, &attrs); } -static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) -{ - struct damon_target *t, *next; - bool has_pid = damon_target_has_pid(ctx); - - damon_for_each_target_safe(t, next, ctx) { - if (has_pid) - put_pid(t->pid); - damon_destroy_target(t); - } -} - static int damon_sysfs_set_regions(struct damon_target *t, struct damon_sysfs_regions *sysfs_regions) { @@ -1325,7 +1330,6 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, struct damon_ctx *ctx) { struct damon_target *t = damon_new_target(); - int err = -EINVAL; if (!t) return -ENOMEM; @@ -1333,16 +1337,10 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, if (damon_target_has_pid(ctx)) { t->pid = find_get_pid(sys_target->pid); if (!t->pid) - goto destroy_targets_out; + /* caller will destroy targets */ + return -EINVAL; } - err = damon_sysfs_set_regions(t, sys_target->regions); - if (err) - goto destroy_targets_out; - return 0; - -destroy_targets_out: - damon_sysfs_destroy_targets(ctx); - return err; + return damon_sysfs_set_regions(t, sys_target->regions); } static int damon_sysfs_add_targets(struct damon_ctx *ctx, @@ -1364,21 +1362,6 @@ static int damon_sysfs_add_targets(struct damon_ctx *ctx, return 0; } -static void damon_sysfs_before_terminate(struct damon_ctx *ctx) -{ - struct damon_target *t, *next; - - if (!damon_target_has_pid(ctx)) - return; - - mutex_lock(&ctx->kdamond_lock); - damon_for_each_target_safe(t, next, ctx) { - put_pid(t->pid); - damon_destroy_target(t); - } - mutex_unlock(&ctx->kdamond_lock); -} - /* * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files. * @data: The kobject wrapper that associated to the kdamond thread. 
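The hunks above replace index-keyed string arrays with explicit {id, name} tables, so the sysfs strings no longer depend on enum ordering. A minimal sketch of the lookup this enables, reusing the damon_sysfs_ops_names table from the hunk above (hypothetical helper, not part of the patch):

static int damon_sysfs_ops_id_by_name(const char *buf, enum damon_ops_id *id)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(damon_sysfs_ops_names); i++) {
		const struct damon_sysfs_ops_name *ops_name = &damon_sysfs_ops_names[i];

		/* sysfs_streq() tolerates the trailing newline from "echo" */
		if (sysfs_streq(buf, ops_name->name)) {
			*id = ops_name->ops_id;
			return 0;
		}
	}
	return -EINVAL;
}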
@@ -1403,7 +1386,7 @@ static inline bool damon_sysfs_kdamond_running( struct damon_sysfs_kdamond *kdamond) { return kdamond->damon_ctx && - damon_sysfs_ctx_running(kdamond->damon_ctx); + damon_is_running(kdamond->damon_ctx); } static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, @@ -1450,13 +1433,11 @@ static int damon_sysfs_commit_input(void *data) test_ctx = damon_new_ctx(); err = damon_commit_ctx(test_ctx, param_ctx); if (err) { - damon_sysfs_destroy_targets(test_ctx); damon_destroy_ctx(test_ctx); goto out; } err = damon_commit_ctx(kdamond->damon_ctx, param_ctx); out: - damon_sysfs_destroy_targets(param_ctx); damon_destroy_ctx(param_ctx); return err; } @@ -1525,10 +1506,35 @@ static struct damon_ctx *damon_sysfs_build_ctx( return ERR_PTR(err); } - ctx->callback.before_terminate = damon_sysfs_before_terminate; return ctx; } +static int damon_sysfs_repeat_call_fn(void *data) +{ + struct damon_sysfs_kdamond *sysfs_kdamond = data; + static unsigned long next_update_jiffies; + + if (!sysfs_kdamond->refresh_ms) + return 0; + if (time_before(jiffies, next_update_jiffies)) + return 0; + next_update_jiffies = jiffies + + msecs_to_jiffies(sysfs_kdamond->refresh_ms); + + if (!mutex_trylock(&damon_sysfs_lock)) + return 0; + damon_sysfs_upd_tuned_intervals(sysfs_kdamond); + damon_sysfs_upd_schemes_stats(sysfs_kdamond); + damon_sysfs_upd_schemes_effective_quotas(sysfs_kdamond); + mutex_unlock(&damon_sysfs_lock); + return 0; +} + +static struct damon_call_control damon_sysfs_repeat_call_control = { + .fn = damon_sysfs_repeat_call_fn, + .repeat = true, +}; + static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) { struct damon_ctx *ctx; @@ -1553,6 +1559,9 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) return err; } kdamond->damon_ctx = ctx; + + damon_sysfs_repeat_call_control.data = kdamond; + damon_call(ctx, &damon_sysfs_repeat_call_control); return err; } @@ -1711,6 +1720,30 @@ out: return sysfs_emit(buf, "%d\n", pid); } +static ssize_t refresh_ms_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + + return sysfs_emit(buf, "%u\n", kdamond->refresh_ms); +} + +static ssize_t refresh_ms_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + unsigned int nr; + int err = kstrtouint(buf, 0, &nr); + + if (err) + return err; + + kdamond->refresh_ms = nr; + return count; +} + static void damon_sysfs_kdamond_release(struct kobject *kobj) { struct damon_sysfs_kdamond *kdamond = container_of(kobj, @@ -1727,9 +1760,13 @@ static struct kobj_attribute damon_sysfs_kdamond_state_attr = static struct kobj_attribute damon_sysfs_kdamond_pid_attr = __ATTR_RO_MODE(pid, 0400); +static struct kobj_attribute damon_sysfs_kdamond_refresh_ms_attr = + __ATTR_RW_MODE(refresh_ms, 0600); + static struct attribute *damon_sysfs_kdamond_attrs[] = { &damon_sysfs_kdamond_state_attr.attr, &damon_sysfs_kdamond_pid_attr.attr, + &damon_sysfs_kdamond_refresh_ms_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_kdamond); diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 298c67557fae..dfedfff19940 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -58,7 +58,7 @@ static void damon_test_target(struct kunit *test) damon_add_target(c, t); KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(c)); - 
damon_destroy_target(t); + damon_destroy_target(t, c); KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); damon_destroy_ctx(c); @@ -310,7 +310,7 @@ static void damon_test_set_regions(struct kunit *test) KUNIT_EXPECT_EQ(test, r->ar.start, expects[expect_idx++]); KUNIT_EXPECT_EQ(test, r->ar.end, expects[expect_idx++]); } - damon_destroy_target(t); + damon_destroy_target(t, NULL); } static void damon_test_nr_accesses_to_accesses_bp(struct kunit *test) diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index 7cd944266a92..d2b37ccf2cc0 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -149,7 +149,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test, KUNIT_EXPECT_EQ(test, r->ar.end, expected[i * 2 + 1]); } - damon_destroy_target(t); + damon_destroy_target(t, NULL); } /* diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 46554e49a478..94af19c4dfed 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -15,6 +15,7 @@ #include <linux/pagewalk.h> #include <linux/sched/mm.h> +#include "../internal.h" #include "ops-common.h" #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST @@ -610,6 +611,183 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) return max_nr_accesses; } +static bool damos_va_filter_young_match(struct damos_filter *filter, + struct folio *folio, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, pmd_t *pmdp) +{ + bool young = false; + + if (ptep) + young = pte_young(ptep_get(ptep)); + else if (pmdp) + young = pmd_young(pmdp_get(pmdp)); + + young = young || !folio_test_idle(folio) || + mmu_notifier_test_young(vma->vm_mm, addr); + + if (young && ptep) + damon_ptep_mkold(ptep, vma, addr); + else if (young && pmdp) + damon_pmdp_mkold(pmdp, vma, addr); + + return young == filter->matching; +} + +static bool damos_va_filter_out(struct damos *scheme, struct folio *folio, + struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pmd_t *pmdp) +{ + struct damos_filter *filter; + bool matched; + + if (scheme->core_filters_allowed) + return false; + + damos_for_each_ops_filter(filter, scheme) { + /* + * damos_folio_filter_match checks the young filter by doing an + * rmap on the folio to find its page table. However, being the + * vaddr scheme, we have direct access to the page tables, so + * use that instead. + */ + if (filter->type == DAMOS_FILTER_TYPE_YOUNG) + matched = damos_va_filter_young_match(filter, folio, + vma, addr, ptep, pmdp); + else + matched = damos_folio_filter_match(filter, folio); + + if (matched) + return !filter->allow; + } + return scheme->ops_filters_default_reject; +} + +struct damos_va_migrate_private { + struct list_head *migration_lists; + struct damos *scheme; +}; + +/* + * Place the given folio in the migration_list corresponding to where the folio + * should be migrated. + * + * The algorithm used here is similar to weighted_interleave_nid() + */ +static void damos_va_migrate_dests_add(struct folio *folio, + struct vm_area_struct *vma, unsigned long addr, + struct damos_migrate_dests *dests, + struct list_head *migration_lists) +{ + pgoff_t ilx; + int order; + unsigned int target; + unsigned int weight_total = 0; + int i; + + /* + * If dests is empty, there is only one migration list corresponding + * to s->target_nid. 
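 * Otherwise the destination index is picked like weighted_interleave_nid():
 * the folio's interleave index ilx is reduced modulo the total weight and
 * then walked through the per-destination weights. As an illustrative
 * example (values not from the patch), weights {3, 1} give weight_total = 4,
 * so ilx % 4 values 0-2 select dests[0] and value 3 selects dests[1].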
+ */ + if (!dests->nr_dests) { + i = 0; + goto isolate; + } + + order = folio_order(folio); + ilx = vma->vm_pgoff >> order; + ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); + + for (i = 0; i < dests->nr_dests; i++) + weight_total += dests->weight_arr[i]; + + /* If the total weights are somehow 0, don't migrate at all */ + if (!weight_total) + return; + + target = ilx % weight_total; + for (i = 0; i < dests->nr_dests; i++) { + if (target < dests->weight_arr[i]) + break; + target -= dests->weight_arr[i]; + } + +isolate: + if (!folio_isolate_lru(folio)) + return; + + list_add(&folio->lru, &migration_lists[i]); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int damos_va_migrate_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct damos_va_migrate_private *priv = walk->private; + struct list_head *migration_lists = priv->migration_lists; + struct damos *s = priv->scheme; + struct damos_migrate_dests *dests = &s->migrate_dests; + struct folio *folio; + spinlock_t *ptl; + pmd_t pmde; + + ptl = pmd_lock(walk->mm, pmd); + pmde = pmdp_get(pmd); + + if (!pmd_present(pmde) || !pmd_trans_huge(pmde)) + goto unlock; + + /* Tell page walk code to not split the PMD */ + walk->action = ACTION_CONTINUE; + + folio = damon_get_folio(pmd_pfn(pmde)); + if (!folio) + goto unlock; + + if (damos_va_filter_out(s, folio, walk->vma, addr, NULL, pmd)) + goto put_folio; + + damos_va_migrate_dests_add(folio, walk->vma, addr, dests, + migration_lists); + +put_folio: + folio_put(folio); +unlock: + spin_unlock(ptl); + return 0; +} +#else +#define damos_va_migrate_pmd_entry NULL +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static int damos_va_migrate_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct damos_va_migrate_private *priv = walk->private; + struct list_head *migration_lists = priv->migration_lists; + struct damos *s = priv->scheme; + struct damos_migrate_dests *dests = &s->migrate_dests; + struct folio *folio; + pte_t ptent; + + ptent = ptep_get(pte); + if (pte_none(ptent) || !pte_present(ptent)) + return 0; + + folio = damon_get_folio(pte_pfn(ptent)); + if (!folio) + return 0; + + if (damos_va_filter_out(s, folio, walk->vma, addr, pte, NULL)) + goto put_folio; + + damos_va_migrate_dests_add(folio, walk->vma, addr, dests, + migration_lists); + +put_folio: + folio_put(folio); + return 0; +} + /* * Functions for the target validity check and cleanup */ @@ -627,6 +805,11 @@ static bool damon_va_target_valid(struct damon_target *t) return false; } +static void damon_va_cleanup_target(struct damon_target *t) +{ + put_pid(t->pid); +} + #ifndef CONFIG_ADVISE_SYSCALLS static unsigned long damos_madvise(struct damon_target *target, struct damon_region *r, int behavior) @@ -653,6 +836,56 @@ static unsigned long damos_madvise(struct damon_target *target, } #endif /* CONFIG_ADVISE_SYSCALLS */ +static unsigned long damos_va_migrate(struct damon_target *target, + struct damon_region *r, struct damos *s, + unsigned long *sz_filter_passed) +{ + LIST_HEAD(folio_list); + struct damos_va_migrate_private priv; + struct mm_struct *mm; + int nr_dests; + int nid; + bool use_target_nid; + unsigned long applied = 0; + struct damos_migrate_dests *dests = &s->migrate_dests; + struct mm_walk_ops walk_ops = { + .pmd_entry = damos_va_migrate_pmd_entry, + .pte_entry = damos_va_migrate_pte_entry, + .walk_lock = PGWALK_RDLOCK, + }; + + use_target_nid = dests->nr_dests == 0; + nr_dests = use_target_nid ? 
1 : dests->nr_dests; + priv.scheme = s; + priv.migration_lists = kmalloc_array(nr_dests, + sizeof(*priv.migration_lists), GFP_KERNEL); + if (!priv.migration_lists) + return 0; + + for (int i = 0; i < nr_dests; i++) + INIT_LIST_HEAD(&priv.migration_lists[i]); + + + mm = damon_get_mm(target); + if (!mm) + goto free_lists; + + mmap_read_lock(mm); + walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv); + mmap_read_unlock(mm); + mmput(mm); + + for (int i = 0; i < nr_dests; i++) { + nid = use_target_nid ? s->target_nid : dests->node_id_arr[i]; + applied += damon_migrate_pages(&priv.migration_lists[i], nid); + cond_resched(); + } + +free_lists: + kfree(priv.migration_lists); + return applied * PAGE_SIZE; +} + static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme, unsigned long *sz_filter_passed) @@ -675,6 +908,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, case DAMOS_NOHUGEPAGE: madv_action = MADV_NOHUGEPAGE; break; + case DAMOS_MIGRATE_HOT: + case DAMOS_MIGRATE_COLD: + return damos_va_migrate(t, r, scheme, sz_filter_passed); case DAMOS_STAT: return 0; default: @@ -695,6 +931,10 @@ static int damon_va_scheme_score(struct damon_ctx *context, switch (scheme->action) { case DAMOS_PAGEOUT: return damon_cold_score(context, r, scheme); + case DAMOS_MIGRATE_HOT: + return damon_hot_score(context, r, scheme); + case DAMOS_MIGRATE_COLD: + return damon_cold_score(context, r, scheme); default: break; } @@ -711,6 +951,7 @@ static int __init damon_va_initcall(void) .prepare_access_checks = damon_va_prepare_access_checks, .check_accesses = damon_va_check_accesses, .target_valid = damon_va_target_valid, + .cleanup_target = damon_va_cleanup_target, .cleanup = NULL, .apply_scheme = damon_va_apply_scheme, .get_scheme_score = damon_va_scheme_score, diff --git a/mm/debug.c b/mm/debug.c index 907382257062..b4388f4dcd4d 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -129,47 +129,13 @@ static void __dump_folio(struct folio *folio, struct page *page, static void __dump_page(const struct page *page) { - struct folio *foliop, folio; - struct page precise; - unsigned long head; - unsigned long pfn = page_to_pfn(page); - unsigned long idx, nr_pages = 1; - int loops = 5; - -again: - memcpy(&precise, page, sizeof(*page)); - head = precise.compound_head; - if ((head & 1) == 0) { - foliop = (struct folio *)&precise; - idx = 0; - if (!folio_test_large(foliop)) - goto dump; - foliop = (struct folio *)page; - } else { - foliop = (struct folio *)(head - 1); - idx = folio_page_idx(foliop, page); - } + struct page_snapshot ps; - if (idx < MAX_FOLIO_NR_PAGES) { - memcpy(&folio, foliop, 2 * sizeof(struct page)); - nr_pages = folio_nr_pages(&folio); - if (nr_pages > 1) - memcpy(&folio.__page_2, &foliop->__page_2, - sizeof(struct page)); - foliop = &folio; - } - - if (idx > nr_pages) { - if (loops-- > 0) - goto again; + snapshot_page(&ps, page); + if (!snapshot_page_is_faithful(&ps)) pr_warn("page does not match folio\n"); - precise.compound_head &= ~1UL; - foliop = (struct folio *)&precise; - idx = 0; - } -dump: - __dump_folio(foliop, &precise, pfn, idx); + __dump_folio(&ps.folio_snapshot, &ps.page_snapshot, ps.pfn, ps.idx); } void dump_page(const struct page *page, const char *reason) @@ -290,7 +256,7 @@ void dump_vmg(const struct vma_merge_struct *vmg, const char *reason) vmg->vmi, vmg->vmi ? vma_iter_addr(vmg->vmi) : 0, vmg->vmi ? 
vma_iter_end(vmg->vmi) : 0, vmg->prev, vmg->middle, vmg->next, vmg->target, - vmg->start, vmg->end, vmg->flags, + vmg->start, vmg->end, vmg->vm_flags, vmg->file, vmg->anon_vma, vmg->policy, #ifdef CONFIG_USERFAULTFD vmg->uffd_ctx.ctx, diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 7731b238b534..d19031f275a3 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -20,7 +20,6 @@ #include <linux/mman.h> #include <linux/mm_types.h> #include <linux/module.h> -#include <linux/pfn_t.h> #include <linux/printk.h> #include <linux/pgtable.h> #include <linux/random.h> @@ -73,6 +72,8 @@ struct pgtable_debug_args { unsigned long fixed_pud_pfn; unsigned long fixed_pmd_pfn; unsigned long fixed_pte_pfn; + + swp_entry_t swp_entry; }; static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx) @@ -348,12 +349,6 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) vaddr &= HPAGE_PUD_MASK; pud = pfn_pud(args->pud_pfn, args->page_prot); - /* - * Some architectures have debug checks to make sure - * huge pud mapping are only found with devmap entries - * For now test with only devmap entries. - */ - pud = pud_mkdevmap(pud); set_pud_at(args->mm, vaddr, args->pudp, pud); flush_dcache_page(page); pudp_set_wrprotect(args->mm, vaddr, args->pudp); @@ -366,7 +361,6 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ pud = pfn_pud(args->pud_pfn, args->page_prot); - pud = pud_mkdevmap(pud); pud = pud_wrprotect(pud); pud = pud_mkclean(pud); set_pud_at(args->mm, vaddr, args->pudp, pud); @@ -384,7 +378,6 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) #endif /* __PAGETABLE_PMD_FOLDED */ pud = pfn_pud(args->pud_pfn, args->page_prot); - pud = pud_mkdevmap(pud); pud = pud_mkyoung(pud); set_pud_at(args->mm, vaddr, args->pudp, pud); flush_dcache_page(page); @@ -693,53 +686,6 @@ static void __init pmd_protnone_tests(struct pgtable_debug_args *args) static void __init pmd_protnone_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP -static void __init pte_devmap_tests(struct pgtable_debug_args *args) -{ - pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); - - pr_debug("Validating PTE devmap\n"); - WARN_ON(!pte_devmap(pte_mkdevmap(pte))); -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void __init pmd_devmap_tests(struct pgtable_debug_args *args) -{ - pmd_t pmd; - - if (!has_transparent_hugepage()) - return; - - pr_debug("Validating PMD devmap\n"); - pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); - WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd))); -} - -#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static void __init pud_devmap_tests(struct pgtable_debug_args *args) -{ - pud_t pud; - - if (!has_transparent_pud_hugepage()) - return; - - pr_debug("Validating PUD devmap\n"); - pud = pfn_pud(args->fixed_pud_pfn, args->page_prot); - WARN_ON(!pud_devmap(pud_mkdevmap(pud))); -} -#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } -#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -#else /* CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { } -static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -#else -static void __init pte_devmap_tests(struct pgtable_debug_args *args) { } -static void __init 
pmd_devmap_tests(struct pgtable_debug_args *args) { } -static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } -#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */ - static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args) { pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); @@ -754,12 +700,15 @@ static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args) static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); + pte_t pte; if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) return; pr_debug("Validating PTE swap soft dirty\n"); + pte = swp_entry_to_pte(args->swp_entry); + WARN_ON(!is_swap_pte(pte)); + WARN_ON(!pte_swp_soft_dirty(pte_swp_mksoft_dirty(pte))); WARN_ON(pte_swp_soft_dirty(pte_swp_clear_soft_dirty(pte))); } @@ -793,7 +742,9 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) return; pr_debug("Validating PMD swap soft dirty\n"); - pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); + pmd = swp_entry_to_pmd(args->swp_entry); + WARN_ON(!is_swap_pmd(pmd)); + WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd))); WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd))); } @@ -804,17 +755,11 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args) { - unsigned long max_swap_offset; swp_entry_t entry, entry2; pte_t pte; pr_debug("Validating PTE swap exclusive\n"); - - /* See generic_max_swapfile_size(): probe the maximum offset */ - max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL)))); - - /* Create a swp entry with all possible bits set */ - entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset); + entry = args->swp_entry; pte = swp_entry_to_pte(entry); WARN_ON(pte_swp_exclusive(pte)); @@ -838,30 +783,34 @@ static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args) static void __init pte_swap_tests(struct pgtable_debug_args *args) { - swp_entry_t swp; - pte_t pte; + swp_entry_t arch_entry; + pte_t pte1, pte2; pr_debug("Validating PTE swap\n"); - pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); - swp = __pte_to_swp_entry(pte); - pte = __swp_entry_to_pte(swp); - WARN_ON(args->fixed_pte_pfn != pte_pfn(pte)); + pte1 = swp_entry_to_pte(args->swp_entry); + WARN_ON(!is_swap_pte(pte1)); + + arch_entry = __pte_to_swp_entry(pte1); + pte2 = __swp_entry_to_pte(arch_entry); + WARN_ON(memcmp(&pte1, &pte2, sizeof(pte1))); } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION static void __init pmd_swap_tests(struct pgtable_debug_args *args) { - swp_entry_t swp; - pmd_t pmd; + swp_entry_t arch_entry; + pmd_t pmd1, pmd2; if (!has_transparent_hugepage()) return; pr_debug("Validating PMD swap\n"); - pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); - swp = __pmd_to_swp_entry(pmd); - pmd = __swp_entry_to_pmd(swp); - WARN_ON(args->fixed_pmd_pfn != pmd_pfn(pmd)); + pmd1 = swp_entry_to_pmd(args->swp_entry); + WARN_ON(!is_swap_pmd(pmd1)); + + arch_entry = __pmd_to_swp_entry(pmd1); + pmd2 = __swp_entry_to_pmd(arch_entry); + WARN_ON(memcmp(&pmd1, &pmd2, sizeof(pmd1))); } #else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */ static void __init pmd_swap_tests(struct pgtable_debug_args *args) { } @@ -1166,6 +1115,7 @@ static void __init init_fixed_pfns(struct pgtable_debug_args *args) static int __init init_args(struct pgtable_debug_args *args) { + unsigned long max_swap_offset; struct page *page = NULL; int ret = 0; 
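The swap tests above now start from a real swap entry, args->swp_entry, which the next init_args() hunk populates by probing how large a swap offset the architecture can encode. The idea behind that probe, as a standalone sketch (illustrative only, mirroring the expression added below):

static unsigned long __init probe_max_swap_offset(void)
{
	/*
	 * Encode an all-ones offset into an arch swap PTE and decode it
	 * again; whatever offset survives the round trip is the largest
	 * one the PTE format can represent, matching the logic of
	 * generic_max_swapfile_size().
	 */
	return swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL))));
}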
@@ -1248,6 +1198,11 @@ static int __init init_args(struct pgtable_debug_args *args) init_fixed_pfns(args); + /* See generic_max_swapfile_size(): probe the maximum offset */ + max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL)))); + /* Create a swp entry with all possible bits set */ + args->swp_entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset); + /* * Allocate (huge) pages because some of the tests need to access * the data in the pages. The corresponding tests will be skipped @@ -1333,10 +1288,6 @@ static int __init debug_vm_pgtable(void) pte_protnone_tests(&args); pmd_protnone_tests(&args); - pte_devmap_tests(&args); - pmd_devmap_tests(&args); - pud_devmap_tests(&args); - pte_soft_dirty_tests(&args); pmd_soft_dirty_tests(&args); pte_swap_soft_dirty_tests(&args); diff --git a/mm/dmapool.c b/mm/dmapool.c index 5be8cc1c6529..5d8af6e29127 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -200,7 +200,7 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block, /** - * dma_pool_create_node - Creates a pool of consistent memory blocks, for dma. + * dma_pool_create_node - Creates a pool of coherent DMA memory blocks. * @name: name of pool, for diagnostics * @dev: device that will be doing the DMA * @size: size of the blocks in this pool. @@ -210,7 +210,7 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block, * Context: not in_interrupt() * * Given one of these pools, dma_pool_alloc() - * may be used to allocate memory. Such memory will all have "consistent" + * may be used to allocate memory. Such memory will all have coherent * DMA mappings, accessible by the device and its driver without using * cache flushing primitives. The actual size of blocks allocated may be * larger than requested because of alignment. 
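The dmapool kernel-doc updates in this file only reword "consistent" as "coherent"; the API itself is unchanged. For context, a minimal usage sketch (hypothetical driver fragment; assumes a valid struct device *dev and <linux/dmapool.h>):

static int example_use_dma_pool(struct device *dev)
{
	struct dma_pool *pool;
	dma_addr_t handle;
	void *vaddr;

	/* 64-byte blocks, 8-byte aligned, no boundary restriction */
	pool = dma_pool_create("example-pool", dev, 64, 8, 0);
	if (!pool)
		return -ENOMEM;

	vaddr = dma_pool_alloc(pool, GFP_KERNEL, &handle);
	if (!vaddr) {
		dma_pool_destroy(pool);
		return -ENOMEM;
	}

	/* the device sees 'handle'; the CPU uses 'vaddr'; both stay coherent */

	dma_pool_free(pool, vaddr, handle);
	dma_pool_destroy(pool);
	return 0;
}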
@@ -395,7 +395,7 @@ void dma_pool_destroy(struct dma_pool *pool) EXPORT_SYMBOL(dma_pool_destroy); /** - * dma_pool_alloc - get a block of consistent memory + * dma_pool_alloc - get a block of coherent memory * @pool: dma pool that will produce the block * @mem_flags: GFP_* bitmask * @handle: pointer to dma address of block diff --git a/mm/execmem.c b/mm/execmem.c index 9720ac2dfa41..627e6cf64f4f 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -26,7 +26,7 @@ static struct execmem_info default_execmem_info __ro_after_init; #ifdef CONFIG_MMU static void *execmem_vmalloc(struct execmem_range *range, size_t size, - pgprot_t pgprot, unsigned long vm_flags) + pgprot_t pgprot, vm_flags_t vm_flags) { bool kasan = range->flags & EXECMEM_KASAN_SHADOW; gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN; @@ -82,7 +82,7 @@ struct vm_struct *execmem_vmap(size_t size) } #else static void *execmem_vmalloc(struct execmem_range *range, size_t size, - pgprot_t pgprot, unsigned long vm_flags) + pgprot_t pgprot, vm_flags_t vm_flags) { return vmalloc(size); } @@ -254,37 +254,9 @@ out_unlock: return ptr; } -static bool execmem_cache_rox = false; - -void execmem_cache_make_ro(void) -{ - struct maple_tree *free_areas = &execmem_cache.free_areas; - struct maple_tree *busy_areas = &execmem_cache.busy_areas; - MA_STATE(mas_free, free_areas, 0, ULONG_MAX); - MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX); - struct mutex *mutex = &execmem_cache.mutex; - void *area; - - execmem_cache_rox = true; - - mutex_lock(mutex); - - mas_for_each(&mas_free, area, ULONG_MAX) { - unsigned long pages = mas_range_len(&mas_free) >> PAGE_SHIFT; - set_memory_ro(mas_free.index, pages); - } - - mas_for_each(&mas_busy, area, ULONG_MAX) { - unsigned long pages = mas_range_len(&mas_busy) >> PAGE_SHIFT; - set_memory_ro(mas_busy.index, pages); - } - - mutex_unlock(mutex); -} - static int execmem_cache_populate(struct execmem_range *range, size_t size) { - unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; + vm_flags_t vm_flags = VM_ALLOW_HUGE_VMAP; struct vm_struct *vm; size_t alloc_size; int err = -ENOMEM; @@ -302,15 +274,9 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) /* fill memory with instructions that will trap */ execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); - if (execmem_cache_rox) { - err = set_memory_rox((unsigned long)p, vm->nr_pages); - if (err) - goto err_free_mem; - } else { - err = set_memory_x((unsigned long)p, vm->nr_pages); - if (err) - goto err_free_mem; - } + err = set_memory_rox((unsigned long)p, vm->nr_pages); + if (err) + goto err_free_mem; err = execmem_cache_add(p, alloc_size); if (err) @@ -407,7 +373,7 @@ void *execmem_alloc(enum execmem_type type, size_t size) { struct execmem_range *range = &execmem_info->ranges[type]; bool use_cache = range->flags & EXECMEM_ROX_CACHE; - unsigned long vm_flags = VM_FLUSH_RESET_PERMS; + vm_flags_t vm_flags = VM_FLUSH_RESET_PERMS; pgprot_t pgprot = range->pgprot; void *p; diff --git a/mm/filemap.c b/mm/filemap.c index bada249b9fb7..751838ef05e5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1778,8 +1778,9 @@ pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { XA_STATE(xas, &mapping->i_pages, index); + unsigned long nr = max_scan; - while (max_scan--) { + while (nr--) { void *entry = xas_next(&xas); if (!entry || xa_is_value(entry)) return xas.xa_index; @@ -3215,8 +3216,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) struct address_space *mapping = file->f_mapping; 
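/*
 * The VM_EXEC branch added below rounds the readahead window to the
 * preferred executable folio order. Illustrative example (hypothetical
 * values): with exec_folio_order() == 4, vm_pgoff == 0, vmf->pgoff == 35
 * and ra_pages == 32, ra->start = round_down(35, 16) = 32 and the end is
 * round_up(32 + 32, 16) = 64, both clamped to the VMA, giving a 32-page
 * window with async_size == 0.
 */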
DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff); struct file *fpin = NULL; - unsigned long vm_flags = vmf->vma->vm_flags; - unsigned int mmap_miss; + vm_flags_t vm_flags = vmf->vma->vm_flags; + unsigned short mmap_miss; #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Use the readahead code, even if readahead is disabled */ @@ -3231,13 +3232,17 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) if (!(vm_flags & VM_RAND_READ)) ra->size *= 2; ra->async_size = HPAGE_PMD_NR; - page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER); + ra->order = HPAGE_PMD_ORDER; + page_cache_ra_order(&ractl, ra); return fpin; } #endif - /* If we don't want any read-ahead, don't bother */ - if (vm_flags & VM_RAND_READ) + /* + * If we don't want any read-ahead, don't bother. VM_EXEC case below is + * already intended for random access. + */ + if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ) return fpin; if (!ra->ra_pages) return fpin; @@ -3260,15 +3265,43 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) if (mmap_miss > MMAP_LOTSAMISS) return fpin; - /* - * mmap read-around - */ + if (vm_flags & VM_EXEC) { + /* + * Allow arch to request a preferred minimum folio order for + * executable memory. This can often be beneficial to + * performance if (e.g.) arm64 can contpte-map the folio. + * Executable memory rarely benefits from readahead, due to its + * random access nature, so set async_size to 0. + * + * Limit to the boundaries of the VMA to avoid reading in any + * pad that might exist between sections, which would be a waste + * of memory. + */ + struct vm_area_struct *vma = vmf->vma; + unsigned long start = vma->vm_pgoff; + unsigned long end = start + vma_pages(vma); + unsigned long ra_end; + + ra->order = exec_folio_order(); + ra->start = round_down(vmf->pgoff, 1UL << ra->order); + ra->start = max(ra->start, start); + ra_end = round_up(ra->start + ra->ra_pages, 1UL << ra->order); + ra_end = min(ra_end, end); + ra->size = ra_end - ra->start; + ra->async_size = 0; + } else { + /* + * mmap read-around + */ + ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); + ra->size = ra->ra_pages; + ra->async_size = ra->ra_pages / 4; + ra->order = 0; + } + fpin = maybe_unlock_mmap_for_io(vmf, fpin); - ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); - ra->size = ra->ra_pages; - ra->async_size = ra->ra_pages / 4; ractl._index = ra->start; - page_cache_ra_order(&ractl, ra, 0); + page_cache_ra_order(&ractl, ra); return fpin; } @@ -3284,7 +3317,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, struct file_ra_state *ra = &file->f_ra; DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff); struct file *fpin = NULL; - unsigned int mmap_miss; + unsigned short mmap_miss; /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) @@ -3604,7 +3637,7 @@ skip: static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, unsigned long addr, unsigned int nr_pages, - unsigned long *rss, unsigned int *mmap_miss) + unsigned long *rss, unsigned short *mmap_miss) { vm_fault_t ret = 0; struct page *page = folio_page(folio, start); @@ -3666,7 +3699,7 @@ skip: static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, struct folio *folio, unsigned long addr, - unsigned long *rss, unsigned int *mmap_miss) + unsigned long *rss, unsigned short *mmap_miss) { vm_fault_t ret = 0; struct page *page = &folio->page; @@ -3708,7 +3741,8 @@ vm_fault_t filemap_map_pages(struct 
vm_fault *vmf, struct folio *folio; vm_fault_t ret = 0; unsigned long rss = 0; - unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type; + unsigned int nr_pages = 0, folio_type; + unsigned short mmap_miss = 0, mmap_miss_saved; rcu_read_lock(); folio = next_uptodate_folio(&xas, mapping, end_pgoff); @@ -3814,6 +3848,18 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } +int generic_file_mmap_prepare(struct vm_area_desc *desc) +{ + struct file *file = desc->file; + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->read_folio) + return -ENOEXEC; + file_accessed(file); + desc->vm_ops = &generic_file_vm_ops; + return 0; +} + /* * This is for filesystems which do not implement ->writepage. */ @@ -3823,6 +3869,13 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; return generic_file_mmap(file, vma); } + +int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc) +{ + if (is_shared_maywrite(desc->vm_flags)) + return -EINVAL; + return generic_file_mmap_prepare(desc); +} #else vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { @@ -3832,15 +3885,25 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } +int generic_file_mmap_prepare(struct vm_area_desc *desc) +{ + return -ENOSYS; +} int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } +int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc) +{ + return -ENOSYS; +} #endif /* CONFIG_MMU */ EXPORT_SYMBOL(filemap_page_mkwrite); EXPORT_SYMBOL(generic_file_mmap); +EXPORT_SYMBOL(generic_file_mmap_prepare); EXPORT_SYMBOL(generic_file_readonly_mmap); +EXPORT_SYMBOL(generic_file_readonly_mmap_prepare); static struct folio *do_read_cache_folio(struct address_space *mapping, pgoff_t index, filler_t filler, struct file *file, gfp_t gfp) @@ -4109,7 +4172,7 @@ retry: break; } - status = a_ops->write_begin(file, mapping, pos, bytes, + status = a_ops->write_begin(iocb, mapping, pos, bytes, &folio, &fsdata); if (unlikely(status < 0)) break; @@ -4130,7 +4193,7 @@ retry: copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); flush_dcache_folio(folio); - status = a_ops->write_end(file, mapping, pos, bytes, copied, + status = a_ops->write_end(iocb, mapping, pos, bytes, copied, folio, fsdata); if (unlikely(status != copied)) { iov_iter_revert(i, copied - max(status, 0L)); @@ -64,11 +64,11 @@ static inline void sanity_check_pinned_pages(struct page **pages, !folio_test_anon(folio)) continue; if (!folio_test_large(folio) || folio_test_hugetlb(folio)) - VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page); + VM_WARN_ON_ONCE_FOLIO(!PageAnonExclusive(&folio->page), folio); else /* Either a PTE-mapped or a PMD-mapped THP. */ - VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) && - !PageAnonExclusive(page), page); + VM_WARN_ON_ONCE_PAGE(!PageAnonExclusive(&folio->page) && + !PageAnonExclusive(page), page); } } @@ -679,31 +679,9 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma, return NULL; pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; - - if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) && - pud_devmap(pud)) { - /* - * device mapped pages can only be returned if the caller - * will manage the page reference count. 
- * - * At least one of FOLL_GET | FOLL_PIN must be set, so - * assert that here: - */ - if (!(flags & (FOLL_GET | FOLL_PIN))) - return ERR_PTR(-EEXIST); - - if (flags & FOLL_TOUCH) - touch_pud(vma, addr, pudp, flags & FOLL_WRITE); - - ctx->pgmap = get_dev_pagemap(pfn, ctx->pgmap); - if (!ctx->pgmap) - return ERR_PTR(-EFAULT); - } - page = pfn_to_page(pfn); - if (!pud_devmap(pud) && !pud_write(pud) && - gup_must_unshare(vma, flags, page)) + if (!pud_write(pud) && gup_must_unshare(vma, flags, page)) return ERR_PTR(-EMLINK); ret = try_grab_folio(page_folio(page), 1, flags); @@ -760,8 +738,8 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma, if (!pmd_write(pmdval) && gup_must_unshare(vma, flags, page)) return ERR_PTR(-EMLINK); - VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && - !PageAnonExclusive(page), page); + VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) && + !PageAnonExclusive(page), page); ret = try_grab_folio(page_folio(page), 1, flags); if (ret) @@ -857,8 +835,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, page = vm_normal_page(vma, address, pte); /* - * We only care about anon pages in can_follow_write_pte() and don't - * have to worry about pte_devmap() because they are never anon. + * We only care about anon pages in can_follow_write_pte(). */ if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, page, vma, flags)) { @@ -866,18 +843,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, goto out; } - if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { - /* - * Only return device mapping pages in the FOLL_GET or FOLL_PIN - * case since they are only valid while holding the pgmap - * reference. - */ - *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); - if (*pgmap) - page = pte_page(pte); - else - goto no_page; - } else if (unlikely(!page)) { + if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); @@ -899,8 +865,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, goto out; } - VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && - !PageAnonExclusive(page), page); + VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) && + !PageAnonExclusive(page), page); /* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. 
*/ ret = try_grab_folio(folio, 1, flags); @@ -959,14 +925,6 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, return no_page_table(vma, flags, address); if (!pmd_present(pmdval)) return no_page_table(vma, flags, address); - if (pmd_devmap(pmdval)) { - ptl = pmd_lock(mm, pmd); - page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); - spin_unlock(ptl); - if (page) - return page; - return no_page_table(vma, flags, address); - } if (likely(!pmd_leaf(pmdval))) return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); @@ -1180,7 +1138,7 @@ static int faultin_page(struct vm_area_struct *vma, if (unshare) { fault_flags |= FAULT_FLAG_UNSHARE; /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */ - VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE); + VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_WRITE); } ret = handle_mm_fault(vma, address, fault_flags, NULL); @@ -1760,10 +1718,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, } /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */ - if (!*locked) { - BUG_ON(ret < 0); - BUG_ON(ret >= nr_pages); - } + VM_WARN_ON_ONCE(!*locked && (ret < 0 || ret >= nr_pages)); if (ret > 0) { nr_pages -= ret; @@ -1808,7 +1763,6 @@ retry: ret = mmap_read_lock_killable(mm); if (ret) { - BUG_ON(ret > 0); if (!pages_done) pages_done = ret; break; @@ -1819,11 +1773,11 @@ retry: pages, locked); if (!*locked) { /* Continue to retry until we succeeded */ - BUG_ON(ret != 0); + VM_WARN_ON_ONCE(ret != 0); goto retry; } if (ret != 1) { - BUG_ON(ret > 1); + VM_WARN_ON_ONCE(ret > 1); if (!pages_done) pages_done = ret; break; @@ -1885,10 +1839,10 @@ long populate_vma_page_range(struct vm_area_struct *vma, int gup_flags; long ret; - VM_BUG_ON(!PAGE_ALIGNED(start)); - VM_BUG_ON(!PAGE_ALIGNED(end)); - VM_BUG_ON_VMA(start < vma->vm_start, vma); - VM_BUG_ON_VMA(end > vma->vm_end, vma); + VM_WARN_ON_ONCE(!PAGE_ALIGNED(start)); + VM_WARN_ON_ONCE(!PAGE_ALIGNED(end)); + VM_WARN_ON_ONCE_VMA(start < vma->vm_start, vma); + VM_WARN_ON_ONCE_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); /* @@ -1957,8 +1911,8 @@ long faultin_page_range(struct mm_struct *mm, unsigned long start, int gup_flags; long ret; - VM_BUG_ON(!PAGE_ALIGNED(start)); - VM_BUG_ON(!PAGE_ALIGNED(end)); + VM_WARN_ON_ONCE(!PAGE_ALIGNED(start)); + VM_WARN_ON_ONCE(!PAGE_ALIGNED(end)); mmap_assert_locked(mm); /* @@ -2048,7 +2002,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, { struct vm_area_struct *vma; bool must_unlock = false; - unsigned long vm_flags; + vm_flags_t vm_flags; long i; if (!nr_pages) @@ -2300,27 +2254,51 @@ static void pofs_unpin(struct pages_or_folios *pofs) unpin_user_pages(pofs->pages, pofs->nr_entries); } +static struct folio *pofs_next_folio(struct folio *folio, + struct pages_or_folios *pofs, long *index_ptr) +{ + long i = *index_ptr + 1; + + if (!pofs->has_folios && folio_test_large(folio)) { + const unsigned long start_pfn = folio_pfn(folio); + const unsigned long end_pfn = start_pfn + folio_nr_pages(folio); + + for (; i < pofs->nr_entries; i++) { + unsigned long pfn = page_to_pfn(pofs->pages[i]); + + /* Is this page part of this folio? */ + if (pfn < start_pfn || pfn >= end_pfn) + break; + } + } + + if (unlikely(i == pofs->nr_entries)) + return NULL; + *index_ptr = i; + + return pofs_get_folio(pofs, i); +} + /* * Returns the number of collected folios. Return value is always >= 0. 
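To make the PFN-range skip in pofs_next_folio() above concrete, here is a standalone sketch (illustrative names, not kernel code): starting from entry i, it advances past every page that still falls inside the folio spanning [start_pfn, start_pfn + nr_pages), so a large folio whose subpages all sit in the array is visited exactly once, without the old prev_folio comparison.

static long skip_same_folio(const unsigned long *pfns, long nr_entries, long i,
			    unsigned long start_pfn, unsigned long nr_pages)
{
	for (i = i + 1; i < nr_entries; i++) {
		/* Stop at the first entry outside the current folio. */
		if (pfns[i] < start_pfn || pfns[i] >= start_pfn + nr_pages)
			break;
	}
	return i;	/* == nr_entries once the array is exhausted */
}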
*/ -static void collect_longterm_unpinnable_folios( +static unsigned long collect_longterm_unpinnable_folios( struct list_head *movable_folio_list, struct pages_or_folios *pofs) { - struct folio *prev_folio = NULL; + unsigned long collected = 0; bool drain_allow = true; - unsigned long i; - - for (i = 0; i < pofs->nr_entries; i++) { - struct folio *folio = pofs_get_folio(pofs, i); + struct folio *folio; + long i = 0; - if (folio == prev_folio) - continue; - prev_folio = folio; + for (folio = pofs_get_folio(pofs, i); folio; + folio = pofs_next_folio(folio, pofs, &i)) { if (folio_is_longterm_pinnable(folio)) continue; + collected++; + if (folio_is_device_coherent(folio)) continue; @@ -2342,6 +2320,8 @@ static void collect_longterm_unpinnable_folios( NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } + + return collected; } /* @@ -2418,9 +2398,11 @@ static long check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs) { LIST_HEAD(movable_folio_list); + unsigned long collected; - collect_longterm_unpinnable_folios(&movable_folio_list, pofs); - if (list_empty(&movable_folio_list)) + collected = collect_longterm_unpinnable_folios(&movable_folio_list, + pofs); + if (!collected) return 0; return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs); @@ -2822,9 +2804,9 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags) return false; /* Anonymous folios pose no problem. */ - mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS; + mapping_flags = (unsigned long)mapping & FOLIO_MAPPING_FLAGS; if (mapping_flags) - return mapping_flags & PAGE_MAPPING_ANON; + return mapping_flags & FOLIO_MAPPING_ANON; /* * At this point, we know the mapping is non-null and points to an @@ -2872,7 +2854,7 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, int *nr) { struct dev_pagemap *pgmap = NULL; - int nr_start = *nr, ret = 0; + int ret = 0; pte_t *ptep, *ptem; ptem = ptep = pte_offset_map(&pmd, addr); @@ -2896,19 +2878,11 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, if (!pte_access_permitted(pte, flags & FOLL_WRITE)) goto pte_unmap; - if (pte_devmap(pte)) { - if (unlikely(flags & FOLL_LONGTERM)) - goto pte_unmap; - - pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); - if (unlikely(!pgmap)) { - gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); - goto pte_unmap; - } - } else if (pte_special(pte)) + if (pte_special(pte)) goto pte_unmap; - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + /* If it's not marked as special it must have a valid memmap. 
*/ + VM_WARN_ON_ONCE(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); folio = try_grab_folio_fast(page, 1, flags); @@ -2976,91 +2950,6 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, } #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ -#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, int *nr) -{ - int nr_start = *nr; - struct dev_pagemap *pgmap = NULL; - - do { - struct folio *folio; - struct page *page = pfn_to_page(pfn); - - pgmap = get_dev_pagemap(pfn, pgmap); - if (unlikely(!pgmap)) { - gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); - break; - } - - folio = try_grab_folio_fast(page, 1, flags); - if (!folio) { - gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); - break; - } - folio_set_referenced(folio); - pages[*nr] = page; - (*nr)++; - pfn++; - } while (addr += PAGE_SIZE, addr != end); - - put_dev_pagemap(pgmap); - return addr == end; -} - -static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, - int *nr) -{ - unsigned long fault_pfn; - int nr_start = *nr; - - fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr)) - return 0; - - if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { - gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); - return 0; - } - return 1; -} - -static int gup_fast_devmap_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, - int *nr) -{ - unsigned long fault_pfn; - int nr_start = *nr; - - fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr)) - return 0; - - if (unlikely(pud_val(orig) != pud_val(*pudp))) { - gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); - return 0; - } - return 1; -} -#else -static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, - int *nr) -{ - BUILD_BUG(); - return 0; -} - -static int gup_fast_devmap_pud_leaf(pud_t pud, pud_t *pudp, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, - int *nr) -{ - BUILD_BUG(); - return 0; -} -#endif - static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) @@ -3075,13 +2964,6 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, if (pmd_special(orig)) return 0; - if (pmd_devmap(orig)) { - if (unlikely(flags & FOLL_LONGTERM)) - return 0; - return gup_fast_devmap_pmd_leaf(orig, pmdp, addr, end, flags, - pages, nr); - } - page = pmd_page(orig); refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr); @@ -3122,13 +3004,6 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, if (pud_special(orig)) return 0; - if (pud_devmap(orig)) { - if (unlikely(flags & FOLL_LONGTERM)) - return 0; - return gup_fast_devmap_pud_leaf(orig, pudp, addr, end, flags, - pages, nr); - } - page = pud_page(orig); refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr); @@ -183,6 +183,7 @@ static inline unsigned long hmm_pfn_flags_order(unsigned long order) return order << HMM_PFN_ORDER_SHIFT; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline unsigned long 
pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) { @@ -193,7 +194,6 @@ static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range, hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, unsigned long end, unsigned long hmm_pfns[], pmd_t pmd) @@ -302,13 +302,10 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, goto fault; /* - * Bypass devmap pte such as DAX page when all pfn requested - * flags(pfn_req_flags) are fulfilled. * Since each architecture defines a struct page for the zero page, just * fall through and treat it like a normal page. */ if (!vm_normal_page(walk->vma, addr, pte) && - !pte_devmap(pte) && !is_zero_pfn(pte_pfn(pte))) { if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) { pte_unmap(ptep); @@ -363,7 +360,7 @@ again: return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); } - if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { + if (pmd_trans_huge(pmd)) { /* * No need to take pmd_lock here, even if some other thread * is splitting the huge pmd we will get that event through @@ -374,7 +371,7 @@ again: * values. */ pmd = pmdp_get_lockless(pmdp); - if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) + if (!pmd_trans_huge(pmd)) goto again; return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd); @@ -408,8 +405,7 @@ again: return 0; } -#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \ - defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) +#if defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) { @@ -441,7 +437,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, return hmm_vma_walk_hole(start, end, -1, walk); } - if (pud_leaf(pud) && pud_devmap(pud)) { + if (pud_leaf(pud)) { unsigned long i, npages, pfn; unsigned int required_fault; unsigned long *hmm_pfns; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d3e66136e41a..9c38a95e9f09 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -22,7 +22,6 @@ #include <linux/mm_types.h> #include <linux/khugepaged.h> #include <linux/freezer.h> -#include <linux/pfn_t.h> #include <linux/mman.h> #include <linux/memremap.h> #include <linux/pagemap.h> @@ -99,7 +98,7 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) } unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, + vm_flags_t vm_flags, unsigned long tva_flags, unsigned long orders) { @@ -166,7 +165,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, * own flags. 
*/ if (!in_pf && shmem_file(vma->vm_file)) - return shmem_allowable_huge_orders(file_inode(vma->vm_file), + return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file), vma, vma->vm_pgoff, 0, !enforce_sysfs); @@ -1372,9 +1371,17 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return __do_huge_pmd_anonymous_page(vmf); } -static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write, - pgtable_t pgtable) +struct folio_or_pfn { + union { + struct folio *folio; + unsigned long pfn; + }; + bool is_folio; +}; + +static int insert_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot, + bool write, pgtable_t pgtable) { struct mm_struct *mm = vma->vm_mm; pmd_t entry; @@ -1382,8 +1389,11 @@ static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, lockdep_assert_held(pmd_lockptr(mm, pmd)); if (!pmd_none(*pmd)) { + const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) : + fop.pfn; + if (write) { - if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) { + if (pmd_pfn(*pmd) != pfn) { WARN_ON_ONCE(!is_huge_zero_pmd(*pmd)); return -EEXIST; } @@ -1396,11 +1406,16 @@ static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, return -EEXIST; } - entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); - if (pfn_t_devmap(pfn)) - entry = pmd_mkdevmap(entry); - else + if (fop.is_folio) { + entry = folio_mk_pmd(fop.folio, vma->vm_page_prot); + + folio_get(fop.folio); + folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma); + add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR); + } else { + entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot)); entry = pmd_mkspecial(entry); + } if (write) { entry = pmd_mkyoung(pmd_mkdirty(entry)); entry = maybe_pmd_mkwrite(entry, vma); @@ -1426,11 +1441,15 @@ static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, * * Return: vm_fault_t value. */ -vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) +vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn, + bool write) { unsigned long addr = vmf->address & PMD_MASK; struct vm_area_struct *vma = vmf->vma; pgprot_t pgprot = vma->vm_page_prot; + struct folio_or_pfn fop = { + .pfn = pfn, + }; pgtable_t pgtable = NULL; spinlock_t *ptl; int error; @@ -1440,8 +1459,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) * but we need to be consistent with PTEs and architectures that * can't support a 'special' bit. 
*/ - BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && - !pfn_t_devmap(pfn)); + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); @@ -1455,11 +1473,11 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) return VM_FAULT_OOM; } - pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot); + pfnmap_setup_cachemode_pfn(pfn, &pgprot); ptl = pmd_lock(vma->vm_mm, vmf->pmd); - error = insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, - pgtable); + error = insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write, + pgtable); spin_unlock(ptl); if (error && pgtable) pte_free(vma->vm_mm, pgtable); @@ -1474,6 +1492,10 @@ vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address & PMD_MASK; struct mm_struct *mm = vma->vm_mm; + struct folio_or_pfn fop = { + .folio = folio, + .is_folio = true, + }; spinlock_t *ptl; pgtable_t pgtable = NULL; int error; @@ -1491,14 +1513,8 @@ vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, } ptl = pmd_lock(mm, vmf->pmd); - if (pmd_none(*vmf->pmd)) { - folio_get(folio); - folio_add_file_rmap_pmd(folio, &folio->page, vma); - add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR); - } - error = insert_pfn_pmd(vma, addr, vmf->pmd, - pfn_to_pfn_t(folio_pfn(folio)), vma->vm_page_prot, - write, pgtable); + error = insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot, + write, pgtable); spin_unlock(ptl); if (error && pgtable) pte_free(mm, pgtable); @@ -1515,16 +1531,18 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) return pud; } -static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, pfn_t pfn, bool write) +static void insert_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write) { struct mm_struct *mm = vma->vm_mm; - pgprot_t prot = vma->vm_page_prot; pud_t entry; if (!pud_none(*pud)) { + const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) : + fop.pfn; + if (write) { - if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn))) + if (WARN_ON_ONCE(pud_pfn(*pud) != pfn)) return; entry = pud_mkyoung(*pud); entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); @@ -1534,11 +1552,16 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, return; } - entry = pud_mkhuge(pfn_t_pud(pfn, prot)); - if (pfn_t_devmap(pfn)) - entry = pud_mkdevmap(entry); - else + if (fop.is_folio) { + entry = folio_mk_pud(fop.folio, vma->vm_page_prot); + + folio_get(fop.folio); + folio_add_file_rmap_pud(fop.folio, &fop.folio->page, vma); + add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PUD_NR); + } else { + entry = pud_mkhuge(pfn_pud(fop.pfn, prot)); entry = pud_mkspecial(entry); + } if (write) { entry = pud_mkyoung(pud_mkdirty(entry)); entry = maybe_pud_mkwrite(entry, vma); @@ -1557,11 +1580,15 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, * * Return: vm_fault_t value. 
*/ -vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) +vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn, + bool write) { unsigned long addr = vmf->address & PUD_MASK; struct vm_area_struct *vma = vmf->vma; pgprot_t pgprot = vma->vm_page_prot; + struct folio_or_pfn fop = { + .pfn = pfn, + }; spinlock_t *ptl; /* @@ -1569,8 +1596,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) * but we need to be consistent with PTEs and architectures that * can't support a 'special' bit. */ - BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && - !pfn_t_devmap(pfn)); + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); @@ -1578,10 +1604,10 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; - pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot); + pfnmap_setup_cachemode_pfn(pfn, &pgprot); ptl = pud_lock(vma->vm_mm, vmf->pud); - insert_pfn_pud(vma, addr, vmf->pud, pfn, write); + insert_pud(vma, addr, vmf->pud, fop, pgprot, write); spin_unlock(ptl); return VM_FAULT_NOPAGE; @@ -1603,6 +1629,10 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, unsigned long addr = vmf->address & PUD_MASK; pud_t *pud = vmf->pud; struct mm_struct *mm = vma->vm_mm; + struct folio_or_pfn fop = { + .folio = folio, + .is_folio = true, + }; spinlock_t *ptl; if (addr < vma->vm_start || addr >= vma->vm_end) @@ -1612,20 +1642,7 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, return VM_FAULT_SIGBUS; ptl = pud_lock(mm, pud); - - /* - * If there is already an entry present we assume the folio is - * already mapped, hence no need to take another reference. We - * still call insert_pfn_pud() though in case the mapping needs - * upgrading to writeable. - */ - if (pud_none(*vmf->pud)) { - folio_get(folio); - folio_add_file_rmap_pud(folio, &folio->page, vma); - add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR); - } - insert_pfn_pud(vma, addr, vmf->pud, pfn_to_pfn_t(folio_pfn(folio)), - write); + insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write); spin_unlock(ptl); return VM_FAULT_NOPAGE; @@ -1646,46 +1663,6 @@ void touch_pmd(struct vm_area_struct *vma, unsigned long addr, update_mmu_cache_pmd(vma, addr, pmd); } -struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, int flags, struct dev_pagemap **pgmap) -{ - unsigned long pfn = pmd_pfn(*pmd); - struct mm_struct *mm = vma->vm_mm; - struct page *page; - int ret; - - assert_spin_locked(pmd_lockptr(mm, pmd)); - - if (flags & FOLL_WRITE && !pmd_write(*pmd)) - return NULL; - - if (pmd_present(*pmd) && pmd_devmap(*pmd)) - /* pass */; - else - return NULL; - - if (flags & FOLL_TOUCH) - touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); - - /* - * device mapped pages can only be returned if the - * caller will manage the page reference count. 
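With pfn_t dropped from these paths, vmf_insert_pfn_pmd()/vmf_insert_pfn_pud() now take a bare unsigned long pfn. A hedged caller-side sketch of a driver ->huge_fault handler (the driver and its foo_dev_pfn() lookup are hypothetical; only the two insert calls reflect the interface changed above):

static vm_fault_t foo_huge_fault(struct vm_fault *vmf, unsigned int order)
{
	/* foo_dev_pfn() is a made-up helper resolving vmf->pgoff to a pfn. */
	unsigned long pfn = foo_dev_pfn(vmf);
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	if (order == HPAGE_PMD_ORDER)
		return vmf_insert_pfn_pmd(vmf, pfn, write);
	if (order == HPAGE_PUD_ORDER)
		return vmf_insert_pfn_pud(vmf, pfn, write);
	return VM_FAULT_FALLBACK;
}

As before, the VMA is expected to be VM_PFNMAP or VM_MIXEDMAP; the BUG_ON()s kept above still enforce that.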
- */ - if (!(flags & (FOLL_GET | FOLL_PIN))) - return ERR_PTR(-EEXIST); - - pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; - *pgmap = get_dev_pagemap(pfn, *pgmap); - if (!*pgmap) - return ERR_PTR(-EFAULT); - page = pfn_to_page(pfn); - ret = try_grab_folio(page_folio(page), 1, flags); - if (ret) - page = ERR_PTR(ret); - - return page; -} - int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) @@ -1837,7 +1814,7 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pud = *src_pud; - if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud))) + if (unlikely(!pud_trans_huge(pud))) goto out_unlock; /* @@ -2699,8 +2676,7 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { spinlock_t *ptl; ptl = pmd_lock(vma->vm_mm, pmd); - if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || - pmd_devmap(*pmd))) + if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd))) return ptl; spin_unlock(ptl); return NULL; @@ -2717,7 +2693,7 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) spinlock_t *ptl; ptl = pud_lock(vma->vm_mm, pud); - if (likely(pud_trans_huge(*pud) || pud_devmap(*pud))) + if (likely(pud_trans_huge(*pud))) return ptl; spin_unlock(ptl); return NULL; @@ -2769,7 +2745,7 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); - VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud)); + VM_BUG_ON(!pud_trans_huge(*pud)); count_vm_event(THP_SPLIT_PUD); @@ -2802,7 +2778,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(&range); ptl = pud_lock(vma->vm_mm, pud); - if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) + if (unlikely(!pud_trans_huge(*pud))) goto out; __split_huge_pud_locked(vma, pud, range.start); @@ -2875,8 +2851,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); - VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) - && !pmd_devmap(*pmd)); + VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)); count_vm_event(THP_SPLIT_PMD); @@ -3084,8 +3059,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze) { VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || - is_pmd_migration_entry(*pmd)) + if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd)) __split_huge_pmd_locked(vma, pmd, address, freeze); } @@ -3411,10 +3385,6 @@ static void __split_folio_to_order(struct folio *folio, int old_order, * order - 1 to new_order). * @split_at: in buddy allocator like split, the folio containing @split_at * will be split until its order becomes @new_order. - * @lock_at: the folio containing @lock_at is left locked for caller. - * @list: the after split folios will be added to @list if it is not NULL, - * otherwise to LRU lists. - * @end: the end of the file @folio maps to. -1 if @folio is anonymous memory. 
* @xas: xa_state pointing to folio->mapping->i_pages and locked by caller * @mapping: @folio->mapping * @uniform_split: if the split is uniform or not (buddy allocator like split) @@ -3440,52 +3410,26 @@ static void __split_folio_to_order(struct folio *folio, int old_order, * @page, which is split in next for loop. * * After splitting, the caller's folio reference will be transferred to the - * folio containing @page. The other folios may be freed if they are not mapped. - * - * In terms of locking, after splitting, - * 1. uniform split leaves @page (or the folio contains it) locked; - * 2. buddy allocator like (non-uniform) split leaves @folio locked. - * + * folio containing @page. The caller needs to unlock and/or free after-split + * folios if necessary. * * For !uniform_split, when -ENOMEM is returned, the original folio might be * split. The caller needs to check the input folio. */ static int __split_unmapped_folio(struct folio *folio, int new_order, - struct page *split_at, struct page *lock_at, - struct list_head *list, pgoff_t end, - struct xa_state *xas, struct address_space *mapping, - bool uniform_split) + struct page *split_at, struct xa_state *xas, + struct address_space *mapping, bool uniform_split) { - struct lruvec *lruvec; - struct address_space *swap_cache = NULL; - struct folio *origin_folio = folio; - struct folio *next_folio = folio_next(folio); - struct folio *new_folio; - struct folio *next; int order = folio_order(folio); - int split_order; int start_order = uniform_split ? new_order : order - 1; - int nr_dropped = 0; - int ret = 0; bool stop_split = false; - - if (folio_test_swapcache(folio)) { - VM_BUG_ON(mapping); - - /* a swapcache folio can only be uniformly split to order-0 */ - if (!uniform_split || new_order != 0) - return -EINVAL; - - swap_cache = swap_address_space(folio->swap); - xa_lock(&swap_cache->i_pages); - } + struct folio *next; + int split_order; + int ret = 0; if (folio_test_anon(folio)) mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); - /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ - lruvec = folio_lruvec_lock(folio); - folio_clear_has_hwpoisoned(folio); /* @@ -3495,9 +3439,9 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, for (split_order = start_order; split_order >= new_order && !stop_split; split_order--) { - int old_order = folio_order(folio); - struct folio *release; struct folio *end_folio = folio_next(folio); + int old_order = folio_order(folio); + struct folio *new_folio; /* order-1 anonymous folio is not supported */ if (folio_test_anon(folio) && split_order == 1) @@ -3519,126 +3463,45 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, if (xas_error(xas)) { ret = xas_error(xas); stop_split = true; - goto after_split; } } } - folio_split_memcg_refs(folio, old_order, split_order); - split_page_owner(&folio->page, old_order, split_order); - pgalloc_tag_split(folio, old_order, split_order); + if (!stop_split) { + folio_split_memcg_refs(folio, old_order, split_order); + split_page_owner(&folio->page, old_order, split_order); + pgalloc_tag_split(folio, old_order, split_order); - __split_folio_to_order(folio, old_order, split_order); + __split_folio_to_order(folio, old_order, split_order); + } -after_split: /* - * Iterate through after-split folios and perform related - * operations. But in buddy allocator like split, the folio + * Iterate through after-split folios and update folio stats. 
+ * But in buddy allocator like split, the folio * containing the specified page is skipped until its order * is new_order, since the folio will be worked on in next * iteration. */ - for (release = folio; release != end_folio; release = next) { - next = folio_next(release); + for (new_folio = folio; new_folio != end_folio; new_folio = next) { + next = folio_next(new_folio); /* - * for buddy allocator like split, the folio containing - * page will be split next and should not be released, - * until the folio's order is new_order or stop_split - * is set to true by the above xas_split() failure. + * for buddy allocator like split, new_folio containing + * @split_at page could be split again, thus do not + * change stats yet. Wait until new_folio's order is + * @new_order or stop_split is set to true by the above + * xas_split() failure. */ - if (release == page_folio(split_at)) { - folio = release; + if (new_folio == page_folio(split_at)) { + folio = new_folio; if (split_order != new_order && !stop_split) continue; } - if (folio_test_anon(release)) { - mod_mthp_stat(folio_order(release), - MTHP_STAT_NR_ANON, 1); - } - - /* - * origin_folio should be kept frozon until page cache - * entries are updated with all the other after-split - * folios to prevent others seeing stale page cache - * entries. - */ - if (release == origin_folio) - continue; - - folio_ref_unfreeze(release, 1 + - ((mapping || swap_cache) ? - folio_nr_pages(release) : 0)); - - lru_add_split_folio(origin_folio, release, lruvec, - list); - - /* Some pages can be beyond EOF: drop them from cache */ - if (release->index >= end) { - if (shmem_mapping(mapping)) - nr_dropped += folio_nr_pages(release); - else if (folio_test_clear_dirty(release)) - folio_account_cleaned(release, - inode_to_wb(mapping->host)); - __filemap_remove_folio(release, NULL); - folio_put_refs(release, folio_nr_pages(release)); - } else if (mapping) { - __xa_store(&mapping->i_pages, - release->index, release, 0); - } else if (swap_cache) { - __xa_store(&swap_cache->i_pages, - swap_cache_index(release->swap), - release, 0); - } + if (folio_test_anon(new_folio)) + mod_mthp_stat(folio_order(new_folio), + MTHP_STAT_NR_ANON, 1); } } - /* - * Unfreeze origin_folio only after all page cache entries, which used - * to point to it, have been updated with new folios. Otherwise, - * a parallel folio_try_get() can grab origin_folio and its caller can - * see stale page cache entries. - */ - folio_ref_unfreeze(origin_folio, 1 + - ((mapping || swap_cache) ? folio_nr_pages(origin_folio) : 0)); - - unlock_page_lruvec(lruvec); - - if (swap_cache) - xa_unlock(&swap_cache->i_pages); - if (mapping) - xa_unlock(&mapping->i_pages); - - /* Caller disabled irqs, so they are still disabled here */ - local_irq_enable(); - - if (nr_dropped) - shmem_uncharge(mapping->host, nr_dropped); - - remap_page(origin_folio, 1 << order, - folio_test_anon(origin_folio) ? - RMP_USE_SHARED_ZEROPAGE : 0); - - /* - * At this point, folio should contain the specified page. - * For uniform split, it is left for caller to unlock. - * For buddy allocator like split, the first after-split folio is left - * for caller to unlock. - */ - for (new_folio = origin_folio; new_folio != next_folio; new_folio = next) { - next = folio_next(new_folio); - if (new_folio == page_folio(lock_at)) - continue; - - folio_unlock(new_folio); - /* - * Subpages may be freed if there wasn't any mapping - * like if add_to_swap() is running on a lru page that - * had its mapping zapped. 
And freeing these pages - * requires taking the lru_lock so we do the put_page - * of the tail pages after the split is complete. - */ - free_folio_and_swap_cache(new_folio); - } return ret; } @@ -3712,6 +3575,11 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order, * It is in charge of checking whether the split is supported or not and * preparing @folio for __split_unmapped_folio(). * + * After splitting, the after-split folio containing @lock_at remains locked + * and others are unlocked: + * 1. for uniform split, @lock_at points to one of @folio's subpages; + * 2. for buddy allocator like (non-uniform) split, @lock_at points to @folio. + * * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be * split but not to @new_order, the caller needs to check) */ @@ -3721,16 +3589,20 @@ static int __folio_split(struct folio *folio, unsigned int new_order, { struct deferred_split *ds_queue = get_deferred_split_queue(folio); XA_STATE(xas, &folio->mapping->i_pages, folio->index); + struct folio *end_folio = folio_next(folio); bool is_anon = folio_test_anon(folio); struct address_space *mapping = NULL; struct anon_vma *anon_vma = NULL; int order = folio_order(folio); + struct folio *new_folio, *next; + int nr_shmem_dropped = 0; + int remap_flags = 0; int extra_pins, ret; pgoff_t end; bool is_hzp; - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio); if (folio != page_folio(split_at) || folio != page_folio(lock_at)) return -EINVAL; @@ -3768,7 +3640,6 @@ static int __folio_split(struct folio *folio, unsigned int new_order, ret = -EBUSY; goto out; } - end = -1; mapping = NULL; anon_vma_lock_write(anon_vma); } else { @@ -3848,13 +3719,19 @@ static int __folio_split(struct folio *folio, unsigned int new_order, */ xas_lock(&xas); xas_reset(&xas); - if (xas_load(&xas) != folio) + if (xas_load(&xas) != folio) { + ret = -EAGAIN; goto fail; + } } /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); if (folio_ref_freeze(folio, 1 + extra_pins)) { + struct address_space *swap_cache = NULL; + struct lruvec *lruvec; + int expected_refs; + if (folio_order(folio) > 1 && !list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; @@ -3888,18 +3765,122 @@ static int __folio_split(struct folio *folio, unsigned int new_order, } } - ret = __split_unmapped_folio(folio, new_order, - split_at, lock_at, list, end, &xas, mapping, - uniform_split); + if (folio_test_swapcache(folio)) { + if (mapping) { + VM_WARN_ON_ONCE_FOLIO(mapping, folio); + ret = -EINVAL; + goto fail; + } + + swap_cache = swap_address_space(folio->swap); + xa_lock(&swap_cache->i_pages); + } + + /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ + lruvec = folio_lruvec_lock(folio); + + ret = __split_unmapped_folio(folio, new_order, split_at, &xas, + mapping, uniform_split); + + /* + * Unfreeze after-split folios and put them back to the right + * list. @folio should be kept frozon until page cache + * entries are updated with all the other after-split folios + * to prevent others seeing stale page cache entries. + * As a result, new_folio starts from the next folio of + * @folio. 
+ */ + for (new_folio = folio_next(folio); new_folio != end_folio; + new_folio = next) { + unsigned long nr_pages = folio_nr_pages(new_folio); + + next = folio_next(new_folio); + + expected_refs = folio_expected_ref_count(new_folio) + 1; + folio_ref_unfreeze(new_folio, expected_refs); + + lru_add_split_folio(folio, new_folio, lruvec, list); + + /* + * Anonymous folio with swap cache. + * NOTE: shmem in swap cache is not supported yet. + */ + if (swap_cache) { + __xa_store(&swap_cache->i_pages, + swap_cache_index(new_folio->swap), + new_folio, 0); + continue; + } + + /* Anonymous folio without swap cache */ + if (!mapping) + continue; + + /* Add the new folio to the page cache. */ + if (new_folio->index < end) { + __xa_store(&mapping->i_pages, new_folio->index, + new_folio, 0); + continue; + } + + /* Drop folio beyond EOF: ->index >= end */ + if (shmem_mapping(mapping)) + nr_shmem_dropped += nr_pages; + else if (folio_test_clear_dirty(new_folio)) + folio_account_cleaned( + new_folio, inode_to_wb(mapping->host)); + __filemap_remove_folio(new_folio, NULL); + folio_put_refs(new_folio, nr_pages); + } + /* + * Unfreeze @folio only after all page cache entries, which + * used to point to it, have been updated with new folios. + * Otherwise, a parallel folio_try_get() can grab @folio + * and its caller can see stale page cache entries. + */ + expected_refs = folio_expected_ref_count(folio) + 1; + folio_ref_unfreeze(folio, expected_refs); + + unlock_page_lruvec(lruvec); + + if (swap_cache) + xa_unlock(&swap_cache->i_pages); } else { spin_unlock(&ds_queue->split_queue_lock); -fail: - if (mapping) - xas_unlock(&xas); - local_irq_enable(); - remap_page(folio, folio_nr_pages(folio), 0); ret = -EAGAIN; } +fail: + if (mapping) + xas_unlock(&xas); + + local_irq_enable(); + + if (nr_shmem_dropped) + shmem_uncharge(mapping->host, nr_shmem_dropped); + + if (!ret && is_anon) + remap_flags = RMP_USE_SHARED_ZEROPAGE; + remap_page(folio, 1 << order, remap_flags); + + /* + * Unlock all after-split folios except the one containing + * @lock_at page. If @folio is not split, it will be kept locked. + */ + for (new_folio = folio; new_folio != end_folio; new_folio = next) { + next = folio_next(new_folio); + if (new_folio == page_folio(lock_at)) + continue; + + folio_unlock(new_folio); + /* + * Subpages may be freed if there wasn't any mapping + * like if add_to_swap() is running on a lru page that + * had its mapping zapped. And freeing these pages + * requires taking the lru_lock so we do the put_page + * of the tail pages after the split is complete. 
+ */ + free_folio_and_swap_cache(new_folio); + } out_unlock: if (anon_vma) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8746ed2fec13..753f99b4c718 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -25,6 +25,7 @@ #include <linux/mmdebug.h> #include <linux/sched/signal.h> #include <linux/rmap.h> +#include <linux/string_choices.h> #include <linux/string_helpers.h> #include <linux/swap.h> #include <linux/swapops.h> @@ -284,11 +285,6 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, return ret; } -static inline struct hugepage_subpool *subpool_inode(struct inode *inode) -{ - return HUGETLBFS_SB(inode->i_sb)->spool; -} - static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) { return subpool_inode(file_inode(vma->vm_file)); @@ -2340,12 +2336,15 @@ struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid, struct folio *folio; spin_lock_irq(&hugetlb_lock); + if (!h->resv_huge_pages) { + spin_unlock_irq(&hugetlb_lock); + return NULL; + } + folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid, nmask); - if (folio) { - VM_BUG_ON(!h->resv_huge_pages); + if (folio) h->resv_huge_pages--; - } spin_unlock_irq(&hugetlb_lock); return folio; @@ -2787,20 +2786,24 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, /* * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve * the old one - * @h: struct hstate old page belongs to * @old_folio: Old folio to dissolve * @list: List to isolate the page in case we need to * Returns 0 on success, otherwise negated error. */ -static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, - struct folio *old_folio, struct list_head *list) +static int alloc_and_dissolve_hugetlb_folio(struct folio *old_folio, + struct list_head *list) { - gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + gfp_t gfp_mask; + struct hstate *h; int nid = folio_nid(old_folio); struct folio *new_folio = NULL; int ret = 0; retry: + /* + * The old_folio might have been dissolved from under our feet, so make sure + * to carefully check the state under the lock. + */ spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(old_folio)) { /* @@ -2829,8 +2832,10 @@ retry: cond_resched(); goto retry; } else { + h = folio_hstate(old_folio); if (!new_folio) { spin_unlock_irq(&hugetlb_lock); + gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL); if (!new_folio) @@ -2874,35 +2879,24 @@ free_new: int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) { - struct hstate *h; int ret = -EBUSY; - /* - * The page might have been dissolved from under our feet, so make sure - * to carefully check the state under the lock. - * Return success when racing as if we dissolved the page ourselves. - */ - spin_lock_irq(&hugetlb_lock); - if (folio_test_hugetlb(folio)) { - h = folio_hstate(folio); - } else { - spin_unlock_irq(&hugetlb_lock); + /* Not to disrupt normal path by vainly holding hugetlb_lock */ + if (!folio_test_hugetlb(folio)) return 0; - } - spin_unlock_irq(&hugetlb_lock); /* * Fence off gigantic pages as there is a cyclic dependency between * alloc_contig_range and them. Return -ENOMEM as this has the effect * of bailing out right away without further retrying. 
*/ - if (hstate_is_gigantic(h)) + if (folio_order(folio) > MAX_PAGE_ORDER) return -ENOMEM; if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) - ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); + ret = alloc_and_dissolve_hugetlb_folio(folio, list); return ret; } @@ -2916,7 +2910,6 @@ int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) */ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) { - struct hstate *h; struct folio *folio; int ret = 0; @@ -2925,23 +2918,9 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) while (start_pfn < end_pfn) { folio = pfn_folio(start_pfn); - /* - * The folio might have been dissolved from under our feet, so make sure - * to carefully check the state under the lock. - */ - spin_lock_irq(&hugetlb_lock); - if (folio_test_hugetlb(folio)) { - h = folio_hstate(folio); - } else { - spin_unlock_irq(&hugetlb_lock); - start_pfn++; - continue; - } - spin_unlock_irq(&hugetlb_lock); - - if (!folio_ref_count(folio)) { - ret = alloc_and_dissolve_hugetlb_folio(h, folio, - &isolate_list); + /* Not to disrupt normal path by vainly holding hugetlb_lock */ + if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) { + ret = alloc_and_dissolve_hugetlb_folio(folio, &isolate_list); if (ret) break; @@ -3319,8 +3298,8 @@ static void __init hugetlb_bootmem_init_migratetype(struct folio *folio, if (folio_test_hugetlb_cma(folio)) init_cma_pageblock(folio_page(folio, i)); else - set_pageblock_migratetype(folio_page(folio, i), - MIGRATE_MOVABLE); + init_pageblock_migratetype(folio_page(folio, i), + MIGRATE_MOVABLE, false); } } @@ -3744,7 +3723,7 @@ static void __init report_hugepages(void) buf, h->nr_huge_pages); if (nrinvalid) pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n", - buf, nrinvalid, nrinvalid > 1 ? "s" : ""); + buf, nrinvalid, str_plural(nrinvalid)); pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf); } @@ -6152,8 +6131,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. */ -static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, - struct vm_fault *vmf) +static vm_fault_t hugetlb_wp(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; @@ -6215,16 +6193,17 @@ retry_avoidcopy: PageAnonExclusive(&old_folio->page), &old_folio->page); /* - * If the process that created a MAP_PRIVATE mapping is about to - * perform a COW due to a shared page count, attempt to satisfy - * the allocation without using the existing reserves. The pagecache - * page is used to determine if the reserve at this address was - * consumed or not. If reserves were used, a partial faulted mapping - * at the time of fork() could consume its reserves on COW instead - * of the full address range. + * If the process that created a MAP_PRIVATE mapping is about to perform + * a COW due to a shared page count, attempt to satisfy the allocation + * without using the existing reserves. + * In order to determine where this is a COW on a MAP_PRIVATE mapping it + * is enough to check whether the old_folio is anonymous. This means that + * the reserve for this address was consumed. 
If reserves were used, a + * partial faulted mapping at the fime of fork() could consume its reserves + * on COW instead of the full address range. */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && - old_folio != pagecache_folio) + folio_test_anon(old_folio)) cow_from_owner = true; folio_get(old_folio); @@ -6427,16 +6406,16 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned static vm_fault_t hugetlb_no_page(struct address_space *mapping, struct vm_fault *vmf) { + u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); + bool new_folio, new_anon_folio = false; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; struct hstate *h = hstate_vma(vma); vm_fault_t ret = VM_FAULT_SIGBUS; - int anon_rmap = 0; - unsigned long size; + bool folio_locked = true; struct folio *folio; + unsigned long size; pte_t new_pte; - bool new_folio, new_pagecache_folio = false; - u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); /* * Currently, we are forced to kill the process in the event the @@ -6535,10 +6514,9 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, ret = VM_FAULT_SIGBUS; goto out; } - new_pagecache_folio = true; } else { + new_anon_folio = true; folio_lock(folio); - anon_rmap = 1; } } else { /* @@ -6587,7 +6565,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, if (!pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), vmf->orig_pte)) goto backout; - if (anon_rmap) + if (new_anon_folio) hugetlb_add_new_anon_rmap(folio, vma, vmf->address); else hugetlb_add_file_rmap(folio); @@ -6602,8 +6580,16 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, hugetlb_count_add(pages_per_huge_page(h), mm); if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + /* + * No need to keep file folios locked. See comment in + * hugetlb_fault(). + */ + if (!new_anon_folio) { + folio_locked = false; + folio_unlock(folio); + } /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(folio, vmf); + ret = hugetlb_wp(vmf); } spin_unlock(vmf->ptl); @@ -6616,7 +6602,8 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, if (new_folio) folio_set_hugetlb_migratable(folio); - folio_unlock(folio); + if (folio_locked) + folio_unlock(folio); out: hugetlb_vma_unlock_read(vma); @@ -6633,7 +6620,8 @@ out: backout: spin_unlock(vmf->ptl); backout_unlocked: - if (new_folio && !new_pagecache_folio) + /* We only need to restore reservations for private mappings */ + if (new_anon_folio) restore_reserve_on_error(h, vma, vmf->address, folio); folio_unlock(folio); @@ -6671,10 +6659,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vm_fault_t ret; u32 hash; struct folio *folio = NULL; - struct folio *pagecache_folio = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; - int need_wait_lock = 0; + bool need_wait_lock = false; struct vm_fault vmf = { .vma = vma, .address = address & huge_page_mask(h), @@ -6740,15 +6727,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = 0; - /* - * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this - * point, so this check prevents the kernel from going below assuming - * that we have an active hugepage in pagecache. This goto expects - * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned) - * check will properly handle it. 
- */ + /* Not present, either a migration or a hwpoisoned entry */ if (!pte_present(vmf.orig_pte)) { - if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) { + if (is_hugetlb_entry_migration(vmf.orig_pte)) { /* * Release the hugetlb fault lock now, but retain * the vma lock, because it is needed to guard the @@ -6759,7 +6740,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, mutex_unlock(&hugetlb_fault_mutex_table[hash]); migration_entry_wait_huge(vma, vmf.address, vmf.pte); return 0; - } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte))) + } else if (is_hugetlb_entry_hwpoisoned(vmf.orig_pte)) ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; @@ -6769,8 +6750,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * If we are going to COW/unshare the mapping later, we examine the * pending reservations for this page now. This will ensure that any * allocations necessary to record that reservation occur outside the - * spinlock. Also lookup the pagecache page now as it is used to - * determine if a reservation has been consumed. + * spinlock. */ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) { @@ -6780,11 +6760,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, vmf.address); - - pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, - vmf.pgoff); - if (IS_ERR(pagecache_folio)) - pagecache_folio = NULL; } vmf.ptl = huge_pte_lock(h, mm, vmf.pte); @@ -6798,10 +6773,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) { if (!userfaultfd_wp_async(vma)) { spin_unlock(vmf.ptl); - if (pagecache_folio) { - folio_unlock(pagecache_folio); - folio_put(pagecache_folio); - } hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return handle_userfault(&vmf, VM_UFFD_WP); @@ -6813,24 +6784,24 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Fallthrough to CoW */ } - /* - * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and - * pagecache_folio, so here we need take the former one - * when folio != pagecache_folio or !pagecache_folio. - */ - folio = page_folio(pte_page(vmf.orig_pte)); - if (folio != pagecache_folio) - if (!folio_trylock(folio)) { - need_wait_lock = 1; - goto out_ptl; - } - - folio_get(folio); - if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(vmf.orig_pte)) { - ret = hugetlb_wp(pagecache_folio, &vmf); - goto out_put_page; + /* + * Anonymous folios need to be lock since hugetlb_wp() + * checks whether we can re-use the folio exclusively + * for us in case we are the only user of it. 
+ */ + folio = page_folio(pte_page(vmf.orig_pte)); + if (folio_test_anon(folio) && !folio_trylock(folio)) { + need_wait_lock = true; + goto out_ptl; + } + folio_get(folio); + ret = hugetlb_wp(&vmf); + if (folio_test_anon(folio)) + folio_unlock(folio); + folio_put(folio); + goto out_ptl; } else if (likely(flags & FAULT_FLAG_WRITE)) { vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte); } @@ -6839,17 +6810,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, vmf.address, vmf.pte); -out_put_page: - if (folio != pagecache_folio) - folio_unlock(folio); - folio_put(folio); out_ptl: spin_unlock(vmf.ptl); - - if (pagecache_folio) { - folio_unlock(pagecache_folio); - folio_put(pagecache_folio); - } out_mutex: hugetlb_vma_unlock_read(vma); @@ -6862,11 +6824,16 @@ out_mutex: mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* - * Generally it's safe to hold refcount during waiting page lock. But - * here we just wait to defer the next page fault to avoid busy loop and - * the page is not used after unlocked before returning from the current - * page fault. So we are safe from accessing freed page, even if we wait - * here without taking refcount. + * hugetlb_wp drops all the locks, but the folio lock, before trying to + * unmap the folio from other processes. During that window, if another + * process mapping that folio faults in, it will take the mutex and then + * it will wait on folio_lock, causing an ABBA deadlock. + * Use trylock instead and bail out if we fail. + * + * Ideally, we should hold a refcount on the folio we wait for, but we do + * not want to use the folio after it becomes unlocked, but rather just + * wait for it to become unlocked, so hopefully next fault successes on + * the trylock. */ if (need_wait_lock) folio_wait_locked(folio); @@ -7186,11 +7153,11 @@ long hugetlb_change_protection(struct vm_area_struct *vma, /* Nothing to do. */ } else if (unlikely(is_hugetlb_entry_migration(pte))) { swp_entry_t entry = pte_to_swp_entry(pte); - struct page *page = pfn_swap_entry_to_page(entry); + struct folio *folio = pfn_swap_entry_folio(entry); pte_t newpte = pte; if (is_writable_migration_entry(entry)) { - if (PageAnon(page)) + if (folio_test_anon(folio)) entry = make_readable_exclusive_migration_entry( swp_offset(entry)); else @@ -7264,8 +7231,15 @@ long hugetlb_change_protection(struct vm_area_struct *vma, return pages > 0 ? (pages << h->order) : pages; } -/* Return true if reservation was successful, false otherwise. */ -bool hugetlb_reserve_pages(struct inode *inode, +/* + * Update the reservation map for the range [from, to]. + * + * Returns the number of entries that would be added to the reservation map + * associated with the range [from, to]. This number is greater or equal to + * zero. -EINVAL or -ENOMEM is returned in case of any errors. 
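Given the switch from a bool to a long return documented above, callers of hugetlb_reserve_pages() now test the sign and can propagate the errno instead of a generic failure. A minimal, illustrative adaptation (not the actual hugetlbfs/mmap call site):

static int foo_reserve(struct inode *inode, long from, long to,
		       struct vm_area_struct *vma, vm_flags_t vm_flags)
{
	long chg = hugetlb_reserve_pages(inode, from, to, vma, vm_flags);

	if (chg < 0)		/* was: if (!hugetlb_reserve_pages(...)) */
		return chg;	/* -EINVAL or -ENOMEM */

	/* chg >= 0: reservation entries added (0 for VM_NORESERVE). */
	return 0;
}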
+ */ + +long hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags) @@ -7280,7 +7254,7 @@ bool hugetlb_reserve_pages(struct inode *inode, /* This should never happen */ if (from > to) { VM_WARN(1, "%s called with a negative range\n", __func__); - return false; + return -EINVAL; } /* @@ -7295,7 +7269,7 @@ bool hugetlb_reserve_pages(struct inode *inode, * without using reserves */ if (vm_flags & VM_NORESERVE) - return true; + return 0; /* * Shared mappings base their reservation on the number of pages that @@ -7402,7 +7376,7 @@ bool hugetlb_reserve_pages(struct inode *inode, hugetlb_cgroup_put_rsvd_cgroup(h_cg); } } - return true; + return chg; out_put_pages: spool_resv = chg - gbl_reserve; @@ -7430,7 +7404,7 @@ out_err: kref_put(&resv_map->refs, resv_map_release); set_vma_resv_map(vma, NULL); } - return false; + return chg < 0 ? chg : add < 0 ? add : -EINVAL; } long hugetlb_unreserve_pages(struct inode *inode, long start, long end, @@ -7485,8 +7459,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, unsigned long s_end = sbase + PUD_SIZE; /* Allow segments to share if only one is marked locked */ - unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; - unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; + vm_flags_t vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; + vm_flags_t svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; /* * match the virtual addresses, permission and the alignment of the @@ -7861,7 +7835,7 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re struct hstate *h = folio_hstate(old_folio); hugetlb_cgroup_migrate(old_folio, new_folio); - set_page_owner_migrate_reason(&new_folio->page, reason); + folio_set_owner_migrate_reason(new_folio, reason); /* * transfer temporary state of the new hugetlb folio. This is diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 27245e86df25..ba0fb1b6a5a8 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -166,7 +166,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end, VM_BUG_ON(!PAGE_ALIGNED(start | end)); mmap_read_lock(&init_mm); - ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops, + ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops, NULL, walk); mmap_read_unlock(&init_mm); if (ret) diff --git a/mm/internal.h b/mm/internal.h index 6b8ed2017743..1da16d550a45 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -149,7 +149,7 @@ static inline void *folio_raw_mapping(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; - return (void *)(mapping & ~PAGE_MAPPING_FLAGS); + return (void *)(mapping & ~FOLIO_MAPPING_FLAGS); } /* @@ -164,7 +164,7 @@ static inline void *folio_raw_mapping(const struct folio *folio) */ static inline int mmap_file(struct file *file, struct vm_area_struct *vma) { - int err = call_mmap(file, vma); + int err = vfs_mmap(file, vma); if (likely(!err)) return 0; @@ -202,94 +202,106 @@ static inline void vma_close(struct vm_area_struct *vma) /* Flags for folio_pte_batch(). */ typedef int __bitwise fpb_t; -/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */ -#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0)) +/* Compare PTEs respecting the dirty bit. */ +#define FPB_RESPECT_DIRTY ((__force fpb_t)BIT(0)) -/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */ -#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1)) +/* Compare PTEs respecting the soft-dirty bit. 
*/ +#define FPB_RESPECT_SOFT_DIRTY ((__force fpb_t)BIT(1)) + +/* Compare PTEs respecting the writable bit. */ +#define FPB_RESPECT_WRITE ((__force fpb_t)BIT(2)) + +/* + * Merge PTE write bits: if any PTE in the batch is writable, modify the + * PTE at @ptentp to be writable. + */ +#define FPB_MERGE_WRITE ((__force fpb_t)BIT(3)) + +/* + * Merge PTE young and dirty bits: if any PTE in the batch is young or dirty, + * modify the PTE at @ptentp to be young or dirty, respectively. + */ +#define FPB_MERGE_YOUNG_DIRTY ((__force fpb_t)BIT(4)) static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) { - if (flags & FPB_IGNORE_DIRTY) + if (!(flags & FPB_RESPECT_DIRTY)) pte = pte_mkclean(pte); - if (likely(flags & FPB_IGNORE_SOFT_DIRTY)) + if (likely(!(flags & FPB_RESPECT_SOFT_DIRTY))) pte = pte_clear_soft_dirty(pte); - return pte_wrprotect(pte_mkold(pte)); + if (likely(!(flags & FPB_RESPECT_WRITE))) + pte = pte_wrprotect(pte); + return pte_mkold(pte); } /** - * folio_pte_batch - detect a PTE batch for a large folio + * folio_pte_batch_flags - detect a PTE batch for a large folio * @folio: The large folio to detect a PTE batch for. - * @addr: The user virtual address the first page is mapped at. - * @start_ptep: Page table pointer for the first entry. - * @pte: Page table entry for the first page. + * @vma: The VMA. Only relevant with FPB_MERGE_WRITE, otherwise can be NULL. + * @ptep: Page table pointer for the first entry. + * @ptentp: Pointer to a COPY of the first page table entry whose flags this + * function updates based on @flags if appropriate. * @max_nr: The maximum number of table entries to consider. * @flags: Flags to modify the PTE batch semantics. - * @any_writable: Optional pointer to indicate whether any entry except the - * first one is writable. - * @any_young: Optional pointer to indicate whether any entry except the - * first one is young. - * @any_dirty: Optional pointer to indicate whether any entry except the - * first one is dirty. * * Detect a PTE batch: consecutive (present) PTEs that map consecutive - * pages of the same large folio. + * pages of the same large folio in a single VMA and a single page table. * * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, - * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and - * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY). + * the accessed bit, writable bit, dirty bit (unless FPB_RESPECT_DIRTY is set) + * and soft-dirty bit (unless FPB_RESPECT_SOFT_DIRTY is set). + * + * @ptep must map any page of the folio. max_nr must be at least one and + * must be limited by the caller so scanning cannot exceed a single VMA and + * a single page table. * - * start_ptep must map any page of the folio. max_nr must be at least one and - * must be limited by the caller so scanning cannot exceed a single page table. + * Depending on the FPB_MERGE_* flags, the pte stored at @ptentp will + * be updated: it's crucial that a pointer to a COPY of the first + * page table entry, obtained through ptep_get(), is provided as @ptentp. + * + * This function will be inlined to optimize based on the input parameters; + * consider using folio_pte_batch() instead if applicable. * * Return: the number of table entries in the batch. 
*/ -static inline int folio_pte_batch(struct folio *folio, unsigned long addr, - pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, - bool *any_writable, bool *any_young, bool *any_dirty) +static inline unsigned int folio_pte_batch_flags(struct folio *folio, + struct vm_area_struct *vma, pte_t *ptep, pte_t *ptentp, + unsigned int max_nr, fpb_t flags) { - pte_t expected_pte, *ptep; - bool writable, young, dirty; - int nr, cur_nr; - - if (any_writable) - *any_writable = false; - if (any_young) - *any_young = false; - if (any_dirty) - *any_dirty = false; + bool any_writable = false, any_young = false, any_dirty = false; + pte_t expected_pte, pte = *ptentp; + unsigned int nr, cur_nr; VM_WARN_ON_FOLIO(!pte_present(pte), folio); VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio); + /* + * Ensure this is a pointer to a copy not a pointer into a page table. + * If this is a stack value, it won't be a valid virtual address, but + * that's fine because it also cannot be pointing into the page table. + */ + VM_WARN_ON(virt_addr_valid(ptentp) && PageTable(virt_to_page(ptentp))); /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */ max_nr = min_t(unsigned long, max_nr, folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte)); - nr = pte_batch_hint(start_ptep, pte); + nr = pte_batch_hint(ptep, pte); expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); - ptep = start_ptep + nr; + ptep = ptep + nr; while (nr < max_nr) { pte = ptep_get(ptep); - if (any_writable) - writable = !!pte_write(pte); - if (any_young) - young = !!pte_young(pte); - if (any_dirty) - dirty = !!pte_dirty(pte); - pte = __pte_batch_clear_ignored(pte, flags); - if (!pte_same(pte, expected_pte)) + if (!pte_same(__pte_batch_clear_ignored(pte, flags), expected_pte)) break; - if (any_writable) - *any_writable |= writable; - if (any_young) - *any_young |= young; - if (any_dirty) - *any_dirty |= dirty; + if (flags & FPB_MERGE_WRITE) + any_writable |= pte_write(pte); + if (flags & FPB_MERGE_YOUNG_DIRTY) { + any_young |= pte_young(pte); + any_dirty |= pte_dirty(pte); + } cur_nr = pte_batch_hint(ptep, pte); expected_pte = pte_advance_pfn(expected_pte, cur_nr); @@ -297,9 +309,19 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, nr += cur_nr; } + if (any_writable) + *ptentp = pte_mkwrite(*ptentp, vma); + if (any_young) + *ptentp = pte_mkyoung(*ptentp); + if (any_dirty) + *ptentp = pte_mkdirty(*ptentp); + return min(nr, max_nr); } +unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, + unsigned int max_nr); + /** * pte_move_swp_offset - Move the swap entry offset field of a swap pte * forward or backward by delta @@ -436,8 +458,7 @@ void zap_page_range_single_batched(struct mmu_gather *tlb, int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, gfp_t gfp); -void page_cache_ra_order(struct readahead_control *, struct file_ra_state *, - unsigned int order); +void page_cache_ra_order(struct readahead_control *, struct file_ra_state *); void force_page_cache_ra(struct readahead_control *, unsigned long nr); static inline void force_page_cache_readahead(struct address_space *mapping, struct file *file, pgoff_t index, unsigned long nr_to_read) @@ -517,6 +538,16 @@ extern unsigned long highest_memmap_pfn; bool folio_isolate_lru(struct folio *folio); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum 
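A sketch of a typical folio_pte_batch_flags() call site, modelled on the madvise_folio_pte_batch() change further down in this diff; the local variable names are illustrative:

	pte_t ptent = ptep_get(ptep);	/* a copy, never a pointer into the page table */
	unsigned int max_nr = (end - addr) / PAGE_SIZE;
	unsigned int nr;

	/* vma may be NULL because FPB_MERGE_WRITE is not requested. */
	nr = folio_pte_batch_flags(folio, NULL, ptep, &ptent, max_nr,
				   FPB_MERGE_YOUNG_DIRTY);
	/* ptent is now young/dirty if any of the nr batched PTEs was. */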
vmscan_throttle_state reason); +#ifdef CONFIG_NUMA +int user_proactive_reclaim(char *buf, + struct mem_cgroup *memcg, pg_data_t *pgdat); +#else +static inline int user_proactive_reclaim(char *buf, + struct mem_cgroup *memcg, pg_data_t *pgdat) +{ + return 0; +} +#endif /* * in mm/rmap.c: @@ -821,7 +852,8 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, int nid, bool exact_nid); void memmap_init_range(unsigned long, int, unsigned long, unsigned long, - unsigned long, enum meminit_context, struct vmem_altmap *, int); + unsigned long, enum meminit_context, struct vmem_altmap *, int, + bool); #if defined CONFIG_COMPACTION || defined CONFIG_CMA @@ -929,7 +961,7 @@ extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool write, int *locked); -extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, +extern bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags, unsigned long bytes); /* @@ -1227,7 +1259,6 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); -struct folio *alloc_migrate_folio(struct folio *src, unsigned long private); unsigned long reclaim_pages(struct list_head *folio_list); unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list); @@ -1360,7 +1391,7 @@ int migrate_device_coherent_folio(struct folio *folio); struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, - unsigned long flags, unsigned long start, + vm_flags_t vm_flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller); @@ -1605,6 +1636,9 @@ static inline void accept_page(struct page *page) int walk_page_range_mm(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); +int walk_page_range_debug(struct mm_struct *mm, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + pgd_t *pgd, void *private); /* pt_reclaim.c */ bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval); diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index 5f922dd38ffa..2aa12dfa427a 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -1977,6 +1977,11 @@ static void rust_uaf(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, kasan_test_rust_uaf()); } +/* + * copy_to_kernel_nofault() is an internal helper available when + * kasan_test is built-in, so it must not be visible to loadable modules. 
+ */ +#ifndef MODULE static void copy_to_kernel_nofault_oob(struct kunit *test) { char *ptr; @@ -2011,6 +2016,7 @@ static void copy_to_kernel_nofault_oob(struct kunit *test) kfree(ptr); } +#endif /* !MODULE */ static void copy_user_test_oob(struct kunit *test) { @@ -2131,7 +2137,9 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(match_all_not_assigned), KUNIT_CASE(match_all_ptr_tag), KUNIT_CASE(match_all_mem_tag), +#ifndef MODULE KUNIT_CASE(copy_to_kernel_nofault_oob), +#endif KUNIT_CASE(rust_uaf), KUNIT_CASE(copy_user_test_oob), {} diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 8357e1a33699..62c01b4527eb 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -370,36 +370,6 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } -/* - * This function is invoked with report_lock (a raw_spinlock) held. A - * PREEMPT_RT kernel cannot call find_vm_area() as it will acquire a sleeping - * rt_spinlock. - * - * For !RT kernel, the PROVE_RAW_LOCK_NESTING config option will print a - * lockdep warning for this raw_spinlock -> spinlock dependency. This config - * option is enabled by default to ensure better test coverage to expose this - * kind of RT kernel problem. This lockdep splat, however, can be suppressed - * by using DEFINE_WAIT_OVERRIDE_MAP() if it serves a useful purpose and the - * invalid PREEMPT_RT case has been taken care of. - */ -static inline struct vm_struct *kasan_find_vm_area(void *addr) -{ - static DEFINE_WAIT_OVERRIDE_MAP(vmalloc_map, LD_WAIT_SLEEP); - struct vm_struct *va; - - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - return NULL; - - /* - * Suppress lockdep warning and fetch vmalloc area of the - * offending address. - */ - lock_map_acquire_try(&vmalloc_map); - va = find_vm_area(addr); - lock_map_release(&vmalloc_map); - return va; -} - static void print_address_description(void *addr, u8 tag, struct kasan_report_info *info) { @@ -429,19 +399,10 @@ static void print_address_description(void *addr, u8 tag, } if (is_vmalloc_addr(addr)) { - struct vm_struct *va = kasan_find_vm_area(addr); - - if (va) { - pr_err("The buggy address belongs to the virtual mapping at\n" - " [%px, %px) created by:\n" - " %pS\n", - va->addr, va->addr + va->size, va->caller); - pr_err("\n"); - - page = vmalloc_to_page(addr); - } else { - pr_err("The buggy address %px belongs to a vmalloc virtual mapping\n", addr); - } + pr_err("The buggy address belongs to a"); + if (!vmalloc_dump_obj(addr)) + pr_cont(" vmalloc virtual mapping\n"); + page = vmalloc_to_page(addr); } if (page) { diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 102048821c22..0ed3be100963 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -605,8 +605,8 @@ static unsigned long kfence_init_pool(void) pages = virt_to_page(__kfence_pool); /* - * Set up object pages: they must have PG_slab set, to avoid freeing - * these as real pages. + * Set up object pages: they must have PGTY_slab set to avoid freeing + * them as real pages. 
* * We also want to avoid inserting kfence_free() in the kfree() * fast-path in SLUB, and therefore need to ensure kfree() correctly diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 15203ea7d007..a55fb1dcd224 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -347,7 +347,7 @@ struct attribute_group khugepaged_attr_group = { #endif /* CONFIG_SYSFS */ int hugepage_madvise(struct vm_area_struct *vma, - unsigned long *vm_flags, int advice) + vm_flags_t *vm_flags, int advice) { switch (advice) { case MADV_HUGEPAGE: @@ -470,7 +470,7 @@ void __khugepaged_enter(struct mm_struct *mm) } void khugepaged_enter_vma(struct vm_area_struct *vma, - unsigned long vm_flags) + vm_flags_t vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_pmd_enabled()) { @@ -941,12 +941,18 @@ static inline int check_pmd_state(pmd_t *pmd) if (pmd_none(pmde)) return SCAN_PMD_NONE; + + /* + * The folio may be under migration when khugepaged is trying to + * collapse it. Migration success or failure will eventually end + * up with a present PMD mapping a folio again. + */ + if (is_pmd_migration_entry(pmde)) + return SCAN_PMD_MAPPED; if (!pmd_present(pmde)) return SCAN_PMD_NULL; if (pmd_trans_huge(pmde)) return SCAN_PMD_MAPPED; - if (pmd_devmap(pmde)) - return SCAN_PMD_NULL; if (pmd_bad(pmde)) return SCAN_PMD_NULL; return SCAN_SUCCEED; @@ -2729,8 +2735,8 @@ static int madvise_collapse_errno(enum scan_result r) } } -int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end) +int madvise_collapse(struct vm_area_struct *vma, unsigned long start, + unsigned long end, bool *lock_dropped) { struct collapse_control *cc; struct mm_struct *mm = vma->vm_mm; @@ -2741,8 +2747,6 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, BUG_ON(vma->vm_start > start); BUG_ON(vma->vm_end < end); - *prev = vma; - if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) return -EINVAL; @@ -2790,7 +2794,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, &mmap_locked, cc); } if (!mmap_locked) - *prev = NULL; /* Tell caller we dropped mmap_lock */ + *lock_dropped = true; handle_result: switch (result) { @@ -2800,7 +2804,6 @@ handle_result: break; case SCAN_PTE_MAPPED_HUGEPAGE: BUG_ON(mmap_locked); - BUG_ON(*prev); mmap_read_lock(mm); result = collapse_pte_mapped_thp(mm, addr, true); mmap_read_unlock(mm); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index da9cee34ee1b..8d588e685311 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1247,6 +1247,20 @@ void __ref kmemleak_transient_leak(const void *ptr) EXPORT_SYMBOL(kmemleak_transient_leak); /** + * kmemleak_ignore_percpu - similar to kmemleak_ignore but taking a percpu + * address argument + * @ptr: percpu address of the object + */ +void __ref kmemleak_ignore_percpu(const void __percpu *ptr) +{ + pr_debug("%s(0x%px)\n", __func__, ptr); + + if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr)) + make_black_object((unsigned long)ptr, OBJECT_PERCPU); +} +EXPORT_SYMBOL_GPL(kmemleak_ignore_percpu); + +/** * kmemleak_ignore - ignore an allocated object * @ptr: pointer to beginning of the object * @@ -677,28 +677,32 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_v return (ret & VM_FAULT_OOM) ? 
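The kmemleak hunk above adds a percpu variant of the ignore API. A minimal usage sketch, assuming a hypothetical percpu object that holds pointers kmemleak should neither scan nor report:

	/* struct hypothetical_stats is a made-up example type. */
	struct hypothetical_stats __percpu *stats;

	stats = alloc_percpu(struct hypothetical_stats);
	if (!stats)
		return -ENOMEM;
	/* Black-list the percpu object: it is neither scanned nor reported as a leak. */
	kmemleak_ignore_percpu(stats);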
-ENOMEM : 0; } -static bool vma_ksm_compatible(struct vm_area_struct *vma) +static bool ksm_compatible(const struct file *file, vm_flags_t vm_flags) { - if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP | - VM_IO | VM_DONTEXPAND | VM_HUGETLB | - VM_MIXEDMAP| VM_DROPPABLE)) + if (vm_flags & (VM_SHARED | VM_MAYSHARE | VM_SPECIAL | + VM_HUGETLB | VM_DROPPABLE)) return false; /* just ignore the advice */ - if (vma_is_dax(vma)) + if (file_is_dax(file)) return false; #ifdef VM_SAO - if (vma->vm_flags & VM_SAO) + if (vm_flags & VM_SAO) return false; #endif #ifdef VM_SPARC_ADI - if (vma->vm_flags & VM_SPARC_ADI) + if (vm_flags & VM_SPARC_ADI) return false; #endif return true; } +static bool vma_ksm_compatible(struct vm_area_struct *vma) +{ + return ksm_compatible(vma->vm_file, vma->vm_flags); +} + static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, unsigned long addr) { @@ -889,7 +893,7 @@ static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node, unsigned long kpfn; expected_mapping = (void *)((unsigned long)stable_node | - PAGE_MAPPING_KSM); + FOLIO_MAPPING_KSM); again: kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */ folio = pfn_folio(kpfn); @@ -1066,7 +1070,7 @@ static inline void folio_set_stable_node(struct folio *folio, struct ksm_stable_node *stable_node) { VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio); - folio->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); + folio->mapping = (void *)((unsigned long)stable_node | FOLIO_MAPPING_KSM); } #ifdef CONFIG_SYSFS @@ -2696,14 +2700,17 @@ static int ksm_scan_thread(void *nothing) return 0; } -static void __ksm_add_vma(struct vm_area_struct *vma) +static bool __ksm_should_add_vma(const struct file *file, vm_flags_t vm_flags) { - unsigned long vm_flags = vma->vm_flags; - if (vm_flags & VM_MERGEABLE) - return; + return false; + + return ksm_compatible(file, vm_flags); +} - if (vma_ksm_compatible(vma)) +static void __ksm_add_vma(struct vm_area_struct *vma) +{ + if (__ksm_should_add_vma(vma->vm_file, vma->vm_flags)) vm_flags_set(vma, VM_MERGEABLE); } @@ -2724,16 +2731,22 @@ static int __ksm_del_vma(struct vm_area_struct *vma) return 0; } /** - * ksm_add_vma - Mark vma as mergeable if compatible + * ksm_vma_flags - Update VMA flags to mark as mergeable if compatible + * + * @mm: Proposed VMA's mm_struct + * @file: Proposed VMA's file-backed mapping, if any. + * @vm_flags: Proposed VMA"s flags. * - * @vma: Pointer to vma + * Returns: @vm_flags possibly updated to mark mergeable. 
*/ -void ksm_add_vma(struct vm_area_struct *vma) +vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, + vm_flags_t vm_flags) { - struct mm_struct *mm = vma->vm_mm; + if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) && + __ksm_should_add_vma(file, vm_flags)) + vm_flags |= VM_MERGEABLE; - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) - __ksm_add_vma(vma); + return vm_flags; } static void ksm_add_vmas(struct mm_struct *mm) @@ -2827,7 +2840,7 @@ int ksm_disable(struct mm_struct *mm) } int ksm_madvise(struct vm_area_struct *vma, unsigned long start, - unsigned long end, int advice, unsigned long *vm_flags) + unsigned long end, int advice, vm_flags_t *vm_flags) { struct mm_struct *mm = vma->vm_mm; int err; @@ -3669,10 +3682,10 @@ static ssize_t advisor_mode_show(struct kobject *kobj, { const char *output; - if (ksm_advisor == KSM_ADVISOR_NONE) - output = "[none] scan-time"; - else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) + if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) output = "none [scan-time]"; + else + output = "[none] scan-time"; return sysfs_emit(buf, "%s\n", output); } diff --git a/mm/list_lru.c b/mm/list_lru.c index 490473af3122..ec48b5dadf51 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -60,30 +60,34 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) return &lru->node[nid].lru; } +static inline bool lock_list_lru(struct list_lru_one *l, bool irq) +{ + if (irq) + spin_lock_irq(&l->lock); + else + spin_lock(&l->lock); + if (unlikely(READ_ONCE(l->nr_items) == LONG_MIN)) { + if (irq) + spin_unlock_irq(&l->lock); + else + spin_unlock(&l->lock); + return false; + } + return true; +} + static inline struct list_lru_one * lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, bool irq, bool skip_empty) { struct list_lru_one *l; - long nr_items; rcu_read_lock(); again: l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); - if (likely(l)) { - if (irq) - spin_lock_irq(&l->lock); - else - spin_lock(&l->lock); - nr_items = READ_ONCE(l->nr_items); - if (likely(nr_items != LONG_MIN)) { - rcu_read_unlock(); - return l; - } - if (irq) - spin_unlock_irq(&l->lock); - else - spin_unlock(&l->lock); + if (likely(l) && lock_list_lru(l, irq)) { + rcu_read_unlock(); + return l; } /* * Caller may simply bail out if raced with reparenting or diff --git a/mm/maccess.c b/mm/maccess.c index 831b4dd7296c..486559d68858 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -82,7 +82,6 @@ Efault: pagefault_enable(); return -EFAULT; } -EXPORT_SYMBOL_GPL(copy_to_kernel_nofault); long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) { diff --git a/mm/madvise.c b/mm/madvise.c index 5f7a66a1617e..bb80fc5ea08f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -37,6 +37,8 @@ #include "internal.h" #include "swap.h" +#define __MADV_SET_ANON_VMA_NAME (-1) + /* * Maximum number of attempts we make to install guard pages before we give up * and return -ERESTARTNOINTR to have userspace try again. @@ -48,39 +50,39 @@ struct madvise_walk_private { bool pageout; }; +enum madvise_lock_mode { + MADVISE_NO_LOCK, + MADVISE_MMAP_READ_LOCK, + MADVISE_MMAP_WRITE_LOCK, + MADVISE_VMA_READ_LOCK, +}; + +struct madvise_behavior_range { + unsigned long start; + unsigned long end; +}; + struct madvise_behavior { + struct mm_struct *mm; int behavior; struct mmu_gather *tlb; -}; + enum madvise_lock_mode lock_mode; + struct anon_vma_name *anon_name; -/* - * Any behaviour which results in changes to the vma->vm_flags needs to - * take mmap_lock for writing. 
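The new ksm_vma_flags() helper above is meant to be called before the VMA exists, so the mmap path can fold VM_MERGEABLE into the flags it is about to install instead of patching the VMA afterwards as ksm_add_vma() did. An illustrative sketch of such a call site (the initial flag computation is made up):

	/* Flags the mmap path has computed for the new mapping (illustrative). */
	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;

	/* Adds VM_MERGEABLE when the mm has MMF_VM_MERGE_ANY and the mapping is compatible. */
	vm_flags = ksm_vma_flags(mm, file, vm_flags);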
Others, which simply traverse vmas, need - * to only take it for reading. - */ -static int madvise_need_mmap_write(int behavior) -{ - switch (behavior) { - case MADV_REMOVE: - case MADV_WILLNEED: - case MADV_DONTNEED: - case MADV_DONTNEED_LOCKED: - case MADV_COLD: - case MADV_PAGEOUT: - case MADV_FREE: - case MADV_POPULATE_READ: - case MADV_POPULATE_WRITE: - case MADV_COLLAPSE: - case MADV_GUARD_INSTALL: - case MADV_GUARD_REMOVE: - return 0; - default: - /* be safe, default to 1. list exceptions explicitly */ - return 1; - } -} + /* + * The range over which the behaviour is currently being applied. If + * traversing multiple VMAs, this is updated for each. + */ + struct madvise_behavior_range range; + /* The VMA and VMA preceding it (if applicable) currently targeted. */ + struct vm_area_struct *prev; + struct vm_area_struct *vma; + bool lock_dropped; +}; #ifdef CONFIG_ANON_VMA_NAME +static int madvise_walk_vmas(struct madvise_behavior *madv_behavior); + struct anon_vma_name *anon_vma_name_alloc(const char *name) { struct anon_vma_name *anon_name; @@ -106,7 +108,8 @@ void anon_vma_name_free(struct kref *kref) struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { - mmap_assert_locked(vma->vm_mm); + if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) + vma_assert_locked(vma); return vma->anon_name; } @@ -142,40 +145,39 @@ static int replace_anon_vma_name(struct vm_area_struct *vma, } #endif /* CONFIG_ANON_VMA_NAME */ /* - * Update the vm_flags on region of a vma, splitting it or merging it as - * necessary. Must be called with mmap_lock held for writing; - * Caller should ensure anon_name stability by raising its refcount even when - * anon_name belongs to a valid vma because this function might free that vma. + * Update the vm_flags or anon_name on region of a vma, splitting it or merging + * it as necessary. Must be called with mmap_lock held for writing. */ -static int madvise_update_vma(struct vm_area_struct *vma, - struct vm_area_struct **prev, unsigned long start, - unsigned long end, unsigned long new_flags, - struct anon_vma_name *anon_name) +static int madvise_update_vma(vm_flags_t new_flags, + struct madvise_behavior *madv_behavior) { - struct mm_struct *mm = vma->vm_mm; - int error; - VMA_ITERATOR(vmi, mm, start); - - if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { - *prev = vma; + struct vm_area_struct *vma = madv_behavior->vma; + struct madvise_behavior_range *range = &madv_behavior->range; + struct anon_vma_name *anon_name = madv_behavior->anon_name; + bool set_new_anon_name = madv_behavior->behavior == __MADV_SET_ANON_VMA_NAME; + VMA_ITERATOR(vmi, madv_behavior->mm, range->start); + + if (new_flags == vma->vm_flags && (!set_new_anon_name || + anon_vma_name_eq(anon_vma_name(vma), anon_name))) return 0; - } - vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags, - anon_name); + if (set_new_anon_name) + vma = vma_modify_name(&vmi, madv_behavior->prev, vma, + range->start, range->end, anon_name); + else + vma = vma_modify_flags(&vmi, madv_behavior->prev, vma, + range->start, range->end, new_flags); + if (IS_ERR(vma)) return PTR_ERR(vma); - *prev = vma; + madv_behavior->vma = vma; /* vm_flags is protected by the mmap_lock held in write mode. 
*/ vma_start_write(vma); vm_flags_reset(vma, new_flags); - if (!vma->vm_file || vma_is_anon_shmem(vma)) { - error = replace_anon_vma_name(vma, anon_name); - if (error) - return error; - } + if (set_new_anon_name) + return replace_anon_vma_name(vma, anon_name); return 0; } @@ -268,21 +270,27 @@ static void shmem_swapin_range(struct vm_area_struct *vma, } #endif /* CONFIG_SWAP */ +static void mark_mmap_lock_dropped(struct madvise_behavior *madv_behavior) +{ + VM_WARN_ON_ONCE(madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK); + madv_behavior->lock_dropped = true; +} + /* * Schedule all required I/O operations. Do not wait for completion. */ -static long madvise_willneed(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end) +static long madvise_willneed(struct madvise_behavior *madv_behavior) { - struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *vma = madv_behavior->vma; + struct mm_struct *mm = madv_behavior->mm; struct file *file = vma->vm_file; + unsigned long start = madv_behavior->range.start; + unsigned long end = madv_behavior->range.end; loff_t offset; - *prev = vma; #ifdef CONFIG_SWAP if (!file) { - walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma); + walk_page_range_vma(vma, start, end, &swapin_walk_ops, vma); lru_add_drain(); /* Push any new pages onto the LRU now */ return 0; } @@ -308,7 +316,7 @@ static long madvise_willneed(struct vm_area_struct *vma, * vma's reference to the file) can go away as soon as we drop * mmap_lock. */ - *prev = NULL; /* tell sys_madvise we drop mmap_lock */ + mark_mmap_lock_dropped(madv_behavior); get_file(file); offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); @@ -336,14 +344,12 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma) static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, struct folio *folio, pte_t *ptep, - pte_t pte, bool *any_young, - bool *any_dirty) + pte_t *ptentp) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; int max_nr = (end - addr) / PAGE_SIZE; - return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL, - any_young, any_dirty); + return folio_pte_batch_flags(folio, NULL, ptep, ptentp, max_nr, + FPB_MERGE_YOUNG_DIRTY); } static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, @@ -481,13 +487,7 @@ restart: * next pte in the range. 
*/ if (folio_test_large(folio)) { - bool any_young; - - nr = madvise_folio_pte_batch(addr, end, folio, pte, - ptent, &any_young, NULL); - if (any_young) - ptent = pte_mkyoung(ptent); - + nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent); if (nr < folio_nr_pages(folio)) { int err; @@ -508,6 +508,7 @@ restart: pte_offset_map_lock(mm, pmd, addr, &ptl); if (!start_pte) break; + flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); if (!err) nr = 0; @@ -572,16 +573,19 @@ static const struct mm_walk_ops cold_walk_ops = { }; static void madvise_cold_page_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, - unsigned long addr, unsigned long end) + struct madvise_behavior *madv_behavior) + { + struct vm_area_struct *vma = madv_behavior->vma; + struct madvise_behavior_range *range = &madv_behavior->range; struct madvise_walk_private walk_private = { .pageout = false, .tlb = tlb, }; tlb_start_vma(tlb, vma); - walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); + walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops, + &walk_private); tlb_end_vma(tlb, vma); } @@ -590,28 +594,25 @@ static inline bool can_madv_lru_vma(struct vm_area_struct *vma) return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB)); } -static long madvise_cold(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start_addr, unsigned long end_addr) +static long madvise_cold(struct madvise_behavior *madv_behavior) { - struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *vma = madv_behavior->vma; struct mmu_gather tlb; - *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; lru_add_drain(); - tlb_gather_mmu(&tlb, mm); - madvise_cold_page_range(&tlb, vma, start_addr, end_addr); + tlb_gather_mmu(&tlb, madv_behavior->mm); + madvise_cold_page_range(&tlb, madv_behavior); tlb_finish_mmu(&tlb); return 0; } static void madvise_pageout_page_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, - unsigned long addr, unsigned long end) + struct vm_area_struct *vma, + struct madvise_behavior_range *range) { struct madvise_walk_private walk_private = { .pageout = true, @@ -619,18 +620,16 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb, }; tlb_start_vma(tlb, vma); - walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); + walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops, + &walk_private); tlb_end_vma(tlb, vma); } -static long madvise_pageout(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start_addr, unsigned long end_addr) +static long madvise_pageout(struct madvise_behavior *madv_behavior) { - struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; + struct vm_area_struct *vma = madv_behavior->vma; - *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; @@ -645,8 +644,8 @@ static long madvise_pageout(struct vm_area_struct *vma, return 0; lru_add_drain(); - tlb_gather_mmu(&tlb, mm); - madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); + tlb_gather_mmu(&tlb, madv_behavior->mm); + madvise_pageout_page_range(&tlb, vma, &madv_behavior->range); tlb_finish_mmu(&tlb); return 0; @@ -718,11 +717,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, * next pte in the range. 
*/ if (folio_test_large(folio)) { - bool any_young, any_dirty; - - nr = madvise_folio_pte_batch(addr, end, folio, pte, - ptent, &any_young, &any_dirty); - + nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent); if (nr < folio_nr_pages(folio)) { int err; @@ -741,16 +736,12 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, start_pte = pte; if (!start_pte) break; + flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); if (!err) nr = 0; continue; } - - if (any_young) - ptent = pte_mkyoung(ptent); - if (any_dirty) - ptent = pte_mkdirty(ptent); } if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { @@ -794,18 +785,31 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -static const struct mm_walk_ops madvise_free_walk_ops = { - .pmd_entry = madvise_free_pte_range, - .walk_lock = PGWALK_RDLOCK, -}; +static inline enum page_walk_lock get_walk_lock(enum madvise_lock_mode mode) +{ + switch (mode) { + case MADVISE_VMA_READ_LOCK: + return PGWALK_VMA_RDLOCK_VERIFY; + case MADVISE_MMAP_READ_LOCK: + return PGWALK_RDLOCK; + default: + /* Other modes don't require fixing up the walk_lock */ + WARN_ON_ONCE(1); + return PGWALK_RDLOCK; + } +} -static int madvise_free_single_vma(struct madvise_behavior *madv_behavior, - struct vm_area_struct *vma, - unsigned long start_addr, unsigned long end_addr) +static int madvise_free_single_vma(struct madvise_behavior *madv_behavior) { - struct mm_struct *mm = vma->vm_mm; + struct mm_struct *mm = madv_behavior->mm; + struct vm_area_struct *vma = madv_behavior->vma; + unsigned long start_addr = madv_behavior->range.start; + unsigned long end_addr = madv_behavior->range.end; struct mmu_notifier_range range; struct mmu_gather *tlb = madv_behavior->tlb; + struct mm_walk_ops walk_ops = { + .pmd_entry = madvise_free_pte_range, + }; /* MADV_FREE works for only anon vma at the moment */ if (!vma_is_anonymous(vma)) @@ -825,8 +829,9 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior, mmu_notifier_invalidate_range_start(&range); tlb_start_vma(tlb, vma); - walk_page_range(vma->vm_mm, range.start, range.end, - &madvise_free_walk_ops, tlb); + walk_ops.walk_lock = get_walk_lock(madv_behavior->lock_mode); + walk_page_range_vma(vma, range.start, range.end, + &walk_ops, tlb); tlb_end_vma(tlb, vma); mmu_notifier_invalidate_range_end(&range); return 0; @@ -851,25 +856,28 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior, * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). 
*/ -static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior, - struct vm_area_struct *vma, - unsigned long start, unsigned long end) +static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior) + { + struct madvise_behavior_range *range = &madv_behavior->range; struct zap_details details = { .reclaim_pt = true, .even_cows = true, }; zap_page_range_single_batched( - madv_behavior->tlb, vma, start, end - start, &details); + madv_behavior->tlb, madv_behavior->vma, range->start, + range->end - range->start, &details); return 0; } -static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, - unsigned long start, - unsigned long *end, - int behavior) +static +bool madvise_dontneed_free_valid_vma(struct madvise_behavior *madv_behavior) { + struct vm_area_struct *vma = madv_behavior->vma; + int behavior = madv_behavior->behavior; + struct madvise_behavior_range *range = &madv_behavior->range; + if (!is_vm_hugetlb_page(vma)) { unsigned int forbidden = VM_PFNMAP; @@ -881,7 +889,7 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED) return false; - if (start & ~huge_page_mask(hstate_vma(vma))) + if (range->start & ~huge_page_mask(hstate_vma(vma))) return false; /* @@ -890,41 +898,38 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, * Avoid unexpected data loss by rounding down the number of * huge pages freed. */ - *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma))); + range->end = ALIGN_DOWN(range->end, huge_page_size(hstate_vma(vma))); return true; } -static long madvise_dontneed_free(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end, - struct madvise_behavior *madv_behavior) +static long madvise_dontneed_free(struct madvise_behavior *madv_behavior) { + struct mm_struct *mm = madv_behavior->mm; + struct madvise_behavior_range *range = &madv_behavior->range; int behavior = madv_behavior->behavior; - struct mm_struct *mm = vma->vm_mm; - *prev = vma; - if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) + if (!madvise_dontneed_free_valid_vma(madv_behavior)) return -EINVAL; - if (start == end) + if (range->start == range->end) return 0; - if (!userfaultfd_remove(vma, start, end)) { - *prev = NULL; /* mmap_lock has been dropped, prev is stale */ + if (!userfaultfd_remove(madv_behavior->vma, range->start, range->end)) { + struct vm_area_struct *vma; + mark_mmap_lock_dropped(madv_behavior); mmap_read_lock(mm); - vma = vma_lookup(mm, start); + madv_behavior->vma = vma = vma_lookup(mm, range->start); if (!vma) return -ENOMEM; /* * Potential end adjustment for hugetlb vma is OK as * the check below keeps end within vma. */ - if (!madvise_dontneed_free_valid_vma(vma, start, &end, - behavior)) + if (!madvise_dontneed_free_valid_vma(madv_behavior)) return -EINVAL; - if (end > vma->vm_end) { + if (range->end > vma->vm_end) { /* * Don't fail if end > vma->vm_end. If the old * vma was split while the mmap_lock was @@ -937,7 +942,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, * end-vma->vm_end range, but the manager can * handle a repetition fine. */ - end = vma->vm_end; + range->end = vma->vm_end; } /* * If the memory region between start and end was @@ -946,25 +951,26 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, * the adjustment for hugetlb vma above may have rounded * end down to the start address. 
*/ - if (start == end) + if (range->start == range->end) return 0; - VM_WARN_ON(start > end); + VM_WARN_ON(range->start > range->end); } if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) - return madvise_dontneed_single_vma( - madv_behavior, vma, start, end); + return madvise_dontneed_single_vma(madv_behavior); else if (behavior == MADV_FREE) - return madvise_free_single_vma(madv_behavior, vma, start, end); + return madvise_free_single_vma(madv_behavior); else return -EINVAL; } -static long madvise_populate(struct mm_struct *mm, unsigned long start, - unsigned long end, int behavior) +static long madvise_populate(struct madvise_behavior *madv_behavior) { - const bool write = behavior == MADV_POPULATE_WRITE; + struct mm_struct *mm = madv_behavior->mm; + const bool write = madv_behavior->behavior == MADV_POPULATE_WRITE; int locked = 1; + unsigned long start = madv_behavior->range.start; + unsigned long end = madv_behavior->range.end; long pages; while (start < end) { @@ -1001,16 +1007,17 @@ static long madvise_populate(struct mm_struct *mm, unsigned long start, * Application wants to free up the pages and associated backing store. * This is effectively punching a hole into the middle of a file. */ -static long madvise_remove(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end) +static long madvise_remove(struct madvise_behavior *madv_behavior) { loff_t offset; int error; struct file *f; - struct mm_struct *mm = vma->vm_mm; + struct mm_struct *mm = madv_behavior->mm; + struct vm_area_struct *vma = madv_behavior->vma; + unsigned long start = madv_behavior->range.start; + unsigned long end = madv_behavior->range.end; - *prev = NULL; /* tell sys_madvise we drop mmap_lock */ + mark_mmap_lock_dropped(madv_behavior); if (vma->vm_flags & VM_LOCKED) return -EINVAL; @@ -1073,7 +1080,7 @@ static int guard_install_pud_entry(pud_t *pud, unsigned long addr, pud_t pudval = pudp_get(pud); /* If huge return >0 so we abort the operation + zap. */ - return pud_trans_huge(pudval) || pud_devmap(pudval); + return pud_trans_huge(pudval); } static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr, @@ -1082,7 +1089,7 @@ static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr, pmd_t pmdval = pmdp_get(pmd); /* If huge return >0 so we abort the operation + zap. */ - return pmd_trans_huge(pmdval) || pmd_devmap(pmdval); + return pmd_trans_huge(pmdval); } static int guard_install_pte_entry(pte_t *pte, unsigned long addr, @@ -1122,14 +1129,13 @@ static const struct mm_walk_ops guard_install_walk_ops = { .walk_lock = PGWALK_RDLOCK, }; -static long madvise_guard_install(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end) +static long madvise_guard_install(struct madvise_behavior *madv_behavior) { + struct vm_area_struct *vma = madv_behavior->vma; + struct madvise_behavior_range *range = &madv_behavior->range; long err; int i; - *prev = vma; if (!is_valid_guard_vma(vma, /* allow_locked = */false)) return -EINVAL; @@ -1160,13 +1166,14 @@ static long madvise_guard_install(struct vm_area_struct *vma, unsigned long nr_pages = 0; /* Returns < 0 on error, == 0 if success, > 0 if zap needed. 
*/ - err = walk_page_range_mm(vma->vm_mm, start, end, + err = walk_page_range_mm(vma->vm_mm, range->start, range->end, &guard_install_walk_ops, &nr_pages); if (err < 0) return err; if (err == 0) { - unsigned long nr_expected_pages = PHYS_PFN(end - start); + unsigned long nr_expected_pages = + PHYS_PFN(range->end - range->start); VM_WARN_ON(nr_pages != nr_expected_pages); return 0; @@ -1176,7 +1183,8 @@ static long madvise_guard_install(struct vm_area_struct *vma, * OK some of the range have non-guard pages mapped, zap * them. This leaves existing guard pages in place. */ - zap_page_range_single(vma, start, end - start, NULL); + zap_page_range_single(vma, range->start, + range->end - range->start, NULL); } /* @@ -1193,7 +1201,7 @@ static int guard_remove_pud_entry(pud_t *pud, unsigned long addr, pud_t pudval = pudp_get(pud); /* If huge, cannot have guard pages present, so no-op - skip. */ - if (pud_trans_huge(pudval) || pud_devmap(pudval)) + if (pud_trans_huge(pudval)) walk->action = ACTION_CONTINUE; return 0; @@ -1205,7 +1213,7 @@ static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr, pmd_t pmdval = pmdp_get(pmd); /* If huge, cannot have guard pages present, so no-op - skip. */ - if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) + if (pmd_trans_huge(pmdval)) walk->action = ACTION_CONTINUE; return 0; @@ -1232,11 +1240,11 @@ static const struct mm_walk_ops guard_remove_walk_ops = { .walk_lock = PGWALK_RDLOCK, }; -static long madvise_guard_remove(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end) +static long madvise_guard_remove(struct madvise_behavior *madv_behavior) { - *prev = vma; + struct vm_area_struct *vma = madv_behavior->vma; + struct madvise_behavior_range *range = &madv_behavior->range; + /* * We're ok with removing guards in mlock()'d ranges, as this is a * non-destructive action. @@ -1244,7 +1252,7 @@ static long madvise_guard_remove(struct vm_area_struct *vma, if (!is_valid_guard_vma(vma, /* allow_locked = */true)) return -EINVAL; - return walk_page_range(vma->vm_mm, start, end, + return walk_page_range_vma(vma, range->start, range->end, &guard_remove_walk_ops, NULL); } @@ -1253,33 +1261,40 @@ static long madvise_guard_remove(struct vm_area_struct *vma, * will handle splitting a vm area into separate areas, each area with its own * behavior. 
*/ -static int madvise_vma_behavior(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end, - void *behavior_arg) +static int madvise_vma_behavior(struct madvise_behavior *madv_behavior) { - struct madvise_behavior *arg = behavior_arg; - int behavior = arg->behavior; + int behavior = madv_behavior->behavior; + struct vm_area_struct *vma = madv_behavior->vma; + vm_flags_t new_flags = vma->vm_flags; + struct madvise_behavior_range *range = &madv_behavior->range; int error; - struct anon_vma_name *anon_name; - unsigned long new_flags = vma->vm_flags; - if (unlikely(!can_modify_vma_madv(vma, behavior))) + if (unlikely(!can_modify_vma_madv(madv_behavior->vma, behavior))) return -EPERM; switch (behavior) { case MADV_REMOVE: - return madvise_remove(vma, prev, start, end); + return madvise_remove(madv_behavior); case MADV_WILLNEED: - return madvise_willneed(vma, prev, start, end); + return madvise_willneed(madv_behavior); case MADV_COLD: - return madvise_cold(vma, prev, start, end); + return madvise_cold(madv_behavior); case MADV_PAGEOUT: - return madvise_pageout(vma, prev, start, end); + return madvise_pageout(madv_behavior); case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: - return madvise_dontneed_free(vma, prev, start, end, arg); + return madvise_dontneed_free(madv_behavior); + case MADV_COLLAPSE: + return madvise_collapse(vma, range->start, range->end, + &madv_behavior->lock_dropped); + case MADV_GUARD_INSTALL: + return madvise_guard_install(madv_behavior); + case MADV_GUARD_REMOVE: + return madvise_guard_remove(madv_behavior); + + /* The below behaviours update VMAs via madvise_update_vma(). */ + case MADV_NORMAL: new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; break; @@ -1293,18 +1308,18 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, new_flags |= VM_DONTCOPY; break; case MADV_DOFORK: - if (vma->vm_flags & VM_IO) + if (new_flags & VM_IO) return -EINVAL; new_flags &= ~VM_DONTCOPY; break; case MADV_WIPEONFORK: /* MADV_WIPEONFORK is only supported on anonymous memory. 
*/ - if (vma->vm_file || vma->vm_flags & VM_SHARED) + if (vma->vm_file || new_flags & VM_SHARED) return -EINVAL; new_flags |= VM_WIPEONFORK; break; case MADV_KEEPONFORK: - if (vma->vm_flags & VM_DROPPABLE) + if (new_flags & VM_DROPPABLE) return -EINVAL; new_flags &= ~VM_WIPEONFORK; break; @@ -1312,14 +1327,15 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: - if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) || - (vma->vm_flags & VM_DROPPABLE)) + if ((!is_vm_hugetlb_page(vma) && (new_flags & VM_SPECIAL)) || + (new_flags & VM_DROPPABLE)) return -EINVAL; new_flags &= ~VM_DONTDUMP; break; case MADV_MERGEABLE: case MADV_UNMERGEABLE: - error = ksm_madvise(vma, start, end, behavior, &new_flags); + error = ksm_madvise(vma, range->start, range->end, + behavior, &new_flags); if (error) goto out; break; @@ -1329,20 +1345,17 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, if (error) goto out; break; - case MADV_COLLAPSE: - return madvise_collapse(vma, prev, start, end); - case MADV_GUARD_INSTALL: - return madvise_guard_install(vma, prev, start, end); - case MADV_GUARD_REMOVE: - return madvise_guard_remove(vma, prev, start, end); + case __MADV_SET_ANON_VMA_NAME: + /* Only anonymous mappings can be named */ + if (vma->vm_file && !vma_is_anon_shmem(vma)) + return -EBADF; + break; } - anon_name = anon_vma_name(vma); - anon_vma_name_get(anon_name); - error = madvise_update_vma(vma, prev, start, end, new_flags, - anon_name); - anon_vma_name_put(anon_name); + /* This is a write operation.*/ + VM_WARN_ON_ONCE(madv_behavior->lock_mode != MADVISE_MMAP_WRITE_LOCK); + error = madvise_update_vma(new_flags, madv_behavior); out: /* * madvise() returns EAGAIN if kernel resources, such as @@ -1357,15 +1370,15 @@ out: /* * Error injection support for memory error handling. */ -static int madvise_inject_error(int behavior, - unsigned long start, unsigned long end) +static int madvise_inject_error(struct madvise_behavior *madv_behavior) { unsigned long size; + unsigned long start = madv_behavior->range.start; + unsigned long end = madv_behavior->range.end; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - for (; start < end; start += size) { unsigned long pfn; struct page *page; @@ -1383,7 +1396,7 @@ static int madvise_inject_error(int behavior, */ size = page_size(compound_head(page)); - if (behavior == MADV_SOFT_OFFLINE) { + if (madv_behavior->behavior == MADV_SOFT_OFFLINE) { pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", pfn, start); ret = soft_offline_page(pfn, MF_COUNT_INCREASED); @@ -1402,9 +1415,9 @@ static int madvise_inject_error(int behavior, return 0; } -static bool is_memory_failure(int behavior) +static bool is_memory_failure(struct madvise_behavior *madv_behavior) { - switch (behavior) { + switch (madv_behavior->behavior) { case MADV_HWPOISON: case MADV_SOFT_OFFLINE: return true; @@ -1415,13 +1428,12 @@ static bool is_memory_failure(int behavior) #else -static int madvise_inject_error(int behavior, - unsigned long start, unsigned long end) +static int madvise_inject_error(struct madvise_behavior *madv_behavior) { return 0; } -static bool is_memory_failure(int behavior) +static bool is_memory_failure(struct madvise_behavior *madv_behavior) { return false; } @@ -1487,145 +1499,202 @@ static bool process_madvise_remote_valid(int behavior) } /* - * Walk the vmas in range [start,end), and call the visit function on each one. 
- * The visit function will get start and end parameters that cover the overlap - * between the current vma and the original range. Any unmapped regions in the - * original range will result in this function returning -ENOMEM while still - * calling the visit function on all of the existing vmas in the range. - * Must be called with the mmap_lock held for reading or writing. + * Try to acquire a VMA read lock if possible. + * + * We only support this lock over a single VMA, which the input range must + * span either partially or fully. + * + * This function always returns with an appropriate lock held. If a VMA read + * lock could be acquired, we return true and set madv_behavior state + * accordingly. + * + * If a VMA read lock could not be acquired, we return false and expect caller to + * fallback to mmap lock behaviour. */ -static -int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, - unsigned long end, void *arg, - int (*visit)(struct vm_area_struct *vma, - struct vm_area_struct **prev, unsigned long start, - unsigned long end, void *arg)) +static bool try_vma_read_lock(struct madvise_behavior *madv_behavior) { + struct mm_struct *mm = madv_behavior->mm; struct vm_area_struct *vma; - struct vm_area_struct *prev; - unsigned long tmp; + + vma = lock_vma_under_rcu(mm, madv_behavior->range.start); + if (!vma) + goto take_mmap_read_lock; + /* + * Must span only a single VMA; uffd and remote processes are + * unsupported. + */ + if (madv_behavior->range.end > vma->vm_end || current->mm != mm || + userfaultfd_armed(vma)) { + vma_end_read(vma); + goto take_mmap_read_lock; + } + madv_behavior->vma = vma; + return true; + +take_mmap_read_lock: + mmap_read_lock(mm); + madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK; + return false; +} + +/* + * Walk the vmas in range [start,end), and call the madvise_vma_behavior + * function on each one. The function will get start and end parameters that + * cover the overlap between the current vma and the original range. Any + * unmapped regions in the original range will result in this function returning + * -ENOMEM while still calling the madvise_vma_behavior function on all of the + * existing vmas in the range. Must be called with the mmap_lock held for + * reading or writing. + */ +static +int madvise_walk_vmas(struct madvise_behavior *madv_behavior) +{ + struct mm_struct *mm = madv_behavior->mm; + struct madvise_behavior_range *range = &madv_behavior->range; + /* range is updated to span each VMA, so store end of entire range. */ + unsigned long last_end = range->end; int unmapped_error = 0; + int error; + struct vm_area_struct *prev, *vma; /* - * If the interval [start,end) covers some unmapped address - * ranges, just ignore them, but return -ENOMEM at the end. - * - different from the way of handling in mlock etc. + * If VMA read lock is supported, apply madvise to a single VMA + * tentatively, avoiding walking VMAs. */ - vma = find_vma_prev(mm, start, &prev); - if (vma && start > vma->vm_start) + if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK && + try_vma_read_lock(madv_behavior)) { + error = madvise_vma_behavior(madv_behavior); + vma_end_read(madv_behavior->vma); + return error; + } + + vma = find_vma_prev(mm, range->start, &prev); + if (vma && range->start > vma->vm_start) prev = vma; for (;;) { - int error; - /* Still start < end. */ if (!vma) return -ENOMEM; - /* Here start < (end|vma->vm_end). */ - if (start < vma->vm_start) { + /* Here start < (last_end|vma->vm_end). 
*/ + if (range->start < vma->vm_start) { + /* + * This indicates a gap between VMAs in the input + * range. This does not cause the operation to abort, + * rather we simply return -ENOMEM to indicate that this + * has happened, but carry on. + */ unmapped_error = -ENOMEM; - start = vma->vm_start; - if (start >= end) + range->start = vma->vm_start; + if (range->start >= last_end) break; } - /* Here vma->vm_start <= start < (end|vma->vm_end) */ - tmp = vma->vm_end; - if (end < tmp) - tmp = end; + /* Here vma->vm_start <= range->start < (last_end|vma->vm_end) */ + range->end = min(vma->vm_end, last_end); - /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ - error = visit(vma, &prev, start, tmp, arg); + /* Here vma->vm_start <= range->start < range->end <= (last_end|vma->vm_end). */ + madv_behavior->prev = prev; + madv_behavior->vma = vma; + error = madvise_vma_behavior(madv_behavior); if (error) return error; - start = tmp; - if (prev && start < prev->vm_end) - start = prev->vm_end; - if (start >= end) + if (madv_behavior->lock_dropped) { + /* We dropped the mmap lock, we can't ref the VMA. */ + prev = NULL; + vma = NULL; + madv_behavior->lock_dropped = false; + } else { + vma = madv_behavior->vma; + prev = vma; + } + + if (vma && range->end < vma->vm_end) + range->end = vma->vm_end; + if (range->end >= last_end) break; - if (prev) - vma = find_vma(mm, prev->vm_end); - else /* madvise_remove dropped mmap_lock */ - vma = find_vma(mm, start); + + vma = find_vma(mm, vma ? vma->vm_end : range->end); + range->start = range->end; } return unmapped_error; } -#ifdef CONFIG_ANON_VMA_NAME -static int madvise_vma_anon_name(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end, - void *anon_name) -{ - int error; - - /* Only anonymous mappings can be named */ - if (vma->vm_file && !vma_is_anon_shmem(vma)) - return -EBADF; - - error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, - anon_name); - - /* - * madvise() returns EAGAIN if kernel resources, such as - * slab, are temporarily unavailable. - */ - if (error == -ENOMEM) - error = -EAGAIN; - return error; -} - -int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, - unsigned long len_in, struct anon_vma_name *anon_name) +/* + * Any behaviour which results in changes to the vma->vm_flags needs to + * take mmap_lock for writing. Others, which simply traverse vmas, need + * to only take it for reading. 
+ */ +static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior) { - unsigned long end; - unsigned long len; + if (is_memory_failure(madv_behavior)) + return MADVISE_NO_LOCK; - if (start & ~PAGE_MASK) - return -EINVAL; - len = (len_in + ~PAGE_MASK) & PAGE_MASK; - - /* Check to see whether len was rounded up from small -ve to zero */ - if (len_in && !len) - return -EINVAL; - - end = start + len; - if (end < start) - return -EINVAL; - - if (end == start) - return 0; - - return madvise_walk_vmas(mm, start, end, anon_name, - madvise_vma_anon_name); + switch (madv_behavior->behavior) { + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_COLD: + case MADV_PAGEOUT: + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + case MADV_COLLAPSE: + case MADV_GUARD_INSTALL: + case MADV_GUARD_REMOVE: + return MADVISE_MMAP_READ_LOCK; + case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: + case MADV_FREE: + return MADVISE_VMA_READ_LOCK; + default: + return MADVISE_MMAP_WRITE_LOCK; + } } -#endif /* CONFIG_ANON_VMA_NAME */ -static int madvise_lock(struct mm_struct *mm, int behavior) +static int madvise_lock(struct madvise_behavior *madv_behavior) { - if (is_memory_failure(behavior)) - return 0; + struct mm_struct *mm = madv_behavior->mm; + enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior); - if (madvise_need_mmap_write(behavior)) { + switch (lock_mode) { + case MADVISE_NO_LOCK: + break; + case MADVISE_MMAP_WRITE_LOCK: if (mmap_write_lock_killable(mm)) return -EINTR; - } else { + break; + case MADVISE_MMAP_READ_LOCK: mmap_read_lock(mm); + break; + case MADVISE_VMA_READ_LOCK: + /* We will acquire the lock per-VMA in madvise_walk_vmas(). */ + break; } + + madv_behavior->lock_mode = lock_mode; return 0; } -static void madvise_unlock(struct mm_struct *mm, int behavior) +static void madvise_unlock(struct madvise_behavior *madv_behavior) { - if (is_memory_failure(behavior)) - return; + struct mm_struct *mm = madv_behavior->mm; - if (madvise_need_mmap_write(behavior)) + switch (madv_behavior->lock_mode) { + case MADVISE_NO_LOCK: + return; + case MADVISE_MMAP_WRITE_LOCK: mmap_write_unlock(mm); - else + break; + case MADVISE_MMAP_READ_LOCK: mmap_read_unlock(mm); + break; + case MADVISE_VMA_READ_LOCK: + /* We will drop the lock per-VMA in madvise_walk_vmas(). */ + break; + } + + madv_behavior->lock_mode = MADVISE_NO_LOCK; } static bool madvise_batch_tlb_flush(int behavior) @@ -1640,11 +1709,10 @@ static bool madvise_batch_tlb_flush(int behavior) } } -static void madvise_init_tlb(struct madvise_behavior *madv_behavior, - struct mm_struct *mm) +static void madvise_init_tlb(struct madvise_behavior *madv_behavior) { if (madvise_batch_tlb_flush(madv_behavior->behavior)) - tlb_gather_mmu(madv_behavior->tlb, mm); + tlb_gather_mmu(madv_behavior->tlb, madv_behavior->mm); } static void madvise_finish_tlb(struct madvise_behavior *madv_behavior) @@ -1699,9 +1767,9 @@ static bool madvise_should_skip(unsigned long start, size_t len_in, return false; } -static bool is_madvise_populate(int behavior) +static bool is_madvise_populate(struct madvise_behavior *madv_behavior) { - switch (behavior) { + switch (madv_behavior->behavior) { case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: return true; @@ -1710,26 +1778,42 @@ static bool is_madvise_populate(int behavior) } } -static int madvise_do_behavior(struct mm_struct *mm, - unsigned long start, size_t len_in, +/* + * untagged_addr_remote() assumes mmap_lock is already held. 
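The lock helpers above are designed to bracket the whole operation, with the mode derived once from the behaviour; the do_madvise() hunk below uses them in exactly this shape (condensed sketch, TLB batching setup omitted):

	struct madvise_behavior madv_behavior = {
		.mm = mm,
		.behavior = behavior,
	};
	int error;

	error = madvise_lock(&madv_behavior);	/* no-lock, mmap read/write, or per-VMA mode */
	if (error)
		return error;
	/* ... madvise_do_behavior() runs under the chosen lock ... */
	madvise_unlock(&madv_behavior);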
On + * architectures like x86 and RISC-V, tagging is tricky because each + * mm may have a different tagging mask. However, we might only hold + * the per-VMA lock (currently only local processes are supported), + * so untagged_addr is used to avoid the mmap_lock assertion for + * local processes. + */ +static inline unsigned long get_untagged_addr(struct mm_struct *mm, + unsigned long start) +{ + return current->mm == mm ? untagged_addr(start) : + untagged_addr_remote(mm, start); +} + +static int madvise_do_behavior(unsigned long start, size_t len_in, struct madvise_behavior *madv_behavior) { - int behavior = madv_behavior->behavior; struct blk_plug plug; - unsigned long end; int error; + struct madvise_behavior_range *range = &madv_behavior->range; + + if (is_memory_failure(madv_behavior)) { + range->start = start; + range->end = start + len_in; + return madvise_inject_error(madv_behavior); + } - if (is_memory_failure(behavior)) - return madvise_inject_error(behavior, start, start + len_in); - start = untagged_addr_remote(mm, start); - end = start + PAGE_ALIGN(len_in); + range->start = get_untagged_addr(madv_behavior->mm, start); + range->end = range->start + PAGE_ALIGN(len_in); blk_start_plug(&plug); - if (is_madvise_populate(behavior)) - error = madvise_populate(mm, start, end, behavior); + if (is_madvise_populate(madv_behavior)) + error = madvise_populate(madv_behavior); else - error = madvise_walk_vmas(mm, start, end, madv_behavior, - madvise_vma_behavior); + error = madvise_walk_vmas(madv_behavior); blk_finish_plug(&plug); return error; } @@ -1811,19 +1895,20 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh int error; struct mmu_gather tlb; struct madvise_behavior madv_behavior = { + .mm = mm, .behavior = behavior, .tlb = &tlb, }; if (madvise_should_skip(start, len_in, behavior, &error)) return error; - error = madvise_lock(mm, behavior); + error = madvise_lock(&madv_behavior); if (error) return error; - madvise_init_tlb(&madv_behavior, mm); - error = madvise_do_behavior(mm, start, len_in, &madv_behavior); + madvise_init_tlb(&madv_behavior); + error = madvise_do_behavior(start, len_in, &madv_behavior); madvise_finish_tlb(&madv_behavior); - madvise_unlock(mm, behavior); + madvise_unlock(&madv_behavior); return error; } @@ -1841,16 +1926,17 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, size_t total_len; struct mmu_gather tlb; struct madvise_behavior madv_behavior = { + .mm = mm, .behavior = behavior, .tlb = &tlb, }; total_len = iov_iter_count(iter); - ret = madvise_lock(mm, behavior); + ret = madvise_lock(&madv_behavior); if (ret) return ret; - madvise_init_tlb(&madv_behavior, mm); + madvise_init_tlb(&madv_behavior); while (iov_iter_count(iter)) { unsigned long start = (unsigned long)iter_iov_addr(iter); @@ -1860,8 +1946,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, if (madvise_should_skip(start, len_in, behavior, &error)) ret = error; else - ret = madvise_do_behavior(mm, start, len_in, - &madv_behavior); + ret = madvise_do_behavior(start, len_in, &madv_behavior); /* * An madvise operation is attempting to restart the syscall, * but we cannot proceed as it would not be correct to repeat @@ -1880,11 +1965,11 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, /* Drop and reacquire lock to unwind race. 
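(Illustrative sketch, not part of the patch.) The CONFIG_ANON_VMA_NAME block that now lives at the bottom of madvise.c, shown in the following hunks, backs the existing prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME) interface, routed through madvise_walk_vmas() as the internal __MADV_SET_ANON_VMA_NAME behavior. A userspace sketch, assuming a kernel built with CONFIG_ANON_VMA_NAME; the prctl constants are defined as a fallback in case the libc headers lack them:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <unistd.h>

#ifndef PR_SET_VMA
#define PR_SET_VMA		0x53564d41
#define PR_SET_VMA_ANON_NAME	0
#endif

int main(void)
{
	size_t len = 16 * sysconf(_SC_PAGESIZE);
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Valid name: printable ASCII, at most 80 bytes, none of \ ` $ [ ] */
	if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
		  (unsigned long)p, len, "demo buffer"))
		perror("prctl(PR_SET_VMA_ANON_NAME)");

	/* Rejected with EINVAL: '$' is not an allowed character. */
	if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
		  (unsigned long)p, len, "bad$name"))
		perror("prctl(invalid name)");

	/* The named region shows up as "[anon:demo buffer]". */
	system("grep 'anon:demo buffer' /proc/self/maps");
	return 0;
}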
*/ madvise_finish_tlb(&madv_behavior); - madvise_unlock(mm, behavior); - ret = madvise_lock(mm, behavior); + madvise_unlock(&madv_behavior); + ret = madvise_lock(&madv_behavior); if (ret) goto out; - madvise_init_tlb(&madv_behavior, mm); + madvise_init_tlb(&madv_behavior); continue; } if (ret < 0) @@ -1892,7 +1977,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, iov_iter_advance(iter, iter_iov_len(iter)); } madvise_finish_tlb(&madv_behavior); - madvise_unlock(mm, behavior); + madvise_unlock(&madv_behavior); out: ret = (total_len - iov_iter_count(iter)) ? : ret; @@ -1963,3 +2048,88 @@ free_iov: out: return ret; } + +#ifdef CONFIG_ANON_VMA_NAME + +#define ANON_VMA_NAME_MAX_LEN 80 +#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]" + +static inline bool is_valid_name_char(char ch) +{ + /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */ + return ch > 0x1f && ch < 0x7f && + !strchr(ANON_VMA_NAME_INVALID_CHARS, ch); +} + +static int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, struct anon_vma_name *anon_name) +{ + unsigned long end; + unsigned long len; + int error; + struct madvise_behavior madv_behavior = { + .mm = mm, + .behavior = __MADV_SET_ANON_VMA_NAME, + .anon_name = anon_name, + }; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = (len_in + ~PAGE_MASK) & PAGE_MASK; + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + return -EINVAL; + + end = start + len; + if (end < start) + return -EINVAL; + + if (end == start) + return 0; + + madv_behavior.range.start = start; + madv_behavior.range.end = end; + + error = madvise_lock(&madv_behavior); + if (error) + return error; + error = madvise_walk_vmas(&madv_behavior); + madvise_unlock(&madv_behavior); + + return error; +} + +int set_anon_vma_name(unsigned long addr, unsigned long size, + const char __user *uname) +{ + struct anon_vma_name *anon_name = NULL; + struct mm_struct *mm = current->mm; + int error; + + if (uname) { + char *name, *pch; + + name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); + if (IS_ERR(name)) + return PTR_ERR(name); + + for (pch = name; *pch != '\0'; pch++) { + if (!is_valid_name_char(*pch)) { + kfree(name); + return -EINVAL; + } + } + /* anon_vma has its own copy */ + anon_name = anon_vma_name_alloc(name); + kfree(name); + if (!anon_name) + return -ENOMEM; + } + + error = madvise_set_anon_name(mm, addr, size, anon_name); + anon_vma_name_put(anon_name); + + return error; +} +#endif diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index 2f8829b3541a..c193de6cb23a 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -129,7 +129,7 @@ static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end, pmd_t pmdval = pmdp_get_lockless(pmd); /* Do not split a huge pmd, present or migrated */ - if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) { + if (pmd_trans_huge(pmdval)) { WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval)); walk->action = ACTION_CONTINUE; } @@ -152,7 +152,7 @@ static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end, pud_t pudval = READ_ONCE(*pud); /* Do not split a huge pud */ - if (pud_trans_huge(pudval) || pud_devmap(pudval)) { + if (pud_trans_huge(pudval)) { WARN_ON(pud_write(pudval) || pud_dirty(pudval)); walk->action = ACTION_CONTINUE; } @@ -218,7 +218,7 @@ static void wp_clean_post_vma(struct mm_walk *walk) static int wp_clean_test_walk(unsigned long start, unsigned long end, struct mm_walk 
*walk) { - unsigned long vm_flags = READ_ONCE(walk->vma->vm_flags); + vm_flags_t vm_flags = READ_ONCE(walk->vma->vm_flags); /* Skip non-applicable VMAs */ if ((vm_flags & (VM_SHARED | VM_MAYWRITE | VM_HUGETLB)) != diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 902da8a9c643..8dd7fbed5a94 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -51,7 +51,6 @@ #include <linux/spinlock.h> #include <linux/fs.h> #include <linux/seq_file.h> -#include <linux/parser.h> #include <linux/vmpressure.h> #include <linux/memremap.h> #include <linux/mm_inline.h> @@ -474,8 +473,6 @@ static const unsigned int memcg_vm_event_stat[] = { NUMA_PAGE_MIGRATE, NUMA_PTE_UPDATES, NUMA_HINT_FAULTS, - NUMA_TASK_MIGRATE, - NUMA_TASK_SWAP, #endif }; @@ -573,9 +570,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val, if (!val) return; - /* TODO: add to cgroup update tree once it is nmi-safe. */ - if (!in_nmi()) - css_rstat_updated(&memcg->css, cpu); + css_rstat_updated(&memcg->css, cpu); statc_pcpu = memcg->vmstats_percpu; for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) { statc = this_cpu_ptr(statc_pcpu); @@ -2530,7 +2525,8 @@ static inline void account_slab_nmi_safe(struct mem_cgroup *memcg, } else { struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id]; - /* TODO: add to cgroup update tree once it is nmi-safe. */ + /* preemption is disabled in_nmi(). */ + css_rstat_updated(&memcg->css, smp_processor_id()); if (idx == NR_SLAB_RECLAIMABLE_B) atomic_add(nr, &pn->slab_reclaimable); else @@ -2753,7 +2749,8 @@ static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val) if (likely(!in_nmi())) { mod_memcg_state(memcg, MEMCG_KMEM, val); } else { - /* TODO: add to cgroup update tree once it is nmi-safe. */ + /* preemption is disabled in_nmi(). 
*/ + css_rstat_updated(&memcg->css, smp_processor_id()); atomic_add(val, &memcg->kmem_stat); } } @@ -3757,7 +3754,10 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) INIT_LIST_HEAD(&memcg->memory_peaks); INIT_LIST_HEAD(&memcg->swap_peaks); spin_lock_init(&memcg->peaks_lock); - memcg->socket_pressure = jiffies; + memcg->socket_pressure = get_jiffies_64(); +#if BITS_PER_LONG < 64 + seqlock_init(&memcg->socket_pressure_seqlock); +#endif memcg1_memcg_init(memcg); memcg->kmemcg_id = -1; INIT_LIST_HEAD(&memcg->objcg_list); @@ -4566,83 +4566,15 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, return nbytes; } -enum { - MEMORY_RECLAIM_SWAPPINESS = 0, - MEMORY_RECLAIM_SWAPPINESS_MAX, - MEMORY_RECLAIM_NULL, -}; - -static const match_table_t tokens = { - { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"}, - { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"}, - { MEMORY_RECLAIM_NULL, NULL }, -}; - static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned int nr_retries = MAX_RECLAIM_RETRIES; - unsigned long nr_to_reclaim, nr_reclaimed = 0; - int swappiness = -1; - unsigned int reclaim_options; - char *old_buf, *start; - substring_t args[MAX_OPT_ARGS]; - - buf = strstrip(buf); - - old_buf = buf; - nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; - if (buf == old_buf) - return -EINVAL; - - buf = strstrip(buf); - - while ((start = strsep(&buf, " ")) != NULL) { - if (!strlen(start)) - continue; - switch (match_token(start, tokens, args)) { - case MEMORY_RECLAIM_SWAPPINESS: - if (match_int(&args[0], &swappiness)) - return -EINVAL; - if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS) - return -EINVAL; - break; - case MEMORY_RECLAIM_SWAPPINESS_MAX: - swappiness = SWAPPINESS_ANON_ONLY; - break; - default: - return -EINVAL; - } - } - - reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; - while (nr_reclaimed < nr_to_reclaim) { - /* Will converge on zero, but reclaim enforces a minimum */ - unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4; - unsigned long reclaimed; - - if (signal_pending(current)) - return -EINTR; - - /* - * This is the final attempt, drain percpu lru caches in the - * hope of introducing more evictable pages for - * try_to_free_mem_cgroup_pages(). - */ - if (!nr_retries) - lru_add_drain_all(); - - reclaimed = try_to_free_mem_cgroup_pages(memcg, - batch_size, GFP_KERNEL, - reclaim_options, - swappiness == -1 ? NULL : &swappiness); - - if (!reclaimed && !nr_retries--) - return -EAGAIN; + int ret; - nr_reclaimed += reclaimed; - } + ret = user_proactive_reclaim(buf, memcg, NULL); + if (ret) + return ret; return nbytes; } diff --git a/mm/memfd.c b/mm/memfd.c index ab367e61553d..bbe679895ef6 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -32,8 +32,7 @@ static bool memfd_folio_has_extra_refs(struct folio *folio) { - return folio_ref_count(folio) - folio_mapcount(folio) != - folio_nr_pages(folio); + return folio_ref_count(folio) != folio_expected_ref_count(folio); } static void memfd_tag_pins(struct xa_state *xas) @@ -71,7 +70,6 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) #ifdef CONFIG_HUGETLB_PAGE struct folio *folio; gfp_t gfp_mask; - int err; if (is_file_hugepages(memfd)) { /* @@ -80,12 +78,19 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) * alloc from. Also, the folio will be pinned for an indefinite * amount of time, so it is not expected to be migrated away. 
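(Illustrative sketch, not part of the patch.) The memory_reclaim() rewrite above delegates argument parsing to the shared user_proactive_reclaim() helper; the cgroup v2 memory.reclaim file keeps accepting a byte count plus an optional swappiness= argument. A sketch that requests proactive reclaim, with a hypothetical cgroup path that must be adjusted to a real, writable cgroup:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Hypothetical cgroup path; substitute an existing cgroup v2 directory. */
#define RECLAIM_FILE "/sys/fs/cgroup/example/memory.reclaim"

int main(void)
{
	const char *req = "64M swappiness=60";
	int fd = open(RECLAIM_FILE, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* One write triggers proactive reclaim of up to 64M from the group. */
	if (write(fd, req, strlen(req)) < 0)
		perror("write");	/* EAGAIN: reclaim target not reached */
	close(fd);
	return 0;
}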
*/ + struct inode *inode = file_inode(memfd); struct hstate *h = hstate_file(memfd); + int err = -ENOMEM; + long nr_resv; gfp_mask = htlb_alloc_mask(h); gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE); idx >>= huge_page_order(h); + nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, 0); + if (nr_resv < 0) + return ERR_PTR(nr_resv); + folio = alloc_hugetlb_folio_reserve(h, numa_node_id(), NULL, @@ -96,12 +101,17 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) idx); if (err) { folio_put(folio); - return ERR_PTR(err); + goto err_unresv; } + + hugetlb_set_folio_subpool(folio, subpool_inode(inode)); folio_unlock(folio); return folio; } - return ERR_PTR(-ENOMEM); +err_unresv: + if (nr_resv > 0) + hugetlb_unreserve_pages(inode, idx, idx + 1, 0); + return ERR_PTR(err); } #endif return shmem_read_folio(memfd->f_mapping, idx); @@ -333,10 +343,10 @@ static inline bool is_write_sealed(unsigned int seals) return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); } -static int check_write_seal(unsigned long *vm_flags_ptr) +static int check_write_seal(vm_flags_t *vm_flags_ptr) { - unsigned long vm_flags = *vm_flags_ptr; - unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE); + vm_flags_t vm_flags = *vm_flags_ptr; + vm_flags_t mask = vm_flags & (VM_SHARED | VM_WRITE); /* If a private mapping then writability is irrelevant. */ if (!(mask & VM_SHARED)) @@ -358,7 +368,7 @@ static int check_write_seal(unsigned long *vm_flags_ptr) return 0; } -int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr) +int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr) { int err = 0; unsigned int *seals_ptr = memfd_file_seals_ptr(file); @@ -401,7 +411,7 @@ static char *alloc_name(const char __user *uname) if (!name) return ERR_PTR(-ENOMEM); - strcpy(name, MFD_NAME_PREFIX); + memcpy(name, MFD_NAME_PREFIX, MFD_NAME_PREFIX_LEN); /* returned length does not include terminating zero */ len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1); if (len < 0) { @@ -475,22 +485,22 @@ SYSCALL_DEFINE2(memfd_create, fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); if (fd < 0) { error = fd; - goto err_name; + goto err_free_name; } file = alloc_file(name, flags); if (IS_ERR(file)) { error = PTR_ERR(file); - goto err_fd; + goto err_free_fd; } fd_install(fd, file); kfree(name); return fd; -err_fd: +err_free_fd: put_unused_fd(fd); -err_name: +err_free_name: kfree(name); return error; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b91a33fb6c69..3047b9ac667e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1388,8 +1388,8 @@ static inline bool HWPoisonHandlable(struct page *page, unsigned long flags) if (PageSlab(page)) return false; - /* Soft offline could migrate non-LRU movable pages */ - if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page)) + /* Soft offline could migrate movable_ops pages */ + if ((flags & MF_SOFT_OFFLINE) && page_has_movable_ops(page)) return true; return PageLRU(page) || is_free_buddy_page(page); @@ -1561,6 +1561,10 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) return ret; } +/* + * The caller must guarantee the folio isn't large folio, except hugetlb. + * try_to_unmap() can't handle it. 
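(Illustrative sketch, not part of the patch.) The check_write_seal()/memfd_check_seals_mmap() changes above only switch the flag type to vm_flags_t; the sealing semantics stay the same: a write-sealed memfd refuses new shared writable mappings, while private copy-on-write mappings remain allowed. A self-contained sketch, assuming glibc 2.27+ for memfd_create():

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = memfd_create("sealed", MFD_CLOEXEC | MFD_ALLOW_SEALING);
	void *p;

	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}
	if (ftruncate(fd, 4096) || fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE)) {
		perror("seal");
		return 1;
	}

	/* Shared writable mapping of a write-sealed memfd is refused. */
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		perror("mmap(MAP_SHARED, PROT_WRITE)");	/* expect EPERM */

	/* A private (CoW) mapping is still fine. */
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		perror("mmap(MAP_PRIVATE)");
	return 0;
}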
+ */ int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill) { enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON; @@ -2503,19 +2507,6 @@ static void memory_failure_work_func(struct work_struct *work) } } -/* - * Process memory_failure work queued on the specified CPU. - * Used to avoid return-to-userspace racing with the memory_failure workqueue. - */ -void memory_failure_queue_kick(int cpu) -{ - struct memory_failure_cpu *mf_cpu; - - mf_cpu = &per_cpu(memory_failure_cpu, cpu); - cancel_work_sync(&mf_cpu->work); - memory_failure_work_func(&mf_cpu->work); -} - static int __init memory_failure_init(void) { struct memory_failure_cpu *mf_cpu; diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index fc14fe53e9b7..0382b6942b8b 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -872,25 +872,18 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self, unsigned long action, void *_arg) { struct memory_tier *memtier; - struct memory_notify *arg = _arg; - - /* - * Only update the node migration order when a node is - * changing status, like online->offline. - */ - if (arg->status_change_nid < 0) - return notifier_from_errno(0); + struct node_notify *nn = _arg; switch (action) { - case MEM_OFFLINE: + case NODE_REMOVED_LAST_MEMORY: mutex_lock(&memory_tier_lock); - if (clear_node_memory_tier(arg->status_change_nid)) + if (clear_node_memory_tier(nn->nid)) establish_demotion_targets(); mutex_unlock(&memory_tier_lock); break; - case MEM_ONLINE: + case NODE_ADDED_FIRST_MEMORY: mutex_lock(&memory_tier_lock); - memtier = set_node_memory_tier(arg->status_change_nid); + memtier = set_node_memory_tier(nn->nid); if (!IS_ERR(memtier)) establish_demotion_targets(); mutex_unlock(&memory_tier_lock); @@ -929,7 +922,7 @@ static int __init memory_tier_init(void) nodes_and(default_dram_nodes, node_states[N_MEMORY], node_states[N_CPU]); - hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI); + hotplug_node_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI); return 0; } subsys_initcall(memory_tier_init); diff --git a/mm/memory.c b/mm/memory.c index 8eba595056fe..0ba4f6b71847 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -57,7 +57,6 @@ #include <linux/export.h> #include <linux/delayacct.h> #include <linux/init.h> -#include <linux/pfn_t.h> #include <linux/writeback.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> @@ -125,6 +124,24 @@ int randomize_va_space __read_mostly = 2; #endif +static const struct ctl_table mmu_sysctl_table[] = { + { + .procname = "randomize_va_space", + .data = &randomize_va_space, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int __init init_mm_sysctl(void) +{ + register_sysctl_init("kernel", mmu_sysctl_table); + return 0; +} + +subsys_initcall(init_mm_sysctl); + #ifndef arch_wants_old_prefaulted_pte static inline bool arch_wants_old_prefaulted_pte(void) { @@ -380,32 +397,26 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, vma_start_write(vma); unlink_anon_vmas(vma); - if (is_vm_hugetlb_page(vma)) { - unlink_file_vma(vma); - hugetlb_free_pgd_range(tlb, addr, vma->vm_end, - floor, next ? 
next->vm_start : ceiling); - } else { - unlink_file_vma_batch_init(&vb); - unlink_file_vma_batch_add(&vb, vma); + unlink_file_vma_batch_init(&vb); + unlink_file_vma_batch_add(&vb, vma); - /* - * Optimization: gather nearby vmas into one call down - */ - while (next && next->vm_start <= vma->vm_end + PMD_SIZE - && !is_vm_hugetlb_page(next)) { - vma = next; - next = mas_find(mas, ceiling - 1); - if (unlikely(xa_is_zero(next))) - next = NULL; - if (mm_wr_locked) - vma_start_write(vma); - unlink_anon_vmas(vma); - unlink_file_vma_batch_add(&vb, vma); - } - unlink_file_vma_batch_final(&vb); - free_pgd_range(tlb, addr, vma->vm_end, - floor, next ? next->vm_start : ceiling); + /* + * Optimization: gather nearby vmas into one call down + */ + while (next && next->vm_start <= vma->vm_end + PMD_SIZE) { + vma = next; + next = mas_find(mas, ceiling - 1); + if (unlikely(xa_is_zero(next))) + next = NULL; + if (mm_wr_locked) + vma_start_write(vma); + unlink_anon_vmas(vma); + unlink_file_vma_batch_add(&vb, vma); } + unlink_file_vma_batch_final(&vb); + + free_pgd_range(tlb, addr, vma->vm_end, + floor, next ? next->vm_start : ceiling); vma = next; } while (vma); } @@ -598,16 +609,6 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return NULL; if (is_zero_pfn(pfn)) return NULL; - if (pte_devmap(pte)) - /* - * NOTE: New users of ZONE_DEVICE will not set pte_devmap() - * and will have refcounts incremented on their struct pages - * when they are inserted into PTEs, thus they are safe to - * return here. Legacy ZONE_DEVICE pages that set pte_devmap() - * do not have refcounts. Example of legacy ZONE_DEVICE is - * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers. - */ - return NULL; print_bad_pte(vma, addr, pte, NULL); return NULL; @@ -685,9 +686,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, } } - if (pmd_devmap(pmd)) - return NULL; - if (is_huge_zero_pmd(pmd)) + if (is_huge_zero_pfn(pfn)) return NULL; if (unlikely(pfn > highest_memmap_pfn)) return NULL; @@ -797,7 +796,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long addr, int *rss) { - unsigned long vm_flags = dst_vma->vm_flags; + vm_flags_t vm_flags = dst_vma->vm_flags; pte_t orig_pte = ptep_get(src_pte); pte_t pte = orig_pte; struct folio *folio; @@ -985,10 +984,9 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, int max_nr, int *rss, struct folio **prealloc) { + fpb_t flags = FPB_MERGE_WRITE; struct page *page; struct folio *folio; - bool any_writable; - fpb_t flags = 0; int err, nr; page = vm_normal_page(src_vma, addr, pte); @@ -1003,13 +1001,12 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * by keeping the batching logic separate. 
*/ if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { - if (src_vma->vm_flags & VM_SHARED) - flags |= FPB_IGNORE_DIRTY; - if (!vma_soft_dirty_enabled(src_vma)) - flags |= FPB_IGNORE_SOFT_DIRTY; + if (!(src_vma->vm_flags & VM_SHARED)) + flags |= FPB_RESPECT_DIRTY; + if (vma_soft_dirty_enabled(src_vma)) + flags |= FPB_RESPECT_SOFT_DIRTY; - nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, - &any_writable, NULL, NULL); + nr = folio_pte_batch_flags(folio, src_vma, src_pte, &pte, max_nr, flags); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, @@ -1023,8 +1020,6 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma folio_dup_file_rmap_ptes(folio, page, nr, dst_vma); rss[mm_counter_file(folio)] += nr; } - if (any_writable) - pte = pte_mkwrite(pte, src_vma); __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, nr); return nr; @@ -1250,8 +1245,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); - if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) - || pmd_devmap(*src_pmd)) { + if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, @@ -1287,7 +1281,7 @@ copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, src_pud = pud_offset(src_p4d, addr); do { next = pud_addr_end(addr, end); - if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { + if (pud_trans_huge(*src_pud)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma); @@ -1549,7 +1543,6 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, struct zap_details *details, int *rss, bool *force_flush, bool *force_break, bool *any_skipped) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm = tlb->mm; struct folio *folio; struct page *page; @@ -1579,9 +1572,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, * by keeping the batching logic separate. 
*/ if (unlikely(folio_test_large(folio) && max_nr != 1)) { - nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, - NULL, NULL, NULL); - + nr = folio_pte_batch(folio, pte, ptent, max_nr); zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush, force_break, any_skipped); @@ -1801,7 +1792,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) __split_huge_pmd(vma, pmd, addr, false); else if (zap_huge_pmd(tlb, vma, pmd, addr)) { @@ -1843,7 +1834,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); - if (pud_trans_huge(*pud) || pud_devmap(*pud)) { + if (pud_trans_huge(*pud)) { if (next - addr != HPAGE_PUD_SIZE) { mmap_assert_locked(tlb->mm); split_huge_pud(vma, pud, addr); @@ -2448,7 +2439,7 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, EXPORT_SYMBOL(vm_map_pages_zero); static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn, pgprot_t prot, bool mkwrite) + unsigned long pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; pte_t *pte, entry; @@ -2470,7 +2461,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, * allocation and mapping invalidation so just skip the * update. */ - if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) { + if (pte_pfn(entry) != pfn) { WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry))); goto out_unlock; } @@ -2483,10 +2474,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, } /* Ok, finally just insert the thing.. 
*/ - if (pfn_t_devmap(pfn)) - entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); - else - entry = pte_mkspecial(pfn_t_pte(pfn, prot)); + entry = pte_mkspecial(pfn_pte(pfn, prot)); if (mkwrite) { entry = pte_mkyoung(entry); @@ -2557,8 +2545,7 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, pfnmap_setup_cachemode_pfn(pfn, &pgprot); - return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, - false); + return insert_pfn(vma, addr, pfn, pgprot, false); } EXPORT_SYMBOL(vmf_insert_pfn_prot); @@ -2589,25 +2576,22 @@ vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vmf_insert_pfn); -static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite) +static bool vm_mixed_ok(struct vm_area_struct *vma, unsigned long pfn, + bool mkwrite) { - if (unlikely(is_zero_pfn(pfn_t_to_pfn(pfn))) && + if (unlikely(is_zero_pfn(pfn)) && (mkwrite || !vm_mixed_zeropage_allowed(vma))) return false; /* these checks mirror the abort conditions in vm_normal_page */ if (vma->vm_flags & VM_MIXEDMAP) return true; - if (pfn_t_devmap(pfn)) - return true; - if (pfn_t_special(pfn)) - return true; - if (is_zero_pfn(pfn_t_to_pfn(pfn))) + if (is_zero_pfn(pfn)) return true; return false; } static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, - unsigned long addr, pfn_t pfn, bool mkwrite) + unsigned long addr, unsigned long pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; int err; @@ -2618,9 +2602,9 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; - pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot); + pfnmap_setup_cachemode_pfn(pfn, &pgprot); - if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) + if (!pfn_modify_allowed(pfn, pgprot)) return VM_FAULT_SIGBUS; /* @@ -2630,8 +2614,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. */ - if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && - !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pfn_valid(pfn)) { struct page *page; /* @@ -2639,7 +2622,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, * regardless of whether the caller specified flags that * result in pfn_t_has_page() == false. */ - page = pfn_to_page(pfn_t_to_pfn(pfn)); + page = pfn_to_page(pfn); err = insert_page(vma, addr, page, pgprot, mkwrite); } else { return insert_pfn(vma, addr, pfn, pgprot, mkwrite); @@ -2674,7 +2657,7 @@ vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page, EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite); vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) + unsigned long pfn) { return __vm_insert_mixed(vma, addr, pfn, false); } @@ -2686,7 +2669,7 @@ EXPORT_SYMBOL(vmf_insert_mixed); * the same entry was actually inserted. 
*/ vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, - unsigned long addr, pfn_t pfn) + unsigned long addr, unsigned long pfn) { return __vm_insert_mixed(vma, addr, pfn, true); } @@ -4315,26 +4298,6 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) -{ - struct swap_info_struct *si = swp_swap_info(entry); - pgoff_t offset = swp_offset(entry); - int i; - - /* - * While allocating a large folio and doing swap_read_folio, which is - * the case the being faulted pte doesn't have swapcache. We need to - * ensure all PTEs have no cache as well, otherwise, we might go to - * swap devices while the content is in swapcache. - */ - for (i = 0; i < max_nr; i++) { - if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) - return i; - } - - return i; -} - /* * Check if the PTEs within a range are contiguous swap entries * and have consistent swapcache, zeromap. @@ -5425,10 +5388,10 @@ fallback: /* * Using per-page fault to maintain the uffd semantics, and same - * approach also applies to non-anonymous-shmem faults to avoid + * approach also applies to non shmem/tmpfs faults to avoid * inflating the RSS of the process. */ - if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) || + if (!vma_is_shmem(vma) || unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) { nr_pages = 1; } else if (nr_pages > 1) { @@ -6148,7 +6111,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; - unsigned long vm_flags = vma->vm_flags; + vm_flags_t vm_flags = vma->vm_flags; pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; @@ -6172,7 +6135,7 @@ retry_pud: pud_t orig_pud = *vmf.pud; barrier(); - if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { + if (pud_trans_huge(orig_pud)) { /* * TODO once we support anonymous PUDs: NUMA case and @@ -6213,7 +6176,7 @@ retry_pud: pmd_migration_entry_wait(mm, vmf.pmd); return 0; } - if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) { + if (pmd_trans_huge(vmf.orig_pmd)) { if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf); @@ -6740,6 +6703,7 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, while (len) { int bytes, offset; void *maddr; + struct folio *folio; struct vm_area_struct *vma = NULL; struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); @@ -6771,21 +6735,22 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, if (bytes <= 0) break; } else { + folio = page_folio(page); bytes = len; offset = addr & (PAGE_SIZE-1); if (bytes > PAGE_SIZE-offset) bytes = PAGE_SIZE-offset; - maddr = kmap_local_page(page); + maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE); if (write) { copy_to_user_page(vma, page, addr, maddr + offset, buf, bytes); - set_page_dirty_lock(page); + folio_mark_dirty_lock(folio); } else { copy_from_user_page(vma, page, addr, buf, maddr + offset, bytes); } - unmap_and_put_page(page, maddr); + folio_release_kmap(folio, maddr); } len -= bytes; buf += bytes; @@ -6864,6 +6829,7 @@ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, while (len) { int bytes, offset, retval; void *maddr; + struct folio *folio; struct page *page; struct vm_area_struct *vma = NULL; @@ -6879,17 +6845,18 @@ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, goto out; } + folio = 
page_folio(page); bytes = len; offset = addr & (PAGE_SIZE - 1); if (bytes > PAGE_SIZE - offset) bytes = PAGE_SIZE - offset; - maddr = kmap_local_page(page); + maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE); retval = strscpy(buf, maddr + offset, bytes); if (retval >= 0) { /* Found the end of the string */ buf += retval; - unmap_and_put_page(page, maddr); + folio_release_kmap(folio, maddr); break; } @@ -6907,7 +6874,7 @@ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, } len -= bytes; - unmap_and_put_page(page, maddr); + folio_release_kmap(folio, maddr); } out: diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b1caedbade5b..1f15af712bc3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -35,6 +35,7 @@ #include <linux/compaction.h> #include <linux/rmap.h> #include <linux/module.h> +#include <linux/node.h> #include <asm/tlbflush.h> @@ -699,30 +700,6 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) online_mem_sections(start_pfn, end_pfn); } -/* check which state of node_states will be changed when online memory */ -static void node_states_check_changes_online(unsigned long nr_pages, - struct zone *zone, struct memory_notify *arg) -{ - int nid = zone_to_nid(zone); - - arg->status_change_nid = NUMA_NO_NODE; - arg->status_change_nid_normal = NUMA_NO_NODE; - - if (!node_state(nid, N_MEMORY)) - arg->status_change_nid = nid; - if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) - arg->status_change_nid_normal = nid; -} - -static void node_states_set_node(int node, struct memory_notify *arg) -{ - if (arg->status_change_nid_normal >= 0) - node_set_state(node, N_NORMAL_MEMORY); - - if (arg->status_change_nid >= 0) - node_set_state(node, N_MEMORY); -} - static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { @@ -770,7 +747,8 @@ static inline void section_taint_zone_device(unsigned long pfn) */ void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, int migratetype) + struct vmem_altmap *altmap, int migratetype, + bool isolate_pageblock) { struct pglist_data *pgdat = zone->zone_pgdat; int nid = pgdat->node_id; @@ -797,12 +775,13 @@ void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, /* * TODO now we have a visible range of pages which are not associated - * with their zone properly. Not nice but set_pfnblock_flags_mask + * with their zone properly. Not nice but set_pfnblock_migratetype() * expects the zone spans the pfn range. 
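(Illustrative sketch, not part of the patch.) The __access_remote_vm()/__copy_remote_vm_str() hunks earlier in this file switch from kmap_local_page() to kmap_local_folio() without changing behavior; this is the path behind /proc/<pid>/mem and ptrace-style remote access. A small self-test that reads a local buffer back through /proc/self/mem:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char secret[] = "hello from access_remote_vm";
	char out[sizeof(secret)] = { 0 };
	int fd = open("/proc/self/mem", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Read the buffer back through the remote-VM access path. */
	if (pread(fd, out, sizeof(out), (off_t)(uintptr_t)secret) < 0)
		perror("pread");
	printf("read back: %s\n", out);
	close(fd);
	return 0;
}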
All the pages in the range * are reserved so nobody should be touching them so we should be safe */ memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0, - MEMINIT_HOTPLUG, altmap, migratetype); + MEMINIT_HOTPLUG, altmap, migratetype, + isolate_pageblock); set_zone_contiguous(zone); } @@ -1127,7 +1106,8 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, if (mhp_off_inaccessible) page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages); - move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE, + false); for (i = 0; i < nr_pages; i++) { struct page *page = pfn_to_page(pfn + i); @@ -1173,11 +1153,17 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group) { - unsigned long flags; - int need_zonelists_rebuild = 0; + struct memory_notify mem_arg = { + .start_pfn = pfn, + .nr_pages = nr_pages, + }; + struct node_notify node_arg = { + .nid = NUMA_NO_NODE, + }; const int nid = zone_to_nid(zone); + int need_zonelists_rebuild = 0; + unsigned long flags; int ret; - struct memory_notify arg; /* * {on,off}lining is constrained to full memory sections (or more @@ -1192,13 +1178,19 @@ int online_pages(unsigned long pfn, unsigned long nr_pages, /* associate pfn range with the zone */ - move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); - - arg.start_pfn = pfn; - arg.nr_pages = nr_pages; - node_states_check_changes_online(nr_pages, zone, &arg); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE, + true); + + if (!node_state(nid, N_MEMORY)) { + /* Adding memory to the node for the first time */ + node_arg.nid = nid; + ret = node_notify(NODE_ADDING_FIRST_MEMORY, &node_arg); + ret = notifier_to_errno(ret); + if (ret) + goto failed_addition; + } - ret = memory_notify(MEM_GOING_ONLINE, &arg); + ret = memory_notify(MEM_GOING_ONLINE, &mem_arg); ret = notifier_to_errno(ret); if (ret) goto failed_addition; @@ -1224,12 +1216,13 @@ int online_pages(unsigned long pfn, unsigned long nr_pages, online_pages_range(pfn, nr_pages); adjust_present_page_count(pfn_to_page(pfn), group, nr_pages); - node_states_set_node(nid, &arg); + if (node_arg.nid >= 0) + node_set_state(nid, N_MEMORY); if (need_zonelists_rebuild) build_all_zonelists(NULL); /* Basic onlining is complete, allow allocation of onlined pages. */ - undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE); + undo_isolate_page_range(pfn, pfn + nr_pages); /* * Freshly onlined pages aren't shuffled (e.g., all pages are placed to @@ -1245,16 +1238,22 @@ int online_pages(unsigned long pfn, unsigned long nr_pages, kswapd_run(nid); kcompactd_run(nid); + if (node_arg.nid >= 0) + /* First memory added successfully. Notify consumers. 
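(Illustrative sketch, not part of the patch.) online_pages()/offline_pages() in the surrounding hunks now raise dedicated node notifier events (NODE_ADDING/ADDED_FIRST_MEMORY, NODE_REMOVING/REMOVED_LAST_MEMORY) instead of overloading the memory notifier's status_change_nid fields; the userspace hotplug knobs are unchanged. A sketch toggling one memory block via sysfs (root required, block number hypothetical):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Hypothetical block; pick one listed under /sys/devices/system/memory/. */
#define BLOCK_STATE "/sys/devices/system/memory/memory32/state"

static int write_state(const char *state)
{
	int fd = open(BLOCK_STATE, O_WRONLY);
	int ret = 0;

	if (fd < 0)
		return -1;
	if (write(fd, state, strlen(state)) < 0)
		ret = -1;
	close(fd);
	return ret;
}

int main(void)
{
	/* Offlining runs offline_pages(); taking the last block off a node
	 * additionally raises NODE_REMOVING/REMOVED_LAST_MEMORY. */
	if (write_state("offline"))
		perror("offline");
	if (write_state("online"))
		perror("online");
	return 0;
}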
*/ + node_notify(NODE_ADDED_FIRST_MEMORY, &node_arg); + writeback_set_ratelimit(); - memory_notify(MEM_ONLINE, &arg); + memory_notify(MEM_ONLINE, &mem_arg); return 0; failed_addition: pr_debug("online_pages [mem %#010llx-%#010llx] failed\n", (unsigned long long) pfn << PAGE_SHIFT, (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); - memory_notify(MEM_CANCEL_ONLINE, &arg); + memory_notify(MEM_CANCEL_ONLINE, &mem_arg); + if (node_arg.nid != NUMA_NO_NODE) + node_notify(NODE_CANCEL_ADDING_FIRST_MEMORY, &node_arg); remove_pfn_range_from_zone(zone, pfn, nr_pages); return ret; } @@ -1571,13 +1570,12 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) * We online node here. We can't roll back from here. */ node_set_online(nid); - ret = __register_one_node(nid); + ret = register_one_node(nid); BUG_ON(ret); } - register_memory_blocks_under_node(nid, PFN_DOWN(start), - PFN_UP(start + size - 1), - MEMINIT_HOTPLUG); + register_memory_blocks_under_node_hotplug(nid, PFN_DOWN(start), + PFN_UP(start + size - 1)); /* create new memmap entry */ if (!strcmp(res->name, "System RAM")) @@ -1741,8 +1739,8 @@ bool mhp_range_allowed(u64 start, u64 size, bool need_mapping) #ifdef CONFIG_MEMORY_HOTREMOVE /* - * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, - * non-lru movable pages and hugepages). Will skip over most unmovable + * Scan pfn range [start,end) to find movable/migratable pages (LRU and + * hugetlb folio, movable_ops pages). Will skip over most unmovable * pages (esp., pages that can be skipped when offlining), but bail out on * definitely unmovable pages. * @@ -1761,13 +1759,11 @@ static int scan_movable_pages(unsigned long start, unsigned long end, struct folio *folio; page = pfn_to_page(pfn); - if (PageLRU(page)) - goto found; - if (__PageMovable(page)) + if (PageLRU(page) || page_has_movable_ops(page)) goto found; /* - * PageOffline() pages that are not marked __PageMovable() and + * PageOffline() pages that do not have movable_ops and * have a reference count > 0 (after MEM_GOING_OFFLINE) are * definitely unmovable. If their reference count would be 0, * they could at least be skipped when offlining memory. @@ -1886,54 +1882,6 @@ static int __init cmdline_parse_movable_node(char *p) } early_param("movable_node", cmdline_parse_movable_node); -/* check which state of node_states will be changed when offline memory */ -static void node_states_check_changes_offline(unsigned long nr_pages, - struct zone *zone, struct memory_notify *arg) -{ - struct pglist_data *pgdat = zone->zone_pgdat; - unsigned long present_pages = 0; - enum zone_type zt; - - arg->status_change_nid = NUMA_NO_NODE; - arg->status_change_nid_normal = NUMA_NO_NODE; - - /* - * Check whether node_states[N_NORMAL_MEMORY] will be changed. - * If the memory to be offline is within the range - * [0..ZONE_NORMAL], and it is the last present memory there, - * the zones in that range will become empty after the offlining, - * thus we can determine that we need to clear the node from - * node_states[N_NORMAL_MEMORY]. - */ - for (zt = 0; zt <= ZONE_NORMAL; zt++) - present_pages += pgdat->node_zones[zt].present_pages; - if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages) - arg->status_change_nid_normal = zone_to_nid(zone); - - /* - * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM - * does not apply as we don't support 32bit. - * Here we count the possible pages from ZONE_MOVABLE. 
- * If after having accounted all the pages, we see that the nr_pages - * to be offlined is over or equal to the accounted pages, - * we know that the node will become empty, and so, we can clear - * it for N_MEMORY as well. - */ - present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages; - - if (nr_pages >= present_pages) - arg->status_change_nid = zone_to_nid(zone); -} - -static void node_states_clear_node(int node, struct memory_notify *arg) -{ - if (arg->status_change_nid_normal >= 0) - node_clear_state(node, N_NORMAL_MEMORY); - - if (arg->status_change_nid >= 0) - node_clear_state(node, N_MEMORY); -} - static int count_system_ram_pages_cb(unsigned long start_pfn, unsigned long nr_pages, void *data) { @@ -1949,11 +1897,18 @@ static int count_system_ram_pages_cb(unsigned long start_pfn, int offline_pages(unsigned long start_pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group) { - const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn, managed_pages, system_ram_pages = 0; + const unsigned long end_pfn = start_pfn + nr_pages; + struct pglist_data *pgdat = zone->zone_pgdat; const int node = zone_to_nid(zone); + struct memory_notify mem_arg = { + .start_pfn = start_pfn, + .nr_pages = nr_pages, + }; + struct node_notify node_arg = { + .nid = NUMA_NO_NODE, + }; unsigned long flags; - struct memory_notify arg; char *reason; int ret; @@ -2005,18 +1960,28 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages, /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, - MIGRATE_MOVABLE, - MEMORY_OFFLINE | REPORT_FAILURE); + PB_ISOLATE_MODE_MEM_OFFLINE); if (ret) { reason = "failure to isolate range"; goto failed_removal_pcplists_disabled; } - arg.start_pfn = start_pfn; - arg.nr_pages = nr_pages; - node_states_check_changes_offline(nr_pages, zone, &arg); + /* + * Check whether the node will have no present pages after we offline + * 'nr_pages' more. If so, we know that the node will become empty, and + * so we will clear N_MEMORY for it. + */ + if (nr_pages >= pgdat->node_present_pages) { + node_arg.nid = node; + ret = node_notify(NODE_REMOVING_LAST_MEMORY, &node_arg); + ret = notifier_to_errno(ret); + if (ret) { + reason = "node notifier failure"; + goto failed_removal_isolated; + } + } - ret = memory_notify(MEM_GOING_OFFLINE, &arg); + ret = memory_notify(MEM_GOING_OFFLINE, &mem_arg); ret = notifier_to_errno(ret); if (ret) { reason = "notifier failure"; @@ -2065,7 +2030,8 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages, goto failed_removal_isolated; } - ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE); + ret = test_pages_isolated(start_pfn, end_pfn, + PB_ISOLATE_MODE_MEM_OFFLINE); } while (ret); @@ -2096,27 +2062,32 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages, * Make sure to mark the node as memory-less before rebuilding the zone * list. Otherwise this node would still appear in the fallback lists. */ - node_states_clear_node(node, &arg); + if (node_arg.nid >= 0) + node_clear_state(node, N_MEMORY); if (!populated_zone(zone)) { zone_pcp_reset(zone); build_all_zonelists(NULL); } - if (arg.status_change_nid >= 0) { + if (node_arg.nid >= 0) { kcompactd_stop(node); kswapd_stop(node); + /* Node went memoryless. 
Notify consumers */ + node_notify(NODE_REMOVED_LAST_MEMORY, &node_arg); } writeback_set_ratelimit(); - memory_notify(MEM_OFFLINE, &arg); + memory_notify(MEM_OFFLINE, &mem_arg); remove_pfn_range_from_zone(zone, start_pfn, nr_pages); return 0; failed_removal_isolated: /* pushback to free area */ - undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); - memory_notify(MEM_CANCEL_OFFLINE, &arg); + undo_isolate_page_range(start_pfn, end_pfn); + memory_notify(MEM_CANCEL_OFFLINE, &mem_arg); + if (node_arg.nid != NUMA_NO_NODE) + node_notify(NODE_CANCEL_REMOVING_LAST_MEMORY, &node_arg); failed_removal_pcplists_disabled: lru_cache_enable(); zone_pcp_enable(zone); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3b1dfd08338b..eb83cff7db8c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -675,7 +675,6 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct vm_area_struct *vma = walk->vma; struct folio *folio; struct queue_pages *qp = walk->private; @@ -712,9 +711,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, if (!folio || folio_is_zone_device(folio)) continue; if (folio_test_large(folio) && max_nr != 1) - nr = folio_pte_batch(folio, addr, pte, ptent, - max_nr, fpb_flags, - NULL, NULL, NULL); + nr = folio_pte_batch(folio, pte, ptent, max_nr); /* * vm_normal_folio() filters out zero pages, but there might * still be reserved folios to skip, perhaps in a VDSO. @@ -3703,18 +3700,15 @@ static void wi_state_free(void) struct weighted_interleave_state *old_wi_state; mutex_lock(&wi_state_lock); - old_wi_state = rcu_dereference_protected(wi_state, lockdep_is_held(&wi_state_lock)); - if (!old_wi_state) { - mutex_unlock(&wi_state_lock); - return; - } - rcu_assign_pointer(wi_state, NULL); mutex_unlock(&wi_state_lock); - synchronize_rcu(); - kfree(old_wi_state); + + if (old_wi_state) { + synchronize_rcu(); + kfree(old_wi_state); + } } static struct kobj_attribute wi_auto_attr = @@ -3791,20 +3785,17 @@ static int wi_node_notifier(struct notifier_block *nb, unsigned long action, void *data) { int err; - struct memory_notify *arg = data; - int nid = arg->status_change_nid; - - if (nid < 0) - return NOTIFY_OK; + struct node_notify *nn = data; + int nid = nn->nid; switch (action) { - case MEM_ONLINE: + case NODE_ADDED_FIRST_MEMORY: err = sysfs_wi_node_add(nid); if (err) pr_err("failed to add sysfs for node%d during hotplug: %d\n", nid, err); break; - case MEM_OFFLINE: + case NODE_REMOVED_LAST_MEMORY: sysfs_wi_node_delete(nid); break; } @@ -3843,7 +3834,7 @@ static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj) } } - hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI); + hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI); return 0; err_cleanup_kobj: diff --git a/mm/mempool.c b/mm/mempool.c index 3223337135d0..204a216b6418 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -540,11 +540,43 @@ void mempool_free(void *element, mempool_t *pool) if (likely(pool->curr_nr < pool->min_nr)) { add_element(pool, element); spin_unlock_irqrestore(&pool->lock, flags); - wake_up(&pool->wait); + if (wq_has_sleeper(&pool->wait)) + wake_up(&pool->wait); return; } spin_unlock_irqrestore(&pool->lock, flags); } + + /* + * Handle the min_nr = 0 edge case: + * + * For zero-minimum pools, curr_nr < min_nr (0 < 0) never succeeds, + * so waiters sleeping on pool->wait would 
never be woken by the + * wake-up path of previous test. This explicit check ensures the + * allocation of element when both min_nr and curr_nr are 0, and + * any active waiters are properly awakened. + * + * Inline the same logic as previous test, add_element() cannot be + * directly used here since it has BUG_ON to deny if min_nr equals + * curr_nr, so here picked rest of add_element() to use without + * BUG_ON check. + */ + if (unlikely(pool->min_nr == 0 && + READ_ONCE(pool->curr_nr) == 0)) { + spin_lock_irqsave(&pool->lock, flags); + if (likely(pool->curr_nr == 0)) { + /* Inline the logic of add_element() */ + poison_element(pool, element); + if (kasan_poison_element(pool, element)) + pool->elements[pool->curr_nr++] = element; + spin_unlock_irqrestore(&pool->lock, flags); + if (wq_has_sleeper(&pool->wait)) + wake_up(&pool->wait); + return; + } + spin_unlock_irqrestore(&pool->lock, flags); + } + pool->free(element, pool->pool_data); } EXPORT_SYMBOL(mempool_free); diff --git a/mm/memremap.c b/mm/memremap.c index c417c843e9b1..b0ce0d8254bd 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -5,7 +5,6 @@ #include <linux/kasan.h> #include <linux/memory_hotplug.h> #include <linux/memremap.h> -#include <linux/pfn_t.h> #include <linux/swap.h> #include <linux/mm.h> #include <linux/mmzone.h> @@ -39,30 +38,6 @@ unsigned long memremap_compat_align(void) EXPORT_SYMBOL_GPL(memremap_compat_align); #endif -#ifdef CONFIG_FS_DAX -DEFINE_STATIC_KEY_FALSE(devmap_managed_key); -EXPORT_SYMBOL(devmap_managed_key); - -static void devmap_managed_enable_put(struct dev_pagemap *pgmap) -{ - if (pgmap->type == MEMORY_DEVICE_FS_DAX) - static_branch_dec(&devmap_managed_key); -} - -static void devmap_managed_enable_get(struct dev_pagemap *pgmap) -{ - if (pgmap->type == MEMORY_DEVICE_FS_DAX) - static_branch_inc(&devmap_managed_key); -} -#else -static void devmap_managed_enable_get(struct dev_pagemap *pgmap) -{ -} -static void devmap_managed_enable_put(struct dev_pagemap *pgmap) -{ -} -#endif /* CONFIG_FS_DAX */ - static void pgmap_array_delete(struct range *range) { xa_store_range(&pgmap_array, PHYS_PFN(range->start), PHYS_PFN(range->end), @@ -151,7 +126,6 @@ void memunmap_pages(struct dev_pagemap *pgmap) percpu_ref_exit(&pgmap->ref); WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n"); - devmap_managed_enable_put(pgmap); } EXPORT_SYMBOL_GPL(memunmap_pages); @@ -254,7 +228,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; move_pfn_range_to_zone(zone, PHYS_PFN(range->start), PHYS_PFN(range_len(range)), params->altmap, - MIGRATE_MOVABLE); + MIGRATE_MOVABLE, false); } mem_hotplug_done(); @@ -332,10 +306,6 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) } break; case MEMORY_DEVICE_FS_DAX: - if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { - WARN(1, "File system DAX not supported\n"); - return ERR_PTR(-EINVAL); - } params.pgprot = pgprot_decrypted(params.pgprot); break; case MEMORY_DEVICE_GENERIC: @@ -354,8 +324,6 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) if (error) return ERR_PTR(error); - devmap_managed_enable_get(pgmap); - /* * Clear the pgmap nr_range as it will be incremented for each * successfully processed range. 
This communicates how many diff --git a/mm/migrate.c b/mm/migrate.c index 8cf0f9c9599d..425401b2d4e1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -35,7 +35,6 @@ #include <linux/compat.h> #include <linux/hugetlb.h> #include <linux/gfp.h> -#include <linux/pfn_t.h> #include <linux/page_idle.h> #include <linux/page_owner.h> #include <linux/sched/mm.h> @@ -44,6 +43,8 @@ #include <linux/sched/sysctl.h> #include <linux/memory-tiers.h> #include <linux/pagewalk.h> +#include <linux/balloon_compaction.h> +#include <linux/zsmalloc.h> #include <asm/tlbflush.h> @@ -52,8 +53,47 @@ #include "internal.h" #include "swap.h" -bool isolate_movable_page(struct page *page, isolate_mode_t mode) +static const struct movable_operations *page_movable_ops(struct page *page) { + VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page); + + /* + * If we enable page migration for a page of a certain type by marking + * it as movable, the page type must be sticky until the page gets freed + * back to the buddy. + */ +#ifdef CONFIG_BALLOON_COMPACTION + if (PageOffline(page)) + /* Only balloon compaction sets PageOffline pages movable. */ + return &balloon_mops; +#endif /* CONFIG_BALLOON_COMPACTION */ +#if defined(CONFIG_ZSMALLOC) && defined(CONFIG_COMPACTION) + if (PageZsmalloc(page)) + return &zsmalloc_mops; +#endif /* defined(CONFIG_ZSMALLOC) && defined(CONFIG_COMPACTION) */ + return NULL; +} + +/** + * isolate_movable_ops_page - isolate a movable_ops page for migration + * @page: The page. + * @mode: The isolation mode. + * + * Try to isolate a movable_ops page for migration. Will fail if the page is + * not a movable_ops page, if the page is already isolated for migration + * or if the page was just was released by its owner. + * + * Once isolated, the page cannot get freed until it is either putback + * or migrated. + * + * Returns true if isolation succeeded, otherwise false. + */ +bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode) +{ + /* + * TODO: these pages will not be folios in the future. All + * folio dependencies will have to be removed. + */ struct folio *folio = folio_get_nontail_page(page); const struct movable_operations *mops; @@ -70,11 +110,14 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) goto out; /* - * Check movable flag before taking the page lock because + * Check for movable_ops pages before taking the page lock because * we use non-atomic bitops on newly allocated page flags so * unconditionally grabbing the lock ruins page's owner side. + * + * Note that once a page has movable_ops, it will stay that way + * until the page was freed. 
*/ - if (unlikely(!__folio_test_movable(folio))) + if (unlikely(!page_has_movable_ops(page))) goto out_putfolio; /* @@ -91,18 +134,20 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) if (unlikely(!folio_trylock(folio))) goto out_putfolio; - if (!folio_test_movable(folio) || folio_test_isolated(folio)) + VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page); + if (PageMovableOpsIsolated(page)) goto out_no_isolated; - mops = folio_movable_ops(folio); - VM_BUG_ON_FOLIO(!mops, folio); + mops = page_movable_ops(page); + if (WARN_ON_ONCE(!mops)) + goto out_no_isolated; - if (!mops->isolate_page(&folio->page, mode)) + if (!mops->isolate_page(page, mode)) goto out_no_isolated; /* Driver shouldn't use the isolated flag */ - WARN_ON_ONCE(folio_test_isolated(folio)); - folio_set_isolated(folio); + VM_WARN_ON_ONCE_PAGE(PageMovableOpsIsolated(page), page); + SetPageMovableOpsIsolated(page); folio_unlock(folio); return true; @@ -115,12 +160,69 @@ out: return false; } -static void putback_movable_folio(struct folio *folio) +/** + * putback_movable_ops_page - putback an isolated movable_ops page + * @page: The isolated page. + * + * Putback an isolated movable_ops page. + * + * After the page was putback, it might get freed instantly. + */ +static void putback_movable_ops_page(struct page *page) { - const struct movable_operations *mops = folio_movable_ops(folio); + /* + * TODO: these pages will not be folios in the future. All + * folio dependencies will have to be removed. + */ + struct folio *folio = page_folio(page); - mops->putback_page(&folio->page); - folio_clear_isolated(folio); + VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page); + VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(page), page); + folio_lock(folio); + page_movable_ops(page)->putback_page(page); + ClearPageMovableOpsIsolated(page); + folio_unlock(folio); + folio_put(folio); +} + +/** + * migrate_movable_ops_page - migrate an isolated movable_ops page + * @dst: The destination page. + * @src: The source page. + * @mode: The migration mode. + * + * Migrate an isolated movable_ops page. + * + * If the src page was already released by its owner, the src page is + * un-isolated (putback) and migration succeeds; the migration core will be the + * owner of both pages. + * + * If the src page was not released by its owner and the migration was + * successful, the owner of the src page and the dst page are swapped and + * the src page is un-isolated. + * + * If migration fails, the ownership stays unmodified and the src page + * remains isolated: migration may be retried later or the page can be putback. + * + * TODO: migration core will treat both pages as folios and lock them before + * this call to unlock them after this call. Further, the folio refcounts on + * src and dst are also released by migration core. These pages will not be + * folios in the future, so that must be reworked. + * + * Returns MIGRATEPAGE_SUCCESS on success, otherwise a negative error + * code. 
+ */ +static int migrate_movable_ops_page(struct page *dst, struct page *src, + enum migrate_mode mode) +{ + int rc = MIGRATEPAGE_SUCCESS; + + VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(src), src); + VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(src), src); + rc = page_movable_ops(src)->migrate_page(dst, src, mode); + if (rc == MIGRATEPAGE_SUCCESS) + ClearPageMovableOpsIsolated(src); + return rc; } /* @@ -142,20 +244,8 @@ void putback_movable_pages(struct list_head *l) continue; } list_del(&folio->lru); - /* - * We isolated non-lru movable folio so here we can use - * __folio_test_movable because LRU folio's mapping cannot - * have PAGE_MAPPING_MOVABLE. - */ - if (unlikely(__folio_test_movable(folio))) { - VM_BUG_ON_FOLIO(!folio_test_isolated(folio), folio); - folio_lock(folio); - if (folio_test_movable(folio)) - putback_movable_folio(folio); - else - folio_clear_isolated(folio); - folio_unlock(folio); - folio_put(folio); + if (unlikely(page_has_movable_ops(&folio->page))) { + putback_movable_ops_page(&folio->page); } else { node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), -folio_nr_pages(folio)); @@ -167,26 +257,20 @@ void putback_movable_pages(struct list_head *l) /* Must be called with an elevated refcount on the non-hugetlb folio */ bool isolate_folio_to_list(struct folio *folio, struct list_head *list) { - bool isolated, lru; - if (folio_test_hugetlb(folio)) return folio_isolate_hugetlb(folio, list); - lru = !__folio_test_movable(folio); - if (lru) - isolated = folio_isolate_lru(folio); - else - isolated = isolate_movable_page(&folio->page, - ISOLATE_UNEVICTABLE); - - if (!isolated) - return false; - - list_add(&folio->lru, list); - if (lru) + if (page_has_movable_ops(&folio->page)) { + if (!isolate_movable_ops_page(&folio->page, + ISOLATE_UNEVICTABLE)) + return false; + } else { + if (!folio_isolate_lru(folio)) + return false; node_stat_add_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio)); - + } + list_add(&folio->lru, list); return true; } @@ -956,11 +1040,12 @@ static int fallback_migrate_folio(struct address_space *mapping, } /* - * Move a page to a newly allocated page - * The page is locked and all ptes have been successfully removed. + * Move a src folio to a newly allocated dst folio. + * + * The src and dst folios are locked and the src folios was unmapped from + * the page tables. * - * The new page will have replaced the old page if this function - * is successful. + * On success, the src folio was replaced by the dst folio. * * Return value: * < 0 - error code @@ -969,78 +1054,40 @@ static int fallback_migrate_folio(struct address_space *mapping, static int move_to_new_folio(struct folio *dst, struct folio *src, enum migrate_mode mode) { + struct address_space *mapping = folio_mapping(src); int rc = -EAGAIN; - bool is_lru = !__folio_test_movable(src); VM_BUG_ON_FOLIO(!folio_test_locked(src), src); VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst); - if (likely(is_lru)) { - struct address_space *mapping = folio_mapping(src); - - if (!mapping) - rc = migrate_folio(mapping, dst, src, mode); - else if (mapping_inaccessible(mapping)) - rc = -EOPNOTSUPP; - else if (mapping->a_ops->migrate_folio) - /* - * Most folios have a mapping and most filesystems - * provide a migrate_folio callback. Anonymous folios - * are part of swap space which also has its own - * migrate_folio callback. This is the most common path - * for page migration. 
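 *
 * For reference (sketch, not part of this patch): filesystems typically
 * wire the generic helper into their address_space_operations, e.g.:
 *
 *	static const struct address_space_operations foo_aops = {
 *		...
 *		.migrate_folio	= filemap_migrate_folio,
 *	};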
- */ - rc = mapping->a_ops->migrate_folio(mapping, dst, src, - mode); - else - rc = fallback_migrate_folio(mapping, dst, src, mode); - } else { - const struct movable_operations *mops; - + if (!mapping) + rc = migrate_folio(mapping, dst, src, mode); + else if (mapping_inaccessible(mapping)) + rc = -EOPNOTSUPP; + else if (mapping->a_ops->migrate_folio) /* - * In case of non-lru page, it could be released after - * isolation step. In that case, we shouldn't try migration. + * Most folios have a mapping and most filesystems + * provide a migrate_folio callback. Anonymous folios + * are part of swap space which also has its own + * migrate_folio callback. This is the most common path + * for page migration. */ - VM_BUG_ON_FOLIO(!folio_test_isolated(src), src); - if (!folio_test_movable(src)) { - rc = MIGRATEPAGE_SUCCESS; - folio_clear_isolated(src); - goto out; - } - - mops = folio_movable_ops(src); - rc = mops->migrate_page(&dst->page, &src->page, mode); - WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && - !folio_test_isolated(src)); - } + rc = mapping->a_ops->migrate_folio(mapping, dst, src, + mode); + else + rc = fallback_migrate_folio(mapping, dst, src, mode); - /* - * When successful, old pagecache src->mapping must be cleared before - * src is freed; but stats require that PageAnon be left as PageAnon. - */ if (rc == MIGRATEPAGE_SUCCESS) { - if (__folio_test_movable(src)) { - VM_BUG_ON_FOLIO(!folio_test_isolated(src), src); - - /* - * We clear PG_movable under page_lock so any compactor - * cannot try to migrate this page. - */ - folio_clear_isolated(src); - } - /* - * Anonymous and movable src->mapping will be cleared by - * free_pages_prepare so don't reset it here for keeping - * the type to work PageAnon, for example. + * For pagecache folios, src->mapping must be cleared before src + * is freed. Anonymous folios must stay anonymous until freed. */ - if (!folio_mapping_flags(src)) + if (!folio_test_anon(src)) src->mapping = NULL; if (likely(!folio_is_zone_device(dst))) flush_dcache_folio(dst); } -out: return rc; } @@ -1107,12 +1154,7 @@ static void migrate_folio_undo_dst(struct folio *dst, bool locked, static void migrate_folio_done(struct folio *src, enum migrate_reason reason) { - /* - * Compaction can migrate also non-LRU pages which are - * not accounted to NR_ISOLATED_*. 
They can be recognized - * as __folio_test_movable - */ - if (likely(!__folio_test_movable(src)) && reason != MR_DEMOTION) + if (likely(!page_has_movable_ops(&src->page)) && reason != MR_DEMOTION) mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + folio_is_file_lru(src), -folio_nr_pages(src)); @@ -1131,7 +1173,6 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, int rc = -EAGAIN; int old_page_state = 0; struct anon_vma *anon_vma = NULL; - bool is_lru = data_race(!__folio_test_movable(src)); bool locked = false; bool dst_locked = false; @@ -1232,7 +1273,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, goto out; dst_locked = true; - if (unlikely(!is_lru)) { + if (unlikely(page_has_movable_ops(&src->page))) { __migrate_folio_record(dst, old_page_state, anon_vma); return MIGRATEPAGE_UNMAP; } @@ -1291,20 +1332,23 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, int rc; int old_page_state = 0; struct anon_vma *anon_vma = NULL; - bool is_lru = !__folio_test_movable(src); struct list_head *prev; __migrate_folio_extract(dst, &old_page_state, &anon_vma); prev = dst->lru.prev; list_del(&dst->lru); + if (unlikely(page_has_movable_ops(&src->page))) { + rc = migrate_movable_ops_page(&dst->page, &src->page, mode); + if (rc) + goto out; + goto out_unlock_both; + } + rc = move_to_new_folio(dst, src, mode); if (rc) goto out; - if (unlikely(!is_lru)) - goto out_unlock_both; - /* * When successful, push dst to LRU immediately: so that if it * turns out to be an mlocked page, remove_migration_ptes() will @@ -1323,7 +1367,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, out_unlock_both: folio_unlock(dst); - set_page_owner_migrate_reason(&dst->page, reason); + folio_set_owner_migrate_reason(dst, reason); /* * If migration is successful, decrease refcount of dst, * which will not free the page because new page owner increased @@ -2319,13 +2363,6 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, } /* - * The move_pages() man page does not have an -EEXIST choice, so - * use -EFAULT instead. - */ - if (err == -EEXIST) - err = -EFAULT; - - /* * If the page is already on the target node (!err), store the * node, otherwise, store the err. 
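 *
 * Seen from userspace, this is what ends up in the status array, roughly
 * (illustrative):
 *
 *	int status[1];
 *	long rc = move_pages(pid, 1, pages, nodes, status, MPOL_MF_MOVE);
 *
 * on return, status[0] holds either the node the page now resides on or a
 * negative errno specific to that page.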
*/ @@ -2399,6 +2436,7 @@ set_status: static int get_compat_pages_array(const void __user *chunk_pages[], const void __user * __user *pages, + unsigned long chunk_offset, unsigned long chunk_nr) { compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages; @@ -2406,7 +2444,7 @@ static int get_compat_pages_array(const void __user *chunk_pages[], int i; for (i = 0; i < chunk_nr; i++) { - if (get_user(p, pages32 + i)) + if (get_user(p, pages32 + chunk_offset + i)) return -EFAULT; chunk_pages[i] = compat_ptr(p); } @@ -2425,27 +2463,28 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, #define DO_PAGES_STAT_CHUNK_NR 16UL const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; int chunk_status[DO_PAGES_STAT_CHUNK_NR]; + unsigned long chunk_offset = 0; while (nr_pages) { unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR); if (in_compat_syscall()) { if (get_compat_pages_array(chunk_pages, pages, - chunk_nr)) + chunk_offset, chunk_nr)) break; } else { - if (copy_from_user(chunk_pages, pages, + if (copy_from_user(chunk_pages, pages + chunk_offset, chunk_nr * sizeof(*chunk_pages))) break; } do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); - if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status))) + if (copy_to_user(status + chunk_offset, chunk_status, + chunk_nr * sizeof(*status))) break; - pages += chunk_nr; - status += chunk_nr; + chunk_offset += chunk_nr; nr_pages -= chunk_nr; } return nr_pages ? -EFAULT : 0; diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 3158afe7eb23..e05e14d6eacd 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -615,7 +615,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, pmdp = pmd_alloc(mm, pudp, addr); if (!pmdp) goto abort; - if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) + if (pmd_trans_huge(*pmdp)) goto abort; if (pte_alloc(mm, pmdp)) goto abort; diff --git a/mm/mlock.c b/mm/mlock.c index 3cb72b579ffd..a1d93ad33c6d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -307,15 +307,13 @@ void munlock_folio(struct folio *folio) static inline unsigned int folio_mlock_step(struct folio *folio, pte_t *pte, unsigned long addr, unsigned long end) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; unsigned int count = (end - addr) >> PAGE_SHIFT; pte_t ptent = ptep_get(pte); if (!folio_test_large(folio)) return 1; - return folio_pte_batch(folio, addr, pte, ptent, count, fpb_flags, NULL, - NULL, NULL); + return folio_pte_batch(folio, pte, ptent, count); } static inline bool allow_mlock_munlock(struct folio *folio, diff --git a/mm/mm_init.c b/mm/mm_init.c index f2944748f526..5c21b3af216b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -685,7 +685,8 @@ void __meminit __init_page_from_nid(unsigned long pfn, int nid) __init_single_page(pfn_to_page(pfn), pfn, zid, nid); if (pageblock_aligned(pfn)) - set_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE); + init_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE, + false); } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -874,7 +875,8 @@ static void __init init_unavailable_range(unsigned long spfn, void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, unsigned long zone_end_pfn, enum meminit_context context, - struct vmem_altmap *altmap, int migratetype) + struct vmem_altmap *altmap, int migratetype, + bool isolate_pageblock) { unsigned long pfn, end_pfn = start_pfn + size; struct page *page; @@ -931,7 +933,8 @@ void __meminit memmap_init_range(unsigned long size, 
int nid, unsigned long zone * over the place during system boot. */ if (pageblock_aligned(pfn)) { - set_pageblock_migratetype(page, migratetype); + init_pageblock_migratetype(page, migratetype, + isolate_pageblock); cond_resched(); } pfn++; @@ -954,7 +957,8 @@ static void __init memmap_init_zone_range(struct zone *zone, return; memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn, - zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); + zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE, + false); if (*hole_pfn < start_pfn) init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid); @@ -1035,7 +1039,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, * because this is done early in section_activate() */ if (pageblock_aligned(pfn)) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + init_pageblock_migratetype(page, MIGRATE_MOVABLE, false); cond_resched(); } @@ -1509,7 +1513,7 @@ static inline void setup_usemap(struct zone *zone) {} /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ void __init set_pageblock_order(void) { - unsigned int order = PAGE_BLOCK_ORDER; + unsigned int order = PAGE_BLOCK_MAX_ORDER; /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) @@ -1996,7 +2000,8 @@ static void __init deferred_free_pages(unsigned long pfn, /* Free a large naturally-aligned chunk if possible */ if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) { for (i = 0; i < nr_pages; i += pageblock_nr_pages) - set_pageblock_migratetype(page + i, MIGRATE_MOVABLE); + init_pageblock_migratetype(page + i, MIGRATE_MOVABLE, + false); __free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY); return; } @@ -2006,7 +2011,8 @@ static void __init deferred_free_pages(unsigned long pfn, for (i = 0; i < nr_pages; i++, page++, pfn++) { if (pageblock_aligned(pfn)) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + init_pageblock_migratetype(page, MIGRATE_MOVABLE, + false); __free_pages_core(page, 0, MEMINIT_EARLY); } } @@ -2305,7 +2311,7 @@ void __init init_cma_reserved_pageblock(struct page *page) set_page_count(p, 0); } while (++p, --i); - set_pageblock_migratetype(page, MIGRATE_CMA); + init_pageblock_migratetype(page, MIGRATE_CMA, false); set_page_refcounted(page); /* pages were reserved and not allocated */ clear_page_tag_ref(page); @@ -2319,7 +2325,7 @@ void __init init_cma_reserved_pageblock(struct page *page) */ void __init init_cma_pageblock(struct page *page) { - set_pageblock_migratetype(page, MIGRATE_CMA); + init_pageblock_migratetype(page, MIGRATE_CMA, false); adjust_managed_page_count(page, pageblock_nr_pages); page_zone(page)->cma_pages += pageblock_nr_pages; } diff --git a/mm/mmap.c b/mm/mmap.c index 09c563c95112..7306253cc3b5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -80,7 +80,7 @@ core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); /* Update vma->vm_page_prot to reflect vma->vm_flags. 
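 *
 * Callers that change protection-relevant flags are expected to refresh
 * the cached protection afterwards, e.g. (illustrative sketch):
 *
 *	vm_flags_set(vma, VM_WRITE);
 *	vma_set_page_prot(vma);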
*/ void vma_set_page_prot(struct vm_area_struct *vma) { - unsigned long vm_flags = vma->vm_flags; + vm_flags_t vm_flags = vma->vm_flags; pgprot_t vm_page_prot; vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); @@ -127,18 +127,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) origbrk = mm->brk; + min_brk = mm->start_brk; #ifdef CONFIG_COMPAT_BRK /* * CONFIG_COMPAT_BRK can still be overridden by setting * randomize_va_space to 2, which will still cause mm->start_brk * to be arbitrarily shifted */ - if (current->brk_randomized) - min_brk = mm->start_brk; - else + if (!current->brk_randomized) min_brk = mm->end_data; -#else - min_brk = mm->start_brk; #endif if (brk < min_brk) goto out; @@ -228,12 +225,12 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return hint; } -bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, +bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags, unsigned long bytes) { unsigned long locked_pages, limit_pages; - if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) + if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) return true; locked_pages = bytes >> PAGE_SHIFT; @@ -475,7 +472,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags &= ~VM_MAYEXEC; } - if (!file_has_valid_mmap_hooks(file)) + if (!can_mmap_file(file)) return -ENODEV; if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; @@ -871,9 +868,8 @@ mm_get_unmapped_area(struct mm_struct *mm, struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - if (test_bit(MMF_TOPDOWN, &mm->flags)) - return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags, 0); - return arch_get_unmapped_area(file, addr, len, pgoff, flags, 0); + return mm_get_unmapped_area_vmflags(mm, file, addr, len, + pgoff, flags, 0); } EXPORT_SYMBOL(mm_get_unmapped_area); @@ -1207,7 +1203,7 @@ out: return ret; } -int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) +int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; @@ -1224,7 +1220,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) return 0; /* Until we need other flags, refuse anything except VM_EXEC. 
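 *
 * i.e. (illustrative) only these two forms are accepted:
 *
 *	vm_brk_flags(addr, len, 0);
 *	vm_brk_flags(addr, len, VM_EXEC);
 *
 * any other vm_flags value results in -EINVAL.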
*/ - if ((flags & (~VM_EXEC)) != 0) + if ((vm_flags & (~VM_EXEC)) != 0) return -EINVAL; if (mmap_write_lock_killable(mm)) @@ -1239,7 +1235,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) goto munmap_failed; vma = vma_prev(&vmi); - ret = do_brk_flags(&vmi, vma, addr, len, flags); + ret = do_brk_flags(&vmi, vma, addr, len, vm_flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); @@ -1444,7 +1440,7 @@ static vm_fault_t special_mapping_fault(struct vm_fault *vmf) static struct vm_area_struct *__install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, - unsigned long vm_flags, void *priv, + vm_flags_t vm_flags, void *priv, const struct vm_operations_struct *ops) { int ret; @@ -1496,7 +1492,7 @@ bool vma_is_special_mapping(const struct vm_area_struct *vma, struct vm_area_struct *_install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, - unsigned long vm_flags, const struct vm_special_mapping *spec) + vm_flags_t vm_flags, const struct vm_special_mapping *spec) { return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec, &special_mapping_vmops); diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 5f725cc67334..729fb7d0dd59 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -178,6 +178,99 @@ inval: count_vm_vma_lock_event(VMA_LOCK_ABORT); return NULL; } + +static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm, + struct vma_iterator *vmi, + unsigned long from_addr) +{ + struct vm_area_struct *vma; + int ret; + + ret = mmap_read_lock_killable(mm); + if (ret) + return ERR_PTR(ret); + + /* Lookup the vma at the last position again under mmap_read_lock */ + vma_iter_set(vmi, from_addr); + vma = vma_next(vmi); + if (vma) { + /* Very unlikely vma->vm_refcnt overflow case */ + if (unlikely(!vma_start_read_locked(vma))) + vma = ERR_PTR(-EAGAIN); + } + + mmap_read_unlock(mm); + + return vma; +} + +struct vm_area_struct *lock_next_vma(struct mm_struct *mm, + struct vma_iterator *vmi, + unsigned long from_addr) +{ + struct vm_area_struct *vma; + unsigned int mm_wr_seq; + bool mmap_unlocked; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held"); +retry: + /* Start mmap_lock speculation in case we need to verify the vma later */ + mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq); + vma = vma_next(vmi); + if (!vma) + return NULL; + + vma = vma_start_read(mm, vma); + if (IS_ERR_OR_NULL(vma)) { + /* + * Retry immediately if the vma gets detached from under us. + * Infinite loop should not happen because the vma we find will + * have to be constantly knocked out from under us. + */ + if (PTR_ERR(vma) == -EAGAIN) { + /* reset to search from the last address */ + vma_iter_set(vmi, from_addr); + goto retry; + } + + goto fallback; + } + + /* + * Verify the vma we locked belongs to the same address space and it's + * not behind of the last search position. + */ + if (unlikely(vma->vm_mm != mm || from_addr >= vma->vm_end)) + goto fallback_unlock; + + /* + * vma can be ahead of the last search position but we need to verify + * it was not shrunk after we found it and another vma has not been + * installed ahead of it. Otherwise we might observe a gap that should + * not be there. + */ + if (from_addr < vma->vm_start) { + /* Verify only if the address space might have changed since vma lookup. 
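 *
 * (sketch of the speculation protocol) mmap_lock_speculate_try_begin()
 * snapshots the mmap_lock write sequence count; mmap_lock_speculate_retry()
 * later reports whether a writer ran, or may still be running, since that
 * snapshot. Only in that case do we pay for the re-lookup below.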
*/ + if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) { + vma_iter_set(vmi, from_addr); + if (vma != vma_next(vmi)) + goto fallback_unlock; + } + } + + return vma; + +fallback_unlock: + vma_end_read(vma); +fallback: + rcu_read_unlock(); + vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr); + rcu_read_lock(); + /* Reinitialize the iterator after re-entering rcu read section */ + vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end); + + return vma; +} #endif /* CONFIG_PER_VMA_LOCK */ #ifdef CONFIG_LOCK_MM_AND_FIND_VMA diff --git a/mm/mprotect.c b/mm/mprotect.c index 88608d0dc2c2..2ddd37b2f462 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -40,11 +40,8 @@ #include "internal.h" -bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, - pte_t pte) +static bool maybe_change_pte_writable(struct vm_area_struct *vma, pte_t pte) { - struct page *page; - if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE))) return false; @@ -60,16 +57,32 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, if (userfaultfd_pte_wp(vma, pte)) return false; - if (!(vma->vm_flags & VM_SHARED)) { - /* - * Writable MAP_PRIVATE mapping: We can only special-case on - * exclusive anonymous pages, because we know that our - * write-fault handler similarly would map them writable without - * any additional checks while holding the PT lock. - */ - page = vm_normal_page(vma, addr, pte); - return page && PageAnon(page) && PageAnonExclusive(page); - } + return true; +} + +static bool can_change_private_pte_writable(struct vm_area_struct *vma, + unsigned long addr, pte_t pte) +{ + struct page *page; + + if (!maybe_change_pte_writable(vma, pte)) + return false; + + /* + * Writable MAP_PRIVATE mapping: We can only special-case on + * exclusive anonymous pages, because we know that our + * write-fault handler similarly would map them writable without + * any additional checks while holding the PT lock. 
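 *
 * Example (illustrative): for a MAP_PRIVATE|MAP_ANONYMOUS region that was
 * written once (so its pages are exclusive anon) and then mprotect()ed to
 * PROT_READ, a later mprotect(PROT_READ | PROT_WRITE) can map those pages
 * writable right here instead of taking a write fault per page.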
+ */ + page = vm_normal_page(vma, addr, pte); + return page && PageAnon(page) && PageAnonExclusive(page); +} + +static bool can_change_shared_pte_writable(struct vm_area_struct *vma, + pte_t pte) +{ + if (!maybe_change_pte_writable(vma, pte)) + return false; VM_WARN_ON_ONCE(is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte)); @@ -83,6 +96,183 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, return pte_dirty(pte); } +bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) +{ + if (!(vma->vm_flags & VM_SHARED)) + return can_change_private_pte_writable(vma, addr, pte); + + return can_change_shared_pte_writable(vma, pte); +} + +static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep, + pte_t pte, int max_nr_ptes, fpb_t flags) +{ + /* No underlying folio, so cannot batch */ + if (!folio) + return 1; + + if (!folio_test_large(folio)) + return 1; + + return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr_ptes, flags); +} + +static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr, + pte_t oldpte, pte_t *pte, int target_node, + struct folio **foliop) +{ + struct folio *folio = NULL; + bool ret = true; + bool toptier; + int nid; + + /* Avoid TLB flush if possible */ + if (pte_protnone(oldpte)) + goto skip; + + folio = vm_normal_folio(vma, addr, oldpte); + if (!folio) + goto skip; + + if (folio_is_zone_device(folio) || folio_test_ksm(folio)) + goto skip; + + /* Also skip shared copy-on-write pages */ + if (is_cow_mapping(vma->vm_flags) && + (folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio))) + goto skip; + + /* + * While migration can move some dirty pages, + * it cannot move them all from MIGRATE_ASYNC + * context. + */ + if (folio_is_file_lru(folio) && folio_test_dirty(folio)) + goto skip; + + /* + * Don't mess with PTEs if page is already on the node + * a single-threaded process is running on. + */ + nid = folio_nid(folio); + if (target_node == nid) + goto skip; + + toptier = node_is_toptier(nid); + + /* + * Skip scanning top tier node if normal numa + * balancing is disabled + */ + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier) + goto skip; + + ret = false; + if (folio_use_access_time(folio)) + folio_xchg_access_time(folio, jiffies_to_msecs(jiffies)); + +skip: + *foliop = folio; + return ret; +} + +/* Set nr_ptes number of ptes, starting from idx */ +static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes, + int idx, bool set_write, struct mmu_gather *tlb) +{ + /* + * Advance the position in the batch by idx; note that if idx > 0, + * then the nr_ptes passed here is <= batch size - idx. + */ + addr += idx * PAGE_SIZE; + ptep += idx; + oldpte = pte_advance_pfn(oldpte, idx); + ptent = pte_advance_pfn(ptent, idx); + + if (set_write) + ptent = pte_mkwrite(ptent, vma); + + modify_prot_commit_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes); + if (pte_needs_flush(oldpte, ptent)) + tlb_flush_pte_range(tlb, addr, nr_ptes * PAGE_SIZE); +} + +/* + * Get max length of consecutive ptes pointing to PageAnonExclusive() pages or + * !PageAnonExclusive() pages, starting from start_idx. Caller must enforce + * that the ptes point to consecutive pages of the same anon large folio. 
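 *
 * Example (illustrative): for a batch of 8 ptes whose pages have
 * AnonExclusive bits [1 1 1 0 0 1 1 1], starting from index 0 this yields
 * sub-batches of length 3, 2 and 3, each committed with a single set_write
 * decision by the caller.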
+ */ +static int page_anon_exclusive_sub_batch(int start_idx, int max_len, + struct page *first_page, bool expected_anon_exclusive) +{ + int idx; + + for (idx = start_idx + 1; idx < start_idx + max_len; ++idx) { + if (expected_anon_exclusive != PageAnonExclusive(first_page + idx)) + break; + } + return idx - start_idx; +} + +/* + * This function is a result of trying our very best to retain the + * "avoid the write-fault handler" optimization. In can_change_pte_writable(), + * if the vma is a private vma, and we cannot determine whether to change + * the pte to writable just from the vma and the pte, we then need to look + * at the actual page pointed to by the pte. Unfortunately, if we have a + * batch of ptes pointing to consecutive pages of the same anon large folio, + * the anon-exclusivity (or the negation) of the first page does not guarantee + * the anon-exclusivity (or the negation) of the other pages corresponding to + * the pte batch; hence in this case it is incorrect to decide to change or + * not change the ptes to writable just by using information from the first + * pte of the batch. Therefore, we must individually check all pages and + * retrieve sub-batches. + */ +static void commit_anon_folio_batch(struct vm_area_struct *vma, + struct folio *folio, unsigned long addr, pte_t *ptep, + pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb) +{ + struct page *first_page = folio_page(folio, 0); + bool expected_anon_exclusive; + int sub_batch_idx = 0; + int len; + + while (nr_ptes) { + expected_anon_exclusive = PageAnonExclusive(first_page + sub_batch_idx); + len = page_anon_exclusive_sub_batch(sub_batch_idx, nr_ptes, + first_page, expected_anon_exclusive); + prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, len, + sub_batch_idx, expected_anon_exclusive, tlb); + sub_batch_idx += len; + nr_ptes -= len; + } +} + +static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma, + struct folio *folio, unsigned long addr, pte_t *ptep, + pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb) +{ + bool set_write; + + if (vma->vm_flags & VM_SHARED) { + set_write = can_change_shared_pte_writable(vma, ptent); + prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes, + /* idx = */ 0, set_write, tlb); + return; + } + + set_write = maybe_change_pte_writable(vma, ptent) && + (folio && folio_test_anon(folio)); + if (!set_write) { + prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes, + /* idx = */ 0, set_write, tlb); + return; + } + commit_anon_folio_batch(vma, folio, addr, ptep, oldpte, ptent, nr_ptes, tlb); +} + static long change_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) @@ -94,6 +284,7 @@ static long change_pte_range(struct mmu_gather *tlb, bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + int nr_ptes; tlb_change_page_size(tlb, PAGE_SIZE); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); @@ -108,8 +299,12 @@ static long change_pte_range(struct mmu_gather *tlb, flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); do { + nr_ptes = 1; oldpte = ptep_get(pte); if (pte_present(oldpte)) { + const fpb_t flags = FPB_RESPECT_SOFT_DIRTY | FPB_RESPECT_WRITE; + int max_nr_ptes = (end - addr) >> PAGE_SHIFT; + struct folio *folio = NULL; pte_t ptent; /* @@ -117,56 +312,23 @@ static long change_pte_range(struct mmu_gather *tlb, * 
pages. See similar comment in change_huge_pmd. */ if (prot_numa) { - struct folio *folio; - int nid; - bool toptier; + int ret = prot_numa_skip(vma, addr, oldpte, pte, + target_node, &folio); + if (ret) { - /* Avoid TLB flush if possible */ - if (pte_protnone(oldpte)) + /* determine batch to skip */ + nr_ptes = mprotect_folio_pte_batch(folio, + pte, oldpte, max_nr_ptes, /* flags = */ 0); continue; + } + } + if (!folio) folio = vm_normal_folio(vma, addr, oldpte); - if (!folio || folio_is_zone_device(folio) || - folio_test_ksm(folio)) - continue; - - /* Also skip shared copy-on-write pages */ - if (is_cow_mapping(vma->vm_flags) && - (folio_maybe_dma_pinned(folio) || - folio_maybe_mapped_shared(folio))) - continue; - /* - * While migration can move some dirty pages, - * it cannot move them all from MIGRATE_ASYNC - * context. - */ - if (folio_is_file_lru(folio) && - folio_test_dirty(folio)) - continue; - - /* - * Don't mess with PTEs if page is already on the node - * a single-threaded process is running on. - */ - nid = folio_nid(folio); - if (target_node == nid) - continue; - toptier = node_is_toptier(nid); - - /* - * Skip scanning top tier node if normal numa - * balancing is disabled - */ - if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && - toptier) - continue; - if (folio_use_access_time(folio)) - folio_xchg_access_time(folio, - jiffies_to_msecs(jiffies)); - } + nr_ptes = mprotect_folio_pte_batch(folio, pte, oldpte, max_nr_ptes, flags); - oldpte = ptep_modify_prot_start(vma, addr, pte); + oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes); ptent = pte_modify(oldpte, newprot); if (uffd_wp) @@ -188,14 +350,13 @@ static long change_pte_range(struct mmu_gather *tlb, * COW or special handling is required. */ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && - !pte_write(ptent) && - can_change_pte_writable(vma, addr, ptent)) - ptent = pte_mkwrite(ptent, vma); - - ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); - if (pte_needs_flush(oldpte, ptent)) - tlb_flush_pte_range(tlb, addr, PAGE_SIZE); - pages++; + !pte_write(ptent)) + set_write_prot_commit_flush_ptes(vma, folio, + addr, pte, oldpte, ptent, nr_ptes, tlb); + else + prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent, + nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb); + pages += nr_ptes; } else if (is_swap_pte(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); pte_t newpte; @@ -280,7 +441,7 @@ static long change_pte_range(struct mmu_gather *tlb, pages++; } } - } while (pte++, addr += PAGE_SIZE, addr != end); + } while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); @@ -376,7 +537,7 @@ again: goto next; _pmd = pmdp_get_lockless(pmd); - if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) { + if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd)) { if ((next - addr != HPAGE_PMD_SIZE) || pgtable_split_needed(vma, cp_flags)) { __split_huge_pmd(vma, pmd, addr, false); @@ -596,10 +757,10 @@ static const struct mm_walk_ops prot_none_walk_ops = { int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, struct vm_area_struct *vma, struct vm_area_struct **pprev, - unsigned long start, unsigned long end, unsigned long newflags) + unsigned long start, unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; - unsigned long oldflags = READ_ONCE(vma->vm_flags); + vm_flags_t oldflags = READ_ONCE(vma->vm_flags); long nrpages = (end - start) >> PAGE_SHIFT; unsigned int mm_cp_flags = 0; unsigned long charged = 0; @@ 
-774,8 +935,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len, nstart = start; tmp = vma->vm_start; for_each_vma_range(vmi, vma, end) { - unsigned long mask_off_old_flags; - unsigned long newflags; + vm_flags_t mask_off_old_flags; + vm_flags_t newflags; int new_vma_pkey; if (vma->vm_start != tmp) { diff --git a/mm/mremap.c b/mm/mremap.c index 60f6b8d0d5f0..e15cf2e444c7 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -52,7 +52,7 @@ struct vma_remap_struct { unsigned long addr; /* User-specified address from which we remap. */ unsigned long old_len; /* Length of range being remapped. */ unsigned long new_len; /* Desired new length of mapping. */ - unsigned long flags; /* user-specified MREMAP_* flags. */ + const unsigned long flags; /* user-specified MREMAP_* flags. */ unsigned long new_addr; /* Optionally, desired new address. */ /* uffd state. */ @@ -65,10 +65,11 @@ struct vma_remap_struct { /* Internal state, determined in do_mremap(). */ unsigned long delta; /* Absolute delta of old_len,new_len. */ - bool mlocked; /* Was the VMA mlock()'d? */ + bool populate_expand; /* mlock()'d expanded, must populate. */ enum mremap_type remap_type; /* expand, shrink, etc. */ bool mmap_locked; /* Is mm currently write-locked? */ unsigned long charged; /* If VM_ACCOUNT, # pages to account. */ + bool vmi_needs_invalidate; /* Is the VMA iterator invalidated? */ }; static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr) @@ -170,13 +171,29 @@ static pte_t move_soft_dirty_pte(pte_t pte) return pte; } +static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t pte, int max_nr) +{ + struct folio *folio; + + if (max_nr == 1) + return 1; + + folio = vm_normal_folio(vma, addr, pte); + if (!folio || !folio_test_large(folio)) + return 1; + + return folio_pte_batch(folio, ptep, pte, max_nr); +} + static int move_ptes(struct pagetable_move_control *pmc, unsigned long extent, pmd_t *old_pmd, pmd_t *new_pmd) { struct vm_area_struct *vma = pmc->old; bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma); struct mm_struct *mm = vma->vm_mm; - pte_t *old_pte, *new_pte, pte; + pte_t *old_ptep, *new_ptep; + pte_t old_pte, pte; pmd_t dummy_pmdval; spinlock_t *old_ptl, *new_ptl; bool force_flush = false; @@ -184,6 +201,8 @@ static int move_ptes(struct pagetable_move_control *pmc, unsigned long new_addr = pmc->new_addr; unsigned long old_end = old_addr + extent; unsigned long len = old_end - old_addr; + int max_nr_ptes; + int nr_ptes; int err = 0; /* @@ -211,8 +230,8 @@ static int move_ptes(struct pagetable_move_control *pmc, * We don't have to worry about the ordering of src and dst * pte locks because exclusive mmap_lock prevents deadlock. */ - old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); - if (!old_pte) { + old_ptep = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); + if (!old_ptep) { err = -EAGAIN; goto out; } @@ -223,10 +242,10 @@ static int move_ptes(struct pagetable_move_control *pmc, * mmap_lock, so this new_pte page is stable, so there is no need to get * pmdval and do pmd_same() check. 
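 *
 * The copy loop below now moves consecutive present ptes mapping the same
 * large folio as one batch, roughly (sketch of the calls made below):
 *
 *	nr = mremap_folio_pte_batch(vma, old_addr, old_ptep, old_pte, max_nr);
 *	pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr, 0);
 *	...
 *	set_ptes(mm, new_addr, new_ptep, pte, nr);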
*/ - new_pte = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval, + new_ptep = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval, &new_ptl); - if (!new_pte) { - pte_unmap_unlock(old_pte, old_ptl); + if (!new_ptep) { + pte_unmap_unlock(old_ptep, old_ptl); err = -EAGAIN; goto out; } @@ -235,14 +254,16 @@ static int move_ptes(struct pagetable_move_control *pmc, flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); - for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, - new_pte++, new_addr += PAGE_SIZE) { - VM_WARN_ON_ONCE(!pte_none(*new_pte)); + for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE, + new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) { + VM_WARN_ON_ONCE(!pte_none(*new_ptep)); - if (pte_none(ptep_get(old_pte))) + nr_ptes = 1; + max_nr_ptes = (old_end - old_addr) >> PAGE_SHIFT; + old_pte = ptep_get(old_ptep); + if (pte_none(old_pte)) continue; - pte = ptep_get_and_clear(mm, old_addr, old_pte); /* * If we are remapping a valid PTE, make sure * to flush TLB before we drop the PTL for the @@ -254,13 +275,17 @@ static int move_ptes(struct pagetable_move_control *pmc, * the TLB entry for the old mapping has been * flushed. */ - if (pte_present(pte)) + if (pte_present(old_pte)) { + nr_ptes = mremap_folio_pte_batch(vma, old_addr, old_ptep, + old_pte, max_nr_ptes); force_flush = true; + } + pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr_ptes, 0); pte = move_pte(pte, old_addr, new_addr); pte = move_soft_dirty_pte(pte); if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) - pte_clear(mm, new_addr, new_pte); + pte_clear(mm, new_addr, new_ptep); else { if (need_clear_uffd_wp) { if (pte_present(pte)) @@ -268,7 +293,7 @@ static int move_ptes(struct pagetable_move_control *pmc, else if (is_swap_pte(pte)) pte = pte_swp_clear_uffd_wp(pte); } - set_pte_at(mm, new_addr, new_pte, pte); + set_ptes(mm, new_addr, new_ptep, pte, nr_ptes); } } @@ -277,8 +302,8 @@ static int move_ptes(struct pagetable_move_control *pmc, flush_tlb_range(vma, old_end - len, old_end); if (new_ptl != old_ptl) spin_unlock(new_ptl); - pte_unmap(new_pte - 1); - pte_unmap_unlock(old_pte - 1, old_ptl); + pte_unmap(new_ptep - 1); + pte_unmap_unlock(old_ptep - 1, old_ptl); out: if (pmc->need_rmap_locks) drop_rmap_locks(vma); @@ -794,7 +819,7 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc) new_pud = alloc_new_pud(mm, pmc->new_addr); if (!new_pud) break; - if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) { + if (pud_trans_huge(*old_pud)) { if (extent == HPAGE_PUD_SIZE) { move_pgt_entry(pmc, HPAGE_PUD, old_pud, new_pud); /* We ignore and continue on error? */ @@ -813,8 +838,7 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc) if (!new_pmd) break; again: - if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || - pmd_devmap(*old_pmd)) { + if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) { if (extent == HPAGE_PMD_SIZE && move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd)) continue; @@ -886,7 +910,11 @@ static bool vrm_overlaps(struct vma_remap_struct *vrm) return false; } -/* Do the mremap() flags require that the new_addr parameter be specified? */ +/* + * Will a new address definitely be assigned? This either if the user specifies + * it via MREMAP_FIXED, or if MREMAP_DONTUNMAP is used, indicating we will + * always detemrine a target address. 
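 *
 * e.g. (illustrative) both of these imply a target address:
 *
 *	mremap(old, len, len, MREMAP_MAYMOVE | MREMAP_FIXED, new);
 *	mremap(old, len, len, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, NULL);
 *
 * with MREMAP_FIXED the caller supplies it; with MREMAP_DONTUNMAP alone
 * the kernel determines it.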
+ */ static bool vrm_implies_new_addr(struct vma_remap_struct *vrm) { return vrm->flags & (MREMAP_FIXED | MREMAP_DONTUNMAP); @@ -932,7 +960,7 @@ static unsigned long vrm_set_new_addr(struct vma_remap_struct *vrm) * * Returns true on success, false if insufficient memory to charge. */ -static bool vrm_charge(struct vma_remap_struct *vrm) +static bool vrm_calc_charge(struct vma_remap_struct *vrm) { unsigned long charged; @@ -983,10 +1011,8 @@ static void vrm_stat_account(struct vma_remap_struct *vrm, struct vm_area_struct *vma = vrm->vma; vm_stat_account(mm, vma->vm_flags, pages); - if (vma->vm_flags & VM_LOCKED) { + if (vma->vm_flags & VM_LOCKED) mm->locked_vm += pages; - vrm->mlocked = true; - } } /* @@ -999,7 +1025,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) struct vm_area_struct *vma = vrm->vma; unsigned long old_addr = vrm->addr; unsigned long old_len = vrm->old_len; - unsigned long dummy = vma->vm_flags; + vm_flags_t dummy = vma->vm_flags; /* * We'd prefer to avoid failure later on in do_munmap: @@ -1086,6 +1112,7 @@ static void unmap_source_vma(struct vma_remap_struct *vrm) err = do_vmi_munmap(&vmi, mm, addr, len, vrm->uf_unmap, /* unlock= */false); vrm->vma = NULL; /* Invalidated. */ + vrm->vmi_needs_invalidate = true; if (err) { /* OOM: unable to split vma, just get accounts right */ vm_acct_memory(len >> PAGE_SHIFT); @@ -1161,6 +1188,10 @@ static int copy_vma_and_data(struct vma_remap_struct *vrm, *new_vma_ptr = NULL; return -ENOMEM; } + /* By merging, we may have invalidated any iterator in use. */ + if (vma != vrm->vma) + vrm->vmi_needs_invalidate = true; + vrm->vma = vma; pmc.old = vma; pmc.new = new_vma; @@ -1237,8 +1268,11 @@ static unsigned long move_vma(struct vma_remap_struct *vrm) if (err) return err; - /* If accounted, charge the number of bytes the operation will use. */ - if (!vrm_charge(vrm)) + /* + * If accounted, determine the number of bytes the operation will + * charge. + */ + if (!vrm_calc_charge(vrm)) return -ENOMEM; /* We don't want racing faults. */ @@ -1277,64 +1311,6 @@ static unsigned long move_vma(struct vma_remap_struct *vrm) } /* - * resize_is_valid() - Ensure the vma can be resized to the new length at the give - * address. - * - * Return 0 on success, error otherwise. - */ -static int resize_is_valid(struct vma_remap_struct *vrm) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma = vrm->vma; - unsigned long addr = vrm->addr; - unsigned long old_len = vrm->old_len; - unsigned long new_len = vrm->new_len; - unsigned long pgoff; - - /* - * !old_len is a special case where an attempt is made to 'duplicate' - * a mapping. This makes no sense for private mappings as it will - * instead create a fresh/new mapping unrelated to the original. This - * is contrary to the basic idea of mremap which creates new mappings - * based on the original. There are no known use cases for this - * behavior. As a result, fail such attempts. - */ - if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) { - pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. 
This is not supported.\n", - current->comm, current->pid); - return -EINVAL; - } - - if ((vrm->flags & MREMAP_DONTUNMAP) && - (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))) - return -EINVAL; - - /* We can't remap across vm area boundaries */ - if (old_len > vma->vm_end - addr) - return -EFAULT; - - if (new_len == old_len) - return 0; - - /* Need to be careful about a growing mapping */ - pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; - pgoff += vma->vm_pgoff; - if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) - return -EINVAL; - - if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) - return -EFAULT; - - if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta)) - return -EAGAIN; - - if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT)) - return -ENOMEM; - - return 0; -} - -/* * The user has requested that the VMA be shrunk (i.e., old_len > new_len), so * execute this, optionally dropping the mmap lock when we do so. * @@ -1383,14 +1359,6 @@ static unsigned long mremap_to(struct vma_remap_struct *vrm) struct mm_struct *mm = current->mm; unsigned long err; - /* Is the new length or address silly? */ - if (vrm->new_len > TASK_SIZE || - vrm->new_addr > TASK_SIZE - vrm->new_len) - return -EINVAL; - - if (vrm_overlaps(vrm)) - return -EINVAL; - if (vrm->flags & MREMAP_FIXED) { /* * In mremap_to(). @@ -1400,6 +1368,7 @@ static unsigned long mremap_to(struct vma_remap_struct *vrm) err = do_munmap(mm, vrm->new_addr, vrm->new_len, vrm->uf_unmap_early); vrm->vma = NULL; /* Invalidated. */ + vrm->vmi_needs_invalidate = true; if (err) return err; @@ -1421,10 +1390,6 @@ static unsigned long mremap_to(struct vma_remap_struct *vrm) vrm->old_len = vrm->new_len; } - err = resize_is_valid(vrm); - if (err) - return err; - /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */ if (vrm->flags & MREMAP_DONTUNMAP) { vm_flags_t vm_flags = vrm->vma->vm_flags; @@ -1473,68 +1438,6 @@ static bool vrm_can_expand_in_place(struct vma_remap_struct *vrm) } /* - * Are the parameters passed to mremap() valid? If so return 0, otherwise return - * error. - */ -static unsigned long check_mremap_params(struct vma_remap_struct *vrm) - -{ - unsigned long addr = vrm->addr; - unsigned long flags = vrm->flags; - - /* Ensure no unexpected flag values. */ - if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP)) - return -EINVAL; - - /* Start address must be page-aligned. */ - if (offset_in_page(addr)) - return -EINVAL; - - /* - * We allow a zero old-len as a special case - * for DOS-emu "duplicate shm area" thing. But - * a zero new-len is nonsensical. - */ - if (!PAGE_ALIGN(vrm->new_len)) - return -EINVAL; - - /* Remainder of checks are for cases with specific new_addr. */ - if (!vrm_implies_new_addr(vrm)) - return 0; - - /* The new address must be page-aligned. */ - if (offset_in_page(vrm->new_addr)) - return -EINVAL; - - /* A fixed address implies a move. */ - if (!(flags & MREMAP_MAYMOVE)) - return -EINVAL; - - /* MREMAP_DONTUNMAP does not allow resizing in the process. */ - if (flags & MREMAP_DONTUNMAP && vrm->old_len != vrm->new_len) - return -EINVAL; - - /* - * move_vma() need us to stay 4 maps below the threshold, otherwise - * it will bail out at the very beginning. - * That is a problem if we have already unmaped the regions here - * (new_addr, and old_addr), because userspace will not know the - * state of the vma's after it gets -ENOMEM. - * So, to avoid such scenario we can pre-compute if the whole - * operation has high chances to success map-wise. 
- * Worst-scenario case is when both vma's (new_addr and old_addr) get - * split in 3 before unmapping it. - * That means 2 more maps (1 for each) to the ones we already hold. - * Check whether current map count plus 2 still leads us to 4 maps below - * the threshold, otherwise return -ENOMEM here to be more safe. - */ - if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3) - return -ENOMEM; - - return 0; -} - -/* * We know we can expand the VMA in-place by delta pages, so do so. * * If we discover the VMA is locked, update mm_struct statistics accordingly and @@ -1546,7 +1449,7 @@ static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm) struct vm_area_struct *vma = vrm->vma; VMA_ITERATOR(vmi, mm, vma->vm_end); - if (!vrm_charge(vrm)) + if (!vrm_calc_charge(vrm)) return -ENOMEM; /* @@ -1590,8 +1493,6 @@ static bool align_hugetlb(struct vma_remap_struct *vrm) if (vrm->new_len > vrm->old_len) return false; - vrm_set_delta(vrm); - return true; } @@ -1605,11 +1506,6 @@ static bool align_hugetlb(struct vma_remap_struct *vrm) static unsigned long expand_vma(struct vma_remap_struct *vrm) { unsigned long err; - unsigned long addr = vrm->addr; - - err = resize_is_valid(vrm); - if (err) - return err; /* * [addr, old_len) spans precisely to the end of the VMA, so try to @@ -1620,16 +1516,8 @@ static unsigned long expand_vma(struct vma_remap_struct *vrm) if (err) return err; - /* - * We want to populate the newly expanded portion of the VMA to - * satisfy the expectation that mlock()'ing a VMA maintains all - * of its pages in memory. - */ - if (vrm->mlocked) - vrm->new_addr = addr; - /* OK we're done! */ - return addr; + return vrm->addr; } /* @@ -1680,64 +1568,371 @@ static unsigned long mremap_at(struct vma_remap_struct *vrm) return expand_vma(vrm); } - BUG(); + /* Should not be possible. */ + WARN_ON_ONCE(1); + return -EINVAL; } -static unsigned long do_mremap(struct vma_remap_struct *vrm) +/* + * Will this operation result in the VMA being expanded or moved and thus need + * to map a new portion of virtual address space? + */ +static bool vrm_will_map_new(struct vma_remap_struct *vrm) +{ + if (vrm->remap_type == MREMAP_EXPAND) + return true; + + if (vrm_implies_new_addr(vrm)) + return true; + + return false; +} + +/* Does this remap ONLY move mappings? */ +static bool vrm_move_only(struct vma_remap_struct *vrm) +{ + if (!(vrm->flags & MREMAP_FIXED)) + return false; + + if (vrm->old_len != vrm->new_len) + return false; + + return true; +} + +static void notify_uffd(struct vma_remap_struct *vrm, bool failed) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long ret; - ret = check_mremap_params(vrm); - if (ret) - return ret; + /* Regardless of success/failure, we always notify of any unmaps. */ + userfaultfd_unmap_complete(mm, vrm->uf_unmap_early); + if (failed) + mremap_userfaultfd_fail(vrm->uf); + else + mremap_userfaultfd_complete(vrm->uf, vrm->addr, + vrm->new_addr, vrm->old_len); + userfaultfd_unmap_complete(mm, vrm->uf_unmap); +} - vrm->old_len = PAGE_ALIGN(vrm->old_len); - vrm->new_len = PAGE_ALIGN(vrm->new_len); - vrm_set_delta(vrm); +static bool vma_multi_allowed(struct vm_area_struct *vma) +{ + struct file *file; - if (mmap_write_lock_killable(mm)) - return -EINTR; - vrm->mmap_locked = true; + /* + * We can't support moving multiple uffd VMAs as notify requires + * mmap lock to be dropped. 
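 *
 * Note (sketch of the resulting behaviour): a VMA that fails this check can
 * still be mremap()ed on its own; it is only refused once the requested
 * range spans more than one VMA.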
+ */ + if (userfaultfd_armed(vma)) + return false; - vma = vrm->vma = vma_lookup(mm, vrm->addr); - if (!vma) { - ret = -EFAULT; - goto out; + /* + * Custom get unmapped area might result in MREMAP_FIXED not + * being obeyed. + */ + file = vma->vm_file; + if (file && !vma_is_shmem(vma) && !is_vm_hugetlb_page(vma)) { + const struct file_operations *fop = file->f_op; + + if (fop->get_unmapped_area) + return false; } + return true; +} + +static int check_prep_vma(struct vma_remap_struct *vrm) +{ + struct vm_area_struct *vma = vrm->vma; + struct mm_struct *mm = current->mm; + unsigned long addr = vrm->addr; + unsigned long old_len, new_len, pgoff; + + if (!vma) + return -EFAULT; + /* If mseal()'d, mremap() is prohibited. */ - if (!can_modify_vma(vma)) { - ret = -EPERM; - goto out; - } + if (!can_modify_vma(vma)) + return -EPERM; /* Align to hugetlb page size, if required. */ - if (is_vm_hugetlb_page(vma) && !align_hugetlb(vrm)) { - ret = -EINVAL; - goto out; - } + if (is_vm_hugetlb_page(vma) && !align_hugetlb(vrm)) + return -EINVAL; + vrm_set_delta(vrm); vrm->remap_type = vrm_remap_type(vrm); + /* For convenience, we set new_addr even if VMA won't move. */ + if (!vrm_implies_new_addr(vrm)) + vrm->new_addr = addr; - /* Actually execute mremap. */ - ret = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm); + /* Below only meaningful if we expand or move a VMA. */ + if (!vrm_will_map_new(vrm)) + return 0; -out: - if (vrm->mmap_locked) { - mmap_write_unlock(mm); - vrm->mmap_locked = false; + old_len = vrm->old_len; + new_len = vrm->new_len; - if (!offset_in_page(ret) && vrm->mlocked && vrm->new_len > vrm->old_len) - mm_populate(vrm->new_addr + vrm->old_len, vrm->delta); + /* + * !old_len is a special case where an attempt is made to 'duplicate' + * a mapping. This makes no sense for private mappings as it will + * instead create a fresh/new mapping unrelated to the original. This + * is contrary to the basic idea of mremap which creates new mappings + * based on the original. There are no known use cases for this + * behavior. As a result, fail such attempts. + */ + if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) { + pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", + current->comm, current->pid); + return -EINVAL; } - userfaultfd_unmap_complete(mm, vrm->uf_unmap_early); - mremap_userfaultfd_complete(vrm->uf, vrm->addr, ret, vrm->old_len); - userfaultfd_unmap_complete(mm, vrm->uf_unmap); + if ((vrm->flags & MREMAP_DONTUNMAP) && + (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))) + return -EINVAL; + + /* + * We permit crossing of boundaries for the range being unmapped due to + * a shrink. + */ + if (vrm->remap_type == MREMAP_SHRINK) + old_len = new_len; + + /* + * We can't remap across the end of VMAs, as another VMA may be + * adjacent: + * + * addr vma->vm_end + * |-----.----------| + * | . | + * |-----.----------| + * .<--------->xxx> + * old_len + * + * We also require that vma->vm_start <= addr < vma->vm_end. + */ + if (old_len > vma->vm_end - addr) + return -EFAULT; + + if (new_len == old_len) + return 0; + + /* We are expanding and the VMA is mlock()'d so we need to populate. 
*/ + if (vma->vm_flags & VM_LOCKED) + vrm->populate_expand = true; + + /* Need to be careful about a growing mapping */ + pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + pgoff += vma->vm_pgoff; + if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) + return -EINVAL; + + if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) + return -EFAULT; + + if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta)) + return -EAGAIN; + + if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT)) + return -ENOMEM; + + return 0; +} + +/* + * Are the parameters passed to mremap() valid? If so return 0, otherwise return + * error. + */ +static unsigned long check_mremap_params(struct vma_remap_struct *vrm) + +{ + unsigned long addr = vrm->addr; + unsigned long flags = vrm->flags; + + /* Ensure no unexpected flag values. */ + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP)) + return -EINVAL; + + /* Start address must be page-aligned. */ + if (offset_in_page(addr)) + return -EINVAL; + + /* + * We allow a zero old-len as a special case + * for DOS-emu "duplicate shm area" thing. But + * a zero new-len is nonsensical. + */ + if (!vrm->new_len) + return -EINVAL; + + /* Is the new length or address silly? */ + if (vrm->new_len > TASK_SIZE || + vrm->new_addr > TASK_SIZE - vrm->new_len) + return -EINVAL; - return ret; + /* Remainder of checks are for cases with specific new_addr. */ + if (!vrm_implies_new_addr(vrm)) + return 0; + + /* The new address must be page-aligned. */ + if (offset_in_page(vrm->new_addr)) + return -EINVAL; + + /* A fixed address implies a move. */ + if (!(flags & MREMAP_MAYMOVE)) + return -EINVAL; + + /* MREMAP_DONTUNMAP does not allow resizing in the process. */ + if (flags & MREMAP_DONTUNMAP && vrm->old_len != vrm->new_len) + return -EINVAL; + + /* Target VMA must not overlap source VMA. */ + if (vrm_overlaps(vrm)) + return -EINVAL; + + /* + * move_vma() need us to stay 4 maps below the threshold, otherwise + * it will bail out at the very beginning. + * That is a problem if we have already unmaped the regions here + * (new_addr, and old_addr), because userspace will not know the + * state of the vma's after it gets -ENOMEM. + * So, to avoid such scenario we can pre-compute if the whole + * operation has high chances to success map-wise. + * Worst-scenario case is when both vma's (new_addr and old_addr) get + * split in 3 before unmapping it. + * That means 2 more maps (1 for each) to the ones we already hold. + * Check whether current map count plus 2 still leads us to 4 maps below + * the threshold, otherwise return -ENOMEM here to be more safe. + */ + if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3) + return -ENOMEM; + + return 0; +} + +static unsigned long remap_move(struct vma_remap_struct *vrm) +{ + struct vm_area_struct *vma; + unsigned long start = vrm->addr; + unsigned long end = vrm->addr + vrm->old_len; + unsigned long new_addr = vrm->new_addr; + bool allowed = true, seen_vma = false; + unsigned long target_addr = new_addr; + unsigned long res = -EFAULT; + unsigned long last_end; + VMA_ITERATOR(vmi, current->mm, start); + + /* + * When moving VMAs we allow for batched moves across multiple VMAs, + * with all VMAs in the input range [addr, addr + old_len) being moved + * (and split as necessary). + */ + for_each_vma_range(vmi, vma, end) { + /* Account for start, end not aligned with VMA start, end. 
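 *
 * e.g. (illustrative) if start lies in the middle of the first VMA, only
 * [start, vma->vm_end) of it is moved (the VMA gets split accordingly);
 * likewise only [vma->vm_start, end) of the last VMA is moved.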
*/ + unsigned long addr = max(vma->vm_start, start); + unsigned long len = min(end, vma->vm_end) - addr; + unsigned long offset, res_vma; + + if (!allowed) + return -EFAULT; + + /* No gap permitted at the start of the range. */ + if (!seen_vma && start < vma->vm_start) + return -EFAULT; + + /* + * To sensibly move multiple VMAs, accounting for the fact that + * get_unmapped_area() may align even MAP_FIXED moves, we simply + * attempt to move such that the gaps between source VMAs remain + * consistent in destination VMAs, e.g.: + * + * X Y X Y + * <---> <-> <---> <-> + * |-------| |-----| |-----| |-------| |-----| |-----| + * | A | | B | | C | ---> | A' | | B' | | C' | + * |-------| |-----| |-----| |-------| |-----| |-----| + * new_addr + * + * So we map B' at A'->vm_end + X, and C' at B'->vm_end + Y. + */ + offset = seen_vma ? vma->vm_start - last_end : 0; + last_end = vma->vm_end; + + vrm->vma = vma; + vrm->addr = addr; + vrm->new_addr = target_addr + offset; + vrm->old_len = vrm->new_len = len; + + allowed = vma_multi_allowed(vma); + if (seen_vma && !allowed) + return -EFAULT; + + res_vma = check_prep_vma(vrm); + if (!res_vma) + res_vma = mremap_to(vrm); + if (IS_ERR_VALUE(res_vma)) + return res_vma; + + if (!seen_vma) { + VM_WARN_ON_ONCE(allowed && res_vma != new_addr); + res = res_vma; + } + + /* mmap lock is only dropped on shrink. */ + VM_WARN_ON_ONCE(!vrm->mmap_locked); + /* This is a move, no expand should occur. */ + VM_WARN_ON_ONCE(vrm->populate_expand); + + if (vrm->vmi_needs_invalidate) { + vma_iter_invalidate(&vmi); + vrm->vmi_needs_invalidate = false; + } + seen_vma = true; + target_addr = res_vma + vrm->new_len; + } + + return res; +} + +static unsigned long do_mremap(struct vma_remap_struct *vrm) +{ + struct mm_struct *mm = current->mm; + unsigned long res; + bool failed; + + vrm->old_len = PAGE_ALIGN(vrm->old_len); + vrm->new_len = PAGE_ALIGN(vrm->new_len); + + res = check_mremap_params(vrm); + if (res) + return res; + + if (mmap_write_lock_killable(mm)) + return -EINTR; + vrm->mmap_locked = true; + + if (vrm_move_only(vrm)) { + res = remap_move(vrm); + } else { + vrm->vma = vma_lookup(current->mm, vrm->addr); + res = check_prep_vma(vrm); + if (res) + goto out; + + /* Actually execute mremap. */ + res = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm); + } + +out: + failed = IS_ERR_VALUE(res); + + if (vrm->mmap_locked) + mmap_write_unlock(mm); + + /* VMA mlock'd + was expanded, so populated expanded region. 
*/ + if (!failed && vrm->populate_expand) + mm_populate(vrm->new_addr + vrm->old_len, vrm->delta); + + notify_uffd(vrm, failed); + return res; } /* diff --git a/mm/nommu.c b/mm/nommu.c index b624acec6d2e..736d0e0f0618 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -126,7 +126,7 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, unsigned long vm_flags, int node, + pgprot_t prot, vm_flags_t vm_flags, int node, const void *caller) { return __vmalloc_noprof(size, gfp_mask); @@ -719,7 +719,7 @@ static int validate_mmap_request(struct file *file, if (file) { /* files must support mmap */ - if (!file->f_op->mmap) + if (!can_mmap_file(file)) return -ENODEV; /* work out if what we've got could possibly be shared @@ -844,12 +844,12 @@ static int validate_mmap_request(struct file *file, * we've determined that we can make the mapping, now translate what we * now know into VMA flags */ -static unsigned long determine_vm_flags(struct file *file, - unsigned long prot, - unsigned long flags, - unsigned long capabilities) +static vm_flags_t determine_vm_flags(struct file *file, + unsigned long prot, + unsigned long flags, + unsigned long capabilities) { - unsigned long vm_flags; + vm_flags_t vm_flags; vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(file, flags); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 72b0ff0d4bae..3e248d1c3969 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1101,9 +1101,7 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc) * such filesystems balance_dirty_pages always checks wb counters * against wb limits. Even if global "nr_dirty" is under "freerun". * This is especially important for fuse which sets bdi->max_ratio to - * 1% by default. Without strictlimit feature, fuse writeback may - * consume arbitrary amount of RAM because it is accounted in - * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". + * 1% by default. * * Here, in wb_position_ratio(), we calculate pos_ratio based on * two values: wb_dirty and wb_thresh. 
Let's consider an example: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2ef3c07266b3..d1d037f97c5f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -353,81 +353,225 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; } +static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit) +{ + return pb_bit > PB_migrate_end && pb_bit < __NR_PAGEBLOCK_BITS; +} + +static __always_inline void +get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn, + unsigned long **bitmap_word, unsigned long *bitidx) +{ + unsigned long *bitmap; + unsigned long word_bitidx; + +#ifdef CONFIG_MEMORY_ISOLATION + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 8); +#else + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); +#endif + BUILD_BUG_ON(__MIGRATE_TYPE_END >= (1 << PB_migratetype_bits)); + VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); + + bitmap = get_pageblock_bitmap(page, pfn); + *bitidx = pfn_to_bitidx(page, pfn); + word_bitidx = *bitidx / BITS_PER_LONG; + *bitidx &= (BITS_PER_LONG - 1); + *bitmap_word = &bitmap[word_bitidx]; +} + + /** - * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages + * __get_pfnblock_flags_mask - Return the requested group of flags for + * a pageblock_nr_pages block of pages * @page: The page within the block of interest * @pfn: The target page frame number * @mask: mask of bits that the caller is interested in * * Return: pageblock_bits flags */ -unsigned long get_pfnblock_flags_mask(const struct page *page, - unsigned long pfn, unsigned long mask) +static unsigned long __get_pfnblock_flags_mask(const struct page *page, + unsigned long pfn, + unsigned long mask) { - unsigned long *bitmap; - unsigned long bitidx, word_bitidx; + unsigned long *bitmap_word; + unsigned long bitidx; unsigned long word; - bitmap = get_pageblock_bitmap(page, pfn); - bitidx = pfn_to_bitidx(page, pfn); - word_bitidx = bitidx / BITS_PER_LONG; - bitidx &= (BITS_PER_LONG-1); + get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx); /* - * This races, without locks, with set_pfnblock_flags_mask(). Ensure + * This races, without locks, with set_pfnblock_migratetype(). Ensure * a consistent read of the memory array, so that results, even though * racy, are not corrupted. 
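/*
 * Illustrative sketch, not part of the patch: the index arithmetic performed
 * by get_pfnblock_bitmap_bitidx(), pulled out as a standalone demo. The
 * constants are assumptions for this example only: pageblock_order = 9 and
 * NR_PAGEBLOCK_BITS = 8 (CONFIG_MEMORY_ISOLATION enabled).
 */
#include <stdio.h>

#define PAGEBLOCK_ORDER		9
#define NR_PAGEBLOCK_BITS	8
#define BITS_PER_LONG		64

int main(void)
{
	unsigned long pfn = 0x12345;
	/* Each pageblock owns NR_PAGEBLOCK_BITS consecutive bits. */
	unsigned long bitidx = (pfn >> PAGEBLOCK_ORDER) * NR_PAGEBLOCK_BITS;
	unsigned long word = bitidx / BITS_PER_LONG;	  /* index into the bitmap */
	unsigned long bit = bitidx & (BITS_PER_LONG - 1); /* offset inside that word */

	printf("pfn %#lx -> pageblock %lu -> word %lu, bit %lu\n",
	       pfn, pfn >> PAGEBLOCK_ORDER, word, bit);
	return 0;
}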
*/ - word = READ_ONCE(bitmap[word_bitidx]); + word = READ_ONCE(*bitmap_word); return (word >> bitidx) & mask; } -static __always_inline int get_pfnblock_migratetype(const struct page *page, - unsigned long pfn) +/** + * get_pfnblock_bit - Check if a standalone bit of a pageblock is set + * @page: The page within the block of interest + * @pfn: The target page frame number + * @pb_bit: pageblock bit to check + * + * Return: true if the bit is set, otherwise false + */ +bool get_pfnblock_bit(const struct page *page, unsigned long pfn, + enum pageblock_bits pb_bit) { - return get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK); + unsigned long *bitmap_word; + unsigned long bitidx; + + if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit))) + return false; + + get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx); + + return test_bit(bitidx + pb_bit, bitmap_word); } /** - * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages + * get_pfnblock_migratetype - Return the migratetype of a pageblock * @page: The page within the block of interest - * @flags: The flags to set * @pfn: The target page frame number - * @mask: mask of bits that the caller is interested in + * + * Return: The migratetype of the pageblock + * + * Use get_pfnblock_migratetype() if caller already has both @page and @pfn + * to save a call to page_to_pfn(). */ -void set_pfnblock_flags_mask(struct page *page, unsigned long flags, - unsigned long pfn, - unsigned long mask) +__always_inline enum migratetype +get_pfnblock_migratetype(const struct page *page, unsigned long pfn) { - unsigned long *bitmap; - unsigned long bitidx, word_bitidx; - unsigned long word; + unsigned long mask = MIGRATETYPE_AND_ISO_MASK; + unsigned long flags; - BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); - BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits)); + flags = __get_pfnblock_flags_mask(page, pfn, mask); - bitmap = get_pageblock_bitmap(page, pfn); - bitidx = pfn_to_bitidx(page, pfn); - word_bitidx = bitidx / BITS_PER_LONG; - bitidx &= (BITS_PER_LONG-1); +#ifdef CONFIG_MEMORY_ISOLATION + if (flags & BIT(PB_migrate_isolate)) + return MIGRATE_ISOLATE; +#endif + return flags & MIGRATETYPE_MASK; +} - VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); +/** + * __set_pfnblock_flags_mask - Set the requested group of flags for + * a pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @pfn: The target page frame number + * @flags: The flags to set + * @mask: mask of bits that the caller is interested in + */ +static void __set_pfnblock_flags_mask(struct page *page, unsigned long pfn, + unsigned long flags, unsigned long mask) +{ + unsigned long *bitmap_word; + unsigned long bitidx; + unsigned long word; + + get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx); mask <<= bitidx; flags <<= bitidx; - word = READ_ONCE(bitmap[word_bitidx]); + word = READ_ONCE(*bitmap_word); do { - } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags)); + } while (!try_cmpxchg(bitmap_word, &word, (word & ~mask) | flags)); +} + +/** + * set_pfnblock_bit - Set a standalone bit of a pageblock + * @page: The page within the block of interest + * @pfn: The target page frame number + * @pb_bit: pageblock bit to set + */ +void set_pfnblock_bit(const struct page *page, unsigned long pfn, + enum pageblock_bits pb_bit) +{ + unsigned long *bitmap_word; + unsigned long bitidx; + + if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit))) + return; + + get_pfnblock_bitmap_bitidx(page, 
pfn, &bitmap_word, &bitidx); + + set_bit(bitidx + pb_bit, bitmap_word); } -void set_pageblock_migratetype(struct page *page, int migratetype) +/** + * clear_pfnblock_bit - Clear a standalone bit of a pageblock + * @page: The page within the block of interest + * @pfn: The target page frame number + * @pb_bit: pageblock bit to clear + */ +void clear_pfnblock_bit(const struct page *page, unsigned long pfn, + enum pageblock_bits pb_bit) +{ + unsigned long *bitmap_word; + unsigned long bitidx; + + if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit))) + return; + + get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx); + + clear_bit(bitidx + pb_bit, bitmap_word); +} + +/** + * set_pageblock_migratetype - Set the migratetype of a pageblock + * @page: The page within the block of interest + * @migratetype: migratetype to set + */ +static void set_pageblock_migratetype(struct page *page, + enum migratetype migratetype) { if (unlikely(page_group_by_mobility_disabled && migratetype < MIGRATE_PCPTYPES)) migratetype = MIGRATE_UNMOVABLE; - set_pfnblock_flags_mask(page, (unsigned long)migratetype, - page_to_pfn(page), MIGRATETYPE_MASK); +#ifdef CONFIG_MEMORY_ISOLATION + if (migratetype == MIGRATE_ISOLATE) { + VM_WARN_ONCE(1, + "Use set_pageblock_isolate() for pageblock isolation"); + return; + } + VM_WARN_ONCE(get_pfnblock_bit(page, page_to_pfn(page), + PB_migrate_isolate), + "Use clear_pageblock_isolate() to unisolate pageblock"); + /* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */ +#endif + __set_pfnblock_flags_mask(page, page_to_pfn(page), + (unsigned long)migratetype, + MIGRATETYPE_AND_ISO_MASK); +} + +void __meminit init_pageblock_migratetype(struct page *page, + enum migratetype migratetype, + bool isolate) +{ + unsigned long flags; + + if (unlikely(page_group_by_mobility_disabled && + migratetype < MIGRATE_PCPTYPES)) + migratetype = MIGRATE_UNMOVABLE; + + flags = migratetype; + +#ifdef CONFIG_MEMORY_ISOLATION + if (migratetype == MIGRATE_ISOLATE) { + VM_WARN_ONCE( + 1, + "Set isolate=true to isolate pageblock with a migratetype"); + return; + } + if (isolate) + flags |= BIT(PB_migrate_isolate); +#endif + __set_pfnblock_flags_mask(page, page_to_pfn(page), flags, + MIGRATETYPE_AND_ISO_MASK); } #ifdef CONFIG_DEBUG_VM @@ -667,7 +811,7 @@ static inline void __add_to_free_list(struct page *page, struct zone *zone, int nr_pages = 1 << order; VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, - "page type is %lu, passed migratetype is %d (nr=%d)\n", + "page type is %d, passed migratetype is %d (nr=%d)\n", get_pageblock_migratetype(page), migratetype, nr_pages); if (tail) @@ -693,7 +837,7 @@ static inline void move_to_free_list(struct page *page, struct zone *zone, /* Free page moving can fail, so it happens before the type update */ VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt, - "page type is %lu, passed migratetype is %d (nr=%d)\n", + "page type is %d, passed migratetype is %d (nr=%d)\n", get_pageblock_migratetype(page), old_mt, nr_pages); list_move_tail(&page->buddy_list, &area->free_list[new_mt]); @@ -715,7 +859,7 @@ static inline void __del_page_from_free_list(struct page *page, struct zone *zon int nr_pages = 1 << order; VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, - "page type is %lu, passed migratetype is %d (nr=%d)\n", + "page type is %d, passed migratetype is %d (nr=%d)\n", get_pageblock_migratetype(page), migratetype, nr_pages); /* clear reported state and update reported page count */ @@ -1231,11 +1375,14 @@ __always_inline bool 
free_pages_prepare(struct page *page, (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; } } - if (PageMappingFlags(page)) { - if (PageAnon(page)) - mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); - page->mapping = NULL; + if (folio_test_anon(folio)) { + mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); + folio->mapping = NULL; } + if (unlikely(page_has_type(page))) + /* Reset the page_type (which overlays _mapcount) */ + page->page_type = UINT_MAX; + if (is_check_pages_enabled()) { if (free_page_is_bad(page)) bad++; @@ -1781,8 +1928,8 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, #endif /* - * Change the type of a block and move all its free pages to that - * type's freelist. + * Move all free pages of a block to new type's freelist. Caller needs to + * change the block type. */ static int __move_freepages_block(struct zone *zone, unsigned long start_pfn, int old_mt, int new_mt) @@ -1814,8 +1961,6 @@ static int __move_freepages_block(struct zone *zone, unsigned long start_pfn, pages_moved += 1 << order; } - set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt); - return pages_moved; } @@ -1860,7 +2005,7 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page, * migration are movable. But we don't actually try * isolating, as that would be expensive. */ - if (PageLRU(page) || __PageMovable(page)) + if (PageLRU(page) || page_has_movable_ops(page)) (*num_movable)++; pfn++; } @@ -1873,11 +2018,16 @@ static int move_freepages_block(struct zone *zone, struct page *page, int old_mt, int new_mt) { unsigned long start_pfn; + int res; if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) return -1; - return __move_freepages_block(zone, start_pfn, old_mt, new_mt); + res = __move_freepages_block(zone, start_pfn, old_mt, new_mt); + set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt); + + return res; + } #ifdef CONFIG_MEMORY_ISOLATION @@ -1905,11 +2055,19 @@ static unsigned long find_large_buddy(unsigned long start_pfn) return start_pfn; } +static inline void toggle_pageblock_isolate(struct page *page, bool isolate) +{ + if (isolate) + set_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate); + else + clear_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate); +} + /** - * move_freepages_block_isolate - move free pages in block for page isolation + * __move_freepages_block_isolate - move free pages in block for page isolation * @zone: the zone * @page: the pageblock page - * @migratetype: migratetype to set on the pageblock + * @isolate: to isolate the given pageblock or unisolate it * * This is similar to move_freepages_block(), but handles the special * case encountered in page isolation, where the block of interest @@ -1924,10 +2082,18 @@ static unsigned long find_large_buddy(unsigned long start_pfn) * * Returns %true if pages could be moved, %false otherwise. */ -bool move_freepages_block_isolate(struct zone *zone, struct page *page, - int migratetype) +static bool __move_freepages_block_isolate(struct zone *zone, + struct page *page, bool isolate) { unsigned long start_pfn, pfn; + int from_mt; + int to_mt; + + if (isolate == get_pageblock_isolate(page)) { + VM_WARN_ONCE(1, "%s a pageblock that is already in that state", + isolate ? 
"Isolate" : "Unisolate"); + return false; + } if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) return false; @@ -1944,7 +2110,7 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, del_page_from_free_list(buddy, zone, order, get_pfnblock_migratetype(buddy, pfn)); - set_pageblock_migratetype(page, migratetype); + toggle_pageblock_isolate(page, isolate); split_large_buddy(zone, buddy, pfn, order, FPI_NONE); return true; } @@ -1955,16 +2121,38 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, del_page_from_free_list(page, zone, order, get_pfnblock_migratetype(page, pfn)); - set_pageblock_migratetype(page, migratetype); + toggle_pageblock_isolate(page, isolate); split_large_buddy(zone, page, pfn, order, FPI_NONE); return true; } move: - __move_freepages_block(zone, start_pfn, - get_pfnblock_migratetype(page, start_pfn), - migratetype); + /* Use MIGRATETYPE_MASK to get non-isolate migratetype */ + if (isolate) { + from_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page), + MIGRATETYPE_MASK); + to_mt = MIGRATE_ISOLATE; + } else { + from_mt = MIGRATE_ISOLATE; + to_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page), + MIGRATETYPE_MASK); + } + + __move_freepages_block(zone, start_pfn, from_mt, to_mt); + toggle_pageblock_isolate(pfn_to_page(start_pfn), isolate); + return true; } + +bool pageblock_isolate_and_move_free_pages(struct zone *zone, struct page *page) +{ + return __move_freepages_block_isolate(zone, page, true); +} + +bool pageblock_unisolate_and_move_free_pages(struct zone *zone, struct page *page) +{ + return __move_freepages_block_isolate(zone, page, false); +} + #endif /* CONFIG_MEMORY_ISOLATION */ static void change_pageblock_range(struct page *pageblock_page, @@ -2156,6 +2344,7 @@ try_to_claim_block(struct zone *zone, struct page *page, if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) { __move_freepages_block(zone, start_pfn, block_type, start_type); + set_pageblock_migratetype(pfn_to_page(start_pfn), start_type); return __rmqueue_smallest(zone, order, start_type); } @@ -3123,7 +3312,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, /* * Do not instrument rmqueue() with KMSAN. This function may call - * __msan_poison_alloca() through a call to set_pfnblock_flags_mask(). + * __msan_poison_alloca() through a call to set_pfnblock_migratetype(). * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it * may call rmqueue() again, which will result in a deadlock. */ @@ -5028,11 +5217,28 @@ unsigned long get_zeroed_page_noprof(gfp_t gfp_mask) } EXPORT_SYMBOL(get_zeroed_page_noprof); +static void ___free_pages(struct page *page, unsigned int order, + fpi_t fpi_flags) +{ + /* get PageHead before we drop reference */ + int head = PageHead(page); + /* get alloc tag in case the page is released by others */ + struct alloc_tag *tag = pgalloc_tag_get(page); + + if (put_page_testzero(page)) + __free_frozen_pages(page, order, fpi_flags); + else if (!head) { + pgalloc_tag_sub_pages(tag, (1 << order) - 1); + while (order-- > 0) + __free_frozen_pages(page + (1 << order), order, + fpi_flags); + } +} + /** - * ___free_pages - Free pages allocated with alloc_pages(). + * __free_pages - Free pages allocated with alloc_pages(). * @page: The page pointer returned from alloc_pages(). * @order: The order of the allocation. - * @fpi_flags: Free Page Internal flags. * * This function can free multi-page allocations that are not compound * pages. 
It does not check that the @order passed in matches that of @@ -5049,23 +5255,6 @@ EXPORT_SYMBOL(get_zeroed_page_noprof); * Context: May be called in interrupt context or while holding a normal * spinlock, but not in NMI context or while holding a raw spinlock. */ -static void ___free_pages(struct page *page, unsigned int order, - fpi_t fpi_flags) -{ - /* get PageHead before we drop reference */ - int head = PageHead(page); - /* get alloc tag in case the page is released by others */ - struct alloc_tag *tag = pgalloc_tag_get(page); - - if (put_page_testzero(page)) - __free_frozen_pages(page, order, fpi_flags); - else if (!head) { - pgalloc_tag_sub_pages(tag, (1 << order) - 1); - while (order-- > 0) - __free_frozen_pages(page + (1 << order), order, - fpi_flags); - } -} void __free_pages(struct page *page, unsigned int order) { ___free_pages(page, order, FPI_NONE); @@ -6505,13 +6694,9 @@ static void alloc_contig_dump_pages(struct list_head *page_list) } } -/* - * [start, end) must belong to a single zone. - * @migratetype: using migratetype to filter the type of migration in - * trace_mm_alloc_contig_migrate_range_info. - */ +/* [start, end) must belong to a single zone. */ static int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end, int migratetype) + unsigned long start, unsigned long end) { /* This function is based on compact_zone() from compaction.c. */ unsigned int nr_reclaimed; @@ -6523,10 +6708,6 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, .gfp_mask = cc->gfp_mask, .reason = MR_CONTIG_RANGE, }; - struct page *page; - unsigned long total_mapped = 0; - unsigned long total_migrated = 0; - unsigned long total_reclaimed = 0; lru_cache_disable(); @@ -6552,22 +6733,9 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, &cc->migratepages); cc->nr_migratepages -= nr_reclaimed; - if (trace_mm_alloc_contig_migrate_range_info_enabled()) { - total_reclaimed += nr_reclaimed; - list_for_each_entry(page, &cc->migratepages, lru) { - struct folio *folio = page_folio(page); - - total_mapped += folio_mapped(folio) * - folio_nr_pages(folio); - } - } - ret = migrate_pages(&cc->migratepages, alloc_migration_target, NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL); - if (trace_mm_alloc_contig_migrate_range_info_enabled() && !ret) - total_migrated += cc->nr_migratepages; - /* * On -ENOMEM, migrate_pages() bails out right away. It is pointless * to retry again over this error, so do the same here. @@ -6583,10 +6751,6 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, putback_movable_pages(&cc->migratepages); } - trace_mm_alloc_contig_migrate_range_info(start, end, migratetype, - total_migrated, - total_reclaimed, - total_mapped); return (ret < 0) ? ret : 0; } @@ -6654,10 +6818,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) * alloc_contig_range() -- tries to allocate given range of pages * @start: start PFN to allocate * @end: one-past-the-last PFN to allocate - * @migratetype: migratetype of the underlying pageblocks (either - * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks - * in range must have the same migratetype and it must - * be either of the two. + * @alloc_flags: allocation information * @gfp_mask: GFP mask. Node/zone/placement hints are ignored; only some * action and reclaim modifiers are supported. Reclaim modifiers * control allocation behavior during compaction/migration/reclaim. 
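/*
 * Illustrative sketch, not part of the patch: the calling convention after
 * the switch to acr_flags_t. CMA passes ACR_FLAGS_CMA, other callers pass
 * ACR_FLAGS_NONE and get PB_ISOLATE_MODE_OTHER internally. The helper name
 * and the PFN range are placeholders.
 */
#include <linux/gfp.h>

static int example_grab_64_pages(unsigned long start_pfn)
{
	unsigned long end_pfn = start_pfn + 64;
	int ret;

	/* Non-CMA caller: plain contiguous allocation, no CMA accounting. */
	ret = alloc_contig_range(start_pfn, end_pfn, ACR_FLAGS_NONE,
				 GFP_KERNEL);
	if (ret)
		return ret;

	/* ... use the range ... */

	free_contig_range(start_pfn, end_pfn - start_pfn);
	return 0;
}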
@@ -6674,7 +6835,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) * need to be freed with free_contig_range(). */ int alloc_contig_range_noprof(unsigned long start, unsigned long end, - unsigned migratetype, gfp_t gfp_mask) + acr_flags_t alloc_flags, gfp_t gfp_mask) { unsigned long outer_start, outer_end; int ret = 0; @@ -6689,6 +6850,9 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, .alloc_contig = true, }; INIT_LIST_HEAD(&cc.migratepages); + enum pb_isolate_mode mode = (alloc_flags & ACR_FLAGS_CMA) ? + PB_ISOLATE_MODE_CMA_ALLOC : + PB_ISOLATE_MODE_OTHER; gfp_mask = current_gfp_context(gfp_mask); if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask)) @@ -6715,7 +6879,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, * put back to page allocator so that buddy can use them. */ - ret = start_isolate_page_range(start, end, migratetype, 0); + ret = start_isolate_page_range(start, end, mode); if (ret) goto done; @@ -6731,7 +6895,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, * allocated. So, if we fall through be sure to clear ret so that * -EBUSY is not accidentally used or returned to caller. */ - ret = __alloc_contig_migrate_range(&cc, start, end, migratetype); + ret = __alloc_contig_migrate_range(&cc, start, end); if (ret && ret != -EBUSY) goto done; @@ -6765,7 +6929,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, outer_start = find_large_buddy(start); /* Make sure the range is really isolated. */ - if (test_pages_isolated(outer_start, end, 0)) { + if (test_pages_isolated(outer_start, end, mode)) { ret = -EBUSY; goto done; } @@ -6798,7 +6962,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, start, end, outer_start, outer_end); } done: - undo_isolate_page_range(start, end, migratetype); + undo_isolate_page_range(start, end); return ret; } EXPORT_SYMBOL(alloc_contig_range_noprof); @@ -6808,8 +6972,8 @@ static int __alloc_contig_pages(unsigned long start_pfn, { unsigned long end_pfn = start_pfn + nr_pages; - return alloc_contig_range_noprof(start_pfn, end_pfn, MIGRATE_MOVABLE, - gfp_mask); + return alloc_contig_range_noprof(start_pfn, end_pfn, ACR_FLAGS_NONE, + gfp_mask); } static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, diff --git a/mm/page_ext.c b/mm/page_ext.c index c351fdfe9e9a..d7396a8970e5 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -369,25 +369,15 @@ static void __invalidate_page_ext(unsigned long pfn) } static int __meminit online_page_ext(unsigned long start_pfn, - unsigned long nr_pages, - int nid) + unsigned long nr_pages) { + int nid = pfn_to_nid(start_pfn); unsigned long start, end, pfn; int fail = 0; start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); - if (nid == NUMA_NO_NODE) { - /* - * In this case, "nid" already exists and contains valid memory. - * "start_pfn" passed to us is a pfn which is an arg for - * online__pages(), and start_pfn should exist. 
- */ - nid = pfn_to_nid(start_pfn); - VM_BUG_ON(!node_online(nid)); - } - for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) fail = init_section_page_ext(pfn, nid); if (!fail) @@ -435,8 +425,7 @@ static int __meminit page_ext_callback(struct notifier_block *self, switch (action) { case MEM_GOING_ONLINE: - ret = online_page_ext(mn->start_pfn, - mn->nr_pages, mn->status_change_nid); + ret = online_page_ext(mn->start_pfn, mn->nr_pages); break; case MEM_OFFLINE: offline_page_ext(mn->start_pfn, diff --git a/mm/page_idle.c b/mm/page_idle.c index 408aaf29a3ea..a82b340dc204 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -208,7 +208,7 @@ static const struct bin_attribute *const page_idle_bin_attrs[] = { }; static const struct attribute_group page_idle_attr_group = { - .bin_attrs_new = page_idle_bin_attrs, + .bin_attrs = page_idle_bin_attrs, .name = "page_idle", }; diff --git a/mm/page_io.c b/mm/page_io.c index f7716b6569fa..a2056a5ecb13 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -237,14 +237,13 @@ static void swap_zeromap_folio_clear(struct folio *folio) * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. */ -int swap_writeout(struct folio *folio, struct writeback_control *wbc) +int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug) { - int ret; + int ret = 0; + + if (folio_free_swap(folio)) + goto out_unlock; - if (folio_free_swap(folio)) { - folio_unlock(folio); - return 0; - } /* * Arch code may have to preserve more data than just the page * contents, e.g. memory tags. @@ -252,8 +251,7 @@ int swap_writeout(struct folio *folio, struct writeback_control *wbc) ret = arch_prepare_to_swap(folio); if (ret) { folio_mark_dirty(folio); - folio_unlock(folio); - return ret; + goto out_unlock; } /* @@ -264,28 +262,30 @@ int swap_writeout(struct folio *folio, struct writeback_control *wbc) */ if (is_folio_zero_filled(folio)) { swap_zeromap_folio_set(folio); - folio_unlock(folio); - return 0; - } else { - /* - * Clear bits this folio occupies in the zeromap to prevent - * zero data being read in from any previous zero writes that - * occupied the same swap entries. - */ - swap_zeromap_folio_clear(folio); + goto out_unlock; } + + /* + * Clear bits this folio occupies in the zeromap to prevent zero data + * being read in from any previous zero writes that occupied the same + * swap entries. + */ + swap_zeromap_folio_clear(folio); + if (zswap_store(folio)) { count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT); - folio_unlock(folio); - return 0; + goto out_unlock; } if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio))) { folio_mark_dirty(folio); return AOP_WRITEPAGE_ACTIVATE; } - __swap_writepage(folio, wbc); + __swap_writepage(folio, swap_plug); return 0; +out_unlock: + folio_unlock(folio); + return ret; } static inline void count_swpout_vm_event(struct folio *folio) @@ -371,9 +371,9 @@ static void sio_write_complete(struct kiocb *iocb, long ret) mempool_free(sio, sio_pool); } -static void swap_writepage_fs(struct folio *folio, struct writeback_control *wbc) +static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug) { - struct swap_iocb *sio = NULL; + struct swap_iocb *sio = swap_plug ? 
*swap_plug : NULL; struct swap_info_struct *sis = swp_swap_info(folio->swap); struct file *swap_file = sis->swap_file; loff_t pos = swap_dev_pos(folio->swap); @@ -381,8 +381,6 @@ static void swap_writepage_fs(struct folio *folio, struct writeback_control *wbc count_swpout_vm_event(folio); folio_start_writeback(folio); folio_unlock(folio); - if (wbc->swap_plug) - sio = *wbc->swap_plug; if (sio) { if (sio->iocb.ki_filp != swap_file || sio->iocb.ki_pos + sio->len != pos) { @@ -401,22 +399,21 @@ static void swap_writepage_fs(struct folio *folio, struct writeback_control *wbc bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0); sio->len += folio_size(folio); sio->pages += 1; - if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) { + if (sio->pages == ARRAY_SIZE(sio->bvec) || !swap_plug) { swap_write_unplug(sio); sio = NULL; } - if (wbc->swap_plug) - *wbc->swap_plug = sio; + if (swap_plug) + *swap_plug = sio; } static void swap_writepage_bdev_sync(struct folio *folio, - struct writeback_control *wbc, struct swap_info_struct *sis) + struct swap_info_struct *sis) { struct bio_vec bv; struct bio bio; - bio_init(&bio, sis->bdev, &bv, 1, - REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc)); + bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP); bio.bi_iter.bi_sector = swap_folio_sector(folio); bio_add_folio_nofail(&bio, folio, folio_size(folio), 0); @@ -431,13 +428,11 @@ static void swap_writepage_bdev_sync(struct folio *folio, } static void swap_writepage_bdev_async(struct folio *folio, - struct writeback_control *wbc, struct swap_info_struct *sis) + struct swap_info_struct *sis) { struct bio *bio; - bio = bio_alloc(sis->bdev, 1, - REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc), - GFP_NOIO); + bio = bio_alloc(sis->bdev, 1, REQ_OP_WRITE | REQ_SWAP, GFP_NOIO); bio->bi_iter.bi_sector = swap_folio_sector(folio); bio->bi_end_io = end_swap_bio_write; bio_add_folio_nofail(bio, folio, folio_size(folio), 0); @@ -449,7 +444,7 @@ static void swap_writepage_bdev_async(struct folio *folio, submit_bio(bio); } -void __swap_writepage(struct folio *folio, struct writeback_control *wbc) +void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug) { struct swap_info_struct *sis = swp_swap_info(folio->swap); @@ -460,16 +455,16 @@ void __swap_writepage(struct folio *folio, struct writeback_control *wbc) * is safe. */ if (data_race(sis->flags & SWP_FS_OPS)) - swap_writepage_fs(folio, wbc); + swap_writepage_fs(folio, swap_plug); /* * ->flags can be updated non-atomicially (scan_swap_map_slots), * but that will never affect SWP_SYNCHRONOUS_IO, so the data_race * is safe. */ else if (data_race(sis->flags & SWP_SYNCHRONOUS_IO)) - swap_writepage_bdev_sync(folio, wbc, sis); + swap_writepage_bdev_sync(folio, sis); else - swap_writepage_bdev_async(folio, wbc, sis); + swap_writepage_bdev_async(folio, sis); } void swap_write_unplug(struct swap_iocb *sio) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index b2fc5266e3d2..f72b6cd38b95 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -21,9 +21,9 @@ * consequently belong to a single zone. * * PageLRU check without isolation or lru_lock could race so that - * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable - * check without lock_page also may miss some movable non-lru pages at - * race condition. So you can't expect this function should be exact. + * MIGRATE_MOVABLE block might include unmovable pages. 
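/*
 * Illustrative sketch, not part of the patch: the plugging pattern implied by
 * swap_writeout() now taking a struct swap_iocb ** instead of a
 * writeback_control. The caller owns the plug pointer, passes its address to
 * every writeout so SWP_FS_OPS I/O can be batched, and flushes the remainder
 * with swap_write_unplug(). The loop, the folio list and the error handling
 * are reduced to a minimum; swap_writeout() and swap_write_unplug() are
 * mm-internal interfaces (assumed to live in mm/swap.h).
 */
#include <linux/mm.h>
#include "swap.h"

static void example_writeout_list(struct list_head *folios)
{
	struct swap_iocb *plug = NULL;
	struct folio *folio, *next;

	list_for_each_entry_safe(folio, next, folios, lru) {
		if (!folio_trylock(folio))
			continue;
		/* Write the folio to swap; file-backed swap I/O gathers on plug. */
		swap_writeout(folio, &plug);
	}

	if (plug)
		swap_write_unplug(plug);
}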
Similarly, pages + * with movable_ops can only be identified some time after they were + * allocated. So you can't expect this function should be exact. * * Returns a page without holding a reference. If the caller wants to * dereference that page (e.g., dumping), it has to make sure that it @@ -31,7 +31,7 @@ * */ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int flags) + enum pb_isolate_mode mode) { struct page *page = pfn_to_page(start_pfn); struct zone *zone = page_zone(page); @@ -46,7 +46,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e * isolate CMA pageblocks even when they are not movable in fact * so consider them movable here. */ - if (is_migrate_cma(migratetype)) + if (mode == PB_ISOLATE_MODE_CMA_ALLOC) return NULL; return page; @@ -92,7 +92,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e h = size_to_hstate(folio_size(folio)); if (h && !hugepage_migration_supported(h)) return page; - } else if (!folio_test_lru(folio) && !__folio_test_movable(folio)) { + } else if (!folio_test_lru(folio)) { return page; } @@ -117,7 +117,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e * The HWPoisoned page may be not in buddy system, and * page_count() is not 0. */ - if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) + if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageHWPoison(page)) continue; /* @@ -130,10 +130,10 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e * move these pages that still have a reference count > 0. * (false negatives in this function only) */ - if ((flags & MEMORY_OFFLINE) && PageOffline(page)) + if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageOffline(page)) continue; - if (__PageMovable(page) || PageLRU(page)) + if (PageLRU(page) || page_has_movable_ops(page)) continue; /* @@ -151,7 +151,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e * present in [start_pfn, end_pfn). The pageblock must intersect with * [start_pfn, end_pfn). */ -static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags, +static int set_migratetype_isolate(struct page *page, enum pb_isolate_mode mode, unsigned long start_pfn, unsigned long end_pfn) { struct zone *zone = page_zone(page); @@ -186,9 +186,9 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ end_pfn); unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, - migratetype, isol_flags); + mode); if (!unmovable) { - if (!move_freepages_block_isolate(zone, page, MIGRATE_ISOLATE)) { + if (!pageblock_isolate_and_move_free_pages(zone, page)) { spin_unlock_irqrestore(&zone->lock, flags); return -EBUSY; } @@ -198,7 +198,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ } spin_unlock_irqrestore(&zone->lock, flags); - if (isol_flags & REPORT_FAILURE) { + if (mode == PB_ISOLATE_MODE_MEM_OFFLINE) { /* * printk() with zone->lock held will likely trigger a * lockdep splat, so defer it here. 
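/*
 * Illustrative sketch, not part of the patch: an isolate / test / undo
 * sequence with the new enum pb_isolate_mode interface, which replaces the
 * old (migratetype, flags) pair. PB_ISOLATE_MODE_MEM_OFFLINE also makes the
 * checks skip HWPoison and PageOffline pages; error handling is reduced to
 * the minimum and the helper name is a placeholder.
 */
#include <linux/page-isolation.h>

static int example_isolate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;

	ret = start_isolate_page_range(start_pfn, end_pfn,
				       PB_ISOLATE_MODE_MEM_OFFLINE);
	if (ret)
		return ret;

	/* ... migrate or offline everything in [start_pfn, end_pfn) ... */

	ret = test_pages_isolated(start_pfn, end_pfn,
				  PB_ISOLATE_MODE_MEM_OFFLINE);

	undo_isolate_page_range(start_pfn, end_pfn);
	return ret;
}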
@@ -209,7 +209,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ return -EBUSY; } -static void unset_migratetype_isolate(struct page *page, int migratetype) +static void unset_migratetype_isolate(struct page *page) { struct zone *zone; unsigned long flags; @@ -262,10 +262,10 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) * Isolating this block already succeeded, so this * should not fail on zone boundaries. */ - WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype)); + WARN_ON_ONCE(!pageblock_unisolate_and_move_free_pages(zone, page)); } else { - set_pageblock_migratetype(page, migratetype); - __putback_isolated_page(page, order, migratetype); + clear_pageblock_isolate(page); + __putback_isolated_page(page, order, get_pageblock_migratetype(page)); } zone->nr_isolate_pageblock--; out: @@ -292,11 +292,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * isolate_single_pageblock() -- tries to isolate a pageblock that might be * within a free or in-use page. * @boundary_pfn: pageblock-aligned pfn that a page might cross - * @flags: isolation flags + * @mode: isolation mode * @isolate_before: isolate the pageblock before the boundary_pfn * @skip_isolation: the flag to skip the pageblock isolation in second * isolate_single_pageblock() - * @migratetype: migrate type to set in error recovery. * * Free and in-use pages can be as big as MAX_PAGE_ORDER and contain more than one * pageblock. When not all pageblocks within a page are isolated at the same @@ -311,8 +310,9 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * either. The function handles this by splitting the free page or migrating * the in-use page then splitting the free page. */ -static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, - bool isolate_before, bool skip_isolation, int migratetype) +static int isolate_single_pageblock(unsigned long boundary_pfn, + enum pb_isolate_mode mode, bool isolate_before, + bool skip_isolation) { unsigned long start_pfn; unsigned long isolate_pageblock; @@ -338,12 +338,11 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, zone->zone_start_pfn); if (skip_isolation) { - int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); - - VM_BUG_ON(!is_migrate_isolate(mt)); + VM_BUG_ON(!get_pageblock_isolate(pfn_to_page(isolate_pageblock))); } else { - ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype, - flags, isolate_pageblock, isolate_pageblock + pageblock_nr_pages); + ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), + mode, isolate_pageblock, + isolate_pageblock + pageblock_nr_pages); if (ret) return ret; @@ -383,7 +382,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, if (PageBuddy(page)) { int order = buddy_order(page); - /* move_freepages_block_isolate() handled this */ + /* pageblock_isolate_and_move_free_pages() handled this */ VM_WARN_ON_ONCE(pfn + (1 << order) > boundary_pfn); pfn += 1UL << order; @@ -422,7 +421,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, * proper free and split handling for them. 
*/ VM_WARN_ON_ONCE_PAGE(PageLRU(page), page); - VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page); + VM_WARN_ON_ONCE_PAGE(page_has_movable_ops(page), page); goto failed; } @@ -433,7 +432,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, failed: /* restore the original migratetype */ if (!skip_isolation) - unset_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype); + unset_migratetype_isolate(pfn_to_page(isolate_pageblock)); return -EBUSY; } @@ -441,14 +440,7 @@ failed: * start_isolate_page_range() - mark page range MIGRATE_ISOLATE * @start_pfn: The first PFN of the range to be isolated. * @end_pfn: The last PFN of the range to be isolated. - * @migratetype: Migrate type to set in error recovery. - * @flags: The following flags are allowed (they can be combined in - * a bit mask) - * MEMORY_OFFLINE - isolate to offline (!allocate) memory - * e.g., skip over PageHWPoison() pages - * and PageOffline() pages. - * REPORT_FAILURE - report details about the failure to - * isolate the range + * @mode: isolation mode * * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in * the range will never be allocated. Any free pages and pages freed in the @@ -481,7 +473,7 @@ failed: * Return: 0 on success and -EBUSY if any part of range cannot be isolated. */ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int flags) + enum pb_isolate_mode mode) { unsigned long pfn; struct page *page; @@ -492,8 +484,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, bool skip_isolation = false; /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */ - ret = isolate_single_pageblock(isolate_start, flags, false, - skip_isolation, migratetype); + ret = isolate_single_pageblock(isolate_start, mode, false, + skip_isolation); if (ret) return ret; @@ -501,10 +493,9 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, skip_isolation = true; /* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */ - ret = isolate_single_pageblock(isolate_end, flags, true, - skip_isolation, migratetype); + ret = isolate_single_pageblock(isolate_end, mode, true, skip_isolation); if (ret) { - unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype); + unset_migratetype_isolate(pfn_to_page(isolate_start)); return ret; } @@ -513,12 +504,11 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn < isolate_end - pageblock_nr_pages; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (page && set_migratetype_isolate(page, migratetype, flags, - start_pfn, end_pfn)) { - undo_isolate_page_range(isolate_start, pfn, migratetype); + if (page && set_migratetype_isolate(page, mode, start_pfn, + end_pfn)) { + undo_isolate_page_range(isolate_start, pfn); unset_migratetype_isolate( - pfn_to_page(isolate_end - pageblock_nr_pages), - migratetype); + pfn_to_page(isolate_end - pageblock_nr_pages)); return -EBUSY; } } @@ -529,13 +519,10 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * undo_isolate_page_range - undo effects of start_isolate_page_range() * @start_pfn: The first PFN of the isolated range * @end_pfn: The last PFN of the isolated range - * @migratetype: New migrate type to set on the range * - * This finds every MIGRATE_ISOLATE page block in the given range - * and switches it to @migratetype. 
+ * This finds and unsets every MIGRATE_ISOLATE page block in the given range */ -void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - int migratetype) +void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; struct page *page; @@ -548,7 +535,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, page = __first_valid_page(pfn, pageblock_nr_pages); if (!page || !is_migrate_isolate_page(page)) continue; - unset_migratetype_isolate(page, migratetype); + unset_migratetype_isolate(page); } } /* @@ -560,7 +547,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, */ static unsigned long __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, - int flags) + enum pb_isolate_mode mode) { struct page *page; @@ -573,11 +560,12 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, * simple way to verify that as VM_BUG_ON(), though. */ pfn += 1 << buddy_order(page); - else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) + else if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && + PageHWPoison(page)) /* A HWPoisoned page cannot be also PageBuddy */ pfn++; - else if ((flags & MEMORY_OFFLINE) && PageOffline(page) && - !page_count(page)) + else if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && + PageOffline(page) && !page_count(page)) /* * The responsible driver agreed to skip PageOffline() * pages when offlining memory by dropping its @@ -595,11 +583,11 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, * test_pages_isolated - check if pageblocks in range are isolated * @start_pfn: The first PFN of the isolated range * @end_pfn: The first PFN *after* the isolated range - * @isol_flags: Testing mode flags + * @mode: Testing mode * * This tests if all in the specified range are free. * - * If %MEMORY_OFFLINE is specified in @flags, it will consider + * If %PB_ISOLATE_MODE_MEM_OFFLINE specified in @mode, it will consider * poisoned and offlined pages free as well. * * Caller must ensure the requested range doesn't span zones. @@ -607,7 +595,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, * Returns 0 if true, -EBUSY if one or more pages are in use. */ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, - int isol_flags) + enum pb_isolate_mode mode) { unsigned long pfn, flags; struct page *page; @@ -643,7 +631,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, /* Check all pages are free or marked as ISOLATED */ zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); - pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, isol_flags); + pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, mode); spin_unlock_irqrestore(&zone->lock, flags); ret = pfn < end_pfn ? 
-EBUSY : 0; diff --git a/mm/page_owner.c b/mm/page_owner.c index 9928c9ac8c31..c3ca21132c2c 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -333,9 +333,9 @@ noinline void __set_page_owner(struct page *page, unsigned short order, inc_stack_record_count(handle, gfp_mask, 1 << order); } -void __set_page_owner_migrate_reason(struct page *page, int reason) +void __folio_set_owner_migrate_reason(struct folio *folio, int reason) { - struct page_ext *page_ext = page_ext_get(page); + struct page_ext *page_ext = page_ext_get(&folio->page); struct page_owner *page_owner; if (unlikely(!page_ext)) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index e463c3be934a..e981a1a292d2 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -246,8 +246,7 @@ restart: */ pmde = pmdp_get_lockless(pvmw->pmd); - if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde) || - (pmd_present(pmde) && pmd_devmap(pmde))) { + if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { pvmw->ptl = pmd_lock(mm, pvmw->pmd); pmde = *pvmw->pmd; if (!pmd_present(pmde)) { @@ -262,7 +261,7 @@ restart: return not_found(pvmw); return true; } - if (likely(pmd_trans_huge(pmde) || pmd_devmap(pmde))) { + if (likely(pmd_trans_huge(pmde))) { if (pvmw->flags & PVMW_MIGRATION) return not_found(pvmw); if (!check_pmd(pmd_pfn(pmde), pvmw)) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index e478777c86e1..648038247a8d 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -143,8 +143,7 @@ again: * We are ONLY installing, so avoid unnecessarily * splitting a present huge page. */ - if (pmd_present(*pmd) && - (pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) + if (pmd_present(*pmd) && pmd_trans_huge(*pmd)) continue; } @@ -210,8 +209,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, * We are ONLY installing, so avoid unnecessarily * splitting a present huge page. */ - if (pud_present(*pud) && - (pud_trans_huge(*pud) || pud_devmap(*pud))) + if (pud_present(*pud) && pud_trans_huge(*pud)) continue; } @@ -422,7 +420,7 @@ static inline void process_mm_walk_lock(struct mm_struct *mm, { if (walk_lock == PGWALK_RDLOCK) mmap_assert_locked(mm); - else + else if (walk_lock != PGWALK_VMA_RDLOCK_VERIFY) mmap_assert_write_locked(mm); } @@ -437,6 +435,9 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma, case PGWALK_WRLOCK_VERIFY: vma_assert_write_locked(vma); break; + case PGWALK_VMA_RDLOCK_VERIFY: + vma_assert_locked(vma); + break; case PGWALK_RDLOCK: /* PGWALK_RDLOCK is handled by process_mm_walk_lock */ break; @@ -585,8 +586,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, } /** - * walk_page_range_novma - walk a range of pagetables not backed by a vma - * @mm: mm_struct representing the target process of page table walk + * walk_kernel_page_table_range - walk a range of kernel pagetables. * @start: start address of the virtual address range * @end: end address of the virtual address range * @ops: operation to call during the walk @@ -596,17 +596,61 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, * Similar to walk_page_range() but can walk any page tables even if they are * not backed by VMAs. Because 'unusual' entries may be walked this function * will also not lock the PTEs for the pte_entry() callback. This is useful for - * walking the kernel pages tables or page tables for firmware. + * walking kernel pages tables or page tables for firmware. 
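/*
 * Illustrative sketch, not part of the patch: minimal use of the new
 * walk_kernel_page_table_range() helper. The callback and the counter are
 * made up; the requirement shown is that init_mm's mmap lock is held around
 * the walk, which the helper asserts. A NULL pgd means init_mm->pgd is walked.
 */
#include <linux/mm.h>
#include <linux/pagewalk.h>

static int example_count_pte(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *nr_present = walk->private;

	if (pte_present(ptep_get(pte)))
		(*nr_present)++;
	return 0;
}

static const struct mm_walk_ops example_ops = {
	.pte_entry	= example_count_pte,
};

static unsigned long example_count_kernel_ptes(unsigned long start,
					       unsigned long end)
{
	unsigned long nr_present = 0;

	mmap_read_lock(&init_mm);
	walk_kernel_page_table_range(start, end, &example_ops, NULL,
				     &nr_present);
	mmap_read_unlock(&init_mm);

	return nr_present;
}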
* * Note: Be careful to walk the kernel pages tables, the caller may be need to * take other effective approaches (mmap lock may be insufficient) to prevent * the intermediate kernel page tables belonging to the specified address range * from being freed (e.g. memory hot-remove). */ -int walk_page_range_novma(struct mm_struct *mm, unsigned long start, +int walk_kernel_page_table_range(unsigned long start, unsigned long end, + const struct mm_walk_ops *ops, pgd_t *pgd, void *private) +{ + struct mm_struct *mm = &init_mm; + struct mm_walk walk = { + .ops = ops, + .mm = mm, + .pgd = pgd, + .private = private, + .no_vma = true + }; + + if (start >= end) + return -EINVAL; + if (!check_ops_valid(ops)) + return -EINVAL; + + /* + * Kernel intermediate page tables are usually not freed, so the mmap + * read lock is sufficient. But there are some exceptions. + * E.g. memory hot-remove. In which case, the mmap lock is insufficient + * to prevent the intermediate kernel pages tables belonging to the + * specified address range from being freed. The caller should take + * other actions to prevent this race. + */ + mmap_assert_locked(mm); + + return walk_pgd_range(start, end, &walk); +} + +/** + * walk_page_range_debug - walk a range of pagetables not backed by a vma + * @mm: mm_struct representing the target process of page table walk + * @start: start address of the virtual address range + * @end: end address of the virtual address range + * @ops: operation to call during the walk + * @pgd: pgd to walk if different from mm->pgd + * @private: private data for callbacks' usage + * + * Similar to walk_page_range() but can walk any page tables even if they are + * not backed by VMAs. Because 'unusual' entries may be walked this function + * will also not lock the PTEs for the pte_entry() callback. + * + * This is for debugging purposes ONLY. + */ +int walk_page_range_debug(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, - pgd_t *pgd, - void *private) + pgd_t *pgd, void *private) { struct mm_walk walk = { .ops = ops, @@ -616,34 +660,24 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, .no_vma = true }; + /* For convenience, we allow traversal of kernel mappings. */ + if (mm == &init_mm) + return walk_kernel_page_table_range(start, end, ops, + pgd, private); if (start >= end || !walk.mm) return -EINVAL; if (!check_ops_valid(ops)) return -EINVAL; /* - * 1) For walking the user virtual address space: - * * The mmap lock protects the page walker from changes to the page * tables during the walk. However a read lock is insufficient to * protect those areas which don't have a VMA as munmap() detaches * the VMAs before downgrading to a read lock and actually tearing * down PTEs/page tables. In which case, the mmap write lock should - * be hold. - * - * 2) For walking the kernel virtual address space: - * - * The kernel intermediate page tables usually do not be freed, so - * the mmap map read lock is sufficient. But there are some exceptions. - * E.g. memory hot-remove. In which case, the mmap lock is insufficient - * to prevent the intermediate kernel pages tables belonging to the - * specified address range from being freed. The caller should take - * other actions to prevent this race. + * be held. 
*/ - if (mm == &init_mm) - mmap_assert_locked(walk.mm); - else - mmap_assert_write_locked(walk.mm); + mmap_assert_write_locked(mm); return walk_pgd_range(start, end, &walk); } @@ -872,7 +906,7 @@ struct folio *folio_walk_start(struct folio_walk *fw, * TODO: FW_MIGRATION support for PUD migration entries * once there are relevant users. */ - if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) { + if (!pud_present(pud) || pud_special(pud)) { spin_unlock(ptl); goto not_found; } else if (!pud_leaf(pud)) { diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index dd3590dfc23d..9b9d5d6accae 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * mm/percpu-debug.c * * Copyright (C) 2017 Facebook Inc. * Copyright (C) 2017 Dennis Zhou <dennis@kernel.org> diff --git a/mm/percpu.c b/mm/percpu.c index b35494c8ede2..d9cbaee92b60 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -3355,7 +3355,7 @@ void __init setup_per_cpu_areas(void) */ unsigned long pcpu_nr_pages(void) { - return pcpu_nr_populated * pcpu_nr_units; + return data_race(READ_ONCE(pcpu_nr_populated)) * pcpu_nr_units; } /* diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 5a882f2b10f9..567e2d084071 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -139,8 +139,7 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && - !pmd_devmap(*pmdp)); + VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; @@ -153,7 +152,7 @@ pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pud_t pud; VM_BUG_ON(address & ~HPAGE_PUD_MASK); - VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp)); + VM_BUG_ON(!pud_trans_huge(*pudp)); pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp); flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); return pud; @@ -293,7 +292,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) *pmdvalp = pmdval; if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval))) goto nomap; - if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval))) + if (unlikely(pmd_trans_huge(pmdval))) goto nomap; if (unlikely(pmd_bad(pmdval))) { pmd_clear_bad(pmd); diff --git a/mm/ptdump.c b/mm/ptdump.c index 9374f29cdc6f..b600c7f864b8 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -4,6 +4,7 @@ #include <linux/debugfs.h> #include <linux/ptdump.h> #include <linux/kasan.h> +#include "internal.h" #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) /* @@ -175,13 +176,15 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd) { const struct ptdump_range *range = st->range; + get_online_mems(); mmap_write_lock(mm); while (range->start != range->end) { - walk_page_range_novma(mm, range->start, range->end, + walk_page_range_debug(mm, range->start, range->end, &ptdump_ops, pgd, st); range++; } mmap_write_unlock(mm); + put_online_mems(); /* Flush out the last page */ st->note_page_flush(st); diff --git a/mm/readahead.c b/mm/readahead.c index 20d36d6b055e..406756d34309 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -457,7 +457,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, } void page_cache_ra_order(struct readahead_control *ractl, - struct file_ra_state *ra, 
unsigned int new_order) + struct file_ra_state *ra) { struct address_space *mapping = ractl->mapping; pgoff_t start = readahead_index(ractl); @@ -468,24 +468,21 @@ void page_cache_ra_order(struct readahead_control *ractl, unsigned int nofs; int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); - unsigned int min_ra_size = max(4, mapping_min_folio_nrpages(mapping)); + unsigned int new_order = ra->order; - /* - * Fallback when size < min_nrpages as each folio should be - * at least min_nrpages anyway. - */ - if (!mapping_large_folio_support(mapping) || ra->size < min_ra_size) + if (!mapping_large_folio_support(mapping)) { + ra->order = 0; goto fallback; + } limit = min(limit, index + ra->size - 1); - if (new_order < mapping_max_folio_order(mapping)) - new_order += 2; - new_order = min(mapping_max_folio_order(mapping), new_order); new_order = min_t(unsigned int, new_order, ilog2(ra->size)); new_order = max(new_order, min_order); + ra->order = new_order; + /* See comment in page_cache_ra_unbounded() */ nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); @@ -617,8 +614,9 @@ void page_cache_sync_ra(struct readahead_control *ractl, ra->size = min(contig_count + req_count, max_pages); ra->async_size = 1; readit: + ra->order = 0; ractl->_index = ra->start; - page_cache_ra_order(ractl, ra, 0); + page_cache_ra_order(ractl, ra); } EXPORT_SYMBOL_GPL(page_cache_sync_ra); @@ -628,8 +626,7 @@ void page_cache_async_ra(struct readahead_control *ractl, unsigned long max_pages; struct file_ra_state *ra = ractl->ra; pgoff_t index = readahead_index(ractl); - pgoff_t expected, start; - unsigned int order = folio_order(folio); + pgoff_t expected, start, end, aligned_end, align; /* no readahead */ if (!ra->ra_pages) @@ -652,7 +649,7 @@ void page_cache_async_ra(struct readahead_control *ractl, * Ramp up sizes, and push forward the readahead window. */ expected = round_down(ra->start + ra->size - ra->async_size, - 1UL << order); + folio_nr_pages(folio)); if (index == expected) { ra->start += ra->size; /* @@ -660,7 +657,6 @@ void page_cache_async_ra(struct readahead_control *ractl, * the readahead window. 
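/*
 * Illustrative sketch, not part of the patch: the sequential-hit test used by
 * page_cache_async_ra() in this hunk, with made-up numbers. The readahead
 * marker sits async_size pages before the end of the previous window,
 * rounded down to the marker folio's size; a hit on it pushes the window
 * forward by the old size.
 */
#include <stdio.h>

static unsigned long round_down_pow2(unsigned long x, unsigned long align)
{
	return x & ~(align - 1);	/* align must be a power of two */
}

int main(void)
{
	unsigned long ra_start = 128, ra_size = 64, ra_async_size = 16;
	unsigned long marker_folio_pages = 4;	/* an order-2 folio */
	unsigned long expected;

	expected = round_down_pow2(ra_start + ra_size - ra_async_size,
				   marker_folio_pages);
	printf("readahead marker expected at index %lu\n", expected);	/* 176 */
	printf("on a hit the next window starts at index %lu\n",
	       ra_start + ra_size);					/* 192 */
	return 0;
}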
*/ ra->size = max(ra->size, get_next_ra_size(ra, max_pages)); - ra->async_size = ra->size; goto readit; } @@ -681,10 +677,16 @@ void page_cache_async_ra(struct readahead_control *ractl, ra->size = start - index; /* old async_size */ ra->size += req_count; ra->size = get_next_ra_size(ra, max_pages); - ra->async_size = ra->size; readit: + ra->order += 2; + align = 1UL << min(ra->order, ffs(max_pages) - 1); + end = ra->start + ra->size; + aligned_end = round_down(end, align); + if (aligned_end > ra->start) + ra->size -= end - aligned_end; + ra->async_size = ra->size; ractl->_index = ra->start; - page_cache_ra_order(ractl, ra, order); + page_cache_ra_order(ractl, ra); } EXPORT_SYMBOL_GPL(page_cache_async_ra); diff --git a/mm/rmap.c b/mm/rmap.c index fb63d9256f09..f93ce27132ab 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -503,12 +503,12 @@ struct anon_vma *folio_get_anon_vma(const struct folio *folio) rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); - if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) + if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) goto out; if (!folio_mapped(folio)) goto out; - anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); + anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON); if (!atomic_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; @@ -550,12 +550,12 @@ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, retry: rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); - if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) + if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) goto out; if (!folio_mapped(folio)) goto out; - anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); + anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON); root_anon_vma = READ_ONCE(anon_vma->root); if (down_read_trylock(&root_anon_vma->rwsem)) { /* @@ -746,7 +746,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm) int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT; if (pending != flushed) { - arch_flush_tlb_batched_pending(mm); + flush_tlb_mm(mm); /* * If the new TLB flushing is pending during flushing, leave * mm->tlb_flush_batched as is, to avoid losing flushing. @@ -839,7 +839,7 @@ out: struct folio_referenced_arg { int mapcount; int referenced; - unsigned long vm_flags; + vm_flags_t vm_flags; struct mem_cgroup *memcg; }; @@ -984,7 +984,7 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) * the function bailed out due to rmap lock contention. */ int folio_referenced(struct folio *folio, int is_locked, - struct mem_cgroup *memcg, unsigned long *vm_flags) + struct mem_cgroup *memcg, vm_flags_t *vm_flags) { bool we_locked = false; struct folio_referenced_arg pra = { @@ -1334,9 +1334,9 @@ void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma) VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_VMA(!anon_vma, vma); - anon_vma += PAGE_MAPPING_ANON; + anon_vma += FOLIO_MAPPING_ANON; /* - * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written + * Ensure that anon_vma and the FOLIO_MAPPING_ANON bit are written * simultaneously, so a concurrent reader (eg folio_referenced()'s * folio_test_anon()) will not see one without the other. */ @@ -1367,10 +1367,10 @@ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma, /* * page_idle does a lockless/optimistic rmap scan on folio->mapping. 
* Make sure the compiler doesn't split the stores of anon_vma and - * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code + * the FOLIO_MAPPING_ANON type identifier, otherwise the rmap code * could mistake the mapping for a struct address_space and crash. */ - anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + anon_vma = (void *) anon_vma + FOLIO_MAPPING_ANON; WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma); folio->index = linear_page_index(vma, address); } @@ -1845,23 +1845,30 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page, #endif } -/* We support batch unmapping of PTEs for lazyfree large folios */ -static inline bool can_batch_unmap_folio_ptes(unsigned long addr, - struct folio *folio, pte_t *ptep) +static inline unsigned int folio_unmap_pte_batch(struct folio *folio, + struct page_vma_mapped_walk *pvmw, + enum ttu_flags flags, pte_t pte) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; - int max_nr = folio_nr_pages(folio); - pte_t pte = ptep_get(ptep); + unsigned long end_addr, addr = pvmw->address; + struct vm_area_struct *vma = pvmw->vma; + unsigned int max_nr; + + if (flags & TTU_HWPOISON) + return 1; + if (!folio_test_large(folio)) + return 1; + /* We may only batch within a single VMA and a single page table. */ + end_addr = pmd_addr_end(addr, vma->vm_end); + max_nr = (end_addr - addr) >> PAGE_SHIFT; + + /* We only support lazyfree batching for now ... */ if (!folio_test_anon(folio) || folio_test_swapbacked(folio)) - return false; + return 1; if (pte_unused(pte)) - return false; - if (pte_pfn(pte) != folio_pfn(folio)) - return false; + return 1; - return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL, - NULL, NULL) == max_nr; + return folio_pte_batch(folio, pvmw->pte, pte, max_nr); } /* @@ -2024,9 +2031,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, if (pte_dirty(pteval)) folio_mark_dirty(folio); } else if (likely(pte_present(pteval))) { - if (folio_test_large(folio) && !(flags & TTU_HWPOISON) && - can_batch_unmap_folio_ptes(address, folio, pvmw.pte)) - nr_pages = folio_nr_pages(folio); + nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval); end_addr = address + nr_pages * PAGE_SIZE; flush_cache_range(vma, address, end_addr); @@ -2206,13 +2211,16 @@ discard: hugetlb_remove_rmap(folio); } else { folio_remove_rmap_ptes(folio, subpage, nr_pages, vma); - folio_ref_sub(folio, nr_pages - 1); } if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); - folio_put(folio); - /* We have already batched the entire folio */ - if (nr_pages > 1) + folio_put_refs(folio, nr_pages); + + /* + * If we are sure that we batched the entire folio and cleared + * all PTEs, we can just optimize and stop right here. 
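
For the folio_unmap_pte_batch() hunk above, the batch is capped so it never crosses a page-table (PMD) boundary or the VMA end. A sketch of that bound, assuming 4K pages and 2M PMDs, with pmd_addr_end() re-implemented locally the way the generic macro behaves:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SIZE	(1UL << 21)
#define PMD_MASK	(~(PMD_SIZE - 1))

/* End of the current page table, clamped to the VMA end. */
static unsigned long pmd_addr_end(unsigned long addr, unsigned long end)
{
	unsigned long boundary = (addr + PMD_SIZE) & PMD_MASK;

	return boundary - 1 < end - 1 ? boundary : end;
}

int main(void)
{
	unsigned long addr = 0x7f0000101000UL, vm_end = 0x7f0000400000UL;
	unsigned long end_addr = pmd_addr_end(addr, vm_end);
	unsigned int max_nr = (end_addr - addr) >> PAGE_SHIFT;

	/* At most this many PTEs can be batched without crossing a PMD. */
	printf("max_nr=%u\n", max_nr);			/* 255 */
	return 0;
}
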
+ */ + if (nr_pages == folio_nr_pages(folio)) goto walk_done; continue; walk_abort: diff --git a/mm/secretmem.c b/mm/secretmem.c index 589b26c2d553..60137305bc20 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -54,7 +54,6 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf) pgoff_t offset = vmf->pgoff; gfp_t gfp = vmf->gfp_mask; unsigned long addr; - struct page *page; struct folio *folio; vm_fault_t ret; int err; @@ -65,16 +64,15 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf) filemap_invalidate_lock_shared(mapping); retry: - page = find_lock_page(mapping, offset); - if (!page) { + folio = filemap_lock_folio(mapping, offset); + if (IS_ERR(folio)) { folio = folio_alloc(gfp | __GFP_ZERO, 0); if (!folio) { ret = VM_FAULT_OOM; goto out; } - page = &folio->page; - err = set_direct_map_invalid_noflush(page); + err = set_direct_map_invalid_noflush(folio_page(folio, 0)); if (err) { folio_put(folio); ret = vmf_error(err); @@ -90,7 +88,7 @@ retry: * already happened when we marked the page invalid * which guarantees that this call won't fail */ - set_direct_map_default_noflush(page); + set_direct_map_default_noflush(folio_page(folio, 0)); if (err == -EEXIST) goto retry; @@ -98,11 +96,11 @@ retry: goto out; } - addr = (unsigned long)page_address(page); + addr = (unsigned long)folio_address(folio); flush_tlb_kernel_range(addr, addr + PAGE_SIZE); } - vmf->page = page; + vmf->page = folio_file_page(folio, vmf->pgoff); ret = VM_FAULT_LOCKED; out: @@ -154,7 +152,7 @@ static int secretmem_migrate_folio(struct address_space *mapping, static void secretmem_free_folio(struct folio *folio) { - set_direct_map_default_noflush(&folio->page); + set_direct_map_default_noflush(folio_page(folio, 0)); folio_zero_segment(folio, 0, folio_size(folio)); } @@ -195,20 +193,13 @@ static struct file *secretmem_file_create(unsigned long flags) struct file *file; struct inode *inode; const char *anon_name = "[secretmem]"; - int err; - inode = alloc_anon_inode(secretmem_mnt->mnt_sb); + inode = anon_inode_make_secure_inode(secretmem_mnt->mnt_sb, anon_name, NULL); if (IS_ERR(inode)) return ERR_CAST(inode); - err = security_inode_init_security_anon(inode, &QSTR(anon_name), NULL); - if (err) { - file = ERR_PTR(err); - goto err_free_inode; - } - file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", - O_RDWR, &secretmem_fops); + O_RDWR | O_LARGEFILE, &secretmem_fops); if (IS_ERR(file)) goto err_free_inode; @@ -222,6 +213,8 @@ static struct file *secretmem_file_create(unsigned long flags) inode->i_mode |= S_IFREG; inode->i_size = 0; + atomic_inc(&secretmem_users); + return file; err_free_inode: @@ -255,9 +248,6 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) goto err_put_fd; } - file->f_flags |= O_LARGEFILE; - - atomic_inc(&secretmem_users); fd_install(fd, file); return fd; @@ -268,7 +258,15 @@ err_put_fd: static int secretmem_init_fs_context(struct fs_context *fc) { - return init_pseudo(fc, SECRETMEM_MAGIC) ? 
0 : -ENOMEM; + struct pseudo_fs_context *ctx; + + ctx = init_pseudo(fc, SECRETMEM_MAGIC); + if (!ctx) + return -ENOMEM; + + fc->s_iflags |= SB_I_NOEXEC; + fc->s_iflags |= SB_I_NODEV; + return 0; } static struct file_system_type secretmem_fs = { @@ -286,9 +284,6 @@ static int __init secretmem_init(void) if (IS_ERR(secretmem_mnt)) return PTR_ERR(secretmem_mnt); - /* prevent secretmem mappings from ever getting PROT_EXEC */ - secretmem_mnt->mnt_flags |= MNT_NOEXEC; - return 0; } fs_initcall(secretmem_init); diff --git a/mm/shmem.c b/mm/shmem.c index 0c5fb4ffa03a..7fdd707ac1ac 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -292,7 +292,7 @@ bool vma_is_shmem(struct vm_area_struct *vma) } static LIST_HEAD(shmem_swaplist); -static DEFINE_MUTEX(shmem_swaplist_mutex); +static DEFINE_SPINLOCK(shmem_swaplist_lock); #ifdef CONFIG_TMPFS_QUOTA @@ -432,10 +432,13 @@ static void shmem_free_inode(struct super_block *sb, size_t freed_ispace) * * But normally info->alloced == inode->i_mapping->nrpages + info->swapped * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) + * + * Return: true if swapped was incremented from 0, for shmem_writeout(). */ -static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) +static bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped) { struct shmem_inode_info *info = SHMEM_I(inode); + bool first_swapped = false; long freed; spin_lock(&info->lock); @@ -450,8 +453,11 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) * to stop a racing shmem_recalc_inode() from thinking that a page has * been freed. Compensate here, to avoid the need for a followup call. */ - if (swapped > 0) + if (swapped > 0) { + if (info->swapped == swapped) + first_swapped = true; freed += swapped; + } if (freed > 0) info->alloced -= freed; spin_unlock(&info->lock); @@ -459,6 +465,7 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) /* The quota case may block */ if (freed > 0) shmem_inode_unacct_blocks(inode, freed); + return first_swapped; } bool shmem_charge(struct inode *inode, long pages) @@ -615,7 +622,7 @@ static unsigned int shmem_get_orders_within_size(struct inode *inode, static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, - unsigned long vm_flags) + vm_flags_t vm_flags) { unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ? 0 : BIT(HPAGE_PMD_ORDER); @@ -862,7 +869,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, - unsigned long vm_flags) + vm_flags_t vm_flags) { return 0; } @@ -1375,11 +1382,11 @@ static void shmem_evict_inode(struct inode *inode) /* Wait while shmem_unuse() is scanning this inode... 
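
Relevant to the shmem_recalc_inode() change above: the function now reports whether this call moved the inode's swapped count from zero to non-zero, so shmem_writeout() only takes the (now spinlock-protected) swaplist when the inode first gains swap. A small sketch of that transition test with a mocked inode counter, not the shmem structures themselves:

#include <stdio.h>
#include <stdbool.h>

struct mock_inode_info {
	long swapped;		/* pages of this inode currently in swap */
};

/* Returns true only when this call takes swapped from 0 to non-zero. */
static bool account_swapped(struct mock_inode_info *info, long swapped)
{
	bool first_swapped = false;

	info->swapped += swapped;
	if (swapped > 0 && info->swapped == swapped)
		first_swapped = true;
	return first_swapped;
}

int main(void)
{
	struct mock_inode_info info = { .swapped = 0 };

	printf("%d\n", account_swapped(&info, 4));	/* 1: add to swaplist */
	printf("%d\n", account_swapped(&info, 4));	/* 0: already listed  */
	return 0;
}
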
*/ wait_var_event(&info->stop_eviction, !atomic_read(&info->stop_eviction)); - mutex_lock(&shmem_swaplist_mutex); + spin_lock(&shmem_swaplist_lock); /* ...but beware of the race if we peeked too early */ if (!atomic_read(&info->stop_eviction)) list_del_init(&info->swaplist); - mutex_unlock(&shmem_swaplist_mutex); + spin_unlock(&shmem_swaplist_lock); } } @@ -1502,7 +1509,7 @@ int shmem_unuse(unsigned int type) if (list_empty(&shmem_swaplist)) return 0; - mutex_lock(&shmem_swaplist_mutex); + spin_lock(&shmem_swaplist_lock); start_over: list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { @@ -1516,12 +1523,12 @@ start_over: * (igrab() would protect from unlink, but not from unmount). */ atomic_inc(&info->stop_eviction); - mutex_unlock(&shmem_swaplist_mutex); + spin_unlock(&shmem_swaplist_lock); error = shmem_unuse_inode(&info->vfs_inode, type); cond_resched(); - mutex_lock(&shmem_swaplist_mutex); + spin_lock(&shmem_swaplist_lock); if (atomic_dec_and_test(&info->stop_eviction)) wake_up_var(&info->stop_eviction); if (error) @@ -1532,7 +1539,7 @@ start_over: if (!info->swapped) list_del_init(&info->swaplist); } - mutex_unlock(&shmem_swaplist_mutex); + spin_unlock(&shmem_swaplist_lock); return error; } @@ -1540,11 +1547,13 @@ start_over: /** * shmem_writeout - Write the folio to swap * @folio: The folio to write - * @wbc: How writeback is to be done + * @plug: swap plug + * @folio_list: list to put back folios on split * * Move the folio from the page cache to the swap cache. */ -int shmem_writeout(struct folio *folio, struct writeback_control *wbc) +int shmem_writeout(struct folio *folio, struct swap_iocb **plug, + struct list_head *folio_list) { struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; @@ -1554,9 +1563,6 @@ int shmem_writeout(struct folio *folio, struct writeback_control *wbc) int nr_pages; bool split = false; - if (WARN_ON_ONCE(!wbc->for_reclaim)) - goto redirty; - if ((info->flags & VM_LOCKED) || sbinfo->noswap) goto redirty; @@ -1583,7 +1589,7 @@ int shmem_writeout(struct folio *folio, struct writeback_control *wbc) try_split: /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); - if (split_folio_to_list(folio, wbc->list)) + if (split_folio_to_list(folio, folio_list)) goto redirty; folio_clear_dirty(folio); } @@ -1623,38 +1629,64 @@ try_split: folio_mark_uptodate(folio); } - /* - * Add inode to shmem_unuse()'s list of swapped-out inodes, - * if it's not already there. Do it now before the folio is - * moved to swap cache, when its pagelock no longer protects - * the inode from eviction. But don't unlock the mutex until - * we've incremented swapped, because shmem_unuse_inode() will - * prune a !swapped inode from the swaplist under this mutex. - */ - mutex_lock(&shmem_swaplist_mutex); - if (list_empty(&info->swaplist)) - list_add(&info->swaplist, &shmem_swaplist); - if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) { - shmem_recalc_inode(inode, 0, nr_pages); + bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); + int error; + + /* + * Add inode to shmem_unuse()'s list of swapped-out inodes, + * if it's not already there. Do it now before the folio is + * removed from page cache, when its pagelock no longer + * protects the inode from eviction. And do it now, after + * we've incremented swapped, because shmem_unuse() will + * prune a !swapped inode from the swaplist. 
+ */ + if (first_swapped) { + spin_lock(&shmem_swaplist_lock); + if (list_empty(&info->swaplist)) + list_add(&info->swaplist, &shmem_swaplist); + spin_unlock(&shmem_swaplist_lock); + } + swap_shmem_alloc(folio->swap, nr_pages); shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); - mutex_unlock(&shmem_swaplist_mutex); BUG_ON(folio_mapped(folio)); - return swap_writeout(folio, wbc); + error = swap_writeout(folio, plug); + if (error != AOP_WRITEPAGE_ACTIVATE) { + /* folio has been unlocked */ + return error; + } + + /* + * The intention here is to avoid holding on to the swap when + * zswap was unable to compress and unable to writeback; but + * it will be appropriate if other reactivate cases are added. + */ + error = shmem_add_to_page_cache(folio, mapping, index, + swp_to_radix_entry(folio->swap), + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + /* Swap entry might be erased by racing shmem_free_swap() */ + if (!error) { + shmem_recalc_inode(inode, 0, -nr_pages); + swap_free_nr(folio->swap, nr_pages); + } + + /* + * The delete_from_swap_cache() below could be left for + * shrink_folio_list()'s folio_free_swap() to dispose of; + * but I'm a little nervous about letting this folio out of + * shmem_writeout() in a hybrid half-tmpfs-half-swap state + * e.g. folio_mapping(folio) might give an unexpected answer. + */ + delete_from_swap_cache(folio); + goto redirty; } - if (!info->swapped) - list_del_init(&info->swaplist); - mutex_unlock(&shmem_swaplist_mutex); if (nr_pages > 1) goto try_split; redirty: folio_mark_dirty(folio); - if (wbc->for_reclaim) - return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ - folio_unlock(folio); - return 0; + return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ } EXPORT_SYMBOL_GPL(shmem_writeout); @@ -1757,7 +1789,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, { unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); - unsigned long vm_flags = vma ? vma->vm_flags : 0; + vm_flags_t vm_flags = vma ? vma->vm_flags : 0; unsigned int global_orders; if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) @@ -2259,6 +2291,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, folio = swap_cache_get_folio(swap, NULL, 0); order = xa_get_order(&mapping->i_pages, index); if (!folio) { + int nr_pages = 1 << order; bool fallback_order0 = false; /* Or update major stats only when swapin succeeds?? */ @@ -2272,9 +2305,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * If uffd is active for the vma, we need per-page fault * fidelity to maintain the uffd semantics, then fallback * to swapin order-0 folio, as well as for zswap case. + * Any existing sub folio in the swap cache also blocks + * mTHP swapin. */ if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) || - !zswap_never_enabled())) + !zswap_never_enabled() || + non_swapcache_batch(swap, nr_pages) != nr_pages)) fallback_order0 = true; /* Skip swapcache for synchronous device. 
*/ @@ -3266,9 +3302,9 @@ static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; static int -shmem_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); @@ -3300,9 +3336,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping, } static int -shmem_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +shmem_write_end(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; @@ -4183,7 +4219,7 @@ static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, #ifdef CONFIG_TMPFS_XATTR -static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) +static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); @@ -4193,7 +4229,7 @@ static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) } static int shmem_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); @@ -4980,7 +5016,6 @@ static void shmem_put_super(struct super_block *sb) static const struct dentry_operations shmem_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, - .d_delete = always_delete_dentry, }; #endif @@ -5028,7 +5063,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) if (ctx->encoding) { sb->s_encoding = ctx->encoding; - sb->s_d_op = &shmem_ci_dentry_ops; + set_default_d_op(sb, &shmem_ci_dentry_ops); if (ctx->strict_encoding) sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL; } @@ -5037,6 +5072,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) #else sb->s_flags |= SB_NOUSER; #endif /* CONFIG_TMPFS */ + sb->s_d_flags |= DCACHE_DONTCACHE; sbinfo->max_blocks = ctx->blocks; sbinfo->max_inodes = ctx->inodes; sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE; diff --git a/mm/show_mem.c b/mm/show_mem.c index 0cf8bf5d832d..41999e94a56d 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -246,7 +246,6 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z " shmem_pmdmapped:%lukB" " anon_thp:%lukB" #endif - " writeback_tmp:%lukB" " kernel_stack:%lukB" #ifdef CONFIG_SHADOW_CALL_STACK " shadow_call_stack:%lukB" @@ -273,7 +272,6 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), K(node_page_state(pgdat, NR_ANON_THPS)), #endif - K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), node_page_state(pgdat, NR_KERNEL_STACK_KB), #ifdef CONFIG_SHADOW_CALL_STACK node_page_state(pgdat, NR_KERNEL_SCS_KB), diff --git a/mm/slab.h b/mm/slab.h index 05a21dc796e0..248b34c839b7 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -50,7 +50,7 @@ typedef union { /* Reuses the bits in struct page */ struct slab { - unsigned long __page_flags; + unsigned long flags; struct kmem_cache *slab_cache; 
union { @@ -99,7 +99,7 @@ struct slab { #define SLAB_MATCH(pg, sl) \ static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl)) -SLAB_MATCH(flags, __page_flags); +SLAB_MATCH(flags, flags); SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */ SLAB_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG @@ -167,30 +167,6 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t) */ #define slab_page(s) folio_page(slab_folio(s), 0) -/* - * If network-based swap is enabled, sl*b must keep track of whether pages - * were allocated from pfmemalloc reserves. - */ -static inline bool slab_test_pfmemalloc(const struct slab *slab) -{ - return folio_test_active(slab_folio(slab)); -} - -static inline void slab_set_pfmemalloc(struct slab *slab) -{ - folio_set_active(slab_folio(slab)); -} - -static inline void slab_clear_pfmemalloc(struct slab *slab) -{ - folio_clear_active(slab_folio(slab)); -} - -static inline void __slab_clear_pfmemalloc(struct slab *slab) -{ - __folio_clear_active(slab_folio(slab)); -} - static inline void *slab_address(const struct slab *slab) { return folio_address(slab_folio(slab)); diff --git a/mm/slub.c b/mm/slub.c index 31e11ef256f9..cf7c6032d5fd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -23,6 +23,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kasan.h> +#include <linux/node.h> #include <linux/kmsan.h> #include <linux/cpu.h> #include <linux/cpuset.h> @@ -91,14 +92,14 @@ * The partially empty slabs cached on the CPU partial list are used * for performance reasons, which speeds up the allocation process. * These slabs are not frozen, but are also exempt from list management, - * by clearing the PG_workingset flag when moving out of the node + * by clearing the SL_partial flag when moving out of the node * partial list. Please see __slab_free() for more details. * * To sum up, the current scheme is: - * - node partial slab: PG_Workingset && !frozen - * - cpu partial slab: !PG_Workingset && !frozen - * - cpu slab: !PG_Workingset && frozen - * - full slab: !PG_Workingset && !frozen + * - node partial slab: SL_partial && !frozen + * - cpu partial slab: !SL_partial && !frozen + * - cpu slab: !SL_partial && frozen + * - full slab: !SL_partial && !frozen * * list_lock * @@ -183,6 +184,22 @@ * the fast path and disables lockless freelists. */ +/** + * enum slab_flags - How the slab flags bits are used. + * @SL_locked: Is locked with slab_lock() + * @SL_partial: On the per-node partial list + * @SL_pfmemalloc: Was allocated from PF_MEMALLOC reserves + * + * The slab flags share space with the page flags but some bits have + * different interpretations. The high bits are used for information + * like zone/node/section. + */ +enum slab_flags { + SL_locked = PG_locked, + SL_partial = PG_workingset, /* Historical reasons for this bit */ + SL_pfmemalloc = PG_active, /* Historical reasons for this bit */ +}; + /* * We could simply use migrate_disable()/enable() but as long as it's a * function call even on !PREEMPT_RT, use inline preempt_disable() there. @@ -447,7 +464,7 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) /* * Tracks for which NUMA nodes we have kmem_cache_nodes allocated. - * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily + * Corresponds to node_state[N_MEMORY], but can temporarily * differ during memory hotplug/hotremove operations. * Protected by slab_mutex. 
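
On the slab.h/slub.c hunks above: the pfmemalloc and partial-list state now lives directly in slab->flags through SL_* aliases of existing page flag bits. A userspace sketch of the same flags-word pattern; the bit numbers below are placeholders for the example, whereas the kernel reuses PG_active and PG_workingset as the enum states:

#include <stdio.h>
#include <stdbool.h>

/* Illustrative bit numbers only. */
enum mock_slab_flags {
	SL_partial	= 5,
	SL_pfmemalloc	= 6,
};

struct mock_slab {
	unsigned long flags;
};

static void slab_set_pfmemalloc(struct mock_slab *slab)
{
	slab->flags |= 1UL << SL_pfmemalloc;
}

static bool slab_test_pfmemalloc(const struct mock_slab *slab)
{
	return slab->flags & (1UL << SL_pfmemalloc);
}

static void slab_clear_pfmemalloc(struct mock_slab *slab)
{
	slab->flags &= ~(1UL << SL_pfmemalloc);
}

int main(void)
{
	struct mock_slab slab = { .flags = 0 };

	slab_set_pfmemalloc(&slab);
	printf("pfmemalloc=%d\n", slab_test_pfmemalloc(&slab));	/* 1 */
	slab_clear_pfmemalloc(&slab);
	printf("pfmemalloc=%d\n", slab_test_pfmemalloc(&slab));	/* 0 */
	return 0;
}
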
*/ @@ -635,16 +652,35 @@ static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) #endif /* CONFIG_SLUB_CPU_PARTIAL */ /* + * If network-based swap is enabled, slub must keep track of whether memory + * were allocated from pfmemalloc reserves. + */ +static inline bool slab_test_pfmemalloc(const struct slab *slab) +{ + return test_bit(SL_pfmemalloc, &slab->flags); +} + +static inline void slab_set_pfmemalloc(struct slab *slab) +{ + set_bit(SL_pfmemalloc, &slab->flags); +} + +static inline void __slab_clear_pfmemalloc(struct slab *slab) +{ + __clear_bit(SL_pfmemalloc, &slab->flags); +} + +/* * Per slab locking using the pagelock */ static __always_inline void slab_lock(struct slab *slab) { - bit_spin_lock(PG_locked, &slab->__page_flags); + bit_spin_lock(SL_locked, &slab->flags); } static __always_inline void slab_unlock(struct slab *slab) { - bit_spin_unlock(PG_locked, &slab->__page_flags); + bit_spin_unlock(SL_locked, &slab->flags); } static inline bool @@ -1010,7 +1046,7 @@ static void print_slab_info(const struct slab *slab) { pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n", slab, slab->objects, slab->inuse, slab->freelist, - &slab->__page_flags); + &slab->flags); } void skip_orig_size_check(struct kmem_cache *s, const void *object) @@ -2717,23 +2753,19 @@ static void discard_slab(struct kmem_cache *s, struct slab *slab) free_slab(s, slab); } -/* - * SLUB reuses PG_workingset bit to keep track of whether it's on - * the per-node partial list. - */ static inline bool slab_test_node_partial(const struct slab *slab) { - return folio_test_workingset(slab_folio(slab)); + return test_bit(SL_partial, &slab->flags); } static inline void slab_set_node_partial(struct slab *slab) { - set_bit(PG_workingset, folio_flags(slab_folio(slab), 0)); + set_bit(SL_partial, &slab->flags); } static inline void slab_clear_node_partial(struct slab *slab) { - clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0)); + clear_bit(SL_partial, &slab->flags); } /* @@ -4269,7 +4301,12 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) flags = kmalloc_fix_flags(flags); flags |= __GFP_COMP; - folio = (struct folio *)alloc_pages_node_noprof(node, flags, order); + + if (node == NUMA_NO_NODE) + folio = (struct folio *)alloc_frozen_pages_noprof(flags, order); + else + folio = (struct folio *)__alloc_frozen_pages_noprof(flags, order, node, NULL); + if (folio) { ptr = folio_address(folio); lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, @@ -4765,7 +4802,7 @@ static void free_large_kmalloc(struct folio *folio, void *object) lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); __folio_clear_large_kmalloc(folio); - folio_put(folio); + free_frozen_pages(&folio->page, order); } /* @@ -4930,12 +4967,12 @@ alloc_new: * When slub_debug_orig_size() is off, krealloc() only knows about the bucket * size of an allocation (but not the exact size it was allocated with) and * hence implements the following semantics for shrinking and growing buffers - * with __GFP_ZERO. 
+ * with __GFP_ZERO:: * - * new bucket - * 0 size size - * |--------|----------------| - * | keep | zero | + * new bucket + * 0 size size + * |--------|----------------| + * | keep | zero | * * Otherwise, the original allocation size 'orig_size' could be used to * precisely clear the requested size, and the new size will also be stored @@ -6149,7 +6186,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) return __kmem_cache_do_shrink(s); } -static int slab_mem_going_offline_callback(void *arg) +static int slab_mem_going_offline_callback(void) { struct kmem_cache *s; @@ -6163,46 +6200,13 @@ static int slab_mem_going_offline_callback(void *arg) return 0; } -static void slab_mem_offline_callback(void *arg) -{ - struct memory_notify *marg = arg; - int offline_node; - - offline_node = marg->status_change_nid_normal; - - /* - * If the node still has available memory. we need kmem_cache_node - * for it yet. - */ - if (offline_node < 0) - return; - - mutex_lock(&slab_mutex); - node_clear(offline_node, slab_nodes); - /* - * We no longer free kmem_cache_node structures here, as it would be - * racy with all get_node() users, and infeasible to protect them with - * slab_mutex. - */ - mutex_unlock(&slab_mutex); -} - -static int slab_mem_going_online_callback(void *arg) +static int slab_mem_going_online_callback(int nid) { struct kmem_cache_node *n; struct kmem_cache *s; - struct memory_notify *marg = arg; - int nid = marg->status_change_nid_normal; int ret = 0; /* - * If the node's memory is already available, then kmem_cache_node is - * already created. Nothing to do. - */ - if (nid < 0) - return 0; - - /* * We are bringing a node online. No memory is available yet. We must * allocate a kmem_cache_node structure in order to bring the node * online. @@ -6241,21 +6245,16 @@ out: static int slab_memory_callback(struct notifier_block *self, unsigned long action, void *arg) { + struct node_notify *nn = arg; + int nid = nn->nid; int ret = 0; switch (action) { - case MEM_GOING_ONLINE: - ret = slab_mem_going_online_callback(arg); - break; - case MEM_GOING_OFFLINE: - ret = slab_mem_going_offline_callback(arg); - break; - case MEM_OFFLINE: - case MEM_CANCEL_ONLINE: - slab_mem_offline_callback(arg); + case NODE_ADDING_FIRST_MEMORY: + ret = slab_mem_going_online_callback(nid); break; - case MEM_ONLINE: - case MEM_CANCEL_OFFLINE: + case NODE_REMOVING_LAST_MEMORY: + ret = slab_mem_going_offline_callback(); break; } if (ret) @@ -6324,14 +6323,14 @@ void __init kmem_cache_init(void) * Initialize the nodemask for which we will allocate per node * structures. Here we don't need taking slab_mutex yet. 
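
For the krealloc() comment reflowed above (bytes up to the requested size are kept, the remainder of the bucket is zeroed for __GFP_ZERO callers), a minimal userspace illustration of that zeroing, with the bucket and requested sizes made up and no claim to match the slub implementation details:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	size_t bucket = 32;	/* bucket size the allocator rounded up to */
	size_t size = 20;	/* size requested by the (re)allocation     */
	unsigned char *p = malloc(bucket);

	if (!p)
		return 1;
	memset(p, 0xaa, bucket);	/* pretend this is old data */

	/* [0, size) is kept, [size, bucket) is zeroed. */
	memset(p + size, 0, bucket - size);

	printf("p[%zu]=%#x p[%zu]=%#x\n", size - 1, p[size - 1], size, p[size]);
	free(p);
	return 0;
}
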
*/ - for_each_node_state(node, N_NORMAL_MEMORY) + for_each_node_state(node, N_MEMORY) node_set(node, slab_nodes); create_boot_cache(kmem_cache_node, "kmem_cache_node", sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); - hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); + hotplug_node_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); /* Able to allocate the per node structures */ slab_state = PARTIAL; diff --git a/mm/swap.c b/mm/swap.c index 4fc322f7111a..3632dd061beb 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -237,8 +237,9 @@ void folio_rotate_reclaimable(struct folio *folio) folio_batch_add_and_move(folio, lru_move_tail, true); } -void lru_note_cost(struct lruvec *lruvec, bool file, - unsigned int nr_io, unsigned int nr_rotated) +void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, + unsigned int nr_io, unsigned int nr_rotated) + __releases(lruvec->lru_lock) { unsigned long cost; @@ -250,18 +251,14 @@ void lru_note_cost(struct lruvec *lruvec, bool file, * different between them, adjust scan balance for CPU work. */ cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; + if (!cost) { + spin_unlock_irq(&lruvec->lru_lock); + return; + } - do { + for (;;) { unsigned long lrusize; - /* - * Hold lruvec->lru_lock is safe here, since - * 1) The pinned lruvec in reclaim, or - * 2) From a pre-LRU page during refault (which also holds the - * rcu lock, so would be safe even if the page was on the LRU - * and could move simultaneously to a new lruvec). - */ - spin_lock_irq(&lruvec->lru_lock); /* Record cost event */ if (file) lruvec->file_cost += cost; @@ -285,14 +282,22 @@ void lru_note_cost(struct lruvec *lruvec, bool file, lruvec->file_cost /= 2; lruvec->anon_cost /= 2; } + spin_unlock_irq(&lruvec->lru_lock); - } while ((lruvec = parent_lruvec(lruvec))); + lruvec = parent_lruvec(lruvec); + if (!lruvec) + break; + spin_lock_irq(&lruvec->lru_lock); + } } void lru_note_cost_refault(struct folio *folio) { - lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio), - folio_nr_pages(folio), 0); + struct lruvec *lruvec; + + lruvec = folio_lruvec_lock_irq(folio); + lru_note_cost_unlock_irq(lruvec, folio_is_file_lru(folio), + folio_nr_pages(folio), 0); } static void lru_activate(struct lruvec *lruvec, struct folio *folio) diff --git a/mm/swap.h b/mm/swap.h index 2269eb9df0af..911ad5ff0f89 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -3,6 +3,8 @@ #define _MM_SWAP_H struct mempolicy; +struct swap_iocb; + extern int page_cluster; #ifdef CONFIG_SWAP @@ -20,8 +22,8 @@ static inline void swap_read_unplug(struct swap_iocb *plug) __swap_read_unplug(plug); } void swap_write_unplug(struct swap_iocb *sio); -int swap_writeout(struct folio *folio, struct writeback_control *wbc); -void __swap_writepage(struct folio *folio, struct writeback_control *wbc); +int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug); +void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug); /* linux/mm/swap_state.c */ /* One swap address space for each 64M swap space */ @@ -106,6 +108,25 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, return find_next_bit(sis->zeromap, end, start) - start; } +static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) +{ + struct swap_info_struct *si = swp_swap_info(entry); + pgoff_t offset = swp_offset(entry); + int i; + + /* + * While allocating a large folio and doing mTHP swapin, we need to + * ensure all entries are not cached, otherwise, the mTHP folio will + * be in conflict with the folio in 
swap cache. + */ + for (i = 0; i < max_nr; i++) { + if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) + return i; + } + + return i; +} + #else /* CONFIG_SWAP */ struct swap_iocb; static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) @@ -141,7 +162,8 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, return NULL; } -static inline int swap_writeout(struct folio *f, struct writeback_control *wbc) +static inline int swap_writeout(struct folio *folio, + struct swap_iocb **swap_plug) { return 0; } @@ -199,6 +221,10 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, return 0; } +static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) +{ + return 0; +} #endif /* CONFIG_SWAP */ /** diff --git a/mm/swapfile.c b/mm/swapfile.c index 68ce283e84be..b4f3cc712580 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -956,9 +956,8 @@ new_cluster: } /* - * We don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them, then - * reread cluster_next_cpu since we dropped si->lock + * We don't have free cluster but have some clusters in discarding, + * do discard now and reclaim them. */ if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si)) goto new_cluster; @@ -1115,6 +1114,7 @@ static void swap_range_alloc(struct swap_info_struct *si, if (vm_swap_full()) schedule_work(&si->reclaim_work); } + atomic_long_sub(nr_entries, &nr_swap_pages); } static void swap_range_free(struct swap_info_struct *si, unsigned long offset, @@ -1313,7 +1313,6 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp) if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL)) goto out_free; - atomic_long_sub(size, &nr_swap_pages); return 0; out_free: @@ -3141,43 +3140,30 @@ static unsigned long read_swap_header(struct swap_info_struct *si, return maxpages; } -static int setup_swap_map_and_extents(struct swap_info_struct *si, - union swap_header *swap_header, - unsigned char *swap_map, - unsigned long maxpages, - sector_t *span) +static int setup_swap_map(struct swap_info_struct *si, + union swap_header *swap_header, + unsigned char *swap_map, + unsigned long maxpages) { - unsigned int nr_good_pages; unsigned long i; - int nr_extents; - - nr_good_pages = maxpages - 1; /* omit header page */ + swap_map[0] = SWAP_MAP_BAD; /* omit header page */ for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; if (page_nr == 0 || page_nr > swap_header->info.last_page) return -EINVAL; if (page_nr < maxpages) { swap_map[page_nr] = SWAP_MAP_BAD; - nr_good_pages--; + si->pages--; } } - if (nr_good_pages) { - swap_map[0] = SWAP_MAP_BAD; - si->max = maxpages; - si->pages = nr_good_pages; - nr_extents = setup_swap_extents(si, span); - if (nr_extents < 0) - return nr_extents; - nr_good_pages = si->pages; - } - if (!nr_good_pages) { + if (!si->pages) { pr_warn("Empty swap-file\n"); return -EINVAL; } - return nr_extents; + return 0; } #define SWAP_CLUSTER_INFO_COLS \ @@ -3217,13 +3203,17 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, * Mark unusable pages as unavailable. The clusters aren't * marked free yet, so no list operations are involved yet. * - * See setup_swap_map_and_extents(): header page, bad pages, + * See setup_swap_map(): header page, bad pages, * and the EOF part of the last cluster. 
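
A sketch of the non_swapcache_batch() scan introduced above: it counts leading entries whose swap_map slot has no SWAP_HAS_CACHE bit, so an mTHP swapin can fall back to order-0 if any subpage is already in the swap cache. The map contents below are fabricated; the flag value mirrors the kernel's definition:

#include <stdio.h>

#define SWAP_HAS_CACHE	0x40	/* "entry has a swap cache folio" flag */

/* Count leading entries in map[offset..] with no swap cache folio. */
static int non_swapcache_batch(const unsigned char *map, int offset, int max_nr)
{
	int i;

	for (i = 0; i < max_nr; i++) {
		if (map[offset + i] & SWAP_HAS_CACHE)
			return i;
	}
	return i;
}

int main(void)
{
	/* Fabricated swap_map: entry 2 already has a swap cache folio. */
	unsigned char map[] = { 0x01, 0x01, 0x01 | SWAP_HAS_CACHE, 0x01 };
	int nr_pages = 4;

	if (non_swapcache_batch(map, 0, nr_pages) != nr_pages)
		printf("fall back to order-0 swapin\n");
	return 0;
}
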
*/ inc_cluster_info_page(si, cluster_info, 0); - for (i = 0; i < swap_header->info.nr_badpages; i++) - inc_cluster_info_page(si, cluster_info, - swap_header->info.badpages[i]); + for (i = 0; i < swap_header->info.nr_badpages; i++) { + unsigned int page_nr = swap_header->info.badpages[i]; + + if (page_nr >= maxpages) + continue; + inc_cluster_info_page(si, cluster_info, page_nr); + } for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) inc_cluster_info_page(si, cluster_info, i); @@ -3363,6 +3353,21 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap_unlock_inode; } + si->max = maxpages; + si->pages = maxpages - 1; + nr_extents = setup_swap_extents(si, &span); + if (nr_extents < 0) { + error = nr_extents; + goto bad_swap_unlock_inode; + } + if (si->pages != si->max - 1) { + pr_err("swap:%u != (max:%u - 1)\n", si->pages, si->max); + error = -EINVAL; + goto bad_swap_unlock_inode; + } + + maxpages = si->max; + /* OK, set up the swap map and apply the bad block list */ swap_map = vzalloc(maxpages); if (!swap_map) { @@ -3374,12 +3379,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (error) goto bad_swap_unlock_inode; - nr_extents = setup_swap_map_and_extents(si, swap_header, swap_map, - maxpages, &span); - if (unlikely(nr_extents < 0)) { - error = nr_extents; + error = setup_swap_map(si, swap_header, swap_map, maxpages); + if (error) goto bad_swap_unlock_inode; - } /* * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index bc473ad21202..cbed91b09640 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -561,7 +561,7 @@ retry: } while (src_addr < src_start + len) { - BUG_ON(dst_addr >= dst_start + len); + VM_WARN_ON_ONCE(dst_addr >= dst_start + len); /* * Serialize via vma_lock and hugetlb_fault_mutex. @@ -602,7 +602,7 @@ retry: if (unlikely(err == -ENOENT)) { up_read(&ctx->map_changing_lock); uffd_mfill_unlock(dst_vma); - BUG_ON(!folio); + VM_WARN_ON_ONCE(!folio); err = copy_folio_from_user(folio, (const void __user *)src_addr, true); @@ -614,7 +614,7 @@ retry: dst_vma = NULL; goto retry; } else - BUG_ON(folio); + VM_WARN_ON_ONCE(folio); if (!err) { dst_addr += vma_hpagesize; @@ -635,9 +635,9 @@ out_unlock_vma: out: if (folio) folio_put(folio); - BUG_ON(copied < 0); - BUG_ON(err > 0); - BUG_ON(!copied && !err); + VM_WARN_ON_ONCE(copied < 0); + VM_WARN_ON_ONCE(err > 0); + VM_WARN_ON_ONCE(!copied && !err); return copied ? copied : err; } #else /* !CONFIG_HUGETLB_PAGE */ @@ -711,12 +711,12 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, /* * Sanitize the command parameters: */ - BUG_ON(dst_start & ~PAGE_MASK); - BUG_ON(len & ~PAGE_MASK); + VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK); + VM_WARN_ON_ONCE(len & ~PAGE_MASK); /* Does the address range wrap, or is the span zero-sized? */ - BUG_ON(src_start + len <= src_start); - BUG_ON(dst_start + len <= dst_start); + VM_WARN_ON_ONCE(src_start + len <= src_start); + VM_WARN_ON_ONCE(dst_start + len <= dst_start); src_addr = src_start; dst_addr = dst_start; @@ -775,7 +775,7 @@ retry: while (src_addr < src_start + len) { pmd_t dst_pmdval; - BUG_ON(dst_addr >= dst_start + len); + VM_WARN_ON_ONCE(dst_addr >= dst_start + len); dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); if (unlikely(!dst_pmd)) { @@ -795,8 +795,8 @@ retry: * (This includes the case where the PMD used to be THP and * changed back to none after __pte_alloc().) 
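
The swapon() rework above sets si->pages to maxpages - 1 up front and lets setup_swap_map() subtract each in-range bad page. A sketch of that accounting with an invented bad-page list (the real code also rejects page 0 and out-of-range entries with -EINVAL; this only shows the counting):

#include <stdio.h>

int main(void)
{
	unsigned int maxpages = 1024;
	unsigned int badpages[] = { 7, 300, 2048 };	/* 2048 is out of range */
	unsigned int pages = maxpages - 1;		/* omit the header page */
	unsigned int i;

	for (i = 0; i < sizeof(badpages) / sizeof(badpages[0]); i++) {
		if (badpages[i] == 0 || badpages[i] >= maxpages)
			continue;			/* skipped, as in the cluster setup above */
		pages--;				/* marked SWAP_MAP_BAD */
	}

	printf("usable swap pages: %u\n", pages);	/* 1021 */
	return 0;
}
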
*/ - if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) || - pmd_devmap(dst_pmdval))) { + if (unlikely(!pmd_present(dst_pmdval) || + pmd_trans_huge(dst_pmdval))) { err = -EEXIST; break; } @@ -818,7 +818,7 @@ retry: up_read(&ctx->map_changing_lock); uffd_mfill_unlock(dst_vma); - BUG_ON(!folio); + VM_WARN_ON_ONCE(!folio); kaddr = kmap_local_folio(folio, 0); err = copy_from_user(kaddr, @@ -832,7 +832,7 @@ retry: flush_dcache_folio(folio); goto retry; } else - BUG_ON(folio); + VM_WARN_ON_ONCE(folio); if (!err) { dst_addr += PAGE_SIZE; @@ -852,9 +852,9 @@ out_unlock: out: if (folio) folio_put(folio); - BUG_ON(copied < 0); - BUG_ON(err > 0); - BUG_ON(!copied && !err); + VM_WARN_ON_ONCE(copied < 0); + VM_WARN_ON_ONCE(err > 0); + VM_WARN_ON_ONCE(!copied && !err); return copied ? copied : err; } @@ -940,11 +940,11 @@ int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, /* * Sanitize the command parameters: */ - BUG_ON(start & ~PAGE_MASK); - BUG_ON(len & ~PAGE_MASK); + VM_WARN_ON_ONCE(start & ~PAGE_MASK); + VM_WARN_ON_ONCE(len & ~PAGE_MASK); /* Does the address range wrap, or is the span zero-sized? */ - BUG_ON(start + len <= start); + VM_WARN_ON_ONCE(start + len <= start); mmap_read_lock(dst_mm); @@ -1084,8 +1084,18 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, pte_t orig_dst_pte, pte_t orig_src_pte, pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl, - struct folio *src_folio) + struct folio *src_folio, + struct swap_info_struct *si, swp_entry_t entry) { + /* + * Check if the folio still belongs to the target swap entry after + * acquiring the lock. Folio can be freed in the swap cache while + * not locked. + */ + if (src_folio && unlikely(!folio_test_swapcache(src_folio) || + entry.val != src_folio->swap.val)) + return -EAGAIN; + double_pt_lock(dst_ptl, src_ptl); if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, @@ -1102,6 +1112,25 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, if (src_folio) { folio_move_anon_rmap(src_folio, dst_vma); src_folio->index = linear_page_index(dst_vma, dst_addr); + } else { + /* + * Check if the swap entry is cached after acquiring the src_pte + * lock. Otherwise, we might miss a newly loaded swap cache folio. + * + * Check swap_map directly to minimize overhead, READ_ONCE is sufficient. + * We are trying to catch newly added swap cache, the only possible case is + * when a folio is swapped in and out again staying in swap cache, using the + * same entry before the PTE check above. The PTL is acquired and released + * twice, each time after updating the swap_map's flag. So holding + * the PTL here ensures we see the updated value. False positive is possible, + * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the + * cache, or during the tiny synchronization window between swap cache and + * swap_map, but it will be gone very quickly, worst result is retry jitters. 
+ */ + if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) { + double_pt_unlock(dst_ptl, src_ptl); + return -EAGAIN; + } } orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); @@ -1412,7 +1441,7 @@ retry: } err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, - dst_ptl, src_ptl, src_folio); + dst_ptl, src_ptl, src_folio, si, entry); } out: @@ -1709,15 +1738,13 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, ssize_t moved = 0; /* Sanitize the command parameters. */ - if (WARN_ON_ONCE(src_start & ~PAGE_MASK) || - WARN_ON_ONCE(dst_start & ~PAGE_MASK) || - WARN_ON_ONCE(len & ~PAGE_MASK)) - goto out; + VM_WARN_ON_ONCE(src_start & ~PAGE_MASK); + VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK); + VM_WARN_ON_ONCE(len & ~PAGE_MASK); /* Does the address range wrap, or is the span zero-sized? */ - if (WARN_ON_ONCE(src_start + len <= src_start) || - WARN_ON_ONCE(dst_start + len <= dst_start)) - goto out; + VM_WARN_ON_ONCE(src_start + len < src_start); + VM_WARN_ON_ONCE(dst_start + len < dst_start); err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma); if (err) @@ -1791,12 +1818,6 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, ptl = pmd_trans_huge_lock(src_pmd, src_vma); if (ptl) { - if (pmd_devmap(*src_pmd)) { - spin_unlock(ptl); - err = -ENOENT; - break; - } - /* Check if we can move the pmd without splitting it. */ if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) || !pmd_none(dst_pmdval)) { @@ -1867,18 +1888,18 @@ out_unlock: up_read(&ctx->map_changing_lock); uffd_move_unlock(dst_vma, src_vma); out: - VM_WARN_ON(moved < 0); - VM_WARN_ON(err > 0); - VM_WARN_ON(!moved && !err); + VM_WARN_ON_ONCE(moved < 0); + VM_WARN_ON_ONCE(err > 0); + VM_WARN_ON_ONCE(!moved && !err); return moved ? moved : err; } static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, - vm_flags_t flags) + vm_flags_t vm_flags) { - const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; + const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP; - vm_flags_reset(vma, flags); + vm_flags_reset(vma, vm_flags); /* * For shared mappings, we want to enable writenotify while * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply @@ -1890,12 +1911,12 @@ static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, static void userfaultfd_set_ctx(struct vm_area_struct *vma, struct userfaultfd_ctx *ctx, - unsigned long flags) + vm_flags_t vm_flags) { vma_start_write(vma); vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx}; userfaultfd_set_vm_flags(vma, - (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags); + (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags); } void userfaultfd_reset_ctx(struct vm_area_struct *vma) @@ -1941,14 +1962,14 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, /* Assumes mmap write lock taken, and mm_struct pinned. 
*/ int userfaultfd_register_range(struct userfaultfd_ctx *ctx, struct vm_area_struct *vma, - unsigned long vm_flags, + vm_flags_t vm_flags, unsigned long start, unsigned long end, bool wp_async) { VMA_ITERATOR(vmi, ctx->mm, start); struct vm_area_struct *prev = vma_prev(&vmi); unsigned long vma_end; - unsigned long new_flags; + vm_flags_t new_flags; if (vma->vm_start < start) prev = vma; @@ -1956,10 +1977,10 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx, for_each_vma_range(vmi, vma, end) { cond_resched(); - BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); - BUG_ON(vma->vm_userfaultfd_ctx.ctx && - vma->vm_userfaultfd_ctx.ctx != ctx); - WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); + VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async)); + VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx && + vma->vm_userfaultfd_ctx.ctx != ctx); + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)); /* * Nothing to do: this vma is already registered into this @@ -2035,8 +2056,8 @@ void userfaultfd_release_all(struct mm_struct *mm, prev = NULL; for_each_vma(vmi, vma) { cond_resched(); - BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ - !!(vma->vm_flags & __VM_UFFD_FLAGS)); + VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^ + !!(vma->vm_flags & __VM_UFFD_FLAGS)); if (vma->vm_userfaultfd_ctx.ctx != ctx) { prev = vma; continue; diff --git a/mm/util.c b/mm/util.c index 448117da071f..f814e6a59ab1 100644 --- a/mm/util.c +++ b/mm/util.c @@ -25,6 +25,7 @@ #include <linux/sizes.h> #include <linux/compat.h> #include <linux/fsnotify.h> +#include <linux/page_idle.h> #include <linux/uaccess.h> @@ -670,9 +671,9 @@ struct anon_vma *folio_anon_vma(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; - if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) + if ((mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) return NULL; - return (void *)(mapping - PAGE_MAPPING_ANON); + return (void *)(mapping - FOLIO_MAPPING_ANON); } /** @@ -699,7 +700,7 @@ struct address_space *folio_mapping(struct folio *folio) return swap_address_space(folio->swap); mapping = folio->mapping; - if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) + if ((unsigned long)mapping & FOLIO_MAPPING_FLAGS) return NULL; return mapping; @@ -1131,3 +1132,152 @@ void flush_dcache_folio(struct folio *folio) } EXPORT_SYMBOL(flush_dcache_folio); #endif + +/** + * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an + * existing VMA + * @file: The file which possesss an f_op->mmap_prepare() hook + * @vma: The VMA to apply the .mmap_prepare() hook to. + * + * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain + * 'wrapper' file systems invoke a nested mmap hook of an underlying file. + * + * Until all filesystems are converted to use .mmap_prepare(), we must be + * conservative and continue to invoke these 'wrapper' filesystems using the + * deprecated .mmap() hook. + * + * However we have a problem if the underlying file system possesses an + * .mmap_prepare() hook, as we are in a different context when we invoke the + * .mmap() hook, already having a VMA to deal with. + * + * compat_vma_mmap_prepare() is a compatibility function that takes VMA state, + * establishes a struct vm_area_desc descriptor, passes to the underlying + * .mmap_prepare() hook and applies any changes performed by it. + * + * Once the conversion of filesystems is complete this function will no longer + * be required and will be removed. + * + * Returns: 0 on success or error. 
+ */ +int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) +{ + struct vm_area_desc desc; + int err; + + err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc)); + if (err) + return err; + set_vma_from_desc(vma, &desc); + + return 0; +} +EXPORT_SYMBOL(compat_vma_mmap_prepare); + +static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, + const struct page *page) +{ + /* + * Only the first page of a high-order buddy page has PageBuddy() set. + * So we have to check manually whether this page is part of a high- + * order buddy page. + */ + if (PageBuddy(page)) + ps->flags |= PAGE_SNAPSHOT_PG_BUDDY; + else if (page_count(page) == 0 && is_free_buddy_page(page)) + ps->flags |= PAGE_SNAPSHOT_PG_BUDDY; + + if (folio_test_idle(folio)) + ps->flags |= PAGE_SNAPSHOT_PG_IDLE; +} + +/** + * snapshot_page() - Create a snapshot of a struct page + * @ps: Pointer to a struct page_snapshot to store the page snapshot + * @page: The page to snapshot + * + * Create a snapshot of the page and store both its struct page and struct + * folio representations in @ps. + * + * A snapshot is marked as "faithful" if the compound state of @page was + * stable and allowed safe reconstruction of the folio representation. In + * rare cases where this is not possible (e.g. due to folio splitting), + * snapshot_page() falls back to treating @page as a single page and the + * snapshot is marked as "unfaithful". The snapshot_page_is_faithful() + * helper can be used to check for this condition. + */ +void snapshot_page(struct page_snapshot *ps, const struct page *page) +{ + unsigned long head, nr_pages = 1; + struct folio *foliop; + int loops = 5; + + ps->pfn = page_to_pfn(page); + ps->flags = PAGE_SNAPSHOT_FAITHFUL; + +again: + memset(&ps->folio_snapshot, 0, sizeof(struct folio)); + memcpy(&ps->page_snapshot, page, sizeof(*page)); + head = ps->page_snapshot.compound_head; + if ((head & 1) == 0) { + ps->idx = 0; + foliop = (struct folio *)&ps->page_snapshot; + if (!folio_test_large(foliop)) { + set_ps_flags(ps, page_folio(page), page); + memcpy(&ps->folio_snapshot, foliop, + sizeof(struct page)); + return; + } + foliop = (struct folio *)page; + } else { + foliop = (struct folio *)(head - 1); + ps->idx = folio_page_idx(foliop, page); + } + + if (ps->idx < MAX_FOLIO_NR_PAGES) { + memcpy(&ps->folio_snapshot, foliop, 2 * sizeof(struct page)); + nr_pages = folio_nr_pages(&ps->folio_snapshot); + if (nr_pages > 1) + memcpy(&ps->folio_snapshot.__page_2, &foliop->__page_2, + sizeof(struct page)); + set_ps_flags(ps, foliop, page); + } + + if (ps->idx > nr_pages) { + if (loops-- > 0) + goto again; + clear_compound_head(&ps->page_snapshot); + foliop = (struct folio *)&ps->page_snapshot; + memcpy(&ps->folio_snapshot, foliop, sizeof(struct page)); + ps->flags = 0; + ps->idx = 0; + } +} + +#ifdef CONFIG_MMU +/** + * folio_pte_batch - detect a PTE batch for a large folio + * @folio: The large folio to detect a PTE batch for. + * @ptep: Page table pointer for the first entry. + * @pte: Page table entry for the first page. + * @max_nr: The maximum number of table entries to consider. + * + * This is a simplified variant of folio_pte_batch_flags(). + * + * Detect a PTE batch: consecutive (present) PTEs that map consecutive + * pages of the same large folio in a single VMA and a single page table. + * + * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, + * the accessed bit, writable bit, dirt-bit and soft-dirty bit. + * + * ptep must map any page of the folio. 
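
snapshot_page() above decodes page->compound_head by hand: bit 0 set means "tail page" and the remaining bits are the head (folio) address. A userspace sketch of that low-bit tag encoding on an ordinary pointer, purely illustrative and using a mock page structure:

#include <stdio.h>
#include <stdint.h>

struct mock_page {
	unsigned long compound_head;	/* bit 0: tail flag, rest: head address */
};

int main(void)
{
	struct mock_page head_page = { .compound_head = 0 };
	struct mock_page tail_page;

	/* A tail page stores the head's address with bit 0 set. */
	tail_page.compound_head = (uintptr_t)&head_page | 1UL;

	if (tail_page.compound_head & 1) {
		struct mock_page *head =
			(struct mock_page *)(tail_page.compound_head - 1);

		printf("tail -> head at %p (expected %p)\n",
		       (void *)head, (void *)&head_page);
	}
	return 0;
}
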
max_nr must be at least one and + * must be limited by the caller so scanning cannot exceed a single VMA and + * a single page table. + * + * Return: the number of table entries in the batch. + */ +unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, + unsigned int max_nr) +{ + return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0); +} +#endif /* CONFIG_MMU */ @@ -15,7 +15,7 @@ struct mmap_state { unsigned long end; pgoff_t pgoff; unsigned long pglen; - unsigned long flags; + vm_flags_t vm_flags; struct file *file; pgprot_t page_prot; @@ -32,9 +32,12 @@ struct mmap_state { struct vma_munmap_struct vms; struct ma_state mas_detach; struct maple_tree mt_detach; + + /* Determine if we can check KSM flags early in mmap() logic. */ + bool check_ksm_early; }; -#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_) \ +#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \ struct mmap_state name = { \ .mm = mm_, \ .vmi = vmi_, \ @@ -42,9 +45,9 @@ struct mmap_state { .end = (addr_) + (len_), \ .pgoff = pgoff_, \ .pglen = PHYS_PFN(len_), \ - .flags = flags_, \ + .vm_flags = vm_flags_, \ .file = file_, \ - .page_prot = vm_get_page_prot(flags_), \ + .page_prot = vm_get_page_prot(vm_flags_), \ } #define VMG_MMAP_STATE(name, map_, vma_) \ @@ -53,7 +56,7 @@ struct mmap_state { .vmi = (map_)->vmi, \ .start = (map_)->addr, \ .end = (map_)->end, \ - .flags = (map_)->flags, \ + .vm_flags = (map_)->vm_flags, \ .pgoff = (map_)->pgoff, \ .file = (map_)->file, \ .prev = (map_)->prev, \ @@ -92,7 +95,7 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex * the kernel to generate new VMAs when old one could be * extended instead. */ - if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY) + if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_SOFTDIRTY) return false; if (vma->vm_file != vmg->file) return false; @@ -840,7 +843,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( * furthermost left or right side of the VMA, then we have no chance of * merging and should abort. */ - if (vmg->flags & VM_SPECIAL || (!left_side && !right_side)) + if (vmg->vm_flags & VM_SPECIAL || (!left_side && !right_side)) return NULL; if (left_side) @@ -967,27 +970,10 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( err = dup_anon_vma(next, middle, &anon_dup); } - if (err) + if (err || commit_merge(vmg)) goto abort; - err = commit_merge(vmg); - if (err) { - VM_WARN_ON(err != -ENOMEM); - - if (anon_dup) - unlink_anon_vmas(anon_dup); - - /* - * We've cleaned up any cloned anon_vma's, no VMAs have been - * modified, no harm no foul if the user requests that we not - * report this and just give up, leaving the VMAs unmerged. - */ - if (!vmg->give_up_on_oom) - vmg->state = VMA_MERGE_ERROR_NOMEM; - return NULL; - } - - khugepaged_enter_vma(vmg->target, vmg->flags); + khugepaged_enter_vma(vmg->target, vmg->vm_flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; @@ -995,6 +981,9 @@ abort: vma_iter_set(vmg->vmi, start); vma_iter_load(vmg->vmi); + if (anon_dup) + unlink_anon_vmas(anon_dup); + /* * This means we have failed to clone anon_vma's correctly, but no * actual changes to VMAs have occurred, so no harm no foul - if the @@ -1059,13 +1048,14 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) mmap_assert_write_locked(vmg->mm); VM_WARN_ON_VMG(vmg->middle, vmg); + VM_WARN_ON_VMG(vmg->target, vmg); /* vmi must point at or before the gap. 
*/ VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg); vmg->state = VMA_MERGE_NOMERGE; /* Special VMAs are unmergeable, also if no prev/next. */ - if ((vmg->flags & VM_SPECIAL) || (!prev && !next)) + if ((vmg->vm_flags & VM_SPECIAL) || (!prev && !next)) return NULL; can_merge_left = can_vma_merge_left(vmg); @@ -1074,13 +1064,13 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) /* If we can merge with the next VMA, adjust vmg accordingly. */ if (can_merge_right) { vmg->end = next->vm_end; - vmg->middle = next; + vmg->target = next; } /* If we can merge with the previous VMA, adjust vmg accordingly. */ if (can_merge_left) { vmg->start = prev->vm_start; - vmg->middle = prev; + vmg->target = prev; vmg->pgoff = prev->vm_pgoff; /* @@ -1102,10 +1092,10 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) * Now try to expand adjacent VMA(s). This takes care of removing the * following VMA if we have VMAs on both sides. */ - if (vmg->middle && !vma_expand(vmg)) { - khugepaged_enter_vma(vmg->middle, vmg->flags); + if (vmg->target && !vma_expand(vmg)) { + khugepaged_enter_vma(vmg->target, vmg->vm_flags); vmg->state = VMA_MERGE_SUCCESS; - return vmg->middle; + return vmg->target; } return NULL; @@ -1117,27 +1107,29 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) * @vmg: Describes a VMA expansion operation. * * Expand @vma to vmg->start and vmg->end. Can expand off the start and end. - * Will expand over vmg->next if it's different from vmg->middle and vmg->end == - * vmg->next->vm_end. Checking if the vmg->middle can expand and merge with + * Will expand over vmg->next if it's different from vmg->target and vmg->end == + * vmg->next->vm_end. Checking if the vmg->target can expand and merge with * vmg->next needs to be handled by the caller. * * Returns: 0 on success. * * ASSUMPTIONS: - * - The caller must hold a WRITE lock on vmg->middle->mm->mmap_lock. - * - The caller must have set @vmg->middle and @vmg->next. + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. + * - The caller must have set @vmg->target and @vmg->next. */ int vma_expand(struct vma_merge_struct *vmg) { struct vm_area_struct *anon_dup = NULL; bool remove_next = false; - struct vm_area_struct *middle = vmg->middle; + struct vm_area_struct *target = vmg->target; struct vm_area_struct *next = vmg->next; + VM_WARN_ON_VMG(!target, vmg); + mmap_assert_write_locked(vmg->mm); - vma_start_write(middle); - if (next && (middle != next) && (vmg->end == next->vm_end)) { + vma_start_write(target); + if (next && (target != next) && (vmg->end == next->vm_end)) { int ret; remove_next = true; @@ -1148,19 +1140,18 @@ int vma_expand(struct vma_merge_struct *vmg) * In this case we don't report OOM, so vmg->give_up_on_mm is * safe. */ - ret = dup_anon_vma(middle, next, &anon_dup); + ret = dup_anon_vma(target, next, &anon_dup); if (ret) return ret; } /* Not merging but overwriting any part of next is not handled. 
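
For vma_merge_new_range() above: when the new range can merge left and/or right, the merge target and the [start, end) window are widened before vma_expand() runs. A sketch of that adjustment with made-up VMA boundaries and pgoff values, mirroring the assignments in the hunk:

#include <stdio.h>
#include <stdbool.h>

struct mock_vma {
	unsigned long vm_start, vm_end, vm_pgoff;
};

int main(void)
{
	/* Invented layout: prev | 16K gap being mapped | next */
	struct mock_vma prev = { 0x1000, 0x5000, 0 };
	struct mock_vma next = { 0x9000, 0xc000, 8 };
	unsigned long start = 0x5000, end = 0x9000, pgoff = 4;
	bool can_merge_left = true, can_merge_right = true;

	if (can_merge_right)
		end = next.vm_end;		/* absorb next */
	if (can_merge_left) {
		start = prev.vm_start;		/* extend prev */
		pgoff = prev.vm_pgoff;
	}

	printf("expand target to [%#lx, %#lx), pgoff %lu\n", start, end, pgoff);
	return 0;
}
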
*/ VM_WARN_ON_VMG(next && !remove_next && - next != middle && vmg->end > next->vm_start, vmg); + next != target && vmg->end > next->vm_start, vmg); /* Only handles expanding */ - VM_WARN_ON_VMG(middle->vm_start < vmg->start || - middle->vm_end > vmg->end, vmg); + VM_WARN_ON_VMG(target->vm_start < vmg->start || + target->vm_end > vmg->end, vmg); - vmg->target = middle; if (remove_next) vmg->__remove_next = true; @@ -1649,27 +1640,25 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) struct vm_area_struct *vma_modify_flags( struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long new_flags) + vm_flags_t vm_flags) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); - vmg.flags = new_flags; + vmg.vm_flags = vm_flags; return vma_modify(&vmg); } struct vm_area_struct -*vma_modify_flags_name(struct vma_iterator *vmi, +*vma_modify_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long new_flags, struct anon_vma_name *new_name) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); - vmg.flags = new_flags; vmg.anon_name = new_name; return vma_modify(&vmg); @@ -1694,13 +1683,13 @@ struct vm_area_struct struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long new_flags, + vm_flags_t vm_flags, struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); - vmg.flags = new_flags; + vmg.vm_flags = vm_flags; vmg.uffd_ctx = new_ctx; if (give_up_on_oom) vmg.give_up_on_oom = true; @@ -2334,6 +2323,11 @@ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, vms_complete_munmap_vmas(vms, mas_detach); } +static void update_ksm_flags(struct mmap_state *map) +{ + map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags); +} + /* * __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be * unmapped once the map operation is completed, check limits, account mapping @@ -2376,11 +2370,11 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf) } /* Check against address space limit. */ - if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages)) + if (!may_expand_vm(map->mm, map->vm_flags, map->pglen - vms->nr_pages)) return -ENOMEM; /* Private writable mapping: check memory availability. */ - if (accountable_mapping(map->file, map->flags)) { + if (accountable_mapping(map->file, map->vm_flags)) { map->charged = map->pglen; map->charged -= vms->nr_accounted; if (map->charged) { @@ -2390,7 +2384,7 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf) } vms->nr_accounted = 0; - map->flags |= VM_ACCOUNT; + map->vm_flags |= VM_ACCOUNT; } /* @@ -2434,11 +2428,12 @@ static int __mmap_new_file_vma(struct mmap_state *map, * Drivers should not permit writability when previously it was * disallowed. 
*/ - VM_WARN_ON_ONCE(map->flags != vma->vm_flags && - !(map->flags & VM_MAYWRITE) && + VM_WARN_ON_ONCE(map->vm_flags != vma->vm_flags && + !(map->vm_flags & VM_MAYWRITE) && (vma->vm_flags & VM_MAYWRITE)); - map->flags = vma->vm_flags; + map->file = vma->vm_file; + map->vm_flags = vma->vm_flags; return 0; } @@ -2469,7 +2464,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) vma_iter_config(vmi, map->addr, map->end); vma_set_range(vma, map->addr, map->end, map->pgoff); - vm_flags_init(vma, map->flags); + vm_flags_init(vma, map->vm_flags); vma->vm_page_prot = map->page_prot; if (vma_iter_prealloc(vmi, vma)) { @@ -2479,7 +2474,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) if (map->file) error = __mmap_new_file_vma(map, vma); - else if (map->flags & VM_SHARED) + else if (map->vm_flags & VM_SHARED) error = shmem_zero_setup(vma); else vma_set_anonymous(vma); @@ -2487,9 +2482,14 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) if (error) goto free_iter_vma; + if (!map->check_ksm_early) { + update_ksm_flags(map); + vm_flags_init(vma, map->vm_flags); + } + #ifdef CONFIG_SPARC64 /* TODO: Fix SPARC ADI! */ - WARN_ON_ONCE(!arch_validate_flags(map->flags)); + WARN_ON_ONCE(!arch_validate_flags(map->vm_flags)); #endif /* Lock the VMA since it is modified after insertion into VMA tree */ @@ -2503,8 +2503,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) * call covers the non-merge case. */ if (!vma_is_anonymous(vma)) - khugepaged_enter_vma(vma, map->flags); - ksm_add_vma(vma); + khugepaged_enter_vma(vma, map->vm_flags); *vmap = vma; return 0; @@ -2525,7 +2524,7 @@ free_vma: static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) { struct mm_struct *mm = map->mm; - unsigned long vm_flags = vma->vm_flags; + vm_flags_t vm_flags = vma->vm_flags; perf_event_mmap(vma); @@ -2578,19 +2577,19 @@ static int call_mmap_prepare(struct mmap_state *map) .pgoff = map->pgoff, .file = map->file, - .vm_flags = map->flags, + .vm_flags = map->vm_flags, .page_prot = map->page_prot, }; /* Invoke the hook. */ - err = __call_mmap_prepare(map->file, &desc); + err = vfs_mmap_prepare(map->file, &desc); if (err) return err; /* Update fields permitted to be changed. */ map->pgoff = desc.pgoff; map->file = desc.file; - map->flags = desc.vm_flags; + map->vm_flags = desc.vm_flags; map->page_prot = desc.page_prot; /* User-defined fields. */ map->vm_ops = desc.vm_ops; @@ -2607,6 +2606,35 @@ static void set_vma_user_defined_fields(struct vm_area_struct *vma, vma->vm_private_data = map->vm_private_data; } +/* + * Are we guaranteed no driver can change state such as to preclude KSM merging? + * If so, let's set the KSM mergeable flag early so we don't break VMA merging. + */ +static bool can_set_ksm_flags_early(struct mmap_state *map) +{ + struct file *file = map->file; + + /* Anonymous mappings have no driver which can change them. */ + if (!file) + return true; + + /* + * If .mmap_prepare() is specified, then the driver will have already + * manipulated state prior to updating KSM flags. So no need to worry + * about mmap callbacks modifying VMA flags after the KSM flag has been + * updated here, which could otherwise affect KSM eligibility. + */ + if (file->f_op->mmap_prepare) + return true; + + /* shmem is safe. */ + if (shmem_file(file)) + return true; + + /* Any other .mmap callback is not safe. 
*/ + return false; +} + static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) @@ -2618,12 +2646,17 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, VMA_ITERATOR(vmi, mm, addr); MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); + map.check_ksm_early = can_set_ksm_flags_early(&map); + error = __mmap_prepare(&map, uf); if (!error && have_mmap_prepare) error = call_mmap_prepare(&map); if (error) goto abort_munmap; + if (map.check_ksm_early) + update_ksm_flags(&map); + /* Attempt to merge with adjacent VMAs... */ if (map.prev || map.next) { VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL); @@ -2719,14 +2752,14 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * @addr: The start address * @len: The length of the increase * @vma: The vma, - * @flags: The VMA Flags + * @vm_flags: The VMA Flags * * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags * do not match then create a new anonymous VMA. Eventually we may be able to * do some brk-specific accounting here. */ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, unsigned long len, unsigned long flags) + unsigned long addr, unsigned long len, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; @@ -2734,8 +2767,9 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. */ - flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) + vm_flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + vm_flags = ksm_vma_flags(mm, NULL, vm_flags); + if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) return -ENOMEM; if (mm->map_count > sysctl_max_map_count) @@ -2749,7 +2783,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, * occur after forking, so the expand will only happen on new VMAs. */ if (vma && vma->vm_end == addr) { - VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); + VMG_STATE(vmg, mm, vmi, addr, addr + len, vm_flags, PHYS_PFN(addr)); vmg.prev = vma; /* vmi is positioned at prev, which this mode expects. */ @@ -2770,20 +2804,19 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_set_anonymous(vma); vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); - vm_flags_init(vma, flags); - vma->vm_page_prot = vm_get_page_prot(flags); + vm_flags_init(vma, vm_flags); + vma->vm_page_prot = vm_get_page_prot(vm_flags); vma_start_write(vma); if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; mm->map_count++; validate_mm(mm); - ksm_add_vma(vma); out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; mm->data_vm += len >> PAGE_SHIFT; - if (flags & VM_LOCKED) + if (vm_flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vm_flags_set(vma, VM_SOFTDIRTY); return 0; @@ -3127,7 +3160,6 @@ int __vm_munmap(unsigned long start, size_t len, bool unlock) return ret; } - /* Insert vm structure into process list sorted by address * and into the inode's i_mmap tree. If vm_file is non-NULL * then i_mmap_rwsem is taken here. 
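
[Reviewer note, not part of the patch] The KSM ordering above reduces to a small truth table: can_set_ksm_flags_early() allows the KSM flag to be resolved before the merge attempt only when no ->mmap() callback can still change vm_flags afterwards (anonymous mappings, drivers using ->mmap_prepare(), and shmem). The standalone userspace model below restates just that decision; ksm_flags_early() and its parameters are invented names for illustration, not kernel symbols.

#include <stdbool.h>
#include <stdio.h>

/*
 * Standalone model of the decision made by can_set_ksm_flags_early() in the
 * hunk above: if no legacy ->mmap() hook can change vm_flags behind our back,
 * KSM flags can be set early, which keeps VMA merging possible.
 */
static bool ksm_flags_early(bool has_file, bool has_mmap_prepare, bool is_shmem)
{
	if (!has_file)
		return true;	/* anonymous mapping, no driver involved */
	if (has_mmap_prepare)
		return true;	/* driver state settled before flags are read */
	if (is_shmem)
		return true;	/* shmem is known safe */
	return false;		/* legacy ->mmap() may still change flags */
}

int main(void)
{
	printf("anonymous mapping:  %d\n", ksm_flags_early(false, false, false));
	printf("file, mmap_prepare: %d\n", ksm_flags_early(true, true, false));
	printf("shmem file:         %d\n", ksm_flags_early(true, false, true));
	printf("legacy ->mmap():    %d\n", ksm_flags_early(true, false, false));
	return 0;
}

Everything else about when the flag is applied (before the merge attempt versus after __mmap_new_vma()) follows from this single boolean, stored in mmap_state.check_ksm_early.
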
@@ -98,7 +98,7 @@ struct vma_merge_struct { unsigned long end; pgoff_t pgoff; - unsigned long flags; + vm_flags_t vm_flags; struct file *file; struct anon_vma *anon_vma; struct mempolicy *policy; @@ -164,13 +164,13 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start); } -#define VMG_STATE(name, mm_, vmi_, start_, end_, flags_, pgoff_) \ +#define VMG_STATE(name, mm_, vmi_, start_, end_, vm_flags_, pgoff_) \ struct vma_merge_struct name = { \ .mm = mm_, \ .vmi = vmi_, \ .start = start_, \ .end = end_, \ - .flags = flags_, \ + .vm_flags = vm_flags_, \ .pgoff = pgoff_, \ .state = VMA_MERGE_START, \ } @@ -184,7 +184,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, .next = NULL, \ .start = start_, \ .end = end_, \ - .flags = vma_->vm_flags, \ + .vm_flags = vma_->vm_flags, \ .pgoff = vma_pgoff_offset(vma_, start_), \ .file = vma_->vm_file, \ .anon_vma = vma_->anon_vma, \ @@ -222,6 +222,53 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, return 0; } + +/* + * Temporary helper functions for file systems which wrap an invocation of + * f_op->mmap() but which might have an underlying file system which implements + * f_op->mmap_prepare(). + */ + +static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma, + struct vm_area_desc *desc) +{ + desc->mm = vma->vm_mm; + desc->start = vma->vm_start; + desc->end = vma->vm_end; + + desc->pgoff = vma->vm_pgoff; + desc->file = vma->vm_file; + desc->vm_flags = vma->vm_flags; + desc->page_prot = vma->vm_page_prot; + + desc->vm_ops = NULL; + desc->private_data = NULL; + + return desc; +} + +static inline void set_vma_from_desc(struct vm_area_struct *vma, + struct vm_area_desc *desc) +{ + /* + * Since we're invoking .mmap_prepare() despite having a partially + * established VMA, we must take care to handle setting fields + * correctly. + */ + + /* Mutable fields. Populated with initial state. */ + vma->vm_pgoff = desc->pgoff; + if (vma->vm_file != desc->file) + vma_set_file(vma, desc->file); + if (vma->vm_flags != desc->vm_flags) + vm_flags_set(vma, desc->vm_flags); + vma->vm_page_prot = desc->page_prot; + + /* User-defined fields. */ + vma->vm_ops = desc->vm_ops; + vma->vm_private_data = desc->private_data; +} + int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, @@ -241,17 +288,16 @@ __must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long new_flags); + vm_flags_t vm_flags); -/* We are about to modify the VMA's flags and/or anon_name. */ +/* We are about to modify the VMA's anon_name. */ __must_check struct vm_area_struct -*vma_modify_flags_name(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - unsigned long new_flags, - struct anon_vma_name *new_name); +*vma_modify_name(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + struct anon_vma_name *new_name); /* We are about to modify the VMA's memory policy. 
*/ __must_check struct vm_area_struct @@ -267,7 +313,7 @@ __must_check struct vm_area_struct struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long new_flags, + vm_flags_t vm_flags, struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom); @@ -328,7 +374,7 @@ static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma } #ifdef CONFIG_MMU -static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) +static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, vm_flags_t vm_flags) { return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); } diff --git a/mm/vma_exec.c b/mm/vma_exec.c index 2dffb02ed6a2..922ee51747a6 100644 --- a/mm/vma_exec.c +++ b/mm/vma_exec.c @@ -54,7 +54,7 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) /* * cover the whole range: [new_start, old_end) */ - vmg.middle = vma; + vmg.target = vma; if (vma_expand(&vmg)) return -ENOMEM; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ab986dd09b6a..6dbcdceecae1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -514,6 +514,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { + int err = 0; pte_t *pte; /* @@ -530,12 +531,18 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, do { struct page *page = pages[*nr]; - if (WARN_ON(!pte_none(ptep_get(pte)))) - return -EBUSY; - if (WARN_ON(!page)) - return -ENOMEM; - if (WARN_ON(!pfn_valid(page_to_pfn(page)))) - return -EINVAL; + if (WARN_ON(!pte_none(ptep_get(pte)))) { + err = -EBUSY; + break; + } + if (WARN_ON(!page)) { + err = -ENOMEM; + break; + } + if (WARN_ON(!pfn_valid(page_to_pfn(page)))) { + err = -EINVAL; + break; + } set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; @@ -543,7 +550,8 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, arch_leave_lazy_mmu_mode(); *mask |= PGTBL_PTE_MODIFIED; - return 0; + + return err; } static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr, diff --git a/mm/vmpressure.c b/mm/vmpressure.c index bd5183dfd879..c197ed47bcc4 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -316,7 +316,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, * asserted for a second in which subsequent * pressure events can occur. */ - WRITE_ONCE(memcg->socket_pressure, jiffies + HZ); + mem_cgroup_set_socket_pressure(memcg); } } } diff --git a/mm/vmscan.c b/mm/vmscan.c index f8dfd2864bbf..7de11524a936 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -57,6 +57,7 @@ #include <linux/rculist_nulls.h> #include <linux/random.h> #include <linux/mmu_notifier.h> +#include <linux/parser.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -93,10 +94,8 @@ struct scan_control { unsigned long anon_cost; unsigned long file_cost; -#ifdef CONFIG_MEMCG /* Swappiness value for proactive reclaim. Always use sc_swappiness()! */ int *proactive_swappiness; -#endif /* Can active folios be deactivated as part of reclaim? */ #define DEACTIVATE_ANON 1 @@ -120,7 +119,7 @@ struct scan_control { /* Has cache_trim_mode failed at least once? 
*/ unsigned int cache_trim_mode_failed:1; - /* Proactive reclaim invoked by userspace through memory.reclaim */ + /* Proactive reclaim invoked by userspace */ unsigned int proactive:1; /* @@ -652,14 +651,45 @@ typedef enum { PAGE_CLEAN, } pageout_t; +static pageout_t writeout(struct folio *folio, struct address_space *mapping, + struct swap_iocb **plug, struct list_head *folio_list) +{ + int res; + + folio_set_reclaim(folio); + + /* + * The large shmem folio can be split if CONFIG_THP_SWAP is not enabled + * or we failed to allocate contiguous swap entries, in which case + * the split out folios get added back to folio_list. + */ + if (shmem_mapping(mapping)) + res = shmem_writeout(folio, plug, folio_list); + else + res = swap_writeout(folio, plug); + + if (res < 0) + handle_write_error(mapping, folio, res); + if (res == AOP_WRITEPAGE_ACTIVATE) { + folio_clear_reclaim(folio); + return PAGE_ACTIVATE; + } + + /* synchronous write? */ + if (!folio_test_writeback(folio)) + folio_clear_reclaim(folio); + + trace_mm_vmscan_write_folio(folio); + node_stat_add_folio(folio, NR_VMSCAN_WRITE); + return PAGE_SUCCESS; +} + /* * pageout is called by shrink_folio_list() for each dirty folio. */ static pageout_t pageout(struct folio *folio, struct address_space *mapping, struct swap_iocb **plug, struct list_head *folio_list) { - int (*writeout)(struct folio *, struct writeback_control *); - /* * We no longer attempt to writeback filesystem folios here, other * than tmpfs/shmem. That's taken care of in page-writeback. @@ -690,51 +720,12 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, } return PAGE_KEEP; } - if (shmem_mapping(mapping)) - writeout = shmem_writeout; - else if (folio_test_anon(folio)) - writeout = swap_writeout; - else - return PAGE_ACTIVATE; - if (folio_clear_dirty_for_io(folio)) { - int res; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .nr_to_write = SWAP_CLUSTER_MAX, - .range_start = 0, - .range_end = LLONG_MAX, - .for_reclaim = 1, - .swap_plug = plug, - }; - - /* - * The large shmem folio can be split if CONFIG_THP_SWAP is - * not enabled or contiguous swap entries are failed to - * allocate. - */ - if (shmem_mapping(mapping) && folio_test_large(folio)) - wbc.list = folio_list; - - folio_set_reclaim(folio); - res = writeout(folio, &wbc); - if (res < 0) - handle_write_error(mapping, folio, res); - if (res == AOP_WRITEPAGE_ACTIVATE) { - folio_clear_reclaim(folio); - return PAGE_ACTIVATE; - } - - if (!folio_test_writeback(folio)) { - /* synchronous write? 
*/ - folio_clear_reclaim(folio); - } - trace_mm_vmscan_write_folio(folio); - node_stat_add_folio(folio, NR_VMSCAN_WRITE); - return PAGE_SUCCESS; - } - - return PAGE_CLEAN; + if (!shmem_mapping(mapping) && !folio_test_anon(folio)) + return PAGE_ACTIVATE; + if (!folio_clear_dirty_for_io(folio)) + return PAGE_CLEAN; + return writeout(folio, mapping, plug, folio_list); } /* @@ -915,7 +906,7 @@ static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { int referenced_ptes, referenced_folio; - unsigned long vm_flags; + vm_flags_t vm_flags; referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, &vm_flags); @@ -1014,7 +1005,8 @@ static void folio_check_dirty_writeback(struct folio *folio, mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); } -struct folio *alloc_migrate_folio(struct folio *src, unsigned long private) +static struct folio *alloc_demote_folio(struct folio *src, + unsigned long private) { struct folio *dst; nodemask_t *allowed_mask; @@ -1077,7 +1069,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, node_get_allowed_targets(pgdat, &allowed_mask); /* Demotion ignores all cpuset and mempolicy settings */ - migrate_pages(demote_folios, alloc_migrate_folio, NULL, + migrate_pages(demote_folios, alloc_demote_folio, NULL, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); @@ -1138,6 +1130,14 @@ retry: goto keep; if (folio_contain_hwpoisoned_page(folio)) { + /* + * unmap_poisoned_folio() can't handle large + * folio, just skip it. memory_failure() will + * handle it if the UCE is triggered again. + */ + if (folio_test_large(folio)) + goto keep_locked; + unmap_poisoned_folio(folio, folio_pfn(folio), false); folio_unlock(folio); folio_put(folio); @@ -1658,9 +1658,11 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, unsigned int noreclaim_flag; list_for_each_entry_safe(folio, next, folio_list, lru) { + /* TODO: these pages should not even appear in this list. 
*/ + if (page_has_movable_ops(&folio->page)) + continue; if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) && - !folio_test_dirty(folio) && !__folio_test_movable(folio) && - !folio_test_unevictable(folio)) { + !folio_test_dirty(folio) && !folio_test_unevictable(folio)) { folio_clear_active(folio); list_move(&folio->lru, &clean_folios); } @@ -2059,9 +2061,9 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, __count_vm_events(item, nr_reclaimed); count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); - spin_unlock_irq(&lruvec->lru_lock); - lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); + lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout, + nr_scanned - nr_reclaimed); /* * If dirty folios are scanned that are not queued for IO, it @@ -2127,7 +2129,7 @@ static void shrink_active_list(unsigned long nr_to_scan, { unsigned long nr_taken; unsigned long nr_scanned; - unsigned long vm_flags; + vm_flags_t vm_flags; LIST_HEAD(l_hold); /* The folios which were snipped off */ LIST_HEAD(l_active); LIST_HEAD(l_inactive); @@ -2207,10 +2209,8 @@ static void shrink_active_list(unsigned long nr_to_scan, count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - spin_unlock_irq(&lruvec->lru_lock); - if (nr_rotated) - lru_note_cost(lruvec, file, 0, nr_rotated); + lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } @@ -2482,6 +2482,69 @@ static inline void calculate_pressure_balance(struct scan_control *sc, *denominator = ap + fp; } +static unsigned long apply_proportional_protection(struct mem_cgroup *memcg, + struct scan_control *sc, unsigned long scan) +{ + unsigned long min, low; + + mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low); + + if (min || low) { + /* + * Scale a cgroup's reclaim pressure by proportioning + * its current usage to its memory.low or memory.min + * setting. + * + * This is important, as otherwise scanning aggression + * becomes extremely binary -- from nothing as we + * approach the memory protection threshold, to totally + * nominal as we exceed it. This results in requiring + * setting extremely liberal protection thresholds. It + * also means we simply get no protection at all if we + * set it too low, which is not ideal. + * + * If there is any protection in place, we reduce scan + * pressure by how much of the total memory used is + * within protection thresholds. + * + * There is one special case: in the first reclaim pass, + * we skip over all groups that are within their low + * protection. If that fails to reclaim enough pages to + * satisfy the reclaim goal, we come back and override + * the best-effort low protection. However, we still + * ideally want to honor how well-behaved groups are in + * that case instead of simply punishing them all + * equally. As such, we reclaim them based on how much + * memory they are using, reducing the scan pressure + * again by how much of the total memory used is under + * hard protection. 
+ */ + unsigned long cgroup_size = mem_cgroup_size(memcg); + unsigned long protection; + + /* memory.low scaling, make sure we retry before OOM */ + if (!sc->memcg_low_reclaim && low > min) { + protection = low; + sc->memcg_low_skipped = 1; + } else { + protection = min; + } + + /* Avoid TOCTOU with earlier protection check */ + cgroup_size = max(cgroup_size, protection); + + scan -= scan * protection / (cgroup_size + 1); + + /* + * Minimally target SWAP_CLUSTER_MAX pages to keep + * reclaim moving forwards, avoiding decrementing + * sc->priority further than desirable. + */ + scan = max(scan, SWAP_CLUSTER_MAX); + } + return scan; +} + /* * Determine how aggressively the anon and file LRU lists should be * scanned. @@ -2560,70 +2623,10 @@ out: for_each_evictable_lru(lru) { bool file = is_file_lru(lru); unsigned long lruvec_size; - unsigned long low, min; unsigned long scan; lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - mem_cgroup_protection(sc->target_mem_cgroup, memcg, - &min, &low); - - if (min || low) { - /* - * Scale a cgroup's reclaim pressure by proportioning - * its current usage to its memory.low or memory.min - * setting. - * - * This is important, as otherwise scanning aggression - * becomes extremely binary -- from nothing as we - * approach the memory protection threshold, to totally - * nominal as we exceed it. This results in requiring - * setting extremely liberal protection thresholds. It - * also means we simply get no protection at all if we - * set it too low, which is not ideal. - * - * If there is any protection in place, we reduce scan - * pressure by how much of the total memory used is - * within protection thresholds. - * - * There is one special case: in the first reclaim pass, - * we skip over all groups that are within their low - * protection. If that fails to reclaim enough pages to - * satisfy the reclaim goal, we come back and override - * the best-effort low protection. However, we still - * ideally want to honor how well-behaved groups are in - * that case instead of simply punishing them all - * equally. As such, we reclaim them based on how much - * memory they are using, reducing the scan pressure - * again by how much of the total memory used is under - * hard protection. - */ - unsigned long cgroup_size = mem_cgroup_size(memcg); - unsigned long protection; - - /* memory.low scaling, make sure we retry before OOM */ - if (!sc->memcg_low_reclaim && low > min) { - protection = low; - sc->memcg_low_skipped = 1; - } else { - protection = min; - } - - /* Avoid TOCTOU with earlier protection check */ - cgroup_size = max(cgroup_size, protection); - - scan = lruvec_size - lruvec_size * protection / - (cgroup_size + 1); - - /* - * Minimally target SWAP_CLUSTER_MAX pages to keep - * reclaim moving forwards, avoiding decrementing - * sc->priority further than desirable. 
- */ - scan = max(scan, SWAP_CLUSTER_MAX); - } else { - scan = lruvec_size; - } - + scan = apply_proportional_protection(memcg, sc, lruvec_size); scan >>= sc->priority; /* @@ -3429,7 +3432,7 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned if (!pte_present(pte) || is_zero_pfn(pfn)) return -1; - if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) + if (WARN_ON_ONCE(pte_special(pte))) return -1; if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm)) @@ -3454,9 +3457,6 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) return -1; - if (WARN_ON_ONCE(pmd_devmap(pmd))) - return -1; - if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm)) return -1; @@ -3927,6 +3927,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) { int gen, type, zone; bool success = false; + bool seq_inc_flag = false; struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); @@ -3943,11 +3944,20 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) } min_seq[type]++; + seq_inc_flag = true; } next: ; } + /* + * If min_seq[type] of both anonymous and file is not increased, + * we can directly return false to avoid unnecessary checking + * overhead later. + */ + if (!seq_inc_flag) + return success; + /* see the comment on lru_gen_folio */ if (swappiness && swappiness <= MAX_SWAPPINESS) { unsigned long seq = lrugen->max_seq - MIN_NR_GENS; @@ -4554,8 +4564,9 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca return true; } -static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, - int type, int tier, struct list_head *list) +static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, int type, int tier, + struct list_head *list) { int i; int gen; @@ -4564,7 +4575,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, int scanned = 0; int isolated = 0; int skipped = 0; - int remaining = MAX_LRU_BATCH; + int remaining = min(nr_to_scan, MAX_LRU_BATCH); struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); @@ -4675,7 +4686,8 @@ static int get_type_to_scan(struct lruvec *lruvec, int swappiness) return positive_ctrl_err(&sp, &pv); } -static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, +static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, int swappiness, int *type_scanned, struct list_head *list) { int i; @@ -4687,7 +4699,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw *type_scanned = type; - scanned = scan_folios(lruvec, sc, type, tier, list); + scanned = scan_folios(nr_to_scan, lruvec, sc, type, tier, list); if (scanned) return scanned; @@ -4697,7 +4709,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw return 0; } -static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) +static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, int swappiness) { int type; int scanned; @@ -4716,7 +4729,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap spin_lock_irq(&lruvec->lru_lock); - scanned = isolate_folios(lruvec, sc, swappiness, &type, &list); + scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list); scanned += try_to_inc_min_seq(lruvec, 
swappiness); @@ -4837,6 +4850,8 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int s if (nr_to_scan && !mem_cgroup_online(memcg)) return nr_to_scan; + nr_to_scan = apply_proportional_protection(memcg, sc, nr_to_scan); + /* try to get away with not aging at the default priority */ if (!success || sc->priority == DEF_PRIORITY) return nr_to_scan >> sc->priority; @@ -4889,7 +4904,7 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) if (nr_to_scan <= 0) break; - delta = evict_folios(lruvec, sc, swappiness); + delta = evict_folios(nr_to_scan, lruvec, sc, swappiness); if (!delta) break; @@ -5420,7 +5435,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, static int lru_gen_seq_show(struct seq_file *m, void *v) { unsigned long seq; - bool full = !debugfs_real_fops(m->file)->write; + bool full = debugfs_get_aux_num(m->file); struct lruvec *lruvec = v; struct lru_gen_folio *lrugen = &lruvec->lrugen; int nid = lruvec_pgdat(lruvec)->node_id; @@ -5510,7 +5525,8 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co if (sc->nr_reclaimed >= nr_to_reclaim) return 0; - if (!evict_folios(lruvec, sc, swappiness)) + if (!evict_folios(nr_to_reclaim - sc->nr_reclaimed, lruvec, sc, + swappiness)) return 0; cond_resched(); @@ -5756,8 +5772,10 @@ static int __init init_lru_gen(void) if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) pr_err("lru_gen: failed to create sysfs group\n"); - debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); - debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); + debugfs_create_file_aux_num("lru_gen", 0644, NULL, NULL, 1, + &lru_gen_rw_fops); + debugfs_create_file_aux_num("lru_gen_full", 0444, NULL, NULL, 0, + &lru_gen_ro_fops); return 0; }; @@ -6713,6 +6731,15 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, return nr_reclaimed; } +#else +unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + unsigned long nr_pages, + gfp_t gfp_mask, + unsigned int reclaim_options, + int *swappiness) +{ + return 0; +} #endif static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) @@ -7607,36 +7634,26 @@ static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) /* * Try to free up some pages from this node through reclaim. 
*/ -static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) +static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, + unsigned long nr_pages, + struct scan_control *sc) { - /* Minimum pages needed in order to stay on node */ - const unsigned long nr_pages = 1 << order; struct task_struct *p = current; unsigned int noreclaim_flag; - struct scan_control sc = { - .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = current_gfp_context(gfp_mask), - .order = order, - .priority = NODE_RECLAIM_PRIORITY, - .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), - .may_swap = 1, - .reclaim_idx = gfp_zone(gfp_mask), - }; unsigned long pflags; - trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, - sc.gfp_mask); + trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, sc->order, + sc->gfp_mask); cond_resched(); psi_memstall_enter(&pflags); delayacct_freepages_start(); - fs_reclaim_acquire(sc.gfp_mask); + fs_reclaim_acquire(sc->gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP */ noreclaim_flag = memalloc_noreclaim_save(); - set_task_reclaim_state(p, &sc.reclaim_state); + set_task_reclaim_state(p, &sc->reclaim_state); if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages || node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) { @@ -7645,24 +7662,36 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in * priorities until we have enough memory freed. */ do { - shrink_node(pgdat, &sc); - } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); + shrink_node(pgdat, sc); + } while (sc->nr_reclaimed < nr_pages && --sc->priority >= 0); } set_task_reclaim_state(p, NULL); memalloc_noreclaim_restore(noreclaim_flag); - fs_reclaim_release(sc.gfp_mask); - psi_memstall_leave(&pflags); + fs_reclaim_release(sc->gfp_mask); delayacct_freepages_end(); + psi_memstall_leave(&pflags); - trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); + trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed); - return sc.nr_reclaimed >= nr_pages; + return sc->nr_reclaimed; } int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { int ret; + /* Minimum pages needed in order to stay on node */ + const unsigned long nr_pages = 1 << order; + struct scan_control sc = { + .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), + .gfp_mask = current_gfp_context(gfp_mask), + .order = order, + .priority = NODE_RECLAIM_PRIORITY, + .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), + .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), + .may_swap = 1, + .reclaim_idx = gfp_zone(gfp_mask), + }; /* * Node reclaim reclaims unmapped file backed pages and @@ -7697,7 +7726,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) return NODE_RECLAIM_NOSCAN; - ret = __node_reclaim(pgdat, gfp_mask, order); + ret = __node_reclaim(pgdat, gfp_mask, nr_pages, &sc) >= nr_pages; clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags); if (ret) @@ -7707,6 +7736,114 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) return ret; } + +enum { + MEMORY_RECLAIM_SWAPPINESS = 0, + MEMORY_RECLAIM_SWAPPINESS_MAX, + MEMORY_RECLAIM_NULL, +}; +static const match_table_t tokens = { + { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"}, + { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"}, + { MEMORY_RECLAIM_NULL, NULL }, 
+}; + +int user_proactive_reclaim(char *buf, + struct mem_cgroup *memcg, pg_data_t *pgdat) +{ + unsigned int nr_retries = MAX_RECLAIM_RETRIES; + unsigned long nr_to_reclaim, nr_reclaimed = 0; + int swappiness = -1; + char *old_buf, *start; + substring_t args[MAX_OPT_ARGS]; + gfp_t gfp_mask = GFP_KERNEL; + + if (!buf || (!memcg && !pgdat) || (memcg && pgdat)) + return -EINVAL; + + buf = strstrip(buf); + + old_buf = buf; + nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; + if (buf == old_buf) + return -EINVAL; + + buf = strstrip(buf); + + while ((start = strsep(&buf, " ")) != NULL) { + if (!strlen(start)) + continue; + switch (match_token(start, tokens, args)) { + case MEMORY_RECLAIM_SWAPPINESS: + if (match_int(&args[0], &swappiness)) + return -EINVAL; + if (swappiness < MIN_SWAPPINESS || + swappiness > MAX_SWAPPINESS) + return -EINVAL; + break; + case MEMORY_RECLAIM_SWAPPINESS_MAX: + swappiness = SWAPPINESS_ANON_ONLY; + break; + default: + return -EINVAL; + } + } + + while (nr_reclaimed < nr_to_reclaim) { + /* Will converge on zero, but reclaim enforces a minimum */ + unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4; + unsigned long reclaimed; + + if (signal_pending(current)) + return -EINTR; + + /* + * This is the final attempt, drain percpu lru caches in the + * hope of introducing more evictable pages. + */ + if (!nr_retries) + lru_add_drain_all(); + + if (memcg) { + unsigned int reclaim_options; + + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | + MEMCG_RECLAIM_PROACTIVE; + reclaimed = try_to_free_mem_cgroup_pages(memcg, + batch_size, gfp_mask, + reclaim_options, + swappiness == -1 ? NULL : &swappiness); + } else { + struct scan_control sc = { + .gfp_mask = current_gfp_context(gfp_mask), + .reclaim_idx = gfp_zone(gfp_mask), + .proactive_swappiness = swappiness == -1 ? NULL : &swappiness, + .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, + .nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX), + .may_unmap = 1, + .may_swap = 1, + .proactive = 1, + }; + + if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, + &pgdat->flags)) + return -EBUSY; + + reclaimed = __node_reclaim(pgdat, gfp_mask, + batch_size, &sc); + clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags); + } + + if (!reclaimed && !nr_retries--) + return -EAGAIN; + + nr_reclaimed += reclaimed; + } + + return 0; +} + #endif /** @@ -7754,3 +7891,26 @@ void check_move_unevictable_folios(struct folio_batch *fbatch) } } EXPORT_SYMBOL_GPL(check_move_unevictable_folios); + +#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) +static ssize_t reclaim_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret, nid = dev->id; + + ret = user_proactive_reclaim((char *)buf, NULL, NODE_DATA(nid)); + return ret ? 
-EAGAIN : count; +} + +static DEVICE_ATTR_WO(reclaim); +int reclaim_register_node(struct node *node) +{ + return device_create_file(&node->dev, &dev_attr_reclaim); +} + +void reclaim_unregister_node(struct node *node) +{ + return device_remove_file(&node->dev, &dev_attr_reclaim); +} +#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 429ae5339bfe..71cd1ceba191 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1163,321 +1163,339 @@ int fragmentation_index(struct zone *zone, unsigned int order) #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \ defined(CONFIG_NUMA) || defined(CONFIG_MEMCG) #ifdef CONFIG_ZONE_DMA -#define TEXT_FOR_DMA(xx) xx "_dma", +#define TEXT_FOR_DMA(xx, yy) [xx##_DMA] = yy "_dma", #else -#define TEXT_FOR_DMA(xx) +#define TEXT_FOR_DMA(xx, yy) #endif #ifdef CONFIG_ZONE_DMA32 -#define TEXT_FOR_DMA32(xx) xx "_dma32", +#define TEXT_FOR_DMA32(xx, yy) [xx##_DMA32] = yy "_dma32", #else -#define TEXT_FOR_DMA32(xx) +#define TEXT_FOR_DMA32(xx, yy) #endif #ifdef CONFIG_HIGHMEM -#define TEXT_FOR_HIGHMEM(xx) xx "_high", +#define TEXT_FOR_HIGHMEM(xx, yy) [xx##_HIGH] = yy "_high", #else -#define TEXT_FOR_HIGHMEM(xx) +#define TEXT_FOR_HIGHMEM(xx, yy) #endif #ifdef CONFIG_ZONE_DEVICE -#define TEXT_FOR_DEVICE(xx) xx "_device", +#define TEXT_FOR_DEVICE(xx, yy) [xx##_DEVICE] = yy "_device", #else -#define TEXT_FOR_DEVICE(xx) +#define TEXT_FOR_DEVICE(xx, yy) #endif -#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \ - TEXT_FOR_HIGHMEM(xx) xx "_movable", \ - TEXT_FOR_DEVICE(xx) +#define TEXTS_FOR_ZONES(xx, yy) \ + TEXT_FOR_DMA(xx, yy) \ + TEXT_FOR_DMA32(xx, yy) \ + [xx##_NORMAL] = yy "_normal", \ + TEXT_FOR_HIGHMEM(xx, yy) \ + [xx##_MOVABLE] = yy "_movable", \ + TEXT_FOR_DEVICE(xx, yy) const char * const vmstat_text[] = { /* enum zone_stat_item counters */ - "nr_free_pages", - "nr_free_pages_blocks", - "nr_zone_inactive_anon", - "nr_zone_active_anon", - "nr_zone_inactive_file", - "nr_zone_active_file", - "nr_zone_unevictable", - "nr_zone_write_pending", - "nr_mlock", +#define I(x) (x) + [I(NR_FREE_PAGES)] = "nr_free_pages", + [I(NR_FREE_PAGES_BLOCKS)] = "nr_free_pages_blocks", + [I(NR_ZONE_INACTIVE_ANON)] = "nr_zone_inactive_anon", + [I(NR_ZONE_ACTIVE_ANON)] = "nr_zone_active_anon", + [I(NR_ZONE_INACTIVE_FILE)] = "nr_zone_inactive_file", + [I(NR_ZONE_ACTIVE_FILE)] = "nr_zone_active_file", + [I(NR_ZONE_UNEVICTABLE)] = "nr_zone_unevictable", + [I(NR_ZONE_WRITE_PENDING)] = "nr_zone_write_pending", + [I(NR_MLOCK)] = "nr_mlock", #if IS_ENABLED(CONFIG_ZSMALLOC) - "nr_zspages", + [I(NR_ZSPAGES)] = "nr_zspages", #endif - "nr_free_cma", + [I(NR_FREE_CMA_PAGES)] = "nr_free_cma", #ifdef CONFIG_UNACCEPTED_MEMORY - "nr_unaccepted", + [I(NR_UNACCEPTED)] = "nr_unaccepted", #endif +#undef I /* enum numa_stat_item counters */ +#define I(x) (NR_VM_ZONE_STAT_ITEMS + x) #ifdef CONFIG_NUMA - "numa_hit", - "numa_miss", - "numa_foreign", - "numa_interleave", - "numa_local", - "numa_other", + [I(NUMA_HIT)] = "numa_hit", + [I(NUMA_MISS)] = "numa_miss", + [I(NUMA_FOREIGN)] = "numa_foreign", + [I(NUMA_INTERLEAVE_HIT)] = "numa_interleave", + [I(NUMA_LOCAL)] = "numa_local", + [I(NUMA_OTHER)] = "numa_other", #endif +#undef I /* enum node_stat_item counters */ - "nr_inactive_anon", - "nr_active_anon", - "nr_inactive_file", - "nr_active_file", - "nr_unevictable", - "nr_slab_reclaimable", - "nr_slab_unreclaimable", - "nr_isolated_anon", - "nr_isolated_file", - "workingset_nodes", - "workingset_refault_anon", - "workingset_refault_file", - "workingset_activate_anon", - 
"workingset_activate_file", - "workingset_restore_anon", - "workingset_restore_file", - "workingset_nodereclaim", - "nr_anon_pages", - "nr_mapped", - "nr_file_pages", - "nr_dirty", - "nr_writeback", - "nr_writeback_temp", - "nr_shmem", - "nr_shmem_hugepages", - "nr_shmem_pmdmapped", - "nr_file_hugepages", - "nr_file_pmdmapped", - "nr_anon_transparent_hugepages", - "nr_vmscan_write", - "nr_vmscan_immediate_reclaim", - "nr_dirtied", - "nr_written", - "nr_throttled_written", - "nr_kernel_misc_reclaimable", - "nr_foll_pin_acquired", - "nr_foll_pin_released", - "nr_kernel_stack", +#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + x) + [I(NR_INACTIVE_ANON)] = "nr_inactive_anon", + [I(NR_ACTIVE_ANON)] = "nr_active_anon", + [I(NR_INACTIVE_FILE)] = "nr_inactive_file", + [I(NR_ACTIVE_FILE)] = "nr_active_file", + [I(NR_UNEVICTABLE)] = "nr_unevictable", + [I(NR_SLAB_RECLAIMABLE_B)] = "nr_slab_reclaimable", + [I(NR_SLAB_UNRECLAIMABLE_B)] = "nr_slab_unreclaimable", + [I(NR_ISOLATED_ANON)] = "nr_isolated_anon", + [I(NR_ISOLATED_FILE)] = "nr_isolated_file", + [I(WORKINGSET_NODES)] = "workingset_nodes", + [I(WORKINGSET_REFAULT_ANON)] = "workingset_refault_anon", + [I(WORKINGSET_REFAULT_FILE)] = "workingset_refault_file", + [I(WORKINGSET_ACTIVATE_ANON)] = "workingset_activate_anon", + [I(WORKINGSET_ACTIVATE_FILE)] = "workingset_activate_file", + [I(WORKINGSET_RESTORE_ANON)] = "workingset_restore_anon", + [I(WORKINGSET_RESTORE_FILE)] = "workingset_restore_file", + [I(WORKINGSET_NODERECLAIM)] = "workingset_nodereclaim", + [I(NR_ANON_MAPPED)] = "nr_anon_pages", + [I(NR_FILE_MAPPED)] = "nr_mapped", + [I(NR_FILE_PAGES)] = "nr_file_pages", + [I(NR_FILE_DIRTY)] = "nr_dirty", + [I(NR_WRITEBACK)] = "nr_writeback", + [I(NR_SHMEM)] = "nr_shmem", + [I(NR_SHMEM_THPS)] = "nr_shmem_hugepages", + [I(NR_SHMEM_PMDMAPPED)] = "nr_shmem_pmdmapped", + [I(NR_FILE_THPS)] = "nr_file_hugepages", + [I(NR_FILE_PMDMAPPED)] = "nr_file_pmdmapped", + [I(NR_ANON_THPS)] = "nr_anon_transparent_hugepages", + [I(NR_VMSCAN_WRITE)] = "nr_vmscan_write", + [I(NR_VMSCAN_IMMEDIATE)] = "nr_vmscan_immediate_reclaim", + [I(NR_DIRTIED)] = "nr_dirtied", + [I(NR_WRITTEN)] = "nr_written", + [I(NR_THROTTLED_WRITTEN)] = "nr_throttled_written", + [I(NR_KERNEL_MISC_RECLAIMABLE)] = "nr_kernel_misc_reclaimable", + [I(NR_FOLL_PIN_ACQUIRED)] = "nr_foll_pin_acquired", + [I(NR_FOLL_PIN_RELEASED)] = "nr_foll_pin_released", + [I(NR_KERNEL_STACK_KB)] = "nr_kernel_stack", #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) - "nr_shadow_call_stack", + [I(NR_KERNEL_SCS_KB)] = "nr_shadow_call_stack", #endif - "nr_page_table_pages", - "nr_sec_page_table_pages", + [I(NR_PAGETABLE)] = "nr_page_table_pages", + [I(NR_SECONDARY_PAGETABLE)] = "nr_sec_page_table_pages", #ifdef CONFIG_IOMMU_SUPPORT - "nr_iommu_pages", + [I(NR_IOMMU_PAGES)] = "nr_iommu_pages", #endif #ifdef CONFIG_SWAP - "nr_swapcached", + [I(NR_SWAPCACHE)] = "nr_swapcached", #endif #ifdef CONFIG_NUMA_BALANCING - "pgpromote_success", - "pgpromote_candidate", + [I(PGPROMOTE_SUCCESS)] = "pgpromote_success", + [I(PGPROMOTE_CANDIDATE)] = "pgpromote_candidate", #endif - "pgdemote_kswapd", - "pgdemote_direct", - "pgdemote_khugepaged", - "pgdemote_proactive", + [I(PGDEMOTE_KSWAPD)] = "pgdemote_kswapd", + [I(PGDEMOTE_DIRECT)] = "pgdemote_direct", + [I(PGDEMOTE_KHUGEPAGED)] = "pgdemote_khugepaged", + [I(PGDEMOTE_PROACTIVE)] = "pgdemote_proactive", #ifdef CONFIG_HUGETLB_PAGE - "nr_hugetlb", + [I(NR_HUGETLB)] = "nr_hugetlb", #endif - "nr_balloon_pages", - /* system-wide enum vm_stat_item counters */ - "nr_dirty_threshold", - 
"nr_dirty_background_threshold", - "nr_memmap_pages", - "nr_memmap_boot_pages", + [I(NR_BALLOON_PAGES)] = "nr_balloon_pages", +#undef I -#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) + /* system-wide enum vm_stat_item counters */ +#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \ + NR_VM_NODE_STAT_ITEMS + x) + [I(NR_DIRTY_THRESHOLD)] = "nr_dirty_threshold", + [I(NR_DIRTY_BG_THRESHOLD)] = "nr_dirty_background_threshold", + [I(NR_MEMMAP_PAGES)] = "nr_memmap_pages", + [I(NR_MEMMAP_BOOT_PAGES)] = "nr_memmap_boot_pages", +#undef I + +#if defined(CONFIG_VM_EVENT_COUNTERS) /* enum vm_event_item counters */ - "pgpgin", - "pgpgout", - "pswpin", - "pswpout", - - TEXTS_FOR_ZONES("pgalloc") - TEXTS_FOR_ZONES("allocstall") - TEXTS_FOR_ZONES("pgskip") - - "pgfree", - "pgactivate", - "pgdeactivate", - "pglazyfree", - - "pgfault", - "pgmajfault", - "pglazyfreed", - - "pgrefill", - "pgreuse", - "pgsteal_kswapd", - "pgsteal_direct", - "pgsteal_khugepaged", - "pgsteal_proactive", - "pgscan_kswapd", - "pgscan_direct", - "pgscan_khugepaged", - "pgscan_proactive", - "pgscan_direct_throttle", - "pgscan_anon", - "pgscan_file", - "pgsteal_anon", - "pgsteal_file", +#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \ + NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS + x) + + [I(PGPGIN)] = "pgpgin", + [I(PGPGOUT)] = "pgpgout", + [I(PSWPIN)] = "pswpin", + [I(PSWPOUT)] = "pswpout", + +#define OFF (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \ + NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS) + TEXTS_FOR_ZONES(OFF+PGALLOC, "pgalloc") + TEXTS_FOR_ZONES(OFF+ALLOCSTALL, "allocstall") + TEXTS_FOR_ZONES(OFF+PGSCAN_SKIP, "pgskip") +#undef OFF + + [I(PGFREE)] = "pgfree", + [I(PGACTIVATE)] = "pgactivate", + [I(PGDEACTIVATE)] = "pgdeactivate", + [I(PGLAZYFREE)] = "pglazyfree", + + [I(PGFAULT)] = "pgfault", + [I(PGMAJFAULT)] = "pgmajfault", + [I(PGLAZYFREED)] = "pglazyfreed", + + [I(PGREFILL)] = "pgrefill", + [I(PGREUSE)] = "pgreuse", + [I(PGSTEAL_KSWAPD)] = "pgsteal_kswapd", + [I(PGSTEAL_DIRECT)] = "pgsteal_direct", + [I(PGSTEAL_KHUGEPAGED)] = "pgsteal_khugepaged", + [I(PGSTEAL_PROACTIVE)] = "pgsteal_proactive", + [I(PGSCAN_KSWAPD)] = "pgscan_kswapd", + [I(PGSCAN_DIRECT)] = "pgscan_direct", + [I(PGSCAN_KHUGEPAGED)] = "pgscan_khugepaged", + [I(PGSCAN_PROACTIVE)] = "pgscan_proactive", + [I(PGSCAN_DIRECT_THROTTLE)] = "pgscan_direct_throttle", + [I(PGSCAN_ANON)] = "pgscan_anon", + [I(PGSCAN_FILE)] = "pgscan_file", + [I(PGSTEAL_ANON)] = "pgsteal_anon", + [I(PGSTEAL_FILE)] = "pgsteal_file", #ifdef CONFIG_NUMA - "zone_reclaim_success", - "zone_reclaim_failed", + [I(PGSCAN_ZONE_RECLAIM_SUCCESS)] = "zone_reclaim_success", + [I(PGSCAN_ZONE_RECLAIM_FAILED)] = "zone_reclaim_failed", #endif - "pginodesteal", - "slabs_scanned", - "kswapd_inodesteal", - "kswapd_low_wmark_hit_quickly", - "kswapd_high_wmark_hit_quickly", - "pageoutrun", + [I(PGINODESTEAL)] = "pginodesteal", + [I(SLABS_SCANNED)] = "slabs_scanned", + [I(KSWAPD_INODESTEAL)] = "kswapd_inodesteal", + [I(KSWAPD_LOW_WMARK_HIT_QUICKLY)] = "kswapd_low_wmark_hit_quickly", + [I(KSWAPD_HIGH_WMARK_HIT_QUICKLY)] = "kswapd_high_wmark_hit_quickly", + [I(PAGEOUTRUN)] = "pageoutrun", - "pgrotated", + [I(PGROTATED)] = "pgrotated", - "drop_pagecache", - "drop_slab", - "oom_kill", + [I(DROP_PAGECACHE)] = "drop_pagecache", + [I(DROP_SLAB)] = "drop_slab", + [I(OOM_KILL)] = "oom_kill", #ifdef CONFIG_NUMA_BALANCING - "numa_pte_updates", - "numa_huge_pte_updates", - "numa_hint_faults", - "numa_hint_faults_local", - "numa_pages_migrated", - "numa_task_migrated", - 
"numa_task_swapped", + [I(NUMA_PTE_UPDATES)] = "numa_pte_updates", + [I(NUMA_HUGE_PTE_UPDATES)] = "numa_huge_pte_updates", + [I(NUMA_HINT_FAULTS)] = "numa_hint_faults", + [I(NUMA_HINT_FAULTS_LOCAL)] = "numa_hint_faults_local", + [I(NUMA_PAGE_MIGRATE)] = "numa_pages_migrated", #endif #ifdef CONFIG_MIGRATION - "pgmigrate_success", - "pgmigrate_fail", - "thp_migration_success", - "thp_migration_fail", - "thp_migration_split", + [I(PGMIGRATE_SUCCESS)] = "pgmigrate_success", + [I(PGMIGRATE_FAIL)] = "pgmigrate_fail", + [I(THP_MIGRATION_SUCCESS)] = "thp_migration_success", + [I(THP_MIGRATION_FAIL)] = "thp_migration_fail", + [I(THP_MIGRATION_SPLIT)] = "thp_migration_split", #endif #ifdef CONFIG_COMPACTION - "compact_migrate_scanned", - "compact_free_scanned", - "compact_isolated", - "compact_stall", - "compact_fail", - "compact_success", - "compact_daemon_wake", - "compact_daemon_migrate_scanned", - "compact_daemon_free_scanned", + [I(COMPACTMIGRATE_SCANNED)] = "compact_migrate_scanned", + [I(COMPACTFREE_SCANNED)] = "compact_free_scanned", + [I(COMPACTISOLATED)] = "compact_isolated", + [I(COMPACTSTALL)] = "compact_stall", + [I(COMPACTFAIL)] = "compact_fail", + [I(COMPACTSUCCESS)] = "compact_success", + [I(KCOMPACTD_WAKE)] = "compact_daemon_wake", + [I(KCOMPACTD_MIGRATE_SCANNED)] = "compact_daemon_migrate_scanned", + [I(KCOMPACTD_FREE_SCANNED)] = "compact_daemon_free_scanned", #endif #ifdef CONFIG_HUGETLB_PAGE - "htlb_buddy_alloc_success", - "htlb_buddy_alloc_fail", + [I(HTLB_BUDDY_PGALLOC)] = "htlb_buddy_alloc_success", + [I(HTLB_BUDDY_PGALLOC_FAIL)] = "htlb_buddy_alloc_fail", #endif #ifdef CONFIG_CMA - "cma_alloc_success", - "cma_alloc_fail", + [I(CMA_ALLOC_SUCCESS)] = "cma_alloc_success", + [I(CMA_ALLOC_FAIL)] = "cma_alloc_fail", #endif - "unevictable_pgs_culled", - "unevictable_pgs_scanned", - "unevictable_pgs_rescued", - "unevictable_pgs_mlocked", - "unevictable_pgs_munlocked", - "unevictable_pgs_cleared", - "unevictable_pgs_stranded", + [I(UNEVICTABLE_PGCULLED)] = "unevictable_pgs_culled", + [I(UNEVICTABLE_PGSCANNED)] = "unevictable_pgs_scanned", + [I(UNEVICTABLE_PGRESCUED)] = "unevictable_pgs_rescued", + [I(UNEVICTABLE_PGMLOCKED)] = "unevictable_pgs_mlocked", + [I(UNEVICTABLE_PGMUNLOCKED)] = "unevictable_pgs_munlocked", + [I(UNEVICTABLE_PGCLEARED)] = "unevictable_pgs_cleared", + [I(UNEVICTABLE_PGSTRANDED)] = "unevictable_pgs_stranded", #ifdef CONFIG_TRANSPARENT_HUGEPAGE - "thp_fault_alloc", - "thp_fault_fallback", - "thp_fault_fallback_charge", - "thp_collapse_alloc", - "thp_collapse_alloc_failed", - "thp_file_alloc", - "thp_file_fallback", - "thp_file_fallback_charge", - "thp_file_mapped", - "thp_split_page", - "thp_split_page_failed", - "thp_deferred_split_page", - "thp_underused_split_page", - "thp_split_pmd", - "thp_scan_exceed_none_pte", - "thp_scan_exceed_swap_pte", - "thp_scan_exceed_share_pte", + [I(THP_FAULT_ALLOC)] = "thp_fault_alloc", + [I(THP_FAULT_FALLBACK)] = "thp_fault_fallback", + [I(THP_FAULT_FALLBACK_CHARGE)] = "thp_fault_fallback_charge", + [I(THP_COLLAPSE_ALLOC)] = "thp_collapse_alloc", + [I(THP_COLLAPSE_ALLOC_FAILED)] = "thp_collapse_alloc_failed", + [I(THP_FILE_ALLOC)] = "thp_file_alloc", + [I(THP_FILE_FALLBACK)] = "thp_file_fallback", + [I(THP_FILE_FALLBACK_CHARGE)] = "thp_file_fallback_charge", + [I(THP_FILE_MAPPED)] = "thp_file_mapped", + [I(THP_SPLIT_PAGE)] = "thp_split_page", + [I(THP_SPLIT_PAGE_FAILED)] = "thp_split_page_failed", + [I(THP_DEFERRED_SPLIT_PAGE)] = "thp_deferred_split_page", + [I(THP_UNDERUSED_SPLIT_PAGE)] = "thp_underused_split_page", + 
[I(THP_SPLIT_PMD)] = "thp_split_pmd", + [I(THP_SCAN_EXCEED_NONE_PTE)] = "thp_scan_exceed_none_pte", + [I(THP_SCAN_EXCEED_SWAP_PTE)] = "thp_scan_exceed_swap_pte", + [I(THP_SCAN_EXCEED_SHARED_PTE)] = "thp_scan_exceed_share_pte", #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD - "thp_split_pud", + [I(THP_SPLIT_PUD)] = "thp_split_pud", #endif - "thp_zero_page_alloc", - "thp_zero_page_alloc_failed", - "thp_swpout", - "thp_swpout_fallback", + [I(THP_ZERO_PAGE_ALLOC)] = "thp_zero_page_alloc", + [I(THP_ZERO_PAGE_ALLOC_FAILED)] = "thp_zero_page_alloc_failed", + [I(THP_SWPOUT)] = "thp_swpout", + [I(THP_SWPOUT_FALLBACK)] = "thp_swpout_fallback", #endif #ifdef CONFIG_MEMORY_BALLOON - "balloon_inflate", - "balloon_deflate", + [I(BALLOON_INFLATE)] = "balloon_inflate", + [I(BALLOON_DEFLATE)] = "balloon_deflate", #ifdef CONFIG_BALLOON_COMPACTION - "balloon_migrate", + [I(BALLOON_MIGRATE)] = "balloon_migrate", #endif #endif /* CONFIG_MEMORY_BALLOON */ #ifdef CONFIG_DEBUG_TLBFLUSH - "nr_tlb_remote_flush", - "nr_tlb_remote_flush_received", - "nr_tlb_local_flush_all", - "nr_tlb_local_flush_one", + [I(NR_TLB_REMOTE_FLUSH)] = "nr_tlb_remote_flush", + [I(NR_TLB_REMOTE_FLUSH_RECEIVED)] = "nr_tlb_remote_flush_received", + [I(NR_TLB_LOCAL_FLUSH_ALL)] = "nr_tlb_local_flush_all", + [I(NR_TLB_LOCAL_FLUSH_ONE)] = "nr_tlb_local_flush_one", #endif /* CONFIG_DEBUG_TLBFLUSH */ #ifdef CONFIG_SWAP - "swap_ra", - "swap_ra_hit", - "swpin_zero", - "swpout_zero", + [I(SWAP_RA)] = "swap_ra", + [I(SWAP_RA_HIT)] = "swap_ra_hit", + [I(SWPIN_ZERO)] = "swpin_zero", + [I(SWPOUT_ZERO)] = "swpout_zero", #ifdef CONFIG_KSM - "ksm_swpin_copy", + [I(KSM_SWPIN_COPY)] = "ksm_swpin_copy", #endif #endif #ifdef CONFIG_KSM - "cow_ksm", + [I(COW_KSM)] = "cow_ksm", #endif #ifdef CONFIG_ZSWAP - "zswpin", - "zswpout", - "zswpwb", + [I(ZSWPIN)] = "zswpin", + [I(ZSWPOUT)] = "zswpout", + [I(ZSWPWB)] = "zswpwb", #endif #ifdef CONFIG_X86 - "direct_map_level2_splits", - "direct_map_level3_splits", - "direct_map_level2_collapses", - "direct_map_level3_collapses", + [I(DIRECT_MAP_LEVEL2_SPLIT)] = "direct_map_level2_splits", + [I(DIRECT_MAP_LEVEL3_SPLIT)] = "direct_map_level3_splits", + [I(DIRECT_MAP_LEVEL2_COLLAPSE)] = "direct_map_level2_collapses", + [I(DIRECT_MAP_LEVEL3_COLLAPSE)] = "direct_map_level3_collapses", #endif #ifdef CONFIG_PER_VMA_LOCK_STATS - "vma_lock_success", - "vma_lock_abort", - "vma_lock_retry", - "vma_lock_miss", + [I(VMA_LOCK_SUCCESS)] = "vma_lock_success", + [I(VMA_LOCK_ABORT)] = "vma_lock_abort", + [I(VMA_LOCK_RETRY)] = "vma_lock_retry", + [I(VMA_LOCK_MISS)] = "vma_lock_miss", #endif #ifdef CONFIG_DEBUG_STACK_USAGE - "kstack_1k", + [I(KSTACK_1K)] = "kstack_1k", #if THREAD_SIZE > 1024 - "kstack_2k", + [I(KSTACK_2K)] = "kstack_2k", #endif #if THREAD_SIZE > 2048 - "kstack_4k", + [I(KSTACK_4K)] = "kstack_4k", #endif #if THREAD_SIZE > 4096 - "kstack_8k", + [I(KSTACK_8K)] = "kstack_8k", #endif #if THREAD_SIZE > 8192 - "kstack_16k", + [I(KSTACK_16K)] = "kstack_16k", #endif #if THREAD_SIZE > 16384 - "kstack_32k", + [I(KSTACK_32K)] = "kstack_32k", #endif #if THREAD_SIZE > 32768 - "kstack_64k", + [I(KSTACK_64K)] = "kstack_64k", #endif #if THREAD_SIZE > 65536 - "kstack_rest", + [I(KSTACK_REST)] = "kstack_rest", #endif #endif -#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ +#undef I +#endif /* CONFIG_VM_EVENT_COUNTERS */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ @@ -1869,7 +1887,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) if (*pos >= NR_VMSTAT_ITEMS) return NULL; - 
BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS); + BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) != NR_VMSTAT_ITEMS); fold_vm_numa_events(); v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL); m->private = v; diff --git a/mm/zpdesc.h b/mm/zpdesc.h index d3df316e5bb7..25bf5ea0beb8 100644 --- a/mm/zpdesc.h +++ b/mm/zpdesc.h @@ -152,10 +152,9 @@ static inline struct zpdesc *pfn_zpdesc(unsigned long pfn) return page_zpdesc(pfn_to_page(pfn)); } -static inline void __zpdesc_set_movable(struct zpdesc *zpdesc, - const struct movable_operations *mops) +static inline void __zpdesc_set_movable(struct zpdesc *zpdesc) { - __SetPageMovable(zpdesc_page(zpdesc), mops); + SetPageMovableOps(zpdesc_page(zpdesc)); } static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc) @@ -163,16 +162,6 @@ static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc) __SetPageZsmalloc(zpdesc_page(zpdesc)); } -static inline void __zpdesc_clear_zsmalloc(struct zpdesc *zpdesc) -{ - __ClearPageZsmalloc(zpdesc_page(zpdesc)); -} - -static inline bool zpdesc_is_isolated(struct zpdesc *zpdesc) -{ - return PageIsolated(zpdesc_page(zpdesc)); -} - static inline struct zone *zpdesc_zone(struct zpdesc *zpdesc) { return page_zone(zpdesc_page(zpdesc)); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 999b513c7fdf..2c5e56a65354 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -244,6 +244,7 @@ static inline void free_zpdesc(struct zpdesc *zpdesc) { struct page *page = zpdesc_page(zpdesc); + /* PageZsmalloc is sticky until the page is freed to the buddy. */ __free_page(page); } @@ -876,11 +877,10 @@ static void reset_zpdesc(struct zpdesc *zpdesc) { struct page *page = zpdesc_page(zpdesc); - __ClearPageMovable(page); ClearPagePrivate(page); zpdesc->zspage = NULL; zpdesc->next = NULL; - __ClearPageZsmalloc(page); + /* PageZsmalloc is sticky until the page is freed to the buddy. */ } static int trylock_zspage(struct zspage *zspage) @@ -1043,6 +1043,9 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, if (!zspage) return NULL; + if (!IS_ENABLED(CONFIG_COMPACTION)) + gfp &= ~__GFP_MOVABLE; + zspage->magic = ZSPAGE_MAGIC; zspage->pool = pool; zspage->class = class->index; @@ -1055,7 +1058,6 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, if (!zpdesc) { while (--i >= 0) { zpdesc_dec_zone_page_state(zpdescs[i]); - __zpdesc_clear_zsmalloc(zpdescs[i]); free_zpdesc(zpdescs[i]); } cache_free_zspage(pool, zspage); @@ -1686,8 +1688,6 @@ static void lock_zspage(struct zspage *zspage) #ifdef CONFIG_COMPACTION -static const struct movable_operations zsmalloc_mops; - static void replace_sub_page(struct size_class *class, struct zspage *zspage, struct zpdesc *newzpdesc, struct zpdesc *oldzpdesc) { @@ -1710,18 +1710,17 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, set_first_obj_offset(newzpdesc, first_obj_offset); if (unlikely(ZsHugePage(zspage))) newzpdesc->handle = oldzpdesc->handle; - __zpdesc_set_movable(newzpdesc, &zsmalloc_mops); + __zpdesc_set_movable(newzpdesc); } static bool zs_page_isolate(struct page *page, isolate_mode_t mode) { /* - * Page is locked so zspage couldn't be destroyed. For detail, look at - * lock_zspage in free_zspage. + * Page is locked so zspage can't be destroyed concurrently + * (see free_zspage()). But if the page was already destroyed + * (see reset_zpdesc()), refuse isolation here. 
*/ - VM_BUG_ON_PAGE(PageIsolated(page), page); - - return true; + return page_zpdesc(page)->zspage; } static int zs_page_migrate(struct page *newpage, struct page *page, @@ -1739,7 +1738,15 @@ static int zs_page_migrate(struct page *newpage, struct page *page, unsigned long old_obj, new_obj; unsigned int obj_idx; - VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc)); + /* + * TODO: nothing prevents a zspage from getting destroyed while + * it is isolated for migration, as the page lock is temporarily + * dropped after zs_page_isolate() succeeded: we should rework that + * and defer destroying such pages once they are un-isolated (putback) + * instead. + */ + if (!zpdesc->zspage) + return MIGRATEPAGE_SUCCESS; /* The page is locked, so this pointer must remain valid */ zspage = get_zspage(zpdesc); @@ -1811,10 +1818,9 @@ static int zs_page_migrate(struct page *newpage, struct page *page, static void zs_page_putback(struct page *page) { - VM_BUG_ON_PAGE(!PageIsolated(page), page); } -static const struct movable_operations zsmalloc_mops = { +const struct movable_operations zsmalloc_mops = { .isolate_page = zs_page_isolate, .migrate_page = zs_page_migrate, .putback_page = zs_page_putback, @@ -1877,7 +1883,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) do { WARN_ON(!zpdesc_trylock(zpdesc)); - __zpdesc_set_movable(zpdesc, &zsmalloc_mops); + __zpdesc_set_movable(zpdesc); zpdesc_unlock(zpdesc); } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL); } diff --git a/mm/zswap.c b/mm/zswap.c index 455e9425c5f5..3c0fd8a13718 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1070,9 +1070,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry, struct mempolicy *mpol; bool folio_was_allocated; struct swap_info_struct *si; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - }; int ret = 0; /* try to allocate swap cache folio */ @@ -1134,7 +1131,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, folio_set_reclaim(folio); /* start writeback */ - __swap_writepage(folio, &wbc); + __swap_writepage(folio, NULL); out: if (ret && ret != -EEXIST) { |
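
[Reviewer note, not part of the patch] apply_proportional_protection() above is a straight extraction of the existing memcg protection scaling, now also reused on the MGLRU path via get_nr_to_scan(). For readers who have not met that math, the standalone sketch below reproduces only the arithmetic with made-up numbers; scale_scan() and max_ul() are invented names, and SWAP_CLUSTER_MAX is assumed to be the kernel's 32.

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL	/* value assumed from include/linux/swap.h */

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

/*
 * Standalone copy of the scaling done by apply_proportional_protection():
 * reduce scan pressure by the protected share of the cgroup's usage, but
 * never target fewer than SWAP_CLUSTER_MAX pages.
 */
static unsigned long scale_scan(unsigned long scan, unsigned long cgroup_size,
				unsigned long protection)
{
	/* Avoid TOCTOU between the usage and protection reads, as above. */
	cgroup_size = max_ul(cgroup_size, protection);

	/* Reduce pressure by how much of the usage is under protection. */
	scan -= scan * protection / (cgroup_size + 1);

	return max_ul(scan, SWAP_CLUSTER_MAX);
}

int main(void)
{
	/* 1 GiB of usage (262144 x 4 KiB pages), memory.low = 768 MiB
	 * (196608 pages), nominal scan target of 4096 pages. */
	printf("scaled scan target: %lu pages\n",
	       scale_scan(4096, 262144, 196608));	/* prints 1025 */
	return 0;
}

In this example three quarters of the cgroup's usage sits under memory.low, so only about a quarter of the nominal scan pressure (1025 of 4096 pages) survives, and the result can never drop below the SWAP_CLUSTER_MAX floor.
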
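
[Reviewer note, not part of the patch] The vmstat_text[] rework above switches the table from position-dependent string literals to designated initializers keyed on the counter enums, which is what allows the BUILD_BUG_ON() at the end to tighten from '<' to '!='. The minimal illustration below shows the pattern in plain C11; the enum and names are invented for the example and are not kernel symbols.

#include <stdio.h>

/* Toy counter enum standing in for the vmstat item enums. */
enum stat_item { NR_FOO, NR_BAR, NR_BAZ, NR_STAT_ITEMS };

/*
 * Designated initializers tie each name to its enum value, so reordering or
 * extending the enum can no longer silently shift later strings; a forgotten
 * trailing entry makes the array shorter than the enum, which the size check
 * below (the analogue of the tightened BUILD_BUG_ON) rejects at compile time.
 */
static const char *const stat_text[] = {
	[NR_FOO] = "nr_foo",
	[NR_BAR] = "nr_bar",
	[NR_BAZ] = "nr_baz",
};

_Static_assert(sizeof(stat_text) / sizeof(stat_text[0]) == NR_STAT_ITEMS,
	       "stat_text[] must have exactly one entry per stat_item");

int main(void)
{
	for (int i = 0; i < NR_STAT_ITEMS; i++)
		printf("%s\n", stat_text[i]);
	return 0;
}

With positional initializers a single forgotten string silently shifted every later name; with designated initializers the same mistake either leaves a visible NULL hole or trips the size check.
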