author | Russell King (Oracle) <rmk+kernel@armlinux.org.uk> | 2024-06-10 12:03:21 +0100
---|---|---
committer | Russell King (Oracle) <rmk+kernel@armlinux.org.uk> | 2024-06-10 12:03:21 +0100
commit | 594ce0b8a998aa4d05827cd7c0d0dcec9a1e3ae2 (patch) |
tree | 070bd60a8fda15e5f47339d3f6888a0fe2ca6fe9 /mm |
parent | 616501eccb58615f8f352a29239ea6c6fc5e6546 (diff) |
parent | e3cf20e5c68df604315ab30bdbe15dc8a5da556b (diff) |
Merge topic branches 'clkdev' and 'fixes' into for-linus
Diffstat (limited to 'mm')
105 files changed, 11291 insertions, 6844 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index ffc3a2ba3a8c..b4cb45255a54 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -45,22 +45,6 @@ config ZSWAP_DEFAULT_ON The selection made here can be overridden by using the kernel command line 'zswap.enabled=' option. -config ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON - bool "Invalidate zswap entries when pages are loaded" - depends on ZSWAP - help - If selected, exclusive loads for zswap will be enabled at boot, - otherwise it will be disabled. - - If exclusive loads are enabled, when a page is loaded from zswap, - the zswap entry is invalidated at once, as opposed to leaving it - in zswap until the swap entry is freed. - - This avoids having two copies of the same page in memory - (compressed and uncompressed) after faulting in a page from zswap. - The cost is that if the page was never dirtied and needs to be - swapped out again, it will be re-compressed. - config ZSWAP_SHRINKER_DEFAULT_ON bool "Shrink the zswap pool on memory pressure" depends on ZSWAP @@ -349,10 +333,9 @@ config SHUFFLE_PAGE_ALLOCATOR While the randomization improves cache utilization it may negatively impact workloads on platforms without a cache. For - this reason, by default, the randomization is enabled only - after runtime detection of a direct-mapped memory-side-cache. - Otherwise, the randomization may be force enabled with the - 'page_alloc.shuffle' kernel command line parameter. + this reason, by default, the randomization is not enabled even + if SHUFFLE_PAGE_ALLOCATOR=y. The randomization may be force enabled + with the 'page_alloc.shuffle' kernel command line parameter. Say Y if unsure. @@ -489,7 +472,7 @@ config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP config HAVE_MEMBLOCK_PHYS_MAP bool -config HAVE_FAST_GUP +config HAVE_GUP_FAST depends on MMU bool @@ -599,7 +582,7 @@ config MEMORY_BALLOON # support for memory balloon compaction config BALLOON_COMPACTION bool "Allow for balloon memory compaction/migration" - def_bool y + default y depends on COMPACTION && MEMORY_BALLOON help Memory fragmentation introduced by ballooning might reduce @@ -614,7 +597,7 @@ config BALLOON_COMPACTION # support for memory compaction config COMPACTION bool "Allow for memory compaction" - def_bool y + default y select MIGRATION depends on MMU help @@ -637,7 +620,6 @@ config COMPACT_UNEVICTABLE_DEFAULT # support for free page reporting config PAGE_REPORTING bool "Free page reporting" - def_bool n help Free page reporting allows for the incremental acquisition of free pages from the buddy allocator for the purpose of reporting @@ -649,7 +631,7 @@ config PAGE_REPORTING # config MIGRATION bool "Page migration" - def_bool y + default y depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU help Allows the migration of the physical location of pages of processes @@ -867,6 +849,12 @@ config READ_ONLY_THP_FOR_FS endif # TRANSPARENT_HUGEPAGE # +# The architecture supports pgtable leaves that is larger than PAGE_SIZE +# +config PGTABLE_HAS_HUGE_LEAVES + def_bool TRANSPARENT_HUGEPAGE || HUGETLB_PAGE + +# # UP and nommu archs use km based percpu allocator # config NEED_PER_CPU_KM @@ -901,15 +889,6 @@ config CMA If unsure, say "n". -config CMA_DEBUG - bool "CMA debug messages (DEVELOPMENT)" - depends on DEBUG_KERNEL && CMA - help - Turns on debug messages in CMA. This produces KERN_DEBUG - messages for every CMA call as well as various messages while - processing calls such as dma_alloc_from_contiguous(). - This option does not affect warning and error messages. 
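Note on the CONFIG_CMA_DEBUG removal above: the pr_debug() calls that remain in mm/cma.c now follow the generic printk rules (the matching `#ifdef CONFIG_CMA_DEBUG ... #define DEBUG` block is removed from mm/cma.c further down in this diff). The following is a minimal, hedged sketch, not part of this diff, of the pattern that option relied on.

```c
/*
 * Hedged sketch, not part of this diff: the pattern the removed
 * CONFIG_CMA_DEBUG option enabled.  Defining DEBUG before including
 * printk.h makes pr_debug() emit KERN_DEBUG messages when
 * CONFIG_DYNAMIC_DEBUG is off; without it, pr_debug() compiles away
 * unless dynamic debug enables the call site at run time.
 */
#define DEBUG			/* what CONFIG_CMA_DEBUG used to imply */
#include <linux/printk.h>

static void cma_debug_example(void)
{
	pr_debug("cma: visible only with DEBUG or dynamic debug\n");
}
```

With dynamic debug built in, the same messages can still be enabled at run time, e.g. `echo 'file cma.c +p' > /sys/kernel/debug/dynamic_debug/control`.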
- config CMA_DEBUGFS bool "CMA debugfs interface" depends on CMA && DEBUG_FS @@ -926,14 +905,14 @@ config CMA_SYSFS config CMA_AREAS int "Maximum count of the CMA areas" depends on CMA - default 19 if NUMA - default 7 + default 20 if NUMA + default 8 help CMA allows to create CMA areas for particular purpose, mainly, used as device private area. This parameter sets the maximum number of CMA area in the system. - If unsure, leave the default value "7" in UMA and "19" in NUMA. + If unsure, leave the default value "8" in UMA and "20" in NUMA. config MEM_SOFT_DIRTY bool "Track memory changes" @@ -998,6 +977,12 @@ config IDLE_PAGE_TRACKING See Documentation/admin-guide/mm/idle_page_tracking.rst for more details. +# Architectures which implement cpu_dcache_is_aliasing() to query +# whether the data caches are aliased (VIVT or VIPT with dcache +# aliasing) need to select this. +config ARCH_HAS_CPU_CACHE_ALIASING + bool + config ARCH_HAS_CACHE_LINE_SIZE bool @@ -1261,6 +1246,9 @@ config LOCK_MM_AND_FIND_VMA config IOMMU_MM_DATA bool +config EXECMEM + bool + source "mm/damon/Kconfig" endmenu diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 321ab379994f..afc72fde0f03 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -64,11 +64,11 @@ config SLUB_DEBUG_ON help Boot with debugging on by default. SLUB boots by default with the runtime debug capabilities switched off. Enabling this is - equivalent to specifying the "slub_debug" parameter on boot. + equivalent to specifying the "slab_debug" parameter on boot. There is no support for more fine grained debug control like - possible with slub_debug=xxx. SLUB debugging may be switched + possible with slab_debug=xxx. SLUB debugging may be switched off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying - "slub_debug=-". + "slab_debug=-". 
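The new ARCH_HAS_CPU_CACHE_ALIASING symbol above only gates a query helper. As a hedged illustration, not from this diff, and assuming the helper is reachable via <linux/cacheinfo.h> with a hypothetical caller name, code that must avoid aliasing data caches would use it roughly like this:

```c
/*
 * Hedged usage sketch for cpu_dcache_is_aliasing(), the helper named
 * in the ARCH_HAS_CPU_CACHE_ALIASING comment above.  The header and
 * the caller name are assumptions for illustration only.
 */
#include <linux/cacheinfo.h>

static bool direct_map_is_coherent(void)
{
	/*
	 * VIVT or aliasing-VIPT D-caches can expose stale data through
	 * multiple kernel/user mappings of the same page.
	 */
	return !cpu_dcache_is_aliasing();
}
```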
config PAGE_OWNER bool "Track page owner" diff --git a/mm/Makefile b/mm/Makefile index e4b5b75aaec9..8fb85acda1b1 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,6 +5,7 @@ KASAN_SANITIZE_slab_common.o := n KASAN_SANITIZE_slub.o := n +KASAN_SANITIZE_kmemleak.o := n KCSAN_SANITIZE_kmemleak.o := n # These produce frequent data race reports: most of them are due to races on @@ -29,8 +30,7 @@ KCOV_INSTRUMENT_mmzone.o := n KCOV_INSTRUMENT_vmstat.o := n KCOV_INSTRUMENT_failslab.o := n -CFLAGS_init-mm.o += $(call cc-disable-warning, override-init) -CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides) +CFLAGS_init-mm.o += -Wno-override-init mmu-y := nommu.o mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ @@ -43,6 +43,10 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o endif +ifdef CONFIG_64BIT +mmu-$(CONFIG_MMU) += mseal.o +endif + obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page-writeback.o folio-compat.o \ readahead.o swap.o truncate.o vmscan.o shrinker.o \ @@ -134,3 +138,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o +obj-$(CONFIG_EXECMEM) += execmem.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e039d05304dd..e61bbb1bd622 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -39,6 +39,19 @@ struct workqueue_struct *bdi_wq; #include <linux/debugfs.h> #include <linux/seq_file.h> +struct wb_stats { + unsigned long nr_dirty; + unsigned long nr_io; + unsigned long nr_more_io; + unsigned long nr_dirty_time; + unsigned long nr_writeback; + unsigned long nr_reclaimable; + unsigned long nr_dirtied; + unsigned long nr_written; + unsigned long dirty_thresh; + unsigned long wb_thresh; +}; + static struct dentry *bdi_debug_root; static void bdi_debug_init(void) @@ -46,31 +59,68 @@ static void bdi_debug_init(void) bdi_debug_root = debugfs_create_dir("bdi", NULL); } -static int bdi_debug_stats_show(struct seq_file *m, void *v) +static void collect_wb_stats(struct wb_stats *stats, + struct bdi_writeback *wb) { - struct backing_dev_info *bdi = m->private; - struct bdi_writeback *wb = &bdi->wb; - unsigned long background_thresh; - unsigned long dirty_thresh; - unsigned long wb_thresh; - unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time; struct inode *inode; - nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0; spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_io_list) - nr_dirty++; + stats->nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_io_list) - nr_io++; + stats->nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_io_list) - nr_more_io++; + stats->nr_more_io++; list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) if (inode->i_state & I_DIRTY_TIME) - nr_dirty_time++; + stats->nr_dirty_time++; spin_unlock(&wb->list_lock); + stats->nr_writeback += wb_stat(wb, WB_WRITEBACK); + stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE); + stats->nr_dirtied += wb_stat(wb, WB_DIRTIED); + stats->nr_written += wb_stat(wb, WB_WRITTEN); + stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh); +} + +#ifdef CONFIG_CGROUP_WRITEBACK +static void bdi_collect_stats(struct backing_dev_info *bdi, + struct wb_stats *stats) +{ + struct bdi_writeback *wb; + + rcu_read_lock(); + list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { + if (!wb_tryget(wb)) + continue; + + collect_wb_stats(stats, wb); + wb_put(wb); + } + rcu_read_unlock(); +} +#else +static void 
bdi_collect_stats(struct backing_dev_info *bdi, + struct wb_stats *stats) +{ + collect_wb_stats(stats, &bdi->wb); +} +#endif + +static int bdi_debug_stats_show(struct seq_file *m, void *v) +{ + struct backing_dev_info *bdi = m->private; + unsigned long background_thresh; + unsigned long dirty_thresh; + struct wb_stats stats; + unsigned long tot_bw; + global_dirty_limits(&background_thresh, &dirty_thresh); - wb_thresh = wb_calc_thresh(wb, dirty_thresh); + + memset(&stats, 0, sizeof(stats)); + stats.dirty_thresh = dirty_thresh; + bdi_collect_stats(bdi, &stats); + tot_bw = atomic_long_read(&bdi->tot_write_bandwidth); seq_printf(m, "BdiWriteback: %10lu kB\n" @@ -87,37 +137,114 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "b_dirty_time: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", - (unsigned long) K(wb_stat(wb, WB_WRITEBACK)), - (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)), - K(wb_thresh), + K(stats.nr_writeback), + K(stats.nr_reclaimable), + K(stats.wb_thresh), K(dirty_thresh), K(background_thresh), - (unsigned long) K(wb_stat(wb, WB_DIRTIED)), - (unsigned long) K(wb_stat(wb, WB_WRITTEN)), - (unsigned long) K(wb->write_bandwidth), - nr_dirty, - nr_io, - nr_more_io, - nr_dirty_time, + K(stats.nr_dirtied), + K(stats.nr_written), + K(tot_bw), + stats.nr_dirty, + stats.nr_io, + stats.nr_more_io, + stats.nr_dirty_time, !list_empty(&bdi->bdi_list), bdi->wb.state); return 0; } DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats); +static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb, + struct wb_stats *stats) +{ + + seq_printf(m, + "WbCgIno: %10lu\n" + "WbWriteback: %10lu kB\n" + "WbReclaimable: %10lu kB\n" + "WbDirtyThresh: %10lu kB\n" + "WbDirtied: %10lu kB\n" + "WbWritten: %10lu kB\n" + "WbWriteBandwidth: %10lu kBps\n" + "b_dirty: %10lu\n" + "b_io: %10lu\n" + "b_more_io: %10lu\n" + "b_dirty_time: %10lu\n" + "state: %10lx\n\n", +#ifdef CONFIG_CGROUP_WRITEBACK + cgroup_ino(wb->memcg_css->cgroup), +#else + 1ul, +#endif + K(stats->nr_writeback), + K(stats->nr_reclaimable), + K(stats->wb_thresh), + K(stats->nr_dirtied), + K(stats->nr_written), + K(wb->avg_write_bandwidth), + stats->nr_dirty, + stats->nr_io, + stats->nr_more_io, + stats->nr_dirty_time, + wb->state); +} + +static int cgwb_debug_stats_show(struct seq_file *m, void *v) +{ + struct backing_dev_info *bdi = m->private; + unsigned long background_thresh; + unsigned long dirty_thresh; + struct bdi_writeback *wb; + + global_dirty_limits(&background_thresh, &dirty_thresh); + + rcu_read_lock(); + list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { + struct wb_stats stats = { .dirty_thresh = dirty_thresh }; + + if (!wb_tryget(wb)) + continue; + + collect_wb_stats(&stats, wb); + + /* + * Calculate thresh of wb in writeback cgroup which is min of + * thresh in global domain and thresh in cgroup domain. Drop + * rcu lock because cgwb_calc_thresh may sleep in + * cgroup_rstat_flush. We can do so here because we have a ref. 
+ */ + if (mem_cgroup_wb_domain(wb)) { + rcu_read_unlock(); + stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb)); + rcu_read_lock(); + } + + wb_stats_show(m, wb, &stats); + + wb_put(wb); + } + rcu_read_unlock(); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats); + static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) { bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); debugfs_create_file("stats", 0444, bdi->debug_dir, bdi, &bdi_debug_stats_fops); + debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi, + &cgwb_debug_stats_fops); } static void bdi_debug_unregister(struct backing_dev_info *bdi) { debugfs_remove_recursive(bdi->debug_dir); } -#else +#else /* CONFIG_DEBUG_FS */ static inline void bdi_debug_init(void) { } @@ -128,7 +255,7 @@ static inline void bdi_debug_register(struct backing_dev_info *bdi, static inline void bdi_debug_unregister(struct backing_dev_info *bdi) { } -#endif +#endif /* CONFIG_DEBUG_FS */ static ssize_t read_ahead_kb_store(struct device *dev, struct device_attribute *attr, @@ -372,31 +499,6 @@ static int __init default_bdi_init(void) } subsys_initcall(default_bdi_init); -/* - * This function is used when the first inode for this wb is marked dirty. It - * wakes-up the corresponding bdi thread which should then take care of the - * periodic background write-out of dirty inodes. Since the write-out would - * starts only 'dirty_writeback_interval' centisecs from now anyway, we just - * set up a timer which wakes the bdi thread up later. - * - * Note, we wouldn't bother setting up the timer, but this function is on the - * fast-path (used by '__mark_inode_dirty()'), so we save few context switches - * by delaying the wake-up. - * - * We have to be careful not to postpone flush work if it is scheduled for - * earlier. Thus we use queue_delayed_work(). 
- */ -void wb_wakeup_delayed(struct bdi_writeback *wb) -{ - unsigned long timeout; - - timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - spin_lock_irq(&wb->work_lock); - if (test_bit(WB_registered, &wb->state)) - queue_delayed_work(bdi_wq, &wb->dwork, timeout); - spin_unlock_irq(&wb->work_lock); -} - static void wb_update_bandwidth_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), @@ -413,7 +515,7 @@ static void wb_update_bandwidth_workfn(struct work_struct *work) static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, gfp_t gfp) { - int i, err; + int err; memset(wb, 0, sizeof(*wb)); @@ -441,18 +543,10 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, if (err) return err; - for (i = 0; i < NR_WB_STAT_ITEMS; i++) { - err = percpu_counter_init(&wb->stat[i], 0, gfp); - if (err) - goto out_destroy_stat; - } - - return 0; + err = percpu_counter_init_many(wb->stat, 0, gfp, NR_WB_STAT_ITEMS); + if (err) + fprop_local_destroy_percpu(&wb->completions); -out_destroy_stat: - while (i--) - percpu_counter_destroy(&wb->stat[i]); - fprop_local_destroy_percpu(&wb->completions); return err; } @@ -485,13 +579,8 @@ static void wb_shutdown(struct bdi_writeback *wb) static void wb_exit(struct bdi_writeback *wb) { - int i; - WARN_ON(delayed_work_pending(&wb->dwork)); - - for (i = 0; i < NR_WB_STAT_ITEMS; i++) - percpu_counter_destroy(&wb->stat[i]); - + percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS); fprop_local_destroy_percpu(&wb->completions); } @@ -14,11 +14,6 @@ #define pr_fmt(fmt) "cma: " fmt -#ifdef CONFIG_CMA_DEBUG -#ifndef DEBUG -# define DEBUG -#endif -#endif #define CREATE_TRACE_POINTS #include <linux/memblock.h> @@ -187,10 +182,6 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, if (!size || !memblock_is_region_reserved(base, size)) return -EINVAL; - /* alignment should be aligned with order_per_bit */ - if (!IS_ALIGNED(CMA_MIN_ALIGNMENT_PAGES, 1 << order_per_bit)) - return -EINVAL; - /* ensure minimal alignment required by mm core */ if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES)) return -EINVAL; @@ -387,7 +378,6 @@ err: return ret; } -#ifdef CONFIG_CMA_DEBUG static void cma_debug_show_areas(struct cma *cma) { unsigned long next_zero_bit, next_set_bit, nr_zero; @@ -412,9 +402,6 @@ static void cma_debug_show_areas(struct cma *cma) pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count); spin_unlock_irq(&cma->lock); } -#else -static inline void cma_debug_show_areas(struct cma *cma) { } -#endif /** * cma_alloc() - allocate pages from contiguous area @@ -436,17 +423,18 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned long i; struct page *page = NULL; int ret = -ENOMEM; + const char *name = cma ? 
cma->name : NULL; + + trace_cma_alloc_start(name, count, align); if (!cma || !cma->count || !cma->bitmap) - goto out; + return page; pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__, (void *)cma, cma->name, count, align); if (!count) - goto out; - - trace_cma_alloc_start(cma->name, count, align); + return page; mask = cma_bitmap_aligned_mask(cma, align); offset = cma_bitmap_aligned_offset(cma, align); @@ -454,7 +442,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, bitmap_count = cma_bitmap_pages_to_bits(cma, count); if (bitmap_count > bitmap_maxno) - goto out; + return page; for (;;) { spin_lock_irq(&cma->lock); @@ -496,8 +484,6 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, start = bitmap_no + mask + 1; } - trace_cma_alloc_finish(cma->name, pfn, page, count, align, ret); - /* * CMA can allocate multiple page blocks, which results in different * blocks being marked with different tags. Reset the tags to ignore @@ -515,14 +501,13 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, } pr_debug("%s(): returned %p\n", __func__, page); -out: + trace_cma_alloc_finish(name, pfn, page, count, align, ret); if (page) { count_vm_event(CMA_ALLOC_SUCCESS); cma_sysfs_account_success_pages(cma, count); } else { count_vm_event(CMA_ALLOC_FAIL); - if (cma) - cma_sysfs_account_fail_pages(cma, count); + cma_sysfs_account_fail_pages(cma, count); } return page; @@ -573,6 +558,7 @@ bool cma_release(struct cma *cma, const struct page *pages, free_contig_range(pfn, count); cma_clear_bitmap(cma, pfn, count); + cma_sysfs_account_release_pages(cma, count); trace_cma_release(cma->name, pfn, pages, count); return true; @@ -27,6 +27,8 @@ struct cma { atomic64_t nr_pages_succeeded; /* the number of CMA page allocation failures */ atomic64_t nr_pages_failed; + /* the number of CMA page released */ + atomic64_t nr_pages_released; /* kobject requires dynamic object */ struct cma_kobject *cma_kobj; #endif @@ -44,10 +46,13 @@ static inline unsigned long cma_bitmap_maxno(struct cma *cma) #ifdef CONFIG_CMA_SYSFS void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages); void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages); +void cma_sysfs_account_release_pages(struct cma *cma, unsigned long nr_pages); #else static inline void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages) {}; static inline void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages) {}; +static inline void cma_sysfs_account_release_pages(struct cma *cma, + unsigned long nr_pages) {}; #endif #endif diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c index 56347d15b7e8..f50db3973171 100644 --- a/mm/cma_sysfs.c +++ b/mm/cma_sysfs.c @@ -24,6 +24,11 @@ void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages) atomic64_add(nr_pages, &cma->nr_pages_failed); } +void cma_sysfs_account_release_pages(struct cma *cma, unsigned long nr_pages) +{ + atomic64_add(nr_pages, &cma->nr_pages_released); +} + static inline struct cma *cma_from_kobj(struct kobject *kobj) { return container_of(kobj, struct cma_kobject, kobj)->cma; @@ -48,6 +53,15 @@ static ssize_t alloc_pages_fail_show(struct kobject *kobj, } CMA_ATTR_RO(alloc_pages_fail); +static ssize_t release_pages_success_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct cma *cma = cma_from_kobj(kobj); + + return sysfs_emit(buf, "%llu\n", atomic64_read(&cma->nr_pages_released)); +} +CMA_ATTR_RO(release_pages_success); + static void 
cma_kobj_release(struct kobject *kobj) { struct cma *cma = cma_from_kobj(kobj); @@ -60,6 +74,7 @@ static void cma_kobj_release(struct kobject *kobj) static struct attribute *cma_attrs[] = { &alloc_pages_success_attr.attr, &alloc_pages_fail_attr.attr, + &release_pages_success_attr.attr, NULL, }; ATTRIBUTE_GROUPS(cma); diff --git a/mm/compaction.c b/mm/compaction.c index b961db601df4..e731d45befc7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -40,9 +40,22 @@ static inline void count_compact_events(enum vm_event_item item, long delta) { count_vm_events(item, delta); } + +/* + * order == -1 is expected when compacting proactively via + * 1. /proc/sys/vm/compact_memory + * 2. /sys/devices/system/node/nodex/compact + * 3. /proc/sys/vm/compaction_proactiveness + */ +static inline bool is_via_compact_memory(int order) +{ + return order == -1; +} + #else #define count_compact_event(item) do { } while (0) #define count_compact_events(item, delta) do { } while (0) +static inline bool is_via_compact_memory(int order) { return false; } #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA @@ -66,45 +79,56 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT) #endif -static unsigned long release_freepages(struct list_head *freelist) +static void split_map_pages(struct list_head *freepages) { + unsigned int i, order; struct page *page, *next; - unsigned long high_pfn = 0; + LIST_HEAD(tmp_list); - list_for_each_entry_safe(page, next, freelist, lru) { - unsigned long pfn = page_to_pfn(page); - list_del(&page->lru); - __free_page(page); - if (pfn > high_pfn) - high_pfn = pfn; - } + for (order = 0; order < NR_PAGE_ORDERS; order++) { + list_for_each_entry_safe(page, next, &freepages[order], lru) { + unsigned int nr_pages; - return high_pfn; + list_del(&page->lru); + + nr_pages = 1 << order; + + post_alloc_hook(page, order, __GFP_MOVABLE); + if (order) + split_page(page, order); + + for (i = 0; i < nr_pages; i++) { + list_add(&page->lru, &tmp_list); + page++; + } + } + list_splice_init(&tmp_list, &freepages[0]); + } } -static void split_map_pages(struct list_head *list) +static unsigned long release_free_list(struct list_head *freepages) { - unsigned int i, order, nr_pages; - struct page *page, *next; - LIST_HEAD(tmp_list); - - list_for_each_entry_safe(page, next, list, lru) { - list_del(&page->lru); + int order; + unsigned long high_pfn = 0; - order = page_private(page); - nr_pages = 1 << order; + for (order = 0; order < NR_PAGE_ORDERS; order++) { + struct page *page, *next; - post_alloc_hook(page, order, __GFP_MOVABLE); - if (order) - split_page(page, order); + list_for_each_entry_safe(page, next, &freepages[order], lru) { + unsigned long pfn = page_to_pfn(page); - for (i = 0; i < nr_pages; i++) { - list_add(&page->lru, &tmp_list); - page++; + list_del(&page->lru); + /* + * Convert free pages into post allocation pages, so + * that we can free them via __free_page. 
+ */ + post_alloc_hook(page, order, __GFP_MOVABLE); + __free_pages(page, order); + if (pfn > high_pfn) + high_pfn = pfn; } } - - list_splice(&tmp_list, list); + return high_pfn; } #ifdef CONFIG_COMPACTION @@ -657,7 +681,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, nr_scanned += isolated - 1; total_isolated += isolated; cc->nr_freepages += isolated; - list_add_tail(&page->lru, freelist); + list_add_tail(&page->lru, &freelist[order]); if (!strict && cc->nr_migratepages <= cc->nr_freepages) { blockpfn += isolated; @@ -722,7 +746,11 @@ isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { unsigned long isolated, pfn, block_start_pfn, block_end_pfn; - LIST_HEAD(freelist); + int order; + struct list_head tmp_freepages[NR_PAGE_ORDERS]; + + for (order = 0; order < NR_PAGE_ORDERS; order++) + INIT_LIST_HEAD(&tmp_freepages[order]); pfn = start_pfn; block_start_pfn = pageblock_start_pfn(pfn); @@ -753,7 +781,7 @@ isolate_freepages_range(struct compact_control *cc, break; isolated = isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, &freelist, 0, true); + block_end_pfn, tmp_freepages, 0, true); /* * In strict mode, isolate_freepages_block() returns 0 if @@ -770,15 +798,15 @@ isolate_freepages_range(struct compact_control *cc, */ } - /* __isolate_free_page() does not map the pages */ - split_map_pages(&freelist); - if (pfn < end_pfn) { /* Loop terminated early, cleanup. */ - release_freepages(&freelist); + release_free_list(tmp_freepages); return 0; } + /* __isolate_free_page() does not map the pages */ + split_map_pages(tmp_freepages); + /* We don't use freelists for anything. */ return pfn; } @@ -817,6 +845,32 @@ static bool too_many_isolated(struct compact_control *cc) } /** + * skip_isolation_on_order() - determine when to skip folio isolation based on + * folio order and compaction target order + * @order: to-be-isolated folio order + * @target_order: compaction target order + * + * This avoids unnecessary folio isolations during compaction. + */ +static bool skip_isolation_on_order(int order, int target_order) +{ + /* + * Unless we are performing global compaction (i.e., + * is_via_compact_memory), skip any folios that are larger than the + * target order: we wouldn't be here if we'd have a free folio with + * the desired target_order, so migrating this folio would likely fail + * later. + */ + if (!is_via_compact_memory(target_order) && order >= target_order) + return true; + /* + * We limit memory compaction to pageblocks and won't try + * creating free blocks of memory that are larger than that. + */ + return order >= pageblock_order; +} + +/** * isolate_migratepages_block() - isolate all migrate-able pages within * a single pageblock * @cc: Compaction control structure. @@ -947,7 +1001,22 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, valid_page = page; } - if (PageHuge(page) && cc->alloc_contig) { + if (PageHuge(page)) { + /* + * skip hugetlbfs if we are not compacting for pages + * bigger than its order. THPs and other compound pages + * are handled below. 
+ */ + if (!cc->alloc_contig) { + const unsigned int order = compound_order(page); + + if (order <= MAX_PAGE_ORDER) { + low_pfn += (1UL << order) - 1; + nr_scanned += (1UL << order) - 1; + } + goto isolate_fail; + } + /* for alloc_contig case */ if (locked) { unlock_page_lruvec_irqrestore(locked, flags); locked = NULL; @@ -1008,21 +1077,24 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* - * Regardless of being on LRU, compound pages such as THP and - * hugetlbfs are not to be compacted unless we are attempting - * an allocation much larger than the huge page size (eg CMA). - * We can potentially save a lot of iterations if we skip them - * at once. The check is racy, but we can consider only valid - * values and the only danger is skipping too much. + * Regardless of being on LRU, compound pages such as THP + * (hugetlbfs is handled above) are not to be compacted unless + * we are attempting an allocation larger than the compound + * page size. We can potentially save a lot of iterations if we + * skip them at once. The check is racy, but we can consider + * only valid values and the only danger is skipping too much. */ if (PageCompound(page) && !cc->alloc_contig) { const unsigned int order = compound_order(page); - if (likely(order <= MAX_PAGE_ORDER)) { - low_pfn += (1UL << order) - 1; - nr_scanned += (1UL << order) - 1; + /* Skip based on page order and compaction target order. */ + if (skip_isolation_on_order(order, cc->order)) { + if (order <= MAX_PAGE_ORDER) { + low_pfn += (1UL << order) - 1; + nr_scanned += (1UL << order) - 1; + } + goto isolate_fail; } - goto isolate_fail; } /* @@ -1165,10 +1237,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* - * folio become large since the non-locked check, - * and it's on LRU. + * Check LRU folio order under the lock */ - if (unlikely(folio_test_large(folio) && !cc->alloc_contig)) { + if (unlikely(skip_isolation_on_order(folio_order(folio), + cc->order) && + !cc->alloc_contig)) { low_pfn += folio_nr_pages(folio) - 1; nr_scanned += folio_nr_pages(folio) - 1; folio_set_lru(folio); @@ -1365,12 +1438,14 @@ static bool suitable_migration_target(struct compact_control *cc, { /* If the page is a large free page, then disallow migration */ if (PageBuddy(page)) { + int order = cc->order > 0 ? cc->order : pageblock_order; + /* * We are checking page_order without zone->lock taken. But * the only small danger is that we skip a potentially suitable * pageblock, so it's not worth to check order for valid range. 
*/ - if (buddy_order_unsafe(page) >= pageblock_order) + if (buddy_order_unsafe(page) >= order) return false; } @@ -1458,7 +1533,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn) if (!page) return; - isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); + isolate_freepages_block(cc, &start_pfn, end_pfn, cc->freepages, 1, false); /* Skip this pageblock in the future as it's full or nearly full */ if (start_pfn == end_pfn && !cc->no_set_skip_hint) @@ -1587,7 +1662,7 @@ static void fast_isolate_freepages(struct compact_control *cc) nr_scanned += nr_isolated - 1; total_isolated += nr_isolated; cc->nr_freepages += nr_isolated; - list_add_tail(&page->lru, &cc->freepages); + list_add_tail(&page->lru, &cc->freepages[order]); count_compact_events(COMPACTISOLATED, nr_isolated); } else { /* If isolation fails, abort the search */ @@ -1664,13 +1739,12 @@ static void isolate_freepages(struct compact_control *cc) unsigned long isolate_start_pfn; /* exact pfn we start at */ unsigned long block_end_pfn; /* end of current pageblock */ unsigned long low_pfn; /* lowest pfn scanner is able to scan */ - struct list_head *freelist = &cc->freepages; unsigned int stride; /* Try a small search of the free lists for a candidate */ fast_isolate_freepages(cc); if (cc->nr_freepages) - goto splitmap; + return; /* * Initialise the free scanner. The starting point is where we last @@ -1730,7 +1804,7 @@ static void isolate_freepages(struct compact_control *cc) /* Found a block suitable for isolating free pages from. */ nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, freelist, stride, false); + block_end_pfn, cc->freepages, stride, false); /* Update the skip hint if the full pageblock was scanned */ if (isolate_start_pfn == block_end_pfn) @@ -1771,33 +1845,62 @@ static void isolate_freepages(struct compact_control *cc) * and the loop terminated due to isolate_start_pfn < low_pfn */ cc->free_pfn = isolate_start_pfn; - -splitmap: - /* __isolate_free_page() does not map the pages */ - split_map_pages(freelist); } /* * This is a migrate-callback that "allocates" freepages by taking pages * from the isolated freelists in the block we are migrating to. 
*/ -static struct folio *compaction_alloc(struct folio *src, unsigned long data) +static struct folio *compaction_alloc_noprof(struct folio *src, unsigned long data) { struct compact_control *cc = (struct compact_control *)data; struct folio *dst; + int order = folio_order(src); + bool has_isolated_pages = false; + int start_order; + struct page *freepage; + unsigned long size; + +again: + for (start_order = order; start_order < NR_PAGE_ORDERS; start_order++) + if (!list_empty(&cc->freepages[start_order])) + break; - if (list_empty(&cc->freepages)) { + /* no free pages in the list */ + if (start_order == NR_PAGE_ORDERS) { + if (has_isolated_pages) + return NULL; isolate_freepages(cc); + has_isolated_pages = true; + goto again; + } - if (list_empty(&cc->freepages)) - return NULL; + freepage = list_first_entry(&cc->freepages[start_order], struct page, + lru); + size = 1 << start_order; + + list_del(&freepage->lru); + + while (start_order > order) { + start_order--; + size >>= 1; + + list_add(&freepage[size].lru, &cc->freepages[start_order]); + set_page_private(&freepage[size], start_order); } + dst = (struct folio *)freepage; - dst = list_entry(cc->freepages.next, struct folio, lru); - list_del(&dst->lru); - cc->nr_freepages--; + post_alloc_hook(&dst->page, order, __GFP_MOVABLE); + if (order) + prep_compound_page(&dst->page, order); + cc->nr_freepages -= 1 << order; + cc->nr_migratepages -= 1 << order; + return page_rmappable_folio(&dst->page); +} - return dst; +static struct folio *compaction_alloc(struct folio *src, unsigned long data) +{ + return alloc_hooks(compaction_alloc_noprof(src, data)); } /* @@ -1808,9 +1911,19 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data) static void compaction_free(struct folio *dst, unsigned long data) { struct compact_control *cc = (struct compact_control *)data; + int order = folio_order(dst); + struct page *page = &dst->page; - list_add(&dst->lru, &cc->freepages); - cc->nr_freepages++; + if (folio_put_testzero(dst)) { + free_pages_prepare(page, order); + list_add(&dst->lru, &cc->freepages[order]); + cc->nr_freepages += 1 << order; + } + cc->nr_migratepages += 1 << order; + /* + * someone else has referenced the page, we cannot take it back to our + * free list. + */ } /* possible outcome of isolate_migratepages */ @@ -2087,17 +2200,6 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) } /* - * order == -1 is expected when compacting proactively via - * 1. /proc/sys/vm/compact_memory - * 2. /sys/devices/system/node/nodex/compact - * 3. /proc/sys/vm/compaction_proactiveness - */ -static inline bool is_via_compact_memory(int order) -{ - return order == -1; -} - -/* * Determine whether kswapd is (or recently was!) running on this node. * * pgdat_kswapd_lock() pins pgdat->kswapd, so a concurrent kswapd_stop() can't @@ -2433,7 +2535,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) unsigned long last_migrated_pfn; const bool sync = cc->mode != MIGRATE_ASYNC; bool update_cached; - unsigned int nr_succeeded = 0; + unsigned int nr_succeeded = 0, nr_migratepages; + int order; /* * These counters track activities during zone compaction. 
Initialize @@ -2443,7 +2546,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) cc->total_free_scanned = 0; cc->nr_migratepages = 0; cc->nr_freepages = 0; - INIT_LIST_HEAD(&cc->freepages); + for (order = 0; order < NR_PAGE_ORDERS; order++) + INIT_LIST_HEAD(&cc->freepages[order]); INIT_LIST_HEAD(&cc->migratepages); cc->migratetype = gfp_migratetype(cc->gfp_mask); @@ -2551,11 +2655,17 @@ rescan: pageblock_start_pfn(cc->migrate_pfn - 1)); } + /* + * Record the number of pages to migrate since the + * compaction_alloc/free() will update cc->nr_migratepages + * properly. + */ + nr_migratepages = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode, MR_COMPACTION, &nr_succeeded); - trace_mm_compaction_migratepages(cc, nr_succeeded); + trace_mm_compaction_migratepages(nr_migratepages, nr_succeeded); /* All pages were either migrated or will be released */ cc->nr_migratepages = 0; @@ -2629,7 +2739,7 @@ out: * so we don't leave any returned pages behind in the next attempt. */ if (cc->nr_freepages > 0) { - unsigned long free_pfn = release_freepages(&cc->freepages); + unsigned long free_pfn = release_free_list(cc->freepages); cc->nr_freepages = 0; VM_BUG_ON(free_pfn == 0); @@ -2648,7 +2758,6 @@ out: trace_mm_compaction_end(cc, start_pfn, end_pfn, sync, ret); - VM_BUG_ON(!list_empty(&cc->freepages)); VM_BUG_ON(!list_empty(&cc->migratepages)); return ret; @@ -2783,25 +2892,27 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, } /* - * Compact all zones within a node till each zone's fragmentation score - * reaches within proactive compaction thresholds (as determined by the - * proactiveness tunable). + * compact_node() - compact all zones within a node + * @pgdat: The node page data + * @proactive: Whether the compaction is proactive * - * It is possible that the function returns before reaching score targets - * due to various back-off conditions, such as, contention on per-node or - * per-zone locks. + * For proactive compaction, compact till each zone's fragmentation score + * reaches within proactive compaction thresholds (as determined by the + * proactiveness tunable), it is possible that the function returns before + * reaching score targets due to various back-off conditions, such as, + * contention on per-node or per-zone locks. */ -static void proactive_compact_node(pg_data_t *pgdat) +static int compact_node(pg_data_t *pgdat, bool proactive) { int zoneid; struct zone *zone; struct compact_control cc = { .order = -1, - .mode = MIGRATE_SYNC_LIGHT, + .mode = proactive ? 
MIGRATE_SYNC_LIGHT : MIGRATE_SYNC, .ignore_skip_hint = true, .whole_zone = true, .gfp_mask = GFP_KERNEL, - .proactive_compaction = true, + .proactive_compaction = proactive, }; for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { @@ -2809,54 +2920,39 @@ static void proactive_compact_node(pg_data_t *pgdat) if (!populated_zone(zone)) continue; + if (fatal_signal_pending(current)) + return -EINTR; + cc.zone = zone; compact_zone(&cc, NULL); - count_compact_events(KCOMPACTD_MIGRATE_SCANNED, - cc.total_migrate_scanned); - count_compact_events(KCOMPACTD_FREE_SCANNED, - cc.total_free_scanned); + if (proactive) { + count_compact_events(KCOMPACTD_MIGRATE_SCANNED, + cc.total_migrate_scanned); + count_compact_events(KCOMPACTD_FREE_SCANNED, + cc.total_free_scanned); + } } -} -/* Compact all zones within a node */ -static void compact_node(int nid) -{ - pg_data_t *pgdat = NODE_DATA(nid); - int zoneid; - struct zone *zone; - struct compact_control cc = { - .order = -1, - .mode = MIGRATE_SYNC, - .ignore_skip_hint = true, - .whole_zone = true, - .gfp_mask = GFP_KERNEL, - }; - - - for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { - - zone = &pgdat->node_zones[zoneid]; - if (!populated_zone(zone)) - continue; - - cc.zone = zone; - - compact_zone(&cc, NULL); - } + return 0; } -/* Compact all nodes in the system */ -static void compact_nodes(void) +/* Compact all zones of all nodes in the system */ +static int compact_nodes(void) { - int nid; + int ret, nid; /* Flush pending updates to the LRU lists */ lru_add_drain_all(); - for_each_online_node(nid) - compact_node(nid); + for_each_online_node(nid) { + ret = compact_node(NODE_DATA(nid), false); + if (ret) + return ret; + } + + return 0; } static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, @@ -2902,9 +2998,9 @@ static int sysctl_compaction_handler(struct ctl_table *table, int write, return -EINVAL; if (write) - compact_nodes(); + ret = compact_nodes(); - return 0; + return ret; } #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) @@ -2918,7 +3014,7 @@ static ssize_t compact_store(struct device *dev, /* Flush pending updates to the LRU lists */ lru_add_drain_all(); - compact_node(nid); + compact_node(NODE_DATA(nid), false); } return count; @@ -3127,7 +3223,7 @@ static int kcompactd(void *p) unsigned int prev_score, score; prev_score = fragmentation_score_node(pgdat); - proactive_compact_node(pgdat); + compact_node(pgdat, true); score = fragmentation_score_node(pgdat); /* * Defer proactive compaction if the fragmentation @@ -3254,7 +3350,6 @@ static struct ctl_table vm_compaction[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { } }; static int __init kcompactd_init(void) diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 29f43fbc2eff..fecb8172410c 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -71,7 +71,7 @@ config DAMON_SYSFS_KUNIT_TEST If unsure, say N. -config DAMON_DBGFS +config DAMON_DBGFS_DEPRECATED bool "DAMON debugfs interface (DEPRECATED!)" depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS help @@ -84,6 +84,11 @@ config DAMON_DBGFS (DAMON_SYSFS). If you depend on this and cannot move, please report your usecase to damon@lists.linux.dev and linux-mm@kvack.org. 
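One user-visible effect of the sysctl_compaction_handler() change above: compact_nodes() can now stop with -EINTR when the writing task has a fatal signal pending, and that error propagates to the write() on /proc/sys/vm/compact_memory. A hedged userspace illustration, not from this diff:

```c
/*
 * Hedged userspace illustration: triggering full compaction via the
 * compact_memory sysctl.  With the handler change above, the write can
 * now return -1/EINTR instead of always reporting success.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/vm/compact_memory", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "1\n", 2) < 0)
		perror("compact_memory");	/* e.g. EINTR */
	close(fd);
	return 0;
}
```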
+config DAMON_DBGFS + bool + default y + depends on DAMON_DBGFS_DEPRECATED + config DAMON_DBGFS_KUNIT_TEST bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS depends on DAMON_DBGFS && KUNIT=y diff --git a/mm/damon/core.c b/mm/damon/core.c index 5b325749fc12..6392f1cc97a3 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -11,6 +11,7 @@ #include <linux/delay.h> #include <linux/kthread.h> #include <linux/mm.h> +#include <linux/psi.h> #include <linux/slab.h> #include <linux/string.h> @@ -299,16 +300,53 @@ void damos_destroy_filter(struct damos_filter *f) damos_free_filter(f); } -/* initialize private fields of damos_quota and return the pointer */ -static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota) +struct damos_quota_goal *damos_new_quota_goal( + enum damos_quota_goal_metric metric, + unsigned long target_value) { + struct damos_quota_goal *goal; + + goal = kmalloc(sizeof(*goal), GFP_KERNEL); + if (!goal) + return NULL; + goal->metric = metric; + goal->target_value = target_value; + INIT_LIST_HEAD(&goal->list); + return goal; +} + +void damos_add_quota_goal(struct damos_quota *q, struct damos_quota_goal *g) +{ + list_add_tail(&g->list, &q->goals); +} + +static void damos_del_quota_goal(struct damos_quota_goal *g) +{ + list_del(&g->list); +} + +static void damos_free_quota_goal(struct damos_quota_goal *g) +{ + kfree(g); +} + +void damos_destroy_quota_goal(struct damos_quota_goal *g) +{ + damos_del_quota_goal(g); + damos_free_quota_goal(g); +} + +/* initialize fields of @quota that normally API users wouldn't set */ +static struct damos_quota *damos_quota_init(struct damos_quota *quota) +{ + quota->esz = 0; quota->total_charged_sz = 0; quota->total_charged_ns = 0; - quota->esz = 0; quota->charged_sz = 0; quota->charged_from = 0; quota->charge_target_from = NULL; quota->charge_addr_from = 0; + quota->esz_bp = 0; return quota; } @@ -336,7 +374,9 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); - scheme->quota = *(damos_quota_init_priv(quota)); + scheme->quota = *(damos_quota_init(quota)); + /* quota.goals should be separately set by caller */ + INIT_LIST_HEAD(&scheme->quota.goals); scheme->wmarks = *wmarks; scheme->wmarks.activated = true; @@ -373,8 +413,12 @@ static void damon_free_scheme(struct damos *s) void damon_destroy_scheme(struct damos *s) { + struct damos_quota_goal *g, *g_next; struct damos_filter *f, *next; + damos_for_each_quota_goal_safe(g, g_next, &s->quota) + damos_destroy_quota_goal(g); + damos_for_each_filter_safe(f, next, s) damos_destroy_filter(f); damon_del_scheme(s); @@ -1083,21 +1127,78 @@ static unsigned long damon_feed_loop_next_input(unsigned long last_input, return min_input; } -/* Shouldn't be called if quota->ms, quota->sz, and quota->get_score unset */ +#ifdef CONFIG_PSI + +static u64 damos_get_some_mem_psi_total(void) +{ + if (static_branch_likely(&psi_disabled)) + return 0; + return div_u64(psi_system.total[PSI_AVGS][PSI_MEM * 2], + NSEC_PER_USEC); +} + +#else /* CONFIG_PSI */ + +static inline u64 damos_get_some_mem_psi_total(void) +{ + return 0; +}; + +#endif /* CONFIG_PSI */ + +static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) +{ + u64 now_psi_total; + + switch (goal->metric) { + case DAMOS_QUOTA_USER_INPUT: + /* User should already set goal->current_value */ + break; + case DAMOS_QUOTA_SOME_MEM_PSI_US: + now_psi_total = damos_get_some_mem_psi_total(); + goal->current_value = now_psi_total - 
goal->last_psi_total; + goal->last_psi_total = now_psi_total; + break; + default: + break; + } +} + +/* Return the highest score since it makes schemes least aggressive */ +static unsigned long damos_quota_score(struct damos_quota *quota) +{ + struct damos_quota_goal *goal; + unsigned long highest_score = 0; + + damos_for_each_quota_goal(goal, quota) { + damos_set_quota_goal_current_value(goal); + highest_score = max(highest_score, + goal->current_value * 10000 / + goal->target_value); + } + + return highest_score; +} + +/* + * Called only if quota->ms, or quota->sz are set, or quota->goals is not empty + */ static void damos_set_effective_quota(struct damos_quota *quota) { unsigned long throughput; unsigned long esz; - if (!quota->ms && !quota->get_score) { + if (!quota->ms && list_empty("a->goals)) { quota->esz = quota->sz; return; } - if (quota->get_score) { + if (!list_empty("a->goals)) { + unsigned long score = damos_quota_score(quota); + quota->esz_bp = damon_feed_loop_next_input( max(quota->esz_bp, 10000UL), - quota->get_score(quota->get_score_arg)); + score); esz = quota->esz_bp / 10000; } @@ -1107,7 +1208,7 @@ static void damos_set_effective_quota(struct damos_quota *quota) quota->total_charged_ns; else throughput = PAGE_SIZE * 1024; - if (quota->get_score) + if (!list_empty("a->goals)) esz = min(throughput * quota->ms, esz); else esz = throughput * quota->ms; @@ -1127,7 +1228,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) unsigned long cumulated_sz; unsigned int score, max_score = 0; - if (!quota->ms && !quota->sz && !quota->get_score) + if (!quota->ms && !quota->sz && list_empty("a->goals)) return; /* New charge window starts */ @@ -1380,12 +1481,14 @@ static bool kdamond_need_stop(struct damon_ctx *ctx) return true; } -static unsigned long damos_wmark_metric_value(enum damos_wmark_metric metric) +static int damos_get_wmark_metric_value(enum damos_wmark_metric metric, + unsigned long *metric_value) { switch (metric) { case DAMOS_WMARK_FREE_MEM_RATE: - return global_zone_page_state(NR_FREE_PAGES) * 1000 / + *metric_value = global_zone_page_state(NR_FREE_PAGES) * 1000 / totalram_pages(); + return 0; default: break; } @@ -1400,10 +1503,9 @@ static unsigned long damos_wmark_wait_us(struct damos *scheme) { unsigned long metric; - if (scheme->wmarks.metric == DAMOS_WMARK_NONE) + if (damos_get_wmark_metric_value(scheme->wmarks.metric, &metric)) return 0; - metric = damos_wmark_metric_value(scheme->wmarks.metric); /* higher than high watermark or lower than low watermark */ if (metric > scheme->wmarks.high || scheme->wmarks.low > metric) { if (scheme->wmarks.activated) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 7dac24e69e3b..2461cfe2e968 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -15,6 +15,11 @@ #include <linux/page_idle.h> #include <linux/slab.h> +#define DAMON_DBGFS_DEPRECATION_NOTICE \ + "DAMON debugfs interface is deprecated, so users should move " \ + "to DAMON_SYSFS. If you cannot, please report your usecase to " \ + "damon@lists.linux.dev and linux-mm@kvack.org.\n" + static struct damon_ctx **dbgfs_ctxs; static int dbgfs_nr_ctxs; static struct dentry **dbgfs_dirs; @@ -22,10 +27,7 @@ static DEFINE_MUTEX(damon_dbgfs_lock); static void damon_dbgfs_warn_deprecation(void) { - pr_warn_once("DAMON debugfs interface is deprecated, " - "so users should move to DAMON_SYSFS. 
If you cannot, " - "please report your usecase to damon@lists.linux.dev and " - "linux-mm@kvack.org.\n"); + pr_warn_once(DAMON_DBGFS_DEPRECATION_NOTICE); } /* @@ -805,6 +807,14 @@ static void dbgfs_destroy_ctx(struct damon_ctx *ctx) damon_destroy_ctx(ctx); } +static ssize_t damon_dbgfs_deprecated_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + static const char kbuf[512] = DAMON_DBGFS_DEPRECATION_NOTICE; + + return simple_read_from_buffer(buf, count, ppos, kbuf, strlen(kbuf)); +} + /* * Make a context of @name and create a debugfs directory for it. * @@ -1056,6 +1066,10 @@ static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file) return nonseekable_open(inode, file); } +static const struct file_operations deprecated_fops = { + .read = damon_dbgfs_deprecated_read, +}; + static const struct file_operations mk_contexts_fops = { .open = damon_dbgfs_static_file_open, .write = dbgfs_mk_context_write, @@ -1076,9 +1090,9 @@ static int __init __damon_dbgfs_init(void) { struct dentry *dbgfs_root; const char * const file_names[] = {"mk_contexts", "rm_contexts", - "monitor_on"}; + "monitor_on_DEPRECATED", "DEPRECATED"}; const struct file_operations *fops[] = {&mk_contexts_fops, - &rm_contexts_fops, &monitor_on_fops}; + &rm_contexts_fops, &monitor_on_fops, &deprecated_fops}; int i; dbgfs_root = debugfs_create_dir("damon", NULL); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 081e2a325778..18797c1b419b 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -16,8 +16,8 @@ #include "../internal.h" #include "ops-common.h" -static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma, - unsigned long addr, void *arg) +static bool damon_folio_mkold_one(struct folio *folio, + struct vm_area_struct *vma, unsigned long addr, void *arg) { DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); @@ -31,33 +31,38 @@ static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma, return true; } -static void damon_pa_mkold(unsigned long paddr) +static void damon_folio_mkold(struct folio *folio) { - struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); struct rmap_walk_control rwc = { - .rmap_one = __damon_pa_mkold, + .rmap_one = damon_folio_mkold_one, .anon_lock = folio_lock_anon_vma_read, }; bool need_lock; - if (!folio) - return; - if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { folio_set_idle(folio); - goto out; + return; } need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); if (need_lock && !folio_trylock(folio)) - goto out; + return; rmap_walk(folio, &rwc); if (need_lock) folio_unlock(folio); -out: +} + +static void damon_pa_mkold(unsigned long paddr) +{ + struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); + + if (!folio) + return; + + damon_folio_mkold(folio); folio_put(folio); } @@ -79,8 +84,8 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) } } -static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, - unsigned long addr, void *arg) +static bool damon_folio_young_one(struct folio *folio, + struct vm_area_struct *vma, unsigned long addr, void *arg) { bool *accessed = arg; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); @@ -111,38 +116,44 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, return *accessed == false; } -static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) +static bool damon_folio_young(struct folio *folio) { - struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); bool accessed = false; struct 
rmap_walk_control rwc = { .arg = &accessed, - .rmap_one = __damon_pa_young, + .rmap_one = damon_folio_young_one, .anon_lock = folio_lock_anon_vma_read, }; bool need_lock; - if (!folio) - return false; - if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { if (folio_test_idle(folio)) - accessed = false; + return false; else - accessed = true; - goto out; + return true; } need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); if (need_lock && !folio_trylock(folio)) - goto out; + return false; rmap_walk(folio, &rwc); if (need_lock) folio_unlock(folio); -out: + return accessed; +} + +static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) +{ + struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); + bool accessed; + + if (!folio) + return false; + + accessed = damon_folio_young(folio); *folio_sz = folio_size(folio); folio_put(folio); return accessed; @@ -203,6 +214,11 @@ static bool __damos_pa_filter_out(struct damos_filter *filter, matched = filter->memcg_id == mem_cgroup_id(memcg); rcu_read_unlock(); break; + case DAMOS_FILTER_TYPE_YOUNG: + matched = damon_folio_young(folio); + if (matched) + damon_folio_mkold(folio); + break; default: break; } @@ -228,6 +244,22 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) { unsigned long addr, applied; LIST_HEAD(folio_list); + bool install_young_filter = true; + struct damos_filter *filter; + + /* check access in page level again by default */ + damos_for_each_filter(filter, s) { + if (filter->type == DAMOS_FILTER_TYPE_YOUNG) { + install_young_filter = false; + break; + } + } + if (install_young_filter) { + filter = damos_new_filter(DAMOS_FILTER_TYPE_YOUNG, true); + if (!filter) + return 0; + damos_add_filter(s, filter); + } for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { struct folio *folio = damon_get_folio(PHYS_PFN(addr)); @@ -249,6 +281,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) put_folio: folio_put(folio); } + if (install_young_filter) + damos_destroy_filter(filter); applied = reclaim_pages(&folio_list); cond_resched(); return applied * PAGE_SIZE; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 66e190f0374a..9bd341d62b4c 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -62,6 +62,36 @@ static struct damos_quota damon_reclaim_quota = { }; DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota); +/* + * Desired level of memory pressure-stall time in microseconds. + * + * While keeping the caps that set by other quotas, DAMON_RECLAIM automatically + * increases and decreases the effective level of the quota aiming this level of + * memory pressure is incurred. System-wide ``some`` memory PSI in microseconds + * per quota reset interval (``quota_reset_interval_ms``) is collected and + * compared to this value to see if the aim is satisfied. Value zero means + * disabling this auto-tuning feature. + * + * Disabled by default. + */ +static unsigned long quota_mem_pressure_us __read_mostly; +module_param(quota_mem_pressure_us, ulong, 0600); + +/* + * User-specifiable feedback for auto-tuning of the effective quota. + * + * While keeping the caps that set by other quotas, DAMON_RECLAIM automatically + * increases and decreases the effective level of the quota aiming receiving this + * feedback of value ``10,000`` from the user. DAMON_RECLAIM assumes the feedback + * value and the quota are positively proportional. Value zero means disabling + * this auto-tuning feature. + * + * Disabled by default. 
+ * + */ +static unsigned long quota_autotune_feedback __read_mostly; +module_param(quota_autotune_feedback, ulong, 0600); + static struct damos_watermarks damon_reclaim_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = 5000000, /* 5 seconds */ @@ -159,11 +189,13 @@ static void damon_reclaim_copy_quota_status(struct damos_quota *dst, dst->charged_from = src->charged_from; dst->charge_target_from = src->charge_target_from; dst->charge_addr_from = src->charge_addr_from; + dst->esz_bp = src->esz_bp; } static int damon_reclaim_apply_parameters(void) { struct damos *scheme, *old_scheme; + struct damos_quota_goal *goal; struct damos_filter *filter; int err = 0; @@ -180,6 +212,27 @@ static int damon_reclaim_apply_parameters(void) damon_reclaim_copy_quota_status(&scheme->quota, &old_scheme->quota); } + + if (quota_mem_pressure_us) { + goal = damos_new_quota_goal(DAMOS_QUOTA_SOME_MEM_PSI_US, + quota_mem_pressure_us); + if (!goal) { + damon_destroy_scheme(scheme); + return -ENOMEM; + } + damos_add_quota_goal(&scheme->quota, goal); + } + + if (quota_autotune_feedback) { + goal = damos_new_quota_goal(DAMOS_QUOTA_USER_INPUT, 10000); + if (!goal) { + damon_destroy_scheme(scheme); + return -ENOMEM; + } + goal->current_value = quota_autotune_feedback; + damos_add_quota_goal(&scheme->quota, goal); + } + if (skip_anon) { filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); if (!filter) { diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 4c37a166eb81..a63f51577cff 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -49,6 +49,8 @@ int damon_sysfs_schemes_update_regions_start( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx, bool total_bytes_only); +void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx); + bool damos_sysfs_regions_upd_done(void); int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); @@ -57,5 +59,9 @@ int damon_sysfs_schemes_clear_regions( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); -void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, +int damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); + +void damos_sysfs_update_effective_quotas( + struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index ae0f0b314f3a..bea5bc52846a 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -127,17 +127,17 @@ static const struct kobj_type damon_sysfs_scheme_region_ktype = { * * Once the tried regions update request is received, the request handling * start function (damon_sysfs_scheme_update_regions_start()) sets the status - * of all schemes as 'idle' again, and register ->before_damos_apply() and - * ->after_sampling() callbacks. + * of all schemes as 'idle' again, and register ->before_damos_apply() + * callback. * * Then, the first followup ->before_damos_apply() callback * (damon_sysfs_before_damos_apply()) sets the status 'started'. The first - * ->after_sampling() callback (damon_sysfs_after_sampling()) after the call - * is called only after the scheme is completely applied - * to the given snapshot. Hence the callback knows the situation by showing - * 'started' status, and sets the status as 'finished'. Then, - * damon_sysfs_before_damos_apply() understands the situation by showing the - * 'finished' status and do nothing. 
+ * ->after_sampling() or ->after_aggregation() callback + * (damon_sysfs_cmd_request_callback()) after the call is called only after + * the scheme is completely applied to the given snapshot. Hence the callback + * knows the situation by showing 'started' status, and sets the status as + * 'finished'. Then, damon_sysfs_before_damos_apply() understands the + * situation by showing the 'finished' status and do nothing. * * If DAMOS is not applied to any region due to any reasons including the * access pattern, the watermarks, the quotas, and the filters, @@ -343,6 +343,7 @@ static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void) static const char * const damon_sysfs_scheme_filter_type_strs[] = { "anon", "memcg", + "young", "addr", "target", }; @@ -826,15 +827,48 @@ static const struct kobj_type damon_sysfs_watermarks_ktype = { struct damos_sysfs_quota_goal { struct kobject kobj; + enum damos_quota_goal_metric metric; unsigned long target_value; unsigned long current_value; }; +/* This should match with enum damos_action */ +static const char * const damos_sysfs_quota_goal_metric_strs[] = { + "user_input", + "some_mem_psi_us", +}; + static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void) { return kzalloc(sizeof(struct damos_sysfs_quota_goal), GFP_KERNEL); } +static ssize_t target_metric_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damos_sysfs_quota_goal *goal = container_of(kobj, + struct damos_sysfs_quota_goal, kobj); + + return sysfs_emit(buf, "%s\n", + damos_sysfs_quota_goal_metric_strs[goal->metric]); +} + +static ssize_t target_metric_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damos_sysfs_quota_goal *goal = container_of(kobj, + struct damos_sysfs_quota_goal, kobj); + enum damos_quota_goal_metric m; + + for (m = 0; m < NR_DAMOS_QUOTA_GOAL_METRICS; m++) { + if (sysfs_streq(buf, damos_sysfs_quota_goal_metric_strs[m])) { + goal->metric = m; + return count; + } + } + return -EINVAL; +} + static ssize_t target_value_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -880,6 +914,9 @@ static void damos_sysfs_quota_goal_release(struct kobject *kobj) kfree(container_of(kobj, struct damos_sysfs_quota_goal, kobj)); } +static struct kobj_attribute damos_sysfs_quota_goal_target_metric_attr = + __ATTR_RW_MODE(target_metric, 0600); + static struct kobj_attribute damos_sysfs_quota_goal_target_value_attr = __ATTR_RW_MODE(target_value, 0600); @@ -887,6 +924,7 @@ static struct kobj_attribute damos_sysfs_quota_goal_current_value_attr = __ATTR_RW_MODE(current_value, 0600); static struct attribute *damos_sysfs_quota_goal_attrs[] = { + &damos_sysfs_quota_goal_target_metric_attr.attr, &damos_sysfs_quota_goal_target_value_attr.attr, &damos_sysfs_quota_goal_current_value_attr.attr, NULL, @@ -1139,6 +1177,7 @@ struct damon_sysfs_quotas { unsigned long ms; unsigned long sz; unsigned long reset_interval_ms; + unsigned long effective_sz; /* Effective size quota in bytes */ }; static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) @@ -1252,6 +1291,15 @@ static ssize_t reset_interval_ms_store(struct kobject *kobj, return count; } +static ssize_t effective_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->effective_sz); +} + static void damon_sysfs_quotas_release(struct kobject *kobj) { 
kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); @@ -1266,10 +1314,14 @@ static struct kobj_attribute damon_sysfs_quotas_sz_attr = static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr = __ATTR_RW_MODE(reset_interval_ms, 0600); +static struct kobj_attribute damon_sysfs_quotas_effective_bytes_attr = + __ATTR_RO_MODE(effective_bytes, 0400); + static struct attribute *damon_sysfs_quotas_attrs[] = { &damon_sysfs_quotas_ms_attr.attr, &damon_sysfs_quotas_sz_attr.attr, &damon_sysfs_quotas_reset_interval_ms_attr.attr, + &damon_sysfs_quotas_effective_bytes_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_quotas); @@ -1868,35 +1920,35 @@ static int damon_sysfs_set_scheme_filters(struct damos *scheme, return 0; } -static unsigned long damos_sysfs_get_quota_score(void *arg) -{ - return (unsigned long)arg; -} - -static void damos_sysfs_set_quota_score( +static int damos_sysfs_set_quota_score( struct damos_sysfs_quota_goals *sysfs_goals, struct damos_quota *quota) { - struct damos_sysfs_quota_goal *sysfs_goal; + struct damos_quota_goal *goal, *next; int i; - quota->get_score = NULL; - quota->get_score_arg = (void *)0; + damos_for_each_quota_goal_safe(goal, next, quota) + damos_destroy_quota_goal(goal); + for (i = 0; i < sysfs_goals->nr; i++) { - sysfs_goal = sysfs_goals->goals_arr[i]; + struct damos_sysfs_quota_goal *sysfs_goal = + sysfs_goals->goals_arr[i]; + if (!sysfs_goal->target_value) continue; - /* Higher score makes scheme less aggressive */ - quota->get_score_arg = (void *)max( - (unsigned long)quota->get_score_arg, - sysfs_goal->current_value * 10000 / + goal = damos_new_quota_goal(sysfs_goal->metric, sysfs_goal->target_value); - quota->get_score = damos_sysfs_get_quota_score; + if (!goal) + return -ENOMEM; + if (sysfs_goal->metric == DAMOS_QUOTA_USER_INPUT) + goal->current_value = sysfs_goal->current_value; + damos_add_quota_goal(quota, goal); } + return 0; } -void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, +int damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx) { struct damos *scheme; @@ -1904,16 +1956,41 @@ void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, damon_for_each_scheme(scheme, ctx) { struct damon_sysfs_scheme *sysfs_scheme; + int err; /* user could have removed the scheme sysfs dir */ if (i >= sysfs_schemes->nr) break; sysfs_scheme = sysfs_schemes->schemes_arr[i]; - damos_sysfs_set_quota_score(sysfs_scheme->quotas->goals, + err = damos_sysfs_set_quota_score(sysfs_scheme->quotas->goals, &scheme->quota); + if (err) + /* kdamond will clean up schemes and terminated */ + return err; i++; } + return 0; +} + +void damos_sysfs_update_effective_quotas( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + struct damos *scheme; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_quotas *sysfs_quotas; + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + break; + + sysfs_quotas = + sysfs_schemes->schemes_arr[schemes_idx++]->quotas; + sysfs_quotas->effective_sz = scheme->quota.esz; + } } static struct damos *damon_sysfs_mk_scheme( @@ -1953,13 +2030,17 @@ static struct damos *damon_sysfs_mk_scheme( .low = sysfs_wmarks->low, }; - damos_sysfs_set_quota_score(sysfs_quotas->goals, "a); - scheme = damon_new_scheme(&pattern, sysfs_scheme->action, sysfs_scheme->apply_interval_us, "a, &wmarks); if (!scheme) return NULL; + err = damos_sysfs_set_quota_score(sysfs_quotas->goals, 
&scheme->quota); + if (err) { + damon_destroy_scheme(scheme); + return NULL; + } + err = damon_sysfs_set_scheme_filters(scheme, sysfs_filters); if (err) { damon_destroy_scheme(scheme); @@ -1995,7 +2076,11 @@ static void damon_sysfs_update_scheme(struct damos *scheme, scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses; scheme->quota.weight_age = sysfs_weights->age; - damos_sysfs_set_quota_score(sysfs_quotas->goals, &scheme->quota); + err = damos_sysfs_set_quota_score(sysfs_quotas->goals, &scheme->quota); + if (err) { + damon_destroy_scheme(scheme); + return; + } scheme->wmarks.metric = sysfs_wmarks->metric; scheme->wmarks.interval = sysfs_wmarks->interval_us; @@ -2122,7 +2207,7 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, * callback is registered, damon_sysfs_lock should be held to ensure the * regions directories exist. */ -static int damon_sysfs_after_sampling(struct damon_ctx *ctx) +void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx) { struct damon_sysfs_schemes *sysfs_schemes = damon_sysfs_schemes_for_damos_callback; @@ -2138,8 +2223,6 @@ static int damon_sysfs_after_sampling(struct damon_ctx *ctx) sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_FINISHED; } - - return 0; } /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ @@ -2212,7 +2295,6 @@ int damon_sysfs_schemes_update_regions_start( damos_tried_regions_init_upd_status(sysfs_schemes, ctx); damos_regions_upd_total_bytes_only = total_bytes_only; ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; - ctx->callback.after_sampling = damon_sysfs_after_sampling; return 0; } @@ -2241,7 +2323,6 @@ int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx) { damon_sysfs_schemes_for_damos_callback = NULL; ctx->callback.before_damos_apply = NULL; - ctx->callback.after_sampling = NULL; damon_sysfs_schemes_region_idx = 0; return 0; } diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 1f891e18b4ee..6fee383bc0c5 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1020,6 +1020,11 @@ enum damon_sysfs_cmd { */ DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS, /* + * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS: Update the + * effective size quota of the scheme in bytes. + */ + DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS, + /* * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands. */ NR_DAMON_SYSFS_CMDS, @@ -1035,6 +1040,7 @@ static const char * const damon_sysfs_cmd_strs[] = { "update_schemes_tried_bytes", "update_schemes_tried_regions", "clear_schemes_tried_regions", + "update_schemes_effective_quotas", }; /* @@ -1371,19 +1377,43 @@ static int damon_sysfs_commit_schemes_quota_goals( ctx = sysfs_kdamond->damon_ctx; sysfs_ctx = sysfs_kdamond->contexts->contexts_arr[0]; - damos_sysfs_set_quota_scores(sysfs_ctx->schemes, ctx); + return damos_sysfs_set_quota_scores(sysfs_ctx->schemes, ctx); +} + +/* + * damon_sysfs_upd_schemes_effective_quotas() - Update schemes effective quotas + * sysfs files. + * @kdamond: The kobject wrapper that associated to the kdamond thread. + * + * This function reads the schemes' effective quotas of specific kdamond and + * update the related values for sysfs files. This function should be called + * from DAMON callbacks while holding ``damon_syfs_lock``, to safely access the + * DAMON contexts-internal data and DAMON sysfs variables. 
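/*
 * Editor's note -- illustrative sketch, not part of this patch: the new
 * "update_schemes_effective_quotas" command and the read-only
 * "effective_bytes" file are meant to be driven from userspace roughly as
 * below.  The directory layout under /sys/kernel/mm/damon/admin/ is assumed
 * from the existing DAMON sysfs interface, and kdamond/context/scheme index 0
 * is an assumption for illustration.
 */
#include <stdio.h>

#define KDAMOND "/sys/kernel/mm/damon/admin/kdamonds/0"

int main(void)
{
	FILE *f;
	unsigned long effective_bytes;

	/* ask the running kdamond to refresh the effective quota values */
	f = fopen(KDAMOND "/state", "w");
	if (!f)
		return 1;
	fprintf(f, "update_schemes_effective_quotas\n");
	fclose(f);

	/* then read the refreshed value for the first scheme */
	f = fopen(KDAMOND "/contexts/0/schemes/0/quotas/effective_bytes", "r");
	if (!f)
		return 1;
	if (fscanf(f, "%lu", &effective_bytes) == 1)
		printf("effective size quota: %lu bytes\n", effective_bytes);
	fclose(f);
	return 0;
}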
+ */ +static int damon_sysfs_upd_schemes_effective_quotas( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + damos_sysfs_update_effective_quotas( + kdamond->contexts->contexts_arr[0]->schemes, ctx); return 0; } + /* * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests. * @c: The DAMON context of the callback. * @active: Whether @c is not deactivated due to watermarks. + * @after_aggr: Whether this is called from after_aggregation() callback. * * This function is periodically called back from the kdamond thread for @c. * Then, it checks if there is a waiting DAMON sysfs request and handles it. */ -static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active) +static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active, + bool after_aggregation) { struct damon_sysfs_kdamond *kdamond; bool total_bytes_only = false; @@ -1401,6 +1431,8 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active) err = damon_sysfs_upd_schemes_stats(kdamond); break; case DAMON_SYSFS_CMD_COMMIT: + if (!after_aggregation) + goto out; err = damon_sysfs_commit_input(kdamond); break; case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS: @@ -1418,6 +1450,7 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active) goto keep_lock_out; } } else { + damos_sysfs_mark_finished_regions_updates(c); /* * Continue regions updating if DAMON is till * active and the update for all schemes is not @@ -1432,6 +1465,9 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active) case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: err = damon_sysfs_clear_schemes_regions(kdamond); break; + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS: + err = damon_sysfs_upd_schemes_effective_quotas(kdamond); + break; default: break; } @@ -1450,7 +1486,16 @@ static int damon_sysfs_after_wmarks_check(struct damon_ctx *c) * after_wmarks_check() is called back while the context is deactivated * by watermarks. */ - return damon_sysfs_cmd_request_callback(c, false); + return damon_sysfs_cmd_request_callback(c, false, false); +} + +static int damon_sysfs_after_sampling(struct damon_ctx *c) +{ + /* + * after_sampling() is called back only while the context is not + * deactivated by watermarks. + */ + return damon_sysfs_cmd_request_callback(c, true, false); } static int damon_sysfs_after_aggregation(struct damon_ctx *c) @@ -1459,7 +1504,7 @@ static int damon_sysfs_after_aggregation(struct damon_ctx *c) * after_aggregation() is called back only while the context is not * deactivated by watermarks. 
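/*
 * Editor's note -- illustrative sketch, not part of this patch: the hunks
 * above thread an after_aggregation flag through the shared request callback
 * so that commands which need a completed aggregation interval (e.g. the
 * "commit" command) are simply skipped when the callback fires from the
 * sampling or watermark hooks and retried from after_aggregation().  The
 * standalone model below mirrors only that dispatch shape; every identifier
 * in it is hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

enum cmd { CMD_NONE, CMD_UPDATE_STATS, CMD_COMMIT };

static enum cmd pending = CMD_COMMIT;

static void request_callback(bool after_aggregation)
{
	switch (pending) {
	case CMD_UPDATE_STATS:
		puts("stats updated");	/* safe from any hook */
		break;
	case CMD_COMMIT:
		if (!after_aggregation)
			return;		/* keep pending, retry later */
		puts("inputs committed");
		break;
	default:
		return;
	}
	pending = CMD_NONE;		/* only completed requests are cleared */
}

int main(void)
{
	request_callback(false);	/* after_sampling(): commit deferred */
	request_callback(true);		/* after_aggregation(): commit handled */
	return 0;
}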
*/ - return damon_sysfs_cmd_request_callback(c, true); + return damon_sysfs_cmd_request_callback(c, true, true); } static struct damon_ctx *damon_sysfs_build_ctx( @@ -1478,6 +1523,7 @@ static struct damon_ctx *damon_sysfs_build_ctx( } ctx->callback.after_wmarks_check = damon_sysfs_after_wmarks_check; + ctx->callback.after_sampling = damon_sysfs_after_sampling; ctx->callback.after_aggregation = damon_sysfs_after_aggregation; ctx->callback.before_terminate = damon_sysfs_before_terminate; return ctx; diff --git a/mm/debug.c b/mm/debug.c index ee533a5ceb79..69e524c3e601 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -51,87 +51,98 @@ const struct trace_print_flags vmaflag_names[] = { {0, NULL} }; -static void __dump_page(struct page *page) +static void __dump_folio(struct folio *folio, struct page *page, + unsigned long pfn, unsigned long idx) { - struct folio *folio = page_folio(page); - struct page *head = &folio->page; - struct address_space *mapping; - bool compound = PageCompound(page); - /* - * Accessing the pageblock without the zone lock. It could change to - * "isolate" again in the meantime, but since we are just dumping the - * state for debugging, it should be fine to accept a bit of - * inaccuracy here due to racing. - */ - bool page_cma = is_migrate_cma_page(page); - int mapcount; + struct address_space *mapping = folio_mapping(folio); + int mapcount = atomic_read(&page->_mapcount); char *type = ""; - if (page < head || (page >= head + MAX_ORDER_NR_PAGES)) { - /* - * Corrupt page, so we cannot call page_mapping. Instead, do a - * safe subset of the steps that page_mapping() does. Caution: - * this will be misleading for tail pages, PageSwapCache pages, - * and potentially other situations. (See the page_mapping() - * implementation for what's missing here.) - */ - unsigned long tmp = (unsigned long)page->mapping; - - if (tmp & PAGE_MAPPING_ANON) - mapping = NULL; - else - mapping = (void *)(tmp & ~PAGE_MAPPING_FLAGS); - head = page; - folio = (struct folio *)page; - compound = false; - } else { - mapping = page_mapping(page); - } - - /* - * Avoid VM_BUG_ON() in page_mapcount(). - * page->_mapcount space in struct page is used by sl[aou]b pages to - * encode own info. - */ - mapcount = PageSlab(head) ? 0 : page_mapcount(page); - - pr_warn("page:%p refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n", - page, page_ref_count(head), mapcount, mapping, - page_to_pgoff(page), page_to_pfn(page)); - if (compound) { - pr_warn("head:%p order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n", - head, compound_order(head), + mapcount = page_type_has_type(mapcount) ? 0 : mapcount + 1; + pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n", + folio_ref_count(folio), mapcount, mapping, + folio->index + idx, pfn); + if (folio_test_large(folio)) { + pr_warn("head: order:%u mapcount:%d entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n", + folio_order(folio), + folio_mapcount(folio), folio_entire_mapcount(folio), folio_nr_pages_mapped(folio), atomic_read(&folio->_pincount)); } #ifdef CONFIG_MEMCG - if (head->memcg_data) - pr_warn("memcg:%lx\n", head->memcg_data); + if (folio->memcg_data) + pr_warn("memcg:%lx\n", folio->memcg_data); #endif - if (PageKsm(page)) + if (folio_test_ksm(folio)) type = "ksm "; - else if (PageAnon(page)) + else if (folio_test_anon(folio)) type = "anon "; else if (mapping) dump_mapping(mapping); BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); - pr_warn("%sflags: %pGp%s\n", type, &head->flags, - page_cma ? 
" CMA" : ""); - pr_warn("page_type: %pGt\n", &head->page_type); + /* + * Accessing the pageblock without the zone lock. It could change to + * "isolate" again in the meantime, but since we are just dumping the + * state for debugging, it should be fine to accept a bit of + * inaccuracy here due to racing. + */ + pr_warn("%sflags: %pGp%s\n", type, &folio->flags, + is_migrate_cma_folio(folio, pfn) ? " CMA" : ""); + if (page_has_type(&folio->page)) + pr_warn("page_type: %pGt\n", &folio->page.page_type); print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32, sizeof(unsigned long), page, sizeof(struct page), false); - if (head != page) + if (folio_test_large(folio)) print_hex_dump(KERN_WARNING, "head: ", DUMP_PREFIX_NONE, 32, - sizeof(unsigned long), head, - sizeof(struct page), false); + sizeof(unsigned long), folio, + 2 * sizeof(struct page), false); } -void dump_page(struct page *page, const char *reason) +static void __dump_page(const struct page *page) +{ + struct folio *foliop, folio; + struct page precise; + unsigned long pfn = page_to_pfn(page); + unsigned long idx, nr_pages = 1; + int loops = 5; + +again: + memcpy(&precise, page, sizeof(*page)); + foliop = page_folio(&precise); + if (foliop == (struct folio *)&precise) { + idx = 0; + if (!folio_test_large(foliop)) + goto dump; + foliop = (struct folio *)page; + } else { + idx = folio_page_idx(foliop, page); + } + + if (idx < MAX_FOLIO_NR_PAGES) { + memcpy(&folio, foliop, 2 * sizeof(struct page)); + nr_pages = folio_nr_pages(&folio); + foliop = &folio; + } + + if (idx > nr_pages) { + if (loops-- > 0) + goto again; + pr_warn("page does not match folio\n"); + precise.compound_head &= ~1UL; + foliop = (struct folio *)&precise; + idx = 0; + } + +dump: + __dump_folio(foliop, &precise, pfn, idx); +} + +void dump_page(const struct page *page, const char *reason) { if (PagePoisoned(page)) pr_warn("page:%p is uninitialized and poisoned", page); @@ -162,9 +173,6 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { pr_emerg("mm %px task_size %lu\n" -#ifdef CONFIG_MMU - "get_unmapped_area %px\n" -#endif "mmap_base %lu mmap_legacy_base %lu\n" "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" @@ -190,9 +198,6 @@ void dump_mm(const struct mm_struct *mm) "def_flags: %#lx(%pGv)\n", mm, mm->task_size, -#ifdef CONFIG_MMU - mm->get_unmapped_area, -#endif mm->mmap_base, mm->mmap_legacy_base, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c index 6755f0c9d4a3..d46acf989dde 100644 --- a/mm/debug_page_alloc.c +++ b/mm/debug_page_alloc.c @@ -32,8 +32,7 @@ static int __init debug_guardpage_minorder_setup(char *buf) } early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); -bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, - int migratetype) +bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order) { if (order >= debug_guardpage_minorder()) return false; @@ -41,19 +40,12 @@ bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, __SetPageGuard(page); INIT_LIST_HEAD(&page->buddy_list); set_page_private(page, order); - /* Guard pages are not available for any usage */ - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, -(1 << order), migratetype); return true; } -void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, - int migratetype) +void 
__clear_page_guard(struct zone *zone, struct page *page, unsigned int order) { __ClearPageGuard(page); - set_page_private(page, 0); - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, (1 << order), migratetype); } diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 65c19025da3d..b104a353b532 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -30,6 +30,7 @@ #include <linux/start_kernel.h> #include <linux/sched/mm.h> #include <linux/io.h> +#include <linux/vmalloc.h> #include <asm/cacheflush.h> #include <asm/pgalloc.h> @@ -981,6 +982,7 @@ static void __init pmd_thp_tests(struct pgtable_debug_args *args) #ifndef __HAVE_ARCH_PMDP_INVALIDATE WARN_ON(!pmd_trans_huge(pmd_mkinvalid(pmd_mkhuge(pmd)))); WARN_ON(!pmd_present(pmd_mkinvalid(pmd_mkhuge(pmd)))); + WARN_ON(!pmd_leaf(pmd_mkinvalid(pmd_mkhuge(pmd)))); #endif /* __HAVE_ARCH_PMDP_INVALIDATE */ } diff --git a/mm/execmem.c b/mm/execmem.c new file mode 100644 index 000000000000..0c4b36bc6d10 --- /dev/null +++ b/mm/execmem.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2002 Richard Henderson + * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. + * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org> + * Copyright (C) 2024 Mike Rapoport IBM. + */ + +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/execmem.h> +#include <linux/moduleloader.h> + +static struct execmem_info *execmem_info __ro_after_init; +static struct execmem_info default_execmem_info __ro_after_init; + +static void *__execmem_alloc(struct execmem_range *range, size_t size) +{ + bool kasan = range->flags & EXECMEM_KASAN_SHADOW; + unsigned long vm_flags = VM_FLUSH_RESET_PERMS; + gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN; + unsigned long start = range->start; + unsigned long end = range->end; + unsigned int align = range->alignment; + pgprot_t pgprot = range->pgprot; + void *p; + + if (kasan) + vm_flags |= VM_DEFER_KMEMLEAK; + + p = __vmalloc_node_range(size, align, start, end, gfp_flags, + pgprot, vm_flags, NUMA_NO_NODE, + __builtin_return_address(0)); + if (!p && range->fallback_start) { + start = range->fallback_start; + end = range->fallback_end; + p = __vmalloc_node_range(size, align, start, end, gfp_flags, + pgprot, vm_flags, NUMA_NO_NODE, + __builtin_return_address(0)); + } + + if (!p) { + pr_warn_ratelimited("execmem: unable to allocate memory\n"); + return NULL; + } + + if (kasan && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) { + vfree(p); + return NULL; + } + + return kasan_reset_tag(p); +} + +void *execmem_alloc(enum execmem_type type, size_t size) +{ + struct execmem_range *range = &execmem_info->ranges[type]; + + return __execmem_alloc(range, size); +} + +void execmem_free(void *ptr) +{ + /* + * This memory may be RO, and freeing RO memory in an interrupt is not + * supported by vmalloc. 
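/*
 * Editor's note -- illustrative sketch, not part of this patch:
 * __execmem_alloc() above first tries the preferred [start, end) range and
 * only falls back to [fallback_start, fallback_end) when that attempt fails
 * (for example a tight module area versus the whole vmalloc space).  The
 * standalone model below shows just that "preferred range, then fallback
 * range" shape; the types and helpers in it are hypothetical stand-ins for a
 * range-constrained allocator such as __vmalloc_node_range().
 */
#include <stddef.h>
#include <stdio.h>

struct range {
	unsigned long start, end;
	unsigned long fallback_start, fallback_end;
};

/* stand-in allocator: "succeeds" only if the range is big enough */
static void *alloc_in(unsigned long start, unsigned long end, size_t size)
{
	return (end - start >= size) ? (void *)start : NULL;
}

static void *range_alloc(const struct range *r, size_t size)
{
	void *p = alloc_in(r->start, r->end, size);

	if (!p && r->fallback_start)
		p = alloc_in(r->fallback_start, r->fallback_end, size);
	return p;
}

int main(void)
{
	struct range tight = {
		.start = 0x1000, .end = 0x2000,		/* 4KiB preferred area */
		.fallback_start = 0x10000, .fallback_end = 0x90000,
	};

	/* too big for the preferred range, so the fallback range is used */
	printf("allocated at %p\n", range_alloc(&tight, 0x8000));
	return 0;
}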
+ */ + WARN_ON(in_interrupt()); + vfree(ptr); +} + +static bool execmem_validate(struct execmem_info *info) +{ + struct execmem_range *r = &info->ranges[EXECMEM_DEFAULT]; + + if (!r->alignment || !r->start || !r->end || !pgprot_val(r->pgprot)) { + pr_crit("Invalid parameters for execmem allocator, module loading will fail"); + return false; + } + + return true; +} + +static void execmem_init_missing(struct execmem_info *info) +{ + struct execmem_range *default_range = &info->ranges[EXECMEM_DEFAULT]; + + for (int i = EXECMEM_DEFAULT + 1; i < EXECMEM_TYPE_MAX; i++) { + struct execmem_range *r = &info->ranges[i]; + + if (!r->start) { + if (i == EXECMEM_MODULE_DATA) + r->pgprot = PAGE_KERNEL; + else + r->pgprot = default_range->pgprot; + r->alignment = default_range->alignment; + r->start = default_range->start; + r->end = default_range->end; + r->flags = default_range->flags; + r->fallback_start = default_range->fallback_start; + r->fallback_end = default_range->fallback_end; + } + } +} + +struct execmem_info * __weak execmem_arch_setup(void) +{ + return NULL; +} + +static void __init __execmem_init(void) +{ + struct execmem_info *info = execmem_arch_setup(); + + if (!info) { + info = execmem_info = &default_execmem_info; + info->ranges[EXECMEM_DEFAULT].start = VMALLOC_START; + info->ranges[EXECMEM_DEFAULT].end = VMALLOC_END; + info->ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL_EXEC; + info->ranges[EXECMEM_DEFAULT].alignment = 1; + } + + if (!execmem_validate(info)) + return; + + execmem_init_missing(info); + + execmem_info = info; +} + +#ifdef CONFIG_ARCH_WANTS_EXECMEM_LATE +static int __init execmem_late_init(void) +{ + __execmem_init(); + return 0; +} +core_initcall(execmem_late_init); +#else +void __init execmem_init(void) +{ + __execmem_init(); +} +#endif diff --git a/mm/filemap.c b/mm/filemap.c index 4a30de98a8c7..382c3d06bfb1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -124,6 +124,15 @@ * ->private_lock (zap_pte_range->block_dirty_folio) */ +static void mapping_set_update(struct xa_state *xas, + struct address_space *mapping) +{ + if (dax_mapping(mapping) || shmem_mapping(mapping)) + return; + xas_set_update(xas, workingset_update_node); + xas_set_lru(xas, &shadow_nodes); +} + static void page_cache_delete(struct address_space *mapping, struct folio *folio, void *shadow) { @@ -159,7 +168,7 @@ static void filemap_unaccount_folio(struct address_space *mapping, add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); if (mapping_exiting(mapping) && !folio_test_large(folio)) { - int mapcount = page_mapcount(&folio->page); + int mapcount = folio_mapcount(folio); if (folio_ref_count(folio) >= mapcount + 2) { /* @@ -843,23 +852,18 @@ noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) { XA_STATE(xas, &mapping->i_pages, index); - int huge = folio_test_hugetlb(folio); - bool charged = false; - long nr = 1; + void *alloced_shadow = NULL; + int alloced_order = 0; + bool huge; + long nr; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); mapping_set_update(&xas, mapping); - if (!huge) { - int error = mem_cgroup_charge(folio, NULL, gfp); - if (error) - return error; - charged = true; - } - VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); xas_set_order(&xas, index, folio_order(folio)); + huge = folio_test_hugetlb(folio); nr = folio_nr_pages(folio); gfp &= GFP_RECLAIM_MASK; @@ -867,13 +871,10 @@ noinline int __filemap_add_folio(struct address_space *mapping, 
folio->mapping = mapping; folio->index = xas.xa_index; - do { - unsigned int order = xa_get_order(xas.xa, xas.xa_index); + for (;;) { + int order = -1, split_order = 0; void *entry, *old = NULL; - if (order > folio_order(folio)) - xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index), - order, gfp); xas_lock_irq(&xas); xas_for_each_conflict(&xas, entry) { old = entry; @@ -881,19 +882,33 @@ noinline int __filemap_add_folio(struct address_space *mapping, xas_set_err(&xas, -EEXIST); goto unlock; } + /* + * If a larger entry exists, + * it will be the first and only entry iterated. + */ + if (order == -1) + order = xas_get_order(&xas); + } + + /* entry may have changed before we re-acquire the lock */ + if (alloced_order && (old != alloced_shadow || order != alloced_order)) { + xas_destroy(&xas); + alloced_order = 0; } if (old) { - if (shadowp) - *shadowp = old; - /* entry may have been split before we acquired lock */ - order = xa_get_order(xas.xa, xas.xa_index); - if (order > folio_order(folio)) { + if (order > 0 && order > folio_order(folio)) { /* How to handle large swap entries? */ BUG_ON(shmem_mapping(mapping)); + if (!alloced_order) { + split_order = order; + goto unlock; + } xas_split(&xas, old, order); xas_reset(&xas); } + if (shadowp) + *shadowp = old; } xas_store(&xas, folio); @@ -909,9 +924,24 @@ noinline int __filemap_add_folio(struct address_space *mapping, __lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr); } + unlock: xas_unlock_irq(&xas); - } while (xas_nomem(&xas, gfp)); + + /* split needed, alloc here and retry. */ + if (split_order) { + xas_split_alloc(&xas, old, split_order, gfp); + if (xas_error(&xas)) + goto error; + alloced_shadow = old; + alloced_order = split_order; + xas_reset(&xas); + continue; + } + + if (!xas_nomem(&xas, gfp)) + break; + } if (xas_error(&xas)) goto error; @@ -919,8 +949,6 @@ unlock: trace_mm_filemap_add_to_page_cache(folio); return 0; error: - if (charged) - mem_cgroup_uncharge(folio); folio->mapping = NULL; /* Leave page->index set: truncation relies upon it */ folio_put_refs(folio, nr); @@ -934,11 +962,16 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, void *shadow = NULL; int ret; + ret = mem_cgroup_charge(folio, NULL, gfp); + if (ret) + return ret; + __folio_set_locked(folio); ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow); - if (unlikely(ret)) + if (unlikely(ret)) { + mem_cgroup_uncharge(folio); __folio_clear_locked(folio); - else { + } else { /* * The folio might have been evicted from cache only * recently, in which case it should be activated like @@ -957,7 +990,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, EXPORT_SYMBOL_GPL(filemap_add_folio); #ifdef CONFIG_NUMA -struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) +struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) { int n; struct folio *folio; @@ -972,9 +1005,9 @@ struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) return folio; } - return folio_alloc(gfp, order); + return folio_alloc_noprof(gfp, order); } -EXPORT_SYMBOL(filemap_alloc_folio); +EXPORT_SYMBOL(filemap_alloc_folio_noprof); #endif /* @@ -1354,7 +1387,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) unsigned long pflags; bool in_thrashing; wait_queue_head_t *q; - struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); + struct folio *folio = pfn_swap_entry_folio(entry); q = folio_waitqueue(folio); if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { @@ 
-1531,7 +1564,7 @@ EXPORT_SYMBOL(folio_end_private_2); * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * - * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio. + * Wait for PG_private_2 to be cleared on a folio. */ void folio_wait_private_2(struct folio *folio) { @@ -1544,8 +1577,8 @@ EXPORT_SYMBOL(folio_wait_private_2); * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * - * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a - * fatal signal is received by the calling task. + * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is + * received by the calling task. * * Return: * - 0 if successful. @@ -1777,7 +1810,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); * C. Return the page to the page allocator * * This means that any page may have its reference count temporarily - * increased by a speculative page cache (or fast GUP) lookup as it can + * increased by a speculative page cache (or GUP-fast) lookup as it can * be allocated by another user before the RCU grace period expires. * Because the refcount temporarily acquired here may end up being the * last refcount on the page, any page allocation must be freeable by @@ -1912,8 +1945,6 @@ no_page: gfp_t alloc_gfp = gfp; err = -ENOMEM; - if (order == 1) - order = 0; if (order > 0) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; folio = filemap_alloc_folio(alloc_gfp, order); @@ -2609,15 +2640,6 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); /* - * Pairs with a barrier in - * block_write_end()->mark_buffer_dirty() or other page - * dirtying routines like iomap_write_end() to ensure - * changes to page contents are visible before we see - * increased inode size. - */ - smp_rmb(); - - /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: */ @@ -3183,6 +3205,48 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, return fpin; } +static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret = 0; + pte_t *ptep; + + /* + * We might have COW'ed a pagecache folio and might now have an mlocked + * anon folio mapped. The original pagecache folio is not mlocked and + * might have been evicted. During a read+clear/modify/write update of + * the PTE, such as done in do_numa_page()/change_pte_range(), we + * temporarily clear the PTE under PT lock and might detect it here as + * "none" when not holding the PT lock. + * + * Not rechecking the PTE under PT lock could result in an unexpected + * major fault in an mlock'ed region. Recheck only for this special + * scenario while holding the PT lock, to not degrade non-mlocked + * scenarios. Recheck the PTE without PT lock firstly, thereby reducing + * the number of times we hold PT lock. 
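/*
 * Editor's note -- illustrative sketch, not part of this patch: the comment
 * above describes a "check locklessly first, confirm under the lock only
 * when needed" ordering, so the common case never takes the PT lock while
 * the racy transient state (a PTE temporarily cleared under the lock) is
 * still caught.  The pthread model below mirrors only that ordering;
 * everything in it is hypothetical.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static _Atomic long entry = 42;		/* stand-in for the PTE */

/* updater: transiently clears the entry while holding the lock */
static void *modify(void *arg)
{
	pthread_mutex_lock(&lock);
	atomic_store(&entry, 0);	/* read+clear ... */
	atomic_store(&entry, 43);	/* ... modify+write, still locked */
	pthread_mutex_unlock(&lock);
	return NULL;
}

/* reader: only pays for the lock when the lockless check looks empty */
static bool entry_is_none(void)
{
	bool none;

	if (atomic_load(&entry) != 0)
		return false;		/* fast path, no lock taken */

	pthread_mutex_lock(&lock);	/* slow path: recheck under the lock */
	none = (atomic_load(&entry) == 0);
	pthread_mutex_unlock(&lock);
	return none;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, modify, NULL);
	printf("entry none? %s\n", entry_is_none() ? "yes" : "no");
	pthread_join(t, NULL);
	return 0;
}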
+ */ + if (!(vma->vm_flags & VM_LOCKED)) + return 0; + + if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) + return 0; + + ptep = pte_offset_map(vmf->pmd, vmf->address); + if (unlikely(!ptep)) + return VM_FAULT_NOPAGE; + + if (unlikely(!pte_none(ptep_get_lockless(ptep)))) { + ret = VM_FAULT_NOPAGE; + } else { + spin_lock(vmf->ptl); + if (unlikely(!pte_none(ptep_get(ptep)))) + ret = VM_FAULT_NOPAGE; + spin_unlock(vmf->ptl); + } + pte_unmap(ptep); + return ret; +} + /** * filemap_fault - read in file data for page fault handling * @vmf: struct vm_fault containing details of the fault @@ -3238,6 +3302,10 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) mapping_locked = true; } } else { + ret = filemap_fault_recheck_pte_none(vmf); + if (unlikely(ret)) + return ret; + /* No page in the page cache at all */ count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); @@ -3437,7 +3505,7 @@ skip: static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, unsigned long addr, unsigned int nr_pages, - unsigned int *mmap_miss) + unsigned long *rss, unsigned int *mmap_miss) { vm_fault_t ret = 0; struct page *page = folio_page(folio, start); @@ -3448,7 +3516,15 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, if (PageHWPoison(page + count)) goto skip; - (*mmap_miss)++; + /* + * If there are too many folios that are recently evicted + * in a file, they will probably continue to be evicted. + * In such situation, read-ahead is only a waste of IO. + * Don't decrease mmap_miss in this scenario to make sure + * we can stop read-ahead. + */ + if (!folio_test_workingset(folio)) + (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be @@ -3463,6 +3539,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, skip: if (count) { set_pte_range(vmf, folio, page, count, addr); + *rss += count; folio_ref_add(folio, count); if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; @@ -3477,6 +3554,7 @@ skip: if (count) { set_pte_range(vmf, folio, page, count, addr); + *rss += count; folio_ref_add(folio, count); if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; @@ -3489,7 +3567,7 @@ skip: static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, struct folio *folio, unsigned long addr, - unsigned int *mmap_miss) + unsigned long *rss, unsigned int *mmap_miss) { vm_fault_t ret = 0; struct page *page = &folio->page; @@ -3497,7 +3575,9 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, if (PageHWPoison(page)) return ret; - (*mmap_miss)++; + /* See comment of filemap_map_folio_range() */ + if (!folio_test_workingset(folio)) + (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be @@ -3511,6 +3591,7 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, ret = VM_FAULT_NOPAGE; set_pte_range(vmf, folio, page, 1, addr); + (*rss)++; folio_ref_inc(folio); return ret; @@ -3527,7 +3608,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, XA_STATE(xas, &mapping->i_pages, start_pgoff); struct folio *folio; vm_fault_t ret = 0; - unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved; + unsigned long rss = 0; + unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type; rcu_read_lock(); folio = next_uptodate_folio(&xas, mapping, end_pgoff); @@ -3546,6 +3628,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, folio_put(folio); goto out; } + + folio_type = mm_counter_file(folio); do { unsigned long end; @@ 
-3557,15 +3641,16 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, if (!folio_test_large(folio)) ret |= filemap_map_order0_folio(vmf, - folio, addr, &mmap_miss); + folio, addr, &rss, &mmap_miss); else ret |= filemap_map_folio_range(vmf, folio, xas.xa_index - folio->index, addr, - nr_pages, &mmap_miss); + nr_pages, &rss, &mmap_miss); folio_unlock(folio); folio_put(folio); } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); + add_mm_counter(vma->vm_mm, folio_type, rss); pte_unmap_unlock(vmf->pte, vmf->ptl); out: rcu_read_unlock(); @@ -4090,6 +4175,60 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) } EXPORT_SYMBOL(filemap_release_folio); +/** + * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache + * @inode: The inode to flush + * @flush: Set to write back rather than simply invalidate. + * @start: First byte to in range. + * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start + * onwards. + * + * Invalidate all the folios on an inode that contribute to the specified + * range, possibly writing them back first. Whilst the operation is + * undertaken, the invalidate lock is held to prevent new folios from being + * installed. + */ +int filemap_invalidate_inode(struct inode *inode, bool flush, + loff_t start, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t first = start >> PAGE_SHIFT; + pgoff_t last = end >> PAGE_SHIFT; + pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1; + + if (!mapping || !mapping->nrpages || end < start) + goto out; + + /* Prevent new folios from being added to the inode. */ + filemap_invalidate_lock(mapping); + + if (!mapping->nrpages) + goto unlock; + + unmap_mapping_pages(mapping, first, nr, false); + + /* Write back the data if we're asked to. */ + if (flush) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_start = start, + .range_end = end, + }; + + filemap_fdatawrite_wbc(mapping, &wbc); + } + + /* Wait for writeback to complete on all folios and discard. */ + truncate_inode_pages_range(mapping, start, end); + +unlock: + filemap_invalidate_unlock(mapping); +out: + return filemap_check_errors(mapping); +} +EXPORT_SYMBOL_GPL(filemap_invalidate_inode); + #ifdef CONFIG_CACHESTAT_SYSCALL /** * filemap_cachestat() - compute the page cache statistics of a mapping @@ -4153,7 +4292,23 @@ static void filemap_cachestat(struct address_space *mapping, /* shmem file - in swap cache */ swp_entry_t swp = radix_to_swp_entry(folio); + /* swapin error results in poisoned entry */ + if (non_swap_entry(swp)) + goto resched; + + /* + * Getting a swap entry from the shmem + * inode means we beat + * shmem_unuse(). rcu_read_lock() + * ensures swapoff waits for us before + * freeing the swapper space. However, + * we can race with swapping and + * invalidation, so there might not be + * a shadow in the swapcache (yet). 
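/*
 * Editor's note -- illustrative sketch, not part of this patch:
 * filemap_cachestat() above backs the cachestat(2) syscall, and the shadow
 * entries it inspects are what feed the "(recently) evicted" counters.  A
 * minimal userspace caller, assuming the established cachestat(2) ABI
 * (struct layout, syscall number 451, and len == 0 meaning "to end of
 * file"), could look like this.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_cachestat
#define __NR_cachestat 451
#endif

struct cachestat_range { uint64_t off, len; };
struct cachestat {
	uint64_t nr_cache, nr_dirty, nr_writeback;
	uint64_t nr_evicted, nr_recently_evicted;
};

int main(int argc, char **argv)
{
	struct cachestat_range range = { 0, 0 };	/* off=0, len=0: whole file */
	struct cachestat cs;
	int fd = open(argc > 1 ? argv[1] : "/etc/hostname", O_RDONLY);

	if (fd < 0 || syscall(__NR_cachestat, fd, &range, &cs, 0))
		return 1;
	printf("cached %llu, evicted %llu, recently evicted %llu pages\n",
	       (unsigned long long)cs.nr_cache,
	       (unsigned long long)cs.nr_evicted,
	       (unsigned long long)cs.nr_recently_evicted);
	close(fd);
	return 0;
}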
+ */ shadow = get_shadow_from_swap_cache(swp); + if (!shadow) + goto resched; } #endif if (workingset_test_recent(shadow, true, &workingset)) diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 50412014f16f..f31e0ce65b11 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -58,12 +58,6 @@ bool set_page_dirty(struct page *page) } EXPORT_SYMBOL(set_page_dirty); -int __set_page_dirty_nobuffers(struct page *page) -{ - return filemap_dirty_folio(page_mapping(page), page_folio(page)); -} -EXPORT_SYMBOL(__set_page_dirty_nobuffers); - bool clear_page_dirty_for_io(struct page *page) { return folio_clear_dirty_for_io(page_folio(page)); @@ -89,7 +89,7 @@ retry: * belongs to this folio. */ if (unlikely(page_folio(page) != folio)) { - if (!put_devmap_managed_page_refs(&folio->page, refs)) + if (!put_devmap_managed_folio_refs(folio, refs)) folio_put_refs(folio, refs); goto retry; } @@ -156,7 +156,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) */ if (unlikely((flags & FOLL_LONGTERM) && !folio_is_longterm_pinnable(folio))) { - if (!put_devmap_managed_page_refs(&folio->page, refs)) + if (!put_devmap_managed_folio_refs(folio, refs)) folio_put_refs(folio, refs); return NULL; } @@ -198,7 +198,7 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) refs *= GUP_PIN_COUNTING_BIAS; } - if (!put_devmap_managed_page_refs(&folio->page, refs)) + if (!put_devmap_managed_folio_refs(folio, refs)) folio_put_refs(folio, refs); } @@ -440,7 +440,7 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, } EXPORT_SYMBOL(unpin_user_page_range_dirty_lock); -static void unpin_user_pages_lockless(struct page **pages, unsigned long npages) +static void gup_fast_unpin_user_pages(struct page **pages, unsigned long npages) { unsigned long i; struct folio *folio; @@ -500,23 +500,344 @@ static inline void mm_set_has_pinned_flag(unsigned long *mm_flags) } #ifdef CONFIG_MMU + +#if defined(CONFIG_ARCH_HAS_HUGEPD) || defined(CONFIG_HAVE_GUP_FAST) +static int record_subpages(struct page *page, unsigned long sz, + unsigned long addr, unsigned long end, + struct page **pages) +{ + struct page *start_page; + int nr; + + start_page = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT); + for (nr = 0; addr != end; nr++, addr += PAGE_SIZE) + pages[nr] = nth_page(start_page, nr); + + return nr; +} +#endif /* CONFIG_ARCH_HAS_HUGEPD || CONFIG_HAVE_GUP_FAST */ + +#ifdef CONFIG_ARCH_HAS_HUGEPD +static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, + unsigned long sz) +{ + unsigned long __boundary = (addr + sz) & ~(sz-1); + return (__boundary - 1 < end - 1) ? __boundary : end; +} + +/* + * Returns 1 if succeeded, 0 if failed, -EMLINK if unshare needed. + * + * NOTE: for the same entry, gup-fast and gup-slow can return different + * results (0 v.s. -EMLINK) depending on whether vma is available. This is + * the expected behavior, where we simply want gup-fast to fallback to + * gup-slow to take the vma reference first. 
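/*
 * Editor's note -- illustrative sketch, not part of this patch:
 * record_subpages() above turns one huge leaf mapping of size sz plus the
 * requested [addr, end) range into per-PAGE_SIZE subpage indices, starting
 * at (addr & (sz - 1)) >> PAGE_SHIFT.  The standalone arithmetic check below
 * just exercises that index math with a hypothetical 2MiB leaf.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

int main(void)
{
	unsigned long sz = 2UL << 20;			/* 2MiB leaf */
	unsigned long addr = 0x40000000UL + 0x5000;	/* 5 pages into it */
	unsigned long end = addr + 3 * PAGE_SIZE;	/* want 3 subpages */
	unsigned long first = (addr & (sz - 1)) >> PAGE_SHIFT;
	unsigned long nr = 0;

	for (; addr != end; addr += PAGE_SIZE, nr++)
		printf("pages[%lu] = subpage %lu of the leaf\n", nr, first + nr);

	return 0;
}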
+ */ +static int gup_hugepte(struct vm_area_struct *vma, pte_t *ptep, unsigned long sz, + unsigned long addr, unsigned long end, unsigned int flags, + struct page **pages, int *nr) +{ + unsigned long pte_end; + struct page *page; + struct folio *folio; + pte_t pte; + int refs; + + pte_end = (addr + sz) & ~(sz-1); + if (pte_end < end) + end = pte_end; + + pte = huge_ptep_get(ptep); + + if (!pte_access_permitted(pte, flags & FOLL_WRITE)) + return 0; + + /* hugepages are never "special" */ + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + page = pte_page(pte); + refs = record_subpages(page, sz, addr, end, pages + *nr); + + folio = try_grab_folio(page, refs, flags); + if (!folio) + return 0; + + if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { + gup_put_folio(folio, refs, flags); + return 0; + } + + if (!pte_write(pte) && gup_must_unshare(vma, flags, &folio->page)) { + gup_put_folio(folio, refs, flags); + return -EMLINK; + } + + *nr += refs; + folio_set_referenced(folio); + return 1; +} + +/* + * NOTE: currently GUP for a hugepd is only possible on hugetlbfs file + * systems on Power, which does not have issue with folio writeback against + * GUP updates. When hugepd will be extended to support non-hugetlbfs or + * even anonymous memory, we need to do extra check as what we do with most + * of the other folios. See writable_file_mapping_allowed() and + * gup_fast_folio_allowed() for more information. + */ +static int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, + unsigned long addr, unsigned int pdshift, + unsigned long end, unsigned int flags, + struct page **pages, int *nr) +{ + pte_t *ptep; + unsigned long sz = 1UL << hugepd_shift(hugepd); + unsigned long next; + int ret; + + ptep = hugepte_offset(hugepd, addr, pdshift); + do { + next = hugepte_addr_end(addr, end, sz); + ret = gup_hugepte(vma, ptep, sz, addr, end, flags, pages, nr); + if (ret != 1) + return ret; + } while (ptep++, addr = next, addr != end); + + return 1; +} + +static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, + unsigned long addr, unsigned int pdshift, + unsigned int flags, + struct follow_page_context *ctx) +{ + struct page *page; + struct hstate *h; + spinlock_t *ptl; + int nr = 0, ret; + pte_t *ptep; + + /* Only hugetlb supports hugepd */ + if (WARN_ON_ONCE(!is_vm_hugetlb_page(vma))) + return ERR_PTR(-EFAULT); + + h = hstate_vma(vma); + ptep = hugepte_offset(hugepd, addr, pdshift); + ptl = huge_pte_lock(h, vma->vm_mm, ptep); + ret = gup_hugepd(vma, hugepd, addr, pdshift, addr + PAGE_SIZE, + flags, &page, &nr); + spin_unlock(ptl); + + if (ret == 1) { + /* GUP succeeded */ + WARN_ON_ONCE(nr != 1); + ctx->page_mask = (1U << huge_page_order(h)) - 1; + return page; + } + + /* ret can be either 0 (translates to NULL) or negative */ + return ERR_PTR(ret); +} +#else /* CONFIG_ARCH_HAS_HUGEPD */ +static inline int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, + unsigned long addr, unsigned int pdshift, + unsigned long end, unsigned int flags, + struct page **pages, int *nr) +{ + return 0; +} + +static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, + unsigned long addr, unsigned int pdshift, + unsigned int flags, + struct follow_page_context *ctx) +{ + return NULL; +} +#endif /* CONFIG_ARCH_HAS_HUGEPD */ + + static struct page *no_page_table(struct vm_area_struct *vma, - unsigned int flags) + unsigned int flags, unsigned long address) { + if (!(flags & FOLL_DUMP)) + return NULL; + /* - * When core dumping an enormous anonymous area that nobody - * has 
touched so far, we don't want to allocate unnecessary pages or + * When core dumping, we don't want to allocate unnecessary pages or * page tables. Return error instead of NULL to skip handle_mm_fault, * then get_dump_page() will return NULL to leave a hole in the dump. * But we can only make this optimization where a hole would surely * be zero-filled if handle_mm_fault() actually did handle it. */ - if ((flags & FOLL_DUMP) && - (vma_is_anonymous(vma) || !vma->vm_ops->fault)) + if (is_vm_hugetlb_page(vma)) { + struct hstate *h = hstate_vma(vma); + + if (!hugetlbfs_pagecache_present(h, vma, address)) + return ERR_PTR(-EFAULT); + } else if ((vma_is_anonymous(vma) || !vma->vm_ops->fault)) { + return ERR_PTR(-EFAULT); + } + + return NULL; +} + +#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES +static struct page *follow_huge_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp, + int flags, struct follow_page_context *ctx) +{ + struct mm_struct *mm = vma->vm_mm; + struct page *page; + pud_t pud = *pudp; + unsigned long pfn = pud_pfn(pud); + int ret; + + assert_spin_locked(pud_lockptr(mm, pudp)); + + if ((flags & FOLL_WRITE) && !pud_write(pud)) + return NULL; + + if (!pud_present(pud)) + return NULL; + + pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; + + if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) && + pud_devmap(pud)) { + /* + * device mapped pages can only be returned if the caller + * will manage the page reference count. + * + * At least one of FOLL_GET | FOLL_PIN must be set, so + * assert that here: + */ + if (!(flags & (FOLL_GET | FOLL_PIN))) + return ERR_PTR(-EEXIST); + + if (flags & FOLL_TOUCH) + touch_pud(vma, addr, pudp, flags & FOLL_WRITE); + + ctx->pgmap = get_dev_pagemap(pfn, ctx->pgmap); + if (!ctx->pgmap) + return ERR_PTR(-EFAULT); + } + + page = pfn_to_page(pfn); + + if (!pud_devmap(pud) && !pud_write(pud) && + gup_must_unshare(vma, flags, page)) + return ERR_PTR(-EMLINK); + + ret = try_grab_page(page, flags); + if (ret) + page = ERR_PTR(ret); + else + ctx->page_mask = HPAGE_PUD_NR - 1; + + return page; +} + +/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */ +static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, + struct vm_area_struct *vma, + unsigned int flags) +{ + /* If the pmd is writable, we can write to the page. */ + if (pmd_write(pmd)) + return true; + + /* Maybe FOLL_FORCE is set to override it? */ + if (!(flags & FOLL_FORCE)) + return false; + + /* But FOLL_FORCE has no effect on shared mappings */ + if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) + return false; + + /* ... or read-only private ones */ + if (!(vma->vm_flags & VM_MAYWRITE)) + return false; + + /* ... or already writable ones that just need to take a write fault */ + if (vma->vm_flags & VM_WRITE) + return false; + + /* + * See can_change_pte_writable(): we broke COW and could map the page + * writable if we have an exclusive anonymous page ... + */ + if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + return false; + + /* ... and a write-fault isn't required for other reasons. 
*/ + if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd)) + return false; + return !userfaultfd_huge_pmd_wp(vma, pmd); +} + +static struct page *follow_huge_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmd, + unsigned int flags, + struct follow_page_context *ctx) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t pmdval = *pmd; + struct page *page; + int ret; + + assert_spin_locked(pmd_lockptr(mm, pmd)); + + page = pmd_page(pmdval); + if ((flags & FOLL_WRITE) && + !can_follow_write_pmd(pmdval, page, vma, flags)) + return NULL; + + /* Avoid dumping huge zero page */ + if ((flags & FOLL_DUMP) && is_huge_zero_pmd(pmdval)) return ERR_PTR(-EFAULT); + + if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags)) + return NULL; + + if (!pmd_write(pmdval) && gup_must_unshare(vma, flags, page)) + return ERR_PTR(-EMLINK); + + VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && + !PageAnonExclusive(page), page); + + ret = try_grab_page(page, flags); + if (ret) + return ERR_PTR(ret); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(pmdval) && (flags & FOLL_TOUCH)) + touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + + page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; + ctx->page_mask = HPAGE_PMD_NR - 1; + + return page; +} + +#else /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */ +static struct page *follow_huge_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp, + int flags, struct follow_page_context *ctx) +{ return NULL; } +static struct page *follow_huge_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmd, + unsigned int flags, + struct follow_page_context *ctx) +{ + return NULL; +} +#endif /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */ + static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, unsigned int flags) { @@ -593,7 +914,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) - return no_page_table(vma, flags); + return no_page_table(vma, flags, address); pte = ptep_get(ptep); if (!pte_present(pte)) goto no_page; @@ -685,7 +1006,7 @@ no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) return NULL; - return no_page_table(vma, flags); + return no_page_table(vma, flags, address); } static struct page *follow_pmd_mask(struct vm_area_struct *vma, @@ -701,42 +1022,45 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, pmd = pmd_offset(pudp, address); pmdval = pmdp_get_lockless(pmd); if (pmd_none(pmdval)) - return no_page_table(vma, flags); + return no_page_table(vma, flags, address); if (!pmd_present(pmdval)) - return no_page_table(vma, flags); + return no_page_table(vma, flags, address); + if (unlikely(is_hugepd(__hugepd(pmd_val(pmdval))))) + return follow_hugepd(vma, __hugepd(pmd_val(pmdval)), + address, PMD_SHIFT, flags, ctx); if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; - return no_page_table(vma, flags); + return no_page_table(vma, flags, address); } - if (likely(!pmd_trans_huge(pmdval))) + if (likely(!pmd_leaf(pmdval))) return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags)) - return no_page_table(vma, flags); + return no_page_table(vma, flags, address); ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_present(*pmd))) { + pmdval = *pmd; + if (unlikely(!pmd_present(pmdval))) { spin_unlock(ptl); - return 
no_page_table(vma, flags); + return no_page_table(vma, flags, address); } - if (unlikely(!pmd_trans_huge(*pmd))) { + if (unlikely(!pmd_leaf(pmdval))) { spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } - if (flags & FOLL_SPLIT_PMD) { + if (pmd_trans_huge(pmdval) && (flags & FOLL_SPLIT_PMD)) { spin_unlock(ptl); split_huge_pmd(vma, pmd, address); /* If pmd was left empty, stuff a page table in there quickly */ return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) : follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } - page = follow_trans_huge_pmd(vma, address, pmd, flags); + page = follow_huge_pmd(vma, address, pmd, flags, ctx); spin_unlock(ptl); - ctx->page_mask = HPAGE_PMD_NR - 1; return page; } @@ -745,26 +1069,30 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, unsigned int flags, struct follow_page_context *ctx) { - pud_t *pud; + pud_t *pudp, pud; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; - pud = pud_offset(p4dp, address); - if (pud_none(*pud)) - return no_page_table(vma, flags); - if (pud_devmap(*pud)) { - ptl = pud_lock(mm, pud); - page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap); + pudp = pud_offset(p4dp, address); + pud = READ_ONCE(*pudp); + if (!pud_present(pud)) + return no_page_table(vma, flags, address); + if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) + return follow_hugepd(vma, __hugepd(pud_val(pud)), + address, PUD_SHIFT, flags, ctx); + if (pud_leaf(pud)) { + ptl = pud_lock(mm, pudp); + page = follow_huge_pud(vma, address, pudp, flags, ctx); spin_unlock(ptl); if (page) return page; - return no_page_table(vma, flags); + return no_page_table(vma, flags, address); } - if (unlikely(pud_bad(*pud))) - return no_page_table(vma, flags); + if (unlikely(pud_bad(pud))) + return no_page_table(vma, flags, address); - return follow_pmd_mask(vma, address, pud, flags, ctx); + return follow_pmd_mask(vma, address, pudp, flags, ctx); } static struct page *follow_p4d_mask(struct vm_area_struct *vma, @@ -772,16 +1100,20 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, unsigned int flags, struct follow_page_context *ctx) { - p4d_t *p4d; + p4d_t *p4dp, p4d; - p4d = p4d_offset(pgdp, address); - if (p4d_none(*p4d)) - return no_page_table(vma, flags); - BUILD_BUG_ON(p4d_huge(*p4d)); - if (unlikely(p4d_bad(*p4d))) - return no_page_table(vma, flags); + p4dp = p4d_offset(pgdp, address); + p4d = READ_ONCE(*p4dp); + BUILD_BUG_ON(p4d_leaf(p4d)); + + if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) + return follow_hugepd(vma, __hugepd(p4d_val(p4d)), + address, P4D_SHIFT, flags, ctx); + + if (!p4d_present(p4d) || p4d_bad(p4d)) + return no_page_table(vma, flags, address); - return follow_pud_mask(vma, address, p4d, flags, ctx); + return follow_pud_mask(vma, address, p4dp, flags, ctx); } /** @@ -814,24 +1146,24 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, { pgd_t *pgd; struct mm_struct *mm = vma->vm_mm; + struct page *page; - ctx->page_mask = 0; - - /* - * Call hugetlb_follow_page_mask for hugetlb vmas as it will use - * special hugetlb page table walking code. This eliminates the - * need to check for hugetlb entries in the general walking code. 
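/*
 * Editor's note -- illustrative sketch, not part of this patch: the reworked
 * follow_*_mask() helpers above snapshot each upper-level entry once (e.g.
 * pud = READ_ONCE(*pudp)) and make every subsequent present/leaf/bad
 * decision on that local copy, so a concurrent update cannot be observed
 * half-way through the checks.  The standalone model below shows that shape
 * for a toy two-level table; all of it is hypothetical.
 */
#include <stdatomic.h>
#include <stdio.h>

#define LEAF	0x1UL			/* low bit marks a leaf entry */

static _Atomic unsigned long level1[4];	/* toy upper level */
static unsigned long level0[4];		/* toy lower level */

static unsigned long walk(unsigned long idx1, unsigned long idx0)
{
	/* snapshot the upper-level entry once, then only use the copy */
	unsigned long entry = atomic_load_explicit(&level1[idx1],
						   memory_order_relaxed);

	if (!entry)
		return 0;			/* "not present" */
	if (entry & LEAF)
		return entry;			/* huge leaf: stop here */
	return ((unsigned long *)entry)[idx0];	/* descend to the next level */
}

int main(void)
{
	level0[2] = 0xabcd;
	atomic_store(&level1[1], (unsigned long)level0);

	printf("walk(1, 2) -> %#lx\n", walk(1, 2));
	return 0;
}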
- */ - if (is_vm_hugetlb_page(vma)) - return hugetlb_follow_page_mask(vma, address, flags, - &ctx->page_mask); + vma_pgtable_walk_begin(vma); + ctx->page_mask = 0; pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - return no_page_table(vma, flags); + if (unlikely(is_hugepd(__hugepd(pgd_val(*pgd))))) + page = follow_hugepd(vma, __hugepd(pgd_val(*pgd)), + address, PGDIR_SHIFT, flags, ctx); + else if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + page = no_page_table(vma, flags, address); + else + page = follow_p4d_mask(vma, address, pgd, flags, ctx); - return follow_p4d_mask(vma, address, pgd, flags, ctx); + vma_pgtable_walk_end(vma); + + return page; } struct page *follow_page(struct vm_area_struct *vma, unsigned long address, @@ -1206,6 +1538,22 @@ static long __get_user_pages(struct mm_struct *mm, /* first iteration or cross vma bound */ if (!vma || start >= vma->vm_end) { + /* + * MADV_POPULATE_(READ|WRITE) wants to handle VMA + * lookups+error reporting differently. + */ + if (gup_flags & FOLL_MADV_POPULATE) { + vma = vma_lookup(mm, start); + if (!vma) { + ret = -ENOMEM; + goto out; + } + if (check_vma_flags(vma, gup_flags)) { + ret = -EINVAL; + goto out; + } + goto retry; + } vma = gup_vma_lookup(mm, start); if (!vma && in_gate_area(mm, start)) { ret = get_gate_page(mm, start & PAGE_MASK, @@ -1653,20 +2001,22 @@ long populate_vma_page_range(struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKONFAULT) return nr_pages; + /* ... similarly, we've never faulted in PROT_NONE pages */ + if (!vma_is_accessible(vma)) + return -EFAULT; + gup_flags = FOLL_TOUCH; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW * and we would not want to dirty them for nothing. + * + * Otherwise, do a read fault, and use FOLL_FORCE in case it's not + * readable (ie write-only or executable). */ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; - - /* - * We want mlock to succeed for regions that have any permissions - * other than PROT_NONE. - */ - if (vma_is_accessible(vma)) + else gup_flags |= FOLL_FORCE; if (locked) @@ -1683,35 +2033,35 @@ long populate_vma_page_range(struct vm_area_struct *vma, } /* - * faultin_vma_page_range() - populate (prefault) page tables inside the - * given VMA range readable/writable + * faultin_page_range() - populate (prefault) page tables inside the + * given range readable/writable * * This takes care of mlocking the pages, too, if VM_LOCKED is set. * - * @vma: target vma + * @mm: the mm to populate page tables in * @start: start address * @end: end address * @write: whether to prefault readable or writable * @locked: whether the mmap_lock is still held * - * Returns either number of processed pages in the vma, or a negative error - * code on error (see __get_user_pages()). + * Returns either number of processed pages in the MM, or a negative error + * code on error (see __get_user_pages()). Note that this function reports + * errors related to VMAs, such as incompatible mappings, as expected by + * MADV_POPULATE_(READ|WRITE). + * + * The range must be page-aligned. * - * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and - * covered by the VMA. If it's released, *@locked will be set to 0. + * mm->mmap_lock must be held. If it's released, *@locked will be set to 0. 
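/*
 * Editor's note -- illustrative sketch, not part of this patch:
 * faultin_page_range() above is what MADV_POPULATE_READ/MADV_POPULATE_WRITE
 * end up calling, and the rework routes the VMA lookup and error reporting
 * the way those advices expect.  A minimal userspace prefault, assuming the
 * madvise(2) advice values available since Linux 5.14, looks like this.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_POPULATE_WRITE
#define MADV_POPULATE_WRITE 23
#endif

int main(void)
{
	size_t len = 16UL << 20;	/* 16MiB */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	/* prefault the whole range so later stores take no page faults */
	if (madvise(p, len, MADV_POPULATE_WRITE)) {
		perror("MADV_POPULATE_WRITE");
		return 1;
	}

	memset(p, 0, len);
	munmap(p, len);
	return 0;
}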
*/ -long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, bool write, int *locked) +long faultin_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool write, int *locked) { - struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; long ret; VM_BUG_ON(!PAGE_ALIGNED(start)); VM_BUG_ON(!PAGE_ALIGNED(end)); - VM_BUG_ON_VMA(start < vma->vm_start, vma); - VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); /* @@ -1723,19 +2073,13 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, * a poisoned page. * !FOLL_FORCE: Require proper access permissions. */ - gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE; + gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE | + FOLL_MADV_POPULATE; if (write) gup_flags |= FOLL_WRITE; - /* - * We want to report -EINVAL instead of -EFAULT for any permission - * problems or incompatible mappings. - */ - if (check_vma_flags(vma, gup_flags)) - return -EINVAL; - - ret = __get_user_pages(mm, start, nr_pages, gup_flags, - NULL, locked); + ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked, + gup_flags); lru_add_drain(); return ret; } @@ -2132,6 +2476,7 @@ static int migrate_longterm_unpinnable_pages( struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_NOWARN, + .reason = MR_LONGTERM_PIN, }; if (migrate_pages(movable_page_list, alloc_migration_target, @@ -2419,7 +2764,7 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, EXPORT_SYMBOL(get_user_pages_unlocked); /* - * Fast GUP + * GUP-fast * * get_user_pages_fast attempts to pin user pages by walking the page * tables directly and avoids taking locks. Thus the walker needs to be @@ -2433,7 +2778,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * * Another way to achieve this is to batch up page table containing pages * belonging to more than one mm_user, then rcu_sched a callback to free those - * pages. Disabling interrupts will allow the fast_gup walker to both block + * pages. Disabling interrupts will allow the gup_fast() walker to both block * the rcu_sched callback, and an IPI that we broadcast for splitting THPs * (which is a relatively rare event). The code below adopts this strategy. * @@ -2451,15 +2796,17 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * * This code is based heavily on the PowerPC implementation by Nick Piggin. */ -#ifdef CONFIG_HAVE_FAST_GUP +#ifdef CONFIG_HAVE_GUP_FAST /* - * Used in the GUP-fast path to determine whether a pin is permitted for a - * specific folio. + * Used in the GUP-fast path to determine whether GUP is permitted to work on + * a specific folio. * * This call assumes the caller has pinned the folio, that the lowest page table * level still points to this folio, and that interrupts have been disabled. * + * GUP-fast must reject all secretmem folios. + * * Writing to pinned file-backed dirty tracked folios is inherently problematic * (see comment describing the writable_file_mapping_allowed() function). We * therefore try to avoid the most egregious case of a long-term mapping doing @@ -2469,25 +2816,34 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * in the fast path, so instead we whitelist known good cases and if in doubt, * fall back to the slow path. 
*/ -static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags) +static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags) { + bool reject_file_backed = false; struct address_space *mapping; + bool check_secretmem = false; unsigned long mapping_flags; /* * If we aren't pinning then no problematic write can occur. A long term * pin is the most egregious case so this is the one we disallow. */ - if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) != + if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) == (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) - return true; + reject_file_backed = true; - /* The folio is pinned, so we can safely access folio fields. */ + /* We hold a folio reference, so we can safely access folio fields. */ + + /* secretmem folios are always order-0 folios. */ + if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio)) + check_secretmem = true; + + if (!reject_file_backed && !check_secretmem) + return true; if (WARN_ON_ONCE(folio_test_slab(folio))) return false; - /* hugetlb mappings do not require dirty-tracking. */ + /* hugetlb neither requires dirty-tracking nor can be secretmem. */ if (folio_test_hugetlb(folio)) return true; @@ -2523,50 +2879,48 @@ static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags) /* * At this point, we know the mapping is non-null and points to an - * address_space object. The only remaining whitelisted file system is - * shmem. + * address_space object. */ - return shmem_mapping(mapping); + if (check_secretmem && secretmem_mapping(mapping)) + return false; + /* The only remaining allowed file system is shmem. */ + return !reject_file_backed || shmem_mapping(mapping); } -static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, - unsigned int flags, - struct page **pages) +static void __maybe_unused gup_fast_undo_dev_pagemap(int *nr, int nr_start, + unsigned int flags, struct page **pages) { while ((*nr) - nr_start) { - struct page *page = pages[--(*nr)]; + struct folio *folio = page_folio(pages[--(*nr)]); - ClearPageReferenced(page); - if (flags & FOLL_PIN) - unpin_user_page(page); - else - put_page(page); + folio_clear_referenced(folio); + gup_put_folio(folio, 1, flags); } } #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL /* - * Fast-gup relies on pte change detection to avoid concurrent pgtable + * GUP-fast relies on pte change detection to avoid concurrent pgtable * operations. * - * To pin the page, fast-gup needs to do below in order: + * To pin the page, GUP-fast needs to do below in order: * (1) pin the page (by prefetching pte), then (2) check pte not changed. * * For the rest of pgtable operations where pgtable updates can be racy - * with fast-gup, we need to do (1) clear pte, then (2) check whether page + * with GUP-fast, we need to do (1) clear pte, then (2) check whether page * is pinned. * * Above will work for all pte-level operations, including THP split. * - * For THP collapse, it's a bit more complicated because fast-gup may be + * For THP collapse, it's a bit more complicated because GUP-fast may be * walking a pgtable page that is being freed (pte is still valid but pmd * can be cleared already). To avoid race in such condition, we need to * also check pmd here to make sure pmd doesn't change (corresponds to * pmdp_collapse_flush() in the THP collapse code path). 
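For readers following the GUP-fast renames: the interface drivers actually call is pin_user_pages_fast()/unpin_user_pages(), and the helpers below are its lockless walker. A minimal caller-side sketch is shown here purely for orientation; the helper name is made up, and it is not code from this patch.

/*
 * Illustrative only: the typical caller pattern served by the GUP-fast
 * machinery. pin_user_pages_fast() may fall back to the slow path
 * internally; the caller only sees how many pages were pinned.
 */
#include <linux/mm.h>
#include <linux/slab.h>

static int demo_pin_user_buffer(unsigned long uaddr, int nr_pages)
{
	struct page **pages;
	int pinned;

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/* FOLL_LONGTERM: the pin may outlive the syscall, e.g. for RDMA. */
	pinned = pin_user_pages_fast(uaddr, nr_pages,
				     FOLL_WRITE | FOLL_LONGTERM, pages);
	if (pinned > 0) {
		/* ... set up DMA against pages[0..pinned-1] ... */
		unpin_user_pages(pages, pinned);
	}

	kfree(pages);
	return pinned < 0 ? pinned : 0;
}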
*/ -static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { struct dev_pagemap *pgmap = NULL; int nr_start = *nr, ret = 0; @@ -2599,7 +2953,7 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); if (unlikely(!pgmap)) { - undo_dev_pagemap(nr, nr_start, flags, pages); + gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); goto pte_unmap; } } else if (pte_special(pte)) @@ -2612,18 +2966,13 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, if (!folio) goto pte_unmap; - if (unlikely(folio_is_secretmem(folio))) { - gup_put_folio(folio, 1, flags); - goto pte_unmap; - } - if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } - if (!folio_fast_pin_allowed(folio, flags)) { + if (!gup_fast_folio_allowed(folio, flags)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } @@ -2668,44 +3017,45 @@ pte_unmap: * * For a futex to be placed on a THP tail page, get_futex_key requires a * get_user_pages_fast_only implementation that can pin pages. Thus it's still - * useful to have gup_huge_pmd even if we can't operate on ptes. + * useful to have gup_fast_pmd_leaf even if we can't operate on ptes. */ -static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { return 0; } #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -static int __gup_device_huge(unsigned long pfn, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, int *nr) { int nr_start = *nr; struct dev_pagemap *pgmap = NULL; do { + struct folio *folio; struct page *page = pfn_to_page(pfn); pgmap = get_dev_pagemap(pfn, pgmap); if (unlikely(!pgmap)) { - undo_dev_pagemap(nr, nr_start, flags, pages); + gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); break; } if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { - undo_dev_pagemap(nr, nr_start, flags, pages); + gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); break; } - SetPageReferenced(page); - pages[*nr] = page; - if (unlikely(try_grab_page(page, flags))) { - undo_dev_pagemap(nr, nr_start, flags, pages); + folio = try_grab_folio(page, 1, flags); + if (!folio) { + gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); break; } + folio_set_referenced(folio); + pages[*nr] = page; (*nr)++; pfn++; } while (addr += PAGE_SIZE, addr != end); @@ -2714,156 +3064,62 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, return addr == end; } -static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { unsigned long fault_pfn; int nr_start = *nr; 
fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) + if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { - undo_dev_pagemap(nr, nr_start, flags, pages); + gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } -static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_devmap_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { unsigned long fault_pfn; int nr_start = *nr; fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) + if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { - undo_dev_pagemap(nr, nr_start, flags, pages); + gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } #else -static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { BUILD_BUG(); return 0; } -static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_devmap_pud_leaf(pud_t pud, pud_t *pudp, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { BUILD_BUG(); return 0; } #endif -static int record_subpages(struct page *page, unsigned long addr, - unsigned long end, struct page **pages) -{ - int nr; - - for (nr = 0; addr != end; nr++, addr += PAGE_SIZE) - pages[nr] = nth_page(page, nr); - - return nr; -} - -#ifdef CONFIG_ARCH_HAS_HUGEPD -static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, - unsigned long sz) -{ - unsigned long __boundary = (addr + sz) & ~(sz-1); - return (__boundary - 1 < end - 1) ? 
__boundary : end; -} - -static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) -{ - unsigned long pte_end; - struct page *page; - struct folio *folio; - pte_t pte; - int refs; - - pte_end = (addr + sz) & ~(sz-1); - if (pte_end < end) - end = pte_end; - - pte = huge_ptep_get(ptep); - - if (!pte_access_permitted(pte, flags & FOLL_WRITE)) - return 0; - - /* hugepages are never "special" */ - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - - page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT); - refs = record_subpages(page, addr, end, pages + *nr); - - folio = try_grab_folio(page, refs, flags); - if (!folio) - return 0; - - if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { - gup_put_folio(folio, refs, flags); - return 0; - } - - if (!folio_fast_pin_allowed(folio, flags)) { - gup_put_folio(folio, refs, flags); - return 0; - } - - if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) { - gup_put_folio(folio, refs, flags); - return 0; - } - - *nr += refs; - folio_set_referenced(folio); - return 1; -} - -static int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned int pdshift, unsigned long end, unsigned int flags, - struct page **pages, int *nr) -{ - pte_t *ptep; - unsigned long sz = 1UL << hugepd_shift(hugepd); - unsigned long next; - - ptep = hugepte_offset(hugepd, addr, pdshift); - do { - next = hugepte_addr_end(addr, end, sz); - if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr)) - return 0; - } while (ptep++, addr = next, addr != end); - - return 1; -} -#else -static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned int pdshift, unsigned long end, unsigned int flags, - struct page **pages, int *nr) -{ - return 0; -} -#endif /* CONFIG_ARCH_HAS_HUGEPD */ - -static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { struct page *page; struct folio *folio; @@ -2875,12 +3131,12 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, if (pmd_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; - return __gup_device_huge_pmd(orig, pmdp, addr, end, flags, - pages, nr); + return gup_fast_devmap_pmd_leaf(orig, pmdp, addr, end, flags, + pages, nr); } - page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT); - refs = record_subpages(page, addr, end, pages + *nr); + page = pmd_page(orig); + refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr); folio = try_grab_folio(page, refs, flags); if (!folio) @@ -2891,7 +3147,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } - if (!folio_fast_pin_allowed(folio, flags)) { + if (!gup_fast_folio_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } @@ -2905,9 +3161,9 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 1; } -static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { struct page *page; struct folio *folio; @@ -2919,12 +3175,12 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, if (pud_devmap(orig)) { 
if (unlikely(flags & FOLL_LONGTERM)) return 0; - return __gup_device_huge_pud(orig, pudp, addr, end, flags, - pages, nr); + return gup_fast_devmap_pud_leaf(orig, pudp, addr, end, flags, + pages, nr); } - page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT); - refs = record_subpages(page, addr, end, pages + *nr); + page = pud_page(orig); + refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr); folio = try_grab_folio(page, refs, flags); if (!folio) @@ -2935,7 +3191,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } - if (!folio_fast_pin_allowed(folio, flags)) { + if (!gup_fast_folio_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } @@ -2950,9 +3206,9 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 1; } -static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { int refs; struct page *page; @@ -2963,8 +3219,8 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, BUILD_BUG_ON(pgd_devmap(orig)); - page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT); - refs = record_subpages(page, addr, end, pages + *nr); + page = pgd_page(orig); + refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr); folio = try_grab_folio(page, refs, flags); if (!folio) @@ -2980,7 +3236,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 0; } - if (!folio_fast_pin_allowed(folio, flags)) { + if (!gup_fast_folio_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } @@ -2990,8 +3246,9 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 1; } -static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end, - unsigned int flags, struct page **pages, int *nr) +static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { unsigned long next; pmd_t *pmdp; @@ -3004,13 +3261,12 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo if (!pmd_present(pmd)) return 0; - if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) || - pmd_devmap(pmd))) { - /* See gup_pte_range() */ + if (unlikely(pmd_leaf(pmd))) { + /* See gup_fast_pte_range() */ if (pmd_protnone(pmd)) return 0; - if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, + if (!gup_fast_pmd_leaf(pmd, pmdp, addr, next, flags, pages, nr)) return 0; @@ -3019,18 +3275,20 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo * architecture have different format for hugetlbfs * pmd format and THP pmd format */ - if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, - PMD_SHIFT, next, flags, pages, nr)) + if (gup_hugepd(NULL, __hugepd(pmd_val(pmd)), addr, + PMD_SHIFT, next, flags, pages, nr) != 1) return 0; - } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr)) + } else if (!gup_fast_pte_range(pmd, pmdp, addr, next, flags, + pages, nr)) return 0; } while (pmdp++, addr = next, addr != end); return 1; } -static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end, - unsigned int flags, struct page **pages, int *nr) +static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, 
+ int *nr) { unsigned long next; pud_t *pudp; @@ -3042,23 +3300,25 @@ static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned lo next = pud_addr_end(addr, end); if (unlikely(!pud_present(pud))) return 0; - if (unlikely(pud_huge(pud) || pud_devmap(pud))) { - if (!gup_huge_pud(pud, pudp, addr, next, flags, - pages, nr)) + if (unlikely(pud_leaf(pud))) { + if (!gup_fast_pud_leaf(pud, pudp, addr, next, flags, + pages, nr)) return 0; } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { - if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, - PUD_SHIFT, next, flags, pages, nr)) + if (gup_hugepd(NULL, __hugepd(pud_val(pud)), addr, + PUD_SHIFT, next, flags, pages, nr) != 1) return 0; - } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr)) + } else if (!gup_fast_pmd_range(pudp, pud, addr, next, flags, + pages, nr)) return 0; } while (pudp++, addr = next, addr != end); return 1; } -static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end, - unsigned int flags, struct page **pages, int *nr) +static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { unsigned long next; p4d_t *p4dp; @@ -3068,21 +3328,22 @@ static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned lo p4d_t p4d = READ_ONCE(*p4dp); next = p4d_addr_end(addr, end); - if (p4d_none(p4d)) + if (!p4d_present(p4d)) return 0; - BUILD_BUG_ON(p4d_huge(p4d)); + BUILD_BUG_ON(p4d_leaf(p4d)); if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) { - if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, - P4D_SHIFT, next, flags, pages, nr)) + if (gup_hugepd(NULL, __hugepd(p4d_val(p4d)), addr, + P4D_SHIFT, next, flags, pages, nr) != 1) return 0; - } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr)) + } else if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags, + pages, nr)) return 0; } while (p4dp++, addr = next, addr != end); return 1; } -static void gup_pgd_range(unsigned long addr, unsigned long end, +static void gup_fast_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; @@ -3095,24 +3356,25 @@ static void gup_pgd_range(unsigned long addr, unsigned long end, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) return; - if (unlikely(pgd_huge(pgd))) { - if (!gup_huge_pgd(pgd, pgdp, addr, next, flags, - pages, nr)) + if (unlikely(pgd_leaf(pgd))) { + if (!gup_fast_pgd_leaf(pgd, pgdp, addr, next, flags, + pages, nr)) return; } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { - if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, - PGDIR_SHIFT, next, flags, pages, nr)) + if (gup_hugepd(NULL, __hugepd(pgd_val(pgd)), addr, + PGDIR_SHIFT, next, flags, pages, nr) != 1) return; - } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr)) + } else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags, + pages, nr)) return; } while (pgdp++, addr = next, addr != end); } #else -static inline void gup_pgd_range(unsigned long addr, unsigned long end, +static inline void gup_fast_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { } -#endif /* CONFIG_HAVE_FAST_GUP */ +#endif /* CONFIG_HAVE_GUP_FAST */ #ifndef gup_fast_permitted /* @@ -3125,16 +3387,14 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end) } #endif -static unsigned long lockless_pages_from_mm(unsigned long start, - unsigned long end, - unsigned int gup_flags, - struct page **pages) 
+static unsigned long gup_fast(unsigned long start, unsigned long end, + unsigned int gup_flags, struct page **pages) { unsigned long flags; int nr_pinned = 0; unsigned seq; - if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) || + if (!IS_ENABLED(CONFIG_HAVE_GUP_FAST) || !gup_fast_permitted(start, end)) return 0; @@ -3156,16 +3416,16 @@ static unsigned long lockless_pages_from_mm(unsigned long start, * that come from THPs splitting. */ local_irq_save(flags); - gup_pgd_range(start, end, gup_flags, pages, &nr_pinned); + gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned); local_irq_restore(flags); /* * When pinning pages for DMA there could be a concurrent write protect - * from fork() via copy_page_range(), in this case always fail fast GUP. + * from fork() via copy_page_range(), in this case always fail GUP-fast. */ if (gup_flags & FOLL_PIN) { if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) { - unpin_user_pages_lockless(pages, nr_pinned); + gup_fast_unpin_user_pages(pages, nr_pinned); return 0; } else { sanity_check_pinned_pages(pages, nr_pinned); @@ -3174,10 +3434,8 @@ static unsigned long lockless_pages_from_mm(unsigned long start, return nr_pinned; } -static int internal_get_user_pages_fast(unsigned long start, - unsigned long nr_pages, - unsigned int gup_flags, - struct page **pages) +static int gup_fast_fallback(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages) { unsigned long len, end; unsigned long nr_pinned; @@ -3205,7 +3463,7 @@ static int internal_get_user_pages_fast(unsigned long start, if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; - nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages); + nr_pinned = gup_fast(start, end, gup_flags, pages); if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY) return nr_pinned; @@ -3259,7 +3517,7 @@ int get_user_pages_fast_only(unsigned long start, int nr_pages, FOLL_GET | FOLL_FAST_ONLY)) return -EINVAL; - return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); + return gup_fast_fallback(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast_only); @@ -3290,7 +3548,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, */ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET)) return -EINVAL; - return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); + return gup_fast_fallback(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast); @@ -3318,7 +3576,7 @@ int pin_user_pages_fast(unsigned long start, int nr_pages, { if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return -EINVAL; - return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); + return gup_fast_fallback(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(pin_user_pages_fast); @@ -424,22 +424,17 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, walk->action = ACTION_CONTINUE; pud = READ_ONCE(*pudp); - if (pud_none(pud)) { + if (!pud_present(pud)) { spin_unlock(ptl); return hmm_vma_walk_hole(start, end, -1, walk); } - if (pud_huge(pud) && pud_devmap(pud)) { + if (pud_leaf(pud) && pud_devmap(pud)) { unsigned long i, npages, pfn; unsigned int required_fault; unsigned long *hmm_pfns; unsigned long cpu_flags; - if (!pud_present(pud)) { - spin_unlock(ptl); - return hmm_vma_walk_hole(start, end, -1, walk); - } - i = (addr - range->start) >> PAGE_SHIFT; npages = (end - addr) >> PAGE_SHIFT; hmm_pfns = &range->hmm_pfns[i]; diff --git a/mm/huge_memory.c
b/mm/huge_memory.c index 94c958f7ebb5..317de2afd371 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -38,6 +38,7 @@ #include <linux/sched/sysctl.h> #include <linux/memory-tiers.h> #include <linux/compat.h> +#include <linux/pgalloc_tag.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -73,17 +74,20 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc); static atomic_t huge_zero_refcount; -struct page *huge_zero_page __read_mostly; +struct folio *huge_zero_folio __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; unsigned long huge_anon_orders_always __read_mostly; unsigned long huge_anon_orders_madvise __read_mostly; unsigned long huge_anon_orders_inherit __read_mostly; unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, bool smaps, - bool in_pf, bool enforce_sysfs, + unsigned long vm_flags, + unsigned long tva_flags, unsigned long orders) { + bool smaps = tva_flags & TVA_SMAPS; + bool in_pf = tva_flags & TVA_IN_PF; + bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS; /* Check the intersection of requested and supported orders. */ orders &= vma_is_anonymous(vma) ? THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE; @@ -191,24 +195,24 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, static bool get_huge_zero_page(void) { - struct page *zero_page; + struct folio *zero_folio; retry: if (likely(atomic_inc_not_zero(&huge_zero_refcount))) return true; - zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, + zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, HPAGE_PMD_ORDER); - if (!zero_page) { + if (!zero_folio) { count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); return false; } preempt_disable(); - if (cmpxchg(&huge_zero_page, NULL, zero_page)) { + if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) { preempt_enable(); - __free_pages(zero_page, compound_order(zero_page)); + folio_put(zero_folio); goto retry; } - WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page)); + WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio)); /* We take additional reference here. 
It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); @@ -226,10 +230,10 @@ static void put_huge_zero_page(void) BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); } -struct page *mm_get_huge_zero_page(struct mm_struct *mm) +struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) { if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) - return READ_ONCE(huge_zero_page); + return READ_ONCE(huge_zero_folio); if (!get_huge_zero_page()) return NULL; @@ -237,10 +241,10 @@ struct page *mm_get_huge_zero_page(struct mm_struct *mm) if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) put_huge_zero_page(); - return READ_ONCE(huge_zero_page); + return READ_ONCE(huge_zero_folio); } -void mm_put_huge_zero_page(struct mm_struct *mm) +void mm_put_huge_zero_folio(struct mm_struct *mm) { if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) put_huge_zero_page(); @@ -257,10 +261,10 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, struct shrink_control *sc) { if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { - struct page *zero_page = xchg(&huge_zero_page, NULL); - BUG_ON(zero_page == NULL); + struct folio *zero_folio = xchg(&huge_zero_folio, NULL); + BUG_ON(zero_folio == NULL); WRITE_ONCE(huge_zero_pfn, ~0UL); - __free_pages(zero_page, compound_order(zero_page)); + folio_put(zero_folio); return HPAGE_PMD_NR; } @@ -525,6 +529,52 @@ static const struct kobj_type thpsize_ktype = { .sysfs_ops = &kobj_sysfs_ops, }; +DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}}; + +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item) +{ + unsigned long sum = 0; + int cpu; + + for_each_possible_cpu(cpu) { + struct mthp_stat *this = &per_cpu(mthp_stats, cpu); + + sum += this->stats[order][item]; + } + + return sum; +} + +#define DEFINE_MTHP_STAT_ATTR(_name, _index) \ +static ssize_t _name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ +{ \ + int order = to_thpsize(kobj)->order; \ + \ + return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \ +} \ +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); +DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT); +DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK); + +static struct attribute *stats_attrs[] = { + &anon_fault_alloc_attr.attr, + &anon_fault_fallback_attr.attr, + &anon_fault_fallback_charge_attr.attr, + &anon_swpout_attr.attr, + &anon_swpout_fallback_attr.attr, + NULL, +}; + +static struct attribute_group stats_attr_group = { + .name = "stats", + .attrs = stats_attrs, +}; + static struct thpsize *thpsize_create(int order, struct kobject *parent) { unsigned long size = (PAGE_SIZE << order) / SZ_1K; @@ -548,6 +598,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent) return ERR_PTR(ret); } + ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group); + if (ret) { + kobject_put(&thpsize->kobj); + return ERR_PTR(ret); + } + thpsize->order = order; return thpsize; } @@ -684,11 +740,6 @@ static int __init hugepage_init(void) * hugepages can't be allocated by the buddy allocator */ MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER); - /* - * we use page->mapping and page->index in second tail page - * as list_head: assuming THP order >= 2 - */ - MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); err 
= hugepage_init_sysfs(&hugepage_kobj); if (err) @@ -788,25 +839,19 @@ struct deferred_split *get_deferred_split_queue(struct folio *folio) } #endif -void folio_prep_large_rmappable(struct folio *folio) -{ - VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); - INIT_LIST_HEAD(&folio->_deferred_list); - folio_set_large_rmappable(folio); -} - -static inline bool is_transparent_hugepage(struct folio *folio) +static inline bool is_transparent_hugepage(const struct folio *folio) { if (!folio_test_large(folio)) return false; - return is_huge_zero_page(&folio->page) || + return is_huge_zero_folio(folio) || folio_test_large_rmappable(folio); } static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, - loff_t off, unsigned long flags, unsigned long size) + loff_t off, unsigned long flags, unsigned long size, + vm_flags_t vm_flags) { loff_t off_end = off + len; loff_t off_align = round_up(off, size); @@ -822,8 +867,8 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, if (len_pad < len || (off + len_pad) < off) return 0; - ret = current->mm->get_unmapped_area(filp, addr, len_pad, - off >> PAGE_SHIFT, flags); + ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad, + off >> PAGE_SHIFT, flags, vm_flags); /* * The failure might be due to length padding. The caller will retry @@ -841,25 +886,32 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, off_sub = (off - ret) & (size - 1); - if (current->mm->get_unmapped_area == arch_get_unmapped_area_topdown && - !off_sub) + if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub) return ret + size; ret += off_sub; return ret; } -unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) +unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags, + vm_flags_t vm_flags) { unsigned long ret; loff_t off = (loff_t)pgoff << PAGE_SHIFT; - ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE); + ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags); if (ret) return ret; - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); + return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags, + vm_flags); +} + +unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0); } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); @@ -878,6 +930,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, folio_put(folio); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); return VM_FAULT_FALLBACK; } folio_throttle_swaprate(folio, gfp); @@ -927,6 +981,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } @@ -977,14 +1032,14 @@ gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma) } /* Caller must hold page table lock.
*/ -static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, +static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, - struct page *zero_page) + struct folio *zero_folio) { pmd_t entry; if (!pmd_none(*pmd)) return; - entry = mk_pmd(zero_page, vma->vm_page_prot); + entry = mk_pmd(&zero_folio->page, vma->vm_page_prot); entry = pmd_mkhuge(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); @@ -997,24 +1052,27 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) gfp_t gfp; struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + vm_fault_t ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return VM_FAULT_FALLBACK; - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; + ret = vmf_anon_prepare(vmf); + if (ret) + return ret; khugepaged_enter_vma(vma, vma->vm_flags); if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { pgtable_t pgtable; - struct page *zero_page; + struct folio *zero_folio; vm_fault_t ret; + pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) return VM_FAULT_OOM; - zero_page = mm_get_huge_zero_page(vma->vm_mm); - if (unlikely(!zero_page)) { + zero_folio = mm_get_huge_zero_folio(vma->vm_mm); + if (unlikely(!zero_folio)) { pte_free(vma->vm_mm, pgtable); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1032,8 +1090,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); } else { - set_huge_zero_page(pgtable, vma->vm_mm, vma, - haddr, vmf->pmd, zero_page); + set_huge_zero_folio(pgtable, vma->vm_mm, vma, + haddr, vmf->pmd, zero_folio); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); } @@ -1047,6 +1105,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); if (unlikely(!folio)) { count_vm_event(THP_FAULT_FALLBACK); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); @@ -1226,8 +1285,8 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, bool write) +void touch_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, bool write) { pmd_t _pmd; @@ -1342,11 +1401,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ if (is_huge_zero_pmd(pmd)) { /* - * get_huge_zero_page() will never allocate a new page here, - * since we already have a zero page to copy. It just takes a - * reference. + * mm_get_huge_zero_folio() will never allocate a new + * folio here, since we already have a zero page to + * copy. It just takes a reference. 
*/ - mm_get_huge_zero_page(dst_mm); + mm_get_huge_zero_folio(dst_mm); goto out_zero_page; } @@ -1383,8 +1442,8 @@ out: } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static void touch_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, bool write) +void touch_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, bool write) { pud_t _pud; @@ -1396,49 +1455,6 @@ static void touch_pud(struct vm_area_struct *vma, unsigned long addr, update_mmu_cache_pud(vma, addr, pud); } -struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, int flags, struct dev_pagemap **pgmap) -{ - unsigned long pfn = pud_pfn(*pud); - struct mm_struct *mm = vma->vm_mm; - struct page *page; - int ret; - - assert_spin_locked(pud_lockptr(mm, pud)); - - if (flags & FOLL_WRITE && !pud_write(*pud)) - return NULL; - - if (pud_present(*pud) && pud_devmap(*pud)) - /* pass */; - else - return NULL; - - if (flags & FOLL_TOUCH) - touch_pud(vma, addr, pud, flags & FOLL_WRITE); - - /* - * device mapped pages can only be returned if the - * caller will manage the page reference count. - * - * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here: - */ - if (!(flags & (FOLL_GET | FOLL_PIN))) - return ERR_PTR(-EEXIST); - - pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; - *pgmap = get_dev_pagemap(pfn, *pgmap); - if (!*pgmap) - return ERR_PTR(-EFAULT); - page = pfn_to_page(pfn); - - ret = try_grab_page(page, flags); - if (ret) - page = ERR_PTR(ret); - - return page; -} - int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, struct vm_area_struct *vma) @@ -1625,88 +1641,6 @@ static inline bool can_change_pmd_writable(struct vm_area_struct *vma, return pmd_dirty(pmd); } -/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */ -static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, - struct vm_area_struct *vma, - unsigned int flags) -{ - /* If the pmd is writable, we can write to the page. */ - if (pmd_write(pmd)) - return true; - - /* Maybe FOLL_FORCE is set to override it? */ - if (!(flags & FOLL_FORCE)) - return false; - - /* But FOLL_FORCE has no effect on shared mappings */ - if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) - return false; - - /* ... or read-only private ones */ - if (!(vma->vm_flags & VM_MAYWRITE)) - return false; - - /* ... or already writable ones that just need to take a write fault */ - if (vma->vm_flags & VM_WRITE) - return false; - - /* - * See can_change_pte_writable(): we broke COW and could map the page - * writable if we have an exclusive anonymous page ... - */ - if (!page || !PageAnon(page) || !PageAnonExclusive(page)) - return false; - - /* ... and a write-fault isn't required for other reasons. 
*/ - if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd)) - return false; - return !userfaultfd_huge_pmd_wp(vma, pmd); -} - -struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, - unsigned long addr, - pmd_t *pmd, - unsigned int flags) -{ - struct mm_struct *mm = vma->vm_mm; - struct page *page; - int ret; - - assert_spin_locked(pmd_lockptr(mm, pmd)); - - page = pmd_page(*pmd); - VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); - - if ((flags & FOLL_WRITE) && - !can_follow_write_pmd(*pmd, page, vma, flags)) - return NULL; - - /* Avoid dumping huge zero page */ - if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) - return ERR_PTR(-EFAULT); - - if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags)) - return NULL; - - if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page)) - return ERR_PTR(-EMLINK); - - VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && - !PageAnonExclusive(page), page); - - ret = try_grab_page(page, flags); - if (ret) - return ERR_PTR(ret); - - if (flags & FOLL_TOUCH) - touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); - - page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; - VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); - - return page; -} - /* NUMA hinting page fault entry point for trans huge pmds */ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) { @@ -1752,7 +1686,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) */ if (node_is_toptier(nid)) last_cpupid = folio_last_cpupid(folio); - target_nid = numa_migrate_prep(folio, vma, haddr, nid, &flags); + target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags); if (target_nid == NUMA_NO_NODE) { folio_put(folio); goto out_map; @@ -1822,12 +1756,12 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, goto out; } - folio = pfn_folio(pmd_pfn(orig_pmd)); + folio = pmd_folio(orig_pmd); /* * If other processes are mapping this folio, we couldn't discard * the folio unless they all do MADV_FREE so let's skip the folio. 
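For orientation only (not part of the patch): madvise_free_huge_pmd() above is the PMD-mapped THP leg of MADV_FREE. A minimal userspace sketch of the trigger follows; the fallback #define is an assumption for older libc headers and matches the uapi value.

/*
 * Illustrative sketch: mark an anonymous range as disposable with MADV_FREE.
 * The kernel may discard the clean pages lazily under memory pressure; a
 * later write to the range keeps the data.
 */
#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_FREE
#define MADV_FREE	8
#endif

int main(void)
{
	size_t len = 4UL << 20;	/* 4 MiB, room for a couple of PMD-sized THPs */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	buf[0] = 1;	/* touch the start so at least one THP can be faulted in */

	/* Tell the kernel the contents are disposable. */
	if (madvise(buf, len, MADV_FREE))
		perror("madvise(MADV_FREE)");

	munmap(buf, len);
	return 0;
}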
*/ - if (folio_estimated_sharers(folio) != 1) + if (folio_likely_mapped_shared(folio)) goto out; if (!folio_trylock(folio)) @@ -1905,36 +1839,39 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); } else { - struct page *page = NULL; + struct folio *folio = NULL; int flush_needed = 1; if (pmd_present(orig_pmd)) { - page = pmd_page(orig_pmd); - folio_remove_rmap_pmd(page_folio(page), page, vma); - VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); + struct page *page = pmd_page(orig_pmd); + + folio = page_folio(page); + folio_remove_rmap_pmd(folio, page, vma); + WARN_ON_ONCE(folio_mapcount(folio) < 0); VM_BUG_ON_PAGE(!PageHead(page), page); } else if (thp_migration_supported()) { swp_entry_t entry; VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); entry = pmd_to_swp_entry(orig_pmd); - page = pfn_swap_entry_to_page(entry); + folio = pfn_swap_entry_folio(entry); flush_needed = 0; } else WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); - if (PageAnon(page)) { + if (folio_test_anon(folio)) { zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); - add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR); + add_mm_counter(tlb->mm, mm_counter_file(folio), + -HPAGE_PMD_NR); } spin_unlock(ptl); if (flush_needed) - tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); + tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); } return 1; } @@ -2045,7 +1982,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION if (is_swap_pmd(*pmd)) { swp_entry_t entry = pmd_to_swp_entry(*pmd); - struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); + struct folio *folio = pfn_swap_entry_folio(entry); pmd_t newpmd; VM_BUG_ON(!is_pmd_migration_entry(*pmd)); @@ -2089,7 +2026,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (pmd_protnone(*pmd)) goto unlock; - folio = page_folio(pmd_page(*pmd)); + folio = pmd_folio(*pmd); toptier = node_is_toptier(folio_nid(folio)); /* * Skip scanning top tier node if normal numa @@ -2155,7 +2092,7 @@ unlock: #ifdef CONFIG_USERFAULTFD /* - * The PT lock for src_pmd and the mmap_lock for reading are held by + * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by * the caller, but it must return after releasing the page_table_lock. * Just move the page from src_pmd to dst_pmd if possible. 
* Return zero if succeeded in moving the page, -EAGAIN if it needs to be @@ -2178,7 +2115,8 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm src_ptl = pmd_lockptr(mm, src_pmd); lockdep_assert_held(src_ptl); - mmap_assert_locked(mm); + vma_assert_locked(src_vma); + vma_assert_locked(dst_vma); /* Sanity checks before the operation */ if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) || @@ -2197,13 +2135,18 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm } src_page = pmd_page(src_pmdval); - if (unlikely(!PageAnonExclusive(src_page))) { - spin_unlock(src_ptl); - return -EBUSY; - } - src_folio = page_folio(src_page); - folio_get(src_folio); + if (!is_huge_zero_pmd(src_pmdval)) { + if (unlikely(!PageAnonExclusive(src_page))) { + spin_unlock(src_ptl); + return -EBUSY; + } + + src_folio = page_folio(src_page); + folio_get(src_folio); + } else + src_folio = NULL; + spin_unlock(src_ptl); flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE); @@ -2211,19 +2154,22 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm src_addr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); - folio_lock(src_folio); + if (src_folio) { + folio_lock(src_folio); - /* - * split_huge_page walks the anon_vma chain without the page - * lock. Serialize against it with the anon_vma lock, the page - * lock is not enough. - */ - src_anon_vma = folio_get_anon_vma(src_folio); - if (!src_anon_vma) { - err = -EAGAIN; - goto unlock_folio; - } - anon_vma_lock_write(src_anon_vma); + /* + * split_huge_page walks the anon_vma chain without the page + * lock. Serialize against it with the anon_vma lock, the page + * lock is not enough. + */ + src_anon_vma = folio_get_anon_vma(src_folio); + if (!src_anon_vma) { + err = -EAGAIN; + goto unlock_folio; + } + anon_vma_lock_write(src_anon_vma); + } else + src_anon_vma = NULL; dst_ptl = pmd_lockptr(mm, dst_pmd); double_pt_lock(src_ptl, dst_ptl); @@ -2232,45 +2178,54 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm err = -EAGAIN; goto unlock_ptls; } - if (folio_maybe_dma_pinned(src_folio) || - !PageAnonExclusive(&src_folio->page)) { - err = -EBUSY; - goto unlock_ptls; - } + if (src_folio) { + if (folio_maybe_dma_pinned(src_folio) || + !PageAnonExclusive(&src_folio->page)) { + err = -EBUSY; + goto unlock_ptls; + } - if (WARN_ON_ONCE(!folio_test_head(src_folio)) || - WARN_ON_ONCE(!folio_test_anon(src_folio))) { - err = -EBUSY; - goto unlock_ptls; - } + if (WARN_ON_ONCE(!folio_test_head(src_folio)) || + WARN_ON_ONCE(!folio_test_anon(src_folio))) { + err = -EBUSY; + goto unlock_ptls; + } - folio_move_anon_rmap(src_folio, dst_vma); - WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); + src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); + /* Folio got pinned from under us. Put it back and fail the move. */ + if (folio_maybe_dma_pinned(src_folio)) { + set_pmd_at(mm, src_addr, src_pmd, src_pmdval); + err = -EBUSY; + goto unlock_ptls; + } - src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); - /* Folio got pinned from under us. Put it back and fail the move. 
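move_pages_huge_pmd() is the PMD-sized case behind userfaultfd's UFFDIO_MOVE, and the change above teaches it to handle the huge zero PMD without a source folio. A rough userspace sketch of issuing that ioctl is shown here as an illustration only; it assumes a userfaultfd descriptor already registered over both ranges and headers recent enough to define struct uffdio_move.

/*
 * Illustrative only: move one PMD-aligned range to another via UFFDIO_MOVE.
 * Assumes `uffd` came from userfaultfd(O_CLOEXEC) and both ranges are
 * registered; struct/field names follow recent <linux/userfaultfd.h>.
 */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

static int demo_uffdio_move(int uffd, unsigned long src, unsigned long dst,
			    unsigned long len)
{
	struct uffdio_move mv;

	memset(&mv, 0, sizeof(mv));
	mv.src = src;
	mv.dst = dst;
	mv.len = len;	/* e.g. 2 MiB, PMD-aligned, to hit the huge PMD path */
	mv.mode = 0;

	if (ioctl(uffd, UFFDIO_MOVE, &mv)) {
		perror("ioctl(UFFDIO_MOVE)");
		return -1;
	}
	return 0;
}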
*/ - if (folio_maybe_dma_pinned(src_folio)) { - set_pmd_at(mm, src_addr, src_pmd, src_pmdval); - err = -EBUSY; - goto unlock_ptls; - } + folio_move_anon_rmap(src_folio, dst_vma); + src_folio->index = linear_page_index(dst_vma, dst_addr); - _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); - /* Follow mremap() behavior and treat the entry dirty after the move */ - _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); + _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); + /* Follow mremap() behavior and treat the entry dirty after the move */ + _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); + } else { + src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); + _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot); + } set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd); src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd); pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable); unlock_ptls: double_pt_unlock(src_ptl, dst_ptl); - anon_vma_unlock_write(src_anon_vma); - put_anon_vma(src_anon_vma); + if (src_anon_vma) { + anon_vma_unlock_write(src_anon_vma); + put_anon_vma(src_anon_vma); + } unlock_folio: /* unblock rmap walks */ - folio_unlock(src_folio); + if (src_folio) + folio_unlock(src_folio); mmu_notifier_invalidate_range_end(&range); - folio_put(src_folio); + if (src_folio) + folio_put(src_folio); return err; } #endif /* CONFIG_USERFAULTFD */ @@ -2442,7 +2397,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry; entry = pmd_to_swp_entry(old_pmd); - page = pfn_swap_entry_to_page(entry); + folio = pfn_swap_entry_folio(entry); } else { page = pmd_page(old_pmd); folio = page_folio(page); @@ -2453,7 +2408,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, folio_remove_rmap_pmd(folio, page, vma); folio_put(folio); } - add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); + add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR); return; } @@ -2470,32 +2425,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, return __split_huge_zero_page_pmd(vma, haddr, pmd); } - /* - * Up to this point the pmd is present and huge and userland has the - * whole access to the hugepage during the split (which happens in - * place). If we overwrite the pmd with the not-huge version pointing - * to the pte here (which of course we could if all CPUs were bug - * free), userland could trigger a small page size TLB miss on the - * small sized TLB while the hugepage TLB entry is still established in - * the huge TLB. Some CPU doesn't like that. - * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum - * 383 on page 105. Intel should be safe but is also warns that it's - * only safe if the permission and cache attributes of the two entries - * loaded in the two TLB is identical (which should be the case here). - * But it is generally safer to never allow small and huge TLB entries - * for the same virtual address to be loaded simultaneously. So instead - * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the - * current pmd notpresent (atomically because here the pmd_trans_huge - * must remain set at all times on the pmd until the split is complete - * for this pmd), then we flush the SMP TLB and finally we write the - * non-huge version of the pmd entry with pmd_populate. 
- */ - old_pmd = pmdp_invalidate(vma, haddr, pmd); - - pmd_migration = is_pmd_migration_entry(old_pmd); + pmd_migration = is_pmd_migration_entry(*pmd); if (unlikely(pmd_migration)) { swp_entry_t entry; + old_pmd = *pmd; entry = pmd_to_swp_entry(old_pmd); page = pfn_swap_entry_to_page(entry); write = is_writable_migration_entry(entry); @@ -2506,6 +2440,30 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, soft_dirty = pmd_swp_soft_dirty(old_pmd); uffd_wp = pmd_swp_uffd_wp(old_pmd); } else { + /* + * Up to this point the pmd is present and huge and userland has + * the whole access to the hugepage during the split (which + * happens in place). If we overwrite the pmd with the not-huge + * version pointing to the pte here (which of course we could if + * all CPUs were bug free), userland could trigger a small page + * size TLB miss on the small sized TLB while the hugepage TLB + * entry is still established in the huge TLB. Some CPU doesn't + * like that. See + * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum + * 383 on page 105. Intel should be safe but is also warns that + * it's only safe if the permission and cache attributes of the + * two entries loaded in the two TLB is identical (which should + * be the case here). But it is generally safer to never allow + * small and huge TLB entries for the same virtual address to be + * loaded simultaneously. So instead of doing "pmd_populate(); + * flush_pmd_tlb_range();" we first mark the current pmd + * notpresent (atomically because here the pmd_trans_huge must + * remain set at all times on the pmd until the split is + * complete for this pmd), then we flush the SMP TLB and finally + * we write the non-huge version of the pmd entry with + * pmd_populate. + */ + old_pmd = pmdp_invalidate(vma, haddr, pmd); page = pmd_page(old_pmd); folio = page_folio(page); if (pmd_dirty(old_pmd)) { @@ -2559,15 +2517,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte); - for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { - pte_t entry; - /* - * Note that NUMA hinting access restrictions are not - * transferred to avoid any possibility of altering - * permissions across VMAs. - */ - if (freeze || pmd_migration) { + + /* + * Note that NUMA hinting access restrictions are not transferred to + * avoid any possibility of altering permissions across VMAs. 
+ */ + if (freeze || pmd_migration) { + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { + pte_t entry; swp_entry_t swp_entry; + if (write) swp_entry = make_writable_migration_entry( page_to_pfn(page + i)); @@ -2586,25 +2545,32 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_swp_mksoft_dirty(entry); if (uffd_wp) entry = pte_swp_mkuffd_wp(entry); - } else { - entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); - if (write) - entry = pte_mkwrite(entry, vma); - if (!young) - entry = pte_mkold(entry); - /* NOTE: this may set soft-dirty too on some archs */ - if (dirty) - entry = pte_mkdirty(entry); - if (soft_dirty) - entry = pte_mksoft_dirty(entry); - if (uffd_wp) - entry = pte_mkuffd_wp(entry); + + VM_WARN_ON(!pte_none(ptep_get(pte + i))); + set_pte_at(mm, addr, pte + i, entry); } - VM_BUG_ON(!pte_none(ptep_get(pte))); - set_pte_at(mm, addr, pte, entry); - pte++; + } else { + pte_t entry; + + entry = mk_pte(page, READ_ONCE(vma->vm_page_prot)); + if (write) + entry = pte_mkwrite(entry, vma); + if (!young) + entry = pte_mkold(entry); + /* NOTE: this may set soft-dirty too on some archs */ + if (dirty) + entry = pte_mkdirty(entry); + if (soft_dirty) + entry = pte_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_mkuffd_wp(entry); + + for (i = 0; i < HPAGE_PMD_NR; i++) + VM_WARN_ON(!pte_none(ptep_get(pte + i))); + + set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR); } - pte_unmap(pte - 1); + pte_unmap(pte); if (!pmd_migration) folio_remove_rmap_pmd(folio, page, vma); @@ -2640,7 +2606,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * It's safe to call pmd_page when folio is set because it's * guaranteed that pmd is present. */ - if (folio && folio != page_folio(pmd_page(*pmd))) + if (folio && folio != pmd_folio(*pmd)) goto out; __split_huge_pmd_locked(vma, pmd, range.start, freeze); } @@ -2698,11 +2664,14 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, static void unmap_folio(struct folio *folio) { - enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | - TTU_SYNC | TTU_BATCH_FLUSH; + enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC | + TTU_BATCH_FLUSH; VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + if (folio_test_pmd_mappable(folio)) + ttu_flags |= TTU_SPLIT_HUGE_PMD; + /* * Anon pages need migration entries to preserve them, but file * pages can simply be left unmapped, then faulted back on demand. @@ -2736,7 +2705,6 @@ static void lru_add_page_tail(struct page *head, struct page *tail, struct lruvec *lruvec, struct list_head *list) { VM_BUG_ON_PAGE(!PageHead(head), head); - VM_BUG_ON_PAGE(PageCompound(tail), head); VM_BUG_ON_PAGE(PageLRU(tail), head); lockdep_assert_held(&lruvec->lru_lock); @@ -2757,7 +2725,8 @@ static void lru_add_page_tail(struct page *head, struct page *tail, } static void __split_huge_page_tail(struct folio *folio, int tail, - struct lruvec *lruvec, struct list_head *list) + struct lruvec *lruvec, struct list_head *list, + unsigned int new_order) { struct page *head = &folio->page; struct page *page_tail = head + tail; @@ -2827,10 +2796,15 @@ static void __split_huge_page_tail(struct folio *folio, int tail, * which needs correct compound_head(). */ clear_compound_head(page_tail); + if (new_order) { + prep_compound_page(page_tail, new_order); + folio_set_large_rmappable(new_folio); + } /* Finally unfreeze refcount. Additional reference from page cache. 
*/ - page_ref_unfreeze(page_tail, 1 + (!folio_test_anon(folio) || - folio_test_swapcache(folio))); + page_ref_unfreeze(page_tail, + 1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ? + folio_nr_pages(new_folio) : 0)); if (folio_test_young(folio)) folio_set_young(new_folio); @@ -2848,18 +2822,20 @@ static void __split_huge_page_tail(struct folio *folio, int tail, } static void __split_huge_page(struct page *page, struct list_head *list, - pgoff_t end) + pgoff_t end, unsigned int new_order) { struct folio *folio = page_folio(page); struct page *head = &folio->page; struct lruvec *lruvec; struct address_space *swap_cache = NULL; unsigned long offset = 0; - unsigned int nr = thp_nr_pages(head); int i, nr_dropped = 0; + unsigned int new_nr = 1 << new_order; + int order = folio_order(folio); + unsigned int nr = 1 << order; /* complete memcg works before add pages to LRU */ - split_page_memcg(head, nr); + split_page_memcg(head, order, new_order); if (folio_test_anon(folio) && folio_test_swapcache(folio)) { offset = swp_offset(folio->swap); @@ -2872,13 +2848,13 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageHasHWPoisoned(head); - for (i = nr - 1; i >= 1; i--) { - __split_huge_page_tail(folio, i, lruvec, list); + for (i = nr - new_nr; i >= new_nr; i -= new_nr) { + __split_huge_page_tail(folio, i, lruvec, list, new_order); /* Some pages can be beyond EOF: drop them from page cache */ if (head[i].index >= end) { struct folio *tail = page_folio(head + i); - if (shmem_mapping(head->mapping)) + if (shmem_mapping(folio->mapping)) nr_dropped++; else if (folio_test_clear_dirty(tail)) folio_account_cleaned(tail, @@ -2886,7 +2862,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, __filemap_remove_folio(tail, NULL); folio_put(tail); } else if (!PageAnon(page)) { - __xa_store(&head->mapping->i_pages, head[i].index, + __xa_store(&folio->mapping->i_pages, head[i].index, head + i, 0); } else if (swap_cache) { __xa_store(&swap_cache->i_pages, offset + i, @@ -2894,40 +2870,53 @@ static void __split_huge_page(struct page *page, struct list_head *list, } } - ClearPageCompound(head); + if (!new_order) + ClearPageCompound(head); + else { + struct folio *new_folio = (struct folio *)head; + + folio_set_order(new_folio, new_order); + } unlock_page_lruvec(lruvec); /* Caller disabled irqs, so they are still disabled here */ - split_page_owner(head, nr); + split_page_owner(head, order, new_order); + pgalloc_tag_split(head, 1 << order); /* See comment in __split_huge_page_tail() */ - if (PageAnon(head)) { + if (folio_test_anon(folio)) { /* Additional pin to swap cache */ - if (PageSwapCache(head)) { - page_ref_add(head, 2); + if (folio_test_swapcache(folio)) { + folio_ref_add(folio, 1 + new_nr); xa_unlock(&swap_cache->i_pages); } else { - page_ref_inc(head); + folio_ref_inc(folio); } } else { /* Additional pin to page cache */ - page_ref_add(head, 2); - xa_unlock(&head->mapping->i_pages); + folio_ref_add(folio, 1 + new_nr); + xa_unlock(&folio->mapping->i_pages); } local_irq_enable(); if (nr_dropped) - shmem_uncharge(head->mapping->host, nr_dropped); + shmem_uncharge(folio->mapping->host, nr_dropped); remap_page(folio, nr); - if (folio_test_swapcache(folio)) - split_swap_cluster(folio->swap); + /* + * set page to its compound_head when split to non order-0 pages, so + * we can skip unlocking it below, since PG_locked is transferred to + * the compound_head of the page and the caller will unlock it. 
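With a non-zero new_order, the tail loop above walks the folio in steps of new_nr pages rather than page by page. Worked numbers, purely illustrative:

	unsigned int nr     = 1u << order;	/* order 9: 512 pages            */
	unsigned int new_nr = 1u << new_order;	/* new_order 2: 4 pages per tail */
	/* tails are prepared at offsets 508, 504, ..., 4; offset 0 stays the
	 * head, giving nr / new_nr = 128 order-2 folios in total. */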
+ */ + if (new_order) + page = compound_head(page); - for (i = 0; i < nr; i++) { + for (i = 0; i < nr; i += new_nr) { struct page *subpage = head + i; + struct folio *new_folio = page_folio(subpage); if (subpage == page) continue; - unlock_page(subpage); + folio_unlock(new_folio); /* * Subpages may be freed if there wasn't any mapping @@ -2957,31 +2946,59 @@ bool can_split_folio(struct folio *folio, int *pextra_pins) } /* - * This function splits huge page into normal pages. @page can point to any - * subpage of huge page to split. Split doesn't change the position of @page. + * This function splits a large folio into smaller folios of order @new_order. + * @page can point to any page of the large folio to split. The split operation + * does not change the position of @page. + * + * Prerequisites: * - * Only caller must hold pin on the @page, otherwise split fails with -EBUSY. - * The huge page must be locked. + * 1) The caller must hold a reference on the @page's owning folio, also known + * as the large folio. + * + * 2) The large folio must be locked. + * + * 3) The folio must not be pinned. Any unexpected folio references, including + * GUP pins, will result in the folio not getting split; instead, the caller + * will receive an -EAGAIN. + * + * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not + * supported for non-file-backed folios, because folio->_deferred_list, which + * is used by partially mapped folios, is stored in subpage 2, but an order-1 + * folio only has subpages 0 and 1. File-backed order-1 folios are supported, + * since they do not use _deferred_list. + * + * After splitting, the caller's folio reference will be transferred to @page, + * resulting in a raised refcount of @page after this call. The other pages may + * be freed if they are not mapped. * * If @list is null, tail pages will be added to LRU list, otherwise, to @list. * - * Both head page and tail pages will inherit mapping, flags, and so on from - * the hugepage. + * Pages in @new_order will inherit the mapping, flags, and so on from the + * huge page. * - * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if - * they are not mapped. + * Returns 0 if the huge page was split successfully. * - * Returns 0 if the hugepage is split successfully. - * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under - * us. + * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if + * the folio was concurrently removed from the page cache. + * + * Returns -EBUSY when trying to split the huge zeropage, if the folio is + * under writeback, if fs-specific folio metadata cannot currently be + * released, or if some unexpected race happened (e.g., anon VMA disappeared, + * truncation). + * + * Returns -EINVAL when trying to split to an order that is incompatible + * with the folio. Splitting to order 0 is compatible with all folios. 
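A hedged caller-side sketch of the interface documented above, using the split_folio_to_order() wrapper this patch uses elsewhere (error handling trimmed):

	folio_get(folio);			/* prerequisite 1: own a reference     */
	folio_lock(folio);			/* prerequisite 2: the folio is locked */
	err = split_folio_to_order(folio, new_order);
	folio_unlock(folio);			/* lock and reference stay with @page  */
	folio_put(folio);
	if (err == -EAGAIN)
		;				/* unexpected references, e.g. GUP pins; retry later */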
*/ -int split_huge_page_to_list(struct page *page, struct list_head *list) +int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order) { struct folio *folio = page_folio(page); struct deferred_split *ds_queue = get_deferred_split_queue(folio); - XA_STATE(xas, &folio->mapping->i_pages, folio->index); + /* reset xarray order to new order after split */ + XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; + bool is_thp = folio_test_pmd_mappable(folio); int extra_pins, ret; pgoff_t end; bool is_hzp; @@ -2989,7 +3006,35 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); - is_hzp = is_huge_zero_page(&folio->page); + if (new_order >= folio_order(folio)) + return -EINVAL; + + /* Cannot split anonymous THP to order-1 */ + if (new_order == 1 && folio_test_anon(folio)) { + VM_WARN_ONCE(1, "Cannot split to order-1 folio"); + return -EINVAL; + } + + if (new_order) { + /* Only swapping a whole PMD-mapped folio is supported */ + if (folio_test_swapcache(folio)) + return -EINVAL; + /* Split shmem folio to non-zero order not supported */ + if (shmem_mapping(folio->mapping)) { + VM_WARN_ONCE(1, + "Cannot split shmem folio to non-0 order"); + return -EINVAL; + } + /* No split if the file system does not support large folio */ + if (!mapping_large_folio_support(folio->mapping)) { + VM_WARN_ONCE(1, + "Cannot split file folio to non-0 order"); + return -EINVAL; + } + } + + + is_hzp = is_huge_zero_folio(folio); if (is_hzp) { pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); return -EBUSY; @@ -3082,16 +3127,24 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); if (folio_ref_freeze(folio, 1 + extra_pins)) { - if (!list_empty(&folio->_deferred_list)) { + if (folio_order(folio) > 1 && + !list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; - list_del(&folio->_deferred_list); + /* + * Reinitialize page_deferred_list after removing the + * page from the split_queue, otherwise a subsequent + * split will see list corruption when checking the + * page_deferred_list. + */ + list_del_init(&folio->_deferred_list); } spin_unlock(&ds_queue->split_queue_lock); if (mapping) { int nr = folio_nr_pages(folio); xas_split(&xas, folio, folio_order(folio)); - if (folio_test_pmd_mappable(folio)) { + if (folio_test_pmd_mappable(folio) && + new_order < HPAGE_PMD_ORDER) { if (folio_test_swapbacked(folio)) { __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); @@ -3103,7 +3156,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } } - __split_huge_page(page, list, end); + __split_huge_page(page, list, end, new_order); ret = 0; } else { spin_unlock(&ds_queue->split_queue_lock); @@ -3124,7 +3177,8 @@ out_unlock: i_mmap_unlock_read(mapping); out: xas_destroy(&xas); - count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); + if (is_thp) + count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); return ret; } @@ -3133,6 +3187,9 @@ void folio_undo_large_rmappable(struct folio *folio) struct deferred_split *ds_queue; unsigned long flags; + if (folio_order(folio) <= 1) + return; + /* * At this point, there is no one trying to add the folio to * deferred_list. 
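The new_order gating added above condenses to roughly the helper below (hypothetical name; the shmem and mapping_large_folio_support() checks for file-backed folios are left out):

	static bool new_order_ok(struct folio *folio, unsigned int new_order)
	{
		if (new_order >= folio_order(folio))
			return false;	/* can only split to a smaller order              */
		if (new_order == 1 && folio_test_anon(folio))
			return false;	/* order-1 anon has no room for _deferred_list    */
		if (new_order && folio_test_swapcache(folio))
			return false;	/* swap cache only handles whole PMD-sized folios */
		return true;
	}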
If folio is not in deferred_list, it's safe @@ -3158,7 +3215,12 @@ void deferred_split_folio(struct folio *folio) #endif unsigned long flags; - VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); + /* + * Order 1 folios have no space for a deferred list, but we also + * won't waste much memory by not adding them to the deferred list. + */ + if (folio_order(folio) <= 1) + return; /* * The try_to_unmap() in page reclaim path might reach here too, @@ -3178,7 +3240,8 @@ void deferred_split_folio(struct folio *folio) spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (list_empty(&folio->_deferred_list)) { - count_vm_event(THP_DEFERRED_SPLIT_PAGE); + if (folio_test_pmd_mappable(folio)) + count_vm_event(THP_DEFERRED_SPLIT_PAGE); list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); ds_queue->split_queue_len++; #ifdef CONFIG_MEMCG @@ -3316,7 +3379,7 @@ static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) } static int split_huge_pages_pid(int pid, unsigned long vaddr_start, - unsigned long vaddr_end) + unsigned long vaddr_end, unsigned int new_order) { int ret = 0; struct task_struct *task; @@ -3379,14 +3442,23 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, if (!is_transparent_hugepage(folio)) goto next; + if (new_order >= folio_order(folio)) + goto next; + total++; - if (!can_split_folio(folio, NULL)) + /* + * For folios with private, split_huge_page_to_list_to_order() + * will try to drop it before split and then check if the folio + * can be split or not. So skip the check here. + */ + if (!folio_test_private(folio) && + !can_split_folio(folio, NULL)) goto next; if (!folio_trylock(folio)) goto next; - if (!split_folio(folio)) + if (!split_folio_to_order(folio, new_order)) split++; folio_unlock(folio); @@ -3404,7 +3476,7 @@ out: } static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, - pgoff_t off_end) + pgoff_t off_end, unsigned int new_order) { struct filename *file; struct file *candidate; @@ -3440,10 +3512,13 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, total++; nr_pages = folio_nr_pages(folio); + if (new_order >= folio_order(folio)) + goto next; + if (!folio_trylock(folio)) goto next; - if (!split_folio(folio)) + if (!split_folio_to_order(folio, new_order)) split++; folio_unlock(folio); @@ -3468,10 +3543,14 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, { static DEFINE_MUTEX(split_debug_mutex); ssize_t ret; - /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */ + /* + * hold pid, start_vaddr, end_vaddr, new_order or + * file_path, off_start, off_end, new_order + */ char input_buf[MAX_INPUT_BUF_SZ]; int pid; unsigned long vaddr_start, vaddr_end; + unsigned int new_order = 0; ret = mutex_lock_interruptible(&split_debug_mutex); if (ret) @@ -3500,29 +3579,29 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, goto out; } - ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end); - if (ret != 2) { + ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order); + if (ret != 2 && ret != 3) { ret = -EINVAL; goto out; } - ret = split_huge_pages_in_file(file_path, off_start, off_end); + ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order); if (!ret) ret = input_len; goto out; } - ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end); + ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order); if (ret == 1 && pid == 1) 
{ split_huge_pages_all(); ret = strlen(input_buf); goto out; - } else if (ret != 3) { + } else if (ret != 3 && ret != 4) { ret = -EINVAL; goto out; } - ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end); + ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order); if (!ret) ret = strlen(input_buf); out: diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ed1581b670d4..6be78e7d4f6e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,6 +35,7 @@ #include <linux/delayacct.h> #include <linux/memory.h> #include <linux/mm_inline.h> +#include <linux/padata.h> #include <asm/page.h> #include <asm/pgalloc.h> @@ -68,7 +69,7 @@ static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) #endif static unsigned long hugetlb_cma_size __initdata; -__initdata LIST_HEAD(huge_boot_pages); +__initdata struct list_head huge_boot_pages[MAX_NUMNODES]; /* for command line parsing */ static struct hstate * __initdata parsed_hstate; @@ -1464,15 +1465,15 @@ static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) * next node from which to allocate, handling wrap at end of node * mask. */ -static int hstate_next_node_to_alloc(struct hstate *h, +static int hstate_next_node_to_alloc(int *next_node, nodemask_t *nodes_allowed) { int nid; VM_BUG_ON(!nodes_allowed); - nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); - h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); + nid = get_valid_node_allowed(*next_node, nodes_allowed); + *next_node = next_node_allowed(nid, nodes_allowed); return nid; } @@ -1495,10 +1496,10 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) return nid; } -#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ +#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask) \ for (nr_nodes = nodes_weight(*mask); \ nr_nodes > 0 && \ - ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ + ((node = hstate_next_node_to_alloc(next_node, mask)) || 1); \ nr_nodes--) #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ @@ -1516,7 +1517,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, struct page *p; atomic_set(&folio->_entire_mapcount, 0); - atomic_set(&folio->_nr_pages_mapped, 0); + atomic_set(&folio->_large_mapcount, 0); atomic_set(&folio->_pincount, 0); for (i = 1; i < nr_pages; i++) { @@ -1618,19 +1619,11 @@ static inline void destroy_compound_gigantic_folio(struct folio *folio, unsigned int order) { } #endif -static inline void __clear_hugetlb_destructor(struct hstate *h, - struct folio *folio) -{ - lockdep_assert_held(&hugetlb_lock); - - folio_clear_hugetlb(folio); -} - /* * Remove hugetlb folio from lists. - * If vmemmap exists for the folio, update dtor so that the folio appears - * as just a compound page. Otherwise, wait until after allocating vmemmap - * to update dtor. + * If vmemmap exists for the folio, clear the hugetlb flag so that the + * folio appears as just a compound page. Otherwise, wait until after + * allocating vmemmap to clear the flag. * * A reference is held on the folio, except in the case of demote. * @@ -1661,12 +1654,12 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio, } /* - * We can only clear the hugetlb destructor after allocating vmemmap + * We can only clear the hugetlb flag after allocating vmemmap * pages. Otherwise, someone (memory error handling) may try to write * to tail struct pages. 
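For reference, the split_huge_pages debugfs file changed earlier in this patch (usually found at /sys/kernel/debug/split_huge_pages) now takes an optional trailing field in both input forms, matching the extended sscanf() formats:

	<pid>,<vaddr_start>,<vaddr_end>[,<new_order>]
	<path>,<off_start>,<off_end>[,<new_order>]

When the field is omitted, new_order stays 0 and folios are split all the way down to base pages.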
*/ if (!folio_test_hugetlb_vmemmap_optimized(folio)) - __clear_hugetlb_destructor(h, folio); + __folio_clear_hugetlb(folio); /* * In the case of demote we do not ref count the page as it will soon @@ -1710,7 +1703,7 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, h->surplus_huge_pages_node[nid]++; } - folio_set_hugetlb(folio); + __folio_set_hugetlb(folio); folio_change_private(folio, NULL); /* * We have to set hugetlb_vmemmap_optimized again as above @@ -1733,14 +1726,14 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, */ return; - arch_clear_hugepage_flags(&folio->page); + arch_clear_hugetlb_flags(folio); enqueue_hugetlb_folio(h, folio); } static void __update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio) { - bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio); + bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio); if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; @@ -1753,11 +1746,11 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, return; /* - * If folio is not vmemmap optimized (!clear_dtor), then the folio + * If folio is not vmemmap optimized (!clear_flag), then the folio * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore_folio * can only be passed hugetlb pages and will BUG otherwise. */ - if (clear_dtor && hugetlb_vmemmap_restore_folio(h, folio)) { + if (clear_flag && hugetlb_vmemmap_restore_folio(h, folio)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1778,11 +1771,11 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, /* * If vmemmap pages were allocated above, then we need to clear the - * hugetlb destructor under the hugetlb lock. + * hugetlb flag under the hugetlb lock. */ - if (clear_dtor) { + if (folio_test_hugetlb(folio)) { spin_lock_irq(&hugetlb_lock); - __clear_hugetlb_destructor(h, folio); + __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); } @@ -1795,7 +1788,8 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, destroy_compound_gigantic_folio(folio, huge_page_order(h)); free_gigantic_folio(folio, huge_page_order(h)); } else { - __free_pages(&folio->page, huge_page_order(h)); + INIT_LIST_HEAD(&folio->_deferred_list); + folio_put(folio); } } @@ -1883,7 +1877,7 @@ static void bulk_vmemmap_restore_error(struct hstate *h, list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); - __clear_hugetlb_destructor(h, folio); + __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, folio, false); cond_resched(); @@ -1908,7 +1902,7 @@ static void bulk_vmemmap_restore_error(struct hstate *h, } else { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); - __clear_hugetlb_destructor(h, folio); + __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, folio, false); cond_resched(); @@ -1941,14 +1935,14 @@ retry: * should only be pages on the non_hvo_folios list. * Do note that the non_hvo_folios list could be empty. * Without HVO enabled, ret will be 0 and there is no need to call - * __clear_hugetlb_destructor as this was done previously. + * __folio_clear_hugetlb as this was done previously. 
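A hedged sketch of the ordering these hunks enforce when freeing a vmemmap-optimized hugetlb folio (error handling and the gigantic-page case dropped):

	if (folio_test_hugetlb_vmemmap_optimized(folio))
		hugetlb_vmemmap_restore_folio(h, folio);	/* repopulate the tail struct pages */
	spin_lock_irq(&hugetlb_lock);
	__folio_clear_hugetlb(folio);				/* only now drop the hugetlb flag   */
	spin_unlock_irq(&hugetlb_lock);
	/* ... then hand the memory back, e.g. via folio_put() ... */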
*/ VM_WARN_ON(!list_empty(folio_list)); VM_WARN_ON(ret < 0); if (!list_empty(&non_hvo_folios) && ret) { spin_lock_irq(&hugetlb_lock); list_for_each_entry(folio, &non_hvo_folios, lru) - __clear_hugetlb_destructor(h, folio); + __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); } @@ -1973,7 +1967,7 @@ void free_huge_folio(struct folio *folio) { /* * Can't pass hstate in here because it is called from the - * compound page destructor. + * generic mm code. */ struct hstate *h = folio_hstate(folio); int nid = folio_nid(folio); @@ -2030,7 +2024,7 @@ void free_huge_folio(struct folio *folio) spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_hugetlb_folio(h, folio, true); } else { - arch_clear_hugepage_flags(&folio->page); + arch_clear_hugetlb_flags(folio); enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } @@ -2048,7 +2042,7 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) { - folio_set_hugetlb(folio); + __folio_set_hugetlb(folio); INIT_LIST_HEAD(&folio->lru); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); @@ -2123,10 +2117,10 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, set_compound_head(p, &folio->page); } __folio_set_head(folio); - /* we rely on prep_new_hugetlb_folio to set the destructor */ + /* we rely on prep_new_hugetlb_folio to set the hugetlb flag */ folio_set_order(folio, order); atomic_set(&folio->_entire_mapcount, -1); - atomic_set(&folio->_nr_pages_mapped, 0); + atomic_set(&folio->_large_mapcount, -1); atomic_set(&folio->_pincount, 0); return true; @@ -2159,31 +2153,15 @@ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, } /* - * PageHuge() only returns true for hugetlbfs pages, but not for normal or - * transparent huge pages. See the PageTransHuge() documentation for more - * details. - */ -int PageHuge(struct page *page) -{ - struct folio *folio; - - if (!PageCompound(page)) - return 0; - folio = page_folio(page); - return folio_test_hugetlb(folio); -} -EXPORT_SYMBOL_GPL(PageHuge); - -/* * Find and lock address space (mapping) in write mode. * - * Upon entry, the page is locked which means that page_mapping() is + * Upon entry, the folio is locked which means that folio_mapping() is * stable. Due to locking order, we can only trylock_write. If we can * not get the lock, simply return NULL to caller. */ -struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) +struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio) { - struct address_space *mapping = page_mapping(hpage); + struct address_space *mapping = folio_mapping(folio); if (!mapping) return mapping; @@ -2199,13 +2177,13 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, nodemask_t *node_alloc_noretry) { int order = huge_page_order(h); - struct page *page; + struct folio *folio; bool alloc_try_hard = true; bool retry = true; /* - * By default we always try hard to allocate the page with - * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in + * By default we always try hard to allocate the folio with + * __GFP_RETRY_MAYFAIL flag. However, if we are allocating folios in * a loop (to adjust global huge page counts) and previous allocation * failed, do not continue to try hard on the same node. Use the * node_alloc_noretry bitmap to manage this state information. 
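The node_alloc_noretry bookkeeping described above reduces to roughly this (a sketch, not the full function; the later NULL checks and the ref-freeze retry are omitted):

	bool try_hard = !(node_alloc_noretry && node_isset(nid, *node_alloc_noretry));

	if (try_hard)
		gfp_mask |= __GFP_RETRY_MAYFAIL;
	folio = __folio_alloc(gfp_mask, order, nid, nmask);
	if (folio && !try_hard)
		node_clear(nid, *node_alloc_noretry);	/* node recovered, try hard next time */
	if (!folio && try_hard)
		node_set(nid, *node_alloc_noretry);	/* back off on this node for a while  */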
@@ -2218,43 +2196,42 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, if (nid == NUMA_NO_NODE) nid = numa_mem_id(); retry: - page = __alloc_pages(gfp_mask, order, nid, nmask); + folio = __folio_alloc(gfp_mask, order, nid, nmask); - /* Freeze head page */ - if (page && !page_ref_freeze(page, 1)) { - __free_pages(page, order); + if (folio && !folio_ref_freeze(folio, 1)) { + folio_put(folio); if (retry) { /* retry once */ retry = false; goto retry; } /* WOW! twice in a row. */ - pr_warn("HugeTLB head page unexpected inflated ref count\n"); - page = NULL; + pr_warn("HugeTLB unexpected inflated folio ref count\n"); + folio = NULL; } /* - * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this - * indicates an overall state change. Clear bit so that we resume - * normal 'try hard' allocations. + * If we did not specify __GFP_RETRY_MAYFAIL, but still got a + * folio this indicates an overall state change. Clear bit so + * that we resume normal 'try hard' allocations. */ - if (node_alloc_noretry && page && !alloc_try_hard) + if (node_alloc_noretry && folio && !alloc_try_hard) node_clear(nid, *node_alloc_noretry); /* - * If we tried hard to get a page but failed, set bit so that + * If we tried hard to get a folio but failed, set bit so that * subsequent attempts will not try as hard until there is an * overall state change. */ - if (node_alloc_noretry && !page && alloc_try_hard) + if (node_alloc_noretry && !folio && alloc_try_hard) node_set(nid, *node_alloc_noretry); - if (!page) { + if (!folio) { __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); return NULL; } __count_vm_event(HTLB_BUDDY_PGALLOC); - return page_folio(page); + return folio; } static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h, @@ -2350,12 +2327,13 @@ static void prep_and_add_allocated_folios(struct hstate *h, */ static struct folio *alloc_pool_huge_folio(struct hstate *h, nodemask_t *nodes_allowed, - nodemask_t *node_alloc_noretry) + nodemask_t *node_alloc_noretry, + int *next_node) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nr_nodes, node; - for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + for_each_node_mask_to_alloc(next_node, nr_nodes, node, nodes_allowed) { struct folio *folio; folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node, @@ -2399,8 +2377,8 @@ static struct folio *remove_pool_hugetlb_folio(struct hstate *h, } /* - * Dissolve a given free hugepage into free buddy pages. This function does - * nothing for in-use hugepages and non-hugepages. + * Dissolve a given free hugetlb folio into free buddy pages. This function + * does nothing for in-use hugetlb folios and non-hugetlb folios. * This function returns values like below: * * -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages @@ -2412,10 +2390,9 @@ static struct folio *remove_pool_hugetlb_folio(struct hstate *h, * 0: successfully dissolved free hugepages or the page is not a * hugepage (considered as already dissolved) */ -int dissolve_free_huge_page(struct page *page) +int dissolve_free_hugetlb_folio(struct folio *folio) { int rc = -EBUSY; - struct folio *folio = page_folio(page); retry: /* Not to disrupt normal path by vainly holding hugetlb_lock */ @@ -2492,13 +2469,13 @@ out: * make specified memory blocks removable from the system. * Note that this will dissolve a free gigantic hugepage completely, if any * part of it lies within the given range. 
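A hedged caller-side sketch in the style of memory offlining, the main user of this interface (the label name is hypothetical):

	/* return any free hugetlb folios overlapping the range to the buddy */
	ret = dissolve_free_hugetlb_folios(start_pfn, end_pfn);
	if (ret)
		goto failed_removal;	/* e.g. a folio is in use, or vmemmap restore failed */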
- * Also note that if dissolve_free_huge_page() returns with an error, all - * free hugepages that were dissolved before that error are lost. + * Also note that if dissolve_free_hugetlb_folio() returns with an error, all + * free hugetlb folios that were dissolved before that error are lost. */ -int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) +int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; - struct page *page; + struct folio *folio; int rc = 0; unsigned int order; struct hstate *h; @@ -2511,8 +2488,8 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) order = min(order, huge_page_order(h)); for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) { - page = pfn_to_page(pfn); - rc = dissolve_free_huge_page(page); + folio = pfn_folio(pfn); + rc = dissolve_free_hugetlb_folio(folio); if (rc) break; } @@ -2619,7 +2596,7 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, /* folio migration callback function */ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, - nodemask_t *nmask, gfp_t gfp_mask) + nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback) { spin_lock_irq(&hugetlb_lock); if (available_huge_pages(h)) { @@ -2634,6 +2611,10 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, } spin_unlock_irq(&hugetlb_lock); + /* We cannot fallback to other nodes, as we could break the per-node pool. */ + if (!allow_alloc_fallback) + gfp_mask |= __GFP_THISNODE; + return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } @@ -3029,21 +3010,9 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nid = folio_nid(old_folio); - struct folio *new_folio; + struct folio *new_folio = NULL; int ret = 0; - /* - * Before dissolving the folio, we need to allocate a new one for the - * pool to remain stable. Here, we allocate the folio and 'prep' it - * by doing everything but actually updating counters and adding to - * the pool. This simplifies and let us do most of the processing - * under the lock. - */ - new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL); - if (!new_folio) - return -ENOMEM; - __prep_new_hugetlb_folio(h, new_folio); - retry: spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(old_folio)) { @@ -3073,6 +3042,16 @@ retry: cond_resched(); goto retry; } else { + if (!new_folio) { + spin_unlock_irq(&hugetlb_lock); + new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, + NULL, NULL); + if (!new_folio) + return -ENOMEM; + __prep_new_hugetlb_folio(h, new_folio); + goto retry; + } + /* * Ok, old_folio is still a genuine free hugepage. Remove it from * the freelist and decrease the counters. 
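Hedged sketch of the new allow_alloc_fallback argument from the caller's point of view; passing false keeps the allocation on the preferred node so no other node's pool is drained:

	/*
	 * false: if the pool on 'nid' is empty, the surplus allocation gets
	 * __GFP_THISNODE and will not dip into another node's pool.
	 * true:  migration-style callers may fall back across 'nmask'.
	 */
	folio = alloc_hugetlb_folio_nodemask(h, nid, nmask, gfp_mask,
					     /* allow_alloc_fallback */ false);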
These will be @@ -3100,9 +3079,11 @@ retry: free_new: spin_unlock_irq(&hugetlb_lock); - /* Folio has a zero ref count, but needs a ref to be freed */ - folio_ref_unfreeze(new_folio, 1); - update_and_free_hugetlb_folio(h, new_folio, false); + if (new_folio) { + /* Folio has a zero ref count, but needs a ref to be freed */ + folio_ref_unfreeze(new_folio, 1); + update_and_free_hugetlb_folio(h, new_folio, false); + } return ret; } @@ -3266,9 +3247,12 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, rsv_adjust = hugepage_subpool_put_pages(spool, 1); hugetlb_acct_memory(h, -rsv_adjust); - if (deferred_reserve) + if (deferred_reserve) { + spin_lock_irq(&hugetlb_lock); hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); + spin_unlock_irq(&hugetlb_lock); + } } if (!memcg_charge_ret) @@ -3299,7 +3283,7 @@ int alloc_bootmem_huge_page(struct hstate *h, int nid) int __alloc_bootmem_huge_page(struct hstate *h, int nid) { struct huge_bootmem_page *m = NULL; /* initialize for clang */ - int nr_nodes, node; + int nr_nodes, node = nid; /* do node specific alloc */ if (nid != NUMA_NO_NODE) { @@ -3310,7 +3294,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) goto found; } /* allocate from next node when distributing huge pages */ - for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { + for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_MEMORY]) { m = memblock_alloc_try_nid_raw( huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); @@ -3337,7 +3321,7 @@ found: huge_page_size(h) - PAGE_SIZE); /* Put them into a private list first because mem_map is not up yet */ INIT_LIST_HEAD(&m->list); - list_add(&m->list, &huge_boot_pages); + list_add(&m->list, &huge_boot_pages[node]); m->hstate = h; return 1; } @@ -3388,8 +3372,6 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, /* Send list for bulk vmemmap optimization processing */ hugetlb_vmemmap_optimize_folios(h, folio_list); - /* Add all new pool pages to free lists in one lock cycle */ - spin_lock_irqsave(&hugetlb_lock, flags); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { if (!folio_test_hugetlb_vmemmap_optimized(folio)) { /* @@ -3402,23 +3384,25 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, HUGETLB_VMEMMAP_RESERVE_PAGES, pages_per_huge_page(h)); } + /* Subdivide locks to achieve better parallel performance */ + spin_lock_irqsave(&hugetlb_lock, flags); __prep_account_new_huge_page(h, folio_nid(folio)); enqueue_hugetlb_folio(h, folio); + spin_unlock_irqrestore(&hugetlb_lock, flags); } - spin_unlock_irqrestore(&hugetlb_lock, flags); } /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages. 
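Condensed shape of the locking change in prep_and_add_bootmem_folios() above: hugetlb_lock is now taken once per folio instead of around the whole list walk, so the parallel boot-time workers introduced below contend for it only briefly (a sketch; the vmemmap-restore branch is omitted):

	list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
		spin_lock_irqsave(&hugetlb_lock, flags);
		__prep_account_new_huge_page(h, folio_nid(folio));
		enqueue_hugetlb_folio(h, folio);
		spin_unlock_irqrestore(&hugetlb_lock, flags);
	}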
*/ -static void __init gather_bootmem_prealloc(void) +static void __init gather_bootmem_prealloc_node(unsigned long nid) { LIST_HEAD(folio_list); struct huge_bootmem_page *m; struct hstate *h = NULL, *prev_h = NULL; - list_for_each_entry(m, &huge_boot_pages, list) { + list_for_each_entry(m, &huge_boot_pages[nid], list) { struct page *page = virt_to_page(m); struct folio *folio = (void *)page; @@ -3451,6 +3435,31 @@ static void __init gather_bootmem_prealloc(void) prep_and_add_bootmem_folios(h, &folio_list); } +static void __init gather_bootmem_prealloc_parallel(unsigned long start, + unsigned long end, void *arg) +{ + int nid; + + for (nid = start; nid < end; nid++) + gather_bootmem_prealloc_node(nid); +} + +static void __init gather_bootmem_prealloc(void) +{ + struct padata_mt_job job = { + .thread_fn = gather_bootmem_prealloc_parallel, + .fn_arg = NULL, + .start = 0, + .size = num_node_state(N_MEMORY), + .align = 1, + .min_chunk = 1, + .max_threads = num_node_state(N_MEMORY), + .numa_aware = true, + }; + + padata_do_multithreaded(&job); +} + static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) { unsigned long i; @@ -3482,6 +3491,108 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) h->max_huge_pages_node[nid] = i; } +static bool __init hugetlb_hstate_alloc_pages_specific_nodes(struct hstate *h) +{ + int i; + bool node_specific_alloc = false; + + for_each_online_node(i) { + if (h->max_huge_pages_node[i] > 0) { + hugetlb_hstate_alloc_pages_onenode(h, i); + node_specific_alloc = true; + } + } + + return node_specific_alloc; +} + +static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, struct hstate *h) +{ + if (allocated < h->max_huge_pages) { + char buf[32]; + + string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); + pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", + h->max_huge_pages, buf, allocated); + h->max_huge_pages = allocated; + } +} + +static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned long end, void *arg) +{ + struct hstate *h = (struct hstate *)arg; + int i, num = end - start; + nodemask_t node_alloc_noretry; + LIST_HEAD(folio_list); + int next_node = first_online_node; + + /* Bit mask controlling how hard we retry per-node allocations.*/ + nodes_clear(node_alloc_noretry); + + for (i = 0; i < num; ++i) { + struct folio *folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], + &node_alloc_noretry, &next_node); + if (!folio) + break; + + list_move(&folio->lru, &folio_list); + cond_resched(); + } + + prep_and_add_allocated_folios(h, &folio_list); +} + +static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h) +{ + unsigned long i; + + for (i = 0; i < h->max_huge_pages; ++i) { + if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) + break; + cond_resched(); + } + + return i; +} + +static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) +{ + struct padata_mt_job job = { + .fn_arg = h, + .align = 1, + .numa_aware = true + }; + + job.thread_fn = hugetlb_pages_alloc_boot_node; + job.start = 0; + job.size = h->max_huge_pages; + + /* + * job.max_threads is twice the num_node_state(N_MEMORY), + * + * Tests below indicate that a multiplier of 2 significantly improves + * performance, and although larger values also provide improvements, + * the gains are marginal. 
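Worked numbers for the job sizing used by hugetlb_pages_alloc_boot() here, with an illustrative configuration of 2 memory nodes and hugepages=1024:

	/* job.max_threads = num_node_state(N_MEMORY) * 2 = 2 * 2        = 4
	 * job.min_chunk   = max_huge_pages / nodes / 2   = 1024 / 2 / 2 = 256
	 * so up to four NUMA-aware workers each allocate at least 256 pages. */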
+ * + * Therefore, choosing 2 as the multiplier strikes a good balance between + * enhancing parallel processing capabilities and maintaining efficient + * resource management. + * + * +------------+-------+-------+-------+-------+-------+ + * | multiplier | 1 | 2 | 3 | 4 | 5 | + * +------------+-------+-------+-------+-------+-------+ + * | 256G 2node | 358ms | 215ms | 157ms | 134ms | 126ms | + * | 2T 4node | 979ms | 679ms | 543ms | 489ms | 481ms | + * | 50G 2node | 71ms | 44ms | 37ms | 30ms | 31ms | + * +------------+-------+-------+-------+-------+-------+ + */ + job.max_threads = num_node_state(N_MEMORY) * 2; + job.min_chunk = h->max_huge_pages / num_node_state(N_MEMORY) / 2; + padata_do_multithreaded(&job); + + return h->nr_huge_pages; +} + /* * NOTE: this routine is called in different contexts for gigantic and * non-gigantic pages. @@ -3495,11 +3606,8 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) */ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { - unsigned long i; - struct folio *folio; - LIST_HEAD(folio_list); - nodemask_t *node_alloc_noretry; - bool node_specific_alloc = false; + unsigned long allocated; + static bool initialized __initdata; /* skip gigantic hugepages allocation if hugetlb_cma enabled */ if (hstate_is_gigantic(h) && hugetlb_cma_size) { @@ -3507,66 +3615,26 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) return; } - /* do node specific alloc */ - for_each_online_node(i) { - if (h->max_huge_pages_node[i] > 0) { - hugetlb_hstate_alloc_pages_onenode(h, i); - node_specific_alloc = true; - } + /* hugetlb_hstate_alloc_pages will be called many times, initialize huge_boot_pages once */ + if (!initialized) { + int i = 0; + + for (i = 0; i < MAX_NUMNODES; i++) + INIT_LIST_HEAD(&huge_boot_pages[i]); + initialized = true; } - if (node_specific_alloc) + /* do node specific alloc */ + if (hugetlb_hstate_alloc_pages_specific_nodes(h)) return; /* below will do all node balanced alloc */ - if (!hstate_is_gigantic(h)) { - /* - * Bit mask controlling how hard we retry per-node allocations. - * Ignore errors as lower level routines can deal with - * node_alloc_noretry == NULL. If this kmalloc fails at boot - * time, we are likely in bigger trouble. - */ - node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), - GFP_KERNEL); - } else { - /* allocations done at boot time */ - node_alloc_noretry = NULL; - } - - /* bit mask controlling how hard we retry per-node allocations */ - if (node_alloc_noretry) - nodes_clear(*node_alloc_noretry); - - for (i = 0; i < h->max_huge_pages; ++i) { - if (hstate_is_gigantic(h)) { - /* - * gigantic pages not added to list as they are not - * added to pools now. - */ - if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) - break; - } else { - folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], - node_alloc_noretry); - if (!folio) - break; - list_add(&folio->lru, &folio_list); - } - cond_resched(); - } - - /* list will be empty if hstate_is_gigantic */ - prep_and_add_allocated_folios(h, &folio_list); - - if (i < h->max_huge_pages) { - char buf[32]; + if (hstate_is_gigantic(h)) + allocated = hugetlb_gigantic_pages_alloc_boot(h); + else + allocated = hugetlb_pages_alloc_boot(h); - string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); - pr_warn("HugeTLB: allocating %lu of page size %s failed. 
Only allocated %lu hugepages.\n", - h->max_huge_pages, buf, i); - h->max_huge_pages = i; - } - kfree(node_alloc_noretry); + hugetlb_hstate_alloc_pages_errcheck(allocated, h); } static void __init hugetlb_init_hstates(void) @@ -3668,7 +3736,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, VM_BUG_ON(delta != -1 && delta != 1); if (delta < 0) { - for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, nodes_allowed) { if (h->surplus_huge_pages_node[node]) goto found; } @@ -3783,7 +3851,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, cond_resched(); folio = alloc_pool_huge_folio(h, nodes_allowed, - node_alloc_noretry); + node_alloc_noretry, + &h->next_nid_to_alloc); if (!folio) { prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); @@ -4958,7 +5027,6 @@ static struct ctl_table hugetlb_table[] = { .mode = 0644, .proc_handler = hugetlb_overcommit_handler, }, - { } }; static void hugetlb_sysctl_init(void) @@ -5585,6 +5653,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); + bool adjust_reservation = false; unsigned long last_addr_mask; bool force_flush = false; @@ -5677,7 +5746,31 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, hugetlb_count_sub(pages_per_huge_page(h), mm); hugetlb_remove_rmap(page_folio(page)); + /* + * Restore the reservation for anonymous page, otherwise the + * backing page could be stolen by someone. + * If there we are freeing a surplus, do not set the restore + * reservation bit. + */ + if (!h->surplus_huge_pages && __vma_private_lock(vma) && + folio_test_anon(page_folio(page))) { + folio_set_hugetlb_restore_reserve(page_folio(page)); + /* Reservation to be adjusted after the spin lock */ + adjust_reservation = true; + } + spin_unlock(ptl); + + /* + * Adjust the reservation for the region that will have the + * reserve restored. Keep in mind that vma_needs_reservation() changes + * resv->adds_in_progress if it succeeds. If this is not done, + * do_exit() will not see it, and will keep the reservation + * forever. + */ + if (adjust_reservation && vma_needs_reservation(h, vma, address)) + vma_add_reservation(h, vma, address); + tlb_remove_page_size(tlb, page, huge_page_size(h)); /* * Bail out after unmapping reference page if supplied @@ -5824,18 +5917,18 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. 
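Ordering note on the unmap hunk above, shown as a compressed sketch: the restore-reserve flag is set while the page-table lock is still held, and the reservation map is only adjusted after that lock is dropped, as the diff's own comment requires (TLB and rmap bookkeeping omitted):

	if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
	    folio_test_anon(page_folio(page))) {
		folio_set_hugetlb_restore_reserve(page_folio(page));	/* under ptl */
		adjust_reservation = true;
	}
	spin_unlock(ptl);
	if (adjust_reservation && vma_needs_reservation(h, vma, address))
		vma_add_reservation(h, vma, address);			/* after ptl */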
*/ -static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, unsigned int flags, - struct folio *pagecache_folio, spinlock_t *ptl) +static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, + struct vm_fault *vmf) { - const bool unshare = flags & FAULT_FLAG_UNSHARE; - pte_t pte = huge_ptep_get(ptep); + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vma->vm_mm; + const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; + pte_t pte = huge_ptep_get(vmf->pte); struct hstate *h = hstate_vma(vma); struct folio *old_folio; struct folio *new_folio; int outside_reserve = 0; vm_fault_t ret = 0; - unsigned long haddr = address & huge_page_mask(h); struct mmu_notifier_range range; /* @@ -5858,7 +5951,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, /* Let's take out MAP_SHARED mappings first. */ if (vma->vm_flags & VM_MAYSHARE) { - set_huge_ptep_writable(vma, haddr, ptep); + set_huge_ptep_writable(vma, vmf->address, vmf->pte); return 0; } @@ -5870,6 +5963,13 @@ retry_avoidcopy: /* * If no-one else is actually using this page, we're the exclusive * owner and can reuse this page. + * + * Note that we don't rely on the (safer) folio refcount here, because + * copying the hugetlb folio when there are unexpected (temporary) + * folio references could harm simple fork()+exit() users when + * we run out of free hugetlb folios: we would have to kill processes + * in scenarios that used to work. As a side effect, there can still + * be leaks between processes, for example, with FOLL_GET users. */ if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) { if (!PageAnonExclusive(&old_folio->page)) { @@ -5877,7 +5977,7 @@ retry_avoidcopy: SetPageAnonExclusive(&old_folio->page); } if (likely(!unshare)) - set_huge_ptep_writable(vma, haddr, ptep); + set_huge_ptep_writable(vma, vmf->address, vmf->pte); delayacct_wpcopy_end(); return 0; @@ -5904,8 +6004,8 @@ retry_avoidcopy: * Drop page table lock as buddy allocator may be called. It will * be acquired again before returning to the caller, as expected. */ - spin_unlock(ptl); - new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve); + spin_unlock(vmf->ptl); + new_folio = alloc_hugetlb_folio(vma, vmf->address, outside_reserve); if (IS_ERR(new_folio)) { /* @@ -5930,19 +6030,21 @@ retry_avoidcopy: * * Reacquire both after unmap operation. */ - idx = vma_hugecache_offset(h, vma, haddr); + idx = vma_hugecache_offset(h, vma, vmf->address); hash = hugetlb_fault_mutex_hash(mapping, idx); hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - unmap_ref_private(mm, vma, &old_folio->page, haddr); + unmap_ref_private(mm, vma, &old_folio->page, + vmf->address); mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(vma); - spin_lock(ptl); - ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); - if (likely(ptep && - pte_same(huge_ptep_get(ptep), pte))) + spin_lock(vmf->ptl); + vmf->pte = hugetlb_walk(vma, vmf->address, + huge_page_size(h)); + if (likely(vmf->pte && + pte_same(huge_ptep_get(vmf->pte), pte))) goto retry_avoidcopy; /* * race occurs while re-acquiring page table @@ -5960,42 +6062,42 @@ retry_avoidcopy: * When the original hugepage is shared one, it does not have * anon_vma prepared. 
*/ - if (unlikely(anon_vma_prepare(vma))) { - ret = VM_FAULT_OOM; + ret = vmf_anon_prepare(vmf); + if (unlikely(ret)) goto out_release_all; - } - if (copy_user_large_folio(new_folio, old_folio, address, vma)) { - ret = VM_FAULT_HWPOISON_LARGE; + if (copy_user_large_folio(new_folio, old_folio, vmf->real_address, vma)) { + ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_release_all; } __folio_mark_uptodate(new_folio); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, - haddr + huge_page_size(h)); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address, + vmf->address + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); /* * Retake the page table lock to check for racing updates * before the page tables are altered */ - spin_lock(ptl); - ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); - if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { + spin_lock(vmf->ptl); + vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); + if (likely(vmf->pte && pte_same(huge_ptep_get(vmf->pte), pte))) { pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare); /* Break COW or unshare */ - huge_ptep_clear_flush(vma, haddr, ptep); + huge_ptep_clear_flush(vma, vmf->address, vmf->pte); hugetlb_remove_rmap(old_folio); - hugetlb_add_new_anon_rmap(new_folio, vma, haddr); + hugetlb_add_new_anon_rmap(new_folio, vma, vmf->address); if (huge_pte_uffd_wp(pte)) newpte = huge_pte_mkuffd_wp(newpte); - set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h)); + set_huge_pte_at(mm, vmf->address, vmf->pte, newpte, + huge_page_size(h)); folio_set_hugetlb_migratable(new_folio); /* Make the old page be freed below */ new_folio = old_folio; } - spin_unlock(ptl); + spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(&range); out_release_all: /* @@ -6003,12 +6105,12 @@ out_release_all: * unshare) */ if (new_folio != old_folio) - restore_reserve_on_error(h, vma, haddr, new_folio); + restore_reserve_on_error(h, vma, vmf->address, new_folio); folio_put(new_folio); out_release_old: folio_put(old_folio); - spin_lock(ptl); /* Caller expects lock to be held */ + spin_lock(vmf->ptl); /* Caller expects lock to be held */ delayacct_wpcopy_end(); return ret; @@ -6017,8 +6119,8 @@ out_release_old: /* * Return whether there is a pagecache page to back given address within VMA. */ -static bool hugetlbfs_pagecache_present(struct hstate *h, - struct vm_area_struct *vma, unsigned long address) +bool hugetlbfs_pagecache_present(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = linear_page_index(vma, address); @@ -6060,39 +6162,21 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping return 0; } -static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, +static inline vm_fault_t hugetlb_handle_userfault(struct vm_fault *vmf, struct address_space *mapping, - pgoff_t idx, - unsigned int flags, - unsigned long haddr, - unsigned long addr, unsigned long reason) { u32 hash; - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .real_address = addr, - .flags = flags, - - /* - * Hard to debug if it ends up being - * used by a callee that assumes - * something about the other - * uninitialized fields... same as in - * memory.c - */ - }; /* * vma_lock and hugetlb_fault_mutex must be dropped before handling * userfault. 
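On the switch from anon_vma_prepare() to vmf_anon_prepare() in the hunks above: the wrapper returns a vm_fault_t, so the fault code is propagated as-is instead of being hard-coded to VM_FAULT_OOM (sketch of the resulting call shape):

	ret = vmf_anon_prepare(vmf);	/* 0 on success, a VM_FAULT_* code on failure */
	if (unlikely(ret))
		goto out_release_all;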
Also mmap_lock could be dropped due to handling * userfault, any vma operation should be careful from here. */ - hugetlb_vma_unlock_read(vma); - hash = hugetlb_fault_mutex_hash(mapping, idx); + hugetlb_vma_unlock_read(vmf->vma); + hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - return handle_userfault(&vmf, reason); + return handle_userfault(vmf, reason); } /* @@ -6112,22 +6196,19 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, return same; } -static vm_fault_t hugetlb_no_page(struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, pgoff_t idx, - unsigned long address, pte_t *ptep, - pte_t old_pte, unsigned int flags) +static vm_fault_t hugetlb_no_page(struct address_space *mapping, + struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vma->vm_mm; struct hstate *h = hstate_vma(vma); vm_fault_t ret = VM_FAULT_SIGBUS; int anon_rmap = 0; unsigned long size; struct folio *folio; pte_t new_pte; - spinlock_t *ptl; - unsigned long haddr = address & huge_page_mask(h); bool new_folio, new_pagecache_folio = false; - u32 hash = hugetlb_fault_mutex_hash(mapping, idx); + u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); /* * Currently, we are forced to kill the process in the event the @@ -6146,10 +6227,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * before we get page_table_lock. */ new_folio = false; - folio = filemap_lock_hugetlb_folio(h, mapping, idx); + folio = filemap_lock_hugetlb_folio(h, mapping, vmf->pgoff); if (IS_ERR(folio)) { size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) + if (vmf->pgoff >= size) goto out; /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { @@ -6170,17 +6251,22 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * never happen on the page after UFFDIO_COPY has * correctly installed the page and returned. */ - if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } - return hugetlb_handle_userfault(vma, mapping, idx, flags, - haddr, address, + return hugetlb_handle_userfault(vmf, mapping, VM_UFFD_MISSING); } - folio = alloc_hugetlb_folio(vma, haddr, 0); + if (!(vma->vm_flags & VM_MAYSHARE)) { + ret = vmf_anon_prepare(vmf); + if (unlikely(ret)) + goto out; + } + + folio = alloc_hugetlb_folio(vma, vmf->address, 0); if (IS_ERR(folio)) { /* * Returning error will result in faulting task being @@ -6194,18 +6280,20 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * here. Before returning error, get ptl and make * sure there really is no pte entry. */ - if (hugetlb_pte_stable(h, mm, ptep, old_pte)) + if (hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) ret = vmf_error(PTR_ERR(folio)); else ret = 0; goto out; } - clear_huge_page(&folio->page, address, pages_per_huge_page(h)); + clear_huge_page(&folio->page, vmf->real_address, + pages_per_huge_page(h)); __folio_mark_uptodate(folio); new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = hugetlb_add_to_page_cache(folio, mapping, idx); + int err = hugetlb_add_to_page_cache(folio, mapping, + vmf->pgoff); if (err) { /* * err can't be -EEXIST which implies someone @@ -6214,17 +6302,15 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * to the page cache. So it's safe to call * restore_reserve_on_error() here. 
*/ - restore_reserve_on_error(h, vma, haddr, folio); + restore_reserve_on_error(h, vma, vmf->address, + folio); folio_put(folio); + ret = VM_FAULT_SIGBUS; goto out; } new_pagecache_folio = true; } else { folio_lock(folio); - if (unlikely(anon_vma_prepare(vma))) { - ret = VM_FAULT_OOM; - goto backout_unlocked; - } anon_rmap = 1; } } else { @@ -6244,12 +6330,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, folio_unlock(folio); folio_put(folio); /* See comment in userfaultfd_missing() block above */ - if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } - return hugetlb_handle_userfault(vma, mapping, idx, flags, - haddr, address, + return hugetlb_handle_userfault(vmf, mapping, VM_UFFD_MINOR); } } @@ -6260,23 +6345,23 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * any allocations necessary to record that reservation occur outside * the spinlock. */ - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { - if (vma_needs_reservation(h, vma, haddr) < 0) { + if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + if (vma_needs_reservation(h, vma, vmf->address) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, haddr); + vma_end_reservation(h, vma, vmf->address); } - ptl = huge_pte_lock(h, mm, ptep); + vmf->ptl = huge_pte_lock(h, mm, vmf->pte); ret = 0; /* If pte changed from under us, retry */ - if (!pte_same(huge_ptep_get(ptep), old_pte)) + if (!pte_same(huge_ptep_get(vmf->pte), vmf->orig_pte)) goto backout; if (anon_rmap) - hugetlb_add_new_anon_rmap(folio, vma, haddr); + hugetlb_add_new_anon_rmap(folio, vma, vmf->address); else hugetlb_add_file_rmap(folio); new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) @@ -6285,17 +6370,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * If this pte was previously wr-protected, keep it wr-protected even * if populated. */ - if (unlikely(pte_marker_uffd_wp(old_pte))) + if (unlikely(pte_marker_uffd_wp(vmf->orig_pte))) new_pte = huge_pte_mkuffd_wp(new_pte); - set_huge_pte_at(mm, haddr, ptep, new_pte, huge_page_size(h)); + set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h)); hugetlb_count_add(pages_per_huge_page(h), mm); - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl); + ret = hugetlb_wp(folio, vmf); } - spin_unlock(ptl); + spin_unlock(vmf->ptl); /* * Only set hugetlb_migratable in newly allocated pages. 
Existing pages @@ -6312,10 +6397,10 @@ out: return ret; backout: - spin_unlock(ptl); + spin_unlock(vmf->ptl); backout_unlocked: if (new_folio && !new_pagecache_folio) - restore_reserve_on_error(h, vma, haddr, folio); + restore_reserve_on_error(h, vma, vmf->address, folio); folio_unlock(folio); folio_put(folio); @@ -6349,23 +6434,27 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - pte_t *ptep, entry; - spinlock_t *ptl; vm_fault_t ret; u32 hash; - pgoff_t idx; struct folio *folio = NULL; struct folio *pagecache_folio = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; - unsigned long haddr = address & huge_page_mask(h); + struct vm_fault vmf = { + .vma = vma, + .address = address & huge_page_mask(h), + .real_address = address, + .flags = flags, + .pgoff = vma_hugecache_offset(h, vma, + address & huge_page_mask(h)), + /* TODO: Track hugetlb faults using vm_fault */ - /* TODO: Handle faults under the VMA lock */ - if (flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } + /* + * Some fields may not be initialized, be careful as it may + * be hard to debug if called functions make assumptions + */ + }; /* * Serialize hugepage allocation and instantiation, so that we don't @@ -6373,31 +6462,31 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * the same page in the page cache. */ mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, haddr); - hash = hugetlb_fault_mutex_hash(mapping, idx); + hash = hugetlb_fault_mutex_hash(mapping, vmf.pgoff); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* * Acquire vma lock before calling huge_pte_alloc and hold - * until finished with ptep. This prevents huge_pmd_unshare from - * being called elsewhere and making the ptep no longer valid. + * until finished with vmf.pte. This prevents huge_pmd_unshare from + * being called elsewhere and making the vmf.pte no longer valid. */ hugetlb_vma_lock_read(vma); - ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); - if (!ptep) { + vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h)); + if (!vmf.pte) { hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return VM_FAULT_OOM; } - entry = huge_ptep_get(ptep); - if (huge_pte_none_mostly(entry)) { - if (is_pte_marker(entry)) { + vmf.orig_pte = huge_ptep_get(vmf.pte); + if (huge_pte_none_mostly(vmf.orig_pte)) { + if (is_pte_marker(vmf.orig_pte)) { pte_marker marker = - pte_marker_get(pte_to_swp_entry(entry)); + pte_marker_get(pte_to_swp_entry(vmf.orig_pte)); if (marker & PTE_MARKER_POISONED) { - ret = VM_FAULT_HWPOISON_LARGE; + ret = VM_FAULT_HWPOISON_LARGE | + VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; } } @@ -6408,21 +6497,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. */ - return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, - entry, flags); + return hugetlb_no_page(mapping, &vmf); } ret = 0; /* - * entry could be a migration/hwpoison entry at this point, so this - * check prevents the kernel from going below assuming that we have - * an active hugepage in pagecache. This goto expects the 2nd page - * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will - * properly handle it. 
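On the two address fields seeded into the vm_fault above: .address is rounded down to the huge page boundary while .real_address keeps the exact faulting address. Worked example for a 2 MiB hstate, values illustrative:

	/* fault at 0x00007f1234567890:
	 *   vmf.real_address = 0x00007f1234567890
	 *   vmf.address      = 0x00007f1234400000   (address & huge_page_mask(h))
	 *   vmf.pgoff        = vma_hugecache_offset(h, vma, vmf.address)
	 */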
+ * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this + * point, so this check prevents the kernel from going below assuming + * that we have an active hugepage in pagecache. This goto expects + * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned) + * check will properly handle it. */ - if (!pte_present(entry)) { - if (unlikely(is_hugetlb_entry_migration(entry))) { + if (!pte_present(vmf.orig_pte)) { + if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) { /* * Release the hugetlb fault lock now, but retain * the vma lock, because it is needed to guard the @@ -6431,9 +6519,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * be released there. */ mutex_unlock(&hugetlb_fault_mutex_table[hash]); - migration_entry_wait_huge(vma, ptep); + migration_entry_wait_huge(vma, vmf.pte); return 0; - } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte))) ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; @@ -6447,37 +6535,31 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * determine if a reservation has been consumed. */ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && - !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) { - if (vma_needs_reservation(h, vma, haddr) < 0) { + !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) { + if (vma_needs_reservation(h, vma, vmf.address) < 0) { ret = VM_FAULT_OOM; goto out_mutex; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, haddr); + vma_end_reservation(h, vma, vmf.address); - pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, idx); + pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, + vmf.pgoff); if (IS_ERR(pagecache_folio)) pagecache_folio = NULL; } - ptl = huge_pte_lock(h, mm, ptep); + vmf.ptl = huge_pte_lock(h, mm, vmf.pte); /* Check for a racing update before calling hugetlb_wp() */ - if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) + if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(vmf.pte)))) goto out_ptl; /* Handle userfault-wp first, before trying to lock more pages */ - if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && - (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { + if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(vmf.pte)) && + (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) { if (!userfaultfd_wp_async(vma)) { - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .real_address = address, - .flags = flags, - }; - - spin_unlock(ptl); + spin_unlock(vmf.ptl); if (pagecache_folio) { folio_unlock(pagecache_folio); folio_put(pagecache_folio); @@ -6487,18 +6569,18 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, return handle_userfault(&vmf, VM_UFFD_WP); } - entry = huge_pte_clear_uffd_wp(entry); - set_huge_pte_at(mm, haddr, ptep, entry, + vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte); + set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte, huge_page_size(hstate_vma(vma))); /* Fallthrough to CoW */ } /* - * hugetlb_wp() requires page locks of pte_page(entry) and + * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and * pagecache_folio, so here we need take the former one * when folio != pagecache_folio or !pagecache_folio. 
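 * (pagecache_folio, when it exists, is already locked: it was returned
 * locked by filemap_lock_hugetlb_folio() above, so only the folio
 * mapped by the PTE still needs its lock here.)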
*/ - folio = page_folio(pte_page(entry)); + folio = page_folio(pte_page(vmf.orig_pte)); if (folio != pagecache_folio) if (!folio_trylock(folio)) { need_wait_lock = 1; @@ -6508,24 +6590,23 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, folio_get(folio); if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { - if (!huge_pte_write(entry)) { - ret = hugetlb_wp(mm, vma, address, ptep, flags, - pagecache_folio, ptl); + if (!huge_pte_write(vmf.orig_pte)) { + ret = hugetlb_wp(pagecache_folio, &vmf); goto out_put_page; } else if (likely(flags & FAULT_FLAG_WRITE)) { - entry = huge_pte_mkdirty(entry); + vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte); } } - entry = pte_mkyoung(entry); - if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, + vmf.orig_pte = pte_mkyoung(vmf.orig_pte); + if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte, flags & FAULT_FLAG_WRITE)) - update_mmu_cache(vma, haddr, ptep); + update_mmu_cache(vma, vmf.address, vmf.pte); out_put_page: if (folio != pagecache_folio) folio_unlock(folio); folio_put(folio); out_ptl: - spin_unlock(ptl); + spin_unlock(vmf.ptl); if (pagecache_folio) { folio_unlock(pagecache_folio); @@ -6561,7 +6642,13 @@ static struct folio *alloc_hugetlb_folio_vma(struct hstate *h, gfp_mask = htlb_alloc_mask(h); node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); + /* + * This is used to allocate a temporary hugetlb to hold the copied + * content, which will then be copied again to the final hugetlb + * consuming a reservation. Set the alloc_fallback to false to indicate + * that breaking the per-node hugetlb pool is not allowed in this case. + */ + folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask, false); mpol_cond_put(mpol); return folio; @@ -6688,11 +6775,20 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, } /* - * The memory barrier inside __folio_mark_uptodate makes sure that - * preceding stores to the page contents become visible before - * the set_pte_at() write. + * If we just allocated a new page, we need a memory barrier to ensure + * that preceding stores to the page become visible before the + * set_pte_at() write. The memory barrier inside __folio_mark_uptodate + * is what we need. + * + * In the case where we have not allocated a new page (is_continue), + * the page must already be uptodate. UFFDIO_CONTINUE already includes + * an earlier smp_wmb() to ensure that prior stores will be visible + * before the set_pte_at() write. */ - __folio_mark_uptodate(folio); + if (!is_continue) + __folio_mark_uptodate(folio); + else + WARN_ON_ONCE(!folio_test_uptodate(folio)); /* Add shared, newly allocated pages to the page cache. 
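 * UFFDIO_CONTINUE (is_continue) operates on pages that are already
 * present in the page cache, so only newly allocated pages for shared
 * mappings are inserted here.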
*/ if (vm_shared && !is_continue) { @@ -6782,77 +6878,6 @@ out_release_nounlock: } #endif /* CONFIG_USERFAULTFD */ -struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) -{ - struct hstate *h = hstate_vma(vma); - struct mm_struct *mm = vma->vm_mm; - unsigned long haddr = address & huge_page_mask(h); - struct page *page = NULL; - spinlock_t *ptl; - pte_t *pte, entry; - int ret; - - hugetlb_vma_lock_read(vma); - pte = hugetlb_walk(vma, haddr, huge_page_size(h)); - if (!pte) - goto out_unlock; - - ptl = huge_pte_lock(h, mm, pte); - entry = huge_ptep_get(pte); - if (pte_present(entry)) { - page = pte_page(entry); - - if (!huge_pte_write(entry)) { - if (flags & FOLL_WRITE) { - page = NULL; - goto out; - } - - if (gup_must_unshare(vma, flags, page)) { - /* Tell the caller to do unsharing */ - page = ERR_PTR(-EMLINK); - goto out; - } - } - - page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT)); - - /* - * Note that page may be a sub-page, and with vmemmap - * optimizations the page struct may be read only. - * try_grab_page() will increase the ref count on the - * head page, so this will be OK. - * - * try_grab_page() should always be able to get the page here, - * because we hold the ptl lock and have verified pte_present(). - */ - ret = try_grab_page(page, flags); - - if (WARN_ON_ONCE(ret)) { - page = ERR_PTR(ret); - goto out; - } - - *page_mask = (1U << huge_page_order(h)) - 1; - } -out: - spin_unlock(ptl); -out_unlock: - hugetlb_vma_unlock_read(vma); - - /* - * Fixup retval for dump requests: if pagecache doesn't exist, - * don't try to allocate a new page but just skip it. - */ - if (!page && (flags & FOLL_DUMP) && - !hugetlbfs_pagecache_present(h, vma, address)) - page = ERR_PTR(-EFAULT); - - return page; -} - long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) @@ -6943,9 +6968,13 @@ long hugetlb_change_protection(struct vm_area_struct *vma, if (!pte_same(pte, newpte)) set_huge_pte_at(mm, address, ptep, newpte, psize); } else if (unlikely(is_pte_marker(pte))) { - /* No other markers apply for now. */ - WARN_ON_ONCE(!pte_marker_uffd_wp(pte)); - if (uffd_wp_resolve) + /* + * Do nothing on a poison marker; page is + * corrupted, permissons do not apply. Here + * pte_marker_uffd_wp()==true implies !poison + * because they're mutual exclusive. + */ + if (pte_marker_uffd_wp(pte) && uffd_wp_resolve) /* Safe to modify directly (non-present->none). */ huge_pte_clear(mm, address, ptep, psize); } else if (!huge_pte_none(pte)) { @@ -7695,6 +7724,13 @@ void __init hugetlb_cma_reserve(int order) bool node_specific_cma_alloc = false; int nid; + /* + * HugeTLB CMA reservation is required for gigantic + * huge pages which could not be allocated via the + * page allocator. Just warn if there is any change + * breaking this assumption. + */ + VM_WARN_ON(order <= MAX_PAGE_ORDER); cma_reserve_called = true; if (!hugetlb_cma_size) @@ -7765,9 +7801,9 @@ void __init hugetlb_cma_reserve(int order) * huge page demotion. 
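 * The region below is therefore aligned to the gigantic page size
 * being reserved (PAGE_SIZE << order), with HUGETLB_PAGE_ORDER as the
 * per-bit order, the smallest unit in which a demoted page can be
 * returned to CMA.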
*/ res = cma_declare_contiguous_nid(0, size, 0, - PAGE_SIZE << HUGETLB_PAGE_ORDER, - 0, false, name, - &hugetlb_cma[nid], nid); + PAGE_SIZE << order, + HUGETLB_PAGE_ORDER, false, name, + &hugetlb_cma[nid], nid); if (res) { pr_warn("hugetlb_cma: reservation failed: err %d, node %d", res, nid); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index aa4486bd3904..e20339a346b9 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -308,7 +308,7 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, { if (hugetlb_cgroup_disabled() || !h_cg) return; - + lockdep_assert_held(&hugetlb_lock); __set_hugetlb_cgroup(folio, h_cg, rsvd); if (!rsvd) { unsigned long usage = diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index da177e49d956..b9a55322e52c 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -679,7 +679,6 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = { .mode = 0644, .proc_handler = proc_dobool, }, - { } }; static int __init hugetlb_vmemmap_init(void) diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index d0548e382b6b..c9d653f51e45 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -15,7 +15,7 @@ static int hwpoison_inject(void *data, u64 val) { unsigned long pfn = val; struct page *p; - struct page *hpage; + struct folio *folio; int err; if (!capable(CAP_SYS_ADMIN)) @@ -25,16 +25,17 @@ static int hwpoison_inject(void *data, u64 val) return -ENXIO; p = pfn_to_page(pfn); - hpage = compound_head(p); + folio = page_folio(p); if (!hwpoison_filter_enable) goto inject; - shake_page(hpage); + shake_folio(folio); /* * This implies unable to support non-LRU pages except free page. */ - if (!PageLRU(hpage) && !PageHuge(p) && !is_free_buddy_page(p)) + if (!folio_test_lru(folio) && !folio_test_hugetlb(folio) && + !is_free_buddy_page(p)) return 0; /* @@ -42,7 +43,7 @@ static int hwpoison_inject(void *data, u64 val) * the targeted owner (or on a free page). * memory_failure() will redo the check reliably inside page lock. */ - err = hwpoison_filter(hpage); + err = hwpoison_filter(&folio->page); if (err) return 0; diff --git a/mm/internal.h b/mm/internal.h index f309a010d50f..b2c75b12014e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -11,6 +11,8 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/swapops.h> #include <linux/tracepoint-defs.h> struct folio_batch; @@ -70,19 +72,207 @@ void page_writeback_init(void); /* * How many individual pages have an elevated _mapcount. Excludes * the folio's entire_mapcount. + * + * Don't use this function outside of debugging code. */ -static inline int folio_nr_pages_mapped(struct folio *folio) +static inline int folio_nr_pages_mapped(const struct folio *folio) { return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; } -static inline void *folio_raw_mapping(struct folio *folio) +/* + * Retrieve the first entry of a folio based on a provided entry within the + * folio. We cannot rely on folio->swap as there is no guarantee that it has + * been initialized. 
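 * The first entry is instead recovered by aligning the provided
 * entry's value down to the folio's page count, which assumes the
 * folio's swap slots form a naturally aligned, contiguous block.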
Used for calling arch_swap_restore() + */ +static inline swp_entry_t folio_swap(swp_entry_t entry, + const struct folio *folio) +{ + swp_entry_t swap = { + .val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)), + }; + + return swap; +} + +static inline void *folio_raw_mapping(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; return (void *)(mapping & ~PAGE_MAPPING_FLAGS); } +#ifdef CONFIG_MMU + +/* Flags for folio_pte_batch(). */ +typedef int __bitwise fpb_t; + +/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */ +#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0)) + +/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */ +#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1)) + +static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) +{ + if (flags & FPB_IGNORE_DIRTY) + pte = pte_mkclean(pte); + if (likely(flags & FPB_IGNORE_SOFT_DIRTY)) + pte = pte_clear_soft_dirty(pte); + return pte_wrprotect(pte_mkold(pte)); +} + +/** + * folio_pte_batch - detect a PTE batch for a large folio + * @folio: The large folio to detect a PTE batch for. + * @addr: The user virtual address the first page is mapped at. + * @start_ptep: Page table pointer for the first entry. + * @pte: Page table entry for the first page. + * @max_nr: The maximum number of table entries to consider. + * @flags: Flags to modify the PTE batch semantics. + * @any_writable: Optional pointer to indicate whether any entry except the + * first one is writable. + * @any_young: Optional pointer to indicate whether any entry except the + * first one is young. + * @any_dirty: Optional pointer to indicate whether any entry except the + * first one is dirty. + * + * Detect a PTE batch: consecutive (present) PTEs that map consecutive + * pages of the same large folio. + * + * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, + * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and + * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY). + * + * start_ptep must map any page of the folio. max_nr must be at least one and + * must be limited by the caller so scanning cannot exceed a single page table. + * + * Return: the number of table entries in the batch. + */ +static inline int folio_pte_batch(struct folio *folio, unsigned long addr, + pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, + bool *any_writable, bool *any_young, bool *any_dirty) +{ + unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); + const pte_t *end_ptep = start_ptep + max_nr; + pte_t expected_pte, *ptep; + bool writable, young, dirty; + int nr; + + if (any_writable) + *any_writable = false; + if (any_young) + *any_young = false; + if (any_dirty) + *any_dirty = false; + + VM_WARN_ON_FOLIO(!pte_present(pte), folio); + VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); + VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio); + + nr = pte_batch_hint(start_ptep, pte); + expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); + ptep = start_ptep + nr; + + while (ptep < end_ptep) { + pte = ptep_get(ptep); + if (any_writable) + writable = !!pte_write(pte); + if (any_young) + young = !!pte_young(pte); + if (any_dirty) + dirty = !!pte_dirty(pte); + pte = __pte_batch_clear_ignored(pte, flags); + + if (!pte_same(pte, expected_pte)) + break; + + /* + * Stop immediately once we reached the end of the folio. In + * corner cases the next PFN might fall into a different + * folio. 
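 * (expected_pte only ever advances by PFN, so a PTE pointing into a
 * physically contiguous neighbouring folio with otherwise identical
 * bits would still compare equal above.)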
+ */ + if (pte_pfn(pte) >= folio_end_pfn) + break; + + if (any_writable) + *any_writable |= writable; + if (any_young) + *any_young |= young; + if (any_dirty) + *any_dirty |= dirty; + + nr = pte_batch_hint(ptep, pte); + expected_pte = pte_advance_pfn(expected_pte, nr); + ptep += nr; + } + + return min(ptep - start_ptep, max_nr); +} + +/** + * pte_next_swp_offset - Increment the swap entry offset field of a swap pte. + * @pte: The initial pte state; is_swap_pte(pte) must be true and + * non_swap_entry() must be false. + * + * Increments the swap offset, while maintaining all other fields, including + * swap type, and any swp pte bits. The resulting pte is returned. + */ +static inline pte_t pte_next_swp_offset(pte_t pte) +{ + swp_entry_t entry = pte_to_swp_entry(pte); + pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry), + (swp_offset(entry) + 1))); + + if (pte_swp_soft_dirty(pte)) + new = pte_swp_mksoft_dirty(new); + if (pte_swp_exclusive(pte)) + new = pte_swp_mkexclusive(new); + if (pte_swp_uffd_wp(pte)) + new = pte_swp_mkuffd_wp(new); + + return new; +} + +/** + * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries + * @start_ptep: Page table pointer for the first entry. + * @max_nr: The maximum number of table entries to consider. + * @pte: Page table entry for the first entry. + * + * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs + * containing swap entries all with consecutive offsets and targeting the same + * swap type, all with matching swp pte bits. + * + * max_nr must be at least one and must be limited by the caller so scanning + * cannot exceed a single page table. + * + * Return: the number of table entries in the batch. + */ +static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) +{ + pte_t expected_pte = pte_next_swp_offset(pte); + const pte_t *end_ptep = start_ptep + max_nr; + pte_t *ptep = start_ptep + 1; + + VM_WARN_ON(max_nr < 1); + VM_WARN_ON(!is_swap_pte(pte)); + VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte))); + + while (ptep < end_ptep) { + pte = ptep_get(ptep); + + if (!pte_same(pte, expected_pte)) + break; + + expected_pte = pte_next_swp_offset(expected_pte); + ptep++; + } + + return ptep - start_ptep; +} +#endif /* CONFIG_MMU */ + void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, int nr_throttled); static inline void acct_reclaim_writeback(struct folio *folio) @@ -103,6 +293,7 @@ static inline void wake_throttle_isolated(pg_data_t *pgdat) wake_up(wqh); } +vm_fault_t vmf_anon_prepare(struct vm_fault *vmf); vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); @@ -397,6 +588,7 @@ extern void __putback_isolated_page(struct page *page, unsigned int order, extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order); +extern void kernel_init_pages(struct page *page, int numpages); /* * This will have no effect, other than possibly generating a warning, if the @@ -419,8 +611,8 @@ static inline struct folio *page_rmappable_folio(struct page *page) { struct folio *folio = (struct folio *)page; - if (folio && folio_order(folio) > 1) - folio_prep_large_rmappable(folio); + if (folio && folio_test_large(folio)) + folio_set_large_rmappable(folio); return folio; } @@ -429,9 +621,12 @@ static inline void prep_compound_head(struct page *page, unsigned int order) struct folio *folio = (struct 
folio *)page; folio_set_order(folio, order); + atomic_set(&folio->_large_mapcount, -1); atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); + if (order > 1) + INIT_LIST_HEAD(&folio->_deferred_list); } static inline void prep_compound_tail(struct page *head, int tail_idx) @@ -447,10 +642,12 @@ extern void prep_compound_page(struct page *page, unsigned int order); extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); +extern bool free_pages_prepare(struct page *page, unsigned int order); + extern int user_min_free_kbytes; -extern void free_unref_page(struct page *page, unsigned int order); -extern void free_unref_page_list(struct list_head *list); +void free_unref_page(struct page *page, unsigned int order); +void free_unref_folios(struct folio_batch *fbatch); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); @@ -464,10 +661,6 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int); - -int split_free_page(struct page *free_page, - unsigned int order, unsigned long split_pfn_offset); - #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* @@ -481,7 +674,7 @@ int split_free_page(struct page *free_page, * completes when free_pfn <= migrate_pfn */ struct compact_control { - struct list_head freepages; /* List of free pages to migrate to */ + struct list_head freepages[NR_PAGE_ORDERS]; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ unsigned int nr_freepages; /* Number of isolated free pages */ unsigned int nr_migratepages; /* Number of pages to migrate */ @@ -537,7 +730,8 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, + int migratetype); /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void init_cma_reserved_pageblock(struct page *page); @@ -590,9 +784,8 @@ struct anon_vma *folio_anon_vma(struct folio *folio); void unmap_mapping_folio(struct folio *folio); extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); -extern long faultin_vma_page_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - bool write, int *locked); +extern long faultin_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool write, int *locked); extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, unsigned long bytes); @@ -694,13 +887,17 @@ void mlock_drain_remote(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); -/* - * Return the start of user virtual address at the specific offset within - * a vma. +/** + * vma_address - Find the virtual address a page range is mapped at + * @vma: The vma which maps this object. + * @pgoff: The page offset within its object. + * @nr_pages: The number of pages to consider. + * + * If any page in this range is mapped by this VMA, return the first address + * where any of these pages appear. Otherwise, return -EFAULT. 
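 *
 * For the common case where @pgoff itself falls within the VMA, this
 * evaluates to roughly:
 *
 *	vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT)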
*/ -static inline unsigned long -vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages, - struct vm_area_struct *vma) +static inline unsigned long vma_address(struct vm_area_struct *vma, + pgoff_t pgoff, unsigned long nr_pages) { unsigned long address; @@ -720,18 +917,6 @@ vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages, } /* - * Return the start of user virtual address of a page within a vma. - * Returns -EFAULT if all of the page is outside the range of vma. - * If page is a compound head, the entire compound page is considered. - */ -static inline unsigned long -vma_address(struct page *page, struct vm_area_struct *vma) -{ - VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ - return vma_pgoff_address(page_to_pgoff(page), compound_nr(page), vma); -} - -/* * Then at what user virtual address will none of the range be found in vma? * Assumes that vma_address() already returned a good starting address. */ @@ -852,6 +1037,7 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask) /* * mm/memory-failure.c */ +void shake_folio(struct folio *folio); extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; @@ -945,17 +1131,13 @@ static inline bool is_migrate_highatomic(enum migratetype migratetype) return migratetype == MIGRATE_HIGHATOMIC; } -static inline bool is_migrate_highatomic_page(struct page *page) -{ - return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; -} - void setup_zone_pageset(struct zone *zone); struct migration_target_control { int nid; /* preferred node id */ nodemask_t *nmask; gfp_t gfp_mask; + enum migrate_reason reason; }; /* @@ -992,10 +1174,10 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); void __vunmap_range_noflush(unsigned long start, unsigned long end); -int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, +int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf, unsigned long addr, int page_nid, int *flags); -void free_zone_device_page(struct page *page); +void free_zone_device_folio(struct folio *folio); int migrate_device_coherent_page(struct page *page); /* @@ -1007,9 +1189,10 @@ int __must_check try_grab_page(struct page *page, unsigned int flags); /* * mm/huge_memory.c */ -struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmd, - unsigned int flags); +void touch_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, bool write); +void touch_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, bool write); /* * mm/mmap.c @@ -1031,10 +1214,13 @@ enum { FOLL_FAST_ONLY = 1 << 20, /* allow unlocking the mmap lock */ FOLL_UNLOCKABLE = 1 << 21, + /* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */ + FOLL_MADV_POPULATE = 1 << 22, }; #define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \ - FOLL_FAST_ONLY | FOLL_UNLOCKABLE) + FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \ + FOLL_MADV_POPULATE) /* * Indicates for which pages that are write-protected in the page table, @@ -1091,20 +1277,10 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma, } /* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */ - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_rmb(); /* - * During GUP-fast we might not get called on the head page for a - * hugetlb page that is mapped using cont-PTE, because GUP-fast does - * not work with the abstracted hugetlb PTEs that always point at the - * head page. 
For hugetlb, PageAnonExclusive only applies on the head - * page (as it cannot be partially COW-shared), so lookup the head page. - */ - if (unlikely(!PageHead(page) && PageHuge(page))) - page = compound_head(page); - - /* * Note that PageKsm() pages cannot be exclusive, and consequently, * cannot get pinned. */ @@ -1114,6 +1290,15 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma, extern bool mirrored_kernelcore; extern bool memblock_has_mirror(void); +static __always_inline void vma_set_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + pgoff_t pgoff) +{ + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; +} + static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) { /* @@ -1138,6 +1323,35 @@ static inline void vma_iter_config(struct vma_iterator *vmi, __mas_set_range(&vmi->mas, index, last - 1); } +static inline void vma_iter_reset(struct vma_iterator *vmi) +{ + mas_reset(&vmi->mas); +} + +static inline +struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min) +{ + return mas_prev_range(&vmi->mas, min); +} + +static inline +struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max) +{ + return mas_next_range(&vmi->mas, max); +} + +static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min, + unsigned long max, unsigned long size) +{ + return mas_empty_area(&vmi->mas, min, max - 1, size); +} + +static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min, + unsigned long max, unsigned long size) +{ + return mas_empty_area_rev(&vmi->mas, min, max - 1, size); +} + /* * VMA Iterator functions shared between nommu and mmap */ @@ -1221,6 +1435,43 @@ void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); +#ifdef CONFIG_64BIT +/* VM is sealed, in vm_flags */ +#define VM_SEALED _BITUL(63) +#endif + +#ifdef CONFIG_64BIT +static inline int can_do_mseal(unsigned long flags) +{ + if (flags) + return -EINVAL; + + return 0; +} + +bool can_modify_mm(struct mm_struct *mm, unsigned long start, + unsigned long end); +bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, + unsigned long end, int behavior); +#else +static inline int can_do_mseal(unsigned long flags) +{ + return -EPERM; +} + +static inline bool can_modify_mm(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + return true; +} + +static inline bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, + unsigned long end, int behavior) +{ + return true; +} +#endif + #ifdef CONFIG_SHRINKER_DEBUG static inline __printf(2, 0) int shrinker_debugfs_name_alloc( struct shrinker *shrinker, const char *fmt, va_list ap) @@ -1266,4 +1517,8 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, } #endif /* CONFIG_SHRINKER_DEBUG */ +/* Only track the nodes of mappings with shadow entries */ +void workingset_update_node(struct xa_node *node); +extern struct list_lru shadow_nodes; + #endif /* __MM_INTERNAL_H */ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 6ca63e8dda74..e7c9a4dc89f8 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -55,7 +55,7 @@ void kasan_set_track(struct kasan_track *track, depot_stack_handle_t stack) u64 ts_nsec = local_clock(); track->cpu = cpu; - track->timestamp = ts_nsec >> 3; + track->timestamp = ts_nsec >> 9; #endif /* CONFIG_KASAN_EXTRA_INFO */ track->pid 
= current->pid; track->stack = stack; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 1900f8576034..6310a180278b 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -334,14 +334,6 @@ DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); -/* Only allow cache merging when no per-object metadata is present. */ -slab_flags_t kasan_never_merge(void) -{ - if (!kasan_requires_meta()) - return 0; - return SLAB_KASAN; -} - /* * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. * For larger allocations larger redzones are used. @@ -370,15 +362,13 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, return; /* - * SLAB_KASAN is used to mark caches that are sanitized by KASAN - * and that thus have per-object metadata. - * Currently this flag is used in two places: - * 1. In slab_ksize() to account for per-object metadata when - * calculating the size of the accessible memory within the object. - * 2. In slab_common.c via kasan_never_merge() to prevent merging of - * caches with per-object metadata. + * SLAB_KASAN is used to mark caches that are sanitized by KASAN and + * that thus have per-object metadata. Currently, this flag is used in + * slab_ksize() to account for per-object metadata when calculating the + * size of the accessible memory within the object. Additionally, we use + * SLAB_NO_MERGE to prevent merging of caches with per-object metadata. */ - *flags |= SLAB_KASAN; + *flags |= SLAB_KASAN | SLAB_NO_MERGE; ok_size = *size; diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 2b994092a2d4..9958ebc15d38 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -16,6 +16,7 @@ #include <linux/static_key.h> #include <linux/string.h> #include <linux/types.h> +#include <linux/vmalloc.h> #include "kasan.h" diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 971cfff4ca0b..7b32be2a3cf0 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -440,7 +440,8 @@ static void kmalloc_oob_16(struct kunit *test) /* This test is specifically crafted for the generic mode. 
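 * (presumably because the tag-based modes track allocations at 16-byte
 * granularity, so the deliberately short kmalloc() below is rounded up
 * and the 16-byte access would not be flagged there)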
*/ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); - ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL); + /* RELOC_HIDE to prevent gcc from warning about short alloc */ + ptr1 = RELOC_HIDE(kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL), 0); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL); @@ -697,6 +698,84 @@ static void kmalloc_uaf3(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[8]); } +static void kasan_atomics_helper(struct kunit *test, void *unsafe, void *safe) +{ + int *i_unsafe = unsafe; + + KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*i_unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, WRITE_ONCE(*i_unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, smp_load_acquire(i_unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, smp_store_release(i_unsafe, 42)); + + KUNIT_EXPECT_KASAN_FAIL(test, atomic_read(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_set(unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_add(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_and(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_andnot(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_or(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_xor(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_xchg(unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_cmpxchg(unsafe, 21, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(unsafe, safe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(safe, unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub_and_test(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_and_test(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_and_test(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_negative(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_unless(unsafe, 21, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_not_zero(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_unless_negative(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_unless_positive(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_if_positive(unsafe)); + + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_read(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_set(unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_and(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_andnot(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_or(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_xor(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_xchg(unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_cmpxchg(unsafe, 21, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(unsafe, safe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(safe, unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub_and_test(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_and_test(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_and_test(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_negative(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_unless(unsafe, 21, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_not_zero(unsafe)); + 
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_unless_negative(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_unless_positive(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_if_positive(unsafe)); +} + +static void kasan_atomics(struct kunit *test) +{ + void *a1, *a2; + + /* + * Just as with kasan_bitops_tags(), we allocate 48 bytes of memory such + * that the following 16 bytes will make up the redzone. + */ + a1 = kzalloc(48, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, a1); + a2 = kzalloc(sizeof(atomic_long_t), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, a2); + + /* Use atomics to access the redzone. */ + kasan_atomics_helper(test, a1 + 48, a2); + + kfree(a1); + kfree(a2); +} + static void kmalloc_double_kzfree(struct kunit *test) { char *ptr; @@ -1883,6 +1962,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kasan_strings), KUNIT_CASE(kasan_bitops_generic), KUNIT_CASE(kasan_bitops_tags), + KUNIT_CASE(kasan_atomics), KUNIT_CASE(vmalloc_helpers_tags), KUNIT_CASE(vmalloc_oob), KUNIT_CASE(vmap_tags), diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c index 8b7b3ea2c74e..27ec22767e42 100644 --- a/mm/kasan/kasan_test_module.c +++ b/mm/kasan/kasan_test_module.c @@ -62,7 +62,7 @@ static noinline void __init copy_user_test(void) kfree(kmem); } -static int __init test_kasan_module_init(void) +static int __init kasan_test_module_init(void) { /* * Temporarily enable multi-shot mode. Otherwise, KASAN would only @@ -77,5 +77,5 @@ static int __init test_kasan_module_init(void) return -EAGAIN; } -module_init(test_kasan_module_init); +module_init(kasan_test_module_init); MODULE_LICENSE("GPL"); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 7afa4feb03e1..b48c768acc84 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -267,7 +267,7 @@ static void print_track(struct kasan_track *track, const char *prefix) u64 ts_nsec = track->timestamp; unsigned long rem_usec; - ts_nsec <<= 3; + ts_nsec <<= 9; rem_usec = do_div(ts_nsec, NSEC_PER_SEC) / 1000; pr_err("%s by task %u on cpu %d at %lu.%06lus:\n", diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 9ef84f31833f..d6210ca48dda 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -199,19 +199,12 @@ static bool shadow_mapped(unsigned long addr) pud = pud_offset(p4d, addr); if (pud_none(*pud)) return false; - - /* - * We can't use pud_large() or pud_huge(), the first one is - * arch-specific, the last one depends on HUGETLB_PAGE. So let's abuse - * pud_bad(), if pud is bad then it's bad because it's huge. 
- */ - if (pud_bad(*pud)) + if (pud_leaf(*pud)) return true; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return false; - - if (pmd_bad(*pmd)) + if (pmd_leaf(*pmd)) return true; pte = pte_offset_kernel(pmd, addr); return !pte_none(ptep_get(pte)); diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 8350f5c06f2e..964b8482275b 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -595,9 +595,9 @@ static unsigned long kfence_init_pool(void) continue; __folio_set_slab(slab_folio(slab)); -#ifdef CONFIG_MEMCG - slab->memcg_data = (unsigned long)&kfence_metadata_init[i / 2 - 1].objcg | - MEMCG_DATA_OBJCGS; +#ifdef CONFIG_MEMCG_KMEM + slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts | + MEMCG_DATA_OBJEXTS; #endif } @@ -645,8 +645,8 @@ reset_slab: if (!i || (i % 2)) continue; -#ifdef CONFIG_MEMCG - slab->memcg_data = 0; +#ifdef CONFIG_MEMCG_KMEM + slab->obj_exts = 0; #endif __folio_clear_slab(slab_folio(slab)); } @@ -1139,8 +1139,8 @@ void __kfence_free(void *addr) { struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); -#ifdef CONFIG_MEMCG - KFENCE_WARN_ON(meta->objcg); +#ifdef CONFIG_MEMCG_KMEM + KFENCE_WARN_ON(meta->obj_exts.objcg); #endif /* * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index f46fbb03062b..084f5f36e8e7 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -97,8 +97,8 @@ struct kfence_metadata { struct kfence_track free_track; /* For updating alloc_covered on frees. */ u32 alloc_stack_hash; -#ifdef CONFIG_MEMCG - struct obj_cgroup *objcg; +#ifdef CONFIG_MEMCG_KMEM + struct slabobj_ext obj_exts; #endif }; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 2b219acb528e..774a97e6e2da 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -410,6 +410,12 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } +static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) +{ + return hpage_collapse_test_exit(mm) || + test_bit(MMF_DISABLE_THP, &mm->flags); +} + void __khugepaged_enter(struct mm_struct *mm) { struct khugepaged_mm_slot *mm_slot; @@ -447,7 +453,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_flags_enabled()) { - if (thp_vma_allowable_order(vma, vm_flags, false, false, true, + if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); } @@ -577,7 +583,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, folio = page_folio(page); VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio); - if (page_mapcount(page) > 1) { + /* See hpage_collapse_scan_pmd(). 
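 * A folio is counted as shared when it appears to be mapped by more
 * than one process; see the comment added in that function for why
 * false negatives are tolerated.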
*/ + if (folio_likely_mapped_shared(folio)) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { @@ -683,9 +690,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, spinlock_t *ptl, struct list_head *compound_pagelist) { - struct folio *src_folio; - struct page *src_page; - struct page *tmp; + struct folio *src, *tmp; pte_t *_pte; pte_t pteval; @@ -704,10 +709,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, ksm_might_unmap_zero_page(vma->vm_mm, pteval); } } else { - src_page = pte_page(pteval); - src_folio = page_folio(src_page); - if (!folio_test_large(src_folio)) - release_pte_folio(src_folio); + struct page *src_page = pte_page(pteval); + + src = page_folio(src_page); + if (!folio_test_large(src)) + release_pte_folio(src); /* * ptl mostly unnecessary, but preempt has to * be disabled to update the per-cpu stats @@ -715,20 +721,19 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, */ spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); - folio_remove_rmap_pte(src_folio, src_page, vma); + folio_remove_rmap_pte(src, src_page, vma); spin_unlock(ptl); free_page_and_swap_cache(src_page); } } - list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { - list_del(&src_page->lru); - mod_node_page_state(page_pgdat(src_page), - NR_ISOLATED_ANON + page_is_file_lru(src_page), - -compound_nr(src_page)); - unlock_page(src_page); - free_swap_cache(src_page); - putback_lru_page(src_page); + list_for_each_entry_safe(src, tmp, compound_pagelist, lru) { + list_del(&src->lru); + node_stat_sub_folio(src, NR_ISOLATED_ANON + + folio_is_file_lru(src)); + folio_unlock(src); + free_swap_cache(src); + folio_putback_lru(src); } } @@ -763,7 +768,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte, * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC. * * @pte: starting of the PTEs to copy from - * @page: the new hugepage to copy contents to + * @folio: the new hugepage to copy contents to * @pmd: pointer to the new hugepage's PMD * @orig_pmd: the original raw pages' PMD * @vma: the original raw pages' virtual memory area @@ -771,33 +776,29 @@ static void __collapse_huge_page_copy_failed(pte_t *pte, * @ptl: lock on raw pages' PTEs * @compound_pagelist: list that stores compound pages */ -static int __collapse_huge_page_copy(pte_t *pte, - struct page *page, - pmd_t *pmd, - pmd_t orig_pmd, - struct vm_area_struct *vma, - unsigned long address, - spinlock_t *ptl, - struct list_head *compound_pagelist) +static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio, + pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, + unsigned long address, spinlock_t *ptl, + struct list_head *compound_pagelist) { - struct page *src_page; - pte_t *_pte; - pte_t pteval; - unsigned long _address; + unsigned int i; int result = SCAN_SUCCEED; /* * Copying pages' contents is subject to memory poison at any iteration. 
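 * copy_mc_user_highpage() returns non-zero if the copy is interrupted
 * by a memory error on the source page, in which case the collapse is
 * aborted with SCAN_COPY_MC instead of consuming the poisoned data.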
*/ - for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR; - _pte++, page++, _address += PAGE_SIZE) { - pteval = ptep_get(_pte); + for (i = 0; i < HPAGE_PMD_NR; i++) { + pte_t pteval = ptep_get(pte + i); + struct page *page = folio_page(folio, i); + unsigned long src_addr = address + i * PAGE_SIZE; + struct page *src_page; + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - clear_user_highpage(page, _address); + clear_user_highpage(page, src_addr); continue; } src_page = pte_page(pteval); - if (copy_mc_user_highpage(page, src_page, _address, vma) > 0) { + if (copy_mc_user_highpage(page, src_page, src_addr, vma) > 0) { result = SCAN_COPY_MC; break; } @@ -887,20 +888,6 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc) } #endif -static bool hpage_collapse_alloc_folio(struct folio **folio, gfp_t gfp, int node, - nodemask_t *nmask) -{ - *folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, nmask); - - if (unlikely(!*folio)) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - return false; - } - - count_vm_event(THP_COLLAPSE_ALLOC); - return true; -} - /* * If mmap_lock temporarily dropped, revalidate vma * before taking mmap_lock. @@ -913,8 +900,9 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, struct collapse_control *cc) { struct vm_area_struct *vma; + unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0; - if (unlikely(hpage_collapse_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; *vmap = vma = find_vma(mm, address); @@ -923,8 +911,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, - cc->is_khugepaged, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -1055,7 +1042,7 @@ out: return result; } -static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, +static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, struct collapse_control *cc) { gfp_t gfp = (cc->is_khugepaged ? 
alloc_hugepage_khugepaged_gfpmask() : @@ -1063,20 +1050,23 @@ static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, int node = hpage_collapse_find_target_node(cc); struct folio *folio; - if (!hpage_collapse_alloc_folio(&folio, gfp, node, &cc->alloc_nmask)) { - *hpage = NULL; + folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask); + if (!folio) { + *foliop = NULL; + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); return SCAN_ALLOC_HUGE_PAGE_FAIL; } + count_vm_event(THP_COLLAPSE_ALLOC); if (unlikely(mem_cgroup_charge(folio, mm, gfp))) { folio_put(folio); - *hpage = NULL; + *foliop = NULL; return SCAN_CGROUP_CHARGE_FAIL; } count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1); - *hpage = folio_page(folio, 0); + *foliop = folio; return SCAN_SUCCEED; } @@ -1089,7 +1079,6 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, pte_t *pte; pgtable_t pgtable; struct folio *folio; - struct page *hpage; spinlock_t *pmd_ptl, *pte_ptl; int result = SCAN_FAIL; struct vm_area_struct *vma; @@ -1105,7 +1094,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, */ mmap_read_unlock(mm); - result = alloc_charge_hpage(&hpage, mm, cc); + result = alloc_charge_folio(&folio, mm, cc); if (result != SCAN_SUCCEED) goto out_nolock; @@ -1165,7 +1154,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, * huge and small TLB entries for the same virtual address to * avoid the risk of CPU bugs in that area. * - * Parallel fast GUP is fine since fast GUP will back off when + * Parallel GUP-fast is fine since GUP-fast will back off when * it detects PMD is changed. */ _pmd = pmdp_collapse_flush(vma, address, pmd); @@ -1204,14 +1193,13 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, */ anon_vma_unlock_write(vma->anon_vma); - result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd, + result = __collapse_huge_page_copy(pte, folio, pmd, _pmd, vma, address, pte_ptl, &compound_pagelist); pte_unmap(pte); if (unlikely(result != SCAN_SUCCEED)) goto out_up_write; - folio = page_folio(hpage); /* * The smp_wmb() inside __folio_mark_uptodate() ensures the * copy_huge_page writes become visible before the set_pmd_at() @@ -1220,7 +1208,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, __folio_mark_uptodate(folio); pgtable = pmd_pgtable(_pmd); - _pmd = mk_huge_pmd(hpage, vma->vm_page_prot); + _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); spin_lock(pmd_ptl); @@ -1232,14 +1220,14 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, update_mmu_cache_pmd(vma, address, pmd); spin_unlock(pmd_ptl); - hpage = NULL; + folio = NULL; result = SCAN_SUCCEED; out_up_write: mmap_write_unlock(mm); out_nolock: - if (hpage) - put_page(hpage); + if (folio) + folio_put(folio); trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); return result; } @@ -1330,8 +1318,20 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, result = SCAN_PAGE_NULL; goto out_unmap; } + folio = page_folio(page); - if (page_mapcount(page) > 1) { + if (!folio_test_anon(folio)) { + result = SCAN_PAGE_ANON; + goto out_unmap; + } + + /* + * We treat a single page as shared if any part of the THP + * is shared. "False negatives" from + * folio_likely_mapped_shared() are not expected to matter + * much in practice. 
+ */ + if (folio_likely_mapped_shared(folio)) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { @@ -1341,7 +1341,6 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, } } - folio = page_folio(page); /* * Record which node the original page is from and save this * information to cc->node_load[]. @@ -1362,16 +1361,12 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, result = SCAN_PAGE_LOCK; goto out_unmap; } - if (!folio_test_anon(folio)) { - result = SCAN_PAGE_ANON; - goto out_unmap; - } /* * Check if the page has any GUP (or other external) pins. * * Here the check may be racy: - * it may see total_mapcount > refcount in some cases? + * it may see folio_mapcount() > folio_ref_count(). * But such case is ephemeral we could always retry collapse * later. However it may report false positive if the page * has excessive GUP pins (i.e. 512). Anyway the same check @@ -1506,8 +1501,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * and map it by a PMD, regardless of sysfs THP settings. As such, let's * analogously elide sysfs THP settings here. */ - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false, - PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -1634,7 +1628,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, /* step 3: set proper refcount and mm_counters. */ if (nr_ptes) { folio_ref_sub(folio, nr_ptes); - add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes); + add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); } /* step 4: remove empty page table */ @@ -1665,7 +1659,7 @@ abort: if (nr_ptes) { flush_tlb_mm(mm); folio_ref_sub(folio, nr_ptes); - add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes); + add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); } if (start_pte) pte_unmap_unlock(start_pte, ptl); @@ -1793,29 +1787,26 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; - struct page *hpage; - struct page *page; - struct page *tmp; - struct folio *folio; + struct page *dst; + struct folio *folio, *tmp, *new_folio; pgoff_t index = 0, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; bool is_shmem = shmem_file(file); - int nr = 0; VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); - result = alloc_charge_hpage(&hpage, mm, cc); + result = alloc_charge_folio(&new_folio, mm, cc); if (result != SCAN_SUCCEED) goto out; - __SetPageLocked(hpage); + __folio_set_locked(new_folio); if (is_shmem) - __SetPageSwapBacked(hpage); - hpage->index = start; - hpage->mapping = mapping; + __folio_set_swapbacked(new_folio); + new_folio->index = start; + new_folio->mapping = mapping; /* * Ensure we have slots for all the pages in the range. 
This is @@ -1835,11 +1826,11 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, for (index = start; index < end; index++) { xas_set(&xas, index); - page = xas_load(&xas); + folio = xas_load(&xas); VM_BUG_ON(index != xas.xa_index); if (is_shmem) { - if (!page) { + if (!folio) { /* * Stop if extent has been truncated or * hole-punched, and is now completely @@ -1855,7 +1846,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, continue; } - if (xa_is_value(page) || !PageUptodate(page)) { + if (xa_is_value(folio) || !folio_test_uptodate(folio)) { xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if (shmem_get_folio(mapping->host, index, @@ -1865,28 +1856,27 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, } /* drain lru cache to help isolate_lru_page() */ lru_add_drain(); - page = folio_file_page(folio, index); - } else if (trylock_page(page)) { - get_page(page); + } else if (folio_trylock(folio)) { + folio_get(folio); xas_unlock_irq(&xas); } else { result = SCAN_PAGE_LOCK; goto xa_locked; } } else { /* !is_shmem */ - if (!page || xa_is_value(page)) { + if (!folio || xa_is_value(folio)) { xas_unlock_irq(&xas); page_cache_sync_readahead(mapping, &file->f_ra, file, index, end - index); /* drain lru cache to help isolate_lru_page() */ lru_add_drain(); - page = find_lock_page(mapping, index); - if (unlikely(page == NULL)) { + folio = filemap_lock_folio(mapping, index); + if (IS_ERR(folio)) { result = SCAN_FAIL; goto xa_unlocked; } - } else if (PageDirty(page)) { + } else if (folio_test_dirty(folio)) { /* * khugepaged only works on read-only fd, * so this page is dirty because it hasn't @@ -1904,12 +1894,12 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, filemap_flush(mapping); result = SCAN_FAIL; goto xa_unlocked; - } else if (PageWriteback(page)) { + } else if (folio_test_writeback(folio)) { xas_unlock_irq(&xas); result = SCAN_FAIL; goto xa_unlocked; - } else if (trylock_page(page)) { - get_page(page); + } else if (folio_trylock(folio)) { + folio_get(folio); xas_unlock_irq(&xas); } else { result = SCAN_PAGE_LOCK; @@ -1918,35 +1908,31 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, } /* - * The page must be locked, so we can drop the i_pages lock + * The folio must be locked, so we can drop the i_pages lock * without racing with truncate. */ - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - /* make sure the page is up to date */ - if (unlikely(!PageUptodate(page))) { + /* make sure the folio is up to date */ + if (unlikely(!folio_test_uptodate(folio))) { result = SCAN_FAIL; goto out_unlock; } /* * If file was truncated then extended, or hole-punched, before - * we locked the first page, then a THP might be there already. + * we locked the first folio, then a THP might be there already. * This will be discovered on the first iteration. */ - if (PageTransCompound(page)) { - struct page *head = compound_head(page); - - result = compound_order(head) == HPAGE_PMD_ORDER && - head->index == start + if (folio_test_large(folio)) { + result = folio_order(folio) == HPAGE_PMD_ORDER && + folio->index == start /* Maybe PMD-mapped */ ? 
SCAN_PTE_MAPPED_HUGEPAGE : SCAN_PAGE_COMPOUND; goto out_unlock; } - folio = page_folio(page); - if (folio_mapping(folio) != mapping) { result = SCAN_TRUNCATED; goto out_unlock; @@ -1956,7 +1942,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, folio_test_writeback(folio))) { /* * khugepaged only works on read-only fd, so this - * page is dirty because it hasn't been flushed + * folio is dirty because it hasn't been flushed * since first write. */ result = SCAN_FAIL; @@ -1980,33 +1966,34 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, xas_lock_irq(&xas); - VM_BUG_ON_PAGE(page != xa_load(xas.xa, index), page); + VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio); /* - * We control three references to the page: + * We control three references to the folio: * - we hold a pin on it; * - one reference from page cache; - * - one from isolate_lru_page; - * If those are the only references, then any new usage of the - * page will have to fetch it from the page cache. That requires - * locking the page to handle truncate, so any new usage will be - * blocked until we unlock page after collapse/during rollback. + * - one from lru_isolate_folio; + * If those are the only references, then any new usage + * of the folio will have to fetch it from the page + * cache. That requires locking the folio to handle + * truncate, so any new usage will be blocked until we + * unlock folio after collapse/during rollback. */ - if (page_count(page) != 3) { + if (folio_ref_count(folio) != 3) { result = SCAN_PAGE_COUNT; xas_unlock_irq(&xas); - putback_lru_page(page); + folio_putback_lru(folio); goto out_unlock; } /* - * Accumulate the pages that are being collapsed. + * Accumulate the folios that are being collapsed. */ - list_add_tail(&page->lru, &pagelist); + list_add_tail(&folio->lru, &pagelist); continue; out_unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto xa_unlocked; } @@ -2045,23 +2032,27 @@ xa_unlocked: } /* - * The old pages are locked, so they won't change anymore. + * The old folios are locked, so they won't change anymore. */ index = start; - list_for_each_entry(page, &pagelist, lru) { - while (index < page->index) { - clear_highpage(hpage + (index % HPAGE_PMD_NR)); + dst = folio_page(new_folio, 0); + list_for_each_entry(folio, &pagelist, lru) { + while (index < folio->index) { + clear_highpage(dst); index++; + dst++; } - if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR), page) > 0) { + if (copy_mc_highpage(dst, folio_page(folio, 0)) > 0) { result = SCAN_COPY_MC; goto rollback; } index++; + dst++; } while (index < end) { - clear_highpage(hpage + (index % HPAGE_PMD_NR)); + clear_highpage(dst); index++; + dst++; } if (nr_none) { @@ -2089,16 +2080,17 @@ xa_unlocked: } /* - * If userspace observed a missing page in a VMA with a MODE_MISSING - * userfaultfd, then it might expect a UFFD_EVENT_PAGEFAULT for that - * page. If so, we need to roll back to avoid suppressing such an - * event. Since wp/minor userfaultfds don't give userspace any - * guarantees that the kernel doesn't fill a missing page with a zero - * page, so they don't matter here. + * If userspace observed a missing page in a VMA with + * a MODE_MISSING userfaultfd, then it might expect a + * UFFD_EVENT_PAGEFAULT for that page. If so, we need to + * roll back to avoid suppressing such an event. 
Since + * wp/minor userfaultfds don't give userspace any + * guarantees that the kernel doesn't fill a missing + * page with a zero page, so they don't matter here. * - * Any userfaultfds registered after this point will not be able to - * observe any missing pages due to the previously inserted retry - * entries. + * Any userfaultfds registered after this point will + * not be able to observe any missing pages due to the + * previously inserted retry entries. */ vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) { if (userfaultfd_missing(vma)) { @@ -2123,33 +2115,32 @@ immap_locked: xas_lock_irq(&xas); } - folio = page_folio(hpage); - nr = folio_nr_pages(folio); if (is_shmem) - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr); + __lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); else - __lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr); + __lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); if (nr_none) { - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_none); + __lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none); /* nr_none is always 0 for non-shmem. */ - __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_none); + __lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none); } /* - * Mark hpage as uptodate before inserting it into the page cache so - * that it isn't mistaken for an fallocated but unwritten page. + * Mark new_folio as uptodate before inserting it into the + * page cache so that it isn't mistaken for an fallocated but + * unwritten page. */ - folio_mark_uptodate(folio); - folio_ref_add(folio, HPAGE_PMD_NR - 1); + folio_mark_uptodate(new_folio); + folio_ref_add(new_folio, HPAGE_PMD_NR - 1); if (is_shmem) - folio_mark_dirty(folio); - folio_add_lru(folio); + folio_mark_dirty(new_folio); + folio_add_lru(new_folio); /* Join all the small entries into a single multi-index entry. */ xas_set_order(&xas, start, HPAGE_PMD_ORDER); - xas_store(&xas, folio); + xas_store(&xas, new_folio); WARN_ON_ONCE(xas_error(&xas)); xas_unlock_irq(&xas); @@ -2160,18 +2151,18 @@ immap_locked: retract_page_tables(mapping, start); if (cc && !cc->is_khugepaged) result = SCAN_PTE_MAPPED_HUGEPAGE; - folio_unlock(folio); + folio_unlock(new_folio); /* - * The collapse has succeeded, so free the old pages. + * The collapse has succeeded, so free the old folios. 
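The rewritten copy loop drops the old hpage + (index % HPAGE_PMD_NR) arithmetic in favour of a dst cursor that walks the huge page while index tracks the file offset: holes in the pagelist are cleared, present folios are copied. A user-space model of that shape (plain C, nothing kernel-specific, names invented):

#include <stdio.h>
#include <string.h>

#define NR_SLOTS 8

struct src_block { int index; char data[4]; };

int main(void)
{
        /* sparse source blocks, sorted by index (like the pagelist) */
        struct src_block src[] = { { 1, "aaa" }, { 2, "bbb" }, { 5, "ccc" } };
        char dst[NR_SLOTS][4];
        char (*cursor)[4] = dst;        /* the moving "dst" cursor */
        int index = 0, i;

        for (i = 0; i < 3; i++) {
                while (index < src[i].index) {  /* hole: clear the slot */
                        memset(cursor, 0, sizeof(*cursor));
                        index++;
                        cursor++;
                }
                memcpy(cursor, src[i].data, sizeof(*cursor));   /* present: copy */
                index++;
                cursor++;
        }
        while (index < NR_SLOTS) {              /* trailing hole */
                memset(cursor, 0, sizeof(*cursor));
                index++;
                cursor++;
        }
        for (i = 0; i < NR_SLOTS; i++)
                printf("%d: '%s'\n", i, dst[i]);
        return 0;
}

Run, it prints the cleared slots as empty strings and the copied ones in order, mirroring how collapse_file() ends up with a fully initialised huge page.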
*/ - list_for_each_entry_safe(page, tmp, &pagelist, lru) { - list_del(&page->lru); - page->mapping = NULL; - ClearPageActive(page); - ClearPageUnevictable(page); - unlock_page(page); - folio_put_refs(page_folio(page), 3); + list_for_each_entry_safe(folio, tmp, &pagelist, lru) { + list_del(&folio->lru); + folio->mapping = NULL; + folio_clear_active(folio); + folio_clear_unevictable(folio); + folio_unlock(folio); + folio_put_refs(folio, 3); } goto out; @@ -2185,11 +2176,11 @@ rollback: shmem_uncharge(mapping->host, nr_none); } - list_for_each_entry_safe(page, tmp, &pagelist, lru) { - list_del(&page->lru); - unlock_page(page); - putback_lru_page(page); - put_page(page); + list_for_each_entry_safe(folio, tmp, &pagelist, lru) { + list_del(&folio->lru); + folio_unlock(folio); + folio_putback_lru(folio); + folio_put(folio); } /* * Undo the updates of filemap_nr_thps_inc for non-SHMEM @@ -2205,13 +2196,13 @@ rollback: smp_mb(); } - hpage->mapping = NULL; + new_folio->mapping = NULL; - unlock_page(hpage); - put_page(hpage); + folio_unlock(new_folio); + folio_put(new_folio); out: VM_BUG_ON(!list_empty(&pagelist)); - trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result); + trace_mm_khugepaged_collapse_file(mm, new_folio, index, is_shmem, addr, file, HPAGE_PMD_NR, result); return result; } @@ -2219,7 +2210,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, struct file *file, pgoff_t start, struct collapse_control *cc) { - struct page *page = NULL; + struct folio *folio = NULL; struct address_space *mapping = file->f_mapping; XA_STATE(xas, &mapping->i_pages, start); int present, swap; @@ -2231,11 +2222,11 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); rcu_read_lock(); - xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) { + if (xas_retry(&xas, folio)) continue; - if (xa_is_value(page)) { + if (xa_is_value(folio)) { ++swap; if (cc->is_khugepaged && swap > khugepaged_max_ptes_swap) { @@ -2250,11 +2241,9 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, * TODO: khugepaged should compact smaller compound pages * into a PMD sized page */ - if (PageTransCompound(page)) { - struct page *head = compound_head(page); - - result = compound_order(head) == HPAGE_PMD_ORDER && - head->index == start + if (folio_test_large(folio)) { + result = folio_order(folio) == HPAGE_PMD_ORDER && + folio->index == start /* Maybe PMD-mapped */ ? SCAN_PTE_MAPPED_HUGEPAGE : SCAN_PAGE_COMPOUND; @@ -2267,28 +2256,29 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, break; } - node = page_to_nid(page); + node = folio_nid(folio); if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; break; } cc->node_load[node]++; - if (!PageLRU(page)) { + if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; break; } - if (page_count(page) != - 1 + page_mapcount(page) + page_has_private(page)) { + if (folio_ref_count(folio) != + 1 + folio_mapcount(folio) + folio_test_private(folio)) { result = SCAN_PAGE_COUNT; break; } /* - * We probably should check if the page is referenced here, but - * nobody would transfer pte_young() to PageReferenced() for us. - * And rmap walk here is just too costly... 
+ * We probably should check if the folio is referenced + * here, but nobody would transfer pte_young() to + * folio_test_referenced() for us. And rmap walk here + * is just too costly... */ present++; @@ -2310,7 +2300,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, } } - trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result); + trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result); return result; } #else @@ -2360,7 +2350,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, goto breakouterloop_mmap_lock; progress++; - if (unlikely(hpage_collapse_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) goto breakouterloop; vma_iter_init(&vmi, mm, khugepaged_scan.address); @@ -2368,12 +2358,12 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, unsigned long hstart, hend; cond_resched(); - if (unlikely(hpage_collapse_test_exit(mm))) { + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) { progress++; break; } - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, - true, PMD_ORDER)) { + if (!thp_vma_allowable_order(vma, vma->vm_flags, + TVA_ENFORCE_SYSFS, PMD_ORDER)) { skip: progress++; continue; @@ -2390,7 +2380,7 @@ skip: bool mmap_locked = true; cond_resched(); - if (unlikely(hpage_collapse_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) goto breakouterloop; VM_BUG_ON(khugepaged_scan.address < hstart || @@ -2408,7 +2398,7 @@ skip: fput(file); if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { mmap_read_lock(mm); - if (hpage_collapse_test_exit(mm)) + if (hpage_collapse_test_exit_or_disable(mm)) goto breakouterloop; *result = collapse_pte_mapped_thp(mm, khugepaged_scan.address, false); @@ -2710,8 +2700,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, *prev = vma; - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false, - PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 6a540c2b27c5..d5b6fba44fc9 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -114,12 +114,6 @@ #define BYTES_PER_POINTER sizeof(void *) -/* GFP bitmask for kmemleak internal allocations */ -#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC | \ - __GFP_NOLOCKDEP)) | \ - __GFP_NORETRY | __GFP_NOMEMALLOC | \ - __GFP_NOWARN) - /* scanning area inside a memory block */ struct kmemleak_scan_area { struct hlist_node node; @@ -158,9 +152,9 @@ struct kmemleak_object { int count; /* checksum for detecting modified objects */ u32 checksum; + depot_stack_handle_t trace_handle; /* memory ranges to be scanned inside an object (empty for all) */ struct hlist_head area_list; - depot_stack_handle_t trace_handle; unsigned long jiffies; /* creation timestamp */ pid_t pid; /* pid of the current task */ char comm[TASK_COMM_LEN]; /* executable name */ @@ -463,7 +457,8 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp) /* try the slab allocator first */ if (object_cache) { - object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); + object = kmem_cache_alloc_noprof(object_cache, + gfp_nested_mask(gfp)); if (object) return object; } @@ -947,7 +942,8 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer); if (scan_area_cache) - area = kmem_cache_alloc(scan_area_cache, 
gfp_kmemleak_mask(gfp)); + area = kmem_cache_alloc_noprof(scan_area_cache, + gfp_nested_mask(gfp)); raw_spin_lock_irqsave(&object->lock, flags); if (!area) { diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 5d6e2dee5692..22e8657800ef 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -285,6 +285,17 @@ void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, } EXPORT_SYMBOL(kmsan_copy_to_user); +void kmsan_memmove(void *to, const void *from, size_t size) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + kmsan_enter_runtime(); + kmsan_internal_memmove_metadata(to, (void *)from, size); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(kmsan_memmove); + /* Helper function to check an URB. */ void kmsan_handle_urb(const struct urb *urb, bool is_out) { @@ -359,6 +370,12 @@ void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, } /* Functions from kmsan-checks.h follow. */ + +/* + * To create an origin, kmsan_poison_memory() unwinds the stacks and stores it + * into the stack depot. This may cause deadlocks if done from within KMSAN + * runtime, therefore we bail out if kmsan_in_runtime(). + */ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) { if (!kmsan_enabled || kmsan_in_runtime()) @@ -371,47 +388,31 @@ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) } EXPORT_SYMBOL(kmsan_poison_memory); +/* + * Unlike kmsan_poison_memory(), this function can be used from within KMSAN + * runtime, because it does not trigger allocations or call instrumented code. + */ void kmsan_unpoison_memory(const void *address, size_t size) { unsigned long ua_flags; - if (!kmsan_enabled || kmsan_in_runtime()) + if (!kmsan_enabled) return; ua_flags = user_access_save(); - kmsan_enter_runtime(); /* The users may want to poison/unpoison random memory. */ kmsan_internal_unpoison_memory((void *)address, size, KMSAN_POISON_NOCHECK); - kmsan_leave_runtime(); user_access_restore(ua_flags); } EXPORT_SYMBOL(kmsan_unpoison_memory); /* - * Version of kmsan_unpoison_memory() that can be called from within the KMSAN - * runtime. - * - * Non-instrumented IRQ entry functions receive struct pt_regs from assembly - * code. Those regs need to be unpoisoned, otherwise using them will result in - * false positives. - * Using kmsan_unpoison_memory() is not an option in entry code, because the - * return value of in_task() is inconsistent - as a result, certain calls to - * kmsan_unpoison_memory() are ignored. kmsan_unpoison_entry_regs() ensures that - * the registers are unpoisoned even if kmsan_in_runtime() is true in the early - * entry code. + * Version of kmsan_unpoison_memory() called from IRQ entry functions. 
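kmsan_memmove() is exported above so that code which moves bytes outside KMSAN's view can keep the shadow/origin metadata consistent with the data it moved. A hypothetical caller might look like this (my_uninstrumented_copy() is an invented placeholder for whatever routine bypasses instrumentation):

static void my_copy(void *dst, const void *src, size_t n)
{
        my_uninstrumented_copy(dst, src, n);    /* KMSAN does not see this copy */
        kmsan_memmove(dst, src, n);             /* so mirror it in the metadata */
}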
*/ void kmsan_unpoison_entry_regs(const struct pt_regs *regs) { - unsigned long ua_flags; - - if (!kmsan_enabled) - return; - - ua_flags = user_access_save(); - kmsan_internal_unpoison_memory((void *)regs, sizeof(*regs), - KMSAN_POISON_NOCHECK); - user_access_restore(ua_flags); + kmsan_unpoison_memory((void *)regs, sizeof(*regs)); } void kmsan_check_memory(const void *addr, size_t size) @@ -890,14 +890,14 @@ static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node) free_stable_node(stable_node); } -enum get_ksm_page_flags { - GET_KSM_PAGE_NOLOCK, - GET_KSM_PAGE_LOCK, - GET_KSM_PAGE_TRYLOCK +enum ksm_get_folio_flags { + KSM_GET_FOLIO_NOLOCK, + KSM_GET_FOLIO_LOCK, + KSM_GET_FOLIO_TRYLOCK }; /* - * get_ksm_page: checks if the page indicated by the stable node + * ksm_get_folio: checks if the page indicated by the stable node * is still its ksm page, despite having held no reference to it. * In which case we can trust the content of the page, and it * returns the gotten page; but if the page has now been zapped, @@ -915,10 +915,10 @@ enum get_ksm_page_flags { * a page to put something that might look like our key in page->mapping. * is on its way to being freed; but it is an anomaly to bear in mind. */ -static struct page *get_ksm_page(struct ksm_stable_node *stable_node, - enum get_ksm_page_flags flags) +static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node, + enum ksm_get_folio_flags flags) { - struct page *page; + struct folio *folio; void *expected_mapping; unsigned long kpfn; @@ -926,8 +926,8 @@ static struct page *get_ksm_page(struct ksm_stable_node *stable_node, PAGE_MAPPING_KSM); again: kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */ - page = pfn_to_page(kpfn); - if (READ_ONCE(page->mapping) != expected_mapping) + folio = pfn_folio(kpfn); + if (READ_ONCE(folio->mapping) != expected_mapping) goto stale; /* @@ -940,41 +940,41 @@ again: * in folio_migrate_mapping(), it might still be our page, * in which case it's essential to keep the node. */ - while (!get_page_unless_zero(page)) { + while (!folio_try_get(folio)) { /* * Another check for page->mapping != expected_mapping would * work here too. We have chosen the !PageSwapCache test to * optimize the common case, when the page is or is about to * be freed: PageSwapCache is cleared (under spin_lock_irq) * in the ref_freeze section of __remove_mapping(); but Anon - * page->mapping reset to NULL later, in free_pages_prepare(). + * folio->mapping reset to NULL later, in free_pages_prepare(). 
*/ - if (!PageSwapCache(page)) + if (!folio_test_swapcache(folio)) goto stale; cpu_relax(); } - if (READ_ONCE(page->mapping) != expected_mapping) { - put_page(page); + if (READ_ONCE(folio->mapping) != expected_mapping) { + folio_put(folio); goto stale; } - if (flags == GET_KSM_PAGE_TRYLOCK) { - if (!trylock_page(page)) { - put_page(page); + if (flags == KSM_GET_FOLIO_TRYLOCK) { + if (!folio_trylock(folio)) { + folio_put(folio); return ERR_PTR(-EBUSY); } - } else if (flags == GET_KSM_PAGE_LOCK) - lock_page(page); + } else if (flags == KSM_GET_FOLIO_LOCK) + folio_lock(folio); - if (flags != GET_KSM_PAGE_NOLOCK) { - if (READ_ONCE(page->mapping) != expected_mapping) { - unlock_page(page); - put_page(page); + if (flags != KSM_GET_FOLIO_NOLOCK) { + if (READ_ONCE(folio->mapping) != expected_mapping) { + folio_unlock(folio); + folio_put(folio); goto stale; } } - return page; + return folio; stale: /* @@ -998,16 +998,16 @@ static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item) { if (rmap_item->address & STABLE_FLAG) { struct ksm_stable_node *stable_node; - struct page *page; + struct folio *folio; stable_node = rmap_item->head; - page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); - if (!page) + folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK); + if (!folio) goto out; hlist_del(&rmap_item->hlist); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (!hlist_empty(&stable_node->hlist)) ksm_pages_sharing--; @@ -1094,11 +1094,11 @@ static inline struct ksm_stable_node *page_stable_node(struct page *page) return folio_stable_node(page_folio(page)); } -static inline void set_page_stable_node(struct page *page, - struct ksm_stable_node *stable_node) +static inline void folio_set_stable_node(struct folio *folio, + struct ksm_stable_node *stable_node) { - VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page); - page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); + VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio); + folio->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); } #ifdef CONFIG_SYSFS @@ -1107,13 +1107,13 @@ static inline void set_page_stable_node(struct page *page, */ static int remove_stable_node(struct ksm_stable_node *stable_node) { - struct page *page; + struct folio *folio; int err; - page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); - if (!page) { + folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK); + if (!folio) { /* - * get_ksm_page did remove_node_from_stable_tree itself. + * ksm_get_folio did remove_node_from_stable_tree itself. */ return 0; } @@ -1124,22 +1124,22 @@ static int remove_stable_node(struct ksm_stable_node *stable_node) * merge_across_nodes/max_page_sharing be switched. */ err = -EBUSY; - if (!page_mapped(page)) { + if (!folio_mapped(folio)) { /* - * The stable node did not yet appear stale to get_ksm_page(), - * since that allows for an unmapped ksm page to be recognized + * The stable node did not yet appear stale to ksm_get_folio(), + * since that allows for an unmapped ksm folio to be recognized * right up until it is freed; but the node is safe to remove. - * This page might be in an LRU cache waiting to be freed, - * or it might be PageSwapCache (perhaps under writeback), + * This folio might be in an LRU cache waiting to be freed, + * or it might be in the swapcache (perhaps under writeback), * or it might have been removed from swapcache a moment ago. 
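The caller contract for the new ksm_get_folio() follows directly from the hunks above: a NULL return means the node was stale and has already been removed, KSM_GET_FOLIO_NOLOCK returns only a reference, KSM_GET_FOLIO_LOCK additionally returns the folio locked, and KSM_GET_FOLIO_TRYLOCK may return ERR_PTR(-EBUSY). A minimal caller sketch (the function name is made up; the calls are the ones introduced above):

static void ksm_folio_example(struct ksm_stable_node *stable_node)
{
        struct folio *folio;

        folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK);
        if (!folio)
                return;         /* stale: ksm_get_folio() already erased the node */
        /* ... folio is locked and referenced here ... */
        folio_unlock(folio);
        folio_put(folio);
}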
*/ - set_page_stable_node(page, NULL); + folio_set_stable_node(folio, NULL); remove_node_from_stable_tree(stable_node); err = 0; } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return err; } @@ -1275,23 +1275,24 @@ static u32 calc_checksum(struct page *page) return checksum; } -static int write_protect_page(struct vm_area_struct *vma, struct page *page, +static int write_protect_page(struct vm_area_struct *vma, struct folio *folio, pte_t *orig_pte) { struct mm_struct *mm = vma->vm_mm; - DEFINE_PAGE_VMA_WALK(pvmw, page, vma, 0, 0); + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, 0, 0); int swapped; int err = -EFAULT; struct mmu_notifier_range range; bool anon_exclusive; pte_t entry; - pvmw.address = page_address_in_vma(page, vma); + if (WARN_ON_ONCE(folio_test_large(folio))) + return err; + + pvmw.address = page_address_in_vma(&folio->page, vma); if (pvmw.address == -EFAULT) goto out; - BUG_ON(PageTransCompound(page)); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -1301,12 +1302,12 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) goto out_unlock; - anon_exclusive = PageAnonExclusive(page); + anon_exclusive = PageAnonExclusive(&folio->page); entry = ptep_get(pvmw.pte); if (pte_write(entry) || pte_dirty(entry) || anon_exclusive || mm_tlb_flush_pending(mm)) { - swapped = PageSwapCache(page); - flush_cache_page(vma, pvmw.address, page_to_pfn(page)); + swapped = folio_test_swapcache(folio); + flush_cache_page(vma, pvmw.address, folio_pfn(folio)); /* * Ok this is tricky, when get_user_pages_fast() run it doesn't * take any lock, therefore the check that we are going to make @@ -1326,26 +1327,26 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * Check that no O_DIRECT or similar I/O is in progress on the * page */ - if (page_mapcount(page) + 1 + swapped != page_count(page)) { + if (folio_mapcount(folio) + 1 + swapped != folio_ref_count(folio)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } /* See folio_try_share_anon_rmap_pte(): clear PTE first. */ if (anon_exclusive && - folio_try_share_anon_rmap_pte(page_folio(page), page)) { + folio_try_share_anon_rmap_pte(folio, &folio->page)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } if (pte_dirty(entry)) - set_page_dirty(page); + folio_mark_dirty(folio); entry = pte_mkclean(entry); if (pte_write(entry)) entry = pte_wrprotect(entry); - set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); + set_pte_at(mm, pvmw.address, pvmw.pte, entry); } *orig_pte = entry; err = 0; @@ -1447,7 +1448,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * See Documentation/mm/mmu_notifier.rst */ ptep_clear_flush(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, newpte); + set_pte_at(mm, addr, ptep, newpte); folio = page_folio(page); folio_remove_rmap_pte(folio, page, vma); @@ -1505,14 +1506,14 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, * ptes are necessarily already write-protected. But in either * case, we need to lock and check page_count is not raised. */ - if (write_protect_page(vma, page, &orig_pte) == 0) { + if (write_protect_page(vma, page_folio(page), &orig_pte) == 0) { if (!kpage) { /* * While we hold page lock, upgrade page from * PageAnon+anon_vma to PageKsm+NULL stable_node: * stable_tree_insert() will update stable_node. 
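The reference check in write_protect_page() compares the references that can be accounted for -- one per PTE mapping, the caller's own pin, plus one if the folio sits in the swap cache -- against the actual reference count; anything extra suggests a concurrent user such as O_DIRECT holding a GUP pin, so KSM backs off. Expressed as a hypothetical helper (the expression is the one used above):

static bool ksm_folio_refs_accounted(struct folio *folio, int swapped)
{
        return folio_mapcount(folio) + 1 + swapped == folio_ref_count(folio);
}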
*/ - set_page_stable_node(page, NULL); + folio_set_stable_node(page_folio(page), NULL); mark_page_accessed(page); /* * Page reclaim just frees a clean page with no dirty @@ -1617,14 +1618,14 @@ bool is_page_sharing_candidate(struct ksm_stable_node *stable_node) return __is_page_sharing_candidate(stable_node, 0); } -static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup, - struct ksm_stable_node **_stable_node, - struct rb_root *root, - bool prune_stale_stable_nodes) +static struct folio *stable_node_dup(struct ksm_stable_node **_stable_node_dup, + struct ksm_stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) { struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; - struct page *_tree_page, *tree_page = NULL; + struct folio *folio, *tree_folio = NULL; int nr = 0; int found_rmap_hlist_len; @@ -1643,24 +1644,24 @@ static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup, * We must walk all stable_node_dup to prune the stale * stable nodes during lookup. * - * get_ksm_page can drop the nodes from the + * ksm_get_folio can drop the nodes from the * stable_node->hlist if they point to freed pages * (that's why we do a _safe walk). The "dup" * stable_node parameter itself will be freed from * under us if it returns NULL. */ - _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK); - if (!_tree_page) + folio = ksm_get_folio(dup, KSM_GET_FOLIO_NOLOCK); + if (!folio) continue; nr += 1; if (is_page_sharing_candidate(dup)) { if (!found || dup->rmap_hlist_len > found_rmap_hlist_len) { if (found) - put_page(tree_page); + folio_put(tree_folio); found = dup; found_rmap_hlist_len = found->rmap_hlist_len; - tree_page = _tree_page; + tree_folio = folio; /* skip put_page for found dup */ if (!prune_stale_stable_nodes) @@ -1668,7 +1669,7 @@ static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup, continue; } } - put_page(_tree_page); + folio_put(folio); } if (found) { @@ -1733,7 +1734,7 @@ static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup, } *_stable_node_dup = found; - return tree_page; + return tree_folio; } static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stable_node, @@ -1750,7 +1751,7 @@ static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stabl } /* - * Like for get_ksm_page, this function can free the *_stable_node and + * Like for ksm_get_folio, this function can free the *_stable_node and * *_stable_node_dup if the returned tree_page is NULL. * * It can also free and overwrite *_stable_node with the found @@ -1763,16 +1764,16 @@ static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stabl * function and will be overwritten in all cases, the caller doesn't * need to initialize it. 
*/ -static struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_dup, - struct ksm_stable_node **_stable_node, - struct rb_root *root, - bool prune_stale_stable_nodes) +static struct folio *__stable_node_chain(struct ksm_stable_node **_stable_node_dup, + struct ksm_stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) { struct ksm_stable_node *stable_node = *_stable_node; if (!is_stable_node_chain(stable_node)) { if (is_page_sharing_candidate(stable_node)) { *_stable_node_dup = stable_node; - return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK); + return ksm_get_folio(stable_node, KSM_GET_FOLIO_NOLOCK); } /* * _stable_node_dup set to NULL means the stable_node @@ -1785,24 +1786,24 @@ static struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_du prune_stale_stable_nodes); } -static __always_inline struct page *chain_prune(struct ksm_stable_node **s_n_d, - struct ksm_stable_node **s_n, - struct rb_root *root) +static __always_inline struct folio *chain_prune(struct ksm_stable_node **s_n_d, + struct ksm_stable_node **s_n, + struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, true); } -static __always_inline struct page *chain(struct ksm_stable_node **s_n_d, - struct ksm_stable_node *s_n, - struct rb_root *root) +static __always_inline struct folio *chain(struct ksm_stable_node **s_n_d, + struct ksm_stable_node *s_n, + struct rb_root *root) { struct ksm_stable_node *old_stable_node = s_n; - struct page *tree_page; + struct folio *tree_folio; - tree_page = __stable_node_chain(s_n_d, &s_n, root, false); + tree_folio = __stable_node_chain(s_n_d, &s_n, root, false); /* not pruning dups so s_n cannot have changed */ VM_BUG_ON(s_n != old_stable_node); - return tree_page; + return tree_folio; } /* @@ -1822,28 +1823,30 @@ static struct page *stable_tree_search(struct page *page) struct rb_node *parent; struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any; struct ksm_stable_node *page_node; + struct folio *folio; - page_node = page_stable_node(page); + folio = page_folio(page); + page_node = folio_stable_node(folio); if (page_node && page_node->head != &migrate_nodes) { /* ksm page forked */ - get_page(page); - return page; + folio_get(folio); + return &folio->page; } - nid = get_kpfn_nid(page_to_pfn(page)); + nid = get_kpfn_nid(folio_pfn(folio)); root = root_stable_tree + nid; again: new = &root->rb_node; parent = NULL; while (*new) { - struct page *tree_page; + struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); stable_node_any = NULL; - tree_page = chain_prune(&stable_node_dup, &stable_node, root); + tree_folio = chain_prune(&stable_node_dup, &stable_node, root); /* * NOTE: stable_node may have been freed by * chain_prune() if the returned stable_node_dup is @@ -1877,14 +1880,14 @@ again: * write protected at all times. Any will work * fine to continue the walk. */ - tree_page = get_ksm_page(stable_node_any, - GET_KSM_PAGE_NOLOCK); + tree_folio = ksm_get_folio(stable_node_any, + KSM_GET_FOLIO_NOLOCK); } VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); - if (!tree_page) { + if (!tree_folio) { /* * If we walked over a stale stable_node, - * get_ksm_page() will call rb_erase() and it + * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. 
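The comment above keeps the long-standing retry rule: ksm_get_folio() may erase a stale stable node, which can rebalance the rbtree under the walker, so the search restarts from the root rather than trying to continue. A tiny user-space model of that control flow (plain C, invented data):

#include <stdbool.h>
#include <stdio.h>

static int entries[] = { 3, -1, 7, 9, -1, 12 };  /* -1 marks a stale entry */
#define NR (int)(sizeof(entries) / sizeof(entries[0]))

static bool lookup(int key)
{
again:
        for (int i = 0; i < NR; i++) {
                if (entries[i] == -1) {
                        entries[i] = 0;         /* "erase" the stale entry ... */
                        goto again;             /* ... and restart the whole walk */
                }
                if (entries[i] == key)
                        return true;
        }
        return false;
}

int main(void)
{
        printf("found 9: %d\n", lookup(9));
        return 0;
}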
Returning * NULL would be safe too, but we'd generate @@ -1894,8 +1897,8 @@ again: goto again; } - ret = memcmp_pages(page, tree_page); - put_page(tree_page); + ret = memcmp_pages(page, &tree_folio->page); + folio_put(tree_folio); parent = *new; if (ret < 0) @@ -1906,12 +1909,15 @@ again: if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); /* - * Test if the migrated page should be merged - * into a stable node dup. If the mapcount is - * 1 we can migrate it with another KSM page - * without adding it to the chain. + * If the mapcount of our migrated KSM folio is + * at most 1, we can merge it with another + * KSM folio where we know that we have space + * for one more mapping without exceeding the + * ksm_max_page_sharing limit: see + * chain_prune(). This way, we can avoid adding + * this stable node to the chain. */ - if (page_mapcount(page) > 1) + if (folio_mapcount(folio) > 1) goto chain_append; } @@ -1938,26 +1944,26 @@ again: * It would be more elegant to return stable_node * than kpage, but that involves more changes. */ - tree_page = get_ksm_page(stable_node_dup, - GET_KSM_PAGE_TRYLOCK); + tree_folio = ksm_get_folio(stable_node_dup, + KSM_GET_FOLIO_TRYLOCK); - if (PTR_ERR(tree_page) == -EBUSY) + if (PTR_ERR(tree_folio) == -EBUSY) return ERR_PTR(-EBUSY); - if (unlikely(!tree_page)) + if (unlikely(!tree_folio)) /* * The tree may have been rebalanced, * so re-evaluate parent and new. */ goto again; - unlock_page(tree_page); + folio_unlock(tree_folio); if (get_kpfn_nid(stable_node_dup->kpfn) != NUMA(stable_node_dup->nid)) { - put_page(tree_page); + folio_put(tree_folio); goto replace; } - return tree_page; + return &tree_folio->page; } } @@ -1970,8 +1976,8 @@ again: rb_insert_color(&page_node->node, root); out: if (is_page_sharing_candidate(page_node)) { - get_page(page); - return page; + folio_get(folio); + return &folio->page; } else return NULL; @@ -1996,12 +2002,12 @@ replace: &page_node->node, root); if (is_page_sharing_candidate(page_node)) - get_page(page); + folio_get(folio); else - page = NULL; + folio = NULL; } else { rb_erase(&stable_node_dup->node, root); - page = NULL; + folio = NULL; } } else { VM_BUG_ON(!is_stable_node_chain(stable_node)); @@ -2012,16 +2018,16 @@ replace: DO_NUMA(page_node->nid = nid); stable_node_chain_add_dup(page_node, stable_node); if (is_page_sharing_candidate(page_node)) - get_page(page); + folio_get(folio); else - page = NULL; + folio = NULL; } else { - page = NULL; + folio = NULL; } } stable_node_dup->head = &migrate_nodes; list_add(&stable_node_dup->list, stable_node_dup->head); - return page; + return &folio->page; chain_append: /* stable_node_dup could be null if it reached the limit */ @@ -2064,7 +2070,7 @@ chain_append: * This function returns the stable tree node just allocated on success, * NULL otherwise. 
*/ -static struct ksm_stable_node *stable_tree_insert(struct page *kpage) +static struct ksm_stable_node *stable_tree_insert(struct folio *kfolio) { int nid; unsigned long kpfn; @@ -2074,7 +2080,7 @@ static struct ksm_stable_node *stable_tree_insert(struct page *kpage) struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any; bool need_chain = false; - kpfn = page_to_pfn(kpage); + kpfn = folio_pfn(kfolio); nid = get_kpfn_nid(kpfn); root = root_stable_tree + nid; again: @@ -2082,13 +2088,13 @@ again: new = &root->rb_node; while (*new) { - struct page *tree_page; + struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); stable_node_any = NULL; - tree_page = chain(&stable_node_dup, stable_node, root); + tree_folio = chain(&stable_node_dup, stable_node, root); if (!stable_node_dup) { /* * Either all stable_node dups were full in @@ -2110,14 +2116,14 @@ again: * write protected at all times. Any will work * fine to continue the walk. */ - tree_page = get_ksm_page(stable_node_any, - GET_KSM_PAGE_NOLOCK); + tree_folio = ksm_get_folio(stable_node_any, + KSM_GET_FOLIO_NOLOCK); } VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); - if (!tree_page) { + if (!tree_folio) { /* * If we walked over a stale stable_node, - * get_ksm_page() will call rb_erase() and it + * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. Returning * NULL would be safe too, but we'd generate @@ -2127,8 +2133,8 @@ again: goto again; } - ret = memcmp_pages(kpage, tree_page); - put_page(tree_page); + ret = memcmp_pages(&kfolio->page, &tree_folio->page); + folio_put(tree_folio); parent = *new; if (ret < 0) @@ -2147,7 +2153,6 @@ again: INIT_HLIST_HEAD(&stable_node_dup->hlist); stable_node_dup->kpfn = kpfn; - set_page_stable_node(kpage, stable_node_dup); stable_node_dup->rmap_hlist_len = 0; DO_NUMA(stable_node_dup->nid = nid); if (!need_chain) { @@ -2166,6 +2171,8 @@ again: stable_node_chain_add_dup(stable_node_dup, stable_node); } + folio_set_stable_node(kfolio, stable_node_dup); + return stable_node_dup; } @@ -2425,7 +2432,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite * node in the stable tree and add both rmap_items. */ lock_page(kpage); - stable_node = stable_tree_insert(kpage); + stable_node = stable_tree_insert(page_folio(kpage)); if (stable_node) { stable_tree_append(tree_rmap_item, stable_node, false); @@ -2597,14 +2604,14 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) */ if (!ksm_merge_across_nodes) { struct ksm_stable_node *stable_node, *next; - struct page *page; + struct folio *folio; list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { - page = get_ksm_page(stable_node, - GET_KSM_PAGE_NOLOCK); - if (page) - put_page(page); + folio = ksm_get_folio(stable_node, + KSM_GET_FOLIO_NOLOCK); + if (folio) + folio_put(folio); cond_resched(); } } @@ -3172,12 +3179,11 @@ again: /* * Collect processes when the error hit an ksm page. 
*/ -void collect_procs_ksm(struct page *page, struct list_head *to_kill, - int force_early) +void collect_procs_ksm(struct folio *folio, struct page *page, + struct list_head *to_kill, int force_early) { struct ksm_stable_node *stable_node; struct ksm_rmap_item *rmap_item; - struct folio *folio = page_folio(page); struct vm_area_struct *vma; struct task_struct *tsk; @@ -3229,11 +3235,11 @@ void folio_migrate_ksm(struct folio *newfolio, struct folio *folio) /* * newfolio->mapping was set in advance; now we need smp_wmb() * to make sure that the new stable_node->kpfn is visible - * to get_ksm_page() before it can see that folio->mapping + * to ksm_get_folio() before it can see that folio->mapping * has gone stale (or that folio_test_swapcache has been cleared). */ smp_wmb(); - set_page_stable_node(&folio->page, NULL); + folio_set_stable_node(folio, NULL); } } #endif /* CONFIG_MIGRATION */ @@ -3256,7 +3262,7 @@ static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node, if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) { /* - * Don't get_ksm_page, page has already gone: + * Don't ksm_get_folio, page has already gone: * which is why we keep kpfn instead of page* */ remove_node_from_stable_tree(stable_node); @@ -3344,7 +3350,7 @@ static int ksm_memory_callback(struct notifier_block *self, * Most of the work is done by page migration; but there might * be a few stable_nodes left over, still pointing to struct * pages which have been offlined: prune those from the tree, - * otherwise get_ksm_page() might later try to access a + * otherwise ksm_get_folio() might later try to access a * non-existent struct page. */ ksm_check_stable_tree(mn->start_pfn, diff --git a/mm/list_lru.c b/mm/list_lru.c index 35b0147542a9..3fd64736bc45 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -162,20 +162,6 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, } EXPORT_SYMBOL_GPL(list_lru_isolate_move); -void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, - struct mem_cgroup *memcg) -{ - struct list_lru_one *list = - list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); - - if (list_empty(item)) { - list_add_tail(item, &list->list); - if (!list->nr_items++) - set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); - } -} -EXPORT_SYMBOL_GPL(list_lru_putback); - unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg) { @@ -257,6 +243,9 @@ restart: */ assert_spin_locked(&nlru->lock); goto restart; + case LRU_STOP: + assert_spin_locked(&nlru->lock); + goto out; default: BUG(); } @@ -567,6 +556,9 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, lru->shrinker_id = shrinker->id; else lru->shrinker_id = -1; + + if (mem_cgroup_kmem_disabled()) + memcg_aware = false; #endif lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL); diff --git a/mm/madvise.c b/mm/madvise.c index cfa5e7288261..a77893462b92 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -321,6 +321,18 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma) file_permission(vma->vm_file, MAY_WRITE) == 0; } +static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, + struct folio *folio, pte_t *ptep, + pte_t pte, bool *any_young, + bool *any_dirty) +{ + const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; + int max_nr = (end - addr) / PAGE_SIZE; + + return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL, + any_young, any_dirty); +} + static int 
madvise_cold_or_pageout_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -336,6 +348,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, LIST_HEAD(folio_list); bool pageout_anon_only_filter; unsigned int batch_count = 0; + int nr; if (fatal_signal_pending(current)) return -EINTR; @@ -363,10 +376,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, goto huge_unlock; } - folio = pfn_folio(pmd_pfn(orig_pmd)); + folio = pmd_folio(orig_pmd); /* Do not interfere with other mappings of this folio */ - if (folio_estimated_sharers(folio) != 1) + if (folio_likely_mapped_shared(folio)) goto huge_unlock; if (pageout_anon_only_filter && !folio_test_anon(folio)) @@ -386,7 +399,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, return 0; } - if (pmd_young(orig_pmd)) { + if (!pageout && pmd_young(orig_pmd)) { pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); @@ -423,7 +436,8 @@ restart: return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); - for (; addr < end; pte++, addr += PAGE_SIZE) { + for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) { + nr = 1; ptent = ptep_get(pte); if (++batch_count == SWAP_CLUSTER_MAX) { @@ -447,55 +461,64 @@ restart: continue; /* - * Creating a THP page is expensive so split it only if we - * are sure it's worth. Split it if we are only owner. + * If we encounter a large folio, only split it if it is not + * fully mapped within the range we are operating on. Otherwise + * leave it as is so that it can be swapped out whole. If we + * fail to split a folio, leave it in place and advance to the + * next pte in the range. */ if (folio_test_large(folio)) { - int err; - - if (folio_estimated_sharers(folio) != 1) - break; - if (pageout_anon_only_filter && !folio_test_anon(folio)) - break; - if (!folio_trylock(folio)) - break; - folio_get(folio); - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(start_pte, ptl); - start_pte = NULL; - err = split_folio(folio); - folio_unlock(folio); - folio_put(folio); - if (err) - break; - start_pte = pte = - pte_offset_map_lock(mm, pmd, addr, &ptl); - if (!start_pte) - break; - arch_enter_lazy_mmu_mode(); - pte--; - addr -= PAGE_SIZE; - continue; + bool any_young; + + nr = madvise_folio_pte_batch(addr, end, folio, pte, + ptent, &any_young, NULL); + if (any_young) + ptent = pte_mkyoung(ptent); + + if (nr < folio_nr_pages(folio)) { + int err; + + if (folio_likely_mapped_shared(folio)) + continue; + if (pageout_anon_only_filter && !folio_test_anon(folio)) + continue; + if (!folio_trylock(folio)) + continue; + folio_get(folio); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(start_pte, ptl); + start_pte = NULL; + err = split_folio(folio); + folio_unlock(folio); + folio_put(folio); + start_pte = pte = + pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!start_pte) + break; + arch_enter_lazy_mmu_mode(); + if (!err) + nr = 0; + continue; + } } /* * Do not interfere with other mappings of this folio and - * non-LRU folio. + * non-LRU folio. If we have a large folio at this point, we + * know it is fully mapped so if its mapcount is the same as its + * number of pages, it must be exclusive. 
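The reworked loop no longer steps one PTE at a time: madvise_folio_pte_batch() reports how many consecutive PTEs map the same (large) folio, pte and addr advance by that count, and nr is reset to 0 after a successful split so the now-small folios are rescanned. A user-space model of the stride pattern (plain C, invented data):

#include <stdio.h>

/* each entry records which object it maps; runs of equal ids ~ one large folio */
static const int map[] = { 1, 1, 1, 1, 2, 3, 3, 4 };
#define NR_ENTRIES (int)(sizeof(map) / sizeof(map[0]))

static int batch_len(int start)
{
        int nr = 1;

        while (start + nr < NR_ENTRIES && map[start + nr] == map[start])
                nr++;
        return nr;
}

int main(void)
{
        int i, nr;

        for (i = 0; i < NR_ENTRIES; i += nr) {
                nr = batch_len(i);
                printf("object %d: processed %d entr%s in one step\n",
                       map[i], nr, nr == 1 ? "y" : "ies");
        }
        return 0;
}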
*/ - if (!folio_test_lru(folio) || folio_mapcount(folio) != 1) + if (!folio_test_lru(folio) || + folio_mapcount(folio) != folio_nr_pages(folio)) continue; if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - - if (pte_young(ptent)) { - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); - ptent = pte_mkold(ptent); - set_pte_at(mm, addr, pte, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); + if (!pageout && pte_young(ptent)) { + clear_young_dirty_ptes(vma, addr, pte, nr, + CYDP_CLEAR_YOUNG); + tlb_remove_tlb_entries(tlb, pte, nr, addr); } /* @@ -620,6 +643,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { + const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY; struct mmu_gather *tlb = walk->private; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; @@ -628,6 +652,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, struct folio *folio; int nr_swap = 0; unsigned long next; + int nr, max_nr; next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) @@ -640,7 +665,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); - for (; addr != end; pte++, addr += PAGE_SIZE) { + for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) { + nr = 1; ptent = ptep_get(pte); if (pte_none(ptent)) @@ -655,9 +681,11 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, entry = pte_to_swp_entry(ptent); if (!non_swap_entry(entry)) { - nr_swap--; - free_swap_and_cache(entry); - pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + max_nr = (end - addr) / PAGE_SIZE; + nr = swap_pte_batch(pte, max_nr, ptent); + nr_swap -= nr; + free_swap_and_cache_nr(entry, nr); + clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); @@ -670,44 +698,57 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, continue; /* - * If pmd isn't transhuge but the folio is large and - * is owned by only this process, split it and - * deactivate all pages. + * If we encounter a large folio, only split it if it is not + * fully mapped within the range we are operating on. Otherwise + * leave it as is so that it can be marked as lazyfree. If we + * fail to split a folio, leave it in place and advance to the + * next pte in the range. 
*/ if (folio_test_large(folio)) { - int err; + bool any_young, any_dirty; - if (folio_estimated_sharers(folio) != 1) - break; - if (!folio_trylock(folio)) - break; - folio_get(folio); - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(start_pte, ptl); - start_pte = NULL; - err = split_folio(folio); - folio_unlock(folio); - folio_put(folio); - if (err) - break; - start_pte = pte = - pte_offset_map_lock(mm, pmd, addr, &ptl); - if (!start_pte) - break; - arch_enter_lazy_mmu_mode(); - pte--; - addr -= PAGE_SIZE; - continue; + nr = madvise_folio_pte_batch(addr, end, folio, pte, + ptent, &any_young, &any_dirty); + + if (nr < folio_nr_pages(folio)) { + int err; + + if (folio_likely_mapped_shared(folio)) + continue; + if (!folio_trylock(folio)) + continue; + folio_get(folio); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(start_pte, ptl); + start_pte = NULL; + err = split_folio(folio); + folio_unlock(folio); + folio_put(folio); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + start_pte = pte; + if (!start_pte) + break; + arch_enter_lazy_mmu_mode(); + if (!err) + nr = 0; + continue; + } + + if (any_young) + ptent = pte_mkyoung(ptent); + if (any_dirty) + ptent = pte_mkdirty(ptent); } if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { if (!folio_trylock(folio)) continue; /* - * If folio is shared with others, we mustn't clear - * the folio's dirty flag. + * If we have a large folio at this point, we know it is + * fully mapped so if its mapcount is the same as its + * number of pages, it must be exclusive. */ - if (folio_mapcount(folio) != 1) { + if (folio_mapcount(folio) != folio_nr_pages(folio)) { folio_unlock(folio); continue; } @@ -723,19 +764,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, } if (pte_young(ptent) || pte_dirty(ptent)) { - /* - * Some of architecture(ex, PPC) don't update TLB - * with set_pte_at and tlb_remove_tlb_entry so for - * the portability, remap the pte with old|clean - * after pte clearing. - */ - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); - - ptent = pte_mkold(ptent); - ptent = pte_mkclean(ptent); - set_pte_at(mm, addr, pte, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); + clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags); + tlb_remove_tlb_entries(tlb, pte, nr, addr); } folio_mark_lazyfree(folio); } @@ -901,39 +931,19 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, return -EINVAL; } -static long madvise_populate(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end, - int behavior) +static long madvise_populate(struct mm_struct *mm, unsigned long start, + unsigned long end, int behavior) { const bool write = behavior == MADV_POPULATE_WRITE; - struct mm_struct *mm = vma->vm_mm; - unsigned long tmp_end; int locked = 1; long pages; - *prev = vma; - while (start < end) { - /* - * We might have temporarily dropped the lock. For example, - * our VMA might have been split. - */ - if (!vma || start >= vma->vm_end) { - vma = vma_lookup(mm, start); - if (!vma) - return -ENOMEM; - } - - tmp_end = min_t(unsigned long, end, vma->vm_end); /* Populate (prefault) page tables readable/writable. 
*/ - pages = faultin_vma_page_range(vma, start, tmp_end, write, - &locked); + pages = faultin_page_range(mm, start, end, write, &locked); if (!locked) { mmap_read_lock(mm); locked = 1; - *prev = NULL; - vma = NULL; } if (pages < 0) { switch (pages) { @@ -949,7 +959,7 @@ static long madvise_populate(struct vm_area_struct *vma, pr_warn_once("%s: unhandled return value: %ld\n", __func__, pages); fallthrough; - case -ENOMEM: + case -ENOMEM: /* No VMA or out of memory. */ return -ENOMEM; } } @@ -1034,9 +1044,6 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: return madvise_dontneed_free(vma, prev, start, end, behavior); - case MADV_POPULATE_READ: - case MADV_POPULATE_WRITE: - return madvise_populate(vma, prev, start, end, behavior); case MADV_NORMAL: new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; break; @@ -1394,6 +1401,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, * -EIO - an I/O error occurred while paging in data. * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. + * -EPERM - memory is sealed. */ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { @@ -1437,10 +1445,29 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh start = untagged_addr_remote(mm, start); end = start + len; + /* + * Check if the address range is sealed for do_madvise(). + * can_modify_mm_madv assumes we have acquired the lock on MM. + */ + if (unlikely(!can_modify_mm_madv(mm, start, end, behavior))) { + error = -EPERM; + goto out; + } + blk_start_plug(&plug); - error = madvise_walk_vmas(mm, start, end, behavior, - madvise_vma_behavior); + switch (behavior) { + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + error = madvise_populate(mm, start, end, behavior); + break; + default: + error = madvise_walk_vmas(mm, start, end, behavior, + madvise_vma_behavior); + break; + } blk_finish_plug(&plug); + +out: if (write) mmap_write_unlock(mm); else diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 61932c9215e7..7fad15b2290c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -33,6 +33,7 @@ #include <linux/shmem_fs.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> +#include <linux/pagevec.h> #include <linux/vm_event_item.h> #include <linux/smp.h> #include <linux/page-flags.h> @@ -349,7 +350,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, /* * A lot of the calls to the cache allocation functions are expected to be - * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are + * inlined by the compiler. 
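Two user-visible behaviours are touched here: MADV_POPULATE_READ/WRITE are now dispatched straight from do_madvise() to madvise_populate() (no per-VMA walk), and the new can_modify_mm_madv() check makes advice that can discard page contents fail with EPERM on a sealed range. A small user-space illustration of the populate path, assuming headers that define MADV_POPULATE_READ:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        size_t len = 4 * sysconf(_SC_PAGESIZE);
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
#ifdef MADV_POPULATE_READ
        if (madvise(p, len, MADV_POPULATE_READ))
                fprintf(stderr, "populate failed: %s\n", strerror(errno));
        else
                printf("range prefaulted\n");
#endif
        munmap(p, len);
        return 0;
}

The example only exercises the populate path; the EPERM case applies to the destructive advice covered by can_modify_mm_madv().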
Since the calls to memcg_slab_post_alloc_hook() are * conditional to this static branch, we'll have to allow modules that does * kmem_cache_alloc and the such to see this symbol as well */ @@ -574,6 +575,136 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) return mz; } +/* Subset of node_stat_item for memcg stats */ +static const unsigned int memcg_node_stat_items[] = { + NR_INACTIVE_ANON, + NR_ACTIVE_ANON, + NR_INACTIVE_FILE, + NR_ACTIVE_FILE, + NR_UNEVICTABLE, + NR_SLAB_RECLAIMABLE_B, + NR_SLAB_UNRECLAIMABLE_B, + WORKINGSET_REFAULT_ANON, + WORKINGSET_REFAULT_FILE, + WORKINGSET_ACTIVATE_ANON, + WORKINGSET_ACTIVATE_FILE, + WORKINGSET_RESTORE_ANON, + WORKINGSET_RESTORE_FILE, + WORKINGSET_NODERECLAIM, + NR_ANON_MAPPED, + NR_FILE_MAPPED, + NR_FILE_PAGES, + NR_FILE_DIRTY, + NR_WRITEBACK, + NR_SHMEM, + NR_SHMEM_THPS, + NR_FILE_THPS, + NR_ANON_THPS, + NR_KERNEL_STACK_KB, + NR_PAGETABLE, + NR_SECONDARY_PAGETABLE, +#ifdef CONFIG_SWAP + NR_SWAPCACHE, +#endif +}; + +static const unsigned int memcg_stat_items[] = { + MEMCG_SWAP, + MEMCG_SOCK, + MEMCG_PERCPU_B, + MEMCG_VMALLOC, + MEMCG_KMEM, + MEMCG_ZSWAP_B, + MEMCG_ZSWAPPED, +}; + +#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items) +#define MEMCG_VMSTAT_SIZE (NR_MEMCG_NODE_STAT_ITEMS + \ + ARRAY_SIZE(memcg_stat_items)) +static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly; + +static void init_memcg_stats(void) +{ + int8_t i, j = 0; + + BUILD_BUG_ON(MEMCG_NR_STAT >= S8_MAX); + + for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i) + mem_cgroup_stats_index[memcg_node_stat_items[i]] = ++j; + + for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i) + mem_cgroup_stats_index[memcg_stat_items[i]] = ++j; +} + +static inline int memcg_stats_index(int idx) +{ + return mem_cgroup_stats_index[idx] - 1; +} + +struct lruvec_stats_percpu { + /* Local (CPU and cgroup) state */ + long state[NR_MEMCG_NODE_STAT_ITEMS]; + + /* Delta calculation for lockless upward propagation */ + long state_prev[NR_MEMCG_NODE_STAT_ITEMS]; +}; + +struct lruvec_stats { + /* Aggregated (CPU and subtree) state */ + long state[NR_MEMCG_NODE_STAT_ITEMS]; + + /* Non-hierarchical (CPU aggregated) state */ + long state_local[NR_MEMCG_NODE_STAT_ITEMS]; + + /* Pending child counts during tree propagation */ + long state_pending[NR_MEMCG_NODE_STAT_ITEMS]; +}; + +unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) +{ + struct mem_cgroup_per_node *pn; + long x; + int i; + + if (mem_cgroup_disabled()) + return node_page_state(lruvec_pgdat(lruvec), idx); + + i = memcg_stats_index(idx); + if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + return 0; + + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + x = READ_ONCE(pn->lruvec_stats->state[i]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + +unsigned long lruvec_page_state_local(struct lruvec *lruvec, + enum node_stat_item idx) +{ + struct mem_cgroup_per_node *pn; + long x; + int i; + + if (mem_cgroup_disabled()) + return node_page_state(lruvec_pgdat(lruvec), idx); + + i = memcg_stats_index(idx); + if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + return 0; + + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + x = READ_ONCE(pn->lruvec_stats->state_local[i]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + /* Subset of vm_event_item to report for memcg event stats */ static const unsigned int memcg_vm_event_stat[] = { PGPGIN, @@ -605,11 +736,13 @@ static const unsigned int 
memcg_vm_event_stat[] = { }; #define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat) -static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; +static int8_t mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; static void init_memcg_events(void) { - int i; + int8_t i; + + BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= S8_MAX); for (i = 0; i < NR_MEMCG_EVENTS; ++i) mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1; @@ -631,11 +764,11 @@ struct memcg_vmstats_percpu { /* The above should fit a single cacheline for memcg_rstat_updated() */ /* Local (CPU and cgroup) page state & events */ - long state[MEMCG_NR_STAT]; + long state[MEMCG_VMSTAT_SIZE]; unsigned long events[NR_MEMCG_EVENTS]; /* Delta calculation for lockless upward propagation */ - long state_prev[MEMCG_NR_STAT]; + long state_prev[MEMCG_VMSTAT_SIZE]; unsigned long events_prev[NR_MEMCG_EVENTS]; /* Cgroup1: threshold notifications & softlimit tree updates */ @@ -645,15 +778,15 @@ struct memcg_vmstats_percpu { struct memcg_vmstats { /* Aggregated (CPU and subtree) page state & events */ - long state[MEMCG_NR_STAT]; + long state[MEMCG_VMSTAT_SIZE]; unsigned long events[NR_MEMCG_EVENTS]; /* Non-hierarchical (CPU aggregated) page state & events */ - long state_local[MEMCG_NR_STAT]; + long state_local[MEMCG_VMSTAT_SIZE]; unsigned long events_local[NR_MEMCG_EVENTS]; /* Pending child counts during tree propagation */ - long state_pending[MEMCG_NR_STAT]; + long state_pending[MEMCG_VMSTAT_SIZE]; unsigned long events_pending[NR_MEMCG_EVENTS]; /* Stats updates since the last flush */ @@ -714,6 +847,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) { struct memcg_vmstats_percpu *statc; int cpu = smp_processor_id(); + unsigned int stats_updates; if (!val) return; @@ -721,8 +855,9 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) cgroup_rstat_updated(memcg->css.cgroup, cpu); statc = this_cpu_ptr(memcg->vmstats_percpu); for (; statc; statc = statc->parent) { - statc->stats_updates += abs(val); - if (statc->stats_updates < MEMCG_CHARGE_BATCH) + stats_updates = READ_ONCE(statc->stats_updates) + abs(val); + WRITE_ONCE(statc->stats_updates, stats_updates); + if (stats_updates < MEMCG_CHARGE_BATCH) continue; /* @@ -730,9 +865,9 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) * redundant. Avoid the overhead of the atomic update. 
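The new init_memcg_stats()/memcg_stats_index() pair above shrinks the per-cpu and aggregate state arrays from MEMCG_NR_STAT entries to just the items memcg tracks: the translation table stores dense_index + 1 in an int8_t so zero means "not tracked", and lookups return -1 for untracked items (hence the WARN_ONCE checks). A user-space model of the scheme (plain C, invented enum):

#include <stdio.h>

enum stat { STAT_A, STAT_B, STAT_C, STAT_D, STAT_E, NR_STATS };

static const int tracked[] = { STAT_B, STAT_D };        /* the subset we keep */
#define NR_TRACKED (sizeof(tracked) / sizeof(tracked[0]))

static signed char stat_index[NR_STATS];        /* 0 = missing, else dense + 1 */
static long dense_state[NR_TRACKED];

static void init_index(void)
{
        int i, j = 0;

        for (i = 0; i < (int)NR_TRACKED; i++)
                stat_index[tracked[i]] = ++j;
}

static int lookup_index(int idx)
{
        return stat_index[idx] - 1;     /* -1 when the item is not tracked */
}

int main(void)
{
        init_index();
        if (lookup_index(STAT_D) >= 0)
                dense_state[lookup_index(STAT_D)] += 42;
        printf("STAT_B -> %d, STAT_C -> %d, STAT_D -> %d\n",
               lookup_index(STAT_B), lookup_index(STAT_C), lookup_index(STAT_D));
        printf("dense slot for STAT_D holds %ld\n",
               dense_state[lookup_index(STAT_D)]);
        return 0;
}

Running it prints STAT_B -> 0, STAT_C -> -1, STAT_D -> 1: two tracked items fit in a two-slot array no matter how large the enum grows.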
*/ if (!memcg_vmstats_needs_flush(statc->vmstats)) - atomic64_add(statc->stats_updates, + atomic64_add(stats_updates, &statc->vmstats->stats_updates); - statc->stats_updates = 0; + WRITE_ONCE(statc->stats_updates, 0); } } @@ -784,7 +919,13 @@ static void flush_memcg_stats_dwork(struct work_struct *w) unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { - long x = READ_ONCE(memcg->vmstats->state[idx]); + long x; + int i = memcg_stats_index(idx); + + if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + return 0; + + x = READ_ONCE(memcg->vmstats->state[i]); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -814,20 +955,31 @@ static int memcg_state_val_in_pages(int idx, int val) * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item * @val: delta to add to the counter, can be negative */ -void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) +void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, + int val) { + int i = memcg_stats_index(idx); + if (mem_cgroup_disabled()) return; - __this_cpu_add(memcg->vmstats_percpu->state[idx], val); + if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + return; + + __this_cpu_add(memcg->vmstats_percpu->state[i], val); memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val)); } /* idx can be of type enum memcg_stat_item or node_stat_item. */ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { - long x = READ_ONCE(memcg->vmstats->state_local[idx]); + long x; + int i = memcg_stats_index(idx); + if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + return 0; + + x = READ_ONCE(memcg->vmstats->state_local[i]); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -835,11 +987,16 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) return x; } -void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, - int val) +static void __mod_memcg_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, + int val) { struct mem_cgroup_per_node *pn; struct mem_cgroup *memcg; + int i = memcg_stats_index(idx); + + if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + return; pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); memcg = pn->memcg; @@ -856,8 +1013,6 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, case NR_ANON_MAPPED: case NR_FILE_MAPPED: case NR_ANON_THPS: - case NR_SHMEM_PMDMAPPED: - case NR_FILE_PMDMAPPED: WARN_ON_ONCE(!in_task()); break; default: @@ -866,10 +1021,10 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, } /* Update memcg */ - __this_cpu_add(memcg->vmstats_percpu->state[idx], val); + __this_cpu_add(memcg->vmstats_percpu->state[i], val); /* Update lruvec */ - __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); + __this_cpu_add(pn->lruvec_stats_percpu->state[i], val); memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val)); memcg_stats_unlock(); @@ -951,34 +1106,38 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { - int index = memcg_events_index(idx); + int i = memcg_events_index(idx); + + if (mem_cgroup_disabled()) + return; - if (mem_cgroup_disabled() || index < 0) + if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) return; memcg_stats_lock(); - __this_cpu_add(memcg->vmstats_percpu->events[index], count); + 
__this_cpu_add(memcg->vmstats_percpu->events[i], count); memcg_rstat_updated(memcg, count); memcg_stats_unlock(); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) { - int index = memcg_events_index(event); + int i = memcg_events_index(event); - if (index < 0) + if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, event)) return 0; - return READ_ONCE(memcg->vmstats->events[index]); + + return READ_ONCE(memcg->vmstats->events[i]); } static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) { - int index = memcg_events_index(event); + int i = memcg_events_index(event); - if (index < 0) + if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, event)) return 0; - return READ_ONCE(memcg->vmstats->events_local[index]); + return READ_ONCE(memcg->vmstats->events_local[i]); } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, @@ -2029,8 +2188,6 @@ static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) if (current->in_user_fault) { css_get(&memcg->css); current->memcg_in_oom = memcg; - current->memcg_oom_gfp_mask = mask; - current->memcg_oom_order = order; } return false; } @@ -2309,6 +2466,7 @@ static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; + unsigned int stock_pages; unsigned long flags; bool ret = false; @@ -2318,8 +2476,9 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); - if (memcg == READ_ONCE(stock->cached) && stock->nr_pages >= nr_pages) { - stock->nr_pages -= nr_pages; + stock_pages = READ_ONCE(stock->nr_pages); + if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) { + WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages); ret = true; } @@ -2333,16 +2492,18 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) */ static void drain_stock(struct memcg_stock_pcp *stock) { + unsigned int stock_pages = READ_ONCE(stock->nr_pages); struct mem_cgroup *old = READ_ONCE(stock->cached); if (!old) return; - if (stock->nr_pages) { - page_counter_uncharge(&old->memory, stock->nr_pages); + if (stock_pages) { + page_counter_uncharge(&old->memory, stock_pages); if (do_memsw_account()) - page_counter_uncharge(&old->memsw, stock->nr_pages); - stock->nr_pages = 0; + page_counter_uncharge(&old->memsw, stock_pages); + + WRITE_ONCE(stock->nr_pages, 0); } css_put(&old->css); @@ -2368,8 +2529,7 @@ static void drain_local_stock(struct work_struct *dummy) clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - if (old) - obj_cgroup_put(old); + obj_cgroup_put(old); } /* @@ -2379,6 +2539,7 @@ static void drain_local_stock(struct work_struct *dummy) static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; + unsigned int stock_pages; stock = this_cpu_ptr(&memcg_stock); if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */ @@ -2386,9 +2547,10 @@ static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) css_get(&memcg->css); WRITE_ONCE(stock->cached, memcg); } - stock->nr_pages += nr_pages; + stock_pages = READ_ONCE(stock->nr_pages) + nr_pages; + WRITE_ONCE(stock->nr_pages, stock_pages); - if (stock->nr_pages > MEMCG_CHARGE_BATCH) + if (stock_pages > MEMCG_CHARGE_BATCH) drain_stock(stock); } @@ -2427,7 +2589,7 @@ static 
void drain_all_stock(struct mem_cgroup *root_memcg) rcu_read_lock(); memcg = READ_ONCE(stock->cached); - if (memcg && stock->nr_pages && + if (memcg && READ_ONCE(stock->nr_pages) && mem_cgroup_is_descendant(memcg, root_memcg)) flush = true; else if (obj_stock_flush_required(stock, root_memcg)) @@ -2977,88 +3139,44 @@ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) } #ifdef CONFIG_MEMCG_KMEM -/* - * The allocated objcg pointers array is not accounted directly. - * Moreover, it should not come from DMA buffer and is not readily - * reclaimable. So those GFP bits should be masked off. - */ -#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ - __GFP_ACCOUNT | __GFP_NOFAIL) -/* - * mod_objcg_mlstate() may be called with irq enabled, so - * mod_memcg_lruvec_state() should be used. - */ -static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, - struct pglist_data *pgdat, - enum node_stat_item idx, int nr) +static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg, + struct pglist_data *pgdat, + enum node_stat_item idx, int nr) { struct mem_cgroup *memcg; struct lruvec *lruvec; + lockdep_assert_irqs_disabled(); + rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); lruvec = mem_cgroup_lruvec(memcg, pgdat); - mod_memcg_lruvec_state(lruvec, idx, nr); + __mod_memcg_lruvec_state(lruvec, idx, nr); rcu_read_unlock(); } -int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, - gfp_t gfp, bool new_slab) -{ - unsigned int objects = objs_per_slab(s, slab); - unsigned long memcg_data; - void *vec; - - gfp &= ~OBJCGS_CLEAR_MASK; - vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, - slab_nid(slab)); - if (!vec) - return -ENOMEM; - - memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; - if (new_slab) { - /* - * If the slab is brand new and nobody can yet access its - * memcg_data, no synchronization is required and memcg_data can - * be simply assigned. - */ - slab->memcg_data = memcg_data; - } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) { - /* - * If the slab is already in use, somebody can allocate and - * assign obj_cgroups in parallel. In this case the existing - * objcg vector should be reused. - */ - kfree(vec); - return 0; - } - - kmemleak_not_leak(vec); - return 0; -} - static __always_inline struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) { /* * Slab objects are accounted individually, not per-page. * Memcg membership data for each individual object is saved in - * slab->memcg_data. + * slab->obj_exts. */ if (folio_test_slab(folio)) { - struct obj_cgroup **objcgs; + struct slabobj_ext *obj_exts; struct slab *slab; unsigned int off; slab = folio_slab(folio); - objcgs = slab_objcgs(slab); - if (!objcgs) + obj_exts = slab_obj_exts(slab); + if (!obj_exts) return NULL; off = obj_to_index(slab->slab_cache, slab, p); - if (objcgs[off]) - return obj_cgroup_memcg(objcgs[off]); + if (obj_exts[off].objcg) + return obj_cgroup_memcg(obj_exts[off].objcg); return NULL; } @@ -3066,7 +3184,7 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) /* * folio_memcg_check() is used here, because in theory we can encounter * a folio where the slab flag has been cleared already, but - * slab->memcg_data has not been freed yet + * slab->obj_exts has not been freed yet * folio_memcg_check() will guarantee that a proper memory * cgroup pointer or NULL will be returned. 
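An illustrative userspace sketch (not kernel code) of the lookup pattern in mem_cgroup_from_obj_folio() above: each object in a slab has a slot in a parallel metadata vector, found by turning the object pointer into an index with simple address arithmetic. The sizes and the "owner" field are invented; the kernel's obj_to_index() and slab_obj_exts() handle layout details this sketch ignores.

#include <stdio.h>
#include <stdlib.h>

#define OBJ_SIZE      64        /* fixed object size in this fake "cache" */
#define OBJS_PER_SLAB 8

struct obj_meta {
    const char *owner;          /* stand-in for struct slabobj_ext::objcg */
};

static size_t obj_to_index(const void *slab_base, const void *obj)
{
    return ((const char *)obj - (const char *)slab_base) / OBJ_SIZE;
}

int main(void)
{
    char *slab = calloc(OBJS_PER_SLAB, OBJ_SIZE);       /* fake slab memory */
    struct obj_meta meta[OBJS_PER_SLAB] = { 0 };        /* fake obj_exts vector */

    void *obj = slab + 3 * OBJ_SIZE;                    /* "allocate" object #3 */
    meta[obj_to_index(slab, obj)].owner = "group-A";    /* record ownership */

    printf("object %p -> index %zu -> owner %s\n",
           obj, obj_to_index(slab, obj),
           meta[obj_to_index(slab, obj)].owner);

    free(slab);
    return 0;
}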
*/ @@ -3144,8 +3262,7 @@ static struct obj_cgroup *current_objcg_update(void) if (old) { old = (struct obj_cgroup *) ((unsigned long)old & ~CURRENT_OBJCG_UPDATE_FLAG); - if (old) - obj_cgroup_put(old); + obj_cgroup_put(old); old = NULL; } @@ -3355,7 +3472,7 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) obj_cgroup_put(objcg); } -void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, +static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) { struct memcg_stock_pcp *stock; @@ -3383,12 +3500,12 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, struct pglist_data *oldpg = stock->cached_pgdat; if (stock->nr_slab_reclaimable_b) { - mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, + __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, stock->nr_slab_reclaimable_b); stock->nr_slab_reclaimable_b = 0; } if (stock->nr_slab_unreclaimable_b) { - mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, + __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, stock->nr_slab_unreclaimable_b); stock->nr_slab_unreclaimable_b = 0; } @@ -3414,11 +3531,10 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, } } if (nr) - mod_objcg_mlstate(objcg, pgdat, idx, nr); + __mod_objcg_mlstate(objcg, pgdat, idx, nr); local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - if (old) - obj_cgroup_put(old); + obj_cgroup_put(old); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) @@ -3481,13 +3597,13 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) */ if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) { if (stock->nr_slab_reclaimable_b) { - mod_objcg_mlstate(old, stock->cached_pgdat, + __mod_objcg_mlstate(old, stock->cached_pgdat, NR_SLAB_RECLAIMABLE_B, stock->nr_slab_reclaimable_b); stock->nr_slab_reclaimable_b = 0; } if (stock->nr_slab_unreclaimable_b) { - mod_objcg_mlstate(old, stock->cached_pgdat, + __mod_objcg_mlstate(old, stock->cached_pgdat, NR_SLAB_UNRECLAIMABLE_B, stock->nr_slab_unreclaimable_b); stock->nr_slab_unreclaimable_b = 0; @@ -3545,8 +3661,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, } local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - if (old) - obj_cgroup_put(old); + obj_cgroup_put(old); if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); @@ -3601,27 +3716,119 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) refill_obj_stock(objcg, size, true); } +static inline size_t obj_full_size(struct kmem_cache *s) +{ + /* + * For each accounted object there is an extra space which is used + * to store obj_cgroup membership. Charge it too. + */ + return s->size + sizeof(struct obj_cgroup *); +} + +bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, + gfp_t flags, size_t size, void **p) +{ + struct obj_cgroup *objcg; + struct slab *slab; + unsigned long off; + size_t i; + + /* + * The obtained objcg pointer is safe to use within the current scope, + * defined by current task or set_active_memcg() pair. + * obj_cgroup_get() is used to get a permanent reference. + */ + objcg = current_obj_cgroup(); + if (!objcg) + return true; + + /* + * slab_alloc_node() avoids the NULL check, so we might be called with a + * single NULL object. kmem_cache_alloc_bulk() aborts if it can't fill + * the whole requested size. 
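A back-of-the-envelope sketch of the accounting in obj_full_size() and __memcg_slab_post_alloc_hook() above: every accounted object is charged for its own size plus one pointer that records obj_cgroup membership, and a bulk allocation charges that amount once per object. Plain userspace arithmetic; the numbers are examples only.

#include <stdio.h>

struct fake_cache {
    size_t size;                        /* object size, like kmem_cache::size */
};

static size_t obj_full_size(const struct fake_cache *s)
{
    return s->size + sizeof(void *);    /* object + objcg membership pointer */
}

int main(void)
{
    struct fake_cache s = { .size = 192 };
    size_t nr = 16;                     /* objects in one bulk allocation */

    printf("per object : %zu bytes\n", obj_full_size(&s));
    printf("bulk charge: %zu bytes for %zu objects\n",
           nr * obj_full_size(&s), nr);
    return 0;
}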
+ * return success as there's nothing to free back + */ + if (unlikely(*p == NULL)) + return true; + + flags &= gfp_allowed_mask; + + if (lru) { + int ret; + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_objcg(objcg); + ret = memcg_list_lru_alloc(memcg, lru, flags); + css_put(&memcg->css); + + if (ret) + return false; + } + + if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s))) + return false; + + for (i = 0; i < size; i++) { + slab = virt_to_slab(p[i]); + + if (!slab_obj_exts(slab) && + alloc_slab_obj_exts(slab, s, flags, false)) { + obj_cgroup_uncharge(objcg, obj_full_size(s)); + continue; + } + + off = obj_to_index(s, slab, p[i]); + obj_cgroup_get(objcg); + slab_obj_exts(slab)[off].objcg = objcg; + mod_objcg_state(objcg, slab_pgdat(slab), + cache_vmstat_idx(s), obj_full_size(s)); + } + + return true; +} + +void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, + void **p, int objects, struct slabobj_ext *obj_exts) +{ + for (int i = 0; i < objects; i++) { + struct obj_cgroup *objcg; + unsigned int off; + + off = obj_to_index(s, slab, p[i]); + objcg = obj_exts[off].objcg; + if (!objcg) + continue; + + obj_exts[off].objcg = NULL; + obj_cgroup_uncharge(objcg, obj_full_size(s)); + mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), + -obj_full_size(s)); + obj_cgroup_put(objcg); + } +} #endif /* CONFIG_MEMCG_KMEM */ /* * Because page_memcg(head) is not set on tails, set it now. */ -void split_page_memcg(struct page *head, unsigned int nr) +void split_page_memcg(struct page *head, int old_order, int new_order) { struct folio *folio = page_folio(head); struct mem_cgroup *memcg = folio_memcg(folio); int i; + unsigned int old_nr = 1 << old_order; + unsigned int new_nr = 1 << new_order; if (mem_cgroup_disabled() || !memcg) return; - for (i = 1; i < nr; i++) + for (i = new_nr; i < old_nr; i += new_nr) folio_page(folio, i)->memcg_data = folio->memcg_data; if (folio_memcg_kmem(folio)) - obj_cgroup_get_many(__folio_objcg(folio), nr - 1); + obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1); else - css_get_many(&memcg->css, nr - 1); + css_get_many(&memcg->css, old_nr / new_nr - 1); } #ifdef CONFIG_SWAP @@ -4800,7 +5007,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - mem_cgroup_flush_stats(memcg); + mem_cgroup_flush_stats_ratelimited(memcg); *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); @@ -5428,26 +5635,33 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) } #endif -static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) +static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node); if (!pn) - return 1; + return false; + + pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), + GFP_KERNEL_ACCOUNT, node); + if (!pn->lruvec_stats) + goto fail; pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu, GFP_KERNEL_ACCOUNT); - if (!pn->lruvec_stats_percpu) { - kfree(pn); - return 1; - } + if (!pn->lruvec_stats_percpu) + goto fail; lruvec_init(&pn->lruvec); pn->memcg = memcg; memcg->nodeinfo[node] = pn; - return 0; + return true; +fail: + kfree(pn->lruvec_stats); + kfree(pn); + return false; } static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) @@ -5458,6 +5672,7 @@ static void 
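A sketch of the arithmetic behind the split_page_memcg() change above: splitting an order-old_order folio into order-new_order pieces yields (1 << old_order) / (1 << new_order) folios, the head keeps the existing reference so "pieces - 1" extra memcg/objcg references are taken, and the ownership word is copied at a stride of (1 << new_order) pages. Userspace illustration only.

#include <stdio.h>

int main(void)
{
    unsigned int old_order = 9, new_order = 0;   /* e.g. PMD-sized THP to base pages */
    unsigned int old_nr = 1u << old_order;
    unsigned int new_nr = 1u << new_order;
    unsigned int copies = 0;

    printf("pieces after split      : %u\n", old_nr / new_nr);
    printf("extra references needed : %u\n", old_nr / new_nr - 1);

    for (unsigned int i = new_nr; i < old_nr; i += new_nr)
        copies++;   /* folio_page(folio, i) would get the ownership word here */

    printf("ownership word copied to %u tail heads (stride %u)\n",
           copies, new_nr);
    return 0;
}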
free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) return; free_percpu(pn->lruvec_stats_percpu); + kfree(pn->lruvec_stats); kfree(pn); } @@ -5465,8 +5680,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; - if (memcg->orig_objcg) - obj_cgroup_put(memcg->orig_objcg); + obj_cgroup_put(memcg->orig_objcg); for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); @@ -5501,7 +5715,8 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) goto fail; } - memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL); + memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), + GFP_KERNEL_ACCOUNT); if (!memcg->vmstats) goto fail; @@ -5519,7 +5734,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) } for_each_node(node) - if (alloc_mem_cgroup_per_node_info(memcg, node)) + if (!alloc_mem_cgroup_per_node_info(memcg, node)) goto fail; if (memcg_wb_domain_init(memcg, GFP_KERNEL)) @@ -5585,6 +5800,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { + init_memcg_stats(); init_memcg_events(); page_counter_init(&memcg->memory, NULL); page_counter_init(&memcg->swap, NULL); @@ -5621,7 +5837,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (alloc_shrinker_info(memcg)) goto offline_kmem; - if (unlikely(mem_cgroup_is_root(memcg))) + if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled()) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); lru_gen_online_memcg(memcg); @@ -5756,7 +5972,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); - for (i = 0; i < MEMCG_NR_STAT; i++) { + for (i = 0; i < MEMCG_VMSTAT_SIZE; i++) { /* * Collect the aggregated propagation counts of groups * below us. 
We're in a per-cpu loop here and this is @@ -5811,18 +6027,19 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) for_each_node_state(nid, N_MEMORY) { struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; - struct mem_cgroup_per_node *ppn = NULL; + struct lruvec_stats *lstats = pn->lruvec_stats; + struct lruvec_stats *plstats = NULL; struct lruvec_stats_percpu *lstatc; if (parent) - ppn = parent->nodeinfo[nid]; + plstats = parent->nodeinfo[nid]->lruvec_stats; lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu); - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { - delta = pn->lruvec_stats.state_pending[i]; + for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; i++) { + delta = lstats->state_pending[i]; if (delta) - pn->lruvec_stats.state_pending[i] = 0; + lstats->state_pending[i] = 0; delta_cpu = 0; v = READ_ONCE(lstatc->state[i]); @@ -5833,16 +6050,16 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) } if (delta_cpu) - pn->lruvec_stats.state_local[i] += delta_cpu; + lstats->state_local[i] += delta_cpu; if (delta) { - pn->lruvec_stats.state[i] += delta; - if (ppn) - ppn->lruvec_stats.state_pending[i] += delta; + lstats->state[i] += delta; + if (plstats) + plstats->state_pending[i] += delta; } } } - statc->stats_updates = 0; + WRITE_ONCE(statc->stats_updates, 0); /* We are in a per-cpu loop here, only do the atomic write once */ if (atomic64_read(&memcg->vmstats->stats_updates)) atomic64_set(&memcg->vmstats->stats_updates, 0); @@ -5873,7 +6090,7 @@ static int mem_cgroup_do_precharge(unsigned long count) } union mc_target { - struct page *page; + struct folio *folio; swp_entry_t ent; }; @@ -5965,23 +6182,22 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, } /** - * mem_cgroup_move_account - move account of the page - * @page: the page + * mem_cgroup_move_account - move account of the folio + * @folio: The folio. * @compound: charge the page as compound or small page - * @from: mem_cgroup which the page is moved from. - * @to: mem_cgroup which the page is moved to. @from != @to. + * @from: mem_cgroup which the folio is moved from. + * @to: mem_cgroup which the folio is moved to. @from != @to. * - * The page must be locked and not on the LRU. + * The folio must be locked and not on the LRU. * * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" * from old cgroup. */ -static int mem_cgroup_move_account(struct page *page, +static int mem_cgroup_move_account(struct folio *folio, bool compound, struct mem_cgroup *from, struct mem_cgroup *to) { - struct folio *folio = page_folio(page); struct lruvec *from_vec, *to_vec; struct pglist_data *pgdat; unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1; @@ -6096,7 +6312,7 @@ out: * Return: * * MC_TARGET_NONE - If the pte is not a target for move charge. * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for - * move charge. If @target is not NULL, the page is stored in target->page + * move charge. If @target is not NULL, the folio is stored in target->folio * with extra refcnt taken (Caller should release it). * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a * target for charge migration. 
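A userspace model of the delta propagation done in mem_cgroup_css_rstat_flush() above: each flush folds (a) deltas pended by children and (b) the change in this CPU's counter since the last snapshot into the group's aggregate, then re-pends the sum to the parent. One counter and one "CPU", with invented names.

#include <stdio.h>

struct group {
    long state;          /* hierarchical aggregate */
    long state_local;    /* this group only */
    long state_pending;  /* not yet folded into the parent */
    long cpu_state;      /* per-CPU counter (single CPU here) */
    long cpu_prev;       /* snapshot of cpu_state at last flush */
    struct group *parent;
};

static void flush(struct group *g)
{
    long delta = g->state_pending;   /* what children pended to us */
    long delta_cpu = 0;

    g->state_pending = 0;

    if (g->cpu_state != g->cpu_prev) {
        delta_cpu = g->cpu_state - g->cpu_prev;
        delta += delta_cpu;
        g->cpu_prev = g->cpu_state;
    }

    g->state_local += delta_cpu;
    if (delta) {
        g->state += delta;
        if (g->parent)
            g->parent->state_pending += delta;
    }
}

int main(void)
{
    struct group root = { 0 }, child = { .parent = &root };

    child.cpu_state = 5;   /* 5 updates landed on the child's CPU */
    flush(&child);         /* child: local 5, aggregate 5, pends 5 to root */
    flush(&root);          /* root picks up the pending 5 */

    printf("child state=%ld local=%ld, root state=%ld local=%ld\n",
           child.state, child.state_local, root.state, root.state_local);
    return 0;
}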
If @target is not NULL, the entry is @@ -6110,6 +6326,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { struct page *page = NULL; + struct folio *folio; enum mc_target_type ret = MC_TARGET_NONE; swp_entry_t ent = { .val = 0 }; @@ -6124,9 +6341,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, ptent, &ent); + if (page) + folio = page_folio(page); if (target && page) { - if (!trylock_page(page)) { - put_page(page); + if (!folio_trylock(folio)) { + folio_put(folio); return ret; } /* @@ -6141,8 +6360,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, * Alas, skip moving the page in this case. */ if (!pte_present(ptent) && page_mapped(page)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return ret; } } @@ -6155,18 +6374,18 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, * mem_cgroup_move_account() checks the page is valid or * not under LRU exclusion. */ - if (page_memcg(page) == mc.from) { + if (folio_memcg(folio) == mc.from) { ret = MC_TARGET_PAGE; - if (is_device_private_page(page) || - is_device_coherent_page(page)) + if (folio_is_device_private(folio) || + folio_is_device_coherent(folio)) ret = MC_TARGET_DEVICE; if (target) - target->page = page; + target->folio = folio; } if (!ret || !target) { if (target) - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } } /* @@ -6192,6 +6411,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, union mc_target *target) { struct page *page = NULL; + struct folio *folio; enum mc_target_type ret = MC_TARGET_NONE; if (unlikely(is_swap_pmd(pmd))) { @@ -6201,17 +6421,18 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, } page = pmd_page(pmd); VM_BUG_ON_PAGE(!page || !PageHead(page), page); + folio = page_folio(page); if (!(mc.flags & MOVE_ANON)) return ret; - if (page_memcg(page) == mc.from) { + if (folio_memcg(folio) == mc.from) { ret = MC_TARGET_PAGE; if (target) { - get_page(page); - if (!trylock_page(page)) { - put_page(page); + folio_get(folio); + if (!folio_trylock(folio)) { + folio_put(folio); return MC_TARGET_NONE; } - target->page = page; + target->folio = folio; } } return ret; @@ -6431,7 +6652,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, spinlock_t *ptl; enum mc_target_type target_type; union mc_target target; - struct page *page; + struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -6441,26 +6662,26 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, } target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); if (target_type == MC_TARGET_PAGE) { - page = target.page; - if (isolate_lru_page(page)) { - if (!mem_cgroup_move_account(page, true, + folio = target.folio; + if (folio_isolate_lru(folio)) { + if (!mem_cgroup_move_account(folio, true, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; } - putback_lru_page(page); + folio_putback_lru(folio); } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } else if (target_type == MC_TARGET_DEVICE) { - page = target.page; - if (!mem_cgroup_move_account(page, true, + folio = target.folio; + if (!mem_cgroup_move_account(folio, true, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; } - 
unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } spin_unlock(ptl); return 0; @@ -6483,28 +6704,28 @@ retry: device = true; fallthrough; case MC_TARGET_PAGE: - page = target.page; + folio = target.folio; /* * We can have a part of the split pmd here. Moving it * can be done but it would be too convoluted so simply * ignore such a partial THP and keep it in original * memcg. There should be somebody mapping the head. */ - if (PageTransCompound(page)) + if (folio_test_large(folio)) goto put; - if (!device && !isolate_lru_page(page)) + if (!device && !folio_isolate_lru(folio)) goto put; - if (!mem_cgroup_move_account(page, false, + if (!mem_cgroup_move_account(folio, false, mc.from, mc.to)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; } if (!device) - putback_lru_page(page); + folio_putback_lru(folio); put: /* get_mctgt_type() gets & locks the page */ - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); break; case MC_TARGET_SWAP: ent = target.ent; @@ -6613,8 +6834,7 @@ static void mem_cgroup_exit(struct task_struct *task) objcg = (struct obj_cgroup *) ((unsigned long)objcg & ~CURRENT_OBJCG_UPDATE_FLAG); - if (objcg) - obj_cgroup_put(objcg); + obj_cgroup_put(objcg); /* * Some kernel allocations can happen after this point, @@ -6977,6 +7197,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; while (nr_reclaimed < nr_to_reclaim) { + /* Will converge on zero, but reclaim enforces a minimum */ + unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4; unsigned long reclaimed; if (signal_pending(current)) @@ -6991,8 +7213,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, lru_add_drain_all(); reclaimed = try_to_free_mem_cgroup_pages(memcg, - min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX), - GFP_KERNEL, reclaim_options); + batch_size, GFP_KERNEL, reclaim_options); if (!reclaimed && !nr_retries--) return -EAGAIN; @@ -7440,6 +7661,9 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) struct obj_cgroup *objcg; VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + VM_BUG_ON_FOLIO(folio_order(folio) > 1 && + !folio_test_hugetlb(folio) && + !list_empty(&folio->_deferred_list), folio); /* * Nobody should be changing or seriously looking at @@ -7505,21 +7729,14 @@ void __mem_cgroup_uncharge(struct folio *folio) uncharge_batch(&ug); } -/** - * __mem_cgroup_uncharge_list - uncharge a list of page - * @page_list: list of pages to uncharge - * - * Uncharge a list of pages previously charged with - * __mem_cgroup_charge(). 
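A simulation of the proactive-reclaim batch sizing introduced in memory_reclaim() above: each iteration requests a quarter of what is still missing, which converges toward zero, while the reclaim path itself enforces a floor of SWAP_CLUSTER_MAX (32 pages). The "reclaim always succeeds" assumption below exists only to show how the request sizes evolve.

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

int main(void)
{
    unsigned long nr_to_reclaim = 4096, nr_reclaimed = 0;

    while (nr_reclaimed < nr_to_reclaim) {
        unsigned long batch = (nr_to_reclaim - nr_reclaimed) / 4;

        if (batch < SWAP_CLUSTER_MAX)      /* floor enforced by reclaim */
            batch = SWAP_CLUSTER_MAX;

        printf("request %4lu pages (%lu/%lu done)\n",
               batch, nr_reclaimed, nr_to_reclaim);

        nr_reclaimed += batch;             /* pretend reclaim succeeded */
    }
    return 0;
}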
- */ -void __mem_cgroup_uncharge_list(struct list_head *page_list) +void __mem_cgroup_uncharge_folios(struct folio_batch *folios) { struct uncharge_gather ug; - struct folio *folio; + unsigned int i; uncharge_gather_clear(&ug); - list_for_each_entry(folio, page_list, lru) - uncharge_folio(folio, &ug); + for (i = 0; i < folios->nr; i++) + uncharge_folio(folios->folios[i], &ug); if (ug.memcg) uncharge_batch(&ug); } diff --git a/mm/memfd.c b/mm/memfd.c index d3a1ba4208c9..7d8d3ab3fa37 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -29,29 +29,25 @@ #define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE #define LAST_SCAN 4 /* about 150ms max */ +static bool memfd_folio_has_extra_refs(struct folio *folio) +{ + return folio_ref_count(folio) - folio_mapcount(folio) != + folio_nr_pages(folio); +} + static void memfd_tag_pins(struct xa_state *xas) { - struct page *page; + struct folio *folio; int latency = 0; - int cache_count; lru_add_drain(); xas_lock_irq(xas); - xas_for_each(xas, page, ULONG_MAX) { - cache_count = 1; - if (!xa_is_value(page) && - PageTransHuge(page) && !PageHuge(page)) - cache_count = HPAGE_PMD_NR; - - if (!xa_is_value(page) && - page_count(page) - total_mapcount(page) != cache_count) + xas_for_each(xas, folio, ULONG_MAX) { + if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio)) xas_set_mark(xas, MEMFD_TAG_PINNED); - if (cache_count != 1) - xas_set(xas, page->index + cache_count); - latency += cache_count; - if (latency < XA_CHECK_SCHED) + if (++latency < XA_CHECK_SCHED) continue; latency = 0; @@ -66,16 +62,16 @@ static void memfd_tag_pins(struct xa_state *xas) /* * Setting SEAL_WRITE requires us to verify there's no pending writer. However, * via get_user_pages(), drivers might have some pending I/O without any active - * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages + * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all folios * and see whether it has an elevated ref-count. If so, we tag them and wait for * them to be dropped. * The caller must guarantee that no new user will acquire writable references - * to those pages to avoid races. + * to those folios to avoid races. */ static int memfd_wait_for_pins(struct address_space *mapping) { XA_STATE(xas, &mapping->i_pages, 0); - struct page *page; + struct folio *folio; int error, scan; memfd_tag_pins(&xas); @@ -83,7 +79,6 @@ static int memfd_wait_for_pins(struct address_space *mapping) error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { int latency = 0; - int cache_count; if (!xas_marked(&xas, MEMFD_TAG_PINNED)) break; @@ -95,20 +90,15 @@ static int memfd_wait_for_pins(struct address_space *mapping) xas_set(&xas, 0); xas_lock_irq(&xas); - xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) { + xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) { bool clear = true; - cache_count = 1; - if (!xa_is_value(page) && - PageTransHuge(page) && !PageHuge(page)) - cache_count = HPAGE_PMD_NR; - - if (!xa_is_value(page) && cache_count != - page_count(page) - total_mapcount(page)) { + if (!xa_is_value(folio) && + memfd_folio_has_extra_refs(folio)) { /* * On the last scan, we clean up all those tags * we inserted; but make a note that we still - * found pages pinned. + * found folios pinned. 
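The predicate in memfd_folio_has_extra_refs() above boils down to simple arithmetic: as the helper implies, a folio with no pins is expected to have a reference count equal to its map count plus one reference per page held by the page cache, and any surplus is treated as a pin (for example pages under direct I/O via get_user_pages()). Userspace illustration with made-up numbers.

#include <stdbool.h>
#include <stdio.h>

struct fake_folio {
    long refcount;
    long mapcount;
    long nr_pages;
};

static bool has_extra_refs(const struct fake_folio *f)
{
    return f->refcount - f->mapcount != f->nr_pages;
}

int main(void)
{
    /* order-0 folio, mapped once, plus only the page cache reference: clean */
    struct fake_folio clean  = { .refcount = 2, .mapcount = 1, .nr_pages = 1 };
    /* same folio with one extra reference taken by a GUP pin */
    struct fake_folio pinned = { .refcount = 3, .mapcount = 1, .nr_pages = 1 };

    printf("clean : extra refs? %s\n", has_extra_refs(&clean) ? "yes" : "no");
    printf("pinned: extra refs? %s\n", has_extra_refs(&pinned) ? "yes" : "no");
    return 0;
}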
*/ if (scan == LAST_SCAN) error = -EBUSY; @@ -118,8 +108,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) if (clear) xas_clear_mark(&xas, MEMFD_TAG_PINNED); - latency += cache_count; - if (latency < XA_CHECK_SCHED) + if (++latency < XA_CHECK_SCHED) continue; latency = 0; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9349948f1abf..d3c830e817e3 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -141,7 +141,6 @@ static struct ctl_table memory_failure_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { } }; /* @@ -154,11 +153,23 @@ static int __page_handle_poison(struct page *page) { int ret; - zone_pcp_disable(page_zone(page)); - ret = dissolve_free_huge_page(page); - if (!ret) + /* + * zone_pcp_disable() can't be used here. It will + * hold pcp_batch_high_lock and dissolve_free_hugetlb_folio() might hold + * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap + * optimization is enabled. This will break current lock dependency + * chain and leads to deadlock. + * Disabling pcp before dissolving the page was a deterministic + * approach because we made sure that those pages cannot end up in any + * PCP list. Draining PCP lists expels those pages to the buddy system, + * but nothing guarantees that those pages do not get back to a PCP + * queue if we need to refill those. + */ + ret = dissolve_free_hugetlb_folio(page_folio(page)); + if (!ret) { + drain_all_pages(page_zone(page)); ret = take_page_off_buddy(page); - zone_pcp_enable(page_zone(page)); + } return ret; } @@ -167,8 +178,8 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo { if (hugepage_or_freepage) { /* - * Doing this check for free pages is also fine since dissolve_free_huge_page - * returns 0 for non-hugetlb pages as well. + * Doing this check for free pages is also fine since + * dissolve_free_hugetlb_folio() returns 0 for non-hugetlb folios as well. */ if (__page_handle_poison(page) <= 0) /* @@ -205,6 +216,7 @@ EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); static int hwpoison_filter_dev(struct page *p) { + struct folio *folio = page_folio(p); struct address_space *mapping; dev_t dev; @@ -212,7 +224,7 @@ static int hwpoison_filter_dev(struct page *p) hwpoison_filter_dev_minor == ~0U) return 0; - mapping = page_mapping(p); + mapping = folio_mapping(folio); if (mapping == NULL || mapping->host == NULL) return -EINVAL; @@ -358,20 +370,25 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) * Unknown page type encountered. Try to check whether it can turn PageLRU by * lru_add_drain_all. */ -void shake_page(struct page *p) +void shake_folio(struct folio *folio) { - if (PageHuge(p)) + if (folio_test_hugetlb(folio)) return; /* * TODO: Could shrink slab caches here if a lightweight range-based * shrinker will be available. */ - if (PageSlab(p)) + if (folio_test_slab(folio)) return; lru_add_drain_all(); } -EXPORT_SYMBOL_GPL(shake_page); +EXPORT_SYMBOL_GPL(shake_folio); + +static void shake_page(struct page *page) +{ + shake_folio(page_folio(page)); +} static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, unsigned long address) @@ -416,21 +433,13 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, * not much we can do. We just print a message and ignore otherwise. */ -#define FSDAX_INVALID_PGOFF ULONG_MAX - /* * Schedule a process for later kill. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. 
- * - * Note: @fsdax_pgoff is used only when @p is a fsdax page and a - * filesystem with a memory failure handler has claimed the - * memory_failure event. In all other cases, page->index and - * page->mapping are sufficient for mapping the page back to its - * corresponding user virtual address. */ static void __add_to_kill(struct task_struct *tsk, struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, - unsigned long ksm_addr, pgoff_t fsdax_pgoff) + unsigned long addr) { struct to_kill *tk; @@ -440,12 +449,10 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p, return; } - tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(p, vma); - if (is_zone_device_page(p)) { - if (fsdax_pgoff != FSDAX_INVALID_PGOFF) - tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma); + tk->addr = addr; + if (is_zone_device_page(p)) tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); - } else + else tk->size_shift = page_shift(compound_head(p)); /* @@ -472,10 +479,12 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p, } static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p, - struct vm_area_struct *vma, - struct list_head *to_kill) + struct vm_area_struct *vma, struct list_head *to_kill, + unsigned long addr) { - __add_to_kill(tsk, p, vma, to_kill, 0, FSDAX_INVALID_PGOFF); + if (addr == -EFAULT) + return; + __add_to_kill(tsk, p, vma, to_kill, addr); } #ifdef CONFIG_KSM @@ -491,12 +500,13 @@ static bool task_in_to_kill_list(struct list_head *to_kill, return false; } + void add_to_kill_ksm(struct task_struct *tsk, struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, - unsigned long ksm_addr) + unsigned long addr) { if (!task_in_to_kill_list(to_kill, tsk)) - __add_to_kill(tsk, p, vma, to_kill, ksm_addr, FSDAX_INVALID_PGOFF); + __add_to_kill(tsk, p, vma, to_kill, addr); } #endif /* @@ -598,7 +608,6 @@ struct task_struct *task_early_kill(struct task_struct *tsk, int force_early) static void collect_procs_anon(struct folio *folio, struct page *page, struct list_head *to_kill, int force_early) { - struct vm_area_struct *vma; struct task_struct *tsk; struct anon_vma *av; pgoff_t pgoff; @@ -610,8 +619,10 @@ static void collect_procs_anon(struct folio *folio, struct page *page, pgoff = page_to_pgoff(page); rcu_read_lock(); for_each_process(tsk) { + struct vm_area_struct *vma; struct anon_vma_chain *vmac; struct task_struct *t = task_early_kill(tsk, force_early); + unsigned long addr; if (!t) continue; @@ -620,9 +631,8 @@ static void collect_procs_anon(struct folio *folio, struct page *page, vma = vmac->vma; if (vma->vm_mm != t->mm) continue; - if (!page_mapped_in_vma(page, vma)) - continue; - add_to_kill_anon_file(t, page, vma, to_kill); + addr = page_mapped_in_vma(page, vma); + add_to_kill_anon_file(t, page, vma, to_kill, addr); } } rcu_read_unlock(); @@ -645,6 +655,7 @@ static void collect_procs_file(struct folio *folio, struct page *page, pgoff = page_to_pgoff(page); for_each_process(tsk) { struct task_struct *t = task_early_kill(tsk, force_early); + unsigned long addr; if (!t) continue; @@ -657,8 +668,10 @@ static void collect_procs_file(struct folio *folio, struct page *page, * Assume applications who requested early kill want * to be informed of all such data corruptions. 
*/ - if (vma->vm_mm == t->mm) - add_to_kill_anon_file(t, page, vma, to_kill); + if (vma->vm_mm != t->mm) + continue; + addr = page_address_in_vma(page, vma); + add_to_kill_anon_file(t, page, vma, to_kill, addr); } } rcu_read_unlock(); @@ -670,7 +683,8 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, pgoff_t pgoff) { - __add_to_kill(tsk, p, vma, to_kill, 0, pgoff); + unsigned long addr = vma_address(vma, pgoff, 1); + __add_to_kill(tsk, p, vma, to_kill, addr); } /* @@ -715,9 +729,9 @@ static void collect_procs(struct folio *folio, struct page *page, { if (!folio->mapping) return; - if (unlikely(PageKsm(page))) - collect_procs_ksm(page, tokill, force_early); - else if (PageAnon(page)) + if (unlikely(folio_test_ksm(folio))) + collect_procs_ksm(folio, page, tokill, force_early); + else if (folio_test_anon(folio)) collect_procs_anon(folio, page, tokill, force_early); else collect_procs_file(folio, page, tokill, force_early); @@ -1077,7 +1091,8 @@ out: */ static int me_pagecache_dirty(struct page_state *ps, struct page *p) { - struct address_space *mapping = page_mapping(p); + struct folio *folio = page_folio(p); + struct address_space *mapping = folio_mapping(folio); SetPageError(p); /* TBD: print more information about the file. */ @@ -1206,7 +1221,7 @@ static int me_huge_page(struct page_state *ps, struct page *p) * subpages. */ folio_put(folio); - if (__page_handle_poison(p) >= 0) { + if (__page_handle_poison(p) > 0) { page_ref_inc(p); res = MF_RECOVERED; } else { @@ -1239,7 +1254,6 @@ static int me_huge_page(struct page_state *ps, struct page *p) #define mlock (1UL << PG_mlocked) #define lru (1UL << PG_lru) #define head (1UL << PG_head) -#define slab (1UL << PG_slab) #define reserved (1UL << PG_reserved) static struct page_state error_states[] = { @@ -1249,13 +1263,6 @@ static struct page_state error_states[] = { * PG_buddy pages only make a small fraction of all free pages. */ - /* - * Could in theory check if slab page is free or if we can drop - * currently unused objects without touching them. But just - * treat it as standard kernel for now. - */ - { slab, slab, MF_MSG_SLAB, me_kernel }, - { head, head, MF_MSG_HUGE, me_huge_page }, { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, @@ -1282,7 +1289,6 @@ static struct page_state error_states[] = { #undef mlock #undef lru #undef head -#undef slab #undef reserved static void update_per_node_mf_stats(unsigned long pfn, @@ -1555,24 +1561,24 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. */ -static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, - int flags, struct page *hpage) +static bool hwpoison_user_mappings(struct folio *folio, struct page *p, + unsigned long pfn, int flags) { - struct folio *folio = page_folio(hpage); enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON; struct address_space *mapping; LIST_HEAD(tokill); bool unmap_success; int forcekill; - bool mlocked = PageMlocked(hpage); + bool mlocked = folio_test_mlocked(folio); /* * Here we are interested only in user-mapped pages, so skip any * other types of pages. 
*/ - if (PageReserved(p) || PageSlab(p) || PageTable(p) || PageOffline(p)) + if (folio_test_reserved(folio) || folio_test_slab(folio) || + folio_test_pgtable(folio) || folio_test_offline(folio)) return true; - if (!(PageLRU(hpage) || PageHuge(p))) + if (!(folio_test_lru(folio) || folio_test_hugetlb(folio))) return true; /* @@ -1582,7 +1588,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (!page_mapped(p)) return true; - if (PageSwapCache(p)) { + if (folio_test_swapcache(folio)) { pr_err("%#lx: keeping poisoned page in swap cache\n", pfn); ttu &= ~TTU_HWPOISON; } @@ -1593,11 +1599,11 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * XXX: the dirty test could be racy: set_page_dirty() may not always * be called inside page lock (it's recommended but not enforced). */ - mapping = page_mapping(hpage); - if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && + mapping = folio_mapping(folio); + if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping && mapping_can_writeback(mapping)) { - if (page_mkclean(hpage)) { - SetPageDirty(hpage); + if (folio_mkclean(folio)) { + folio_set_dirty(folio); } else { ttu &= ~TTU_HWPOISON; pr_info("%#lx: corrupted page was clean: dropped without side effects\n", @@ -1612,7 +1618,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, */ collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED); - if (PageHuge(hpage) && !PageAnon(hpage)) { + if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { /* * For hugetlb pages in shared mappings, try_to_unmap * could potentially call huge_pmd_unshare. Because of @@ -1620,7 +1626,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * TTU_RMAP_LOCKED to indicate we have taken the lock * at this higher level. */ - mapping = hugetlb_page_mapping_lock_write(hpage); + mapping = hugetlb_folio_mapping_lock_write(folio); if (mapping) { try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); i_mmap_unlock_write(mapping); @@ -1632,15 +1638,15 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, unmap_success = !page_mapped(p); if (!unmap_success) - pr_err("%#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(p)); + pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n", + pfn, folio_mapcount(page_folio(p))); /* * try_to_unmap() might put mlocked page in lru cache, so call * shake_page() again to ensure that it's flushed. */ if (mlocked) - shake_page(hpage); + shake_folio(folio); /* * Now that the dirty bit has been propagated to the @@ -1652,7 +1658,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. 
*/ - forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) || + forcekill = folio_test_dirty(folio) || (flags & MF_MUST_KILL) || !unmap_success; kill_procs(&tokill, forcekill, !unmap_success, pfn, flags); @@ -2085,7 +2091,7 @@ retry: */ if (res == 0) { folio_unlock(folio); - if (__page_handle_poison(p) >= 0) { + if (__page_handle_poison(p) > 0) { page_ref_inc(p); res = MF_RECOVERED; } else { @@ -2096,7 +2102,7 @@ retry: page_flags = folio->flags; - if (!hwpoison_user_mappings(p, pfn, flags, &folio->page)) { + if (!hwpoison_user_mappings(folio, p, pfn, flags)) { folio_unlock(folio); return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); } @@ -2185,7 +2191,7 @@ out: int memory_failure(unsigned long pfn, int flags) { struct page *p; - struct page *hpage; + struct folio *folio; struct dev_pagemap *pgmap; int res = 0; unsigned long page_flags; @@ -2273,8 +2279,8 @@ try_again: } } - hpage = compound_head(p); - if (PageTransHuge(hpage)) { + folio = page_folio(p); + if (folio_test_large(folio)) { /* * The flag must be set after the refcount is bumped * otherwise it may race with THP split. @@ -2288,12 +2294,13 @@ try_again: * or unhandlable page. The refcount is bumped iff the * page is a valid handlable page. */ - SetPageHasHWPoisoned(hpage); + folio_set_has_hwpoisoned(folio); if (try_to_split_thp_page(p) < 0) { res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); goto unlock_mutex; } VM_BUG_ON_PAGE(!page_count(p), p); + folio = page_folio(p); } /* @@ -2304,9 +2311,9 @@ try_again: * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - shake_page(p); + shake_folio(folio); - lock_page(p); + folio_lock(folio); /* * We're only intended to deal with the non-Compound page here. @@ -2314,11 +2321,11 @@ try_again: * race window. If this happens, we could try again to hopefully * handle the page next round. */ - if (PageCompound(p)) { + if (folio_test_large(folio)) { if (retry) { ClearPageHWPoison(p); - unlock_page(p); - put_page(p); + folio_unlock(folio); + folio_put(folio); flags &= ~MF_COUNT_INCREASED; retry = false; goto try_again; @@ -2334,35 +2341,35 @@ try_again: * folio_remove_rmap_*() in try_to_unmap_one(). So to determine page * status correctly, we save a copy of the page flags at this time. */ - page_flags = p->flags; + page_flags = folio->flags; if (hwpoison_filter(p)) { ClearPageHWPoison(p); - unlock_page(p); - put_page(p); + folio_unlock(folio); + folio_put(folio); res = -EOPNOTSUPP; goto unlock_mutex; } /* - * __munlock_folio() may clear a writeback page's LRU flag without - * page_lock. We need wait writeback completion for this page or it - * may trigger vfs BUG while evict inode. + * __munlock_folio() may clear a writeback folio's LRU flag without + * the folio lock. We need to wait for writeback completion for this + * folio or it may trigger a vfs BUG while evicting inode. */ - if (!PageLRU(p) && !PageWriteback(p)) + if (!folio_test_lru(folio) && !folio_test_writeback(folio)) goto identify_page_state; /* * It's very difficult to mess with pages currently under IO * and in many cases impossible, so we just avoid it here. */ - wait_on_page_writeback(p); + folio_wait_writeback(folio); /* * Now take care of user space mappings. * Abort on fail: __filemap_remove_folio() assumes unmapped page. 
*/ - if (!hwpoison_user_mappings(p, pfn, flags, p)) { + if (!hwpoison_user_mappings(folio, p, pfn, flags)) { res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); goto unlock_page; } @@ -2370,7 +2377,8 @@ try_again: /* * Torn down by someone else? */ - if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { + if (folio_test_lru(folio) && !folio_test_swapcache(folio) && + folio->mapping == NULL) { res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); goto unlock_page; } @@ -2380,7 +2388,7 @@ identify_page_state: mutex_unlock(&mf_mutex); return res; unlock_page: - unlock_page(p); + folio_unlock(folio); unlock_mutex: mutex_unlock(&mf_mutex); return res; @@ -2538,6 +2546,13 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } + if (is_huge_zero_folio(folio)) { + unpoison_pr_info("Unpoison: huge zero page is not supported %#lx\n", + pfn, &unpoison_rs); + ret = -EOPNOTSUPP; + goto unlock_mutex; + } + if (!PageHWPoison(p)) { unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", pfn, &unpoison_rs); @@ -2550,8 +2565,8 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } - if (folio_test_slab(folio) || PageTable(&folio->page) || - folio_test_reserved(folio) || PageOffline(&folio->page)) + if (folio_test_slab(folio) || folio_test_pgtable(folio) || + folio_test_reserved(folio) || folio_test_offline(folio)) goto unlock_mutex; /* @@ -2572,7 +2587,7 @@ int unpoison_memory(unsigned long pfn) ghp = get_hwpoison_page(p, MF_UNPOISON); if (!ghp) { - if (PageHuge(p)) { + if (folio_test_hugetlb(folio)) { huge = true; count = folio_free_raw_hwp(folio, false); if (count == 0) @@ -2588,7 +2603,7 @@ int unpoison_memory(unsigned long pfn) pfn, &unpoison_rs); } } else { - if (PageHuge(p)) { + if (folio_test_hugetlb(folio)) { huge = true; count = folio_free_raw_hwp(folio, false); if (count == 0) { @@ -2666,6 +2681,7 @@ static int soft_offline_in_use_page(struct page *page) struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .reason = MR_MEMORY_FAILURE, }; if (!huge && folio_test_large(folio)) { diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 5462d9e3c84c..6632102bd5c9 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -36,10 +36,15 @@ struct node_memory_type_map { static DEFINE_MUTEX(memory_tier_lock); static LIST_HEAD(memory_tiers); +/* + * The list is used to store all memory types that are not created + * by a device driver. + */ +static LIST_HEAD(default_memory_types); static struct node_memory_type_map node_memory_types[MAX_NUMNODES]; struct memory_dev_type *default_dram_type; -static struct bus_type memory_tier_subsys = { +static const struct bus_type memory_tier_subsys = { .name = "memory_tiering", .dev_name = "memory_tier", }; @@ -108,6 +113,8 @@ static struct demotion_nodes *node_demotion __read_mostly; static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms); +/* The lock is used to protect `default_dram_perf*` info and nid. 
*/ +static DEFINE_MUTEX(default_dram_perf_lock); static bool default_dram_perf_error; static struct access_coordinate default_dram_perf; static int default_dram_perf_ref_nid = NUMA_NO_NODE; @@ -359,6 +366,26 @@ static void disable_all_demotion_targets(void) synchronize_rcu(); } +static void dump_demotion_targets(void) +{ + int node; + + for_each_node_state(node, N_MEMORY) { + struct memory_tier *memtier = __node_get_memory_tier(node); + nodemask_t preferred = node_demotion[node].preferred; + + if (!memtier) + continue; + + if (nodes_empty(preferred)) + pr_info("Demotion targets for Node %d: null\n", node); + else + pr_info("Demotion targets for Node %d: preferred: %*pbl, fallback: %*pbl\n", + node, nodemask_pr_args(&preferred), + nodemask_pr_args(&memtier->lower_tier_mask)); + } +} + /* * Find an automatic demotion target for all memory * nodes. Failing here is OK. It might just indicate @@ -443,7 +470,7 @@ static void establish_demotion_targets(void) * Now build the lower_tier mask for each node collecting node mask from * all memory tier below it. This allows us to fallback demotion page * allocation to a set of nodes that is closer the above selected - * perferred node. + * preferred node. */ lower_tier = node_states[N_MEMORY]; list_for_each_entry(memtier, &memory_tiers, list) { @@ -456,6 +483,8 @@ static void establish_demotion_targets(void) nodes_andnot(lower_tier, lower_tier, tier_nodes); memtier->lower_tier_mask = lower_tier; } + + dump_demotion_targets(); } #else @@ -483,7 +512,8 @@ static inline void __init_node_memory_type(int node, struct memory_dev_type *mem static struct memory_tier *set_node_memory_tier(int node) { struct memory_tier *memtier; - struct memory_dev_type *memtype; + struct memory_dev_type *memtype = default_dram_type; + int adist = MEMTIER_ADISTANCE_DRAM; pg_data_t *pgdat = NODE_DATA(node); @@ -492,7 +522,16 @@ static struct memory_tier *set_node_memory_tier(int node) if (!node_state(node, N_MEMORY)) return ERR_PTR(-EINVAL); - __init_node_memory_type(node, default_dram_type); + mt_calc_adistance(node, &adist); + if (!node_memory_types[node].memtype) { + memtype = mt_find_alloc_memory_type(adist, &default_memory_types); + if (IS_ERR(memtype)) { + memtype = default_dram_type; + pr_info("Failed to allocate a memory type. Fall back.\n"); + } + } + + __init_node_memory_type(node, memtype); memtype = node_memory_types[node].memtype; node_set(node, memtype->nodes); @@ -601,6 +640,64 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype) } EXPORT_SYMBOL_GPL(clear_node_memory_type); +struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types) +{ + struct memory_dev_type *mtype; + + list_for_each_entry(mtype, memory_types, list) + if (mtype->adistance == adist) + return mtype; + + mtype = alloc_memory_type(adist); + if (IS_ERR(mtype)) + return mtype; + + list_add(&mtype->list, memory_types); + + return mtype; +} +EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type); + +void mt_put_memory_types(struct list_head *memory_types) +{ + struct memory_dev_type *mtype, *mtn; + + list_for_each_entry_safe(mtype, mtn, memory_types, list) { + list_del(&mtype->list); + put_memory_type(mtype); + } +} +EXPORT_SYMBOL_GPL(mt_put_memory_types); + +/* + * This is invoked via `late_initcall()` to initialize memory tiers for + * CPU-less memory nodes after driver initialization, which is + * expected to provide `adistance` algorithms. 
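A userspace sketch of the find-or-allocate pattern in mt_find_alloc_memory_type()/mt_put_memory_types() above: memory types are keyed by abstract distance and kept on a caller-owned list, so repeated requests for the same distance share one object. The list implementation and the distance value are illustrative only.

#include <stdio.h>
#include <stdlib.h>

struct mem_type {
    int adistance;
    struct mem_type *next;
};

static struct mem_type *find_alloc_type(int adist, struct mem_type **list)
{
    struct mem_type *t;

    for (t = *list; t; t = t->next)
        if (t->adistance == adist)
            return t;                   /* reuse the existing type */

    t = malloc(sizeof(*t));
    if (!t)
        return NULL;
    t->adistance = adist;
    t->next = *list;                    /* link into the caller's list */
    *list = t;
    return t;
}

static void put_types(struct mem_type **list)
{
    while (*list) {
        struct mem_type *t = *list;
        *list = t->next;
        free(t);
    }
}

int main(void)
{
    struct mem_type *types = NULL;
    struct mem_type *a = find_alloc_type(512, &types);   /* some abstract distance */
    struct mem_type *b = find_alloc_type(512, &types);   /* same distance again */

    printf("same object reused: %s\n", a == b ? "yes" : "no");
    put_types(&types);
    return 0;
}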
+ */ +static int __init memory_tier_late_init(void) +{ + int nid; + + guard(mutex)(&memory_tier_lock); + for_each_node_state(nid, N_MEMORY) { + /* + * Some device drivers may have initialized memory tiers + * between `memory_tier_init()` and `memory_tier_late_init()`, + * potentially bringing online memory nodes and + * configuring memory tiers. Exclude them here. + */ + if (node_memory_types[nid].memtype) + continue; + + set_node_memory_tier(nid); + } + + establish_demotion_targets(); + + return 0; +} +late_initcall(memory_tier_late_init); + static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix) { pr_info( @@ -612,25 +709,19 @@ static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix) int mt_set_default_dram_perf(int nid, struct access_coordinate *perf, const char *source) { - int rc = 0; - - mutex_lock(&memory_tier_lock); - if (default_dram_perf_error) { - rc = -EIO; - goto out; - } + guard(mutex)(&default_dram_perf_lock); + if (default_dram_perf_error) + return -EIO; if (perf->read_latency + perf->write_latency == 0 || - perf->read_bandwidth + perf->write_bandwidth == 0) { - rc = -EINVAL; - goto out; - } + perf->read_bandwidth + perf->write_bandwidth == 0) + return -EINVAL; if (default_dram_perf_ref_nid == NUMA_NO_NODE) { default_dram_perf = *perf; default_dram_perf_ref_nid = nid; default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL); - goto out; + return 0; } /* @@ -658,27 +749,25 @@ int mt_set_default_dram_perf(int nid, struct access_coordinate *perf, pr_info( " disable default DRAM node performance based abstract distance algorithm.\n"); default_dram_perf_error = true; - rc = -EINVAL; + return -EINVAL; } -out: - mutex_unlock(&memory_tier_lock); - return rc; + return 0; } int mt_perf_to_adistance(struct access_coordinate *perf, int *adist) { + guard(mutex)(&default_dram_perf_lock); if (default_dram_perf_error) return -EIO; - if (default_dram_perf_ref_nid == NUMA_NO_NODE) - return -ENOENT; - if (perf->read_latency + perf->write_latency == 0 || perf->read_bandwidth + perf->write_bandwidth == 0) return -EINVAL; - mutex_lock(&memory_tier_lock); + if (default_dram_perf_ref_nid == NUMA_NO_NODE) + return -ENOENT; + /* * The abstract distance of a memory node is in direct proportion to * its memory latency (read + write) and inversely proportional to its @@ -691,7 +780,6 @@ int mt_perf_to_adistance(struct access_coordinate *perf, int *adist) (default_dram_perf.read_latency + default_dram_perf.write_latency) * (default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) / (perf->read_bandwidth + perf->write_bandwidth); - mutex_unlock(&memory_tier_lock); return 0; } @@ -804,7 +892,8 @@ static int __init memory_tier_init(void) * For now we can have 4 faster memory tiers with smaller adistance * than default DRAM tier. */ - default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM); + default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM, + &default_memory_types); if (IS_ERR(default_dram_type)) panic("%s() failed to allocate default DRAM tier\n", __func__); @@ -814,6 +903,14 @@ static int __init memory_tier_init(void) * types assigned. */ for_each_node_state(node, N_MEMORY) { + if (!node_state(node, N_CPU)) + /* + * Defer memory tier initialization on + * CPUless numa nodes. These will be initialized + * after firmware and devices are initialized. 
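A worked example of the proportionality described in mt_perf_to_adistance() above: a node's abstract distance scales with its total (read + write) latency relative to default DRAM and inversely with its total bandwidth. ADISTANCE_DRAM below is a stand-in for the kernel's MEMTIER_ADISTANCE_DRAM, and the performance figures are invented.

#include <stdio.h>

#define ADISTANCE_DRAM 1024   /* placeholder baseline, not the kernel constant */

struct perf {
    unsigned long read_latency, write_latency;       /* e.g. nanoseconds */
    unsigned long read_bandwidth, write_bandwidth;   /* e.g. MB/s */
};

static unsigned long perf_to_adistance(const struct perf *p, const struct perf *dram)
{
    return (unsigned long)ADISTANCE_DRAM *
           (p->read_latency + p->write_latency) /
           (dram->read_latency + dram->write_latency) *
           (dram->read_bandwidth + dram->write_bandwidth) /
           (p->read_bandwidth + p->write_bandwidth);
}

int main(void)
{
    struct perf dram = { 100, 100, 20000, 20000 };
    struct perf cxl  = { 200, 200, 10000, 10000 };   /* 2x latency, half bandwidth */

    /* twice the latency and half the bandwidth gives 4x the abstract distance */
    printf("dram adist ~ %lu\n", perf_to_adistance(&dram, &dram));
    printf("cxl  adist ~ %lu\n", perf_to_adistance(&cxl, &dram));
    return 0;
}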
+ */ + continue; + memtier = set_node_memory_tier(node); if (IS_ERR(memtier)) /* diff --git a/mm/memory.c b/mm/memory.c index 0bfc8b007c01..0f47a533014e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -112,8 +112,10 @@ static bool vmf_pte_changed(struct vm_fault *vmf); * Return true if the original pte was a uffd-wp pte marker (so the pte was * wr-protected). */ -static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) +static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) { + if (!userfaultfd_wp(vmf->vma)) + return false; if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return false; @@ -806,9 +808,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, } rss[MM_SWAPENTS]++; } else if (is_migration_entry(entry)) { - page = pfn_swap_entry_to_page(entry); + folio = pfn_swap_entry_folio(entry); - rss[mm_counter(page)]++; + rss[mm_counter(folio)]++; if (!is_readable_migration_entry(entry) && is_cow_mapping(vm_flags)) { @@ -840,7 +842,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * keep things as they are. */ folio_get(folio); - rss[mm_counter(page)]++; + rss[mm_counter(folio)]++; /* Cannot fail as these pages cannot get pinned. */ folio_try_dup_anon_rmap_pte(folio, page, src_vma); @@ -930,68 +932,111 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma return 0; } +static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, + pte_t pte, unsigned long addr, int nr) +{ + struct mm_struct *src_mm = src_vma->vm_mm; + + /* If it's a COW mapping, write protect it both processes. */ + if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) { + wrprotect_ptes(src_mm, addr, src_pte, nr); + pte = pte_wrprotect(pte); + } + + /* If it's a shared mapping, mark it clean in the child. */ + if (src_vma->vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + + if (!userfaultfd_wp(dst_vma)) + pte = pte_clear_uffd_wp(pte); + + set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr); +} + /* - * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page - * is required to copy this pte. + * Copy one present PTE, trying to batch-process subsequent PTEs that map + * consecutive pages of the same folio by copying them as well. + * + * Returns -EAGAIN if one preallocated page is required to copy the next PTE. + * Otherwise, returns the number of copied PTEs (at least 1). */ static inline int -copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, - pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, - struct folio **prealloc) +copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, + int max_nr, int *rss, struct folio **prealloc) { - struct mm_struct *src_mm = src_vma->vm_mm; - unsigned long vm_flags = src_vma->vm_flags; - pte_t pte = ptep_get(src_pte); struct page *page; struct folio *folio; + bool any_writable; + fpb_t flags = 0; + int err, nr; page = vm_normal_page(src_vma, addr, pte); - if (page) - folio = page_folio(page); - if (page && folio_test_anon(folio)) { + if (unlikely(!page)) + goto copy_pte; + + folio = page_folio(page); + + /* + * If we likely have to copy, just don't bother with batching. Make + * sure that the common "small folio" case is as fast as possible + * by keeping the batching logic separate. 
+ */ + if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { + if (src_vma->vm_flags & VM_SHARED) + flags |= FPB_IGNORE_DIRTY; + if (!vma_soft_dirty_enabled(src_vma)) + flags |= FPB_IGNORE_SOFT_DIRTY; + + nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, + &any_writable, NULL, NULL); + folio_ref_add(folio, nr); + if (folio_test_anon(folio)) { + if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, + nr, src_vma))) { + folio_ref_sub(folio, nr); + return -EAGAIN; + } + rss[MM_ANONPAGES] += nr; + VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); + } else { + folio_dup_file_rmap_ptes(folio, page, nr); + rss[mm_counter_file(folio)] += nr; + } + if (any_writable) + pte = pte_mkwrite(pte, src_vma); + __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, + addr, nr); + return nr; + } + + folio_get(folio); + if (folio_test_anon(folio)) { /* * If this page may have been pinned by the parent process, * copy the page immediately for the child so that we'll always * guarantee the pinned page won't be randomly replaced in the * future. */ - folio_get(folio); if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) { /* Page may be pinned, we have to copy. */ folio_put(folio); - return copy_present_page(dst_vma, src_vma, dst_pte, src_pte, - addr, rss, prealloc, page); + err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, + addr, rss, prealloc, page); + return err ? err : 1; } rss[MM_ANONPAGES]++; - } else if (page) { - folio_get(folio); + VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); + } else { folio_dup_file_rmap_pte(folio, page); - rss[mm_counter_file(page)]++; + rss[mm_counter_file(folio)]++; } - /* - * If it's a COW mapping, write protect it both - * in the parent and the child - */ - if (is_cow_mapping(vm_flags) && pte_write(pte)) { - ptep_set_wrprotect(src_mm, addr, src_pte); - pte = pte_wrprotect(pte); - } - VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page)); - - /* - * If it's a shared mapping, mark it clean in - * the child - */ - if (vm_flags & VM_SHARED) - pte = pte_mkclean(pte); - pte = pte_mkold(pte); - - if (!userfaultfd_wp(dst_vma)) - pte = pte_clear_uffd_wp(pte); - - set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); - return 0; +copy_pte: + __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1); + return 1; } static inline struct folio *folio_prealloc(struct mm_struct *src_mm, @@ -1028,10 +1073,11 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *src_pte, *dst_pte; pte_t ptent; spinlock_t *src_ptl, *dst_ptl; - int progress, ret = 0; + int progress, max_nr, ret = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; struct folio *prealloc = NULL; + int nr; again: progress = 0; @@ -1062,6 +1108,8 @@ again: arch_enter_lazy_mmu_mode(); do { + nr = 1; + /* * We are holding two locks at this point - either of them * could generate latencies in another task on another CPU. 
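copy_present_ptes() above batches all PTEs that map consecutive pages of one large folio (via folio_pte_batch()) and then write-protects them with a single wrprotect_ptes() call instead of one ptep_set_wrprotect() per PTE. Where an architecture does not override it, the generic helper is essentially the old per-PTE loop; a rough sketch of the <linux/pgtable.h> fallback:

    static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
                                      pte_t *ptep, unsigned int nr)
    {
            for (;;) {
                    ptep_set_wrprotect(mm, addr, ptep);
                    if (--nr == 0)
                            break;
                    ptep++;
                    addr += PAGE_SIZE;
            }
    }

The payoff comes on architectures such as arm64 with contiguous-PTE mappings, which can provide a batched implementation that updates a whole contpte block at once.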
@@ -1091,6 +1139,8 @@ again: progress += 8; continue; } + ptent = ptep_get(src_pte); + VM_WARN_ON_ONCE(!pte_present(ptent)); /* * Device exclusive entry restored, continue by copying @@ -1098,9 +1148,10 @@ again: */ WARN_ON_ONCE(ret != -ENOENT); } - /* copy_present_pte() will clear `*prealloc' if consumed */ - ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, - addr, rss, &prealloc); + /* copy_present_ptes() will clear `*prealloc' if consumed */ + max_nr = (end - addr) / PAGE_SIZE; + ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, + ptent, addr, max_nr, rss, &prealloc); /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. @@ -1117,8 +1168,10 @@ again: folio_put(prealloc); prealloc = NULL; } - progress += 8; - } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + nr = ret; + progress += 8 * nr; + } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr, + addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(orig_src_pte, src_ptl); @@ -1139,7 +1192,7 @@ again: prealloc = folio_prealloc(src_mm, src_vma, addr, false); if (!prealloc) return -ENOMEM; - } else if (ret) { + } else if (ret < 0) { VM_WARN_ON_ONCE(1); } @@ -1369,19 +1422,16 @@ static inline bool should_zap_cows(struct zap_details *details) return details->even_cows; } -/* Decides whether we should zap this page with the page pointer specified */ -static inline bool should_zap_page(struct zap_details *details, struct page *page) +/* Decides whether we should zap this folio with the folio pointer specified */ +static inline bool should_zap_folio(struct zap_details *details, + struct folio *folio) { - /* If we can make a decision without *page.. */ + /* If we can make a decision without *folio.. */ if (should_zap_cows(details)) return true; - /* E.g. the caller passes NULL for the case of a zero page */ - if (!page) - return true; - - /* Otherwise we should only zap non-anon pages */ - return !PageAnon(page); + /* Otherwise we should only zap non-anon folios */ + return !folio_test_anon(folio); } static inline bool zap_drop_file_uffd_wp(struct zap_details *details) @@ -1398,7 +1448,7 @@ static inline bool zap_drop_file_uffd_wp(struct zap_details *details) */ static inline void zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, - unsigned long addr, pte_t *pte, + unsigned long addr, pte_t *pte, int nr, struct zap_details *details, pte_t pteval) { /* Zap on anonymous always means dropping everything */ @@ -1408,7 +1458,118 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, if (zap_drop_file_uffd_wp(details)) return; - pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); + for (;;) { + /* the PFN in the PTE is irrelevant. 
*/ + pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); + if (--nr == 0) + break; + pte++; + addr += PAGE_SIZE; + } +} + +static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, + struct vm_area_struct *vma, struct folio *folio, + struct page *page, pte_t *pte, pte_t ptent, unsigned int nr, + unsigned long addr, struct zap_details *details, int *rss, + bool *force_flush, bool *force_break) +{ + struct mm_struct *mm = tlb->mm; + bool delay_rmap = false; + + if (!folio_test_anon(folio)) { + ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); + if (pte_dirty(ptent)) { + folio_mark_dirty(folio); + if (tlb_delay_rmap(tlb)) { + delay_rmap = true; + *force_flush = true; + } + } + if (pte_young(ptent) && likely(vma_has_recency(vma))) + folio_mark_accessed(folio); + rss[mm_counter(folio)] -= nr; + } else { + /* We don't need up-to-date accessed/dirty bits. */ + clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); + rss[MM_ANONPAGES] -= nr; + } + /* Checking a single PTE in a batch is sufficient. */ + arch_check_zapped_pte(vma, ptent); + tlb_remove_tlb_entries(tlb, pte, nr, addr); + if (unlikely(userfaultfd_pte_wp(vma, ptent))) + zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, + ptent); + + if (!delay_rmap) { + folio_remove_rmap_ptes(folio, page, nr, vma); + + if (unlikely(folio_mapcount(folio) < 0)) + print_bad_pte(vma, addr, ptent, page); + } + + if (want_init_mlocked_on_free() && folio_test_mlocked(folio) && + !delay_rmap && folio_test_anon(folio)) { + kernel_init_pages(page, folio_nr_pages(folio)); + } + + if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) { + *force_flush = true; + *force_break = true; + } +} + +/* + * Zap or skip at least one present PTE, trying to batch-process subsequent + * PTEs that map consecutive pages of the same folio. + * + * Returns the number of processed (skipped or zapped) PTEs (at least 1). + */ +static inline int zap_present_ptes(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, pte_t ptent, + unsigned int max_nr, unsigned long addr, + struct zap_details *details, int *rss, bool *force_flush, + bool *force_break) +{ + const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; + struct mm_struct *mm = tlb->mm; + struct folio *folio; + struct page *page; + int nr; + + page = vm_normal_page(vma, addr, ptent); + if (!page) { + /* We don't need up-to-date accessed/dirty bits. */ + ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + arch_check_zapped_pte(vma, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + if (userfaultfd_pte_wp(vma, ptent)) + zap_install_uffd_wp_if_needed(vma, addr, pte, 1, + details, ptent); + ksm_might_unmap_zero_page(mm, ptent); + return 1; + } + + folio = page_folio(page); + if (unlikely(!should_zap_folio(details, folio))) + return 1; + + /* + * Make sure that the common "small folio" case is as fast as possible + * by keeping the batching logic separate. 
+ */ + if (unlikely(folio_test_large(folio) && max_nr != 1)) { + nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, + NULL, NULL, NULL); + + zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, + addr, details, rss, force_flush, + force_break); + return nr; + } + zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr, + details, rss, force_flush, force_break); + return 1; } static unsigned long zap_pte_range(struct mmu_gather *tlb, @@ -1416,13 +1577,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, struct zap_details *details) { + bool force_flush = false, force_break = false; struct mm_struct *mm = tlb->mm; - int force_flush = 0; int rss[NR_MM_COUNTERS]; spinlock_t *ptl; pte_t *start_pte; pte_t *pte; swp_entry_t entry; + int nr; tlb_change_page_size(tlb, PAGE_SIZE); init_rss_vec(rss); @@ -1436,7 +1598,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t ptent = ptep_get(pte); struct folio *folio; struct page *page; + int max_nr; + nr = 1; if (pte_none(ptent)) continue; @@ -1444,44 +1608,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, break; if (pte_present(ptent)) { - unsigned int delay_rmap; - - page = vm_normal_page(vma, addr, ptent); - if (unlikely(!should_zap_page(details, page))) - continue; - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); - arch_check_zapped_pte(vma, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); - zap_install_uffd_wp_if_needed(vma, addr, pte, details, - ptent); - if (unlikely(!page)) { - ksm_might_unmap_zero_page(mm, ptent); - continue; - } - - folio = page_folio(page); - delay_rmap = 0; - if (!folio_test_anon(folio)) { - if (pte_dirty(ptent)) { - folio_mark_dirty(folio); - if (tlb_delay_rmap(tlb)) { - delay_rmap = 1; - force_flush = 1; - } - } - if (pte_young(ptent) && likely(vma_has_recency(vma))) - folio_mark_accessed(folio); - } - rss[mm_counter(page)]--; - if (!delay_rmap) { - folio_remove_rmap_pte(folio, page, vma); - if (unlikely(page_mapcount(page) < 0)) - print_bad_pte(vma, addr, ptent, page); - } - if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) { - force_flush = 1; - addr += PAGE_SIZE; + max_nr = (end - addr) / PAGE_SIZE; + nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr, + addr, details, rss, &force_flush, + &force_break); + if (unlikely(force_break)) { + addr += nr * PAGE_SIZE; break; } continue; @@ -1492,7 +1624,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, is_device_exclusive_entry(entry)) { page = pfn_swap_entry_to_page(entry); folio = page_folio(page); - if (unlikely(!should_zap_page(details, page))) + if (unlikely(!should_zap_folio(details, folio))) continue; /* * Both device private/exclusive mappings should only @@ -1501,22 +1633,23 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, * see zap_install_uffd_wp_if_needed(). 
*/ WARN_ON_ONCE(!vma_is_anonymous(vma)); - rss[mm_counter(page)]--; + rss[mm_counter(folio)]--; if (is_device_private_entry(entry)) folio_remove_rmap_pte(folio, page, vma); folio_put(folio); } else if (!non_swap_entry(entry)) { - /* Genuine swap entry, hence a private anon page */ + max_nr = (end - addr) / PAGE_SIZE; + nr = swap_pte_batch(pte, max_nr, ptent); + /* Genuine swap entries, hence a private anon pages */ if (!should_zap_cows(details)) continue; - rss[MM_SWAPENTS]--; - if (unlikely(!free_swap_and_cache(entry))) - print_bad_pte(vma, addr, ptent, NULL); + rss[MM_SWAPENTS] -= nr; + free_swap_and_cache_nr(entry, nr); } else if (is_migration_entry(entry)) { - page = pfn_swap_entry_to_page(entry); - if (!should_zap_page(details, page)) + folio = pfn_swap_entry_folio(entry); + if (!should_zap_folio(details, folio)) continue; - rss[mm_counter(page)]--; + rss[mm_counter(folio)]--; } else if (pte_marker_entry_uffd_wp(entry)) { /* * For anon: always drop the marker; for file: only @@ -1534,9 +1667,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pr_alert("unrecognized swap entry 0x%lx\n", entry.val); WARN_ON_ONCE(1); } - pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); - zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); - } while (pte++, addr += PAGE_SIZE, addr != end); + clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); + zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); + } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); @@ -1870,7 +2003,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, return -EBUSY; /* Ok, finally just insert the thing.. */ folio_get(folio); - inc_mm_counter(vma->vm_mm, mm_counter_file(page)); + inc_mm_counter(vma->vm_mm, mm_counter_file(folio)); folio_add_file_rmap_pte(folio, page, vma); set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); return 0; @@ -2640,7 +2773,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long next; int err = 0; - BUG_ON(pud_huge(*pud)); + BUG_ON(pud_leaf(*pud)); if (create) { pmd = pmd_alloc_track(mm, pud, addr, mask); @@ -3081,19 +3214,39 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) return VM_FAULT_RETRY; } -static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) +/** + * vmf_anon_prepare - Prepare to handle an anonymous fault. + * @vmf: The vm_fault descriptor passed from the fault handler. + * + * When preparing to insert an anonymous page into a VMA from a + * fault handler, call this function rather than anon_vma_prepare(). + * If this vma does not already have an associated anon_vma and we are + * only protected by the per-VMA lock, the caller must retry with the + * mmap_lock held. __anon_vma_prepare() will look at adjacent VMAs to + * determine if this VMA can share its anon_vma, and that's not safe to + * do with only the per-VMA lock held for this VMA. + * + * Return: 0 if fault handling can proceed. Any other value should be + * returned to the caller. 
+ */ +vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret = 0; if (likely(vma->anon_vma)) return 0; if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; + if (!mmap_read_trylock(vma->vm_mm)) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } } if (__anon_vma_prepare(vma)) - return VM_FAULT_OOM; - return 0; + ret = VM_FAULT_OOM; + if (vmf->flags & FAULT_FLAG_VMA_LOCK) + mmap_read_unlock(vma->vm_mm); + return ret; } /* @@ -3175,7 +3328,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { if (old_folio) { if (!folio_test_anon(old_folio)) { - dec_mm_counter(mm, mm_counter_file(&old_folio->page)); + dec_mm_counter(mm, mm_counter_file(old_folio)); inc_mm_counter(mm, MM_ANONPAGES); } } else { @@ -3204,13 +3357,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) ptep_clear_flush(vma, vmf->address, vmf->pte); folio_add_new_anon_rmap(new_folio, vma, vmf->address); folio_add_lru_vma(new_folio, vma); - /* - * We call the notify macro here because, when using secondary - * mmu page tables (such as kvm shadow page tables), we want the - * new page to be mapped directly into the secondary page table. - */ BUG_ON(unshare && pte_write(entry)); - set_pte_at_notify(mm, vmf->address, vmf->pte, entry); + set_pte_at(mm, vmf->address, vmf->pte, entry); update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); if (old_folio) { /* @@ -3253,7 +3401,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) folio_put(new_folio); if (old_folio) { if (page_copied) - free_swap_cache(&old_folio->page); + free_swap_cache(old_folio); folio_put(old_folio); } @@ -3376,6 +3524,16 @@ static bool wp_can_reuse_anon_folio(struct folio *folio, struct vm_area_struct *vma) { /* + * We could currently only reuse a subpage of a large folio if no + * other subpages of the large folios are still mapped. However, + * let's just consistently not reuse subpages even if we could + * reuse in that scenario, and give back a large folio a bit + * sooner. + */ + if (folio_test_large(folio)) + return false; + + /* * We have to verify under folio lock: these early checks are * just an optimization to avoid locking the folio and freeing * the swapcache if there is little hope that we can reuse. @@ -4055,7 +4213,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * when reading from swap. This metadata may be indexed by swap entry * so this must be called before swap_free(). */ - arch_swap_restore(entry, folio); + arch_swap_restore(folio_swap(entry, folio), folio); /* * Remove the swap entry and conditionally try to free up the swapcache. @@ -4170,8 +4328,8 @@ static bool pte_range_none(pte_t *pte, int nr_pages) static struct folio *alloc_anon_folio(struct vm_fault *vmf) { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE struct vm_area_struct *vma = vmf->vma; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long orders; struct folio *folio; unsigned long addr; @@ -4191,8 +4349,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. 
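With vmf_anon_prepare() reworked as above (it now tries mmap_read_trylock() so that __anon_vma_prepare() can run even when the fault is handled under the per-VMA lock), callers follow the pattern that do_anonymous_page() switches to later in this diff; roughly:

    vm_fault_t ret;

    ret = vmf_anon_prepare(vmf);
    if (ret)
            return ret;     /* VM_FAULT_RETRY if the mmap lock was contended,
                             * or VM_FAULT_OOM from __anon_vma_prepare() */
    /* vma->anon_vma is valid from here on */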
*/ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true, - BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, + TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) @@ -4217,21 +4375,32 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) pte_unmap(pte); + if (!orders) + goto fallback; + /* Try allocating the highest of the remaining orders. */ gfp = vma_thp_gfp_mask(vma); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); folio = vma_alloc_folio(gfp, order, vma, addr, true); if (folio) { + if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); + folio_put(folio); + goto next; + } + folio_throttle_swaprate(folio, gfp); clear_huge_page(&folio->page, vmf->address, 1 << order); return folio; } +next: + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); order = next_order(&orders, order); } fallback: #endif - return vma_alloc_zeroed_movable_folio(vmf->vma, vmf->address); + return folio_prealloc(vma->vm_mm, vma, vmf->address, true); } /* @@ -4241,7 +4410,6 @@ fallback: */ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { - bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address; struct folio *folio; @@ -4286,8 +4454,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) } /* Allocate our own private page. */ - if (unlikely(anon_vma_prepare(vma))) - goto oom; + ret = vmf_anon_prepare(vmf); + if (ret) + return ret; /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */ folio = alloc_anon_folio(vmf); if (IS_ERR(folio)) @@ -4298,10 +4467,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) nr_pages = folio_nr_pages(folio); addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); - if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) - goto oom_free_page; - folio_throttle_swaprate(folio, GFP_KERNEL); - /* * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before @@ -4339,10 +4504,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) folio_ref_add(folio, nr_pages - 1); add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); +#endif folio_add_new_anon_rmap(folio, vma, addr); folio_add_lru_vma(folio, vma); setpte: - if (uffd_wp) + if (vmf_orig_pte_uffd_wp(vmf)) entry = pte_mkuffd_wp(entry); set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages); @@ -4355,8 +4523,6 @@ unlock: release: folio_put(folio); goto unlock; -oom_free_page: - folio_put(folio); oom: return VM_FAULT_OOM; } @@ -4480,7 +4646,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); + add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR); folio_add_file_rmap_pmd(folio, page, vma); /* @@ -4519,7 +4685,6 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, struct page *page, unsigned int nr, unsigned long addr) { struct vm_area_struct *vma = vmf->vma; - bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); bool write = vmf->flags & FAULT_FLAG_WRITE; bool prefault = in_range(vmf->address, addr, nr * PAGE_SIZE); pte_t entry; @@ -4534,16 +4699,14 @@ void set_pte_range(struct vm_fault *vmf, struct folio 
*folio, if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (unlikely(uffd_wp)) + if (unlikely(vmf_orig_pte_uffd_wp(vmf))) entry = pte_mkuffd_wp(entry); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { - add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr); VM_BUG_ON_FOLIO(nr != 1, folio); folio_add_new_anon_rmap(folio, vma, addr); folio_add_lru_vma(folio, vma); } else { - add_mm_counter(vma->vm_mm, mm_counter_file(page), nr); folio_add_file_rmap_ptes(folio, page, nr, vma); } set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr); @@ -4580,9 +4743,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct page *page; vm_fault_t ret; + bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) && + !(vma->vm_flags & VM_SHARED); /* Did we COW the page? */ - if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) + if (is_cow) page = vmf->cow_page; else page = vmf->page; @@ -4618,8 +4783,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf) /* Re-check under ptl */ if (likely(!vmf_pte_changed(vmf))) { struct folio *folio = page_folio(page); + int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio); set_pte_range(vmf, folio, page, 1, vmf->address); + add_mm_counter(vma->vm_mm, type, 1); ret = 0; } else { update_mmu_tlb(vma, vmf->address, vmf->pte); @@ -4653,7 +4820,8 @@ static int fault_around_bytes_set(void *data, u64 val) * The minimum value is 1 page, however this results in no fault-around * at all. See should_fault_around(). */ - fault_around_pages = max(rounddown_pow_of_two(val) >> PAGE_SHIFT, 1UL); + val = max(val, PAGE_SIZE); + fault_around_pages = rounddown_pow_of_two(val) >> PAGE_SHIFT; return 0; } @@ -4899,9 +5067,11 @@ static vm_fault_t do_fault(struct vm_fault *vmf) return ret; } -int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, +int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf, unsigned long addr, int page_nid, int *flags) { + struct vm_area_struct *vma = vmf->vma; + folio_get(folio); /* Record the current PID acceesing VMA */ @@ -4913,7 +5083,55 @@ int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, *flags |= TNF_FAULT_LOCAL; } - return mpol_misplaced(folio, vma, addr); + return mpol_misplaced(folio, vmf, addr); +} + +static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long fault_addr, pte_t *fault_pte, + bool writable) +{ + pte_t pte, old_pte; + + old_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte); + pte = pte_modify(old_pte, vma->vm_page_prot); + pte = pte_mkyoung(pte); + if (writable) + pte = pte_mkwrite(pte, vma); + ptep_modify_prot_commit(vma, fault_addr, fault_pte, old_pte, pte); + update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1); +} + +static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma, + struct folio *folio, pte_t fault_pte, + bool ignore_writable, bool pte_write_upgrade) +{ + int nr = pte_pfn(fault_pte) - folio_pfn(folio); + unsigned long start = max(vmf->address - nr * PAGE_SIZE, vma->vm_start); + unsigned long end = min(vmf->address + (folio_nr_pages(folio) - nr) * PAGE_SIZE, vma->vm_end); + pte_t *start_ptep = vmf->pte - (vmf->address - start) / PAGE_SIZE; + unsigned long addr; + + /* Restore all PTEs' mapping of the large folio */ + for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) { + pte_t ptent = ptep_get(start_ptep); + bool writable = false; + + if (!pte_present(ptent) || !pte_protnone(ptent)) + continue; + + if 
(pfn_folio(pte_pfn(ptent)) != folio) + continue; + + if (!ignore_writable) { + ptent = pte_modify(ptent, vma->vm_page_prot); + writable = pte_write(ptent); + if (!writable && pte_write_upgrade && + can_change_pte_writable(vma, addr, ptent)) + writable = true; + } + + numa_rebuild_single_mapping(vmf, vma, addr, start_ptep, writable); + } } static vm_fault_t do_numa_page(struct vm_fault *vmf) @@ -4921,25 +5139,26 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct folio *folio = NULL; int nid = NUMA_NO_NODE; - bool writable = false; + bool writable = false, ignore_writable = false; + bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma); int last_cpupid; int target_nid; pte_t pte, old_pte; - int flags = 0; + int flags = 0, nr_pages; /* - * The "pte" at this point cannot be used safely without - * validation through pte_unmap_same(). It's of NUMA type but - * the pfn may be screwed if the read is non atomic. + * The pte cannot be used safely until we verify, while holding the page + * table lock, that its contents have not changed during fault handling. */ spin_lock(vmf->ptl); - if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { + /* Read the live PTE from the page tables: */ + old_pte = ptep_get(vmf->pte); + + if (unlikely(!pte_same(old_pte, vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } - /* Get the normal PTE */ - old_pte = ptep_get(vmf->pte); pte = pte_modify(old_pte, vma->vm_page_prot); /* @@ -4947,7 +5166,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * is only valid while holding the PT lock. */ writable = pte_write(pte); - if (!writable && vma_wants_manual_pte_write_upgrade(vma) && + if (!writable && pte_write_upgrade && can_change_pte_writable(vma, vmf->address, pte)) writable = true; @@ -4955,10 +5174,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) if (!folio || folio_is_zone_device(folio)) goto out_map; - /* TODO: handle PTE-mapped THP */ - if (folio_test_large(folio)) - goto out_map; - /* * Avoid grouping on RO pages in general. RO pages shouldn't hurt as * much anyway since they can be in shared cache state. This misses @@ -4974,10 +5189,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * Flag if the folio is shared between multiple address spaces. This * is later used when determining whether to group tasks together */ - if (folio_estimated_sharers(folio) > 1 && (vma->vm_flags & VM_SHARED)) + if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) flags |= TNF_SHARED; nid = folio_nid(folio); + nr_pages = folio_nr_pages(folio); /* * For memory tiering mode, cpupid of slow memory page is used * to record page access time. So use default value. 
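In numa_rebuild_large_mapping() above, "nr" is the index of the faulting page within its folio and [start, end) is the slice of the folio that this VMA can actually map. As a worked example with made-up numbers: for a 16-page folio whose third page (nr = 2) faulted at vmf->address, start = vmf->address - 2 * PAGE_SIZE and end = vmf->address + 14 * PAGE_SIZE, both clamped to the VMA bounds, and start_ptep is rewound by the same two entries, so the loop visits one PTE per mapped page of the folio and skips any entry that is no longer prot-none or that meanwhile maps a different folio.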
@@ -4987,13 +5203,14 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) last_cpupid = (-1 & LAST_CPUPID_MASK); else last_cpupid = folio_last_cpupid(folio); - target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags); + target_nid = numa_migrate_prep(folio, vmf, vmf->address, nid, &flags); if (target_nid == NUMA_NO_NODE) { folio_put(folio); goto out_map; } pte_unmap_unlock(vmf->pte, vmf->ptl); writable = false; + ignore_writable = true; /* Migrate to the requested node */ if (migrate_misplaced_folio(folio, vma, target_nid)) { @@ -5014,20 +5231,19 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) out: if (nid != NUMA_NO_NODE) - task_numa_fault(last_cpupid, nid, 1, flags); + task_numa_fault(last_cpupid, nid, nr_pages, flags); return 0; out_map: /* * Make it present again, depending on how arch implements * non-accessible ptes, some can allow access by kernel mode. */ - old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); - pte = pte_modify(old_pte, vma->vm_page_prot); - pte = pte_mkyoung(pte); - if (writable) - pte = pte_mkwrite(pte, vma); - ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); - update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); + if (folio && folio_test_large(folio)) + numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable, + pte_write_upgrade); + else + numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte, + writable); pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -5238,7 +5454,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - thp_vma_allowable_order(vma, vm_flags, false, true, true, PUD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, + TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5272,7 +5489,8 @@ retry_pud: goto retry_pud; if (pmd_none(*vmf.pmd) && - thp_vma_allowable_order(vma, vm_flags, false, true, true, PMD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, + TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5626,15 +5844,6 @@ retry: if (!vma_start_read(vma)) goto inval; - /* - * find_mergeable_anon_vma uses adjacent vmas which are not locked. - * This check must happen after vma_start_read(); otherwise, a - * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA - * from its anon_vma. - */ - if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) - goto inval_end_read; - /* Check since vm_start/vm_end might change before we lock the VMA */ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) goto inval_end_read; @@ -5732,34 +5941,48 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) /** * follow_pte - look up PTE at a user virtual address - * @mm: the mm_struct of the target address space + * @vma: the memory mapping * @address: user virtual address * @ptepp: location to store found PTE * @ptlp: location to store the lock for the PTE * * On a successful return, the pointer to the PTE is stored in @ptepp; * the corresponding lock is taken and its location is stored in @ptlp. - * The contents of the PTE are only stable until @ptlp is released; - * any further use, if any, must be protected against invalidation - * with MMU notifiers. + * + * The contents of the PTE are only stable until @ptlp is released using + * pte_unmap_unlock(). This function will fail if the PTE is non-present. 
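Since follow_pte() now takes the VMA and checks both the address range and VM_IO/VM_PFNMAP itself, a caller looks roughly like the generic_access_phys() conversion further down. A minimal sketch (mm, vma and addr assumed to be in scope; error handling and MMU-notifier protection omitted):

    pte_t *ptep;
    spinlock_t *ptl;
    pte_t pte;

    mmap_read_lock(mm);
    if (!follow_pte(vma, addr, &ptep, &ptl)) {
            pte = ptep_get(ptep);           /* stable only until the unlock */
            pte_unmap_unlock(ptep, ptl);
            /* pte_pfn(pte)/pte_pgprot(pte) may be used with care here */
    }
    mmap_read_unlock(mm);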
+ * Present PTEs may include PTEs that map refcounted pages, such as + * anonymous folios in COW mappings. + * + * Callers must be careful when relying on PTE content after + * pte_unmap_unlock(). Especially if the PTE maps a refcounted page, + * callers must protect against invalidation with MMU notifiers; otherwise + * access to the PFN at a later point in time can trigger use-after-free. * * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore * should be taken for read. * - * KVM uses this function. While it is arguably less bad than ``follow_pfn``, - * it is not a good general-purpose API. + * This function must not be used to modify PTE content. * * Return: zero on success, -ve otherwise. */ -int follow_pte(struct mm_struct *mm, unsigned long address, +int follow_pte(struct vm_area_struct *vma, unsigned long address, pte_t **ptepp, spinlock_t **ptlp) { + struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *ptep; + mmap_assert_locked(mm); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + goto out; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; + pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) goto out; @@ -5789,67 +6012,7 @@ out: } EXPORT_SYMBOL_GPL(follow_pte); -/** - * follow_pfn - look up PFN at a user virtual address - * @vma: memory mapping - * @address: user virtual address - * @pfn: location to store found PFN - * - * Only IO mappings and raw PFN mappings are allowed. - * - * This function does not allow the caller to read the permissions - * of the PTE. Do not use it. - * - * Return: zero and the pfn at @pfn on success, -ve otherwise. - */ -int follow_pfn(struct vm_area_struct *vma, unsigned long address, - unsigned long *pfn) -{ - int ret = -EINVAL; - spinlock_t *ptl; - pte_t *ptep; - - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return ret; - - ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); - if (ret) - return ret; - *pfn = pte_pfn(ptep_get(ptep)); - pte_unmap_unlock(ptep, ptl); - return 0; -} -EXPORT_SYMBOL(follow_pfn); - #ifdef CONFIG_HAVE_IOREMAP_PROT -int follow_phys(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned long *prot, resource_size_t *phys) -{ - int ret = -EINVAL; - pte_t *ptep, pte; - spinlock_t *ptl; - - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - goto out; - - if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) - goto out; - pte = ptep_get(ptep); - - if ((flags & FOLL_WRITE) && !pte_write(pte)) - goto unlock; - - *prot = pgprot_val(pte_pgprot(pte)); - *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; - - ret = 0; -unlock: - pte_unmap_unlock(ptep, ptl); -out: - return ret; -} - /** * generic_access_phys - generic implementation for iomem mmap access * @vma: the vma to access @@ -5873,11 +6036,8 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, int offset = offset_in_page(addr); int ret = -EINVAL; - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return -EINVAL; - retry: - if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) + if (follow_pte(vma, addr, &ptep, &ptl)) return -EINVAL; pte = ptep_get(ptep); pte_unmap_unlock(ptep, ptl); @@ -5892,7 +6052,7 @@ retry: if (!maddr) return -ENOMEM; - if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) + if (follow_pte(vma, addr, &ptep, &ptl)) goto out_unmap; if (!pte_same(pte, ptep_get(ptep))) { @@ -6050,21 +6210,14 @@ void print_vma_addr(char *prefix, unsigned long ip) if (!mmap_read_trylock(mm)) return; - vma = find_vma(mm, ip); + vma = 
vma_lookup(mm, ip); if (vma && vma->vm_file) { struct file *f = vma->vm_file; - char *buf = (char *)__get_free_page(GFP_NOWAIT); - if (buf) { - char *p; - - p = file_path(f, buf, PAGE_SIZE); - if (IS_ERR(p)) - p = "?"; - printk("%s%s[%lx+%lx]", prefix, kbasename(p), - vma->vm_start, - vma->vm_end - vma->vm_start); - free_page((unsigned long)buf); - } + ip -= vma->vm_start; + ip += vma->vm_pgoff << PAGE_SHIFT; + printk("%s%pD[%lx,%lx+%lx]", prefix, f, ip, + vma->vm_start, + vma->vm_end - vma->vm_start); } mmap_read_unlock(mm); } @@ -6163,7 +6316,7 @@ static int clear_subpage(unsigned long addr, int idx, void *arg) { struct page *page = arg; - clear_user_highpage(page + idx, addr); + clear_user_highpage(nth_page(page, idx), addr); return 0; } @@ -6213,10 +6366,11 @@ struct copy_subpage_arg { static int copy_subpage(unsigned long addr, int idx, void *arg) { struct copy_subpage_arg *copy_arg = arg; + struct page *dst = nth_page(copy_arg->dst, idx); + struct page *src = nth_page(copy_arg->src, idx); - if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx, - addr, copy_arg->vma)) { - memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0); + if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) { + memory_failure_queue(page_to_pfn(src), 0); return -EHWPOISON; } return 0; @@ -6299,3 +6453,15 @@ void ptlock_free(struct ptdesc *ptdesc) kmem_cache_free(page_ptl_cachep, ptdesc->ptl); } #endif + +void vma_pgtable_walk_begin(struct vm_area_struct *vma) +{ + if (is_vm_hugetlb_page(vma)) + hugetlb_vma_lock_read(vma); +} + +void vma_pgtable_walk_end(struct vm_area_struct *vma) +{ + if (is_vm_hugetlb_page(vma)) + hugetlb_vma_unlock_read(vma); +} diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 21890994c1d3..431b1f6753c0 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1087,7 +1087,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group, } int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone) + struct zone *zone, bool mhp_off_inaccessible) { unsigned long end_pfn = pfn + nr_pages; int ret, i; @@ -1096,6 +1096,15 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, if (ret) return ret; + /* + * Memory block is accessible at this stage and hence poison the struct + * pages now. If the memory block is accessible during memory hotplug + * addition phase, then page poisining is already performed in + * sparse_add_section(). + */ + if (mhp_off_inaccessible) + page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); for (i = 0; i < nr_pages; i++) @@ -1328,7 +1337,7 @@ static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) } #endif -static bool mhp_supports_memmap_on_memory(unsigned long size) +bool mhp_supports_memmap_on_memory(void) { unsigned long vmemmap_size = memory_block_memmap_size(); unsigned long memmap_pages = memory_block_memmap_on_memory_pages(); @@ -1337,17 +1346,11 @@ static bool mhp_supports_memmap_on_memory(unsigned long size) * Besides having arch support and the feature enabled at runtime, we * need a few more assumptions to hold true: * - * a) We span a single memory block: memory onlining/offlinin;g happens - * in memory block granularity. We don't want the vmemmap of online - * memory blocks to reside on offline memory blocks. In the future, - * we might want to support variable-sized memory blocks to make the - * feature more versatile. 
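The clear_subpage()/copy_subpage() changes above matter because the struct pages of a large folio are only guaranteed to be virtually contiguous when SPARSEMEM_VMEMMAP is enabled; with classic SPARSEMEM, plain "page + idx" arithmetic can step across a memory section boundary. nth_page() hides that difference; simplified from <linux/mm.h>:

    #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
    #define nth_page(page, n)       pfn_to_page(page_to_pfn((page)) + (n))
    #else
    #define nth_page(page, n)       ((page) + (n))
    #endif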
- * - * b) The vmemmap pages span complete PMDs: We don't want vmemmap code + * a) The vmemmap pages span complete PMDs: We don't want vmemmap code * to populate memory from the altmap for unrelated parts (i.e., * other memory blocks) * - * c) The vmemmap pages (and thereby the pages that will be exposed to + * b) The vmemmap pages (and thereby the pages that will be exposed to * the buddy) have to cover full pageblocks: memory onlining/offlining * code requires applicable ranges to be page-aligned, for example, to * set the migratetypes properly. @@ -1359,7 +1362,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long size) * altmap as an alternative source of memory, and we do not exactly * populate a single PMD. */ - if (!mhp_memmap_on_memory() || size != memory_block_size_bytes()) + if (!mhp_memmap_on_memory()) return false; /* @@ -1382,6 +1385,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long size) return arch_supports_memmap_on_memory(vmemmap_size); } +EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory); static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size) { @@ -1415,7 +1419,7 @@ static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size) } static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, - u64 start, u64 size) + u64 start, u64 size, mhp_t mhp_flags) { unsigned long memblock_size = memory_block_size_bytes(); u64 cur_start; @@ -1431,6 +1435,8 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, }; mhp_altmap.free = memory_block_memmap_on_memory_pages(); + if (mhp_flags & MHP_OFFLINE_INACCESSIBLE) + mhp_altmap.inaccessible = true; params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap), GFP_KERNEL); if (!params.altmap) { @@ -1515,8 +1521,8 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) * Self hosted memmap array */ if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) && - mhp_supports_memmap_on_memory(memory_block_size_bytes())) { - ret = create_altmaps_and_memory_blocks(nid, group, start, size); + mhp_supports_memmap_on_memory()) { + ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags); if (ret) goto error; } else { @@ -1835,6 +1841,7 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) struct migration_target_control mtc = { .nmask = &nmask, .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .reason = MR_MEMORY_HOTPLUG, }; int ret; @@ -2044,11 +2051,11 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, } /* - * Dissolve free hugepages in the memory block before doing + * Dissolve free hugetlb folios in the memory block before doing * offlining actually in order to make hugetlbfs's object * counting consistent. */ - ret = dissolve_free_huge_pages(start_pfn, end_pfn); + ret = dissolve_free_hugetlb_folios(start_pfn, end_pfn); if (ret) { reason = "failure to dissolve huge pages"; goto failed_removal_isolated; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 10a590ee1c89..aec756ae5637 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -19,6 +19,13 @@ * for anonymous memory. For process policy an process counter * is used. * + * weighted interleave + * Allocate memory interleaved over a set of nodes based on + * a set of weights (per-node), with normal fallback if it + * fails. Otherwise operates the same as interleave. + * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated + * on node 0 for every 1 page allocated on node 1. 
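For context, the new MPOL_WEIGHTED_INTERLEAVE mode is driven from userspace through set_mempolicy(2)/mbind(2) plus the per-node weight files this series adds under /sys/kernel/mm/mempolicy/weighted_interleave/. A hedged userspace sketch (the hard-coded mode value is an assumption for illustration; on current kernels it comes from uapi <linux/mempolicy.h>):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* from uapi <linux/mempolicy.h> on current kernels; value assumed here */
    #define MPOL_WEIGHTED_INTERLEAVE        6

    int main(void)
    {
            /*
             * Interleave this task's allocations over nodes 0 and 1.
             * Weights come from the sysfs files, e.g.
             *   echo 2 > /sys/kernel/mm/mempolicy/weighted_interleave/node0
             *   echo 1 > /sys/kernel/mm/mempolicy/weighted_interleave/node1
             * for the 2:1 split described in the comment above.
             */
            unsigned long nodemask = (1UL << 0) | (1UL << 1);

            if (syscall(SYS_set_mempolicy, MPOL_WEIGHTED_INTERLEAVE,
                        &nodemask, 8 * sizeof(nodemask) + 1))
                    perror("set_mempolicy");
            return 0;
    }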
+ * * bind Only allocate memory on a specific set of nodes, * no fallback. * FIXME: memory is allocated starting with the first node @@ -131,6 +138,32 @@ static struct mempolicy default_policy = { static struct mempolicy preferred_node_policy[MAX_NUMNODES]; +/* + * iw_table is the sysfs-set interleave weight table, a value of 0 denotes + * system-default value should be used. A NULL iw_table also denotes that + * system-default values should be used. Until the system-default table + * is implemented, the system-default is always 1. + * + * iw_table is RCU protected + */ +static u8 __rcu *iw_table; +static DEFINE_MUTEX(iw_table_lock); + +static u8 get_il_weight(int node) +{ + u8 *table; + u8 weight; + + rcu_read_lock(); + table = rcu_dereference(iw_table); + /* if no iw_table, use system default */ + weight = table ? table[node] : 1; + /* if value in iw_table is 0, use system default */ + weight = weight ? weight : 1; + rcu_read_unlock(); + return weight; +} + /** * numa_nearest_node - Find nearest node by state * @node: Node id to start the search @@ -415,6 +448,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_preferred, }, + [MPOL_WEIGHTED_INTERLEAVE] = { + .create = mpol_new_nodemask, + .rebind = mpol_rebind_nodemask, + }, }; static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, @@ -472,8 +509,8 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) qp->nr_failed++; return; } - folio = pfn_folio(pmd_pfn(*pmd)); - if (is_huge_zero_page(&folio->page)) { + folio = pmd_folio(*pmd); + if (is_huge_zero_folio(folio)) { walk->action = ACTION_CONTINUE; return; } @@ -605,12 +642,11 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * - * To check if the folio is shared, ideally we want to make sure - * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated sharers of the folio instead. + * See folio_likely_mapped_shared() on possible imprecision when we + * cannot easily detect if a folio is shared. */ if ((flags & MPOL_MF_MOVE_ALL) || - (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte))) + (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) if (!isolate_hugetlb(folio, qp->pagelist)) qp->nr_failed++; unlock: @@ -654,7 +690,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, { struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; - unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; /* range check first */ @@ -682,9 +717,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, !(flags & MPOL_MF_STRICT)) return 1; - if (endvma > end) - endvma = end; - /* * Check page nodes, and queue pages to move, in the current vma. * But if no moving, and no strict checking, the scan can be skipped. 
@@ -836,8 +868,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, old = current->mempolicy; current->mempolicy = new; - if (new && new->mode == MPOL_INTERLEAVE) + if (new && (new->mode == MPOL_INTERLEAVE || + new->mode == MPOL_WEIGHTED_INTERLEAVE)) { current->il_prev = MAX_NUMNODES-1; + current->il_weight = 0; + } task_unlock(current); mpol_put(old); ret = 0; @@ -862,6 +897,7 @@ static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: + case MPOL_WEIGHTED_INTERLEAVE: *nodes = pol->nodes; break; case MPOL_LOCAL: @@ -946,6 +982,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { *policy = next_node_in(current->il_prev, pol->nodes); + } else if (pol == current->mempolicy && + pol->mode == MPOL_WEIGHTED_INTERLEAVE) { + if (current->il_weight) + *policy = current->il_prev; + else + *policy = next_node_in(current->il_prev, + pol->nodes); } else { err = -EINVAL; goto out; @@ -988,11 +1031,10 @@ static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * - * To check if the folio is shared, ideally we want to make sure - * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated sharers of the folio instead. + * See folio_likely_mapped_shared() on possible imprecision when we + * cannot easily detect if a folio is shared. */ - if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { + if ((flags & MPOL_MF_MOVE_ALL) || !folio_likely_mapped_shared(folio)) { if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, foliolist); node_stat_mod_folio(folio, @@ -1026,6 +1068,7 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest, struct migration_target_control mtc = { .nid = dest, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, + .reason = MR_SYSCALL, }; nodes_clear(nmask); @@ -1183,7 +1226,8 @@ static struct folio *alloc_migration_target_by_mpol(struct folio *src, h = folio_hstate(src); gfp = htlb_alloc_mask(h); nodemask = policy_nodemask(gfp, pol, ilx, &nid); - return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp); + return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp, + htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND)); } if (folio_test_large(src)) @@ -1310,30 +1354,32 @@ static long do_mbind(unsigned long start, unsigned long len, * VMAs, the nodes will still be interleaved from the targeted * nodemask, but one by one may be selected differently. 
*/ - if (new->mode == MPOL_INTERLEAVE) { - struct page *page; + if (new->mode == MPOL_INTERLEAVE || + new->mode == MPOL_WEIGHTED_INTERLEAVE) { + struct folio *folio; unsigned int order; unsigned long addr = -EFAULT; - list_for_each_entry(page, &pagelist, lru) { - if (!PageKsm(page)) + list_for_each_entry(folio, &pagelist, lru) { + if (!folio_test_ksm(folio)) break; } - if (!list_entry_is_head(page, &pagelist, lru)) { + if (!list_entry_is_head(folio, &pagelist, lru)) { vma_iter_init(&vmi, mm, start); for_each_vma_range(vmi, vma, end) { - addr = page_address_in_vma(page, vma); + addr = page_address_in_vma( + folio_page(folio, 0), vma); if (addr != -EFAULT) break; } } if (addr != -EFAULT) { - order = compound_order(page); + order = folio_order(folio); /* We already know the pol, but not the ilx */ mpol_cond_put(get_vma_policy(vma, addr, order, &mmpol.ilx)); /* Set base from which to increment by index */ - mmpol.ilx -= page->index >> order; + mmpol.ilx -= folio->index >> order; } } } @@ -1458,9 +1504,10 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; if (*flags & MPOL_F_NUMA_BALANCING) { - if (*mode != MPOL_BIND) + if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY) + *flags |= (MPOL_F_MOF | MPOL_F_MORON); + else return -EINVAL; - *flags |= (MPOL_F_MOF | MPOL_F_MORON); } return 0; } @@ -1758,7 +1805,8 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup * @order: 0, or appropriate huge_page_order for interleaving - * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE + * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or + * MPOL_WEIGHTED_INTERLEAVE * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. 
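A side effect of the sanitize_mpol_flags() change above is that userspace may now combine MPOL_F_NUMA_BALANCING with MPOL_PREFERRED_MANY, not just MPOL_BIND, to request NUMA-balancing-driven placement restricted to the policy nodemask. Continuing the earlier userspace sketch, with the MPOL_PREFERRED_MANY and MPOL_F_NUMA_BALANCING values taken from uapi <linux/mempolicy.h>:

    /* hint the kernel to NUMA-balance within the preferred node set */
    syscall(SYS_set_mempolicy, MPOL_PREFERRED_MANY | MPOL_F_NUMA_BALANCING,
            &nodemask, 8 * sizeof(nodemask) + 1);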
@@ -1775,7 +1823,8 @@ struct mempolicy *get_vma_policy(struct vm_area_struct *vma, pol = __get_vma_policy(vma, addr, ilx); if (!pol) pol = get_task_policy(current); - if (pol->mode == MPOL_INTERLEAVE) { + if (pol->mode == MPOL_INTERLEAVE || + pol->mode == MPOL_WEIGHTED_INTERLEAVE) { *ilx += vma->vm_pgoff >> order; *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); } @@ -1825,12 +1874,40 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) return zone >= dynamic_policy_zone; } +static unsigned int weighted_interleave_nodes(struct mempolicy *policy) +{ + unsigned int node; + unsigned int cpuset_mems_cookie; + +retry: + /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */ + cpuset_mems_cookie = read_mems_allowed_begin(); + node = current->il_prev; + if (!current->il_weight || !node_isset(node, policy->nodes)) { + node = next_node_in(node, policy->nodes); + if (read_mems_allowed_retry(cpuset_mems_cookie)) + goto retry; + if (node == MAX_NUMNODES) + return node; + current->il_prev = node; + current->il_weight = get_il_weight(node); + } + current->il_weight--; + return node; +} + /* Do dynamic interleaving for a process */ static unsigned int interleave_nodes(struct mempolicy *policy) { unsigned int nid; + unsigned int cpuset_mems_cookie; + + /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */ + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + nid = next_node_in(current->il_prev, policy->nodes); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); - nid = next_node_in(current->il_prev, policy->nodes); if (nid < MAX_NUMNODES) current->il_prev = nid; return nid; @@ -1859,6 +1936,9 @@ unsigned int mempolicy_slab_node(void) case MPOL_INTERLEAVE: return interleave_nodes(policy); + case MPOL_WEIGHTED_INTERLEAVE: + return weighted_interleave_nodes(policy); + case MPOL_BIND: case MPOL_PREFERRED_MANY: { @@ -1883,6 +1963,59 @@ unsigned int mempolicy_slab_node(void) } } +static unsigned int read_once_policy_nodemask(struct mempolicy *pol, + nodemask_t *mask) +{ + /* + * barrier stabilizes the nodemask locally so that it can be iterated + * over safely without concern for changes. Allocators validate node + * selection does not violate mems_allowed, so this is safe. + */ + barrier(); + memcpy(mask, &pol->nodes, sizeof(nodemask_t)); + barrier(); + return nodes_weight(*mask); +} + +static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) +{ + nodemask_t nodemask; + unsigned int target, nr_nodes; + u8 *table; + unsigned int weight_total = 0; + u8 weight; + int nid; + + nr_nodes = read_once_policy_nodemask(pol, &nodemask); + if (!nr_nodes) + return numa_node_id(); + + rcu_read_lock(); + table = rcu_dereference(iw_table); + /* calculate the total weight */ + for_each_node_mask(nid, nodemask) { + /* detect system default usage */ + weight = table ? table[nid] : 1; + weight = weight ? weight : 1; + weight_total += weight; + } + + /* Calculate the node offset based on totals */ + target = ilx % weight_total; + nid = first_node(nodemask); + while (target) { + /* detect system default usage */ + weight = table ? table[nid] : 1; + weight = weight ? weight : 1; + if (target < weight) + break; + target -= weight; + nid = next_node_in(nid, nodemask); + } + rcu_read_unlock(); + return nid; +} + /* * Do static interleaving for interleave index @ilx. 
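A quick worked example for weighted_interleave_nid() above, with made-up weights: pol->nodes = {0,1}, weight of node 0 = 3 and node 1 = 1, so weight_total = 4. Then ilx % 4 in {0,1,2} selects node 0 and ilx % 4 == 3 selects node 1, giving a stable 3:1 spread that depends only on the interleave index, not on which task performs the allocation.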
Returns the ilx'th * node in pol->nodes (starting from ilx=0), wrapping around if ilx @@ -1890,20 +2023,12 @@ unsigned int mempolicy_slab_node(void) */ static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) { - nodemask_t nodemask = pol->nodes; + nodemask_t nodemask; unsigned int target, nnodes; int i; int nid; - /* - * The barrier will stabilize the nodemask in a register or on - * the stack so that it will stop changing under the code. - * - * Between first_node() and next_node(), pol->nodes could be changed - * by other threads. So we put pol->nodes in a local stack. - */ - barrier(); - nnodes = nodes_weight(nodemask); + nnodes = read_once_policy_nodemask(pol, &nodemask); if (!nnodes) return numa_node_id(); target = ilx % nnodes; @@ -1951,6 +2076,11 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, *nid = (ilx == NO_INTERLEAVE_INDEX) ? interleave_nodes(pol) : interleave_nid(pol, ilx); break; + case MPOL_WEIGHTED_INTERLEAVE: + *nid = (ilx == NO_INTERLEAVE_INDEX) ? + weighted_interleave_nodes(pol) : + weighted_interleave_nid(pol, ilx); + break; } return nodemask; @@ -2012,6 +2142,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: *mask = mempolicy->nodes; break; @@ -2070,9 +2201,9 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = __alloc_pages(preferred_gfp, order, nid, nodemask); + page = __alloc_pages_noprof(preferred_gfp, order, nid, nodemask); if (!page) - page = __alloc_pages(gfp, order, nid, NULL); + page = __alloc_pages_noprof(gfp, order, nid, NULL); return page; } @@ -2087,7 +2218,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, * * Return: The page on success or NULL if allocation fails. */ -struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, +struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { nodemask_t *nodemask; @@ -2112,12 +2243,13 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, * node in its nodemask, we allocate the standard way. */ if (pol->mode != MPOL_INTERLEAVE && + pol->mode != MPOL_WEIGHTED_INTERLEAVE && (!nodemask || node_isset(nid, *nodemask))) { /* * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ - page = __alloc_pages_node(nid, + page = __alloc_pages_node_noprof(nid, gfp | __GFP_THISNODE | __GFP_NORETRY, order); if (page || !(gfp & __GFP_DIRECT_RECLAIM)) return page; @@ -2130,7 +2262,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, } } - page = __alloc_pages(gfp, order, nid, nodemask); + page = __alloc_pages_noprof(gfp, order, nid, nodemask); if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) { /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ @@ -2161,7 +2293,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, * * Return: The folio on success or NULL if allocation fails. 
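The *_noprof renames in the allocation paths above are part of the memory allocation profiling work: the familiar names become macros that tag the call site and then invoke the _noprof implementation. Roughly, the pattern in <linux/gfp.h> looks like this (sketch, not copied from this patch):

    #define alloc_pages(...)        alloc_hooks(alloc_pages_noprof(__VA_ARGS__))
    #define folio_alloc(...)        alloc_hooks(folio_alloc_noprof(__VA_ARGS__))

With CONFIG_MEM_ALLOC_PROFILING disabled, alloc_hooks() collapses to a plain call, so existing callers are unaffected; mempolicy only has to export the _noprof symbols, as the EXPORT_SYMBOL changes here do.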
*/ -struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, +struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage) { struct mempolicy *pol; @@ -2169,12 +2301,12 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, struct page *page; pol = get_vma_policy(vma, addr, order, &ilx); - page = alloc_pages_mpol(gfp | __GFP_COMP, order, - pol, ilx, numa_node_id()); + page = alloc_pages_mpol_noprof(gfp | __GFP_COMP, order, + pol, ilx, numa_node_id()); mpol_cond_put(pol); return page_rmappable_folio(page); } -EXPORT_SYMBOL(vma_alloc_folio); +EXPORT_SYMBOL(vma_alloc_folio_noprof); /** * alloc_pages - Allocate pages. @@ -2190,7 +2322,7 @@ EXPORT_SYMBOL(vma_alloc_folio); * flags are used. * Return: The page on success or NULL if allocation fails. */ -struct page *alloc_pages(gfp_t gfp, unsigned int order) +struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) { struct mempolicy *pol = &default_policy; @@ -2201,16 +2333,16 @@ struct page *alloc_pages(gfp_t gfp, unsigned int order) if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); - return alloc_pages_mpol(gfp, order, - pol, NO_INTERLEAVE_INDEX, numa_node_id()); + return alloc_pages_mpol_noprof(gfp, order, pol, NO_INTERLEAVE_INDEX, + numa_node_id()); } -EXPORT_SYMBOL(alloc_pages); +EXPORT_SYMBOL(alloc_pages_noprof); -struct folio *folio_alloc(gfp_t gfp, unsigned int order) +struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { - return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order)); + return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order)); } -EXPORT_SYMBOL(folio_alloc); +EXPORT_SYMBOL(folio_alloc_noprof); static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, @@ -2229,13 +2361,13 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, for (i = 0; i < nodes; i++) { if (delta) { - nr_allocated = __alloc_pages_bulk(gfp, + nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, nr_pages_per_node + 1, NULL, page_array); delta--; } else { - nr_allocated = __alloc_pages_bulk(gfp, + nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, nr_pages_per_node, NULL, page_array); } @@ -2247,6 +2379,121 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, return total_allocated; } +static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, + struct mempolicy *pol, unsigned long nr_pages, + struct page **page_array) +{ + struct task_struct *me = current; + unsigned int cpuset_mems_cookie; + unsigned long total_allocated = 0; + unsigned long nr_allocated = 0; + unsigned long rounds; + unsigned long node_pages, delta; + u8 *table, *weights, weight; + unsigned int weight_total = 0; + unsigned long rem_pages = nr_pages; + nodemask_t nodes; + int nnodes, node; + int resume_node = MAX_NUMNODES - 1; + u8 resume_weight = 0; + int prev_node; + int i; + + if (!nr_pages) + return 0; + + /* read the nodes onto the stack, retry if done during rebind */ + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + nnodes = read_once_policy_nodemask(pol, &nodes); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); + + /* if the nodemask has become invalid, we cannot do anything */ + if (!nnodes) + return 0; + + /* Continue allocating from most recent node and adjust the nr_pages */ + node = me->il_prev; + weight = me->il_weight; + if (weight && 
node_isset(node, nodes)) { + node_pages = min(rem_pages, weight); + nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, + NULL, page_array); + page_array += nr_allocated; + total_allocated += nr_allocated; + /* if that's all the pages, no need to interleave */ + if (rem_pages <= weight) { + me->il_weight -= rem_pages; + return total_allocated; + } + /* Otherwise we adjust remaining pages, continue from there */ + rem_pages -= weight; + } + /* clear active weight in case of an allocation failure */ + me->il_weight = 0; + prev_node = node; + + /* create a local copy of node weights to operate on outside rcu */ + weights = kzalloc(nr_node_ids, GFP_KERNEL); + if (!weights) + return total_allocated; + + rcu_read_lock(); + table = rcu_dereference(iw_table); + if (table) + memcpy(weights, table, nr_node_ids); + rcu_read_unlock(); + + /* calculate total, detect system default usage */ + for_each_node_mask(node, nodes) { + if (!weights[node]) + weights[node] = 1; + weight_total += weights[node]; + } + + /* + * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. + * Track which node weighted interleave should resume from. + * + * if (rounds > 0) and (delta == 0), resume_node will always be + * the node following prev_node and its weight. + */ + rounds = rem_pages / weight_total; + delta = rem_pages % weight_total; + resume_node = next_node_in(prev_node, nodes); + resume_weight = weights[resume_node]; + for (i = 0; i < nnodes; i++) { + node = next_node_in(prev_node, nodes); + weight = weights[node]; + node_pages = weight * rounds; + /* If a delta exists, add this node's portion of the delta */ + if (delta > weight) { + node_pages += weight; + delta -= weight; + } else if (delta) { + /* when delta is depleted, resume from that node */ + node_pages += delta; + resume_node = node; + resume_weight = weight - delta; + delta = 0; + } + /* node_pages can be 0 if an allocation fails and rounds == 0 */ + if (!node_pages) + break; + nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, + NULL, page_array); + page_array += nr_allocated; + total_allocated += nr_allocated; + if (total_allocated == nr_pages) + break; + prev_node = node; + } + me->il_prev = resume_node; + me->il_weight = resume_weight; + kfree(weights); + return total_allocated; +} + static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) @@ -2257,11 +2504,11 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes, + nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, nr_pages, NULL, page_array); if (nr_allocated < nr_pages) - nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL, + nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, nr_pages - nr_allocated, NULL, page_array + nr_allocated); return nr_allocated; @@ -2273,7 +2520,7 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, * It can accelerate memory allocation especially interleaving * allocate memory. 
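/*
 * The rounds/delta bookkeeping above distributes rem_pages across the
 * nodemask in proportion to the node weights. Below is a small userspace
 * model of just that arithmetic (it omits the il_prev/il_weight resume state
 * and allocation-failure handling); distribute() and its parameters are
 * illustrative names, not kernel API.
 */
#include <stdio.h>

static void distribute(const unsigned int *weights, int nnodes,
		       int prev_node, unsigned long nr_pages)
{
	unsigned long weight_total = 0, rounds, delta;
	int i, node;

	for (i = 0; i < nnodes; i++)
		weight_total += weights[i];

	rounds = nr_pages / weight_total;
	delta = nr_pages % weight_total;

	for (i = 0; i < nnodes; i++) {
		unsigned long node_pages;

		node = (prev_node + 1 + i) % nnodes;
		node_pages = (unsigned long)weights[node] * rounds;
		if (delta > weights[node]) {
			node_pages += weights[node];
			delta -= weights[node];
		} else if (delta) {
			node_pages += delta;
			delta = 0;
		}
		printf("node %d: %lu pages\n", node, node_pages);
	}
}

int main(void)
{
	/*
	 * Weights {3, 1}, 10 pages, starting after node 1:
	 * weight_total = 4, rounds = 2, delta = 2, so node 0 gets
	 * 3 * 2 + 2 = 8 pages and node 1 gets 1 * 2 = 2 pages.
	 */
	unsigned int weights[] = { 3, 1 };

	distribute(weights, 2, 1, 10);
	return 0;
}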
*/ -unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, +unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; @@ -2287,14 +2534,18 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, return alloc_pages_bulk_array_interleave(gfp, pol, nr_pages, page_array); + if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) + return alloc_pages_bulk_array_weighted_interleave( + gfp, pol, nr_pages, page_array); + if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_bulk_array_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); nid = numa_node_id(); nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); - return __alloc_pages_bulk(gfp, nid, nodemask, - nr_pages, NULL, page_array); + return alloc_pages_bulk_noprof(gfp, nid, nodemask, + nr_pages, NULL, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) @@ -2362,6 +2613,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: + case MPOL_WEIGHTED_INTERLEAVE: return !!nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; @@ -2467,7 +2719,7 @@ static void sp_free(struct sp_node *n) * mpol_misplaced - check whether current folio node is valid in policy * * @folio: folio to be checked - * @vma: vm area where folio mapped + * @vmf: structure describing the fault * @addr: virtual address in @vma for shared policy lookup and interleave policy * * Lookup current policy node id for vma,addr and "compare to" folio's @@ -2477,18 +2729,24 @@ static void sp_free(struct sp_node *n) * Return: NUMA_NO_NODE if the page is in a node that is valid for this * policy, or a suitable node ID to allocate a replacement folio from. */ -int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, +int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, unsigned long addr) { struct mempolicy *pol; pgoff_t ilx; struct zoneref *z; int curnid = folio_nid(folio); + struct vm_area_struct *vma = vmf->vma; int thiscpu = raw_smp_processor_id(); - int thisnid = cpu_to_node(thiscpu); + int thisnid = numa_node_id(); int polnid = NUMA_NO_NODE; int ret = NUMA_NO_NODE; + /* + * Make sure ptl is held so that we don't preempt and we + * have a stable smp processor id + */ + lockdep_assert_held(vmf->ptl); pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); if (!(pol->flags & MPOL_F_MOF)) goto out; @@ -2498,6 +2756,10 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, polnid = interleave_nid(pol, ilx); break; + case MPOL_WEIGHTED_INTERLEAVE: + polnid = weighted_interleave_nid(pol, ilx); + break; + case MPOL_PREFERRED: if (node_isset(curnid, pol->nodes)) goto out; @@ -2509,15 +2771,26 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, break; case MPOL_BIND: - /* Optimize placement among multiple nodes via NUMA balancing */ + case MPOL_PREFERRED_MANY: + /* + * Even though MPOL_PREFERRED_MANY can allocate pages outside + * policy nodemask we don't allow numa migration to nodes + * outside policy nodemask for now. This is done so that if we + * want demotion to slow memory to happen, before allocating + * from some DRAM node say 'x', we will end up using a + * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario + * we should not promote to node 'x' from slow memory node. 
+ */ if (pol->flags & MPOL_F_MORON) { + /* + * Optimize placement among multiple nodes + * via NUMA balancing + */ if (node_isset(thisnid, pol->nodes)) break; goto out; } - fallthrough; - case MPOL_PREFERRED_MANY: /* * use current page if in policy nodemask, * else select nearest allowed node, if any. @@ -2526,7 +2799,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, if (node_isset(curnid, pol->nodes)) goto out; z = first_zones_zonelist( - node_zonelist(numa_node_id(), GFP_HIGHUSER), + node_zonelist(thisnid, GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->nodes); polnid = zone_to_nid(z->zone); @@ -2872,6 +3145,7 @@ static const char * const policy_modes[] = [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", + [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave", [MPOL_LOCAL] = "local", [MPOL_PREFERRED_MANY] = "prefer (many)", }; @@ -2931,6 +3205,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) } break; case MPOL_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: /* * Default to online nodes with memory if no nodelist */ @@ -3041,6 +3316,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: nodes = pol->nodes; break; default: @@ -3067,3 +3343,200 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); } + +#ifdef CONFIG_SYSFS +struct iw_node_attr { + struct kobj_attribute kobj_attr; + int nid; +}; + +static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct iw_node_attr *node_attr; + u8 weight; + + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); + weight = get_il_weight(node_attr->nid); + return sysfs_emit(buf, "%d\n", weight); +} + +static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct iw_node_attr *node_attr; + u8 *new; + u8 *old; + u8 weight = 0; + + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); + if (count == 0 || sysfs_streq(buf, "")) + weight = 0; + else if (kstrtou8(buf, 0, &weight)) + return -EINVAL; + + new = kzalloc(nr_node_ids, GFP_KERNEL); + if (!new) + return -ENOMEM; + + mutex_lock(&iw_table_lock); + old = rcu_dereference_protected(iw_table, + lockdep_is_held(&iw_table_lock)); + if (old) + memcpy(new, old, nr_node_ids); + new[node_attr->nid] = weight; + rcu_assign_pointer(iw_table, new); + mutex_unlock(&iw_table_lock); + synchronize_rcu(); + kfree(old); + return count; +} + +static struct iw_node_attr **node_attrs; + +static void sysfs_wi_node_release(struct iw_node_attr *node_attr, + struct kobject *parent) +{ + if (!node_attr) + return; + sysfs_remove_file(parent, &node_attr->kobj_attr.attr); + kfree(node_attr->kobj_attr.attr.name); + kfree(node_attr); +} + +static void sysfs_wi_release(struct kobject *wi_kobj) +{ + int i; + + for (i = 0; i < nr_node_ids; i++) + sysfs_wi_node_release(node_attrs[i], wi_kobj); + kobject_put(wi_kobj); +} + +static const struct kobj_type wi_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = sysfs_wi_release, +}; + +static int add_weight_node(int nid, struct kobject *wi_kobj) +{ + struct iw_node_attr *node_attr; + char *name; + + node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL); + if (!node_attr) + return -ENOMEM; + + name = kasprintf(GFP_KERNEL, "node%d", nid); + if (!name) { + kfree(node_attr); + return -ENOMEM; + } + + 
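/*
 * With node_show()/node_store() wired up, the per-node weights land under
 * /sys/kernel/mm/mempolicy/weighted_interleave/nodeN and are consumed through
 * the new policy mode. Below is a hedged userspace sketch; the
 * MPOL_WEIGHTED_INTERLEAVE fallback value of 6 is an assumption taken from
 * the uapi header in this series and should be checked against the installed
 * headers.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef MPOL_WEIGHTED_INTERLEAVE
#define MPOL_WEIGHTED_INTERLEAVE 6	/* assumed uapi value */
#endif

int main(void)
{
	/* Give node 0 three times the default weight (needs root). */
	FILE *f = fopen("/sys/kernel/mm/mempolicy/weighted_interleave/node0", "w");

	if (f) {
		fputs("3", f);
		fclose(f);
	}

	/* Spread this task's new allocations over nodes 0-1 by weight. */
	unsigned long nodemask = 0x3;

	if (syscall(SYS_set_mempolicy, MPOL_WEIGHTED_INTERLEAVE,
		    &nodemask, 8 * sizeof(nodemask)) != 0)
		perror("set_mempolicy");

	return 0;
}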
sysfs_attr_init(&node_attr->kobj_attr.attr); + node_attr->kobj_attr.attr.name = name; + node_attr->kobj_attr.attr.mode = 0644; + node_attr->kobj_attr.show = node_show; + node_attr->kobj_attr.store = node_store; + node_attr->nid = nid; + + if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) { + kfree(node_attr->kobj_attr.attr.name); + kfree(node_attr); + pr_err("failed to add attribute to weighted_interleave\n"); + return -ENOMEM; + } + + node_attrs[nid] = node_attr; + return 0; +} + +static int add_weighted_interleave_group(struct kobject *root_kobj) +{ + struct kobject *wi_kobj; + int nid, err; + + wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!wi_kobj) + return -ENOMEM; + + err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj, + "weighted_interleave"); + if (err) { + kfree(wi_kobj); + return err; + } + + for_each_node_state(nid, N_POSSIBLE) { + err = add_weight_node(nid, wi_kobj); + if (err) { + pr_err("failed to add sysfs [node%d]\n", nid); + break; + } + } + if (err) + kobject_put(wi_kobj); + return 0; +} + +static void mempolicy_kobj_release(struct kobject *kobj) +{ + u8 *old; + + mutex_lock(&iw_table_lock); + old = rcu_dereference_protected(iw_table, + lockdep_is_held(&iw_table_lock)); + rcu_assign_pointer(iw_table, NULL); + mutex_unlock(&iw_table_lock); + synchronize_rcu(); + kfree(old); + kfree(node_attrs); + kfree(kobj); +} + +static const struct kobj_type mempolicy_ktype = { + .release = mempolicy_kobj_release +}; + +static int __init mempolicy_sysfs_init(void) +{ + int err; + static struct kobject *mempolicy_kobj; + + mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL); + if (!mempolicy_kobj) { + err = -ENOMEM; + goto err_out; + } + + node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *), + GFP_KERNEL); + if (!node_attrs) { + err = -ENOMEM; + goto mempol_out; + } + + err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj, + "mempolicy"); + if (err) + goto node_out; + + err = add_weighted_interleave_group(mempolicy_kobj); + if (err) { + pr_err("mempolicy sysfs structure failed to initialize\n"); + kobject_put(mempolicy_kobj); + return err; + } + + return err; +node_out: + kfree(node_attrs); +mempol_out: + kfree(mempolicy_kobj); +err_out: + pr_err("failed to add mempolicy kobject to the system\n"); + return err; +} + +late_initcall(mempolicy_sysfs_init); +#endif /* CONFIG_SYSFS */ diff --git a/mm/mempool.c b/mm/mempool.c index dbbf0e9fb424..6ece63a00acf 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -240,22 +240,24 @@ EXPORT_SYMBOL(mempool_init_node); * * Return: %0 on success, negative error code otherwise. */ -int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data) +int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data) { return mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data, GFP_KERNEL, NUMA_NO_NODE); } -EXPORT_SYMBOL(mempool_init); +EXPORT_SYMBOL(mempool_init_noprof); /** - * mempool_create - create a memory pool + * mempool_create_node - create a memory pool * @min_nr: the minimum number of elements guaranteed to be * allocated for this pool. * @alloc_fn: user-defined element-allocation function. * @free_fn: user-defined element-freeing function. * @pool_data: optional private data available to the user-defined functions. 
+ * @gfp_mask: memory allocation flags + * @node_id: numa node to allocate on * * this function creates and allocates a guaranteed size, preallocated * memory pool. The pool can be used from the mempool_alloc() and mempool_free() @@ -265,17 +267,9 @@ EXPORT_SYMBOL(mempool_init); * * Return: pointer to the created memory pool object or %NULL on error. */ -mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data) -{ - return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data, - GFP_KERNEL, NUMA_NO_NODE); -} -EXPORT_SYMBOL(mempool_create); - -mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int node_id) +mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data, + gfp_t gfp_mask, int node_id) { mempool_t *pool; @@ -291,7 +285,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, return pool; } -EXPORT_SYMBOL(mempool_create_node); +EXPORT_SYMBOL(mempool_create_node_noprof); /** * mempool_resize - resize an existing memory pool @@ -387,7 +381,7 @@ EXPORT_SYMBOL(mempool_resize); * * Return: pointer to the allocated element or %NULL on error. */ -void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) +void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) { void *element; unsigned long flags; @@ -454,7 +448,7 @@ repeat_alloc: finish_wait(&pool->wait, &wait); goto repeat_alloc; } -EXPORT_SYMBOL(mempool_alloc); +EXPORT_SYMBOL(mempool_alloc_noprof); /** * mempool_alloc_preallocated - allocate an element from preallocated elements @@ -562,7 +556,7 @@ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) { struct kmem_cache *mem = pool_data; VM_BUG_ON(mem->ctor); - return kmem_cache_alloc(mem, gfp_mask); + return kmem_cache_alloc_noprof(mem, gfp_mask); } EXPORT_SYMBOL(mempool_alloc_slab); @@ -580,7 +574,7 @@ EXPORT_SYMBOL(mempool_free_slab); void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) { size_t size = (size_t)pool_data; - return kmalloc(size, gfp_mask); + return kmalloc_noprof(size, gfp_mask); } EXPORT_SYMBOL(mempool_kmalloc); @@ -590,6 +584,19 @@ void mempool_kfree(void *element, void *pool_data) } EXPORT_SYMBOL(mempool_kfree); +void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data) +{ + size_t size = (size_t)pool_data; + return kvmalloc(size, gfp_mask); +} +EXPORT_SYMBOL(mempool_kvmalloc); + +void mempool_kvfree(void *element, void *pool_data) +{ + kvfree(element); +} +EXPORT_SYMBOL(mempool_kvfree); + /* * A simple mempool-backed page allocator that allocates pages * of the order specified by pool_data. 
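/*
 * The mempool_kvmalloc()/mempool_kvfree() helpers added above give users a
 * drop-in element constructor for pools of large, possibly vmalloc-backed
 * buffers. Below is a minimal kernel-style sketch of wiring them up; the pool
 * depth, element size and my_pool_* names are arbitrary, not part of this
 * patch.
 */
#include <linux/mempool.h>
#include <linux/errno.h>

#define MY_ELEM_SIZE	((size_t)8 * 1024)	/* arbitrary element size */
#define MY_POOL_MIN	4			/* guaranteed pool depth */

static mempool_t *my_pool;

static int my_pool_setup(void)
{
	/* Elements come from kvmalloc(MY_ELEM_SIZE, gfp) and go back via kvfree(). */
	my_pool = mempool_create(MY_POOL_MIN, mempool_kvmalloc,
				 mempool_kvfree, (void *)MY_ELEM_SIZE);
	return my_pool ? 0 : -ENOMEM;
}

static void my_pool_teardown(void)
{
	mempool_destroy(my_pool);
}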
@@ -597,7 +604,7 @@ EXPORT_SYMBOL(mempool_kfree); void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data) { int order = (int)(long)pool_data; - return alloc_pages(gfp_mask, order); + return alloc_pages_noprof(gfp_mask, order); } EXPORT_SYMBOL(mempool_alloc_pages); diff --git a/mm/memremap.c b/mm/memremap.c index 9e9fb1972fff..40d4547ce514 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -456,21 +456,23 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, } EXPORT_SYMBOL_GPL(get_dev_pagemap); -void free_zone_device_page(struct page *page) +void free_zone_device_folio(struct folio *folio) { - if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free)) + if (WARN_ON_ONCE(!folio->page.pgmap->ops || + !folio->page.pgmap->ops->page_free)) return; - mem_cgroup_uncharge(page_folio(page)); + mem_cgroup_uncharge(folio); /* * Note: we don't expect anonymous compound pages yet. Once supported * and we could PTE-map them similar to THP, we'd have to clear * PG_anon_exclusive on all tail pages. */ - VM_BUG_ON_PAGE(PageAnon(page) && PageCompound(page), page); - if (PageAnon(page)) - __ClearPageAnonExclusive(page); + if (folio_test_anon(folio)) { + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); + __ClearPageAnonExclusive(folio_page(folio, 0)); + } /* * When a device managed page is freed, the folio->mapping field @@ -481,20 +483,20 @@ void free_zone_device_page(struct page *page) * * For other types of ZONE_DEVICE pages, migration is either * handled differently or not done at all, so there is no need - * to clear page->mapping. + * to clear folio->mapping. */ - page->mapping = NULL; - page->pgmap->ops->page_free(page); + folio->mapping = NULL; + folio->page.pgmap->ops->page_free(folio_page(folio, 0)); - if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && - page->pgmap->type != MEMORY_DEVICE_COHERENT) + if (folio->page.pgmap->type != MEMORY_DEVICE_PRIVATE && + folio->page.pgmap->type != MEMORY_DEVICE_COHERENT) /* - * Reset the page count to 1 to prepare for handing out the page + * Reset the refcount to 1 to prepare for handing out the page * again. */ - set_page_count(page, 1); + folio_set_count(folio, 1); else - put_dev_pagemap(page->pgmap); + put_dev_pagemap(folio->page.pgmap); } void zone_device_page_init(struct page *page) @@ -510,9 +512,9 @@ void zone_device_page_init(struct page *page) EXPORT_SYMBOL_GPL(zone_device_page_init); #ifdef CONFIG_FS_DAX -bool __put_devmap_managed_page_refs(struct page *page, int refs) +bool __put_devmap_managed_folio_refs(struct folio *folio, int refs) { - if (page->pgmap->type != MEMORY_DEVICE_FS_DAX) + if (folio->page.pgmap->type != MEMORY_DEVICE_FS_DAX) return false; /* @@ -520,9 +522,9 @@ bool __put_devmap_managed_page_refs(struct page *page, int refs) * refcount is 1, then the page is free and the refcount is * stable because nobody holds a reference on the page. 
*/ - if (page_ref_sub_return(page, refs) == 1) - wake_up_var(&page->_refcount); + if (folio_ref_sub_return(folio, refs) == 1) + wake_up_var(&folio->_refcount); return true; } -EXPORT_SYMBOL(__put_devmap_managed_page_refs); +EXPORT_SYMBOL(__put_devmap_managed_folio_refs); #endif /* CONFIG_FS_DAX */ diff --git a/mm/memtest.c b/mm/memtest.c index 32f3e9dda837..c2c609c39119 100644 --- a/mm/memtest.c +++ b/mm/memtest.c @@ -51,10 +51,10 @@ static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size last_bad = 0; for (p = start; p < end; p++) - *p = pattern; + WRITE_ONCE(*p, pattern); for (p = start; p < end; p++, start_phys_aligned += incr) { - if (*p == pattern) + if (READ_ONCE(*p) == pattern) continue; if (start_phys_aligned == last_bad + incr) { last_bad += incr; diff --git a/mm/migrate.c b/mm/migrate.c index c27b1f8097d4..dd04f578c19c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -113,7 +113,7 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) if (!mops->isolate_page(&folio->page, mode)) goto out_no_isolated; - /* Driver shouldn't use PG_isolated bit of page->flags */ + /* Driver shouldn't use the isolated flag */ WARN_ON_ONCE(folio_test_isolated(folio)); folio_set_isolated(folio); folio_unlock(folio); @@ -211,14 +211,17 @@ static bool remove_migration_pte(struct folio *folio, folio_get(folio); pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); old_pte = ptep_get(pvmw.pte); - if (pte_swp_soft_dirty(old_pte)) - pte = pte_mksoft_dirty(pte); entry = pte_to_swp_entry(old_pte); if (!is_migration_entry_young(entry)) pte = pte_mkold(pte); if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) pte = pte_mkdirty(pte); + if (pte_swp_soft_dirty(old_pte)) + pte = pte_mksoft_dirty(pte); + else + pte = pte_clear_soft_dirty(pte); + if (is_writable_migration_entry(entry)) pte = pte_mkwrite(pte, vma); else if (pte_swp_uffd_wp(old_pte)) @@ -613,7 +616,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) folio_migrate_ksm(newfolio, folio); /* * Please do not reorder this without considering how mm/ksm.c's - * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). + * ksm_get_folio() depends upon ksm_migrate_page() and PageSwapCache(). */ if (folio_test_swapcache(folio)) folio_clear_swapcache(folio); @@ -1422,7 +1425,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, * semaphore in write mode here and set TTU_RMAP_LOCKED * to let lower levels know we have taken the lock. */ - mapping = hugetlb_page_mapping_lock_write(&src->page); + mapping = hugetlb_folio_mapping_lock_write(src); if (unlikely(!mapping)) goto unlock_put_anon; @@ -1650,6 +1653,29 @@ static int migrate_pages_batch(struct list_head *from, cond_resched(); /* + * The rare folio on the deferred split list should + * be split now. It should not count as a failure. + * Only check it without removing it from the list. + * Since the folio can be on deferred_split_scan() + * local list and removing it can cause the local list + * corruption. Folio split process below can handle it + * with the help of folio_ref_freeze(). + * + * nr_pages > 2 is needed to avoid checking order-1 + * page cache folios. They exist, in contrast to + * non-existent order-1 anonymous folios, and do not + * use _deferred_list. 
+ */ + if (nr_pages > 2 && + !list_empty(&folio->_deferred_list)) { + if (try_split_folio(folio, split_folios) == 0) { + stats->nr_thp_split += is_thp; + stats->nr_split++; + continue; + } + } + + /* * Large folio migration might be unsupported or * the allocation might be failed so we should retry * on the same folio with the large folio split @@ -2019,7 +2045,8 @@ struct folio *alloc_migration_target(struct folio *src, unsigned long private) gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); return alloc_hugetlb_folio_nodemask(h, nid, - mtc->nmask, gfp_mask); + mtc->nmask, gfp_mask, + htlb_allow_alloc_fallback(mtc->reason)); } if (folio_test_large(src)) { @@ -2057,6 +2084,7 @@ static int do_move_pages_to_node(struct list_head *pagelist, int node) struct migration_target_control mtc = { .nid = node, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, + .reason = MR_SYSCALL, }; err = migrate_pages(pagelist, alloc_migration_target, NULL, @@ -2112,7 +2140,7 @@ static int add_page_for_migration(struct mm_struct *mm, const void __user *p, goto out_putfolio; err = -EACCES; - if (page_mapcount(page) > 1 && !migrate_all) + if (folio_likely_mapped_shared(folio) && !migrate_all) goto out_putfolio; err = -EBUSY; @@ -2565,11 +2593,11 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, /* * Don't migrate file folios that are mapped in multiple processes * with execute permissions as they are probably shared libraries. - * To check if the folio is shared, ideally we want to make sure - * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated mapcount of the folio instead. + * + * See folio_likely_mapped_shared() on possible imprecision when we + * cannot easily detect if a folio is shared. */ - if (folio_estimated_sharers(folio) != 1 && folio_is_file_lru(folio) && + if (folio_likely_mapped_shared(folio) && folio_is_file_lru(folio) && (vma->vm_flags & VM_EXEC)) goto out; diff --git a/mm/migrate_device.c b/mm/migrate_device.c index b6c27c76e1a0..aecc71972a87 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -71,7 +71,7 @@ again: return migrate_vma_collect_hole(start, end, -1, walk); if (pmd_trans_huge(*pmdp)) { - struct page *page; + struct folio *folio; ptl = pmd_lock(mm, pmdp); if (unlikely(!pmd_trans_huge(*pmdp))) { @@ -79,21 +79,21 @@ again: goto again; } - page = pmd_page(*pmdp); - if (is_huge_zero_page(page)) { + folio = pmd_folio(*pmdp); + if (is_huge_zero_folio(folio)) { spin_unlock(ptl); split_huge_pmd(vma, pmdp, addr); } else { int ret; - get_page(page); + folio_get(folio); spin_unlock(ptl); - if (unlikely(!trylock_page(page))) + if (unlikely(!folio_trylock(folio))) return migrate_vma_collect_skip(start, end, walk); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); + ret = split_folio(folio); + folio_unlock(folio); + folio_put(folio); if (ret) return migrate_vma_collect_skip(start, end, walk); @@ -324,6 +324,8 @@ static void migrate_vma_collect(struct migrate_vma *migrate) */ static bool migrate_vma_check_page(struct page *page, struct page *fault_page) { + struct folio *folio = page_folio(page); + /* * One extra ref because caller holds an extra reference, either from * isolate_lru_page() for a regular page, or migrate_vma_collect() for @@ -336,18 +338,18 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page) * check them than regular pages, because they can be mapped with a pmd * or with a pte (split pte mapping). 
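/*
 * The add_page_for_migration() change above means callers of move_pages(2)
 * without MPOL_MF_MOVE_ALL now have pages skipped based on
 * folio_likely_mapped_shared() rather than the raw mapcount. Below is a small
 * userspace sketch of the syscall; the destination node 1 is arbitrary and
 * the program needs libnuma's numaif.h (link with -lnuma).
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <numaif.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	void *page = aligned_alloc(psz, psz);
	int node = 1;		/* arbitrary destination node */
	int status = -1;

	if (!page)
		return 1;
	memset(page, 0, psz);	/* fault the page in so it can be migrated */

	/*
	 * Migrate one page of the current task (pid 0); status receives the
	 * node it ended up on, or a negative errno if it was skipped.
	 */
	if (move_pages(0, 1, &page, &node, &status, MPOL_MF_MOVE) != 0)
		perror("move_pages");
	else
		printf("page is now on node %d\n", status);

	free(page);
	return 0;
}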
*/ - if (PageCompound(page)) + if (folio_test_large(folio)) return false; /* Page from ZONE_DEVICE have one extra reference */ - if (is_zone_device_page(page)) + if (folio_is_zone_device(folio)) extra++; /* For file back page */ - if (page_mapping(page)) - extra += 1 + page_has_private(page); + if (folio_mapping(folio)) + extra += 1 + folio_has_private(folio); - if ((page_count(page) - extra) > page_mapcount(page)) + if ((folio_ref_count(folio) - extra) > folio_mapcount(folio)) return false; return true; @@ -664,13 +666,9 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, if (flush) { flush_cache_page(vma, addr, pte_pfn(orig_pte)); ptep_clear_flush(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, entry); - update_mmu_cache(vma, addr, ptep); - } else { - /* No need to invalidate - it was non-present before */ - set_pte_at(mm, addr, ptep, entry); - update_mmu_cache(vma, addr, ptep); } + set_pte_at(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); pte_unmap_unlock(ptep, ptl); *src = MIGRATE_PFN_MIGRATE; @@ -694,6 +692,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); struct page *page = migrate_pfn_to_page(src_pfns[i]); struct address_space *mapping; + struct folio *folio; int r; if (!newpage) { @@ -728,15 +727,12 @@ static void __migrate_device_pages(unsigned long *src_pfns, continue; } - mapping = page_mapping(page); + folio = page_folio(page); + mapping = folio_mapping(folio); if (is_device_private_page(newpage) || is_device_coherent_page(newpage)) { if (mapping) { - struct folio *folio; - - folio = page_folio(page); - /* * For now only support anonymous memory migrating to * device private or coherent memory. @@ -759,11 +755,10 @@ static void __migrate_device_pages(unsigned long *src_pfns, if (migrate && migrate->fault_page == page) r = migrate_folio_extra(mapping, page_folio(newpage), - page_folio(page), - MIGRATE_SYNC_NO_COPY, 1); + folio, MIGRATE_SYNC_NO_COPY, 1); else r = migrate_folio(mapping, page_folio(newpage), - page_folio(page), MIGRATE_SYNC_NO_COPY); + folio, MIGRATE_SYNC_NO_COPY); if (r != MIGRATEPAGE_SUCCESS) src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; } diff --git a/mm/mlock.c b/mm/mlock.c index 086546ac5766..30b51cdea89d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -206,8 +206,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch) if (lruvec) unlock_page_lruvec_irq(lruvec); - folios_put(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_reinit(fbatch); + folios_put(fbatch); } void mlock_drain_local(void) @@ -379,7 +378,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, goto out; if (is_huge_zero_pmd(*pmd)) goto out; - folio = page_folio(pmd_page(*pmd)); + folio = pmd_folio(*pmd); if (vma->vm_flags & VM_LOCKED) mlock_folio(folio); else diff --git a/mm/mm_init.c b/mm/mm_init.c index 2c19f5515e36..f72b852bd5b8 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -24,9 +24,11 @@ #include <linux/page_ext.h> #include <linux/pti.h> #include <linux/pgtable.h> +#include <linux/stackdepot.h> #include <linux/swap.h> #include <linux/cma.h> #include <linux/crash_dump.h> +#include <linux/execmem.h> #include "internal.h" #include "slab.h" #include "shuffle.h" @@ -226,7 +228,6 @@ static unsigned long required_movablecore_percent __initdata; static unsigned long nr_kernel_pages __initdata; static unsigned long nr_all_pages __initdata; -static unsigned long dma_reserve __initdata; static bool deferred_struct_pages __meminitdata; @@ -1144,7 +1145,7 @@ static void __init 
adjust_zone_range_for_zone_movable(int nid, * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, * then all holes in the requested range will be accounted for. */ -unsigned long __init __absent_pages_in_range(int nid, +static unsigned long __init __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { @@ -1265,6 +1266,30 @@ static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat) pr_debug("On node %d totalpages: 0\n", pgdat->node_id); } +static void __init calc_nr_kernel_pages(void) +{ + unsigned long start_pfn, end_pfn; + phys_addr_t start_addr, end_addr; + u64 u; +#ifdef CONFIG_HIGHMEM + unsigned long high_zone_low = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]; +#endif + + for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) { + start_pfn = PFN_UP(start_addr); + end_pfn = PFN_DOWN(end_addr); + + if (start_pfn < end_pfn) { + nr_all_pages += end_pfn - start_pfn; +#ifdef CONFIG_HIGHMEM + start_pfn = clamp(start_pfn, 0, high_zone_low); + end_pfn = clamp(end_pfn, 0, high_zone_low); +#endif + nr_kernel_pages += end_pfn - start_pfn; + } + } +} + static void __init calculate_node_totalpages(struct pglist_data *pgdat, unsigned long node_start_pfn, unsigned long node_end_pfn) @@ -1308,26 +1333,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); } -static unsigned long __init calc_memmap_size(unsigned long spanned_pages, - unsigned long present_pages) -{ - unsigned long pages = spanned_pages; - - /* - * Provide a more accurate estimation if there are holes within - * the zone and SPARSEMEM is in use. If there are holes within the - * zone, each populated memory region may cost us one or two extra - * memmap pages due to alignment because memmap pages for each - * populated regions may not be naturally aligned on page boundary. - * So the (present_pages >> 4) heuristic is a tradeoff for that. - */ - if (spanned_pages > present_pages + (present_pages >> 4) && - IS_ENABLED(CONFIG_SPARSEMEM)) - pages = present_pages; - - return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void pgdat_init_split_queue(struct pglist_data *pgdat) { @@ -1542,15 +1547,6 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat) } #endif -/* - * Set up the zone data structures: - * - mark all pages reserved - * - mark all memory queues empty - * - clear the memory bitmaps - * - * NOTE: pgdat should get zeroed by caller. - * NOTE: this function is only called during early init. - */ static void __init free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; @@ -1561,47 +1557,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat) for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long size, freesize, memmap_pages; - - size = zone->spanned_pages; - freesize = zone->present_pages; - - /* - * Adjust freesize so that it accounts for how much memory - * is used by this zone for memmap. 
This affects the watermark - * and per-cpu initialisations - */ - memmap_pages = calc_memmap_size(size, freesize); - if (!is_highmem_idx(j)) { - if (freesize >= memmap_pages) { - freesize -= memmap_pages; - if (memmap_pages) - pr_debug(" %s zone: %lu pages used for memmap\n", - zone_names[j], memmap_pages); - } else - pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n", - zone_names[j], memmap_pages, freesize); - } - - /* Account for reserved pages */ - if (j == 0 && freesize > dma_reserve) { - freesize -= dma_reserve; - pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve); - } - - if (!is_highmem_idx(j)) - nr_kernel_pages += freesize; - /* Charge for highmem memmap if there are enough kernel pages */ - else if (nr_kernel_pages > memmap_pages * 2) - nr_kernel_pages -= memmap_pages; - nr_all_pages += freesize; + unsigned long size = zone->spanned_pages; /* - * Set an approximate value for lowmem here, it will be adjusted - * when the bootmem allocator frees pages into the buddy system. - * And all highmem pages will be managed by the buddy system. + * Initialize zone->managed_pages as 0 , it will be reset + * when memblock allocator frees pages into buddy system. */ - zone_init_internals(zone, j, nid, freesize); + zone_init_internals(zone, j, nid, zone->present_pages); if (!size) continue; @@ -1874,30 +1836,26 @@ void __init free_area_init(unsigned long *max_zone_pfn) panic("Cannot allocate %zuB for node %d.\n", sizeof(*pgdat), nid); arch_refresh_nodedata(nid, pgdat); - free_area_init_node(nid); - - /* - * We do not want to confuse userspace by sysfs - * files/directories for node without any memory - * attached to it, so this node is not marked as - * N_MEMORY and not marked online so that no sysfs - * hierarchy will be created via register_one_node for - * it. The pgdat will get fully initialized by - * hotadd_init_pgdat() when memory is hotplugged into - * this node. - */ - continue; } pgdat = NODE_DATA(nid); free_area_init_node(nid); - /* Any memory on that node */ - if (pgdat->node_present_pages) + /* + * No sysfs hierarcy will be created via register_one_node() + *for memory-less node because here it's not marked as N_MEMORY + *and won't be set online later. The benefit is userspace + *program won't be confused by sysfs files/directories of + *memory-less node. The pgdat will get fully initialized by + *hotadd_init_pgdat() when memory is hotplugged into this node. + */ + if (pgdat->node_present_pages) { node_set_state(nid, N_MEMORY); - check_for_memory(pgdat); + check_for_memory(pgdat); + } } + calc_nr_kernel_pages(); memmap_init(); /* disable hash distribution for systems with a single node */ @@ -2057,7 +2015,7 @@ static unsigned long __init deferred_init_pages(struct zone *zone, __init_single_page(page, pfn, zid, nid); nr_pages++; } - return (nr_pages); + return nr_pages; } /* @@ -2231,6 +2189,7 @@ static int __init deferred_init_memmap(void *data) .align = PAGES_PER_SECTION, .min_chunk = PAGES_PER_SECTION, .max_threads = max_threads, + .numa_aware = false, }; padata_do_multithreaded(&job); @@ -2258,10 +2217,6 @@ zone_empty: * Return true when zone was grown, otherwise return false. We return true even * when we grow less than requested, to let the caller decide if there are * enough pages to satisfy the allocation. - * - * Note: We use noinline because this function is needed only during boot, and - * it is called from a __ref function _deferred_grow_zone. This way we are - * making sure that it is not inlined into permanent text section. 
*/ bool __init deferred_grow_zone(struct zone *zone, unsigned int order) { @@ -2411,17 +2366,6 @@ void __init page_alloc_init_late(void) page_alloc_sysctl_init(); } -#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES -/* - * Returns the number of pages that arch has reserved but - * is not known to alloc_large_system_hash(). - */ -static unsigned long __init arch_reserved_kernel_pages(void) -{ - return 0; -} -#endif - /* * Adaptive scale is meant to reduce sizes of hash tables on large memory * machines. As memory size is increased the scale is also increased but at @@ -2464,7 +2408,6 @@ void *__init alloc_large_system_hash(const char *tablename, if (!numentries) { /* round applicable memory size up to nearest megabyte */ numentries = nr_kernel_pages; - numentries -= arch_reserved_kernel_pages(); /* It isn't necessary when PAGE_SIZE >= 1MB */ if (PAGE_SIZE < SZ_1M) @@ -2546,26 +2489,9 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -/** - * set_dma_reserve - set the specified number of pages reserved in the first zone - * @new_dma_reserve: The number of pages to mark reserved - * - * The per-cpu batchsize and zone watermarks are determined by managed_pages. - * In the DMA zone, a significant percentage may be consumed by kernel image - * and other unfreeable allocations which can skew the watermarks badly. This - * function may optionally be used to account for unfreeable pages in the - * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and - * smaller per-cpu batchsize. - */ -void __init set_dma_reserve(unsigned long new_dma_reserve) -{ - dma_reserve = new_dma_reserve; -} - void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { - if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) { int nid = early_pfn_to_nid(pfn); @@ -2577,6 +2503,17 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, /* KMSAN will take care of these pages. 
*/ return; } + + /* pages were reserved and not allocated */ + if (mem_alloc_profiling_enabled()) { + union codetag_ref *ref = get_page_tag_ref(page); + + if (ref) { + set_codetag_empty(ref); + put_page_tag_ref(ref); + } + } + __free_pages_core(page, order); } @@ -2586,6 +2523,9 @@ EXPORT_SYMBOL(init_on_alloc); DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); EXPORT_SYMBOL(init_on_free); +DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free); +EXPORT_SYMBOL(init_mlocked_on_free); + static bool _init_on_alloc_enabled_early __read_mostly = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); static int __init early_init_on_alloc(char *buf) @@ -2603,6 +2543,14 @@ static int __init early_init_on_free(char *buf) } early_param("init_on_free", early_init_on_free); +static bool _init_mlocked_on_free_enabled_early __read_mostly + = IS_ENABLED(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON); +static int __init early_init_mlocked_on_free(char *buf) +{ + return kstrtobool(buf, &_init_mlocked_on_free_enabled_early); +} +early_param("init_mlocked_on_free", early_init_mlocked_on_free); + DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); /* @@ -2630,12 +2578,21 @@ static void __init mem_debugging_and_hardening_init(void) } #endif - if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) && + if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early || + _init_mlocked_on_free_enabled_early) && page_poisoning_requested) { pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " - "will take precedence over init_on_alloc and init_on_free\n"); + "will take precedence over init_on_alloc, init_on_free " + "and init_mlocked_on_free\n"); _init_on_alloc_enabled_early = false; _init_on_free_enabled_early = false; + _init_mlocked_on_free_enabled_early = false; + } + + if (_init_mlocked_on_free_enabled_early && _init_on_free_enabled_early) { + pr_info("mem auto-init: init_on_free is on, " + "will take precedence over init_mlocked_on_free\n"); + _init_mlocked_on_free_enabled_early = false; } if (_init_on_alloc_enabled_early) { @@ -2652,9 +2609,17 @@ static void __init mem_debugging_and_hardening_init(void) static_branch_disable(&init_on_free); } - if (IS_ENABLED(CONFIG_KMSAN) && - (_init_on_alloc_enabled_early || _init_on_free_enabled_early)) - pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n"); + if (_init_mlocked_on_free_enabled_early) { + want_check_pages = true; + static_branch_enable(&init_mlocked_on_free); + } else { + static_branch_disable(&init_mlocked_on_free); + } + + if (IS_ENABLED(CONFIG_KMSAN) && (_init_on_alloc_enabled_early || + _init_on_free_enabled_early || _init_mlocked_on_free_enabled_early)) + pr_info("mem auto-init: please make sure init_on_alloc, init_on_free and " + "init_mlocked_on_free are disabled when running KMSAN\n"); #ifdef CONFIG_DEBUG_PAGEALLOC if (debug_pagealloc_enabled()) { @@ -2693,9 +2658,10 @@ static void __init report_meminit(void) else stack = "off"; - pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n", + pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s, mlocked free:%s\n", stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off", - want_init_on_free() ? "on" : "off"); + want_init_on_free() ? "on" : "off", + want_init_mlocked_on_free() ? 
"on" : "off"); if (want_init_on_free()) pr_info("mem auto-init: clearing system memory may take some time...\n"); } @@ -2792,4 +2758,5 @@ void __init mm_core_init(void) pti_init(); kmsan_init_runtime(); mm_cache_init(); + execmem_init(); } diff --git a/mm/mmap.c b/mm/mmap.c index 3281287771c9..83b4682ec85c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -64,7 +64,7 @@ #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN; -const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX; +int mmap_rnd_bits_max __ro_after_init = CONFIG_ARCH_MMAP_RND_BITS_MAX; int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS @@ -105,7 +105,7 @@ void vma_set_page_prot(struct vm_area_struct *vma) * Requires inode->i_mapping->i_mmap_rwsem */ static void __remove_shared_vm_struct(struct vm_area_struct *vma, - struct file *file, struct address_space *mapping) + struct address_space *mapping) { if (vma_is_shared_maywrite(vma)) mapping_unmap_writable(mapping); @@ -126,7 +126,7 @@ void unlink_file_vma(struct vm_area_struct *vma) if (file) { struct address_space *mapping = file->f_mapping; i_mmap_lock_write(mapping); - __remove_shared_vm_struct(vma, file, mapping); + __remove_shared_vm_struct(vma, mapping); i_mmap_unlock_write(mapping); } } @@ -392,26 +392,30 @@ static void __vma_link_file(struct vm_area_struct *vma, flush_dcache_mmap_unlock(mapping); } +static void vma_link_file(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct address_space *mapping; + + if (file) { + mapping = file->f_mapping; + i_mmap_lock_write(mapping); + __vma_link_file(vma, mapping); + i_mmap_unlock_write(mapping); + } +} + static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { VMA_ITERATOR(vmi, mm, 0); - struct address_space *mapping = NULL; vma_iter_config(&vmi, vma->vm_start, vma->vm_end); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; vma_start_write(vma); - vma_iter_store(&vmi, vma); - - if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; - i_mmap_lock_write(mapping); - __vma_link_file(vma, mapping); - i_mmap_unlock_write(mapping); - } - + vma_link_file(vma); mm->map_count++; validate_mm(mm); return 0; @@ -519,10 +523,9 @@ static inline void vma_complete(struct vma_prepare *vp, } if (vp->remove && vp->file) { - __remove_shared_vm_struct(vp->remove, vp->file, vp->mapping); + __remove_shared_vm_struct(vp->remove, vp->mapping); if (vp->remove2) - __remove_shared_vm_struct(vp->remove2, vp->file, - vp->mapping); + __remove_shared_vm_struct(vp->remove2, vp->mapping); } else if (vp->insert) { /* * split_vma has split insert from vma, and needs @@ -660,9 +663,7 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, 0); - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; + vma_set_range(vma, start, end, pgoff); vma_iter_store(vmi, vma); vma_complete(&vp, vmi, vma->vm_mm); @@ -705,9 +706,7 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_adjust_trans_huge(vma, start, end, 0); vma_iter_clear(vmi); - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; + vma_set_range(vma, start, end, pgoff); vma_complete(&vp, vmi, vma->vm_mm); return 0; } @@ -861,13 +860,15 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * area is returned, or the function will return NULL */ static struct vm_area_struct -*vma_merge(struct vma_iterator *vmi, struct mm_struct 
*mm, - struct vm_area_struct *prev, unsigned long addr, unsigned long end, - unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, - pgoff_t pgoff, struct mempolicy *policy, +*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev, + struct vm_area_struct *src, unsigned long addr, unsigned long end, + unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, struct anon_vma_name *anon_name) { + struct mm_struct *mm = src->vm_mm; + struct anon_vma *anon_vma = src->anon_vma; + struct file *file = src->vm_file; struct vm_area_struct *curr, *next, *res; struct vm_area_struct *vma, *adjust, *remove, *remove2; struct vm_area_struct *anon_dup = NULL; @@ -1020,10 +1021,7 @@ static struct vm_area_struct vma_prepare(&vp); vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); - - vma->vm_start = vma_start; - vma->vm_end = vma_end; - vma->vm_pgoff = vma_pgoff; + vma_set_range(vma, vma_start, vma_end, vma_pgoff); if (vma_expanded) vma_iter_store(vmi, vma); @@ -1116,21 +1114,21 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_ */ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) { - MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end); struct anon_vma *anon_vma = NULL; struct vm_area_struct *prev, *next; + VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); /* Try next first. */ - next = mas_walk(&mas); + next = vma_iter_load(&vmi); if (next) { anon_vma = reusable_anon_vma(next, vma, next); if (anon_vma) return anon_vma; } - prev = mas_prev(&mas, 0); + prev = vma_prev(&vmi); VM_BUG_ON_VMA(prev != vma, vma); - prev = mas_prev(&mas, 0); + prev = vma_prev(&vmi); /* Try prev next. */ if (prev) anon_vma = reusable_anon_vma(prev, prev, vma); @@ -1257,17 +1255,15 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (mm->map_count > sysctl_max_map_count) return -ENOMEM; - /* Obtain the address to map to. we verify (or select) it and ensure - * that it represents a valid section of the address space. + /* + * addr is returned from get_unmapped_area, + * There are two cases: + * 1> MAP_FIXED == false + * unallocated memory, no need to check sealing. + * 1> MAP_FIXED == true + * sealing is checked inside mmap_region when + * do_vmi_munmap is called. */ - addr = get_unmapped_area(file, addr, len, pgoff, flags); - if (IS_ERR_VALUE(addr)) - return addr; - - if (flags & MAP_FIXED_NOREPLACE) { - if (find_vma_intersection(mm, addr, addr + len)) - return -EEXIST; - } if (prot == PROT_EXEC) { pkey = execute_only_pkey(mm); @@ -1282,6 +1278,18 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + /* Obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. 
+ */ + addr = __get_unmapped_area(file, addr, len, pgoff, flags, vm_flags); + if (IS_ERR_VALUE(addr)) + return addr; + + if (flags & MAP_FIXED_NOREPLACE) { + if (find_vma_intersection(mm, addr, addr + len)) + return -EEXIST; + } + if (flags & MAP_LOCKED) if (!can_do_mlock()) return -EPERM; @@ -1296,7 +1304,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!file_mmap_ok(file, inode, pgoff, len)) return -EOVERFLOW; - flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags; + flags_mask = LEGACY_MAP_MASK; + if (file->f_op->fop_flags & FOP_MMAP_SYNC) + flags_mask |= MAP_SYNC; switch (flags & MAP_TYPE) { case MAP_SHARED: @@ -1516,32 +1526,32 @@ bool vma_needs_dirty_tracking(struct vm_area_struct *vma) * to the private version (using protection_map[] without the * VM_SHARED bit). */ -int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) +bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) { /* If it was private or non-writable, the write bit is already clear */ if (!vma_is_shared_writable(vma)) - return 0; + return false; /* The backer wishes to know when pages are first written to? */ if (vm_ops_needs_writenotify(vma->vm_ops)) - return 1; + return true; /* The open routine did something to the protections that pgprot_modify * won't preserve? */ if (pgprot_val(vm_page_prot) != pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) - return 0; + return false; /* * Do we need to track softdirty? hugetlb does not support softdirty * tracking yet. */ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) - return 1; + return true; /* Do we need write faults for uffd-wp tracking? */ if (userfaultfd_wp(vma)) - return 1; + return true; /* Can the mapping track the dirty pages? */ return vma_fs_can_writeback(vma); @@ -1551,14 +1561,14 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) * We account for memory if it's a private writeable mapping, * not hugepages and VM_NORESERVE wasn't set. */ -static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) +static inline bool accountable_mapping(struct file *file, vm_flags_t vm_flags) { /* * hugetlb has its own accounting separate from the core VM * VM_HUGETLB may not be set yet so we cannot check for that flag. */ if (file && is_file_hugepages(file)) - return 0; + return false; return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } @@ -1578,11 +1588,10 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) unsigned long length, gap; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; - - MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, current->mm, 0); /* Adjust search length to account for worst case alignment overhead */ - length = info->length + info->align_mask; + length = info->length + info->align_mask + info->start_gap; if (length < info->length) return -ENOMEM; @@ -1591,23 +1600,29 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) low_limit = mmap_min_addr; high_limit = info->high_limit; retry: - if (mas_empty_area(&mas, low_limit, high_limit - 1, length)) + if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length)) return -ENOMEM; - gap = mas.index; + /* + * Adjust for the gap first so it doesn't interfere with the + * later alignment. The first step is the minimum needed to + * fulill the start gap, the next steps is the minimum to align + * that. It is the minimum needed to fulill both. 
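/*
 * Even with get_unmapped_area() moved after the vm_flags calculation,
 * do_mmap() still honours MAP_FIXED_NOREPLACE by probing for an intersecting
 * VMA once the address is known. Below is a userspace sketch of that
 * contract; the fallback flag value is the uapi one and only needed with
 * older libc headers.
 */
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MAP_FIXED_NOREPLACE
#define MAP_FIXED_NOREPLACE 0x100000	/* uapi value, for older headers */
#endif

int main(void)
{
	size_t len = 4096;
	/* First mapping: let the kernel pick the address. */
	void *a = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (a == MAP_FAILED)
		return 1;

	/*
	 * Mapping the same range again with MAP_FIXED_NOREPLACE does not
	 * clobber 'a' the way MAP_FIXED would; find_vma_intersection() spots
	 * the overlap and the call fails with EEXIST.
	 */
	void *b = mmap(a, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE,
		       -1, 0);
	if (b == MAP_FAILED)
		printf("MAP_FIXED_NOREPLACE: %s\n", strerror(errno));
	else
		munmap(b, len);

	munmap(a, len);
	return 0;
}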
+ */ + gap = vma_iter_addr(&vmi) + info->start_gap; gap += (info->align_offset - gap) & info->align_mask; - tmp = mas_next(&mas, ULONG_MAX); + tmp = vma_next(&vmi); if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ if (vm_start_gap(tmp) < gap + length - 1) { low_limit = tmp->vm_end; - mas_reset(&mas); + vma_iter_reset(&vmi); goto retry; } } else { - tmp = mas_prev(&mas, 0); + tmp = vma_prev(&vmi); if (tmp && vm_end_gap(tmp) > gap) { low_limit = vm_end_gap(tmp); - mas_reset(&mas); + vma_iter_reset(&vmi); goto retry; } } @@ -1630,10 +1645,10 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) unsigned long length, gap, gap_end; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; + VMA_ITERATOR(vmi, current->mm, 0); - MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); /* Adjust search length to account for worst case alignment overhead */ - length = info->length + info->align_mask; + length = info->length + info->align_mask + info->start_gap; if (length < info->length) return -ENOMEM; @@ -1642,24 +1657,24 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) low_limit = mmap_min_addr; high_limit = info->high_limit; retry: - if (mas_empty_area_rev(&mas, low_limit, high_limit - 1, length)) + if (vma_iter_area_highest(&vmi, low_limit, high_limit, length)) return -ENOMEM; - gap = mas.last + 1 - info->length; + gap = vma_iter_end(&vmi) - info->length; gap -= (gap - info->align_offset) & info->align_mask; - gap_end = mas.last; - tmp = mas_next(&mas, ULONG_MAX); + gap_end = vma_iter_end(&vmi); + tmp = vma_next(&vmi); if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ - if (vm_start_gap(tmp) <= gap_end) { + if (vm_start_gap(tmp) < gap_end) { high_limit = vm_start_gap(tmp); - mas_reset(&mas); + vma_iter_reset(&vmi); goto retry; } } else { - tmp = mas_prev(&mas, 0); + tmp = vma_prev(&vmi); if (tmp && vm_end_gap(tmp) > gap) { high_limit = tmp->vm_start; - mas_reset(&mas); + vma_iter_reset(&vmi); goto retry; } } @@ -1707,7 +1722,7 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; - struct vm_unmapped_area_info info; + struct vm_unmapped_area_info info = {}; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); if (len > mmap_end - mmap_min_addr) @@ -1725,12 +1740,9 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr, return addr; } - info.flags = 0; info.length = len; info.low_limit = mm->mmap_base; info.high_limit = mmap_end; - info.align_mask = 0; - info.align_offset = 0; return vm_unmapped_area(&info); } @@ -1755,7 +1767,7 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, { struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; - struct vm_unmapped_area_info info; + struct vm_unmapped_area_info info = {}; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); /* requested length too big for entire address space */ @@ -1779,8 +1791,6 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); - info.align_mask = 0; - info.align_offset = 0; addr = vm_unmapped_area(&info); /* @@ -1810,12 +1820,41 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, } #endif +#ifndef HAVE_ARCH_UNMAPPED_AREA_VMFLAGS +unsigned long +arch_get_unmapped_area_vmflags(struct file *filp, unsigned 
long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) +{ + return arch_get_unmapped_area(filp, addr, len, pgoff, flags); +} + +unsigned long +arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags) +{ + return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags); +} +#endif + +unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, + vm_flags_t vm_flags) +{ + if (test_bit(MMF_TOPDOWN, &mm->flags)) + return arch_get_unmapped_area_topdown_vmflags(filp, addr, len, pgoff, + flags, vm_flags); + return arch_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, vm_flags); +} + unsigned long -get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) +__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { unsigned long (*get_area)(struct file *, unsigned long, - unsigned long, unsigned long, unsigned long); + unsigned long, unsigned long, unsigned long) + = NULL; unsigned long error = arch_mmap_check(addr, len, flags); if (error) @@ -1825,7 +1864,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (len > TASK_SIZE) return -ENOMEM; - get_area = current->mm->get_unmapped_area; if (file) { if (file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; @@ -1835,16 +1873,22 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, * so use shmem's get_unmapped_area in case it can be huge. */ get_area = shmem_get_unmapped_area; - } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - /* Ensures that larger anonymous mappings are THP aligned. */ - get_area = thp_get_unmapped_area; } /* Always treat pgoff as zero for anonymous memory. */ if (!file) pgoff = 0; - addr = get_area(file, addr, len, pgoff, flags); + if (get_area) { + addr = get_area(file, addr, len, pgoff, flags); + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + /* Ensures that larger anonymous mappings are THP aligned. */ + addr = thp_get_unmapped_area_vmflags(file, addr, len, + pgoff, flags, vm_flags); + } else { + addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len, + pgoff, flags, vm_flags); + } if (IS_ERR_VALUE(addr)) return addr; @@ -1857,7 +1901,16 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, return error ? 
error : addr; } -EXPORT_SYMBOL(get_unmapped_area); +unsigned long +mm_get_unmapped_area(struct mm_struct *mm, struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + if (test_bit(MMF_TOPDOWN, &mm->flags)) + return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags); + return arch_get_unmapped_area(file, addr, len, pgoff, flags); +} +EXPORT_SYMBOL(mm_get_unmapped_area); /** * find_vma_intersection() - Look up the first VMA which intersects the interval @@ -1914,12 +1967,12 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct **pprev) { struct vm_area_struct *vma; - MA_STATE(mas, &mm->mm_mt, addr, addr); + VMA_ITERATOR(vmi, mm, addr); - vma = mas_walk(&mas); - *pprev = mas_prev(&mas, 0); + vma = vma_iter_load(&vmi); + *pprev = vma_prev(&vmi); if (!vma) - vma = mas_next(&mas, ULONG_MAX); + vma = vma_next(&vmi); return vma; } @@ -1973,7 +2026,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) struct vm_area_struct *next; unsigned long gap_addr; int error = 0; - MA_STATE(mas, &mm->mm_mt, vma->vm_start, address); + VMA_ITERATOR(vmi, mm, vma->vm_start); if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; @@ -1999,15 +2052,15 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) } if (next) - mas_prev_range(&mas, address); + vma_iter_prev_range_limit(&vmi, address); - __mas_set_range(&mas, vma->vm_start, address - 1); - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + vma_iter_config(&vmi, vma->vm_start, address); + if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) { - mas_destroy(&mas); + vma_iter_free(&vmi); return -ENOMEM; } @@ -2047,7 +2100,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; /* Overwrite old entry in mtree. */ - mas_store_prealloc(&mas, vma); + vma_iter_store(&vmi, vma); anon_vma_interval_tree_post_update_vma(vma); spin_unlock(&mm->page_table_lock); @@ -2056,8 +2109,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } anon_vma_unlock_write(vma->anon_vma); - khugepaged_enter_vma(vma, vma->vm_flags); - mas_destroy(&mas); + vma_iter_free(&vmi); validate_mm(mm); return error; } @@ -2070,9 +2122,9 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; - MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); struct vm_area_struct *prev; int error = 0; + VMA_ITERATOR(vmi, mm, vma->vm_start); if (!(vma->vm_flags & VM_GROWSDOWN)) return -EFAULT; @@ -2082,7 +2134,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) return -EPERM; /* Enforce stack_guard_gap */ - prev = mas_prev(&mas, 0); + prev = vma_prev(&vmi); /* Check that both stack segments have the same anon_vma? */ if (prev) { if (!(prev->vm_flags & VM_GROWSDOWN) && @@ -2092,15 +2144,15 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) } if (prev) - mas_next_range(&mas, vma->vm_start); + vma_iter_next_range_limit(&vmi, vma->vm_start); - __mas_set_range(&mas, address, vma->vm_end - 1); - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + vma_iter_config(&vmi, address, vma->vm_end); + if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; /* We must make sure the anon_vma is allocated. 
*/ if (unlikely(anon_vma_prepare(vma))) { - mas_destroy(&mas); + vma_iter_free(&vmi); return -ENOMEM; } @@ -2141,7 +2193,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) vma->vm_start = address; vma->vm_pgoff -= grow; /* Overwrite old entry in mtree. */ - mas_store_prealloc(&mas, vma); + vma_iter_store(&vmi, vma); anon_vma_interval_tree_post_update_vma(vma); spin_unlock(&mm->page_table_lock); @@ -2150,8 +2202,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) } } anon_vma_unlock_write(vma->anon_vma); - khugepaged_enter_vma(vma, vma->vm_flags); - mas_destroy(&mas); + vma_iter_free(&vmi); validate_mm(mm); return error; } @@ -2440,9 +2491,8 @@ struct vm_area_struct *vma_modify(struct vma_iterator *vmi, pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); struct vm_area_struct *merged; - merged = vma_merge(vmi, vma->vm_mm, prev, start, end, vm_flags, - vma->anon_vma, vma->vm_file, pgoff, policy, - uffd_ctx, anon_name); + merged = vma_merge(vmi, prev, vma, start, end, vm_flags, + pgoff, policy, uffd_ctx, anon_name); if (merged) return merged; @@ -2472,9 +2522,8 @@ static struct vm_area_struct struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { - return vma_merge(vmi, vma->vm_mm, prev, start, end, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff, + vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); } /* @@ -2488,10 +2537,9 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma); /* vma is specified as prev, so case 1 or 2 will apply. */ - return vma_merge(vmi, vma->vm_mm, vma, vma->vm_end, vma->vm_end + delta, - vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, - vma_policy(vma), vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); + return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta, + vma->vm_flags, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); } /* @@ -2689,6 +2737,14 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, if (end == start) return -EINVAL; + /* + * Check if memory is sealed before arch_unmap. + * Prevent unmapping a sealed VMA. + * can_modify_mm assumes we have acquired the lock on MM. + */ + if (unlikely(!can_modify_mm(mm, start, end))) + return -EPERM; + /* arch_unmap() might do unmaps itself. 
*/ arch_unmap(mm, start, end); @@ -2751,7 +2807,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } /* Unmap any existing mapping in the area */ - if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) + error = do_vmi_munmap(&vmi, mm, addr, len, uf, false); + if (error == -EPERM) + return error; + else if (error) return -ENOMEM; /* @@ -2818,11 +2877,9 @@ cannot_expand: } vma_iter_config(&vmi, addr, end); - vma->vm_start = addr; - vma->vm_end = end; + vma_set_range(vma, addr, end, pgoff); vm_flags_init(vma, vm_flags); vma->vm_page_prot = vm_get_page_prot(vm_flags); - vma->vm_pgoff = pgoff; if (file) { vma->vm_file = get_file(file); @@ -2899,16 +2956,7 @@ cannot_expand: vma_start_write(vma); vma_iter_store(&vmi, vma); mm->map_count++; - if (vma->vm_file) { - i_mmap_lock_write(vma->vm_file->f_mapping); - if (vma_is_shared_maywrite(vma)) - mapping_allow_writable(vma->vm_file->f_mapping); - - flush_dcache_mmap_lock(vma->vm_file->f_mapping); - vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap); - flush_dcache_mmap_unlock(vma->vm_file->f_mapping); - i_mmap_unlock_write(vma->vm_file->f_mapping); - } + vma_link_file(vma); /* * vma_merge() calls khugepaged_enter_vma() either, the below @@ -3112,6 +3160,14 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; + /* + * Check if memory is sealed before arch_unmap. + * Prevent unmapping a sealed VMA. + * can_modify_mm assumes we have acquired the lock on MM. + */ + if (unlikely(!can_modify_mm(mm, start, end))) + return -EPERM; + arch_unmap(mm, start, end); return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); } @@ -3181,9 +3237,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, goto unacct_fail; vma_set_anonymous(vma); - vma->vm_start = addr; - vma->vm_end = addr + len; - vma->vm_pgoff = addr >> PAGE_SHIFT; + vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); vm_flags_init(vma, flags); vma->vm_page_prot = vm_get_page_prot(flags); vma_start_write(vma); @@ -3262,7 +3316,7 @@ void exit_mmap(struct mm_struct *mm) struct mmu_gather tlb; struct vm_area_struct *vma; unsigned long nr_accounted = 0; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); int count = 0; /* mm's last user has gone, and its about to be pulled down */ @@ -3271,7 +3325,7 @@ void exit_mmap(struct mm_struct *mm) mmap_read_lock(mm); arch_exit_mmap(mm); - vma = mas_find(&mas, ULONG_MAX); + vma = vma_next(&vmi); if (!vma || unlikely(xa_is_zero(vma))) { /* Can happen if dup_mmap() received an OOM */ mmap_read_unlock(mm); @@ -3284,7 +3338,7 @@ void exit_mmap(struct mm_struct *mm) tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, &mas, vma, 0, ULONG_MAX, ULONG_MAX, false); + unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false); mmap_read_unlock(mm); /* @@ -3294,8 +3348,8 @@ void exit_mmap(struct mm_struct *mm) set_bit(MMF_OOM_SKIP, &mm->flags); mmap_write_lock(mm); mt_clear_in_rcu(&mm->mm_mt); - mas_set(&mas, vma->vm_end); - free_pgtables(&tlb, &mas, vma, FIRST_USER_ADDRESS, + vma_iter_set(&vmi, vma->vm_end); + free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING, true); tlb_finish_mmu(&tlb); @@ -3304,14 +3358,14 @@ void exit_mmap(struct mm_struct *mm) * enabled, without holding any MM locks besides the unreachable * mmap_write_lock. 
*/ - mas_set(&mas, vma->vm_end); + vma_iter_set(&vmi, vma->vm_end); do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); remove_vma(vma, true); count++; cond_resched(); - vma = mas_find(&mas, ULONG_MAX); + vma = vma_next(&vmi); } while (vma && likely(!xa_is_zero(vma))); BUG_ON(count != mm->map_count); @@ -3420,9 +3474,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma = vm_area_dup(vma); if (!new_vma) goto out; - new_vma->vm_start = addr; - new_vma->vm_end = addr + len; - new_vma->vm_pgoff = pgoff; + vma_set_range(new_vma, addr, addr + len, pgoff); if (vma_dup_policy(vma, new_vma)) goto out_free_vma; if (anon_vma_clone(new_vma, vma)) @@ -3590,9 +3642,7 @@ static struct vm_area_struct *__install_special_mapping( if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); - vma->vm_start = addr; - vma->vm_end = addr + len; - + vma_set_range(vma, addr, addr + len, 0); vm_flags_init(vma, (vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); @@ -3737,7 +3787,7 @@ int mm_take_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); mmap_assert_write_locked(mm); @@ -3749,14 +3799,14 @@ int mm_take_all_locks(struct mm_struct *mm) * being written to until mmap_write_unlock() or mmap_write_downgrade() * is reached. */ - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; vma_start_write(vma); } - mas_set(&mas, 0); - mas_for_each(&mas, vma, ULONG_MAX) { + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && @@ -3764,8 +3814,8 @@ int mm_take_all_locks(struct mm_struct *mm) vm_lock_mapping(mm, vma->vm_file->f_mapping); } - mas_set(&mas, 0); - mas_for_each(&mas, vma, ULONG_MAX) { + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && @@ -3773,8 +3823,8 @@ int mm_take_all_locks(struct mm_struct *mm) vm_lock_mapping(mm, vma->vm_file->f_mapping); } - mas_set(&mas, 0); - mas_for_each(&mas, vma, ULONG_MAX) { + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->anon_vma) @@ -3833,12 +3883,12 @@ void mm_drop_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); mmap_assert_write_locked(mm); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { if (vma->anon_vma) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) vm_unlock_anon_vma(avc->anon_vma); @@ -3876,7 +3926,7 @@ static int init_user_reserve(void) free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); - sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + sysctl_user_reserve_kbytes = min(free_kbytes / 32, SZ_128K); return 0; } subsys_initcall(init_user_reserve); @@ -3897,7 +3947,7 @@ static int init_admin_reserve(void) free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); - sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, SZ_8K); return 0; } subsys_initcall(init_admin_reserve); @@ -3929,12 +3979,12 @@ static int reserve_mem_notifier(struct notifier_block *nb, case MEM_ONLINE: /* Default max is 128MB. 
Leave alone if modified by operator. */ tmp = sysctl_user_reserve_kbytes; - if (0 < tmp && tmp < (1UL << 17)) + if (tmp > 0 && tmp < SZ_128K) init_user_reserve(); /* Default max is 8MB. Leave alone if modified by operator. */ tmp = sysctl_admin_reserve_kbytes; - if (0 < tmp && tmp < (1UL << 13)) + if (tmp > 0 && tmp < SZ_8K) init_admin_reserve(); break; diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 604ddf08affe..99b3e9408aa0 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -50,12 +50,21 @@ static bool tlb_next_batch(struct mmu_gather *tlb) #ifdef CONFIG_SMP static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma) { + struct encoded_page **pages = batch->encoded_pages; + for (int i = 0; i < batch->nr; i++) { - struct encoded_page *enc = batch->encoded_pages[i]; + struct encoded_page *enc = pages[i]; - if (encoded_page_flags(enc)) { + if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) { struct page *page = encoded_page_ptr(enc); - folio_remove_rmap_pte(page_folio(page), page, vma); + unsigned int nr_pages = 1; + + if (unlikely(encoded_page_flags(enc) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + nr_pages = encoded_nr_pages(pages[++i]); + + folio_remove_rmap_ptes(page_folio(page), page, nr_pages, + vma); } } } @@ -82,26 +91,62 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) } #endif -static void tlb_batch_pages_flush(struct mmu_gather *tlb) +/* + * We might end up freeing a lot of pages. Reschedule on a regular + * basis to avoid soft lockups in configurations without full + * preemption enabled. The magic number of 512 folios seems to work. + */ +#define MAX_NR_FOLIOS_PER_FREE 512 + +static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch) { - struct mmu_gather_batch *batch; + struct encoded_page **pages = batch->encoded_pages; + unsigned int nr, nr_pages; - for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { - struct encoded_page **pages = batch->encoded_pages; + while (batch->nr) { + if (!page_poisoning_enabled_static() && !want_init_on_free()) { + nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr); - do { /* - * limit free batch count when PAGE_SIZE > 4K + * Make sure we cover page + nr_pages, and don't leave + * nr_pages behind when capping the number of entries. + */ + if (unlikely(encoded_page_flags(pages[nr - 1]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + nr++; + } else { + /* + * With page poisoning and init_on_free, the time it + * takes to free memory grows proportionally with the + * actual memory size. Therefore, limit based on the + * actual memory size and not the number of involved + * folios. 
*/ - unsigned int nr = min(512U, batch->nr); + for (nr = 0, nr_pages = 0; + nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE; + nr++) { + if (unlikely(encoded_page_flags(pages[nr]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + nr_pages += encoded_nr_pages(pages[++nr]); + else + nr_pages++; + } + } - free_pages_and_swap_cache(pages, nr); - pages += nr; - batch->nr -= nr; + free_pages_and_swap_cache(pages, nr); + pages += nr; + batch->nr -= nr; - cond_resched(); - } while (batch->nr); + cond_resched(); } +} + +static void tlb_batch_pages_flush(struct mmu_gather *tlb) +{ + struct mmu_gather_batch *batch; + + for (batch = &tlb->local; batch && batch->nr; batch = batch->next) + __tlb_batch_free_encoded_pages(batch); tlb->active = &tlb->local; } @@ -116,14 +161,19 @@ static void tlb_batch_list_free(struct mmu_gather *tlb) tlb->local.next = NULL; } -bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, int page_size) +static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb, + struct page *page, unsigned int nr_pages, bool delay_rmap, + int page_size) { + int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0; struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); #ifdef CONFIG_MMU_GATHER_PAGE_SIZE VM_WARN_ON(tlb->page_size != page_size); + VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE); + VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1)); #endif batch = tlb->active; @@ -131,17 +181,40 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, i * Add the page and check if we are full. If so * force a flush. */ - batch->encoded_pages[batch->nr++] = page; - if (batch->nr == batch->max) { + if (likely(nr_pages == 1)) { + batch->encoded_pages[batch->nr++] = encode_page(page, flags); + } else { + flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT; + batch->encoded_pages[batch->nr++] = encode_page(page, flags); + batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages); + } + /* + * Make sure that we can always add another "page" + "nr_pages", + * requiring two entries instead of only a single one. 
+ */ + if (batch->nr >= batch->max - 1) { if (!tlb_next_batch(tlb)) return true; batch = tlb->active; } - VM_BUG_ON_PAGE(batch->nr > batch->max, encoded_page_ptr(page)); + VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page); return false; } +bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, + unsigned int nr_pages, bool delay_rmap) +{ + return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap, + PAGE_SIZE); +} + +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, + bool delay_rmap, int page_size) +{ + return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size); +} + #endif /* MMU_GATHER_NO_GATHER */ #ifdef CONFIG_MMU_GATHER_TABLE_FREE diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index ec3b068cbbe6..8982e6139d07 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -424,23 +424,6 @@ int __mmu_notifier_test_young(struct mm_struct *mm, return young; } -void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, - pte_t pte) -{ - struct mmu_notifier *subscription; - int id; - - id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, - &mm->notifier_subscriptions->list, hlist, - srcu_read_lock_held(&srcu)) { - if (subscription->ops->change_pte) - subscription->ops->change_pte(subscription, mm, address, - pte); - } - srcu_read_unlock(&srcu, id); -} - static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions, const struct mmu_notifier_range *range) { diff --git a/mm/mprotect.c b/mm/mprotect.c index 81991102f785..8c6cd8825273 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -32,6 +32,7 @@ #include <linux/sched/sysctl.h> #include <linux/userfaultfd_k.h> #include <linux/memory-tiers.h> +#include <uapi/linux/mman.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> @@ -129,7 +130,8 @@ static long change_pte_range(struct mmu_gather *tlb, /* Also skip shared copy-on-write pages */ if (is_cow_mapping(vma->vm_flags) && - folio_ref_count(folio) != 1) + (folio_maybe_dma_pinned(folio) || + folio_likely_mapped_shared(folio))) continue; /* @@ -198,13 +200,13 @@ static long change_pte_range(struct mmu_gather *tlb, pte_t newpte; if (is_writable_migration_entry(entry)) { - struct page *page = pfn_swap_entry_to_page(entry); + struct folio *folio = pfn_swap_entry_folio(entry); /* * A protection check is difficult so * just be safe and disable write */ - if (PageAnon(page)) + if (folio_test_anon(folio)) entry = make_readable_exclusive_migration_entry( swp_offset(entry)); else @@ -743,6 +745,15 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } } + /* + * checking if memory is sealed. + * can_modify_mm assumes we have acquired the lock on MM. + */ + if (unlikely(!can_modify_mm(current->mm, start, end))) { + error = -EPERM; + goto out; + } + prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; diff --git a/mm/mremap.c b/mm/mremap.c index 38d98465f3d8..5f96bc5ee918 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -205,7 +205,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, */ if (pte_present(pte)) force_flush = true; - pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); + pte = move_pte(pte, old_addr, new_addr); pte = move_soft_dirty_pte(pte); set_pte_at(mm, new_addr, new_pte, pte); } @@ -902,7 +902,25 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if ((mm->map_count + 2) >= sysctl_max_map_count - 3) return -ENOMEM; + /* + * In mremap_to(). 
+ * Move a VMA to another location, check if src addr is sealed. + * + * Place can_modify_mm here because mremap_to() + * does its own checking for address range, and we only + * check the sealing after passing those checks. + * + * can_modify_mm assumes we have acquired the lock on MM. + */ + if (unlikely(!can_modify_mm(mm, addr, addr + old_len))) + return -EPERM; + if (flags & MREMAP_FIXED) { + /* + * In mremap_to(). + * VMA is moved to dst address, and munmap dst first. + * do_munmap will check if dst is sealed. + */ ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); if (ret) goto out; @@ -1062,6 +1080,19 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } /* + * Below is shrink/expand case (not mremap_to()) + * Check if src address is sealed, if so, reject. + * In other words, prevent shrinking or expanding a sealed VMA. + * + * Place can_modify_mm here so we can keep the logic related to + * shrink/expand together. + */ + if (unlikely(!can_modify_mm(mm, addr, addr + old_len))) { + ret = -EPERM; + goto out; + } + + /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. * do_vmi_munmap does all the needed commit accounting, and diff --git a/mm/mseal.c b/mm/mseal.c new file mode 100644 index 000000000000..bf783bba8ed0 --- /dev/null +++ b/mm/mseal.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implement mseal() syscall. + * + * Copyright (c) 2023,2024 Google, Inc. + * + * Author: Jeff Xu <jeffxu@chromium.org> + */ + +#include <linux/mempolicy.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/mm_inline.h> +#include <linux/mmu_context.h> +#include <linux/syscalls.h> +#include <linux/sched.h> +#include "internal.h" + +static inline bool vma_is_sealed(struct vm_area_struct *vma) +{ + return (vma->vm_flags & VM_SEALED); +} + +static inline void set_vma_sealed(struct vm_area_struct *vma) +{ + vm_flags_set(vma, VM_SEALED); +} + +/* + * check if a vma is sealed for modification. + * return true, if modification is allowed. + */ +static bool can_modify_vma(struct vm_area_struct *vma) +{ + if (unlikely(vma_is_sealed(vma))) + return false; + + return true; +} + +static bool is_madv_discard(int behavior) +{ + return behavior & + (MADV_FREE | MADV_DONTNEED | MADV_DONTNEED_LOCKED | + MADV_REMOVE | MADV_DONTFORK | MADV_WIPEONFORK); +} + +static bool is_ro_anon(struct vm_area_struct *vma) +{ + /* check anonymous mapping. */ + if (vma->vm_file || vma->vm_flags & VM_SHARED) + return false; + + /* + * check for non-writable: + * PROT=RO or PKRU is not writeable. + */ + if (!(vma->vm_flags & VM_WRITE) || + !arch_vma_access_permitted(vma, true, false, false)) + return true; + + return false; +} + +/* + * Check if the vmas of a memory range are allowed to be modified. + * the memory ranger can have a gap (unallocated memory). + * return true, if it is allowed. + */ +bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end) +{ + struct vm_area_struct *vma; + + VMA_ITERATOR(vmi, mm, start); + + /* going through each vma to check. */ + for_each_vma_range(vmi, vma, end) { + if (unlikely(!can_modify_vma(vma))) + return false; + } + + /* Allow by default. */ + return true; +} + +/* + * Check if the vmas of a memory range are allowed to be modified by madvise. + * the memory ranger can have a gap (unallocated memory). + * return true, if it is allowed. 
+ */ +bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end, + int behavior) +{ + struct vm_area_struct *vma; + + VMA_ITERATOR(vmi, mm, start); + + if (!is_madv_discard(behavior)) + return true; + + /* going through each vma to check. */ + for_each_vma_range(vmi, vma, end) + if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma))) + return false; + + /* Allow by default. */ + return true; +} + +static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, vm_flags_t newflags) +{ + int ret = 0; + vm_flags_t oldflags = vma->vm_flags; + + if (newflags == oldflags) + goto out; + + vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto out; + } + + set_vma_sealed(vma); +out: + *prev = vma; + return ret; +} + +/* + * Check for do_mseal: + * 1> start is part of a valid vma. + * 2> end is part of a valid vma. + * 3> No gap (unallocated address) between start and end. + * 4> map is sealable. + */ +static int check_mm_seal(unsigned long start, unsigned long end) +{ + struct vm_area_struct *vma; + unsigned long nstart = start; + + VMA_ITERATOR(vmi, current->mm, start); + + /* going through each vma to check. */ + for_each_vma_range(vmi, vma, end) { + if (vma->vm_start > nstart) + /* unallocated memory found. */ + return -ENOMEM; + + if (vma->vm_end >= end) + return 0; + + nstart = vma->vm_end; + } + + return -ENOMEM; +} + +/* + * Apply sealing. + */ +static int apply_mm_seal(unsigned long start, unsigned long end) +{ + unsigned long nstart; + struct vm_area_struct *vma, *prev; + + VMA_ITERATOR(vmi, current->mm, start); + + vma = vma_iter_load(&vmi); + /* + * Note: check_mm_seal should already checked ENOMEM case. + * so vma should not be null, same for the other ENOMEM cases. + */ + prev = vma_prev(&vmi); + if (start > vma->vm_start) + prev = vma; + + nstart = start; + for_each_vma_range(vmi, vma, end) { + int error; + unsigned long tmp; + vm_flags_t newflags; + + newflags = vma->vm_flags | VM_SEALED; + tmp = vma->vm_end; + if (tmp > end) + tmp = end; + error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags); + if (error) + return error; + nstart = vma_iter_end(&vmi); + } + + return 0; +} + +/* + * mseal(2) seals the VM's meta data from + * selected syscalls. + * + * addr/len: VM address range. + * + * The address range by addr/len must meet: + * start (addr) must be in a valid VMA. + * end (addr + len) must be in a valid VMA. + * no gap (unallocated memory) between start and end. + * start (addr) must be page aligned. + * + * len: len will be page aligned implicitly. + * + * Below VMA operations are blocked after sealing. + * 1> Unmapping, moving to another location, and shrinking + * the size, via munmap() and mremap(), can leave an empty + * space, therefore can be replaced with a VMA with a new + * set of attributes. + * 2> Moving or expanding a different vma into the current location, + * via mremap(). + * 3> Modifying a VMA via mmap(MAP_FIXED). + * 4> Size expansion, via mremap(), does not appear to pose any + * specific risks to sealed VMAs. It is included anyway because + * the use case is unclear. In any case, users can rely on + * merging to expand a sealed VMA. + * 5> mprotect and pkey_mprotect. + * 6> Some destructive madvice() behavior (e.g. MADV_DONTNEED) + * for anonymous memory, when users don't have write permission to the + * memory. 
Those behaviors can alter region contents by discarding pages, + * effectively a memset(0) for anonymous memory. + * + * flags: reserved. + * + * return values: + * zero: success. + * -EINVAL: + * invalid input flags. + * start address is not page aligned. + * Address arange (start + len) overflow. + * -ENOMEM: + * addr is not a valid address (not allocated). + * end (start + len) is not a valid address. + * a gap (unallocated memory) between start and end. + * -EPERM: + * - In 32 bit architecture, sealing is not supported. + * Note: + * user can call mseal(2) multiple times, adding a seal on an + * already sealed memory is a no-action (no error). + * + * unseal() is not supported. + */ +static int do_mseal(unsigned long start, size_t len_in, unsigned long flags) +{ + size_t len; + int ret = 0; + unsigned long end; + struct mm_struct *mm = current->mm; + + ret = can_do_mseal(flags); + if (ret) + return ret; + + start = untagged_addr(start); + if (!PAGE_ALIGNED(start)) + return -EINVAL; + + len = PAGE_ALIGN(len_in); + /* Check to see whether len was rounded up from small -ve to zero. */ + if (len_in && !len) + return -EINVAL; + + end = start + len; + if (end < start) + return -EINVAL; + + if (end == start) + return 0; + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + /* + * First pass, this helps to avoid + * partial sealing in case of error in input address range, + * e.g. ENOMEM error. + */ + ret = check_mm_seal(start, end); + if (ret) + goto out; + + /* + * Second pass, this should success, unless there are errors + * from vma_modify_flags, e.g. merge/split error, or process + * reaching the max supported VMAs, however, those cases shall + * be rare. + */ + ret = apply_mm_seal(start, end); + +out: + mmap_write_unlock(current->mm); + return ret; +} + +SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long, + flags) +{ + return do_mseal(start, len, flags); +} diff --git a/mm/nommu.c b/mm/nommu.c index b6dc558d3144..7296e775e04e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -110,57 +110,34 @@ unsigned int kobjsize(const void *objp) return page_size(page); } -/** - * follow_pfn - look up PFN at a user virtual address - * @vma: memory mapping - * @address: user virtual address - * @pfn: location to store found PFN - * - * Only IO mappings and raw PFN mappings are allowed. - * - * Returns zero and the pfn at @pfn on success, -ve otherwise. - */ -int follow_pfn(struct vm_area_struct *vma, unsigned long address, - unsigned long *pfn) -{ - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return -EINVAL; - - *pfn = address >> PAGE_SHIFT; - return 0; -} -EXPORT_SYMBOL(follow_pfn); - -LIST_HEAD(vmap_area_list); - void vfree(const void *addr) { kfree(addr); } EXPORT_SYMBOL(vfree); -void *__vmalloc(unsigned long size, gfp_t gfp_mask) +void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) { /* * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc() * returns only a logical address. 
*/ - return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); + return kmalloc_noprof(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); } -EXPORT_SYMBOL(__vmalloc); +EXPORT_SYMBOL(__vmalloc_noprof); -void *__vmalloc_node_range(unsigned long size, unsigned long align, +void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) { - return __vmalloc(size, gfp_mask); + return __vmalloc_noprof(size, gfp_mask); } -void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, +void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { - return __vmalloc(size, gfp_mask); + return __vmalloc_noprof(size, gfp_mask); } static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) @@ -181,11 +158,11 @@ static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) return ret; } -void *vmalloc_user(unsigned long size) +void *vmalloc_user_noprof(unsigned long size) { return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO); } -EXPORT_SYMBOL(vmalloc_user); +EXPORT_SYMBOL(vmalloc_user_noprof); struct page *vmalloc_to_page(const void *addr) { @@ -219,13 +196,13 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) * For tight control over page level allocator and protection flags * use __vmalloc() instead. */ -void *vmalloc(unsigned long size) +void *vmalloc_noprof(unsigned long size) { - return __vmalloc(size, GFP_KERNEL); + return __vmalloc_noprof(size, GFP_KERNEL); } -EXPORT_SYMBOL(vmalloc); +EXPORT_SYMBOL(vmalloc_noprof); -void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc); +void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof); /* * vzalloc - allocate virtually contiguous memory with zero fill @@ -239,11 +216,11 @@ void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc) * For tight control over page level allocator and protection flags * use __vmalloc() instead. */ -void *vzalloc(unsigned long size) +void *vzalloc_noprof(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_ZERO); + return __vmalloc_noprof(size, GFP_KERNEL | __GFP_ZERO); } -EXPORT_SYMBOL(vzalloc); +EXPORT_SYMBOL(vzalloc_noprof); /** * vmalloc_node - allocate memory on a specific node @@ -256,11 +233,11 @@ EXPORT_SYMBOL(vzalloc); * For tight control over page level allocator and protection flags * use __vmalloc() instead. */ -void *vmalloc_node(unsigned long size, int node) +void *vmalloc_node_noprof(unsigned long size, int node) { - return vmalloc(size); + return vmalloc_noprof(size); } -EXPORT_SYMBOL(vmalloc_node); +EXPORT_SYMBOL(vmalloc_node_noprof); /** * vzalloc_node - allocate memory on a specific node with zero fill @@ -274,11 +251,11 @@ EXPORT_SYMBOL(vmalloc_node); * For tight control over page level allocator and protection flags * use __vmalloc() instead. */ -void *vzalloc_node(unsigned long size, int node) +void *vzalloc_node_noprof(unsigned long size, int node) { - return vzalloc(size); + return vzalloc_noprof(size); } -EXPORT_SYMBOL(vzalloc_node); +EXPORT_SYMBOL(vzalloc_node_noprof); /** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) @@ -287,11 +264,11 @@ EXPORT_SYMBOL(vzalloc_node); * Allocate enough 32bit PA addressable pages to cover @size from the * page level allocator and map them into contiguous kernel virtual space. 
*/ -void *vmalloc_32(unsigned long size) +void *vmalloc_32_noprof(unsigned long size) { - return __vmalloc(size, GFP_KERNEL); + return __vmalloc_noprof(size, GFP_KERNEL); } -EXPORT_SYMBOL(vmalloc_32); +EXPORT_SYMBOL(vmalloc_32_noprof); /** * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory @@ -303,15 +280,15 @@ EXPORT_SYMBOL(vmalloc_32); * VM_USERMAP is set on the corresponding VMA so that subsequent calls to * remap_vmalloc_range() are permissible. */ -void *vmalloc_32_user(unsigned long size) +void *vmalloc_32_user_noprof(unsigned long size) { /* * We'll have to sort out the ZONE_DMA bits for 64-bit, * but for now this can simply use vmalloc_user() directly. */ - return vmalloc_user(size); + return vmalloc_user_noprof(size); } -EXPORT_SYMBOL(vmalloc_32_user); +EXPORT_SYMBOL(vmalloc_32_user_noprof); void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) { @@ -357,6 +334,13 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_page); +int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_insert_pages); + int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num) { diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 91ccd82097c2..4d7a0004df2c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,6 +44,7 @@ #include <linux/kthread.h> #include <linux/init.h> #include <linux/mmu_notifier.h> +#include <linux/cred.h> #include <asm/tlb.h> #include "internal.h" @@ -723,7 +724,6 @@ static struct ctl_table vm_oom_kill_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - {} }; #endif @@ -754,6 +754,7 @@ static inline void queue_oom_reaper(struct task_struct *tsk) */ static void mark_oom_victim(struct task_struct *tsk) { + const struct cred *cred; struct mm_struct *mm = tsk->mm; WARN_ON(oom_killer_disabled); @@ -773,7 +774,9 @@ static void mark_oom_victim(struct task_struct *tsk) */ __thaw_task(tsk); atomic_inc(&oom_victims); - trace_mark_victim(tsk->pid); + cred = get_task_cred(tsk); + trace_mark_victim(tsk, cred->uid.val); + put_cred(cred); } /** diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3f255534986a..12c9297ed4a7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -838,13 +838,15 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, } /** - * __wb_calc_thresh - @wb's share of dirty throttling threshold + * __wb_calc_thresh - @wb's share of dirty threshold * @dtc: dirty_throttle_context of interest + * @thresh: dirty throttling or dirty background threshold of wb_domain in @dtc * - * Note that balance_dirty_pages() will only seriously take it as a hard limit - * when sleeping max_pause per page is not enough to keep the dirty pages under - * control. For example, when the device is completely stalled due to some error - * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key. + * Note that balance_dirty_pages() will only seriously take dirty throttling + * threshold as a hard limit when sleeping max_pause per page is not enough + * to keep the dirty pages under control. For example, when the device is + * completely stalled due to some error conditions, or when there are 1000 + * dd tasks writing to a slow 10MB/s USB key. * In the other normal situations, it acts more gently by throttling the tasks * more (rather than completely block them) when the wb dirty pages go high. 
* @@ -855,19 +857,20 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. * - * Return: @wb's dirty limit in pages. The term "dirty" in the context of - * dirty balancing includes all PG_dirty and PG_writeback pages. + * Return: @wb's dirty limit in pages. For dirty throttling limit, the term + * "dirty" in the context of dirty balancing includes all PG_dirty and + * PG_writeback pages. */ -static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) +static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc, + unsigned long thresh) { struct wb_domain *dom = dtc_dom(dtc); - unsigned long thresh = dtc->thresh; u64 wb_thresh; unsigned long numerator, denominator; unsigned long wb_min_ratio, wb_max_ratio; /* - * Calculate this BDI's share of the thresh ratio. + * Calculate this wb's share of the thresh ratio. */ fprop_fraction_percpu(&dom->completions, dtc->wb_completions, &numerator, &denominator); @@ -887,9 +890,28 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) { - struct dirty_throttle_control gdtc = { GDTC_INIT(wb), - .thresh = thresh }; - return __wb_calc_thresh(&gdtc); + struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; + + return __wb_calc_thresh(&gdtc, thresh); +} + +unsigned long cgwb_calc_thresh(struct bdi_writeback *wb) +{ + struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; + struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) }; + unsigned long filepages = 0, headroom = 0, writeback = 0; + + gdtc.avail = global_dirtyable_memory(); + gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) + + global_node_page_state(NR_WRITEBACK); + + mem_cgroup_wb_stats(wb, &filepages, &headroom, + &mdtc.dirty, &writeback); + mdtc.dirty += writeback; + mdtc_calc_avail(&mdtc, filepages, headroom); + domain_dirty_limits(&mdtc); + + return __wb_calc_thresh(&mdtc, mdtc.thresh); } /* @@ -1636,7 +1658,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) * wb_position_ratio() will let the dirtier task progress * at some rate <= (write_bw / 2) for bringing down wb_dirty. */ - dtc->wb_thresh = __wb_calc_thresh(dtc); + dtc->wb_thresh = __wb_calc_thresh(dtc, dtc->thresh); dtc->wb_bg_thresh = dtc->thresh ? div64_u64(dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; @@ -1675,7 +1697,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb, struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; struct dirty_throttle_control *sdtc; - unsigned long nr_reclaimable; /* = file_dirty */ + unsigned long nr_dirty; long period; long pause; long max_pause; @@ -1696,9 +1718,9 @@ static int balance_dirty_pages(struct bdi_writeback *wb, unsigned long m_thresh = 0; unsigned long m_bg_thresh = 0; - nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); + nr_dirty = global_node_page_state(NR_FILE_DIRTY); gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); + gdtc->dirty = nr_dirty + global_node_page_state(NR_WRITEBACK); domain_dirty_limits(gdtc); @@ -1749,7 +1771,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb, * In normal mode, we start background writeout at the lower * background_thresh, to keep the amount of dirty memory low. 
*/ - if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh && + if (!laptop_mode && nr_dirty > gdtc->bg_thresh && !writeback_in_progress(wb)) wb_start_background_writeback(wb); @@ -2095,7 +2117,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) if (gdtc->dirty > gdtc->bg_thresh) return true; - thresh = wb_calc_thresh(gdtc->wb, gdtc->bg_thresh); + thresh = __wb_calc_thresh(gdtc, gdtc->bg_thresh); if (thresh < 2 * wb_stat_error()) reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); else @@ -2115,7 +2137,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) if (mdtc->dirty > mdtc->bg_thresh) return true; - thresh = wb_calc_thresh(mdtc->wb, mdtc->bg_thresh); + thresh = __wb_calc_thresh(mdtc, mdtc->bg_thresh); if (thresh < 2 * wb_stat_error()) reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); else @@ -2291,7 +2313,6 @@ static struct ctl_table vm_page_writeback_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - {} }; #endif @@ -2325,18 +2346,18 @@ void __init page_writeback_init(void) } /** - * tag_pages_for_writeback - tag pages to be written by write_cache_pages + * tag_pages_for_writeback - tag pages to be written by writeback * @mapping: address space structure to write * @start: starting page index * @end: ending page index (inclusive) * * This function scans the page range from @start to @end (inclusive) and tags - * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is - * that write_cache_pages (or whoever calls this function) will then use - * TOWRITE tag to identify pages eligible for writeback. This mechanism is - * used to avoid livelocking of writeback by a process steadily creating new - * dirty pages in the file (thus it is important for this function to be quick - * so that it can tag pages faster than a dirtying process can create them). + * all pages that have DIRTY tag set with a special TOWRITE tag. The caller + * can then use the TOWRITE tag to identify pages eligible for writeback. + * This mechanism is used to avoid livelocking of writeback by a process + * steadily creating new dirty pages in the file (thus it is important for this + * function to be quick so that it can tag pages faster than a dirtying process + * can create them). */ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) @@ -2360,183 +2381,243 @@ void tag_pages_for_writeback(struct address_space *mapping, } EXPORT_SYMBOL(tag_pages_for_writeback); +static bool folio_prepare_writeback(struct address_space *mapping, + struct writeback_control *wbc, struct folio *folio) +{ + /* + * Folio truncated or invalidated. We can freely skip it then, + * even for data integrity operations: the folio has disappeared + * concurrently, so there could be no real expectation of this + * data integrity operation even if there is now a new, dirty + * folio at the same pagecache index. + */ + if (unlikely(folio->mapping != mapping)) + return false; + + /* + * Did somebody else write it for us? 
+ */ + if (!folio_test_dirty(folio)) + return false; + + if (folio_test_writeback(folio)) { + if (wbc->sync_mode == WB_SYNC_NONE) + return false; + folio_wait_writeback(folio); + } + BUG_ON(folio_test_writeback(folio)); + + if (!folio_clear_dirty_for_io(folio)) + return false; + + return true; +} + +static xa_mark_t wbc_to_tag(struct writeback_control *wbc) +{ + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + return PAGECACHE_TAG_TOWRITE; + return PAGECACHE_TAG_DIRTY; +} + +static pgoff_t wbc_end(struct writeback_control *wbc) +{ + if (wbc->range_cyclic) + return -1; + return wbc->range_end >> PAGE_SHIFT; +} + +static struct folio *writeback_get_folio(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct folio *folio; + +retry: + folio = folio_batch_next(&wbc->fbatch); + if (!folio) { + folio_batch_release(&wbc->fbatch); + cond_resched(); + filemap_get_folios_tag(mapping, &wbc->index, wbc_end(wbc), + wbc_to_tag(wbc), &wbc->fbatch); + folio = folio_batch_next(&wbc->fbatch); + if (!folio) + return NULL; + } + + folio_lock(folio); + if (unlikely(!folio_prepare_writeback(mapping, wbc, folio))) { + folio_unlock(folio); + goto retry; + } + + trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); + return folio; +} + /** - * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. + * writeback_iter - iterate folio of a mapping for writeback * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @writepage: function called for each page - * @data: data passed to writepage function + * @wbc: writeback context + * @folio: previously iterated folio (%NULL to start) + * @error: in-out pointer for writeback errors (see below) * - * If a page is already under I/O, write_cache_pages() skips it, even - * if it's dirty. This is desirable behaviour for memory-cleaning writeback, - * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() - * and msync() need to guarantee that all the data which was dirty at the time - * the call was made get new I/O started against them. If wbc->sync_mode is - * WB_SYNC_ALL then we were called for data integrity and we must wait for - * existing IO to complete. - * - * To avoid livelocks (when other process dirties new pages), we first tag - * pages which should be written back with TOWRITE tag and only then start - * writing them. For data-integrity sync we have to be careful so that we do - * not miss some pages (e.g., because some other process has cleared TOWRITE - * tag we set). The rule we follow is that TOWRITE tag can be cleared only - * by the process clearing the DIRTY tag (and submitting the page for IO). - * - * To avoid deadlocks between range_cyclic writeback and callers that hold - * pages in PageWriteback to aggregate IO until write_cache_pages() returns, - * we do not loop back to the start of the file. Doing so causes a page - * lock/page writeback access order inversion - we should only ever lock - * multiple pages in ascending page->index order, and looping back to the start - * of the file violates that rule and causes deadlocks. + * This function returns the next folio for the writeback operation described by + * @wbc on @mapping and should be called in a while loop in the ->writepages + * implementation. 
* - * Return: %0 on success, negative error code otherwise + * To start the writeback operation, %NULL is passed in the @folio argument, and + * for every subsequent iteration the folio returned previously should be passed + * back in. + * + * If there was an error in the per-folio writeback inside the writeback_iter() + * loop, @error should be set to the error value. + * + * Once the writeback described in @wbc has finished, this function will return + * %NULL and if there was an error in any iteration restore it to @error. + * + * Note: callers should not manually break out of the loop using break or goto + * but must keep calling writeback_iter() until it returns %NULL. + * + * Return: the folio to write or %NULL if the loop is done. */ -int write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data) +struct folio *writeback_iter(struct address_space *mapping, + struct writeback_control *wbc, struct folio *folio, int *error) { - int ret = 0; - int done = 0; - int error; - struct folio_batch fbatch; - int nr_folios; - pgoff_t index; - pgoff_t end; /* Inclusive */ - pgoff_t done_index; - int range_whole = 0; - xa_mark_t tag; - - folio_batch_init(&fbatch); - if (wbc->range_cyclic) { - index = mapping->writeback_index; /* prev offset */ - end = -1; - } else { - index = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = 1; - } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { - tag_pages_for_writeback(mapping, index, end); - tag = PAGECACHE_TAG_TOWRITE; - } else { - tag = PAGECACHE_TAG_DIRTY; - } - done_index = index; - while (!done && (index <= end)) { - int i; + if (!folio) { + folio_batch_init(&wbc->fbatch); + wbc->saved_err = *error = 0; - nr_folios = filemap_get_folios_tag(mapping, &index, end, - tag, &fbatch); - - if (nr_folios == 0) - break; - - for (i = 0; i < nr_folios; i++) { - struct folio *folio = fbatch.folios[i]; - unsigned long nr; + /* + * For range cyclic writeback we remember where we stopped so + * that we can continue where we stopped. + * + * For non-cyclic writeback we always start at the beginning of + * the passed in range. + */ + if (wbc->range_cyclic) + wbc->index = mapping->writeback_index; + else + wbc->index = wbc->range_start >> PAGE_SHIFT; - done_index = folio->index; + /* + * To avoid livelocks when other processes dirty new pages, we + * first tag pages which should be written back and only then + * start writing them. + * + * For data-integrity writeback we have to be careful so that we + * do not miss some pages (e.g., because some other process has + * cleared the TOWRITE tag we set). The rule we follow is that + * TOWRITE tag can be cleared only by the process clearing the + * DIRTY tag (and submitting the page for I/O). + */ + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, wbc->index, + wbc_end(wbc)); + } else { + wbc->nr_to_write -= folio_nr_pages(folio); - folio_lock(folio); + WARN_ON_ONCE(*error > 0); - /* - * Page truncated or invalidated. We can freely skip it - * then, even for data integrity operations: the page - * has disappeared concurrently, so there could be no - * real expectation of this data integrity operation - * even if there is now a new, dirty page at the same - * pagecache address. 
- */ - if (unlikely(folio->mapping != mapping)) { -continue_unlock: - folio_unlock(folio); - continue; - } + /* + * For integrity writeback we have to keep going until we have + * written all the folios we tagged for writeback above, even if + * we run past wbc->nr_to_write or encounter errors. + * We stash away the first error we encounter in wbc->saved_err + * so that it can be retrieved when we're done. This is because + * the file system may still have state to clear for each folio. + * + * For background writeback we exit as soon as we run past + * wbc->nr_to_write or encounter the first error. + */ + if (wbc->sync_mode == WB_SYNC_ALL) { + if (*error && !wbc->saved_err) + wbc->saved_err = *error; + } else { + if (*error || wbc->nr_to_write <= 0) + goto done; + } + } - if (!folio_test_dirty(folio)) { - /* someone wrote it for us */ - goto continue_unlock; - } + folio = writeback_get_folio(mapping, wbc); + if (!folio) { + /* + * To avoid deadlocks between range_cyclic writeback and callers + * that hold pages in PageWriteback to aggregate I/O until + * the writeback iteration finishes, we do not loop back to the + * start of the file. Doing so causes a page lock/page + * writeback access order inversion - we should only ever lock + * multiple pages in ascending page->index order, and looping + * back to the start of the file violates that rule and causes + * deadlocks. + */ + if (wbc->range_cyclic) + mapping->writeback_index = 0; - if (folio_test_writeback(folio)) { - if (wbc->sync_mode != WB_SYNC_NONE) - folio_wait_writeback(folio); - else - goto continue_unlock; - } + /* + * Return the first error we encountered (if there was any) to + * the caller. + */ + *error = wbc->saved_err; + } + return folio; - BUG_ON(folio_test_writeback(folio)); - if (!folio_clear_dirty_for_io(folio)) - goto continue_unlock; +done: + if (wbc->range_cyclic) + mapping->writeback_index = folio->index + folio_nr_pages(folio); + folio_batch_release(&wbc->fbatch); + return NULL; +} +EXPORT_SYMBOL_GPL(writeback_iter); - trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); - error = writepage(folio, wbc, data); - nr = folio_nr_pages(folio); - if (unlikely(error)) { - /* - * Handle errors according to the type of - * writeback. There's no need to continue for - * background writeback. Just push done_index - * past this page so media errors won't choke - * writeout for the entire file. For integrity - * writeback, we must process the entire dirty - * set regardless of errors because the fs may - * still have state to clear for each page. In - * that case we continue processing and return - * the first error. - */ - if (error == AOP_WRITEPAGE_ACTIVATE) { - folio_unlock(folio); - error = 0; - } else if (wbc->sync_mode != WB_SYNC_ALL) { - ret = error; - done_index = folio->index + nr; - done = 1; - break; - } - if (!ret) - ret = error; - } +/** + * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @writepage: function called for each page + * @data: data passed to writepage function + * + * Return: %0 on success, negative error code otherwise + * + * Note: please use writeback_iter() instead. + */ +int write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, writepage_t writepage, + void *data) +{ + struct folio *folio = NULL; + int error; - /* - * We stop writing back only if we are not doing - * integrity sync. 
In case of integrity sync we have to - * keep going until we have written all the pages - * we tagged for writeback prior to entering this loop. - */ - wbc->nr_to_write -= nr; - if (wbc->nr_to_write <= 0 && - wbc->sync_mode == WB_SYNC_NONE) { - done = 1; - break; - } + while ((folio = writeback_iter(mapping, wbc, folio, &error))) { + error = writepage(folio, wbc, data); + if (error == AOP_WRITEPAGE_ACTIVATE) { + folio_unlock(folio); + error = 0; } - folio_batch_release(&fbatch); - cond_resched(); } - /* - * If we hit the last page and there is more work to be done: wrap - * back the index back to the start of the file for the next - * time we are called. - */ - if (wbc->range_cyclic && !done) - done_index = 0; - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = done_index; - - return ret; + return error; } EXPORT_SYMBOL(write_cache_pages); -static int writepage_cb(struct folio *folio, struct writeback_control *wbc, - void *data) +static int writeback_use_writepage(struct address_space *mapping, + struct writeback_control *wbc) { - struct address_space *mapping = data; - int ret = mapping->a_ops->writepage(&folio->page, wbc); - mapping_set_error(mapping, ret); - return ret; + struct folio *folio = NULL; + struct blk_plug plug; + int err; + + blk_start_plug(&plug); + while ((folio = writeback_iter(mapping, wbc, folio, &err))) { + err = mapping->a_ops->writepage(&folio->page, wbc); + if (err == AOP_WRITEPAGE_ACTIVATE) { + folio_unlock(folio); + err = 0; + } + mapping_set_error(mapping, err); + } + blk_finish_plug(&plug); + + return err; } int do_writepages(struct address_space *mapping, struct writeback_control *wbc) @@ -2552,12 +2633,7 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) if (mapping->a_ops->writepages) { ret = mapping->a_ops->writepages(mapping, wbc); } else if (mapping->a_ops->writepage) { - struct blk_plug plug; - - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, writepage_cb, - mapping); - blk_finish_plug(&plug); + ret = writeback_use_writepage(mapping, wbc); } else { /* deal with chardevs and other special files */ ret = 0; @@ -2646,17 +2722,20 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) } /* - * Mark the folio dirty, and set it dirty in the page cache, and mark - * the inode dirty. + * Mark the folio dirty, and set it dirty in the page cache. * * If warn is true, then emit a warning if the folio is not uptodate and has * not been truncated. * - * The caller must hold folio_memcg_lock(). Most callers have the folio - * locked. A few have the folio blocked from truncation through other - * means (eg zap_vma_pages() has it mapped and is holding the page table - * lock). This can also be called from mark_buffer_dirty(), which I - * cannot prove is always protected against truncate. + * The caller must hold folio_memcg_lock(). It is the caller's + * responsibility to prevent the folio from being truncated while + * this function is in progress, although it may have been truncated + * before this function is called. Most callers have the folio locked. + * A few have the folio blocked from truncation through other means (e.g. + * zap_vma_pages() has it mapped and is holding the page table lock). + * When called from mark_buffer_dirty(), the filesystem should hold a + * reference to the buffer_head that is being marked dirty, which causes + * try_to_free_buffers() to fail. 
*/ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, int warn) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a663202045dc..2e22ce5675ca 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -32,6 +32,7 @@ #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/cpuset.h> +#include <linux/pagevec.h> #include <linux/memory_hotplug.h> #include <linux/nodemask.h> #include <linux/vmstat.h> @@ -53,6 +54,7 @@ #include <linux/khugepaged.h> #include <linux/delayacct.h> #include <linux/cacheinfo.h> +#include <linux/pgalloc_tag.h> #include <asm/div64.h> #include "internal.h" #include "shuffle.h" @@ -205,24 +207,6 @@ EXPORT_SYMBOL(node_states); gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; -/* - * A cached value of the page's pageblock's migratetype, used when the page is - * put on a pcplist. Used to avoid the pageblock migratetype lookup when - * freeing from pcplists in most cases, at the cost of possibly becoming stale. - * Also the migratetype set in the page does not necessarily match the pcplist - * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any - * other index - this ensures that it will be put on the correct CMA freelist. - */ -static inline int get_pcppage_migratetype(struct page *page) -{ - return page->index; -} - -static inline void set_pcppage_migratetype(struct page *page, int migratetype) -{ - page->index = migratetype; -} - #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE unsigned int pageblock_order __read_mostly; #endif @@ -331,7 +315,7 @@ static inline bool deferred_pages_enabled(void) static bool __ref _deferred_grow_zone(struct zone *zone, unsigned int order) { - return deferred_grow_zone(zone, order); + return deferred_grow_zone(zone, order); } #else static inline bool deferred_pages_enabled(void) @@ -464,19 +448,19 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) /* * Temporary debugging check for pages not lying within a given zone. */ -static int __maybe_unused bad_range(struct zone *zone, struct page *page) +static bool __maybe_unused bad_range(struct zone *zone, struct page *page) { if (page_outside_zone_boundaries(zone, page)) - return 1; + return true; if (zone != page_zone(page)) - return 1; + return true; - return 0; + return false; } #else -static inline int __maybe_unused bad_range(struct zone *zone, struct page *page) +static inline bool __maybe_unused bad_range(struct zone *zone, struct page *page) { - return 0; + return false; } #endif @@ -522,7 +506,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (order > PAGE_ALLOC_COSTLY_ORDER) { - VM_BUG_ON(order != pageblock_order); + VM_BUG_ON(order != HPAGE_PMD_ORDER); return NR_LOWORDER_PCP_LISTS; } #else @@ -538,7 +522,7 @@ static inline int pindex_to_order(unsigned int pindex) #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pindex == NR_LOWORDER_PCP_LISTS) - order = pageblock_order; + order = HPAGE_PMD_ORDER; #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); #endif @@ -551,20 +535,12 @@ static inline bool pcp_allowed_order(unsigned int order) if (order <= PAGE_ALLOC_COSTLY_ORDER) return true; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (order == pageblock_order) + if (order == HPAGE_PMD_ORDER) return true; #endif return false; } -static inline void free_the_page(struct page *page, unsigned int order) -{ - if (pcp_allowed_order(order)) /* Via pcp? 
*/ - free_unref_page(page, order); - else - __free_pages_ok(page, order, FPI_NONE); -} - /* * Higher-order pages are called "compound pages". They are structured thusly: * @@ -589,20 +565,6 @@ void prep_compound_page(struct page *page, unsigned int order) prep_compound_head(page, order); } -void destroy_large_folio(struct folio *folio) -{ - if (folio_test_hugetlb(folio)) { - free_huge_folio(folio); - return; - } - - if (folio_test_large_rmappable(folio)) - folio_undo_large_rmappable(folio); - - mem_cgroup_uncharge(folio); - free_the_page(&folio->page, folio_order(folio)); -} - static inline void set_buddy_order(struct page *page, unsigned int order) { set_page_private(page, order); @@ -633,12 +595,14 @@ compaction_capture(struct capture_control *capc, struct page *page, return false; /* - * Do not let lower order allocations pollute a movable pageblock. + * Do not let lower order allocations pollute a movable pageblock + * unless compaction is also requesting movable pages. * This might let an unmovable request use a reclaimable pageblock * and vice-versa but no more than normal fallback logic which can * have trouble finding a high-order free page. */ - if (order < pageblock_order && migratetype == MIGRATE_MOVABLE) + if (order < pageblock_order && migratetype == MIGRATE_MOVABLE && + capc->cc->migratetype != MIGRATE_MOVABLE) return false; capc->page = page; @@ -659,23 +623,33 @@ compaction_capture(struct capture_control *capc, struct page *page, } #endif /* CONFIG_COMPACTION */ -/* Used for pages not on another list */ -static inline void add_to_free_list(struct page *page, struct zone *zone, - unsigned int order, int migratetype) +static inline void account_freepages(struct zone *zone, int nr_pages, + int migratetype) { - struct free_area *area = &zone->free_area[order]; + if (is_migrate_isolate(migratetype)) + return; - list_add(&page->buddy_list, &area->free_list[migratetype]); - area->nr_free++; + __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); + + if (is_migrate_cma(migratetype)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); } /* Used for pages not on another list */ -static inline void add_to_free_list_tail(struct page *page, struct zone *zone, - unsigned int order, int migratetype) +static inline void __add_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype, + bool tail) { struct free_area *area = &zone->free_area[order]; - list_add_tail(&page->buddy_list, &area->free_list[migratetype]); + VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, + "page type is %lu, passed migratetype is %d (nr=%d)\n", + get_pageblock_migratetype(page), migratetype, 1 << order); + + if (tail) + list_add_tail(&page->buddy_list, &area->free_list[migratetype]); + else + list_add(&page->buddy_list, &area->free_list[migratetype]); area->nr_free++; } @@ -685,16 +659,28 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone, * allocation again (e.g., optimization for memory onlining). 
*/ static inline void move_to_free_list(struct page *page, struct zone *zone, - unsigned int order, int migratetype) + unsigned int order, int old_mt, int new_mt) { struct free_area *area = &zone->free_area[order]; - list_move_tail(&page->buddy_list, &area->free_list[migratetype]); + /* Free page moving can fail, so it happens before the type update */ + VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt, + "page type is %lu, passed migratetype is %d (nr=%d)\n", + get_pageblock_migratetype(page), old_mt, 1 << order); + + list_move_tail(&page->buddy_list, &area->free_list[new_mt]); + + account_freepages(zone, -(1 << order), old_mt); + account_freepages(zone, 1 << order, new_mt); } -static inline void del_page_from_free_list(struct page *page, struct zone *zone, - unsigned int order) +static inline void __del_page_from_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) { + VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, + "page type is %lu, passed migratetype is %d (nr=%d)\n", + get_pageblock_migratetype(page), migratetype, 1 << order); + /* clear reported state and update reported page count */ if (page_reported(page)) __ClearPageReported(page); @@ -705,6 +691,13 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone, zone->free_area[order].nr_free--; } +static inline void del_page_from_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + __del_page_from_free_list(page, zone, order, migratetype); + account_freepages(zone, -(1 << order), migratetype); +} + static inline struct page *get_page_from_free_area(struct free_area *area, int migratetype) { @@ -776,16 +769,16 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); VM_BUG_ON(migratetype == -1); - if (likely(!is_migrate_isolate(migratetype))) - __mod_zone_freepage_state(zone, 1 << order, migratetype); - VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); + account_freepages(zone, 1 << order, migratetype); + while (order < MAX_PAGE_ORDER) { + int buddy_mt = migratetype; + if (compaction_capture(capc, page, order, migratetype)) { - __mod_zone_freepage_state(zone, -(1 << order), - migratetype); + account_freepages(zone, -(1 << order), migratetype); return; } @@ -800,11 +793,11 @@ static inline void __free_one_page(struct page *page, * pageblock isolation could cause incorrect freepage or CMA * accounting or HIGHATOMIC accounting. */ - int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); + buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); - if (migratetype != buddy_mt - && (!migratetype_is_mergeable(migratetype) || - !migratetype_is_mergeable(buddy_mt))) + if (migratetype != buddy_mt && + (!migratetype_is_mergeable(migratetype) || + !migratetype_is_mergeable(buddy_mt))) goto done_merging; } @@ -813,9 +806,19 @@ static inline void __free_one_page(struct page *page, * merge with it and move up one order. */ if (page_is_guard(buddy)) - clear_page_guard(zone, buddy, order, migratetype); + clear_page_guard(zone, buddy, order); else - del_page_from_free_list(buddy, zone, order); + __del_page_from_free_list(buddy, zone, order, buddy_mt); + + if (unlikely(buddy_mt != migratetype)) { + /* + * Match buddy type. This ensures that an + * expand() down the line puts the sub-blocks + * on the right freelists. 
+ */ + set_pageblock_migratetype(buddy, migratetype); + } + combined_pfn = buddy_pfn & pfn; page = page + (combined_pfn - pfn); pfn = combined_pfn; @@ -832,74 +835,13 @@ done_merging: else to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); - if (to_tail) - add_to_free_list_tail(page, zone, order, migratetype); - else - add_to_free_list(page, zone, order, migratetype); + __add_to_free_list(page, zone, order, migratetype, to_tail); /* Notify page reporting subsystem of freed page */ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) page_reporting_notify_free(order); } -/** - * split_free_page() -- split a free page at split_pfn_offset - * @free_page: the original free page - * @order: the order of the page - * @split_pfn_offset: split offset within the page - * - * Return -ENOENT if the free page is changed, otherwise 0 - * - * It is used when the free page crosses two pageblocks with different migratetypes - * at split_pfn_offset within the page. The split free page will be put into - * separate migratetype lists afterwards. Otherwise, the function achieves - * nothing. - */ -int split_free_page(struct page *free_page, - unsigned int order, unsigned long split_pfn_offset) -{ - struct zone *zone = page_zone(free_page); - unsigned long free_page_pfn = page_to_pfn(free_page); - unsigned long pfn; - unsigned long flags; - int free_page_order; - int mt; - int ret = 0; - - if (split_pfn_offset == 0) - return ret; - - spin_lock_irqsave(&zone->lock, flags); - - if (!PageBuddy(free_page) || buddy_order(free_page) != order) { - ret = -ENOENT; - goto out; - } - - mt = get_pfnblock_migratetype(free_page, free_page_pfn); - if (likely(!is_migrate_isolate(mt))) - __mod_zone_freepage_state(zone, -(1UL << order), mt); - - del_page_from_free_list(free_page, zone, order); - for (pfn = free_page_pfn; - pfn < free_page_pfn + (1UL << order);) { - int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn); - - free_page_order = min_t(unsigned int, - pfn ? __ffs(pfn) : order, - __fls(split_pfn_offset)); - __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order, - mt, FPI_NONE); - pfn += 1UL << free_page_order; - split_pfn_offset -= (1UL << free_page_order); - /* we have done the first part, now switch to second part */ - if (split_pfn_offset == 0) - split_pfn_offset = (1UL << order) - (pfn - free_page_pfn); - } -out: - spin_unlock_irqrestore(&zone->lock, flags); - return ret; -} /* * A bad page could be due to a number of fields. Instead of multiple branches, * try and check multiple fields with one check. The caller must do a detailed @@ -995,6 +937,10 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page) bad_page(page, "nonzero entire_mapcount"); goto out; } + if (unlikely(folio_large_mapcount(folio))) { + bad_page(page, "nonzero large_mapcount"); + goto out; + } if (unlikely(atomic_read(&folio->_nr_pages_mapped))) { bad_page(page, "nonzero nr_pages_mapped"); goto out; @@ -1005,10 +951,11 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page) } break; case 2: - /* - * the second tail page: ->mapping is - * deferred_list.next -- ignore value. - */ + /* the second tail page: deferred_list overlaps ->mapping */ + if (unlikely(!list_empty(&folio->_deferred_list))) { + bad_page(page, "on deferred list"); + goto out; + } break; default: if (page->mapping != TAIL_MAPPING) { @@ -1061,7 +1008,7 @@ out: * on-demand allocation and then freed again before the deferred pages * initialization is done, but this is not likely to happen. 
*/ -static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +static inline bool should_skip_kasan_poison(struct page *page) { if (IS_ENABLED(CONFIG_KASAN_GENERIC)) return deferred_pages_enabled(); @@ -1069,7 +1016,7 @@ static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) return page_kasan_tag(page) == KASAN_TAG_KERNEL; } -static void kernel_init_pages(struct page *page, int numpages) +void kernel_init_pages(struct page *page, int numpages) { int i; @@ -1080,11 +1027,11 @@ static void kernel_init_pages(struct page *page, int numpages) kasan_enable_current(); } -static __always_inline bool free_pages_prepare(struct page *page, - unsigned int order, fpi_t fpi_flags) +__always_inline bool free_pages_prepare(struct page *page, + unsigned int order) { int bad = 0; - bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); + bool skip_kasan_poison = should_skip_kasan_poison(page); bool init = want_init_on_free(); bool compound = PageCompound(page); @@ -1100,6 +1047,7 @@ static __always_inline bool free_pages_prepare(struct page *page, /* Do not let hwpoison pages hit pcplists/buddy */ reset_page_owner(page, order); page_table_check_free(page, order); + pgalloc_tag_sub(page, 1 << order); return false; } @@ -1139,6 +1087,7 @@ static __always_inline bool free_pages_prepare(struct page *page, page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; reset_page_owner(page, order); page_table_check_free(page, order); + pgalloc_tag_sub(page, 1 << order); if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page), @@ -1190,7 +1139,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, { unsigned long flags; unsigned int order; - bool isolated_pageblocks; struct page *page; /* @@ -1203,7 +1151,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, pindex = pindex - 1; spin_lock_irqsave(&zone->lock, flags); - isolated_pageblocks = has_isolate_pageblock(zone); while (count > 0) { struct list_head *list; @@ -1219,23 +1166,19 @@ static void free_pcppages_bulk(struct zone *zone, int count, order = pindex_to_order(pindex); nr_pages = 1 << order; do { + unsigned long pfn; int mt; page = list_last_entry(list, struct page, pcp_list); - mt = get_pcppage_migratetype(page); + pfn = page_to_pfn(page); + mt = get_pfnblock_migratetype(page, pfn); /* must delete to avoid corrupting pcp list */ list_del(&page->pcp_list); count -= nr_pages; pcp->count -= nr_pages; - /* MIGRATE_ISOLATE page should not go to pcplists */ - VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); - /* Pageblock could have been isolated meanwhile */ - if (unlikely(isolated_pageblocks)) - mt = get_pageblock_migratetype(page); - - __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); + __free_one_page(page, pfn, zone, order, mt, FPI_NONE); trace_mm_page_pcpu_drain(page, order, mt); } while (count > 0 && !list_empty(list)); } @@ -1243,18 +1186,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, spin_unlock_irqrestore(&zone->lock, flags); } -static void free_one_page(struct zone *zone, - struct page *page, unsigned long pfn, - unsigned int order, - int migratetype, fpi_t fpi_flags) +static void free_one_page(struct zone *zone, struct page *page, + unsigned long pfn, unsigned int order, + fpi_t fpi_flags) { unsigned long flags; + int migratetype; spin_lock_irqsave(&zone->lock, flags); - if (unlikely(has_isolate_pageblock(zone) || - is_migrate_isolate(migratetype))) { - migratetype = get_pfnblock_migratetype(page, pfn); - } + migratetype = 
get_pfnblock_migratetype(page, pfn); __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); spin_unlock_irqrestore(&zone->lock, flags); } @@ -1262,21 +1202,13 @@ static void free_one_page(struct zone *zone, static void __free_pages_ok(struct page *page, unsigned int order, fpi_t fpi_flags) { - int migratetype; unsigned long pfn = page_to_pfn(page); struct zone *zone = page_zone(page); - if (!free_pages_prepare(page, order, fpi_flags)) + if (!free_pages_prepare(page, order)) return; - /* - * Calling get_pfnblock_migratetype() without spin_lock_irqsave() here - * is used to avoid calling get_pfnblock_migratetype() under the lock. - * This will reduce the lock holding time. - */ - migratetype = get_pfnblock_migratetype(page, pfn); - - free_one_page(zone, page, pfn, order, migratetype, fpi_flags); + free_one_page(zone, page, pfn, order, fpi_flags); __count_vm_events(PGFREE, 1 << order); } @@ -1387,6 +1319,7 @@ static inline void expand(struct zone *zone, struct page *page, int low, int high, int migratetype) { unsigned long size = 1 << high; + unsigned long nr_added = 0; while (high > low) { high--; @@ -1399,12 +1332,14 @@ static inline void expand(struct zone *zone, struct page *page, * Corresponding page table entries will not be touched, * pages will stay not present in virtual address space */ - if (set_page_guard(zone, &page[size], high, migratetype)) + if (set_page_guard(zone, &page[size], high)) continue; - add_to_free_list(&page[size], zone, high, migratetype); + __add_to_free_list(&page[size], zone, high, migratetype, false); set_buddy_order(&page[size], high); + nr_added += size; } + account_freepages(zone, nr_added, migratetype); } static void check_new_page_bad(struct page *page) @@ -1422,14 +1357,14 @@ static void check_new_page_bad(struct page *page) /* * This page is about to be returned from the page allocator */ -static int check_new_page(struct page *page) +static bool check_new_page(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) - return 0; + return false; check_new_page_bad(page); - return 1; + return true; } static inline bool check_new_pages(struct page *page, unsigned int order) @@ -1532,6 +1467,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, set_page_owner(page, order, gfp_flags); page_table_check_alloc(page, order); + pgalloc_tag_add(page, current, 1 << order); } static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, @@ -1572,9 +1508,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, page = get_page_from_free_area(area, migratetype); if (!page) continue; - del_page_from_free_list(page, zone, current_order); + del_page_from_free_list(page, zone, current_order, migratetype); expand(zone, page, order, current_order, migratetype); - set_pcppage_migratetype(page, migratetype); trace_mm_page_alloc_zone_locked(page, order, migratetype, pcp_allowed_order(order) && migratetype < MIGRATE_PCPTYPES); @@ -1591,7 +1526,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, * * The other migratetypes do not have fallbacks. 
*/ -static int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - 1] = { +static int fallbacks[MIGRATE_PCPTYPES][MIGRATE_PCPTYPES - 1] = { [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE }, [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE }, [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE }, @@ -1609,30 +1544,23 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, #endif /* - * Move the free pages in a range to the freelist tail of the requested type. - * Note that start_page and end_pages are not aligned on a pageblock - * boundary. If alignment is required, use move_freepages_block() + * Change the type of a block and move all its free pages to that + * type's freelist. */ -static int move_freepages(struct zone *zone, - unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int *num_movable) +static int __move_freepages_block(struct zone *zone, unsigned long start_pfn, + int old_mt, int new_mt) { struct page *page; - unsigned long pfn; + unsigned long pfn, end_pfn; unsigned int order; int pages_moved = 0; - for (pfn = start_pfn; pfn <= end_pfn;) { + VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1)); + end_pfn = pageblock_end_pfn(start_pfn); + + for (pfn = start_pfn; pfn < end_pfn;) { page = pfn_to_page(pfn); if (!PageBuddy(page)) { - /* - * We assume that pages that could be isolated for - * migration are movable. But we don't actually try - * isolating, as that would be expensive. - */ - if (num_movable && - (PageLRU(page) || __PageMovable(page))) - (*num_movable)++; pfn++; continue; } @@ -1642,36 +1570,187 @@ static int move_freepages(struct zone *zone, VM_BUG_ON_PAGE(page_zone(page) != zone, page); order = buddy_order(page); - move_to_free_list(page, zone, order, migratetype); + + move_to_free_list(page, zone, order, old_mt, new_mt); + pfn += 1 << order; pages_moved += 1 << order; } + set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt); + return pages_moved; } -int move_freepages_block(struct zone *zone, struct page *page, - int migratetype, int *num_movable) +static bool prep_move_freepages_block(struct zone *zone, struct page *page, + unsigned long *start_pfn, + int *num_free, int *num_movable) { - unsigned long start_pfn, end_pfn, pfn; + unsigned long pfn, start, end; + + pfn = page_to_pfn(page); + start = pageblock_start_pfn(pfn); + end = pageblock_end_pfn(pfn); - if (num_movable) + /* + * The caller only has the lock for @zone, don't touch ranges + * that straddle into other zones. While we could move part of + * the range that's inside the zone, this call is usually + * accompanied by other operations such as migratetype updates + * which also should be locked. + */ + if (!zone_spans_pfn(zone, start)) + return false; + if (!zone_spans_pfn(zone, end - 1)) + return false; + + *start_pfn = start; + + if (num_free) { + *num_free = 0; *num_movable = 0; + for (pfn = start; pfn < end;) { + page = pfn_to_page(pfn); + if (PageBuddy(page)) { + int nr = 1 << buddy_order(page); - pfn = page_to_pfn(page); - start_pfn = pageblock_start_pfn(pfn); - end_pfn = pageblock_end_pfn(pfn) - 1; + *num_free += nr; + pfn += nr; + continue; + } + /* + * We assume that pages that could be isolated for + * migration are movable. But we don't actually try + * isolating, as that would be expensive. 
+ */ + if (PageLRU(page) || __PageMovable(page)) + (*num_movable)++; + pfn++; + } + } - /* Do not cross zone boundaries */ - if (!zone_spans_pfn(zone, start_pfn)) - start_pfn = pfn; - if (!zone_spans_pfn(zone, end_pfn)) - return 0; + return true; +} + +static int move_freepages_block(struct zone *zone, struct page *page, + int old_mt, int new_mt) +{ + unsigned long start_pfn; + + if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) + return -1; + + return __move_freepages_block(zone, start_pfn, old_mt, new_mt); +} + +#ifdef CONFIG_MEMORY_ISOLATION +/* Look for a buddy that straddles start_pfn */ +static unsigned long find_large_buddy(unsigned long start_pfn) +{ + int order = 0; + struct page *page; + unsigned long pfn = start_pfn; + + while (!PageBuddy(page = pfn_to_page(pfn))) { + /* Nothing found */ + if (++order > MAX_PAGE_ORDER) + return start_pfn; + pfn &= ~0UL << order; + } + + /* + * Found a preceding buddy, but does it straddle? + */ + if (pfn + (1 << buddy_order(page)) > start_pfn) + return pfn; + + /* Nothing found */ + return start_pfn; +} + +/* Split a multi-block free page into its individual pageblocks */ +static void split_large_buddy(struct zone *zone, struct page *page, + unsigned long pfn, int order) +{ + unsigned long end_pfn = pfn + (1 << order); + + VM_WARN_ON_ONCE(order <= pageblock_order); + VM_WARN_ON_ONCE(pfn & (pageblock_nr_pages - 1)); + + /* Caller removed page from freelist, buddy info cleared! */ + VM_WARN_ON_ONCE(PageBuddy(page)); - return move_freepages(zone, start_pfn, end_pfn, migratetype, - num_movable); + while (pfn != end_pfn) { + int mt = get_pfnblock_migratetype(page, pfn); + + __free_one_page(page, pfn, zone, pageblock_order, mt, FPI_NONE); + pfn += pageblock_nr_pages; + page = pfn_to_page(pfn); + } } +/** + * move_freepages_block_isolate - move free pages in block for page isolation + * @zone: the zone + * @page: the pageblock page + * @migratetype: migratetype to set on the pageblock + * + * This is similar to move_freepages_block(), but handles the special + * case encountered in page isolation, where the block of interest + * might be part of a larger buddy spanning multiple pageblocks. + * + * Unlike the regular page allocator path, which moves pages while + * stealing buddies off the freelist, page isolation is interested in + * arbitrary pfn ranges that may have overlapping buddies on both ends. + * + * This function handles that. Straddling buddies are split into + * individual pageblocks. Only the block of interest is moved. + * + * Returns %true if pages could be moved, %false otherwise. 
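Caller-side, the conversion in mm/page_isolation.c further down boils down to the pattern sketched here (isolate_one_pageblock() is an illustrative name, not a function from the patch):

static int isolate_one_pageblock(struct zone *zone, struct page *page)
{
	unsigned long flags;

	spin_lock_irqsave(&zone->lock, flags);
	if (!move_freepages_block_isolate(zone, page, MIGRATE_ISOLATE)) {
		/* the block straddles a zone boundary; nothing was moved */
		spin_unlock_irqrestore(&zone->lock, flags);
		return -EBUSY;
	}
	zone->nr_isolate_pageblock++;
	spin_unlock_irqrestore(&zone->lock, flags);
	return 0;
}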
+ */ +bool move_freepages_block_isolate(struct zone *zone, struct page *page, + int migratetype) +{ + unsigned long start_pfn, pfn; + + if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) + return false; + + /* No splits needed if buddies can't span multiple blocks */ + if (pageblock_order == MAX_PAGE_ORDER) + goto move; + + /* We're a tail block in a larger buddy */ + pfn = find_large_buddy(start_pfn); + if (pfn != start_pfn) { + struct page *buddy = pfn_to_page(pfn); + int order = buddy_order(buddy); + + del_page_from_free_list(buddy, zone, order, + get_pfnblock_migratetype(buddy, pfn)); + set_pageblock_migratetype(page, migratetype); + split_large_buddy(zone, buddy, pfn, order); + return true; + } + + /* We're the starting block of a larger buddy */ + if (PageBuddy(page) && buddy_order(page) > pageblock_order) { + int order = buddy_order(page); + + del_page_from_free_list(page, zone, order, + get_pfnblock_migratetype(page, pfn)); + set_pageblock_migratetype(page, migratetype); + split_large_buddy(zone, page, pfn, order); + return true; + } +move: + __move_freepages_block(zone, start_pfn, + get_pfnblock_migratetype(page, start_pfn), + migratetype); + return true; +} +#endif /* CONFIG_MEMORY_ISOLATION */ + static void change_pageblock_range(struct page *pageblock_page, int start_order, int migratetype) { @@ -1754,33 +1833,37 @@ static inline bool boost_watermark(struct zone *zone) } /* - * This function implements actual steal behaviour. If order is large enough, - * we can steal whole pageblock. If not, we first move freepages in this - * pageblock to our migratetype and determine how many already-allocated pages - * are there in the pageblock with a compatible migratetype. If at least half - * of pages are free or compatible, we can change migratetype of the pageblock - * itself, so pages freed in the future will be put on the correct free list. + * This function implements actual steal behaviour. If order is large enough, we + * can claim the whole pageblock for the requested migratetype. If not, we check + * the pageblock for constituent pages; if at least half of the pages are free + * or compatible, we can still claim the whole block, so pages freed in the + * future will be put on the correct free list. Otherwise, we isolate exactly + * the order we need from the fallback block and leave its migratetype alone. */ -static void steal_suitable_fallback(struct zone *zone, struct page *page, - unsigned int alloc_flags, int start_type, bool whole_block) +static struct page * +steal_suitable_fallback(struct zone *zone, struct page *page, + int current_order, int order, int start_type, + unsigned int alloc_flags, bool whole_block) { - unsigned int current_order = buddy_order(page); int free_pages, movable_pages, alike_pages; - int old_block_type; + unsigned long start_pfn; + int block_type; - old_block_type = get_pageblock_migratetype(page); + block_type = get_pageblock_migratetype(page); /* * This can happen due to races and we want to prevent broken * highatomic accounting. 
*/ - if (is_migrate_highatomic(old_block_type)) + if (is_migrate_highatomic(block_type)) goto single_page; /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { + del_page_from_free_list(page, zone, current_order, block_type); change_pageblock_range(page, current_order, start_type); - goto single_page; + expand(zone, page, order, current_order, start_type); + return page; } /* @@ -1795,10 +1878,9 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, if (!whole_block) goto single_page; - free_pages = move_freepages_block(zone, page, start_type, - &movable_pages); /* moving whole block can fail due to zone boundary conditions */ - if (!free_pages) + if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages, + &movable_pages)) goto single_page; /* @@ -1816,7 +1898,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, * vice versa, be conservative since we can't distinguish the * exact migratetype of non-movable pages. */ - if (old_block_type == MIGRATE_MOVABLE) + if (block_type == MIGRATE_MOVABLE) alike_pages = pageblock_nr_pages - (free_pages + movable_pages); else @@ -1827,13 +1909,15 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, * compatible migratability as our allocation, claim the whole block. */ if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || - page_group_by_mobility_disabled) - set_pageblock_migratetype(page, start_type); - - return; + page_group_by_mobility_disabled) { + __move_freepages_block(zone, start_pfn, block_type, start_type); + return __rmqueue_smallest(zone, order, start_type); + } single_page: - move_to_free_list(page, zone, current_order, start_type); + del_page_from_free_list(page, zone, current_order, block_type); + expand(zone, page, order, current_order, block_type); + return page; } /* @@ -1900,11 +1984,10 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) /* Yoink! */ mt = get_pageblock_migratetype(page); /* Only reserve normal pageblocks (i.e., they can merge with others) */ - if (migratetype_is_mergeable(mt)) { - zone->nr_reserved_highatomic += pageblock_nr_pages; - set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); - move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); - } + if (migratetype_is_mergeable(mt)) + if (move_freepages_block(zone, page, mt, + MIGRATE_HIGHATOMIC) != -1) + zone->nr_reserved_highatomic += pageblock_nr_pages; out_unlock: spin_unlock_irqrestore(&zone->lock, flags); @@ -1928,7 +2011,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, struct zone *zone; struct page *page; int order; - bool ret; + int ret; for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, ac->nodemask) { @@ -1943,11 +2026,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &(zone->free_area[order]); + int mt; page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); if (!page) continue; + mt = get_pageblock_migratetype(page); /* * In page freeing path, migratetype change is racy so * we can counter several free pages in a pageblock @@ -1955,7 +2040,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * from highatomic to ac->migratetype. So we should * adjust the count once. 
*/ - if (is_migrate_highatomic_page(page)) { + if (is_migrate_highatomic(mt)) { /* * It should never happen but changes to * locking could inadvertently allow a per-cpu @@ -1977,10 +2062,14 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * of pageblocks that cannot be completely freed * may increase. */ - set_pageblock_migratetype(page, ac->migratetype); - ret = move_freepages_block(zone, page, ac->migratetype, - NULL); - if (ret) { + ret = move_freepages_block(zone, page, mt, + ac->migratetype); + /* + * Reserving this block already succeeded, so this should + * not fail on zone boundaries. + */ + WARN_ON_ONCE(ret == -1); + if (ret > 0) { spin_unlock_irqrestore(&zone->lock, flags); return ret; } @@ -2001,7 +2090,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * deviation from the rest of this file, to make the for loop * condition simpler. */ -static __always_inline bool +static __always_inline struct page * __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, unsigned int alloc_flags) { @@ -2048,7 +2137,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, goto do_steal; } - return false; + return NULL; find_smallest: for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { @@ -2068,14 +2157,14 @@ find_smallest: do_steal: page = get_page_from_free_area(area, fallback_mt); - steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, - can_steal); + /* take off list, maybe claim block, expand remainder */ + page = steal_suitable_fallback(zone, page, current_order, order, + start_migratetype, alloc_flags, can_steal); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); - return true; - + return page; } /* @@ -2102,15 +2191,15 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, return page; } } -retry: + page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { if (alloc_flags & ALLOC_CMA) page = __rmqueue_cma_fallback(zone, order); - if (!page && __rmqueue_fallback(zone, order, migratetype, - alloc_flags)) - goto retry; + if (!page) + page = __rmqueue_fallback(zone, order, migratetype, + alloc_flags); } return page; } @@ -2145,12 +2234,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages are ordered properly. 
*/ list_add_tail(&page->pcp_list, list); - if (is_migrate_cma(get_pcppage_migratetype(page))) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, - -(1 << order)); } - - __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock_irqrestore(&zone->lock, flags); return i; @@ -2215,12 +2299,15 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) */ static void drain_pages_zone(unsigned int cpu, struct zone *zone) { - struct per_cpu_pages *pcp; + struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + int count = READ_ONCE(pcp->count); + + while (count) { + int to_drain = min(count, pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX); + count -= to_drain; - pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - if (pcp->count) { spin_lock(&pcp->lock); - free_pcppages_bulk(zone, pcp->count, pcp, 0); + free_pcppages_bulk(zone, to_drain, pcp, 0); spin_unlock(&pcp->lock); } } @@ -2338,19 +2425,6 @@ void drain_all_pages(struct zone *zone) __drain_all_pages(zone, false); } -static bool free_unref_page_prepare(struct page *page, unsigned long pfn, - unsigned int order) -{ - int migratetype; - - if (!free_pages_prepare(page, order, FPI_NONE)) - return false; - - migratetype = get_pfnblock_migratetype(page, pfn); - set_pcppage_migratetype(page, migratetype); - return true; -} - static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high) { int min_nr_free, max_nr_free; @@ -2481,9 +2555,14 @@ void free_unref_page(struct page *page, unsigned int order) struct per_cpu_pages *pcp; struct zone *zone; unsigned long pfn = page_to_pfn(page); - int migratetype, pcpmigratetype; + int migratetype; + + if (!pcp_allowed_order(order)) { + __free_pages_ok(page, order, FPI_NONE); + return; + } - if (!free_unref_page_prepare(page, pfn, order)) + if (!free_pages_prepare(page, order)) return; /* @@ -2493,89 +2572,103 @@ void free_unref_page(struct page *page, unsigned int order) * get those areas back if necessary. 
Otherwise, we may have to free * excessively into the page allocator */ - migratetype = pcpmigratetype = get_pcppage_migratetype(page); + migratetype = get_pfnblock_migratetype(page, pfn); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); + free_one_page(page_zone(page), page, pfn, order, FPI_NONE); return; } - pcpmigratetype = MIGRATE_MOVABLE; + migratetype = MIGRATE_MOVABLE; } zone = page_zone(page); pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { - free_unref_page_commit(zone, pcp, page, pcpmigratetype, order); + free_unref_page_commit(zone, pcp, page, migratetype, order); pcp_spin_unlock(pcp); } else { - free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); + free_one_page(zone, page, pfn, order, FPI_NONE); } pcp_trylock_finish(UP_flags); } /* - * Free a list of 0-order pages + * Free a batch of folios */ -void free_unref_page_list(struct list_head *list) +void free_unref_folios(struct folio_batch *folios) { unsigned long __maybe_unused UP_flags; - struct page *page, *next; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; - int batch_count = 0; - int migratetype; + int i, j; - /* Prepare pages for freeing */ - list_for_each_entry_safe(page, next, list, lru) { - unsigned long pfn = page_to_pfn(page); - if (!free_unref_page_prepare(page, pfn, 0)) { - list_del(&page->lru); - continue; - } + /* Prepare folios for freeing */ + for (i = 0, j = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; + unsigned long pfn = folio_pfn(folio); + unsigned int order = folio_order(folio); + if (order > 0 && folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); + if (!free_pages_prepare(&folio->page, order)) + continue; /* - * Free isolated pages directly to the allocator, see - * comment in free_unref_page. + * Free orders not handled on the PCP directly to the + * allocator. */ - migratetype = get_pcppage_migratetype(page); - if (unlikely(is_migrate_isolate(migratetype))) { - list_del(&page->lru); - free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); + if (!pcp_allowed_order(order)) { + free_one_page(folio_zone(folio), &folio->page, + pfn, order, FPI_NONE); continue; } - } - - list_for_each_entry_safe(page, next, list, lru) { - struct zone *zone = page_zone(page); - - list_del(&page->lru); - migratetype = get_pcppage_migratetype(page); - - /* - * Either different zone requiring a different pcp lock or - * excessive lock hold times when freeing a large list of - * pages. - */ - if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) { + folio->private = (void *)(unsigned long)order; + if (j != i) + folios->folios[j] = folio; + j++; + } + folios->nr = j; + + for (i = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; + struct zone *zone = folio_zone(folio); + unsigned long pfn = folio_pfn(folio); + unsigned int order = (unsigned long)folio->private; + int migratetype; + + folio->private = NULL; + migratetype = get_pfnblock_migratetype(&folio->page, pfn); + + /* Different zone requires a different pcp lock */ + if (zone != locked_zone || + is_migrate_isolate(migratetype)) { if (pcp) { pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); + locked_zone = NULL; + pcp = NULL; } - batch_count = 0; + /* + * Free isolated pages directly to the + * allocator, see comment in free_unref_page. 
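The batch interface is consumed via the existing folio_batch helpers; a rough caller-side sketch (release_folios_to_buddy() is hypothetical, and the folios are assumed to have already dropped their last reference, as in the batched folio freeing paths in mm/swap.c):

static void release_folios_to_buddy(struct folio **folios, unsigned int nr)
{
	struct folio_batch fbatch;
	unsigned int i;

	folio_batch_init(&fbatch);
	for (i = 0; i < nr; i++) {
		/* folio_batch_add() returns 0 once the batch is full */
		if (!folio_batch_add(&fbatch, folios[i]))
			free_unref_folios(&fbatch);
		/* free_unref_folios() reinitializes the batch on return */
	}
	if (folio_batch_count(&fbatch))
		free_unref_folios(&fbatch);
}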
+ */ + if (is_migrate_isolate(migratetype)) { + free_one_page(zone, &folio->page, pfn, + order, FPI_NONE); + continue; + } /* - * trylock is necessary as pages may be getting freed + * trylock is necessary as folios may be getting freed * from IRQ or SoftIRQ context after an IO completion. */ pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (unlikely(!pcp)) { pcp_trylock_finish(UP_flags); - free_one_page(zone, page, page_to_pfn(page), - 0, migratetype, FPI_NONE); - locked_zone = NULL; + free_one_page(zone, &folio->page, pfn, + order, FPI_NONE); continue; } locked_zone = zone; @@ -2588,15 +2681,16 @@ void free_unref_page_list(struct list_head *list) if (unlikely(migratetype >= MIGRATE_PCPTYPES)) migratetype = MIGRATE_MOVABLE; - trace_mm_page_free_batched(page); - free_unref_page_commit(zone, pcp, page, migratetype, 0); - batch_count++; + trace_mm_page_free_batched(&folio->page); + free_unref_page_commit(zone, pcp, &folio->page, migratetype, + order); } if (pcp) { pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); } + folio_batch_reinit(folios); } /* @@ -2616,8 +2710,9 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); - split_page_owner(page, 1 << order); - split_page_memcg(page, 1 << order); + split_page_owner(page, order, 0); + pgalloc_tag_split(page, 1 << order); + split_page_memcg(page, order, 0); } EXPORT_SYMBOL_GPL(split_page); @@ -2637,11 +2732,9 @@ int __isolate_free_page(struct page *page, unsigned int order) watermark = zone->_watermark[WMARK_MIN] + (1UL << order); if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) return 0; - - __mod_zone_freepage_state(zone, -(1UL << order), mt); } - del_page_from_free_list(page, zone, order); + del_page_from_free_list(page, zone, order, mt); /* * Set the pageblock if the isolated page is at least half of a @@ -2656,8 +2749,8 @@ int __isolate_free_page(struct page *page, unsigned int order) * with others) */ if (migratetype_is_mergeable(mt)) - set_pageblock_migratetype(page, - MIGRATE_MOVABLE); + move_freepages_block(zone, page, mt, + MIGRATE_MOVABLE); } } @@ -2741,8 +2834,6 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, return NULL; } } - __mod_zone_freepage_state(zone, -(1 << order), - get_pcppage_migratetype(page)); spin_unlock_irqrestore(&zone->lock, flags); } while (check_new_pages(page, order)); @@ -4378,7 +4469,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, * * Returns the number of pages on the list or array. */ -unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, +unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, struct list_head *page_list, struct page **page_array) @@ -4514,7 +4605,7 @@ failed_irq: pcp_trylock_finish(UP_flags); failed: - page = __alloc_pages(gfp, 0, preferred_nid, nodemask); + page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask); if (page) { if (page_list) list_add(&page->lru, page_list); @@ -4525,13 +4616,13 @@ failed: goto out; } -EXPORT_SYMBOL_GPL(__alloc_pages_bulk); +EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); /* * This is the 'heart' of the zoned buddy allocator. 
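The *_noprof suffix follows the memory allocation profiling convention: the familiar entry points are expected to become thin macros that record the call site via alloc_hooks() and forward to the _noprof variant. Those wrappers live in the gfp headers rather than in this file; their assumed shape is roughly:

/* Assumed shape of the public wrappers -- not part of this patch. */
#define __alloc_pages(...)	alloc_hooks(__alloc_pages_noprof(__VA_ARGS__))
#define __folio_alloc(...)	alloc_hooks(__folio_alloc_noprof(__VA_ARGS__))
#define __get_free_pages(...)	alloc_hooks(get_free_pages_noprof(__VA_ARGS__))
#define get_zeroed_page(...)	alloc_hooks(get_zeroed_page_noprof(__VA_ARGS__))

Existing callers therefore keep compiling unchanged while gaining per-call-site accounting.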
*/ -struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, - nodemask_t *nodemask) +struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, + int preferred_nid, nodemask_t *nodemask) { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; @@ -4593,38 +4684,38 @@ out: return page; } -EXPORT_SYMBOL(__alloc_pages); +EXPORT_SYMBOL(__alloc_pages_noprof); -struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, +struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask) { - struct page *page = __alloc_pages(gfp | __GFP_COMP, order, + struct page *page = __alloc_pages_noprof(gfp | __GFP_COMP, order, preferred_nid, nodemask); return page_rmappable_folio(page); } -EXPORT_SYMBOL(__folio_alloc); +EXPORT_SYMBOL(__folio_alloc_noprof); /* * Common helper functions. Never use with __GFP_HIGHMEM because the returned * address cannot represent highmem pages. Use alloc_pages and then kmap if * you need to access high mem. */ -unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) +unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order) { struct page *page; - page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order); + page = alloc_pages_noprof(gfp_mask & ~__GFP_HIGHMEM, order); if (!page) return 0; return (unsigned long) page_address(page); } -EXPORT_SYMBOL(__get_free_pages); +EXPORT_SYMBOL(get_free_pages_noprof); -unsigned long get_zeroed_page(gfp_t gfp_mask) +unsigned long get_zeroed_page_noprof(gfp_t gfp_mask) { - return __get_free_page(gfp_mask | __GFP_ZERO); + return get_free_pages_noprof(gfp_mask | __GFP_ZERO, 0); } -EXPORT_SYMBOL(get_zeroed_page); +EXPORT_SYMBOL(get_zeroed_page_noprof); /** * __free_pages - Free pages allocated with alloc_pages(). @@ -4650,12 +4741,15 @@ void __free_pages(struct page *page, unsigned int order) { /* get PageHead before we drop reference */ int head = PageHead(page); + struct alloc_tag *tag = pgalloc_tag_get(page); if (put_page_testzero(page)) - free_the_page(page, order); - else if (!head) + free_unref_page(page, order); + else if (!head) { + pgalloc_tag_sub_pages(tag, (1 << order) - 1); while (order-- > 0) - free_the_page(page + (1 << order), order); + free_unref_page(page + (1 << order), order); + } } EXPORT_SYMBOL(__free_pages); @@ -4687,8 +4781,8 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, gfp_t gfp = gfp_mask; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) - gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | - __GFP_NOMEMALLOC; + gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | + __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC; page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, PAGE_FRAG_CACHE_MAX_ORDER); nc->size = page ? 
PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; @@ -4701,18 +4795,28 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, return page; } +void page_frag_cache_drain(struct page_frag_cache *nc) +{ + if (!nc->va) + return; + + __page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias); + nc->va = NULL; +} +EXPORT_SYMBOL(page_frag_cache_drain); + void __page_frag_cache_drain(struct page *page, unsigned int count) { VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); if (page_ref_sub_and_test(page, count)) - free_the_page(page, compound_order(page)); + free_unref_page(page, compound_order(page)); } EXPORT_SYMBOL(__page_frag_cache_drain); -void *page_frag_alloc_align(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask, - unsigned int align_mask) +void *__page_frag_alloc_align(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask, + unsigned int align_mask) { unsigned int size = PAGE_SIZE; struct page *page; @@ -4747,7 +4851,7 @@ refill: goto refill; if (unlikely(nc->pfmemalloc)) { - free_the_page(page, compound_order(page)); + free_unref_page(page, compound_order(page)); goto refill; } @@ -4781,7 +4885,7 @@ refill: return nc->va + offset; } -EXPORT_SYMBOL(page_frag_alloc_align); +EXPORT_SYMBOL(__page_frag_alloc_align); /* * Frees a page fragment allocated out of either a compound or order 0 page. @@ -4791,7 +4895,7 @@ void page_frag_free(void *addr) struct page *page = virt_to_head_page(addr); if (unlikely(put_page_testzero(page))) - free_the_page(page, compound_order(page)); + free_unref_page(page, compound_order(page)); } EXPORT_SYMBOL(page_frag_free); @@ -4803,8 +4907,9 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *page = virt_to_page((void *)addr); struct page *last = page + nr; - split_page_owner(page, 1 << order); - split_page_memcg(page, 1 << order); + split_page_owner(page, order, 0); + pgalloc_tag_split(page, 1 << order); + split_page_memcg(page, order, 0); while (page < --last) set_page_refcounted(last); @@ -4830,7 +4935,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, * * Return: pointer to the allocated area or %NULL in case of error. */ -void *alloc_pages_exact(size_t size, gfp_t gfp_mask) +void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) { unsigned int order = get_order(size); unsigned long addr; @@ -4838,10 +4943,10 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); - addr = __get_free_pages(gfp_mask, order); + addr = get_free_pages_noprof(gfp_mask, order); return make_alloc_exact(addr, order, size); } -EXPORT_SYMBOL(alloc_pages_exact); +EXPORT_SYMBOL(alloc_pages_exact_noprof); /** * alloc_pages_exact_nid - allocate an exact number of physically-contiguous @@ -4855,7 +4960,7 @@ EXPORT_SYMBOL(alloc_pages_exact); * * Return: pointer to the allocated area or %NULL in case of error. 
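The page_frag_cache_drain() helper added a couple of hunks above gives owners of a struct page_frag_cache a single teardown call; a hedged usage sketch (my_dev and its members are illustrative, and the cache is assumed to be zero-initialized so its va pointer starts out NULL):

struct my_dev {
	struct page_frag_cache rx_frag_cache;	/* zeroed at allocation */
};

static void *my_dev_alloc_rx_buf(struct my_dev *dev, unsigned int len)
{
	/* carve small receive buffers out of the shared frag cache */
	return page_frag_alloc(&dev->rx_frag_cache, len, GFP_ATOMIC);
}

static void my_dev_teardown(struct my_dev *dev)
{
	/* releases the cache's remaining page reference, if any */
	page_frag_cache_drain(&dev->rx_frag_cache);
}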
*/ -void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) +void * __meminit alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) { unsigned int order = get_order(size); struct page *p; @@ -4863,7 +4968,7 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); - p = alloc_pages_node(nid, gfp_mask, order); + p = alloc_pages_node_noprof(nid, gfp_mask, order); if (!p) return NULL; return make_alloc_exact((unsigned long)page_address(p), order, size); @@ -5164,37 +5269,13 @@ static void setup_min_slab_ratio(void); static void build_zonelists(pg_data_t *pgdat) { - int node, local_node; struct zoneref *zonerefs; int nr_zones; - local_node = pgdat->node_id; - zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; nr_zones = build_zonerefs_node(pgdat, zonerefs); zonerefs += nr_zones; - /* - * Now we build the zonelist so that it contains the zones - * of all the other nodes. - * We don't want to pressure a particular node, so when - * building the zones for node N, we make sure that the - * zones coming right after the local ones are those from - * node N+1 (modulo N) - */ - for (node = local_node + 1; node < MAX_NUMNODES; node++) { - if (!node_online(node)) - continue; - nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); - zonerefs += nr_zones; - } - for (node = 0; node < local_node; node++) { - if (!node_online(node)) - continue; - nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); - zonerefs += nr_zones; - } - zonerefs->zone = NULL; zonerefs->zone_idx = 0; } @@ -5574,37 +5655,34 @@ static void zone_pcp_update(struct zone *zone, int cpu_online) mutex_unlock(&pcp_batch_high_lock); } -static void zone_pcp_update_cacheinfo(struct zone *zone) +static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu) { - int cpu; struct per_cpu_pages *pcp; struct cpu_cacheinfo *cci; - for_each_online_cpu(cpu) { - pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - cci = get_cpu_cacheinfo(cpu); - /* - * If data cache slice of CPU is large enough, "pcp->batch" - * pages can be preserved in PCP before draining PCP for - * consecutive high-order pages freeing without allocation. - * This can reduce zone lock contention without hurting - * cache-hot pages sharing. - */ - spin_lock(&pcp->lock); - if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) - pcp->flags |= PCPF_FREE_HIGH_BATCH; - else - pcp->flags &= ~PCPF_FREE_HIGH_BATCH; - spin_unlock(&pcp->lock); - } + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + cci = get_cpu_cacheinfo(cpu); + /* + * If data cache slice of CPU is large enough, "pcp->batch" + * pages can be preserved in PCP before draining PCP for + * consecutive high-order pages freeing without allocation. + * This can reduce zone lock contention without hurting + * cache-hot pages sharing. 
+ */ + spin_lock(&pcp->lock); + if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) + pcp->flags |= PCPF_FREE_HIGH_BATCH; + else + pcp->flags &= ~PCPF_FREE_HIGH_BATCH; + spin_unlock(&pcp->lock); } -void setup_pcp_cacheinfo(void) +void setup_pcp_cacheinfo(unsigned int cpu) { struct zone *zone; for_each_populated_zone(zone) - zone_pcp_update_cacheinfo(zone); + zone_pcp_update_cacheinfo(zone, cpu); } /* @@ -5814,10 +5892,11 @@ static void setup_per_zone_lowmem_reserve(void) for (j = i + 1; j < MAX_NR_ZONES; j++) { struct zone *upper_zone = &pgdat->node_zones[j]; + bool empty = !zone_managed_pages(upper_zone); managed_pages += zone_managed_pages(upper_zone); - if (clear) + if (clear || empty) zone->lowmem_reserve[j] = 0; else zone->lowmem_reserve[j] = managed_pages / ratio; @@ -5847,7 +5926,7 @@ static void __setup_per_zone_wmarks(void) spin_lock_irqsave(&zone->lock, flags); tmp = (u64)pages_min * zone_managed_pages(zone); - do_div(tmp, lowmem_pages); + tmp = div64_ul(tmp, lowmem_pages); if (is_highmem(zone) || zone_idx(zone) == ZONE_MOVABLE) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't @@ -6198,7 +6277,6 @@ static struct ctl_table page_alloc_sysctl_table[] = { .extra2 = SYSCTL_ONE_HUNDRED, }, #endif - {} }; void __init page_alloc_sysctl_init(void) @@ -6221,9 +6299,14 @@ static void alloc_contig_dump_pages(struct list_head *page_list) } } -/* [start, end) must belong to a single zone. */ +/* + * [start, end) must belong to a single zone. + * @migratetype: using migratetype to filter the type of migration in + * trace_mm_alloc_contig_migrate_range_info. + */ int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + int migratetype) { /* This function is based on compact_zone() from compaction.c. */ unsigned int nr_reclaimed; @@ -6233,7 +6316,12 @@ int __alloc_contig_migrate_range(struct compact_control *cc, struct migration_target_control mtc = { .nid = zone_to_nid(cc->zone), .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .reason = MR_CONTIG_RANGE, }; + struct page *page; + unsigned long total_mapped = 0; + unsigned long total_migrated = 0; + unsigned long total_reclaimed = 0; lru_cache_disable(); @@ -6259,9 +6347,22 @@ int __alloc_contig_migrate_range(struct compact_control *cc, &cc->migratepages); cc->nr_migratepages -= nr_reclaimed; + if (trace_mm_alloc_contig_migrate_range_info_enabled()) { + total_reclaimed += nr_reclaimed; + list_for_each_entry(page, &cc->migratepages, lru) { + struct folio *folio = page_folio(page); + + total_mapped += folio_mapped(folio) * + folio_nr_pages(folio); + } + } + ret = migrate_pages(&cc->migratepages, alloc_migration_target, NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL); + if (trace_mm_alloc_contig_migrate_range_info_enabled() && !ret) + total_migrated += cc->nr_migratepages; + /* * On -ENOMEM, migrate_pages() bails out right away. It is pointless * to retry again over this error, so do the same here. @@ -6275,9 +6376,13 @@ int __alloc_contig_migrate_range(struct compact_control *cc, if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY) alloc_contig_dump_pages(&cc->migratepages); putback_movable_pages(&cc->migratepages); - return ret; } - return 0; + + trace_mm_alloc_contig_migrate_range_info(start, end, migratetype, + total_migrated, + total_reclaimed, + total_mapped); + return (ret < 0) ? 
ret : 0; } /** @@ -6301,11 +6406,10 @@ int __alloc_contig_migrate_range(struct compact_control *cc, * pages which PFN is in [start, end) are allocated for the caller and * need to be freed with free_contig_range(). */ -int alloc_contig_range(unsigned long start, unsigned long end, +int alloc_contig_range_noprof(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask) { unsigned long outer_start, outer_end; - int order; int ret = 0; struct compact_control cc = { @@ -6357,7 +6461,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, * allocated. So, if we fall through be sure to clear ret so that * -EBUSY is not accidentally used or returned to caller. */ - ret = __alloc_contig_migrate_range(&cc, start, end); + ret = __alloc_contig_migrate_range(&cc, start, end, migratetype); if (ret && ret != -EBUSY) goto done; ret = 0; @@ -6378,29 +6482,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, * We don't have to hold zone->lock here because the pages are * isolated thus they won't get removed from buddy. */ - - order = 0; - outer_start = start; - while (!PageBuddy(pfn_to_page(outer_start))) { - if (++order > MAX_PAGE_ORDER) { - outer_start = start; - break; - } - outer_start &= ~0UL << order; - } - - if (outer_start != start) { - order = buddy_order(pfn_to_page(outer_start)); - - /* - * outer_start page could be small order buddy page and - * it doesn't include start page. Adjust outer_start - * in this case to report failed page properly - * on tracepoint in test_pages_isolated() - */ - if (outer_start + (1UL << order) <= start) - outer_start = start; - } + outer_start = find_large_buddy(start); /* Make sure the range is really isolated. */ if (test_pages_isolated(outer_start, end, 0)) { @@ -6425,15 +6507,15 @@ done: undo_isolate_page_range(start, end, migratetype); return ret; } -EXPORT_SYMBOL(alloc_contig_range); +EXPORT_SYMBOL(alloc_contig_range_noprof); static int __alloc_contig_pages(unsigned long start_pfn, unsigned long nr_pages, gfp_t gfp_mask) { unsigned long end_pfn = start_pfn + nr_pages; - return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, - gfp_mask); + return alloc_contig_range_noprof(start_pfn, end_pfn, MIGRATE_MOVABLE, + gfp_mask); } static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, @@ -6488,8 +6570,8 @@ static bool zone_spans_last_pfn(const struct zone *zone, * * Return: pointer to contiguous pages on success, or NULL if not successful. */ -struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) +struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { unsigned long ret, pfn, flags; struct zonelist *zonelist; @@ -6620,8 +6702,9 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(page_count(page)); BUG_ON(!PageBuddy(page)); + VM_WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE); order = buddy_order(page); - del_page_from_free_list(page, zone, order); + del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE); pfn += (1 << order); } spin_unlock_irqrestore(&zone->lock, flags); @@ -6631,16 +6714,16 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) /* * This function returns a stable result only if called under zone lock. 
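For completeness, the pairing that alloc_contig_pages() users follow, sketched with only the APIs named above (grab_contig()/drop_contig() are illustrative names, error handling trimmed):

static struct page *grab_contig(unsigned long nr_pages, int nid)
{
	/* may fail; callers must tolerate NULL */
	return alloc_contig_pages(nr_pages, GFP_KERNEL, nid, NULL);
}

static void drop_contig(struct page *page, unsigned long nr_pages)
{
	/* must cover exactly the range that was allocated */
	free_contig_range(page_to_pfn(page), nr_pages);
}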
*/ -bool is_free_buddy_page(struct page *page) +bool is_free_buddy_page(const struct page *page) { unsigned long pfn = page_to_pfn(page); unsigned int order; for (order = 0; order < NR_PAGE_ORDERS; order++) { - struct page *page_head = page - (pfn & ((1 << order) - 1)); + const struct page *head = page - (pfn & ((1 << order) - 1)); - if (PageBuddy(page_head) && - buddy_order_unsafe(page_head) >= order) + if (PageBuddy(head) && + buddy_order_unsafe(head) >= order) break; } @@ -6649,6 +6732,14 @@ bool is_free_buddy_page(struct page *page) EXPORT_SYMBOL(is_free_buddy_page); #ifdef CONFIG_MEMORY_FAILURE +static inline void add_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype, + bool tail) +{ + __add_to_free_list(page, zone, order, migratetype, tail); + account_freepages(zone, 1 << order, migratetype); +} + /* * Break down a higher-order page in sub-pages, and keep our target out of * buddy allocator. @@ -6671,10 +6762,10 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, current_buddy = page + size; } - if (set_page_guard(zone, current_buddy, high, migratetype)) + if (set_page_guard(zone, current_buddy, high)) continue; - add_to_free_list(current_buddy, zone, high, migratetype); + add_to_free_list(current_buddy, zone, high, migratetype, false); set_buddy_order(current_buddy, high); } } @@ -6700,12 +6791,11 @@ bool take_page_off_buddy(struct page *page) int migratetype = get_pfnblock_migratetype(page_head, pfn_head); - del_page_from_free_list(page_head, zone, page_order); + del_page_from_free_list(page_head, zone, page_order, + migratetype); break_down_buddy_pages(zone, page_head, page, 0, page_order, migratetype); SetPageHWPoisonTakenOff(page); - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, -1, migratetype); ret = true; break; } @@ -6722,13 +6812,14 @@ bool take_page_off_buddy(struct page *page) bool put_page_back_buddy(struct page *page) { struct zone *zone = page_zone(page); - unsigned long pfn = page_to_pfn(page); unsigned long flags; - int migratetype = get_pfnblock_migratetype(page, pfn); bool ret = false; spin_lock_irqsave(&zone->lock, flags); if (put_page_testzero(page)) { + unsigned long pfn = page_to_pfn(page); + int migratetype = get_pfnblock_migratetype(page, pfn); + ClearPageHWPoisonTakenOff(page); __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); if (TestClearPageHWPoison(page)) { @@ -6812,7 +6903,7 @@ static bool try_to_accept_memory_one(struct zone *zone) list_del(&page->lru); last = list_empty(&zone->unaccepted_pages); - __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); spin_unlock_irqrestore(&zone->lock, flags); @@ -6864,7 +6955,7 @@ static bool __free_unaccepted(struct page *page) spin_lock_irqsave(&zone->lock, flags); first = list_empty(&zone->unaccepted_pages); list_add_tail(&page->lru, &zone->unaccepted_pages); - __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); spin_unlock_irqrestore(&zone->lock, flags); diff --git a/mm/page_ext.c b/mm/page_ext.c index 4548fcc66d74..95dd8ffeaf81 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -10,6 +10,7 @@ #include <linux/page_idle.h> #include <linux/page_table_check.h> #include <linux/rcupdate.h> +#include <linux/pgalloc_tag.h> /* * struct 
page extension @@ -82,6 +83,9 @@ static struct page_ext_operations *page_ext_ops[] __initdata = { #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif +#ifdef CONFIG_MEM_ALLOC_PROFILING + &page_alloc_tagging_ops, +#endif #ifdef CONFIG_PAGE_TABLE_CHECK &page_table_check_ops, #endif @@ -91,7 +95,16 @@ unsigned long page_ext_size; static unsigned long total_usage; +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG +/* + * To ensure correct allocation tagging for pages, page_ext should be available + * before the first page allocation. Otherwise early task stacks will be + * allocated before page_ext initialization and missing tags will be flagged. + */ +bool early_page_ext __meminitdata = true; +#else bool early_page_ext __meminitdata; +#endif static int __init setup_early_page_ext(char *str) { early_page_ext = true; @@ -501,7 +514,7 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) * Context: Any context. Caller may not sleep until they have called * page_ext_put(). */ -struct page_ext *page_ext_get(struct page *page) +struct page_ext *page_ext_get(const struct page *page) { struct page_ext *page_ext; diff --git a/mm/page_io.c b/mm/page_io.c index ae2b49055e43..46c603dddf04 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -189,7 +189,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) * Arch code may have to preserve more data than just the page * contents, e.g. memory tags. */ - ret = arch_prepare_to_swap(&folio->page); + ret = arch_prepare_to_swap(folio); if (ret) { folio_mark_dirty(folio); folio_unlock(folio); @@ -217,6 +217,7 @@ static inline void count_swpout_vm_event(struct folio *folio) count_memcg_folio_events(folio, THP_SWPOUT, 1); count_vm_event(THP_SWPOUT); } + count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_SWPOUT); #endif count_vm_events(PSWPOUT, folio_nr_pages(folio)); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index cd0ea3668253..042937d5abe4 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -178,15 +178,11 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, migratetype, isol_flags); if (!unmovable) { - unsigned long nr_pages; - int mt = get_pageblock_migratetype(page); - - set_pageblock_migratetype(page, MIGRATE_ISOLATE); + if (!move_freepages_block_isolate(zone, page, MIGRATE_ISOLATE)) { + spin_unlock_irqrestore(&zone->lock, flags); + return -EBUSY; + } zone->nr_isolate_pageblock++; - nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, - NULL); - - __mod_zone_freepage_state(zone, -nr_pages, mt); spin_unlock_irqrestore(&zone->lock, flags); return 0; } @@ -206,7 +202,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ static void unset_migratetype_isolate(struct page *page, int migratetype) { struct zone *zone; - unsigned long flags, nr_pages; + unsigned long flags; bool isolated_page = false; unsigned int order; struct page *buddy; @@ -252,12 +248,15 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) * allocation. */ if (!isolated_page) { - nr_pages = move_freepages_block(zone, page, migratetype, NULL); - __mod_zone_freepage_state(zone, nr_pages, migratetype); - } - set_pageblock_migratetype(page, migratetype); - if (isolated_page) + /* + * Isolating this block already succeeded, so this + * should not fail on zone boundaries. 
+ */ + WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype)); + } else { + set_pageblock_migratetype(page, migratetype); __putback_isolated_page(page, order, migratetype); + } zone->nr_isolate_pageblock--; out: spin_unlock_irqrestore(&zone->lock, flags); @@ -367,26 +366,29 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, VM_BUG_ON(!page); pfn = page_to_pfn(page); - /* - * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any - * free pages in [start_pfn, boundary_pfn), its head page will - * always be in the range. - */ + if (PageBuddy(page)) { int order = buddy_order(page); - if (pfn + (1UL << order) > boundary_pfn) { - /* free page changed before split, check it again */ - if (split_free_page(page, order, boundary_pfn - pfn)) - continue; - } + /* move_freepages_block_isolate() handled this */ + VM_WARN_ON_ONCE(pfn + (1 << order) > boundary_pfn); pfn += 1UL << order; continue; } + /* - * migrate compound pages then let the free page handling code - * above do the rest. If migration is not possible, just fail. + * If a compound page is straddling our block, attempt + * to migrate it out of the way. + * + * We don't have to worry about this creating a large + * free page that straddles into our block: gigantic + * pages are freed as order-0 chunks, and LRU pages + * (currently) do not exceed pageblock_order. + * + * The block of interest has already been marked + * MIGRATE_ISOLATE above, so when migration is done it + * will free its pages onto the correct freelists. */ if (PageCompound(page)) { struct page *head = compound_head(page); @@ -397,16 +399,10 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, pfn = head_pfn + nr_pages; continue; } + #if defined CONFIG_COMPACTION || defined CONFIG_CMA - /* - * hugetlb, lru compound (THP), and movable compound pages - * can be migrated. Otherwise, fail the isolation. - */ - if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) { - int order; - unsigned long outer_pfn; + if (PageHuge(page)) { int page_mt = get_pageblock_migratetype(page); - bool isolate_page = !is_migrate_isolate_page(page); struct compact_control cc = { .nr_migratepages = 0, .order = -1, @@ -419,56 +415,25 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, }; INIT_LIST_HEAD(&cc.migratepages); - /* - * XXX: mark the page as MIGRATE_ISOLATE so that - * no one else can grab the freed page after migration. - * Ideally, the page should be freed as two separate - * pages to be added into separate migratetype free - * lists. - */ - if (isolate_page) { - ret = set_migratetype_isolate(page, page_mt, - flags, head_pfn, head_pfn + nr_pages); - if (ret) - goto failed; - } - ret = __alloc_contig_migrate_range(&cc, head_pfn, - head_pfn + nr_pages); - - /* - * restore the page's migratetype so that it can - * be split into separate migratetype free lists - * later. - */ - if (isolate_page) - unset_migratetype_isolate(page, page_mt); - + head_pfn + nr_pages, page_mt); if (ret) goto failed; - /* - * reset pfn to the head of the free page, so - * that the free page handling code above can split - * the free page to the right migratetype list. - * - * head_pfn is not used here as a hugetlb page order - * can be bigger than MAX_PAGE_ORDER, but after it is - * freed, the free page order is not. Use pfn within - * the range to find the head of the free page. 
- */ - order = 0; - outer_pfn = pfn; - while (!PageBuddy(pfn_to_page(outer_pfn))) { - /* stop if we cannot find the free page */ - if (++order > MAX_PAGE_ORDER) - goto failed; - outer_pfn &= ~0UL << order; - } - pfn = outer_pfn; + pfn = head_pfn + nr_pages; continue; - } else + } + + /* + * These pages are movable too, but they're + * not expected to exceed pageblock_order. + * + * Let us know when they do, so we can add + * proper free and split handling for them. + */ + VM_WARN_ON_ONCE_PAGE(PageLRU(page), page); + VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page); #endif - goto failed; + goto failed; } pfn++; diff --git a/mm/page_owner.c b/mm/page_owner.c index 5634e5d890f8..2d6360eaccbb 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -36,6 +36,15 @@ struct page_owner { pid_t free_tgid; }; +struct stack { + struct stack_record *stack_record; + struct stack *next; +}; +static struct stack dummy_stack; +static struct stack failure_stack; +static struct stack *stack_list; +static DEFINE_SPINLOCK(stack_list_lock); + static bool page_owner_enabled __initdata; DEFINE_STATIC_KEY_FALSE(page_owner_inited); @@ -45,6 +54,22 @@ static depot_stack_handle_t early_handle; static void init_early_allocated_pages(void); +static inline void set_current_in_page_owner(void) +{ + /* + * Avoid recursion. + * + * We might need to allocate more memory from page_owner code, so make + * sure to signal it in order to avoid recursion. + */ + current->in_page_owner = 1; +} + +static inline void unset_current_in_page_owner(void) +{ + current->in_page_owner = 0; +} + static int __init early_page_owner_param(char *buf) { int ret = kstrtobool(buf, &page_owner_enabled); @@ -93,8 +118,17 @@ static __init void init_page_owner(void) register_dummy_stack(); register_failure_stack(); register_early_stack(); - static_branch_enable(&page_owner_inited); init_early_allocated_pages(); + /* Initialize dummy and failure stacks and link them to stack_list */ + dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle); + failure_stack.stack_record = __stack_depot_get_stack_record(failure_handle); + if (dummy_stack.stack_record) + refcount_set(&dummy_stack.stack_record->count, 1); + if (failure_stack.stack_record) + refcount_set(&failure_stack.stack_record->count, 1); + dummy_stack.next = &failure_stack; + stack_list = &dummy_stack; + static_branch_enable(&page_owner_inited); } struct page_ext_operations page_owner_ops = { @@ -115,81 +149,172 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags) depot_stack_handle_t handle; unsigned int nr_entries; - /* - * Avoid recursion. 
- * - * Sometimes page metadata allocation tracking requires more - * memory to be allocated: - * - when new stack trace is saved to stack depot - */ if (current->in_page_owner) return dummy_handle; - current->in_page_owner = 1; + set_current_in_page_owner(); nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2); handle = stack_depot_save(entries, nr_entries, flags); if (!handle) handle = failure_handle; + unset_current_in_page_owner(); - current->in_page_owner = 0; return handle; } -void __reset_page_owner(struct page *page, unsigned short order) +static void add_stack_record_to_list(struct stack_record *stack_record, + gfp_t gfp_mask) { - int i; - struct page_ext *page_ext; - depot_stack_handle_t handle; - struct page_owner *page_owner; - u64 free_ts_nsec = local_clock(); + unsigned long flags; + struct stack *stack; - page_ext = page_ext_get(page); - if (unlikely(!page_ext)) + set_current_in_page_owner(); + stack = kmalloc(sizeof(*stack), gfp_nested_mask(gfp_mask)); + if (!stack) { + unset_current_in_page_owner(); return; + } + unset_current_in_page_owner(); - handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); - for (i = 0; i < (1 << order); i++) { - __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); - page_owner = get_page_owner(page_ext); - page_owner->free_handle = handle; - page_owner->free_ts_nsec = free_ts_nsec; - page_owner->free_pid = current->pid; - page_owner->free_tgid = current->tgid; - page_ext = page_ext_next(page_ext); + stack->stack_record = stack_record; + stack->next = NULL; + + spin_lock_irqsave(&stack_list_lock, flags); + stack->next = stack_list; + /* + * This pairs with smp_load_acquire() from function + * stack_start(). This guarantees that stack_start() + * will see an updated stack_list before starting to + * traverse the list. + */ + smp_store_release(&stack_list, stack); + spin_unlock_irqrestore(&stack_list_lock, flags); +} + +static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask, + int nr_base_pages) +{ + struct stack_record *stack_record = __stack_depot_get_stack_record(handle); + + if (!stack_record) + return; + + /* + * New stack_record's that do not use STACK_DEPOT_FLAG_GET start + * with REFCOUNT_SATURATED to catch spurious increments of their + * refcount. + * Since we do not use STACK_DEPOT_FLAG_GET API, let us + * set a refcount of 1 ourselves. 
+ */ + if (refcount_read(&stack_record->count) == REFCOUNT_SATURATED) { + int old = REFCOUNT_SATURATED; + + if (atomic_try_cmpxchg_relaxed(&stack_record->count.refs, &old, 1)) + /* Add the new stack_record to our list */ + add_stack_record_to_list(stack_record, gfp_mask); } - page_ext_put(page_ext); + refcount_add(nr_base_pages, &stack_record->count); } -static inline void __set_page_owner_handle(struct page_ext *page_ext, - depot_stack_handle_t handle, - unsigned short order, gfp_t gfp_mask) +static void dec_stack_record_count(depot_stack_handle_t handle, + int nr_base_pages) +{ + struct stack_record *stack_record = __stack_depot_get_stack_record(handle); + + if (!stack_record) + return; + + if (refcount_sub_and_test(nr_base_pages, &stack_record->count)) + pr_warn("%s: refcount went to 0 for %u handle\n", __func__, + handle); +} + +static inline void __update_page_owner_handle(struct page_ext *page_ext, + depot_stack_handle_t handle, + unsigned short order, + gfp_t gfp_mask, + short last_migrate_reason, u64 ts_nsec, + pid_t pid, pid_t tgid, char *comm) { - struct page_owner *page_owner; int i; - u64 ts_nsec = local_clock(); + struct page_owner *page_owner; for (i = 0; i < (1 << order); i++) { page_owner = get_page_owner(page_ext); page_owner->handle = handle; page_owner->order = order; page_owner->gfp_mask = gfp_mask; - page_owner->last_migrate_reason = -1; - page_owner->pid = current->pid; - page_owner->tgid = current->tgid; + page_owner->last_migrate_reason = last_migrate_reason; + page_owner->pid = pid; + page_owner->tgid = tgid; page_owner->ts_nsec = ts_nsec; - strscpy(page_owner->comm, current->comm, + strscpy(page_owner->comm, comm, sizeof(page_owner->comm)); __set_bit(PAGE_EXT_OWNER, &page_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); + page_ext = page_ext_next(page_ext); + } +} + +static inline void __update_page_owner_free_handle(struct page_ext *page_ext, + depot_stack_handle_t handle, + unsigned short order, + pid_t pid, pid_t tgid, + u64 free_ts_nsec) +{ + int i; + struct page_owner *page_owner; + for (i = 0; i < (1 << order); i++) { + page_owner = get_page_owner(page_ext); + /* Only __reset_page_owner() wants to clear the bit */ + if (handle) { + __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); + page_owner->free_handle = handle; + } + page_owner->free_ts_nsec = free_ts_nsec; + page_owner->free_pid = current->pid; + page_owner->free_tgid = current->tgid; page_ext = page_ext_next(page_ext); } } +void __reset_page_owner(struct page *page, unsigned short order) +{ + struct page_ext *page_ext; + depot_stack_handle_t handle; + depot_stack_handle_t alloc_handle; + struct page_owner *page_owner; + u64 free_ts_nsec = local_clock(); + + page_ext = page_ext_get(page); + if (unlikely(!page_ext)) + return; + + page_owner = get_page_owner(page_ext); + alloc_handle = page_owner->handle; + + handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); + __update_page_owner_free_handle(page_ext, handle, order, current->pid, + current->tgid, free_ts_nsec); + page_ext_put(page_ext); + + if (alloc_handle != early_handle) + /* + * early_handle is being set as a handle for all those + * early allocated pages. See init_pages_in_zone(). + * Since their refcount is not being incremented because + * the machinery is not ready yet, we cannot decrement + * their refcount either. 
+ */ + dec_stack_record_count(alloc_handle, 1 << order); +} + noinline void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask) { struct page_ext *page_ext; + u64 ts_nsec = local_clock(); depot_stack_handle_t handle; handle = save_stack(gfp_mask); @@ -197,8 +322,11 @@ noinline void __set_page_owner(struct page *page, unsigned short order, page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; - __set_page_owner_handle(page_ext, handle, order, gfp_mask); + __update_page_owner_handle(page_ext, handle, order, gfp_mask, -1, + ts_nsec, current->pid, current->tgid, + current->comm); page_ext_put(page_ext); + inc_stack_record_count(handle, gfp_mask, 1 << order); } void __set_page_owner_migrate_reason(struct page *page, int reason) @@ -214,7 +342,7 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) page_ext_put(page_ext); } -void __split_page_owner(struct page *page, unsigned int nr) +void __split_page_owner(struct page *page, int old_order, int new_order) { int i; struct page_ext *page_ext = page_ext_get(page); @@ -223,9 +351,9 @@ void __split_page_owner(struct page *page, unsigned int nr) if (unlikely(!page_ext)) return; - for (i = 0; i < nr; i++) { + for (i = 0; i < (1 << old_order); i++) { page_owner = get_page_owner(page_ext); - page_owner->order = 0; + page_owner->order = new_order; page_ext = page_ext_next(page_ext); } page_ext_put(page_ext); @@ -233,9 +361,12 @@ void __split_page_owner(struct page *page, unsigned int nr) void __folio_copy_owner(struct folio *newfolio, struct folio *old) { + int i; struct page_ext *old_ext; struct page_ext *new_ext; - struct page_owner *old_page_owner, *new_page_owner; + struct page_owner *old_page_owner; + struct page_owner *new_page_owner; + depot_stack_handle_t migrate_handle; old_ext = page_ext_get(&old->page); if (unlikely(!old_ext)) @@ -249,30 +380,32 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) old_page_owner = get_page_owner(old_ext); new_page_owner = get_page_owner(new_ext); - new_page_owner->order = old_page_owner->order; - new_page_owner->gfp_mask = old_page_owner->gfp_mask; - new_page_owner->last_migrate_reason = - old_page_owner->last_migrate_reason; - new_page_owner->handle = old_page_owner->handle; - new_page_owner->pid = old_page_owner->pid; - new_page_owner->tgid = old_page_owner->tgid; - new_page_owner->free_pid = old_page_owner->free_pid; - new_page_owner->free_tgid = old_page_owner->free_tgid; - new_page_owner->ts_nsec = old_page_owner->ts_nsec; - new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; - strcpy(new_page_owner->comm, old_page_owner->comm); - + migrate_handle = new_page_owner->handle; + __update_page_owner_handle(new_ext, old_page_owner->handle, + old_page_owner->order, old_page_owner->gfp_mask, + old_page_owner->last_migrate_reason, + old_page_owner->ts_nsec, old_page_owner->pid, + old_page_owner->tgid, old_page_owner->comm); + /* + * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio + * will be freed after migration. Keep them until then as they may be + * useful. + */ + __update_page_owner_free_handle(new_ext, 0, old_page_owner->order, + old_page_owner->free_pid, + old_page_owner->free_tgid, + old_page_owner->free_ts_nsec); /* - * We don't clear the bit on the old folio as it's going to be freed - * after migration. Until then, the info can be useful in case of - * a bug, and the overall stats will be off a bit only temporarily. 
- * Also, migrate_misplaced_transhuge_page() can still fail the - * migration and then we want the old folio to retain the info. But - * in that case we also don't need to explicitly clear the info from - * the new page, which will be freed. + * We linked the original stack to the new folio, we need to do the same + * for the new one and the old folio otherwise there will be an imbalance + * when subtracting those pages from the stack. */ - __set_bit(PAGE_EXT_OWNER, &new_ext->flags); - __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags); + for (i = 0; i < (1 << new_page_owner->order); i++) { + old_page_owner->handle = migrate_handle; + old_ext = page_ext_next(old_ext); + old_page_owner = get_page_owner(old_ext); + } + page_ext_put(new_ext); page_ext_put(old_ext); } @@ -377,7 +510,7 @@ static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret, if (!memcg_data) goto out_unlock; - if (memcg_data & MEMCG_DATA_OBJCGS) + if (memcg_data & MEMCG_DATA_OBJEXTS) ret += scnprintf(kbuf + ret, count - ret, "Slab cache page\n"); @@ -680,8 +813,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) goto ext_put_continue; /* Found early allocated page */ - __set_page_owner_handle(page_ext, early_handle, - 0, 0); + __update_page_owner_handle(page_ext, early_handle, 0, 0, + -1, local_clock(), current->pid, + current->tgid, current->comm); count++; ext_put_continue: page_ext_put(page_ext); @@ -719,8 +853,109 @@ static const struct file_operations proc_page_owner_operations = { .llseek = lseek_page_owner, }; +static void *stack_start(struct seq_file *m, loff_t *ppos) +{ + struct stack *stack; + + if (*ppos == -1UL) + return NULL; + + if (!*ppos) { + /* + * This pairs with smp_store_release() from function + * add_stack_record_to_list(), so we get a consistent + * value of stack_list. + */ + stack = smp_load_acquire(&stack_list); + m->private = stack; + } else { + stack = m->private; + } + + return stack; +} + +static void *stack_next(struct seq_file *m, void *v, loff_t *ppos) +{ + struct stack *stack = v; + + stack = stack->next; + *ppos = stack ? 
*ppos + 1 : -1UL; + m->private = stack; + + return stack; +} + +static unsigned long page_owner_pages_threshold; + +static int stack_print(struct seq_file *m, void *v) +{ + int i, nr_base_pages; + struct stack *stack = v; + unsigned long *entries; + unsigned long nr_entries; + struct stack_record *stack_record = stack->stack_record; + + if (!stack->stack_record) + return 0; + + nr_entries = stack_record->size; + entries = stack_record->entries; + nr_base_pages = refcount_read(&stack_record->count) - 1; + + if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold) + return 0; + + for (i = 0; i < nr_entries; i++) + seq_printf(m, " %pS\n", (void *)entries[i]); + seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages); + + return 0; +} + +static void stack_stop(struct seq_file *m, void *v) +{ +} + +static const struct seq_operations page_owner_stack_op = { + .start = stack_start, + .next = stack_next, + .stop = stack_stop, + .show = stack_print +}; + +static int page_owner_stack_open(struct inode *inode, struct file *file) +{ + return seq_open_private(file, &page_owner_stack_op, 0); +} + +static const struct file_operations page_owner_stack_operations = { + .open = page_owner_stack_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int page_owner_threshold_get(void *data, u64 *val) +{ + *val = READ_ONCE(page_owner_pages_threshold); + return 0; +} + +static int page_owner_threshold_set(void *data, u64 val) +{ + WRITE_ONCE(page_owner_pages_threshold, val); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(proc_page_owner_threshold, &page_owner_threshold_get, + &page_owner_threshold_set, "%llu"); + + static int __init pageowner_init(void) { + struct dentry *dir; + if (!static_branch_unlikely(&page_owner_inited)) { pr_info("page_owner is disabled\n"); return 0; @@ -728,6 +963,11 @@ static int __init pageowner_init(void) debugfs_create_file("page_owner", 0400, NULL, NULL, &proc_page_owner_operations); + dir = debugfs_create_dir("page_owner_stacks", NULL); + debugfs_create_file("show_stacks", 0400, dir, NULL, + &page_owner_stack_operations); + debugfs_create_file("count_threshold", 0600, dir, NULL, + &proc_page_owner_threshold); return 0; } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index af69c3c8f7c2..4169576bed72 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -7,6 +7,8 @@ #include <linux/kstrtox.h> #include <linux/mm.h> #include <linux/page_table_check.h> +#include <linux/swap.h> +#include <linux/swapops.h> #undef pr_fmt #define pr_fmt(fmt) "page_table_check: " fmt @@ -182,6 +184,22 @@ void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) } EXPORT_SYMBOL(__page_table_check_pud_clear); +/* Whether the swap entry cached writable information */ +static inline bool swap_cached_writable(swp_entry_t entry) +{ + return is_writable_device_exclusive_entry(entry) || + is_writable_device_private_entry(entry) || + is_writable_migration_entry(entry); +} + +static inline void page_table_check_pte_flags(pte_t pte) +{ + if (pte_present(pte) && pte_uffd_wp(pte)) + WARN_ON_ONCE(pte_write(pte)); + else if (is_swap_pte(pte) && pte_swp_uffd_wp(pte)) + WARN_ON_ONCE(swap_cached_writable(pte_to_swp_entry(pte))); +} + void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr) { @@ -190,6 +208,8 @@ void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, if (&init_mm == mm) return; + page_table_check_pte_flags(pte); + for (i = 0; i < nr; i++) __page_table_check_pte_clear(mm, 
ptep_get(ptep + i)); if (pte_user_accessible_page(pte)) @@ -197,11 +217,21 @@ void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, } EXPORT_SYMBOL(__page_table_check_ptes_set); +static inline void page_table_check_pmd_flags(pmd_t pmd) +{ + if (pmd_present(pmd) && pmd_uffd_wp(pmd)) + WARN_ON_ONCE(pmd_write(pmd)); + else if (is_swap_pmd(pmd) && pmd_swp_uffd_wp(pmd)) + WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd))); +} + void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { if (&init_mm == mm) return; + page_table_check_pmd_flags(pmd); + __page_table_check_pmd_clear(mm, *pmdp); if (pmd_user_accessible_page(pmd)) { page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT, diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 74d2de15fb5e..ae5cc42aa208 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -314,17 +314,21 @@ next_pte: return false; } +#ifdef CONFIG_MEMORY_FAILURE /** * page_mapped_in_vma - check whether a page is really mapped in a VMA * @page: the page to test * @vma: the VMA to test * - * Returns 1 if the page is mapped into the page tables of the VMA, 0 - * if the page is not mapped into the page tables of this VMA. Only - * valid for normal file or anonymous VMAs. + * Return: The address the page is mapped at if the page is in the range + * covered by the VMA and present in the page table. If the page is + * outside the VMA or not present, returns -EFAULT. + * Only valid for normal file or anonymous VMAs. */ -int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) +unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) { + struct folio *folio = page_folio(page); + pgoff_t pgoff = folio->index + folio_page_idx(folio, page); struct page_vma_mapped_walk pvmw = { .pfn = page_to_pfn(page), .nr_pages = 1, @@ -332,11 +336,13 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) .flags = PVMW_SYNC, }; - pvmw.address = vma_address(page, vma); + pvmw.address = vma_address(vma, pgoff, 1); if (pvmw.address == -EFAULT) - return 0; + goto out; if (!page_vma_mapped_walk(&pvmw)) - return 0; + return -EFAULT; page_vma_mapped_walk_done(&pvmw); - return 1; +out: + return pvmw.address; } +#endif diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index cdd0aa597a81..7e42f0ca3b7b 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -32,6 +32,19 @@ struct pcpu_block_md { int nr_bits; /* total bits responsible for */ }; +struct pcpuobj_ext { +#ifdef CONFIG_MEMCG_KMEM + struct obj_cgroup *cgroup; +#endif +#ifdef CONFIG_MEM_ALLOC_PROFILING + union codetag_ref tag; +#endif +}; + +#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MEM_ALLOC_PROFILING) +#define NEED_PCPUOBJ_EXT +#endif + struct pcpu_chunk { #ifdef CONFIG_PERCPU_STATS int nr_alloc; /* # of allocations */ @@ -64,8 +77,8 @@ struct pcpu_chunk { int end_offset; /* additional area required to have the region end page aligned */ -#ifdef CONFIG_MEMCG_KMEM - struct obj_cgroup **obj_cgroups; /* vector of object cgroups */ +#ifdef NEED_PCPUOBJ_EXT + struct pcpuobj_ext *obj_exts; /* vector of object cgroups */ #endif int nr_pages; /* # of pages served by this chunk */ @@ -74,6 +87,15 @@ struct pcpu_chunk { unsigned long populated[]; /* populated bitmap */ }; +static inline bool need_pcpuobj_ext(void) +{ + if (IS_ENABLED(CONFIG_MEM_ALLOC_PROFILING)) + return true; + if (!mem_cgroup_kmem_disabled()) + return true; + return false; +} + extern spinlock_t pcpu_lock; extern struct list_head 
*pcpu_chunk_lists; diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 2054c9213c43..cd69caf6aa8d 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -231,10 +231,10 @@ static int pcpu_map_pages(struct pcpu_chunk *chunk, return 0; err: for_each_possible_cpu(tcpu) { - if (tcpu == cpu) - break; __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), page_end - page_start); + if (tcpu == cpu) + break; } pcpu_post_unmap_tlb_flush(chunk, page_start, page_end); return err; diff --git a/mm/percpu.c b/mm/percpu.c index 4e11fc1e6def..474e3683b74d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1392,9 +1392,9 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); -#ifdef CONFIG_MEMCG_KMEM +#ifdef NEED_PCPUOBJ_EXT /* first chunk is free to use */ - chunk->obj_cgroups = NULL; + chunk->obj_exts = NULL; #endif pcpu_init_md_blocks(chunk); @@ -1463,12 +1463,12 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) if (!chunk->md_blocks) goto md_blocks_fail; -#ifdef CONFIG_MEMCG_KMEM - if (!mem_cgroup_kmem_disabled()) { - chunk->obj_cgroups = +#ifdef NEED_PCPUOBJ_EXT + if (need_pcpuobj_ext()) { + chunk->obj_exts = pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) * - sizeof(struct obj_cgroup *), gfp); - if (!chunk->obj_cgroups) + sizeof(struct pcpuobj_ext), gfp); + if (!chunk->obj_exts) goto objcg_fail; } #endif @@ -1480,7 +1480,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) return chunk; -#ifdef CONFIG_MEMCG_KMEM +#ifdef NEED_PCPUOBJ_EXT objcg_fail: pcpu_mem_free(chunk->md_blocks); #endif @@ -1498,8 +1498,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; -#ifdef CONFIG_MEMCG_KMEM - pcpu_mem_free(chunk->obj_cgroups); +#ifdef NEED_PCPUOBJ_EXT + pcpu_mem_free(chunk->obj_exts); #endif pcpu_mem_free(chunk->md_blocks); pcpu_mem_free(chunk->bound_map); @@ -1646,9 +1646,9 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, if (!objcg) return; - if (likely(chunk && chunk->obj_cgroups)) { + if (likely(chunk && chunk->obj_exts)) { obj_cgroup_get(objcg); - chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg; + chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = objcg; rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, @@ -1663,13 +1663,13 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { struct obj_cgroup *objcg; - if (unlikely(!chunk->obj_cgroups)) + if (unlikely(!chunk->obj_exts)) return; - objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT]; + objcg = chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup; if (!objcg) return; - chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL; + chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = NULL; obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); @@ -1699,6 +1699,32 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) } #endif /* CONFIG_MEMCG_KMEM */ +#ifdef CONFIG_MEM_ALLOC_PROFILING +static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, + size_t size) +{ + if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) { + alloc_tag_add(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, + current->alloc_tag, size); + } +} + +static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size) +{ + if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) + alloc_tag_sub(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, size); +} +#else +static void 
pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, + size_t size) +{ +} + +static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size) +{ +} +#endif + /** * pcpu_alloc - the percpu allocator * @size: size of area to allocate in bytes @@ -1714,7 +1740,7 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, +void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, gfp_t gfp) { gfp_t pcpu_gfp; @@ -1881,6 +1907,8 @@ area_found: pcpu_memcg_post_alloc_hook(objcg, chunk, off, size); + pcpu_alloc_tag_alloc_hook(chunk, off, size); + return ptr; fail_unlock: @@ -1909,61 +1937,7 @@ fail: return NULL; } - -/** - * __alloc_percpu_gfp - allocate dynamic percpu area - * @size: size of area to allocate in bytes - * @align: alignment of area (max PAGE_SIZE) - * @gfp: allocation flags - * - * Allocate zero-filled percpu area of @size bytes aligned at @align. If - * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can - * be called from any context but is a lot more likely to fail. If @gfp - * has __GFP_NOWARN then no warning will be triggered on invalid or failed - * allocation requests. - * - * RETURNS: - * Percpu pointer to the allocated area on success, NULL on failure. - */ -void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) -{ - return pcpu_alloc(size, align, false, gfp); -} -EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); - -/** - * __alloc_percpu - allocate dynamic percpu area - * @size: size of area to allocate in bytes - * @align: alignment of area (max PAGE_SIZE) - * - * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). - */ -void __percpu *__alloc_percpu(size_t size, size_t align) -{ - return pcpu_alloc(size, align, false, GFP_KERNEL); -} -EXPORT_SYMBOL_GPL(__alloc_percpu); - -/** - * __alloc_reserved_percpu - allocate reserved percpu area - * @size: size of area to allocate in bytes - * @align: alignment of area (max PAGE_SIZE) - * - * Allocate zero-filled percpu area of @size bytes aligned at @align - * from reserved percpu area if arch has set it up; otherwise, - * allocation is served from the same dynamic area. Might sleep. - * Might trigger writeouts. - * - * CONTEXT: - * Does GFP_KERNEL allocation. - * - * RETURNS: - * Percpu pointer to the allocated area on success, NULL on failure. 
- */ -void __percpu *__alloc_reserved_percpu(size_t size, size_t align) -{ - return pcpu_alloc(size, align, true, GFP_KERNEL); -} +EXPORT_SYMBOL_GPL(pcpu_alloc_noprof); /** * pcpu_balance_free - manage the amount of free chunks @@ -2302,6 +2276,8 @@ void free_percpu(void __percpu *ptr) spin_lock_irqsave(&pcpu_lock, flags); size = pcpu_free_area(chunk, off); + pcpu_alloc_tag_free_hook(chunk, off, size); + pcpu_memcg_free_hook(chunk, off, size); /* diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 4fcd959dcc4d..a78a4adf711a 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { + VM_WARN_ON_ONCE(!pmd_present(*pmdp)); pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return old; @@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { + VM_WARN_ON_ONCE(!pmd_present(*pmdp)); return pmdp_invalidate(vma, address, pmdp); } #endif diff --git a/mm/ptdump.c b/mm/ptdump.c index 03c1bdae4a43..106e1d66e9f9 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/pagewalk.h> +#include <linux/debugfs.h> #include <linux/ptdump.h> #include <linux/kasan.h> @@ -163,3 +164,24 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd) /* Flush out the last page */ st->note_page(st, 0, -1, 0); } + +static int check_wx_show(struct seq_file *m, void *v) +{ + if (ptdump_check_wx()) + seq_puts(m, "SUCCESS\n"); + else + seq_puts(m, "FAILED\n"); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(check_wx); + +static int ptdump_debugfs_init(void) +{ + debugfs_create_file("check_wx_pages", 0400, NULL, NULL, &check_wx_fops); + + return 0; +} + +device_initcall(ptdump_debugfs_init); diff --git a/mm/readahead.c b/mm/readahead.c index 2648ec4f0494..c1b23989d9ca 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -228,6 +228,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, */ for (i = 0; i < nr_to_read; i++) { struct folio *folio = xa_load(&mapping->i_pages, index + i); + int ret; if (folio && !xa_is_value(folio)) { /* @@ -247,9 +248,12 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, folio = filemap_alloc_folio(gfp_mask, 0); if (!folio) break; - if (filemap_add_folio(mapping, folio, index + i, - gfp_mask) < 0) { + + ret = filemap_add_folio(mapping, folio, index + i, gfp_mask); + if (ret < 0) { folio_put(folio); + if (ret == -ENOMEM) + break; read_pages(ractl); ractl->_index++; i = ractl->_index + ractl->_nr_pages - index - 1; @@ -490,6 +494,7 @@ void page_cache_ra_order(struct readahead_control *ractl, pgoff_t index = readahead_index(ractl); pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; pgoff_t mark = index + ra->size - ra->async_size; + unsigned int nofs; int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); @@ -500,12 +505,12 @@ void page_cache_ra_order(struct readahead_control *ractl, if (new_order < MAX_PAGECACHE_ORDER) { new_order += 2; - if (new_order > MAX_PAGECACHE_ORDER) - new_order = MAX_PAGECACHE_ORDER; - while ((1 << new_order) > ra->size) - new_order--; + new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); + new_order = min_t(unsigned int, new_order, ilog2(ra->size)); } + /* See 
comment in page_cache_ra_unbounded() */ + nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); while (index <= limit) { unsigned int order = new_order; @@ -516,9 +521,6 @@ void page_cache_ra_order(struct readahead_control *ractl, /* Don't allocate pages past EOF */ while (index + (1UL << order) - 1 > limit) order--; - /* THP machinery does not support order-1 */ - if (order == 1) - order = 0; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err) break; @@ -532,6 +534,7 @@ void page_cache_ra_order(struct readahead_control *ractl, read_pages(ractl); filemap_invalidate_unlock_shared(mapping); + memalloc_nofs_restore(nofs); /* * If there were already pages in the page cache, then we may have diff --git a/mm/rmap.c b/mm/rmap.c index f5d43edad529..e8fc5ecb59b2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -23,7 +23,7 @@ * inode->i_rwsem (while writing or truncating, not reading or faulting) * mm->mmap_lock * mapping->invalidate_lock (in filemap_fault) - * page->flags PG_locked (lock_page) + * folio_lock * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) * vma_start_write * mapping->i_mmap_rwsem @@ -50,7 +50,7 @@ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) * vma_lock (hugetlb specific lock for pmd_sharing) * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing) - * page->flags PG_locked (lock_page) + * folio_lock */ #include <linux/mm.h> @@ -182,8 +182,6 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * for the new allocation. At the same time, we do not want * to do any locking for the common case of already having * an anon_vma. - * - * This must be called with the mmap_lock held for reading. */ int __anon_vma_prepare(struct vm_area_struct *vma) { @@ -191,6 +189,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma) struct anon_vma *anon_vma, *allocated; struct anon_vma_chain *avc; + mmap_assert_locked(mm); might_sleep(); avc = anon_vma_chain_alloc(GFP_KERNEL); @@ -775,6 +774,8 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { struct folio *folio = page_folio(page); + pgoff_t pgoff; + if (folio_test_anon(folio)) { struct anon_vma *page__anon_vma = folio_anon_vma(folio); /* @@ -790,7 +791,9 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return -EFAULT; } - return vma_address(page, vma); + /* The !page__anon_vma above handles KSM folios */ + pgoff = folio->index + folio_page_idx(folio, page); + return vma_address(vma, pgoff, 1); } /* @@ -961,7 +964,7 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) int folio_referenced(struct folio *folio, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags) { - int we_locked = 0; + bool we_locked = false; struct folio_referenced_arg pra = { .mapcount = folio_mapcount(folio), .memcg = memcg, @@ -1128,56 +1131,38 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, if (invalid_mkclean_vma(vma, NULL)) return 0; - pvmw.address = vma_pgoff_address(pgoff, nr_pages, vma); + pvmw.address = vma_address(vma, pgoff, nr_pages); VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma); return page_vma_mkclean_one(&pvmw); } -int folio_total_mapcount(struct folio *folio) -{ - int mapcount = folio_entire_mapcount(folio); - int nr_pages; - int i; - - /* In the common case, avoid the loop when no pages mapped by PTE */ - if (folio_nr_pages_mapped(folio) == 0) - return mapcount; - /* - * Add all 
the PTE mappings of those pages mapped by PTE. - * Limit the loop to folio_nr_pages_mapped()? - * Perhaps: given all the raciness, that may be a good or a bad idea. - */ - nr_pages = folio_nr_pages(folio); - for (i = 0; i < nr_pages; i++) - mapcount += atomic_read(&folio_page(folio, i)->_mapcount); - - /* But each of those _mapcounts was based on -1 */ - mapcount += nr_pages; - return mapcount; -} - static __always_inline unsigned int __folio_add_rmap(struct folio *folio, struct page *page, int nr_pages, enum rmap_level level, int *nr_pmdmapped) { atomic_t *mapped = &folio->_nr_pages_mapped; + const int orig_nr_pages = nr_pages; int first, nr = 0; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { case RMAP_LEVEL_PTE: + if (!folio_test_large(folio)) { + nr = atomic_inc_and_test(&page->_mapcount); + break; + } + do { first = atomic_inc_and_test(&page->_mapcount); - if (first && folio_test_large(folio)) { + if (first) { first = atomic_inc_return_relaxed(mapped); - first = (first < ENTIRELY_MAPPED); + if (first < ENTIRELY_MAPPED) + nr++; } - - if (first) - nr++; } while (page++, --nr_pages > 0); + atomic_add(orig_nr_pages, &folio->_large_mapcount); break; case RMAP_LEVEL_PMD: first = atomic_inc_and_test(&folio->_entire_mapcount); @@ -1194,6 +1179,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, nr = 0; } } + atomic_inc(&folio->_large_mapcount); break; } return nr; @@ -1429,10 +1415,14 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, SetPageAnonExclusive(page); } + /* increment count (starts at -1) */ + atomic_set(&folio->_large_mapcount, nr - 1); atomic_set(&folio->_nr_pages_mapped, nr); } else { /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); + /* increment count (starts at -1) */ + atomic_set(&folio->_large_mapcount, 0); atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED); SetPageAnonExclusive(&folio->page); __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr); @@ -1445,13 +1435,14 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, enum rmap_level level) { + pg_data_t *pgdat = folio_pgdat(folio); int nr, nr_pmdmapped = 0; VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped); if (nr_pmdmapped) - __lruvec_stat_mod_folio(folio, folio_test_swapbacked(folio) ? + __mod_node_page_state(pgdat, folio_test_swapbacked(folio) ? 
NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped); if (nr) __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr); @@ -1503,25 +1494,34 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, enum rmap_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; + pg_data_t *pgdat = folio_pgdat(folio); int last, nr = 0, nr_pmdmapped = 0; + bool partially_mapped = false; enum node_stat_item idx; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { case RMAP_LEVEL_PTE: + if (!folio_test_large(folio)) { + nr = atomic_add_negative(-1, &page->_mapcount); + break; + } + + atomic_sub(nr_pages, &folio->_large_mapcount); do { last = atomic_add_negative(-1, &page->_mapcount); - if (last && folio_test_large(folio)) { + if (last) { last = atomic_dec_return_relaxed(mapped); - last = (last < ENTIRELY_MAPPED); + if (last < ENTIRELY_MAPPED) + nr++; } - - if (last) - nr++; } while (page++, --nr_pages > 0); + + partially_mapped = nr && atomic_read(mapped); break; case RMAP_LEVEL_PMD: + atomic_dec(&folio->_large_mapcount); last = atomic_add_negative(-1, &folio->_entire_mapcount); if (last) { nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped); @@ -1536,17 +1536,20 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, nr = 0; } } + + partially_mapped = nr < nr_pmdmapped; break; } if (nr_pmdmapped) { + /* NR_{FILE/SHMEM}_PMDMAPPED are not maintained per-memcg */ if (folio_test_anon(folio)) - idx = NR_ANON_THPS; - else if (folio_test_swapbacked(folio)) - idx = NR_SHMEM_PMDMAPPED; + __lruvec_stat_mod_folio(folio, NR_ANON_THPS, -nr_pmdmapped); else - idx = NR_FILE_PMDMAPPED; - __lruvec_stat_mod_folio(folio, idx, -nr_pmdmapped); + __mod_node_page_state(pgdat, + folio_test_swapbacked(folio) ? + NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, + -nr_pmdmapped); } if (nr) { idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; @@ -1556,10 +1559,12 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, * Queue anon large folio for deferred split if at least one * page of the folio is unmapped and at least one page * is still mapped. + * + * Check partially_mapped first to ensure it is a large folio. */ - if (folio_test_large(folio) && folio_test_anon(folio)) - if (level == RMAP_LEVEL_PTE || nr < nr_pmdmapped) - deferred_split_folio(folio); + if (folio_test_anon(folio) && partially_mapped && + list_empty(&folio->_deferred_list)) + deferred_split_folio(folio); } /* @@ -1780,7 +1785,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); } else { - dec_mm_counter(mm, mm_counter(&folio->page)); + dec_mm_counter(mm, mm_counter(folio)); set_pte_at(mm, address, pvmw.pte, pteval); } @@ -1795,7 +1800,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * migration) will not expect userfaults on already * copied pages. 
*/ - dec_mm_counter(mm, mm_counter(&folio->page)); + dec_mm_counter(mm, mm_counter(folio)); } else if (folio_test_anon(folio)) { swp_entry_t entry = page_swap_entry(subpage); pte_t swp_pte; @@ -1903,7 +1908,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * * See Documentation/mm/mmu_notifier.rst */ - dec_mm_counter(mm, mm_counter_file(&folio->page)); + dec_mm_counter(mm, mm_counter_file(folio)); } discard: if (unlikely(folio_test_hugetlb(folio))) @@ -2169,7 +2174,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); trace_set_migration_pte(pvmw.address, pte_val(swp_pte), - compound_order(&folio->page)); + folio_order(folio)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. @@ -2181,7 +2186,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); } else { - dec_mm_counter(mm, mm_counter(&folio->page)); + dec_mm_counter(mm, mm_counter(folio)); set_pte_at(mm, address, pvmw.pte, pteval); } @@ -2196,7 +2201,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * migration) will not expect userfaults on already * copied pages. */ - dec_mm_counter(mm, mm_counter(&folio->page)); + dec_mm_counter(mm, mm_counter(folio)); } else { swp_entry_t entry; pte_t swp_pte; @@ -2261,7 +2266,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, else set_pte_at(mm, address, pvmw.pte, swp_pte); trace_set_migration_pte(address, pte_val(swp_pte), - compound_order(&folio->page)); + folio_order(folio)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. 
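
The rmap.c hunks above move large-folio bookkeeping to a folio-wide _large_mapcount, keep _nr_pages_mapped capped by the ENTIRELY_MAPPED sentinel, and only queue an anonymous folio for deferred split once it becomes partially mapped (some pages unmapped, others still mapped). Below is a minimal userspace sketch of that accounting scheme only; it is not the kernel implementation. struct folio_sketch, add_rmap_ptes(), remove_rmap_ptes(), NR_PAGES and the on_deferred_split_list flag are illustrative assumptions, while the -1-biased mapcounts, ENTIRELY_MAPPED and the deferred-split condition mirror the diff.

/* Illustrative model of the large-folio mapcount accounting; not kernel code. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_PAGES        16           /* pretend the folio has 16 pages */
#define ENTIRELY_MAPPED 0x400000     /* sentinel added when the whole folio is mapped */

struct folio_sketch {
	atomic_int page_mapcount[NR_PAGES]; /* each starts at -1, like page->_mapcount */
	atomic_int large_mapcount;          /* starts at -1, like folio->_large_mapcount */
	atomic_int nr_pages_mapped;         /* like folio->_nr_pages_mapped */
	bool       anon;
	bool       on_deferred_split_list;  /* stands in for folio->_deferred_list */
};

/* Map @nr pages starting at @first by PTE; returns pages newly mapped. */
static int add_rmap_ptes(struct folio_sketch *f, int first, int nr)
{
	int newly_mapped = 0;

	for (int i = first; i < first + nr; i++) {
		/* -1 -> 0 transition: this page gained its first mapping */
		if (atomic_fetch_add(&f->page_mapcount[i], 1) == -1) {
			if (atomic_fetch_add(&f->nr_pages_mapped, 1) + 1 < ENTIRELY_MAPPED)
				newly_mapped++;
		}
	}
	/* one folio-wide counter adjustment instead of per-page math */
	atomic_fetch_add(&f->large_mapcount, nr);
	return newly_mapped;
}

/* Unmap @nr pages starting at @first; queue for split if partially mapped. */
static void remove_rmap_ptes(struct folio_sketch *f, int first, int nr)
{
	int newly_unmapped = 0;

	atomic_fetch_sub(&f->large_mapcount, nr);
	for (int i = first; i < first + nr; i++) {
		/* 0 -> -1 transition: this page lost its last mapping */
		if (atomic_fetch_sub(&f->page_mapcount[i], 1) == 0) {
			if (atomic_fetch_sub(&f->nr_pages_mapped, 1) - 1 < ENTIRELY_MAPPED)
				newly_unmapped++;
		}
	}

	/* partially mapped: some pages just dropped out, others remain mapped */
	if (newly_unmapped && atomic_load(&f->nr_pages_mapped) &&
	    f->anon && !f->on_deferred_split_list)
		f->on_deferred_split_list = true;   /* deferred_split_folio() stand-in */
}

int main(void)
{
	struct folio_sketch f = { .anon = true };

	for (int i = 0; i < NR_PAGES; i++)
		atomic_init(&f.page_mapcount[i], -1);
	atomic_init(&f.large_mapcount, -1);
	atomic_init(&f.nr_pages_mapped, 0);

	add_rmap_ptes(&f, 0, NR_PAGES);        /* fully map the folio */
	remove_rmap_ptes(&f, 0, NR_PAGES / 2); /* unmap half of it */
	printf("queued for deferred split: %d\n", f.on_deferred_split_list);
	return 0;
}

Running the sketch maps the whole folio and then unmaps half of it, which trips the partially-mapped condition exactly as the new __folio_remove_rmap() check (partially_mapped && list_empty(&folio->_deferred_list)) intends.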
@@ -2588,7 +2593,8 @@ static void rmap_walk_anon(struct folio *folio, anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff_start, pgoff_end) { struct vm_area_struct *vma = avc->vma; - unsigned long address = vma_address(&folio->page, vma); + unsigned long address = vma_address(vma, pgoff_start, + folio_nr_pages(folio)); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); @@ -2649,7 +2655,8 @@ static void rmap_walk_file(struct folio *folio, lookup: vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff_start, pgoff_end) { - unsigned long address = vma_address(&folio->page, vma); + unsigned long address = vma_address(vma, pgoff_start, + folio_nr_pages(folio)); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); @@ -2702,6 +2709,7 @@ void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma, VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); atomic_inc(&folio->_entire_mapcount); + atomic_inc(&folio->_large_mapcount); if (flags & RMAP_EXCLUSIVE) SetPageAnonExclusive(&folio->page); VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 && @@ -2716,6 +2724,7 @@ void hugetlb_add_new_anon_rmap(struct folio *folio, BUG_ON(address < vma->vm_start || address >= vma->vm_end); /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); + atomic_set(&folio->_large_mapcount, 0); folio_clear_hugetlb_restore_reserve(folio); __folio_set_anon(folio, vma, address, true); SetPageAnonExclusive(&folio->page); diff --git a/mm/shmem.c b/mm/shmem.c index d7c84ff62186..f5d60436b604 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -254,7 +254,7 @@ static void shmem_inode_unacct_blocks(struct inode *inode, long pages) } static const struct super_operations shmem_ops; -const struct address_space_operations shmem_aops; +static const struct address_space_operations shmem_aops; static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; @@ -263,6 +263,12 @@ static const struct vm_operations_struct shmem_vm_ops; static const struct vm_operations_struct shmem_anon_vm_ops; static struct file_system_type shmem_fs_type; +bool shmem_mapping(struct address_space *mapping) +{ + return mapping->a_ops == &shmem_aops; +} +EXPORT_SYMBOL_GPL(shmem_mapping); + bool vma_is_anon_shmem(struct vm_area_struct *vma) { return vma->vm_ops == &shmem_anon_vm_ops; @@ -311,7 +317,7 @@ static void shmem_disable_quotas(struct super_block *sb) dquot_quota_off(sb, type); } -static struct dquot **shmem_get_dquots(struct inode *inode) +static struct dquot __rcu **shmem_get_dquots(struct inode *inode) { return SHMEM_I(inode)->i_dquot; } @@ -742,12 +748,6 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY -bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags) -{ - return false; -} - static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_split) { @@ -1907,7 +1907,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * Some architectures may have to restore extra metadata to the * folio after reading from swap. 
*/ - arch_swap_restore(swap, folio); + arch_swap_restore(folio_swap(swap, folio), folio); if (shmem_should_replace_folio(folio, gfp)) { error = shmem_replace_folio(&folio, gfp, info, index); @@ -1966,6 +1966,9 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, int error; bool alloced; + if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping))) + return -EINVAL; + if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; repeat: @@ -2128,12 +2131,36 @@ unlock: return error; } +/** + * shmem_get_folio - find, and lock a shmem folio. + * @inode: inode to search + * @index: the page index. + * @foliop: pointer to the folio if found + * @sgp: SGP_* flags to control behavior + * + * Looks up the page cache entry at @inode & @index. If a folio is + * present, it is returned locked with an increased refcount. + * + * If the caller modifies data in the folio, it must call folio_mark_dirty() + * before unlocking the folio to ensure that the folio is not reclaimed. + * There is no need to reserve space before calling folio_mark_dirty(). + * + * When no folio is found, the behavior depends on @sgp: + * - for SGP_READ, *@foliop is %NULL and 0 is returned + * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned + * - for all other flags a new folio is allocated, inserted into the + * page cache and returned locked in @foliop. + * + * Context: May sleep. + * Return: 0 if successful, else a negative error code. + */ int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp) { return shmem_get_folio_gfp(inode, index, foliop, sgp, mapping_gfp_mask(inode->i_mapping), NULL, NULL); } +EXPORT_SYMBOL_GPL(shmem_get_folio); /* * This is like autoremove_wake_function, but it removes the wait queue @@ -2240,8 +2267,6 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long uaddr, unsigned long len, unsigned long pgoff, unsigned long flags) { - unsigned long (*get_area)(struct file *, - unsigned long, unsigned long, unsigned long, unsigned long); unsigned long addr; unsigned long offset; unsigned long inflated_len; @@ -2251,8 +2276,8 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (len > TASK_SIZE) return -ENOMEM; - get_area = current->mm->get_unmapped_area; - addr = get_area(file, uaddr, len, pgoff, flags); + addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff, + flags); if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; @@ -2309,7 +2334,8 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (inflated_len < len) return addr; - inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags); + inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr, + inflated_len, 0, flags); if (IS_ERR_VALUE(inflated_addr)) return addr; if (inflated_addr & ~PAGE_MASK) @@ -3374,7 +3400,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) static int shmem_rmdir(struct inode *dir, struct dentry *dentry) { - if (!simple_empty(dentry)) + if (!simple_offset_empty(dentry)) return -ENOTEMPTY; drop_nlink(d_inode(dentry)); @@ -3431,7 +3457,7 @@ static int shmem_rename2(struct mnt_idmap *idmap, return simple_offset_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); - if (!simple_empty(new_dentry)) + if (!simple_offset_empty(new_dentry)) return -ENOTEMPTY; if (flags & RENAME_WHITEOUT) { @@ -3440,8 +3466,7 @@ static int shmem_rename2(struct mnt_idmap *idmap, return error; } - simple_offset_remove(shmem_get_offset_ctx(old_dir), old_dentry); - error = 
simple_offset_add(shmem_get_offset_ctx(new_dir), old_dentry); + error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry); if (error) return error; @@ -3500,10 +3525,10 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, inode->i_op = &shmem_short_symlink_operations; } else { inode_nohighmem(inode); + inode->i_mapping->a_ops = &shmem_aops; error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); if (error) goto out_remove_offset; - inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; memcpy(folio_address(folio), symname, len); folio_mark_uptodate(folio); @@ -4265,6 +4290,24 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) mpol_put(mpol); if (sbinfo->noswap) seq_printf(seq, ",noswap"); +#ifdef CONFIG_TMPFS_QUOTA + if (sb_has_quota_active(root->d_sb, USRQUOTA)) + seq_printf(seq, ",usrquota"); + if (sb_has_quota_active(root->d_sb, GRPQUOTA)) + seq_printf(seq, ",grpquota"); + if (sbinfo->qlimits.usrquota_bhardlimit) + seq_printf(seq, ",usrquota_block_hardlimit=%lld", + sbinfo->qlimits.usrquota_bhardlimit); + if (sbinfo->qlimits.grpquota_bhardlimit) + seq_printf(seq, ",grpquota_block_hardlimit=%lld", + sbinfo->qlimits.grpquota_bhardlimit); + if (sbinfo->qlimits.usrquota_ihardlimit) + seq_printf(seq, ",usrquota_inode_hardlimit=%lld", + sbinfo->qlimits.usrquota_ihardlimit); + if (sbinfo->qlimits.grpquota_ihardlimit) + seq_printf(seq, ",grpquota_inode_hardlimit=%lld", + sbinfo->qlimits.grpquota_ihardlimit); +#endif return 0; } @@ -4355,7 +4398,9 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) #ifdef CONFIG_TMPFS_POSIX_ACL sb->s_flags |= SB_POSIXACL; #endif - uuid_gen(&sb->s_uuid); + uuid_t uuid; + uuid_gen(&uuid); + super_set_uuid(sb, uuid.b, sizeof(uuid)); #ifdef CONFIG_TMPFS_QUOTA if (ctx->seen & SHMEM_SEEN_QUOTA) { @@ -4466,7 +4511,7 @@ static int shmem_error_remove_folio(struct address_space *mapping, return 0; } -const struct address_space_operations shmem_aops = { +static const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .dirty_folio = noop_dirty_folio, #ifdef CONFIG_TMPFS @@ -4478,7 +4523,6 @@ const struct address_space_operations shmem_aops = { #endif .error_remove_folio = shmem_error_remove_folio, }; -EXPORT_SYMBOL(shmem_aops); static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, @@ -4755,7 +4799,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); } #endif @@ -4833,6 +4877,7 @@ struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned lon { return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); } +EXPORT_SYMBOL_GPL(shmem_kernel_file_setup); /** * shmem_file_setup - get an unlinked file living in tmpfs @@ -4910,7 +4955,6 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping, struct folio *folio; int error; - BUG_ON(!shmem_mapping(mapping)); error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE, gfp, NULL, NULL); if (error) diff --git a/mm/shmem_quota.c b/mm/shmem_quota.c index 062d1c1097ae..ce514e700d2f 100644 --- a/mm/shmem_quota.c +++ b/mm/shmem_quota.c @@ -116,7 +116,7 @@ static int shmem_free_file_info(struct super_block *sb, int type) static int shmem_get_next_id(struct super_block *sb, struct kqid *qid) { struct 
mem_dqinfo *info = sb_dqinfo(sb, qid->type); - struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node; + struct rb_node *node; qid_t id = from_kqid(&init_user_ns, *qid); struct quota_info *dqopt = sb_dqopt(sb); struct quota_id *entry = NULL; @@ -126,6 +126,7 @@ static int shmem_get_next_id(struct super_block *sb, struct kqid *qid) return -ESRCH; down_read(&dqopt->dqio_sem); + node = ((struct rb_root *)info->dqi_priv)->rb_node; while (node) { entry = rb_entry(node, struct quota_id, node); @@ -165,7 +166,7 @@ out_unlock: static int shmem_acquire_dquot(struct dquot *dquot) { struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type); - struct rb_node **n = &((struct rb_root *)info->dqi_priv)->rb_node; + struct rb_node **n; struct shmem_sb_info *sbinfo = dquot->dq_sb->s_fs_info; struct rb_node *parent = NULL, *new_node = NULL; struct quota_id *new_entry, *entry; @@ -176,6 +177,8 @@ static int shmem_acquire_dquot(struct dquot *dquot) mutex_lock(&dquot->dq_lock); down_write(&dqopt->dqio_sem); + n = &((struct rb_root *)info->dqi_priv)->rb_node; + while (*n) { parent = *n; entry = rb_entry(parent, struct quota_id, node); @@ -264,7 +267,7 @@ static bool shmem_is_empty_dquot(struct dquot *dquot) static int shmem_release_dquot(struct dquot *dquot) { struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type); - struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node; + struct rb_node *node; qid_t id = from_kqid(&init_user_ns, dquot->dq_id); struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); struct quota_id *entry = NULL; @@ -275,6 +278,7 @@ static int shmem_release_dquot(struct dquot *dquot) goto out_dqlock; down_write(&dqopt->dqio_sem); + node = ((struct rb_root *)info->dqi_priv)->rb_node; while (node) { entry = rb_entry(node, struct quota_id, node); diff --git a/mm/show_mem.c b/mm/show_mem.c index 8dcfafbd283c..bdb439551eef 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -423,4 +423,30 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) #ifdef CONFIG_MEMORY_FAILURE printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); #endif +#ifdef CONFIG_MEM_ALLOC_PROFILING + { + struct codetag_bytes tags[10]; + size_t i, nr; + + nr = alloc_tag_top_users(tags, ARRAY_SIZE(tags), false); + if (nr) { + pr_notice("Memory allocations:\n"); + for (i = 0; i < nr; i++) { + struct codetag *ct = tags[i].ct; + struct alloc_tag *tag = ct_to_alloc_tag(ct); + struct alloc_tag_counters counter = alloc_tag_read(tag); + + /* Same as alloc_tag_to_text() but w/o intermediate buffer */ + if (ct->modname) + pr_notice("%12lli %8llu %s:%u [%s] func:%s\n", + counter.bytes, counter.calls, ct->filename, + ct->lineno, ct->modname, ct->function); + else + pr_notice("%12lli %8llu %s:%u func:%s\n", + counter.bytes, counter.calls, ct->filename, + ct->lineno, ct->function); + } + } + } +#endif } diff --git a/mm/slab.h b/mm/slab.h index 54deeb0428c6..5f8f47c5bee0 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -84,11 +84,11 @@ struct slab { }; struct rcu_head rcu_head; }; - unsigned int __unused; + unsigned int __page_type; atomic_t __page_refcount; -#ifdef CONFIG_MEMCG - unsigned long memcg_data; +#ifdef CONFIG_SLAB_OBJ_EXT + unsigned long obj_exts; #endif }; @@ -97,8 +97,8 @@ struct slab { SLAB_MATCH(flags, __page_flags); SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */ SLAB_MATCH(_refcount, __page_refcount); -#ifdef CONFIG_MEMCG -SLAB_MATCH(memcg_data, memcg_data); +#ifdef CONFIG_SLAB_OBJ_EXT +SLAB_MATCH(memcg_data, obj_exts); #endif #undef 
SLAB_MATCH static_assert(sizeof(struct slab) <= sizeof(struct page)); @@ -363,7 +363,6 @@ static inline int objs_per_slab(const struct kmem_cache *cache, enum slab_state { DOWN, /* No slab functionality yet */ PARTIAL, /* SLUB: kmem_cache_node available */ - PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ UP, /* Slab caches usable but not all extras yet */ FULL /* Everything is working */ }; @@ -387,7 +386,7 @@ extern const struct kmalloc_info_struct { /* Kmalloc array related functions */ void setup_kmalloc_cache_index_table(void); -void create_kmalloc_caches(slab_flags_t); +void create_kmalloc_caches(void); extern u8 kmalloc_size_index[24]; @@ -422,8 +421,6 @@ gfp_t kmalloc_fix_flags(gfp_t flags); int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); void __init kmem_cache_init(void); -void __init new_kmalloc_cache(int idx, enum kmalloc_cache_type type, - slab_flags_t flags); extern void create_boot_cache(struct kmem_cache *, const char *name, unsigned int size, slab_flags_t flags, unsigned int useroffset, unsigned int usersize); @@ -435,8 +432,7 @@ struct kmem_cache * __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); -slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name); +slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name); static inline bool is_kmalloc_cache(struct kmem_cache *s) { @@ -469,7 +465,6 @@ static inline bool is_kmalloc_cache(struct kmem_cache *s) SLAB_STORE_USER | \ SLAB_TRACE | \ SLAB_CONSISTENCY_CHECKS | \ - SLAB_MEM_SPREAD | \ SLAB_NOLEAKTRACE | \ SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | \ @@ -501,9 +496,6 @@ struct slabinfo { }; void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); -void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); -ssize_t slabinfo_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos); #ifdef CONFIG_SLUB_DEBUG #ifdef CONFIG_SLUB_DEBUG_ON @@ -528,7 +520,7 @@ static inline bool __slub_debug_enabled(void) #endif /* - * Returns true if any of the specified slub_debug flags is enabled for the + * Returns true if any of the specified slab_debug flags is enabled for the * cache. Use only for flags parsed by setup_slub_debug() as it also enables * the static key. */ @@ -541,42 +533,52 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla return false; } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_SLAB_OBJ_EXT + /* - * slab_objcgs - get the object cgroups vector associated with a slab + * slab_obj_exts - get the pointer to the slab object extension vector + * associated with a slab. * @slab: a pointer to the slab struct * - * Returns a pointer to the object cgroups vector associated with the slab, + * Returns a pointer to the object extension vector associated with the slab, * or NULL if no such vector has been associated yet. 
*/ -static inline struct obj_cgroup **slab_objcgs(struct slab *slab) +static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) { - unsigned long memcg_data = READ_ONCE(slab->memcg_data); + unsigned long obj_exts = READ_ONCE(slab->obj_exts); - VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), +#ifdef CONFIG_MEMCG + VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS), slab_page(slab)); - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab)); - - return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); + VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab)); +#endif + return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK); } -int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, - gfp_t gfp, bool new_slab); -void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, - enum node_stat_item idx, int nr); -#else /* CONFIG_MEMCG_KMEM */ -static inline struct obj_cgroup **slab_objcgs(struct slab *slab) +int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab); + +#else /* CONFIG_SLAB_OBJ_EXT */ + +static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) { return NULL; } -static inline int memcg_alloc_slab_cgroups(struct slab *slab, - struct kmem_cache *s, gfp_t gfp, - bool new_slab) +#endif /* CONFIG_SLAB_OBJ_EXT */ + +static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) { - return 0; + return (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; } -#endif /* CONFIG_MEMCG_KMEM */ + +#ifdef CONFIG_MEMCG_KMEM +bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, + gfp_t flags, size_t size, void **p); +void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, + void **p, int objects, struct slabobj_ext *obj_exts); +#endif size_t __ksize(const void *objp); diff --git a/mm/slab_common.c b/mm/slab_common.c index 238293b1dbe1..1560a1546bb1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -50,7 +50,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, */ #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \ - SLAB_FAILSLAB | SLAB_NO_MERGE | kasan_never_merge()) + SLAB_FAILSLAB | SLAB_NO_MERGE) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_ACCOUNT) @@ -172,7 +172,7 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, size = ALIGN(size, sizeof(void *)); align = calculate_alignment(flags, align, size); size = ALIGN(size, align); - flags = kmem_cache_flags(size, flags, name); + flags = kmem_cache_flags(flags, name); if (flags & SLAB_NEVER_MERGE) return NULL; @@ -282,7 +282,7 @@ kmem_cache_create_usercopy(const char *name, #ifdef CONFIG_SLUB_DEBUG /* - * If no slub_debug was enabled globally, the static key is not yet + * If no slab_debug was enabled globally, the static key is not yet * enabled by setup_slub_debug(). Enable it if the cache is being * created with any of the debugging flags passed explicitly. 
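[Illustrative note] slab_obj_exts() above treats slab->obj_exts as a tagged word: metadata flags live in the low bits, which the vector's alignment leaves free, and are masked off before the value is used as a pointer. A minimal userspace sketch of that tagged-pointer pattern; the flag names (EXT_FLAG_A/B) and struct obj_ext are hypothetical.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

/* hypothetical flag bits kept in the low, alignment-guaranteed bits */
#define EXT_FLAG_A	0x1UL
#define EXT_FLAG_B	0x2UL
#define EXT_FLAGS_MASK	(EXT_FLAG_A | EXT_FLAG_B)

struct obj_ext {
	long counter;
};

/* pack a pointer and flags into one word */
static uintptr_t ext_pack(struct obj_ext *vec, uintptr_t flags)
{
	return (uintptr_t)vec | flags;
}

/* recover the pointer by masking the flag bits off, as slab_obj_exts() does */
static struct obj_ext *ext_vector(uintptr_t word)
{
	return (struct obj_ext *)(word & ~EXT_FLAGS_MASK);
}

int main(void)
{
	struct obj_ext *vec = calloc(4, sizeof(*vec));
	uintptr_t word = ext_pack(vec, EXT_FLAG_A);

	ext_vector(word)[2].counter = 42;
	printf("flags=%#lx counter=%ld\n",
	       (unsigned long)(word & EXT_FLAGS_MASK), vec[2].counter);
	free(vec);
	return 0;
}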
* It's also possible that this is the first cache created with @@ -404,8 +404,12 @@ EXPORT_SYMBOL(kmem_cache_create); */ static void kmem_cache_release(struct kmem_cache *s) { - sysfs_slab_unlink(s); - sysfs_slab_release(s); + if (slab_state >= FULL) { + sysfs_slab_unlink(s); + sysfs_slab_release(s); + } else { + slab_kmem_cache_release(s); + } } #else static void kmem_cache_release(struct kmem_cache *s) @@ -651,7 +655,7 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name, struct kmem_cache * kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init = -{ /* initialization for https://bugs.llvm.org/show_bug.cgi?id=42570 */ }; +{ /* initialization for https://llvm.org/pr42570 */ }; EXPORT_SYMBOL(kmalloc_caches); #ifdef CONFIG_RANDOM_KMALLOC_CACHES @@ -766,7 +770,7 @@ EXPORT_SYMBOL(kmalloc_size_roundup); } /* - * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. + * kmalloc_info[] is to make slab_debug=,kmalloc-xx option work at boot time. * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is * kmalloc-2M. */ @@ -853,9 +857,10 @@ static unsigned int __kmalloc_minalign(void) return max(minalign, arch_slab_minalign()); } -void __init -new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) +static void __init +new_kmalloc_cache(int idx, enum kmalloc_cache_type type) { + slab_flags_t flags = 0; unsigned int minalign = __kmalloc_minalign(); unsigned int aligned_size = kmalloc_info[idx].size; int aligned_idx = idx; @@ -902,7 +907,7 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) * may already have been created because they were needed to * enable allocations for slab creation. */ -void __init create_kmalloc_caches(slab_flags_t flags) +void __init create_kmalloc_caches(void) { int i; enum kmalloc_cache_type type; @@ -911,22 +916,15 @@ void __init create_kmalloc_caches(slab_flags_t flags) * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined */ for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) { - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { - if (!kmalloc_caches[type][i]) - new_kmalloc_cache(i, type, flags); - - /* - * Caches that are not of the two-to-the-power-of size. - * These have to be created immediately after the - * earlier power of two caches - */ - if (KMALLOC_MIN_SIZE <= 32 && i == 6 && - !kmalloc_caches[type][1]) - new_kmalloc_cache(1, type, flags); - if (KMALLOC_MIN_SIZE <= 64 && i == 7 && - !kmalloc_caches[type][2]) - new_kmalloc_cache(2, type, flags); - } + /* Caches that are NOT of the two-to-the-power-of size. */ + if (KMALLOC_MIN_SIZE <= 32) + new_kmalloc_cache(1, type); + if (KMALLOC_MIN_SIZE <= 64) + new_kmalloc_cache(2, type); + + /* Caches that are of the two-to-the-power-of size. 
*/ + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) + new_kmalloc_cache(i, type); } #ifdef CONFIG_RANDOM_KMALLOC_CACHES random_kmalloc_seed = get_random_u64(); @@ -1073,7 +1071,6 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m) sinfo.limit, sinfo.batchcount, sinfo.shared); seq_printf(m, " : slabdata %6lu %6lu %6lu", sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); - slabinfo_show_stats(m, s); seq_putc(m, '\n'); } @@ -1150,7 +1147,6 @@ static const struct proc_ops slabinfo_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = slabinfo_open, .proc_read = seq_read, - .proc_write = slabinfo_write, .proc_lseek = seq_lseek, .proc_release = seq_release, }; @@ -1184,7 +1180,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) return (void *)p; } - ret = kmalloc_track_caller(new_size, flags); + ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_); if (ret && p) { /* Disable KASAN checks as the object's redzone is accessed. */ kasan_disable_current(); @@ -1208,7 +1204,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) * * Return: pointer to the allocated memory or %NULL in case of error */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) +void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) { void *ret; @@ -1223,7 +1219,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) return ret; } -EXPORT_SYMBOL(krealloc); +EXPORT_SYMBOL(krealloc_noprof); /** * kfree_sensitive - Clear sensitive information in memory before freeing diff --git a/mm/slub.c b/mm/slub.c index 2ef88bbf56a3..0809760cf789 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -295,7 +295,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) /* * Debugging flags that require metadata to be stored in the slab. These get - * disabled when slub_debug=O is used and a cache's min order increases with + * disabled when slab_debug=O is used and a cache's min order increases with * metadata. */ #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) @@ -306,13 +306,13 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) /* Internal SLUB flags */ /* Poison object */ -#define __OBJECT_POISON ((slab_flags_t __force)0x80000000U) +#define __OBJECT_POISON __SLAB_FLAG_BIT(_SLAB_OBJECT_POISON) /* Use cmpxchg_double */ #ifdef system_has_freelist_aba -#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U) +#define __CMPXCHG_DOUBLE __SLAB_FLAG_BIT(_SLAB_CMPXCHG_DOUBLE) #else -#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0U) +#define __CMPXCHG_DOUBLE __SLAB_FLAG_UNUSED #endif /* @@ -391,7 +391,7 @@ struct kmem_cache_cpu { }; struct slab *slab; /* The slab from which we are allocating */ #ifdef CONFIG_SLUB_CPU_PARTIAL - struct slab *partial; /* Partially allocated frozen slabs */ + struct slab *partial; /* Partially allocated slabs */ #endif local_lock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS @@ -557,6 +557,26 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) *(freeptr_t *)freeptr_addr = freelist_ptr_encode(s, fp, freeptr_addr); } +/* + * See comment in calculate_sizes(). + */ +static inline bool freeptr_outside_object(struct kmem_cache *s) +{ + return s->offset >= s->inuse; +} + +/* + * Return offset of the end of info block which is inuse + free pointer if + * not overlapping with object. 
+ */ +static inline unsigned int get_info_end(struct kmem_cache *s) +{ + if (freeptr_outside_object(s)) + return s->inuse + sizeof(void *); + else + return s->inuse; +} + /* Loop over all objects in a slab */ #define for_each_object(__p, __s, __addr, __objects) \ for (__p = fixup_red_left(__s, __addr); \ @@ -604,11 +624,21 @@ static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); s->cpu_partial_slabs = nr_slabs; } + +static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) +{ + return s->cpu_partial_slabs; +} #else static inline void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) { } + +static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) +{ + return 0; +} #endif /* CONFIG_SLUB_CPU_PARTIAL */ /* @@ -616,18 +646,12 @@ slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) */ static __always_inline void slab_lock(struct slab *slab) { - struct page *page = slab_page(slab); - - VM_BUG_ON_PAGE(PageTail(page), page); - bit_spin_lock(PG_locked, &page->flags); + bit_spin_lock(PG_locked, &slab->__page_flags); } static __always_inline void slab_unlock(struct slab *slab) { - struct page *page = slab_page(slab); - - VM_BUG_ON_PAGE(PageTail(page), page); - bit_spin_unlock(PG_locked, &page->flags); + bit_spin_unlock(PG_locked, &slab->__page_flags); } static inline bool @@ -845,26 +869,6 @@ static void print_section(char *level, char *text, u8 *addr, metadata_access_disable(); } -/* - * See comment in calculate_sizes(). - */ -static inline bool freeptr_outside_object(struct kmem_cache *s) -{ - return s->offset >= s->inuse; -} - -/* - * Return offset of the end of info block which is inuse + free pointer if - * not overlapping with object. - */ -static inline unsigned int get_info_end(struct kmem_cache *s) -{ - if (freeptr_outside_object(s)) - return s->inuse + sizeof(void *); - else - return s->inuse; -} - static struct track *get_track(struct kmem_cache *s, void *object, enum track_item alloc) { @@ -1498,16 +1502,8 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) { struct kmem_cache_node *n = get_node(s, node); - /* - * May be called early in order to allocate a slab for the - * kmem_cache_node structure. Solve the chicken-egg - * dilemma by deferring the increment of the count during - * bootstrap (see early_kmem_cache_node_alloc). - */ - if (likely(n)) { - atomic_long_inc(&n->nr_slabs); - atomic_long_add(objects, &n->total_objects); - } + atomic_long_inc(&n->nr_slabs); + atomic_long_add(objects, &n->total_objects); } static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) { @@ -1616,7 +1612,7 @@ static inline int free_consistency_checks(struct kmem_cache *s, } /* - * Parse a block of slub_debug options. Blocks are delimited by ';' + * Parse a block of slab_debug options. Blocks are delimited by ';' * * @str: start of block * @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified @@ -1677,7 +1673,7 @@ parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init) break; default: if (init) - pr_err("slub_debug option '%c' unknown. skipped\n", *str); + pr_err("slab_debug option '%c' unknown. 
skipped\n", *str); } } check_slabs: @@ -1736,7 +1732,7 @@ static int __init setup_slub_debug(char *str) /* * For backwards compatibility, a single list of flags with list of * slabs means debugging is only changed for those slabs, so the global - * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending + * slab_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as * long as there is no option specifying flags without a slab list. */ @@ -1760,21 +1756,20 @@ out: return 1; } -__setup("slub_debug", setup_slub_debug); +__setup("slab_debug", setup_slub_debug); +__setup_param("slub_debug", slub_debug, setup_slub_debug, 0); /* * kmem_cache_flags - apply debugging options to the cache - * @object_size: the size of an object without meta data * @flags: flags to set * @name: name of the cache * * Debug option(s) are applied to @flags. In addition to the debug * option(s), if a slab name (or multiple) is specified i.e. - * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ... + * slab_debug=<Debug-Options>,<slab name1>,<slab name2> ... * then only the select slabs will receive the debug option(s). */ -slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name) +slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name) { char *iter; size_t len; @@ -1850,8 +1845,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) {} -slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name) +slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name) { return flags; } @@ -1875,203 +1869,278 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, #endif #endif /* CONFIG_SLUB_DEBUG */ -static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) +#ifdef CONFIG_SLAB_OBJ_EXT + +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + +static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) { - return (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; + struct slabobj_ext *slab_exts; + struct slab *obj_exts_slab; + + obj_exts_slab = virt_to_slab(obj_exts); + slab_exts = slab_obj_exts(obj_exts_slab); + if (slab_exts) { + unsigned int offs = obj_to_index(obj_exts_slab->slab_cache, + obj_exts_slab, obj_exts); + /* codetag should be NULL */ + WARN_ON(slab_exts[offs].ref.ct); + set_codetag_empty(&slab_exts[offs].ref); + } } -#ifdef CONFIG_MEMCG_KMEM -static inline void memcg_free_slab_cgroups(struct slab *slab) +static inline void mark_failed_objexts_alloc(struct slab *slab) { - kfree(slab_objcgs(slab)); - slab->memcg_data = 0; + slab->obj_exts = OBJEXTS_ALLOC_FAIL; } -static inline size_t obj_full_size(struct kmem_cache *s) +static inline void handle_failed_objexts_alloc(unsigned long obj_exts, + struct slabobj_ext *vec, unsigned int objects) { /* - * For each accounted object there is an extra space which is used - * to store obj_cgroup membership. Charge it too. + * If vector previously failed to allocate then we have live + * objects with no tag reference. Mark all references in this + * vector as empty to avoid warnings later on. 
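[Illustrative note] The hunks above rename the slub_debug boot parameter to slab_debug while keeping the old name as an alias. parse_slub_debug_flags() documents the block format (blocks delimited by ';'), and the kmem_cache_flags() comment shows the slab_debug=<Debug-Options>,<slab name1>,<slab name2> form. The sketch below is a deliberately simplified userspace parser of that surface syntax only; parse_debug_blocks() is a made-up name and none of the kernel's flag letters or semantics are reproduced.

#include <stdio.h>
#include <string.h>

/*
 * Split "FZ,dentry,kmalloc-64;P,inode_cache" style strings into blocks and,
 * within each block, into a flag run and an optional slab-name list.
 */
static void parse_debug_blocks(char *str)
{
	while (str && *str) {
		char *next = strchr(str, ';');
		char *slabs;

		if (next)
			*next++ = '\0';
		slabs = strchr(str, ',');
		if (slabs)
			*slabs++ = '\0';
		printf("flags '%s' apply to %s\n",
		       *str ? str : "(defaults)",
		       slabs ? slabs : "all caches");
		str = next;
	}
}

int main(void)
{
	char cmdline[] = "FZ,dentry,kmalloc-64;P,inode_cache;U";

	parse_debug_blocks(cmdline);
	return 0;
}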
*/ - return s->size + sizeof(struct obj_cgroup *); + if (obj_exts & OBJEXTS_ALLOC_FAIL) { + unsigned int i; + + for (i = 0; i < objects; i++) + set_codetag_empty(&vec[i].ref); + } } -/* - * Returns false if the allocation should fail. - */ -static bool __memcg_slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, - size_t objects, gfp_t flags) -{ - /* - * The obtained objcg pointer is safe to use within the current scope, - * defined by current task or set_active_memcg() pair. - * obj_cgroup_get() is used to get a permanent reference. - */ - struct obj_cgroup *objcg = current_obj_cgroup(); - if (!objcg) - return true; +#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ - if (lru) { - int ret; - struct mem_cgroup *memcg; +static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) {} +static inline void mark_failed_objexts_alloc(struct slab *slab) {} +static inline void handle_failed_objexts_alloc(unsigned long obj_exts, + struct slabobj_ext *vec, unsigned int objects) {} - memcg = get_mem_cgroup_from_objcg(objcg); - ret = memcg_list_lru_alloc(memcg, lru, flags); - css_put(&memcg->css); +#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ - if (ret) - return false; +/* + * The allocated objcg pointers array is not accounted directly. + * Moreover, it should not come from DMA buffer and is not readily + * reclaimable. So those GFP bits should be masked off. + */ +#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ + __GFP_ACCOUNT | __GFP_NOFAIL) + +int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab) +{ + unsigned int objects = objs_per_slab(s, slab); + unsigned long new_exts; + unsigned long old_exts; + struct slabobj_ext *vec; + + gfp &= ~OBJCGS_CLEAR_MASK; + /* Prevent recursive extension vector allocation */ + gfp |= __GFP_NO_OBJ_EXT; + vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, + slab_nid(slab)); + if (!vec) { + /* Mark vectors which failed to allocate */ + if (new_slab) + mark_failed_objexts_alloc(slab); + + return -ENOMEM; } - if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) - return false; + new_exts = (unsigned long)vec; +#ifdef CONFIG_MEMCG + new_exts |= MEMCG_DATA_OBJEXTS; +#endif + old_exts = slab->obj_exts; + handle_failed_objexts_alloc(old_exts, vec, objects); + if (new_slab) { + /* + * If the slab is brand new and nobody can yet access its + * obj_exts, no synchronization is required and obj_exts can + * be simply assigned. + */ + slab->obj_exts = new_exts; + } else if (cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) { + /* + * If the slab is already in use, somebody can allocate and + * assign slabobj_exts in parallel. In this case the existing + * objcg vector should be reused. + */ + mark_objexts_empty(vec); + kfree(vec); + return 0; + } - *objcgp = objcg; - return true; + kmemleak_not_leak(vec); + return 0; } -/* - * Returns false if the allocation should fail. - */ -static __fastpath_inline -bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, struct list_lru *lru, - struct obj_cgroup **objcgp, size_t objects, - gfp_t flags) +static inline void free_slab_obj_exts(struct slab *slab) { - if (!memcg_kmem_online()) - return true; + struct slabobj_ext *obj_exts; - if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))) + obj_exts = slab_obj_exts(slab); + if (!obj_exts) + return; + + /* + * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its + * corresponding extension will be NULL. 
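[Illustrative note] alloc_slab_obj_exts() above installs the freshly allocated extension vector with cmpxchg() so that two racing allocators agree on a single vector: the loser frees its copy and reuses the winner's, and a previous allocation failure, recorded as a sentinel in the same word, is compensated for by marking the affected entries empty. The userspace sketch below covers only the publish-or-discard part, using C11 atomics; the names (get_or_publish_exts, ext_vec, slab_exts) are hypothetical.

#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>
#include <stdint.h>

struct ext_vec {
	int nr;
};

/* one word per "slab": 0 = none, otherwise a published vector pointer */
static _Atomic uintptr_t slab_exts;

/*
 * Allocate a vector and try to publish it.  If another caller published one
 * first, free ours and return the winner's, mirroring the cmpxchg() fallback
 * in alloc_slab_obj_exts().
 */
static struct ext_vec *get_or_publish_exts(int nr)
{
	uintptr_t old = atomic_load(&slab_exts);
	struct ext_vec *vec;

	if (old)
		return (struct ext_vec *)old;

	vec = malloc(sizeof(*vec));
	if (!vec)
		return NULL;
	vec->nr = nr;

	if (!atomic_compare_exchange_strong(&slab_exts, &old, (uintptr_t)vec)) {
		/* lost the race: somebody else's vector is already in place */
		free(vec);
		return (struct ext_vec *)old;
	}
	return vec;
}

int main(void)
{
	struct ext_vec *a = get_or_publish_exts(16);
	struct ext_vec *b = get_or_publish_exts(16);

	printf("same vector: %s\n", a == b ? "yes" : "no");
	free(a);
	return 0;
}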
alloc_tag_sub() will throw a + * warning if slab has extensions but the extension of an object is + * NULL, therefore replace NULL with CODETAG_EMPTY to indicate that + * the extension for obj_exts is expected to be NULL. + */ + mark_objexts_empty(obj_exts); + kfree(obj_exts); + slab->obj_exts = 0; +} + +static inline bool need_slab_obj_ext(void) +{ + if (mem_alloc_profiling_enabled()) return true; - return likely(__memcg_slab_pre_alloc_hook(s, lru, objcgp, objects, - flags)); + /* + * CONFIG_MEMCG_KMEM creates vector of obj_cgroup objects conditionally + * inside memcg_slab_post_alloc_hook. No other users for now. + */ + return false; } -static void __memcg_slab_post_alloc_hook(struct kmem_cache *s, - struct obj_cgroup *objcg, - gfp_t flags, size_t size, - void **p) +static inline struct slabobj_ext * +prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) { struct slab *slab; - unsigned long off; - size_t i; - flags &= gfp_allowed_mask; + if (!p) + return NULL; - for (i = 0; i < size; i++) { - if (likely(p[i])) { - slab = virt_to_slab(p[i]); + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) + return NULL; - if (!slab_objcgs(slab) && - memcg_alloc_slab_cgroups(slab, s, flags, false)) { - obj_cgroup_uncharge(objcg, obj_full_size(s)); - continue; - } + if (flags & __GFP_NO_OBJ_EXT) + return NULL; - off = obj_to_index(s, slab, p[i]); - obj_cgroup_get(objcg); - slab_objcgs(slab)[off] = objcg; - mod_objcg_state(objcg, slab_pgdat(slab), - cache_vmstat_idx(s), obj_full_size(s)); - } else { - obj_cgroup_uncharge(objcg, obj_full_size(s)); - } - } + slab = virt_to_slab(p); + if (!slab_obj_exts(slab) && + WARN(alloc_slab_obj_exts(slab, s, flags, false), + "%s, %s: Failed to create slab extension vector!\n", + __func__, s->name)) + return NULL; + + return slab_obj_exts(slab) + obj_to_index(s, slab, p); } -static __fastpath_inline -void memcg_slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, - gfp_t flags, size_t size, void **p) +static inline void +alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) { - if (likely(!memcg_kmem_online() || !objcg)) - return; +#ifdef CONFIG_MEM_ALLOC_PROFILING + struct slabobj_ext *obj_exts; + int i; - return __memcg_slab_post_alloc_hook(s, objcg, flags, size, p); -} + if (!mem_alloc_profiling_enabled()) + return; -static void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, - void **p, int objects, - struct obj_cgroup **objcgs) -{ - for (int i = 0; i < objects; i++) { - struct obj_cgroup *objcg; - unsigned int off; + obj_exts = slab_obj_exts(slab); + if (!obj_exts) + return; - off = obj_to_index(s, slab, p[i]); - objcg = objcgs[off]; - if (!objcg) - continue; + for (i = 0; i < objects; i++) { + unsigned int off = obj_to_index(s, slab, p[i]); - objcgs[off] = NULL; - obj_cgroup_uncharge(objcg, obj_full_size(s)); - mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), - -obj_full_size(s)); - obj_cgroup_put(objcg); + alloc_tag_sub(&obj_exts[off].ref, s->size); } +#endif } -static __fastpath_inline -void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, - int objects) -{ - struct obj_cgroup **objcgs; - - if (!memcg_kmem_online()) - return; +#else /* CONFIG_SLAB_OBJ_EXT */ - objcgs = slab_objcgs(slab); - if (likely(!objcgs)) - return; +static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab) +{ + return 0; +} - __memcg_slab_free_hook(s, slab, p, objects, objcgs); +static inline void 
free_slab_obj_exts(struct slab *slab) +{ } -static inline -void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects, - struct obj_cgroup *objcg) +static inline bool need_slab_obj_ext(void) { - if (objcg) - obj_cgroup_uncharge(objcg, objects * obj_full_size(s)); + return false; } -#else /* CONFIG_MEMCG_KMEM */ -static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) + +static inline struct slabobj_ext * +prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) { return NULL; } -static inline void memcg_free_slab_cgroups(struct slab *slab) +static inline void +alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) { } -static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, - size_t objects, gfp_t flags) +#endif /* CONFIG_SLAB_OBJ_EXT */ + +#ifdef CONFIG_MEMCG_KMEM + +static void memcg_alloc_abort_single(struct kmem_cache *s, void *object); + +static __fastpath_inline +bool memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, + gfp_t flags, size_t size, void **p) { - return true; + if (likely(!memcg_kmem_online())) + return true; + + if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))) + return true; + + if (likely(__memcg_slab_post_alloc_hook(s, lru, flags, size, p))) + return true; + + if (likely(size == 1)) { + memcg_alloc_abort_single(s, *p); + *p = NULL; + } else { + kmem_cache_free_bulk(s, size, p); + } + + return false; } -static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, - struct obj_cgroup *objcg, +static __fastpath_inline +void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) +{ + struct slabobj_ext *obj_exts; + + if (!memcg_kmem_online()) + return; + + obj_exts = slab_obj_exts(slab); + if (likely(!obj_exts)) + return; + + __memcg_slab_free_hook(s, slab, p, objects, obj_exts); +} +#else /* CONFIG_MEMCG_KMEM */ +static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, gfp_t flags, size_t size, void **p) { + return true; } static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { } - -static inline -void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects, - struct obj_cgroup *objcg) -{ -} #endif /* CONFIG_MEMCG_KMEM */ /* @@ -2107,23 +2176,28 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init) * * The initialization memset's clear the object and the metadata, * but don't touch the SLAB redzone. + * + * The object's freepointer is also avoided if stored outside the + * object. */ if (unlikely(init)) { int rsize; + unsigned int inuse; + inuse = get_info_end(s); if (!kasan_has_integrated_init()) memset(kasan_reset_tag(x), 0, s->object_size); rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0; - memset((char *)kasan_reset_tag(x) + s->inuse, 0, - s->size - s->inuse - rsize); + memset((char *)kasan_reset_tag(x) + inuse, 0, + s->size - inuse - rsize); } /* KASAN might put x into memory quarantine, delaying its reuse. 
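[Illustrative note] The slab_free_hook() change above makes init-on-free zero the object and its trailing metadata without touching a free pointer stored outside the object (the bytes between s->inuse and s->inuse + sizeof(void *)), matching the maybe_wipe_obj_freeptr() change further down. The sketch below only works through that layout arithmetic; struct toy_cache and its field values are illustrative, not taken from a real cache.

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* illustrative subset of the layout fields used by the hunk above */
struct toy_cache {
	unsigned int object_size;	/* payload visible to users */
	unsigned int inuse;		/* payload + in-object metadata */
	unsigned int offset;		/* free pointer offset */
	unsigned int size;		/* full per-object footprint */
	unsigned int red_right_pad;	/* trailing redzone, left untouched */
};

static unsigned int info_end(const struct toy_cache *s)
{
	/* free pointer outside the object? then it follows the info block */
	return s->offset >= s->inuse ? s->inuse + sizeof(void *) : s->inuse;
}

static void init_on_free(const struct toy_cache *s, unsigned char *obj)
{
	unsigned int inuse = info_end(s);

	memset(obj, 0, s->object_size);			/* the object itself */
	memset(obj + inuse, 0, s->size - inuse - s->red_right_pad);
	/* bytes in [s->inuse, inuse), the external free pointer, survive */
}

int main(void)
{
	struct toy_cache s = { 64, 64, 64, 96, 8 };
	unsigned char *obj = malloc(s.size);

	memset(obj, 0xaa, s.size);
	init_on_free(&s, obj);
	printf("freeptr byte after wipe: %#x\n", obj[s.inuse]);
	free(obj);
	return 0;
}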
*/ return !kasan_slab_free(s, x, init); } -static inline bool slab_free_freelist_hook(struct kmem_cache *s, - void **head, void **tail, - int *cnt) +static __fastpath_inline +bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail, + int *cnt) { void *object; @@ -2243,7 +2317,7 @@ static void __init init_freelist_randomization(void) } /* Get the next entry on the pre-computed freelist randomized */ -static void *next_freelist_entry(struct kmem_cache *s, struct slab *slab, +static void *next_freelist_entry(struct kmem_cache *s, unsigned long *pos, void *start, unsigned long page_limit, unsigned long freelist_count) @@ -2282,13 +2356,12 @@ static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) start = fixup_red_left(s, slab_address(slab)); /* First entry is used as the base of the freelist */ - cur = next_freelist_entry(s, slab, &pos, start, page_limit, - freelist_count); + cur = next_freelist_entry(s, &pos, start, page_limit, freelist_count); cur = setup_object(s, cur); slab->freelist = cur; for (idx = 1; idx < slab->objects; idx++) { - next = next_freelist_entry(s, slab, &pos, start, page_limit, + next = next_freelist_entry(s, &pos, start, page_limit, freelist_count); next = setup_object(s, next); set_freepointer(s, cur, next); @@ -2314,7 +2387,7 @@ static __always_inline void account_slab(struct slab *slab, int order, struct kmem_cache *s, gfp_t gfp) { if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) - memcg_alloc_slab_cgroups(slab, s, gfp, true); + alloc_slab_obj_exts(slab, s, gfp, true); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), PAGE_SIZE << order); @@ -2323,8 +2396,8 @@ static __always_inline void account_slab(struct slab *slab, int order, static __always_inline void unaccount_slab(struct slab *slab, int order, struct kmem_cache *s) { - if (memcg_kmem_online()) - memcg_free_slab_cgroups(slab); + if (memcg_kmem_online() || need_slab_obj_ext()) + free_slab_obj_exts(slab); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), -(PAGE_SIZE << order)); @@ -2620,19 +2693,18 @@ static struct slab *get_partial_node(struct kmem_cache *s, if (!partial) { partial = slab; stat(s, ALLOC_FROM_PARTIAL); + + if ((slub_get_cpu_partial(s) == 0)) { + break; + } } else { put_cpu_partial(s, slab, 0); stat(s, CPU_PARTIAL_NODE); - partial_slabs++; - } -#ifdef CONFIG_SLUB_CPU_PARTIAL - if (!kmem_cache_has_cpu_partial(s) - || partial_slabs > s->cpu_partial_slabs / 2) - break; -#else - break; -#endif + if (++partial_slabs > slub_get_cpu_partial(s) / 2) { + break; + } + } } spin_unlock_irqrestore(&n->list_lock, flags); return partial; @@ -2715,7 +2787,7 @@ static struct slab *get_partial(struct kmem_cache *s, int node, searchnode = numa_mem_id(); slab = get_partial_node(s, get_node(s, searchnode), pc); - if (slab || node != NUMA_NO_NODE) + if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) return slab; return get_any_partial(s, pc); @@ -2813,7 +2885,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, struct slab new; struct slab old; - if (slab->freelist) { + if (READ_ONCE(slab->freelist)) { stat(s, DEACTIVATE_REMOTE_FREES); tail = DEACTIVATE_TO_TAIL; } @@ -3245,6 +3317,43 @@ static unsigned long count_partial(struct kmem_cache_node *n, #endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */ #ifdef CONFIG_SLUB_DEBUG +#define MAX_PARTIAL_TO_SCAN 10000 + +static unsigned long count_partial_free_approx(struct kmem_cache_node *n) +{ + unsigned long flags; + unsigned long x = 0; + struct slab *slab; + + 
spin_lock_irqsave(&n->list_lock, flags); + if (n->nr_partial <= MAX_PARTIAL_TO_SCAN) { + list_for_each_entry(slab, &n->partial, slab_list) + x += slab->objects - slab->inuse; + } else { + /* + * For a long list, approximate the total count of objects in + * it to meet the limit on the number of slabs to scan. + * Scan from both the list's head and tail for better accuracy. + */ + unsigned long scanned = 0; + + list_for_each_entry(slab, &n->partial, slab_list) { + x += slab->objects - slab->inuse; + if (++scanned == MAX_PARTIAL_TO_SCAN / 2) + break; + } + list_for_each_entry_reverse(slab, &n->partial, slab_list) { + x += slab->objects - slab->inuse; + if (++scanned == MAX_PARTIAL_TO_SCAN) + break; + } + x = mult_frac(x, n->nr_partial, scanned); + x = min(x, node_nr_objs(n)); + } + spin_unlock_irqrestore(&n->list_lock, flags); + return x; +} + static noinline void slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { @@ -3263,7 +3372,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) oo_order(s->min)); if (oo_order(s->min) > get_order(s->object_size)) - pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", + pr_warn(" %s debugging increased min order, use slab_debug=O to disable.\n", s->name); for_each_kmem_cache_node(s, node, n) { @@ -3271,7 +3380,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) unsigned long nr_objs; unsigned long nr_free; - nr_free = count_partial(n, count_free); + nr_free = count_partial_free_approx(n); nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); @@ -3326,7 +3435,6 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) counters = slab->counters; new.counters = counters; - VM_BUG_ON(!new.frozen); new.inuse = slab->objects; new.frozen = freelist != NULL; @@ -3392,6 +3500,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, struct slab *slab; unsigned long flags; struct partial_context pc; + bool try_thisnode = true; stat(s, ALLOC_SLOWPATH); @@ -3498,24 +3607,41 @@ new_slab: slab = slub_percpu_partial(c); slub_set_percpu_partial(c, slab); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); - stat(s, CPU_PARTIAL_ALLOC); - if (unlikely(!node_match(slab, node) || - !pfmemalloc_match(slab, gfpflags))) { - slab->next = NULL; - __put_partials(s, slab); - continue; + if (likely(node_match(slab, node) && + pfmemalloc_match(slab, gfpflags))) { + c->slab = slab; + freelist = get_freelist(s, slab); + VM_BUG_ON(!freelist); + stat(s, CPU_PARTIAL_ALLOC); + goto load_freelist; } - freelist = freeze_slab(s, slab); - goto retry_load_slab; + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + slab->next = NULL; + __put_partials(s, slab); } #endif new_objects: pc.flags = gfpflags; + /* + * When a preferred node is indicated but no __GFP_THISNODE + * + * 1) try to get a partial slab from target node only by having + * __GFP_THISNODE in pc.flags for get_partial() + * 2) if 1) failed, try to allocate a new slab from target node with + * GPF_NOWAIT | __GFP_THISNODE opportunistically + * 3) if 2) failed, retry with original gfpflags which will allow + * get_partial() try partial lists of other nodes before potentially + * allocating new page from other nodes + */ + if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) + && try_thisnode)) + pc.flags = GFP_NOWAIT | __GFP_THISNODE; + pc.orig_size = orig_size; slab = get_partial(s, node, &pc); if (slab) { @@ -3537,10 +3663,15 @@ new_objects: } slub_put_cpu_ptr(s->cpu_slab); - slab = new_slab(s, 
gfpflags, node); + slab = new_slab(s, pc.flags, node); c = slub_get_cpu_ptr(s->cpu_slab); if (unlikely(!slab)) { + if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) + && try_thisnode) { + try_thisnode = false; + goto new_objects; + } slab_out_of_memory(s, gfpflags, node); return NULL; } @@ -3737,7 +3868,8 @@ static void *__slab_alloc_node(struct kmem_cache *s, static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, void *obj) { - if (unlikely(slab_want_init_on_free(s)) && obj) + if (unlikely(slab_want_init_on_free(s)) && obj && + !freeptr_outside_object(s)) memset((void *)((char *)kasan_reset_tag(obj) + s->offset), 0, sizeof(void *)); } @@ -3751,10 +3883,7 @@ noinline int should_failslab(struct kmem_cache *s, gfp_t gfpflags) ALLOW_ERROR_INJECTION(should_failslab, ERRNO); static __fastpath_inline -struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, - size_t size, gfp_t flags) +struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { flags &= gfp_allowed_mask; @@ -3763,18 +3892,16 @@ struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, if (unlikely(should_failslab(s, flags))) return NULL; - if (unlikely(!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags))) - return NULL; - return s; } static __fastpath_inline -void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, +bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, gfp_t flags, size_t size, void **p, bool init, unsigned int orig_size) { unsigned int zero_size = s->object_size; + struct slabobj_ext *obj_exts; bool kasan_init = init; size_t i; gfp_t init_flags = flags & gfp_allowed_mask; @@ -3792,11 +3919,11 @@ void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, zero_size = orig_size; /* - * When slub_debug is enabled, avoid memory initialization integrated + * When slab_debug is enabled, avoid memory initialization integrated * into KASAN and instead zero out the memory via the memset below with * the proper size. Otherwise, KASAN might overwrite SLUB redzones and * cause false-positive reports. This does not lead to a performance - * penalty on production builds, as slub_debug is not intended to be + * penalty on production builds, as slab_debug is not intended to be * enabled there. */ if (__slub_debug_enabled()) @@ -3817,9 +3944,21 @@ void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, init_flags); kmsan_slab_alloc(s, p[i], init_flags); + if (need_slab_obj_ext()) { + obj_exts = prepare_slab_obj_exts_hook(s, flags, p[i]); +#ifdef CONFIG_MEM_ALLOC_PROFILING + /* + * Currently obj_exts is used only for allocation profiling. + * If other users appear then mem_alloc_profiling_enabled() + * check should be added before alloc_tag_add(). 
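[Illustrative note] count_partial_free_approx(), added above, bounds the cost of the partial-list walk: for lists longer than MAX_PARTIAL_TO_SCAN it samples half the budget from the head and the rest from the tail, scales the sampled free count by nr_partial/scanned, and clamps the result to the node's object count. Below is a userspace sketch of the same estimate; approx_free() and the array standing in for the list are illustrative, and MAX_TO_SCAN is shrunk from the kernel's 10000 to keep the demo small.

#include <stdio.h>

#define MAX_TO_SCAN 10

/* free object count per "slab"; an array stands in for the partial list */
static unsigned long approx_free(const unsigned int *free_per_slab,
				 unsigned long nr_partial,
				 unsigned long total_objects)
{
	unsigned long x = 0, scanned = 0, i;

	if (nr_partial <= MAX_TO_SCAN) {
		for (i = 0; i < nr_partial; i++)
			x += free_per_slab[i];
		return x;
	}

	/* sample half the budget from the head ... */
	for (i = 0; i < MAX_TO_SCAN / 2; i++) {
		x += free_per_slab[i];
		scanned++;
	}
	/* ... and the rest from the tail, then extrapolate */
	for (i = nr_partial - 1; scanned < MAX_TO_SCAN; i--) {
		x += free_per_slab[i];
		scanned++;
	}
	x = x * nr_partial / scanned;
	return x < total_objects ? x : total_objects;
}

int main(void)
{
	unsigned int slabs[40];
	unsigned long i, exact = 0;

	for (i = 0; i < 40; i++) {
		slabs[i] = i % 5;		/* 0..4 free objects per slab */
		exact += slabs[i];
	}
	printf("exact %lu, approx %lu\n", exact,
	       approx_free(slabs, 40, 40 * 32));
	return 0;
}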
+ */ + if (likely(obj_exts)) + alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); +#endif + } } - memcg_slab_post_alloc_hook(s, objcg, flags, size, p); + return memcg_slab_post_alloc_hook(s, lru, flags, size, p); } /* @@ -3836,10 +3975,9 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { void *object; - struct obj_cgroup *objcg = NULL; bool init = false; - s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags); + s = slab_pre_alloc_hook(s, gfpflags); if (unlikely(!s)) return NULL; @@ -3856,13 +3994,15 @@ out: /* * When init equals 'true', like for kzalloc() family, only * @orig_size bytes might be zeroed instead of s->object_size + * In case this fails due to memcg_slab_post_alloc_hook(), + * object is set to NULL */ - slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size); + slab_post_alloc_hook(s, lru, gfpflags, 1, &object, init, orig_size); return object; } -void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) +void *kmem_cache_alloc_noprof(struct kmem_cache *s, gfp_t gfpflags) { void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_, s->object_size); @@ -3871,9 +4011,9 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) return ret; } -EXPORT_SYMBOL(kmem_cache_alloc); +EXPORT_SYMBOL(kmem_cache_alloc_noprof); -void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, +void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags) { void *ret = slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, _RET_IP_, @@ -3883,7 +4023,7 @@ void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_lru); +EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof); /** * kmem_cache_alloc_node - Allocate an object on the specified node @@ -3898,7 +4038,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_lru); * * Return: pointer to the new object or %NULL in case of error */ -void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) +void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); @@ -3906,7 +4046,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node); +EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); /* * To avoid unnecessary overhead, we pass through large allocation requests @@ -3923,7 +4063,7 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) flags = kmalloc_fix_flags(flags); flags |= __GFP_COMP; - folio = (struct folio *)alloc_pages_node(node, flags, order); + folio = (struct folio *)alloc_pages_node_noprof(node, flags, order); if (folio) { ptr = folio_address(folio); lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, @@ -3938,7 +4078,7 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) return ptr; } -void *kmalloc_large(size_t size, gfp_t flags) +void *kmalloc_large_noprof(size_t size, gfp_t flags) { void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); @@ -3946,9 +4086,9 @@ void *kmalloc_large(size_t size, gfp_t flags) flags, NUMA_NO_NODE); return ret; } -EXPORT_SYMBOL(kmalloc_large); +EXPORT_SYMBOL(kmalloc_large_noprof); -void *kmalloc_large_node(size_t size, gfp_t flags, int node) +void *kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) { void *ret = __kmalloc_large_node(size, flags, node); @@ -3956,7 
+4096,7 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) flags, node); return ret; } -EXPORT_SYMBOL(kmalloc_large_node); +EXPORT_SYMBOL(kmalloc_large_node_noprof); static __always_inline void *__do_kmalloc_node(size_t size, gfp_t flags, int node, @@ -3983,26 +4123,26 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, return ret; } -void *__kmalloc_node(size_t size, gfp_t flags, int node) +void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node) { return __do_kmalloc_node(size, flags, node, _RET_IP_); } -EXPORT_SYMBOL(__kmalloc_node); +EXPORT_SYMBOL(__kmalloc_node_noprof); -void *__kmalloc(size_t size, gfp_t flags) +void *__kmalloc_noprof(size_t size, gfp_t flags) { return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); } -EXPORT_SYMBOL(__kmalloc); +EXPORT_SYMBOL(__kmalloc_noprof); -void *__kmalloc_node_track_caller(size_t size, gfp_t flags, - int node, unsigned long caller) +void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, + int node, unsigned long caller) { return __do_kmalloc_node(size, flags, node, caller); } -EXPORT_SYMBOL(__kmalloc_node_track_caller); +EXPORT_SYMBOL(kmalloc_node_track_caller_noprof); -void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_, size); @@ -4012,9 +4152,9 @@ void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } -EXPORT_SYMBOL(kmalloc_trace); +EXPORT_SYMBOL(kmalloc_trace_noprof); -void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, +void *kmalloc_node_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, int node, size_t size) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size); @@ -4024,7 +4164,7 @@ void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } -EXPORT_SYMBOL(kmalloc_node_trace); +EXPORT_SYMBOL(kmalloc_node_trace_noprof); static noinline void free_to_partial_list( struct kmem_cache *s, struct slab *slab, @@ -4187,7 +4327,6 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * then add it. 
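[Illustrative note] The renames above give every allocation entry point a *_noprof variant; in the allocation-profiling series the familiar names are expected to become wrappers that record the calling site before delegating, but that header side is not part of this hunk. The userspace sketch below only shows the general wrapping pattern; my_alloc, my_alloc_noprof and my_alloc_profiled are hypothetical names, not kernel API.

#include <stdio.h>
#include <stdlib.h>

/* per-site accounting; the real series keeps this in per-callsite tags */
static unsigned long total_bytes;

/* the "_noprof" variant does only the allocation, no accounting */
static void *my_alloc_noprof(size_t size)
{
	return malloc(size);
}

static void *my_alloc_profiled(size_t size, const char *file, int line)
{
	total_bytes += size;
	printf("alloc %zu bytes at %s:%d\n", size, file, line);
	return my_alloc_noprof(size);
}

/* callers keep writing my_alloc(); the macro adds the call-site bookkeeping */
#define my_alloc(size)	my_alloc_profiled((size), __FILE__, __LINE__)

int main(void)
{
	void *p = my_alloc(128);
	void *q = my_alloc_noprof(64);	/* opts out of accounting */

	printf("accounted bytes: %lu\n", total_bytes);
	free(p);
	free(q);
	return 0;
}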
*/ if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { - remove_full(s, n, slab); add_partial(n, slab, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } @@ -4201,9 +4340,6 @@ slab_empty: */ remove_partial(n, slab); stat(s, FREE_REMOVE_PARTIAL); - } else { - /* Slab must be on the full list */ - remove_full(s, n, slab); } spin_unlock_irqrestore(&n->list_lock, flags); @@ -4245,7 +4381,7 @@ redo: c = raw_cpu_ptr(s->cpu_slab); tid = READ_ONCE(c->tid); - /* Same with comment on barrier() in slab_alloc_node() */ + /* Same with comment on barrier() in __slab_alloc_node() */ barrier(); if (unlikely(slab != c->slab)) { @@ -4295,16 +4431,28 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, unsigned long addr) { memcg_slab_free_hook(s, slab, &object, 1); + alloc_tagging_slab_free_hook(s, slab, &object, 1); if (likely(slab_free_hook(s, object, slab_want_init_on_free(s)))) do_slab_free(s, slab, object, object, 1, addr); } +#ifdef CONFIG_MEMCG_KMEM +/* Do not inline the rare memcg charging failed path into the allocation path */ +static noinline +void memcg_alloc_abort_single(struct kmem_cache *s, void *object) +{ + if (likely(slab_free_hook(s, object, slab_want_init_on_free(s)))) + do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); +} +#endif + static __fastpath_inline void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head, void *tail, void **p, int cnt, unsigned long addr) { memcg_slab_free_hook(s, slab, p, cnt); + alloc_tagging_slab_free_hook(s, slab, p, cnt); /* * With KASAN enabled slab_free_freelist_hook modifies the freelist * to remove objects, whose reuse must be delayed. @@ -4631,36 +4779,33 @@ error: #endif /* CONFIG_SLUB_TINY */ /* Note that interrupts must be enabled when calling this function. */ -int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) +int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) { int i; - struct obj_cgroup *objcg = NULL; if (!size) return 0; - /* memcg and kmem_cache debug support */ - s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); + s = slab_pre_alloc_hook(s, flags); if (unlikely(!s)) return 0; i = __kmem_cache_alloc_bulk(s, flags, size, p); + if (unlikely(i == 0)) + return 0; /* * memcg and kmem_cache debug support and memory initialization. * Done outside of the IRQ disabled fastpath loop. */ - if (likely(i != 0)) { - slab_post_alloc_hook(s, objcg, flags, size, p, - slab_want_init_on_alloc(flags, s), s->object_size); - } else { - memcg_slab_alloc_error_hook(s, size, objcg); + if (unlikely(!slab_post_alloc_hook(s, NULL, flags, size, p, + slab_want_init_on_alloc(flags, s), s->object_size))) { + return 0; } - return i; } -EXPORT_SYMBOL(kmem_cache_alloc_bulk); +EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); /* @@ -4702,8 +4847,8 @@ static unsigned int slub_min_objects; * activity on the partial lists which requires taking the list_lock. This is * less a concern for large slabs though which are rarely used. * - * slub_max_order specifies the order where we begin to stop considering the - * number of objects in a slab as critical. If we reach slub_max_order then + * slab_max_order specifies the order where we begin to stop considering the + * number of objects in a slab as critical. If we reach slab_max_order then * we try to keep the page order as low as possible. So we accept more waste * of space in favor of a small page order. 
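[Illustrative note] The bulk-allocation rework above allocates objects first and only then runs the post-alloc hook; if the memcg charge fails, the hook undoes the work (a single object via memcg_alloc_abort_single(), or the whole batch via kmem_cache_free_bulk()) and the caller simply reports failure. The userspace sketch below mirrors that allocate-then-charge-with-rollback shape; the budget and charge() function are made-up stand-ins for the memcg charge.

#include <stdio.h>
#include <stdlib.h>

static long budget = 256;	/* stand-in for the memcg limit */

static int charge(size_t bytes)
{
	if ((long)bytes > budget)
		return -1;
	budget -= bytes;
	return 0;
}

/* allocate @nr objects of @size, then charge; roll back fully on failure */
static int alloc_bulk(size_t size, size_t nr, void **p)
{
	size_t i;

	for (i = 0; i < nr; i++) {
		p[i] = malloc(size);
		if (!p[i])
			goto undo;
	}
	if (charge(size * nr))
		goto undo;	/* post-alloc hook failed: free the batch */
	return nr;

undo:
	while (i--)
		free(p[i]);
	return 0;
}

int main(void)
{
	void *a[4], *b[4];
	int i, n;

	n = alloc_bulk(64, 4, a);
	printf("first batch:  %d objects\n", n);
	printf("second batch: %d objects\n", alloc_bulk(64, 4, b));
	for (i = 0; i < n; i++)
		free(a[i]);
	return 0;
}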
* @@ -4770,14 +4915,14 @@ static inline int calculate_order(unsigned int size) * and backing off gradually. * * We start with accepting at most 1/16 waste and try to find the - * smallest order from min_objects-derived/slub_min_order up to - * slub_max_order that will satisfy the constraint. Note that increasing + * smallest order from min_objects-derived/slab_min_order up to + * slab_max_order that will satisfy the constraint. Note that increasing * the order can only result in same or less fractional waste, not more. * * If that fails, we increase the acceptable fraction of waste and try * again. The last iteration with fraction of 1/2 would effectively * accept any waste and give us the order determined by min_objects, as - * long as at least single object fits within slub_max_order. + * long as at least single object fits within slab_max_order. */ for (unsigned int fraction = 16; fraction > 1; fraction /= 2) { order = calc_slab_order(size, min_order, slub_max_order, @@ -4787,7 +4932,7 @@ static inline int calculate_order(unsigned int size) } /* - * Doh this slab cannot be placed using slub_max_order. + * Doh this slab cannot be placed using slab_max_order. */ order = get_order(size); if (order <= MAX_PAGE_ORDER) @@ -4857,7 +5002,6 @@ static void early_kmem_cache_node_alloc(int node) slab = new_slab(kmem_cache_node, GFP_NOWAIT, node); BUG_ON(!slab); - inc_slabs_node(kmem_cache_node, slab_nid(slab), slab->objects); if (slab_nid(slab) != node) { pr_err("SLUB: Unable to allocate memory from node %d\n", node); pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); @@ -4867,7 +5011,6 @@ static void early_kmem_cache_node_alloc(int node) BUG_ON(!n); #ifdef CONFIG_SLUB_DEBUG init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); - init_tracking(kmem_cache_node, n); #endif n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); @@ -5080,9 +5223,7 @@ static int calculate_sizes(struct kmem_cache *s) if ((int)order < 0) return 0; - s->allocflags = 0; - if (order) - s->allocflags |= __GFP_COMP; + s->allocflags = __GFP_COMP; if (s->flags & SLAB_CACHE_DMA) s->allocflags |= GFP_DMA; @@ -5104,7 +5245,7 @@ static int calculate_sizes(struct kmem_cache *s) static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) { - s->flags = kmem_cache_flags(s->size, flags, s->name); + s->flags = kmem_cache_flags(flags, s->name); #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); #endif @@ -5313,7 +5454,9 @@ static int __init setup_slub_min_order(char *str) return 1; } -__setup("slub_min_order=", setup_slub_min_order); +__setup("slab_min_order=", setup_slub_min_order); +__setup_param("slub_min_order=", slub_min_order, setup_slub_min_order, 0); + static int __init setup_slub_max_order(char *str) { @@ -5326,7 +5469,8 @@ static int __init setup_slub_max_order(char *str) return 1; } -__setup("slub_max_order=", setup_slub_max_order); +__setup("slab_max_order=", setup_slub_max_order); +__setup_param("slub_max_order=", slub_max_order, setup_slub_max_order, 0); static int __init setup_slub_min_objects(char *str) { @@ -5335,7 +5479,8 @@ static int __init setup_slub_min_objects(char *str) return 1; } -__setup("slub_min_objects=", setup_slub_min_objects); +__setup("slab_min_objects=", setup_slub_min_objects); +__setup_param("slub_min_objects=", slub_min_objects, setup_slub_min_objects, 0); #ifdef CONFIG_HARDENED_USERCOPY /* @@ -5646,7 +5791,8 @@ void __init kmem_cache_init(void) node_set(node, slab_nodes); 
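[Illustrative note] The calculate_order() comments above describe the search: start from a strict waste budget of 1/16 of the slab, find the smallest order whose leftover space fits the budget, and relax the budget (1/8, 1/4, 1/2) only when no order qualifies. The userspace sketch below reproduces just that waste-fraction loop; order_for_size() and the MIN_ORDER/MAX_ORDER limits are arbitrary, and calc_slab_order()'s other constraints (min_objects and friends) are omitted.

#include <stdio.h>

#define PAGE_SIZE	4096u
#define MIN_ORDER	0u
#define MAX_ORDER	3u	/* arbitrary cap standing in for slab_max_order */

/* smallest order in [MIN_ORDER, MAX_ORDER] whose waste fits the budget */
static int order_for_size(unsigned int size)
{
	unsigned int fraction, order;

	for (fraction = 16; fraction > 1; fraction /= 2) {
		for (order = MIN_ORDER; order <= MAX_ORDER; order++) {
			unsigned int slab = PAGE_SIZE << order;
			unsigned int waste = slab % size;

			if (slab / size && waste <= slab / fraction)
				return order;
		}
	}
	return -1;	/* nothing acceptable even at 1/2 waste */
}

int main(void)
{
	unsigned int sizes[] = { 256, 700, 3000, 5000 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("object %4u bytes -> order %d\n",
		       sizes[i], order_for_size(sizes[i]));
	return 0;
}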
create_boot_cache(kmem_cache_node, "kmem_cache_node", - sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0); + sizeof(struct kmem_cache_node), + SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); @@ -5656,14 +5802,14 @@ void __init kmem_cache_init(void) create_boot_cache(kmem_cache, "kmem_cache", offsetof(struct kmem_cache, node) + nr_node_ids * sizeof(struct kmem_cache_node *), - SLAB_HWCACHE_ALIGN, 0, 0); + SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); kmem_cache = bootstrap(&boot_kmem_cache); kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ setup_kmalloc_cache_index_table(); - create_kmalloc_caches(0); + create_kmalloc_caches(); /* Setup random freelists for each cache */ init_freelist_randomization(); @@ -6052,7 +6198,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, else if (flags & SO_OBJECTS) WARN_ON_ONCE(1); else - x = slab->slabs; + x = data_race(slab->slabs); total += x; nodes[node] += x; } @@ -6257,7 +6403,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (slab) - slabs += slab->slabs; + slabs += data_race(slab->slabs); } #endif @@ -6271,7 +6417,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (slab) { - slabs = READ_ONCE(slab->slabs); + slabs = data_race(slab->slabs); objects = (slabs * oo_objects(s->oo)) / 2; len += sysfs_emit_at(buf, len, " C%d=%d(%d)", cpu, objects, slabs); @@ -6792,14 +6938,12 @@ out_del_kobj: void sysfs_slab_unlink(struct kmem_cache *s) { - if (slab_state >= FULL) - kobject_del(&s->kobj); + kobject_del(&s->kobj); } void sysfs_slab_release(struct kmem_cache *s) { - if (slab_state >= FULL) - kobject_put(&s->kobj); + kobject_put(&s->kobj); } /* @@ -7107,7 +7251,7 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) for_each_kmem_cache_node(s, node, n) { nr_slabs += node_nr_slabs(n); nr_objs += node_nr_objs(n); - nr_free += count_partial(n, count_free); + nr_free += count_partial_free_approx(n); } sinfo->active_objs = nr_objs - nr_free; @@ -7117,14 +7261,4 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) sinfo->objects_per_slab = oo_objects(s->oo); sinfo->cache_order = oo_order(s->oo); } - -void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) -{ -} - -ssize_t slabinfo_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - return -EIO; -} #endif /* CONFIG_SLUB_DEBUG */ diff --git a/mm/sparse.c b/mm/sparse.c index 338cf946dee8..de40b2c73406 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -226,19 +226,6 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en { unsigned long pfn; -#ifdef CONFIG_SPARSEMEM_EXTREME - if (unlikely(!mem_section)) { - unsigned long size, align; - - size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; - align = 1 << (INTERNODE_CACHE_SHIFT); - mem_section = memblock_alloc(size, align); - if (!mem_section) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, size, align); - } -#endif - start &= PAGE_SECTION_MASK; mminit_validate_memmodel_limits(&start, &end); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { @@ -267,6 +254,19 @@ static void __init memblocks_present(void) unsigned long start, end; int i, nid; +#ifdef CONFIG_SPARSEMEM_EXTREME + if (unlikely(!mem_section)) { + unsigned long size, align; 
+ + size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; + align = 1 << (INTERNODE_CACHE_SHIFT); + mem_section = memblock_alloc(size, align); + if (!mem_section) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, size, align); + } +#endif + for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) memory_present(nid, start, end); } @@ -560,6 +560,8 @@ void __init sparse_init(void) unsigned long pnum_end, pnum_begin, map_count = 1; int nid_begin; + /* see include/linux/mmzone.h 'struct mem_section' definition */ + BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section))); memblocks_present(); pnum_begin = first_present_section_nr(); @@ -908,7 +910,8 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, * Poison uninitialized struct pages in order to catch invalid flags * combinations. */ - page_init_poison(memmap, sizeof(struct page) * nr_pages); + if (!altmap || !altmap->inaccessible) + page_init_poison(memmap, sizeof(struct page) * nr_pages); ms = __nr_to_section(section_nr); set_section_nid(section_nr, nid); diff --git a/mm/swap.c b/mm/swap.c index cd8f0150ba3a..67786cb77130 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -74,22 +74,21 @@ static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), }; -/* - * This path almost never happens for VM activity - pages are normally freed - * in batches. But it gets used by networking - and for compound pages. - */ -static void __page_cache_release(struct folio *folio) +static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp, + unsigned long *flagsp) { if (folio_test_lru(folio)) { - struct lruvec *lruvec; - unsigned long flags; - - lruvec = folio_lruvec_lock_irqsave(folio, &flags); - lruvec_del_folio(lruvec, folio); + folio_lruvec_relock_irqsave(folio, lruvecp, flagsp); + lruvec_del_folio(*lruvecp, folio); __folio_clear_lru_flags(folio); - unlock_page_lruvec_irqrestore(lruvec, flags); } - /* See comment on folio_test_mlocked in release_pages() */ + + /* + * In rare cases, when truncation or holepunching raced with + * munlock after VM_LOCKED was cleared, Mlocked may still be + * found set here. This does not indicate a problem, unless + * "unevictable_pgs_cleared" appears worryingly large. + */ if (unlikely(folio_test_mlocked(folio))) { long nr_pages = folio_nr_pages(folio); @@ -99,34 +98,35 @@ static void __page_cache_release(struct folio *folio) } } -static void __folio_put_small(struct folio *folio) +/* + * This path almost never happens for VM activity - pages are normally freed + * in batches. But it gets used by networking - and for compound pages. + */ +static void page_cache_release(struct folio *folio) { - __page_cache_release(folio); - mem_cgroup_uncharge(folio); - free_unref_page(&folio->page, 0); -} + struct lruvec *lruvec = NULL; + unsigned long flags; -static void __folio_put_large(struct folio *folio) -{ - /* - * __page_cache_release() is supposed to be called for thp, not for - * hugetlb. This is because hugetlb page does never have PageLRU set - * (it's never listed to any LRU lists) and no memcg routines should - * be called for hugetlb (it has a separate hugetlb_cgroup.) 
- */ - if (!folio_test_hugetlb(folio)) - __page_cache_release(folio); - destroy_large_folio(folio); + __page_cache_release(folio, &lruvec, &flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); } void __folio_put(struct folio *folio) { - if (unlikely(folio_is_zone_device(folio))) - free_zone_device_page(&folio->page); - else if (unlikely(folio_test_large(folio))) - __folio_put_large(folio); - else - __folio_put_small(folio); + if (unlikely(folio_is_zone_device(folio))) { + free_zone_device_folio(folio); + return; + } else if (folio_test_hugetlb(folio)) { + free_huge_folio(folio); + return; + } + + page_cache_release(folio); + if (folio_test_large(folio) && folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); + mem_cgroup_uncharge(folio); + free_unref_page(&folio->page, folio_order(folio)); } EXPORT_SYMBOL(__folio_put); @@ -138,22 +138,25 @@ EXPORT_SYMBOL(__folio_put); */ void put_pages_list(struct list_head *pages) { + struct folio_batch fbatch; struct folio *folio, *next; + folio_batch_init(&fbatch); list_for_each_entry_safe(folio, next, pages, lru) { - if (!folio_put_testzero(folio)) { - list_del(&folio->lru); + if (!folio_put_testzero(folio)) continue; - } - if (folio_test_large(folio)) { - list_del(&folio->lru); - __folio_put_large(folio); + if (folio_test_hugetlb(folio)) { + free_huge_folio(folio); continue; } /* LRU flag must be clear because it's passed using the lru */ + if (folio_batch_add(&fbatch, folio) > 0) + continue; + free_unref_folios(&fbatch); } - free_unref_page_list(pages); + if (fbatch.nr) + free_unref_folios(&fbatch); INIT_LIST_HEAD(pages); } EXPORT_SYMBOL(put_pages_list); @@ -175,7 +178,7 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) * while the LRU lock is held. * * (That is not true of __page_cache_release(), and not necessarily - * true of release_pages(): but those only clear the mlocked flag after + * true of folios_put(): but those only clear the mlocked flag after * folio_put_testzero() has excluded any other users of the folio.) */ if (folio_evictable(folio)) { @@ -213,7 +216,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) if (move_fn != lru_add_fn && !folio_test_clear_lru(folio)) continue; - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); + folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); folio_set_lru(folio); @@ -221,8 +224,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); - folios_put(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_reinit(fbatch); + folios_put(fbatch); } static void folio_batch_add_and_move(struct folio_batch *fbatch, @@ -445,15 +447,18 @@ static void folio_inc_refs(struct folio *folio) } #endif /* CONFIG_LRU_GEN */ -/* - * Mark a page as having seen activity. +/** + * folio_mark_accessed - Mark a folio as having seen activity. + * @folio: The folio to mark. + * + * This function will perform one of the following transitions: * - * inactive,unreferenced -> inactive,referenced - * inactive,referenced -> active,unreferenced - * active,unreferenced -> active,referenced + * * inactive,unreferenced -> inactive,referenced + * * inactive,referenced -> active,unreferenced + * * active,unreferenced -> active,referenced * - * When a newly allocated page is not yet visible, so safe for non-atomic ops, - * __SetPageReferenced(page) may be substituted for mark_page_accessed(page). 
+ * When a newly allocated folio is not yet visible, so safe for non-atomic ops, + * __folio_set_referenced() may be substituted for folio_mark_accessed(). */ void folio_mark_accessed(struct folio *folio) { @@ -946,43 +951,31 @@ void lru_cache_disable(void) } /** - * release_pages - batched put_page() - * @arg: array of pages to release - * @nr: number of pages + * folios_put_refs - Reduce the reference count on a batch of folios. + * @folios: The folios. + * @refs: The number of refs to subtract from each folio. * - * Decrement the reference count on all the pages in @arg. If it - * fell to zero, remove the page from the LRU and free it. + * Like folio_put(), but for a batch of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which need + * to be taken if the folios are freed. The folios batch is returned + * empty and ready to be reused for another batch; there is no need + * to reinitialise it. If @refs is NULL, we subtract one from each + * folio refcount. * - * Note that the argument can be an array of pages, encoded pages, - * or folio pointers. We ignore any encoded bits, and turn any of - * them into just a folio that gets free'd. + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. */ -void release_pages(release_pages_arg arg, int nr) +void folios_put_refs(struct folio_batch *folios, unsigned int *refs) { - int i; - struct encoded_page **encoded = arg.encoded_pages; - LIST_HEAD(pages_to_free); + int i, j; struct lruvec *lruvec = NULL; unsigned long flags = 0; - unsigned int lock_batch; - - for (i = 0; i < nr; i++) { - struct folio *folio; - /* Turn any of the argument types into a folio */ - folio = page_folio(encoded_page_ptr(encoded[i])); + for (i = 0, j = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; + unsigned int nr_refs = refs ? refs[i] : 1; - /* - * Make sure the IRQ-safe lock-holding time does not get - * excessive with a continuous string of pages from the - * same lruvec. The lock is held only if lruvec != NULL. 
- */ - if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) { - unlock_page_lruvec_irqrestore(lruvec, flags); - lruvec = NULL; - } - - if (is_huge_zero_page(&folio->page)) + if (is_huge_zero_folio(folio)) continue; if (folio_is_zone_device(folio)) { @@ -990,56 +983,85 @@ void release_pages(release_pages_arg arg, int nr) unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - if (put_devmap_managed_page(&folio->page)) + if (put_devmap_managed_folio_refs(folio, nr_refs)) continue; - if (folio_put_testzero(folio)) - free_zone_device_page(&folio->page); + if (folio_ref_sub_and_test(folio, nr_refs)) + free_zone_device_folio(folio); continue; } - if (!folio_put_testzero(folio)) + if (!folio_ref_sub_and_test(folio, nr_refs)) continue; - if (folio_test_large(folio)) { + /* hugetlb has its own memcg */ + if (folio_test_hugetlb(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - __folio_put_large(folio); + free_huge_folio(folio); continue; } + if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); - if (folio_test_lru(folio)) { - struct lruvec *prev_lruvec = lruvec; + __page_cache_release(folio, &lruvec, &flags); - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, - &flags); - if (prev_lruvec != lruvec) - lock_batch = 0; + if (j != i) + folios->folios[j] = folio; + j++; + } + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); + if (!j) { + folio_batch_reinit(folios); + return; + } - lruvec_del_folio(lruvec, folio); - __folio_clear_lru_flags(folio); - } + folios->nr = j; + mem_cgroup_uncharge_folios(folios); + free_unref_folios(folios); +} +EXPORT_SYMBOL(folios_put_refs); - /* - * In rare cases, when truncation or holepunching raced with - * munlock after VM_LOCKED was cleared, Mlocked may still be - * found set here. This does not indicate a problem, unless - * "unevictable_pgs_cleared" appears worryingly large. - */ - if (unlikely(folio_test_mlocked(folio))) { - __folio_clear_mlocked(folio); - zone_stat_sub_folio(folio, NR_MLOCK); - count_vm_event(UNEVICTABLE_PGCLEARED); - } +/** + * release_pages - batched put_page() + * @arg: array of pages to release + * @nr: number of pages + * + * Decrement the reference count on all the pages in @arg. If it + * fell to zero, remove the page from the LRU and free it. + * + * Note that the argument can be an array of pages, encoded pages, + * or folio pointers. We ignore any encoded bits, and turn any of + * them into just a folio that gets free'd. + */ +void release_pages(release_pages_arg arg, int nr) +{ + struct folio_batch fbatch; + int refs[PAGEVEC_SIZE]; + struct encoded_page **encoded = arg.encoded_pages; + int i; - list_add(&folio->lru, &pages_to_free); + folio_batch_init(&fbatch); + for (i = 0; i < nr; i++) { + /* Turn any of the argument types into a folio */ + struct folio *folio = page_folio(encoded_page_ptr(encoded[i])); + + /* Is our next entry actually "nr_pages" -> "nr_refs" ? 
*/ + refs[fbatch.nr] = 1; + if (unlikely(encoded_page_flags(encoded[i]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + refs[fbatch.nr] = encoded_nr_pages(encoded[++i]); + + if (folio_batch_add(&fbatch, folio) > 0) + continue; + folios_put_refs(&fbatch, refs); } - if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); - mem_cgroup_uncharge_list(&pages_to_free); - free_unref_page_list(&pages_to_free); + if (fbatch.nr) + folios_put_refs(&fbatch, refs); } EXPORT_SYMBOL(release_pages); @@ -1059,8 +1081,7 @@ void __folio_batch_release(struct folio_batch *fbatch) lru_add_drain(); fbatch->percpu_pvec_drained = true; } - release_pages(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_reinit(fbatch); + folios_put(fbatch); } EXPORT_SYMBOL(__folio_batch_release); diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 0bec1f705f8e..13ab3b771409 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -264,7 +264,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) cache->cur = 0; if (swap_slot_cache_active) cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, - cache->slots, 1); + cache->slots, 0); return cache->nr; } @@ -273,6 +273,9 @@ void free_swap_slot(swp_entry_t entry) { struct swap_slots_cache *cache; + /* Large folio swap slot is not covered. */ + zswap_invalidate(entry); + cache = raw_cpu_ptr(&swp_slots); if (likely(use_swap_slot_cache && cache->slots_ret)) { spin_lock_irq(&cache->free_lock); @@ -307,8 +310,8 @@ swp_entry_t folio_alloc_swap(struct folio *folio) entry.val = 0; if (folio_test_large(folio)) { - if (IS_ENABLED(CONFIG_THP_SWAP) && arch_thp_swp_supported()) - get_swap_pages(1, &entry, folio_nr_pages(folio)); + if (IS_ENABLED(CONFIG_THP_SWAP)) + get_swap_pages(1, &entry, folio_order(folio)); goto out; } @@ -340,7 +343,7 @@ repeat: goto out; } - get_swap_pages(1, &entry, 1); + get_swap_pages(1, &entry, 0); out: if (mem_cgroup_try_charge_swap(folio, entry)) { put_swap_folio(folio, entry); diff --git a/mm/swap_state.c b/mm/swap_state.c index 7255c01a1e4e..642c30d8376c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -15,6 +15,7 @@ #include <linux/swapops.h> #include <linux/init.h> #include <linux/pagemap.h> +#include <linux/pagevec.h> #include <linux/backing-dev.h> #include <linux/blkdev.h> #include <linux/migrate.h> @@ -72,11 +73,11 @@ void *get_shadow_from_swap_cache(swp_entry_t entry) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); - struct page *page; + void *shadow; - page = xa_load(&address_space->i_pages, idx); - if (xa_is_value(page)) - return page; + shadow = xa_load(&address_space->i_pages, idx); + if (xa_is_value(shadow)) + return shadow; return NULL; } @@ -282,10 +283,8 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, * folio_free_swap() _with_ the lock. 
* - Marcelo */ -void free_swap_cache(struct page *page) +void free_swap_cache(struct folio *folio) { - struct folio *folio = page_folio(page); - if (folio_test_swapcache(folio) && !folio_mapped(folio) && folio_trylock(folio)) { folio_free_swap(folio); @@ -299,9 +298,11 @@ void free_swap_cache(struct page *page) */ void free_page_and_swap_cache(struct page *page) { - free_swap_cache(page); - if (!is_huge_zero_page(page)) - put_page(page); + struct folio *folio = page_folio(page); + + free_swap_cache(folio); + if (!is_huge_zero_folio(folio)) + folio_put(folio); } /* @@ -310,10 +311,25 @@ void free_page_and_swap_cache(struct page *page) */ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { + struct folio_batch folios; + unsigned int refs[PAGEVEC_SIZE]; + lru_add_drain(); - for (int i = 0; i < nr; i++) - free_swap_cache(encoded_page_ptr(pages[i])); - release_pages(pages, nr); + folio_batch_init(&folios); + for (int i = 0; i < nr; i++) { + struct folio *folio = page_folio(encoded_page_ptr(pages[i])); + + free_swap_cache(folio); + refs[folios.nr] = 1; + if (unlikely(encoded_page_flags(pages[i]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + refs[folios.nr] = encoded_nr_pages(pages[++i]); + + if (folio_batch_add(&folios, folio) == 0) + folios_put_refs(&folios, refs); + } + if (folios.nr) + folios_put_refs(&folios, refs); } static inline bool swap_use_vma_readahead(void) diff --git a/mm/swapfile.c b/mm/swapfile.c index 746aa9da5302..b3e5e384e330 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -130,7 +130,11 @@ static inline unsigned char swap_count(unsigned char ent) /* Reclaim the swap entry if swap is getting full*/ #define TTRS_FULL 0x4 -/* returns 1 if swap entry is freed */ +/* + * returns number of pages in the folio that backs the swap entry. If positive, + * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no + * folio was associated with the swap entry. + */ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { @@ -155,6 +159,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, ret = folio_free_swap(folio); folio_unlock(folio); } + ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio); folio_put(folio); return ret; } @@ -273,15 +278,15 @@ static void discard_swap_cluster(struct swap_info_struct *si, #ifdef CONFIG_THP_SWAP #define SWAPFILE_CLUSTER HPAGE_PMD_NR -#define swap_entry_size(size) (size) +#define swap_entry_order(order) (order) #else #define SWAPFILE_CLUSTER 256 /* - * Define swap_entry_size() as constant to let compiler to optimize + * Define swap_entry_order() as constant to let compiler to optimize * out some code if !CONFIG_THP_SWAP */ -#define swap_entry_size(size) 1 +#define swap_entry_order(order) 0 #endif #define LATENCY_LIMIT 256 @@ -343,18 +348,6 @@ static inline void cluster_set_null(struct swap_cluster_info *info) info->data = 0; } -static inline bool cluster_is_huge(struct swap_cluster_info *info) -{ - if (IS_ENABLED(CONFIG_THP_SWAP)) - return info->flags & CLUSTER_FLAG_HUGE; - return false; -} - -static inline void cluster_clear_huge(struct swap_cluster_info *info) -{ - info->flags &= ~CLUSTER_FLAG_HUGE; -} - static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, unsigned long offset) { @@ -558,10 +551,12 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx) /* * The cluster corresponding to page_nr will be used. The cluster will be - * removed from free cluster list and its usage counter will be increased. 
+ * removed from free cluster list and its usage counter will be increased by + * count. */ -static void inc_cluster_info_page(struct swap_info_struct *p, - struct swap_cluster_info *cluster_info, unsigned long page_nr) +static void add_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr, + unsigned long count) { unsigned long idx = page_nr / SWAPFILE_CLUSTER; @@ -570,9 +565,19 @@ static void inc_cluster_info_page(struct swap_info_struct *p, if (cluster_is_free(&cluster_info[idx])) alloc_cluster(p, idx); - VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); + VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER); cluster_set_count(&cluster_info[idx], - cluster_count(&cluster_info[idx]) + 1); + cluster_count(&cluster_info[idx]) + count); +} + +/* + * The cluster corresponding to page_nr will be used. The cluster will be + * removed from free cluster list and its usage counter will be increased by 1. + */ +static void inc_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr) +{ + add_cluster_info_page(p, cluster_info, page_nr, 1); } /* @@ -602,7 +607,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p, */ static bool scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, - unsigned long offset) + unsigned long offset, int order) { struct percpu_cluster *percpu_cluster; bool conflict; @@ -616,27 +621,42 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, return false; percpu_cluster = this_cpu_ptr(si->percpu_cluster); - cluster_set_null(&percpu_cluster->index); + percpu_cluster->next[order] = SWAP_NEXT_INVALID; + return true; +} + +static inline bool swap_range_empty(char *swap_map, unsigned int start, + unsigned int nr_pages) +{ + unsigned int i; + + for (i = 0; i < nr_pages; i++) { + if (swap_map[start + i]) + return false; + } + return true; } /* - * Try to get a swap entry from current cpu's swap entry pool (a cluster). This - * might involve allocating a new cluster for current CPU too. + * Try to get swap entries with specified order from current cpu's swap entry + * pool (a cluster). This might involve allocating a new cluster for current CPU + * too. */ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, - unsigned long *offset, unsigned long *scan_base) + unsigned long *offset, unsigned long *scan_base, int order) { + unsigned int nr_pages = 1 << order; struct percpu_cluster *cluster; struct swap_cluster_info *ci; - unsigned long tmp, max; + unsigned int tmp, max; new_cluster: cluster = this_cpu_ptr(si->percpu_cluster); - if (cluster_is_null(&cluster->index)) { + tmp = cluster->next[order]; + if (tmp == SWAP_NEXT_INVALID) { if (!cluster_list_empty(&si->free_clusters)) { - cluster->index = si->free_clusters.head; - cluster->next = cluster_next(&cluster->index) * + tmp = cluster_next(&si->free_clusters.head) * SWAPFILE_CLUSTER; } else if (!cluster_list_empty(&si->discard_clusters)) { /* @@ -654,27 +674,27 @@ new_cluster: /* * Other CPUs can use our cluster if they can't find a free cluster, - * check if there is still free entry in the cluster + * check if there is still free entry in the cluster, maintaining + * natural alignment. 
*/ - tmp = cluster->next; - max = min_t(unsigned long, si->max, - (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); + max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER)); if (tmp < max) { ci = lock_cluster(si, tmp); while (tmp < max) { - if (!si->swap_map[tmp]) + if (swap_range_empty(si->swap_map, tmp, nr_pages)) break; - tmp++; + tmp += nr_pages; } unlock_cluster(ci); } if (tmp >= max) { - cluster_set_null(&cluster->index); + cluster->next[order] = SWAP_NEXT_INVALID; goto new_cluster; } - cluster->next = tmp + 1; *offset = tmp; *scan_base = tmp; + tmp += nr_pages; + cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID; return true; } @@ -737,8 +757,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, if (was_full && (si->flags & SWP_WRITEOK)) add_to_avail_list(si); } - atomic_long_add(nr_entries, &nr_swap_pages); - WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); if (si->flags & SWP_BLKDEV) swap_slot_free_notify = si->bdev->bd_disk->fops->swap_slot_free_notify; @@ -746,12 +764,19 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, swap_slot_free_notify = NULL; while (offset <= end) { arch_swap_invalidate_page(si->type, offset); - zswap_invalidate(si->type, offset); if (swap_slot_free_notify) swap_slot_free_notify(si->bdev, offset); offset++; } clear_shadow_from_swap_cache(si->type, begin, end); + + /* + * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 + * only after the above cleanups are done. + */ + smp_wmb(); + atomic_long_add(nr_entries, &nr_swap_pages); + WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); } static void set_cluster_next(struct swap_info_struct *si, unsigned long next) @@ -799,13 +824,14 @@ static bool swap_offset_available_and_locked(struct swap_info_struct *si, static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, - swp_entry_t slots[]) + swp_entry_t slots[], int order) { struct swap_cluster_info *ci; unsigned long offset; unsigned long scan_base; unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; + unsigned int nr_pages = 1 << order; int n_ret = 0; bool scanned_many = false; @@ -820,6 +846,25 @@ static int scan_swap_map_slots(struct swap_info_struct *si, * And we let swap pages go all over an SSD partition. Hugh */ + if (order > 0) { + /* + * Should not even be attempting large allocations when huge + * page swap is disabled. Warn and fail the allocation. + */ + if (!IS_ENABLED(CONFIG_THP_SWAP) || + nr_pages > SWAPFILE_CLUSTER) { + VM_WARN_ON_ONCE(1); + return 0; + } + + /* + * Swapfile is not block device or not using clusters so unable + * to allocate large entries. 
+ */ + if (!(si->flags & SWP_BLKDEV) || !si->cluster_info) + return 0; + } + si->flags += SWP_SCANNING; /* * Use percpu scan base for SSD to reduce lock contention on @@ -834,8 +879,11 @@ static int scan_swap_map_slots(struct swap_info_struct *si, /* SSD algorithm */ if (si->cluster_info) { - if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) + if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) { + if (order > 0) + goto no_page; goto scan; + } } else if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; @@ -854,7 +902,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, last_in_cluster = offset + SWAPFILE_CLUSTER - 1; /* Locate the first empty (unaligned) cluster */ - for (; last_in_cluster <= si->highest_bit; offset++) { + for (; last_in_cluster <= READ_ONCE(si->highest_bit); offset++) { if (si->swap_map[offset]) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { @@ -877,13 +925,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si, checks: if (si->cluster_info) { - while (scan_swap_map_ssd_cluster_conflict(si, offset)) { + while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) { /* take a break if we already got some slots */ if (n_ret) goto done; if (!scan_swap_map_try_ssd_cluster(si, &offset, - &scan_base)) + &scan_base, order)) { + if (order > 0) + goto no_page; goto scan; + } } } if (!(si->flags & SWP_WRITEOK)) @@ -902,7 +953,7 @@ checks: swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ - if (swap_was_freed) + if (swap_was_freed > 0) goto checks; goto scan; /* check next one */ } @@ -914,11 +965,11 @@ checks: else goto done; } - WRITE_ONCE(si->swap_map[offset], usage); - inc_cluster_info_page(si, si->cluster_info, offset); + memset(si->swap_map + offset, usage, nr_pages); + add_cluster_info_page(si, si->cluster_info, offset, nr_pages); unlock_cluster(ci); - swap_range_alloc(si, offset, 1); + swap_range_alloc(si, offset, nr_pages); slots[n_ret++] = swp_entry(si->type, offset); /* got enough slots or reach max slots? */ @@ -939,8 +990,10 @@ checks: /* try to get more slots in cluster */ if (si->cluster_info) { - if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) + if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) goto checks; + if (order > 0) + goto done; } else if (si->cluster_nr && !si->swap_map[++offset]) { /* non-ssd case, still more slots in cluster? */ --si->cluster_nr; @@ -967,11 +1020,13 @@ checks: } done: - set_cluster_next(si, offset + 1); + if (order == 0) + set_cluster_next(si, offset + 1); si->flags -= SWP_SCANNING; return n_ret; scan: + VM_WARN_ON(order > 0); spin_unlock(&si->lock); while (++offset <= READ_ONCE(si->highest_bit)) { if (unlikely(--latency_ration < 0)) { @@ -1000,38 +1055,6 @@ no_page: return n_ret; } -static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) -{ - unsigned long idx; - struct swap_cluster_info *ci; - unsigned long offset; - - /* - * Should not even be attempting cluster allocations when huge - * page swap is disabled. Warn and fail the allocation. 
- */ - if (!IS_ENABLED(CONFIG_THP_SWAP)) { - VM_WARN_ON_ONCE(1); - return 0; - } - - if (cluster_list_empty(&si->free_clusters)) - return 0; - - idx = cluster_list_first(&si->free_clusters); - offset = idx * SWAPFILE_CLUSTER; - ci = lock_cluster(si, offset); - alloc_cluster(si, idx); - cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE); - - memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER); - unlock_cluster(ci); - swap_range_alloc(si, offset, SWAPFILE_CLUSTER); - *slot = swp_entry(si->type, offset); - - return 1; -} - static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) { unsigned long offset = idx * SWAPFILE_CLUSTER; @@ -1045,17 +1068,15 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) swap_range_free(si, offset, SWAPFILE_CLUSTER); } -int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) +int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) { - unsigned long size = swap_entry_size(entry_size); + int order = swap_entry_order(entry_order); + unsigned long size = 1 << order; struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; int node; - /* Only single cluster request supported */ - WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER); - spin_lock(&swap_avail_lock); avail_pgs = atomic_long_read(&nr_swap_pages) / size; @@ -1091,14 +1112,10 @@ start_over: spin_unlock(&si->lock); goto nextsi; } - if (size == SWAPFILE_CLUSTER) { - if (si->flags & SWP_BLKDEV) - n_ret = swap_alloc_cluster(si, swp_entries); - } else - n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, - n_goal, swp_entries); + n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, + n_goal, swp_entries, order); spin_unlock(&si->lock); - if (n_ret || size == SWAPFILE_CLUSTER) + if (n_ret || size > 1) goto check_out; cond_resched(); @@ -1221,11 +1238,15 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, /* * When we get a swap entry, if there aren't some other ways to - * prevent swapoff, such as the folio in swap cache is locked, page - * table lock is held, etc., the swap entry may become invalid because - * of swapoff. Then, we need to enclose all swap related functions - * with get_swap_device() and put_swap_device(), unless the swap - * functions call get/put_swap_device() by themselves. + * prevent swapoff, such as the folio in swap cache is locked, RCU + * reader side is locked, etc., the swap entry may become invalid + * because of swapoff. Then, we need to enclose all swap related + * functions with get_swap_device() and put_swap_device(), unless the + * swap functions call get/put_swap_device() by themselves. + * + * RCU reader side lock (including any spinlock) is sufficient to + * prevent swapoff, because synchronize_rcu() is called in swapoff() + * before freeing data structures. * * Check whether swap entry is valid in the swap device. 
If so, * return pointer to swap_info_struct, and keep the swap entry valid @@ -1347,7 +1368,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) unsigned char *map; unsigned int i, free_entries = 0; unsigned char val; - int size = swap_entry_size(folio_nr_pages(folio)); + int size = 1 << swap_entry_order(folio_order(folio)); si = _swap_info_get(entry); if (!si) @@ -1355,7 +1376,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) ci = lock_cluster_or_swap_info(si, offset); if (size == SWAPFILE_CLUSTER) { - VM_BUG_ON(!cluster_is_huge(ci)); map = si->swap_map + offset; for (i = 0; i < SWAPFILE_CLUSTER; i++) { val = map[i]; @@ -1363,7 +1383,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) if (val == SWAP_HAS_CACHE) free_entries++; } - cluster_clear_huge(ci); if (free_entries == SWAPFILE_CLUSTER) { unlock_cluster_or_swap_info(si, ci); spin_lock(&si->lock); @@ -1385,23 +1404,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) unlock_cluster_or_swap_info(si, ci); } -#ifdef CONFIG_THP_SWAP -int split_swap_cluster(swp_entry_t entry) -{ - struct swap_info_struct *si; - struct swap_cluster_info *ci; - unsigned long offset = swp_offset(entry); - - si = _swap_info_get(entry); - if (!si) - return -EBUSY; - ci = lock_cluster(si, offset); - cluster_clear_huge(ci); - unlock_cluster(ci); - return 0; -} -#endif - static int swp_entry_cmp(const void *ent1, const void *ent2) { const swp_entry_t *e1 = ent1, *e2 = ent2; @@ -1509,22 +1511,23 @@ out: } static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, - swp_entry_t entry) + swp_entry_t entry, int order) { struct swap_cluster_info *ci; unsigned char *map = si->swap_map; + unsigned int nr_pages = 1 << order; unsigned long roffset = swp_offset(entry); - unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER); + unsigned long offset = round_down(roffset, nr_pages); int i; bool ret = false; ci = lock_cluster_or_swap_info(si, offset); - if (!ci || !cluster_is_huge(ci)) { + if (!ci || nr_pages == 1) { if (swap_count(map[roffset])) ret = true; goto unlock_out; } - for (i = 0; i < SWAPFILE_CLUSTER; i++) { + for (i = 0; i < nr_pages; i++) { if (swap_count(map[offset + i])) { ret = true; break; @@ -1546,7 +1549,7 @@ static bool folio_swapped(struct folio *folio) if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) return swap_swapcount(si, entry) != 0; - return swap_page_trans_huge_swapped(si, entry); + return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); } /** @@ -1592,27 +1595,88 @@ bool folio_free_swap(struct folio *folio) return true; } -/* - * Free the swap entry like above, but also try to - * free the page cache entry if it is the last user. +/** + * free_swap_and_cache_nr() - Release reference on range of swap entries and + * reclaim their cache if no more references remain. + * @entry: First entry of range. + * @nr: Number of entries in range. + * + * For each swap entry in the contiguous range, release a reference. If any swap + * entries become free, try to reclaim their underlying folios, if present. The + * offset range is defined by [entry.offset, entry.offset + nr). 
*/ -int free_swap_and_cache(swp_entry_t entry) +void free_swap_and_cache_nr(swp_entry_t entry, int nr) { - struct swap_info_struct *p; + const unsigned long start_offset = swp_offset(entry); + const unsigned long end_offset = start_offset + nr; + unsigned int type = swp_type(entry); + struct swap_info_struct *si; + bool any_only_cache = false; + unsigned long offset; unsigned char count; if (non_swap_entry(entry)) - return 1; + return; - p = _swap_info_get(entry); - if (p) { - count = __swap_entry_free(p, entry); - if (count == SWAP_HAS_CACHE && - !swap_page_trans_huge_swapped(p, entry)) - __try_to_reclaim_swap(p, swp_offset(entry), + si = get_swap_device(entry); + if (!si) + return; + + if (WARN_ON(end_offset > si->max)) + goto out; + + /* + * First free all entries in the range. + */ + for (offset = start_offset; offset < end_offset; offset++) { + if (data_race(si->swap_map[offset])) { + count = __swap_entry_free(si, swp_entry(type, offset)); + if (count == SWAP_HAS_CACHE) + any_only_cache = true; + } else { + WARN_ON_ONCE(1); + } + } + + /* + * Short-circuit the below loop if none of the entries had their + * reference drop to zero. + */ + if (!any_only_cache) + goto out; + + /* + * Now go back over the range trying to reclaim the swap cache. This is + * more efficient for large folios because we will only try to reclaim + * the swap once per folio in the common case. If we do + * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the + * latter will get a reference and lock the folio for every individual + * page but will only succeed once the swap slot for every subpage is + * zero. + */ + for (offset = start_offset; offset < end_offset; offset += nr) { + nr = 1; + if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { + /* + * Folios are always naturally aligned in swap so + * advance forward to the next boundary. Zero means no + * folio was found for the swap entry, so advance by 1 + * in this case. Negative value means folio was found + * but could not be reclaimed. Here we can still advance + * to the next boundary. + */ + nr = __try_to_reclaim_swap(si, offset, TTRS_UNMAPPED | TTRS_FULL); + if (nr == 0) + nr = 1; + else if (nr < 0) + nr = -nr; + nr = ALIGN(offset + 1, nr) - offset; + } } - return p != NULL; + +out: + put_swap_device(si); } #ifdef CONFIG_HIBERNATION @@ -1627,7 +1691,7 @@ swp_entry_t get_swap_page_of_type(int type) /* This is called for allocating swap entry, not cache */ spin_lock(&si->lock); - if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry)) + if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0)) atomic_long_dec(&nr_swap_pages); spin_unlock(&si->lock); fail: @@ -1790,7 +1854,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, * when reading from swap. This metadata may be indexed by swap entry * so this must be called before swap_free(). */ - arch_swap_restore(entry, folio); + arch_swap_restore(folio_swap(entry, folio), folio); dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); @@ -2049,7 +2113,7 @@ static int try_to_unuse(unsigned int type) unsigned int i; if (!READ_ONCE(si->inuse_pages)) - return 0; + goto success; retry: retval = shmem_unuse(type); @@ -2130,6 +2194,12 @@ retry: return -EINTR; } +success: + /* + * Make sure that further cleanups after try_to_unuse() returns happen + * after swap_range_free() reduces si->inuse_pages to 0. 
+ */ + smp_mb(); return 0; } @@ -2348,8 +2418,6 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { - zswap_swapon(p->type); - spin_lock(&swap_lock); spin_lock(&p->lock); setup_swap_info(p, prio, swap_map, cluster_info); @@ -2376,13 +2444,17 @@ static void reinsert_swap_info(struct swap_info_struct *p) spin_unlock(&swap_lock); } +static bool __has_usable_swap(void) +{ + return !plist_head_empty(&swap_active_head); +} + bool has_usable_swap(void) { - bool ret = true; + bool ret; spin_lock(&swap_lock); - if (plist_head_empty(&swap_active_head)) - ret = false; + ret = __has_usable_swap(); spin_unlock(&swap_lock); return ret; } @@ -2397,7 +2469,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct inode *inode; struct filename *pathname; int err, found = 0; - unsigned int old_block_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2475,10 +2546,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) /* * Wait for swap operations protected by get/put_swap_device() - * to complete. - * - * We need synchronize_rcu() here to protect the accessing to - * the swap cache data structure. + * to complete. Because of synchronize_rcu() here, all swap + * operations protected by RCU reader side lock (including any + * spinlock) will be waited too. This makes it easy to + * prevent folio_test_swapcache() and the following swap cache + * operations from racing with swapoff. */ percpu_ref_kill(&p->users); synchronize_rcu(); @@ -2509,7 +2581,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) } swap_file = p->swap_file; - old_block_size = p->old_block_size; p->swap_file = NULL; p->max = 0; swap_map = p->swap_map; @@ -2532,11 +2603,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) exit_swap_address_space(p->type); inode = mapping->host; - if (p->bdev_handle) { - set_blocksize(p->bdev, old_block_size); - bdev_release(p->bdev_handle); - p->bdev_handle = NULL; - } inode_lock(inode); inode->i_flags &= ~S_SWAPFILE; @@ -2762,21 +2828,8 @@ static struct swap_info_struct *alloc_swap_info(void) static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) { - int error; - if (S_ISBLK(inode->i_mode)) { - p->bdev_handle = bdev_open_by_dev(inode->i_rdev, - BLK_OPEN_READ | BLK_OPEN_WRITE, p, NULL); - if (IS_ERR(p->bdev_handle)) { - error = PTR_ERR(p->bdev_handle); - p->bdev_handle = NULL; - return error; - } - p->bdev = p->bdev_handle->bdev; - p->old_block_size = block_size(p->bdev); - error = set_blocksize(p->bdev, PAGE_SIZE); - if (error < 0) - return error; + p->bdev = I_BDEV(inode); /* * Zoned block devices contain zones that have a sequential * write only restriction. 
Hence zoned block devices are not @@ -3017,7 +3070,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) name = NULL; goto bad_swap; } - swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0); + swap_file = file_open_name(name, O_RDWR | O_LARGEFILE | O_EXCL, 0); if (IS_ERR(swap_file)) { error = PTR_ERR(swap_file); swap_file = NULL; @@ -3077,7 +3130,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->flags |= SWP_SYNCHRONOUS_IO; if (p->bdev && bdev_nonrot(p->bdev)) { - int cpu; + int cpu, i; unsigned long ci, nr_cluster; p->flags |= SWP_SOLIDSTATE; @@ -3113,8 +3166,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } for_each_possible_cpu(cpu) { struct percpu_cluster *cluster; + cluster = per_cpu_ptr(p->percpu_cluster, cpu); - cluster_set_null(&cluster->index); + for (i = 0; i < SWAP_NR_ORDERS; i++) + cluster->next[i] = SWAP_NEXT_INVALID; } } else { atomic_inc(&nr_rotate_swap); @@ -3167,6 +3222,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (error) goto bad_swap_unlock_inode; + error = zswap_swapon(p->type, maxpages); + if (error) + goto free_swap_address_space; + /* * Flush any pending IO and dirty mappings before we start using this * swap device. @@ -3175,7 +3234,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = inode_drain_writes(inode); if (error) { inode->i_flags &= ~S_SWAPFILE; - goto free_swap_address_space; + goto free_swap_zswap; } mutex_lock(&swapon_mutex); @@ -3199,6 +3258,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = 0; goto out; +free_swap_zswap: + zswap_swapoff(p->type); free_swap_address_space: exit_swap_address_space(p->type); bad_swap_unlock_inode: @@ -3208,11 +3269,6 @@ bad_swap: p->percpu_cluster = NULL; free_percpu(p->cluster_next_cpu); p->cluster_next_cpu = NULL; - if (p->bdev_handle) { - set_blocksize(p->bdev, p->old_block_size); - bdev_release(p->bdev_handle); - p->bdev_handle = NULL; - } inode = NULL; destroy_swap_extents(p); swap_cgroup_swapoff(p->type); @@ -3320,7 +3376,8 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) } else err = -ENOENT; /* unused swap entry */ - WRITE_ONCE(p->swap_map[offset], count | has_cache); + if (!err) + WRITE_ONCE(p->swap_map[offset], count | has_cache); unlock_out: unlock_cluster_or_swap_info(p, ci); @@ -3632,6 +3689,9 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) if (!(gfp & __GFP_IO)) return; + if (!__has_usable_swap()) + return; + if (!blk_cgroup_congested()) return; diff --git a/mm/truncate.c b/mm/truncate.c index 725b150e47ac..e99085bf3d34 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -764,15 +764,15 @@ EXPORT_SYMBOL(truncate_setsize); * @from: original inode size * @to: new inode size * - * Handle extension of inode size either caused by extending truncate or by - * write starting after current i_size. We mark the page straddling current - * i_size RO so that page_mkwrite() is called on the nearest write access to - * the page. This way filesystem can be sure that page_mkwrite() is called on - * the page before user writes to the page via mmap after the i_size has been - * changed. + * Handle extension of inode size either caused by extending truncate or + * by write starting after current i_size. We mark the page straddling + * current i_size RO so that page_mkwrite() is called on the first + * write access to the page. 
The filesystem will update its per-block + * information before user writes to the page via mmap after the i_size + * has been changed. * * The function must be called after i_size is updated so that page fault - * coming after we unlock the page will already see the new i_size. + * coming after we unlock the folio will already see the new i_size. * The function must be called while we still hold i_rwsem - this not only * makes sure i_size is stable but also that userspace cannot observe new * i_size value before we are prepared to store mmap writes at new inode size. @@ -781,31 +781,29 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) { int bsize = i_blocksize(inode); loff_t rounded_from; - struct page *page; - pgoff_t index; + struct folio *folio; WARN_ON(to > inode->i_size); - if (from >= to || bsize == PAGE_SIZE) + if (from >= to || bsize >= PAGE_SIZE) return; /* Page straddling @from will not have any hole block created? */ rounded_from = round_up(from, bsize); if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1))) return; - index = from >> PAGE_SHIFT; - page = find_lock_page(inode->i_mapping, index); - /* Page not cached? Nothing to do */ - if (!page) + folio = filemap_lock_folio(inode->i_mapping, from / PAGE_SIZE); + /* Folio not cached? Nothing to do */ + if (IS_ERR(folio)) return; /* - * See clear_page_dirty_for_io() for details why set_page_dirty() + * See folio_clear_dirty_for_io() for details why folio_mark_dirty() * is needed. */ - if (page_mkclean(page)) - set_page_dirty(page); - unlock_page(page); - put_page(page); + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); } EXPORT_SYMBOL(pagecache_isize_extended); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 313f1c42768a..defa5109cc62 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -20,19 +20,11 @@ #include "internal.h" static __always_inline -struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, - unsigned long dst_start, - unsigned long len) +bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) { - /* - * Make sure that the dst range is both valid and fully within a - * single existing vma. - */ - struct vm_area_struct *dst_vma; - - dst_vma = find_vma(dst_mm, dst_start); - if (!range_in_vma(dst_vma, dst_start, dst_start + len)) - return NULL; + /* Make sure that the dst range is fully within dst_vma. */ + if (dst_end > dst_vma->vm_end) + return false; /* * Check the vma is registered in uffd, this is required to @@ -40,11 +32,120 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, * time. */ if (!dst_vma->vm_userfaultfd_ctx.ctx) - return NULL; + return false; + + return true; +} + +static __always_inline +struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm, + unsigned long addr) +{ + struct vm_area_struct *vma; + + mmap_assert_locked(mm); + vma = vma_lookup(mm, addr); + if (!vma) + vma = ERR_PTR(-ENOENT); + else if (!(vma->vm_flags & VM_SHARED) && + unlikely(anon_vma_prepare(vma))) + vma = ERR_PTR(-ENOMEM); + + return vma; +} + +#ifdef CONFIG_PER_VMA_LOCK +/* + * uffd_lock_vma() - Lookup and lock vma corresponding to @address. + * @mm: mm to search vma in. + * @address: address that the vma should contain. + * + * Should be called without holding mmap_lock. + * + * Return: A locked vma containing @address, -ENOENT if no vma is found, or + * -ENOMEM if anon_vma couldn't be allocated. 
+ */ +static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm, + unsigned long address) +{ + struct vm_area_struct *vma; + + vma = lock_vma_under_rcu(mm, address); + if (vma) { + /* + * We know we're going to need to use anon_vma, so check + * that early. + */ + if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma)) + vma_end_read(vma); + else + return vma; + } + + mmap_read_lock(mm); + vma = find_vma_and_prepare_anon(mm, address); + if (!IS_ERR(vma)) { + /* + * We cannot use vma_start_read() as it may fail due to + * false locked (see comment in vma_start_read()). We + * can avoid that by directly locking vm_lock under + * mmap_lock, which guarantees that nobody can lock the + * vma for write (vma_start_write()) under us. + */ + down_read(&vma->vm_lock->lock); + } + + mmap_read_unlock(mm); + return vma; +} + +static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long len) +{ + struct vm_area_struct *dst_vma; + + dst_vma = uffd_lock_vma(dst_mm, dst_start); + if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len)) + return dst_vma; + + vma_end_read(dst_vma); + return ERR_PTR(-ENOENT); +} + +static void uffd_mfill_unlock(struct vm_area_struct *vma) +{ + vma_end_read(vma); +} + +#else + +static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long len) +{ + struct vm_area_struct *dst_vma; + mmap_read_lock(dst_mm); + dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start); + if (IS_ERR(dst_vma)) + goto out_unlock; + + if (validate_dst_vma(dst_vma, dst_start + len)) + return dst_vma; + + dst_vma = ERR_PTR(-ENOENT); +out_unlock: + mmap_read_unlock(dst_mm); return dst_vma; } +static void uffd_mfill_unlock(struct vm_area_struct *vma) +{ + mmap_read_unlock(vma->vm_mm); +} +#endif + /* Check if dst_addr is outside of file's size. Must be called with ptl held. */ static bool mfill_file_over_size(struct vm_area_struct *dst_vma, unsigned long dst_addr) @@ -77,9 +178,9 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, pte_t _dst_pte, *dst_pte; bool writable = dst_vma->vm_flags & VM_WRITE; bool vm_shared = dst_vma->vm_flags & VM_SHARED; - bool page_in_cache = page_mapping(page); spinlock_t *ptl; - struct folio *folio; + struct folio *folio = page_folio(page); + bool page_in_cache = folio_mapping(folio); _dst_pte = mk_pte(page, dst_vma->vm_page_prot); _dst_pte = pte_mkdirty(_dst_pte); @@ -109,7 +210,6 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, if (!pte_none_mostly(ptep_get(dst_pte))) goto out_unlock; - folio = page_folio(page); if (page_in_cache) { /* Usually, cache pages are already added to LRU */ if (newly_allocated) @@ -124,7 +224,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, * Must happen after rmap, as mm_counter() checks mapping (via * PageAnon()), which is set by __page_set_anon_rmap(). 
*/ - inc_mm_counter(dst_mm, mm_counter(page)); + inc_mm_counter(dst_mm, mm_counter(folio)); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); @@ -213,6 +313,38 @@ out_release: goto out; } +static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr) +{ + struct folio *folio; + int ret = -ENOMEM; + + folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr); + if (!folio) + return ret; + + if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) + goto out_put; + + /* + * The memory barrier inside __folio_mark_uptodate makes sure that + * zeroing out the folio become visible before mapping the page + * using set_pte_at(). See do_anonymous_page(). + */ + __folio_mark_uptodate(folio); + + ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, + &folio->page, true, 0); + if (ret) + goto out_put; + + return 0; +out_put: + folio_put(folio); + return ret; +} + static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr) @@ -221,6 +353,9 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, spinlock_t *ptl; int ret; + if (mm_forbids_zeropage(dst_vma->vm_mm)) + return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr); + _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), dst_vma->vm_page_prot)); ret = -EAGAIN; @@ -350,18 +485,18 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) #ifdef CONFIG_HUGETLB_PAGE /* * mfill_atomic processing for HUGETLB vmas. Note that this routine is - * called with mmap_lock held, it will release mmap_lock before returning. + * called with either vma-lock or mmap_lock held, it will release the lock + * before returning. */ static __always_inline ssize_t mfill_atomic_hugetlb( + struct userfaultfd_ctx *ctx, struct vm_area_struct *dst_vma, unsigned long dst_start, unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, uffd_flags_t flags) { struct mm_struct *dst_mm = dst_vma->vm_mm; - int vm_shared = dst_vma->vm_flags & VM_SHARED; ssize_t err; pte_t *dst_pte; unsigned long src_addr, dst_addr; @@ -379,7 +514,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb( * feature is not supported. */ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { - mmap_read_unlock(dst_mm); + up_read(&ctx->map_changing_lock); + uffd_mfill_unlock(dst_vma); return -EINVAL; } @@ -402,24 +538,28 @@ retry: * retry, dst_vma will be set to NULL and we must lookup again. */ if (!dst_vma) { + dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); + if (IS_ERR(dst_vma)) { + err = PTR_ERR(dst_vma); + goto out; + } + err = -ENOENT; - dst_vma = find_dst_vma(dst_mm, dst_start, len); - if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) - goto out_unlock; + if (!is_vm_hugetlb_page(dst_vma)) + goto out_unlock_vma; err = -EINVAL; if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) - goto out_unlock; - - vm_shared = dst_vma->vm_flags & VM_SHARED; - } + goto out_unlock_vma; - /* - * If not shared, ensure the dst_vma has a anon_vma. - */ - err = -ENOMEM; - if (!vm_shared) { - if (unlikely(anon_vma_prepare(dst_vma))) + /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. 
mremap) running in parallel, bail out and + * request the user to retry later + */ + down_read(&ctx->map_changing_lock); + err = -EAGAIN; + if (atomic_read(&ctx->mmap_changing)) goto out_unlock; } @@ -463,7 +603,8 @@ retry: cond_resched(); if (unlikely(err == -ENOENT)) { - mmap_read_unlock(dst_mm); + up_read(&ctx->map_changing_lock); + uffd_mfill_unlock(dst_vma); BUG_ON(!folio); err = copy_folio_from_user(folio, @@ -472,16 +613,6 @@ retry: err = -EFAULT; goto out; } - mmap_read_lock(dst_mm); - /* - * If memory mappings are changing because of non-cooperative - * operation (e.g. mremap) running in parallel, bail out and - * request the user to retry later - */ - if (mmap_changing && atomic_read(mmap_changing)) { - err = -EAGAIN; - break; - } dst_vma = NULL; goto retry; @@ -501,7 +632,9 @@ retry: } out_unlock: - mmap_read_unlock(dst_mm); + up_read(&ctx->map_changing_lock); +out_unlock_vma: + uffd_mfill_unlock(dst_vma); out: if (folio) folio_put(folio); @@ -512,11 +645,11 @@ out: } #else /* !CONFIG_HUGETLB_PAGE */ /* fail at build time if gcc attempts to use this */ -extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma, +extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx, + struct vm_area_struct *dst_vma, unsigned long dst_start, unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, uffd_flags_t flags); #endif /* CONFIG_HUGETLB_PAGE */ @@ -564,13 +697,13 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd, return err; } -static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, +static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, uffd_flags_t flags) { + struct mm_struct *dst_mm = ctx->mm; struct vm_area_struct *dst_vma; ssize_t err; pmd_t *dst_pmd; @@ -593,24 +726,24 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, copied = 0; folio = NULL; retry: - mmap_read_lock(dst_mm); + /* + * Make sure the vma is not shared, that the dst range is + * both valid and fully within a single existing vma. + */ + dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); + if (IS_ERR(dst_vma)) { + err = PTR_ERR(dst_vma); + goto out; + } /* * If memory mappings are changing because of non-cooperative * operation (e.g. mremap) running in parallel, bail out and * request the user to retry later */ + down_read(&ctx->map_changing_lock); err = -EAGAIN; - if (mmap_changing && atomic_read(mmap_changing)) - goto out_unlock; - - /* - * Make sure the vma is not shared, that the dst range is - * both valid and fully within a single existing vma. - */ - err = -ENOENT; - dst_vma = find_dst_vma(dst_mm, dst_start, len); - if (!dst_vma) + if (atomic_read(&ctx->mmap_changing)) goto out_unlock; err = -EINVAL; @@ -633,8 +766,8 @@ retry: * If this is a HUGETLB vma, pass off to appropriate routine */ if (is_vm_hugetlb_page(dst_vma)) - return mfill_atomic_hugetlb(dst_vma, dst_start, src_start, - len, mmap_changing, flags); + return mfill_atomic_hugetlb(ctx, dst_vma, dst_start, + src_start, len, flags); if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; @@ -642,16 +775,6 @@ retry: uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) goto out_unlock; - /* - * Ensure the dst_vma has a anon_vma or this page - * would get a NULL anon_vma when moved in the - * dst_vma. 
- */ - err = -ENOMEM; - if (!(dst_vma->vm_flags & VM_SHARED) && - unlikely(anon_vma_prepare(dst_vma))) - goto out_unlock; - while (src_addr < src_start + len) { pmd_t dst_pmdval; @@ -693,7 +816,8 @@ retry: if (unlikely(err == -ENOENT)) { void *kaddr; - mmap_read_unlock(dst_mm); + up_read(&ctx->map_changing_lock); + uffd_mfill_unlock(dst_vma); BUG_ON(!folio); kaddr = kmap_local_folio(folio, 0); @@ -723,7 +847,8 @@ retry: } out_unlock: - mmap_read_unlock(dst_mm); + up_read(&ctx->map_changing_lock); + uffd_mfill_unlock(dst_vma); out: if (folio) folio_put(folio); @@ -733,34 +858,42 @@ out: return copied ? copied : err; } -ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start, +ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, uffd_flags_t flags) + uffd_flags_t flags) { - return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing, + return mfill_atomic(ctx, dst_start, src_start, len, uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY)); } -ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing) +ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, + unsigned long start, + unsigned long len) { - return mfill_atomic(dst_mm, start, 0, len, mmap_changing, + return mfill_atomic(ctx, start, 0, len, uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE)); } -ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing, - uffd_flags_t flags) +ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, uffd_flags_t flags) { - return mfill_atomic(dst_mm, start, 0, len, mmap_changing, + + /* + * A caller might reasonably assume that UFFDIO_CONTINUE contains an + * smp_wmb() to ensure that any writes to the about-to-be-mapped page by + * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to + * subsequent loads from the page through the newly mapped address range. + */ + smp_wmb(); + + return mfill_atomic(ctx, start, 0, len, uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE)); } -ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing, - uffd_flags_t flags) +ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, uffd_flags_t flags) { - return mfill_atomic(dst_mm, start, 0, len, mmap_changing, + return mfill_atomic(ctx, start, 0, len, uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON)); } @@ -793,10 +926,10 @@ long uffd_wp_range(struct vm_area_struct *dst_vma, return ret; } -int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, bool enable_wp, - atomic_t *mmap_changing) +int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, bool enable_wp) { + struct mm_struct *dst_mm = ctx->mm; unsigned long end = start + len; unsigned long _start, _end; struct vm_area_struct *dst_vma; @@ -820,8 +953,9 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, * operation (e.g. 
mremap) running in parallel, bail out and * request the user to retry later */ + down_read(&ctx->map_changing_lock); err = -EAGAIN; - if (mmap_changing && atomic_read(mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) goto out_unlock; err = -ENOENT; @@ -850,6 +984,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, err = 0; } out_unlock: + up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); return err; } @@ -923,7 +1058,7 @@ static int move_present_pte(struct mm_struct *mm, } folio_move_anon_rmap(src_folio, dst_vma); - WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); + src_folio->index = linear_page_index(dst_vma, dst_addr); orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot); /* Follow mremap() behavior and treat the entry dirty after the move */ @@ -959,6 +1094,33 @@ static int move_swap_pte(struct mm_struct *mm, return 0; } +static int move_zeropage_pte(struct mm_struct *mm, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, + unsigned long dst_addr, unsigned long src_addr, + pte_t *dst_pte, pte_t *src_pte, + pte_t orig_dst_pte, pte_t orig_src_pte, + spinlock_t *dst_ptl, spinlock_t *src_ptl) +{ + pte_t zero_pte; + + double_pt_lock(dst_ptl, src_ptl); + if (!pte_same(ptep_get(src_pte), orig_src_pte) || + !pte_same(ptep_get(dst_pte), orig_dst_pte)) { + double_pt_unlock(dst_ptl, src_ptl); + return -EAGAIN; + } + + zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), + dst_vma->vm_page_prot)); + ptep_clear_flush(src_vma, src_addr, src_pte); + set_pte_at(mm, dst_addr, dst_pte, zero_pte); + double_pt_unlock(dst_ptl, src_ptl); + + return 0; +} + + /* * The mmap_lock for reading is held by the caller. Just move the page * from src_pmd to dst_pmd if possible, and return true if succeeded @@ -1041,6 +1203,14 @@ retry: } if (pte_present(orig_src_pte)) { + if (is_zero_pfn(pte_pfn(orig_src_pte))) { + err = move_zeropage_pte(mm, dst_vma, src_vma, + dst_addr, src_addr, dst_pte, src_pte, + orig_dst_pte, orig_src_pte, + dst_ptl, src_ptl); + goto out; + } + /* * Pin and lock both source folio and anon_vma. Since we are in * RCU read section, we can't block, so on contention have to @@ -1224,27 +1394,137 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx, if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma)) return -EINVAL; + return 0; +} + +static __always_inline +int find_vmas_mm_locked(struct mm_struct *mm, + unsigned long dst_start, + unsigned long src_start, + struct vm_area_struct **dst_vmap, + struct vm_area_struct **src_vmap) +{ + struct vm_area_struct *vma; + + mmap_assert_locked(mm); + vma = find_vma_and_prepare_anon(mm, dst_start); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + *dst_vmap = vma; + /* Skip finding src_vma if src_start is in dst_vma */ + if (src_start >= vma->vm_start && src_start < vma->vm_end) + goto out_success; + + vma = vma_lookup(mm, src_start); + if (!vma) + return -ENOENT; +out_success: + *src_vmap = vma; + return 0; +} + +#ifdef CONFIG_PER_VMA_LOCK +static int uffd_move_lock(struct mm_struct *mm, + unsigned long dst_start, + unsigned long src_start, + struct vm_area_struct **dst_vmap, + struct vm_area_struct **src_vmap) +{ + struct vm_area_struct *vma; + int err; + + vma = uffd_lock_vma(mm, dst_start); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + *dst_vmap = vma; /* - * Ensure the dst_vma has a anon_vma or this page - * would get a NULL anon_vma when moved in the - * dst_vma. + * Skip finding src_vma if src_start is in dst_vma. 
This also ensures + * that we don't lock the same vma twice. */ - if (unlikely(anon_vma_prepare(dst_vma))) - return -ENOMEM; + if (src_start >= vma->vm_start && src_start < vma->vm_end) { + *src_vmap = vma; + return 0; + } - return 0; + /* + * Using uffd_lock_vma() to get src_vma can lead to following deadlock: + * + * Thread1 Thread2 + * ------- ------- + * vma_start_read(dst_vma) + * mmap_write_lock(mm) + * vma_start_write(src_vma) + * vma_start_read(src_vma) + * mmap_read_lock(mm) + * vma_start_write(dst_vma) + */ + *src_vmap = lock_vma_under_rcu(mm, src_start); + if (likely(*src_vmap)) + return 0; + + /* Undo any locking and retry in mmap_lock critical section */ + vma_end_read(*dst_vmap); + + mmap_read_lock(mm); + err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); + if (!err) { + /* + * See comment in uffd_lock_vma() as to why not using + * vma_start_read() here. + */ + down_read(&(*dst_vmap)->vm_lock->lock); + if (*dst_vmap != *src_vmap) + down_read_nested(&(*src_vmap)->vm_lock->lock, + SINGLE_DEPTH_NESTING); + } + mmap_read_unlock(mm); + return err; } +static void uffd_move_unlock(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) +{ + vma_end_read(src_vma); + if (src_vma != dst_vma) + vma_end_read(dst_vma); +} + +#else + +static int uffd_move_lock(struct mm_struct *mm, + unsigned long dst_start, + unsigned long src_start, + struct vm_area_struct **dst_vmap, + struct vm_area_struct **src_vmap) +{ + int err; + + mmap_read_lock(mm); + err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); + if (err) + mmap_read_unlock(mm); + return err; +} + +static void uffd_move_unlock(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) +{ + mmap_assert_locked(src_vma->vm_mm); + mmap_read_unlock(dst_vma->vm_mm); +} +#endif + /** * move_pages - move arbitrary anonymous pages of an existing vma * @ctx: pointer to the userfaultfd context - * @mm: the address space to move pages * @dst_start: start of the destination virtual memory range * @src_start: start of the source virtual memory range * @len: length of the virtual memory range * @mode: flags from uffdio_move.mode * - * Must be called with mmap_lock held for read. + * It will either use the mmap_lock in read mode or per-vma locks * * move_pages() remaps arbitrary anonymous pages atomically in zero * copy. It only works on non shared anonymous pages because those can @@ -1312,10 +1592,10 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx, * could be obtained. This is the only additional complexity added to * the rmap code to provide this anonymous page remapping functionality. 
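For context, the locking fallback in the uffd_move_lock() hunk above can be pictured as: take the first lock, try the second without blocking, and if that fails back out and retake both under the coarser mmap lock in a fixed order, which avoids the read/write deadlock sketched in its comment. Below is a small userspace analogue, not the patch itself: pthread rwlocks stand in for the kernel's per-VMA and mmap locks, and all names are illustrative assumptions.

/* Hypothetical userspace sketch of the "trylock, else fall back to the
 * coarse lock" pattern used by uffd_move_lock(); not kernel API. */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t mmap_lock = PTHREAD_RWLOCK_INITIALIZER; /* "mmap_lock" */
static pthread_rwlock_t dst_lock  = PTHREAD_RWLOCK_INITIALIZER; /* "dst vma lock" */
static pthread_rwlock_t src_lock  = PTHREAD_RWLOCK_INITIALIZER; /* "src vma lock" */

static void lock_both(void)
{
        pthread_rwlock_rdlock(&dst_lock);               /* fast path: dst first */
        if (pthread_rwlock_tryrdlock(&src_lock) == 0)
                return;                                 /* got both, done */

        /* Contention: undo, then retake both under the coarse lock. */
        pthread_rwlock_unlock(&dst_lock);
        pthread_rwlock_rdlock(&mmap_lock);
        pthread_rwlock_rdlock(&dst_lock);
        pthread_rwlock_rdlock(&src_lock);
        pthread_rwlock_unlock(&mmap_lock);
}

int main(void)
{
        lock_both();
        printf("both vmas locked\n");
        pthread_rwlock_unlock(&src_lock);
        pthread_rwlock_unlock(&dst_lock);
        return 0;
}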
*/ -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, - unsigned long dst_start, unsigned long src_start, - unsigned long len, __u64 mode) +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, + unsigned long src_start, unsigned long len, __u64 mode) { + struct mm_struct *mm = ctx->mm; struct vm_area_struct *src_vma, *dst_vma; unsigned long src_addr, dst_addr; pmd_t *src_pmd, *dst_pmd; @@ -1333,28 +1613,34 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, WARN_ON_ONCE(dst_start + len <= dst_start)) goto out; + err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma); + if (err) + goto out; + + /* Re-check after taking map_changing_lock */ + err = -EAGAIN; + down_read(&ctx->map_changing_lock); + if (likely(atomic_read(&ctx->mmap_changing))) + goto out_unlock; /* * Make sure the vma is not shared, that the src and dst remap * ranges are both valid and fully within a single existing * vma. */ - src_vma = find_vma(mm, src_start); - if (!src_vma || (src_vma->vm_flags & VM_SHARED)) - goto out; - if (src_start < src_vma->vm_start || - src_start + len > src_vma->vm_end) - goto out; + err = -EINVAL; + if (src_vma->vm_flags & VM_SHARED) + goto out_unlock; + if (src_start + len > src_vma->vm_end) + goto out_unlock; - dst_vma = find_vma(mm, dst_start); - if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) - goto out; - if (dst_start < dst_vma->vm_start || - dst_start + len > dst_vma->vm_end) - goto out; + if (dst_vma->vm_flags & VM_SHARED) + goto out_unlock; + if (dst_start + len > dst_vma->vm_end) + goto out_unlock; err = validate_move_areas(ctx, src_vma, dst_vma); if (err) - goto out; + goto out_unlock; for (src_addr = src_start, dst_addr = dst_start; src_addr < src_start + len;) { @@ -1404,19 +1690,14 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, err = -ENOENT; break; } - /* Avoid moving zeropages for now */ - if (is_huge_zero_pmd(*src_pmd)) { - spin_unlock(ptl); - err = -EBUSY; - break; - } /* Check if we can move the pmd without splitting it. */ if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) || !pmd_none(dst_pmdval)) { - struct folio *folio = pfn_folio(pmd_pfn(*src_pmd)); + struct folio *folio = pmd_folio(*src_pmd); - if (!folio || !PageAnonExclusive(&folio->page)) { + if (!folio || (!is_huge_zero_folio(folio) && + !PageAnonExclusive(&folio->page))) { spin_unlock(ptl); err = -EBUSY; break; @@ -1476,6 +1757,9 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, moved += step_size; } +out_unlock: + up_read(&ctx->map_changing_lock); + uffd_move_unlock(dst_vma, src_vma); out: VM_WARN_ON(moved < 0); VM_WARN_ON(err > 0); diff --git a/mm/util.c b/mm/util.c index 5a6a9802583b..c9e519e6811f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -124,16 +124,33 @@ EXPORT_SYMBOL(kstrndup); * Return: newly allocated copy of @src or %NULL in case of error, * result is physically contiguous. Use kfree() to free. */ -void *kmemdup(const void *src, size_t len, gfp_t gfp) +void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) { void *p; - p = kmalloc_track_caller(len, gfp); + p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_); if (p) memcpy(p, src, len); return p; } -EXPORT_SYMBOL(kmemdup); +EXPORT_SYMBOL(kmemdup_noprof); + +/** + * kmemdup_array - duplicate a given array. + * + * @src: array to duplicate. + * @element_size: size of each element of array. + * @count: number of elements to duplicate from array. + * @gfp: GFP mask to use. 
+ * + * Return: duplicated array of @src or %NULL in case of error, + * result is physically contiguous. Use kfree() to free. + */ +void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp) +{ + return kmemdup(src, size_mul(element_size, count), gfp); +} +EXPORT_SYMBOL(kmemdup_array); /** * kvmemdup - duplicate region of memory @@ -452,17 +469,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - mm->get_unmapped_area = arch_get_unmapped_area; + clear_bit(MMF_TOPDOWN, &mm->flags); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; + set_bit(MMF_TOPDOWN, &mm->flags); } } #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = arch_get_unmapped_area; + clear_bit(MMF_TOPDOWN, &mm->flags); } #endif @@ -592,7 +609,7 @@ EXPORT_SYMBOL(vm_mmap); * * Return: pointer to the allocated memory of %NULL in case of failure */ -void *kvmalloc_node(size_t size, gfp_t flags, int node) +void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) { gfp_t kmalloc_flags = flags; void *ret; @@ -614,7 +631,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) kmalloc_flags &= ~__GFP_NOFAIL; } - ret = kmalloc_node(size, kmalloc_flags, node); + ret = kmalloc_node_noprof(size, kmalloc_flags, node); /* * It doesn't really make sense to fallback to vmalloc for sub page @@ -639,11 +656,11 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) * about the resulting pointer, and cannot play * protection games. */ - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, node, __builtin_return_address(0)); } -EXPORT_SYMBOL(kvmalloc_node); +EXPORT_SYMBOL(kvmalloc_node_noprof); /** * kvfree() - Free memory. @@ -682,7 +699,7 @@ void kvfree_sensitive(const void *addr, size_t len) } EXPORT_SYMBOL(kvfree_sensitive); -void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) +void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) { void *newp; @@ -695,7 +712,7 @@ void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) kvfree(p); return newp; } -EXPORT_SYMBOL(kvrealloc); +EXPORT_SYMBOL(kvrealloc_noprof); /** * __vmalloc_array - allocate memory for a virtually contiguous array. @@ -703,7 +720,7 @@ EXPORT_SYMBOL(kvrealloc); * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ -void *__vmalloc_array(size_t n, size_t size, gfp_t flags) +void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) { size_t bytes; @@ -711,18 +728,18 @@ void *__vmalloc_array(size_t n, size_t size, gfp_t flags) return NULL; return __vmalloc(bytes, flags); } -EXPORT_SYMBOL(__vmalloc_array); +EXPORT_SYMBOL(__vmalloc_array_noprof); /** * vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ -void *vmalloc_array(size_t n, size_t size) +void *vmalloc_array_noprof(size_t n, size_t size) { return __vmalloc_array(n, size, GFP_KERNEL); } -EXPORT_SYMBOL(vmalloc_array); +EXPORT_SYMBOL(vmalloc_array_noprof); /** * __vcalloc - allocate and zero memory for a virtually contiguous array. 
@@ -730,22 +747,22 @@ EXPORT_SYMBOL(vmalloc_array); * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ -void *__vcalloc(size_t n, size_t size, gfp_t flags) +void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) { return __vmalloc_array(n, size, flags | __GFP_ZERO); } -EXPORT_SYMBOL(__vcalloc); +EXPORT_SYMBOL(__vcalloc_noprof); /** * vcalloc - allocate and zero memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ -void *vcalloc(size_t n, size_t size) +void *vcalloc_noprof(size_t n, size_t size) { return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO); } -EXPORT_SYMBOL(vcalloc); +EXPORT_SYMBOL(vcalloc_noprof); struct anon_vma *folio_anon_vma(struct folio *folio) { @@ -942,6 +959,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { long allowed; + unsigned long bytes_failed; vm_acct_memory(pages); @@ -976,8 +994,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; error: - pr_warn_ratelimited("%s: pid: %d, comm: %s, not enough memory for the allocation\n", - __func__, current->pid, current->comm); + bytes_failed = pages << PAGE_SHIFT; + pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n", + __func__, current->pid, current->comm, bytes_failed); vm_unacct_memory(pages); return -ENOMEM; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d12a17fc0c17..5d3aa2dc88a8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -42,6 +42,7 @@ #include <linux/sched/mm.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> +#include <linux/page_owner.h> #define CREATE_TRACE_POINTS #include <trace/events/vmalloc.h> @@ -96,6 +97,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, { pte_t *pte; u64 pfn; + struct page *page; unsigned long size = PAGE_SIZE; pfn = phys_addr >> PAGE_SHIFT; @@ -103,7 +105,13 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!pte) return -ENOMEM; do { - BUG_ON(!pte_none(ptep_get(pte))); + if (!pte_none(ptep_get(pte))) { + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + dump_page(page, "remapping already mapped page"); + } + BUG(); + } #ifdef CONFIG_HUGETLB_PAGE size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift); @@ -304,8 +312,8 @@ static int vmap_range_noflush(unsigned long addr, unsigned long end, return err; } -int ioremap_page_range(unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot) +int vmap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) { int err; @@ -318,6 +326,26 @@ int ioremap_page_range(unsigned long addr, unsigned long end, return err; } +int ioremap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) +{ + struct vm_struct *area; + + area = find_vm_area((void *)addr); + if (!area || !(area->flags & VM_IOREMAP)) { + WARN_ONCE(1, "vm_area at addr %lx is not marked as VM_IOREMAP\n", addr); + return -EINVAL; + } + if (addr != (unsigned long)area->addr || + (void *)end != area->addr + get_vm_area_size(area)) { + WARN_ONCE(1, "ioremap request [%lx,%lx) doesn't match vm_area [%lx, %lx)\n", + addr, end, (long)area->addr, + (long)area->addr + get_vm_area_size(area)); + return -ERANGE; + } + return vmap_page_range(addr, end, phys_addr, prot); +} + static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned 
long end, pgtbl_mod_mask *mask) { @@ -635,6 +663,58 @@ static int vmap_pages_range(unsigned long addr, unsigned long end, return err; } +static int check_sparse_vm_area(struct vm_struct *area, unsigned long start, + unsigned long end) +{ + might_sleep(); + if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS)) + return -EINVAL; + if (WARN_ON_ONCE(area->flags & VM_NO_GUARD)) + return -EINVAL; + if (WARN_ON_ONCE(!(area->flags & VM_SPARSE))) + return -EINVAL; + if ((end - start) >> PAGE_SHIFT > totalram_pages()) + return -E2BIG; + if (start < (unsigned long)area->addr || + (void *)end > area->addr + get_vm_area_size(area)) + return -ERANGE; + return 0; +} + +/** + * vm_area_map_pages - map pages inside given sparse vm_area + * @area: vm_area + * @start: start address inside vm_area + * @end: end address inside vm_area + * @pages: pages to map (always PAGE_SIZE pages) + */ +int vm_area_map_pages(struct vm_struct *area, unsigned long start, + unsigned long end, struct page **pages) +{ + int err; + + err = check_sparse_vm_area(area, start, end); + if (err) + return err; + + return vmap_pages_range(start, end, PAGE_KERNEL, pages, PAGE_SHIFT); +} + +/** + * vm_area_unmap_pages - unmap pages inside given sparse vm_area + * @area: vm_area + * @start: start address inside vm_area + * @end: end address inside vm_area + */ +void vm_area_unmap_pages(struct vm_struct *area, unsigned long start, + unsigned long end) +{ + if (check_sparse_vm_area(area, start, end)) + return; + + vunmap_range(start, end); +} + int is_vmalloc_or_module_addr(const void *x) { /* @@ -728,17 +808,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn); #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 -static DEFINE_SPINLOCK(vmap_area_lock); static DEFINE_SPINLOCK(free_vmap_area_lock); -/* Export for kexec only */ -LIST_HEAD(vmap_area_list); -static struct rb_root vmap_area_root = RB_ROOT; static bool vmap_initialized __read_mostly; -static struct rb_root purge_vmap_area_root = RB_ROOT; -static LIST_HEAD(purge_vmap_area_list); -static DEFINE_SPINLOCK(purge_vmap_area_lock); - /* * This kmem_cache is used for vmap_area objects. Instead of * allocating from slab we reuse an object from this cache to @@ -772,6 +844,129 @@ static struct rb_root free_vmap_area_root = RB_ROOT; */ static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); +/* + * This structure defines a single, solid model where a list and + * rb-tree are part of one entity protected by the lock. Nodes are + * sorted in ascending order, thus for O(1) access to left/right + * neighbors a list is used as well as for sequential traversal. + */ +struct rb_list { + struct rb_root root; + struct list_head head; + spinlock_t lock; +}; + +/* + * A fast size storage contains VAs up to 1M size. A pool consists + * of linked between each other ready to go VAs of certain sizes. + * An index in the pool-array corresponds to number of pages + 1. + */ +#define MAX_VA_SIZE_PAGES 256 + +struct vmap_pool { + struct list_head head; + unsigned long len; +}; + +/* + * An effective vmap-node logic. Users make use of nodes instead + * of a global heap. It allows to balance an access and mitigate + * contention. + */ +static struct vmap_node { + /* Simple size segregated storage. */ + struct vmap_pool pool[MAX_VA_SIZE_PAGES]; + spinlock_t pool_lock; + bool skip_populate; + + /* Bookkeeping data of this node. */ + struct rb_list busy; + struct rb_list lazy; + + /* + * Ready-to-free areas. 
+ */ + struct list_head purge_list; + struct work_struct purge_work; + unsigned long nr_purged; +} single; + +/* + * Initial setup consists of one single node, i.e. a balancing + * is fully disabled. Later on, after vmap is initialized these + * parameters are updated based on a system capacity. + */ +static struct vmap_node *vmap_nodes = &single; +static __read_mostly unsigned int nr_vmap_nodes = 1; +static __read_mostly unsigned int vmap_zone_size = 1; + +static inline unsigned int +addr_to_node_id(unsigned long addr) +{ + return (addr / vmap_zone_size) % nr_vmap_nodes; +} + +static inline struct vmap_node * +addr_to_node(unsigned long addr) +{ + return &vmap_nodes[addr_to_node_id(addr)]; +} + +static inline struct vmap_node * +id_to_node(unsigned int id) +{ + return &vmap_nodes[id % nr_vmap_nodes]; +} + +/* + * We use the value 0 to represent "no node", that is why + * an encoded value will be the node-id incremented by 1. + * It is always greater then 0. A valid node_id which can + * be encoded is [0:nr_vmap_nodes - 1]. If a passed node_id + * is not valid 0 is returned. + */ +static unsigned int +encode_vn_id(unsigned int node_id) +{ + /* Can store U8_MAX [0:254] nodes. */ + if (node_id < nr_vmap_nodes) + return (node_id + 1) << BITS_PER_BYTE; + + /* Warn and no node encoded. */ + WARN_ONCE(1, "Encode wrong node id (%u)\n", node_id); + return 0; +} + +/* + * Returns an encoded node-id, the valid range is within + * [0:nr_vmap_nodes-1] values. Otherwise nr_vmap_nodes is + * returned if extracted data is wrong. + */ +static unsigned int +decode_vn_id(unsigned int val) +{ + unsigned int node_id = (val >> BITS_PER_BYTE) - 1; + + /* Can store U8_MAX [0:254] nodes. */ + if (node_id < nr_vmap_nodes) + return node_id; + + /* If it was _not_ zero, warn. */ + WARN_ONCE(node_id != UINT_MAX, + "Decode wrong node id (%d)\n", node_id); + + return nr_vmap_nodes; +} + +static bool +is_vn_id_valid(unsigned int node_id) +{ + if (node_id < nr_vmap_nodes) + return true; + + return false; +} + static __always_inline unsigned long va_size(struct vmap_area *va) { @@ -802,11 +997,33 @@ unsigned long vmalloc_nr_pages(void) return atomic_long_read(&nr_vmalloc_pages); } +static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + + addr = (unsigned long)kasan_reset_tag((void *)addr); + + while (n) { + struct vmap_area *va; + + va = rb_entry(n, struct vmap_area, rb_node); + if (addr < va->va_start) + n = n->rb_left; + else if (addr >= va->va_end) + n = n->rb_right; + else + return va; + } + + return NULL; +} + /* Look up the first VA which satisfies addr < va_end, NULL if none. */ -static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) +static struct vmap_area * +__find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) { struct vmap_area *va = NULL; - struct rb_node *n = vmap_area_root.rb_node; + struct rb_node *n = root->rb_node; addr = (unsigned long)kasan_reset_tag((void *)addr); @@ -827,22 +1044,49 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) return va; } -static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) +/* + * Returns a node where a first VA, that satisfies addr < va_end, resides. + * If success, a node is locked. A user is responsible to unlock it when a + * VA is no longer needed to be accessed. + * + * Returns NULL if nothing found. 
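The vn_id packing introduced above is a simple off-by-one shift: the node id plus one, shifted left by BITS_PER_BYTE, so that a stored value of 0 means "no node". A self-contained sketch of just that arithmetic follows; the node count is an assumption for the demo and the kernel's WARN_ONCE paths are omitted.

/* Illustrative sketch of encode_vn_id()/decode_vn_id(); constants assumed. */
#include <stdio.h>

#define BITS_PER_BYTE 8
static unsigned int nr_vmap_nodes = 4;  /* assumed node count for the demo */

static unsigned int encode_vn_id(unsigned int node_id)
{
        /* Valid ids become (id + 1) << 8; anything else encodes "no node". */
        return node_id < nr_vmap_nodes ? (node_id + 1) << BITS_PER_BYTE : 0;
}

static unsigned int decode_vn_id(unsigned int val)
{
        unsigned int node_id = (val >> BITS_PER_BYTE) - 1;

        /* An encoded 0 underflows to UINT_MAX and decodes to nr_vmap_nodes. */
        return node_id < nr_vmap_nodes ? node_id : nr_vmap_nodes;
}

int main(void)
{
        unsigned int flags = encode_vn_id(2);

        printf("encoded: %#x, decoded: %u\n", flags, decode_vn_id(flags));
        printf("no node: decoded %u (== nr_vmap_nodes)\n", decode_vn_id(0));
        return 0;
}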
+ */ +static struct vmap_node * +find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va) { - struct rb_node *n = root->rb_node; + unsigned long va_start_lowest; + struct vmap_node *vn; + int i; - addr = (unsigned long)kasan_reset_tag((void *)addr); +repeat: + for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; - while (n) { - struct vmap_area *va; + spin_lock(&vn->busy.lock); + *va = __find_vmap_area_exceed_addr(addr, &vn->busy.root); - va = rb_entry(n, struct vmap_area, rb_node); - if (addr < va->va_start) - n = n->rb_left; - else if (addr >= va->va_end) - n = n->rb_right; - else - return va; + if (*va) + if (!va_start_lowest || (*va)->va_start < va_start_lowest) + va_start_lowest = (*va)->va_start; + spin_unlock(&vn->busy.lock); + } + + /* + * Check if found VA exists, it might have gone away. In this case we + * repeat the search because a VA has been removed concurrently and we + * need to proceed to the next one, which is a rare case. + */ + if (va_start_lowest) { + vn = addr_to_node(va_start_lowest); + + spin_lock(&vn->busy.lock); + *va = __find_vmap_area(va_start_lowest, &vn->busy.root); + + if (*va) + return vn; + + spin_unlock(&vn->busy.lock); + goto repeat; } return NULL; @@ -1382,9 +1626,9 @@ classify_va_fit_type(struct vmap_area *va, } static __always_inline int -adjust_va_to_fit_type(struct rb_root *root, struct list_head *head, - struct vmap_area *va, unsigned long nva_start_addr, - unsigned long size) +va_clip(struct rb_root *root, struct list_head *head, + struct vmap_area *va, unsigned long nva_start_addr, + unsigned long size) { struct vmap_area *lva = NULL; enum fit_type type = classify_va_fit_type(va, nva_start_addr, size); @@ -1481,6 +1725,32 @@ adjust_va_to_fit_type(struct rb_root *root, struct list_head *head, return 0; } +static unsigned long +va_alloc(struct vmap_area *va, + struct rb_root *root, struct list_head *head, + unsigned long size, unsigned long align, + unsigned long vstart, unsigned long vend) +{ + unsigned long nva_start_addr; + int ret; + + if (va->va_start > vstart) + nva_start_addr = ALIGN(va->va_start, align); + else + nva_start_addr = ALIGN(vstart, align); + + /* Check the "vend" restriction. */ + if (nva_start_addr + size > vend) + return vend; + + /* Update the free vmap_area. */ + ret = va_clip(root, head, va, nva_start_addr, size); + if (WARN_ON_ONCE(ret)) + return vend; + + return nva_start_addr; +} + /* * Returns a start address of the newly allocated area, if success. * Otherwise a vend is returned that indicates failure. @@ -1493,7 +1763,6 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head, bool adjust_search_size = true; unsigned long nva_start_addr; struct vmap_area *va; - int ret; /* * Do not adjust when: @@ -1511,18 +1780,8 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head, if (unlikely(!va)) return vend; - if (va->va_start > vstart) - nva_start_addr = ALIGN(va->va_start, align); - else - nva_start_addr = ALIGN(vstart, align); - - /* Check the "vend" restriction. */ - if (nva_start_addr + size > vend) - return vend; - - /* Update the free vmap_area. 
*/ - ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size); - if (WARN_ON_ONCE(ret)) + nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend); + if (nva_start_addr == vend) return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK @@ -1537,12 +1796,14 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head, */ static void free_vmap_area(struct vmap_area *va) { + struct vmap_node *vn = addr_to_node(va->va_start); + /* * Remove from the busy tree/list. */ - spin_lock(&vmap_area_lock); - unlink_va(va, &vmap_area_root); - spin_unlock(&vmap_area_lock); + spin_lock(&vn->busy.lock); + unlink_va(va, &vn->busy.root); + spin_unlock(&vn->busy.lock); /* * Insert/Merge it back to the free tree/list. @@ -1575,19 +1836,129 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) kmem_cache_free(vmap_area_cachep, va); } +static struct vmap_pool * +size_to_va_pool(struct vmap_node *vn, unsigned long size) +{ + unsigned int idx = (size - 1) / PAGE_SIZE; + + if (idx < MAX_VA_SIZE_PAGES) + return &vn->pool[idx]; + + return NULL; +} + +static bool +node_pool_add_va(struct vmap_node *n, struct vmap_area *va) +{ + struct vmap_pool *vp; + + vp = size_to_va_pool(n, va_size(va)); + if (!vp) + return false; + + spin_lock(&n->pool_lock); + list_add(&va->list, &vp->head); + WRITE_ONCE(vp->len, vp->len + 1); + spin_unlock(&n->pool_lock); + + return true; +} + +static struct vmap_area * +node_pool_del_va(struct vmap_node *vn, unsigned long size, + unsigned long align, unsigned long vstart, + unsigned long vend) +{ + struct vmap_area *va = NULL; + struct vmap_pool *vp; + int err = 0; + + vp = size_to_va_pool(vn, size); + if (!vp || list_empty(&vp->head)) + return NULL; + + spin_lock(&vn->pool_lock); + if (!list_empty(&vp->head)) { + va = list_first_entry(&vp->head, struct vmap_area, list); + + if (IS_ALIGNED(va->va_start, align)) { + /* + * Do some sanity check and emit a warning + * if one of below checks detects an error. + */ + err |= (va_size(va) != size); + err |= (va->va_start < vstart); + err |= (va->va_end > vend); + + if (!WARN_ON_ONCE(err)) { + list_del_init(&va->list); + WRITE_ONCE(vp->len, vp->len - 1); + } else { + va = NULL; + } + } else { + list_move_tail(&va->list, &vp->head); + va = NULL; + } + } + spin_unlock(&vn->pool_lock); + + return va; +} + +static struct vmap_area * +node_alloc(unsigned long size, unsigned long align, + unsigned long vstart, unsigned long vend, + unsigned long *addr, unsigned int *vn_id) +{ + struct vmap_area *va; + + *vn_id = 0; + *addr = vend; + + /* + * Fallback to a global heap if not vmalloc or there + * is only one node. + */ + if (vstart != VMALLOC_START || vend != VMALLOC_END || + nr_vmap_nodes == 1) + return NULL; + + *vn_id = raw_smp_processor_id() % nr_vmap_nodes; + va = node_pool_del_va(id_to_node(*vn_id), size, align, vstart, vend); + *vn_id = encode_vn_id(*vn_id); + + if (va) + *addr = va->va_start; + + return va; +} + +static inline void setup_vmalloc_vm(struct vm_struct *vm, + struct vmap_area *va, unsigned long flags, const void *caller) +{ + vm->flags = flags; + vm->addr = (void *)va->va_start; + vm->size = va->va_end - va->va_start; + vm->caller = caller; + va->vm = vm; +} + /* * Allocate a region of KVA of the specified size and alignment, within the - * vstart and vend. + * vstart and vend. If vm is passed in, the two will also be bound. 
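The per-node pools added above are size-segregated: size_to_va_pool() maps a request of N bytes to pool index (N - 1) / PAGE_SIZE, and anything beyond MAX_VA_SIZE_PAGES pages (1M with 4K pages, per the comment in the hunk) bypasses the pools and goes to the global free tree. A quick sketch of that indexing, assuming a 4K page size:

/* Illustrative sketch of the size-class index used by size_to_va_pool(). */
#include <stdio.h>

#define PAGE_SIZE         4096UL   /* assumption for the demo */
#define MAX_VA_SIZE_PAGES 256

static long size_to_pool_idx(unsigned long size)
{
        unsigned long idx = (size - 1) / PAGE_SIZE;

        return idx < MAX_VA_SIZE_PAGES ? (long)idx : -1;  /* -1: no pool */
}

int main(void)
{
        printf("4K -> pool %ld\n", size_to_pool_idx(4096));      /* 0 */
        printf("8K -> pool %ld\n", size_to_pool_idx(8192));      /* 1 */
        printf("2M -> pool %ld\n", size_to_pool_idx(2UL << 20)); /* -1 */
        return 0;
}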
*/ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend, int node, gfp_t gfp_mask, - unsigned long va_flags) + unsigned long va_flags, struct vm_struct *vm) { + struct vmap_node *vn; struct vmap_area *va; unsigned long freed; unsigned long addr; + unsigned int vn_id; int purged = 0; int ret; @@ -1598,23 +1969,37 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, return ERR_PTR(-EBUSY); might_sleep(); - gfp_mask = gfp_mask & GFP_RECLAIM_MASK; - - va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); - if (unlikely(!va)) - return ERR_PTR(-ENOMEM); /* - * Only scan the relevant parts containing pointers to other objects - * to avoid false negatives. + * If a VA is obtained from a global heap(if it fails here) + * it is anyway marked with this "vn_id" so it is returned + * to this pool's node later. Such way gives a possibility + * to populate pools based on users demand. + * + * On success a ready to go VA is returned. */ - kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); + va = node_alloc(size, align, vstart, vend, &addr, &vn_id); + if (!va) { + gfp_mask = gfp_mask & GFP_RECLAIM_MASK; + + va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); + if (unlikely(!va)) + return ERR_PTR(-ENOMEM); + + /* + * Only scan the relevant parts containing pointers to other objects + * to avoid false negatives. + */ + kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); + } retry: - preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); - addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, - size, align, vstart, vend); - spin_unlock(&free_vmap_area_lock); + if (addr == vend) { + preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); + addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, + size, align, vstart, vend); + spin_unlock(&free_vmap_area_lock); + } trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend); @@ -1628,11 +2013,19 @@ retry: va->va_start = addr; va->va_end = addr + size; va->vm = NULL; - va->flags = va_flags; + va->flags = (va_flags | vn_id); - spin_lock(&vmap_area_lock); - insert_vmap_area(va, &vmap_area_root, &vmap_area_list); - spin_unlock(&vmap_area_lock); + if (vm) { + vm->addr = (void *)va->va_start; + vm->size = va->va_end - va->va_start; + va->vm = vm; + } + + vn = addr_to_node(va->va_start); + + spin_lock(&vn->busy.lock); + insert_vmap_area(va, &vn->busy.root, &vn->busy.head); + spin_unlock(&vn->busy.lock); BUG_ON(!IS_ALIGNED(va->va_start, align)); BUG_ON(va->va_start < vstart); @@ -1717,70 +2110,199 @@ static DEFINE_MUTEX(vmap_purge_lock); /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); +static cpumask_t purge_nodes; -/* - * Purges all lazily-freed vmap areas. 
- */ -static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) +static void +reclaim_list_global(struct list_head *head) { - unsigned long resched_threshold; - unsigned int num_purged_areas = 0; - struct list_head local_purge_list; - struct vmap_area *va, *n_va; + struct vmap_area *va, *n; - lockdep_assert_held(&vmap_purge_lock); + if (list_empty(head)) + return; - spin_lock(&purge_vmap_area_lock); - purge_vmap_area_root = RB_ROOT; - list_replace_init(&purge_vmap_area_list, &local_purge_list); - spin_unlock(&purge_vmap_area_lock); + spin_lock(&free_vmap_area_lock); + list_for_each_entry_safe(va, n, head, list) + merge_or_add_vmap_area_augment(va, + &free_vmap_area_root, &free_vmap_area_list); + spin_unlock(&free_vmap_area_lock); +} - if (unlikely(list_empty(&local_purge_list))) - goto out; +static void +decay_va_pool_node(struct vmap_node *vn, bool full_decay) +{ + struct vmap_area *va, *nva; + struct list_head decay_list; + struct rb_root decay_root; + unsigned long n_decay; + int i; - start = min(start, - list_first_entry(&local_purge_list, - struct vmap_area, list)->va_start); + decay_root = RB_ROOT; + INIT_LIST_HEAD(&decay_list); - end = max(end, - list_last_entry(&local_purge_list, - struct vmap_area, list)->va_end); + for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { + struct list_head tmp_list; - flush_tlb_kernel_range(start, end); - resched_threshold = lazy_max_pages() << 1; + if (list_empty(&vn->pool[i].head)) + continue; - spin_lock(&free_vmap_area_lock); - list_for_each_entry_safe(va, n_va, &local_purge_list, list) { - unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; - unsigned long orig_start = va->va_start; - unsigned long orig_end = va->va_end; + INIT_LIST_HEAD(&tmp_list); + + /* Detach the pool, so no-one can access it. */ + spin_lock(&vn->pool_lock); + list_replace_init(&vn->pool[i].head, &tmp_list); + spin_unlock(&vn->pool_lock); + + if (full_decay) + WRITE_ONCE(vn->pool[i].len, 0); + + /* Decay a pool by ~25% out of left objects. */ + n_decay = vn->pool[i].len >> 2; + + list_for_each_entry_safe(va, nva, &tmp_list, list) { + list_del_init(&va->list); + merge_or_add_vmap_area(va, &decay_root, &decay_list); + + if (!full_decay) { + WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1); + + if (!--n_decay) + break; + } + } /* - * Finally insert or merge lazily-freed area. It is - * detached and there is no need to "unlink" it from - * anything. + * Attach the pool back if it has been partly decayed. + * Please note, it is supposed that nobody(other contexts) + * can populate the pool therefore a simple list replace + * operation takes place here. 
*/ - va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root, - &free_vmap_area_list); + if (!full_decay && !list_empty(&tmp_list)) { + spin_lock(&vn->pool_lock); + list_replace_init(&tmp_list, &vn->pool[i].head); + spin_unlock(&vn->pool_lock); + } + } - if (!va) - continue; + reclaim_list_global(&decay_list); +} + +static void purge_vmap_node(struct work_struct *work) +{ + struct vmap_node *vn = container_of(work, + struct vmap_node, purge_work); + struct vmap_area *va, *n_va; + LIST_HEAD(local_list); + + vn->nr_purged = 0; + + list_for_each_entry_safe(va, n_va, &vn->purge_list, list) { + unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + unsigned long orig_start = va->va_start; + unsigned long orig_end = va->va_end; + unsigned int vn_id = decode_vn_id(va->flags); + + list_del_init(&va->list); if (is_vmalloc_or_module_addr((void *)orig_start)) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); atomic_long_sub(nr, &vmap_lazy_nr); - num_purged_areas++; + vn->nr_purged++; - if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) - cond_resched_lock(&free_vmap_area_lock); + if (is_vn_id_valid(vn_id) && !vn->skip_populate) + if (node_pool_add_va(vn, va)) + continue; + + /* Go back to global. */ + list_add(&va->list, &local_list); } - spin_unlock(&free_vmap_area_lock); -out: - trace_purge_vmap_area_lazy(start, end, num_purged_areas); - return num_purged_areas > 0; + reclaim_list_global(&local_list); +} + +/* + * Purges all lazily-freed vmap areas. + */ +static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end, + bool full_pool_decay) +{ + unsigned long nr_purged_areas = 0; + unsigned int nr_purge_helpers; + unsigned int nr_purge_nodes; + struct vmap_node *vn; + int i; + + lockdep_assert_held(&vmap_purge_lock); + + /* + * Use cpumask to mark which node has to be processed. + */ + purge_nodes = CPU_MASK_NONE; + + for (i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; + + INIT_LIST_HEAD(&vn->purge_list); + vn->skip_populate = full_pool_decay; + decay_va_pool_node(vn, full_pool_decay); + + if (RB_EMPTY_ROOT(&vn->lazy.root)) + continue; + + spin_lock(&vn->lazy.lock); + WRITE_ONCE(vn->lazy.root.rb_node, NULL); + list_replace_init(&vn->lazy.head, &vn->purge_list); + spin_unlock(&vn->lazy.lock); + + start = min(start, list_first_entry(&vn->purge_list, + struct vmap_area, list)->va_start); + + end = max(end, list_last_entry(&vn->purge_list, + struct vmap_area, list)->va_end); + + cpumask_set_cpu(i, &purge_nodes); + } + + nr_purge_nodes = cpumask_weight(&purge_nodes); + if (nr_purge_nodes > 0) { + flush_tlb_kernel_range(start, end); + + /* One extra worker is per a lazy_max_pages() full set minus one. 
*/ + nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages(); + nr_purge_helpers = clamp(nr_purge_helpers, 1U, nr_purge_nodes) - 1; + + for_each_cpu(i, &purge_nodes) { + vn = &vmap_nodes[i]; + + if (nr_purge_helpers > 0) { + INIT_WORK(&vn->purge_work, purge_vmap_node); + + if (cpumask_test_cpu(i, cpu_online_mask)) + schedule_work_on(i, &vn->purge_work); + else + schedule_work(&vn->purge_work); + + nr_purge_helpers--; + } else { + vn->purge_work.func = NULL; + purge_vmap_node(&vn->purge_work); + nr_purged_areas += vn->nr_purged; + } + } + + for_each_cpu(i, &purge_nodes) { + vn = &vmap_nodes[i]; + + if (vn->purge_work.func) { + flush_work(&vn->purge_work); + nr_purged_areas += vn->nr_purged; + } + } + } + + trace_purge_vmap_area_lazy(start, end, nr_purged_areas); + return nr_purged_areas > 0; } /* @@ -1791,22 +2313,15 @@ static void reclaim_and_purge_vmap_areas(void) { mutex_lock(&vmap_purge_lock); purge_fragmented_blocks_allcpus(); - __purge_vmap_area_lazy(ULONG_MAX, 0); + __purge_vmap_area_lazy(ULONG_MAX, 0, true); mutex_unlock(&vmap_purge_lock); } static void drain_vmap_area_work(struct work_struct *work) { - unsigned long nr_lazy; - - do { - mutex_lock(&vmap_purge_lock); - __purge_vmap_area_lazy(ULONG_MAX, 0); - mutex_unlock(&vmap_purge_lock); - - /* Recheck if further work is required. */ - nr_lazy = atomic_long_read(&vmap_lazy_nr); - } while (nr_lazy > lazy_max_pages()); + mutex_lock(&vmap_purge_lock); + __purge_vmap_area_lazy(ULONG_MAX, 0, false); + mutex_unlock(&vmap_purge_lock); } /* @@ -1818,6 +2333,8 @@ static void free_vmap_area_noflush(struct vmap_area *va) { unsigned long nr_lazy_max = lazy_max_pages(); unsigned long va_start = va->va_start; + unsigned int vn_id = decode_vn_id(va->flags); + struct vmap_node *vn; unsigned long nr_lazy; if (WARN_ON_ONCE(!list_empty(&va->list))) @@ -1827,12 +2344,15 @@ static void free_vmap_area_noflush(struct vmap_area *va) PAGE_SHIFT, &vmap_lazy_nr); /* - * Merge or place it to the purge tree/list. + * If it was request by a certain node we would like to + * return it to that node, i.e. its pool for later reuse. */ - spin_lock(&purge_vmap_area_lock); - merge_or_add_vmap_area(va, - &purge_vmap_area_root, &purge_vmap_area_list); - spin_unlock(&purge_vmap_area_lock); + vn = is_vn_id_valid(vn_id) ? + id_to_node(vn_id):addr_to_node(va->va_start); + + spin_lock(&vn->lazy.lock); + insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head); + spin_unlock(&vn->lazy.lock); trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max); @@ -1856,26 +2376,65 @@ static void free_unmap_vmap_area(struct vmap_area *va) struct vmap_area *find_vmap_area(unsigned long addr) { + struct vmap_node *vn; struct vmap_area *va; + int i, j; - spin_lock(&vmap_area_lock); - va = __find_vmap_area(addr, &vmap_area_root); - spin_unlock(&vmap_area_lock); + if (unlikely(!vmap_initialized)) + return NULL; - return va; + /* + * An addr_to_node_id(addr) converts an address to a node index + * where a VA is located. If VA spans several zones and passed + * addr is not the same as va->va_start, what is not common, we + * may need to scan extra nodes. See an example: + * + * <----va----> + * -|-----|-----|-----|-----|- + * 1 2 0 1 + * + * VA resides in node 1 whereas it spans 1, 2 an 0. If passed + * addr is within 2 or 0 nodes we should do extra work. 
+ */ + i = j = addr_to_node_id(addr); + do { + vn = &vmap_nodes[i]; + + spin_lock(&vn->busy.lock); + va = __find_vmap_area(addr, &vn->busy.root); + spin_unlock(&vn->busy.lock); + + if (va) + return va; + } while ((i = (i + 1) % nr_vmap_nodes) != j); + + return NULL; } static struct vmap_area *find_unlink_vmap_area(unsigned long addr) { + struct vmap_node *vn; struct vmap_area *va; + int i, j; - spin_lock(&vmap_area_lock); - va = __find_vmap_area(addr, &vmap_area_root); - if (va) - unlink_va(va, &vmap_area_root); - spin_unlock(&vmap_area_lock); + /* + * Check the comment in the find_vmap_area() about the loop. + */ + i = j = addr_to_node_id(addr); + do { + vn = &vmap_nodes[i]; - return va; + spin_lock(&vn->busy.lock); + va = __find_vmap_area(addr, &vn->busy.root); + if (va) + unlink_va(va, &vn->busy.root); + spin_unlock(&vn->busy.lock); + + if (va) + return va; + } while ((i = (i + 1) % nr_vmap_nodes) != j); + + return NULL; } /*** Per cpu kva allocator ***/ @@ -2039,7 +2598,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, VMALLOC_START, VMALLOC_END, node, gfp_mask, - VMAP_RAM|VMAP_BLOCK); + VMAP_RAM|VMAP_BLOCK, NULL); if (IS_ERR(va)) { kfree(vb); return ERR_CAST(va); @@ -2077,6 +2636,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) static void free_vmap_block(struct vmap_block *vb) { + struct vmap_node *vn; struct vmap_block *tmp; struct xarray *xa; @@ -2084,9 +2644,10 @@ static void free_vmap_block(struct vmap_block *vb) tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start)); BUG_ON(tmp != vb); - spin_lock(&vmap_area_lock); - unlink_va(vb->va, &vmap_area_root); - spin_unlock(&vmap_area_lock); + vn = addr_to_node(vb->va->va_start); + spin_lock(&vn->busy.lock); + unlink_va(vb->va, &vn->busy.root); + spin_unlock(&vn->busy.lock); free_vmap_area_noflush(vb->va); kfree_rcu(vb, rcu_head); @@ -2173,7 +2734,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) * get_order(0) returns funny result. Just warn and terminate * early. 
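find_vmap_area() and find_unlink_vmap_area() above share the same lookup shape: hash the address to a starting node, then walk the nodes in a ring until the search wraps back to where it began. Reduced to a plain array below; the node count, zone size and stored value are assumptions for illustration only.

/* Illustrative sketch of the wrap-around node scan; not kernel code. */
#include <stdio.h>

#define NR_NODES 4

static int node_data[NR_NODES] = { 0, 0, 42, 0 };   /* "VA" lives in node 2 */

static unsigned int addr_to_node_id(unsigned long addr)
{
        return (addr / 4096) % NR_NODES;             /* assumed zone size */
}

static int *find_in_nodes(unsigned long addr)
{
        unsigned int i, j;

        /* Start at the node the address hashes to, then wrap once around. */
        i = j = addr_to_node_id(addr);
        do {
                if (node_data[i])
                        return &node_data[i];
        } while ((i = (i + 1) % NR_NODES) != j);

        return NULL;
}

int main(void)
{
        int *p = find_in_nodes(0x1000);   /* hashes to node 1, found in node 2 */

        printf("%s\n", p ? "found" : "not found");
        return 0;
}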
*/ - return NULL; + return ERR_PTR(-EINVAL); } order = get_order(size); @@ -2303,7 +2864,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) } free_purged_blocks(&purge_list); - if (!__purge_vmap_area_lazy(start, end) && flush) + if (!__purge_vmap_area_lazy(start, end, false) && flush) flush_tlb_kernel_range(start, end); mutex_unlock(&vmap_purge_lock); } @@ -2394,7 +2955,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) struct vmap_area *va; va = alloc_vmap_area(size, PAGE_SIZE, VMALLOC_START, VMALLOC_END, - node, GFP_KERNEL, VMAP_RAM); + node, GFP_KERNEL, VMAP_RAM, + NULL); if (IS_ERR(va)) return NULL; @@ -2497,65 +3059,6 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align) kasan_populate_early_vm_area_shadow(vm->addr, vm->size); } -static void vmap_init_free_space(void) -{ - unsigned long vmap_start = 1; - const unsigned long vmap_end = ULONG_MAX; - struct vmap_area *busy, *free; - - /* - * B F B B B F - * -|-----|.....|-----|-----|-----|.....|- - * | The KVA space | - * |<--------------------------------->| - */ - list_for_each_entry(busy, &vmap_area_list, list) { - if (busy->va_start - vmap_start > 0) { - free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); - if (!WARN_ON_ONCE(!free)) { - free->va_start = vmap_start; - free->va_end = busy->va_start; - - insert_vmap_area_augment(free, NULL, - &free_vmap_area_root, - &free_vmap_area_list); - } - } - - vmap_start = busy->va_end; - } - - if (vmap_end - vmap_start > 0) { - free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); - if (!WARN_ON_ONCE(!free)) { - free->va_start = vmap_start; - free->va_end = vmap_end; - - insert_vmap_area_augment(free, NULL, - &free_vmap_area_root, - &free_vmap_area_list); - } - } -} - -static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, - struct vmap_area *va, unsigned long flags, const void *caller) -{ - vm->flags = flags; - vm->addr = (void *)va->va_start; - vm->size = va->va_end - va->va_start; - vm->caller = caller; - va->vm = vm; -} - -static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, - unsigned long flags, const void *caller) -{ - spin_lock(&vmap_area_lock); - setup_vmalloc_vm_locked(vm, va, flags, caller); - spin_unlock(&vmap_area_lock); -} - static void clear_vm_uninitialized_flag(struct vm_struct *vm) { /* @@ -2592,14 +3095,15 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, if (!(flags & VM_NO_GUARD)) size += PAGE_SIZE; - va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0); + area->flags = flags; + area->caller = caller; + + va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area); if (IS_ERR(va)) { kfree(area); return NULL; } - setup_vmalloc_vm(area, va, flags, caller); - /* * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a * best-effort approach, as they can be mapped outside of vmalloc code. @@ -2994,7 +3498,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, { unsigned int nr_allocated = 0; gfp_t alloc_gfp = gfp; - bool nofail = false; + bool nofail = gfp & __GFP_NOFAIL; struct page *page; int i; @@ -3025,12 +3529,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * but mempolicy wants to alloc memory by interleaving. 
*/ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) - nr = alloc_pages_bulk_array_mempolicy(bulk_gfp, + nr = alloc_pages_bulk_array_mempolicy_noprof(bulk_gfp, nr_pages_request, pages + nr_allocated); else - nr = alloc_pages_bulk_array_node(bulk_gfp, nid, + nr = alloc_pages_bulk_array_node_noprof(bulk_gfp, nid, nr_pages_request, pages + nr_allocated); @@ -3051,18 +3555,17 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * and compaction etc. */ alloc_gfp &= ~__GFP_NOFAIL; - nofail = true; } /* High-order pages or fallback path if "bulk" fails. */ while (nr_allocated < nr_pages) { - if (fatal_signal_pending(current)) + if (!nofail && fatal_signal_pending(current)) break; if (nid == NUMA_NO_NODE) - page = alloc_pages(alloc_gfp, order); + page = alloc_pages_noprof(alloc_gfp, order); else - page = alloc_pages_node(nid, alloc_gfp, order); + page = alloc_pages_node_noprof(nid, alloc_gfp, order); if (unlikely(!page)) { if (!nofail) break; @@ -3119,10 +3622,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { - area->pages = __vmalloc_node(array_size, 1, nested_gfp, node, + area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node, area->caller); } else { - area->pages = kmalloc_node(array_size, nested_gfp, node); + area->pages = kmalloc_node_noprof(array_size, nested_gfp, node); } if (!area->pages) { @@ -3232,7 +3735,7 @@ fail: * * Return: the address of the area or %NULL on failure */ -void *__vmalloc_node_range(unsigned long size, unsigned long align, +void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) @@ -3379,10 +3882,10 @@ fail: * * Return: pointer to the allocated memory or %NULL on error */ -void *__vmalloc_node(unsigned long size, unsigned long align, +void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { - return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, 0, node, caller); } /* @@ -3391,15 +3894,15 @@ void *__vmalloc_node(unsigned long size, unsigned long align, * than that. 
*/ #ifdef CONFIG_TEST_VMALLOC_MODULE -EXPORT_SYMBOL_GPL(__vmalloc_node); +EXPORT_SYMBOL_GPL(__vmalloc_node_noprof); #endif -void *__vmalloc(unsigned long size, gfp_t gfp_mask) +void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) { - return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, + return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(__vmalloc); +EXPORT_SYMBOL(__vmalloc_noprof); /** * vmalloc - allocate virtually contiguous memory @@ -3413,12 +3916,12 @@ EXPORT_SYMBOL(__vmalloc); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc(unsigned long size) +void *vmalloc_noprof(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, + return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc); +EXPORT_SYMBOL(vmalloc_noprof); /** * vmalloc_huge - allocate virtually contiguous memory, allow huge pages @@ -3432,13 +3935,13 @@ EXPORT_SYMBOL(vmalloc); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) +void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) { - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL_GPL(vmalloc_huge); +EXPORT_SYMBOL_GPL(vmalloc_huge_noprof); /** * vzalloc - allocate virtually contiguous memory with zero fill @@ -3453,12 +3956,12 @@ EXPORT_SYMBOL_GPL(vmalloc_huge); * * Return: pointer to the allocated memory or %NULL on error */ -void *vzalloc(unsigned long size) +void *vzalloc_noprof(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, + return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vzalloc); +EXPORT_SYMBOL(vzalloc_noprof); /** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace @@ -3469,14 +3972,14 @@ EXPORT_SYMBOL(vzalloc); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_user(unsigned long size) +void *vmalloc_user_noprof(unsigned long size) { - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc_user); +EXPORT_SYMBOL(vmalloc_user_noprof); /** * vmalloc_node - allocate memory on a specific node @@ -3491,12 +3994,12 @@ EXPORT_SYMBOL(vmalloc_user); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_node(unsigned long size, int node) +void *vmalloc_node_noprof(unsigned long size, int node) { - return __vmalloc_node(size, 1, GFP_KERNEL, node, + return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc_node); +EXPORT_SYMBOL(vmalloc_node_noprof); /** * vzalloc_node - allocate memory on a specific node with zero fill @@ -3509,12 +4012,12 @@ EXPORT_SYMBOL(vmalloc_node); * * Return: pointer to the allocated memory or %NULL on error */ -void *vzalloc_node(unsigned long size, int node) +void *vzalloc_node_noprof(unsigned long size, int node) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, + return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, 
node, __builtin_return_address(0)); } -EXPORT_SYMBOL(vzalloc_node); +EXPORT_SYMBOL(vzalloc_node_noprof); #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) @@ -3537,12 +4040,12 @@ EXPORT_SYMBOL(vzalloc_node); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_32(unsigned long size) +void *vmalloc_32_noprof(unsigned long size) { - return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, + return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc_32); +EXPORT_SYMBOL(vmalloc_32_noprof); /** * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory @@ -3553,14 +4056,14 @@ EXPORT_SYMBOL(vmalloc_32); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_32_user(unsigned long size) +void *vmalloc_32_user_noprof(unsigned long size) { - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc_32_user); +EXPORT_SYMBOL(vmalloc_32_user_noprof); /* * Atomically zero bytes in the iterator. @@ -3741,10 +4244,12 @@ finished: */ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) { + struct vmap_node *vn; struct vmap_area *va; struct vm_struct *vm; char *vaddr; size_t n, size, flags, remains; + unsigned long next; addr = kasan_reset_tag(addr); @@ -3754,16 +4259,15 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) remains = count; - spin_lock(&vmap_area_lock); - va = find_vmap_area_exceed_addr((unsigned long)addr); - if (!va) + vn = find_vmap_area_exceed_addr_lock((unsigned long) addr, &va); + if (!vn) goto finished_zero; /* no intersects with alive vmap_area */ if ((unsigned long)addr + remains <= va->va_start) goto finished_zero; - list_for_each_entry_from(va, &vmap_area_list, list) { + do { size_t copied; if (remains == 0) @@ -3778,10 +4282,10 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) WARN_ON(flags == VMAP_BLOCK); if (!vm && !flags) - continue; + goto next_va; if (vm && (vm->flags & VM_UNINITIALIZED)) - continue; + goto next_va; /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ smp_rmb(); @@ -3790,7 +4294,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) size = vm ? 
get_vm_area_size(vm) : va_size(va); if (addr >= vaddr + size) - continue; + goto next_va; if (addr < vaddr) { size_t to_zero = min_t(size_t, vaddr - addr, remains); @@ -3809,9 +4313,9 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) if (flags & VMAP_RAM) copied = vmap_ram_vread_iter(iter, addr, n, flags); - else if (!(vm && (vm->flags & VM_IOREMAP))) + else if (!(vm && (vm->flags & (VM_IOREMAP | VM_SPARSE)))) copied = aligned_vread_iter(iter, addr, n); - else /* IOREMAP area is treated as memory hole */ + else /* IOREMAP | SPARSE area is treated as memory hole */ copied = zero_iter(iter, n); addr += copied; @@ -3819,15 +4323,22 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) if (copied != n) goto finished; - } + + next_va: + next = va->va_end; + spin_unlock(&vn->busy.lock); + } while ((vn = find_vmap_area_exceed_addr_lock(next, &va))); finished_zero: - spin_unlock(&vmap_area_lock); + if (vn) + spin_unlock(&vn->busy.lock); + /* zero-fill memory holes */ return count - remains + zero_iter(iter, remains); finished: /* Nothing remains, or We couldn't copy/zero everything. */ - spin_unlock(&vmap_area_lock); + if (vn) + spin_unlock(&vn->busy.lock); return count - remains; } @@ -4140,9 +4651,8 @@ retry: /* It is a BUG(), but trigger recovery instead. */ goto recovery; - ret = adjust_va_to_fit_type(&free_vmap_area_root, - &free_vmap_area_list, - va, start, size); + ret = va_clip(&free_vmap_area_root, + &free_vmap_area_list, va, start, size); if (WARN_ON_ONCE(unlikely(ret))) /* It is a BUG(), but trigger recovery instead. */ goto recovery; @@ -4162,14 +4672,15 @@ retry: } /* insert all vm's */ - spin_lock(&vmap_area_lock); for (area = 0; area < nr_vms; area++) { - insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list); + struct vmap_node *vn = addr_to_node(vas[area]->va_start); - setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC, + spin_lock(&vn->busy.lock); + insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head); + setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC, pcpu_get_vm_areas); + spin_unlock(&vn->busy.lock); } - spin_unlock(&vmap_area_lock); /* * Mark allocated areas as accessible. 
Do it now as a best-effort @@ -4278,60 +4789,39 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) #ifdef CONFIG_PRINTK bool vmalloc_dump_obj(void *object) { - void *objp = (void *)PAGE_ALIGN((unsigned long)object); const void *caller; struct vm_struct *vm; struct vmap_area *va; + struct vmap_node *vn; unsigned long addr; unsigned int nr_pages; - if (!spin_trylock(&vmap_area_lock)) + addr = PAGE_ALIGN((unsigned long) object); + vn = addr_to_node(addr); + + if (!spin_trylock(&vn->busy.lock)) return false; - va = __find_vmap_area((unsigned long)objp, &vmap_area_root); - if (!va) { - spin_unlock(&vmap_area_lock); + + va = __find_vmap_area(addr, &vn->busy.root); + if (!va || !va->vm) { + spin_unlock(&vn->busy.lock); return false; } vm = va->vm; - if (!vm) { - spin_unlock(&vmap_area_lock); - return false; - } - addr = (unsigned long)vm->addr; + addr = (unsigned long) vm->addr; caller = vm->caller; nr_pages = vm->nr_pages; - spin_unlock(&vmap_area_lock); + spin_unlock(&vn->busy.lock); + pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", nr_pages, addr, caller); + return true; } #endif #ifdef CONFIG_PROC_FS -static void *s_start(struct seq_file *m, loff_t *pos) - __acquires(&vmap_purge_lock) - __acquires(&vmap_area_lock) -{ - mutex_lock(&vmap_purge_lock); - spin_lock(&vmap_area_lock); - - return seq_list_start(&vmap_area_list, *pos); -} - -static void *s_next(struct seq_file *m, void *p, loff_t *pos) -{ - return seq_list_next(p, &vmap_area_list, pos); -} - -static void s_stop(struct seq_file *m, void *p) - __releases(&vmap_area_lock) - __releases(&vmap_purge_lock) -{ - spin_unlock(&vmap_area_lock); - mutex_unlock(&vmap_purge_lock); -} - static void show_numa_info(struct seq_file *m, struct vm_struct *v) { if (IS_ENABLED(CONFIG_NUMA)) { @@ -4358,102 +4848,237 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) static void show_purge_info(struct seq_file *m) { + struct vmap_node *vn; struct vmap_area *va; + int i; - spin_lock(&purge_vmap_area_lock); - list_for_each_entry(va, &purge_vmap_area_list, list) { - seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", - (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); + for (i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; + + spin_lock(&vn->lazy.lock); + list_for_each_entry(va, &vn->lazy.head, list) { + seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start); + } + spin_unlock(&vn->lazy.lock); } - spin_unlock(&purge_vmap_area_lock); } -static int s_show(struct seq_file *m, void *p) +static int vmalloc_info_show(struct seq_file *m, void *p) { + struct vmap_node *vn; struct vmap_area *va; struct vm_struct *v; + int i; - va = list_entry(p, struct vmap_area, list); + for (i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; - if (!va->vm) { - if (va->flags & VMAP_RAM) - seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", - (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); + spin_lock(&vn->busy.lock); + list_for_each_entry(va, &vn->busy.head, list) { + if (!va->vm) { + if (va->flags & VMAP_RAM) + seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start); - goto final; - } + continue; + } - v = va->vm; + v = va->vm; - seq_printf(m, "0x%pK-0x%pK %7ld", - v->addr, v->addr + v->size, v->size); + seq_printf(m, "0x%pK-0x%pK %7ld", + v->addr, v->addr + v->size, v->size); - if (v->caller) - seq_printf(m, " %pS", v->caller); + if (v->caller) 
+ seq_printf(m, " %pS", v->caller); - if (v->nr_pages) - seq_printf(m, " pages=%d", v->nr_pages); + if (v->nr_pages) + seq_printf(m, " pages=%d", v->nr_pages); - if (v->phys_addr) - seq_printf(m, " phys=%pa", &v->phys_addr); + if (v->phys_addr) + seq_printf(m, " phys=%pa", &v->phys_addr); - if (v->flags & VM_IOREMAP) - seq_puts(m, " ioremap"); + if (v->flags & VM_IOREMAP) + seq_puts(m, " ioremap"); - if (v->flags & VM_ALLOC) - seq_puts(m, " vmalloc"); + if (v->flags & VM_SPARSE) + seq_puts(m, " sparse"); - if (v->flags & VM_MAP) - seq_puts(m, " vmap"); + if (v->flags & VM_ALLOC) + seq_puts(m, " vmalloc"); - if (v->flags & VM_USERMAP) - seq_puts(m, " user"); + if (v->flags & VM_MAP) + seq_puts(m, " vmap"); - if (v->flags & VM_DMA_COHERENT) - seq_puts(m, " dma-coherent"); + if (v->flags & VM_USERMAP) + seq_puts(m, " user"); - if (is_vmalloc_addr(v->pages)) - seq_puts(m, " vpages"); + if (v->flags & VM_DMA_COHERENT) + seq_puts(m, " dma-coherent"); - show_numa_info(m, v); - seq_putc(m, '\n'); + if (is_vmalloc_addr(v->pages)) + seq_puts(m, " vpages"); + + show_numa_info(m, v); + seq_putc(m, '\n'); + } + spin_unlock(&vn->busy.lock); + } /* * As a final step, dump "unpurged" areas. */ -final: - if (list_is_last(&va->list, &vmap_area_list)) - show_purge_info(m); - + show_purge_info(m); return 0; } -static const struct seq_operations vmalloc_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, -}; - static int __init proc_vmalloc_init(void) { + void *priv_data = NULL; + if (IS_ENABLED(CONFIG_NUMA)) - proc_create_seq_private("vmallocinfo", 0400, NULL, - &vmalloc_op, - nr_node_ids * sizeof(unsigned int), NULL); - else - proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op); + priv_data = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); + + proc_create_single_data("vmallocinfo", + 0400, NULL, vmalloc_info_show, priv_data); + return 0; } module_init(proc_vmalloc_init); #endif +static void __init vmap_init_free_space(void) +{ + unsigned long vmap_start = 1; + const unsigned long vmap_end = ULONG_MAX; + struct vmap_area *free; + struct vm_struct *busy; + + /* + * B F B B B F + * -|-----|.....|-----|-----|-----|.....|- + * | The KVA space | + * |<--------------------------------->| + */ + for (busy = vmlist; busy; busy = busy->next) { + if ((unsigned long) busy->addr - vmap_start > 0) { + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (!WARN_ON_ONCE(!free)) { + free->va_start = vmap_start; + free->va_end = (unsigned long) busy->addr; + + insert_vmap_area_augment(free, NULL, + &free_vmap_area_root, + &free_vmap_area_list); + } + } + + vmap_start = (unsigned long) busy->addr + busy->size; + } + + if (vmap_end - vmap_start > 0) { + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (!WARN_ON_ONCE(!free)) { + free->va_start = vmap_start; + free->va_end = vmap_end; + + insert_vmap_area_augment(free, NULL, + &free_vmap_area_root, + &free_vmap_area_list); + } + } +} + +static void vmap_init_nodes(void) +{ + struct vmap_node *vn; + int i, n; + +#if BITS_PER_LONG == 64 + /* + * A high threshold of max nodes is fixed and bound to 128, + * thus a scale factor is 1 for systems where number of cores + * are less or equal to specified threshold. + * + * As for NUMA-aware notes. For bigger systems, for example + * NUMA with multi-sockets, where we can end-up with thousands + * of cores in total, a "sub-numa-clustering" should be added. 
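/*
 * Aside on the per-node layout introduced above: the single vmap_area_lock
 * is replaced by per-node "busy"/"lazy" sets, and callers such as
 * insert_vmap_area() now pick their node with addr_to_node().  The exact
 * hash is not visible in this hunk, so the stand-alone sketch below only
 * illustrates the idea of striping fixed 16-page zones across the node
 * array; every name in it is illustrative rather than the kernel's.
 */
#include <stdio.h>

#define PAGE_SIZE       4096UL
#define NODE_ZONE_SIZE  ((1UL << 4) * PAGE_SIZE)   /* 16-page partition, as set above */

static unsigned int nr_nodes = 4;                  /* stands in for nr_vmap_nodes */

static unsigned int addr_to_node_idx(unsigned long addr)
{
        /* consecutive 16-page zones are spread round-robin over the nodes */
        return (addr / NODE_ZONE_SIZE) % nr_nodes;
}

int main(void)
{
        unsigned long addr = 0xffffc90000080000UL; /* arbitrary vmalloc-range address */

        printf("address %#lx -> node %u\n", addr, addr_to_node_idx(addr));
        return 0;
}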
+ * + * In this case a NUMA domain is considered as a single entity + * with dedicated sub-nodes in it which describe one group or + * set of cores. Therefore a per-domain purging is supposed to + * be added as well as a per-domain balancing. + */ + n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); + + if (n > 1) { + vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN); + if (vn) { + /* Node partition is 16 pages. */ + vmap_zone_size = (1 << 4) * PAGE_SIZE; + nr_vmap_nodes = n; + vmap_nodes = vn; + } else { + pr_err("Failed to allocate an array. Disable a node layer\n"); + } + } +#endif + + for (n = 0; n < nr_vmap_nodes; n++) { + vn = &vmap_nodes[n]; + vn->busy.root = RB_ROOT; + INIT_LIST_HEAD(&vn->busy.head); + spin_lock_init(&vn->busy.lock); + + vn->lazy.root = RB_ROOT; + INIT_LIST_HEAD(&vn->lazy.head); + spin_lock_init(&vn->lazy.lock); + + for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { + INIT_LIST_HEAD(&vn->pool[i].head); + WRITE_ONCE(vn->pool[i].len, 0); + } + + spin_lock_init(&vn->pool_lock); + } +} + +static unsigned long +vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long count; + struct vmap_node *vn; + int i, j; + + for (count = 0, i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; + + for (j = 0; j < MAX_VA_SIZE_PAGES; j++) + count += READ_ONCE(vn->pool[j].len); + } + + return count ? count : SHRINK_EMPTY; +} + +static unsigned long +vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + int i; + + for (i = 0; i < nr_vmap_nodes; i++) + decay_va_pool_node(&vmap_nodes[i], true); + + return SHRINK_STOP; +} + void __init vmalloc_init(void) { + struct shrinker *vmap_node_shrinker; struct vmap_area *va; + struct vmap_node *vn; struct vm_struct *tmp; int i; @@ -4475,6 +5100,11 @@ void __init vmalloc_init(void) xa_init(&vbq->vmap_blocks); } + /* + * Setup nodes before importing vmlist. + */ + vmap_init_nodes(); + /* Import existing vmlist entries. */ for (tmp = vmlist; tmp; tmp = tmp->next) { va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); @@ -4484,7 +5114,9 @@ void __init vmalloc_init(void) va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; va->vm = tmp; - insert_vmap_area(va, &vmap_area_root, &vmap_area_list); + + vn = addr_to_node(va->va_start); + insert_vmap_area(va, &vn->busy.root, &vn->busy.head); } /* @@ -4492,4 +5124,14 @@ void __init vmalloc_init(void) */ vmap_init_free_space(); vmap_initialized = true; + + vmap_node_shrinker = shrinker_alloc(0, "vmap-node"); + if (!vmap_node_shrinker) { + pr_err("Failed to allocate vmap-node shrinker!\n"); + return; + } + + vmap_node_shrinker->count_objects = vmap_node_shrink_count; + vmap_node_shrinker->scan_objects = vmap_node_shrink_scan; + shrinker_register(vmap_node_shrinker); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 4255619a1a31..d55e8d07ffc4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -108,6 +108,12 @@ struct scan_control { /* Can folios be swapped as part of reclaim? */ unsigned int may_swap:1; + /* Not allow cache_trim_mode to be turned on as part of reclaim? */ + unsigned int no_cache_trim_mode:1; + + /* Has cache_trim_mode failed at least once? 
*/ + unsigned int cache_trim_mode_failed:1; + /* Proactive reclaim invoked by userspace through memory.reclaim */ unsigned int proactive:1; @@ -961,7 +967,8 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT, .nid = target_nid, - .nmask = &allowed_mask + .nmask = &allowed_mask, + .reason = MR_DEMOTION, }; if (list_empty(demote_folios)) @@ -1006,14 +1013,15 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, struct pglist_data *pgdat, struct scan_control *sc, struct reclaim_stat *stat, bool ignore_references) { + struct folio_batch free_folios; LIST_HEAD(ret_folios); - LIST_HEAD(free_folios); LIST_HEAD(demote_folios); unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; bool do_demote_pass; struct swap_iocb *plug = NULL; + folio_batch_init(&free_folios); memset(stat, 0, sizeof(*stat)); cond_resched(); do_demote_pass = can_demote(pgdat->node_id, sc); @@ -1198,25 +1206,28 @@ retry: if (!can_split_folio(folio, NULL)) goto activate_locked; /* - * Split folios without a PMD map right - * away. Chances are some or all of the - * tail pages can be freed without IO. + * Split partially mapped folios right away. + * We can free the unmapped pages without IO. */ - if (!folio_entire_mapcount(folio) && - split_folio_to_list(folio, - folio_list)) + if (data_race(!list_empty(&folio->_deferred_list)) && + split_folio_to_list(folio, folio_list)) goto activate_locked; } if (!add_to_swap(folio)) { + int __maybe_unused order = folio_order(folio); + if (!folio_test_large(folio)) goto activate_locked_split; /* Fallback to swap normal pages */ - if (split_folio_to_list(folio, - folio_list)) + if (split_folio_to_list(folio, folio_list)) goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1); - count_vm_event(THP_SWPOUT_FALLBACK); + if (nr_pages >= HPAGE_PMD_NR) { + count_memcg_folio_events(folio, + THP_SWPOUT_FALLBACK, 1); + count_vm_event(THP_SWPOUT_FALLBACK); + } + count_mthp_stat(order, MTHP_STAT_ANON_SWPOUT_FALLBACK); #endif if (!add_to_swap(folio)) goto activate_locked_split; @@ -1249,6 +1260,20 @@ retry: if (folio_test_pmd_mappable(folio)) flags |= TTU_SPLIT_HUGE_PMD; + /* + * Without TTU_SYNC, try_to_unmap will only begin to + * hold PTL from the first present PTE within a large + * folio. Some initial PTEs might be skipped due to + * races with parallel PTE writes in which PTEs can be + * cleared temporarily before being written new present + * values. This will lead to a large folio is still + * mapped while some subpages have been partially + * unmapped after try_to_unmap; TTU_SYNC helps + * try_to_unmap acquire PTL from the first PTE, + * eliminating the influence of temporary PTE values. + */ + if (folio_test_large(folio) && list_empty(&folio->_deferred_list)) + flags |= TTU_SYNC; try_to_unmap(folio, flags); if (folio_mapped(folio)) { @@ -1412,14 +1437,14 @@ free_it: */ nr_reclaimed += nr_pages; - /* - * Is there need to periodically free_folio_list? 
It would - * appear not as the counts should be low - */ - if (unlikely(folio_test_large(folio))) - destroy_large_folio(folio); - else - list_add(&folio->lru, &free_folios); + if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); + if (folio_batch_add(&free_folios, folio) == 0) { + mem_cgroup_uncharge_folios(&free_folios); + try_to_unmap_flush(); + free_unref_folios(&free_folios); + } continue; activate_locked_split: @@ -1483,9 +1508,9 @@ keep: pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; - mem_cgroup_uncharge_list(&free_folios); + mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); - free_unref_page_list(&free_folios); + free_unref_folios(&free_folios); list_splice(&ret_folios, folio_list); count_vm_events(PGACTIVATE, pgactivate); @@ -1744,17 +1769,17 @@ bool folio_isolate_lru(struct folio *folio) * the LRU list will go small and be scanned faster than necessary, leading to * unnecessary swapping, thrashing and OOM. */ -static int too_many_isolated(struct pglist_data *pgdat, int file, +static bool too_many_isolated(struct pglist_data *pgdat, int file, struct scan_control *sc) { unsigned long inactive, isolated; bool too_many; if (current_is_kswapd()) - return 0; + return false; if (!writeback_throttling_sane(sc)) - return 0; + return false; if (file) { inactive = node_page_state(pgdat, NR_INACTIVE_FILE); @@ -1783,7 +1808,6 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, /* * move_folios_to_lru() moves folios from private @list to appropriate LRU list. - * On return, @list is reused as a list of folios to be freed by the caller. * * Returns the number of pages moved to the given lruvec. */ @@ -1791,8 +1815,9 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, struct list_head *list) { int nr_pages, nr_moved = 0; - LIST_HEAD(folios_to_free); + struct folio_batch free_folios; + folio_batch_init(&free_folios); while (!list_empty(list)) { struct folio *folio = lru_to_folio(list); @@ -1821,12 +1846,15 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, if (unlikely(folio_put_testzero(folio))) { __folio_clear_lru_flags(folio); - if (unlikely(folio_test_large(folio))) { + if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); + if (folio_batch_add(&free_folios, folio) == 0) { spin_unlock_irq(&lruvec->lru_lock); - destroy_large_folio(folio); + mem_cgroup_uncharge_folios(&free_folios); + free_unref_folios(&free_folios); spin_lock_irq(&lruvec->lru_lock); - } else - list_add(&folio->lru, &folios_to_free); + } continue; } @@ -1843,10 +1871,12 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, workingset_age_nonresident(lruvec, nr_pages); } - /* - * To save our caller's stack, now use input list for pages to free. 
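/*
 * The reclaim paths in this hunk stop chaining folios on open-coded lists
 * and fill a struct folio_batch instead, flushing with
 * mem_cgroup_uncharge_folios()/free_unref_folios() whenever
 * folio_batch_add() reports the batch is full, plus one final flush on the
 * way out.  A minimal stand-alone sketch of that accumulate-and-flush shape
 * (the fixed size and the names here are illustrative, not the kernel's types):
 */
#include <stdio.h>

#define BATCH_SIZE 31                   /* hypothetical; the kernel batch is small and fixed */

struct batch {
        unsigned int nr;
        void *items[BATCH_SIZE];
};

/* returns the slots left after adding; 0 means "full, flush now" */
static unsigned int batch_add(struct batch *b, void *item)
{
        b->items[b->nr++] = item;
        return BATCH_SIZE - b->nr;
}

static void batch_flush(struct batch *b)
{
        printf("flushing %u items\n", b->nr);
        b->nr = 0;
}

int main(void)
{
        struct batch b = { .nr = 0 };
        int i;

        for (i = 0; i < 100; i++)
                if (batch_add(&b, NULL) == 0)
                        batch_flush(&b);        /* mirrors the "== 0" checks in the hunk */

        if (b.nr)
                batch_flush(&b);                /* final flush at function exit */
        return 0;
}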
- */ - list_splice(&folios_to_free, list); + if (free_folios.nr) { + spin_unlock_irq(&lruvec->lru_lock); + mem_cgroup_uncharge_folios(&free_folios); + free_unref_folios(&free_folios); + spin_lock_irq(&lruvec->lru_lock); + } return nr_moved; } @@ -1925,8 +1955,6 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, spin_unlock_irq(&lruvec->lru_lock); lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); - mem_cgroup_uncharge_list(&folio_list); - free_unref_page_list(&folio_list); /* * If dirty folios are scanned that are not queued for IO, it @@ -1998,7 +2026,7 @@ static void shrink_active_list(unsigned long nr_to_scan, LIST_HEAD(l_inactive); unsigned nr_deactivate, nr_activate; unsigned nr_rotated = 0; - int file = is_file_lru(lru); + bool file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); lru_add_drain(); @@ -2067,8 +2095,6 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_activate = move_folios_to_lru(lruvec, &l_active); nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); - /* Keep all free folios in l_active list */ - list_splice(&l_inactive, &l_active); __count_vm_events(PGDEACTIVATE, nr_deactivate); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); @@ -2078,8 +2104,6 @@ static void shrink_active_list(unsigned long nr_to_scan, if (nr_rotated) lru_note_cost(lruvec, file, 0, nr_rotated); - mem_cgroup_uncharge_list(&l_active); - free_unref_page_list(&l_active); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } @@ -2098,7 +2122,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list, .no_demotion = 1, }; - nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false); + nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, true); while (!list_empty(folio_list)) { folio = lru_to_folio(folio_list); list_del(&folio->lru); @@ -2269,7 +2293,8 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) * anonymous pages. 
*/ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); - if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) + if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE) && + !sc->no_cache_trim_mode) sc->cache_trim_mode = 1; else sc->cache_trim_mode = 0; @@ -2412,7 +2437,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, denominator = ap + fp; out: for_each_evictable_lru(lru) { - int file = is_file_lru(lru); + bool file = is_file_lru(lru); unsigned long lruvec_size; unsigned long low, min; unsigned long scan; @@ -2879,38 +2904,37 @@ static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) #endif -static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) +static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last) { int i; int hist; + struct lruvec *lruvec = walk->lruvec; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); - if (walk) { - hist = lru_hist_from_seq(walk->max_seq); + hist = lru_hist_from_seq(walk->seq); - for (i = 0; i < NR_MM_STATS; i++) { - WRITE_ONCE(mm_state->stats[hist][i], - mm_state->stats[hist][i] + walk->mm_stats[i]); - walk->mm_stats[i] = 0; - } + for (i = 0; i < NR_MM_STATS; i++) { + WRITE_ONCE(mm_state->stats[hist][i], + mm_state->stats[hist][i] + walk->mm_stats[i]); + walk->mm_stats[i] = 0; } if (NR_HIST_GENS > 1 && last) { - hist = lru_hist_from_seq(mm_state->seq + 1); + hist = lru_hist_from_seq(walk->seq + 1); for (i = 0; i < NR_MM_STATS; i++) WRITE_ONCE(mm_state->stats[hist][i], 0); } } -static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, - struct mm_struct **iter) +static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **iter) { bool first = false; bool last = false; struct mm_struct *mm = NULL; + struct lruvec *lruvec = walk->lruvec; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); @@ -2927,9 +2951,9 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, */ spin_lock(&mm_list->lock); - VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); + VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->seq); - if (walk->max_seq <= mm_state->seq) + if (walk->seq <= mm_state->seq) goto done; if (!mm_state->head) @@ -2954,12 +2978,12 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, } while (!(mm = get_next_mm(walk))); done: if (*iter || last) - reset_mm_stats(lruvec, walk, last); + reset_mm_stats(walk, last); spin_unlock(&mm_list->lock); if (mm && first) - reset_bloom_filter(mm_state, walk->max_seq + 1); + reset_bloom_filter(mm_state, walk->seq + 1); if (*iter) mmput_async(*iter); @@ -2969,7 +2993,7 @@ done: return last; } -static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) +static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long seq) { bool success = false; struct mem_cgroup *memcg = lruvec_memcg(lruvec); @@ -2978,13 +3002,12 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) spin_lock(&mm_list->lock); - VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); + VM_WARN_ON_ONCE(mm_state->seq + 1 < seq); - if (max_seq > mm_state->seq) { + if (seq > mm_state->seq) { mm_state->head = NULL; mm_state->tail = NULL; WRITE_ONCE(mm_state->seq, mm_state->seq + 1); - reset_mm_stats(lruvec, NULL, true); success = true; } @@ 
-3159,9 +3182,10 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, walk->nr_pages[new_gen][type][zone] += delta; } -static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) +static void reset_batch_size(struct lru_gen_mm_walk *walk) { int gen, type, zone; + struct lruvec *lruvec = walk->lruvec; struct lru_gen_folio *lrugen = &lruvec->lrugen; walk->batched = 0; @@ -3331,7 +3355,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); - int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); + DEFINE_MAX_SEQ(walk->lruvec); + int old_gen, new_gen = lru_gen_from_seq(max_seq); pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl); if (!pte) @@ -3398,7 +3423,8 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); - int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); + DEFINE_MAX_SEQ(walk->lruvec); + int old_gen, new_gen = lru_gen_from_seq(max_seq); VM_WARN_ON_ONCE(pud_leaf(*pud)); @@ -3529,7 +3555,7 @@ restart: walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); } - if (!walk->force_scan && !test_bloom_filter(mm_state, walk->max_seq, pmd + i)) + if (!walk->force_scan && !test_bloom_filter(mm_state, walk->seq, pmd + i)) continue; walk->mm_stats[MM_NONLEAF_FOUND]++; @@ -3540,7 +3566,7 @@ restart: walk->mm_stats[MM_NONLEAF_ADDED]++; /* carry over to the next generation */ - update_bloom_filter(mm_state, walk->max_seq + 1, pmd + i); + update_bloom_filter(mm_state, walk->seq + 1, pmd + i); } walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); @@ -3591,7 +3617,7 @@ done: return -EAGAIN; } -static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk) +static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) { static const struct mm_walk_ops mm_walk_ops = { .test_walk = should_skip_vma, @@ -3600,6 +3626,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ }; int err; + struct lruvec *lruvec = walk->lruvec; struct mem_cgroup *memcg = lruvec_memcg(lruvec); walk->next_addr = FIRST_USER_ADDRESS; @@ -3610,7 +3637,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ err = -EBUSY; /* another thread might have called inc_max_seq() */ - if (walk->max_seq != max_seq) + if (walk->seq != max_seq) break; /* folio_update_gen() requires stable folio_memcg() */ @@ -3628,7 +3655,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ if (walk->batched) { spin_lock_irq(&lruvec->lru_lock); - reset_batch_size(lruvec, walk); + reset_batch_size(walk); spin_unlock_irq(&lruvec->lru_lock); } @@ -3747,7 +3774,7 @@ next: return success; } -static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, +static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, bool can_swap, bool force_scan) { bool success; @@ -3755,14 +3782,14 @@ static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, int type, zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; restart: - if (max_seq < READ_ONCE(lrugen->max_seq)) + if (seq < READ_ONCE(lrugen->max_seq)) return false; spin_lock_irq(&lruvec->lru_lock); 
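/*
 * The seq handling in inc_max_seq() follows a snapshot-then-recheck shape:
 * bail out locklessly when the caller's seq snapshot is already stale, then
 * re-test it under lru_lock so that only one of several racing callers
 * advances max_seq for a given snapshot.  A stand-alone sketch of that
 * shape, with a pthread mutex standing in for the spinlock (names are
 * illustrative; the lockless read is shown without the READ_ONCE()
 * annotation the kernel uses):
 */
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long max_seq = 4;

static bool inc_seq_if_current(unsigned long seq)
{
        bool success;

        if (seq < max_seq)              /* cheap lockless early-out */
                return false;

        pthread_mutex_lock(&lock);
        success = (seq == max_seq);     /* recheck under the lock */
        if (success)
                max_seq++;              /* only one racer bumps it for this seq */
        pthread_mutex_unlock(&lock);

        return success;
}

int main(void)
{
        return inc_seq_if_current(4) ? 0 : 1;
}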
VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); - success = max_seq == lrugen->max_seq; + success = seq == lrugen->max_seq; if (!success) goto unlock; @@ -3815,8 +3842,8 @@ unlock: return success; } -static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, - struct scan_control *sc, bool can_swap, bool force_scan) +static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, + bool can_swap, bool force_scan) { bool success; struct lru_gen_mm_walk *walk; @@ -3824,13 +3851,13 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, struct lru_gen_folio *lrugen = &lruvec->lrugen; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); - VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); + VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq)); if (!mm_state) - return inc_max_seq(lruvec, max_seq, can_swap, force_scan); + return inc_max_seq(lruvec, seq, can_swap, force_scan); /* see the comment in iterate_mm_list() */ - if (max_seq <= READ_ONCE(mm_state->seq)) + if (seq <= READ_ONCE(mm_state->seq)) return false; /* @@ -3840,29 +3867,29 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, * is less efficient, but it avoids bursty page faults. */ if (!should_walk_mmu()) { - success = iterate_mm_list_nowalk(lruvec, max_seq); + success = iterate_mm_list_nowalk(lruvec, seq); goto done; } walk = set_mm_walk(NULL, true); if (!walk) { - success = iterate_mm_list_nowalk(lruvec, max_seq); + success = iterate_mm_list_nowalk(lruvec, seq); goto done; } walk->lruvec = lruvec; - walk->max_seq = max_seq; + walk->seq = seq; walk->can_swap = can_swap; walk->force_scan = force_scan; do { - success = iterate_mm_list(lruvec, walk, &mm); + success = iterate_mm_list(walk, &mm); if (mm) - walk_mm(lruvec, mm, walk); + walk_mm(mm, walk); } while (mm); done: if (success) { - success = inc_max_seq(lruvec, max_seq, can_swap, force_scan); + success = inc_max_seq(lruvec, seq, can_swap, force_scan); WARN_ON_ONCE(!success); } @@ -4287,7 +4314,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca { bool success; - /* swapping inhibited */ + /* swap constrained */ if (!(sc->gfp_mask & __GFP_IO) && (folio_test_dirty(folio) || (folio_test_anon(folio) && !folio_test_swapcache(folio)))) @@ -4456,9 +4483,12 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw DEFINE_MIN_SEQ(lruvec); /* - * Try to make the obvious choice first. When anon and file are both - * available from the same generation, interpret swappiness 1 as file - * first and 200 as anon first. + * Try to make the obvious choice first, and if anon and file are both + * available from the same generation, + * 1. Interpret swappiness 1 as file first and MAX_SWAPPINESS as anon + * first. + * 2. If !__GFP_IO, file first since clean pagecache is more likely to + * exist than clean swapcache. 
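/*
 * A condensed restatement of the selection rules described in the comment
 * above, as a worked example: swappiness 0 never scans anon, swappiness 1
 * prefers file, MAX_SWAPPINESS (200 in this hunk) prefers anon, and without
 * __GFP_IO file is preferred because clean page cache is more likely to
 * exist than clean swap cache.  The sketch omits the min_seq comparison and
 * the get_type_to_scan() fallback; all names are illustrative.
 */
#include <stdbool.h>
#include <stdio.h>

enum scan_type { SCAN_FILE, SCAN_ANON, SCAN_EITHER };

static enum scan_type pick_type(int swappiness, bool can_do_io)
{
        if (swappiness == 0 || swappiness == 1)
                return SCAN_FILE;
        if (swappiness == 200)
                return SCAN_ANON;
        if (!can_do_io)
                return SCAN_FILE;
        return SCAN_EITHER;             /* cost-based choice in the real code */
}

int main(void)
{
        printf("%d %d %d\n",
               pick_type(1, true), pick_type(200, true), pick_type(60, false));
        return 0;
}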
*/ if (!swappiness) type = LRU_GEN_FILE; @@ -4468,6 +4498,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw type = LRU_GEN_FILE; else if (swappiness == 200) type = LRU_GEN_ANON; + else if (!(sc->gfp_mask & __GFP_IO)) + type = LRU_GEN_FILE; else type = get_type_to_scan(lruvec, swappiness, &tier); @@ -4558,8 +4590,10 @@ retry: move_folios_to_lru(lruvec, &list); walk = current->reclaim_state->mm_walk; - if (walk && walk->batched) - reset_batch_size(lruvec, walk); + if (walk && walk->batched) { + walk->lruvec = lruvec; + reset_batch_size(walk); + } item = PGSTEAL_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) @@ -4569,10 +4603,6 @@ retry: spin_unlock_irq(&lruvec->lru_lock); - mem_cgroup_uncharge_list(&list); - free_unref_page_list(&list); - - INIT_LIST_HEAD(&list); list_splice_init(&clean, &list); if (!list_empty(&list)) { @@ -4584,14 +4614,13 @@ retry: } static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, - struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) + bool can_swap, unsigned long *nr_to_scan) { int gen, type, zone; unsigned long old = 0; unsigned long young = 0; unsigned long total = 0; struct lru_gen_folio *lrugen = &lruvec->lrugen; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MIN_SEQ(lruvec); /* whether this lruvec is completely out of cold folios */ @@ -4619,13 +4648,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, } } - /* try to scrape all its memory if this memcg was deleted */ - if (!mem_cgroup_online(memcg)) { - *nr_to_scan = total; - return false; - } - - *nr_to_scan = total >> sc->priority; + *nr_to_scan = total; /* * The aging tries to be lazy to reduce the overhead, while the eviction @@ -4657,6 +4680,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, */ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) { + bool success; unsigned long nr_to_scan; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); @@ -4664,15 +4688,18 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) return -1; - if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) - return nr_to_scan; + success = should_run_aging(lruvec, max_seq, can_swap, &nr_to_scan); - /* skip the aging path at the default priority */ - if (sc->priority == DEF_PRIORITY) + /* try to scrape all its memory if this memcg was deleted */ + if (nr_to_scan && !mem_cgroup_online(memcg)) return nr_to_scan; - /* skip this lruvec as it's low on cold folios */ - return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0; + /* try to get away with not aging at the default priority */ + if (!success || sc->priority == DEF_PRIORITY) + return nr_to_scan >> sc->priority; + + /* stop scanning this lruvec as it's low on cold folios */ + return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? 
-1 : 0; } static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) @@ -4712,10 +4739,6 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) unsigned long scanned = 0; int swappiness = get_swappiness(lruvec, sc); - /* clean file folios are more likely to exist */ - if (swappiness && !(sc->gfp_mask & __GFP_IO)) - swappiness = 1; - while (true) { int delta; @@ -4878,7 +4901,6 @@ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control { int priority; unsigned long reclaimable; - struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) return; @@ -4888,7 +4910,7 @@ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control * where reclaimed_to_scanned_ratio = inactive / total. */ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); - if (get_swappiness(lruvec, sc)) + if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); /* round down reclaimable and round up sc->nr_to_reclaim */ @@ -5332,7 +5354,7 @@ static const struct seq_operations lru_gen_seq_ops = { .show = lru_gen_seq_show, }; -static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, +static int run_aging(struct lruvec *lruvec, unsigned long seq, bool can_swap, bool force_scan) { DEFINE_MAX_SEQ(lruvec); @@ -5347,7 +5369,7 @@ static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_contr if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) return -ERANGE; - try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan); + try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan); return 0; } @@ -5415,7 +5437,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, switch (cmd) { case '+': - err = run_aging(lruvec, seq, sc, swappiness, opt); + err = run_aging(lruvec, seq, swappiness, opt); break; case '-': err = run_eviction(lruvec, seq, sc, swappiness, opt); @@ -5987,6 +6009,8 @@ again: */ if (reclaimable) pgdat->kswapd_failures = 0; + else if (sc->cache_trim_mode) + sc->cache_trim_mode_failed = 1; } /* @@ -6799,6 +6823,7 @@ restart: bool raise_priority = true; bool balanced; bool ret; + bool was_frozen; sc.reclaim_idx = highest_zoneidx; @@ -6897,9 +6922,9 @@ restart: /* Check if kswapd should be suspending */ __fs_reclaim_release(_THIS_IP_); - ret = try_to_freeze(); + ret = kthread_freezable_should_stop(&was_frozen); __fs_reclaim_acquire(_THIS_IP_); - if (ret || kthread_should_stop()) + if (was_frozen || ret) break; /* @@ -6921,6 +6946,16 @@ restart: sc.priority--; } while (sc.priority >= 1); + /* + * Restart only if it went through the priority loop all the way, + * but cache_trim_mode didn't work. 
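/*
 * The restart added in the hunk below gives kswapd exactly one more full
 * pass with cache_trim_mode disabled: if a complete priority loop
 * (sc.priority < 1) reclaimed nothing while cache_trim_mode had failed at
 * least once, the heuristic is switched off and balancing is retried.  A
 * stand-alone sketch of that retry-once-without-the-heuristic shape (all
 * names illustrative):
 */
#include <stdbool.h>
#include <stdio.h>

/* stand-in for one full priority loop; returns pages reclaimed */
static unsigned long run_priority_loop(bool cache_trim_mode, bool *trim_failed)
{
        if (cache_trim_mode) {
                *trim_failed = true;    /* pretend the heuristic keeps failing */
                return 0;
        }
        return 128;                     /* without it, reclaim makes progress */
}

int main(void)
{
        bool no_cache_trim_mode = false;
        bool cache_trim_mode_failed = false;
        unsigned long reclaimed;

restart:
        reclaimed = run_priority_loop(!no_cache_trim_mode, &cache_trim_mode_failed);

        if (!reclaimed && !no_cache_trim_mode && cache_trim_mode_failed) {
                no_cache_trim_mode = true;      /* retry once with the heuristic off */
                goto restart;
        }

        printf("reclaimed %lu\n", reclaimed);
        return 0;
}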
+ */ + if (!sc.nr_reclaimed && sc.priority < 1 && + !sc.no_cache_trim_mode && sc.cache_trim_mode_failed) { + sc.no_cache_trim_mode = 1; + goto restart; + } + if (!sc.nr_reclaimed) pgdat->kswapd_failures++; @@ -7105,7 +7140,7 @@ static int kswapd(void *p) WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); atomic_set(&pgdat->nr_writeback_throttled, 0); for ( ; ; ) { - bool ret; + bool was_frozen; alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); highest_zoneidx = kswapd_highest_zoneidx(pgdat, @@ -7122,15 +7157,14 @@ kswapd_try_sleep: WRITE_ONCE(pgdat->kswapd_order, 0); WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); - ret = try_to_freeze(); - if (kthread_should_stop()) + if (kthread_freezable_should_stop(&was_frozen)) break; /* * We can speed up thawing tasks if we don't call balance_pgdat * after returning from the refrigerator */ - if (ret) + if (was_frozen) continue; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index db79935e4a54..8507c497218b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1242,6 +1242,9 @@ const char * const vmstat_text[] = { #endif "nr_page_table_pages", "nr_sec_page_table_pages", +#ifdef CONFIG_IOMMU_SUPPORT + "nr_iommu_pages", +#endif #ifdef CONFIG_SWAP "nr_swapcached", #endif diff --git a/mm/workingset.c b/mm/workingset.c index 226012974328..c22adb93622a 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -16,6 +16,7 @@ #include <linux/dax.h> #include <linux/fs.h> #include <linux/mm.h> +#include "internal.h" /* * Double CLOCK lists @@ -617,6 +618,7 @@ struct list_lru shadow_nodes; void workingset_update_node(struct xa_node *node) { struct address_space *mapping; + struct page *page = virt_to_page(node); /* * Track non-empty nodes that contain only shadow entries; @@ -632,12 +634,12 @@ void workingset_update_node(struct xa_node *node) if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add_obj(&shadow_nodes, &node->private_list); - __inc_lruvec_kmem_state(node, WORKINGSET_NODES); + __inc_node_page_state(page, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { list_lru_del_obj(&shadow_nodes, &node->private_list); - __dec_lruvec_kmem_state(node, WORKINGSET_NODES); + __dec_node_page_state(page, WORKINGSET_NODES); } } } @@ -741,7 +743,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } list_lru_isolate(lru, item); - __dec_lruvec_kmem_state(node, WORKINGSET_NODES); + __dec_node_page_state(virt_to_page(node), WORKINGSET_NODES); spin_unlock(lru_lock); diff --git a/mm/z3fold.c b/mm/z3fold.c index 7c76b396b74c..2ebfed32871b 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -364,8 +364,9 @@ static inline int __idx(struct z3fold_header *zhdr, enum buddy bud) } /* - * Encodes the handle of a particular buddy within a z3fold page - * Pool lock should be held as this function accesses first_num + * Encodes the handle of a particular buddy within a z3fold page. + * Zhdr->page_lock should be held as this function accesses first_num + * if bud != HEADLESS. */ static unsigned long __encode_handle(struct z3fold_header *zhdr, struct z3fold_buddy_slots *slots, @@ -1236,12 +1237,12 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle) } /** - * z3fold_get_pool_size() - gets the z3fold pool size in pages + * z3fold_get_pool_pages() - gets the z3fold pool size in pages * @pool: pool whose size is being queried * * Returns: size in pages of the given pool. 
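/*
 * The z3fold helper documented above (and zbud and zsmalloc in the later
 * hunks) switches its zpool callback from ->total_size (bytes) to
 * ->total_pages: the backend reports a page count and any byte figure is
 * derived by the caller.  A tiny stand-alone sketch of that caller-side
 * conversion (the callback here is a stand-in, not the zpool API):
 */
#include <stdio.h>

#define PAGE_SHIFT 12                   /* assumed 4 KiB pages for the example */

static unsigned long long pool_total_pages(void)
{
        return 300;                     /* pretend the backend holds 300 pages */
}

int main(void)
{
        unsigned long long pages = pool_total_pages();

        /* bytes are derived where needed instead of in every driver */
        printf("pool: %llu pages (%llu bytes)\n", pages, pages << PAGE_SHIFT);
        return 0;
}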
*/ -static u64 z3fold_get_pool_size(struct z3fold_pool *pool) +static u64 z3fold_get_pool_pages(struct z3fold_pool *pool) { return atomic64_read(&pool->pages_nr); } @@ -1401,9 +1402,9 @@ static void z3fold_zpool_unmap(void *pool, unsigned long handle) z3fold_unmap(pool, handle); } -static u64 z3fold_zpool_total_size(void *pool) +static u64 z3fold_zpool_total_pages(void *pool) { - return z3fold_get_pool_size(pool) * PAGE_SIZE; + return z3fold_get_pool_pages(pool); } static struct zpool_driver z3fold_zpool_driver = { @@ -1416,7 +1417,7 @@ static struct zpool_driver z3fold_zpool_driver = { .free = z3fold_zpool_free, .map = z3fold_zpool_map, .unmap = z3fold_zpool_unmap, - .total_size = z3fold_zpool_total_size, + .total_pages = z3fold_zpool_total_pages, }; MODULE_ALIAS("zpool-z3fold"); diff --git a/mm/zbud.c b/mm/zbud.c index 2190cc1f37b3..e9836fff9438 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -365,13 +365,13 @@ static void zbud_unmap(struct zbud_pool *pool, unsigned long handle) } /** - * zbud_get_pool_size() - gets the zbud pool size in pages + * zbud_get_pool_pages() - gets the zbud pool size in pages * @pool: pool whose size is being queried * * Returns: size in pages of the given pool. The pool lock need not be * taken to access pages_nr. */ -static u64 zbud_get_pool_size(struct zbud_pool *pool) +static u64 zbud_get_pool_pages(struct zbud_pool *pool) { return pool->pages_nr; } @@ -410,9 +410,9 @@ static void zbud_zpool_unmap(void *pool, unsigned long handle) zbud_unmap(pool, handle); } -static u64 zbud_zpool_total_size(void *pool) +static u64 zbud_zpool_total_pages(void *pool) { - return zbud_get_pool_size(pool) * PAGE_SIZE; + return zbud_get_pool_pages(pool); } static struct zpool_driver zbud_zpool_driver = { @@ -425,7 +425,7 @@ static struct zpool_driver zbud_zpool_driver = { .free = zbud_zpool_free, .map = zbud_zpool_map, .unmap = zbud_zpool_unmap, - .total_size = zbud_zpool_total_size, + .total_pages = zbud_zpool_total_pages, }; MODULE_ALIAS("zpool-zbud"); diff --git a/mm/zpool.c b/mm/zpool.c index 846410479c2f..b9fda1fa857d 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -321,16 +321,16 @@ void zpool_unmap_handle(struct zpool *zpool, unsigned long handle) } /** - * zpool_get_total_size() - The total size of the pool + * zpool_get_total_pages() - The total size of the pool * @zpool: The zpool to check * - * This returns the total size in bytes of the pool. + * This returns the total size in pages of the pool. * - * Returns: Total size of the zpool in bytes. + * Returns: Total size of the zpool in pages. */ -u64 zpool_get_total_size(struct zpool *zpool) +u64 zpool_get_total_pages(struct zpool *zpool) { - return zpool->driver->total_size(zpool->pool); + return zpool->driver->total_pages(zpool->pool); } /** diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c937635e0ad1..b42d3545ca85 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -110,13 +110,12 @@ #define OBJ_TAG_BITS 1 #define OBJ_TAG_MASK OBJ_ALLOCATED_TAG -#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) +#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) #define HUGE_BITS 1 #define FULLNESS_BITS 4 #define CLASS_BITS 8 -#define ISOLATED_BITS 5 #define MAGIC_VAL_BITS 8 #define MAX(a, b) ((a) >= (b) ? 
(a) : (b)) @@ -246,7 +245,6 @@ struct zspage { unsigned int huge:HUGE_BITS; unsigned int fullness:FULLNESS_BITS; unsigned int class:CLASS_BITS + 1; - unsigned int isolated:ISOLATED_BITS; unsigned int magic:MAGIC_VAL_BITS; }; unsigned int inuse; @@ -278,18 +276,14 @@ static bool ZsHugePage(struct zspage *zspage) static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); - -#ifdef CONFIG_COMPACTION static void migrate_write_lock(struct zspage *zspage); -static void migrate_write_lock_nested(struct zspage *zspage); static void migrate_write_unlock(struct zspage *zspage); + +#ifdef CONFIG_COMPACTION static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); #else -static void migrate_write_lock(struct zspage *zspage) {} -static void migrate_write_lock_nested(struct zspage *zspage) {} -static void migrate_write_unlock(struct zspage *zspage) {} static void kick_deferred_free(struct zs_pool *pool) {} static void init_deferred_free(struct zs_pool *pool) {} static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} @@ -405,9 +399,9 @@ static void zs_zpool_unmap(void *pool, unsigned long handle) zs_unmap_object(pool, handle); } -static u64 zs_zpool_total_size(void *pool) +static u64 zs_zpool_total_pages(void *pool) { - return zs_get_total_pages(pool) << PAGE_SHIFT; + return zs_get_total_pages(pool); } static struct zpool_driver zs_zpool_driver = { @@ -420,7 +414,7 @@ static struct zpool_driver zs_zpool_driver = { .free = zs_zpool_free, .map = zs_zpool_map, .unmap = zs_zpool_unmap, - .total_size = zs_zpool_total_size, + .total_pages = zs_zpool_total_pages, }; MODULE_ALIAS("zpool-zsmalloc"); @@ -476,30 +470,12 @@ static inline void set_freeobj(struct zspage *zspage, unsigned int obj) zspage->freeobj = obj; } -static void get_zspage_mapping(struct zspage *zspage, - unsigned int *class_idx, - int *fullness) -{ - BUG_ON(zspage->magic != ZSPAGE_MAGIC); - - *fullness = zspage->fullness; - *class_idx = zspage->class; -} - static struct size_class *zspage_class(struct zs_pool *pool, struct zspage *zspage) { return pool->size_class[zspage->class]; } -static void set_zspage_mapping(struct zspage *zspage, - unsigned int class_idx, - int fullness) -{ - zspage->class = class_idx; - zspage->fullness = fullness; -} - /* * zsmalloc divides the pool into various size classes where each * class maintains a list of zspages where each zspage is divided @@ -694,16 +670,17 @@ static void insert_zspage(struct size_class *class, { class_stat_inc(class, fullness, 1); list_add(&zspage->list, &class->fullness_list[fullness]); + zspage->fullness = fullness; } /* * This function removes the given zspage from the freelist identified * by <class, fullness_group>. 
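/*
 * With get_zspage_mapping()/set_zspage_mapping() gone, the class index and
 * fullness group live directly in the zspage bitfields and are maintained
 * by insert_zspage()/remove_zspage() themselves, so remove_zspage() below
 * no longer needs a fullness argument.  A minimal stand-alone sketch of
 * keeping the bucket index inside the object so list maintenance only needs
 * the object (singly linked here for brevity, unlike the kernel's list_head):
 */
#include <stdio.h>

#define NR_FULLNESS_GROUPS 4

struct item {
        struct item *next;
        unsigned int fullness;                  /* bucket index stored in the object */
};

static struct item *buckets[NR_FULLNESS_GROUPS];

static void insert_item(struct item *it, unsigned int fullness)
{
        it->fullness = fullness;                /* insertion records the group itself */
        it->next = buckets[fullness];
        buckets[fullness] = it;
}

static void remove_item(struct item *it)
{
        struct item **pp = &buckets[it->fullness];      /* no fullness argument */

        while (*pp && *pp != it)
                pp = &(*pp)->next;
        if (*pp)
                *pp = it->next;
}

int main(void)
{
        struct item a = { .next = NULL, .fullness = 0 };

        insert_item(&a, 2);
        remove_item(&a);
        printf("bucket 2 empty: %d\n", buckets[2] == NULL);
        return 0;
}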
*/ -static void remove_zspage(struct size_class *class, - struct zspage *zspage, - int fullness) +static void remove_zspage(struct size_class *class, struct zspage *zspage) { + int fullness = zspage->fullness; + VM_BUG_ON(list_empty(&class->fullness_list[fullness])); list_del_init(&zspage->list); @@ -721,17 +698,14 @@ static void remove_zspage(struct size_class *class, */ static int fix_fullness_group(struct size_class *class, struct zspage *zspage) { - int class_idx; - int currfg, newfg; + int newfg; - get_zspage_mapping(zspage, &class_idx, &currfg); newfg = get_fullness_group(class, zspage); - if (newfg == currfg) + if (newfg == zspage->fullness) goto out; - remove_zspage(class, zspage, currfg); + remove_zspage(class, zspage); insert_zspage(class, zspage, newfg); - set_zspage_mapping(zspage, class_idx, newfg); out: return newfg; } @@ -763,14 +737,12 @@ static struct page *get_next_page(struct page *page) static void obj_to_location(unsigned long obj, struct page **page, unsigned int *obj_idx) { - obj >>= OBJ_TAG_BITS; *page = pfn_to_page(obj >> OBJ_INDEX_BITS); *obj_idx = (obj & OBJ_INDEX_MASK); } static void obj_to_page(unsigned long obj, struct page **page) { - obj >>= OBJ_TAG_BITS; *page = pfn_to_page(obj >> OBJ_INDEX_BITS); } @@ -785,7 +757,6 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) obj = page_to_pfn(page) << OBJ_INDEX_BITS; obj |= obj_idx & OBJ_INDEX_MASK; - obj <<= OBJ_TAG_BITS; return obj; } @@ -849,15 +820,11 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, struct zspage *zspage) { struct page *page, *next; - int fg; - unsigned int class_idx; - - get_zspage_mapping(zspage, &class_idx, &fg); assert_spin_locked(&pool->lock); VM_BUG_ON(get_zspage_inuse(zspage)); - VM_BUG_ON(fg != ZS_INUSE_RATIO_0); + VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0); next = page = get_first_page(zspage); do { @@ -892,7 +859,7 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class, return; } - remove_zspage(class, zspage, ZS_INUSE_RATIO_0); + remove_zspage(class, zspage); __free_zspage(pool, class, zspage); } @@ -1011,6 +978,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, create_page_chain(class, zspage, pages); init_zspage(class, zspage); zspage->pool = pool; + zspage->class = class->index; return zspage; } @@ -1403,7 +1371,6 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) obj = obj_malloc(pool, zspage, handle); newfg = get_fullness_group(class, zspage); insert_zspage(class, zspage, newfg); - set_zspage_mapping(zspage, class->index, newfg); record_obj(handle, obj); atomic_long_add(class->pages_per_zspage, &pool->pages_allocated); class_stat_inc(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage); @@ -1623,7 +1590,7 @@ static struct zspage *isolate_src_zspage(struct size_class *class) zspage = list_first_entry_or_null(&class->fullness_list[fg], struct zspage, list); if (zspage) { - remove_zspage(class, zspage, fg); + remove_zspage(class, zspage); return zspage; } } @@ -1640,7 +1607,7 @@ static struct zspage *isolate_dst_zspage(struct size_class *class) zspage = list_first_entry_or_null(&class->fullness_list[fg], struct zspage, list); if (zspage) { - remove_zspage(class, zspage, fg); + remove_zspage(class, zspage); return zspage; } } @@ -1661,7 +1628,6 @@ static int putback_zspage(struct size_class *class, struct zspage *zspage) fullness = get_fullness_group(class, zspage); insert_zspage(class, zspage, fullness); - set_zspage_mapping(zspage, class->index, fullness); return 
fullness; } @@ -1725,33 +1691,17 @@ static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock) read_unlock(&zspage->lock); } -#ifdef CONFIG_COMPACTION static void migrate_write_lock(struct zspage *zspage) { write_lock(&zspage->lock); } -static void migrate_write_lock_nested(struct zspage *zspage) -{ - write_lock_nested(&zspage->lock, SINGLE_DEPTH_NESTING); -} - static void migrate_write_unlock(struct zspage *zspage) { write_unlock(&zspage->lock); } -/* Number of isolated subpage for *page migration* in this zspage */ -static void inc_zspage_isolation(struct zspage *zspage) -{ - zspage->isolated++; -} - -static void dec_zspage_isolation(struct zspage *zspage) -{ - VM_BUG_ON(zspage->isolated == 0); - zspage->isolated--; -} +#ifdef CONFIG_COMPACTION static const struct movable_operations zsmalloc_mops; @@ -1780,21 +1730,12 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, static bool zs_page_isolate(struct page *page, isolate_mode_t mode) { - struct zs_pool *pool; - struct zspage *zspage; - /* * Page is locked so zspage couldn't be destroyed. For detail, look at * lock_zspage in free_zspage. */ VM_BUG_ON_PAGE(PageIsolated(page), page); - zspage = get_zspage(page); - pool = zspage->pool; - spin_lock(&pool->lock); - inc_zspage_isolation(zspage); - spin_unlock(&pool->lock); - return true; } @@ -1859,7 +1800,6 @@ static int zs_page_migrate(struct page *newpage, struct page *page, kunmap_atomic(s_addr); replace_sub_page(class, zspage, newpage, page); - dec_zspage_isolation(zspage); /* * Since we complete the data copy and set up new zspage structure, * it's okay to release the pool's lock. @@ -1881,16 +1821,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, static void zs_page_putback(struct page *page) { - struct zs_pool *pool; - struct zspage *zspage; - VM_BUG_ON_PAGE(!PageIsolated(page), page); - - zspage = get_zspage(page); - pool = zspage->pool; - spin_lock(&pool->lock); - dec_zspage_isolation(zspage); - spin_unlock(&pool->lock); } static const struct movable_operations zsmalloc_mops = { @@ -1907,8 +1838,6 @@ static void async_free_zspage(struct work_struct *work) { int i; struct size_class *class; - unsigned int class_idx; - int fullness; struct zspage *zspage, *tmp; LIST_HEAD(free_pages); struct zs_pool *pool = container_of(work, struct zs_pool, @@ -1929,10 +1858,8 @@ static void async_free_zspage(struct work_struct *work) list_del(&zspage->list); lock_zspage(zspage); - get_zspage_mapping(zspage, &class_idx, &fullness); - VM_BUG_ON(fullness != ZS_INUSE_RATIO_0); - class = pool->size_class[class_idx]; spin_lock(&pool->lock); + class = zspage_class(pool, zspage); __free_zspage(pool, class, zspage); spin_unlock(&pool->lock); } @@ -2006,19 +1933,17 @@ static unsigned long __zs_compact(struct zs_pool *pool, dst_zspage = isolate_dst_zspage(class); if (!dst_zspage) break; - migrate_write_lock(dst_zspage); } src_zspage = isolate_src_zspage(class); if (!src_zspage) break; - migrate_write_lock_nested(src_zspage); - + migrate_write_lock(src_zspage); migrate_zspage(pool, src_zspage, dst_zspage); - fg = putback_zspage(class, src_zspage); migrate_write_unlock(src_zspage); + fg = putback_zspage(class, src_zspage); if (fg == ZS_INUSE_RATIO_0) { free_zspage(pool, class, src_zspage); pages_freed += class->pages_per_zspage; @@ -2028,7 +1953,6 @@ static unsigned long __zs_compact(struct zs_pool *pool, if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100 || spin_is_contended(&pool->lock)) { putback_zspage(class, dst_zspage); - 
migrate_write_unlock(dst_zspage); dst_zspage = NULL; spin_unlock(&pool->lock); @@ -2037,15 +1961,12 @@ static unsigned long __zs_compact(struct zs_pool *pool, } } - if (src_zspage) { + if (src_zspage) putback_zspage(class, src_zspage); - migrate_write_unlock(src_zspage); - } - if (dst_zspage) { + if (dst_zspage) putback_zspage(class, dst_zspage); - migrate_write_unlock(dst_zspage); - } + spin_unlock(&pool->lock); return pages_freed; diff --git a/mm/zswap.c b/mm/zswap.c index db4625af65fb..a50e2986cd2f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -20,7 +20,6 @@ #include <linux/spinlock.h> #include <linux/types.h> #include <linux/atomic.h> -#include <linux/rbtree.h> #include <linux/swap.h> #include <linux/crypto.h> #include <linux/scatterlist.h> @@ -43,8 +42,6 @@ /********************************* * statistics **********************************/ -/* Total bytes used by the compressed storage */ -u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ atomic_t zswap_stored_pages = ATOMIC_INIT(0); /* The number of same-value filled pages currently stored in zswap */ @@ -71,8 +68,6 @@ static u64 zswap_reject_compress_poor; static u64 zswap_reject_alloc_fail; /* Store failed because the entry metadata could not be allocated (rare) */ static u64 zswap_reject_kmemcache_fail; -/* Duplicate store was encountered (rare) */ -static u64 zswap_duplicate_entry; /* Shrinker work queue */ static struct workqueue_struct *shrink_wq; @@ -128,23 +123,6 @@ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */ module_param_named(accept_threshold_percent, zswap_accept_thr_percent, uint, 0644); -/* - * Enable/disable handling same-value filled pages (enabled by default). - * If disabled every page is considered non-same-value filled. - */ -static bool zswap_same_filled_pages_enabled = true; -module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, - bool, 0644); - -/* Enable/disable handling non-same-value filled pages (enabled by default) */ -static bool zswap_non_same_filled_pages_enabled = true; -module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, - bool, 0644); - -static bool zswap_exclusive_loads_enabled = IS_ENABLED( - CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON); -module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644); - /* Number of zpools in zswap_pool (empirically determined for scalability) */ #define ZSWAP_NR_ZPOOLS 32 @@ -168,6 +146,7 @@ struct crypto_acomp_ctx { struct crypto_wait wait; u8 *buffer; struct mutex mutex; + bool is_sleepable; }; /* @@ -179,32 +158,29 @@ struct crypto_acomp_ctx { struct zswap_pool { struct zpool *zpools[ZSWAP_NR_ZPOOLS]; struct crypto_acomp_ctx __percpu *acomp_ctx; - struct kref kref; + struct percpu_ref ref; struct list_head list; struct work_struct release_work; - struct work_struct shrink_work; struct hlist_node node; char tfm_name[CRYPTO_MAX_ALG_NAME]; - struct list_lru list_lru; - struct mem_cgroup *next_shrink; - struct shrinker *shrinker; - atomic_t nr_stored; }; +/* Global LRU lists shared by all zswap pools. */ +static struct list_lru zswap_list_lru; + +/* The lock protects zswap_next_shrink updates. */ +static DEFINE_SPINLOCK(zswap_shrink_lock); +static struct mem_cgroup *zswap_next_shrink; +static struct work_struct zswap_shrink_work; +static struct shrinker *zswap_shrinker; + /* * struct zswap_entry * * This structure contains the metadata for tracking a single compressed * page within zswap. 
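/*
 * The zswap changes that follow drop the per-swap-type red-black tree, its
 * tree lock and the per-entry refcount, and index entries by swap offset in
 * an xarray instead (zswap_trees[] becomes an array of struct xarray).  The
 * kernel-style sketch below shows only the stock XArray calls this relies
 * on; the xarray and helper names are illustrative, not zswap's functions.
 */
#include <linux/xarray.h>

static DEFINE_XARRAY(example_tree);

static int example_store(unsigned long offset, void *entry)
{
        /* xa_store() returns the old entry or an errno encoded via xa_err() */
        return xa_err(xa_store(&example_tree, offset, entry, GFP_KERNEL));
}

static void *example_load(unsigned long offset)
{
        return xa_load(&example_tree, offset);
}

static void *example_erase(unsigned long offset)
{
        return xa_erase(&example_tree, offset);
}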
* - * rbnode - links the entry into red-black tree for the appropriate swap type * swpentry - associated swap entry, the offset indexes into the red-black tree - * refcount - the number of outstanding reference to the entry. This is needed - * to protect against premature freeing of the entry by code - * concurrent calls to load, invalidate, and writeback. The lock - * for the zswap_tree structure that contains the entry must - * be held while changing the refcount. Since the lock must - * be held, there is no reason to also make refcount atomic. * length - the length in bytes of the compressed page data. Needed during * decompression. For a same value filled page length is 0, and both * pool and lru are invalid and must be ignored. @@ -215,9 +191,7 @@ struct zswap_pool { * lru - handle to the pool's lru used to evict pages. */ struct zswap_entry { - struct rb_node rbnode; swp_entry_t swpentry; - int refcount; unsigned int length; struct zswap_pool *pool; union { @@ -228,17 +202,8 @@ struct zswap_entry { struct list_head lru; }; -/* - * The tree lock in the zswap_tree struct protects a few things: - * - the rbtree - * - the refcount field of each entry in the tree - */ -struct zswap_tree { - struct rb_root rbroot; - spinlock_t lock; -}; - -static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; +static struct xarray *zswap_trees[MAX_SWAPFILES]; +static unsigned int nr_zswap_trees[MAX_SWAPFILES]; /* RCU-protected iteration */ static LIST_HEAD(zswap_pools); @@ -265,750 +230,20 @@ static bool zswap_has_pool; * helpers and fwd declarations **********************************/ -#define zswap_pool_debug(msg, p) \ - pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ - zpool_get_type((p)->zpools[0])) - -static int zswap_writeback_entry(struct zswap_entry *entry, - struct zswap_tree *tree); -static int zswap_pool_get(struct zswap_pool *pool); -static void zswap_pool_put(struct zswap_pool *pool); - -static bool zswap_is_full(void) -{ - return totalram_pages() * zswap_max_pool_percent / 100 < - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); -} - -static bool zswap_can_accept(void) -{ - return totalram_pages() * zswap_accept_thr_percent / 100 * - zswap_max_pool_percent / 100 > - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); -} - -static u64 get_zswap_pool_size(struct zswap_pool *pool) -{ - u64 pool_size = 0; - int i; - - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) - pool_size += zpool_get_total_size(pool->zpools[i]); - - return pool_size; -} - -static void zswap_update_total_size(void) -{ - struct zswap_pool *pool; - u64 total = 0; - - rcu_read_lock(); - - list_for_each_entry_rcu(pool, &zswap_pools, list) - total += get_zswap_pool_size(pool); - - rcu_read_unlock(); - - zswap_pool_total_size = total; -} - -/* should be called under RCU */ -#ifdef CONFIG_MEMCG -static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) -{ - return entry->objcg ? 
obj_cgroup_memcg(entry->objcg) : NULL; -} -#else -static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) -{ - return NULL; -} -#endif - -static inline int entry_to_nid(struct zswap_entry *entry) -{ - return page_to_nid(virt_to_page(entry)); -} - -void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) -{ - struct zswap_pool *pool; - - /* lock out zswap pools list modification */ - spin_lock(&zswap_pools_lock); - list_for_each_entry(pool, &zswap_pools, list) { - if (pool->next_shrink == memcg) - pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); - } - spin_unlock(&zswap_pools_lock); -} - -/********************************* -* zswap entry functions -**********************************/ -static struct kmem_cache *zswap_entry_cache; - -static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) -{ - struct zswap_entry *entry; - entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); - if (!entry) - return NULL; - entry->refcount = 1; - RB_CLEAR_NODE(&entry->rbnode); - return entry; -} - -static void zswap_entry_cache_free(struct zswap_entry *entry) +static inline struct xarray *swap_zswap_tree(swp_entry_t swp) { - kmem_cache_free(zswap_entry_cache, entry); -} - -/********************************* -* zswap lruvec functions -**********************************/ -void zswap_lruvec_state_init(struct lruvec *lruvec) -{ - atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); -} - -void zswap_folio_swapin(struct folio *folio) -{ - struct lruvec *lruvec; - - VM_WARN_ON_ONCE(!folio_test_locked(folio)); - lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); -} - -/********************************* -* lru functions -**********************************/ -static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) -{ - atomic_long_t *nr_zswap_protected; - unsigned long lru_size, old, new; - int nid = entry_to_nid(entry); - struct mem_cgroup *memcg; - struct lruvec *lruvec; - - /* - * Note that it is safe to use rcu_read_lock() here, even in the face of - * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection - * used in list_lru lookup, only two scenarios are possible: - * - * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The - * new entry will be reparented to memcg's parent's list_lru. - * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The - * new entry will be added directly to memcg's parent's list_lru. - * - * Similar reasoning holds for list_lru_del() and list_lru_putback(). - */ - rcu_read_lock(); - memcg = mem_cgroup_from_entry(entry); - /* will always succeed */ - list_lru_add(list_lru, &entry->lru, nid, memcg); - - /* Update the protection area */ - lru_size = list_lru_count_one(list_lru, nid, memcg); - lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); - nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; - old = atomic_long_inc_return(nr_zswap_protected); - /* - * Decay to avoid overflow and adapt to changing workloads. - * This is based on LRU reclaim cost decaying heuristics. - */ - do { - new = old > lru_size / 4 ? 
old / 2 : old; - } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); - rcu_read_unlock(); -} - -static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) -{ - int nid = entry_to_nid(entry); - struct mem_cgroup *memcg; - - rcu_read_lock(); - memcg = mem_cgroup_from_entry(entry); - /* will always succeed */ - list_lru_del(list_lru, &entry->lru, nid, memcg); - rcu_read_unlock(); -} - -static void zswap_lru_putback(struct list_lru *list_lru, - struct zswap_entry *entry) -{ - int nid = entry_to_nid(entry); - spinlock_t *lock = &list_lru->node[nid].lock; - struct mem_cgroup *memcg; - struct lruvec *lruvec; - - rcu_read_lock(); - memcg = mem_cgroup_from_entry(entry); - spin_lock(lock); - /* we cannot use list_lru_add here, because it increments node's lru count */ - list_lru_putback(list_lru, &entry->lru, nid, memcg); - spin_unlock(lock); - - lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry))); - /* increment the protection area to account for the LRU rotation. */ - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); - rcu_read_unlock(); -} - -/********************************* -* rbtree functions -**********************************/ -static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) -{ - struct rb_node *node = root->rb_node; - struct zswap_entry *entry; - pgoff_t entry_offset; - - while (node) { - entry = rb_entry(node, struct zswap_entry, rbnode); - entry_offset = swp_offset(entry->swpentry); - if (entry_offset > offset) - node = node->rb_left; - else if (entry_offset < offset) - node = node->rb_right; - else - return entry; - } - return NULL; -} - -/* - * In the case that a entry with the same offset is found, a pointer to - * the existing entry is stored in dupentry and the function returns -EEXIST - */ -static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, - struct zswap_entry **dupentry) -{ - struct rb_node **link = &root->rb_node, *parent = NULL; - struct zswap_entry *myentry; - pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry); - - while (*link) { - parent = *link; - myentry = rb_entry(parent, struct zswap_entry, rbnode); - myentry_offset = swp_offset(myentry->swpentry); - if (myentry_offset > entry_offset) - link = &(*link)->rb_left; - else if (myentry_offset < entry_offset) - link = &(*link)->rb_right; - else { - *dupentry = myentry; - return -EEXIST; - } - } - rb_link_node(&entry->rbnode, parent, link); - rb_insert_color(&entry->rbnode, root); - return 0; -} - -static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) -{ - if (!RB_EMPTY_NODE(&entry->rbnode)) { - rb_erase(&entry->rbnode, root); - RB_CLEAR_NODE(&entry->rbnode); - return true; - } - return false; -} - -static struct zpool *zswap_find_zpool(struct zswap_entry *entry) -{ - int i = 0; - - if (ZSWAP_NR_ZPOOLS > 1) - i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS)); - - return entry->pool->zpools[i]; -} - -/* - * Carries out the common pattern of freeing and entry's zpool allocation, - * freeing the entry itself, and decrementing the number of stored pages. 
- */ -static void zswap_free_entry(struct zswap_entry *entry) -{ - if (!entry->length) - atomic_dec(&zswap_same_filled_pages); - else { - zswap_lru_del(&entry->pool->list_lru, entry); - zpool_free(zswap_find_zpool(entry), entry->handle); - atomic_dec(&entry->pool->nr_stored); - zswap_pool_put(entry->pool); - } - if (entry->objcg) { - obj_cgroup_uncharge_zswap(entry->objcg, entry->length); - obj_cgroup_put(entry->objcg); - } - zswap_entry_cache_free(entry); - atomic_dec(&zswap_stored_pages); - zswap_update_total_size(); -} - -/* caller must hold the tree lock */ -static void zswap_entry_get(struct zswap_entry *entry) -{ - entry->refcount++; -} - -/* caller must hold the tree lock -* remove from the tree and free it, if nobody reference the entry -*/ -static void zswap_entry_put(struct zswap_tree *tree, - struct zswap_entry *entry) -{ - int refcount = --entry->refcount; - - WARN_ON_ONCE(refcount < 0); - if (refcount == 0) { - WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); - zswap_free_entry(entry); - } -} - -/* caller must hold the tree lock */ -static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, - pgoff_t offset) -{ - struct zswap_entry *entry; - - entry = zswap_rb_search(root, offset); - if (entry) - zswap_entry_get(entry); - - return entry; -} - -/********************************* -* shrinker functions -**********************************/ -static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, - spinlock_t *lock, void *arg); - -static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, - struct shrink_control *sc) -{ - struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); - unsigned long shrink_ret, nr_protected, lru_size; - struct zswap_pool *pool = shrinker->private_data; - bool encountered_page_in_swapcache = false; - - if (!zswap_shrinker_enabled || - !mem_cgroup_zswap_writeback_enabled(sc->memcg)) { - sc->nr_scanned = 0; - return SHRINK_STOP; - } - - nr_protected = - atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - lru_size = list_lru_shrink_count(&pool->list_lru, sc); - - /* - * Abort if we are shrinking into the protected region. - * - * This short-circuiting is necessary because if we have too many multiple - * concurrent reclaimers getting the freeable zswap object counts at the - * same time (before any of them made reasonable progress), the total - * number of reclaimed objects might be more than the number of unprotected - * objects (i.e the reclaimers will reclaim into the protected area of the - * zswap LRU). - */ - if (nr_protected >= lru_size - sc->nr_to_scan) { - sc->nr_scanned = 0; - return SHRINK_STOP; - } - - shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb, - &encountered_page_in_swapcache); - - if (encountered_page_in_swapcache) - return SHRINK_STOP; - - return shrink_ret ? 
shrink_ret : SHRINK_STOP; -} - -static unsigned long zswap_shrinker_count(struct shrinker *shrinker, - struct shrink_control *sc) -{ - struct zswap_pool *pool = shrinker->private_data; - struct mem_cgroup *memcg = sc->memcg; - struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); - unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; - - if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) - return 0; - -#ifdef CONFIG_MEMCG_KMEM - mem_cgroup_flush_stats(memcg); - nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; - nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); -#else - /* use pool stats instead of memcg stats */ - nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT; - nr_stored = atomic_read(&pool->nr_stored); -#endif - - if (!nr_stored) - return 0; - - nr_protected = - atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - nr_freeable = list_lru_shrink_count(&pool->list_lru, sc); - /* - * Subtract the lru size by an estimate of the number of pages - * that should be protected. - */ - nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; - - /* - * Scale the number of freeable pages by the memory saving factor. - * This ensures that the better zswap compresses memory, the fewer - * pages we will evict to swap (as it will otherwise incur IO for - * relatively small memory saving). - */ - return mult_frac(nr_freeable, nr_backing, nr_stored); -} - -static void zswap_alloc_shrinker(struct zswap_pool *pool) -{ - pool->shrinker = - shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); - if (!pool->shrinker) - return; - - pool->shrinker->private_data = pool; - pool->shrinker->scan_objects = zswap_shrinker_scan; - pool->shrinker->count_objects = zswap_shrinker_count; - pool->shrinker->batch = 0; - pool->shrinker->seeks = DEFAULT_SEEKS; -} - -/********************************* -* per-cpu code -**********************************/ -static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) -{ - struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - struct crypto_acomp *acomp; - struct acomp_req *req; - int ret; - - mutex_init(&acomp_ctx->mutex); - - acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!acomp_ctx->buffer) - return -ENOMEM; - - acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); - if (IS_ERR(acomp)) { - pr_err("could not alloc crypto acomp %s : %ld\n", - pool->tfm_name, PTR_ERR(acomp)); - ret = PTR_ERR(acomp); - goto acomp_fail; - } - acomp_ctx->acomp = acomp; - - req = acomp_request_alloc(acomp_ctx->acomp); - if (!req) { - pr_err("could not alloc crypto acomp_request %s\n", - pool->tfm_name); - ret = -ENOMEM; - goto req_fail; - } - acomp_ctx->req = req; - - crypto_init_wait(&acomp_ctx->wait); - /* - * if the backend of acomp is async zip, crypto_req_done() will wakeup - * crypto_wait_req(); if the backend of acomp is scomp, the callback - * won't be called, crypto_wait_req() will return without blocking. 
- */ - acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &acomp_ctx->wait); - - return 0; - -req_fail: - crypto_free_acomp(acomp_ctx->acomp); -acomp_fail: - kfree(acomp_ctx->buffer); - return ret; + return &zswap_trees[swp_type(swp)][swp_offset(swp) + >> SWAP_ADDRESS_SPACE_SHIFT]; } -static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) -{ - struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - - if (!IS_ERR_OR_NULL(acomp_ctx)) { - if (!IS_ERR_OR_NULL(acomp_ctx->req)) - acomp_request_free(acomp_ctx->req); - if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) - crypto_free_acomp(acomp_ctx->acomp); - kfree(acomp_ctx->buffer); - } - - return 0; -} +#define zswap_pool_debug(msg, p) \ + pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ + zpool_get_type((p)->zpools[0])) /********************************* * pool functions **********************************/ - -static struct zswap_pool *__zswap_pool_current(void) -{ - struct zswap_pool *pool; - - pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); - WARN_ONCE(!pool && zswap_has_pool, - "%s: no page storage pool!\n", __func__); - - return pool; -} - -static struct zswap_pool *zswap_pool_current(void) -{ - assert_spin_locked(&zswap_pools_lock); - - return __zswap_pool_current(); -} - -static struct zswap_pool *zswap_pool_current_get(void) -{ - struct zswap_pool *pool; - - rcu_read_lock(); - - pool = __zswap_pool_current(); - if (!zswap_pool_get(pool)) - pool = NULL; - - rcu_read_unlock(); - - return pool; -} - -static struct zswap_pool *zswap_pool_last_get(void) -{ - struct zswap_pool *pool, *last = NULL; - - rcu_read_lock(); - - list_for_each_entry_rcu(pool, &zswap_pools, list) - last = pool; - WARN_ONCE(!last && zswap_has_pool, - "%s: no page storage pool!\n", __func__); - if (!zswap_pool_get(last)) - last = NULL; - - rcu_read_unlock(); - - return last; -} - -/* type and compressor must be null-terminated */ -static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) -{ - struct zswap_pool *pool; - - assert_spin_locked(&zswap_pools_lock); - - list_for_each_entry_rcu(pool, &zswap_pools, list) { - if (strcmp(pool->tfm_name, compressor)) - continue; - /* all zpools share the same type */ - if (strcmp(zpool_get_type(pool->zpools[0]), type)) - continue; - /* if we can't get it, it's about to be destroyed */ - if (!zswap_pool_get(pool)) - continue; - return pool; - } - - return NULL; -} - -/* - * If the entry is still valid in the tree, drop the initial ref and remove it - * from the tree. This function must be called with an additional ref held, - * otherwise it may race with another invalidation freeing the entry. - */ -static void zswap_invalidate_entry(struct zswap_tree *tree, - struct zswap_entry *entry) -{ - if (zswap_rb_erase(&tree->rbroot, entry)) - zswap_entry_put(tree, entry); -} - -static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, - spinlock_t *lock, void *arg) -{ - struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); - bool *encountered_page_in_swapcache = (bool *)arg; - struct zswap_tree *tree; - pgoff_t swpoffset; - enum lru_status ret = LRU_REMOVED_RETRY; - int writeback_result; - - /* - * Once the lru lock is dropped, the entry might get freed. The - * swpoffset is copied to the stack, and entry isn't deref'd again - * until the entry is verified to still be alive in the tree. 
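Lookups no longer go through a single per-type rbtree; swap_zswap_tree() above indexes an array of xarrays by shifting the swap offset. A standalone sketch of that index math, assuming SWAP_ADDRESS_SPACE_SHIFT is 14 (one tree per 2^14 slots, i.e. 64MB of swap with 4K pages), mirroring the swap cache's address-space split:

#include <stdio.h>

#define SWAP_ADDRESS_SPACE_SHIFT 14   /* assumed value, as in the swap cache */

/* Which of a swapfile's trees covers a given swap offset. */
static unsigned int zswap_tree_index(unsigned long swp_offset)
{
    return swp_offset >> SWAP_ADDRESS_SPACE_SHIFT;
}

int main(void)
{
    unsigned long offsets[] = { 0, 16383, 16384, 1000000 };

    for (size_t i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++)
        printf("offset %lu -> tree %u\n", offsets[i], zswap_tree_index(offsets[i]));
    return 0;
}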
- */ - swpoffset = swp_offset(entry->swpentry); - tree = zswap_trees[swp_type(entry->swpentry)]; - list_lru_isolate(l, item); - /* - * It's safe to drop the lock here because we return either - * LRU_REMOVED_RETRY or LRU_RETRY. - */ - spin_unlock(lock); - - /* Check for invalidate() race */ - spin_lock(&tree->lock); - if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) - goto unlock; - - /* Hold a reference to prevent a free during writeback */ - zswap_entry_get(entry); - spin_unlock(&tree->lock); - - writeback_result = zswap_writeback_entry(entry, tree); - - spin_lock(&tree->lock); - if (writeback_result) { - zswap_reject_reclaim_fail++; - zswap_lru_putback(&entry->pool->list_lru, entry); - ret = LRU_RETRY; - - /* - * Encountering a page already in swap cache is a sign that we are shrinking - * into the warmer region. We should terminate shrinking (if we're in the dynamic - * shrinker context). - */ - if (writeback_result == -EEXIST && encountered_page_in_swapcache) - *encountered_page_in_swapcache = true; - - goto put_unlock; - } - zswap_written_back_pages++; - - if (entry->objcg) - count_objcg_event(entry->objcg, ZSWPWB); - - count_vm_event(ZSWPWB); - /* - * Writeback started successfully, the page now belongs to the - * swapcache. Drop the entry from zswap - unless invalidate already - * took it out while we had the tree->lock released for IO. - */ - zswap_invalidate_entry(tree, entry); - -put_unlock: - /* Drop local reference */ - zswap_entry_put(tree, entry); -unlock: - spin_unlock(&tree->lock); - spin_lock(lock); - return ret; -} - -static int shrink_memcg(struct mem_cgroup *memcg) -{ - struct zswap_pool *pool; - int nid, shrunk = 0; - - if (!mem_cgroup_zswap_writeback_enabled(memcg)) - return -EINVAL; - - /* - * Skip zombies because their LRUs are reparented and we would be - * reclaiming from the parent instead of the dead memcg. - */ - if (memcg && !mem_cgroup_online(memcg)) - return -ENOENT; - - pool = zswap_pool_current_get(); - if (!pool) - return -EINVAL; - - for_each_node_state(nid, N_NORMAL_MEMORY) { - unsigned long nr_to_walk = 1; - - shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg, - &shrink_memcg_cb, NULL, &nr_to_walk); - } - zswap_pool_put(pool); - return shrunk ? 0 : -EAGAIN; -} - -static void shrink_worker(struct work_struct *w) -{ - struct zswap_pool *pool = container_of(w, typeof(*pool), - shrink_work); - struct mem_cgroup *memcg; - int ret, failures = 0; - - /* global reclaim will select cgroup in a round-robin fashion. */ - do { - spin_lock(&zswap_pools_lock); - pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); - memcg = pool->next_shrink; - - /* - * We need to retry if we have gone through a full round trip, or if we - * got an offline memcg (or else we risk undoing the effect of the - * zswap memcg offlining cleanup callback). This is not catastrophic - * per se, but it will keep the now offlined memcg hostage for a while. - * - * Note that if we got an online memcg, we will keep the extra - * reference in case the original reference obtained by mem_cgroup_iter - * is dropped by the zswap memcg offlining callback, ensuring that the - * memcg is not killed when we are reclaiming. 
- */ - if (!memcg) { - spin_unlock(&zswap_pools_lock); - if (++failures == MAX_RECLAIM_RETRIES) - break; - - goto resched; - } - - if (!mem_cgroup_tryget_online(memcg)) { - /* drop the reference from mem_cgroup_iter() */ - mem_cgroup_iter_break(NULL, memcg); - pool->next_shrink = NULL; - spin_unlock(&zswap_pools_lock); - - if (++failures == MAX_RECLAIM_RETRIES) - break; - - goto resched; - } - spin_unlock(&zswap_pools_lock); - - ret = shrink_memcg(memcg); - /* drop the extra reference */ - mem_cgroup_put(memcg); - - if (ret == -EINVAL) - break; - if (ret && ++failures == MAX_RECLAIM_RETRIES) - break; - -resched: - cond_resched(); - } while (!zswap_can_accept()); - zswap_pool_put(pool); -} +static void __zswap_pool_empty(struct percpu_ref *ref); static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { @@ -1059,30 +294,21 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) if (ret) goto error; - zswap_alloc_shrinker(pool); - if (!pool->shrinker) - goto error; - - pr_debug("using %s compressor\n", pool->tfm_name); - /* being the current pool takes 1 ref; this func expects the * caller to always add the new pool as the current pool */ - kref_init(&pool->kref); + ret = percpu_ref_init(&pool->ref, __zswap_pool_empty, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); + if (ret) + goto ref_fail; INIT_LIST_HEAD(&pool->list); - if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) - goto lru_fail; - shrinker_register(pool->shrinker); - INIT_WORK(&pool->shrink_work, shrink_worker); - atomic_set(&pool->nr_stored, 0); zswap_pool_debug("created", pool); return pool; -lru_fail: - list_lru_destroy(&pool->list_lru); - shrinker_free(pool->shrinker); +ref_fail: + cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); @@ -1140,29 +366,14 @@ static void zswap_pool_destroy(struct zswap_pool *pool) zswap_pool_debug("destroying", pool); - shrinker_free(pool->shrinker); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); - list_lru_destroy(&pool->list_lru); - - spin_lock(&zswap_pools_lock); - mem_cgroup_iter_break(NULL, pool->next_shrink); - pool->next_shrink = NULL; - spin_unlock(&zswap_pools_lock); for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) zpool_destroy_pool(pool->zpools[i]); kfree(pool); } -static int __must_check zswap_pool_get(struct zswap_pool *pool) -{ - if (!pool) - return 0; - - return kref_get_unless_zero(&pool->kref); -} - static void __zswap_pool_release(struct work_struct *work) { struct zswap_pool *pool = container_of(work, typeof(*pool), @@ -1170,20 +381,23 @@ static void __zswap_pool_release(struct work_struct *work) synchronize_rcu(); - /* nobody should have been able to get a kref... */ - WARN_ON(kref_get_unless_zero(&pool->kref)); + /* nobody should have been able to get a ref... */ + WARN_ON(!percpu_ref_is_zero(&pool->ref)); + percpu_ref_exit(&pool->ref); /* pool is now off zswap_pools list and has no references. 
*/ zswap_pool_destroy(pool); } -static void __zswap_pool_empty(struct kref *kref) +static struct zswap_pool *zswap_pool_current(void); + +static void __zswap_pool_empty(struct percpu_ref *ref) { struct zswap_pool *pool; - pool = container_of(kref, typeof(*pool), kref); + pool = container_of(ref, typeof(*pool), ref); - spin_lock(&zswap_pools_lock); + spin_lock_bh(&zswap_pools_lock); WARN_ON(pool == zswap_pool_current()); @@ -1192,12 +406,117 @@ static void __zswap_pool_empty(struct kref *kref) INIT_WORK(&pool->release_work, __zswap_pool_release); schedule_work(&pool->release_work); - spin_unlock(&zswap_pools_lock); + spin_unlock_bh(&zswap_pools_lock); +} + +static int __must_check zswap_pool_get(struct zswap_pool *pool) +{ + if (!pool) + return 0; + + return percpu_ref_tryget(&pool->ref); } static void zswap_pool_put(struct zswap_pool *pool) { - kref_put(&pool->kref, __zswap_pool_empty); + percpu_ref_put(&pool->ref); +} + +static struct zswap_pool *__zswap_pool_current(void) +{ + struct zswap_pool *pool; + + pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); + WARN_ONCE(!pool && zswap_has_pool, + "%s: no page storage pool!\n", __func__); + + return pool; +} + +static struct zswap_pool *zswap_pool_current(void) +{ + assert_spin_locked(&zswap_pools_lock); + + return __zswap_pool_current(); +} + +static struct zswap_pool *zswap_pool_current_get(void) +{ + struct zswap_pool *pool; + + rcu_read_lock(); + + pool = __zswap_pool_current(); + if (!zswap_pool_get(pool)) + pool = NULL; + + rcu_read_unlock(); + + return pool; +} + +/* type and compressor must be null-terminated */ +static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) +{ + struct zswap_pool *pool; + + assert_spin_locked(&zswap_pools_lock); + + list_for_each_entry_rcu(pool, &zswap_pools, list) { + if (strcmp(pool->tfm_name, compressor)) + continue; + /* all zpools share the same type */ + if (strcmp(zpool_get_type(pool->zpools[0]), type)) + continue; + /* if we can't get it, it's about to be destroyed */ + if (!zswap_pool_get(pool)) + continue; + return pool; + } + + return NULL; +} + +static unsigned long zswap_max_pages(void) +{ + return totalram_pages() * zswap_max_pool_percent / 100; +} + +static unsigned long zswap_accept_thr_pages(void) +{ + return zswap_max_pages() * zswap_accept_thr_percent / 100; +} + +unsigned long zswap_total_pages(void) +{ + struct zswap_pool *pool; + unsigned long total = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(pool, &zswap_pools, list) { + int i; + + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) + total += zpool_get_total_pages(pool->zpools[i]); + } + rcu_read_unlock(); + + return total; +} + +static bool zswap_check_limits(void) +{ + unsigned long cur_pages = zswap_total_pages(); + unsigned long max_pages = zswap_max_pages(); + + if (cur_pages >= max_pages) { + zswap_pool_limit_hit++; + zswap_pool_reached_full = true; + } else if (zswap_pool_reached_full && + cur_pages <= zswap_accept_thr_pages()) { + zswap_pool_reached_full = false; + } + return zswap_pool_reached_full; } /********************************* @@ -1259,7 +578,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, return -EINVAL; } - spin_lock(&zswap_pools_lock); + spin_lock_bh(&zswap_pools_lock); pool = zswap_pool_find_get(type, compressor); if (pool) { @@ -1268,17 +587,28 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, list_del_rcu(&pool->list); } - spin_unlock(&zswap_pools_lock); + spin_unlock_bh(&zswap_pools_lock); if (!pool) pool = 
zswap_pool_create(type, compressor); + else { + /* + * Restore the initial ref dropped by percpu_ref_kill() + * when the pool was decommissioned and switch it again + * to percpu mode. + */ + percpu_ref_resurrect(&pool->ref); + + /* Drop the ref from zswap_pool_find_get(). */ + zswap_pool_put(pool); + } if (pool) ret = param_set_charp(s, kp); else ret = -EINVAL; - spin_lock(&zswap_pools_lock); + spin_lock_bh(&zswap_pools_lock); if (!ret) { put_pool = zswap_pool_current(); @@ -1293,7 +623,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, put_pool = pool; } - spin_unlock(&zswap_pools_lock); + spin_unlock_bh(&zswap_pools_lock); if (!zswap_has_pool && !pool) { /* if initial pool creation failed, and this pool creation also @@ -1310,7 +640,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, * or the new pool we failed to add */ if (put_pool) - zswap_pool_put(put_pool); + percpu_ref_kill(&put_pool->ref); return ret; } @@ -1356,7 +686,292 @@ static int zswap_enabled_param_set(const char *val, return ret; } -static void __zswap_load(struct zswap_entry *entry, struct page *page) +/********************************* +* lru functions +**********************************/ + +/* should be called under RCU */ +#ifdef CONFIG_MEMCG +static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) +{ + return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL; +} +#else +static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) +{ + return NULL; +} +#endif + +static inline int entry_to_nid(struct zswap_entry *entry) +{ + return page_to_nid(virt_to_page(entry)); +} + +static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) +{ + atomic_long_t *nr_zswap_protected; + unsigned long lru_size, old, new; + int nid = entry_to_nid(entry); + struct mem_cgroup *memcg; + struct lruvec *lruvec; + + /* + * Note that it is safe to use rcu_read_lock() here, even in the face of + * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection + * used in list_lru lookup, only two scenarios are possible: + * + * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The + * new entry will be reparented to memcg's parent's list_lru. + * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The + * new entry will be added directly to memcg's parent's list_lru. + * + * Similar reasoning holds for list_lru_del(). + */ + rcu_read_lock(); + memcg = mem_cgroup_from_entry(entry); + /* will always succeed */ + list_lru_add(list_lru, &entry->lru, nid, memcg); + + /* Update the protection area */ + lru_size = list_lru_count_one(list_lru, nid, memcg); + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; + old = atomic_long_inc_return(nr_zswap_protected); + /* + * Decay to avoid overflow and adapt to changing workloads. + * This is based on LRU reclaim cost decaying heuristics. + */ + do { + new = old > lru_size / 4 ? 
old / 2 : old; + } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); + rcu_read_unlock(); +} + +static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) +{ + int nid = entry_to_nid(entry); + struct mem_cgroup *memcg; + + rcu_read_lock(); + memcg = mem_cgroup_from_entry(entry); + /* will always succeed */ + list_lru_del(list_lru, &entry->lru, nid, memcg); + rcu_read_unlock(); +} + +void zswap_lruvec_state_init(struct lruvec *lruvec) +{ + atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); +} + +void zswap_folio_swapin(struct folio *folio) +{ + struct lruvec *lruvec; + + if (folio) { + lruvec = folio_lruvec(folio); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + } +} + +void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) +{ + /* lock out zswap shrinker walking memcg tree */ + spin_lock(&zswap_shrink_lock); + if (zswap_next_shrink == memcg) + zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + spin_unlock(&zswap_shrink_lock); +} + +/********************************* +* zswap entry functions +**********************************/ +static struct kmem_cache *zswap_entry_cache; + +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) +{ + struct zswap_entry *entry; + entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); + if (!entry) + return NULL; + return entry; +} + +static void zswap_entry_cache_free(struct zswap_entry *entry) +{ + kmem_cache_free(zswap_entry_cache, entry); +} + +static struct zpool *zswap_find_zpool(struct zswap_entry *entry) +{ + return entry->pool->zpools[hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS))]; +} + +/* + * Carries out the common pattern of freeing and entry's zpool allocation, + * freeing the entry itself, and decrementing the number of stored pages. 
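The cmpxchg loop just above implements the protection decay: every insertion bumps nr_zswap_protected, and once it exceeds a quarter of the LRU size it is halved, so the counter adapts to the workload instead of growing without bound. A userspace model of that step, with C11 atomics in place of atomic_long_t (protect_and_decay is an illustrative name):

#include <stdatomic.h>
#include <stdio.h>

/* Increment the protected counter, then decay it by half once it grows
 * past lru_size / 4, mirroring the loop in zswap_lru_add(). */
static long protect_and_decay(atomic_long *nr_protected, long lru_size)
{
    long old = atomic_fetch_add(nr_protected, 1) + 1;
    long new;

    do {
        new = old > lru_size / 4 ? old / 2 : old;
    } while (!atomic_compare_exchange_weak(nr_protected, &old, new));

    return new;
}

int main(void)
{
    atomic_long nr;

    atomic_init(&nr, 120);
    /* 121 > 400/4, so the counter decays to 60 */
    printf("protected now %ld\n", protect_and_decay(&nr, 400));
    return 0;
}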
+ */ +static void zswap_entry_free(struct zswap_entry *entry) +{ + if (!entry->length) + atomic_dec(&zswap_same_filled_pages); + else { + zswap_lru_del(&zswap_list_lru, entry); + zpool_free(zswap_find_zpool(entry), entry->handle); + zswap_pool_put(entry->pool); + } + if (entry->objcg) { + obj_cgroup_uncharge_zswap(entry->objcg, entry->length); + obj_cgroup_put(entry->objcg); + } + zswap_entry_cache_free(entry); + atomic_dec(&zswap_stored_pages); +} + +/********************************* +* compressed storage functions +**********************************/ +static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) +{ + struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); + struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + struct crypto_acomp *acomp; + struct acomp_req *req; + int ret; + + mutex_init(&acomp_ctx->mutex); + + acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + if (!acomp_ctx->buffer) + return -ENOMEM; + + acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); + if (IS_ERR(acomp)) { + pr_err("could not alloc crypto acomp %s : %ld\n", + pool->tfm_name, PTR_ERR(acomp)); + ret = PTR_ERR(acomp); + goto acomp_fail; + } + acomp_ctx->acomp = acomp; + acomp_ctx->is_sleepable = acomp_is_async(acomp); + + req = acomp_request_alloc(acomp_ctx->acomp); + if (!req) { + pr_err("could not alloc crypto acomp_request %s\n", + pool->tfm_name); + ret = -ENOMEM; + goto req_fail; + } + acomp_ctx->req = req; + + crypto_init_wait(&acomp_ctx->wait); + /* + * if the backend of acomp is async zip, crypto_req_done() will wakeup + * crypto_wait_req(); if the backend of acomp is scomp, the callback + * won't be called, crypto_wait_req() will return without blocking. + */ + acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &acomp_ctx->wait); + + return 0; + +req_fail: + crypto_free_acomp(acomp_ctx->acomp); +acomp_fail: + kfree(acomp_ctx->buffer); + return ret; +} + +static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) +{ + struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); + struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + + if (!IS_ERR_OR_NULL(acomp_ctx)) { + if (!IS_ERR_OR_NULL(acomp_ctx->req)) + acomp_request_free(acomp_ctx->req); + if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) + crypto_free_acomp(acomp_ctx->acomp); + kfree(acomp_ctx->buffer); + } + + return 0; +} + +static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) +{ + struct crypto_acomp_ctx *acomp_ctx; + struct scatterlist input, output; + int comp_ret = 0, alloc_ret = 0; + unsigned int dlen = PAGE_SIZE; + unsigned long handle; + struct zpool *zpool; + char *buf; + gfp_t gfp; + u8 *dst; + + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + + mutex_lock(&acomp_ctx->mutex); + + dst = acomp_ctx->buffer; + sg_init_table(&input, 1); + sg_set_page(&input, &folio->page, PAGE_SIZE, 0); + + /* + * We need PAGE_SIZE * 2 here since there maybe over-compression case, + * and hardware-accelerators may won't check the dst buffer size, so + * giving the dst buffer with enough length to avoid buffer overflow. + */ + sg_init_one(&output, dst, PAGE_SIZE * 2); + acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); + + /* + * it maybe looks a little bit silly that we send an asynchronous request, + * then wait for its completion synchronously. This makes the process look + * synchronous in fact. 
+ * Theoretically, acomp supports users send multiple acomp requests in one + * acomp instance, then get those requests done simultaneously. but in this + * case, zswap actually does store and load page by page, there is no + * existing method to send the second page before the first page is done + * in one thread doing zwap. + * but in different threads running on different cpu, we have different + * acomp instance, so multiple threads can do (de)compression in parallel. + */ + comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; + if (comp_ret) + goto unlock; + + zpool = zswap_find_zpool(entry); + gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + if (zpool_malloc_support_movable(zpool)) + gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; + alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle); + if (alloc_ret) + goto unlock; + + buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); + memcpy(buf, dst, dlen); + zpool_unmap_handle(zpool, handle); + + entry->handle = handle; + entry->length = dlen; + +unlock: + if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC) + zswap_reject_compress_poor++; + else if (comp_ret) + zswap_reject_compress_fail++; + else if (alloc_ret) + zswap_reject_alloc_fail++; + + mutex_unlock(&acomp_ctx->mutex); + return comp_ret == 0 && alloc_ret == 0; +} + +static void zswap_decompress(struct zswap_entry *entry, struct page *page) { struct zpool *zpool = zswap_find_zpool(entry); struct scatterlist input, output; @@ -1367,7 +982,17 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page) mutex_lock(&acomp_ctx->mutex); src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); - if (!zpool_can_sleep_mapped(zpool)) { + /* + * If zpool_map_handle is atomic, we cannot reliably utilize its mapped buffer + * to do crypto_acomp_decompress() which might sleep. In such cases, we must + * resort to copying the buffer to a temporary one. + * Meanwhile, zpool_map_handle() might return a non-linearly mapped buffer, + * such as a kmap address of high memory or even ever a vmap address. + * However, sg_init_one is only equipped to handle linearly mapped low memory. + * In such cases, we also must copy the buffer to a temporary and lowmem one. + */ + if ((acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) || + !virt_addr_valid(src)) { memcpy(acomp_ctx->buffer, src, entry->length); src = acomp_ctx->buffer; zpool_unmap_handle(zpool, entry->handle); @@ -1381,7 +1006,7 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page) BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); mutex_unlock(&acomp_ctx->mutex); - if (zpool_can_sleep_mapped(zpool)) + if (src != acomp_ctx->buffer) zpool_unmap_handle(zpool, entry->handle); } @@ -1401,9 +1026,10 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page) * freed. */ static int zswap_writeback_entry(struct zswap_entry *entry, - struct zswap_tree *tree) + swp_entry_t swpentry) { - swp_entry_t swpentry = entry->swpentry; + struct xarray *tree; + pgoff_t offset = swp_offset(swpentry); struct folio *folio; struct mempolicy *mpol; bool folio_was_allocated; @@ -1419,9 +1045,11 @@ static int zswap_writeback_entry(struct zswap_entry *entry, return -ENOMEM; /* - * Found an existing folio, we raced with load/swapin. We generally - * writeback cold folios from zswap, and swapin means the folio just - * became hot. Skip this folio and let the caller find another one. + * Found an existing folio, we raced with swapin or concurrent + * shrinker. 
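zswap_decompress() above copies the compressed data into the per-CPU buffer whenever the zpool mapping cannot be handed to the compressor directly: either the acomp backend may sleep while the mapping is atomic, or the mapped address is not linear lowmem that sg_init_one() can handle. A small sketch of that decision with the kernel predicates reduced to booleans (need_bounce_buffer is an illustrative name):

#include <stdbool.h>
#include <stdio.h>

/*
 * Decide whether the source must be copied to a temporary, linearly
 * mapped buffer before decompression:
 *  - a sleepable (async) compressor must not run over an atomic mapping;
 *  - sg_init_one() cannot take highmem/vmap addresses.
 */
static bool need_bounce_buffer(bool acomp_is_sleepable,
                               bool zpool_map_can_sleep,
                               bool src_is_linear_lowmem)
{
    if (acomp_is_sleepable && !zpool_map_can_sleep)
        return true;
    if (!src_is_linear_lowmem)
        return true;
    return false;
}

int main(void)
{
    printf("async acomp over atomic zsmalloc map: %d\n",
           need_bounce_buffer(true, false, true));
    printf("sync acomp over lowmem mapping:       %d\n",
           need_bounce_buffer(false, false, true));
    return 0;
}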
We generally writeback cold folios from zswap, and + * swapin means the folio just became hot, so skip this folio. + * For unlikely concurrent shrinker case, it will be unlinked + * and freed when invalidated by the concurrent shrinker anyway. */ if (!folio_was_allocated) { folio_put(folio); @@ -1430,22 +1058,28 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /* * folio is locked, and the swapcache is now secured against - * concurrent swapping to and from the slot. Verify that the - * swap entry hasn't been invalidated and recycled behind our - * backs (our zswap_entry reference doesn't prevent that), to - * avoid overwriting a new swap folio with old compressed data. + * concurrent swapping to and from the slot, and concurrent + * swapoff so we can safely dereference the zswap tree here. + * Verify that the swap entry hasn't been invalidated and recycled + * behind our backs, to avoid overwriting a new swap folio with + * old compressed data. Only when this is successful can the entry + * be dereferenced. */ - spin_lock(&tree->lock); - if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { - spin_unlock(&tree->lock); + tree = swap_zswap_tree(swpentry); + if (entry != xa_cmpxchg(tree, offset, entry, NULL, GFP_KERNEL)) { delete_from_swap_cache(folio); folio_unlock(folio); folio_put(folio); return -ENOMEM; } - spin_unlock(&tree->lock); - __zswap_load(entry, &folio->page); + zswap_decompress(entry, &folio->page); + + count_vm_event(ZSWPWB); + if (entry->objcg) + count_objcg_event(entry->objcg, ZSWPWB); + + zswap_entry_free(entry); /* folio is up to date */ folio_mark_uptodate(folio); @@ -1460,26 +1094,308 @@ static int zswap_writeback_entry(struct zswap_entry *entry, return 0; } -static int zswap_is_page_same_filled(void *ptr, unsigned long *value) +/********************************* +* shrinker functions +**********************************/ +static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, + spinlock_t *lock, void *arg) +{ + struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); + bool *encountered_page_in_swapcache = (bool *)arg; + swp_entry_t swpentry; + enum lru_status ret = LRU_REMOVED_RETRY; + int writeback_result; + + /* + * As soon as we drop the LRU lock, the entry can be freed by + * a concurrent invalidation. This means the following: + * + * 1. We extract the swp_entry_t to the stack, allowing + * zswap_writeback_entry() to pin the swap entry and + * then validate the zwap entry against that swap entry's + * tree using pointer value comparison. Only when that + * is successful can the entry be dereferenced. + * + * 2. Usually, objects are taken off the LRU for reclaim. In + * this case this isn't possible, because if reclaim fails + * for whatever reason, we have no means of knowing if the + * entry is alive to put it back on the LRU. + * + * So rotate it before dropping the lock. If the entry is + * written back or invalidated, the free path will unlink + * it. For failures, rotation is the right thing as well. + * + * Temporary failures, where the same entry should be tried + * again immediately, almost never happen for this shrinker. + * We don't do any trylocking; -ENOMEM comes closest, + * but that's extremely rare and doesn't happen spuriously + * either. Don't bother distinguishing this case. + */ + list_move_tail(item, &l->list); + + /* + * Once the lru lock is dropped, the entry might get freed. 
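With the tree lock gone, writeback claims the entry with a single compare-and-exchange on the xarray slot: it only proceeds if the slot still holds the exact entry it started from; otherwise the slot was invalidated, and possibly recycled, while the folio was being set up. A userspace model of that claim using an atomic pointer as a stand-in for the slot (claim_slot_for_writeback is an illustrative name):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct entry { int unused; };

/* Claim the slot for writeback only if it still holds the expected
 * entry, clearing it in the same step; otherwise back off. */
static bool claim_slot_for_writeback(struct entry *_Atomic *slot,
                                     struct entry *expected)
{
    struct entry *old = expected;
    struct entry *cleared = NULL;

    return atomic_compare_exchange_strong(slot, &old, cleared);
}

int main(void)
{
    struct entry e;
    struct entry *_Atomic slot;

    atomic_init(&slot, &e);
    printf("first claim:  %d\n", claim_slot_for_writeback(&slot, &e));
    printf("second claim: %d\n", claim_slot_for_writeback(&slot, &e));  /* already gone */
    return 0;
}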
The + * swpentry is copied to the stack, and entry isn't deref'd again + * until the entry is verified to still be alive in the tree. + */ + swpentry = entry->swpentry; + + /* + * It's safe to drop the lock here because we return either + * LRU_REMOVED_RETRY or LRU_RETRY. + */ + spin_unlock(lock); + + writeback_result = zswap_writeback_entry(entry, swpentry); + + if (writeback_result) { + zswap_reject_reclaim_fail++; + ret = LRU_RETRY; + + /* + * Encountering a page already in swap cache is a sign that we are shrinking + * into the warmer region. We should terminate shrinking (if we're in the dynamic + * shrinker context). + */ + if (writeback_result == -EEXIST && encountered_page_in_swapcache) { + ret = LRU_STOP; + *encountered_page_in_swapcache = true; + } + } else { + zswap_written_back_pages++; + } + + spin_lock(lock); + return ret; +} + +static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); + unsigned long shrink_ret, nr_protected, lru_size; + bool encountered_page_in_swapcache = false; + + if (!zswap_shrinker_enabled || + !mem_cgroup_zswap_writeback_enabled(sc->memcg)) { + sc->nr_scanned = 0; + return SHRINK_STOP; + } + + nr_protected = + atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); + lru_size = list_lru_shrink_count(&zswap_list_lru, sc); + + /* + * Abort if we are shrinking into the protected region. + * + * This short-circuiting is necessary because if we have too many multiple + * concurrent reclaimers getting the freeable zswap object counts at the + * same time (before any of them made reasonable progress), the total + * number of reclaimed objects might be more than the number of unprotected + * objects (i.e the reclaimers will reclaim into the protected area of the + * zswap LRU). + */ + if (nr_protected >= lru_size - sc->nr_to_scan) { + sc->nr_scanned = 0; + return SHRINK_STOP; + } + + shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb, + &encountered_page_in_swapcache); + + if (encountered_page_in_swapcache) + return SHRINK_STOP; + + return shrink_ret ? shrink_ret : SHRINK_STOP; +} + +static unsigned long zswap_shrinker_count(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct mem_cgroup *memcg = sc->memcg; + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); + unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; + + if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) + return 0; + + /* + * The shrinker resumes swap writeback, which will enter block + * and may enter fs. XXX: Harmonize with vmscan.c __GFP_FS + * rules (may_enter_fs()), which apply on a per-folio basis. + */ + if (!gfp_has_io_fs(sc->gfp_mask)) + return 0; + + /* + * For memcg, use the cgroup-wide ZSWAP stats since we don't + * have them per-node and thus per-lruvec. Careful if memcg is + * runtime-disabled: we can get sc->memcg == NULL, which is ok + * for the lruvec, but not for memcg_page_state(). + * + * Without memcg, use the zswap pool-wide metrics. 
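zswap_shrinker_scan() above bails out when the requested batch would reach past the unprotected tail of the LRU, so multiple concurrent reclaimers cannot collectively eat into the protected (warm) region. A standalone sketch of that bound check (should_abort_scan is an illustrative name; like the kernel check it assumes nr_to_scan does not exceed lru_size):

#include <stdbool.h>
#include <stdio.h>

/* True when scanning nr_to_scan objects would dip into the protected
 * region, i.e. nr_protected >= lru_size - nr_to_scan. */
static bool should_abort_scan(unsigned long nr_protected,
                              unsigned long lru_size,
                              unsigned long nr_to_scan)
{
    return nr_protected >= lru_size - nr_to_scan;
}

int main(void)
{
    /* 1000 objects with 300 protected: a 128-object batch is fine... */
    printf("%d\n", should_abort_scan(300, 1000, 128));
    /* ...but with 900 protected it would reach into the warm region. */
    printf("%d\n", should_abort_scan(900, 1000, 128));
    return 0;
}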
+ */ + if (!mem_cgroup_disabled()) { + mem_cgroup_flush_stats(memcg); + nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; + nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); + } else { + nr_backing = zswap_total_pages(); + nr_stored = atomic_read(&zswap_stored_pages); + } + + if (!nr_stored) + return 0; + + nr_protected = + atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); + nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc); + /* + * Subtract the lru size by an estimate of the number of pages + * that should be protected. + */ + nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; + + /* + * Scale the number of freeable pages by the memory saving factor. + * This ensures that the better zswap compresses memory, the fewer + * pages we will evict to swap (as it will otherwise incur IO for + * relatively small memory saving). + * + * The memory saving factor calculated here takes same-filled pages into + * account, but those are not freeable since they almost occupy no + * space. Hence, we may scale nr_freeable down a little bit more than we + * should if we have a lot of same-filled pages. + */ + return mult_frac(nr_freeable, nr_backing, nr_stored); +} + +static struct shrinker *zswap_alloc_shrinker(void) +{ + struct shrinker *shrinker; + + shrinker = + shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); + if (!shrinker) + return NULL; + + shrinker->scan_objects = zswap_shrinker_scan; + shrinker->count_objects = zswap_shrinker_count; + shrinker->batch = 0; + shrinker->seeks = DEFAULT_SEEKS; + return shrinker; +} + +static int shrink_memcg(struct mem_cgroup *memcg) +{ + int nid, shrunk = 0; + + if (!mem_cgroup_zswap_writeback_enabled(memcg)) + return -EINVAL; + + /* + * Skip zombies because their LRUs are reparented and we would be + * reclaiming from the parent instead of the dead memcg. + */ + if (memcg && !mem_cgroup_online(memcg)) + return -ENOENT; + + for_each_node_state(nid, N_NORMAL_MEMORY) { + unsigned long nr_to_walk = 1; + + shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg, + &shrink_memcg_cb, NULL, &nr_to_walk); + } + return shrunk ? 0 : -EAGAIN; +} + +static void shrink_worker(struct work_struct *w) +{ + struct mem_cgroup *memcg; + int ret, failures = 0; + unsigned long thr; + + /* Reclaim down to the accept threshold */ + thr = zswap_accept_thr_pages(); + + /* global reclaim will select cgroup in a round-robin fashion. */ + do { + spin_lock(&zswap_shrink_lock); + zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + memcg = zswap_next_shrink; + + /* + * We need to retry if we have gone through a full round trip, or if we + * got an offline memcg (or else we risk undoing the effect of the + * zswap memcg offlining cleanup callback). This is not catastrophic + * per se, but it will keep the now offlined memcg hostage for a while. + * + * Note that if we got an online memcg, we will keep the extra + * reference in case the original reference obtained by mem_cgroup_iter + * is dropped by the zswap memcg offlining callback, ensuring that the + * memcg is not killed when we are reclaiming. 
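The count callback above scales the freeable LRU objects by the pool's effective compression ratio, so the better zswap compresses, the less eagerly its objects are offered for writeback. A sketch of that calculation with mult_frac() reduced to plain integer math and made-up input numbers:

#include <stdio.h>

/* Subtract the protected estimate, then scale what is left by
 * nr_backing / nr_stored, i.e. by the real memory cost per stored page. */
static unsigned long scaled_freeable(unsigned long nr_freeable,
                                     unsigned long nr_protected,
                                     unsigned long nr_backing,
                                     unsigned long nr_stored)
{
    if (!nr_stored)
        return 0;
    nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;
    return nr_freeable * nr_backing / nr_stored;   /* mult_frac() stand-in */
}

int main(void)
{
    /* 1000 freeable, 200 protected, ~3:1 ratio (250 backing vs 750 stored) */
    printf("%lu freeable after scaling\n", scaled_freeable(1000, 200, 250, 750));
    return 0;
}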
+ */ + if (!memcg) { + spin_unlock(&zswap_shrink_lock); + if (++failures == MAX_RECLAIM_RETRIES) + break; + + goto resched; + } + + if (!mem_cgroup_tryget_online(memcg)) { + /* drop the reference from mem_cgroup_iter() */ + mem_cgroup_iter_break(NULL, memcg); + zswap_next_shrink = NULL; + spin_unlock(&zswap_shrink_lock); + + if (++failures == MAX_RECLAIM_RETRIES) + break; + + goto resched; + } + spin_unlock(&zswap_shrink_lock); + + ret = shrink_memcg(memcg); + /* drop the extra reference */ + mem_cgroup_put(memcg); + + if (ret == -EINVAL) + break; + if (ret && ++failures == MAX_RECLAIM_RETRIES) + break; +resched: + cond_resched(); + } while (zswap_total_pages() > thr); +} + +/********************************* +* same-filled functions +**********************************/ +static bool zswap_is_folio_same_filled(struct folio *folio, unsigned long *value) { unsigned long *page; unsigned long val; unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1; + bool ret = false; - page = (unsigned long *)ptr; + page = kmap_local_folio(folio, 0); val = page[0]; if (val != page[last_pos]) - return 0; + goto out; for (pos = 1; pos < last_pos; pos++) { if (val != page[pos]) - return 0; + goto out; } *value = val; - - return 1; + ret = true; +out: + kunmap_local(page); + return ret; } static void zswap_fill_page(void *ptr, unsigned long value) @@ -1490,26 +1406,18 @@ static void zswap_fill_page(void *ptr, unsigned long value) memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); } +/********************************* +* main API +**********************************/ bool zswap_store(struct folio *folio) { swp_entry_t swp = folio->swap; - int type = swp_type(swp); pgoff_t offset = swp_offset(swp); - struct page *page = &folio->page; - struct zswap_tree *tree = zswap_trees[type]; - struct zswap_entry *entry, *dupentry; - struct scatterlist input, output; - struct crypto_acomp_ctx *acomp_ctx; + struct xarray *tree = swap_zswap_tree(swp); + struct zswap_entry *entry, *old; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; - struct zswap_pool *pool; - struct zpool *zpool; - unsigned int dlen = PAGE_SIZE; - unsigned long handle, value; - char *buf; - u8 *src, *dst; - gfp_t gfp; - int ret; + unsigned long value; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1518,25 +1426,10 @@ bool zswap_store(struct folio *folio) if (folio_test_large(folio)) return false; - if (!tree) - return false; - - /* - * If this is a duplicate, it must be removed before attempting to store - * it, otherwise, if the store fails the old page won't be removed from - * the tree, and it might be written back overriding the new data. 
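zswap_is_folio_same_filled() above maps the folio itself and reports whether the whole page is a single repeated word, checking the last word first as a cheap early reject; such pages are stored as a bare value rather than a compressed buffer. A userspace version of the same scan over a page-sized buffer:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* True if the page is one repeated word; the value is returned through
 * *value so it can be stored instead of a compressed copy. */
static bool page_same_filled(const void *ptr, unsigned long *value)
{
    const unsigned long *word = ptr;
    unsigned long val = word[0];
    unsigned int pos, last_pos = PAGE_SIZE / sizeof(*word) - 1;

    if (val != word[last_pos])
        return false;
    for (pos = 1; pos < last_pos; pos++) {
        if (val != word[pos])
            return false;
    }
    *value = val;
    return true;
}

int main(void)
{
    static unsigned long page[PAGE_SIZE / sizeof(unsigned long)];  /* zero-filled */
    unsigned long value;

    printf("zero page:    %d\n", page_same_filled(page, &value));
    page[100] = 1;
    printf("dirtied page: %d\n", page_same_filled(page, &value));
    return 0;
}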
- */ - spin_lock(&tree->lock); - dupentry = zswap_rb_search(&tree->rbroot, offset); - if (dupentry) { - zswap_duplicate_entry++; - zswap_invalidate_entry(tree, dupentry); - } - spin_unlock(&tree->lock); - if (!zswap_enabled) - return false; + goto check_old; + /* Check cgroup limits */ objcg = get_obj_cgroup_from_folio(folio); if (objcg && !obj_cgroup_may_zswap(objcg)) { memcg = get_mem_cgroup_from_objcg(objcg); @@ -1547,43 +1440,23 @@ bool zswap_store(struct folio *folio) mem_cgroup_put(memcg); } - /* reclaim space if needed */ - if (zswap_is_full()) { - zswap_pool_limit_hit++; - zswap_pool_reached_full = true; - goto shrink; - } - - if (zswap_pool_reached_full) { - if (!zswap_can_accept()) - goto shrink; - else - zswap_pool_reached_full = false; - } + if (zswap_check_limits()) + goto reject; /* allocate entry */ - entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); + entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio)); if (!entry) { zswap_reject_kmemcache_fail++; goto reject; } - if (zswap_same_filled_pages_enabled) { - src = kmap_local_page(page); - if (zswap_is_page_same_filled(src, &value)) { - kunmap_local(src); - entry->swpentry = swp_entry(type, offset); - entry->length = 0; - entry->value = value; - atomic_inc(&zswap_same_filled_pages); - goto insert_entry; - } - kunmap_local(src); + if (zswap_is_folio_same_filled(folio, &value)) { + entry->length = 0; + entry->value = value; + atomic_inc(&zswap_same_filled_pages); + goto store_entry; } - if (!zswap_non_same_filled_pages_enabled) - goto freepage; - /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); if (!entry->pool) @@ -1591,149 +1464,123 @@ bool zswap_store(struct folio *folio) if (objcg) { memcg = get_mem_cgroup_from_objcg(objcg); - if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) { + if (memcg_list_lru_alloc(memcg, &zswap_list_lru, GFP_KERNEL)) { mem_cgroup_put(memcg); goto put_pool; } mem_cgroup_put(memcg); } - /* compress */ - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - - mutex_lock(&acomp_ctx->mutex); - - dst = acomp_ctx->buffer; - sg_init_table(&input, 1); - sg_set_page(&input, &folio->page, PAGE_SIZE, 0); + if (!zswap_compress(folio, entry)) + goto put_pool; - /* - * We need PAGE_SIZE * 2 here since there maybe over-compression case, - * and hardware-accelerators may won't check the dst buffer size, so - * giving the dst buffer with enough length to avoid buffer overflow. - */ - sg_init_one(&output, dst, PAGE_SIZE * 2); - acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); - /* - * it maybe looks a little bit silly that we send an asynchronous request, - * then wait for its completion synchronously. This makes the process look - * synchronous in fact. - * Theoretically, acomp supports users send multiple acomp requests in one - * acomp instance, then get those requests done simultaneously. but in this - * case, zswap actually does store and load page by page, there is no - * existing method to send the second page before the first page is done - * in one thread doing zwap. - * but in different threads running on different cpu, we have different - * acomp instance, so multiple threads can do (de)compression in parallel. 
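The store path now defers to zswap_check_limits() (added earlier in this patch): stores are rejected once the pool crosses the hard limit, and keep being rejected until usage drops back under the acceptance threshold, giving the shrink worker room to make progress. A standalone model of that hysteresis, with the RAM size and the two module-parameter percentages chosen purely for illustration:

#include <stdbool.h>
#include <stdio.h>

static unsigned long total_ram_pages = 1UL << 20;  /* assumed: 4GB of 4K pages */
static unsigned int max_pool_percent = 20;         /* zswap.max_pool_percent */
static unsigned int accept_thr_percent = 90;       /* zswap.accept_threshold_percent */
static bool pool_reached_full;

/* Returns true while stores should be rejected, mirroring zswap_check_limits(). */
static bool check_limits(unsigned long cur_pages)
{
    unsigned long max_pages = total_ram_pages * max_pool_percent / 100;
    unsigned long accept_pages = max_pages * accept_thr_percent / 100;

    if (cur_pages >= max_pages)
        pool_reached_full = true;
    else if (pool_reached_full && cur_pages <= accept_pages)
        pool_reached_full = false;

    return pool_reached_full;
}

int main(void)
{
    printf("%d\n", check_limits(300000));  /* above the ~210k-page limit: reject */
    printf("%d\n", check_limits(200000));  /* back under the limit, above 90%: still reject */
    printf("%d\n", check_limits(180000));  /* under the acceptance threshold: accept again */
    return 0;
}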
- */ - ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); - dlen = acomp_ctx->req->dlen; +store_entry: + entry->swpentry = swp; + entry->objcg = objcg; - if (ret) { - zswap_reject_compress_fail++; - goto put_dstmem; - } + old = xa_store(tree, offset, entry, GFP_KERNEL); + if (xa_is_err(old)) { + int err = xa_err(old); - /* store */ - zpool = zswap_find_zpool(entry); - gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - if (zpool_malloc_support_movable(zpool)) - gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; - ret = zpool_malloc(zpool, dlen, gfp, &handle); - if (ret == -ENOSPC) { - zswap_reject_compress_poor++; - goto put_dstmem; - } - if (ret) { + WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err); zswap_reject_alloc_fail++; - goto put_dstmem; + goto store_failed; } - buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); - memcpy(buf, dst, dlen); - zpool_unmap_handle(zpool, handle); - mutex_unlock(&acomp_ctx->mutex); - /* populate entry */ - entry->swpentry = swp_entry(type, offset); - entry->handle = handle; - entry->length = dlen; + /* + * We may have had an existing entry that became stale when + * the folio was redirtied and now the new version is being + * swapped out. Get rid of the old. + */ + if (old) + zswap_entry_free(old); -insert_entry: - entry->objcg = objcg; if (objcg) { obj_cgroup_charge_zswap(objcg, entry->length); - /* Account before objcg ref is moved to tree */ count_objcg_event(objcg, ZSWPOUT); } - /* map */ - spin_lock(&tree->lock); /* - * A duplicate entry should have been removed at the beginning of this - * function. Since the swap entry should be pinned, if a duplicate is - * found again here it means that something went wrong in the swap - * cache. + * We finish initializing the entry while it's already in xarray. + * This is safe because: + * + * 1. Concurrent stores and invalidations are excluded by folio lock. + * + * 2. Writeback is excluded by the entry not being on the LRU yet. + * The publishing order matters to prevent writeback from seeing + * an incoherent entry. */ - while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { - WARN_ON(1); - zswap_duplicate_entry++; - zswap_invalidate_entry(tree, dupentry); - } if (entry->length) { INIT_LIST_HEAD(&entry->lru); - zswap_lru_add(&entry->pool->list_lru, entry); - atomic_inc(&entry->pool->nr_stored); + zswap_lru_add(&zswap_list_lru, entry); } - spin_unlock(&tree->lock); /* update stats */ atomic_inc(&zswap_stored_pages); - zswap_update_total_size(); count_vm_event(ZSWPOUT); return true; -put_dstmem: - mutex_unlock(&acomp_ctx->mutex); +store_failed: + if (!entry->length) + atomic_dec(&zswap_same_filled_pages); + else { + zpool_free(zswap_find_zpool(entry), entry->handle); put_pool: - zswap_pool_put(entry->pool); + zswap_pool_put(entry->pool); + } freepage: zswap_entry_cache_free(entry); reject: - if (objcg) - obj_cgroup_put(objcg); + obj_cgroup_put(objcg); + if (zswap_pool_reached_full) + queue_work(shrink_wq, &zswap_shrink_work); +check_old: + /* + * If the zswap store fails or zswap is disabled, we must invalidate the + * possibly stale entry which was previously stored at this offset. + * Otherwise, writeback could overwrite the new data in the swapfile. 
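With the xarray, a store simply publishes the new entry with xa_store(), freeing whatever stale entry it displaces, and the failure path (check_old above) erases any leftover entry at that offset so a later writeback cannot push stale data over the freshly written swap slot. A minimal model of that replace-then-erase discipline, with a plain pointer standing in for the tree slot:

#include <stdio.h>
#include <stdlib.h>

struct entry { unsigned long value; };

/* Publish a new entry, freeing the stale one it replaces (the folio was
 * redirtied and is being swapped out again). */
static void store_entry(struct entry **slot, struct entry *new)
{
    struct entry *old = *slot;

    *slot = new;
    free(old);             /* zswap_entry_free() stand-in; free(NULL) is a no-op */
}

/* On a failed or disabled store, drop any stale entry at this offset so
 * writeback cannot overwrite the new swapfile data with old contents. */
static void invalidate_stale(struct entry **slot)
{
    struct entry *old = *slot;

    *slot = NULL;
    free(old);
}

int main(void)
{
    struct entry *slot = NULL;

    store_entry(&slot, calloc(1, sizeof(struct entry)));
    store_entry(&slot, calloc(1, sizeof(struct entry)));  /* frees the first */
    invalidate_stale(&slot);                              /* failed-store path */
    printf("slot is now %p\n", (void *)slot);
    return 0;
}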
+ */ + entry = xa_erase(tree, offset); + if (entry) + zswap_entry_free(entry); return false; - -shrink: - pool = zswap_pool_last_get(); - if (pool && !queue_work(shrink_wq, &pool->shrink_work)) - zswap_pool_put(pool); - goto reject; } bool zswap_load(struct folio *folio) { swp_entry_t swp = folio->swap; - int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; - struct zswap_tree *tree = zswap_trees[type]; + bool swapcache = folio_test_swapcache(folio); + struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry; u8 *dst; VM_WARN_ON_ONCE(!folio_test_locked(folio)); - /* find */ - spin_lock(&tree->lock); - entry = zswap_entry_find_get(&tree->rbroot, offset); - if (!entry) { - spin_unlock(&tree->lock); + /* + * When reading into the swapcache, invalidate our entry. The + * swapcache can be the authoritative owner of the page and + * its mappings, and the pressure that results from having two + * in-memory copies outweighs any benefits of caching the + * compression work. + * + * (Most swapins go through the swapcache. The notable + * exception is the singleton fault on SWP_SYNCHRONOUS_IO + * files, which reads into a private page and may free it if + * the fault fails. We remain the primary owner of the entry.) + */ + if (swapcache) + entry = xa_erase(tree, offset); + else + entry = xa_load(tree, offset); + + if (!entry) return false; - } - spin_unlock(&tree->lock); if (entry->length) - __zswap_load(entry, page); + zswap_decompress(entry, page); else { dst = kmap_local_page(page); zswap_fill_page(dst, entry->value); @@ -1744,67 +1591,59 @@ bool zswap_load(struct folio *folio) if (entry->objcg) count_objcg_event(entry->objcg, ZSWPIN); - spin_lock(&tree->lock); - if (zswap_exclusive_loads_enabled) { - zswap_invalidate_entry(tree, entry); + if (swapcache) { + zswap_entry_free(entry); folio_mark_dirty(folio); - } else if (entry->length) { - zswap_lru_del(&entry->pool->list_lru, entry); - zswap_lru_add(&entry->pool->list_lru, entry); } - zswap_entry_put(tree, entry); - spin_unlock(&tree->lock); return true; } -void zswap_invalidate(int type, pgoff_t offset) +void zswap_invalidate(swp_entry_t swp) { - struct zswap_tree *tree = zswap_trees[type]; + pgoff_t offset = swp_offset(swp); + struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry; - /* find */ - spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); - if (!entry) { - /* entry was written back */ - spin_unlock(&tree->lock); - return; - } - zswap_invalidate_entry(tree, entry); - spin_unlock(&tree->lock); + entry = xa_erase(tree, offset); + if (entry) + zswap_entry_free(entry); } -void zswap_swapon(int type) +int zswap_swapon(int type, unsigned long nr_pages) { - struct zswap_tree *tree; + struct xarray *trees, *tree; + unsigned int nr, i; - tree = kzalloc(sizeof(*tree), GFP_KERNEL); - if (!tree) { + nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); + trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL); + if (!trees) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); - return; + return -ENOMEM; } - tree->rbroot = RB_ROOT; - spin_lock_init(&tree->lock); - zswap_trees[type] = tree; + for (i = 0; i < nr; i++) + xa_init(trees + i); + + nr_zswap_trees[type] = nr; + zswap_trees[type] = trees; + return 0; } void zswap_swapoff(int type) { - struct zswap_tree *tree = zswap_trees[type]; - struct zswap_entry *entry, *n; + struct xarray *trees = zswap_trees[type]; + unsigned int i; - if (!tree) + if (!trees) return; - /* walk the tree and free everything 
*/ - spin_lock(&tree->lock); - rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) - zswap_free_entry(entry); - tree->rbroot = RB_ROOT; - spin_unlock(&tree->lock); - kfree(tree); + /* try_to_unuse() invalidated all the entries already */ + for (i = 0; i < nr_zswap_trees[type]; i++) + WARN_ON_ONCE(!xa_empty(trees + i)); + + kvfree(trees); + nr_zswap_trees[type] = 0; zswap_trees[type] = NULL; } @@ -1816,6 +1655,13 @@ void zswap_swapoff(int type) static struct dentry *zswap_debugfs_root; +static int debugfs_get_total_size(void *data, u64 *val) +{ + *val = zswap_total_pages() * PAGE_SIZE; + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu\n"); + static int zswap_debugfs_init(void) { if (!debugfs_initialized()) @@ -1837,10 +1683,8 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, &zswap_reject_compress_poor); debugfs_create_u64("written_back_pages", 0444, zswap_debugfs_root, &zswap_written_back_pages); - debugfs_create_u64("duplicate_entry", 0444, - zswap_debugfs_root, &zswap_duplicate_entry); - debugfs_create_u64("pool_total_size", 0444, - zswap_debugfs_root, &zswap_pool_total_size); + debugfs_create_file("pool_total_size", 0444, + zswap_debugfs_root, NULL, &total_size_fops); debugfs_create_atomic_t("stored_pages", 0444, zswap_debugfs_root, &zswap_stored_pages); debugfs_create_atomic_t("same_filled_pages", 0444, @@ -1876,6 +1720,20 @@ static int zswap_setup(void) if (ret) goto hp_fail; + shrink_wq = alloc_workqueue("zswap-shrink", + WQ_UNBOUND|WQ_MEM_RECLAIM, 1); + if (!shrink_wq) + goto shrink_wq_fail; + + zswap_shrinker = zswap_alloc_shrinker(); + if (!zswap_shrinker) + goto shrinker_fail; + if (list_lru_init_memcg(&zswap_list_lru, zswap_shrinker)) + goto lru_fail; + shrinker_register(zswap_shrinker); + + INIT_WORK(&zswap_shrink_work, shrink_worker); + pool = __zswap_pool_create_fallback(); if (pool) { pr_info("loaded using pool %s/%s\n", pool->tfm_name, @@ -1887,18 +1745,17 @@ static int zswap_setup(void) zswap_enabled = false; } - shrink_wq = create_workqueue("zswap-shrink"); - if (!shrink_wq) - goto fallback_fail; - if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); zswap_init_state = ZSWAP_INIT_SUCCEED; return 0; -fallback_fail: - if (pool) - zswap_pool_destroy(pool); +lru_fail: + shrinker_free(zswap_shrinker); +shrinker_fail: + destroy_workqueue(shrink_wq); +shrink_wq_fail: + cpuhp_remove_multi_state(CPUHP_MM_ZSWP_POOL_PREPARE); hp_fail: kmem_cache_destroy(zswap_entry_cache); cache_fail: |
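zswap_swapon() just above sizes the per-swapfile array of trees from the device size, one xarray per SWAP_ADDRESS_SPACE_PAGES worth of slots, and zswap_swapoff() only has to free the (already empty) trees. A sketch of the sizing math, assuming 2^14 pages per tree (64MB with 4K pages) as in the swap cache:

#include <stdio.h>

#define SWAP_ADDRESS_SPACE_PAGES (1UL << 14)   /* assumed, as in the swap cache */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Number of xarray trees zswap_swapon() allocates for a swap device. */
static unsigned long nr_zswap_trees_for(unsigned long nr_pages)
{
    return DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
}

int main(void)
{
    /* an 8GB swap device has 2M 4K-sized slots -> 128 trees */
    printf("%lu trees\n", nr_zswap_trees_for(2UL * 1024 * 1024));
    return 0;
}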