Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 55
-rw-r--r--  mm/Makefile | 4
-rw-r--r--  mm/backing-dev.c | 2
-rw-r--r--  mm/balloon_compaction.c | 21
-rw-r--r--  mm/cma.c | 352
-rw-r--r--  mm/cma.h | 6
-rw-r--r--  mm/cma_debug.c | 10
-rw-r--r--  mm/compaction.c | 74
-rw-r--r--  mm/damon/Kconfig | 19
-rw-r--r--  mm/damon/Makefile | 1
-rw-r--r--  mm/damon/core.c | 270
-rw-r--r--  mm/damon/lru_sort.c | 75
-rw-r--r--  mm/damon/modules-common.c | 2
-rw-r--r--  mm/damon/modules-common.h | 2
-rw-r--r--  mm/damon/ops-common.c | 276
-rw-r--r--  mm/damon/ops-common.h | 7
-rw-r--r--  mm/damon/paddr.c | 280
-rw-r--r--  mm/damon/reclaim.c | 71
-rw-r--r--  mm/damon/stat.c | 264
-rw-r--r--  mm/damon/sysfs-common.c | 2
-rw-r--r--  mm/damon/sysfs-common.h | 2
-rw-r--r--  mm/damon/sysfs-schemes.c | 546
-rw-r--r--  mm/damon/sysfs.c | 171
-rw-r--r--  mm/damon/tests/core-kunit.h | 74
-rw-r--r--  mm/damon/tests/vaddr-kunit.h | 2
-rw-r--r--  mm/damon/vaddr.c | 243
-rw-r--r--  mm/debug.c | 50
-rw-r--r--  mm/debug_page_alloc.c | 2
-rw-r--r--  mm/debug_vm_pgtable.c | 131
-rw-r--r--  mm/dmapool.c | 19
-rw-r--r--  mm/execmem.c | 10
-rw-r--r--  mm/filemap.c | 147
-rw-r--r--  mm/gup.c | 404
-rw-r--r--  mm/hmm.c | 276
-rw-r--r--  mm/huge_memory.c | 537
-rw-r--r--  mm/hugetlb.c | 509
-rw-r--r--  mm/hugetlb_cma.c | 11
-rw-r--r--  mm/hugetlb_vmemmap.c | 8
-rw-r--r--  mm/internal.h | 188
-rw-r--r--  mm/io-mapping.c | 9
-rw-r--r--  mm/kasan/Makefile | 3
-rw-r--r--  mm/kasan/kasan_test_c.c | 34
-rw-r--r--  mm/kasan/report.c | 47
-rw-r--r--  mm/kasan/shadow.c | 92
-rw-r--r--  mm/kfence/core.c | 4
-rw-r--r--  mm/khugepaged.c | 92
-rw-r--r--  mm/kmemleak.c | 23
-rw-r--r--  mm/kmsan/core.c | 12
-rw-r--r--  mm/kmsan/hooks.c | 6
-rw-r--r--  mm/kmsan/init.c | 3
-rw-r--r--  mm/kmsan/instrumentation.c | 4
-rw-r--r--  mm/kmsan/kmsan.h | 1
-rw-r--r--  mm/kmsan/kmsan_test.c | 1
-rw-r--r--  mm/kmsan/report.c | 6
-rw-r--r--  mm/kmsan/shadow.c | 7
-rw-r--r--  mm/ksm.c | 61
-rw-r--r--  mm/list_lru.c | 34
-rw-r--r--  mm/maccess.c | 3
-rw-r--r--  mm/madvise.c | 864
-rw-r--r--  mm/mapping_dirty_helpers.c | 6
-rw-r--r--  mm/memblock.c | 341
-rw-r--r--  mm/memcontrol-v1.c | 11
-rw-r--r--  mm/memcontrol.c | 850
-rw-r--r--  mm/memfd.c | 39
-rw-r--r--  mm/memory-failure.c | 21
-rw-r--r--  mm/memory-tiers.c | 19
-rw-r--r--  mm/memory.c | 635
-rw-r--r--  mm/memory_hotplug.c | 217
-rw-r--r--  mm/mempolicy.c | 547
-rw-r--r--  mm/mempool.c | 34
-rw-r--r--  mm/memremap.c | 42
-rw-r--r--  mm/migrate.c | 388
-rw-r--r--  mm/migrate_device.c | 2
-rw-r--r--  mm/mincore.c | 22
-rw-r--r--  mm/mlock.c | 4
-rw-r--r--  mm/mm_init.c | 96
-rw-r--r--  mm/mmap.c | 334
-rw-r--r--  mm/mmap_lock.c | 366
-rw-r--r--  mm/mmu_gather.c | 1
-rw-r--r--  mm/mmu_notifier.c | 2
-rw-r--r--  mm/mprotect.c | 307
-rw-r--r--  mm/mremap.c | 635
-rw-r--r--  mm/nommu.c | 60
-rw-r--r--  mm/numa.c | 4
-rw-r--r--  mm/numa_memblks.c | 22
-rw-r--r--  mm/page-writeback.c | 49
-rw-r--r--  mm/page_alloc.c | 674
-rw-r--r--  mm/page_ext.c | 17
-rw-r--r--  mm/page_idle.c | 2
-rw-r--r--  mm/page_io.c | 72
-rw-r--r--  mm/page_isolation.c | 121
-rw-r--r--  mm/page_owner.c | 6
-rw-r--r--  mm/page_table_check.c | 34
-rw-r--r--  mm/page_vma_mapped.c | 5
-rw-r--r--  mm/pagewalk.c | 90
-rw-r--r--  mm/percpu-stats.c | 1
-rw-r--r--  mm/percpu.c | 2
-rw-r--r--  mm/pgtable-generic.c | 7
-rw-r--r--  mm/ptdump.c | 67
-rw-r--r--  mm/readahead.c | 56
-rw-r--r--  mm/rmap.c | 98
-rw-r--r--  mm/secretmem.c | 59
-rw-r--r--  mm/shmem.c | 182
-rw-r--r--  mm/show_mem.c | 24
-rw-r--r--  mm/slab.h | 28
-rw-r--r--  mm/slab_common.c | 2
-rw-r--r--  mm/slub.c | 194
-rw-r--r--  mm/swap.c | 41
-rw-r--r--  mm/swap.h | 50
-rw-r--r--  mm/swap_state.c | 9
-rw-r--r--  mm/swapfile.c | 288
-rw-r--r--  mm/truncate.c | 22
-rw-r--r--  mm/userfaultfd.c | 152
-rw-r--r--  mm/util.c | 156
-rw-r--r--  mm/vma.c | 443
-rw-r--r--  mm/vma.h | 107
-rw-r--r--  mm/vma_exec.c | 161
-rw-r--r--  mm/vma_init.c | 151
-rw-r--r--  mm/vmalloc.c | 315
-rw-r--r--  mm/vmpressure.c | 2
-rw-r--r--  mm/vmscan.c | 641
-rw-r--r--  mm/vmstat.c | 457
-rw-r--r--  mm/workingset.c | 4
-rw-r--r--  mm/zpdesc.h | 22
-rw-r--r--  mm/zpool.c | 8
-rw-r--r--  mm/zsmalloc.c | 76
-rw-r--r--  mm/zswap.c | 37
127 files changed, 10294 insertions, 6072 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d3fb3762887b..d5d4eca947a6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -201,7 +201,7 @@ config KVFREE_RCU_BATCHED
config SLUB_TINY
bool "Configure for minimal memory footprint"
- depends on EXPERT
+ depends on EXPERT && !COMPILE_TEST
select SLAB_MERGE_DEFAULT
help
Configures the slab allocator in a way to achieve minimal memory
@@ -469,6 +469,10 @@ config HAVE_GUP_FAST
depends on MMU
bool
+# Enable memblock support for scratch memory which is needed for kexec handover
+config MEMBLOCK_KHO_SCRATCH
+ bool
+
# Don't discard allocated memory used to track "memory" and "reserved" memblocks
# after early boot, so it can still be used to test for validity of memory.
# Also, memblocks are updated with memory hot(un)plug.
@@ -882,7 +886,7 @@ config THP_SWAP
config READ_ONLY_THP_FOR_FS
bool "Read-only THP for filesystems (EXPERIMENTAL)"
- depends on TRANSPARENT_HUGEPAGE && SHMEM
+ depends on TRANSPARENT_HUGEPAGE
help
Allow khugepaged to put read-only file-backed pages in THP.
@@ -930,6 +934,13 @@ config ARCH_SUPPORTS_PUD_PFNMAP
depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
#
+# Architectures that always use weak definitions for percpu
+# variables in modules should set this.
+#
+config ARCH_MODULE_NEEDS_WEAK_PER_CPU
+ bool
+
+#
# UP and nommu archs use km based percpu allocator
#
config NEED_PER_CPU_KM
@@ -989,6 +1000,41 @@ config CMA_AREAS
If unsure, leave the default value "8" in UMA and "20" in NUMA.
+#
+# Select this config option from the architecture Kconfig, if available, to set
+# the max page order for physically contiguous allocations.
+#
+config ARCH_FORCE_MAX_ORDER
+ int
+
+#
+# When ARCH_FORCE_MAX_ORDER is not defined,
+# the default page block order is MAX_PAGE_ORDER (10) as per
+# include/linux/mmzone.h.
+#
+config PAGE_BLOCK_MAX_ORDER
+ int "Page Block Order Upper Limit"
+ range 1 10 if ARCH_FORCE_MAX_ORDER = 0
+ default 10 if ARCH_FORCE_MAX_ORDER = 0
+ range 1 ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
+ default ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
+ help
+ The page block order refers to the power of two number of pages that
+ are physically contiguous and can have a migrate type associated to
+ them. The maximum size of the page block order is at least limited by
+ ARCH_FORCE_MAX_ORDER/MAX_PAGE_ORDER.
+
+ This config adds a new upper limit of default page block
+ order when the page block order is required to be smaller than
+ ARCH_FORCE_MAX_ORDER/MAX_PAGE_ORDER or other limits
+ (see include/linux/pageblock-flags.h for details).
+
+ Reducing pageblock order can negatively impact THP generation
+ success rate. If your workloads use THP heavily, please use this
+ option with caution.
+
+ Don't change if unsure.
+
config MEM_SOFT_DIRTY
bool "Track memory changes"
depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
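
As a rough illustration of the sizes involved (not part of the patch; assumes a
4 KiB base page size, and the helper name is made up):

/* One pageblock covers (1 << pageblock_order) base pages.  With 4 KiB
 * pages, order 10 (the MAX_PAGE_ORDER default) spans 4 MiB, while
 * PAGE_BLOCK_MAX_ORDER=5 would shrink a pageblock to 128 KiB, smaller
 * than a 2 MiB (order-9) THP, which is why lowering it can hurt THP
 * allocation success.
 */
static inline unsigned long pageblock_span_bytes(unsigned int pageblock_order)
{
	return (1UL << pageblock_order) * 4096;
}
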
@@ -1071,9 +1117,6 @@ config ARCH_HAS_CURRENT_STACK_POINTER
register alias named "current_stack_pointer", this config can be
selected.
-config ARCH_HAS_PTE_DEVMAP
- bool
-
config ARCH_HAS_ZONE_DMA_SET
bool
@@ -1091,7 +1134,6 @@ config ZONE_DEVICE
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP
- depends on ARCH_HAS_PTE_DEVMAP
select XARRAY_MULTI
help
@@ -1317,6 +1359,7 @@ config NUMA_MEMBLKS
config NUMA_EMU
bool "NUMA emulation"
depends on NUMA_MEMBLKS
+ depends on X86 || GENERIC_ARCH_NUMA
help
Enable NUMA emulation. A flat machine will be split
into virtual nodes when booted with "numa=fake=N", where N is the
diff --git a/mm/Makefile b/mm/Makefile
index e7f6bbf8ae5f..1a7a11d4933d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -37,7 +37,7 @@ mmu-y := nommu.o
mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
msync.o page_vma_mapped.o pagewalk.o \
- pgtable-generic.o rmap.o vmalloc.o vma.o
+ pgtable-generic.o rmap.o vmalloc.o vma.o vma_exec.o
ifdef CONFIG_CROSS_MEMORY_ATTACH
@@ -55,7 +55,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
mm_init.o percpu.o slab_common.o \
compaction.o show_mem.o \
interval_tree.o list_lru.o workingset.o \
- debug.o gup.o mmap_lock.o $(mmu-y)
+ debug.o gup.o mmap_lock.o vma_init.o $(mmu-y)
# Give 'page_alloc' its own module-parameter namespace
page-alloc-y := page_alloc.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e61bbb1bd622..783904d8c5ef 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1151,7 +1151,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
- del_timer_sync(&bdi->laptop_mode_wb_timer);
+ timer_delete_sync(&bdi->laptop_mode_wb_timer);
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index d3e00731e262..2a4a649805c1 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -94,13 +94,8 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
if (!trylock_page(page))
continue;
- if (IS_ENABLED(CONFIG_BALLOON_COMPACTION) &&
- PageIsolated(page)) {
- /* raced with isolation */
- unlock_page(page);
- continue;
- }
- balloon_page_delete(page);
+ list_del(&page->lru);
+ balloon_page_finalize(page);
__count_vm_event(BALLOON_DEFLATE);
list_add(&page->lru, pages);
unlock_page(page);
@@ -211,6 +206,9 @@ static bool balloon_page_isolate(struct page *page, isolate_mode_t mode)
struct balloon_dev_info *b_dev_info = balloon_page_device(page);
unsigned long flags;
+ if (!b_dev_info)
+ return false;
+
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
list_del(&page->lru);
b_dev_info->isolated_pages++;
@@ -224,6 +222,10 @@ static void balloon_page_putback(struct page *page)
struct balloon_dev_info *b_dev_info = balloon_page_device(page);
unsigned long flags;
+ /* Isolated balloon pages cannot get deflated. */
+ if (WARN_ON_ONCE(!b_dev_info))
+ return;
+
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
list_add(&page->lru, &b_dev_info->pages);
b_dev_info->isolated_pages--;
@@ -239,6 +241,10 @@ static int balloon_page_migrate(struct page *newpage, struct page *page,
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+ /* Isolated balloon pages cannot get deflated. */
+ if (WARN_ON_ONCE(!balloon))
+ return -EAGAIN;
+
return balloon->migratepage(balloon, newpage, page, mode);
}
@@ -247,6 +253,5 @@ const struct movable_operations balloon_mops = {
.isolate_page = balloon_page_isolate,
.putback_page = balloon_page_putback,
};
-EXPORT_SYMBOL_GPL(balloon_mops);
#endif /* CONFIG_BALLOON_COMPACTION */
diff --git a/mm/cma.c b/mm/cma.c
index b06d5fe73399..2ffa4befb99a 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -22,6 +22,7 @@
#include <linux/mm.h>
#include <linux/sizes.h>
#include <linux/slab.h>
+#include <linux/string_choices.h>
#include <linux/log2.h>
#include <linux/cma.h>
#include <linux/highmem.h>
@@ -35,12 +36,6 @@
struct cma cma_areas[MAX_CMA_AREAS];
unsigned int cma_area_count;
-static int __init __cma_declare_contiguous_nid(phys_addr_t base,
- phys_addr_t size, phys_addr_t limit,
- phys_addr_t alignment, unsigned int order_per_bit,
- bool fixed, const char *name, struct cma **res_cma,
- int nid);
-
phys_addr_t cma_get_base(const struct cma *cma)
{
WARN_ON_ONCE(cma->nranges != 1);
@@ -143,13 +138,14 @@ bool cma_validate_zones(struct cma *cma)
static void __init cma_activate_area(struct cma *cma)
{
- unsigned long pfn, end_pfn;
+ unsigned long pfn, end_pfn, early_pfn[CMA_MAX_RANGES];
int allocrange, r;
struct cma_memrange *cmr;
unsigned long bitmap_count, count;
for (allocrange = 0; allocrange < cma->nranges; allocrange++) {
cmr = &cma->ranges[allocrange];
+ early_pfn[allocrange] = cmr->early_pfn;
cmr->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma, cmr),
GFP_KERNEL);
if (!cmr->bitmap)
@@ -161,13 +157,13 @@ static void __init cma_activate_area(struct cma *cma)
for (r = 0; r < cma->nranges; r++) {
cmr = &cma->ranges[r];
- if (cmr->early_pfn != cmr->base_pfn) {
- count = cmr->early_pfn - cmr->base_pfn;
+ if (early_pfn[r] != cmr->base_pfn) {
+ count = early_pfn[r] - cmr->base_pfn;
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
bitmap_set(cmr->bitmap, 0, bitmap_count);
}
- for (pfn = cmr->early_pfn; pfn < cmr->base_pfn + cmr->count;
+ for (pfn = early_pfn[r]; pfn < cmr->base_pfn + cmr->count;
pfn += pageblock_nr_pages)
init_cma_reserved_pageblock(pfn_to_page(pfn));
}
@@ -193,7 +189,7 @@ cleanup:
for (r = 0; r < allocrange; r++) {
cmr = &cma->ranges[r];
end_pfn = cmr->base_pfn + cmr->count;
- for (pfn = cmr->early_pfn; pfn < end_pfn; pfn++)
+ for (pfn = early_pfn[r]; pfn < end_pfn; pfn++)
free_reserved_page(pfn_to_page(pfn));
}
}
@@ -357,6 +353,168 @@ static void __init list_insert_sorted(
}
}
+static int __init cma_fixed_reserve(phys_addr_t base, phys_addr_t size)
+{
+ if (IS_ENABLED(CONFIG_HIGHMEM)) {
+ phys_addr_t highmem_start = __pa(high_memory - 1) + 1;
+
+ /*
+ * If allocating at a fixed base the request region must not
+ * cross the low/high memory boundary.
+ */
+ if (base < highmem_start && base + size > highmem_start) {
+ pr_err("Region at %pa defined on low/high memory boundary (%pa)\n",
+ &base, &highmem_start);
+ return -EINVAL;
+ }
+ }
+
+ if (memblock_is_region_reserved(base, size) ||
+ memblock_reserve(base, size) < 0) {
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static phys_addr_t __init cma_alloc_mem(phys_addr_t base, phys_addr_t size,
+ phys_addr_t align, phys_addr_t limit, int nid)
+{
+ phys_addr_t addr = 0;
+
+ /*
+ * If there is enough memory, try a bottom-up allocation first.
+ * It will place the new cma area close to the start of the node
+ * and guarantee that the compaction is moving pages out of the
+ * cma area and not into it.
+ * Avoid using first 4GB to not interfere with constrained zones
+ * like DMA/DMA32.
+ */
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ if (!memblock_bottom_up() && limit >= SZ_4G + size) {
+ memblock_set_bottom_up(true);
+ addr = memblock_alloc_range_nid(size, align, SZ_4G, limit,
+ nid, true);
+ memblock_set_bottom_up(false);
+ }
+#endif
+
+ /*
+ * On systems with HIGHMEM try allocating from there before consuming
+ * memory in lower zones.
+ */
+ if (!addr && IS_ENABLED(CONFIG_HIGHMEM)) {
+ phys_addr_t highmem = __pa(high_memory - 1) + 1;
+
+ /*
+ * All pages in the reserved area must come from the same zone.
+ * If the requested region crosses the low/high memory boundary,
+ * try allocating from high memory first and fall back to low
+ * memory in case of failure.
+ */
+ if (base < highmem && limit > highmem) {
+ addr = memblock_alloc_range_nid(size, align, highmem,
+ limit, nid, true);
+ limit = highmem;
+ }
+ }
+
+ if (!addr)
+ addr = memblock_alloc_range_nid(size, align, base, limit, nid,
+ true);
+
+ return addr;
+}
+
+static int __init __cma_declare_contiguous_nid(phys_addr_t *basep,
+ phys_addr_t size, phys_addr_t limit,
+ phys_addr_t alignment, unsigned int order_per_bit,
+ bool fixed, const char *name, struct cma **res_cma,
+ int nid)
+{
+ phys_addr_t memblock_end = memblock_end_of_DRAM();
+ phys_addr_t base = *basep;
+ int ret;
+
+ pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
+ __func__, &size, &base, &limit, &alignment);
+
+ if (cma_area_count == ARRAY_SIZE(cma_areas)) {
+ pr_err("Not enough slots for CMA reserved regions!\n");
+ return -ENOSPC;
+ }
+
+ if (!size)
+ return -EINVAL;
+
+ if (alignment && !is_power_of_2(alignment))
+ return -EINVAL;
+
+ if (!IS_ENABLED(CONFIG_NUMA))
+ nid = NUMA_NO_NODE;
+
+ /* Sanitise input arguments. */
+ alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES);
+ if (fixed && base & (alignment - 1)) {
+ pr_err("Region at %pa must be aligned to %pa bytes\n",
+ &base, &alignment);
+ return -EINVAL;
+ }
+ base = ALIGN(base, alignment);
+ size = ALIGN(size, alignment);
+ limit &= ~(alignment - 1);
+
+ if (!base)
+ fixed = false;
+
+ /* size should be aligned with order_per_bit */
+ if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit))
+ return -EINVAL;
+
+
+ /*
+ * If the limit is unspecified or above the memblock end, its effective
+ * value will be the memblock end. Set it explicitly to simplify further
+ * checks.
+ */
+ if (limit == 0 || limit > memblock_end)
+ limit = memblock_end;
+
+ if (base + size > limit) {
+ pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n",
+ &size, &base, &limit);
+ return -EINVAL;
+ }
+
+ /* Reserve memory */
+ if (fixed) {
+ ret = cma_fixed_reserve(base, size);
+ if (ret)
+ return ret;
+ } else {
+ base = cma_alloc_mem(base, size, alignment, limit, nid);
+ if (!base)
+ return -ENOMEM;
+
+ /*
+ * kmemleak scans/reads tracked objects for pointers to other
+ * objects but this address isn't mapped and accessible
+ */
+ kmemleak_ignore_phys(base);
+ }
+
+ ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma);
+ if (ret) {
+ memblock_phys_free(base, size);
+ return ret;
+ }
+
+ (*res_cma)->nid = nid;
+ *basep = base;
+
+ return 0;
+}
+
/*
* Create CMA areas with a total size of @total_size. A normal allocation
* for one area is tried first. If that fails, the biggest memblock
@@ -370,7 +528,7 @@ int __init cma_declare_contiguous_multi(phys_addr_t total_size,
phys_addr_t align, unsigned int order_per_bit,
const char *name, struct cma **res_cma, int nid)
{
- phys_addr_t start, end;
+ phys_addr_t start = 0, end;
phys_addr_t size, sizesum, sizeleft;
struct cma_init_memrange *mrp, *mlp, *failed;
struct cma_memrange *cmrp;
@@ -384,7 +542,7 @@ int __init cma_declare_contiguous_multi(phys_addr_t total_size,
/*
* First, try it the normal way, producing just one range.
*/
- ret = __cma_declare_contiguous_nid(0, total_size, 0, align,
+ ret = __cma_declare_contiguous_nid(&start, total_size, 0, align,
order_per_bit, false, name, res_cma, nid);
if (ret != -ENOMEM)
goto out;
@@ -547,8 +705,7 @@ out:
(unsigned long)total_size / SZ_1M);
else
pr_info("Reserved %lu MiB in %d range%s\n",
- (unsigned long)total_size / SZ_1M, nr,
- nr > 1 ? "s" : "");
+ (unsigned long)total_size / SZ_1M, nr, str_plural(nr));
return ret;
}
@@ -580,7 +737,7 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
{
int ret;
- ret = __cma_declare_contiguous_nid(base, size, limit, alignment,
+ ret = __cma_declare_contiguous_nid(&base, size, limit, alignment,
order_per_bit, fixed, name, res_cma, nid);
if (ret != 0)
pr_err("Failed to reserve %ld MiB\n",
@@ -592,148 +749,9 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
return ret;
}
-static int __init __cma_declare_contiguous_nid(phys_addr_t base,
- phys_addr_t size, phys_addr_t limit,
- phys_addr_t alignment, unsigned int order_per_bit,
- bool fixed, const char *name, struct cma **res_cma,
- int nid)
-{
- phys_addr_t memblock_end = memblock_end_of_DRAM();
- phys_addr_t highmem_start;
- int ret;
-
- /*
- * We can't use __pa(high_memory) directly, since high_memory
- * isn't a valid direct map VA, and DEBUG_VIRTUAL will (validly)
- * complain. Find the boundary by adding one to the last valid
- * address.
- */
- highmem_start = __pa(high_memory - 1) + 1;
- pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
- __func__, &size, &base, &limit, &alignment);
-
- if (cma_area_count == ARRAY_SIZE(cma_areas)) {
- pr_err("Not enough slots for CMA reserved regions!\n");
- return -ENOSPC;
- }
-
- if (!size)
- return -EINVAL;
-
- if (alignment && !is_power_of_2(alignment))
- return -EINVAL;
-
- if (!IS_ENABLED(CONFIG_NUMA))
- nid = NUMA_NO_NODE;
-
- /* Sanitise input arguments. */
- alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES);
- if (fixed && base & (alignment - 1)) {
- pr_err("Region at %pa must be aligned to %pa bytes\n",
- &base, &alignment);
- return -EINVAL;
- }
- base = ALIGN(base, alignment);
- size = ALIGN(size, alignment);
- limit &= ~(alignment - 1);
-
- if (!base)
- fixed = false;
-
- /* size should be aligned with order_per_bit */
- if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit))
- return -EINVAL;
-
- /*
- * If allocating at a fixed base the request region must not cross the
- * low/high memory boundary.
- */
- if (fixed && base < highmem_start && base + size > highmem_start) {
- pr_err("Region at %pa defined on low/high memory boundary (%pa)\n",
- &base, &highmem_start);
- return -EINVAL;
- }
-
- /*
- * If the limit is unspecified or above the memblock end, its effective
- * value will be the memblock end. Set it explicitly to simplify further
- * checks.
- */
- if (limit == 0 || limit > memblock_end)
- limit = memblock_end;
-
- if (base + size > limit) {
- pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n",
- &size, &base, &limit);
- return -EINVAL;
- }
-
- /* Reserve memory */
- if (fixed) {
- if (memblock_is_region_reserved(base, size) ||
- memblock_reserve(base, size) < 0) {
- return -EBUSY;
- }
- } else {
- phys_addr_t addr = 0;
-
- /*
- * If there is enough memory, try a bottom-up allocation first.
- * It will place the new cma area close to the start of the node
- * and guarantee that the compaction is moving pages out of the
- * cma area and not into it.
- * Avoid using first 4GB to not interfere with constrained zones
- * like DMA/DMA32.
- */
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
- if (!memblock_bottom_up() && memblock_end >= SZ_4G + size) {
- memblock_set_bottom_up(true);
- addr = memblock_alloc_range_nid(size, alignment, SZ_4G,
- limit, nid, true);
- memblock_set_bottom_up(false);
- }
-#endif
-
- /*
- * All pages in the reserved area must come from the same zone.
- * If the requested region crosses the low/high memory boundary,
- * try allocating from high memory first and fall back to low
- * memory in case of failure.
- */
- if (!addr && base < highmem_start && limit > highmem_start) {
- addr = memblock_alloc_range_nid(size, alignment,
- highmem_start, limit, nid, true);
- limit = highmem_start;
- }
-
- if (!addr) {
- addr = memblock_alloc_range_nid(size, alignment, base,
- limit, nid, true);
- if (!addr)
- return -ENOMEM;
- }
-
- /*
- * kmemleak scans/reads tracked objects for pointers to other
- * objects but this address isn't mapped and accessible
- */
- kmemleak_ignore_phys(addr);
- base = addr;
- }
-
- ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma);
- if (ret)
- memblock_phys_free(base, size);
-
- (*res_cma)->nid = nid;
-
- return ret;
-}
-
static void cma_debug_show_areas(struct cma *cma)
{
- unsigned long next_zero_bit, next_set_bit, nr_zero;
- unsigned long start;
+ unsigned long start, end;
unsigned long nr_part;
unsigned long nbits;
int r;
@@ -744,22 +762,12 @@ static void cma_debug_show_areas(struct cma *cma)
for (r = 0; r < cma->nranges; r++) {
cmr = &cma->ranges[r];
- start = 0;
nbits = cma_bitmap_maxno(cma, cmr);
pr_info("range %d: ", r);
- for (;;) {
- next_zero_bit = find_next_zero_bit(cmr->bitmap,
- nbits, start);
- if (next_zero_bit >= nbits)
- break;
- next_set_bit = find_next_bit(cmr->bitmap, nbits,
- next_zero_bit);
- nr_zero = next_set_bit - next_zero_bit;
- nr_part = nr_zero << cma->order_per_bit;
- pr_cont("%s%lu@%lu", start ? "+" : "", nr_part,
- next_zero_bit);
- start = next_zero_bit + nr_zero;
+ for_each_clear_bitrange(start, end, cmr->bitmap, nbits) {
+ nr_part = (end - start) << cma->order_per_bit;
+ pr_cont("%s%lu@%lu", start ? "+" : "", nr_part, start);
}
pr_info("\n");
}
@@ -815,7 +823,7 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr,
pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit);
mutex_lock(&cma->alloc_mutex);
- ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, gfp);
+ ret = alloc_contig_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp);
mutex_unlock(&cma->alloc_mutex);
if (ret == 0) {
page = pfn_to_page(pfn);
@@ -847,8 +855,6 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
unsigned long i;
const char *name = cma ? cma->name : NULL;
- trace_cma_alloc_start(name, count, align);
-
if (!cma || !cma->count)
return page;
@@ -858,6 +864,8 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
if (!count)
return page;
+ trace_cma_alloc_start(name, count, align);
+
for (r = 0; r < cma->nranges; r++) {
page = NULL;
diff --git a/mm/cma.h b/mm/cma.h
index 41a3ab0ec3de..c70180c36559 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -25,9 +25,11 @@ struct cma_kobject {
*/
struct cma_memrange {
unsigned long base_pfn;
- unsigned long early_pfn;
unsigned long count;
- unsigned long *bitmap;
+ union {
+ unsigned long early_pfn;
+ unsigned long *bitmap;
+ };
#ifdef CONFIG_CMA_DEBUGFS
struct debugfs_u32_array dfs_bitmap;
#endif
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index fdf899532ca0..8c7d7f8e8fbd 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -56,16 +56,8 @@ static int cma_maxchunk_get(void *data, u64 *val)
for (r = 0; r < cma->nranges; r++) {
cmr = &cma->ranges[r];
bitmap_maxno = cma_bitmap_maxno(cma, cmr);
- end = 0;
- for (;;) {
- start = find_next_zero_bit(cmr->bitmap,
- bitmap_maxno, end);
- if (start >= bitmap_maxno)
- break;
- end = find_next_bit(cmr->bitmap, bitmap_maxno,
- start);
+ for_each_clear_bitrange(start, end, cmr->bitmap, bitmap_maxno)
maxchunk = max(end - start, maxchunk);
- }
}
spin_unlock_irq(&cma->lock);
*val = (u64)maxchunk << cma->order_per_bit;
diff --git a/mm/compaction.c b/mm/compaction.c
index 139f00c0308a..bf021b31c7ec 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -114,39 +114,6 @@ static unsigned long release_free_list(struct list_head *freepages)
}
#ifdef CONFIG_COMPACTION
-bool PageMovable(struct page *page)
-{
- const struct movable_operations *mops;
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (!__PageMovable(page))
- return false;
-
- mops = page_movable_ops(page);
- if (mops)
- return true;
-
- return false;
-}
-
-void __SetPageMovable(struct page *page, const struct movable_operations *mops)
-{
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE((unsigned long)mops & PAGE_MAPPING_MOVABLE, page);
- page->mapping = (void *)((unsigned long)mops | PAGE_MAPPING_MOVABLE);
-}
-EXPORT_SYMBOL(__SetPageMovable);
-
-void __ClearPageMovable(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageMovable(page), page);
- /*
- * This page still has the type of a movable page, but it's
- * actually not movable any more.
- */
- page->mapping = (void *)PAGE_MAPPING_MOVABLE;
-}
-EXPORT_SYMBOL(__ClearPageMovable);
/* Do not skip compaction more than 64 times */
#define COMPACT_MAX_DEFER_SHIFT 6
@@ -981,13 +948,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
}
if (PageHuge(page)) {
+ const unsigned int order = compound_order(page);
/*
* skip hugetlbfs if we are not compacting for pages
* bigger than its order. THPs and other compound pages
* are handled below.
*/
if (!cc->alloc_contig) {
- const unsigned int order = compound_order(page);
if (order <= MAX_PAGE_ORDER) {
low_pfn += (1UL << order) - 1;
@@ -1001,27 +968,27 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
locked = NULL;
}
- ret = isolate_or_dissolve_huge_page(page, &cc->migratepages);
+ folio = page_folio(page);
+ ret = isolate_or_dissolve_huge_folio(folio, &cc->migratepages);
/*
- * Fail isolation in case isolate_or_dissolve_huge_page()
+ * Fail isolation in case isolate_or_dissolve_huge_folio()
* reports an error. In case of -ENOMEM, abort right away.
*/
if (ret < 0) {
/* Do not report -EBUSY down the chain */
if (ret == -EBUSY)
ret = 0;
- low_pfn += compound_nr(page) - 1;
- nr_scanned += compound_nr(page) - 1;
+ low_pfn += (1UL << order) - 1;
+ nr_scanned += (1UL << order) - 1;
goto isolate_fail;
}
- if (PageHuge(page)) {
+ if (folio_test_hugetlb(folio)) {
/*
* Hugepage was successfully isolated and placed
* on the cc->migratepages list.
*/
- folio = page_folio(page);
low_pfn += folio_nr_pages(folio) - 1;
goto isolate_success_no_list;
}
@@ -1082,18 +1049,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* Skip any other type of page
*/
if (!PageLRU(page)) {
- /*
- * __PageMovable can return false positive so we need
- * to verify it under page_lock.
- */
- if (unlikely(__PageMovable(page)) &&
- !PageIsolated(page)) {
+ /* Isolation code will deal with any races. */
+ if (unlikely(page_has_movable_ops(page)) &&
+ !PageMovableOpsIsolated(page)) {
if (locked) {
unlock_page_lruvec_irqrestore(locked, flags);
locked = NULL;
}
- if (isolate_movable_page(page, mode)) {
+ if (isolate_movable_ops_page(page, mode)) {
folio = page_folio(page);
goto isolate_success;
}
@@ -2249,15 +2213,11 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat)
static unsigned int fragmentation_score_wmark(bool low)
{
- unsigned int wmark_low;
+ unsigned int wmark_low, leeway;
- /*
- * Cap the low watermark to avoid excessive compaction
- * activity in case a user sets the proactiveness tunable
- * close to 100 (maximum).
- */
- wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
- return low ? wmark_low : min(wmark_low + 10, 100U);
+ wmark_low = 100U - sysctl_compaction_proactiveness;
+ leeway = min(10U, wmark_low / 2);
+ return low ? wmark_low : min(wmark_low + leeway, 100U);
}
static bool should_proactive_compact_node(pg_data_t *pgdat)
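
To make the new watermark math concrete (numbers worked out here, not part of
the patch): with sysctl_compaction_proactiveness = 90, both versions give
wmark_low = 10, but the high mark drops from min(10 + 10, 100) = 20 to
10 + min(10, 10 / 2) = 15.  With proactiveness = 100, the old code clamped the
pair to 5/15 through the max(..., 5U) cap, whereas the new code yields 0/0, so
the proactive compaction targets now track the tunable all the way to its
extremes instead of being silently capped.
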
@@ -2348,7 +2308,6 @@ static enum compact_result __compact_finished(struct compact_control *cc)
ret = COMPACT_NO_SUITABLE_PAGE;
for (order = cc->order; order < NR_PAGE_ORDERS; order++) {
struct free_area *area = &cc->zone->free_area[order];
- bool claim_block;
/* Job done if page is free of the right migratetype */
if (!free_area_empty(area, migratetype))
@@ -2364,8 +2323,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
* Job done if allocation would steal freepages from
* other migratetype buddy lists.
*/
- if (find_suitable_fallback(area, order, migratetype,
- true, &claim_block) != -1)
+ if (find_suitable_fallback(area, order, migratetype, true) >= 0)
/*
* Movable pages are OK in any pageblock. If we are
* stealing for a non-movable allocation, make sure
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index c213cf8b5638..b3171f9406c1 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -28,6 +28,7 @@ config DAMON_VADDR
bool "Data access monitoring operations for virtual address spaces"
depends on DAMON && MMU
select PAGE_IDLE_FLAG
+ default DAMON
help
This builds the default data access monitoring operations for DAMON
that work for virtual address spaces.
@@ -36,6 +37,7 @@ config DAMON_PADDR
bool "Data access monitoring operations for the physical address space"
depends on DAMON && MMU
select PAGE_IDLE_FLAG
+ default DAMON
help
This builds the default data access monitoring operations for DAMON
that works for the physical address space.
@@ -55,6 +57,7 @@ config DAMON_VADDR_KUNIT_TEST
config DAMON_SYSFS
bool "DAMON sysfs interface"
depends on DAMON && SYSFS
+ default DAMON
help
This builds the sysfs interface for DAMON. The user space can use
the interface for arbitrary data access monitoring.
@@ -91,4 +94,20 @@ config DAMON_LRU_SORT
protect frequently accessed (hot) pages while rarely accessed (cold)
pages reclaimed first under memory pressure.
+config DAMON_STAT
+ bool "Build data access monitoring stat (DAMON_STAT)"
+ depends on DAMON_PADDR
+ help
+ This builds the DAMON-based access monitoring statistics subsystem.
+ It runs DAMON and exposes access monitoring results in simple stat
+ metrics.
+
+config DAMON_STAT_ENABLED_DEFAULT
+ bool "Enable DAMON_STAT by default"
+ depends on DAMON_PADDR
+ default DAMON_STAT
+ help
+ Whether to enable DAMON_STAT by default. Users can disable it in
+ boot or runtime using its 'enabled' parameter.
+
endmenu
diff --git a/mm/damon/Makefile b/mm/damon/Makefile
index 8b49012ba8c3..d8d6bf5f8bff 100644
--- a/mm/damon/Makefile
+++ b/mm/damon/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o
obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o
obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o
obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o
+obj-$(CONFIG_DAMON_STAT) += modules-common.o stat.o
diff --git a/mm/damon/core.c b/mm/damon/core.c
index fc1eba3da419..52a48c9316bc 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -76,14 +76,13 @@ int damon_register_ops(struct damon_operations *ops)
if (ops->id >= NR_DAMON_OPS)
return -EINVAL;
+
mutex_lock(&damon_ops_lock);
/* Fail for already registered ops */
- if (__damon_is_registered_ops(ops->id)) {
+ if (__damon_is_registered_ops(ops->id))
err = -EINVAL;
- goto out;
- }
- damon_registered_ops[ops->id] = *ops;
-out:
+ else
+ damon_registered_ops[ops->id] = *ops;
mutex_unlock(&damon_ops_lock);
return err;
}
@@ -408,6 +407,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
scheme->wmarks = *wmarks;
scheme->wmarks.activated = true;
+ scheme->migrate_dests = (struct damos_migrate_dests){};
scheme->target_nid = target_nid;
return scheme;
@@ -450,6 +450,9 @@ void damon_destroy_scheme(struct damos *s)
damos_for_each_filter_safe(f, next, s)
damos_destroy_filter(f);
+
+ kfree(s->migrate_dests.node_id_arr);
+ kfree(s->migrate_dests.weight_arr);
damon_del_scheme(s);
damon_free_scheme(s);
}
@@ -499,8 +502,12 @@ void damon_free_target(struct damon_target *t)
kfree(t);
}
-void damon_destroy_target(struct damon_target *t)
+void damon_destroy_target(struct damon_target *t, struct damon_ctx *ctx)
{
+
+ if (ctx && ctx->ops.cleanup_target)
+ ctx->ops.cleanup_target(t);
+
damon_del_target(t);
damon_free_target(t);
}
@@ -530,7 +537,8 @@ struct damon_ctx *damon_new_ctx(void)
ctx->next_ops_update_sis = 0;
mutex_init(&ctx->kdamond_lock);
- mutex_init(&ctx->call_control_lock);
+ INIT_LIST_HEAD(&ctx->call_controls);
+ mutex_init(&ctx->call_controls_lock);
mutex_init(&ctx->walk_control_lock);
ctx->attrs.min_nr_regions = 10;
@@ -546,13 +554,8 @@ static void damon_destroy_targets(struct damon_ctx *ctx)
{
struct damon_target *t, *next_t;
- if (ctx->ops.cleanup) {
- ctx->ops.cleanup(ctx);
- return;
- }
-
damon_for_each_target_safe(t, next_t, ctx)
- damon_destroy_target(t);
+ damon_destroy_target(t, ctx);
}
void damon_destroy_ctx(struct damon_ctx *ctx)
@@ -677,9 +680,7 @@ static bool damon_valid_intervals_goal(struct damon_attrs *attrs)
* @attrs: monitoring attributes
*
* This function should be called while the kdamond is not running, an access
- * check results aggregation is not ongoing (e.g., from &struct
- * damon_callback->after_aggregation or &struct
- * damon_callback->after_wmarks_check callbacks), or from damon_call().
+ * check results aggregation is not ongoing (e.g., from damon_call()).
*
* Every time interval is in micro-seconds.
*
@@ -755,6 +756,19 @@ static struct damos_quota_goal *damos_nth_quota_goal(
return NULL;
}
+static void damos_commit_quota_goal_union(
+ struct damos_quota_goal *dst, struct damos_quota_goal *src)
+{
+ switch (dst->metric) {
+ case DAMOS_QUOTA_NODE_MEM_USED_BP:
+ case DAMOS_QUOTA_NODE_MEM_FREE_BP:
+ dst->nid = src->nid;
+ break;
+ default:
+ break;
+ }
+}
+
static void damos_commit_quota_goal(
struct damos_quota_goal *dst, struct damos_quota_goal *src)
{
@@ -763,6 +777,7 @@ static void damos_commit_quota_goal(
if (dst->metric == DAMOS_QUOTA_USER_INPUT)
dst->current_value = src->current_value;
/* keep last_psi_total as is, since it will be updated in next cycle */
+ damos_commit_quota_goal_union(dst, src);
}
/**
@@ -775,7 +790,7 @@ static void damos_commit_quota_goal(
* DAMON contexts, instead of manual in-place updates.
*
* This function should be called from parameters-update safe context, like
- * DAMON callbacks.
+ * damon_call().
*/
int damos_commit_quota_goals(struct damos_quota *dst, struct damos_quota *src)
{
@@ -796,6 +811,7 @@ int damos_commit_quota_goals(struct damos_quota *dst, struct damos_quota *src)
src_goal->metric, src_goal->target_value);
if (!new_goal)
return -ENOMEM;
+ damos_commit_quota_goal_union(new_goal, src_goal);
damos_add_quota_goal(dst, new_goal);
}
return 0;
@@ -940,6 +956,41 @@ static void damos_set_filters_default_reject(struct damos *s)
damos_filters_default_reject(&s->ops_filters);
}
+static int damos_commit_dests(struct damos *dst, struct damos *src)
+{
+ struct damos_migrate_dests *dst_dests, *src_dests;
+
+ dst_dests = &dst->migrate_dests;
+ src_dests = &src->migrate_dests;
+
+ if (dst_dests->nr_dests != src_dests->nr_dests) {
+ kfree(dst_dests->node_id_arr);
+ kfree(dst_dests->weight_arr);
+
+ dst_dests->node_id_arr = kmalloc_array(src_dests->nr_dests,
+ sizeof(*dst_dests->node_id_arr), GFP_KERNEL);
+ if (!dst_dests->node_id_arr) {
+ dst_dests->weight_arr = NULL;
+ return -ENOMEM;
+ }
+
+ dst_dests->weight_arr = kmalloc_array(src_dests->nr_dests,
+ sizeof(*dst_dests->weight_arr), GFP_KERNEL);
+ if (!dst_dests->weight_arr) {
+ /* ->node_id_arr will be freed by scheme destruction */
+ return -ENOMEM;
+ }
+ }
+
+ dst_dests->nr_dests = src_dests->nr_dests;
+ for (int i = 0; i < src_dests->nr_dests; i++) {
+ dst_dests->node_id_arr[i] = src_dests->node_id_arr[i];
+ dst_dests->weight_arr[i] = src_dests->weight_arr[i];
+ }
+
+ return 0;
+}
+
static int damos_commit_filters(struct damos *dst, struct damos *src)
{
int err;
@@ -979,6 +1030,11 @@ static int damos_commit(struct damos *dst, struct damos *src)
return err;
dst->wmarks = src->wmarks;
+ dst->target_nid = src->target_nid;
+
+ err = damos_commit_dests(dst, src);
+ if (err)
+ return err;
err = damos_commit_filters(dst, src);
return err;
@@ -1094,9 +1150,15 @@ static int damon_commit_targets(
if (err)
return err;
} else {
- if (damon_target_has_pid(dst))
- put_pid(dst_target->pid);
- damon_destroy_target(dst_target);
+ struct damos *s;
+
+ damon_destroy_target(dst_target, dst);
+ damon_for_each_scheme(s, dst) {
+ if (s->quota.charge_target_from == dst_target) {
+ s->quota.charge_target_from = NULL;
+ s->quota.charge_addr_from = 0;
+ }
+ }
}
}
@@ -1109,7 +1171,7 @@ static int damon_commit_targets(
err = damon_commit_target(new_target, false,
src_target, damon_target_has_pid(src));
if (err) {
- damon_destroy_target(new_target);
+ damon_destroy_target(new_target, NULL);
return err;
}
damon_add_target(dst, new_target);
@@ -1128,7 +1190,7 @@ static int damon_commit_targets(
* in-place updates.
*
* This function should be called from parameters-update safe context, like
- * DAMON callbacks.
+ * damon_call().
*/
int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
{
@@ -1304,7 +1366,13 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs)
return err;
}
-static bool damon_is_running(struct damon_ctx *ctx)
+/**
+ * damon_is_running() - Returns if a given DAMON context is running.
+ * @ctx: The DAMON context to see if running.
+ *
+ * Return: true if @ctx is running, false otherwise.
+ */
+bool damon_is_running(struct damon_ctx *ctx)
{
bool running;
@@ -1321,8 +1389,9 @@ static bool damon_is_running(struct damon_ctx *ctx)
*
* Ask DAMON worker thread (kdamond) of @ctx to call a function with an
* argument data that respectively passed via &damon_call_control->fn and
- * &damon_call_control->data of @control, and wait until the kdamond finishes
- * handling of the request.
+ * &damon_call_control->data of @control. If &damon_call_control->repeat of
+ * @control is set, further wait until the kdamond finishes handling of the
+ * request. Otherwise, return as soon as the request is made.
*
* The kdamond executes the function with the argument in the main loop, just
* after a sampling of the iteration is finished. The function can hence
@@ -1334,18 +1403,18 @@ static bool damon_is_running(struct damon_ctx *ctx)
*/
int damon_call(struct damon_ctx *ctx, struct damon_call_control *control)
{
- init_completion(&control->completion);
+ if (!control->repeat)
+ init_completion(&control->completion);
control->canceled = false;
+ INIT_LIST_HEAD(&control->list);
- mutex_lock(&ctx->call_control_lock);
- if (ctx->call_control) {
- mutex_unlock(&ctx->call_control_lock);
- return -EBUSY;
- }
- ctx->call_control = control;
- mutex_unlock(&ctx->call_control_lock);
+ mutex_lock(&ctx->call_controls_lock);
+ list_add_tail(&ctx->call_controls, &control->list);
+ mutex_unlock(&ctx->call_controls_lock);
if (!damon_is_running(ctx))
return -EINVAL;
+ if (control->repeat)
+ return 0;
wait_for_completion(&control->completion);
if (control->canceled)
return -ECANCELED;
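
For reference, a condensed sketch of the new non-blocking usage (mirroring how
mm/damon/lru_sort.c uses it later in this patch; my_stat_fn and my_control are
invented names):

static int my_stat_fn(void *data)
{
	struct damon_ctx *ctx __maybe_unused = data;

	/* inspect results or tune parameters on ctx; runs in kdamond context */
	return 0;
}

static struct damon_call_control my_control = {
	.fn = my_stat_fn,
	.repeat = true,
};

After damon_start() succeeds, setting my_control.data = ctx and calling
damon_call(ctx, &my_control) returns immediately; kdamond_call() below
re-queues repeating controls after every invocation, so the callback keeps
running once per sampling interval until the kdamond exits.
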
@@ -1393,6 +1462,19 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control)
}
/*
+ * Warn and fix corrupted ->nr_accesses[_bp] for investigations and preventing
+ * the problem being propagated.
+ */
+static void damon_warn_fix_nr_accesses_corruption(struct damon_region *r)
+{
+ if (r->nr_accesses_bp == r->nr_accesses * 10000)
+ return;
+ WARN_ONCE(true, "invalid nr_accesses_bp at reset: %u %u\n",
+ r->nr_accesses_bp, r->nr_accesses);
+ r->nr_accesses_bp = r->nr_accesses * 10000;
+}
+
+/*
* Reset the aggregated monitoring results ('nr_accesses' of each region).
*/
static void kdamond_reset_aggregated(struct damon_ctx *c)
@@ -1405,6 +1487,7 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
damon_for_each_region(r, t) {
trace_damon_aggregated(ti, r, damon_nr_regions(t));
+ damon_warn_fix_nr_accesses_corruption(r);
r->last_nr_accesses = r->nr_accesses;
r->nr_accesses = 0;
}
@@ -1428,6 +1511,7 @@ static unsigned long damon_get_intervals_score(struct damon_ctx *c)
}
}
target_access_events = max_access_events * goal_bp / 10000;
+ target_access_events = target_access_events ? : 1;
return access_events * 10000 / target_access_events;
}
@@ -1468,6 +1552,7 @@ static void kdamond_tune_intervals(struct damon_ctx *c)
new_attrs.sample_interval);
new_attrs.aggr_interval = new_attrs.sample_interval *
c->attrs.aggr_samples;
+ trace_damon_monitor_intervals_tune(new_attrs.sample_interval);
damon_set_attrs(c, &new_attrs);
}
@@ -1890,6 +1975,29 @@ static inline u64 damos_get_some_mem_psi_total(void)
#endif /* CONFIG_PSI */
+#ifdef CONFIG_NUMA
+static __kernel_ulong_t damos_get_node_mem_bp(
+ struct damos_quota_goal *goal)
+{
+ struct sysinfo i;
+ __kernel_ulong_t numerator;
+
+ si_meminfo_node(&i, goal->nid);
+ if (goal->metric == DAMOS_QUOTA_NODE_MEM_USED_BP)
+ numerator = i.totalram - i.freeram;
+ else /* DAMOS_QUOTA_NODE_MEM_FREE_BP */
+ numerator = i.freeram;
+ return numerator * 10000 / i.totalram;
+}
+#else
+static __kernel_ulong_t damos_get_node_mem_bp(
+ struct damos_quota_goal *goal)
+{
+ return 0;
+}
+#endif
+
+
static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
{
u64 now_psi_total;
@@ -1903,6 +2011,10 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
goal->current_value = now_psi_total - goal->last_psi_total;
goal->last_psi_total = now_psi_total;
break;
+ case DAMOS_QUOTA_NODE_MEM_USED_BP:
+ case DAMOS_QUOTA_NODE_MEM_FREE_BP:
+ goal->current_value = damos_get_node_mem_bp(goal);
+ break;
default:
break;
}
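
Worked example (not from the patch): on a node with totalram = 256 GiB and
freeram = 64 GiB, damos_get_node_mem_bp() reports
(256 - 64) * 10000 / 256 = 7500 for DAMOS_QUOTA_NODE_MEM_USED_BP and
64 * 10000 / 256 = 2500 for DAMOS_QUOTA_NODE_MEM_FREE_BP, i.e. basis points of
the node's memory.  The quota auto-tuning below then steers the effective
quota so that this current value converges toward the goal's target_value,
expressed in the same unit.
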
@@ -1961,12 +2073,26 @@ static void damos_set_effective_quota(struct damos_quota *quota)
quota->esz = esz;
}
+static void damos_trace_esz(struct damon_ctx *c, struct damos *s,
+ struct damos_quota *quota)
+{
+ unsigned int cidx = 0, sidx = 0;
+ struct damos *siter;
+
+ damon_for_each_scheme(siter, c) {
+ if (siter == s)
+ break;
+ sidx++;
+ }
+ trace_damos_esz(cidx, sidx, quota->esz);
+}
+
static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
{
struct damos_quota *quota = &s->quota;
struct damon_target *t;
struct damon_region *r;
- unsigned long cumulated_sz;
+ unsigned long cumulated_sz, cached_esz;
unsigned int score, max_score = 0;
if (!quota->ms && !quota->sz && list_empty(&quota->goals))
@@ -1980,7 +2106,11 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
quota->total_charged_sz += quota->charged_sz;
quota->charged_from = jiffies;
quota->charged_sz = 0;
+ if (trace_damos_esz_enabled())
+ cached_esz = quota->esz;
damos_set_effective_quota(quota);
+ if (trace_damos_esz_enabled() && quota->esz != cached_esz)
+ damos_trace_esz(c, s, quota);
}
if (!c->ops.get_scheme_score)
@@ -2301,36 +2431,49 @@ static void kdamond_usleep(unsigned long usecs)
}
/*
- * kdamond_call() - handle damon_call_control.
+ * kdamond_call() - handle damon_call_control objects.
* @ctx: The &struct damon_ctx of the kdamond.
* @cancel: Whether to cancel the invocation of the function.
*
- * If there is a &struct damon_call_control request that registered via
+ * If there are &struct damon_call_control requests that registered via
* &damon_call() on @ctx, do or cancel the invocation of the function depending
- * on @cancel. @cancel is set when the kdamond is deactivated by DAMOS
- * watermarks, or the kdamond is already out of the main loop and therefore
- * will be terminated.
+ * on @cancel. @cancel is set when the kdamond is already out of the main loop
+ * and therefore will be terminated.
*/
static void kdamond_call(struct damon_ctx *ctx, bool cancel)
{
struct damon_call_control *control;
+ LIST_HEAD(repeat_controls);
int ret = 0;
- mutex_lock(&ctx->call_control_lock);
- control = ctx->call_control;
- mutex_unlock(&ctx->call_control_lock);
- if (!control)
- return;
- if (cancel) {
- control->canceled = true;
- } else {
- ret = control->fn(control->data);
- control->return_code = ret;
+ while (true) {
+ mutex_lock(&ctx->call_controls_lock);
+ control = list_first_entry_or_null(&ctx->call_controls,
+ struct damon_call_control, list);
+ mutex_unlock(&ctx->call_controls_lock);
+ if (!control)
+ break;
+ if (cancel) {
+ control->canceled = true;
+ } else {
+ ret = control->fn(control->data);
+ control->return_code = ret;
+ }
+ mutex_lock(&ctx->call_controls_lock);
+ list_del(&control->list);
+ mutex_unlock(&ctx->call_controls_lock);
+ if (!control->repeat)
+ complete(&control->completion);
+ else
+ list_add(&control->list, &repeat_controls);
}
- complete(&control->completion);
- mutex_lock(&ctx->call_control_lock);
- ctx->call_control = NULL;
- mutex_unlock(&ctx->call_control_lock);
+ control = list_first_entry_or_null(&repeat_controls,
+ struct damon_call_control, list);
+ if (!control || cancel)
+ return;
+ mutex_lock(&ctx->call_controls_lock);
+ list_add_tail(&control->list, &ctx->call_controls);
+ mutex_unlock(&ctx->call_controls_lock);
}
/* Returns negative error code if it's not activated but should return */
@@ -2354,10 +2497,7 @@ static int kdamond_wait_activation(struct damon_ctx *ctx)
kdamond_usleep(min_wait_time);
- if (ctx->callback.after_wmarks_check &&
- ctx->callback.after_wmarks_check(ctx))
- break;
- kdamond_call(ctx, true);
+ kdamond_call(ctx, false);
damos_walk_cancel(ctx);
}
return -EBUSY;
@@ -2413,10 +2553,9 @@ static int kdamond_fn(void *data)
while (!kdamond_need_stop(ctx)) {
/*
* ctx->attrs and ctx->next_{aggregation,ops_update}_sis could
- * be changed from after_wmarks_check() or after_aggregation()
- * callbacks. Read the values here, and use those for this
- * iteration. That is, damon_set_attrs() updated new values
- * are respected from next iteration.
+ * be changed from kdamond_call(). Read the values here, and
+ * use those for this iteration. That is, damon_set_attrs()
+ * updated new values are respected from next iteration.
*/
unsigned long next_aggregation_sis = ctx->next_aggregation_sis;
unsigned long next_ops_update_sis = ctx->next_ops_update_sis;
@@ -2434,14 +2573,10 @@ static int kdamond_fn(void *data)
if (ctx->ops.check_accesses)
max_nr_accesses = ctx->ops.check_accesses(ctx);
- if (ctx->passed_sample_intervals >= next_aggregation_sis) {
+ if (ctx->passed_sample_intervals >= next_aggregation_sis)
kdamond_merge_regions(ctx,
max_nr_accesses / 10,
sz_limit);
- if (ctx->callback.after_aggregation &&
- ctx->callback.after_aggregation(ctx))
- break;
- }
/*
* do kdamond_call() and kdamond_apply_schemes() after
@@ -2507,8 +2642,6 @@ done:
damon_destroy_region(r, t);
}
- if (ctx->callback.before_terminate)
- ctx->callback.before_terminate(ctx);
if (ctx->ops.cleanup)
ctx->ops.cleanup(ctx);
kfree(ctx->regions_score_histogram);
@@ -2527,6 +2660,7 @@ done:
running_exclusive_ctxs = false;
mutex_unlock(&damon_lock);
+ damon_destroy_targets(ctx);
return 0;
}
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index 4af8fd4a390b..151a9de5ad8b 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -230,6 +230,39 @@ out:
return err;
}
+static int damon_lru_sort_handle_commit_inputs(void)
+{
+ int err;
+
+ if (!commit_inputs)
+ return 0;
+
+ err = damon_lru_sort_apply_parameters();
+ commit_inputs = false;
+ return err;
+}
+
+static int damon_lru_sort_damon_call_fn(void *arg)
+{
+ struct damon_ctx *c = arg;
+ struct damos *s;
+
+ /* update the stats parameter */
+ damon_for_each_scheme(s, c) {
+ if (s->action == DAMOS_LRU_PRIO)
+ damon_lru_sort_hot_stat = s->stat;
+ else if (s->action == DAMOS_LRU_DEPRIO)
+ damon_lru_sort_cold_stat = s->stat;
+ }
+
+ return damon_lru_sort_handle_commit_inputs();
+}
+
+static struct damon_call_control call_control = {
+ .fn = damon_lru_sort_damon_call_fn,
+ .repeat = true,
+};
+
static int damon_lru_sort_turn(bool on)
{
int err;
@@ -249,7 +282,7 @@ static int damon_lru_sort_turn(bool on)
if (err)
return err;
kdamond_pid = ctx->kdamond->pid;
- return 0;
+ return damon_call(ctx, &call_control);
}
static int damon_lru_sort_enabled_store(const char *val,
@@ -288,52 +321,22 @@ module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
MODULE_PARM_DESC(enabled,
"Enable or disable DAMON_LRU_SORT (default: disabled)");
-static int damon_lru_sort_handle_commit_inputs(void)
-{
- int err;
-
- if (!commit_inputs)
- return 0;
-
- err = damon_lru_sort_apply_parameters();
- commit_inputs = false;
- return err;
-}
-
-static int damon_lru_sort_after_aggregation(struct damon_ctx *c)
-{
- struct damos *s;
-
- /* update the stats parameter */
- damon_for_each_scheme(s, c) {
- if (s->action == DAMOS_LRU_PRIO)
- damon_lru_sort_hot_stat = s->stat;
- else if (s->action == DAMOS_LRU_DEPRIO)
- damon_lru_sort_cold_stat = s->stat;
- }
-
- return damon_lru_sort_handle_commit_inputs();
-}
-
-static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c)
-{
- return damon_lru_sort_handle_commit_inputs();
-}
-
static int __init damon_lru_sort_init(void)
{
int err = damon_modules_new_paddr_ctx_target(&ctx, &target);
if (err)
- return err;
+ goto out;
- ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check;
- ctx->callback.after_aggregation = damon_lru_sort_after_aggregation;
+ call_control.data = ctx;
/* 'enabled' has set before this function, probably via command line */
if (enabled)
err = damon_lru_sort_turn(true);
+out:
+ if (err && enabled)
+ enabled = false;
return err;
}
diff --git a/mm/damon/modules-common.c b/mm/damon/modules-common.c
index 7cf96574cde7..86d58f8c4f63 100644
--- a/mm/damon/modules-common.c
+++ b/mm/damon/modules-common.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Common Primitives for DAMON Modules
+ * Common Code for DAMON Modules
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h
index f49cdb417005..f103ad556368 100644
--- a/mm/damon/modules-common.h
+++ b/mm/damon/modules-common.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * Common Primitives for DAMON Modules
+ * Common Code for DAMON Modules
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index 0db1fc70c84d..99321ff5cb92 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -1,10 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Common Primitives for Data Access Monitoring
+ * Common Code for Data Access Monitoring
*
* Author: SeongJae Park <sj@kernel.org>
*/
+#include <linux/migrate.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/pagemap.h>
@@ -12,6 +13,7 @@
#include <linux/swap.h>
#include <linux/swapops.h>
+#include "../internal.h"
#include "ops-common.h"
/*
@@ -138,3 +140,275 @@ int damon_cold_score(struct damon_ctx *c, struct damon_region *r,
/* Return coldness of the region */
return DAMOS_MAX_SCORE - hotness;
}
+
+static bool damon_folio_mkold_one(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr, void *arg)
+{
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
+
+ while (page_vma_mapped_walk(&pvmw)) {
+ addr = pvmw.address;
+ if (pvmw.pte)
+ damon_ptep_mkold(pvmw.pte, vma, addr);
+ else
+ damon_pmdp_mkold(pvmw.pmd, vma, addr);
+ }
+ return true;
+}
+
+void damon_folio_mkold(struct folio *folio)
+{
+ struct rmap_walk_control rwc = {
+ .rmap_one = damon_folio_mkold_one,
+ .anon_lock = folio_lock_anon_vma_read,
+ };
+ bool need_lock;
+
+ if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
+ folio_set_idle(folio);
+ return;
+ }
+
+ need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
+ if (need_lock && !folio_trylock(folio))
+ return;
+
+ rmap_walk(folio, &rwc);
+
+ if (need_lock)
+ folio_unlock(folio);
+
+}
+
+static bool damon_folio_young_one(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr, void *arg)
+{
+ bool *accessed = arg;
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
+ pte_t pte;
+
+ *accessed = false;
+ while (page_vma_mapped_walk(&pvmw)) {
+ addr = pvmw.address;
+ if (pvmw.pte) {
+ pte = ptep_get(pvmw.pte);
+
+ /*
+ * PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages are "old" from a CPU perspective.
+ * The MMU notifier takes care of any device aspects.
+ */
+ *accessed = (pte_present(pte) && pte_young(pte)) ||
+ !folio_test_idle(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, addr);
+ } else {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ *accessed = pmd_young(pmdp_get(pvmw.pmd)) ||
+ !folio_test_idle(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, addr);
+#else
+ WARN_ON_ONCE(1);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ }
+ if (*accessed) {
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ }
+
+ /* If accessed, stop walking */
+ return *accessed == false;
+}
+
+bool damon_folio_young(struct folio *folio)
+{
+ bool accessed = false;
+ struct rmap_walk_control rwc = {
+ .arg = &accessed,
+ .rmap_one = damon_folio_young_one,
+ .anon_lock = folio_lock_anon_vma_read,
+ };
+ bool need_lock;
+
+ if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
+ if (folio_test_idle(folio))
+ return false;
+ else
+ return true;
+ }
+
+ need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
+ if (need_lock && !folio_trylock(folio))
+ return false;
+
+ rmap_walk(folio, &rwc);
+
+ if (need_lock)
+ folio_unlock(folio);
+
+ return accessed;
+}
+
+bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio)
+{
+ bool matched = false;
+ struct mem_cgroup *memcg;
+ size_t folio_sz;
+
+ switch (filter->type) {
+ case DAMOS_FILTER_TYPE_ANON:
+ matched = folio_test_anon(folio);
+ break;
+ case DAMOS_FILTER_TYPE_ACTIVE:
+ matched = folio_test_active(folio);
+ break;
+ case DAMOS_FILTER_TYPE_MEMCG:
+ rcu_read_lock();
+ memcg = folio_memcg_check(folio);
+ if (!memcg)
+ matched = false;
+ else
+ matched = filter->memcg_id == mem_cgroup_id(memcg);
+ rcu_read_unlock();
+ break;
+ case DAMOS_FILTER_TYPE_YOUNG:
+ matched = damon_folio_young(folio);
+ if (matched)
+ damon_folio_mkold(folio);
+ break;
+ case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE:
+ folio_sz = folio_size(folio);
+ matched = filter->sz_range.min <= folio_sz &&
+ folio_sz <= filter->sz_range.max;
+ break;
+ case DAMOS_FILTER_TYPE_UNMAPPED:
+ matched = !folio_mapped(folio) || !folio_raw_mapping(folio);
+ break;
+ default:
+ break;
+ }
+
+ return matched == filter->matching;
+}
+
+static unsigned int __damon_migrate_folio_list(
+ struct list_head *migrate_folios, struct pglist_data *pgdat,
+ int target_nid)
+{
+ unsigned int nr_succeeded = 0;
+ struct migration_target_control mtc = {
+ /*
+ * Allocate from 'node', or fail quickly and quietly.
+ * When this happens, 'page' will likely just be discarded
+ * instead of migrated.
+ */
+ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
+ __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT,
+ .nid = target_nid,
+ };
+
+ if (pgdat->node_id == target_nid || target_nid == NUMA_NO_NODE)
+ return 0;
+
+ if (list_empty(migrate_folios))
+ return 0;
+
+ /* Migration ignores all cpuset and mempolicy settings */
+ migrate_pages(migrate_folios, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_ASYNC, MR_DAMON,
+ &nr_succeeded);
+
+ return nr_succeeded;
+}
+
+static unsigned int damon_migrate_folio_list(struct list_head *folio_list,
+ struct pglist_data *pgdat,
+ int target_nid)
+{
+ unsigned int nr_migrated = 0;
+ struct folio *folio;
+ LIST_HEAD(ret_folios);
+ LIST_HEAD(migrate_folios);
+
+ while (!list_empty(folio_list)) {
+ struct folio *folio;
+
+ cond_resched();
+
+ folio = lru_to_folio(folio_list);
+ list_del(&folio->lru);
+
+ if (!folio_trylock(folio))
+ goto keep;
+
+ /* Relocate its contents to another node. */
+ list_add(&folio->lru, &migrate_folios);
+ folio_unlock(folio);
+ continue;
+keep:
+ list_add(&folio->lru, &ret_folios);
+ }
+ /* 'folio_list' is always empty here */
+
+ /* Migrate folios selected for migration */
+ nr_migrated += __damon_migrate_folio_list(
+ &migrate_folios, pgdat, target_nid);
+ /*
+ * Folios that could not be migrated are still in @migrate_folios. Add
+ * those back on @folio_list
+ */
+ if (!list_empty(&migrate_folios))
+ list_splice_init(&migrate_folios, folio_list);
+
+ try_to_unmap_flush();
+
+ list_splice(&ret_folios, folio_list);
+
+ while (!list_empty(folio_list)) {
+ folio = lru_to_folio(folio_list);
+ list_del(&folio->lru);
+ folio_putback_lru(folio);
+ }
+
+ return nr_migrated;
+}
+
+unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid)
+{
+ int nid;
+ unsigned long nr_migrated = 0;
+ LIST_HEAD(node_folio_list);
+ unsigned int noreclaim_flag;
+
+ if (list_empty(folio_list))
+ return nr_migrated;
+
+ if (target_nid < 0 || target_nid >= MAX_NUMNODES ||
+ !node_state(target_nid, N_MEMORY))
+ return nr_migrated;
+
+ noreclaim_flag = memalloc_noreclaim_save();
+
+ nid = folio_nid(lru_to_folio(folio_list));
+ do {
+ struct folio *folio = lru_to_folio(folio_list);
+
+ if (nid == folio_nid(folio)) {
+ list_move(&folio->lru, &node_folio_list);
+ continue;
+ }
+
+ nr_migrated += damon_migrate_folio_list(&node_folio_list,
+ NODE_DATA(nid),
+ target_nid);
+ nid = folio_nid(lru_to_folio(folio_list));
+ } while (!list_empty(folio_list));
+
+ nr_migrated += damon_migrate_folio_list(&node_folio_list,
+ NODE_DATA(nid),
+ target_nid);
+
+ memalloc_noreclaim_restore(noreclaim_flag);
+
+ return nr_migrated;
+}
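
For context, damon_migrate_pages() expects LRU-isolated folios and copes with mixed source nodes by batching per node internally, putting whatever could not be migrated back on its LRU. The following is only a rough, hypothetical caller sketch, not part of this patch; it reuses helpers declared in ops-common.h and common folio APIs:

/* Hypothetical caller: isolate folios in a PFN range and migrate them. */
static unsigned long example_migrate_pfn_range(unsigned long start_pfn,
		unsigned long end_pfn, int target_nid)
{
	LIST_HEAD(folio_list);
	unsigned long pfn = start_pfn;

	while (pfn < end_pfn) {
		struct folio *folio = damon_get_folio(pfn);

		if (!folio) {
			pfn++;
			continue;
		}
		pfn += folio_nr_pages(folio);
		if (folio_isolate_lru(folio))
			list_add(&folio->lru, &folio_list);
		folio_put(folio);
	}
	/* migrates what it can, puts the rest back on their LRU lists */
	return damon_migrate_pages(&folio_list, target_nid);
}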
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index 18d837d11bce..61ad54aaf256 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * Common Primitives for Data Access Monitoring
+ * Common Code for Data Access Monitoring
*
* Author: SeongJae Park <sj@kernel.org>
*/
@@ -11,8 +11,13 @@ struct folio *damon_get_folio(unsigned long pfn);
void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr);
void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr);
+void damon_folio_mkold(struct folio *folio);
+bool damon_folio_young(struct folio *folio);
int damon_cold_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s);
int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s);
+
+bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio);
+unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 1b70d3f36046..53a55c5114fb 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * DAMON Primitives for The Physical Address Space
+ * DAMON Code for The Physical Address Space
*
* Author: SeongJae Park <sj@kernel.org>
*/
@@ -13,51 +13,11 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/memory-tiers.h>
-#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include "../internal.h"
#include "ops-common.h"
-static bool damon_folio_mkold_one(struct folio *folio,
- struct vm_area_struct *vma, unsigned long addr, void *arg)
-{
- DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
-
- while (page_vma_mapped_walk(&pvmw)) {
- addr = pvmw.address;
- if (pvmw.pte)
- damon_ptep_mkold(pvmw.pte, vma, addr);
- else
- damon_pmdp_mkold(pvmw.pmd, vma, addr);
- }
- return true;
-}
-
-static void damon_folio_mkold(struct folio *folio)
-{
- struct rmap_walk_control rwc = {
- .rmap_one = damon_folio_mkold_one,
- .anon_lock = folio_lock_anon_vma_read,
- };
- bool need_lock;
-
- if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
- folio_set_idle(folio);
- return;
- }
-
- need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
- if (need_lock && !folio_trylock(folio))
- return;
-
- rmap_walk(folio, &rwc);
-
- if (need_lock)
- folio_unlock(folio);
-
-}
-
static void damon_pa_mkold(unsigned long paddr)
{
struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
@@ -87,75 +47,6 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
}
}
-static bool damon_folio_young_one(struct folio *folio,
- struct vm_area_struct *vma, unsigned long addr, void *arg)
-{
- bool *accessed = arg;
- DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
- pte_t pte;
-
- *accessed = false;
- while (page_vma_mapped_walk(&pvmw)) {
- addr = pvmw.address;
- if (pvmw.pte) {
- pte = ptep_get(pvmw.pte);
-
- /*
- * PFN swap PTEs, such as device-exclusive ones, that
- * actually map pages are "old" from a CPU perspective.
- * The MMU notifier takes care of any device aspects.
- */
- *accessed = (pte_present(pte) && pte_young(pte)) ||
- !folio_test_idle(folio) ||
- mmu_notifier_test_young(vma->vm_mm, addr);
- } else {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- *accessed = pmd_young(pmdp_get(pvmw.pmd)) ||
- !folio_test_idle(folio) ||
- mmu_notifier_test_young(vma->vm_mm, addr);
-#else
- WARN_ON_ONCE(1);
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
- }
- if (*accessed) {
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
- }
-
- /* If accessed, stop walking */
- return *accessed == false;
-}
-
-static bool damon_folio_young(struct folio *folio)
-{
- bool accessed = false;
- struct rmap_walk_control rwc = {
- .arg = &accessed,
- .rmap_one = damon_folio_young_one,
- .anon_lock = folio_lock_anon_vma_read,
- };
- bool need_lock;
-
- if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
- if (folio_test_idle(folio))
- return false;
- else
- return true;
- }
-
- need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
- if (need_lock && !folio_trylock(folio))
- return false;
-
- rmap_walk(folio, &rwc);
-
- if (need_lock)
- folio_unlock(folio);
-
- return accessed;
-}
-
static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
{
struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
@@ -206,49 +97,6 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
return max_nr_accesses;
}
-static bool damos_pa_filter_match(struct damos_filter *filter,
- struct folio *folio)
-{
- bool matched = false;
- struct mem_cgroup *memcg;
- size_t folio_sz;
-
- switch (filter->type) {
- case DAMOS_FILTER_TYPE_ANON:
- matched = folio_test_anon(folio);
- break;
- case DAMOS_FILTER_TYPE_ACTIVE:
- matched = folio_test_active(folio);
- break;
- case DAMOS_FILTER_TYPE_MEMCG:
- rcu_read_lock();
- memcg = folio_memcg_check(folio);
- if (!memcg)
- matched = false;
- else
- matched = filter->memcg_id == mem_cgroup_id(memcg);
- rcu_read_unlock();
- break;
- case DAMOS_FILTER_TYPE_YOUNG:
- matched = damon_folio_young(folio);
- if (matched)
- damon_folio_mkold(folio);
- break;
- case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE:
- folio_sz = folio_size(folio);
- matched = filter->sz_range.min <= folio_sz &&
- folio_sz <= filter->sz_range.max;
- break;
- case DAMOS_FILTER_TYPE_UNMAPPED:
- matched = !folio_mapped(folio) || !folio_raw_mapping(folio);
- break;
- default:
- break;
- }
-
- return matched == filter->matching;
-}
-
/*
* damos_pa_filter_out - Return true if the page should be filtered out.
*/
@@ -260,7 +108,7 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio)
return false;
damos_for_each_ops_filter(filter, scheme) {
- if (damos_pa_filter_match(filter, folio))
+ if (damos_folio_filter_match(filter, folio))
return !filter->allow;
}
return scheme->ops_filters_default_reject;
@@ -381,127 +229,6 @@ static unsigned long damon_pa_deactivate_pages(struct damon_region *r,
sz_filter_passed);
}
-static unsigned int __damon_pa_migrate_folio_list(
- struct list_head *migrate_folios, struct pglist_data *pgdat,
- int target_nid)
-{
- unsigned int nr_succeeded = 0;
- nodemask_t allowed_mask = NODE_MASK_NONE;
- struct migration_target_control mtc = {
- /*
- * Allocate from 'node', or fail quickly and quietly.
- * When this happens, 'page' will likely just be discarded
- * instead of migrated.
- */
- .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
- __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT,
- .nid = target_nid,
- .nmask = &allowed_mask
- };
-
- if (pgdat->node_id == target_nid || target_nid == NUMA_NO_NODE)
- return 0;
-
- if (list_empty(migrate_folios))
- return 0;
-
- /* Migration ignores all cpuset and mempolicy settings */
- migrate_pages(migrate_folios, alloc_migrate_folio, NULL,
- (unsigned long)&mtc, MIGRATE_ASYNC, MR_DAMON,
- &nr_succeeded);
-
- return nr_succeeded;
-}
-
-static unsigned int damon_pa_migrate_folio_list(struct list_head *folio_list,
- struct pglist_data *pgdat,
- int target_nid)
-{
- unsigned int nr_migrated = 0;
- struct folio *folio;
- LIST_HEAD(ret_folios);
- LIST_HEAD(migrate_folios);
-
- while (!list_empty(folio_list)) {
- struct folio *folio;
-
- cond_resched();
-
- folio = lru_to_folio(folio_list);
- list_del(&folio->lru);
-
- if (!folio_trylock(folio))
- goto keep;
-
- /* Relocate its contents to another node. */
- list_add(&folio->lru, &migrate_folios);
- folio_unlock(folio);
- continue;
-keep:
- list_add(&folio->lru, &ret_folios);
- }
- /* 'folio_list' is always empty here */
-
- /* Migrate folios selected for migration */
- nr_migrated += __damon_pa_migrate_folio_list(
- &migrate_folios, pgdat, target_nid);
- /*
- * Folios that could not be migrated are still in @migrate_folios. Add
- * those back on @folio_list
- */
- if (!list_empty(&migrate_folios))
- list_splice_init(&migrate_folios, folio_list);
-
- try_to_unmap_flush();
-
- list_splice(&ret_folios, folio_list);
-
- while (!list_empty(folio_list)) {
- folio = lru_to_folio(folio_list);
- list_del(&folio->lru);
- folio_putback_lru(folio);
- }
-
- return nr_migrated;
-}
-
-static unsigned long damon_pa_migrate_pages(struct list_head *folio_list,
- int target_nid)
-{
- int nid;
- unsigned long nr_migrated = 0;
- LIST_HEAD(node_folio_list);
- unsigned int noreclaim_flag;
-
- if (list_empty(folio_list))
- return nr_migrated;
-
- noreclaim_flag = memalloc_noreclaim_save();
-
- nid = folio_nid(lru_to_folio(folio_list));
- do {
- struct folio *folio = lru_to_folio(folio_list);
-
- if (nid == folio_nid(folio)) {
- list_move(&folio->lru, &node_folio_list);
- continue;
- }
-
- nr_migrated += damon_pa_migrate_folio_list(&node_folio_list,
- NODE_DATA(nid),
- target_nid);
- nid = folio_nid(lru_to_folio(folio_list));
- } while (!list_empty(folio_list));
-
- nr_migrated += damon_pa_migrate_folio_list(&node_folio_list,
- NODE_DATA(nid),
- target_nid);
-
- memalloc_noreclaim_restore(noreclaim_flag);
-
- return nr_migrated;
-}
-
static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
unsigned long *sz_filter_passed)
{
@@ -529,7 +256,7 @@ put_folio:
addr += folio_size(folio);
folio_put(folio);
}
- applied = damon_pa_migrate_pages(&folio_list, s->target_nid);
+ applied = damon_migrate_pages(&folio_list, s->target_nid);
cond_resched();
s->last_applied = folio;
return applied * PAGE_SIZE;
@@ -548,7 +275,6 @@ static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s,
unsigned long *sz_filter_passed)
{
unsigned long addr;
- LIST_HEAD(folio_list);
struct folio *folio;
if (!damon_pa_scheme_has_filter(s))
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index a675150965e0..3c71b4596676 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -194,7 +194,7 @@ static int damon_reclaim_apply_parameters(void)
if (err)
return err;
- err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs);
+ err = damon_set_attrs(param_ctx, &damon_reclaim_mon_attrs);
if (err)
goto out;
@@ -202,7 +202,7 @@ static int damon_reclaim_apply_parameters(void)
scheme = damon_reclaim_new_scheme();
if (!scheme)
goto out;
- damon_set_schemes(ctx, &scheme, 1);
+ damon_set_schemes(param_ctx, &scheme, 1);
if (quota_mem_pressure_us) {
goal = damos_new_quota_goal(DAMOS_QUOTA_SOME_MEM_PSI_US,
@@ -238,6 +238,35 @@ out:
return err;
}
+static int damon_reclaim_handle_commit_inputs(void)
+{
+ int err;
+
+ if (!commit_inputs)
+ return 0;
+
+ err = damon_reclaim_apply_parameters();
+ commit_inputs = false;
+ return err;
+}
+
+static int damon_reclaim_damon_call_fn(void *arg)
+{
+ struct damon_ctx *c = arg;
+ struct damos *s;
+
+ /* update the stats parameter */
+ damon_for_each_scheme(s, c)
+ damon_reclaim_stat = s->stat;
+
+ return damon_reclaim_handle_commit_inputs();
+}
+
+static struct damon_call_control call_control = {
+ .fn = damon_reclaim_damon_call_fn,
+ .repeat = true,
+};
+
static int damon_reclaim_turn(bool on)
{
int err;
@@ -257,7 +286,7 @@ static int damon_reclaim_turn(bool on)
if (err)
return err;
kdamond_pid = ctx->kdamond->pid;
- return 0;
+ return damon_call(ctx, &call_control);
}
static int damon_reclaim_enabled_store(const char *val,
@@ -296,48 +325,22 @@ module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
MODULE_PARM_DESC(enabled,
"Enable or disable DAMON_RECLAIM (default: disabled)");
-static int damon_reclaim_handle_commit_inputs(void)
-{
- int err;
-
- if (!commit_inputs)
- return 0;
-
- err = damon_reclaim_apply_parameters();
- commit_inputs = false;
- return err;
-}
-
-static int damon_reclaim_after_aggregation(struct damon_ctx *c)
-{
- struct damos *s;
-
- /* update the stats parameter */
- damon_for_each_scheme(s, c)
- damon_reclaim_stat = s->stat;
-
- return damon_reclaim_handle_commit_inputs();
-}
-
-static int damon_reclaim_after_wmarks_check(struct damon_ctx *c)
-{
- return damon_reclaim_handle_commit_inputs();
-}
-
static int __init damon_reclaim_init(void)
{
int err = damon_modules_new_paddr_ctx_target(&ctx, &target);
if (err)
- return err;
+ goto out;
- ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check;
- ctx->callback.after_aggregation = damon_reclaim_after_aggregation;
+ call_control.data = ctx;
	/* 'enabled' has been set before this function, probably via command line */
if (enabled)
err = damon_reclaim_turn(true);
+out:
+ if (err && enabled)
+ enabled = false;
return err;
}
diff --git a/mm/damon/stat.c b/mm/damon/stat.c
new file mode 100644
index 000000000000..87bcd8866d4b
--- /dev/null
+++ b/mm/damon/stat.c
@@ -0,0 +1,264 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shows data access monitoring results in simple metrics.
+ */
+
+#define pr_fmt(fmt) "damon-stat: " fmt
+
+#include <linux/damon.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sort.h>
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "damon_stat."
+
+static int damon_stat_enabled_store(
+ const char *val, const struct kernel_param *kp);
+
+static const struct kernel_param_ops enabled_param_ops = {
+ .set = damon_stat_enabled_store,
+ .get = param_get_bool,
+};
+
+static bool enabled __read_mostly = IS_ENABLED(
+ CONFIG_DAMON_STAT_ENABLED_DEFAULT);
+module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
+MODULE_PARM_DESC(enabled, "Enable or disable DAMON_STAT");
+
+static unsigned long estimated_memory_bandwidth __read_mostly;
+module_param(estimated_memory_bandwidth, ulong, 0400);
+MODULE_PARM_DESC(estimated_memory_bandwidth,
+ "Estimated memory bandwidth usage in bytes per second");
+
+static unsigned long memory_idle_ms_percentiles[101] __read_mostly = {0,};
+module_param_array(memory_idle_ms_percentiles, ulong, NULL, 0400);
+MODULE_PARM_DESC(memory_idle_ms_percentiles,
+ "Memory idle time percentiles in milliseconds");
+
+static struct damon_ctx *damon_stat_context;
+
+static void damon_stat_set_estimated_memory_bandwidth(struct damon_ctx *c)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ unsigned long access_bytes = 0;
+
+ damon_for_each_target(t, c) {
+ damon_for_each_region(r, t)
+ access_bytes += (r->ar.end - r->ar.start) *
+ r->nr_accesses;
+ }
+ estimated_memory_bandwidth = access_bytes * USEC_PER_MSEC *
+ MSEC_PER_SEC / c->attrs.aggr_interval;
+}
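
Since access_bytes sums each region's size weighted by its per-aggregation-interval access count, dividing by the aggregation interval (in microseconds) and scaling to one second yields a bytes-per-second estimate. A standalone worked example with made-up numbers:

#include <stdio.h>

int main(void)
{
	/* hypothetical: one 64 MiB region observed with nr_accesses == 3 */
	unsigned long long region_sz = 64ULL << 20;
	unsigned long long nr_accesses = 3;
	unsigned long long aggr_interval_us = 100 * 1000;	/* 100ms */

	unsigned long long access_bytes = region_sz * nr_accesses;
	/* same scaling as above: per-interval bytes -> bytes per second */
	unsigned long long bw = access_bytes * 1000000ULL / aggr_interval_us;

	printf("estimated bandwidth: %llu bytes/s\n", bw);	/* ~1.9 GiB/s */
	return 0;
}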
+
+static unsigned int damon_stat_idletime(const struct damon_region *r)
+{
+ if (r->nr_accesses)
+ return 0;
+ return r->age + 1;
+}
+
+static int damon_stat_cmp_regions(const void *a, const void *b)
+{
+ const struct damon_region *ra = *(const struct damon_region **)a;
+ const struct damon_region *rb = *(const struct damon_region **)b;
+
+ return damon_stat_idletime(ra) - damon_stat_idletime(rb);
+}
+
+static int damon_stat_sort_regions(struct damon_ctx *c,
+ struct damon_region ***sorted_ptr, int *nr_regions_ptr,
+ unsigned long *total_sz_ptr)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ struct damon_region **region_pointers;
+ unsigned int nr_regions = 0;
+ unsigned long total_sz = 0;
+
+ damon_for_each_target(t, c) {
+ /* there is only one target */
+ region_pointers = kmalloc_array(damon_nr_regions(t),
+ sizeof(*region_pointers), GFP_KERNEL);
+ if (!region_pointers)
+ return -ENOMEM;
+ damon_for_each_region(r, t) {
+ region_pointers[nr_regions++] = r;
+ total_sz += r->ar.end - r->ar.start;
+ }
+ }
+ sort(region_pointers, nr_regions, sizeof(*region_pointers),
+ damon_stat_cmp_regions, NULL);
+ *sorted_ptr = region_pointers;
+ *nr_regions_ptr = nr_regions;
+ *total_sz_ptr = total_sz;
+ return 0;
+}
+
+static void damon_stat_set_idletime_percentiles(struct damon_ctx *c)
+{
+ struct damon_region **sorted_regions, *region;
+ int nr_regions;
+ unsigned long total_sz, accounted_bytes = 0;
+ int err, i, next_percentile = 0;
+
+ err = damon_stat_sort_regions(c, &sorted_regions, &nr_regions,
+ &total_sz);
+ if (err)
+ return;
+ for (i = 0; i < nr_regions; i++) {
+ region = sorted_regions[i];
+ accounted_bytes += region->ar.end - region->ar.start;
+ while (next_percentile <= accounted_bytes * 100 / total_sz)
+ memory_idle_ms_percentiles[next_percentile++] =
+ damon_stat_idletime(region) *
+ c->attrs.aggr_interval / USEC_PER_MSEC;
+ }
+ kfree(sorted_regions);
+}
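
The loop assigns percentile N the idle time (converted from aggregation intervals to milliseconds) of the region at which N percent of the monitored bytes have been accounted, walking regions from least to most idle. A standalone sketch of the fill with three made-up regions covering 50%, 30% and 20% of the bytes:

#include <stdio.h>

int main(void)
{
	/* regions already sorted by idle time (in aggregation intervals) */
	unsigned long sz[] = { 50, 30, 20 };	/* fraction of bytes, in % */
	unsigned long idle[] = { 0, 2, 5 };	/* damon_stat_idletime() values */
	unsigned long aggr_ms = 100;		/* 100ms aggregation interval */
	unsigned long total = 100, accounted = 0, pct[101];
	int next = 0;

	for (int i = 0; i < 3; i++) {
		accounted += sz[i];
		while (next <= accounted * 100 / total)
			pct[next++] = idle[i] * aggr_ms;
	}
	/* pct[0..50] == 0, pct[51..80] == 200, pct[81..100] == 500 */
	printf("p50=%lums p80=%lums p100=%lums\n", pct[50], pct[80], pct[100]);
	return 0;
}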
+
+static int damon_stat_damon_call_fn(void *data)
+{
+ struct damon_ctx *c = data;
+ static unsigned long last_refresh_jiffies;
+
+ /* avoid unnecessarily frequent stat update */
+ if (time_before_eq(jiffies, last_refresh_jiffies +
+ msecs_to_jiffies(5 * MSEC_PER_SEC)))
+ return 0;
+ last_refresh_jiffies = jiffies;
+
+ damon_stat_set_estimated_memory_bandwidth(c);
+ damon_stat_set_idletime_percentiles(c);
+ return 0;
+}
+
+static struct damon_ctx *damon_stat_build_ctx(void)
+{
+ struct damon_ctx *ctx;
+ struct damon_attrs attrs;
+ struct damon_target *target;
+ unsigned long start = 0, end = 0;
+
+ ctx = damon_new_ctx();
+ if (!ctx)
+ return NULL;
+ attrs = (struct damon_attrs) {
+ .sample_interval = 5 * USEC_PER_MSEC,
+ .aggr_interval = 100 * USEC_PER_MSEC,
+ .ops_update_interval = 60 * USEC_PER_MSEC * MSEC_PER_SEC,
+ .min_nr_regions = 10,
+ .max_nr_regions = 1000,
+ };
+ /*
+ * auto-tune sampling and aggregation interval aiming 4% DAMON-observed
+ * accesses ratio, keeping sampling interval in [5ms, 10s] range.
+ */
+ attrs.intervals_goal = (struct damon_intervals_goal) {
+ .access_bp = 400, .aggrs = 3,
+ .min_sample_us = 5000, .max_sample_us = 10000000,
+ };
+ if (damon_set_attrs(ctx, &attrs))
+ goto free_out;
+
+ /*
+ * auto-tune sampling and aggregation interval aiming 4% DAMON-observed
+ * accesses ratio, keeping sampling interval in [5ms, 10s] range.
+ */
+ ctx->attrs.intervals_goal = (struct damon_intervals_goal) {
+ .access_bp = 400, .aggrs = 3,
+ .min_sample_us = 5000, .max_sample_us = 10000000,
+ };
+ if (damon_select_ops(ctx, DAMON_OPS_PADDR))
+ goto free_out;
+
+ target = damon_new_target();
+ if (!target)
+ goto free_out;
+ damon_add_target(ctx, target);
+ if (damon_set_region_biggest_system_ram_default(target, &start, &end))
+ goto free_out;
+ return ctx;
+free_out:
+ damon_destroy_ctx(ctx);
+ return NULL;
+}
+
+static struct damon_call_control call_control = {
+ .fn = damon_stat_damon_call_fn,
+ .repeat = true,
+};
+
+static int damon_stat_start(void)
+{
+ int err;
+
+ damon_stat_context = damon_stat_build_ctx();
+ if (!damon_stat_context)
+ return -ENOMEM;
+ err = damon_start(&damon_stat_context, 1, true);
+ if (err)
+ return err;
+ call_control.data = damon_stat_context;
+ return damon_call(damon_stat_context, &call_control);
+}
+
+static void damon_stat_stop(void)
+{
+ damon_stop(&damon_stat_context, 1);
+ damon_destroy_ctx(damon_stat_context);
+}
+
+static bool damon_stat_init_called;
+
+static int damon_stat_enabled_store(
+ const char *val, const struct kernel_param *kp)
+{
+ bool is_enabled = enabled;
+ int err;
+
+ err = kstrtobool(val, &enabled);
+ if (err)
+ return err;
+
+ if (is_enabled == enabled)
+ return 0;
+
+ if (!damon_stat_init_called)
+ /*
+	 * Probably called from command line parsing (parse_args()).
+	 * Cannot call damon_new_ctx(); let damon_stat_init() handle it.
+ */
+ return 0;
+
+ if (enabled) {
+ err = damon_stat_start();
+ if (err)
+ enabled = false;
+ return err;
+ }
+ damon_stat_stop();
+ return 0;
+}
+
+static int __init damon_stat_init(void)
+{
+ int err = 0;
+
+ damon_stat_init_called = true;
+
+ /* probably set via command line */
+ if (enabled)
+ err = damon_stat_start();
+
+ if (err && enabled)
+ enabled = false;
+ return err;
+}
+
+module_init(damon_stat_init);
diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c
index 70edf45c2174..ffaf285e241a 100644
--- a/mm/damon/sysfs-common.c
+++ b/mm/damon/sysfs-common.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Common Primitives for DAMON Sysfs Interface
+ * Common Code for DAMON Sysfs Interface
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h
index 70d84bdc9f5f..2099adee11d0 100644
--- a/mm/damon/sysfs-common.h
+++ b/mm/damon/sysfs-common.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * Common Primitives for DAMON Sysfs Interface
+ * Common Code for DAMON Sysfs Interface
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 23b562df0839..74056bcd6a2c 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -341,16 +341,45 @@ static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(
return filter;
}
-/* Should match with enum damos_filter_type */
-static const char * const damon_sysfs_scheme_filter_type_strs[] = {
- "anon",
- "active",
- "memcg",
- "young",
- "hugepage_size",
- "unmapped",
- "addr",
- "target",
+struct damos_sysfs_filter_type_name {
+ enum damos_filter_type type;
+ char *name;
+};
+
+static const struct damos_sysfs_filter_type_name
+damos_sysfs_filter_type_names[] = {
+ {
+ .type = DAMOS_FILTER_TYPE_ANON,
+ .name = "anon",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_ACTIVE,
+ .name = "active",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_MEMCG,
+ .name = "memcg",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_YOUNG,
+ .name = "young",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_HUGEPAGE_SIZE,
+ .name = "hugepage_size",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_UNMAPPED,
+ .name = "unmapped",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_ADDR,
+ .name = "addr",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_TARGET,
+ .name = "target",
+ },
};
static ssize_t type_show(struct kobject *kobj,
@@ -358,9 +387,16 @@ static ssize_t type_show(struct kobject *kobj,
{
struct damon_sysfs_scheme_filter *filter = container_of(kobj,
struct damon_sysfs_scheme_filter, kobj);
+ int i;
- return sysfs_emit(buf, "%s\n",
- damon_sysfs_scheme_filter_type_strs[filter->type]);
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_filter_type_names); i++) {
+ const struct damos_sysfs_filter_type_name *type_name;
+
+ type_name = &damos_sysfs_filter_type_names[i];
+ if (type_name->type == filter->type)
+ return sysfs_emit(buf, "%s\n", type_name->name);
+ }
+ return -EINVAL;
}
static bool damos_sysfs_scheme_filter_valid_type(
@@ -385,16 +421,19 @@ static ssize_t type_store(struct kobject *kobj,
{
struct damon_sysfs_scheme_filter *filter = container_of(kobj,
struct damon_sysfs_scheme_filter, kobj);
- enum damos_filter_type type;
ssize_t ret = -EINVAL;
+ int i;
- for (type = 0; type < NR_DAMOS_FILTER_TYPES; type++) {
- if (sysfs_streq(buf, damon_sysfs_scheme_filter_type_strs[
- type])) {
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_filter_type_names); i++) {
+ const struct damos_sysfs_filter_type_name *type_name;
+
+ type_name = &damos_sysfs_filter_type_names[i];
+ if (sysfs_streq(buf, type_name->name)) {
if (!damos_sysfs_scheme_filter_valid_type(
- filter->handle_layer, type))
+ filter->handle_layer,
+ type_name->type))
break;
- filter->type = type;
+ filter->type = type_name->type;
ret = count;
break;
}
@@ -465,12 +504,14 @@ static ssize_t memcg_path_store(struct kobject *kobj,
{
struct damon_sysfs_scheme_filter *filter = container_of(kobj,
struct damon_sysfs_scheme_filter, kobj);
- char *path = kmalloc(sizeof(*path) * (count + 1), GFP_KERNEL);
+ char *path = kmalloc_array(size_add(count, 1), sizeof(*path),
+ GFP_KERNEL);
if (!path)
return -ENOMEM;
strscpy(path, buf, count + 1);
+ kfree(filter->memcg_path);
filter->memcg_path = path;
return count;
}
@@ -783,10 +824,21 @@ static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc(
return watermarks;
}
-/* Should match with enum damos_wmark_metric */
-static const char * const damon_sysfs_wmark_metric_strs[] = {
- "none",
- "free_mem_rate",
+struct damos_sysfs_wmark_metric_name {
+ enum damos_wmark_metric metric;
+ char *name;
+};
+
+static const struct damos_sysfs_wmark_metric_name
+damos_sysfs_wmark_metric_names[] = {
+ {
+ .metric = DAMOS_WMARK_NONE,
+ .name = "none",
+ },
+ {
+ .metric = DAMOS_WMARK_FREE_MEM_RATE,
+ .name = "free_mem_rate",
+ },
};
static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -794,9 +846,16 @@ static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_watermarks *watermarks = container_of(kobj,
struct damon_sysfs_watermarks, kobj);
+ int i;
- return sysfs_emit(buf, "%s\n",
- damon_sysfs_wmark_metric_strs[watermarks->metric]);
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_wmark_metric_names); i++) {
+ const struct damos_sysfs_wmark_metric_name *metric_name;
+
+ metric_name = &damos_sysfs_wmark_metric_names[i];
+ if (metric_name->metric == watermarks->metric)
+ return sysfs_emit(buf, "%s\n", metric_name->name);
+ }
+ return -EINVAL;
}
static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -804,11 +863,14 @@ static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_watermarks *watermarks = container_of(kobj,
struct damon_sysfs_watermarks, kobj);
- enum damos_wmark_metric metric;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_wmark_metric_names); i++) {
+ const struct damos_sysfs_wmark_metric_name *metric_name;
- for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) {
- if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) {
- watermarks->metric = metric;
+ metric_name = &damos_sysfs_wmark_metric_names[i];
+ if (sysfs_streq(buf, metric_name->name)) {
+ watermarks->metric = metric_name->metric;
return count;
}
}
@@ -936,12 +998,7 @@ struct damos_sysfs_quota_goal {
enum damos_quota_goal_metric metric;
unsigned long target_value;
unsigned long current_value;
-};
-
-/* This should match with enum damos_action */
-static const char * const damos_sysfs_quota_goal_metric_strs[] = {
- "user_input",
- "some_mem_psi_us",
+ int nid;
};
static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void)
@@ -949,14 +1006,46 @@ static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void)
return kzalloc(sizeof(struct damos_sysfs_quota_goal), GFP_KERNEL);
}
+struct damos_sysfs_qgoal_metric_name {
+ enum damos_quota_goal_metric metric;
+ char *name;
+};
+
+static
+struct damos_sysfs_qgoal_metric_name damos_sysfs_qgoal_metric_names[] = {
+ {
+ .metric = DAMOS_QUOTA_USER_INPUT,
+ .name = "user_input",
+ },
+ {
+ .metric = DAMOS_QUOTA_SOME_MEM_PSI_US,
+ .name = "some_mem_psi_us",
+ },
+ {
+ .metric = DAMOS_QUOTA_NODE_MEM_USED_BP,
+ .name = "node_mem_used_bp",
+ },
+ {
+ .metric = DAMOS_QUOTA_NODE_MEM_FREE_BP,
+ .name = "node_mem_free_bp",
+ },
+};
+
static ssize_t target_metric_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct damos_sysfs_quota_goal *goal = container_of(kobj,
struct damos_sysfs_quota_goal, kobj);
+ int i;
- return sysfs_emit(buf, "%s\n",
- damos_sysfs_quota_goal_metric_strs[goal->metric]);
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_qgoal_metric_names); i++) {
+ struct damos_sysfs_qgoal_metric_name *metric_name;
+
+ metric_name = &damos_sysfs_qgoal_metric_names[i];
+ if (metric_name->metric == goal->metric)
+ return sysfs_emit(buf, "%s\n", metric_name->name);
+ }
+ return -EINVAL;
}
static ssize_t target_metric_store(struct kobject *kobj,
@@ -964,11 +1053,14 @@ static ssize_t target_metric_store(struct kobject *kobj,
{
struct damos_sysfs_quota_goal *goal = container_of(kobj,
struct damos_sysfs_quota_goal, kobj);
- enum damos_quota_goal_metric m;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_qgoal_metric_names); i++) {
+ struct damos_sysfs_qgoal_metric_name *metric_name;
- for (m = 0; m < NR_DAMOS_QUOTA_GOAL_METRICS; m++) {
- if (sysfs_streq(buf, damos_sysfs_quota_goal_metric_strs[m])) {
- goal->metric = m;
+ metric_name = &damos_sysfs_qgoal_metric_names[i];
+ if (sysfs_streq(buf, metric_name->name)) {
+ goal->metric = metric_name->metric;
return count;
}
}
@@ -1014,6 +1106,28 @@ static ssize_t current_value_store(struct kobject *kobj,
return err ? err : count;
}
+static ssize_t nid_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damos_sysfs_quota_goal *goal = container_of(kobj, struct
+ damos_sysfs_quota_goal, kobj);
+
+ /* todo: return error if the goal is not using nid */
+
+ return sysfs_emit(buf, "%d\n", goal->nid);
+}
+
+static ssize_t nid_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damos_sysfs_quota_goal *goal = container_of(kobj, struct
+ damos_sysfs_quota_goal, kobj);
+ int err = kstrtoint(buf, 0, &goal->nid);
+
+ /* feed callback should check existence of this file and read value */
+ return err ? err : count;
+}
+
static void damos_sysfs_quota_goal_release(struct kobject *kobj)
{
/* or, notify this release to the feed callback */
@@ -1029,10 +1143,14 @@ static struct kobj_attribute damos_sysfs_quota_goal_target_value_attr =
static struct kobj_attribute damos_sysfs_quota_goal_current_value_attr =
__ATTR_RW_MODE(current_value, 0600);
+static struct kobj_attribute damos_sysfs_quota_goal_nid_attr =
+ __ATTR_RW_MODE(nid, 0600);
+
static struct attribute *damos_sysfs_quota_goal_attrs[] = {
&damos_sysfs_quota_goal_target_metric_attr.attr,
&damos_sysfs_quota_goal_target_value_attr.attr,
&damos_sysfs_quota_goal_current_value_attr.attr,
+ &damos_sysfs_quota_goal_nid_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damos_sysfs_quota_goal);
@@ -1538,6 +1656,204 @@ static const struct kobj_type damon_sysfs_access_pattern_ktype = {
};
/*
+ * dest (action destination) directory
+ */
+
+struct damos_sysfs_dest {
+ struct kobject kobj;
+ unsigned int id;
+ unsigned int weight;
+};
+
+static struct damos_sysfs_dest *damos_sysfs_dest_alloc(void)
+{
+ return kzalloc(sizeof(struct damos_sysfs_dest), GFP_KERNEL);
+}
+
+static ssize_t id_show(
+ struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ struct damos_sysfs_dest *dest = container_of(kobj,
+ struct damos_sysfs_dest, kobj);
+
+ return sysfs_emit(buf, "%u\n", dest->id);
+}
+
+static ssize_t id_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damos_sysfs_dest *dest = container_of(kobj,
+ struct damos_sysfs_dest, kobj);
+ int err = kstrtouint(buf, 0, &dest->id);
+
+ return err ? err : count;
+}
+
+static ssize_t weight_show(
+ struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ struct damos_sysfs_dest *dest = container_of(kobj,
+ struct damos_sysfs_dest, kobj);
+
+ return sysfs_emit(buf, "%u\n", dest->weight);
+}
+
+static ssize_t weight_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damos_sysfs_dest *dest = container_of(kobj,
+ struct damos_sysfs_dest, kobj);
+ int err = kstrtouint(buf, 0, &dest->weight);
+
+ return err ? err : count;
+}
+
+static void damos_sysfs_dest_release(struct kobject *kobj)
+{
+ struct damos_sysfs_dest *dest = container_of(kobj,
+ struct damos_sysfs_dest, kobj);
+ kfree(dest);
+}
+
+static struct kobj_attribute damos_sysfs_dest_id_attr =
+ __ATTR_RW_MODE(id, 0600);
+
+static struct kobj_attribute damos_sysfs_dest_weight_attr =
+ __ATTR_RW_MODE(weight, 0600);
+
+static struct attribute *damos_sysfs_dest_attrs[] = {
+ &damos_sysfs_dest_id_attr.attr,
+ &damos_sysfs_dest_weight_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damos_sysfs_dest);
+
+static const struct kobj_type damos_sysfs_dest_ktype = {
+ .release = damos_sysfs_dest_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damos_sysfs_dest_groups,
+};
+
+/*
+ * dests (action destinations) directory
+ */
+
+struct damos_sysfs_dests {
+ struct kobject kobj;
+ struct damos_sysfs_dest **dests_arr;
+ int nr;
+};
+
+static struct damos_sysfs_dests *
+damos_sysfs_dests_alloc(void)
+{
+ return kzalloc(sizeof(struct damos_sysfs_dests), GFP_KERNEL);
+}
+
+static void damos_sysfs_dests_rm_dirs(
+ struct damos_sysfs_dests *dests)
+{
+ struct damos_sysfs_dest **dests_arr = dests->dests_arr;
+ int i;
+
+ for (i = 0; i < dests->nr; i++)
+ kobject_put(&dests_arr[i]->kobj);
+ dests->nr = 0;
+ kfree(dests_arr);
+ dests->dests_arr = NULL;
+}
+
+static int damos_sysfs_dests_add_dirs(
+ struct damos_sysfs_dests *dests, int nr_dests)
+{
+ struct damos_sysfs_dest **dests_arr, *dest;
+ int err, i;
+
+ damos_sysfs_dests_rm_dirs(dests);
+ if (!nr_dests)
+ return 0;
+
+ dests_arr = kmalloc_array(nr_dests, sizeof(*dests_arr),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!dests_arr)
+ return -ENOMEM;
+ dests->dests_arr = dests_arr;
+
+ for (i = 0; i < nr_dests; i++) {
+ dest = damos_sysfs_dest_alloc();
+ if (!dest) {
+ damos_sysfs_dests_rm_dirs(dests);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&dest->kobj,
+ &damos_sysfs_dest_ktype,
+ &dests->kobj, "%d", i);
+ if (err) {
+ kobject_put(&dest->kobj);
+ damos_sysfs_dests_rm_dirs(dests);
+ return err;
+ }
+
+ dests_arr[i] = dest;
+ dests->nr++;
+ }
+ return 0;
+}
+
+static ssize_t nr_dests_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damos_sysfs_dests *dests = container_of(kobj,
+ struct damos_sysfs_dests, kobj);
+
+ return sysfs_emit(buf, "%d\n", dests->nr);
+}
+
+static ssize_t nr_dests_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damos_sysfs_dests *dests;
+ int nr, err = kstrtoint(buf, 0, &nr);
+
+ if (err)
+ return err;
+ if (nr < 0)
+ return -EINVAL;
+
+ dests = container_of(kobj, struct damos_sysfs_dests, kobj);
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damos_sysfs_dests_add_dirs(dests, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static void damos_sysfs_dests_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damos_sysfs_dests, kobj));
+}
+
+static struct kobj_attribute damos_sysfs_dests_nr_attr =
+ __ATTR_RW_MODE(nr_dests, 0600);
+
+static struct attribute *damos_sysfs_dests_attrs[] = {
+ &damos_sysfs_dests_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damos_sysfs_dests);
+
+static const struct kobj_type damos_sysfs_dests_ktype = {
+ .release = damos_sysfs_dests_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damos_sysfs_dests_groups,
+};
+
+/*
* scheme directory
*/
@@ -1554,20 +1870,55 @@ struct damon_sysfs_scheme {
struct damon_sysfs_stats *stats;
struct damon_sysfs_scheme_regions *tried_regions;
int target_nid;
+ struct damos_sysfs_dests *dests;
+};
+
+struct damos_sysfs_action_name {
+ enum damos_action action;
+ char *name;
};
-/* This should match with enum damos_action */
-static const char * const damon_sysfs_damos_action_strs[] = {
- "willneed",
- "cold",
- "pageout",
- "hugepage",
- "nohugepage",
- "lru_prio",
- "lru_deprio",
- "migrate_hot",
- "migrate_cold",
- "stat",
+static struct damos_sysfs_action_name damos_sysfs_action_names[] = {
+ {
+ .action = DAMOS_WILLNEED,
+ .name = "willneed",
+ },
+ {
+ .action = DAMOS_COLD,
+ .name = "cold",
+ },
+ {
+ .action = DAMOS_PAGEOUT,
+ .name = "pageout",
+ },
+ {
+ .action = DAMOS_HUGEPAGE,
+ .name = "hugepage",
+ },
+ {
+ .action = DAMOS_NOHUGEPAGE,
+ .name = "nohugepage",
+ },
+ {
+ .action = DAMOS_LRU_PRIO,
+ .name = "lru_prio",
+ },
+ {
+ .action = DAMOS_LRU_DEPRIO,
+ .name = "lru_deprio",
+ },
+ {
+ .action = DAMOS_MIGRATE_HOT,
+ .name = "migrate_hot",
+ },
+ {
+ .action = DAMOS_MIGRATE_COLD,
+ .name = "migrate_cold",
+ },
+ {
+ .action = DAMOS_STAT,
+ .name = "stat",
+ },
};
static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc(
@@ -1610,6 +1961,22 @@ out:
return err;
}
+static int damos_sysfs_set_dests(struct damon_sysfs_scheme *scheme)
+{
+ struct damos_sysfs_dests *dests = damos_sysfs_dests_alloc();
+ int err;
+
+ if (!dests)
+ return -ENOMEM;
+ err = kobject_init_and_add(&dests->kobj, &damos_sysfs_dests_ktype,
+ &scheme->kobj, "dests");
+ if (err)
+ kobject_put(&dests->kobj);
+ else
+ scheme->dests = dests;
+ return err;
+}
+
static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme)
{
struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc();
@@ -1742,9 +2109,12 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme)
err = damon_sysfs_scheme_set_access_pattern(scheme);
if (err)
return err;
- err = damon_sysfs_scheme_set_quotas(scheme);
+ err = damos_sysfs_set_dests(scheme);
if (err)
goto put_access_pattern_out;
+ err = damon_sysfs_scheme_set_quotas(scheme);
+ if (err)
+ goto put_dests_out;
err = damon_sysfs_scheme_set_watermarks(scheme);
if (err)
goto put_quotas_access_pattern_out;
@@ -1775,6 +2145,9 @@ put_watermarks_quotas_access_pattern_out:
put_quotas_access_pattern_out:
kobject_put(&scheme->quotas->kobj);
scheme->quotas = NULL;
+put_dests_out:
+ kobject_put(&scheme->dests->kobj);
+ scheme->dests = NULL;
put_access_pattern_out:
kobject_put(&scheme->access_pattern->kobj);
scheme->access_pattern = NULL;
@@ -1785,6 +2158,8 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme)
{
damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern);
kobject_put(&scheme->access_pattern->kobj);
+ kobject_put(&scheme->dests->kobj);
+ damos_sysfs_dests_rm_dirs(scheme->dests);
damon_sysfs_quotas_rm_dirs(scheme->quotas);
kobject_put(&scheme->quotas->kobj);
kobject_put(&scheme->watermarks->kobj);
@@ -1804,9 +2179,16 @@ static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_scheme *scheme = container_of(kobj,
struct damon_sysfs_scheme, kobj);
+ int i;
- return sysfs_emit(buf, "%s\n",
- damon_sysfs_damos_action_strs[scheme->action]);
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_action_names); i++) {
+ struct damos_sysfs_action_name *action_name;
+
+ action_name = &damos_sysfs_action_names[i];
+ if (action_name->action == scheme->action)
+ return sysfs_emit(buf, "%s\n", action_name->name);
+ }
+ return -EINVAL;
}
static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1814,11 +2196,14 @@ static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_scheme *scheme = container_of(kobj,
struct damon_sysfs_scheme, kobj);
- enum damos_action action;
+ int i;
- for (action = 0; action < NR_DAMOS_ACTIONS; action++) {
- if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) {
- scheme->action = action;
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_action_names); i++) {
+ struct damos_sysfs_action_name *action_name;
+
+ action_name = &damos_sysfs_action_names[i];
+ if (sysfs_streq(buf, action_name->name)) {
+ scheme->action = action_name->action;
return count;
}
}
@@ -2035,7 +2420,7 @@ static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id)
if (!memcg_path)
return -EINVAL;
- path = kmalloc(sizeof(*path) * PATH_MAX, GFP_KERNEL);
+ path = kmalloc_array(PATH_MAX, sizeof(*path), GFP_KERNEL);
if (!path)
return -ENOMEM;
@@ -2120,8 +2505,17 @@ static int damos_sysfs_add_quota_score(
sysfs_goal->target_value);
if (!goal)
return -ENOMEM;
- if (sysfs_goal->metric == DAMOS_QUOTA_USER_INPUT)
+ switch (sysfs_goal->metric) {
+ case DAMOS_QUOTA_USER_INPUT:
goal->current_value = sysfs_goal->current_value;
+ break;
+ case DAMOS_QUOTA_NODE_MEM_USED_BP:
+ case DAMOS_QUOTA_NODE_MEM_FREE_BP:
+ goal->nid = sysfs_goal->nid;
+ break;
+ default:
+ break;
+ }
damos_add_quota_goal(quota, goal);
}
return 0;
@@ -2182,6 +2576,29 @@ void damos_sysfs_update_effective_quotas(
}
}
+static int damos_sysfs_add_migrate_dest(struct damos *scheme,
+ struct damos_sysfs_dests *sysfs_dests)
+{
+ struct damos_migrate_dests *dests = &scheme->migrate_dests;
+ int i;
+
+ dests->node_id_arr = kmalloc_array(sysfs_dests->nr,
+ sizeof(*dests->node_id_arr), GFP_KERNEL);
+ if (!dests->node_id_arr)
+ return -ENOMEM;
+ dests->weight_arr = kmalloc_array(sysfs_dests->nr,
+ sizeof(*dests->weight_arr), GFP_KERNEL);
+ if (!dests->weight_arr)
+ /* ->node_id_arr will be freed by scheme destruction */
+ return -ENOMEM;
+ for (i = 0; i < sysfs_dests->nr; i++) {
+ dests->node_id_arr[i] = sysfs_dests->dests_arr[i]->id;
+ dests->weight_arr[i] = sysfs_dests->dests_arr[i]->weight;
+ }
+ dests->nr_dests = sysfs_dests->nr;
+ return 0;
+}
+
static struct damos *damon_sysfs_mk_scheme(
struct damon_sysfs_scheme *sysfs_scheme)
{
@@ -2244,6 +2661,11 @@ static struct damos *damon_sysfs_mk_scheme(
damon_destroy_scheme(scheme);
return NULL;
}
+ err = damos_sysfs_add_migrate_dest(scheme, sysfs_scheme->dests);
+ if (err) {
+ damon_destroy_scheme(scheme);
+ return NULL;
+ }
return scheme;
}
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 1af6aff35d84..6d2b0dab50cb 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -811,11 +811,24 @@ static const struct kobj_type damon_sysfs_attrs_ktype = {
* context directory
*/
-/* This should match with enum damon_ops_id */
-static const char * const damon_sysfs_ops_strs[] = {
- "vaddr",
- "fvaddr",
- "paddr",
+struct damon_sysfs_ops_name {
+ enum damon_ops_id ops_id;
+ char *name;
+};
+
+static const struct damon_sysfs_ops_name damon_sysfs_ops_names[] = {
+ {
+ .ops_id = DAMON_OPS_VADDR,
+ .name = "vaddr",
+ },
+ {
+ .ops_id = DAMON_OPS_FVADDR,
+ .name = "fvaddr",
+ },
+ {
+ .ops_id = DAMON_OPS_PADDR,
+ .name = "paddr",
+ },
};
struct damon_sysfs_context {
@@ -934,14 +947,16 @@ static void damon_sysfs_context_rm_dirs(struct damon_sysfs_context *context)
static ssize_t avail_operations_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- enum damon_ops_id id;
int len = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(damon_sysfs_ops_names); i++) {
+ const struct damon_sysfs_ops_name *ops_name;
- for (id = 0; id < NR_DAMON_OPS; id++) {
- if (!damon_is_registered_ops(id))
+ ops_name = &damon_sysfs_ops_names[i];
+ if (!damon_is_registered_ops(ops_name->ops_id))
continue;
- len += sysfs_emit_at(buf, len, "%s\n",
- damon_sysfs_ops_strs[id]);
+ len += sysfs_emit_at(buf, len, "%s\n", ops_name->name);
}
return len;
}
@@ -951,8 +966,16 @@ static ssize_t operations_show(struct kobject *kobj,
{
struct damon_sysfs_context *context = container_of(kobj,
struct damon_sysfs_context, kobj);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(damon_sysfs_ops_names); i++) {
+ const struct damon_sysfs_ops_name *ops_name;
- return sysfs_emit(buf, "%s\n", damon_sysfs_ops_strs[context->ops_id]);
+ ops_name = &damon_sysfs_ops_names[i];
+ if (ops_name->ops_id == context->ops_id)
+ return sysfs_emit(buf, "%s\n", ops_name->name);
+ }
+ return -EINVAL;
}
static ssize_t operations_store(struct kobject *kobj,
@@ -960,11 +983,14 @@ static ssize_t operations_store(struct kobject *kobj,
{
struct damon_sysfs_context *context = container_of(kobj,
struct damon_sysfs_context, kobj);
- enum damon_ops_id id;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(damon_sysfs_ops_names); i++) {
+ const struct damon_sysfs_ops_name *ops_name;
- for (id = 0; id < NR_DAMON_OPS; id++) {
- if (sysfs_streq(buf, damon_sysfs_ops_strs[id])) {
- context->ops_id = id;
+ ops_name = &damon_sysfs_ops_names[i];
+ if (sysfs_streq(buf, ops_name->name)) {
+ context->ops_id = ops_name->ops_id;
return count;
}
}
@@ -1129,6 +1155,7 @@ struct damon_sysfs_kdamond {
struct kobject kobj;
struct damon_sysfs_contexts *contexts;
struct damon_ctx *damon_ctx;
+ unsigned int refresh_ms;
};
static struct damon_sysfs_kdamond *damon_sysfs_kdamond_alloc(void)
@@ -1163,16 +1190,6 @@ static void damon_sysfs_kdamond_rm_dirs(struct damon_sysfs_kdamond *kdamond)
kobject_put(&kdamond->contexts->kobj);
}
-static bool damon_sysfs_ctx_running(struct damon_ctx *ctx)
-{
- bool running;
-
- mutex_lock(&ctx->kdamond_lock);
- running = ctx->kdamond != NULL;
- mutex_unlock(&ctx->kdamond_lock);
- return running;
-}
-
/*
* enum damon_sysfs_cmd - Commands for a specific kdamond.
*/
@@ -1249,7 +1266,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
if (!ctx)
running = false;
else
- running = damon_sysfs_ctx_running(ctx);
+ running = damon_is_running(ctx);
return sysfs_emit(buf, "%s\n", running ?
damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] :
@@ -1279,18 +1296,6 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx,
return damon_set_attrs(ctx, &attrs);
}
-static void damon_sysfs_destroy_targets(struct damon_ctx *ctx)
-{
- struct damon_target *t, *next;
- bool has_pid = damon_target_has_pid(ctx);
-
- damon_for_each_target_safe(t, next, ctx) {
- if (has_pid)
- put_pid(t->pid);
- damon_destroy_target(t);
- }
-}
-
static int damon_sysfs_set_regions(struct damon_target *t,
struct damon_sysfs_regions *sysfs_regions)
{
@@ -1325,7 +1330,6 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
struct damon_ctx *ctx)
{
struct damon_target *t = damon_new_target();
- int err = -EINVAL;
if (!t)
return -ENOMEM;
@@ -1333,16 +1337,10 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
if (damon_target_has_pid(ctx)) {
t->pid = find_get_pid(sys_target->pid);
if (!t->pid)
- goto destroy_targets_out;
+ /* caller will destroy targets */
+ return -EINVAL;
}
- err = damon_sysfs_set_regions(t, sys_target->regions);
- if (err)
- goto destroy_targets_out;
- return 0;
-
-destroy_targets_out:
- damon_sysfs_destroy_targets(ctx);
- return err;
+ return damon_sysfs_set_regions(t, sys_target->regions);
}
static int damon_sysfs_add_targets(struct damon_ctx *ctx,
@@ -1364,21 +1362,6 @@ static int damon_sysfs_add_targets(struct damon_ctx *ctx,
return 0;
}
-static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
-{
- struct damon_target *t, *next;
-
- if (!damon_target_has_pid(ctx))
- return;
-
- mutex_lock(&ctx->kdamond_lock);
- damon_for_each_target_safe(t, next, ctx) {
- put_pid(t->pid);
- damon_destroy_target(t);
- }
- mutex_unlock(&ctx->kdamond_lock);
-}
-
/*
* damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files.
* @data: The kobject wrapper that associated to the kdamond thread.
@@ -1403,7 +1386,7 @@ static inline bool damon_sysfs_kdamond_running(
struct damon_sysfs_kdamond *kdamond)
{
return kdamond->damon_ctx &&
- damon_sysfs_ctx_running(kdamond->damon_ctx);
+ damon_is_running(kdamond->damon_ctx);
}
static int damon_sysfs_apply_inputs(struct damon_ctx *ctx,
@@ -1450,13 +1433,11 @@ static int damon_sysfs_commit_input(void *data)
test_ctx = damon_new_ctx();
err = damon_commit_ctx(test_ctx, param_ctx);
if (err) {
- damon_sysfs_destroy_targets(test_ctx);
damon_destroy_ctx(test_ctx);
goto out;
}
err = damon_commit_ctx(kdamond->damon_ctx, param_ctx);
out:
- damon_sysfs_destroy_targets(param_ctx);
damon_destroy_ctx(param_ctx);
return err;
}
@@ -1525,10 +1506,35 @@ static struct damon_ctx *damon_sysfs_build_ctx(
return ERR_PTR(err);
}
- ctx->callback.before_terminate = damon_sysfs_before_terminate;
return ctx;
}
+static int damon_sysfs_repeat_call_fn(void *data)
+{
+ struct damon_sysfs_kdamond *sysfs_kdamond = data;
+ static unsigned long next_update_jiffies;
+
+ if (!sysfs_kdamond->refresh_ms)
+ return 0;
+ if (time_before(jiffies, next_update_jiffies))
+ return 0;
+ next_update_jiffies = jiffies +
+ msecs_to_jiffies(sysfs_kdamond->refresh_ms);
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return 0;
+ damon_sysfs_upd_tuned_intervals(sysfs_kdamond);
+ damon_sysfs_upd_schemes_stats(sysfs_kdamond);
+ damon_sysfs_upd_schemes_effective_quotas(sysfs_kdamond);
+ mutex_unlock(&damon_sysfs_lock);
+ return 0;
+}
+
+static struct damon_call_control damon_sysfs_repeat_call_control = {
+ .fn = damon_sysfs_repeat_call_fn,
+ .repeat = true,
+};
+
static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
{
struct damon_ctx *ctx;
@@ -1553,6 +1559,9 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
return err;
}
kdamond->damon_ctx = ctx;
+
+ damon_sysfs_repeat_call_control.data = kdamond;
+ damon_call(ctx, &damon_sysfs_repeat_call_control);
return err;
}
@@ -1711,6 +1720,30 @@ out:
return sysfs_emit(buf, "%d\n", pid);
}
+static ssize_t refresh_ms_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_kdamond *kdamond = container_of(kobj,
+ struct damon_sysfs_kdamond, kobj);
+
+ return sysfs_emit(buf, "%u\n", kdamond->refresh_ms);
+}
+
+static ssize_t refresh_ms_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_kdamond *kdamond = container_of(kobj,
+ struct damon_sysfs_kdamond, kobj);
+ unsigned int nr;
+ int err = kstrtouint(buf, 0, &nr);
+
+ if (err)
+ return err;
+
+ kdamond->refresh_ms = nr;
+ return count;
+}
+
static void damon_sysfs_kdamond_release(struct kobject *kobj)
{
struct damon_sysfs_kdamond *kdamond = container_of(kobj,
@@ -1727,9 +1760,13 @@ static struct kobj_attribute damon_sysfs_kdamond_state_attr =
static struct kobj_attribute damon_sysfs_kdamond_pid_attr =
__ATTR_RO_MODE(pid, 0400);
+static struct kobj_attribute damon_sysfs_kdamond_refresh_ms_attr =
+ __ATTR_RW_MODE(refresh_ms, 0600);
+
static struct attribute *damon_sysfs_kdamond_attrs[] = {
&damon_sysfs_kdamond_state_attr.attr,
&damon_sysfs_kdamond_pid_attr.attr,
+ &damon_sysfs_kdamond_refresh_ms_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_kdamond);
diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index be0fea9ee5fc..dfedfff19940 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -58,7 +58,7 @@ static void damon_test_target(struct kunit *test)
damon_add_target(c, t);
KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(c));
- damon_destroy_target(t);
+ damon_destroy_target(t, c);
KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c));
damon_destroy_ctx(c);
@@ -310,7 +310,7 @@ static void damon_test_set_regions(struct kunit *test)
KUNIT_EXPECT_EQ(test, r->ar.start, expects[expect_idx++]);
KUNIT_EXPECT_EQ(test, r->ar.end, expects[expect_idx++]);
}
- damon_destroy_target(t);
+ damon_destroy_target(t, NULL);
}
static void damon_test_nr_accesses_to_accesses_bp(struct kunit *test)
@@ -510,6 +510,75 @@ static void damon_test_feed_loop_next_input(struct kunit *test)
damon_feed_loop_next_input(last_input, 2000));
}
+static void damon_test_set_filters_default_reject(struct kunit *test)
+{
+ struct damos scheme;
+ struct damos_filter *target_filter, *anon_filter;
+
+ INIT_LIST_HEAD(&scheme.filters);
+ INIT_LIST_HEAD(&scheme.ops_filters);
+
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * No filter is installed. Allow by default on both core and ops layer
+ * filtering stages, since there are no filters at all.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false);
+
+ target_filter = damos_new_filter(DAMOS_FILTER_TYPE_TARGET, true, true);
+ damos_add_filter(&scheme, target_filter);
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled allow-filter is installed.
+ * Rejct by default on core layer filtering stage due to the last
+	 * Reject by default on core layer filtering stage due to the last
+ * Allow by default on ops layer filtering stage due to the absence of
+ * ops layer filters.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, true);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false);
+
+ target_filter->allow = false;
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled reject-filter is installed.
+ * Allow by default on core layer filtering stage due to the last
+ * core-layer-filter's behavior.
+ * Allow by default on ops layer filtering stage due to the absence of
+ * ops layer filters.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false);
+
+ anon_filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, true);
+ damos_add_filter(&scheme, anon_filter);
+
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled reject-filter and ops-handled allow-filter are installed.
+ * Allow by default on core layer filtering stage due to the existence
+ * of the ops-handled filter.
+ * Reject by default on ops layer filtering stage due to the last
+ * ops-layer-filter's behavior.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true);
+
+ target_filter->allow = true;
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled allow-filter and ops-handled allow-filter are
+ * installed.
+ * Allow by default on core layer filtering stage due to the existence
+ * of the ops-handled filter.
+ * Reject by default on ops layer filtering stage due to the last
+ * ops-layer-filter's behavior.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true);
+}
+
static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_target),
KUNIT_CASE(damon_test_regions),
@@ -527,6 +596,7 @@ static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damos_test_new_filter),
KUNIT_CASE(damos_test_filter_out),
KUNIT_CASE(damon_test_feed_loop_next_input),
+ KUNIT_CASE(damon_test_set_filters_default_reject),
{},
};
diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h
index 7cd944266a92..d2b37ccf2cc0 100644
--- a/mm/damon/tests/vaddr-kunit.h
+++ b/mm/damon/tests/vaddr-kunit.h
@@ -149,7 +149,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
KUNIT_EXPECT_EQ(test, r->ar.end, expected[i * 2 + 1]);
}
- damon_destroy_target(t);
+ damon_destroy_target(t, NULL);
}
/*
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index e6d99106a7f9..94af19c4dfed 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * DAMON Primitives for Virtual Address Spaces
+ * DAMON Code for Virtual Address Spaces
*
* Author: SeongJae Park <sj@kernel.org>
*/
@@ -15,6 +15,7 @@
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
+#include "../internal.h"
#include "ops-common.h"
#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST
@@ -610,6 +611,183 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
return max_nr_accesses;
}
+static bool damos_va_filter_young_match(struct damos_filter *filter,
+ struct folio *folio, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, pmd_t *pmdp)
+{
+ bool young = false;
+
+ if (ptep)
+ young = pte_young(ptep_get(ptep));
+ else if (pmdp)
+ young = pmd_young(pmdp_get(pmdp));
+
+ young = young || !folio_test_idle(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, addr);
+
+ if (young && ptep)
+ damon_ptep_mkold(ptep, vma, addr);
+ else if (young && pmdp)
+ damon_pmdp_mkold(pmdp, vma, addr);
+
+ return young == filter->matching;
+}
+
+static bool damos_va_filter_out(struct damos *scheme, struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pmd_t *pmdp)
+{
+ struct damos_filter *filter;
+ bool matched;
+
+ if (scheme->core_filters_allowed)
+ return false;
+
+ damos_for_each_ops_filter(filter, scheme) {
+ /*
+		 * damos_folio_filter_match() checks the young filter by doing
+		 * an rmap walk on the folio to find its page tables. However,
+		 * since this is the vaddr scheme, we have direct access to the
+		 * page tables, so use them instead.
+ */
+ if (filter->type == DAMOS_FILTER_TYPE_YOUNG)
+ matched = damos_va_filter_young_match(filter, folio,
+ vma, addr, ptep, pmdp);
+ else
+ matched = damos_folio_filter_match(filter, folio);
+
+ if (matched)
+ return !filter->allow;
+ }
+ return scheme->ops_filters_default_reject;
+}
+
+struct damos_va_migrate_private {
+ struct list_head *migration_lists;
+ struct damos *scheme;
+};
+
+/*
+ * Place the given folio in the migration_list corresponding to where the folio
+ * should be migrated.
+ *
+ * The algorithm used here is similar to weighted_interleave_nid()
+ */
+static void damos_va_migrate_dests_add(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr,
+ struct damos_migrate_dests *dests,
+ struct list_head *migration_lists)
+{
+ pgoff_t ilx;
+ int order;
+ unsigned int target;
+ unsigned int weight_total = 0;
+ int i;
+
+ /*
+ * If dests is empty, there is only one migration list corresponding
+ * to s->target_nid.
+ */
+ if (!dests->nr_dests) {
+ i = 0;
+ goto isolate;
+ }
+
+ order = folio_order(folio);
+ ilx = vma->vm_pgoff >> order;
+ ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
+
+ for (i = 0; i < dests->nr_dests; i++)
+ weight_total += dests->weight_arr[i];
+
+	/* If the total weight is somehow 0, don't migrate at all */
+ if (!weight_total)
+ return;
+
+ target = ilx % weight_total;
+ for (i = 0; i < dests->nr_dests; i++) {
+ if (target < dests->weight_arr[i])
+ break;
+ target -= dests->weight_arr[i];
+ }
+
+isolate:
+ if (!folio_isolate_lru(folio))
+ return;
+
+ list_add(&folio->lru, &migration_lists[i]);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int damos_va_migrate_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct damos_va_migrate_private *priv = walk->private;
+ struct list_head *migration_lists = priv->migration_lists;
+ struct damos *s = priv->scheme;
+ struct damos_migrate_dests *dests = &s->migrate_dests;
+ struct folio *folio;
+ spinlock_t *ptl;
+ pmd_t pmde;
+
+ ptl = pmd_lock(walk->mm, pmd);
+ pmde = pmdp_get(pmd);
+
+ if (!pmd_present(pmde) || !pmd_trans_huge(pmde))
+ goto unlock;
+
+ /* Tell page walk code to not split the PMD */
+ walk->action = ACTION_CONTINUE;
+
+ folio = damon_get_folio(pmd_pfn(pmde));
+ if (!folio)
+ goto unlock;
+
+ if (damos_va_filter_out(s, folio, walk->vma, addr, NULL, pmd))
+ goto put_folio;
+
+ damos_va_migrate_dests_add(folio, walk->vma, addr, dests,
+ migration_lists);
+
+put_folio:
+ folio_put(folio);
+unlock:
+ spin_unlock(ptl);
+ return 0;
+}
+#else
+#define damos_va_migrate_pmd_entry NULL
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static int damos_va_migrate_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct damos_va_migrate_private *priv = walk->private;
+ struct list_head *migration_lists = priv->migration_lists;
+ struct damos *s = priv->scheme;
+ struct damos_migrate_dests *dests = &s->migrate_dests;
+ struct folio *folio;
+ pte_t ptent;
+
+ ptent = ptep_get(pte);
+ if (pte_none(ptent) || !pte_present(ptent))
+ return 0;
+
+ folio = damon_get_folio(pte_pfn(ptent));
+ if (!folio)
+ return 0;
+
+ if (damos_va_filter_out(s, folio, walk->vma, addr, pte, NULL))
+ goto put_folio;
+
+ damos_va_migrate_dests_add(folio, walk->vma, addr, dests,
+ migration_lists);
+
+put_folio:
+ folio_put(folio);
+ return 0;
+}
+
/*
* Functions for the target validity check and cleanup
*/
@@ -627,6 +805,11 @@ static bool damon_va_target_valid(struct damon_target *t)
return false;
}
+static void damon_va_cleanup_target(struct damon_target *t)
+{
+ put_pid(t->pid);
+}
+
#ifndef CONFIG_ADVISE_SYSCALLS
static unsigned long damos_madvise(struct damon_target *target,
struct damon_region *r, int behavior)
@@ -653,6 +836,56 @@ static unsigned long damos_madvise(struct damon_target *target,
}
#endif /* CONFIG_ADVISE_SYSCALLS */
+static unsigned long damos_va_migrate(struct damon_target *target,
+ struct damon_region *r, struct damos *s,
+ unsigned long *sz_filter_passed)
+{
+ LIST_HEAD(folio_list);
+ struct damos_va_migrate_private priv;
+ struct mm_struct *mm;
+ int nr_dests;
+ int nid;
+ bool use_target_nid;
+ unsigned long applied = 0;
+ struct damos_migrate_dests *dests = &s->migrate_dests;
+ struct mm_walk_ops walk_ops = {
+ .pmd_entry = damos_va_migrate_pmd_entry,
+ .pte_entry = damos_va_migrate_pte_entry,
+ .walk_lock = PGWALK_RDLOCK,
+ };
+
+ use_target_nid = dests->nr_dests == 0;
+ nr_dests = use_target_nid ? 1 : dests->nr_dests;
+ priv.scheme = s;
+ priv.migration_lists = kmalloc_array(nr_dests,
+ sizeof(*priv.migration_lists), GFP_KERNEL);
+ if (!priv.migration_lists)
+ return 0;
+
+ for (int i = 0; i < nr_dests; i++)
+ INIT_LIST_HEAD(&priv.migration_lists[i]);
+
+ mm = damon_get_mm(target);
+ if (!mm)
+ goto free_lists;
+
+ mmap_read_lock(mm);
+ walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv);
+ mmap_read_unlock(mm);
+ mmput(mm);
+
+ for (int i = 0; i < nr_dests; i++) {
+ nid = use_target_nid ? s->target_nid : dests->node_id_arr[i];
+ applied += damon_migrate_pages(&priv.migration_lists[i], nid);
+ cond_resched();
+ }
+
+free_lists:
+ kfree(priv.migration_lists);
+ return applied * PAGE_SIZE;
+}
+
static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
struct damon_target *t, struct damon_region *r,
struct damos *scheme, unsigned long *sz_filter_passed)
@@ -675,6 +908,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
case DAMOS_NOHUGEPAGE:
madv_action = MADV_NOHUGEPAGE;
break;
+ case DAMOS_MIGRATE_HOT:
+ case DAMOS_MIGRATE_COLD:
+ return damos_va_migrate(t, r, scheme, sz_filter_passed);
case DAMOS_STAT:
return 0;
default:
@@ -695,6 +931,10 @@ static int damon_va_scheme_score(struct damon_ctx *context,
switch (scheme->action) {
case DAMOS_PAGEOUT:
return damon_cold_score(context, r, scheme);
+ case DAMOS_MIGRATE_HOT:
+ return damon_hot_score(context, r, scheme);
+ case DAMOS_MIGRATE_COLD:
+ return damon_cold_score(context, r, scheme);
default:
break;
}
@@ -711,6 +951,7 @@ static int __init damon_va_initcall(void)
.prepare_access_checks = damon_va_prepare_access_checks,
.check_accesses = damon_va_check_accesses,
.target_valid = damon_va_target_valid,
+ .cleanup_target = damon_va_cleanup_target,
.cleanup = NULL,
.apply_scheme = damon_va_apply_scheme,
.get_scheme_score = damon_va_scheme_score,
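The destination selection in damos_va_migrate_dests_add() above picks a migration list by reducing the folio's interleave index into weighted buckets, in the spirit of weighted_interleave_nid(). A minimal standalone sketch of that arithmetic, with illustrative weights and no kernel dependencies:

#include <stdio.h>

/* Pick a destination index for a folio at interleave index ilx, given
 * per-destination weights; mirrors the selection loop in
 * damos_va_migrate_dests_add(). Returns -1 when all weights are 0. */
static int pick_dest(unsigned long ilx, const unsigned int *weights, int nr)
{
	unsigned int total = 0, target;
	int i;

	for (i = 0; i < nr; i++)
		total += weights[i];
	if (!total)
		return -1;

	target = ilx % total;
	for (i = 0; i < nr; i++) {
		if (target < weights[i])
			return i;
		target -= weights[i];
	}
	return nr - 1;	/* not reached */
}

int main(void)
{
	unsigned int weights[] = { 3, 1 };	/* 75% to dest 0, 25% to dest 1 */
	unsigned long ilx;

	for (ilx = 0; ilx < 8; ilx++)
		printf("ilx %lu -> dest %d\n", ilx, pick_dest(ilx, weights, 2));
	return 0;
}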
diff --git a/mm/debug.c b/mm/debug.c
index db83e381a8ae..b4388f4dcd4d 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -71,10 +71,12 @@ static void __dump_folio(struct folio *folio, struct page *page,
unsigned long pfn, unsigned long idx)
{
struct address_space *mapping = folio_mapping(folio);
- int mapcount = atomic_read(&page->_mapcount);
+ int mapcount = atomic_read(&page->_mapcount) + 1;
char *type = "";
- mapcount = page_mapcount_is_type(mapcount) ? 0 : mapcount + 1;
+ if (page_mapcount_is_type(mapcount))
+ mapcount = 0;
+
pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
folio_ref_count(folio), mapcount, mapping,
folio->index + idx, pfn);
@@ -127,47 +129,13 @@ static void __dump_folio(struct folio *folio, struct page *page,
static void __dump_page(const struct page *page)
{
- struct folio *foliop, folio;
- struct page precise;
- unsigned long head;
- unsigned long pfn = page_to_pfn(page);
- unsigned long idx, nr_pages = 1;
- int loops = 5;
-
-again:
- memcpy(&precise, page, sizeof(*page));
- head = precise.compound_head;
- if ((head & 1) == 0) {
- foliop = (struct folio *)&precise;
- idx = 0;
- if (!folio_test_large(foliop))
- goto dump;
- foliop = (struct folio *)page;
- } else {
- foliop = (struct folio *)(head - 1);
- idx = folio_page_idx(foliop, page);
- }
+ struct page_snapshot ps;
- if (idx < MAX_FOLIO_NR_PAGES) {
- memcpy(&folio, foliop, 2 * sizeof(struct page));
- nr_pages = folio_nr_pages(&folio);
- if (nr_pages > 1)
- memcpy(&folio.__page_2, &foliop->__page_2,
- sizeof(struct page));
- foliop = &folio;
- }
-
- if (idx > nr_pages) {
- if (loops-- > 0)
- goto again;
+ snapshot_page(&ps, page);
+ if (!snapshot_page_is_faithful(&ps))
pr_warn("page does not match folio\n");
- precise.compound_head &= ~1UL;
- foliop = (struct folio *)&precise;
- idx = 0;
- }
-dump:
- __dump_folio(foliop, &precise, pfn, idx);
+ __dump_folio(&ps.folio_snapshot, &ps.page_snapshot, ps.pfn, ps.idx);
}
void dump_page(const struct page *page, const char *reason)
@@ -288,7 +256,7 @@ void dump_vmg(const struct vma_merge_struct *vmg, const char *reason)
vmg->vmi, vmg->vmi ? vma_iter_addr(vmg->vmi) : 0,
vmg->vmi ? vma_iter_end(vmg->vmi) : 0,
vmg->prev, vmg->middle, vmg->next, vmg->target,
- vmg->start, vmg->end, vmg->flags,
+ vmg->start, vmg->end, vmg->vm_flags,
vmg->file, vmg->anon_vma, vmg->policy,
#ifdef CONFIG_USERFAULTFD
vmg->uffd_ctx.ctx,
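The mapcount handling in __dump_folio() above relies on _mapcount being stored off by one (-1 means unmapped) and on certain raw values being repurposed as page-type markers. A small standalone sketch of that decoding; the type-marker threshold below is an assumption for illustration, not the kernel's actual encoding:

#include <stdio.h>
#include <stdbool.h>

/* Illustrative stand-in for page_mapcount_is_type(): in the kernel,
 * raw _mapcount values past a reserved range encode a page type
 * rather than a real mapcount. The cutoff here is hypothetical. */
static bool mapcount_is_type(int mapcount)
{
	return mapcount < -128;	/* assumption for illustration only */
}

static int decode_mapcount(int raw_mapcount)
{
	int mapcount = raw_mapcount + 1;	/* -1 means "not mapped" */

	if (mapcount_is_type(mapcount))
		mapcount = 0;	/* a type marker, not a real mapcount */
	return mapcount;
}

int main(void)
{
	printf("%d\n", decode_mapcount(-1));	/* 0: unmapped */
	printf("%d\n", decode_mapcount(2));	/* 3: mapped three times */
	return 0;
}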
diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c
index d46acf989dde..6a26eca546c3 100644
--- a/mm/debug_page_alloc.c
+++ b/mm/debug_page_alloc.c
@@ -23,7 +23,7 @@ static int __init debug_guardpage_minorder_setup(char *buf)
unsigned long res;
if (kstrtoul(buf, 10, &res) < 0 || res > MAX_PAGE_ORDER / 2) {
- pr_err("Bad debug_guardpage_minorder value\n");
+ pr_err("Bad debug_guardpage_minorder value: %s\n", buf);
return 0;
}
_debug_guardpage_minorder = res;
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index bc748f700a9e..d19031f275a3 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -20,7 +20,6 @@
#include <linux/mman.h>
#include <linux/mm_types.h>
#include <linux/module.h>
-#include <linux/pfn_t.h>
#include <linux/printk.h>
#include <linux/pgtable.h>
#include <linux/random.h>
@@ -73,6 +72,8 @@ struct pgtable_debug_args {
unsigned long fixed_pud_pfn;
unsigned long fixed_pmd_pfn;
unsigned long fixed_pte_pfn;
+
+ swp_entry_t swp_entry;
};
static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx)
@@ -348,12 +349,6 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
vaddr &= HPAGE_PUD_MASK;
pud = pfn_pud(args->pud_pfn, args->page_prot);
- /*
- * Some architectures have debug checks to make sure
- * huge pud mapping are only found with devmap entries
- * For now test with only devmap entries.
- */
- pud = pud_mkdevmap(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
flush_dcache_page(page);
pudp_set_wrprotect(args->mm, vaddr, args->pudp);
@@ -366,7 +361,6 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
WARN_ON(!pud_none(pud));
#endif /* __PAGETABLE_PMD_FOLDED */
pud = pfn_pud(args->pud_pfn, args->page_prot);
- pud = pud_mkdevmap(pud);
pud = pud_wrprotect(pud);
pud = pud_mkclean(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
@@ -384,7 +378,6 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
#endif /* __PAGETABLE_PMD_FOLDED */
pud = pfn_pud(args->pud_pfn, args->page_prot);
- pud = pud_mkdevmap(pud);
pud = pud_mkyoung(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
flush_dcache_page(page);
@@ -693,53 +686,6 @@ static void __init pmd_protnone_tests(struct pgtable_debug_args *args)
static void __init pmd_protnone_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
-static void __init pte_devmap_tests(struct pgtable_debug_args *args)
-{
- pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
-
- pr_debug("Validating PTE devmap\n");
- WARN_ON(!pte_devmap(pte_mkdevmap(pte)));
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_devmap_tests(struct pgtable_debug_args *args)
-{
- pmd_t pmd;
-
- if (!has_transparent_hugepage())
- return;
-
- pr_debug("Validating PMD devmap\n");
- pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
- WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd)));
-}
-
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void __init pud_devmap_tests(struct pgtable_debug_args *args)
-{
- pud_t pud;
-
- if (!has_transparent_pud_hugepage())
- return;
-
- pr_debug("Validating PUD devmap\n");
- pud = pfn_pud(args->fixed_pud_pfn, args->page_prot);
- WARN_ON(!pud_devmap(pud_mkdevmap(pud)));
-}
-#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
-#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-#else /* CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { }
-static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-#else
-static void __init pte_devmap_tests(struct pgtable_debug_args *args) { }
-static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { }
-static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
-#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
-
static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args)
{
pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
@@ -754,12 +700,15 @@ static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args)
static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
+ pte_t pte;
if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
return;
pr_debug("Validating PTE swap soft dirty\n");
+ pte = swp_entry_to_pte(args->swp_entry);
+ WARN_ON(!is_swap_pte(pte));
+
WARN_ON(!pte_swp_soft_dirty(pte_swp_mksoft_dirty(pte)));
WARN_ON(pte_swp_soft_dirty(pte_swp_clear_soft_dirty(pte)));
}
@@ -793,7 +742,9 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args)
return;
pr_debug("Validating PMD swap soft dirty\n");
- pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
+ pmd = swp_entry_to_pmd(args->swp_entry);
+ WARN_ON(!is_swap_pmd(pmd));
+
WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
}
@@ -804,17 +755,11 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) {
static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args)
{
- unsigned long max_swap_offset;
swp_entry_t entry, entry2;
pte_t pte;
pr_debug("Validating PTE swap exclusive\n");
-
- /* See generic_max_swapfile_size(): probe the maximum offset */
- max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL))));
-
- /* Create a swp entry with all possible bits set */
- entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset);
+ entry = args->swp_entry;
pte = swp_entry_to_pte(entry);
WARN_ON(pte_swp_exclusive(pte));
@@ -838,30 +783,34 @@ static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args)
static void __init pte_swap_tests(struct pgtable_debug_args *args)
{
- swp_entry_t swp;
- pte_t pte;
+ swp_entry_t arch_entry;
+ pte_t pte1, pte2;
pr_debug("Validating PTE swap\n");
- pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
- swp = __pte_to_swp_entry(pte);
- pte = __swp_entry_to_pte(swp);
- WARN_ON(args->fixed_pte_pfn != pte_pfn(pte));
+ pte1 = swp_entry_to_pte(args->swp_entry);
+ WARN_ON(!is_swap_pte(pte1));
+
+ arch_entry = __pte_to_swp_entry(pte1);
+ pte2 = __swp_entry_to_pte(arch_entry);
+ WARN_ON(memcmp(&pte1, &pte2, sizeof(pte1)));
}
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
static void __init pmd_swap_tests(struct pgtable_debug_args *args)
{
- swp_entry_t swp;
- pmd_t pmd;
+ swp_entry_t arch_entry;
+ pmd_t pmd1, pmd2;
if (!has_transparent_hugepage())
return;
pr_debug("Validating PMD swap\n");
- pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
- swp = __pmd_to_swp_entry(pmd);
- pmd = __swp_entry_to_pmd(swp);
- WARN_ON(args->fixed_pmd_pfn != pmd_pfn(pmd));
+ pmd1 = swp_entry_to_pmd(args->swp_entry);
+ WARN_ON(!is_swap_pmd(pmd1));
+
+ arch_entry = __pmd_to_swp_entry(pmd1);
+ pmd2 = __swp_entry_to_pmd(arch_entry);
+ WARN_ON(memcmp(&pmd1, &pmd2, sizeof(pmd1)));
}
#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
static void __init pmd_swap_tests(struct pgtable_debug_args *args) { }
@@ -910,26 +859,18 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args)
#ifdef CONFIG_HUGETLB_PAGE
static void __init hugetlb_basic_tests(struct pgtable_debug_args *args)
{
- struct page *page;
pte_t pte;
pr_debug("Validating HugeTLB basic\n");
- /*
- * Accessing the page associated with the pfn is safe here,
- * as it was previously derived from a real kernel symbol.
- */
- page = pfn_to_page(args->fixed_pmd_pfn);
- pte = mk_huge_pte(page, args->page_prot);
+ pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot);
+ pte = arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS);
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+ WARN_ON(!pte_huge(pte));
+#endif
WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte)));
WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte))));
WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte))));
-
-#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
- pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot);
-
- WARN_ON(!pte_huge(arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS)));
-#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
}
#else /* !CONFIG_HUGETLB_PAGE */
static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { }
@@ -1174,6 +1115,7 @@ static void __init init_fixed_pfns(struct pgtable_debug_args *args)
static int __init init_args(struct pgtable_debug_args *args)
{
+ unsigned long max_swap_offset;
struct page *page = NULL;
int ret = 0;
@@ -1256,6 +1198,11 @@ static int __init init_args(struct pgtable_debug_args *args)
init_fixed_pfns(args);
+ /* See generic_max_swapfile_size(): probe the maximum offset */
+ max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL))));
+ /* Create a swp entry with all possible bits set */
+ args->swp_entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset);
+
/*
* Allocate (huge) pages because some of the tests need to access
* the data in the pages. The corresponding tests will be skipped
@@ -1341,10 +1288,6 @@ static int __init debug_vm_pgtable(void)
pte_protnone_tests(&args);
pmd_protnone_tests(&args);
- pte_devmap_tests(&args);
- pmd_devmap_tests(&args);
- pud_devmap_tests(&args);
-
pte_soft_dirty_tests(&args);
pmd_soft_dirty_tests(&args);
pte_swap_soft_dirty_tests(&args);
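The init_args() hunk above builds a swap entry with every usable bit set by round-tripping an all-ones offset through the pte conversion helpers, so the tests learn how many offset bits the architecture actually keeps. A toy standalone sketch of that probing trick, with a hypothetical 20-bit offset field:

#include <stdio.h>

/* Illustrative only: pretend the "arch" keeps just 20 offset bits when
 * a swap entry is packed into a PTE. Probing with ~0UL and unpacking
 * reveals the largest offset that survives, the same trick the hunk
 * above performs via swp_entry_to_pte()/pte_to_swp_entry(). */
#define ARCH_OFFSET_BITS 20	/* hypothetical */

static unsigned long pack_offset(unsigned long offset)
{
	return offset & ((1UL << ARCH_OFFSET_BITS) - 1);
}

int main(void)
{
	unsigned long max_offset = pack_offset(~0UL);	/* probe with all ones */

	printf("max representable offset: %#lx\n", max_offset);
	return 0;
}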
diff --git a/mm/dmapool.c b/mm/dmapool.c
index f0bfc6c490f4..5d8af6e29127 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -56,6 +56,7 @@ struct dma_pool { /* the pool */
unsigned int size;
unsigned int allocation;
unsigned int boundary;
+ int node;
char name[32];
struct list_head pools;
};
@@ -199,16 +200,17 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block,
/**
- * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
+ * dma_pool_create_node - Creates a pool of coherent DMA memory blocks.
* @name: name of pool, for diagnostics
* @dev: device that will be doing the DMA
* @size: size of the blocks in this pool.
* @align: alignment requirement for blocks; must be a power of two
* @boundary: returned blocks won't cross this power of two boundary
+ * @node: optional NUMA node to allocate structs 'dma_pool' and 'dma_page' on
* Context: not in_interrupt()
*
* Given one of these pools, dma_pool_alloc()
- * may be used to allocate memory. Such memory will all have "consistent"
+ * may be used to allocate memory. Such memory will all have coherent
* DMA mappings, accessible by the device and its driver without using
* cache flushing primitives. The actual size of blocks allocated may be
* larger than requested because of alignment.
@@ -221,8 +223,8 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block,
* Return: a dma allocation pool with the requested characteristics, or
* %NULL if one can't be created.
*/
-struct dma_pool *dma_pool_create(const char *name, struct device *dev,
- size_t size, size_t align, size_t boundary)
+struct dma_pool *dma_pool_create_node(const char *name, struct device *dev,
+ size_t size, size_t align, size_t boundary, int node)
{
struct dma_pool *retval;
size_t allocation;
@@ -251,7 +253,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
boundary = min(boundary, allocation);
- retval = kzalloc(sizeof(*retval), GFP_KERNEL);
+ retval = kzalloc_node(sizeof(*retval), GFP_KERNEL, node);
if (!retval)
return retval;
@@ -264,6 +266,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
retval->size = size;
retval->boundary = boundary;
retval->allocation = allocation;
+ retval->node = node;
INIT_LIST_HEAD(&retval->pools);
/*
@@ -295,7 +298,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
mutex_unlock(&pools_reg_lock);
return retval;
}
-EXPORT_SYMBOL(dma_pool_create);
+EXPORT_SYMBOL(dma_pool_create_node);
static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
{
@@ -335,7 +338,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
{
struct dma_page *page;
- page = kmalloc(sizeof(*page), mem_flags);
+ page = kmalloc_node(sizeof(*page), mem_flags, pool->node);
if (!page)
return NULL;
@@ -392,7 +395,7 @@ void dma_pool_destroy(struct dma_pool *pool)
EXPORT_SYMBOL(dma_pool_destroy);
/**
- * dma_pool_alloc - get a block of consistent memory
+ * dma_pool_alloc - get a block of coherent memory
* @pool: dma pool that will produce the block
* @mem_flags: GFP_* bitmask
* @handle: pointer to dma address of block
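One way a driver might use the new NUMA-aware constructor; the pool name, block size, and helper function are placeholders, and only the dma_pool_create_node() signature comes from the hunk above:

#include <linux/dmapool.h>
#include <linux/device.h>

/* Hypothetical driver helper: keep the pool's bookkeeping on the node
 * closest to the device so descriptor allocation stays local. */
static struct dma_pool *example_create_desc_pool(struct device *dev)
{
	int node = dev_to_node(dev);	/* NUMA_NO_NODE if unknown */

	return dma_pool_create_node("example-desc", dev,
				    64 /* block size */, 64 /* align */,
				    0 /* no boundary */, node);
}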
diff --git a/mm/execmem.c b/mm/execmem.c
index e6c4f5076ca8..627e6cf64f4f 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -26,7 +26,7 @@ static struct execmem_info default_execmem_info __ro_after_init;
#ifdef CONFIG_MMU
static void *execmem_vmalloc(struct execmem_range *range, size_t size,
- pgprot_t pgprot, unsigned long vm_flags)
+ pgprot_t pgprot, vm_flags_t vm_flags)
{
bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
@@ -82,7 +82,7 @@ struct vm_struct *execmem_vmap(size_t size)
}
#else
static void *execmem_vmalloc(struct execmem_range *range, size_t size,
- pgprot_t pgprot, unsigned long vm_flags)
+ pgprot_t pgprot, vm_flags_t vm_flags)
{
return vmalloc(size);
}
@@ -256,7 +256,7 @@ out_unlock:
static int execmem_cache_populate(struct execmem_range *range, size_t size)
{
- unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
+ vm_flags_t vm_flags = VM_ALLOW_HUGE_VMAP;
struct vm_struct *vm;
size_t alloc_size;
int err = -ENOMEM;
@@ -373,10 +373,12 @@ void *execmem_alloc(enum execmem_type type, size_t size)
{
struct execmem_range *range = &execmem_info->ranges[type];
bool use_cache = range->flags & EXECMEM_ROX_CACHE;
- unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
+ vm_flags_t vm_flags = VM_FLUSH_RESET_PERMS;
pgprot_t pgprot = range->pgprot;
void *p;
+ size = PAGE_ALIGN(size);
+
if (use_cache)
p = execmem_cache_alloc(range, size);
else
diff --git a/mm/filemap.c b/mm/filemap.c
index b5e784f34d98..751838ef05e5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -142,7 +142,7 @@ static void page_cache_delete(struct address_space *mapping,
xas_init_marks(&xas);
folio->mapping = NULL;
- /* Leave page->index set: truncation lookup relies upon it */
+ /* Leave folio->index set: truncation lookup relies upon it */
mapping->nrpages -= nr;
}
@@ -949,7 +949,7 @@ unlock:
return 0;
error:
folio->mapping = NULL;
- /* Leave page->index set: truncation relies upon it */
+ /* Leave folio->index set: truncation relies upon it */
folio_put_refs(folio, nr);
return xas_error(&xas);
}
@@ -1589,13 +1589,30 @@ int folio_wait_private_2_killable(struct folio *folio)
}
EXPORT_SYMBOL(folio_wait_private_2_killable);
+static void filemap_end_dropbehind(struct folio *folio)
+{
+ struct address_space *mapping = folio->mapping;
+
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return;
+ if (!folio_test_clear_dropbehind(folio))
+ return;
+ if (mapping)
+ folio_unmap_invalidate(mapping, folio, 0);
+}
+
/*
* If folio was marked as dropbehind, then pages should be dropped when writeback
* completes. Do that now. If we fail, it's likely because of a big folio -
* just reset dropbehind for that case and later completions should invalidate.
*/
-static void folio_end_dropbehind_write(struct folio *folio)
+static void filemap_end_dropbehind_write(struct folio *folio)
{
+ if (!folio_test_dropbehind(folio))
+ return;
+
/*
* Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
* but can happen if normal writeback just happens to find dirty folios
@@ -1604,8 +1621,7 @@ static void folio_end_dropbehind_write(struct folio *folio)
* invalidation in that case.
*/
if (in_task() && folio_trylock(folio)) {
- if (folio->mapping)
- folio_unmap_invalidate(folio->mapping, folio, 0);
+ filemap_end_dropbehind(folio);
folio_unlock(folio);
}
}
@@ -1620,8 +1636,6 @@ static void folio_end_dropbehind_write(struct folio *folio)
*/
void folio_end_writeback(struct folio *folio)
{
- bool folio_dropbehind = false;
-
VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);
/*
@@ -1643,14 +1657,11 @@ void folio_end_writeback(struct folio *folio)
* reused before the folio_wake_bit().
*/
folio_get(folio);
- if (!folio_test_dirty(folio))
- folio_dropbehind = folio_test_clear_dropbehind(folio);
if (__folio_end_writeback(folio))
folio_wake_bit(folio, PG_writeback);
- acct_reclaim_writeback(folio);
- if (folio_dropbehind)
- folio_end_dropbehind_write(folio);
+ filemap_end_dropbehind_write(folio);
+ acct_reclaim_writeback(folio);
folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);
@@ -1767,8 +1778,9 @@ pgoff_t page_cache_next_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan)
{
XA_STATE(xas, &mapping->i_pages, index);
+ unsigned long nr = max_scan;
- while (max_scan--) {
+ while (nr--) {
void *entry = xas_next(&xas);
if (!entry || xa_is_value(entry))
return xas.xa_index;
@@ -2244,6 +2256,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
*start = folio->index + nr;
goto out;
}
+ xas_advance(&xas, folio_next_index(folio) - 1);
continue;
put_folio:
folio_put(folio);
@@ -2634,16 +2647,14 @@ static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
return (pos1 >> shift == pos2 >> shift);
}
-static void filemap_end_dropbehind_read(struct address_space *mapping,
- struct folio *folio)
+static void filemap_end_dropbehind_read(struct folio *folio)
{
if (!folio_test_dropbehind(folio))
return;
if (folio_test_writeback(folio) || folio_test_dirty(folio))
return;
if (folio_trylock(folio)) {
- if (folio_test_clear_dropbehind(folio))
- folio_unmap_invalidate(mapping, folio, 0);
+ filemap_end_dropbehind(folio);
folio_unlock(folio);
}
}
@@ -2764,7 +2775,7 @@ put_folios:
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
- filemap_end_dropbehind_read(mapping, folio);
+ filemap_end_dropbehind_read(folio);
folio_put(folio);
}
folio_batch_init(&fbatch);
@@ -3205,8 +3216,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
struct address_space *mapping = file->f_mapping;
DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
struct file *fpin = NULL;
- unsigned long vm_flags = vmf->vma->vm_flags;
- unsigned int mmap_miss;
+ vm_flags_t vm_flags = vmf->vma->vm_flags;
+ unsigned short mmap_miss;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Use the readahead code, even if readahead is disabled */
@@ -3221,13 +3232,17 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
if (!(vm_flags & VM_RAND_READ))
ra->size *= 2;
ra->async_size = HPAGE_PMD_NR;
- page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
+ ra->order = HPAGE_PMD_ORDER;
+ page_cache_ra_order(&ractl, ra);
return fpin;
}
#endif
- /* If we don't want any read-ahead, don't bother */
- if (vm_flags & VM_RAND_READ)
+ /*
+ * If we don't want any read-ahead, don't bother. VM_EXEC case below is
+ * already intended for random access.
+ */
+ if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ)
return fpin;
if (!ra->ra_pages)
return fpin;
@@ -3250,15 +3265,43 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
if (mmap_miss > MMAP_LOTSAMISS)
return fpin;
- /*
- * mmap read-around
- */
+ if (vm_flags & VM_EXEC) {
+ /*
+ * Allow arch to request a preferred minimum folio order for
+ * executable memory. This can often be beneficial to
+ * performance if (e.g.) arm64 can contpte-map the folio.
+ * Executable memory rarely benefits from readahead, due to its
+ * random access nature, so set async_size to 0.
+ *
+ * Limit to the boundaries of the VMA to avoid reading in any
+ * pad that might exist between sections, which would be a waste
+ * of memory.
+ */
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long start = vma->vm_pgoff;
+ unsigned long end = start + vma_pages(vma);
+ unsigned long ra_end;
+
+ ra->order = exec_folio_order();
+ ra->start = round_down(vmf->pgoff, 1UL << ra->order);
+ ra->start = max(ra->start, start);
+ ra_end = round_up(ra->start + ra->ra_pages, 1UL << ra->order);
+ ra_end = min(ra_end, end);
+ ra->size = ra_end - ra->start;
+ ra->async_size = 0;
+ } else {
+ /*
+ * mmap read-around
+ */
+ ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
+ ra->size = ra->ra_pages;
+ ra->async_size = ra->ra_pages / 4;
+ ra->order = 0;
+ }
+
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
- ra->size = ra->ra_pages;
- ra->async_size = ra->ra_pages / 4;
ractl._index = ra->start;
- page_cache_ra_order(&ractl, ra, 0);
+ page_cache_ra_order(&ractl, ra);
return fpin;
}
@@ -3274,7 +3317,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
struct file_ra_state *ra = &file->f_ra;
DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
struct file *fpin = NULL;
- unsigned int mmap_miss;
+ unsigned short mmap_miss;
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
@@ -3532,7 +3575,7 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {
struct page *page = folio_file_page(folio, start);
- vm_fault_t ret = do_set_pmd(vmf, page);
+ vm_fault_t ret = do_set_pmd(vmf, folio, page);
if (!ret) {
/* The page is mapped successfully, reference consumed. */
folio_unlock(folio);
@@ -3594,7 +3637,7 @@ skip:
static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
struct folio *folio, unsigned long start,
unsigned long addr, unsigned int nr_pages,
- unsigned long *rss, unsigned int *mmap_miss)
+ unsigned long *rss, unsigned short *mmap_miss)
{
vm_fault_t ret = 0;
struct page *page = folio_page(folio, start);
@@ -3656,7 +3699,7 @@ skip:
static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
struct folio *folio, unsigned long addr,
- unsigned long *rss, unsigned int *mmap_miss)
+ unsigned long *rss, unsigned short *mmap_miss)
{
vm_fault_t ret = 0;
struct page *page = &folio->page;
@@ -3698,7 +3741,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
struct folio *folio;
vm_fault_t ret = 0;
unsigned long rss = 0;
- unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type;
+ unsigned int nr_pages = 0, folio_type;
+ unsigned short mmap_miss = 0, mmap_miss_saved;
rcu_read_lock();
folio = next_uptodate_folio(&xas, mapping, end_pgoff);
@@ -3804,6 +3848,18 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
+int generic_file_mmap_prepare(struct vm_area_desc *desc)
+{
+ struct file *file = desc->file;
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->read_folio)
+ return -ENOEXEC;
+ file_accessed(file);
+ desc->vm_ops = &generic_file_vm_ops;
+ return 0;
+}
+
/*
* This is for filesystems which do not implement ->writepage.
*/
@@ -3813,6 +3869,13 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
return generic_file_mmap(file, vma);
}
+
+int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
+{
+ if (is_shared_maywrite(desc->vm_flags))
+ return -EINVAL;
+ return generic_file_mmap_prepare(desc);
+}
#else
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
@@ -3822,15 +3885,25 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
return -ENOSYS;
}
+int generic_file_mmap_prepare(struct vm_area_desc *desc)
+{
+ return -ENOSYS;
+}
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
return -ENOSYS;
}
+int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
+{
+ return -ENOSYS;
+}
#endif /* CONFIG_MMU */
EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
+EXPORT_SYMBOL(generic_file_mmap_prepare);
EXPORT_SYMBOL(generic_file_readonly_mmap);
+EXPORT_SYMBOL(generic_file_readonly_mmap_prepare);
static struct folio *do_read_cache_folio(struct address_space *mapping,
pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
@@ -4099,7 +4172,7 @@ retry:
break;
}
- status = a_ops->write_begin(file, mapping, pos, bytes,
+ status = a_ops->write_begin(iocb, mapping, pos, bytes,
&folio, &fsdata);
if (unlikely(status < 0))
break;
@@ -4120,7 +4193,7 @@ retry:
copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
flush_dcache_folio(folio);
- status = a_ops->write_end(file, mapping, pos, bytes, copied,
+ status = a_ops->write_end(iocb, mapping, pos, bytes, copied,
folio, fsdata);
if (unlikely(status != copied)) {
iov_iter_revert(i, copied - max(status, 0L));
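The VM_EXEC branch added above aligns the readahead window to a 2^order folio boundary and clamps it to the VMA. A standalone sketch of that window arithmetic; the order, offsets, and window size are made-up numbers:

#include <stdio.h>

#define ROUND_DOWN(x, a)	((x) & ~((a) - 1))
#define ROUND_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))

/* Illustrative window computation for the VM_EXEC branch: align the
 * readahead window to a 2^order folio boundary and clamp it to the
 * VMA. All numbers are page offsets; order 4 (64KiB folios) is an
 * assumption, not what exec_folio_order() returns. */
int main(void)
{
	unsigned long order = 4, align = 1UL << order;
	unsigned long vma_start = 100, vma_end = 300;	/* pgoff range */
	unsigned long fault = 130, ra_pages = 32;
	unsigned long start, end;

	start = ROUND_DOWN(fault, align);
	if (start < vma_start)
		start = vma_start;
	end = ROUND_UP(start + ra_pages, align);
	if (end > vma_end)
		end = vma_end;

	printf("readahead pgoff [%lu, %lu), %lu pages\n", start, end, end - start);
	return 0;
}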
diff --git a/mm/gup.c b/mm/gup.c
index 92351e2fa876..adffe663594d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -26,6 +26,7 @@
#include <asm/tlbflush.h>
#include "internal.h"
+#include "swap.h"
struct follow_page_context {
struct dev_pagemap *pgmap;
@@ -63,11 +64,11 @@ static inline void sanity_check_pinned_pages(struct page **pages,
!folio_test_anon(folio))
continue;
if (!folio_test_large(folio) || folio_test_hugetlb(folio))
- VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
+ VM_WARN_ON_ONCE_FOLIO(!PageAnonExclusive(&folio->page), folio);
else
/* Either a PTE-mapped or a PMD-mapped THP. */
- VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) &&
- !PageAnonExclusive(page), page);
+ VM_WARN_ON_ONCE_PAGE(!PageAnonExclusive(&folio->page) &&
+ !PageAnonExclusive(page), page);
}
}
@@ -678,31 +679,9 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma,
return NULL;
pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
-
- if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
- pud_devmap(pud)) {
- /*
- * device mapped pages can only be returned if the caller
- * will manage the page reference count.
- *
- * At least one of FOLL_GET | FOLL_PIN must be set, so
- * assert that here:
- */
- if (!(flags & (FOLL_GET | FOLL_PIN)))
- return ERR_PTR(-EEXIST);
-
- if (flags & FOLL_TOUCH)
- touch_pud(vma, addr, pudp, flags & FOLL_WRITE);
-
- ctx->pgmap = get_dev_pagemap(pfn, ctx->pgmap);
- if (!ctx->pgmap)
- return ERR_PTR(-EFAULT);
- }
-
page = pfn_to_page(pfn);
- if (!pud_devmap(pud) && !pud_write(pud) &&
- gup_must_unshare(vma, flags, page))
+ if (!pud_write(pud) && gup_must_unshare(vma, flags, page))
return ERR_PTR(-EMLINK);
ret = try_grab_folio(page_folio(page), 1, flags);
@@ -759,8 +738,8 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma,
if (!pmd_write(pmdval) && gup_must_unshare(vma, flags, page))
return ERR_PTR(-EMLINK);
- VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
- !PageAnonExclusive(page), page);
+ VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+ !PageAnonExclusive(page), page);
ret = try_grab_folio(page_folio(page), 1, flags);
if (ret)
@@ -844,11 +823,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
pte_t *ptep, pte;
int ret;
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
- (FOLL_PIN | FOLL_GET)))
- return ERR_PTR(-EINVAL);
-
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!ptep)
return no_page_table(vma, flags, address);
@@ -861,8 +835,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
page = vm_normal_page(vma, address, pte);
/*
- * We only care about anon pages in can_follow_write_pte() and don't
- * have to worry about pte_devmap() because they are never anon.
+ * We only care about anon pages in can_follow_write_pte().
*/
if ((flags & FOLL_WRITE) &&
!can_follow_write_pte(pte, page, vma, flags)) {
@@ -870,18 +843,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
goto out;
}
- if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
- /*
- * Only return device mapping pages in the FOLL_GET or FOLL_PIN
- * case since they are only valid while holding the pgmap
- * reference.
- */
- *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
- if (*pgmap)
- page = pte_page(pte);
- else
- goto no_page;
- } else if (unlikely(!page)) {
+ if (unlikely(!page)) {
if (flags & FOLL_DUMP) {
/* Avoid special (like zero) pages in core dumps */
page = ERR_PTR(-EFAULT);
@@ -903,8 +865,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
goto out;
}
- VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
- !PageAnonExclusive(page), page);
+ VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+ !PageAnonExclusive(page), page);
/* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */
ret = try_grab_folio(folio, 1, flags);
@@ -963,14 +925,6 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
return no_page_table(vma, flags, address);
if (!pmd_present(pmdval))
return no_page_table(vma, flags, address);
- if (pmd_devmap(pmdval)) {
- ptl = pmd_lock(mm, pmd);
- page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
- spin_unlock(ptl);
- if (page)
- return page;
- return no_page_table(vma, flags, address);
- }
if (likely(!pmd_leaf(pmdval)))
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
@@ -1106,10 +1060,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
/* user gate pages are read-only */
if (gup_flags & FOLL_WRITE)
return -EFAULT;
- if (address > TASK_SIZE)
- pgd = pgd_offset_k(address);
- else
- pgd = pgd_offset_gate(mm, address);
+ pgd = pgd_offset(mm, address);
if (pgd_none(*pgd))
return -EFAULT;
p4d = p4d_offset(pgd, address);
@@ -1187,7 +1138,7 @@ static int faultin_page(struct vm_area_struct *vma,
if (unshare) {
fault_flags |= FAULT_FLAG_UNSHARE;
/* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
- VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE);
+ VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_WRITE);
}
ret = handle_mm_fault(vma, address, fault_flags, NULL);
@@ -1432,7 +1383,11 @@ static long __get_user_pages(struct mm_struct *mm,
start = untagged_addr_remote(mm, start);
- VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
+ VM_WARN_ON_ONCE(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ VM_WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET));
do {
struct page *page;
@@ -1763,10 +1718,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
}
/* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
- if (!*locked) {
- BUG_ON(ret < 0);
- BUG_ON(ret >= nr_pages);
- }
+ VM_WARN_ON_ONCE(!*locked && (ret < 0 || ret >= nr_pages));
if (ret > 0) {
nr_pages -= ret;
@@ -1811,7 +1763,6 @@ retry:
ret = mmap_read_lock_killable(mm);
if (ret) {
- BUG_ON(ret > 0);
if (!pages_done)
pages_done = ret;
break;
@@ -1822,11 +1773,11 @@ retry:
pages, locked);
if (!*locked) {
/* Continue to retry until we succeeded */
- BUG_ON(ret != 0);
+ VM_WARN_ON_ONCE(ret != 0);
goto retry;
}
if (ret != 1) {
- BUG_ON(ret > 1);
+ VM_WARN_ON_ONCE(ret > 1);
if (!pages_done)
pages_done = ret;
break;
@@ -1888,10 +1839,10 @@ long populate_vma_page_range(struct vm_area_struct *vma,
int gup_flags;
long ret;
- VM_BUG_ON(!PAGE_ALIGNED(start));
- VM_BUG_ON(!PAGE_ALIGNED(end));
- VM_BUG_ON_VMA(start < vma->vm_start, vma);
- VM_BUG_ON_VMA(end > vma->vm_end, vma);
+ VM_WARN_ON_ONCE(!PAGE_ALIGNED(start));
+ VM_WARN_ON_ONCE(!PAGE_ALIGNED(end));
+ VM_WARN_ON_ONCE_VMA(start < vma->vm_start, vma);
+ VM_WARN_ON_ONCE_VMA(end > vma->vm_end, vma);
mmap_assert_locked(mm);
/*
@@ -1960,8 +1911,8 @@ long faultin_page_range(struct mm_struct *mm, unsigned long start,
int gup_flags;
long ret;
- VM_BUG_ON(!PAGE_ALIGNED(start));
- VM_BUG_ON(!PAGE_ALIGNED(end));
+ VM_WARN_ON_ONCE(!PAGE_ALIGNED(start));
+ VM_WARN_ON_ONCE(!PAGE_ALIGNED(end));
mmap_assert_locked(mm);
/*
@@ -2051,7 +2002,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
{
struct vm_area_struct *vma;
bool must_unlock = false;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
long i;
if (!nr_pages)
@@ -2114,28 +2065,22 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
*/
size_t fault_in_writeable(char __user *uaddr, size_t size)
{
- char __user *start = uaddr, *end;
+ const unsigned long start = (unsigned long)uaddr;
+ const unsigned long end = start + size;
+ unsigned long cur;
if (unlikely(size == 0))
return 0;
if (!user_write_access_begin(uaddr, size))
return size;
- if (!PAGE_ALIGNED(uaddr)) {
- unsafe_put_user(0, uaddr, out);
- uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
- }
- end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
- if (unlikely(end < start))
- end = NULL;
- while (uaddr != end) {
- unsafe_put_user(0, uaddr, out);
- uaddr += PAGE_SIZE;
- }
+ /* Stop once we overflow to 0. */
+ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
+ unsafe_put_user(0, (char __user *)cur, out);
out:
user_write_access_end();
- if (size > uaddr - start)
- return size - (uaddr - start);
+ if (size > cur - start)
+ return size - (cur - start);
return 0;
}
EXPORT_SYMBOL(fault_in_writeable);
@@ -2189,26 +2134,24 @@ EXPORT_SYMBOL(fault_in_subpage_writeable);
*/
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
{
- unsigned long start = (unsigned long)uaddr, end;
+ const unsigned long start = (unsigned long)uaddr;
+ const unsigned long end = start + size;
+ unsigned long cur;
struct mm_struct *mm = current->mm;
bool unlocked = false;
if (unlikely(size == 0))
return 0;
- end = PAGE_ALIGN(start + size);
- if (end < start)
- end = 0;
mmap_read_lock(mm);
- do {
- if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
+ /* Stop once we overflow to 0. */
+ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
+ if (fixup_user_fault(mm, cur, FAULT_FLAG_WRITE, &unlocked))
break;
- start = (start + PAGE_SIZE) & PAGE_MASK;
- } while (start != end);
mmap_read_unlock(mm);
- if (size > (unsigned long)uaddr - start)
- return size - ((unsigned long)uaddr - start);
+ if (size > cur - start)
+ return size - (cur - start);
return 0;
}
EXPORT_SYMBOL(fault_in_safe_writeable);
@@ -2223,30 +2166,24 @@ EXPORT_SYMBOL(fault_in_safe_writeable);
*/
size_t fault_in_readable(const char __user *uaddr, size_t size)
{
- const char __user *start = uaddr, *end;
+ const unsigned long start = (unsigned long)uaddr;
+ const unsigned long end = start + size;
+ unsigned long cur;
volatile char c;
if (unlikely(size == 0))
return 0;
if (!user_read_access_begin(uaddr, size))
return size;
- if (!PAGE_ALIGNED(uaddr)) {
- unsafe_get_user(c, uaddr, out);
- uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
- }
- end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
- if (unlikely(end < start))
- end = NULL;
- while (uaddr != end) {
- unsafe_get_user(c, uaddr, out);
- uaddr += PAGE_SIZE;
- }
+ /* Stop once we overflow to 0. */
+ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
+ unsafe_get_user(c, (const char __user *)cur, out);
out:
user_read_access_end();
(void)c;
- if (size > uaddr - start)
- return size - (uaddr - start);
+ if (size > cur - start)
+ return size - (cur - start);
return 0;
}
EXPORT_SYMBOL(fault_in_readable);
@@ -2317,27 +2254,51 @@ static void pofs_unpin(struct pages_or_folios *pofs)
unpin_user_pages(pofs->pages, pofs->nr_entries);
}
+static struct folio *pofs_next_folio(struct folio *folio,
+ struct pages_or_folios *pofs, long *index_ptr)
+{
+ long i = *index_ptr + 1;
+
+ if (!pofs->has_folios && folio_test_large(folio)) {
+ const unsigned long start_pfn = folio_pfn(folio);
+ const unsigned long end_pfn = start_pfn + folio_nr_pages(folio);
+
+ for (; i < pofs->nr_entries; i++) {
+ unsigned long pfn = page_to_pfn(pofs->pages[i]);
+
+ /* Is this page part of this folio? */
+ if (pfn < start_pfn || pfn >= end_pfn)
+ break;
+ }
+ }
+
+ if (unlikely(i == pofs->nr_entries))
+ return NULL;
+ *index_ptr = i;
+
+ return pofs_get_folio(pofs, i);
+}
+
/*
* Returns the number of collected folios. Return value is always >= 0.
*/
-static void collect_longterm_unpinnable_folios(
+static unsigned long collect_longterm_unpinnable_folios(
struct list_head *movable_folio_list,
struct pages_or_folios *pofs)
{
- struct folio *prev_folio = NULL;
+ unsigned long collected = 0;
bool drain_allow = true;
- unsigned long i;
-
- for (i = 0; i < pofs->nr_entries; i++) {
- struct folio *folio = pofs_get_folio(pofs, i);
+ struct folio *folio;
+ long i = 0;
- if (folio == prev_folio)
- continue;
- prev_folio = folio;
+ for (folio = pofs_get_folio(pofs, i); folio;
+ folio = pofs_next_folio(folio, pofs, &i)) {
if (folio_is_longterm_pinnable(folio))
continue;
+ collected++;
+
if (folio_is_device_coherent(folio))
continue;
@@ -2359,6 +2320,8 @@ static void collect_longterm_unpinnable_folios(
NR_ISOLATED_ANON + folio_is_file_lru(folio),
folio_nr_pages(folio));
}
+
+ return collected;
}
/*
@@ -2435,9 +2398,11 @@ static long
check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
{
LIST_HEAD(movable_folio_list);
+ unsigned long collected;
- collect_longterm_unpinnable_folios(&movable_folio_list, pofs);
- if (list_empty(&movable_folio_list))
+ collected = collect_longterm_unpinnable_folios(&movable_folio_list,
+ pofs);
+ if (!collected)
return 0;
return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
@@ -2839,9 +2804,9 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
return false;
/* Anonymous folios pose no problem. */
- mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS;
+ mapping_flags = (unsigned long)mapping & FOLIO_MAPPING_FLAGS;
if (mapping_flags)
- return mapping_flags & PAGE_MAPPING_ANON;
+ return mapping_flags & FOLIO_MAPPING_ANON;
/*
* At this point, we know the mapping is non-null and points to an
@@ -2889,7 +2854,7 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
int *nr)
{
struct dev_pagemap *pgmap = NULL;
- int nr_start = *nr, ret = 0;
+ int ret = 0;
pte_t *ptep, *ptem;
ptem = ptep = pte_offset_map(&pmd, addr);
@@ -2913,19 +2878,11 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
if (!pte_access_permitted(pte, flags & FOLL_WRITE))
goto pte_unmap;
- if (pte_devmap(pte)) {
- if (unlikely(flags & FOLL_LONGTERM))
- goto pte_unmap;
-
- pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
- if (unlikely(!pgmap)) {
- gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
- goto pte_unmap;
- }
- } else if (pte_special(pte))
+ if (pte_special(pte))
goto pte_unmap;
- VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+ /* If it's not marked as special it must have a valid memmap. */
+ VM_WARN_ON_ONCE(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
folio = try_grab_folio_fast(page, 1, flags);
@@ -2993,91 +2950,6 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
}
#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
-#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
-static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages, int *nr)
-{
- int nr_start = *nr;
- struct dev_pagemap *pgmap = NULL;
-
- do {
- struct folio *folio;
- struct page *page = pfn_to_page(pfn);
-
- pgmap = get_dev_pagemap(pfn, pgmap);
- if (unlikely(!pgmap)) {
- gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
- break;
- }
-
- folio = try_grab_folio_fast(page, 1, flags);
- if (!folio) {
- gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
- break;
- }
- folio_set_referenced(folio);
- pages[*nr] = page;
- (*nr)++;
- pfn++;
- } while (addr += PAGE_SIZE, addr != end);
-
- put_dev_pagemap(pgmap);
- return addr == end;
-}
-
-static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages,
- int *nr)
-{
- unsigned long fault_pfn;
- int nr_start = *nr;
-
- fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr))
- return 0;
-
- if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
- gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
- return 0;
- }
- return 1;
-}
-
-static int gup_fast_devmap_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages,
- int *nr)
-{
- unsigned long fault_pfn;
- int nr_start = *nr;
-
- fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr))
- return 0;
-
- if (unlikely(pud_val(orig) != pud_val(*pudp))) {
- gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
- return 0;
- }
- return 1;
-}
-#else
-static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages,
- int *nr)
-{
- BUILD_BUG();
- return 0;
-}
-
-static int gup_fast_devmap_pud_leaf(pud_t pud, pud_t *pudp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages,
- int *nr)
-{
- BUILD_BUG();
- return 0;
-}
-#endif
-
static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, unsigned int flags, struct page **pages,
int *nr)
@@ -3092,13 +2964,6 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
if (pmd_special(orig))
return 0;
- if (pmd_devmap(orig)) {
- if (unlikely(flags & FOLL_LONGTERM))
- return 0;
- return gup_fast_devmap_pmd_leaf(orig, pmdp, addr, end, flags,
- pages, nr);
- }
-
page = pmd_page(orig);
refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr);
@@ -3139,13 +3004,6 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
if (pud_special(orig))
return 0;
- if (pud_devmap(orig)) {
- if (unlikely(flags & FOLL_LONGTERM))
- return 0;
- return gup_fast_devmap_pud_leaf(orig, pudp, addr, end, flags,
- pages, nr);
- }
-
page = pud_page(orig);
refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr);
@@ -3173,46 +3031,6 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
return 1;
}
-static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages,
- int *nr)
-{
- int refs;
- struct page *page;
- struct folio *folio;
-
- if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
- return 0;
-
- BUILD_BUG_ON(pgd_devmap(orig));
-
- page = pgd_page(orig);
- refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr);
-
- folio = try_grab_folio_fast(page, refs, flags);
- if (!folio)
- return 0;
-
- if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
- gup_put_folio(folio, refs, flags);
- return 0;
- }
-
- if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
- gup_put_folio(folio, refs, flags);
- return 0;
- }
-
- if (!gup_fast_folio_allowed(folio, flags)) {
- gup_put_folio(folio, refs, flags);
- return 0;
- }
-
- *nr += refs;
- folio_set_referenced(folio);
- return 1;
-}
-
static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
unsigned long end, unsigned int flags, struct page **pages,
int *nr)
@@ -3307,12 +3125,9 @@ static void gup_fast_pgd_range(unsigned long addr, unsigned long end,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
return;
- if (unlikely(pgd_leaf(pgd))) {
- if (!gup_fast_pgd_leaf(pgd, pgdp, addr, next, flags,
- pages, nr))
- return;
- } else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
- pages, nr))
+ BUILD_BUG_ON(pgd_leaf(pgd));
+ if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
+ pages, nr))
return;
} while (pgdp++, addr = next, addr != end);
}
@@ -3359,7 +3174,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end,
* include/asm-generic/tlb.h for more details.
*
* We do not adopt an rcu_read_lock() here as we also want to block IPIs
- * that come from THPs splitting.
+ * that come from callers of tlb_remove_table_sync_one().
*/
local_irq_save(flags);
gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned);
@@ -3647,7 +3462,7 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
{
unsigned int flags, nr_folios, nr_found;
unsigned int i, pgshift = PAGE_SHIFT;
- pgoff_t start_idx, end_idx, next_idx;
+ pgoff_t start_idx, end_idx;
struct folio *folio = NULL;
struct folio_batch fbatch;
struct hstate *h;
@@ -3697,20 +3512,8 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
folio = NULL;
}
- next_idx = 0;
for (i = 0; i < nr_found; i++) {
- /*
- * As there can be multiple entries for a
- * given folio in the batch returned by
- * filemap_get_folios_contig(), the below
- * check is to ensure that we pin and return a
- * unique set of folios between start and end.
- */
- if (next_idx &&
- next_idx != folio_index(fbatch.folios[i]))
- continue;
-
- folio = page_folio(&fbatch.folios[i]->page);
+ folio = fbatch.folios[i];
if (try_grab_folio(folio, 1, FOLL_PIN)) {
folio_batch_release(&fbatch);
@@ -3722,7 +3525,6 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
*offset = offset_in_folio(folio, start);
folios[nr_folios] = folio;
- next_idx = folio_next_index(folio);
if (++nr_folios == max_folios)
break;
}
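The reworked fault_in_*() helpers above step an explicit cursor one page at a time, stop if it wraps to zero, and then report how many of the requested bytes were never reached. A standalone sketch of the stepping and residue arithmetic, with a simulated failure in place of a real fault:

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define PAGE_ALIGN_DOWN(x)	((x) & ~(PAGE_SIZE - 1))

/* Mimics the page-stepping and residue arithmetic of the reworked
 * fault_in_*() helpers: touch one byte per page over [start, start +
 * size) and, on an early stop, report how many requested bytes were
 * not reached. Addresses here are plain numbers, not user pointers,
 * and the failure at the third page is simulated. */
int main(void)
{
	unsigned long start = 0x1234;		/* deliberately unaligned */
	unsigned long size = 5 * PAGE_SIZE;
	unsigned long end = start + size;
	unsigned long cur;
	int pages = 0;

	for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) {
		if (++pages == 3)		/* simulated fault failure */
			break;
		printf("touched %#lx\n", cur);
	}

	if (size > cur - start)
		printf("%lu bytes not faulted in\n", size - (cur - start));
	return 0;
}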
diff --git a/mm/hmm.c b/mm/hmm.c
index 082f7b7c0b9e..d545e2494994 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -10,6 +10,7 @@
*/
#include <linux/pagewalk.h>
#include <linux/hmm.h>
+#include <linux/hmm-dma.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
@@ -23,6 +24,7 @@
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
+#include <linux/pci-p2pdma.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
@@ -39,13 +41,21 @@ enum {
HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};
+enum {
+ /* These flags are carried from input-to-output */
+ HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
+ HMM_PFN_P2PDMA_BUS,
+};
+
static int hmm_pfns_fill(unsigned long addr, unsigned long end,
struct hmm_range *range, unsigned long cpu_flags)
{
unsigned long i = (addr - range->start) >> PAGE_SHIFT;
- for (; addr < end; addr += PAGE_SIZE, i++)
- range->hmm_pfns[i] = cpu_flags;
+ for (; addr < end; addr += PAGE_SIZE, i++) {
+ range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ range->hmm_pfns[i] |= cpu_flags;
+ }
return 0;
}
@@ -173,6 +183,7 @@ static inline unsigned long hmm_pfn_flags_order(unsigned long order)
return order << HMM_PFN_ORDER_SHIFT;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
pmd_t pmd)
{
@@ -183,7 +194,6 @@ static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
unsigned long end, unsigned long hmm_pfns[],
pmd_t pmd)
@@ -202,8 +212,10 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
return hmm_vma_fault(addr, end, required_fault, walk);
pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
- hmm_pfns[i] = pfn | cpu_flags;
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -230,14 +242,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
unsigned long cpu_flags;
pte_t pte = ptep_get(ptep);
uint64_t pfn_req_flags = *hmm_pfn;
+ uint64_t new_pfn_flags = 0;
if (pte_none_mostly(pte)) {
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (required_fault)
goto fault;
- *hmm_pfn = 0;
- return 0;
+ goto out;
}
if (!pte_present(pte)) {
@@ -253,16 +265,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
cpu_flags = HMM_PFN_VALID;
if (is_writable_device_private_entry(entry))
cpu_flags |= HMM_PFN_WRITE;
- *hmm_pfn = swp_offset_pfn(entry) | cpu_flags;
- return 0;
+ new_pfn_flags = swp_offset_pfn(entry) | cpu_flags;
+ goto out;
}
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
- if (!required_fault) {
- *hmm_pfn = 0;
- return 0;
- }
+ if (!required_fault)
+ goto out;
if (!non_swap_entry(entry))
goto fault;
@@ -292,23 +302,22 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
goto fault;
/*
- * Bypass devmap pte such as DAX page when all pfn requested
- * flags(pfn_req_flags) are fulfilled.
* Since each architecture defines a struct page for the zero page, just
* fall through and treat it like a normal page.
*/
if (!vm_normal_page(walk->vma, addr, pte) &&
- !pte_devmap(pte) &&
!is_zero_pfn(pte_pfn(pte))) {
if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
pte_unmap(ptep);
return -EFAULT;
}
- *hmm_pfn = HMM_PFN_ERROR;
- return 0;
+ new_pfn_flags = HMM_PFN_ERROR;
+ goto out;
}
- *hmm_pfn = pte_pfn(pte) | cpu_flags;
+ new_pfn_flags = pte_pfn(pte) | cpu_flags;
+out:
+ *hmm_pfn = (*hmm_pfn & HMM_PFN_INOUT_FLAGS) | new_pfn_flags;
return 0;
fault:
@@ -351,7 +360,7 @@ again:
return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
}
- if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
+ if (pmd_trans_huge(pmd)) {
/*
* No need to take pmd_lock here, even if some other thread
* is splitting the huge pmd we will get that event through
@@ -362,7 +371,7 @@ again:
* values.
*/
pmd = pmdp_get_lockless(pmdp);
- if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
+ if (!pmd_trans_huge(pmd))
goto again;
return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
@@ -396,8 +405,7 @@ again:
return 0;
}
-#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
- defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+#if defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
pud_t pud)
{
@@ -429,7 +437,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
return hmm_vma_walk_hole(start, end, -1, walk);
}
- if (pud_leaf(pud) && pud_devmap(pud)) {
+ if (pud_leaf(pud)) {
unsigned long i, npages, pfn;
unsigned int required_fault;
unsigned long *hmm_pfns;
@@ -448,8 +456,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
}
pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- for (i = 0; i < npages; ++i, ++pfn)
- hmm_pfns[i] = pfn | cpu_flags;
+ for (i = 0; i < npages; ++i, ++pfn) {
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
goto out_unlock;
}
@@ -507,8 +517,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
}
pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
- for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
- range->hmm_pfns[i] = pfn | cpu_flags;
+ for (; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ range->hmm_pfns[i] |= pfn | cpu_flags;
+ }
spin_unlock(ptl);
return 0;
@@ -607,3 +619,211 @@ int hmm_range_fault(struct hmm_range *range)
return ret;
}
EXPORT_SYMBOL(hmm_range_fault);
+
+/**
+ * hmm_dma_map_alloc - Allocate HMM map structure
+ * @dev: device to allocate structure for
+ * @map: HMM map to allocate
+ * @nr_entries: number of entries in the map
+ * @dma_entry_size: size of the DMA entry in the map
+ *
+ * Allocate the HMM map structure and all the lists it contains.
+ * Return 0 on success, -ENOMEM on failure.
+ */
+int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map,
+ size_t nr_entries, size_t dma_entry_size)
+{
+ bool dma_need_sync = false;
+ bool use_iova;
+
+ WARN_ON_ONCE(!(nr_entries * PAGE_SIZE / dma_entry_size));
+
+ /*
+ * The HMM API violates our normal DMA buffer ownership rules and can't
+ * transfer buffer ownership. The dma_addressing_limited() check is a
+ * best approximation to ensure no swiotlb buffering happens.
+ */
+#ifdef CONFIG_DMA_NEED_SYNC
+ dma_need_sync = !dev->dma_skip_sync;
+#endif /* CONFIG_DMA_NEED_SYNC */
+ if (dma_need_sync || dma_addressing_limited(dev))
+ return -EOPNOTSUPP;
+
+ map->dma_entry_size = dma_entry_size;
+ map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->pfn_list)
+ return -ENOMEM;
+
+ use_iova = dma_iova_try_alloc(dev, &map->state, 0,
+ nr_entries * PAGE_SIZE);
+ if (!use_iova && dma_need_unmap(dev)) {
+ map->dma_list = kvcalloc(nr_entries, sizeof(*map->dma_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->dma_list)
+ goto err_dma;
+ }
+ return 0;
+
+err_dma:
+ kvfree(map->pfn_list);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_alloc);
+
+/**
+ * hmm_dma_map_free - Free HMM map structure
+ * @dev: device to free structure from
+ * @map: HMM map containing the various lists and state
+ *
+ * Free the HMM map structure and all the lists it contains.
+ */
+void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map)
+{
+ if (dma_use_iova(&map->state))
+ dma_iova_free(dev, &map->state);
+ kvfree(map->pfn_list);
+ kvfree(map->dma_list);
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_free);
+
+/**
+ * hmm_dma_map_pfn - Map a physical HMM page to DMA address
+ * @dev: Device to map the page for
+ * @map: HMM map
+ * @idx: Index into the PFN and dma address arrays
+ * @p2pdma_state: PCI P2P state.
+ *
+ * Map the physical page stored at index @idx in @map->pfn_list and return
+ * its DMA address. If hmm_dma_map_alloc() obtained an IOVA range for @map,
+ * the page is linked at offset @idx * @map->dma_entry_size inside that
+ * range; otherwise it is mapped with dma_map_page(). If the PFN is already
+ * marked HMM_PFN_DMA_MAPPED, the existing mapping is reused where possible
+ * so that callers can resync permission flags without relinking the page.
+ * Returns DMA_MAPPING_ERROR on failure.
+ */
+dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
+ size_t idx,
+ struct pci_p2pdma_map_state *p2pdma_state)
+{
+ struct dma_iova_state *state = &map->state;
+ dma_addr_t *dma_addrs = map->dma_list;
+ unsigned long *pfns = map->pfn_list;
+ struct page *page = hmm_pfn_to_page(pfns[idx]);
+ phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]);
+ size_t offset = idx * map->dma_entry_size;
+ unsigned long attrs = 0;
+ dma_addr_t dma_addr;
+ int ret;
+
+ if ((pfns[idx] & HMM_PFN_DMA_MAPPED) &&
+ !(pfns[idx] & HMM_PFN_P2PDMA_BUS)) {
+ /*
+ * We are in this flow when there is a need to resync flags,
+ * for example when page was already linked in prefetch call
+		 * for example when the page was already linked by a prefetch
+		 * call with the READ flag and we now need to add the WRITE flag.
+ * This page was already programmed to HW and we don't want/need
+ * to unlink and link it again just to resync flags.
+ */
+ if (dma_use_iova(state))
+ return state->addr + offset;
+
+ /*
+ * Without dma_need_unmap, the dma_addrs array is NULL, thus we
+ * need to regenerate the address below even if there already
+ * was a mapping. But !dma_need_unmap implies that the
+		 * mapping is stateless, so this is fine.
+ */
+ if (dma_need_unmap(dev))
+ return dma_addrs[idx];
+
+		/* Otherwise, fall through and remap the page. */
+ }
+
+ switch (pci_p2pdma_state(p2pdma_state, dev, page)) {
+ case PCI_P2PDMA_MAP_NONE:
+ break;
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+ pfns[idx] |= HMM_PFN_P2PDMA;
+ break;
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED;
+ return pci_p2pdma_bus_addr_map(p2pdma_state, paddr);
+ default:
+ return DMA_MAPPING_ERROR;
+ }
+
+ if (dma_use_iova(state)) {
+ ret = dma_iova_link(dev, state, paddr, offset,
+ map->dma_entry_size, DMA_BIDIRECTIONAL,
+ attrs);
+ if (ret)
+ goto error;
+
+ ret = dma_iova_sync(dev, state, offset, map->dma_entry_size);
+ if (ret) {
+ dma_iova_unlink(dev, state, offset, map->dma_entry_size,
+ DMA_BIDIRECTIONAL, attrs);
+ goto error;
+ }
+
+ dma_addr = state->addr + offset;
+ } else {
+ if (WARN_ON_ONCE(dma_need_unmap(dev) && !dma_addrs))
+ goto error;
+
+ dma_addr = dma_map_page(dev, page, 0, map->dma_entry_size,
+ DMA_BIDIRECTIONAL);
+ if (dma_mapping_error(dev, dma_addr))
+ goto error;
+
+ if (dma_need_unmap(dev))
+ dma_addrs[idx] = dma_addr;
+ }
+ pfns[idx] |= HMM_PFN_DMA_MAPPED;
+ return dma_addr;
+error:
+ pfns[idx] &= ~HMM_PFN_P2PDMA;
+ return DMA_MAPPING_ERROR;
+
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_pfn);
+
+/**
+ * hmm_dma_unmap_pfn - Unmap a physical HMM page from DMA address
+ * @dev: Device to unmap the page from
+ * @map: HMM map
+ * @idx: Index of the PFN to unmap
+ *
+ * Returns true if the PFN was mapped and has been unmapped, false otherwise.
+ */
+bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx)
+{
+ const unsigned long valid_dma = HMM_PFN_VALID | HMM_PFN_DMA_MAPPED;
+ struct dma_iova_state *state = &map->state;
+ dma_addr_t *dma_addrs = map->dma_list;
+ unsigned long *pfns = map->pfn_list;
+ unsigned long attrs = 0;
+
+ if ((pfns[idx] & valid_dma) != valid_dma)
+ return false;
+
+ if (pfns[idx] & HMM_PFN_P2PDMA_BUS)
+ ; /* no need to unmap bus address P2P mappings */
+ else if (dma_use_iova(state)) {
+ if (pfns[idx] & HMM_PFN_P2PDMA)
+ attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+ dma_iova_unlink(dev, state, idx * map->dma_entry_size,
+ map->dma_entry_size, DMA_BIDIRECTIONAL, attrs);
+ } else if (dma_need_unmap(dev))
+ dma_unmap_page(dev, dma_addrs[idx], map->dma_entry_size,
+ DMA_BIDIRECTIONAL);
+
+ pfns[idx] &=
+ ~(HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | HMM_PFN_P2PDMA_BUS);
+ return true;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_unmap_pfn);
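
Taken together, the exports above form a small driver-facing API: hmm_dma_map_alloc() sizes the PFN and DMA-address arrays, hmm_dma_map_pfn() turns one faulted PFN into a device-visible address (through the shared IOVA state or via dma_map_page()), and hmm_dma_unmap_pfn()/hmm_dma_map_free() tear everything down. The sketch below shows how a caller might wire this up with hmm_range_fault(); it is an editor's illustration, not code from this series. It assumes the declarations live in <linux/hmm-dma.h>, that the caller has already registered an mmu_interval_notifier covering the range, and that map.pfn_list can double as range.hmm_pfns.

#include <linux/hmm.h>
#include <linux/hmm-dma.h>	/* assumed header for the new helpers */
#include <linux/mmu_notifier.h>
#include <linux/dma-mapping.h>
#include <linux/pci-p2pdma.h>

static int mirror_and_map_range(struct device *dev,
				struct mmu_interval_notifier *notifier,
				unsigned long start, size_t npages)
{
	struct pci_p2pdma_map_state p2pdma_state = {};
	struct hmm_dma_map map = {};
	struct hmm_range range = {
		.notifier	= notifier,
		.start		= start,
		.end		= start + npages * PAGE_SIZE,
		.default_flags	= HMM_PFN_REQ_FAULT,
	};
	size_t i;
	int ret;

	ret = hmm_dma_map_alloc(dev, &map, npages, PAGE_SIZE);
	if (ret)
		return ret;
	/* Assumption: the PFN array is reused as the hmm_range output. */
	range.hmm_pfns = map.pfn_list;

again:
	range.notifier_seq = mmu_interval_read_begin(notifier);
	mmap_read_lock(notifier->mm);
	ret = hmm_range_fault(&range);
	mmap_read_unlock(notifier->mm);
	if (ret == -EBUSY)
		goto again;
	if (ret)
		goto out_free;

	/*
	 * A real driver would recheck mmu_interval_read_retry() under its
	 * device page-table lock before using the results; elided here.
	 */
	for (i = 0; i < npages; i++) {
		dma_addr_t dma = hmm_dma_map_pfn(dev, &map, i, &p2pdma_state);

		if (dma == DMA_MAPPING_ERROR) {
			ret = -EFAULT;
			break;
		}
		/* program 'dma' into the device page table here */
	}

	/* Tear the mapping down again, just to show the unmap path. */
	for (i = 0; i < npages; i++)
		hmm_dma_unmap_pfn(dev, &map, i);
out_free:
	hmm_dma_map_free(dev, &map);
	return ret;
}

Note that the earlier hunks make the hmm_pfns output preserve HMM_PFN_INOUT_FLAGS, which is what lets the DMA-mapped/P2P bookkeeping bits survive repeated faults of the same range.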
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2a47682d1ab7..9c38a95e9f09 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -22,7 +22,6 @@
#include <linux/mm_types.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
-#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
@@ -99,7 +98,7 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
}
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
- unsigned long vm_flags,
+ vm_flags_t vm_flags,
unsigned long tva_flags,
unsigned long orders)
{
@@ -166,7 +165,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
* own flags.
*/
if (!in_pf && shmem_file(vma->vm_file))
- return shmem_allowable_huge_orders(file_inode(vma->vm_file),
+ return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file),
vma, vma->vm_pgoff, 0,
!enforce_sysfs);
@@ -1203,7 +1202,7 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
{
pmd_t entry;
- entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+ entry = folio_mk_pmd(folio, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
folio_add_lru_vma(folio, vma);
@@ -1309,8 +1308,7 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
struct folio *zero_folio)
{
pmd_t entry;
- entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
- entry = pmd_mkhuge(entry);
+ entry = folio_mk_pmd(zero_folio, vma->vm_page_prot);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
mm_inc_nr_ptes(mm);
@@ -1373,9 +1371,17 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
return __do_huge_pmd_anonymous_page(vmf);
}
-static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
- pgtable_t pgtable)
+struct folio_or_pfn {
+ union {
+ struct folio *folio;
+ unsigned long pfn;
+ };
+ bool is_folio;
+};
+
+static int insert_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot,
+ bool write, pgtable_t pgtable)
{
struct mm_struct *mm = vma->vm_mm;
pmd_t entry;
@@ -1383,8 +1389,11 @@ static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
lockdep_assert_held(pmd_lockptr(mm, pmd));
if (!pmd_none(*pmd)) {
+ const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
+ fop.pfn;
+
if (write) {
- if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
+ if (pmd_pfn(*pmd) != pfn) {
WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
return -EEXIST;
}
@@ -1397,11 +1406,16 @@ static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
return -EEXIST;
}
- entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
- if (pfn_t_devmap(pfn))
- entry = pmd_mkdevmap(entry);
- else
+ if (fop.is_folio) {
+ entry = folio_mk_pmd(fop.folio, vma->vm_page_prot);
+
+ folio_get(fop.folio);
+ folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma);
+ add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR);
+ } else {
+ entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot));
entry = pmd_mkspecial(entry);
+ }
if (write) {
entry = pmd_mkyoung(pmd_mkdirty(entry));
entry = maybe_pmd_mkwrite(entry, vma);
@@ -1427,11 +1441,15 @@ static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
*
* Return: vm_fault_t value.
*/
-vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
+vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn,
+ bool write)
{
unsigned long addr = vmf->address & PMD_MASK;
struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
+ struct folio_or_pfn fop = {
+ .pfn = pfn,
+ };
pgtable_t pgtable = NULL;
spinlock_t *ptl;
int error;
@@ -1441,8 +1459,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
* but we need to be consistent with PTEs and architectures that
* can't support a 'special' bit.
*/
- BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
- !pfn_t_devmap(pfn));
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
@@ -1456,10 +1473,11 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
return VM_FAULT_OOM;
}
- track_pfn_insert(vma, &pgprot, pfn);
+ pfnmap_setup_cachemode_pfn(pfn, &pgprot);
+
ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- error = insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write,
- pgtable);
+ error = insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write,
+ pgtable);
spin_unlock(ptl);
if (error && pgtable)
pte_free(vma->vm_mm, pgtable);
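
With the conversion above from pfn_t to a bare unsigned long, callers no longer wrap the pfn with pfn_to_pfn_t(). A hypothetical huge_fault handler illustrating the new calling convention is sketched below; the vm_private_data layout is invented for the example, and the order-based huge_fault signature is assumed from current mainline rather than shown in this patch.

#include <linux/mm.h>
#include <linux/huge_mm.h>

/* Hypothetical handler: the driver stashed its base pfn in vm_private_data. */
static vm_fault_t my_dev_huge_fault(struct vm_fault *vmf, unsigned int order)
{
	unsigned long base_pfn = (unsigned long)vmf->vma->vm_private_data;
	unsigned long pfn;

	if (order != HPAGE_PMD_ORDER)
		return VM_FAULT_FALLBACK;

	/* pfn of the PMD-aligned block being faulted (vm_pgoff handling omitted). */
	pfn = base_pfn +
	      (((vmf->address & PMD_MASK) - vmf->vma->vm_start) >> PAGE_SHIFT);

	/* No pfn_to_pfn_t() wrapping anymore: pass the bare pfn. */
	return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
}

static const struct vm_operations_struct my_dev_vm_ops = {
	.huge_fault	= my_dev_huge_fault,
};

The same pattern applies to vmf_insert_pfn_pud() for PUD-sized mappings.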
@@ -1474,6 +1492,10 @@ vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio,
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address & PMD_MASK;
struct mm_struct *mm = vma->vm_mm;
+ struct folio_or_pfn fop = {
+ .folio = folio,
+ .is_folio = true,
+ };
spinlock_t *ptl;
pgtable_t pgtable = NULL;
int error;
@@ -1491,14 +1513,8 @@ vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio,
}
ptl = pmd_lock(mm, vmf->pmd);
- if (pmd_none(*vmf->pmd)) {
- folio_get(folio);
- folio_add_file_rmap_pmd(folio, &folio->page, vma);
- add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR);
- }
- error = insert_pfn_pmd(vma, addr, vmf->pmd,
- pfn_to_pfn_t(folio_pfn(folio)), vma->vm_page_prot,
- write, pgtable);
+ error = insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot,
+ write, pgtable);
spin_unlock(ptl);
if (error && pgtable)
pte_free(mm, pgtable);
@@ -1515,16 +1531,18 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
return pud;
}
-static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, pfn_t pfn, bool write)
+static void insert_pud(struct vm_area_struct *vma, unsigned long addr,
+ pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write)
{
struct mm_struct *mm = vma->vm_mm;
- pgprot_t prot = vma->vm_page_prot;
pud_t entry;
if (!pud_none(*pud)) {
+ const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
+ fop.pfn;
+
if (write) {
- if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn)))
+ if (WARN_ON_ONCE(pud_pfn(*pud) != pfn))
return;
entry = pud_mkyoung(*pud);
entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
@@ -1534,11 +1552,16 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
return;
}
- entry = pud_mkhuge(pfn_t_pud(pfn, prot));
- if (pfn_t_devmap(pfn))
- entry = pud_mkdevmap(entry);
- else
+ if (fop.is_folio) {
+ entry = folio_mk_pud(fop.folio, vma->vm_page_prot);
+
+ folio_get(fop.folio);
+ folio_add_file_rmap_pud(fop.folio, &fop.folio->page, vma);
+ add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PUD_NR);
+ } else {
+ entry = pud_mkhuge(pfn_pud(fop.pfn, prot));
entry = pud_mkspecial(entry);
+ }
if (write) {
entry = pud_mkyoung(pud_mkdirty(entry));
entry = maybe_pud_mkwrite(entry, vma);
@@ -1557,11 +1580,15 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
*
* Return: vm_fault_t value.
*/
-vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
+vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn,
+ bool write)
{
unsigned long addr = vmf->address & PUD_MASK;
struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
+ struct folio_or_pfn fop = {
+ .pfn = pfn,
+ };
spinlock_t *ptl;
/*
@@ -1569,8 +1596,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
* but we need to be consistent with PTEs and architectures that
* can't support a 'special' bit.
*/
- BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
- !pfn_t_devmap(pfn));
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
@@ -1578,10 +1604,10 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
- track_pfn_insert(vma, &pgprot, pfn);
+ pfnmap_setup_cachemode_pfn(pfn, &pgprot);
ptl = pud_lock(vma->vm_mm, vmf->pud);
- insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
+ insert_pud(vma, addr, vmf->pud, fop, pgprot, write);
spin_unlock(ptl);
return VM_FAULT_NOPAGE;
@@ -1603,6 +1629,10 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
unsigned long addr = vmf->address & PUD_MASK;
pud_t *pud = vmf->pud;
struct mm_struct *mm = vma->vm_mm;
+ struct folio_or_pfn fop = {
+ .folio = folio,
+ .is_folio = true,
+ };
spinlock_t *ptl;
if (addr < vma->vm_start || addr >= vma->vm_end)
@@ -1612,20 +1642,7 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
return VM_FAULT_SIGBUS;
ptl = pud_lock(mm, pud);
-
- /*
- * If there is already an entry present we assume the folio is
- * already mapped, hence no need to take another reference. We
- * still call insert_pfn_pud() though in case the mapping needs
- * upgrading to writeable.
- */
- if (pud_none(*vmf->pud)) {
- folio_get(folio);
- folio_add_file_rmap_pud(folio, &folio->page, vma);
- add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);
- }
- insert_pfn_pud(vma, addr, vmf->pud, pfn_to_pfn_t(folio_pfn(folio)),
- write);
+ insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write);
spin_unlock(ptl);
return VM_FAULT_NOPAGE;
@@ -1646,46 +1663,6 @@ void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
update_mmu_cache_pmd(vma, addr, pmd);
}
-struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
-{
- unsigned long pfn = pmd_pfn(*pmd);
- struct mm_struct *mm = vma->vm_mm;
- struct page *page;
- int ret;
-
- assert_spin_locked(pmd_lockptr(mm, pmd));
-
- if (flags & FOLL_WRITE && !pmd_write(*pmd))
- return NULL;
-
- if (pmd_present(*pmd) && pmd_devmap(*pmd))
- /* pass */;
- else
- return NULL;
-
- if (flags & FOLL_TOUCH)
- touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
-
- /*
- * device mapped pages can only be returned if the
- * caller will manage the page reference count.
- */
- if (!(flags & (FOLL_GET | FOLL_PIN)))
- return ERR_PTR(-EEXIST);
-
- pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
- *pgmap = get_dev_pagemap(pfn, *pgmap);
- if (!*pgmap)
- return ERR_PTR(-EFAULT);
- page = pfn_to_page(pfn);
- ret = try_grab_folio(page_folio(page), 1, flags);
- if (ret)
- page = ERR_PTR(ret);
-
- return page;
-}
-
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
@@ -1786,7 +1763,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_free(dst_mm, pgtable);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
- __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
+ __split_huge_pmd(src_vma, src_pmd, addr, false);
return -EAGAIN;
}
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
@@ -1837,7 +1814,7 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pud = *src_pud;
- if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
+ if (unlikely(!pud_trans_huge(pud)))
goto out_unlock;
/*
@@ -2008,7 +1985,7 @@ unlock_fallback:
folio_unlock(folio);
spin_unlock(vmf->ptl);
fallback:
- __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false);
return VM_FAULT_FALLBACK;
}
@@ -2260,6 +2237,14 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, mm_counter_file(folio),
-HPAGE_PMD_NR);
+
+ /*
+ * Use flush_needed to indicate whether the PMD entry
+ * is present, instead of checking pmd_present() again.
+ */
+ if (flush_needed && pmd_young(orig_pmd) &&
+ likely(vma_has_recency(vma)))
+ folio_mark_accessed(folio);
}
spin_unlock(ptl);
@@ -2653,12 +2638,12 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
folio_move_anon_rmap(src_folio, dst_vma);
src_folio->index = linear_page_index(dst_vma, dst_addr);
- _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
+ _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot);
/* Follow mremap() behavior and treat the entry dirty after the move */
_dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
} else {
src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
- _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
+ _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot);
}
set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
@@ -2691,8 +2676,7 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
spinlock_t *ptl;
ptl = pmd_lock(vma->vm_mm, pmd);
- if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
- pmd_devmap(*pmd)))
+ if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)))
return ptl;
spin_unlock(ptl);
return NULL;
@@ -2709,7 +2693,7 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
spinlock_t *ptl;
ptl = pud_lock(vma->vm_mm, pud);
- if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
+ if (likely(pud_trans_huge(*pud)))
return ptl;
spin_unlock(ptl);
return NULL;
@@ -2761,7 +2745,7 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
- VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
+ VM_BUG_ON(!pud_trans_huge(*pud));
count_vm_event(THP_SPLIT_PUD);
@@ -2794,7 +2778,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptl = pud_lock(vma->vm_mm, pud);
- if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
+ if (unlikely(!pud_trans_huge(*pud)))
goto out;
__split_huge_pud_locked(vma, pud, range.start);
@@ -2867,8 +2851,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
- VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
- && !pmd_devmap(*pmd));
+ VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
count_vm_event(THP_SPLIT_PMD);
@@ -3073,28 +3056,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd, bool freeze, struct folio *folio)
+ pmd_t *pmd, bool freeze)
{
- VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio));
VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
- VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
- VM_BUG_ON(freeze && !folio);
-
- /*
- * When the caller requests to set up a migration entry, we
- * require a folio to check the PMD against. Otherwise, there
- * is a risk of replacing the wrong folio.
- */
- if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
- is_pmd_migration_entry(*pmd)) {
- if (folio && folio != pmd_folio(*pmd))
- return;
+ if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
__split_huge_pmd_locked(vma, pmd, address, freeze);
- }
}
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long address, bool freeze, struct folio *folio)
+ unsigned long address, bool freeze)
{
spinlock_t *ptl;
struct mmu_notifier_range range;
@@ -3104,20 +3074,20 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptl = pmd_lock(vma->vm_mm, pmd);
- split_huge_pmd_locked(vma, range.start, pmd, freeze, folio);
+ split_huge_pmd_locked(vma, range.start, pmd, freeze);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
}
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
- bool freeze, struct folio *folio)
+ bool freeze)
{
pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
if (!pmd)
return;
- __split_huge_pmd(vma, pmd, address, freeze, folio);
+ __split_huge_pmd(vma, pmd, address, freeze);
}
static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
@@ -3129,7 +3099,7 @@ static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned
if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
ALIGN(address, HPAGE_PMD_SIZE)))
- split_huge_pmd_address(vma, address, false, NULL);
+ split_huge_pmd_address(vma, address, false);
}
void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -3415,10 +3385,6 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
* order - 1 to new_order).
* @split_at: in buddy allocator like split, the folio containing @split_at
* will be split until its order becomes @new_order.
- * @lock_at: the folio containing @lock_at is left locked for caller.
- * @list: the after split folios will be added to @list if it is not NULL,
- * otherwise to LRU lists.
- * @end: the end of the file @folio maps to. -1 if @folio is anonymous memory.
* @xas: xa_state pointing to folio->mapping->i_pages and locked by caller
* @mapping: @folio->mapping
* @uniform_split: if the split is uniform or not (buddy allocator like split)
@@ -3444,52 +3410,26 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
* @page, which is split in next for loop.
*
* After splitting, the caller's folio reference will be transferred to the
- * folio containing @page. The other folios may be freed if they are not mapped.
- *
- * In terms of locking, after splitting,
- * 1. uniform split leaves @page (or the folio contains it) locked;
- * 2. buddy allocator like (non-uniform) split leaves @folio locked.
- *
+ * folio containing @page. The caller needs to unlock and/or free after-split
+ * folios if necessary.
*
* For !uniform_split, when -ENOMEM is returned, the original folio might be
* split. The caller needs to check the input folio.
*/
static int __split_unmapped_folio(struct folio *folio, int new_order,
- struct page *split_at, struct page *lock_at,
- struct list_head *list, pgoff_t end,
- struct xa_state *xas, struct address_space *mapping,
- bool uniform_split)
+ struct page *split_at, struct xa_state *xas,
+ struct address_space *mapping, bool uniform_split)
{
- struct lruvec *lruvec;
- struct address_space *swap_cache = NULL;
- struct folio *origin_folio = folio;
- struct folio *next_folio = folio_next(folio);
- struct folio *new_folio;
- struct folio *next;
int order = folio_order(folio);
- int split_order;
int start_order = uniform_split ? new_order : order - 1;
- int nr_dropped = 0;
- int ret = 0;
bool stop_split = false;
-
- if (folio_test_swapcache(folio)) {
- VM_BUG_ON(mapping);
-
- /* a swapcache folio can only be uniformly split to order-0 */
- if (!uniform_split || new_order != 0)
- return -EINVAL;
-
- swap_cache = swap_address_space(folio->swap);
- xa_lock(&swap_cache->i_pages);
- }
+ struct folio *next;
+ int split_order;
+ int ret = 0;
if (folio_test_anon(folio))
mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
- /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
- lruvec = folio_lruvec_lock(folio);
-
folio_clear_has_hwpoisoned(folio);
/*
@@ -3499,9 +3439,9 @@ static int __split_unmapped_folio(struct folio *folio, int new_order,
for (split_order = start_order;
split_order >= new_order && !stop_split;
split_order--) {
- int old_order = folio_order(folio);
- struct folio *release;
struct folio *end_folio = folio_next(folio);
+ int old_order = folio_order(folio);
+ struct folio *new_folio;
/* order-1 anonymous folio is not supported */
if (folio_test_anon(folio) && split_order == 1)
@@ -3523,126 +3463,45 @@ static int __split_unmapped_folio(struct folio *folio, int new_order,
if (xas_error(xas)) {
ret = xas_error(xas);
stop_split = true;
- goto after_split;
}
}
}
- folio_split_memcg_refs(folio, old_order, split_order);
- split_page_owner(&folio->page, old_order, split_order);
- pgalloc_tag_split(folio, old_order, split_order);
+ if (!stop_split) {
+ folio_split_memcg_refs(folio, old_order, split_order);
+ split_page_owner(&folio->page, old_order, split_order);
+ pgalloc_tag_split(folio, old_order, split_order);
- __split_folio_to_order(folio, old_order, split_order);
+ __split_folio_to_order(folio, old_order, split_order);
+ }
-after_split:
/*
- * Iterate through after-split folios and perform related
- * operations. But in buddy allocator like split, the folio
+ * Iterate through after-split folios and update folio stats.
+ * But in buddy allocator like split, the folio
* containing the specified page is skipped until its order
* is new_order, since the folio will be worked on in next
* iteration.
*/
- for (release = folio; release != end_folio; release = next) {
- next = folio_next(release);
+ for (new_folio = folio; new_folio != end_folio; new_folio = next) {
+ next = folio_next(new_folio);
/*
- * for buddy allocator like split, the folio containing
- * page will be split next and should not be released,
- * until the folio's order is new_order or stop_split
- * is set to true by the above xas_split() failure.
+ * for buddy allocator like split, new_folio containing
+ * @split_at page could be split again, thus do not
+ * change stats yet. Wait until new_folio's order is
+ * @new_order or stop_split is set to true by the above
+ * xas_split() failure.
*/
- if (release == page_folio(split_at)) {
- folio = release;
+ if (new_folio == page_folio(split_at)) {
+ folio = new_folio;
if (split_order != new_order && !stop_split)
continue;
}
- if (folio_test_anon(release)) {
- mod_mthp_stat(folio_order(release),
- MTHP_STAT_NR_ANON, 1);
- }
-
- /*
- * origin_folio should be kept frozon until page cache
- * entries are updated with all the other after-split
- * folios to prevent others seeing stale page cache
- * entries.
- */
- if (release == origin_folio)
- continue;
-
- folio_ref_unfreeze(release, 1 +
- ((mapping || swap_cache) ?
- folio_nr_pages(release) : 0));
-
- lru_add_split_folio(origin_folio, release, lruvec,
- list);
-
- /* Some pages can be beyond EOF: drop them from cache */
- if (release->index >= end) {
- if (shmem_mapping(mapping))
- nr_dropped += folio_nr_pages(release);
- else if (folio_test_clear_dirty(release))
- folio_account_cleaned(release,
- inode_to_wb(mapping->host));
- __filemap_remove_folio(release, NULL);
- folio_put_refs(release, folio_nr_pages(release));
- } else if (mapping) {
- __xa_store(&mapping->i_pages,
- release->index, release, 0);
- } else if (swap_cache) {
- __xa_store(&swap_cache->i_pages,
- swap_cache_index(release->swap),
- release, 0);
- }
+ if (folio_test_anon(new_folio))
+ mod_mthp_stat(folio_order(new_folio),
+ MTHP_STAT_NR_ANON, 1);
}
}
- /*
- * Unfreeze origin_folio only after all page cache entries, which used
- * to point to it, have been updated with new folios. Otherwise,
- * a parallel folio_try_get() can grab origin_folio and its caller can
- * see stale page cache entries.
- */
- folio_ref_unfreeze(origin_folio, 1 +
- ((mapping || swap_cache) ? folio_nr_pages(origin_folio) : 0));
-
- unlock_page_lruvec(lruvec);
-
- if (swap_cache)
- xa_unlock(&swap_cache->i_pages);
- if (mapping)
- xa_unlock(&mapping->i_pages);
-
- /* Caller disabled irqs, so they are still disabled here */
- local_irq_enable();
-
- if (nr_dropped)
- shmem_uncharge(mapping->host, nr_dropped);
-
- remap_page(origin_folio, 1 << order,
- folio_test_anon(origin_folio) ?
- RMP_USE_SHARED_ZEROPAGE : 0);
-
- /*
- * At this point, folio should contain the specified page.
- * For uniform split, it is left for caller to unlock.
- * For buddy allocator like split, the first after-split folio is left
- * for caller to unlock.
- */
- for (new_folio = origin_folio; new_folio != next_folio; new_folio = next) {
- next = folio_next(new_folio);
- if (new_folio == page_folio(lock_at))
- continue;
-
- folio_unlock(new_folio);
- /*
- * Subpages may be freed if there wasn't any mapping
- * like if add_to_swap() is running on a lru page that
- * had its mapping zapped. And freeing these pages
- * requires taking the lru_lock so we do the put_page
- * of the tail pages after the split is complete.
- */
- free_page_and_swap_cache(&new_folio->page);
- }
return ret;
}
@@ -3716,6 +3575,11 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order,
* It is in charge of checking whether the split is supported or not and
* preparing @folio for __split_unmapped_folio().
*
+ * After splitting, the after-split folio containing @lock_at remains locked
+ * and others are unlocked:
+ * 1. for uniform split, @lock_at points to one of @folio's subpages;
+ * 2. for buddy allocator like (non-uniform) split, @lock_at points to @folio.
+ *
* return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
* split but not to @new_order, the caller needs to check)
*/
@@ -3725,16 +3589,20 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
{
struct deferred_split *ds_queue = get_deferred_split_queue(folio);
XA_STATE(xas, &folio->mapping->i_pages, folio->index);
+ struct folio *end_folio = folio_next(folio);
bool is_anon = folio_test_anon(folio);
struct address_space *mapping = NULL;
struct anon_vma *anon_vma = NULL;
int order = folio_order(folio);
+ struct folio *new_folio, *next;
+ int nr_shmem_dropped = 0;
+ int remap_flags = 0;
int extra_pins, ret;
pgoff_t end;
bool is_hzp;
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
if (folio != page_folio(split_at) || folio != page_folio(lock_at))
return -EINVAL;
@@ -3772,7 +3640,6 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
ret = -EBUSY;
goto out;
}
- end = -1;
mapping = NULL;
anon_vma_lock_write(anon_vma);
} else {
@@ -3852,13 +3719,19 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
*/
xas_lock(&xas);
xas_reset(&xas);
- if (xas_load(&xas) != folio)
+ if (xas_load(&xas) != folio) {
+ ret = -EAGAIN;
goto fail;
+ }
}
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
if (folio_ref_freeze(folio, 1 + extra_pins)) {
+ struct address_space *swap_cache = NULL;
+ struct lruvec *lruvec;
+ int expected_refs;
+
if (folio_order(folio) > 1 &&
!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
@@ -3892,18 +3765,122 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
}
}
- ret = __split_unmapped_folio(folio, new_order,
- split_at, lock_at, list, end, &xas, mapping,
- uniform_split);
+ if (folio_test_swapcache(folio)) {
+ if (mapping) {
+ VM_WARN_ON_ONCE_FOLIO(mapping, folio);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ swap_cache = swap_address_space(folio->swap);
+ xa_lock(&swap_cache->i_pages);
+ }
+
+ /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
+ lruvec = folio_lruvec_lock(folio);
+
+ ret = __split_unmapped_folio(folio, new_order, split_at, &xas,
+ mapping, uniform_split);
+
+ /*
+ * Unfreeze after-split folios and put them back to the right
+		 * list. @folio should be kept frozen until page cache
+ * entries are updated with all the other after-split folios
+ * to prevent others seeing stale page cache entries.
+ * As a result, new_folio starts from the next folio of
+ * @folio.
+ */
+ for (new_folio = folio_next(folio); new_folio != end_folio;
+ new_folio = next) {
+ unsigned long nr_pages = folio_nr_pages(new_folio);
+
+ next = folio_next(new_folio);
+
+ expected_refs = folio_expected_ref_count(new_folio) + 1;
+ folio_ref_unfreeze(new_folio, expected_refs);
+
+ lru_add_split_folio(folio, new_folio, lruvec, list);
+
+ /*
+ * Anonymous folio with swap cache.
+ * NOTE: shmem in swap cache is not supported yet.
+ */
+ if (swap_cache) {
+ __xa_store(&swap_cache->i_pages,
+ swap_cache_index(new_folio->swap),
+ new_folio, 0);
+ continue;
+ }
+
+ /* Anonymous folio without swap cache */
+ if (!mapping)
+ continue;
+
+ /* Add the new folio to the page cache. */
+ if (new_folio->index < end) {
+ __xa_store(&mapping->i_pages, new_folio->index,
+ new_folio, 0);
+ continue;
+ }
+
+ /* Drop folio beyond EOF: ->index >= end */
+ if (shmem_mapping(mapping))
+ nr_shmem_dropped += nr_pages;
+ else if (folio_test_clear_dirty(new_folio))
+ folio_account_cleaned(
+ new_folio, inode_to_wb(mapping->host));
+ __filemap_remove_folio(new_folio, NULL);
+ folio_put_refs(new_folio, nr_pages);
+ }
+ /*
+ * Unfreeze @folio only after all page cache entries, which
+ * used to point to it, have been updated with new folios.
+ * Otherwise, a parallel folio_try_get() can grab @folio
+ * and its caller can see stale page cache entries.
+ */
+ expected_refs = folio_expected_ref_count(folio) + 1;
+ folio_ref_unfreeze(folio, expected_refs);
+
+ unlock_page_lruvec(lruvec);
+
+ if (swap_cache)
+ xa_unlock(&swap_cache->i_pages);
} else {
spin_unlock(&ds_queue->split_queue_lock);
-fail:
- if (mapping)
- xas_unlock(&xas);
- local_irq_enable();
- remap_page(folio, folio_nr_pages(folio), 0);
ret = -EAGAIN;
}
+fail:
+ if (mapping)
+ xas_unlock(&xas);
+
+ local_irq_enable();
+
+ if (nr_shmem_dropped)
+ shmem_uncharge(mapping->host, nr_shmem_dropped);
+
+ if (!ret && is_anon)
+ remap_flags = RMP_USE_SHARED_ZEROPAGE;
+ remap_page(folio, 1 << order, remap_flags);
+
+ /*
+ * Unlock all after-split folios except the one containing
+ * @lock_at page. If @folio is not split, it will be kept locked.
+ */
+ for (new_folio = folio; new_folio != end_folio; new_folio = next) {
+ next = folio_next(new_folio);
+ if (new_folio == page_folio(lock_at))
+ continue;
+
+ folio_unlock(new_folio);
+ /*
+ * Subpages may be freed if there wasn't any mapping
+ * like if add_to_swap() is running on a lru page that
+ * had its mapping zapped. And freeing these pages
+ * requires taking the lru_lock so we do the put_page
+ * of the tail pages after the split is complete.
+ */
+ free_folio_and_swap_cache(new_folio);
+ }
out_unlock:
if (anon_vma) {
@@ -4675,7 +4652,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
entry = pmd_to_swp_entry(*pvmw->pmd);
folio_get(folio);
- pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
+ pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot));
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
if (is_writable_migration_entry(entry))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6fccfe6d046c..753f99b4c718 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -25,6 +25,7 @@
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
+#include <linux/string_choices.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
@@ -58,6 +59,7 @@ int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
+__initdata nodemask_t hugetlb_bootmem_nodes;
__initdata struct list_head huge_boot_pages[MAX_NUMNODES];
static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata;
@@ -120,7 +122,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
- unsigned long start, unsigned long end);
+ unsigned long start, unsigned long end, bool take_locks);
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
static void hugetlb_free_folio(struct folio *folio)
@@ -283,11 +285,6 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
return ret;
}
-static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
-{
- return HUGETLBFS_SB(inode->i_sb)->spool;
-}
-
static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
return subpool_inode(file_inode(vma->vm_file));
@@ -1250,7 +1247,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma)
/*
* Reset and decrement one ref on hugepage private reservation.
* Called with mm->mmap_lock writer semaphore held.
- * This function should be only used by move_vma() and operate on
+ * This function should be only used by mremap and operate on
* same sized vma. It should never come here with last ref on the
* reservation.
*/
@@ -1950,7 +1947,6 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
int order = huge_page_order(h);
struct folio *folio;
bool alloc_try_hard = true;
- bool retry = true;
/*
* By default we always try hard to allocate the folio with
@@ -1965,22 +1961,8 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
gfp_mask |= __GFP_RETRY_MAYFAIL;
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
-retry:
- folio = __folio_alloc(gfp_mask, order, nid, nmask);
- /* Ensure hugetlb folio won't have large_rmappable flag set. */
- if (folio)
- folio_clear_large_rmappable(folio);
- if (folio && !folio_ref_freeze(folio, 1)) {
- folio_put(folio);
- if (retry) { /* retry once */
- retry = false;
- goto retry;
- }
- /* WOW! twice in a row. */
- pr_warn("HugeTLB unexpected inflated folio ref count\n");
- folio = NULL;
- }
+ folio = (struct folio *)__alloc_frozen_pages(gfp_mask, order, nid, nmask);
/*
* If we did not specify __GFP_RETRY_MAYFAIL, but still got a
@@ -2271,7 +2253,7 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
* as surplus_pages, otherwise it might confuse
* persistent_huge_pages() momentarily.
*/
- __prep_account_new_huge_page(h, nid);
+ __prep_account_new_huge_page(h, folio_nid(folio));
/*
* We could have raced with the pool size change.
@@ -2354,12 +2336,15 @@ struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid,
struct folio *folio;
spin_lock_irq(&hugetlb_lock);
+ if (!h->resv_huge_pages) {
+ spin_unlock_irq(&hugetlb_lock);
+ return NULL;
+ }
+
folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid,
nmask);
- if (folio) {
- VM_BUG_ON(!h->resv_huge_pages);
+ if (folio)
h->resv_huge_pages--;
- }
spin_unlock_irq(&hugetlb_lock);
return folio;
@@ -2419,7 +2404,6 @@ static int gather_surplus_pages(struct hstate *h, long delta)
long i;
long needed, allocated;
bool alloc_ok = true;
- int node;
nodemask_t *mbind_nodemask, alloc_nodemask;
mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
@@ -2443,21 +2427,12 @@ retry:
for (i = 0; i < needed; i++) {
folio = NULL;
- /* Prioritize current node */
- if (node_isset(numa_mem_id(), alloc_nodemask))
- folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
- numa_mem_id(), NULL);
-
- if (!folio) {
- for_each_node_mask(node, alloc_nodemask) {
- if (node == numa_mem_id())
- continue;
- folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
- node, NULL);
- if (folio)
- break;
- }
- }
+ /*
+ * It is okay to use NUMA_NO_NODE because we use numa_mem_id()
+ * down the road to pick the current node if that is the case.
+ */
+ folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
+ NUMA_NO_NODE, &alloc_nodemask);
if (!folio) {
alloc_ok = false;
break;
@@ -2811,20 +2786,24 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
/*
* alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve
* the old one
- * @h: struct hstate old page belongs to
* @old_folio: Old folio to dissolve
* @list: List to isolate the page in case we need to
* Returns 0 on success, otherwise negated error.
*/
-static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
- struct folio *old_folio, struct list_head *list)
+static int alloc_and_dissolve_hugetlb_folio(struct folio *old_folio,
+ struct list_head *list)
{
- gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+ gfp_t gfp_mask;
+ struct hstate *h;
int nid = folio_nid(old_folio);
struct folio *new_folio = NULL;
int ret = 0;
retry:
+ /*
+ * The old_folio might have been dissolved from under our feet, so make sure
+ * to carefully check the state under the lock.
+ */
spin_lock_irq(&hugetlb_lock);
if (!folio_test_hugetlb(old_folio)) {
/*
@@ -2853,8 +2832,10 @@ retry:
cond_resched();
goto retry;
} else {
+ h = folio_hstate(old_folio);
if (!new_folio) {
spin_unlock_irq(&hugetlb_lock);
+ gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid,
NULL, NULL);
if (!new_folio)
@@ -2896,38 +2877,26 @@ free_new:
return ret;
}
-int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
+int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list)
{
- struct hstate *h;
- struct folio *folio = page_folio(page);
int ret = -EBUSY;
- /*
- * The page might have been dissolved from under our feet, so make sure
- * to carefully check the state under the lock.
- * Return success when racing as if we dissolved the page ourselves.
- */
- spin_lock_irq(&hugetlb_lock);
- if (folio_test_hugetlb(folio)) {
- h = folio_hstate(folio);
- } else {
- spin_unlock_irq(&hugetlb_lock);
+	/* Avoid needlessly taking hugetlb_lock on the normal path. */
+ if (!folio_test_hugetlb(folio))
return 0;
- }
- spin_unlock_irq(&hugetlb_lock);
/*
* Fence off gigantic pages as there is a cyclic dependency between
* alloc_contig_range and them. Return -ENOMEM as this has the effect
* of bailing out right away without further retrying.
*/
- if (hstate_is_gigantic(h))
+ if (folio_order(folio) > MAX_PAGE_ORDER)
return -ENOMEM;
if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list))
ret = 0;
else if (!folio_ref_count(folio))
- ret = alloc_and_dissolve_hugetlb_folio(h, folio, list);
+ ret = alloc_and_dissolve_hugetlb_folio(folio, list);
return ret;
}
@@ -2941,7 +2910,6 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
*/
int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
{
- struct hstate *h;
struct folio *folio;
int ret = 0;
@@ -2949,16 +2917,10 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
while (start_pfn < end_pfn) {
folio = pfn_folio(start_pfn);
- if (folio_test_hugetlb(folio)) {
- h = folio_hstate(folio);
- } else {
- start_pfn++;
- continue;
- }
- if (!folio_ref_count(folio)) {
- ret = alloc_and_dissolve_hugetlb_folio(h, folio,
- &isolate_list);
+		/* Avoid needlessly taking hugetlb_lock on the normal path. */
+ if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) {
+ ret = alloc_and_dissolve_hugetlb_folio(folio, &isolate_list);
if (ret)
break;
@@ -3010,7 +2972,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct folio *folio;
- long retval, gbl_chg;
+ long retval, gbl_chg, gbl_reserve;
map_chg_state map_chg;
int ret, idx;
struct hugetlb_cgroup *h_cg = NULL;
@@ -3163,8 +3125,15 @@ out_uncharge_cgroup_reservation:
hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
h_cg);
out_subpool_put:
- if (map_chg)
- hugepage_subpool_put_pages(spool, 1);
+ /*
+	 * Put the page back to the subpool iff the subpool's rsv_hpages
+	 * quota was used during hugepage_subpool_get_pages().
+ */
+ if (map_chg && !gbl_chg) {
+ gbl_reserve = hugepage_subpool_put_pages(spool, 1);
+ hugetlb_acct_memory(h, -gbl_reserve);
+ }
+
out_end_reservation:
if (map_chg != MAP_CHG_ENFORCED)
vma_end_reservation(h, vma, addr);
@@ -3237,7 +3207,8 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
}
/* allocate from next node when distributing huge pages */
- for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_ONLINE]) {
+ for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node,
+ &hugetlb_bootmem_nodes) {
m = alloc_bootmem(h, node, false);
if (!m)
return 0;
@@ -3327,8 +3298,8 @@ static void __init hugetlb_bootmem_init_migratetype(struct folio *folio,
if (folio_test_hugetlb_cma(folio))
init_cma_pageblock(folio_page(folio, i));
else
- set_pageblock_migratetype(folio_page(folio, i),
- MIGRATE_MOVABLE);
+ init_pageblock_migratetype(folio_page(folio, i),
+ MIGRATE_MOVABLE, false);
}
}
@@ -3701,6 +3672,15 @@ static void __init hugetlb_init_hstates(void)
struct hstate *h, *h2;
for_each_hstate(h) {
+ /*
+ * Always reset to first_memory_node here, even if
+ * next_nid_to_alloc was set before - we can't
+ * reference hugetlb_bootmem_nodes after init, and
+ * first_memory_node is right for all further allocations.
+ */
+ h->next_nid_to_alloc = first_memory_node;
+ h->next_nid_to_free = first_memory_node;
+
/* oversize hugepages were init'ed in early boot */
if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
@@ -3740,10 +3720,10 @@ static void __init report_hugepages(void)
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
- buf, h->free_huge_pages);
+ buf, h->nr_huge_pages);
if (nrinvalid)
pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n",
- buf, nrinvalid, nrinvalid > 1 ? "s" : "");
+ buf, nrinvalid, str_plural(nrinvalid));
pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
}
@@ -3825,6 +3805,7 @@ found:
static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
nodemask_t *nodes_allowed)
{
+ unsigned long persistent_free_count;
unsigned long min_count;
unsigned long allocated;
struct folio *folio;
@@ -3959,8 +3940,24 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* though, we'll note that we're not allowed to exceed surplus
* and won't grow the pool anywhere else. Not until one of the
* sysctls are changed, or the surplus pages go out of use.
+ *
+	 * min_count is the expected number of persistent pages; we
+	 * shouldn't compute it as
+	 * resv_huge_pages + persistent_huge_pages() - free_huge_pages,
+	 * because free surplus huge pages may exist and would then be
+	 * subtracted twice. Free surplus huge pages come from HVO
+	 * failing to restore vmemmap, see comments in the callers of
+	 * hugetlb_vmemmap_restore_folio(). Thus, calculate the
+	 * persistent free count first.
*/
- min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
+ persistent_free_count = h->free_huge_pages;
+ if (h->free_huge_pages > persistent_huge_pages(h)) {
+ if (h->free_huge_pages > h->surplus_huge_pages)
+ persistent_free_count -= h->surplus_huge_pages;
+ else
+ persistent_free_count = 0;
+ }
+ min_count = h->resv_huge_pages + persistent_huge_pages(h) - persistent_free_count;
min_count = max(count, min_count);
try_to_free_low(h, min_count, nodes_allowed);
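
As a worked example of the double subtraction described in the comment above (numbers are illustrative): say nr_huge_pages = 4 and surplus_huge_pages = 3, where all three surplus pages sit on the free list because HVO could not restore their vmemmap, the single persistent page is also free, and resv_huge_pages = 0, so free_huge_pages = 4. Plugging free_huge_pages into resv_huge_pages + persistent_huge_pages() - free_huge_pages counts the free surplus pages twice (once by excluding them from the persistent count, once as free pages) and underflows as 0 + 1 - 4, yielding a huge unsigned min_count that prevents the pool from shrinking at all. With the code above, free_huge_pages (4) exceeds persistent_huge_pages() (1), so persistent_free_count = 4 - 3 = 1 and min_count = 0 + 1 - 1 = 0, allowing the persistent pool to shrink as requested.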
@@ -4017,10 +4014,13 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
list_for_each_entry_safe(folio, next, src_list, lru) {
int i;
+ bool cma;
if (folio_test_hugetlb_vmemmap_optimized(folio))
continue;
+ cma = folio_test_hugetlb_cma(folio);
+
list_del(&folio->lru);
split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst));
@@ -4036,6 +4036,9 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
new_folio->mapping = NULL;
init_new_hugetlb_folio(dst, new_folio);
+ /* Copy the CMA flag so that it is freed correctly */
+ if (cma)
+ folio_set_hugetlb_cma(new_folio);
list_add(&new_folio->lru, &dst_list);
}
}
@@ -4630,7 +4633,7 @@ static void __init hugetlb_sysfs_init(void)
err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
hstate_kobjs, &hstate_attr_group);
if (err)
- pr_err("HugeTLB: Unable to add hstate %s", h->name);
+ pr_err("HugeTLB: Unable to add hstate %s\n", h->name);
}
#ifdef CONFIG_NUMA
@@ -4990,6 +4993,20 @@ static int __init default_hugepagesz_setup(char *s)
}
hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup);
+void __init hugetlb_bootmem_set_nodes(void)
+{
+ int i, nid;
+ unsigned long start_pfn, end_pfn;
+
+ if (!nodes_empty(hugetlb_bootmem_nodes))
+ return;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ if (end_pfn > start_pfn)
+ node_set(nid, hugetlb_bootmem_nodes);
+ }
+}
+
static bool __hugetlb_bootmem_allocated __initdata;
bool __init hugetlb_bootmem_allocated(void)
@@ -5005,6 +5022,8 @@ void __init hugetlb_bootmem_alloc(void)
if (__hugetlb_bootmem_allocated)
return;
+ hugetlb_bootmem_set_nodes();
+
for (i = 0; i < MAX_NUMNODES; i++)
INIT_LIST_HEAD(&huge_boot_pages[i]);
@@ -5012,7 +5031,6 @@ void __init hugetlb_bootmem_alloc(void)
for_each_hstate(h) {
h->next_nid_to_alloc = first_online_node;
- h->next_nid_to_free = first_online_node;
if (hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
@@ -5179,7 +5197,7 @@ static const struct ctl_table hugetlb_table[] = {
},
};
-static void hugetlb_sysctl_init(void)
+static void __init hugetlb_sysctl_init(void)
{
register_sysctl_init("vm", hugetlb_table);
}
@@ -5387,26 +5405,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
{
if (addr & ~(huge_page_mask(hstate_vma(vma))))
return -EINVAL;
+ return 0;
+}
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
/*
* PMD sharing is only possible for PUD_SIZE-aligned address ranges
* in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
* split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+ * This function is called in the middle of a VMA split operation, with
+ * MM, VMA and rmap all write-locked to prevent concurrent page table
+ * walks (except hardware and gup_fast()).
*/
+ vma_assert_write_locked(vma);
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
if (addr & ~PUD_MASK) {
- /*
- * hugetlb_vm_op_split is called right before we attempt to
- * split the VMA. We will need to unshare PMDs in the old and
- * new VMAs, so let's unshare before we split.
- */
unsigned long floor = addr & PUD_MASK;
unsigned long ceil = floor + PUD_SIZE;
- if (floor >= vma->vm_start && ceil <= vma->vm_end)
- hugetlb_unshare_pmds(vma, floor, ceil);
+ if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+ /*
+ * Locking:
+ * Use take_locks=false here.
+ * The file rmap lock is already held.
+ * The hugetlb VMA lock can't be taken when we already
+ * hold the file rmap lock, and we don't need it because
+ * its purpose is to synchronize against concurrent page
+ * table walks, which are not possible thanks to the
+ * locks held by our caller.
+ */
+ hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+ }
}
-
- return 0;
}
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -5441,18 +5473,16 @@ const struct vm_operations_struct hugetlb_vm_ops = {
.pagesize = hugetlb_vm_op_pagesize,
};
-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio,
bool try_mkwrite)
{
- pte_t entry;
+ pte_t entry = folio_mk_pte(folio, vma->vm_page_prot);
unsigned int shift = huge_page_shift(hstate_vma(vma));
if (try_mkwrite && (vma->vm_flags & VM_WRITE)) {
- entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
- vma->vm_page_prot)));
+ entry = pte_mkwrite_novma(pte_mkdirty(entry));
} else {
- entry = huge_pte_wrprotect(mk_huge_pte(page,
- vma->vm_page_prot));
+ entry = pte_wrprotect(entry);
}
entry = pte_mkyoung(entry);
entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
@@ -5507,7 +5537,7 @@ static void
hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
struct folio *new_folio, pte_t old, unsigned long sz)
{
- pte_t newpte = make_huge_pte(vma, &new_folio->page, true);
+ pte_t newpte = make_huge_pte(vma, new_folio, true);
__folio_mark_uptodate(new_folio);
hugetlb_add_new_anon_rmap(new_folio, vma, addr);
@@ -5811,14 +5841,14 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- struct page *ref_page, zap_flags_t zap_flags)
+ struct folio *folio, zap_flags_t zap_flags)
{
struct mm_struct *mm = vma->vm_mm;
+ const bool folio_provided = !!folio;
unsigned long address;
pte_t *ptep;
pte_t pte;
spinlock_t *ptl;
- struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
bool adjust_reservation = false;
@@ -5882,14 +5912,13 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
continue;
}
- page = pte_page(pte);
/*
- * If a reference page is supplied, it is because a specific
- * page is being unmapped, not a range. Ensure the page we
- * are about to unmap is the actual page of interest.
+ * If a folio is supplied, it is because a specific
+ * folio is being unmapped, not a range. Ensure the folio we
+ * are about to unmap is the actual folio of interest.
*/
- if (ref_page) {
- if (page != ref_page) {
+ if (folio_provided) {
+ if (folio != page_folio(pte_page(pte))) {
spin_unlock(ptl);
continue;
}
@@ -5899,12 +5928,14 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* looking like data was lost
*/
set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
+ } else {
+ folio = page_folio(pte_page(pte));
}
pte = huge_ptep_get_and_clear(mm, address, ptep, sz);
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
- set_page_dirty(page);
+ folio_mark_dirty(folio);
/* Leave a uffd-wp pte marker if needed */
if (huge_pte_uffd_wp(pte) &&
!(zap_flags & ZAP_FLAG_DROP_MARKER))
@@ -5912,7 +5943,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
make_pte_marker(PTE_MARKER_UFFD_WP),
sz);
hugetlb_count_sub(pages_per_huge_page(h), mm);
- hugetlb_remove_rmap(page_folio(page));
+ hugetlb_remove_rmap(folio);
/*
* Restore the reservation for anonymous page, otherwise the
@@ -5921,8 +5952,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* reservation bit.
*/
if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
- folio_test_anon(page_folio(page))) {
- folio_set_hugetlb_restore_reserve(page_folio(page));
+ folio_test_anon(folio)) {
+ folio_set_hugetlb_restore_reserve(folio);
/* Reservation to be adjusted after the spin lock */
adjust_reservation = true;
}
@@ -5946,16 +5977,17 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* count will not be incremented by free_huge_folio.
* Act as if we consumed the reservation.
*/
- folio_clear_hugetlb_restore_reserve(page_folio(page));
+ folio_clear_hugetlb_restore_reserve(folio);
else if (rc)
vma_add_reservation(h, vma, address);
}
- tlb_remove_page_size(tlb, page, huge_page_size(h));
+ tlb_remove_page_size(tlb, folio_page(folio, 0),
+ folio_size(folio));
/*
- * Bail out after unmapping reference page if supplied
+ * If we were instructed to unmap a specific folio, we're done.
*/
- if (ref_page)
+ if (folio_provided)
break;
}
tlb_end_vma(tlb, vma);
@@ -6017,7 +6049,7 @@ void __hugetlb_zap_end(struct vm_area_struct *vma,
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page,
+ unsigned long end, struct folio *folio,
zap_flags_t zap_flags)
{
struct mmu_notifier_range range;
@@ -6029,7 +6061,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
mmu_notifier_invalidate_range_start(&range);
tlb_gather_mmu(&tlb, vma->vm_mm);
- __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
+ __unmap_hugepage_range(&tlb, vma, start, end,
+ folio, zap_flags);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
@@ -6042,7 +6075,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
* same region.
*/
static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page *page, unsigned long address)
+ struct folio *folio, unsigned long address)
{
struct hstate *h = hstate_vma(vma);
struct vm_area_struct *iter_vma;
@@ -6086,7 +6119,8 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
unmap_hugepage_range(iter_vma, address,
- address + huge_page_size(h), page, 0);
+ address + huge_page_size(h),
+ folio, 0);
}
i_mmap_unlock_write(mapping);
}
@@ -6097,8 +6131,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* cannot race with other handlers or page migration.
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
-static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
- struct vm_fault *vmf)
+static vm_fault_t hugetlb_wp(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
@@ -6160,16 +6193,17 @@ retry_avoidcopy:
PageAnonExclusive(&old_folio->page), &old_folio->page);
/*
- * If the process that created a MAP_PRIVATE mapping is about to
- * perform a COW due to a shared page count, attempt to satisfy
- * the allocation without using the existing reserves. The pagecache
- * page is used to determine if the reserve at this address was
- * consumed or not. If reserves were used, a partial faulted mapping
- * at the time of fork() could consume its reserves on COW instead
- * of the full address range.
+ * If the process that created a MAP_PRIVATE mapping is about to perform
+ * a COW due to a shared page count, attempt to satisfy the allocation
+ * without using the existing reserves.
+ * In order to determine whether this is a COW on a MAP_PRIVATE mapping, it
+ * is enough to check whether the old_folio is anonymous, which means that
+ * the reserve for this address was consumed. If reserves were used, a
+ * partially faulted mapping at the time of fork() could consume its reserves
+ * on COW instead of the full address range.
*/
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
- old_folio != pagecache_folio)
+ folio_test_anon(old_folio))
cow_from_owner = true;
folio_get(old_folio);
@@ -6209,8 +6243,7 @@ retry_avoidcopy:
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- unmap_ref_private(mm, vma, &old_folio->page,
- vmf->address);
+ unmap_ref_private(mm, vma, old_folio, vmf->address);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
hugetlb_vma_lock_read(vma);
@@ -6257,7 +6290,7 @@ retry_avoidcopy:
spin_lock(vmf->ptl);
vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h));
if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) {
- pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);
+ pte_t newpte = make_huge_pte(vma, new_folio, !unshare);
/* Break COW or unshare */
huge_ptep_clear_flush(vma, vmf->address, vmf->pte);
@@ -6373,16 +6406,16 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned
static vm_fault_t hugetlb_no_page(struct address_space *mapping,
struct vm_fault *vmf)
{
+ u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);
+ bool new_folio, new_anon_folio = false;
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
struct hstate *h = hstate_vma(vma);
vm_fault_t ret = VM_FAULT_SIGBUS;
- int anon_rmap = 0;
- unsigned long size;
+ bool folio_locked = true;
struct folio *folio;
+ unsigned long size;
pte_t new_pte;
- bool new_folio, new_pagecache_folio = false;
- u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);
/*
* Currently, we are forced to kill the process in the event the
@@ -6481,10 +6514,9 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
ret = VM_FAULT_SIGBUS;
goto out;
}
- new_pagecache_folio = true;
} else {
+ new_anon_folio = true;
folio_lock(folio);
- anon_rmap = 1;
}
} else {
/*
@@ -6533,11 +6565,11 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
if (!pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), vmf->orig_pte))
goto backout;
- if (anon_rmap)
+ if (new_anon_folio)
hugetlb_add_new_anon_rmap(folio, vma, vmf->address);
else
hugetlb_add_file_rmap(folio);
- new_pte = make_huge_pte(vma, &folio->page, vma->vm_flags & VM_SHARED);
+ new_pte = make_huge_pte(vma, folio, vma->vm_flags & VM_SHARED);
/*
* If this pte was previously wr-protected, keep it wr-protected even
* if populated.
@@ -6548,8 +6580,16 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+ /*
+ * No need to keep file folios locked. See comment in
+ * hugetlb_fault().
+ */
+ if (!new_anon_folio) {
+ folio_locked = false;
+ folio_unlock(folio);
+ }
/* Optimization, do the COW without a second fault */
- ret = hugetlb_wp(folio, vmf);
+ ret = hugetlb_wp(vmf);
}
spin_unlock(vmf->ptl);
@@ -6562,7 +6602,8 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
if (new_folio)
folio_set_hugetlb_migratable(folio);
- folio_unlock(folio);
+ if (folio_locked)
+ folio_unlock(folio);
out:
hugetlb_vma_unlock_read(vma);
@@ -6579,7 +6620,8 @@ out:
backout:
spin_unlock(vmf->ptl);
backout_unlocked:
- if (new_folio && !new_pagecache_folio)
+ /* We only need to restore reservations for private mappings */
+ if (new_anon_folio)
restore_reserve_on_error(h, vma, vmf->address, folio);
folio_unlock(folio);
@@ -6617,10 +6659,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vm_fault_t ret;
u32 hash;
struct folio *folio = NULL;
- struct folio *pagecache_folio = NULL;
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
- int need_wait_lock = 0;
+ bool need_wait_lock = false;
struct vm_fault vmf = {
.vma = vma,
.address = address & huge_page_mask(h),
@@ -6686,15 +6727,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ret = 0;
- /*
- * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this
- * point, so this check prevents the kernel from going below assuming
- * that we have an active hugepage in pagecache. This goto expects
- * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned)
- * check will properly handle it.
- */
+ /* Not present, either a migration or a hwpoisoned entry */
if (!pte_present(vmf.orig_pte)) {
- if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) {
+ if (is_hugetlb_entry_migration(vmf.orig_pte)) {
/*
* Release the hugetlb fault lock now, but retain
* the vma lock, because it is needed to guard the
@@ -6705,7 +6740,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
migration_entry_wait_huge(vma, vmf.address, vmf.pte);
return 0;
- } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte)))
+ } else if (is_hugetlb_entry_hwpoisoned(vmf.orig_pte))
ret = VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
goto out_mutex;
@@ -6715,8 +6750,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* If we are going to COW/unshare the mapping later, we examine the
* pending reservations for this page now. This will ensure that any
* allocations necessary to record that reservation occur outside the
- * spinlock. Also lookup the pagecache page now as it is used to
- * determine if a reservation has been consumed.
+ * spinlock.
*/
if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
!(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) {
@@ -6726,11 +6760,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, vmf.address);
-
- pagecache_folio = filemap_lock_hugetlb_folio(h, mapping,
- vmf.pgoff);
- if (IS_ERR(pagecache_folio))
- pagecache_folio = NULL;
}
vmf.ptl = huge_pte_lock(h, mm, vmf.pte);
@@ -6744,10 +6773,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
(flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) {
if (!userfaultfd_wp_async(vma)) {
spin_unlock(vmf.ptl);
- if (pagecache_folio) {
- folio_unlock(pagecache_folio);
- folio_put(pagecache_folio);
- }
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
return handle_userfault(&vmf, VM_UFFD_WP);
@@ -6759,24 +6784,24 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* Fallthrough to CoW */
}
- /*
- * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and
- * pagecache_folio, so here we need take the former one
- * when folio != pagecache_folio or !pagecache_folio.
- */
- folio = page_folio(pte_page(vmf.orig_pte));
- if (folio != pagecache_folio)
- if (!folio_trylock(folio)) {
- need_wait_lock = 1;
- goto out_ptl;
- }
-
- folio_get(folio);
-
if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
if (!huge_pte_write(vmf.orig_pte)) {
- ret = hugetlb_wp(pagecache_folio, &vmf);
- goto out_put_page;
+ /*
+ * Anonymous folios need to be locked since hugetlb_wp()
+ * checks whether we can re-use the folio exclusively
+ * for us in case we are the only user of it.
+ */
+ folio = page_folio(pte_page(vmf.orig_pte));
+ if (folio_test_anon(folio) && !folio_trylock(folio)) {
+ need_wait_lock = true;
+ goto out_ptl;
+ }
+ folio_get(folio);
+ ret = hugetlb_wp(&vmf);
+ if (folio_test_anon(folio))
+ folio_unlock(folio);
+ folio_put(folio);
+ goto out_ptl;
} else if (likely(flags & FAULT_FLAG_WRITE)) {
vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
}
@@ -6785,17 +6810,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
flags & FAULT_FLAG_WRITE))
update_mmu_cache(vma, vmf.address, vmf.pte);
-out_put_page:
- if (folio != pagecache_folio)
- folio_unlock(folio);
- folio_put(folio);
out_ptl:
spin_unlock(vmf.ptl);
-
- if (pagecache_folio) {
- folio_unlock(pagecache_folio);
- folio_put(pagecache_folio);
- }
out_mutex:
hugetlb_vma_unlock_read(vma);
@@ -6808,11 +6824,16 @@ out_mutex:
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
/*
- * Generally it's safe to hold refcount during waiting page lock. But
- * here we just wait to defer the next page fault to avoid busy loop and
- * the page is not used after unlocked before returning from the current
- * page fault. So we are safe from accessing freed page, even if we wait
- * here without taking refcount.
+ * hugetlb_wp drops all the locks except the folio lock before trying to
+ * unmap the folio from other processes. During that window, if another
+ * process mapping that folio faults in, it will take the mutex and then
+ * wait on the folio lock, causing an ABBA deadlock.
+ * Use trylock instead and bail out if we fail.
+ *
+ * Ideally, we would hold a refcount on the folio we wait for. However, we
+ * do not want to use the folio after it becomes unlocked, but rather just
+ * wait for it to become unlocked, so hopefully the next fault succeeds on
+ * the trylock.
*/
if (need_wait_lock)
folio_wait_locked(folio);
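A minimal sketch (not part of the patch) of the trylock-then-wait-outside pattern the comment above describes; the function itself is hypothetical and written as if it sat next to the code above, so headers are omitted:

static vm_fault_t example_fault_locking(struct folio *folio, struct mutex *fault_mutex)
{
        bool need_wait_lock = false;

        mutex_lock(fault_mutex);
        /*
         * Sleeping on the folio lock here could deadlock with another process
         * that already holds the folio lock and is waiting for fault_mutex.
         */
        if (!folio_trylock(folio)) {
                need_wait_lock = true;
                goto out;
        }
        /* ... handle the write-protect fault under the folio lock ... */
        folio_unlock(folio);
out:
        mutex_unlock(fault_mutex);
        /* Wait without holding the mutex, then let the fault be retried. */
        if (need_wait_lock)
                folio_wait_locked(folio);
        return 0;
}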
@@ -7022,7 +7043,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
* For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
* with wp flag set, don't set pte write bit.
*/
- _dst_pte = make_huge_pte(dst_vma, &folio->page,
+ _dst_pte = make_huge_pte(dst_vma, folio,
!wp_enabled && !(is_continue && !vm_shared));
/*
* Always mark UFFDIO_COPY page dirty; note that this may not be
@@ -7132,11 +7153,11 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
/* Nothing to do. */
} else if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
- struct page *page = pfn_swap_entry_to_page(entry);
+ struct folio *folio = pfn_swap_entry_folio(entry);
pte_t newpte = pte;
if (is_writable_migration_entry(entry)) {
- if (PageAnon(page))
+ if (folio_test_anon(folio))
entry = make_readable_exclusive_migration_entry(
swp_offset(entry));
else
@@ -7210,13 +7231,20 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
return pages > 0 ? (pages << h->order) : pages;
}
-/* Return true if reservation was successful, false otherwise. */
-bool hugetlb_reserve_pages(struct inode *inode,
+/*
+ * Update the reservation map for the range [from, to].
+ *
+ * Returns the number of entries that would be added to the reservation map
+ * associated with the range [from, to]. This number is greater than or equal
+ * to zero. -EINVAL or -ENOMEM is returned in case of any errors.
+ */
+
+long hugetlb_reserve_pages(struct inode *inode,
long from, long to,
struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- long chg = -1, add = -1;
+ long chg = -1, add = -1, spool_resv, gbl_resv;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
@@ -7226,7 +7254,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
/* This should never happen */
if (from > to) {
VM_WARN(1, "%s called with a negative range\n", __func__);
- return false;
+ return -EINVAL;
}
/*
@@ -7241,7 +7269,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
* without using reserves
*/
if (vm_flags & VM_NORESERVE)
- return true;
+ return 0;
/*
* Shared mappings base their reservation on the number of pages that
@@ -7348,11 +7376,19 @@ bool hugetlb_reserve_pages(struct inode *inode,
hugetlb_cgroup_put_rsvd_cgroup(h_cg);
}
}
- return true;
+ return chg;
out_put_pages:
- /* put back original number of pages, chg */
- (void)hugepage_subpool_put_pages(spool, chg);
+ spool_resv = chg - gbl_reserve;
+ if (spool_resv) {
+ /* put sub pool's reservation back, chg - gbl_reserve */
+ gbl_resv = hugepage_subpool_put_pages(spool, spool_resv);
+ /*
+ * subpool's reserved pages can not be put back due to race,
+ * return to hstate.
+ */
+ hugetlb_acct_memory(h, -gbl_resv);
+ }
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), h_cg);
@@ -7368,7 +7404,7 @@ out_err:
kref_put(&resv_map->refs, resv_map_release);
set_vma_resv_map(vma, NULL);
}
- return false;
+ return chg < 0 ? chg : add < 0 ? add : -EINVAL;
}
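Since hugetlb_reserve_pages() now returns a long instead of a bool, callers check for a negative errno rather than for false. A minimal sketch of a hypothetical caller under the new convention (the function name and error handling are invented for illustration, written as if it lived next to the code above so headers are omitted):

static int example_setup_hugetlb_reservation(struct inode *inode,
                                             struct vm_area_struct *vma,
                                             long from, long to)
{
        /* >= 0: number of reservation map entries that would be added. */
        long chg = hugetlb_reserve_pages(inode, from, to, vma, vma->vm_flags);

        if (chg < 0)
                return chg;     /* -EINVAL or -ENOMEM, propagate to the caller */

        /* Reservation succeeded; "chg" can be used for accounting if needed. */
        return 0;
}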
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
@@ -7423,8 +7459,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
unsigned long s_end = sbase + PUD_SIZE;
/* Allow segments to share if only one is marked locked */
- unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
- unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
+ vm_flags_t vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
+ vm_flags_t svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
/*
* match the virtual addresses, permission and the alignment of the
@@ -7567,6 +7603,13 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
pud_clear(pud);
+ /*
+ * Once our caller drops the rmap lock, some other process might be
+ * using this page table as a normal, non-hugetlb page table.
+ * Wait for pending gup_fast() in other threads to finish before letting
+ * that happen.
+ */
+ tlb_remove_table_sync_one();
ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
mm_dec_nr_pmds(mm);
return 1;
@@ -7792,7 +7835,7 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
struct hstate *h = folio_hstate(old_folio);
hugetlb_cgroup_migrate(old_folio, new_folio);
- set_page_owner_migrate_reason(&new_folio->page, reason);
+ folio_set_owner_migrate_reason(new_folio, reason);
/*
* transfer temporary state of the new hugetlb folio. This is
@@ -7837,9 +7880,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
spin_unlock_irq(&hugetlb_lock);
}
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool take_locks)
{
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
@@ -7863,8 +7913,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
- hugetlb_vma_lock_write(vma);
- i_mmap_lock_write(vma->vm_file->f_mapping);
+ if (take_locks) {
+ hugetlb_vma_lock_write(vma);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ } else {
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+ }
for (address = start; address < end; address += PUD_SIZE) {
ptep = hugetlb_walk(vma, address, sz);
if (!ptep)
@@ -7874,8 +7928,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
spin_unlock(ptl);
}
flush_hugetlb_tlb_range(vma, start, end);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
- hugetlb_vma_unlock_write(vma);
+ if (take_locks) {
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
+ }
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
* Documentation/mm/mmu_notifier.rst.
@@ -7890,5 +7946,20 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
- ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+ ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+ /* take_locks = */ true);
+}
+
+/*
+ * For hugetlb, mremap() is an odd edge case - while the VMA copying is
+ * performed, we permit both the old and new VMAs to reference the same
+ * reservation.
+ *
+ * We fix this up after the operation succeeds, or if a newly allocated VMA
+ * is closed as a result of a failure to allocate memory.
+ */
+void fixup_hugetlb_reservations(struct vm_area_struct *vma)
+{
+ if (is_vm_hugetlb_page(vma))
+ clear_vma_resv_huge_pages(vma);
}
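The new take_locks parameter encodes two calling conventions for hugetlb_unshare_pmds(). A minimal sketch of both, with hypothetical wrapper names, written as if it sat next to the code above:

/* Caller holds no relevant locks: let the helper take and drop them. */
static void example_unshare_unlocked(struct vm_area_struct *vma)
{
        hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
                             ALIGN_DOWN(vma->vm_end, PUD_SIZE),
                             /* take_locks = */ true);
}

/*
 * Caller already holds the file rmap lock for write (as at the call site
 * earlier in this patch): taking the hugetlb VMA lock here would invert the
 * lock order, and it is not needed because the caller's locks already exclude
 * concurrent page table walks.
 */
static void example_unshare_locked(struct vm_area_struct *vma,
                                   unsigned long floor, unsigned long ceil)
{
        i_mmap_assert_write_locked(vma->vm_file->f_mapping);
        hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
}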
diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c
index e0f2d5c3a84c..f58ef4969e7a 100644
--- a/mm/hugetlb_cma.c
+++ b/mm/hugetlb_cma.c
@@ -66,7 +66,7 @@ hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, bool node_exact)
if (node_exact)
return NULL;
- for_each_online_node(node) {
+ for_each_node_mask(node, hugetlb_bootmem_nodes) {
cma = hugetlb_cma[node];
if (!cma || node == *nid)
continue;
@@ -153,11 +153,13 @@ void __init hugetlb_cma_reserve(int order)
if (!hugetlb_cma_size)
return;
+ hugetlb_bootmem_set_nodes();
+
for (nid = 0; nid < MAX_NUMNODES; nid++) {
if (hugetlb_cma_size_in_node[nid] == 0)
continue;
- if (!node_online(nid)) {
+ if (!node_isset(nid, hugetlb_bootmem_nodes)) {
pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
hugetlb_cma_size_in_node[nid] = 0;
@@ -190,13 +192,14 @@ void __init hugetlb_cma_reserve(int order)
* If 3 GB area is requested on a machine with 4 numa nodes,
* let's allocate 1 GB on first three nodes and ignore the last one.
*/
- per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
+ per_node = DIV_ROUND_UP(hugetlb_cma_size,
+ nodes_weight(hugetlb_bootmem_nodes));
pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
}
reserved = 0;
- for_each_online_node(nid) {
+ for_each_node_mask(nid, hugetlb_bootmem_nodes) {
int res;
char name[CMA_MAX_NAME];
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 9a99dfa3c495..ba0fb1b6a5a8 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -166,7 +166,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
VM_BUG_ON(!PAGE_ALIGNED(start | end));
mmap_read_lock(&init_mm);
- ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
+ ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops,
NULL, walk);
mmap_read_unlock(&init_mm);
if (ret)
@@ -238,11 +238,11 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
* struct page, the special metadata (e.g. page->flags or page->mapping)
* cannot copy to the tail struct page structs. The invalid value will be
* checked in the free_tail_page_prepare(). In order to avoid the message
- * of "corrupted mapping in tail page". We need to reset at least 3 (one
- * head struct page struct and two tail struct page structs) struct page
+ * of "corrupted mapping in tail page". We need to reset at least 4 (one
+ * head struct page struct and three tail struct page structs) struct page
* structs.
*/
-#define NR_RESET_STRUCT_PAGE 3
+#define NR_RESET_STRUCT_PAGE 4
static inline void reset_struct_pages(struct page *start)
{
diff --git a/mm/internal.h b/mm/internal.h
index 50c2f590b2d0..1da16d550a45 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -149,7 +149,7 @@ static inline void *folio_raw_mapping(const struct folio *folio)
{
unsigned long mapping = (unsigned long)folio->mapping;
- return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
+ return (void *)(mapping & ~FOLIO_MAPPING_FLAGS);
}
/*
@@ -164,7 +164,7 @@ static inline void *folio_raw_mapping(const struct folio *folio)
*/
static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
{
- int err = call_mmap(file, vma);
+ int err = vfs_mmap(file, vma);
if (likely(!err))
return 0;
@@ -202,109 +202,126 @@ static inline void vma_close(struct vm_area_struct *vma)
/* Flags for folio_pte_batch(). */
typedef int __bitwise fpb_t;
-/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */
-#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0))
+/* Compare PTEs respecting the dirty bit. */
+#define FPB_RESPECT_DIRTY ((__force fpb_t)BIT(0))
-/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */
-#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1))
+/* Compare PTEs respecting the soft-dirty bit. */
+#define FPB_RESPECT_SOFT_DIRTY ((__force fpb_t)BIT(1))
+
+/* Compare PTEs respecting the writable bit. */
+#define FPB_RESPECT_WRITE ((__force fpb_t)BIT(2))
+
+/*
+ * Merge PTE write bits: if any PTE in the batch is writable, modify the
+ * PTE at @ptentp to be writable.
+ */
+#define FPB_MERGE_WRITE ((__force fpb_t)BIT(3))
+
+/*
+ * Merge PTE young and dirty bits: if any PTE in the batch is young or dirty,
+ * modify the PTE at @ptentp to be young or dirty, respectively.
+ */
+#define FPB_MERGE_YOUNG_DIRTY ((__force fpb_t)BIT(4))
static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
{
- if (flags & FPB_IGNORE_DIRTY)
+ if (!(flags & FPB_RESPECT_DIRTY))
pte = pte_mkclean(pte);
- if (likely(flags & FPB_IGNORE_SOFT_DIRTY))
+ if (likely(!(flags & FPB_RESPECT_SOFT_DIRTY)))
pte = pte_clear_soft_dirty(pte);
- return pte_wrprotect(pte_mkold(pte));
+ if (likely(!(flags & FPB_RESPECT_WRITE)))
+ pte = pte_wrprotect(pte);
+ return pte_mkold(pte);
}
/**
- * folio_pte_batch - detect a PTE batch for a large folio
+ * folio_pte_batch_flags - detect a PTE batch for a large folio
* @folio: The large folio to detect a PTE batch for.
- * @addr: The user virtual address the first page is mapped at.
- * @start_ptep: Page table pointer for the first entry.
- * @pte: Page table entry for the first page.
+ * @vma: The VMA. Only relevant with FPB_MERGE_WRITE, otherwise can be NULL.
+ * @ptep: Page table pointer for the first entry.
+ * @ptentp: Pointer to a COPY of the first page table entry whose flags this
+ * function updates based on @flags if appropriate.
* @max_nr: The maximum number of table entries to consider.
* @flags: Flags to modify the PTE batch semantics.
- * @any_writable: Optional pointer to indicate whether any entry except the
- * first one is writable.
- * @any_young: Optional pointer to indicate whether any entry except the
- * first one is young.
- * @any_dirty: Optional pointer to indicate whether any entry except the
- * first one is dirty.
*
* Detect a PTE batch: consecutive (present) PTEs that map consecutive
- * pages of the same large folio.
+ * pages of the same large folio in a single VMA and a single page table.
*
* All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
- * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and
- * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY).
+ * the accessed bit, writable bit, dirty bit (unless FPB_RESPECT_DIRTY is set)
+ * and soft-dirty bit (unless FPB_RESPECT_SOFT_DIRTY is set).
+ *
+ * @ptep must map any page of the folio. max_nr must be at least one and
+ * must be limited by the caller so scanning cannot exceed a single VMA and
+ * a single page table.
+ *
+ * Depending on the FPB_MERGE_* flags, the pte stored at @ptentp will
+ * be updated: it's crucial that a pointer to a COPY of the first
+ * page table entry, obtained through ptep_get(), is provided as @ptentp.
*
- * start_ptep must map any page of the folio. max_nr must be at least one and
- * must be limited by the caller so scanning cannot exceed a single page table.
+ * This function will be inlined to optimize based on the input parameters;
+ * consider using folio_pte_batch() instead if applicable.
*
* Return: the number of table entries in the batch.
*/
-static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
- pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
- bool *any_writable, bool *any_young, bool *any_dirty)
+static inline unsigned int folio_pte_batch_flags(struct folio *folio,
+ struct vm_area_struct *vma, pte_t *ptep, pte_t *ptentp,
+ unsigned int max_nr, fpb_t flags)
{
- unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
- const pte_t *end_ptep = start_ptep + max_nr;
- pte_t expected_pte, *ptep;
- bool writable, young, dirty;
- int nr;
-
- if (any_writable)
- *any_writable = false;
- if (any_young)
- *any_young = false;
- if (any_dirty)
- *any_dirty = false;
+ bool any_writable = false, any_young = false, any_dirty = false;
+ pte_t expected_pte, pte = *ptentp;
+ unsigned int nr, cur_nr;
VM_WARN_ON_FOLIO(!pte_present(pte), folio);
VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
+ /*
+ * Ensure this is a pointer to a copy not a pointer into a page table.
+ * If this is a stack value, it won't be a valid virtual address, but
+ * that's fine because it also cannot be pointing into the page table.
+ */
+ VM_WARN_ON(virt_addr_valid(ptentp) && PageTable(virt_to_page(ptentp)));
- nr = pte_batch_hint(start_ptep, pte);
+ /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
+ max_nr = min_t(unsigned long, max_nr,
+ folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte));
+
+ nr = pte_batch_hint(ptep, pte);
expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
- ptep = start_ptep + nr;
+ ptep = ptep + nr;
- while (ptep < end_ptep) {
+ while (nr < max_nr) {
pte = ptep_get(ptep);
- if (any_writable)
- writable = !!pte_write(pte);
- if (any_young)
- young = !!pte_young(pte);
- if (any_dirty)
- dirty = !!pte_dirty(pte);
- pte = __pte_batch_clear_ignored(pte, flags);
- if (!pte_same(pte, expected_pte))
- break;
-
- /*
- * Stop immediately once we reached the end of the folio. In
- * corner cases the next PFN might fall into a different
- * folio.
- */
- if (pte_pfn(pte) >= folio_end_pfn)
+ if (!pte_same(__pte_batch_clear_ignored(pte, flags), expected_pte))
break;
- if (any_writable)
- *any_writable |= writable;
- if (any_young)
- *any_young |= young;
- if (any_dirty)
- *any_dirty |= dirty;
-
- nr = pte_batch_hint(ptep, pte);
- expected_pte = pte_advance_pfn(expected_pte, nr);
- ptep += nr;
+ if (flags & FPB_MERGE_WRITE)
+ any_writable |= pte_write(pte);
+ if (flags & FPB_MERGE_YOUNG_DIRTY) {
+ any_young |= pte_young(pte);
+ any_dirty |= pte_dirty(pte);
+ }
+
+ cur_nr = pte_batch_hint(ptep, pte);
+ expected_pte = pte_advance_pfn(expected_pte, cur_nr);
+ ptep += cur_nr;
+ nr += cur_nr;
}
- return min(ptep - start_ptep, max_nr);
+ if (any_writable)
+ *ptentp = pte_mkwrite(*ptentp, vma);
+ if (any_young)
+ *ptentp = pte_mkyoung(*ptentp);
+ if (any_dirty)
+ *ptentp = pte_mkdirty(*ptentp);
+
+ return min(nr, max_nr);
}
+unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
+ unsigned int max_nr);
+
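A minimal sketch of a hypothetical caller of folio_pte_batch_flags(), showing the intended calling convention: the PTE is copied out of the page table with ptep_get(), and the copy is passed by pointer so FPB_MERGE_YOUNG_DIRTY can fold the batch's young/dirty bits back into it. All names outside the interface are invented; the caller is assumed to hold the PTE lock and to have limited max_nr (at least 1) to a single VMA and page table.

static unsigned int example_scan_one_batch(struct vm_area_struct *vma,
                                           unsigned long addr, pte_t *ptep,
                                           unsigned int max_nr, bool *any_young)
{
        pte_t pte = ptep_get(ptep);     /* work on a COPY, as required */
        struct folio *folio;
        unsigned int nr;

        *any_young = false;
        if (!pte_present(pte))
                return 1;

        folio = vm_normal_folio(vma, addr, pte);
        if (!folio || !folio_test_large(folio))
                return 1;

        /*
         * Batch consecutive PTEs mapping consecutive pages of this folio;
         * young/dirty bits of the whole batch are merged into the local copy.
         */
        nr = folio_pte_batch_flags(folio, vma, ptep, &pte, max_nr,
                                   FPB_MERGE_YOUNG_DIRTY);
        *any_young = pte_young(pte);
        return nr;
}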
/**
* pte_move_swp_offset - Move the swap entry offset field of a swap pte
* forward or backward by delta
@@ -435,11 +452,13 @@ void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct zap_details *details);
+void zap_page_range_single_batched(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr,
+ unsigned long size, struct zap_details *details);
int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
gfp_t gfp);
-void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
- unsigned int order);
+void page_cache_ra_order(struct readahead_control *, struct file_ra_state *);
void force_page_cache_ra(struct readahead_control *, unsigned long nr);
static inline void force_page_cache_readahead(struct address_space *mapping,
struct file *file, pgoff_t index, unsigned long nr_to_read)
@@ -519,6 +538,16 @@ extern unsigned long highest_memmap_pfn;
bool folio_isolate_lru(struct folio *folio);
void folio_putback_lru(struct folio *folio);
extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
+#ifdef CONFIG_NUMA
+int user_proactive_reclaim(char *buf,
+ struct mem_cgroup *memcg, pg_data_t *pgdat);
+#else
+static inline int user_proactive_reclaim(char *buf,
+ struct mem_cgroup *memcg, pg_data_t *pgdat)
+{
+ return 0;
+}
+#endif
/*
* in mm/rmap.c:
@@ -823,7 +852,8 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
int nid, bool exact_nid);
void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
- unsigned long, enum meminit_context, struct vmem_altmap *, int);
+ unsigned long, enum meminit_context, struct vmem_altmap *, int,
+ bool);
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -915,7 +945,7 @@ static inline void init_cma_pageblock(struct page *page)
int find_suitable_fallback(struct free_area *area, unsigned int order,
- int migratetype, bool claim_only, bool *claim_block);
+ int migratetype, bool claimable);
static inline bool free_area_empty(struct free_area *area, int migratetype)
{
@@ -931,7 +961,7 @@ extern long populate_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *locked);
extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
unsigned long end, bool write, int *locked);
-extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
+extern bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags,
unsigned long bytes);
/*
@@ -1121,6 +1151,8 @@ DECLARE_STATIC_KEY_TRUE(deferred_pages);
bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+void init_deferred_page(unsigned long pfn, int nid);
+
enum mminit_level {
MMINIT_WARNING,
MMINIT_VERIFY,
@@ -1227,7 +1259,6 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long);
extern void set_pageblock_order(void);
-struct folio *alloc_migrate_folio(struct folio *src, unsigned long private);
unsigned long reclaim_pages(struct list_head *folio_list);
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
struct list_head *folio_list);
@@ -1360,7 +1391,7 @@ int migrate_device_coherent_folio(struct folio *folio);
struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long shift,
- unsigned long flags, unsigned long start,
+ vm_flags_t vm_flags, unsigned long start,
unsigned long end, int node, gfp_t gfp_mask,
const void *caller);
@@ -1605,6 +1636,9 @@ static inline void accept_page(struct page *page)
int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
void *private);
+int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ pgd_t *pgd, void *private);
/* pt_reclaim.c */
bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval);
@@ -1624,5 +1658,7 @@ static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
}
#endif /* CONFIG_PT_RECLAIM */
+void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm);
+int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm);
#endif /* __MM_INTERNAL_H */
diff --git a/mm/io-mapping.c b/mm/io-mapping.c
index 01b362799930..d3586e95c12c 100644
--- a/mm/io-mapping.c
+++ b/mm/io-mapping.c
@@ -21,9 +21,10 @@ int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags))
return -EINVAL;
- /* We rely on prevalidation of the io-mapping to skip track_pfn(). */
- return remap_pfn_range_notrack(vma, addr, pfn, size,
- __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
- (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)));
+ pgprot_t remap_prot = __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
+ (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK));
+
+ /* We rely on prevalidation of the io-mapping to skip pfnmap tracking. */
+ return remap_pfn_range_notrack(vma, addr, pfn, size, remap_prot);
}
EXPORT_SYMBOL_GPL(io_mapping_map_user);
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 1a958e7c8a46..dd93ae8a6beb 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -35,7 +35,7 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
-CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) $(call cc-disable-warning, vla)
+CFLAGS_KASAN_TEST := $(CFLAGS_KASAN)
ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
# If compiler instruments memintrinsics by prefixing them with __asan/__hwasan,
# we need to treat them normally (as builtins), otherwise the compiler won't
@@ -44,6 +44,7 @@ ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
CFLAGS_KASAN_TEST += -fno-builtin
endif
+CFLAGS_REMOVE_kasan_test_c.o += $(call cc-option, -Wvla-larger-than=1)
CFLAGS_kasan_test_c.o := $(CFLAGS_KASAN_TEST)
RUSTFLAGS_kasan_test_rust.o := $(RUSTFLAGS_KASAN)
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index 59d673400085..2aa12dfa427a 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -1073,14 +1073,11 @@ static void kmem_cache_rcu_uaf(struct kunit *test)
kmem_cache_destroy(cache);
}
-static void empty_cache_ctor(void *object) { }
-
static void kmem_cache_double_destroy(struct kunit *test)
{
struct kmem_cache *cache;
- /* Provide a constructor to prevent cache merging. */
- cache = kmem_cache_create("test_cache", 200, 0, 0, empty_cache_ctor);
+ cache = kmem_cache_create("test_cache", 200, 0, SLAB_NO_MERGE, NULL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
kmem_cache_destroy(cache);
KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_destroy(cache));
@@ -1570,6 +1567,7 @@ static void kasan_memcmp(struct kunit *test)
static void kasan_strings(struct kunit *test)
{
char *ptr;
+ char *src;
size_t size = 24;
/*
@@ -1581,6 +1579,25 @@ static void kasan_strings(struct kunit *test)
ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ src = kmalloc(KASAN_GRANULE_SIZE, GFP_KERNEL | __GFP_ZERO);
+ strscpy(src, "f0cacc1a0000000", KASAN_GRANULE_SIZE);
+
+ /*
+ * Make sure that strscpy() does not trigger KASAN if it overreads into
+ * poisoned memory.
+ *
+ * The expected size does not include the terminator '\0'
+ * so it is (KASAN_GRANULE_SIZE - 2) ==
+ * KASAN_GRANULE_SIZE - ("initial removed character" + "\0").
+ */
+ KUNIT_EXPECT_EQ(test, KASAN_GRANULE_SIZE - 2,
+ strscpy(ptr, src + 1, KASAN_GRANULE_SIZE));
+
+ /* strscpy should fail if the first byte is unreadable. */
+ KUNIT_EXPECT_KASAN_FAIL(test, strscpy(ptr, src + KASAN_GRANULE_SIZE,
+ KASAN_GRANULE_SIZE));
+
+ kfree(src);
kfree(ptr);
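A worked instance of the size arithmetic in the comment above, assuming the generic-mode granule size of 8 bytes (an assumption for illustration; the reasoning is the same for other granule sizes):

/*
 * With KASAN_GRANULE_SIZE == 8:
 *   strscpy(src, "f0cacc1a0000000", 8) stores "f0cacc1" plus the '\0',
 *   so src + 1 points at the 6-character string "0cacc1".
 *   strscpy(ptr, src + 1, 8) copies it fully and returns 6, which is
 *   KASAN_GRANULE_SIZE - 2: the granule minus the skipped first character
 *   and the terminator.
 */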
/*
@@ -1960,6 +1977,11 @@ static void rust_uaf(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test, kasan_test_rust_uaf());
}
+/*
+ * copy_to_kernel_nofault() is an internal helper that is only available to
+ * this test when kasan_test is built-in, so the test must not be visible to
+ * loadable modules.
+ */
+#ifndef MODULE
static void copy_to_kernel_nofault_oob(struct kunit *test)
{
char *ptr;
@@ -1994,6 +2016,7 @@ static void copy_to_kernel_nofault_oob(struct kunit *test)
kfree(ptr);
}
+#endif /* !MODULE */
static void copy_user_test_oob(struct kunit *test)
{
@@ -2114,7 +2137,9 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(match_all_not_assigned),
KUNIT_CASE(match_all_ptr_tag),
KUNIT_CASE(match_all_mem_tag),
+#ifndef MODULE
KUNIT_CASE(copy_to_kernel_nofault_oob),
+#endif
KUNIT_CASE(rust_uaf),
KUNIT_CASE(copy_user_test_oob),
{}
@@ -2130,4 +2155,5 @@ static struct kunit_suite kasan_kunit_test_suite = {
kunit_test_suite(kasan_kunit_test_suite);
+MODULE_DESCRIPTION("KUnit tests for checking KASAN bug-detection capabilities");
MODULE_LICENSE("GPL");
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 8357e1a33699..62c01b4527eb 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -370,36 +370,6 @@ static inline bool init_task_stack_addr(const void *addr)
sizeof(init_thread_union.stack));
}
-/*
- * This function is invoked with report_lock (a raw_spinlock) held. A
- * PREEMPT_RT kernel cannot call find_vm_area() as it will acquire a sleeping
- * rt_spinlock.
- *
- * For !RT kernel, the PROVE_RAW_LOCK_NESTING config option will print a
- * lockdep warning for this raw_spinlock -> spinlock dependency. This config
- * option is enabled by default to ensure better test coverage to expose this
- * kind of RT kernel problem. This lockdep splat, however, can be suppressed
- * by using DEFINE_WAIT_OVERRIDE_MAP() if it serves a useful purpose and the
- * invalid PREEMPT_RT case has been taken care of.
- */
-static inline struct vm_struct *kasan_find_vm_area(void *addr)
-{
- static DEFINE_WAIT_OVERRIDE_MAP(vmalloc_map, LD_WAIT_SLEEP);
- struct vm_struct *va;
-
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- return NULL;
-
- /*
- * Suppress lockdep warning and fetch vmalloc area of the
- * offending address.
- */
- lock_map_acquire_try(&vmalloc_map);
- va = find_vm_area(addr);
- lock_map_release(&vmalloc_map);
- return va;
-}
-
static void print_address_description(void *addr, u8 tag,
struct kasan_report_info *info)
{
@@ -429,19 +399,10 @@ static void print_address_description(void *addr, u8 tag,
}
if (is_vmalloc_addr(addr)) {
- struct vm_struct *va = kasan_find_vm_area(addr);
-
- if (va) {
- pr_err("The buggy address belongs to the virtual mapping at\n"
- " [%px, %px) created by:\n"
- " %pS\n",
- va->addr, va->addr + va->size, va->caller);
- pr_err("\n");
-
- page = vmalloc_to_page(addr);
- } else {
- pr_err("The buggy address %px belongs to a vmalloc virtual mapping\n", addr);
- }
+ pr_err("The buggy address belongs to a");
+ if (!vmalloc_dump_obj(addr))
+ pr_cont(" vmalloc virtual mapping\n");
+ page = vmalloc_to_page(addr);
}
if (page) {
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 88d1c9dcb507..d2c70cd2afb1 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -292,33 +292,99 @@ void __init __weak kasan_populate_early_vm_area_shadow(void *start,
{
}
+struct vmalloc_populate_data {
+ unsigned long start;
+ struct page **pages;
+};
+
static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
- void *unused)
+ void *_data)
{
- unsigned long page;
+ struct vmalloc_populate_data *data = _data;
+ struct page *page;
pte_t pte;
+ int index;
if (likely(!pte_none(ptep_get(ptep))))
return 0;
- page = __get_free_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
-
- __memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
- pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
+ index = PFN_DOWN(addr - data->start);
+ page = data->pages[index];
+ __memset(page_to_virt(page), KASAN_VMALLOC_INVALID, PAGE_SIZE);
+ pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL);
spin_lock(&init_mm.page_table_lock);
if (likely(pte_none(ptep_get(ptep)))) {
set_pte_at(&init_mm, addr, ptep, pte);
- page = 0;
+ data->pages[index] = NULL;
}
spin_unlock(&init_mm.page_table_lock);
- if (page)
- free_page(page);
+
+ return 0;
+}
+
+static void ___free_pages_bulk(struct page **pages, int nr_pages)
+{
+ int i;
+
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i]) {
+ __free_pages(pages[i], 0);
+ pages[i] = NULL;
+ }
+ }
+}
+
+static int ___alloc_pages_bulk(struct page **pages, int nr_pages)
+{
+ unsigned long nr_populated, nr_total = nr_pages;
+ struct page **page_array = pages;
+
+ while (nr_pages) {
+ nr_populated = alloc_pages_bulk(GFP_KERNEL, nr_pages, pages);
+ if (!nr_populated) {
+ ___free_pages_bulk(page_array, nr_total - nr_pages);
+ return -ENOMEM;
+ }
+ pages += nr_populated;
+ nr_pages -= nr_populated;
+ }
+
return 0;
}
+static int __kasan_populate_vmalloc(unsigned long start, unsigned long end)
+{
+ unsigned long nr_pages, nr_total = PFN_UP(end - start);
+ struct vmalloc_populate_data data;
+ int ret = 0;
+
+ data.pages = (struct page **)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ if (!data.pages)
+ return -ENOMEM;
+
+ while (nr_total) {
+ nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0]));
+ ret = ___alloc_pages_bulk(data.pages, nr_pages);
+ if (ret)
+ break;
+
+ data.start = start;
+ ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE,
+ kasan_populate_vmalloc_pte, &data);
+ ___free_pages_bulk(data.pages, nr_pages);
+ if (ret)
+ break;
+
+ start += nr_pages * PAGE_SIZE;
+ nr_total -= nr_pages;
+ }
+
+ free_page((unsigned long)data.pages);
+
+ return ret;
+}
+
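The new helpers above rely on apply_to_page_range() passing an opaque pointer through to the per-PTE callback; a minimal sketch of that pattern with hypothetical names (the install step is elided), written as if it sat next to the code above:

struct example_populate_ctx {
        unsigned long start;
        struct page **pages;
};

static int example_pte_cb(pte_t *ptep, unsigned long addr, void *data)
{
        struct example_populate_ctx *ctx = data;
        struct page *page = ctx->pages[PFN_DOWN(addr - ctx->start)];

        /* ... install "page" at "addr" if the PTE is still pte_none() ... */
        (void)page;
        return 0;
}

static int example_populate_range(unsigned long start, unsigned long size,
                                  struct example_populate_ctx *ctx)
{
        ctx->start = start;
        return apply_to_page_range(&init_mm, start, size, example_pte_cb, ctx);
}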
int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
{
unsigned long shadow_start, shadow_end;
@@ -348,9 +414,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
shadow_start = PAGE_ALIGN_DOWN(shadow_start);
shadow_end = PAGE_ALIGN(shadow_end);
- ret = apply_to_page_range(&init_mm, shadow_start,
- shadow_end - shadow_start,
- kasan_populate_vmalloc_pte, NULL);
+ ret = __kasan_populate_vmalloc(shadow_start, shadow_end);
if (ret)
return ret;
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 102048821c22..0ed3be100963 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -605,8 +605,8 @@ static unsigned long kfence_init_pool(void)
pages = virt_to_page(__kfence_pool);
/*
- * Set up object pages: they must have PG_slab set, to avoid freeing
- * these as real pages.
+ * Set up object pages: they must have PGTY_slab set to avoid freeing
+ * them as real pages.
*
* We also want to avoid inserting kfence_free() in the kfree()
* fast-path in SLUB, and therefore need to ensure kfree() correctly
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index cc945c6ab3bd..a55fb1dcd224 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -347,7 +347,7 @@ struct attribute_group khugepaged_attr_group = {
#endif /* CONFIG_SYSFS */
int hugepage_madvise(struct vm_area_struct *vma,
- unsigned long *vm_flags, int advice)
+ vm_flags_t *vm_flags, int advice)
{
switch (advice) {
case MADV_HUGEPAGE:
@@ -470,7 +470,7 @@ void __khugepaged_enter(struct mm_struct *mm)
}
void khugepaged_enter_vma(struct vm_area_struct *vma,
- unsigned long vm_flags)
+ vm_flags_t vm_flags)
{
if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
hugepage_pmd_enabled()) {
@@ -548,19 +548,6 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
}
}
-static bool is_refcount_suitable(struct folio *folio)
-{
- int expected_refcount = folio_mapcount(folio);
-
- if (!folio_test_anon(folio) || folio_test_swapcache(folio))
- expected_refcount += folio_nr_pages(folio);
-
- if (folio_test_private(folio))
- expected_refcount++;
-
- return folio_ref_count(folio) == expected_refcount;
-}
-
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte,
@@ -652,7 +639,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* but not from this process. The other process cannot write to
* the page, only trigger CoW.
*/
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
folio_unlock(folio);
result = SCAN_PAGE_COUNT;
goto out;
@@ -696,13 +683,13 @@ next:
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
- trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
+ trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
referenced, writable, result);
return result;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
- trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
+ trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
referenced, writable, result);
return result;
}
@@ -746,7 +733,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
ptep_clear(vma->vm_mm, address, _pte);
folio_remove_rmap_pte(src, src_page, vma);
spin_unlock(ptl);
- free_page_and_swap_cache(src_page);
+ free_folio_and_swap_cache(src);
}
}
@@ -954,12 +941,18 @@ static inline int check_pmd_state(pmd_t *pmd)
if (pmd_none(pmde))
return SCAN_PMD_NONE;
+
+ /*
+ * The folio may be under migration when khugepaged is trying to
+ * collapse it. Migration success or failure will eventually end
+ * up with a present PMD mapping a folio again.
+ */
+ if (is_pmd_migration_entry(pmde))
+ return SCAN_PMD_MAPPED;
if (!pmd_present(pmde))
return SCAN_PMD_NULL;
if (pmd_trans_huge(pmde))
return SCAN_PMD_MAPPED;
- if (pmd_devmap(pmde))
- return SCAN_PMD_NULL;
if (pmd_bad(pmde))
return SCAN_PMD_NULL;
return SCAN_SUCCEED;
@@ -1239,7 +1232,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
__folio_mark_uptodate(folio);
pgtable = pmd_pgtable(_pmd);
- _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+ _pmd = folio_mk_pmd(folio, vma->vm_page_prot);
_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
spin_lock(pmd_ptl);
@@ -1402,7 +1395,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
* has excessive GUP pins (i.e. 512). Anyway the same check
* will be done again later the risk seems low.
*/
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
@@ -1435,7 +1428,7 @@ out_unmap:
*mmap_locked = false;
}
out:
- trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
+ trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced,
none_or_zero, result, unmapped);
return result;
}
@@ -1464,10 +1457,9 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
}
}
-#ifdef CONFIG_SHMEM
-/* hpage must be locked, and mmap_lock must be held */
+/* folio must be locked, and mmap_lock must be held */
static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmdp, struct page *hpage)
+ pmd_t *pmdp, struct folio *folio, struct page *page)
{
struct vm_fault vmf = {
.vma = vma,
@@ -1476,13 +1468,12 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
.pmd = pmdp,
};
- VM_BUG_ON(!PageTransHuge(hpage));
mmap_assert_locked(vma->vm_mm);
- if (do_set_pmd(&vmf, hpage))
+ if (do_set_pmd(&vmf, folio, page))
return SCAN_FAIL;
- get_page(hpage);
+ folio_get(folio);
return SCAN_SUCCEED;
}
@@ -1689,7 +1680,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
maybe_install_pmd:
/* step 5: install pmd entry */
result = install_pmd
- ? set_huge_pmd(vma, haddr, pmd, &folio->page)
+ ? set_huge_pmd(vma, haddr, pmd, folio, &folio->page)
: SCAN_SUCCEED;
goto drop_folio;
abort:
@@ -2295,6 +2286,17 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
continue;
}
+ if (!folio_try_get(folio)) {
+ xas_reset(&xas);
+ continue;
+ }
+
+ if (unlikely(folio != xas_reload(&xas))) {
+ folio_put(folio);
+ xas_reset(&xas);
+ continue;
+ }
+
if (folio_order(folio) == HPAGE_PMD_ORDER &&
folio->index == start) {
/* Maybe PMD-mapped */
@@ -2305,23 +2307,27 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
* it's safe to skip LRU and refcount checks before
* returning.
*/
+ folio_put(folio);
break;
}
node = folio_nid(folio);
if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
+ folio_put(folio);
break;
}
cc->node_load[node]++;
if (!folio_test_lru(folio)) {
result = SCAN_PAGE_LRU;
+ folio_put(folio);
break;
}
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
+ folio_put(folio);
break;
}
@@ -2333,6 +2339,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
*/
present += folio_nr_pages(folio);
+ folio_put(folio);
if (need_resched()) {
xas_pause(&xas);
@@ -2354,14 +2361,6 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
return result;
}
-#else
-static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
- struct file *file, pgoff_t start,
- struct collapse_control *cc)
-{
- BUILD_BUG();
-}
-#endif
static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
struct collapse_control *cc)
@@ -2437,7 +2436,7 @@ skip:
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
- if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) {
+ if (!vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
@@ -2736,8 +2735,8 @@ static int madvise_collapse_errno(enum scan_result r)
}
}
-int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, bool *lock_dropped)
{
struct collapse_control *cc;
struct mm_struct *mm = vma->vm_mm;
@@ -2748,8 +2747,6 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
BUG_ON(vma->vm_start > start);
BUG_ON(vma->vm_end < end);
- *prev = vma;
-
if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
return -EINVAL;
@@ -2783,7 +2780,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
mmap_assert_locked(mm);
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
- if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) {
+ if (!vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma, addr);
@@ -2797,7 +2794,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
&mmap_locked, cc);
}
if (!mmap_locked)
- *prev = NULL; /* Tell caller we dropped mmap_lock */
+ *lock_dropped = true;
handle_result:
switch (result) {
@@ -2807,7 +2804,6 @@ handle_result:
break;
case SCAN_PTE_MAPPED_HUGEPAGE:
BUG_ON(mmap_locked);
- BUG_ON(*prev);
mmap_read_lock(mm);
result = collapse_pte_mapped_thp(mm, addr, true);
mmap_read_unlock(mm);
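The hpage_collapse_scan_file() changes above take a transient reference while walking the xarray and re-check the slot afterwards; a minimal sketch of that folio_try_get()/xas_reload() pattern with a hypothetical helper name (the caller is assumed to hold the RCU read lock and to drop the reference with folio_put() when done):

static struct folio *example_get_stable_folio(struct xa_state *xas)
{
        struct folio *folio = xas_load(xas);

        if (!folio || xa_is_value(folio))
                return NULL;
        if (!folio_try_get(folio)) {
                /* The folio is being freed; restart this slot. */
                xas_reset(xas);
                return NULL;
        }
        if (unlikely(folio != xas_reload(xas))) {
                /* The slot changed under us; our reference is stale. */
                folio_put(folio);
                xas_reset(xas);
                return NULL;
        }
        return folio;
}

Once such a reference is held, the refcount check becomes folio_expected_ref_count(folio) + 1 != folio_ref_count(folio), accounting for the walker's own reference, as in the hunk above.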
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index c12cef3eeb32..8d588e685311 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -210,13 +210,11 @@ static struct kmem_cache *object_cache;
static struct kmem_cache *scan_area_cache;
/* set if tracing memory operations is enabled */
-static int kmemleak_enabled = 1;
+static int kmemleak_enabled __read_mostly = 1;
/* same as above but only for the kmemleak_free() callback */
-static int kmemleak_free_enabled = 1;
+static int kmemleak_free_enabled __read_mostly = 1;
/* set in the late_initcall if there were no errors */
static int kmemleak_late_initialized;
-/* set if a kmemleak warning was issued */
-static int kmemleak_warning;
/* set if a fatal kmemleak error has occurred */
static int kmemleak_error;
@@ -254,7 +252,6 @@ static void kmemleak_disable(void);
#define kmemleak_warn(x...) do { \
pr_warn(x); \
dump_stack(); \
- kmemleak_warning = 1; \
} while (0)
/*
@@ -325,8 +322,6 @@ static void hex_dump_object(struct seq_file *seq,
* sufficient references to it (count >= min_count)
* - black - ignore, it doesn't contain references (e.g. text section)
* (min_count == -1). No function defined for this color.
- * Newly created objects don't have any color assigned (object->count == -1)
- * before the next memory scan when they become white.
*/
static bool color_white(const struct kmemleak_object *object)
{
@@ -1252,6 +1247,20 @@ void __ref kmemleak_transient_leak(const void *ptr)
EXPORT_SYMBOL(kmemleak_transient_leak);
/**
+ * kmemleak_ignore_percpu - similar to kmemleak_ignore but taking a percpu
+ * address argument
+ * @ptr: percpu address of the object
+ */
+void __ref kmemleak_ignore_percpu(const void __percpu *ptr)
+{
+ pr_debug("%s(0x%px)\n", __func__, ptr);
+
+ if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr))
+ make_black_object((unsigned long)ptr, OBJECT_PERCPU);
+}
+EXPORT_SYMBOL_GPL(kmemleak_ignore_percpu);
+
+/**
* kmemleak_ignore - ignore an allocated object
* @ptr: pointer to beginning of the object
*
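A minimal usage sketch for the new kmemleak_ignore_percpu() helper; the per-CPU structure and init function are hypothetical:

struct example_pcpu_stats {
        u64 events;
};

static struct example_pcpu_stats __percpu *example_stats;

static int __init example_stats_init(void)
{
        example_stats = alloc_percpu(struct example_pcpu_stats);
        if (!example_stats)
                return -ENOMEM;
        /* Exclude this per-CPU object from leak scanning and reporting. */
        kmemleak_ignore_percpu(example_stats);
        return 0;
}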
diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c
index a495debf1436..1ea711786c52 100644
--- a/mm/kmsan/core.c
+++ b/mm/kmsan/core.c
@@ -159,8 +159,8 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id)
* Make sure we have enough spare bits in @id to hold the UAF bit and
* the chain depth.
*/
- BUILD_BUG_ON(
- (1 << STACK_DEPOT_EXTRA_BITS) <= (KMSAN_MAX_ORIGIN_DEPTH << 1));
+ BUILD_BUG_ON((1 << STACK_DEPOT_EXTRA_BITS) <=
+ (KMSAN_MAX_ORIGIN_DEPTH << 1));
extra_bits = stack_depot_get_extra_bits(id);
depth = kmsan_depth_from_eb(extra_bits);
@@ -274,11 +274,9 @@ void kmsan_internal_check_memory(void *addr, size_t size,
* bytes before, report them.
*/
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size,
cur_off_start, pos - 1, user_addr,
reason);
- kmsan_leave_runtime();
}
cur_origin = 0;
cur_off_start = -1;
@@ -292,11 +290,9 @@ void kmsan_internal_check_memory(void *addr, size_t size,
* poisoned bytes before, report them.
*/
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size,
cur_off_start, pos + i - 1,
user_addr, reason);
- kmsan_leave_runtime();
}
cur_origin = 0;
cur_off_start = -1;
@@ -312,11 +308,9 @@ void kmsan_internal_check_memory(void *addr, size_t size,
*/
if (cur_origin != new_origin) {
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size,
cur_off_start, pos + i - 1,
user_addr, reason);
- kmsan_leave_runtime();
}
cur_origin = new_origin;
cur_off_start = pos + i;
@@ -326,10 +320,8 @@ void kmsan_internal_check_memory(void *addr, size_t size,
}
KMSAN_WARN_ON(pos != size);
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size, cur_off_start, pos - 1,
user_addr, reason);
- kmsan_leave_runtime();
}
}
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 3df45c25c1f6..97de3d6194f0 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -114,9 +114,7 @@ void kmsan_kfree_large(const void *ptr)
kmsan_enter_runtime();
page = virt_to_head_page((void *)ptr);
KMSAN_WARN_ON(ptr != page_address(page));
- kmsan_internal_poison_memory((void *)ptr,
- page_size(page),
- GFP_KERNEL,
+ kmsan_internal_poison_memory((void *)ptr, page_size(page), GFP_KERNEL,
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
}
@@ -277,8 +275,10 @@ void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy,
* Don't check anything, just copy the shadow of the copied
* bytes.
*/
+ kmsan_enter_runtime();
kmsan_internal_memmove_metadata((void *)to, (void *)from,
to_copy - left);
+ kmsan_leave_runtime();
}
user_access_restore(ua_flags);
}
diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c
index 10f52c085e6c..b14ce3417e65 100644
--- a/mm/kmsan/init.c
+++ b/mm/kmsan/init.c
@@ -35,8 +35,7 @@ static void __init kmsan_record_future_shadow_range(void *start, void *end)
KMSAN_WARN_ON(future_index == NUM_FUTURE_RANGES);
KMSAN_WARN_ON((nstart >= nend) ||
/* Virtual address 0 is valid on s390. */
- (!IS_ENABLED(CONFIG_S390) && !nstart) ||
- !nend);
+ (!IS_ENABLED(CONFIG_S390) && !nstart) || !nend);
nstart = ALIGN_DOWN(nstart, PAGE_SIZE);
nend = ALIGN(nend, PAGE_SIZE);
diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c
index 02a405e55d6c..69f0a57a401c 100644
--- a/mm/kmsan/instrumentation.c
+++ b/mm/kmsan/instrumentation.c
@@ -312,13 +312,9 @@ EXPORT_SYMBOL(__msan_unpoison_alloca);
void __msan_warning(u32 origin);
void __msan_warning(u32 origin)
{
- if (!kmsan_enabled || kmsan_in_runtime())
- return;
- kmsan_enter_runtime();
kmsan_report(origin, /*address*/ NULL, /*size*/ 0,
/*off_first*/ 0, /*off_last*/ 0, /*user_addr*/ NULL,
REASON_ANY);
- kmsan_leave_runtime();
}
EXPORT_SYMBOL(__msan_warning);
diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h
index 29555a8bc315..bc3d1810f352 100644
--- a/mm/kmsan/kmsan.h
+++ b/mm/kmsan/kmsan.h
@@ -121,7 +121,6 @@ static __always_inline void kmsan_leave_runtime(void)
KMSAN_WARN_ON(--ctx->kmsan_in_runtime);
}
-depot_stack_handle_t kmsan_save_stack(void);
depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags,
unsigned int extra_bits);
diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c
index 9733a22c46c1..c6c5b2bbede0 100644
--- a/mm/kmsan/kmsan_test.c
+++ b/mm/kmsan/kmsan_test.c
@@ -732,3 +732,4 @@ kunit_test_suites(&kmsan_test_suite);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Alexander Potapenko <glider@google.com>");
+MODULE_DESCRIPTION("Test cases for KMSAN");
diff --git a/mm/kmsan/report.c b/mm/kmsan/report.c
index 94a3303fb65e..d6853ce08954 100644
--- a/mm/kmsan/report.c
+++ b/mm/kmsan/report.c
@@ -157,14 +157,14 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size,
unsigned long ua_flags;
bool is_uaf;
- if (!kmsan_enabled)
+ if (!kmsan_enabled || kmsan_in_runtime())
return;
if (current->kmsan_ctx.depth)
return;
if (!origin)
return;
- kmsan_disable_current();
+ kmsan_enter_runtime();
ua_flags = user_access_save();
raw_spin_lock(&kmsan_report_lock);
pr_err("=====================================================\n");
@@ -217,5 +217,5 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size,
if (panic_on_kmsan)
panic("kmsan.panic set ...\n");
user_access_restore(ua_flags);
- kmsan_enable_current();
+ kmsan_leave_runtime();
}
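kmsan_report() above now performs the enabled/in-runtime check and the enter/leave pairing itself, which is why the call sites in core.c and instrumentation.c shed their wrappers. An illustrative before/after of a call site, mirroring the deleted lines:

/* Before: the caller guarded and wrapped the call (see instrumentation.c). */
if (!kmsan_enabled || kmsan_in_runtime())
	return;
kmsan_enter_runtime();
kmsan_report(origin, addr, size, off_first, off_last, user_addr, reason);
kmsan_leave_runtime();

/* After: a bare call; the same check and pairing happen inside kmsan_report(). */
kmsan_report(origin, addr, size, off_first, off_last, user_addr, reason);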
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
index 1bb505a08415..54f3c3c962f0 100644
--- a/mm/kmsan/shadow.c
+++ b/mm/kmsan/shadow.c
@@ -207,8 +207,7 @@ void kmsan_free_page(struct page *page, unsigned int order)
if (!kmsan_enabled || kmsan_in_runtime())
return;
kmsan_enter_runtime();
- kmsan_internal_poison_memory(page_address(page),
- page_size(page),
+ kmsan_internal_poison_memory(page_address(page), page_size(page),
GFP_KERNEL,
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
@@ -248,17 +247,19 @@ int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
kmsan_enter_runtime();
mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot,
s_pages, page_shift);
+ kmsan_leave_runtime();
if (mapped) {
err = mapped;
goto ret;
}
+ kmsan_enter_runtime();
mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot,
o_pages, page_shift);
+ kmsan_leave_runtime();
if (mapped) {
err = mapped;
goto ret;
}
- kmsan_leave_runtime();
flush_tlb_kernel_range(shadow_start, shadow_end);
flush_tlb_kernel_range(origin_start, origin_end);
flush_cache_vmap(shadow_start, shadow_end);
diff --git a/mm/ksm.c b/mm/ksm.c
index 8583fb91ef13..160787bb121c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -677,28 +677,32 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_v
return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}
-static bool vma_ksm_compatible(struct vm_area_struct *vma)
+static bool ksm_compatible(const struct file *file, vm_flags_t vm_flags)
{
- if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP |
- VM_IO | VM_DONTEXPAND | VM_HUGETLB |
- VM_MIXEDMAP| VM_DROPPABLE))
+ if (vm_flags & (VM_SHARED | VM_MAYSHARE | VM_SPECIAL |
+ VM_HUGETLB | VM_DROPPABLE))
return false; /* just ignore the advice */
- if (vma_is_dax(vma))
+ if (file_is_dax(file))
return false;
#ifdef VM_SAO
- if (vma->vm_flags & VM_SAO)
+ if (vm_flags & VM_SAO)
return false;
#endif
#ifdef VM_SPARC_ADI
- if (vma->vm_flags & VM_SPARC_ADI)
+ if (vm_flags & VM_SPARC_ADI)
return false;
#endif
return true;
}
+static bool vma_ksm_compatible(struct vm_area_struct *vma)
+{
+ return ksm_compatible(vma->vm_file, vma->vm_flags);
+}
+
static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
unsigned long addr)
{
@@ -889,7 +893,7 @@ static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node,
unsigned long kpfn;
expected_mapping = (void *)((unsigned long)stable_node |
- PAGE_MAPPING_KSM);
+ FOLIO_MAPPING_KSM);
again:
kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
folio = pfn_folio(kpfn);
@@ -1066,7 +1070,7 @@ static inline void folio_set_stable_node(struct folio *folio,
struct ksm_stable_node *stable_node)
{
VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio);
- folio->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
+ folio->mapping = (void *)((unsigned long)stable_node | FOLIO_MAPPING_KSM);
}
#ifdef CONFIG_SYSFS
@@ -2696,14 +2700,17 @@ static int ksm_scan_thread(void *nothing)
return 0;
}
-static void __ksm_add_vma(struct vm_area_struct *vma)
+static bool __ksm_should_add_vma(const struct file *file, vm_flags_t vm_flags)
{
- unsigned long vm_flags = vma->vm_flags;
-
if (vm_flags & VM_MERGEABLE)
- return;
+ return false;
+
+ return ksm_compatible(file, vm_flags);
+}
- if (vma_ksm_compatible(vma))
+static void __ksm_add_vma(struct vm_area_struct *vma)
+{
+ if (__ksm_should_add_vma(vma->vm_file, vma->vm_flags))
vm_flags_set(vma, VM_MERGEABLE);
}
@@ -2724,16 +2731,22 @@ static int __ksm_del_vma(struct vm_area_struct *vma)
return 0;
}
/**
- * ksm_add_vma - Mark vma as mergeable if compatible
+ * ksm_vma_flags - Update VMA flags to mark as mergeable if compatible
+ *
+ * @mm: Proposed VMA's mm_struct
+ * @file: Proposed VMA's file-backed mapping, if any.
+ * @vm_flags: Proposed VMA's flags.
*
- * @vma: Pointer to vma
+ * Returns: @vm_flags possibly updated to mark mergeable.
*/
-void ksm_add_vma(struct vm_area_struct *vma)
+vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
+ vm_flags_t vm_flags)
{
- struct mm_struct *mm = vma->vm_mm;
+ if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) &&
+ __ksm_should_add_vma(file, vm_flags))
+ vm_flags |= VM_MERGEABLE;
- if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
- __ksm_add_vma(vma);
+ return vm_flags;
}
static void ksm_add_vmas(struct mm_struct *mm)
@@ -2827,7 +2840,7 @@ int ksm_disable(struct mm_struct *mm)
}
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, int advice, unsigned long *vm_flags)
+ unsigned long end, int advice, vm_flags_t *vm_flags)
{
struct mm_struct *mm = vma->vm_mm;
int err;
@@ -3669,10 +3682,10 @@ static ssize_t advisor_mode_show(struct kobject *kobj,
{
const char *output;
- if (ksm_advisor == KSM_ADVISOR_NONE)
- output = "[none] scan-time";
- else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
+ if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
output = "none [scan-time]";
+ else
+ output = "[none] scan-time";
return sysfs_emit(buf, "%s\n", output);
}
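ksm_add_vma() is replaced above by ksm_vma_flags(), which operates on proposed flags before any VMA exists. A sketch of the intended calling pattern (the surrounding mmap-path variables, including file, are assumptions; only the helper itself comes from this diff):

/* Computing flags for a mapping that has not been created yet. */
vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;

vm_flags = ksm_vma_flags(current->mm, file, vm_flags);
/*
 * VM_MERGEABLE is now set iff the process has MMF_VM_MERGE_ANY and the
 * proposed (file, vm_flags) pair is KSM-compatible, so the flag can be
 * taken into account when the VMA is created and merged.
 */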
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 490473af3122..ec48b5dadf51 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -60,30 +60,34 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
return &lru->node[nid].lru;
}
+static inline bool lock_list_lru(struct list_lru_one *l, bool irq)
+{
+ if (irq)
+ spin_lock_irq(&l->lock);
+ else
+ spin_lock(&l->lock);
+ if (unlikely(READ_ONCE(l->nr_items) == LONG_MIN)) {
+ if (irq)
+ spin_unlock_irq(&l->lock);
+ else
+ spin_unlock(&l->lock);
+ return false;
+ }
+ return true;
+}
+
static inline struct list_lru_one *
lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
bool irq, bool skip_empty)
{
struct list_lru_one *l;
- long nr_items;
rcu_read_lock();
again:
l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
- if (likely(l)) {
- if (irq)
- spin_lock_irq(&l->lock);
- else
- spin_lock(&l->lock);
- nr_items = READ_ONCE(l->nr_items);
- if (likely(nr_items != LONG_MIN)) {
- rcu_read_unlock();
- return l;
- }
- if (irq)
- spin_unlock_irq(&l->lock);
- else
- spin_unlock(&l->lock);
+ if (likely(l) && lock_list_lru(l, irq)) {
+ rcu_read_unlock();
+ return l;
}
/*
* Caller may simply bail out if raced with reparenting or
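The new lock_list_lru() helper above factors out the lock-and-revalidate step. A caller-side sketch (the list manipulation and the "list" field name are assumptions about struct list_lru_one; the LONG_MIN sentinel marking a torn-down per-memcg list is inferred from the surrounding code):

static bool add_item_if_alive(struct list_lru_one *l, struct list_head *item)
{
	if (!lock_list_lru(l, /* irq = */ false))
		return false;	/* raced with memcg reparenting: retry lookup */

	list_add_tail(item, &l->list);
	l->nr_items++;
	spin_unlock(&l->lock);
	return true;
}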
diff --git a/mm/maccess.c b/mm/maccess.c
index 8f0906180a94..486559d68858 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -82,7 +82,6 @@ Efault:
pagefault_enable();
return -EFAULT;
}
-EXPORT_SYMBOL_GPL(copy_to_kernel_nofault);
long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
{
@@ -196,7 +195,7 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
if (ret >= count) {
ret = count;
dst[ret - 1] = '\0';
- } else if (ret > 0) {
+ } else if (ret >= 0) {
ret++;
}
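The maccess change above alters the empty-string case of strncpy_from_user_nofault(): with ret > 0, copying an empty string returned 0 even though one byte (the NUL terminator) was written; with ret >= 0 the terminator is counted, matching the truncation path. A hypothetical caller illustrating the difference:

static long probe_user_string(const char __user *src)
{
	char buf[16];

	/* For an empty user string this returned 0 before the fix, 1 now. */
	return strncpy_from_user_nofault(buf, src, sizeof(buf));
}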
diff --git a/mm/madvise.c b/mm/madvise.c
index b17f684322ad..bb80fc5ea08f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -37,6 +37,8 @@
#include "internal.h"
#include "swap.h"
+#define __MADV_SET_ANON_VMA_NAME (-1)
+
/*
* Maximum number of attempts we make to install guard pages before we give up
* and return -ERESTARTNOINTR to have userspace try again.
@@ -48,34 +50,39 @@ struct madvise_walk_private {
bool pageout;
};
-/*
- * Any behaviour which results in changes to the vma->vm_flags needs to
- * take mmap_lock for writing. Others, which simply traverse vmas, need
- * to only take it for reading.
- */
-static int madvise_need_mmap_write(int behavior)
-{
- switch (behavior) {
- case MADV_REMOVE:
- case MADV_WILLNEED:
- case MADV_DONTNEED:
- case MADV_DONTNEED_LOCKED:
- case MADV_COLD:
- case MADV_PAGEOUT:
- case MADV_FREE:
- case MADV_POPULATE_READ:
- case MADV_POPULATE_WRITE:
- case MADV_COLLAPSE:
- case MADV_GUARD_INSTALL:
- case MADV_GUARD_REMOVE:
- return 0;
- default:
- /* be safe, default to 1. list exceptions explicitly */
- return 1;
- }
-}
+enum madvise_lock_mode {
+ MADVISE_NO_LOCK,
+ MADVISE_MMAP_READ_LOCK,
+ MADVISE_MMAP_WRITE_LOCK,
+ MADVISE_VMA_READ_LOCK,
+};
+
+struct madvise_behavior_range {
+ unsigned long start;
+ unsigned long end;
+};
+
+struct madvise_behavior {
+ struct mm_struct *mm;
+ int behavior;
+ struct mmu_gather *tlb;
+ enum madvise_lock_mode lock_mode;
+ struct anon_vma_name *anon_name;
+
+ /*
+ * The range over which the behaviour is currently being applied. If
+ * traversing multiple VMAs, this is updated for each.
+ */
+ struct madvise_behavior_range range;
+ /* The VMA and VMA preceding it (if applicable) currently targeted. */
+ struct vm_area_struct *prev;
+ struct vm_area_struct *vma;
+ bool lock_dropped;
+};
#ifdef CONFIG_ANON_VMA_NAME
+static int madvise_walk_vmas(struct madvise_behavior *madv_behavior);
+
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
struct anon_vma_name *anon_name;
@@ -101,7 +108,8 @@ void anon_vma_name_free(struct kref *kref)
struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
- mmap_assert_locked(vma->vm_mm);
+ if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
+ vma_assert_locked(vma);
return vma->anon_name;
}
@@ -137,40 +145,39 @@ static int replace_anon_vma_name(struct vm_area_struct *vma,
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
- * Update the vm_flags on region of a vma, splitting it or merging it as
- * necessary. Must be called with mmap_lock held for writing;
- * Caller should ensure anon_name stability by raising its refcount even when
- * anon_name belongs to a valid vma because this function might free that vma.
+ * Update the vm_flags or anon_name on region of a vma, splitting it or merging
+ * it as necessary. Must be called with mmap_lock held for writing.
*/
-static int madvise_update_vma(struct vm_area_struct *vma,
- struct vm_area_struct **prev, unsigned long start,
- unsigned long end, unsigned long new_flags,
- struct anon_vma_name *anon_name)
+static int madvise_update_vma(vm_flags_t new_flags,
+ struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
- int error;
- VMA_ITERATOR(vmi, mm, start);
-
- if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
- *prev = vma;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+ struct anon_vma_name *anon_name = madv_behavior->anon_name;
+ bool set_new_anon_name = madv_behavior->behavior == __MADV_SET_ANON_VMA_NAME;
+ VMA_ITERATOR(vmi, madv_behavior->mm, range->start);
+
+ if (new_flags == vma->vm_flags && (!set_new_anon_name ||
+ anon_vma_name_eq(anon_vma_name(vma), anon_name)))
return 0;
- }
- vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
- anon_name);
+ if (set_new_anon_name)
+ vma = vma_modify_name(&vmi, madv_behavior->prev, vma,
+ range->start, range->end, anon_name);
+ else
+ vma = vma_modify_flags(&vmi, madv_behavior->prev, vma,
+ range->start, range->end, new_flags);
+
if (IS_ERR(vma))
return PTR_ERR(vma);
- *prev = vma;
+ madv_behavior->vma = vma;
/* vm_flags is protected by the mmap_lock held in write mode. */
vma_start_write(vma);
vm_flags_reset(vma, new_flags);
- if (!vma->vm_file || vma_is_anon_shmem(vma)) {
- error = replace_anon_vma_name(vma, anon_name);
- if (error)
- return error;
- }
+ if (set_new_anon_name)
+ return replace_anon_vma_name(vma, anon_name);
return 0;
}
@@ -263,21 +270,27 @@ static void shmem_swapin_range(struct vm_area_struct *vma,
}
#endif /* CONFIG_SWAP */
+static void mark_mmap_lock_dropped(struct madvise_behavior *madv_behavior)
+{
+ VM_WARN_ON_ONCE(madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK);
+ madv_behavior->lock_dropped = true;
+}
+
/*
* Schedule all required I/O operations. Do not wait for completion.
*/
-static long madvise_willneed(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+static long madvise_willneed(struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct mm_struct *mm = madv_behavior->mm;
struct file *file = vma->vm_file;
+ unsigned long start = madv_behavior->range.start;
+ unsigned long end = madv_behavior->range.end;
loff_t offset;
- *prev = vma;
#ifdef CONFIG_SWAP
if (!file) {
- walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
+ walk_page_range_vma(vma, start, end, &swapin_walk_ops, vma);
lru_add_drain(); /* Push any new pages onto the LRU now */
return 0;
}
@@ -303,7 +316,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
* vma's reference to the file) can go away as soon as we drop
* mmap_lock.
*/
- *prev = NULL; /* tell sys_madvise we drop mmap_lock */
+ mark_mmap_lock_dropped(madv_behavior);
get_file(file);
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
@@ -331,14 +344,12 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma)
static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
struct folio *folio, pte_t *ptep,
- pte_t pte, bool *any_young,
- bool *any_dirty)
+ pte_t *ptentp)
{
- const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
int max_nr = (end - addr) / PAGE_SIZE;
- return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
- any_young, any_dirty);
+ return folio_pte_batch_flags(folio, NULL, ptep, ptentp, max_nr,
+ FPB_MERGE_YOUNG_DIRTY);
}
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
@@ -476,13 +487,7 @@ restart:
* next pte in the range.
*/
if (folio_test_large(folio)) {
- bool any_young;
-
- nr = madvise_folio_pte_batch(addr, end, folio, pte,
- ptent, &any_young, NULL);
- if (any_young)
- ptent = pte_mkyoung(ptent);
-
+ nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent);
if (nr < folio_nr_pages(folio)) {
int err;
@@ -503,6 +508,7 @@ restart:
pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;
@@ -567,16 +573,19 @@ static const struct mm_walk_ops cold_walk_ops = {
};
static void madvise_cold_page_range(struct mmu_gather *tlb,
- struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
+ struct madvise_behavior *madv_behavior)
+
{
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct madvise_behavior_range *range = &madv_behavior->range;
struct madvise_walk_private walk_private = {
.pageout = false,
.tlb = tlb,
};
tlb_start_vma(tlb, vma);
- walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+ walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops,
+ &walk_private);
tlb_end_vma(tlb, vma);
}
@@ -585,28 +594,25 @@ static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
}
-static long madvise_cold(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start_addr, unsigned long end_addr)
+static long madvise_cold(struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *vma = madv_behavior->vma;
struct mmu_gather tlb;
- *prev = vma;
if (!can_madv_lru_vma(vma))
return -EINVAL;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
- madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_gather_mmu(&tlb, madv_behavior->mm);
+ madvise_cold_page_range(&tlb, madv_behavior);
tlb_finish_mmu(&tlb);
return 0;
}
static void madvise_pageout_page_range(struct mmu_gather *tlb,
- struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
+ struct vm_area_struct *vma,
+ struct madvise_behavior_range *range)
{
struct madvise_walk_private walk_private = {
.pageout = true,
@@ -614,18 +620,16 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb,
};
tlb_start_vma(tlb, vma);
- walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+ walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops,
+ &walk_private);
tlb_end_vma(tlb, vma);
}
-static long madvise_pageout(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start_addr, unsigned long end_addr)
+static long madvise_pageout(struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
+ struct vm_area_struct *vma = madv_behavior->vma;
- *prev = vma;
if (!can_madv_lru_vma(vma))
return -EINVAL;
@@ -640,8 +644,8 @@ static long madvise_pageout(struct vm_area_struct *vma,
return 0;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
- madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_gather_mmu(&tlb, madv_behavior->mm);
+ madvise_pageout_page_range(&tlb, vma, &madv_behavior->range);
tlb_finish_mmu(&tlb);
return 0;
@@ -713,11 +717,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
* next pte in the range.
*/
if (folio_test_large(folio)) {
- bool any_young, any_dirty;
-
- nr = madvise_folio_pte_batch(addr, end, folio, pte,
- ptent, &any_young, &any_dirty);
-
+ nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent);
if (nr < folio_nr_pages(folio)) {
int err;
@@ -736,16 +736,12 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
start_pte = pte;
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;
continue;
}
-
- if (any_young)
- ptent = pte_mkyoung(ptent);
- if (any_dirty)
- ptent = pte_mkdirty(ptent);
}
if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
@@ -789,17 +785,31 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
-static const struct mm_walk_ops madvise_free_walk_ops = {
- .pmd_entry = madvise_free_pte_range,
- .walk_lock = PGWALK_RDLOCK,
-};
+static inline enum page_walk_lock get_walk_lock(enum madvise_lock_mode mode)
+{
+ switch (mode) {
+ case MADVISE_VMA_READ_LOCK:
+ return PGWALK_VMA_RDLOCK_VERIFY;
+ case MADVISE_MMAP_READ_LOCK:
+ return PGWALK_RDLOCK;
+ default:
+ /* Other modes don't require fixing up the walk_lock */
+ WARN_ON_ONCE(1);
+ return PGWALK_RDLOCK;
+ }
+}
-static int madvise_free_single_vma(struct vm_area_struct *vma,
- unsigned long start_addr, unsigned long end_addr)
+static int madvise_free_single_vma(struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mm_struct *mm = madv_behavior->mm;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ unsigned long start_addr = madv_behavior->range.start;
+ unsigned long end_addr = madv_behavior->range.end;
struct mmu_notifier_range range;
- struct mmu_gather tlb;
+ struct mmu_gather *tlb = madv_behavior->tlb;
+ struct mm_walk_ops walk_ops = {
+ .pmd_entry = madvise_free_pte_range,
+ };
/* MADV_FREE works for only anon vma at the moment */
if (!vma_is_anonymous(vma))
@@ -815,17 +825,15 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
range.start, range.end);
lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(&range);
- tlb_start_vma(&tlb, vma);
- walk_page_range(vma->vm_mm, range.start, range.end,
- &madvise_free_walk_ops, &tlb);
- tlb_end_vma(&tlb, vma);
+ tlb_start_vma(tlb, vma);
+ walk_ops.walk_lock = get_walk_lock(madv_behavior->lock_mode);
+ walk_page_range_vma(vma, range.start, range.end,
+ &walk_ops, tlb);
+ tlb_end_vma(tlb, vma);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb);
-
return 0;
}
@@ -848,23 +856,28 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
* An interface that causes the system to free clean pages and flush
* dirty pages is already available as msync(MS_INVALIDATE).
*/
-static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior)
+
{
+ struct madvise_behavior_range *range = &madv_behavior->range;
struct zap_details details = {
.reclaim_pt = true,
.even_cows = true,
};
- zap_page_range_single(vma, start, end - start, &details);
+ zap_page_range_single_batched(
+ madv_behavior->tlb, madv_behavior->vma, range->start,
+ range->end - range->start, &details);
return 0;
}
-static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
- unsigned long start,
- unsigned long *end,
- int behavior)
+static
+bool madvise_dontneed_free_valid_vma(struct madvise_behavior *madv_behavior)
{
+ struct vm_area_struct *vma = madv_behavior->vma;
+ int behavior = madv_behavior->behavior;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+
if (!is_vm_hugetlb_page(vma)) {
unsigned int forbidden = VM_PFNMAP;
@@ -876,7 +889,7 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
return false;
- if (start & ~huge_page_mask(hstate_vma(vma)))
+ if (range->start & ~huge_page_mask(hstate_vma(vma)))
return false;
/*
@@ -885,40 +898,38 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
* Avoid unexpected data loss by rounding down the number of
* huge pages freed.
*/
- *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));
+ range->end = ALIGN_DOWN(range->end, huge_page_size(hstate_vma(vma)));
return true;
}
-static long madvise_dontneed_free(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end,
- int behavior)
+static long madvise_dontneed_free(struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mm_struct *mm = madv_behavior->mm;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+ int behavior = madv_behavior->behavior;
- *prev = vma;
- if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
+ if (!madvise_dontneed_free_valid_vma(madv_behavior))
return -EINVAL;
- if (start == end)
+ if (range->start == range->end)
return 0;
- if (!userfaultfd_remove(vma, start, end)) {
- *prev = NULL; /* mmap_lock has been dropped, prev is stale */
+ if (!userfaultfd_remove(madv_behavior->vma, range->start, range->end)) {
+ struct vm_area_struct *vma;
+ mark_mmap_lock_dropped(madv_behavior);
mmap_read_lock(mm);
- vma = vma_lookup(mm, start);
+ madv_behavior->vma = vma = vma_lookup(mm, range->start);
if (!vma)
return -ENOMEM;
/*
* Potential end adjustment for hugetlb vma is OK as
* the check below keeps end within vma.
*/
- if (!madvise_dontneed_free_valid_vma(vma, start, &end,
- behavior))
+ if (!madvise_dontneed_free_valid_vma(madv_behavior))
return -EINVAL;
- if (end > vma->vm_end) {
+ if (range->end > vma->vm_end) {
/*
* Don't fail if end > vma->vm_end. If the old
* vma was split while the mmap_lock was
@@ -931,7 +942,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
* end-vma->vm_end range, but the manager can
* handle a repetition fine.
*/
- end = vma->vm_end;
+ range->end = vma->vm_end;
}
/*
* If the memory region between start and end was
@@ -940,24 +951,26 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
* the adjustment for hugetlb vma above may have rounded
* end down to the start address.
*/
- if (start == end)
+ if (range->start == range->end)
return 0;
- VM_WARN_ON(start > end);
+ VM_WARN_ON(range->start > range->end);
}
if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
- return madvise_dontneed_single_vma(vma, start, end);
+ return madvise_dontneed_single_vma(madv_behavior);
else if (behavior == MADV_FREE)
- return madvise_free_single_vma(vma, start, end);
+ return madvise_free_single_vma(madv_behavior);
else
return -EINVAL;
}
-static long madvise_populate(struct mm_struct *mm, unsigned long start,
- unsigned long end, int behavior)
+static long madvise_populate(struct madvise_behavior *madv_behavior)
{
- const bool write = behavior == MADV_POPULATE_WRITE;
+ struct mm_struct *mm = madv_behavior->mm;
+ const bool write = madv_behavior->behavior == MADV_POPULATE_WRITE;
int locked = 1;
+ unsigned long start = madv_behavior->range.start;
+ unsigned long end = madv_behavior->range.end;
long pages;
while (start < end) {
@@ -994,16 +1007,17 @@ static long madvise_populate(struct mm_struct *mm, unsigned long start,
* Application wants to free up the pages and associated backing store.
* This is effectively punching a hole into the middle of a file.
*/
-static long madvise_remove(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+static long madvise_remove(struct madvise_behavior *madv_behavior)
{
loff_t offset;
int error;
struct file *f;
- struct mm_struct *mm = vma->vm_mm;
+ struct mm_struct *mm = madv_behavior->mm;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ unsigned long start = madv_behavior->range.start;
+ unsigned long end = madv_behavior->range.end;
- *prev = NULL; /* tell sys_madvise we drop mmap_lock */
+ mark_mmap_lock_dropped(madv_behavior);
if (vma->vm_flags & VM_LOCKED)
return -EINVAL;
@@ -1066,7 +1080,7 @@ static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
pud_t pudval = pudp_get(pud);
/* If huge return >0 so we abort the operation + zap. */
- return pud_trans_huge(pudval) || pud_devmap(pudval);
+ return pud_trans_huge(pudval);
}
static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -1075,7 +1089,7 @@ static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
pmd_t pmdval = pmdp_get(pmd);
/* If huge return >0 so we abort the operation + zap. */
- return pmd_trans_huge(pmdval) || pmd_devmap(pmdval);
+ return pmd_trans_huge(pmdval);
}
static int guard_install_pte_entry(pte_t *pte, unsigned long addr,
@@ -1115,14 +1129,13 @@ static const struct mm_walk_ops guard_install_walk_ops = {
.walk_lock = PGWALK_RDLOCK,
};
-static long madvise_guard_install(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+static long madvise_guard_install(struct madvise_behavior *madv_behavior)
{
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct madvise_behavior_range *range = &madv_behavior->range;
long err;
int i;
- *prev = vma;
if (!is_valid_guard_vma(vma, /* allow_locked = */false))
return -EINVAL;
@@ -1153,13 +1166,14 @@ static long madvise_guard_install(struct vm_area_struct *vma,
unsigned long nr_pages = 0;
/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
- err = walk_page_range_mm(vma->vm_mm, start, end,
+ err = walk_page_range_mm(vma->vm_mm, range->start, range->end,
&guard_install_walk_ops, &nr_pages);
if (err < 0)
return err;
if (err == 0) {
- unsigned long nr_expected_pages = PHYS_PFN(end - start);
+ unsigned long nr_expected_pages =
+ PHYS_PFN(range->end - range->start);
VM_WARN_ON(nr_pages != nr_expected_pages);
return 0;
@@ -1169,7 +1183,8 @@ static long madvise_guard_install(struct vm_area_struct *vma,
* OK some of the range have non-guard pages mapped, zap
* them. This leaves existing guard pages in place.
*/
- zap_page_range_single(vma, start, end - start, NULL);
+ zap_page_range_single(vma, range->start,
+ range->end - range->start, NULL);
}
/*
@@ -1186,7 +1201,7 @@ static int guard_remove_pud_entry(pud_t *pud, unsigned long addr,
pud_t pudval = pudp_get(pud);
/* If huge, cannot have guard pages present, so no-op - skip. */
- if (pud_trans_huge(pudval) || pud_devmap(pudval))
+ if (pud_trans_huge(pudval))
walk->action = ACTION_CONTINUE;
return 0;
@@ -1198,7 +1213,7 @@ static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr,
pmd_t pmdval = pmdp_get(pmd);
/* If huge, cannot have guard pages present, so no-op - skip. */
- if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
+ if (pmd_trans_huge(pmdval))
walk->action = ACTION_CONTINUE;
return 0;
@@ -1225,11 +1240,11 @@ static const struct mm_walk_ops guard_remove_walk_ops = {
.walk_lock = PGWALK_RDLOCK,
};
-static long madvise_guard_remove(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
{
- *prev = vma;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+
/*
* We're ok with removing guards in mlock()'d ranges, as this is a
* non-destructive action.
@@ -1237,7 +1252,7 @@ static long madvise_guard_remove(struct vm_area_struct *vma,
if (!is_valid_guard_vma(vma, /* allow_locked = */true))
return -EINVAL;
- return walk_page_range(vma->vm_mm, start, end,
+ return walk_page_range_vma(vma, range->start, range->end,
&guard_remove_walk_ops, NULL);
}
@@ -1246,31 +1261,40 @@ static long madvise_guard_remove(struct vm_area_struct *vma,
* will handle splitting a vm area into separate areas, each area with its own
* behavior.
*/
-static int madvise_vma_behavior(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end,
- unsigned long behavior)
+static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
{
+ int behavior = madv_behavior->behavior;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ vm_flags_t new_flags = vma->vm_flags;
+ struct madvise_behavior_range *range = &madv_behavior->range;
int error;
- struct anon_vma_name *anon_name;
- unsigned long new_flags = vma->vm_flags;
- if (unlikely(!can_modify_vma_madv(vma, behavior)))
+ if (unlikely(!can_modify_vma_madv(madv_behavior->vma, behavior)))
return -EPERM;
switch (behavior) {
case MADV_REMOVE:
- return madvise_remove(vma, prev, start, end);
+ return madvise_remove(madv_behavior);
case MADV_WILLNEED:
- return madvise_willneed(vma, prev, start, end);
+ return madvise_willneed(madv_behavior);
case MADV_COLD:
- return madvise_cold(vma, prev, start, end);
+ return madvise_cold(madv_behavior);
case MADV_PAGEOUT:
- return madvise_pageout(vma, prev, start, end);
+ return madvise_pageout(madv_behavior);
case MADV_FREE:
case MADV_DONTNEED:
case MADV_DONTNEED_LOCKED:
- return madvise_dontneed_free(vma, prev, start, end, behavior);
+ return madvise_dontneed_free(madv_behavior);
+ case MADV_COLLAPSE:
+ return madvise_collapse(vma, range->start, range->end,
+ &madv_behavior->lock_dropped);
+ case MADV_GUARD_INSTALL:
+ return madvise_guard_install(madv_behavior);
+ case MADV_GUARD_REMOVE:
+ return madvise_guard_remove(madv_behavior);
+
+ /* The below behaviours update VMAs via madvise_update_vma(). */
+
case MADV_NORMAL:
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
break;
@@ -1284,18 +1308,18 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
new_flags |= VM_DONTCOPY;
break;
case MADV_DOFORK:
- if (vma->vm_flags & VM_IO)
+ if (new_flags & VM_IO)
return -EINVAL;
new_flags &= ~VM_DONTCOPY;
break;
case MADV_WIPEONFORK:
/* MADV_WIPEONFORK is only supported on anonymous memory. */
- if (vma->vm_file || vma->vm_flags & VM_SHARED)
+ if (vma->vm_file || new_flags & VM_SHARED)
return -EINVAL;
new_flags |= VM_WIPEONFORK;
break;
case MADV_KEEPONFORK:
- if (vma->vm_flags & VM_DROPPABLE)
+ if (new_flags & VM_DROPPABLE)
return -EINVAL;
new_flags &= ~VM_WIPEONFORK;
break;
@@ -1303,14 +1327,15 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
new_flags |= VM_DONTDUMP;
break;
case MADV_DODUMP:
- if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) ||
- (vma->vm_flags & VM_DROPPABLE))
+ if ((!is_vm_hugetlb_page(vma) && (new_flags & VM_SPECIAL)) ||
+ (new_flags & VM_DROPPABLE))
return -EINVAL;
new_flags &= ~VM_DONTDUMP;
break;
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
- error = ksm_madvise(vma, start, end, behavior, &new_flags);
+ error = ksm_madvise(vma, range->start, range->end,
+ behavior, &new_flags);
if (error)
goto out;
break;
@@ -1320,20 +1345,17 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
if (error)
goto out;
break;
- case MADV_COLLAPSE:
- return madvise_collapse(vma, prev, start, end);
- case MADV_GUARD_INSTALL:
- return madvise_guard_install(vma, prev, start, end);
- case MADV_GUARD_REMOVE:
- return madvise_guard_remove(vma, prev, start, end);
+ case __MADV_SET_ANON_VMA_NAME:
+ /* Only anonymous mappings can be named */
+ if (vma->vm_file && !vma_is_anon_shmem(vma))
+ return -EBADF;
+ break;
}
- anon_name = anon_vma_name(vma);
- anon_vma_name_get(anon_name);
- error = madvise_update_vma(vma, prev, start, end, new_flags,
- anon_name);
- anon_vma_name_put(anon_name);
+ /* This is a write operation. */
+ VM_WARN_ON_ONCE(madv_behavior->lock_mode != MADVISE_MMAP_WRITE_LOCK);
+ error = madvise_update_vma(new_flags, madv_behavior);
out:
/*
* madvise() returns EAGAIN if kernel resources, such as
@@ -1348,15 +1370,15 @@ out:
/*
* Error injection support for memory error handling.
*/
-static int madvise_inject_error(int behavior,
- unsigned long start, unsigned long end)
+static int madvise_inject_error(struct madvise_behavior *madv_behavior)
{
unsigned long size;
+ unsigned long start = madv_behavior->range.start;
+ unsigned long end = madv_behavior->range.end;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
-
for (; start < end; start += size) {
unsigned long pfn;
struct page *page;
@@ -1374,7 +1396,7 @@ static int madvise_inject_error(int behavior,
*/
size = page_size(compound_head(page));
- if (behavior == MADV_SOFT_OFFLINE) {
+ if (madv_behavior->behavior == MADV_SOFT_OFFLINE) {
pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
pfn, start);
ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
@@ -1393,9 +1415,9 @@ static int madvise_inject_error(int behavior,
return 0;
}
-static bool is_memory_failure(int behavior)
+static bool is_memory_failure(struct madvise_behavior *madv_behavior)
{
- switch (behavior) {
+ switch (madv_behavior->behavior) {
case MADV_HWPOISON:
case MADV_SOFT_OFFLINE:
return true;
@@ -1406,13 +1428,12 @@ static bool is_memory_failure(int behavior)
#else
-static int madvise_inject_error(int behavior,
- unsigned long start, unsigned long end)
+static int madvise_inject_error(struct madvise_behavior *madv_behavior)
{
return 0;
}
-static bool is_memory_failure(int behavior)
+static bool is_memory_failure(struct madvise_behavior *madv_behavior)
{
return false;
}
@@ -1478,145 +1499,226 @@ static bool process_madvise_remote_valid(int behavior)
}
/*
- * Walk the vmas in range [start,end), and call the visit function on each one.
- * The visit function will get start and end parameters that cover the overlap
- * between the current vma and the original range. Any unmapped regions in the
- * original range will result in this function returning -ENOMEM while still
- * calling the visit function on all of the existing vmas in the range.
- * Must be called with the mmap_lock held for reading or writing.
+ * Try to acquire a VMA read lock if possible.
+ *
+ * We only support this lock over a single VMA, which the input range must
+ * span either partially or fully.
+ *
+ * This function always returns with an appropriate lock held. If a VMA read
+ * lock could be acquired, we return true and set madv_behavior state
+ * accordingly.
+ *
+ * If a VMA read lock could not be acquired, we return false and expect the
+ * caller to fall back to mmap lock behaviour.
*/
-static
-int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
- unsigned long end, unsigned long arg,
- int (*visit)(struct vm_area_struct *vma,
- struct vm_area_struct **prev, unsigned long start,
- unsigned long end, unsigned long arg))
+static bool try_vma_read_lock(struct madvise_behavior *madv_behavior)
{
+ struct mm_struct *mm = madv_behavior->mm;
struct vm_area_struct *vma;
- struct vm_area_struct *prev;
- unsigned long tmp;
+
+ vma = lock_vma_under_rcu(mm, madv_behavior->range.start);
+ if (!vma)
+ goto take_mmap_read_lock;
+ /*
+ * Must span only a single VMA; uffd and remote processes are
+ * unsupported.
+ */
+ if (madv_behavior->range.end > vma->vm_end || current->mm != mm ||
+ userfaultfd_armed(vma)) {
+ vma_end_read(vma);
+ goto take_mmap_read_lock;
+ }
+ madv_behavior->vma = vma;
+ return true;
+
+take_mmap_read_lock:
+ mmap_read_lock(mm);
+ madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK;
+ return false;
+}
+
+/*
+ * Walk the vmas in range [start,end), and call the madvise_vma_behavior
+ * function on each one. The function will get start and end parameters that
+ * cover the overlap between the current vma and the original range. Any
+ * unmapped regions in the original range will result in this function returning
+ * -ENOMEM while still calling the madvise_vma_behavior function on all of the
+ * existing vmas in the range. Must be called with the mmap_lock held for
+ * reading or writing.
+ */
+static
+int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
+{
+ struct mm_struct *mm = madv_behavior->mm;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+ /* range is updated to span each VMA, so store end of entire range. */
+ unsigned long last_end = range->end;
int unmapped_error = 0;
+ int error;
+ struct vm_area_struct *prev, *vma;
/*
- * If the interval [start,end) covers some unmapped address
- * ranges, just ignore them, but return -ENOMEM at the end.
- * - different from the way of handling in mlock etc.
+ * If VMA read lock is supported, apply madvise to a single VMA
+ * tentatively, avoiding walking VMAs.
*/
- vma = find_vma_prev(mm, start, &prev);
- if (vma && start > vma->vm_start)
+ if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK &&
+ try_vma_read_lock(madv_behavior)) {
+ error = madvise_vma_behavior(madv_behavior);
+ vma_end_read(madv_behavior->vma);
+ return error;
+ }
+
+ vma = find_vma_prev(mm, range->start, &prev);
+ if (vma && range->start > vma->vm_start)
prev = vma;
for (;;) {
- int error;
-
/* Still start < end. */
if (!vma)
return -ENOMEM;
- /* Here start < (end|vma->vm_end). */
- if (start < vma->vm_start) {
+ /* Here start < (last_end|vma->vm_end). */
+ if (range->start < vma->vm_start) {
+ /*
+ * This indicates a gap between VMAs in the input
+ * range. This does not cause the operation to abort,
+ * rather we simply return -ENOMEM to indicate that this
+ * has happened, but carry on.
+ */
unmapped_error = -ENOMEM;
- start = vma->vm_start;
- if (start >= end)
+ range->start = vma->vm_start;
+ if (range->start >= last_end)
break;
}
- /* Here vma->vm_start <= start < (end|vma->vm_end) */
- tmp = vma->vm_end;
- if (end < tmp)
- tmp = end;
+ /* Here vma->vm_start <= range->start < (last_end|vma->vm_end) */
+ range->end = min(vma->vm_end, last_end);
- /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
- error = visit(vma, &prev, start, tmp, arg);
+ /* Here vma->vm_start <= range->start < range->end <= (last_end|vma->vm_end). */
+ madv_behavior->prev = prev;
+ madv_behavior->vma = vma;
+ error = madvise_vma_behavior(madv_behavior);
if (error)
return error;
- start = tmp;
- if (prev && start < prev->vm_end)
- start = prev->vm_end;
- if (start >= end)
+ if (madv_behavior->lock_dropped) {
+ /* We dropped the mmap lock, we can't ref the VMA. */
+ prev = NULL;
+ vma = NULL;
+ madv_behavior->lock_dropped = false;
+ } else {
+ vma = madv_behavior->vma;
+ prev = vma;
+ }
+
+ if (vma && range->end < vma->vm_end)
+ range->end = vma->vm_end;
+ if (range->end >= last_end)
break;
- if (prev)
- vma = find_vma(mm, prev->vm_end);
- else /* madvise_remove dropped mmap_lock */
- vma = find_vma(mm, start);
+
+ vma = find_vma(mm, vma ? vma->vm_end : range->end);
+ range->start = range->end;
}
return unmapped_error;
}
-#ifdef CONFIG_ANON_VMA_NAME
-static int madvise_vma_anon_name(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end,
- unsigned long anon_name)
+/*
+ * Any behaviour which results in changes to the vma->vm_flags needs to
+ * take mmap_lock for writing. Others, which simply traverse vmas, need
+ * to only take it for reading.
+ */
+static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior)
{
- int error;
-
- /* Only anonymous mappings can be named */
- if (vma->vm_file && !vma_is_anon_shmem(vma))
- return -EBADF;
-
- error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
- (struct anon_vma_name *)anon_name);
+ if (is_memory_failure(madv_behavior))
+ return MADVISE_NO_LOCK;
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- return error;
+ switch (madv_behavior->behavior) {
+ case MADV_REMOVE:
+ case MADV_WILLNEED:
+ case MADV_COLD:
+ case MADV_PAGEOUT:
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
+ case MADV_COLLAPSE:
+ case MADV_GUARD_INSTALL:
+ case MADV_GUARD_REMOVE:
+ return MADVISE_MMAP_READ_LOCK;
+ case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
+ case MADV_FREE:
+ return MADVISE_VMA_READ_LOCK;
+ default:
+ return MADVISE_MMAP_WRITE_LOCK;
+ }
}
-int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
- unsigned long len_in, struct anon_vma_name *anon_name)
+static int madvise_lock(struct madvise_behavior *madv_behavior)
{
- unsigned long end;
- unsigned long len;
+ struct mm_struct *mm = madv_behavior->mm;
+ enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior);
- if (start & ~PAGE_MASK)
- return -EINVAL;
- len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+ switch (lock_mode) {
+ case MADVISE_NO_LOCK:
+ break;
+ case MADVISE_MMAP_WRITE_LOCK:
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+ break;
+ case MADVISE_MMAP_READ_LOCK:
+ mmap_read_lock(mm);
+ break;
+ case MADVISE_VMA_READ_LOCK:
+ /* We will acquire the lock per-VMA in madvise_walk_vmas(). */
+ break;
+ }
- /* Check to see whether len was rounded up from small -ve to zero */
- if (len_in && !len)
- return -EINVAL;
+ madv_behavior->lock_mode = lock_mode;
+ return 0;
+}
- end = start + len;
- if (end < start)
- return -EINVAL;
+static void madvise_unlock(struct madvise_behavior *madv_behavior)
+{
+ struct mm_struct *mm = madv_behavior->mm;
- if (end == start)
- return 0;
+ switch (madv_behavior->lock_mode) {
+ case MADVISE_NO_LOCK:
+ return;
+ case MADVISE_MMAP_WRITE_LOCK:
+ mmap_write_unlock(mm);
+ break;
+ case MADVISE_MMAP_READ_LOCK:
+ mmap_read_unlock(mm);
+ break;
+ case MADVISE_VMA_READ_LOCK:
+ /* We will drop the lock per-VMA in madvise_walk_vmas(). */
+ break;
+ }
- return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
- madvise_vma_anon_name);
+ madv_behavior->lock_mode = MADVISE_NO_LOCK;
}
-#endif /* CONFIG_ANON_VMA_NAME */
-static int madvise_lock(struct mm_struct *mm, int behavior)
+static bool madvise_batch_tlb_flush(int behavior)
{
- if (is_memory_failure(behavior))
- return 0;
-
- if (madvise_need_mmap_write(behavior)) {
- if (mmap_write_lock_killable(mm))
- return -EINTR;
- } else {
- mmap_read_lock(mm);
+ switch (behavior) {
+ case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
+ case MADV_FREE:
+ return true;
+ default:
+ return false;
}
- return 0;
}
-static void madvise_unlock(struct mm_struct *mm, int behavior)
+static void madvise_init_tlb(struct madvise_behavior *madv_behavior)
{
- if (is_memory_failure(behavior))
- return;
+ if (madvise_batch_tlb_flush(madv_behavior->behavior))
+ tlb_gather_mmu(madv_behavior->tlb, madv_behavior->mm);
+}
- if (madvise_need_mmap_write(behavior))
- mmap_write_unlock(mm);
- else
- mmap_read_unlock(mm);
+static void madvise_finish_tlb(struct madvise_behavior *madv_behavior)
+{
+ if (madvise_batch_tlb_flush(madv_behavior->behavior))
+ tlb_finish_mmu(madv_behavior->tlb);
}
static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
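With the lock-mode table above, MADV_DONTNEED, MADV_DONTNEED_LOCKED and MADV_FREE are attempted under a per-VMA read lock, falling back to mmap_read_lock when the range spans several VMAs, the VMA is userfaultfd-armed, or the request targets a remote mm. A userspace-visible sketch of the case that benefits (behaviour inferred from the hunks above):

#include <sys/mman.h>

/*
 * Single-VMA discard from the owning process: eligible for the per-VMA
 * read lock, so it no longer contends on mmap_lock with concurrent
 * mmap()/munmap() in other threads. Ineligible cases fall back
 * transparently.
 */
static void discard_buffer(void *buf, size_t len)
{
	madvise(buf, len, MADV_DONTNEED);
}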
@@ -1665,9 +1767,9 @@ static bool madvise_should_skip(unsigned long start, size_t len_in,
return false;
}
-static bool is_madvise_populate(int behavior)
+static bool is_madvise_populate(struct madvise_behavior *madv_behavior)
{
- switch (behavior) {
+ switch (madv_behavior->behavior) {
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
return true;
@@ -1676,24 +1778,42 @@ static bool is_madvise_populate(int behavior)
}
}
-static int madvise_do_behavior(struct mm_struct *mm,
- unsigned long start, size_t len_in, int behavior)
+/*
+ * untagged_addr_remote() assumes mmap_lock is already held. On
+ * architectures like x86 and RISC-V, tagging is tricky because each
+ * mm may have a different tagging mask. However, we might only hold
+ * the per-VMA lock (currently only local processes are supported),
+ * so untagged_addr is used to avoid the mmap_lock assertion for
+ * local processes.
+ */
+static inline unsigned long get_untagged_addr(struct mm_struct *mm,
+ unsigned long start)
+{
+ return current->mm == mm ? untagged_addr(start) :
+ untagged_addr_remote(mm, start);
+}
+
+static int madvise_do_behavior(unsigned long start, size_t len_in,
+ struct madvise_behavior *madv_behavior)
{
struct blk_plug plug;
- unsigned long end;
int error;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+
+ if (is_memory_failure(madv_behavior)) {
+ range->start = start;
+ range->end = start + len_in;
+ return madvise_inject_error(madv_behavior);
+ }
- if (is_memory_failure(behavior))
- return madvise_inject_error(behavior, start, start + len_in);
- start = untagged_addr_remote(mm, start);
- end = start + PAGE_ALIGN(len_in);
+ range->start = get_untagged_addr(madv_behavior->mm, start);
+ range->end = range->start + PAGE_ALIGN(len_in);
blk_start_plug(&plug);
- if (is_madvise_populate(behavior))
- error = madvise_populate(mm, start, end, behavior);
+ if (is_madvise_populate(madv_behavior))
+ error = madvise_populate(madv_behavior);
else
- error = madvise_walk_vmas(mm, start, end, behavior,
- madvise_vma_behavior);
+ error = madvise_walk_vmas(madv_behavior);
blk_finish_plug(&plug);
return error;
}
@@ -1773,14 +1893,22 @@ static int madvise_do_behavior(struct mm_struct *mm,
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
int error;
+ struct mmu_gather tlb;
+ struct madvise_behavior madv_behavior = {
+ .mm = mm,
+ .behavior = behavior,
+ .tlb = &tlb,
+ };
if (madvise_should_skip(start, len_in, behavior, &error))
return error;
- error = madvise_lock(mm, behavior);
+ error = madvise_lock(&madv_behavior);
if (error)
return error;
- error = madvise_do_behavior(mm, start, len_in, behavior);
- madvise_unlock(mm, behavior);
+ madvise_init_tlb(&madv_behavior);
+ error = madvise_do_behavior(start, len_in, &madv_behavior);
+ madvise_finish_tlb(&madv_behavior);
+ madvise_unlock(&madv_behavior);
return error;
}
@@ -1796,12 +1924,19 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
{
ssize_t ret = 0;
size_t total_len;
+ struct mmu_gather tlb;
+ struct madvise_behavior madv_behavior = {
+ .mm = mm,
+ .behavior = behavior,
+ .tlb = &tlb,
+ };
total_len = iov_iter_count(iter);
- ret = madvise_lock(mm, behavior);
+ ret = madvise_lock(&madv_behavior);
if (ret)
return ret;
+ madvise_init_tlb(&madv_behavior);
while (iov_iter_count(iter)) {
unsigned long start = (unsigned long)iter_iov_addr(iter);
@@ -1811,7 +1946,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
if (madvise_should_skip(start, len_in, behavior, &error))
ret = error;
else
- ret = madvise_do_behavior(mm, start, len_in, behavior);
+ ret = madvise_do_behavior(start, len_in, &madv_behavior);
/*
* An madvise operation is attempting to restart the syscall,
* but we cannot proceed as it would not be correct to repeat
@@ -1829,16 +1964,22 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
}
/* Drop and reacquire lock to unwind race. */
- madvise_unlock(mm, behavior);
- madvise_lock(mm, behavior);
+ madvise_finish_tlb(&madv_behavior);
+ madvise_unlock(&madv_behavior);
+ ret = madvise_lock(&madv_behavior);
+ if (ret)
+ goto out;
+ madvise_init_tlb(&madv_behavior);
continue;
}
if (ret < 0)
break;
iov_iter_advance(iter, iter_iov_len(iter));
}
- madvise_unlock(mm, behavior);
+ madvise_finish_tlb(&madv_behavior);
+ madvise_unlock(&madv_behavior);
+out:
ret = (total_len - iov_iter_count(iter)) ? : ret;
return ret;
@@ -1907,3 +2048,88 @@ free_iov:
out:
return ret;
}
+
+#ifdef CONFIG_ANON_VMA_NAME
+
+#define ANON_VMA_NAME_MAX_LEN 80
+#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]"
+
+static inline bool is_valid_name_char(char ch)
+{
+ /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */
+ return ch > 0x1f && ch < 0x7f &&
+ !strchr(ANON_VMA_NAME_INVALID_CHARS, ch);
+}
+
+static int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
+ unsigned long len_in, struct anon_vma_name *anon_name)
+{
+ unsigned long end;
+ unsigned long len;
+ int error;
+ struct madvise_behavior madv_behavior = {
+ .mm = mm,
+ .behavior = __MADV_SET_ANON_VMA_NAME,
+ .anon_name = anon_name,
+ };
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+ /* Check to see whether len was rounded up from small -ve to zero */
+ if (len_in && !len)
+ return -EINVAL;
+
+ end = start + len;
+ if (end < start)
+ return -EINVAL;
+
+ if (end == start)
+ return 0;
+
+ madv_behavior.range.start = start;
+ madv_behavior.range.end = end;
+
+ error = madvise_lock(&madv_behavior);
+ if (error)
+ return error;
+ error = madvise_walk_vmas(&madv_behavior);
+ madvise_unlock(&madv_behavior);
+
+ return error;
+}
+
+int set_anon_vma_name(unsigned long addr, unsigned long size,
+ const char __user *uname)
+{
+ struct anon_vma_name *anon_name = NULL;
+ struct mm_struct *mm = current->mm;
+ int error;
+
+ if (uname) {
+ char *name, *pch;
+
+ name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ for (pch = name; *pch != '\0'; pch++) {
+ if (!is_valid_name_char(*pch)) {
+ kfree(name);
+ return -EINVAL;
+ }
+ }
+ /* anon_vma has its own copy */
+ anon_name = anon_vma_name_alloc(name);
+ kfree(name);
+ if (!anon_name)
+ return -ENOMEM;
+ }
+
+ error = madvise_set_anon_name(mm, addr, size, anon_name);
+ anon_vma_name_put(anon_name);
+
+ return error;
+}
+#endif
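The anon-VMA naming path moved into madvise.c above is still reached through the existing prctl() ABI; only the in-kernel plumbing (via __MADV_SET_ANON_VMA_NAME and madvise_walk_vmas()) changes. A userspace refresher, assuming uapi headers that define PR_SET_VMA (v5.17+):

#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

int main(void)
{
	size_t len = 1 << 20;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/* Appears as "[anon:scratch arena]" in /proc/self/maps. */
	if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, (unsigned long)p, len,
		  (unsigned long)"scratch arena"))
		perror("prctl(PR_SET_VMA)");
	return 0;
}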
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index 2f8829b3541a..c193de6cb23a 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -129,7 +129,7 @@ static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
pmd_t pmdval = pmdp_get_lockless(pmd);
/* Do not split a huge pmd, present or migrated */
- if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) {
+ if (pmd_trans_huge(pmdval)) {
WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval));
walk->action = ACTION_CONTINUE;
}
@@ -152,7 +152,7 @@ static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
pud_t pudval = READ_ONCE(*pud);
/* Do not split a huge pud */
- if (pud_trans_huge(pudval) || pud_devmap(pudval)) {
+ if (pud_trans_huge(pudval)) {
WARN_ON(pud_write(pudval) || pud_dirty(pudval));
walk->action = ACTION_CONTINUE;
}
@@ -218,7 +218,7 @@ static void wp_clean_post_vma(struct mm_walk *walk)
static int wp_clean_test_walk(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
- unsigned long vm_flags = READ_ONCE(walk->vma->vm_flags);
+ vm_flags_t vm_flags = READ_ONCE(walk->vma->vm_flags);
/* Skip non-applicable VMAs */
if ((vm_flags & (VM_SHARED | VM_MAYWRITE | VM_HUGETLB)) !=
diff --git a/mm/memblock.c b/mm/memblock.c
index 284154445409..154f1d73b61f 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -18,6 +18,11 @@
#include <linux/memblock.h>
#include <linux/mutex.h>
+#ifdef CONFIG_KEXEC_HANDOVER
+#include <linux/libfdt.h>
+#include <linux/kexec_handover.h>
+#endif /* CONFIG_KEXEC_HANDOVER */
+
#include <asm/sections.h>
#include <linux/io.h>
@@ -107,6 +112,13 @@ unsigned long min_low_pfn;
unsigned long max_pfn;
unsigned long long max_possible_pfn;
+#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH
+/* When set to true, only allocate from MEMBLOCK_KHO_SCRATCH ranges */
+static bool kho_scratch_only;
+#else
+#define kho_scratch_only false
+#endif
+
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
@@ -166,6 +178,10 @@ bool __init_memblock memblock_has_mirror(void)
static enum memblock_flags __init_memblock choose_memblock_flags(void)
{
+ /* skip non-scratch memory for kho early boot allocations */
+ if (kho_scratch_only)
+ return MEMBLOCK_KHO_SCRATCH;
+
return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
}
@@ -457,7 +473,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
min(new_area_start, memblock.current_limit),
new_alloc_size, PAGE_SIZE);
- new_array = addr ? __va(addr) : NULL;
+ if (addr) {
+ /* The memory may not have been accepted, yet. */
+ accept_memory(addr, new_alloc_size);
+
+ new_array = __va(addr);
+ } else {
+ new_array = NULL;
+ }
}
if (!addr) {
pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
@@ -492,7 +515,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
* needn't do it
*/
if (!use_slab)
- BUG_ON(memblock_reserve(addr, new_alloc_size));
+ BUG_ON(memblock_reserve_kern(addr, new_alloc_size));
/* Update slab flag */
*in_slab = use_slab;
@@ -642,7 +665,7 @@ repeat:
#ifdef CONFIG_NUMA
WARN_ON(nid != memblock_get_region_node(rgn));
#endif
- WARN_ON(flags != rgn->flags);
+ WARN_ON(flags != MEMBLOCK_NONE && flags != rgn->flags);
nr_new++;
if (insert) {
if (start_rgn == -1)
@@ -902,14 +925,15 @@ int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size)
return memblock_remove_range(&memblock.reserved, base, size);
}
-int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+int __init_memblock __memblock_reserve(phys_addr_t base, phys_addr_t size,
+ int nid, enum memblock_flags flags)
{
phys_addr_t end = base + size - 1;
- memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
- &base, &end, (void *)_RET_IP_);
+ memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__,
+ &base, &end, nid, flags, (void *)_RET_IP_);
- return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
+ return memblock_add_range(&memblock.reserved, base, size, nid, flags);
}
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
@@ -924,6 +948,40 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
}
#endif
+#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH
+__init void memblock_set_kho_scratch_only(void)
+{
+ kho_scratch_only = true;
+}
+
+__init void memblock_clear_kho_scratch_only(void)
+{
+ kho_scratch_only = false;
+}
+
+__init void memmap_init_kho_scratch_pages(void)
+{
+ phys_addr_t start, end;
+ unsigned long pfn;
+ int nid;
+ u64 i;
+
+ if (!IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT))
+ return;
+
+ /*
+ * Initialize struct pages for free scratch memory.
+ * The struct pages for reserved scratch memory will be set up in
+ * reserve_bootmem_region()
+ */
+ __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
+ MEMBLOCK_KHO_SCRATCH, &start, &end, &nid) {
+ for (pfn = PFN_UP(start); pfn < PFN_DOWN(end); pfn++)
+ init_deferred_page(pfn, nid);
+ }
+}
+#endif
+
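As a rough illustration of how the scratch-only switch above is meant to be used, a hypothetical early-boot path in a kexec-handover kernel could bracket its allocations with the helpers added in this hunk. Everything except the memblock_* calls is invented for the sketch, and the declarations are assumed to live in <linux/memblock.h>.

#include <linux/init.h>
#include <linux/memblock.h>

static void __init kho_early_scratch_example(phys_addr_t base, phys_addr_t size)
{
	/* Advertise the scratch area handed over by the previous kernel. */
	memblock_mark_kho_scratch(base, size);

	/*
	 * From here on, choose_memblock_flags() returns MEMBLOCK_KHO_SCRATCH,
	 * so early allocations stay inside the scratch ranges and cannot
	 * clobber memory preserved across the kexec.
	 */
	memblock_set_kho_scratch_only();

	/* ... early boot allocations ... */

	/*
	 * Lift the restriction once the preserved ranges have been reclaimed;
	 * memblock_free_all() also clears it later as a fallback.
	 */
	memblock_clear_kho_scratch_only();
}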
/**
* memblock_setclr_flag - set or clear flag for a memory region
* @type: memblock type to set/clear flag for
@@ -1049,6 +1107,36 @@ int __init_memblock memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t
MEMBLOCK_RSRV_NOINIT);
}
+/**
+ * memblock_mark_kho_scratch - Mark a memory region as MEMBLOCK_KHO_SCRATCH.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Only memory regions marked with %MEMBLOCK_KHO_SCRATCH will be considered
+ * for allocations during early boot with kexec handover.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+__init int memblock_mark_kho_scratch(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(&memblock.memory, base, size, 1,
+ MEMBLOCK_KHO_SCRATCH);
+}
+
+/**
+ * memblock_clear_kho_scratch - Clear MEMBLOCK_KHO_SCRATCH flag for a
+ * specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+__init int memblock_clear_kho_scratch(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(&memblock.memory, base, size, 0,
+ MEMBLOCK_KHO_SCRATCH);
+}
+
static bool should_skip_region(struct memblock_type *type,
struct memblock_region *m,
int nid, int flags)
@@ -1080,6 +1168,13 @@ static bool should_skip_region(struct memblock_type *type,
if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m))
return true;
+ /*
+ * During early allocations under kexec handover, only
+ * MEMBLOCK_KHO_SCRATCH regions can be considered for allocation.
+ */
+ if ((flags & MEMBLOCK_KHO_SCRATCH) && !memblock_is_kho_scratch(m))
+ return true;
+
return false;
}
@@ -1460,14 +1555,14 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
again:
found = memblock_find_in_range_node(size, align, start, end, nid,
flags);
- if (found && !memblock_reserve(found, size))
+ if (found && !__memblock_reserve(found, size, nid, MEMBLOCK_RSRV_KERN))
goto done;
if (numa_valid_node(nid) && !exact_nid) {
found = memblock_find_in_range_node(size, align, start,
end, NUMA_NO_NODE,
flags);
- if (found && !memblock_reserve(found, size))
+ if (found && !memblock_reserve_kern(found, size))
goto done;
}
@@ -1752,6 +1847,28 @@ phys_addr_t __init_memblock memblock_reserved_size(void)
return memblock.reserved.total_size;
}
+phys_addr_t __init_memblock memblock_reserved_kern_size(phys_addr_t limit, int nid)
+{
+ struct memblock_region *r;
+ phys_addr_t total = 0;
+
+ for_each_reserved_mem_region(r) {
+ phys_addr_t size = r->size;
+
+ if (r->base > limit)
+ break;
+
+ if (r->base + r->size > limit)
+ size = limit - r->base;
+
+ if (nid == memblock_get_region_node(r) || !numa_valid_node(nid))
+ if (r->flags & MEMBLOCK_RSRV_KERN)
+ total += size;
+ }
+
+ return total;
+}
+
/**
* memblock_estimated_nr_free_pages - return estimated number of free pages
* from memblock point of view
@@ -2167,6 +2284,9 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
unsigned long start_pfn = PFN_UP(start);
unsigned long end_pfn = PFN_DOWN(end);
+ if (!IS_ENABLED(CONFIG_HIGHMEM) && end_pfn > max_low_pfn)
+ end_pfn = max_low_pfn;
+
if (start_pfn >= end_pfn)
return 0;
@@ -2180,11 +2300,14 @@ static void __init memmap_init_reserved_pages(void)
struct memblock_region *region;
phys_addr_t start, end;
int nid;
+ unsigned long max_reserved;
/*
* set nid on all reserved pages and also treat struct
* pages for the NOMAP regions as PageReserved
*/
+repeat:
+ max_reserved = memblock.reserved.max;
for_each_mem_region(region) {
nid = memblock_get_region_node(region);
start = region->base;
@@ -2193,8 +2316,15 @@ static void __init memmap_init_reserved_pages(void)
if (memblock_is_nomap(region))
reserve_bootmem_region(start, end, nid);
- memblock_set_node(start, end, &memblock.reserved, nid);
+ memblock_set_node(start, region->size, &memblock.reserved, nid);
}
+ /*
+ * If 'max' has changed, memblock.reserved has doubled its array,
+ * which may have created a new reserved region before the current
+ * 'start'. Repeat the procedure to set its node id.
+ */
+ if (max_reserved != memblock.reserved.max)
+ goto repeat;
/*
* initialize struct pages for reserved regions that don't have
@@ -2269,6 +2399,7 @@ void __init memblock_free_all(void)
free_unused_memmap();
reset_all_zones_managed_pages();
+ memblock_clear_kho_scratch_only();
pages = free_low_memory_core_early();
totalram_pages_add(pages);
}
@@ -2366,6 +2497,189 @@ int reserve_mem_release_by_name(const char *name)
return 1;
}
+#ifdef CONFIG_KEXEC_HANDOVER
+#define MEMBLOCK_KHO_FDT "memblock"
+#define MEMBLOCK_KHO_NODE_COMPATIBLE "memblock-v1"
+#define RESERVE_MEM_KHO_NODE_COMPATIBLE "reserve-mem-v1"
+static struct page *kho_fdt;
+
+static int reserve_mem_kho_finalize(struct kho_serialization *ser)
+{
+ int err = 0, i;
+
+ for (i = 0; i < reserved_mem_count; i++) {
+ struct reserve_mem_table *map = &reserved_mem_table[i];
+
+ err |= kho_preserve_phys(map->start, map->size);
+ }
+
+ err |= kho_preserve_folio(page_folio(kho_fdt));
+ err |= kho_add_subtree(ser, MEMBLOCK_KHO_FDT, page_to_virt(kho_fdt));
+
+ return notifier_from_errno(err);
+}
+
+static int reserve_mem_kho_notifier(struct notifier_block *self,
+ unsigned long cmd, void *v)
+{
+ switch (cmd) {
+ case KEXEC_KHO_FINALIZE:
+ return reserve_mem_kho_finalize((struct kho_serialization *)v);
+ case KEXEC_KHO_ABORT:
+ return NOTIFY_DONE;
+ default:
+ return NOTIFY_BAD;
+ }
+}
+
+static struct notifier_block reserve_mem_kho_nb = {
+ .notifier_call = reserve_mem_kho_notifier,
+};
+
+static int __init prepare_kho_fdt(void)
+{
+ int err = 0, i;
+ void *fdt;
+
+ kho_fdt = alloc_page(GFP_KERNEL);
+ if (!kho_fdt)
+ return -ENOMEM;
+
+ fdt = page_to_virt(kho_fdt);
+
+ err |= fdt_create(fdt, PAGE_SIZE);
+ err |= fdt_finish_reservemap(fdt);
+
+ err |= fdt_begin_node(fdt, "");
+ err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE);
+ for (i = 0; i < reserved_mem_count; i++) {
+ struct reserve_mem_table *map = &reserved_mem_table[i];
+
+ err |= fdt_begin_node(fdt, map->name);
+ err |= fdt_property_string(fdt, "compatible", RESERVE_MEM_KHO_NODE_COMPATIBLE);
+ err |= fdt_property(fdt, "start", &map->start, sizeof(map->start));
+ err |= fdt_property(fdt, "size", &map->size, sizeof(map->size));
+ err |= fdt_end_node(fdt);
+ }
+ err |= fdt_end_node(fdt);
+
+ err |= fdt_finish(fdt);
+
+ if (err) {
+ pr_err("failed to prepare memblock FDT for KHO: %d\n", err);
+ put_page(kho_fdt);
+ kho_fdt = NULL;
+ }
+
+ return err;
+}
+
+static int __init reserve_mem_init(void)
+{
+ int err;
+
+ if (!kho_is_enabled() || !reserved_mem_count)
+ return 0;
+
+ err = prepare_kho_fdt();
+ if (err)
+ return err;
+
+ err = register_kho_notifier(&reserve_mem_kho_nb);
+ if (err) {
+ put_page(kho_fdt);
+ kho_fdt = NULL;
+ }
+
+ return err;
+}
+late_initcall(reserve_mem_init);
+
+static void *__init reserve_mem_kho_retrieve_fdt(void)
+{
+ phys_addr_t fdt_phys;
+ static void *fdt;
+ int err;
+
+ if (fdt)
+ return fdt;
+
+ err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys);
+ if (err) {
+ if (err != -ENOENT)
+ pr_warn("failed to retrieve FDT '%s' from KHO: %d\n",
+ MEMBLOCK_KHO_FDT, err);
+ return NULL;
+ }
+
+ fdt = phys_to_virt(fdt_phys);
+
+ err = fdt_node_check_compatible(fdt, 0, MEMBLOCK_KHO_NODE_COMPATIBLE);
+ if (err) {
+ pr_warn("FDT '%s' is incompatible with '%s': %d\n",
+ MEMBLOCK_KHO_FDT, MEMBLOCK_KHO_NODE_COMPATIBLE, err);
+ fdt = NULL;
+ }
+
+ return fdt;
+}
+
+static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size,
+ phys_addr_t align)
+{
+ int err, len_start, len_size, offset;
+ const phys_addr_t *p_start, *p_size;
+ const void *fdt;
+
+ fdt = reserve_mem_kho_retrieve_fdt();
+ if (!fdt)
+ return false;
+
+ offset = fdt_subnode_offset(fdt, 0, name);
+ if (offset < 0) {
+ pr_warn("FDT '%s' has no child '%s': %d\n",
+ MEMBLOCK_KHO_FDT, name, offset);
+ return false;
+ }
+ err = fdt_node_check_compatible(fdt, offset, RESERVE_MEM_KHO_NODE_COMPATIBLE);
+ if (err) {
+ pr_warn("Node '%s' is incompatible with '%s': %d\n",
+ name, RESERVE_MEM_KHO_NODE_COMPATIBLE, err);
+ return false;
+ }
+
+ p_start = fdt_getprop(fdt, offset, "start", &len_start);
+ p_size = fdt_getprop(fdt, offset, "size", &len_size);
+ if (!p_start || len_start != sizeof(*p_start) || !p_size ||
+ len_size != sizeof(*p_size)) {
+ return false;
+ }
+
+ if (*p_start & (align - 1)) {
+ pr_warn("KHO reserve-mem '%s' has wrong alignment (0x%lx, 0x%lx)\n",
+ name, (long)align, (long)*p_start);
+ return false;
+ }
+
+ if (*p_size != size) {
+ pr_warn("KHO reserve-mem '%s' has wrong size (0x%lx != 0x%lx)\n",
+ name, (long)*p_size, (long)size);
+ return false;
+ }
+
+ reserved_mem_add(*p_start, size, name);
+ pr_info("Revived memory reservation '%s' from KHO\n", name);
+
+ return true;
+}
+#else
+static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size,
+ phys_addr_t align)
+{
+ return false;
+}
+#endif /* CONFIG_KEXEC_HANDOVER */
+
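To show how a revived reservation is consumed, the sketch below assumes a hypothetical client that looks up a named region through reserve_mem_find_by_name(), the same helper the reserve_mem parser uses further down. After a kexec with KHO enabled, reserve_mem_kho_revive() above brings the region back at the same physical address, so its contents persist. The "tracebuf" name, the memremap() mapping and the include choices are assumptions made for the example.

#include <linux/init.h>
#include <linux/io.h>
#include <linux/mm.h>

static int __init tracebuf_example_init(void)
{
	phys_addr_t start, size;
	void *buf;

	/* Matches a reserve_mem=16M:4K:tracebuf command line parameter. */
	if (!reserve_mem_find_by_name("tracebuf", &start, &size))
		return -ENODEV;

	/* Same physical range before and after kexec, so the data survives. */
	buf = memremap(start, size, MEMREMAP_WB);
	if (!buf)
		return -ENOMEM;

	/* ... consume or append to buf ... */
	return 0;
}
late_initcall(tracebuf_example_init);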
/*
* Parse reserve_mem=nn:align:name
*/
@@ -2421,6 +2735,11 @@ static int __init reserve_mem(char *p)
if (reserve_mem_find_by_name(name, &start, &tmp))
return -EBUSY;
+ /* Pick previous allocations up from KHO if available */
+ if (reserve_mem_kho_revive(name, size, align))
+ return 1;
+
+ /* TODO: Allocation must be outside of scratch region */
start = memblock_phys_alloc(size, align);
if (!start)
return -ENOMEM;
@@ -2438,6 +2757,8 @@ static const char * const flagname[] = {
[ilog2(MEMBLOCK_NOMAP)] = "NOMAP",
[ilog2(MEMBLOCK_DRIVER_MANAGED)] = "DRV_MNG",
[ilog2(MEMBLOCK_RSRV_NOINIT)] = "RSV_NIT",
+ [ilog2(MEMBLOCK_RSRV_KERN)] = "RSV_KERN",
+ [ilog2(MEMBLOCK_KHO_SCRATCH)] = "KHO_SCRATCH",
};
static int memblock_debug_show(struct seq_file *m, void *private)
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 8660908850dc..4b94731305b9 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -512,9 +512,9 @@ static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
{
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
- __count_memcg_events(memcg, PGPGIN, 1);
+ count_memcg_events(memcg, PGPGIN, 1);
else {
- __count_memcg_events(memcg, PGPGOUT, 1);
+ count_memcg_events(memcg, PGPGOUT, 1);
nr_pages = -nr_pages; /* for event */
}
@@ -620,7 +620,7 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry)
mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
- swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
+ swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry);
folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
@@ -689,7 +689,7 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long flags;
local_irq_save(flags);
- __count_memcg_events(memcg, PGPGOUT, pgpgout);
+ count_memcg_events(memcg, PGPGOUT, pgpgout);
__this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory);
memcg1_check_events(memcg, nid);
local_irq_restore(flags);
@@ -2198,8 +2198,7 @@ bool memcg1_alloc_events(struct mem_cgroup *memcg)
void memcg1_free_events(struct mem_cgroup *memcg)
{
- if (memcg->events_percpu)
- free_percpu(memcg->events_percpu);
+ free_percpu(memcg->events_percpu);
}
static int __init memcg1_init(void)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 421740f1bcdc..8dd7fbed5a94 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
+#include <linux/cpuset.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
@@ -50,7 +51,6 @@
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
-#include <linux/parser.h>
#include <linux/vmpressure.h>
#include <linux/memremap.h>
#include <linux/mm_inline.h>
@@ -95,6 +95,9 @@ static bool cgroup_memory_nokmem __ro_after_init;
/* BPF memory accounting disabled? */
static bool cgroup_memory_nobpf __ro_after_init;
+static struct kmem_cache *memcg_cachep;
+static struct kmem_cache *memcg_pn_cachep;
+
#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
@@ -129,8 +132,7 @@ bool mem_cgroup_kmem_disabled(void)
return cgroup_memory_nokmem;
}
-static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
- unsigned int nr_pages);
+static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages);
static void obj_cgroup_release(struct percpu_ref *ref)
{
@@ -163,8 +165,16 @@ static void obj_cgroup_release(struct percpu_ref *ref)
WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
nr_pages = nr_bytes >> PAGE_SHIFT;
- if (nr_pages)
- obj_cgroup_uncharge_pages(objcg, nr_pages);
+ if (nr_pages) {
+ struct mem_cgroup *memcg;
+
+ memcg = get_mem_cgroup_from_objcg(objcg);
+ mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
+ memcg1_account_kmem(memcg, -nr_pages);
+ if (!mem_cgroup_is_root(memcg))
+ memcg_uncharge(memcg, nr_pages);
+ mem_cgroup_put(memcg);
+ }
spin_lock_irqsave(&objcg_lock, flags);
list_del(&objcg->list);
@@ -492,8 +502,8 @@ struct memcg_vmstats_percpu {
unsigned int stats_updates;
/* Cached pointers for fast iteration in memcg_rstat_updated() */
- struct memcg_vmstats_percpu *parent;
- struct memcg_vmstats *vmstats;
+ struct memcg_vmstats_percpu __percpu *parent_pcpu;
+ struct memcg_vmstats *vmstats;
/* The above should fit a single cacheline for memcg_rstat_updated() */
@@ -520,7 +530,7 @@ struct memcg_vmstats {
unsigned long events_pending[NR_MEMCG_EVENTS];
/* Stats updates since the last flush */
- atomic64_t stats_updates;
+ atomic_t stats_updates;
};
/*
@@ -544,60 +554,41 @@ static u64 flush_last_time;
#define FLUSH_TIME (2UL*HZ)
-/*
- * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
- * not rely on this as part of an acquired spinlock_t lock. These functions are
- * never used in hardirq context on PREEMPT_RT and therefore disabling preemption
- * is sufficient.
- */
-static void memcg_stats_lock(void)
-{
- preempt_disable_nested();
- VM_WARN_ON_IRQS_ENABLED();
-}
-
-static void __memcg_stats_lock(void)
-{
- preempt_disable_nested();
-}
-
-static void memcg_stats_unlock(void)
-{
- preempt_enable_nested();
-}
-
-
static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
{
- return atomic64_read(&vmstats->stats_updates) >
+ return atomic_read(&vmstats->stats_updates) >
MEMCG_CHARGE_BATCH * num_online_cpus();
}
-static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
+static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val,
+ int cpu)
{
+ struct memcg_vmstats_percpu __percpu *statc_pcpu;
struct memcg_vmstats_percpu *statc;
- int cpu = smp_processor_id();
unsigned int stats_updates;
if (!val)
return;
- cgroup_rstat_updated(memcg->css.cgroup, cpu);
- statc = this_cpu_ptr(memcg->vmstats_percpu);
- for (; statc; statc = statc->parent) {
- stats_updates = READ_ONCE(statc->stats_updates) + abs(val);
- WRITE_ONCE(statc->stats_updates, stats_updates);
+ css_rstat_updated(&memcg->css, cpu);
+ statc_pcpu = memcg->vmstats_percpu;
+ for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) {
+ statc = this_cpu_ptr(statc_pcpu);
+ /*
+ * If @memcg is already flushable, then all of its ancestors are
+ * flushable as well, so there is also no need to increase
+ * stats_updates.
+ */
+ if (memcg_vmstats_needs_flush(statc->vmstats))
+ break;
+
+ stats_updates = this_cpu_add_return(statc_pcpu->stats_updates,
+ abs(val));
if (stats_updates < MEMCG_CHARGE_BATCH)
continue;
- /*
- * If @memcg is already flush-able, increasing stats_updates is
- * redundant. Avoid the overhead of the atomic update.
- */
- if (!memcg_vmstats_needs_flush(statc->vmstats))
- atomic64_add(stats_updates,
- &statc->vmstats->stats_updates);
- WRITE_ONCE(statc->stats_updates, 0);
+ stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0);
+ atomic_add(stats_updates, &statc->vmstats->stats_updates);
}
}
@@ -605,7 +596,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
{
bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats);
- trace_memcg_flush_stats(memcg, atomic64_read(&memcg->vmstats->stats_updates),
+ trace_memcg_flush_stats(memcg, atomic_read(&memcg->vmstats->stats_updates),
force, needs_flush);
if (!force && !needs_flush)
@@ -614,7 +605,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
if (mem_cgroup_is_root(memcg))
WRITE_ONCE(flush_last_time, jiffies_64);
- cgroup_rstat_flush(memcg->css.cgroup);
+ css_rstat_flush(&memcg->css);
}
/*
@@ -687,15 +678,16 @@ static int memcg_state_val_in_pages(int idx, int val)
}
/**
- * __mod_memcg_state - update cgroup memory statistics
+ * mod_memcg_state - update cgroup memory statistics
* @memcg: the memory cgroup
* @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
* @val: delta to add to the counter, can be negative
*/
-void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
+void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
int val)
{
int i = memcg_stats_index(idx);
+ int cpu;
if (mem_cgroup_disabled())
return;
@@ -703,10 +695,14 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
- __this_cpu_add(memcg->vmstats_percpu->state[i], val);
+ cpu = get_cpu();
+
+ this_cpu_add(memcg->vmstats_percpu->state[i], val);
val = memcg_state_val_in_pages(idx, val);
- memcg_rstat_updated(memcg, val);
+ memcg_rstat_updated(memcg, val, cpu);
trace_mod_memcg_state(memcg, idx, val);
+
+ put_cpu();
}
#ifdef CONFIG_MEMCG_V1
@@ -728,13 +724,14 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
}
#endif
-static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
+static void mod_memcg_lruvec_state(struct lruvec *lruvec,
enum node_stat_item idx,
int val)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
int i = memcg_stats_index(idx);
+ int cpu;
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
@@ -742,35 +739,19 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
- /*
- * The caller from rmap relies on disabled preemption because they never
- * update their counter from in-interrupt context. For these two
- * counters we check that the update is never performed from an
- * interrupt context while other caller need to have disabled interrupt.
- */
- __memcg_stats_lock();
- if (IS_ENABLED(CONFIG_DEBUG_VM)) {
- switch (idx) {
- case NR_ANON_MAPPED:
- case NR_FILE_MAPPED:
- case NR_ANON_THPS:
- WARN_ON_ONCE(!in_task());
- break;
- default:
- VM_WARN_ON_IRQS_ENABLED();
- }
- }
+ cpu = get_cpu();
/* Update memcg */
- __this_cpu_add(memcg->vmstats_percpu->state[i], val);
+ this_cpu_add(memcg->vmstats_percpu->state[i], val);
/* Update lruvec */
- __this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
+ this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
val = memcg_state_val_in_pages(idx, val);
- memcg_rstat_updated(memcg, val);
+ memcg_rstat_updated(memcg, val, cpu);
trace_mod_memcg_lruvec_state(memcg, idx, val);
- memcg_stats_unlock();
+
+ put_cpu();
}
/**
@@ -791,7 +772,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
/* Update memcg and lruvec */
if (!mem_cgroup_disabled())
- __mod_memcg_lruvec_state(lruvec, idx, val);
+ mod_memcg_lruvec_state(lruvec, idx, val);
}
void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
@@ -841,15 +822,16 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
}
/**
- * __count_memcg_events - account VM events in a cgroup
+ * count_memcg_events - account VM events in a cgroup
* @memcg: the memory cgroup
* @idx: the event item
* @count: the number of events that occurred
*/
-void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
+void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
unsigned long count)
{
int i = memcg_events_index(idx);
+ int cpu;
if (mem_cgroup_disabled())
return;
@@ -857,11 +839,13 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
- memcg_stats_lock();
- __this_cpu_add(memcg->vmstats_percpu->events[i], count);
- memcg_rstat_updated(memcg, count);
+ cpu = get_cpu();
+
+ this_cpu_add(memcg->vmstats_percpu->events[i], count);
+ memcg_rstat_updated(memcg, count, cpu);
trace_count_memcg_events(memcg, idx, count);
- memcg_stats_unlock();
+
+ put_cpu();
}
unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@ -1168,7 +1152,6 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
{
struct mem_cgroup *iter;
int ret = 0;
- int i = 0;
BUG_ON(mem_cgroup_is_root(memcg));
@@ -1178,10 +1161,9 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
while (!ret && (task = css_task_iter_next(&it))) {
- /* Avoid potential softlockup warning */
- if ((++i & 1023) == 0)
- cond_resched();
ret = fn(task, arg);
+ /* Avoid potential softlockup warning */
+ cond_resched();
}
css_task_iter_end(&it);
if (ret) {
@@ -1664,7 +1646,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
* A few threads which were not waiting at mutex_lock_killable() can
* fail to bail out. Therefore, check again after holding oom_lock.
*/
- ret = task_is_dying() || out_of_memory(&oc);
+ ret = out_of_memory(&oc);
unlock:
mutex_unlock(&oom_lock);
@@ -1758,156 +1740,234 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
pr_cont(" are going to be killed due to memory.oom.group set\n");
}
+/*
+ * The value of NR_MEMCG_STOCK is selected to keep the cached memcgs and their
+ * nr_pages in a single cacheline. This may change in the future.
+ */
+#define NR_MEMCG_STOCK 7
+#define FLUSHING_CACHED_CHARGE 0
struct memcg_stock_pcp {
- localtry_lock_t stock_lock;
- struct mem_cgroup *cached; /* this never be root cgroup */
- unsigned int nr_pages;
+ local_trylock_t lock;
+ uint8_t nr_pages[NR_MEMCG_STOCK];
+ struct mem_cgroup *cached[NR_MEMCG_STOCK];
+ struct work_struct work;
+ unsigned long flags;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = {
+ .lock = INIT_LOCAL_TRYLOCK(lock),
+};
+
+struct obj_stock_pcp {
+ local_trylock_t lock;
+ unsigned int nr_bytes;
struct obj_cgroup *cached_objcg;
struct pglist_data *cached_pgdat;
- unsigned int nr_bytes;
int nr_slab_reclaimable_b;
int nr_slab_unreclaimable_b;
struct work_struct work;
unsigned long flags;
-#define FLUSHING_CACHED_CHARGE 0
};
-static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
- .stock_lock = INIT_LOCALTRY_LOCK(stock_lock),
+
+static DEFINE_PER_CPU_ALIGNED(struct obj_stock_pcp, obj_stock) = {
+ .lock = INIT_LOCAL_TRYLOCK(lock),
};
+
static DEFINE_MUTEX(percpu_charge_mutex);
-static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+static void drain_obj_stock(struct obj_stock_pcp *stock);
+static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
struct mem_cgroup *root_memcg);
/**
* consume_stock: Try to consume stocked charge on this cpu.
* @memcg: memcg to consume from.
* @nr_pages: how many pages to charge.
- * @gfp_mask: allocation mask.
*
- * The charges will only happen if @memcg matches the current cpu's memcg
- * stock, and at least @nr_pages are available in that stock. Failure to
- * service an allocation will refill the stock.
+ * Consume the cached charge if enough nr_pages are present; otherwise return
+ * failure. Also return failure for a charge request larger than
+ * MEMCG_CHARGE_BATCH or if the local lock is already taken.
*
* returns true if successful, false otherwise.
*/
-static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
- gfp_t gfp_mask)
+static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
- unsigned int stock_pages;
- unsigned long flags;
+ uint8_t stock_pages;
bool ret = false;
+ int i;
- if (nr_pages > MEMCG_CHARGE_BATCH)
+ if (nr_pages > MEMCG_CHARGE_BATCH ||
+ !local_trylock(&memcg_stock.lock))
return ret;
- if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
- if (!gfpflags_allow_spinning(gfp_mask))
- return ret;
- localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
- }
-
stock = this_cpu_ptr(&memcg_stock);
- stock_pages = READ_ONCE(stock->nr_pages);
- if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) {
- WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages);
- ret = true;
+
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+ if (memcg != READ_ONCE(stock->cached[i]))
+ continue;
+
+ stock_pages = READ_ONCE(stock->nr_pages[i]);
+ if (stock_pages >= nr_pages) {
+ WRITE_ONCE(stock->nr_pages[i], stock_pages - nr_pages);
+ ret = true;
+ }
+ break;
}
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock(&memcg_stock.lock);
return ret;
}
+static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, nr_pages);
+}
+
/*
* Returns stocks cached in percpu and reset cached information.
*/
-static void drain_stock(struct memcg_stock_pcp *stock)
+static void drain_stock(struct memcg_stock_pcp *stock, int i)
{
- unsigned int stock_pages = READ_ONCE(stock->nr_pages);
- struct mem_cgroup *old = READ_ONCE(stock->cached);
+ struct mem_cgroup *old = READ_ONCE(stock->cached[i]);
+ uint8_t stock_pages;
if (!old)
return;
+ stock_pages = READ_ONCE(stock->nr_pages[i]);
if (stock_pages) {
- page_counter_uncharge(&old->memory, stock_pages);
- if (do_memsw_account())
- page_counter_uncharge(&old->memsw, stock_pages);
-
- WRITE_ONCE(stock->nr_pages, 0);
+ memcg_uncharge(old, stock_pages);
+ WRITE_ONCE(stock->nr_pages[i], 0);
}
css_put(&old->css);
- WRITE_ONCE(stock->cached, NULL);
+ WRITE_ONCE(stock->cached[i], NULL);
+}
+
+static void drain_stock_fully(struct memcg_stock_pcp *stock)
+{
+ int i;
+
+ for (i = 0; i < NR_MEMCG_STOCK; ++i)
+ drain_stock(stock, i);
}
-static void drain_local_stock(struct work_struct *dummy)
+static void drain_local_memcg_stock(struct work_struct *dummy)
{
struct memcg_stock_pcp *stock;
- struct obj_cgroup *old = NULL;
- unsigned long flags;
- /*
- * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
- * drain_stock races is that we always operate on local CPU stock
- * here with IRQ disabled
- */
- localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (WARN_ONCE(!in_task(), "drain in non-task context"))
+ return;
+
+ local_lock(&memcg_stock.lock);
stock = this_cpu_ptr(&memcg_stock);
- old = drain_obj_stock(stock);
- drain_stock(stock);
+ drain_stock_fully(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- obj_cgroup_put(old);
+ local_unlock(&memcg_stock.lock);
}
-/*
- * Cache charges(val) to local per_cpu area.
- * This will be consumed by consume_stock() function, later.
- */
-static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static void drain_local_obj_stock(struct work_struct *dummy)
{
- struct memcg_stock_pcp *stock;
- unsigned int stock_pages;
+ struct obj_stock_pcp *stock;
- stock = this_cpu_ptr(&memcg_stock);
- if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
- drain_stock(stock);
- css_get(&memcg->css);
- WRITE_ONCE(stock->cached, memcg);
- }
- stock_pages = READ_ONCE(stock->nr_pages) + nr_pages;
- WRITE_ONCE(stock->nr_pages, stock_pages);
+ if (WARN_ONCE(!in_task(), "drain in non-task context"))
+ return;
- if (stock_pages > MEMCG_CHARGE_BATCH)
- drain_stock(stock);
+ local_lock(&obj_stock.lock);
+
+ stock = this_cpu_ptr(&obj_stock);
+ drain_obj_stock(stock);
+ clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
+
+ local_unlock(&obj_stock.lock);
}
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
- unsigned long flags;
+ struct memcg_stock_pcp *stock;
+ struct mem_cgroup *cached;
+ uint8_t stock_pages;
+ bool success = false;
+ int empty_slot = -1;
+ int i;
- if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
+ /*
+ * For now, limit MEMCG_CHARGE_BATCH to 127 or less. If it is ever
+ * increased beyond 127, nr_pages[] in struct memcg_stock_pcp will need
+ * more careful handling.
+ */
+ BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S8_MAX);
+
+ VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg));
+
+ if (nr_pages > MEMCG_CHARGE_BATCH ||
+ !local_trylock(&memcg_stock.lock)) {
/*
- * In case of unlikely failure to lock percpu stock_lock
- * uncharge memcg directly.
+ * For a larger-than-batch refill, or on the unlikely failure to
+ * take the percpu memcg_stock.lock, uncharge the memcg directly.
*/
- if (mem_cgroup_is_root(memcg))
- return;
- page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
+ memcg_uncharge(memcg, nr_pages);
return;
}
- __refill_stock(memcg, nr_pages);
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+ cached = READ_ONCE(stock->cached[i]);
+ if (!cached && empty_slot == -1)
+ empty_slot = i;
+ if (memcg == READ_ONCE(stock->cached[i])) {
+ stock_pages = READ_ONCE(stock->nr_pages[i]) + nr_pages;
+ WRITE_ONCE(stock->nr_pages[i], stock_pages);
+ if (stock_pages > MEMCG_CHARGE_BATCH)
+ drain_stock(stock, i);
+ success = true;
+ break;
+ }
+ }
+
+ if (!success) {
+ i = empty_slot;
+ if (i == -1) {
+ i = get_random_u32_below(NR_MEMCG_STOCK);
+ drain_stock(stock, i);
+ }
+ css_get(&memcg->css);
+ WRITE_ONCE(stock->cached[i], memcg);
+ WRITE_ONCE(stock->nr_pages[i], nr_pages);
+ }
+
+ local_unlock(&memcg_stock.lock);
+}
+
+static bool is_memcg_drain_needed(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg)
+{
+ struct mem_cgroup *memcg;
+ bool flush = false;
+ int i;
+
+ rcu_read_lock();
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+ memcg = READ_ONCE(stock->cached[i]);
+ if (!memcg)
+ continue;
+
+ if (READ_ONCE(stock->nr_pages[i]) &&
+ mem_cgroup_is_descendant(memcg, root_memcg)) {
+ flush = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return flush;
}
/*
@@ -1930,25 +1990,27 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
migrate_disable();
curcpu = smp_processor_id();
for_each_online_cpu(cpu) {
- struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
- struct mem_cgroup *memcg;
- bool flush = false;
+ struct memcg_stock_pcp *memcg_st = &per_cpu(memcg_stock, cpu);
+ struct obj_stock_pcp *obj_st = &per_cpu(obj_stock, cpu);
- rcu_read_lock();
- memcg = READ_ONCE(stock->cached);
- if (memcg && READ_ONCE(stock->nr_pages) &&
- mem_cgroup_is_descendant(memcg, root_memcg))
- flush = true;
- else if (obj_stock_flush_required(stock, root_memcg))
- flush = true;
- rcu_read_unlock();
+ if (!test_bit(FLUSHING_CACHED_CHARGE, &memcg_st->flags) &&
+ is_memcg_drain_needed(memcg_st, root_memcg) &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE,
+ &memcg_st->flags)) {
+ if (cpu == curcpu)
+ drain_local_memcg_stock(&memcg_st->work);
+ else if (!cpu_is_isolated(cpu))
+ schedule_work_on(cpu, &memcg_st->work);
+ }
- if (flush &&
- !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+ if (!test_bit(FLUSHING_CACHED_CHARGE, &obj_st->flags) &&
+ obj_stock_flush_required(obj_st, root_memcg) &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE,
+ &obj_st->flags)) {
if (cpu == curcpu)
- drain_local_stock(&stock->work);
+ drain_local_obj_stock(&obj_st->work);
else if (!cpu_is_isolated(cpu))
- schedule_work_on(cpu, &stock->work);
+ schedule_work_on(cpu, &obj_st->work);
}
}
migrate_enable();
@@ -1957,19 +2019,9 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
- struct memcg_stock_pcp *stock;
- struct obj_cgroup *old;
- unsigned long flags;
-
- stock = &per_cpu(memcg_stock, cpu);
-
- /* drain_obj_stock requires stock_lock */
- localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
- old = drain_obj_stock(stock);
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
-
- drain_stock(stock);
- obj_cgroup_put(old);
+ /* no need for the local lock */
+ drain_obj_stock(&per_cpu(obj_stock, cpu));
+ drain_stock_fully(&per_cpu(memcg_stock, cpu));
return 0;
}
@@ -2259,7 +2311,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned long pflags;
retry:
- if (consume_stock(memcg, nr_pages, gfp_mask))
+ if (consume_stock(memcg, nr_pages))
return 0;
if (!gfpflags_allow_spinning(gfp_mask))
@@ -2460,17 +2512,48 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
folio->memcg_data = (unsigned long)memcg;
}
-static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg,
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static inline void account_slab_nmi_safe(struct mem_cgroup *memcg,
+ struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
+{
+ struct lruvec *lruvec;
+
+ if (likely(!in_nmi())) {
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
+ } else {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id];
+
+ /* Preemption is disabled while in_nmi(). */
+ css_rstat_updated(&memcg->css, smp_processor_id());
+ if (idx == NR_SLAB_RECLAIMABLE_B)
+ atomic_add(nr, &pn->slab_reclaimable);
+ else
+ atomic_add(nr, &pn->slab_unreclaimable);
+ }
+}
+#else
+static inline void account_slab_nmi_safe(struct mem_cgroup *memcg,
+ struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
+{
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
+}
+#endif
+
+static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
struct mem_cgroup *memcg;
- struct lruvec *lruvec;
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
- __mod_memcg_lruvec_state(lruvec, idx, nr);
+ account_slab_nmi_safe(memcg, pgdat, idx, nr);
rcu_read_unlock();
}
@@ -2595,6 +2678,9 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
+ if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi())
+ return NULL;
+
if (in_task()) {
memcg = current->active_memcg;
if (unlikely(memcg))
@@ -2657,6 +2743,24 @@ struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
return objcg;
}
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
+{
+ if (likely(!in_nmi())) {
+ mod_memcg_state(memcg, MEMCG_KMEM, val);
+ } else {
+ /* Preemption is disabled while in_nmi(). */
+ css_rstat_updated(&memcg->css, smp_processor_id());
+ atomic_add(val, &memcg->kmem_stat);
+ }
+}
+#else
+static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
+{
+ mod_memcg_state(memcg, MEMCG_KMEM, val);
+}
+#endif
+
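account_kmem_nmi_safe() above, its slab counterpart, and flush_nmi_stats() further down in this patch share one shape: contexts that cannot take the regular update path (NMI) park the delta in a plain atomic, and the next rstat flush folds it back into the normal counters. A minimal standalone model of that split, with invented names:

#include <stdatomic.h>
#include <stdbool.h>

struct split_counter {
	long fast;                 /* normally a per-CPU counter */
	atomic_long nmi_deferred;  /* touched only from NMI-like contexts */
};

static void counter_add(struct split_counter *c, long delta, bool in_nmi)
{
	if (!in_nmi)
		c->fast += delta;                          /* regular path */
	else
		atomic_fetch_add(&c->nmi_deferred, delta); /* deferred path */
}

/* Flush side: fold the deferred updates back into the main count. */
static void counter_flush(struct split_counter *c)
{
	c->fast += atomic_exchange(&c->nmi_deferred, 0);
}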
/*
* obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
* @objcg: object cgroup to uncharge
@@ -2669,7 +2773,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
memcg = get_mem_cgroup_from_objcg(objcg);
- mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
+ account_kmem_nmi_safe(memcg, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
if (!mem_cgroup_is_root(memcg))
refill_stock(memcg, nr_pages);
@@ -2697,7 +2801,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
if (ret)
goto out;
- mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
+ account_kmem_nmi_safe(memcg, nr_pages);
memcg1_account_kmem(memcg, nr_pages);
out:
css_put(&memcg->css);
@@ -2765,50 +2869,27 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
obj_cgroup_put(objcg);
}
-/* Replace the stock objcg with objcg, return the old objcg */
-static struct obj_cgroup *replace_stock_objcg(struct memcg_stock_pcp *stock,
- struct obj_cgroup *objcg)
-{
- struct obj_cgroup *old = NULL;
-
- old = drain_obj_stock(stock);
- obj_cgroup_get(objcg);
- stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
- ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
- WRITE_ONCE(stock->cached_objcg, objcg);
- return old;
-}
-
-static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
- enum node_stat_item idx, int nr)
+static void __account_obj_stock(struct obj_cgroup *objcg,
+ struct obj_stock_pcp *stock, int nr,
+ struct pglist_data *pgdat, enum node_stat_item idx)
{
- struct memcg_stock_pcp *stock;
- struct obj_cgroup *old = NULL;
- unsigned long flags;
int *bytes;
- localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
- stock = this_cpu_ptr(&memcg_stock);
-
/*
* Save vmstat data in stock and skip vmstat array update unless
- * accumulating over a page of vmstat data or when pgdat or idx
- * changes.
+ * accumulating over a page of vmstat data or when pgdat changes.
*/
- if (READ_ONCE(stock->cached_objcg) != objcg) {
- old = replace_stock_objcg(stock, objcg);
- stock->cached_pgdat = pgdat;
- } else if (stock->cached_pgdat != pgdat) {
+ if (stock->cached_pgdat != pgdat) {
/* Flush the existing cached vmstat data */
struct pglist_data *oldpg = stock->cached_pgdat;
if (stock->nr_slab_reclaimable_b) {
- __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
+ mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
stock->nr_slab_reclaimable_b);
stock->nr_slab_reclaimable_b = 0;
}
if (stock->nr_slab_unreclaimable_b) {
- __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
+ mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
}
@@ -2834,37 +2915,38 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
}
}
if (nr)
- __mod_objcg_mlstate(objcg, pgdat, idx, nr);
-
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- obj_cgroup_put(old);
+ mod_objcg_mlstate(objcg, pgdat, idx, nr);
}
-static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
+ struct pglist_data *pgdat, enum node_stat_item idx)
{
- struct memcg_stock_pcp *stock;
- unsigned long flags;
+ struct obj_stock_pcp *stock;
bool ret = false;
- localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (!local_trylock(&obj_stock.lock))
+ return ret;
- stock = this_cpu_ptr(&memcg_stock);
+ stock = this_cpu_ptr(&obj_stock);
if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
stock->nr_bytes -= nr_bytes;
ret = true;
+
+ if (pgdat)
+ __account_obj_stock(objcg, stock, nr_bytes, pgdat, idx);
}
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock(&obj_stock.lock);
return ret;
}
-static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
+static void drain_obj_stock(struct obj_stock_pcp *stock)
{
struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
if (!old)
- return NULL;
+ return;
if (stock->nr_bytes) {
unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
@@ -2877,7 +2959,8 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
- __refill_stock(memcg, nr_pages);
+ if (!mem_cgroup_is_root(memcg))
+ memcg_uncharge(memcg, nr_pages);
css_put(&memcg->css);
}
@@ -2901,13 +2984,13 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
*/
if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
if (stock->nr_slab_reclaimable_b) {
- __mod_objcg_mlstate(old, stock->cached_pgdat,
+ mod_objcg_mlstate(old, stock->cached_pgdat,
NR_SLAB_RECLAIMABLE_B,
stock->nr_slab_reclaimable_b);
stock->nr_slab_reclaimable_b = 0;
}
if (stock->nr_slab_unreclaimable_b) {
- __mod_objcg_mlstate(old, stock->cached_pgdat,
+ mod_objcg_mlstate(old, stock->cached_pgdat,
NR_SLAB_UNRECLAIMABLE_B,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
@@ -2916,63 +2999,76 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
}
WRITE_ONCE(stock->cached_objcg, NULL);
- /*
- * The `old' objects needs to be released by the caller via
- * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
- */
- return old;
+ obj_cgroup_put(old);
}
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
struct mem_cgroup *root_memcg)
{
struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
struct mem_cgroup *memcg;
+ bool flush = false;
+ rcu_read_lock();
if (objcg) {
memcg = obj_cgroup_memcg(objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
- return true;
+ flush = true;
}
+ rcu_read_unlock();
- return false;
+ return flush;
}
static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
- bool allow_uncharge)
+ bool allow_uncharge, int nr_acct, struct pglist_data *pgdat,
+ enum node_stat_item idx)
{
- struct memcg_stock_pcp *stock;
- struct obj_cgroup *old = NULL;
- unsigned long flags;
+ struct obj_stock_pcp *stock;
unsigned int nr_pages = 0;
- localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (!local_trylock(&obj_stock.lock)) {
+ if (pgdat)
+ mod_objcg_mlstate(objcg, pgdat, idx, nr_bytes);
+ nr_pages = nr_bytes >> PAGE_SHIFT;
+ nr_bytes = nr_bytes & (PAGE_SIZE - 1);
+ atomic_add(nr_bytes, &objcg->nr_charged_bytes);
+ goto out;
+ }
- stock = this_cpu_ptr(&memcg_stock);
+ stock = this_cpu_ptr(&obj_stock);
if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
- old = replace_stock_objcg(stock, objcg);
+ drain_obj_stock(stock);
+ obj_cgroup_get(objcg);
+ stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
+ ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+ WRITE_ONCE(stock->cached_objcg, objcg);
+
allow_uncharge = true; /* Allow uncharge when objcg changes */
}
stock->nr_bytes += nr_bytes;
+ if (pgdat)
+ __account_obj_stock(objcg, stock, nr_acct, pgdat, idx);
+
if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
nr_pages = stock->nr_bytes >> PAGE_SHIFT;
stock->nr_bytes &= (PAGE_SIZE - 1);
}
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- obj_cgroup_put(old);
-
+ local_unlock(&obj_stock.lock);
+out:
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
}
-int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t size,
+ struct pglist_data *pgdat, enum node_stat_item idx)
{
unsigned int nr_pages, nr_bytes;
int ret;
- if (consume_obj_stock(objcg, size))
+ if (likely(consume_obj_stock(objcg, size, pgdat, idx)))
return 0;
/*
@@ -3005,15 +3101,21 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
nr_pages += 1;
ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
- if (!ret && nr_bytes)
- refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
+ if (!ret && (nr_bytes || pgdat))
+ refill_obj_stock(objcg, nr_bytes ? PAGE_SIZE - nr_bytes : 0,
+ false, size, pgdat, idx);
return ret;
}
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+{
+ return obj_cgroup_charge_account(objcg, gfp, size, NULL, 0);
+}
+
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
{
- refill_obj_stock(objcg, size, true);
+ refill_obj_stock(objcg, size, true, 0, NULL, 0);
}
static inline size_t obj_full_size(struct kmem_cache *s)
@@ -3065,23 +3167,32 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
return false;
}
- if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s)))
- return false;
-
for (i = 0; i < size; i++) {
slab = virt_to_slab(p[i]);
if (!slab_obj_exts(slab) &&
alloc_slab_obj_exts(slab, s, flags, false)) {
- obj_cgroup_uncharge(objcg, obj_full_size(s));
continue;
}
+ /*
+ * if we fail and size is 1, memcg_alloc_abort_single() will
+ * just free the object, which is ok as we have not assigned
+ * objcg to its obj_ext yet
+ *
+ * for larger sizes, kmem_cache_free_bulk() will uncharge
+ * any objects that were already charged and obj_ext assigned
+ *
+ * TODO: we could batch this until slab_pgdat(slab) changes
+ * between iterations, with a more complicated undo
+ */
+ if (obj_cgroup_charge_account(objcg, flags, obj_full_size(s),
+ slab_pgdat(slab), cache_vmstat_idx(s)))
+ return false;
+
off = obj_to_index(s, slab, p[i]);
obj_cgroup_get(objcg);
slab_obj_exts(slab)[off].objcg = objcg;
- mod_objcg_state(objcg, slab_pgdat(slab),
- cache_vmstat_idx(s), obj_full_size(s));
}
return true;
@@ -3090,6 +3201,8 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects, struct slabobj_ext *obj_exts)
{
+ size_t obj_size = obj_full_size(s);
+
for (int i = 0; i < objects; i++) {
struct obj_cgroup *objcg;
unsigned int off;
@@ -3100,9 +3213,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
continue;
obj_exts[off].objcg = NULL;
- obj_cgroup_uncharge(objcg, obj_full_size(s));
- mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s),
- -obj_full_size(s));
+ refill_obj_stock(objcg, obj_size, true, -obj_size,
+ slab_pgdat(slab), cache_vmstat_idx(s));
obj_cgroup_put(objcg);
}
}
@@ -3544,7 +3656,8 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
- pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
+ pn = kmem_cache_alloc_node(memcg_pn_cachep, GFP_KERNEL | __GFP_ZERO,
+ node);
if (!pn)
return false;
@@ -3591,13 +3704,14 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
{
- struct memcg_vmstats_percpu *statc, *pstatc;
+ struct memcg_vmstats_percpu *statc;
+ struct memcg_vmstats_percpu __percpu *pstatc_pcpu;
struct mem_cgroup *memcg;
int node, cpu;
int __maybe_unused i;
long error;
- memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
+ memcg = kmem_cache_zalloc(memcg_cachep, GFP_KERNEL);
if (!memcg)
return ERR_PTR(-ENOMEM);
@@ -3622,9 +3736,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
for_each_possible_cpu(cpu) {
if (parent)
- pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu);
+ pstatc_pcpu = parent->vmstats_percpu;
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
- statc->parent = parent ? pstatc : NULL;
+ statc->parent_pcpu = parent ? pstatc_pcpu : NULL;
statc->vmstats = memcg->vmstats;
}
@@ -3640,7 +3754,10 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
INIT_LIST_HEAD(&memcg->memory_peaks);
INIT_LIST_HEAD(&memcg->swap_peaks);
spin_lock_init(&memcg->peaks_lock);
- memcg->socket_pressure = jiffies;
+ memcg->socket_pressure = get_jiffies_64();
+#if BITS_PER_LONG < 64
+ seqlock_init(&memcg->socket_pressure_seqlock);
+#endif
memcg1_memcg_init(memcg);
memcg->kmemcg_id = -1;
INIT_LIST_HEAD(&memcg->objcg_list);
@@ -3898,6 +4015,53 @@ static void mem_cgroup_stat_aggregate(struct aggregate_control *ac)
}
}
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
+ int cpu)
+{
+ int nid;
+
+ if (atomic_read(&memcg->kmem_stat)) {
+ int kmem = atomic_xchg(&memcg->kmem_stat, 0);
+ int index = memcg_stats_index(MEMCG_KMEM);
+
+ memcg->vmstats->state[index] += kmem;
+ if (parent)
+ parent->vmstats->state_pending[index] += kmem;
+ }
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ struct lruvec_stats *lstats = pn->lruvec_stats;
+ struct lruvec_stats *plstats = NULL;
+
+ if (parent)
+ plstats = parent->nodeinfo[nid]->lruvec_stats;
+
+ if (atomic_read(&pn->slab_reclaimable)) {
+ int slab = atomic_xchg(&pn->slab_reclaimable, 0);
+ int index = memcg_stats_index(NR_SLAB_RECLAIMABLE_B);
+
+ lstats->state[index] += slab;
+ if (plstats)
+ plstats->state_pending[index] += slab;
+ }
+ if (atomic_read(&pn->slab_unreclaimable)) {
+ int slab = atomic_xchg(&pn->slab_unreclaimable, 0);
+ int index = memcg_stats_index(NR_SLAB_UNRECLAIMABLE_B);
+
+ lstats->state[index] += slab;
+ if (plstats)
+ plstats->state_pending[index] += slab;
+ }
+ }
+}
+#else
+static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
+ int cpu)
+{}
+#endif
+
static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -3906,6 +4070,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
struct aggregate_control ac;
int nid;
+ flush_nmi_stats(memcg, parent, cpu);
+
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
ac = (struct aggregate_control) {
@@ -3955,8 +4121,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
}
WRITE_ONCE(statc->stats_updates, 0);
/* We are in a per-cpu loop here, only do the atomic write once */
- if (atomic64_read(&memcg->vmstats->stats_updates))
- atomic64_set(&memcg->vmstats->stats_updates, 0);
+ if (atomic_read(&memcg->vmstats->stats_updates))
+ atomic_set(&memcg->vmstats->stats_updates, 0);
}
static void mem_cgroup_fork(struct task_struct *task)
@@ -4197,6 +4363,9 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
page_counter_set_high(&memcg->memory, high);
+ if (of->file->f_flags & O_NONBLOCK)
+ goto out;
+
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
unsigned long reclaimed;
@@ -4219,7 +4388,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
if (!reclaimed && !nr_retries--)
break;
}
-
+out:
memcg_wb_domain_size_changed(memcg);
return nbytes;
}
@@ -4246,6 +4415,9 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
xchg(&memcg->memory.max, max);
+ if (of->file->f_flags & O_NONBLOCK)
+ goto out;
+
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
@@ -4273,7 +4445,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
break;
cond_resched();
}
-
+out:
memcg_wb_domain_size_changed(memcg);
return nbytes;
}
@@ -4394,78 +4566,15 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
return nbytes;
}
-enum {
- MEMORY_RECLAIM_SWAPPINESS = 0,
- MEMORY_RECLAIM_NULL,
-};
-
-static const match_table_t tokens = {
- { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
- { MEMORY_RECLAIM_NULL, NULL },
-};
-
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned int nr_retries = MAX_RECLAIM_RETRIES;
- unsigned long nr_to_reclaim, nr_reclaimed = 0;
- int swappiness = -1;
- unsigned int reclaim_options;
- char *old_buf, *start;
- substring_t args[MAX_OPT_ARGS];
-
- buf = strstrip(buf);
-
- old_buf = buf;
- nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
- if (buf == old_buf)
- return -EINVAL;
-
- buf = strstrip(buf);
-
- while ((start = strsep(&buf, " ")) != NULL) {
- if (!strlen(start))
- continue;
- switch (match_token(start, tokens, args)) {
- case MEMORY_RECLAIM_SWAPPINESS:
- if (match_int(&args[0], &swappiness))
- return -EINVAL;
- if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS)
- return -EINVAL;
- break;
- default:
- return -EINVAL;
- }
- }
-
- reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
- while (nr_reclaimed < nr_to_reclaim) {
- /* Will converge on zero, but reclaim enforces a minimum */
- unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4;
- unsigned long reclaimed;
-
- if (signal_pending(current))
- return -EINTR;
-
- /*
- * This is the final attempt, drain percpu lru caches in the
- * hope of introducing more evictable pages for
- * try_to_free_mem_cgroup_pages().
- */
- if (!nr_retries)
- lru_add_drain_all();
-
- reclaimed = try_to_free_mem_cgroup_pages(memcg,
- batch_size, GFP_KERNEL,
- reclaim_options,
- swappiness == -1 ? NULL : &swappiness);
-
- if (!reclaimed && !nr_retries--)
- return -EAGAIN;
+ int ret;
- nr_reclaimed += reclaimed;
- }
+ ret = user_proactive_reclaim(buf, memcg, NULL);
+ if (ret)
+ return ret;
return nbytes;
}
@@ -4698,9 +4807,7 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
static void uncharge_batch(const struct uncharge_gather *ug)
{
if (ug->nr_memory) {
- page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
- if (do_memsw_account())
- page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
+ memcg_uncharge(ug->memcg, ug->nr_memory);
if (ug->nr_kmem) {
mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem);
memcg1_account_kmem(ug->memcg, -ug->nr_kmem);
@@ -4976,15 +5083,16 @@ static int __init cgroup_memory(char *s)
__setup("cgroup.memory=", cgroup_memory);
/*
- * subsys_initcall() for memory controller.
+ * Memory controller init, run before cgroup_init() initializes root_mem_cgroup.
*
* Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
* context because of lock dependencies (cgroup_lock -> cpu hotplug) but
* basically everything that doesn't depend on a specific mem_cgroup structure
* should be initialized from here.
*/
-static int __init mem_cgroup_init(void)
+int __init mem_cgroup_init(void)
{
+ unsigned int memcg_size;
int cpu;
/*
@@ -4998,13 +5106,22 @@ static int __init mem_cgroup_init(void)
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
memcg_hotplug_cpu_dead);
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
- drain_local_stock);
+ drain_local_memcg_stock);
+ INIT_WORK(&per_cpu_ptr(&obj_stock, cpu)->work,
+ drain_local_obj_stock);
+ }
+
+ memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids);
+ memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0,
+ SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
+
+ memcg_pn_cachep = KMEM_CACHE(mem_cgroup_per_node,
+ SLAB_PANIC | SLAB_HWCACHE_ALIGN);
return 0;
}
-subsys_initcall(mem_cgroup_init);
#ifdef CONFIG_SWAP
/**
@@ -5458,3 +5575,8 @@ static int __init mem_cgroup_swap_init(void)
subsys_initcall(mem_cgroup_swap_init);
#endif /* CONFIG_SWAP */
+
+bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+{
+ return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
+}
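One user-visible effect of the memory_high_write() and memory_max_write() changes above: when the control file is opened with O_NONBLOCK, the write returns as soon as the new limit is recorded and the synchronous reclaim loop is skipped, leaving enforcement to the normal charge path. A minimal userspace sketch; the cgroup path and the value are examples only.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *val = "536870912";  /* 512 MiB, example limit */
	int fd;

	fd = open("/sys/fs/cgroup/example/memory.max", O_WRONLY | O_NONBLOCK);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Returns once the limit is set; no synchronous reclaim is attempted. */
	if (write(fd, val, strlen(val)) < 0)
		perror("write");

	close(fd);
	return 0;
}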
diff --git a/mm/memfd.c b/mm/memfd.c
index c64df1343059..bbe679895ef6 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -20,6 +20,7 @@
#include <linux/memfd.h>
#include <linux/pid_namespace.h>
#include <uapi/linux/memfd.h>
+#include "swap.h"
/*
* We need a tag: a new tag would expand every xa_node by 8 bytes,
@@ -31,8 +32,7 @@
static bool memfd_folio_has_extra_refs(struct folio *folio)
{
- return folio_ref_count(folio) - folio_mapcount(folio) !=
- folio_nr_pages(folio);
+ return folio_ref_count(folio) != folio_expected_ref_count(folio);
}
static void memfd_tag_pins(struct xa_state *xas)
@@ -70,7 +70,6 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
#ifdef CONFIG_HUGETLB_PAGE
struct folio *folio;
gfp_t gfp_mask;
- int err;
if (is_file_hugepages(memfd)) {
/*
@@ -79,12 +78,19 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
* alloc from. Also, the folio will be pinned for an indefinite
* amount of time, so it is not expected to be migrated away.
*/
+ struct inode *inode = file_inode(memfd);
struct hstate *h = hstate_file(memfd);
+ int err = -ENOMEM;
+ long nr_resv;
gfp_mask = htlb_alloc_mask(h);
gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE);
idx >>= huge_page_order(h);
+ nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, 0);
+ if (nr_resv < 0)
+ return ERR_PTR(nr_resv);
+
folio = alloc_hugetlb_folio_reserve(h,
numa_node_id(),
NULL,
@@ -95,12 +101,17 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
idx);
if (err) {
folio_put(folio);
- return ERR_PTR(err);
+ goto err_unresv;
}
+
+ hugetlb_set_folio_subpool(folio, subpool_inode(inode));
folio_unlock(folio);
return folio;
}
- return ERR_PTR(-ENOMEM);
+err_unresv:
+ if (nr_resv > 0)
+ hugetlb_unreserve_pages(inode, idx, idx + 1, 0);
+ return ERR_PTR(err);
}
#endif
return shmem_read_folio(memfd->f_mapping, idx);
@@ -332,10 +343,10 @@ static inline bool is_write_sealed(unsigned int seals)
return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
}
-static int check_write_seal(unsigned long *vm_flags_ptr)
+static int check_write_seal(vm_flags_t *vm_flags_ptr)
{
- unsigned long vm_flags = *vm_flags_ptr;
- unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE);
+ vm_flags_t vm_flags = *vm_flags_ptr;
+ vm_flags_t mask = vm_flags & (VM_SHARED | VM_WRITE);
/* If a private mapping then writability is irrelevant. */
if (!(mask & VM_SHARED))
@@ -357,7 +368,7 @@ static int check_write_seal(unsigned long *vm_flags_ptr)
return 0;
}
-int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr)
+int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr)
{
int err = 0;
unsigned int *seals_ptr = memfd_file_seals_ptr(file);
@@ -400,7 +411,7 @@ static char *alloc_name(const char __user *uname)
if (!name)
return ERR_PTR(-ENOMEM);
- strcpy(name, MFD_NAME_PREFIX);
+ memcpy(name, MFD_NAME_PREFIX, MFD_NAME_PREFIX_LEN);
/* returned length does not include terminating zero */
len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1);
if (len < 0) {
@@ -474,22 +485,22 @@ SYSCALL_DEFINE2(memfd_create,
fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
if (fd < 0) {
error = fd;
- goto err_name;
+ goto err_free_name;
}
file = alloc_file(name, flags);
if (IS_ERR(file)) {
error = PTR_ERR(file);
- goto err_fd;
+ goto err_free_fd;
}
fd_install(fd, file);
kfree(name);
return fd;
-err_fd:
+err_free_fd:
put_unused_fd(fd);
-err_name:
+err_free_name:
kfree(name);
return error;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b91a33fb6c69..3047b9ac667e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1388,8 +1388,8 @@ static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
if (PageSlab(page))
return false;
- /* Soft offline could migrate non-LRU movable pages */
- if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
+ /* Soft offline could migrate movable_ops pages */
+ if ((flags & MF_SOFT_OFFLINE) && page_has_movable_ops(page))
return true;
return PageLRU(page) || is_free_buddy_page(page);
@@ -1561,6 +1561,10 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
return ret;
}
+/*
+ * The caller must guarantee the folio isn't a large folio, except for
+ * hugetlb; try_to_unmap() can't handle large folios.
+ */
int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
@@ -2503,19 +2507,6 @@ static void memory_failure_work_func(struct work_struct *work)
}
}
-/*
- * Process memory_failure work queued on the specified CPU.
- * Used to avoid return-to-userspace racing with the memory_failure workqueue.
- */
-void memory_failure_queue_kick(int cpu)
-{
- struct memory_failure_cpu *mf_cpu;
-
- mf_cpu = &per_cpu(memory_failure_cpu, cpu);
- cancel_work_sync(&mf_cpu->work);
- memory_failure_work_func(&mf_cpu->work);
-}
-
static int __init memory_failure_init(void)
{
struct memory_failure_cpu *mf_cpu;
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index fc14fe53e9b7..0382b6942b8b 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -872,25 +872,18 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self,
unsigned long action, void *_arg)
{
struct memory_tier *memtier;
- struct memory_notify *arg = _arg;
-
- /*
- * Only update the node migration order when a node is
- * changing status, like online->offline.
- */
- if (arg->status_change_nid < 0)
- return notifier_from_errno(0);
+ struct node_notify *nn = _arg;
switch (action) {
- case MEM_OFFLINE:
+ case NODE_REMOVED_LAST_MEMORY:
mutex_lock(&memory_tier_lock);
- if (clear_node_memory_tier(arg->status_change_nid))
+ if (clear_node_memory_tier(nn->nid))
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);
break;
- case MEM_ONLINE:
+ case NODE_ADDED_FIRST_MEMORY:
mutex_lock(&memory_tier_lock);
- memtier = set_node_memory_tier(arg->status_change_nid);
+ memtier = set_node_memory_tier(nn->nid);
if (!IS_ERR(memtier))
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);
@@ -929,7 +922,7 @@ static int __init memory_tier_init(void)
nodes_and(default_dram_nodes, node_states[N_MEMORY],
node_states[N_CPU]);
- hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
+ hotplug_node_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
return 0;
}
subsys_initcall(memory_tier_init);
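	The hunk above is representative of the conversion from memory notifiers to the new node notifier: callbacks now react to a node gaining its first memory or losing its last, instead of filtering MEM_ONLINE/MEM_OFFLINE on status_change_nid. A minimal hypothetical consumer of the same API (names prefixed demo_ are illustrative):

		static int demo_node_callback(struct notifier_block *nb,
					      unsigned long action, void *arg)
		{
			struct node_notify *nn = arg;

			switch (action) {
			case NODE_ADDED_FIRST_MEMORY:
				pr_info("node %d gained its first memory\n", nn->nid);
				break;
			case NODE_REMOVED_LAST_MEMORY:
				pr_info("node %d became memoryless\n", nn->nid);
				break;
			}
			return NOTIFY_OK;
		}

		static int __init demo_init(void)
		{
			/* Same registration pattern as memtier_hotplug_callback above. */
			hotplug_node_notifier(demo_node_callback, 0);
			return 0;
		}
		subsys_initcall(demo_init);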
diff --git a/mm/memory.c b/mm/memory.c
index 2d8c265fc7d6..0ba4f6b71847 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,7 +57,6 @@
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
-#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
@@ -125,6 +124,24 @@ int randomize_va_space __read_mostly =
2;
#endif
+static const struct ctl_table mmu_sysctl_table[] = {
+ {
+ .procname = "randomize_va_space",
+ .data = &randomize_va_space,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+};
+
+static int __init init_mm_sysctl(void)
+{
+ register_sysctl_init("kernel", mmu_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_mm_sysctl);
+
#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
@@ -278,8 +295,17 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
p4d_free_tlb(tlb, p4d, start);
}
-/*
- * This function frees user-level page tables of a process.
+/**
+ * free_pgd_range - Unmap and free page tables in the range
+ * @tlb: the mmu_gather containing pending TLB flush info
+ * @addr: virtual address start
+ * @end: virtual address end
+ * @floor: lowest address boundary
+ * @ceiling: highest address boundary
+ *
+ * This function tears down all user-level page tables in the
+ * specified virtual address range [@addr..@end). It is part of
+ * the memory unmap flow.
*/
void free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
@@ -349,6 +375,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
{
struct unlink_vma_file_batch vb;
+ tlb_free_vmas(tlb);
+
do {
unsigned long addr = vma->vm_start;
struct vm_area_struct *next;
@@ -369,32 +397,26 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
vma_start_write(vma);
unlink_anon_vmas(vma);
- if (is_vm_hugetlb_page(vma)) {
- unlink_file_vma(vma);
- hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
- floor, next ? next->vm_start : ceiling);
- } else {
- unlink_file_vma_batch_init(&vb);
- unlink_file_vma_batch_add(&vb, vma);
+ unlink_file_vma_batch_init(&vb);
+ unlink_file_vma_batch_add(&vb, vma);
- /*
- * Optimization: gather nearby vmas into one call down
- */
- while (next && next->vm_start <= vma->vm_end + PMD_SIZE
- && !is_vm_hugetlb_page(next)) {
- vma = next;
- next = mas_find(mas, ceiling - 1);
- if (unlikely(xa_is_zero(next)))
- next = NULL;
- if (mm_wr_locked)
- vma_start_write(vma);
- unlink_anon_vmas(vma);
- unlink_file_vma_batch_add(&vb, vma);
- }
- unlink_file_vma_batch_final(&vb);
- free_pgd_range(tlb, addr, vma->vm_end,
- floor, next ? next->vm_start : ceiling);
+ /*
+ * Optimization: gather nearby vmas into one call down
+ */
+ while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
+ vma = next;
+ next = mas_find(mas, ceiling - 1);
+ if (unlikely(xa_is_zero(next)))
+ next = NULL;
+ if (mm_wr_locked)
+ vma_start_write(vma);
+ unlink_anon_vmas(vma);
+ unlink_file_vma_batch_add(&vb, vma);
}
+ unlink_file_vma_batch_final(&vb);
+
+ free_pgd_range(tlb, addr, vma->vm_end,
+ floor, next ? next->vm_start : ceiling);
vma = next;
} while (vma);
}
@@ -518,10 +540,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
dump_page(page, "bad pte");
pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
- pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n",
+ pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n",
vma->vm_file,
vma->vm_ops ? vma->vm_ops->fault : NULL,
vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
+ vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL,
mapping ? mapping->a_ops->read_folio : NULL);
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
@@ -586,16 +609,6 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return NULL;
if (is_zero_pfn(pfn))
return NULL;
- if (pte_devmap(pte))
- /*
- * NOTE: New users of ZONE_DEVICE will not set pte_devmap()
- * and will have refcounts incremented on their struct pages
- * when they are inserted into PTEs, thus they are safe to
- * return here. Legacy ZONE_DEVICE pages that set pte_devmap()
- * do not have refcounts. Example of legacy ZONE_DEVICE is
- * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
- */
- return NULL;
print_bad_pte(vma, addr, pte, NULL);
return NULL;
@@ -673,9 +686,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
}
}
- if (pmd_devmap(pmd))
- return NULL;
- if (is_huge_zero_pmd(pmd))
+ if (is_huge_zero_pfn(pfn))
return NULL;
if (unlikely(pfn > highest_memmap_pfn))
return NULL;
@@ -785,7 +796,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
- unsigned long vm_flags = dst_vma->vm_flags;
+ vm_flags_t vm_flags = dst_vma->vm_flags;
pte_t orig_pte = ptep_get(src_pte);
pte_t pte = orig_pte;
struct folio *folio;
@@ -929,7 +940,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
rss[MM_ANONPAGES]++;
/* All done, just insert the new page copy in the child */
- pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot);
+ pte = folio_mk_pte(new_folio, dst_vma->vm_page_prot);
pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte)))
/* Uffd-wp needs to be delivered to dest pte as well */
@@ -973,10 +984,9 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr,
int max_nr, int *rss, struct folio **prealloc)
{
+ fpb_t flags = FPB_MERGE_WRITE;
struct page *page;
struct folio *folio;
- bool any_writable;
- fpb_t flags = 0;
int err, nr;
page = vm_normal_page(src_vma, addr, pte);
@@ -991,13 +1001,12 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
* by keeping the batching logic separate.
*/
if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) {
- if (src_vma->vm_flags & VM_SHARED)
- flags |= FPB_IGNORE_DIRTY;
- if (!vma_soft_dirty_enabled(src_vma))
- flags |= FPB_IGNORE_SOFT_DIRTY;
+ if (!(src_vma->vm_flags & VM_SHARED))
+ flags |= FPB_RESPECT_DIRTY;
+ if (vma_soft_dirty_enabled(src_vma))
+ flags |= FPB_RESPECT_SOFT_DIRTY;
- nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
- &any_writable, NULL, NULL);
+ nr = folio_pte_batch_flags(folio, src_vma, src_pte, &pte, max_nr, flags);
folio_ref_add(folio, nr);
if (folio_test_anon(folio)) {
if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
@@ -1011,8 +1020,6 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
folio_dup_file_rmap_ptes(folio, page, nr, dst_vma);
rss[mm_counter_file(folio)] += nr;
}
- if (any_writable)
- pte = pte_mkwrite(pte, src_vma);
__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
addr, nr);
return nr;
@@ -1238,8 +1245,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
- || pmd_devmap(*src_pmd)) {
+ if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)) {
int err;
VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
@@ -1275,7 +1281,7 @@ copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
src_pud = pud_offset(src_p4d, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
+ if (pud_trans_huge(*src_pud)) {
int err;
VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
@@ -1361,7 +1367,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
struct mmu_notifier_range range;
- unsigned long next, pfn;
+ unsigned long next;
bool is_cow;
int ret;
@@ -1371,12 +1377,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
if (is_vm_hugetlb_page(src_vma))
return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
- if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
- ret = track_pfn_copy(dst_vma, src_vma, &pfn);
- if (ret)
- return ret;
- }
-
/*
* We need to invalidate the secondary MMU mappings only when
* there could be a permission downgrade on the ptes of the
@@ -1418,8 +1418,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
raw_write_seqcount_end(&src_mm->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
}
- if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
- untrack_pfn_copy(dst_vma, pfn);
return ret;
}
@@ -1545,7 +1543,6 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
struct zap_details *details, int *rss, bool *force_flush,
bool *force_break, bool *any_skipped)
{
- const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
struct mm_struct *mm = tlb->mm;
struct folio *folio;
struct page *page;
@@ -1575,9 +1572,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
* by keeping the batching logic separate.
*/
if (unlikely(folio_test_large(folio) && max_nr != 1)) {
- nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
- NULL, NULL, NULL);
-
+ nr = folio_pte_batch(folio, pte, ptent, max_nr);
zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
addr, details, rss, force_flush,
force_break, any_skipped);
@@ -1797,9 +1792,9 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
+ if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
- __split_huge_pmd(vma, pmd, addr, false, NULL);
+ __split_huge_pmd(vma, pmd, addr, false);
else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
addr = next;
continue;
@@ -1839,7 +1834,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
+ if (pud_trans_huge(*pud)) {
if (next - addr != HPAGE_PUD_SIZE) {
mmap_assert_locked(tlb->mm);
split_huge_pud(vma, pud, addr);
@@ -1914,9 +1909,6 @@ static void unmap_single_vma(struct mmu_gather *tlb,
if (vma->vm_file)
uprobe_munmap(vma, start, end);
- if (unlikely(vma->vm_flags & VM_PFNMAP))
- untrack_pfn(vma, 0, 0, mm_wr_locked);
-
if (start != end) {
if (unlikely(is_vm_hugetlb_page(vma))) {
/*
@@ -1990,35 +1982,64 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
}
/**
- * zap_page_range_single - remove user pages in a given range
+ * zap_page_range_single_batched - remove user pages in a given range
+ * @tlb: pointer to the caller's struct mmu_gather
* @vma: vm_area_struct holding the applicable pages
- * @address: starting address of pages to zap
- * @size: number of bytes to zap
+ * @address: starting address of pages to remove
+ * @size: number of bytes to remove
* @details: details of shared cache invalidation
*
- * The range must fit into one VMA.
+ * @tlb shouldn't be NULL. The range must fit into one VMA. If @vma is for
+ * hugetlb, @tlb is flushed and re-initialized by this function.
*/
-void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+void zap_page_range_single_batched(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
const unsigned long end = address + size;
struct mmu_notifier_range range;
- struct mmu_gather tlb;
+
+ VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, end);
hugetlb_zap_begin(vma, &range.start, &range.end);
- tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
/*
* unmap 'address-end' not 'range.start-range.end' as range
* could have been expanded for hugetlb pmd sharing.
*/
- unmap_single_vma(&tlb, vma, address, end, details, false);
+ unmap_single_vma(tlb, vma, address, end, details, false);
mmu_notifier_invalidate_range_end(&range);
+ if (is_vm_hugetlb_page(vma)) {
+ /*
+ * Flush the TLB and free resources before hugetlb_zap_end() to
+ * avoid allocation failures in concurrent page faults.
+ */
+ tlb_finish_mmu(tlb);
+ hugetlb_zap_end(vma, details);
+ tlb_gather_mmu(tlb, vma->vm_mm);
+ }
+}
+
+/**
+ * zap_page_range_single - remove user pages in a given range
+ * @vma: vm_area_struct holding the applicable pages
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ * @details: details of shared cache invalidation
+ *
+ * The range must fit into one VMA.
+ */
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+ unsigned long size, struct zap_details *details)
+{
+ struct mmu_gather tlb;
+
+ tlb_gather_mmu(&tlb, vma->vm_mm);
+ zap_page_range_single_batched(&tlb, vma, address, size, details);
tlb_finish_mmu(&tlb);
- hugetlb_zap_end(vma, details);
}
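	For context, the split above lets a caller reuse one mmu_gather across several zaps and flush once, presumably so that paths such as the madvise changes elsewhere in this series can batch TLB flushes; zap_page_range_single() is now just the single-range wrapper. A hedged sketch of the batched form (mm, vma1/vma2 and the addr/len values are placeholders):

		/* Sketch: zap two ranges from VMAs of the same mm with one mmu_gather. */
		struct mmu_gather tlb;

		tlb_gather_mmu(&tlb, mm);
		zap_page_range_single_batched(&tlb, vma1, addr1, len1, NULL);
		zap_page_range_single_batched(&tlb, vma2, addr2, len2, NULL);
		tlb_finish_mmu(&tlb);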
/**
@@ -2418,7 +2439,7 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
EXPORT_SYMBOL(vm_map_pages_zero);
static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, pgprot_t prot, bool mkwrite)
+ unsigned long pfn, pgprot_t prot, bool mkwrite)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte, entry;
@@ -2440,7 +2461,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
* allocation and mapping invalidation so just skip the
* update.
*/
- if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) {
+ if (pte_pfn(entry) != pfn) {
WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
goto out_unlock;
}
@@ -2453,10 +2474,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
}
/* Ok, finally just insert the thing.. */
- if (pfn_t_devmap(pfn))
- entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
- else
- entry = pte_mkspecial(pfn_t_pte(pfn, prot));
+ entry = pte_mkspecial(pfn_pte(pfn, prot));
if (mkwrite) {
entry = pte_mkyoung(entry);
@@ -2525,10 +2543,9 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
if (!pfn_modify_allowed(pfn, pgprot))
return VM_FAULT_SIGBUS;
- track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
+ pfnmap_setup_cachemode_pfn(pfn, &pgprot);
- return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
- false);
+ return insert_pfn(vma, addr, pfn, pgprot, false);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);
@@ -2559,25 +2576,22 @@ vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vmf_insert_pfn);
-static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite)
+static bool vm_mixed_ok(struct vm_area_struct *vma, unsigned long pfn,
+ bool mkwrite)
{
- if (unlikely(is_zero_pfn(pfn_t_to_pfn(pfn))) &&
+ if (unlikely(is_zero_pfn(pfn)) &&
(mkwrite || !vm_mixed_zeropage_allowed(vma)))
return false;
/* these checks mirror the abort conditions in vm_normal_page */
if (vma->vm_flags & VM_MIXEDMAP)
return true;
- if (pfn_t_devmap(pfn))
- return true;
- if (pfn_t_special(pfn))
- return true;
- if (is_zero_pfn(pfn_t_to_pfn(pfn)))
+ if (is_zero_pfn(pfn))
return true;
return false;
}
static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
- unsigned long addr, pfn_t pfn, bool mkwrite)
+ unsigned long addr, unsigned long pfn, bool mkwrite)
{
pgprot_t pgprot = vma->vm_page_prot;
int err;
@@ -2588,9 +2602,9 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
- track_pfn_insert(vma, &pgprot, pfn);
+ pfnmap_setup_cachemode_pfn(pfn, &pgprot);
- if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
+ if (!pfn_modify_allowed(pfn, pgprot))
return VM_FAULT_SIGBUS;
/*
@@ -2600,8 +2614,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
* than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
* without pte special, it would there be refcounted as a normal page.
*/
- if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
- !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pfn_valid(pfn)) {
struct page *page;
/*
@@ -2609,7 +2622,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
* regardless of whether the caller specified flags that
* result in pfn_t_has_page() == false.
*/
- page = pfn_to_page(pfn_t_to_pfn(pfn));
+ page = pfn_to_page(pfn);
err = insert_page(vma, addr, page, pgprot, mkwrite);
} else {
return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
@@ -2644,7 +2657,7 @@ vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite);
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn)
+ unsigned long pfn)
{
return __vm_insert_mixed(vma, addr, pfn, false);
}
@@ -2656,7 +2669,7 @@ EXPORT_SYMBOL(vmf_insert_mixed);
* the same entry was actually inserted.
*/
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
- unsigned long addr, pfn_t pfn)
+ unsigned long addr, unsigned long pfn)
{
return __vm_insert_mixed(vma, addr, pfn, true);
}
@@ -2833,6 +2846,36 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
return error;
}
+#ifdef __HAVE_PFNMAP_TRACKING
+static inline struct pfnmap_track_ctx *pfnmap_track_ctx_alloc(unsigned long pfn,
+ unsigned long size, pgprot_t *prot)
+{
+ struct pfnmap_track_ctx *ctx;
+
+ if (pfnmap_track(pfn, size, prot))
+ return ERR_PTR(-EINVAL);
+
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ if (unlikely(!ctx)) {
+ pfnmap_untrack(pfn, size);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ctx->pfn = pfn;
+ ctx->size = size;
+ kref_init(&ctx->kref);
+ return ctx;
+}
+
+void pfnmap_track_ctx_release(struct kref *ref)
+{
+ struct pfnmap_track_ctx *ctx = container_of(ref, struct pfnmap_track_ctx, kref);
+
+ pfnmap_untrack(ctx->pfn, ctx->size);
+ kfree(ctx);
+}
+#endif /* __HAVE_PFNMAP_TRACKING */
+
/**
* remap_pfn_range - remap kernel memory to userspace
* @vma: user vma to map to
@@ -2845,20 +2888,51 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
*
* Return: %0 on success, negative error code otherwise.
*/
+#ifdef __HAVE_PFNMAP_TRACKING
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
+ struct pfnmap_track_ctx *ctx = NULL;
int err;
- err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
- if (err)
+ size = PAGE_ALIGN(size);
+
+ /*
+ * If we cover the full VMA, we'll perform actual tracking, and
+ * remember to untrack when the last reference to our tracking
+ * context from a VMA goes away. We'll keep tracking the whole pfn
+ * range even during VMA splits and partial unmapping.
+ *
+ * If we only cover parts of the VMA, we'll only set up the cachemode
+ * in the pgprot for the pfn range.
+ */
+ if (addr == vma->vm_start && addr + size == vma->vm_end) {
+ if (vma->pfnmap_track_ctx)
+ return -EINVAL;
+ ctx = pfnmap_track_ctx_alloc(pfn, size, &prot);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ } else if (pfnmap_setup_cachemode(pfn, size, &prot)) {
return -EINVAL;
+ }
err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
- if (err)
- untrack_pfn(vma, pfn, PAGE_ALIGN(size), true);
+ if (ctx) {
+ if (err)
+ kref_put(&ctx->kref, pfnmap_track_ctx_release);
+ else
+ vma->pfnmap_track_ctx = ctx;
+ }
return err;
}
+
+#else
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ return remap_pfn_range_notrack(vma, addr, pfn, size, prot);
+}
+#endif
EXPORT_SYMBOL(remap_pfn_range);
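	A usage sketch of the tracked path (hypothetical driver; demo_mmap, demo_base_pfn and DEMO_NR_PAGES are made-up names): mapping the entire VMA lets remap_pfn_range() attach a pfnmap_track_ctx, while a partial mapping would only adjust the cachemode in the pgprot.

		/* Hypothetical ->mmap handler covering the whole VMA. */
		static int demo_mmap(struct file *file, struct vm_area_struct *vma)
		{
			unsigned long size = vma->vm_end - vma->vm_start;

			if (size > DEMO_NR_PAGES * PAGE_SIZE)	/* illustrative bound */
				return -EINVAL;

			return remap_pfn_range(vma, vma->vm_start, demo_base_pfn,
					       size, vma->vm_page_prot);
		}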
/**
@@ -2938,11 +3012,11 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
if (fn) {
do {
if (create || !pte_none(ptep_get(pte))) {
- err = fn(pte++, addr, data);
+ err = fn(pte, addr, data);
if (err)
break;
}
- } while (addr += PAGE_SIZE, addr != end);
+ } while (pte++, addr += PAGE_SIZE, addr != end);
}
*mask |= PGTBL_PTE_MODIFIED;
@@ -3523,7 +3597,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
- entry = mk_pte(&new_folio->page, vma->vm_page_prot);
+ entry = folio_mk_pte(new_folio, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
if (unlikely(unshare)) {
if (pte_soft_dirty(vmf->orig_pte))
@@ -3730,12 +3804,10 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
* If all folio references are from mappings, and all mappings are in
* the page tables of this MM, then this folio is exclusive to this MM.
*/
- if (folio_test_large_maybe_mapped_shared(folio))
+ if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
return false;
VM_WARN_ON_ONCE(folio_test_ksm(folio));
- VM_WARN_ON_ONCE(folio_mapcount(folio) > folio_nr_pages(folio));
- VM_WARN_ON_ONCE(folio_entire_mapcount(folio));
if (unlikely(folio_test_swapcache(folio))) {
/*
@@ -3753,13 +3825,15 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
/* Stabilize the mapcount vs. refcount and recheck. */
folio_lock_large_mapcount(folio);
- VM_WARN_ON_ONCE(folio_large_mapcount(folio) < folio_ref_count(folio));
+ VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio);
- if (folio_test_large_maybe_mapped_shared(folio))
+ if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
goto unlock;
if (folio_large_mapcount(folio) != folio_ref_count(folio))
goto unlock;
+ VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio);
VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id &&
folio_mm_id(folio, 1) != vma->vm_mm->mm_id);
@@ -4224,26 +4298,6 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
-{
- struct swap_info_struct *si = swp_swap_info(entry);
- pgoff_t offset = swp_offset(entry);
- int i;
-
- /*
- * While allocating a large folio and doing swap_read_folio, which is
- * the case the being faulted pte doesn't have swapcache. We need to
- * ensure all PTEs have no cache as well, otherwise, we might go to
- * swap devices while the content is in swapcache.
- */
- for (i = 0; i < max_nr; i++) {
- if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
- return i;
- }
-
- return i;
-}
-
/*
* Check if the PTEs within a range are contiguous swap entries
* and have consistent swapcache, zeromap.
@@ -4579,8 +4633,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/*
* KSM sometimes has to copy on read faults, for example, if
- * page->index of !PageKSM() pages would be nonlinear inside the
- * anon VMA -- PageKSM() is lost on actual swapout.
+ * folio->index of non-ksm folios would be nonlinear inside the
+ * anon VMA -- the ksm flag is lost on actual swapout.
*/
folio = ksm_might_need_to_copy(folio, vma, vmf->address);
if (unlikely(!folio)) {
@@ -5013,7 +5067,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
*/
__folio_mark_uptodate(folio);
- entry = mk_pte(&folio->page, vma->vm_page_prot);
+ entry = folio_mk_pte(folio, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry), vma);
@@ -5138,9 +5192,8 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
vmf->prealloc_pte = NULL;
}
-vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page)
{
- struct folio *folio = page_folio(page);
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
@@ -5188,7 +5241,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
flush_icache_pages(vma, page, HPAGE_PMD_NR);
- entry = mk_huge_pmd(page, vma->vm_page_prot);
+ entry = folio_mk_pmd(folio, vma->vm_page_prot);
if (write)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -5213,7 +5266,7 @@ out:
return ret;
}
#else
-vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page)
{
return VM_FAULT_FALLBACK;
}
@@ -5245,6 +5298,8 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ else if (pte_write(entry) && folio_test_dirty(folio))
+ entry = pte_mkdirty(entry);
if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
entry = pte_mkuffd_wp(entry);
/* copy-on-write page */
@@ -5305,6 +5360,7 @@ fallback:
else
page = vmf->page;
+ folio = page_folio(page);
/*
* check even for read faults because we might have lost our CoWed
* page
@@ -5316,8 +5372,8 @@ fallback:
}
if (pmd_none(*vmf->pmd)) {
- if (PageTransCompound(page)) {
- ret = do_set_pmd(vmf, page);
+ if (folio_test_pmd_mappable(folio)) {
+ ret = do_set_pmd(vmf, folio, page);
if (ret != VM_FAULT_FALLBACK)
return ret;
}
@@ -5328,15 +5384,14 @@ fallback:
return VM_FAULT_OOM;
}
- folio = page_folio(page);
nr_pages = folio_nr_pages(folio);
/*
* Using per-page fault to maintain the uffd semantics, and same
- * approach also applies to non-anonymous-shmem faults to avoid
+ * approach also applies to non-shmem/tmpfs faults to avoid
* inflating the RSS of the process.
*/
- if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
+ if (!vma_is_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
unlikely(needs_fallback)) {
nr_pages = 1;
} else if (nr_pages > 1) {
@@ -5892,7 +5947,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
split:
/* COW or write-notify handled on pte level: split pmd. */
- __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false);
return VM_FAULT_FALLBACK;
}
@@ -6056,7 +6111,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
.gfp_mask = __get_fault_gfp_mask(vma),
};
struct mm_struct *mm = vma->vm_mm;
- unsigned long vm_flags = vma->vm_flags;
+ vm_flags_t vm_flags = vma->vm_flags;
pgd_t *pgd;
p4d_t *p4d;
vm_fault_t ret;
@@ -6080,7 +6135,7 @@ retry_pud:
pud_t orig_pud = *vmf.pud;
barrier();
- if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
+ if (pud_trans_huge(orig_pud)) {
/*
* TODO once we support anonymous PUDs: NUMA case and
@@ -6121,7 +6176,7 @@ retry_pud:
pmd_migration_entry_wait(mm, vmf.pmd);
return 0;
}
- if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
+ if (pmd_trans_huge(vmf.orig_pmd)) {
if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
return do_huge_pmd_numa_page(&vmf);
@@ -6338,258 +6393,6 @@ out:
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
-#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
-#include <linux/extable.h>
-
-static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
-{
- if (likely(mmap_read_trylock(mm)))
- return true;
-
- if (regs && !user_mode(regs)) {
- unsigned long ip = exception_ip(regs);
- if (!search_exception_tables(ip))
- return false;
- }
-
- return !mmap_read_lock_killable(mm);
-}
-
-static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
-{
- /*
- * We don't have this operation yet.
- *
- * It should be easy enough to do: it's basically a
- * atomic_long_try_cmpxchg_acquire()
- * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
- * it also needs the proper lockdep magic etc.
- */
- return false;
-}
-
-static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
-{
- mmap_read_unlock(mm);
- if (regs && !user_mode(regs)) {
- unsigned long ip = exception_ip(regs);
- if (!search_exception_tables(ip))
- return false;
- }
- return !mmap_write_lock_killable(mm);
-}
-
-/*
- * Helper for page fault handling.
- *
- * This is kind of equivalent to "mmap_read_lock()" followed
- * by "find_extend_vma()", except it's a lot more careful about
- * the locking (and will drop the lock on failure).
- *
- * For example, if we have a kernel bug that causes a page
- * fault, we don't want to just use mmap_read_lock() to get
- * the mm lock, because that would deadlock if the bug were
- * to happen while we're holding the mm lock for writing.
- *
- * So this checks the exception tables on kernel faults in
- * order to only do this all for instructions that are actually
- * expected to fault.
- *
- * We can also actually take the mm lock for writing if we
- * need to extend the vma, which helps the VM layer a lot.
- */
-struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
- unsigned long addr, struct pt_regs *regs)
-{
- struct vm_area_struct *vma;
-
- if (!get_mmap_lock_carefully(mm, regs))
- return NULL;
-
- vma = find_vma(mm, addr);
- if (likely(vma && (vma->vm_start <= addr)))
- return vma;
-
- /*
- * Well, dang. We might still be successful, but only
- * if we can extend a vma to do so.
- */
- if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
- mmap_read_unlock(mm);
- return NULL;
- }
-
- /*
- * We can try to upgrade the mmap lock atomically,
- * in which case we can continue to use the vma
- * we already looked up.
- *
- * Otherwise we'll have to drop the mmap lock and
- * re-take it, and also look up the vma again,
- * re-checking it.
- */
- if (!mmap_upgrade_trylock(mm)) {
- if (!upgrade_mmap_lock_carefully(mm, regs))
- return NULL;
-
- vma = find_vma(mm, addr);
- if (!vma)
- goto fail;
- if (vma->vm_start <= addr)
- goto success;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto fail;
- }
-
- if (expand_stack_locked(vma, addr))
- goto fail;
-
-success:
- mmap_write_downgrade(mm);
- return vma;
-
-fail:
- mmap_write_unlock(mm);
- return NULL;
-}
-#endif
-
-#ifdef CONFIG_PER_VMA_LOCK
-static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
-{
- unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
-
- /* Additional refcnt if the vma is attached. */
- if (!detaching)
- tgt_refcnt++;
-
- /*
- * If vma is detached then only vma_mark_attached() can raise the
- * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
- */
- if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
- return false;
-
- rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
- rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
- refcount_read(&vma->vm_refcnt) == tgt_refcnt,
- TASK_UNINTERRUPTIBLE);
- lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
-
- return true;
-}
-
-static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
-{
- *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
- rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
-}
-
-void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
-{
- bool locked;
-
- /*
- * __vma_enter_locked() returns false immediately if the vma is not
- * attached, otherwise it waits until refcnt is indicating that vma
- * is attached with no readers.
- */
- locked = __vma_enter_locked(vma, false);
-
- /*
- * We should use WRITE_ONCE() here because we can have concurrent reads
- * from the early lockless pessimistic check in vma_start_read().
- * We don't really care about the correctness of that early check, but
- * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
- */
- WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
-
- if (locked) {
- bool detached;
-
- __vma_exit_locked(vma, &detached);
- WARN_ON_ONCE(detached); /* vma should remain attached */
- }
-}
-EXPORT_SYMBOL_GPL(__vma_start_write);
-
-void vma_mark_detached(struct vm_area_struct *vma)
-{
- vma_assert_write_locked(vma);
- vma_assert_attached(vma);
-
- /*
- * We are the only writer, so no need to use vma_refcount_put().
- * The condition below is unlikely because the vma has been already
- * write-locked and readers can increment vm_refcnt only temporarily
- * before they check vm_lock_seq, realize the vma is locked and drop
- * back the vm_refcnt. That is a narrow window for observing a raised
- * vm_refcnt.
- */
- if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
- /* Wait until vma is detached with no readers. */
- if (__vma_enter_locked(vma, true)) {
- bool detached;
-
- __vma_exit_locked(vma, &detached);
- WARN_ON_ONCE(!detached);
- }
- }
-}
-
-/*
- * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
- * stable and not isolated. If the VMA is not found or is being modified the
- * function returns NULL.
- */
-struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
- unsigned long address)
-{
- MA_STATE(mas, &mm->mm_mt, address, address);
- struct vm_area_struct *vma;
-
- rcu_read_lock();
-retry:
- vma = mas_walk(&mas);
- if (!vma)
- goto inval;
-
- vma = vma_start_read(mm, vma);
- if (IS_ERR_OR_NULL(vma)) {
- /* Check if the VMA got isolated after we found it */
- if (PTR_ERR(vma) == -EAGAIN) {
- count_vm_vma_lock_event(VMA_LOCK_MISS);
- /* The area was replaced with another one */
- goto retry;
- }
-
- /* Failed to lock the VMA */
- goto inval;
- }
- /*
- * At this point, we have a stable reference to a VMA: The VMA is
- * locked and we know it hasn't already been isolated.
- * From here on, we can access the VMA without worrying about which
- * fields are accessible for RCU readers.
- */
-
- /* Check if the vma we locked is the right one. */
- if (unlikely(vma->vm_mm != mm ||
- address < vma->vm_start || address >= vma->vm_end))
- goto inval_end_read;
-
- rcu_read_unlock();
- return vma;
-
-inval_end_read:
- vma_end_read(vma);
-inval:
- rcu_read_unlock();
- count_vm_vma_lock_event(VMA_LOCK_ABORT);
- return NULL;
-}
-#endif /* CONFIG_PER_VMA_LOCK */
-
#ifndef __PAGETABLE_P4D_FOLDED
/*
* Allocate p4d page table.
@@ -6900,6 +6703,7 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
while (len) {
int bytes, offset;
void *maddr;
+ struct folio *folio;
struct vm_area_struct *vma = NULL;
struct page *page = get_user_page_vma_remote(mm, addr,
gup_flags, &vma);
@@ -6931,21 +6735,22 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
if (bytes <= 0)
break;
} else {
+ folio = page_folio(page);
bytes = len;
offset = addr & (PAGE_SIZE-1);
if (bytes > PAGE_SIZE-offset)
bytes = PAGE_SIZE-offset;
- maddr = kmap_local_page(page);
+ maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE);
if (write) {
copy_to_user_page(vma, page, addr,
maddr + offset, buf, bytes);
- set_page_dirty_lock(page);
+ folio_mark_dirty_lock(folio);
} else {
copy_from_user_page(vma, page, addr,
buf, maddr + offset, bytes);
}
- unmap_and_put_page(page, maddr);
+ folio_release_kmap(folio, maddr);
}
len -= bytes;
buf += bytes;
@@ -7024,6 +6829,7 @@ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
while (len) {
int bytes, offset, retval;
void *maddr;
+ struct folio *folio;
struct page *page;
struct vm_area_struct *vma = NULL;
@@ -7039,17 +6845,18 @@ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
goto out;
}
+ folio = page_folio(page);
bytes = len;
offset = addr & (PAGE_SIZE - 1);
if (bytes > PAGE_SIZE - offset)
bytes = PAGE_SIZE - offset;
- maddr = kmap_local_page(page);
+ maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE);
retval = strscpy(buf, maddr + offset, bytes);
if (retval >= 0) {
/* Found the end of the string */
buf += retval;
- unmap_and_put_page(page, maddr);
+ folio_release_kmap(folio, maddr);
break;
}
@@ -7067,7 +6874,7 @@ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
}
len -= bytes;
- unmap_and_put_page(page, maddr);
+ folio_release_kmap(folio, maddr);
}
out:
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 75401866fb76..1f15af712bc3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -35,6 +35,7 @@
#include <linux/compaction.h>
#include <linux/rmap.h>
#include <linux/module.h>
+#include <linux/node.h>
#include <asm/tlbflush.h>
@@ -699,30 +700,6 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
online_mem_sections(start_pfn, end_pfn);
}
-/* check which state of node_states will be changed when online memory */
-static void node_states_check_changes_online(unsigned long nr_pages,
- struct zone *zone, struct memory_notify *arg)
-{
- int nid = zone_to_nid(zone);
-
- arg->status_change_nid = NUMA_NO_NODE;
- arg->status_change_nid_normal = NUMA_NO_NODE;
-
- if (!node_state(nid, N_MEMORY))
- arg->status_change_nid = nid;
- if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
- arg->status_change_nid_normal = nid;
-}
-
-static void node_states_set_node(int node, struct memory_notify *arg)
-{
- if (arg->status_change_nid_normal >= 0)
- node_set_state(node, N_NORMAL_MEMORY);
-
- if (arg->status_change_nid >= 0)
- node_set_state(node, N_MEMORY);
-}
-
static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages)
{
@@ -770,7 +747,8 @@ static inline void section_taint_zone_device(unsigned long pfn)
*/
void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages,
- struct vmem_altmap *altmap, int migratetype)
+ struct vmem_altmap *altmap, int migratetype,
+ bool isolate_pageblock)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nid = pgdat->node_id;
@@ -797,12 +775,13 @@ void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
/*
* TODO now we have a visible range of pages which are not associated
- * with their zone properly. Not nice but set_pfnblock_flags_mask
+ * with their zone properly. Not nice but set_pfnblock_migratetype()
* expects the zone spans the pfn range. All the pages in the range
* are reserved so nobody should be touching them so we should be safe
*/
memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0,
- MEMINIT_HOTPLUG, altmap, migratetype);
+ MEMINIT_HOTPLUG, altmap, migratetype,
+ isolate_pageblock);
set_zone_contiguous(zone);
}
@@ -1127,7 +1106,8 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
if (mhp_off_inaccessible)
page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages);
- move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE,
+ false);
for (i = 0; i < nr_pages; i++) {
struct page *page = pfn_to_page(pfn + i);
@@ -1173,11 +1153,17 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
int online_pages(unsigned long pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group)
{
- unsigned long flags;
- int need_zonelists_rebuild = 0;
+ struct memory_notify mem_arg = {
+ .start_pfn = pfn,
+ .nr_pages = nr_pages,
+ };
+ struct node_notify node_arg = {
+ .nid = NUMA_NO_NODE,
+ };
const int nid = zone_to_nid(zone);
+ int need_zonelists_rebuild = 0;
+ unsigned long flags;
int ret;
- struct memory_notify arg;
/*
* {on,off}lining is constrained to full memory sections (or more
@@ -1192,13 +1178,19 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
/* associate pfn range with the zone */
- move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
-
- arg.start_pfn = pfn;
- arg.nr_pages = nr_pages;
- node_states_check_changes_online(nr_pages, zone, &arg);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE,
+ true);
+
+ if (!node_state(nid, N_MEMORY)) {
+ /* Adding memory to the node for the first time */
+ node_arg.nid = nid;
+ ret = node_notify(NODE_ADDING_FIRST_MEMORY, &node_arg);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto failed_addition;
+ }
- ret = memory_notify(MEM_GOING_ONLINE, &arg);
+ ret = memory_notify(MEM_GOING_ONLINE, &mem_arg);
ret = notifier_to_errno(ret);
if (ret)
goto failed_addition;
@@ -1224,12 +1216,13 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
online_pages_range(pfn, nr_pages);
adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
- node_states_set_node(nid, &arg);
+ if (node_arg.nid >= 0)
+ node_set_state(nid, N_MEMORY);
if (need_zonelists_rebuild)
build_all_zonelists(NULL);
/* Basic onlining is complete, allow allocation of onlined pages. */
- undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
+ undo_isolate_page_range(pfn, pfn + nr_pages);
/*
* Freshly onlined pages aren't shuffled (e.g., all pages are placed to
@@ -1245,16 +1238,22 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
kswapd_run(nid);
kcompactd_run(nid);
+ if (node_arg.nid >= 0)
+ /* First memory added successfully. Notify consumers. */
+ node_notify(NODE_ADDED_FIRST_MEMORY, &node_arg);
+
writeback_set_ratelimit();
- memory_notify(MEM_ONLINE, &arg);
+ memory_notify(MEM_ONLINE, &mem_arg);
return 0;
failed_addition:
pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
(unsigned long long) pfn << PAGE_SHIFT,
(((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
- memory_notify(MEM_CANCEL_ONLINE, &arg);
+ memory_notify(MEM_CANCEL_ONLINE, &mem_arg);
+ if (node_arg.nid != NUMA_NO_NODE)
+ node_notify(NODE_CANCEL_ADDING_FIRST_MEMORY, &node_arg);
remove_pfn_range_from_zone(zone, pfn, nr_pages);
return ret;
}
@@ -1571,13 +1570,12 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
* We online node here. We can't roll back from here.
*/
node_set_online(nid);
- ret = __register_one_node(nid);
+ ret = register_one_node(nid);
BUG_ON(ret);
}
- register_memory_blocks_under_node(nid, PFN_DOWN(start),
- PFN_UP(start + size - 1),
- MEMINIT_HOTPLUG);
+ register_memory_blocks_under_node_hotplug(nid, PFN_DOWN(start),
+ PFN_UP(start + size - 1));
/* create new memmap entry */
if (!strcmp(res->name, "System RAM"))
@@ -1741,8 +1739,8 @@ bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
- * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
- * non-lru movable pages and hugepages). Will skip over most unmovable
+ * Scan pfn range [start,end) to find movable/migratable pages (LRU and
+ * hugetlb folios, movable_ops pages). Will skip over most unmovable
* pages (esp., pages that can be skipped when offlining), but bail out on
* definitely unmovable pages.
*
@@ -1756,20 +1754,16 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
{
unsigned long pfn;
- for (pfn = start; pfn < end; pfn++) {
+ for_each_valid_pfn(pfn, start, end) {
struct page *page;
struct folio *folio;
- if (!pfn_valid(pfn))
- continue;
page = pfn_to_page(pfn);
- if (PageLRU(page))
- goto found;
- if (__PageMovable(page))
+ if (PageLRU(page) || page_has_movable_ops(page))
goto found;
/*
- * PageOffline() pages that are not marked __PageMovable() and
+ * PageOffline() pages that do not have movable_ops and
* have a reference count > 0 (after MEM_GOING_OFFLINE) are
* definitely unmovable. If their reference count would be 0,
* they could at least be skipped when offlining memory.
@@ -1805,29 +1799,21 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ for_each_valid_pfn(pfn, start_pfn, end_pfn) {
struct page *page;
- if (!pfn_valid(pfn))
- continue;
page = pfn_to_page(pfn);
folio = page_folio(page);
- /*
- * No reference or lock is held on the folio, so it might
- * be modified concurrently (e.g. split). As such,
- * folio_nr_pages() may read garbage. This is fine as the outer
- * loop will revisit the split folio later.
- */
- if (folio_test_large(folio))
- pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
-
if (!folio_try_get(folio))
continue;
if (unlikely(page_folio(page) != folio))
goto put_folio;
+ if (folio_test_large(folio))
+ pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
+
if (folio_contain_hwpoisoned_page(folio)) {
if (WARN_ON(folio_test_lru(folio)))
folio_isolate_lru(folio);
@@ -1896,54 +1882,6 @@ static int __init cmdline_parse_movable_node(char *p)
}
early_param("movable_node", cmdline_parse_movable_node);
-/* check which state of node_states will be changed when offline memory */
-static void node_states_check_changes_offline(unsigned long nr_pages,
- struct zone *zone, struct memory_notify *arg)
-{
- struct pglist_data *pgdat = zone->zone_pgdat;
- unsigned long present_pages = 0;
- enum zone_type zt;
-
- arg->status_change_nid = NUMA_NO_NODE;
- arg->status_change_nid_normal = NUMA_NO_NODE;
-
- /*
- * Check whether node_states[N_NORMAL_MEMORY] will be changed.
- * If the memory to be offline is within the range
- * [0..ZONE_NORMAL], and it is the last present memory there,
- * the zones in that range will become empty after the offlining,
- * thus we can determine that we need to clear the node from
- * node_states[N_NORMAL_MEMORY].
- */
- for (zt = 0; zt <= ZONE_NORMAL; zt++)
- present_pages += pgdat->node_zones[zt].present_pages;
- if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
- arg->status_change_nid_normal = zone_to_nid(zone);
-
- /*
- * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM
- * does not apply as we don't support 32bit.
- * Here we count the possible pages from ZONE_MOVABLE.
- * If after having accounted all the pages, we see that the nr_pages
- * to be offlined is over or equal to the accounted pages,
- * we know that the node will become empty, and so, we can clear
- * it for N_MEMORY as well.
- */
- present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
-
- if (nr_pages >= present_pages)
- arg->status_change_nid = zone_to_nid(zone);
-}
-
-static void node_states_clear_node(int node, struct memory_notify *arg)
-{
- if (arg->status_change_nid_normal >= 0)
- node_clear_state(node, N_NORMAL_MEMORY);
-
- if (arg->status_change_nid >= 0)
- node_clear_state(node, N_MEMORY);
-}
-
static int count_system_ram_pages_cb(unsigned long start_pfn,
unsigned long nr_pages, void *data)
{
@@ -1959,11 +1897,18 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group)
{
- const unsigned long end_pfn = start_pfn + nr_pages;
unsigned long pfn, managed_pages, system_ram_pages = 0;
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ struct pglist_data *pgdat = zone->zone_pgdat;
const int node = zone_to_nid(zone);
+ struct memory_notify mem_arg = {
+ .start_pfn = start_pfn,
+ .nr_pages = nr_pages,
+ };
+ struct node_notify node_arg = {
+ .nid = NUMA_NO_NODE,
+ };
unsigned long flags;
- struct memory_notify arg;
char *reason;
int ret;
@@ -2015,18 +1960,28 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
- MIGRATE_MOVABLE,
- MEMORY_OFFLINE | REPORT_FAILURE);
+ PB_ISOLATE_MODE_MEM_OFFLINE);
if (ret) {
reason = "failure to isolate range";
goto failed_removal_pcplists_disabled;
}
- arg.start_pfn = start_pfn;
- arg.nr_pages = nr_pages;
- node_states_check_changes_offline(nr_pages, zone, &arg);
+ /*
+ * Check whether the node will have no present pages after we offline
+ * 'nr_pages' more. If so, we know that the node will become empty, and
+ * so we will clear N_MEMORY for it.
+ */
+ if (nr_pages >= pgdat->node_present_pages) {
+ node_arg.nid = node;
+ ret = node_notify(NODE_REMOVING_LAST_MEMORY, &node_arg);
+ ret = notifier_to_errno(ret);
+ if (ret) {
+ reason = "node notifier failure";
+ goto failed_removal_isolated;
+ }
+ }
- ret = memory_notify(MEM_GOING_OFFLINE, &arg);
+ ret = memory_notify(MEM_GOING_OFFLINE, &mem_arg);
ret = notifier_to_errno(ret);
if (ret) {
reason = "notifier failure";
@@ -2075,7 +2030,8 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
goto failed_removal_isolated;
}
- ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
+ ret = test_pages_isolated(start_pfn, end_pfn,
+ PB_ISOLATE_MODE_MEM_OFFLINE);
} while (ret);
@@ -2106,27 +2062,32 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
* Make sure to mark the node as memory-less before rebuilding the zone
* list. Otherwise this node would still appear in the fallback lists.
*/
- node_states_clear_node(node, &arg);
+ if (node_arg.nid >= 0)
+ node_clear_state(node, N_MEMORY);
if (!populated_zone(zone)) {
zone_pcp_reset(zone);
build_all_zonelists(NULL);
}
- if (arg.status_change_nid >= 0) {
+ if (node_arg.nid >= 0) {
kcompactd_stop(node);
kswapd_stop(node);
+ /* Node went memoryless. Notify consumers */
+ node_notify(NODE_REMOVED_LAST_MEMORY, &node_arg);
}
writeback_set_ratelimit();
- memory_notify(MEM_OFFLINE, &arg);
+ memory_notify(MEM_OFFLINE, &mem_arg);
remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
return 0;
failed_removal_isolated:
/* pushback to free area */
- undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
- memory_notify(MEM_CANCEL_OFFLINE, &arg);
+ undo_isolate_page_range(start_pfn, end_pfn);
+ memory_notify(MEM_CANCEL_OFFLINE, &mem_arg);
+ if (node_arg.nid != NUMA_NO_NODE)
+ node_notify(NODE_CANCEL_REMOVING_LAST_MEMORY, &node_arg);
failed_removal_pcplists_disabled:
lru_cache_enable();
zone_pcp_enable(zone);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b28a1e6ae096..eb83cff7db8c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -109,10 +109,12 @@
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>
+#include <linux/gcd.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>
+#include <linux/memory.h>
#include "internal.h"
@@ -139,31 +141,138 @@ static struct mempolicy default_policy = {
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
/*
- * iw_table is the sysfs-set interleave weight table, a value of 0 denotes
- * system-default value should be used. A NULL iw_table also denotes that
- * system-default values should be used. Until the system-default table
- * is implemented, the system-default is always 1.
- *
- * iw_table is RCU protected
+ * weightiness balances the tradeoff between small weights (cycles through nodes
+ * faster, more fair/even distribution) and large weights (smaller errors
+ * between actual bandwidth ratios and weight ratios). 32 has been found to
+ * strike a reasonable compromise between the two goals.
+ */
+static const int weightiness = 32;
+
+/*
+ * A null weighted_interleave_state is interpreted as having .mode="auto",
+ * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
*/
-static u8 __rcu *iw_table;
-static DEFINE_MUTEX(iw_table_lock);
+struct weighted_interleave_state {
+ bool mode_auto;
+ u8 iw_table[];
+};
+static struct weighted_interleave_state __rcu *wi_state;
+static unsigned int *node_bw_table;
+
+/*
+ * wi_state_lock protects both wi_state and node_bw_table.
+ * node_bw_table is only used by writers to update wi_state.
+ */
+static DEFINE_MUTEX(wi_state_lock);
static u8 get_il_weight(int node)
{
- u8 *table;
- u8 weight;
+ struct weighted_interleave_state *state;
+ u8 weight = 1;
rcu_read_lock();
- table = rcu_dereference(iw_table);
- /* if no iw_table, use system default */
- weight = table ? table[node] : 1;
- /* if value in iw_table is 0, use system default */
- weight = weight ? weight : 1;
+ state = rcu_dereference(wi_state);
+ if (state)
+ weight = state->iw_table[node];
rcu_read_unlock();
return weight;
}
+/*
+ * Convert bandwidth values into weighted interleave weights.
+ * Call with wi_state_lock.
+ */
+static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
+{
+ u64 sum_bw = 0;
+ unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY)
+ sum_bw += bw[nid];
+
+ /* Scale bandwidths to whole numbers in the range [1, weightiness] */
+ for_each_node_state(nid, N_MEMORY) {
+ /*
+ * Try not to perform 64-bit division.
+ * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
+ * If sum_bw > scaling_factor, then round the weight up to 1.
+ */
+ scaling_factor = weightiness * bw[nid];
+ if (bw[nid] && sum_bw < scaling_factor) {
+ cast_sum_bw = (unsigned int)sum_bw;
+ new_iw[nid] = scaling_factor / cast_sum_bw;
+ } else {
+ new_iw[nid] = 1;
+ }
+ if (!iw_gcd)
+ iw_gcd = new_iw[nid];
+ iw_gcd = gcd(iw_gcd, new_iw[nid]);
+ }
+
+ /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
+ for_each_node_state(nid, N_MEMORY)
+ new_iw[nid] /= iw_gcd;
+}
+
+int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
+{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+ unsigned int *old_bw, *new_bw;
+ unsigned int bw_val;
+ int i;
+
+ bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
+ new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
+ if (!new_bw)
+ return -ENOMEM;
+
+ new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state) {
+ kfree(new_bw);
+ return -ENOMEM;
+ }
+ new_wi_state->mode_auto = true;
+ for (i = 0; i < nr_node_ids; i++)
+ new_wi_state->iw_table[i] = 1;
+
+ /*
+ * Update bandwidth info, even in manual mode. That way, when switching
+ * to auto mode in the future, iw_table can be overwritten using
+ * accurate bw data.
+ */
+ mutex_lock(&wi_state_lock);
+
+ old_bw = node_bw_table;
+ if (old_bw)
+ memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
+ new_bw[node] = bw_val;
+ node_bw_table = new_bw;
+
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (old_wi_state && !old_wi_state->mode_auto) {
+ /* Manual mode; skip reducing weights and updating wi_state */
+ mutex_unlock(&wi_state_lock);
+ kfree(new_wi_state);
+ goto out;
+ }
+
+ /* NULL wi_state assumes auto=true; reduce weights and update wi_state */
+ reduce_interleave_weights(new_bw, new_wi_state->iw_table);
+ rcu_assign_pointer(wi_state, new_wi_state);
+
+ mutex_unlock(&wi_state_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+out:
+ kfree(old_bw);
+ return 0;
+}
+
/**
* numa_nearest_node - Find nearest node by state
* @node: Node id to start the search
@@ -573,6 +682,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
pte_t *pte, *mapped_pte;
pte_t ptent;
spinlock_t *ptl;
+ int max_nr, nr;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -586,7 +696,9 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
walk->action = ACTION_AGAIN;
return 0;
}
- for (; addr != end; pte++, addr += PAGE_SIZE) {
+ for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
+ max_nr = (end - addr) >> PAGE_SHIFT;
+ nr = 1;
ptent = ptep_get(pte);
if (pte_none(ptent))
continue;
@@ -598,6 +710,8 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
folio = vm_normal_folio(vma, addr, ptent);
if (!folio || folio_is_zone_device(folio))
continue;
+ if (folio_test_large(folio) && max_nr != 1)
+ nr = folio_pte_batch(folio, pte, ptent, max_nr);
/*
* vm_normal_folio() filters out zero pages, but there might
* still be reserved folios to skip, perhaps in a VDSO.
@@ -630,7 +744,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
!vma_migratable(vma) ||
!migrate_folio_add(folio, qp->pagelist, flags)) {
- qp->nr_failed++;
+ qp->nr_failed += nr;
if (strictly_unmovable(flags))
break;
}
@@ -2014,26 +2128,28 @@ static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
+ struct weighted_interleave_state *state;
nodemask_t nodemask;
unsigned int target, nr_nodes;
- u8 *table;
+ u8 *table = NULL;
unsigned int weight_total = 0;
u8 weight;
- int nid;
+ int nid = 0;
nr_nodes = read_once_policy_nodemask(pol, &nodemask);
if (!nr_nodes)
return numa_node_id();
rcu_read_lock();
- table = rcu_dereference(iw_table);
+
+ state = rcu_dereference(wi_state);
+ /* Uninitialized wi_state means we should assume all weights are 1 */
+ if (state)
+ table = state->iw_table;
+
/* calculate the total weight */
- for_each_node_mask(nid, nodemask) {
- /* detect system default usage */
- weight = table ? table[nid] : 1;
- weight = weight ? weight : 1;
- weight_total += weight;
- }
+ for_each_node_mask(nid, nodemask)
+ weight_total += table ? table[nid] : 1;
/* Calculate the node offset based on totals */
target = ilx % weight_total;
@@ -2041,7 +2157,6 @@ static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
while (target) {
/* detect system default usage */
weight = table ? table[nid] : 1;
- weight = weight ? weight : 1;
if (target < weight)
break;
target -= weight;
@@ -2442,13 +2557,14 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
+ struct weighted_interleave_state *state;
struct task_struct *me = current;
unsigned int cpuset_mems_cookie;
unsigned long total_allocated = 0;
unsigned long nr_allocated = 0;
unsigned long rounds;
unsigned long node_pages, delta;
- u8 *table, *weights, weight;
+ u8 *weights, weight;
unsigned int weight_total = 0;
unsigned long rem_pages = nr_pages;
nodemask_t nodes;
@@ -2498,17 +2614,19 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
return total_allocated;
rcu_read_lock();
- table = rcu_dereference(iw_table);
- if (table)
- memcpy(weights, table, nr_node_ids);
- rcu_read_unlock();
+ state = rcu_dereference(wi_state);
+ if (state) {
+ memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
+ rcu_read_unlock();
+ } else {
+ rcu_read_unlock();
+ for (i = 0; i < nr_node_ids; i++)
+ weights[i] = 1;
+ }
/* calculate total, detect system default usage */
- for_each_node_mask(node, nodes) {
- if (!weights[node])
- weights[node] = 1;
+ for_each_node_mask(node, nodes)
weight_total += weights[node];
- }
/*
* Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
@@ -3419,6 +3537,14 @@ struct iw_node_attr {
int nid;
};
+struct sysfs_wi_group {
+ struct kobject wi_kobj;
+ struct mutex kobj_lock;
+ struct iw_node_attr *nattrs[];
+};
+
+static struct sysfs_wi_group *wi_group;
+
static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
@@ -3433,177 +3559,310 @@ static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
struct iw_node_attr *node_attr;
- u8 *new;
- u8 *old;
u8 weight = 0;
+ int i;
node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
- if (count == 0 || sysfs_streq(buf, ""))
- weight = 0;
- else if (kstrtou8(buf, 0, &weight))
+ if (count == 0 || sysfs_streq(buf, "") ||
+ kstrtou8(buf, 0, &weight) || weight == 0)
return -EINVAL;
- new = kzalloc(nr_node_ids, GFP_KERNEL);
- if (!new)
+ new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state)
return -ENOMEM;
- mutex_lock(&iw_table_lock);
- old = rcu_dereference_protected(iw_table,
- lockdep_is_held(&iw_table_lock));
- if (old)
- memcpy(new, old, nr_node_ids);
- new[node_attr->nid] = weight;
- rcu_assign_pointer(iw_table, new);
- mutex_unlock(&iw_table_lock);
- synchronize_rcu();
- kfree(old);
+ mutex_lock(&wi_state_lock);
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (old_wi_state) {
+ memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+ nr_node_ids * sizeof(u8));
+ } else {
+ for (i = 0; i < nr_node_ids; i++)
+ new_wi_state->iw_table[i] = 1;
+ }
+ new_wi_state->iw_table[node_attr->nid] = weight;
+ new_wi_state->mode_auto = false;
+
+ rcu_assign_pointer(wi_state, new_wi_state);
+ mutex_unlock(&wi_state_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
return count;
}
-static struct iw_node_attr **node_attrs;
-
-static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
- struct kobject *parent)
+static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
{
- if (!node_attr)
- return;
- sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
- kfree(node_attr->kobj_attr.attr.name);
- kfree(node_attr);
+ struct weighted_interleave_state *state;
+ bool wi_auto = true;
+
+ rcu_read_lock();
+ state = rcu_dereference(wi_state);
+ if (state)
+ wi_auto = state->mode_auto;
+ rcu_read_unlock();
+
+ return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
}
-static void sysfs_wi_release(struct kobject *wi_kobj)
+static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+ unsigned int *bw;
+ bool input;
int i;
+ if (kstrtobool(buf, &input))
+ return -EINVAL;
+
+ new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state)
+ return -ENOMEM;
for (i = 0; i < nr_node_ids; i++)
- sysfs_wi_node_release(node_attrs[i], wi_kobj);
- kobject_put(wi_kobj);
+ new_wi_state->iw_table[i] = 1;
+
+ mutex_lock(&wi_state_lock);
+ if (!input) {
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (!old_wi_state)
+ goto update_wi_state;
+ if (input == old_wi_state->mode_auto) {
+ mutex_unlock(&wi_state_lock);
+ return count;
+ }
+
+ memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+ nr_node_ids * sizeof(u8));
+ goto update_wi_state;
+ }
+
+ bw = node_bw_table;
+ if (!bw) {
+ mutex_unlock(&wi_state_lock);
+ kfree(new_wi_state);
+ return -ENODEV;
+ }
+
+ new_wi_state->mode_auto = true;
+ reduce_interleave_weights(bw, new_wi_state->iw_table);
+
+update_wi_state:
+ rcu_assign_pointer(wi_state, new_wi_state);
+ mutex_unlock(&wi_state_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+ return count;
+}
+
+static void sysfs_wi_node_delete(int nid)
+{
+ struct iw_node_attr *attr;
+
+ if (nid < 0 || nid >= nr_node_ids)
+ return;
+
+ mutex_lock(&wi_group->kobj_lock);
+ attr = wi_group->nattrs[nid];
+ if (!attr) {
+ mutex_unlock(&wi_group->kobj_lock);
+ return;
+ }
+
+ wi_group->nattrs[nid] = NULL;
+ mutex_unlock(&wi_group->kobj_lock);
+
+ sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
+ kfree(attr->kobj_attr.attr.name);
+ kfree(attr);
+}
+
+static void sysfs_wi_node_delete_all(void)
+{
+ int nid;
+
+ for (nid = 0; nid < nr_node_ids; nid++)
+ sysfs_wi_node_delete(nid);
+}
+
+static void wi_state_free(void)
+{
+ struct weighted_interleave_state *old_wi_state;
+
+ mutex_lock(&wi_state_lock);
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ rcu_assign_pointer(wi_state, NULL);
+ mutex_unlock(&wi_state_lock);
+
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+}
+
+static struct kobj_attribute wi_auto_attr =
+ __ATTR(auto, 0664, weighted_interleave_auto_show,
+ weighted_interleave_auto_store);
+
+static void wi_cleanup(void)
+{
+ sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
+ sysfs_wi_node_delete_all();
+ wi_state_free();
+}
+
+static void wi_kobj_release(struct kobject *wi_kobj)
+{
+ kfree(wi_group);
}
static const struct kobj_type wi_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
- .release = sysfs_wi_release,
+ .release = wi_kobj_release,
};
-static int add_weight_node(int nid, struct kobject *wi_kobj)
+static int sysfs_wi_node_add(int nid)
{
- struct iw_node_attr *node_attr;
+ int ret;
char *name;
+ struct iw_node_attr *new_attr;
- node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
- if (!node_attr)
+ if (nid < 0 || nid >= nr_node_ids) {
+ pr_err("invalid node id: %d\n", nid);
+ return -EINVAL;
+ }
+
+ new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
+ if (!new_attr)
return -ENOMEM;
name = kasprintf(GFP_KERNEL, "node%d", nid);
if (!name) {
- kfree(node_attr);
+ kfree(new_attr);
return -ENOMEM;
}
- sysfs_attr_init(&node_attr->kobj_attr.attr);
- node_attr->kobj_attr.attr.name = name;
- node_attr->kobj_attr.attr.mode = 0644;
- node_attr->kobj_attr.show = node_show;
- node_attr->kobj_attr.store = node_store;
- node_attr->nid = nid;
+ sysfs_attr_init(&new_attr->kobj_attr.attr);
+ new_attr->kobj_attr.attr.name = name;
+ new_attr->kobj_attr.attr.mode = 0644;
+ new_attr->kobj_attr.show = node_show;
+ new_attr->kobj_attr.store = node_store;
+ new_attr->nid = nid;
- if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
- kfree(node_attr->kobj_attr.attr.name);
- kfree(node_attr);
- pr_err("failed to add attribute to weighted_interleave\n");
- return -ENOMEM;
+ mutex_lock(&wi_group->kobj_lock);
+ if (wi_group->nattrs[nid]) {
+ mutex_unlock(&wi_group->kobj_lock);
+ ret = -EEXIST;
+ goto out;
}
- node_attrs[nid] = node_attr;
+ ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
+ if (ret) {
+ mutex_unlock(&wi_group->kobj_lock);
+ goto out;
+ }
+ wi_group->nattrs[nid] = new_attr;
+ mutex_unlock(&wi_group->kobj_lock);
return 0;
+
+out:
+ kfree(new_attr->kobj_attr.attr.name);
+ kfree(new_attr);
+ return ret;
+}
+
+static int wi_node_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ int err;
+ struct node_notify *nn = data;
+ int nid = nn->nid;
+
+ switch (action) {
+ case NODE_ADDED_FIRST_MEMORY:
+ err = sysfs_wi_node_add(nid);
+ if (err)
+ pr_err("failed to add sysfs for node%d during hotplug: %d\n",
+ nid, err);
+ break;
+ case NODE_REMOVED_LAST_MEMORY:
+ sysfs_wi_node_delete(nid);
+ break;
+ }
+
+ return NOTIFY_OK;
}
-static int add_weighted_interleave_group(struct kobject *root_kobj)
+static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
{
- struct kobject *wi_kobj;
int nid, err;
- wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
- if (!wi_kobj)
+ wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids),
+ GFP_KERNEL);
+ if (!wi_group)
return -ENOMEM;
+ mutex_init(&wi_group->kobj_lock);
- err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
+ err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
"weighted_interleave");
- if (err) {
- kfree(wi_kobj);
- return err;
- }
+ if (err)
+ goto err_put_kobj;
- for_each_node_state(nid, N_POSSIBLE) {
- err = add_weight_node(nid, wi_kobj);
+ err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
+ if (err)
+ goto err_put_kobj;
+
+ for_each_online_node(nid) {
+ if (!node_state(nid, N_MEMORY))
+ continue;
+
+ err = sysfs_wi_node_add(nid);
if (err) {
- pr_err("failed to add sysfs [node%d]\n", nid);
- break;
+ pr_err("failed to add sysfs for node%d during init: %d\n",
+ nid, err);
+ goto err_cleanup_kobj;
}
}
- if (err)
- kobject_put(wi_kobj);
- return 0;
-}
-static void mempolicy_kobj_release(struct kobject *kobj)
-{
- u8 *old;
+ hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
+ return 0;
- mutex_lock(&iw_table_lock);
- old = rcu_dereference_protected(iw_table,
- lockdep_is_held(&iw_table_lock));
- rcu_assign_pointer(iw_table, NULL);
- mutex_unlock(&iw_table_lock);
- synchronize_rcu();
- kfree(old);
- kfree(node_attrs);
- kfree(kobj);
+err_cleanup_kobj:
+ wi_cleanup();
+ kobject_del(&wi_group->wi_kobj);
+err_put_kobj:
+ kobject_put(&wi_group->wi_kobj);
+ return err;
}
-static const struct kobj_type mempolicy_ktype = {
- .release = mempolicy_kobj_release
-};
-
static int __init mempolicy_sysfs_init(void)
{
int err;
static struct kobject *mempolicy_kobj;
- mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
- if (!mempolicy_kobj) {
- err = -ENOMEM;
- goto err_out;
- }
-
- node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
- GFP_KERNEL);
- if (!node_attrs) {
- err = -ENOMEM;
- goto mempol_out;
- }
+ mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
+ if (!mempolicy_kobj)
+ return -ENOMEM;
- err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
- "mempolicy");
+ err = add_weighted_interleave_group(mempolicy_kobj);
if (err)
- goto node_out;
+ goto err_kobj;
- err = add_weighted_interleave_group(mempolicy_kobj);
- if (err) {
- pr_err("mempolicy sysfs structure failed to initialize\n");
- kobject_put(mempolicy_kobj);
- return err;
- }
+ return 0;
- return err;
-node_out:
- kfree(node_attrs);
-mempol_out:
- kfree(mempolicy_kobj);
-err_out:
- pr_err("failed to add mempolicy kobject to the system\n");
+err_kobj:
+ kobject_del(mempolicy_kobj);
+ kobject_put(mempolicy_kobj);
return err;
}
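
For illustration, the bandwidth-to-weight reduction added above (reduce_interleave_weights()) can be followed in a standalone userspace sketch; the bandwidth figures below are made up and only the arithmetic mirrors the kernel code:

/* Minimal userspace sketch of the weight reduction; values are illustrative. */
#include <stdio.h>

static unsigned int gcd_u(unsigned int a, unsigned int b)
{
	while (b) {
		unsigned int t = a % b;

		a = b;
		b = t;
	}
	return a;
}

int main(void)
{
	const unsigned int weightiness = 32;
	unsigned int bw[3] = { 120000, 60000, 30000 };	/* MB/s, made up */
	unsigned int iw[3], iw_gcd = 0;
	unsigned long long sum_bw = 0;
	int nid;

	for (nid = 0; nid < 3; nid++)
		sum_bw += bw[nid];

	for (nid = 0; nid < 3; nid++) {
		unsigned long long scale = (unsigned long long)weightiness * bw[nid];

		iw[nid] = (bw[nid] && sum_bw < scale) ?
			  (unsigned int)(scale / sum_bw) : 1;
		iw_gcd = iw_gcd ? gcd_u(iw_gcd, iw[nid]) : iw[nid];
	}

	/* Prints 18, 9 and 4 for the 4:2:1 bandwidth ratio (gcd is 1 here). */
	for (nid = 0; nid < 3; nid++)
		printf("node%d weight: %u\n", nid, iw[nid] / iw_gcd);
	return 0;
}

The resulting per-node weights are what the new /sys/kernel/mm/mempolicy/weighted_interleave/nodeN files report while the auto knob is true; writing a weight to any nodeN file switches auto to false, as implemented in node_store() above.
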
diff --git a/mm/mempool.c b/mm/mempool.c
index 3223337135d0..204a216b6418 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -540,11 +540,43 @@ void mempool_free(void *element, mempool_t *pool)
if (likely(pool->curr_nr < pool->min_nr)) {
add_element(pool, element);
spin_unlock_irqrestore(&pool->lock, flags);
- wake_up(&pool->wait);
+ if (wq_has_sleeper(&pool->wait))
+ wake_up(&pool->wait);
return;
}
spin_unlock_irqrestore(&pool->lock, flags);
}
+
+ /*
+ * Handle the min_nr = 0 edge case:
+ *
+ * For zero-minimum pools, curr_nr < min_nr (0 < 0) never succeeds,
+ * so waiters sleeping on pool->wait would never be woken by the
+ * wake-up path above. This explicit check parks the element in the
+ * pool when both min_nr and curr_nr are 0 and wakes any waiters.
+ *
+ * The logic mirrors the test above; add_element() cannot be used
+ * directly because its BUG_ON() rejects the min_nr == curr_nr case,
+ * so the relevant part of add_element() is open-coded here without
+ * that check.
+ */
+ if (unlikely(pool->min_nr == 0 &&
+ READ_ONCE(pool->curr_nr) == 0)) {
+ spin_lock_irqsave(&pool->lock, flags);
+ if (likely(pool->curr_nr == 0)) {
+ /* Inline the logic of add_element() */
+ poison_element(pool, element);
+ if (kasan_poison_element(pool, element))
+ pool->elements[pool->curr_nr++] = element;
+ spin_unlock_irqrestore(&pool->lock, flags);
+ if (wq_has_sleeper(&pool->wait))
+ wake_up(&pool->wait);
+ return;
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
+
pool->free(element, pool->pool_data);
}
EXPORT_SYMBOL(mempool_free);
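
For illustration, a hedged sketch of the zero-minimum case the new branch covers; the "demo_ctx" type and helper names are hypothetical, only the mempool calls are real API:

/* Illustrative only: a zero-minimum pool round trip. */
#include <linux/gfp.h>
#include <linux/mempool.h>
#include <linux/slab.h>

struct demo_ctx { int id; };

static mempool_t *demo_pool;	/* created with min_nr == 0 */

static int demo_pool_create(void)
{
	demo_pool = mempool_create_kmalloc_pool(0, sizeof(struct demo_ctx));
	return demo_pool ? 0 : -ENOMEM;
}

static void demo_pool_roundtrip(void)
{
	struct demo_ctx *ctx = mempool_alloc(demo_pool, GFP_NOIO);

	if (!ctx)
		return;
	/*
	 * Without the hunk above, this free always falls through to the
	 * pool's free function because curr_nr < min_nr is never true for
	 * 0 < 0, so a task sleeping in mempool_alloc() on this pool is
	 * never woken. With the new check, the element is kept in the
	 * pool and any waiter gets a wake_up().
	 */
	mempool_free(ctx, demo_pool);
}
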
diff --git a/mm/memremap.c b/mm/memremap.c
index 2aebc1b192da..b0ce0d8254bd 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -5,7 +5,6 @@
#include <linux/kasan.h>
#include <linux/memory_hotplug.h>
#include <linux/memremap.h>
-#include <linux/pfn_t.h>
#include <linux/swap.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
@@ -39,30 +38,6 @@ unsigned long memremap_compat_align(void)
EXPORT_SYMBOL_GPL(memremap_compat_align);
#endif
-#ifdef CONFIG_FS_DAX
-DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
-EXPORT_SYMBOL(devmap_managed_key);
-
-static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
-{
- if (pgmap->type == MEMORY_DEVICE_FS_DAX)
- static_branch_dec(&devmap_managed_key);
-}
-
-static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
-{
- if (pgmap->type == MEMORY_DEVICE_FS_DAX)
- static_branch_inc(&devmap_managed_key);
-}
-#else
-static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
-{
-}
-static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
-{
-}
-#endif /* CONFIG_FS_DAX */
-
static void pgmap_array_delete(struct range *range)
{
xa_store_range(&pgmap_array, PHYS_PFN(range->start), PHYS_PFN(range->end),
@@ -130,7 +105,7 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
}
mem_hotplug_done();
- untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true);
+ pfnmap_untrack(PHYS_PFN(range->start), range_len(range));
pgmap_array_delete(range);
}
@@ -151,7 +126,6 @@ void memunmap_pages(struct dev_pagemap *pgmap)
percpu_ref_exit(&pgmap->ref);
WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
- devmap_managed_enable_put(pgmap);
}
EXPORT_SYMBOL_GPL(memunmap_pages);
@@ -211,8 +185,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
if (nid < 0)
nid = numa_mem_id();
- error = track_pfn_remap(NULL, &params->pgprot, PHYS_PFN(range->start), 0,
- range_len(range));
+ error = pfnmap_track(PHYS_PFN(range->start), range_len(range),
+ &params->pgprot);
if (error)
goto err_pfn_remap;
@@ -254,7 +228,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
move_pfn_range_to_zone(zone, PHYS_PFN(range->start),
PHYS_PFN(range_len(range)), params->altmap,
- MIGRATE_MOVABLE);
+ MIGRATE_MOVABLE, false);
}
mem_hotplug_done();
@@ -277,7 +251,7 @@ err_add_memory:
if (!is_private)
kasan_remove_zero_shadow(__va(range->start), range_len(range));
err_kasan:
- untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true);
+ pfnmap_untrack(PHYS_PFN(range->start), range_len(range));
err_pfn_remap:
pgmap_array_delete(range);
return error;
@@ -332,10 +306,6 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
}
break;
case MEMORY_DEVICE_FS_DAX:
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
- WARN(1, "File system DAX not supported\n");
- return ERR_PTR(-EINVAL);
- }
params.pgprot = pgprot_decrypted(params.pgprot);
break;
case MEMORY_DEVICE_GENERIC:
@@ -354,8 +324,6 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
if (error)
return ERR_PTR(error);
- devmap_managed_enable_get(pgmap);
-
/*
* Clear the pgmap nr_range as it will be incremented for each
* successfully processed range. This communicates how many
diff --git a/mm/migrate.c b/mm/migrate.c
index f3ee6d8d5e2e..425401b2d4e1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -35,7 +35,6 @@
#include <linux/compat.h>
#include <linux/hugetlb.h>
#include <linux/gfp.h>
-#include <linux/pfn_t.h>
#include <linux/page_idle.h>
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
@@ -44,15 +43,57 @@
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/pagewalk.h>
+#include <linux/balloon_compaction.h>
+#include <linux/zsmalloc.h>
#include <asm/tlbflush.h>
#include <trace/events/migrate.h>
#include "internal.h"
+#include "swap.h"
-bool isolate_movable_page(struct page *page, isolate_mode_t mode)
+static const struct movable_operations *page_movable_ops(struct page *page)
{
+ VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
+
+ /*
+ * If we enable page migration for a page of a certain type by marking
+ * it as movable, the page type must be sticky until the page gets freed
+ * back to the buddy.
+ */
+#ifdef CONFIG_BALLOON_COMPACTION
+ if (PageOffline(page))
+ /* Only balloon compaction sets PageOffline pages movable. */
+ return &balloon_mops;
+#endif /* CONFIG_BALLOON_COMPACTION */
+#if defined(CONFIG_ZSMALLOC) && defined(CONFIG_COMPACTION)
+ if (PageZsmalloc(page))
+ return &zsmalloc_mops;
+#endif /* defined(CONFIG_ZSMALLOC) && defined(CONFIG_COMPACTION) */
+ return NULL;
+}
+
+/**
+ * isolate_movable_ops_page - isolate a movable_ops page for migration
+ * @page: The page.
+ * @mode: The isolation mode.
+ *
+ * Try to isolate a movable_ops page for migration. Will fail if the page is
+ * not a movable_ops page, if the page is already isolated for migration
+ * or if the page was just released by its owner.
+ *
+ * Once isolated, the page cannot get freed until it is either putback
+ * or migrated.
+ *
+ * Returns true if isolation succeeded, otherwise false.
+ */
+bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode)
+{
+ /*
+ * TODO: these pages will not be folios in the future. All
+ * folio dependencies will have to be removed.
+ */
struct folio *folio = folio_get_nontail_page(page);
const struct movable_operations *mops;
@@ -69,11 +110,14 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode)
goto out;
/*
- * Check movable flag before taking the page lock because
+ * Check for movable_ops pages before taking the page lock because
* we use non-atomic bitops on newly allocated page flags so
* unconditionally grabbing the lock ruins page's owner side.
+ *
+ * Note that once a page has movable_ops, it will stay that way
+ * until the page is freed.
*/
- if (unlikely(!__folio_test_movable(folio)))
+ if (unlikely(!page_has_movable_ops(page)))
goto out_putfolio;
/*
@@ -90,18 +134,20 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode)
if (unlikely(!folio_trylock(folio)))
goto out_putfolio;
- if (!folio_test_movable(folio) || folio_test_isolated(folio))
+ VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
+ if (PageMovableOpsIsolated(page))
goto out_no_isolated;
- mops = folio_movable_ops(folio);
- VM_BUG_ON_FOLIO(!mops, folio);
+ mops = page_movable_ops(page);
+ if (WARN_ON_ONCE(!mops))
+ goto out_no_isolated;
- if (!mops->isolate_page(&folio->page, mode))
+ if (!mops->isolate_page(page, mode))
goto out_no_isolated;
/* Driver shouldn't use the isolated flag */
- WARN_ON_ONCE(folio_test_isolated(folio));
- folio_set_isolated(folio);
+ VM_WARN_ON_ONCE_PAGE(PageMovableOpsIsolated(page), page);
+ SetPageMovableOpsIsolated(page);
folio_unlock(folio);
return true;
@@ -114,12 +160,69 @@ out:
return false;
}
-static void putback_movable_folio(struct folio *folio)
+/**
+ * putback_movable_ops_page - putback an isolated movable_ops page
+ * @page: The isolated page.
+ *
+ * Putback an isolated movable_ops page.
+ *
+ * After the page has been put back, it might get freed instantly.
+ */
+static void putback_movable_ops_page(struct page *page)
{
- const struct movable_operations *mops = folio_movable_ops(folio);
+ /*
+ * TODO: these pages will not be folios in the future. All
+ * folio dependencies will have to be removed.
+ */
+ struct folio *folio = page_folio(page);
- mops->putback_page(&folio->page);
- folio_clear_isolated(folio);
+ VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
+ VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(page), page);
+ folio_lock(folio);
+ page_movable_ops(page)->putback_page(page);
+ ClearPageMovableOpsIsolated(page);
+ folio_unlock(folio);
+ folio_put(folio);
+}
+
+/**
+ * migrate_movable_ops_page - migrate an isolated movable_ops page
+ * @dst: The destination page.
+ * @src: The source page.
+ * @mode: The migration mode.
+ *
+ * Migrate an isolated movable_ops page.
+ *
+ * If the src page was already released by its owner, the src page is
+ * un-isolated (putback) and migration succeeds; the migration core will be the
+ * owner of both pages.
+ *
+ * If the src page was not released by its owner and the migration was
+ * successful, the owner of the src page and the dst page are swapped and
+ * the src page is un-isolated.
+ *
+ * If migration fails, the ownership stays unmodified and the src page
+ * remains isolated: migration may be retried later or the page can be putback.
+ *
+ * TODO: migration core will treat both pages as folios and lock them before
+ * this call and unlock them after it. Further, the folio refcounts on
+ * src and dst are also released by migration core. These pages will not be
+ * folios in the future, so that must be reworked.
+ *
+ * Returns MIGRATEPAGE_SUCCESS on success, otherwise a negative error
+ * code.
+ */
+static int migrate_movable_ops_page(struct page *dst, struct page *src,
+ enum migrate_mode mode)
+{
+ int rc = MIGRATEPAGE_SUCCESS;
+
+ VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(src), src);
+ VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(src), src);
+ rc = page_movable_ops(src)->migrate_page(dst, src, mode);
+ if (rc == MIGRATEPAGE_SUCCESS)
+ ClearPageMovableOpsIsolated(src);
+ return rc;
}
/*
@@ -141,20 +244,8 @@ void putback_movable_pages(struct list_head *l)
continue;
}
list_del(&folio->lru);
- /*
- * We isolated non-lru movable folio so here we can use
- * __folio_test_movable because LRU folio's mapping cannot
- * have PAGE_MAPPING_MOVABLE.
- */
- if (unlikely(__folio_test_movable(folio))) {
- VM_BUG_ON_FOLIO(!folio_test_isolated(folio), folio);
- folio_lock(folio);
- if (folio_test_movable(folio))
- putback_movable_folio(folio);
- else
- folio_clear_isolated(folio);
- folio_unlock(folio);
- folio_put(folio);
+ if (unlikely(page_has_movable_ops(&folio->page))) {
+ putback_movable_ops_page(&folio->page);
} else {
node_stat_mod_folio(folio, NR_ISOLATED_ANON +
folio_is_file_lru(folio), -folio_nr_pages(folio));
@@ -166,26 +257,20 @@ void putback_movable_pages(struct list_head *l)
/* Must be called with an elevated refcount on the non-hugetlb folio */
bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
{
- bool isolated, lru;
-
if (folio_test_hugetlb(folio))
return folio_isolate_hugetlb(folio, list);
- lru = !__folio_test_movable(folio);
- if (lru)
- isolated = folio_isolate_lru(folio);
- else
- isolated = isolate_movable_page(&folio->page,
- ISOLATE_UNEVICTABLE);
-
- if (!isolated)
- return false;
-
- list_add(&folio->lru, list);
- if (lru)
+ if (page_has_movable_ops(&folio->page)) {
+ if (!isolate_movable_ops_page(&folio->page,
+ ISOLATE_UNEVICTABLE))
+ return false;
+ } else {
+ if (!folio_isolate_lru(folio))
+ return false;
node_stat_add_folio(folio, NR_ISOLATED_ANON +
folio_is_file_lru(folio));
-
+ }
+ list_add(&folio->lru, list);
return true;
}
@@ -445,20 +530,6 @@ unlock:
}
#endif
-static int folio_expected_refs(struct address_space *mapping,
- struct folio *folio)
-{
- int refs = 1;
- if (!mapping)
- return refs;
-
- refs += folio_nr_pages(folio);
- if (folio_test_private(folio))
- refs++;
-
- return refs;
-}
-
/*
* Replace the folio in the mapping.
*
@@ -601,7 +672,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
int folio_migrate_mapping(struct address_space *mapping,
struct folio *newfolio, struct folio *folio, int extra_count)
{
- int expected_count = folio_expected_refs(mapping, folio) + extra_count;
+ int expected_count = folio_expected_ref_count(folio) + extra_count + 1;
if (folio_ref_count(folio) != expected_count)
return -EAGAIN;
@@ -618,7 +689,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
struct folio *dst, struct folio *src)
{
XA_STATE(xas, &mapping->i_pages, folio_index(src));
- int rc, expected_count = folio_expected_refs(mapping, src);
+ int rc, expected_count = folio_expected_ref_count(src) + 1;
if (folio_ref_count(src) != expected_count)
return -EAGAIN;
@@ -749,7 +820,7 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst,
struct folio *src, void *src_private,
enum migrate_mode mode)
{
- int rc, expected_count = folio_expected_refs(mapping, src);
+ int rc, expected_count = folio_expected_ref_count(src) + 1;
/* Check whether src does not have extra refs before we do more work */
if (folio_ref_count(src) != expected_count)
@@ -837,7 +908,7 @@ static int __buffer_migrate_folio(struct address_space *mapping,
return migrate_folio(mapping, dst, src, mode);
/* Check whether page does not have extra refs before we do more work */
- expected_count = folio_expected_refs(mapping, src);
+ expected_count = folio_expected_ref_count(src) + 1;
if (folio_ref_count(src) != expected_count)
return -EAGAIN;
@@ -845,9 +916,11 @@ static int __buffer_migrate_folio(struct address_space *mapping,
return -EAGAIN;
if (check_refs) {
- bool busy;
+ bool busy, migrating;
bool invalidated = false;
+ migrating = test_and_set_bit_lock(BH_Migrate, &head->b_state);
+ VM_WARN_ON_ONCE(migrating);
recheck_buffers:
busy = false;
spin_lock(&mapping->i_private_lock);
@@ -859,12 +932,12 @@ recheck_buffers:
}
bh = bh->b_this_page;
} while (bh != head);
+ spin_unlock(&mapping->i_private_lock);
if (busy) {
if (invalidated) {
rc = -EAGAIN;
goto unlock_buffers;
}
- spin_unlock(&mapping->i_private_lock);
invalidate_bh_lrus();
invalidated = true;
goto recheck_buffers;
@@ -883,7 +956,7 @@ recheck_buffers:
unlock_buffers:
if (check_refs)
- spin_unlock(&mapping->i_private_lock);
+ clear_bit_unlock(BH_Migrate, &head->b_state);
bh = head;
do {
unlock_buffer(bh);
@@ -945,66 +1018,20 @@ int filemap_migrate_folio(struct address_space *mapping,
EXPORT_SYMBOL_GPL(filemap_migrate_folio);
/*
- * Writeback a folio to clean the dirty state
- */
-static int writeout(struct address_space *mapping, struct folio *folio)
-{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .nr_to_write = 1,
- .range_start = 0,
- .range_end = LLONG_MAX,
- .for_reclaim = 1
- };
- int rc;
-
- if (!mapping->a_ops->writepage)
- /* No write method for the address space */
- return -EINVAL;
-
- if (!folio_clear_dirty_for_io(folio))
- /* Someone else already triggered a write */
- return -EAGAIN;
-
- /*
- * A dirty folio may imply that the underlying filesystem has
- * the folio on some queue. So the folio must be clean for
- * migration. Writeout may mean we lose the lock and the
- * folio state is no longer what we checked for earlier.
- * At this point we know that the migration attempt cannot
- * be successful.
- */
- remove_migration_ptes(folio, folio, 0);
-
- rc = mapping->a_ops->writepage(&folio->page, &wbc);
-
- if (rc != AOP_WRITEPAGE_ACTIVATE)
- /* unlocked. Relock */
- folio_lock(folio);
-
- return (rc < 0) ? -EIO : -EAGAIN;
-}
-
-/*
* Default handling if a filesystem does not provide a migration function.
*/
static int fallback_migrate_folio(struct address_space *mapping,
struct folio *dst, struct folio *src, enum migrate_mode mode)
{
- if (folio_test_dirty(src)) {
- /* Only writeback folios in full synchronous migration */
- switch (mode) {
- case MIGRATE_SYNC:
- break;
- default:
- return -EBUSY;
- }
- return writeout(mapping, src);
- }
+ WARN_ONCE(mapping->a_ops->writepages,
+ "%ps does not implement migrate_folio\n",
+ mapping->a_ops);
+ if (folio_test_dirty(src))
+ return -EBUSY;
/*
- * Buffers may be managed in a filesystem specific way.
- * We must have no buffers or drop them.
+ * Filesystem may have private data at folio->private that we
+ * can't migrate automatically.
*/
if (!filemap_release_folio(src, GFP_KERNEL))
return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
@@ -1013,11 +1040,12 @@ static int fallback_migrate_folio(struct address_space *mapping,
}
/*
- * Move a page to a newly allocated page
- * The page is locked and all ptes have been successfully removed.
+ * Move a src folio to a newly allocated dst folio.
+ *
+ * The src and dst folios are locked and the src folio was unmapped from
+ * the page tables.
*
- * The new page will have replaced the old page if this function
- * is successful.
+ * On success, the src folio was replaced by the dst folio.
*
* Return value:
* < 0 - error code
@@ -1026,78 +1054,40 @@ static int fallback_migrate_folio(struct address_space *mapping,
static int move_to_new_folio(struct folio *dst, struct folio *src,
enum migrate_mode mode)
{
+ struct address_space *mapping = folio_mapping(src);
int rc = -EAGAIN;
- bool is_lru = !__folio_test_movable(src);
VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);
- if (likely(is_lru)) {
- struct address_space *mapping = folio_mapping(src);
-
- if (!mapping)
- rc = migrate_folio(mapping, dst, src, mode);
- else if (mapping_inaccessible(mapping))
- rc = -EOPNOTSUPP;
- else if (mapping->a_ops->migrate_folio)
- /*
- * Most folios have a mapping and most filesystems
- * provide a migrate_folio callback. Anonymous folios
- * are part of swap space which also has its own
- * migrate_folio callback. This is the most common path
- * for page migration.
- */
- rc = mapping->a_ops->migrate_folio(mapping, dst, src,
- mode);
- else
- rc = fallback_migrate_folio(mapping, dst, src, mode);
- } else {
- const struct movable_operations *mops;
-
+ if (!mapping)
+ rc = migrate_folio(mapping, dst, src, mode);
+ else if (mapping_inaccessible(mapping))
+ rc = -EOPNOTSUPP;
+ else if (mapping->a_ops->migrate_folio)
/*
- * In case of non-lru page, it could be released after
- * isolation step. In that case, we shouldn't try migration.
+ * Most folios have a mapping and most filesystems
+ * provide a migrate_folio callback. Anonymous folios
+ * are part of swap space which also has its own
+ * migrate_folio callback. This is the most common path
+ * for page migration.
*/
- VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
- if (!folio_test_movable(src)) {
- rc = MIGRATEPAGE_SUCCESS;
- folio_clear_isolated(src);
- goto out;
- }
-
- mops = folio_movable_ops(src);
- rc = mops->migrate_page(&dst->page, &src->page, mode);
- WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
- !folio_test_isolated(src));
- }
+ rc = mapping->a_ops->migrate_folio(mapping, dst, src,
+ mode);
+ else
+ rc = fallback_migrate_folio(mapping, dst, src, mode);
- /*
- * When successful, old pagecache src->mapping must be cleared before
- * src is freed; but stats require that PageAnon be left as PageAnon.
- */
if (rc == MIGRATEPAGE_SUCCESS) {
- if (__folio_test_movable(src)) {
- VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
-
- /*
- * We clear PG_movable under page_lock so any compactor
- * cannot try to migrate this page.
- */
- folio_clear_isolated(src);
- }
-
/*
- * Anonymous and movable src->mapping will be cleared by
- * free_pages_prepare so don't reset it here for keeping
- * the type to work PageAnon, for example.
+ * For pagecache folios, src->mapping must be cleared before src
+ * is freed. Anonymous folios must stay anonymous until freed.
*/
- if (!folio_mapping_flags(src))
+ if (!folio_test_anon(src))
src->mapping = NULL;
if (likely(!folio_is_zone_device(dst)))
flush_dcache_folio(dst);
}
-out:
return rc;
}
@@ -1164,12 +1154,7 @@ static void migrate_folio_undo_dst(struct folio *dst, bool locked,
static void migrate_folio_done(struct folio *src,
enum migrate_reason reason)
{
- /*
- * Compaction can migrate also non-LRU pages which are
- * not accounted to NR_ISOLATED_*. They can be recognized
- * as __folio_test_movable
- */
- if (likely(!__folio_test_movable(src)) && reason != MR_DEMOTION)
+ if (likely(!page_has_movable_ops(&src->page)) && reason != MR_DEMOTION)
mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
folio_is_file_lru(src), -folio_nr_pages(src));
@@ -1188,7 +1173,6 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
int rc = -EAGAIN;
int old_page_state = 0;
struct anon_vma *anon_vma = NULL;
- bool is_lru = data_race(!__folio_test_movable(src));
bool locked = false;
bool dst_locked = false;
@@ -1289,7 +1273,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
goto out;
dst_locked = true;
- if (unlikely(!is_lru)) {
+ if (unlikely(page_has_movable_ops(&src->page))) {
__migrate_folio_record(dst, old_page_state, anon_vma);
return MIGRATEPAGE_UNMAP;
}
@@ -1348,20 +1332,23 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
int rc;
int old_page_state = 0;
struct anon_vma *anon_vma = NULL;
- bool is_lru = !__folio_test_movable(src);
struct list_head *prev;
__migrate_folio_extract(dst, &old_page_state, &anon_vma);
prev = dst->lru.prev;
list_del(&dst->lru);
+ if (unlikely(page_has_movable_ops(&src->page))) {
+ rc = migrate_movable_ops_page(&dst->page, &src->page, mode);
+ if (rc)
+ goto out;
+ goto out_unlock_both;
+ }
+
rc = move_to_new_folio(dst, src, mode);
if (rc)
goto out;
- if (unlikely(!is_lru))
- goto out_unlock_both;
-
/*
* When successful, push dst to LRU immediately: so that if it
* turns out to be an mlocked page, remove_migration_ptes() will
@@ -1380,7 +1367,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
out_unlock_both:
folio_unlock(dst);
- set_page_owner_migrate_reason(&dst->page, reason);
+ folio_set_owner_migrate_reason(dst, reason);
/*
* If migration is successful, decrease refcount of dst,
* which will not free the page because new page owner increased
@@ -2376,13 +2363,6 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
}
/*
- * The move_pages() man page does not have an -EEXIST choice, so
- * use -EFAULT instead.
- */
- if (err == -EEXIST)
- err = -EFAULT;
-
- /*
* If the page is already on the target node (!err), store the
* node, otherwise, store the err.
*/
@@ -2456,6 +2436,7 @@ set_status:
static int get_compat_pages_array(const void __user *chunk_pages[],
const void __user * __user *pages,
+ unsigned long chunk_offset,
unsigned long chunk_nr)
{
compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
@@ -2463,7 +2444,7 @@ static int get_compat_pages_array(const void __user *chunk_pages[],
int i;
for (i = 0; i < chunk_nr; i++) {
- if (get_user(p, pages32 + i))
+ if (get_user(p, pages32 + chunk_offset + i))
return -EFAULT;
chunk_pages[i] = compat_ptr(p);
}
@@ -2482,27 +2463,28 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
#define DO_PAGES_STAT_CHUNK_NR 16UL
const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
int chunk_status[DO_PAGES_STAT_CHUNK_NR];
+ unsigned long chunk_offset = 0;
while (nr_pages) {
unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR);
if (in_compat_syscall()) {
if (get_compat_pages_array(chunk_pages, pages,
- chunk_nr))
+ chunk_offset, chunk_nr))
break;
} else {
- if (copy_from_user(chunk_pages, pages,
+ if (copy_from_user(chunk_pages, pages + chunk_offset,
chunk_nr * sizeof(*chunk_pages)))
break;
}
do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
- if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
+ if (copy_to_user(status + chunk_offset, chunk_status,
+ chunk_nr * sizeof(*status)))
break;
- pages += chunk_nr;
- status += chunk_nr;
+ chunk_offset += chunk_nr;
nr_pages -= chunk_nr;
}
return nr_pages ? -EFAULT : 0;
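
The chunk_offset change above matters because, in a compat syscall, the user array holds 32-bit pointers, so advancing the 64-bit "pages" cursor by chunk_nr walked past the caller's data. For illustration, a userspace sketch that exercises this status-query path (nodes == NULL, more than DO_PAGES_STAT_CHUNK_NR pages so several chunks are walked); buffer sizes are arbitrary and libnuma's move_pages() wrapper is assumed (build with -lnuma):

#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	int nr_pages = 64;		/* > 16, so the kernel walks several chunks */
	void **pages = calloc(nr_pages, sizeof(void *));
	int *status = calloc(nr_pages, sizeof(int));
	char *buf = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int i;

	if (buf == MAP_FAILED || !pages || !status)
		return 1;
	memset(buf, 1, nr_pages * page_size);	/* fault the pages in */
	for (i = 0; i < nr_pages; i++)
		pages[i] = buf + i * page_size;

	/* nodes == NULL: query only; status[i] gets the node id or -errno */
	if (move_pages(0 /* self */, nr_pages, pages, NULL, status, 0))
		perror("move_pages");
	for (i = 0; i < nr_pages; i++)
		printf("page %d on node %d\n", i, status[i]);
	return 0;
}
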
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 3158afe7eb23..e05e14d6eacd 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -615,7 +615,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
pmdp = pmd_alloc(mm, pudp, addr);
if (!pmdp)
goto abort;
- if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
+ if (pmd_trans_huge(*pmdp))
goto abort;
if (pte_alloc(mm, pmdp))
goto abort;
diff --git a/mm/mincore.c b/mm/mincore.c
index 832f29f46767..42d6c9c8da86 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -21,6 +21,7 @@
#include <linux/uaccess.h>
#include "swap.h"
+#include "internal.h"
static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
unsigned long end, struct mm_walk *walk)
@@ -105,6 +106,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *ptep;
unsigned char *vec = walk->private;
int nr = (end - addr) >> PAGE_SHIFT;
+ int step, i;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -118,16 +120,26 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
walk->action = ACTION_AGAIN;
return 0;
}
- for (; addr != end; ptep++, addr += PAGE_SIZE) {
+ for (; addr != end; ptep += step, addr += step * PAGE_SIZE) {
pte_t pte = ptep_get(ptep);
+ step = 1;
/* We need to do cache lookup too for pte markers */
if (pte_none_mostly(pte))
__mincore_unmapped_range(addr, addr + PAGE_SIZE,
vma, vec);
- else if (pte_present(pte))
- *vec = 1;
- else { /* pte is a swap entry */
+ else if (pte_present(pte)) {
+ unsigned int batch = pte_batch_hint(ptep, pte);
+
+ if (batch > 1) {
+ unsigned int max_nr = (end - addr) >> PAGE_SHIFT;
+
+ step = min_t(unsigned int, batch, max_nr);
+ }
+
+ for (i = 0; i < step; i++)
+ vec[i] = 1;
+ } else { /* pte is a swap entry */
swp_entry_t entry = pte_to_swp_entry(pte);
if (non_swap_entry(entry)) {
@@ -146,7 +158,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
#endif
}
}
- vec++;
+ vec += step;
}
pte_unmap_unlock(ptep - 1, ptl);
out:
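
The batching above only changes how the kernel walks the page table; the user-visible behaviour of mincore(2) is unchanged. For illustration, a minimal userspace example of the call this walk serves, with an arbitrary mapping size:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	size_t len = 512 * page_size;
	unsigned char *vec = malloc(len / page_size);	/* one byte per page */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	size_t resident = 0, i;

	if (buf == MAP_FAILED || !vec)
		return 1;
	memset(buf, 1, len);			/* fault everything in */
	if (mincore(buf, len, vec))
		return 1;
	for (i = 0; i < len / page_size; i++)
		resident += vec[i] & 1;
	printf("%zu of %zu pages resident\n", resident, len / page_size);
	return 0;
}
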
diff --git a/mm/mlock.c b/mm/mlock.c
index 3cb72b579ffd..a1d93ad33c6d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -307,15 +307,13 @@ void munlock_folio(struct folio *folio)
static inline unsigned int folio_mlock_step(struct folio *folio,
pte_t *pte, unsigned long addr, unsigned long end)
{
- const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
unsigned int count = (end - addr) >> PAGE_SHIFT;
pte_t ptent = ptep_get(pte);
if (!folio_test_large(folio))
return 1;
- return folio_pte_batch(folio, addr, pte, ptent, count, fpb_flags, NULL,
- NULL, NULL);
+ return folio_pte_batch(folio, pte, ptent, count);
}
static inline bool allow_mlock_munlock(struct folio *folio,
diff --git a/mm/mm_init.c b/mm/mm_init.c
index a38a1909b407..5c21b3af216b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -30,6 +30,7 @@
#include <linux/crash_dump.h>
#include <linux/execmem.h>
#include <linux/vmstat.h>
+#include <linux/kexec_handover.h>
#include <linux/hugetlb.h>
#include "internal.h"
#include "slab.h"
@@ -684,7 +685,8 @@ void __meminit __init_page_from_nid(unsigned long pfn, int nid)
__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
if (pageblock_aligned(pfn))
- set_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE);
+ init_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE,
+ false);
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -743,7 +745,7 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static void __meminit init_deferred_page(unsigned long pfn, int nid)
+static void __meminit __init_deferred_page(unsigned long pfn, int nid)
{
if (early_page_initialised(pfn, nid))
return;
@@ -763,11 +765,16 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static inline void init_deferred_page(unsigned long pfn, int nid)
+static inline void __init_deferred_page(unsigned long pfn, int nid)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+void __meminit init_deferred_page(unsigned long pfn, int nid)
+{
+ __init_deferred_page(pfn, nid);
+}
+
/*
* Initialised pages do not have PageReserved set. This function is
* called for each range allocated by the bootmem allocator and
@@ -777,22 +784,19 @@ static inline void init_deferred_page(unsigned long pfn, int nid)
void __meminit reserve_bootmem_region(phys_addr_t start,
phys_addr_t end, int nid)
{
- unsigned long start_pfn = PFN_DOWN(start);
- unsigned long end_pfn = PFN_UP(end);
+ unsigned long pfn;
- for (; start_pfn < end_pfn; start_pfn++) {
- if (pfn_valid(start_pfn)) {
- struct page *page = pfn_to_page(start_pfn);
+ for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) {
+ struct page *page = pfn_to_page(pfn);
- init_deferred_page(start_pfn, nid);
+ __init_deferred_page(pfn, nid);
- /*
- * no need for atomic set_bit because the struct
- * page is not visible yet so nobody should
- * access it yet.
- */
- __SetPageReserved(page);
- }
+ /*
+ * no need for atomic set_bit because the struct
+ * page is not visible yet so nobody should
+ * access it yet.
+ */
+ __SetPageReserved(page);
}
}
@@ -828,7 +832,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
* - physical memory bank size is not necessarily the exact multiple of the
* arbitrary section size
* - early reserved memory may not be listed in memblock.memory
- * - non-memory regions covered by the contigious flatmem mapping
+ * - non-memory regions covered by the contiguous flatmem mapping
* - memory layouts defined with memmap= kernel parameter may not align
* nicely with memmap sections
*
@@ -848,11 +852,7 @@ static void __init init_unavailable_range(unsigned long spfn,
unsigned long pfn;
u64 pgcnt = 0;
- for (pfn = spfn; pfn < epfn; pfn++) {
- if (!pfn_valid(pageblock_start_pfn(pfn))) {
- pfn = pageblock_end_pfn(pfn) - 1;
- continue;
- }
+ for_each_valid_pfn(pfn, spfn, epfn) {
__init_single_page(pfn_to_page(pfn), pfn, zone, node);
__SetPageReserved(pfn_to_page(pfn));
pgcnt++;
@@ -875,7 +875,8 @@ static void __init init_unavailable_range(unsigned long spfn,
void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, unsigned long zone_end_pfn,
enum meminit_context context,
- struct vmem_altmap *altmap, int migratetype)
+ struct vmem_altmap *altmap, int migratetype,
+ bool isolate_pageblock)
{
unsigned long pfn, end_pfn = start_pfn + size;
struct page *page;
@@ -932,7 +933,8 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
* over the place during system boot.
*/
if (pageblock_aligned(pfn)) {
- set_pageblock_migratetype(page, migratetype);
+ init_pageblock_migratetype(page, migratetype,
+ isolate_pageblock);
cond_resched();
}
pfn++;
@@ -955,7 +957,8 @@ static void __init memmap_init_zone_range(struct zone *zone,
return;
memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
- zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+ zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE,
+ false);
if (*hole_pfn < start_pfn)
init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
@@ -984,19 +987,19 @@ static void __init memmap_init(void)
}
}
-#ifdef CONFIG_SPARSEMEM
/*
* Initialize the memory map for hole in the range [memory_end,
- * section_end].
+ * section_end] for SPARSEMEM and in the range [memory_end, memmap_end]
+ * for FLATMEM.
* Append the pages in this hole to the highest zone in the last
* node.
- * The call to init_unavailable_range() is outside the ifdef to
- * silence the compiler warining about zone_id set but not used;
- * for FLATMEM it is a nop anyway
*/
+#ifdef CONFIG_SPARSEMEM
end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
- if (hole_pfn < end_pfn)
+#else
+ end_pfn = round_up(end_pfn, MAX_ORDER_NR_PAGES);
#endif
+ if (hole_pfn < end_pfn)
init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
}
@@ -1036,7 +1039,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
* because this is done early in section_activate()
*/
if (pageblock_aligned(pfn)) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ init_pageblock_migratetype(page, MIGRATE_MOVABLE, false);
cond_resched();
}
@@ -1510,7 +1513,7 @@ static inline void setup_usemap(struct zone *zone) {}
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
void __init set_pageblock_order(void)
{
- unsigned int order = MAX_PAGE_ORDER;
+ unsigned int order = PAGE_BLOCK_MAX_ORDER;
/* Check that pageblock_nr_pages has not already been setup */
if (pageblock_order)
@@ -1785,7 +1788,7 @@ static bool arch_has_descending_max_zone_pfns(void)
return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40);
}
-static void set_high_memory(void)
+static void __init set_high_memory(void)
{
phys_addr_t highmem = memblock_end_of_DRAM();
@@ -1907,7 +1910,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
free_area_init_node(nid);
/*
- * No sysfs hierarcy will be created via register_one_node()
+ * No sysfs hierarchy will be created via register_one_node()
*for memory-less node because here it's not marked as N_MEMORY
*and won't be set online later. The benefit is userspace
*program won't be confused by sysfs files/directories of
@@ -1997,7 +2000,8 @@ static void __init deferred_free_pages(unsigned long pfn,
/* Free a large naturally-aligned chunk if possible */
if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
for (i = 0; i < nr_pages; i += pageblock_nr_pages)
- set_pageblock_migratetype(page + i, MIGRATE_MOVABLE);
+ init_pageblock_migratetype(page + i, MIGRATE_MOVABLE,
+ false);
__free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY);
return;
}
@@ -2007,7 +2011,8 @@ static void __init deferred_free_pages(unsigned long pfn,
for (i = 0; i < nr_pages; i++, page++, pfn++) {
if (pageblock_aligned(pfn))
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ init_pageblock_migratetype(page, MIGRATE_MOVABLE,
+ false);
__free_pages_core(page, 0, MEMINIT_EARLY);
}
}
@@ -2306,7 +2311,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
set_page_count(p, 0);
} while (++p, --i);
- set_pageblock_migratetype(page, MIGRATE_CMA);
+ init_pageblock_migratetype(page, MIGRATE_CMA, false);
set_page_refcounted(page);
/* pages were reserved and not allocated */
clear_page_tag_ref(page);
@@ -2320,7 +2325,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
*/
void __init init_cma_pageblock(struct page *page)
{
- set_pageblock_migratetype(page, MIGRATE_CMA);
+ init_pageblock_migratetype(page, MIGRATE_CMA, false);
adjust_managed_page_count(page, pageblock_nr_pages);
page_zone(page)->cma_pages += pageblock_nr_pages;
}
@@ -2667,12 +2672,6 @@ static void __init report_meminit(void)
stack = "all(pattern)";
else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
stack = "all(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
- stack = "byref_all(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
- stack = "byref(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
- stack = "__user(zero)";
else
stack = "off";
@@ -2765,6 +2764,13 @@ void __init mm_core_init(void)
report_meminit();
kmsan_init_shadow();
stack_depot_early_init();
+
+ /*
+ * KHO memory setup must happen while memblock is still active, but
+ * as close as possible to buddy initialization
+ */
+ kho_memory_init();
+
memblock_free_all();
mem_init();
kmem_cache_init();
diff --git a/mm/mmap.c b/mm/mmap.c
index bd210aaf7ebd..7306253cc3b5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -80,7 +80,7 @@ core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
/* Update vma->vm_page_prot to reflect vma->vm_flags. */
void vma_set_page_prot(struct vm_area_struct *vma)
{
- unsigned long vm_flags = vma->vm_flags;
+ vm_flags_t vm_flags = vma->vm_flags;
pgprot_t vm_page_prot;
vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
@@ -127,18 +127,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
origbrk = mm->brk;
+ min_brk = mm->start_brk;
#ifdef CONFIG_COMPAT_BRK
/*
* CONFIG_COMPAT_BRK can still be overridden by setting
* randomize_va_space to 2, which will still cause mm->start_brk
* to be arbitrarily shifted
*/
- if (current->brk_randomized)
- min_brk = mm->start_brk;
- else
+ if (!current->brk_randomized)
min_brk = mm->end_data;
-#else
- min_brk = mm->start_brk;
#endif
if (brk < min_brk)
goto out;
@@ -228,12 +225,12 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
-bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
+bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags,
unsigned long bytes)
{
unsigned long locked_pages, limit_pages;
- if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
+ if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
return true;
locked_pages = bytes >> PAGE_SHIFT;
@@ -475,7 +472,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags &= ~VM_MAYEXEC;
}
- if (!file->f_op->mmap)
+ if (!can_mmap_file(file))
return -ENODEV;
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
@@ -871,9 +868,8 @@ mm_get_unmapped_area(struct mm_struct *mm, struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
- if (test_bit(MMF_TOPDOWN, &mm->flags))
- return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags, 0);
- return arch_get_unmapped_area(file, addr, len, pgoff, flags, 0);
+ return mm_get_unmapped_area_vmflags(mm, file, addr, len,
+ pgoff, flags, 0);
}
EXPORT_SYMBOL(mm_get_unmapped_area);
@@ -1207,7 +1203,7 @@ out:
return ret;
}
-int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
+int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
@@ -1224,7 +1220,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
return 0;
/* Until we need other flags, refuse anything except VM_EXEC. */
- if ((flags & (~VM_EXEC)) != 0)
+ if ((vm_flags & (~VM_EXEC)) != 0)
return -EINVAL;
if (mmap_write_lock_killable(mm))
@@ -1239,7 +1235,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
goto munmap_failed;
vma = vma_prev(&vmi);
- ret = do_brk_flags(&vmi, vma, addr, len, flags);
+ ret = do_brk_flags(&vmi, vma, addr, len, vm_flags);
populate = ((mm->def_flags & VM_LOCKED) != 0);
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
@@ -1321,48 +1317,6 @@ destroy:
vm_unacct_memory(nr_accounted);
}
-/* Insert vm structure into process list sorted by address
- * and into the inode's i_mmap tree. If vm_file is non-NULL
- * then i_mmap_rwsem is taken here.
- */
-int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
-{
- unsigned long charged = vma_pages(vma);
-
-
- if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
- return -ENOMEM;
-
- if ((vma->vm_flags & VM_ACCOUNT) &&
- security_vm_enough_memory_mm(mm, charged))
- return -ENOMEM;
-
- /*
- * The vm_pgoff of a purely anonymous vma should be irrelevant
- * until its first write fault, when page's anon_vma and index
- * are set. But now set the vm_pgoff it will almost certainly
- * end up with (unless mremap moves it elsewhere before that
- * first wfault), so /proc/pid/maps tells a consistent story.
- *
- * By setting it to reflect the virtual start address of the
- * vma, merges and splits can happen in a seamless way, just
- * using the existing file pgoff checks and manipulations.
- * Similarly in do_mmap and in do_brk_flags.
- */
- if (vma_is_anonymous(vma)) {
- BUG_ON(vma->anon_vma);
- vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
- }
-
- if (vma_link(mm, vma)) {
- if (vma->vm_flags & VM_ACCOUNT)
- vm_unacct_memory(charged);
- return -ENOMEM;
- }
-
- return 0;
-}
-
/*
* Return true if the calling process may expand its vm space by the passed
* number of pages
@@ -1486,7 +1440,7 @@ static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
static struct vm_area_struct *__install_special_mapping(
struct mm_struct *mm,
unsigned long addr, unsigned long len,
- unsigned long vm_flags, void *priv,
+ vm_flags_t vm_flags, void *priv,
const struct vm_operations_struct *ops)
{
int ret;
@@ -1538,7 +1492,7 @@ bool vma_is_special_mapping(const struct vm_area_struct *vma,
struct vm_area_struct *_install_special_mapping(
struct mm_struct *mm,
unsigned long addr, unsigned long len,
- unsigned long vm_flags, const struct vm_special_mapping *spec)
+ vm_flags_t vm_flags, const struct vm_special_mapping *spec)
{
return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
&special_mapping_vmops);
@@ -1596,7 +1550,7 @@ static const struct ctl_table mmap_table[] = {
#endif /* CONFIG_SYSCTL */
/*
- * initialise the percpu counter for VM
+ * initialise the percpu counter for VM, initialise VMA state.
*/
void __init mmap_init(void)
{
@@ -1607,6 +1561,7 @@ void __init mmap_init(void)
#ifdef CONFIG_SYSCTL
register_sysctl_init("vm", mmap_table);
#endif
+ vma_state_init();
}
/*
@@ -1718,90 +1673,6 @@ static int __meminit init_reserve_notifier(void)
subsys_initcall(init_reserve_notifier);
/*
- * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
- * this VMA and its relocated range, which will now reside at [vma->vm_start -
- * shift, vma->vm_end - shift).
- *
- * This function is almost certainly NOT what you want for anything other than
- * early executable temporary stack relocation.
- */
-int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
-{
- /*
- * The process proceeds as follows:
- *
- * 1) Use shift to calculate the new vma endpoints.
- * 2) Extend vma to cover both the old and new ranges. This ensures the
- * arguments passed to subsequent functions are consistent.
- * 3) Move vma's page tables to the new range.
- * 4) Free up any cleared pgd range.
- * 5) Shrink the vma to cover only the new range.
- */
-
- struct mm_struct *mm = vma->vm_mm;
- unsigned long old_start = vma->vm_start;
- unsigned long old_end = vma->vm_end;
- unsigned long length = old_end - old_start;
- unsigned long new_start = old_start - shift;
- unsigned long new_end = old_end - shift;
- VMA_ITERATOR(vmi, mm, new_start);
- VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
- struct vm_area_struct *next;
- struct mmu_gather tlb;
- PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);
-
- BUG_ON(new_start > new_end);
-
- /*
- * ensure there are no vmas between where we want to go
- * and where we are
- */
- if (vma != vma_next(&vmi))
- return -EFAULT;
-
- vma_iter_prev_range(&vmi);
- /*
- * cover the whole range: [new_start, old_end)
- */
- vmg.middle = vma;
- if (vma_expand(&vmg))
- return -ENOMEM;
-
- /*
- * move the page tables downwards, on failure we rely on
- * process cleanup to remove whatever mess we made.
- */
- pmc.for_stack = true;
- if (length != move_page_tables(&pmc))
- return -ENOMEM;
-
- tlb_gather_mmu(&tlb, mm);
- next = vma_next(&vmi);
- if (new_end > old_start) {
- /*
- * when the old and new regions overlap clear from new_end.
- */
- free_pgd_range(&tlb, new_end, old_end, new_end,
- next ? next->vm_start : USER_PGTABLES_CEILING);
- } else {
- /*
- * otherwise, clean from old_start; this is done to not touch
- * the address space in [new_end, old_start) some architectures
- * have constraints on va-space that make this illegal (IA64) -
- * for the others its just a little faster.
- */
- free_pgd_range(&tlb, old_start, old_end, new_end,
- next ? next->vm_start : USER_PGTABLES_CEILING);
- }
- tlb_finish_mmu(&tlb);
-
- vma_prev(&vmi);
- /* Shrink the vma to just the new range */
- return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
-}
-
-#ifdef CONFIG_MMU
-/*
 * Obtain a read lock on mm->mmap_lock; if the specified address is below the
* start of the VMA, the intent is to perform a write, and it is a
* downward-growing stack, then attempt to expand the stack to contain it.
@@ -1844,10 +1715,175 @@ bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
mmap_write_downgrade(mm);
return true;
}
-#else
-bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, bool write)
+
+__latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
- return false;
+ struct vm_area_struct *mpnt, *tmp;
+ int retval;
+ unsigned long charge = 0;
+ LIST_HEAD(uf);
+ VMA_ITERATOR(vmi, mm, 0);
+
+ if (mmap_write_lock_killable(oldmm))
+ return -EINTR;
+ flush_cache_dup_mm(oldmm);
+ uprobe_dup_mmap(oldmm, mm);
+ /*
+ * Not linked in yet - no deadlock potential:
+ */
+ mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
+
+ /* No ordering required: file already has been exposed. */
+ dup_mm_exe_file(mm, oldmm);
+
+ mm->total_vm = oldmm->total_vm;
+ mm->data_vm = oldmm->data_vm;
+ mm->exec_vm = oldmm->exec_vm;
+ mm->stack_vm = oldmm->stack_vm;
+
+ /* Use __mt_dup() to efficiently build an identical maple tree. */
+ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
+ if (unlikely(retval))
+ goto out;
+
+ mt_clear_in_rcu(vmi.mas.tree);
+ for_each_vma(vmi, mpnt) {
+ struct file *file;
+
+ vma_start_write(mpnt);
+ if (mpnt->vm_flags & VM_DONTCOPY) {
+ retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
+ mpnt->vm_end, GFP_KERNEL);
+ if (retval)
+ goto loop_out;
+
+ vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
+ continue;
+ }
+ charge = 0;
+ /*
+ * Don't duplicate many vmas if we've been oom-killed (for
+ * example)
+ */
+ if (fatal_signal_pending(current)) {
+ retval = -EINTR;
+ goto loop_out;
+ }
+ if (mpnt->vm_flags & VM_ACCOUNT) {
+ unsigned long len = vma_pages(mpnt);
+
+ if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
+ goto fail_nomem;
+ charge = len;
+ }
+
+ tmp = vm_area_dup(mpnt);
+ if (!tmp)
+ goto fail_nomem;
+ retval = vma_dup_policy(mpnt, tmp);
+ if (retval)
+ goto fail_nomem_policy;
+ tmp->vm_mm = mm;
+ retval = dup_userfaultfd(tmp, &uf);
+ if (retval)
+ goto fail_nomem_anon_vma_fork;
+ if (tmp->vm_flags & VM_WIPEONFORK) {
+ /*
+ * VM_WIPEONFORK gets a clean slate in the child.
+ * Don't prepare anon_vma until fault since we don't
+ * copy page for current vma.
+ */
+ tmp->anon_vma = NULL;
+ } else if (anon_vma_fork(tmp, mpnt))
+ goto fail_nomem_anon_vma_fork;
+ vm_flags_clear(tmp, VM_LOCKED_MASK);
+ /*
+ * Copy/update hugetlb private vma information.
+ */
+ if (is_vm_hugetlb_page(tmp))
+ hugetlb_dup_vma_private(tmp);
+
+ /*
+ * Link the vma into the MT. After using __mt_dup(), memory
+ * allocation is not necessary here, so it cannot fail.
+ */
+ vma_iter_bulk_store(&vmi, tmp);
+
+ mm->map_count++;
+
+ if (tmp->vm_ops && tmp->vm_ops->open)
+ tmp->vm_ops->open(tmp);
+
+ file = tmp->vm_file;
+ if (file) {
+ struct address_space *mapping = file->f_mapping;
+
+ get_file(file);
+ i_mmap_lock_write(mapping);
+ if (vma_is_shared_maywrite(tmp))
+ mapping_allow_writable(mapping);
+ flush_dcache_mmap_lock(mapping);
+ /* insert tmp into the share list, just after mpnt */
+ vma_interval_tree_insert_after(tmp, mpnt,
+ &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
+ i_mmap_unlock_write(mapping);
+ }
+
+ if (!(tmp->vm_flags & VM_WIPEONFORK))
+ retval = copy_page_range(tmp, mpnt);
+
+ if (retval) {
+ mpnt = vma_next(&vmi);
+ goto loop_out;
+ }
+ }
+ /* a new mm has just been created */
+ retval = arch_dup_mmap(oldmm, mm);
+loop_out:
+ vma_iter_free(&vmi);
+ if (!retval) {
+ mt_set_in_rcu(vmi.mas.tree);
+ ksm_fork(mm, oldmm);
+ khugepaged_fork(mm, oldmm);
+ } else {
+
+ /*
+ * The entire maple tree has already been duplicated. If the
+ * mmap duplication fails, mark the failure point with
+ * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
+ * stop releasing VMAs that have not been duplicated after this
+ * point.
+ */
+ if (mpnt) {
+ mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
+ mas_store(&vmi.mas, XA_ZERO_ENTRY);
+ /* Avoid OOM iterating a broken tree */
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ }
+ /*
+ * The mm_struct is going to exit, but the locks will be dropped
+		 * first. Marking the mm_struct as unstable is advisable as it is
+ * not fully initialised.
+ */
+ set_bit(MMF_UNSTABLE, &mm->flags);
+ }
+out:
+ mmap_write_unlock(mm);
+ flush_tlb_mm(oldmm);
+ mmap_write_unlock(oldmm);
+ if (!retval)
+ dup_userfaultfd_complete(&uf);
+ else
+ dup_userfaultfd_fail(&uf);
+ return retval;
+
+fail_nomem_anon_vma_fork:
+ mpol_put(vma_policy(tmp));
+fail_nomem_policy:
+ vm_area_free(tmp);
+fail_nomem:
+ retval = -ENOMEM;
+ vm_unacct_memory(charge);
+ goto loop_out;
}
-#endif
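For reference (not part of the patch itself), the VM_DONTCOPY and VM_WIPEONFORK cases handled by dup_mmap() above are the kernel side of the MADV_DONTFORK and MADV_WIPEONFORK madvise(2) flags. A minimal userspace sketch of the VM_WIPEONFORK path, assuming a kernel and libc that expose MADV_WIPEONFORK:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	size_t len = 4096;
	/* Private anonymous mapping; a child would normally inherit a CoW copy. */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	strcpy(p, "parent data");

	/* Sets VM_WIPEONFORK: the child gets zero-filled pages, not a copy. */
	madvise(p, len, MADV_WIPEONFORK);

	if (fork() == 0) {
		/* Prints "": dup_mmap() cleared anon_vma and skipped copy_page_range(). */
		printf("child sees: \"%s\"\n", p);
		_exit(0);
	}
	wait(NULL);
	printf("parent sees: \"%s\"\n", p);	/* unchanged */
	munmap(p, len);
	return 0;
}

MADV_DONTFORK (VM_DONTCOPY) goes further: dup_mmap() drops the range from the child's tree entirely, so the child faults with SIGSEGV on access.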
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index e7dbaf96aa17..729fb7d0dd59 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -42,3 +42,369 @@ void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */
+
+#ifdef CONFIG_MMU
+#ifdef CONFIG_PER_VMA_LOCK
+static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
+{
+ unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
+
+ /* Additional refcnt if the vma is attached. */
+ if (!detaching)
+ tgt_refcnt++;
+
+ /*
+ * If vma is detached then only vma_mark_attached() can raise the
+ * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
+ */
+ if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
+ return false;
+
+ rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
+ rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
+ refcount_read(&vma->vm_refcnt) == tgt_refcnt,
+ TASK_UNINTERRUPTIBLE);
+ lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
+
+ return true;
+}
+
+static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
+{
+ *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
+ rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+}
+
+void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
+{
+ bool locked;
+
+ /*
+ * __vma_enter_locked() returns false immediately if the vma is not
+	 * attached, otherwise it waits until the refcnt indicates that the vma
+ * is attached with no readers.
+ */
+ locked = __vma_enter_locked(vma, false);
+
+ /*
+ * We should use WRITE_ONCE() here because we can have concurrent reads
+ * from the early lockless pessimistic check in vma_start_read().
+ * We don't really care about the correctness of that early check, but
+ * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
+ */
+ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
+
+ if (locked) {
+ bool detached;
+
+ __vma_exit_locked(vma, &detached);
+ WARN_ON_ONCE(detached); /* vma should remain attached */
+ }
+}
+EXPORT_SYMBOL_GPL(__vma_start_write);
+
+void vma_mark_detached(struct vm_area_struct *vma)
+{
+ vma_assert_write_locked(vma);
+ vma_assert_attached(vma);
+
+ /*
+ * We are the only writer, so no need to use vma_refcount_put().
+	 * The condition below is unlikely because the vma has already been
+ * write-locked and readers can increment vm_refcnt only temporarily
+ * before they check vm_lock_seq, realize the vma is locked and drop
+ * back the vm_refcnt. That is a narrow window for observing a raised
+ * vm_refcnt.
+ */
+ if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
+ /* Wait until vma is detached with no readers. */
+ if (__vma_enter_locked(vma, true)) {
+ bool detached;
+
+ __vma_exit_locked(vma, &detached);
+ WARN_ON_ONCE(!detached);
+ }
+ }
+}
+
+/*
+ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
+ * stable and not isolated. If the VMA is not found or is being modified the
+ * function returns NULL.
+ */
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+ unsigned long address)
+{
+ MA_STATE(mas, &mm->mm_mt, address, address);
+ struct vm_area_struct *vma;
+
+ rcu_read_lock();
+retry:
+ vma = mas_walk(&mas);
+ if (!vma)
+ goto inval;
+
+ vma = vma_start_read(mm, vma);
+ if (IS_ERR_OR_NULL(vma)) {
+ /* Check if the VMA got isolated after we found it */
+ if (PTR_ERR(vma) == -EAGAIN) {
+ count_vm_vma_lock_event(VMA_LOCK_MISS);
+ /* The area was replaced with another one */
+ goto retry;
+ }
+
+ /* Failed to lock the VMA */
+ goto inval;
+ }
+ /*
+ * At this point, we have a stable reference to a VMA: The VMA is
+ * locked and we know it hasn't already been isolated.
+ * From here on, we can access the VMA without worrying about which
+ * fields are accessible for RCU readers.
+ */
+
+ /* Check if the vma we locked is the right one. */
+ if (unlikely(vma->vm_mm != mm ||
+ address < vma->vm_start || address >= vma->vm_end))
+ goto inval_end_read;
+
+ rcu_read_unlock();
+ return vma;
+
+inval_end_read:
+ vma_end_read(vma);
+inval:
+ rcu_read_unlock();
+ count_vm_vma_lock_event(VMA_LOCK_ABORT);
+ return NULL;
+}
+
+static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
+ struct vma_iterator *vmi,
+ unsigned long from_addr)
+{
+ struct vm_area_struct *vma;
+ int ret;
+
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ return ERR_PTR(ret);
+
+ /* Lookup the vma at the last position again under mmap_read_lock */
+ vma_iter_set(vmi, from_addr);
+ vma = vma_next(vmi);
+ if (vma) {
+ /* Very unlikely vma->vm_refcnt overflow case */
+ if (unlikely(!vma_start_read_locked(vma)))
+ vma = ERR_PTR(-EAGAIN);
+ }
+
+ mmap_read_unlock(mm);
+
+ return vma;
+}
+
+struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
+ struct vma_iterator *vmi,
+ unsigned long from_addr)
+{
+ struct vm_area_struct *vma;
+ unsigned int mm_wr_seq;
+ bool mmap_unlocked;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
+retry:
+ /* Start mmap_lock speculation in case we need to verify the vma later */
+ mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
+ vma = vma_next(vmi);
+ if (!vma)
+ return NULL;
+
+ vma = vma_start_read(mm, vma);
+ if (IS_ERR_OR_NULL(vma)) {
+ /*
+ * Retry immediately if the vma gets detached from under us.
+ * Infinite loop should not happen because the vma we find will
+		 * An infinite loop should not happen because the vma we find will
+ */
+ if (PTR_ERR(vma) == -EAGAIN) {
+ /* reset to search from the last address */
+ vma_iter_set(vmi, from_addr);
+ goto retry;
+ }
+
+ goto fallback;
+ }
+
+ /*
+ * Verify the vma we locked belongs to the same address space and it's
+ * not behind of the last search position.
+ */
+ if (unlikely(vma->vm_mm != mm || from_addr >= vma->vm_end))
+ goto fallback_unlock;
+
+ /*
+ * vma can be ahead of the last search position but we need to verify
+	 * that it was not shrunk after we found it and that another vma has not
+	 * been installed ahead of it. Otherwise we might observe a gap that should
+ * not be there.
+ */
+ if (from_addr < vma->vm_start) {
+ /* Verify only if the address space might have changed since vma lookup. */
+ if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
+ vma_iter_set(vmi, from_addr);
+ if (vma != vma_next(vmi))
+ goto fallback_unlock;
+ }
+ }
+
+ return vma;
+
+fallback_unlock:
+ vma_end_read(vma);
+fallback:
+ rcu_read_unlock();
+ vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
+ rcu_read_lock();
+ /* Reinitialize the iterator after re-entering rcu read section */
+ vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);
+
+ return vma;
+}
+#endif /* CONFIG_PER_VMA_LOCK */
+
+#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
+#include <linux/extable.h>
+
+static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+ if (likely(mmap_read_trylock(mm)))
+ return true;
+
+ if (regs && !user_mode(regs)) {
+ unsigned long ip = exception_ip(regs);
+ if (!search_exception_tables(ip))
+ return false;
+ }
+
+ return !mmap_read_lock_killable(mm);
+}
+
+static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
+{
+ /*
+ * We don't have this operation yet.
+ *
+ * It should be easy enough to do: it's basically a
+ * atomic_long_try_cmpxchg_acquire()
+ * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
+ * it also needs the proper lockdep magic etc.
+ */
+ return false;
+}
+
+static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+ mmap_read_unlock(mm);
+ if (regs && !user_mode(regs)) {
+ unsigned long ip = exception_ip(regs);
+ if (!search_exception_tables(ip))
+ return false;
+ }
+ return !mmap_write_lock_killable(mm);
+}
+
+/*
+ * Helper for page fault handling.
+ *
+ * This is kind of equivalent to "mmap_read_lock()" followed
+ * by "find_extend_vma()", except it's a lot more careful about
+ * the locking (and will drop the lock on failure).
+ *
+ * For example, if we have a kernel bug that causes a page
+ * fault, we don't want to just use mmap_read_lock() to get
+ * the mm lock, because that would deadlock if the bug were
+ * to happen while we're holding the mm lock for writing.
+ *
+ * So this checks the exception tables on kernel faults in
+ * order to only do this all for instructions that are actually
+ * expected to fault.
+ *
+ * We can also actually take the mm lock for writing if we
+ * need to extend the vma, which helps the VM layer a lot.
+ */
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+ unsigned long addr, struct pt_regs *regs)
+{
+ struct vm_area_struct *vma;
+
+ if (!get_mmap_lock_carefully(mm, regs))
+ return NULL;
+
+ vma = find_vma(mm, addr);
+ if (likely(vma && (vma->vm_start <= addr)))
+ return vma;
+
+ /*
+ * Well, dang. We might still be successful, but only
+ * if we can extend a vma to do so.
+ */
+ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
+ mmap_read_unlock(mm);
+ return NULL;
+ }
+
+ /*
+ * We can try to upgrade the mmap lock atomically,
+ * in which case we can continue to use the vma
+ * we already looked up.
+ *
+ * Otherwise we'll have to drop the mmap lock and
+ * re-take it, and also look up the vma again,
+ * re-checking it.
+ */
+ if (!mmap_upgrade_trylock(mm)) {
+ if (!upgrade_mmap_lock_carefully(mm, regs))
+ return NULL;
+
+ vma = find_vma(mm, addr);
+ if (!vma)
+ goto fail;
+ if (vma->vm_start <= addr)
+ goto success;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto fail;
+ }
+
+ if (expand_stack_locked(vma, addr))
+ goto fail;
+
+success:
+ mmap_write_downgrade(mm);
+ return vma;
+
+fail:
+ mmap_write_unlock(mm);
+ return NULL;
+}
+#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */
+
+#else /* CONFIG_MMU */
+
+/*
+ * At least xtensa ends up having protection faults even with no
+ * MMU.. No stack expansion, at least.
+ */
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+ unsigned long addr, struct pt_regs *regs)
+{
+ struct vm_area_struct *vma;
+
+ mmap_read_lock(mm);
+ vma = vma_lookup(mm, addr);
+ if (!vma)
+ mmap_read_unlock(mm);
+ return vma;
+}
+
+#endif /* CONFIG_MMU */
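Both lookup paths added above are consumed by the architecture page-fault handlers. A simplified sketch of that pattern (per-architecture details, permission checks and retry handling are elided; this is illustrative, not part of the patch):

static void fault_sketch(struct pt_regs *regs, unsigned long address,
			 unsigned int flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	vm_fault_t fault;

	/* Fast path: per-VMA read lock, mmap_lock is never taken. */
	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		goto lock_mmap;

	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);
	if (!(fault & VM_FAULT_RETRY))
		return;		/* handled under the VMA lock */

lock_mmap:
	/* Slow path: mmap_lock-based lookup, with careful lock acquisition. */
	vma = lock_mm_and_find_vma(mm, address, regs);
	if (!vma)
		return;		/* no VMA, or the lock could not be taken */

	fault = handle_mm_fault(vma, address, flags, regs);
	mmap_read_unlock(mm);
}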
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index db7ba4a725d6..b49cc6385f1f 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -424,6 +424,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
tlb->page_size = 0;
#endif
+ tlb->vma_pfn = 0;
__tlb_reset_range(tlb);
inc_tlb_flush_pending(tlb->mm);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index fc18fe274505..8e0125dc0522 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -4,7 +4,7 @@
*
* Copyright (C) 2008 Qumranet, Inc.
* Copyright (C) 2008 SGI
- * Christoph Lameter <cl@linux.com>
+ * Christoph Lameter <cl@gentwo.org>
*/
#include <linux/rculist.h>
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 62c1f7945741..2ddd37b2f462 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -40,11 +40,8 @@
#include "internal.h"
-bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte)
+static bool maybe_change_pte_writable(struct vm_area_struct *vma, pte_t pte)
{
- struct page *page;
-
if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
return false;
@@ -60,16 +57,32 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
if (userfaultfd_pte_wp(vma, pte))
return false;
- if (!(vma->vm_flags & VM_SHARED)) {
- /*
- * Writable MAP_PRIVATE mapping: We can only special-case on
- * exclusive anonymous pages, because we know that our
- * write-fault handler similarly would map them writable without
- * any additional checks while holding the PT lock.
- */
- page = vm_normal_page(vma, addr, pte);
- return page && PageAnon(page) && PageAnonExclusive(page);
- }
+ return true;
+}
+
+static bool can_change_private_pte_writable(struct vm_area_struct *vma,
+ unsigned long addr, pte_t pte)
+{
+ struct page *page;
+
+ if (!maybe_change_pte_writable(vma, pte))
+ return false;
+
+ /*
+ * Writable MAP_PRIVATE mapping: We can only special-case on
+ * exclusive anonymous pages, because we know that our
+ * write-fault handler similarly would map them writable without
+ * any additional checks while holding the PT lock.
+ */
+ page = vm_normal_page(vma, addr, pte);
+ return page && PageAnon(page) && PageAnonExclusive(page);
+}
+
+static bool can_change_shared_pte_writable(struct vm_area_struct *vma,
+ pte_t pte)
+{
+ if (!maybe_change_pte_writable(vma, pte))
+ return false;
VM_WARN_ON_ONCE(is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte));
@@ -83,6 +96,183 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
return pte_dirty(pte);
}
+bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
+{
+ if (!(vma->vm_flags & VM_SHARED))
+ return can_change_private_pte_writable(vma, addr, pte);
+
+ return can_change_shared_pte_writable(vma, pte);
+}
+
+static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
+ pte_t pte, int max_nr_ptes, fpb_t flags)
+{
+ /* No underlying folio, so cannot batch */
+ if (!folio)
+ return 1;
+
+ if (!folio_test_large(folio))
+ return 1;
+
+ return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr_ptes, flags);
+}
+
+static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr,
+ pte_t oldpte, pte_t *pte, int target_node,
+ struct folio **foliop)
+{
+ struct folio *folio = NULL;
+ bool ret = true;
+ bool toptier;
+ int nid;
+
+ /* Avoid TLB flush if possible */
+ if (pte_protnone(oldpte))
+ goto skip;
+
+ folio = vm_normal_folio(vma, addr, oldpte);
+ if (!folio)
+ goto skip;
+
+ if (folio_is_zone_device(folio) || folio_test_ksm(folio))
+ goto skip;
+
+ /* Also skip shared copy-on-write pages */
+ if (is_cow_mapping(vma->vm_flags) &&
+ (folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio)))
+ goto skip;
+
+ /*
+ * While migration can move some dirty pages,
+ * it cannot move them all from MIGRATE_ASYNC
+ * context.
+ */
+ if (folio_is_file_lru(folio) && folio_test_dirty(folio))
+ goto skip;
+
+ /*
+ * Don't mess with PTEs if page is already on the node
+ * a single-threaded process is running on.
+ */
+ nid = folio_nid(folio);
+ if (target_node == nid)
+ goto skip;
+
+ toptier = node_is_toptier(nid);
+
+ /*
+ * Skip scanning top tier node if normal numa
+ * balancing is disabled
+ */
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier)
+ goto skip;
+
+ ret = false;
+ if (folio_use_access_time(folio))
+ folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
+
+skip:
+ *foliop = folio;
+ return ret;
+}
+
+/* Set nr_ptes number of ptes, starting from idx */
+static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes,
+ int idx, bool set_write, struct mmu_gather *tlb)
+{
+ /*
+ * Advance the position in the batch by idx; note that if idx > 0,
+ * then the nr_ptes passed here is <= batch size - idx.
+ */
+ addr += idx * PAGE_SIZE;
+ ptep += idx;
+ oldpte = pte_advance_pfn(oldpte, idx);
+ ptent = pte_advance_pfn(ptent, idx);
+
+ if (set_write)
+ ptent = pte_mkwrite(ptent, vma);
+
+ modify_prot_commit_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes);
+ if (pte_needs_flush(oldpte, ptent))
+ tlb_flush_pte_range(tlb, addr, nr_ptes * PAGE_SIZE);
+}
+
+/*
+ * Get max length of consecutive ptes pointing to PageAnonExclusive() pages or
+ * !PageAnonExclusive() pages, starting from start_idx. Caller must enforce
+ * that the ptes point to consecutive pages of the same anon large folio.
+ */
+static int page_anon_exclusive_sub_batch(int start_idx, int max_len,
+ struct page *first_page, bool expected_anon_exclusive)
+{
+ int idx;
+
+ for (idx = start_idx + 1; idx < start_idx + max_len; ++idx) {
+ if (expected_anon_exclusive != PageAnonExclusive(first_page + idx))
+ break;
+ }
+ return idx - start_idx;
+}
+
+/*
+ * This function is a result of trying our very best to retain the
+ * "avoid the write-fault handler" optimization. In can_change_pte_writable(),
+ * if the vma is a private vma, and we cannot determine whether to change
+ * the pte to writable just from the vma and the pte, we then need to look
+ * at the actual page pointed to by the pte. Unfortunately, if we have a
+ * batch of ptes pointing to consecutive pages of the same anon large folio,
+ * the anon-exclusivity (or the negation) of the first page does not guarantee
+ * the anon-exclusivity (or the negation) of the other pages corresponding to
+ * the pte batch; hence in this case it is incorrect to decide to change or
+ * not change the ptes to writable just by using information from the first
+ * pte of the batch. Therefore, we must individually check all pages and
+ * retrieve sub-batches.
+ */
+static void commit_anon_folio_batch(struct vm_area_struct *vma,
+ struct folio *folio, unsigned long addr, pte_t *ptep,
+ pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
+{
+ struct page *first_page = folio_page(folio, 0);
+ bool expected_anon_exclusive;
+ int sub_batch_idx = 0;
+ int len;
+
+ while (nr_ptes) {
+ expected_anon_exclusive = PageAnonExclusive(first_page + sub_batch_idx);
+ len = page_anon_exclusive_sub_batch(sub_batch_idx, nr_ptes,
+ first_page, expected_anon_exclusive);
+ prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, len,
+ sub_batch_idx, expected_anon_exclusive, tlb);
+ sub_batch_idx += len;
+ nr_ptes -= len;
+ }
+}
+
+static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma,
+ struct folio *folio, unsigned long addr, pte_t *ptep,
+ pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
+{
+ bool set_write;
+
+ if (vma->vm_flags & VM_SHARED) {
+ set_write = can_change_shared_pte_writable(vma, ptent);
+ prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes,
+ /* idx = */ 0, set_write, tlb);
+ return;
+ }
+
+ set_write = maybe_change_pte_writable(vma, ptent) &&
+ (folio && folio_test_anon(folio));
+ if (!set_write) {
+ prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes,
+ /* idx = */ 0, set_write, tlb);
+ return;
+ }
+ commit_anon_folio_batch(vma, folio, addr, ptep, oldpte, ptent, nr_ptes, tlb);
+}
+
static long change_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
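Worked example for commit_anon_folio_batch() above: if six consecutive pages of one large folio have PageAnonExclusive() states {1, 1, 0, 0, 0, 1}, the batch is committed as sub-batches of lengths 2, 3 and 1, with the write bit set only for the exclusive runs. A standalone userspace model of that split (hypothetical, for illustration only):

#include <stdbool.h>
#include <stdio.h>

/* Mirrors page_anon_exclusive_sub_batch(): stop at the first state change. */
static int sub_batch_len(const bool *anon_excl, int start, int max_len)
{
	int idx;

	for (idx = start + 1; idx < start + max_len; idx++)
		if (anon_excl[idx] != anon_excl[start])
			break;
	return idx - start;
}

int main(void)
{
	/* Anon-exclusive state of six consecutive pages of one folio. */
	bool anon_excl[] = { true, true, false, false, false, true };
	int nr_ptes = 6, idx = 0;

	while (nr_ptes) {
		int len = sub_batch_len(anon_excl, idx, nr_ptes);

		/*
		 * The real code calls prot_commit_flush_ptes() once per
		 * sub-batch, passing set_write = anon_excl[idx].
		 */
		printf("sub-batch at %d, len %d, set_write=%d\n",
		       idx, len, anon_excl[idx]);
		idx += len;
		nr_ptes -= len;
	}
	return 0;
}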
@@ -94,6 +284,7 @@ static long change_pte_range(struct mmu_gather *tlb,
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+ int nr_ptes;
tlb_change_page_size(tlb, PAGE_SIZE);
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -108,8 +299,12 @@ static long change_pte_range(struct mmu_gather *tlb,
flush_tlb_batched_pending(vma->vm_mm);
arch_enter_lazy_mmu_mode();
do {
+ nr_ptes = 1;
oldpte = ptep_get(pte);
if (pte_present(oldpte)) {
+ const fpb_t flags = FPB_RESPECT_SOFT_DIRTY | FPB_RESPECT_WRITE;
+ int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
+ struct folio *folio = NULL;
pte_t ptent;
/*
@@ -117,56 +312,23 @@ static long change_pte_range(struct mmu_gather *tlb,
* pages. See similar comment in change_huge_pmd.
*/
if (prot_numa) {
- struct folio *folio;
- int nid;
- bool toptier;
+ int ret = prot_numa_skip(vma, addr, oldpte, pte,
+ target_node, &folio);
+ if (ret) {
- /* Avoid TLB flush if possible */
- if (pte_protnone(oldpte))
+ /* determine batch to skip */
+ nr_ptes = mprotect_folio_pte_batch(folio,
+ pte, oldpte, max_nr_ptes, /* flags = */ 0);
continue;
+ }
+ }
+ if (!folio)
folio = vm_normal_folio(vma, addr, oldpte);
- if (!folio || folio_is_zone_device(folio) ||
- folio_test_ksm(folio))
- continue;
-
- /* Also skip shared copy-on-write pages */
- if (is_cow_mapping(vma->vm_flags) &&
- (folio_maybe_dma_pinned(folio) ||
- folio_maybe_mapped_shared(folio)))
- continue;
- /*
- * While migration can move some dirty pages,
- * it cannot move them all from MIGRATE_ASYNC
- * context.
- */
- if (folio_is_file_lru(folio) &&
- folio_test_dirty(folio))
- continue;
-
- /*
- * Don't mess with PTEs if page is already on the node
- * a single-threaded process is running on.
- */
- nid = folio_nid(folio);
- if (target_node == nid)
- continue;
- toptier = node_is_toptier(nid);
-
- /*
- * Skip scanning top tier node if normal numa
- * balancing is disabled
- */
- if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
- toptier)
- continue;
- if (folio_use_access_time(folio))
- folio_xchg_access_time(folio,
- jiffies_to_msecs(jiffies));
- }
+ nr_ptes = mprotect_folio_pte_batch(folio, pte, oldpte, max_nr_ptes, flags);
- oldpte = ptep_modify_prot_start(vma, addr, pte);
+ oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes);
ptent = pte_modify(oldpte, newprot);
if (uffd_wp)
@@ -188,14 +350,13 @@ static long change_pte_range(struct mmu_gather *tlb,
* COW or special handling is required.
*/
if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
- !pte_write(ptent) &&
- can_change_pte_writable(vma, addr, ptent))
- ptent = pte_mkwrite(ptent, vma);
-
- ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
- if (pte_needs_flush(oldpte, ptent))
- tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
- pages++;
+ !pte_write(ptent))
+ set_write_prot_commit_flush_ptes(vma, folio,
+ addr, pte, oldpte, ptent, nr_ptes, tlb);
+ else
+ prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent,
+ nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb);
+ pages += nr_ptes;
} else if (is_swap_pte(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
pte_t newpte;
@@ -280,7 +441,7 @@ static long change_pte_range(struct mmu_gather *tlb,
pages++;
}
}
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
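Note on the new loop stride: max_nr_ptes is clamped to (end - addr) >> PAGE_SHIFT, so a batch can never step past end and the pre-existing "addr != end" exit test still terminates. A toy userspace model of that invariant, using made-up batch sizes:

#include <assert.h>
#include <stdio.h>

#define TOY_PAGE_SIZE 4096UL

int main(void)
{
	unsigned long addr = 0, end = 16 * TOY_PAGE_SIZE;
	/* Pretend these are the values mprotect_folio_pte_batch() returns. */
	int batches[] = { 1, 4, 8, 2, 9 }, i = 0;

	while (addr != end) {
		int max_nr = (end - addr) / TOY_PAGE_SIZE;
		int nr = batches[i++];

		if (nr > max_nr)	/* the kernel clamps via max_nr_ptes */
			nr = max_nr;
		addr += nr * TOY_PAGE_SIZE;
		assert(addr <= end);
		printf("advanced %d ptes, addr now at page %lu\n",
		       nr, addr / TOY_PAGE_SIZE);
	}
	return 0;
}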
@@ -376,10 +537,10 @@ again:
goto next;
_pmd = pmdp_get_lockless(pmd);
- if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) {
+ if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd)) {
if ((next - addr != HPAGE_PMD_SIZE) ||
pgtable_split_needed(vma, cp_flags)) {
- __split_huge_pmd(vma, pmd, addr, false, NULL);
+ __split_huge_pmd(vma, pmd, addr, false);
/*
* For file-backed, the pmd could have been
* cleared; make sure pmd populated if
@@ -596,10 +757,10 @@ static const struct mm_walk_ops prot_none_walk_ops = {
int
mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
struct vm_area_struct *vma, struct vm_area_struct **pprev,
- unsigned long start, unsigned long end, unsigned long newflags)
+ unsigned long start, unsigned long end, vm_flags_t newflags)
{
struct mm_struct *mm = vma->vm_mm;
- unsigned long oldflags = READ_ONCE(vma->vm_flags);
+ vm_flags_t oldflags = READ_ONCE(vma->vm_flags);
long nrpages = (end - start) >> PAGE_SHIFT;
unsigned int mm_cp_flags = 0;
unsigned long charged = 0;
@@ -774,8 +935,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
nstart = start;
tmp = vma->vm_start;
for_each_vma_range(vmi, vma, end) {
- unsigned long mask_off_old_flags;
- unsigned long newflags;
+ vm_flags_t mask_off_old_flags;
+ vm_flags_t newflags;
int new_vma_pkey;
if (vma->vm_start != tmp) {
diff --git a/mm/mremap.c b/mm/mremap.c
index 0865387531ed..e15cf2e444c7 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -52,7 +52,7 @@ struct vma_remap_struct {
unsigned long addr; /* User-specified address from which we remap. */
unsigned long old_len; /* Length of range being remapped. */
unsigned long new_len; /* Desired new length of mapping. */
- unsigned long flags; /* user-specified MREMAP_* flags. */
+ const unsigned long flags; /* user-specified MREMAP_* flags. */
unsigned long new_addr; /* Optionally, desired new address. */
/* uffd state. */
@@ -65,10 +65,11 @@ struct vma_remap_struct {
/* Internal state, determined in do_mremap(). */
unsigned long delta; /* Absolute delta of old_len,new_len. */
- bool mlocked; /* Was the VMA mlock()'d? */
+ bool populate_expand; /* mlock()'d expanded, must populate. */
enum mremap_type remap_type; /* expand, shrink, etc. */
bool mmap_locked; /* Is mm currently write-locked? */
unsigned long charged; /* If VM_ACCOUNT, # pages to account. */
+ bool vmi_needs_invalidate; /* Is the VMA iterator invalidated? */
};
static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
@@ -170,13 +171,29 @@ static pte_t move_soft_dirty_pte(pte_t pte)
return pte;
}
+static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t pte, int max_nr)
+{
+ struct folio *folio;
+
+ if (max_nr == 1)
+ return 1;
+
+ folio = vm_normal_folio(vma, addr, pte);
+ if (!folio || !folio_test_large(folio))
+ return 1;
+
+ return folio_pte_batch(folio, ptep, pte, max_nr);
+}
+
static int move_ptes(struct pagetable_move_control *pmc,
unsigned long extent, pmd_t *old_pmd, pmd_t *new_pmd)
{
struct vm_area_struct *vma = pmc->old;
bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct mm_struct *mm = vma->vm_mm;
- pte_t *old_pte, *new_pte, pte;
+ pte_t *old_ptep, *new_ptep;
+ pte_t old_pte, pte;
pmd_t dummy_pmdval;
spinlock_t *old_ptl, *new_ptl;
bool force_flush = false;
@@ -184,6 +201,8 @@ static int move_ptes(struct pagetable_move_control *pmc,
unsigned long new_addr = pmc->new_addr;
unsigned long old_end = old_addr + extent;
unsigned long len = old_end - old_addr;
+ int max_nr_ptes;
+ int nr_ptes;
int err = 0;
/*
@@ -211,8 +230,8 @@ static int move_ptes(struct pagetable_move_control *pmc,
* We don't have to worry about the ordering of src and dst
* pte locks because exclusive mmap_lock prevents deadlock.
*/
- old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
- if (!old_pte) {
+ old_ptep = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
+ if (!old_ptep) {
err = -EAGAIN;
goto out;
}
@@ -223,10 +242,10 @@ static int move_ptes(struct pagetable_move_control *pmc,
* mmap_lock, so this new_pte page is stable, so there is no need to get
* pmdval and do pmd_same() check.
*/
- new_pte = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval,
+ new_ptep = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval,
&new_ptl);
- if (!new_pte) {
- pte_unmap_unlock(old_pte, old_ptl);
+ if (!new_ptep) {
+ pte_unmap_unlock(old_ptep, old_ptl);
err = -EAGAIN;
goto out;
}
@@ -235,12 +254,16 @@ static int move_ptes(struct pagetable_move_control *pmc,
flush_tlb_batched_pending(vma->vm_mm);
arch_enter_lazy_mmu_mode();
- for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
- new_pte++, new_addr += PAGE_SIZE) {
- if (pte_none(ptep_get(old_pte)))
+ for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE,
+ new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) {
+ VM_WARN_ON_ONCE(!pte_none(*new_ptep));
+
+ nr_ptes = 1;
+ max_nr_ptes = (old_end - old_addr) >> PAGE_SHIFT;
+ old_pte = ptep_get(old_ptep);
+ if (pte_none(old_pte))
continue;
- pte = ptep_get_and_clear(mm, old_addr, old_pte);
/*
* If we are remapping a valid PTE, make sure
* to flush TLB before we drop the PTL for the
@@ -252,13 +275,17 @@ static int move_ptes(struct pagetable_move_control *pmc,
* the TLB entry for the old mapping has been
* flushed.
*/
- if (pte_present(pte))
+ if (pte_present(old_pte)) {
+ nr_ptes = mremap_folio_pte_batch(vma, old_addr, old_ptep,
+ old_pte, max_nr_ptes);
force_flush = true;
+ }
+ pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr_ptes, 0);
pte = move_pte(pte, old_addr, new_addr);
pte = move_soft_dirty_pte(pte);
if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
- pte_clear(mm, new_addr, new_pte);
+ pte_clear(mm, new_addr, new_ptep);
else {
if (need_clear_uffd_wp) {
if (pte_present(pte))
@@ -266,7 +293,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
else if (is_swap_pte(pte))
pte = pte_swp_clear_uffd_wp(pte);
}
- set_pte_at(mm, new_addr, new_pte, pte);
+ set_ptes(mm, new_addr, new_ptep, pte, nr_ptes);
}
}
@@ -275,8 +302,8 @@ static int move_ptes(struct pagetable_move_control *pmc,
flush_tlb_range(vma, old_end - len, old_end);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
- pte_unmap(new_pte - 1);
- pte_unmap_unlock(old_pte - 1, old_ptl);
+ pte_unmap(new_ptep - 1);
+ pte_unmap_unlock(old_ptep - 1, old_ptl);
out:
if (pmc->need_rmap_locks)
drop_rmap_locks(vma);
@@ -792,7 +819,7 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc)
new_pud = alloc_new_pud(mm, pmc->new_addr);
if (!new_pud)
break;
- if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
+ if (pud_trans_huge(*old_pud)) {
if (extent == HPAGE_PUD_SIZE) {
move_pgt_entry(pmc, HPAGE_PUD, old_pud, new_pud);
/* We ignore and continue on error? */
@@ -811,8 +838,7 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc)
if (!new_pmd)
break;
again:
- if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
- pmd_devmap(*old_pmd)) {
+ if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) {
if (extent == HPAGE_PMD_SIZE &&
move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd))
continue;
@@ -884,7 +910,11 @@ static bool vrm_overlaps(struct vma_remap_struct *vrm)
return false;
}
-/* Do the mremap() flags require that the new_addr parameter be specified? */
+/*
+ * Will a new address definitely be assigned? This is the case either if the
+ * user specifies it via MREMAP_FIXED, or if MREMAP_DONTUNMAP is used,
+ * indicating we will always determine a target address.
+ */
static bool vrm_implies_new_addr(struct vma_remap_struct *vrm)
{
return vrm->flags & (MREMAP_FIXED | MREMAP_DONTUNMAP);
@@ -930,7 +960,7 @@ static unsigned long vrm_set_new_addr(struct vma_remap_struct *vrm)
*
* Returns true on success, false if insufficient memory to charge.
*/
-static bool vrm_charge(struct vma_remap_struct *vrm)
+static bool vrm_calc_charge(struct vma_remap_struct *vrm)
{
unsigned long charged;
@@ -981,10 +1011,8 @@ static void vrm_stat_account(struct vma_remap_struct *vrm,
struct vm_area_struct *vma = vrm->vma;
vm_stat_account(mm, vma->vm_flags, pages);
- if (vma->vm_flags & VM_LOCKED) {
+ if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += pages;
- vrm->mlocked = true;
- }
}
/*
@@ -997,7 +1025,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm)
struct vm_area_struct *vma = vrm->vma;
unsigned long old_addr = vrm->addr;
unsigned long old_len = vrm->old_len;
- unsigned long dummy = vma->vm_flags;
+ vm_flags_t dummy = vma->vm_flags;
/*
* We'd prefer to avoid failure later on in do_munmap:
@@ -1084,6 +1112,7 @@ static void unmap_source_vma(struct vma_remap_struct *vrm)
err = do_vmi_munmap(&vmi, mm, addr, len, vrm->uf_unmap, /* unlock= */false);
vrm->vma = NULL; /* Invalidated. */
+ vrm->vmi_needs_invalidate = true;
if (err) {
/* OOM: unable to split vma, just get accounts right */
vm_acct_memory(len >> PAGE_SHIFT);
@@ -1159,6 +1188,10 @@ static int copy_vma_and_data(struct vma_remap_struct *vrm,
*new_vma_ptr = NULL;
return -ENOMEM;
}
+ /* By merging, we may have invalidated any iterator in use. */
+ if (vma != vrm->vma)
+ vrm->vmi_needs_invalidate = true;
+
vrm->vma = vma;
pmc.old = vma;
pmc.new = new_vma;
@@ -1188,12 +1221,7 @@ static int copy_vma_and_data(struct vma_remap_struct *vrm,
mremap_userfaultfd_prep(new_vma, vrm->uf);
}
- if (is_vm_hugetlb_page(vma))
- clear_vma_resv_huge_pages(vma);
-
- /* Tell pfnmap has moved from this vma */
- if (unlikely(vma->vm_flags & VM_PFNMAP))
- untrack_pfn_clear(vma);
+ fixup_hugetlb_reservations(vma);
*new_vma_ptr = new_vma;
return err;
@@ -1240,8 +1268,11 @@ static unsigned long move_vma(struct vma_remap_struct *vrm)
if (err)
return err;
- /* If accounted, charge the number of bytes the operation will use. */
- if (!vrm_charge(vrm))
+ /*
+ * If accounted, determine the number of bytes the operation will
+ * charge.
+ */
+ if (!vrm_calc_charge(vrm))
return -ENOMEM;
/* We don't want racing faults. */
@@ -1280,64 +1311,6 @@ static unsigned long move_vma(struct vma_remap_struct *vrm)
}
/*
- * resize_is_valid() - Ensure the vma can be resized to the new length at the give
- * address.
- *
- * Return 0 on success, error otherwise.
- */
-static int resize_is_valid(struct vma_remap_struct *vrm)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma = vrm->vma;
- unsigned long addr = vrm->addr;
- unsigned long old_len = vrm->old_len;
- unsigned long new_len = vrm->new_len;
- unsigned long pgoff;
-
- /*
- * !old_len is a special case where an attempt is made to 'duplicate'
- * a mapping. This makes no sense for private mappings as it will
- * instead create a fresh/new mapping unrelated to the original. This
- * is contrary to the basic idea of mremap which creates new mappings
- * based on the original. There are no known use cases for this
- * behavior. As a result, fail such attempts.
- */
- if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
- pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n",
- current->comm, current->pid);
- return -EINVAL;
- }
-
- if ((vrm->flags & MREMAP_DONTUNMAP) &&
- (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
- return -EINVAL;
-
- /* We can't remap across vm area boundaries */
- if (old_len > vma->vm_end - addr)
- return -EFAULT;
-
- if (new_len == old_len)
- return 0;
-
- /* Need to be careful about a growing mapping */
- pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
- pgoff += vma->vm_pgoff;
- if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
- return -EINVAL;
-
- if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
- return -EFAULT;
-
- if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta))
- return -EAGAIN;
-
- if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT))
- return -ENOMEM;
-
- return 0;
-}
-
-/*
* The user has requested that the VMA be shrunk (i.e., old_len > new_len), so
* execute this, optionally dropping the mmap lock when we do so.
*
@@ -1386,14 +1359,6 @@ static unsigned long mremap_to(struct vma_remap_struct *vrm)
struct mm_struct *mm = current->mm;
unsigned long err;
- /* Is the new length or address silly? */
- if (vrm->new_len > TASK_SIZE ||
- vrm->new_addr > TASK_SIZE - vrm->new_len)
- return -EINVAL;
-
- if (vrm_overlaps(vrm))
- return -EINVAL;
-
if (vrm->flags & MREMAP_FIXED) {
/*
* In mremap_to().
@@ -1403,6 +1368,7 @@ static unsigned long mremap_to(struct vma_remap_struct *vrm)
err = do_munmap(mm, vrm->new_addr, vrm->new_len,
vrm->uf_unmap_early);
vrm->vma = NULL; /* Invalidated. */
+ vrm->vmi_needs_invalidate = true;
if (err)
return err;
@@ -1424,10 +1390,6 @@ static unsigned long mremap_to(struct vma_remap_struct *vrm)
vrm->old_len = vrm->new_len;
}
- err = resize_is_valid(vrm);
- if (err)
- return err;
-
/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
if (vrm->flags & MREMAP_DONTUNMAP) {
vm_flags_t vm_flags = vrm->vma->vm_flags;
@@ -1476,68 +1438,6 @@ static bool vrm_can_expand_in_place(struct vma_remap_struct *vrm)
}
/*
- * Are the parameters passed to mremap() valid? If so return 0, otherwise return
- * error.
- */
-static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
-
-{
- unsigned long addr = vrm->addr;
- unsigned long flags = vrm->flags;
-
- /* Ensure no unexpected flag values. */
- if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
- return -EINVAL;
-
- /* Start address must be page-aligned. */
- if (offset_in_page(addr))
- return -EINVAL;
-
- /*
- * We allow a zero old-len as a special case
- * for DOS-emu "duplicate shm area" thing. But
- * a zero new-len is nonsensical.
- */
- if (!PAGE_ALIGN(vrm->new_len))
- return -EINVAL;
-
- /* Remainder of checks are for cases with specific new_addr. */
- if (!vrm_implies_new_addr(vrm))
- return 0;
-
- /* The new address must be page-aligned. */
- if (offset_in_page(vrm->new_addr))
- return -EINVAL;
-
- /* A fixed address implies a move. */
- if (!(flags & MREMAP_MAYMOVE))
- return -EINVAL;
-
- /* MREMAP_DONTUNMAP does not allow resizing in the process. */
- if (flags & MREMAP_DONTUNMAP && vrm->old_len != vrm->new_len)
- return -EINVAL;
-
- /*
- * move_vma() need us to stay 4 maps below the threshold, otherwise
- * it will bail out at the very beginning.
- * That is a problem if we have already unmaped the regions here
- * (new_addr, and old_addr), because userspace will not know the
- * state of the vma's after it gets -ENOMEM.
- * So, to avoid such scenario we can pre-compute if the whole
- * operation has high chances to success map-wise.
- * Worst-scenario case is when both vma's (new_addr and old_addr) get
- * split in 3 before unmapping it.
- * That means 2 more maps (1 for each) to the ones we already hold.
- * Check whether current map count plus 2 still leads us to 4 maps below
- * the threshold, otherwise return -ENOMEM here to be more safe.
- */
- if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3)
- return -ENOMEM;
-
- return 0;
-}
-
-/*
* We know we can expand the VMA in-place by delta pages, so do so.
*
* If we discover the VMA is locked, update mm_struct statistics accordingly and
@@ -1549,7 +1449,7 @@ static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm)
struct vm_area_struct *vma = vrm->vma;
VMA_ITERATOR(vmi, mm, vma->vm_end);
- if (!vrm_charge(vrm))
+ if (!vrm_calc_charge(vrm))
return -ENOMEM;
/*
@@ -1561,11 +1461,12 @@ static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm)
* adjacent to the expanded vma and otherwise
* compatible.
*/
- vma = vrm->vma = vma_merge_extend(&vmi, vma, vrm->delta);
+ vma = vma_merge_extend(&vmi, vma, vrm->delta);
if (!vma) {
vrm_uncharge(vrm);
return -ENOMEM;
}
+ vrm->vma = vma;
vrm_stat_account(vrm, vrm->delta);
@@ -1592,8 +1493,6 @@ static bool align_hugetlb(struct vma_remap_struct *vrm)
if (vrm->new_len > vrm->old_len)
return false;
- vrm_set_delta(vrm);
-
return true;
}
@@ -1607,11 +1506,6 @@ static bool align_hugetlb(struct vma_remap_struct *vrm)
static unsigned long expand_vma(struct vma_remap_struct *vrm)
{
unsigned long err;
- unsigned long addr = vrm->addr;
-
- err = resize_is_valid(vrm);
- if (err)
- return err;
/*
* [addr, old_len) spans precisely to the end of the VMA, so try to
@@ -1622,16 +1516,8 @@ static unsigned long expand_vma(struct vma_remap_struct *vrm)
if (err)
return err;
- /*
- * We want to populate the newly expanded portion of the VMA to
- * satisfy the expectation that mlock()'ing a VMA maintains all
- * of its pages in memory.
- */
- if (vrm->mlocked)
- vrm->new_addr = addr;
-
/* OK we're done! */
- return addr;
+ return vrm->addr;
}
/*
@@ -1682,64 +1568,371 @@ static unsigned long mremap_at(struct vma_remap_struct *vrm)
return expand_vma(vrm);
}
- BUG();
+ /* Should not be possible. */
+ WARN_ON_ONCE(1);
+ return -EINVAL;
}
-static unsigned long do_mremap(struct vma_remap_struct *vrm)
+/*
+ * Will this operation result in the VMA being expanded or moved and thus need
+ * to map a new portion of virtual address space?
+ */
+static bool vrm_will_map_new(struct vma_remap_struct *vrm)
+{
+ if (vrm->remap_type == MREMAP_EXPAND)
+ return true;
+
+ if (vrm_implies_new_addr(vrm))
+ return true;
+
+ return false;
+}
+
+/* Does this remap ONLY move mappings? */
+static bool vrm_move_only(struct vma_remap_struct *vrm)
+{
+ if (!(vrm->flags & MREMAP_FIXED))
+ return false;
+
+ if (vrm->old_len != vrm->new_len)
+ return false;
+
+ return true;
+}
+
+static void notify_uffd(struct vma_remap_struct *vrm, bool failed)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long ret;
- ret = check_mremap_params(vrm);
- if (ret)
- return ret;
+ /* Regardless of success/failure, we always notify of any unmaps. */
+ userfaultfd_unmap_complete(mm, vrm->uf_unmap_early);
+ if (failed)
+ mremap_userfaultfd_fail(vrm->uf);
+ else
+ mremap_userfaultfd_complete(vrm->uf, vrm->addr,
+ vrm->new_addr, vrm->old_len);
+ userfaultfd_unmap_complete(mm, vrm->uf_unmap);
+}
- vrm->old_len = PAGE_ALIGN(vrm->old_len);
- vrm->new_len = PAGE_ALIGN(vrm->new_len);
- vrm_set_delta(vrm);
+static bool vma_multi_allowed(struct vm_area_struct *vma)
+{
+ struct file *file;
- if (mmap_write_lock_killable(mm))
- return -EINTR;
- vrm->mmap_locked = true;
+ /*
+ * We can't support moving multiple uffd VMAs as notify requires
+ * mmap lock to be dropped.
+ */
+ if (userfaultfd_armed(vma))
+ return false;
- vma = vrm->vma = vma_lookup(mm, vrm->addr);
- if (!vma) {
- ret = -EFAULT;
- goto out;
+ /*
+	 * A custom get_unmapped_area handler might result in MREMAP_FIXED not
+ * being obeyed.
+ */
+ file = vma->vm_file;
+ if (file && !vma_is_shmem(vma) && !is_vm_hugetlb_page(vma)) {
+ const struct file_operations *fop = file->f_op;
+
+ if (fop->get_unmapped_area)
+ return false;
}
+ return true;
+}
+
+static int check_prep_vma(struct vma_remap_struct *vrm)
+{
+ struct vm_area_struct *vma = vrm->vma;
+ struct mm_struct *mm = current->mm;
+ unsigned long addr = vrm->addr;
+ unsigned long old_len, new_len, pgoff;
+
+ if (!vma)
+ return -EFAULT;
+
/* If mseal()'d, mremap() is prohibited. */
- if (!can_modify_vma(vma)) {
- ret = -EPERM;
- goto out;
- }
+ if (!can_modify_vma(vma))
+ return -EPERM;
/* Align to hugetlb page size, if required. */
- if (is_vm_hugetlb_page(vma) && !align_hugetlb(vrm)) {
- ret = -EINVAL;
- goto out;
- }
+ if (is_vm_hugetlb_page(vma) && !align_hugetlb(vrm))
+ return -EINVAL;
+ vrm_set_delta(vrm);
vrm->remap_type = vrm_remap_type(vrm);
+ /* For convenience, we set new_addr even if VMA won't move. */
+ if (!vrm_implies_new_addr(vrm))
+ vrm->new_addr = addr;
- /* Actually execute mremap. */
- ret = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm);
+ /* Below only meaningful if we expand or move a VMA. */
+ if (!vrm_will_map_new(vrm))
+ return 0;
-out:
- if (vrm->mmap_locked) {
- mmap_write_unlock(mm);
- vrm->mmap_locked = false;
+ old_len = vrm->old_len;
+ new_len = vrm->new_len;
- if (!offset_in_page(ret) && vrm->mlocked && vrm->new_len > vrm->old_len)
- mm_populate(vrm->new_addr + vrm->old_len, vrm->delta);
+ /*
+ * !old_len is a special case where an attempt is made to 'duplicate'
+ * a mapping. This makes no sense for private mappings as it will
+ * instead create a fresh/new mapping unrelated to the original. This
+ * is contrary to the basic idea of mremap which creates new mappings
+ * based on the original. There are no known use cases for this
+ * behavior. As a result, fail such attempts.
+ */
+ if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
+ pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n",
+ current->comm, current->pid);
+ return -EINVAL;
}
- userfaultfd_unmap_complete(mm, vrm->uf_unmap_early);
- mremap_userfaultfd_complete(vrm->uf, vrm->addr, ret, vrm->old_len);
- userfaultfd_unmap_complete(mm, vrm->uf_unmap);
+ if ((vrm->flags & MREMAP_DONTUNMAP) &&
+ (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
+ return -EINVAL;
+
+ /*
+ * We permit crossing of boundaries for the range being unmapped due to
+ * a shrink.
+ */
+ if (vrm->remap_type == MREMAP_SHRINK)
+ old_len = new_len;
+
+ /*
+ * We can't remap across the end of VMAs, as another VMA may be
+ * adjacent:
+ *
+ * addr vma->vm_end
+ * |-----.----------|
+ * | . |
+ * |-----.----------|
+ * .<--------->xxx>
+ * old_len
+ *
+ * We also require that vma->vm_start <= addr < vma->vm_end.
+ */
+ if (old_len > vma->vm_end - addr)
+ return -EFAULT;
+
+ if (new_len == old_len)
+ return 0;
+
+ /* We are expanding and the VMA is mlock()'d so we need to populate. */
+ if (vma->vm_flags & VM_LOCKED)
+ vrm->populate_expand = true;
+
+ /* Need to be careful about a growing mapping */
+ pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+ pgoff += vma->vm_pgoff;
+ if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
+ return -EINVAL;
+
+ if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
+ return -EFAULT;
+
+ if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta))
+ return -EAGAIN;
+
+ if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT))
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
+ * Are the parameters passed to mremap() valid? If so return 0, otherwise return
+ * error.
+ */
+static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
+{
+ unsigned long addr = vrm->addr;
+ unsigned long flags = vrm->flags;
+
+ /* Ensure no unexpected flag values. */
+ if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
+ return -EINVAL;
+
+ /* Start address must be page-aligned. */
+ if (offset_in_page(addr))
+ return -EINVAL;
+
+ /*
+ * We allow a zero old-len as a special case
+ * for DOS-emu "duplicate shm area" thing. But
+ * a zero new-len is nonsensical.
+ */
+ if (!vrm->new_len)
+ return -EINVAL;
+
+ /* Is the new length or address silly? */
+ if (vrm->new_len > TASK_SIZE ||
+ vrm->new_addr > TASK_SIZE - vrm->new_len)
+ return -EINVAL;
+
+ /* Remainder of checks are for cases with specific new_addr. */
+ if (!vrm_implies_new_addr(vrm))
+ return 0;
+
+ /* The new address must be page-aligned. */
+ if (offset_in_page(vrm->new_addr))
+ return -EINVAL;
+
+ /* A fixed address implies a move. */
+ if (!(flags & MREMAP_MAYMOVE))
+ return -EINVAL;
+
+ /* MREMAP_DONTUNMAP does not allow resizing in the process. */
+ if (flags & MREMAP_DONTUNMAP && vrm->old_len != vrm->new_len)
+ return -EINVAL;
+
+ /* Target VMA must not overlap source VMA. */
+ if (vrm_overlaps(vrm))
+ return -EINVAL;
+
+ /*
+	 * move_vma() needs us to stay 4 maps below the threshold, otherwise
+	 * it will bail out at the very beginning.
+	 * That is a problem if we have already unmapped the regions here
+	 * (new_addr, and old_addr), because userspace will not know the
+	 * state of the vmas after it gets -ENOMEM.
+	 * So, to avoid such a scenario we can pre-compute whether the whole
+	 * operation has a high chance of succeeding map-wise.
+	 * The worst case is when both vmas (new_addr and old_addr) get
+	 * split in 3 before being unmapped.
+ * That means 2 more maps (1 for each) to the ones we already hold.
+ * Check whether current map count plus 2 still leads us to 4 maps below
+ * the threshold, otherwise return -ENOMEM here to be more safe.
+ */
+ if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static unsigned long remap_move(struct vma_remap_struct *vrm)
+{
+ struct vm_area_struct *vma;
+ unsigned long start = vrm->addr;
+ unsigned long end = vrm->addr + vrm->old_len;
+ unsigned long new_addr = vrm->new_addr;
+ bool allowed = true, seen_vma = false;
+ unsigned long target_addr = new_addr;
+ unsigned long res = -EFAULT;
+ unsigned long last_end;
+ VMA_ITERATOR(vmi, current->mm, start);
+
+ /*
+ * When moving VMAs we allow for batched moves across multiple VMAs,
+ * with all VMAs in the input range [addr, addr + old_len) being moved
+ * (and split as necessary).
+ */
+ for_each_vma_range(vmi, vma, end) {
+ /* Account for start, end not aligned with VMA start, end. */
+ unsigned long addr = max(vma->vm_start, start);
+ unsigned long len = min(end, vma->vm_end) - addr;
+ unsigned long offset, res_vma;
- return ret;
+ if (!allowed)
+ return -EFAULT;
+
+ /* No gap permitted at the start of the range. */
+ if (!seen_vma && start < vma->vm_start)
+ return -EFAULT;
+
+ /*
+ * To sensibly move multiple VMAs, accounting for the fact that
+ * get_unmapped_area() may align even MAP_FIXED moves, we simply
+ * attempt to move such that the gaps between source VMAs remain
+ * consistent in destination VMAs, e.g.:
+ *
+ * X Y X Y
+ * <---> <-> <---> <->
+ * |-------| |-----| |-----| |-------| |-----| |-----|
+ * | A | | B | | C | ---> | A' | | B' | | C' |
+ * |-------| |-----| |-----| |-------| |-----| |-----|
+ * new_addr
+ *
+ * So we map B' at A'->vm_end + X, and C' at B'->vm_end + Y.
+ */
+ offset = seen_vma ? vma->vm_start - last_end : 0;
+ last_end = vma->vm_end;
+
+ vrm->vma = vma;
+ vrm->addr = addr;
+ vrm->new_addr = target_addr + offset;
+ vrm->old_len = vrm->new_len = len;
+
+ allowed = vma_multi_allowed(vma);
+ if (seen_vma && !allowed)
+ return -EFAULT;
+
+ res_vma = check_prep_vma(vrm);
+ if (!res_vma)
+ res_vma = mremap_to(vrm);
+ if (IS_ERR_VALUE(res_vma))
+ return res_vma;
+
+ if (!seen_vma) {
+ VM_WARN_ON_ONCE(allowed && res_vma != new_addr);
+ res = res_vma;
+ }
+
+ /* mmap lock is only dropped on shrink. */
+ VM_WARN_ON_ONCE(!vrm->mmap_locked);
+ /* This is a move, no expand should occur. */
+ VM_WARN_ON_ONCE(vrm->populate_expand);
+
+ if (vrm->vmi_needs_invalidate) {
+ vma_iter_invalidate(&vmi);
+ vrm->vmi_needs_invalidate = false;
+ }
+ seen_vma = true;
+ target_addr = res_vma + vrm->new_len;
+ }
+
+ return res;
+}
+
+static unsigned long do_mremap(struct vma_remap_struct *vrm)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long res;
+ bool failed;
+
+ vrm->old_len = PAGE_ALIGN(vrm->old_len);
+ vrm->new_len = PAGE_ALIGN(vrm->new_len);
+
+ res = check_mremap_params(vrm);
+ if (res)
+ return res;
+
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+ vrm->mmap_locked = true;
+
+ if (vrm_move_only(vrm)) {
+ res = remap_move(vrm);
+ } else {
+ vrm->vma = vma_lookup(current->mm, vrm->addr);
+ res = check_prep_vma(vrm);
+ if (res)
+ goto out;
+
+ /* Actually execute mremap. */
+ res = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm);
+ }
+
+out:
+ failed = IS_ERR_VALUE(res);
+
+ if (vrm->mmap_locked)
+ mmap_write_unlock(mm);
+
+ /* VMA mlock'd + was expanded, so populated expanded region. */
+ if (!failed && vrm->populate_expand)
+ mm_populate(vrm->new_addr + vrm->old_len, vrm->delta);
+
+ notify_uffd(vrm, failed);
+ return res;
}
/*
diff --git a/mm/nommu.c b/mm/nommu.c
index 617e7ba8022f..736d0e0f0618 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -126,7 +126,7 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
- pgprot_t prot, unsigned long vm_flags, int node,
+ pgprot_t prot, vm_flags_t vm_flags, int node,
const void *caller)
{
return __vmalloc_noprof(size, gfp_mask);
@@ -200,7 +200,23 @@ void *vmalloc_noprof(unsigned long size)
}
EXPORT_SYMBOL(vmalloc_noprof);
-void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof);
+/*
+ * vmalloc_huge_node - allocate virtually contiguous memory, on a node
+ *
+ * @size: allocation size
+ * @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * Due to NOMMU implications, the node argument and the HUGE page attribute
+ * are ignored.
+ */
+void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
+{
+ return __vmalloc_noprof(size, gfp_mask);
+}
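
A minimal caller sketch, assuming the usual vmalloc_huge_node() wrapper matching the _noprof signature above; the surrounding function is hypothetical:

static int example_alloc_buffer(void **out)
{
	/* On NOMMU the node hint and the huge-page attempt are both ignored. */
	void *buf = vmalloc_huge_node(SZ_1M, GFP_KERNEL, NUMA_NO_NODE);

	if (!buf)
		return -ENOMEM;
	*out = buf;		/* caller releases it with vfree() */
	return 0;
}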
/*
* vzalloc - allocate virtually contiguous memory with zero fill
@@ -399,7 +415,8 @@ static const struct ctl_table nommu_table[] = {
};
/*
- * initialise the percpu counter for VM and region record slabs
+ * initialise the percpu counter for VM and region record slabs, initialise VMA
+ * state.
*/
void __init mmap_init(void)
{
@@ -409,6 +426,7 @@ void __init mmap_init(void)
VM_BUG_ON(ret);
vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
register_sysctl_init("vm", nommu_table);
+ vma_state_init();
}
/*
@@ -627,22 +645,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
EXPORT_SYMBOL(find_vma);
/*
- * At least xtensa ends up having protection faults even with no
- * MMU.. No stack expansion, at least.
- */
-struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
- unsigned long addr, struct pt_regs *regs)
-{
- struct vm_area_struct *vma;
-
- mmap_read_lock(mm);
- vma = vma_lookup(mm, addr);
- if (!vma)
- mmap_read_unlock(mm);
- return vma;
-}
-
-/*
* expand a stack to a given address
* - not supported under NOMMU conditions
*/
@@ -717,7 +719,7 @@ static int validate_mmap_request(struct file *file,
if (file) {
/* files must support mmap */
- if (!file->f_op->mmap)
+ if (!can_mmap_file(file))
return -ENODEV;
/* work out if what we've got could possibly be shared
@@ -842,12 +844,12 @@ static int validate_mmap_request(struct file *file,
* we've determined that we can make the mapping, now translate what we
* now know into VMA flags
*/
-static unsigned long determine_vm_flags(struct file *file,
- unsigned long prot,
- unsigned long flags,
- unsigned long capabilities)
+static vm_flags_t determine_vm_flags(struct file *file,
+ unsigned long prot,
+ unsigned long flags,
+ unsigned long capabilities)
{
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(file, flags);
@@ -1890,3 +1892,11 @@ static int __meminit init_admin_reserve(void)
return 0;
}
subsys_initcall(init_admin_reserve);
+
+int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ mmap_write_lock(oldmm);
+ dup_mm_exe_file(mm, oldmm);
+ mmap_write_unlock(oldmm);
+ return 0;
+}
diff --git a/mm/numa.c b/mm/numa.c
index f1787d7713a6..7d5e06fe5bd4 100644
--- a/mm/numa.c
+++ b/mm/numa.c
@@ -13,7 +13,6 @@ void __init alloc_node_data(int nid)
{
const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
u64 nd_pa;
- void *nd;
int tnid;
/* Allocate node data. Try node-local memory and then any node. */
@@ -21,7 +20,6 @@ void __init alloc_node_data(int nid)
if (!nd_pa)
panic("Cannot allocate %zu bytes for node %d data\n",
nd_size, nid);
- nd = __va(nd_pa);
/* report and initialize */
pr_info("NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
@@ -30,7 +28,7 @@ void __init alloc_node_data(int nid)
if (tnid != nid)
pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid);
- node_data[nid] = nd;
+ node_data[nid] = __va(nd_pa);
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
}
diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c
index ff4054f4334d..541a99c4071a 100644
--- a/mm/numa_memblks.c
+++ b/mm/numa_memblks.c
@@ -201,6 +201,28 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
}
/**
+ * numa_add_reserved_memblk - Add one numa_memblk to numa_reserved_meminfo
+ * @nid: NUMA node ID of the new memblk
+ * @start: Start address of the new memblk
+ * @end: End address of the new memblk
+ *
+ * Add a new memblk to the numa_reserved_meminfo.
+ *
+ * Use case: numa_cleanup_meminfo() reconciles all numa_memblk instances
+ * against memblock_type information and moves any that intersect reserved
+ * ranges to numa_reserved_meminfo. However, when that information is known
+ * ahead of time, we use numa_add_reserved_memblk() to add the numa_memblk
+ * to numa_reserved_meminfo directly.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_add_reserved_memblk(int nid, u64 start, u64 end)
+{
+ return numa_add_memblk_to(nid, start, end, &numa_reserved_meminfo);
+}
+
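
A hedged sketch of the intended call site: an early __init arch path that already knows the node of a reserved range registers it directly instead of waiting for numa_cleanup_meminfo(). The wrapper function and message below are hypothetical:

static int __init register_fw_reserved_range(int nid, u64 start, u64 end)
{
	int ret = numa_add_reserved_memblk(nid, start, end);

	if (ret)
		pr_warn("failed to record reserved range [%llx-%llx) on node %d\n",
			start, end, nid);
	return ret;
}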
+/**
* numa_cleanup_meminfo - Cleanup a numa_meminfo
* @mi: numa_meminfo to clean up
*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 18456ddd463b..3e248d1c3969 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -41,6 +41,7 @@
#include <trace/events/writeback.h>
#include "internal.h"
+#include "swap.h"
/*
* Sleep at most 200ms at a time in balance_dirty_pages().
@@ -520,8 +521,8 @@ static int dirty_ratio_handler(const struct ctl_table *table, int write, void *b
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
- writeback_set_ratelimit();
vm_dirty_bytes = 0;
+ writeback_set_ratelimit();
}
return ret;
}
@@ -607,7 +608,7 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc);
*/
static void writeout_period(struct timer_list *t)
{
- struct wb_domain *dom = from_timer(dom, t, period_timer);
+ struct wb_domain *dom = timer_container_of(dom, t, period_timer);
int miss_periods = (jiffies - dom->period_time) /
VM_COMPLETIONS_PERIOD_LEN;
@@ -640,7 +641,7 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
#ifdef CONFIG_CGROUP_WRITEBACK
void wb_domain_exit(struct wb_domain *dom)
{
- del_timer_sync(&dom->period_timer);
+ timer_delete_sync(&dom->period_timer);
fprop_global_destroy(&dom->completions);
}
#endif
@@ -1100,9 +1101,7 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
* such filesystems balance_dirty_pages always checks wb counters
* against wb limits. Even if global "nr_dirty" is under "freerun".
* This is especially important for fuse which sets bdi->max_ratio to
- * 1% by default. Without strictlimit feature, fuse writeback may
- * consume arbitrary amount of RAM because it is accounted in
- * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
+ * 1% by default.
*
* Here, in wb_position_ratio(), we calculate pos_ratio based on
* two values: wb_dirty and wb_thresh. Let's consider an example:
@@ -2202,7 +2201,7 @@ static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int
void laptop_mode_timer_fn(struct timer_list *t)
{
struct backing_dev_info *backing_dev_info =
- from_timer(backing_dev_info, t, laptop_mode_wb_timer);
+ timer_container_of(backing_dev_info, t, laptop_mode_wb_timer);
wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER);
}
@@ -2229,7 +2228,7 @@ void laptop_sync_completion(void)
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
- del_timer(&bdi->laptop_mode_wb_timer);
+ timer_delete(&bdi->laptop_mode_wb_timer);
rcu_read_unlock();
}
@@ -2564,11 +2563,11 @@ struct folio *writeback_iter(struct address_space *mapping,
if (!folio) {
/*
* To avoid deadlocks between range_cyclic writeback and callers
- * that hold pages in PageWriteback to aggregate I/O until
+ * that hold folios in writeback to aggregate I/O until
* the writeback iteration finishes, we do not loop back to the
- * start of the file. Doing so causes a page lock/page
+ * start of the file. Doing so causes a folio lock/folio
* writeback access order inversion - we should only ever lock
- * multiple pages in ascending page->index order, and looping
+ * multiple folios in ascending folio->index order, and looping
* back to the start of the file violates that rule and causes
* deadlocks.
*/
@@ -2621,27 +2620,6 @@ int write_cache_pages(struct address_space *mapping,
}
EXPORT_SYMBOL(write_cache_pages);
-static int writeback_use_writepage(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct folio *folio = NULL;
- struct blk_plug plug;
- int err;
-
- blk_start_plug(&plug);
- while ((folio = writeback_iter(mapping, wbc, folio, &err))) {
- err = mapping->a_ops->writepage(&folio->page, wbc);
- if (err == AOP_WRITEPAGE_ACTIVATE) {
- folio_unlock(folio);
- err = 0;
- }
- mapping_set_error(mapping, err);
- }
- blk_finish_plug(&plug);
-
- return err;
-}
-
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
@@ -2652,14 +2630,11 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
wb = inode_to_wb_wbc(mapping->host, wbc);
wb_bandwidth_estimate_start(wb);
while (1) {
- if (mapping->a_ops->writepages) {
+ if (mapping->a_ops->writepages)
ret = mapping->a_ops->writepages(mapping, wbc);
- } else if (mapping->a_ops->writepage) {
- ret = writeback_use_writepage(mapping, wbc);
- } else {
+ else
/* deal with chardevs and other special files */
ret = 0;
- }
if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL)
break;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5b173c2da641..d1d037f97c5f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -290,7 +290,8 @@ EXPORT_SYMBOL(nr_online_nodes);
#endif
static bool page_contains_unaccepted(struct page *page, unsigned int order);
-static bool cond_accept_memory(struct zone *zone, unsigned int order);
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags);
static bool __free_unaccepted(struct page *page);
int page_group_by_mobility_disabled __read_mostly;
@@ -352,81 +353,225 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
+static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit)
+{
+ return pb_bit > PB_migrate_end && pb_bit < __NR_PAGEBLOCK_BITS;
+}
+
+static __always_inline void
+get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn,
+ unsigned long **bitmap_word, unsigned long *bitidx)
+{
+ unsigned long *bitmap;
+ unsigned long word_bitidx;
+
+#ifdef CONFIG_MEMORY_ISOLATION
+ BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 8);
+#else
+ BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+#endif
+ BUILD_BUG_ON(__MIGRATE_TYPE_END >= (1 << PB_migratetype_bits));
+ VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
+
+ bitmap = get_pageblock_bitmap(page, pfn);
+ *bitidx = pfn_to_bitidx(page, pfn);
+ word_bitidx = *bitidx / BITS_PER_LONG;
+ *bitidx &= (BITS_PER_LONG - 1);
+ *bitmap_word = &bitmap[word_bitidx];
+}
+
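
For reference, the index arithmetic consolidated in the helper above can be reproduced in a standalone userspace model; pageblock_order and NR_PAGEBLOCK_BITS are assumed values, and the section-relative adjustment that pfn_to_bitidx() applies on some configurations is ignored:

#include <stdio.h>

#define BITS_PER_LONG		(8 * sizeof(unsigned long))
#define PAGEBLOCK_ORDER		9	/* assumption: a typical value */
#define NR_PAGEBLOCK_BITS	4	/* 8 when CONFIG_MEMORY_ISOLATION=y */

int main(void)
{
	unsigned long pfn = 0x12345;
	/* One NR_PAGEBLOCK_BITS-wide field per pageblock. */
	unsigned long bitidx = (pfn >> PAGEBLOCK_ORDER) * NR_PAGEBLOCK_BITS;
	unsigned long word_bitidx = bitidx / BITS_PER_LONG;
	unsigned long bit_in_word = bitidx & (BITS_PER_LONG - 1);

	printf("pfn %#lx -> word %lu, bit offset %lu\n",
	       pfn, word_bitidx, bit_in_word);
	return 0;
}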
/**
- * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * __get_pfnblock_flags_mask - Return the requested group of flags for
+ * a pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @pfn: The target page frame number
* @mask: mask of bits that the caller is interested in
*
* Return: pageblock_bits flags
*/
-unsigned long get_pfnblock_flags_mask(const struct page *page,
- unsigned long pfn, unsigned long mask)
+static unsigned long __get_pfnblock_flags_mask(const struct page *page,
+ unsigned long pfn,
+ unsigned long mask)
{
- unsigned long *bitmap;
- unsigned long bitidx, word_bitidx;
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
unsigned long word;
- bitmap = get_pageblock_bitmap(page, pfn);
- bitidx = pfn_to_bitidx(page, pfn);
- word_bitidx = bitidx / BITS_PER_LONG;
- bitidx &= (BITS_PER_LONG-1);
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
/*
- * This races, without locks, with set_pfnblock_flags_mask(). Ensure
+ * This races, without locks, with set_pfnblock_migratetype(). Ensure
* a consistent read of the memory array, so that results, even though
* racy, are not corrupted.
*/
- word = READ_ONCE(bitmap[word_bitidx]);
+ word = READ_ONCE(*bitmap_word);
return (word >> bitidx) & mask;
}
-static __always_inline int get_pfnblock_migratetype(const struct page *page,
- unsigned long pfn)
+/**
+ * get_pfnblock_bit - Check if a standalone bit of a pageblock is set
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @pb_bit: pageblock bit to check
+ *
+ * Return: true if the bit is set, otherwise false
+ */
+bool get_pfnblock_bit(const struct page *page, unsigned long pfn,
+ enum pageblock_bits pb_bit)
{
- return get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
+
+ if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
+ return false;
+
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
+
+ return test_bit(bitidx + pb_bit, bitmap_word);
}
/**
- * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * get_pfnblock_migratetype - Return the migratetype of a pageblock
* @page: The page within the block of interest
- * @flags: The flags to set
* @pfn: The target page frame number
- * @mask: mask of bits that the caller is interested in
+ *
+ * Return: The migratetype of the pageblock
+ *
+ * Use get_pfnblock_migratetype() if caller already has both @page and @pfn
+ * to save a call to page_to_pfn().
*/
-void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
- unsigned long pfn,
- unsigned long mask)
+__always_inline enum migratetype
+get_pfnblock_migratetype(const struct page *page, unsigned long pfn)
{
- unsigned long *bitmap;
- unsigned long bitidx, word_bitidx;
- unsigned long word;
+ unsigned long mask = MIGRATETYPE_AND_ISO_MASK;
+ unsigned long flags;
- BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
- BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
+ flags = __get_pfnblock_flags_mask(page, pfn, mask);
- bitmap = get_pageblock_bitmap(page, pfn);
- bitidx = pfn_to_bitidx(page, pfn);
- word_bitidx = bitidx / BITS_PER_LONG;
- bitidx &= (BITS_PER_LONG-1);
+#ifdef CONFIG_MEMORY_ISOLATION
+ if (flags & BIT(PB_migrate_isolate))
+ return MIGRATE_ISOLATE;
+#endif
+ return flags & MIGRATETYPE_MASK;
+}
- VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
+/**
+ * __set_pfnblock_flags_mask - Set the requested group of flags for
+ * a pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @flags: The flags to set
+ * @mask: mask of bits that the caller is interested in
+ */
+static void __set_pfnblock_flags_mask(struct page *page, unsigned long pfn,
+ unsigned long flags, unsigned long mask)
+{
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
+ unsigned long word;
+
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
mask <<= bitidx;
flags <<= bitidx;
- word = READ_ONCE(bitmap[word_bitidx]);
+ word = READ_ONCE(*bitmap_word);
do {
- } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags));
+ } while (!try_cmpxchg(bitmap_word, &word, (word & ~mask) | flags));
}
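
The lockless masked update above (read the word once, then retry try_cmpxchg() until the selected bits are swapped in) follows a common pattern that can be sketched with C11 atomics in userspace; this models the pattern only, not the kernel primitive:

#include <stdatomic.h>
#include <stdio.h>

/* Replace the bits selected by @mask in *@word_p with @flags, atomically. */
static void set_bits_masked(_Atomic unsigned long *word_p,
			    unsigned long flags, unsigned long mask)
{
	unsigned long word = atomic_load_explicit(word_p, memory_order_relaxed);

	/* compare_exchange_weak refreshes 'word' on failure, so just retry. */
	while (!atomic_compare_exchange_weak(word_p, &word,
					     (word & ~mask) | flags))
		;
}

int main(void)
{
	_Atomic unsigned long bitmap = 0xf0f0;

	set_bits_masked(&bitmap, 0x5, 0xf);		/* low nibble := 5 */
	printf("%#lx\n", (unsigned long)bitmap);	/* prints 0xf0f5 */
	return 0;
}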
-void set_pageblock_migratetype(struct page *page, int migratetype)
+/**
+ * set_pfnblock_bit - Set a standalone bit of a pageblock
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @pb_bit: pageblock bit to set
+ */
+void set_pfnblock_bit(const struct page *page, unsigned long pfn,
+ enum pageblock_bits pb_bit)
+{
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
+
+ if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
+ return;
+
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
+
+ set_bit(bitidx + pb_bit, bitmap_word);
+}
+
+/**
+ * clear_pfnblock_bit - Clear a standalone bit of a pageblock
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @pb_bit: pageblock bit to clear
+ */
+void clear_pfnblock_bit(const struct page *page, unsigned long pfn,
+ enum pageblock_bits pb_bit)
+{
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
+
+ if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
+ return;
+
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
+
+ clear_bit(bitidx + pb_bit, bitmap_word);
+}
+
+/**
+ * set_pageblock_migratetype - Set the migratetype of a pageblock
+ * @page: The page within the block of interest
+ * @migratetype: migratetype to set
+ */
+static void set_pageblock_migratetype(struct page *page,
+ enum migratetype migratetype)
{
if (unlikely(page_group_by_mobility_disabled &&
migratetype < MIGRATE_PCPTYPES))
migratetype = MIGRATE_UNMOVABLE;
- set_pfnblock_flags_mask(page, (unsigned long)migratetype,
- page_to_pfn(page), MIGRATETYPE_MASK);
+#ifdef CONFIG_MEMORY_ISOLATION
+ if (migratetype == MIGRATE_ISOLATE) {
+ VM_WARN_ONCE(1,
+ "Use set_pageblock_isolate() for pageblock isolation");
+ return;
+ }
+ VM_WARN_ONCE(get_pfnblock_bit(page, page_to_pfn(page),
+ PB_migrate_isolate),
+ "Use clear_pageblock_isolate() to unisolate pageblock");
+ /* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */
+#endif
+ __set_pfnblock_flags_mask(page, page_to_pfn(page),
+ (unsigned long)migratetype,
+ MIGRATETYPE_AND_ISO_MASK);
+}
+
+void __meminit init_pageblock_migratetype(struct page *page,
+ enum migratetype migratetype,
+ bool isolate)
+{
+ unsigned long flags;
+
+ if (unlikely(page_group_by_mobility_disabled &&
+ migratetype < MIGRATE_PCPTYPES))
+ migratetype = MIGRATE_UNMOVABLE;
+
+ flags = migratetype;
+
+#ifdef CONFIG_MEMORY_ISOLATION
+ if (migratetype == MIGRATE_ISOLATE) {
+ VM_WARN_ONCE(1,
+ "Set isolate=true to isolate pageblock with a migratetype");
+ return;
+ }
+ if (isolate)
+ flags |= BIT(PB_migrate_isolate);
+#endif
+ __set_pfnblock_flags_mask(page, page_to_pfn(page), flags,
+ MIGRATETYPE_AND_ISO_MASK);
}
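
To make the split between the migratetype field and the standalone isolate bit concrete, here is a small userspace model; the bit widths and enum values are assumptions chosen to mirror the layout described above, not the kernel's definitions:

#include <stdio.h>

/* Assumed layout: 3 migratetype bits, then the standalone isolate bit. */
#define PB_MIGRATETYPE_BITS	3
#define MIGRATETYPE_MASK	((1UL << PB_MIGRATETYPE_BITS) - 1)
#define PB_MIGRATE_ISOLATE	PB_MIGRATETYPE_BITS

enum mt { MT_UNMOVABLE, MT_MOVABLE, MT_RECLAIMABLE, MT_ISOLATE = 99 };

static enum mt decode(unsigned long flags)
{
	/* Isolation is reported regardless of the underlying migratetype. */
	if (flags & (1UL << PB_MIGRATE_ISOLATE))
		return MT_ISOLATE;
	return flags & MIGRATETYPE_MASK;
}

int main(void)
{
	unsigned long flags = MT_MOVABLE;

	printf("%d\n", decode(flags));		/* 1: movable */
	flags |= 1UL << PB_MIGRATE_ISOLATE;	/* isolate the block */
	printf("%d\n", decode(flags));		/* 99: isolated */
	flags &= MIGRATETYPE_MASK;		/* clear the isolate bit again */
	printf("%d\n", decode(flags));		/* 1: movable again */
	return 0;
}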
#ifdef CONFIG_DEBUG_VM
@@ -666,7 +811,7 @@ static inline void __add_to_free_list(struct page *page, struct zone *zone,
int nr_pages = 1 << order;
VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
- "page type is %lu, passed migratetype is %d (nr=%d)\n",
+ "page type is %d, passed migratetype is %d (nr=%d)\n",
get_pageblock_migratetype(page), migratetype, nr_pages);
if (tail)
@@ -692,7 +837,7 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
/* Free page moving can fail, so it happens before the type update */
VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt,
- "page type is %lu, passed migratetype is %d (nr=%d)\n",
+ "page type is %d, passed migratetype is %d (nr=%d)\n",
get_pageblock_migratetype(page), old_mt, nr_pages);
list_move_tail(&page->buddy_list, &area->free_list[new_mt]);
@@ -714,7 +859,7 @@ static inline void __del_page_from_free_list(struct page *page, struct zone *zon
int nr_pages = 1 << order;
VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
- "page type is %lu, passed migratetype is %d (nr=%d)\n",
+ "page type is %d, passed migratetype is %d (nr=%d)\n",
get_pageblock_migratetype(page), migratetype, nr_pages);
/* clear reported state and update reported page count */
@@ -897,9 +1042,7 @@ static inline bool page_expected_state(struct page *page,
#ifdef CONFIG_MEMCG
page->memcg_data |
#endif
-#ifdef CONFIG_PAGE_POOL
- ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) |
-#endif
+ page_pool_page_is_pp(page) |
(page->flags & check_flags)))
return false;
@@ -926,26 +1069,18 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
if (unlikely(page->memcg_data))
bad_reason = "page still charged to cgroup";
#endif
-#ifdef CONFIG_PAGE_POOL
- if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE))
+ if (unlikely(page_pool_page_is_pp(page)))
bad_reason = "page_pool leak";
-#endif
return bad_reason;
}
-static void free_page_is_bad_report(struct page *page)
-{
- bad_page(page,
- page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
-}
-
static inline bool free_page_is_bad(struct page *page)
{
if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
return false;
/* Something has gone sideways, find it */
- free_page_is_bad_report(page);
+ bad_page(page, page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
return true;
}
@@ -1151,14 +1286,9 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
__pgalloc_tag_sub(page, nr);
}
-static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr)
+/* When tag is not NULL, assuming mem_alloc_profiling_enabled */
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
{
- struct alloc_tag *tag;
-
- if (!mem_alloc_profiling_enabled())
- return;
-
- tag = __pgalloc_tag_get(page);
if (tag)
this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
}
@@ -1168,7 +1298,7 @@ static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr)
static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
unsigned int nr) {}
static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
-static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {}
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
#endif /* CONFIG_MEM_ALLOC_PROFILING */
@@ -1245,11 +1375,14 @@ __always_inline bool free_pages_prepare(struct page *page,
(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
}
}
- if (PageMappingFlags(page)) {
- if (PageAnon(page))
- mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
- page->mapping = NULL;
+ if (folio_test_anon(folio)) {
+ mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
+ folio->mapping = NULL;
}
+ if (unlikely(page_has_type(page)))
+ /* Reset the page_type (which overlays _mapcount) */
+ page->page_type = UINT_MAX;
+
if (is_check_pages_enabled()) {
if (free_page_is_bad(page))
bad++;
@@ -1400,11 +1533,12 @@ static void free_one_page(struct zone *zone, struct page *page,
struct llist_head *llhead;
unsigned long flags;
- if (!spin_trylock_irqsave(&zone->lock, flags)) {
- if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ if (!spin_trylock_irqsave(&zone->lock, flags)) {
add_page_to_zone_llist(zone, page, order);
return;
}
+ } else {
spin_lock_irqsave(&zone->lock, flags);
}
@@ -1593,7 +1727,7 @@ static __always_inline void page_del_and_expand(struct zone *zone,
static void check_new_page_bad(struct page *page)
{
- if (unlikely(page->flags & __PG_HWPOISON)) {
+ if (unlikely(PageHWPoison(page))) {
/* Don't complain about hwpoisoned pages */
if (PageBuddy(page))
__ClearPageBuddy(page);
@@ -1794,8 +1928,8 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
#endif
/*
- * Change the type of a block and move all its free pages to that
- * type's freelist.
+ * Move all free pages of a block to new type's freelist. Caller needs to
+ * change the block type.
*/
static int __move_freepages_block(struct zone *zone, unsigned long start_pfn,
int old_mt, int new_mt)
@@ -1827,8 +1961,6 @@ static int __move_freepages_block(struct zone *zone, unsigned long start_pfn,
pages_moved += 1 << order;
}
- set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt);
-
return pages_moved;
}
@@ -1873,7 +2005,7 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page,
* migration are movable. But we don't actually try
* isolating, as that would be expensive.
*/
- if (PageLRU(page) || __PageMovable(page))
+ if (PageLRU(page) || page_has_movable_ops(page))
(*num_movable)++;
pfn++;
}
@@ -1886,11 +2018,16 @@ static int move_freepages_block(struct zone *zone, struct page *page,
int old_mt, int new_mt)
{
unsigned long start_pfn;
+ int res;
if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL))
return -1;
- return __move_freepages_block(zone, start_pfn, old_mt, new_mt);
+ res = __move_freepages_block(zone, start_pfn, old_mt, new_mt);
+ set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt);
+
+ return res;
}
#ifdef CONFIG_MEMORY_ISOLATION
@@ -1918,11 +2055,19 @@ static unsigned long find_large_buddy(unsigned long start_pfn)
return start_pfn;
}
+static inline void toggle_pageblock_isolate(struct page *page, bool isolate)
+{
+ if (isolate)
+ set_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate);
+ else
+ clear_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate);
+}
+
/**
- * move_freepages_block_isolate - move free pages in block for page isolation
+ * __move_freepages_block_isolate - move free pages in block for page isolation
* @zone: the zone
* @page: the pageblock page
- * @migratetype: migratetype to set on the pageblock
+ * @isolate: to isolate the given pageblock or unisolate it
*
* This is similar to move_freepages_block(), but handles the special
* case encountered in page isolation, where the block of interest
@@ -1937,10 +2082,18 @@ static unsigned long find_large_buddy(unsigned long start_pfn)
*
* Returns %true if pages could be moved, %false otherwise.
*/
-bool move_freepages_block_isolate(struct zone *zone, struct page *page,
- int migratetype)
+static bool __move_freepages_block_isolate(struct zone *zone,
+ struct page *page, bool isolate)
{
unsigned long start_pfn, pfn;
+ int from_mt;
+ int to_mt;
+
+ if (isolate == get_pageblock_isolate(page)) {
+ VM_WARN_ONCE(1, "%s a pageblock that is already in that state",
+ isolate ? "Isolate" : "Unisolate");
+ return false;
+ }
if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL))
return false;
@@ -1957,7 +2110,7 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page,
del_page_from_free_list(buddy, zone, order,
get_pfnblock_migratetype(buddy, pfn));
- set_pageblock_migratetype(page, migratetype);
+ toggle_pageblock_isolate(page, isolate);
split_large_buddy(zone, buddy, pfn, order, FPI_NONE);
return true;
}
@@ -1968,16 +2121,38 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page,
del_page_from_free_list(page, zone, order,
get_pfnblock_migratetype(page, pfn));
- set_pageblock_migratetype(page, migratetype);
+ toggle_pageblock_isolate(page, isolate);
split_large_buddy(zone, page, pfn, order, FPI_NONE);
return true;
}
move:
- __move_freepages_block(zone, start_pfn,
- get_pfnblock_migratetype(page, start_pfn),
- migratetype);
+ /* Use MIGRATETYPE_MASK to get non-isolate migratetype */
+ if (isolate) {
+ from_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page),
+ MIGRATETYPE_MASK);
+ to_mt = MIGRATE_ISOLATE;
+ } else {
+ from_mt = MIGRATE_ISOLATE;
+ to_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page),
+ MIGRATETYPE_MASK);
+ }
+
+ __move_freepages_block(zone, start_pfn, from_mt, to_mt);
+ toggle_pageblock_isolate(pfn_to_page(start_pfn), isolate);
+
return true;
}
+
+bool pageblock_isolate_and_move_free_pages(struct zone *zone, struct page *page)
+{
+ return __move_freepages_block_isolate(zone, page, true);
+}
+
+bool pageblock_unisolate_and_move_free_pages(struct zone *zone, struct page *page)
+{
+ return __move_freepages_block_isolate(zone, page, false);
+}
+
#endif /* CONFIG_MEMORY_ISOLATION */
static void change_pageblock_range(struct page *pageblock_page,
@@ -2077,31 +2252,25 @@ static bool should_try_claim_block(unsigned int order, int start_mt)
/*
* Check whether there is a suitable fallback freepage with requested order.
- * Sets *claim_block to instruct the caller whether it should convert a whole
- * pageblock to the returned migratetype.
- * If only_claim is true, this function returns fallback_mt only if
+ * If claimable is true, this function returns fallback_mt only if
* we would do this whole-block claiming. This would help to reduce
* fragmentation due to mixed migratetype pages in one pageblock.
*/
int find_suitable_fallback(struct free_area *area, unsigned int order,
- int migratetype, bool only_claim, bool *claim_block)
+ int migratetype, bool claimable)
{
int i;
- int fallback_mt;
+
+ if (claimable && !should_try_claim_block(order, migratetype))
+ return -2;
if (area->nr_free == 0)
return -1;
- *claim_block = false;
for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
- fallback_mt = fallbacks[migratetype][i];
- if (free_area_empty(area, fallback_mt))
- continue;
+ int fallback_mt = fallbacks[migratetype][i];
- if (should_try_claim_block(order, migratetype))
- *claim_block = true;
-
- if (*claim_block || !only_claim)
+ if (!free_area_empty(area, fallback_mt))
return fallback_mt;
}
@@ -2175,6 +2344,7 @@ try_to_claim_block(struct zone *zone, struct page *page,
if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled) {
__move_freepages_block(zone, start_pfn, block_type, start_type);
+ set_pageblock_migratetype(pfn_to_page(start_pfn), start_type);
return __rmqueue_smallest(zone, order, start_type);
}
@@ -2182,23 +2352,15 @@ try_to_claim_block(struct zone *zone, struct page *page,
}
/*
- * Try finding a free buddy page on the fallback list.
- *
- * This will attempt to claim a whole pageblock for the requested type
- * to ensure grouping of such requests in the future.
- *
- * If a whole block cannot be claimed, steal an individual page, regressing to
- * __rmqueue_smallest() logic to at least break up as little contiguity as
- * possible.
+ * Try to allocate from some fallback migratetype by claiming the entire block,
+ * i.e. converting it to the allocation's start migratetype.
*
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
- *
- * Return the stolen page, or NULL if none can be found.
*/
static __always_inline struct page *
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
+__rmqueue_claim(struct zone *zone, int order, int start_migratetype,
unsigned int alloc_flags)
{
struct free_area *area;
@@ -2206,7 +2368,6 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
int min_order = order;
struct page *page;
int fallback_mt;
- bool claim_block;
/*
* Do not steal pages from freelists belonging to other pageblocks
@@ -2225,53 +2386,73 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false, &claim_block);
+ start_migratetype, true);
+
+ /* No block in that order */
if (fallback_mt == -1)
continue;
- if (!claim_block)
+ /* Advanced into orders too low to claim, abort */
+ if (fallback_mt == -2)
break;
page = get_page_from_free_area(area, fallback_mt);
page = try_to_claim_block(zone, page, current_order, order,
start_migratetype, fallback_mt,
alloc_flags);
- if (page)
- goto got_one;
+ if (page) {
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
+ }
}
- if (alloc_flags & ALLOC_NOFRAGMENT)
- return NULL;
+ return NULL;
+}
+
+/*
+ * Try to steal a single page from some fallback migratetype. Leave the rest of
+ * the block as its current migratetype, potentially causing fragmentation.
+ */
+static __always_inline struct page *
+__rmqueue_steal(struct zone *zone, int order, int start_migratetype)
+{
+ struct free_area *area;
+ int current_order;
+ struct page *page;
+ int fallback_mt;
- /* No luck claiming pageblock. Find the smallest fallback page */
for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false, &claim_block);
+ start_migratetype, false);
if (fallback_mt == -1)
continue;
page = get_page_from_free_area(area, fallback_mt);
page_del_and_expand(zone, page, order, current_order, fallback_mt);
- goto got_one;
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
}
return NULL;
-
-got_one:
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, fallback_mt);
-
- return page;
}
+enum rmqueue_mode {
+ RMQUEUE_NORMAL,
+ RMQUEUE_CMA,
+ RMQUEUE_CLAIM,
+ RMQUEUE_STEAL,
+};
+
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
- unsigned int alloc_flags)
+ unsigned int alloc_flags, enum rmqueue_mode *mode)
{
struct page *page;
@@ -2290,16 +2471,48 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
}
}
- page = __rmqueue_smallest(zone, order, migratetype);
- if (unlikely(!page)) {
- if (alloc_flags & ALLOC_CMA)
+ /*
+ * First try the freelists of the requested migratetype, then try
+ * fallback modes with increasing levels of fragmentation risk.
+ *
+ * The fallback logic is expensive and rmqueue_bulk() calls this
+ * function in a loop with the zone->lock held, meaning the freelists
+ * are not subject to any outside changes. Remember in *mode where
+ * we found pay dirt, to save us the search on the next call.
+ */
+ switch (*mode) {
+ case RMQUEUE_NORMAL:
+ page = __rmqueue_smallest(zone, order, migratetype);
+ if (page)
+ return page;
+ fallthrough;
+ case RMQUEUE_CMA:
+ if (alloc_flags & ALLOC_CMA) {
page = __rmqueue_cma_fallback(zone, order);
-
- if (!page)
- page = __rmqueue_fallback(zone, order, migratetype,
- alloc_flags);
+ if (page) {
+ *mode = RMQUEUE_CMA;
+ return page;
+ }
+ }
+ fallthrough;
+ case RMQUEUE_CLAIM:
+ page = __rmqueue_claim(zone, order, migratetype, alloc_flags);
+ if (page) {
+ /* Replenished preferred freelist, back to normal mode. */
+ *mode = RMQUEUE_NORMAL;
+ return page;
+ }
+ fallthrough;
+ case RMQUEUE_STEAL:
+ if (!(alloc_flags & ALLOC_NOFRAGMENT)) {
+ page = __rmqueue_steal(zone, order, migratetype);
+ if (page) {
+ *mode = RMQUEUE_STEAL;
+ return page;
+ }
+ }
}
- return page;
+ return NULL;
}
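
The mode caching can be modelled outside the kernel as a cursor into an ordered list of sources that only moves forward within one bulk fill, resetting to the cheapest source after a whole-block claim; the stub availability counters below stand in for the real freelists and are purely illustrative:

#include <stdio.h>

enum mode { NORMAL, CMA, CLAIM, STEAL, NR_MODES };

/* Stub "freelists": how many pages each source can still hand out. */
static int avail[NR_MODES] = { 2, 0, 3, 1 };

/* Model of __rmqueue(): resume the search from the cached *mode. */
static int take_one(enum mode *mode)
{
	for (enum mode m = *mode; m < NR_MODES; m++) {
		if (avail[m] > 0) {
			avail[m]--;
			/*
			 * A successful claim replenishes the preferred
			 * freelist in the kernel, so drop back to NORMAL;
			 * otherwise remember where a page was found.
			 */
			*mode = (m == CLAIM) ? NORMAL : m;
			return 1;
		}
	}
	return 0;
}

int main(void)
{
	enum mode mode = NORMAL;	/* one cached mode per bulk fill */
	int got = 0;

	while (take_one(&mode))
		got++;
	printf("allocated %d pages, final mode %d\n", got, mode);
	return 0;
}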
/*
@@ -2311,17 +2524,19 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
unsigned long flags;
int i;
- if (!spin_trylock_irqsave(&zone->lock, flags)) {
- if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK)) {
+ if (!spin_trylock_irqsave(&zone->lock, flags))
return 0;
+ } else {
spin_lock_irqsave(&zone->lock, flags);
}
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
- alloc_flags);
+ alloc_flags, &rmqm);
if (unlikely(page == NULL))
break;
@@ -2631,10 +2846,10 @@ static void free_frozen_page_commit(struct zone *zone,
* stops will be drained from vmstat refresh context.
*/
if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
- free_high = (pcp->free_count >= batch &&
+ free_high = (pcp->free_count >= (batch + pcp->high_min / 2) &&
(pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
(!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
- pcp->count >= READ_ONCE(batch)));
+ pcp->count >= batch));
pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
} else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
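
As a worked example with illustrative values, say batch = 63 and pcp->high_min = 512: the free_high heuristic previously tripped once free_count reached 63, whereas it now requires 63 + 512/2 = 319 consecutive frees, so short bursts of high-order frees are less likely to bypass the per-CPU lists.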
@@ -2937,15 +3152,18 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
do {
page = NULL;
- if (!spin_trylock_irqsave(&zone->lock, flags)) {
- if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK)) {
+ if (!spin_trylock_irqsave(&zone->lock, flags))
return NULL;
+ } else {
spin_lock_irqsave(&zone->lock, flags);
}
if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
- page = __rmqueue(zone, order, migratetype, alloc_flags);
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
+
+ page = __rmqueue(zone, order, migratetype, alloc_flags, &rmqm);
/*
* If the allocation fails, allow OOM handling and
@@ -3094,7 +3312,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
/*
* Do not instrument rmqueue() with KMSAN. This function may call
- * __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
+ * __msan_poison_alloca() through a call to set_pfnblock_migratetype().
* If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
* may call rmqueue() again, which will result in a deadlock.
*/
@@ -3422,18 +3640,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
return false;
}
-bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
- unsigned long mark, int highest_zoneidx)
-{
- long free_pages = zone_page_state(z, NR_FREE_PAGES);
-
- if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
- free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
-
- return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
- free_pages);
-}
-
#ifdef CONFIG_NUMA
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
@@ -3522,7 +3728,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
retry:
/*
* Scan zonelist, looking for a zone with enough free.
- * See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
+ * See also cpuset_current_node_allowed() comment in kernel/cgroup/cpuset.c.
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
@@ -3580,7 +3786,7 @@ retry:
}
}
- cond_accept_memory(zone, order);
+ cond_accept_memory(zone, order, alloc_flags);
/*
* Detect whether the number of free pages is below high
@@ -3607,7 +3813,7 @@ check_alloc_wmark:
gfp_mask)) {
int ret;
- if (cond_accept_memory(zone, order))
+ if (cond_accept_memory(zone, order, alloc_flags))
goto try_this_zone;
/*
@@ -3660,7 +3866,7 @@ try_this_zone:
return page;
} else {
- if (cond_accept_memory(zone, order))
+ if (cond_accept_memory(zone, order, alloc_flags))
goto try_this_zone;
/* Try again if zone has deferred pages */
@@ -4209,7 +4415,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
/*
* Ignore cpuset mems for non-blocking __GFP_HIGH (probably
* GFP_ATOMIC) rather than fail, see the comment for
- * cpuset_node_allowed().
+ * cpuset_current_node_allowed().
*/
if (alloc_flags & ALLOC_MIN_RESERVE)
alloc_flags &= ~ALLOC_CPUSET;
@@ -4530,6 +4736,14 @@ restart:
}
retry:
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * infinite retries.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
+
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
@@ -4604,8 +4818,8 @@ retry:
goto retry;
/* Reclaim/compaction failed to prevent the fallback */
- if (defrag_mode) {
- alloc_flags &= ALLOC_NOFRAGMENT;
+ if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT)) {
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
@@ -4813,7 +5027,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
goto failed;
}
- cond_accept_memory(zone, 0);
+ cond_accept_memory(zone, 0, alloc_flags);
retry_this_zone:
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
if (zone_watermark_fast(zone, 0, mark,
@@ -4822,7 +5036,7 @@ retry_this_zone:
break;
}
- if (cond_accept_memory(zone, 0))
+ if (cond_accept_memory(zone, 0, alloc_flags))
goto retry_this_zone;
/* Try again if zone has deferred pages */
@@ -5003,11 +5217,28 @@ unsigned long get_zeroed_page_noprof(gfp_t gfp_mask)
}
EXPORT_SYMBOL(get_zeroed_page_noprof);
+static void ___free_pages(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
+{
+ /* get PageHead before we drop reference */
+ int head = PageHead(page);
+ /* get alloc tag in case the page is released by others */
+ struct alloc_tag *tag = pgalloc_tag_get(page);
+
+ if (put_page_testzero(page))
+ __free_frozen_pages(page, order, fpi_flags);
+ else if (!head) {
+ pgalloc_tag_sub_pages(tag, (1 << order) - 1);
+ while (order-- > 0)
+ __free_frozen_pages(page + (1 << order), order,
+ fpi_flags);
+ }
+}
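
The tail-freeing loop above releases the remainder of a non-compound allocation as progressively smaller power-of-two blocks once the first page's reference fails to drop to zero. A standalone model of the offsets involved (userspace sketch, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned int order = 3;		/* an order-3 (8 page) allocation */

	/*
	 * Page 0 keeps its elevated refcount; the other (1 << order) - 1
	 * pages are released as blocks of 4, 2 and 1 pages.
	 */
	while (order-- > 0)
		printf("free %u page(s) starting at page offset %u\n",
		       1u << order, 1u << order);
	return 0;
}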
+
/**
- * ___free_pages - Free pages allocated with alloc_pages().
+ * __free_pages - Free pages allocated with alloc_pages().
* @page: The page pointer returned from alloc_pages().
* @order: The order of the allocation.
- * @fpi_flags: Free Page Internal flags.
*
* This function can free multi-page allocations that are not compound
* pages. It does not check that the @order passed in matches that of
@@ -5024,21 +5255,6 @@ EXPORT_SYMBOL(get_zeroed_page_noprof);
* Context: May be called in interrupt context or while holding a normal
* spinlock, but not in NMI context or while holding a raw spinlock.
*/
-static void ___free_pages(struct page *page, unsigned int order,
- fpi_t fpi_flags)
-{
- /* get PageHead before we drop reference */
- int head = PageHead(page);
-
- if (put_page_testzero(page))
- __free_frozen_pages(page, order, fpi_flags);
- else if (!head) {
- pgalloc_tag_sub_pages(page, (1 << order) - 1);
- while (order-- > 0)
- __free_frozen_pages(page + (1 << order), order,
- fpi_flags);
- }
-}
void __free_pages(struct page *page, unsigned int order)
{
___free_pages(page, order, FPI_NONE);
@@ -5047,7 +5263,7 @@ EXPORT_SYMBOL(__free_pages);
/*
* Can be called while holding raw_spin_lock or from IRQ and NMI for any
- * page type (not only those that came from try_alloc_pages)
+ * page type (not only those that came from alloc_pages_nolock)
*/
void free_pages_nolock(struct page *page, unsigned int order)
{
@@ -6478,13 +6694,9 @@ static void alloc_contig_dump_pages(struct list_head *page_list)
}
}
-/*
- * [start, end) must belong to a single zone.
- * @migratetype: using migratetype to filter the type of migration in
- * trace_mm_alloc_contig_migrate_range_info.
- */
+/* [start, end) must belong to a single zone. */
static int __alloc_contig_migrate_range(struct compact_control *cc,
- unsigned long start, unsigned long end, int migratetype)
+ unsigned long start, unsigned long end)
{
/* This function is based on compact_zone() from compaction.c. */
unsigned int nr_reclaimed;
@@ -6496,10 +6708,6 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
.gfp_mask = cc->gfp_mask,
.reason = MR_CONTIG_RANGE,
};
- struct page *page;
- unsigned long total_mapped = 0;
- unsigned long total_migrated = 0;
- unsigned long total_reclaimed = 0;
lru_cache_disable();
@@ -6525,22 +6733,9 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
&cc->migratepages);
cc->nr_migratepages -= nr_reclaimed;
- if (trace_mm_alloc_contig_migrate_range_info_enabled()) {
- total_reclaimed += nr_reclaimed;
- list_for_each_entry(page, &cc->migratepages, lru) {
- struct folio *folio = page_folio(page);
-
- total_mapped += folio_mapped(folio) *
- folio_nr_pages(folio);
- }
- }
-
ret = migrate_pages(&cc->migratepages, alloc_migration_target,
NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
- if (trace_mm_alloc_contig_migrate_range_info_enabled() && !ret)
- total_migrated += cc->nr_migratepages;
-
/*
* On -ENOMEM, migrate_pages() bails out right away. It is pointless
* to retry again over this error, so do the same here.
@@ -6556,10 +6751,6 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
putback_movable_pages(&cc->migratepages);
}
- trace_mm_alloc_contig_migrate_range_info(start, end, migratetype,
- total_migrated,
- total_reclaimed,
- total_mapped);
return (ret < 0) ? ret : 0;
}
@@ -6627,10 +6818,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
* alloc_contig_range() -- tries to allocate given range of pages
* @start: start PFN to allocate
* @end: one-past-the-last PFN to allocate
- * @migratetype: migratetype of the underlying pageblocks (either
- * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
- * in range must have the same migratetype and it must
- * be either of the two.
+ * @alloc_flags: allocation information
* @gfp_mask: GFP mask. Node/zone/placement hints are ignored; only some
* action and reclaim modifiers are supported. Reclaim modifiers
* control allocation behavior during compaction/migration/reclaim.
@@ -6647,7 +6835,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
* need to be freed with free_contig_range().
*/
int alloc_contig_range_noprof(unsigned long start, unsigned long end,
- unsigned migratetype, gfp_t gfp_mask)
+ acr_flags_t alloc_flags, gfp_t gfp_mask)
{
unsigned long outer_start, outer_end;
int ret = 0;
@@ -6662,6 +6850,9 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
.alloc_contig = true,
};
INIT_LIST_HEAD(&cc.migratepages);
+ enum pb_isolate_mode mode = (alloc_flags & ACR_FLAGS_CMA) ?
+ PB_ISOLATE_MODE_CMA_ALLOC :
+ PB_ISOLATE_MODE_OTHER;
gfp_mask = current_gfp_context(gfp_mask);
if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask))
@@ -6688,7 +6879,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
* put back to page allocator so that buddy can use them.
*/
- ret = start_isolate_page_range(start, end, migratetype, 0);
+ ret = start_isolate_page_range(start, end, mode);
if (ret)
goto done;
@@ -6704,7 +6895,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
* allocated. So, if we fall through be sure to clear ret so that
* -EBUSY is not accidentally used or returned to caller.
*/
- ret = __alloc_contig_migrate_range(&cc, start, end, migratetype);
+ ret = __alloc_contig_migrate_range(&cc, start, end);
if (ret && ret != -EBUSY)
goto done;
@@ -6738,7 +6929,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
outer_start = find_large_buddy(start);
/* Make sure the range is really isolated. */
- if (test_pages_isolated(outer_start, end, 0)) {
+ if (test_pages_isolated(outer_start, end, mode)) {
ret = -EBUSY;
goto done;
}
@@ -6771,7 +6962,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
start, end, outer_start, outer_end);
}
done:
- undo_isolate_page_range(start, end, migratetype);
+ undo_isolate_page_range(start, end);
return ret;
}
EXPORT_SYMBOL(alloc_contig_range_noprof);
@@ -6781,8 +6972,8 @@ static int __alloc_contig_pages(unsigned long start_pfn,
{
unsigned long end_pfn = start_pfn + nr_pages;
- return alloc_contig_range_noprof(start_pfn, end_pfn, MIGRATE_MOVABLE,
- gfp_mask);
+ return alloc_contig_range_noprof(start_pfn, end_pfn, ACR_FLAGS_NONE,
+ gfp_mask);
}
static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
@@ -7138,9 +7329,6 @@ bool has_managed_dma(void)
#ifdef CONFIG_UNACCEPTED_MEMORY
-/* Counts number of zones with unaccepted pages. */
-static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);
-
static bool lazy_accept = true;
static int __init accept_memory_parse(char *p)
@@ -7167,11 +7355,7 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order)
static void __accept_page(struct zone *zone, unsigned long *flags,
struct page *page)
{
- bool last;
-
list_del(&page->lru);
- last = list_empty(&zone->unaccepted_pages);
-
account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
__ClearPageUnaccepted(page);
@@ -7180,9 +7364,6 @@ static void __accept_page(struct zone *zone, unsigned long *flags,
accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER);
__free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
-
- if (last)
- static_branch_dec(&zones_with_unaccepted_pages);
}
void accept_page(struct page *page)
@@ -7219,20 +7400,17 @@ static bool try_to_accept_memory_one(struct zone *zone)
return true;
}
-static inline bool has_unaccepted_memory(void)
-{
- return static_branch_unlikely(&zones_with_unaccepted_pages);
-}
-
-static bool cond_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags)
{
long to_accept, wmark;
bool ret = false;
- if (!has_unaccepted_memory())
+ if (list_empty(&zone->unaccepted_pages))
return false;
- if (list_empty(&zone->unaccepted_pages))
+ /* Bailout, since try_to_accept_memory_one() needs to take a lock */
+ if (alloc_flags & ALLOC_TRYLOCK)
return false;
wmark = promo_wmark_pages(zone);
@@ -7265,22 +7443,17 @@ static bool __free_unaccepted(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long flags;
- bool first = false;
if (!lazy_accept)
return false;
spin_lock_irqsave(&zone->lock, flags);
- first = list_empty(&zone->unaccepted_pages);
list_add_tail(&page->lru, &zone->unaccepted_pages);
account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
__SetPageUnaccepted(page);
spin_unlock_irqrestore(&zone->lock, flags);
- if (first)
- static_branch_inc(&zones_with_unaccepted_pages);
-
return true;
}
@@ -7291,7 +7464,8 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order)
return false;
}
-static bool cond_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags)
{
return false;
}
@@ -7305,20 +7479,21 @@ static bool __free_unaccepted(struct page *page)
#endif /* CONFIG_UNACCEPTED_MEMORY */
/**
- * try_alloc_pages - opportunistic reentrant allocation from any context
+ * alloc_pages_nolock - opportunistic reentrant allocation from any context
* @nid: node to allocate from
* @order: allocation order size
*
* Allocates pages of a given order from the given node. This is safe to
* call from any context (from atomic, NMI, and also reentrant
- * allocator -> tracepoint -> try_alloc_pages_noprof).
+ * allocator -> tracepoint -> alloc_pages_nolock_noprof).
 * Allocation is best effort and is expected to fail easily, so nobody should
 * rely on it succeeding. Failures are not reported via warn_alloc().
* See always fail conditions below.
*
- * Return: allocated page or NULL on failure.
+ * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN.
+ * It means ENOMEM. There is no reason to call it again and expect !NULL.
*/
-struct page *try_alloc_pages_noprof(int nid, unsigned int order)
+struct page *alloc_pages_nolock_noprof(int nid, unsigned int order)
{
/*
 * Do not specify __GFP_DIRECT_RECLAIM, since direct reclaim is not allowed.
@@ -7327,7 +7502,7 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order)
*
* These two are the conditions for gfpflags_allow_spinning() being true.
*
- * Specify __GFP_NOWARN since failing try_alloc_pages() is not a reason
+ * Specify __GFP_NOWARN since failing alloc_pages_nolock() is not a reason
* to warn. Also warn would trigger printk() which is unsafe from
* various contexts. We cannot use printk_deferred_enter() to mitigate,
* since the running context is unknown.
@@ -7337,7 +7512,7 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order)
* BPF use cases.
*
* Though __GFP_NOMEMALLOC is not checked in the code path below,
- * specify it here to highlight that try_alloc_pages()
+ * specify it here to highlight that alloc_pages_nolock()
* doesn't want to deplete reserves.
*/
gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC
@@ -7362,11 +7537,6 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order)
if (!pcp_allowed_order(order))
return NULL;
-#ifdef CONFIG_UNACCEPTED_MEMORY
- /* Bailout, since try_to_accept_memory_one() needs to take a lock */
- if (has_unaccepted_memory())
- return NULL;
-#endif
/* Bailout, since _deferred_grow_zone() needs to take a lock */
if (deferred_pages_enabled())
return NULL;
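
A hedged usage sketch, assuming an alloc_pages_nolock() wrapper that matches the _noprof signature documented above; the caller shown is hypothetical and makes no attempt to retry, since NULL means ENOMEM here:

/* Hypothetical caller running in an unknown (possibly NMI) context. */
static void *grab_scratch_page(void)
{
	struct page *page = alloc_pages_nolock(numa_node_id(), 0);

	if (!page)
		return NULL;	/* best effort only, do not retry */
	/* The allocation is zeroed (__GFP_ZERO is part of alloc_gfp above). */
	return page_address(page);
}

/* Later, from any context: free_pages_nolock(virt_to_page(addr), 0); */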
diff --git a/mm/page_ext.c b/mm/page_ext.c
index c351fdfe9e9a..d7396a8970e5 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -369,25 +369,15 @@ static void __invalidate_page_ext(unsigned long pfn)
}
static int __meminit online_page_ext(unsigned long start_pfn,
- unsigned long nr_pages,
- int nid)
+ unsigned long nr_pages)
{
+ int nid = pfn_to_nid(start_pfn);
unsigned long start, end, pfn;
int fail = 0;
start = SECTION_ALIGN_DOWN(start_pfn);
end = SECTION_ALIGN_UP(start_pfn + nr_pages);
- if (nid == NUMA_NO_NODE) {
- /*
- * In this case, "nid" already exists and contains valid memory.
- * "start_pfn" passed to us is a pfn which is an arg for
- * online__pages(), and start_pfn should exist.
- */
- nid = pfn_to_nid(start_pfn);
- VM_BUG_ON(!node_online(nid));
- }
-
for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
fail = init_section_page_ext(pfn, nid);
if (!fail)
@@ -435,8 +425,7 @@ static int __meminit page_ext_callback(struct notifier_block *self,
switch (action) {
case MEM_GOING_ONLINE:
- ret = online_page_ext(mn->start_pfn,
- mn->nr_pages, mn->status_change_nid);
+ ret = online_page_ext(mn->start_pfn, mn->nr_pages);
break;
case MEM_OFFLINE:
offline_page_ext(mn->start_pfn,
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 408aaf29a3ea..a82b340dc204 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -208,7 +208,7 @@ static const struct bin_attribute *const page_idle_bin_attrs[] = {
};
static const struct attribute_group page_idle_attr_group = {
- .bin_attrs_new = page_idle_bin_attrs,
+ .bin_attrs = page_idle_bin_attrs,
.name = "page_idle",
};
diff --git a/mm/page_io.c b/mm/page_io.c
index 4bce19df557b..a2056a5ecb13 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -237,15 +237,13 @@ static void swap_zeromap_folio_clear(struct folio *folio)
* We may have stale swap cache pages in memory: notice
* them here and get rid of the unnecessary final write.
*/
-int swap_writepage(struct page *page, struct writeback_control *wbc)
+int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
{
- struct folio *folio = page_folio(page);
- int ret;
+ int ret = 0;
+
+ if (folio_free_swap(folio))
+ goto out_unlock;
- if (folio_free_swap(folio)) {
- folio_unlock(folio);
- return 0;
- }
/*
* Arch code may have to preserve more data than just the page
* contents, e.g. memory tags.
@@ -253,8 +251,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
ret = arch_prepare_to_swap(folio);
if (ret) {
folio_mark_dirty(folio);
- folio_unlock(folio);
- return ret;
+ goto out_unlock;
}
/*
@@ -265,28 +262,30 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
*/
if (is_folio_zero_filled(folio)) {
swap_zeromap_folio_set(folio);
- folio_unlock(folio);
- return 0;
- } else {
- /*
- * Clear bits this folio occupies in the zeromap to prevent
- * zero data being read in from any previous zero writes that
- * occupied the same swap entries.
- */
- swap_zeromap_folio_clear(folio);
+ goto out_unlock;
}
+
+ /*
+ * Clear bits this folio occupies in the zeromap to prevent zero data
+ * being read in from any previous zero writes that occupied the same
+ * swap entries.
+ */
+ swap_zeromap_folio_clear(folio);
+
if (zswap_store(folio)) {
count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
- folio_unlock(folio);
- return 0;
+ goto out_unlock;
}
if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio))) {
folio_mark_dirty(folio);
return AOP_WRITEPAGE_ACTIVATE;
}
- __swap_writepage(folio, wbc);
+ __swap_writepage(folio, swap_plug);
return 0;
+out_unlock:
+ folio_unlock(folio);
+ return ret;
}
static inline void count_swpout_vm_event(struct folio *folio)
@@ -372,9 +371,9 @@ static void sio_write_complete(struct kiocb *iocb, long ret)
mempool_free(sio, sio_pool);
}
-static void swap_writepage_fs(struct folio *folio, struct writeback_control *wbc)
+static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug)
{
- struct swap_iocb *sio = NULL;
+ struct swap_iocb *sio = swap_plug ? *swap_plug : NULL;
struct swap_info_struct *sis = swp_swap_info(folio->swap);
struct file *swap_file = sis->swap_file;
loff_t pos = swap_dev_pos(folio->swap);
@@ -382,8 +381,6 @@ static void swap_writepage_fs(struct folio *folio, struct writeback_control *wbc
count_swpout_vm_event(folio);
folio_start_writeback(folio);
folio_unlock(folio);
- if (wbc->swap_plug)
- sio = *wbc->swap_plug;
if (sio) {
if (sio->iocb.ki_filp != swap_file ||
sio->iocb.ki_pos + sio->len != pos) {
@@ -402,22 +399,21 @@ static void swap_writepage_fs(struct folio *folio, struct writeback_control *wbc
bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0);
sio->len += folio_size(folio);
sio->pages += 1;
- if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) {
+ if (sio->pages == ARRAY_SIZE(sio->bvec) || !swap_plug) {
swap_write_unplug(sio);
sio = NULL;
}
- if (wbc->swap_plug)
- *wbc->swap_plug = sio;
+ if (swap_plug)
+ *swap_plug = sio;
}
static void swap_writepage_bdev_sync(struct folio *folio,
- struct writeback_control *wbc, struct swap_info_struct *sis)
+ struct swap_info_struct *sis)
{
struct bio_vec bv;
struct bio bio;
- bio_init(&bio, sis->bdev, &bv, 1,
- REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc));
+ bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP);
bio.bi_iter.bi_sector = swap_folio_sector(folio);
bio_add_folio_nofail(&bio, folio, folio_size(folio), 0);
@@ -432,13 +428,11 @@ static void swap_writepage_bdev_sync(struct folio *folio,
}
static void swap_writepage_bdev_async(struct folio *folio,
- struct writeback_control *wbc, struct swap_info_struct *sis)
+ struct swap_info_struct *sis)
{
struct bio *bio;
- bio = bio_alloc(sis->bdev, 1,
- REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
- GFP_NOIO);
+ bio = bio_alloc(sis->bdev, 1, REQ_OP_WRITE | REQ_SWAP, GFP_NOIO);
bio->bi_iter.bi_sector = swap_folio_sector(folio);
bio->bi_end_io = end_swap_bio_write;
bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
@@ -450,7 +444,7 @@ static void swap_writepage_bdev_async(struct folio *folio,
submit_bio(bio);
}
-void __swap_writepage(struct folio *folio, struct writeback_control *wbc)
+void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
{
struct swap_info_struct *sis = swp_swap_info(folio->swap);
@@ -461,16 +455,16 @@ void __swap_writepage(struct folio *folio, struct writeback_control *wbc)
* is safe.
*/
if (data_race(sis->flags & SWP_FS_OPS))
- swap_writepage_fs(folio, wbc);
+ swap_writepage_fs(folio, swap_plug);
/*
* ->flags can be updated non-atomicially (scan_swap_map_slots),
* but that will never affect SWP_SYNCHRONOUS_IO, so the data_race
* is safe.
*/
else if (data_race(sis->flags & SWP_SYNCHRONOUS_IO))
- swap_writepage_bdev_sync(folio, wbc, sis);
+ swap_writepage_bdev_sync(folio, sis);
else
- swap_writepage_bdev_async(folio, wbc, sis);
+ swap_writepage_bdev_async(folio, sis);
}
void swap_write_unplug(struct swap_iocb *sio)
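
A minimal caller sketch (not part of this patch; the helper name and folio list are invented) of the swap_writeout() interface that replaces swap_writepage() above: a local struct swap_iocb pointer serves as the plug that batches SWP_FS_OPS submissions and is flushed with swap_write_unplug(). It assumes each folio arrives locked, dirty, and already in the swap cache, and that mm/swap.h declares swap_writeout()/swap_write_unplug().

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include "swap.h"			/* assumed to declare swap_writeout() */

static void writeout_folio_list(struct list_head *folios)
{
	struct swap_iocb *plug = NULL;	/* batches SWP_FS_OPS submissions */
	struct folio *folio, *next;

	list_for_each_entry_safe(folio, next, folios, lru) {
		/*
		 * swap_writeout() unlocks the folio itself unless it returns
		 * AOP_WRITEPAGE_ACTIVATE, in which case it stays locked.
		 */
		if (swap_writeout(folio, &plug) == AOP_WRITEPAGE_ACTIVATE)
			folio_unlock(folio);
	}
	if (plug)
		swap_write_unplug(plug);	/* flush any queued sio */
}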
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index a051a29e95ad..f72b6cd38b95 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -21,9 +21,9 @@
* consequently belong to a single zone.
*
* PageLRU check without isolation or lru_lock could race so that
- * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
- * check without lock_page also may miss some movable non-lru pages at
- * race condition. So you can't expect this function should be exact.
+ * MIGRATE_MOVABLE block might include unmovable pages. Similarly, pages
+ * with movable_ops can only be identified some time after they were
+ * allocated. So you can't expect this function to be exact.
*
* Returns a page without holding a reference. If the caller wants to
* dereference that page (e.g., dumping), it has to make sure that it
@@ -31,7 +31,7 @@
*
*/
static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn,
- int migratetype, int flags)
+ enum pb_isolate_mode mode)
{
struct page *page = pfn_to_page(start_pfn);
struct zone *zone = page_zone(page);
@@ -46,7 +46,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
* isolate CMA pageblocks even when they are not movable in fact
* so consider them movable here.
*/
- if (is_migrate_cma(migratetype))
+ if (mode == PB_ISOLATE_MODE_CMA_ALLOC)
return NULL;
return page;
@@ -83,9 +83,16 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
unsigned int skip_pages;
if (PageHuge(page)) {
- if (!hugepage_migration_supported(folio_hstate(folio)))
+ struct hstate *h;
+
+ /*
+ * The huge page may already have been freed, so we
+ * cannot use folio_hstate() directly.
+ */
+ h = size_to_hstate(folio_size(folio));
+ if (h && !hugepage_migration_supported(h))
return page;
- } else if (!folio_test_lru(folio) && !__folio_test_movable(folio)) {
+ } else if (!folio_test_lru(folio)) {
return page;
}
@@ -110,7 +117,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
* The HWPoisoned page may not be in the buddy system, and
* page_count() is not 0.
*/
- if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
+ if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageHWPoison(page))
continue;
/*
@@ -123,10 +130,10 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
* move these pages that still have a reference count > 0.
* (false negatives in this function only)
*/
- if ((flags & MEMORY_OFFLINE) && PageOffline(page))
+ if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageOffline(page))
continue;
- if (__PageMovable(page) || PageLRU(page))
+ if (PageLRU(page) || page_has_movable_ops(page))
continue;
/*
@@ -144,7 +151,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
* present in [start_pfn, end_pfn). The pageblock must intersect with
* [start_pfn, end_pfn).
*/
-static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags,
+static int set_migratetype_isolate(struct page *page, enum pb_isolate_mode mode,
unsigned long start_pfn, unsigned long end_pfn)
{
struct zone *zone = page_zone(page);
@@ -179,9 +186,9 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
end_pfn);
unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
- migratetype, isol_flags);
+ mode);
if (!unmovable) {
- if (!move_freepages_block_isolate(zone, page, MIGRATE_ISOLATE)) {
+ if (!pageblock_isolate_and_move_free_pages(zone, page)) {
spin_unlock_irqrestore(&zone->lock, flags);
return -EBUSY;
}
@@ -191,7 +198,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
}
spin_unlock_irqrestore(&zone->lock, flags);
- if (isol_flags & REPORT_FAILURE) {
+ if (mode == PB_ISOLATE_MODE_MEM_OFFLINE) {
/*
* printk() with zone->lock held will likely trigger a
* lockdep splat, so defer it here.
@@ -202,7 +209,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
return -EBUSY;
}
-static void unset_migratetype_isolate(struct page *page, int migratetype)
+static void unset_migratetype_isolate(struct page *page)
{
struct zone *zone;
unsigned long flags;
@@ -255,10 +262,10 @@ static void unset_migratetype_isolate(struct page *page, int migratetype)
* Isolating this block already succeeded, so this
* should not fail on zone boundaries.
*/
- WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype));
+ WARN_ON_ONCE(!pageblock_unisolate_and_move_free_pages(zone, page));
} else {
- set_pageblock_migratetype(page, migratetype);
- __putback_isolated_page(page, order, migratetype);
+ clear_pageblock_isolate(page);
+ __putback_isolated_page(page, order, get_pageblock_migratetype(page));
}
zone->nr_isolate_pageblock--;
out:
@@ -285,11 +292,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* isolate_single_pageblock() -- tries to isolate a pageblock that might be
* within a free or in-use page.
* @boundary_pfn: pageblock-aligned pfn that a page might cross
- * @flags: isolation flags
+ * @mode: isolation mode
* @isolate_before: isolate the pageblock before the boundary_pfn
* @skip_isolation: the flag to skip the pageblock isolation in second
* isolate_single_pageblock()
- * @migratetype: migrate type to set in error recovery.
*
* Free and in-use pages can be as big as MAX_PAGE_ORDER and contain more than one
* pageblock. When not all pageblocks within a page are isolated at the same
@@ -304,8 +310,9 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* either. The function handles this by splitting the free page or migrating
* the in-use page then splitting the free page.
*/
-static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
- bool isolate_before, bool skip_isolation, int migratetype)
+static int isolate_single_pageblock(unsigned long boundary_pfn,
+ enum pb_isolate_mode mode, bool isolate_before,
+ bool skip_isolation)
{
unsigned long start_pfn;
unsigned long isolate_pageblock;
@@ -331,12 +338,11 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
zone->zone_start_pfn);
if (skip_isolation) {
- int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
-
- VM_BUG_ON(!is_migrate_isolate(mt));
+ VM_BUG_ON(!get_pageblock_isolate(pfn_to_page(isolate_pageblock)));
} else {
- ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype,
- flags, isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
+ ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock),
+ mode, isolate_pageblock,
+ isolate_pageblock + pageblock_nr_pages);
if (ret)
return ret;
@@ -376,7 +382,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
if (PageBuddy(page)) {
int order = buddy_order(page);
- /* move_freepages_block_isolate() handled this */
+ /* pageblock_isolate_and_move_free_pages() handled this */
VM_WARN_ON_ONCE(pfn + (1 << order) > boundary_pfn);
pfn += 1UL << order;
@@ -415,7 +421,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
* proper free and split handling for them.
*/
VM_WARN_ON_ONCE_PAGE(PageLRU(page), page);
- VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page);
+ VM_WARN_ON_ONCE_PAGE(page_has_movable_ops(page), page);
goto failed;
}
@@ -426,7 +432,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
failed:
/* restore the original migratetype */
if (!skip_isolation)
- unset_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype);
+ unset_migratetype_isolate(pfn_to_page(isolate_pageblock));
return -EBUSY;
}
@@ -434,14 +440,7 @@ failed:
* start_isolate_page_range() - mark page range MIGRATE_ISOLATE
* @start_pfn: The first PFN of the range to be isolated.
* @end_pfn: The last PFN of the range to be isolated.
- * @migratetype: Migrate type to set in error recovery.
- * @flags: The following flags are allowed (they can be combined in
- * a bit mask)
- * MEMORY_OFFLINE - isolate to offline (!allocate) memory
- * e.g., skip over PageHWPoison() pages
- * and PageOffline() pages.
- * REPORT_FAILURE - report details about the failure to
- * isolate the range
+ * @mode: isolation mode
*
* Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
* the range will never be allocated. Any free pages and pages freed in the
@@ -474,7 +473,7 @@ failed:
* Return: 0 on success and -EBUSY if any part of range cannot be isolated.
*/
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
- int migratetype, int flags)
+ enum pb_isolate_mode mode)
{
unsigned long pfn;
struct page *page;
@@ -485,8 +484,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
bool skip_isolation = false;
/* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
- ret = isolate_single_pageblock(isolate_start, flags, false,
- skip_isolation, migratetype);
+ ret = isolate_single_pageblock(isolate_start, mode, false,
+ skip_isolation);
if (ret)
return ret;
@@ -494,10 +493,9 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
skip_isolation = true;
/* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
- ret = isolate_single_pageblock(isolate_end, flags, true,
- skip_isolation, migratetype);
+ ret = isolate_single_pageblock(isolate_end, mode, true, skip_isolation);
if (ret) {
- unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
+ unset_migratetype_isolate(pfn_to_page(isolate_start));
return ret;
}
@@ -506,12 +504,11 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
pfn < isolate_end - pageblock_nr_pages;
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
- if (page && set_migratetype_isolate(page, migratetype, flags,
- start_pfn, end_pfn)) {
- undo_isolate_page_range(isolate_start, pfn, migratetype);
+ if (page && set_migratetype_isolate(page, mode, start_pfn,
+ end_pfn)) {
+ undo_isolate_page_range(isolate_start, pfn);
unset_migratetype_isolate(
- pfn_to_page(isolate_end - pageblock_nr_pages),
- migratetype);
+ pfn_to_page(isolate_end - pageblock_nr_pages));
return -EBUSY;
}
}
@@ -522,13 +519,10 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
* undo_isolate_page_range - undo effects of start_isolate_page_range()
* @start_pfn: The first PFN of the isolated range
* @end_pfn: The last PFN of the isolated range
- * @migratetype: New migrate type to set on the range
*
- * This finds every MIGRATE_ISOLATE page block in the given range
- * and switches it to @migratetype.
+ * This finds and unsets every MIGRATE_ISOLATE page block in the given range.
*/
-void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
- int migratetype)
+void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
struct page *page;
@@ -541,7 +535,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
page = __first_valid_page(pfn, pageblock_nr_pages);
if (!page || !is_migrate_isolate_page(page))
continue;
- unset_migratetype_isolate(page, migratetype);
+ unset_migratetype_isolate(page);
}
}
/*
@@ -553,7 +547,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
*/
static unsigned long
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
- int flags)
+ enum pb_isolate_mode mode)
{
struct page *page;
@@ -566,11 +560,12 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
* simple way to verify that as VM_BUG_ON(), though.
*/
pfn += 1 << buddy_order(page);
- else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
+ else if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) &&
+ PageHWPoison(page))
/* A HWPoisoned page cannot be also PageBuddy */
pfn++;
- else if ((flags & MEMORY_OFFLINE) && PageOffline(page) &&
- !page_count(page))
+ else if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) &&
+ PageOffline(page) && !page_count(page))
/*
* The responsible driver agreed to skip PageOffline()
* pages when offlining memory by dropping its
@@ -588,11 +583,11 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
* test_pages_isolated - check if pageblocks in range are isolated
* @start_pfn: The first PFN of the isolated range
* @end_pfn: The first PFN *after* the isolated range
- * @isol_flags: Testing mode flags
+ * @mode: Testing mode
*
* This tests if all in the specified range are free.
*
- * If %MEMORY_OFFLINE is specified in @flags, it will consider
+ * If %PB_ISOLATE_MODE_MEM_OFFLINE is specified in @mode, it will consider
* poisoned and offlined pages free as well.
*
* Caller must ensure the requested range doesn't span zones.
@@ -600,7 +595,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
* Returns 0 if true, -EBUSY if one or more pages are in use.
*/
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
- int isol_flags)
+ enum pb_isolate_mode mode)
{
unsigned long pfn, flags;
struct page *page;
@@ -636,7 +631,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
/* Check all pages are free or marked as ISOLATED */
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
- pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, isol_flags);
+ pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, mode);
spin_unlock_irqrestore(&zone->lock, flags);
ret = pfn < end_pfn ? -EBUSY : 0;
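
A hedged sketch of the new calling convention (helper name invented, the migration step elided): callers now pass a single enum pb_isolate_mode instead of a migratetype plus flag bits, and undo_isolate_page_range() no longer takes a migratetype either.

#include <linux/page-isolation.h>

static int claim_contig_range(unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;

	ret = start_isolate_page_range(start_pfn, end_pfn,
				       PB_ISOLATE_MODE_CMA_ALLOC);
	if (ret)
		return ret;

	/* ... migrate any in-use pages out of [start_pfn, end_pfn) ... */

	if (test_pages_isolated(start_pfn, end_pfn,
				PB_ISOLATE_MODE_CMA_ALLOC)) {
		undo_isolate_page_range(start_pfn, end_pfn);
		return -EBUSY;
	}
	/*
	 * The range is free and still isolated; the caller grabs the pages
	 * and then calls undo_isolate_page_range() itself.
	 */
	return 0;
}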
diff --git a/mm/page_owner.c b/mm/page_owner.c
index cc4a6916eec6..c3ca21132c2c 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -302,7 +302,7 @@ void __reset_page_owner(struct page *page, unsigned short order)
/*
* Do not specify GFP_NOWAIT to make gfpflags_allow_spinning() == false
* to prevent issues in stack_depot_save().
- * This is similar to try_alloc_pages() gfp flags, but only used
+ * This is similar to alloc_pages_nolock() gfp flags, but only used
* to signal stack_depot to avoid spin_locks.
*/
handle = save_stack(__GFP_NOWARN);
@@ -333,9 +333,9 @@ noinline void __set_page_owner(struct page *page, unsigned short order,
inc_stack_record_count(handle, gfp_mask, 1 << order);
}
-void __set_page_owner_migrate_reason(struct page *page, int reason)
+void __folio_set_owner_migrate_reason(struct folio *folio, int reason)
{
- struct page_ext *page_ext = page_ext_get(page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
struct page_owner *page_owner;
if (unlikely(!page_ext))
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 68109ee93841..4eeca782b888 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -218,33 +218,39 @@ static inline void page_table_check_pmd_flags(pmd_t pmd)
WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd)));
}
-void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd)
+void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd,
+ unsigned int nr)
{
+ unsigned long stride = PMD_SIZE >> PAGE_SHIFT;
+ unsigned int i;
+
if (&init_mm == mm)
return;
page_table_check_pmd_flags(pmd);
- __page_table_check_pmd_clear(mm, *pmdp);
- if (pmd_user_accessible_page(pmd)) {
- page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT,
- pmd_write(pmd));
- }
+ for (i = 0; i < nr; i++)
+ __page_table_check_pmd_clear(mm, *(pmdp + i));
+ if (pmd_user_accessible_page(pmd))
+ page_table_check_set(pmd_pfn(pmd), stride * nr, pmd_write(pmd));
}
-EXPORT_SYMBOL(__page_table_check_pmd_set);
+EXPORT_SYMBOL(__page_table_check_pmds_set);
-void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud)
+void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud,
+ unsigned int nr)
{
+ unsigned long stride = PUD_SIZE >> PAGE_SHIFT;
+ unsigned int i;
+
if (&init_mm == mm)
return;
- __page_table_check_pud_clear(mm, *pudp);
- if (pud_user_accessible_page(pud)) {
- page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT,
- pud_write(pud));
- }
+ for (i = 0; i < nr; i++)
+ __page_table_check_pud_clear(mm, *(pudp + i));
+ if (pud_user_accessible_page(pud))
+ page_table_check_set(pud_pfn(pud), stride * nr, pud_write(pud));
}
-EXPORT_SYMBOL(__page_table_check_pud_set);
+EXPORT_SYMBOL(__page_table_check_puds_set);
void __page_table_check_pte_clear_range(struct mm_struct *mm,
unsigned long addr,
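
A hedged sketch (hypothetical helper; real callers presumably go through a static-key wrapper in <linux/page_table_check.h>) of the batched interface: installing nr PMD entries that map physically contiguous memory now needs one check call instead of nr calls to the old per-entry __page_table_check_pmd_set().

#include <linux/page_table_check.h>

static void check_pmd_block(struct mm_struct *mm, pmd_t *pmdp,
			    pmd_t first_pmd, unsigned int nr)
{
	/*
	 * Validates the nr entries being overwritten at pmdp[0..nr-1] and,
	 * if first_pmd maps a user-accessible page, accounts nr * PMD_SIZE
	 * worth of pages starting at pmd_pfn(first_pmd).
	 */
	__page_table_check_pmds_set(mm, pmdp, first_pmd, nr);

	/* ... the arch-specific writes of the nr entries would follow ... */
}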
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index e463c3be934a..e981a1a292d2 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -246,8 +246,7 @@ restart:
*/
pmde = pmdp_get_lockless(pvmw->pmd);
- if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde) ||
- (pmd_present(pmde) && pmd_devmap(pmde))) {
+ if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
pvmw->ptl = pmd_lock(mm, pvmw->pmd);
pmde = *pvmw->pmd;
if (!pmd_present(pmde)) {
@@ -262,7 +261,7 @@ restart:
return not_found(pvmw);
return true;
}
- if (likely(pmd_trans_huge(pmde) || pmd_devmap(pmde))) {
+ if (likely(pmd_trans_huge(pmde))) {
if (pvmw->flags & PVMW_MIGRATION)
return not_found(pvmw);
if (!check_pmd(pmd_pfn(pmde), pvmw))
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e478777c86e1..648038247a8d 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -143,8 +143,7 @@ again:
* We are ONLY installing, so avoid unnecessarily
* splitting a present huge page.
*/
- if (pmd_present(*pmd) &&
- (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
+ if (pmd_present(*pmd) && pmd_trans_huge(*pmd))
continue;
}
@@ -210,8 +209,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
* We are ONLY installing, so avoid unnecessarily
* splitting a present huge page.
*/
- if (pud_present(*pud) &&
- (pud_trans_huge(*pud) || pud_devmap(*pud)))
+ if (pud_present(*pud) && pud_trans_huge(*pud))
continue;
}
@@ -422,7 +420,7 @@ static inline void process_mm_walk_lock(struct mm_struct *mm,
{
if (walk_lock == PGWALK_RDLOCK)
mmap_assert_locked(mm);
- else
+ else if (walk_lock != PGWALK_VMA_RDLOCK_VERIFY)
mmap_assert_write_locked(mm);
}
@@ -437,6 +435,9 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma,
case PGWALK_WRLOCK_VERIFY:
vma_assert_write_locked(vma);
break;
+ case PGWALK_VMA_RDLOCK_VERIFY:
+ vma_assert_locked(vma);
+ break;
case PGWALK_RDLOCK:
/* PGWALK_RDLOCK is handled by process_mm_walk_lock */
break;
@@ -585,8 +586,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
}
/**
- * walk_page_range_novma - walk a range of pagetables not backed by a vma
- * @mm: mm_struct representing the target process of page table walk
+ * walk_kernel_page_table_range - walk a range of kernel pagetables.
* @start: start address of the virtual address range
* @end: end address of the virtual address range
* @ops: operation to call during the walk
@@ -596,17 +596,61 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
* Similar to walk_page_range() but can walk any page tables even if they are
* not backed by VMAs. Because 'unusual' entries may be walked this function
* will also not lock the PTEs for the pte_entry() callback. This is useful for
- * walking the kernel pages tables or page tables for firmware.
+ * walking kernel page tables or page tables for firmware.
*
* Note: Be careful when walking the kernel page tables; the caller may need to
* take other effective approaches (mmap lock may be insufficient) to prevent
* the intermediate kernel page tables belonging to the specified address range
* from being freed (e.g. memory hot-remove).
*/
-int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
+int walk_kernel_page_table_range(unsigned long start, unsigned long end,
+ const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
+{
+ struct mm_struct *mm = &init_mm;
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = mm,
+ .pgd = pgd,
+ .private = private,
+ .no_vma = true
+ };
+
+ if (start >= end)
+ return -EINVAL;
+ if (!check_ops_valid(ops))
+ return -EINVAL;
+
+ /*
+ * Kernel intermediate page tables are usually not freed, so the mmap
+ * read lock is sufficient. But there are some exceptions.
+ * E.g. memory hot-remove, in which case the mmap lock is insufficient
+ * to prevent the intermediate kernel page tables belonging to the
+ * specified address range from being freed. The caller should take
+ * other actions to prevent this race.
+ */
+ mmap_assert_locked(mm);
+
+ return walk_pgd_range(start, end, &walk);
+}
+
+/**
+ * walk_page_range_debug - walk a range of pagetables not backed by a vma
+ * @mm: mm_struct representing the target process of page table walk
+ * @start: start address of the virtual address range
+ * @end: end address of the virtual address range
+ * @ops: operation to call during the walk
+ * @pgd: pgd to walk if different from mm->pgd
+ * @private: private data for callbacks' usage
+ *
+ * Similar to walk_page_range() but can walk any page tables even if they are
+ * not backed by VMAs. Because 'unusual' entries may be walked this function
+ * will also not lock the PTEs for the pte_entry() callback.
+ *
+ * This is for debugging purposes ONLY.
+ */
+int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
- pgd_t *pgd,
- void *private)
+ pgd_t *pgd, void *private)
{
struct mm_walk walk = {
.ops = ops,
@@ -616,34 +660,24 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
.no_vma = true
};
+ /* For convenience, we allow traversal of kernel mappings. */
+ if (mm == &init_mm)
+ return walk_kernel_page_table_range(start, end, ops,
+ pgd, private);
if (start >= end || !walk.mm)
return -EINVAL;
if (!check_ops_valid(ops))
return -EINVAL;
/*
- * 1) For walking the user virtual address space:
- *
* The mmap lock protects the page walker from changes to the page
* tables during the walk. However a read lock is insufficient to
* protect those areas which don't have a VMA as munmap() detaches
* the VMAs before downgrading to a read lock and actually tearing
* down PTEs/page tables. In which case, the mmap write lock should
- * be hold.
- *
- * 2) For walking the kernel virtual address space:
- *
- * The kernel intermediate page tables usually do not be freed, so
- * the mmap map read lock is sufficient. But there are some exceptions.
- * E.g. memory hot-remove. In which case, the mmap lock is insufficient
- * to prevent the intermediate kernel pages tables belonging to the
- * specified address range from being freed. The caller should take
- * other actions to prevent this race.
+ * be held.
*/
- if (mm == &init_mm)
- mmap_assert_locked(walk.mm);
- else
- mmap_assert_write_locked(walk.mm);
+ mmap_assert_write_locked(mm);
return walk_pgd_range(start, end, &walk);
}
@@ -872,7 +906,7 @@ struct folio *folio_walk_start(struct folio_walk *fw,
* TODO: FW_MIGRATION support for PUD migration entries
* once there are relevant users.
*/
- if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) {
+ if (!pud_present(pud) || pud_special(pud)) {
spin_unlock(ptl);
goto not_found;
} else if (!pud_leaf(pud)) {
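
A minimal sketch (function and range are invented) of the split above: kernel-pagetable users call walk_kernel_page_table_range() directly while holding init_mm's mmap lock, and walk_page_range_debug() keeps the old novma behaviour for debugging walks of a user mm.

#include <linux/mm.h>
#include <linux/pagewalk.h>
#include <linux/pgtable.h>

static int count_pte(pte_t *pte, unsigned long addr, unsigned long next,
		     struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (!pte_none(ptep_get(pte)))
		(*count)++;
	return 0;
}

static const struct mm_walk_ops count_ops = {
	.pte_entry = count_pte,
};

static unsigned long count_kernel_ptes(unsigned long start, unsigned long end)
{
	unsigned long count = 0;

	/*
	 * The helper asserts init_mm's mmap lock; protecting against
	 * memory hot-remove (if relevant) is still the caller's problem,
	 * per the comment above.
	 */
	mmap_read_lock(&init_mm);
	walk_kernel_page_table_range(start, end, &count_ops, NULL, &count);
	mmap_read_unlock(&init_mm);

	return count;
}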
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index dd3590dfc23d..9b9d5d6accae 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * mm/percpu-debug.c
*
* Copyright (C) 2017 Facebook Inc.
* Copyright (C) 2017 Dennis Zhou <dennis@kernel.org>
diff --git a/mm/percpu.c b/mm/percpu.c
index b35494c8ede2..d9cbaee92b60 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -3355,7 +3355,7 @@ void __init setup_per_cpu_areas(void)
*/
unsigned long pcpu_nr_pages(void)
{
- return pcpu_nr_populated * pcpu_nr_units;
+ return data_race(READ_ONCE(pcpu_nr_populated)) * pcpu_nr_units;
}
/*
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 5a882f2b10f9..567e2d084071 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -139,8 +139,7 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
{
pmd_t pmd;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
- !pmd_devmap(*pmdp));
+ VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
@@ -153,7 +152,7 @@ pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
pud_t pud;
VM_BUG_ON(address & ~HPAGE_PUD_MASK);
- VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp));
+ VM_BUG_ON(!pud_trans_huge(*pudp));
pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
return pud;
@@ -293,7 +292,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
*pmdvalp = pmdval;
if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
goto nomap;
- if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval)))
+ if (unlikely(pmd_trans_huge(pmdval)))
goto nomap;
if (unlikely(pmd_bad(pmdval))) {
pmd_clear_bad(pmd);
diff --git a/mm/ptdump.c b/mm/ptdump.c
index 106e1d66e9f9..b600c7f864b8 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -4,6 +4,7 @@
#include <linux/debugfs.h>
#include <linux/ptdump.h>
#include <linux/kasan.h>
+#include "internal.h"
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
/*
@@ -18,7 +19,7 @@ static inline int note_kasan_page_table(struct mm_walk *walk,
{
struct ptdump_state *st = walk->private;
- st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]));
+ st->note_page_pte(st, addr, kasan_early_shadow_pte[0]);
walk->action = ACTION_CONTINUE;
@@ -38,11 +39,11 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 0, pgd_val(val));
+ if (st->effective_prot_pgd)
+ st->effective_prot_pgd(st, val);
if (pgd_leaf(val)) {
- st->note_page(st, addr, 0, pgd_val(val));
+ st->note_page_pgd(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -61,11 +62,11 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 1, p4d_val(val));
+ if (st->effective_prot_p4d)
+ st->effective_prot_p4d(st, val);
if (p4d_leaf(val)) {
- st->note_page(st, addr, 1, p4d_val(val));
+ st->note_page_p4d(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -84,11 +85,11 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 2, pud_val(val));
+ if (st->effective_prot_pud)
+ st->effective_prot_pud(st, val);
if (pud_leaf(val)) {
- st->note_page(st, addr, 2, pud_val(val));
+ st->note_page_pud(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -106,10 +107,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 3, pmd_val(val));
+ if (st->effective_prot_pmd)
+ st->effective_prot_pmd(st, val);
if (pmd_leaf(val)) {
- st->note_page(st, addr, 3, pmd_val(val));
+ st->note_page_pmd(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -122,10 +123,10 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
struct ptdump_state *st = walk->private;
pte_t val = ptep_get_lockless(pte);
- if (st->effective_prot)
- st->effective_prot(st, 4, pte_val(val));
+ if (st->effective_prot_pte)
+ st->effective_prot_pte(st, val);
- st->note_page(st, addr, 4, pte_val(val));
+ st->note_page_pte(st, addr, val);
return 0;
}
@@ -134,9 +135,31 @@ static int ptdump_hole(unsigned long addr, unsigned long next,
int depth, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
-
- st->note_page(st, addr, depth, 0);
-
+ pte_t pte_zero = {0};
+ pmd_t pmd_zero = {0};
+ pud_t pud_zero = {0};
+ p4d_t p4d_zero = {0};
+ pgd_t pgd_zero = {0};
+
+ switch (depth) {
+ case 4:
+ st->note_page_pte(st, addr, pte_zero);
+ break;
+ case 3:
+ st->note_page_pmd(st, addr, pmd_zero);
+ break;
+ case 2:
+ st->note_page_pud(st, addr, pud_zero);
+ break;
+ case 1:
+ st->note_page_p4d(st, addr, p4d_zero);
+ break;
+ case 0:
+ st->note_page_pgd(st, addr, pgd_zero);
+ break;
+ default:
+ break;
+ }
return 0;
}
@@ -153,16 +176,18 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
{
const struct ptdump_range *range = st->range;
+ get_online_mems();
mmap_write_lock(mm);
while (range->start != range->end) {
- walk_page_range_novma(mm, range->start, range->end,
+ walk_page_range_debug(mm, range->start, range->end,
&ptdump_ops, pgd, st);
range++;
}
mmap_write_unlock(mm);
+ put_online_mems();
/* Flush out the last page */
- st->note_page(st, 0, -1, 0);
+ st->note_page_flush(st);
}
static int check_wx_show(struct seq_file *m, void *v)
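
A hedged sketch of the reworked ptdump_state callbacks: the single note_page(st, addr, level, val) hook becomes one typed callback per level plus a final note_page_flush(). Only the PTE level is shown and the counting is invented; a real user also wires up the other levels, whichever effective_prot_* hooks it needs, and st->range before calling ptdump_walk_pgd().

#include <linux/container_of.h>
#include <linux/pgtable.h>
#include <linux/ptdump.h>

struct pte_counter {
	struct ptdump_state st;		/* callbacks registered here */
	unsigned long present;
};

static void counter_note_pte(struct ptdump_state *st, unsigned long addr,
			     pte_t pte)
{
	struct pte_counter *c = container_of(st, struct pte_counter, st);

	/* Holes are reported as zero entries, so filter them out. */
	if (pte_val(pte) && pte_present(pte))
		c->present++;
}

static void counter_note_flush(struct ptdump_state *st)
{
	/* Replaces the old final note_page(st, 0, -1, 0) call. */
}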
diff --git a/mm/readahead.c b/mm/readahead.c
index 6a4e96b69702..406756d34309 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -457,7 +457,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
}
void page_cache_ra_order(struct readahead_control *ractl,
- struct file_ra_state *ra, unsigned int new_order)
+ struct file_ra_state *ra)
{
struct address_space *mapping = ractl->mapping;
pgoff_t start = readahead_index(ractl);
@@ -468,24 +468,21 @@ void page_cache_ra_order(struct readahead_control *ractl,
unsigned int nofs;
int err = 0;
gfp_t gfp = readahead_gfp_mask(mapping);
- unsigned int min_ra_size = max(4, mapping_min_folio_nrpages(mapping));
+ unsigned int new_order = ra->order;
- /*
- * Fallback when size < min_nrpages as each folio should be
- * at least min_nrpages anyway.
- */
- if (!mapping_large_folio_support(mapping) || ra->size < min_ra_size)
+ if (!mapping_large_folio_support(mapping)) {
+ ra->order = 0;
goto fallback;
+ }
limit = min(limit, index + ra->size - 1);
- if (new_order < mapping_max_folio_order(mapping))
- new_order += 2;
-
new_order = min(mapping_max_folio_order(mapping), new_order);
new_order = min_t(unsigned int, new_order, ilog2(ra->size));
new_order = max(new_order, min_order);
+ ra->order = new_order;
+
/* See comment in page_cache_ra_unbounded() */
nofs = memalloc_nofs_save();
filemap_invalidate_lock_shared(mapping);
@@ -617,8 +614,9 @@ void page_cache_sync_ra(struct readahead_control *ractl,
ra->size = min(contig_count + req_count, max_pages);
ra->async_size = 1;
readit:
+ ra->order = 0;
ractl->_index = ra->start;
- page_cache_ra_order(ractl, ra, 0);
+ page_cache_ra_order(ractl, ra);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
@@ -628,8 +626,7 @@ void page_cache_async_ra(struct readahead_control *ractl,
unsigned long max_pages;
struct file_ra_state *ra = ractl->ra;
pgoff_t index = readahead_index(ractl);
- pgoff_t expected, start;
- unsigned int order = folio_order(folio);
+ pgoff_t expected, start, end, aligned_end, align;
/* no readahead */
if (!ra->ra_pages)
@@ -652,7 +649,7 @@ void page_cache_async_ra(struct readahead_control *ractl,
* Ramp up sizes, and push forward the readahead window.
*/
expected = round_down(ra->start + ra->size - ra->async_size,
- 1UL << order);
+ folio_nr_pages(folio));
if (index == expected) {
ra->start += ra->size;
/*
@@ -660,7 +657,6 @@ void page_cache_async_ra(struct readahead_control *ractl,
* the readahead window.
*/
ra->size = max(ra->size, get_next_ra_size(ra, max_pages));
- ra->async_size = ra->size;
goto readit;
}
@@ -681,18 +677,30 @@ void page_cache_async_ra(struct readahead_control *ractl,
ra->size = start - index; /* old async_size */
ra->size += req_count;
ra->size = get_next_ra_size(ra, max_pages);
- ra->async_size = ra->size;
readit:
+ ra->order += 2;
+ align = 1UL << min(ra->order, ffs(max_pages) - 1);
+ end = ra->start + ra->size;
+ aligned_end = round_down(end, align);
+ if (aligned_end > ra->start)
+ ra->size -= end - aligned_end;
+ ra->async_size = ra->size;
ractl->_index = ra->start;
- page_cache_ra_order(ractl, ra, order);
+ page_cache_ra_order(ractl, ra);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
+ struct file *file;
+ const struct inode *inode;
+
CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
- if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ))
+ file = fd_file(f);
+ if (!(file->f_mode & FMODE_READ))
return -EBADF;
/*
@@ -700,9 +708,15 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
* that can execute readahead. If readahead is not possible
* on this file, then we must return -EINVAL.
*/
- if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops ||
- (!S_ISREG(file_inode(fd_file(f))->i_mode) &&
- !S_ISBLK(file_inode(fd_file(f))->i_mode)))
+ if (!file->f_mapping)
+ return -EINVAL;
+ if (!file->f_mapping->a_ops)
+ return -EINVAL;
+
+ inode = file_inode(file);
+ if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+ return -EINVAL;
+ if (IS_ANON_FILE(inode))
return -EINVAL;
return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);
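
A worked example (numbers invented) of the alignment trim added to page_cache_async_ra() above, assuming ra->start = 1024, ra->size = 100, ra->order ramped to 4 and max_pages = 512:

	align       = 1UL << min(4, ffs(512) - 1)	/* = 1 << 4 = 16 */
	end         = 1024 + 100			/* = 1124 */
	aligned_end = round_down(1124, 16)		/* = 1120 > ra->start */
	ra->size   -= 1124 - 1120			/* size becomes 96 */

so the window ends on an order-4 folio boundary and the following page_cache_ra_order() pass can fill it with order-4 folios exactly.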
diff --git a/mm/rmap.c b/mm/rmap.c
index 67bb273dfb80..f93ce27132ab 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -503,12 +503,12 @@ struct anon_vma *folio_get_anon_vma(const struct folio *folio)
rcu_read_lock();
anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
- if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+ if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
goto out;
if (!folio_mapped(folio))
goto out;
- anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+ anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON);
if (!atomic_inc_not_zero(&anon_vma->refcount)) {
anon_vma = NULL;
goto out;
@@ -550,12 +550,12 @@ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
retry:
rcu_read_lock();
anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
- if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+ if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
goto out;
if (!folio_mapped(folio))
goto out;
- anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+ anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON);
root_anon_vma = READ_ONCE(anon_vma->root);
if (down_read_trylock(&root_anon_vma->rwsem)) {
/*
@@ -746,7 +746,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm)
int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
if (pending != flushed) {
- arch_flush_tlb_batched_pending(mm);
+ flush_tlb_mm(mm);
/*
* If the new TLB flushing is pending during flushing, leave
* mm->tlb_flush_batched as is, to avoid losing flushing.
@@ -774,7 +774,7 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
* @vma: The VMA we need to know the address in.
*
* Calculates the user virtual address of this page in the specified VMA.
- * It is the caller's responsibililty to check the page is actually
+ * It is the caller's responsibility to check the page is actually
* within the VMA. There may not currently be a PTE pointing at this
* page, but if a page fault occurs at this address, this is the page
* which will be accessed.
@@ -789,13 +789,13 @@ unsigned long page_address_in_vma(const struct folio *folio,
const struct page *page, const struct vm_area_struct *vma)
{
if (folio_test_anon(folio)) {
- struct anon_vma *page__anon_vma = folio_anon_vma(folio);
+ struct anon_vma *anon_vma = folio_anon_vma(folio);
/*
* Note: swapoff's unuse_vma() is more efficient with this
* check, and needs it to match anon_vma when KSM is active.
*/
- if (!vma->anon_vma || !page__anon_vma ||
- vma->anon_vma->root != page__anon_vma->root)
+ if (!vma->anon_vma || !anon_vma ||
+ vma->anon_vma->root != anon_vma->root)
return -EFAULT;
} else if (!vma->vm_file) {
return -EFAULT;
@@ -803,7 +803,7 @@ unsigned long page_address_in_vma(const struct folio *folio,
return -EFAULT;
}
- /* KSM folios don't reach here because of the !page__anon_vma check */
+ /* KSM folios don't reach here because of the !anon_vma check */
return vma_address(vma, page_pgoff(folio, page), 1);
}
@@ -839,7 +839,7 @@ out:
struct folio_referenced_arg {
int mapcount;
int referenced;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
struct mem_cgroup *memcg;
};
@@ -984,7 +984,7 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
* the function bailed out due to rmap lock contention.
*/
int folio_referenced(struct folio *folio, int is_locked,
- struct mem_cgroup *memcg, unsigned long *vm_flags)
+ struct mem_cgroup *memcg, vm_flags_t *vm_flags)
{
bool we_locked = false;
struct folio_referenced_arg pra = {
@@ -1334,9 +1334,9 @@ void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma)
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_VMA(!anon_vma, vma);
- anon_vma += PAGE_MAPPING_ANON;
+ anon_vma += FOLIO_MAPPING_ANON;
/*
- * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
+ * Ensure that anon_vma and the FOLIO_MAPPING_ANON bit are written
* simultaneously, so a concurrent reader (eg folio_referenced()'s
* folio_test_anon()) will not see one without the other.
*/
@@ -1367,10 +1367,10 @@ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma,
/*
* page_idle does a lockless/optimistic rmap scan on folio->mapping.
* Make sure the compiler doesn't split the stores of anon_vma and
- * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code
+ * the FOLIO_MAPPING_ANON type identifier, otherwise the rmap code
* could mistake the mapping for a struct address_space and crash.
*/
- anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ anon_vma = (void *) anon_vma + FOLIO_MAPPING_ANON;
WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma);
folio->index = linear_page_index(vma, address);
}
@@ -1845,23 +1845,30 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
#endif
}
-/* We support batch unmapping of PTEs for lazyfree large folios */
-static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
- struct folio *folio, pte_t *ptep)
+static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
+ struct page_vma_mapped_walk *pvmw,
+ enum ttu_flags flags, pte_t pte)
{
- const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
- int max_nr = folio_nr_pages(folio);
- pte_t pte = ptep_get(ptep);
+ unsigned long end_addr, addr = pvmw->address;
+ struct vm_area_struct *vma = pvmw->vma;
+ unsigned int max_nr;
+
+ if (flags & TTU_HWPOISON)
+ return 1;
+ if (!folio_test_large(folio))
+ return 1;
+ /* We may only batch within a single VMA and a single page table. */
+ end_addr = pmd_addr_end(addr, vma->vm_end);
+ max_nr = (end_addr - addr) >> PAGE_SHIFT;
+
+ /* We only support lazyfree batching for now ... */
if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
- return false;
+ return 1;
if (pte_unused(pte))
- return false;
- if (pte_pfn(pte) != folio_pfn(folio))
- return false;
+ return 1;
- return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
- NULL, NULL) == max_nr;
+ return folio_pte_batch(folio, pvmw->pte, pte, max_nr);
}
/*
@@ -1944,7 +1951,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* restart so we can process the PTE-mapped THP.
*/
split_huge_pmd_locked(vma, pvmw.address,
- pvmw.pmd, false, folio);
+ pvmw.pmd, false);
flags &= ~TTU_SPLIT_HUGE_PMD;
page_vma_mapped_walk_restart(&pvmw);
continue;
@@ -2024,9 +2031,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
if (pte_dirty(pteval))
folio_mark_dirty(folio);
} else if (likely(pte_present(pteval))) {
- if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
- can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
- nr_pages = folio_nr_pages(folio);
+ nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
end_addr = address + nr_pages * PAGE_SIZE;
flush_cache_range(vma, address, end_addr);
@@ -2206,13 +2211,16 @@ discard:
hugetlb_remove_rmap(folio);
} else {
folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
- folio_ref_sub(folio, nr_pages - 1);
}
if (vma->vm_flags & VM_LOCKED)
mlock_drain_local();
- folio_put(folio);
- /* We have already batched the entire folio */
- if (nr_pages > 1)
+ folio_put_refs(folio, nr_pages);
+
+ /*
+ * If we are sure that we batched the entire folio and cleared
+ * all PTEs, we can just optimize and stop right here.
+ */
+ if (nr_pages == folio_nr_pages(folio))
goto walk_done;
continue;
walk_abort:
@@ -2292,13 +2300,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
pvmw.flags = PVMW_SYNC;
/*
- * unmap_page() in mm/huge_memory.c is the only user of migration with
- * TTU_SPLIT_HUGE_PMD and it wants to freeze.
- */
- if (flags & TTU_SPLIT_HUGE_PMD)
- split_huge_pmd_address(vma, address, true, folio);
-
- /*
* For THP, we have to assume the worst case, i.e. pmd for invalidation.
* For hugetlb, it could be much worse if we need to do pud
* invalidation in the case of pmd sharing.
@@ -2323,9 +2324,16 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* PMD-mapped THP migration entry */
if (!pvmw.pte) {
+ if (flags & TTU_SPLIT_HUGE_PMD) {
+ split_huge_pmd_locked(vma, pvmw.address,
+ pvmw.pmd, true);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
subpage = folio_page(folio,
pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
@@ -2337,8 +2345,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
break;
}
continue;
- }
#endif
+ }
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
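
A small hedged sketch (hypothetical caller) of the type change visible in the hunks above: folio_referenced() now reports the accumulated VMA flags through a vm_flags_t pointer rather than an unsigned long.

#include <linux/mm.h>
#include <linux/rmap.h>

static bool folio_recently_referenced(struct folio *folio)
{
	vm_flags_t vm_flags = 0;
	int refs;

	/* is_locked == 0: we do not hold the folio lock; no memcg filter. */
	refs = folio_referenced(folio, 0, NULL, &vm_flags);

	return refs > 0 || (vm_flags & VM_LOCKED);
}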
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 1b0a214ee558..60137305bc20 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -54,7 +54,6 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf)
pgoff_t offset = vmf->pgoff;
gfp_t gfp = vmf->gfp_mask;
unsigned long addr;
- struct page *page;
struct folio *folio;
vm_fault_t ret;
int err;
@@ -65,16 +64,15 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf)
filemap_invalidate_lock_shared(mapping);
retry:
- page = find_lock_page(mapping, offset);
- if (!page) {
+ folio = filemap_lock_folio(mapping, offset);
+ if (IS_ERR(folio)) {
folio = folio_alloc(gfp | __GFP_ZERO, 0);
if (!folio) {
ret = VM_FAULT_OOM;
goto out;
}
- page = &folio->page;
- err = set_direct_map_invalid_noflush(page);
+ err = set_direct_map_invalid_noflush(folio_page(folio, 0));
if (err) {
folio_put(folio);
ret = vmf_error(err);
@@ -90,7 +88,7 @@ retry:
* already happened when we marked the page invalid
* which guarantees that this call won't fail
*/
- set_direct_map_default_noflush(page);
+ set_direct_map_default_noflush(folio_page(folio, 0));
if (err == -EEXIST)
goto retry;
@@ -98,11 +96,11 @@ retry:
goto out;
}
- addr = (unsigned long)page_address(page);
+ addr = (unsigned long)folio_address(folio);
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
}
- vmf->page = page;
+ vmf->page = folio_file_page(folio, vmf->pgoff);
ret = VM_FAULT_LOCKED;
out:
@@ -120,18 +118,18 @@ static int secretmem_release(struct inode *inode, struct file *file)
return 0;
}
-static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
+static int secretmem_mmap_prepare(struct vm_area_desc *desc)
{
- unsigned long len = vma->vm_end - vma->vm_start;
+ const unsigned long len = desc->end - desc->start;
- if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
+ if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
return -EINVAL;
- if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
+ if (!mlock_future_ok(desc->mm, desc->vm_flags | VM_LOCKED, len))
return -EAGAIN;
- vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP);
- vma->vm_ops = &secretmem_vm_ops;
+ desc->vm_flags |= VM_LOCKED | VM_DONTDUMP;
+ desc->vm_ops = &secretmem_vm_ops;
return 0;
}
@@ -143,7 +141,7 @@ bool vma_is_secretmem(struct vm_area_struct *vma)
static const struct file_operations secretmem_fops = {
.release = secretmem_release,
- .mmap = secretmem_mmap,
+ .mmap_prepare = secretmem_mmap_prepare,
};
static int secretmem_migrate_folio(struct address_space *mapping,
@@ -154,7 +152,7 @@ static int secretmem_migrate_folio(struct address_space *mapping,
static void secretmem_free_folio(struct folio *folio)
{
- set_direct_map_default_noflush(&folio->page);
+ set_direct_map_default_noflush(folio_page(folio, 0));
folio_zero_segment(folio, 0, folio_size(folio));
}
@@ -195,20 +193,13 @@ static struct file *secretmem_file_create(unsigned long flags)
struct file *file;
struct inode *inode;
const char *anon_name = "[secretmem]";
- int err;
- inode = alloc_anon_inode(secretmem_mnt->mnt_sb);
+ inode = anon_inode_make_secure_inode(secretmem_mnt->mnt_sb, anon_name, NULL);
if (IS_ERR(inode))
return ERR_CAST(inode);
- err = security_inode_init_security_anon(inode, &QSTR(anon_name), NULL);
- if (err) {
- file = ERR_PTR(err);
- goto err_free_inode;
- }
-
file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
- O_RDWR, &secretmem_fops);
+ O_RDWR | O_LARGEFILE, &secretmem_fops);
if (IS_ERR(file))
goto err_free_inode;
@@ -222,6 +213,8 @@ static struct file *secretmem_file_create(unsigned long flags)
inode->i_mode |= S_IFREG;
inode->i_size = 0;
+ atomic_inc(&secretmem_users);
+
return file;
err_free_inode:
@@ -255,9 +248,6 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
goto err_put_fd;
}
- file->f_flags |= O_LARGEFILE;
-
- atomic_inc(&secretmem_users);
fd_install(fd, file);
return fd;
@@ -268,7 +258,15 @@ err_put_fd:
static int secretmem_init_fs_context(struct fs_context *fc)
{
- return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM;
+ struct pseudo_fs_context *ctx;
+
+ ctx = init_pseudo(fc, SECRETMEM_MAGIC);
+ if (!ctx)
+ return -ENOMEM;
+
+ fc->s_iflags |= SB_I_NOEXEC;
+ fc->s_iflags |= SB_I_NODEV;
+ return 0;
}
static struct file_system_type secretmem_fs = {
@@ -286,9 +284,6 @@ static int __init secretmem_init(void)
if (IS_ERR(secretmem_mnt))
return PTR_ERR(secretmem_mnt);
- /* prevent secretmem mappings from ever getting PROT_EXEC */
- secretmem_mnt->mnt_flags |= MNT_NOEXEC;
-
return 0;
}
fs_initcall(secretmem_init);
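
A hedged sketch (hypothetical driver, not from this patch) of the ->mmap_prepare pattern secretmem now uses: the hook validates and adjusts the mapping through struct vm_area_desc before any VMA exists, instead of mutating a live vma in ->mmap().

#include <linux/fs.h>
#include <linux/mm.h>

static const struct vm_operations_struct mydev_vm_ops = {
	/* fault handlers etc. would go here */
};

static int mydev_mmap_prepare(struct vm_area_desc *desc)
{
	/* Refuse executable mappings, as an arbitrary example policy. */
	if (desc->vm_flags & VM_EXEC)
		return -EPERM;

	desc->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
	desc->vm_ops = &mydev_vm_ops;
	return 0;
}

static const struct file_operations mydev_fops = {
	.mmap_prepare = mydev_mmap_prepare,
};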
diff --git a/mm/shmem.c b/mm/shmem.c
index 99327c30507c..7fdd707ac1ac 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -98,7 +98,7 @@ static struct vfsmount *shm_mnt __ro_after_init;
#define SHORT_SYMLINK_LEN 128
/*
- * shmem_fallocate communicates with shmem_fault or shmem_writepage via
+ * shmem_fallocate communicates with shmem_fault or shmem_writeout via
* inode->i_private (with i_rwsem making sure that it has only one user at
* a time): we would prefer not to enlarge the shmem inode just for that.
*/
@@ -107,7 +107,7 @@ struct shmem_falloc {
pgoff_t start; /* start of range currently being fallocated */
pgoff_t next; /* the next page offset to be fallocated */
pgoff_t nr_falloced; /* how many new pages have been fallocated */
- pgoff_t nr_unswapped; /* how often writepage refused to swap out */
+ pgoff_t nr_unswapped; /* how often writeout refused to swap out */
};
struct shmem_options {
@@ -292,7 +292,7 @@ bool vma_is_shmem(struct vm_area_struct *vma)
}
static LIST_HEAD(shmem_swaplist);
-static DEFINE_MUTEX(shmem_swaplist_mutex);
+static DEFINE_SPINLOCK(shmem_swaplist_lock);
#ifdef CONFIG_TMPFS_QUOTA
@@ -432,10 +432,13 @@ static void shmem_free_inode(struct super_block *sb, size_t freed_ispace)
*
* But normally info->alloced == inode->i_mapping->nrpages + info->swapped
* So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
+ *
+ * Return: true if swapped was incremented from 0, for shmem_writeout().
*/
-static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
+static bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
{
struct shmem_inode_info *info = SHMEM_I(inode);
+ bool first_swapped = false;
long freed;
spin_lock(&info->lock);
@@ -446,12 +449,15 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
/*
* Special case: whereas normally shmem_recalc_inode() is called
* after i_mapping->nrpages has already been adjusted (up or down),
- * shmem_writepage() has to raise swapped before nrpages is lowered -
+ * shmem_writeout() has to raise swapped before nrpages is lowered -
* to stop a racing shmem_recalc_inode() from thinking that a page has
* been freed. Compensate here, to avoid the need for a followup call.
*/
- if (swapped > 0)
+ if (swapped > 0) {
+ if (info->swapped == swapped)
+ first_swapped = true;
freed += swapped;
+ }
if (freed > 0)
info->alloced -= freed;
spin_unlock(&info->lock);
@@ -459,6 +465,7 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
/* The quota case may block */
if (freed > 0)
shmem_inode_unacct_blocks(inode, freed);
+ return first_swapped;
}
bool shmem_charge(struct inode *inode, long pages)
@@ -615,7 +622,7 @@ static unsigned int shmem_get_orders_within_size(struct inode *inode,
static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
loff_t write_end, bool shmem_huge_force,
struct vm_area_struct *vma,
- unsigned long vm_flags)
+ vm_flags_t vm_flags)
{
unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ?
0 : BIT(HPAGE_PMD_ORDER);
@@ -862,7 +869,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
loff_t write_end, bool shmem_huge_force,
struct vm_area_struct *vma,
- unsigned long vm_flags)
+ vm_flags_t vm_flags)
{
return 0;
}
@@ -1375,11 +1382,11 @@ static void shmem_evict_inode(struct inode *inode)
/* Wait while shmem_unuse() is scanning this inode... */
wait_var_event(&info->stop_eviction,
!atomic_read(&info->stop_eviction));
- mutex_lock(&shmem_swaplist_mutex);
+ spin_lock(&shmem_swaplist_lock);
/* ...but beware of the race if we peeked too early */
if (!atomic_read(&info->stop_eviction))
list_del_init(&info->swaplist);
- mutex_unlock(&shmem_swaplist_mutex);
+ spin_unlock(&shmem_swaplist_lock);
}
}
@@ -1446,8 +1453,6 @@ static int shmem_unuse_swap_entries(struct inode *inode,
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
- if (!xa_is_value(folio))
- continue;
error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
mapping_gfp_mask(mapping), NULL, NULL);
if (error == 0) {
@@ -1504,7 +1509,8 @@ int shmem_unuse(unsigned int type)
if (list_empty(&shmem_swaplist))
return 0;
- mutex_lock(&shmem_swaplist_mutex);
+ spin_lock(&shmem_swaplist_lock);
+start_over:
list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
if (!info->swapped) {
list_del_init(&info->swaplist);
@@ -1517,31 +1523,38 @@ int shmem_unuse(unsigned int type)
* (igrab() would protect from unlink, but not from unmount).
*/
atomic_inc(&info->stop_eviction);
- mutex_unlock(&shmem_swaplist_mutex);
+ spin_unlock(&shmem_swaplist_lock);
error = shmem_unuse_inode(&info->vfs_inode, type);
cond_resched();
- mutex_lock(&shmem_swaplist_mutex);
- next = list_next_entry(info, swaplist);
- if (!info->swapped)
- list_del_init(&info->swaplist);
+ spin_lock(&shmem_swaplist_lock);
if (atomic_dec_and_test(&info->stop_eviction))
wake_up_var(&info->stop_eviction);
if (error)
break;
+ if (list_empty(&info->swaplist))
+ goto start_over;
+ next = list_next_entry(info, swaplist);
+ if (!info->swapped)
+ list_del_init(&info->swaplist);
}
- mutex_unlock(&shmem_swaplist_mutex);
+ spin_unlock(&shmem_swaplist_lock);
return error;
}
-/*
- * Move the page from the page cache to the swap cache.
+/**
+ * shmem_writeout - Write the folio to swap
+ * @folio: The folio to write
+ * @plug: swap plug
+ * @folio_list: list to put back folios on split
+ *
+ * Move the folio from the page cache to the swap cache.
*/
-static int shmem_writepage(struct page *page, struct writeback_control *wbc)
+int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
+ struct list_head *folio_list)
{
- struct folio *folio = page_folio(page);
struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -1550,16 +1563,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
int nr_pages;
bool split = false;
- /*
- * Our capabilities prevent regular writeback or sync from ever calling
- * shmem_writepage; but a stacking filesystem might use ->writepage of
- * its underlying filesystem, in which case tmpfs should write out to
- * swap only in response to memory pressure, and not for the writeback
- * threads or sync.
- */
- if (WARN_ON_ONCE(!wbc->for_reclaim))
- goto redirty;
-
if ((info->flags & VM_LOCKED) || sbinfo->noswap)
goto redirty;
@@ -1586,9 +1589,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
try_split:
/* Ensure the subpages are still dirty */
folio_test_set_dirty(folio);
- if (split_huge_page_to_list_to_order(page, wbc->list, 0))
+ if (split_folio_to_list(folio, folio_list))
goto redirty;
- folio = page_folio(page);
folio_clear_dirty(folio);
}
@@ -1627,39 +1629,66 @@ try_split:
folio_mark_uptodate(folio);
}
- /*
- * Add inode to shmem_unuse()'s list of swapped-out inodes,
- * if it's not already there. Do it now before the folio is
- * moved to swap cache, when its pagelock no longer protects
- * the inode from eviction. But don't unlock the mutex until
- * we've incremented swapped, because shmem_unuse_inode() will
- * prune a !swapped inode from the swaplist under this mutex.
- */
- mutex_lock(&shmem_swaplist_mutex);
- if (list_empty(&info->swaplist))
- list_add(&info->swaplist, &shmem_swaplist);
-
if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
- shmem_recalc_inode(inode, 0, nr_pages);
+ bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages);
+ int error;
+
+ /*
+ * Add inode to shmem_unuse()'s list of swapped-out inodes,
+ * if it's not already there. Do it now before the folio is
+ * removed from page cache, when its pagelock no longer
+ * protects the inode from eviction. And do it now, after
+ * we've incremented swapped, because shmem_unuse() will
+ * prune a !swapped inode from the swaplist.
+ */
+ if (first_swapped) {
+ spin_lock(&shmem_swaplist_lock);
+ if (list_empty(&info->swaplist))
+ list_add(&info->swaplist, &shmem_swaplist);
+ spin_unlock(&shmem_swaplist_lock);
+ }
+
swap_shmem_alloc(folio->swap, nr_pages);
shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap));
- mutex_unlock(&shmem_swaplist_mutex);
BUG_ON(folio_mapped(folio));
- return swap_writepage(&folio->page, wbc);
- }
+ error = swap_writeout(folio, plug);
+ if (error != AOP_WRITEPAGE_ACTIVATE) {
+ /* folio has been unlocked */
+ return error;
+ }
+
+ /*
+ * The intention here is to avoid holding on to the swap when
+ * zswap was unable to compress and unable to write back; but
+ * it will also be appropriate if other reactivation cases are added.
+ */
+ error = shmem_add_to_page_cache(folio, mapping, index,
+ swp_to_radix_entry(folio->swap),
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ /* Swap entry might be erased by racing shmem_free_swap() */
+ if (!error) {
+ shmem_recalc_inode(inode, 0, -nr_pages);
+ swap_free_nr(folio->swap, nr_pages);
+ }
- list_del_init(&info->swaplist);
- mutex_unlock(&shmem_swaplist_mutex);
+ /*
+ * The delete_from_swap_cache() below could be left for
+ * shrink_folio_list()'s folio_free_swap() to dispose of;
+ * but I'm a little nervous about letting this folio out of
+ * shmem_writeout() in a hybrid half-tmpfs-half-swap state
+ * e.g. folio_mapping(folio) might give an unexpected answer.
+ */
+ delete_from_swap_cache(folio);
+ goto redirty;
+ }
if (nr_pages > 1)
goto try_split;
redirty:
folio_mark_dirty(folio);
- if (wbc->for_reclaim)
- return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
- folio_unlock(folio);
- return 0;
+ return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
}
+EXPORT_SYMBOL_GPL(shmem_writeout);
#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
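The swaplist handling in shmem_writeout() above only takes the new shmem_swaplist_lock when shmem_recalc_inode() reports the inode's first swapped pages. Below is a minimal userspace sketch (pthreads) of that "add once, on the first swapped page" pattern; every name and type in it is a stand-in for illustration, not the kernel API.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t swaplist_lock = PTHREAD_MUTEX_INITIALIZER;

struct inode_info {
	long swapped;		/* pages of this inode currently in swap */
	bool on_swaplist;	/* stand-in for !list_empty(&info->swaplist) */
};

/* Record nr_pages newly swapped pages; touch the global list lock only
 * on the 0 -> nonzero transition of the per-inode counter. */
static void note_swapped(struct inode_info *info, long nr_pages)
{
	bool first_swapped = (info->swapped == 0);

	info->swapped += nr_pages;
	if (first_swapped) {
		pthread_mutex_lock(&swaplist_lock);
		if (!info->on_swaplist)
			info->on_swaplist = true;
		pthread_mutex_unlock(&swaplist_lock);
	}
}

int main(void)
{
	struct inode_info info = { 0, false };

	note_swapped(&info, 4);	/* first swapout: takes the list lock */
	note_swapped(&info, 4);	/* already on the list: no lock traffic */
	printf("swapped=%ld on_swaplist=%d\n", info.swapped, info.on_swaplist);
	return 0;
}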
@@ -1760,7 +1789,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
{
unsigned long mask = READ_ONCE(huge_shmem_orders_always);
unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
- unsigned long vm_flags = vma ? vma->vm_flags : 0;
+ vm_flags_t vm_flags = vma ? vma->vm_flags : 0;
unsigned int global_orders;
if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
@@ -2262,6 +2291,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
folio = swap_cache_get_folio(swap, NULL, 0);
order = xa_get_order(&mapping->i_pages, index);
if (!folio) {
+ int nr_pages = 1 << order;
bool fallback_order0 = false;
/* Or update major stats only when swapin succeeds?? */
@@ -2275,9 +2305,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
* If uffd is active for the vma, we need per-page fault
* fidelity to maintain the uffd semantics, then fallback
* to swapin order-0 folio, as well as for zswap case.
+ * Any existing sub folio in the swap cache also blocks
+ * mTHP swapin.
*/
if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) ||
- !zswap_never_enabled()))
+ !zswap_never_enabled() ||
+ non_swapcache_batch(swap, nr_pages) != nr_pages))
fallback_order0 = true;
/* Skip swapcache for synchronous device. */
@@ -2335,6 +2368,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
*/
split_order = shmem_split_large_entry(inode, index, swap, gfp);
if (split_order < 0) {
+ folio_put(folio);
+ folio = NULL;
error = split_order;
goto failed;
}
@@ -3267,9 +3302,9 @@ static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;
static int
-shmem_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -3301,9 +3336,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
}
static int
-shmem_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+shmem_write_end(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
@@ -3768,7 +3803,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
index--;
/*
- * Inform shmem_writepage() how far we have reached.
+ * Inform shmem_writeout() how far we have reached.
* No need for lock or barrier: we have the page lock.
*/
if (!folio_test_uptodate(folio))
@@ -4184,7 +4219,7 @@ static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
#ifdef CONFIG_TMPFS_XATTR
-static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
@@ -4194,7 +4229,7 @@ static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
static int shmem_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -4981,7 +5016,6 @@ static void shmem_put_super(struct super_block *sb)
static const struct dentry_operations shmem_ci_dentry_ops = {
.d_hash = generic_ci_d_hash,
.d_compare = generic_ci_d_compare,
- .d_delete = always_delete_dentry,
};
#endif
@@ -5029,7 +5063,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
if (ctx->encoding) {
sb->s_encoding = ctx->encoding;
- sb->s_d_op = &shmem_ci_dentry_ops;
+ set_default_d_op(sb, &shmem_ci_dentry_ops);
if (ctx->strict_encoding)
sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL;
}
@@ -5038,6 +5072,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
#else
sb->s_flags |= SB_NOUSER;
#endif /* CONFIG_TMPFS */
+ sb->s_d_flags |= DCACHE_DONTCACHE;
sbinfo->max_blocks = ctx->blocks;
sbinfo->max_inodes = ctx->inodes;
sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE;
@@ -5191,7 +5226,6 @@ static int shmem_error_remove_folio(struct address_space *mapping,
}
static const struct address_space_operations shmem_aops = {
- .writepage = shmem_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_TMPFS
.write_begin = shmem_write_begin,
@@ -5810,12 +5844,12 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
if (size < 0 || size > MAX_LFS_FILESIZE)
return ERR_PTR(-EINVAL);
- if (shmem_acct_size(flags, size))
- return ERR_PTR(-ENOMEM);
-
if (is_idmapped_mnt(mnt))
return ERR_PTR(-EINVAL);
+ if (shmem_acct_size(flags, size))
+ return ERR_PTR(-ENOMEM);
+
inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
S_IFREG | S_IRWXUGO, 0, flags);
if (IS_ERR(inode)) {
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 6af13bcd2ab3..41999e94a56d 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -94,26 +94,20 @@ void si_meminfo_node(struct sysinfo *val, int nid)
unsigned long free_highpages = 0;
pg_data_t *pgdat = NODE_DATA(nid);
- for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
- managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
- val->totalram = managed_pages;
- val->sharedram = node_page_state(pgdat, NR_SHMEM);
- val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
-#ifdef CONFIG_HIGHMEM
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
-
+ managed_pages += zone_managed_pages(zone);
if (is_highmem(zone)) {
managed_highpages += zone_managed_pages(zone);
free_highpages += zone_page_state(zone, NR_FREE_PAGES);
}
}
+
+ val->totalram = managed_pages;
+ val->sharedram = node_page_state(pgdat, NR_SHMEM);
+ val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
val->totalhigh = managed_highpages;
val->freehigh = free_highpages;
-#else
- val->totalhigh = managed_highpages;
- val->freehigh = free_highpages;
-#endif
val->mem_unit = PAGE_SIZE;
}
#endif
@@ -223,7 +217,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
global_node_page_state(NR_SHMEM),
global_node_page_state(NR_PAGETABLE),
global_node_page_state(NR_SECONDARY_PAGETABLE),
- global_zone_page_state(NR_BOUNCE),
+ 0UL,
global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
global_zone_page_state(NR_FREE_PAGES),
free_pcp,
@@ -252,7 +246,6 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
" shmem_pmdmapped:%lukB"
" anon_thp:%lukB"
#endif
- " writeback_tmp:%lukB"
" kernel_stack:%lukB"
#ifdef CONFIG_SHADOW_CALL_STACK
" shadow_call_stack:%lukB"
@@ -279,7 +272,6 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
K(node_page_state(pgdat, NR_ANON_THPS)),
#endif
- K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
node_page_state(pgdat, NR_KERNEL_STACK_KB),
#ifdef CONFIG_SHADOW_CALL_STACK
node_page_state(pgdat, NR_KERNEL_SCS_KB),
@@ -311,6 +303,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
" low:%lukB"
" high:%lukB"
" reserved_highatomic:%luKB"
+ " free_highatomic:%luKB"
" active_anon:%lukB"
" inactive_anon:%lukB"
" active_file:%lukB"
@@ -332,6 +325,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
K(zone->nr_reserved_highatomic),
+ K(zone->nr_free_highatomic),
K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
@@ -341,7 +335,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
K(zone->present_pages),
K(zone_managed_pages(zone)),
K(zone_page_state(zone, NR_MLOCK)),
- K(zone_page_state(zone, NR_BOUNCE)),
+ 0UL,
K(free_pcp),
K(this_cpu_read(zone->per_cpu_pageset->count)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
diff --git a/mm/slab.h b/mm/slab.h
index 05a21dc796e0..248b34c839b7 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -50,7 +50,7 @@ typedef union {
/* Reuses the bits in struct page */
struct slab {
- unsigned long __page_flags;
+ unsigned long flags;
struct kmem_cache *slab_cache;
union {
@@ -99,7 +99,7 @@ struct slab {
#define SLAB_MATCH(pg, sl) \
static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl))
-SLAB_MATCH(flags, __page_flags);
+SLAB_MATCH(flags, flags);
SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */
SLAB_MATCH(_refcount, __page_refcount);
#ifdef CONFIG_MEMCG
@@ -167,30 +167,6 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)
*/
#define slab_page(s) folio_page(slab_folio(s), 0)
-/*
- * If network-based swap is enabled, sl*b must keep track of whether pages
- * were allocated from pfmemalloc reserves.
- */
-static inline bool slab_test_pfmemalloc(const struct slab *slab)
-{
- return folio_test_active(slab_folio(slab));
-}
-
-static inline void slab_set_pfmemalloc(struct slab *slab)
-{
- folio_set_active(slab_folio(slab));
-}
-
-static inline void slab_clear_pfmemalloc(struct slab *slab)
-{
- folio_clear_active(slab_folio(slab));
-}
-
-static inline void __slab_clear_pfmemalloc(struct slab *slab)
-{
- __folio_clear_active(slab_folio(slab));
-}
-
static inline void *slab_address(const struct slab *slab)
{
return folio_address(slab_folio(slab));
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 5be257e03c7c..bfe7c40eeee1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -2,7 +2,7 @@
/*
* Slab allocator functions that are independent of the allocator strategy
*
- * (C) 2012 Christoph Lameter <cl@linux.com>
+ * (C) 2012 Christoph Lameter <cl@gentwo.org>
*/
#include <linux/slab.h>
diff --git a/mm/slub.c b/mm/slub.c
index f3d61b330a76..30003763d224 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -23,6 +23,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kasan.h>
+#include <linux/node.h>
#include <linux/kmsan.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
@@ -91,14 +92,14 @@
* The partially empty slabs cached on the CPU partial list are used
* for performance reasons, which speeds up the allocation process.
* These slabs are not frozen, but are also exempt from list management,
- * by clearing the PG_workingset flag when moving out of the node
+ * by clearing the SL_partial flag when moving out of the node
* partial list. Please see __slab_free() for more details.
*
* To sum up, the current scheme is:
- * - node partial slab: PG_Workingset && !frozen
- * - cpu partial slab: !PG_Workingset && !frozen
- * - cpu slab: !PG_Workingset && frozen
- * - full slab: !PG_Workingset && !frozen
+ * - node partial slab: SL_partial && !frozen
+ * - cpu partial slab: !SL_partial && !frozen
+ * - cpu slab: !SL_partial && frozen
+ * - full slab: !SL_partial && !frozen
*
* list_lock
*
@@ -183,6 +184,22 @@
* the fast path and disables lockless freelists.
*/
+/**
+ * enum slab_flags - How the slab flags bits are used.
+ * @SL_locked: Is locked with slab_lock()
+ * @SL_partial: On the per-node partial list
+ * @SL_pfmemalloc: Was allocated from PF_MEMALLOC reserves
+ *
+ * The slab flags share space with the page flags but some bits have
+ * different interpretations. The high bits are used for information
+ * like zone/node/section.
+ */
+enum slab_flags {
+ SL_locked = PG_locked,
+ SL_partial = PG_workingset, /* Historical reasons for this bit */
+ SL_pfmemalloc = PG_active, /* Historical reasons for this bit */
+};
+
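As a reader aid, the sketch below restates the SL_partial/frozen scheme from the comment further up as a standalone C function; it is illustrative only and not kernel code.

#include <stdio.h>

/* Map the (SL_partial, frozen) combination to the slab's list state, per
 * the scheme documented above; cpu-partial and full slabs share the same
 * bit pattern and cannot be told apart by these two bits alone. */
static const char *slab_state_name(int sl_partial, int frozen)
{
	if (sl_partial && !frozen)
		return "node partial slab";
	if (!sl_partial && frozen)
		return "cpu slab";
	if (!sl_partial && !frozen)
		return "cpu partial slab or full slab";
	return "unused combination";
}

int main(void)
{
	printf("%s\n", slab_state_name(1, 0));	/* node partial slab */
	printf("%s\n", slab_state_name(0, 1));	/* cpu slab */
	printf("%s\n", slab_state_name(0, 0));	/* cpu partial slab or full slab */
	return 0;
}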
/*
* We could simply use migrate_disable()/enable() but as long as it's a
* function call even on !PREEMPT_RT, use inline preempt_disable() there.
@@ -447,7 +464,7 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
/*
* Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
- * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
+ * Corresponds to node_state[N_MEMORY], but can temporarily
* differ during memory hotplug/hotremove operations.
* Protected by slab_mutex.
*/
@@ -635,16 +652,35 @@ static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
#endif /* CONFIG_SLUB_CPU_PARTIAL */
/*
+ * If network-based swap is enabled, slub must keep track of whether memory
+ * was allocated from pfmemalloc reserves.
+ */
+static inline bool slab_test_pfmemalloc(const struct slab *slab)
+{
+ return test_bit(SL_pfmemalloc, &slab->flags);
+}
+
+static inline void slab_set_pfmemalloc(struct slab *slab)
+{
+ set_bit(SL_pfmemalloc, &slab->flags);
+}
+
+static inline void __slab_clear_pfmemalloc(struct slab *slab)
+{
+ __clear_bit(SL_pfmemalloc, &slab->flags);
+}
+
+/*
* Per slab locking using the pagelock
*/
static __always_inline void slab_lock(struct slab *slab)
{
- bit_spin_lock(PG_locked, &slab->__page_flags);
+ bit_spin_lock(SL_locked, &slab->flags);
}
static __always_inline void slab_unlock(struct slab *slab)
{
- bit_spin_unlock(PG_locked, &slab->__page_flags);
+ bit_spin_unlock(SL_locked, &slab->flags);
}
static inline bool
@@ -1010,7 +1046,7 @@ static void print_slab_info(const struct slab *slab)
{
pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
slab, slab->objects, slab->inuse, slab->freelist,
- &slab->__page_flags);
+ &slab->flags);
}
void skip_orig_size_check(struct kmem_cache *s, const void *object)
@@ -1973,6 +2009,11 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \
__GFP_ACCOUNT | __GFP_NOFAIL)
+static inline void init_slab_obj_exts(struct slab *slab)
+{
+ slab->obj_exts = 0;
+}
+
int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab)
{
@@ -2023,8 +2064,7 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
return 0;
}
-/* Should be called only if mem_alloc_profiling_enabled() */
-static noinline void free_slab_obj_exts(struct slab *slab)
+static inline void free_slab_obj_exts(struct slab *slab)
{
struct slabobj_ext *obj_exts;
@@ -2044,20 +2084,12 @@ static noinline void free_slab_obj_exts(struct slab *slab)
slab->obj_exts = 0;
}
-static inline bool need_slab_obj_ext(void)
-{
- if (mem_alloc_profiling_enabled())
- return true;
+#else /* CONFIG_SLAB_OBJ_EXT */
- /*
- * CONFIG_MEMCG creates vector of obj_cgroup objects conditionally
- * inside memcg_slab_post_alloc_hook. No other users for now.
- */
- return false;
+static inline void init_slab_obj_exts(struct slab *slab)
+{
}
-#else /* CONFIG_SLAB_OBJ_EXT */
-
static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab)
{
@@ -2068,11 +2100,6 @@ static inline void free_slab_obj_exts(struct slab *slab)
{
}
-static inline bool need_slab_obj_ext(void)
-{
- return false;
-}
-
#endif /* CONFIG_SLAB_OBJ_EXT */
#ifdef CONFIG_MEM_ALLOC_PROFILING
@@ -2093,10 +2120,11 @@ prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
slab = virt_to_slab(p);
if (!slab_obj_exts(slab) &&
- WARN(alloc_slab_obj_exts(slab, s, flags, false),
- "%s, %s: Failed to create slab extension vector!\n",
- __func__, s->name))
+ alloc_slab_obj_exts(slab, s, flags, false)) {
+ pr_warn_once("%s, %s: Failed to create slab extension vector!\n",
+ __func__, s->name);
return NULL;
+ }
return slab_obj_exts(slab) + obj_to_index(s, slab, p);
}
@@ -2120,7 +2148,7 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
static inline void
alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
{
- if (need_slab_obj_ext())
+ if (mem_alloc_profiling_enabled())
__alloc_tagging_slab_alloc_hook(s, object, flags);
}
@@ -2592,8 +2620,12 @@ static __always_inline void account_slab(struct slab *slab, int order,
static __always_inline void unaccount_slab(struct slab *slab, int order,
struct kmem_cache *s)
{
- if (memcg_kmem_online() || need_slab_obj_ext())
- free_slab_obj_exts(slab);
+ /*
+ * The slab object extensions should now be freed regardless of
+ * whether mem_alloc_profiling_enabled() or not, because profiling
+ * might have been disabled after slab->obj_exts was allocated.
+ */
+ free_slab_obj_exts(slab);
mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
-(PAGE_SIZE << order));
@@ -2637,6 +2669,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
slab->objects = oo_objects(oo);
slab->inuse = 0;
slab->frozen = 0;
+ init_slab_obj_exts(slab);
account_slab(slab, oo_order(oo), s, flags);
@@ -2720,23 +2753,19 @@ static void discard_slab(struct kmem_cache *s, struct slab *slab)
free_slab(s, slab);
}
-/*
- * SLUB reuses PG_workingset bit to keep track of whether it's on
- * the per-node partial list.
- */
static inline bool slab_test_node_partial(const struct slab *slab)
{
- return folio_test_workingset(slab_folio(slab));
+ return test_bit(SL_partial, &slab->flags);
}
static inline void slab_set_node_partial(struct slab *slab)
{
- set_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
+ set_bit(SL_partial, &slab->flags);
}
static inline void slab_clear_node_partial(struct slab *slab)
{
- clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
+ clear_bit(SL_partial, &slab->flags);
}
/*
@@ -4272,7 +4301,12 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node)
flags = kmalloc_fix_flags(flags);
flags |= __GFP_COMP;
- folio = (struct folio *)alloc_pages_node_noprof(node, flags, order);
+
+ if (node == NUMA_NO_NODE)
+ folio = (struct folio *)alloc_frozen_pages_noprof(flags, order);
+ else
+ folio = (struct folio *)__alloc_frozen_pages_noprof(flags, order, node, NULL);
+
if (folio) {
ptr = folio_address(folio);
lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
@@ -4768,7 +4802,7 @@ static void free_large_kmalloc(struct folio *folio, void *object)
lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
-(PAGE_SIZE << order));
__folio_clear_large_kmalloc(folio);
- folio_put(folio);
+ free_frozen_pages(&folio->page, order);
}
/*
@@ -4933,12 +4967,12 @@ alloc_new:
* When slub_debug_orig_size() is off, krealloc() only knows about the bucket
* size of an allocation (but not the exact size it was allocated with) and
* hence implements the following semantics for shrinking and growing buffers
- * with __GFP_ZERO.
+ * with __GFP_ZERO::
*
- * new bucket
- * 0 size size
- * |--------|----------------|
- * | keep | zero |
+ * new bucket
+ * 0 size size
+ * |--------|----------------|
+ * | keep | zero |
*
* Otherwise, the original allocation size 'orig_size' could be used to
* precisely clear the requested size, and the new size will also be stored
@@ -4972,14 +5006,16 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
* We want to attempt a large physically contiguous block first because
* it is less likely to fragment multiple larger blocks and therefore
* contribute to a long term fragmentation less than vmalloc fallback.
- * However make sure that larger requests are not too disruptive - no
- * OOM killer and no allocation failure warnings as we have a fallback.
+ * However make sure that larger requests are not too disruptive - i.e.
+ * do not direct reclaim unless physically contiguous memory is preferred
+ * (__GFP_RETRY_MAYFAIL mode). We still kick in kswapd/kcompactd to
+ * start working in the background.
*/
if (size > PAGE_SIZE) {
flags |= __GFP_NOWARN;
if (!(flags & __GFP_RETRY_MAYFAIL))
- flags |= __GFP_NORETRY;
+ flags &= ~__GFP_DIRECT_RECLAIM;
/* nofail semantic is implemented by the vmalloc fallback */
flags &= ~__GFP_NOFAIL;
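A minimal userspace model of the gfp adjustment above, using made-up flag values purely to show the bit logic; the real __GFP_* constants and their semantics live in the kernel headers.

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE		4096u
/* Illustrative stand-in bits, not the real __GFP_* values. */
#define __GFP_NOWARN		0x1u
#define __GFP_RETRY_MAYFAIL	0x2u
#define __GFP_DIRECT_RECLAIM	0x4u
#define __GFP_NOFAIL		0x8u

static unsigned int kmalloc_gfp_adjust(unsigned int flags, size_t size)
{
	if (size > PAGE_SIZE) {
		flags |= __GFP_NOWARN;			/* a vmalloc fallback exists */
		if (!(flags & __GFP_RETRY_MAYFAIL))
			flags &= ~__GFP_DIRECT_RECLAIM;	/* don't stall in direct reclaim */
		flags &= ~__GFP_NOFAIL;			/* nofail is handled by the fallback */
	}
	return flags;
}

int main(void)
{
	/* A 2-page request asking for NOFAIL ends up as NOWARN only: prints 0x1 */
	printf("0x%x\n", kmalloc_gfp_adjust(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL, 2 * PAGE_SIZE));
	return 0;
}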
@@ -6150,7 +6186,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
return __kmem_cache_do_shrink(s);
}
-static int slab_mem_going_offline_callback(void *arg)
+static int slab_mem_going_offline_callback(void)
{
struct kmem_cache *s;
@@ -6164,46 +6200,13 @@ static int slab_mem_going_offline_callback(void *arg)
return 0;
}
-static void slab_mem_offline_callback(void *arg)
-{
- struct memory_notify *marg = arg;
- int offline_node;
-
- offline_node = marg->status_change_nid_normal;
-
- /*
- * If the node still has available memory. we need kmem_cache_node
- * for it yet.
- */
- if (offline_node < 0)
- return;
-
- mutex_lock(&slab_mutex);
- node_clear(offline_node, slab_nodes);
- /*
- * We no longer free kmem_cache_node structures here, as it would be
- * racy with all get_node() users, and infeasible to protect them with
- * slab_mutex.
- */
- mutex_unlock(&slab_mutex);
-}
-
-static int slab_mem_going_online_callback(void *arg)
+static int slab_mem_going_online_callback(int nid)
{
struct kmem_cache_node *n;
struct kmem_cache *s;
- struct memory_notify *marg = arg;
- int nid = marg->status_change_nid_normal;
int ret = 0;
/*
- * If the node's memory is already available, then kmem_cache_node is
- * already created. Nothing to do.
- */
- if (nid < 0)
- return 0;
-
- /*
* We are bringing a node online. No memory is available yet. We must
* allocate a kmem_cache_node structure in order to bring the node
* online.
@@ -6242,21 +6245,16 @@ out:
static int slab_memory_callback(struct notifier_block *self,
unsigned long action, void *arg)
{
+ struct node_notify *nn = arg;
+ int nid = nn->nid;
int ret = 0;
switch (action) {
- case MEM_GOING_ONLINE:
- ret = slab_mem_going_online_callback(arg);
- break;
- case MEM_GOING_OFFLINE:
- ret = slab_mem_going_offline_callback(arg);
- break;
- case MEM_OFFLINE:
- case MEM_CANCEL_ONLINE:
- slab_mem_offline_callback(arg);
+ case NODE_ADDING_FIRST_MEMORY:
+ ret = slab_mem_going_online_callback(nid);
break;
- case MEM_ONLINE:
- case MEM_CANCEL_OFFLINE:
+ case NODE_REMOVING_LAST_MEMORY:
+ ret = slab_mem_going_offline_callback();
break;
}
if (ret)
@@ -6324,14 +6322,14 @@ void __init kmem_cache_init(void)
* Initialize the nodemask for which we will allocate per node
* structures. Here we don't need taking slab_mutex yet.
*/
- for_each_node_state(node, N_NORMAL_MEMORY)
+ for_each_node_state(node, N_MEMORY)
node_set(node, slab_nodes);
create_boot_cache(kmem_cache_node, "kmem_cache_node",
sizeof(struct kmem_cache_node),
SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
- hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+ hotplug_node_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
/* Able to allocate the per node structures */
slab_state = PARTIAL;
diff --git a/mm/swap.c b/mm/swap.c
index 77b2d5997873..3632dd061beb 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -237,8 +237,9 @@ void folio_rotate_reclaimable(struct folio *folio)
folio_batch_add_and_move(folio, lru_move_tail, true);
}
-void lru_note_cost(struct lruvec *lruvec, bool file,
- unsigned int nr_io, unsigned int nr_rotated)
+void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
+ unsigned int nr_io, unsigned int nr_rotated)
+ __releases(lruvec->lru_lock)
{
unsigned long cost;
@@ -250,18 +251,14 @@ void lru_note_cost(struct lruvec *lruvec, bool file,
* different between them, adjust scan balance for CPU work.
*/
cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;
+ if (!cost) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ return;
+ }
- do {
+ for (;;) {
unsigned long lrusize;
- /*
- * Hold lruvec->lru_lock is safe here, since
- * 1) The pinned lruvec in reclaim, or
- * 2) From a pre-LRU page during refault (which also holds the
- * rcu lock, so would be safe even if the page was on the LRU
- * and could move simultaneously to a new lruvec).
- */
- spin_lock_irq(&lruvec->lru_lock);
/* Record cost event */
if (file)
lruvec->file_cost += cost;
@@ -285,14 +282,22 @@ void lru_note_cost(struct lruvec *lruvec, bool file,
lruvec->file_cost /= 2;
lruvec->anon_cost /= 2;
}
+
spin_unlock_irq(&lruvec->lru_lock);
- } while ((lruvec = parent_lruvec(lruvec)));
+ lruvec = parent_lruvec(lruvec);
+ if (!lruvec)
+ break;
+ spin_lock_irq(&lruvec->lru_lock);
+ }
}
void lru_note_cost_refault(struct folio *folio)
{
- lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
- folio_nr_pages(folio), 0);
+ struct lruvec *lruvec;
+
+ lruvec = folio_lruvec_lock_irq(folio);
+ lru_note_cost_unlock_irq(lruvec, folio_is_file_lru(folio),
+ folio_nr_pages(folio), 0);
}
static void lru_activate(struct lruvec *lruvec, struct folio *folio)
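The reworked loop above walks the lruvec hierarchy with hand-over-hand locking: it enters with the first lru_lock held and holds at most one lock at a time while charging each ancestor. A userspace pthreads sketch of that pattern follows; all names are stand-ins and the periodic cost-halving done by the kernel is omitted for brevity.

#include <pthread.h>
#include <stdio.h>

struct lruvec {
	pthread_mutex_t lru_lock;
	unsigned long file_cost;
	struct lruvec *parent;
};

/* Called with lruvec->lru_lock held; the lock is dropped before moving to
 * the parent, so only one level's lock is held at any time. */
static void note_cost_unlock(struct lruvec *lruvec, unsigned long cost)
{
	if (!cost) {
		pthread_mutex_unlock(&lruvec->lru_lock);
		return;
	}
	for (;;) {
		lruvec->file_cost += cost;
		pthread_mutex_unlock(&lruvec->lru_lock);
		lruvec = lruvec->parent;
		if (!lruvec)
			break;
		pthread_mutex_lock(&lruvec->lru_lock);
	}
}

int main(void)
{
	struct lruvec root = { PTHREAD_MUTEX_INITIALIZER, 0, NULL };
	struct lruvec child = { PTHREAD_MUTEX_INITIALIZER, 0, &root };

	pthread_mutex_lock(&child.lru_lock);	/* caller enters with the lock held */
	note_cost_unlock(&child, 32);
	printf("child=%lu root=%lu\n", child.file_cost, root.file_cost);
	return 0;
}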
@@ -309,7 +314,7 @@ static void lru_activate(struct lruvec *lruvec, struct folio *folio)
trace_mm_lru_activate(folio);
__count_vm_events(PGACTIVATE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages);
+ count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages);
}
#ifdef CONFIG_SMP
@@ -581,7 +586,7 @@ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio)
if (active) {
__count_vm_events(PGDEACTIVATE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
nr_pages);
}
}
@@ -599,7 +604,7 @@ static void lru_deactivate(struct lruvec *lruvec, struct folio *folio)
lruvec_add_folio(lruvec, folio);
__count_vm_events(PGDEACTIVATE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages);
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages);
}
static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio)
@@ -625,7 +630,7 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio)
lruvec_add_folio(lruvec, folio);
__count_vm_events(PGLAZYFREE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages);
+ count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages);
}
/*
diff --git a/mm/swap.h b/mm/swap.h
index 6f4a3f927edb..911ad5ff0f89 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -3,6 +3,8 @@
#define _MM_SWAP_H
struct mempolicy;
+struct swap_iocb;
+
extern int page_cluster;
#ifdef CONFIG_SWAP
@@ -20,8 +22,8 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
__swap_read_unplug(plug);
}
void swap_write_unplug(struct swap_iocb *sio);
-int swap_writepage(struct page *page, struct writeback_control *wbc);
-void __swap_writepage(struct folio *folio, struct writeback_control *wbc);
+int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
+void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
/* linux/mm/swap_state.c */
/* One swap address space for each 64M swap space */
@@ -106,6 +108,25 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
return find_next_bit(sis->zeromap, end, start) - start;
}
+static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+{
+ struct swap_info_struct *si = swp_swap_info(entry);
+ pgoff_t offset = swp_offset(entry);
+ int i;
+
+ /*
+ * While allocating a large folio and doing mTHP swapin, we need to
+ * ensure none of the entries is already cached, otherwise the mTHP
+ * folio will conflict with the folio in the swap cache.
+ */
+ for (i = 0; i < max_nr; i++) {
+ if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
+ return i;
+ }
+
+ return i;
+}
+
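A self-contained sketch of the scan non_swapcache_batch() performs over the per-entry swap_map bytes: stop at the first entry that already has the swap-cache bit set. The SWAP_HAS_CACHE value and the array contents here are assumptions for illustration, not the kernel's data structures.

#include <stdio.h>

#define SWAP_HAS_CACHE 0x40	/* assumed flag bit for this sketch */

static int non_swapcache_batch(const unsigned char *swap_map, int max_nr)
{
	int i;

	for (i = 0; i < max_nr; i++) {
		if (swap_map[i] & SWAP_HAS_CACHE)
			return i;	/* a cached sub-entry blocks mTHP swapin */
	}
	return i;		/* all max_nr entries are uncached */
}

int main(void)
{
	unsigned char map[8] = { 1, 1, 1, 1 | SWAP_HAS_CACHE, 1, 1, 1, 1 };

	/* Only 3 leading entries are free of swap cache, so an order-3
	 * (8-page) mTHP swapin would fall back to order 0: prints 3. */
	printf("%d\n", non_swapcache_batch(map, 8));
	return 0;
}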
#else /* CONFIG_SWAP */
struct swap_iocb;
static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
@@ -141,7 +162,8 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
return NULL;
}
-static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
+static inline int swap_writeout(struct folio *folio,
+ struct swap_iocb **swap_plug)
{
return 0;
}
@@ -199,6 +221,28 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
return 0;
}
+static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+{
+ return 0;
+}
#endif /* CONFIG_SWAP */
+/**
+ * folio_index - File index of a folio.
+ * @folio: The folio.
+ *
+ * For a folio which is either in the page cache or the swap cache,
+ * return its index within the address_space it belongs to. If you know
+ * the folio is definitely in the page cache, you can look at the folio's
+ * index directly.
+ *
+ * Return: The index (offset in units of pages) of a folio in its file.
+ */
+static inline pgoff_t folio_index(struct folio *folio)
+{
+ if (unlikely(folio_test_swapcache(folio)))
+ return swap_cache_index(folio->swap);
+ return folio->index;
+}
+
#endif /* _MM_SWAP_H */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 68fd981b514f..c354435a0923 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -30,7 +30,6 @@
* vmscan's shrink_folio_list.
*/
static const struct address_space_operations swap_aops = {
- .writepage = swap_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_MIGRATION
.migrate_folio = migrate_folio,
@@ -232,13 +231,11 @@ void free_swap_cache(struct folio *folio)
}
/*
- * Perform a free_page(), also freeing any swap cache associated with
- * this page if it is the last user of the page.
+ * Free a folio, also freeing any swap cache associated with
+ * this folio if it is the last user.
*/
-void free_page_and_swap_cache(struct page *page)
+void free_folio_and_swap_cache(struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
free_swap_cache(folio);
if (!is_huge_zero_folio(folio))
folio_put(folio);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2eff8b51a945..b4f3cc712580 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -52,9 +52,9 @@
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
-static void swap_entry_range_free(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- swp_entry_t entry, unsigned int nr_pages);
+static void swap_entries_free(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int nr_pages);
static void swap_range_alloc(struct swap_info_struct *si,
unsigned int nr_entries);
static bool folio_swapcache_freeable(struct folio *folio);
@@ -192,7 +192,7 @@ static bool swap_is_last_map(struct swap_info_struct *si,
unsigned char *map_end = map + nr_pages;
unsigned char count = *map;
- if (swap_count(count) != 1)
+ if (swap_count(count) != 1 && swap_count(count) != SWAP_MAP_SHMEM)
return false;
while (++map < map_end) {
@@ -956,9 +956,8 @@ new_cluster:
}
/*
- * We don't have free cluster but have some clusters in
- * discarding, do discard now and reclaim them, then
- * reread cluster_next_cpu since we dropped si->lock
+ * We don't have a free cluster but have some clusters in discarding:
+ * do the discard now and reclaim them.
*/
if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si))
goto new_cluster;
@@ -1115,6 +1114,7 @@ static void swap_range_alloc(struct swap_info_struct *si,
if (vm_swap_full())
schedule_work(&si->reclaim_work);
}
+ atomic_long_sub(nr_entries, &nr_swap_pages);
}
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
@@ -1272,13 +1272,22 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
- /*
- * Should not even be attempting large allocations when huge
- * page swap is disabled. Warn and fail the allocation.
- */
- if (order && (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER)) {
- VM_WARN_ON_ONCE(1);
- return -EINVAL;
+ if (order) {
+ /*
+ * Reject large allocation when THP_SWAP is disabled;
+ * the caller should split the folio and try again.
+ */
+ if (!IS_ENABLED(CONFIG_THP_SWAP))
+ return -EAGAIN;
+
+ /*
+ * Allocation size should never exceed cluster size
+ * (HPAGE_PMD_SIZE).
+ */
+ if (size > SWAPFILE_CLUSTER) {
+ VM_WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
}
local_lock(&percpu_swap_cluster.lock);
@@ -1304,7 +1313,6 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL))
goto out_free;
- atomic_long_sub(size, &nr_swap_pages);
return 0;
out_free:
@@ -1346,10 +1354,12 @@ out:
return NULL;
}
-static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
- unsigned long offset,
- unsigned char usage)
+static unsigned char swap_entry_put_locked(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry,
+ unsigned char usage)
{
+ unsigned long offset = swp_offset(entry);
unsigned char count;
unsigned char has_cache;
@@ -1381,7 +1391,7 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
if (usage)
WRITE_ONCE(si->swap_map[offset], usage);
else
- WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE);
+ swap_entries_free(si, ci, entry, 1);
return usage;
}
@@ -1452,71 +1462,104 @@ put_out:
return NULL;
}
-static unsigned char __swap_entry_free(struct swap_info_struct *si,
- swp_entry_t entry)
+static void swap_entries_put_cache(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
{
- struct swap_cluster_info *ci;
unsigned long offset = swp_offset(entry);
- unsigned char usage;
+ struct swap_cluster_info *ci;
ci = lock_cluster(si, offset);
- usage = __swap_entry_free_locked(si, offset, 1);
- if (!usage)
- swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1);
+ if (swap_only_has_cache(si, offset, nr))
+ swap_entries_free(si, ci, entry, nr);
+ else {
+ for (int i = 0; i < nr; i++, entry.val++)
+ swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE);
+ }
unlock_cluster(ci);
-
- return usage;
}
-static bool __swap_entries_free(struct swap_info_struct *si,
- swp_entry_t entry, int nr)
+static bool swap_entries_put_map(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
{
unsigned long offset = swp_offset(entry);
- unsigned int type = swp_type(entry);
struct swap_cluster_info *ci;
bool has_cache = false;
unsigned char count;
int i;
- if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1)
+ if (nr <= 1)
goto fallback;
- /* cross into another cluster */
- if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
+ count = swap_count(data_race(si->swap_map[offset]));
+ if (count != 1 && count != SWAP_MAP_SHMEM)
goto fallback;
ci = lock_cluster(si, offset);
if (!swap_is_last_map(si, offset, nr, &has_cache)) {
- unlock_cluster(ci);
- goto fallback;
+ goto locked_fallback;
}
- for (i = 0; i < nr; i++)
- WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
if (!has_cache)
- swap_entry_range_free(si, ci, entry, nr);
+ swap_entries_free(si, ci, entry, nr);
+ else
+ for (i = 0; i < nr; i++)
+ WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
unlock_cluster(ci);
return has_cache;
fallback:
- for (i = 0; i < nr; i++) {
- if (data_race(si->swap_map[offset + i])) {
- count = __swap_entry_free(si, swp_entry(type, offset + i));
- if (count == SWAP_HAS_CACHE)
- has_cache = true;
- } else {
- WARN_ON_ONCE(1);
- }
+ ci = lock_cluster(si, offset);
+locked_fallback:
+ for (i = 0; i < nr; i++, entry.val++) {
+ count = swap_entry_put_locked(si, ci, entry, 1);
+ if (count == SWAP_HAS_CACHE)
+ has_cache = true;
}
+ unlock_cluster(ci);
return has_cache;
+
}
/*
- * Drop the last HAS_CACHE flag of swap entries, caller have to
- * ensure all entries belong to the same cgroup.
+ * Only functions with an "_nr" suffix can free entries spanning
+ * multiple clusters, so ensure the range stays within a single cluster
+ * when freeing entries with functions lacking the "_nr" suffix.
*/
-static void swap_entry_range_free(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- swp_entry_t entry, unsigned int nr_pages)
+static bool swap_entries_put_map_nr(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
+{
+ int cluster_nr, cluster_rest;
+ unsigned long offset = swp_offset(entry);
+ bool has_cache = false;
+
+ cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER;
+ while (nr) {
+ cluster_nr = min(nr, cluster_rest);
+ has_cache |= swap_entries_put_map(si, entry, cluster_nr);
+ cluster_rest = SWAPFILE_CLUSTER;
+ nr -= cluster_nr;
+ entry.val += cluster_nr;
+ }
+
+ return has_cache;
+}
+
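The new swap_entries_put_map_nr() splits a range so each call to swap_entries_put_map() stays inside one cluster. Below is a standalone model of just that chunking arithmetic; SWAPFILE_CLUSTER is given an assumed value and the printed chunks stand in for the per-cluster calls.

#include <stdio.h>

#define SWAPFILE_CLUSTER 512	/* entries per cluster; assumed for the sketch */

/* Split [offset, offset + nr) into chunks that never cross a cluster
 * boundary, mirroring the loop structure above. */
static void put_map_chunked(unsigned long offset, int nr)
{
	int cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER;

	while (nr) {
		int chunk = nr < cluster_rest ? nr : cluster_rest;

		printf("chunk: offset=%lu len=%d\n", offset, chunk);
		offset += chunk;
		nr -= chunk;
		cluster_rest = SWAPFILE_CLUSTER;	/* later chunks start at a boundary */
	}
}

int main(void)
{
	/* 1000 entries starting 100 entries into a cluster:
	 * expect chunks of 412, 512 and 76. */
	put_map_chunked(5 * SWAPFILE_CLUSTER + 100, 1000);
	return 0;
}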
+/*
+ * Check if it's the last ref of a swap entry in the freeing path.
+ * Qualifying values are 1, SWAP_HAS_CACHE and SWAP_MAP_SHMEM.
+ */
+static inline bool __maybe_unused swap_is_last_ref(unsigned char count)
+{
+ return (count == SWAP_HAS_CACHE) || (count == 1) ||
+ (count == SWAP_MAP_SHMEM);
+}
+
+/*
+ * Drop the last ref of swap entries; the caller has to ensure all entries
+ * belong to the same cgroup and cluster.
+ */
+static void swap_entries_free(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int nr_pages)
{
unsigned long offset = swp_offset(entry);
unsigned char *map = si->swap_map + offset;
@@ -1529,7 +1572,7 @@ static void swap_entry_range_free(struct swap_info_struct *si,
ci->count -= nr_pages;
do {
- VM_BUG_ON(*map != SWAP_HAS_CACHE);
+ VM_BUG_ON(!swap_is_last_ref(*map));
*map = 0;
} while (++map < map_end);
@@ -1542,21 +1585,6 @@ static void swap_entry_range_free(struct swap_info_struct *si,
partial_free_cluster(si, ci);
}
-static void cluster_swap_free_nr(struct swap_info_struct *si,
- unsigned long offset, int nr_pages,
- unsigned char usage)
-{
- struct swap_cluster_info *ci;
- unsigned long end = offset + nr_pages;
-
- ci = lock_cluster(si, offset);
- do {
- if (!__swap_entry_free_locked(si, offset, usage))
- swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1);
- } while (++offset < end);
- unlock_cluster(ci);
-}
-
/*
* Caller has made sure that the swap device corresponding to entry
* is still around or has not been recycled.
@@ -1573,7 +1601,7 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
while (nr_pages) {
nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
- cluster_swap_free_nr(sis, offset, nr, 1);
+ swap_entries_put_map(sis, swp_entry(sis->type, offset), nr);
offset += nr;
nr_pages -= nr;
}
@@ -1584,8 +1612,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
*/
void put_swap_folio(struct folio *folio, swp_entry_t entry)
{
- unsigned long offset = swp_offset(entry);
- struct swap_cluster_info *ci;
struct swap_info_struct *si;
int size = 1 << swap_entry_order(folio_order(folio));
@@ -1593,16 +1619,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
if (!si)
return;
- ci = lock_cluster(si, offset);
- if (swap_only_has_cache(si, offset, size))
- swap_entry_range_free(si, ci, entry, size);
- else {
- for (int i = 0; i < size; i++, entry.val++) {
- if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE))
- swap_entry_range_free(si, ci, entry, 1);
- }
- }
- unlock_cluster(ci);
+ swap_entries_put_cache(si, entry, size);
}
int __swap_count(swp_entry_t entry)
@@ -1797,7 +1814,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
/*
* First free all entries in the range.
*/
- any_only_cache = __swap_entries_free(si, entry, nr);
+ any_only_cache = swap_entries_put_map_nr(si, entry, nr);
/*
* Short-circuit the below loop if none of the entries had their
@@ -1807,13 +1824,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
goto out;
/*
- * Now go back over the range trying to reclaim the swap cache. This is
- * more efficient for large folios because we will only try to reclaim
- * the swap once per folio in the common case. If we do
- * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the
- * latter will get a reference and lock the folio for every individual
- * page but will only succeed once the swap slot for every subpage is
- * zero.
+ * Now go back over the range trying to reclaim the swap cache.
*/
for (offset = start_offset; offset < end_offset; offset += nr) {
nr = 1;
@@ -2359,7 +2370,7 @@ retry:
* Limit the number of retries? No: when mmget_not_zero()
* above fails, that mm is likely to be freeing swap from
* exit_mmap(), which proceeds at its own independent pace;
- * and even shmem_writepage() could have been preempted after
+ * and even shmem_writeout() could have been preempted after
* folio_alloc_swap(), temporarily hiding that swap. It's easy
* and robust (though cpu-intensive) just to keep retrying.
*/
@@ -3129,43 +3140,30 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
return maxpages;
}
-static int setup_swap_map_and_extents(struct swap_info_struct *si,
- union swap_header *swap_header,
- unsigned char *swap_map,
- unsigned long maxpages,
- sector_t *span)
+static int setup_swap_map(struct swap_info_struct *si,
+ union swap_header *swap_header,
+ unsigned char *swap_map,
+ unsigned long maxpages)
{
- unsigned int nr_good_pages;
unsigned long i;
- int nr_extents;
-
- nr_good_pages = maxpages - 1; /* omit header page */
+ swap_map[0] = SWAP_MAP_BAD; /* omit header page */
for (i = 0; i < swap_header->info.nr_badpages; i++) {
unsigned int page_nr = swap_header->info.badpages[i];
if (page_nr == 0 || page_nr > swap_header->info.last_page)
return -EINVAL;
if (page_nr < maxpages) {
swap_map[page_nr] = SWAP_MAP_BAD;
- nr_good_pages--;
+ si->pages--;
}
}
- if (nr_good_pages) {
- swap_map[0] = SWAP_MAP_BAD;
- si->max = maxpages;
- si->pages = nr_good_pages;
- nr_extents = setup_swap_extents(si, span);
- if (nr_extents < 0)
- return nr_extents;
- nr_good_pages = si->pages;
- }
- if (!nr_good_pages) {
+ if (!si->pages) {
pr_warn("Empty swap-file\n");
return -EINVAL;
}
- return nr_extents;
+ return 0;
}
#define SWAP_CLUSTER_INFO_COLS \
@@ -3205,13 +3203,17 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
* Mark unusable pages as unavailable. The clusters aren't
* marked free yet, so no list operations are involved yet.
*
- * See setup_swap_map_and_extents(): header page, bad pages,
+ * See setup_swap_map(): header page, bad pages,
* and the EOF part of the last cluster.
*/
inc_cluster_info_page(si, cluster_info, 0);
- for (i = 0; i < swap_header->info.nr_badpages; i++)
- inc_cluster_info_page(si, cluster_info,
- swap_header->info.badpages[i]);
+ for (i = 0; i < swap_header->info.nr_badpages; i++) {
+ unsigned int page_nr = swap_header->info.badpages[i];
+
+ if (page_nr >= maxpages)
+ continue;
+ inc_cluster_info_page(si, cluster_info, page_nr);
+ }
for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
inc_cluster_info_page(si, cluster_info, i);
@@ -3323,6 +3325,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
/*
+ * The swap subsystem needs a major overhaul to support this.
+ * It doesn't work yet so just disable it for now.
+ */
+ if (mapping_min_folio_order(mapping) > 0) {
+ error = -EINVAL;
+ goto bad_swap_unlock_inode;
+ }
+
+ /*
* Read the swap header.
*/
if (!mapping->a_ops->read_folio) {
@@ -3342,6 +3353,21 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap_unlock_inode;
}
+ si->max = maxpages;
+ si->pages = maxpages - 1;
+ nr_extents = setup_swap_extents(si, &span);
+ if (nr_extents < 0) {
+ error = nr_extents;
+ goto bad_swap_unlock_inode;
+ }
+ if (si->pages != si->max - 1) {
+ pr_err("swap:%u != (max:%u - 1)\n", si->pages, si->max);
+ error = -EINVAL;
+ goto bad_swap_unlock_inode;
+ }
+
+ maxpages = si->max;
+
/* OK, set up the swap map and apply the bad block list */
swap_map = vzalloc(maxpages);
if (!swap_map) {
@@ -3353,12 +3379,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (error)
goto bad_swap_unlock_inode;
- nr_extents = setup_swap_map_and_extents(si, swap_header, swap_map,
- maxpages, &span);
- if (unlikely(nr_extents < 0)) {
- error = nr_extents;
+ error = setup_swap_map(si, swap_header, swap_map, maxpages);
+ if (error)
goto bad_swap_unlock_inode;
- }
/*
* Use kvmalloc_array instead of bitmap_zalloc as the allocation order might
@@ -3636,11 +3659,13 @@ int swapcache_prepare(swp_entry_t entry, int nr)
return __swap_duplicate(entry, SWAP_HAS_CACHE, nr);
}
+/*
+ * The caller should ensure the entries belong to the same folio so
+ * that they won't cross a cluster boundary.
+ */
void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
{
- unsigned long offset = swp_offset(entry);
-
- cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE);
+ swap_entries_put_cache(si, entry, nr);
}
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
@@ -3649,21 +3674,6 @@ struct swap_info_struct *swp_swap_info(swp_entry_t entry)
}
/*
- * out-of-line methods to avoid include hell.
- */
-struct address_space *swapcache_mapping(struct folio *folio)
-{
- return swp_swap_info(folio->swap)->swap_file->f_mapping;
-}
-EXPORT_SYMBOL_GPL(swapcache_mapping);
-
-pgoff_t __folio_swap_cache_index(struct folio *folio)
-{
- return swap_cache_index(folio->swap);
-}
-EXPORT_SYMBOL_GPL(__folio_swap_cache_index);
-
-/*
* add_swap_count_continuation - called when a swap count is duplicated
* beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
* page of the original vmalloc'ed swap_map, to hold the continuation count
@@ -3780,7 +3790,7 @@ outer:
* into, carry if so, or else fail until a new continuation page is allocated;
* when the original swap_map count is decremented from 0 with continuation,
* borrow from the continuation and report whether it still holds more.
- * Called while __swap_duplicate() or caller of __swap_entry_free_locked()
+ * Called while __swap_duplicate() or caller of swap_entry_put_locked()
* holds cluster lock.
*/
static bool swap_count_continued(struct swap_info_struct *si,
diff --git a/mm/truncate.c b/mm/truncate.c
index 5d98054094d1..91eb92a5ce4f 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -191,6 +191,7 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
{
loff_t pos = folio_pos(folio);
+ size_t size = folio_size(folio);
unsigned int offset, length;
struct page *split_at, *split_at2;
@@ -198,14 +199,13 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
offset = start - pos;
else
offset = 0;
- length = folio_size(folio);
- if (pos + length <= (u64)end)
- length = length - offset;
+ if (pos + size <= (u64)end)
+ length = size - offset;
else
length = end + 1 - pos - offset;
folio_wait_writeback(folio);
- if (length == folio_size(folio)) {
+ if (length == size) {
truncate_inode_folio(folio->mapping, folio);
return true;
}
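The offset/length computation above decides which byte range of the folio falls inside the truncation range. The sketch below is a purely illustrative, standalone restatement of that arithmetic (treating end as the last byte of the range, as the code does); it is not kernel code.

#include <stdio.h>
#include <stddef.h>

/* Compute the in-folio byte range [offset, offset + length) covered by a
 * truncation of [start, end] (end inclusive), for a folio of `size` bytes
 * starting at file position `pos`. */
static void partial_range(long long pos, size_t size,
			  long long start, long long end,
			  unsigned int *offset, unsigned int *length)
{
	if (pos < start)
		*offset = start - pos;
	else
		*offset = 0;

	if (pos + (long long)size <= end)
		*length = size - *offset;
	else
		*length = end + 1 - pos - *offset;
}

int main(void)
{
	unsigned int offset, length;

	/* A 64KiB folio at position 0, truncating bytes 4096..12287:
	 * the affected span is 8192 bytes starting at in-folio offset 4096. */
	partial_range(0, 65536, 4096, 12287, &offset, &length);
	printf("offset=%u length=%u\n", offset, length);
	return 0;
}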
@@ -224,16 +224,20 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
return true;
split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE);
- split_at2 = folio_page(folio,
- PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE);
-
if (!try_folio_split(folio, split_at, NULL)) {
/*
* try to split at offset + length to make sure folios within
* the range can be dropped, especially to avoid memory waste
* for shmem truncate
*/
- struct folio *folio2 = page_folio(split_at2);
+ struct folio *folio2;
+
+ if (offset + length == size)
+ goto no_split;
+
+ split_at2 = folio_page(folio,
+ PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE);
+ folio2 = page_folio(split_at2);
if (!folio_try_get(folio2))
goto no_split;
@@ -421,7 +425,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
- /* We rely upon deletion not changing page->index */
+ /* We rely upon deletion not changing folio->index */
if (xa_is_value(folio))
continue;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index fbf2cf62ab9f..cbed91b09640 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -561,7 +561,7 @@ retry:
}
while (src_addr < src_start + len) {
- BUG_ON(dst_addr >= dst_start + len);
+ VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
/*
* Serialize via vma_lock and hugetlb_fault_mutex.
@@ -602,7 +602,7 @@ retry:
if (unlikely(err == -ENOENT)) {
up_read(&ctx->map_changing_lock);
uffd_mfill_unlock(dst_vma);
- BUG_ON(!folio);
+ VM_WARN_ON_ONCE(!folio);
err = copy_folio_from_user(folio,
(const void __user *)src_addr, true);
@@ -614,7 +614,7 @@ retry:
dst_vma = NULL;
goto retry;
} else
- BUG_ON(folio);
+ VM_WARN_ON_ONCE(folio);
if (!err) {
dst_addr += vma_hpagesize;
@@ -635,9 +635,9 @@ out_unlock_vma:
out:
if (folio)
folio_put(folio);
- BUG_ON(copied < 0);
- BUG_ON(err > 0);
- BUG_ON(!copied && !err);
+ VM_WARN_ON_ONCE(copied < 0);
+ VM_WARN_ON_ONCE(err > 0);
+ VM_WARN_ON_ONCE(!copied && !err);
return copied ? copied : err;
}
#else /* !CONFIG_HUGETLB_PAGE */
@@ -711,12 +711,12 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
/*
* Sanitize the command parameters:
*/
- BUG_ON(dst_start & ~PAGE_MASK);
- BUG_ON(len & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(len & ~PAGE_MASK);
/* Does the address range wrap, or is the span zero-sized? */
- BUG_ON(src_start + len <= src_start);
- BUG_ON(dst_start + len <= dst_start);
+ VM_WARN_ON_ONCE(src_start + len <= src_start);
+ VM_WARN_ON_ONCE(dst_start + len <= dst_start);
src_addr = src_start;
dst_addr = dst_start;
@@ -775,7 +775,7 @@ retry:
while (src_addr < src_start + len) {
pmd_t dst_pmdval;
- BUG_ON(dst_addr >= dst_start + len);
+ VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
if (unlikely(!dst_pmd)) {
@@ -795,8 +795,8 @@ retry:
* (This includes the case where the PMD used to be THP and
* changed back to none after __pte_alloc().)
*/
- if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) ||
- pmd_devmap(dst_pmdval))) {
+ if (unlikely(!pmd_present(dst_pmdval) ||
+ pmd_trans_huge(dst_pmdval))) {
err = -EEXIST;
break;
}
@@ -818,7 +818,7 @@ retry:
up_read(&ctx->map_changing_lock);
uffd_mfill_unlock(dst_vma);
- BUG_ON(!folio);
+ VM_WARN_ON_ONCE(!folio);
kaddr = kmap_local_folio(folio, 0);
err = copy_from_user(kaddr,
@@ -832,7 +832,7 @@ retry:
flush_dcache_folio(folio);
goto retry;
} else
- BUG_ON(folio);
+ VM_WARN_ON_ONCE(folio);
if (!err) {
dst_addr += PAGE_SIZE;
@@ -852,9 +852,9 @@ out_unlock:
out:
if (folio)
folio_put(folio);
- BUG_ON(copied < 0);
- BUG_ON(err > 0);
- BUG_ON(!copied && !err);
+ VM_WARN_ON_ONCE(copied < 0);
+ VM_WARN_ON_ONCE(err > 0);
+ VM_WARN_ON_ONCE(!copied && !err);
return copied ? copied : err;
}
@@ -940,11 +940,11 @@ int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
/*
* Sanitize the command parameters:
*/
- BUG_ON(start & ~PAGE_MASK);
- BUG_ON(len & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(start & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(len & ~PAGE_MASK);
/* Does the address range wrap, or is the span zero-sized? */
- BUG_ON(start + len <= start);
+ VM_WARN_ON_ONCE(start + len <= start);
mmap_read_lock(dst_mm);
@@ -1063,9 +1063,14 @@ static int move_present_pte(struct mm_struct *mm,
folio_move_anon_rmap(src_folio, dst_vma);
src_folio->index = linear_page_index(dst_vma, dst_addr);
- orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
- /* Follow mremap() behavior and treat the entry dirty after the move */
- orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma);
+ orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
+ /* Set soft dirty bit so userspace can notice the pte was moved */
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
+#endif
+ if (pte_dirty(orig_src_pte))
+ orig_dst_pte = pte_mkdirty(orig_dst_pte);
+ orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
out:
@@ -1079,8 +1084,18 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
pte_t orig_dst_pte, pte_t orig_src_pte,
pmd_t *dst_pmd, pmd_t dst_pmdval,
spinlock_t *dst_ptl, spinlock_t *src_ptl,
- struct folio *src_folio)
+ struct folio *src_folio,
+ struct swap_info_struct *si, swp_entry_t entry)
{
+ /*
+ * Check if the folio still belongs to the target swap entry after
+ * acquiring the lock. The folio can be freed from the swap cache while
+ * it is not locked.
+ */
+ if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
+ entry.val != src_folio->swap.val))
+ return -EAGAIN;
+
double_pt_lock(dst_ptl, src_ptl);
if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
@@ -1097,9 +1112,31 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
if (src_folio) {
folio_move_anon_rmap(src_folio, dst_vma);
src_folio->index = linear_page_index(dst_vma, dst_addr);
+ } else {
+ /*
+ * Check if the swap entry is cached after acquiring the src_pte
+ * lock. Otherwise, we might miss a newly loaded swap cache folio.
+ *
+ * Check swap_map directly to minimize overhead; READ_ONCE is sufficient.
+ * We are trying to catch newly added swap cache; the only possible case is
+ * when a folio is swapped in and out again, staying in the swap cache and
+ * using the same entry before the PTE check above. The PTL is acquired and
+ * released twice, each time after updating the swap_map's flag, so holding
+ * the PTL here ensures we see the updated value. A false positive is
+ * possible, e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching
+ * the cache, or during the tiny synchronization window between swap cache
+ * and swap_map, but it will be gone very quickly and the worst result is retry jitter.
+ */
+ if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
+ double_pt_unlock(dst_ptl, src_ptl);
+ return -EAGAIN;
+ }
}
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
+#endif
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
@@ -1404,7 +1441,7 @@ retry:
}
err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
- dst_ptl, src_ptl, src_folio);
+ dst_ptl, src_ptl, src_folio, si, entry);
}
out:
@@ -1701,15 +1738,13 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
ssize_t moved = 0;
/* Sanitize the command parameters. */
- if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
- WARN_ON_ONCE(dst_start & ~PAGE_MASK) ||
- WARN_ON_ONCE(len & ~PAGE_MASK))
- goto out;
+ VM_WARN_ON_ONCE(src_start & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(len & ~PAGE_MASK);
/* Does the address range wrap, or is the span zero-sized? */
- if (WARN_ON_ONCE(src_start + len <= src_start) ||
- WARN_ON_ONCE(dst_start + len <= dst_start))
- goto out;
+ VM_WARN_ON_ONCE(src_start + len < src_start);
+ VM_WARN_ON_ONCE(dst_start + len < dst_start);
err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
if (err)
@@ -1783,12 +1818,6 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
ptl = pmd_trans_huge_lock(src_pmd, src_vma);
if (ptl) {
- if (pmd_devmap(*src_pmd)) {
- spin_unlock(ptl);
- err = -ENOENT;
- break;
- }
-
/* Check if we can move the pmd without splitting it. */
if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
!pmd_none(dst_pmdval)) {
@@ -1859,18 +1888,18 @@ out_unlock:
up_read(&ctx->map_changing_lock);
uffd_move_unlock(dst_vma, src_vma);
out:
- VM_WARN_ON(moved < 0);
- VM_WARN_ON(err > 0);
- VM_WARN_ON(!moved && !err);
+ VM_WARN_ON_ONCE(moved < 0);
+ VM_WARN_ON_ONCE(err > 0);
+ VM_WARN_ON_ONCE(!moved && !err);
return moved ? moved : err;
}
static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
- vm_flags_t flags)
+ vm_flags_t vm_flags)
{
- const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
+ const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP;
- vm_flags_reset(vma, flags);
+ vm_flags_reset(vma, vm_flags);
/*
* For shared mappings, we want to enable writenotify while
* userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
@@ -1882,12 +1911,12 @@ static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
static void userfaultfd_set_ctx(struct vm_area_struct *vma,
struct userfaultfd_ctx *ctx,
- unsigned long flags)
+ vm_flags_t vm_flags)
{
vma_start_write(vma);
vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx};
userfaultfd_set_vm_flags(vma,
- (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags);
+ (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags);
}
void userfaultfd_reset_ctx(struct vm_area_struct *vma)
@@ -1902,6 +1931,14 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
unsigned long end)
{
struct vm_area_struct *ret;
+ bool give_up_on_oom = false;
+
+ /*
+ * If we are modifying only and not splitting, just give up on the merge
+ * if OOM prevents us from merging successfully.
+ */
+ if (start == vma->vm_start && end == vma->vm_end)
+ give_up_on_oom = true;
/* Reset ptes for the whole vma range if wr-protected */
if (userfaultfd_wp(vma))
@@ -1909,7 +1946,7 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
vma->vm_flags & ~__VM_UFFD_FLAGS,
- NULL_VM_UFFD_CTX);
+ NULL_VM_UFFD_CTX, give_up_on_oom);
/*
* In the vma_merge() successful mprotect-like case 8:
@@ -1925,14 +1962,14 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
/* Assumes mmap write lock taken, and mm_struct pinned. */
int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
struct vm_area_struct *vma,
- unsigned long vm_flags,
+ vm_flags_t vm_flags,
unsigned long start, unsigned long end,
bool wp_async)
{
VMA_ITERATOR(vmi, ctx->mm, start);
struct vm_area_struct *prev = vma_prev(&vmi);
unsigned long vma_end;
- unsigned long new_flags;
+ vm_flags_t new_flags;
if (vma->vm_start < start)
prev = vma;
@@ -1940,10 +1977,10 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
for_each_vma_range(vmi, vma, end) {
cond_resched();
- BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
- BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
- vma->vm_userfaultfd_ctx.ctx != ctx);
- WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
+ VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async));
+ VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx &&
+ vma->vm_userfaultfd_ctx.ctx != ctx);
+ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));
/*
* Nothing to do: this vma is already registered into this
@@ -1960,7 +1997,8 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
new_flags,
- (struct vm_userfaultfd_ctx){ctx});
+ (struct vm_userfaultfd_ctx){ctx},
+ /* give_up_on_oom = */false);
if (IS_ERR(vma))
return PTR_ERR(vma);
@@ -2018,8 +2056,8 @@ void userfaultfd_release_all(struct mm_struct *mm,
prev = NULL;
for_each_vma(vmi, vma) {
cond_resched();
- BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
- !!(vma->vm_flags & __VM_UFFD_FLAGS));
+ VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^
+ !!(vma->vm_flags & __VM_UFFD_FLAGS));
if (vma->vm_userfaultfd_ctx.ctx != ctx) {
prev = vma;
continue;
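
A minimal userspace sketch of the UFFDIO_MOVE path affected above, not part of this diff: the userfaultfd setup and registration are assumed, and the move field semantics are taken to mirror UFFDIO_COPY. It simply retries the transient EAGAIN that the swap-cache check in move_swap_pte() can now produce.

#include <errno.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

/* Move [src, src + len) to [dst, dst + len); uffd is an initialised,
 * registered userfaultfd descriptor (setup elided). */
static int move_range(int uffd, unsigned long dst, unsigned long src,
		      unsigned long len)
{
	while (len) {
		struct uffdio_move mv = {
			.dst = dst,
			.src = src,
			.len = len,
			.mode = 0,
		};

		if (ioctl(uffd, UFFDIO_MOVE, &mv) == 0)
			return 0;
		if (mv.move > 0) {
			/* Partial move: skip past what was already moved. */
			dst += mv.move;
			src += mv.move;
			len -= mv.move;
		}
		if (errno != EAGAIN)
			return -1;
		/* EAGAIN: e.g. the folio briefly sat in the swap cache; retry. */
	}
	return 0;
}
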
diff --git a/mm/util.c b/mm/util.c
index 448117da071f..f814e6a59ab1 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -25,6 +25,7 @@
#include <linux/sizes.h>
#include <linux/compat.h>
#include <linux/fsnotify.h>
+#include <linux/page_idle.h>
#include <linux/uaccess.h>
@@ -670,9 +671,9 @@ struct anon_vma *folio_anon_vma(const struct folio *folio)
{
unsigned long mapping = (unsigned long)folio->mapping;
- if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+ if ((mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
return NULL;
- return (void *)(mapping - PAGE_MAPPING_ANON);
+ return (void *)(mapping - FOLIO_MAPPING_ANON);
}
/**
@@ -699,7 +700,7 @@ struct address_space *folio_mapping(struct folio *folio)
return swap_address_space(folio->swap);
mapping = folio->mapping;
- if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
+ if ((unsigned long)mapping & FOLIO_MAPPING_FLAGS)
return NULL;
return mapping;
@@ -1131,3 +1132,152 @@ void flush_dcache_folio(struct folio *folio)
}
EXPORT_SYMBOL(flush_dcache_folio);
#endif
+
+/**
+ * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an
+ * existing VMA
+ * @file: The file which possesses an f_op->mmap_prepare() hook
+ * @vma: The VMA to apply the .mmap_prepare() hook to.
+ *
+ * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain
+ * 'wrapper' file systems invoke a nested mmap hook of an underlying file.
+ *
+ * Until all filesystems are converted to use .mmap_prepare(), we must be
+ * conservative and continue to invoke these 'wrapper' filesystems using the
+ * deprecated .mmap() hook.
+ *
+ * However, we have a problem if the underlying file system possesses an
+ * .mmap_prepare() hook, as we are in a different context when we invoke the
+ * .mmap() hook, already having a VMA to deal with.
+ *
+ * compat_vma_mmap_prepare() is a compatibility function that takes VMA state,
+ * establishes a struct vm_area_desc descriptor, passes it to the underlying
+ * .mmap_prepare() hook and applies any changes performed by it.
+ *
+ * Once the conversion of filesystems is complete this function will no longer
+ * be required and will be removed.
+ *
+ * Returns: 0 on success, or an error code on failure.
+ */
+int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma)
+{
+ struct vm_area_desc desc;
+ int err;
+
+ err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc));
+ if (err)
+ return err;
+ set_vma_from_desc(vma, &desc);
+
+ return 0;
+}
+EXPORT_SYMBOL(compat_vma_mmap_prepare);
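
A hedged illustration of the intended caller, not part of this diff: a 'wrapper' filesystem's legacy .mmap hook forwarding to a backing file. wrapfs_lower_file() is a hypothetical accessor and file reference handling is omitted.

static int wrapfs_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct file *lower = wrapfs_lower_file(file);	/* hypothetical accessor */

	/* Prefer the new hook when the backing filesystem provides one. */
	if (lower->f_op->mmap_prepare)
		return compat_vma_mmap_prepare(lower, vma);

	/* Otherwise fall back to the deprecated .mmap() hook. */
	return call_mmap(lower, vma);
}
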
+
+static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
+ const struct page *page)
+{
+ /*
+ * Only the first page of a high-order buddy page has PageBuddy() set.
+ * So we have to check manually whether this page is part of a high-
+ * order buddy page.
+ */
+ if (PageBuddy(page))
+ ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;
+ else if (page_count(page) == 0 && is_free_buddy_page(page))
+ ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;
+
+ if (folio_test_idle(folio))
+ ps->flags |= PAGE_SNAPSHOT_PG_IDLE;
+}
+
+/**
+ * snapshot_page() - Create a snapshot of a struct page
+ * @ps: Pointer to a struct page_snapshot to store the page snapshot
+ * @page: The page to snapshot
+ *
+ * Create a snapshot of the page and store both its struct page and struct
+ * folio representations in @ps.
+ *
+ * A snapshot is marked as "faithful" if the compound state of @page was
+ * stable and allowed safe reconstruction of the folio representation. In
+ * rare cases where this is not possible (e.g. due to folio splitting),
+ * snapshot_page() falls back to treating @page as a single page and the
+ * snapshot is marked as "unfaithful". The snapshot_page_is_faithful()
+ * helper can be used to check for this condition.
+ */
+void snapshot_page(struct page_snapshot *ps, const struct page *page)
+{
+ unsigned long head, nr_pages = 1;
+ struct folio *foliop;
+ int loops = 5;
+
+ ps->pfn = page_to_pfn(page);
+ ps->flags = PAGE_SNAPSHOT_FAITHFUL;
+
+again:
+ memset(&ps->folio_snapshot, 0, sizeof(struct folio));
+ memcpy(&ps->page_snapshot, page, sizeof(*page));
+ head = ps->page_snapshot.compound_head;
+ if ((head & 1) == 0) {
+ ps->idx = 0;
+ foliop = (struct folio *)&ps->page_snapshot;
+ if (!folio_test_large(foliop)) {
+ set_ps_flags(ps, page_folio(page), page);
+ memcpy(&ps->folio_snapshot, foliop,
+ sizeof(struct page));
+ return;
+ }
+ foliop = (struct folio *)page;
+ } else {
+ foliop = (struct folio *)(head - 1);
+ ps->idx = folio_page_idx(foliop, page);
+ }
+
+ if (ps->idx < MAX_FOLIO_NR_PAGES) {
+ memcpy(&ps->folio_snapshot, foliop, 2 * sizeof(struct page));
+ nr_pages = folio_nr_pages(&ps->folio_snapshot);
+ if (nr_pages > 1)
+ memcpy(&ps->folio_snapshot.__page_2, &foliop->__page_2,
+ sizeof(struct page));
+ set_ps_flags(ps, foliop, page);
+ }
+
+ if (ps->idx > nr_pages) {
+ if (loops-- > 0)
+ goto again;
+ clear_compound_head(&ps->page_snapshot);
+ foliop = (struct folio *)&ps->page_snapshot;
+ memcpy(&ps->folio_snapshot, foliop, sizeof(struct page));
+ ps->flags = 0;
+ ps->idx = 0;
+ }
+}
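
A brief usage sketch, assuming the snapshot_page_is_faithful() helper named in the comment above: diagnostic code reads the stable copies rather than the live page.

	struct page_snapshot ps;

	snapshot_page(&ps, page);
	if (snapshot_page_is_faithful(&ps))
		pr_info("pfn %#lx: page %lu of a %ld-page folio\n",
			ps.pfn, (unsigned long)ps.idx,
			folio_nr_pages(&ps.folio_snapshot));
	else
		pr_info("pfn %#lx: compound state unstable, snapshotted as a single page\n",
			ps.pfn);
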
+
+#ifdef CONFIG_MMU
+/**
+ * folio_pte_batch - detect a PTE batch for a large folio
+ * @folio: The large folio to detect a PTE batch for.
+ * @ptep: Page table pointer for the first entry.
+ * @pte: Page table entry for the first page.
+ * @max_nr: The maximum number of table entries to consider.
+ *
+ * This is a simplified variant of folio_pte_batch_flags().
+ *
+ * Detect a PTE batch: consecutive (present) PTEs that map consecutive
+ * pages of the same large folio in a single VMA and a single page table.
+ *
+ * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
+ * the accessed bit, writable bit, dirty bit and soft-dirty bit.
+ *
+ * ptep must map any page of the folio. max_nr must be at least one and
+ * must be limited by the caller so scanning cannot exceed a single VMA and
+ * a single page table.
+ *
+ * Return: the number of table entries in the batch.
+ */
+unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
+ unsigned int max_nr)
+{
+ return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0);
+}
+#endif /* CONFIG_MMU */
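
A sketch of a typical caller; the surrounding PTE walk (vma, addr, end, ptep, the held PTL) is assumed, as is a present PTE.

	pte_t pte = ptep_get(ptep);
	struct folio *folio = vm_normal_folio(vma, addr, pte);
	unsigned int nr = 1;

	if (folio && folio_test_large(folio)) {
		unsigned int max_nr = (end - addr) >> PAGE_SHIFT;

		nr = folio_pte_batch(folio, ptep, pte, max_nr);
	}
	/* All nr PTEs map consecutive pages of the folio; advance past them. */
	addr += nr * PAGE_SIZE;
	ptep += nr;
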
diff --git a/mm/vma.c b/mm/vma.c
index 5cdc5612bfc1..9ba93be621da 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -15,11 +15,15 @@ struct mmap_state {
unsigned long end;
pgoff_t pgoff;
unsigned long pglen;
- unsigned long flags;
+ vm_flags_t vm_flags;
struct file *file;
+ pgprot_t page_prot;
+
+ /* User-defined fields, perhaps updated by .mmap_prepare(). */
+ const struct vm_operations_struct *vm_ops;
+ void *vm_private_data;
unsigned long charged;
- bool retry_merge;
struct vm_area_struct *prev;
struct vm_area_struct *next;
@@ -28,9 +32,12 @@ struct mmap_state {
struct vma_munmap_struct vms;
struct ma_state mas_detach;
struct maple_tree mt_detach;
+
+ /* Determine if we can check KSM flags early in mmap() logic. */
+ bool check_ksm_early;
};
-#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_) \
+#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \
struct mmap_state name = { \
.mm = mm_, \
.vmi = vmi_, \
@@ -38,8 +45,9 @@ struct mmap_state {
.end = (addr_) + (len_), \
.pgoff = pgoff_, \
.pglen = PHYS_PFN(len_), \
- .flags = flags_, \
+ .vm_flags = vm_flags_, \
.file = file_, \
+ .page_prot = vm_get_page_prot(vm_flags_), \
}
#define VMG_MMAP_STATE(name, map_, vma_) \
@@ -48,7 +56,7 @@ struct mmap_state {
.vmi = (map_)->vmi, \
.start = (map_)->addr, \
.end = (map_)->end, \
- .flags = (map_)->flags, \
+ .vm_flags = (map_)->vm_flags, \
.pgoff = (map_)->pgoff, \
.file = (map_)->file, \
.prev = (map_)->prev, \
@@ -57,6 +65,22 @@ struct mmap_state {
.state = VMA_MERGE_START, \
}
+/*
+ * If, at any point, the VMA had unCoW'd mappings from parents, it will maintain
+ * more than one anon_vma_chain connecting it to more than one anon_vma. A merge
+ * would mean a wider range of folios sharing the root anon_vma lock, and thus
+ * potential lock contention; we do not wish to encourage merging such that this
+ * scales into a problem.
+ */
+static bool vma_had_uncowed_parents(struct vm_area_struct *vma)
+{
+ /*
+ * The list_is_singular() test is to avoid merging a VMA cloned from
+ * parents. This avoids the scalability problems caused by the anon_vma lock.
+ */
+ return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain);
+}
+
static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
{
struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
@@ -71,7 +95,7 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex
* the kernel to generate new VMAs when old one could be
* extended instead.
*/
- if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY)
+ if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_SOFTDIRTY)
return false;
if (vma->vm_file != vmg->file)
return false;
@@ -82,24 +106,28 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex
return true;
}
-static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
- struct anon_vma *anon_vma2, struct vm_area_struct *vma)
+static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next)
{
+ struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev;
+ struct vm_area_struct *src = vmg->middle; /* existing merge case. */
+ struct anon_vma *tgt_anon = tgt->anon_vma;
+ struct anon_vma *src_anon = vmg->anon_vma;
+
/*
- * The list_is_singular() test is to avoid merging VMA cloned from
- * parents. This can improve scalability caused by anon_vma lock.
+ * We _can_ have !src, vmg->anon_vma via copy_vma(). In this instance we
+ * will remove the existing VMA's anon_vma(s), so there are no scalability
+ * concerns.
*/
- if ((!anon_vma1 || !anon_vma2) && (!vma ||
- list_is_singular(&vma->anon_vma_chain)))
- return true;
- return anon_vma1 == anon_vma2;
-}
+ VM_WARN_ON(src && src_anon != src->anon_vma);
-/* Are the anon_vma's belonging to each VMA compatible with one another? */
-static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1,
- struct vm_area_struct *vma2)
-{
- return is_mergeable_anon_vma(vma1->anon_vma, vma2->anon_vma, NULL);
+ /* Case 1 - we will dup_anon_vma() from src into tgt. */
+ if (!tgt_anon && src_anon)
+ return !vma_had_uncowed_parents(src);
+ /* Case 2 - we will simply use tgt's anon_vma. */
+ if (tgt_anon && !src_anon)
+ return !vma_had_uncowed_parents(tgt);
+ /* Case 3 - the anon_vma's are already shared. */
+ return src_anon == tgt_anon;
}
/*
@@ -144,6 +172,9 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
vp->file = vma->vm_file;
if (vp->file)
vp->mapping = vma->vm_file->f_mapping;
+
+ if (vmg && vmg->skip_vma_uprobe)
+ vp->skip_vma_uprobe = true;
}
/*
@@ -164,7 +195,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg)
pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
if (is_mergeable_vma(vmg, /* merge_next = */ true) &&
- is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) {
+ is_mergeable_anon_vma(vmg, /* merge_next = */ true)) {
if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
return true;
}
@@ -184,7 +215,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg)
static bool can_vma_merge_after(struct vma_merge_struct *vmg)
{
if (is_mergeable_vma(vmg, /* merge_next = */ false) &&
- is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
+ is_mergeable_anon_vma(vmg, /* merge_next = */ false)) {
if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
return true;
}
@@ -333,10 +364,13 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
if (vp->file) {
i_mmap_unlock_write(vp->mapping);
- uprobe_mmap(vp->vma);
- if (vp->adj_next)
- uprobe_mmap(vp->adj_next);
+ if (!vp->skip_vma_uprobe) {
+ uprobe_mmap(vp->vma);
+
+ if (vp->adj_next)
+ uprobe_mmap(vp->adj_next);
+ }
}
if (vp->remove) {
@@ -400,8 +434,10 @@ static bool can_vma_merge_left(struct vma_merge_struct *vmg)
static bool can_vma_merge_right(struct vma_merge_struct *vmg,
bool can_merge_left)
{
- if (!vmg->next || vmg->end != vmg->next->vm_start ||
- !can_vma_merge_before(vmg))
+ struct vm_area_struct *next = vmg->next;
+ struct vm_area_struct *prev;
+
+ if (!next || vmg->end != next->vm_start || !can_vma_merge_before(vmg))
return false;
if (!can_merge_left)
@@ -414,7 +450,9 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg,
*
* We therefore check this in addition to mergeability to either side.
*/
- return are_anon_vmas_compatible(vmg->prev, vmg->next);
+ prev = vmg->prev;
+ return !prev->anon_vma || !next->anon_vma ||
+ prev->anon_vma == next->anon_vma;
}
/*
@@ -510,7 +548,14 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
init_vma_prep(&vp, vma);
vp.insert = new;
vma_prepare(&vp);
+
+ /*
+ * Get rid of huge pages and shared page tables straddling the split
+ * boundary.
+ */
vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_split(vma, addr);
if (new_below) {
vma->vm_start = addr;
@@ -554,7 +599,9 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
}
/*
- * dup_anon_vma() - Helper function to duplicate anon_vma
+ * dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the
+ * instance that the destination VMA has no anon_vma but the source does.
+ *
* @dst: The destination VMA
* @src: The source VMA
* @dup: Pointer to the destination VMA when successful.
@@ -565,9 +612,18 @@ static int dup_anon_vma(struct vm_area_struct *dst,
struct vm_area_struct *src, struct vm_area_struct **dup)
{
/*
- * Easily overlooked: when mprotect shifts the boundary, make sure the
- * expanding vma has anon_vma set if the shrinking vma had, to cover any
- * anon pages imported.
+ * There are three cases to consider for correctly propagating
+ * anon_vma's on merge.
+ *
+ * The first is trivial - neither VMA has anon_vma, we need not do
+ * anything.
+ *
+ * The second, where both have an anon_vma, is also a no-op, as they must
+ * then be the same, so there is simply nothing to copy.
+ *
+ * Here we cover the third - if the destination VMA has no anon_vma,
+ * that is, it is unfaulted, we need to ensure that the newly merged
+ * range is referenced by the anon_vma's of the source.
*/
if (src->anon_vma && !dst->anon_vma) {
int ret;
@@ -666,6 +722,9 @@ static void vmg_adjust_set_range(struct vma_merge_struct *vmg)
/*
* Actually perform the VMA merge operation.
*
+ * IMPORTANT: If vmg->give_up_on_oom is set, we guarantee not to modify any
+ * VMAs or cause inconsistent state should an OOM condition arise.
+ *
* Returns 0 on success, or an error value on failure.
*/
static int commit_merge(struct vma_merge_struct *vmg)
@@ -685,6 +744,12 @@ static int commit_merge(struct vma_merge_struct *vmg)
init_multi_vma_prep(&vp, vma, vmg);
+ /*
+ * If vmg->give_up_on_oom is set, we're safe, because we don't actually
+ * manipulate any VMAs until we succeed at preallocation.
+ *
+ * Past this point, we will not return an error.
+ */
if (vma_iter_prealloc(vmg->vmi, vma))
return -ENOMEM;
@@ -778,7 +843,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
* furthermost left or right side of the VMA, then we have no chance of
* merging and should abort.
*/
- if (vmg->flags & VM_SPECIAL || (!left_side && !right_side))
+ if (vmg->vm_flags & VM_SPECIAL || (!left_side && !right_side))
return NULL;
if (left_side)
@@ -905,28 +970,28 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
err = dup_anon_vma(next, middle, &anon_dup);
}
- if (err)
+ if (err || commit_merge(vmg))
goto abort;
- err = commit_merge(vmg);
- if (err) {
- VM_WARN_ON(err != -ENOMEM);
-
- if (anon_dup)
- unlink_anon_vmas(anon_dup);
-
- vmg->state = VMA_MERGE_ERROR_NOMEM;
- return NULL;
- }
-
- khugepaged_enter_vma(vmg->target, vmg->flags);
+ khugepaged_enter_vma(vmg->target, vmg->vm_flags);
vmg->state = VMA_MERGE_SUCCESS;
return vmg->target;
abort:
vma_iter_set(vmg->vmi, start);
vma_iter_load(vmg->vmi);
- vmg->state = VMA_MERGE_ERROR_NOMEM;
+
+ if (anon_dup)
+ unlink_anon_vmas(anon_dup);
+
+ /*
+ * This means we have failed to clone anon_vma's correctly, but no
+ * actual changes to VMAs have occurred, so no harm no foul - if the
+ * user doesn't want this reported and instead just wants to give up on
+ * the merge, allow it.
+ */
+ if (!vmg->give_up_on_oom)
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
return NULL;
}
@@ -983,13 +1048,14 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
mmap_assert_write_locked(vmg->mm);
VM_WARN_ON_VMG(vmg->middle, vmg);
+ VM_WARN_ON_VMG(vmg->target, vmg);
/* vmi must point at or before the gap. */
VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg);
vmg->state = VMA_MERGE_NOMERGE;
/* Special VMAs are unmergeable, also if no prev/next. */
- if ((vmg->flags & VM_SPECIAL) || (!prev && !next))
+ if ((vmg->vm_flags & VM_SPECIAL) || (!prev && !next))
return NULL;
can_merge_left = can_vma_merge_left(vmg);
@@ -998,13 +1064,13 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
/* If we can merge with the next VMA, adjust vmg accordingly. */
if (can_merge_right) {
vmg->end = next->vm_end;
- vmg->middle = next;
+ vmg->target = next;
}
/* If we can merge with the previous VMA, adjust vmg accordingly. */
if (can_merge_left) {
vmg->start = prev->vm_start;
- vmg->middle = prev;
+ vmg->target = prev;
vmg->pgoff = prev->vm_pgoff;
/*
@@ -1026,10 +1092,10 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
* Now try to expand adjacent VMA(s). This takes care of removing the
* following VMA if we have VMAs on both sides.
*/
- if (vmg->middle && !vma_expand(vmg)) {
- khugepaged_enter_vma(vmg->middle, vmg->flags);
+ if (vmg->target && !vma_expand(vmg)) {
+ khugepaged_enter_vma(vmg->target, vmg->vm_flags);
vmg->state = VMA_MERGE_SUCCESS;
- return vmg->middle;
+ return vmg->target;
}
return NULL;
@@ -1041,46 +1107,51 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
* @vmg: Describes a VMA expansion operation.
*
* Expand @vma to vmg->start and vmg->end. Can expand off the start and end.
- * Will expand over vmg->next if it's different from vmg->middle and vmg->end ==
- * vmg->next->vm_end. Checking if the vmg->middle can expand and merge with
+ * Will expand over vmg->next if it's different from vmg->target and vmg->end ==
+ * vmg->next->vm_end. Checking if the vmg->target can expand and merge with
* vmg->next needs to be handled by the caller.
*
* Returns: 0 on success.
*
* ASSUMPTIONS:
- * - The caller must hold a WRITE lock on vmg->middle->mm->mmap_lock.
- * - The caller must have set @vmg->middle and @vmg->next.
+ * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
+ * - The caller must have set @vmg->target and @vmg->next.
*/
int vma_expand(struct vma_merge_struct *vmg)
{
struct vm_area_struct *anon_dup = NULL;
bool remove_next = false;
- struct vm_area_struct *middle = vmg->middle;
+ struct vm_area_struct *target = vmg->target;
struct vm_area_struct *next = vmg->next;
+ VM_WARN_ON_VMG(!target, vmg);
+
mmap_assert_write_locked(vmg->mm);
- vma_start_write(middle);
- if (next && (middle != next) && (vmg->end == next->vm_end)) {
+ vma_start_write(target);
+ if (next && (target != next) && (vmg->end == next->vm_end)) {
int ret;
remove_next = true;
/* This should already have been checked by this point. */
VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg);
vma_start_write(next);
- ret = dup_anon_vma(middle, next, &anon_dup);
+ /*
+ * In this case we don't report OOM, so vmg->give_up_on_oom is
+ * safe.
+ */
+ ret = dup_anon_vma(target, next, &anon_dup);
if (ret)
return ret;
}
/* Not merging but overwriting any part of next is not handled. */
VM_WARN_ON_VMG(next && !remove_next &&
- next != middle && vmg->end > next->vm_start, vmg);
+ next != target && vmg->end > next->vm_start, vmg);
/* Only handles expanding */
- VM_WARN_ON_VMG(middle->vm_start < vmg->start ||
- middle->vm_end > vmg->end, vmg);
+ VM_WARN_ON_VMG(target->vm_start < vmg->start ||
+ target->vm_end > vmg->end, vmg);
- vmg->target = middle;
if (remove_next)
vmg->__remove_next = true;
@@ -1090,9 +1161,15 @@ int vma_expand(struct vma_merge_struct *vmg)
return 0;
nomem:
- vmg->state = VMA_MERGE_ERROR_NOMEM;
if (anon_dup)
unlink_anon_vmas(anon_dup);
+ /*
+ * If the user requests that we just give up on OOM, we are safe to do so
+ * here, as commit_merge() provides this contract to us. Nothing has been
+ * changed - no harm no foul, just don't report it.
+ */
+ if (!vmg->give_up_on_oom)
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
return -ENOMEM;
}
@@ -1534,6 +1611,13 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
if (vmg_nomem(vmg))
return ERR_PTR(-ENOMEM);
+ /*
+ * Split can fail for reasons other than OOM, so if the user requests
+ * this it's probably a mistake.
+ */
+ VM_WARN_ON(vmg->give_up_on_oom &&
+ (vma->vm_start != start || vma->vm_end != end));
+
/* Split any preceding portion of the VMA. */
if (vma->vm_start < start) {
int err = split_vma(vmg->vmi, vma, start, 1);
@@ -1556,27 +1640,25 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
struct vm_area_struct *vma_modify_flags(
struct vma_iterator *vmi, struct vm_area_struct *prev,
struct vm_area_struct *vma, unsigned long start, unsigned long end,
- unsigned long new_flags)
+ vm_flags_t vm_flags)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
- vmg.flags = new_flags;
+ vmg.vm_flags = vm_flags;
return vma_modify(&vmg);
}
struct vm_area_struct
-*vma_modify_flags_name(struct vma_iterator *vmi,
+*vma_modify_name(struct vma_iterator *vmi,
struct vm_area_struct *prev,
struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
- unsigned long new_flags,
struct anon_vma_name *new_name)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
- vmg.flags = new_flags;
vmg.anon_name = new_name;
return vma_modify(&vmg);
@@ -1601,13 +1683,16 @@ struct vm_area_struct
struct vm_area_struct *prev,
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- unsigned long new_flags,
- struct vm_userfaultfd_ctx new_ctx)
+ vm_flags_t vm_flags,
+ struct vm_userfaultfd_ctx new_ctx,
+ bool give_up_on_oom)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
- vmg.flags = new_flags;
+ vmg.vm_flags = vm_flags;
vmg.uffd_ctx = new_ctx;
+ if (give_up_on_oom)
+ vmg.give_up_on_oom = true;
return vma_modify(&vmg);
}
@@ -1740,6 +1825,14 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
faulted_in_anon_vma = false;
}
+ /*
+ * If the VMA we are copying might contain a uprobe PTE, ensure
+ * that we do not establish one upon merge. Otherwise, when mremap()
+ * moves page tables, it will orphan the newly created PTE.
+ */
+ if (vma->vm_file)
+ vmg.skip_vma_uprobe = true;
+
new_vma = find_vma_prev(mm, addr, &vmg.prev);
if (new_vma && new_vma->vm_start < addr + len)
return NULL; /* should never get here */
@@ -1791,6 +1884,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return new_vma;
out_vma_link:
+ fixup_hugetlb_reservations(new_vma);
vma_close(new_vma);
if (new_vma->vm_file)
@@ -2229,6 +2323,11 @@ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
vms_complete_munmap_vmas(vms, mas_detach);
}
+static void update_ksm_flags(struct mmap_state *map)
+{
+ map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags);
+}
+
/*
* __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be
* unmapped once the map operation is completed, check limits, account mapping
@@ -2271,11 +2370,11 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf)
}
/* Check against address space limit. */
- if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages))
+ if (!may_expand_vm(map->mm, map->vm_flags, map->pglen - vms->nr_pages))
return -ENOMEM;
/* Private writable mapping: check memory availability. */
- if (accountable_mapping(map->file, map->flags)) {
+ if (accountable_mapping(map->file, map->vm_flags)) {
map->charged = map->pglen;
map->charged -= vms->nr_accounted;
if (map->charged) {
@@ -2285,7 +2384,7 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf)
}
vms->nr_accounted = 0;
- map->flags |= VM_ACCOUNT;
+ map->vm_flags |= VM_ACCOUNT;
}
/*
@@ -2307,6 +2406,10 @@ static int __mmap_new_file_vma(struct mmap_state *map,
int error;
vma->vm_file = get_file(map->file);
+
+ if (!map->file->f_op->mmap)
+ return 0;
+
error = mmap_file(vma->vm_file, vma);
if (error) {
fput(vma->vm_file);
@@ -2325,13 +2428,12 @@ static int __mmap_new_file_vma(struct mmap_state *map,
* Drivers should not permit writability when previously it was
* disallowed.
*/
- VM_WARN_ON_ONCE(map->flags != vma->vm_flags &&
- !(map->flags & VM_MAYWRITE) &&
+ VM_WARN_ON_ONCE(map->vm_flags != vma->vm_flags &&
+ !(map->vm_flags & VM_MAYWRITE) &&
(vma->vm_flags & VM_MAYWRITE));
- /* If the flags change (and are mergeable), let's retry later. */
- map->retry_merge = vma->vm_flags != map->flags && !(vma->vm_flags & VM_SPECIAL);
- map->flags = vma->vm_flags;
+ map->file = vma->vm_file;
+ map->vm_flags = vma->vm_flags;
return 0;
}
@@ -2362,8 +2464,8 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
vma_iter_config(vmi, map->addr, map->end);
vma_set_range(vma, map->addr, map->end, map->pgoff);
- vm_flags_init(vma, map->flags);
- vma->vm_page_prot = vm_get_page_prot(map->flags);
+ vm_flags_init(vma, map->vm_flags);
+ vma->vm_page_prot = map->page_prot;
if (vma_iter_prealloc(vmi, vma)) {
error = -ENOMEM;
@@ -2372,7 +2474,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
if (map->file)
error = __mmap_new_file_vma(map, vma);
- else if (map->flags & VM_SHARED)
+ else if (map->vm_flags & VM_SHARED)
error = shmem_zero_setup(vma);
else
vma_set_anonymous(vma);
@@ -2380,9 +2482,14 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
if (error)
goto free_iter_vma;
+ if (!map->check_ksm_early) {
+ update_ksm_flags(map);
+ vm_flags_init(vma, map->vm_flags);
+ }
+
#ifdef CONFIG_SPARC64
/* TODO: Fix SPARC ADI! */
- WARN_ON_ONCE(!arch_validate_flags(map->flags));
+ WARN_ON_ONCE(!arch_validate_flags(map->vm_flags));
#endif
/* Lock the VMA since it is modified after insertion into VMA tree */
@@ -2396,8 +2503,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
* call covers the non-merge case.
*/
if (!vma_is_anonymous(vma))
- khugepaged_enter_vma(vma, map->flags);
- ksm_add_vma(vma);
+ khugepaged_enter_vma(vma, map->vm_flags);
*vmap = vma;
return 0;
@@ -2418,7 +2524,7 @@ free_vma:
static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
{
struct mm_struct *mm = map->mm;
- unsigned long vm_flags = vma->vm_flags;
+ vm_flags_t vm_flags = vma->vm_flags;
perf_event_mmap(vma);
@@ -2450,6 +2556,85 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
vma_set_page_prot(vma);
}
+/*
+ * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that
+ * specifies it.
+ *
+ * This is called prior to any merge attempt, and updates whitelisted fields
+ * that are permitted to be updated by the caller.
+ *
+ * All but user-defined fields will be pre-populated with original values.
+ *
+ * Returns 0 on success, or an error code otherwise.
+ */
+static int call_mmap_prepare(struct mmap_state *map)
+{
+ int err;
+ struct vm_area_desc desc = {
+ .mm = map->mm,
+ .start = map->addr,
+ .end = map->end,
+
+ .pgoff = map->pgoff,
+ .file = map->file,
+ .vm_flags = map->vm_flags,
+ .page_prot = map->page_prot,
+ };
+
+ /* Invoke the hook. */
+ err = vfs_mmap_prepare(map->file, &desc);
+ if (err)
+ return err;
+
+ /* Update fields permitted to be changed. */
+ map->pgoff = desc.pgoff;
+ map->file = desc.file;
+ map->vm_flags = desc.vm_flags;
+ map->page_prot = desc.page_prot;
+ /* User-defined fields. */
+ map->vm_ops = desc.vm_ops;
+ map->vm_private_data = desc.private_data;
+
+ return 0;
+}
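
For illustration, a hypothetical driver converted to .mmap_prepare() (mydrv_vm_ops and MYDRV_MAX_MAP are made up): it only edits the descriptor, so flags are final before any VMA exists or any merge is attempted.

static int mydrv_mmap_prepare(struct vm_area_desc *desc)
{
	if (desc->end - desc->start > MYDRV_MAX_MAP)
		return -EINVAL;

	desc->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
	desc->vm_ops = &mydrv_vm_ops;
	desc->private_data = desc->file->private_data;	/* per-open state */
	return 0;
}

static const struct file_operations mydrv_fops = {
	.mmap_prepare	= mydrv_mmap_prepare,
};
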
+
+static void set_vma_user_defined_fields(struct vm_area_struct *vma,
+ struct mmap_state *map)
+{
+ if (map->vm_ops)
+ vma->vm_ops = map->vm_ops;
+ vma->vm_private_data = map->vm_private_data;
+}
+
+/*
+ * Are we guaranteed no driver can change state such as to preclude KSM merging?
+ * If so, let's set the KSM mergeable flag early so we don't break VMA merging.
+ */
+static bool can_set_ksm_flags_early(struct mmap_state *map)
+{
+ struct file *file = map->file;
+
+ /* Anonymous mappings have no driver which can change them. */
+ if (!file)
+ return true;
+
+ /*
+ * If .mmap_prepare() is specified, then the driver will have already
+ * manipulated state prior to updating KSM flags. So no need to worry
+ * about mmap callbacks modifying VMA flags after the KSM flag has been
+ * updated here, which could otherwise affect KSM eligibility.
+ */
+ if (file->f_op->mmap_prepare)
+ return true;
+
+ /* shmem is safe. */
+ if (shmem_file(file))
+ return true;
+
+ /* Any other .mmap callback is not safe. */
+ return false;
+}
+
static unsigned long __mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
@@ -2457,13 +2642,21 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
int error;
+ bool have_mmap_prepare = file && file->f_op->mmap_prepare;
VMA_ITERATOR(vmi, mm, addr);
MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
+ map.check_ksm_early = can_set_ksm_flags_early(&map);
+
error = __mmap_prepare(&map, uf);
+ if (!error && have_mmap_prepare)
+ error = call_mmap_prepare(&map);
if (error)
goto abort_munmap;
+ if (map.check_ksm_early)
+ update_ksm_flags(&map);
+
/* Attempt to merge with adjacent VMAs... */
if (map.prev || map.next) {
VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL);
@@ -2478,16 +2671,8 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
goto unacct_error;
}
- /* If flags changed, we might be able to merge, so try again. */
- if (map.retry_merge) {
- struct vm_area_struct *merged;
- VMG_MMAP_STATE(vmg, &map, vma);
-
- vma_iter_config(map.vmi, map.addr, map.end);
- merged = vma_merge_existing_range(&vmg);
- if (merged)
- vma = merged;
- }
+ if (have_mmap_prepare)
+ set_vma_user_defined_fields(vma, &map);
__mmap_complete(&map, vma);
@@ -2567,14 +2752,14 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
* @addr: The start address
* @len: The length of the increase
* @vma: The vma,
- * @flags: The VMA Flags
+ * @vm_flags: The VMA Flags
*
* Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
* do not match then create a new anonymous VMA. Eventually we may be able to
* do some brk-specific accounting here.
*/
int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long addr, unsigned long len, unsigned long flags)
+ unsigned long addr, unsigned long len, vm_flags_t vm_flags)
{
struct mm_struct *mm = current->mm;
@@ -2582,8 +2767,9 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
* Check against address space limits by the changed size
* Note: This happens *after* clearing old mappings in some code paths.
*/
- flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
- if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
+ vm_flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+ vm_flags = ksm_vma_flags(mm, NULL, vm_flags);
+ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT))
return -ENOMEM;
if (mm->map_count > sysctl_max_map_count)
@@ -2597,7 +2783,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
* occur after forking, so the expand will only happen on new VMAs.
*/
if (vma && vma->vm_end == addr) {
- VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr));
+ VMG_STATE(vmg, mm, vmi, addr, addr + len, vm_flags, PHYS_PFN(addr));
vmg.prev = vma;
/* vmi is positioned at prev, which this mode expects. */
@@ -2618,20 +2804,19 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
vma_set_anonymous(vma);
vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
- vm_flags_init(vma, flags);
- vma->vm_page_prot = vm_get_page_prot(flags);
+ vm_flags_init(vma, vm_flags);
+ vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma_start_write(vma);
if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
goto mas_store_fail;
mm->map_count++;
validate_mm(mm);
- ksm_add_vma(vma);
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
mm->data_vm += len >> PAGE_SHIFT;
- if (flags & VM_LOCKED)
+ if (vm_flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
vm_flags_set(vma, VM_SOFTDIRTY);
return 0;
@@ -2974,3 +3159,45 @@ int __vm_munmap(unsigned long start, size_t len, bool unlock)
userfaultfd_unmap_complete(mm, &uf);
return ret;
}
+
+/* Insert vm structure into process list sorted by address
+ * and into the inode's i_mmap tree. If vm_file is non-NULL
+ * then i_mmap_rwsem is taken here.
+ */
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ unsigned long charged = vma_pages(vma);
+
+ if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
+ return -ENOMEM;
+
+ if ((vma->vm_flags & VM_ACCOUNT) &&
+ security_vm_enough_memory_mm(mm, charged))
+ return -ENOMEM;
+
+ /*
+ * The vm_pgoff of a purely anonymous vma should be irrelevant
+ * until its first write fault, when the page's anon_vma and index
+ * are set. But now set the vm_pgoff it will almost certainly
+ * end up with (unless mremap moves it elsewhere before that
+ * first write fault), so /proc/pid/maps tells a consistent story.
+ *
+ * By setting it to reflect the virtual start address of the
+ * vma, merges and splits can happen in a seamless way, just
+ * using the existing file pgoff checks and manipulations.
+ * Similarly in do_mmap and in do_brk_flags.
+ */
+ if (vma_is_anonymous(vma)) {
+ BUG_ON(vma->anon_vma);
+ vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
+ }
+
+ if (vma_link(mm, vma)) {
+ if (vma->vm_flags & VM_ACCOUNT)
+ vm_unacct_memory(charged);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
diff --git a/mm/vma.h b/mm/vma.h
index 7356ca5a22d3..acdcc515c459 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -19,6 +19,8 @@ struct vma_prepare {
struct vm_area_struct *insert;
struct vm_area_struct *remove;
struct vm_area_struct *remove2;
+
+ bool skip_vma_uprobe :1;
};
struct unlink_vma_file_batch {
@@ -96,7 +98,7 @@ struct vma_merge_struct {
unsigned long end;
pgoff_t pgoff;
- unsigned long flags;
+ vm_flags_t vm_flags;
struct file *file;
struct anon_vma *anon_vma;
struct mempolicy *policy;
@@ -114,6 +116,17 @@ struct vma_merge_struct {
*/
bool just_expand :1;
+ /*
+ * If a merge is possible, but an OOM error occurs, give up and don't
+ * execute the merge, returning NULL.
+ */
+ bool give_up_on_oom :1;
+
+ /*
+ * If set, skip uprobe_mmap() on the merged VMA.
+ */
+ bool skip_vma_uprobe :1;
+
/* Internal flags set during merge process: */
/*
@@ -151,13 +164,13 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start);
}
-#define VMG_STATE(name, mm_, vmi_, start_, end_, flags_, pgoff_) \
+#define VMG_STATE(name, mm_, vmi_, start_, end_, vm_flags_, pgoff_) \
struct vma_merge_struct name = { \
.mm = mm_, \
.vmi = vmi_, \
.start = start_, \
.end = end_, \
- .flags = flags_, \
+ .vm_flags = vm_flags_, \
.pgoff = pgoff_, \
.state = VMA_MERGE_START, \
}
@@ -171,7 +184,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
.next = NULL, \
.start = start_, \
.end = end_, \
- .flags = vma_->vm_flags, \
+ .vm_flags = vma_->vm_flags, \
.pgoff = vma_pgoff_offset(vma_, start_), \
.file = vma_->vm_file, \
.anon_vma = vma_->anon_vma, \
@@ -209,6 +222,53 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
return 0;
}
+
+/*
+ * Temporary helper functions for file systems that wrap an invocation of
+ * f_op->mmap() but whose underlying file system might implement
+ * f_op->mmap_prepare().
+ */
+
+static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma,
+ struct vm_area_desc *desc)
+{
+ desc->mm = vma->vm_mm;
+ desc->start = vma->vm_start;
+ desc->end = vma->vm_end;
+
+ desc->pgoff = vma->vm_pgoff;
+ desc->file = vma->vm_file;
+ desc->vm_flags = vma->vm_flags;
+ desc->page_prot = vma->vm_page_prot;
+
+ desc->vm_ops = NULL;
+ desc->private_data = NULL;
+
+ return desc;
+}
+
+static inline void set_vma_from_desc(struct vm_area_struct *vma,
+ struct vm_area_desc *desc)
+{
+ /*
+ * Since we're invoking .mmap_prepare() despite having a partially
+ * established VMA, we must take care to handle setting fields
+ * correctly.
+ */
+
+ /* Mutable fields. Populated with initial state. */
+ vma->vm_pgoff = desc->pgoff;
+ if (vma->vm_file != desc->file)
+ vma_set_file(vma, desc->file);
+ if (vma->vm_flags != desc->vm_flags)
+ vm_flags_set(vma, desc->vm_flags);
+ vma->vm_page_prot = desc->page_prot;
+
+ /* User-defined fields. */
+ vma->vm_ops = desc->vm_ops;
+ vma->vm_private_data = desc->private_data;
+}
+
int
do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct mm_struct *mm, unsigned long start,
@@ -228,17 +288,16 @@ __must_check struct vm_area_struct
*vma_modify_flags(struct vma_iterator *vmi,
struct vm_area_struct *prev, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- unsigned long new_flags);
+ vm_flags_t vm_flags);
-/* We are about to modify the VMA's flags and/or anon_name. */
+/* We are about to modify the VMA's anon_name. */
__must_check struct vm_area_struct
-*vma_modify_flags_name(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- unsigned long new_flags,
- struct anon_vma_name *new_name);
+*vma_modify_name(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ struct anon_vma_name *new_name);
/* We are about to modify the VMA's memory policy. */
__must_check struct vm_area_struct
@@ -254,8 +313,9 @@ __must_check struct vm_area_struct
struct vm_area_struct *prev,
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- unsigned long new_flags,
- struct vm_userfaultfd_ctx new_ctx);
+ vm_flags_t vm_flags,
+ struct vm_userfaultfd_ctx new_ctx,
+ bool give_up_on_oom);
__must_check struct vm_area_struct
*vma_merge_new_range(struct vma_merge_struct *vmg);
@@ -314,7 +374,7 @@ static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma
}
#ifdef CONFIG_MMU
-static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
+static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, vm_flags_t vm_flags)
{
return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
}
@@ -541,4 +601,19 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address);
int __vm_munmap(unsigned long start, size_t len, bool unlock);
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma);
+
+/* vma_init.h, shared between CONFIG_MMU and nommu. */
+void __init vma_state_init(void);
+struct vm_area_struct *vm_area_alloc(struct mm_struct *mm);
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig);
+void vm_area_free(struct vm_area_struct *vma);
+
+/* vma_exec.c */
+#ifdef CONFIG_MMU
+int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
+ unsigned long *top_mem_p);
+int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
+#endif
+
#endif /* __MM_VMA_H */
diff --git a/mm/vma_exec.c b/mm/vma_exec.c
new file mode 100644
index 000000000000..922ee51747a6
--- /dev/null
+++ b/mm/vma_exec.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Functions implemented for exec functionality which are, however, strictly
+ * VMA-only logic.
+ */
+
+#include "vma_internal.h"
+#include "vma.h"
+
+/*
+ * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
+ * this VMA and its relocated range, which will now reside at [vma->vm_start -
+ * shift, vma->vm_end - shift).
+ *
+ * This function is almost certainly NOT what you want for anything other than
+ * early executable temporary stack relocation.
+ */
+int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
+{
+ /*
+ * The process proceeds as follows:
+ *
+ * 1) Use shift to calculate the new vma endpoints.
+ * 2) Extend vma to cover both the old and new ranges. This ensures the
+ * arguments passed to subsequent functions are consistent.
+ * 3) Move vma's page tables to the new range.
+ * 4) Free up any cleared pgd range.
+ * 5) Shrink the vma to cover only the new range.
+ */
+
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long old_start = vma->vm_start;
+ unsigned long old_end = vma->vm_end;
+ unsigned long length = old_end - old_start;
+ unsigned long new_start = old_start - shift;
+ unsigned long new_end = old_end - shift;
+ VMA_ITERATOR(vmi, mm, new_start);
+ VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
+ struct vm_area_struct *next;
+ struct mmu_gather tlb;
+ PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);
+
+ BUG_ON(new_start > new_end);
+
+ /*
+ * ensure there are no vmas between where we want to go
+ * and where we are
+ */
+ if (vma != vma_next(&vmi))
+ return -EFAULT;
+
+ vma_iter_prev_range(&vmi);
+ /*
+ * cover the whole range: [new_start, old_end)
+ */
+ vmg.target = vma;
+ if (vma_expand(&vmg))
+ return -ENOMEM;
+
+ /*
+ * move the page tables downwards, on failure we rely on
+ * process cleanup to remove whatever mess we made.
+ */
+ pmc.for_stack = true;
+ if (length != move_page_tables(&pmc))
+ return -ENOMEM;
+
+ tlb_gather_mmu(&tlb, mm);
+ next = vma_next(&vmi);
+ if (new_end > old_start) {
+ /*
+ * when the old and new regions overlap clear from new_end.
+ */
+ free_pgd_range(&tlb, new_end, old_end, new_end,
+ next ? next->vm_start : USER_PGTABLES_CEILING);
+ } else {
+ /*
+ * otherwise, clean from old_start; this is done to not touch
+ * the address space in [new_end, old_start), because some architectures
+ * have constraints on va-space that make this illegal (IA64) -
+ * for the others it's just a little faster.
+ */
+ free_pgd_range(&tlb, old_start, old_end, new_end,
+ next ? next->vm_start : USER_PGTABLES_CEILING);
+ }
+ tlb_finish_mmu(&tlb);
+
+ vma_prev(&vmi);
+ /* Shrink the vma to just the new range */
+ return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
+}
+
+/*
+ * Establish the stack VMA in an execve'd process, located temporarily at the
+ * maximum stack address provided by the architecture.
+ *
+ * We later relocate this downwards in relocate_vma_down().
+ *
+ * This function is almost certainly NOT what you want for anything other than
+ * early executable initialisation.
+ *
+ * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the
+ * maximum addressable location in the stack (that is capable of storing a
+ * system word of data).
+ */
+int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
+ unsigned long *top_mem_p)
+{
+ int err;
+ struct vm_area_struct *vma = vm_area_alloc(mm);
+
+ if (!vma)
+ return -ENOMEM;
+
+ vma_set_anonymous(vma);
+
+ if (mmap_write_lock_killable(mm)) {
+ err = -EINTR;
+ goto err_free;
+ }
+
+ /*
+ * Needs to be called with the mmap write lock
+ * held, to avoid racing with ksmd.
+ */
+ err = ksm_execve(mm);
+ if (err)
+ goto err_ksm;
+
+ /*
+ * Place the stack at the largest stack address the architecture
+ * supports. Later, we'll move this to an appropriate place. We don't
+ * use STACK_TOP because that can depend on attributes which aren't
+ * configured yet.
+ */
+ BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
+ vma->vm_end = STACK_TOP_MAX;
+ vma->vm_start = vma->vm_end - PAGE_SIZE;
+ vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
+ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+
+ err = insert_vm_struct(mm, vma);
+ if (err)
+ goto err;
+
+ mm->stack_vm = mm->total_vm = 1;
+ mmap_write_unlock(mm);
+ *vmap = vma;
+ *top_mem_p = vma->vm_end - sizeof(void *);
+ return 0;
+
+err:
+ ksm_exit(mm);
+err_ksm:
+ mmap_write_unlock(mm);
+err_free:
+ *vmap = NULL;
+ vm_area_free(vma);
+ return err;
+}
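
For orientation, the intended call order from the exec path, simplified and hedged: the bprm fields and the shift computation are assumed to match fs/exec.c.

	/* At execve() setup: temporary stack at the architectural maximum. */
	err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p);
	if (err)
		return err;

	/* ... argv/envp strings are copied onto the temporary stack at bprm->p ... */

	/* Once the final stack top is chosen, slide the whole VMA down. */
	err = relocate_vma_down(bprm->vma, shift);
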
diff --git a/mm/vma_init.c b/mm/vma_init.c
new file mode 100644
index 000000000000..8e53c7943561
--- /dev/null
+++ b/mm/vma_init.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/*
+ * Functions for initialising, allocating, freeing and duplicating VMAs. Shared
+ * between CONFIG_MMU and non-CONFIG_MMU kernel configurations.
+ */
+
+#include "vma_internal.h"
+#include "vma.h"
+
+/* SLAB cache for vm_area_struct structures */
+static struct kmem_cache *vm_area_cachep;
+
+void __init vma_state_init(void)
+{
+ struct kmem_cache_args args = {
+ .use_freeptr_offset = true,
+ .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
+ };
+
+ vm_area_cachep = kmem_cache_create("vm_area_struct",
+ sizeof(struct vm_area_struct), &args,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
+ SLAB_ACCOUNT);
+}
+
+struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+ if (!vma)
+ return NULL;
+
+ vma_init(vma, mm);
+
+ return vma;
+}
+
+static void vm_area_init_from(const struct vm_area_struct *src,
+ struct vm_area_struct *dest)
+{
+ dest->vm_mm = src->vm_mm;
+ dest->vm_ops = src->vm_ops;
+ dest->vm_start = src->vm_start;
+ dest->vm_end = src->vm_end;
+ dest->anon_vma = src->anon_vma;
+ dest->vm_pgoff = src->vm_pgoff;
+ dest->vm_file = src->vm_file;
+ dest->vm_private_data = src->vm_private_data;
+ vm_flags_init(dest, src->vm_flags);
+ memcpy(&dest->vm_page_prot, &src->vm_page_prot,
+ sizeof(dest->vm_page_prot));
+ /*
+ * src->shared.rb may be modified concurrently when called from
+ * dup_mmap(), but the clone will reinitialize it.
+ */
+ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
+ memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
+ sizeof(dest->vm_userfaultfd_ctx));
+#ifdef CONFIG_ANON_VMA_NAME
+ dest->anon_name = src->anon_name;
+#endif
+#ifdef CONFIG_SWAP
+ memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
+ sizeof(dest->swap_readahead_info));
+#endif
+#ifndef CONFIG_MMU
+ dest->vm_region = src->vm_region;
+#endif
+#ifdef CONFIG_NUMA
+ dest->vm_policy = src->vm_policy;
+#endif
+#ifdef __HAVE_PFNMAP_TRACKING
+ dest->pfnmap_track_ctx = NULL;
+#endif
+}
+
+#ifdef __HAVE_PFNMAP_TRACKING
+static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig,
+ struct vm_area_struct *new)
+{
+ struct pfnmap_track_ctx *ctx = orig->pfnmap_track_ctx;
+
+ if (likely(!ctx))
+ return 0;
+
+ /*
+ * We don't expect to ever hit this. If ever required, we would have
+ * to duplicate the tracking.
+ */
+ if (unlikely(kref_read(&ctx->kref) >= REFCOUNT_MAX))
+ return -ENOMEM;
+ kref_get(&ctx->kref);
+ new->pfnmap_track_ctx = ctx;
+ return 0;
+}
+
+static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma)
+{
+ struct pfnmap_track_ctx *ctx = vma->pfnmap_track_ctx;
+
+ if (likely(!ctx))
+ return;
+
+ kref_put(&ctx->kref, pfnmap_track_ctx_release);
+ vma->pfnmap_track_ctx = NULL;
+}
+#else
+static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig,
+ struct vm_area_struct *new)
+{
+ return 0;
+}
+static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma)
+{
+}
+#endif
+
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
+{
+ struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+
+ if (!new)
+ return NULL;
+
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+ vm_area_init_from(orig, new);
+
+ if (vma_pfnmap_track_ctx_dup(orig, new)) {
+ kmem_cache_free(vm_area_cachep, new);
+ return NULL;
+ }
+ vma_lock_init(new, true);
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+ vma_numab_state_init(new);
+ dup_anon_vma_name(orig, new);
+
+ return new;
+}
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+ /* The vma should be detached while being destroyed. */
+ vma_assert_detached(vma);
+ vma_numab_state_free(vma);
+ free_anon_vma_name(vma);
+ vma_pfnmap_track_ctx_release(vma);
+ kmem_cache_free(vm_area_cachep, vma);
+}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 3ed720a787ec..6dbcdceecae1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -104,6 +104,9 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
+
+ arch_enter_lazy_mmu_mode();
+
do {
if (unlikely(!pte_none(ptep_get(pte)))) {
if (pfn_valid(pfn)) {
@@ -127,6 +130,8 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
} while (pte += PFN_DOWN(size), addr += size, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
@@ -350,12 +355,30 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
{
pte_t *pte;
+ pte_t ptent;
+ unsigned long size = PAGE_SIZE;
pte = pte_offset_kernel(pmd, addr);
+ arch_enter_lazy_mmu_mode();
+
do {
- pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
+#ifdef CONFIG_HUGETLB_PAGE
+ size = arch_vmap_pte_range_unmap_size(addr, pte);
+ if (size != PAGE_SIZE) {
+ if (WARN_ON(!IS_ALIGNED(addr, size))) {
+ addr = ALIGN_DOWN(addr, size);
+ pte = PTR_ALIGN_DOWN(pte, sizeof(*pte) * (size >> PAGE_SHIFT));
+ }
+ ptent = huge_ptep_get_and_clear(&init_mm, addr, pte, size);
+ if (WARN_ON(end - addr < size))
+ size = end - addr;
+ } else
+#endif
+ ptent = ptep_get_and_clear(&init_mm, addr, pte);
WARN_ON(!pte_none(ptent) && !pte_present(ptent));
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
}
@@ -374,8 +397,10 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
if (cleared || pmd_bad(*pmd))
*mask |= PGTBL_PMD_MODIFIED;
- if (cleared)
+ if (cleared) {
+ WARN_ON(next - addr < PMD_SIZE);
continue;
+ }
if (pmd_none_or_clear_bad(pmd))
continue;
vunmap_pte_range(pmd, addr, next, mask);
@@ -399,8 +424,10 @@ static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
if (cleared || pud_bad(*pud))
*mask |= PGTBL_PUD_MODIFIED;
- if (cleared)
+ if (cleared) {
+ WARN_ON(next - addr < PUD_SIZE);
continue;
+ }
if (pud_none_or_clear_bad(pud))
continue;
vunmap_pmd_range(pud, addr, next, mask);
@@ -487,6 +514,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
+ int err = 0;
pte_t *pte;
/*
@@ -497,21 +525,33 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
+
+ arch_enter_lazy_mmu_mode();
+
do {
struct page *page = pages[*nr];
- if (WARN_ON(!pte_none(ptep_get(pte))))
- return -EBUSY;
- if (WARN_ON(!page))
- return -ENOMEM;
- if (WARN_ON(!pfn_valid(page_to_pfn(page))))
- return -EINVAL;
+ if (WARN_ON(!pte_none(ptep_get(pte)))) {
+ err = -EBUSY;
+ break;
+ }
+ if (WARN_ON(!page)) {
+ err = -ENOMEM;
+ break;
+ }
+ if (WARN_ON(!pfn_valid(page_to_pfn(page)))) {
+ err = -EINVAL;
+ break;
+ }
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
- return 0;
+
+ return err;
}
static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
@@ -900,6 +940,11 @@ static struct vmap_node *vmap_nodes = &single;
static __read_mostly unsigned int nr_vmap_nodes = 1;
static __read_mostly unsigned int vmap_zone_size = 1;
+/* A simple iterator over all vmap-nodes. */
+#define for_each_vmap_node(vn) \
+ for ((vn) = &vmap_nodes[0]; \
+ (vn) < &vmap_nodes[nr_vmap_nodes]; (vn)++)
+
static inline unsigned int
addr_to_node_id(unsigned long addr)
{
@@ -918,6 +963,19 @@ id_to_node(unsigned int id)
return &vmap_nodes[id % nr_vmap_nodes];
}
+static inline unsigned int
+node_to_id(struct vmap_node *node)
+{
+ /* Pointer arithmetic. */
+ unsigned int id = node - vmap_nodes;
+
+ if (likely(id < nr_vmap_nodes))
+ return id;
+
+ WARN_ONCE(1, "An address 0x%p is out-of-bounds.\n", node);
+ return 0;
+}
+
/*
* We use the value 0 to represent "no node", that is why
* an encoded value will be the node-id incremented by 1.
@@ -990,7 +1048,8 @@ static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static void drain_vmap_area_work(struct work_struct *work);
static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
-static atomic_long_t nr_vmalloc_pages;
+static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages;
+static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr;
unsigned long vmalloc_nr_pages(void)
{
@@ -1056,12 +1115,11 @@ find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
{
unsigned long va_start_lowest;
struct vmap_node *vn;
- int i;
repeat:
- for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ va_start_lowest = 0;
+ for_each_vmap_node(vn) {
spin_lock(&vn->busy.lock);
*va = __find_vmap_area_exceed_addr(addr, &vn->busy.root);
@@ -1698,7 +1756,7 @@ va_clip(struct rb_root *root, struct list_head *head,
*/
lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
if (!lva)
- return -1;
+ return -ENOMEM;
}
/*
@@ -1712,7 +1770,7 @@ va_clip(struct rb_root *root, struct list_head *head,
*/
va->va_start = nva_start_addr + size;
} else {
- return -1;
+ return -EINVAL;
}
if (type != FL_FIT_TYPE) {
@@ -1741,19 +1799,19 @@ va_alloc(struct vmap_area *va,
/* Check the "vend" restriction. */
if (nva_start_addr + size > vend)
- return vend;
+ return -ERANGE;
/* Update the free vmap_area. */
ret = va_clip(root, head, va, nva_start_addr, size);
if (WARN_ON_ONCE(ret))
- return vend;
+ return ret;
return nva_start_addr;
}
/*
* Returns a start address of the newly allocated area, if success.
- * Otherwise a vend is returned that indicates failure.
+ * Otherwise an error value is returned that indicates failure.
*/
static __always_inline unsigned long
__alloc_vmap_area(struct rb_root *root, struct list_head *head,
@@ -1778,14 +1836,13 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
if (unlikely(!va))
- return vend;
+ return -ENOENT;
nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend);
- if (nva_start_addr == vend)
- return vend;
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
- find_vmap_lowest_match_check(root, head, size, align);
+ if (!IS_ERR_VALUE(nva_start_addr))
+ find_vmap_lowest_match_check(root, head, size, align);
#endif
return nva_start_addr;
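The hunks above replace the old "return vend on failure" convention with negative errno values folded into the unsigned long return, which callers test with IS_ERR_VALUE(). A minimal userspace sketch of that convention; MAX_ERRNO, the helper macro and fake_alloc() are stand-ins defined here, not the kernel's:

#include <stdio.h>

#define MAX_ERRNO	4095UL
#define IS_ERR_VALUE(x)	((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

/* Stand-in allocator: returns either a start address or a negative errno. */
static unsigned long fake_alloc(int fail)
{
	if (fail)
		return (unsigned long)-12;	/* -ENOMEM */
	return 0x10000000UL;			/* some valid start address */
}

int main(void)
{
	unsigned long ok = fake_alloc(0);
	unsigned long bad = fake_alloc(1);

	/* Prints "0 1": only the errno-carrying value is flagged as an error. */
	printf("%d %d\n", IS_ERR_VALUE(ok), IS_ERR_VALUE(bad));
	return 0;
}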
@@ -1915,7 +1972,7 @@ node_alloc(unsigned long size, unsigned long align,
struct vmap_area *va;
*vn_id = 0;
- *addr = vend;
+ *addr = -EINVAL;
/*
* Fallback to a global heap if not vmalloc or there
@@ -1940,7 +1997,7 @@ static inline void setup_vmalloc_vm(struct vm_struct *vm,
{
vm->flags = flags;
vm->addr = (void *)va->va_start;
- vm->size = va_size(va);
+ vm->size = vm->requested_size = va_size(va);
vm->caller = caller;
va->vm = vm;
}
@@ -1995,20 +2052,20 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
}
retry:
- if (addr == vend) {
+ if (IS_ERR_VALUE(addr)) {
preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
size, align, vstart, vend);
spin_unlock(&free_vmap_area_lock);
}
- trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
+ trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr));
/*
- * If an allocation fails, the "vend" address is
+ * If an allocation fails, the error value is
* returned. Therefore trigger the overflow path.
*/
- if (unlikely(addr == vend))
+ if (IS_ERR_VALUE(addr))
goto overflow;
va->va_start = addr;
@@ -2100,8 +2157,6 @@ static unsigned long lazy_max_pages(void)
return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}
-static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
-
/*
* Serialize vmap purging. There is no actual critical section protected
* by this lock, but we want to avoid concurrent calls for performance
@@ -2111,7 +2166,6 @@ static DEFINE_MUTEX(vmap_purge_lock);
/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
-static cpumask_t purge_nodes;
static void
reclaim_list_global(struct list_head *head)
@@ -2134,7 +2188,7 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
LIST_HEAD(decay_list);
struct rb_root decay_root = RB_ROOT;
struct vmap_area *va, *nva;
- unsigned long n_decay;
+ unsigned long n_decay, pool_len;
int i;
for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
@@ -2148,22 +2202,20 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
list_replace_init(&vn->pool[i].head, &tmp_list);
spin_unlock(&vn->pool_lock);
- if (full_decay)
- WRITE_ONCE(vn->pool[i].len, 0);
+ pool_len = n_decay = vn->pool[i].len;
+ WRITE_ONCE(vn->pool[i].len, 0);
/* Decay a pool by ~25% out of left objects. */
- n_decay = vn->pool[i].len >> 2;
+ if (!full_decay)
+ n_decay >>= 2;
+ pool_len -= n_decay;
list_for_each_entry_safe(va, nva, &tmp_list, list) {
+ if (!n_decay--)
+ break;
+
list_del_init(&va->list);
merge_or_add_vmap_area(va, &decay_root, &decay_list);
-
- if (!full_decay) {
- WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1);
-
- if (!--n_decay)
- break;
- }
}
/*
@@ -2172,9 +2224,10 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
* can populate the pool therefore a simple list replace
* operation takes place here.
*/
- if (!full_decay && !list_empty(&tmp_list)) {
+ if (!list_empty(&tmp_list)) {
spin_lock(&vn->pool_lock);
list_replace_init(&tmp_list, &vn->pool[i].head);
+ WRITE_ONCE(vn->pool[i].len, pool_len);
spin_unlock(&vn->pool_lock);
}
}
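The reworked decay path detaches the whole per-size pool, decides how many entries to drop (everything on a full decay, otherwise roughly a quarter), and re-attaches the survivors together with the corrected length. A standalone sketch of just that arithmetic, with invented names:

#include <stdio.h>

/* Returns how many objects remain in a pool of `len` after one decay pass. */
static unsigned long decay_pool(unsigned long len, int full_decay)
{
	unsigned long n_decay = len;

	if (!full_decay)
		n_decay >>= 2;		/* drop ~25% of the remaining objects */

	return len - n_decay;		/* survivors are put back into the pool */
}

int main(void)
{
	printf("%lu\n", decay_pool(16, 0));	/* 12 remain */
	printf("%lu\n", decay_pool(16, 1));	/* 0 remain  */
	return 0;
}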
@@ -2244,6 +2297,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
{
unsigned long nr_purged_areas = 0;
unsigned int nr_purge_helpers;
+ static cpumask_t purge_nodes;
unsigned int nr_purge_nodes;
struct vmap_node *vn;
int i;
@@ -2255,9 +2309,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
*/
purge_nodes = CPU_MASK_NONE;
- for (i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
-
+ for_each_vmap_node(vn) {
INIT_LIST_HEAD(&vn->purge_list);
vn->skip_populate = full_pool_decay;
decay_va_pool_node(vn, full_pool_decay);
@@ -2276,7 +2328,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
end = max(end, list_last_entry(&vn->purge_list,
struct vmap_area, list)->va_end);
- cpumask_set_cpu(i, &purge_nodes);
+ cpumask_set_cpu(node_to_id(vn), &purge_nodes);
}
nr_purge_nodes = cpumask_weight(&purge_nodes);
@@ -2355,7 +2407,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
if (WARN_ON_ONCE(!list_empty(&va->list)))
return;
- nr_lazy = atomic_long_add_return(va_size(va) >> PAGE_SHIFT,
+ nr_lazy = atomic_long_add_return_relaxed(va_size(va) >> PAGE_SHIFT,
&vmap_lazy_nr);
/*
@@ -2421,7 +2473,7 @@ struct vmap_area *find_vmap_area(unsigned long addr)
if (va)
return va;
- } while ((i = (i + 1) % nr_vmap_nodes) != j);
+ } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);
return NULL;
}
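The lookup now probes the nodes backwards from the node the address hashes to, since a vmap_area spanning several nodes starts in an earlier one. A tiny sketch of the wrap-around iteration; the node count and start index are arbitrary here:

#include <stdio.h>

int main(void)
{
	const unsigned int nr_nodes = 4;
	unsigned int i, j;

	i = j = 2;	/* node the address maps to */
	do {
		printf("probe node %u\n", i);	/* prints 2, 1, 0, 3 */
	} while ((i = (i + nr_nodes - 1) % nr_nodes) != j);

	return 0;
}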
@@ -2447,7 +2499,7 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
if (va)
return va;
- } while ((i = (i + 1) % nr_vmap_nodes) != j);
+ } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);
return NULL;
}
@@ -2916,10 +2968,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
*/
void vm_unmap_aliases(void)
{
- unsigned long start = ULONG_MAX, end = 0;
- int flush = 0;
-
- _vm_unmap_aliases(start, end, flush);
+ _vm_unmap_aliases(ULONG_MAX, 0, 0);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
@@ -3100,7 +3149,7 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
/*
* Before removing VM_UNINITIALIZED,
* we should make sure that vm has proper values.
- * Pair with smp_rmb() in show_numa_info().
+ * Pair with smp_rmb() in vread_iter() and vmalloc_info_show().
*/
smp_wmb();
vm->flags &= ~VM_UNINITIALIZED;
@@ -3133,6 +3182,7 @@ struct vm_struct *__get_vm_area_node(unsigned long size,
area->flags = flags;
area->caller = caller;
+ area->requested_size = requested_size;
va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
if (IS_ERR(va)) {
@@ -3370,12 +3420,13 @@ void vfree(const void *addr)
if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
vm_reset_perms(vm);
+ /* All pages of vm should be charged to same memcg, so use first one. */
+ /* All pages of vm should be charged to the same memcg, so use the first one. */
+ if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES))
+ mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages);
for (i = 0; i < vm->nr_pages; i++) {
struct page *page = vm->pages[i];
BUG_ON(!page);
- if (!(vm->flags & VM_MAP_PUT_PAGES))
- mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
/*
* High-order allocs for huge vmallocs are split, so
* can be freed as an array of order-0 allocations
@@ -3671,12 +3722,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
node, page_order, nr_small_pages, area->pages);
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- if (gfp_mask & __GFP_ACCOUNT) {
- int i;
-
- for (i = 0; i < area->nr_pages; i++)
- mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
- }
+ /* All pages of vm should be charged to same memcg, so use first one. */
+ /* All pages of vm should be charged to the same memcg, so use the first one. */
+ if (gfp_mask & __GFP_ACCOUNT && area->nr_pages)
+ mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC,
+ area->nr_pages);
/*
* If not enough pages were obtained to accomplish an
@@ -3943,9 +3992,10 @@ void *vmalloc_noprof(unsigned long size)
EXPORT_SYMBOL(vmalloc_noprof);
/**
- * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
+ * vmalloc_huge_node - allocate virtually contiguous memory, allow huge pages
* @size: allocation size
* @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
*
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
@@ -3954,13 +4004,13 @@ EXPORT_SYMBOL(vmalloc_noprof);
*
* Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask)
+void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
{
return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
- gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
- NUMA_NO_NODE, __builtin_return_address(0));
+ gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+ node, __builtin_return_address(0));
}
-EXPORT_SYMBOL_GPL(vmalloc_huge_noprof);
+EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof);
/**
* vzalloc - allocate virtually contiguous memory with zero fill
@@ -4063,6 +4113,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof);
*/
void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
{
+ struct vm_struct *vm = NULL;
+ size_t alloced_size = 0;
size_t old_size = 0;
void *n;
@@ -4072,15 +4124,17 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
}
if (p) {
- struct vm_struct *vm;
-
vm = find_vm_area(p);
if (unlikely(!vm)) {
WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p);
return NULL;
}
- old_size = get_vm_area_size(vm);
+ alloced_size = get_vm_area_size(vm);
+ old_size = vm->requested_size;
+ if (WARN(alloced_size < old_size,
+ "vrealloc() has mismatched area vs requested sizes (%p)\n", p))
+ return NULL;
}
/*
@@ -4088,11 +4142,26 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
* would be a good heuristic for when to shrink the vm_area?
*/
if (size <= old_size) {
- /* Zero out spare memory. */
- if (want_init_on_alloc(flags))
+ /* Zero out "freed" memory, potentially for future realloc. */
+ if (want_init_on_free() || want_init_on_alloc(flags))
memset((void *)p + size, 0, old_size - size);
+ vm->requested_size = size;
kasan_poison_vmalloc(p + size, old_size - size);
- kasan_unpoison_vmalloc(p, size, KASAN_VMALLOC_PROT_NORMAL);
+ return (void *)p;
+ }
+
+ /*
+ * We already have the bytes available in the allocation; use them.
+ */
+ if (size <= alloced_size) {
+ kasan_unpoison_vmalloc(p + old_size, size - old_size,
+ KASAN_VMALLOC_PROT_NORMAL);
+ /*
+ * No need to zero memory here, as unused memory will have
+ * already been zeroed at initial allocation time or during
+ * realloc shrink time.
+ */
+ vm->requested_size = size;
return (void *)p;
}
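With requested_size tracked separately from the page-aligned area size, vrealloc() can now pick between three paths. A minimal model of that decision, under the assumption of a 5000-byte request backed by two whole pages; pick_path() and the enum are illustrative only:

#include <assert.h>
#include <stddef.h>

enum path { SHRINK_IN_PLACE, GROW_IN_PLACE, NEW_ALLOCATION };

static enum path pick_path(size_t requested, size_t alloced, size_t new_size)
{
	if (new_size <= requested)
		return SHRINK_IN_PLACE;	/* zero/poison the now-unused tail */
	if (new_size <= alloced)
		return GROW_IN_PLACE;	/* unpoison the grown tail, no copy */
	return NEW_ALLOCATION;		/* allocate, copy, free the old area */
}

int main(void)
{
	/* 5000 bytes requested, 8192 actually backed by whole pages. */
	assert(pick_path(5000, 8192, 4000) == SHRINK_IN_PLACE);
	assert(pick_path(5000, 8192, 6000) == GROW_IN_PLACE);
	assert(pick_path(5000, 8192, 9000) == NEW_ALLOCATION);
	return 0;
}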
@@ -4914,39 +4983,37 @@ bool vmalloc_dump_obj(void *object)
#endif
#ifdef CONFIG_PROC_FS
-static void show_numa_info(struct seq_file *m, struct vm_struct *v)
-{
- if (IS_ENABLED(CONFIG_NUMA)) {
- unsigned int nr, *counters = m->private;
- unsigned int step = 1U << vm_area_page_order(v);
- if (!counters)
- return;
+/*
+ * Print number of pages allocated on each memory node.
+ *
+ * This function can only be called if CONFIG_NUMA is enabled
+ * and VM_UNINITIALIZED bit in v->flags is disabled.
+ */
+static void show_numa_info(struct seq_file *m, struct vm_struct *v,
+ unsigned int *counters)
+{
+ unsigned int nr;
+ unsigned int step = 1U << vm_area_page_order(v);
- if (v->flags & VM_UNINITIALIZED)
- return;
- /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
- smp_rmb();
+ if (!counters)
+ return;
- memset(counters, 0, nr_node_ids * sizeof(unsigned int));
+ memset(counters, 0, nr_node_ids * sizeof(unsigned int));
- for (nr = 0; nr < v->nr_pages; nr += step)
- counters[page_to_nid(v->pages[nr])] += step;
- for_each_node_state(nr, N_HIGH_MEMORY)
- if (counters[nr])
- seq_printf(m, " N%u=%u", nr, counters[nr]);
- }
+ for (nr = 0; nr < v->nr_pages; nr += step)
+ counters[page_to_nid(v->pages[nr])] += step;
+ for_each_node_state(nr, N_HIGH_MEMORY)
+ if (counters[nr])
+ seq_printf(m, " N%u=%u", nr, counters[nr]);
}
static void show_purge_info(struct seq_file *m)
{
struct vmap_node *vn;
struct vmap_area *va;
- int i;
-
- for (i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ for_each_vmap_node(vn) {
spin_lock(&vn->lazy.lock);
list_for_each_entry(va, &vn->lazy.head, list) {
seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
@@ -4962,11 +5029,12 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
struct vmap_node *vn;
struct vmap_area *va;
struct vm_struct *v;
- int i;
+ unsigned int *counters;
- for (i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ if (IS_ENABLED(CONFIG_NUMA))
+ counters = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
+ for_each_vmap_node(vn) {
spin_lock(&vn->busy.lock);
list_for_each_entry(va, &vn->busy.head, list) {
if (!va->vm) {
@@ -4979,6 +5047,11 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
}
v = va->vm;
+ if (v->flags & VM_UNINITIALIZED)
+ continue;
+
+ /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+ smp_rmb();
seq_printf(m, "0x%pK-0x%pK %7ld",
v->addr, v->addr + v->size, v->size);
@@ -5013,7 +5086,9 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
if (is_vmalloc_addr(v->pages))
seq_puts(m, " vpages");
- show_numa_info(m, v);
+ if (IS_ENABLED(CONFIG_NUMA))
+ show_numa_info(m, v, counters);
+
seq_putc(m, '\n');
}
spin_unlock(&vn->busy.lock);
@@ -5023,19 +5098,14 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
* As a final step, dump "unpurged" areas.
*/
show_purge_info(m);
+ if (IS_ENABLED(CONFIG_NUMA))
+ kfree(counters);
return 0;
}
static int __init proc_vmalloc_init(void)
{
- void *priv_data = NULL;
-
- if (IS_ENABLED(CONFIG_NUMA))
- priv_data = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
-
- proc_create_single_data("vmallocinfo",
- 0400, NULL, vmalloc_info_show, priv_data);
-
+ proc_create_single("vmallocinfo", 0400, NULL, vmalloc_info_show);
return 0;
}
module_init(proc_vmalloc_init);
@@ -5087,7 +5157,7 @@ static void __init vmap_init_free_space(void)
static void vmap_init_nodes(void)
{
struct vmap_node *vn;
- int i, n;
+ int i;
#if BITS_PER_LONG == 64
/*
@@ -5104,7 +5174,7 @@ static void vmap_init_nodes(void)
* set of cores. Therefore a per-domain purging is supposed to
* be added as well as a per-domain balancing.
*/
- n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
+ int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
if (n > 1) {
vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN);
@@ -5119,8 +5189,7 @@ static void vmap_init_nodes(void)
}
#endif
- for (n = 0; n < nr_vmap_nodes; n++) {
- vn = &vmap_nodes[n];
+ for_each_vmap_node(vn) {
vn->busy.root = RB_ROOT;
INIT_LIST_HEAD(&vn->busy.head);
spin_lock_init(&vn->busy.lock);
@@ -5141,15 +5210,13 @@ static void vmap_init_nodes(void)
static unsigned long
vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
- unsigned long count;
+ unsigned long count = 0;
struct vmap_node *vn;
- int i, j;
-
- for (count = 0, i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ int i;
- for (j = 0; j < MAX_VA_SIZE_PAGES; j++)
- count += READ_ONCE(vn->pool[j].len);
+ for_each_vmap_node(vn) {
+ for (i = 0; i < MAX_VA_SIZE_PAGES; i++)
+ count += READ_ONCE(vn->pool[i].len);
}
return count ? count : SHRINK_EMPTY;
@@ -5158,10 +5225,10 @@ vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
static unsigned long
vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
- int i;
+ struct vmap_node *vn;
- for (i = 0; i < nr_vmap_nodes; i++)
- decay_va_pool_node(&vmap_nodes[i], true);
+ for_each_vmap_node(vn)
+ decay_va_pool_node(vn, true);
return SHRINK_STOP;
}
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index bd5183dfd879..c197ed47bcc4 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -316,7 +316,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
* asserted for a second in which subsequent
* pressure events can occur.
*/
- WRITE_ONCE(memcg->socket_pressure, jiffies + HZ);
+ mem_cgroup_set_socket_pressure(memcg);
}
}
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b620d74b0f66..7de11524a936 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -57,6 +57,7 @@
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <linux/mmu_notifier.h>
+#include <linux/parser.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -93,10 +94,8 @@ struct scan_control {
unsigned long anon_cost;
unsigned long file_cost;
-#ifdef CONFIG_MEMCG
/* Swappiness value for proactive reclaim. Always use sc_swappiness()! */
int *proactive_swappiness;
-#endif
/* Can active folios be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
@@ -120,7 +119,7 @@ struct scan_control {
/* Has cache_trim_mode failed at least once? */
unsigned int cache_trim_mode_failed:1;
- /* Proactive reclaim invoked by userspace through memory.reclaim */
+ /* Proactive reclaim invoked by userspace */
unsigned int proactive:1;
/*
@@ -342,16 +341,22 @@ static void flush_reclaim_state(struct scan_control *sc)
}
}
-static bool can_demote(int nid, struct scan_control *sc)
+static bool can_demote(int nid, struct scan_control *sc,
+ struct mem_cgroup *memcg)
{
+ int demotion_nid;
+
if (!numa_demotion_enabled)
return false;
if (sc && sc->no_demotion)
return false;
- if (next_demotion_node(nid) == NUMA_NO_NODE)
+
+ demotion_nid = next_demotion_node(nid);
+ if (demotion_nid == NUMA_NO_NODE)
return false;
- return true;
+ /* If demotion node isn't in the cgroup's mems_allowed, fall back */
+ return mem_cgroup_node_allowed(memcg, demotion_nid);
}
static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
@@ -376,7 +381,7 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
*
* Can it be reclaimed from this node via demotion?
*/
- return can_demote(nid, sc);
+ return can_demote(nid, sc, memcg);
}
/*
@@ -646,23 +651,53 @@ typedef enum {
PAGE_CLEAN,
} pageout_t;
+static pageout_t writeout(struct folio *folio, struct address_space *mapping,
+ struct swap_iocb **plug, struct list_head *folio_list)
+{
+ int res;
+
+ folio_set_reclaim(folio);
+
+ /*
+ * The large shmem folio can be split if CONFIG_THP_SWAP is not enabled
+ * or we failed to allocate contiguous swap entries, in which case
+ * the split out folios get added back to folio_list.
+ */
+ if (shmem_mapping(mapping))
+ res = shmem_writeout(folio, plug, folio_list);
+ else
+ res = swap_writeout(folio, plug);
+
+ if (res < 0)
+ handle_write_error(mapping, folio, res);
+ if (res == AOP_WRITEPAGE_ACTIVATE) {
+ folio_clear_reclaim(folio);
+ return PAGE_ACTIVATE;
+ }
+
+ /* synchronous write? */
+ if (!folio_test_writeback(folio))
+ folio_clear_reclaim(folio);
+
+ trace_mm_vmscan_write_folio(folio);
+ node_stat_add_folio(folio, NR_VMSCAN_WRITE);
+ return PAGE_SUCCESS;
+}
+
/*
* pageout is called by shrink_folio_list() for each dirty folio.
- * Calls ->writepage().
*/
static pageout_t pageout(struct folio *folio, struct address_space *mapping,
struct swap_iocb **plug, struct list_head *folio_list)
{
/*
- * If the folio is dirty, only perform writeback if that write
- * will be non-blocking. To prevent this allocation from being
- * stalled by pagecache activity. But note that there may be
- * stalls if we need to run get_block(). We could test
- * PagePrivate for that.
- *
- * If this process is currently in __generic_file_write_iter() against
- * this folio's queue, we can perform writeback even if that
- * will block.
+ * We no longer attempt to writeback filesystem folios here, other
+ * than tmpfs/shmem. That's taken care of in page-writeback.
+ * If we find a dirty filesystem folio at the end of the LRU list,
+ * typically that means the filesystem is saturating the storage
+ * with contiguous writes and telling it to write a folio here
+ * would only make the situation worse by injecting an element
+ * of random access.
*
* If the folio is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
@@ -685,47 +720,12 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
}
return PAGE_KEEP;
}
- if (mapping->a_ops->writepage == NULL)
- return PAGE_ACTIVATE;
-
- if (folio_clear_dirty_for_io(folio)) {
- int res;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .nr_to_write = SWAP_CLUSTER_MAX,
- .range_start = 0,
- .range_end = LLONG_MAX,
- .for_reclaim = 1,
- .swap_plug = plug,
- };
-
- /*
- * The large shmem folio can be split if CONFIG_THP_SWAP is
- * not enabled or contiguous swap entries are failed to
- * allocate.
- */
- if (shmem_mapping(mapping) && folio_test_large(folio))
- wbc.list = folio_list;
-
- folio_set_reclaim(folio);
- res = mapping->a_ops->writepage(&folio->page, &wbc);
- if (res < 0)
- handle_write_error(mapping, folio, res);
- if (res == AOP_WRITEPAGE_ACTIVATE) {
- folio_clear_reclaim(folio);
- return PAGE_ACTIVATE;
- }
-
- if (!folio_test_writeback(folio)) {
- /* synchronous write or broken a_ops? */
- folio_clear_reclaim(folio);
- }
- trace_mm_vmscan_write_folio(folio);
- node_stat_add_folio(folio, NR_VMSCAN_WRITE);
- return PAGE_SUCCESS;
- }
- return PAGE_CLEAN;
+ if (!shmem_mapping(mapping) && !folio_test_anon(folio))
+ return PAGE_ACTIVATE;
+ if (!folio_clear_dirty_for_io(folio))
+ return PAGE_CLEAN;
+ return writeout(folio, mapping, plug, folio_list);
}
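After this rework, reclaim only writes back shmem and swap-backed folios itself; other dirty file folios are re-activated and left to the flusher threads. A reduced userspace model of that outcome selection; it keeps only the three outcomes shown above and skips the mapping/swapcache checks that precede them, and the struct is invented:

#include <stdbool.h>
#include <stdio.h>

typedef enum { PAGE_KEEP, PAGE_ACTIVATE, PAGE_SUCCESS, PAGE_CLEAN } pageout_t;

struct folio_model {
	bool shmem;	/* tmpfs/shmem backed */
	bool anon;	/* anonymous, i.e. swap backed */
	bool dirty;	/* still dirty when we try to clear the flag */
};

static pageout_t pageout_model(const struct folio_model *f)
{
	if (!f->shmem && !f->anon)
		return PAGE_ACTIVATE;	/* leave it to the flusher threads */
	if (!f->dirty)
		return PAGE_CLEAN;	/* clearing the dirty flag failed */
	return PAGE_SUCCESS;		/* write out to shmem or swap */
}

int main(void)
{
	struct folio_model file_dirty = { false, false, true };
	struct folio_model shmem_dirty = { true, false, true };

	/* Prints "1 2": activate the file folio, write out the shmem one. */
	printf("%d %d\n", pageout_model(&file_dirty), pageout_model(&shmem_dirty));
	return 0;
}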
/*
@@ -906,7 +906,7 @@ static enum folio_references folio_check_references(struct folio *folio,
struct scan_control *sc)
{
int referenced_ptes, referenced_folio;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
&vm_flags);
@@ -1005,7 +1005,8 @@ static void folio_check_dirty_writeback(struct folio *folio,
mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
}
-struct folio *alloc_migrate_folio(struct folio *src, unsigned long private)
+static struct folio *alloc_demote_folio(struct folio *src,
+ unsigned long private)
{
struct folio *dst;
nodemask_t *allowed_mask;
@@ -1068,7 +1069,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
node_get_allowed_targets(pgdat, &allowed_mask);
/* Demotion ignores all cpuset and mempolicy settings */
- migrate_pages(demote_folios, alloc_migrate_folio, NULL,
+ migrate_pages(demote_folios, alloc_demote_folio, NULL,
(unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);
@@ -1096,7 +1097,8 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
*/
static unsigned int shrink_folio_list(struct list_head *folio_list,
struct pglist_data *pgdat, struct scan_control *sc,
- struct reclaim_stat *stat, bool ignore_references)
+ struct reclaim_stat *stat, bool ignore_references,
+ struct mem_cgroup *memcg)
{
struct folio_batch free_folios;
LIST_HEAD(ret_folios);
@@ -1109,7 +1111,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
folio_batch_init(&free_folios);
memset(stat, 0, sizeof(*stat));
cond_resched();
- do_demote_pass = can_demote(pgdat->node_id, sc);
+ do_demote_pass = can_demote(pgdat->node_id, sc, memcg);
retry:
while (!list_empty(folio_list)) {
@@ -1128,6 +1130,14 @@ retry:
goto keep;
if (folio_contain_hwpoisoned_page(folio)) {
+ /*
+ * unmap_poisoned_folio() can't handle large
+ * folio, just skip it. memory_failure() will
+ * handle it if the UCE is triggered again.
+ */
+ if (folio_test_large(folio))
+ goto keep_locked;
+
unmap_poisoned_folio(folio, folio_pfn(folio), false);
folio_unlock(folio);
folio_put(folio);
@@ -1187,8 +1197,10 @@ retry:
* 2) Global or new memcg reclaim encounters a folio that is
* not marked for immediate reclaim, or the caller does not
* have __GFP_FS (or __GFP_IO if it's simply going to swap,
- * not to fs). In this case mark the folio for immediate
- * reclaim and continue scanning.
+ * not to fs), or the folio belongs to a mapping where
+ * waiting on writeback during reclaim may lead to a deadlock.
+ * In this case mark the folio for immediate reclaim and
+ * continue scanning.
*
* Require may_enter_fs() because we would wait on fs, which
* may not have submitted I/O yet. And the loop driver might
@@ -1213,6 +1225,8 @@ retry:
* takes to write them to disk.
*/
if (folio_test_writeback(folio)) {
+ mapping = folio_mapping(folio);
+
/* Case 1 above */
if (current_is_kswapd() &&
folio_test_reclaim(folio) &&
@@ -1223,7 +1237,9 @@ retry:
/* Case 2 above */
} else if (writeback_throttling_sane(sc) ||
!folio_test_reclaim(folio) ||
- !may_enter_fs(folio, sc->gfp_mask)) {
+ !may_enter_fs(folio, sc->gfp_mask) ||
+ (mapping &&
+ mapping_writeback_may_deadlock_on_reclaim(mapping))) {
/*
* This is slightly racy -
* folio_end_writeback() might have
@@ -1642,9 +1658,11 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
unsigned int noreclaim_flag;
list_for_each_entry_safe(folio, next, folio_list, lru) {
+ /* TODO: these pages should not even appear in this list. */
+ if (page_has_movable_ops(&folio->page))
+ continue;
if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
- !folio_test_dirty(folio) && !__folio_test_movable(folio) &&
- !folio_test_unevictable(folio)) {
+ !folio_test_dirty(folio) && !folio_test_unevictable(folio)) {
folio_clear_active(folio);
list_move(&folio->lru, &clean_folios);
}
@@ -1658,7 +1676,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
*/
noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
- &stat, true);
+ &stat, true, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
list_splice(&clean_folios, folio_list);
@@ -1725,13 +1743,11 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
unsigned long nr_taken = 0;
unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
- unsigned long skipped = 0;
- unsigned long scan, total_scan, nr_pages;
+ unsigned long skipped = 0, total_scan = 0, scan = 0;
+ unsigned long nr_pages;
unsigned long max_nr_skipped = 0;
LIST_HEAD(folios_skipped);
- total_scan = 0;
- scan = 0;
while (scan < nr_to_scan && !list_empty(src)) {
struct list_head *move_to = src;
struct folio *folio;
@@ -2023,7 +2039,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
item = PGSCAN_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_scanned);
- __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
__count_vm_events(PGSCAN_ANON + file, nr_scanned);
spin_unlock_irq(&lruvec->lru_lock);
@@ -2031,7 +2047,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
+ nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false,
+ lruvec_memcg(lruvec));
spin_lock_irq(&lruvec->lru_lock);
move_folios_to_lru(lruvec, &folio_list);
@@ -2042,11 +2059,11 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
- __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
- spin_unlock_irq(&lruvec->lru_lock);
- lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
+ lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout,
+ nr_scanned - nr_reclaimed);
/*
* If dirty folios are scanned that are not queued for IO, it
@@ -2112,7 +2129,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
{
unsigned long nr_taken;
unsigned long nr_scanned;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
LIST_HEAD(l_hold); /* The folios which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
@@ -2132,7 +2149,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
if (!cgroup_reclaim(sc))
__count_vm_events(PGREFILL, nr_scanned);
- __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
spin_unlock_irq(&lruvec->lru_lock);
@@ -2189,13 +2206,11 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
__count_vm_events(PGDEACTIVATE, nr_deactivate);
- __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- spin_unlock_irq(&lruvec->lru_lock);
- if (nr_rotated)
- lru_note_cost(lruvec, file, 0, nr_rotated);
+ lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
nr_deactivate, nr_rotated, sc->priority, file);
}
@@ -2214,7 +2229,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
.no_demotion = 1,
};
- nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true);
+ nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL);
while (!list_empty(folio_list)) {
folio = lru_to_folio(folio_list);
list_del(&folio->lru);
@@ -2467,6 +2482,69 @@ static inline void calculate_pressure_balance(struct scan_control *sc,
*denominator = ap + fp;
}
+static unsigned long apply_proportional_protection(struct mem_cgroup *memcg,
+ struct scan_control *sc, unsigned long scan)
+{
+ unsigned long min, low;
+
+ mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low);
+
+ if (min || low) {
+ /*
+ * Scale a cgroup's reclaim pressure by proportioning
+ * its current usage to its memory.low or memory.min
+ * setting.
+ *
+ * This is important, as otherwise scanning aggression
+ * becomes extremely binary -- from nothing as we
+ * approach the memory protection threshold, to totally
+ * nominal as we exceed it. This results in requiring
+ * setting extremely liberal protection thresholds. It
+ * also means we simply get no protection at all if we
+ * set it too low, which is not ideal.
+ *
+ * If there is any protection in place, we reduce scan
+ * pressure by how much of the total memory used is
+ * within protection thresholds.
+ *
+ * There is one special case: in the first reclaim pass,
+ * we skip over all groups that are within their low
+ * protection. If that fails to reclaim enough pages to
+ * satisfy the reclaim goal, we come back and override
+ * the best-effort low protection. However, we still
+ * ideally want to honor how well-behaved groups are in
+ * that case instead of simply punishing them all
+ * equally. As such, we reclaim them based on how much
+ * memory they are using, reducing the scan pressure
+ * again by how much of the total memory used is under
+ * hard protection.
+ */
+ unsigned long cgroup_size = mem_cgroup_size(memcg);
+ unsigned long protection;
+
+ /* memory.low scaling, make sure we retry before OOM */
+ if (!sc->memcg_low_reclaim && low > min) {
+ protection = low;
+ sc->memcg_low_skipped = 1;
+ } else {
+ protection = min;
+ }
+
+ /* Avoid TOCTOU with earlier protection check */
+ cgroup_size = max(cgroup_size, protection);
+
+ scan -= scan * protection / (cgroup_size + 1);
+
+ /*
+ * Minimally target SWAP_CLUSTER_MAX pages to keep
+ * reclaim moving forwards, avoiding decrementing
+ * sc->priority further than desirable.
+ */
+ scan = max(scan, SWAP_CLUSTER_MAX);
+ }
+ return scan;
+}
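The factored-out helper scales the scan target by how much of the cgroup's usage sits under its protection, with a SWAP_CLUSTER_MAX floor. A standalone sketch of the arithmetic, assuming SWAP_CLUSTER_MAX is 32 and using made-up inputs:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL	/* assumed value */

static unsigned long scale_scan(unsigned long scan, unsigned long usage,
				unsigned long protection)
{
	if (!protection)
		return scan;

	if (usage < protection)		/* guard against a racing usage read */
		usage = protection;

	scan -= scan * protection / (usage + 1);
	if (scan < SWAP_CLUSTER_MAX)
		scan = SWAP_CLUSTER_MAX;
	return scan;
}

int main(void)
{
	/* Half of the usage is protected -> roughly half the scan pressure. */
	printf("%lu\n", scale_scan(1024, 2048, 1024));	/* ~513 */
	return 0;
}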
+
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned.
@@ -2503,6 +2581,13 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
goto out;
}
+ /* Proactive reclaim initiated by userspace for anonymous memory only */
+ if (swappiness == SWAPPINESS_ANON_ONLY) {
+ WARN_ON_ONCE(!sc->proactive);
+ scan_balance = SCAN_ANON;
+ goto out;
+ }
+
/*
* Do not apply any pressure balancing cleverness when the
* system is close to OOM, scan both anon and file equally
@@ -2523,7 +2608,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
/*
* If there is enough inactive page cache, we do not reclaim
- * anything from the anonymous working right now.
- * anything from the anonymous working set right now.
+ * anything from the anonymous working set right now to make sure
+ * a streaming file access pattern doesn't cause swapping.
*/
if (sc->cache_trim_mode) {
scan_balance = SCAN_FILE;
@@ -2537,70 +2623,10 @@ out:
for_each_evictable_lru(lru) {
bool file = is_file_lru(lru);
unsigned long lruvec_size;
- unsigned long low, min;
unsigned long scan;
lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- mem_cgroup_protection(sc->target_mem_cgroup, memcg,
- &min, &low);
-
- if (min || low) {
- /*
- * Scale a cgroup's reclaim pressure by proportioning
- * its current usage to its memory.low or memory.min
- * setting.
- *
- * This is important, as otherwise scanning aggression
- * becomes extremely binary -- from nothing as we
- * approach the memory protection threshold, to totally
- * nominal as we exceed it. This results in requiring
- * setting extremely liberal protection thresholds. It
- * also means we simply get no protection at all if we
- * set it too low, which is not ideal.
- *
- * If there is any protection in place, we reduce scan
- * pressure by how much of the total memory used is
- * within protection thresholds.
- *
- * There is one special case: in the first reclaim pass,
- * we skip over all groups that are within their low
- * protection. If that fails to reclaim enough pages to
- * satisfy the reclaim goal, we come back and override
- * the best-effort low protection. However, we still
- * ideally want to honor how well-behaved groups are in
- * that case instead of simply punishing them all
- * equally. As such, we reclaim them based on how much
- * memory they are using, reducing the scan pressure
- * again by how much of the total memory used is under
- * hard protection.
- */
- unsigned long cgroup_size = mem_cgroup_size(memcg);
- unsigned long protection;
-
- /* memory.low scaling, make sure we retry before OOM */
- if (!sc->memcg_low_reclaim && low > min) {
- protection = low;
- sc->memcg_low_skipped = 1;
- } else {
- protection = min;
- }
-
- /* Avoid TOCTOU with earlier protection check */
- cgroup_size = max(cgroup_size, protection);
-
- scan = lruvec_size - lruvec_size * protection /
- (cgroup_size + 1);
-
- /*
- * Minimally target SWAP_CLUSTER_MAX pages to keep
- * reclaim moving forwards, avoiding decrementing
- * sc->priority further than desirable.
- */
- scan = max(scan, SWAP_CLUSTER_MAX);
- } else {
- scan = lruvec_size;
- }
-
+ scan = apply_proportional_protection(memcg, sc, lruvec_size);
scan >>= sc->priority;
/*
@@ -2646,7 +2672,7 @@ out:
* Anonymous LRU management is a waste if there is
* ultimately no way to reclaim the memory.
*/
-static bool can_age_anon_pages(struct pglist_data *pgdat,
+static bool can_age_anon_pages(struct lruvec *lruvec,
struct scan_control *sc)
{
/* Aging the anon LRU is valuable if swap is present: */
@@ -2654,7 +2680,8 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
return true;
/* Also valuable if anon pages can be demoted: */
- return can_demote(pgdat->node_id, sc);
+ return can_demote(lruvec_pgdat(lruvec)->node_id, sc,
+ lruvec_memcg(lruvec));
}
#ifdef CONFIG_LRU_GEN
@@ -2690,8 +2717,12 @@ static bool should_clear_pmd_young(void)
READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
}
+/* Get the min/max evictable type based on swappiness */
+#define min_type(swappiness) (!(swappiness))
+#define max_type(swappiness) ((swappiness) < SWAPPINESS_ANON_ONLY)
+
#define evictable_min_seq(min_seq, swappiness) \
- min((min_seq)[!(swappiness)], (min_seq)[(swappiness) <= MAX_SWAPPINESS])
+ min((min_seq)[min_type(swappiness)], (min_seq)[max_type(swappiness)])
#define for_each_gen_type_zone(gen, type, zone) \
for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
@@ -2699,7 +2730,7 @@ static bool should_clear_pmd_young(void)
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
#define for_each_evictable_type(type, swappiness) \
- for ((type) = !(swappiness); (type) <= ((swappiness) <= MAX_SWAPPINESS); (type)++)
+ for ((type) = min_type(swappiness); (type) <= max_type(swappiness); (type)++)
#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
@@ -2732,7 +2763,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
if (!sc->may_swap)
return 0;
- if (!can_demote(pgdat->node_id, sc) &&
+ if (!can_demote(pgdat->node_id, sc, memcg) &&
mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
return 0;
@@ -3401,7 +3432,7 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned
if (!pte_present(pte) || is_zero_pfn(pfn))
return -1;
- if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
+ if (WARN_ON_ONCE(pte_special(pte)))
return -1;
if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm))
@@ -3426,9 +3457,6 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
return -1;
- if (WARN_ON_ONCE(pmd_devmap(pmd)))
- return -1;
-
if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm))
return -1;
@@ -3850,7 +3878,12 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness)
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
- if (type ? swappiness > MAX_SWAPPINESS : !swappiness)
+ /* For file type, skip the check if swappiness is anon only */
+ if (type && (swappiness == SWAPPINESS_ANON_ONLY))
+ goto done;
+
+ /* For anon type, skip the check if swappiness is zero (file only) */
+ if (!type && !swappiness)
goto done;
/* prevent cold/hot inversion if the type is evictable */
@@ -3894,6 +3927,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness)
{
int gen, type, zone;
bool success = false;
+ bool seq_inc_flag = false;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
DEFINE_MIN_SEQ(lruvec);
@@ -3910,11 +3944,20 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness)
}
min_seq[type]++;
+ seq_inc_flag = true;
}
next:
;
}
+ /*
+ * If min_seq[type] was not increased for either the anon or the
+ * file type, return right away to avoid the unnecessary checks
+ * below.
+ */
+ if (!seq_inc_flag)
+ return success;
+
/* see the comment on lru_gen_folio */
if (swappiness && swappiness <= MAX_SWAPPINESS) {
unsigned long seq = lrugen->max_seq - MIN_NR_GENS;
@@ -4521,8 +4564,9 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
return true;
}
-static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
- int type, int tier, struct list_head *list)
+static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, int type, int tier,
+ struct list_head *list)
{
int i;
int gen;
@@ -4531,7 +4575,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
int scanned = 0;
int isolated = 0;
int skipped = 0;
- int remaining = MAX_LRU_BATCH;
+ int remaining = min(nr_to_scan, MAX_LRU_BATCH);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
@@ -4588,8 +4632,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
__count_vm_events(item, isolated);
__count_vm_events(PGREFILL, sorted);
}
- __count_memcg_events(memcg, item, isolated);
- __count_memcg_events(memcg, PGREFILL, sorted);
+ count_memcg_events(memcg, item, isolated);
+ count_memcg_events(memcg, PGREFILL, sorted);
__count_vm_events(PGSCAN_ANON + type, isolated);
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH,
scanned, skipped, isolated,
@@ -4642,7 +4686,8 @@ static int get_type_to_scan(struct lruvec *lruvec, int swappiness)
return positive_ctrl_err(&sp, &pv);
}
-static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, int swappiness,
int *type_scanned, struct list_head *list)
{
int i;
@@ -4654,7 +4699,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
*type_scanned = type;
- scanned = scan_folios(lruvec, sc, type, tier, list);
+ scanned = scan_folios(nr_to_scan, lruvec, sc, type, tier, list);
if (scanned)
return scanned;
@@ -4664,7 +4709,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
return 0;
}
-static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, int swappiness)
{
int type;
int scanned;
@@ -4683,7 +4729,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
spin_lock_irq(&lruvec->lru_lock);
- scanned = isolate_folios(lruvec, sc, swappiness, &type, &list);
+ scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list);
scanned += try_to_inc_min_seq(lruvec, swappiness);
@@ -4695,7 +4741,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
if (list_empty(&list))
return scanned;
retry:
- reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+ reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg);
sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
sc->nr_reclaimed += reclaimed;
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
@@ -4739,7 +4785,7 @@ retry:
item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, reclaimed);
- __count_memcg_events(memcg, item, reclaimed);
+ count_memcg_events(memcg, item, reclaimed);
__count_vm_events(PGSTEAL_ANON + type, reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
@@ -4804,6 +4850,8 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int s
if (nr_to_scan && !mem_cgroup_online(memcg))
return nr_to_scan;
+ nr_to_scan = apply_proportional_protection(memcg, sc, nr_to_scan);
+
/* try to get away with not aging at the default priority */
if (!success || sc->priority == DEF_PRIORITY)
return nr_to_scan >> sc->priority;
@@ -4856,7 +4904,7 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
if (nr_to_scan <= 0)
break;
- delta = evict_folios(lruvec, sc, swappiness);
+ delta = evict_folios(nr_to_scan, lruvec, sc, swappiness);
if (!delta)
break;
@@ -5387,7 +5435,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
static int lru_gen_seq_show(struct seq_file *m, void *v)
{
unsigned long seq;
- bool full = !debugfs_real_fops(m->file)->write;
+ bool full = debugfs_get_aux_num(m->file);
struct lruvec *lruvec = v;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int nid = lruvec_pgdat(lruvec)->node_id;
@@ -5477,7 +5525,8 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co
if (sc->nr_reclaimed >= nr_to_reclaim)
return 0;
- if (!evict_folios(lruvec, sc, swappiness))
+ if (!evict_folios(nr_to_reclaim - sc->nr_reclaimed, lruvec, sc,
+ swappiness))
return 0;
cond_resched();
@@ -5516,7 +5565,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
if (swappiness < MIN_SWAPPINESS)
swappiness = get_swappiness(lruvec, sc);
- else if (swappiness > MAX_SWAPPINESS + 1)
+ else if (swappiness > SWAPPINESS_ANON_ONLY)
goto done;
switch (cmd) {
@@ -5573,24 +5622,35 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
while ((cur = strsep(&next, ",;\n"))) {
int n;
int end;
- char cmd;
+ char cmd, swap_string[5];
unsigned int memcg_id;
unsigned int nid;
unsigned long seq;
- unsigned int swappiness = -1;
+ unsigned int swappiness;
unsigned long opt = -1;
cur = skip_spaces(cur);
if (!*cur)
continue;
- n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
- &seq, &end, &swappiness, &end, &opt, &end);
+ n = sscanf(cur, "%c %u %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, swap_string, &end, &opt, &end);
if (n < 4 || cur[end]) {
err = -EINVAL;
break;
}
+ if (n == 4) {
+ swappiness = -1;
+ } else if (!strcmp("max", swap_string)) {
+ /* set by userspace for anonymous memory only */
+ swappiness = SWAPPINESS_ANON_ONLY;
+ } else {
+ err = kstrtouint(swap_string, 0, &swappiness);
+ if (err)
+ break;
+ }
+
err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
if (err)
break;
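The swappiness field is now read as a short string so that "max" can select anon-only reclaim, while numeric values keep their old meaning. A userspace sketch of that mapping, assuming the kernel's MAX_SWAPPINESS of 200 (so SWAPPINESS_ANON_ONLY is 201); parse_swappiness() itself is invented for the sketch:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_SWAPPINESS		200	/* assumed kernel value */
#define SWAPPINESS_ANON_ONLY	(MAX_SWAPPINESS + 1)

/* "max" selects anon-only reclaim; anything else is parsed as a number. */
static int parse_swappiness(const char *tok, unsigned int *out)
{
	char *end;
	unsigned long v;

	if (!strcmp(tok, "max")) {
		*out = SWAPPINESS_ANON_ONLY;
		return 0;
	}

	v = strtoul(tok, &end, 0);
	if (*end != '\0')
		return -1;
	*out = (unsigned int)v;
	return 0;
}

int main(void)
{
	unsigned int s;

	if (!parse_swappiness("max", &s))
		printf("max -> %u\n", s);	/* 201, anon-only */
	if (!parse_swappiness("60", &s))
		printf("60  -> %u\n", s);
	return 0;
}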
@@ -5712,8 +5772,10 @@ static int __init init_lru_gen(void)
if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
pr_err("lru_gen: failed to create sysfs group\n");
- debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
- debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
+ debugfs_create_file_aux_num("lru_gen", 0644, NULL, NULL, 1,
+ &lru_gen_rw_fops);
+ debugfs_create_file_aux_num("lru_gen_full", 0444, NULL, NULL, 0,
+ &lru_gen_ro_fops);
return 0;
};
@@ -5850,7 +5912,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+ if (can_age_anon_pages(lruvec, sc) &&
inactive_is_low(lruvec, LRU_INACTIVE_ANON))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -6669,6 +6731,15 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
return nr_reclaimed;
}
+#else
+unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+ unsigned long nr_pages,
+ gfp_t gfp_mask,
+ unsigned int reclaim_options,
+ int *swappiness)
+{
+ return 0;
+}
#endif
static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
@@ -6681,10 +6752,10 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
return;
}
- if (!can_age_anon_pages(pgdat, sc))
+ lruvec = mem_cgroup_lruvec(NULL, pgdat);
+ if (!can_age_anon_pages(lruvec, sc))
return;
- lruvec = mem_cgroup_lruvec(NULL, pgdat);
if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
return;
@@ -6736,6 +6807,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
* meet watermarks.
*/
for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
+ enum zone_stat_item item;
unsigned long free_pages;
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
@@ -6746,11 +6818,33 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
/*
* In defrag_mode, watermarks must be met in whole
* blocks to avoid polluting allocator fallbacks.
+ *
+ * However, kswapd usually cannot accomplish this on
+ * its own and needs kcompactd support. Once it's
+ * reclaimed a compaction gap, and kswapd_shrink_node
+ * has dropped order, simply ensure there are enough
+ * base pages for compaction, wake kcompactd & sleep.
*/
- if (defrag_mode)
- free_pages = zone_page_state(zone, NR_FREE_PAGES_BLOCKS);
+ if (defrag_mode && order)
+ item = NR_FREE_PAGES_BLOCKS;
else
- free_pages = zone_page_state(zone, NR_FREE_PAGES);
+ item = NR_FREE_PAGES;
+
+ /*
+ * When there is a high number of CPUs in the system,
+ * the cumulative error from the vmstat per-cpu cache
+ * can blur the line between the watermarks. In that
+ * case, be safe and get an accurate snapshot.
+ *
+ * TODO: NR_FREE_PAGES_BLOCKS moves in steps of
+ * pageblock_nr_pages, while the vmstat pcp threshold
+ * is limited to 125. On many configurations that
+ * counter won't actually be per-cpu cached. But keep
+ * things simple for now; revisit when somebody cares.
+ */
+ free_pages = zone_page_state(zone, item);
+ if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark)
+ free_pages = zone_page_state_snapshot(zone, item);
if (__zone_watermark_ok(zone, order, mark, highest_zoneidx,
0, free_pages))
@@ -7540,36 +7634,26 @@ static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
/*
* Try to free up some pages from this node through reclaim.
*/
-static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
+static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask,
+ unsigned long nr_pages,
+ struct scan_control *sc)
{
- /* Minimum pages needed in order to stay on node */
- const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
unsigned int noreclaim_flag;
- struct scan_control sc = {
- .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
- .gfp_mask = current_gfp_context(gfp_mask),
- .order = order,
- .priority = NODE_RECLAIM_PRIORITY,
- .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
- .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
- .may_swap = 1,
- .reclaim_idx = gfp_zone(gfp_mask),
- };
unsigned long pflags;
- trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
- sc.gfp_mask);
+ trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, sc->order,
+ sc->gfp_mask);
cond_resched();
psi_memstall_enter(&pflags);
delayacct_freepages_start();
- fs_reclaim_acquire(sc.gfp_mask);
+ fs_reclaim_acquire(sc->gfp_mask);
/*
* We need to be able to allocate from the reserves for RECLAIM_UNMAP
*/
noreclaim_flag = memalloc_noreclaim_save();
- set_task_reclaim_state(p, &sc.reclaim_state);
+ set_task_reclaim_state(p, &sc->reclaim_state);
if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages ||
node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) {
@@ -7578,24 +7662,36 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
* priorities until we have enough memory freed.
*/
do {
- shrink_node(pgdat, &sc);
- } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
+ shrink_node(pgdat, sc);
+ } while (sc->nr_reclaimed < nr_pages && --sc->priority >= 0);
}
set_task_reclaim_state(p, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
- fs_reclaim_release(sc.gfp_mask);
- psi_memstall_leave(&pflags);
+ fs_reclaim_release(sc->gfp_mask);
delayacct_freepages_end();
+ psi_memstall_leave(&pflags);
- trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
+ trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed);
- return sc.nr_reclaimed >= nr_pages;
+ return sc->nr_reclaimed;
}
int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
int ret;
+ /* Minimum pages needed in order to stay on node */
+ const unsigned long nr_pages = 1 << order;
+ struct scan_control sc = {
+ .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
+ .gfp_mask = current_gfp_context(gfp_mask),
+ .order = order,
+ .priority = NODE_RECLAIM_PRIORITY,
+ .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
+ .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
+ .may_swap = 1,
+ .reclaim_idx = gfp_zone(gfp_mask),
+ };
/*
* Node reclaim reclaims unmapped file backed pages and
@@ -7630,7 +7726,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
return NODE_RECLAIM_NOSCAN;
- ret = __node_reclaim(pgdat, gfp_mask, order);
+ ret = __node_reclaim(pgdat, gfp_mask, nr_pages, &sc) >= nr_pages;
clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
if (ret)
@@ -7640,6 +7736,114 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
return ret;
}
+
+enum {
+ MEMORY_RECLAIM_SWAPPINESS = 0,
+ MEMORY_RECLAIM_SWAPPINESS_MAX,
+ MEMORY_RECLAIM_NULL,
+};
+static const match_table_t tokens = {
+ { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
+ { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"},
+ { MEMORY_RECLAIM_NULL, NULL },
+};
+
+int user_proactive_reclaim(char *buf,
+ struct mem_cgroup *memcg, pg_data_t *pgdat)
+{
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
+ unsigned long nr_to_reclaim, nr_reclaimed = 0;
+ int swappiness = -1;
+ char *old_buf, *start;
+ substring_t args[MAX_OPT_ARGS];
+ gfp_t gfp_mask = GFP_KERNEL;
+
+ if (!buf || (!memcg && !pgdat) || (memcg && pgdat))
+ return -EINVAL;
+
+ buf = strstrip(buf);
+
+ old_buf = buf;
+ nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
+ if (buf == old_buf)
+ return -EINVAL;
+
+ buf = strstrip(buf);
+
+ while ((start = strsep(&buf, " ")) != NULL) {
+ if (!strlen(start))
+ continue;
+ switch (match_token(start, tokens, args)) {
+ case MEMORY_RECLAIM_SWAPPINESS:
+ if (match_int(&args[0], &swappiness))
+ return -EINVAL;
+ if (swappiness < MIN_SWAPPINESS ||
+ swappiness > MAX_SWAPPINESS)
+ return -EINVAL;
+ break;
+ case MEMORY_RECLAIM_SWAPPINESS_MAX:
+ swappiness = SWAPPINESS_ANON_ONLY;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ while (nr_reclaimed < nr_to_reclaim) {
+ /* Will converge on zero, but reclaim enforces a minimum */
+ unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4;
+ unsigned long reclaimed;
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ /*
+ * This is the final attempt, drain percpu lru caches in the
+ * hope of introducing more evictable pages.
+ */
+ if (!nr_retries)
+ lru_add_drain_all();
+
+ if (memcg) {
+ unsigned int reclaim_options;
+
+ reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
+ MEMCG_RECLAIM_PROACTIVE;
+ reclaimed = try_to_free_mem_cgroup_pages(memcg,
+ batch_size, gfp_mask,
+ reclaim_options,
+ swappiness == -1 ? NULL : &swappiness);
+ } else {
+ struct scan_control sc = {
+ .gfp_mask = current_gfp_context(gfp_mask),
+ .reclaim_idx = gfp_zone(gfp_mask),
+ .proactive_swappiness = swappiness == -1 ? NULL : &swappiness,
+ .priority = DEF_PRIORITY,
+ .may_writepage = !laptop_mode,
+ .nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX),
+ .may_unmap = 1,
+ .may_swap = 1,
+ .proactive = 1,
+ };
+
+ if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED,
+ &pgdat->flags))
+ return -EBUSY;
+
+ reclaimed = __node_reclaim(pgdat, gfp_mask,
+ batch_size, &sc);
+ clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+ }
+
+ if (!reclaimed && !nr_retries--)
+ return -EAGAIN;
+
+ nr_reclaimed += reclaimed;
+ }
+
+ return 0;
+}
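The reclaim loop above works through the target in shrinking batches of a quarter of what is still outstanding, and gives up after a bounded number of fruitless passes. A minimal model of that control flow, with fake_reclaim() standing in for the real memcg/node reclaim calls:

#include <stdio.h>

static unsigned long fake_reclaim(unsigned long batch)
{
	return batch / 2;	/* pretend half of each request is reclaimed */
}

int main(void)
{
	unsigned long nr_to_reclaim = 1000, nr_reclaimed = 0;
	int nr_retries = 3;

	while (nr_reclaimed < nr_to_reclaim) {
		/* Converges on zero; the real code enforces a minimum batch. */
		unsigned long batch = (nr_to_reclaim - nr_reclaimed) / 4;
		unsigned long got = fake_reclaim(batch);

		if (!got && !nr_retries--)
			break;	/* the kernel returns -EAGAIN at this point */

		nr_reclaimed += got;
	}
	printf("reclaimed %lu\n", nr_reclaimed);
	return 0;
}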
+
#endif
/**
@@ -7687,3 +7891,26 @@ void check_move_unevictable_folios(struct folio_batch *fbatch)
}
}
EXPORT_SYMBOL_GPL(check_move_unevictable_folios);
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+static ssize_t reclaim_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int ret, nid = dev->id;
+
+ ret = user_proactive_reclaim((char *)buf, NULL, NODE_DATA(nid));
+ return ret ? -EAGAIN : count;
+}
+
+static DEVICE_ATTR_WO(reclaim);
+int reclaim_register_node(struct node *node)
+{
+ return device_create_file(&node->dev, &dev_attr_reclaim);
+}
+
+void reclaim_unregister_node(struct node *node)
+{
+ return device_remove_file(&node->dev, &dev_attr_reclaim);
+}
+#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4c268ce39ff2..71cd1ceba191 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -7,7 +7,7 @@
*
* zoned VM statistics
* Copyright (C) 2006 Silicon Graphics, Inc.,
- * Christoph Lameter <christoph@lameter.com>
+ * Christoph Lameter <cl@gentwo.org>
* Copyright (C) 2008-2014 Christoph Lameter
*/
#include <linux/fs.h>
@@ -1163,320 +1163,339 @@ int fragmentation_index(struct zone *zone, unsigned int order)
#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
#ifdef CONFIG_ZONE_DMA
-#define TEXT_FOR_DMA(xx) xx "_dma",
+#define TEXT_FOR_DMA(xx, yy) [xx##_DMA] = yy "_dma",
#else
-#define TEXT_FOR_DMA(xx)
+#define TEXT_FOR_DMA(xx, yy)
#endif
#ifdef CONFIG_ZONE_DMA32
-#define TEXT_FOR_DMA32(xx) xx "_dma32",
+#define TEXT_FOR_DMA32(xx, yy) [xx##_DMA32] = yy "_dma32",
#else
-#define TEXT_FOR_DMA32(xx)
+#define TEXT_FOR_DMA32(xx, yy)
#endif
#ifdef CONFIG_HIGHMEM
-#define TEXT_FOR_HIGHMEM(xx) xx "_high",
+#define TEXT_FOR_HIGHMEM(xx, yy) [xx##_HIGH] = yy "_high",
#else
-#define TEXT_FOR_HIGHMEM(xx)
+#define TEXT_FOR_HIGHMEM(xx, yy)
#endif
#ifdef CONFIG_ZONE_DEVICE
-#define TEXT_FOR_DEVICE(xx) xx "_device",
+#define TEXT_FOR_DEVICE(xx, yy) [xx##_DEVICE] = yy "_device",
#else
-#define TEXT_FOR_DEVICE(xx)
+#define TEXT_FOR_DEVICE(xx, yy)
#endif
-#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
- TEXT_FOR_HIGHMEM(xx) xx "_movable", \
- TEXT_FOR_DEVICE(xx)
+#define TEXTS_FOR_ZONES(xx, yy) \
+ TEXT_FOR_DMA(xx, yy) \
+ TEXT_FOR_DMA32(xx, yy) \
+ [xx##_NORMAL] = yy "_normal", \
+ TEXT_FOR_HIGHMEM(xx, yy) \
+ [xx##_MOVABLE] = yy "_movable", \
+ TEXT_FOR_DEVICE(xx, yy)
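The array below is converted to designated initializers, with a throwaway I(x) macro adding the offset of each preceding enum block so every name lands at its absolute index. A reduced, self-contained model of the same indexing trick, using two fake enum groups rather than the real vmstat items:

#include <stdio.h>

enum zone_items { FAKE_FREE_PAGES, FAKE_MLOCK, NR_ZONE_ITEMS };
enum node_items { FAKE_ANON, FAKE_FILE, NR_NODE_ITEMS };

static const char * const names[] = {
#define I(x) (x)
	[I(FAKE_FREE_PAGES)]	= "nr_free_pages",
	[I(FAKE_MLOCK)]		= "nr_mlock",
#undef I
#define I(x) (NR_ZONE_ITEMS + (x))	/* second block starts after the first */
	[I(FAKE_ANON)]		= "nr_anon",
	[I(FAKE_FILE)]		= "nr_file",
#undef I
};

int main(void)
{
	for (unsigned int i = 0; i < sizeof(names) / sizeof(names[0]); i++)
		printf("%u: %s\n", i, names[i]);
	return 0;
}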
const char * const vmstat_text[] = {
/* enum zone_stat_item counters */
- "nr_free_pages",
- "nr_free_pages_blocks",
- "nr_zone_inactive_anon",
- "nr_zone_active_anon",
- "nr_zone_inactive_file",
- "nr_zone_active_file",
- "nr_zone_unevictable",
- "nr_zone_write_pending",
- "nr_mlock",
- "nr_bounce",
+#define I(x) (x)
+ [I(NR_FREE_PAGES)] = "nr_free_pages",
+ [I(NR_FREE_PAGES_BLOCKS)] = "nr_free_pages_blocks",
+ [I(NR_ZONE_INACTIVE_ANON)] = "nr_zone_inactive_anon",
+ [I(NR_ZONE_ACTIVE_ANON)] = "nr_zone_active_anon",
+ [I(NR_ZONE_INACTIVE_FILE)] = "nr_zone_inactive_file",
+ [I(NR_ZONE_ACTIVE_FILE)] = "nr_zone_active_file",
+ [I(NR_ZONE_UNEVICTABLE)] = "nr_zone_unevictable",
+ [I(NR_ZONE_WRITE_PENDING)] = "nr_zone_write_pending",
+ [I(NR_MLOCK)] = "nr_mlock",
#if IS_ENABLED(CONFIG_ZSMALLOC)
- "nr_zspages",
+ [I(NR_ZSPAGES)] = "nr_zspages",
#endif
- "nr_free_cma",
+ [I(NR_FREE_CMA_PAGES)] = "nr_free_cma",
#ifdef CONFIG_UNACCEPTED_MEMORY
- "nr_unaccepted",
+ [I(NR_UNACCEPTED)] = "nr_unaccepted",
#endif
+#undef I
/* enum numa_stat_item counters */
+#define I(x) (NR_VM_ZONE_STAT_ITEMS + x)
#ifdef CONFIG_NUMA
- "numa_hit",
- "numa_miss",
- "numa_foreign",
- "numa_interleave",
- "numa_local",
- "numa_other",
+ [I(NUMA_HIT)] = "numa_hit",
+ [I(NUMA_MISS)] = "numa_miss",
+ [I(NUMA_FOREIGN)] = "numa_foreign",
+ [I(NUMA_INTERLEAVE_HIT)] = "numa_interleave",
+ [I(NUMA_LOCAL)] = "numa_local",
+ [I(NUMA_OTHER)] = "numa_other",
#endif
+#undef I
/* enum node_stat_item counters */
- "nr_inactive_anon",
- "nr_active_anon",
- "nr_inactive_file",
- "nr_active_file",
- "nr_unevictable",
- "nr_slab_reclaimable",
- "nr_slab_unreclaimable",
- "nr_isolated_anon",
- "nr_isolated_file",
- "workingset_nodes",
- "workingset_refault_anon",
- "workingset_refault_file",
- "workingset_activate_anon",
- "workingset_activate_file",
- "workingset_restore_anon",
- "workingset_restore_file",
- "workingset_nodereclaim",
- "nr_anon_pages",
- "nr_mapped",
- "nr_file_pages",
- "nr_dirty",
- "nr_writeback",
- "nr_writeback_temp",
- "nr_shmem",
- "nr_shmem_hugepages",
- "nr_shmem_pmdmapped",
- "nr_file_hugepages",
- "nr_file_pmdmapped",
- "nr_anon_transparent_hugepages",
- "nr_vmscan_write",
- "nr_vmscan_immediate_reclaim",
- "nr_dirtied",
- "nr_written",
- "nr_throttled_written",
- "nr_kernel_misc_reclaimable",
- "nr_foll_pin_acquired",
- "nr_foll_pin_released",
- "nr_kernel_stack",
+#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + x)
+ [I(NR_INACTIVE_ANON)] = "nr_inactive_anon",
+ [I(NR_ACTIVE_ANON)] = "nr_active_anon",
+ [I(NR_INACTIVE_FILE)] = "nr_inactive_file",
+ [I(NR_ACTIVE_FILE)] = "nr_active_file",
+ [I(NR_UNEVICTABLE)] = "nr_unevictable",
+ [I(NR_SLAB_RECLAIMABLE_B)] = "nr_slab_reclaimable",
+ [I(NR_SLAB_UNRECLAIMABLE_B)] = "nr_slab_unreclaimable",
+ [I(NR_ISOLATED_ANON)] = "nr_isolated_anon",
+ [I(NR_ISOLATED_FILE)] = "nr_isolated_file",
+ [I(WORKINGSET_NODES)] = "workingset_nodes",
+ [I(WORKINGSET_REFAULT_ANON)] = "workingset_refault_anon",
+ [I(WORKINGSET_REFAULT_FILE)] = "workingset_refault_file",
+ [I(WORKINGSET_ACTIVATE_ANON)] = "workingset_activate_anon",
+ [I(WORKINGSET_ACTIVATE_FILE)] = "workingset_activate_file",
+ [I(WORKINGSET_RESTORE_ANON)] = "workingset_restore_anon",
+ [I(WORKINGSET_RESTORE_FILE)] = "workingset_restore_file",
+ [I(WORKINGSET_NODERECLAIM)] = "workingset_nodereclaim",
+ [I(NR_ANON_MAPPED)] = "nr_anon_pages",
+ [I(NR_FILE_MAPPED)] = "nr_mapped",
+ [I(NR_FILE_PAGES)] = "nr_file_pages",
+ [I(NR_FILE_DIRTY)] = "nr_dirty",
+ [I(NR_WRITEBACK)] = "nr_writeback",
+ [I(NR_SHMEM)] = "nr_shmem",
+ [I(NR_SHMEM_THPS)] = "nr_shmem_hugepages",
+ [I(NR_SHMEM_PMDMAPPED)] = "nr_shmem_pmdmapped",
+ [I(NR_FILE_THPS)] = "nr_file_hugepages",
+ [I(NR_FILE_PMDMAPPED)] = "nr_file_pmdmapped",
+ [I(NR_ANON_THPS)] = "nr_anon_transparent_hugepages",
+ [I(NR_VMSCAN_WRITE)] = "nr_vmscan_write",
+ [I(NR_VMSCAN_IMMEDIATE)] = "nr_vmscan_immediate_reclaim",
+ [I(NR_DIRTIED)] = "nr_dirtied",
+ [I(NR_WRITTEN)] = "nr_written",
+ [I(NR_THROTTLED_WRITTEN)] = "nr_throttled_written",
+ [I(NR_KERNEL_MISC_RECLAIMABLE)] = "nr_kernel_misc_reclaimable",
+ [I(NR_FOLL_PIN_ACQUIRED)] = "nr_foll_pin_acquired",
+ [I(NR_FOLL_PIN_RELEASED)] = "nr_foll_pin_released",
+ [I(NR_KERNEL_STACK_KB)] = "nr_kernel_stack",
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
- "nr_shadow_call_stack",
+ [I(NR_KERNEL_SCS_KB)] = "nr_shadow_call_stack",
#endif
- "nr_page_table_pages",
- "nr_sec_page_table_pages",
+ [I(NR_PAGETABLE)] = "nr_page_table_pages",
+ [I(NR_SECONDARY_PAGETABLE)] = "nr_sec_page_table_pages",
#ifdef CONFIG_IOMMU_SUPPORT
- "nr_iommu_pages",
+ [I(NR_IOMMU_PAGES)] = "nr_iommu_pages",
#endif
#ifdef CONFIG_SWAP
- "nr_swapcached",
+ [I(NR_SWAPCACHE)] = "nr_swapcached",
#endif
#ifdef CONFIG_NUMA_BALANCING
- "pgpromote_success",
- "pgpromote_candidate",
+ [I(PGPROMOTE_SUCCESS)] = "pgpromote_success",
+ [I(PGPROMOTE_CANDIDATE)] = "pgpromote_candidate",
#endif
- "pgdemote_kswapd",
- "pgdemote_direct",
- "pgdemote_khugepaged",
- "pgdemote_proactive",
+ [I(PGDEMOTE_KSWAPD)] = "pgdemote_kswapd",
+ [I(PGDEMOTE_DIRECT)] = "pgdemote_direct",
+ [I(PGDEMOTE_KHUGEPAGED)] = "pgdemote_khugepaged",
+ [I(PGDEMOTE_PROACTIVE)] = "pgdemote_proactive",
#ifdef CONFIG_HUGETLB_PAGE
- "nr_hugetlb",
+ [I(NR_HUGETLB)] = "nr_hugetlb",
#endif
- "nr_balloon_pages",
- /* system-wide enum vm_stat_item counters */
- "nr_dirty_threshold",
- "nr_dirty_background_threshold",
- "nr_memmap_pages",
- "nr_memmap_boot_pages",
+ [I(NR_BALLOON_PAGES)] = "nr_balloon_pages",
+#undef I
-#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
+ /* system-wide enum vm_stat_item counters */
+#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \
+ NR_VM_NODE_STAT_ITEMS + x)
+ [I(NR_DIRTY_THRESHOLD)] = "nr_dirty_threshold",
+ [I(NR_DIRTY_BG_THRESHOLD)] = "nr_dirty_background_threshold",
+ [I(NR_MEMMAP_PAGES)] = "nr_memmap_pages",
+ [I(NR_MEMMAP_BOOT_PAGES)] = "nr_memmap_boot_pages",
+#undef I
+
+#if defined(CONFIG_VM_EVENT_COUNTERS)
/* enum vm_event_item counters */
- "pgpgin",
- "pgpgout",
- "pswpin",
- "pswpout",
-
- TEXTS_FOR_ZONES("pgalloc")
- TEXTS_FOR_ZONES("allocstall")
- TEXTS_FOR_ZONES("pgskip")
-
- "pgfree",
- "pgactivate",
- "pgdeactivate",
- "pglazyfree",
-
- "pgfault",
- "pgmajfault",
- "pglazyfreed",
-
- "pgrefill",
- "pgreuse",
- "pgsteal_kswapd",
- "pgsteal_direct",
- "pgsteal_khugepaged",
- "pgsteal_proactive",
- "pgscan_kswapd",
- "pgscan_direct",
- "pgscan_khugepaged",
- "pgscan_proactive",
- "pgscan_direct_throttle",
- "pgscan_anon",
- "pgscan_file",
- "pgsteal_anon",
- "pgsteal_file",
+#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \
+ NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS + x)
+
+ [I(PGPGIN)] = "pgpgin",
+ [I(PGPGOUT)] = "pgpgout",
+ [I(PSWPIN)] = "pswpin",
+ [I(PSWPOUT)] = "pswpout",
+
+#define OFF (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \
+ NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS)
+ TEXTS_FOR_ZONES(OFF+PGALLOC, "pgalloc")
+ TEXTS_FOR_ZONES(OFF+ALLOCSTALL, "allocstall")
+ TEXTS_FOR_ZONES(OFF+PGSCAN_SKIP, "pgskip")
+#undef OFF
+
+ [I(PGFREE)] = "pgfree",
+ [I(PGACTIVATE)] = "pgactivate",
+ [I(PGDEACTIVATE)] = "pgdeactivate",
+ [I(PGLAZYFREE)] = "pglazyfree",
+
+ [I(PGFAULT)] = "pgfault",
+ [I(PGMAJFAULT)] = "pgmajfault",
+ [I(PGLAZYFREED)] = "pglazyfreed",
+
+ [I(PGREFILL)] = "pgrefill",
+ [I(PGREUSE)] = "pgreuse",
+ [I(PGSTEAL_KSWAPD)] = "pgsteal_kswapd",
+ [I(PGSTEAL_DIRECT)] = "pgsteal_direct",
+ [I(PGSTEAL_KHUGEPAGED)] = "pgsteal_khugepaged",
+ [I(PGSTEAL_PROACTIVE)] = "pgsteal_proactive",
+ [I(PGSCAN_KSWAPD)] = "pgscan_kswapd",
+ [I(PGSCAN_DIRECT)] = "pgscan_direct",
+ [I(PGSCAN_KHUGEPAGED)] = "pgscan_khugepaged",
+ [I(PGSCAN_PROACTIVE)] = "pgscan_proactive",
+ [I(PGSCAN_DIRECT_THROTTLE)] = "pgscan_direct_throttle",
+ [I(PGSCAN_ANON)] = "pgscan_anon",
+ [I(PGSCAN_FILE)] = "pgscan_file",
+ [I(PGSTEAL_ANON)] = "pgsteal_anon",
+ [I(PGSTEAL_FILE)] = "pgsteal_file",
#ifdef CONFIG_NUMA
- "zone_reclaim_success",
- "zone_reclaim_failed",
+ [I(PGSCAN_ZONE_RECLAIM_SUCCESS)] = "zone_reclaim_success",
+ [I(PGSCAN_ZONE_RECLAIM_FAILED)] = "zone_reclaim_failed",
#endif
- "pginodesteal",
- "slabs_scanned",
- "kswapd_inodesteal",
- "kswapd_low_wmark_hit_quickly",
- "kswapd_high_wmark_hit_quickly",
- "pageoutrun",
+ [I(PGINODESTEAL)] = "pginodesteal",
+ [I(SLABS_SCANNED)] = "slabs_scanned",
+ [I(KSWAPD_INODESTEAL)] = "kswapd_inodesteal",
+ [I(KSWAPD_LOW_WMARK_HIT_QUICKLY)] = "kswapd_low_wmark_hit_quickly",
+ [I(KSWAPD_HIGH_WMARK_HIT_QUICKLY)] = "kswapd_high_wmark_hit_quickly",
+ [I(PAGEOUTRUN)] = "pageoutrun",
- "pgrotated",
+ [I(PGROTATED)] = "pgrotated",
- "drop_pagecache",
- "drop_slab",
- "oom_kill",
+ [I(DROP_PAGECACHE)] = "drop_pagecache",
+ [I(DROP_SLAB)] = "drop_slab",
+ [I(OOM_KILL)] = "oom_kill",
#ifdef CONFIG_NUMA_BALANCING
- "numa_pte_updates",
- "numa_huge_pte_updates",
- "numa_hint_faults",
- "numa_hint_faults_local",
- "numa_pages_migrated",
+ [I(NUMA_PTE_UPDATES)] = "numa_pte_updates",
+ [I(NUMA_HUGE_PTE_UPDATES)] = "numa_huge_pte_updates",
+ [I(NUMA_HINT_FAULTS)] = "numa_hint_faults",
+ [I(NUMA_HINT_FAULTS_LOCAL)] = "numa_hint_faults_local",
+ [I(NUMA_PAGE_MIGRATE)] = "numa_pages_migrated",
#endif
#ifdef CONFIG_MIGRATION
- "pgmigrate_success",
- "pgmigrate_fail",
- "thp_migration_success",
- "thp_migration_fail",
- "thp_migration_split",
+ [I(PGMIGRATE_SUCCESS)] = "pgmigrate_success",
+ [I(PGMIGRATE_FAIL)] = "pgmigrate_fail",
+ [I(THP_MIGRATION_SUCCESS)] = "thp_migration_success",
+ [I(THP_MIGRATION_FAIL)] = "thp_migration_fail",
+ [I(THP_MIGRATION_SPLIT)] = "thp_migration_split",
#endif
#ifdef CONFIG_COMPACTION
- "compact_migrate_scanned",
- "compact_free_scanned",
- "compact_isolated",
- "compact_stall",
- "compact_fail",
- "compact_success",
- "compact_daemon_wake",
- "compact_daemon_migrate_scanned",
- "compact_daemon_free_scanned",
+ [I(COMPACTMIGRATE_SCANNED)] = "compact_migrate_scanned",
+ [I(COMPACTFREE_SCANNED)] = "compact_free_scanned",
+ [I(COMPACTISOLATED)] = "compact_isolated",
+ [I(COMPACTSTALL)] = "compact_stall",
+ [I(COMPACTFAIL)] = "compact_fail",
+ [I(COMPACTSUCCESS)] = "compact_success",
+ [I(KCOMPACTD_WAKE)] = "compact_daemon_wake",
+ [I(KCOMPACTD_MIGRATE_SCANNED)] = "compact_daemon_migrate_scanned",
+ [I(KCOMPACTD_FREE_SCANNED)] = "compact_daemon_free_scanned",
#endif
#ifdef CONFIG_HUGETLB_PAGE
- "htlb_buddy_alloc_success",
- "htlb_buddy_alloc_fail",
+ [I(HTLB_BUDDY_PGALLOC)] = "htlb_buddy_alloc_success",
+ [I(HTLB_BUDDY_PGALLOC_FAIL)] = "htlb_buddy_alloc_fail",
#endif
#ifdef CONFIG_CMA
- "cma_alloc_success",
- "cma_alloc_fail",
+ [I(CMA_ALLOC_SUCCESS)] = "cma_alloc_success",
+ [I(CMA_ALLOC_FAIL)] = "cma_alloc_fail",
#endif
- "unevictable_pgs_culled",
- "unevictable_pgs_scanned",
- "unevictable_pgs_rescued",
- "unevictable_pgs_mlocked",
- "unevictable_pgs_munlocked",
- "unevictable_pgs_cleared",
- "unevictable_pgs_stranded",
+ [I(UNEVICTABLE_PGCULLED)] = "unevictable_pgs_culled",
+ [I(UNEVICTABLE_PGSCANNED)] = "unevictable_pgs_scanned",
+ [I(UNEVICTABLE_PGRESCUED)] = "unevictable_pgs_rescued",
+ [I(UNEVICTABLE_PGMLOCKED)] = "unevictable_pgs_mlocked",
+ [I(UNEVICTABLE_PGMUNLOCKED)] = "unevictable_pgs_munlocked",
+ [I(UNEVICTABLE_PGCLEARED)] = "unevictable_pgs_cleared",
+ [I(UNEVICTABLE_PGSTRANDED)] = "unevictable_pgs_stranded",
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- "thp_fault_alloc",
- "thp_fault_fallback",
- "thp_fault_fallback_charge",
- "thp_collapse_alloc",
- "thp_collapse_alloc_failed",
- "thp_file_alloc",
- "thp_file_fallback",
- "thp_file_fallback_charge",
- "thp_file_mapped",
- "thp_split_page",
- "thp_split_page_failed",
- "thp_deferred_split_page",
- "thp_underused_split_page",
- "thp_split_pmd",
- "thp_scan_exceed_none_pte",
- "thp_scan_exceed_swap_pte",
- "thp_scan_exceed_share_pte",
+ [I(THP_FAULT_ALLOC)] = "thp_fault_alloc",
+ [I(THP_FAULT_FALLBACK)] = "thp_fault_fallback",
+ [I(THP_FAULT_FALLBACK_CHARGE)] = "thp_fault_fallback_charge",
+ [I(THP_COLLAPSE_ALLOC)] = "thp_collapse_alloc",
+ [I(THP_COLLAPSE_ALLOC_FAILED)] = "thp_collapse_alloc_failed",
+ [I(THP_FILE_ALLOC)] = "thp_file_alloc",
+ [I(THP_FILE_FALLBACK)] = "thp_file_fallback",
+ [I(THP_FILE_FALLBACK_CHARGE)] = "thp_file_fallback_charge",
+ [I(THP_FILE_MAPPED)] = "thp_file_mapped",
+ [I(THP_SPLIT_PAGE)] = "thp_split_page",
+ [I(THP_SPLIT_PAGE_FAILED)] = "thp_split_page_failed",
+ [I(THP_DEFERRED_SPLIT_PAGE)] = "thp_deferred_split_page",
+ [I(THP_UNDERUSED_SPLIT_PAGE)] = "thp_underused_split_page",
+ [I(THP_SPLIT_PMD)] = "thp_split_pmd",
+ [I(THP_SCAN_EXCEED_NONE_PTE)] = "thp_scan_exceed_none_pte",
+ [I(THP_SCAN_EXCEED_SWAP_PTE)] = "thp_scan_exceed_swap_pte",
+ [I(THP_SCAN_EXCEED_SHARED_PTE)] = "thp_scan_exceed_share_pte",
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
- "thp_split_pud",
+ [I(THP_SPLIT_PUD)] = "thp_split_pud",
#endif
- "thp_zero_page_alloc",
- "thp_zero_page_alloc_failed",
- "thp_swpout",
- "thp_swpout_fallback",
+ [I(THP_ZERO_PAGE_ALLOC)] = "thp_zero_page_alloc",
+ [I(THP_ZERO_PAGE_ALLOC_FAILED)] = "thp_zero_page_alloc_failed",
+ [I(THP_SWPOUT)] = "thp_swpout",
+ [I(THP_SWPOUT_FALLBACK)] = "thp_swpout_fallback",
#endif
#ifdef CONFIG_MEMORY_BALLOON
- "balloon_inflate",
- "balloon_deflate",
+ [I(BALLOON_INFLATE)] = "balloon_inflate",
+ [I(BALLOON_DEFLATE)] = "balloon_deflate",
#ifdef CONFIG_BALLOON_COMPACTION
- "balloon_migrate",
+ [I(BALLOON_MIGRATE)] = "balloon_migrate",
#endif
#endif /* CONFIG_MEMORY_BALLOON */
#ifdef CONFIG_DEBUG_TLBFLUSH
- "nr_tlb_remote_flush",
- "nr_tlb_remote_flush_received",
- "nr_tlb_local_flush_all",
- "nr_tlb_local_flush_one",
+ [I(NR_TLB_REMOTE_FLUSH)] = "nr_tlb_remote_flush",
+ [I(NR_TLB_REMOTE_FLUSH_RECEIVED)] = "nr_tlb_remote_flush_received",
+ [I(NR_TLB_LOCAL_FLUSH_ALL)] = "nr_tlb_local_flush_all",
+ [I(NR_TLB_LOCAL_FLUSH_ONE)] = "nr_tlb_local_flush_one",
#endif /* CONFIG_DEBUG_TLBFLUSH */
#ifdef CONFIG_SWAP
- "swap_ra",
- "swap_ra_hit",
- "swpin_zero",
- "swpout_zero",
+ [I(SWAP_RA)] = "swap_ra",
+ [I(SWAP_RA_HIT)] = "swap_ra_hit",
+ [I(SWPIN_ZERO)] = "swpin_zero",
+ [I(SWPOUT_ZERO)] = "swpout_zero",
#ifdef CONFIG_KSM
- "ksm_swpin_copy",
+ [I(KSM_SWPIN_COPY)] = "ksm_swpin_copy",
#endif
#endif
#ifdef CONFIG_KSM
- "cow_ksm",
+ [I(COW_KSM)] = "cow_ksm",
#endif
#ifdef CONFIG_ZSWAP
- "zswpin",
- "zswpout",
- "zswpwb",
+ [I(ZSWPIN)] = "zswpin",
+ [I(ZSWPOUT)] = "zswpout",
+ [I(ZSWPWB)] = "zswpwb",
#endif
#ifdef CONFIG_X86
- "direct_map_level2_splits",
- "direct_map_level3_splits",
- "direct_map_level2_collapses",
- "direct_map_level3_collapses",
+ [I(DIRECT_MAP_LEVEL2_SPLIT)] = "direct_map_level2_splits",
+ [I(DIRECT_MAP_LEVEL3_SPLIT)] = "direct_map_level3_splits",
+ [I(DIRECT_MAP_LEVEL2_COLLAPSE)] = "direct_map_level2_collapses",
+ [I(DIRECT_MAP_LEVEL3_COLLAPSE)] = "direct_map_level3_collapses",
#endif
#ifdef CONFIG_PER_VMA_LOCK_STATS
- "vma_lock_success",
- "vma_lock_abort",
- "vma_lock_retry",
- "vma_lock_miss",
+ [I(VMA_LOCK_SUCCESS)] = "vma_lock_success",
+ [I(VMA_LOCK_ABORT)] = "vma_lock_abort",
+ [I(VMA_LOCK_RETRY)] = "vma_lock_retry",
+ [I(VMA_LOCK_MISS)] = "vma_lock_miss",
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
- "kstack_1k",
+ [I(KSTACK_1K)] = "kstack_1k",
#if THREAD_SIZE > 1024
- "kstack_2k",
+ [I(KSTACK_2K)] = "kstack_2k",
#endif
#if THREAD_SIZE > 2048
- "kstack_4k",
+ [I(KSTACK_4K)] = "kstack_4k",
#endif
#if THREAD_SIZE > 4096
- "kstack_8k",
+ [I(KSTACK_8K)] = "kstack_8k",
#endif
#if THREAD_SIZE > 8192
- "kstack_16k",
+ [I(KSTACK_16K)] = "kstack_16k",
#endif
#if THREAD_SIZE > 16384
- "kstack_32k",
+ [I(KSTACK_32K)] = "kstack_32k",
#endif
#if THREAD_SIZE > 32768
- "kstack_64k",
+ [I(KSTACK_64K)] = "kstack_64k",
#endif
#if THREAD_SIZE > 65536
- "kstack_rest",
+ [I(KSTACK_REST)] = "kstack_rest",
#endif
#endif
-#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
+#undef I
+#endif /* CONFIG_VM_EVENT_COUNTERS */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
@@ -1868,7 +1887,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
if (*pos >= NR_VMSTAT_ITEMS)
return NULL;
- BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
+ BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) != NR_VMSTAT_ITEMS);
fold_vm_numa_events();
v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
m->private = v;
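
A standalone sketch (illustrative, with made-up enums) of the indexing scheme the rewritten table uses: each section's strings sit at their enum values plus the cumulative offset of the preceding sections, which is what lets the BUILD_BUG_ON above insist on exact equality rather than '<'.

/* Toy model of the offset-indexed string table; names here are hypothetical. */
enum zone_items { NR_FREE, NR_MLOCKED, NR_ZONE_ITEMS };
enum node_items { NR_DIRTY, NR_WRITEBACK, NR_NODE_ITEMS };

#define ZI(x) (x)			/* zone section starts at index 0 */
#define NI(x) (NR_ZONE_ITEMS + (x))	/* node section follows immediately */

static const char *const stat_text[] = {
	[ZI(NR_FREE)]		= "nr_free",
	[ZI(NR_MLOCKED)]	= "nr_mlocked",
	[NI(NR_DIRTY)]		= "nr_dirty",
	[NI(NR_WRITEBACK)]	= "nr_writeback",
};

/*
 * Same idea as BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) != NR_VMSTAT_ITEMS):
 * dropping the final entry shrinks the array and breaks the build, and a
 * missing entry in the middle is at least visible as a NULL name.
 */
_Static_assert(sizeof(stat_text) / sizeof(stat_text[0]) ==
	       NR_ZONE_ITEMS + NR_NODE_ITEMS,
	       "stat_text must cover every counter exactly once");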
diff --git a/mm/workingset.c b/mm/workingset.c
index 4841ae8af411..6e7f4cb1b9a7 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -612,7 +612,6 @@ struct list_lru shadow_nodes;
void workingset_update_node(struct xa_node *node)
{
- struct address_space *mapping;
struct page *page = virt_to_page(node);
/*
@@ -623,8 +622,7 @@ void workingset_update_node(struct xa_node *node)
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/
- mapping = container_of(node->array, struct address_space, i_pages);
- lockdep_assert_held(&mapping->i_pages.xa_lock);
+ lockdep_assert_held(&node->array->xa_lock);
if (node->count && node->count == node->nr_values) {
if (list_empty(&node->private_list)) {
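
For readers checking the equivalence: node->array is the xarray embedded in the owning mapping as i_pages, so the new assert names the same spinlock the removed container_of() dance resolved to. A small sketch (not patch code; the helper name is hypothetical):

#include <linux/fs.h>
#include <linux/xarray.h>

/* Both asserts below name the same lock; only the second survives the patch. */
static void assert_i_pages_locked(struct xa_node *node)
{
	struct address_space *mapping =
		container_of(node->array, struct address_space, i_pages);

	lockdep_assert_held(&mapping->i_pages.xa_lock);	/* old spelling */
	lockdep_assert_held(&node->array->xa_lock);	/* new spelling */
}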
diff --git a/mm/zpdesc.h b/mm/zpdesc.h
index fa47fece2237..25bf5ea0beb8 100644
--- a/mm/zpdesc.h
+++ b/mm/zpdesc.h
@@ -7,6 +7,9 @@
#ifndef __MM_ZPDESC_H__
#define __MM_ZPDESC_H__
+#include <linux/migrate.h>
+#include <linux/pagemap.h>
+
/*
* struct zpdesc - Memory descriptor for zpool memory.
* @flags: Page flags, mostly unused by zsmalloc.
@@ -51,8 +54,8 @@ struct zpdesc {
ZPDESC_MATCH(flags, flags);
ZPDESC_MATCH(lru, lru);
ZPDESC_MATCH(mapping, movable_ops);
-ZPDESC_MATCH(index, next);
-ZPDESC_MATCH(index, handle);
+ZPDESC_MATCH(__folio_index, next);
+ZPDESC_MATCH(__folio_index, handle);
ZPDESC_MATCH(private, zspage);
ZPDESC_MATCH(page_type, first_obj_offset);
ZPDESC_MATCH(_refcount, _refcount);
@@ -149,10 +152,9 @@ static inline struct zpdesc *pfn_zpdesc(unsigned long pfn)
return page_zpdesc(pfn_to_page(pfn));
}
-static inline void __zpdesc_set_movable(struct zpdesc *zpdesc,
- const struct movable_operations *mops)
+static inline void __zpdesc_set_movable(struct zpdesc *zpdesc)
{
- __SetPageMovable(zpdesc_page(zpdesc), mops);
+ SetPageMovableOps(zpdesc_page(zpdesc));
}
static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc)
@@ -160,16 +162,6 @@ static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc)
__SetPageZsmalloc(zpdesc_page(zpdesc));
}
-static inline void __zpdesc_clear_zsmalloc(struct zpdesc *zpdesc)
-{
- __ClearPageZsmalloc(zpdesc_page(zpdesc));
-}
-
-static inline bool zpdesc_is_isolated(struct zpdesc *zpdesc)
-{
- return PageIsolated(zpdesc_page(zpdesc));
-}
-
static inline struct zone *zpdesc_zone(struct zpdesc *zpdesc)
{
return page_zone(zpdesc_page(zpdesc));
diff --git a/mm/zpool.c b/mm/zpool.c
index 6d6d88930932..0a71d03369f1 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -226,20 +226,22 @@ const char *zpool_get_type(struct zpool *zpool)
* @size: The amount of memory to allocate.
* @gfp: The GFP flags to use when allocating memory.
* @handle: Pointer to the handle to set
+ * @nid: The preferred node id.
*
* This allocates the requested amount of memory from the pool.
* The gfp flags will be used when allocating memory, if the
* implementation supports it. The provided @handle will be
- * set to the allocated object handle.
+ * set to the allocated object handle. The allocation will
+ * prefer the NUMA node specified by @nid.
*
* Implementations must guarantee this to be thread-safe.
*
* Returns: 0 on success, negative value on error.
*/
int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp,
- unsigned long *handle)
+ unsigned long *handle, const int nid)
{
- return zpool->driver->malloc(zpool->pool, size, gfp, handle);
+ return zpool->driver->malloc(zpool->pool, size, gfp, handle, nid);
}
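
A hedged call-site sketch of the widened API; the wrapper below is hypothetical, but zswap's real call site later in this diff passes the node of the page being compressed in exactly this way.

/* Illustrative caller only; store_compressed() is not part of the patch. */
static int store_compressed(struct zpool *zpool, struct page *page,
			    size_t dlen, unsigned long *handle)
{
	gfp_t gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;

	/* Prefer backing memory on the node that already holds the page. */
	return zpool_malloc(zpool, dlen, gfp, handle, page_to_nid(page));
}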
/**
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 961b270f023c..2c5e56a65354 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -26,17 +26,10 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
-#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/slab.h>
-#include <linux/pgtable.h>
-#include <asm/tlbflush.h>
-#include <linux/cpumask.h>
-#include <linux/cpu.h>
-#include <linux/vmalloc.h>
-#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/sprintf.h>
#include <linux/shrinker.h>
@@ -44,11 +37,8 @@
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>
-#include <linux/migrate.h>
-#include <linux/wait.h>
-#include <linux/pagemap.h>
#include <linux/fs.h>
-#include <linux/local_lock.h>
+#include <linux/workqueue.h>
#include "zpdesc.h"
#define ZSPAGE_MAGIC 0x58
@@ -243,9 +233,9 @@ static inline void zpdesc_dec_zone_page_state(struct zpdesc *zpdesc)
dec_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES);
}
-static inline struct zpdesc *alloc_zpdesc(gfp_t gfp)
+static inline struct zpdesc *alloc_zpdesc(gfp_t gfp, const int nid)
{
- struct page *page = alloc_page(gfp);
+ struct page *page = alloc_pages_node(nid, gfp, 0);
return page_zpdesc(page);
}
@@ -254,6 +244,7 @@ static inline void free_zpdesc(struct zpdesc *zpdesc)
{
struct page *page = zpdesc_page(zpdesc);
+ /* PageZsmalloc is sticky until the page is freed to the buddy. */
__free_page(page);
}
@@ -462,9 +453,9 @@ static void zs_zpool_destroy(void *pool)
}
static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
+ unsigned long *handle, const int nid)
{
- *handle = zs_malloc(pool, size, gfp);
+ *handle = zs_malloc(pool, size, gfp, nid);
if (IS_ERR_VALUE(*handle))
return PTR_ERR((void *)*handle);
@@ -886,11 +877,10 @@ static void reset_zpdesc(struct zpdesc *zpdesc)
{
struct page *page = zpdesc_page(zpdesc);
- __ClearPageMovable(page);
ClearPagePrivate(page);
zpdesc->zspage = NULL;
zpdesc->next = NULL;
- __ClearPageZsmalloc(page);
+ /* PageZsmalloc is sticky until the page is freed to the buddy. */
}
static int trylock_zspage(struct zspage *zspage)
@@ -1043,8 +1033,8 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage,
* Allocate a zspage for the given size class
*/
static struct zspage *alloc_zspage(struct zs_pool *pool,
- struct size_class *class,
- gfp_t gfp)
+ struct size_class *class,
+ gfp_t gfp, const int nid)
{
int i;
struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE];
@@ -1053,6 +1043,9 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
if (!zspage)
return NULL;
+ if (!IS_ENABLED(CONFIG_COMPACTION))
+ gfp &= ~__GFP_MOVABLE;
+
zspage->magic = ZSPAGE_MAGIC;
zspage->pool = pool;
zspage->class = class->index;
@@ -1061,11 +1054,10 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
for (i = 0; i < class->pages_per_zspage; i++) {
struct zpdesc *zpdesc;
- zpdesc = alloc_zpdesc(gfp);
+ zpdesc = alloc_zpdesc(gfp, nid);
if (!zpdesc) {
while (--i >= 0) {
zpdesc_dec_zone_page_state(zpdescs[i]);
- __zpdesc_clear_zsmalloc(zpdescs[i]);
free_zpdesc(zpdescs[i]);
}
cache_free_zspage(pool, zspage);
@@ -1243,19 +1235,19 @@ void zs_obj_write(struct zs_pool *pool, unsigned long handle,
class = zspage_class(pool, zspage);
off = offset_in_page(class->size * obj_idx);
- if (off + class->size <= PAGE_SIZE) {
+ if (!ZsHugePage(zspage))
+ off += ZS_HANDLE_SIZE;
+
+ if (off + mem_len <= PAGE_SIZE) {
/* this object is contained entirely within a page */
void *dst = kmap_local_zpdesc(zpdesc);
- if (!ZsHugePage(zspage))
- off += ZS_HANDLE_SIZE;
memcpy(dst + off, handle_mem, mem_len);
kunmap_local(dst);
} else {
/* this object spans two pages */
size_t sizes[2];
- off += ZS_HANDLE_SIZE;
sizes[0] = PAGE_SIZE - off;
sizes[1] = mem_len - sizes[0];
@@ -1336,12 +1328,14 @@ static unsigned long obj_malloc(struct zs_pool *pool,
* @pool: pool to allocate from
* @size: size of block to allocate
* @gfp: gfp flags when allocating object
+ * @nid: The preferred node id to allocate new zspage (if needed)
*
* On success, handle to the allocated object is returned,
* otherwise an ERR_PTR().
* Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
*/
-unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp,
+ const int nid)
{
unsigned long handle;
struct size_class *class;
@@ -1376,7 +1370,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
spin_unlock(&class->lock);
- zspage = alloc_zspage(pool, class, gfp);
+ zspage = alloc_zspage(pool, class, gfp, nid);
if (!zspage) {
cache_free_handle(pool, handle);
return (unsigned long)ERR_PTR(-ENOMEM);
@@ -1694,8 +1688,6 @@ static void lock_zspage(struct zspage *zspage)
#ifdef CONFIG_COMPACTION
-static const struct movable_operations zsmalloc_mops;
-
static void replace_sub_page(struct size_class *class, struct zspage *zspage,
struct zpdesc *newzpdesc, struct zpdesc *oldzpdesc)
{
@@ -1718,18 +1710,17 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage,
set_first_obj_offset(newzpdesc, first_obj_offset);
if (unlikely(ZsHugePage(zspage)))
newzpdesc->handle = oldzpdesc->handle;
- __zpdesc_set_movable(newzpdesc, &zsmalloc_mops);
+ __zpdesc_set_movable(newzpdesc);
}
static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
{
/*
- * Page is locked so zspage couldn't be destroyed. For detail, look at
- * lock_zspage in free_zspage.
+ * Page is locked so zspage can't be destroyed concurrently
+ * (see free_zspage()). But if the page was already destroyed
+ * (see reset_zpdesc()), refuse isolation here.
*/
- VM_BUG_ON_PAGE(PageIsolated(page), page);
-
- return true;
+ return page_zpdesc(page)->zspage;
}
static int zs_page_migrate(struct page *newpage, struct page *page,
@@ -1747,7 +1738,15 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
unsigned long old_obj, new_obj;
unsigned int obj_idx;
- VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc));
+ /*
+ * TODO: nothing prevents a zspage from getting destroyed while
+ * it is isolated for migration, as the page lock is temporarily
+ * dropped after zs_page_isolate() succeeded: we should rework that
+ * and defer destroying such pages once they are un-isolated (putback)
+ * instead.
+ */
+ if (!zpdesc->zspage)
+ return MIGRATEPAGE_SUCCESS;
/* The page is locked, so this pointer must remain valid */
zspage = get_zspage(zpdesc);
@@ -1819,10 +1818,9 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
static void zs_page_putback(struct page *page)
{
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
}
-static const struct movable_operations zsmalloc_mops = {
+const struct movable_operations zsmalloc_mops = {
.isolate_page = zs_page_isolate,
.migrate_page = zs_page_migrate,
.putback_page = zs_page_putback,
@@ -1885,7 +1883,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
do {
WARN_ON(!zpdesc_trylock(zpdesc));
- __zpdesc_set_movable(zpdesc, &zsmalloc_mops);
+ __zpdesc_set_movable(zpdesc);
zpdesc_unlock(zpdesc);
} while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL);
}
diff --git a/mm/zswap.c b/mm/zswap.c
index 0dcc54eab58b..3c0fd8a13718 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -883,18 +883,32 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+ struct acomp_req *req;
+ struct crypto_acomp *acomp;
+ u8 *buffer;
+
+ if (IS_ERR_OR_NULL(acomp_ctx))
+ return 0;
mutex_lock(&acomp_ctx->mutex);
- if (!IS_ERR_OR_NULL(acomp_ctx)) {
- if (!IS_ERR_OR_NULL(acomp_ctx->req))
- acomp_request_free(acomp_ctx->req);
- acomp_ctx->req = NULL;
- if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
- crypto_free_acomp(acomp_ctx->acomp);
- kfree(acomp_ctx->buffer);
- }
+ req = acomp_ctx->req;
+ acomp = acomp_ctx->acomp;
+ buffer = acomp_ctx->buffer;
+ acomp_ctx->req = NULL;
+ acomp_ctx->acomp = NULL;
+ acomp_ctx->buffer = NULL;
mutex_unlock(&acomp_ctx->mutex);
+ /*
+ * Do the actual freeing after releasing the mutex to avoid subtle
+ * locking dependencies causing deadlocks.
+ */
+ if (!IS_ERR_OR_NULL(req))
+ acomp_request_free(req);
+ if (!IS_ERR_OR_NULL(acomp))
+ crypto_free_acomp(acomp);
+ kfree(buffer);
+
return 0;
}
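
The rework above follows a detach-under-lock, free-after-unlock pattern. A minimal generic sketch, with hypothetical names:

#include <linux/mutex.h>
#include <linux/slab.h>

struct percpu_ctx {
	struct mutex lock;
	void *buffer;
};

static void percpu_ctx_teardown(struct percpu_ctx *ctx)
{
	void *buffer;

	mutex_lock(&ctx->lock);
	buffer = ctx->buffer;	/* snapshot the resource ...                */
	ctx->buffer = NULL;	/* ... and detach it while the lock is held */
	mutex_unlock(&ctx->lock);

	kfree(buffer);		/* the actual freeing runs with no locks held */
}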
@@ -967,7 +981,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
zpool = pool->zpool;
gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
- alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle);
+ alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle, page_to_nid(page));
if (alloc_ret)
goto unlock;
@@ -1056,9 +1070,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
struct mempolicy *mpol;
bool folio_was_allocated;
struct swap_info_struct *si;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- };
int ret = 0;
/* try to allocate swap cache folio */
@@ -1120,7 +1131,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
folio_set_reclaim(folio);
/* start writeback */
- __swap_writepage(folio, &wbc);
+ __swap_writepage(folio, NULL);
out:
if (ret && ret != -EEXIST) {