Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 21
-rw-r--r--  mm/Kconfig.debug | 74
-rw-r--r--  mm/cma.c | 26
-rw-r--r--  mm/compaction.c | 102
-rw-r--r--  mm/damon/Kconfig | 7
-rw-r--r--  mm/damon/core-test.h | 30
-rw-r--r--  mm/damon/core.c | 113
-rw-r--r--  mm/damon/dbgfs.c | 19
-rw-r--r--  mm/damon/ops-common.c | 38
-rw-r--r--  mm/damon/ops-common.h | 2
-rw-r--r--  mm/damon/paddr.c | 167
-rw-r--r--  mm/damon/reclaim.c | 19
-rw-r--r--  mm/damon/sysfs-common.c | 2
-rw-r--r--  mm/damon/sysfs-common.h | 4
-rw-r--r--  mm/damon/sysfs-schemes.c | 391
-rw-r--r--  mm/damon/sysfs.c | 22
-rw-r--r--  mm/damon/vaddr-test.h | 20
-rw-r--r--  mm/damon/vaddr.c | 68
-rw-r--r--  mm/debug.c | 9
-rw-r--r--  mm/debug_vm_pgtable.c | 129
-rw-r--r--  mm/fadvise.c | 5
-rw-r--r--  mm/filemap.c | 300
-rw-r--r--  mm/folio-compat.c | 15
-rw-r--r--  mm/gup.c | 382
-rw-r--r--  mm/hmm.c | 15
-rw-r--r--  mm/huge_memory.c | 188
-rw-r--r--  mm/hugetlb.c | 764
-rw-r--r--  mm/hugetlb_cgroup.c | 8
-rw-r--r--  mm/hugetlb_vmemmap.c | 2
-rw-r--r--  mm/internal.h | 282
-rw-r--r--  mm/kasan/Makefile | 9
-rw-r--r--  mm/kasan/common.c | 23
-rw-r--r--  mm/kasan/generic.c | 11
-rw-r--r--  mm/kasan/hw_tags.c | 60
-rw-r--r--  mm/kasan/kasan.h | 41
-rw-r--r--  mm/kasan/kasan_test.c | 29
-rw-r--r--  mm/kasan/report.c | 43
-rw-r--r--  mm/kasan/report_generic.c | 32
-rw-r--r--  mm/kasan/report_hw_tags.c | 35
-rw-r--r--  mm/kasan/report_sw_tags.c | 26
-rw-r--r--  mm/kasan/report_tags.c | 2
-rw-r--r--  mm/kasan/shadow.c | 64
-rw-r--r--  mm/kasan/sw_tags.c | 6
-rw-r--r--  mm/khugepaged.c | 86
-rw-r--r--  mm/kmemleak.c | 88
-rw-r--r--  mm/kmsan/Makefile | 8
-rw-r--r--  mm/kmsan/core.c | 10
-rw-r--r--  mm/kmsan/instrumentation.c | 23
-rw-r--r--  mm/ksm.c | 12
-rw-r--r--  mm/madvise.c | 127
-rw-r--r--  mm/mapping_dirty_helpers.c | 2
-rw-r--r--  mm/memblock.c | 43
-rw-r--r--  mm/memcontrol.c | 183
-rw-r--r--  mm/memfd.c | 56
-rw-r--r--  mm/memory-failure.c | 231
-rw-r--r--  mm/memory-tiers.c | 4
-rw-r--r--  mm/memory.c | 268
-rw-r--r--  mm/memory_hotplug.c | 16
-rw-r--r--  mm/mempolicy.c | 189
-rw-r--r--  mm/memremap.c | 6
-rw-r--r--  mm/migrate.c | 975
-rw-r--r--  mm/migrate_device.c | 6
-rw-r--r--  mm/mincore.c | 4
-rw-r--r--  mm/mlock.c | 351
-rw-r--r--  mm/mmap.c | 1098
-rw-r--r--  mm/mmu_notifier.c | 10
-rw-r--r--  mm/mprotect.c | 170
-rw-r--r--  mm/mremap.c | 72
-rw-r--r--  mm/nommu.c | 202
-rw-r--r--  mm/oom_kill.c | 2
-rw-r--r--  mm/page-writeback.c | 107
-rw-r--r--  mm/page_alloc.c | 238
-rw-r--r--  mm/page_ext.c | 16
-rw-r--r--  mm/page_idle.c | 47
-rw-r--r--  mm/page_io.c | 191
-rw-r--r--  mm/page_owner.c | 6
-rw-r--r--  mm/page_reporting.c | 6
-rw-r--r--  mm/page_table_check.c | 1
-rw-r--r--  mm/page_vma_mapped.c | 9
-rw-r--r--  mm/pagewalk.c | 6
-rw-r--r--  mm/percpu-internal.h | 6
-rw-r--r--  mm/percpu.c | 2
-rw-r--r--  mm/readahead.c | 39
-rw-r--r--  mm/rmap.c | 327
-rw-r--r--  mm/secretmem.c | 8
-rw-r--r--  mm/shmem.c | 180
-rw-r--r--  mm/shrinker_debug.c | 13
-rw-r--r--  mm/slab.c | 2
-rw-r--r--  mm/slab.h | 18
-rw-r--r--  mm/slab_common.c | 1
-rw-r--r--  mm/slub.c | 6
-rw-r--r--  mm/sparse.c | 2
-rw-r--r--  mm/swap.c | 80
-rw-r--r--  mm/swap.h | 10
-rw-r--r--  mm/swap_state.c | 65
-rw-r--r--  mm/swapfile.c | 34
-rw-r--r--  mm/userfaultfd.c | 49
-rw-r--r--  mm/util.c | 26
-rw-r--r--  mm/vmalloc.c | 462
-rw-r--r--  mm/vmscan.c | 1088
-rw-r--r--  mm/workingset.c | 28
-rw-r--r--  mm/z3fold.c | 2
-rw-r--r--  mm/zsmalloc.c | 311
103 files changed, 6926 insertions, 4307 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index ff7b209dec05..4751031f3f05 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -191,6 +191,25 @@ config ZSMALLOC_STAT
information to userspace via debugfs.
If unsure, say N.
+config ZSMALLOC_CHAIN_SIZE
+ int "Maximum number of physical pages per-zspage"
+ default 8
+ range 4 16
+ depends on ZSMALLOC
+ help
+ This option sets the upper limit on the number of physical pages
+ that a zmalloc page (zspage) can consist of. The optimal zspage
+ chain size is calculated for each size class during the
+ initialization of the pool.
+
+ Changing this option can alter the characteristics of size classes,
+ such as the number of pages per zspage and the number of objects
+ per zspage. This can also result in different configurations of
+ the pool, as zsmalloc merges size classes with similar
+ characteristics.
+
+ For more information, see zsmalloc documentation.
+
menu "SLAB allocator options"
choice
@@ -1073,7 +1092,7 @@ config GUP_TEST
pin_user_pages*(), or pinned via get_user_pages*(), as specified
by other command line arguments.
- See tools/testing/selftests/vm/gup_test.c
+ See tools/testing/selftests/mm/gup_test.c
comment "GUP_TEST needs to have DEBUG_FS enabled"
depends on !GUP_TEST && !DEBUG_FS
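The new ZSMALLOC_CHAIN_SIZE help text above says the optimal chain length is computed per size class when the pool is initialised. As a rough stand-alone model of the trade-off it describes (an assumption-laden sketch, not the calculation in mm/zsmalloc.c, which may weigh the candidates differently), the following user-space C program shows why some object sizes want more than one page per zspage:

#include <limits.h>
#include <stdio.h>

#define ZS_PAGE_SIZE	4096UL
#define ZS_CHAIN_SIZE	8UL	/* models the CONFIG_ZSMALLOC_CHAIN_SIZE default */

/* Pick the chain length in [1, ZS_CHAIN_SIZE] that leaves the fewest unused
 * bytes per zspage for objects of 'size' bytes. Illustrative only. */
static unsigned long best_chain(unsigned long size)
{
	unsigned long pages, best = 1, best_waste = ULONG_MAX;

	for (pages = 1; pages <= ZS_CHAIN_SIZE; pages++) {
		unsigned long waste = (pages * ZS_PAGE_SIZE) % size;

		if (waste < best_waste) {
			best_waste = waste;
			best = pages;
		}
	}
	return best;
}

int main(void)
{
	/* 3072-byte objects: one page wastes 1024 bytes, three pages waste 0 */
	printf("3072-byte class -> %lu page(s) per zspage\n", best_chain(3072));
	return 0;
}

A larger chain-size limit gives this kind of search more candidates, which is why the help text warns that changing the option can alter the per-class characteristics and how size classes end up being merged.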
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index fca699ad1fb0..c3547a373c9c 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -90,7 +90,7 @@ config PAGE_OWNER
help to find bare alloc_page(s) leaks. Even if you include this
feature on your build, it is disabled in default. You should pass
"page_owner=on" to boot parameter in order to enable it. Eats
- a fair amount of memory if enabled. See tools/vm/page_owner_sort.c
+ a fair amount of memory if enabled. See tools/mm/page_owner_sort.c
for user-space helper.
If unsure, say N.
@@ -207,3 +207,75 @@ config PTDUMP_DEBUGFS
kernel.
If in doubt, say N.
+
+config HAVE_DEBUG_KMEMLEAK
+ bool
+
+config DEBUG_KMEMLEAK
+ bool "Kernel memory leak detector"
+ depends on DEBUG_KERNEL && HAVE_DEBUG_KMEMLEAK
+ select DEBUG_FS
+ select STACKTRACE if STACKTRACE_SUPPORT
+ select KALLSYMS
+ select CRC32
+ select STACKDEPOT
+ select STACKDEPOT_ALWAYS_INIT if !DEBUG_KMEMLEAK_DEFAULT_OFF
+ help
+ Say Y here if you want to enable the memory leak
+ detector. The memory allocation/freeing is traced in a way
+ similar to the Boehm's conservative garbage collector, the
+ difference being that the orphan objects are not freed but
+ only shown in /sys/kernel/debug/kmemleak. Enabling this
+ feature will introduce an overhead to memory
+ allocations. See Documentation/dev-tools/kmemleak.rst for more
+ details.
+
+ Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances
+ of finding leaks due to the slab objects poisoning.
+
+ In order to access the kmemleak file, debugfs needs to be
+ mounted (usually at /sys/kernel/debug).
+
+config DEBUG_KMEMLEAK_MEM_POOL_SIZE
+ int "Kmemleak memory pool size"
+ depends on DEBUG_KMEMLEAK
+ range 200 1000000
+ default 16000
+ help
+ Kmemleak must track all the memory allocations to avoid
+ reporting false positives. Since memory may be allocated or
+ freed before kmemleak is fully initialised, use a static pool
+ of metadata objects to track such callbacks. After kmemleak is
+ fully initialised, this memory pool acts as an emergency one
+ if slab allocations fail.
+
+config DEBUG_KMEMLEAK_TEST
+ tristate "Simple test for the kernel memory leak detector"
+ depends on DEBUG_KMEMLEAK && m
+ help
+ This option enables a module that explicitly leaks memory.
+
+ If unsure, say N.
+
+config DEBUG_KMEMLEAK_DEFAULT_OFF
+ bool "Default kmemleak to off"
+ depends on DEBUG_KMEMLEAK
+ help
+ Say Y here to disable kmemleak by default. It can then be enabled
+ on the command line via kmemleak=on.
+
+config DEBUG_KMEMLEAK_AUTO_SCAN
+ bool "Enable kmemleak auto scan thread on boot up"
+ default y
+ depends on DEBUG_KMEMLEAK
+ help
+ Depending on the cpu, kmemleak scan may be cpu intensive and can
+ stall user tasks at times. This option enables/disables automatic
+ kmemleak scan at boot up.
+
+ Say N here to disable kmemleak auto scan thread to stop automatic
+ scanning. Disabling this option disables automatic reporting of
+ memory leaks.
+
+ If unsure, say Y.
+
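DEBUG_KMEMLEAK_TEST above is described as a module that leaks memory on purpose so the detector has something to report. A minimal, hedged sketch of such a module (not the in-tree test code; the name and allocation size are made up for illustration) could look like:

// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

static int __init leaky_init(void)
{
	/* Allocate and deliberately forget the pointer: nothing keeps a
	 * reference to it, so a later kmemleak scan should report it as an
	 * unreferenced object in /sys/kernel/debug/kmemleak. */
	void *p = kmalloc(64, GFP_KERNEL);

	pr_info("leaky: leaked one 64-byte object (%p)\n", p);
	return 0;
}

static void __exit leaky_exit(void)
{
}

module_init(leaky_init);
module_exit(leaky_exit);
MODULE_LICENSE("GPL");

With DEBUG_KMEMLEAK enabled (and not defaulted off), loading such a module, writing "scan" to /sys/kernel/debug/kmemleak and then reading that file should list the leaked allocation, as the DEBUG_KMEMLEAK help text describes.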
diff --git a/mm/cma.c b/mm/cma.c
index 4a978e09547a..a7263aa02c92 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -322,18 +322,6 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
phys_addr_t addr = 0;
/*
- * All pages in the reserved area must come from the same zone.
- * If the requested region crosses the low/high memory boundary,
- * try allocating from high memory first and fall back to low
- * memory in case of failure.
- */
- if (base < highmem_start && limit > highmem_start) {
- addr = memblock_alloc_range_nid(size, alignment,
- highmem_start, limit, nid, true);
- limit = highmem_start;
- }
-
- /*
* If there is enough memory, try a bottom-up allocation first.
* It will place the new cma area close to the start of the node
* and guarantee that the compaction is moving pages out of the
@@ -350,6 +338,18 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
}
#endif
+ /*
+ * All pages in the reserved area must come from the same zone.
+ * If the requested region crosses the low/high memory boundary,
+ * try allocating from high memory first and fall back to low
+ * memory in case of failure.
+ */
+ if (!addr && base < highmem_start && limit > highmem_start) {
+ addr = memblock_alloc_range_nid(size, alignment,
+ highmem_start, limit, nid, true);
+ limit = highmem_start;
+ }
+
if (!addr) {
addr = memblock_alloc_range_nid(size, alignment, base,
limit, nid, true);
@@ -491,7 +491,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
start = bitmap_no + mask + 1;
}
- trace_cma_alloc_finish(cma->name, pfn, page, count, align);
+ trace_cma_alloc_finish(cma->name, pfn, page, count, align, ret);
/*
* CMA can allocate multiple page blocks, which results in different
diff --git a/mm/compaction.c b/mm/compaction.c
index ca1603524bbe..5a9501e0ae01 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -122,7 +122,6 @@ bool PageMovable(struct page *page)
return false;
}
-EXPORT_SYMBOL(PageMovable);
void __SetPageMovable(struct page *page, const struct movable_operations *mops)
{
@@ -977,7 +976,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
locked = NULL;
}
- if (!isolate_movable_page(page, mode))
+ if (isolate_movable_page(page, mode))
goto isolate_success;
}
@@ -1102,12 +1101,12 @@ isolate_success_no_list:
/*
* Avoid isolating too much unless this block is being
- * rescanned (e.g. dirty/writeback pages, parallel allocation)
+ * fully scanned (e.g. dirty/writeback pages, parallel allocation)
* or a lock is contended. For contention, isolate quickly to
* potentially remove one source of contention.
*/
if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX &&
- !cc->rescan && !cc->contended) {
+ !cc->finish_pageblock && !cc->contended) {
++low_pfn;
break;
}
@@ -1172,14 +1171,14 @@ isolate_abort:
}
/*
- * Updated the cached scanner pfn once the pageblock has been scanned
+ * Update the cached scanner pfn once the pageblock has been scanned.
* Pages will either be migrated in which case there is no point
* scanning in the near future or migration failed in which case the
* failure reason may persist. The block is marked for skipping if
* there were no pages isolated in the block or if the block is
* rescanned twice in a row.
*/
- if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) {
+ if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) {
if (valid_page && !skip_updated)
set_pageblock_skip(valid_page);
update_cached_migrate(cc, low_pfn);
@@ -1763,6 +1762,13 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
return pfn;
/*
+ * If the pageblock should be finished then do not select a different
+ * pageblock.
+ */
+ if (cc->finish_pageblock)
+ return pfn;
+
+ /*
* If the migrate_pfn is not at the start of a zone or the start
* of a pageblock then assume this is a continuation of a previous
* scan restarted due to COMPACT_CLUSTER_MAX.
@@ -1839,6 +1845,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
pfn = cc->zone->zone_start_pfn;
cc->fast_search_fail = 0;
found_block = true;
+ set_pageblock_skip(freepage);
break;
}
}
@@ -2026,6 +2033,8 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat)
struct zone *zone;
zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
score += fragmentation_score_zone_weighted(zone);
}
@@ -2314,9 +2323,6 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
return ret;
- /* huh, compaction_suitable is returning something unexpected */
- VM_BUG_ON(ret != COMPACT_CONTINUE);
-
/*
* Clear pageblock skip if there were failures recently and compaction
* is about to be retried after being deferred.
@@ -2374,19 +2380,20 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
unsigned long iteration_start_pfn = cc->migrate_pfn;
/*
- * Avoid multiple rescans which can happen if a page cannot be
- * isolated (dirty/writeback in async mode) or if the migrated
- * pages are being allocated before the pageblock is cleared.
- * The first rescan will capture the entire pageblock for
- * migration. If it fails, it'll be marked skip and scanning
- * will proceed as normal.
+ * Avoid multiple rescans of the same pageblock which can
+ * happen if a page cannot be isolated (dirty/writeback in
+ * async mode) or if the migrated pages are being allocated
+ * before the pageblock is cleared. The first rescan will
+ * capture the entire pageblock for migration. If it fails,
+ * it'll be marked skip and scanning will proceed as normal.
*/
- cc->rescan = false;
+ cc->finish_pageblock = false;
if (pageblock_start_pfn(last_migrated_pfn) ==
pageblock_start_pfn(iteration_start_pfn)) {
- cc->rescan = true;
+ cc->finish_pageblock = true;
}
+rescan:
switch (isolate_migratepages(cc)) {
case ISOLATE_ABORT:
ret = COMPACT_CONTENDED;
@@ -2429,18 +2436,37 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
goto out;
}
/*
- * We failed to migrate at least one page in the current
- * order-aligned block, so skip the rest of it.
+ * If an ASYNC or SYNC_LIGHT fails to migrate a page
+ * within the current order-aligned block, scan the
+ * remainder of the pageblock. This will mark the
+ * pageblock "skip" to avoid rescanning in the near
+ * future. This will isolate more pages than necessary
+ * for the request but avoid loops due to
+ * fast_find_migrateblock revisiting blocks that were
+ * recently partially scanned.
*/
- if (cc->direct_compaction &&
- (cc->mode == MIGRATE_ASYNC)) {
- cc->migrate_pfn = block_end_pfn(
- cc->migrate_pfn - 1, cc->order);
- /* Draining pcplists is useless in this case */
- last_migrated_pfn = 0;
+ if (cc->direct_compaction && !cc->finish_pageblock &&
+ (cc->mode < MIGRATE_SYNC)) {
+ cc->finish_pageblock = true;
+
+ /*
+ * Draining pcplists does not help THP if
+ * any page failed to migrate. Even after
+ * drain, the pageblock will not be free.
+ */
+ if (cc->order == COMPACTION_HPAGE_ORDER)
+ last_migrated_pfn = 0;
+
+ goto rescan;
}
}
+ /* Stop if a page has been captured */
+ if (capc && capc->page) {
+ ret = COMPACT_SUCCESS;
+ break;
+ }
+
check_drain:
/*
* Has the migration scanner moved away from the previous
@@ -2459,12 +2485,6 @@ check_drain:
last_migrated_pfn = 0;
}
}
-
- /* Stop if a page has been captured */
- if (capc && capc->page) {
- ret = COMPACT_SUCCESS;
- break;
- }
}
out:
@@ -2492,6 +2512,9 @@ out:
trace_mm_compaction_end(cc, start_pfn, end_pfn, sync, ret);
+ VM_BUG_ON(!list_empty(&cc->freepages));
+ VM_BUG_ON(!list_empty(&cc->migratepages));
+
return ret;
}
@@ -2530,9 +2553,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
ret = compact_zone(&cc, &capc);
- VM_BUG_ON(!list_empty(&cc.freepages));
- VM_BUG_ON(!list_empty(&cc.migratepages));
-
/*
* Make sure we hide capture control first before we read the captured
* page pointer, otherwise an interrupt could free and capture a page
@@ -2664,8 +2684,10 @@ static void proactive_compact_node(pg_data_t *pgdat)
compact_zone(&cc, NULL);
- VM_BUG_ON(!list_empty(&cc.freepages));
- VM_BUG_ON(!list_empty(&cc.migratepages));
+ count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
+ cc.total_migrate_scanned);
+ count_compact_events(KCOMPACTD_FREE_SCANNED,
+ cc.total_free_scanned);
}
}
@@ -2693,9 +2715,6 @@ static void compact_node(int nid)
cc.zone = zone;
compact_zone(&cc, NULL);
-
- VM_BUG_ON(!list_empty(&cc.freepages));
- VM_BUG_ON(!list_empty(&cc.migratepages));
}
}
@@ -2735,6 +2754,8 @@ int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
continue;
pgdat->proactive_compact_trigger = true;
+ trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, -1,
+ pgdat->nr_zones - 1);
wake_up_interruptible(&pgdat->kcompactd_wait);
}
}
@@ -2872,9 +2893,6 @@ static void kcompactd_do_work(pg_data_t *pgdat)
cc.total_migrate_scanned);
count_compact_events(KCOMPACTD_FREE_SCANNED,
cc.total_free_scanned);
-
- VM_BUG_ON(!list_empty(&cc.freepages));
- VM_BUG_ON(!list_empty(&cc.migratepages));
}
/*
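The compact_zone() changes above replace cc->rescan with cc->finish_pageblock and change the reaction to a failed migration: instead of skipping the rest of the order-aligned block, direct ASYNC or SYNC_LIGHT compaction now rescans the same pageblock once in full (via the new rescan label) before it gets marked for skipping. Below is a hedged user-space model of just that retry decision, with an enum that mirrors the migrate_mode ordering the cc->mode < MIGRATE_SYNC test relies on (illustration only, not kernel code):

#include <stdbool.h>
#include <stdio.h>

enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC };

/* Mirrors the condition added to compact_zone(): retry the pageblock only
 * for direct compaction in a mode lighter than full sync, and only if we
 * are not already in a "finish the pageblock" pass. */
static bool should_rescan_pageblock(bool direct_compaction,
				    bool finish_pageblock,
				    enum migrate_mode mode)
{
	return direct_compaction && !finish_pageblock && mode < MIGRATE_SYNC;
}

int main(void)
{
	printf("ASYNC, first failure:        %d\n",
	       should_rescan_pageblock(true, false, MIGRATE_ASYNC));
	printf("SYNC_LIGHT, already retried: %d\n",
	       should_rescan_pageblock(true, true, MIGRATE_SYNC_LIGHT));
	printf("full SYNC:                   %d\n",
	       should_rescan_pageblock(true, false, MIGRATE_SYNC));
	return 0;
}

This prints 1, 0 and 0 respectively: only the first failure in a lighter-than-sync direct compaction pass triggers the extra full scan of the pageblock.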
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index 7821fcb3f258..436c6b4cb5ec 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -60,7 +60,7 @@ config DAMON_SYSFS
the interface for arbitrary data access monitoring.
config DAMON_DBGFS
- bool "DAMON debugfs interface"
+ bool "DAMON debugfs interface (DEPRECATED!)"
depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS
help
This builds the debugfs interface for DAMON. The user space admins
@@ -68,8 +68,9 @@ config DAMON_DBGFS
If unsure, say N.
- This will be removed after >5.15.y LTS kernel is released, so users
- should move to the sysfs interface (DAMON_SYSFS).
+ This is deprecated, so users should move to the sysfs interface
+ (DAMON_SYSFS). If you depend on this and cannot move, please report
+ your usecase to damon@lists.linux.dev and linux-mm@kvack.org.
config DAMON_DBGFS_KUNIT_TEST
bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS
diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h
index 3db9b7368756..fae64d32b925 100644
--- a/mm/damon/core-test.h
+++ b/mm/damon/core-test.h
@@ -289,6 +289,35 @@ static void damon_test_set_regions(struct kunit *test)
damon_destroy_target(t);
}
+static void damon_test_update_monitoring_result(struct kunit *test)
+{
+ struct damon_attrs old_attrs = {
+ .sample_interval = 10, .aggr_interval = 1000,};
+ struct damon_attrs new_attrs;
+ struct damon_region *r = damon_new_region(3, 7);
+
+ r->nr_accesses = 15;
+ r->age = 20;
+
+ new_attrs = (struct damon_attrs){
+ .sample_interval = 100, .aggr_interval = 10000,};
+ damon_update_monitoring_result(r, &old_attrs, &new_attrs);
+ KUNIT_EXPECT_EQ(test, r->nr_accesses, 15);
+ KUNIT_EXPECT_EQ(test, r->age, 2);
+
+ new_attrs = (struct damon_attrs){
+ .sample_interval = 1, .aggr_interval = 1000};
+ damon_update_monitoring_result(r, &old_attrs, &new_attrs);
+ KUNIT_EXPECT_EQ(test, r->nr_accesses, 150);
+ KUNIT_EXPECT_EQ(test, r->age, 2);
+
+ new_attrs = (struct damon_attrs){
+ .sample_interval = 1, .aggr_interval = 100};
+ damon_update_monitoring_result(r, &old_attrs, &new_attrs);
+ KUNIT_EXPECT_EQ(test, r->nr_accesses, 150);
+ KUNIT_EXPECT_EQ(test, r->age, 20);
+}
+
static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_target),
KUNIT_CASE(damon_test_regions),
@@ -299,6 +328,7 @@ static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_split_regions_of),
KUNIT_CASE(damon_test_ops_registration),
KUNIT_CASE(damon_test_set_regions),
+ KUNIT_CASE(damon_test_update_monitoring_result),
{},
};
diff --git a/mm/damon/core.c b/mm/damon/core.c
index ceec75b88ef9..d9ef62047bf5 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -263,6 +263,40 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
return 0;
}
+struct damos_filter *damos_new_filter(enum damos_filter_type type,
+ bool matching)
+{
+ struct damos_filter *filter;
+
+ filter = kmalloc(sizeof(*filter), GFP_KERNEL);
+ if (!filter)
+ return NULL;
+ filter->type = type;
+ filter->matching = matching;
+ return filter;
+}
+
+void damos_add_filter(struct damos *s, struct damos_filter *f)
+{
+ list_add_tail(&f->list, &s->filters);
+}
+
+static void damos_del_filter(struct damos_filter *f)
+{
+ list_del(&f->list);
+}
+
+static void damos_free_filter(struct damos_filter *f)
+{
+ kfree(f);
+}
+
+void damos_destroy_filter(struct damos_filter *f)
+{
+ damos_del_filter(f);
+ damos_free_filter(f);
+}
+
/* initialize private fields of damos_quota and return the pointer */
static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota)
{
@@ -287,6 +321,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
return NULL;
scheme->pattern = *pattern;
scheme->action = action;
+ INIT_LIST_HEAD(&scheme->filters);
scheme->stat = (struct damos_stat){};
INIT_LIST_HEAD(&scheme->list);
@@ -315,6 +350,10 @@ static void damon_free_scheme(struct damos *s)
void damon_destroy_scheme(struct damos *s)
{
+ struct damos_filter *f, *next;
+
+ damos_for_each_filter_safe(f, next, s)
+ damos_destroy_filter(f);
damon_del_scheme(s);
damon_free_scheme(s);
}
@@ -426,6 +465,76 @@ void damon_destroy_ctx(struct damon_ctx *ctx)
kfree(ctx);
}
+static unsigned int damon_age_for_new_attrs(unsigned int age,
+ struct damon_attrs *old_attrs, struct damon_attrs *new_attrs)
+{
+ return age * old_attrs->aggr_interval / new_attrs->aggr_interval;
+}
+
+/* convert access ratio in bp (per 10,000) to nr_accesses */
+static unsigned int damon_accesses_bp_to_nr_accesses(
+ unsigned int accesses_bp, struct damon_attrs *attrs)
+{
+ unsigned int max_nr_accesses =
+ attrs->aggr_interval / attrs->sample_interval;
+
+ return accesses_bp * max_nr_accesses / 10000;
+}
+
+/* convert nr_accesses to access ratio in bp (per 10,000) */
+static unsigned int damon_nr_accesses_to_accesses_bp(
+ unsigned int nr_accesses, struct damon_attrs *attrs)
+{
+ unsigned int max_nr_accesses =
+ attrs->aggr_interval / attrs->sample_interval;
+
+ return nr_accesses * 10000 / max_nr_accesses;
+}
+
+static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses,
+ struct damon_attrs *old_attrs, struct damon_attrs *new_attrs)
+{
+ return damon_accesses_bp_to_nr_accesses(
+ damon_nr_accesses_to_accesses_bp(
+ nr_accesses, old_attrs),
+ new_attrs);
+}
+
+static void damon_update_monitoring_result(struct damon_region *r,
+ struct damon_attrs *old_attrs, struct damon_attrs *new_attrs)
+{
+ r->nr_accesses = damon_nr_accesses_for_new_attrs(r->nr_accesses,
+ old_attrs, new_attrs);
+ r->age = damon_age_for_new_attrs(r->age, old_attrs, new_attrs);
+}
+
+/*
+ * region->nr_accesses is the number of sampling intervals in the last
+ * aggregation interval that access to the region has found, and region->age is
+ * the number of aggregation intervals that its access pattern has maintained.
+ * For the reason, the real meaning of the two fields depend on current
+ * sampling interval and aggregation interval. This function updates
+ * ->nr_accesses and ->age of given damon_ctx's regions for new damon_attrs.
+ */
+static void damon_update_monitoring_results(struct damon_ctx *ctx,
+ struct damon_attrs *new_attrs)
+{
+ struct damon_attrs *old_attrs = &ctx->attrs;
+ struct damon_target *t;
+ struct damon_region *r;
+
+ /* if any interval is zero, simply forgive conversion */
+ if (!old_attrs->sample_interval || !old_attrs->aggr_interval ||
+ !new_attrs->sample_interval ||
+ !new_attrs->aggr_interval)
+ return;
+
+ damon_for_each_target(t, ctx)
+ damon_for_each_region(r, t)
+ damon_update_monitoring_result(
+ r, old_attrs, new_attrs);
+}
+
/**
* damon_set_attrs() - Set attributes for the monitoring.
* @ctx: monitoring context
@@ -443,6 +552,7 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
if (attrs->min_nr_regions > attrs->max_nr_regions)
return -EINVAL;
+ damon_update_monitoring_results(ctx, attrs);
ctx->attrs = *attrs;
return 0;
}
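The helpers added above rescale each region's nr_accesses through an access ratio in basis points, so the same observed ratio survives a change of sampling and aggregation intervals, and scale age by the ratio of the aggregation intervals. A hedged user-space mirror of that arithmetic, using the same figures as the damon_test_update_monitoring_result() kunit case (illustration only; the kernel operates on struct damon_attrs and struct damon_region):

#include <stdio.h>

struct attrs { unsigned int sample_us, aggr_us; };

/* nr_accesses counts the sampling intervals, within one aggregation
 * interval, in which an access was seen, so its maximum is aggr/sample.
 * Convert to basis points (per 10,000) under the old attrs, then back to
 * a count under the new ones, as damon_nr_accesses_for_new_attrs() does. */
static unsigned int rescale_nr_accesses(unsigned int nr_accesses,
					struct attrs old_a, struct attrs new_a)
{
	unsigned int old_max = old_a.aggr_us / old_a.sample_us;
	unsigned int new_max = new_a.aggr_us / new_a.sample_us;
	unsigned int bp = nr_accesses * 10000 / old_max;

	return bp * new_max / 10000;
}

/* age counts aggregation intervals, so it only scales with aggr_us. */
static unsigned int rescale_age(unsigned int age,
				struct attrs old_a, struct attrs new_a)
{
	return age * old_a.aggr_us / new_a.aggr_us;
}

int main(void)
{
	struct attrs old_a = { .sample_us = 10, .aggr_us = 1000 };
	struct attrs new_a = { .sample_us = 100, .aggr_us = 10000 };

	/* Matches the kunit case: nr_accesses 15 stays 15, age 20 becomes 2 */
	printf("nr_accesses: %u, age: %u\n",
	       rescale_nr_accesses(15, old_a, new_a),
	       rescale_age(20, old_a, new_a));
	return 0;
}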
@@ -1230,7 +1340,8 @@ static int kdamond_fn(void *data)
if (ctx->callback.after_aggregation &&
ctx->callback.after_aggregation(ctx))
break;
- kdamond_apply_schemes(ctx);
+ if (!list_empty(&ctx->schemes))
+ kdamond_apply_schemes(ctx);
kdamond_reset_aggregated(ctx);
kdamond_split_regions(ctx);
if (ctx->ops.reset_aggregated)
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index b3f454a5c682..124f0f8c97b7 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -20,6 +20,14 @@ static int dbgfs_nr_ctxs;
static struct dentry **dbgfs_dirs;
static DEFINE_MUTEX(damon_dbgfs_lock);
+static void damon_dbgfs_warn_deprecation(void)
+{
+ pr_warn_once("DAMON debugfs interface is deprecated, "
+ "so users should move to DAMON_SYSFS. If you cannot, "
+ "please report your usecase to damon@lists.linux.dev and "
+ "linux-mm@kvack.org.\n");
+}
+
/*
* Returns non-empty string on success, negative error code otherwise.
*/
@@ -711,6 +719,8 @@ out:
static int damon_dbgfs_open(struct inode *inode, struct file *file)
{
+ damon_dbgfs_warn_deprecation();
+
file->private_data = inode->i_private;
return nonseekable_open(inode, file);
@@ -1039,15 +1049,24 @@ static ssize_t dbgfs_monitor_on_write(struct file *file,
return ret;
}
+static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file)
+{
+ damon_dbgfs_warn_deprecation();
+ return nonseekable_open(inode, file);
+}
+
static const struct file_operations mk_contexts_fops = {
+ .open = damon_dbgfs_static_file_open,
.write = dbgfs_mk_context_write,
};
static const struct file_operations rm_contexts_fops = {
+ .open = damon_dbgfs_static_file_open,
.write = dbgfs_rm_context_write,
};
static const struct file_operations monitor_on_fops = {
+ .open = damon_dbgfs_static_file_open,
.read = dbgfs_monitor_on_read,
.write = dbgfs_monitor_on_write,
};
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index 75409601f934..cc63cf953636 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -16,29 +16,33 @@
* Get an online page for a pfn if it's in the LRU list. Otherwise, returns
* NULL.
*
- * The body of this function is stolen from the 'page_idle_get_page()'. We
+ * The body of this function is stolen from the 'page_idle_get_folio()'. We
* steal rather than reuse it because the code is quite simple.
*/
-struct page *damon_get_page(unsigned long pfn)
+struct folio *damon_get_folio(unsigned long pfn)
{
struct page *page = pfn_to_online_page(pfn);
+ struct folio *folio;
- if (!page || !PageLRU(page) || !get_page_unless_zero(page))
+ if (!page || PageTail(page))
return NULL;
- if (unlikely(!PageLRU(page))) {
- put_page(page);
- page = NULL;
+ folio = page_folio(page);
+ if (!folio_test_lru(folio) || !folio_try_get(folio))
+ return NULL;
+ if (unlikely(page_folio(page) != folio || !folio_test_lru(folio))) {
+ folio_put(folio);
+ folio = NULL;
}
- return page;
+ return folio;
}
void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr)
{
bool referenced = false;
- struct page *page = damon_get_page(pte_pfn(*pte));
+ struct folio *folio = damon_get_folio(pte_pfn(*pte));
- if (!page)
+ if (!folio)
return;
if (pte_young(*pte)) {
@@ -52,19 +56,19 @@ void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr)
#endif /* CONFIG_MMU_NOTIFIER */
if (referenced)
- set_page_young(page);
+ folio_set_young(folio);
- set_page_idle(page);
- put_page(page);
+ folio_set_idle(folio);
+ folio_put(folio);
}
void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
bool referenced = false;
- struct page *page = damon_get_page(pmd_pfn(*pmd));
+ struct folio *folio = damon_get_folio(pmd_pfn(*pmd));
- if (!page)
+ if (!folio)
return;
if (pmd_young(*pmd)) {
@@ -78,10 +82,10 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr)
#endif /* CONFIG_MMU_NOTIFIER */
if (referenced)
- set_page_young(page);
+ folio_set_young(folio);
- set_page_idle(page);
- put_page(page);
+ folio_set_idle(folio);
+ folio_put(folio);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
}
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index 8d82d3722204..14f4bc69f29b 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -7,7 +7,7 @@
#include <linux/damon.h>
-struct page *damon_get_page(unsigned long pfn);
+struct folio *damon_get_folio(unsigned long pfn);
void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr);
void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index e1a4315c4be6..dd9c33fbe805 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -33,17 +33,15 @@ static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma,
static void damon_pa_mkold(unsigned long paddr)
{
- struct folio *folio;
- struct page *page = damon_get_page(PHYS_PFN(paddr));
+ struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
struct rmap_walk_control rwc = {
.rmap_one = __damon_pa_mkold,
.anon_lock = folio_lock_anon_vma_read,
};
bool need_lock;
- if (!page)
+ if (!folio)
return;
- folio = page_folio(page);
if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
folio_set_idle(folio);
@@ -81,70 +79,57 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
}
}
-struct damon_pa_access_chk_result {
- unsigned long page_sz;
- bool accessed;
-};
-
static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma,
unsigned long addr, void *arg)
{
- struct damon_pa_access_chk_result *result = arg;
+ bool *accessed = arg;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
- result->accessed = false;
- result->page_sz = PAGE_SIZE;
+ *accessed = false;
while (page_vma_mapped_walk(&pvmw)) {
addr = pvmw.address;
if (pvmw.pte) {
- result->accessed = pte_young(*pvmw.pte) ||
+ *accessed = pte_young(*pvmw.pte) ||
!folio_test_idle(folio) ||
mmu_notifier_test_young(vma->vm_mm, addr);
} else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- result->accessed = pmd_young(*pvmw.pmd) ||
+ *accessed = pmd_young(*pvmw.pmd) ||
!folio_test_idle(folio) ||
mmu_notifier_test_young(vma->vm_mm, addr);
- result->page_sz = HPAGE_PMD_SIZE;
#else
WARN_ON_ONCE(1);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
}
- if (result->accessed) {
+ if (*accessed) {
page_vma_mapped_walk_done(&pvmw);
break;
}
}
/* If accessed, stop walking */
- return !result->accessed;
+ return *accessed == false;
}
-static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz)
+static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
{
- struct folio *folio;
- struct page *page = damon_get_page(PHYS_PFN(paddr));
- struct damon_pa_access_chk_result result = {
- .page_sz = PAGE_SIZE,
- .accessed = false,
- };
+ struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
+ bool accessed = false;
struct rmap_walk_control rwc = {
- .arg = &result,
+ .arg = &accessed,
.rmap_one = __damon_pa_young,
.anon_lock = folio_lock_anon_vma_read,
};
bool need_lock;
- if (!page)
+ if (!folio)
return false;
- folio = page_folio(page);
if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
if (folio_test_idle(folio))
- result.accessed = false;
+ accessed = false;
else
- result.accessed = true;
- folio_put(folio);
+ accessed = true;
goto out;
}
@@ -158,28 +143,28 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz)
if (need_lock)
folio_unlock(folio);
- folio_put(folio);
out:
- *page_sz = result.page_sz;
- return result.accessed;
+ *folio_sz = folio_size(folio);
+ folio_put(folio);
+ return accessed;
}
static void __damon_pa_check_access(struct damon_region *r)
{
static unsigned long last_addr;
- static unsigned long last_page_sz = PAGE_SIZE;
+ static unsigned long last_folio_sz = PAGE_SIZE;
static bool last_accessed;
/* If the region is in the last checked page, reuse the result */
- if (ALIGN_DOWN(last_addr, last_page_sz) ==
- ALIGN_DOWN(r->sampling_addr, last_page_sz)) {
+ if (ALIGN_DOWN(last_addr, last_folio_sz) ==
+ ALIGN_DOWN(r->sampling_addr, last_folio_sz)) {
if (last_accessed)
r->nr_accesses++;
return;
}
- last_accessed = damon_pa_young(r->sampling_addr, &last_page_sz);
+ last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz);
if (last_accessed)
r->nr_accesses++;
@@ -202,63 +187,115 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
return max_nr_accesses;
}
-static unsigned long damon_pa_pageout(struct damon_region *r)
+static bool __damos_pa_filter_out(struct damos_filter *filter,
+ struct folio *folio)
+{
+ bool matched = false;
+ struct mem_cgroup *memcg;
+
+ switch (filter->type) {
+ case DAMOS_FILTER_TYPE_ANON:
+ matched = folio_test_anon(folio);
+ break;
+ case DAMOS_FILTER_TYPE_MEMCG:
+ rcu_read_lock();
+ memcg = folio_memcg_check(folio);
+ if (!memcg)
+ matched = false;
+ else
+ matched = filter->memcg_id == mem_cgroup_id(memcg);
+ rcu_read_unlock();
+ break;
+ default:
+ break;
+ }
+
+ return matched == filter->matching;
+}
+
+/*
+ * damos_pa_filter_out - Return true if the page should be filtered out.
+ */
+static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio)
+{
+ struct damos_filter *filter;
+
+ damos_for_each_filter(filter, scheme) {
+ if (__damos_pa_filter_out(filter, folio))
+ return true;
+ }
+ return false;
+}
+
+static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
{
unsigned long addr, applied;
- LIST_HEAD(page_list);
+ LIST_HEAD(folio_list);
for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
- struct page *page = damon_get_page(PHYS_PFN(addr));
+ struct folio *folio = damon_get_folio(PHYS_PFN(addr));
- if (!page)
+ if (!folio)
continue;
- ClearPageReferenced(page);
- test_and_clear_page_young(page);
- if (isolate_lru_page(page)) {
- put_page(page);
+ if (damos_pa_filter_out(s, folio)) {
+ folio_put(folio);
continue;
}
- if (PageUnevictable(page)) {
- putback_lru_page(page);
- } else {
- list_add(&page->lru, &page_list);
- put_page(page);
+
+ folio_clear_referenced(folio);
+ folio_test_clear_young(folio);
+ if (!folio_isolate_lru(folio)) {
+ folio_put(folio);
+ continue;
}
+ if (folio_test_unevictable(folio))
+ folio_putback_lru(folio);
+ else
+ list_add(&folio->lru, &folio_list);
+ folio_put(folio);
}
- applied = reclaim_pages(&page_list);
+ applied = reclaim_pages(&folio_list);
cond_resched();
return applied * PAGE_SIZE;
}
static inline unsigned long damon_pa_mark_accessed_or_deactivate(
- struct damon_region *r, bool mark_accessed)
+ struct damon_region *r, struct damos *s, bool mark_accessed)
{
unsigned long addr, applied = 0;
for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
- struct page *page = damon_get_page(PHYS_PFN(addr));
+ struct folio *folio = damon_get_folio(PHYS_PFN(addr));
+
+ if (!folio)
+ continue;
- if (!page)
+ if (damos_pa_filter_out(s, folio)) {
+ folio_put(folio);
continue;
+ }
+
if (mark_accessed)
- mark_page_accessed(page);
+ folio_mark_accessed(folio);
else
- deactivate_page(page);
- put_page(page);
- applied++;
+ folio_deactivate(folio);
+ applied += folio_nr_pages(folio);
+ folio_put(folio);
}
return applied * PAGE_SIZE;
}
-static unsigned long damon_pa_mark_accessed(struct damon_region *r)
+static unsigned long damon_pa_mark_accessed(struct damon_region *r,
+ struct damos *s)
{
- return damon_pa_mark_accessed_or_deactivate(r, true);
+ return damon_pa_mark_accessed_or_deactivate(r, s, true);
}
-static unsigned long damon_pa_deactivate_pages(struct damon_region *r)
+static unsigned long damon_pa_deactivate_pages(struct damon_region *r,
+ struct damos *s)
{
- return damon_pa_mark_accessed_or_deactivate(r, false);
+ return damon_pa_mark_accessed_or_deactivate(r, s, false);
}
static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
@@ -267,11 +304,11 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
{
switch (scheme->action) {
case DAMOS_PAGEOUT:
- return damon_pa_pageout(r);
+ return damon_pa_pageout(r, scheme);
case DAMOS_LRU_PRIO:
- return damon_pa_mark_accessed(r);
+ return damon_pa_mark_accessed(r, scheme);
case DAMOS_LRU_DEPRIO:
- return damon_pa_deactivate_pages(r);
+ return damon_pa_deactivate_pages(r, scheme);
case DAMOS_STAT:
break;
default:
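Taken together, damos_new_filter()/damos_add_filter() from core.c and damos_pa_filter_out() above give each scheme an exclusion list: a filter with matching set excludes folios that match its type, while a filter with matching clear excludes folios that do not match, which effectively restricts the action to that type. A hedged sketch of how a caller might combine both forms (my_scheme and my_memcg_id are hypothetical; the skip_anon support added to mm/damon/reclaim.c below uses only the first variant):

/* Illustrative only: attach two filters to an existing scheme. Filters
 * already added are freed together with the scheme on destruction. */
static int my_attach_filters(struct damos *my_scheme,
			     unsigned short my_memcg_id)
{
	struct damos_filter *filter;

	/* matching == true: folios that ARE anonymous are filtered out */
	filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
	if (!filter)
		return -ENOMEM;
	damos_add_filter(my_scheme, filter);

	/* matching == false: folios NOT in the memcg are filtered out,
	 * i.e. the action only applies to that memcg */
	filter = damos_new_filter(DAMOS_FILTER_TYPE_MEMCG, false);
	if (!filter)
		return -ENOMEM;
	filter->memcg_id = my_memcg_id;	/* e.g. from mem_cgroup_id() */
	damos_add_filter(my_scheme, filter);

	return 0;
}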
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index e82631f39481..648d2a85523a 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -99,6 +99,15 @@ static unsigned long monitor_region_end __read_mostly;
module_param(monitor_region_end, ulong, 0600);
/*
+ * Skip anonymous pages reclamation.
+ *
+ * If this parameter is set as ``Y``, DAMON_RECLAIM does not reclaim anonymous
+ * pages. By default, ``N``.
+ */
+static bool skip_anon __read_mostly;
+module_param(skip_anon, bool, 0600);
+
+/*
* PID of the DAMON thread
*
* If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread.
@@ -142,6 +151,7 @@ static struct damos *damon_reclaim_new_scheme(void)
static int damon_reclaim_apply_parameters(void)
{
struct damos *scheme;
+ struct damos_filter *filter;
int err = 0;
err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs);
@@ -152,6 +162,15 @@ static int damon_reclaim_apply_parameters(void)
scheme = damon_reclaim_new_scheme();
if (!scheme)
return -ENOMEM;
+ if (skip_anon) {
+ filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
+ if (!filter) {
+ /* Will be freed by next 'damon_set_schemes()' below */
+ damon_destroy_scheme(scheme);
+ return -ENOMEM;
+ }
+ damos_add_filter(scheme, filter);
+ }
damon_set_schemes(ctx, &scheme, 1);
return damon_set_region_biggest_system_ram_default(target,
diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c
index 52bebf242f74..70edf45c2174 100644
--- a/mm/damon/sysfs-common.c
+++ b/mm/damon/sysfs-common.c
@@ -99,7 +99,7 @@ static struct attribute *damon_sysfs_ul_range_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_ul_range);
-struct kobj_type damon_sysfs_ul_range_ktype = {
+const struct kobj_type damon_sysfs_ul_range_ktype = {
.release = damon_sysfs_ul_range_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_ul_range_groups,
diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h
index 604a6cbc3ede..db677eba78fd 100644
--- a/mm/damon/sysfs-common.h
+++ b/mm/damon/sysfs-common.h
@@ -21,7 +21,7 @@ struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc(
unsigned long max);
void damon_sysfs_ul_range_release(struct kobject *kobj);
-extern struct kobj_type damon_sysfs_ul_range_ktype;
+extern const struct kobj_type damon_sysfs_ul_range_ktype;
/*
* schemes directory
@@ -36,7 +36,7 @@ struct damon_sysfs_schemes {
struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void);
void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes);
-extern struct kobj_type damon_sysfs_schemes_ktype;
+extern const struct kobj_type damon_sysfs_schemes_ktype;
int damon_sysfs_set_schemes(struct damon_ctx *ctx,
struct damon_sysfs_schemes *sysfs_schemes);
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 81fc4d27f4e4..3cdad5a7f936 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -103,7 +103,7 @@ static struct attribute *damon_sysfs_scheme_region_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_scheme_region);
-static struct kobj_type damon_sysfs_scheme_region_ktype = {
+static const struct kobj_type damon_sysfs_scheme_region_ktype = {
.release = damon_sysfs_scheme_region_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_scheme_region_groups,
@@ -153,7 +153,7 @@ static struct attribute *damon_sysfs_scheme_regions_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_scheme_regions);
-static struct kobj_type damon_sysfs_scheme_regions_ktype = {
+static const struct kobj_type damon_sysfs_scheme_regions_ktype = {
.release = damon_sysfs_scheme_regions_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_scheme_regions_groups,
@@ -252,13 +252,264 @@ static struct attribute *damon_sysfs_stats_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_stats);
-static struct kobj_type damon_sysfs_stats_ktype = {
+static const struct kobj_type damon_sysfs_stats_ktype = {
.release = damon_sysfs_stats_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_stats_groups,
};
/*
+ * filter directory
+ */
+
+struct damon_sysfs_scheme_filter {
+ struct kobject kobj;
+ enum damos_filter_type type;
+ bool matching;
+ char *memcg_path;
+};
+
+static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_scheme_filter), GFP_KERNEL);
+}
+
+/* Should match with enum damos_filter_type */
+static const char * const damon_sysfs_scheme_filter_type_strs[] = {
+ "anon",
+ "memcg",
+};
+
+static ssize_t type_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%s\n",
+ damon_sysfs_scheme_filter_type_strs[filter->type]);
+}
+
+static ssize_t type_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ enum damos_filter_type type;
+ ssize_t ret = -EINVAL;
+
+ for (type = 0; type < NR_DAMOS_FILTER_TYPES; type++) {
+ if (sysfs_streq(buf, damon_sysfs_scheme_filter_type_strs[
+ type])) {
+ filter->type = type;
+ ret = count;
+ break;
+ }
+ }
+ return ret;
+}
+
+static ssize_t matching_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%c\n", filter->matching ? 'Y' : 'N');
+}
+
+static ssize_t matching_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ bool matching;
+ int err = kstrtobool(buf, &matching);
+
+ if (err)
+ return err;
+
+ filter->matching = matching;
+ return count;
+}
+
+static ssize_t memcg_path_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%s\n",
+ filter->memcg_path ? filter->memcg_path : "");
+}
+
+static ssize_t memcg_path_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ char *path = kmalloc(sizeof(*path) * (count + 1), GFP_KERNEL);
+
+ if (!path)
+ return -ENOMEM;
+
+ strscpy(path, buf, count + 1);
+ filter->memcg_path = path;
+ return count;
+}
+
+static void damon_sysfs_scheme_filter_release(struct kobject *kobj)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ kfree(filter->memcg_path);
+ kfree(filter);
+}
+
+static struct kobj_attribute damon_sysfs_scheme_filter_type_attr =
+ __ATTR_RW_MODE(type, 0600);
+
+static struct kobj_attribute damon_sysfs_scheme_filter_matching_attr =
+ __ATTR_RW_MODE(matching, 0600);
+
+static struct kobj_attribute damon_sysfs_scheme_filter_memcg_path_attr =
+ __ATTR_RW_MODE(memcg_path, 0600);
+
+static struct attribute *damon_sysfs_scheme_filter_attrs[] = {
+ &damon_sysfs_scheme_filter_type_attr.attr,
+ &damon_sysfs_scheme_filter_matching_attr.attr,
+ &damon_sysfs_scheme_filter_memcg_path_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_scheme_filter);
+
+static struct kobj_type damon_sysfs_scheme_filter_ktype = {
+ .release = damon_sysfs_scheme_filter_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_scheme_filter_groups,
+};
+
+/*
+ * filters directory
+ */
+
+struct damon_sysfs_scheme_filters {
+ struct kobject kobj;
+ struct damon_sysfs_scheme_filter **filters_arr;
+ int nr;
+};
+
+static struct damon_sysfs_scheme_filters *
+damon_sysfs_scheme_filters_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL);
+}
+
+static void damon_sysfs_scheme_filters_rm_dirs(
+ struct damon_sysfs_scheme_filters *filters)
+{
+ struct damon_sysfs_scheme_filter **filters_arr = filters->filters_arr;
+ int i;
+
+ for (i = 0; i < filters->nr; i++)
+ kobject_put(&filters_arr[i]->kobj);
+ filters->nr = 0;
+ kfree(filters_arr);
+ filters->filters_arr = NULL;
+}
+
+static int damon_sysfs_scheme_filters_add_dirs(
+ struct damon_sysfs_scheme_filters *filters, int nr_filters)
+{
+ struct damon_sysfs_scheme_filter **filters_arr, *filter;
+ int err, i;
+
+ damon_sysfs_scheme_filters_rm_dirs(filters);
+ if (!nr_filters)
+ return 0;
+
+ filters_arr = kmalloc_array(nr_filters, sizeof(*filters_arr),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!filters_arr)
+ return -ENOMEM;
+ filters->filters_arr = filters_arr;
+
+ for (i = 0; i < nr_filters; i++) {
+ filter = damon_sysfs_scheme_filter_alloc();
+ if (!filter) {
+ damon_sysfs_scheme_filters_rm_dirs(filters);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&filter->kobj,
+ &damon_sysfs_scheme_filter_ktype,
+ &filters->kobj, "%d", i);
+ if (err) {
+ kobject_put(&filter->kobj);
+ damon_sysfs_scheme_filters_rm_dirs(filters);
+ return err;
+ }
+
+ filters_arr[i] = filter;
+ filters->nr++;
+ }
+ return 0;
+}
+
+static ssize_t nr_filters_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filters *filters = container_of(kobj,
+ struct damon_sysfs_scheme_filters, kobj);
+
+ return sysfs_emit(buf, "%d\n", filters->nr);
+}
+
+static ssize_t nr_filters_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filters *filters;
+ int nr, err = kstrtoint(buf, 0, &nr);
+
+ if (err)
+ return err;
+ if (nr < 0)
+ return -EINVAL;
+
+ filters = container_of(kobj, struct damon_sysfs_scheme_filters, kobj);
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damon_sysfs_scheme_filters_add_dirs(filters, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static void damon_sysfs_scheme_filters_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_scheme_filters, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_scheme_filters_nr_attr =
+ __ATTR_RW_MODE(nr_filters, 0600);
+
+static struct attribute *damon_sysfs_scheme_filters_attrs[] = {
+ &damon_sysfs_scheme_filters_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_scheme_filters);
+
+static struct kobj_type damon_sysfs_scheme_filters_ktype = {
+ .release = damon_sysfs_scheme_filters_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_scheme_filters_groups,
+};
+
+/*
* watermarks directory
*/
@@ -427,7 +678,7 @@ static struct attribute *damon_sysfs_watermarks_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_watermarks);
-static struct kobj_type damon_sysfs_watermarks_ktype = {
+static const struct kobj_type damon_sysfs_watermarks_ktype = {
.release = damon_sysfs_watermarks_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_watermarks_groups,
@@ -538,7 +789,7 @@ static struct attribute *damon_sysfs_weights_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_weights);
-static struct kobj_type damon_sysfs_weights_ktype = {
+static const struct kobj_type damon_sysfs_weights_ktype = {
.release = damon_sysfs_weights_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_weights_groups,
@@ -669,7 +920,7 @@ static struct attribute *damon_sysfs_quotas_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_quotas);
-static struct kobj_type damon_sysfs_quotas_ktype = {
+static const struct kobj_type damon_sysfs_quotas_ktype = {
.release = damon_sysfs_quotas_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_quotas_groups,
@@ -768,7 +1019,7 @@ static struct attribute *damon_sysfs_access_pattern_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_access_pattern);
-static struct kobj_type damon_sysfs_access_pattern_ktype = {
+static const struct kobj_type damon_sysfs_access_pattern_ktype = {
.release = damon_sysfs_access_pattern_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_access_pattern_groups,
@@ -784,6 +1035,7 @@ struct damon_sysfs_scheme {
struct damon_sysfs_access_pattern *access_pattern;
struct damon_sysfs_quotas *quotas;
struct damon_sysfs_watermarks *watermarks;
+ struct damon_sysfs_scheme_filters *filters;
struct damon_sysfs_stats *stats;
struct damon_sysfs_scheme_regions *tried_regions;
};
@@ -878,6 +1130,24 @@ static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme)
return err;
}
+static int damon_sysfs_scheme_set_filters(struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_scheme_filters *filters =
+ damon_sysfs_scheme_filters_alloc();
+ int err;
+
+ if (!filters)
+ return -ENOMEM;
+ err = kobject_init_and_add(&filters->kobj,
+ &damon_sysfs_scheme_filters_ktype, &scheme->kobj,
+ "filters");
+ if (err)
+ kobject_put(&filters->kobj);
+ else
+ scheme->filters = filters;
+ return err;
+}
+
static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme)
{
struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc();
@@ -926,9 +1196,12 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme)
err = damon_sysfs_scheme_set_watermarks(scheme);
if (err)
goto put_quotas_access_pattern_out;
- err = damon_sysfs_scheme_set_stats(scheme);
+ err = damon_sysfs_scheme_set_filters(scheme);
if (err)
goto put_watermarks_quotas_access_pattern_out;
+ err = damon_sysfs_scheme_set_stats(scheme);
+ if (err)
+ goto put_filters_watermarks_quotas_access_pattern_out;
err = damon_sysfs_scheme_set_tried_regions(scheme);
if (err)
goto put_tried_regions_out;
@@ -937,6 +1210,9 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme)
put_tried_regions_out:
kobject_put(&scheme->tried_regions->kobj);
scheme->tried_regions = NULL;
+put_filters_watermarks_quotas_access_pattern_out:
+ kobject_put(&scheme->filters->kobj);
+ scheme->filters = NULL;
put_watermarks_quotas_access_pattern_out:
kobject_put(&scheme->watermarks->kobj);
scheme->watermarks = NULL;
@@ -956,6 +1232,8 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme)
damon_sysfs_quotas_rm_dirs(scheme->quotas);
kobject_put(&scheme->quotas->kobj);
kobject_put(&scheme->watermarks->kobj);
+ damon_sysfs_scheme_filters_rm_dirs(scheme->filters);
+ kobject_put(&scheme->filters->kobj);
kobject_put(&scheme->stats->kobj);
damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions);
kobject_put(&scheme->tried_regions->kobj);
@@ -1001,7 +1279,7 @@ static struct attribute *damon_sysfs_scheme_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_scheme);
-static struct kobj_type damon_sysfs_scheme_ktype = {
+static const struct kobj_type damon_sysfs_scheme_ktype = {
.release = damon_sysfs_scheme_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_scheme_groups,
@@ -1118,12 +1396,85 @@ static struct attribute *damon_sysfs_schemes_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_schemes);
-struct kobj_type damon_sysfs_schemes_ktype = {
+const struct kobj_type damon_sysfs_schemes_ktype = {
.release = damon_sysfs_schemes_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_schemes_groups,
};
+static bool damon_sysfs_memcg_path_eq(struct mem_cgroup *memcg,
+ char *memcg_path_buf, char *path)
+{
+#ifdef CONFIG_MEMCG
+ cgroup_path(memcg->css.cgroup, memcg_path_buf, PATH_MAX);
+ if (sysfs_streq(memcg_path_buf, path))
+ return true;
+#endif /* CONFIG_MEMCG */
+ return false;
+}
+
+static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id)
+{
+ struct mem_cgroup *memcg;
+ char *path;
+ bool found = false;
+
+ if (!memcg_path)
+ return -EINVAL;
+
+ path = kmalloc(sizeof(*path) * PATH_MAX, GFP_KERNEL);
+ if (!path)
+ return -ENOMEM;
+
+ for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg;
+ memcg = mem_cgroup_iter(NULL, memcg, NULL)) {
+ /* skip removed memcg */
+ if (!mem_cgroup_id(memcg))
+ continue;
+ if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) {
+ *id = mem_cgroup_id(memcg);
+ found = true;
+ break;
+ }
+ }
+
+ kfree(path);
+ return found ? 0 : -EINVAL;
+}
+
+static int damon_sysfs_set_scheme_filters(struct damos *scheme,
+ struct damon_sysfs_scheme_filters *sysfs_filters)
+{
+ int i;
+ struct damos_filter *filter, *next;
+
+ damos_for_each_filter_safe(filter, next, scheme)
+ damos_destroy_filter(filter);
+
+ for (i = 0; i < sysfs_filters->nr; i++) {
+ struct damon_sysfs_scheme_filter *sysfs_filter =
+ sysfs_filters->filters_arr[i];
+ struct damos_filter *filter =
+ damos_new_filter(sysfs_filter->type,
+ sysfs_filter->matching);
+ int err;
+
+ if (!filter)
+ return -ENOMEM;
+ if (filter->type == DAMOS_FILTER_TYPE_MEMCG) {
+ err = damon_sysfs_memcg_path_to_id(
+ sysfs_filter->memcg_path,
+ &filter->memcg_id);
+ if (err) {
+ damos_destroy_filter(filter);
+ return err;
+ }
+ }
+ damos_add_filter(scheme, filter);
+ }
+ return 0;
+}
+
static struct damos *damon_sysfs_mk_scheme(
struct damon_sysfs_scheme *sysfs_scheme)
{
@@ -1132,6 +1483,10 @@ static struct damos *damon_sysfs_mk_scheme(
struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
+ struct damon_sysfs_scheme_filters *sysfs_filters =
+ sysfs_scheme->filters;
+ struct damos *scheme;
+ int err;
struct damos_access_pattern pattern = {
.min_sz_region = access_pattern->sz->min,
@@ -1157,8 +1512,17 @@ static struct damos *damon_sysfs_mk_scheme(
.low = sysfs_wmarks->low,
};
- return damon_new_scheme(&pattern, sysfs_scheme->action, &quota,
+ scheme = damon_new_scheme(&pattern, sysfs_scheme->action, &quota,
&wmarks);
+ if (!scheme)
+ return NULL;
+
+ err = damon_sysfs_set_scheme_filters(scheme, sysfs_filters);
+ if (err) {
+ damon_destroy_scheme(scheme);
+ return NULL;
+ }
+ return scheme;
}
static void damon_sysfs_update_scheme(struct damos *scheme,
@@ -1169,6 +1533,7 @@ static void damon_sysfs_update_scheme(struct damos *scheme,
struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
+ int err;
scheme->pattern.min_sz_region = access_pattern->sz->min;
scheme->pattern.max_sz_region = access_pattern->sz->max;
@@ -1191,6 +1556,10 @@ static void damon_sysfs_update_scheme(struct damos *scheme,
scheme->wmarks.high = sysfs_wmarks->high;
scheme->wmarks.mid = sysfs_wmarks->mid;
scheme->wmarks.low = sysfs_wmarks->low;
+
+ err = damon_sysfs_set_scheme_filters(scheme, sysfs_scheme->filters);
+ if (err)
+ damon_destroy_scheme(scheme);
}
int damon_sysfs_set_schemes(struct damon_ctx *ctx,
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index aeb0beb1da91..33e1d5c9cb54 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -81,7 +81,7 @@ static struct attribute *damon_sysfs_region_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_region);
-static struct kobj_type damon_sysfs_region_ktype = {
+static const struct kobj_type damon_sysfs_region_ktype = {
.release = damon_sysfs_region_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_region_groups,
@@ -198,7 +198,7 @@ static struct attribute *damon_sysfs_regions_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_regions);
-static struct kobj_type damon_sysfs_regions_ktype = {
+static const struct kobj_type damon_sysfs_regions_ktype = {
.release = damon_sysfs_regions_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_regions_groups,
@@ -277,7 +277,7 @@ static struct attribute *damon_sysfs_target_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_target);
-static struct kobj_type damon_sysfs_target_ktype = {
+static const struct kobj_type damon_sysfs_target_ktype = {
.release = damon_sysfs_target_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_target_groups,
@@ -402,7 +402,7 @@ static struct attribute *damon_sysfs_targets_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_targets);
-static struct kobj_type damon_sysfs_targets_ktype = {
+static const struct kobj_type damon_sysfs_targets_ktype = {
.release = damon_sysfs_targets_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_targets_groups,
@@ -530,7 +530,7 @@ static struct attribute *damon_sysfs_intervals_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_intervals);
-static struct kobj_type damon_sysfs_intervals_ktype = {
+static const struct kobj_type damon_sysfs_intervals_ktype = {
.release = damon_sysfs_intervals_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_intervals_groups,
@@ -612,7 +612,7 @@ static struct attribute *damon_sysfs_attrs_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_attrs);
-static struct kobj_type damon_sysfs_attrs_ktype = {
+static const struct kobj_type damon_sysfs_attrs_ktype = {
.release = damon_sysfs_attrs_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_attrs_groups,
@@ -800,7 +800,7 @@ static struct attribute *damon_sysfs_context_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_context);
-static struct kobj_type damon_sysfs_context_ktype = {
+static const struct kobj_type damon_sysfs_context_ktype = {
.release = damon_sysfs_context_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_context_groups,
@@ -926,7 +926,7 @@ static struct attribute *damon_sysfs_contexts_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_contexts);
-static struct kobj_type damon_sysfs_contexts_ktype = {
+static const struct kobj_type damon_sysfs_contexts_ktype = {
.release = damon_sysfs_contexts_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_contexts_groups,
@@ -1564,7 +1564,7 @@ static struct attribute *damon_sysfs_kdamond_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_kdamond);
-static struct kobj_type damon_sysfs_kdamond_ktype = {
+static const struct kobj_type damon_sysfs_kdamond_ktype = {
.release = damon_sysfs_kdamond_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_kdamond_groups,
@@ -1707,7 +1707,7 @@ static struct attribute *damon_sysfs_kdamonds_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_kdamonds);
-static struct kobj_type damon_sysfs_kdamonds_ktype = {
+static const struct kobj_type damon_sysfs_kdamonds_ktype = {
.release = damon_sysfs_kdamonds_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_kdamonds_groups,
@@ -1757,7 +1757,7 @@ static struct attribute *damon_sysfs_ui_dir_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_ui_dir);
-static struct kobj_type damon_sysfs_ui_dir_ktype = {
+static const struct kobj_type damon_sysfs_ui_dir_ktype = {
.release = damon_sysfs_ui_dir_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_ui_dir_groups,
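Every ktype constified above follows the same shape: the structure is only ever handed to the kobject core, which never modifies it, so it can live in read-only data. A minimal sketch of the resulting pattern, using hypothetical names (example_obj, example_release and example_groups are placeholders, not symbols from this series):

/* Hypothetical object embedding a kobject, released through its ktype. */
struct example_obj {
	struct kobject kobj;
};

static void example_release(struct kobject *kobj)
{
	kfree(container_of(kobj, struct example_obj, kobj));
}

static const struct kobj_type example_ktype = {
	.release	= example_release,
	.sysfs_ops	= &kobj_sysfs_ops,
	.default_groups	= example_groups,	/* hypothetical attribute groups */
};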
diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h
index bce37c487540..c4b455b5ee30 100644
--- a/mm/damon/vaddr-test.h
+++ b/mm/damon/vaddr-test.h
@@ -14,19 +14,26 @@
#include <kunit/test.h>
-static void __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas,
+static int __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas,
ssize_t nr_vmas)
{
- int i;
+ int i, ret = -ENOMEM;
MA_STATE(mas, mt, 0, 0);
if (!nr_vmas)
- return;
+ return 0;
mas_lock(&mas);
- for (i = 0; i < nr_vmas; i++)
- vma_mas_store(&vmas[i], &mas);
+ for (i = 0; i < nr_vmas; i++) {
+ mas_set_range(&mas, vmas[i].vm_start, vmas[i].vm_end - 1);
+ if (mas_store_gfp(&mas, &vmas[i], GFP_KERNEL))
+ goto failed;
+ }
+
+ ret = 0;
+failed:
mas_unlock(&mas);
+ return ret;
}
/*
@@ -71,7 +78,8 @@ static void damon_test_three_regions_in_vmas(struct kunit *test)
};
mt_init_flags(&mm.mm_mt, MM_MT_FLAGS);
- __link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas));
+ if (__link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas)))
+ kunit_skip(test, "Failed to create VMA tree");
__damon_va_three_regions(&mm, regions);
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 15f03df66db6..1fec16d7263e 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -335,9 +335,9 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
{
bool referenced = false;
pte_t entry = huge_ptep_get(pte);
- struct page *page = pte_page(entry);
+ struct folio *folio = pfn_folio(pte_pfn(entry));
- get_page(page);
+ folio_get(folio);
if (pte_young(entry)) {
referenced = true;
@@ -352,10 +352,10 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
#endif /* CONFIG_MMU_NOTIFIER */
if (referenced)
- set_page_young(page);
+ folio_set_young(folio);
- set_page_idle(page);
- put_page(page);
+ folio_set_idle(folio);
+ folio_put(folio);
}
static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask,
@@ -422,7 +422,8 @@ static void damon_va_prepare_access_checks(struct damon_ctx *ctx)
}
struct damon_young_walk_private {
- unsigned long *page_sz;
+ /* size of the folio for the access-checked virtual memory address */
+ unsigned long *folio_sz;
bool young;
};
@@ -431,7 +432,7 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
{
pte_t *pte;
spinlock_t *ptl;
- struct page *page;
+ struct folio *folio;
struct damon_young_walk_private *priv = walk->private;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -446,16 +447,15 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
spin_unlock(ptl);
goto regular_page;
}
- page = damon_get_page(pmd_pfn(*pmd));
- if (!page)
+ folio = damon_get_folio(pmd_pfn(*pmd));
+ if (!folio)
goto huge_out;
- if (pmd_young(*pmd) || !page_is_idle(page) ||
+ if (pmd_young(*pmd) || !folio_test_idle(folio) ||
mmu_notifier_test_young(walk->mm,
- addr)) {
- *priv->page_sz = HPAGE_PMD_SIZE;
+ addr))
priv->young = true;
- }
- put_page(page);
+ *priv->folio_sz = HPAGE_PMD_SIZE;
+ folio_put(folio);
huge_out:
spin_unlock(ptl);
return 0;
@@ -469,15 +469,14 @@ regular_page:
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
if (!pte_present(*pte))
goto out;
- page = damon_get_page(pte_pfn(*pte));
- if (!page)
+ folio = damon_get_folio(pte_pfn(*pte));
+ if (!folio)
goto out;
- if (pte_young(*pte) || !page_is_idle(page) ||
- mmu_notifier_test_young(walk->mm, addr)) {
- *priv->page_sz = PAGE_SIZE;
+ if (pte_young(*pte) || !folio_test_idle(folio) ||
+ mmu_notifier_test_young(walk->mm, addr))
priv->young = true;
- }
- put_page(page);
+ *priv->folio_sz = folio_size(folio);
+ folio_put(folio);
out:
pte_unmap_unlock(pte, ptl);
return 0;
@@ -490,7 +489,7 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask,
{
struct damon_young_walk_private *priv = walk->private;
struct hstate *h = hstate_vma(walk->vma);
- struct page *page;
+ struct folio *folio;
spinlock_t *ptl;
pte_t entry;
@@ -499,16 +498,15 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask,
if (!pte_present(entry))
goto out;
- page = pte_page(entry);
- get_page(page);
+ folio = pfn_folio(pte_pfn(entry));
+ folio_get(folio);
- if (pte_young(entry) || !page_is_idle(page) ||
- mmu_notifier_test_young(walk->mm, addr)) {
- *priv->page_sz = huge_page_size(h);
+ if (pte_young(entry) || !folio_test_idle(folio) ||
+ mmu_notifier_test_young(walk->mm, addr))
priv->young = true;
- }
+ *priv->folio_sz = huge_page_size(h);
- put_page(page);
+ folio_put(folio);
out:
spin_unlock(ptl);
@@ -524,10 +522,10 @@ static const struct mm_walk_ops damon_young_ops = {
};
static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
- unsigned long *page_sz)
+ unsigned long *folio_sz)
{
struct damon_young_walk_private arg = {
- .page_sz = page_sz,
+ .folio_sz = folio_sz,
.young = false,
};
@@ -547,18 +545,18 @@ static void __damon_va_check_access(struct mm_struct *mm,
struct damon_region *r, bool same_target)
{
static unsigned long last_addr;
- static unsigned long last_page_sz = PAGE_SIZE;
+ static unsigned long last_folio_sz = PAGE_SIZE;
static bool last_accessed;
/* If the region is in the last checked page, reuse the result */
- if (same_target && (ALIGN_DOWN(last_addr, last_page_sz) ==
- ALIGN_DOWN(r->sampling_addr, last_page_sz))) {
+ if (same_target && (ALIGN_DOWN(last_addr, last_folio_sz) ==
+ ALIGN_DOWN(r->sampling_addr, last_folio_sz))) {
if (last_accessed)
r->nr_accesses++;
return;
}
- last_accessed = damon_va_young(mm, r->sampling_addr, &last_page_sz);
+ last_accessed = damon_va_young(mm, r->sampling_addr, &last_folio_sz);
if (last_accessed)
r->nr_accesses++;
diff --git a/mm/debug.c b/mm/debug.c
index 7f8e5f744e42..96d594e16292 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -94,11 +94,11 @@ static void __dump_page(struct page *page)
page, page_ref_count(head), mapcount, mapping,
page_to_pgoff(page), page_to_pfn(page));
if (compound) {
- pr_warn("head:%p order:%u compound_mapcount:%d subpages_mapcount:%d compound_pincount:%d\n",
+ pr_warn("head:%p order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n",
head, compound_order(head),
- head_compound_mapcount(head),
- head_subpages_mapcount(head),
- head_compound_pincount(head));
+ folio_entire_mapcount(folio),
+ folio_nr_pages_mapped(folio),
+ atomic_read(&folio->_pincount));
}
#ifdef CONFIG_MEMCG
@@ -215,6 +215,7 @@ void dump_mm(const struct mm_struct *mm)
mm->def_flags, &mm->def_flags
);
}
+EXPORT_SYMBOL(dump_mm);
static bool page_init_poisoning __read_mostly = true;
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index c631ade3f1d2..af59cc7bd307 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -15,6 +15,7 @@
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/kconfig.h>
+#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/mm_types.h>
@@ -80,6 +81,7 @@ struct pgtable_debug_args {
unsigned long pmd_pfn;
unsigned long pte_pfn;
+ unsigned long fixed_alignment;
unsigned long fixed_pgd_pfn;
unsigned long fixed_p4d_pfn;
unsigned long fixed_pud_pfn;
@@ -430,7 +432,8 @@ static void __init pmd_huge_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
- if (!arch_vmap_pmd_supported(args->page_prot))
+ if (!arch_vmap_pmd_supported(args->page_prot) ||
+ args->fixed_alignment < PMD_SIZE)
return;
pr_debug("Validating PMD huge\n");
@@ -449,7 +452,8 @@ static void __init pud_huge_tests(struct pgtable_debug_args *args)
{
pud_t pud;
- if (!arch_vmap_pud_supported(args->page_prot))
+ if (!arch_vmap_pud_supported(args->page_prot) ||
+ args->fixed_alignment < PUD_SIZE)
return;
pr_debug("Validating PUD huge\n");
@@ -806,15 +810,36 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) {
static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args)
{
-#ifdef __HAVE_ARCH_PTE_SWP_EXCLUSIVE
- pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
+ unsigned long max_swap_offset;
+ swp_entry_t entry, entry2;
+ pte_t pte;
pr_debug("Validating PTE swap exclusive\n");
+
+ /* See generic_max_swapfile_size(): probe the maximum offset */
+ max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL))));
+
+ /* Create a swp entry with all possible bits set */
+ entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset);
+
+ pte = swp_entry_to_pte(entry);
+ WARN_ON(pte_swp_exclusive(pte));
+ WARN_ON(!is_swap_pte(pte));
+ entry2 = pte_to_swp_entry(pte);
+ WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
+
pte = pte_swp_mkexclusive(pte);
WARN_ON(!pte_swp_exclusive(pte));
+ WARN_ON(!is_swap_pte(pte));
+ WARN_ON(pte_swp_soft_dirty(pte));
+ entry2 = pte_to_swp_entry(pte);
+ WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
+
pte = pte_swp_clear_exclusive(pte);
WARN_ON(pte_swp_exclusive(pte));
-#endif /* __HAVE_ARCH_PTE_SWP_EXCLUSIVE */
+ WARN_ON(!is_swap_pte(pte));
+ entry2 = pte_to_swp_entry(pte);
+ WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
}
static void __init pte_swap_tests(struct pgtable_debug_args *args)
@@ -1077,10 +1102,85 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order)
return page;
}
+/*
+ * Check if a physical memory range described by <pstart, pend> contains
+ * an area that is of size psize, and aligned to psize.
+ *
+ * Don't use address 0: an all-zeroes physical address might mask bugs, and
+ * it's not used on x86.
+ */
+static void __init phys_align_check(phys_addr_t pstart,
+ phys_addr_t pend, unsigned long psize,
+ phys_addr_t *physp, unsigned long *alignp)
+{
+ phys_addr_t aligned_start, aligned_end;
+
+ if (pstart == 0)
+ pstart = PAGE_SIZE;
+
+ aligned_start = ALIGN(pstart, psize);
+ aligned_end = aligned_start + psize;
+
+ if (aligned_end > aligned_start && aligned_end <= pend) {
+ *alignp = psize;
+ *physp = aligned_start;
+ }
+}
+
+static void __init init_fixed_pfns(struct pgtable_debug_args *args)
+{
+ u64 idx;
+ phys_addr_t phys, pstart, pend;
+
+ /*
+ * Initialize the fixed pfns. To do this, try to find a
+ * valid physical range, preferably aligned to PUD_SIZE,
+ * but settling for aligned to PMD_SIZE as a fallback. If
+ * neither of those is found, use the physical address of
+ * the start_kernel symbol.
+ *
+ * The memory doesn't need to be allocated, it just needs to exist
+ * as usable memory. It won't be touched.
+ *
+ * The alignment is recorded, and can be checked to see if we
+ * can run the tests that require an actual valid physical
+ * address range on some architectures ({pmd,pud}_huge_test
+ * on x86).
+ */
+
+ phys = __pa_symbol(&start_kernel);
+ args->fixed_alignment = PAGE_SIZE;
+
+ for_each_mem_range(idx, &pstart, &pend) {
+ /* First check for a PUD-aligned area */
+ phys_align_check(pstart, pend, PUD_SIZE, &phys,
+ &args->fixed_alignment);
+
+ /* If a PUD-aligned area is found, we're done */
+ if (args->fixed_alignment == PUD_SIZE)
+ break;
+
+ /*
+ * If no PMD-aligned area has been found yet, check for one,
+ * but continue the loop to look for a PUD-aligned area.
+ */
+ if (args->fixed_alignment < PMD_SIZE)
+ phys_align_check(pstart, pend, PMD_SIZE, &phys,
+ &args->fixed_alignment);
+ }
+
+ args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
+ args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
+ args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
+ args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
+ args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
+ WARN_ON(!pfn_valid(args->fixed_pte_pfn));
+}
+
+
static int __init init_args(struct pgtable_debug_args *args)
{
struct page *page = NULL;
- phys_addr_t phys;
int ret = 0;
/*
@@ -1160,22 +1260,7 @@ static int __init init_args(struct pgtable_debug_args *args)
args->start_ptep = pmd_pgtable(READ_ONCE(*args->pmdp));
WARN_ON(!args->start_ptep);
- /*
- * PFN for mapping at PTE level is determined from a standard kernel
- * text symbol. But pfns for higher page table levels are derived by
- * masking lower bits of this real pfn. These derived pfns might not
- * exist on the platform but that does not really matter as pfn_pxx()
- * helpers will still create appropriate entries for the test. This
- * helps avoid large memory block allocations to be used for mapping
- * at higher page table levels in some of the tests.
- */
- phys = __pa_symbol(&start_kernel);
- args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
- args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
- args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
- args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
- args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
- WARN_ON(!pfn_valid(args->fixed_pte_pfn));
+ init_fixed_pfns(args);
/*
* Allocate (huge) pages because some of the tests need to access
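To make the probe in init_fixed_pfns() concrete, here is a worked example with made-up numbers, assuming the common x86-64 sizes of PUD_SIZE = 1 GiB and PMD_SIZE = 2 MiB; only phys_align_check() itself comes from the hunk above:

	phys_addr_t pstart = 0x40200000, pend = 0x80000000, phys = 0;
	unsigned long align = PAGE_SIZE;

	/* ALIGN(0x40200000, 1G) = 0x80000000, and 0x80000000 + 1G exceeds
	 * pend, so the PUD-sized probe leaves phys and align untouched. */
	phys_align_check(pstart, pend, PUD_SIZE, &phys, &align);

	/* ALIGN(0x40200000, 2M) = 0x40200000, and 0x40200000 + 2M fits below
	 * pend, so phys becomes 0x40200000 and align becomes PMD_SIZE. */
	phys_align_check(pstart, pend, PMD_SIZE, &phys, &align);

Because only PMD alignment is achieved in this example, pud_huge_tests() above returns early while pmd_huge_tests() still runs.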
diff --git a/mm/fadvise.c b/mm/fadvise.c
index bf04fec87f35..fb7c5f43fd2a 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -80,7 +80,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
case POSIX_FADV_NORMAL:
file->f_ra.ra_pages = bdi->ra_pages;
spin_lock(&file->f_lock);
- file->f_mode &= ~FMODE_RANDOM;
+ file->f_mode &= ~(FMODE_RANDOM | FMODE_NOREUSE);
spin_unlock(&file->f_lock);
break;
case POSIX_FADV_RANDOM:
@@ -107,6 +107,9 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
force_page_cache_readahead(mapping, file, start_index, nrpages);
break;
case POSIX_FADV_NOREUSE:
+ spin_lock(&file->f_lock);
+ file->f_mode |= FMODE_NOREUSE;
+ spin_unlock(&file->f_lock);
break;
case POSIX_FADV_DONTNEED:
__filemap_fdatawrite_range(mapping, offset, endbyte,
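With FMODE_NOREUSE now actually being set, POSIX_FADV_NOREUSE stops being a no-op, and POSIX_FADV_NORMAL clears the mode again. A minimal userspace sketch of issuing the hint; the path is hypothetical and error handling is abbreviated:

#include <fcntl.h>
#include <stdio.h>

int main(void)
{
	int fd = open("/var/log/scan-once.log", O_RDONLY);	/* hypothetical file */
	int err;

	if (fd < 0)
		return 1;

	/* With the hunk above, this sets FMODE_NOREUSE on the struct file. */
	err = posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);
	if (err)
		fprintf(stderr, "posix_fadvise: %d\n", err);

	/* ... stream through the file once ... */
	return 0;
}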
diff --git a/mm/filemap.c b/mm/filemap.c
index c4d4ace9cc70..2723104cc06a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,6 +42,8 @@
#include <linux/ramfs.h>
#include <linux/page_idle.h>
#include <linux/migrate.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/splice.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -97,7 +99,7 @@
* ->i_pages lock (__sync_single_inode)
*
* ->i_mmap_rwsem
- * ->anon_vma.lock (vma_adjust)
+ * ->anon_vma.lock (vma_merge)
*
* ->anon_vma.lock
* ->page_table_lock or pte_lock (anon_vma_prepare and various)
@@ -470,7 +472,7 @@ EXPORT_SYMBOL(filemap_flush);
bool filemap_range_has_page(struct address_space *mapping,
loff_t start_byte, loff_t end_byte)
{
- struct page *page;
+ struct folio *folio;
XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
pgoff_t max = end_byte >> PAGE_SHIFT;
@@ -479,11 +481,11 @@ bool filemap_range_has_page(struct address_space *mapping,
rcu_read_lock();
for (;;) {
- page = xas_find(&xas, max);
- if (xas_retry(&xas, page))
+ folio = xas_find(&xas, max);
+ if (xas_retry(&xas, folio))
continue;
/* Shadow entries don't count */
- if (xa_is_value(page))
+ if (xa_is_value(folio))
continue;
/*
* We don't need to try to pin this page; we're about to
@@ -494,7 +496,7 @@ bool filemap_range_has_page(struct address_space *mapping,
}
rcu_read_unlock();
- return page != NULL;
+ return folio != NULL;
}
EXPORT_SYMBOL(filemap_range_has_page);
@@ -503,25 +505,27 @@ static void __filemap_fdatawait_range(struct address_space *mapping,
{
pgoff_t index = start_byte >> PAGE_SHIFT;
pgoff_t end = end_byte >> PAGE_SHIFT;
- struct pagevec pvec;
- int nr_pages;
+ struct folio_batch fbatch;
+ unsigned nr_folios;
+
+ folio_batch_init(&fbatch);
- pagevec_init(&pvec);
while (index <= end) {
unsigned i;
- nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
- end, PAGECACHE_TAG_WRITEBACK);
- if (!nr_pages)
+ nr_folios = filemap_get_folios_tag(mapping, &index, end,
+ PAGECACHE_TAG_WRITEBACK, &fbatch);
+
+ if (!nr_folios)
break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < nr_folios; i++) {
+ struct folio *folio = fbatch.folios[i];
- wait_on_page_writeback(page);
- ClearPageError(page);
+ folio_wait_writeback(folio);
+ folio_clear_error(folio);
}
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
cond_resched();
}
}
@@ -2282,64 +2286,58 @@ out:
EXPORT_SYMBOL(filemap_get_folios_contig);
/**
- * find_get_pages_range_tag - Find and return head pages matching @tag.
- * @mapping: the address_space to search
- * @index: the starting page index
- * @end: The final page index (inclusive)
- * @tag: the tag index
- * @nr_pages: the maximum number of pages
- * @pages: where the resulting pages are placed
+ * filemap_get_folios_tag - Get a batch of folios matching @tag
+ * @mapping: The address_space to search
+ * @start: The starting page index
+ * @end: The final page index (inclusive)
+ * @tag: The tag index
+ * @fbatch: The batch to fill
*
- * Like find_get_pages_range(), except we only return head pages which are
- * tagged with @tag. @index is updated to the index immediately after the
- * last page we return, ready for the next iteration.
+ * Same as filemap_get_folios(), but only returning folios tagged with @tag.
*
- * Return: the number of pages which were found.
+ * Return: The number of folios found.
+ * Also updates @start to index the next folio for traversal.
*/
-unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
- pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
- struct page **pages)
+unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch)
{
- XA_STATE(xas, &mapping->i_pages, *index);
+ XA_STATE(xas, &mapping->i_pages, *start);
struct folio *folio;
- unsigned ret = 0;
-
- if (unlikely(!nr_pages))
- return 0;
rcu_read_lock();
- while ((folio = find_get_entry(&xas, end, tag))) {
+ while ((folio = find_get_entry(&xas, end, tag)) != NULL) {
/*
* Shadow entries should never be tagged, but this iteration
* is lockless so there is a window for page reclaim to evict
- * a page we saw tagged. Skip over it.
+ * a page we saw tagged. Skip over it.
*/
if (xa_is_value(folio))
continue;
+ if (!folio_batch_add(fbatch, folio)) {
+ unsigned long nr = folio_nr_pages(folio);
- pages[ret] = &folio->page;
- if (++ret == nr_pages) {
- *index = folio->index + folio_nr_pages(folio);
+ if (folio_test_hugetlb(folio))
+ nr = 1;
+ *start = folio->index + nr;
goto out;
}
}
-
/*
- * We come here when we got to @end. We take care to not overflow the
- * index @index as it confuses some of the callers. This breaks the
- * iteration when there is a page at index -1 but that is already
- * broken anyway.
+ * We come here when there is no page beyond @end. We take care to not
+ * overflow the index @start as it confuses some of the callers. This
+ * breaks the iteration when there is a page at index -1 but that is
+ * already broken anyway.
*/
if (end == (pgoff_t)-1)
- *index = (pgoff_t)-1;
+ *start = (pgoff_t)-1;
else
- *index = end + 1;
+ *start = end + 1;
out:
rcu_read_unlock();
- return ret;
+ return folio_batch_count(fbatch);
}
-EXPORT_SYMBOL(find_get_pages_range_tag);
+EXPORT_SYMBOL(filemap_get_folios_tag);
/*
* CD/DVDs are error prone. When a medium error occurs, the driver may fail
@@ -2440,21 +2438,19 @@ static int filemap_read_folio(struct file *file, filler_t filler,
}
static bool filemap_range_uptodate(struct address_space *mapping,
- loff_t pos, struct iov_iter *iter, struct folio *folio)
+ loff_t pos, size_t count, struct folio *folio,
+ bool need_uptodate)
{
- int count;
-
if (folio_test_uptodate(folio))
return true;
/* pipes can't handle partially uptodate pages */
- if (iov_iter_is_pipe(iter))
+ if (need_uptodate)
return false;
if (!mapping->a_ops->is_partially_uptodate)
return false;
if (mapping->host->i_blkbits >= folio_shift(folio))
return false;
- count = iter->count;
if (folio_pos(folio) > pos) {
count -= folio_pos(folio) - pos;
pos = 0;
@@ -2466,8 +2462,8 @@ static bool filemap_range_uptodate(struct address_space *mapping,
}
static int filemap_update_page(struct kiocb *iocb,
- struct address_space *mapping, struct iov_iter *iter,
- struct folio *folio)
+ struct address_space *mapping, size_t count,
+ struct folio *folio, bool need_uptodate)
{
int error;
@@ -2501,7 +2497,8 @@ static int filemap_update_page(struct kiocb *iocb,
goto unlock;
error = 0;
- if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, folio))
+ if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio,
+ need_uptodate))
goto unlock;
error = -EAGAIN;
@@ -2577,8 +2574,8 @@ static int filemap_readahead(struct kiocb *iocb, struct file *file,
return 0;
}
-static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
- struct folio_batch *fbatch)
+static int filemap_get_pages(struct kiocb *iocb, size_t count,
+ struct folio_batch *fbatch, bool need_uptodate)
{
struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
@@ -2588,18 +2585,19 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
struct folio *folio;
int err = 0;
- last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
+ /* "last_index" is the index of the page beyond the end of the read */
+ last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
retry:
if (fatal_signal_pending(current))
return -EINTR;
- filemap_get_read_batch(mapping, index, last_index, fbatch);
+ filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
if (!folio_batch_count(fbatch)) {
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
page_cache_sync_readahead(mapping, ra, filp, index,
last_index - index);
- filemap_get_read_batch(mapping, index, last_index, fbatch);
+ filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
}
if (!folio_batch_count(fbatch)) {
if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
@@ -2621,7 +2619,8 @@ retry:
if ((iocb->ki_flags & IOCB_WAITQ) &&
folio_batch_count(fbatch) > 1)
iocb->ki_flags |= IOCB_NOWAIT;
- err = filemap_update_page(iocb, mapping, iter, folio);
+ err = filemap_update_page(iocb, mapping, count, folio,
+ need_uptodate);
if (err)
goto err;
}
@@ -2691,7 +2690,8 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
if (unlikely(iocb->ki_pos >= i_size_read(inode)))
break;
- error = filemap_get_pages(iocb, iter, &fbatch);
+ error = filemap_get_pages(iocb, iter->count, &fbatch,
+ iov_iter_is_pipe(iter));
if (error < 0)
break;
@@ -2841,6 +2841,134 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
}
EXPORT_SYMBOL(generic_file_read_iter);
+/*
+ * Splice subpages from a folio into a pipe.
+ */
+size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
+ struct folio *folio, loff_t fpos, size_t size)
+{
+ struct page *page;
+ size_t spliced = 0, offset = offset_in_folio(folio, fpos);
+
+ page = folio_page(folio, offset / PAGE_SIZE);
+ size = min(size, folio_size(folio) - offset);
+ offset %= PAGE_SIZE;
+
+ while (spliced < size &&
+ !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ struct pipe_buffer *buf = pipe_head_buf(pipe);
+ size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);
+
+ *buf = (struct pipe_buffer) {
+ .ops = &page_cache_pipe_buf_ops,
+ .page = page,
+ .offset = offset,
+ .len = part,
+ };
+ folio_get(folio);
+ pipe->head++;
+ page++;
+ spliced += part;
+ offset = 0;
+ }
+
+ return spliced;
+}
+
+/*
+ * Splice folios from the pagecache of a buffered (i.e. non-O_DIRECT) file into
+ * a pipe.
+ */
+ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ struct folio_batch fbatch;
+ struct kiocb iocb;
+ size_t total_spliced = 0, used, npages;
+ loff_t isize, end_offset;
+ bool writably_mapped;
+ int i, error = 0;
+
+ init_sync_kiocb(&iocb, in);
+ iocb.ki_pos = *ppos;
+
+ /* Work out how much data we can actually add into the pipe */
+ used = pipe_occupancy(pipe->head, pipe->tail);
+ npages = max_t(ssize_t, pipe->max_usage - used, 0);
+ len = min_t(size_t, len, npages * PAGE_SIZE);
+
+ folio_batch_init(&fbatch);
+
+ do {
+ cond_resched();
+
+ if (*ppos >= i_size_read(file_inode(in)))
+ break;
+
+ iocb.ki_pos = *ppos;
+ error = filemap_get_pages(&iocb, len, &fbatch, true);
+ if (error < 0)
+ break;
+
+ /*
+ * i_size must be checked after we know the pages are Uptodate.
+ *
+ * Checking i_size after the uptodate check allows us to calculate
+ * the correct value for "nr", which means the zero-filled
+ * part of the page is not copied back to userspace (unless
+ * another truncate extends the file - this is desired though).
+ */
+ isize = i_size_read(file_inode(in));
+ if (unlikely(*ppos >= isize))
+ break;
+ end_offset = min_t(loff_t, isize, *ppos + len);
+
+ /*
+ * Once we start copying data, we don't want to be touching any
+ * cachelines that might be contended:
+ */
+ writably_mapped = mapping_writably_mapped(in->f_mapping);
+
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+ size_t n;
+
+ if (folio_pos(folio) >= end_offset)
+ goto out;
+ folio_mark_accessed(folio);
+
+ /*
+ * If users can be writing to this folio using arbitrary
+ * virtual addresses, take care of potential aliasing
+ * before reading the folio on the kernel side.
+ */
+ if (writably_mapped)
+ flush_dcache_folio(folio);
+
+ n = min_t(loff_t, len, isize - *ppos);
+ n = splice_folio_into_pipe(pipe, folio, *ppos, n);
+ if (!n)
+ goto out;
+ len -= n;
+ total_spliced += n;
+ *ppos += n;
+ in->f_ra.prev_pos = *ppos;
+ if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ goto out;
+ }
+
+ folio_batch_release(&fbatch);
+ } while (len);
+
+out:
+ folio_batch_release(&fbatch);
+ file_accessed(in);
+
+ return total_spliced ? total_spliced : error;
+}
+EXPORT_SYMBOL(filemap_splice_read);
+
static inline loff_t folio_seek_hole_data(struct xa_state *xas,
struct address_space *mapping, struct folio *folio,
loff_t start, loff_t end, bool seek_data)
@@ -3263,22 +3391,24 @@ out_retry:
}
EXPORT_SYMBOL(filemap_fault);
-static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
+static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
+ pgoff_t start)
{
struct mm_struct *mm = vmf->vma->vm_mm;
/* Huge page is mapped? No need to proceed. */
if (pmd_trans_huge(*vmf->pmd)) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return true;
}
- if (pmd_none(*vmf->pmd) && PageTransHuge(page)) {
+ if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {
+ struct page *page = folio_file_page(folio, start);
vm_fault_t ret = do_set_pmd(vmf, page);
if (!ret) {
/* The page is mapped successfully, reference consumed. */
- unlock_page(page);
+ folio_unlock(folio);
return true;
}
}
@@ -3288,8 +3418,8 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
/* See comment in handle_pte_fault() */
if (pmd_devmap_trans_unstable(vmf->pmd)) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return true;
}
@@ -3372,7 +3502,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
if (!folio)
goto out;
- if (filemap_map_pmd(vmf, &folio->page)) {
+ if (filemap_map_pmd(vmf, folio, start_pgoff)) {
ret = VM_FAULT_NOPAGE;
goto out;
}
@@ -3587,6 +3717,30 @@ struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
}
EXPORT_SYMBOL(read_cache_folio);
+/**
+ * mapping_read_folio_gfp - Read into page cache, using specified allocation flags.
+ * @mapping: The address_space for the folio.
+ * @index: The index that the allocated folio will contain.
+ * @gfp: The page allocator flags to use if allocating.
+ *
+ * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with
+ * any new memory allocations done using the specified allocation flags.
+ *
+ * The most likely error from this function is EIO, but ENOMEM is
+ * possible and so is EINTR. If ->read_folio returns another error,
+ * that will be returned to the caller.
+ *
+ * The function expects mapping->invalidate_lock to be already held.
+ *
+ * Return: Uptodate folio on success, ERR_PTR() on failure.
+ */
+struct folio *mapping_read_folio_gfp(struct address_space *mapping,
+ pgoff_t index, gfp_t gfp)
+{
+ return do_read_cache_folio(mapping, index, NULL, NULL, gfp);
+}
+EXPORT_SYMBOL(mapping_read_folio_gfp);
+
static struct page *do_read_cache_page(struct address_space *mapping,
pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp)
{
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 69ed25790c68..cabcd1de9ecb 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -6,6 +6,7 @@
#include <linux/migrate.h>
#include <linux/pagemap.h>
+#include <linux/rmap.h>
#include <linux/swap.h>
#include "internal.h"
@@ -112,10 +113,10 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
-int isolate_lru_page(struct page *page)
+bool isolate_lru_page(struct page *page)
{
if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"))
- return -EBUSY;
+ return false;
return folio_isolate_lru((struct folio *)page);
}
@@ -123,3 +124,13 @@ void putback_lru_page(struct page *page)
{
folio_putback_lru(page_folio(page));
}
+
+#ifdef CONFIG_MMU
+void page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma,
+ unsigned long address)
+{
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ return folio_add_new_anon_rmap((struct folio *)page, vma, address);
+}
+#endif
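isolate_lru_page() now reports success as a bool instead of 0/-EBUSY, so callers test the result directly rather than comparing against an error code. A hedged sketch of the adjusted caller pattern; the function name and list are hypothetical:

static bool example_isolate_for_migration(struct page *page,
					  struct list_head *pagelist)
{
	/* true means the page was taken off its LRU and we hold a reference */
	if (!isolate_lru_page(page))
		return false;

	list_add_tail(&page->lru, pagelist);
	return true;
}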
diff --git a/mm/gup.c b/mm/gup.c
index f45a3a5be53a..eab18ba045db 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -111,7 +111,7 @@ retry:
* FOLL_GET: folio's refcount will be incremented by @refs.
*
* FOLL_PIN on large folios: folio's refcount will be incremented by
- * @refs, and its compound_pincount will be incremented by @refs.
+ * @refs, and its pincount will be incremented by @refs.
*
* FOLL_PIN on single-page folios: folio's refcount will be incremented by
* @refs * GUP_PIN_COUNTING_BIAS.
@@ -157,7 +157,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
* try_get_folio() is left intact.
*/
if (folio_test_large(folio))
- atomic_add(refs, folio_pincount_ptr(folio));
+ atomic_add(refs, &folio->_pincount);
else
folio_ref_add(folio,
refs * (GUP_PIN_COUNTING_BIAS - 1));
@@ -182,7 +182,7 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
if (flags & FOLL_PIN) {
node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
if (folio_test_large(folio))
- atomic_sub(refs, folio_pincount_ptr(folio));
+ atomic_sub(refs, &folio->_pincount);
else
refs *= GUP_PIN_COUNTING_BIAS;
}
@@ -215,7 +215,6 @@ int __must_check try_grab_page(struct page *page, unsigned int flags)
{
struct folio *folio = page_folio(page);
- WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
return -ENOMEM;
@@ -232,7 +231,7 @@ int __must_check try_grab_page(struct page *page, unsigned int flags)
*/
if (folio_test_large(folio)) {
folio_ref_add(folio, 1);
- atomic_add(1, folio_pincount_ptr(folio));
+ atomic_add(1, &folio->_pincount);
} else {
folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
}
@@ -818,7 +817,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
if (vma_is_secretmem(vma))
return NULL;
- if (foll_flags & FOLL_PIN)
+ if (WARN_ON_ONCE(foll_flags & FOLL_PIN))
return NULL;
page = follow_page_mask(vma, address, foll_flags, &ctx);
@@ -880,9 +879,9 @@ unmap:
}
/*
- * mmap_lock must be held on entry. If @locked != NULL and *@flags
- * does not include FOLL_NOWAIT, the mmap_lock may be released. If it
- * is, *@locked will be set to 0 and -EBUSY returned.
+ * mmap_lock must be held on entry. If @flags has FOLL_UNLOCKABLE but not
+ * FOLL_NOWAIT, the mmap_lock may be released. If it is, *@locked will be set
+ * to 0 and -EBUSY returned.
*/
static int faultin_page(struct vm_area_struct *vma,
unsigned long address, unsigned int *flags, bool unshare,
@@ -897,7 +896,7 @@ static int faultin_page(struct vm_area_struct *vma,
fault_flags |= FAULT_FLAG_WRITE;
if (*flags & FOLL_REMOTE)
fault_flags |= FAULT_FLAG_REMOTE;
- if (locked) {
+ if (*flags & FOLL_UNLOCKABLE) {
fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
/*
* FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set
@@ -931,8 +930,8 @@ static int faultin_page(struct vm_area_struct *vma,
* mmap lock in the page fault handler. Sanity check this.
*/
WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
- if (locked)
- *locked = 0;
+ *locked = 0;
+
/*
* We should do the same as VM_FAULT_RETRY, but let's not
* return -EBUSY since that's not reflecting the reality of
@@ -952,7 +951,7 @@ static int faultin_page(struct vm_area_struct *vma,
}
if (ret & VM_FAULT_RETRY) {
- if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
+ if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
*locked = 0;
return -EBUSY;
}
@@ -975,9 +974,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
return -EOPNOTSUPP;
- if ((gup_flags & FOLL_LONGTERM) && (gup_flags & FOLL_PCI_P2PDMA))
- return -EOPNOTSUPP;
-
if (vma_is_secretmem(vma))
return -EFAULT;
@@ -1055,7 +1051,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* This does not guarantee that the page exists in the user mappings when
* __get_user_pages returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
- * and subsequently re faulted). However it does guarantee that the page
+ * and subsequently re-faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
* contains data that was valid *at some point in time*. Typically, an IO
* or similar operation cannot guarantee anything stronger anyway because
@@ -1066,14 +1062,12 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* appropriate) must be called after the page is finished with, and
* before put_page is called.
*
- * If @locked != NULL, *@locked will be set to 0 when mmap_lock is
- * released by an up_read(). That can happen if @gup_flags does not
- * have FOLL_NOWAIT.
+ * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may
+ * be released. If this happens *@locked will be set to 0 on return.
*
- * A caller using such a combination of @locked and @gup_flags
- * must therefore hold the mmap_lock for reading only, and recognize
- * when it's been released. Otherwise, it must be held for either
- * reading or writing and will not be released.
+ * A caller using such a combination of @gup_flags must therefore hold the
+ * mmap_lock for reading only, and recognize when it's been released. Otherwise,
+ * it must be held for either reading or writing and will not be released.
*
* In most cases, get_user_pages or get_user_pages_fast should be used
* instead of __get_user_pages. __get_user_pages should be used only if
@@ -1125,7 +1119,7 @@ static long __get_user_pages(struct mm_struct *mm,
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
gup_flags, locked);
- if (locked && *locked == 0) {
+ if (!*locked) {
/*
* We've got a VM_FAULT_RETRY
* and we've lost mmap_lock.
@@ -1331,8 +1325,17 @@ static bool gup_signal_pending(unsigned int flags)
}
/*
- * Please note that this function, unlike __get_user_pages will not
- * return 0 for nr_pages > 0 without FOLL_NOWAIT
+ * Locking: (*locked == 1) means that the mmap_lock has already been acquired by
+ * the caller. This function may drop the mmap_lock. If it does so, then it will
+ * set (*locked = 0).
+ *
+ * (*locked == 0) means that the caller expects this function to acquire and
+ * drop the mmap_lock. Therefore, the value of *locked will still be zero when
+ * the function returns, even though it may have changed temporarily during
+ * function execution.
+ *
+ * Please note that this function, unlike __get_user_pages(), will not return 0
+ * for nr_pages > 0, unless FOLL_NOWAIT is used.
*/
static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
unsigned long start,
@@ -1343,14 +1346,20 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
unsigned int flags)
{
long ret, pages_done;
- bool lock_dropped;
+ bool must_unlock = false;
- if (locked) {
- /* if VM_FAULT_RETRY can be returned, vmas become invalid */
- BUG_ON(vmas);
- /* check caller initialized locked */
- BUG_ON(*locked != 1);
+ /*
+ * The internal caller expects GUP to manage the lock internally and the
+ * lock must be released when this returns.
+ */
+ if (!*locked) {
+ if (mmap_read_lock_killable(mm))
+ return -EAGAIN;
+ must_unlock = true;
+ *locked = 1;
}
+ else
+ mmap_assert_locked(mm);
if (flags & FOLL_PIN)
mm_set_has_pinned_flag(&mm->flags);
@@ -1368,13 +1377,14 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
flags |= FOLL_GET;
pages_done = 0;
- lock_dropped = false;
for (;;) {
ret = __get_user_pages(mm, start, nr_pages, flags, pages,
vmas, locked);
- if (!locked)
+ if (!(flags & FOLL_UNLOCKABLE)) {
/* VM_FAULT_RETRY couldn't trigger, bypass */
- return ret;
+ pages_done = ret;
+ break;
+ }
/* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
if (!*locked) {
@@ -1404,7 +1414,9 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
if (likely(pages))
pages += ret;
start += ret << PAGE_SHIFT;
- lock_dropped = true;
+
+ /* The lock was temporarily dropped, so we must unlock later */
+ must_unlock = true;
retry:
/*
@@ -1451,10 +1463,11 @@ retry:
pages++;
start += PAGE_SIZE;
}
- if (lock_dropped && *locked) {
+ if (must_unlock && *locked) {
/*
- * We must let the caller know we temporarily dropped the lock
- * and so the critical section protected by it was lost.
+ * We either temporarily dropped the lock, or the caller
+ * requested that we both acquire and drop the lock. Either way,
+ * we must now unlock, and notify the caller of that state.
*/
mmap_read_unlock(mm);
*locked = 0;
@@ -1487,6 +1500,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
unsigned long nr_pages = (end - start) / PAGE_SIZE;
+ int local_locked = 1;
int gup_flags;
long ret;
@@ -1519,12 +1533,15 @@ long populate_vma_page_range(struct vm_area_struct *vma,
if (vma_is_accessible(vma))
gup_flags |= FOLL_FORCE;
+ if (locked)
+ gup_flags |= FOLL_UNLOCKABLE;
+
/*
* We made sure addr is within a VMA, so the following will
* not result in a stack expansion that recurses back here.
*/
ret = __get_user_pages(mm, start, nr_pages, gup_flags,
- NULL, NULL, locked);
+ NULL, NULL, locked ? locked : &local_locked);
lru_add_drain();
return ret;
}
@@ -1545,12 +1562,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
* code on error (see __get_user_pages()).
*
* vma->vm_mm->mmap_lock must be held. The range must be page-aligned and
- * covered by the VMA.
- *
- * If @locked is NULL, it may be held for read or write and will be unperturbed.
- *
- * If @locked is non-NULL, it must held for read only and may be released. If
- * it's released, *@locked will be set to 0.
+ * covered by the VMA. If the mmap_lock is released, *@locked will be set to 0.
*/
long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, bool write, int *locked)
@@ -1575,7 +1587,7 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
* a poisoned page.
* !FOLL_FORCE: Require proper access permissions.
*/
- gup_flags = FOLL_TOUCH | FOLL_HWPOISON;
+ gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE;
if (write)
gup_flags |= FOLL_WRITE;
@@ -1659,9 +1671,24 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
unsigned int foll_flags)
{
struct vm_area_struct *vma;
+ bool must_unlock = false;
unsigned long vm_flags;
long i;
+ if (!nr_pages)
+ return 0;
+
+ /*
+ * The internal caller expects GUP to manage the lock internally and the
+ * lock must be released when this returns.
+ */
+ if (!*locked) {
+ if (mmap_read_lock_killable(mm))
+ return -EAGAIN;
+ must_unlock = true;
+ *locked = 1;
+ }
+
/* calculate required read or write permissions.
* If FOLL_FORCE is set, we only require the "MAY" flags.
*/
@@ -1673,12 +1700,12 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
for (i = 0; i < nr_pages; i++) {
vma = find_vma(mm, start);
if (!vma)
- goto finish_or_fault;
+ break;
/* protect what we can, including chardevs */
if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
!(vm_flags & vma->vm_flags))
- goto finish_or_fault;
+ break;
if (pages) {
pages[i] = virt_to_page((void *)start);
@@ -1690,9 +1717,11 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
start = (start + PAGE_SIZE) & PAGE_MASK;
}
- return i;
+ if (must_unlock && *locked) {
+ mmap_read_unlock(mm);
+ *locked = 0;
+ }
-finish_or_fault:
return i ? : -EFAULT;
}
#endif /* !CONFIG_MMU */
@@ -1861,17 +1890,13 @@ EXPORT_SYMBOL(fault_in_readable);
#ifdef CONFIG_ELF_CORE
struct page *get_dump_page(unsigned long addr)
{
- struct mm_struct *mm = current->mm;
struct page *page;
- int locked = 1;
+ int locked = 0;
int ret;
- if (mmap_read_lock_killable(mm))
- return NULL;
- ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked,
+ ret = __get_user_pages_locked(current->mm, addr, 1, &page, NULL,
+ &locked,
FOLL_FORCE | FOLL_DUMP | FOLL_GET);
- if (locked)
- mmap_read_unlock(mm);
return (ret == 1) ? page : NULL;
}
#endif /* CONFIG_ELF_CORE */
@@ -1905,7 +1930,7 @@ static unsigned long collect_longterm_unpinnable_pages(
continue;
if (folio_test_hugetlb(folio)) {
- isolate_hugetlb(&folio->page, movable_page_list);
+ isolate_hugetlb(folio, movable_page_list);
continue;
}
@@ -2047,34 +2072,15 @@ static long __gup_longterm_locked(struct mm_struct *mm,
int *locked,
unsigned int gup_flags)
{
- bool must_unlock = false;
unsigned int flags;
long rc, nr_pinned_pages;
- if (locked && WARN_ON_ONCE(!*locked))
- return -EINVAL;
-
if (!(gup_flags & FOLL_LONGTERM))
return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
locked, gup_flags);
- /*
- * If we get to this point then FOLL_LONGTERM is set, and FOLL_LONGTERM
- * implies FOLL_PIN (although the reverse is not true). Therefore it is
- * correct to unconditionally call check_and_migrate_movable_pages()
- * which assumes pages have been pinned via FOLL_PIN.
- *
- * Enforce the above reasoning by asserting that FOLL_PIN is set.
- */
- if (WARN_ON(!(gup_flags & FOLL_PIN)))
- return -EINVAL;
flags = memalloc_pin_save();
do {
- if (locked && !*locked) {
- mmap_read_lock(mm);
- must_unlock = true;
- *locked = 1;
- }
nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,
pages, vmas, locked,
gup_flags);
@@ -2082,33 +2088,70 @@ static long __gup_longterm_locked(struct mm_struct *mm,
rc = nr_pinned_pages;
break;
}
+
+ /* FOLL_LONGTERM implies FOLL_PIN */
rc = check_and_migrate_movable_pages(nr_pinned_pages, pages);
} while (rc == -EAGAIN);
memalloc_pin_restore(flags);
-
- if (locked && *locked && must_unlock) {
- mmap_read_unlock(mm);
- *locked = 0;
- }
return rc ? rc : nr_pinned_pages;
}
-static bool is_valid_gup_flags(unsigned int gup_flags)
+/*
+ * Check that the given flags are valid for the exported gup/pup interface, and
+ * update them with the required flags that the caller must have set.
+ */
+static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas,
+ int *locked, unsigned int *gup_flags_p,
+ unsigned int to_set)
{
+ unsigned int gup_flags = *gup_flags_p;
+
/*
- * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
- * never directly by the caller, so enforce that with an assertion:
+ * These flags are not allowed to be specified externally to the gup
+ * interfaces:
+ * - FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only
+ * - FOLL_REMOTE is internal only and used on follow_page()
+ * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL
*/
- if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ if (WARN_ON_ONCE(gup_flags & (FOLL_PIN | FOLL_TRIED | FOLL_UNLOCKABLE |
+ FOLL_REMOTE | FOLL_FAST_ONLY)))
return false;
+
+ gup_flags |= to_set;
+ if (locked) {
+ /* At the external interface locked must be set */
+ if (WARN_ON_ONCE(*locked != 1))
+ return false;
+
+ gup_flags |= FOLL_UNLOCKABLE;
+ }
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET)))
+ return false;
+
+ /* LONGTERM can only be specified when pinning */
+ if (WARN_ON_ONCE(!(gup_flags & FOLL_PIN) && (gup_flags & FOLL_LONGTERM)))
+ return false;
+
+ /* Pages input must be given if using GET/PIN */
+ if (WARN_ON_ONCE((gup_flags & (FOLL_GET | FOLL_PIN)) && !pages))
+ return false;
+
+ /* We want to allow the pgmap to be hot-unplugged at all times */
+ if (WARN_ON_ONCE((gup_flags & FOLL_LONGTERM) &&
+ (gup_flags & FOLL_PCI_P2PDMA)))
+ return false;
+
/*
- * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying
- * that is, FOLL_LONGTERM is a specific case, more restrictive case of
- * FOLL_PIN.
+ * Can't use VMAs with locked, as locked allows GUP to unlock
+ * which invalidates the vmas array
*/
- if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ if (WARN_ON_ONCE(vmas && (gup_flags & FOLL_UNLOCKABLE)))
return false;
+ *gup_flags_p = gup_flags;
return true;
}
@@ -2178,11 +2221,15 @@ long get_user_pages_remote(struct mm_struct *mm,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
{
- if (!is_valid_gup_flags(gup_flags))
+ int local_locked = 1;
+
+ if (!is_valid_gup_args(pages, vmas, locked, &gup_flags,
+ FOLL_TOUCH | FOLL_REMOTE))
return -EINVAL;
- return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked,
- gup_flags | FOLL_TOUCH | FOLL_REMOTE);
+ return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
+ locked ? locked : &local_locked,
+ gup_flags);
}
EXPORT_SYMBOL(get_user_pages_remote);
@@ -2216,11 +2263,13 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas)
{
- if (!is_valid_gup_flags(gup_flags))
+ int locked = 1;
+
+ if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_TOUCH))
return -EINVAL;
- return __gup_longterm_locked(current->mm, start, nr_pages,
- pages, vmas, NULL, gup_flags | FOLL_TOUCH);
+ return __get_user_pages_locked(current->mm, start, nr_pages, pages,
+ vmas, &locked, gup_flags);
}
EXPORT_SYMBOL(get_user_pages);
@@ -2242,16 +2291,14 @@ EXPORT_SYMBOL(get_user_pages);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags)
{
- struct mm_struct *mm = current->mm;
- int locked = 1;
- long ret;
+ int locked = 0;
- mmap_read_lock(mm);
- ret = __gup_longterm_locked(mm, start, nr_pages, pages, NULL, &locked,
- gup_flags | FOLL_TOUCH);
- if (locked)
- mmap_read_unlock(mm);
- return ret;
+ if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags,
+ FOLL_TOUCH | FOLL_UNLOCKABLE))
+ return -EINVAL;
+
+ return __get_user_pages_locked(current->mm, start, nr_pages, pages,
+ NULL, &locked, gup_flags);
}
EXPORT_SYMBOL(get_user_pages_unlocked);
@@ -2904,6 +2951,7 @@ static int internal_get_user_pages_fast(unsigned long start,
{
unsigned long len, end;
unsigned long nr_pinned;
+ int locked = 0;
int ret;
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
@@ -2932,8 +2980,9 @@ static int internal_get_user_pages_fast(unsigned long start,
/* Slow path: try to get the remaining pages with get_user_pages */
start += nr_pinned << PAGE_SHIFT;
pages += nr_pinned;
- ret = get_user_pages_unlocked(start, nr_pages - nr_pinned, pages,
- gup_flags);
+ ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned,
+ pages, NULL, &locked,
+ gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE);
if (ret < 0) {
/*
* The caller has to unpin the pages we already pinned so
@@ -2956,8 +3005,6 @@ static int internal_get_user_pages_fast(unsigned long start,
*
* Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
* the regular GUP.
- * Note a difference with get_user_pages_fast: this always returns the
- * number of pages pinned, 0 if no pages were pinned.
*
* If the architecture does not support this function, simply return with no
* pages pinned.
@@ -2969,7 +3016,6 @@ static int internal_get_user_pages_fast(unsigned long start,
int get_user_pages_fast_only(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
- int nr_pinned;
/*
* Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
* because gup fast is always a "pin with a +1 page refcount" request.
@@ -2977,21 +3023,11 @@ int get_user_pages_fast_only(unsigned long start, int nr_pages,
* FOLL_FAST_ONLY is required in order to match the API description of
* this routine: no fall back to regular ("slow") GUP.
*/
- gup_flags |= FOLL_GET | FOLL_FAST_ONLY;
-
- nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
- pages);
-
- /*
- * As specified in the API description above, this routine is not
- * allowed to return negative values. However, the common core
- * routine internal_get_user_pages_fast() *can* return -errno.
- * Therefore, correct for that here:
- */
- if (nr_pinned < 0)
- nr_pinned = 0;
+ if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags,
+ FOLL_GET | FOLL_FAST_ONLY))
+ return -EINVAL;
- return nr_pinned;
+ return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
@@ -3014,16 +3050,14 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
int get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
- if (!is_valid_gup_flags(gup_flags))
- return -EINVAL;
-
/*
* The caller may or may not have explicitly set FOLL_GET; either way is
* OK. However, internally (within mm/gup.c), gup fast variants must set
* FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
* request.
*/
- gup_flags |= FOLL_GET;
+ if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_GET))
+ return -EINVAL;
return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
@@ -3047,57 +3081,12 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
int pin_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE(gup_flags & FOLL_GET))
- return -EINVAL;
-
- if (WARN_ON_ONCE(!pages))
+ if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_PIN))
return -EINVAL;
-
- gup_flags |= FOLL_PIN;
return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast);
-/*
- * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior
- * is the same, except that this one sets FOLL_PIN instead of FOLL_GET.
- *
- * The API rules are the same, too: no negative values may be returned.
- */
-int pin_user_pages_fast_only(unsigned long start, int nr_pages,
- unsigned int gup_flags, struct page **pages)
-{
- int nr_pinned;
-
- /*
- * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API
- * rules require returning 0, rather than -errno:
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_GET))
- return 0;
-
- if (WARN_ON_ONCE(!pages))
- return 0;
- /*
- * FOLL_FAST_ONLY is required in order to match the API description of
- * this routine: no fall back to regular ("slow") GUP.
- */
- gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY);
- nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
- pages);
- /*
- * This routine is not allowed to return negative values. However,
- * internal_get_user_pages_fast() *can* return -errno. Therefore,
- * correct for that here:
- */
- if (nr_pinned < 0)
- nr_pinned = 0;
-
- return nr_pinned;
-}
-EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
-
/**
* pin_user_pages_remote() - pin pages of a remote process
*
@@ -3125,16 +3114,14 @@ long pin_user_pages_remote(struct mm_struct *mm,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
{
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE(gup_flags & FOLL_GET))
- return -EINVAL;
-
- if (WARN_ON_ONCE(!pages))
- return -EINVAL;
+ int local_locked = 1;
- return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked,
- gup_flags | FOLL_PIN | FOLL_TOUCH |
- FOLL_REMOTE);
+ if (!is_valid_gup_args(pages, vmas, locked, &gup_flags,
+ FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE))
+ return 0;
+ return __gup_longterm_locked(mm, start, nr_pages, pages, vmas,
+ locked ? locked : &local_locked,
+ gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_remote);
@@ -3159,16 +3146,12 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas)
{
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE(gup_flags & FOLL_GET))
- return -EINVAL;
-
- if (WARN_ON_ONCE(!pages))
- return -EINVAL;
+ int locked = 1;
- gup_flags |= FOLL_PIN;
+ if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_PIN))
+ return 0;
return __gup_longterm_locked(current->mm, start, nr_pages,
- pages, vmas, NULL, gup_flags);
+ pages, vmas, &locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
@@ -3180,14 +3163,13 @@ EXPORT_SYMBOL(pin_user_pages);
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags)
{
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE(gup_flags & FOLL_GET))
- return -EINVAL;
+ int locked = 0;
- if (WARN_ON_ONCE(!pages))
- return -EINVAL;
+ if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags,
+ FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE))
+ return 0;
- gup_flags |= FOLL_PIN;
- return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
+ return __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL,
+ &locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_unlocked);
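From a caller's point of view, the pin_user_pages*() contract is unchanged by the rework above; what changes is that flag policing now happens once in is_valid_gup_args() and that the internal paths always run with a managed *locked. A hedged sketch of a long-term pin as a driver might issue it; the buffer handling is hypothetical:

static int example_pin_user_buffer(unsigned long uaddr, int npages,
				   struct page **pages)
{
	int pinned;

	/*
	 * FOLL_PIN is set internally by the pin_user_pages*() entry points;
	 * passing it (or FOLL_GET) here would now be rejected by
	 * is_valid_gup_args().
	 */
	pinned = pin_user_pages_fast(uaddr, npages,
				     FOLL_WRITE | FOLL_LONGTERM, pages);
	if (pinned < 0)
		return pinned;
	if (pinned != npages) {
		unpin_user_pages(pages, pinned);
		return -EFAULT;
	}
	return 0;
}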
diff --git a/mm/hmm.c b/mm/hmm.c
index 601a99ce3c84..6a151c09de5e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -492,8 +492,21 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
if (required_fault) {
+ int ret;
+
spin_unlock(ptl);
- return hmm_vma_fault(addr, end, required_fault, walk);
+ hugetlb_vma_unlock_read(vma);
+ /*
+ * Avoid deadlock: drop the vma lock before calling
+ * hmm_vma_fault(), which will itself potentially take and
+ * drop the vma lock. This is also correct from a
+ * protection point of view, because there is no further
+ * use here of either pte or ptl after dropping the vma
+ * lock.
+ */
+ ret = hmm_vma_fault(addr, end, required_fault, walk);
+ hugetlb_vma_lock_read(vma);
+ return ret;
}
pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index abe6cfd92ffa..032fb0ef9cd1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -119,7 +119,8 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
* own flags.
*/
if (!in_pf && shmem_file(vma->vm_file))
- return shmem_huge_enabled(vma, !enforce_sysfs);
+ return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
+ !enforce_sysfs, vma->vm_mm, vm_flags);
/* Enforce sysfs THP requirements as necessary */
if (enforce_sysfs &&
@@ -559,10 +560,11 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
}
#ifdef CONFIG_MEMCG
-static inline struct deferred_split *get_deferred_split_queue(struct page *page)
+static inline
+struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
- struct mem_cgroup *memcg = page_memcg(compound_head(page));
- struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+ struct mem_cgroup *memcg = folio_memcg(folio);
+ struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
if (memcg)
return &memcg->deferred_split_queue;
@@ -570,9 +572,10 @@ static inline struct deferred_split *get_deferred_split_queue(struct page *page)
return &pgdat->deferred_split_queue;
}
#else
-static inline struct deferred_split *get_deferred_split_queue(struct page *page)
+static inline
+struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
- struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+ struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
return &pgdat->deferred_split_queue;
}
@@ -580,23 +583,23 @@ static inline struct deferred_split *get_deferred_split_queue(struct page *page)
void prep_transhuge_page(struct page *page)
{
- /*
- * we use page->mapping and page->index in second tail page
- * as list_head: assuming THP order >= 2
- */
+ struct folio *folio = (struct folio *)page;
- INIT_LIST_HEAD(page_deferred_list(page));
+ VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);
+ INIT_LIST_HEAD(&folio->_deferred_list);
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
static inline bool is_transparent_hugepage(struct page *page)
{
+ struct folio *folio;
+
if (!PageCompound(page))
return false;
- page = compound_head(page);
- return is_huge_zero_page(page) ||
- page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
+ folio = page_folio(page);
+ return is_huge_zero_page(&folio->page) ||
+ folio->_folio_dtor == TRANSHUGE_PAGE_DTOR;
}
static unsigned long __thp_get_unmapped_area(struct file *filp,
@@ -1039,11 +1042,6 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
assert_spin_locked(pmd_lockptr(mm, pmd));
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
- (FOLL_PIN | FOLL_GET)))
- return NULL;
-
if (flags & FOLL_WRITE && !pmd_write(*pmd))
return NULL;
@@ -1202,11 +1200,6 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
if (flags & FOLL_WRITE && !pud_write(*pud))
return NULL;
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
- (FOLL_PIN | FOLL_GET)))
- return NULL;
-
if (pud_present(*pud) && pud_devmap(*pud))
/* pass */;
else
@@ -1603,7 +1596,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
{
spinlock_t *ptl;
pmd_t orig_pmd;
- struct page *page;
+ struct folio *folio;
struct mm_struct *mm = tlb->mm;
bool ret = false;
@@ -1623,15 +1616,15 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
goto out;
}
- page = pmd_page(orig_pmd);
+ folio = pfn_folio(pmd_pfn(orig_pmd));
/*
- * If other processes are mapping this page, we couldn't discard
- * the page unless they all do MADV_FREE so let's skip the page.
+ * If other processes are mapping this folio, we cannot discard
+ * the folio unless they all do MADV_FREE, so let's skip the folio.
*/
- if (total_mapcount(page) != 1)
+ if (folio_mapcount(folio) != 1)
goto out;
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto out;
/*
@@ -1639,17 +1632,17 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* will deactivate only them.
*/
if (next - addr != HPAGE_PMD_SIZE) {
- get_page(page);
+ folio_get(folio);
spin_unlock(ptl);
- split_huge_page(page);
- unlock_page(page);
- put_page(page);
+ split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
goto out_unlocked;
}
- if (PageDirty(page))
- ClearPageDirty(page);
- unlock_page(page);
+ if (folio_test_dirty(folio))
+ folio_clear_dirty(folio);
+ folio_unlock(folio);
if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
pmdp_invalidate(vma, addr, pmd);
@@ -1660,7 +1653,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
}
- mark_page_lazyfree(page);
+ folio_mark_lazyfree(folio);
ret = true;
out:
spin_unlock(ptl);
@@ -1920,17 +1913,15 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
entry = pmd_modify(oldpmd, newprot);
- if (uffd_wp) {
- entry = pmd_wrprotect(entry);
+ if (uffd_wp)
entry = pmd_mkuffd_wp(entry);
- } else if (uffd_wp_resolve) {
+ else if (uffd_wp_resolve)
/*
* Leave the write bit to be handled by PF interrupt
* handler, then things like COW could be properly
* handled.
*/
entry = pmd_clear_uffd_wp(entry);
- }
/* See change_pte_range(). */
if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
@@ -2022,7 +2013,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
spinlock_t *ptl;
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address & HPAGE_PUD_MASK,
(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
mmu_notifier_invalidate_range_start(&range);
@@ -2046,7 +2037,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
pgtable_t pgtable;
- pmd_t _pmd;
+ pmd_t _pmd, old_pmd;
int i;
/*
@@ -2057,7 +2048,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
*
* See Documentation/mm/mmu_notifier.rst
*/
- pmdp_huge_clear_flush(vma, haddr, pmd);
+ old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
@@ -2066,6 +2057,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pte_t *pte, entry;
entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
entry = pte_mkspecial(entry);
+ if (pmd_uffd_wp(old_pmd))
+ entry = pte_mkuffd_wp(entry);
pte = pte_offset_map(&_pmd, haddr);
VM_BUG_ON(!pte_none(*pte));
set_pte_at(mm, haddr, pte, entry);
@@ -2284,7 +2277,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
spinlock_t *ptl;
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address & HPAGE_PMD_MASK,
(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
@@ -2479,9 +2472,9 @@ static void __split_huge_page_tail(struct page *head, int tail,
* of swap cache pages that store the swp_entry_t in tail pages.
* Fix up and warn once if private is unexpectedly set.
*
- * What of 32-bit systems, on which head[1].compound_pincount overlays
+ * What of 32-bit systems, on which folio->_pincount overlays
* head[1].private? No problem: THP_SWAP is not enabled on 32-bit, and
- * compound_pincount must be 0 for folio_ref_freeze() to have succeeded.
+ * pincount must be 0 for folio_ref_freeze() to have succeeded.
*/
if (!folio_test_swapcache(page_folio(head))) {
VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail);
@@ -2652,7 +2645,7 @@ bool can_split_folio(struct folio *folio, int *pextra_pins)
int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct folio *folio = page_folio(page);
- struct deferred_split *ds_queue = get_deferred_split_queue(&folio->page);
+ struct deferred_split *ds_queue = get_deferred_split_queue(folio);
XA_STATE(xas, &folio->mapping->i_pages, folio->index);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
@@ -2756,9 +2749,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
if (folio_ref_freeze(folio, 1 + extra_pins)) {
- if (!list_empty(page_deferred_list(&folio->page))) {
+ if (!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
- list_del(page_deferred_list(&folio->page));
+ list_del(&folio->_deferred_list);
}
spin_unlock(&ds_queue->split_queue_lock);
if (mapping) {
@@ -2802,49 +2795,53 @@ out:
void free_transhuge_page(struct page *page)
{
- struct deferred_split *ds_queue = get_deferred_split_queue(page);
+ struct folio *folio = (struct folio *)page;
+ struct deferred_split *ds_queue = get_deferred_split_queue(folio);
unsigned long flags;
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
- if (!list_empty(page_deferred_list(page))) {
+ if (!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
- list_del(page_deferred_list(page));
+ list_del(&folio->_deferred_list);
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
free_compound_page(page);
}
-void deferred_split_huge_page(struct page *page)
+void deferred_split_folio(struct folio *folio)
{
- struct deferred_split *ds_queue = get_deferred_split_queue(page);
+ struct deferred_split *ds_queue = get_deferred_split_queue(folio);
#ifdef CONFIG_MEMCG
- struct mem_cgroup *memcg = page_memcg(compound_head(page));
+ struct mem_cgroup *memcg = folio_memcg(folio);
#endif
unsigned long flags;
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);
/*
* The try_to_unmap() in page reclaim path might reach here too,
* this may cause a race condition to corrupt deferred split queue.
- * And, if page reclaim is already handling the same page, it is
+ * And, if page reclaim is already handling the same folio, it is
* unnecessary to handle it again in shrinker.
*
- * Check PageSwapCache to determine if the page is being
- * handled by page reclaim since THP swap would add the page into
+ * Check the swapcache flag to determine if the folio is being
+ * handled by page reclaim since THP swap would add the folio into
* swap cache before calling try_to_unmap().
*/
- if (PageSwapCache(page))
+ if (folio_test_swapcache(folio))
+ return;
+
+ if (!list_empty(&folio->_deferred_list))
return;
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
- if (list_empty(page_deferred_list(page))) {
+ if (list_empty(&folio->_deferred_list)) {
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
- list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
+ list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
ds_queue->split_queue_len++;
#ifdef CONFIG_MEMCG
if (memcg)
- set_shrinker_bit(memcg, page_to_nid(page),
+ set_shrinker_bit(memcg, folio_nid(folio),
deferred_split_shrinker.id);
#endif
}
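deferred_split_folio() gains an unlocked list_empty() fast path before taking split_queue_lock, while still re-checking under the lock before queueing the folio. A small model of that check/re-check pattern; the intrusive list and the mutex below are stand-ins for folio->_deferred_list and split_queue_lock.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct list_node { struct list_node *prev, *next; };

static void list_init(struct list_node *n) { n->prev = n->next = n; }
static bool list_empty(const struct list_node *n) { return n->next == n; }
static void list_add_tail(struct list_node *n, struct list_node *head)
{
    n->prev = head->prev;
    n->next = head;
    head->prev->next = n;
    head->prev = n;
}

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static struct list_node split_queue = { &split_queue, &split_queue };

struct folio_stub { struct list_node deferred; };

static void deferred_split(struct folio_stub *f)
{
    /* Unlocked fast path: already queued, nothing to do. */
    if (!list_empty(&f->deferred))
        return;

    pthread_mutex_lock(&queue_lock);
    /* Re-check under the lock; another thread may have queued it. */
    if (list_empty(&f->deferred))
        list_add_tail(&f->deferred, &split_queue);
    pthread_mutex_unlock(&queue_lock);
}

int main(void)
{
    struct folio_stub f;

    list_init(&f.deferred);
    deferred_split(&f);
    deferred_split(&f); /* second call takes the unlocked fast path */
    printf("queued: %s\n", list_empty(&split_queue) ? "no" : "yes");
    return 0;
}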
@@ -2870,8 +2867,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
struct pglist_data *pgdata = NODE_DATA(sc->nid);
struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
unsigned long flags;
- LIST_HEAD(list), *pos, *next;
- struct page *page;
+ LIST_HEAD(list);
+ struct folio *folio, *next;
int split = 0;
#ifdef CONFIG_MEMCG
@@ -2881,14 +2878,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
/* Take pin on all head pages to avoid freeing them under us */
- list_for_each_safe(pos, next, &ds_queue->split_queue) {
- page = list_entry((void *)pos, struct page, deferred_list);
- page = compound_head(page);
- if (get_page_unless_zero(page)) {
- list_move(page_deferred_list(page), &list);
+ list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
+ _deferred_list) {
+ if (folio_try_get(folio)) {
+ list_move(&folio->_deferred_list, &list);
} else {
- /* We lost race with put_compound_page() */
- list_del_init(page_deferred_list(page));
+ /* We lost race with folio_put() */
+ list_del_init(&folio->_deferred_list);
ds_queue->split_queue_len--;
}
if (!--sc->nr_to_scan)
@@ -2896,16 +2892,15 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
- list_for_each_safe(pos, next, &list) {
- page = list_entry((void *)pos, struct page, deferred_list);
- if (!trylock_page(page))
+ list_for_each_entry_safe(folio, next, &list, _deferred_list) {
+ if (!folio_trylock(folio))
goto next;
/* split_huge_page() removes page from list on success */
- if (!split_huge_page(page))
+ if (!split_folio(folio))
split++;
- unlock_page(page);
+ folio_unlock(folio);
next:
- put_page(page);
+ folio_put(folio);
}
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
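deferred_split_scan() now walks the queue with list_for_each_entry_safe() and pins each folio via folio_try_get(), dropping entries whose refcount already reached zero ("lost race with folio_put()"). The underlying primitive is "take a reference only if the count is still non-zero"; a C11-atomics sketch of that idea, with invented names.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj { atomic_int refcount; };

static bool obj_try_get(struct obj *o)
{
    int old = atomic_load(&o->refcount);

    /* CAS loop: only increment while the count is still non-zero. */
    while (old != 0) {
        if (atomic_compare_exchange_weak(&o->refcount, &old, old + 1))
            return true;
        /* 'old' was reloaded by the failed CAS; retry. */
    }
    return false; /* lost the race with the final put */
}

static bool obj_put(struct obj *o)
{
    return atomic_fetch_sub(&o->refcount, 1) == 1; /* true if last ref */
}

int main(void)
{
    struct obj live = { 1 }, dead = { 0 };

    printf("live: %d, dead: %d\n", obj_try_get(&live), obj_try_get(&dead));
    obj_put(&live); /* drop the extra reference we just took */
    return 0;
}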
@@ -2934,6 +2929,7 @@ static void split_huge_pages_all(void)
{
struct zone *zone;
struct page *page;
+ struct folio *folio;
unsigned long pfn, max_zone_pfn;
unsigned long total = 0, split = 0;
@@ -2946,24 +2942,32 @@ static void split_huge_pages_all(void)
int nr_pages;
page = pfn_to_online_page(pfn);
- if (!page || !get_page_unless_zero(page))
+ if (!page || PageTail(page))
continue;
+ folio = page_folio(page);
+ if (!folio_try_get(folio))
+ continue;
+
+ if (unlikely(page_folio(page) != folio))
+ goto next;
- if (zone != page_zone(page))
+ if (zone != folio_zone(folio))
goto next;
- if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
+ if (!folio_test_large(folio)
+ || folio_test_hugetlb(folio)
+ || !folio_test_lru(folio))
goto next;
total++;
- lock_page(page);
- nr_pages = thp_nr_pages(page);
- if (!split_huge_page(page))
+ folio_lock(folio);
+ nr_pages = folio_nr_pages(folio);
+ if (!split_folio(folio))
split++;
pfn += nr_pages - 1;
- unlock_page(page);
+ folio_unlock(folio);
next:
- put_page(page);
+ folio_put(folio);
cond_resched();
}
}
@@ -3272,15 +3276,17 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
- if (is_writable_migration_entry(entry))
- pmde = maybe_pmd_mkwrite(pmde, vma);
if (pmd_swp_uffd_wp(*pvmw->pmd))
- pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
+ pmde = pmd_mkuffd_wp(pmde);
if (!is_migration_entry_young(entry))
pmde = pmd_mkold(pmde);
/* NOTE: this may contain setting soft-dirty on some archs */
if (PageDirty(new) && is_migration_entry_dirty(entry))
pmde = pmd_mkdirty(pmde);
+ if (is_writable_migration_entry(entry))
+ pmde = maybe_pmd_mkwrite(pmde, vma);
+ else
+ pmde = pmd_wrprotect(pmde);
if (PageAnon(new)) {
rmap_t rmap_flags = RMAP_COMPOUND;
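The last mm/huge_memory.c hunk moves the maybe_pmd_mkwrite()/write decision after pmd_mkdirty() and now explicitly wrprotects entries restored from non-writable migration entries. A plausible reading is that the write bit must be settled only after the dirty bit, since making an entry dirty can imply writability on some configurations; the toy bit-flag model below (all names and the ARCH_MKDIRTY_SETS_WRITE behaviour are invented for illustration) shows why the order matters in that case.

#include <stdio.h>

#define PTE_WRITE (1u << 0)
#define PTE_DIRTY (1u << 1)

#define ARCH_MKDIRTY_SETS_WRITE 1 /* pretend we are such an architecture */

static unsigned int mkdirty(unsigned int pte)
{
    pte |= PTE_DIRTY;
#if ARCH_MKDIRTY_SETS_WRITE
    pte |= PTE_WRITE; /* dirty implies writable in this toy model */
#endif
    return pte;
}

static unsigned int wrprotect(unsigned int pte) { return pte & ~PTE_WRITE; }

int main(void)
{
    int writable_entry = 0;  /* migration entry was not writable */
    unsigned int pte = 0;

    /* Correct order: mark dirty first, then decide writability. */
    pte = mkdirty(pte);
    if (!writable_entry)
        pte = wrprotect(pte);

    printf("write bit: %u (must be 0)\n", pte & PTE_WRITE);
    return 0;
}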
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index db895230ee7e..07abcb6eb203 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -94,6 +94,8 @@ static int hugetlb_acct_memory(struct hstate *h, long delta);
static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
+static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end);
static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
@@ -258,12 +260,6 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
/*
* hugetlb vma_lock helper routines
*/
-static bool __vma_shareable_lock(struct vm_area_struct *vma)
-{
- return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
- vma->vm_private_data;
-}
-
void hugetlb_vma_lock_read(struct vm_area_struct *vma)
{
if (__vma_shareable_lock(vma)) {
@@ -1181,7 +1177,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma)
/*
* Reset and decrement one ref on hugepage private reservation.
- * Called with mm->mmap_sem writer semaphore held.
+ * Called with mm->mmap_lock writer semaphore held.
* This function should be only used by move_vma() and operate on
* same sized vma. It should never come here with last ref on the
* reservation.
@@ -1286,32 +1282,33 @@ static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
folio_set_hugetlb_freed(folio);
}
-static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
+static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
+ int nid)
{
- struct page *page;
+ struct folio *folio;
bool pin = !!(current->flags & PF_MEMALLOC_PIN);
lockdep_assert_held(&hugetlb_lock);
- list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
- if (pin && !is_longterm_pinnable_page(page))
+ list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) {
+ if (pin && !folio_is_longterm_pinnable(folio))
continue;
- if (PageHWPoison(page))
+ if (folio_test_hwpoison(folio))
continue;
- list_move(&page->lru, &h->hugepage_activelist);
- set_page_refcounted(page);
- ClearHPageFreed(page);
+ list_move(&folio->lru, &h->hugepage_activelist);
+ folio_ref_unfreeze(folio, 1);
+ folio_clear_hugetlb_freed(folio);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
- return page;
+ return folio;
}
return NULL;
}
-static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
- nodemask_t *nmask)
+static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask)
{
unsigned int cpuset_mems_cookie;
struct zonelist *zonelist;
@@ -1324,7 +1321,7 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask,
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
- struct page *page;
+ struct folio *folio;
if (!cpuset_zone_allowed(zone, gfp_mask))
continue;
@@ -1336,9 +1333,9 @@ retry_cpuset:
continue;
node = zone_to_nid(zone);
- page = dequeue_huge_page_node_exact(h, node);
- if (page)
- return page;
+ folio = dequeue_hugetlb_folio_node_exact(h, node);
+ if (folio)
+ return folio;
}
if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
@@ -1351,12 +1348,12 @@ static unsigned long available_huge_pages(struct hstate *h)
return h->free_huge_pages - h->resv_huge_pages;
}
-static struct page *dequeue_huge_page_vma(struct hstate *h,
+static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve,
long chg)
{
- struct page *page = NULL;
+ struct folio *folio = NULL;
struct mempolicy *mpol;
gfp_t gfp_mask;
nodemask_t *nodemask;
@@ -1378,22 +1375,24 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
if (mpol_is_preferred_many(mpol)) {
- page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+ folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
+ nid, nodemask);
/* Fallback to all nodes if page==NULL */
nodemask = NULL;
}
- if (!page)
- page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+ if (!folio)
+ folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
+ nid, nodemask);
- if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
- SetHPageRestoreReserve(page);
+ if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) {
+ folio_set_hugetlb_restore_reserve(folio);
h->resv_huge_pages--;
}
mpol_cond_put(mpol);
- return page;
+ return folio;
err:
return NULL;
@@ -1478,9 +1477,9 @@ static void __destroy_compound_gigantic_folio(struct folio *folio,
int nr_pages = 1 << order;
struct page *p;
- atomic_set(folio_mapcount_ptr(folio), 0);
- atomic_set(folio_subpages_mapcount_ptr(folio), 0);
- atomic_set(folio_pincount_ptr(folio), 0);
+ atomic_set(&folio->_entire_mapcount, 0);
+ atomic_set(&folio->_nr_pages_mapped, 0);
+ atomic_set(&folio->_pincount, 0);
for (i = 1; i < nr_pages; i++) {
p = folio_page(folio, i);
@@ -1490,7 +1489,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio,
set_page_refcounted(p);
}
- folio_set_compound_order(folio, 0);
+ folio_set_order(folio, 0);
__folio_clear_head(folio);
}
@@ -1702,10 +1701,10 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
enqueue_hugetlb_folio(h, folio);
}
-static void __update_and_free_page(struct hstate *h, struct page *page)
+static void __update_and_free_hugetlb_folio(struct hstate *h,
+ struct folio *folio)
{
int i;
- struct folio *folio = page_folio(page);
struct page *subpage;
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
@@ -1718,7 +1717,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
if (folio_test_hugetlb_raw_hwp_unreliable(folio))
return;
- if (hugetlb_vmemmap_restore(h, page)) {
+ if (hugetlb_vmemmap_restore(h, &folio->page)) {
spin_lock_irq(&hugetlb_lock);
/*
* If we cannot allocate vmemmap pages, just refuse to free the
@@ -1735,7 +1734,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
* which makes any healthy subpages reusable.
*/
if (unlikely(folio_test_hwpoison(folio)))
- hugetlb_clear_page_hwpoison(&folio->page);
+ folio_clear_hugetlb_hwpoison(folio);
for (i = 0; i < pages_per_huge_page(h); i++) {
subpage = folio_page(folio, i);
@@ -1754,7 +1753,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
destroy_compound_gigantic_folio(folio, huge_page_order(h));
free_gigantic_folio(folio, huge_page_order(h));
} else {
- __free_pages(page, huge_page_order(h));
+ __free_pages(&folio->page, huge_page_order(h));
}
}
@@ -1794,7 +1793,7 @@ static void free_hpage_workfn(struct work_struct *work)
*/
h = size_to_hstate(page_size(page));
- __update_and_free_page(h, page);
+ __update_and_free_hugetlb_folio(h, page_folio(page));
cond_resched();
}
@@ -1811,7 +1810,7 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
bool atomic)
{
if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) {
- __update_and_free_page(h, &folio->page);
+ __update_and_free_hugetlb_folio(h, folio);
return;
}
@@ -1954,7 +1953,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
__folio_clear_reserved(folio);
__folio_set_head(folio);
/* we rely on prep_new_hugetlb_folio to set the destructor */
- folio_set_compound_order(folio, order);
+ folio_set_order(folio, order);
for (i = 0; i < nr_pages; i++) {
p = folio_page(folio, i);
@@ -2000,9 +1999,9 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
if (i != 0)
set_compound_head(p, &folio->page);
}
- atomic_set(folio_mapcount_ptr(folio), -1);
- atomic_set(folio_subpages_mapcount_ptr(folio), 0);
- atomic_set(folio_pincount_ptr(folio), 0);
+ atomic_set(&folio->_entire_mapcount, -1);
+ atomic_set(&folio->_nr_pages_mapped, 0);
+ atomic_set(&folio->_pincount, 0);
return true;
out_error:
@@ -2018,7 +2017,7 @@ out_error:
p = folio_page(folio, j);
__ClearPageReserved(p);
}
- folio_set_compound_order(folio, 0);
+ folio_set_order(folio, 0);
__folio_clear_head(folio);
return false;
}
@@ -2042,11 +2041,12 @@ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,
*/
int PageHuge(struct page *page)
{
+ struct folio *folio;
+
if (!PageCompound(page))
return 0;
-
- page = compound_head(page);
- return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
+ folio = page_folio(page);
+ return folio->_folio_dtor == HUGETLB_PAGE_DTOR;
}
EXPORT_SYMBOL_GPL(PageHuge);
@@ -2056,10 +2056,11 @@ EXPORT_SYMBOL_GPL(PageHuge);
*/
int PageHeadHuge(struct page *page_head)
{
- if (!PageHead(page_head))
+ struct folio *folio = (struct folio *)page_head;
+ if (!folio_test_large(folio))
return 0;
- return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
+ return folio->_folio_dtor == HUGETLB_PAGE_DTOR;
}
EXPORT_SYMBOL_GPL(PageHeadHuge);
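PageHuge() and PageHeadHuge() now identify hugetlb pages by reading folio->_folio_dtor instead of poking at page[1].compound_dtor. A minimal model of type-checking through such a destructor tag; the enum, struct, and helper below are invented for the sketch.

#include <stdbool.h>
#include <stdio.h>

enum dtor_id { NULL_DTOR, COMPOUND_DTOR, HUGETLB_DTOR, TRANSHUGE_DTOR };

struct folio_stub {
    bool large;        /* compound / multi-page? */
    enum dtor_id dtor; /* which destructor will free it */
};

static bool folio_is_hugetlb(const struct folio_stub *f)
{
    return f->large && f->dtor == HUGETLB_DTOR;
}

int main(void)
{
    struct folio_stub huge = { true, HUGETLB_DTOR };
    struct folio_stub thp  = { true, TRANSHUGE_DTOR };

    printf("huge: %d, thp: %d\n", folio_is_hugetlb(&huge),
           folio_is_hugetlb(&thp));
    return 0;
}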
@@ -2377,8 +2378,8 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
/*
* Allocates a fresh surplus page from the page allocator.
*/
-static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nmask)
+static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nmask)
{
struct folio *folio = NULL;
@@ -2415,10 +2416,10 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
out_unlock:
spin_unlock_irq(&hugetlb_lock);
- return &folio->page;
+ return folio;
}
-static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
struct folio *folio;
@@ -2438,17 +2439,17 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
*/
folio_set_hugetlb_temporary(folio);
- return &folio->page;
+ return folio;
}
/*
* Use the VMA's mpolicy to allocate a huge page from the buddy.
*/
static
-struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
+struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- struct page *page = NULL;
+ struct folio *folio = NULL;
struct mempolicy *mpol;
gfp_t gfp_mask = htlb_alloc_mask(h);
int nid;
@@ -2459,53 +2460,54 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
gfp_t gfp = gfp_mask | __GFP_NOWARN;
gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
- page = alloc_surplus_huge_page(h, gfp, nid, nodemask);
+ folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask);
/* Fallback to all nodes if page==NULL */
nodemask = NULL;
}
- if (!page)
- page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
+ if (!folio)
+ folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask);
mpol_cond_put(mpol);
- return page;
+ return folio;
}
-/* page migration callback function */
-struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
+/* folio migration callback function */
+struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask, gfp_t gfp_mask)
{
spin_lock_irq(&hugetlb_lock);
if (available_huge_pages(h)) {
- struct page *page;
+ struct folio *folio;
- page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
- if (page) {
+ folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
+ preferred_nid, nmask);
+ if (folio) {
spin_unlock_irq(&hugetlb_lock);
- return page;
+ return folio;
}
}
spin_unlock_irq(&hugetlb_lock);
- return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
+ return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
}
/* mempolicy aware migration callback */
-struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
+struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
unsigned long address)
{
struct mempolicy *mpol;
nodemask_t *nodemask;
- struct page *page;
+ struct folio *folio;
gfp_t gfp_mask;
int node;
gfp_mask = htlb_alloc_mask(h);
node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
- page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
+ folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
mpol_cond_put(mpol);
- return page;
+ return folio;
}
/*
@@ -2516,6 +2518,7 @@ static int gather_surplus_pages(struct hstate *h, long delta)
__must_hold(&hugetlb_lock)
{
LIST_HEAD(surplus_list);
+ struct folio *folio;
struct page *page, *tmp;
int ret;
long i;
@@ -2535,13 +2538,13 @@ static int gather_surplus_pages(struct hstate *h, long delta)
retry:
spin_unlock_irq(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
+ folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
NUMA_NO_NODE, NULL);
- if (!page) {
+ if (!folio) {
alloc_ok = false;
break;
}
- list_add(&page->lru, &surplus_list);
+ list_add(&folio->lru, &surplus_list);
cond_resched();
}
allocated += i;
@@ -2795,14 +2798,14 @@ static long vma_del_reservation(struct hstate *h,
/*
* This routine is called to restore reservation information on error paths.
- * It should ONLY be called for pages allocated via alloc_huge_page(), and
- * the hugetlb mutex should remain held when calling this routine.
+ * It should ONLY be called for folios allocated via alloc_hugetlb_folio(),
+ * and the hugetlb mutex should remain held when calling this routine.
*
* It handles two specific cases:
- * 1) A reservation was in place and the page consumed the reservation.
- * HPageRestoreReserve is set in the page.
- * 2) No reservation was in place for the page, so HPageRestoreReserve is
- * not set. However, alloc_huge_page always updates the reserve map.
+ * 1) A reservation was in place and the folio consumed the reservation.
+ * hugetlb_restore_reserve is set in the folio.
+ * 2) No reservation was in place for the page, so hugetlb_restore_reserve is
+ * not set. However, alloc_hugetlb_folio always updates the reserve map.
*
* In case 1, free_huge_page later in the error path will increment the
* global reserve count. But, free_huge_page does not have enough context
@@ -2811,27 +2814,27 @@ static long vma_del_reservation(struct hstate *h,
* reserve count adjustments to be made by free_huge_page. Make sure the
* reserve map indicates there is a reservation present.
*
- * In case 2, simply undo reserve map modifications done by alloc_huge_page.
+ * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio.
*/
void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
- unsigned long address, struct page *page)
+ unsigned long address, struct folio *folio)
{
long rc = vma_needs_reservation(h, vma, address);
- if (HPageRestoreReserve(page)) {
+ if (folio_test_hugetlb_restore_reserve(folio)) {
if (unlikely(rc < 0))
/*
* Rare out of memory condition in reserve map
- * manipulation. Clear HPageRestoreReserve so that
- * global reserve count will not be incremented
+ * manipulation. Clear hugetlb_restore_reserve so
+ * that global reserve count will not be incremented
* by free_huge_page. This will make it appear
- * as though the reservation for this page was
+ * as though the reservation for this folio was
* consumed. This may prevent the task from
- * faulting in the page at a later time. This
+ * faulting in the folio at a later time. This
* is better than inconsistent global huge page
* accounting of reserve counts.
*/
- ClearHPageRestoreReserve(page);
+ folio_clear_hugetlb_restore_reserve(folio);
else if (rc)
(void)vma_add_reservation(h, vma, address);
else
@@ -2840,9 +2843,9 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
if (!rc) {
/*
* This indicates there is an entry in the reserve map
- * not added by alloc_huge_page. We know it was added
- * before the alloc_huge_page call, otherwise
- * HPageRestoreReserve would be set on the page.
+ * not added by alloc_hugetlb_folio. We know it was added
+ * before the alloc_hugetlb_folio call, otherwise
+ * hugetlb_restore_reserve would be set on the folio.
* Remove the entry so that a subsequent allocation
* does not consume a reservation.
*/
@@ -2851,12 +2854,12 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
/*
* VERY rare out of memory condition. Since
* we can not delete the entry, set
- * HPageRestoreReserve so that the reserve
- * count will be incremented when the page
+ * hugetlb_restore_reserve so that the reserve
+ * count will be incremented when the folio
* is freed. This reserve will be consumed
* on a subsequent allocation.
*/
- SetHPageRestoreReserve(page);
+ folio_set_hugetlb_restore_reserve(folio);
} else if (rc < 0) {
/*
* Rare out of memory condition from
@@ -2872,12 +2875,12 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
/*
* For private mappings, no entry indicates
* a reservation is present. Since we can
- * not add an entry, set SetHPageRestoreReserve
- * on the page so reserve count will be
+ * not add an entry, set hugetlb_restore_reserve
+ * on the folio so reserve count will be
* incremented when freed. This reserve will
* be consumed on a subsequent allocation.
*/
- SetHPageRestoreReserve(page);
+ folio_set_hugetlb_restore_reserve(folio);
} else
/*
* No reservation present, do nothing
@@ -2922,12 +2925,15 @@ retry:
*/
goto free_new;
} else if (folio_ref_count(old_folio)) {
+ bool isolated;
+
/*
* Someone has grabbed the folio, try to isolate it here.
* Fail with -EBUSY if not possible.
*/
spin_unlock_irq(&hugetlb_lock);
- ret = isolate_hugetlb(&old_folio->page, list);
+ isolated = isolate_hugetlb(old_folio, list);
+ ret = isolated ? 0 : -EBUSY;
spin_lock_irq(&hugetlb_lock);
goto free_new;
} else if (!folio_test_hugetlb_freed(old_folio)) {
@@ -3002,7 +3008,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
if (hstate_is_gigantic(h))
return -ENOMEM;
- if (folio_ref_count(folio) && !isolate_hugetlb(&folio->page, list))
+ if (folio_ref_count(folio) && isolate_hugetlb(folio, list))
ret = 0;
else if (!folio_ref_count(folio))
ret = alloc_and_dissolve_hugetlb_folio(h, folio, list);
@@ -3010,17 +3016,16 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
return ret;
}
-struct page *alloc_huge_page(struct vm_area_struct *vma,
+struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
- struct page *page;
struct folio *folio;
long map_chg, map_commit;
long gbl_chg;
int ret, idx;
- struct hugetlb_cgroup *h_cg;
+ struct hugetlb_cgroup *h_cg = NULL;
bool deferred_reserve;
idx = hstate_index(h);
@@ -3079,34 +3084,34 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
* from the global free pool (global change). gbl_chg == 0 indicates
* a reservation exists for the allocation.
*/
- page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
- if (!page) {
+ folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg);
+ if (!folio) {
spin_unlock_irq(&hugetlb_lock);
- page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
- if (!page)
+ folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr);
+ if (!folio)
goto out_uncharge_cgroup;
spin_lock_irq(&hugetlb_lock);
if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
- SetHPageRestoreReserve(page);
+ folio_set_hugetlb_restore_reserve(folio);
h->resv_huge_pages--;
}
- list_add(&page->lru, &h->hugepage_activelist);
- set_page_refcounted(page);
+ list_add(&folio->lru, &h->hugepage_activelist);
+ folio_ref_unfreeze(folio, 1);
/* Fall through */
}
- folio = page_folio(page);
- hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
+
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio);
/* If allocation is not consuming a reservation, also store the
* hugetlb_cgroup pointer on the page.
*/
if (deferred_reserve) {
hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
- h_cg, page);
+ h_cg, folio);
}
spin_unlock_irq(&hugetlb_lock);
- hugetlb_set_page_subpool(page, spool);
+ hugetlb_set_folio_subpool(folio, spool);
map_commit = vma_commit_reservation(h, vma, addr);
if (unlikely(map_chg > map_commit)) {
@@ -3127,7 +3132,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
pages_per_huge_page(h), folio);
}
- return page;
+ return folio;
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
@@ -3494,7 +3499,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* First take pages out of surplus state. Then make up the
* remaining difference by allocating fresh huge pages.
*
- * We might race with alloc_surplus_huge_page() here and be unable
+ * We might race with alloc_surplus_hugetlb_folio() here and be unable
* to convert a surplus huge page to a normal huge page. That is
* not critical, though, it just means the overall size of the
* pool might be one hugepage larger than it needs to be, but
@@ -3537,7 +3542,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* By placing pages into the surplus state independent of the
* overcommit value, we are allowing the surplus pool size to
* exceed overcommit. There are few sane options here. Since
- * alloc_surplus_huge_page() is checking the global counter,
+ * alloc_surplus_hugetlb_folio() is checking the global counter,
* though, we'll note that we're not allowed to exceed surplus
* and won't grow the pool anywhere else. Not until one of the
* sysctls are changed, or the surplus pages go out of use.
@@ -3576,12 +3581,12 @@ out:
return 0;
}
-static int demote_free_huge_page(struct hstate *h, struct page *page)
+static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
{
- int i, nid = page_to_nid(page);
+ int i, nid = folio_nid(folio);
struct hstate *target_hstate;
- struct folio *folio = page_folio(page);
struct page *subpage;
+ struct folio *inner_folio;
int rc = 0;
target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
@@ -3589,18 +3594,18 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
remove_hugetlb_folio_for_demote(h, folio, false);
spin_unlock_irq(&hugetlb_lock);
- rc = hugetlb_vmemmap_restore(h, page);
+ rc = hugetlb_vmemmap_restore(h, &folio->page);
if (rc) {
- /* Allocation of vmemmmap failed, we can not demote page */
+ /* Allocation of vmemmap failed, we cannot demote the folio */
spin_lock_irq(&hugetlb_lock);
- set_page_refcounted(page);
- add_hugetlb_folio(h, page_folio(page), false);
+ folio_ref_unfreeze(folio, 1);
+ add_hugetlb_folio(h, folio, false);
return rc;
}
/*
* Use destroy_compound_hugetlb_folio_for_demote for all huge page
- * sizes as it will not ref count pages.
+ * sizes as it will not ref count folios.
*/
destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h));
@@ -3615,15 +3620,15 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
mutex_lock(&target_hstate->resize_lock);
for (i = 0; i < pages_per_huge_page(h);
i += pages_per_huge_page(target_hstate)) {
- subpage = nth_page(page, i);
- folio = page_folio(subpage);
+ subpage = folio_page(folio, i);
+ inner_folio = page_folio(subpage);
if (hstate_is_gigantic(target_hstate))
- prep_compound_gigantic_folio_for_demote(folio,
+ prep_compound_gigantic_folio_for_demote(inner_folio,
target_hstate->order);
else
prep_compound_page(subpage, target_hstate->order);
- set_page_private(subpage, 0);
- prep_new_hugetlb_folio(target_hstate, folio, nid);
+ folio_change_private(inner_folio, NULL);
+ prep_new_hugetlb_folio(target_hstate, inner_folio, nid);
free_huge_page(subpage);
}
mutex_unlock(&target_hstate->resize_lock);
@@ -3645,7 +3650,7 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
__must_hold(&hugetlb_lock)
{
int nr_nodes, node;
- struct page *page;
+ struct folio *folio;
lockdep_assert_held(&hugetlb_lock);
@@ -3656,11 +3661,10 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
}
for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
- list_for_each_entry(page, &h->hugepage_freelists[node], lru) {
- if (PageHWPoison(page))
+ list_for_each_entry(folio, &h->hugepage_freelists[node], lru) {
+ if (folio_test_hwpoison(folio))
continue;
-
- return demote_free_huge_page(h, page);
+ return demote_free_hugetlb_folio(h, folio);
}
}
@@ -4834,6 +4838,25 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
{
if (addr & ~(huge_page_mask(hstate_vma(vma))))
return -EINVAL;
+
+ /*
+ * PMD sharing is only possible for PUD_SIZE-aligned address ranges
+ * in HugeTLB VMAs. If this split would lose PUD_SIZE alignment,
+ * unshare PMDs in the PUD_SIZE interval surrounding addr now.
+ */
+ if (addr & ~PUD_MASK) {
+ /*
+ * hugetlb_vm_op_split is called right before we attempt to
+ * split the VMA. We will need to unshare PMDs in the old and
+ * new VMAs, so let's unshare before we split.
+ */
+ unsigned long floor = addr & PUD_MASK;
+ unsigned long ceil = floor + PUD_SIZE;
+
+ if (floor >= vma->vm_start && ceil <= vma->vm_end)
+ hugetlb_unshare_pmds(vma, floor, ceil);
+ }
+
return 0;
}
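hugetlb_vm_op_split() now unshares PMDs when a VMA is about to be split at an address that is not PUD-aligned, restricted to the PUD_SIZE window that is fully contained in the VMA. The arithmetic as a standalone sketch; PUD_SIZE is assumed to be 1 GiB here (as with 4 KiB pages on x86-64), while the kernel derives it from the page-table layout.

#include <stdint.h>
#include <stdio.h>

#define PUD_SIZE (1UL << 30)
#define PUD_MASK (~(PUD_SIZE - 1))

int main(void)
{
    uint64_t vm_start = 3UL << 30;                  /* 3 GiB */
    uint64_t vm_end   = 6UL << 30;                  /* 6 GiB */
    uint64_t addr     = (4UL << 30) + (2UL << 20);  /* 4 GiB + 2 MiB: unaligned */

    if (addr & ~PUD_MASK) {                         /* split point not PUD-aligned */
        uint64_t floor = addr & PUD_MASK;
        uint64_t ceil  = floor + PUD_SIZE;

        if (floor >= vm_start && ceil <= vm_end)
            printf("unshare PMDs in [%#llx, %#llx)\n",
                   (unsigned long long)floor, (unsigned long long)ceil);
    }
    return 0;
}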
@@ -4925,14 +4948,14 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
}
static void
-hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
- struct page *new_page)
+hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
+ struct folio *new_folio)
{
- __SetPageUptodate(new_page);
- hugepage_add_new_anon_rmap(new_page, vma, addr);
- set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
+ __folio_mark_uptodate(new_folio);
+ hugepage_add_new_anon_rmap(new_folio, vma, addr);
+ set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, &new_folio->page, 1));
hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
- SetHPageMigratable(new_page);
+ folio_set_hugetlb_migratable(new_folio);
}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
@@ -4951,7 +4974,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
int ret = 0;
if (cow) {
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src,
src_vma->vm_start,
src_vma->vm_end);
mmu_notifier_invalidate_range_start(&range);
@@ -4960,7 +4983,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
} else {
/*
* For shared mappings the vma lock must be held before
- * calling huge_pte_offset in the src vma. Otherwise, the
+ * calling hugetlb_walk() in the src vma. Otherwise, the
* returned ptep could go away if part of a shared pmd and
* another thread calls huge_pmd_unshare.
*/
@@ -4970,7 +4993,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
last_addr_mask = hugetlb_mask_last_page(h);
for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
- src_pte = huge_pte_offset(src, addr, sz);
+ src_pte = hugetlb_walk(src_vma, addr, sz);
if (!src_pte) {
addr |= last_addr_mask;
continue;
@@ -5030,6 +5053,9 @@ again:
entry = huge_pte_clear_uffd_wp(entry);
set_huge_pte_at(dst, addr, dst_pte, entry);
} else if (unlikely(is_pte_marker(entry))) {
+ /* No swap on hugetlb */
+ WARN_ON_ONCE(
+ is_swapin_error_entry(pte_to_swp_entry(entry)));
/*
* We copy the pte marker only if the dst vma has
* uffd-wp enabled.
@@ -5056,34 +5082,34 @@ again:
} else if (page_try_dup_anon_rmap(ptepage, true,
src_vma)) {
pte_t src_pte_old = entry;
- struct page *new;
+ struct folio *new_folio;
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
/* Do not use reserve as it's private owned */
- new = alloc_huge_page(dst_vma, addr, 1);
- if (IS_ERR(new)) {
+ new_folio = alloc_hugetlb_folio(dst_vma, addr, 1);
+ if (IS_ERR(new_folio)) {
put_page(ptepage);
- ret = PTR_ERR(new);
+ ret = PTR_ERR(new_folio);
break;
}
- copy_user_huge_page(new, ptepage, addr, dst_vma,
+ copy_user_huge_page(&new_folio->page, ptepage, addr, dst_vma,
npages);
put_page(ptepage);
- /* Install the new huge page if src pte stable */
+ /* Install the new hugetlb folio if src pte stable */
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
if (!pte_same(src_pte_old, entry)) {
restore_reserve_on_error(h, dst_vma, addr,
- new);
- put_page(new);
+ new_folio);
+ folio_put(new_folio);
/* huge_ptep of dst_pte won't change as in child */
goto again;
}
- hugetlb_install_page(dst_vma, dst_pte, addr, new);
+ hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
continue;
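In the copy_hugetlb_page_range() fallback above, both page-table locks are dropped, a new hugetlb folio is allocated and copied, and the copy is installed only if the source PTE still matches the snapshotted value (pte_same()), retrying otherwise. A stripped-down model of that optimistic retry, with a plain int and a mutex standing in for the PTE and its page-table lock.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;
static int src_pte = 42; /* pretend PTE value */

static int copy_one(void)
{
    int snapshot, copy;

again:
    pthread_mutex_lock(&ptl);
    snapshot = src_pte;
    pthread_mutex_unlock(&ptl);

    copy = snapshot; /* "allocate and copy" without the lock held */

    pthread_mutex_lock(&ptl);
    if (src_pte != snapshot) {   /* source changed under us: retry */
        pthread_mutex_unlock(&ptl);
        goto again;
    }
    /* install the copy while the source is known to be stable */
    pthread_mutex_unlock(&ptl);
    return copy;
}

int main(void)
{
    printf("copied value: %d\n", copy_one());
    return 0;
}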
@@ -5131,7 +5157,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
/*
* We don't have to worry about the ordering of src and dst ptlocks
- * because exclusive mmap_sem (or the i_mmap_lock) prevents deadlock.
+ * because exclusive mmap_lock (or the i_mmap_lock) prevents deadlock.
*/
if (src_ptl != dst_ptl)
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -5159,7 +5185,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
struct mmu_notifier_range range;
bool shared_pmd = false;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr,
old_end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
/*
@@ -5174,7 +5200,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(mapping);
for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
- src_pte = huge_pte_offset(mm, old_addr, sz);
+ src_pte = hugetlb_walk(vma, old_addr, sz);
if (!src_pte) {
old_addr |= last_addr_mask;
new_addr |= last_addr_mask;
@@ -5237,7 +5263,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
last_addr_mask = hugetlb_mask_last_page(h);
address = start;
for (; address < end; address += sz) {
- ptep = huge_pte_offset(mm, address, sz);
+ ptep = hugetlb_walk(vma, address, sz);
if (!ptep) {
address |= last_addr_mask;
continue;
@@ -5373,7 +5399,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
struct mmu_notifier_range range;
struct mmu_gather tlb;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
start, end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
mmu_notifier_invalidate_range_start(&range);
@@ -5449,12 +5475,13 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
*/
static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, unsigned int flags,
- struct page *pagecache_page, spinlock_t *ptl)
+ struct folio *pagecache_folio, spinlock_t *ptl)
{
const bool unshare = flags & FAULT_FLAG_UNSHARE;
pte_t pte;
struct hstate *h = hstate_vma(vma);
- struct page *old_page, *new_page;
+ struct page *old_page;
+ struct folio *new_folio;
int outside_reserve = 0;
vm_fault_t ret = 0;
unsigned long haddr = address & huge_page_mask(h);
@@ -5505,7 +5532,7 @@ retry_avoidcopy:
* of the full address range.
*/
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
- old_page != pagecache_page)
+ page_folio(old_page) != pagecache_folio)
outside_reserve = 1;
get_page(old_page);
@@ -5515,9 +5542,9 @@ retry_avoidcopy:
* be acquired again before returning to the caller, as expected.
*/
spin_unlock(ptl);
- new_page = alloc_huge_page(vma, haddr, outside_reserve);
+ new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve);
- if (IS_ERR(new_page)) {
+ if (IS_ERR(new_folio)) {
/*
* If a process owning a MAP_PRIVATE mapping fails to COW,
* it is due to references held by a child and an insufficient
@@ -5550,7 +5577,7 @@ retry_avoidcopy:
mutex_lock(&hugetlb_fault_mutex_table[hash]);
hugetlb_vma_lock_read(vma);
spin_lock(ptl);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
+ ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
if (likely(ptep &&
pte_same(huge_ptep_get(ptep), pte)))
goto retry_avoidcopy;
@@ -5562,7 +5589,7 @@ retry_avoidcopy:
return 0;
}
- ret = vmf_error(PTR_ERR(new_page));
+ ret = vmf_error(PTR_ERR(new_folio));
goto out_release_old;
}
@@ -5575,11 +5602,11 @@ retry_avoidcopy:
goto out_release_all;
}
- copy_user_huge_page(new_page, old_page, address, vma,
+ copy_user_huge_page(&new_folio->page, old_page, address, vma,
pages_per_huge_page(h));
- __SetPageUptodate(new_page);
+ __folio_mark_uptodate(new_folio);
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr,
haddr + huge_page_size(h));
mmu_notifier_invalidate_range_start(&range);
@@ -5588,18 +5615,18 @@ retry_avoidcopy:
* before the page tables are altered
*/
spin_lock(ptl);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
+ ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
/* Break COW or unshare */
huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, range.start, range.end);
page_remove_rmap(old_page, vma, true);
- hugepage_add_new_anon_rmap(new_page, vma, haddr);
+ hugepage_add_new_anon_rmap(new_folio, vma, haddr);
set_huge_pte_at(mm, haddr, ptep,
- make_huge_pte(vma, new_page, !unshare));
- SetHPageMigratable(new_page);
+ make_huge_pte(vma, &new_folio->page, !unshare));
+ folio_set_hugetlb_migratable(new_folio);
/* Make the old page be freed below */
- new_page = old_page;
+ new_folio = page_folio(old_page);
}
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
@@ -5608,9 +5635,9 @@ out_release_all:
* No restore in case of successful pagetable update (Break COW or
* unshare)
*/
- if (new_page != old_page)
- restore_reserve_on_error(h, vma, haddr, new_page);
- put_page(new_page);
+ if (new_folio != page_folio(old_page))
+ restore_reserve_on_error(h, vma, haddr, new_folio);
+ folio_put(new_folio);
out_release_old:
put_page(old_page);
@@ -5627,23 +5654,20 @@ out_release_old:
static bool hugetlbfs_pagecache_present(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
{
- struct address_space *mapping;
- pgoff_t idx;
- struct page *page;
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t idx = vma_hugecache_offset(h, vma, address);
+ bool present;
- mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, address);
+ rcu_read_lock();
+ present = page_cache_next_miss(mapping, idx, 1) != idx;
+ rcu_read_unlock();
- page = find_get_page(mapping, idx);
- if (page)
- put_page(page);
- return page != NULL;
+ return present;
}
-int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping,
+int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
pgoff_t idx)
{
- struct folio *folio = page_folio(page);
struct inode *inode = mapping->host;
struct hstate *h = hstate_inode(inode);
int err;
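The hugetlbfs_pagecache_present() rewrite above answers "is there a page at this index?" without taking and dropping a reference: under rcu_read_lock() it asks whether page_cache_next_miss(mapping, idx, 1) differs from idx. A rough userspace model of that "probe without acquiring" idea; a sorted lookup over a tiny array stands in for the page cache, and page_cache_next_miss() semantics are only mimicked, not reproduced.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static const long cached[] = { 3, 4, 7 }; /* indices present in "the cache" */

static bool cache_has(long idx)
{
    for (size_t i = 0; i < sizeof(cached) / sizeof(cached[0]); i++)
        if (cached[i] == idx)
            return true;
    return false;
}

/* First index >= idx that is NOT present, like page_cache_next_miss(). */
static long next_miss(long idx)
{
    while (cache_has(idx))
        idx++;
    return idx;
}

int main(void)
{
    long idx = 4;

    /* Present exactly when the next miss lies past idx. */
    printf("index %ld present: %s\n", idx,
           next_miss(idx) != idx ? "yes" : "no");
    printf("index %d present: %s\n", 5, next_miss(5) != 5 ? "yes" : "no");
    return 0;
}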
@@ -5655,7 +5679,7 @@ int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping,
__folio_clear_locked(folio);
return err;
}
- ClearHPageRestoreReserve(page);
+ folio_clear_hugetlb_restore_reserve(folio);
/*
* mark folio dirty so that it will not be removed from cache/file
@@ -5731,11 +5755,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
vm_fault_t ret = VM_FAULT_SIGBUS;
int anon_rmap = 0;
unsigned long size;
- struct page *page;
+ struct folio *folio;
pte_t new_pte;
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
- bool new_page, new_pagecache_page = false;
+ bool new_folio, new_pagecache_folio = false;
u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
/*
@@ -5754,9 +5778,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* Use page lock to guard against racing truncation
* before we get page_table_lock.
*/
- new_page = false;
- page = find_lock_page(mapping, idx);
- if (!page) {
+ new_folio = false;
+ folio = filemap_lock_folio(mapping, idx);
+ if (!folio) {
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto out;
@@ -5789,8 +5813,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
VM_UFFD_MISSING);
}
- page = alloc_huge_page(vma, haddr, 0);
- if (IS_ERR(page)) {
+ folio = alloc_hugetlb_folio(vma, haddr, 0);
+ if (IS_ERR(folio)) {
/*
* Returning error will result in faulting task being
* sent SIGBUS. The hugetlb fault mutex prevents two
@@ -5804,17 +5828,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* sure there really is no pte entry.
*/
if (hugetlb_pte_stable(h, mm, ptep, old_pte))
- ret = vmf_error(PTR_ERR(page));
+ ret = vmf_error(PTR_ERR(folio));
else
ret = 0;
goto out;
}
- clear_huge_page(page, address, pages_per_huge_page(h));
- __SetPageUptodate(page);
- new_page = true;
+ clear_huge_page(&folio->page, address, pages_per_huge_page(h));
+ __folio_mark_uptodate(folio);
+ new_folio = true;
if (vma->vm_flags & VM_MAYSHARE) {
- int err = hugetlb_add_to_page_cache(page, mapping, idx);
+ int err = hugetlb_add_to_page_cache(folio, mapping, idx);
if (err) {
/*
* err can't be -EEXIST which implies someone
@@ -5823,13 +5847,13 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* to the page cache. So it's safe to call
* restore_reserve_on_error() here.
*/
- restore_reserve_on_error(h, vma, haddr, page);
- put_page(page);
+ restore_reserve_on_error(h, vma, haddr, folio);
+ folio_put(folio);
goto out;
}
- new_pagecache_page = true;
+ new_pagecache_folio = true;
} else {
- lock_page(page);
+ folio_lock(folio);
if (unlikely(anon_vma_prepare(vma))) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
@@ -5842,7 +5866,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* don't have hwpoisoned swap entry for errored virtual address.
* So we need to block hugepage fault by PG_hwpoison bit check.
*/
- if (unlikely(PageHWPoison(page))) {
+ if (unlikely(folio_test_hwpoison(folio))) {
ret = VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
goto backout_unlocked;
@@ -5850,8 +5874,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
/* Check for page in userfault range. */
if (userfaultfd_minor(vma)) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
/* See comment in userfaultfd_missing() block above */
if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
ret = 0;
@@ -5885,36 +5909,36 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
goto backout;
if (anon_rmap)
- hugepage_add_new_anon_rmap(page, vma, haddr);
+ hugepage_add_new_anon_rmap(folio, vma, haddr);
else
- page_dup_file_rmap(page, true);
- new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+ page_dup_file_rmap(&folio->page, true);
+ new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
/*
* If this pte was previously wr-protected, keep it wr-protected even
* if populated.
*/
if (unlikely(pte_marker_uffd_wp(old_pte)))
- new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
+ new_pte = huge_pte_mkuffd_wp(new_pte);
set_huge_pte_at(mm, haddr, ptep, new_pte);
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl);
+ ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl);
}
spin_unlock(ptl);
/*
- * Only set HPageMigratable in newly allocated pages. Existing pages
- * found in the pagecache may not have HPageMigratableset if they have
+ * Only set hugetlb_migratable in newly allocated pages. Existing pages
+ * found in the pagecache may not have hugetlb_migratable set if they have
* been isolated for migration.
*/
- if (new_page)
- SetHPageMigratable(page);
+ if (new_folio)
+ folio_set_hugetlb_migratable(folio);
- unlock_page(page);
+ folio_unlock(folio);
out:
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -5923,11 +5947,11 @@ out:
backout:
spin_unlock(ptl);
backout_unlocked:
- if (new_page && !new_pagecache_page)
- restore_reserve_on_error(h, vma, haddr, page);
+ if (new_folio && !new_pagecache_folio)
+ restore_reserve_on_error(h, vma, haddr, folio);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
goto out;
}
@@ -5964,28 +5988,12 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
u32 hash;
pgoff_t idx;
struct page *page = NULL;
- struct page *pagecache_page = NULL;
+ struct folio *pagecache_folio = NULL;
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
int need_wait_lock = 0;
unsigned long haddr = address & huge_page_mask(h);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
- if (ptep) {
- /*
- * Since we hold no locks, ptep could be stale. That is
- * OK as we are only making decisions based on content and
- * not actually modifying content here.
- */
- entry = huge_ptep_get(ptep);
- if (unlikely(is_hugetlb_entry_migration(entry))) {
- migration_entry_wait_huge(vma, ptep);
- return 0;
- } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
- return VM_FAULT_HWPOISON_LARGE |
- VM_FAULT_SET_HINDEX(hstate_index(h));
- }
-
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
@@ -6000,10 +6008,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* Acquire vma lock before calling huge_pte_alloc and hold
* until finished with ptep. This prevents huge_pmd_unshare from
* being called elsewhere and making the ptep no longer valid.
- *
- * ptep could have already be assigned via huge_pte_offset. That
- * is OK, as huge_pte_alloc will return the same value unless
- * something has changed.
*/
hugetlb_vma_lock_read(vma);
ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
@@ -6032,8 +6036,23 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
* properly handle it.
*/
- if (!pte_present(entry))
+ if (!pte_present(entry)) {
+ if (unlikely(is_hugetlb_entry_migration(entry))) {
+ /*
+ * Release the hugetlb fault lock now, but retain
+ * the vma lock, because it is needed to guard the
+ * huge_pte_lockptr() later in
+ * migration_entry_wait_huge(). The vma lock will
+ * be released there.
+ */
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ migration_entry_wait_huge(vma, ptep);
+ return 0;
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ ret = VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
goto out_mutex;
+ }
/*
* If we are going to COW/unshare the mapping later, we examine the
@@ -6051,7 +6070,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, haddr);
- pagecache_page = find_lock_page(mapping, idx);
+ pagecache_folio = filemap_lock_folio(mapping, idx);
}
ptl = huge_pte_lock(h, mm, ptep);
@@ -6071,9 +6090,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
};
spin_unlock(ptl);
- if (pagecache_page) {
- unlock_page(pagecache_page);
- put_page(pagecache_page);
+ if (pagecache_folio) {
+ folio_unlock(pagecache_folio);
+ folio_put(pagecache_folio);
}
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -6082,11 +6101,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* hugetlb_wp() requires page locks of pte_page(entry) and
- * pagecache_page, so here we need take the former one
- * when page != pagecache_page or !pagecache_page.
+ * pagecache_folio, so here we need to take the former one
+ * when page != pagecache_folio or !pagecache_folio.
*/
page = pte_page(entry);
- if (page != pagecache_page)
+ if (page_folio(page) != pagecache_folio)
if (!trylock_page(page)) {
need_wait_lock = 1;
goto out_ptl;
@@ -6097,7 +6116,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
if (!huge_pte_write(entry)) {
ret = hugetlb_wp(mm, vma, address, ptep, flags,
- pagecache_page, ptl);
+ pagecache_folio, ptl);
goto out_put_page;
} else if (likely(flags & FAULT_FLAG_WRITE)) {
entry = huge_pte_mkdirty(entry);
@@ -6108,15 +6127,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
flags & FAULT_FLAG_WRITE))
update_mmu_cache(vma, haddr, ptep);
out_put_page:
- if (page != pagecache_page)
+ if (page_folio(page) != pagecache_folio)
unlock_page(page);
put_page(page);
out_ptl:
spin_unlock(ptl);
- if (pagecache_page) {
- unlock_page(pagecache_page);
- put_page(pagecache_page);
+ if (pagecache_folio) {
+ folio_unlock(pagecache_folio);
+ folio_put(pagecache_folio);
}
out_mutex:
hugetlb_vma_unlock_read(vma);
@@ -6156,16 +6175,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
pte_t _dst_pte;
spinlock_t *ptl;
int ret = -ENOMEM;
- struct page *page;
+ struct folio *folio;
int writable;
- bool page_in_pagecache = false;
+ bool folio_in_pagecache = false;
if (is_continue) {
ret = -EFAULT;
- page = find_lock_page(mapping, idx);
- if (!page)
+ folio = filemap_lock_folio(mapping, idx);
+ if (!folio)
goto out;
- page_in_pagecache = true;
+ folio_in_pagecache = true;
} else if (!*pagep) {
/* If a page already exists, then it's UFFDIO_COPY for
* a non-missing case. Return -EEXIST.
@@ -6176,34 +6195,34 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
goto out;
}
- page = alloc_huge_page(dst_vma, dst_addr, 0);
- if (IS_ERR(page)) {
+ folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
+ if (IS_ERR(folio)) {
ret = -ENOMEM;
goto out;
}
- ret = copy_huge_page_from_user(page,
+ ret = copy_huge_page_from_user(&folio->page,
(const void __user *) src_addr,
pages_per_huge_page(h), false);
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
- /* Free the allocated page which may have
+ /* Free the allocated folio which may have
* consumed a reservation.
*/
- restore_reserve_on_error(h, dst_vma, dst_addr, page);
- put_page(page);
+ restore_reserve_on_error(h, dst_vma, dst_addr, folio);
+ folio_put(folio);
- /* Allocate a temporary page to hold the copied
+ /* Allocate a temporary folio to hold the copied
* contents.
*/
- page = alloc_huge_page_vma(h, dst_vma, dst_addr);
- if (!page) {
+ folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr);
+ if (!folio) {
ret = -ENOMEM;
goto out;
}
- *pagep = page;
+ *pagep = &folio->page;
/* Set the outparam pagep and return to the caller to
* copy the contents outside the lock. Don't free the
* page.
@@ -6219,25 +6238,25 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
goto out;
}
- page = alloc_huge_page(dst_vma, dst_addr, 0);
- if (IS_ERR(page)) {
+ folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
+ if (IS_ERR(folio)) {
put_page(*pagep);
ret = -ENOMEM;
*pagep = NULL;
goto out;
}
- copy_user_huge_page(page, *pagep, dst_addr, dst_vma,
+ copy_user_huge_page(&folio->page, *pagep, dst_addr, dst_vma,
pages_per_huge_page(h));
put_page(*pagep);
*pagep = NULL;
}
/*
- * The memory barrier inside __SetPageUptodate makes sure that
+ * The memory barrier inside __folio_mark_uptodate makes sure that
* preceding stores to the page contents become visible before
* the set_pte_at() write.
*/
- __SetPageUptodate(page);
+ __folio_mark_uptodate(folio);
/* Add shared, newly allocated pages to the page cache. */
if (vm_shared && !is_continue) {
@@ -6252,16 +6271,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
* hugetlb_fault_mutex_table that here must be held by
* the caller.
*/
- ret = hugetlb_add_to_page_cache(page, mapping, idx);
+ ret = hugetlb_add_to_page_cache(folio, mapping, idx);
if (ret)
goto out_release_nounlock;
- page_in_pagecache = true;
+ folio_in_pagecache = true;
}
ptl = huge_pte_lock(h, dst_mm, dst_pte);
ret = -EIO;
- if (PageHWPoison(page))
+ if (folio_test_hwpoison(folio))
goto out_release_unlock;
/*
@@ -6273,10 +6292,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
goto out_release_unlock;
- if (page_in_pagecache)
- page_dup_file_rmap(page, true);
+ if (folio_in_pagecache)
+ page_dup_file_rmap(&folio->page, true);
else
- hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
+ hugepage_add_new_anon_rmap(folio, dst_vma, dst_addr);
/*
* For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
@@ -6287,7 +6306,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
else
writable = dst_vma->vm_flags & VM_WRITE;
- _dst_pte = make_huge_pte(dst_vma, page, writable);
+ _dst_pte = make_huge_pte(dst_vma, &folio->page, writable);
/*
* Always mark UFFDIO_COPY page dirty; note that this may not be
* extremely important for hugetlbfs for now since swapping is not
@@ -6309,20 +6328,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
spin_unlock(ptl);
if (!is_continue)
- SetHPageMigratable(page);
+ folio_set_hugetlb_migratable(folio);
if (vm_shared || is_continue)
- unlock_page(page);
+ folio_unlock(folio);
ret = 0;
out:
return ret;
out_release_unlock:
spin_unlock(ptl);
if (vm_shared || is_continue)
- unlock_page(page);
+ folio_unlock(folio);
out_release_nounlock:
- if (!page_in_pagecache)
- restore_reserve_on_error(h, dst_vma, dst_addr, page);
- put_page(page);
+ if (!folio_in_pagecache)
+ restore_reserve_on_error(h, dst_vma, dst_addr, folio);
+ folio_put(folio);
goto out;
}
#endif /* CONFIG_USERFAULTFD */
@@ -6378,10 +6397,10 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
if (WARN_ON_ONCE(flags & FOLL_PIN))
return NULL;
-retry:
- pte = huge_pte_offset(mm, haddr, huge_page_size(h));
+ hugetlb_vma_lock_read(vma);
+ pte = hugetlb_walk(vma, haddr, huge_page_size(h));
if (!pte)
- return NULL;
+ goto out_unlock;
ptl = huge_pte_lock(h, mm, pte);
entry = huge_ptep_get(pte);
@@ -6401,19 +6420,11 @@ retry:
page = NULL;
goto out;
}
- } else {
- if (is_hugetlb_entry_migration(entry)) {
- spin_unlock(ptl);
- __migration_entry_wait_huge(pte, ptl);
- goto retry;
- }
- /*
- * hwpoisoned entry is treated as no_page_table in
- * follow_page_mask().
- */
}
out:
spin_unlock(ptl);
+out_unlock:
+ hugetlb_vma_unlock_read(vma);
return page;
}
@@ -6444,6 +6455,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
break;
}
+ hugetlb_vma_lock_read(vma);
/*
* Some archs (sparc64, sh*) have multiple pte_ts to
* each hugepage. We have to make sure we get the
@@ -6451,8 +6463,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
*
* Note that page table lock is not held when pte is null.
*/
- pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
- huge_page_size(h));
+ pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
+ huge_page_size(h));
if (pte)
ptl = huge_pte_lock(h, mm, pte);
absent = !pte || huge_pte_none(huge_ptep_get(pte));
@@ -6468,6 +6480,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
!hugetlbfs_pagecache_present(h, vma, vaddr)) {
if (pte)
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
remainder = 0;
break;
}
@@ -6489,6 +6502,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (pte)
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
+
if (flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
else if (unshare)
@@ -6551,6 +6566,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
remainder -= pages_per_huge_page(h);
i += pages_per_huge_page(h);
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
continue;
}
@@ -6580,6 +6596,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
flags))) {
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
remainder = 0;
err = -ENOMEM;
break;
@@ -6591,6 +6608,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
i += refs;
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
}
*nr_pages = remainder;
/*
@@ -6603,7 +6621,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
return i ? i : err;
}
-unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
{
@@ -6612,7 +6630,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
pte_t *ptep;
pte_t pte;
struct hstate *h = hstate_vma(vma);
- unsigned long pages = 0, psize = huge_page_size(h);
+ long pages = 0, psize = huge_page_size(h);
bool shared_pmd = false;
struct mmu_notifier_range range;
unsigned long last_addr_mask;
@@ -6625,7 +6643,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* range if PMD sharing is possible.
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
- 0, vma, mm, start, end);
+ 0, mm, start, end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
BUG_ON(address >= end);
@@ -6637,10 +6655,21 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
last_addr_mask = hugetlb_mask_last_page(h);
for (; address < end; address += psize) {
spinlock_t *ptl;
- ptep = huge_pte_offset(mm, address, psize);
+ ptep = hugetlb_walk(vma, address, psize);
if (!ptep) {
- address |= last_addr_mask;
- continue;
+ if (!uffd_wp) {
+ address |= last_addr_mask;
+ continue;
+ }
+ /*
+ * Userfaultfd wr-protect requires pgtable
+ * pre-allocations to install pte markers.
+ */
+ ptep = huge_pte_alloc(mm, vma, address, psize);
+ if (!ptep) {
+ pages = -ENOMEM;
+ break;
+ }
}
ptl = huge_pte_lock(h, mm, ptep);
if (huge_pmd_unshare(mm, vma, address, ptep)) {
@@ -6658,16 +6687,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
}
pte = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
- spin_unlock(ptl);
- continue;
- }
- if (unlikely(is_hugetlb_entry_migration(pte))) {
+ /* Nothing to do. */
+ } else if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
struct page *page = pfn_swap_entry_to_page(entry);
+ pte_t newpte = pte;
- if (!is_readable_migration_entry(entry)) {
- pte_t newpte;
-
+ if (is_writable_migration_entry(entry)) {
if (PageAnon(page))
entry = make_readable_exclusive_migration_entry(
swp_offset(entry));
@@ -6675,25 +6701,22 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
entry = make_readable_migration_entry(
swp_offset(entry));
newpte = swp_entry_to_pte(entry);
- if (uffd_wp)
- newpte = pte_swp_mkuffd_wp(newpte);
- else if (uffd_wp_resolve)
- newpte = pte_swp_clear_uffd_wp(newpte);
- set_huge_pte_at(mm, address, ptep, newpte);
pages++;
}
- spin_unlock(ptl);
- continue;
- }
- if (unlikely(pte_marker_uffd_wp(pte))) {
- /*
- * This is changing a non-present pte into a none pte,
- * no need for huge_ptep_modify_prot_start/commit().
- */
+
+ if (uffd_wp)
+ newpte = pte_swp_mkuffd_wp(newpte);
+ else if (uffd_wp_resolve)
+ newpte = pte_swp_clear_uffd_wp(newpte);
+ if (!pte_same(pte, newpte))
+ set_huge_pte_at(mm, address, ptep, newpte);
+ } else if (unlikely(is_pte_marker(pte))) {
+ /* No other markers apply for now. */
+ WARN_ON_ONCE(!pte_marker_uffd_wp(pte));
if (uffd_wp_resolve)
+ /* Safe to modify directly (non-present->none). */
huge_pte_clear(mm, address, ptep, psize);
- }
- if (!huge_pte_none(pte)) {
+ } else if (!huge_pte_none(pte)) {
pte_t old_pte;
unsigned int shift = huge_page_shift(hstate_vma(vma));
@@ -6701,7 +6724,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
pte = huge_pte_modify(old_pte, newprot);
pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
if (uffd_wp)
- pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte));
+ pte = huge_pte_mkuffd_wp(pte);
else if (uffd_wp_resolve)
pte = huge_pte_clear_uffd_wp(pte);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
@@ -6736,7 +6759,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
hugetlb_vma_unlock_write(vma);
mmu_notifier_invalidate_range_end(&range);
- return pages << h->order;
+ return pages > 0 ? (pages << h->order) : pages;
}
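With the return type now signed, a caller has to treat a negative value as an errno rather than as a page count. A minimal sketch of the check a caller needs (the surrounding caller is an assumption, not part of this patch):

	long pages;

	pages = hugetlb_change_protection(vma, start, end, newprot, cp_flags);
	if (pages < 0)
		return pages;	/* e.g. -ENOMEM from the uffd-wp pgtable pre-allocation */
	/* otherwise: number of base pages whose protection was changed */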
/* Return true if reservation was successful, false otherwise. */
@@ -6745,7 +6768,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- long chg, add = -1;
+ long chg = -1, add = -1;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
@@ -6850,7 +6873,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
/*
* pages in this range were added to the reserve
* map between region_chg and region_add. This
- * indicates a race with alloc_huge_page. Adjust
+ * indicates a race with alloc_hugetlb_folio. Adjust
* the subpool and reserve counts modified above
* based on the difference.
*/
@@ -6950,8 +6973,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
unsigned long s_end = sbase + PUD_SIZE;
/* Allow segments to share if only one is marked locked */
- unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
- unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
+ unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
+ unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
/*
* match the virtual addresses, permission and the alignment of the
@@ -7044,8 +7067,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
saddr = page_table_shareable(svma, vma, addr, idx);
if (saddr) {
- spte = huge_pte_offset(svma->vm_mm, saddr,
- vma_mmu_pagesize(svma));
+ spte = hugetlb_walk(svma, saddr,
+ vma_mmu_pagesize(svma));
if (spte) {
get_page(virt_to_page(spte));
break;
@@ -7231,36 +7254,36 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
* These functions are overwritable if your architecture needs its own
* behavior.
*/
-int isolate_hugetlb(struct page *page, struct list_head *list)
+bool isolate_hugetlb(struct folio *folio, struct list_head *list)
{
- int ret = 0;
+ bool ret = true;
spin_lock_irq(&hugetlb_lock);
- if (!PageHeadHuge(page) ||
- !HPageMigratable(page) ||
- !get_page_unless_zero(page)) {
- ret = -EBUSY;
+ if (!folio_test_hugetlb(folio) ||
+ !folio_test_hugetlb_migratable(folio) ||
+ !folio_try_get(folio)) {
+ ret = false;
goto unlock;
}
- ClearHPageMigratable(page);
- list_move_tail(&page->lru, list);
+ folio_clear_hugetlb_migratable(folio);
+ list_move_tail(&folio->lru, list);
unlock:
spin_unlock_irq(&hugetlb_lock);
return ret;
}
-int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison)
+int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison)
{
int ret = 0;
*hugetlb = false;
spin_lock_irq(&hugetlb_lock);
- if (PageHeadHuge(page)) {
+ if (folio_test_hugetlb(folio)) {
*hugetlb = true;
- if (HPageFreed(page))
+ if (folio_test_hugetlb_freed(folio))
ret = 0;
- else if (HPageMigratable(page) || unpoison)
- ret = get_page_unless_zero(page);
+ else if (folio_test_hugetlb_migratable(folio) || unpoison)
+ ret = folio_try_get(folio);
else
ret = -EBUSY;
}
@@ -7279,13 +7302,13 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
return ret;
}
-void putback_active_hugepage(struct page *page)
+void folio_putback_active_hugetlb(struct folio *folio)
{
spin_lock_irq(&hugetlb_lock);
- SetHPageMigratable(page);
- list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
+ folio_set_hugetlb_migratable(folio);
+ list_move_tail(&folio->lru, &(folio_hstate(folio))->hugepage_activelist);
spin_unlock_irq(&hugetlb_lock);
- put_page(page);
+ folio_put(folio);
}
void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason)
@@ -7328,26 +7351,21 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
}
}
-/*
- * This function will unconditionally remove all the shared pmd pgtable entries
- * within the specific vma for a hugetlbfs memory range.
- */
-void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end)
{
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
- unsigned long address, start, end;
+ unsigned long address;
spinlock_t *ptl;
pte_t *ptep;
if (!(vma->vm_flags & VM_MAYSHARE))
return;
- start = ALIGN(vma->vm_start, PUD_SIZE);
- end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
-
if (start >= end)
return;
@@ -7356,13 +7374,13 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
* No need to call adjust_range_if_pmd_sharing_possible(), because
* we have already done the PUD_SIZE alignment.
*/
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (address = start; address < end; address += PUD_SIZE) {
- ptep = huge_pte_offset(mm, address, sz);
+ ptep = hugetlb_walk(vma, address, sz);
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
@@ -7379,6 +7397,16 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
mmu_notifier_invalidate_range_end(&range);
}
+/*
+ * This function will unconditionally remove all the shared pmd pgtable entries
+ * within the specific vma for a hugetlbfs memory range.
+ */
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+{
+ hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
+ ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+}
+
#ifdef CONFIG_CMA
static bool cma_reserve_called __initdata;
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index d9e4425d81ac..dedd2edb076e 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -331,19 +331,15 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
- struct page *page)
+ struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
__hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false);
}
void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
- struct page *page)
+ struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
__hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true);
}
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 45e93a545dd7..a559037cce00 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -581,7 +581,7 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = {
{
.procname = "hugetlb_optimize_vmemmap",
.data = &vmemmap_optimize_enabled,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(vmemmap_optimize_enabled),
.mode = 0644,
.proc_handler = proc_dobool,
},
diff --git a/mm/internal.h b/mm/internal.h
index bcf75a8b032d..7920a8b7982e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -24,7 +24,7 @@ struct folio_batch;
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
__GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
- __GFP_ATOMIC|__GFP_NOLOCKDEP)
+ __GFP_NOLOCKDEP)
/* The GFP flags allowed during early boot */
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
@@ -52,6 +52,24 @@ struct folio_batch;
void page_writeback_init(void);
+/*
+ * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages,
+ * its nr_pages_mapped would be 0x400000: choose the COMPOUND_MAPPED bit
+ * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently
+ * leaves nr_pages_mapped at 0, but avoid surprise if it participates later.
+ */
+#define COMPOUND_MAPPED 0x800000
+#define FOLIO_PAGES_MAPPED (COMPOUND_MAPPED - 1)
+
+/*
+ * How many individual pages have an elevated _mapcount. Excludes
+ * the folio's entire_mapcount.
+ */
+static inline int folio_nr_pages_mapped(struct folio *folio)
+{
+ return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED;
+}
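Spelling out the arithmetic behind the comment above: a 16GB hugetlb folio built from 4kB pages spans 16GB / 4kB = 2^22 = 0x400000 pages, so 0x800000 is the first bit that can never appear in a sum of per-page mapcounts, and FOLIO_PAGES_MAPPED (0x7fffff) masks it back off in folio_nr_pages_mapped().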
+
static inline void *folio_raw_mapping(struct folio *folio)
{
unsigned long mapping = (unsigned long)folio->mapping;
@@ -141,17 +159,6 @@ static inline bool folio_evictable(struct folio *folio)
return ret;
}
-static inline bool page_evictable(struct page *page)
-{
- bool ret;
-
- /* Prevent address_space of inode and swap cache from being freed */
- rcu_read_lock();
- ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
- rcu_read_unlock();
- return ret;
-}
-
/*
* Turn a non-refcounted page (->_refcount == 0) into refcounted with
* a count of one.
@@ -180,8 +187,8 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
/*
* in mm/vmscan.c:
*/
-int isolate_lru_page(struct page *page);
-int folio_isolate_lru(struct folio *folio);
+bool isolate_lru_page(struct page *page);
+bool folio_isolate_lru(struct folio *folio);
void putback_lru_page(struct page *page);
void folio_putback_lru(struct folio *folio);
extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
@@ -378,6 +385,25 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
int split_free_page(struct page *free_page,
unsigned int order, unsigned long split_pfn_offset);
+/*
+ * This will have no effect, other than possibly generating a warning, if the
+ * caller passes in a non-large folio.
+ */
+static inline void folio_set_order(struct folio *folio, unsigned int order)
+{
+ if (WARN_ON_ONCE(!folio_test_large(folio)))
+ return;
+
+ folio->_folio_order = order;
+#ifdef CONFIG_64BIT
+ /*
+ * When hugetlb dissolves a folio, we need to clear the tail
+ * page, rather than setting nr_pages to 1.
+ */
+ folio->_folio_nr_pages = order ? 1U << order : 0;
+#endif
+}
+
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/*
@@ -422,7 +448,11 @@ struct compact_control {
bool proactive_compaction; /* kcompactd proactive compaction */
bool whole_zone; /* Whole zone should/has been scanned */
bool contended; /* Signal lock contention */
- bool rescan; /* Rescanning the same pageblock */
+ bool finish_pageblock; /* Scan the remainder of a pageblock. Used
+ * when there are potentially transient
+ * isolation or migration failures to
+ * ensure forward progress.
+ */
bool alloc_contig; /* alloc_contig_range allocation */
};
@@ -492,14 +522,13 @@ extern long faultin_vma_page_range(struct vm_area_struct *vma,
extern int mlock_future_check(struct mm_struct *mm, unsigned long flags,
unsigned long len);
/*
- * mlock_vma_page() and munlock_vma_page():
+ * mlock_vma_folio() and munlock_vma_folio():
* should be called with vma's mmap_lock held for read or write,
* under page table lock for the pte/pmd being added or removed.
*
- * mlock is usually called at the end of page_add_*_rmap(),
- * munlock at the end of page_remove_rmap(); but new anon
- * pages are managed by lru_cache_add_inactive_or_unevictable()
- * calling mlock_new_page().
+ * mlock is usually called at the end of page_add_*_rmap(), munlock at
+ * the end of page_remove_rmap(); but new anon folios are managed by
+ * folio_add_lru_vma() calling mlock_new_folio().
*
* @compound is used to include pmd mappings of THPs, but filter out
* pte mappings of THPs, which cannot be consistently counted: a pte
@@ -522,24 +551,19 @@ static inline void mlock_vma_folio(struct folio *folio,
mlock_folio(folio);
}
-static inline void mlock_vma_page(struct page *page,
- struct vm_area_struct *vma, bool compound)
-{
- mlock_vma_folio(page_folio(page), vma, compound);
-}
-
-void munlock_page(struct page *page);
-static inline void munlock_vma_page(struct page *page,
+void munlock_folio(struct folio *folio);
+static inline void munlock_vma_folio(struct folio *folio,
struct vm_area_struct *vma, bool compound)
{
if (unlikely(vma->vm_flags & VM_LOCKED) &&
- (compound || !PageTransCompound(page)))
- munlock_page(page);
+ (compound || !folio_test_large(folio)))
+ munlock_folio(folio);
}
-void mlock_new_page(struct page *page);
-bool need_mlock_page_drain(int cpu);
-void mlock_page_drain_local(void);
-void mlock_page_drain_remote(int cpu);
+
+void mlock_new_folio(struct folio *folio);
+bool need_mlock_drain(int cpu);
+void mlock_drain_local(void);
+void mlock_drain_remote(int cpu);
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
@@ -624,14 +648,10 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
}
#else /* !CONFIG_MMU */
static inline void unmap_mapping_folio(struct folio *folio) { }
-static inline void mlock_vma_page(struct page *page,
- struct vm_area_struct *vma, bool compound) { }
-static inline void munlock_vma_page(struct page *page,
- struct vm_area_struct *vma, bool compound) { }
-static inline void mlock_new_page(struct page *page) { }
-static inline bool need_mlock_page_drain(int cpu) { return false; }
-static inline void mlock_page_drain_local(void) { }
-static inline void mlock_page_drain_remote(int cpu) { }
+static inline void mlock_new_folio(struct folio *folio) { }
+static inline bool need_mlock_drain(int cpu) { return false; }
+static inline void mlock_drain_local(void) { }
+static inline void mlock_drain_remote(int cpu) { }
static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
{
}
@@ -735,8 +755,13 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_OOM ALLOC_NO_WATERMARKS
#endif
-#define ALLOC_HARDER 0x10 /* try to alloc harder */
-#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
+#define ALLOC_NON_BLOCK 0x10 /* Caller cannot block. Allow access
+ * to 25% of the min watermark or
+ * 62.5% if __GFP_HIGH is set.
+ */
+#define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50%
+ * of the min watermark.
+ */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
#ifdef CONFIG_ZONE_DMA32
@@ -744,8 +769,12 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
#else
#define ALLOC_NOFRAGMENT 0x0
#endif
+#define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */
#define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
+/* Flags that allow allocations below the min watermark. */
+#define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM)
+
enum ttu_flags;
struct tlbflush_unmap_batch;
@@ -795,6 +824,12 @@ struct migration_target_control {
};
/*
+ * mm/filemap.c
+ */
+size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
+ struct folio *folio, loff_t fpos, size_t size);
+
+/*
* mm/vmalloc.c
*/
#ifdef CONFIG_MMU
@@ -827,6 +862,87 @@ int migrate_device_coherent_page(struct page *page);
* mm/gup.c
*/
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
+int __must_check try_grab_page(struct page *page, unsigned int flags);
+
+enum {
+ /* mark page accessed */
+ FOLL_TOUCH = 1 << 16,
+ /* a retry, previous pass started an IO */
+ FOLL_TRIED = 1 << 17,
+ /* we are working on non-current tsk/mm */
+ FOLL_REMOTE = 1 << 18,
+ /* pages must be released via unpin_user_page */
+ FOLL_PIN = 1 << 19,
+ /* gup_fast: prevent fall-back to slow gup */
+ FOLL_FAST_ONLY = 1 << 20,
+ /* allow unlocking the mmap lock */
+ FOLL_UNLOCKABLE = 1 << 21,
+};
+
+/*
+ * Indicates, for pages that are write-protected in the page table, whether
+ * GUP has to trigger unsharing via FAULT_FLAG_UNSHARE so that the GUP pin
+ * remains consistent with the pages mapped into the page tables of the MM.
+ *
+ * Temporary unmapping of PageAnonExclusive() pages or clearing of
+ * PageAnonExclusive() has to protect against concurrent GUP:
+ * * Ordinary GUP: Using the PT lock
+ * * GUP-fast and fork(): mm->write_protect_seq
+ * * GUP-fast and KSM or temporary unmapping (swap, migration): see
+ * page_try_share_anon_rmap()
+ *
+ * Must be called with the (sub)page that's actually referenced via the
+ * page table entry, which might not necessarily be the head page for a
+ * PTE-mapped THP.
+ *
+ * If the vma is NULL, we're coming from the GUP-fast path and might have
+ * to fall back to the slow path just to look up the vma.
+ */
+static inline bool gup_must_unshare(struct vm_area_struct *vma,
+ unsigned int flags, struct page *page)
+{
+ /*
+ * FOLL_WRITE is implicitly handled correctly as the page table entry
+ * has to be writable -- and if it references (part of) an anonymous
+ * folio, that part is required to be marked exclusive.
+ */
+ if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN)
+ return false;
+ /*
+ * Note: PageAnon(page) is stable until the page is actually getting
+ * freed.
+ */
+ if (!PageAnon(page)) {
+ /*
+ * We only care about R/O long-term pinning: R/O short-term
+ * pinning does not have the semantics to observe successive
+ * changes through the process page tables.
+ */
+ if (!(flags & FOLL_LONGTERM))
+ return false;
+
+ /* We really need the vma ... */
+ if (!vma)
+ return true;
+
+ /*
+ * ... because we only care about writable private ("COW")
+ * mappings where we have to break COW early.
+ */
+ return is_cow_mapping(vma->vm_flags);
+ }
+
+ /* Paired with a memory barrier in page_try_share_anon_rmap(). */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_rmb();
+
+ /*
+ * Note that PageKsm() pages cannot be exclusive, and consequently,
+ * cannot get pinned.
+ */
+ return !PageAnonExclusive(page);
+}
extern bool mirrored_kernelcore;
@@ -848,4 +964,82 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
return !(vma->vm_flags & VM_SOFTDIRTY);
}
+/*
+ * VMA Iterator functions shared between nommu and mmap
+ */
+static inline int vma_iter_prealloc(struct vma_iterator *vmi)
+{
+ return mas_preallocate(&vmi->mas, GFP_KERNEL);
+}
+
+static inline void vma_iter_clear(struct vma_iterator *vmi,
+ unsigned long start, unsigned long end)
+{
+ mas_set_range(&vmi->mas, start, end - 1);
+ mas_store_prealloc(&vmi->mas, NULL);
+}
+
+static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
+{
+ return mas_walk(&vmi->mas);
+}
+
+/* Store a VMA with preallocated memory */
+static inline void vma_iter_store(struct vma_iterator *vmi,
+ struct vm_area_struct *vma)
+{
+
+#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
+ if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.index > vma->vm_start)) {
+ printk("%lu > %lu\n", vmi->mas.index, vma->vm_start);
+ printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end);
+ printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last);
+ mt_dump(vmi->mas.tree);
+ }
+ if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.last < vma->vm_start)) {
+ printk("%lu < %lu\n", vmi->mas.last, vma->vm_start);
+ printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end);
+ printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last);
+ mt_dump(vmi->mas.tree);
+ }
+#endif
+
+ if (vmi->mas.node != MAS_START &&
+ ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
+ vma_iter_invalidate(vmi);
+
+ vmi->mas.index = vma->vm_start;
+ vmi->mas.last = vma->vm_end - 1;
+ mas_store_prealloc(&vmi->mas, vma);
+}
+
+static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
+ struct vm_area_struct *vma, gfp_t gfp)
+{
+ if (vmi->mas.node != MAS_START &&
+ ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
+ vma_iter_invalidate(vmi);
+
+ vmi->mas.index = vma->vm_start;
+ vmi->mas.last = vma->vm_end - 1;
+ mas_store_gfp(&vmi->mas, vma, gfp);
+ if (unlikely(mas_is_err(&vmi->mas)))
+ return -ENOMEM;
+
+ return 0;
+}
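A minimal usage sketch for these iterator helpers (hypothetical caller; VMA_ITERATOR() is assumed to be the usual declaration macro, and locking and cleanup are elided). The point of the split is that node allocation happens up front, so the later store cannot fail:

	VMA_ITERATOR(vmi, mm, vma->vm_start);

	if (vma_iter_prealloc(&vmi))
		return -ENOMEM;		/* mas_preallocate() failed */

	/* ... finish setting up vma->vm_start / vma->vm_end ... */

	vma_iter_store(&vmi, vma);	/* consumes the preallocated nodes */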
+
+/*
+ * VMA lock generalization
+ */
+struct vma_prepare {
+ struct vm_area_struct *vma;
+ struct vm_area_struct *adj_next;
+ struct file *file;
+ struct address_space *mapping;
+ struct anon_vma *anon_vma;
+ struct vm_area_struct *insert;
+ struct vm_area_struct *remove;
+ struct vm_area_struct *remove2;
+};
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index d4837bff3b60..7634dd2a6128 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -35,7 +35,14 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
-CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) -fno-builtin $(call cc-disable-warning, vla)
+CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) $(call cc-disable-warning, vla)
+ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
+# If compiler instruments memintrinsics by prefixing them with __asan/__hwasan,
+# we need to treat them normally (as builtins), otherwise the compiler won't
+# recognize them as instrumentable. If it doesn't instrument them, we need to
+# pass -fno-builtin, so the compiler doesn't inline them.
+CFLAGS_KASAN_TEST += -fno-builtin
+endif
CFLAGS_kasan_test.o := $(CFLAGS_KASAN_TEST)
CFLAGS_kasan_test_module.o := $(CFLAGS_KASAN_TEST)
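For orientation, a sketch (not part of the patch) of what memintrinsic prefixing means for an instrumented file:

	/* As written in an instrumented translation unit: */
	memcpy(dst, src, len);

	/*
	 * With CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX the compiler instead
	 * emits a call to the checked wrapper added to mm/kasan/shadow.c,
	 * roughly:
	 *
	 *	__asan_memcpy(dst, src, len);
	 *
	 * Without that support, the tests keep -fno-builtin so the compiler
	 * does not inline the mem*() calls past the KASAN checks.
	 */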
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 833bf2cfd2a3..b376a5d055e5 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -43,7 +43,7 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc)
unsigned int nr_entries;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
- return __stack_depot_save(entries, nr_entries, 0, flags, can_alloc);
+ return __stack_depot_save(entries, nr_entries, flags, can_alloc);
}
void kasan_set_track(struct kasan_track *track, gfp_t flags)
@@ -95,19 +95,24 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
}
#endif /* CONFIG_KASAN_STACK */
-void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init)
+bool __kasan_unpoison_pages(struct page *page, unsigned int order, bool init)
{
u8 tag;
unsigned long i;
if (unlikely(PageHighMem(page)))
- return;
+ return false;
+
+ if (!kasan_sample_page_alloc(order))
+ return false;
tag = kasan_random_tag();
kasan_unpoison(set_tag(page_address(page), tag),
PAGE_SIZE << order, init);
for (i = 0; i < (1 << order); i++)
page_kasan_tag_set(page + i, tag);
+
+ return true;
}
void __kasan_poison_pages(struct page *page, unsigned int order, bool init)
@@ -117,11 +122,6 @@ void __kasan_poison_pages(struct page *page, unsigned int order, bool init)
KASAN_PAGE_FREE, init);
}
-void __kasan_cache_create_kmalloc(struct kmem_cache *cache)
-{
- cache->kasan_info.is_kmalloc = true;
-}
-
void __kasan_poison_slab(struct slab *slab)
{
struct page *page = slab_page(slab);
@@ -246,6 +246,9 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object,
static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip)
{
+ if (!kasan_arch_is_ready())
+ return false;
+
if (ptr != page_address(virt_to_head_page(ptr))) {
kasan_report_invalid_free(ptr, ip, KASAN_REPORT_INVALID_FREE);
return true;
@@ -321,7 +324,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
kasan_unpoison(tagged_object, cache->object_size, init);
/* Save alloc info (if possible) for non-kmalloc() allocations. */
- if (kasan_stack_collection_enabled() && !cache->kasan_info.is_kmalloc)
+ if (kasan_stack_collection_enabled() && !is_kmalloc_cache(cache))
kasan_save_alloc_info(cache, tagged_object, flags);
return tagged_object;
@@ -367,7 +370,7 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache,
* Save alloc info (if possible) for kmalloc() allocations.
* This also rewrites the alloc info when called from kasan_krealloc().
*/
- if (kasan_stack_collection_enabled() && cache->kasan_info.is_kmalloc)
+ if (kasan_stack_collection_enabled() && is_kmalloc_cache(cache))
kasan_save_alloc_info(cache, (void *)object, flags);
/* Keep the tag that was set by kasan_slab_alloc(). */
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index b076f597a378..e5eef670735e 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -172,10 +172,8 @@ static __always_inline bool check_region_inline(unsigned long addr,
if (unlikely(addr + size < addr))
return !kasan_report(addr, size, write, ret_ip);
- if (unlikely((void *)addr <
- kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
+ if (unlikely(!addr_has_metadata((void *)addr)))
return !kasan_report(addr, size, write, ret_ip);
- }
if (likely(!memory_is_poisoned(addr, size)))
return true;
@@ -191,7 +189,12 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write,
bool kasan_byte_accessible(const void *addr)
{
- s8 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr));
+ s8 shadow_byte;
+
+ if (!kasan_arch_is_ready())
+ return true;
+
+ shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr));
return shadow_byte >= 0 && shadow_byte < KASAN_GRANULE_SIZE;
}
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index b22c4f461cb0..d1bcb0205327 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -59,6 +59,24 @@ EXPORT_SYMBOL_GPL(kasan_mode);
/* Whether to enable vmalloc tagging. */
DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc);
+#define PAGE_ALLOC_SAMPLE_DEFAULT 1
+#define PAGE_ALLOC_SAMPLE_ORDER_DEFAULT 3
+
+/*
+ * Sampling interval of page_alloc allocation (un)poisoning.
+ * Defaults to no sampling.
+ */
+unsigned long kasan_page_alloc_sample = PAGE_ALLOC_SAMPLE_DEFAULT;
+
+/*
+ * Minimum order of page_alloc allocations to be affected by sampling.
+ * The default value is chosen to match both
+ * PAGE_ALLOC_COSTLY_ORDER and SKB_FRAG_PAGE_ORDER.
+ */
+unsigned int kasan_page_alloc_sample_order = PAGE_ALLOC_SAMPLE_ORDER_DEFAULT;
+
+DEFINE_PER_CPU(long, kasan_page_alloc_skip);
+
/* kasan=off/on */
static int __init early_kasan_flag(char *arg)
{
@@ -122,6 +140,48 @@ static inline const char *kasan_mode_info(void)
return "sync";
}
+/* kasan.page_alloc.sample=<sampling interval> */
+static int __init early_kasan_flag_page_alloc_sample(char *arg)
+{
+ int rv;
+
+ if (!arg)
+ return -EINVAL;
+
+ rv = kstrtoul(arg, 0, &kasan_page_alloc_sample);
+ if (rv)
+ return rv;
+
+ if (!kasan_page_alloc_sample || kasan_page_alloc_sample > LONG_MAX) {
+ kasan_page_alloc_sample = PAGE_ALLOC_SAMPLE_DEFAULT;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+early_param("kasan.page_alloc.sample", early_kasan_flag_page_alloc_sample);
+
+/* kasan.page_alloc.sample.order=<minimum page order> */
+static int __init early_kasan_flag_page_alloc_sample_order(char *arg)
+{
+ int rv;
+
+ if (!arg)
+ return -EINVAL;
+
+ rv = kstrtouint(arg, 0, &kasan_page_alloc_sample_order);
+ if (rv)
+ return rv;
+
+ if (kasan_page_alloc_sample_order > INT_MAX) {
+ kasan_page_alloc_sample_order = PAGE_ALLOC_SAMPLE_ORDER_DEFAULT;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+early_param("kasan.page_alloc.sample.order", early_kasan_flag_page_alloc_sample_order);
+
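A usage sketch for the two boot parameters above (the values are arbitrary; assumes CONFIG_KASAN_HW_TAGS): booting with

	kasan.page_alloc.sample=100 kasan.page_alloc.sample.order=4

makes only every 100th page_alloc allocation of order 4 or higher go through tag poisoning/unpoisoning, while lower-order allocations are always handled.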
/*
* kasan_init_hw_tags_cpu() is called for each CPU.
* Not marked as __init as a CPU can be hot-plugged after boot.
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index ea8cf1310b1e..a61eeee3095a 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -42,6 +42,10 @@ enum kasan_mode {
extern enum kasan_mode kasan_mode __ro_after_init;
+extern unsigned long kasan_page_alloc_sample;
+extern unsigned int kasan_page_alloc_sample_order;
+DECLARE_PER_CPU(long, kasan_page_alloc_skip);
+
static inline bool kasan_vmalloc_enabled(void)
{
return static_branch_likely(&kasan_flag_vmalloc);
@@ -57,6 +61,24 @@ static inline bool kasan_sync_fault_possible(void)
return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM;
}
+static inline bool kasan_sample_page_alloc(unsigned int order)
+{
+ /* Fast-path for when sampling is disabled. */
+ if (kasan_page_alloc_sample == 1)
+ return true;
+
+ if (order < kasan_page_alloc_sample_order)
+ return true;
+
+ if (this_cpu_dec_return(kasan_page_alloc_skip) < 0) {
+ this_cpu_write(kasan_page_alloc_skip,
+ kasan_page_alloc_sample - 1);
+ return true;
+ }
+
+ return false;
+}
+
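The per-CPU countdown above yields exactly one checked allocation per kasan_page_alloc_sample eligible allocations on each CPU. A minimal user-space model of the counter (a sketch only; a single global stands in for the per-CPU variable):

	#include <stdio.h>

	static long sample_interval = 100;	/* kasan_page_alloc_sample */
	static long skip;			/* kasan_page_alloc_skip, starts at 0 */

	static int sample_page_alloc(void)
	{
		if (--skip < 0) {
			skip = sample_interval - 1;
			return 1;	/* poison/unpoison this allocation */
		}
		return 0;		/* skip the KASAN work for this one */
	}

	int main(void)
	{
		int i, checked = 0;

		for (i = 0; i < 1000; i++)
			checked += sample_page_alloc();
		printf("checked %d of 1000 allocations\n", checked);	/* prints 10 */
		return 0;
	}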
#else /* CONFIG_KASAN_HW_TAGS */
static inline bool kasan_async_fault_possible(void)
@@ -69,6 +91,11 @@ static inline bool kasan_sync_fault_possible(void)
return true;
}
+static inline bool kasan_sample_page_alloc(unsigned int order)
+{
+ return true;
+}
+
#endif /* CONFIG_KASAN_HW_TAGS */
#ifdef CONFIG_KASAN_GENERIC
@@ -180,6 +207,7 @@ struct kasan_report_info {
void *first_bad_addr;
struct kmem_cache *cache;
void *object;
+ size_t alloc_size;
/* Filled in by the mode-specific reporting code. */
const char *bug_type;
@@ -269,7 +297,7 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
<< KASAN_SHADOW_SCALE_SHIFT);
}
-static inline bool addr_has_metadata(const void *addr)
+static __always_inline bool addr_has_metadata(const void *addr)
{
return (kasan_reset_tag(addr) >=
kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
@@ -288,7 +316,7 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write,
#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
-static inline bool addr_has_metadata(const void *addr)
+static __always_inline bool addr_has_metadata(const void *addr)
{
return (is_vmalloc_addr(addr) || virt_addr_valid(addr));
}
@@ -296,6 +324,7 @@ static inline bool addr_has_metadata(const void *addr)
#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
void *kasan_find_first_bad_addr(void *addr, size_t size);
+size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache);
void kasan_complete_mode_report_info(struct kasan_report_info *info);
void kasan_metadata_fetch_row(char *buffer, void *row);
@@ -618,6 +647,10 @@ void __asan_set_shadow_f3(const void *addr, size_t size);
void __asan_set_shadow_f5(const void *addr, size_t size);
void __asan_set_shadow_f8(const void *addr, size_t size);
+void *__asan_memset(void *addr, int c, size_t len);
+void *__asan_memmove(void *dest, const void *src, size_t len);
+void *__asan_memcpy(void *dest, const void *src, size_t len);
+
void __hwasan_load1_noabort(unsigned long addr);
void __hwasan_store1_noabort(unsigned long addr);
void __hwasan_load2_noabort(unsigned long addr);
@@ -633,4 +666,8 @@ void __hwasan_storeN_noabort(unsigned long addr, size_t size);
void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size);
+void *__hwasan_memset(void *addr, int c, size_t len);
+void *__hwasan_memmove(void *dest, const void *src, size_t len);
+void *__hwasan_memcpy(void *dest, const void *src, size_t len);
+
#endif /* __MM_KASAN_KASAN_H */
diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
index 74cd80c12b25..627eaf1ee1db 100644
--- a/mm/kasan/kasan_test.c
+++ b/mm/kasan/kasan_test.c
@@ -165,6 +165,15 @@ static void kasan_test_exit(struct kunit *test)
kunit_skip((test), "Test requires " #config "=n"); \
} while (0)
+#define KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test) do { \
+ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS)) \
+ break; /* No compiler instrumentation. */ \
+ if (IS_ENABLED(CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX)) \
+ break; /* Should always be instrumented! */ \
+ if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) \
+ kunit_skip((test), "Test requires checked mem*()"); \
+} while (0)
+
static void kmalloc_oob_right(struct kunit *test)
{
char *ptr;
@@ -454,6 +463,8 @@ static void kmalloc_oob_16(struct kunit *test)
u64 words[2];
} *ptr1, *ptr2;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
/* This test is specifically crafted for the generic mode. */
KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
@@ -476,6 +487,8 @@ static void kmalloc_uaf_16(struct kunit *test)
u64 words[2];
} *ptr1, *ptr2;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
ptr1 = kmalloc(sizeof(*ptr1), GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
@@ -498,6 +511,8 @@ static void kmalloc_oob_memset_2(struct kunit *test)
char *ptr;
size_t size = 128 - KASAN_GRANULE_SIZE;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
@@ -511,6 +526,8 @@ static void kmalloc_oob_memset_4(struct kunit *test)
char *ptr;
size_t size = 128 - KASAN_GRANULE_SIZE;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
@@ -524,6 +541,8 @@ static void kmalloc_oob_memset_8(struct kunit *test)
char *ptr;
size_t size = 128 - KASAN_GRANULE_SIZE;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
@@ -537,6 +556,8 @@ static void kmalloc_oob_memset_16(struct kunit *test)
char *ptr;
size_t size = 128 - KASAN_GRANULE_SIZE;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
@@ -550,6 +571,8 @@ static void kmalloc_oob_in_memset(struct kunit *test)
char *ptr;
size_t size = 128 - KASAN_GRANULE_SIZE;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
@@ -566,6 +589,8 @@ static void kmalloc_memmove_negative_size(struct kunit *test)
size_t size = 64;
size_t invalid_size = -2;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
/*
* Hardware tag-based mode doesn't check memmove for negative size.
* As a result, this test introduces a side-effect memory corruption,
@@ -590,6 +615,8 @@ static void kmalloc_memmove_invalid_size(struct kunit *test)
size_t size = 64;
size_t invalid_size = size;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
@@ -618,6 +645,8 @@ static void kmalloc_uaf_memset(struct kunit *test)
char *ptr;
size_t size = 33;
+ KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test);
+
/*
* Only generic KASAN uses quarantine, which is required to avoid a
* kernel memory corruption this test causes.
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 1d02757e90a3..89078f912827 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -119,7 +119,7 @@ EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
* Whether the KASAN KUnit test suite is currently being executed.
* Updated in kasan_test.c.
*/
-bool kasan_kunit_executing;
+static bool kasan_kunit_executing;
void kasan_kunit_test_suite_start(void)
{
@@ -231,33 +231,46 @@ static inline struct page *addr_to_page(const void *addr)
return NULL;
}
-static void describe_object_addr(const void *addr, struct kmem_cache *cache,
- void *object)
+static void describe_object_addr(const void *addr, struct kasan_report_info *info)
{
unsigned long access_addr = (unsigned long)addr;
- unsigned long object_addr = (unsigned long)object;
- const char *rel_type;
+ unsigned long object_addr = (unsigned long)info->object;
+ const char *rel_type, *region_state = "";
int rel_bytes;
pr_err("The buggy address belongs to the object at %px\n"
" which belongs to the cache %s of size %d\n",
- object, cache->name, cache->object_size);
+ info->object, info->cache->name, info->cache->object_size);
if (access_addr < object_addr) {
rel_type = "to the left";
rel_bytes = object_addr - access_addr;
- } else if (access_addr >= object_addr + cache->object_size) {
+ } else if (access_addr >= object_addr + info->alloc_size) {
rel_type = "to the right";
- rel_bytes = access_addr - (object_addr + cache->object_size);
+ rel_bytes = access_addr - (object_addr + info->alloc_size);
} else {
rel_type = "inside";
rel_bytes = access_addr - object_addr;
}
+ /*
+ * Tag-Based modes use the stack ring to infer the bug type, but the
+ * memory region state description is generated based on the metadata.
+ * Thus, defining the region state as below can contradict the metadata.
+ * Fixing this requires further improvements, so only infer the state
+ * for the Generic mode.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC)) {
+ if (strcmp(info->bug_type, "slab-out-of-bounds") == 0)
+ region_state = "allocated ";
+ else if (strcmp(info->bug_type, "slab-use-after-free") == 0)
+ region_state = "freed ";
+ }
+
pr_err("The buggy address is located %d bytes %s of\n"
- " %d-byte region [%px, %px)\n",
- rel_bytes, rel_type, cache->object_size, (void *)object_addr,
- (void *)(object_addr + cache->object_size));
+ " %s%zu-byte region [%px, %px)\n",
+ rel_bytes, rel_type, region_state, info->alloc_size,
+ (void *)object_addr, (void *)(object_addr + info->alloc_size));
}
static void describe_object_stacks(struct kasan_report_info *info)
@@ -279,7 +292,7 @@ static void describe_object(const void *addr, struct kasan_report_info *info)
{
if (kasan_stack_collection_enabled())
describe_object_stacks(info);
- describe_object_addr(addr, info->cache, info->object);
+ describe_object_addr(addr, info);
}
static inline bool kernel_or_module_addr(const void *addr)
@@ -436,6 +449,12 @@ static void complete_report_info(struct kasan_report_info *info)
if (slab) {
info->cache = slab->slab_cache;
info->object = nearest_obj(info->cache, slab, addr);
+
+ /* Try to determine allocation size based on the metadata. */
+ info->alloc_size = kasan_get_alloc_size(info->object, info->cache);
+ /* Fallback to the object size if failed. */
+ if (!info->alloc_size)
+ info->alloc_size = info->cache->object_size;
} else
info->cache = info->object = NULL;
diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c
index 043c94b04605..87d39bc0a673 100644
--- a/mm/kasan/report_generic.c
+++ b/mm/kasan/report_generic.c
@@ -43,6 +43,34 @@ void *kasan_find_first_bad_addr(void *addr, size_t size)
return p;
}
+size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache)
+{
+ size_t size = 0;
+ u8 *shadow;
+
+ /*
+ * Skip the addr_has_metadata check, as this function only operates on
+ * slab memory, which must have metadata.
+ */
+
+ /*
+ * The loop below returns 0 for freed objects, for which KASAN cannot
+ * calculate the allocation size based on the metadata.
+ */
+ shadow = (u8 *)kasan_mem_to_shadow(object);
+ while (size < cache->object_size) {
+ if (*shadow == 0)
+ size += KASAN_GRANULE_SIZE;
+ else if (*shadow >= 1 && *shadow <= KASAN_GRANULE_SIZE - 1)
+ return size + *shadow;
+ else
+ return size;
+ shadow++;
+ }
+
+ return cache->object_size;
+}
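A worked example, assuming the generic mode's usual 8-byte granule (KASAN_GRANULE_SIZE == 8): for a live kmalloc(21) object the shadow bytes covering it read

	00 00 05

so the loop adds 8 + 8 for the two fully accessible granules and then returns 16 + 5 = 21. For a freed object the first shadow byte is a free marker outside the 1..7 range, so the loop returns 0 and complete_report_info() falls back to cache->object_size.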
+
static const char *get_shadow_bug_type(struct kasan_report_info *info)
{
const char *bug_type = "unknown-crash";
@@ -79,9 +107,11 @@ static const char *get_shadow_bug_type(struct kasan_report_info *info)
bug_type = "stack-out-of-bounds";
break;
case KASAN_PAGE_FREE:
+ bug_type = "use-after-free";
+ break;
case KASAN_SLAB_FREE:
case KASAN_SLAB_FREETRACK:
- bug_type = "use-after-free";
+ bug_type = "slab-use-after-free";
break;
case KASAN_ALLOCA_LEFT:
case KASAN_ALLOCA_RIGHT:
diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c
index f3d3be614e4b..32e80f78de7d 100644
--- a/mm/kasan/report_hw_tags.c
+++ b/mm/kasan/report_hw_tags.c
@@ -17,10 +17,43 @@
void *kasan_find_first_bad_addr(void *addr, size_t size)
{
- /* Return the same value regardless of whether addr_has_metadata(). */
+ /*
+ * Hardware Tag-Based KASAN only calls this function for normal memory
+ * accesses, and thus addr points precisely to the first bad address
+ * with an invalid (and present) memory tag. Therefore:
+ * 1. Return the address as is without walking memory tags.
+ * 2. Skip the addr_has_metadata check.
+ */
return kasan_reset_tag(addr);
}
+size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache)
+{
+ size_t size = 0;
+ int i = 0;
+ u8 memory_tag;
+
+ /*
+ * Skip the addr_has_metadata check, as this function only operates on
+ * slab memory, which must have metadata.
+ */
+
+ /*
+ * The loop below returns 0 for freed objects, for which KASAN cannot
+ * calculate the allocation size based on the metadata.
+ */
+ while (size < cache->object_size) {
+ memory_tag = hw_get_mem_tag(object + i * KASAN_GRANULE_SIZE);
+ if (memory_tag != KASAN_TAG_INVALID)
+ size += KASAN_GRANULE_SIZE;
+ else
+ return size;
+ i++;
+ }
+
+ return cache->object_size;
+}
+
void kasan_metadata_fetch_row(char *buffer, void *row)
{
int i;
diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c
index 7a26397297ed..8b1f5a73ee6d 100644
--- a/mm/kasan/report_sw_tags.c
+++ b/mm/kasan/report_sw_tags.c
@@ -45,6 +45,32 @@ void *kasan_find_first_bad_addr(void *addr, size_t size)
return p;
}
+size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache)
+{
+ size_t size = 0;
+ u8 *shadow;
+
+ /*
+ * Skip the addr_has_metadata check, as this function only operates on
+ * slab memory, which must have metadata.
+ */
+
+ /*
+ * The loop below returns 0 for freed objects, for which KASAN cannot
+ * calculate the allocation size based on the metadata.
+ */
+ shadow = (u8 *)kasan_mem_to_shadow(object);
+ while (size < cache->object_size) {
+ if (*shadow != KASAN_TAG_INVALID)
+ size += KASAN_GRANULE_SIZE;
+ else
+ return size;
+ shadow++;
+ }
+
+ return cache->object_size;
+}
+
void kasan_metadata_fetch_row(char *buffer, void *row)
{
memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW);
diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c
index ecede06ef374..8b8bfdb3cfdb 100644
--- a/mm/kasan/report_tags.c
+++ b/mm/kasan/report_tags.c
@@ -89,7 +89,7 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info)
* a use-after-free.
*/
if (!info->bug_type)
- info->bug_type = "use-after-free";
+ info->bug_type = "slab-use-after-free";
} else {
/* Second alloc of the same object. Give up. */
if (alloc_found)
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 2fba1f51f042..c8b86f3273b5 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -38,6 +38,15 @@ bool __kasan_check_write(const volatile void *p, unsigned int size)
}
EXPORT_SYMBOL(__kasan_check_write);
+#if !defined(CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX) && !defined(CONFIG_GENERIC_ENTRY)
+/*
+ * CONFIG_GENERIC_ENTRY relies on compiler emitted mem*() calls to not be
+ * instrumented. KASAN enabled toolchains should emit __asan_mem*() functions
+ * for the sites they want to instrument.
+ *
+ * If we have a compiler that can instrument memintrinsics, never override
+ * these, so that non-instrumented files can safely consider them as builtins.
+ */
#undef memset
void *memset(void *addr, int c, size_t len)
{
@@ -68,6 +77,49 @@ void *memcpy(void *dest, const void *src, size_t len)
return __memcpy(dest, src, len);
}
+#endif
+
+void *__asan_memset(void *addr, int c, size_t len)
+{
+ if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_))
+ return NULL;
+
+ return __memset(addr, c, len);
+}
+EXPORT_SYMBOL(__asan_memset);
+
+#ifdef __HAVE_ARCH_MEMMOVE
+void *__asan_memmove(void *dest, const void *src, size_t len)
+{
+ if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
+ !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+ return NULL;
+
+ return __memmove(dest, src, len);
+}
+EXPORT_SYMBOL(__asan_memmove);
+#endif
+
+void *__asan_memcpy(void *dest, const void *src, size_t len)
+{
+ if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
+ !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+ return NULL;
+
+ return __memcpy(dest, src, len);
+}
+EXPORT_SYMBOL(__asan_memcpy);
+
+#ifdef CONFIG_KASAN_SW_TAGS
+void *__hwasan_memset(void *addr, int c, size_t len) __alias(__asan_memset);
+EXPORT_SYMBOL(__hwasan_memset);
+#ifdef __HAVE_ARCH_MEMMOVE
+void *__hwasan_memmove(void *dest, const void *src, size_t len) __alias(__asan_memmove);
+EXPORT_SYMBOL(__hwasan_memmove);
+#endif
+void *__hwasan_memcpy(void *dest, const void *src, size_t len) __alias(__asan_memcpy);
+EXPORT_SYMBOL(__hwasan_memcpy);
+#endif
void kasan_poison(const void *addr, size_t size, u8 value, bool init)
{
@@ -291,6 +343,9 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
unsigned long shadow_start, shadow_end;
int ret;
+ if (!kasan_arch_is_ready())
+ return 0;
+
if (!is_vmalloc_or_module_addr((void *)addr))
return 0;
@@ -459,6 +514,9 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
unsigned long region_start, region_end;
unsigned long size;
+ if (!kasan_arch_is_ready())
+ return;
+
region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE);
region_end = ALIGN_DOWN(end, KASAN_MEMORY_PER_SHADOW_PAGE);
@@ -502,6 +560,9 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
* with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored.
*/
+ if (!kasan_arch_is_ready())
+ return (void *)start;
+
if (!is_vmalloc_or_module_addr(start))
return (void *)start;
@@ -524,6 +585,9 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
*/
void __kasan_poison_vmalloc(const void *start, unsigned long size)
{
+ if (!kasan_arch_is_ready())
+ return;
+
if (!is_vmalloc_or_module_addr(start))
return;
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index a3afaf2ad1b1..30da65fa02a1 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -106,10 +106,8 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write,
return true;
untagged_addr = kasan_reset_tag((const void *)addr);
- if (unlikely(untagged_addr <
- kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
+ if (unlikely(!addr_has_metadata(untagged_addr)))
return !kasan_report(addr, size, write, ret_ip);
- }
shadow_first = kasan_mem_to_shadow(untagged_addr);
shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1);
for (shadow = shadow_first; shadow <= shadow_last; shadow++) {
@@ -127,7 +125,7 @@ bool kasan_byte_accessible(const void *addr)
void *untagged_addr = kasan_reset_tag(addr);
u8 shadow_byte;
- if (untagged_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))
+ if (!addr_has_metadata(untagged_addr))
return false;
shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(untagged_addr));
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 5cb401aa2b9d..92e6f56a932d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -490,32 +490,43 @@ void __khugepaged_exit(struct mm_struct *mm)
}
}
+static void release_pte_folio(struct folio *folio)
+{
+ node_stat_mod_folio(folio,
+ NR_ISOLATED_ANON + folio_is_file_lru(folio),
+ -folio_nr_pages(folio));
+ folio_unlock(folio);
+ folio_putback_lru(folio);
+}
+
static void release_pte_page(struct page *page)
{
- mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_is_file_lru(page),
- -compound_nr(page));
- unlock_page(page);
- putback_lru_page(page);
+ release_pte_folio(page_folio(page));
}
static void release_pte_pages(pte_t *pte, pte_t *_pte,
struct list_head *compound_pagelist)
{
- struct page *page, *tmp;
+ struct folio *folio, *tmp;
while (--_pte >= pte) {
pte_t pteval = *_pte;
+ unsigned long pfn;
- page = pte_page(pteval);
- if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
- !PageCompound(page))
- release_pte_page(page);
+ if (pte_none(pteval))
+ continue;
+ pfn = pte_pfn(pteval);
+ if (is_zero_pfn(pfn))
+ continue;
+ folio = pfn_folio(pfn);
+ if (folio_test_large(folio))
+ continue;
+ release_pte_folio(folio);
}
- list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
- list_del(&page->lru);
- release_pte_page(page);
+ list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) {
+ list_del(&folio->lru);
+ release_pte_folio(folio);
}
}
@@ -625,7 +636,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* Isolate the page to avoid collapsing an hugepage
* currently in use by the VM.
*/
- if (isolate_lru_page(page)) {
+ if (!isolate_lru_page(page)) {
unlock_page(page);
result = SCAN_DEL_PAGE_LRU;
goto out;
@@ -847,6 +858,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
return SCAN_SUCCEED;
}
+/*
+ * See pmd_trans_unstable() for how the result may change out from
+ * underneath us, even if we hold mmap_lock in read.
+ */
static int find_pmd_or_thp_or_none(struct mm_struct *mm,
unsigned long address,
pmd_t **pmd)
@@ -865,8 +880,12 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
#endif
if (pmd_none(pmde))
return SCAN_PMD_NONE;
+ if (!pmd_present(pmde))
+ return SCAN_PMD_NULL;
if (pmd_trans_huge(pmde))
return SCAN_PMD_MAPPED;
+ if (pmd_devmap(pmde))
+ return SCAN_PMD_NULL;
if (pmd_bad(pmde))
return SCAN_PMD_NULL;
return SCAN_SUCCEED;
@@ -1032,8 +1051,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
anon_vma_lock_write(vma->anon_vma);
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
- address, address + HPAGE_PMD_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
+ address + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
pte = pte_offset_map(pmd, address);
@@ -1404,7 +1423,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
if (vma->anon_vma)
lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, addr,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
addr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
pmd = pmdp_collapse_flush(vma, addr, pmdp);
@@ -1460,14 +1479,6 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
return SCAN_VMA_CHECK;
- /*
- * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings
- * that got written to. Without this, we'd have to also lock the
- * anon_vma if one exists.
- */
- if (vma->anon_vma)
- return SCAN_VMA_CHECK;
-
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
if (userfaultfd_wp(vma))
return SCAN_PTE_UFFD_WP;
@@ -1567,8 +1578,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
}
/* step 4: remove pte entries */
+ /* we make no change to anon, but protect concurrent anon page lookup */
+ if (vma->anon_vma)
+ anon_vma_lock_write(vma->anon_vma);
+
collapse_and_free_pmd(mm, vma, haddr, pmd);
+ if (vma->anon_vma)
+ anon_vma_unlock_write(vma->anon_vma);
i_mmap_unlock_write(vma->vm_file->f_mapping);
maybe_install_pmd:
@@ -1644,7 +1661,7 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
* has higher cost too. It would also probably require locking
* the anon_vma.
*/
- if (vma->anon_vma) {
+ if (READ_ONCE(vma->anon_vma)) {
result = SCAN_PAGE_ANON;
goto next;
}
@@ -1673,6 +1690,18 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
if ((cc->is_khugepaged || is_target) &&
mmap_write_trylock(mm)) {
/*
+ * Re-check whether we have an ->anon_vma, because
+ * collapse_and_free_pmd() requires that either no
+ * ->anon_vma exists or the anon_vma is locked.
+ * We already checked ->anon_vma above, but that check
+ * is racy because ->anon_vma can be populated under the
+ * mmap lock in read mode.
+ */
+ if (vma->anon_vma) {
+ result = SCAN_PAGE_ANON;
+ goto unlock_next;
+ }
+ /*
* When a vma is registered with uffd-wp, we can't
* recycle the pmd pgtable because there can be pte
* markers installed. Skip it only, so the rest mm/vma
@@ -1921,7 +1950,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
goto out_unlock;
}
- if (folio_isolate_lru(folio)) {
+ if (!folio_isolate_lru(folio)) {
result = SCAN_DEL_PAGE_LRU;
goto out_unlock;
}
@@ -2593,6 +2622,7 @@ static int madvise_collapse_errno(enum scan_result r)
case SCAN_CGROUP_CHARGE_FAIL:
return -EBUSY;
/* Resource temporary unavailable - trying again might succeed */
+ case SCAN_PAGE_COUNT:
case SCAN_PAGE_LOCK:
case SCAN_PAGE_LRU:
case SCAN_DEL_PAGE_LRU:
@@ -2649,7 +2679,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
goto out_nolock;
}
- hend = vma->vm_end & HPAGE_PMD_MASK;
+ hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
}
mmap_assert_locked(mm);
memset(cc->node_load, 0, sizeof(cc->node_load));
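
The retract_page_tables() change above is a check / lock / re-check pattern: the unlocked READ_ONCE(vma->anon_vma) read is only a cheap filter, and the decision is validated again once mmap_write_trylock() succeeds, because ->anon_vma can be populated under the mmap lock held for read. A small pthreads sketch of the same pattern, with a hypothetical shared pointer standing in for ->anon_vma:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *shared_ptr;	/* stands in for vma->anon_vma */

/* Skip the work when the pointer is set; re-check once the lock is held. */
static int try_collapse(void)
{
	/* Cheap, racy filter (roughly what READ_ONCE() does in the hunk). */
	if (__atomic_load_n(&shared_ptr, __ATOMIC_RELAXED))
		return -1;

	if (pthread_mutex_trylock(&lock) != 0)
		return -1;

	/* Re-check: the pointer may have been populated before we locked. */
	if (shared_ptr) {
		pthread_mutex_unlock(&lock);
		return -1;
	}

	/* ... do the work that requires shared_ptr to stay NULL ... */
	pthread_mutex_unlock(&lock);
	return 0;
}

int main(void)
{
	printf("collapse %s\n", try_collapse() == 0 ? "done" : "skipped");
	return 0;
}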
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 92f670edbf51..a2d34226e3c8 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -13,11 +13,12 @@
*
* The following locks and mutexes are used by kmemleak:
*
- * - kmemleak_lock (raw_spinlock_t): protects the object_list modifications and
- * accesses to the object_tree_root (or object_phys_tree_root). The
- * object_list is the main list holding the metadata (struct kmemleak_object)
- * for the allocated memory blocks. The object_tree_root and object_phys_tree_root
- * are red black trees used to look-up metadata based on a pointer to the
+ * - kmemleak_lock (raw_spinlock_t): protects the object_list as well as
+ * del_state modifications and accesses to the object_tree_root (or
+ * object_phys_tree_root). The object_list is the main list holding the
+ * metadata (struct kmemleak_object) for the allocated memory blocks.
+ * The object_tree_root and object_phys_tree_root are red
+ * black trees used to look-up metadata based on a pointer to the
* corresponding memory block. The object_phys_tree_root is for objects
* allocated with physical address. The kmemleak_object structures are
* added to the object_list and object_tree_root (or object_phys_tree_root)
@@ -148,6 +149,7 @@ struct kmemleak_object {
struct rcu_head rcu; /* object_list lockless traversal */
/* object usage count; object freed when use_count == 0 */
atomic_t use_count;
+ unsigned int del_state; /* deletion state */
unsigned long pointer;
size_t size;
/* pass surplus references to this pointer */
@@ -177,6 +179,11 @@ struct kmemleak_object {
/* flag set for object allocated with physical address */
#define OBJECT_PHYS (1 << 4)
+/* set when __remove_object() called */
+#define DELSTATE_REMOVED (1 << 0)
+/* set to temporarily prevent deletion from object_list */
+#define DELSTATE_NO_DELETE (1 << 1)
+
#define HEX_PREFIX " "
/* number of bytes to print per line; must be 16 or 32 */
#define HEX_ROW_SIZE 16
@@ -571,7 +578,9 @@ static void __remove_object(struct kmemleak_object *object)
rb_erase(&object->rb_node, object->flags & OBJECT_PHYS ?
&object_phys_tree_root :
&object_tree_root);
- list_del_rcu(&object->object_list);
+ if (!(object->del_state & DELSTATE_NO_DELETE))
+ list_del_rcu(&object->object_list);
+ object->del_state |= DELSTATE_REMOVED;
}
/*
@@ -643,6 +652,7 @@ static void __create_object(unsigned long ptr, size_t size,
object->count = 0; /* white color initially */
object->jiffies = jiffies;
object->checksum = 0;
+ object->del_state = 0;
/* task information */
if (in_hardirq()) {
@@ -1472,22 +1482,30 @@ static void scan_gray_list(void)
/*
* Conditionally call resched() in an object iteration loop while making sure
* that the given object won't go away without RCU read lock by performing a
- * get_object() if !pinned.
- *
- * Return: false if can't do a cond_resched() due to get_object() failure
- * true otherwise
+ * get_object() if necessary.
*/
-static bool kmemleak_cond_resched(struct kmemleak_object *object, bool pinned)
+static void kmemleak_cond_resched(struct kmemleak_object *object)
{
- if (!pinned && !get_object(object))
- return false;
+ if (!get_object(object))
+ return; /* Try next object */
+
+ raw_spin_lock_irq(&kmemleak_lock);
+ if (object->del_state & DELSTATE_REMOVED)
+ goto unlock_put; /* Object removed */
+ object->del_state |= DELSTATE_NO_DELETE;
+ raw_spin_unlock_irq(&kmemleak_lock);
rcu_read_unlock();
cond_resched();
rcu_read_lock();
- if (!pinned)
- put_object(object);
- return true;
+
+ raw_spin_lock_irq(&kmemleak_lock);
+ if (object->del_state & DELSTATE_REMOVED)
+ list_del_rcu(&object->object_list);
+ object->del_state &= ~DELSTATE_NO_DELETE;
+unlock_put:
+ raw_spin_unlock_irq(&kmemleak_lock);
+ put_object(object);
}
/*
@@ -1501,15 +1519,12 @@ static void kmemleak_scan(void)
struct zone *zone;
int __maybe_unused i;
int new_leaks = 0;
- int loop_cnt = 0;
jiffies_last_scan = jiffies;
/* prepare the kmemleak_object's */
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- bool obj_pinned = false;
-
raw_spin_lock_irq(&object->lock);
#ifdef DEBUG
/*
@@ -1535,19 +1550,13 @@ static void kmemleak_scan(void)
/* reset the reference count (whiten the object) */
object->count = 0;
- if (color_gray(object) && get_object(object)) {
+ if (color_gray(object) && get_object(object))
list_add_tail(&object->gray_list, &gray_list);
- obj_pinned = true;
- }
raw_spin_unlock_irq(&object->lock);
- /*
- * Do a cond_resched() every 64k objects to avoid soft lockup.
- */
- if (!(++loop_cnt & 0xffff) &&
- !kmemleak_cond_resched(object, obj_pinned))
- loop_cnt--; /* Try again on next object */
+ if (need_resched())
+ kmemleak_cond_resched(object);
}
rcu_read_unlock();
@@ -1614,14 +1623,9 @@ static void kmemleak_scan(void)
* scan and color them gray until the next scan.
*/
rcu_read_lock();
- loop_cnt = 0;
list_for_each_entry_rcu(object, &object_list, object_list) {
- /*
- * Do a cond_resched() every 64k objects to avoid soft lockup.
- */
- if (!(++loop_cnt & 0xffff) &&
- !kmemleak_cond_resched(object, false))
- loop_cnt--; /* Try again on next object */
+ if (need_resched())
+ kmemleak_cond_resched(object);
/*
* This is racy but we can save the overhead of lock/unlock
@@ -1656,14 +1660,9 @@ static void kmemleak_scan(void)
* Scanning result reporting.
*/
rcu_read_lock();
- loop_cnt = 0;
list_for_each_entry_rcu(object, &object_list, object_list) {
- /*
- * Do a cond_resched() every 64k objects to avoid soft lockup.
- */
- if (!(++loop_cnt & 0xffff) &&
- !kmemleak_cond_resched(object, false))
- loop_cnt--; /* Try again on next object */
+ if (need_resched())
+ kmemleak_cond_resched(object);
/*
* This is racy but we can save the overhead of lock/unlock
@@ -2070,8 +2069,10 @@ static int __init kmemleak_boot_config(char *str)
return -EINVAL;
if (strcmp(str, "off") == 0)
kmemleak_disable();
- else if (strcmp(str, "on") == 0)
+ else if (strcmp(str, "on") == 0) {
kmemleak_skip_disable = 1;
+ stack_depot_request_early_init();
+ }
else
return -EINVAL;
return 0;
@@ -2093,7 +2094,6 @@ void __init kmemleak_init(void)
if (kmemleak_error)
return;
- stack_depot_init();
jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
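
The new kmemleak_cond_resched() pins the current object with two del_state bits: DELSTATE_NO_DELETE tells __remove_object() not to unlink it from object_list while the scanner sleeps, and DELSTATE_REMOVED records that the unlink was deferred so the scanner completes it after waking up. A stripped-down, single-threaded sketch of that hand-off (locking and RCU omitted; struct object here is illustrative, not the kernel structure):

#include <stdio.h>

#define DELSTATE_REMOVED	(1 << 0)	/* remover ran while object was pinned */
#define DELSTATE_NO_DELETE	(1 << 1)	/* scanner is sleeping on this object */

struct object {
	unsigned int del_state;
	int on_list;
};

/* Deleter side, like __remove_object(). */
static void remove_object(struct object *obj)
{
	if (!(obj->del_state & DELSTATE_NO_DELETE))
		obj->on_list = 0;		/* unlink immediately */
	obj->del_state |= DELSTATE_REMOVED;	/* else defer to the scanner */
}

/* Scanner side, before dropping the RCU lock to sleep. */
static void scanner_sleep(struct object *obj)
{
	obj->del_state |= DELSTATE_NO_DELETE;
}

/* Scanner side, after waking up: finish any unlink that was deferred. */
static void scanner_wake(struct object *obj)
{
	if (obj->del_state & DELSTATE_REMOVED)
		obj->on_list = 0;
	obj->del_state &= ~DELSTATE_NO_DELETE;
}

int main(void)
{
	struct object obj = { .del_state = 0, .on_list = 1 };

	scanner_sleep(&obj);	/* pin the object across a sleep */
	remove_object(&obj);	/* deleter runs meanwhile: unlink is deferred */
	printf("while pinned: on_list=%d\n", obj.on_list);	/* 1 */
	scanner_wake(&obj);	/* scanner finishes the deferred unlink */
	printf("after wake:   on_list=%d\n", obj.on_list);	/* 0 */
	return 0;
}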
diff --git a/mm/kmsan/Makefile b/mm/kmsan/Makefile
index 98eab2856626..91cfdde642d1 100644
--- a/mm/kmsan/Makefile
+++ b/mm/kmsan/Makefile
@@ -14,7 +14,13 @@ CC_FLAGS_KMSAN_RUNTIME := -fno-stack-protector
CC_FLAGS_KMSAN_RUNTIME += $(call cc-option,-fno-conserve-stack)
CC_FLAGS_KMSAN_RUNTIME += -DDISABLE_BRANCH_PROFILING
-CFLAGS_REMOVE.o = $(CC_FLAGS_FTRACE)
+# Disable ftrace to avoid recursion.
+CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_hooks.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_init.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_instrumentation.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_shadow.o = $(CC_FLAGS_FTRACE)
CFLAGS_core.o := $(CC_FLAGS_KMSAN_RUNTIME)
CFLAGS_hooks.o := $(CC_FLAGS_KMSAN_RUNTIME)
diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c
index 112dce135c7f..f710257d6867 100644
--- a/mm/kmsan/core.c
+++ b/mm/kmsan/core.c
@@ -69,13 +69,15 @@ depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags,
{
unsigned long entries[KMSAN_STACK_DEPTH];
unsigned int nr_entries;
+ depot_stack_handle_t handle;
nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0);
/* Don't sleep (see might_sleep_if() in __alloc_pages_nodemask()). */
flags &= ~__GFP_DIRECT_RECLAIM;
- return __stack_depot_save(entries, nr_entries, extra, flags, true);
+ handle = __stack_depot_save(entries, nr_entries, flags, true);
+ return stack_depot_set_extra_bits(handle, extra);
}
/* Copy the metadata following the memmove() behavior. */
@@ -215,6 +217,7 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id)
u32 extra_bits;
int depth;
bool uaf;
+ depot_stack_handle_t handle;
if (!id)
return id;
@@ -250,8 +253,9 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id)
* positives when __stack_depot_save() passes it to instrumented code.
*/
kmsan_internal_unpoison_memory(entries, sizeof(entries), false);
- return __stack_depot_save(entries, ARRAY_SIZE(entries), extra_bits,
- GFP_ATOMIC, true);
+ handle = __stack_depot_save(entries, ARRAY_SIZE(entries), GFP_ATOMIC,
+ true);
+ return stack_depot_set_extra_bits(handle, extra_bits);
}
void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
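
The kmsan/core.c hunks above split the old combined call into two steps: __stack_depot_save() records the trace, then stack_depot_set_extra_bits() folds the extra bits into the returned handle. A toy sketch of that two-step flow; the stub functions below only mimic the shape of the calls, not the real stack depot API:

#include <stdio.h>

typedef unsigned int handle_t;	/* stands in for depot_stack_handle_t */

/* Stubs that mimic the split interface used in the patch above. */
static handle_t stub_depot_save(const unsigned long *entries, unsigned int n)
{
	return n ? (handle_t)(entries[0] + n) : 0;	/* fake handle */
}

static handle_t stub_set_extra_bits(handle_t handle, unsigned int extra)
{
	return handle ? handle | (extra << 24) : 0;	/* fold extra bits in */
}

static handle_t save_stack(unsigned int extra)
{
	unsigned long entries[4] = { 1, 2, 3, 4 };
	handle_t handle;

	/* Step 1: record the trace.  Step 2: attach the extra bits. */
	handle = stub_depot_save(entries, 4);
	return stub_set_extra_bits(handle, extra);
}

int main(void)
{
	printf("handle=%#x\n", save_stack(3));
	return 0;
}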
diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c
index 770fe02904f3..cf12e9616b24 100644
--- a/mm/kmsan/instrumentation.c
+++ b/mm/kmsan/instrumentation.c
@@ -38,8 +38,16 @@ get_shadow_origin_ptr(void *addr, u64 size, bool store)
return ret;
}
+/*
+ * KMSAN instrumentation functions follow. They are not declared elsewhere in
+ * the kernel code, so they are preceded by prototypes, to silence
+ * -Wmissing-prototypes warnings.
+ */
+
/* Get shadow and origin pointers for a memory load with non-standard size. */
struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr,
+ uintptr_t size);
+struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr,
uintptr_t size)
{
return get_shadow_origin_ptr(addr, size, /*store*/ false);
@@ -48,6 +56,8 @@ EXPORT_SYMBOL(__msan_metadata_ptr_for_load_n);
/* Get shadow and origin pointers for a memory store with non-standard size. */
struct shadow_origin_ptr __msan_metadata_ptr_for_store_n(void *addr,
+ uintptr_t size);
+struct shadow_origin_ptr __msan_metadata_ptr_for_store_n(void *addr,
uintptr_t size)
{
return get_shadow_origin_ptr(addr, size, /*store*/ true);
@@ -60,12 +70,16 @@ EXPORT_SYMBOL(__msan_metadata_ptr_for_store_n);
*/
#define DECLARE_METADATA_PTR_GETTER(size) \
struct shadow_origin_ptr __msan_metadata_ptr_for_load_##size( \
+ void *addr); \
+ struct shadow_origin_ptr __msan_metadata_ptr_for_load_##size( \
void *addr) \
{ \
return get_shadow_origin_ptr(addr, size, /*store*/ false); \
} \
EXPORT_SYMBOL(__msan_metadata_ptr_for_load_##size); \
struct shadow_origin_ptr __msan_metadata_ptr_for_store_##size( \
+ void *addr); \
+ struct shadow_origin_ptr __msan_metadata_ptr_for_store_##size( \
void *addr) \
{ \
return get_shadow_origin_ptr(addr, size, /*store*/ true); \
@@ -86,6 +100,7 @@ DECLARE_METADATA_PTR_GETTER(8);
* entering or leaving IRQ. We omit the check for kmsan_in_runtime() to ensure
* the memory written to in these cases is also marked as initialized.
*/
+void __msan_instrument_asm_store(void *addr, uintptr_t size);
void __msan_instrument_asm_store(void *addr, uintptr_t size)
{
unsigned long ua_flags;
@@ -138,6 +153,7 @@ static inline void set_retval_metadata(u64 shadow, depot_stack_handle_t origin)
}
/* Handle llvm.memmove intrinsic. */
+void *__msan_memmove(void *dst, const void *src, uintptr_t n);
void *__msan_memmove(void *dst, const void *src, uintptr_t n)
{
depot_stack_handle_t origin;
@@ -162,6 +178,7 @@ void *__msan_memmove(void *dst, const void *src, uintptr_t n)
EXPORT_SYMBOL(__msan_memmove);
/* Handle llvm.memcpy intrinsic. */
+void *__msan_memcpy(void *dst, const void *src, uintptr_t n);
void *__msan_memcpy(void *dst, const void *src, uintptr_t n)
{
depot_stack_handle_t origin;
@@ -188,6 +205,7 @@ void *__msan_memcpy(void *dst, const void *src, uintptr_t n)
EXPORT_SYMBOL(__msan_memcpy);
/* Handle llvm.memset intrinsic. */
+void *__msan_memset(void *dst, int c, uintptr_t n);
void *__msan_memset(void *dst, int c, uintptr_t n)
{
depot_stack_handle_t origin;
@@ -217,6 +235,7 @@ EXPORT_SYMBOL(__msan_memset);
* uninitialized value to memory. When reporting an error, KMSAN unrolls and
* prints the whole chain of stores that preceded the use of this value.
*/
+depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin);
depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin)
{
depot_stack_handle_t ret = 0;
@@ -237,6 +256,7 @@ depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin)
EXPORT_SYMBOL(__msan_chain_origin);
/* Poison a local variable when entering a function. */
+void __msan_poison_alloca(void *address, uintptr_t size, char *descr);
void __msan_poison_alloca(void *address, uintptr_t size, char *descr)
{
depot_stack_handle_t handle;
@@ -272,6 +292,7 @@ void __msan_poison_alloca(void *address, uintptr_t size, char *descr)
EXPORT_SYMBOL(__msan_poison_alloca);
/* Unpoison a local variable. */
+void __msan_unpoison_alloca(void *address, uintptr_t size);
void __msan_unpoison_alloca(void *address, uintptr_t size)
{
if (!kmsan_enabled || kmsan_in_runtime())
@@ -287,6 +308,7 @@ EXPORT_SYMBOL(__msan_unpoison_alloca);
* Report that an uninitialized value with the given origin was used in a way
* that constituted undefined behavior.
*/
+void __msan_warning(u32 origin);
void __msan_warning(u32 origin)
{
if (!kmsan_enabled || kmsan_in_runtime())
@@ -303,6 +325,7 @@ EXPORT_SYMBOL(__msan_warning);
* At the beginning of an instrumented function, obtain the pointer to
* `struct kmsan_context_state` holding the metadata for function parameters.
*/
+struct kmsan_context_state *__msan_get_context_state(void);
struct kmsan_context_state *__msan_get_context_state(void)
{
return &kmsan_get_context()->cstate;
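
Every __msan_*() entry point above is preceded by an identical declaration because these functions are called only from compiler-generated code and have no header of their own; the forward declaration is what keeps -Wmissing-prototypes quiet. The same trick in a standalone file (entry_point() is a made-up name):

/* Build with: cc -Wmissing-prototypes -c proto.c */

/*
 * entry_point() is only ever called by generated code, so no header declares
 * it. A prototype right before the definition silences -Wmissing-prototypes
 * without inventing a header file for it.
 */
int entry_point(int x);
int entry_point(int x)
{
	return x * 2;
}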
diff --git a/mm/ksm.c b/mm/ksm.c
index dd02780c387f..ad591b779d53 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1057,8 +1057,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
BUG_ON(PageTransCompound(page));
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
- pvmw.address,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address,
pvmw.address + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
@@ -1164,7 +1163,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
if (!pmd_present(pmde) || pmd_trans_huge(pmde))
goto out;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
addr + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
@@ -2629,8 +2628,11 @@ struct page *ksm_might_need_to_copy(struct page *page,
new_page = NULL;
}
if (new_page) {
- copy_user_highpage(new_page, page, address, vma);
-
+ if (copy_mc_user_highpage(new_page, page, address, vma)) {
+ put_page(new_page);
+ memory_failure_queue(page_to_pfn(page), 0);
+ return ERR_PTR(-EHWPOISON);
+ }
SetPageDirty(new_page);
__SetPageUptodate(new_page);
__SetPageLocked(new_page);
diff --git a/mm/madvise.c b/mm/madvise.c
index a56a6d17e201..340125d08c03 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -130,7 +130,7 @@ static int replace_anon_vma_name(struct vm_area_struct *vma,
#endif /* CONFIG_ANON_VMA_NAME */
/*
* Update the vm_flags on region of a vma, splitting it or merging it as
- * necessary. Must be called with mmap_sem held for writing;
+ * necessary. Must be called with mmap_lock held for writing;
* Caller should ensure anon_name stability by raising its refcount even when
* anon_name belongs to a valid vma because this function might free that vma.
*/
@@ -142,6 +142,7 @@ static int madvise_update_vma(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
int error;
pgoff_t pgoff;
+ VMA_ITERATOR(vmi, mm, start);
if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
*prev = vma;
@@ -149,8 +150,8 @@ static int madvise_update_vma(struct vm_area_struct *vma,
}
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
- vma->vm_file, pgoff, vma_policy(vma),
+ *prev = vma_merge(&vmi, mm, *prev, start, end, new_flags,
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, anon_name);
if (*prev) {
vma = *prev;
@@ -160,17 +161,13 @@ static int madvise_update_vma(struct vm_area_struct *vma,
*prev = vma;
if (start != vma->vm_start) {
- if (unlikely(mm->map_count >= sysctl_max_map_count))
- return -ENOMEM;
- error = __split_vma(mm, vma, start, 1);
+ error = split_vma(&vmi, vma, start, 1);
if (error)
return error;
}
if (end != vma->vm_end) {
- if (unlikely(mm->map_count >= sysctl_max_map_count))
- return -ENOMEM;
- error = __split_vma(mm, vma, end, 0);
+ error = split_vma(&vmi, vma, end, 0);
if (error)
return error;
}
@@ -179,7 +176,7 @@ success:
/*
* vm_flags is protected by the mmap_lock held in write mode.
*/
- vma->vm_flags = new_flags;
+ vm_flags_reset(vma, new_flags);
if (!vma->vm_file || vma_is_anon_shmem(vma)) {
error = replace_anon_vma_name(vma, anon_name);
if (error)
@@ -329,7 +326,7 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma)
* otherwise we'd be including shared non-exclusive mappings, which
* opens a side channel.
*/
- return inode_owner_or_capable(&init_user_ns,
+ return inode_owner_or_capable(&nop_mnt_idmap,
file_inode(vma->vm_file)) ||
file_permission(vma->vm_file, MAY_WRITE) == 0;
}
@@ -345,8 +342,8 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
struct vm_area_struct *vma = walk->vma;
pte_t *orig_pte, *pte, ptent;
spinlock_t *ptl;
- struct page *page = NULL;
- LIST_HEAD(page_list);
+ struct folio *folio = NULL;
+ LIST_HEAD(folio_list);
bool pageout_anon_only_filter;
if (fatal_signal_pending(current))
@@ -375,26 +372,26 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
goto huge_unlock;
}
- page = pmd_page(orig_pmd);
+ folio = pfn_folio(pmd_pfn(orig_pmd));
- /* Do not interfere with other mappings of this page */
- if (page_mapcount(page) != 1)
+ /* Do not interfere with other mappings of this folio */
+ if (folio_mapcount(folio) != 1)
goto huge_unlock;
- if (pageout_anon_only_filter && !PageAnon(page))
+ if (pageout_anon_only_filter && !folio_test_anon(folio))
goto huge_unlock;
if (next - addr != HPAGE_PMD_SIZE) {
int err;
- get_page(page);
+ folio_get(folio);
spin_unlock(ptl);
- lock_page(page);
- err = split_huge_page(page);
- unlock_page(page);
- put_page(page);
+ folio_lock(folio);
+ err = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
if (!err)
- goto regular_page;
+ goto regular_folio;
return 0;
}
@@ -406,25 +403,25 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
}
- ClearPageReferenced(page);
- test_and_clear_page_young(page);
+ folio_clear_referenced(folio);
+ folio_test_clear_young(folio);
if (pageout) {
- if (!isolate_lru_page(page)) {
- if (PageUnevictable(page))
- putback_lru_page(page);
+ if (folio_isolate_lru(folio)) {
+ if (folio_test_unevictable(folio))
+ folio_putback_lru(folio);
else
- list_add(&page->lru, &page_list);
+ list_add(&folio->lru, &folio_list);
}
} else
- deactivate_page(page);
+ folio_deactivate(folio);
huge_unlock:
spin_unlock(ptl);
if (pageout)
- reclaim_pages(&page_list);
+ reclaim_pages(&folio_list);
return 0;
}
-regular_page:
+regular_folio:
if (pmd_trans_unstable(pmd))
return 0;
#endif
@@ -441,33 +438,33 @@ regular_page:
if (!pte_present(ptent))
continue;
- page = vm_normal_page(vma, addr, ptent);
- if (!page || is_zone_device_page(page))
+ folio = vm_normal_folio(vma, addr, ptent);
+ if (!folio || folio_is_zone_device(folio))
continue;
/*
* Creating a THP page is expensive so split it only if we
* are sure it's worth. Split it if we are only owner.
*/
- if (PageTransCompound(page)) {
- if (page_mapcount(page) != 1)
+ if (folio_test_large(folio)) {
+ if (folio_mapcount(folio) != 1)
break;
- if (pageout_anon_only_filter && !PageAnon(page))
+ if (pageout_anon_only_filter && !folio_test_anon(folio))
break;
- get_page(page);
- if (!trylock_page(page)) {
- put_page(page);
+ folio_get(folio);
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
break;
}
pte_unmap_unlock(orig_pte, ptl);
- if (split_huge_page(page)) {
- unlock_page(page);
- put_page(page);
+ if (split_folio(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
break;
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
pte--;
addr -= PAGE_SIZE;
@@ -475,16 +472,16 @@ regular_page:
}
/*
- * Do not interfere with other mappings of this page and
- * non-LRU page.
+ * Do not interfere with other mappings of this folio and
+ * non-LRU folio.
*/
- if (!PageLRU(page) || page_mapcount(page) != 1)
+ if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
continue;
- if (pageout_anon_only_filter && !PageAnon(page))
+ if (pageout_anon_only_filter && !folio_test_anon(folio))
continue;
- VM_BUG_ON_PAGE(PageTransCompound(page), page);
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
if (pte_young(ptent)) {
ptent = ptep_get_and_clear_full(mm, addr, pte,
@@ -495,28 +492,28 @@ regular_page:
}
/*
- * We are deactivating a page for accelerating reclaiming.
- * VM couldn't reclaim the page unless we clear PG_young.
+ * We are deactivating a folio for accelerating reclaiming.
+ * VM couldn't reclaim the folio unless we clear PG_young.
* As a side effect, it makes confuse idle-page tracking
* because they will miss recent referenced history.
*/
- ClearPageReferenced(page);
- test_and_clear_page_young(page);
+ folio_clear_referenced(folio);
+ folio_test_clear_young(folio);
if (pageout) {
- if (!isolate_lru_page(page)) {
- if (PageUnevictable(page))
- putback_lru_page(page);
+ if (folio_isolate_lru(folio)) {
+ if (folio_test_unevictable(folio))
+ folio_putback_lru(folio);
else
- list_add(&page->lru, &page_list);
+ list_add(&folio->lru, &folio_list);
}
} else
- deactivate_page(page);
+ folio_deactivate(folio);
}
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(orig_pte, ptl);
if (pageout)
- reclaim_pages(&page_list);
+ reclaim_pages(&folio_list);
cond_resched();
return 0;
@@ -617,7 +614,6 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
spinlock_t *ptl;
pte_t *orig_pte, *pte, ptent;
struct folio *folio;
- struct page *page;
int nr_swap = 0;
unsigned long next;
@@ -658,10 +654,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
continue;
}
- page = vm_normal_page(vma, addr, ptent);
- if (!page || is_zone_device_page(page))
+ folio = vm_normal_folio(vma, addr, ptent);
+ if (!folio || folio_is_zone_device(folio))
continue;
- folio = page_folio(page);
/*
* If pmd isn't transhuge but the folio is large and
@@ -728,7 +723,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
set_pte_at(mm, addr, pte, ptent);
tlb_remove_tlb_entry(tlb, pte, addr);
}
- mark_page_lazyfree(&folio->page);
+ folio_mark_lazyfree(folio);
}
out:
if (nr_swap) {
@@ -765,7 +760,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
range.end = min(vma->vm_end, end_addr);
if (range.end <= vma->vm_start)
return -EINVAL;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
range.start, range.end);
lru_add_drain();
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index 175e424b9ab1..e1eb33f49059 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -191,7 +191,7 @@ static int wp_clean_pre_vma(unsigned long start, unsigned long end,
wpwalk->tlbflush_end = start;
mmu_notifier_range_init(&wpwalk->range, MMU_NOTIFY_PROTECTION_PAGE, 0,
- walk->vma, walk->mm, start, end);
+ walk->mm, start, end);
mmu_notifier_invalidate_range_start(&wpwalk->range);
flush_cache_range(walk->vma, start, end);
diff --git a/mm/memblock.c b/mm/memblock.c
index 511d4783dcf1..25fd0626a9e7 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -500,15 +500,19 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
/**
* memblock_merge_regions - merge neighboring compatible regions
* @type: memblock type to scan
- *
- * Scan @type and merge neighboring compatible regions.
+ * @start_rgn: start scanning from (@start_rgn - 1)
+ * @end_rgn: end scanning at (@end_rgn - 1)
+ * Scan @type and merge neighboring compatible regions in [@start_rgn - 1, @end_rgn)
*/
-static void __init_memblock memblock_merge_regions(struct memblock_type *type)
+static void __init_memblock memblock_merge_regions(struct memblock_type *type,
+ unsigned long start_rgn,
+ unsigned long end_rgn)
{
int i = 0;
-
- /* cnt never goes below 1 */
- while (i < type->cnt - 1) {
+ if (start_rgn)
+ i = start_rgn - 1;
+ end_rgn = min(end_rgn, type->cnt - 1);
+ while (i < end_rgn) {
struct memblock_region *this = &type->regions[i];
struct memblock_region *next = &type->regions[i + 1];
@@ -525,6 +529,7 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
/* move forward from next + 1, index of which is i + 2 */
memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next));
type->cnt--;
+ end_rgn--;
}
}
@@ -581,7 +586,7 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
bool insert = false;
phys_addr_t obase = base;
phys_addr_t end = base + memblock_cap_size(base, &size);
- int idx, nr_new;
+ int idx, nr_new, start_rgn = -1, end_rgn;
struct memblock_region *rgn;
if (!size)
@@ -601,11 +606,11 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
/*
* The worst case is when new range overlaps all existing regions,
* then we'll need type->cnt + 1 empty regions in @type. So if
- * type->cnt * 2 + 1 is less than type->max, we know
+ * type->cnt * 2 + 1 is less than or equal to type->max, we know
* that there is enough empty regions in @type, and we can insert
* regions directly.
*/
- if (type->cnt * 2 + 1 < type->max)
+ if (type->cnt * 2 + 1 <= type->max)
insert = true;
repeat:
@@ -635,10 +640,14 @@ repeat:
#endif
WARN_ON(flags != rgn->flags);
nr_new++;
- if (insert)
+ if (insert) {
+ if (start_rgn == -1)
+ start_rgn = idx;
+ end_rgn = idx + 1;
memblock_insert_region(type, idx++, base,
rbase - base, nid,
flags);
+ }
}
/* area below @rend is dealt with, forget about it */
base = min(rend, end);
@@ -647,9 +656,13 @@ repeat:
/* insert the remaining portion */
if (base < end) {
nr_new++;
- if (insert)
+ if (insert) {
+ if (start_rgn == -1)
+ start_rgn = idx;
+ end_rgn = idx + 1;
memblock_insert_region(type, idx, base, end - base,
nid, flags);
+ }
}
if (!nr_new)
@@ -666,7 +679,7 @@ repeat:
insert = true;
goto repeat;
} else {
- memblock_merge_regions(type);
+ memblock_merge_regions(type, start_rgn, end_rgn);
return 0;
}
}
@@ -836,7 +849,7 @@ void __init_memblock memblock_free(void *ptr, size_t size)
* @base: phys starting address of the boot memory block
* @size: size of the boot memory block in bytes
*
- * Free boot memory block previously allocated by memblock_alloc_xx() API.
+ * Free boot memory block previously allocated by memblock_phys_alloc_xx() API.
* The freeing memory will not be released to the buddy allocator.
*/
int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size)
@@ -902,7 +915,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base,
r->flags &= ~flag;
}
- memblock_merge_regions(type);
+ memblock_merge_regions(type, start_rgn, end_rgn);
return 0;
}
@@ -1275,7 +1288,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
for (i = start_rgn; i < end_rgn; i++)
memblock_set_region_node(&type->regions[i], nid);
- memblock_merge_regions(type);
+ memblock_merge_regions(type, start_rgn, end_rgn);
#endif
return 0;
}
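
memblock_merge_regions() now merges only inside the window [start_rgn - 1, end_rgn) that the preceding insertions touched, instead of rescanning the whole region array. A compact userspace sketch of window-limited merging of adjacent, touching ranges (struct region below is a simplified stand-in for struct memblock_region):

#include <stdio.h>

struct region {
	unsigned long base;
	unsigned long size;
};

/* Merge neighbouring regions that touch, but only inside [start, end). */
static int merge_regions(struct region *r, int cnt, int start, int end)
{
	int i = start > 0 ? start - 1 : 0;

	if (end > cnt - 1)
		end = cnt - 1;

	while (i < end) {
		struct region *this = &r[i];
		struct region *next = &r[i + 1];

		if (this->base + this->size != next->base) {
			i++;
			continue;
		}

		this->size += next->size;
		/* close the gap left by the absorbed region */
		for (int j = i + 1; j < cnt - 1; j++)
			r[j] = r[j + 1];
		cnt--;
		end--;
	}
	return cnt;
}

int main(void)
{
	struct region r[] = { {0, 4}, {4, 4}, {16, 4}, {20, 4} };
	int cnt = merge_regions(r, 4, 1, 2);

	for (int i = 0; i < cnt; i++)
		printf("[%lu,+%lu) ", r[i].base, r[i].size);
	printf("\n");	/* [0,+8) [16,+4) [20,+4): the window only covered index 0..1 */
	return 0;
}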
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ab457f0394ab..5abffe6f8389 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -63,7 +63,6 @@
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
-#include <linux/parser.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -89,6 +88,9 @@ static bool cgroup_memory_nosocket __ro_after_init;
/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem __ro_after_init;
+/* BPF memory accounting disabled? */
+static bool cgroup_memory_nobpf __ro_after_init;
+
#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
@@ -346,26 +348,27 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
* conditional to this static branch, we'll have to allow modules that does
* kmem_cache_alloc and the such to see this symbol as well
*/
-DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
-EXPORT_SYMBOL(memcg_kmem_enabled_key);
+DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key);
+EXPORT_SYMBOL(memcg_kmem_online_key);
+
+DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key);
+EXPORT_SYMBOL(memcg_bpf_enabled_key);
#endif
/**
- * mem_cgroup_css_from_page - css of the memcg associated with a page
- * @page: page of interest
+ * mem_cgroup_css_from_folio - css of the memcg associated with a folio
+ * @folio: folio of interest
*
* If memcg is bound to the default hierarchy, css of the memcg associated
- * with @page is returned. The returned css remains associated with @page
+ * with @folio is returned. The returned css remains associated with @folio
* until it is released.
*
* If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
* is returned.
*/
-struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
+struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
{
- struct mem_cgroup *memcg;
-
- memcg = page_memcg(page);
+ struct mem_cgroup *memcg = folio_memcg(folio);
if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
memcg = root_mem_cgroup;
@@ -478,6 +481,12 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
struct mem_cgroup_per_node *mz;
struct mem_cgroup_tree_per_node *mctz;
+ if (lru_gen_enabled()) {
+ if (soft_limit_excess(memcg))
+ lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec);
+ return;
+ }
+
mctz = soft_limit_tree.rb_tree_per_node[nid];
if (!mctz)
return;
@@ -2393,8 +2402,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
psi_memstall_enter(&pflags);
nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
gfp_mask,
- MEMCG_RECLAIM_MAY_SWAP,
- NULL);
+ MEMCG_RECLAIM_MAY_SWAP);
psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
@@ -2685,8 +2693,7 @@ retry:
psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
- gfp_mask, reclaim_options,
- NULL);
+ gfp_mask, reclaim_options);
psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -2942,13 +2949,13 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
}
/*
- * page_memcg_check() is used here, because in theory we can encounter
+ * folio_memcg_check() is used here, because in theory we can encounter
* a folio where the slab flag has been cleared already, but
* slab->memcg_data has not been freed yet
- * page_memcg_check(page) will guarantee that a proper memory
+ * folio_memcg_check() will guarantee that a proper memory
* cgroup pointer or NULL will be returned.
*/
- return page_memcg_check(folio_page(folio, 0));
+ return folio_memcg_check(folio);
}
/*
@@ -3033,7 +3040,7 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
{
struct obj_cgroup *objcg;
- if (!memcg_kmem_enabled())
+ if (!memcg_kmem_online())
return NULL;
if (PageMemcgKmem(page)) {
@@ -3506,8 +3513,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
}
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP,
- NULL)) {
+ memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
ret = -EBUSY;
break;
}
@@ -3530,6 +3536,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
struct mem_cgroup_tree_per_node *mctz;
unsigned long excess;
+ if (lru_gen_enabled())
+ return 0;
+
if (order > 0)
return 0;
@@ -3618,8 +3627,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
return -EINTR;
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- MEMCG_RECLAIM_MAY_SWAP,
- NULL))
+ MEMCG_RECLAIM_MAY_SWAP))
nr_retries--;
}
@@ -3744,7 +3752,7 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
objcg->memcg = memcg;
rcu_assign_pointer(memcg->objcg, objcg);
- static_branch_enable(&memcg_kmem_enabled_key);
+ static_branch_enable(&memcg_kmem_online_key);
memcg->kmemcg_id = memcg->id.id;
@@ -3919,6 +3927,10 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
+		     "Please report your use case to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
+
if (val & ~MOVE_MASK)
return -EINVAL;
@@ -5362,6 +5374,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_inc(&memcg_sockets_enabled_key);
+#if defined(CONFIG_MEMCG_KMEM)
+ if (!cgroup_memory_nobpf)
+ static_branch_inc(&memcg_bpf_enabled_key);
+#endif
+
return &memcg->css;
}
@@ -5387,6 +5404,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (unlikely(mem_cgroup_is_root(memcg)))
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
2UL*HZ);
+ lru_gen_online_memcg(memcg);
return 0;
offline_kmem:
memcg_offline_kmem(memcg);
@@ -5418,6 +5436,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
memcg_offline_kmem(memcg);
reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
+ lru_gen_offline_memcg(memcg);
drain_all_stock(memcg);
@@ -5429,6 +5448,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
invalidate_reclaim_iterators(memcg);
+ lru_gen_release_memcg(memcg);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -5446,6 +5466,11 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
static_branch_dec(&memcg_sockets_enabled_key);
+#if defined(CONFIG_MEMCG_KMEM)
+ if (!cgroup_memory_nobpf)
+ static_branch_dec(&memcg_bpf_enabled_key);
+#endif
+
vmpressure_cleanup(&memcg->vmpressure);
cancel_work_sync(&memcg->high_work);
mem_cgroup_remove_from_trees(memcg);
@@ -5692,7 +5717,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
*
- * The caller must make sure the page is not on LRU (isolate_page() is useful.)
+ * The page must be locked and not on the LRU.
*
* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
* from old cgroup.
@@ -5709,20 +5734,13 @@ static int mem_cgroup_move_account(struct page *page,
int nid, ret;
VM_BUG_ON(from == to);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
VM_BUG_ON(compound && !folio_test_large(folio));
- /*
- * Prevent mem_cgroup_migrate() from looking at
- * page's memory cgroup of its source page while we change it.
- */
- ret = -EBUSY;
- if (!folio_trylock(folio))
- goto out;
-
ret = -EINVAL;
if (folio_memcg(folio) != from)
- goto out_unlock;
+ goto out;
pgdat = folio_pgdat(folio);
from_vec = mem_cgroup_lruvec(from, pgdat);
@@ -5809,8 +5827,6 @@ static int mem_cgroup_move_account(struct page *page,
mem_cgroup_charge_statistics(from, -nr_pages);
memcg_check_events(from, nid);
local_irq_enable();
-out_unlock:
- folio_unlock(folio);
out:
return ret;
}
@@ -5859,6 +5875,29 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
else if (is_swap_pte(ptent))
page = mc_handle_swap_pte(vma, ptent, &ent);
+ if (target && page) {
+ if (!trylock_page(page)) {
+ put_page(page);
+ return ret;
+ }
+ /*
+ * page_mapped() must be stable during the move. This
+ * pte is locked, so if it's present, the page cannot
+ * become unmapped. If it isn't, we have only partial
+ * control over the mapped state: the page lock will
+ * prevent new faults against pagecache and swapcache,
+ * so an unmapped page cannot become mapped. However,
+ * if the page is already mapped elsewhere, it can
+ * unmap, and there is nothing we can do about it.
+ * Alas, skip moving the page in this case.
+ */
+ if (!pte_present(ptent) && page_mapped(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ret;
+ }
+ }
+
if (!page && !ent.val)
return ret;
if (page) {
@@ -5875,8 +5914,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
if (target)
target->page = page;
}
- if (!ret || !target)
+ if (!ret || !target) {
+ if (target)
+ unlock_page(page);
put_page(page);
+ }
}
/*
* There is a swap entry and a page doesn't exist or isn't charged.
@@ -5916,6 +5958,10 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
ret = MC_TARGET_PAGE;
if (target) {
get_page(page);
+ if (!trylock_page(page)) {
+ put_page(page);
+ return MC_TARGET_NONE;
+ }
target->page = page;
}
}
@@ -6146,7 +6192,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
if (target_type == MC_TARGET_PAGE) {
page = target.page;
- if (!isolate_lru_page(page)) {
+ if (isolate_lru_page(page)) {
if (!mem_cgroup_move_account(page, true,
mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
@@ -6154,6 +6200,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
}
putback_lru_page(page);
}
+ unlock_page(page);
put_page(page);
} else if (target_type == MC_TARGET_DEVICE) {
page = target.page;
@@ -6162,6 +6209,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
}
+ unlock_page(page);
put_page(page);
}
spin_unlock(ptl);
@@ -6194,7 +6242,7 @@ retry:
*/
if (PageTransCompound(page))
goto put;
- if (!device && isolate_lru_page(page))
+ if (!device && !isolate_lru_page(page))
goto put;
if (!mem_cgroup_move_account(page, false,
mc.from, mc.to)) {
@@ -6204,7 +6252,8 @@ retry:
}
if (!device)
putback_lru_page(page);
-put: /* get_mctgt_type() gets the page */
+put: /* get_mctgt_type() gets & locks the page */
+ unlock_page(page);
put_page(page);
break;
case MC_TARGET_SWAP:
@@ -6429,8 +6478,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
}
reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
- NULL);
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
if (!reclaimed && !nr_retries--)
break;
@@ -6479,8 +6527,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (nr_reclaims) {
if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
- NULL))
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
nr_reclaims--;
continue;
}
@@ -6603,54 +6650,21 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
return nbytes;
}
-enum {
- MEMORY_RECLAIM_NODES = 0,
- MEMORY_RECLAIM_NULL,
-};
-
-static const match_table_t if_tokens = {
- { MEMORY_RECLAIM_NODES, "nodes=%s" },
- { MEMORY_RECLAIM_NULL, NULL },
-};
-
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_retries = MAX_RECLAIM_RETRIES;
unsigned long nr_to_reclaim, nr_reclaimed = 0;
- unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
- MEMCG_RECLAIM_PROACTIVE;
- char *old_buf, *start;
- substring_t args[MAX_OPT_ARGS];
- int token;
- char value[256];
- nodemask_t nodemask = NODE_MASK_ALL;
-
- buf = strstrip(buf);
-
- old_buf = buf;
- nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
- if (buf == old_buf)
- return -EINVAL;
+ unsigned int reclaim_options;
+ int err;
buf = strstrip(buf);
+ err = page_counter_memparse(buf, "", &nr_to_reclaim);
+ if (err)
+ return err;
- while ((start = strsep(&buf, " ")) != NULL) {
- if (!strlen(start))
- continue;
- token = match_token(start, if_tokens, args);
- match_strlcpy(value, args, sizeof(value));
- switch (token) {
- case MEMORY_RECLAIM_NODES:
- if (nodelist_parse(value, nodemask) < 0)
- return -EINVAL;
- break;
- default:
- return -EINVAL;
- }
- }
-
+ reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
while (nr_reclaimed < nr_to_reclaim) {
unsigned long reclaimed;
@@ -6667,8 +6681,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
reclaimed = try_to_free_mem_cgroup_pages(memcg,
nr_to_reclaim - nr_reclaimed,
- GFP_KERNEL, reclaim_options,
- &nodemask);
+ GFP_KERNEL, reclaim_options);
if (!reclaimed && !nr_retries--)
return -EAGAIN;
@@ -7310,6 +7323,8 @@ static int __init cgroup_memory(char *s)
cgroup_memory_nosocket = true;
if (!strcmp(token, "nokmem"))
cgroup_memory_nokmem = true;
+ if (!strcmp(token, "nobpf"))
+ cgroup_memory_nobpf = true;
}
return 1;
}
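
After dropping the nodes= parsing, memory_reclaim() above reduces to a retry loop: request the remaining amount, count an unproductive pass against a bounded retry budget, and return -EAGAIN once that budget is spent. The same control flow in a standalone sketch with a stubbed reclaimer:

#include <errno.h>
#include <stdio.h>

#define MAX_RETRIES	3

/* Stub reclaimer: frees at most 2 units per call until its budget runs dry. */
static unsigned long stub_reclaim(unsigned long want)
{
	static unsigned long budget = 5;
	unsigned long got = want < 2 ? want : 2;

	if (got > budget)
		got = budget;
	budget -= got;
	return got;
}

static int proactive_reclaim(unsigned long nr_to_reclaim)
{
	unsigned long nr_reclaimed = 0;
	int nr_retries = MAX_RETRIES;

	while (nr_reclaimed < nr_to_reclaim) {
		unsigned long reclaimed =
			stub_reclaim(nr_to_reclaim - nr_reclaimed);

		/* No progress: burn a retry; bail out when they run out. */
		if (!reclaimed && !nr_retries--)
			return -EAGAIN;

		nr_reclaimed += reclaimed;
	}
	return 0;
}

int main(void)
{
	printf("reclaim(4) -> %d\n", proactive_reclaim(4));	/* 0 */
	printf("reclaim(4) -> %d\n", proactive_reclaim(4));	/* -EAGAIN */
	return 0;
}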
diff --git a/mm/memfd.c b/mm/memfd.c
index 08f5f8304746..a0a7a37e8177 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -18,6 +18,7 @@
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/memfd.h>
+#include <linux/pid_namespace.h>
#include <uapi/linux/memfd.h>
/*
@@ -147,6 +148,7 @@ static unsigned int *memfd_file_seals_ptr(struct file *file)
}
#define F_ALL_SEALS (F_SEAL_SEAL | \
+ F_SEAL_EXEC | \
F_SEAL_SHRINK | \
F_SEAL_GROW | \
F_SEAL_WRITE | \
@@ -175,6 +177,7 @@ static int memfd_add_seals(struct file *file, unsigned int seals)
* SEAL_SHRINK: Prevent the file from shrinking
* SEAL_GROW: Prevent the file from growing
* SEAL_WRITE: Prevent write access to the file
+ * SEAL_EXEC: Prevent modification of the exec bits in the file mode
*
* As we don't require any trust relationship between two parties, we
* must prevent seals from being removed. Therefore, sealing a file
@@ -219,6 +222,12 @@ static int memfd_add_seals(struct file *file, unsigned int seals)
}
}
+ /*
+	 * SEAL_EXEC implies SEAL_WRITE, making W^X from the start.
+ */
+ if (seals & F_SEAL_EXEC && inode->i_mode & 0111)
+ seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE;
+
*file_seals |= seals;
error = 0;
@@ -261,12 +270,13 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
unsigned int, flags)
{
+ char comm[TASK_COMM_LEN];
unsigned int *file_seals;
struct file *file;
int fd, error;
@@ -283,6 +293,40 @@ SYSCALL_DEFINE2(memfd_create,
return -EINVAL;
}
+	/* Invalid if both EXEC and NOEXEC_SEAL are set. */
+ if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
+ return -EINVAL;
+
+ if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
+#ifdef CONFIG_SYSCTL
+ int sysctl = MEMFD_NOEXEC_SCOPE_EXEC;
+ struct pid_namespace *ns;
+
+ ns = task_active_pid_ns(current);
+ if (ns)
+ sysctl = ns->memfd_noexec_scope;
+
+ switch (sysctl) {
+ case MEMFD_NOEXEC_SCOPE_EXEC:
+ flags |= MFD_EXEC;
+ break;
+ case MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL:
+ flags |= MFD_NOEXEC_SEAL;
+ break;
+ default:
+ pr_warn_once(
+ "memfd_create(): MFD_NOEXEC_SEAL is enforced, pid=%d '%s'\n",
+ task_pid_nr(current), get_task_comm(comm, current));
+ return -EINVAL;
+ }
+#else
+ flags |= MFD_EXEC;
+#endif
+ pr_warn_once(
+			"memfd_create() without MFD_EXEC or MFD_NOEXEC_SEAL, pid=%d '%s'\n",
+ task_pid_nr(current), get_task_comm(comm, current));
+ }
+
/* length includes terminating zero */
len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
if (len <= 0)
@@ -326,7 +370,15 @@ SYSCALL_DEFINE2(memfd_create,
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
file->f_flags |= O_LARGEFILE;
- if (flags & MFD_ALLOW_SEALING) {
+ if (flags & MFD_NOEXEC_SEAL) {
+ struct inode *inode = file_inode(file);
+
+ inode->i_mode &= ~0111;
+ file_seals = memfd_file_seals_ptr(file);
+ *file_seals &= ~F_SEAL_SEAL;
+ *file_seals |= F_SEAL_EXEC;
+ } else if (flags & MFD_ALLOW_SEALING) {
+ /* MFD_EXEC and MFD_ALLOW_SEALING are set */
file_seals = memfd_file_seals_ptr(file);
*file_seals &= ~F_SEAL_SEAL;
}
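
When the caller passes neither MFD_EXEC nor MFD_NOEXEC_SEAL, the memfd_create() hunk above picks a default from the pid namespace's memfd_noexec_scope setting, or rejects the call when the strictest scope is enforced. A condensed sketch of just that decision (the scope enum and helper below are illustrative; the two MFD_* values are reproduced only so the sketch compiles on its own):

#include <stdio.h>

#define MFD_NOEXEC_SEAL		0x08U
#define MFD_EXEC		0x10U

enum scope {
	SCOPE_EXEC,		/* default to executable memfds */
	SCOPE_NOEXEC_SEAL,	/* default to sealed, non-executable memfds */
	SCOPE_NOEXEC_ENFORCED,	/* reject callers that did not choose */
};

/* Returns adjusted flags, or -1 when the sysctl demands an explicit choice. */
static int apply_exec_default(unsigned int flags, enum scope sysctl)
{
	if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
		return -1;			/* mutually exclusive */

	if (flags & (MFD_EXEC | MFD_NOEXEC_SEAL))
		return (int)flags;		/* caller chose explicitly */

	switch (sysctl) {
	case SCOPE_EXEC:
		return (int)(flags | MFD_EXEC);
	case SCOPE_NOEXEC_SEAL:
		return (int)(flags | MFD_NOEXEC_SEAL);
	default:
		return -1;			/* enforced: caller must decide */
	}
}

int main(void)
{
	printf("default  -> %#x\n",
	       (unsigned int)apply_exec_default(0, SCOPE_NOEXEC_SEAL));	/* 0x8 */
	printf("enforced -> %d\n",
	       apply_exec_default(0, SCOPE_NOEXEC_ENFORCED));		/* -1 */
	return 0;
}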
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c77a9e37e27e..fae9baf3be16 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -24,7 +24,7 @@
* - You have a test that can be added to mce-test
* https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
* - The case actually shows up as a frequent (top 10) page state in
- * tools/vm/page-types when running a real workload.
+ * tools/mm/page-types when running a real workload.
*
* There are several operations here with exponential complexity because
* of unsuitable VM data structures. For example the operation to map back
@@ -87,6 +87,41 @@ inline void num_poisoned_pages_sub(unsigned long pfn, long i)
memblk_nr_poison_sub(pfn, i);
}
+/**
+ * MF_ATTR_RO - Create sysfs entry for each memory failure statistic.
+ * @_name: name of the file in the per NUMA sysfs directory.
+ */
+#define MF_ATTR_RO(_name) \
+static ssize_t _name##_show(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+{ \
+ struct memory_failure_stats *mf_stats = \
+ &NODE_DATA(dev->id)->mf_stats; \
+ return sprintf(buf, "%lu\n", mf_stats->_name); \
+} \
+static DEVICE_ATTR_RO(_name)
+
+MF_ATTR_RO(total);
+MF_ATTR_RO(ignored);
+MF_ATTR_RO(failed);
+MF_ATTR_RO(delayed);
+MF_ATTR_RO(recovered);
+
+static struct attribute *memory_failure_attr[] = {
+ &dev_attr_total.attr,
+ &dev_attr_ignored.attr,
+ &dev_attr_failed.attr,
+ &dev_attr_delayed.attr,
+ &dev_attr_recovered.attr,
+ NULL,
+};
+
+const struct attribute_group memory_failure_attr_group = {
+ .name = "memory_failure",
+ .attrs = memory_failure_attr,
+};
+
/*
* Return values:
* 1: the page is dissolved (if needed) and taken off from buddy,
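
MF_ATTR_RO() above stamps out one read-only sysfs show() routine per counter by token-pasting the field name into both the function name and the struct member access. The same technique in a tiny standalone form (struct stats and its fields are made up for illustration):

#include <stdio.h>

struct stats {
	unsigned long total;
	unsigned long failed;
};

/* Generate one getter per field, same shape as MF_ATTR_RO(). */
#define STATS_RO(_name)						\
static unsigned long _name##_show(const struct stats *s)	\
{								\
	return s->_name;					\
}

STATS_RO(total)
STATS_RO(failed)

int main(void)
{
	struct stats s = { .total = 7, .failed = 2 };

	printf("total=%lu failed=%lu\n", total_show(&s), failed_show(&s));
	return 0;
}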
@@ -811,7 +846,7 @@ static const char * const action_page_types[] = {
*/
static int delete_from_lru_cache(struct page *p)
{
- if (!isolate_lru_page(p)) {
+ if (isolate_lru_page(p)) {
/*
* Clear sensible page flags, so that the buddy system won't
* complain when the page is unpoison-and-freed.
@@ -1034,7 +1069,7 @@ static int me_pagecache_dirty(struct page_state *ps, struct page *p)
* cache and swap cache(ie. page is freshly swapped in). So it could be
* referenced concurrently by 2 types of PTEs:
* normal PTEs and swap PTEs. We try to handle them consistently by calling
- * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
+ * try_to_unmap(!TTU_HWPOISON) to convert the normal PTEs to swap PTEs,
* and then
* - clear dirty bit to prevent IO
* - remove from LRU
@@ -1192,6 +1227,39 @@ static struct page_state error_states[] = {
#undef slab
#undef reserved
+static void update_per_node_mf_stats(unsigned long pfn,
+ enum mf_result result)
+{
+ int nid = MAX_NUMNODES;
+ struct memory_failure_stats *mf_stats = NULL;
+
+ nid = pfn_to_nid(pfn);
+ if (unlikely(nid < 0 || nid >= MAX_NUMNODES)) {
+ WARN_ONCE(1, "Memory failure: pfn=%#lx, invalid nid=%d", pfn, nid);
+ return;
+ }
+
+ mf_stats = &NODE_DATA(nid)->mf_stats;
+ switch (result) {
+ case MF_IGNORED:
+ ++mf_stats->ignored;
+ break;
+ case MF_FAILED:
+ ++mf_stats->failed;
+ break;
+ case MF_DELAYED:
+ ++mf_stats->delayed;
+ break;
+ case MF_RECOVERED:
+ ++mf_stats->recovered;
+ break;
+ default:
+ WARN_ONCE(1, "Memory failure: mf_result=%d is not properly handled", result);
+ break;
+ }
+ ++mf_stats->total;
+}
+
/*
* "Dirty/Clean" indication is not 100% accurate due to the possibility of
* setting PG_dirty outside page lock. See also comment above set_page_dirty().
@@ -1202,6 +1270,9 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type,
trace_memory_failure_event(pfn, type, result);
num_poisoned_pages_inc(pfn);
+
+ update_per_node_mf_stats(pfn, result);
+
pr_err("%#lx: recovery action for %s: %s\n",
pfn, action_page_types[type], action_name[result]);
@@ -1257,28 +1328,28 @@ static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
static int __get_hwpoison_page(struct page *page, unsigned long flags)
{
- struct page *head = compound_head(page);
+ struct folio *folio = page_folio(page);
int ret = 0;
bool hugetlb = false;
- ret = get_hwpoison_huge_page(head, &hugetlb, false);
+ ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, false);
if (hugetlb)
return ret;
/*
- * This check prevents from calling get_page_unless_zero() for any
- * unsupported type of page in order to reduce the risk of unexpected
- * races caused by taking a page refcount.
+ * This check prevents from calling folio_try_get() for any
+ * unsupported type of folio in order to reduce the risk of unexpected
+ * races caused by taking a folio refcount.
*/
- if (!HWPoisonHandlable(head, flags))
+ if (!HWPoisonHandlable(&folio->page, flags))
return -EBUSY;
- if (get_page_unless_zero(head)) {
- if (head == compound_head(page))
+ if (folio_try_get(folio)) {
+ if (folio == page_folio(page))
return 1;
pr_info("%#lx cannot catch tail\n", page_to_pfn(page));
- put_page(head);
+ folio_put(folio);
}
return 0;
@@ -1347,11 +1418,11 @@ out:
static int __get_unpoison_page(struct page *page)
{
- struct page *head = compound_head(page);
+ struct folio *folio = page_folio(page);
int ret = 0;
bool hugetlb = false;
- ret = get_hwpoison_huge_page(head, &hugetlb, true);
+ ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, true);
if (hugetlb)
return ret;
@@ -1415,7 +1486,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
int flags, struct page *hpage)
{
struct folio *folio = page_folio(hpage);
- enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC;
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
struct address_space *mapping;
LIST_HEAD(tokill);
bool unmap_success;
@@ -1445,7 +1516,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (PageSwapCache(p)) {
pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
- ttu |= TTU_IGNORE_HWPOISON;
+ ttu &= ~TTU_HWPOISON;
}
/*
@@ -1460,7 +1531,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (page_mkclean(hpage)) {
SetPageDirty(hpage);
} else {
- ttu |= TTU_IGNORE_HWPOISON;
+ ttu &= ~TTU_HWPOISON;
pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
pfn);
}
@@ -1695,18 +1766,18 @@ struct raw_hwp_page {
struct page *page;
};
-static inline struct llist_head *raw_hwp_list_head(struct page *hpage)
+static inline struct llist_head *raw_hwp_list_head(struct folio *folio)
{
- return (struct llist_head *)&page_folio(hpage)->_hugetlb_hwpoison;
+ return (struct llist_head *)&folio->_hugetlb_hwpoison;
}
-static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag)
+static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
{
struct llist_head *head;
struct llist_node *t, *tnode;
unsigned long count = 0;
- head = raw_hwp_list_head(hpage);
+ head = raw_hwp_list_head(folio);
llist_for_each_safe(tnode, t, head->first) {
struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
@@ -1721,21 +1792,21 @@ static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag)
return count;
}
-static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
+static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
{
struct llist_head *head;
struct raw_hwp_page *raw_hwp;
struct llist_node *t, *tnode;
- int ret = TestSetPageHWPoison(hpage) ? -EHWPOISON : 0;
+ int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0;
/*
* Once the hwpoison hugepage has lost reliable raw error info,
* there is little meaning to keep additional error info precisely,
* so skip to add additional raw error info.
*/
- if (HPageRawHwpUnreliable(hpage))
+ if (folio_test_hugetlb_raw_hwp_unreliable(folio))
return -EHWPOISON;
- head = raw_hwp_list_head(hpage);
+ head = raw_hwp_list_head(folio);
llist_for_each_safe(tnode, t, head->first) {
struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
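[Editor's sketch] The raw_hwp list code above walks an llist with llist_for_each_safe() and recovers each entry via container_of(). The standalone sketch below shows the same "grab next, then free the containing entry" walk over a plain singly linked list; the node and raw_entry types are illustrative only.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct node { struct node *next; };

struct raw_entry {
        struct node link;
        unsigned long pfn;              /* payload: the recorded pfn */
};

int main(void)
{
        struct node *head = NULL, *cur, *next;
        unsigned long count = 0;

        /* Build a two-entry list by pushing at the head. */
        for (unsigned long pfn = 1; pfn <= 2; pfn++) {
                struct raw_entry *e = malloc(sizeof(*e));

                if (!e)
                        return 1;
                e->pfn = pfn;
                e->link.next = head;
                head = &e->link;
        }

        /* "Safe" walk: remember ->next before freeing the container. */
        for (cur = head; cur; cur = next) {
                struct raw_entry *e = container_of(cur, struct raw_entry, link);

                next = cur->next;
                printf("freeing raw entry for pfn %lu\n", e->pfn);
                free(e);
                count++;
        }
        printf("freed %lu entries\n", count);
        return 0;
}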
@@ -1756,41 +1827,41 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
* hwpoisoned subpages, and we need refuse to free/dissolve
* this hwpoisoned hugepage.
*/
- SetHPageRawHwpUnreliable(hpage);
+ folio_set_hugetlb_raw_hwp_unreliable(folio);
/*
- * Once HPageRawHwpUnreliable is set, raw_hwp_page is not
+ * Once hugetlb_raw_hwp_unreliable is set, raw_hwp_page is not
* used any more, so free it.
*/
- __free_raw_hwp_pages(hpage, false);
+ __folio_free_raw_hwp(folio, false);
}
return ret;
}
-static unsigned long free_raw_hwp_pages(struct page *hpage, bool move_flag)
+static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag)
{
/*
- * HPageVmemmapOptimized hugepages can't be freed because struct
+ * hugetlb_vmemmap_optimized hugepages can't be freed because struct
* pages for tail pages are required but they don't exist.
*/
- if (move_flag && HPageVmemmapOptimized(hpage))
+ if (move_flag && folio_test_hugetlb_vmemmap_optimized(folio))
return 0;
/*
- * HPageRawHwpUnreliable hugepages shouldn't be unpoisoned by
+ * hugetlb_raw_hwp_unreliable hugepages shouldn't be unpoisoned by
* definition.
*/
- if (HPageRawHwpUnreliable(hpage))
+ if (folio_test_hugetlb_raw_hwp_unreliable(folio))
return 0;
- return __free_raw_hwp_pages(hpage, move_flag);
+ return __folio_free_raw_hwp(folio, move_flag);
}
-void hugetlb_clear_page_hwpoison(struct page *hpage)
+void folio_clear_hugetlb_hwpoison(struct folio *folio)
{
- if (HPageRawHwpUnreliable(hpage))
+ if (folio_test_hugetlb_raw_hwp_unreliable(folio))
return;
- ClearPageHWPoison(hpage);
- free_raw_hwp_pages(hpage, true);
+ folio_clear_hwpoison(folio);
+ folio_free_raw_hwp(folio, true);
}
/*
@@ -1807,20 +1878,20 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
bool *migratable_cleared)
{
struct page *page = pfn_to_page(pfn);
- struct page *head = compound_head(page);
+ struct folio *folio = page_folio(page);
int ret = 2; /* fallback to normal page handling */
bool count_increased = false;
- if (!PageHeadHuge(head))
+ if (!folio_test_hugetlb(folio))
goto out;
if (flags & MF_COUNT_INCREASED) {
ret = 1;
count_increased = true;
- } else if (HPageFreed(head)) {
+ } else if (folio_test_hugetlb_freed(folio)) {
ret = 0;
- } else if (HPageMigratable(head)) {
- ret = get_page_unless_zero(head);
+ } else if (folio_test_hugetlb_migratable(folio)) {
+ ret = folio_try_get(folio);
if (ret)
count_increased = true;
} else {
@@ -1829,24 +1900,24 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
goto out;
}
- if (hugetlb_set_page_hwpoison(head, page)) {
+ if (folio_set_hugetlb_hwpoison(folio, page)) {
ret = -EHWPOISON;
goto out;
}
/*
- * Clearing HPageMigratable for hwpoisoned hugepages to prevent them
+ * Clearing hugetlb_migratable for hwpoisoned hugepages to prevent them
* from being migrated by memory hotremove.
*/
- if (count_increased && HPageMigratable(head)) {
- ClearHPageMigratable(head);
+ if (count_increased && folio_test_hugetlb_migratable(folio)) {
+ folio_clear_hugetlb_migratable(folio);
*migratable_cleared = true;
}
return ret;
out:
if (count_increased)
- put_page(head);
+ folio_put(folio);
return ret;
}
@@ -1860,7 +1931,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
{
int res;
struct page *p = pfn_to_page(pfn);
- struct page *head;
+ struct folio *folio;
unsigned long page_flags;
bool migratable_cleared = false;
@@ -1873,8 +1944,8 @@ retry:
} else if (res == -EHWPOISON) {
pr_err("%#lx: already hardware poisoned\n", pfn);
if (flags & MF_ACTION_REQUIRED) {
- head = compound_head(p);
- res = kill_accessing_process(current, page_to_pfn(head), flags);
+ folio = page_folio(p);
+ res = kill_accessing_process(current, folio_pfn(folio), flags);
}
return res;
} else if (res == -EBUSY) {
@@ -1885,16 +1956,16 @@ retry:
return action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
}
- head = compound_head(p);
- lock_page(head);
+ folio = page_folio(p);
+ folio_lock(folio);
if (hwpoison_filter(p)) {
- hugetlb_clear_page_hwpoison(head);
+ folio_clear_hugetlb_hwpoison(folio);
if (migratable_cleared)
- SetHPageMigratable(head);
- unlock_page(head);
+ folio_set_hugetlb_migratable(folio);
+ folio_unlock(folio);
if (res == 1)
- put_page(head);
+ folio_put(folio);
return -EOPNOTSUPP;
}
@@ -1903,7 +1974,7 @@ retry:
* or demotion can be prevented by PageHWPoison flag.
*/
if (res == 0) {
- unlock_page(head);
+ folio_unlock(folio);
if (__page_handle_poison(p) >= 0) {
page_ref_inc(p);
res = MF_RECOVERED;
@@ -1913,10 +1984,10 @@ retry:
return action_result(pfn, MF_MSG_FREE_HUGE, res);
}
- page_flags = head->flags;
+ page_flags = folio->flags;
- if (!hwpoison_user_mappings(p, pfn, flags, head)) {
- unlock_page(head);
+ if (!hwpoison_user_mappings(p, pfn, flags, &folio->page)) {
+ folio_unlock(folio);
return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
}
@@ -1929,7 +2000,7 @@ static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *
return 0;
}
-static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag)
+static inline unsigned long folio_free_raw_hwp(struct folio *folio, bool flag)
{
return 0;
}
@@ -2167,7 +2238,7 @@ try_again:
}
/*
- * __munlock_pagevec may clear a writeback page's LRU flag without
+ * __munlock_folio() may clear a writeback page's LRU flag without
* page_lock. We need wait writeback completion for this page or it
* may trigger vfs BUG while evict inode.
*/
@@ -2335,7 +2406,7 @@ core_initcall(memory_failure_init);
*/
int unpoison_memory(unsigned long pfn)
{
- struct page *page;
+ struct folio *folio;
struct page *p;
int ret = -EBUSY;
unsigned long count = 1;
@@ -2347,7 +2418,7 @@ int unpoison_memory(unsigned long pfn)
return -ENXIO;
p = pfn_to_page(pfn);
- page = compound_head(p);
+ folio = page_folio(p);
mutex_lock(&mf_mutex);
@@ -2358,44 +2429,44 @@ int unpoison_memory(unsigned long pfn)
goto unlock_mutex;
}
- if (!PageHWPoison(p)) {
+ if (!folio_test_hwpoison(folio)) {
unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
pfn, &unpoison_rs);
goto unlock_mutex;
}
- if (page_count(page) > 1) {
+ if (folio_ref_count(folio) > 1) {
unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
pfn, &unpoison_rs);
goto unlock_mutex;
}
- if (page_mapped(page)) {
+ if (folio_mapped(folio)) {
unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
pfn, &unpoison_rs);
goto unlock_mutex;
}
- if (page_mapping(page)) {
+ if (folio_mapping(folio)) {
unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
pfn, &unpoison_rs);
goto unlock_mutex;
}
- if (PageSlab(page) || PageTable(page) || PageReserved(page))
+ if (folio_test_slab(folio) || PageTable(&folio->page) || folio_test_reserved(folio))
goto unlock_mutex;
ret = get_hwpoison_page(p, MF_UNPOISON);
if (!ret) {
if (PageHuge(p)) {
huge = true;
- count = free_raw_hwp_pages(page, false);
+ count = folio_free_raw_hwp(folio, false);
if (count == 0) {
ret = -EBUSY;
goto unlock_mutex;
}
}
- ret = TestClearPageHWPoison(page) ? 0 : -EBUSY;
+ ret = folio_test_clear_hwpoison(folio) ? 0 : -EBUSY;
} else if (ret < 0) {
if (ret == -EHWPOISON) {
ret = put_page_back_buddy(p) ? 0 : -EBUSY;
@@ -2405,17 +2476,17 @@ int unpoison_memory(unsigned long pfn)
} else {
if (PageHuge(p)) {
huge = true;
- count = free_raw_hwp_pages(page, false);
+ count = folio_free_raw_hwp(folio, false);
if (count == 0) {
ret = -EBUSY;
- put_page(page);
+ folio_put(folio);
goto unlock_mutex;
}
}
- put_page(page);
+ folio_put(folio);
if (TestClearPageHWPoison(p)) {
- put_page(page);
+ folio_put(folio);
ret = 0;
}
}
@@ -2437,15 +2508,15 @@ static bool isolate_page(struct page *page, struct list_head *pagelist)
bool isolated = false;
if (PageHuge(page)) {
- isolated = !isolate_hugetlb(page, pagelist);
+ isolated = isolate_hugetlb(page_folio(page), pagelist);
} else {
bool lru = !__PageMovable(page);
if (lru)
- isolated = !isolate_lru_page(page);
+ isolated = isolate_lru_page(page);
else
- isolated = !isolate_movable_page(page,
- ISOLATE_UNEVICTABLE);
+ isolated = isolate_movable_page(page,
+ ISOLATE_UNEVICTABLE);
if (isolated) {
list_add(&page->lru, pagelist);
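[Editor's sketch] isolate_hugetlb(), isolate_lru_page() and isolate_movable_page() now return a bool that is true on success rather than 0, which is why the callers above drop their negations. A toy comparison of the two conventions; old_isolate and new_isolate are placeholder names.

#include <stdbool.h>
#include <stdio.h>

/* Old convention: 0 on success, negative errno on failure. */
static int old_isolate(int busy)
{
        return busy ? -16 /* -EBUSY */ : 0;
}

/* New convention: true when the page was isolated. */
static bool new_isolate(int busy)
{
        return !busy;
}

int main(void)
{
        bool isolated;

        isolated = !old_isolate(0);     /* old callers had to negate */
        printf("old convention: isolated=%d\n", isolated);

        isolated = new_isolate(0);      /* new callers read naturally */
        printf("new convention: isolated=%d\n", isolated);
        return 0;
}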
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index c734658c6242..e593e56e530b 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -211,8 +211,8 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
ret = device_register(&new_memtier->dev);
if (ret) {
- list_del(&memtier->list);
- put_device(&memtier->dev);
+ list_del(&new_memtier->list);
+ put_device(&new_memtier->dev);
return ERR_PTR(ret);
}
memtier = new_memtier;
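[Editor's sketch] The memory-tiers hunk is a plain bug fix: when device_register() fails, the error path must unwind the tier that was just created, not the stale pointer found earlier in the function. A generic sketch of that unwind rule under invented names (register_thing, put_thing):

#include <stdio.h>
#include <stdlib.h>

struct tier { int id; };

/* Invented stand-in for a registration step that can fail. */
static int register_thing(struct tier *t)
{
        return t->id < 0 ? -1 : 0;
}

static void put_thing(struct tier *t)
{
        free(t);
}

static struct tier *create_tier(int id)
{
        struct tier *new_tier = malloc(sizeof(*new_tier));

        if (!new_tier)
                return NULL;
        new_tier->id = id;

        if (register_thing(new_tier)) {
                /* Unwind the object we just created, not an older pointer. */
                put_thing(new_tier);
                return NULL;
        }
        return new_tier;
}

int main(void)
{
        struct tier *bad = create_tier(-5);     /* forces the error path */
        struct tier *good = create_tier(3);

        printf("bad=%p good id=%d\n", (void *)bad, good ? good->id : -1);
        put_thing(good);
        return 0;
}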
diff --git a/mm/memory.c b/mm/memory.c
index aad226daf41b..f456f3b5049c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -625,6 +625,16 @@ out:
return pfn_to_page(pfn);
}
+struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
+{
+ struct page *page = vm_normal_page(vma, addr, pte);
+
+ if (page)
+ return page_folio(page);
+ return NULL;
+}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd)
@@ -828,12 +838,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
return -EBUSY;
return -ENOENT;
} else if (is_pte_marker_entry(entry)) {
- /*
- * We're copying the pgtable should only because dst_vma has
- * uffd-wp enabled, do sanity check.
- */
- WARN_ON_ONCE(!userfaultfd_wp(dst_vma));
- set_pte_at(dst_mm, addr, dst_pte, pte);
+ if (is_swapin_error_entry(entry) || userfaultfd_wp(dst_vma))
+ set_pte_at(dst_mm, addr, dst_pte, pte);
return 0;
}
if (!userfaultfd_wp(dst_vma))
@@ -857,13 +863,13 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
static inline int
copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
- struct page **prealloc, struct page *page)
+ struct folio **prealloc, struct page *page)
{
- struct page *new_page;
+ struct folio *new_folio;
pte_t pte;
- new_page = *prealloc;
- if (!new_page)
+ new_folio = *prealloc;
+ if (!new_folio)
return -EAGAIN;
/*
@@ -871,18 +877,18 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
* over and copy the page & arm it.
*/
*prealloc = NULL;
- copy_user_highpage(new_page, page, addr, src_vma);
- __SetPageUptodate(new_page);
- page_add_new_anon_rmap(new_page, dst_vma, addr);
- lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
- rss[mm_counter(new_page)]++;
+ copy_user_highpage(&new_folio->page, page, addr, src_vma);
+ __folio_mark_uptodate(new_folio);
+ folio_add_new_anon_rmap(new_folio, dst_vma, addr);
+ folio_add_lru_vma(new_folio, dst_vma);
+ rss[MM_ANONPAGES]++;
/* All done, just insert the new page copy in the child */
- pte = mk_pte(new_page, dst_vma->vm_page_prot);
+ pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot);
pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
if (userfaultfd_pte_wp(dst_vma, *src_pte))
/* Uffd-wp needs to be delivered to dest pte as well */
- pte = pte_wrprotect(pte_mkuffd_wp(pte));
+ pte = pte_mkuffd_wp(pte);
set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
return 0;
}
@@ -894,33 +900,36 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
static inline int
copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
- struct page **prealloc)
+ struct folio **prealloc)
{
struct mm_struct *src_mm = src_vma->vm_mm;
unsigned long vm_flags = src_vma->vm_flags;
pte_t pte = *src_pte;
struct page *page;
+ struct folio *folio;
page = vm_normal_page(src_vma, addr, pte);
- if (page && PageAnon(page)) {
+ if (page)
+ folio = page_folio(page);
+ if (page && folio_test_anon(folio)) {
/*
* If this page may have been pinned by the parent process,
* copy the page immediately for the child so that we'll always
* guarantee the pinned page won't be randomly replaced in the
* future.
*/
- get_page(page);
+ folio_get(folio);
if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
- /* Page maybe pinned, we have to copy. */
- put_page(page);
+ /* Page may be pinned, we have to copy. */
+ folio_put(folio);
return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
addr, rss, prealloc, page);
}
- rss[mm_counter(page)]++;
+ rss[MM_ANONPAGES]++;
} else if (page) {
- get_page(page);
+ folio_get(folio);
page_dup_file_rmap(page, false);
- rss[mm_counter(page)]++;
+ rss[mm_counter_file(page)]++;
}
/*
@@ -931,7 +940,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
ptep_set_wrprotect(src_mm, addr, src_pte);
pte = pte_wrprotect(pte);
}
- VM_BUG_ON(page && PageAnon(page) && PageAnonExclusive(page));
+ VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page));
/*
* If it's a shared mapping, mark it clean in
@@ -948,23 +957,22 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
return 0;
}
-static inline struct page *
-page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
- unsigned long addr)
+static inline struct folio *page_copy_prealloc(struct mm_struct *src_mm,
+ struct vm_area_struct *vma, unsigned long addr)
{
- struct page *new_page;
+ struct folio *new_folio;
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
- if (!new_page)
+ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
+ if (!new_folio)
return NULL;
- if (mem_cgroup_charge(page_folio(new_page), src_mm, GFP_KERNEL)) {
- put_page(new_page);
+ if (mem_cgroup_charge(new_folio, src_mm, GFP_KERNEL)) {
+ folio_put(new_folio);
return NULL;
}
- cgroup_throttle_swaprate(new_page, GFP_KERNEL);
+ cgroup_throttle_swaprate(&new_folio->page, GFP_KERNEL);
- return new_page;
+ return new_folio;
}
static int
@@ -980,7 +988,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
int progress, ret = 0;
int rss[NR_MM_COUNTERS];
swp_entry_t entry = (swp_entry_t){0};
- struct page *prealloc = NULL;
+ struct folio *prealloc = NULL;
again:
progress = 0;
@@ -1050,7 +1058,7 @@ again:
* will allocate page according to address). This
* could only happen if one pinned pte changed.
*/
- put_page(prealloc);
+ folio_put(prealloc);
prealloc = NULL;
}
progress += 8;
@@ -1087,7 +1095,7 @@ again:
goto again;
out:
if (unlikely(prealloc))
- put_page(prealloc);
+ folio_put(prealloc);
return ret;
}
@@ -1260,7 +1268,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
if (is_cow) {
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
- 0, src_vma, src_mm, addr, end);
+ 0, src_mm, addr, end);
mmu_notifier_invalidate_range_start(&range);
/*
* Disabling preemption is not needed for the write side, as
@@ -1396,8 +1404,7 @@ again:
force_flush = 1;
}
}
- if (pte_young(ptent) &&
- likely(!(vma->vm_flags & VM_SEQ_READ)))
+ if (pte_young(ptent) && likely(vma_has_recency(vma)))
mark_page_accessed(page);
}
rss[mm_counter(page)]--;
@@ -1606,7 +1613,7 @@ void unmap_page_range(struct mmu_gather *tlb,
static void unmap_single_vma(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr,
- struct zap_details *details)
+ struct zap_details *details, bool mm_wr_locked)
{
unsigned long start = max(vma->vm_start, start_addr);
unsigned long end;
@@ -1621,7 +1628,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
uprobe_munmap(vma, start, end);
if (unlikely(vma->vm_flags & VM_PFNMAP))
- untrack_pfn(vma, 0, 0);
+ untrack_pfn(vma, 0, 0, mm_wr_locked);
if (start != end) {
if (unlikely(is_vm_hugetlb_page(vma))) {
@@ -1668,7 +1675,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
*/
void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
struct vm_area_struct *vma, unsigned long start_addr,
- unsigned long end_addr)
+ unsigned long end_addr, bool mm_wr_locked)
{
struct mmu_notifier_range range;
struct zap_details details = {
@@ -1678,46 +1685,17 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
};
MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
- mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
start_addr, end_addr);
mmu_notifier_invalidate_range_start(&range);
do {
- unmap_single_vma(tlb, vma, start_addr, end_addr, &details);
+ unmap_single_vma(tlb, vma, start_addr, end_addr, &details,
+ mm_wr_locked);
} while ((vma = mas_find(&mas, end_addr - 1)) != NULL);
mmu_notifier_invalidate_range_end(&range);
}
/**
- * zap_page_range - remove user pages in a given range
- * @vma: vm_area_struct holding the applicable pages
- * @start: starting address of pages to zap
- * @size: number of bytes to zap
- *
- * Caller must protect the VMA list
- */
-void zap_page_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long size)
-{
- struct maple_tree *mt = &vma->vm_mm->mm_mt;
- unsigned long end = start + size;
- struct mmu_notifier_range range;
- struct mmu_gather tlb;
- MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
-
- lru_add_drain();
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- start, start + size);
- tlb_gather_mmu(&tlb, vma->vm_mm);
- update_hiwater_rss(vma->vm_mm);
- mmu_notifier_invalidate_range_start(&range);
- do {
- unmap_single_vma(&tlb, vma, start, range.end, NULL);
- } while ((vma = mas_find(&mas, end - 1)) != NULL);
- mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb);
-}
-
-/**
* zap_page_range_single - remove user pages in a given range
* @vma: vm_area_struct holding the applicable pages
* @address: starting address of pages to zap
@@ -1734,7 +1712,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
struct mmu_gather tlb;
lru_add_drain();
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, end);
if (is_vm_hugetlb_page(vma))
adjust_range_if_pmd_sharing_possible(vma, &range.start,
@@ -1746,7 +1724,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
* unmap 'address-end' not 'range.start-range.end' as range
* could have been expanded for hugetlb pmd sharing.
*/
- unmap_single_vma(&tlb, vma, address, end, details);
+ unmap_single_vma(&tlb, vma, address, end, details, false);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
}
@@ -1951,7 +1929,7 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
if (!(vma->vm_flags & VM_MIXEDMAP)) {
BUG_ON(mmap_read_trylock(vma->vm_mm));
BUG_ON(vma->vm_flags & VM_PFNMAP);
- vma->vm_flags |= VM_MIXEDMAP;
+ vm_flags_set(vma, VM_MIXEDMAP);
}
/* Defer page refcount checking till we're about to map that page. */
return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
@@ -2009,7 +1987,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
if (!(vma->vm_flags & VM_MIXEDMAP)) {
BUG_ON(mmap_read_trylock(vma->vm_mm));
BUG_ON(vma->vm_flags & VM_PFNMAP);
- vma->vm_flags |= VM_MIXEDMAP;
+ vm_flags_set(vma, VM_MIXEDMAP);
}
return insert_page(vma, addr, page, vma->vm_page_prot);
}
@@ -2475,7 +2453,7 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
vma->vm_pgoff = pfn;
}
- vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
+ vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
BUG_ON(addr >= end);
pfn -= addr >> PAGE_SHIFT;
@@ -2515,7 +2493,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
if (err)
- untrack_pfn(vma, pfn, PAGE_ALIGN(size));
+ untrack_pfn(vma, pfn, PAGE_ALIGN(size), true);
return err;
}
EXPORT_SYMBOL(remap_pfn_range);
@@ -3068,8 +3046,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
- struct page *old_page = vmf->page;
- struct page *new_page = NULL;
+ struct folio *old_folio = NULL;
+ struct folio *new_folio = NULL;
pte_t entry;
int page_copied = 0;
struct mmu_notifier_range range;
@@ -3077,21 +3055,22 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
delayacct_wpcopy_start();
+ if (vmf->page)
+ old_folio = page_folio(vmf->page);
if (unlikely(anon_vma_prepare(vma)))
goto oom;
if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
- new_page = alloc_zeroed_user_highpage_movable(vma,
- vmf->address);
- if (!new_page)
+ new_folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
+ if (!new_folio)
goto oom;
} else {
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
- vmf->address);
- if (!new_page)
+ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
+ vmf->address, false);
+ if (!new_folio)
goto oom;
- ret = __wp_page_copy_user(new_page, old_page, vmf);
+ ret = __wp_page_copy_user(&new_folio->page, vmf->page, vmf);
if (ret) {
/*
* COW failed, if the fault was solved by other,
@@ -3100,23 +3079,23 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* from the second attempt.
* The -EHWPOISON case will not be retried.
*/
- put_page(new_page);
- if (old_page)
- put_page(old_page);
+ folio_put(new_folio);
+ if (old_folio)
+ folio_put(old_folio);
delayacct_wpcopy_end();
return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
}
- kmsan_copy_page_meta(new_page, old_page);
+ kmsan_copy_page_meta(&new_folio->page, vmf->page);
}
- if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
+ if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL))
goto oom_free_new;
- cgroup_throttle_swaprate(new_page, GFP_KERNEL);
+ cgroup_throttle_swaprate(&new_folio->page, GFP_KERNEL);
- __SetPageUptodate(new_page);
+ __folio_mark_uptodate(new_folio);
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
vmf->address & PAGE_MASK,
(vmf->address & PAGE_MASK) + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
@@ -3126,16 +3105,16 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
*/
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
- if (old_page) {
- if (!PageAnon(old_page)) {
- dec_mm_counter(mm, mm_counter_file(old_page));
+ if (old_folio) {
+ if (!folio_test_anon(old_folio)) {
+ dec_mm_counter(mm, mm_counter_file(&old_folio->page));
inc_mm_counter(mm, MM_ANONPAGES);
}
} else {
inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
- entry = mk_pte(new_page, vma->vm_page_prot);
+ entry = mk_pte(&new_folio->page, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
if (unlikely(unshare)) {
if (pte_soft_dirty(vmf->orig_pte))
@@ -3154,8 +3133,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* some TLBs while the old PTE remains in others.
*/
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
- page_add_new_anon_rmap(new_page, vma, vmf->address);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ folio_add_new_anon_rmap(new_folio, vma, vmf->address);
+ folio_add_lru_vma(new_folio, vma);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
@@ -3164,7 +3143,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
BUG_ON(unshare && pte_write(entry));
set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
update_mmu_cache(vma, vmf->address, vmf->pte);
- if (old_page) {
+ if (old_folio) {
/*
* Only after switching the pte to the new page may
* we remove the mapcount here. Otherwise another
@@ -3187,18 +3166,18 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* mapcount is visible. So transitively, TLBs to
* old page will be flushed before it can be reused.
*/
- page_remove_rmap(old_page, vma, false);
+ page_remove_rmap(vmf->page, vma, false);
}
/* Free the old page.. */
- new_page = old_page;
+ new_folio = old_folio;
page_copied = 1;
} else {
update_mmu_tlb(vma, vmf->address, vmf->pte);
}
- if (new_page)
- put_page(new_page);
+ if (new_folio)
+ folio_put(new_folio);
pte_unmap_unlock(vmf->pte, vmf->ptl);
/*
@@ -3206,19 +3185,19 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* the above ptep_clear_flush_notify() did already call it.
*/
mmu_notifier_invalidate_range_only_end(&range);
- if (old_page) {
+ if (old_folio) {
if (page_copied)
- free_swap_cache(old_page);
- put_page(old_page);
+ free_swap_cache(&old_folio->page);
+ folio_put(old_folio);
}
delayacct_wpcopy_end();
return 0;
oom_free_new:
- put_page(new_page);
+ folio_put(new_folio);
oom:
- if (old_page)
- put_page(old_page);
+ if (old_folio)
+ folio_put(old_folio);
delayacct_wpcopy_end();
return VM_FAULT_OOM;
@@ -3586,7 +3565,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags))
return VM_FAULT_RETRY;
- mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
vma->vm_mm, vmf->address & PAGE_MASK,
(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
mmu_notifier_invalidate_range_start(&range);
@@ -3629,8 +3608,12 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
/*
* Be careful so that we will only recover a special uffd-wp pte into a
* none pte. Otherwise it means the pte could have changed, so retry.
+ *
+ * This should also cover the case where e.g. the pte changed
+ * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR.
+ * So is_pte_marker() check is not enough to safely drop the pte.
*/
- if (is_pte_marker(*vmf->pte))
+ if (pte_same(vmf->orig_pte, *vmf->pte))
pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
@@ -3644,9 +3627,7 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
{
/*
* Just in case there're leftover special ptes even after the region
- * got unregistered - we can simply clear them. We can also do that
- * proactively when e.g. when we do UFFDIO_UNREGISTER upon some uffd-wp
- * ranges, but it should be more efficient to be done lazily here.
+ * got unregistered - we can simply clear them.
*/
if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma)))
return pte_marker_clear(vmf);
@@ -3840,6 +3821,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (unlikely(!page)) {
ret = VM_FAULT_OOM;
goto out_page;
+ } else if (unlikely(PTR_ERR(page) == -EHWPOISON)) {
+ ret = VM_FAULT_HWPOISON;
+ goto out_page;
}
folio = page_folio(page);
@@ -3885,10 +3869,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* the swap entry concurrently) for certainly exclusive pages.
*/
if (!folio_test_ksm(folio)) {
- /*
- * Note that pte_swp_exclusive() == false for architectures
- * without __HAVE_ARCH_PTE_SWP_EXCLUSIVE.
- */
exclusive = pte_swp_exclusive(vmf->orig_pte);
if (folio != swapcache) {
/*
@@ -3950,10 +3930,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
- if (pte_swp_uffd_wp(vmf->orig_pte)) {
+ if (pte_swp_uffd_wp(vmf->orig_pte))
pte = pte_mkuffd_wp(pte);
- pte = pte_wrprotect(pte);
- }
vmf->orig_pte = pte;
/* ksm created a completely new copy */
@@ -4021,7 +3999,7 @@ out_release:
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page;
+ struct folio *folio;
vm_fault_t ret = 0;
pte_t entry;
@@ -4071,22 +4049,22 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
- if (!page)
+ folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
+ if (!folio)
goto oom;
- if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
+ if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
goto oom_free_page;
- cgroup_throttle_swaprate(page, GFP_KERNEL);
+ cgroup_throttle_swaprate(&folio->page, GFP_KERNEL);
/*
- * The memory barrier inside __SetPageUptodate makes sure that
+ * The memory barrier inside __folio_mark_uptodate makes sure that
* preceding stores to the page contents become visible before
* the set_pte_at() write.
*/
- __SetPageUptodate(page);
+ __folio_mark_uptodate(folio);
- entry = mk_pte(page, vma->vm_page_prot);
+ entry = mk_pte(&folio->page, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
@@ -4105,13 +4083,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
- put_page(page);
+ folio_put(folio);
return handle_userfault(vmf, VM_UFFD_MISSING);
}
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, vmf->address);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ folio_add_new_anon_rmap(folio, vma, vmf->address);
+ folio_add_lru_vma(folio, vma);
setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
@@ -4121,10 +4099,10 @@ unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
release:
- put_page(page);
+ folio_put(folio);
goto unlock;
oom_free_page:
- put_page(page);
+ folio_put(folio);
oom:
return VM_FAULT_OOM;
}
@@ -4296,7 +4274,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (unlikely(uffd_wp))
- entry = pte_mkuffd_wp(pte_wrprotect(entry));
+ entry = pte_mkuffd_wp(entry);
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
@@ -5137,8 +5115,8 @@ static inline void mm_account_fault(struct pt_regs *regs,
#ifdef CONFIG_LRU_GEN
static void lru_gen_enter_fault(struct vm_area_struct *vma)
{
- /* the LRU algorithm doesn't apply to sequential or random reads */
- current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
+ /* the LRU algorithm only applies to accesses with recency */
+ current->in_lru_fault = vma_has_recency(vma);
}
static void lru_gen_exit_fault(void)
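[Editor's sketch] Both memory.c call sites that open-coded the VM_SEQ_READ/VM_RAND_READ test now go through vma_has_recency(). The sketch below shows roughly what such a predicate looks like; the flag values and struct are simplified stand-ins, and the real helper in mm/internal.h may check more than this.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag bits, not the kernel's VM_* values. */
#define VM_SEQ_READ     (1UL << 0)
#define VM_RAND_READ    (1UL << 1)

struct vma { unsigned long vm_flags; };

/*
 * Mappings hinted as sequential or random reads carry no useful recency
 * signal, so page aging and the LRU fault path should ignore them.
 */
static bool vma_has_recency(const struct vma *vma)
{
        return !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
}

int main(void)
{
        struct vma normal = { 0 };
        struct vma streaming = { VM_SEQ_READ };

        printf("normal=%d streaming=%d\n",
               vma_has_recency(&normal), vma_has_recency(&streaming));
        return 0;
}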
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fd40f7e9f176..db3b270254f1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1620,18 +1620,17 @@ found:
return 0;
}
-static int
-do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
+static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
struct page *page, *head;
- int ret = 0;
LIST_HEAD(source);
static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
struct folio *folio;
+ bool isolated;
if (!pfn_valid(pfn))
continue;
@@ -1641,7 +1640,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (PageHuge(page)) {
pfn = page_to_pfn(head) + compound_nr(head) - 1;
- isolate_hugetlb(head, &source);
+ isolate_hugetlb(folio, &source);
continue;
} else if (PageTransHuge(page))
pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;
@@ -1668,10 +1667,10 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
* LRU and non-lru movable pages.
*/
if (PageLRU(page))
- ret = isolate_lru_page(page);
+ isolated = isolate_lru_page(page);
else
- ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
- if (!ret) { /* Success */
+ isolated = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
+ if (isolated) {
list_add_tail(&page->lru, &source);
if (!__PageMovable(page))
inc_node_page_state(page, NR_ISOLATED_ANON +
@@ -1691,6 +1690,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
.nmask = &nmask,
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
};
+ int ret;
/*
* We have checked that migration range is on a single zone so
@@ -1719,8 +1719,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
putback_movable_pages(&source);
}
}
-
- return ret;
}
static int __init cmdline_parse_movable_node(char *p)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 02c8a712282f..a256a241fd1d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -414,7 +414,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
},
};
-static int migrate_page_add(struct page *page, struct list_head *pagelist,
+static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
unsigned long flags);
struct queue_pages {
@@ -427,36 +427,36 @@ struct queue_pages {
};
/*
- * Check if the page's nid is in qp->nmask.
+ * Check if the folio's nid is in qp->nmask.
*
* If MPOL_MF_INVERT is set in qp->flags, check if the nid is
* in the invert of qp->nmask.
*/
-static inline bool queue_pages_required(struct page *page,
+static inline bool queue_folio_required(struct folio *folio,
struct queue_pages *qp)
{
- int nid = page_to_nid(page);
+ int nid = folio_nid(folio);
unsigned long flags = qp->flags;
return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}
/*
- * queue_pages_pmd() has three possible return values:
- * 0 - pages are placed on the right node or queued successfully, or
+ * queue_folios_pmd() has three possible return values:
+ * 0 - folios are placed on the right node or queued successfully, or
* special page is met, i.e. huge zero page.
- * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
+ * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
* specified.
* -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
- * existing page was already on a node that does not follow the
+ * existing folio was already on a node that does not follow the
* policy.
*/
-static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
+static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
unsigned long end, struct mm_walk *walk)
__releases(ptl)
{
int ret = 0;
- struct page *page;
+ struct folio *folio;
struct queue_pages *qp = walk->private;
unsigned long flags;
@@ -464,19 +464,19 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
ret = -EIO;
goto unlock;
}
- page = pmd_page(*pmd);
- if (is_huge_zero_page(page)) {
+ folio = pfn_folio(pmd_pfn(*pmd));
+ if (is_huge_zero_page(&folio->page)) {
walk->action = ACTION_CONTINUE;
goto unlock;
}
- if (!queue_pages_required(page, qp))
+ if (!queue_folio_required(folio, qp))
goto unlock;
flags = qp->flags;
- /* go to thp migration */
+ /* go to folio migration */
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
if (!vma_migratable(walk->vma) ||
- migrate_page_add(page, qp->pagelist, flags)) {
+ migrate_folio_add(folio, qp->pagelist, flags)) {
ret = 1;
goto unlock;
}
@@ -491,19 +491,19 @@ unlock:
* Scan through pages checking if pages follow certain conditions,
* and move them to the pagelist if they do.
*
- * queue_pages_pte_range() has three possible return values:
- * 0 - pages are placed on the right node or queued successfully, or
+ * queue_folios_pte_range() has three possible return values:
+ * 0 - folios are placed on the right node or queued successfully, or
* special page is met, i.e. zero page.
- * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
+ * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
* specified.
- * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
+ * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already
* on a node that does not follow the policy.
*/
-static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
+static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
- struct page *page;
+ struct folio *folio;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
bool has_unmovable = false;
@@ -512,7 +512,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl)
- return queue_pages_pmd(pmd, ptl, addr, end, walk);
+ return queue_folios_pmd(pmd, ptl, addr, end, walk);
if (pmd_trans_unstable(pmd))
return 0;
@@ -521,16 +521,16 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
for (; addr != end; pte++, addr += PAGE_SIZE) {
if (!pte_present(*pte))
continue;
- page = vm_normal_page(vma, addr, *pte);
- if (!page || is_zone_device_page(page))
+ folio = vm_normal_folio(vma, addr, *pte);
+ if (!folio || folio_is_zone_device(folio))
continue;
/*
- * vm_normal_page() filters out zero pages, but there might
- * still be PageReserved pages to skip, perhaps in a VDSO.
+ * vm_normal_folio() filters out zero pages, but there might
+ * still be reserved folios to skip, perhaps in a VDSO.
*/
- if (PageReserved(page))
+ if (folio_test_reserved(folio))
continue;
- if (!queue_pages_required(page, qp))
+ if (!queue_folio_required(folio, qp))
continue;
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
/* MPOL_MF_STRICT must be specified if we get here */
@@ -544,7 +544,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
* temporary off LRU pages in the range. Still
* need migrate other LRU pages.
*/
- if (migrate_page_add(page, qp->pagelist, flags))
+ if (migrate_folio_add(folio, qp->pagelist, flags))
has_unmovable = true;
} else
break;
@@ -558,7 +558,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
return addr != end ? -EIO : 0;
}
-static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
+static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
@@ -566,7 +566,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
#ifdef CONFIG_HUGETLB_PAGE
struct queue_pages *qp = walk->private;
unsigned long flags = (qp->flags & MPOL_MF_VALID);
- struct page *page;
+ struct folio *folio;
spinlock_t *ptl;
pte_t entry;
@@ -574,13 +574,13 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
entry = huge_ptep_get(pte);
if (!pte_present(entry))
goto unlock;
- page = pte_page(entry);
- if (!queue_pages_required(page, qp))
+ folio = pfn_folio(pte_pfn(entry));
+ if (!queue_folio_required(folio, qp))
goto unlock;
if (flags == MPOL_MF_STRICT) {
/*
- * STRICT alone means only detecting misplaced page and no
+ * STRICT alone means only detecting misplaced folio and no
* need to further check other vma.
*/
ret = -EIO;
@@ -591,20 +591,28 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
/*
* Must be STRICT with MOVE*, otherwise .test_walk() have
* stopped walking current vma.
- * Detecting misplaced page but allow migrating pages which
+ * Detecting misplaced folio but allow migrating folios which
* have been queued.
*/
ret = 1;
goto unlock;
}
- /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
+ /*
+ * With MPOL_MF_MOVE, we try to migrate only unshared folios. If it
+ * is shared it is likely not worth migrating.
+ *
+ * To check if the folio is shared, ideally we want to make sure
+ * every page is mapped to the same process. Doing that is very
+ * expensive, so check the estimated mapcount of the folio instead.
+ */
if (flags & (MPOL_MF_MOVE_ALL) ||
- (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
- if (isolate_hugetlb(page, qp->pagelist) &&
+ (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 &&
+ !hugetlb_pmd_shared(pte))) {
+ if (!isolate_hugetlb(folio, qp->pagelist) &&
(flags & MPOL_MF_STRICT))
/*
- * Failed to isolate page but allow migrating pages
+ * Failed to isolate folio but allow migrating pages
* which have been queued.
*/
ret = 1;
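[Editor's sketch] The comment above explains the trade-off: an exact sharing check would require inspecting every constituent page, so an estimate based on a single representative mapcount is used instead. The toy sketch below illustrates that shortcut with made-up types; it is not the kernel's folio_estimated_sharers().

#include <stdio.h>

#define PAGES_PER_FOLIO 8

struct folio_sample {
        /* mapcount of each constituent page; [0] is the head page */
        int mapcount[PAGES_PER_FOLIO];
};

/*
 * Estimate the number of sharers from one representative counter instead
 * of scanning every page, which would be far more expensive.
 */
static int estimated_sharers(const struct folio_sample *f)
{
        return f->mapcount[0];
}

int main(void)
{
        struct folio_sample private_folio = { .mapcount = { 1, 1, 1, 1, 1, 1, 1, 1 } };
        struct folio_sample shared_folio  = { .mapcount = { 3, 3, 3, 3, 3, 3, 3, 3 } };

        printf("private: ~%d sharer(s), shared: ~%d sharer(s)\n",
               estimated_sharers(&private_folio), estimated_sharers(&shared_folio));
        return 0;
}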
@@ -631,13 +639,12 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
struct mmu_gather tlb;
- int nr_updated;
+ long nr_updated;
tlb_gather_mmu(&tlb, vma->vm_mm);
- nr_updated = change_protection(&tlb, vma, addr, end, PAGE_NONE,
- MM_CP_PROT_NUMA);
- if (nr_updated)
+ nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
+ if (nr_updated > 0)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
tlb_finish_mmu(&tlb);
@@ -703,8 +710,8 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
}
static const struct mm_walk_ops queue_pages_walk_ops = {
- .hugetlb_entry = queue_pages_hugetlb,
- .pmd_entry = queue_pages_pte_range,
+ .hugetlb_entry = queue_folios_hugetlb,
+ .pmd_entry = queue_folios_pte_range,
.test_walk = queue_pages_test_walk,
};
@@ -787,24 +794,21 @@ static int vma_replace_policy(struct vm_area_struct *vma,
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
{
- MA_STATE(mas, &mm->mm_mt, start, start);
+ VMA_ITERATOR(vmi, mm, start);
struct vm_area_struct *prev;
struct vm_area_struct *vma;
int err = 0;
pgoff_t pgoff;
- prev = mas_prev(&mas, 0);
- if (unlikely(!prev))
- mas_set(&mas, start);
-
- vma = mas_find(&mas, end - 1);
+ prev = vma_prev(&vmi);
+ vma = vma_find(&vmi, end);
if (WARN_ON(!vma))
return 0;
if (start > vma->vm_start)
prev = vma;
- for (; vma; vma = mas_next(&mas, end - 1)) {
+ do {
unsigned long vmstart = max(start, vma->vm_start);
unsigned long vmend = min(end, vma->vm_end);
@@ -813,29 +817,23 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
pgoff = vma->vm_pgoff +
((vmstart - vma->vm_start) >> PAGE_SHIFT);
- prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
+ prev = vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
new_pol, vma->vm_userfaultfd_ctx,
anon_vma_name(vma));
if (prev) {
- /* vma_merge() invalidated the mas */
- mas_pause(&mas);
vma = prev;
goto replace;
}
if (vma->vm_start != vmstart) {
- err = split_vma(vma->vm_mm, vma, vmstart, 1);
+ err = split_vma(&vmi, vma, vmstart, 1);
if (err)
goto out;
- /* split_vma() invalidated the mas */
- mas_pause(&mas);
}
if (vma->vm_end != vmend) {
- err = split_vma(vma->vm_mm, vma, vmend, 0);
+ err = split_vma(&vmi, vma, vmend, 0);
if (err)
goto out;
- /* split_vma() invalidated the mas */
- mas_pause(&mas);
}
replace:
err = vma_replace_policy(vma, new_pol);
@@ -843,7 +841,7 @@ replace:
goto out;
next:
prev = vma;
- }
+ } for_each_vma_range(vmi, vma, end);
out:
return err;
@@ -1023,27 +1021,28 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
}
#ifdef CONFIG_MIGRATION
-/*
- * page migration, thp tail pages can be passed.
- */
-static int migrate_page_add(struct page *page, struct list_head *pagelist,
+static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
unsigned long flags)
{
- struct page *head = compound_head(page);
/*
- * Avoid migrating a page that is shared with others.
+ * We try to migrate only unshared folios. If it is shared it
+ * is likely not worth migrating.
+ *
+ * To check if the folio is shared, ideally we want to make sure
+ * every page is mapped to the same process. Doing that is very
+ * expensive, so check the estimated mapcount of the folio instead.
*/
- if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
- if (!isolate_lru_page(head)) {
- list_add_tail(&head->lru, pagelist);
- mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON + page_is_file_lru(head),
- thp_nr_pages(head));
+ if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
+ if (folio_isolate_lru(folio)) {
+ list_add_tail(&folio->lru, foliolist);
+ node_stat_mod_folio(folio,
+ NR_ISOLATED_ANON + folio_is_file_lru(folio),
+ folio_nr_pages(folio));
} else if (flags & MPOL_MF_STRICT) {
/*
- * Non-movable page may reach here. And, there may be
- * temporary off LRU pages or non-LRU movable pages.
- * Treat them as unmovable pages since they can't be
+ * Non-movable folio may reach here. And, there may be
+ * temporary off LRU folios or non-LRU movable folios.
+ * Treat them as unmovable folios since they can't be
* isolated, so they can't be moved at the moment. It
* should return -EIO for this case too.
*/
@@ -1219,9 +1218,11 @@ static struct page *new_page(struct page *page, unsigned long start)
break;
}
- if (folio_test_hugetlb(src))
- return alloc_huge_page_vma(page_hstate(&src->page),
+ if (folio_test_hugetlb(src)) {
+ dst = alloc_hugetlb_folio_vma(folio_hstate(src),
vma, address);
+ return &dst->page;
+ }
if (folio_test_large(src))
gfp = GFP_TRANSHUGE;
@@ -1235,7 +1236,7 @@ static struct page *new_page(struct page *page, unsigned long start)
}
#else
-static int migrate_page_add(struct page *page, struct list_head *pagelist,
+static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
unsigned long flags)
{
return -EIO;
@@ -1489,7 +1490,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- struct mempolicy *new;
+ struct mempolicy *new, *old;
unsigned long vmstart;
unsigned long vmend;
unsigned long end;
@@ -1521,31 +1522,27 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
return 0;
mmap_write_lock(mm);
for_each_vma_range(vmi, vma, end) {
- vmstart = max(start, vma->vm_start);
- vmend = min(end, vma->vm_end);
- new = mpol_dup(vma_policy(vma));
- if (IS_ERR(new)) {
- err = PTR_ERR(new);
- break;
- }
- /*
- * Only update home node if there is an existing vma policy
- */
- if (!new)
- continue;
-
/*
* If any vma in the range got policy other than MPOL_BIND
* or MPOL_PREFERRED_MANY we return error. We don't reset
* the home node for vmas we already updated before.
*/
- if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) {
- mpol_put(new);
+ old = vma_policy(vma);
+ if (!old)
+ continue;
+ if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
err = -EOPNOTSUPP;
break;
}
+ new = mpol_dup(old);
+ if (IS_ERR(new)) {
+ err = PTR_ERR(new);
+ break;
+ }
new->home_node = home_node;
+ vmstart = max(start, vma->vm_start);
+ vmend = min(end, vma->vm_end);
err = mbind_range(mm, vmstart, vmend, new);
mpol_put(new);
if (err)
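[Editor's sketch] The set_mempolicy_home_node() loop is reordered so the existing policy is validated before it is duplicated, which avoids allocating a copy only to free it on the error path. A generic "check before you allocate" sketch with illustrative types:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum mode { MODE_BIND, MODE_PREFERRED_MANY, MODE_OTHER };

struct policy { enum mode mode; int home_node; };

static struct policy *dup_policy(const struct policy *old)
{
        struct policy *copy = malloc(sizeof(*copy));

        if (copy)
                memcpy(copy, old, sizeof(*copy));
        return copy;
}

/* Return 0 on success, -1 if the existing policy cannot take a home node. */
static int set_home_node(struct policy *old, int home_node)
{
        struct policy *copy;

        /* Validate the existing policy before paying for a duplicate. */
        if (old->mode != MODE_BIND && old->mode != MODE_PREFERRED_MANY)
                return -1;

        copy = dup_policy(old);
        if (!copy)
                return -1;
        copy->home_node = home_node;

        /* The kernel would apply 'copy' to the vma range here; the sketch
         * simply installs it over the old policy. */
        *old = *copy;
        free(copy);
        return 0;
}

int main(void)
{
        struct policy p = { .mode = MODE_BIND, .home_node = -1 };
        int ret = set_home_node(&p, 2);

        printf("ret=%d home_node=%d\n", ret, p.home_node);
        return 0;
}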
diff --git a/mm/memremap.c b/mm/memremap.c
index 08cbf54fe037..bee85560a243 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -129,7 +129,7 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
}
mem_hotplug_done();
- untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
+ untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true);
pgmap_array_delete(range);
}
@@ -276,7 +276,7 @@ err_add_memory:
if (!is_private)
kasan_remove_zero_shadow(__va(range->start), range_len(range));
err_kasan:
- untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
+ untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true);
err_pfn_remap:
pgmap_array_delete(range);
return error;
@@ -385,7 +385,7 @@ EXPORT_SYMBOL_GPL(memremap_pages);
* @pgmap: pointer to a struct dev_pagemap
*
* Notes:
- * 1/ At a minimum the res and type members of @pgmap must be initialized
+ * 1/ At a minimum the range and type members of @pgmap must be initialized
* by the caller before passing it to this function
*
* 2/ The altmap field may optionally be initialized, in which case
diff --git a/mm/migrate.c b/mm/migrate.c
index a4d3fc65085f..db3f154446af 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -58,8 +58,9 @@
#include "internal.h"
-int isolate_movable_page(struct page *page, isolate_mode_t mode)
+bool isolate_movable_page(struct page *page, isolate_mode_t mode)
{
+ struct folio *folio = folio_get_nontail_page(page);
const struct movable_operations *mops;
/*
@@ -71,11 +72,11 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode)
* the put_page() at the end of this block will take care of
* release this page, thus avoiding a nasty leakage.
*/
- if (unlikely(!get_page_unless_zero(page)))
+ if (!folio)
goto out;
- if (unlikely(PageSlab(page)))
- goto out_putpage;
+ if (unlikely(folio_test_slab(folio)))
+ goto out_putfolio;
/* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */
smp_rmb();
/*
@@ -83,12 +84,12 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode)
* we use non-atomic bitops on newly allocated page flags so
* unconditionally grabbing the lock ruins page's owner side.
*/
- if (unlikely(!__PageMovable(page)))
- goto out_putpage;
+ if (unlikely(!__folio_test_movable(folio)))
+ goto out_putfolio;
/* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */
smp_rmb();
- if (unlikely(PageSlab(page)))
- goto out_putpage;
+ if (unlikely(folio_test_slab(folio)))
+ goto out_putfolio;
/*
* As movable pages are not isolated from LRU lists, concurrent
@@ -101,39 +102,39 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode)
* lets be sure we have the page lock
* before proceeding with the movable page isolation steps.
*/
- if (unlikely(!trylock_page(page)))
- goto out_putpage;
+ if (unlikely(!folio_trylock(folio)))
+ goto out_putfolio;
- if (!PageMovable(page) || PageIsolated(page))
+ if (!folio_test_movable(folio) || folio_test_isolated(folio))
goto out_no_isolated;
- mops = page_movable_ops(page);
- VM_BUG_ON_PAGE(!mops, page);
+ mops = folio_movable_ops(folio);
+ VM_BUG_ON_FOLIO(!mops, folio);
- if (!mops->isolate_page(page, mode))
+ if (!mops->isolate_page(&folio->page, mode))
goto out_no_isolated;
/* Driver shouldn't use PG_isolated bit of page->flags */
- WARN_ON_ONCE(PageIsolated(page));
- SetPageIsolated(page);
- unlock_page(page);
+ WARN_ON_ONCE(folio_test_isolated(folio));
+ folio_set_isolated(folio);
+ folio_unlock(folio);
- return 0;
+ return true;
out_no_isolated:
- unlock_page(page);
-out_putpage:
- put_page(page);
+ folio_unlock(folio);
+out_putfolio:
+ folio_put(folio);
out:
- return -EBUSY;
+ return false;
}
-static void putback_movable_page(struct page *page)
+static void putback_movable_folio(struct folio *folio)
{
- const struct movable_operations *mops = page_movable_ops(page);
+ const struct movable_operations *mops = folio_movable_ops(folio);
- mops->putback_page(page);
- ClearPageIsolated(page);
+ mops->putback_page(&folio->page);
+ folio_clear_isolated(folio);
}
/*
@@ -146,33 +147,33 @@ static void putback_movable_page(struct page *page)
*/
void putback_movable_pages(struct list_head *l)
{
- struct page *page;
- struct page *page2;
+ struct folio *folio;
+ struct folio *folio2;
- list_for_each_entry_safe(page, page2, l, lru) {
- if (unlikely(PageHuge(page))) {
- putback_active_hugepage(page);
+ list_for_each_entry_safe(folio, folio2, l, lru) {
+ if (unlikely(folio_test_hugetlb(folio))) {
+ folio_putback_active_hugetlb(folio);
continue;
}
- list_del(&page->lru);
+ list_del(&folio->lru);
/*
- * We isolated non-lru movable page so here we can use
- * __PageMovable because LRU page's mapping cannot have
+ * We isolated non-lru movable folio so here we can use
+ * __PageMovable because LRU folio's mapping cannot have
* PAGE_MAPPING_MOVABLE.
*/
- if (unlikely(__PageMovable(page))) {
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
- lock_page(page);
- if (PageMovable(page))
- putback_movable_page(page);
+ if (unlikely(__folio_test_movable(folio))) {
+ VM_BUG_ON_FOLIO(!folio_test_isolated(folio), folio);
+ folio_lock(folio);
+ if (folio_test_movable(folio))
+ putback_movable_folio(folio);
else
- ClearPageIsolated(page);
- unlock_page(page);
- put_page(page);
+ folio_clear_isolated(folio);
+ folio_unlock(folio);
+ folio_put(folio);
} else {
- mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_lru(page), -thp_nr_pages(page));
- putback_lru_page(page);
+ node_stat_mod_folio(folio, NR_ISOLATED_ANON +
+ folio_is_file_lru(folio), -folio_nr_pages(folio));
+ folio_putback_lru(folio);
}
}
}
@@ -224,6 +225,8 @@ static bool remove_migration_pte(struct folio *folio,
pte = maybe_mkwrite(pte, vma);
else if (pte_swp_uffd_wp(*pvmw.pte))
pte = pte_mkuffd_wp(pte);
+ else
+ pte = pte_wrprotect(pte);
if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
rmap_flags |= RMAP_EXCLUSIVE;
@@ -265,7 +268,7 @@ static bool remove_migration_pte(struct folio *folio,
set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
}
if (vma->vm_flags & VM_LOCKED)
- mlock_page_drain_local();
+ mlock_drain_local();
trace_remove_migration_pte(pvmw.address, pte_val(pte),
compound_order(new));
@@ -329,24 +332,41 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
}
#ifdef CONFIG_HUGETLB_PAGE
-void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl)
+/*
+ * The vma read lock must be held upon entry. Holding that lock prevents either
+ * the pte or the ptl from being freed.
+ *
+ * This function will release the vma lock before returning.
+ */
+void __migration_entry_wait_huge(struct vm_area_struct *vma,
+ pte_t *ptep, spinlock_t *ptl)
{
pte_t pte;
+ hugetlb_vma_assert_locked(vma);
spin_lock(ptl);
pte = huge_ptep_get(ptep);
- if (unlikely(!is_hugetlb_entry_migration(pte)))
+ if (unlikely(!is_hugetlb_entry_migration(pte))) {
spin_unlock(ptl);
- else
+ hugetlb_vma_unlock_read(vma);
+ } else {
+ /*
+ * If migration entry existed, safe to release vma lock
+ * here because the pgtable page won't be freed without the
+ * pgtable lock released. See comment right above pgtable
+ * lock release in migration_entry_wait_on_locked().
+ */
+ hugetlb_vma_unlock_read(vma);
migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
+ }
}
void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
{
spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte);
- __migration_entry_wait_huge(pte, ptl);
+ __migration_entry_wait_huge(vma, pte, ptl);
}
#endif
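[Editor's sketch] The comments in the hunk above describe a lock handoff: the hugetlb vma read lock may only be dropped once the page-table lock is held, so the structure the ptl lives in cannot be freed in between. A tiny pthread sketch of that hand-over-hand ordering; the lock names are placeholders (build with -pthread).

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t outer_lock = PTHREAD_MUTEX_INITIALIZER; /* "vma lock" */
static pthread_mutex_t inner_lock = PTHREAD_MUTEX_INITIALIZER; /* "ptl" */

/* Hand-over-hand: acquire the inner lock before releasing the outer one,
 * so whatever the inner lock protects cannot be torn down in between. */
static void locked_wait(void)
{
        pthread_mutex_lock(&outer_lock);
        pthread_mutex_lock(&inner_lock);
        pthread_mutex_unlock(&outer_lock);

        printf("waiting with only the inner lock held\n");

        pthread_mutex_unlock(&inner_lock);
}

int main(void)
{
        locked_wait();
        return 0;
}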
@@ -973,7 +993,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
goto out;
}
- mops = page_movable_ops(&src->page);
+ mops = folio_movable_ops(src);
rc = mops->migrate_page(&dst->page, &src->page, mode);
WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
!folio_test_isolated(src));
@@ -1009,16 +1029,121 @@ out:
return rc;
}
-static int __unmap_and_move(struct folio *src, struct folio *dst,
- int force, enum migrate_mode mode)
+/*
+ * To record some information during migration, we use some unused
+ * fields (mapping and private) of struct folio of the newly allocated
+ * destination folio. This is safe because nobody is using them
+ * except us.
+ */
+union migration_ptr {
+ struct anon_vma *anon_vma;
+ struct address_space *mapping;
+};
+static void __migrate_folio_record(struct folio *dst,
+ unsigned long page_was_mapped,
+ struct anon_vma *anon_vma)
+{
+ union migration_ptr ptr = { .anon_vma = anon_vma };
+ dst->mapping = ptr.mapping;
+ dst->private = (void *)page_was_mapped;
+}
+
+static void __migrate_folio_extract(struct folio *dst,
+ int *page_was_mappedp,
+ struct anon_vma **anon_vmap)
+{
+ union migration_ptr ptr = { .mapping = dst->mapping };
+ *anon_vmap = ptr.anon_vma;
+ *page_was_mappedp = (unsigned long)dst->private;
+ dst->mapping = NULL;
+ dst->private = NULL;
+}
+
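[Editor's sketch] __migrate_folio_record() and __migrate_folio_extract() above stash the source folio's anon_vma and mapped state in otherwise-unused fields of the destination folio, funnelling the pointer through a union so it can live in a field of a different type. A standalone round-trip sketch of that trick; the struct names are simplified stand-ins.

#include <stdio.h>

struct anon_vma { int id; };
struct address_space { int id; };

/* The destination-"folio" fields borrowed while migration is in flight. */
struct dst_folio {
        struct address_space *mapping;
        void *private;
};

/* Move an anon_vma pointer through a field typed as a mapping pointer. */
union migration_ptr {
        struct anon_vma *anon_vma;
        struct address_space *mapping;
};

static void record(struct dst_folio *dst, unsigned long page_was_mapped,
                   struct anon_vma *anon_vma)
{
        union migration_ptr ptr = { .anon_vma = anon_vma };

        dst->mapping = ptr.mapping;
        dst->private = (void *)page_was_mapped;
}

static void extract(struct dst_folio *dst, int *page_was_mapped,
                    struct anon_vma **anon_vma)
{
        union migration_ptr ptr = { .mapping = dst->mapping };

        *anon_vma = ptr.anon_vma;
        *page_was_mapped = (int)(unsigned long)dst->private;
        dst->mapping = NULL;
        dst->private = NULL;
}

int main(void)
{
        struct anon_vma av = { .id = 42 };
        struct dst_folio dst = { 0 };
        struct anon_vma *out_av;
        int mapped;

        record(&dst, 1, &av);
        extract(&dst, &mapped, &out_av);
        printf("anon_vma id=%d page_was_mapped=%d\n", out_av->id, mapped);
        return 0;
}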
+/* Restore the source folio to the original state upon failure */
+static void migrate_folio_undo_src(struct folio *src,
+ int page_was_mapped,
+ struct anon_vma *anon_vma,
+ bool locked,
+ struct list_head *ret)
+{
+ if (page_was_mapped)
+ remove_migration_ptes(src, src, false);
+ /* Drop an anon_vma reference if we took one */
+ if (anon_vma)
+ put_anon_vma(anon_vma);
+ if (locked)
+ folio_unlock(src);
+ if (ret)
+ list_move_tail(&src->lru, ret);
+}
+
+/* Restore the destination folio to the original state upon failure */
+static void migrate_folio_undo_dst(struct folio *dst,
+ bool locked,
+ free_page_t put_new_page,
+ unsigned long private)
+{
+ if (locked)
+ folio_unlock(dst);
+ if (put_new_page)
+ put_new_page(&dst->page, private);
+ else
+ folio_put(dst);
+}
+
+/* Cleanup src folio upon migration success */
+static void migrate_folio_done(struct folio *src,
+ enum migrate_reason reason)
+{
+ /*
+ * Compaction can also migrate non-LRU pages, which are
+ * not accounted to NR_ISOLATED_*. They can be recognized
+ * as __PageMovable.
+ */
+ if (likely(!__folio_test_movable(src)))
+ mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
+ folio_is_file_lru(src), -folio_nr_pages(src));
+
+ if (reason != MR_MEMORY_FAILURE)
+ /* We release the page in page_handle_poison. */
+ folio_put(src);
+}
+
+/* Obtain the lock on the folio, remove all ptes. */
+static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page,
+ unsigned long private, struct folio *src,
+ struct folio **dstp, enum migrate_mode mode,
+ enum migrate_reason reason, struct list_head *ret)
{
+ struct folio *dst;
int rc = -EAGAIN;
- bool page_was_mapped = false;
+ struct page *newpage = NULL;
+ int page_was_mapped = 0;
struct anon_vma *anon_vma = NULL;
bool is_lru = !__PageMovable(&src->page);
+ bool locked = false;
+ bool dst_locked = false;
+
+ if (folio_ref_count(src) == 1) {
+ /* Folio was freed from under us. So we are done. */
+ folio_clear_active(src);
+ folio_clear_unevictable(src);
+ /* free_pages_prepare() will clear PG_isolated. */
+ list_del(&src->lru);
+ migrate_folio_done(src, reason);
+ return MIGRATEPAGE_SUCCESS;
+ }
+
+ newpage = get_new_page(&src->page, private);
+ if (!newpage)
+ return -ENOMEM;
+ dst = page_folio(newpage);
+ *dstp = dst;
+
+ dst->private = NULL;
if (!folio_trylock(src)) {
- if (!force || mode == MIGRATE_ASYNC)
+ if (mode == MIGRATE_ASYNC)
goto out;
/*
@@ -1039,6 +1164,7 @@ static int __unmap_and_move(struct folio *src, struct folio *dst,
folio_lock(src);
}
+ locked = true;
if (folio_test_writeback(src)) {
/*
@@ -1053,10 +1179,8 @@ static int __unmap_and_move(struct folio *src, struct folio *dst,
break;
default:
rc = -EBUSY;
- goto out_unlock;
+ goto out;
}
- if (!force)
- goto out_unlock;
folio_wait_writeback(src);
}
@@ -1086,11 +1210,12 @@ static int __unmap_and_move(struct folio *src, struct folio *dst,
* This is much like races on refcount of oldpage: just don't BUG().
*/
if (unlikely(!folio_trylock(dst)))
- goto out_unlock;
+ goto out;
+ dst_locked = true;
if (unlikely(!is_lru)) {
- rc = move_to_new_folio(dst, src, mode);
- goto out_unlock_both;
+ __migrate_folio_record(dst, page_was_mapped, anon_vma);
+ return MIGRATEPAGE_UNMAP;
}
/*
@@ -1108,18 +1233,57 @@ static int __unmap_and_move(struct folio *src, struct folio *dst,
if (!src->mapping) {
if (folio_test_private(src)) {
try_to_free_buffers(src);
- goto out_unlock_both;
+ goto out;
}
} else if (folio_mapped(src)) {
/* Establish migration ptes */
VM_BUG_ON_FOLIO(folio_test_anon(src) &&
!folio_test_ksm(src) && !anon_vma, src);
- try_to_migrate(src, 0);
- page_was_mapped = true;
+ try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0);
+ page_was_mapped = 1;
}
- if (!folio_mapped(src))
- rc = move_to_new_folio(dst, src, mode);
+ if (!folio_mapped(src)) {
+ __migrate_folio_record(dst, page_was_mapped, anon_vma);
+ return MIGRATEPAGE_UNMAP;
+ }
+
+out:
+ /*
+ * A folio that has not been unmapped will be restored to the
+ * right list unless we want to retry.
+ */
+ if (rc == -EAGAIN)
+ ret = NULL;
+
+ migrate_folio_undo_src(src, page_was_mapped, anon_vma, locked, ret);
+ migrate_folio_undo_dst(dst, dst_locked, put_new_page, private);
+
+ return rc;
+}
+
+/* Migrate the folio to the newly allocated folio in dst. */
+static int migrate_folio_move(free_page_t put_new_page, unsigned long private,
+ struct folio *src, struct folio *dst,
+ enum migrate_mode mode, enum migrate_reason reason,
+ struct list_head *ret)
+{
+ int rc;
+ int page_was_mapped = 0;
+ struct anon_vma *anon_vma = NULL;
+ bool is_lru = !__PageMovable(&src->page);
+ struct list_head *prev;
+
+ __migrate_folio_extract(dst, &page_was_mapped, &anon_vma);
+ prev = dst->lru.prev;
+ list_del(&dst->lru);
+
+ rc = move_to_new_folio(dst, src, mode);
+ if (rc)
+ goto out;
+
+ if (unlikely(!is_lru))
+ goto out_unlock_both;
/*
* When successful, push dst to LRU immediately: so that if it
@@ -1130,111 +1294,49 @@ static int __unmap_and_move(struct folio *src, struct folio *dst,
* unsuccessful, and other cases when a page has been temporarily
* isolated from the unevictable LRU: but this case is the easiest.
*/
- if (rc == MIGRATEPAGE_SUCCESS) {
- folio_add_lru(dst);
- if (page_was_mapped)
- lru_add_drain();
- }
+ folio_add_lru(dst);
+ if (page_was_mapped)
+ lru_add_drain();
if (page_was_mapped)
- remove_migration_ptes(src,
- rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
+ remove_migration_ptes(src, dst, false);
out_unlock_both:
folio_unlock(dst);
-out_unlock:
- /* Drop an anon_vma reference if we took one */
- if (anon_vma)
- put_anon_vma(anon_vma);
- folio_unlock(src);
-out:
+ set_page_owner_migrate_reason(&dst->page, reason);
/*
* If migration is successful, decrease refcount of dst,
* which will not free the page because new page owner increased
* refcounter.
*/
- if (rc == MIGRATEPAGE_SUCCESS)
- folio_put(dst);
-
- return rc;
-}
-
-/*
- * Obtain the lock on folio, remove all ptes and migrate the folio
- * to the newly allocated folio in dst.
- */
-static int unmap_and_move(new_page_t get_new_page,
- free_page_t put_new_page,
- unsigned long private, struct folio *src,
- int force, enum migrate_mode mode,
- enum migrate_reason reason,
- struct list_head *ret)
-{
- struct folio *dst;
- int rc = MIGRATEPAGE_SUCCESS;
- struct page *newpage = NULL;
-
- if (!thp_migration_supported() && folio_test_transhuge(src))
- return -ENOSYS;
-
- if (folio_ref_count(src) == 1) {
- /* Folio was freed from under us. So we are done. */
- folio_clear_active(src);
- folio_clear_unevictable(src);
- /* free_pages_prepare() will clear PG_isolated. */
- goto out;
- }
+ folio_put(dst);
- newpage = get_new_page(&src->page, private);
- if (!newpage)
- return -ENOMEM;
- dst = page_folio(newpage);
-
- dst->private = NULL;
- rc = __unmap_and_move(src, dst, force, mode);
- if (rc == MIGRATEPAGE_SUCCESS)
- set_page_owner_migrate_reason(&dst->page, reason);
+ /*
+ * A folio that has been migrated has all references removed
+ * and will be freed.
+ */
+ list_del(&src->lru);
+ /* Drop an anon_vma reference if we took one */
+ if (anon_vma)
+ put_anon_vma(anon_vma);
+ folio_unlock(src);
+ migrate_folio_done(src, reason);
+ return rc;
out:
- if (rc != -EAGAIN) {
- /*
- * A folio that has been migrated has all references
- * removed and will be freed. A folio that has not been
- * migrated will have kept its references and be restored.
- */
- list_del(&src->lru);
- }
-
/*
- * If migration is successful, releases reference grabbed during
- * isolation. Otherwise, restore the folio to right list unless
- * we want to retry.
+ * A folio that has not been migrated will be restored to the
+ * right list unless we want to retry.
*/
- if (rc == MIGRATEPAGE_SUCCESS) {
- /*
- * Compaction can migrate also non-LRU folios which are
- * not accounted to NR_ISOLATED_*. They can be recognized
- * as __folio_test_movable
- */
- if (likely(!__folio_test_movable(src)))
- mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
- folio_is_file_lru(src), -folio_nr_pages(src));
-
- if (reason != MR_MEMORY_FAILURE)
- /*
- * We release the folio in page_handle_poison.
- */
- folio_put(src);
- } else {
- if (rc != -EAGAIN)
- list_add_tail(&src->lru, ret);
-
- if (put_new_page)
- put_new_page(&dst->page, private);
- else
- folio_put(dst);
+ if (rc == -EAGAIN) {
+ list_add(&dst->lru, prev);
+ __migrate_folio_record(dst, page_was_mapped, anon_vma);
+ return rc;
}
+ migrate_folio_undo_src(src, page_was_mapped, anon_vma, true, ret);
+ migrate_folio_undo_dst(dst, true, put_new_page, private);
+
return rc;
}
@@ -1269,19 +1371,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
- /*
- * Migratability of hugepages depends on architectures and their size.
- * This check is necessary because some callers of hugepage migration
- * like soft offline and memory hotremove don't walk through page
- * tables or check whether the hugepage is pmd-based or not before
- * kicking migration.
- */
- if (!hugepage_migration_supported(page_hstate(hpage)))
- return -ENOSYS;
-
if (folio_ref_count(src) == 1) {
/* page was freed from under us. So we are done. */
- putback_active_hugepage(hpage);
+ folio_putback_active_hugetlb(src);
return MIGRATEPAGE_SUCCESS;
}
@@ -1366,7 +1458,7 @@ out_unlock:
folio_unlock(src);
out:
if (rc == MIGRATEPAGE_SUCCESS)
- putback_active_hugepage(hpage);
+ folio_putback_active_hugetlb(src);
else if (rc != -EAGAIN)
list_move_tail(&src->lru, ret);
@@ -1378,7 +1470,7 @@ out:
if (put_new_page)
put_new_page(new_hpage, private);
else
- putback_active_hugepage(new_hpage);
+ folio_putback_active_hugetlb(dst);
return rc;
}
@@ -1396,61 +1488,156 @@ static inline int try_split_folio(struct folio *folio, struct list_head *split_f
return rc;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define NR_MAX_BATCHED_MIGRATION HPAGE_PMD_NR
+#else
+#define NR_MAX_BATCHED_MIGRATION 512
+#endif
+#define NR_MAX_MIGRATE_PAGES_RETRY 10
+#define NR_MAX_MIGRATE_ASYNC_RETRY 3
+#define NR_MAX_MIGRATE_SYNC_RETRY \
+ (NR_MAX_MIGRATE_PAGES_RETRY - NR_MAX_MIGRATE_ASYNC_RETRY)
+
+struct migrate_pages_stats {
+ int nr_succeeded; /* Normal and large folios migrated successfully, in
+ units of base pages */
+ int nr_failed_pages; /* Normal and large folios failed to be migrated, in
+ units of base pages. Untried folios aren't counted */
+ int nr_thp_succeeded; /* THP migrated successfully */
+ int nr_thp_failed; /* THP failed to be migrated */
+ int nr_thp_split; /* THP split before migrating */
+};
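
Because the counters above mix units, one successfully migrated PMD-sized THP adds its full base-page count to nr_succeeded but only 1 to nr_thp_succeeded. A tiny illustration, assuming a 512-page THP (a 2 MiB THP over 4 KiB base pages, the common x86-64 configuration):

#include <stdio.h>

struct stats { int nr_succeeded, nr_thp_succeeded; };

int main(void)
{
        struct stats s = { 0, 0 };
        int thp_pages = 512;            /* assumed: 2 MiB THP / 4 KiB pages */

        /* one THP migrated successfully */
        s.nr_succeeded += thp_pages;    /* counted in base pages */
        s.nr_thp_succeeded += 1;        /* counted in THPs */

        printf("pages=%d thps=%d\n", s.nr_succeeded, s.nr_thp_succeeded);
        return 0;
}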
+
/*
- * migrate_pages - migrate the folios specified in a list, to the free folios
- * supplied as the target for the page migration
- *
- * @from: The list of folios to be migrated.
- * @get_new_page: The function used to allocate free folios to be used
- * as the target of the folio migration.
- * @put_new_page: The function used to free target folios if migration
- * fails, or NULL if no special handling is necessary.
- * @private: Private data to be passed on to get_new_page()
- * @mode: The migration mode that specifies the constraints for
- * folio migration, if any.
- * @reason: The reason for folio migration.
- * @ret_succeeded: Set to the number of folios migrated successfully if
- * the caller passes a non-NULL pointer.
- *
- * The function returns after 10 attempts or if no folios are movable any more
- * because the list has become empty or no retryable folios exist any more.
- * It is caller's responsibility to call putback_movable_pages() to return folios
- * to the LRU or free list only if ret != 0.
+ * Returns the number of hugetlb folios that were not migrated, or an error code.
+ * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or once no
+ * hugetlb folios are movable any more because the list has become empty or no
+ * retryable hugetlb folios remain. It is the caller's responsibility to call
+ * putback_movable_pages() only if ret != 0.
+ */
+static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page,
+ free_page_t put_new_page, unsigned long private,
+ enum migrate_mode mode, int reason,
+ struct migrate_pages_stats *stats,
+ struct list_head *ret_folios)
+{
+ int retry = 1;
+ int nr_failed = 0;
+ int nr_retry_pages = 0;
+ int pass = 0;
+ struct folio *folio, *folio2;
+ int rc, nr_pages;
+
+ for (pass = 0; pass < NR_MAX_MIGRATE_PAGES_RETRY && retry; pass++) {
+ retry = 0;
+ nr_retry_pages = 0;
+
+ list_for_each_entry_safe(folio, folio2, from, lru) {
+ if (!folio_test_hugetlb(folio))
+ continue;
+
+ nr_pages = folio_nr_pages(folio);
+
+ cond_resched();
+
+ /*
+ * Migratability of hugepages depends on architectures and
+ * their size. This check is necessary because some callers
+ * of hugepage migration like soft offline and memory
+ * hotremove don't walk through page tables or check whether
+ * the hugepage is pmd-based or not before kicking migration.
+ */
+ if (!hugepage_migration_supported(folio_hstate(folio))) {
+ nr_failed++;
+ stats->nr_failed_pages += nr_pages;
+ list_move_tail(&folio->lru, ret_folios);
+ continue;
+ }
+
+ rc = unmap_and_move_huge_page(get_new_page,
+ put_new_page, private,
+ &folio->page, pass > 2, mode,
+ reason, ret_folios);
+ /*
+ * The rules are:
+ * Success: hugetlb folio will be put back
+ * -EAGAIN: stay on the from list
+ * -ENOMEM: stay on the from list
+ * Other errno: put on ret_folios list
+ */
+ switch(rc) {
+ case -ENOMEM:
+ /*
+ * When memory is low, don't bother to try to migrate
+ * other folios, just exit.
+ */
+ stats->nr_failed_pages += nr_pages + nr_retry_pages;
+ return -ENOMEM;
+ case -EAGAIN:
+ retry++;
+ nr_retry_pages += nr_pages;
+ break;
+ case MIGRATEPAGE_SUCCESS:
+ stats->nr_succeeded += nr_pages;
+ break;
+ default:
+ /*
+ * Permanent failure (-EBUSY, etc.):
+ * unlike -EAGAIN case, the failed folio is
+ * removed from migration folio list and not
+ * retried in the next outer loop.
+ */
+ nr_failed++;
+ stats->nr_failed_pages += nr_pages;
+ break;
+ }
+ }
+ }
+ /*
+ * nr_failed is the number of hugetlb folios that failed to be migrated. After
+ * NR_MAX_MIGRATE_PAGES_RETRY attempts, give up and count retried hugetlb
+ * folios as failed.
+ */
+ nr_failed += retry;
+ stats->nr_failed_pages += nr_retry_pages;
+
+ return nr_failed;
+}
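
migrate_hugetlbs() above follows the usual retry shape: a bounded number of passes over the list, -EAGAIN keeps a folio on the list for the next pass, permanent errors drop it, and whatever is still asking for a retry after the final pass is folded into nr_failed. A compact userspace sketch of that control flow, with a hypothetical try_one() standing in for unmap_and_move_huge_page():

#include <errno.h>
#include <stdio.h>

#define MAX_PASSES 3
#define N 5

/*
 * Hypothetical per-item attempt:
 * 0 = success, -EAGAIN = transient failure, -EBUSY = permanent failure.
 */
static int try_one(int i, int pass)
{
        if (i == 1)
                return -EBUSY;          /* never migratable */
        if (i == 2)
                return -EAGAIN;         /* always busy: exhausts the retries */
        if (i == 3 && pass == 0)
                return -EAGAIN;         /* succeeds on the second pass */
        return 0;
}

int main(void)
{
        int done[N] = { 0 };            /* removed from the list once handled */
        int nr_failed = 0, retry = 1;

        for (int pass = 0; pass < MAX_PASSES && retry; pass++) {
                retry = 0;
                for (int i = 0; i < N; i++) {
                        if (done[i])
                                continue;
                        switch (try_one(i, pass)) {
                        case -EAGAIN:
                                retry++;        /* stays on the list */
                                break;
                        case 0:
                                done[i] = 1;    /* migrated */
                                break;
                        default:
                                done[i] = 1;    /* permanent failure */
                                nr_failed++;
                        }
                }
        }
        nr_failed += retry;     /* leftovers after the last pass count as failed */
        printf("nr_failed = %d\n", nr_failed);  /* prints 2 */
        return 0;
}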
+
+/*
+ * migrate_pages_batch() first unmaps as many folios in the from list as
+ * possible, then moves the unmapped folios.
*
- * Returns the number of {normal folio, large folio, hugetlb} that were not
- * migrated, or an error code. The number of large folio splits will be
- * considered as the number of non-migrated large folio, no matter how many
- * split folios of the large folio are migrated successfully.
+ * We only batch migration if mode == MIGRATE_ASYNC, to avoid waiting on a
+ * lock or bit while we have locked more than one folio, which may cause
+ * deadlock (e.g., for the loop device). So, if mode != MIGRATE_ASYNC, the
+ * length of the from list must be <= 1.
*/
-int migrate_pages(struct list_head *from, new_page_t get_new_page,
+static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
- enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
+ enum migrate_mode mode, int reason, struct list_head *ret_folios,
+ struct list_head *split_folios, struct migrate_pages_stats *stats,
+ int nr_pass)
{
int retry = 1;
int large_retry = 1;
int thp_retry = 1;
int nr_failed = 0;
- int nr_failed_pages = 0;
int nr_retry_pages = 0;
- int nr_succeeded = 0;
- int nr_thp_succeeded = 0;
int nr_large_failed = 0;
- int nr_thp_failed = 0;
- int nr_thp_split = 0;
int pass = 0;
bool is_large = false;
bool is_thp = false;
- struct folio *folio, *folio2;
- int rc, nr_pages;
- LIST_HEAD(ret_folios);
- LIST_HEAD(split_folios);
+ struct folio *folio, *folio2, *dst = NULL, *dst2;
+ int rc, rc_saved = 0, nr_pages;
+ LIST_HEAD(unmap_folios);
+ LIST_HEAD(dst_folios);
bool nosplit = (reason == MR_NUMA_MISPLACED);
- bool no_split_folio_counting = false;
- trace_mm_migrate_pages_start(mode, reason);
+ VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC &&
+ !list_empty(from) && !list_is_singular(from));
-split_folio_migration:
- for (pass = 0; pass < 10 && (retry || large_retry); pass++) {
+ for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) {
retry = 0;
large_retry = 0;
thp_retry = 0;
@@ -1462,35 +1649,15 @@ split_folio_migration:
* folio. Capture required information that might get
* lost during migration.
*/
- is_large = folio_test_large(folio) && !folio_test_hugetlb(folio);
+ is_large = folio_test_large(folio);
is_thp = is_large && folio_test_pmd_mappable(folio);
nr_pages = folio_nr_pages(folio);
+
cond_resched();
- if (folio_test_hugetlb(folio))
- rc = unmap_and_move_huge_page(get_new_page,
- put_new_page, private,
- &folio->page, pass > 2, mode,
- reason,
- &ret_folios);
- else
- rc = unmap_and_move(get_new_page, put_new_page,
- private, folio, pass > 2, mode,
- reason, &ret_folios);
- /*
- * The rules are:
- * Success: non hugetlb folio will be freed, hugetlb
- * folio will be put back
- * -EAGAIN: stay on the from list
- * -ENOMEM: stay on the from list
- * -ENOSYS: stay on the from list
- * Other errno: put on ret_folios list then splice to
- * from list
- */
- switch(rc) {
/*
* Large folio migration might be unsupported or
- * the allocation could've failed so we should retry
+ * the allocation might have failed, so we should retry
* on the same folio with the large folio split
* to normal folios.
*
@@ -1498,37 +1665,44 @@ split_folio_migration:
* we will migrate them after the rest of the
* list is processed.
*/
- case -ENOSYS:
- /* Large folio migration is unsupported */
- if (is_large) {
- nr_large_failed++;
- nr_thp_failed += is_thp;
- if (!try_split_folio(folio, &split_folios)) {
- nr_thp_split += is_thp;
- break;
- }
- /* Hugetlb migration is unsupported */
- } else if (!no_split_folio_counting) {
- nr_failed++;
+ if (!thp_migration_supported() && is_thp) {
+ nr_large_failed++;
+ stats->nr_thp_failed++;
+ if (!try_split_folio(folio, split_folios)) {
+ stats->nr_thp_split++;
+ continue;
}
+ stats->nr_failed_pages += nr_pages;
+ list_move_tail(&folio->lru, ret_folios);
+ continue;
+ }
- nr_failed_pages += nr_pages;
- list_move_tail(&folio->lru, &ret_folios);
- break;
+ rc = migrate_folio_unmap(get_new_page, put_new_page, private,
+ folio, &dst, mode, reason, ret_folios);
+ /*
+ * The rules are:
+ * Success: folio will be freed
+ * Unmap: folio will be put on unmap_folios list,
+ * dst folio put on dst_folios list
+ * -EAGAIN: stay on the from list
+ * -ENOMEM: stay on the from list
+ * Other errno: put on ret_folios list
+ */
+ switch(rc) {
case -ENOMEM:
/*
* When memory is low, don't bother to try to migrate
- * other folios, just exit.
+ * other folios, move unmapped folios, then exit.
*/
if (is_large) {
nr_large_failed++;
- nr_thp_failed += is_thp;
+ stats->nr_thp_failed += is_thp;
/* Large folio NUMA faulting doesn't split to retry. */
if (!nosplit) {
- int ret = try_split_folio(folio, &split_folios);
+ int ret = try_split_folio(folio, split_folios);
if (!ret) {
- nr_thp_split += is_thp;
+ stats->nr_thp_split += is_thp;
break;
} else if (reason == MR_LONGTERM_PIN &&
ret == -EAGAIN) {
@@ -1542,34 +1716,35 @@ split_folio_migration:
break;
}
}
- } else if (!no_split_folio_counting) {
+ } else {
nr_failed++;
}
- nr_failed_pages += nr_pages + nr_retry_pages;
- /*
- * There might be some split folios of fail-to-migrate large
- * folios left in split_folios list. Move them back to migration
- * list so that they could be put back to the right list by
- * the caller otherwise the folio refcnt will be leaked.
- */
- list_splice_init(&split_folios, from);
+ stats->nr_failed_pages += nr_pages + nr_retry_pages;
/* nr_failed isn't updated for not used */
nr_large_failed += large_retry;
- nr_thp_failed += thp_retry;
- goto out;
+ stats->nr_thp_failed += thp_retry;
+ rc_saved = rc;
+ if (list_empty(&unmap_folios))
+ goto out;
+ else
+ goto move;
case -EAGAIN:
if (is_large) {
large_retry++;
thp_retry += is_thp;
- } else if (!no_split_folio_counting) {
+ } else {
retry++;
}
nr_retry_pages += nr_pages;
break;
case MIGRATEPAGE_SUCCESS:
- nr_succeeded += nr_pages;
- nr_thp_succeeded += is_thp;
+ stats->nr_succeeded += nr_pages;
+ stats->nr_thp_succeeded += is_thp;
+ break;
+ case MIGRATEPAGE_UNMAP:
+ list_move_tail(&folio->lru, &unmap_folios);
+ list_add_tail(&dst->lru, &dst_folios);
break;
default:
/*
@@ -1580,38 +1755,242 @@ split_folio_migration:
*/
if (is_large) {
nr_large_failed++;
- nr_thp_failed += is_thp;
- } else if (!no_split_folio_counting) {
+ stats->nr_thp_failed += is_thp;
+ } else {
+ nr_failed++;
+ }
+
+ stats->nr_failed_pages += nr_pages;
+ break;
+ }
+ }
+ }
+ nr_failed += retry;
+ nr_large_failed += large_retry;
+ stats->nr_thp_failed += thp_retry;
+ stats->nr_failed_pages += nr_retry_pages;
+move:
+ /* Flush TLBs for all unmapped folios */
+ try_to_unmap_flush();
+
+ retry = 1;
+ for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) {
+ retry = 0;
+ large_retry = 0;
+ thp_retry = 0;
+ nr_retry_pages = 0;
+
+ dst = list_first_entry(&dst_folios, struct folio, lru);
+ dst2 = list_next_entry(dst, lru);
+ list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
+ is_large = folio_test_large(folio);
+ is_thp = is_large && folio_test_pmd_mappable(folio);
+ nr_pages = folio_nr_pages(folio);
+
+ cond_resched();
+
+ rc = migrate_folio_move(put_new_page, private,
+ folio, dst, mode,
+ reason, ret_folios);
+ /*
+ * The rules are:
+ * Success: folio will be freed
+ * -EAGAIN: stay on the unmap_folios list
+ * Other errno: put on ret_folios list
+ */
+ switch(rc) {
+ case -EAGAIN:
+ if (is_large) {
+ large_retry++;
+ thp_retry += is_thp;
+ } else {
+ retry++;
+ }
+ nr_retry_pages += nr_pages;
+ break;
+ case MIGRATEPAGE_SUCCESS:
+ stats->nr_succeeded += nr_pages;
+ stats->nr_thp_succeeded += is_thp;
+ break;
+ default:
+ if (is_large) {
+ nr_large_failed++;
+ stats->nr_thp_failed += is_thp;
+ } else {
nr_failed++;
}
- nr_failed_pages += nr_pages;
+ stats->nr_failed_pages += nr_pages;
break;
}
+ dst = dst2;
+ dst2 = list_next_entry(dst, lru);
}
}
nr_failed += retry;
nr_large_failed += large_retry;
- nr_thp_failed += thp_retry;
- nr_failed_pages += nr_retry_pages;
+ stats->nr_thp_failed += thp_retry;
+ stats->nr_failed_pages += nr_retry_pages;
+
+ if (rc_saved)
+ rc = rc_saved;
+ else
+ rc = nr_failed + nr_large_failed;
+out:
+ /* Cleanup remaining folios */
+ dst = list_first_entry(&dst_folios, struct folio, lru);
+ dst2 = list_next_entry(dst, lru);
+ list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
+ int page_was_mapped = 0;
+ struct anon_vma *anon_vma = NULL;
+
+ __migrate_folio_extract(dst, &page_was_mapped, &anon_vma);
+ migrate_folio_undo_src(folio, page_was_mapped, anon_vma,
+ true, ret_folios);
+ list_del(&dst->lru);
+ migrate_folio_undo_dst(dst, true, put_new_page, private);
+ dst = dst2;
+ dst2 = list_next_entry(dst, lru);
+ }
+
+ return rc;
+}
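
Splitting the old __unmap_and_move() into migrate_folio_unmap() and migrate_folio_move() lets migrate_pages_batch() unmap the whole batch first (with TTU_BATCH_FLUSH), issue one deferred TLB flush via try_to_unmap_flush(), and only then perform the copies. A schematic userspace sketch of that two-phase structure; the helpers and the flush stand-in are illustrative, not kernel APIs:

#include <stdio.h>

#define N 4

enum state { TODO, UNMAPPED, MOVED, FAILED };

/* Invented stand-ins: the kernel works on folio lists, not arrays. */
static int unmap_one(int i)      { (void)i; return 0; }          /* 0 = unmapped */
static int move_one(int i)       { return i == 2 ? -1 : 0; }     /* one failure */
static void flush_tlb_once(void) { puts("single batched TLB flush"); }

int main(void)
{
        enum state st[N] = { TODO, TODO, TODO, TODO };

        /* Phase 1: unmap as many folios as possible, deferring the TLB flush. */
        for (int i = 0; i < N; i++)
                st[i] = unmap_one(i) ? FAILED : UNMAPPED;

        flush_tlb_once();       /* one flush covers the whole batch */

        /* Phase 2: move only the successfully unmapped ones. */
        for (int i = 0; i < N; i++)
                if (st[i] == UNMAPPED)
                        st[i] = move_one(i) ? FAILED : MOVED;

        for (int i = 0; i < N; i++)
                printf("folio %d: %s\n", i,
                       st[i] == MOVED ? "moved" : "restored to source list");
        return 0;
}

Batching the flush is the point of the restructuring: unmapping a batch costs roughly one TLB shootdown instead of one per folio.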
+
+static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page,
+ free_page_t put_new_page, unsigned long private,
+ enum migrate_mode mode, int reason, struct list_head *ret_folios,
+ struct list_head *split_folios, struct migrate_pages_stats *stats)
+{
+ int rc, nr_failed = 0;
+ LIST_HEAD(folios);
+ struct migrate_pages_stats astats;
+
+ memset(&astats, 0, sizeof(astats));
+ /* Try to migrate in batch with MIGRATE_ASYNC mode first */
+ rc = migrate_pages_batch(from, get_new_page, put_new_page, private, MIGRATE_ASYNC,
+ reason, &folios, split_folios, &astats,
+ NR_MAX_MIGRATE_ASYNC_RETRY);
+ stats->nr_succeeded += astats.nr_succeeded;
+ stats->nr_thp_succeeded += astats.nr_thp_succeeded;
+ stats->nr_thp_split += astats.nr_thp_split;
+ if (rc < 0) {
+ stats->nr_failed_pages += astats.nr_failed_pages;
+ stats->nr_thp_failed += astats.nr_thp_failed;
+ list_splice_tail(&folios, ret_folios);
+ return rc;
+ }
+ stats->nr_thp_failed += astats.nr_thp_split;
+ nr_failed += astats.nr_thp_split;
/*
- * Try to migrate split folios of fail-to-migrate large folios, no
- * nr_failed counting in this round, since all split folios of a
- * large folio is counted as 1 failure in the first round.
+ * Fall back to migrating all failed folios one by one synchronously. All
+ * failed folios except split THPs will be retried, so their failure
+ * isn't counted.
*/
+ list_splice_tail_init(&folios, from);
+ while (!list_empty(from)) {
+ list_move(from->next, &folios);
+ rc = migrate_pages_batch(&folios, get_new_page, put_new_page,
+ private, mode, reason, ret_folios,
+ split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY);
+ list_splice_tail_init(&folios, ret_folios);
+ if (rc < 0)
+ return rc;
+ nr_failed += rc;
+ }
+
+ return nr_failed;
+}
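
migrate_pages_sync() above first runs the whole batch in MIGRATE_ASYNC mode, then falls back to migrating the leftovers one folio at a time in the caller's blocking mode, so it never sleeps while holding more than one folio lock. A toy sketch of that fallback strategy with invented helpers:

#include <stdio.h>

/* Invented helpers: each returns the number of items that did not migrate. */
static int migrate_async_batch(int n)
{
        return n / 2;                   /* pretend half succeed cheaply */
}

static int migrate_sync_one(int i)
{
        return i == 0 ? 1 : 0;          /* pretend one item still fails */
}

int main(void)
{
        int remaining = 8, failed = 0;

        /* Cheap first pass: non-blocking, so batching many locks is safe. */
        remaining = migrate_async_batch(remaining);

        /* Fallback: blocking mode, but never more than one item at a time. */
        for (int i = 0; i < remaining; i++)
                failed += migrate_sync_one(i);

        printf("failed after sync fallback: %d\n", failed);    /* prints 1 */
        return 0;
}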
+
+/*
+ * migrate_pages - migrate the folios specified in a list, to the free folios
+ * supplied as the target for the page migration
+ *
+ * @from: The list of folios to be migrated.
+ * @get_new_page: The function used to allocate free folios to be used
+ * as the target of the folio migration.
+ * @put_new_page: The function used to free target folios if migration
+ * fails, or NULL if no special handling is necessary.
+ * @private: Private data to be passed on to get_new_page()
+ * @mode: The migration mode that specifies the constraints for
+ * folio migration, if any.
+ * @reason: The reason for folio migration.
+ * @ret_succeeded: Set to the number of folios migrated successfully if
+ * the caller passes a non-NULL pointer.
+ *
+ * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or once no
+ * folios are movable any more because the list has become empty or no retryable
+ * folios remain. It is the caller's responsibility to call putback_movable_pages()
+ * only if ret != 0.
+ *
+ * Returns the number of {normal folio, large folio, hugetlb} that were not
+ * migrated, or an error code. Each split large folio is counted as one
+ * non-migrated large folio, no matter how many of its split folios are
+ * migrated successfully.
+ */
+int migrate_pages(struct list_head *from, new_page_t get_new_page,
+ free_page_t put_new_page, unsigned long private,
+ enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
+{
+ int rc, rc_gather;
+ int nr_pages;
+ struct folio *folio, *folio2;
+ LIST_HEAD(folios);
+ LIST_HEAD(ret_folios);
+ LIST_HEAD(split_folios);
+ struct migrate_pages_stats stats;
+
+ trace_mm_migrate_pages_start(mode, reason);
+
+ memset(&stats, 0, sizeof(stats));
+
+ rc_gather = migrate_hugetlbs(from, get_new_page, put_new_page, private,
+ mode, reason, &stats, &ret_folios);
+ if (rc_gather < 0)
+ goto out;
+
+again:
+ nr_pages = 0;
+ list_for_each_entry_safe(folio, folio2, from, lru) {
+ /* Retried hugetlb folios will be kept in the list */
+ if (folio_test_hugetlb(folio)) {
+ list_move_tail(&folio->lru, &ret_folios);
+ continue;
+ }
+
+ nr_pages += folio_nr_pages(folio);
+ if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
+ break;
+ }
+ if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
+ list_cut_before(&folios, from, &folio2->lru);
+ else
+ list_splice_init(from, &folios);
+ if (mode == MIGRATE_ASYNC)
+ rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private,
+ mode, reason, &ret_folios, &split_folios, &stats,
+ NR_MAX_MIGRATE_PAGES_RETRY);
+ else
+ rc = migrate_pages_sync(&folios, get_new_page, put_new_page, private,
+ mode, reason, &ret_folios, &split_folios, &stats);
+ list_splice_tail_init(&folios, &ret_folios);
+ if (rc < 0) {
+ rc_gather = rc;
+ list_splice_tail(&split_folios, &ret_folios);
+ goto out;
+ }
if (!list_empty(&split_folios)) {
/*
- * Move non-migrated folios (after 10 retries) to ret_folios
- * to avoid migrating them again.
+ * Failure isn't counted since all split folios of a large folio
+ * are already counted as 1 failure. Also, we only try to migrate
+ * with minimal effort, forcing MIGRATE_ASYNC mode and retrying once.
*/
- list_splice_init(from, &ret_folios);
- list_splice_init(&split_folios, from);
- no_split_folio_counting = true;
- retry = 1;
- goto split_folio_migration;
+ migrate_pages_batch(&split_folios, get_new_page, put_new_page, private,
+ MIGRATE_ASYNC, reason, &ret_folios, NULL, &stats, 1);
+ list_splice_tail_init(&split_folios, &ret_folios);
}
-
- rc = nr_failed + nr_large_failed;
+ rc_gather += rc;
+ if (!list_empty(from))
+ goto again;
out:
/*
* Put the permanent failure folio back to migration list, they
@@ -1624,20 +2003,21 @@ out:
* are migrated successfully.
*/
if (list_empty(from))
- rc = 0;
+ rc_gather = 0;
- count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
- count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
- count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
- count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
- count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
- trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded,
- nr_thp_failed, nr_thp_split, mode, reason);
+ count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded);
+ count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages);
+ count_vm_events(THP_MIGRATION_SUCCESS, stats.nr_thp_succeeded);
+ count_vm_events(THP_MIGRATION_FAIL, stats.nr_thp_failed);
+ count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split);
+ trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages,
+ stats.nr_thp_succeeded, stats.nr_thp_failed,
+ stats.nr_thp_split, mode, reason);
if (ret_succeeded)
- *ret_succeeded = nr_succeeded;
+ *ret_succeeded = stats.nr_succeeded;
- return rc;
+ return rc_gather;
}
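
migrate_pages() above carves the input list into chunks of at most NR_MAX_BATCHED_MIGRATION base pages, counting each large folio by folio_nr_pages(), and hands each chunk to the batch or sync helper. A small sketch of that size-bounded chunking with plain integers standing in for folio sizes; as in the kernel loop, the folio that crosses the budget still lands in the current chunk:

#include <stdio.h>

#define MAX_BATCH_PAGES 512     /* mirrors NR_MAX_BATCHED_MIGRATION with THP */

int main(void)
{
        /* nr_pages of each folio on the list: base pages mixed with THPs */
        int folios[] = { 1, 1, 512, 1, 1, 512, 1 };
        int n = sizeof(folios) / sizeof(folios[0]);
        int start = 0;

        while (start < n) {
                int pages = 0, end = start;

                /* take folios until the chunk reaches the page budget */
                while (end < n && pages < MAX_BATCH_PAGES)
                        pages += folios[end++];

                printf("batch: %d folios, %d pages\n", end - start, pages);
                start = end;    /* next chunk starts after the cut */
        }
        return 0;
}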
struct page *alloc_migration_target(struct page *page, unsigned long private)
@@ -1646,6 +2026,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long private)
struct migration_target_control *mtc;
gfp_t gfp_mask;
unsigned int order = 0;
+ struct folio *hugetlb_folio = NULL;
struct folio *new_folio = NULL;
int nid;
int zidx;
@@ -1660,7 +2041,9 @@ struct page *alloc_migration_target(struct page *page, unsigned long private)
struct hstate *h = folio_hstate(folio);
gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
- return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
+ hugetlb_folio = alloc_hugetlb_folio_nodemask(h, nid,
+ mtc->nmask, gfp_mask);
+ return &hugetlb_folio->page;
}
if (folio_test_large(folio)) {
@@ -1725,6 +2108,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct *vma;
struct page *page;
int err;
+ bool isolated;
mmap_read_lock(mm);
err = -EFAULT;
@@ -1756,17 +2140,18 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
if (PageHuge(page)) {
if (PageHead(page)) {
- err = isolate_hugetlb(page, pagelist);
- if (!err)
- err = 1;
+ isolated = isolate_hugetlb(page_folio(page), pagelist);
+ err = isolated ? 1 : -EBUSY;
}
} else {
struct page *head;
head = compound_head(page);
- err = isolate_lru_page(head);
- if (err)
+ isolated = isolate_lru_page(head);
+ if (!isolated) {
+ err = -EBUSY;
goto out_putpage;
+ }
err = 1;
list_add_tail(&head->lru, pagelist);
@@ -2171,7 +2556,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
return 0;
}
- if (isolate_lru_page(page))
+ if (!isolate_lru_page(page))
return 0;
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page),
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 721b2365dbca..d30c9de60b0d 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -306,7 +306,7 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
* private page mappings that won't be migrated.
*/
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
- migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
+ migrate->vma->vm_mm, migrate->start, migrate->end,
migrate->pgmap_owner);
mmu_notifier_invalidate_range_start(&range);
@@ -388,7 +388,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
allow_drain = false;
}
- if (isolate_lru_page(page)) {
+ if (!isolate_lru_page(page)) {
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
continue;
@@ -733,7 +733,7 @@ static void __migrate_device_pages(unsigned long *src_pfns,
notified = true;
mmu_notifier_range_init_owner(&range,
- MMU_NOTIFY_MIGRATE, 0, migrate->vma,
+ MMU_NOTIFY_MIGRATE, 0,
migrate->vma->vm_mm, addr, migrate->end,
migrate->pgmap_owner);
mmu_notifier_invalidate_range_start(&range);
diff --git a/mm/mincore.c b/mm/mincore.c
index a085a2aeabd8..d359650b0f75 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -33,7 +33,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
* Hugepages under user process are always in RAM and never
* swapped out, but theoretically it needs to be checked.
*/
- present = pte && !huge_pte_none(huge_ptep_get(pte));
+ present = pte && !huge_pte_none_mostly(huge_ptep_get(pte));
for (; addr != end; vec++, addr += PAGE_SIZE)
*vec = present;
walk->private = vec;
@@ -168,7 +168,7 @@ static inline bool can_do_mincore(struct vm_area_struct *vma)
* for writing; otherwise we'd be including shared non-exclusive
* mappings, which opens a side channel.
*/
- return inode_owner_or_capable(&init_user_ns,
+ return inode_owner_or_capable(&nop_mnt_idmap,
file_inode(vma->vm_file)) ||
file_permission(vma->vm_file, MAY_WRITE) == 0;
}
diff --git a/mm/mlock.c b/mm/mlock.c
index 7032f6dd0ce1..617469fce96d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -28,12 +28,12 @@
#include "internal.h"
-struct mlock_pvec {
+struct mlock_fbatch {
local_lock_t lock;
- struct pagevec vec;
+ struct folio_batch fbatch;
};
-static DEFINE_PER_CPU(struct mlock_pvec, mlock_pvec) = {
+static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = {
.lock = INIT_LOCAL_LOCK(lock),
};
@@ -48,192 +48,192 @@ bool can_do_mlock(void)
EXPORT_SYMBOL(can_do_mlock);
/*
- * Mlocked pages are marked with PageMlocked() flag for efficient testing
+ * Mlocked folios are marked with the PG_mlocked flag for efficient testing
* in vmscan and, possibly, the fault path; and to support semi-accurate
* statistics.
*
- * An mlocked page [PageMlocked(page)] is unevictable. As such, it will
- * be placed on the LRU "unevictable" list, rather than the [in]active lists.
- * The unevictable list is an LRU sibling list to the [in]active lists.
- * PageUnevictable is set to indicate the unevictable state.
+ * An mlocked folio [folio_test_mlocked(folio)] is unevictable. As such, it
+ * will be ostensibly placed on the LRU "unevictable" list (actually no such
+ * list exists), rather than the [in]active lists. PG_unevictable is set to
+ * indicate the unevictable state.
*/
-static struct lruvec *__mlock_page(struct page *page, struct lruvec *lruvec)
+static struct lruvec *__mlock_folio(struct folio *folio, struct lruvec *lruvec)
{
/* There is nothing more we can do while it's off LRU */
- if (!TestClearPageLRU(page))
+ if (!folio_test_clear_lru(folio))
return lruvec;
- lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);
+ lruvec = folio_lruvec_relock_irq(folio, lruvec);
- if (unlikely(page_evictable(page))) {
+ if (unlikely(folio_evictable(folio))) {
/*
- * This is a little surprising, but quite possible:
- * PageMlocked must have got cleared already by another CPU.
- * Could this page be on the Unevictable LRU? I'm not sure,
- * but move it now if so.
+ * This is a little surprising, but quite possible: PG_mlocked
+ * must have got cleared already by another CPU. Could this
+ * folio be unevictable? I'm not sure, but move it now if so.
*/
- if (PageUnevictable(page)) {
- del_page_from_lru_list(page, lruvec);
- ClearPageUnevictable(page);
- add_page_to_lru_list(page, lruvec);
+ if (folio_test_unevictable(folio)) {
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_unevictable(folio);
+ lruvec_add_folio(lruvec, folio);
+
__count_vm_events(UNEVICTABLE_PGRESCUED,
- thp_nr_pages(page));
+ folio_nr_pages(folio));
}
goto out;
}
- if (PageUnevictable(page)) {
- if (PageMlocked(page))
- page->mlock_count++;
+ if (folio_test_unevictable(folio)) {
+ if (folio_test_mlocked(folio))
+ folio->mlock_count++;
goto out;
}
- del_page_from_lru_list(page, lruvec);
- ClearPageActive(page);
- SetPageUnevictable(page);
- page->mlock_count = !!PageMlocked(page);
- add_page_to_lru_list(page, lruvec);
- __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page));
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_active(folio);
+ folio_set_unevictable(folio);
+ folio->mlock_count = !!folio_test_mlocked(folio);
+ lruvec_add_folio(lruvec, folio);
+ __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio));
out:
- SetPageLRU(page);
+ folio_set_lru(folio);
return lruvec;
}
-static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec)
+static struct lruvec *__mlock_new_folio(struct folio *folio, struct lruvec *lruvec)
{
- VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);
+ lruvec = folio_lruvec_relock_irq(folio, lruvec);
/* As above, this is a little surprising, but possible */
- if (unlikely(page_evictable(page)))
+ if (unlikely(folio_evictable(folio)))
goto out;
- SetPageUnevictable(page);
- page->mlock_count = !!PageMlocked(page);
- __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page));
+ folio_set_unevictable(folio);
+ folio->mlock_count = !!folio_test_mlocked(folio);
+ __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio));
out:
- add_page_to_lru_list(page, lruvec);
- SetPageLRU(page);
+ lruvec_add_folio(lruvec, folio);
+ folio_set_lru(folio);
return lruvec;
}
-static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec)
+static struct lruvec *__munlock_folio(struct folio *folio, struct lruvec *lruvec)
{
- int nr_pages = thp_nr_pages(page);
+ int nr_pages = folio_nr_pages(folio);
bool isolated = false;
- if (!TestClearPageLRU(page))
+ if (!folio_test_clear_lru(folio))
goto munlock;
isolated = true;
- lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);
+ lruvec = folio_lruvec_relock_irq(folio, lruvec);
- if (PageUnevictable(page)) {
+ if (folio_test_unevictable(folio)) {
/* Then mlock_count is maintained, but might undercount */
- if (page->mlock_count)
- page->mlock_count--;
- if (page->mlock_count)
+ if (folio->mlock_count)
+ folio->mlock_count--;
+ if (folio->mlock_count)
goto out;
}
/* else assume that was the last mlock: reclaim will fix it if not */
munlock:
- if (TestClearPageMlocked(page)) {
- __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
- if (isolated || !PageUnevictable(page))
+ if (folio_test_clear_mlocked(folio)) {
+ __zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
+ if (isolated || !folio_test_unevictable(folio))
__count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
else
__count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
}
- /* page_evictable() has to be checked *after* clearing Mlocked */
- if (isolated && PageUnevictable(page) && page_evictable(page)) {
- del_page_from_lru_list(page, lruvec);
- ClearPageUnevictable(page);
- add_page_to_lru_list(page, lruvec);
+ /* folio_evictable() has to be checked *after* clearing Mlocked */
+ if (isolated && folio_test_unevictable(folio) && folio_evictable(folio)) {
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_unevictable(folio);
+ lruvec_add_folio(lruvec, folio);
__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
}
out:
if (isolated)
- SetPageLRU(page);
+ folio_set_lru(folio);
return lruvec;
}
/*
- * Flags held in the low bits of a struct page pointer on the mlock_pvec.
+ * Flags held in the low bits of a struct folio pointer on the mlock_fbatch.
*/
-#define LRU_PAGE 0x1
-#define NEW_PAGE 0x2
-static inline struct page *mlock_lru(struct page *page)
+#define LRU_FOLIO 0x1
+#define NEW_FOLIO 0x2
+static inline struct folio *mlock_lru(struct folio *folio)
{
- return (struct page *)((unsigned long)page + LRU_PAGE);
+ return (struct folio *)((unsigned long)folio + LRU_FOLIO);
}
-static inline struct page *mlock_new(struct page *page)
+static inline struct folio *mlock_new(struct folio *folio)
{
- return (struct page *)((unsigned long)page + NEW_PAGE);
+ return (struct folio *)((unsigned long)folio + NEW_FOLIO);
}
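
mlock_lru() and mlock_new() above tag a folio pointer by adding LRU_FOLIO or NEW_FOLIO into its low bits, which are free because struct folio pointers are at least word-aligned; mlock_folio_batch() later strips the tag before dispatching. A standalone sketch of that low-bit pointer tagging with a made-up item struct (the kernel adds and subtracts the tag, which is equivalent to the masking here because the tagged bits start out clear):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TAG_LRU  0x1UL
#define TAG_NEW  0x2UL
#define TAG_MASK (TAG_LRU | TAG_NEW)

struct item { _Alignas(4) int value; }; /* bits 0-1 of its address are 0 */

static struct item *tag(struct item *p, unsigned long t)
{
        return (struct item *)((uintptr_t)p | t);
}

int main(void)
{
        struct item it = { .value = 7 };
        struct item *tagged = tag(&it, TAG_LRU);

        unsigned long t = (uintptr_t)tagged & TAG_MASK;         /* recover tag */
        struct item *p = (struct item *)((uintptr_t)tagged & ~(uintptr_t)TAG_MASK);

        assert(p == &it && t == TAG_LRU);
        printf("value=%d tag=%lu\n", p->value, t);
        return 0;
}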
/*
- * mlock_pagevec() is derived from pagevec_lru_move_fn():
- * perhaps that can make use of such page pointer flags in future,
- * but for now just keep it for mlock. We could use three separate
- * pagevecs instead, but one feels better (munlocking a full pagevec
- * does not need to drain mlocking pagevecs first).
+ * mlock_folio_batch() is derived from folio_batch_move_lru(): perhaps that can
+ * make use of such folio pointer flags in future, but for now just keep it for
+ * mlock. We could use three separate folio batches instead, but one feels
+ * better (munlocking a full folio batch does not need to drain mlocking folio
+ * batches first).
*/
-static void mlock_pagevec(struct pagevec *pvec)
+static void mlock_folio_batch(struct folio_batch *fbatch)
{
struct lruvec *lruvec = NULL;
unsigned long mlock;
- struct page *page;
+ struct folio *folio;
int i;
- for (i = 0; i < pagevec_count(pvec); i++) {
- page = pvec->pages[i];
- mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE);
- page = (struct page *)((unsigned long)page - mlock);
- pvec->pages[i] = page;
+ for (i = 0; i < folio_batch_count(fbatch); i++) {
+ folio = fbatch->folios[i];
+ mlock = (unsigned long)folio & (LRU_FOLIO | NEW_FOLIO);
+ folio = (struct folio *)((unsigned long)folio - mlock);
+ fbatch->folios[i] = folio;
- if (mlock & LRU_PAGE)
- lruvec = __mlock_page(page, lruvec);
- else if (mlock & NEW_PAGE)
- lruvec = __mlock_new_page(page, lruvec);
+ if (mlock & LRU_FOLIO)
+ lruvec = __mlock_folio(folio, lruvec);
+ else if (mlock & NEW_FOLIO)
+ lruvec = __mlock_new_folio(folio, lruvec);
else
- lruvec = __munlock_page(page, lruvec);
+ lruvec = __munlock_folio(folio, lruvec);
}
if (lruvec)
unlock_page_lruvec_irq(lruvec);
- release_pages(pvec->pages, pvec->nr);
- pagevec_reinit(pvec);
+ release_pages(fbatch->folios, fbatch->nr);
+ folio_batch_reinit(fbatch);
}
-void mlock_page_drain_local(void)
+void mlock_drain_local(void)
{
- struct pagevec *pvec;
+ struct folio_batch *fbatch;
- local_lock(&mlock_pvec.lock);
- pvec = this_cpu_ptr(&mlock_pvec.vec);
- if (pagevec_count(pvec))
- mlock_pagevec(pvec);
- local_unlock(&mlock_pvec.lock);
+ local_lock(&mlock_fbatch.lock);
+ fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
+ if (folio_batch_count(fbatch))
+ mlock_folio_batch(fbatch);
+ local_unlock(&mlock_fbatch.lock);
}
-void mlock_page_drain_remote(int cpu)
+void mlock_drain_remote(int cpu)
{
- struct pagevec *pvec;
+ struct folio_batch *fbatch;
WARN_ON_ONCE(cpu_online(cpu));
- pvec = &per_cpu(mlock_pvec.vec, cpu);
- if (pagevec_count(pvec))
- mlock_pagevec(pvec);
+ fbatch = &per_cpu(mlock_fbatch.fbatch, cpu);
+ if (folio_batch_count(fbatch))
+ mlock_folio_batch(fbatch);
}
-bool need_mlock_page_drain(int cpu)
+bool need_mlock_drain(int cpu)
{
- return pagevec_count(&per_cpu(mlock_pvec.vec, cpu));
+ return folio_batch_count(&per_cpu(mlock_fbatch.fbatch, cpu));
}
/**
@@ -242,10 +242,10 @@ bool need_mlock_page_drain(int cpu)
*/
void mlock_folio(struct folio *folio)
{
- struct pagevec *pvec;
+ struct folio_batch *fbatch;
- local_lock(&mlock_pvec.lock);
- pvec = this_cpu_ptr(&mlock_pvec.vec);
+ local_lock(&mlock_fbatch.lock);
+ fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
if (!folio_test_set_mlocked(folio)) {
int nr_pages = folio_nr_pages(folio);
@@ -255,54 +255,54 @@ void mlock_folio(struct folio *folio)
}
folio_get(folio);
- if (!pagevec_add(pvec, mlock_lru(&folio->page)) ||
+ if (!folio_batch_add(fbatch, mlock_lru(folio)) ||
folio_test_large(folio) || lru_cache_disabled())
- mlock_pagevec(pvec);
- local_unlock(&mlock_pvec.lock);
+ mlock_folio_batch(fbatch);
+ local_unlock(&mlock_fbatch.lock);
}
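
mlock_folio() above only appends the folio to a per-CPU folio_batch and drains the whole batch once it is full (or when batching is unsuitable, e.g. for large folios or with the LRU cache disabled), so the lock and LRU work is amortized over the batch instead of paid per folio. A minimal userspace sketch of that accumulate-then-drain pattern; the capacity and process_batch() are invented for illustration:

#include <stdio.h>

#define BATCH_SIZE 15           /* illustrative small fixed capacity */

static int batch[BATCH_SIZE];
static int batch_count;

/* Stand-in for mlock_folio_batch(): amortizes the expensive work per drain. */
static void process_batch(void)
{
        printf("draining %d queued items in one go\n", batch_count);
        batch_count = 0;
}

static void queue_item(int item)
{
        batch[batch_count++] = item;
        if (batch_count == BATCH_SIZE)  /* batch full: drain now */
                process_batch();
}

int main(void)
{
        for (int i = 0; i < 40; i++)
                queue_item(i);
        if (batch_count)                /* final partial drain */
                process_batch();
        return 0;
}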
/**
- * mlock_new_page - mlock a newly allocated page not yet on LRU
- * @page: page to be mlocked, either a normal page or a THP head.
+ * mlock_new_folio - mlock a newly allocated folio not yet on LRU
+ * @folio: folio to be mlocked, either normal or a THP head.
*/
-void mlock_new_page(struct page *page)
+void mlock_new_folio(struct folio *folio)
{
- struct pagevec *pvec;
- int nr_pages = thp_nr_pages(page);
+ struct folio_batch *fbatch;
+ int nr_pages = folio_nr_pages(folio);
+
+ local_lock(&mlock_fbatch.lock);
+ fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
+ folio_set_mlocked(folio);
- local_lock(&mlock_pvec.lock);
- pvec = this_cpu_ptr(&mlock_pvec.vec);
- SetPageMlocked(page);
- mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
+ zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
- get_page(page);
- if (!pagevec_add(pvec, mlock_new(page)) ||
- PageHead(page) || lru_cache_disabled())
- mlock_pagevec(pvec);
- local_unlock(&mlock_pvec.lock);
+ folio_get(folio);
+ if (!folio_batch_add(fbatch, mlock_new(folio)) ||
+ folio_test_large(folio) || lru_cache_disabled())
+ mlock_folio_batch(fbatch);
+ local_unlock(&mlock_fbatch.lock);
}
/**
- * munlock_page - munlock a page
- * @page: page to be munlocked, either a normal page or a THP head.
+ * munlock_folio - munlock a folio
+ * @folio: folio to be munlocked, either normal or a THP head.
*/
-void munlock_page(struct page *page)
+void munlock_folio(struct folio *folio)
{
- struct pagevec *pvec;
+ struct folio_batch *fbatch;
- local_lock(&mlock_pvec.lock);
- pvec = this_cpu_ptr(&mlock_pvec.vec);
+ local_lock(&mlock_fbatch.lock);
+ fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
/*
- * TestClearPageMlocked(page) must be left to __munlock_page(),
- * which will check whether the page is multiply mlocked.
+ * folio_test_clear_mlocked(folio) must be left to __munlock_folio(),
+ * which will check whether the folio is multiply mlocked.
*/
-
- get_page(page);
- if (!pagevec_add(pvec, page) ||
- PageHead(page) || lru_cache_disabled())
- mlock_pagevec(pvec);
- local_unlock(&mlock_pvec.lock);
+ folio_get(folio);
+ if (!folio_batch_add(fbatch, folio) ||
+ folio_test_large(folio) || lru_cache_disabled())
+ mlock_folio_batch(fbatch);
+ local_unlock(&mlock_fbatch.lock);
}
static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
@@ -312,7 +312,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
spinlock_t *ptl;
pte_t *start_pte, *pte;
- struct page *page;
+ struct folio *folio;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -320,11 +320,11 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
goto out;
if (is_huge_zero_pmd(*pmd))
goto out;
- page = pmd_page(*pmd);
+ folio = page_folio(pmd_page(*pmd));
if (vma->vm_flags & VM_LOCKED)
- mlock_folio(page_folio(page));
+ mlock_folio(folio);
else
- munlock_page(page);
+ munlock_folio(folio);
goto out;
}
@@ -332,15 +332,15 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
if (!pte_present(*pte))
continue;
- page = vm_normal_page(vma, addr, *pte);
- if (!page || is_zone_device_page(page))
+ folio = vm_normal_folio(vma, addr, *pte);
+ if (!folio || folio_is_zone_device(folio))
continue;
- if (PageTransCompound(page))
+ if (folio_test_large(folio))
continue;
if (vma->vm_flags & VM_LOCKED)
- mlock_folio(page_folio(page));
+ mlock_folio(folio);
else
- munlock_page(page);
+ munlock_folio(folio);
}
pte_unmap(start_pte);
out:
@@ -370,9 +370,9 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma,
/*
* There is a slight chance that concurrent page migration,
* or page reclaim finding a page of this now-VM_LOCKED vma,
- * will call mlock_vma_page() and raise page's mlock_count:
+ * will call mlock_vma_folio() and raise page's mlock_count:
* double counting, leaving the page unevictable indefinitely.
- * Communicate this danger to mlock_vma_page() with VM_IO,
+ * Communicate this danger to mlock_vma_folio() with VM_IO,
* which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas.
* mmap_lock is held in write mode here, so this weird
* combination should not be visible to other mmap_lock users;
@@ -380,7 +380,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma,
*/
if (newflags & VM_LOCKED)
newflags |= VM_IO;
- WRITE_ONCE(vma->vm_flags, newflags);
+ vm_flags_reset_once(vma, newflags);
lru_add_drain();
walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL);
@@ -388,7 +388,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma,
if (newflags & VM_IO) {
newflags &= ~VM_IO;
- WRITE_ONCE(vma->vm_flags, newflags);
+ vm_flags_reset_once(vma, newflags);
}
}
@@ -401,8 +401,9 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma,
*
* For vmas that pass the filters, merge/split as appropriate.
*/
-static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
- unsigned long start, unsigned long end, vm_flags_t newflags)
+static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ struct vm_area_struct **prev, unsigned long start,
+ unsigned long end, vm_flags_t newflags)
{
struct mm_struct *mm = vma->vm_mm;
pgoff_t pgoff;
@@ -417,22 +418,22 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
goto out;
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
- vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+ *prev = vma_merge(vmi, mm, *prev, start, end, newflags,
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (*prev) {
vma = *prev;
goto success;
}
if (start != vma->vm_start) {
- ret = split_vma(mm, vma, start, 1);
+ ret = split_vma(vmi, vma, start, 1);
if (ret)
goto out;
}
if (end != vma->vm_end) {
- ret = split_vma(mm, vma, end, 0);
+ ret = split_vma(vmi, vma, end, 0);
if (ret)
goto out;
}
@@ -456,7 +457,7 @@ success:
if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
/* No work to do, and mlocking twice would be wrong */
- vma->vm_flags = newflags;
+ vm_flags_reset(vma, newflags);
} else {
mlock_vma_pages_range(vma, start, end, newflags);
}
@@ -471,7 +472,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
unsigned long nstart, end, tmp;
struct vm_area_struct *vma, *prev;
int error;
- MA_STATE(mas, &current->mm->mm_mt, start, start);
+ VMA_ITERATOR(vmi, current->mm, start);
VM_BUG_ON(offset_in_page(start));
VM_BUG_ON(len != PAGE_ALIGN(len));
@@ -480,39 +481,37 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
return -EINVAL;
if (end == start)
return 0;
- vma = mas_walk(&mas);
+ vma = vma_iter_load(&vmi);
if (!vma)
return -ENOMEM;
+ prev = vma_prev(&vmi);
if (start > vma->vm_start)
prev = vma;
- else
- prev = mas_prev(&mas, 0);
- for (nstart = start ; ; ) {
- vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+ nstart = start;
+ tmp = vma->vm_start;
+ for_each_vma_range(vmi, vma, end) {
+ vm_flags_t newflags;
+
+ if (vma->vm_start != tmp)
+ return -ENOMEM;
+ newflags = vma->vm_flags & ~VM_LOCKED_MASK;
newflags |= flags;
-
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
tmp = vma->vm_end;
if (tmp > end)
tmp = end;
- error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
+ error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
if (error)
break;
nstart = tmp;
- if (nstart < prev->vm_end)
- nstart = prev->vm_end;
- if (nstart >= end)
- break;
-
- vma = find_vma(prev->vm_mm, prev->vm_end);
- if (!vma || vma->vm_start != nstart) {
- error = -ENOMEM;
- break;
- }
}
+
+ if (vma_iter_end(&vmi) < end)
+ return -ENOMEM;
+
return error;
}
@@ -658,11 +657,11 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
*/
static int apply_mlockall_flags(int flags)
{
- MA_STATE(mas, &current->mm->mm_mt, 0, 0);
+ VMA_ITERATOR(vmi, current->mm, 0);
struct vm_area_struct *vma, *prev = NULL;
vm_flags_t to_add = 0;
- current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
+ current->mm->def_flags &= ~VM_LOCKED_MASK;
if (flags & MCL_FUTURE) {
current->mm->def_flags |= VM_LOCKED;
@@ -679,15 +678,15 @@ static int apply_mlockall_flags(int flags)
to_add |= VM_LOCKONFAULT;
}
- mas_for_each(&mas, vma, ULONG_MAX) {
+ for_each_vma(vmi, vma) {
vm_flags_t newflags;
- newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+ newflags = vma->vm_flags & ~VM_LOCKED_MASK;
newflags |= to_add;
/* Ignore errors */
- mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
- mas_pause(&mas);
+ mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end,
+ newflags);
cond_resched();
}
out:
diff --git a/mm/mmap.c b/mm/mmap.c
index 87d929316d57..740b54be3ed4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -78,7 +78,7 @@ core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
struct vm_area_struct *vma, struct vm_area_struct *prev,
struct vm_area_struct *next, unsigned long start,
- unsigned long end);
+ unsigned long end, bool mm_wr_locked);
static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
{
@@ -144,6 +144,24 @@ static void remove_vma(struct vm_area_struct *vma)
vm_area_free(vma);
}
+static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
+ unsigned long min)
+{
+ return mas_prev(&vmi->mas, min);
+}
+
+static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
+ unsigned long start, unsigned long end, gfp_t gfp)
+{
+ vmi->mas.index = start;
+ vmi->mas.last = end - 1;
+ mas_store_gfp(&vmi->mas, NULL, gfp);
+ if (unlikely(mas_is_err(&vmi->mas)))
+ return -ENOMEM;
+
+ return 0;
+}
+
/*
* check_brk_limits() - Use platform specific check of range & verify mlock
* limits.
@@ -162,10 +180,7 @@ static int check_brk_limits(unsigned long addr, unsigned long len)
return mlock_future_check(current->mm, current->mm->def_flags, len);
}
-static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
- unsigned long newbrk, unsigned long oldbrk,
- struct list_head *uf);
-static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma,
+static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma,
unsigned long addr, unsigned long request, unsigned long flags);
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
@@ -176,7 +191,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
bool populate;
bool downgraded = false;
LIST_HEAD(uf);
- MA_STATE(mas, &mm->mm_mt, 0, 0);
+ struct vma_iterator vmi;
if (mmap_write_lock_killable(mm))
return -EINTR;
@@ -218,23 +233,23 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/*
* Always allow shrinking brk.
- * do_brk_munmap() may downgrade mmap_lock to read.
+ * do_vma_munmap() may downgrade mmap_lock to read.
*/
if (brk <= mm->brk) {
int ret;
/* Search one past newbrk */
- mas_set(&mas, newbrk);
- brkvma = mas_find(&mas, oldbrk);
+ vma_iter_init(&vmi, mm, newbrk);
+ brkvma = vma_find(&vmi, oldbrk);
if (!brkvma || brkvma->vm_start >= oldbrk)
goto out; /* mapping intersects with an existing non-brk vma. */
/*
* mm->brk must be protected by write mmap_lock.
- * do_brk_munmap() may downgrade the lock, so update it
- * before calling do_brk_munmap().
+ * do_vma_munmap() may downgrade the lock, so update it
+ * before calling do_vma_munmap().
*/
mm->brk = brk;
- ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf);
+ ret = do_vma_munmap(&vmi, brkvma, newbrk, oldbrk, &uf, true);
if (ret == 1) {
downgraded = true;
goto success;
@@ -252,14 +267,14 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
* Only check if the next VMA is within the stack_guard_gap of the
* expansion area
*/
- mas_set(&mas, oldbrk);
- next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap);
+ vma_iter_init(&vmi, mm, oldbrk);
+ next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap);
if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
goto out;
- brkvma = mas_prev(&mas, mm->start_brk);
+ brkvma = vma_prev_limit(&vmi, mm->start_brk);
/* Ok, looks good - let it rip. */
- if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
+ if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
goto out;
mm->brk = brk;
@@ -417,85 +432,218 @@ static void __vma_link_file(struct vm_area_struct *vma,
flush_dcache_mmap_unlock(mapping);
}
+static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ VMA_ITERATOR(vmi, mm, 0);
+ struct address_space *mapping = NULL;
+
+ if (vma_iter_prealloc(&vmi))
+ return -ENOMEM;
+
+ if (vma->vm_file) {
+ mapping = vma->vm_file->f_mapping;
+ i_mmap_lock_write(mapping);
+ }
+
+ vma_iter_store(&vmi, vma);
+
+ if (mapping) {
+ __vma_link_file(vma, mapping);
+ i_mmap_unlock_write(mapping);
+ }
+
+ mm->map_count++;
+ validate_mm(mm);
+ return 0;
+}
+
/*
- * vma_mas_store() - Store a VMA in the maple tree.
- * @vma: The vm_area_struct
- * @mas: The maple state
- *
- * Efficient way to store a VMA in the maple tree when the @mas has already
- * walked to the correct location.
- *
- * Note: the end address is inclusive in the maple tree.
+ * init_multi_vma_prep() - Initializer for struct vma_prepare
+ * @vp: The vma_prepare struct
+ * @vma: The vma that will be altered once locked
+ * @next: The next vma if it is to be adjusted
+ * @remove: The first vma to be removed
+ * @remove2: The second vma to be removed
*/
-void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas)
+static inline void init_multi_vma_prep(struct vma_prepare *vp,
+ struct vm_area_struct *vma, struct vm_area_struct *next,
+ struct vm_area_struct *remove, struct vm_area_struct *remove2)
{
- trace_vma_store(mas->tree, vma);
- mas_set_range(mas, vma->vm_start, vma->vm_end - 1);
- mas_store_prealloc(mas, vma);
+ memset(vp, 0, sizeof(struct vma_prepare));
+ vp->vma = vma;
+ vp->anon_vma = vma->anon_vma;
+ vp->remove = remove;
+ vp->remove2 = remove2;
+ vp->adj_next = next;
+ if (!vp->anon_vma && next)
+ vp->anon_vma = next->anon_vma;
+
+ vp->file = vma->vm_file;
+ if (vp->file)
+ vp->mapping = vma->vm_file->f_mapping;
+
}
/*
- * vma_mas_remove() - Remove a VMA from the maple tree.
- * @vma: The vm_area_struct
- * @mas: The maple state
- *
- * Efficient way to remove a VMA from the maple tree when the @mas has already
- * been established and points to the correct location.
- * Note: the end address is inclusive in the maple tree.
+ * init_vma_prep() - Initializer wrapper for vma_prepare struct
+ * @vp: The vma_prepare struct
+ * @vma: The vma that will be altered once locked
*/
-void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas)
+static inline void init_vma_prep(struct vma_prepare *vp,
+ struct vm_area_struct *vma)
{
- trace_vma_mas_szero(mas->tree, vma->vm_start, vma->vm_end - 1);
- mas->index = vma->vm_start;
- mas->last = vma->vm_end - 1;
- mas_store_prealloc(mas, NULL);
+ init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
}
+
/*
- * vma_mas_szero() - Set a given range to zero. Used when modifying a
- * vm_area_struct start or end.
- *
- * @mas: The maple tree ma_state
- * @start: The start address to zero
- * @end: The end address to zero.
+ * vma_prepare() - Helper function for handling locking VMAs prior to altering
+ * @vp: The initialized vma_prepare struct
*/
-static inline void vma_mas_szero(struct ma_state *mas, unsigned long start,
- unsigned long end)
+static inline void vma_prepare(struct vma_prepare *vp)
{
- trace_vma_mas_szero(mas->tree, start, end - 1);
- mas_set_range(mas, start, end - 1);
- mas_store_prealloc(mas, NULL);
+ if (vp->file) {
+ uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
+
+ if (vp->adj_next)
+ uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
+ vp->adj_next->vm_end);
+
+ i_mmap_lock_write(vp->mapping);
+ if (vp->insert && vp->insert->vm_file) {
+ /*
+ * Put into interval tree now, so instantiated pages
+ * are visible to arm/parisc __flush_dcache_page
+ * throughout; but we cannot insert into address
+ * space until vma start or end is updated.
+ */
+ __vma_link_file(vp->insert,
+ vp->insert->vm_file->f_mapping);
+ }
+ }
+
+ if (vp->anon_vma) {
+ anon_vma_lock_write(vp->anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vp->vma);
+ if (vp->adj_next)
+ anon_vma_interval_tree_pre_update_vma(vp->adj_next);
+ }
+
+ if (vp->file) {
+ flush_dcache_mmap_lock(vp->mapping);
+ vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
+ if (vp->adj_next)
+ vma_interval_tree_remove(vp->adj_next,
+ &vp->mapping->i_mmap);
+ }
+
}
-static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
-{
- MA_STATE(mas, &mm->mm_mt, 0, 0);
- struct address_space *mapping = NULL;
+/*
+ * vma_complete- Helper function for handling the unlocking after altering VMAs,
+ * or for inserting a VMA.
+ *
+ * @vp: The vma_prepare struct
+ * @vmi: The vma iterator
+ * @mm: The mm_struct
+ */
+static inline void vma_complete(struct vma_prepare *vp,
+ struct vma_iterator *vmi, struct mm_struct *mm)
+{
+ if (vp->file) {
+ if (vp->adj_next)
+ vma_interval_tree_insert(vp->adj_next,
+ &vp->mapping->i_mmap);
+ vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
+ flush_dcache_mmap_unlock(vp->mapping);
+ }
+
+ if (vp->remove && vp->file) {
+ __remove_shared_vm_struct(vp->remove, vp->file, vp->mapping);
+ if (vp->remove2)
+ __remove_shared_vm_struct(vp->remove2, vp->file,
+ vp->mapping);
+ } else if (vp->insert) {
+ /*
+ * split_vma has split insert from vma, and needs
+ * us to insert it before dropping the locks
+ * (it may either follow vma or precede it).
+ */
+ vma_iter_store(vmi, vp->insert);
+ mm->map_count++;
+ }
- if (mas_preallocate(&mas, vma, GFP_KERNEL))
- return -ENOMEM;
+ if (vp->anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vp->vma);
+ if (vp->adj_next)
+ anon_vma_interval_tree_post_update_vma(vp->adj_next);
+ anon_vma_unlock_write(vp->anon_vma);
+ }
- if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
- i_mmap_lock_write(mapping);
+ if (vp->file) {
+ i_mmap_unlock_write(vp->mapping);
+ uprobe_mmap(vp->vma);
+
+ if (vp->adj_next)
+ uprobe_mmap(vp->adj_next);
}
- vma_mas_store(vma, &mas);
+ if (vp->remove) {
+again:
+ if (vp->file) {
+ uprobe_munmap(vp->remove, vp->remove->vm_start,
+ vp->remove->vm_end);
+ fput(vp->file);
+ }
+ if (vp->remove->anon_vma)
+ anon_vma_merge(vp->vma, vp->remove);
+ mm->map_count--;
+ mpol_put(vma_policy(vp->remove));
+ if (!vp->remove2)
+ WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
+ vm_area_free(vp->remove);
- if (mapping) {
- __vma_link_file(vma, mapping);
- i_mmap_unlock_write(mapping);
+ /*
+ * In mprotect's case 6 (see comments on vma_merge),
+ * we must remove the one after next as well.
+ */
+ if (vp->remove2) {
+ vp->remove = vp->remove2;
+ vp->remove2 = NULL;
+ goto again;
+ }
+ }
+ if (vp->insert && vp->file)
+ uprobe_mmap(vp->insert);
+}
+
+/*
+ * dup_anon_vma() - Helper function to duplicate anon_vma
+ * @dst: The destination VMA
+ * @src: The source VMA
+ *
+ * Returns: 0 on success.
+ */
+static inline int dup_anon_vma(struct vm_area_struct *dst,
+ struct vm_area_struct *src)
+{
+ /*
+ * Easily overlooked: when mprotect shifts the boundary, make sure the
+ * expanding vma has anon_vma set if the shrinking vma had, to cover any
+ * anon pages imported.
+ */
+ if (src->anon_vma && !dst->anon_vma) {
+ dst->anon_vma = src->anon_vma;
+ return anon_vma_clone(dst, src);
}
- mm->map_count++;
- validate_mm(mm);
return 0;
}
/*
* vma_expand - Expand an existing VMA
*
- * @mas: The maple state
+ * @vmi: The vma iterator
* @vma: The vma to expand
* @start: The start of the vma
* @end: The exclusive end of the vma
@@ -509,96 +657,46 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
*
* Returns: 0 on success
*/
-inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma,
- unsigned long start, unsigned long end, pgoff_t pgoff,
- struct vm_area_struct *next)
+int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, pgoff_t pgoff,
+ struct vm_area_struct *next)
{
- struct mm_struct *mm = vma->vm_mm;
- struct address_space *mapping = NULL;
- struct rb_root_cached *root = NULL;
- struct anon_vma *anon_vma = vma->anon_vma;
- struct file *file = vma->vm_file;
bool remove_next = false;
+ struct vma_prepare vp;
if (next && (vma != next) && (end == next->vm_end)) {
- remove_next = true;
- if (next->anon_vma && !vma->anon_vma) {
- int error;
+ int ret;
- anon_vma = next->anon_vma;
- vma->anon_vma = anon_vma;
- error = anon_vma_clone(vma, next);
- if (error)
- return error;
- }
+ remove_next = true;
+ ret = dup_anon_vma(vma, next);
+ if (ret)
+ return ret;
}
+ init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL);
/* Not merging but overwriting any part of next is not handled. */
- VM_BUG_ON(next && !remove_next && next != vma && end > next->vm_start);
+ VM_WARN_ON(next && !vp.remove &&
+ next != vma && end > next->vm_start);
/* Only handles expanding */
- VM_BUG_ON(vma->vm_start < start || vma->vm_end > end);
+ VM_WARN_ON(vma->vm_start < start || vma->vm_end > end);
- if (mas_preallocate(mas, vma, GFP_KERNEL))
+ if (vma_iter_prealloc(vmi))
goto nomem;
vma_adjust_trans_huge(vma, start, end, 0);
+ /* VMA iterator points to previous, so set to start if necessary */
+ if (vma_iter_addr(vmi) != start)
+ vma_iter_set(vmi, start);
- if (file) {
- mapping = file->f_mapping;
- root = &mapping->i_mmap;
- uprobe_munmap(vma, vma->vm_start, vma->vm_end);
- i_mmap_lock_write(mapping);
- }
-
- if (anon_vma) {
- anon_vma_lock_write(anon_vma);
- anon_vma_interval_tree_pre_update_vma(vma);
- }
-
- if (file) {
- flush_dcache_mmap_lock(mapping);
- vma_interval_tree_remove(vma, root);
- }
-
+ vma_prepare(&vp);
vma->vm_start = start;
vma->vm_end = end;
vma->vm_pgoff = pgoff;
/* Note: mas must be pointing to the expanding VMA */
- vma_mas_store(vma, mas);
-
- if (file) {
- vma_interval_tree_insert(vma, root);
- flush_dcache_mmap_unlock(mapping);
- }
-
- /* Expanding over the next vma */
- if (remove_next && file) {
- __remove_shared_vm_struct(next, file, mapping);
- }
-
- if (anon_vma) {
- anon_vma_interval_tree_post_update_vma(vma);
- anon_vma_unlock_write(anon_vma);
- }
-
- if (file) {
- i_mmap_unlock_write(mapping);
- uprobe_mmap(vma);
- }
-
- if (remove_next) {
- if (file) {
- uprobe_munmap(next, next->vm_start, next->vm_end);
- fput(file);
- }
- if (next->anon_vma)
- anon_vma_merge(vma, next);
- mm->map_count--;
- mpol_put(vma_policy(next));
- vm_area_free(next);
- }
+ vma_iter_store(vmi, vma);
- validate_mm(mm);
+ vma_complete(&vp, vmi, vma->vm_mm);
+ validate_mm(vma->vm_mm);
return 0;
nomem:
@@ -606,256 +704,39 @@ nomem:
}
/*
- * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
- * is already present in an i_mmap tree without adjusting the tree.
- * The following helper function should be used when such adjustments
- * are necessary. The "insert" vma (if any) is to be inserted
- * before we drop the necessary locks.
+ * vma_shrink() - Reduce an existing VMAs memory area
+ * @vmi: The vma iterator
+ * @vma: The VMA to modify
+ * @start: The new start
+ * @end: The new end
+ *
+ * Returns: 0 on success, -ENOMEM otherwise
*/
-int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
- struct vm_area_struct *expand)
+int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, pgoff_t pgoff)
{
- struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *next_next = NULL; /* uninit var warning */
- struct vm_area_struct *next = find_vma(mm, vma->vm_end);
- struct vm_area_struct *orig_vma = vma;
- struct address_space *mapping = NULL;
- struct rb_root_cached *root = NULL;
- struct anon_vma *anon_vma = NULL;
- struct file *file = vma->vm_file;
- bool vma_changed = false;
- long adjust_next = 0;
- int remove_next = 0;
- MA_STATE(mas, &mm->mm_mt, 0, 0);
- struct vm_area_struct *exporter = NULL, *importer = NULL;
-
- if (next && !insert) {
- if (end >= next->vm_end) {
- /*
- * vma expands, overlapping all the next, and
- * perhaps the one after too (mprotect case 6).
- * The only other cases that gets here are
- * case 1, case 7 and case 8.
- */
- if (next == expand) {
- /*
- * The only case where we don't expand "vma"
- * and we expand "next" instead is case 8.
- */
- VM_WARN_ON(end != next->vm_end);
- /*
- * remove_next == 3 means we're
- * removing "vma" and that to do so we
- * swapped "vma" and "next".
- */
- remove_next = 3;
- VM_WARN_ON(file != next->vm_file);
- swap(vma, next);
- } else {
- VM_WARN_ON(expand != vma);
- /*
- * case 1, 6, 7, remove_next == 2 is case 6,
- * remove_next == 1 is case 1 or 7.
- */
- remove_next = 1 + (end > next->vm_end);
- if (remove_next == 2)
- next_next = find_vma(mm, next->vm_end);
-
- VM_WARN_ON(remove_next == 2 &&
- end != next_next->vm_end);
- }
-
- exporter = next;
- importer = vma;
+ struct vma_prepare vp;
- /*
- * If next doesn't have anon_vma, import from vma after
- * next, if the vma overlaps with it.
- */
- if (remove_next == 2 && !next->anon_vma)
- exporter = next_next;
-
- } else if (end > next->vm_start) {
- /*
- * vma expands, overlapping part of the next:
- * mprotect case 5 shifting the boundary up.
- */
- adjust_next = (end - next->vm_start);
- exporter = next;
- importer = vma;
- VM_WARN_ON(expand != importer);
- } else if (end < vma->vm_end) {
- /*
- * vma shrinks, and !insert tells it's not
- * split_vma inserting another: so it must be
- * mprotect case 4 shifting the boundary down.
- */
- adjust_next = -(vma->vm_end - end);
- exporter = vma;
- importer = next;
- VM_WARN_ON(expand != importer);
- }
+ WARN_ON((vma->vm_start != start) && (vma->vm_end != end));
- /*
- * Easily overlooked: when mprotect shifts the boundary,
- * make sure the expanding vma has anon_vma set if the
- * shrinking vma had, to cover any anon pages imported.
- */
- if (exporter && exporter->anon_vma && !importer->anon_vma) {
- int error;
-
- importer->anon_vma = exporter->anon_vma;
- error = anon_vma_clone(importer, exporter);
- if (error)
- return error;
- }
- }
-
- if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ if (vma_iter_prealloc(vmi))
return -ENOMEM;
- vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
- if (file) {
- mapping = file->f_mapping;
- root = &mapping->i_mmap;
- uprobe_munmap(vma, vma->vm_start, vma->vm_end);
-
- if (adjust_next)
- uprobe_munmap(next, next->vm_start, next->vm_end);
-
- i_mmap_lock_write(mapping);
- if (insert && insert->vm_file) {
- /*
- * Put into interval tree now, so instantiated pages
- * are visible to arm/parisc __flush_dcache_page
- * throughout; but we cannot insert into address
- * space until vma start or end is updated.
- */
- __vma_link_file(insert, insert->vm_file->f_mapping);
- }
- }
-
- anon_vma = vma->anon_vma;
- if (!anon_vma && adjust_next)
- anon_vma = next->anon_vma;
- if (anon_vma) {
- VM_WARN_ON(adjust_next && next->anon_vma &&
- anon_vma != next->anon_vma);
- anon_vma_lock_write(anon_vma);
- anon_vma_interval_tree_pre_update_vma(vma);
- if (adjust_next)
- anon_vma_interval_tree_pre_update_vma(next);
- }
-
- if (file) {
- flush_dcache_mmap_lock(mapping);
- vma_interval_tree_remove(vma, root);
- if (adjust_next)
- vma_interval_tree_remove(next, root);
- }
+ init_vma_prep(&vp, vma);
+ vma_adjust_trans_huge(vma, start, end, 0);
+ vma_prepare(&vp);
- if (start != vma->vm_start) {
- if ((vma->vm_start < start) &&
- (!insert || (insert->vm_end != start))) {
- vma_mas_szero(&mas, vma->vm_start, start);
- VM_WARN_ON(insert && insert->vm_start > vma->vm_start);
- } else {
- vma_changed = true;
- }
- vma->vm_start = start;
- }
- if (end != vma->vm_end) {
- if (vma->vm_end > end) {
- if (!insert || (insert->vm_start != end)) {
- vma_mas_szero(&mas, end, vma->vm_end);
- mas_reset(&mas);
- VM_WARN_ON(insert &&
- insert->vm_end < vma->vm_end);
- }
- } else {
- vma_changed = true;
- }
- vma->vm_end = end;
- }
+ if (vma->vm_start < start)
+ vma_iter_clear(vmi, vma->vm_start, start);
- if (vma_changed)
- vma_mas_store(vma, &mas);
+ if (vma->vm_end > end)
+ vma_iter_clear(vmi, end, vma->vm_end);
+ vma->vm_start = start;
+ vma->vm_end = end;
vma->vm_pgoff = pgoff;
- if (adjust_next) {
- next->vm_start += adjust_next;
- next->vm_pgoff += adjust_next >> PAGE_SHIFT;
- vma_mas_store(next, &mas);
- }
-
- if (file) {
- if (adjust_next)
- vma_interval_tree_insert(next, root);
- vma_interval_tree_insert(vma, root);
- flush_dcache_mmap_unlock(mapping);
- }
-
- if (remove_next && file) {
- __remove_shared_vm_struct(next, file, mapping);
- if (remove_next == 2)
- __remove_shared_vm_struct(next_next, file, mapping);
- } else if (insert) {
- /*
- * split_vma has split insert from vma, and needs
- * us to insert it before dropping the locks
- * (it may either follow vma or precede it).
- */
- mas_reset(&mas);
- vma_mas_store(insert, &mas);
- mm->map_count++;
- }
-
- if (anon_vma) {
- anon_vma_interval_tree_post_update_vma(vma);
- if (adjust_next)
- anon_vma_interval_tree_post_update_vma(next);
- anon_vma_unlock_write(anon_vma);
- }
-
- if (file) {
- i_mmap_unlock_write(mapping);
- uprobe_mmap(vma);
-
- if (adjust_next)
- uprobe_mmap(next);
- }
-
- if (remove_next) {
-again:
- if (file) {
- uprobe_munmap(next, next->vm_start, next->vm_end);
- fput(file);
- }
- if (next->anon_vma)
- anon_vma_merge(vma, next);
- mm->map_count--;
- mpol_put(vma_policy(next));
- if (remove_next != 2)
- BUG_ON(vma->vm_end < next->vm_end);
- vm_area_free(next);
-
- /*
- * In mprotect's case 6 (see comments on vma_merge),
- * we must remove next_next too.
- */
- if (remove_next == 2) {
- remove_next = 1;
- next = next_next;
- goto again;
- }
- }
- if (insert && file)
- uprobe_mmap(insert);
-
- mas_destroy(&mas);
- validate_mm(mm);
-
+ vma_complete(&vp, vmi, vma->vm_mm);
+ validate_mm(vma->vm_mm);
return 0;
}
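
vma_expand() and vma_shrink() above both follow the same bracket that vma_prepare()/vma_complete() introduce: reserve maple-tree nodes, call vma_adjust_trans_huge(), take the file/anon_vma locks via vma_prepare(), update vm_start/vm_end/vm_pgoff together with the tree entry, then let vma_complete() reinsert into the interval trees, drop the locks and handle uprobes/removals. A condensed sketch of that sequence, assuming the iterator already points at the VMA; note that struct vma_prepare and these helpers are internal to mm/mmap.c, so this is illustrative rather than something callable from another file:

/* Illustrative only: the prepare/complete bracket used above. */
static int grow_vma_end(struct vma_iterator *vmi, struct vm_area_struct *vma,
			unsigned long new_end)
{
	struct vma_prepare vp;

	if (vma_iter_prealloc(vmi))		/* reserve tree nodes up front */
		return -ENOMEM;

	vma_adjust_trans_huge(vma, vma->vm_start, new_end, 0);
	init_vma_prep(&vp, vma);
	vma_prepare(&vp);			/* i_mmap/anon_vma locks */

	vma->vm_end = new_end;
	vma_iter_store(vmi, vma);		/* overwrite the old range */

	vma_complete(&vp, vmi, vma->vm_mm);	/* reinsert, unlock */
	return 0;
}
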
@@ -864,9 +745,9 @@ again:
* per-vma resources, so we don't attempt to merge those.
*/
static inline int is_mergeable_vma(struct vm_area_struct *vma,
- struct file *file, unsigned long vm_flags,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- struct anon_vma_name *anon_name)
+ struct file *file, unsigned long vm_flags,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ struct anon_vma_name *anon_name)
{
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -985,7 +866,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
* It is important for case 8 that the vma NNNN overlapping the
* region AAAA is never going to extended over XXXX. Instead XXXX must
* be extended in region AAAA and NNNN must be removed. This way in
- * all cases where vma_merge succeeds, the moment vma_adjust drops the
+ * all cases where vma_merge succeeds, the moment vma_merge drops the
* rmap_locks, the properties of the merged vma will be already
* correct for the whole merged range. Some of those properties like
* vm_page_prot/vm_flags may be accessed by rmap_walks and they must
@@ -995,8 +876,14 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
* or other rmap walkers (if working on addresses beyond the "end"
* parameter) may establish ptes with the wrong permissions of NNNN
* instead of the right permissions of XXXX.
+ *
+ * In the code below:
+ * PPPP is represented by *prev
+ * NNNN is represented by *mid (and possibly equal to *next)
+ * XXXX is represented by *next or not represented at all.
+ * AAAA is not represented - it will be merged or the function will return NULL
*/
-struct vm_area_struct *vma_merge(struct mm_struct *mm,
+struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
@@ -1005,11 +892,19 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct anon_vma_name *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
- struct vm_area_struct *mid, *next, *res;
+ pgoff_t vma_pgoff;
+ struct vm_area_struct *mid, *next, *res = NULL;
+ struct vm_area_struct *vma, *adjust, *remove, *remove2;
int err = -1;
bool merge_prev = false;
bool merge_next = false;
+ bool vma_expanded = false;
+ struct vma_prepare vp;
+ unsigned long vma_end = end;
+ long adj_next = 0;
+ unsigned long vma_start = addr;
+ validate_mm(mm);
/*
* We later require that vma->vm_flags == vm_flags,
* so this tests vma->vm_flags & VM_SPECIAL, too.
@@ -1027,13 +922,18 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
VM_WARN_ON(mid && end > mid->vm_end);
VM_WARN_ON(addr >= end);
- /* Can we merge the predecessor? */
- if (prev && prev->vm_end == addr &&
- mpol_equal(vma_policy(prev), policy) &&
- can_vma_merge_after(prev, vm_flags,
- anon_vma, file, pgoff,
- vm_userfaultfd_ctx, anon_name)) {
- merge_prev = true;
+ if (prev) {
+ res = prev;
+ vma = prev;
+ vma_start = prev->vm_start;
+ vma_pgoff = prev->vm_pgoff;
+ /* Can we merge the predecessor? */
+ if (prev->vm_end == addr && mpol_equal(vma_policy(prev), policy)
+ && can_vma_merge_after(prev, vm_flags, anon_vma, file,
+ pgoff, vm_userfaultfd_ctx, anon_name)) {
+ merge_prev = true;
+ vma_prev(vmi);
+ }
}
/* Can we merge the successor? */
if (next && end == next->vm_start &&
@@ -1043,34 +943,87 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
vm_userfaultfd_ctx, anon_name)) {
merge_next = true;
}
+
+ remove = remove2 = adjust = NULL;
/* Can we merge both the predecessor and the successor? */
if (merge_prev && merge_next &&
- is_mergeable_anon_vma(prev->anon_vma,
- next->anon_vma, NULL)) { /* cases 1, 6 */
- err = __vma_adjust(prev, prev->vm_start,
- next->vm_end, prev->vm_pgoff, NULL,
- prev);
- res = prev;
- } else if (merge_prev) { /* cases 2, 5, 7 */
- err = __vma_adjust(prev, prev->vm_start,
- end, prev->vm_pgoff, NULL, prev);
- res = prev;
+ is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) {
+ remove = mid; /* case 1 */
+ vma_end = next->vm_end;
+ err = dup_anon_vma(res, remove);
+ if (mid != next) { /* case 6 */
+ remove2 = next;
+ if (!remove->anon_vma)
+ err = dup_anon_vma(res, remove2);
+ }
+ } else if (merge_prev) {
+ err = 0; /* case 2 */
+ if (mid && end > mid->vm_start) {
+ err = dup_anon_vma(res, mid);
+ if (end == mid->vm_end) { /* case 7 */
+ remove = mid;
+ } else { /* case 5 */
+ adjust = mid;
+ adj_next = (end - mid->vm_start);
+ }
+ }
} else if (merge_next) {
- if (prev && addr < prev->vm_end) /* case 4 */
- err = __vma_adjust(prev, prev->vm_start,
- addr, prev->vm_pgoff, NULL, next);
- else /* cases 3, 8 */
- err = __vma_adjust(mid, addr, next->vm_end,
- next->vm_pgoff - pglen, NULL, next);
res = next;
+ if (prev && addr < prev->vm_end) { /* case 4 */
+ vma_end = addr;
+ adjust = mid;
+ adj_next = -(vma->vm_end - addr);
+ err = dup_anon_vma(adjust, prev);
+ } else {
+ vma = next; /* case 3 */
+ vma_start = addr;
+ vma_end = next->vm_end;
+ vma_pgoff = mid->vm_pgoff;
+ err = 0;
+ if (mid != next) { /* case 8 */
+ remove = mid;
+ err = dup_anon_vma(res, remove);
+ }
+ }
}
- /*
- * Cannot merge with predecessor or successor or error in __vma_adjust?
- */
+ /* Cannot merge or error in anon_vma clone */
if (err)
return NULL;
+
+ if (vma_iter_prealloc(vmi))
+ return NULL;
+
+ vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next);
+ init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
+ VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
+ vp.anon_vma != adjust->anon_vma);
+
+ vma_prepare(&vp);
+ if (vma_start < vma->vm_start || vma_end > vma->vm_end)
+ vma_expanded = true;
+
+ vma->vm_start = vma_start;
+ vma->vm_end = vma_end;
+ vma->vm_pgoff = vma_pgoff;
+
+ if (vma_expanded)
+ vma_iter_store(vmi, vma);
+
+ if (adj_next) {
+ adjust->vm_start += adj_next;
+ adjust->vm_pgoff += adj_next >> PAGE_SHIFT;
+ if (adj_next < 0) {
+ WARN_ON(vma_expanded);
+ vma_iter_store(vmi, next);
+ }
+ }
+
+ vma_complete(&vp, vmi, mm);
+ vma_iter_free(vmi);
+ validate_mm(mm);
khugepaged_enter_vma(res, vm_flags);
+
return res;
}
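
vma_merge() keeps the same nine merge cases but now performs the adjustment itself through dup_anon_vma() and the vma_prepare()/vma_complete() pair instead of delegating to __vma_adjust(), which this patch removes. For callers the only visible change is the added iterator argument; a hedged sketch modelled on the mprotect_fixup() call converted later in this patch (the wrapper function is hypothetical):

/* Sketch of a call site after the signature change; only the leading
 * vmi argument is new, everything else passes through unchanged. */
static struct vm_area_struct *try_merge(struct vma_iterator *vmi,
			struct vm_area_struct *prev,
			struct vm_area_struct *vma,
			unsigned long start, unsigned long end,
			unsigned long newflags)
{
	pgoff_t pgoff = vma->vm_pgoff +
			((start - vma->vm_start) >> PAGE_SHIFT);

	return vma_merge(vmi, vma->vm_mm, prev, start, end, newflags,
			 vma->anon_vma, vma->vm_file, pgoff,
			 vma_policy(vma), vma->vm_userfaultfd_ctx,
			 anon_vma_name(vma));
}
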
@@ -1524,6 +1477,10 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
return 1;
+ /* Do we need write faults for uffd-wp tracking? */
+ if (userfaultfd_wp(vma))
+ return 1;
+
/* Specialty mapping? */
if (vm_flags & VM_PFNMAP)
return 0;
@@ -1554,8 +1511,8 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
* the correct alignment and offset, all from @info. Note: current->mm is used
* for the search.
*
- * @info: The unmapped area information including the range (low_limit -
- * hight_limit), the alignment offset and mask.
+ * @info: The unmapped area information including the range [low_limit -
+ * high_limit), the alignment offset and mask.
*
* Return: A memory address or -ENOMEM.
*/
@@ -1581,11 +1538,11 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
/**
* unmapped_area_topdown() - Find an area between the low_limit and the
- * high_limit with * the correct alignment and offset at the highest available
+ * high_limit with the correct alignment and offset at the highest available
* address, all from @info. Note: current->mm is used for the search.
*
- * @info: The unmapped area information including the range (low_limit -
- * hight_limit), the alignment offset and mask.
+ * @info: The unmapped area information including the range [low_limit -
+ * high_limit), the alignment offset and mask.
*
* Return: A memory address or -ENOMEM.
*/
@@ -1934,7 +1891,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/* Check that both stack segments have the same anon_vma? */
}
- if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ if (mas_preallocate(&mas, GFP_KERNEL))
return -ENOMEM;
/* We must make sure the anon_vma is allocated. */
@@ -1977,7 +1934,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
/* Overwrite old entry in mtree. */
- vma_mas_store(vma, &mas);
+ mas_set_range(&mas, vma->vm_start, address - 1);
+ mas_store_prealloc(&mas, vma);
anon_vma_interval_tree_post_update_vma(vma);
spin_unlock(&mm->page_table_lock);
@@ -2015,7 +1973,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
return -ENOMEM;
}
- if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ if (mas_preallocate(&mas, GFP_KERNEL))
return -ENOMEM;
/* We must make sure the anon_vma is allocated. */
@@ -2059,7 +2017,8 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
vma->vm_start = address;
vma->vm_pgoff -= grow;
/* Overwrite old entry in mtree. */
- vma_mas_store(vma, &mas);
+ mas_set_range(&mas, address, vma->vm_end - 1);
+ mas_store_prealloc(&mas, vma);
anon_vma_interval_tree_post_update_vma(vma);
spin_unlock(&mm->page_table_lock);
@@ -2174,14 +2133,14 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
struct vm_area_struct *vma, struct vm_area_struct *prev,
struct vm_area_struct *next,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned long end, bool mm_wr_locked)
{
struct mmu_gather tlb;
lru_add_drain();
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
- unmap_vmas(&tlb, mt, vma, start, end);
+ unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked);
free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
@@ -2190,13 +2149,19 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
/*
* __split_vma() bypasses sysctl_max_map_count checking. We use this where it
* has already been checked or doesn't make sense to fail.
+ * VMA Iterator will point to the end VMA.
*/
-int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long addr, int new_below)
{
+ struct vma_prepare vp;
struct vm_area_struct *new;
int err;
- validate_mm_mt(mm);
+
+ validate_mm_mt(vma->vm_mm);
+
+ WARN_ON(vma->vm_start >= addr);
+ WARN_ON(vma->vm_end <= addr);
if (vma->vm_ops && vma->vm_ops->may_split) {
err = vma->vm_ops->may_split(vma, addr);
@@ -2208,16 +2173,20 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
if (!new)
return -ENOMEM;
- if (new_below)
+ err = -ENOMEM;
+ if (vma_iter_prealloc(vmi))
+ goto out_free_vma;
+
+ if (new_below) {
new->vm_end = addr;
- else {
+ } else {
new->vm_start = addr;
new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
}
err = vma_dup_policy(vma, new);
if (err)
- goto out_free_vma;
+ goto out_free_vmi;
err = anon_vma_clone(new, vma);
if (err)
@@ -2229,30 +2198,34 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
- if (new_below)
- err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
- ((addr - new->vm_start) >> PAGE_SHIFT), new);
- else
- err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
+ vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
+ init_vma_prep(&vp, vma);
+ vp.insert = new;
+ vma_prepare(&vp);
+
+ if (new_below) {
+ vma->vm_start = addr;
+ vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT;
+ } else {
+ vma->vm_end = addr;
+ }
+
+ /* vma_complete stores the new vma */
+ vma_complete(&vp, vmi, vma->vm_mm);
/* Success. */
- if (!err)
- return 0;
+ if (new_below)
+ vma_next(vmi);
+ validate_mm_mt(vma->vm_mm);
+ return 0;
- /* Avoid vm accounting in close() operation */
- new->vm_start = new->vm_end;
- new->vm_pgoff = 0;
- /* Clean everything up if vma_adjust failed. */
- if (new->vm_ops && new->vm_ops->close)
- new->vm_ops->close(new);
- if (new->vm_file)
- fput(new->vm_file);
- unlink_anon_vmas(new);
- out_free_mpol:
+out_free_mpol:
mpol_put(vma_policy(new));
- out_free_vma:
+out_free_vmi:
+ vma_iter_free(vmi);
+out_free_vma:
vm_area_free(new);
- validate_mm_mt(mm);
+ validate_mm_mt(vma->vm_mm);
return err;
}
@@ -2260,13 +2233,13 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
* Split a vma into two pieces at address 'addr', a new vma is allocated
* either for the first part or the tail.
*/
-int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long addr, int new_below)
{
- if (mm->map_count >= sysctl_max_map_count)
+ if (vma->vm_mm->map_count >= sysctl_max_map_count)
return -ENOMEM;
- return __split_vma(mm, vma, addr, new_below);
+ return __split_vma(vmi, vma, addr, new_below);
}
static inline int munmap_sidetree(struct vm_area_struct *vma,
@@ -2283,19 +2256,19 @@ static inline int munmap_sidetree(struct vm_area_struct *vma,
}
/*
- * do_mas_align_munmap() - munmap the aligned region from @start to @end.
- * @mas: The maple_state, ideally set up to alter the correct tree location.
+ * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
+ * @vmi: The vma iterator
* @vma: The starting vm_area_struct
* @mm: The mm_struct
* @start: The aligned start address to munmap.
* @end: The aligned end address to munmap.
* @uf: The userfaultfd list_head
- * @downgrade: Set to true to attempt a write downgrade of the mmap_sem
+ * @downgrade: Set to true to attempt a write downgrade of the mmap_lock
*
* If @downgrade is true, check return code for potential release of the lock.
*/
static int
-do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct mm_struct *mm, unsigned long start,
unsigned long end, struct list_head *uf, bool downgrade)
{
@@ -2307,10 +2280,6 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN);
mt_set_external_lock(&mt_detach, &mm->mmap_lock);
- if (mas_preallocate(mas, vma, GFP_KERNEL))
- return -ENOMEM;
-
- mas->last = end - 1;
/*
* If we need to split any vma, do it now to save pain later.
*
@@ -2330,45 +2299,27 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
goto map_count_exceeded;
- /*
- * mas_pause() is not needed since mas->index needs to be set
- * differently than vma->vm_end anyways.
- */
- error = __split_vma(mm, vma, start, 0);
+ error = __split_vma(vmi, vma, start, 0);
if (error)
goto start_split_failed;
- mas_set(mas, start);
- vma = mas_walk(mas);
+ vma = vma_iter_load(vmi);
}
- prev = mas_prev(mas, 0);
+ prev = vma_prev(vmi);
if (unlikely((!prev)))
- mas_set(mas, start);
+ vma_iter_set(vmi, start);
/*
* Detach a range of VMAs from the mm. Using next as a temp variable as
* it is always overwritten.
*/
- mas_for_each(mas, next, end - 1) {
+ for_each_vma_range(*vmi, next, end) {
/* Does it split the end? */
if (next->vm_end > end) {
- struct vm_area_struct *split;
-
- error = __split_vma(mm, next, end, 1);
+ error = __split_vma(vmi, next, end, 0);
if (error)
goto end_split_failed;
-
- mas_set(mas, end);
- split = mas_prev(mas, 0);
- error = munmap_sidetree(split, &mas_detach);
- if (error)
- goto munmap_sidetree_failed;
-
- count++;
- if (vma == next)
- vma = split;
- break;
}
error = munmap_sidetree(next, &mas_detach);
if (error)
@@ -2381,9 +2332,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
#endif
}
- if (!next)
- next = mas_next(mas, ULONG_MAX);
-
+ next = vma_next(vmi);
if (unlikely(uf)) {
/*
* If userfaultfd_unmap_prep returns an error the vmas
@@ -2400,8 +2349,6 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
goto userfaultfd_error;
}
- /* Point of no return */
- mas_set_range(mas, start, end - 1);
#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
/* Make sure no VMAs are about to be lost. */
{
@@ -2409,19 +2356,23 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
struct vm_area_struct *vma_mas, *vma_test;
int test_count = 0;
+ vma_iter_set(vmi, start);
rcu_read_lock();
vma_test = mas_find(&test, end - 1);
- mas_for_each(mas, vma_mas, end - 1) {
+ for_each_vma_range(*vmi, vma_mas, end) {
BUG_ON(vma_mas != vma_test);
test_count++;
vma_test = mas_next(&test, end - 1);
}
rcu_read_unlock();
BUG_ON(count != test_count);
- mas_set_range(mas, start, end - 1);
}
#endif
- mas_store_prealloc(mas, NULL);
+ /* Point of no return */
+ vma_iter_set(vmi, start);
+ if (vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL))
+ return -ENOMEM;
+
mm->map_count -= count;
/*
* Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
@@ -2437,7 +2388,11 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
mmap_write_downgrade(mm);
}
- unmap_region(mm, &mt_detach, vma, prev, next, start, end);
+ /*
+ * We can free page tables without write-locking mmap_lock because VMAs
+ * were isolated before we downgraded mmap_lock.
+ */
+ unmap_region(mm, &mt_detach, vma, prev, next, start, end, !downgrade);
/* Statistics and freeing VMAs */
mas_set(&mas_detach, start);
remove_mt(mm, &mas_detach);
@@ -2453,19 +2408,18 @@ end_split_failed:
__mt_destroy(&mt_detach);
start_split_failed:
map_count_exceeded:
- mas_destroy(mas);
return error;
}
/*
- * do_mas_munmap() - munmap a given range.
- * @mas: The maple state
+ * do_vmi_munmap() - munmap a given range.
+ * @vmi: The vma iterator
* @mm: The mm_struct
* @start: The start address to munmap
* @len: The length of the range to munmap
* @uf: The userfaultfd list_head
* @downgrade: set to true if the user wants to attempt to write_downgrade the
- * mmap_sem
+ * mmap_lock
*
* This function takes a @mas that is either pointing to the previous VMA or set
* to MA_START and sets it up to remove the mapping(s). The @len will be
@@ -2473,7 +2427,7 @@ map_count_exceeded:
*
* Returns: -EINVAL on failure, 1 on success and unlock, 0 otherwise.
*/
-int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm,
+int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
unsigned long start, size_t len, struct list_head *uf,
bool downgrade)
{
@@ -2491,11 +2445,11 @@ int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm,
arch_unmap(mm, start, end);
/* Find the first overlapping VMA */
- vma = mas_find(mas, end - 1);
+ vma = vma_find(vmi, end);
if (!vma)
return 0;
- return do_mas_align_munmap(mas, vma, mm, start, end, uf, downgrade);
+ return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade);
}
/* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls.
@@ -2507,9 +2461,9 @@ int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm,
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
struct list_head *uf)
{
- MA_STATE(mas, &mm->mm_mt, start, start);
+ VMA_ITERATOR(vmi, mm, start);
- return do_mas_munmap(&mas, mm, start, len, uf, false);
+ return do_vmi_munmap(&vmi, mm, start, len, uf, false);
}
unsigned long mmap_region(struct file *file, unsigned long addr,
@@ -2525,7 +2479,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long merge_start = addr, merge_end = end;
pgoff_t vm_pgoff;
int error;
- MA_STATE(mas, &mm->mm_mt, addr, end - 1);
+ VMA_ITERATOR(vmi, mm, addr);
/* Check against address space limit. */
if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
@@ -2543,7 +2497,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
}
/* Unmap any existing mapping in the area */
- if (do_mas_munmap(&mas, mm, addr, len, uf, false))
+ if (do_vmi_munmap(&vmi, mm, addr, len, uf, false))
return -ENOMEM;
/*
@@ -2556,8 +2510,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vm_flags |= VM_ACCOUNT;
}
- next = mas_next(&mas, ULONG_MAX);
- prev = mas_prev(&mas, 0);
+ next = vma_next(&vmi);
+ prev = vma_prev(&vmi);
if (vm_flags & VM_SPECIAL)
goto cannot_expand;
@@ -2585,13 +2539,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
/* Actually expand, if possible */
if (vma &&
- !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) {
+ !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) {
khugepaged_enter_vma(vma, vm_flags);
goto expanded;
}
- mas.index = addr;
- mas.last = end - 1;
cannot_expand:
/*
* Determine the object being mapped and call the appropriate
@@ -2604,9 +2556,10 @@ cannot_expand:
goto unacct_error;
}
+ vma_iter_set(&vmi, addr);
vma->vm_start = addr;
vma->vm_end = end;
- vma->vm_flags = vm_flags;
+ vm_flags_init(vma, vm_flags);
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
@@ -2626,19 +2579,20 @@ cannot_expand:
* Expansion is handled above, merging is handled below.
* Drivers should not alter the address of the VMA.
*/
- if (WARN_ON((addr != vma->vm_start))) {
- error = -EINVAL;
+ error = -EINVAL;
+ if (WARN_ON((addr != vma->vm_start)))
goto close_and_free_vma;
- }
- mas_reset(&mas);
+ vma_iter_set(&vmi, addr);
/*
* If vm_flags changed after call_mmap(), we should try merge
* vma again as we may succeed this time.
*/
if (unlikely(vm_flags != vma->vm_flags && prev)) {
- merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
- NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
+ merge = vma_merge(&vmi, mm, prev, vma->vm_start,
+ vma->vm_end, vma->vm_flags, NULL,
+ vma->vm_file, vma->vm_pgoff, NULL,
+ NULL_VM_UFFD_CTX, NULL);
if (merge) {
/*
* ->mmap() can change vma->vm_file and fput
@@ -2665,9 +2619,8 @@ cannot_expand:
vma_set_anonymous(vma);
}
- /* Allow architectures to sanity-check the vm_flags */
- if (!arch_validate_flags(vma->vm_flags)) {
- error = -EINVAL;
+ if (map_deny_write_exec(vma, vma->vm_flags)) {
+ error = -EACCES;
if (file)
goto close_and_free_vma;
else if (vma->vm_file)
@@ -2676,20 +2629,19 @@ cannot_expand:
goto free_vma;
}
- if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
- error = -ENOMEM;
- if (file)
- goto close_and_free_vma;
- else if (vma->vm_file)
- goto unmap_and_free_vma;
- else
- goto free_vma;
- }
+ /* Allow architectures to sanity-check the vm_flags */
+ error = -EINVAL;
+ if (!arch_validate_flags(vma->vm_flags))
+ goto close_and_free_vma;
+
+ error = -ENOMEM;
+ if (vma_iter_prealloc(&vmi))
+ goto close_and_free_vma;
if (vma->vm_file)
i_mmap_lock_write(vma->vm_file->f_mapping);
- vma_mas_store(vma, &mas);
+ vma_iter_store(&vmi, vma);
mm->map_count++;
if (vma->vm_file) {
if (vma->vm_flags & VM_SHARED)
@@ -2720,7 +2672,7 @@ expanded:
if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm))
- vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ vm_flags_clear(vma, VM_LOCKED_MASK);
else
mm->locked_vm += (len >> PAGE_SHIFT);
}
@@ -2735,7 +2687,7 @@ expanded:
* then new mapped in-place (which must be aimed as
* a completely new data area).
*/
- vma->vm_flags |= VM_SOFTDIRTY;
+ vm_flags_set(vma, VM_SOFTDIRTY);
vma_set_page_prot(vma);
@@ -2743,14 +2695,18 @@ expanded:
return addr;
close_and_free_vma:
- if (vma->vm_ops && vma->vm_ops->close)
+ if (file && vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
+
+ if (file || vma->vm_file) {
unmap_and_free_vma:
- fput(vma->vm_file);
- vma->vm_file = NULL;
+ fput(vma->vm_file);
+ vma->vm_file = NULL;
- /* Undo any partial mapping done by a device driver. */
- unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end);
+ /* Undo any partial mapping done by a device driver. */
+ unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start,
+ vma->vm_end, true);
+ }
if (file && (vm_flags & VM_SHARED))
mapping_unmap_writable(file->f_mapping);
free_vma:
@@ -2767,12 +2723,12 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
int ret;
struct mm_struct *mm = current->mm;
LIST_HEAD(uf);
- MA_STATE(mas, &mm->mm_mt, start, start);
+ VMA_ITERATOR(vmi, mm, start);
if (mmap_write_lock_killable(mm))
return -EINTR;
- ret = do_mas_munmap(&mas, mm, start, len, &uf, downgrade);
+ ret = do_vmi_munmap(&vmi, mm, start, len, &uf, downgrade);
/*
* Returning 1 indicates mmap_lock is downgraded.
* But 1 is not legal return value of vm_munmap() and munmap(), reset
@@ -2885,33 +2841,34 @@ out:
}
/*
- * brk_munmap() - Unmap a parital vma.
- * @mas: The maple tree state.
- * @vma: The vma to be modified
- * @newbrk: the start of the address to unmap
- * @oldbrk: The end of the address to unmap
+ * do_vma_munmap() - Unmap a full or partial vma.
+ * @vmi: The vma iterator pointing at the vma
+ * @vma: The first vma to be munmapped
+ * @start: the start of the address to unmap
+ * @end: The end of the address to unmap
* @uf: The userfaultfd list_head
+ * @downgrade: Attempt to downgrade or not
*
- * Returns: 1 on success.
- * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if
- * possible.
+ * Returns: 0 on success and not downgraded, 1 on success and downgraded.
+ * unmaps a VMA mapping when the vma iterator is already in position.
+ * Does not handle alignment.
*/
-static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
- unsigned long newbrk, unsigned long oldbrk,
- struct list_head *uf)
+int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct list_head *uf, bool downgrade)
{
struct mm_struct *mm = vma->vm_mm;
int ret;
- arch_unmap(mm, newbrk, oldbrk);
- ret = do_mas_align_munmap(mas, vma, mm, newbrk, oldbrk, uf, true);
+ arch_unmap(mm, start, end);
+ ret = do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade);
validate_mm_mt(mm);
return ret;
}
/*
* do_brk_flags() - Increase the brk vma if the flags match.
- * @mas: The maple tree state.
+ * @vmi: The vma iterator
* @addr: The start address
* @len: The length of the increase
* @vma: The vma,
@@ -2921,10 +2878,11 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
* do not match then create a new anonymous VMA. Eventually we may be able to
* do some brk-specific accounting here.
*/
-static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
+static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long addr, unsigned long len, unsigned long flags)
{
struct mm_struct *mm = current->mm;
+ struct vma_prepare vp;
validate_mm_mt(mm);
/*
@@ -2948,23 +2906,17 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
if (vma && vma->vm_end == addr && !vma_policy(vma) &&
can_vma_merge_after(vma, flags, NULL, NULL,
addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) {
- mas_set_range(mas, vma->vm_start, addr + len - 1);
- if (mas_preallocate(mas, vma, GFP_KERNEL))
+ if (vma_iter_prealloc(vmi))
goto unacct_fail;
vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
- if (vma->anon_vma) {
- anon_vma_lock_write(vma->anon_vma);
- anon_vma_interval_tree_pre_update_vma(vma);
- }
+ init_vma_prep(&vp, vma);
+ vma_prepare(&vp);
vma->vm_end = addr + len;
- vma->vm_flags |= VM_SOFTDIRTY;
- mas_store_prealloc(mas, vma);
+ vm_flags_set(vma, VM_SOFTDIRTY);
+ vma_iter_store(vmi, vma);
- if (vma->anon_vma) {
- anon_vma_interval_tree_post_update_vma(vma);
- anon_vma_unlock_write(vma->anon_vma);
- }
+ vma_complete(&vp, vmi, mm);
khugepaged_enter_vma(vma, flags);
goto out;
}
@@ -2978,10 +2930,9 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_pgoff = addr >> PAGE_SHIFT;
- vma->vm_flags = flags;
+ vm_flags_init(vma, flags);
vma->vm_page_prot = vm_get_page_prot(flags);
- mas_set_range(mas, vma->vm_start, addr + len - 1);
- if (mas_store_gfp(mas, vma, GFP_KERNEL))
+ if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
goto mas_store_fail;
mm->map_count++;
@@ -2991,7 +2942,7 @@ out:
mm->data_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
- vma->vm_flags |= VM_SOFTDIRTY;
+ vm_flags_set(vma, VM_SOFTDIRTY);
validate_mm(mm);
return 0;
@@ -3010,7 +2961,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
int ret;
bool populate;
LIST_HEAD(uf);
- MA_STATE(mas, &mm->mm_mt, addr, addr);
+ VMA_ITERATOR(vmi, mm, addr);
len = PAGE_ALIGN(request);
if (len < request)
@@ -3029,12 +2980,12 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
if (ret)
goto limits_failed;
- ret = do_mas_munmap(&mas, mm, addr, len, &uf, 0);
+ ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0);
if (ret)
goto munmap_failed;
- vma = mas_prev(&mas, 0);
- ret = do_brk_flags(&mas, vma, addr, len, flags);
+ vma = vma_prev(&vmi);
+ ret = do_brk_flags(&vmi, vma, addr, len, flags);
populate = ((mm->def_flags & VM_LOCKED) != 0);
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
@@ -3082,7 +3033,7 @@ void exit_mmap(struct mm_struct *mm)
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
- unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX);
+ unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX, false);
mmap_read_unlock(mm);
/*
@@ -3170,6 +3121,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma, *prev;
bool faulted_in_anon_vma = true;
+ VMA_ITERATOR(vmi, mm, addr);
validate_mm_mt(mm);
/*
@@ -3185,7 +3137,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
if (new_vma && new_vma->vm_start < addr + len)
return NULL; /* should never get here */
- new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
+ new_vma = vma_merge(&vmi, mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (new_vma) {
@@ -3390,8 +3342,8 @@ static struct vm_area_struct *__install_special_mapping(
vma->vm_start = addr;
vma->vm_end = addr + len;
- vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
- vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ vm_flags_init(vma, (vm_flags | mm->def_flags |
+ VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK);
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
vma->vm_ops = ops;
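
The other recurring change in mmap.c is that direct stores to vma->vm_flags give way to the vm_flags accessors, and VM_LOCKED_CLEAR_MASK gives way to masking out VM_LOCKED_MASK. A short sketch of how the four helpers used in this patch divide the work; the wrapper function is hypothetical:

/* vm_flags_init() is for a VMA not yet visible to rmap or the tree;
 * the other helpers expect mmap_lock held for write. Sketch only. */
static void sketch_vm_flags(struct vm_area_struct *vma,
			    vm_flags_t initial, vm_flags_t final)
{
	vm_flags_init(vma, initial & ~VM_LOCKED_MASK);	/* brand-new VMA */
	vm_flags_set(vma, VM_SOFTDIRTY);		/* OR bits in */
	vm_flags_clear(vma, VM_ACCOUNT);		/* clear bits */
	vm_flags_reset(vma, final);			/* replace wholesale */
}
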
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index f45ff1b7626a..50c0dde1354f 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -1120,13 +1120,3 @@ void mmu_notifier_synchronize(void)
synchronize_srcu(&srcu);
}
EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
-
-bool
-mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range)
-{
- if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA)
- return false;
- /* Return true if the vma still have the read flag set. */
- return range->vma->vm_flags & VM_READ;
-}
-EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 908df12caa26..231929f119d9 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -80,13 +80,13 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
return pte_dirty(pte);
}
-static unsigned long change_pte_range(struct mmu_gather *tlb,
+static long change_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
pte_t *pte, oldpte;
spinlock_t *ptl;
- unsigned long pages = 0;
+ long pages = 0;
int target_node = NUMA_NO_NODE;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
@@ -177,12 +177,10 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
oldpte = ptep_modify_prot_start(vma, addr, pte);
ptent = pte_modify(oldpte, newprot);
- if (uffd_wp) {
- ptent = pte_wrprotect(ptent);
+ if (uffd_wp)
ptent = pte_mkuffd_wp(ptent);
- } else if (uffd_wp_resolve) {
+ else if (uffd_wp_resolve)
ptent = pte_clear_uffd_wp(ptent);
- }
/*
* In some writable, shared mappings, we might want
@@ -245,7 +243,13 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
newpte = pte_swp_mksoft_dirty(newpte);
if (pte_swp_uffd_wp(oldpte))
newpte = pte_swp_mkuffd_wp(newpte);
- } else if (pte_marker_entry_uffd_wp(entry)) {
+ } else if (is_pte_marker_entry(entry)) {
+ /*
+ * Ignore swapin errors unconditionally,
+ * because any access should sigbus anyway.
+ */
+ if (is_swapin_error_entry(entry))
+ continue;
/*
* If this is uffd-wp pte marker and we'd like
* to unprotect it, drop it; the next page
@@ -326,36 +330,42 @@ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags)
/*
* If wr-protecting the range for file-backed, populate pgtable for the case
* when pgtable is empty but page cache exists. When {pte|pmd|...}_alloc()
- * failed it means no memory, we don't have a better option but stop.
+ * failed we treat it the same way as pgtable allocation failures during
+ * page faults by kicking OOM and returning error.
*/
#define change_pmd_prepare(vma, pmd, cp_flags) \
- do { \
+ ({ \
+ long err = 0; \
if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \
- if (WARN_ON_ONCE(pte_alloc(vma->vm_mm, pmd))) \
- break; \
+ if (pte_alloc(vma->vm_mm, pmd)) \
+ err = -ENOMEM; \
} \
- } while (0)
+ err; \
+ })
+
/*
* This is the general pud/p4d/pgd version of change_pmd_prepare(). We need to
* have separate change_pmd_prepare() because pte_alloc() returns 0 on success,
* while {pmd|pud|p4d}_alloc() returns the valid pointer on success.
*/
#define change_prepare(vma, high, low, addr, cp_flags) \
- do { \
+ ({ \
+ long err = 0; \
if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \
low##_t *p = low##_alloc(vma->vm_mm, high, addr); \
- if (WARN_ON_ONCE(p == NULL)) \
- break; \
+ if (p == NULL) \
+ err = -ENOMEM; \
} \
- } while (0)
+ err; \
+ })
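
Converting change_pmd_prepare()/change_prepare() from do { } while (0) to GNU statement expressions is what lets them hand an -ENOMEM back to the caller: a ({ ... }) block evaluates to its final expression. A small stand-alone (userspace) illustration of the idiom, not taken from the kernel:

#include <errno.h>
#include <stdlib.h>

/* Evaluates to 0 on success or -ENOMEM, like change_prepare() above. */
#define alloc_or_err(pp, size)			\
	({					\
		int __err = 0;			\
		*(pp) = malloc(size);		\
		if (*(pp) == NULL)		\
			__err = -ENOMEM;	\
		__err;				\
	})

static int example(void)
{
	void *buf;
	int ret = alloc_or_err(&buf, 4096);

	if (ret)
		return ret;
	free(buf);
	return 0;
}
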
-static inline unsigned long change_pmd_range(struct mmu_gather *tlb,
+static inline long change_pmd_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pud_t *pud, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
pmd_t *pmd;
unsigned long next;
- unsigned long pages = 0;
+ long pages = 0;
unsigned long nr_huge_updates = 0;
struct mmu_notifier_range range;
@@ -363,11 +373,15 @@ static inline unsigned long change_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
- unsigned long this_pages;
+ long ret;
next = pmd_addr_end(addr, end);
- change_pmd_prepare(vma, pmd, cp_flags);
+ ret = change_pmd_prepare(vma, pmd, cp_flags);
+ if (ret) {
+ pages = ret;
+ break;
+ }
/*
* Automatic NUMA balancing walks the tables with mmap_lock
* held for read. It's possible a parallel update to occur
@@ -384,7 +398,7 @@ static inline unsigned long change_pmd_range(struct mmu_gather *tlb,
if (!range.start) {
mmu_notifier_range_init(&range,
MMU_NOTIFY_PROTECTION_VMA, 0,
- vma, vma->vm_mm, addr, end);
+ vma->vm_mm, addr, end);
mmu_notifier_invalidate_range_start(&range);
}
@@ -397,7 +411,11 @@ static inline unsigned long change_pmd_range(struct mmu_gather *tlb,
* cleared; make sure pmd populated if
* necessary, then fall-through to pte level.
*/
- change_pmd_prepare(vma, pmd, cp_flags);
+ ret = change_pmd_prepare(vma, pmd, cp_flags);
+ if (ret) {
+ pages = ret;
+ break;
+ }
} else {
/*
* change_huge_pmd() does not defer TLB flushes,
@@ -418,9 +436,8 @@ static inline unsigned long change_pmd_range(struct mmu_gather *tlb,
}
/* fall through, the trans huge pmd just split */
}
- this_pages = change_pte_range(tlb, vma, pmd, addr, next,
- newprot, cp_flags);
- pages += this_pages;
+ pages += change_pte_range(tlb, vma, pmd, addr, next,
+ newprot, cp_flags);
next:
cond_resched();
} while (pmd++, addr = next, addr != end);
@@ -433,18 +450,20 @@ next:
return pages;
}
-static inline unsigned long change_pud_range(struct mmu_gather *tlb,
+static inline long change_pud_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
pud_t *pud;
unsigned long next;
- unsigned long pages = 0;
+ long pages = 0, ret;
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
- change_prepare(vma, pud, pmd, addr, cp_flags);
+ ret = change_prepare(vma, pud, pmd, addr, cp_flags);
+ if (ret)
+ return ret;
if (pud_none_or_clear_bad(pud))
continue;
pages += change_pmd_range(tlb, vma, pud, addr, next, newprot,
@@ -454,18 +473,20 @@ static inline unsigned long change_pud_range(struct mmu_gather *tlb,
return pages;
}
-static inline unsigned long change_p4d_range(struct mmu_gather *tlb,
+static inline long change_p4d_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
p4d_t *p4d;
unsigned long next;
- unsigned long pages = 0;
+ long pages = 0, ret;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
- change_prepare(vma, p4d, pud, addr, cp_flags);
+ ret = change_prepare(vma, p4d, pud, addr, cp_flags);
+ if (ret)
+ return ret;
if (p4d_none_or_clear_bad(p4d))
continue;
pages += change_pud_range(tlb, vma, p4d, addr, next, newprot,
@@ -475,21 +496,25 @@ static inline unsigned long change_p4d_range(struct mmu_gather *tlb,
return pages;
}
-static unsigned long change_protection_range(struct mmu_gather *tlb,
+static long change_protection_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
unsigned long next;
- unsigned long pages = 0;
+ long pages = 0, ret;
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
tlb_start_vma(tlb, vma);
do {
next = pgd_addr_end(addr, end);
- change_prepare(vma, pgd, p4d, addr, cp_flags);
+ ret = change_prepare(vma, pgd, p4d, addr, cp_flags);
+ if (ret) {
+ pages = ret;
+ break;
+ }
if (pgd_none_or_clear_bad(pgd))
continue;
pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot,
@@ -501,15 +526,27 @@ static unsigned long change_protection_range(struct mmu_gather *tlb,
return pages;
}
-unsigned long change_protection(struct mmu_gather *tlb,
+long change_protection(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start,
- unsigned long end, pgprot_t newprot,
- unsigned long cp_flags)
+ unsigned long end, unsigned long cp_flags)
{
- unsigned long pages;
+ pgprot_t newprot = vma->vm_page_prot;
+ long pages;
BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
+#ifdef CONFIG_NUMA_BALANCING
+ /*
+ * Ordinary protection updates (mprotect, uffd-wp, softdirty tracking)
+ * are expected to reflect their requirements via VMA flags such that
+ * vma_set_page_prot() will adjust vma->vm_page_prot accordingly.
+ */
+ if (cp_flags & MM_CP_PROT_NUMA)
+ newprot = PAGE_NONE;
+#else
+ WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA);
+#endif
+
if (is_vm_hugetlb_page(vma))
pages = hugetlb_change_protection(vma, start, end, newprot,
cp_flags);
@@ -548,9 +585,9 @@ static const struct mm_walk_ops prot_none_walk_ops = {
};
int
-mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
- struct vm_area_struct **pprev, unsigned long start,
- unsigned long end, unsigned long newflags)
+mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
+ struct vm_area_struct *vma, struct vm_area_struct **pprev,
+ unsigned long start, unsigned long end, unsigned long newflags)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long oldflags = vma->vm_flags;
@@ -605,7 +642,7 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
* First try to merge with previous and/or next vma.
*/
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- *pprev = vma_merge(mm, *pprev, start, end, newflags,
+ *pprev = vma_merge(vmi, mm, *pprev, start, end, newflags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (*pprev) {
@@ -617,13 +654,13 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
*pprev = vma;
if (start != vma->vm_start) {
- error = split_vma(mm, vma, start, 1);
+ error = split_vma(vmi, vma, start, 1);
if (error)
goto fail;
}
if (end != vma->vm_end) {
- error = split_vma(mm, vma, end, 0);
+ error = split_vma(vmi, vma, end, 0);
if (error)
goto fail;
}
@@ -633,12 +670,12 @@ success:
* vm_flags and vm_page_prot are protected by the mmap_lock
* held in write mode.
*/
- vma->vm_flags = newflags;
+ vm_flags_reset(vma, newflags);
if (vma_wants_manual_pte_write_upgrade(vma))
mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
vma_set_page_prot(vma);
- change_protection(tlb, vma, start, end, vma->vm_page_prot, mm_cp_flags);
+ change_protection(tlb, vma, start, end, mm_cp_flags);
/*
* Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
@@ -672,7 +709,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
(prot & PROT_READ);
struct mmu_gather tlb;
- MA_STATE(mas, &current->mm->mm_mt, 0, 0);
+ struct vma_iterator vmi;
start = untagged_addr(start);
@@ -704,8 +741,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
goto out;
- mas_set(&mas, start);
- vma = mas_find(&mas, ULONG_MAX);
+ vma_iter_init(&vmi, current->mm, start);
+ vma = vma_find(&vmi, end);
error = -ENOMEM;
if (!vma)
goto out;
@@ -728,18 +765,22 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
}
}
+ prev = vma_prev(&vmi);
if (start > vma->vm_start)
prev = vma;
- else
- prev = mas_prev(&mas, 0);
tlb_gather_mmu(&tlb, current->mm);
- for (nstart = start ; ; ) {
+ nstart = start;
+ tmp = vma->vm_start;
+ for_each_vma_range(vmi, vma, end) {
unsigned long mask_off_old_flags;
unsigned long newflags;
int new_vma_pkey;
- /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
+ if (vma->vm_start != tmp) {
+ error = -ENOMEM;
+ break;
+ }
/* Does the application expect PROT_READ to imply PROT_EXEC */
if (rier && (vma->vm_flags & VM_MAYEXEC))
@@ -762,6 +803,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
break;
}
+ if (map_deny_write_exec(vma, newflags)) {
+ error = -EACCES;
+ goto out;
+ }
+
/* Allow architectures to sanity-check the new flags */
if (!arch_validate_flags(newflags)) {
error = -EINVAL;
@@ -782,25 +828,19 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
break;
}
- error = mprotect_fixup(&tlb, vma, &prev, nstart, tmp, newflags);
+ error = mprotect_fixup(&vmi, &tlb, vma, &prev, nstart, tmp, newflags);
if (error)
break;
+ tmp = vma_iter_end(&vmi);
nstart = tmp;
-
- if (nstart < prev->vm_end)
- nstart = prev->vm_end;
- if (nstart >= end)
- break;
-
- vma = find_vma(current->mm, prev->vm_end);
- if (!vma || vma->vm_start != nstart) {
- error = -ENOMEM;
- break;
- }
prot = reqprot;
}
tlb_finish_mmu(&tlb);
+
+ if (vma_iter_end(&vmi) < end)
+ error = -ENOMEM;
+
out:
mmap_write_unlock(current->mm);
return error;
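
The converted do_mprotect_pkey() above walks the target range with the VMA iterator and detects holes by comparing each VMA's start against the end of the previous one. A minimal sketch of that walking pattern, using only the vma_iterator helpers visible in this hunk (VMA_ITERATOR, for_each_vma_range); illustrative, not the mprotect code itself:

	#include <linux/mm.h>

	/*
	 * Visit every VMA overlapping [start, end) and report -ENOMEM if the
	 * range contains an unmapped hole, mirroring the gap check done with
	 * "tmp" in the hunk above.
	 */
	static int check_range_fully_mapped(struct mm_struct *mm,
					    unsigned long start,
					    unsigned long end)
	{
		VMA_ITERATOR(vmi, mm, start);
		struct vm_area_struct *vma;
		unsigned long expected = start;

		mmap_assert_locked(mm);

		for_each_vma_range(vmi, vma, end) {
			if (vma->vm_start > expected)	/* hole before this VMA */
				return -ENOMEM;
			expected = vma->vm_end;
		}

		return expected < end ? -ENOMEM : 0;	/* hole at the tail */
	}
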
diff --git a/mm/mremap.c b/mm/mremap.c
index fe587c5d6591..411a85682b58 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -498,7 +498,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
new_addr, len);
flush_cache_range(vma, old_addr, old_end);
- mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
old_addr, old_end);
mmu_notifier_invalidate_range_start(&range);
@@ -580,11 +580,12 @@ static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long vm_flags = vma->vm_flags;
unsigned long new_pgoff;
unsigned long moved_len;
- unsigned long excess = 0;
+ unsigned long account_start = 0;
+ unsigned long account_end = 0;
unsigned long hiwater_vm;
- int split = 0;
int err = 0;
bool need_rmap_locks;
+ struct vma_iterator vmi;
/*
* We'd prefer to avoid failure later on in do_munmap:
@@ -661,11 +662,11 @@ static unsigned long move_vma(struct vm_area_struct *vma,
/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
- vma->vm_flags &= ~VM_ACCOUNT;
- excess = vma->vm_end - vma->vm_start - old_len;
- if (old_addr > vma->vm_start &&
- old_addr + old_len < vma->vm_end)
- split = 1;
+ vm_flags_clear(vma, VM_ACCOUNT);
+ if (vma->vm_start < old_addr)
+ account_start = vma->vm_start;
+ if (vma->vm_end > old_addr + old_len)
+ account_end = vma->vm_end;
}
/*
@@ -686,7 +687,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
/* We always clear VM_LOCKED[ONFAULT] on the old vma */
- vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ vm_flags_clear(vma, VM_LOCKED_MASK);
/*
* anon_vma links of the old vma is no longer needed after its page
@@ -700,11 +701,12 @@ static unsigned long move_vma(struct vm_area_struct *vma,
return new_addr;
}
- if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
+ vma_iter_init(&vmi, mm, old_addr);
+ if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
/* OOM: unable to split vma, just get accounts right */
if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
vm_acct_memory(old_len >> PAGE_SHIFT);
- excess = 0;
+ account_start = account_end = 0;
}
if (vm_flags & VM_LOCKED) {
@@ -715,10 +717,14 @@ static unsigned long move_vma(struct vm_area_struct *vma,
mm->hiwater_vm = hiwater_vm;
/* Restore VM_ACCOUNT if one or two pieces of vma left */
- if (excess) {
- vma->vm_flags |= VM_ACCOUNT;
- if (split)
- find_vma(mm, vma->vm_end)->vm_flags |= VM_ACCOUNT;
+ if (account_start) {
+ vma = vma_prev(&vmi);
+ vm_flags_set(vma, VM_ACCOUNT);
+ }
+
+ if (account_end) {
+ vma = vma_next(&vmi);
+ vm_flags_set(vma, VM_ACCOUNT);
}
return new_addr;
@@ -978,14 +984,14 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/*
* Always allow a shrinking remap: that just unmaps
* the unnecessary pages..
- * do_mas_munmap does all the needed commit accounting, and
+ * do_vmi_munmap does all the needed commit accounting, and
* downgrades mmap_lock to read if so directed.
*/
if (old_len >= new_len) {
int retval;
- MA_STATE(mas, &mm->mm_mt, addr + new_len, addr + new_len);
+ VMA_ITERATOR(vmi, mm, addr + new_len);
- retval = do_mas_munmap(&mas, mm, addr + new_len,
+ retval = do_vmi_munmap(&vmi, mm, addr + new_len,
old_len - new_len, &uf_unmap, true);
/* Returning 1 indicates mmap_lock is downgraded to read. */
if (retval == 1) {
@@ -1018,6 +1024,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
unsigned long extension_end = addr + new_len;
pgoff_t extension_pgoff = vma->vm_pgoff +
((extension_start - vma->vm_start) >> PAGE_SHIFT);
+ VMA_ITERATOR(vmi, mm, extension_start);
if (vma->vm_flags & VM_ACCOUNT) {
if (security_vm_enough_memory_mm(mm, pages)) {
@@ -1027,16 +1034,29 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}
/*
- * Function vma_merge() is called on the extension we are adding to
- * the already existing vma, vma_merge() will merge this extension with
- * the already existing vma (expand operation itself) and possibly also
- * with the next vma if it becomes adjacent to the expanded vma and
- * otherwise compatible.
+ * Function vma_merge() is called on the extension we
+ * are adding to the already existing vma, vma_merge()
+ * will merge this extension with the already existing
+ * vma (expand operation itself) and possibly also with
+ * the next vma if it becomes adjacent to the expanded
+ * vma and otherwise compatible.
+ *
+ * However, vma_merge() can currently fail due to
+ * is_mergeable_vma() check for vm_ops->close (see the
+ * comment there). Yet this should not prevent vma
+ * expanding, so perform a simple expand for such vma.
+ * Ideally the check for close op should be only done
+ * when a vma would be actually removed due to a merge.
*/
- vma = vma_merge(mm, vma, extension_start, extension_end,
- vma->vm_flags, vma->anon_vma, vma->vm_file,
- extension_pgoff, vma_policy(vma),
+ if (!vma->vm_ops || !vma->vm_ops->close) {
+ vma = vma_merge(&vmi, mm, vma, extension_start,
+ extension_end, vma->vm_flags, vma->anon_vma,
+ vma->vm_file, extension_pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+ } else if (vma_expand(&vmi, vma, vma->vm_start,
+ addr + new_len, vma->vm_pgoff, NULL)) {
+ vma = NULL;
+ }
if (!vma) {
vm_unacct_memory(pages);
ret = -ENOMEM;
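
Both the shrinking-mremap path above and move_vma() now funnel unmapping through do_vmi_munmap(). Its prototype is not part of this diff; the sketch below is inferred from the call sites and from the "Returning 1 indicates mmap_lock is downgraded to read" comment, and do_vmi_munmap() itself is mm-internal:

	/* Sketch of the downgrade convention, assuming the inferred signature. */
	static long shrink_mapping(struct mm_struct *mm, unsigned long addr,
				   size_t len, struct list_head *uf)
	{
		VMA_ITERATOR(vmi, mm, addr);
		int ret;

		mmap_write_lock(mm);
		/* final argument: allow downgrading mmap_lock to read on success */
		ret = do_vmi_munmap(&vmi, mm, addr, len, uf, true);
		if (ret == 1) {			/* unmapped; lock already downgraded */
			mmap_read_unlock(mm);
			return 0;
		}
		mmap_write_unlock(mm);
		return ret;
	}
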
diff --git a/mm/nommu.c b/mm/nommu.c
index 214c70e1d059..57ba243c6a37 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -173,7 +173,7 @@ static void *__vmalloc_user_flags(unsigned long size, gfp_t flags)
mmap_write_lock(current->mm);
vma = find_vma(current->mm, (unsigned long)ret);
if (vma)
- vma->vm_flags |= VM_USERMAP;
+ vm_flags_set(vma, VM_USERMAP);
mmap_write_unlock(current->mm);
}
@@ -544,22 +544,8 @@ static void put_nommu_region(struct vm_region *region)
__put_nommu_region(region);
}
-void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas)
-{
- mas_set_range(mas, vma->vm_start, vma->vm_end - 1);
- mas_store_prealloc(mas, vma);
-}
-
-void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas)
-{
- mas->index = vma->vm_start;
- mas->last = vma->vm_end - 1;
- mas_store_prealloc(mas, NULL);
-}
-
static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm)
{
- mm->map_count++;
vma->vm_mm = mm;
/* add the VMA to the mapping */
@@ -574,43 +560,6 @@ static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm)
}
}
-/*
- * mas_add_vma_to_mm() - Maple state variant of add_mas_to_mm().
- * @mas: The maple state with preallocations.
- * @mm: The mm_struct
- * @vma: The vma to add
- *
- */
-static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm,
- struct vm_area_struct *vma)
-{
- BUG_ON(!vma->vm_region);
-
- setup_vma_to_mm(vma, mm);
-
- /* add the VMA to the tree */
- vma_mas_store(vma, mas);
-}
-
-/*
- * add a VMA into a process's mm_struct in the appropriate place in the list
- * and tree and add to the address space's page tree also if not an anonymous
- * page
- * - should be called with mm->mmap_lock held writelocked
- */
-static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
-{
- MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end);
-
- if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
- pr_warn("Allocation of vma tree for process %d failed\n",
- current->pid);
- return -ENOMEM;
- }
- mas_add_vma_to_mm(&mas, mm, vma);
- return 0;
-}
-
static void cleanup_vma_from_mm(struct vm_area_struct *vma)
{
vma->vm_mm->map_count--;
@@ -626,14 +575,15 @@ static void cleanup_vma_from_mm(struct vm_area_struct *vma)
i_mmap_unlock_write(mapping);
}
}
+
/*
* delete a VMA from its owning mm_struct and address space
*/
static int delete_vma_from_mm(struct vm_area_struct *vma)
{
- MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0);
+ VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start);
- if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ if (vma_iter_prealloc(&vmi)) {
pr_warn("Allocation of vma tree for process %d failed\n",
current->pid);
return -ENOMEM;
@@ -641,10 +591,9 @@ static int delete_vma_from_mm(struct vm_area_struct *vma)
cleanup_vma_from_mm(vma);
/* remove from the MM's tree and list */
- vma_mas_remove(vma, &mas);
+ vma_iter_clear(&vmi, vma->vm_start, vma->vm_end);
return 0;
}
-
/*
* destroy a VMA record
*/
@@ -675,9 +624,9 @@ EXPORT_SYMBOL(find_vma_intersection);
*/
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
- MA_STATE(mas, &mm->mm_mt, addr, addr);
+ VMA_ITERATOR(vmi, mm, addr);
- return mas_walk(&mas);
+ return vma_iter_load(&vmi);
}
EXPORT_SYMBOL(find_vma);
@@ -709,9 +658,9 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
{
struct vm_area_struct *vma;
unsigned long end = addr + len;
- MA_STATE(mas, &mm->mm_mt, addr, addr);
+ VMA_ITERATOR(vmi, mm, addr);
- vma = mas_walk(&mas);
+ vma = vma_iter_load(&vmi);
if (!vma)
return NULL;
if (vma->vm_start != addr)
@@ -892,29 +841,36 @@ static unsigned long determine_vm_flags(struct file *file,
unsigned long vm_flags;
vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
- /* vm_flags |= mm->def_flags; */
- if (!(capabilities & NOMMU_MAP_DIRECT)) {
- /* attempt to share read-only copies of mapped file chunks */
+ if (!file) {
+ /*
+ * MAP_ANONYMOUS. MAP_SHARED is mapped to MAP_PRIVATE, because
+ * there is no fork().
+ */
vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
- if (file && !(prot & PROT_WRITE))
- vm_flags |= VM_MAYSHARE;
+ } else if (flags & MAP_PRIVATE) {
+ /* MAP_PRIVATE file mapping */
+ if (capabilities & NOMMU_MAP_DIRECT)
+ vm_flags |= (capabilities & NOMMU_VMFLAGS);
+ else
+ vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+
+ if (!(prot & PROT_WRITE) && !current->ptrace)
+ /*
+ * R/O private file mapping which cannot be used to
+ * modify memory, especially also not via active ptrace
+ * (e.g., set breakpoints) or later by upgrading
+ * permissions (no mprotect()). We can try overlaying
+ * the file mapping, which will work e.g., on chardevs,
+ * ramfs/tmpfs/shmfs and romfs/cramfs.
+ */
+ vm_flags |= VM_MAYOVERLAY;
} else {
- /* overlay a shareable mapping on the backing device or inode
- * if possible - used for chardevs, ramfs/tmpfs/shmfs and
- * romfs/cramfs */
- vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS);
- if (flags & MAP_SHARED)
- vm_flags |= VM_SHARED;
+ /* MAP_SHARED file mapping: NOMMU_MAP_DIRECT is set. */
+ vm_flags |= VM_SHARED | VM_MAYSHARE |
+ (capabilities & NOMMU_VMFLAGS);
}
- /* refuse to let anyone share private mappings with this process if
- * it's being traced - otherwise breakpoints set in it may interfere
- * with another untraced process
- */
- if ((flags & MAP_PRIVATE) && current->ptrace)
- vm_flags &= ~VM_MAYSHARE;
-
return vm_flags;
}
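
The rewritten determine_vm_flags() above marks read-only private file mappings with VM_MAYOVERLAY instead of abusing VM_MAYSHARE, and the rest of the file tests the result through is_nommu_shared_mapping(). That helper is defined outside this diff; a plausible minimal form is shown only so the later hunks read naturally:

	/*
	 * Assumed shape of the helper used below: on !MMU a mapping counts as
	 * "shared" both when it may really be shared and when it merely
	 * overlays the backing object directly.
	 */
	static inline bool is_nommu_shared_mapping(vm_flags_t flags)
	{
		return flags & (VM_MAYSHARE | VM_MAYOVERLAY);
	}
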
@@ -952,15 +908,18 @@ static int do_mmap_private(struct vm_area_struct *vma,
void *base;
int ret, order;
- /* invoke the file's mapping function so that it can keep track of
- * shared mappings on devices or memory
- * - VM_MAYSHARE will be set if it may attempt to share
+ /*
+ * Invoke the file's mapping function so that it can keep track of
+ * shared mappings on devices or memory. VM_MAYOVERLAY will be set if
+ * it may attempt to share, which will make is_nommu_shared_mapping()
+ * happy.
*/
if (capabilities & NOMMU_MAP_DIRECT) {
ret = call_mmap(vma->vm_file, vma);
+ /* shouldn't return success if we're not sharing */
+ if (WARN_ON_ONCE(!is_nommu_shared_mapping(vma->vm_flags)))
+ ret = -ENOSYS;
if (ret == 0) {
- /* shouldn't return success if we're not sharing */
- BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
vma->vm_region->vm_top = vma->vm_region->vm_end;
return 0;
}
@@ -991,7 +950,8 @@ static int do_mmap_private(struct vm_area_struct *vma,
atomic_long_add(total, &mmap_pages_allocated);
- region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
+ vm_flags_set(vma, VM_MAPPED_COPY);
+ region->vm_flags = vma->vm_flags;
region->vm_start = (unsigned long) base;
region->vm_end = region->vm_start + len;
region->vm_top = region->vm_start + (total << PAGE_SHIFT);
@@ -1052,7 +1012,7 @@ unsigned long do_mmap(struct file *file,
vm_flags_t vm_flags;
unsigned long capabilities, result;
int ret;
- MA_STATE(mas, &current->mm->mm_mt, 0, 0);
+ VMA_ITERATOR(vmi, current->mm, 0);
*populate = 0;
@@ -1081,14 +1041,14 @@ unsigned long do_mmap(struct file *file,
if (!vma)
goto error_getting_vma;
- if (mas_preallocate(&mas, vma, GFP_KERNEL))
- goto error_maple_preallocate;
+ if (vma_iter_prealloc(&vmi))
+ goto error_vma_iter_prealloc;
region->vm_usage = 1;
region->vm_flags = vm_flags;
region->vm_pgoff = pgoff;
- vma->vm_flags = vm_flags;
+ vm_flags_init(vma, vm_flags);
vma->vm_pgoff = pgoff;
if (file) {
@@ -1106,7 +1066,7 @@ unsigned long do_mmap(struct file *file,
* these cases, sharing is handled in the driver or filesystem rather
* than here
*/
- if (vm_flags & VM_MAYSHARE) {
+ if (is_nommu_shared_mapping(vm_flags)) {
struct vm_region *pregion;
unsigned long pglen, rpglen, pgend, rpgend, start;
@@ -1116,7 +1076,7 @@ unsigned long do_mmap(struct file *file,
for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
pregion = rb_entry(rb, struct vm_region, vm_rb);
- if (!(pregion->vm_flags & VM_MAYSHARE))
+ if (!is_nommu_shared_mapping(pregion->vm_flags))
continue;
/* search for overlapping mappings on the same file */
@@ -1152,7 +1112,7 @@ unsigned long do_mmap(struct file *file,
vma->vm_end = start + len;
if (pregion->vm_flags & VM_MAPPED_COPY)
- vma->vm_flags |= VM_MAPPED_COPY;
+ vm_flags_set(vma, VM_MAPPED_COPY);
else {
ret = do_mmap_shared_file(vma);
if (ret < 0) {
@@ -1224,7 +1184,11 @@ unsigned long do_mmap(struct file *file,
current->mm->total_vm += len >> PAGE_SHIFT;
share:
- mas_add_vma_to_mm(&mas, current->mm, vma);
+ BUG_ON(!vma->vm_region);
+ setup_vma_to_mm(vma, current->mm);
+ current->mm->map_count++;
+ /* add the VMA to the tree */
+ vma_iter_store(&vmi, vma);
/* we flush the region from the icache only when the first executable
* mapping of it is made */
@@ -1240,6 +1204,7 @@ share:
error_just_free:
up_write(&nommu_region_sem);
error:
+ vma_iter_free(&vmi);
if (region->vm_file)
fput(region->vm_file);
kmem_cache_free(vm_region_jar, region);
@@ -1250,7 +1215,6 @@ error:
sharing_violation:
up_write(&nommu_region_sem);
- mas_destroy(&mas);
pr_warn("Attempt to share mismatched mappings\n");
ret = -EINVAL;
goto error;
@@ -1268,7 +1232,7 @@ error_getting_region:
show_free_areas(0, NULL);
return -ENOMEM;
-error_maple_preallocate:
+error_vma_iter_prealloc:
kmem_cache_free(vm_region_jar, region);
vm_area_free(vma);
pr_warn("Allocation of vma tree for process %d failed\n", current->pid);
@@ -1334,19 +1298,20 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
* split a vma into two pieces at address 'addr', a new vma is allocated either
* for the first part or the tail.
*/
-int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long addr, int new_below)
{
struct vm_area_struct *new;
struct vm_region *region;
unsigned long npages;
- MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end);
+ struct mm_struct *mm;
/* we're only permitted to split anonymous regions (these should have
* only a single usage on the region) */
if (vma->vm_file)
return -ENOMEM;
+ mm = vma->vm_mm;
if (mm->map_count >= sysctl_max_map_count)
return -ENOMEM;
@@ -1358,10 +1323,10 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
if (!new)
goto err_vma_dup;
- if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ if (vma_iter_prealloc(vmi)) {
pr_warn("Allocation of vma tree for process %d failed\n",
current->pid);
- goto err_mas_preallocate;
+ goto err_vmi_preallocate;
}
/* most fields are the same, copy all, and then fixup */
@@ -1395,12 +1360,11 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
setup_vma_to_mm(vma, mm);
setup_vma_to_mm(new, mm);
- mas_set_range(&mas, vma->vm_start, vma->vm_end - 1);
- mas_store(&mas, vma);
- vma_mas_store(new, &mas);
+ vma_iter_store(vmi, new);
+ mm->map_count++;
return 0;
-err_mas_preallocate:
+err_vmi_preallocate:
vm_area_free(new);
err_vma_dup:
kmem_cache_free(vm_region_jar, region);
@@ -1411,7 +1375,7 @@ err_vma_dup:
* shrink a VMA by removing the specified chunk from either the beginning or
* the end
*/
-static int shrink_vma(struct mm_struct *mm,
+static int vmi_shrink_vma(struct vma_iterator *vmi,
struct vm_area_struct *vma,
unsigned long from, unsigned long to)
{
@@ -1419,14 +1383,19 @@ static int shrink_vma(struct mm_struct *mm,
/* adjust the VMA's pointers, which may reposition it in the MM's tree
* and list */
- if (delete_vma_from_mm(vma))
+ if (vma_iter_prealloc(vmi)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
return -ENOMEM;
- if (from > vma->vm_start)
+ }
+
+ if (from > vma->vm_start) {
+ vma_iter_clear(vmi, from, vma->vm_end);
vma->vm_end = from;
- else
+ } else {
+ vma_iter_clear(vmi, vma->vm_start, to);
vma->vm_start = to;
- if (add_vma_to_mm(mm, vma))
- return -ENOMEM;
+ }
/* cut the backing region down to size */
region = vma->vm_region;
@@ -1454,7 +1423,7 @@ static int shrink_vma(struct mm_struct *mm,
*/
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf)
{
- MA_STATE(mas, &mm->mm_mt, start, start);
+ VMA_ITERATOR(vmi, mm, start);
struct vm_area_struct *vma;
unsigned long end;
int ret = 0;
@@ -1466,7 +1435,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
end = start + len;
/* find the first potentially overlapping VMA */
- vma = mas_find(&mas, end - 1);
+ vma = vma_find(&vmi, end);
if (!vma) {
static int limit;
if (limit < 5) {
@@ -1485,7 +1454,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
return -EINVAL;
if (end == vma->vm_end)
goto erase_whole_vma;
- vma = mas_next(&mas, end - 1);
+ vma = vma_find(&vmi, end);
} while (vma);
return -EINVAL;
} else {
@@ -1499,17 +1468,18 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
if (end != vma->vm_end && offset_in_page(end))
return -EINVAL;
if (start != vma->vm_start && end != vma->vm_end) {
- ret = split_vma(mm, vma, start, 1);
+ ret = split_vma(&vmi, vma, start, 1);
if (ret < 0)
return ret;
}
- return shrink_vma(mm, vma, start, end);
+ return vmi_shrink_vma(&vmi, vma, start, end);
}
erase_whole_vma:
if (delete_vma_from_mm(vma))
ret = -ENOMEM;
- delete_vma(mm, vma);
+ else
+ delete_vma(mm, vma);
return ret;
}
@@ -1597,7 +1567,7 @@ static unsigned long do_mremap(unsigned long addr,
if (vma->vm_end != vma->vm_start + old_len)
return (unsigned long) -EFAULT;
- if (vma->vm_flags & VM_MAYSHARE)
+ if (is_nommu_shared_mapping(vma->vm_flags))
return (unsigned long) -EPERM;
if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
@@ -1632,7 +1602,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
if (addr != (pfn << PAGE_SHIFT))
return -EINVAL;
- vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
+ vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
return 0;
}
EXPORT_SYMBOL(remap_pfn_range);
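
Throughout this series, direct writes to vma->vm_flags are replaced with the vm_flags_init()/vm_flags_set()/vm_flags_clear()/vm_flags_reset() helpers seen above. Their definitions are not part of this section; the sketch below captures the intended semantics in simplified form (the real helpers write through a non-const alias of the field and live in include/linux/mm.h):

	/* Simplified sketch of the modifier helpers used in these hunks. */
	static inline void vm_flags_init(struct vm_area_struct *vma,
					 vm_flags_t flags)
	{
		/* VMA not yet visible to other tasks: a plain store is fine */
		vma->vm_flags = flags;
	}

	static inline void vm_flags_reset(struct vm_area_struct *vma,
					  vm_flags_t flags)
	{
		mmap_assert_write_locked(vma->vm_mm);
		vm_flags_init(vma, flags);
	}

	static inline void vm_flags_set(struct vm_area_struct *vma,
					vm_flags_t flags)
	{
		mmap_assert_write_locked(vma->vm_mm);
		vma->vm_flags |= flags;
	}

	static inline void vm_flags_clear(struct vm_area_struct *vma,
					  vm_flags_t flags)
	{
		mmap_assert_write_locked(vma->vm_mm);
		vma->vm_flags &= ~flags;
	}
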
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1276e49b31b0..044e1eed720e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -542,7 +542,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm)
struct mmu_gather tlb;
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
- vma, mm, vma->vm_start,
+ mm, vma->vm_start,
vma->vm_end);
tlb_gather_mmu(&tlb, mm);
if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
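
Here and in the mremap.c hunk above, mmu_notifier_range_init() loses its vma argument: the invalidation range is now described by an mm plus [start, end) only. The updated signature is not spelled out in this diff; as implied by the converted call sites it looks roughly like:

	/*
	 * Implied by the call sites above; the real inline lives in
	 * include/linux/mmu_notifier.h.
	 */
	void mmu_notifier_range_init(struct mmu_notifier_range *range,
				     enum mmu_notifier_event event,
				     unsigned int flags,
				     struct mm_struct *mm,
				     unsigned long start,
				     unsigned long end);
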
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ad608ef2a243..516b1aa247e8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2398,15 +2398,15 @@ int write_cache_pages(struct address_space *mapping,
int ret = 0;
int done = 0;
int error;
- struct pagevec pvec;
- int nr_pages;
+ struct folio_batch fbatch;
+ int nr_folios;
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
int range_whole = 0;
xa_mark_t tag;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* prev offset */
end = -1;
@@ -2426,17 +2426,18 @@ int write_cache_pages(struct address_space *mapping,
while (!done && (index <= end)) {
int i;
- nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
- tag);
- if (nr_pages == 0)
+ nr_folios = filemap_get_folios_tag(mapping, &index, end,
+ tag, &fbatch);
+
+ if (nr_folios == 0)
break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < nr_folios; i++) {
+ struct folio *folio = fbatch.folios[i];
- done_index = page->index;
+ done_index = folio->index;
- lock_page(page);
+ folio_lock(folio);
/*
* Page truncated or invalidated. We can freely skip it
@@ -2446,30 +2447,30 @@ int write_cache_pages(struct address_space *mapping,
* even if there is now a new, dirty page at the same
* pagecache address.
*/
- if (unlikely(page->mapping != mapping)) {
+ if (unlikely(folio->mapping != mapping)) {
continue_unlock:
- unlock_page(page);
+ folio_unlock(folio);
continue;
}
- if (!PageDirty(page)) {
+ if (!folio_test_dirty(folio)) {
/* someone wrote it for us */
goto continue_unlock;
}
- if (PageWriteback(page)) {
+ if (folio_test_writeback(folio)) {
if (wbc->sync_mode != WB_SYNC_NONE)
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
else
goto continue_unlock;
}
- BUG_ON(PageWriteback(page));
- if (!clear_page_dirty_for_io(page))
+ BUG_ON(folio_test_writeback(folio));
+ if (!folio_clear_dirty_for_io(folio))
goto continue_unlock;
trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
- error = (*writepage)(page, wbc, data);
+ error = writepage(folio, wbc, data);
if (unlikely(error)) {
/*
* Handle errors according to the type of
@@ -2484,11 +2485,12 @@ continue_unlock:
* the first error.
*/
if (error == AOP_WRITEPAGE_ACTIVATE) {
- unlock_page(page);
+ folio_unlock(folio);
error = 0;
} else if (wbc->sync_mode != WB_SYNC_ALL) {
ret = error;
- done_index = page->index + 1;
+ done_index = folio->index +
+ folio_nr_pages(folio);
done = 1;
break;
}
@@ -2508,7 +2510,7 @@ continue_unlock:
break;
}
}
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
cond_resched();
}
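
write_cache_pages() above now batches with filemap_get_folios_tag() and hands each struct folio to the callback, and the following hunks open-code the old generic_writepages() fallback inside do_writepages() around such a callback. A sketch of what a filesystem-side callback looks like under this convention; my_write_folio() is a hypothetical helper standing in for the filesystem's own writeout routine:

	/* Sketch of a writeback callback under the folio-based convention. */
	static int my_writepage_cb(struct folio *folio,
				   struct writeback_control *wbc, void *data)
	{
		struct address_space *mapping = data;
		int err = my_write_folio(folio, wbc);	/* hypothetical */

		mapping_set_error(mapping, err);
		return err;
	}

	static int my_writepages(struct address_space *mapping,
				 struct writeback_control *wbc)
	{
		return write_cache_pages(mapping, wbc, my_writepage_cb, mapping);
	}
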
@@ -2526,47 +2528,15 @@ continue_unlock:
}
EXPORT_SYMBOL(write_cache_pages);
-/*
- * Function used by generic_writepages to call the real writepage
- * function and set the mapping flags on error
- */
-static int __writepage(struct page *page, struct writeback_control *wbc,
- void *data)
+static int writepage_cb(struct folio *folio, struct writeback_control *wbc,
+ void *data)
{
struct address_space *mapping = data;
- int ret = mapping->a_ops->writepage(page, wbc);
+ int ret = mapping->a_ops->writepage(&folio->page, wbc);
mapping_set_error(mapping, ret);
return ret;
}
-/**
- * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- *
- * This is a library function, which implements the writepages()
- * address_space_operation.
- *
- * Return: %0 on success, negative error code otherwise
- */
-int generic_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct blk_plug plug;
- int ret;
-
- /* deal with chardevs and other special file */
- if (!mapping->a_ops->writepage)
- return 0;
-
- blk_start_plug(&plug);
- ret = write_cache_pages(mapping, wbc, __writepage, mapping);
- blk_finish_plug(&plug);
- return ret;
-}
-
-EXPORT_SYMBOL(generic_writepages);
-
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
@@ -2577,11 +2547,20 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
wb = inode_to_wb_wbc(mapping->host, wbc);
wb_bandwidth_estimate_start(wb);
while (1) {
- if (mapping->a_ops->writepages)
+ if (mapping->a_ops->writepages) {
ret = mapping->a_ops->writepages(mapping, wbc);
- else
- ret = generic_writepages(mapping, wbc);
- if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL))
+ } else if (mapping->a_ops->writepage) {
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, writepage_cb,
+ mapping);
+ blk_finish_plug(&plug);
+ } else {
+ /* deal with chardevs and other special files */
+ ret = 0;
+ }
+ if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL)
break;
/*
@@ -2673,7 +2652,7 @@ static void folio_account_dirtied(struct folio *folio,
struct bdi_writeback *wb;
long nr = folio_nr_pages(folio);
- inode_attach_wb(inode, &folio->page);
+ inode_attach_wb(inode, folio);
wb = inode_to_wb(inode);
__lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr);
@@ -2713,7 +2692,7 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
*
* The caller must hold lock_page_memcg(). Most callers have the folio
* locked. A few have the folio blocked from truncation through other
- * means (eg zap_page_range() has it mapped and is holding the page table
+ * means (eg zap_vma_pages() has it mapped and is holding the page table
* lock). This can also be called from mark_buffer_dirty(), which I
* cannot prove is always protected against truncate.
*/
@@ -2846,11 +2825,11 @@ bool folio_mark_dirty(struct folio *folio)
if (likely(mapping)) {
/*
- * readahead/lru_deactivate_page could remain
+ * readahead/folio_deactivate could remain
* PG_readahead/PG_reclaim due to race with folio_end_writeback
* About readahead, if the folio is written, the flags would be
* reset. So no problem.
- * About lru_deactivate_page, if the folio is redirtied,
+ * About folio_deactivate, if the folio is redirtied,
* the flag will be reset. So no problem. but if the
* folio is used by readahead it will confuse readahead
* and make it restart the size rampup process. But it's
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0745aedebb37..ac1fc986af44 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -430,6 +430,8 @@ EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
+bool deferred_struct_pages __meminitdata;
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* During boot we initialize deferred pages on-demand, as needed, but once
@@ -443,15 +445,15 @@ static inline bool deferred_pages_enabled(void)
return static_branch_unlikely(&deferred_pages);
}
-/* Returns true if the struct page for the pfn is uninitialised */
-static inline bool __meminit early_page_uninitialised(unsigned long pfn)
+/* Returns true if the struct page for the pfn is initialised */
+static inline bool __meminit early_page_initialised(unsigned long pfn)
{
int nid = early_pfn_to_nid(pfn);
if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
- return true;
+ return false;
- return false;
+ return true;
}
/*
@@ -498,9 +500,9 @@ static inline bool deferred_pages_enabled(void)
return false;
}
-static inline bool early_page_uninitialised(unsigned long pfn)
+static inline bool early_page_initialised(unsigned long pfn)
{
- return false;
+ return true;
}
static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
@@ -775,11 +777,13 @@ void free_compound_page(struct page *page)
static void prep_compound_head(struct page *page, unsigned int order)
{
+ struct folio *folio = (struct folio *)page;
+
set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
set_compound_order(page, order);
- atomic_set(compound_mapcount_ptr(page), -1);
- atomic_set(subpages_mapcount_ptr(page), 0);
- atomic_set(compound_pincount_ptr(page), 0);
+ atomic_set(&folio->_entire_mapcount, -1);
+ atomic_set(&folio->_nr_pages_mapped, 0);
+ atomic_set(&folio->_pincount, 0);
}
static void prep_compound_tail(struct page *head, int tail_idx)
@@ -805,7 +809,7 @@ void prep_compound_page(struct page *page, unsigned int order)
void destroy_large_folio(struct folio *folio)
{
- enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor;
+ enum compound_dtor_id dtor = folio->_folio_dtor;
VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio);
compound_page_dtors[dtor](&folio->page);
@@ -1291,6 +1295,7 @@ static inline bool free_page_is_bad(struct page *page)
static int free_tail_pages_check(struct page *head_page, struct page *page)
{
+ struct folio *folio = (struct folio *)head_page;
int ret = 1;
/*
@@ -1306,16 +1311,16 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
switch (page - head_page) {
case 1:
/* the first tail page: these may be in place of ->mapping */
- if (unlikely(head_compound_mapcount(head_page))) {
- bad_page(page, "nonzero compound_mapcount");
+ if (unlikely(folio_entire_mapcount(folio))) {
+ bad_page(page, "nonzero entire_mapcount");
goto out;
}
- if (unlikely(atomic_read(subpages_mapcount_ptr(head_page)))) {
- bad_page(page, "nonzero subpages_mapcount");
+ if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
+ bad_page(page, "nonzero nr_pages_mapped");
goto out;
}
- if (unlikely(head_compound_pincount(head_page))) {
- bad_page(page, "nonzero compound_pincount");
+ if (unlikely(atomic_read(&folio->_pincount))) {
+ bad_page(page, "nonzero pincount");
goto out;
}
break;
@@ -1356,6 +1361,8 @@ out:
* see the comment next to it.
* 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON,
* see the comment next to it.
+ * 4. The allocation is excluded from being checked due to sampling,
+ * see the call to kasan_unpoison_pages.
*
* Poisoning pages during deferred memory init will greatly lengthen the
* process and cause problem in large memory systems as the deferred pages
@@ -1403,7 +1410,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
* Do not let hwpoison pages hit pcplists/buddy
* Untie memcg state and reset page's owner
*/
- if (memcg_kmem_enabled() && PageMemcgKmem(page))
+ if (memcg_kmem_online() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
reset_page_owner(page, order);
page_table_check_free(page, order);
@@ -1434,7 +1441,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
}
if (PageMappingFlags(page))
page->mapping = NULL;
- if (memcg_kmem_enabled() && PageMemcgKmem(page))
+ if (memcg_kmem_online() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
if (check_free && free_page_is_bad(page))
bad++;
@@ -1641,7 +1648,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
pg_data_t *pgdat;
int nid, zid;
- if (!early_page_uninitialised(pfn))
+ if (early_page_initialised(pfn))
return;
nid = early_pfn_to_nid(pfn);
@@ -1804,7 +1811,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
{
- if (early_page_uninitialised(pfn))
+ if (!early_page_initialised(pfn))
return;
if (!kmsan_memblock_free_pages(page, order)) {
/* KMSAN will take care of these pages. */
@@ -2468,7 +2475,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
{
bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
!should_skip_init(gfp_flags);
- bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+ bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+ bool reset_tags = true;
int i;
set_page_private(page, 0);
@@ -2491,30 +2499,43 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
*/
/*
- * If memory tags should be zeroed (which happens only when memory
- * should be initialized as well).
+ * If memory tags should be zeroed
+ * (which happens only when memory should be initialized as well).
*/
- if (init_tags) {
- /* Initialize both memory and tags. */
+ if (zero_tags) {
+ /* Initialize both memory and memory tags. */
for (i = 0; i != 1 << order; ++i)
tag_clear_highpage(page + i);
- /* Note that memory is already initialized by the loop above. */
+ /* Take note that memory was initialized by the loop above. */
init = false;
}
if (!should_skip_kasan_unpoison(gfp_flags)) {
- /* Unpoison shadow memory or set memory tags. */
- kasan_unpoison_pages(page, order, init);
-
- /* Note that memory is already initialized by KASAN. */
- if (kasan_has_integrated_init())
- init = false;
- } else {
- /* Ensure page_address() dereferencing does not fault. */
+ /* Try unpoisoning (or setting tags) and initializing memory. */
+ if (kasan_unpoison_pages(page, order, init)) {
+ /* Take note that memory was initialized by KASAN. */
+ if (kasan_has_integrated_init())
+ init = false;
+ /* Take note that memory tags were set by KASAN. */
+ reset_tags = false;
+ } else {
+ /*
+ * KASAN decided to exclude this allocation from being
+ * (un)poisoned due to sampling. Make KASAN skip
+ * poisoning when the allocation is freed.
+ */
+ SetPageSkipKASanPoison(page);
+ }
+ }
+ /*
+ * If memory tags have not been set by KASAN, reset the page tags to
+ * ensure page_address() dereferencing does not fault.
+ */
+ if (reset_tags) {
for (i = 0; i != 1 << order; ++i)
page_kasan_tag_reset(page + i);
}
- /* If memory is still not initialized, do it now. */
+ /* If memory is still not initialized, initialize it now. */
if (init)
kernel_init_pages(page, 1 << order);
/* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
@@ -2582,10 +2603,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
*
* The other migratetypes do not have fallbacks.
*/
-static int fallbacks[MIGRATE_TYPES][3] = {
- [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
- [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
- [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
+static int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - 1] = {
+ [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE },
+ [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE },
+ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE },
};
#ifdef CONFIG_CMA
@@ -2844,11 +2865,8 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
return -1;
*can_steal = false;
- for (i = 0;; i++) {
+ for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
fallback_mt = fallbacks[migratetype][i];
- if (fallback_mt == MIGRATE_TYPES)
- break;
-
if (free_area_empty(area, fallback_mt))
continue;
@@ -3706,10 +3724,20 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
* reserved for high-order atomic allocation, so order-0
* request should skip it.
*/
- if (order > 0 && alloc_flags & ALLOC_HARDER)
+ if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
page = __rmqueue(zone, order, migratetype, alloc_flags);
+
+ /*
+ * If the allocation fails, allow OOM handling access
+ * to HIGHATOMIC reserves as failing now is worse than
+ * failing a high-order atomic allocation in the
+ * future.
+ */
+ if (!page && (alloc_flags & ALLOC_OOM))
+ page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+
if (!page) {
spin_unlock_irqrestore(&zone->lock, flags);
return NULL;
@@ -3939,15 +3967,14 @@ ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
static inline long __zone_watermark_unusable_free(struct zone *z,
unsigned int order, unsigned int alloc_flags)
{
- const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
long unusable_free = (1 << order) - 1;
/*
- * If the caller does not have rights to ALLOC_HARDER then subtract
- * the high-atomic reserves. This will over-estimate the size of the
- * atomic reserve but it avoids a search.
+ * If the caller does not have rights to reserves below the min
+ * watermark then subtract the high-atomic reserves. This will
+ * over-estimate the size of the atomic reserve but it avoids a search.
*/
- if (likely(!alloc_harder))
+ if (likely(!(alloc_flags & ALLOC_RESERVES)))
unusable_free += z->nr_reserved_highatomic;
#ifdef CONFIG_CMA
@@ -3971,25 +3998,37 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
{
long min = mark;
int o;
- const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
/* free_pages may go negative - that's OK */
free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
- if (alloc_flags & ALLOC_HIGH)
- min -= min / 2;
+ if (unlikely(alloc_flags & ALLOC_RESERVES)) {
+ /*
+ * __GFP_HIGH allows access to 50% of the min reserve as well
+ * as OOM.
+ */
+ if (alloc_flags & ALLOC_MIN_RESERVE) {
+ min -= min / 2;
+
+ /*
+ * Non-blocking allocations (e.g. GFP_ATOMIC) can
+ * access more reserves than just __GFP_HIGH. Other
+ * non-blocking allocation requests such as GFP_NOWAIT
+ * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get
+ * access to the min reserve.
+ */
+ if (alloc_flags & ALLOC_NON_BLOCK)
+ min -= min / 4;
+ }
- if (unlikely(alloc_harder)) {
/*
- * OOM victims can try even harder than normal ALLOC_HARDER
+ * OOM victims can try even harder than the normal reserve
* users on the grounds that it's definitely going to be in
* the exit path shortly and free memory. Any allocation it
* makes during the free path will be small and short-lived.
*/
if (alloc_flags & ALLOC_OOM)
min -= min / 2;
- else
- min -= min / 4;
}
/*
@@ -4023,8 +4062,10 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
return true;
}
#endif
- if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
+ if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) &&
+ !free_area_empty(area, MIGRATE_HIGHATOMIC)) {
return true;
+ }
}
return false;
}
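
The reworked __zone_watermark_ok() above replaces the old ALLOC_HARDER/ALLOC_HIGH pair with explicit reserve levels. A worked sketch of the discount arithmetic from this hunk, assuming a zone min watermark of 1024 pages (the ALLOC_* names come from the hunk; their values live elsewhere):

	/* Worked example of the reserve discount applied to the min watermark. */
	static unsigned long effective_min(unsigned long min,
					   unsigned int alloc_flags)
	{
		if (alloc_flags & ALLOC_RESERVES) {
			if (alloc_flags & ALLOC_MIN_RESERVE) {
				min -= min / 2;		/* __GFP_HIGH: 1024 -> 512 */
				if (alloc_flags & ALLOC_NON_BLOCK)
					min -= min / 4;	/* GFP_ATOMIC-like: 512 -> 384 */
			}
			if (alloc_flags & ALLOC_OOM)
				min -= min / 2;		/* OOM victims dig deeper still */
		}
		return min;
	}
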
@@ -4064,13 +4105,14 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
free_pages))
return true;
+
/*
- * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
+ * Ignore watermark boosting for __GFP_HIGH order-0 allocations
* when checking the min watermark. The min watermark is the
* point where boosting is ignored so that kswapd is woken up
* when below the low watermark.
*/
- if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
+ if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost
&& ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
mark = z->_watermark[WMARK_MIN];
return __zone_watermark_ok(z, order, mark, highest_zoneidx,
@@ -4244,7 +4286,7 @@ retry:
* Watermark failed for this zone, but see if we can
* grow this zone if it contains deferred pages.
*/
- if (static_branch_unlikely(&deferred_pages)) {
+ if (deferred_pages_enabled()) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
@@ -4286,14 +4328,14 @@ try_this_zone:
* If this is a high-order atomic allocation then check
* if the pageblock should be reserved for the future
*/
- if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
+ if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
reserve_highatomic_pageblock(page, zone, order);
return page;
} else {
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
- if (static_branch_unlikely(&deferred_pages)) {
+ if (deferred_pages_enabled()) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
@@ -4813,41 +4855,48 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
}
static inline unsigned int
-gfp_to_alloc_flags(gfp_t gfp_mask)
+gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
{
unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
/*
- * __GFP_HIGH is assumed to be the same as ALLOC_HIGH
+ * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE
* and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
* to save two branches.
*/
- BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
+ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE);
BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
/*
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
- * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
+ * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
*/
alloc_flags |= (__force int)
(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
- if (gfp_mask & __GFP_ATOMIC) {
+ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
/*
* Not worth trying to allocate harder for __GFP_NOMEMALLOC even
* if it can't schedule.
*/
- if (!(gfp_mask & __GFP_NOMEMALLOC))
- alloc_flags |= ALLOC_HARDER;
+ if (!(gfp_mask & __GFP_NOMEMALLOC)) {
+ alloc_flags |= ALLOC_NON_BLOCK;
+
+ if (order > 0)
+ alloc_flags |= ALLOC_HIGHATOMIC;
+ }
+
/*
- * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
- * comment for __cpuset_node_allowed().
+ * Ignore cpuset mems for non-blocking __GFP_HIGH (probably
+ * GFP_ATOMIC) rather than fail, see the comment for
+ * __cpuset_node_allowed().
*/
- alloc_flags &= ~ALLOC_CPUSET;
+ if (alloc_flags & ALLOC_MIN_RESERVE)
+ alloc_flags &= ~ALLOC_CPUSET;
} else if (unlikely(rt_task(current)) && in_task())
- alloc_flags |= ALLOC_HARDER;
+ alloc_flags |= ALLOC_MIN_RESERVE;
alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
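
Condensing the gfp_to_alloc_flags() hunk above into a decision table: which reserve-related flags a request picks up under the new naming (rt-task, kswapd and CMA handling omitted; a sketch, not the function itself):

	/* Rough decision table implied by the hunk above. */
	static unsigned int classify_reserve_access(gfp_t gfp_mask,
						    unsigned int order)
	{
		unsigned int alloc_flags = 0;

		if (gfp_mask & __GFP_HIGH)
			alloc_flags |= ALLOC_MIN_RESERVE;	/* 50% of min reserve */

		if (!(gfp_mask & __GFP_DIRECT_RECLAIM) &&
		    !(gfp_mask & __GFP_NOMEMALLOC)) {
			alloc_flags |= ALLOC_NON_BLOCK;		/* further reserve access */
			if (order > 0)
				alloc_flags |= ALLOC_HIGHATOMIC; /* may tap highatomic blocks */
		}

		return alloc_flags;
	}
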
@@ -5028,14 +5077,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
unsigned int zonelist_iter_cookie;
int reserve_flags;
- /*
- * We also sanity check to catch abuse of atomic reserves being used by
- * callers that are not in atomic context.
- */
- if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
- (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
- gfp_mask &= ~__GFP_ATOMIC;
-
restart:
compaction_retries = 0;
no_progress_loops = 0;
@@ -5048,7 +5089,7 @@ restart:
* kswapd needs to be woken up, and to avoid the cost of setting up
* alloc_flags precisely. So we do that now.
*/
- alloc_flags = gfp_to_alloc_flags(gfp_mask);
+ alloc_flags = gfp_to_alloc_flags(gfp_mask, order);
/*
* We need to recalculate the starting point for the zonelist iterator
@@ -5276,12 +5317,13 @@ nopage:
WARN_ON_ONCE_GFP(costly_order, gfp_mask);
/*
- * Help non-failing allocations by giving them access to memory
- * reserves but do not use ALLOC_NO_WATERMARKS because this
+ * Help non-failing allocations by giving some access to memory
+ * reserves normally used for high priority non-blocking
+ * allocations but do not use ALLOC_NO_WATERMARKS because this
* could deplete whole memory reserves which would just make
- * the situation worse
+ * the situation worse.
*/
- page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
+ page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac);
if (page)
goto got_pg;
@@ -5390,7 +5432,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
goto out;
/* Bulk allocator does not support memcg accounting. */
- if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT))
+ if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT))
goto failed;
/* Use the single page allocator for one page. */
@@ -5562,7 +5604,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
page = __alloc_pages_slowpath(alloc_gfp, order, &ac);
out:
- if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) && page &&
+ if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
__free_pages(page, order);
page = NULL;
@@ -5631,9 +5673,12 @@ EXPORT_SYMBOL(get_zeroed_page);
*/
void __free_pages(struct page *page, unsigned int order)
{
+ /* get PageHead before we drop reference */
+ int head = PageHead(page);
+
if (put_page_testzero(page))
free_the_page(page, order);
- else if (!PageHead(page))
+ else if (!head)
while (order-- > 0)
free_the_page(page + (1 << order), order);
}
@@ -6761,8 +6806,10 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
if (context == MEMINIT_EARLY) {
if (overlap_memmap_init(zone, &pfn))
continue;
- if (defer_init(nid, pfn, zone_end_pfn))
+ if (defer_init(nid, pfn, zone_end_pfn)) {
+ deferred_struct_pages = true;
break;
+ }
}
page = pfn_to_page(pfn);
@@ -7926,6 +7973,7 @@ static void __init free_area_init_node(int nid)
pgdat_set_deferred_range(pgdat);
free_area_init_core(pgdat);
+ lru_gen_init_pgdat(pgdat);
}
static void __init free_area_init_memoryless_node(int nid)
@@ -8360,11 +8408,9 @@ void __init free_area_init(unsigned long *max_zone_pfn)
/* Allocator not initialized yet */
pgdat = arch_alloc_nodedata(nid);
- if (!pgdat) {
- pr_err("Cannot allocate %zuB for node %d.\n",
- sizeof(*pgdat), nid);
- continue;
- }
+ if (!pgdat)
+ panic("Cannot allocate %zuB for node %d.\n",
+ sizeof(*pgdat), nid);
arch_refresh_nodedata(nid, pgdat);
free_area_init_memoryless_node(nid);
@@ -8568,7 +8614,7 @@ static int page_alloc_cpu_dead(unsigned int cpu)
struct zone *zone;
lru_add_drain_cpu(cpu);
- mlock_page_drain_remote(cpu);
+ mlock_drain_remote(cpu);
drain_pages(cpu);
/*
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 4ee522fd381c..dc1626be458b 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -71,6 +71,7 @@ static bool need_page_idle(void)
}
static struct page_ext_operations page_idle_ops __initdata = {
.need = need_page_idle,
+ .need_shared_flags = true,
};
#endif
@@ -86,12 +87,12 @@ static struct page_ext_operations *page_ext_ops[] __initdata = {
#endif
};
-unsigned long page_ext_size = sizeof(struct page_ext);
+unsigned long page_ext_size;
static unsigned long total_usage;
static struct page_ext *lookup_page_ext(const struct page *page);
-bool early_page_ext;
+bool early_page_ext __meminitdata;
static int __init setup_early_page_ext(char *str)
{
early_page_ext = true;
@@ -106,7 +107,16 @@ static bool __init invoke_need_callbacks(void)
bool need = false;
for (i = 0; i < entries; i++) {
- if (page_ext_ops[i]->need && page_ext_ops[i]->need()) {
+ if (page_ext_ops[i]->need()) {
+ if (page_ext_ops[i]->need_shared_flags) {
+ page_ext_size = sizeof(struct page_ext);
+ break;
+ }
+ }
+ }
+
+ for (i = 0; i < entries; i++) {
+ if (page_ext_ops[i]->need()) {
page_ext_ops[i]->offset = page_ext_size;
page_ext_size += page_ext_ops[i]->size;
need = true;
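
The page_ext changes above make page_ext_size start at 0: the first pass over page_ext_ops[] only reserves room for the shared struct page_ext flags word when an enabled user asks for it, and the second pass then lays out each user's private data behind it. The ops structure itself is declared elsewhere; its assumed shape after this change:

	/* Assumed shape; the real declaration is in include/linux/page_ext.h. */
	struct page_ext_operations {
		size_t offset;		/* filled in by the second pass above */
		size_t size;		/* per-page private data for this user */
		bool (*need)(void);	/* is this user enabled? */
		void (*init)(void);
		bool need_shared_flags;	/* new: needs the shared flags word */
	};
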
diff --git a/mm/page_idle.c b/mm/page_idle.c
index bc08332a609c..41ea77f22011 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -31,19 +31,22 @@
*
* This function tries to get a user memory page by pfn as described above.
*/
-static struct page *page_idle_get_page(unsigned long pfn)
+static struct folio *page_idle_get_folio(unsigned long pfn)
{
struct page *page = pfn_to_online_page(pfn);
+ struct folio *folio;
- if (!page || !PageLRU(page) ||
- !get_page_unless_zero(page))
+ if (!page || PageTail(page))
return NULL;
- if (unlikely(!PageLRU(page))) {
- put_page(page);
- page = NULL;
+ folio = page_folio(page);
+ if (!folio_test_lru(folio) || !folio_try_get(folio))
+ return NULL;
+ if (unlikely(page_folio(page) != folio || !folio_test_lru(folio))) {
+ folio_put(folio);
+ folio = NULL;
}
- return page;
+ return folio;
}
static bool page_idle_clear_pte_refs_one(struct folio *folio,
@@ -83,10 +86,8 @@ static bool page_idle_clear_pte_refs_one(struct folio *folio,
return true;
}
-static void page_idle_clear_pte_refs(struct page *page)
+static void page_idle_clear_pte_refs(struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
/*
* Since rwc.try_lock is unused, rwc is effectively immutable, so we
* can make it static to save some cycles and stack.
@@ -115,7 +116,7 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
loff_t pos, size_t count)
{
u64 *out = (u64 *)buf;
- struct page *page;
+ struct folio *folio;
unsigned long pfn, end_pfn;
int bit;
@@ -134,19 +135,19 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
bit = pfn % BITMAP_CHUNK_BITS;
if (!bit)
*out = 0ULL;
- page = page_idle_get_page(pfn);
- if (page) {
- if (page_is_idle(page)) {
+ folio = page_idle_get_folio(pfn);
+ if (folio) {
+ if (folio_test_idle(folio)) {
/*
* The page might have been referenced via a
* pte, in which case it is not idle. Clear
* refs and recheck.
*/
- page_idle_clear_pte_refs(page);
- if (page_is_idle(page))
+ page_idle_clear_pte_refs(folio);
+ if (folio_test_idle(folio))
*out |= 1ULL << bit;
}
- put_page(page);
+ folio_put(folio);
}
if (bit == BITMAP_CHUNK_BITS - 1)
out++;
@@ -160,7 +161,7 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
loff_t pos, size_t count)
{
const u64 *in = (u64 *)buf;
- struct page *page;
+ struct folio *folio;
unsigned long pfn, end_pfn;
int bit;
@@ -178,11 +179,11 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
for (; pfn < end_pfn; pfn++) {
bit = pfn % BITMAP_CHUNK_BITS;
if ((*in >> bit) & 1) {
- page = page_idle_get_page(pfn);
- if (page) {
- page_idle_clear_pte_refs(page);
- set_page_idle(page);
- put_page(page);
+ folio = page_idle_get_folio(pfn);
+ if (folio) {
+ page_idle_clear_pte_refs(folio);
+ folio_set_idle(folio);
+ folio_put(folio);
}
}
if (bit == BITMAP_CHUNK_BITS - 1)
diff --git a/mm/page_io.c b/mm/page_io.c
index 3a5f921b932e..87b682d18850 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -18,7 +18,6 @@
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/swapops.h>
-#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/frontswap.h>
#include <linux/blkdev.h>
@@ -28,7 +27,7 @@
#include <linux/delayacct.h>
#include "swap.h"
-static void end_swap_bio_write(struct bio *bio)
+static void __end_swap_bio_write(struct bio *bio)
{
struct page *page = bio_first_page_all(bio);
@@ -49,13 +48,17 @@ static void end_swap_bio_write(struct bio *bio)
ClearPageReclaim(page);
}
end_page_writeback(page);
+}
+
+static void end_swap_bio_write(struct bio *bio)
+{
+ __end_swap_bio_write(bio);
bio_put(bio);
}
-static void end_swap_bio_read(struct bio *bio)
+static void __end_swap_bio_read(struct bio *bio)
{
struct page *page = bio_first_page_all(bio);
- struct task_struct *waiter = bio->bi_private;
if (bio->bi_status) {
SetPageError(page);
@@ -63,18 +66,16 @@ static void end_swap_bio_read(struct bio *bio)
pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
(unsigned long long)bio->bi_iter.bi_sector);
- goto out;
+ } else {
+ SetPageUptodate(page);
}
-
- SetPageUptodate(page);
-out:
unlock_page(page);
- WRITE_ONCE(bio->bi_private, NULL);
+}
+
+static void end_swap_bio_read(struct bio *bio)
+{
+ __end_swap_bio_read(bio);
bio_put(bio);
- if (waiter) {
- blk_wake_io_task(waiter);
- put_task_struct(waiter);
- }
}
int generic_swapfile_activate(struct swap_info_struct *sis,
@@ -181,11 +182,11 @@ bad_bmap:
int swap_writepage(struct page *page, struct writeback_control *wbc)
{
struct folio *folio = page_folio(page);
- int ret = 0;
+ int ret;
if (folio_free_swap(folio)) {
folio_unlock(folio);
- goto out;
+ return 0;
}
/*
* Arch code may have to preserve more data than just the page
@@ -195,17 +196,16 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
if (ret) {
folio_mark_dirty(folio);
folio_unlock(folio);
- goto out;
+ return ret;
}
if (frontswap_store(&folio->page) == 0) {
folio_start_writeback(folio);
folio_unlock(folio);
folio_end_writeback(folio);
- goto out;
+ return 0;
}
- ret = __swap_writepage(&folio->page, wbc);
-out:
- return ret;
+ __swap_writepage(&folio->page, wbc);
+ return 0;
}
static inline void count_swpout_vm_event(struct page *page)
@@ -292,7 +292,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret)
mempool_free(sio, sio_pool);
}
-static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
+static void swap_writepage_fs(struct page *page, struct writeback_control *wbc)
{
struct swap_iocb *sio = NULL;
struct swap_info_struct *sis = page_swap_info(page);
@@ -318,9 +318,7 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
sio->pages = 0;
sio->len = 0;
}
- sio->bvec[sio->pages].bv_page = page;
- sio->bvec[sio->pages].bv_len = thp_size(page);
- sio->bvec[sio->pages].bv_offset = 0;
+ bvec_set_page(&sio->bvec[sio->pages], page, thp_size(page), 0);
sio->len += thp_size(page);
sio->pages += 1;
if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) {
@@ -329,30 +327,33 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
}
if (wbc->swap_plug)
*wbc->swap_plug = sio;
-
- return 0;
}
-int __swap_writepage(struct page *page, struct writeback_control *wbc)
+static void swap_writepage_bdev_sync(struct page *page,
+ struct writeback_control *wbc, struct swap_info_struct *sis)
{
- struct bio *bio;
- int ret;
- struct swap_info_struct *sis = page_swap_info(page);
+ struct bio_vec bv;
+ struct bio bio;
- VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- /*
- * ->flags can be updated non-atomicially (scan_swap_map_slots),
- * but that will never affect SWP_FS_OPS, so the data_race
- * is safe.
- */
- if (data_race(sis->flags & SWP_FS_OPS))
- return swap_writepage_fs(page, wbc);
+ bio_init(&bio, sis->bdev, &bv, 1,
+ REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc));
+ bio.bi_iter.bi_sector = swap_page_sector(page);
+ bio_add_page(&bio, page, thp_size(page), 0);
- ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
- if (!ret) {
- count_swpout_vm_event(page);
- return 0;
- }
+ bio_associate_blkg_from_page(&bio, page);
+ count_swpout_vm_event(page);
+
+ set_page_writeback(page);
+ unlock_page(page);
+
+ submit_bio_wait(&bio);
+ __end_swap_bio_write(&bio);
+}
+
+static void swap_writepage_bdev_async(struct page *page,
+ struct writeback_control *wbc, struct swap_info_struct *sis)
+{
+ struct bio *bio;
bio = bio_alloc(sis->bdev, 1,
REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
@@ -366,8 +367,24 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc)
set_page_writeback(page);
unlock_page(page);
submit_bio(bio);
+}
- return 0;
+void __swap_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct swap_info_struct *sis = page_swap_info(page);
+
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+ /*
+ * ->flags can be updated non-atomically (scan_swap_map_slots),
+ * but that will never affect SWP_FS_OPS, so the data_race
+ * is safe.
+ */
+ if (data_race(sis->flags & SWP_FS_OPS))
+ swap_writepage_fs(page, wbc);
+ else if (sis->flags & SWP_SYNCHRONOUS_IO)
+ swap_writepage_bdev_sync(page, wbc, sis);
+ else
+ swap_writepage_bdev_async(page, wbc, sis);
}
void swap_write_unplug(struct swap_iocb *sio)
@@ -432,9 +449,7 @@ static void swap_readpage_fs(struct page *page,
sio->pages = 0;
sio->len = 0;
}
- sio->bvec[sio->pages].bv_page = page;
- sio->bvec[sio->pages].bv_len = thp_size(page);
- sio->bvec[sio->pages].bv_offset = 0;
+ bvec_set_page(&sio->bvec[sio->pages], page, thp_size(page), 0);
sio->len += thp_size(page);
sio->pages += 1;
if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
@@ -445,11 +460,41 @@ static void swap_readpage_fs(struct page *page,
*plug = sio;
}
-int swap_readpage(struct page *page, bool synchronous,
- struct swap_iocb **plug)
+static void swap_readpage_bdev_sync(struct page *page,
+ struct swap_info_struct *sis)
+{
+ struct bio_vec bv;
+ struct bio bio;
+
+ bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ);
+ bio.bi_iter.bi_sector = swap_page_sector(page);
+ bio_add_page(&bio, page, thp_size(page), 0);
+ /*
+ * Keep this task valid during swap readpage because the oom killer may
+ * attempt to access it in the page fault retry time check.
+ */
+ get_task_struct(current);
+ count_vm_event(PSWPIN);
+ submit_bio_wait(&bio);
+ __end_swap_bio_read(&bio);
+ put_task_struct(current);
+}
+
+static void swap_readpage_bdev_async(struct page *page,
+ struct swap_info_struct *sis)
{
struct bio *bio;
- int ret = 0;
+
+ bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);
+ bio->bi_iter.bi_sector = swap_page_sector(page);
+ bio->bi_end_io = end_swap_bio_read;
+ bio_add_page(bio, page, thp_size(page), 0);
+ count_vm_event(PSWPIN);
+ submit_bio(bio);
+}
+
+void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug)
+{
struct swap_info_struct *sis = page_swap_info(page);
bool workingset = PageWorkingset(page);
unsigned long pflags;
@@ -473,55 +518,19 @@ int swap_readpage(struct page *page, bool synchronous,
if (frontswap_load(page) == 0) {
SetPageUptodate(page);
unlock_page(page);
- goto out;
- }
-
- if (data_race(sis->flags & SWP_FS_OPS)) {
+ } else if (data_race(sis->flags & SWP_FS_OPS)) {
swap_readpage_fs(page, plug);
- goto out;
- }
-
- if (sis->flags & SWP_SYNCHRONOUS_IO) {
- ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
- if (!ret) {
- count_vm_event(PSWPIN);
- goto out;
- }
- }
-
- ret = 0;
- bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);
- bio->bi_iter.bi_sector = swap_page_sector(page);
- bio->bi_end_io = end_swap_bio_read;
- bio_add_page(bio, page, thp_size(page), 0);
- /*
- * Keep this task valid during swap readpage because the oom killer may
- * attempt to access it in the page fault retry time check.
- */
- if (synchronous) {
- get_task_struct(current);
- bio->bi_private = current;
- }
- count_vm_event(PSWPIN);
- bio_get(bio);
- submit_bio(bio);
- while (synchronous) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (!READ_ONCE(bio->bi_private))
- break;
-
- blk_io_schedule();
+ } else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) {
+ swap_readpage_bdev_sync(page, sis);
+ } else {
+ swap_readpage_bdev_async(page, sis);
}
- __set_current_state(TASK_RUNNING);
- bio_put(bio);
-out:
if (workingset) {
delayacct_thrashing_end(&in_thrashing);
psi_memstall_leave(&pflags);
}
delayacct_swapin_end();
- return ret;
}
void __swap_read_unplug(struct swap_iocb *sio)
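
For contrast, and only as an illustration (not in the patch): the asynchronous shape used by swap_readpage_bdev_async() allocates the bio, attaches a completion callback and returns as soon as submit_bio() has queued the I/O. A rough sketch under those assumptions; demo_end_read() and read_one_page_async() are made-up names, and real completion handlers do more bookkeeping.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/pagemap.h>

static void demo_end_read(struct bio *bio)
{
	struct page *page = bio_first_page_all(bio);

	if (bio->bi_status)
		SetPageError(page);
	else
		SetPageUptodate(page);
	unlock_page(page);
	bio_put(bio);			/* drop the reference from bio_alloc() */
}

static void read_one_page_async(struct block_device *bdev, struct page *page,
				sector_t sector)
{
	struct bio *bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_KERNEL);

	bio->bi_iter.bi_sector = sector;
	bio->bi_end_io = demo_end_read;
	__bio_add_page(bio, page, PAGE_SIZE, 0);
	submit_bio(bio);		/* demo_end_read() runs on completion */
}
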
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 2d27f532df4c..220cdeddc295 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -48,7 +48,7 @@ static int __init early_page_owner_param(char *buf)
int ret = kstrtobool(buf, &page_owner_enabled);
if (page_owner_enabled)
- stack_depot_want_early_init();
+ stack_depot_request_early_init();
return ret;
}
@@ -99,6 +99,7 @@ struct page_ext_operations page_owner_ops = {
.size = sizeof(struct page_owner),
.need = need_page_owner,
.init = init_page_owner,
+ .need_shared_flags = true,
};
static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
@@ -162,6 +163,7 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext,
{
struct page_owner *page_owner;
int i;
+ u64 ts_nsec = local_clock();
for (i = 0; i < (1 << order); i++) {
page_owner = get_page_owner(page_ext);
@@ -171,7 +173,7 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext,
page_owner->last_migrate_reason = -1;
page_owner->pid = current->pid;
page_owner->tgid = current->tgid;
- page_owner->ts_nsec = local_clock();
+ page_owner->ts_nsec = ts_nsec;
strscpy(page_owner->comm, current->comm,
sizeof(page_owner->comm));
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 79a8554f024c..c65813a9dc78 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -356,7 +356,8 @@ int page_reporting_register(struct page_reporting_dev_info *prdev)
mutex_lock(&page_reporting_mutex);
/* nothing to do if already in use */
- if (rcu_access_pointer(pr_dev_info)) {
+ if (rcu_dereference_protected(pr_dev_info,
+ lockdep_is_held(&page_reporting_mutex))) {
err = -EBUSY;
goto err_out;
}
@@ -401,7 +402,8 @@ void page_reporting_unregister(struct page_reporting_dev_info *prdev)
{
mutex_lock(&page_reporting_mutex);
- if (rcu_access_pointer(pr_dev_info) == prdev) {
+ if (prdev == rcu_dereference_protected(pr_dev_info,
+ lockdep_is_held(&page_reporting_mutex))) {
/* Disable page reporting notification */
RCU_INIT_POINTER(pr_dev_info, NULL);
synchronize_rcu();
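
Illustration only, not part of the patch: the two hunks above switch from rcu_access_pointer() to rcu_dereference_protected(), which is the right primitive when the update-side lock is held, because it documents that assumption and lets lockdep verify it. A small sketch of the pattern; demo_dev, demo_lock and demo_register() are hypothetical names.

#include <linux/mutex.h>
#include <linux/rcupdate.h>

struct demo_dev { int id; };

static DEFINE_MUTEX(demo_lock);
static struct demo_dev __rcu *demo_ptr;

static int demo_register(struct demo_dev *dev)
{
	int err = 0;

	mutex_lock(&demo_lock);
	/* Safe: demo_lock is held, and lockdep checks exactly that claim. */
	if (rcu_dereference_protected(demo_ptr, lockdep_is_held(&demo_lock)))
		err = -EBUSY;
	else
		rcu_assign_pointer(demo_ptr, dev);
	mutex_unlock(&demo_lock);
	return err;
}
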
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 93e633c1d587..25d8610c0042 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -45,6 +45,7 @@ struct page_ext_operations page_table_check_ops = {
.size = sizeof(struct page_table_check),
.need = need_page_table_check,
.init = init_page_table_check,
+ .need_shared_flags = false,
};
static struct page_table_check *get_page_table_check(struct page_ext *page_ext)
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 93e13fc17d3c..4e448cfbc6ef 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -168,9 +168,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
/* The only possible mapping was handled on last iteration */
if (pvmw->pte)
return not_found(pvmw);
-
- /* when pud is not present, pte will be NULL */
- pvmw->pte = huge_pte_offset(mm, pvmw->address, size);
+ /*
+ * All callers that get here will already hold the
+ * i_mmap_rwsem. Therefore, no additional locks need to be
+ * taken before calling hugetlb_walk().
+ */
+ pvmw->pte = hugetlb_walk(vma, pvmw->address, size);
if (!pvmw->pte)
return false;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 7f1c9b274906..cb23f8a15c13 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -302,18 +302,18 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
const struct mm_walk_ops *ops = walk->ops;
int err = 0;
+ hugetlb_vma_lock_read(vma);
do {
next = hugetlb_entry_end(h, addr, end);
- pte = huge_pte_offset(walk->mm, addr & hmask, sz);
-
+ pte = hugetlb_walk(vma, addr & hmask, sz);
if (pte)
err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
else if (ops->pte_hole)
err = ops->pte_hole(addr, next, -1, walk);
-
if (err)
break;
} while (addr = next, addr != end);
+ hugetlb_vma_unlock_read(vma);
return err;
}
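
Illustrative sketch, not from the patch: the hunk above encodes the new rule that hugetlb_walk() must run under the hugetlb VMA lock (or i_mmap_rwsem), so walk_hugetlb_range() now brackets its loop with hugetlb_vma_lock_read()/hugetlb_vma_unlock_read(). Below is what an open-coded walker might look like under that rule; count_present_hugetlb() is a hypothetical helper and assumes a hugetlb VMA whose start is huge-page aligned.

#include <linux/hugetlb.h>
#include <linux/mm.h>

static unsigned long count_present_hugetlb(struct vm_area_struct *vma)
{
	struct hstate *h = hstate_vma(vma);
	unsigned long sz = huge_page_size(h);
	unsigned long addr, present = 0;
	pte_t *pte;

	hugetlb_vma_lock_read(vma);
	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
		pte = hugetlb_walk(vma, addr, sz);	/* NULL means a hole */
		if (pte && !huge_pte_none(huge_ptep_get(pte)))
			present++;
	}
	hugetlb_vma_unlock_read(vma);
	return present;
}
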
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 70b1ea23f4d2..f9847c131998 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -4,6 +4,7 @@
#include <linux/types.h>
#include <linux/percpu.h>
+#include <linux/memcontrol.h>
/*
* pcpu_block_md is the metadata block struct.
@@ -118,14 +119,15 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
* @size: size of area to allocate in bytes
*
* For each accounted object there is an extra space which is used to store
- * obj_cgroup membership. Charge it too.
+ * obj_cgroup membership if kmemcg is not disabled. Charge it too.
*/
static inline size_t pcpu_obj_full_size(size_t size)
{
size_t extra_size = 0;
#ifdef CONFIG_MEMCG_KMEM
- extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *);
+ if (!mem_cgroup_kmem_disabled())
+ extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *);
#endif
return size * num_possible_cpus() + extra_size;
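
Worked example with assumed numbers, not taken from the patch: with PCPU_MIN_ALLOC_SIZE of 4 bytes, 8-byte pointers and 4 possible CPUs, a 64-byte percpu allocation accounts 64 * 4 = 256 bytes of data plus, when kmem accounting is active, 64 / 4 * 8 = 128 bytes of obj_cgroup pointers, 384 bytes in total. With kmem accounting disabled, the new mem_cgroup_kmem_disabled() check skips the extra 128 bytes, which is the point of this hunk.
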
diff --git a/mm/percpu.c b/mm/percpu.c
index acd78da0493b..28e07ede46f6 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1625,7 +1625,7 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
{
struct obj_cgroup *objcg;
- if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT))
+ if (!memcg_kmem_online() || !(gfp & __GFP_ACCOUNT))
return true;
objcg = get_obj_cgroup_from_current();
diff --git a/mm/readahead.c b/mm/readahead.c
index b10f0cf81d80..47afbca1d122 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -801,21 +801,25 @@ void readahead_expand(struct readahead_control *ractl,
/* Expand the leading edge downwards */
while (ractl->_index > new_index) {
unsigned long index = ractl->_index - 1;
- struct page *page = xa_load(&mapping->i_pages, index);
+ struct folio *folio = xa_load(&mapping->i_pages, index);
- if (page && !xa_is_value(page))
- return; /* Page apparently present */
+ if (folio && !xa_is_value(folio))
+ return; /* Folio apparently present */
- page = __page_cache_alloc(gfp_mask);
- if (!page)
+ folio = filemap_alloc_folio(gfp_mask, 0);
+ if (!folio)
return;
- if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) {
- put_page(page);
+ if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) {
+ folio_put(folio);
return;
}
-
+ if (unlikely(folio_test_workingset(folio)) &&
+ !ractl->_workingset) {
+ ractl->_workingset = true;
+ psi_memstall_enter(&ractl->_pflags);
+ }
ractl->_nr_pages++;
- ractl->_index = page->index;
+ ractl->_index = folio->index;
}
new_len += new_start - readahead_pos(ractl);
@@ -824,19 +828,20 @@ void readahead_expand(struct readahead_control *ractl,
/* Expand the trailing edge upwards */
while (ractl->_nr_pages < new_nr_pages) {
unsigned long index = ractl->_index + ractl->_nr_pages;
- struct page *page = xa_load(&mapping->i_pages, index);
+ struct folio *folio = xa_load(&mapping->i_pages, index);
- if (page && !xa_is_value(page))
- return; /* Page apparently present */
+ if (folio && !xa_is_value(folio))
+ return; /* Folio apparently present */
- page = __page_cache_alloc(gfp_mask);
- if (!page)
+ folio = filemap_alloc_folio(gfp_mask, 0);
+ if (!folio)
return;
- if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) {
- put_page(page);
+ if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) {
+ folio_put(folio);
return;
}
- if (unlikely(PageWorkingset(page)) && !ractl->_workingset) {
+ if (unlikely(folio_test_workingset(folio)) &&
+ !ractl->_workingset) {
ractl->_workingset = true;
psi_memstall_enter(&ractl->_pflags);
}
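
For illustration only (not in the patch): both loops above now follow the same folio-native "add if absent" shape: peek at the xarray, allocate an order-0 folio, and let filemap_add_folio() handle LRU placement and charging. A condensed sketch of that shape; try_add_one_folio() is a hypothetical helper, and real callers also distinguish -EEXIST races and shadow (value) entries.

#include <linux/pagemap.h>
#include <linux/xarray.h>

static struct folio *try_add_one_folio(struct address_space *mapping,
				       pgoff_t index, gfp_t gfp)
{
	struct folio *folio = xa_load(&mapping->i_pages, index);

	if (folio && !xa_is_value(folio))
		return NULL;		/* already present in the page cache */

	folio = filemap_alloc_folio(gfp, 0);	/* order-0 folio */
	if (!folio)
		return NULL;

	if (filemap_add_folio(mapping, folio, index, gfp) < 0) {
		folio_put(folio);
		return NULL;
	}
	return folio;			/* locked and in the page cache */
}
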
diff --git a/mm/rmap.c b/mm/rmap.c
index b616870a09be..8632e02661ac 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -262,11 +262,12 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
* Attach the anon_vmas from src to dst.
* Returns 0 on success, -ENOMEM on failure.
*
- * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and
- * anon_vma_fork(). The first three want an exact copy of src, while the last
- * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
- * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
- * we can identify this case by checking (!dst->anon_vma && src->anon_vma).
+ * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
+ * copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
+ * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
+ * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL
+ * before the call, we can identify this case by checking (!dst->anon_vma &&
+ * src->anon_vma).
*
* If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
* and reuse existing anon_vma which has no vmas and only one child anon_vma.
@@ -823,25 +824,14 @@ static bool folio_referenced_one(struct folio *folio,
}
if (pvmw.pte) {
- if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
- !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
+ if (lru_gen_enabled() && pte_young(*pvmw.pte)) {
lru_gen_look_around(&pvmw);
referenced++;
}
if (ptep_clear_flush_young_notify(vma, address,
- pvmw.pte)) {
- /*
- * Don't treat a reference through
- * a sequentially read mapping as such.
- * If the folio has been used in another mapping,
- * we will catch it; if this other mapping is
- * already gone, the unmap path will have set
- * the referenced flag or activated the folio.
- */
- if (likely(!(vma->vm_flags & VM_SEQ_READ)))
- referenced++;
- }
+ pvmw.pte))
+ referenced++;
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
if (pmdp_clear_flush_young_notify(vma, address,
pvmw.pmd))
@@ -875,7 +865,20 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
struct folio_referenced_arg *pra = arg;
struct mem_cgroup *memcg = pra->memcg;
- if (!mm_match_cgroup(vma->vm_mm, memcg))
+ /*
+ * Ignore references from this mapping if it has no recency. If the
+ * folio has been used in another mapping, we will catch it; if this
+ * other mapping is already gone, the unmap path will have set the
+ * referenced flag or activated the folio in zap_pte_range().
+ */
+ if (!vma_has_recency(vma))
+ return true;
+
+ /*
+ * If we are reclaiming on behalf of a cgroup, skip counting on behalf
+ * of references from different cgroups.
+ */
+ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
return true;
return false;
@@ -906,6 +909,7 @@ int folio_referenced(struct folio *folio, int is_locked,
.arg = (void *)&pra,
.anon_lock = folio_lock_anon_vma_read,
.try_lock = true,
+ .invalid_vma = invalid_folio_referenced_vma,
};
*vm_flags = 0;
@@ -921,15 +925,6 @@ int folio_referenced(struct folio *folio, int is_locked,
return 1;
}
- /*
- * If we are reclaiming on behalf of a cgroup, skip
- * counting on behalf of references from different
- * cgroups
- */
- if (memcg) {
- rwc.invalid_vma = invalid_folio_referenced_vma;
- }
-
rmap_walk(folio, &rwc);
*vm_flags = pra.vm_flags;
@@ -950,9 +945,8 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
* We have to assume the worse case ie pmd for invalidation. Note that
* the folio can not be freed from this function.
*/
- mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
- 0, vma, vma->vm_mm, address,
- vma_address_end(pvmw));
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0,
+ vma->vm_mm, address, vma_address_end(pvmw));
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(pvmw)) {
@@ -1085,26 +1079,26 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
return page_vma_mkclean_one(&pvmw);
}
-int total_compound_mapcount(struct page *head)
+int folio_total_mapcount(struct folio *folio)
{
- int mapcount = head_compound_mapcount(head);
- int nr_subpages;
+ int mapcount = folio_entire_mapcount(folio);
+ int nr_pages;
int i;
- /* In the common case, avoid the loop when no subpages mapped by PTE */
- if (head_subpages_mapcount(head) == 0)
+ /* In the common case, avoid the loop when no pages mapped by PTE */
+ if (folio_nr_pages_mapped(folio) == 0)
return mapcount;
/*
- * Add all the PTE mappings of those subpages mapped by PTE.
- * Limit the loop, knowing that only subpages_mapcount are mapped?
+ * Add all the PTE mappings of those pages mapped by PTE.
+ * Limit the loop to folio_nr_pages_mapped()?
* Perhaps: given all the raciness, that may be a good or a bad idea.
*/
- nr_subpages = thp_nr_pages(head);
- for (i = 0; i < nr_subpages; i++)
- mapcount += atomic_read(&head[i]._mapcount);
+ nr_pages = folio_nr_pages(folio);
+ for (i = 0; i < nr_pages; i++)
+ mapcount += atomic_read(&folio_page(folio, i)->_mapcount);
/* But each of those _mapcounts was based on -1 */
- mapcount += nr_subpages;
+ mapcount += nr_pages;
return mapcount;
}
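
Worked example for the rewritten counting above (numbers assumed for illustration): take a 512-page PMD-sized anonymous folio that is PMD-mapped by one process and PTE-mapped, one PTE per page, by a second process. folio_entire_mapcount() is 1, and each page's _mapcount reads 0 because the per-page counters start at -1. The loop therefore adds 512 * 0 = 0, the final correction adds nr_pages = 512, and folio_total_mapcount() returns 1 + 0 + 512 = 513, that is, one PMD mapping plus 512 PTE mappings.
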
@@ -1138,19 +1132,20 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
/**
* __page_set_anon_rmap - set up new anonymous rmap
- * @page: Page or Hugepage to add to rmap
+ * @folio: Folio which contains page.
+ * @page: Page to add to rmap.
* @vma: VM area to add page to.
* @address: User virtual address of the mapping
* @exclusive: the page is exclusively owned by the current process
*/
-static void __page_set_anon_rmap(struct page *page,
+static void __page_set_anon_rmap(struct folio *folio, struct page *page,
struct vm_area_struct *vma, unsigned long address, int exclusive)
{
struct anon_vma *anon_vma = vma->anon_vma;
BUG_ON(!anon_vma);
- if (PageAnon(page))
+ if (folio_test_anon(folio))
goto out;
/*
@@ -1162,14 +1157,14 @@ static void __page_set_anon_rmap(struct page *page,
anon_vma = anon_vma->root;
/*
- * page_idle does a lockless/optimistic rmap scan on page->mapping.
+ * page_idle does a lockless/optimistic rmap scan on folio->mapping.
* Make sure the compiler doesn't split the stores of anon_vma and
* the PAGE_MAPPING_ANON type identifier, otherwise the rmap code
* could mistake the mapping for a struct address_space and crash.
*/
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
- WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
- page->index = linear_page_index(vma, address);
+ WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma);
+ folio->index = linear_page_index(vma, address);
out:
if (exclusive)
SetPageAnonExclusive(page);
@@ -1214,36 +1209,32 @@ static void __page_check_anon_rmap(struct page *page,
* and to ensure that PageAnon is not being upgraded racily to PageKsm
* (but PageKsm is never downgraded to PageAnon).
*/
-void page_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address, rmap_t flags)
+void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, rmap_t flags)
{
- atomic_t *mapped;
+ struct folio *folio = page_folio(page);
+ atomic_t *mapped = &folio->_nr_pages_mapped;
int nr = 0, nr_pmdmapped = 0;
bool compound = flags & RMAP_COMPOUND;
bool first = true;
- if (unlikely(PageKsm(page)))
- lock_page_memcg(page);
-
/* Is page being mapped by PTE? Is this its first map to be added? */
if (likely(!compound)) {
first = atomic_inc_and_test(&page->_mapcount);
nr = first;
- if (first && PageCompound(page)) {
- mapped = subpages_mapcount_ptr(compound_head(page));
+ if (first && folio_test_large(folio)) {
nr = atomic_inc_return_relaxed(mapped);
nr = (nr < COMPOUND_MAPPED);
}
- } else if (PageTransHuge(page)) {
+ } else if (folio_test_pmd_mappable(folio)) {
/* That test is redundant: it's for safety or to optimize out */
- first = atomic_inc_and_test(compound_mapcount_ptr(page));
+ first = atomic_inc_and_test(&folio->_entire_mapcount);
if (first) {
- mapped = subpages_mapcount_ptr(page);
nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped);
if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) {
- nr_pmdmapped = thp_nr_pages(page);
- nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED);
+ nr_pmdmapped = folio_nr_pages(folio);
+ nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
/* Raced ahead of a remove and another add? */
if (unlikely(nr < 0))
nr = 0;
@@ -1258,59 +1249,57 @@ void page_add_anon_rmap(struct page *page,
VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
if (nr_pmdmapped)
- __mod_lruvec_page_state(page, NR_ANON_THPS, nr_pmdmapped);
+ __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped);
if (nr)
- __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
+ __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);
- if (unlikely(PageKsm(page)))
- unlock_page_memcg(page);
-
- /* address might be in next vma when migration races vma_adjust */
- else if (first)
- __page_set_anon_rmap(page, vma, address,
- !!(flags & RMAP_EXCLUSIVE));
- else
- __page_check_anon_rmap(page, vma, address);
+ if (likely(!folio_test_ksm(folio))) {
+ /* address might be in next vma when migration races vma_merge */
+ if (first)
+ __page_set_anon_rmap(folio, page, vma, address,
+ !!(flags & RMAP_EXCLUSIVE));
+ else
+ __page_check_anon_rmap(page, vma, address);
+ }
- mlock_vma_page(page, vma, compound);
+ mlock_vma_folio(folio, vma, compound);
}
/**
- * page_add_new_anon_rmap - add mapping to a new anonymous page
- * @page: the page to add the mapping to
+ * folio_add_new_anon_rmap - Add mapping to a new anonymous folio.
+ * @folio: The folio to add the mapping to.
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*
- * If it's a compound page, it is accounted as a compound page. As the page
- * is new, it's assume to get mapped exclusively by a single process.
- *
- * Same as page_add_anon_rmap but must only be called on *new* pages.
+ * Like page_add_anon_rmap() but must only be called on *new* folios.
* This means the inc-and-test can be bypassed.
- * Page does not have to be locked.
+ * The folio does not have to be locked.
+ *
+ * If the folio is large, it is accounted as a THP. As the folio
+ * is new, it's assumed to be mapped exclusively by a single process.
*/
-void page_add_new_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address)
+void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
+ unsigned long address)
{
int nr;
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
- __SetPageSwapBacked(page);
+ __folio_set_swapbacked(folio);
- if (likely(!PageCompound(page))) {
+ if (likely(!folio_test_pmd_mappable(folio))) {
/* increment count (starts at -1) */
- atomic_set(&page->_mapcount, 0);
+ atomic_set(&folio->_mapcount, 0);
nr = 1;
} else {
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
/* increment count (starts at -1) */
- atomic_set(compound_mapcount_ptr(page), 0);
- atomic_set(subpages_mapcount_ptr(page), COMPOUND_MAPPED);
- nr = thp_nr_pages(page);
- __mod_lruvec_page_state(page, NR_ANON_THPS, nr);
+ atomic_set(&folio->_entire_mapcount, 0);
+ atomic_set(&folio->_nr_pages_mapped, COMPOUND_MAPPED);
+ nr = folio_nr_pages(folio);
+ __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr);
}
- __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
- __page_set_anon_rmap(page, vma, address, 1);
+ __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);
+ __page_set_anon_rmap(folio, &folio->page, vma, address, 1);
}
/**
@@ -1321,35 +1310,33 @@ void page_add_new_anon_rmap(struct page *page,
*
* The caller needs to hold the pte lock.
*/
-void page_add_file_rmap(struct page *page,
- struct vm_area_struct *vma, bool compound)
+void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
+ bool compound)
{
- atomic_t *mapped;
+ struct folio *folio = page_folio(page);
+ atomic_t *mapped = &folio->_nr_pages_mapped;
int nr = 0, nr_pmdmapped = 0;
bool first;
VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
- lock_page_memcg(page);
/* Is page being mapped by PTE? Is this its first map to be added? */
if (likely(!compound)) {
first = atomic_inc_and_test(&page->_mapcount);
nr = first;
- if (first && PageCompound(page)) {
- mapped = subpages_mapcount_ptr(compound_head(page));
+ if (first && folio_test_large(folio)) {
nr = atomic_inc_return_relaxed(mapped);
nr = (nr < COMPOUND_MAPPED);
}
- } else if (PageTransHuge(page)) {
+ } else if (folio_test_pmd_mappable(folio)) {
/* That test is redundant: it's for safety or to optimize out */
- first = atomic_inc_and_test(compound_mapcount_ptr(page));
+ first = atomic_inc_and_test(&folio->_entire_mapcount);
if (first) {
- mapped = subpages_mapcount_ptr(page);
nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped);
if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) {
- nr_pmdmapped = thp_nr_pages(page);
- nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED);
+ nr_pmdmapped = folio_nr_pages(folio);
+ nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
/* Raced ahead of a remove and another add? */
if (unlikely(nr < 0))
nr = 0;
@@ -1361,13 +1348,12 @@ void page_add_file_rmap(struct page *page,
}
if (nr_pmdmapped)
- __mod_lruvec_page_state(page, PageSwapBacked(page) ?
+ __lruvec_stat_mod_folio(folio, folio_test_swapbacked(folio) ?
NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped);
if (nr)
- __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
- unlock_page_memcg(page);
+ __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr);
- mlock_vma_page(page, vma, compound);
+ mlock_vma_folio(folio, vma, compound);
}
/**
@@ -1378,43 +1364,41 @@ void page_add_file_rmap(struct page *page,
*
* The caller needs to hold the pte lock.
*/
-void page_remove_rmap(struct page *page,
- struct vm_area_struct *vma, bool compound)
+void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
+ bool compound)
{
- atomic_t *mapped;
+ struct folio *folio = page_folio(page);
+ atomic_t *mapped = &folio->_nr_pages_mapped;
int nr = 0, nr_pmdmapped = 0;
bool last;
+ enum node_stat_item idx;
VM_BUG_ON_PAGE(compound && !PageHead(page), page);
/* Hugetlb pages are not counted in NR_*MAPPED */
- if (unlikely(PageHuge(page))) {
+ if (unlikely(folio_test_hugetlb(folio))) {
/* hugetlb pages are always mapped with pmds */
- atomic_dec(compound_mapcount_ptr(page));
+ atomic_dec(&folio->_entire_mapcount);
return;
}
- lock_page_memcg(page);
-
/* Is page being unmapped by PTE? Is this its last map to be removed? */
if (likely(!compound)) {
last = atomic_add_negative(-1, &page->_mapcount);
nr = last;
- if (last && PageCompound(page)) {
- mapped = subpages_mapcount_ptr(compound_head(page));
+ if (last && folio_test_large(folio)) {
nr = atomic_dec_return_relaxed(mapped);
nr = (nr < COMPOUND_MAPPED);
}
- } else if (PageTransHuge(page)) {
+ } else if (folio_test_pmd_mappable(folio)) {
/* That test is redundant: it's for safety or to optimize out */
- last = atomic_add_negative(-1, compound_mapcount_ptr(page));
+ last = atomic_add_negative(-1, &folio->_entire_mapcount);
if (last) {
- mapped = subpages_mapcount_ptr(page);
nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped);
if (likely(nr < COMPOUND_MAPPED)) {
- nr_pmdmapped = thp_nr_pages(page);
- nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED);
+ nr_pmdmapped = folio_nr_pages(folio);
+ nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
/* Raced ahead of another remove and an add? */
if (unlikely(nr < 0))
nr = 0;
@@ -1426,34 +1410,37 @@ void page_remove_rmap(struct page *page,
}
if (nr_pmdmapped) {
- __mod_lruvec_page_state(page, PageAnon(page) ? NR_ANON_THPS :
- (PageSwapBacked(page) ? NR_SHMEM_PMDMAPPED :
- NR_FILE_PMDMAPPED), -nr_pmdmapped);
+ if (folio_test_anon(folio))
+ idx = NR_ANON_THPS;
+ else if (folio_test_swapbacked(folio))
+ idx = NR_SHMEM_PMDMAPPED;
+ else
+ idx = NR_FILE_PMDMAPPED;
+ __lruvec_stat_mod_folio(folio, idx, -nr_pmdmapped);
}
if (nr) {
- __mod_lruvec_page_state(page, PageAnon(page) ? NR_ANON_MAPPED :
- NR_FILE_MAPPED, -nr);
+ idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
+ __lruvec_stat_mod_folio(folio, idx, -nr);
+
/*
- * Queue anon THP for deferred split if at least one small
- * page of the compound page is unmapped, but at least one
- * small page is still mapped.
+ * Queue anon THP for deferred split if at least one
+ * page of the folio is unmapped and at least one page
+ * is still mapped.
*/
- if (PageTransCompound(page) && PageAnon(page))
+ if (folio_test_pmd_mappable(folio) && folio_test_anon(folio))
if (!compound || nr < nr_pmdmapped)
- deferred_split_huge_page(compound_head(page));
+ deferred_split_folio(folio);
}
/*
- * It would be tidy to reset PageAnon mapping when fully unmapped,
- * but that might overwrite a racing page_add_anon_rmap
- * which increments mapcount after us but sets mapping
- * before us: so leave the reset to free_pages_prepare,
- * and remember that it's only reliable while mapped.
+ * It would be tidy to reset folio_test_anon mapping when fully
+ * unmapped, but that might overwrite a racing page_add_anon_rmap
+ * which increments mapcount after us but sets mapping before us:
+ * so leave the reset to free_pages_prepare, and remember that
+ * it's only reliable while mapped.
*/
- unlock_page_memcg(page);
-
- munlock_vma_page(page, vma, compound);
+ munlock_vma_folio(folio, vma, compound);
}
/*
@@ -1491,7 +1478,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* try_to_unmap() must hold a reference on the folio.
*/
range.end = vma_address_end(&pvmw);
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, range.end);
if (folio_test_hugetlb(folio)) {
/*
@@ -1615,7 +1602,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
- if (PageHWPoison(subpage) && !(flags & TTU_IGNORE_HWPOISON)) {
+ if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (folio_test_hugetlb(folio)) {
hugetlb_count_sub(folio_nr_pages(folio), mm);
@@ -1725,17 +1712,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
page_vma_mapped_walk_done(&pvmw);
break;
}
- /*
- * Note: We *don't* remember if the page was mapped
- * exclusively in the swap pte if the architecture
- * doesn't support __HAVE_ARCH_PTE_SWP_EXCLUSIVE. In
- * that case, swapin code has to re-determine that
- * manually and might detect the page as possibly
- * shared, for example, if there are other references on
- * the page or if the page is under writeback. We made
- * sure that there are no GUP pins on the page that
- * would rely on it, so for GUP pins this is fine.
- */
if (list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
if (list_empty(&mm->mmlist))
@@ -1779,7 +1755,7 @@ discard:
*/
page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
if (vma->vm_flags & VM_LOCKED)
- mlock_page_drain_local();
+ mlock_drain_local();
folio_put(folio);
}
@@ -1866,7 +1842,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* try_to_unmap() must hold a reference on the page.
*/
range.end = vma_address_end(&pvmw);
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, range.end);
if (folio_test_hugetlb(folio)) {
/*
@@ -1976,7 +1952,21 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
} else {
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
/* Nuke the page table entry. */
- pteval = ptep_clear_flush(vma, address, pvmw.pte);
+ if (should_defer_flush(mm, flags)) {
+ /*
+ * We clear the PTE but do not flush so potentially
+ * a remote CPU could still be writing to the folio.
+ * If the entry was previously clean then the
+ * architecture must guarantee that a clear->dirty
+ * transition on a cached TLB entry is written through
+ * and traps if the PTE is unmapped.
+ */
+ pteval = ptep_get_and_clear(mm, address, pvmw.pte);
+
+ set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
+ } else {
+ pteval = ptep_clear_flush(vma, address, pvmw.pte);
+ }
}
/* Set the dirty flag on the folio now the pte is gone. */
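
Illustration only, not part of the patch: the hunk above lets try_to_migrate_one() defer TLB flushes the way try_to_unmap_one() already does, through the file-local should_defer_flush()/set_tlb_ubc_flush_pending() machinery and arch-specific batching. The sketch below shows only the underlying idea, one ranged flush instead of a flush per PTE; demo_clear_ptes() is a made-up helper that assumes the PTE lock is held and that the range stays within a single PTE table.

#include <linux/mm.h>
#include <asm/tlbflush.h>

static void demo_clear_ptes(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end, pte_t *ptep)
{
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE, ptep++)
		ptep_get_and_clear(vma->vm_mm, addr, ptep);	/* no flush here */

	/* One ranged flush for the batch instead of an IPI per entry. */
	flush_tlb_range(vma, start, end);
}
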
@@ -2120,7 +2110,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
*/
page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
if (vma->vm_flags & VM_LOCKED)
- mlock_page_drain_local();
+ mlock_drain_local();
folio_put(folio);
}
@@ -2148,10 +2138,10 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags)
/*
* Migration always ignores mlock and only supports TTU_RMAP_LOCKED and
- * TTU_SPLIT_HUGE_PMD and TTU_SYNC flags.
+ * TTU_SPLIT_HUGE_PMD, TTU_SYNC, and TTU_BATCH_FLUSH flags.
*/
if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
- TTU_SYNC)))
+ TTU_SYNC | TTU_BATCH_FLUSH)))
return;
if (folio_is_zone_device(folio) &&
@@ -2196,7 +2186,7 @@ static bool page_make_device_exclusive_one(struct folio *folio,
swp_entry_t entry;
pte_t swp_pte;
- mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
vma->vm_mm, address, min(vma->vm_end,
address + folio_size(folio)),
args->owner);
@@ -2543,27 +2533,28 @@ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc)
void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
unsigned long address, rmap_t flags)
{
+ struct folio *folio = page_folio(page);
struct anon_vma *anon_vma = vma->anon_vma;
int first;
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
BUG_ON(!anon_vma);
- /* address might be in next vma when migration races vma_adjust */
- first = atomic_inc_and_test(compound_mapcount_ptr(page));
+ /* address might be in next vma when migration races vma_merge */
+ first = atomic_inc_and_test(&folio->_entire_mapcount);
VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
if (first)
- __page_set_anon_rmap(page, vma, address,
+ __page_set_anon_rmap(folio, page, vma, address,
!!(flags & RMAP_EXCLUSIVE));
}
-void hugepage_add_new_anon_rmap(struct page *page,
+void hugepage_add_new_anon_rmap(struct folio *folio,
struct vm_area_struct *vma, unsigned long address)
{
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
/* increment count (starts at -1) */
- atomic_set(compound_mapcount_ptr(page), 0);
- ClearHPageRestoreReserve(page);
- __page_set_anon_rmap(page, vma, address, 1);
+ atomic_set(&folio->_entire_mapcount, 0);
+ folio_clear_hugetlb_restore_reserve(folio);
+ __page_set_anon_rmap(folio, &folio->page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 04c3ac9448a1..0b502625cd30 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -128,7 +128,7 @@ static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
return -EAGAIN;
- vma->vm_flags |= VM_LOCKED | VM_DONTDUMP;
+ vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP);
vma->vm_ops = &secretmem_vm_ops;
return 0;
@@ -162,7 +162,7 @@ const struct address_space_operations secretmem_aops = {
.migrate_folio = secretmem_migrate_folio,
};
-static int secretmem_setattr(struct user_namespace *mnt_userns,
+static int secretmem_setattr(struct mnt_idmap *idmap,
struct dentry *dentry, struct iattr *iattr)
{
struct inode *inode = d_inode(dentry);
@@ -175,7 +175,7 @@ static int secretmem_setattr(struct user_namespace *mnt_userns,
if ((ia_valid & ATTR_SIZE) && inode->i_size)
ret = -EINVAL;
else
- ret = simple_setattr(mnt_userns, dentry, iattr);
+ ret = simple_setattr(idmap, dentry, iattr);
filemap_invalidate_unlock(mapping);
@@ -190,7 +190,7 @@ static struct vfsmount *secretmem_mnt;
static struct file *secretmem_file_create(unsigned long flags)
{
- struct file *file = ERR_PTR(-ENOMEM);
+ struct file *file;
struct inode *inode;
const char *anon_name = "[secretmem]";
const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name));
diff --git a/mm/shmem.c b/mm/shmem.c
index c301487be5fb..448f393d8ab2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -33,6 +33,7 @@
#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
+#include <linux/shmem_fs.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
@@ -58,7 +59,6 @@ static struct vfsmount *shm_mnt;
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
-#include <linux/shmem_fs.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
@@ -468,22 +468,19 @@ static bool shmem_confirm_swap(struct address_space *mapping,
static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
-bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
- pgoff_t index, bool shmem_huge_force)
+bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
+ struct mm_struct *mm, unsigned long vm_flags)
{
loff_t i_size;
if (!S_ISREG(inode->i_mode))
return false;
- if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)))
+ if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags)))
return false;
- if (shmem_huge_force)
- return true;
- if (shmem_huge == SHMEM_HUGE_FORCE)
- return true;
if (shmem_huge == SHMEM_HUGE_DENY)
return false;
+ if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
+ return true;
switch (SHMEM_SB(inode->i_sb)->huge) {
case SHMEM_HUGE_ALWAYS:
@@ -495,7 +492,7 @@ bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
return true;
fallthrough;
case SHMEM_HUGE_ADVISE:
- if (vma && (vma->vm_flags & VM_HUGEPAGE))
+ if (mm && (vm_flags & VM_HUGEPAGE))
return true;
fallthrough;
default:
@@ -678,8 +675,8 @@ static long shmem_unused_huge_count(struct super_block *sb,
#define shmem_huge SHMEM_HUGE_DENY
-bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
- pgoff_t index, bool shmem_huge_force)
+bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
+ struct mm_struct *mm, unsigned long vm_flags)
{
return false;
}
@@ -1047,7 +1044,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
-static int shmem_getattr(struct user_namespace *mnt_userns,
+static int shmem_getattr(struct mnt_idmap *idmap,
const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
@@ -1068,9 +1065,9 @@ static int shmem_getattr(struct user_namespace *mnt_userns,
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- generic_fillattr(&init_user_ns, inode, stat);
+ generic_fillattr(idmap, inode, stat);
- if (shmem_is_huge(NULL, inode, 0, false))
+ if (shmem_is_huge(inode, 0, false, NULL, 0))
stat->blksize = HPAGE_PMD_SIZE;
if (request_mask & STATX_BTIME) {
@@ -1082,7 +1079,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns,
return 0;
}
-static int shmem_setattr(struct user_namespace *mnt_userns,
+static int shmem_setattr(struct mnt_idmap *idmap,
struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
@@ -1091,10 +1088,16 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
bool update_mtime = false;
bool update_ctime = true;
- error = setattr_prepare(&init_user_ns, dentry, attr);
+ error = setattr_prepare(idmap, dentry, attr);
if (error)
return error;
+ if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
+ if ((inode->i_mode ^ attr->ia_mode) & 0111) {
+ return -EPERM;
+ }
+ }
+
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
loff_t oldsize = inode->i_size;
loff_t newsize = attr->ia_size;
@@ -1129,9 +1132,9 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
}
}
- setattr_copy(&init_user_ns, inode, attr);
+ setattr_copy(idmap, inode, attr);
if (attr->ia_valid & ATTR_MODE)
- error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
+ error = posix_acl_chmod(idmap, dentry, inode->i_mode);
if (!error && update_ctime) {
inode->i_ctime = current_time(inode);
if (update_mtime)
@@ -1735,6 +1738,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
+ struct swap_info_struct *si;
struct folio *folio = NULL;
swp_entry_t swap;
int error;
@@ -1746,6 +1750,14 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
if (is_swapin_error_entry(swap))
return -EIO;
+ si = get_swap_device(swap);
+ if (!si) {
+ if (!shmem_confirm_swap(mapping, index, swap))
+ return -EEXIST;
+ else
+ return -EINVAL;
+ }
+
/* Look it up and read it in.. */
folio = swap_cache_get_folio(swap, NULL, 0);
if (!folio) {
@@ -1806,6 +1818,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
delete_from_swap_cache(folio);
folio_mark_dirty(folio);
swap_free(swap);
+ put_swap_device(si);
*foliop = folio;
return 0;
@@ -1819,6 +1832,7 @@ unlock:
folio_unlock(folio);
folio_put(folio);
}
+ put_swap_device(si);
return error;
}
@@ -1911,7 +1925,8 @@ repeat:
return 0;
}
- if (!shmem_is_huge(vma, inode, index, false))
+ if (!shmem_is_huge(inode, index, false,
+ vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0))
goto alloc_nohuge;
huge_gfp = vma_thp_gfp_mask(vma);
@@ -2289,7 +2304,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
return ret;
/* arm64 - allow memory tagging on RAM-based files */
- vma->vm_flags |= VM_MTE_ALLOWED;
+ vm_flags_set(vma, VM_MTE_ALLOWED);
file_accessed(file);
/* This is anonymous shared memory if it is unlinked at the time of mmap */
@@ -2329,8 +2344,9 @@ static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
#define shmem_initxattrs NULL
#endif
-static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir,
- umode_t mode, dev_t dev, unsigned long flags)
+static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb,
+ struct inode *dir, umode_t mode, dev_t dev,
+ unsigned long flags)
{
struct inode *inode;
struct shmem_inode_info *info;
@@ -2343,7 +2359,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir,
inode = new_inode(sb);
if (inode) {
inode->i_ino = ino;
- inode_init_owner(&init_user_ns, inode, dir, mode);
+ inode_init_owner(idmap, inode, dir, mode);
inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
inode->i_generation = get_random_u32();
@@ -2563,33 +2579,23 @@ shmem_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
+ struct folio *folio = page_folio(page);
struct inode *inode = mapping->host;
if (pos + copied > inode->i_size)
i_size_write(inode, pos + copied);
- if (!PageUptodate(page)) {
- struct page *head = compound_head(page);
- if (PageTransCompound(page)) {
- int i;
-
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- if (head + i == page)
- continue;
- clear_highpage(head + i);
- flush_dcache_page(head + i);
- }
- }
- if (copied < PAGE_SIZE) {
- unsigned from = pos & (PAGE_SIZE - 1);
- zero_user_segments(page, 0, from,
- from + copied, PAGE_SIZE);
+ if (!folio_test_uptodate(folio)) {
+ if (copied < folio_size(folio)) {
+ size_t from = offset_in_folio(folio, pos);
+ folio_zero_segments(folio, 0, from,
+ from + copied, folio_size(folio));
}
- SetPageUptodate(head);
+ folio_mark_uptodate(folio);
}
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return copied;
}
@@ -2915,13 +2921,13 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
* File creation. Allocate an inode, and we're done..
*/
static int
-shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode *inode;
int error = -ENOSPC;
- inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
+ inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
if (inode) {
error = simple_acl_create(dir, inode);
if (error)
@@ -2946,13 +2952,13 @@ out_iput:
}
static int
-shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
struct file *file, umode_t mode)
{
struct inode *inode;
int error = -ENOSPC;
- inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
+ inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
if (inode) {
error = security_inode_init_security(inode, dir,
NULL,
@@ -2970,22 +2976,22 @@ out_iput:
return error;
}
-static int shmem_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
int error;
- if ((error = shmem_mknod(&init_user_ns, dir, dentry,
- mode | S_IFDIR, 0)))
+ error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
+ if (error)
return error;
inc_nlink(dir);
return 0;
}
-static int shmem_create(struct user_namespace *mnt_userns, struct inode *dir,
+static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl)
{
- return shmem_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
+ return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
}
/*
@@ -3045,7 +3051,7 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
return shmem_unlink(dir, dentry);
}
-static int shmem_whiteout(struct user_namespace *mnt_userns,
+static int shmem_whiteout(struct mnt_idmap *idmap,
struct inode *old_dir, struct dentry *old_dentry)
{
struct dentry *whiteout;
@@ -3055,7 +3061,7 @@ static int shmem_whiteout(struct user_namespace *mnt_userns,
if (!whiteout)
return -ENOMEM;
- error = shmem_mknod(&init_user_ns, old_dir, whiteout,
+ error = shmem_mknod(idmap, old_dir, whiteout,
S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
dput(whiteout);
if (error)
@@ -3078,7 +3084,7 @@ static int shmem_whiteout(struct user_namespace *mnt_userns,
* it exists so that the VFS layer correctly free's it when it
* gets overwritten.
*/
-static int shmem_rename2(struct user_namespace *mnt_userns,
+static int shmem_rename2(struct mnt_idmap *idmap,
struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
@@ -3098,7 +3104,7 @@ static int shmem_rename2(struct user_namespace *mnt_userns,
if (flags & RENAME_WHITEOUT) {
int error;
- error = shmem_whiteout(&init_user_ns, old_dir, old_dentry);
+ error = shmem_whiteout(idmap, old_dir, old_dentry);
if (error)
return error;
}
@@ -3124,7 +3130,7 @@ static int shmem_rename2(struct user_namespace *mnt_userns,
return 0;
}
-static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *symname)
{
int error;
@@ -3136,7 +3142,7 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (len > PAGE_SIZE)
return -ENAMETOOLONG;
- inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0,
+ inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
VM_NORESERVE);
if (!inode)
return -ENOSPC;
@@ -3229,7 +3235,7 @@ static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
return 0;
}
-static int shmem_fileattr_set(struct user_namespace *mnt_userns,
+static int shmem_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa)
{
struct inode *inode = d_inode(dentry);
@@ -3303,7 +3309,7 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler,
}
static int shmem_xattr_handler_set(const struct xattr_handler *handler,
- struct user_namespace *mnt_userns,
+ struct mnt_idmap *idmap,
struct dentry *unused, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
@@ -3819,7 +3825,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
#endif
uuid_gen(&sb->s_uuid);
- inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
+ inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0,
+ VM_NORESERVE);
if (!inode)
goto failed;
inode->i_uid = sbinfo->uid;
@@ -4044,7 +4051,11 @@ static struct file_system_type shmem_fs_type = {
.parameters = shmem_fs_parameters,
#endif
.kill_sb = kill_litter_super,
+#ifdef CONFIG_SHMEM
+ .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
+#else
.fs_flags = FS_USERNS_MOUNT,
+#endif
};
void __init shmem_init(void)
@@ -4196,7 +4207,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
#define shmem_vm_ops generic_file_vm_ops
#define shmem_anon_vm_ops generic_file_vm_ops
#define shmem_file_operations ramfs_file_operations
-#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
+#define shmem_get_inode(idmap, sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
#define shmem_acct_size(flags, size) 0
#define shmem_unacct_size(flags, size) do {} while (0)
@@ -4219,8 +4230,11 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l
if (shmem_acct_size(flags, size))
return ERR_PTR(-ENOMEM);
- inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0,
- flags);
+ if (is_idmapped_mnt(mnt))
+ return ERR_PTR(-EINVAL);
+
+ inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
+ S_IFREG | S_IRWXUGO, 0, flags);
if (unlikely(!inode)) {
shmem_unacct_size(flags, size);
return ERR_PTR(-ENOSPC);
@@ -4306,9 +4320,9 @@ int shmem_zero_setup(struct vm_area_struct *vma)
}
/**
- * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
- * @mapping: the page's address_space
- * @index: the page index
+ * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
+ * @mapping: the folio's address_space
+ * @index: the folio index
* @gfp: the page allocator flags to use if allocating
*
* This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
@@ -4320,13 +4334,12 @@ int shmem_zero_setup(struct vm_area_struct *vma)
* i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
* with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
*/
-struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
- pgoff_t index, gfp_t gfp)
+struct folio *shmem_read_folio_gfp(struct address_space *mapping,
+ pgoff_t index, gfp_t gfp)
{
#ifdef CONFIG_SHMEM
struct inode *inode = mapping->host;
struct folio *folio;
- struct page *page;
int error;
BUG_ON(!shmem_mapping(mapping));
@@ -4336,6 +4349,25 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
return ERR_PTR(error);
folio_unlock(folio);
+ return folio;
+#else
+ /*
+ * The tiny !SHMEM case uses ramfs without swap
+ */
+ return mapping_read_folio_gfp(mapping, index, gfp);
+#endif
+}
+EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);
+
+struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
+ pgoff_t index, gfp_t gfp)
+{
+ struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
+ struct page *page;
+
+ if (IS_ERR(folio))
+ return &folio->page;
+
page = folio_file_page(folio, index);
if (PageHWPoison(page)) {
folio_put(folio);
@@ -4343,11 +4375,5 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
}
return page;
-#else
- /*
- * The tiny !SHMEM case uses ramfs without swap
- */
- return read_cache_page_gfp(mapping, index, gfp);
-#endif
}
EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
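
A sketch, not part of the patch, of how a caller might use the new folio-returning helper directly instead of the page wrapper kept above. The folio comes back uptodate, unlocked and with a reference the caller must drop; read_shmem_byte() is a hypothetical function with minimal error handling.

#include <linux/shmem_fs.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>

static int read_shmem_byte(struct address_space *mapping, loff_t pos, u8 *out)
{
	struct folio *folio;
	u8 *kaddr;

	folio = shmem_read_folio_gfp(mapping, pos >> PAGE_SHIFT,
				     mapping_gfp_mask(mapping));
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	/* Map only the page that holds @pos and copy the byte out. */
	kaddr = kmap_local_folio(folio, offset_in_folio(folio, pos));
	*out = *kaddr;
	kunmap_local(kaddr);

	folio_put(folio);
	return 0;
}
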
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index b05295bab322..39c3491e28a3 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -246,18 +246,21 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
}
EXPORT_SYMBOL(shrinker_debugfs_rename);
-void shrinker_debugfs_remove(struct shrinker *shrinker)
+struct dentry *shrinker_debugfs_remove(struct shrinker *shrinker)
{
+ struct dentry *entry = shrinker->debugfs_entry;
+
lockdep_assert_held(&shrinker_rwsem);
kfree_const(shrinker->name);
shrinker->name = NULL;
- if (!shrinker->debugfs_entry)
- return;
+ if (entry) {
+ ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id);
+ shrinker->debugfs_entry = NULL;
+ }
- debugfs_remove_recursive(shrinker->debugfs_entry);
- ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id);
+ return entry;
}
static int __init shrinker_debugfs_init(void)
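
Illustrative sketch, not in the patch: shrinker_debugfs_remove() now hands the dentry back so the caller can do the blocking debugfs removal after dropping shrinker_rwsem. Below is the intended caller-side shape; demo_unregister_shrinker() is a made-up name, and the externs are repeated only so the sketch reads standalone (the real declarations live with the shrinker code).

#include <linux/debugfs.h>
#include <linux/rwsem.h>
#include <linux/shrinker.h>

extern struct rw_semaphore shrinker_rwsem;
extern struct dentry *shrinker_debugfs_remove(struct shrinker *shrinker);

static void demo_unregister_shrinker(struct shrinker *shrinker)
{
	struct dentry *debugfs_entry;

	down_write(&shrinker_rwsem);
	/* ... unlink the shrinker from the shrinker list under the lock ... */
	debugfs_entry = shrinker_debugfs_remove(shrinker);
	up_write(&shrinker_rwsem);

	/* debugfs removal may sleep; do it with no shrinker lock held. */
	debugfs_remove_recursive(debugfs_entry);
}
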
diff --git a/mm/slab.c b/mm/slab.c
index 0d35747b9472..edbe722fb906 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1370,7 +1370,7 @@ static struct slab *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
/* Make the flag visible before any changes to folio->mapping */
smp_wmb();
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
- if (sk_memalloc_socks() && page_is_pfmemalloc(folio_page(folio, 0)))
+ if (sk_memalloc_socks() && folio_is_pfmemalloc(folio))
slab_set_pfmemalloc(slab);
return slab;
diff --git a/mm/slab.h b/mm/slab.h
index 7cc432969945..43966aa5fadf 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -323,6 +323,14 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
}
#endif
+static inline bool is_kmalloc_cache(struct kmem_cache *s)
+{
+#ifndef CONFIG_SLOB
+ return (s->flags & SLAB_KMALLOC);
+#else
+ return false;
+#endif
+}
/* Legal flag mask for kmem_cache_create(), for various configurations */
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
@@ -486,7 +494,7 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
{
struct obj_cgroup *objcg;
- if (!memcg_kmem_enabled())
+ if (!memcg_kmem_online())
return true;
if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))
@@ -527,7 +535,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
unsigned long off;
size_t i;
- if (!memcg_kmem_enabled() || !objcg)
+ if (!memcg_kmem_online() || !objcg)
return;
for (i = 0; i < size; i++) {
@@ -559,7 +567,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
struct obj_cgroup **objcgs;
int i;
- if (!memcg_kmem_enabled())
+ if (!memcg_kmem_online())
return;
objcgs = slab_objcgs(slab);
@@ -641,7 +649,7 @@ static inline struct kmem_cache *virt_to_cache(const void *obj)
static __always_inline void account_slab(struct slab *slab, int order,
struct kmem_cache *s, gfp_t gfp)
{
- if (memcg_kmem_enabled() && (s->flags & SLAB_ACCOUNT))
+ if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
memcg_alloc_slab_cgroups(slab, s, gfp, true);
mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
@@ -651,7 +659,7 @@ static __always_inline void account_slab(struct slab *slab, int order,
static __always_inline void unaccount_slab(struct slab *slab, int order,
struct kmem_cache *s)
{
- if (memcg_kmem_enabled())
+ if (memcg_kmem_online())
memcg_free_slab_cgroups(slab);
mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 1cba98acc486..bf4e777cfe90 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -670,7 +670,6 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
create_boot_cache(s, name, size, flags | SLAB_KMALLOC, useroffset,
usersize);
- kasan_cache_create_kmalloc(s);
list_add(&s->list, &slab_caches);
s->refcount = 1;
return s;
diff --git a/mm/slub.c b/mm/slub.c
index 1013834fb7bb..39327e98fce3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1592,7 +1592,7 @@ static int __init setup_slub_debug(char *str)
} else {
slab_list_specified = true;
if (flags & SLAB_STORE_USER)
- stack_depot_want_early_init();
+ stack_depot_request_early_init();
}
}
@@ -1611,7 +1611,7 @@ static int __init setup_slub_debug(char *str)
out:
slub_debug = global_flags;
if (slub_debug & SLAB_STORE_USER)
- stack_depot_want_early_init();
+ stack_depot_request_early_init();
if (slub_debug != 0 || slub_debug_string)
static_branch_enable(&slub_debug_enabled);
else
@@ -1859,7 +1859,7 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node,
__folio_set_slab(folio);
/* Make the flag visible before any changes to folio->mapping */
smp_wmb();
- if (page_is_pfmemalloc(folio_page(folio, 0)))
+ if (folio_is_pfmemalloc(folio))
slab_set_pfmemalloc(slab);
return slab;
diff --git a/mm/sparse.c b/mm/sparse.c
index 2779b419ef2a..fb7aeb1899a4 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -318,6 +318,7 @@ size_t mem_section_usage_size(void)
return sizeof(struct mem_section_usage) + usemap_size();
}
+#ifdef CONFIG_MEMORY_HOTREMOVE
static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
{
#ifndef CONFIG_NUMA
@@ -328,7 +329,6 @@ static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
#endif
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
unsigned long size)
diff --git a/mm/swap.c b/mm/swap.c
index 70e2063ef43a..57cb01b042f6 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -158,36 +158,6 @@ void put_pages_list(struct list_head *pages)
}
EXPORT_SYMBOL(put_pages_list);
-/*
- * get_kernel_pages() - pin kernel pages in memory
- * @kiov: An array of struct kvec structures
- * @nr_segs: number of segments to pin
- * @write: pinning for read/write, currently ignored
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_segs long.
- *
- * Returns number of pages pinned. This may be fewer than the number requested.
- * If nr_segs is 0 or negative, returns 0. If no pages were pinned, returns 0.
- * Each page returned must be released with a put_page() call when it is
- * finished with.
- */
-int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
- struct page **pages)
-{
- int seg;
-
- for (seg = 0; seg < nr_segs; seg++) {
- if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
- return seg;
-
- pages[seg] = kmap_to_page(kiov[seg].iov_base);
- get_page(pages[seg]);
- }
-
- return seg;
-}
-EXPORT_SYMBOL_GPL(get_kernel_pages);
-
typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio);
static void lru_add_fn(struct lruvec *lruvec, struct folio *folio)
@@ -201,7 +171,7 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio)
* Is an smp_mb__after_atomic() still required here, before
* folio_evictable() tests the mlocked flag, to rule out the possibility
* of stranding an evictable folio on an unevictable LRU? I think
- * not, because __munlock_page() only clears the mlocked flag
+ * not, because __munlock_folio() only clears the mlocked flag
* while the LRU lock is held.
*
* (That is not true of __page_cache_release(), and not necessarily
@@ -216,7 +186,7 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio)
folio_set_unevictable(folio);
/*
* folio->mlock_count = !!folio_test_mlocked(folio)?
- * But that leaves __mlock_page() in doubt whether another
+ * But that leaves __mlock_folio() in doubt whether another
* actor has already counted the mlock or not. Err on the
* safe side, underestimate, let page reclaim fix it, rather
* than leaving a page on the unevictable LRU indefinitely.
@@ -562,7 +532,7 @@ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma)
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED))
- mlock_new_page(&folio->page);
+ mlock_new_folio(folio);
else
folio_add_lru(folio);
}
@@ -733,17 +703,15 @@ void deactivate_file_folio(struct folio *folio)
}
/*
- * deactivate_page - deactivate a page
- * @page: page to deactivate
+ * folio_deactivate - deactivate a folio
+ * @folio: folio to deactivate
*
- * deactivate_page() moves @page to the inactive list if @page was on the active
- * list and was not an unevictable page. This is done to accelerate the reclaim
- * of @page.
+ * folio_deactivate() moves @folio to the inactive list if @folio was on the
+ * active list and was not unevictable. This is done to accelerate the
+ * reclaim of @folio.
*/
-void deactivate_page(struct page *page)
+void folio_deactivate(struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
if (folio_test_lru(folio) && !folio_test_unevictable(folio) &&
(folio_test_active(folio) || lru_gen_enabled())) {
struct folio_batch *fbatch;
@@ -757,16 +725,14 @@ void deactivate_page(struct page *page)
}
/**
- * mark_page_lazyfree - make an anon page lazyfree
- * @page: page to deactivate
+ * folio_mark_lazyfree - make an anon folio lazyfree
+ * @folio: folio to deactivate
*
- * mark_page_lazyfree() moves @page to the inactive file list.
- * This is done to accelerate the reclaim of @page.
+ * folio_mark_lazyfree() moves @folio to the inactive file list.
+ * This is done to accelerate the reclaim of @folio.
*/
-void mark_page_lazyfree(struct page *page)
+void folio_mark_lazyfree(struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
if (folio_test_lru(folio) && folio_test_anon(folio) &&
folio_test_swapbacked(folio) && !folio_test_swapcache(folio) &&
!folio_test_unevictable(folio)) {
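A minimal caller-side sketch of the renames above (not part of the patch; my_drop_folio() and its trigger condition are hypothetical). Callers that used to hand a struct page to deactivate_page()/mark_page_lazyfree() now resolve the folio themselves:

    /* hypothetical caller, converted to the folio-based helpers */
    static void my_drop_folio(struct page *page)
    {
            struct folio *folio = page_folio(page);

            if (folio_test_anon(folio) && folio_test_swapbacked(folio))
                    folio_mark_lazyfree(folio);     /* MADV_FREE-style aging */
            else
                    folio_deactivate(folio);        /* push toward the inactive list */
    }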
@@ -785,7 +751,7 @@ void lru_add_drain(void)
local_lock(&cpu_fbatches.lock);
lru_add_drain_cpu(smp_processor_id());
local_unlock(&cpu_fbatches.lock);
- mlock_page_drain_local();
+ mlock_drain_local();
}
/*
@@ -800,7 +766,7 @@ static void lru_add_and_bh_lrus_drain(void)
lru_add_drain_cpu(smp_processor_id());
local_unlock(&cpu_fbatches.lock);
invalidate_bh_lrus_cpu();
- mlock_page_drain_local();
+ mlock_drain_local();
}
void lru_add_drain_cpu_zone(struct zone *zone)
@@ -809,7 +775,7 @@ void lru_add_drain_cpu_zone(struct zone *zone)
lru_add_drain_cpu(smp_processor_id());
drain_local_pages(zone);
local_unlock(&cpu_fbatches.lock);
- mlock_page_drain_local();
+ mlock_drain_local();
}
#ifdef CONFIG_SMP
@@ -832,7 +798,7 @@ static bool cpu_needs_drain(unsigned int cpu)
folio_batch_count(&fbatches->lru_deactivate) ||
folio_batch_count(&fbatches->lru_lazyfree) ||
folio_batch_count(&fbatches->activate) ||
- need_mlock_page_drain(cpu) ||
+ need_mlock_drain(cpu) ||
has_bh_in_lru(cpu, NULL);
}
@@ -1119,16 +1085,6 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
fbatch->nr = j;
}
-unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
- struct address_space *mapping, pgoff_t *index, pgoff_t end,
- xa_mark_t tag)
-{
- pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
- PAGEVEC_SIZE, pvec->pages);
- return pagevec_count(pvec);
-}
-EXPORT_SYMBOL(pagevec_lookup_range_tag);
-
/*
* Perform any setup for the swap system
*/
diff --git a/mm/swap.h b/mm/swap.h
index f78065c8ef52..7c033d793f15 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -8,8 +8,7 @@
/* linux/mm/page_io.c */
int sio_pool_init(void);
struct swap_iocb;
-int swap_readpage(struct page *page, bool do_poll,
- struct swap_iocb **plug);
+void swap_readpage(struct page *page, bool do_poll, struct swap_iocb **plug);
void __swap_read_unplug(struct swap_iocb *plug);
static inline void swap_read_unplug(struct swap_iocb *plug)
{
@@ -18,7 +17,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
}
void swap_write_unplug(struct swap_iocb *sio);
int swap_writepage(struct page *page, struct writeback_control *wbc);
-int __swap_writepage(struct page *page, struct writeback_control *wbc);
+void __swap_writepage(struct page *page, struct writeback_control *wbc);
/* linux/mm/swap_state.c */
/* One swap address space for each 64M swap space */
@@ -64,10 +63,9 @@ static inline unsigned int folio_swap_flags(struct folio *folio)
}
#else /* CONFIG_SWAP */
struct swap_iocb;
-static inline int swap_readpage(struct page *page, bool do_poll,
- struct swap_iocb **plug)
+static inline void swap_readpage(struct page *page, bool do_poll,
+ struct swap_iocb **plug)
{
- return 0;
}
static inline void swap_write_unplug(struct swap_iocb *sio)
{
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2927507b43d8..7a003d8abb37 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -94,6 +94,8 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
unsigned long i, nr = folio_nr_pages(folio);
void *old;
+ xas_set_update(&xas, workingset_update_node);
+
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
@@ -145,6 +147,8 @@ void __delete_from_swap_cache(struct folio *folio,
pgoff_t idx = swp_offset(entry);
XA_STATE(xas, &address_space->i_pages, idx);
+ xas_set_update(&xas, workingset_update_node);
+
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
@@ -252,6 +256,8 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
struct address_space *address_space = swap_address_space(entry);
XA_STATE(xas, &address_space->i_pages, curr);
+ xas_set_update(&xas, workingset_update_node);
+
xa_lock_irq(&address_space->i_pages);
xas_for_each(&xas, old, end) {
if (!xa_is_value(old))
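The three hunks above apply the same pattern; a condensed sketch (illustrative only; mapping, index and folio are placeholders) of registering the workingset callback before modifying the XArray:

    XA_STATE(xas, &mapping->i_pages, index);

    /* let the workingset code account for shadow nodes created or destroyed */
    xas_set_update(&xas, workingset_update_node);

    xas_lock_irq(&xas);
    xas_store(&xas, folio);
    xas_unlock_irq(&xas);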
@@ -321,19 +327,15 @@ static inline bool swap_use_vma_readahead(void)
* unlocked and with its refcount incremented - we rely on the kernel
* lock getting page table operations atomic even if we drop the folio
* lock before returning.
+ *
+ * Caller must lock the swap device or hold a reference to keep it valid.
*/
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr)
{
struct folio *folio;
- struct swap_info_struct *si;
- si = get_swap_device(entry);
- if (!si)
- return NULL;
folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
- put_swap_device(si);
-
if (folio) {
bool vma_ra = swap_use_vma_readahead();
bool readahead;
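With the get_swap_device()/put_swap_device() pair moved out of swap_cache_get_folio(), a caller now looks roughly like this (sketch, not a hunk from this patch):

    struct swap_info_struct *si;
    struct folio *folio = NULL;

    si = get_swap_device(entry);    /* keeps the swap device valid */
    if (si) {
            folio = swap_cache_get_folio(entry, vma, addr);
            put_swap_device(si);
    }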
@@ -693,28 +695,15 @@ void exit_swap_address_space(unsigned int type)
swapper_spaces[type] = NULL;
}
-static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
- unsigned long faddr,
- unsigned long lpfn,
- unsigned long rpfn,
- unsigned long *start,
- unsigned long *end)
-{
- *start = max3(lpfn, PFN_DOWN(vma->vm_start),
- PFN_DOWN(faddr & PMD_MASK));
- *end = min3(rpfn, PFN_DOWN(vma->vm_end),
- PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
-}
-
static void swap_ra_info(struct vm_fault *vmf,
- struct vma_swap_readahead *ra_info)
+ struct vma_swap_readahead *ra_info)
{
struct vm_area_struct *vma = vmf->vma;
unsigned long ra_val;
- unsigned long faddr, pfn, fpfn;
+ unsigned long faddr, pfn, fpfn, lpfn, rpfn;
unsigned long start, end;
pte_t *pte, *orig_pte;
- unsigned int max_win, hits, prev_win, win, left;
+ unsigned int max_win, hits, prev_win, win;
#ifndef CONFIG_64BIT
pte_t *tpte;
#endif
@@ -727,8 +716,6 @@ static void swap_ra_info(struct vm_fault *vmf,
}
faddr = vmf->address;
- orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
-
fpfn = PFN_DOWN(faddr);
ra_val = GET_SWAP_RA_VAL(vma);
pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
@@ -739,22 +726,28 @@ static void swap_ra_info(struct vm_fault *vmf,
atomic_long_set(&vma->swap_readahead_info,
SWAP_RA_VAL(faddr, win, 0));
- if (win == 1) {
- pte_unmap(orig_pte);
+ if (win == 1)
return;
- }
/* Copy the PTEs because the page table may be unmapped */
- if (fpfn == pfn + 1)
- swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
- else if (pfn == fpfn + 1)
- swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
- &start, &end);
- else {
- left = (win - 1) / 2;
- swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
- &start, &end);
+ orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
+ if (fpfn == pfn + 1) {
+ lpfn = fpfn;
+ rpfn = fpfn + win;
+ } else if (pfn == fpfn + 1) {
+ lpfn = fpfn - win + 1;
+ rpfn = fpfn + 1;
+ } else {
+ unsigned int left = (win - 1) / 2;
+
+ lpfn = fpfn - left;
+ rpfn = fpfn + win - left;
}
+ start = max3(lpfn, PFN_DOWN(vma->vm_start),
+ PFN_DOWN(faddr & PMD_MASK));
+ end = min3(rpfn, PFN_DOWN(vma->vm_end),
+ PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
+
ra_info->nr_pte = end - start;
ra_info->offset = fpfn - start;
pte -= ra_info->offset;
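The open-coded clamping above computes the same window the removed swap_ra_clamp_pfn() helper did; a standalone restatement of the arithmetic (the function name is mine, max3()/min3() are the <linux/minmax.h> helpers):

    /* clamp the readahead pfn window [lpfn, rpfn) to the VMA and to one PMD */
    static void clamp_ra_window(struct vm_area_struct *vma, unsigned long faddr,
                                unsigned long lpfn, unsigned long rpfn,
                                unsigned long *start, unsigned long *end)
    {
            *start = max3(lpfn, PFN_DOWN(vma->vm_start),
                          PFN_DOWN(faddr & PMD_MASK));
            *end = min3(rpfn, PFN_DOWN(vma->vm_end),
                        PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
    }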
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 908a529bca12..62ba2bf577d7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1098,8 +1098,7 @@ start_over:
spin_unlock(&si->lock);
if (n_ret || size == SWAPFILE_CLUSTER)
goto check_out;
- pr_debug("scan_swap_map of si %d failed to find offset\n",
- si->type);
+ cond_resched();
spin_lock(&swap_avail_lock);
nextsi:
@@ -1763,12 +1762,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
struct page *swapcache;
spinlock_t *ptl;
pte_t *pte, new_pte;
+ bool hwpoisoned = false;
int ret = 1;
swapcache = page;
page = ksm_might_need_to_copy(page, vma, addr);
if (unlikely(!page))
return -ENOMEM;
+ else if (unlikely(PTR_ERR(page) == -EHWPOISON))
+ hwpoisoned = true;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
@@ -1776,15 +1778,19 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
goto out;
}
- if (unlikely(!PageUptodate(page))) {
- pte_t pteval;
+ if (unlikely(hwpoisoned || !PageUptodate(page))) {
+ swp_entry_t swp_entry;
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
- pteval = swp_entry_to_pte(make_swapin_error_entry());
- set_pte_at(vma->vm_mm, addr, pte, pteval);
- swap_free(entry);
+ if (hwpoisoned) {
+ swp_entry = make_hwpoison_entry(swapcache);
+ page = swapcache;
+ } else {
+ swp_entry = make_swapin_error_entry();
+ }
+ new_pte = swp_entry_to_pte(swp_entry);
ret = 0;
- goto out;
+ goto setpte;
}
/* See do_swap_page() */
@@ -1816,6 +1822,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
new_pte = pte_mksoft_dirty(new_pte);
if (pte_swp_uffd_wp(*pte))
new_pte = pte_mkuffd_wp(new_pte);
+setpte:
set_pte_at(vma->vm_mm, addr, pte, new_pte);
swap_free(entry);
out:
@@ -1835,13 +1842,13 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pte_t *pte;
struct swap_info_struct *si;
int ret = 0;
- volatile unsigned char *swap_map;
si = swap_info[type];
pte = pte_offset_map(pmd, addr);
do {
struct folio *folio;
unsigned long offset;
+ unsigned char swp_count;
if (!is_swap_pte(*pte))
continue;
@@ -1852,7 +1859,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
offset = swp_offset(entry);
pte_unmap(pte);
- swap_map = &si->swap_map[offset];
folio = swap_cache_get_folio(entry, vma, addr);
if (!folio) {
struct page *page;
@@ -1869,8 +1875,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
folio = page_folio(page);
}
if (!folio) {
- if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
+ swp_count = READ_ONCE(si->swap_map[offset]);
+ if (swp_count == 0 || swp_count == SWAP_MAP_BAD)
goto try_next;
+
return -ENOMEM;
}
@@ -3069,7 +3077,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (p->bdev && bdev_stable_writes(p->bdev))
p->flags |= SWP_STABLE_WRITES;
- if (p->bdev && p->bdev->bd_disk->fops->rw_page)
+ if (p->bdev && bdev_synchronous(p->bdev))
p->flags |= SWP_SYNCHRONOUS_IO;
if (p->bdev && bdev_nonrot(p->bdev)) {
@@ -3642,7 +3650,7 @@ void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
* We've already scheduled a throttle, avoid taking the global swap
* lock.
*/
- if (current->throttle_queue)
+ if (current->throttle_disk)
return;
spin_lock(&swap_avail_lock);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 0499907b6f1a..53c3d916ff66 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -74,24 +74,10 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
_dst_pte = pte_mkdirty(_dst_pte);
if (page_in_cache && !vm_shared)
writable = false;
-
- /*
- * Always mark a PTE as write-protected when needed, regardless of
- * VM_WRITE, which the user might change.
- */
- if (wp_copy) {
- _dst_pte = pte_mkuffd_wp(_dst_pte);
- writable = false;
- }
-
if (writable)
_dst_pte = pte_mkwrite(_dst_pte);
- else
- /*
- * We need this to make sure write bit removed; as mk_pte()
- * could return a pte with write bit set.
- */
- _dst_pte = pte_wrprotect(_dst_pte);
+ if (wp_copy)
+ _dst_pte = pte_mkuffd_wp(_dst_pte);
dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
@@ -724,21 +710,31 @@ ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
mmap_changing, 0);
}
-void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
+long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
unsigned long start, unsigned long len, bool enable_wp)
{
+ unsigned int mm_cp_flags;
struct mmu_gather tlb;
- pgprot_t newprot;
+ long ret;
if (enable_wp)
- newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
+ mm_cp_flags = MM_CP_UFFD_WP;
else
- newprot = vm_get_page_prot(dst_vma->vm_flags);
+ mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;
+ /*
+ * vma->vm_page_prot already reflects that uffd-wp is enabled for this
+ * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
+ * to be write-protected by default whenever protection changes.
+ * Try upgrading write permissions manually.
+ */
+ if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
+ mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
tlb_gather_mmu(&tlb, dst_mm);
- change_protection(&tlb, dst_vma, start, start + len, newprot,
- enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
+ ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
tlb_finish_mmu(&tlb);
+
+ return ret;
}
int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
@@ -747,7 +743,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
{
struct vm_area_struct *dst_vma;
unsigned long page_mask;
- int err;
+ long err;
/*
* Sanitize the command parameters:
@@ -786,9 +782,12 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
goto out_unlock;
}
- uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp);
+ err = uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp);
+
+ /* Return 0 on success, <0 on failures */
+ if (err > 0)
+ err = 0;
- err = 0;
out_unlock:
mmap_read_unlock(dst_mm);
return err;
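For context (not part of the patch), mwriteprotect_range() is what ultimately services a userspace UFFDIO_WRITEPROTECT request; a hedged sketch of the caller side, where uffd, addr and len are placeholders:

    #include <sys/ioctl.h>
    #include <linux/userfaultfd.h>

    struct uffdio_writeprotect wp = {
            .range = { .start = addr, .len = len },
            .mode  = UFFDIO_WRITEPROTECT_MODE_WP,   /* use 0 to resolve/un-protect */
    };

    if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp))
            perror("UFFDIO_WRITEPROTECT");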
diff --git a/mm/util.c b/mm/util.c
index b56c92fb910f..b8ed9dbc7fd5 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -120,7 +120,8 @@ EXPORT_SYMBOL(kstrndup);
* @len: memory region length
* @gfp: GFP mask to use
*
- * Return: newly allocated copy of @src or %NULL in case of error
+ * Return: newly allocated copy of @src or %NULL in case of error. The
+ * result is physically contiguous. Use kfree() to free it.
*/
void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
@@ -134,6 +135,27 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
EXPORT_SYMBOL(kmemdup);
/**
+ * kvmemdup - duplicate region of memory
+ *
+ * @src: memory region to duplicate
+ * @len: memory region length
+ * @gfp: GFP mask to use
+ *
+ * Return: newly allocated copy of @src or %NULL in case of error. The
+ * result may not be physically contiguous. Use kvfree() to free it.
+ */
+void *kvmemdup(const void *src, size_t len, gfp_t gfp)
+{
+ void *p;
+
+ p = kvmalloc(len, gfp);
+ if (p)
+ memcpy(p, src, len);
+ return p;
+}
+EXPORT_SYMBOL(kvmemdup);
+
+/**
* kmemdup_nul - Create a NUL-terminated string from unterminated data
* @s: The data to stringify
* @len: The size of the data
@@ -945,7 +967,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
error:
- pr_warn_ratelimited("%s: pid: %d, comm: %s, no enough memory for the allocation\n",
+ pr_warn_ratelimited("%s: pid: %d, comm: %s, not enough memory for the allocation\n",
__func__, current->pid, current->comm);
vm_unacct_memory(pages);
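A short usage sketch for the kvmemdup() helper added above (blob and len are placeholders); the only contract difference from kmemdup() is the allocator and therefore the matching free function:

    void *copy = kvmemdup(blob, len, GFP_KERNEL);

    if (!copy)
            return -ENOMEM;
    /* ... use copy ... */
    kvfree(copy);   /* kvmalloc()-backed memory must be freed with kvfree() */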
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ca71de7c9d77..ef910bf349e1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -89,17 +89,6 @@ struct vfree_deferred {
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
-static void __vunmap(const void *, int);
-
-static void free_work(struct work_struct *w)
-{
- struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
- struct llist_node *t, *llnode;
-
- llist_for_each_safe(llnode, t, llist_del_all(&p->list))
- __vunmap((void *)llnode, 1);
-}
-
/*** Page table manipulation functions ***/
static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
@@ -656,6 +645,7 @@ int is_vmalloc_or_module_addr(const void *x)
#endif
return is_vmalloc_addr(x);
}
+EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);
/*
* Walk a vmap address to the struct page it maps. Huge vmap mappings will
@@ -1589,7 +1579,8 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long align,
unsigned long vstart, unsigned long vend,
- int node, gfp_t gfp_mask)
+ int node, gfp_t gfp_mask,
+ unsigned long va_flags)
{
struct vmap_area *va;
unsigned long freed;
@@ -1597,9 +1588,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
int purged = 0;
int ret;
- BUG_ON(!size);
- BUG_ON(offset_in_page(size));
- BUG_ON(!is_power_of_2(align));
+ if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align)))
+ return ERR_PTR(-EINVAL);
if (unlikely(!vmap_initialized))
return ERR_PTR(-EBUSY);
@@ -1635,6 +1625,7 @@ retry:
va->va_start = addr;
va->va_end = addr + size;
va->vm = NULL;
+ va->flags = va_flags;
spin_lock(&vmap_area_lock);
insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
@@ -1815,9 +1806,9 @@ static void drain_vmap_area_work(struct work_struct *work)
}
/*
- * Free a vmap area, caller ensuring that the area has been unmapped
- * and flush_cache_vunmap had been called for the correct range
- * previously.
+ * Free a vmap area; the caller must ensure that the area has been
+ * unmapped and unlinked, and that flush_cache_vunmap() has been called
+ * for the correct range beforehand.
*/
static void free_vmap_area_noflush(struct vmap_area *va)
{
@@ -1825,9 +1816,8 @@ static void free_vmap_area_noflush(struct vmap_area *va)
unsigned long va_start = va->va_start;
unsigned long nr_lazy;
- spin_lock(&vmap_area_lock);
- unlink_va(va, &vmap_area_root);
- spin_unlock(&vmap_area_lock);
+ if (WARN_ON_ONCE(!list_empty(&va->list)))
+ return;
nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
PAGE_SHIFT, &vmap_lazy_nr);
@@ -1871,6 +1861,19 @@ struct vmap_area *find_vmap_area(unsigned long addr)
return va;
}
+static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
+{
+ struct vmap_area *va;
+
+ spin_lock(&vmap_area_lock);
+ va = __find_vmap_area(addr, &vmap_area_root);
+ if (va)
+ unlink_va(va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
+ return va;
+}
+
/*** Per cpu kva allocator ***/
/*
@@ -1901,6 +1904,10 @@ struct vmap_area *find_vmap_area(unsigned long addr)
#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
+#define VMAP_RAM 0x1 /* indicates a vm_map_ram area */
+#define VMAP_BLOCK 0x2 /* marks the vmap_block sub-type */
+#define VMAP_FLAGS_MASK 0x3
+
struct vmap_block_queue {
spinlock_t lock;
struct list_head free;
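A small sketch of how the new bits are meant to be consumed, mirroring the vread() changes further down (illustrative only):

    unsigned long flags = va->flags & VMAP_FLAGS_MASK;

    if (flags & VMAP_RAM)           /* the area came from vm_map_ram() */
            pr_debug("vm_map_ram area%s\n",
                     (flags & VMAP_BLOCK) ? " (vmap_block managed)" : "");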
@@ -1910,6 +1917,7 @@ struct vmap_block {
spinlock_t lock;
struct vmap_area *va;
unsigned long free, dirty;
+ DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS);
unsigned long dirty_min, dirty_max; /*< dirty range */
struct list_head free_list;
struct rcu_head rcu_head;
@@ -1975,7 +1983,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
VMALLOC_START, VMALLOC_END,
- node, gfp_mask);
+ node, gfp_mask,
+ VMAP_RAM|VMAP_BLOCK);
if (IS_ERR(va)) {
kfree(vb);
return ERR_CAST(va);
@@ -1986,10 +1995,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
vb->va = va;
/* At least something should be left free */
BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
+ bitmap_zero(vb->used_map, VMAP_BBMAP_BITS);
vb->free = VMAP_BBMAP_BITS - (1UL << order);
vb->dirty = 0;
vb->dirty_min = VMAP_BBMAP_BITS;
vb->dirty_max = 0;
+ bitmap_set(vb->used_map, 0, (1UL << order));
INIT_LIST_HEAD(&vb->free_list);
vb_idx = addr_to_vb_idx(va->va_start);
@@ -2015,6 +2026,10 @@ static void free_vmap_block(struct vmap_block *vb)
tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
BUG_ON(tmp != vb);
+ spin_lock(&vmap_area_lock);
+ unlink_va(vb->va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
free_vmap_area_noflush(vb->va);
kfree_rcu(vb, rcu_head);
}
@@ -2095,6 +2110,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
pages_off = VMAP_BBMAP_BITS - vb->free;
vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
vb->free -= 1UL << order;
+ bitmap_set(vb->used_map, pages_off, (1UL << order));
if (vb->free == 0) {
spin_lock(&vbq->lock);
list_del_rcu(&vb->free_list);
@@ -2128,6 +2144,9 @@ static void vb_free(unsigned long addr, unsigned long size)
order = get_order(size);
offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
+ spin_lock(&vb->lock);
+ bitmap_clear(vb->used_map, offset, (1UL << order));
+ spin_unlock(&vb->lock);
vunmap_range_noflush(addr, addr + size);
@@ -2236,8 +2255,10 @@ void vm_unmap_ram(const void *mem, unsigned int count)
return;
}
- va = find_vmap_area(addr);
- BUG_ON(!va);
+ va = find_unlink_vmap_area(addr);
+ if (WARN_ON_ONCE(!va))
+ return;
+
debug_check_no_locks_freed((void *)va->va_start,
(va->va_end - va->va_start));
free_unmap_vmap_area(va);
@@ -2272,7 +2293,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
} else {
struct vmap_area *va;
va = alloc_vmap_area(size, PAGE_SIZE,
- VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
+ VMALLOC_START, VMALLOC_END,
+ node, GFP_KERNEL, VMAP_RAM);
if (IS_ERR(va))
return NULL;
@@ -2416,48 +2438,6 @@ static void vmap_init_free_space(void)
}
}
-void __init vmalloc_init(void)
-{
- struct vmap_area *va;
- struct vm_struct *tmp;
- int i;
-
- /*
- * Create the cache for vmap_area objects.
- */
- vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
-
- for_each_possible_cpu(i) {
- struct vmap_block_queue *vbq;
- struct vfree_deferred *p;
-
- vbq = &per_cpu(vmap_block_queue, i);
- spin_lock_init(&vbq->lock);
- INIT_LIST_HEAD(&vbq->free);
- p = &per_cpu(vfree_deferred, i);
- init_llist_head(&p->list);
- INIT_WORK(&p->wq, free_work);
- }
-
- /* Import existing vmlist entries. */
- for (tmp = vmlist; tmp; tmp = tmp->next) {
- va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
- if (WARN_ON_ONCE(!va))
- continue;
-
- va->va_start = (unsigned long)tmp->addr;
- va->va_end = va->va_start + tmp->size;
- va->vm = tmp;
- insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
- }
-
- /*
- * Now we can initialize a free vmap space.
- */
- vmap_init_free_space();
- vmap_initialized = true;
-}
-
static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
@@ -2512,7 +2492,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
if (!(flags & VM_NO_GUARD))
size += PAGE_SIZE;
- va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
+ va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
if (IS_ERR(va)) {
kfree(area);
return NULL;
@@ -2604,25 +2584,26 @@ struct vm_struct *find_vm_area(const void *addr)
struct vm_struct *remove_vm_area(const void *addr)
{
struct vmap_area *va;
+ struct vm_struct *vm;
might_sleep();
- spin_lock(&vmap_area_lock);
- va = __find_vmap_area((unsigned long)addr, &vmap_area_root);
- if (va && va->vm) {
- struct vm_struct *vm = va->vm;
-
- va->vm = NULL;
- spin_unlock(&vmap_area_lock);
+ if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
+ addr))
+ return NULL;
- kasan_free_module_shadow(vm);
- free_unmap_vmap_area(va);
+ va = find_unlink_vmap_area((unsigned long)addr);
+ if (!va || !va->vm)
+ return NULL;
+ vm = va->vm;
- return vm;
- }
+ debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm));
+ debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm));
+ kasan_free_module_shadow(vm);
+ kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm));
- spin_unlock(&vmap_area_lock);
- return NULL;
+ free_unmap_vmap_area(va);
+ return vm;
}
static inline void set_area_direct_map(const struct vm_struct *area,
@@ -2636,37 +2617,23 @@ static inline void set_area_direct_map(const struct vm_struct *area,
set_direct_map(area->pages[i]);
}
-/* Handle removing and resetting vm mappings related to the vm_struct. */
-static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
+/*
+ * Flush the vm mapping and reset the direct map.
+ */
+static void vm_reset_perms(struct vm_struct *area)
{
unsigned long start = ULONG_MAX, end = 0;
unsigned int page_order = vm_area_page_order(area);
- int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
int flush_dmap = 0;
int i;
- remove_vm_area(area->addr);
-
- /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
- if (!flush_reset)
- return;
-
/*
- * If not deallocating pages, just do the flush of the VM area and
- * return.
- */
- if (!deallocate_pages) {
- vm_unmap_aliases();
- return;
- }
-
- /*
- * If execution gets here, flush the vm mapping and reset the direct
- * map. Find the start and end range of the direct mappings to make sure
+ * Find the start and end range of the direct mappings to make sure that
* the vm_unmap_aliases() flush includes the direct map.
*/
for (i = 0; i < area->nr_pages; i += 1U << page_order) {
unsigned long addr = (unsigned long)page_address(area->pages[i]);
+
if (addr) {
unsigned long page_size;
@@ -2687,66 +2654,13 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
set_area_direct_map(area, set_direct_map_default_noflush);
}
-static void __vunmap(const void *addr, int deallocate_pages)
+static void delayed_vfree_work(struct work_struct *w)
{
- struct vm_struct *area;
-
- if (!addr)
- return;
-
- if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
- addr))
- return;
-
- area = find_vm_area(addr);
- if (unlikely(!area)) {
- WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
- addr);
- return;
- }
-
- debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
- debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
-
- kasan_poison_vmalloc(area->addr, get_vm_area_size(area));
-
- vm_remove_mappings(area, deallocate_pages);
-
- if (deallocate_pages) {
- int i;
-
- for (i = 0; i < area->nr_pages; i++) {
- struct page *page = area->pages[i];
-
- BUG_ON(!page);
- mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
- /*
- * High-order allocs for huge vmallocs are split, so
- * can be freed as an array of order-0 allocations
- */
- __free_pages(page, 0);
- cond_resched();
- }
- atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
-
- kvfree(area->pages);
- }
-
- kfree(area);
-}
-
-static inline void __vfree_deferred(const void *addr)
-{
- /*
- * Use raw_cpu_ptr() because this can be called from preemptible
- * context. Preemption is absolutely fine here, because the llist_add()
- * implementation is lockless, so it works even if we are adding to
- * another cpu's list. schedule_work() should be fine with this too.
- */
- struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
+ struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
+ struct llist_node *t, *llnode;
- if (llist_add((struct llist_node *)addr, &p->list))
- schedule_work(&p->wq);
+ llist_for_each_safe(llnode, t, llist_del_all(&p->list))
+ vfree(llnode);
}
/**
@@ -2758,21 +2672,19 @@ static inline void __vfree_deferred(const void *addr)
*/
void vfree_atomic(const void *addr)
{
- BUG_ON(in_nmi());
+ struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
+ BUG_ON(in_nmi());
kmemleak_free(addr);
- if (!addr)
- return;
- __vfree_deferred(addr);
-}
-
-static void __vfree(const void *addr)
-{
- if (unlikely(in_interrupt()))
- __vfree_deferred(addr);
- else
- __vunmap(addr, 1);
+ /*
+ * Use raw_cpu_ptr() because this can be called from preemptible
+ * context. Preemption is absolutely fine here, because the llist_add()
+ * implementation is lockless, so it works even if we are adding to
+ * another cpu's list. schedule_work() should be fine with this too.
+ */
+ if (addr && llist_add((struct llist_node *)addr, &p->list))
+ schedule_work(&p->wq);
}
/**
@@ -2794,16 +2706,45 @@ static void __vfree(const void *addr)
*/
void vfree(const void *addr)
{
- BUG_ON(in_nmi());
+ struct vm_struct *vm;
+ int i;
- kmemleak_free(addr);
+ if (unlikely(in_interrupt())) {
+ vfree_atomic(addr);
+ return;
+ }
- might_sleep_if(!in_interrupt());
+ BUG_ON(in_nmi());
+ kmemleak_free(addr);
+ might_sleep();
if (!addr)
return;
- __vfree(addr);
+ vm = remove_vm_area(addr);
+ if (unlikely(!vm)) {
+ WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
+ addr);
+ return;
+ }
+
+ if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
+ vm_reset_perms(vm);
+ for (i = 0; i < vm->nr_pages; i++) {
+ struct page *page = vm->pages[i];
+
+ BUG_ON(!page);
+ mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
+ /*
+ * High-order allocs for huge vmallocs are split, so
+ * can be freed as an array of order-0 allocations
+ */
+ __free_pages(page, 0);
+ cond_resched();
+ }
+ atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
+ kvfree(vm->pages);
+ kfree(vm);
}
EXPORT_SYMBOL(vfree);
@@ -2818,10 +2759,20 @@ EXPORT_SYMBOL(vfree);
*/
void vunmap(const void *addr)
{
+ struct vm_struct *vm;
+
BUG_ON(in_interrupt());
might_sleep();
- if (addr)
- __vunmap(addr, 0);
+
+ if (!addr)
+ return;
+ vm = remove_vm_area(addr);
+ if (unlikely(!vm)) {
+ WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n",
+ addr);
+ return;
+ }
+ kfree(vm);
}
EXPORT_SYMBOL(vunmap);
@@ -2849,6 +2800,9 @@ void *vmap(struct page **pages, unsigned int count,
might_sleep();
+ if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS))
+ return NULL;
+
/*
* Your top guard is someone else's bottom guard. Not having a top
* guard compromises someone else's mappings too.
@@ -3031,7 +2985,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
int ret;
array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
- gfp_mask |= __GFP_NOWARN;
+
if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
gfp_mask |= __GFP_HIGHMEM;
@@ -3067,7 +3021,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
/*
* If not enough pages were obtained to accomplish an
- * allocation request, free them via __vfree() if any.
+ * allocation request, free them via vfree() if any.
*/
if (area->nr_pages != nr_small_pages) {
warn_alloc(gfp_mask, NULL,
@@ -3107,7 +3061,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
return area->addr;
fail:
- __vfree(area->addr);
+ vfree(area->addr);
return NULL;
}
@@ -3510,6 +3464,68 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
return copied;
}
+static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
+{
+ char *start;
+ struct vmap_block *vb;
+ unsigned long offset;
+ unsigned int rs, re, n;
+
+ /*
+ * If the area was created directly by the vm_map_ram() interface,
+ * without being subdivided and delegated to a vmap_block for
+ * management, handle it here.
+ */
+ if (!(flags & VMAP_BLOCK)) {
+ aligned_vread(buf, addr, count);
+ return;
+ }
+
+ /*
+ * Area is split into regions and tracked with vmap_block, read out
+ * each region and zero fill the hole between regions.
+ */
+ vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
+ if (!vb)
+ goto finished;
+
+ spin_lock(&vb->lock);
+ if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
+ spin_unlock(&vb->lock);
+ goto finished;
+ }
+ for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
+ if (!count)
+ break;
+ start = vmap_block_vaddr(vb->va->va_start, rs);
+ while (addr < start) {
+ if (count == 0)
+ goto unlock;
+ *buf = '\0';
+ buf++;
+ addr++;
+ count--;
+ }
+ /* the read could start from the middle of a used region */
+ offset = offset_in_page(addr);
+ n = ((re - rs + 1) << PAGE_SHIFT) - offset;
+ if (n > count)
+ n = count;
+ aligned_vread(buf, start+offset, n);
+
+ buf += n;
+ addr += n;
+ count -= n;
+ }
+unlock:
+ spin_unlock(&vb->lock);
+
+finished:
+ /* zero-fill the left dirty or free regions */
+ if (count)
+ memset(buf, 0, count);
+}
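A simplified, self-contained model of the loop above (illustration only: no locking, a single used region, and the kernel's aligned_vread() replaced by memcpy()):

    #include <string.h>

    /* copy bytes inside [reg_start, reg_end) from the area, zero-fill the rest */
    static void read_one_region(char *buf, const char *addr, size_t count,
                                const char *reg_start, const char *reg_end)
    {
            while (count && addr < reg_start) {     /* hole before the used region */
                    *buf++ = '\0';
                    addr++;
                    count--;
            }
            if (count && addr < reg_end) {
                    size_t n = (size_t)(reg_end - addr);

                    if (n > count)
                            n = count;
                    memcpy(buf, addr, n);
                    buf += n;
                    count -= n;
            }
            if (count)                              /* hole after the used region */
                    memset(buf, 0, count);
    }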
+
/**
* vread() - read vmalloc area in a safe way.
* @buf: buffer for reading data
@@ -3540,7 +3556,7 @@ long vread(char *buf, char *addr, unsigned long count)
struct vm_struct *vm;
char *vaddr, *buf_start = buf;
unsigned long buflen = count;
- unsigned long n;
+ unsigned long n, size, flags;
addr = kasan_reset_tag(addr);
@@ -3561,12 +3577,26 @@ long vread(char *buf, char *addr, unsigned long count)
if (!count)
break;
- if (!va->vm)
+ vm = va->vm;
+ flags = va->flags & VMAP_FLAGS_MASK;
+ /*
+ * VMAP_BLOCK indicates a sub-type of vm_map_ram area and must
+ * be set together with VMAP_RAM.
+ */
+ WARN_ON(flags == VMAP_BLOCK);
+
+ if (!vm && !flags)
continue;
- vm = va->vm;
- vaddr = (char *) vm->addr;
- if (addr >= vaddr + get_vm_area_size(vm))
+ if (vm && (vm->flags & VM_UNINITIALIZED))
+ continue;
+ /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+ smp_rmb();
+
+ vaddr = (char *) va->va_start;
+ size = vm ? get_vm_area_size(vm) : va_size(va);
+
+ if (addr >= vaddr + size)
continue;
while (addr < vaddr) {
if (count == 0)
@@ -3576,10 +3606,13 @@ long vread(char *buf, char *addr, unsigned long count)
addr++;
count--;
}
- n = vaddr + get_vm_area_size(vm) - addr;
+ n = vaddr + size - addr;
if (n > count)
n = count;
- if (!(vm->flags & VM_IOREMAP))
+
+ if (flags & VMAP_RAM)
+ vmap_ram_vread(buf, addr, n, flags);
+ else if (!(vm->flags & VM_IOREMAP))
aligned_vread(buf, addr, n);
else /* IOREMAP area is treated as memory hole */
memset(buf, 0, n);
@@ -3657,7 +3690,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
size -= PAGE_SIZE;
} while (size > 0);
- vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+ vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP);
return 0;
}
@@ -4125,14 +4158,11 @@ static int s_show(struct seq_file *m, void *p)
va = list_entry(p, struct vmap_area, list);
- /*
- * s_show can encounter race with remove_vm_area, !vm on behalf
- * of vmap area is being tear down or vm_map_ram allocation.
- */
if (!va->vm) {
- seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
- (void *)va->va_start, (void *)va->va_end,
- va->va_end - va->va_start);
+ if (va->flags & VMAP_RAM)
+ seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
+ (void *)va->va_start, (void *)va->va_end,
+ va->va_end - va->va_start);
goto final;
}
@@ -4202,3 +4232,45 @@ static int __init proc_vmalloc_init(void)
module_init(proc_vmalloc_init);
#endif
+
+void __init vmalloc_init(void)
+{
+ struct vmap_area *va;
+ struct vm_struct *tmp;
+ int i;
+
+ /*
+ * Create the cache for vmap_area objects.
+ */
+ vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
+
+ for_each_possible_cpu(i) {
+ struct vmap_block_queue *vbq;
+ struct vfree_deferred *p;
+
+ vbq = &per_cpu(vmap_block_queue, i);
+ spin_lock_init(&vbq->lock);
+ INIT_LIST_HEAD(&vbq->free);
+ p = &per_cpu(vfree_deferred, i);
+ init_llist_head(&p->list);
+ INIT_WORK(&p->wq, delayed_vfree_work);
+ }
+
+ /* Import existing vmlist entries. */
+ for (tmp = vmlist; tmp; tmp = tmp->next) {
+ va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (WARN_ON_ONCE(!va))
+ continue;
+
+ va->va_start = (unsigned long)tmp->addr;
+ va->va_end = va->va_start + tmp->size;
+ va->vm = tmp;
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ }
+
+ /*
+ * Now we can initialize a free vmap space.
+ */
+ vmap_init_free_space();
+ vmap_initialized = true;
+}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bd6637fcd8f9..9c1c5e8b24b8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -55,6 +55,8 @@
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/khugepaged.h>
+#include <linux/rculist_nulls.h>
+#include <linux/random.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -135,12 +137,6 @@ struct scan_control {
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;
-#ifdef CONFIG_LRU_GEN
- /* help kswapd make better choices among multiple memcgs */
- unsigned int memcgs_need_aging:1;
- unsigned long last_reclaimed;
-#endif
-
/* Allocation order */
s8 order;
@@ -449,6 +445,11 @@ static bool cgroup_reclaim(struct scan_control *sc)
return sc->target_mem_cgroup;
}
+static bool global_reclaim(struct scan_control *sc)
+{
+ return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup);
+}
+
/**
* writeback_throttling_sane - is the usual dirty throttling mechanism available?
* @sc: scan_control in question
@@ -499,6 +500,11 @@ static bool cgroup_reclaim(struct scan_control *sc)
return false;
}
+static bool global_reclaim(struct scan_control *sc)
+{
+ return true;
+}
+
static bool writeback_throttling_sane(struct scan_control *sc)
{
return true;
@@ -741,6 +747,8 @@ EXPORT_SYMBOL(register_shrinker);
*/
void unregister_shrinker(struct shrinker *shrinker)
{
+ struct dentry *debugfs_entry;
+
if (!(shrinker->flags & SHRINKER_REGISTERED))
return;
@@ -749,9 +757,11 @@ void unregister_shrinker(struct shrinker *shrinker)
shrinker->flags &= ~SHRINKER_REGISTERED;
if (shrinker->flags & SHRINKER_MEMCG_AWARE)
unregister_memcg_shrinker(shrinker);
- shrinker_debugfs_remove(shrinker);
+ debugfs_entry = shrinker_debugfs_remove(shrinker);
up_write(&shrinker_rwsem);
+ debugfs_remove_recursive(debugfs_entry);
+
kfree(shrinker->nr_deferred);
shrinker->nr_deferred = NULL;
}
@@ -905,7 +915,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
}
/* Call non-slab shrinkers even though kmem is disabled */
- if (!memcg_kmem_enabled() &&
+ if (!memcg_kmem_online() &&
!(shrinker->flags & SHRINKER_NONSLAB))
continue;
@@ -1920,7 +1930,7 @@ retry:
!test_bit(PGDAT_DIRTY, &pgdat->flags))) {
/*
* Immediately reclaim when written back.
- * Similar in principle to deactivate_page()
+ * Similar in principle to folio_deactivate()
* except we already have the folio isolated
* and know it's dirty
*/
@@ -2327,12 +2337,12 @@ move:
* (2) The lru_lock must not be held.
* (3) Interrupts must be enabled.
*
- * Return: 0 if the folio was removed from an LRU list.
- * -EBUSY if the folio was not on an LRU list.
+ * Return: true if the folio was removed from an LRU list.
+ * false if the folio was not on an LRU list.
*/
-int folio_isolate_lru(struct folio *folio)
+bool folio_isolate_lru(struct folio *folio)
{
- int ret = -EBUSY;
+ bool ret = false;
VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio);
@@ -2343,7 +2353,7 @@ int folio_isolate_lru(struct folio *folio)
lruvec = folio_lruvec_lock_irq(folio);
lruvec_del_folio(lruvec, folio);
unlock_page_lruvec_irq(lruvec);
- ret = 0;
+ ret = true;
}
return ret;
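A hypothetical caller updated for the bool return (sketch; pagelist is a placeholder list of isolated folios):

    /* folio_isolate_lru() now reports success as a bool */
    if (!folio_isolate_lru(folio))
            continue;       /* not on an LRU list (previously -EBUSY) */
    list_add_tail(&folio->lru, &pagelist);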
@@ -3176,6 +3186,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
+#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
+
static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
{
struct pglist_data *pgdat = NODE_DATA(nid);
@@ -3201,6 +3214,9 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ if (!sc->may_swap)
+ return 0;
+
if (!can_demote(pgdat->node_id, sc) &&
mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
return 0;
@@ -3215,13 +3231,105 @@ static int get_nr_gens(struct lruvec *lruvec, int type)
static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
{
- /* see the comment on lru_gen_struct */
+ /* see the comment on lru_gen_folio */
return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
}
/******************************************************************************
+ * Bloom filters
+ ******************************************************************************/
+
+/*
+ * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
+ * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
+ * bits in a bitmap, k is the number of hash functions and n is the number of
+ * inserted items.
+ *
+ * Page table walkers use one of the two filters to reduce their search space.
+ * To get rid of non-leaf entries that no longer have enough leaf entries, the
+ * aging uses the double-buffering technique to flip to the other filter each
+ * time it produces a new generation. For non-leaf entries that have enough
+ * leaf entries, the aging carries them over to the next generation in
+ * walk_pmd_range(); the eviction also reports them when walking the rmap
+ * in lru_gen_look_around().
+ *
+ * For future optimizations:
+ * 1. It's not necessary to keep both filters all the time. The spare one can be
+ * freed after the RCU grace period and reallocated if needed again.
+ * 2. And when reallocating, it's worth scaling its size according to the number
+ * of inserted entries in the other filter, to reduce the memory overhead on
+ * small systems and false positives on large systems.
+ * 3. Jenkins' hash function is an alternative to Knuth's.
+ */
+#define BLOOM_FILTER_SHIFT 15
+
+static inline int filter_gen_from_seq(unsigned long seq)
+{
+ return seq % NR_BLOOM_FILTERS;
+}
+
+static void get_item_key(void *item, int *key)
+{
+ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
+
+ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
+
+ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
+ key[1] = hash >> BLOOM_FILTER_SHIFT;
+}
+
+static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+{
+ int key[2];
+ unsigned long *filter;
+ int gen = filter_gen_from_seq(seq);
+
+ filter = READ_ONCE(lruvec->mm_state.filters[gen]);
+ if (!filter)
+ return true;
+
+ get_item_key(item, key);
+
+ return test_bit(key[0], filter) && test_bit(key[1], filter);
+}
+
+static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+{
+ int key[2];
+ unsigned long *filter;
+ int gen = filter_gen_from_seq(seq);
+
+ filter = READ_ONCE(lruvec->mm_state.filters[gen]);
+ if (!filter)
+ return;
+
+ get_item_key(item, key);
+
+ if (!test_bit(key[0], filter))
+ set_bit(key[0], filter);
+ if (!test_bit(key[1], filter))
+ set_bit(key[1], filter);
+}
+
+static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
+{
+ unsigned long *filter;
+ int gen = filter_gen_from_seq(seq);
+
+ filter = lruvec->mm_state.filters[gen];
+ if (filter) {
+ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
+ return;
+ }
+
+ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
+}
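The quoted false-positive rates can be checked against the standard Bloom-filter estimate (a back-of-the-envelope check, not from the patch):

    p \approx \left(1 - e^{-kn/m}\right)^k, \qquad m = 2^{15},\ k = 2:

    n = 10{,}000 \Rightarrow p \approx (1 - e^{-0.61})^2 \approx 0.21 \approx 1/5
    n = 20{,}000 \Rightarrow p \approx (1 - e^{-1.22})^2 \approx 0.50 \approx 1/2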
+
+/******************************************************************************
* mm_struct list
******************************************************************************/
@@ -3323,13 +3431,16 @@ void lru_gen_migrate_mm(struct mm_struct *mm)
if (mem_cgroup_disabled())
return;
+ /* migration can happen before addition */
+ if (!mm->lru_gen.memcg)
+ return;
+
rcu_read_lock();
memcg = mem_cgroup_from_task(task);
rcu_read_unlock();
if (memcg == mm->lru_gen.memcg)
return;
- VM_WARN_ON_ONCE(!mm->lru_gen.memcg);
VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
lru_gen_del_mm(mm);
@@ -3337,94 +3448,6 @@ void lru_gen_migrate_mm(struct mm_struct *mm)
}
#endif
-/*
- * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
- * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
- * bits in a bitmap, k is the number of hash functions and n is the number of
- * inserted items.
- *
- * Page table walkers use one of the two filters to reduce their search space.
- * To get rid of non-leaf entries that no longer have enough leaf entries, the
- * aging uses the double-buffering technique to flip to the other filter each
- * time it produces a new generation. For non-leaf entries that have enough
- * leaf entries, the aging carries them over to the next generation in
- * walk_pmd_range(); the eviction also report them when walking the rmap
- * in lru_gen_look_around().
- *
- * For future optimizations:
- * 1. It's not necessary to keep both filters all the time. The spare one can be
- * freed after the RCU grace period and reallocated if needed again.
- * 2. And when reallocating, it's worth scaling its size according to the number
- * of inserted entries in the other filter, to reduce the memory overhead on
- * small systems and false positives on large systems.
- * 3. Jenkins' hash function is an alternative to Knuth's.
- */
-#define BLOOM_FILTER_SHIFT 15
-
-static inline int filter_gen_from_seq(unsigned long seq)
-{
- return seq % NR_BLOOM_FILTERS;
-}
-
-static void get_item_key(void *item, int *key)
-{
- u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
-
- BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
-
- key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
- key[1] = hash >> BLOOM_FILTER_SHIFT;
-}
-
-static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
-{
- unsigned long *filter;
- int gen = filter_gen_from_seq(seq);
-
- filter = lruvec->mm_state.filters[gen];
- if (filter) {
- bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
- return;
- }
-
- filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
- __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
- WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
-}
-
-static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
-{
- int key[2];
- unsigned long *filter;
- int gen = filter_gen_from_seq(seq);
-
- filter = READ_ONCE(lruvec->mm_state.filters[gen]);
- if (!filter)
- return;
-
- get_item_key(item, key);
-
- if (!test_bit(key[0], filter))
- set_bit(key[0], filter);
- if (!test_bit(key[1], filter))
- set_bit(key[1], filter);
-}
-
-static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
-{
- int key[2];
- unsigned long *filter;
- int gen = filter_gen_from_seq(seq);
-
- filter = READ_ONCE(lruvec->mm_state.filters[gen]);
- if (!filter)
- return true;
-
- get_item_key(item, key);
-
- return test_bit(key[0], filter) && test_bit(key[1], filter);
-}
-
static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
{
int i;
@@ -3612,7 +3635,7 @@ struct ctrl_pos {
static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
struct ctrl_pos *pos)
{
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
pos->refaulted = lrugen->avg_refaulted[type][tier] +
@@ -3627,7 +3650,7 @@ static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
{
int hist, tier;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
@@ -3704,7 +3727,7 @@ static int folio_update_gen(struct folio *folio, int gen)
static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
int type = folio_is_file_lru(folio);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
@@ -3749,7 +3772,7 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
{
int gen, type, zone;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
walk->batched = 0;
@@ -3782,7 +3805,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal
if (is_vm_hugetlb_page(vma))
return true;
- if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ))
+ if (!vma_has_recency(vma))
+ return true;
+
+ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL))
return true;
if (vma == get_gate_vma(vma->vm_mm))
@@ -3977,8 +4003,8 @@ restart:
}
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
-static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
- struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
+static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
+ struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
{
int i;
pmd_t *pmd;
@@ -3991,18 +4017,19 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
VM_WARN_ON_ONCE(pud_leaf(*pud));
/* try to batch at most 1+MIN_LRU_BATCH+1 entries */
- if (*start == -1) {
- *start = next;
+ if (*first == -1) {
+ *first = addr;
+ bitmap_zero(bitmap, MIN_LRU_BATCH);
return;
}
- i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start);
+ i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first);
if (i && i <= MIN_LRU_BATCH) {
__set_bit(i - 1, bitmap);
return;
}
- pmd = pmd_offset(pud, *start);
+ pmd = pmd_offset(pud, *first);
ptl = pmd_lockptr(args->mm, pmd);
if (!spin_trylock(ptl))
@@ -4013,15 +4040,16 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
do {
unsigned long pfn;
struct folio *folio;
- unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start;
+
+ /* don't round down the first address */
+ addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first;
pfn = get_pmd_pfn(pmd[i], vma, addr);
if (pfn == -1)
goto next;
if (!pmd_trans_huge(pmd[i])) {
- if (arch_has_hw_nonleaf_pmd_young() &&
- get_cap(LRU_GEN_NONLEAF_YOUNG))
+ if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
pmdp_test_and_clear_young(vma, addr, pmd + i);
goto next;
}
@@ -4050,12 +4078,11 @@ next:
arch_leave_lazy_mmu_mode();
spin_unlock(ptl);
done:
- *start = -1;
- bitmap_zero(bitmap, MIN_LRU_BATCH);
+ *first = -1;
}
#else
-static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
- struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
+static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
+ struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
{
}
#endif
@@ -4068,9 +4095,9 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
unsigned long next;
unsigned long addr;
struct vm_area_struct *vma;
- unsigned long pos = -1;
+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)];
+ unsigned long first = -1;
struct lru_gen_mm_walk *walk = args->private;
- unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
VM_WARN_ON_ONCE(pud_leaf(*pud));
@@ -4109,18 +4136,17 @@ restart:
if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
continue;
- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
continue;
}
#endif
walk->mm_stats[MM_NONLEAF_TOTAL]++;
- if (arch_has_hw_nonleaf_pmd_young() &&
- get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+ if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) {
if (!pmd_young(val))
continue;
- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
}
if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
@@ -4137,7 +4163,7 @@ restart:
update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);
}
- walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos);
+ walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first);
if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
goto restart;
@@ -4227,7 +4253,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
} while (err == -EAGAIN);
}
-static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
+static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
{
struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
@@ -4235,7 +4261,7 @@ static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
VM_WARN_ON_ONCE(walk);
walk = &pgdat->mm_walk;
- } else if (!pgdat && !walk) {
+ } else if (!walk && force_alloc) {
VM_WARN_ON_ONCE(current_is_kswapd());
walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
@@ -4263,7 +4289,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
{
int zone;
int remaining = MAX_LRU_BATCH;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
if (type == LRU_GEN_ANON && !can_swap)
@@ -4271,7 +4297,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
/* prevent cold/hot inversion if force_scan is true */
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- struct list_head *head = &lrugen->lists[old_gen][type][zone];
+ struct list_head *head = &lrugen->folios[old_gen][type][zone];
while (!list_empty(head)) {
struct folio *folio = lru_to_folio(head);
@@ -4282,7 +4308,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
new_gen = folio_inc_gen(lruvec, folio, false);
- list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]);
+ list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
if (!--remaining)
return false;
@@ -4299,7 +4325,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
{
int gen, type, zone;
bool success = false;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
DEFINE_MIN_SEQ(lruvec);
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
@@ -4310,7 +4336,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
gen = lru_gen_from_seq(min_seq[type]);
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- if (!list_empty(&lrugen->lists[gen][type][zone]))
+ if (!list_empty(&lrugen->folios[gen][type][zone]))
goto next;
}
@@ -4320,7 +4346,7 @@ next:
;
}
- /* see the comment on lru_gen_struct */
+ /* see the comment on lru_gen_folio */
if (can_swap) {
min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
@@ -4342,7 +4368,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
{
int prev, next;
int type, zone;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
spin_lock_irq(&lruvec->lru_lock);
@@ -4400,7 +4426,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
bool success;
struct lru_gen_mm_walk *walk;
struct mm_struct *mm = NULL;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
@@ -4416,12 +4442,12 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
* handful of PTEs. Spreading the work out over a period of time usually
* is less efficient, but it avoids bursty page faults.
*/
- if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
+ if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
}
- walk = set_mm_walk(NULL);
+ walk = set_mm_walk(NULL, true);
if (!walk) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
@@ -4444,8 +4470,7 @@ done:
if (sc->priority <= DEF_PRIORITY - 2)
wait_event_killable(lruvec->mm_state.wait,
max_seq < READ_ONCE(lrugen->max_seq));
-
- return max_seq < READ_ONCE(lrugen->max_seq);
+ return false;
}
VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
@@ -4458,97 +4483,56 @@ done:
return true;
}
-static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
- struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
+/******************************************************************************
+ * working set protection
+ ******************************************************************************/
+
+static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
{
int gen, type, zone;
- unsigned long old = 0;
- unsigned long young = 0;
unsigned long total = 0;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ bool can_swap = get_swappiness(lruvec, sc);
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
for (type = !can_swap; type < ANON_AND_FILE; type++) {
unsigned long seq;
for (seq = min_seq[type]; seq <= max_seq; seq++) {
- unsigned long size = 0;
-
gen = lru_gen_from_seq(seq);
for (zone = 0; zone < MAX_NR_ZONES; zone++)
- size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
-
- total += size;
- if (seq == max_seq)
- young += size;
- else if (seq + MIN_NR_GENS == max_seq)
- old += size;
+ total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
}
}
- /* try to scrape all its memory if this memcg was deleted */
- *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
-
- /*
- * The aging tries to be lazy to reduce the overhead, while the eviction
- * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
- * ideal number of generations is MIN_NR_GENS+1.
- */
- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
- return true;
- if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
- return false;
-
- /*
- * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
- * of the total number of pages for each generation. A reasonable range
- * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
- * aging cares about the upper bound of hot pages, while the eviction
- * cares about the lower bound of cold pages.
- */
- if (young * MIN_NR_GENS > total)
- return true;
- if (old * (MIN_NR_GENS + 2) < total)
- return true;
-
- return false;
+ /* whether the size is big enough to be helpful */
+ return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
}
-static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
+static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc,
+ unsigned long min_ttl)
{
- bool need_aging;
- unsigned long nr_to_scan;
- int swappiness = get_swappiness(lruvec, sc);
+ int gen;
+ unsigned long birth;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- DEFINE_MAX_SEQ(lruvec);
DEFINE_MIN_SEQ(lruvec);
- VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
+ /* see the comment on lru_gen_folio */
+ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
- mem_cgroup_calculate_protection(NULL, memcg);
-
- if (mem_cgroup_below_min(NULL, memcg))
+ if (time_is_after_jiffies(birth + min_ttl))
return false;
- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
-
- if (min_ttl) {
- int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
- unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
-
- if (time_is_after_jiffies(birth + min_ttl))
- return false;
-
- /* the size is likely too small to be helpful */
- if (!nr_to_scan && sc->priority != DEF_PRIORITY)
- return false;
- }
+ if (!lruvec_is_sizable(lruvec, sc))
+ return false;
- if (need_aging)
- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
+ mem_cgroup_calculate_protection(NULL, memcg);
- return true;
+ return !mem_cgroup_below_min(NULL, memcg);
}
/* to protect the working set of the last N jiffies */
@@ -4557,46 +4541,30 @@ static unsigned long lru_gen_min_ttl __read_mostly;
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct mem_cgroup *memcg;
- bool success = false;
unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
VM_WARN_ON_ONCE(!current_is_kswapd());
- sc->last_reclaimed = sc->nr_reclaimed;
-
- /*
- * To reduce the chance of going into the aging path, which can be
- * costly, optimistically skip it if the flag below was cleared in the
- * eviction path. This improves the overall performance when multiple
- * memcgs are available.
- */
- if (!sc->memcgs_need_aging) {
- sc->memcgs_need_aging = true;
+ /* check the order to exclude compaction-induced reclaim */
+ if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
return;
- }
-
- set_mm_walk(pgdat);
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
- if (age_lruvec(lruvec, sc, min_ttl))
- success = true;
+ if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
+ mem_cgroup_iter_break(NULL, memcg);
+ return;
+ }
cond_resched();
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
- clear_mm_walk();
-
- /* check the order to exclude compaction-induced reclaim */
- if (success || !min_ttl || sc->order)
- return;
-
/*
* The main goal is to OOM kill if every generation from all memcgs is
* younger than min_ttl. However, another possibility is all memcgs are
- * either below min or empty.
+ * either too small or below min.
*/
if (mutex_trylock(&oom_lock)) {
struct oom_control oc = {
@@ -4609,6 +4577,10 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
}
}
+/******************************************************************************
+ * rmap/PT walk feedback
+ ******************************************************************************/
+
/*
* This function exploits spatial locality when shrink_folio_list() walks the
* rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
@@ -4619,13 +4591,12 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
{
int i;
- pte_t *pte;
unsigned long start;
unsigned long end;
- unsigned long addr;
struct lru_gen_mm_walk *walk;
int young = 0;
- unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
+ pte_t *pte = pvmw->pte;
+ unsigned long addr = pvmw->address;
struct folio *folio = pfn_folio(pvmw->pfn);
struct mem_cgroup *memcg = folio_memcg(folio);
struct pglist_data *pgdat = folio_pgdat(folio);
@@ -4642,25 +4613,28 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
/* avoid taking the LRU lock under the PTL when possible */
walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
- start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
- end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
+ start = max(addr & PMD_MASK, pvmw->vma->vm_start);
+ end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
- if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
end = start + MIN_LRU_BATCH * PAGE_SIZE;
- else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2)
start = end - MIN_LRU_BATCH * PAGE_SIZE;
else {
- start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2;
- end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2;
+ start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2;
+ end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2;
}
}
- pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
+ /* folio_update_gen() requires stable folio_memcg() */
+ if (!mem_cgroup_trylock_pages(memcg))
+ return;
- rcu_read_lock();
arch_enter_lazy_mmu_mode();
+ pte -= (addr - start) / PAGE_SIZE;
+
for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
unsigned long pfn;
@@ -4685,58 +4659,171 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
!folio_test_swapcache(folio)))
folio_mark_dirty(folio);
+ if (walk) {
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen >= 0 && old_gen != new_gen)
+ update_batch_size(walk, folio, old_gen, new_gen);
+
+ continue;
+ }
+
old_gen = folio_lru_gen(folio);
if (old_gen < 0)
folio_set_referenced(folio);
else if (old_gen != new_gen)
- __set_bit(i, bitmap);
+ folio_activate(folio);
}
arch_leave_lazy_mmu_mode();
- rcu_read_unlock();
+ mem_cgroup_unlock_pages();
/* feedback from rmap walkers to page table walkers */
if (suitable_to_scan(i, young))
update_bloom_filter(lruvec, max_seq, pvmw->pmd);
+}
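/*
 * Editor's sketch (not part of the patch): how the look-around window in
 * the hunk above is derived. The scan range is first limited to the PMD
 * and the VMA, then shrunk to MIN_LRU_BATCH pages centered on the young
 * address. The PAGE_SIZE, PMD and MIN_LRU_BATCH values below are
 * assumptions for a 4K-page, 2M-PMD, 64-bit configuration.
 */
#include <stdio.h>

#define PAGE_SIZE_X     4096UL
#define PMD_SIZE_X      (512 * PAGE_SIZE_X)      /* 2 MiB */
#define PMD_MASK_X      (~(PMD_SIZE_X - 1))
#define MIN_LRU_BATCH_X 64UL                     /* BITS_PER_LONG on 64-bit */

static void window(unsigned long addr, unsigned long vm_start, unsigned long vm_end)
{
	unsigned long start = addr & PMD_MASK_X;
	unsigned long end = (addr | ~PMD_MASK_X) + 1;

	/* clamp to the VMA */
	if (start < vm_start)
		start = vm_start;
	if (end > vm_end)
		end = vm_end;

	/* then shrink to at most MIN_LRU_BATCH pages around addr */
	if (end - start > MIN_LRU_BATCH_X * PAGE_SIZE_X) {
		if (addr - start < MIN_LRU_BATCH_X * PAGE_SIZE_X / 2)
			end = start + MIN_LRU_BATCH_X * PAGE_SIZE_X;
		else if (end - addr < MIN_LRU_BATCH_X * PAGE_SIZE_X / 2)
			start = end - MIN_LRU_BATCH_X * PAGE_SIZE_X;
		else {
			start = addr - MIN_LRU_BATCH_X * PAGE_SIZE_X / 2;
			end = addr + MIN_LRU_BATCH_X * PAGE_SIZE_X / 2;
		}
	}
	printf("addr %#lx -> scan [%#lx, %#lx), %lu PTEs\n",
	       addr, start, end, (end - start) / PAGE_SIZE_X);
}

int main(void)
{
	/* a young PTE in the middle of a large VMA: 64 PTEs centered on addr */
	window(0x7f0000100000UL, 0x7f0000000000UL, 0x7f0000400000UL);
	return 0;
}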
- if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
- for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
- folio = pfn_folio(pte_pfn(pte[i]));
- folio_activate(folio);
- }
- return;
+/******************************************************************************
+ * memcg LRU
+ ******************************************************************************/
+
+/* see the comment on MEMCG_NR_GENS */
+enum {
+ MEMCG_LRU_NOP,
+ MEMCG_LRU_HEAD,
+ MEMCG_LRU_TAIL,
+ MEMCG_LRU_OLD,
+ MEMCG_LRU_YOUNG,
+};
+
+#ifdef CONFIG_MEMCG
+
+static int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return READ_ONCE(lruvec->lrugen.seg);
+}
+
+static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+{
+ int seg;
+ int old, new;
+ int bin = get_random_u32_below(MEMCG_NR_BINS);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ seg = 0;
+ new = old = lruvec->lrugen.gen;
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (op == MEMCG_LRU_HEAD)
+ seg = MEMCG_LRU_HEAD;
+ else if (op == MEMCG_LRU_TAIL)
+ seg = MEMCG_LRU_TAIL;
+ else if (op == MEMCG_LRU_OLD)
+ new = get_memcg_gen(pgdat->memcg_lru.seq);
+ else if (op == MEMCG_LRU_YOUNG)
+ new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
+ else
+ VM_WARN_ON_ONCE(true);
+
+ hlist_nulls_del_rcu(&lruvec->lrugen.list);
+
+ if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
+ hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+ else
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+
+ pgdat->memcg_lru.nr_memcgs[old]--;
+ pgdat->memcg_lru.nr_memcgs[new]++;
+
+ lruvec->lrugen.gen = new;
+ WRITE_ONCE(lruvec->lrugen.seg, seg);
+
+ if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+ spin_unlock(&pgdat->memcg_lru.lock);
+}
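/*
 * Editor's sketch (not part of the patch): the memcg LRU above is a 2D
 * ring of hlist_nulls heads, fifo[MEMCG_NR_GENS][MEMCG_NR_BINS], indexed
 * by a per-node sequence counter. Assuming get_memcg_gen() is a plain
 * "seq % MEMCG_NR_GENS" and MEMCG_NR_GENS is 2 (both helpers/constants
 * live outside this hunk), MEMCG_LRU_OLD keeps a lruvec in the generation
 * currently being scanned while MEMCG_LRU_YOUNG moves it to the next one.
 */
#include <stdio.h>

#define MEMCG_NR_GENS_X 2   /* assumption for illustration */

static int get_memcg_gen_x(unsigned long seq)
{
	return seq % MEMCG_NR_GENS_X;
}

int main(void)
{
	unsigned long seq = 41;   /* stands in for pgdat->memcg_lru.seq */

	printf("MEMCG_LRU_OLD targets gen   %d\n", get_memcg_gen_x(seq));
	printf("MEMCG_LRU_YOUNG targets gen %d\n", get_memcg_gen_x(seq + 1));
	/* once the old generation drains, seq advances and the roles flip */
	printf("after seq++, scanning moves to gen %d\n", get_memcg_gen_x(seq + 1));
	return 0;
}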
+
+void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+ int gen;
+ int nid;
+ int bin = get_random_u32_below(MEMCG_NR_BINS);
+
+ for_each_node(nid) {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ gen = get_memcg_gen(pgdat->memcg_lru.seq);
+
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
+ pgdat->memcg_lru.nr_memcgs[gen]++;
+
+ lruvec->lrugen.gen = gen;
+
+ spin_unlock(&pgdat->memcg_lru.lock);
}
+}
- /* folio_update_gen() requires stable folio_memcg() */
- if (!mem_cgroup_trylock_pages(memcg))
- return;
+void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+ int nid;
- if (!walk) {
- spin_lock_irq(&lruvec->lru_lock);
- new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
}
+}
- for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
- folio = pfn_folio(pte_pfn(pte[i]));
- if (folio_memcg_rcu(folio) != memcg)
- continue;
+void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+ int gen;
+ int nid;
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen < 0 || old_gen == new_gen)
- continue;
+ for_each_node(nid) {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
- if (walk)
- update_batch_size(walk, folio, old_gen, new_gen);
- else
- lru_gen_update_size(lruvec, folio, old_gen, new_gen);
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ gen = lruvec->lrugen.gen;
+
+ hlist_nulls_del_rcu(&lruvec->lrugen.list);
+ pgdat->memcg_lru.nr_memcgs[gen]--;
+
+ if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+ spin_unlock(&pgdat->memcg_lru.lock);
}
+}
+
+void lru_gen_soft_reclaim(struct lruvec *lruvec)
+{
+ /* see the comment on MEMCG_NR_GENS */
+ if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
+}
- if (!walk)
- spin_unlock_irq(&lruvec->lru_lock);
+#else /* !CONFIG_MEMCG */
- mem_cgroup_unlock_pages();
+static int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return 0;
}
+#endif
+
/******************************************************************************
* the eviction
******************************************************************************/
@@ -4750,7 +4837,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
int delta = folio_nr_pages(folio);
int refs = folio_lru_refs(folio);
int tier = lru_tier_from_refs(refs);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
@@ -4775,7 +4862,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
/* promoted */
if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
- list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
+ list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
}
@@ -4784,7 +4871,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
gen = folio_inc_gen(lruvec, folio, false);
- list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]);
+ list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
lrugen->protected[hist][type][tier - 1] + delta);
@@ -4796,7 +4883,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
if (folio_test_locked(folio) || folio_test_writeback(folio) ||
(type == LRU_GEN_FILE && folio_test_dirty(folio))) {
gen = folio_inc_gen(lruvec, folio, true);
- list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
+ list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
}
@@ -4807,12 +4894,8 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
{
bool success;
- /* unmapping inhibited */
- if (!sc->may_unmap && folio_mapped(folio))
- return false;
-
/* swapping inhibited */
- if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
+ if (!(sc->gfp_mask & __GFP_IO) &&
(folio_test_dirty(folio) ||
(folio_test_anon(folio) && !folio_test_swapcache(folio))))
return false;
@@ -4850,7 +4933,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
int scanned = 0;
int isolated = 0;
int remaining = MAX_LRU_BATCH;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
VM_WARN_ON_ONCE(!list_empty(list));
@@ -4863,7 +4946,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
for (zone = sc->reclaim_idx; zone >= 0; zone--) {
LIST_HEAD(moved);
int skipped = 0;
- struct list_head *head = &lrugen->lists[gen][type][zone];
+ struct list_head *head = &lrugen->folios[gen][type][zone];
while (!list_empty(head)) {
struct folio *folio = lru_to_folio(head);
@@ -4909,9 +4992,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
__count_vm_events(PGSCAN_ANON + type, isolated);
/*
- * There might not be eligible pages due to reclaim_idx, may_unmap and
- * may_writepage. Check the remaining to prevent livelock if it's not
- * making progress.
+ * There might not be eligible folios due to reclaim_idx. Check the
+ * remaining to prevent livelock if it's not making progress.
*/
return isolated || !remaining ? scanned : 0;
}
@@ -5006,8 +5088,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
return scanned;
}
-static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
- bool *need_swapping)
+static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
{
int type;
int scanned;
@@ -5096,153 +5177,348 @@ retry:
goto retry;
}
- if (need_swapping && type == LRU_GEN_ANON)
- *need_swapping = true;
-
return scanned;
}
+static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
+ struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
+{
+ int gen, type, zone;
+ unsigned long old = 0;
+ unsigned long young = 0;
+ unsigned long total = 0;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ /* whether this lruvec is completely out of cold folios */
+ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
+ *nr_to_scan = 0;
+ return true;
+ }
+
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ unsigned long seq;
+
+ for (seq = min_seq[type]; seq <= max_seq; seq++) {
+ unsigned long size = 0;
+
+ gen = lru_gen_from_seq(seq);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+
+ total += size;
+ if (seq == max_seq)
+ young += size;
+ else if (seq + MIN_NR_GENS == max_seq)
+ old += size;
+ }
+ }
+
+ /* try to scrape all its memory if this memcg was deleted */
+ *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
+
+ /*
+ * The aging tries to be lazy to reduce the overhead, while the eviction
+ * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
+ * ideal number of generations is MIN_NR_GENS+1.
+ */
+ if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
+ return false;
+
+ /*
+ * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
+ * of the total number of pages for each generation. A reasonable range
+ * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
+ * aging cares about the upper bound of hot pages, while the eviction
+ * cares about the lower bound of cold pages.
+ */
+ if (young * MIN_NR_GENS > total)
+ return true;
+ if (old * (MIN_NR_GENS + 2) < total)
+ return true;
+
+ return false;
+}
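/*
 * Editor's worked example (not part of the patch) of the two ratio checks
 * above, assuming MIN_NR_GENS == 2: aging runs when the youngest
 * generation holds more than 1/MIN_NR_GENS of the pages, or the oldest
 * holds less than 1/(MIN_NR_GENS + 2).
 */
#include <stdbool.h>
#include <stdio.h>

#define MIN_NR_GENS_X 2

static bool needs_aging(unsigned long young, unsigned long old, unsigned long total)
{
	if (young * MIN_NR_GENS_X > total)
		return true;               /* too many hot pages */
	if (old * (MIN_NR_GENS_X + 2) < total)
		return true;               /* too few cold pages */
	return false;
}

int main(void)
{
	/* 1000 pages: 400 young, 100 old -> old * 4 = 400 < 1000, so age */
	printf("%d\n", needs_aging(400, 100, 1000));
	/* 1000 pages: 300 young, 300 old -> balanced, no aging needed */
	printf("%d\n", needs_aging(300, 300, 1000));
	return 0;
}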
+
/*
* For future optimizations:
* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
* reclaim.
*/
-static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap, bool *need_aging)
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
{
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
- DEFINE_MIN_SEQ(lruvec);
- if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) ||
- (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) &&
- !sc->memcg_low_reclaim))
+ if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
return 0;
- *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
- if (!*need_aging)
+ if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
return nr_to_scan;
/* skip the aging path at the default priority */
if (sc->priority == DEF_PRIORITY)
- goto done;
+ return nr_to_scan;
- /* leave the work to lru_gen_age_node() */
- if (current_is_kswapd())
- return 0;
+ /* skip this lruvec as it's low on cold folios */
+ return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
+}
- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
- return nr_to_scan;
-done:
- return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
+static unsigned long get_nr_to_reclaim(struct scan_control *sc)
+{
+ /* don't abort memcg reclaim to ensure fairness */
+ if (!global_reclaim(sc))
+ return -1;
+
+ return max(sc->nr_to_reclaim, compact_gap(sc->order));
}
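/*
 * Editor's note (not part of the patch): the -1 above is returned through
 * an unsigned long, so memcg reclaim effectively gets ULONG_MAX as its
 * target and the "sc->nr_reclaimed >= nr_to_reclaim" checks never abort it.
 */
#include <limits.h>
#include <stdio.h>

int main(void)
{
	unsigned long nr_to_reclaim = -1;   /* what !global_reclaim(sc) gets */

	printf("%d\n", nr_to_reclaim == ULONG_MAX);   /* prints 1 */
	return 0;
}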
-static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
- struct scan_control *sc, bool need_swapping)
+static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
- int i;
- DEFINE_MAX_SEQ(lruvec);
+ long nr_to_scan;
+ unsigned long scanned = 0;
+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+ int swappiness = get_swappiness(lruvec, sc);
- if (!current_is_kswapd()) {
- /* age each memcg at most once to ensure fairness */
- if (max_seq - seq > 1)
- return true;
+ /* clean file folios are more likely to exist */
+ if (swappiness && !(sc->gfp_mask & __GFP_IO))
+ swappiness = 1;
- /* over-swapping can increase allocation latency */
- if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
- return true;
+ while (true) {
+ int delta;
- /* give this thread a chance to exit and free its memory */
- if (fatal_signal_pending(current)) {
- sc->nr_reclaimed += MIN_LRU_BATCH;
- return true;
- }
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
+ if (nr_to_scan <= 0)
+ break;
- if (cgroup_reclaim(sc))
- return false;
- } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
- return false;
+ delta = evict_folios(lruvec, sc, swappiness);
+ if (!delta)
+ break;
- /* keep scanning at low priorities to ensure fairness */
- if (sc->priority > DEF_PRIORITY - 2)
- return false;
+ scanned += delta;
+ if (scanned >= nr_to_scan)
+ break;
- /*
- * A minimum amount of work was done under global memory pressure. For
- * kswapd, it may be overshooting. For direct reclaim, the allocation
- * may succeed if all suitable zones are somewhat safe. In either case,
- * it's better to stop now, and restart later if necessary.
- */
- for (i = 0; i <= sc->reclaim_idx; i++) {
- unsigned long wmark;
- struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ break;
- if (!managed_zone(zone))
+ cond_resched();
+ }
+
+ /* whether try_to_inc_max_seq() was successful */
+ return nr_to_scan < 0;
+}
+
+static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
+{
+ bool success;
+ unsigned long scanned = sc->nr_scanned;
+ unsigned long reclaimed = sc->nr_reclaimed;
+ int seg = lru_gen_memcg_seg(lruvec);
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (!lruvec_is_sizable(lruvec, sc))
+ return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
+
+ mem_cgroup_calculate_protection(NULL, memcg);
+
+ if (mem_cgroup_below_min(NULL, memcg))
+ return MEMCG_LRU_YOUNG;
+
+ if (mem_cgroup_below_low(NULL, memcg)) {
+ /* see the comment on MEMCG_NR_GENS */
+ if (seg != MEMCG_LRU_TAIL)
+ return MEMCG_LRU_TAIL;
+
+ memcg_memory_event(memcg, MEMCG_LOW);
+ }
+
+ success = try_to_shrink_lruvec(lruvec, sc);
+
+ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
+
+ if (!sc->proactive)
+ vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
+
+ sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
+ current->reclaim_state->reclaimed_slab = 0;
+
+ return success ? MEMCG_LRU_YOUNG : 0;
+}
+
+#ifdef CONFIG_MEMCG
+
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int op;
+ int gen;
+ int bin;
+ int first_bin;
+ struct lruvec *lruvec;
+ struct lru_gen_folio *lrugen;
+ struct mem_cgroup *memcg;
+ const struct hlist_nulls_node *pos;
+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+
+ bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
+restart:
+ op = 0;
+ memcg = NULL;
+ gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
+
+ rcu_read_lock();
+
+ hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
+ if (op)
+ lru_gen_rotate_memcg(lruvec, op);
+
+ mem_cgroup_put(memcg);
+
+ lruvec = container_of(lrugen, struct lruvec, lrugen);
+ memcg = lruvec_memcg(lruvec);
+
+ if (!mem_cgroup_tryget(memcg)) {
+ op = 0;
+ memcg = NULL;
continue;
+ }
- wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
- if (wmark > zone_page_state(zone, NR_FREE_PAGES))
- return false;
+ rcu_read_unlock();
+
+ op = shrink_one(lruvec, sc);
+
+ rcu_read_lock();
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ break;
}
- sc->nr_reclaimed += MIN_LRU_BATCH;
+ rcu_read_unlock();
- return true;
+ if (op)
+ lru_gen_rotate_memcg(lruvec, op);
+
+ mem_cgroup_put(memcg);
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ return;
+
+ /* restart if raced with lru_gen_rotate_memcg() */
+ if (gen != get_nulls_value(pos))
+ goto restart;
+
+ /* try the rest of the bins of the current generation */
+ bin = get_memcg_bin(bin + 1);
+ if (bin != first_bin)
+ goto restart;
}
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
struct blk_plug plug;
- bool need_aging = false;
- bool need_swapping = false;
- unsigned long scanned = 0;
- unsigned long reclaimed = sc->nr_reclaimed;
- DEFINE_MAX_SEQ(lruvec);
+
+ VM_WARN_ON_ONCE(global_reclaim(sc));
+ VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap);
lru_add_drain();
blk_start_plug(&plug);
- set_mm_walk(lruvec_pgdat(lruvec));
+ set_mm_walk(NULL, sc->proactive);
- while (true) {
- int delta;
- int swappiness;
- unsigned long nr_to_scan;
+ if (try_to_shrink_lruvec(lruvec, sc))
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
- if (sc->may_swap)
- swappiness = get_swappiness(lruvec, sc);
- else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc))
- swappiness = 1;
- else
- swappiness = 0;
+ clear_mm_walk();
- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
- if (!nr_to_scan)
- goto done;
+ blk_finish_plug(&plug);
+}
- delta = evict_folios(lruvec, sc, swappiness, &need_swapping);
- if (!delta)
- goto done;
+#else /* !CONFIG_MEMCG */
- scanned += delta;
- if (scanned >= nr_to_scan)
- break;
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ BUILD_BUG();
+}
- if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
- break;
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ BUILD_BUG();
+}
- cond_resched();
- }
+#endif
+
+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int priority;
+ unsigned long reclaimable;
+ struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
+
+ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
+ return;
+ /*
+ * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
+ * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
+ * estimated reclaimed_to_scanned_ratio = inactive / total.
+ */
+ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (get_swappiness(lruvec, sc))
+ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ reclaimable /= MEMCG_NR_GENS;
+
+ /* round down reclaimable and round up sc->nr_to_reclaim */
+ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
+
+ sc->priority = clamp(priority, 0, DEF_PRIORITY);
+}
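/*
 * Editor's worked example (not part of the patch): the priority picked
 * above satisfies (reclaimable >> priority) >= nr_to_reclaim by rounding
 * reclaimable down and nr_to_reclaim up to powers of two. fls_long() is
 * reimplemented here; the MEMCG_NR_GENS and DEF_PRIORITY values are
 * assumptions for illustration.
 */
#include <stdio.h>

#define MEMCG_NR_GENS_X 2
#define DEF_PRIORITY_X  12

static int fls_long_x(unsigned long x)
{
	int r = 0;

	while (x) {          /* 1-based index of the highest set bit */
		x >>= 1;
		r++;
	}
	return r;
}

static int clamp_x(int v, int lo, int hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	unsigned long reclaimable = 131072;   /* inactive file + anon pages */
	unsigned long nr_to_reclaim = 1024;
	int priority;

	reclaimable /= MEMCG_NR_GENS_X;       /* 65536 = 2^16 */
	priority = fls_long_x(reclaimable) - 1 - fls_long_x(nr_to_reclaim - 1);
	priority = clamp_x(priority, 0, DEF_PRIORITY_X);

	/* 16 - 10 = 6; 65536 >> 6 = 1024, i.e. one pass can meet the target */
	printf("priority %d, scan target %lu\n", priority, reclaimable >> priority);
	return 0;
}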
+
+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ struct blk_plug plug;
+ unsigned long reclaimed = sc->nr_reclaimed;
+
+ VM_WARN_ON_ONCE(!global_reclaim(sc));
+
+ /*
+ * Unmapped clean folios are already prioritized. Scanning for more of
+ * them is likely futile and can cause high reclaim latency when there
+ * is a large number of memcgs.
+ */
+ if (!sc->may_writepage || !sc->may_unmap)
+ goto done;
+
+ lru_add_drain();
+
+ blk_start_plug(&plug);
+
+ set_mm_walk(pgdat, sc->proactive);
+
+ set_initial_priority(pgdat, sc);
+
+ if (current_is_kswapd())
+ sc->nr_reclaimed = 0;
+
+ if (mem_cgroup_disabled())
+ shrink_one(&pgdat->__lruvec, sc);
+ else
+ shrink_many(pgdat, sc);
+
+ if (current_is_kswapd())
+ sc->nr_reclaimed += reclaimed;
- /* see the comment in lru_gen_age_node() */
- if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
- sc->memcgs_need_aging = false;
-done:
clear_mm_walk();
blk_finish_plug(&plug);
+done:
+ /* kswapd should never fail */
+ pgdat->kswapd_failures = 0;
}
/******************************************************************************
@@ -5251,7 +5527,7 @@ done:
static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
{
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
if (lrugen->enabled) {
enum lru_list lru;
@@ -5264,7 +5540,7 @@ static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
int gen, type, zone;
for_each_gen_type_zone(gen, type, zone) {
- if (!list_empty(&lrugen->lists[gen][type][zone]))
+ if (!list_empty(&lrugen->folios[gen][type][zone]))
return false;
}
}
@@ -5309,7 +5585,7 @@ static bool drain_evictable(struct lruvec *lruvec)
int remaining = MAX_LRU_BATCH;
for_each_gen_type_zone(gen, type, zone) {
- struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
+ struct list_head *head = &lruvec->lrugen.folios[gen][type][zone];
while (!list_empty(head)) {
bool success;
@@ -5530,7 +5806,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
int i;
int type, tier;
int hist = lru_hist_from_seq(seq);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
for (tier = 0; tier < MAX_NR_TIERS; tier++) {
seq_printf(m, " %10d", tier);
@@ -5580,7 +5856,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v)
unsigned long seq;
bool full = !debugfs_real_fops(m->file)->write;
struct lruvec *lruvec = v;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
int nid = lruvec_pgdat(lruvec)->node_id;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
@@ -5677,7 +5953,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co
if (sc->nr_reclaimed >= nr_to_reclaim)
return 0;
- if (!evict_folios(lruvec, sc, swappiness, NULL))
+ if (!evict_folios(lruvec, sc, swappiness))
return 0;
cond_resched();
@@ -5698,11 +5974,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
if (!mem_cgroup_disabled()) {
rcu_read_lock();
+
memcg = mem_cgroup_from_id(memcg_id);
-#ifdef CONFIG_MEMCG
- if (memcg && !css_tryget(&memcg->css))
+ if (!mem_cgroup_tryget(memcg))
memcg = NULL;
-#endif
+
rcu_read_unlock();
if (!memcg)
@@ -5762,7 +6038,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
set_task_reclaim_state(current, &sc.reclaim_state);
flags = memalloc_noreclaim_save();
blk_start_plug(&plug);
- if (!set_mm_walk(NULL)) {
+ if (!set_mm_walk(NULL, true)) {
err = -ENOMEM;
goto done;
}
@@ -5834,7 +6110,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
{
int i;
int gen, type, zone;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
lrugen->max_seq = MIN_NR_GENS + 1;
lrugen->enabled = lru_gen_enabled();
@@ -5843,13 +6119,26 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
lrugen->timestamps[i] = jiffies;
for_each_gen_type_zone(gen, type, zone)
- INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+ INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
lruvec->mm_state.seq = MIN_NR_GENS;
init_waitqueue_head(&lruvec->mm_state.wait);
}
#ifdef CONFIG_MEMCG
+
+void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+ int i, j;
+
+ spin_lock_init(&pgdat->memcg_lru.lock);
+
+ for (i = 0; i < MEMCG_NR_GENS; i++) {
+ for (j = 0; j < MEMCG_NR_BINS; j++)
+ INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
+ }
+}
+
void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
INIT_LIST_HEAD(&memcg->mm_list.fifo);
@@ -5861,19 +6150,25 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
int i;
int nid;
+ VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo));
+
for_each_node(nid) {
struct lruvec *lruvec = get_lruvec(memcg, nid);
+ VM_WARN_ON_ONCE(lruvec->mm_state.nr_walkers);
VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
sizeof(lruvec->lrugen.nr_pages)));
+ lruvec->lrugen.list.next = LIST_POISON1;
+
for (i = 0; i < NR_BLOOM_FILTERS; i++) {
bitmap_free(lruvec->mm_state.filters[i]);
lruvec->mm_state.filters[i] = NULL;
}
}
}
-#endif
+
+#endif /* CONFIG_MEMCG */
static int __init init_lru_gen(void)
{
@@ -5900,6 +6195,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
{
}
+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+}
+
#endif /* CONFIG_LRU_GEN */
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
@@ -5913,7 +6212,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
bool proportional_reclaim;
struct blk_plug plug;
- if (lru_gen_enabled()) {
+ if (lru_gen_enabled() && !global_reclaim(sc)) {
lru_gen_shrink_lruvec(lruvec, sc);
return;
}
@@ -6156,6 +6455,11 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
struct lruvec *target_lruvec;
bool reclaimable = false;
+ if (lru_gen_enabled() && global_reclaim(sc)) {
+ lru_gen_shrink_node(pgdat, sc);
+ return;
+ }
+
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
again:
@@ -6754,8 +7058,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
- unsigned int reclaim_options,
- nodemask_t *nodemask)
+ unsigned int reclaim_options)
{
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
@@ -6770,7 +7073,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
- .nodemask = nodemask,
};
/*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put
diff --git a/mm/workingset.c b/mm/workingset.c
index 1a86645b7b3c..00c6f4d9d9be 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct folio *folio)
unsigned long token;
unsigned long min_seq;
struct lruvec *lruvec;
- struct lru_gen_struct *lrugen;
+ struct lru_gen_folio *lrugen;
int type = folio_is_file_lru(folio);
int delta = folio_nr_pages(folio);
int refs = folio_lru_refs(folio);
@@ -252,7 +252,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
unsigned long token;
unsigned long min_seq;
struct lruvec *lruvec;
- struct lru_gen_struct *lrugen;
+ struct lru_gen_folio *lrugen;
struct mem_cgroup *memcg;
struct pglist_data *pgdat;
int type = folio_is_file_lru(folio);
@@ -457,6 +457,7 @@ void workingset_refault(struct folio *folio, void *shadow)
*/
nr = folio_nr_pages(folio);
memcg = folio_memcg(folio);
+ pgdat = folio_pgdat(folio);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
@@ -474,7 +475,7 @@ void workingset_refault(struct folio *folio, void *shadow)
workingset_size += lruvec_page_state(eviction_lruvec,
NR_INACTIVE_FILE);
}
- if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
+ if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
workingset_size += lruvec_page_state(eviction_lruvec,
NR_ACTIVE_ANON);
if (file) {
@@ -656,11 +657,14 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
goto out;
}
- if (!spin_trylock(&mapping->host->i_lock)) {
- xa_unlock(&mapping->i_pages);
- spin_unlock_irq(lru_lock);
- ret = LRU_RETRY;
- goto out;
+ /* For page cache we need to hold i_lock */
+ if (mapping->host != NULL) {
+ if (!spin_trylock(&mapping->host->i_lock)) {
+ xa_unlock(&mapping->i_pages);
+ spin_unlock_irq(lru_lock);
+ ret = LRU_RETRY;
+ goto out;
+ }
}
list_lru_isolate(lru, item);
@@ -682,9 +686,11 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
out_invalid:
xa_unlock_irq(&mapping->i_pages);
- if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
- spin_unlock(&mapping->host->i_lock);
+ if (mapping->host != NULL) {
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
+ }
ret = LRU_REMOVED_RETRY;
out:
cond_resched();
diff --git a/mm/z3fold.c b/mm/z3fold.c
index a4de0c317ac7..0cef845d397b 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -1450,7 +1450,6 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
struct z3fold_header *zhdr;
struct z3fold_pool *pool;
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(PageIsolated(page), page);
if (test_bit(PAGE_HEADLESS, &page->private))
@@ -1490,7 +1489,6 @@ static int z3fold_page_migrate(struct page *newpage, struct page *page,
struct z3fold_header *zhdr, *new_zhdr;
struct z3fold_pool *pool;
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 9445bee6b014..3aed46ab7e6c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -73,13 +73,6 @@
*/
#define ZS_ALIGN 8
-/*
- * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
- * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
- */
-#define ZS_MAX_ZSPAGE_ORDER 2
-#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
-
#define ZS_HANDLE_SIZE (sizeof(unsigned long))
/*
@@ -113,17 +106,36 @@
* have room for at least two bits.
*/
#define OBJ_ALLOCATED_TAG 1
-#define OBJ_TAG_BITS 1
+
+#ifdef CONFIG_ZPOOL
+/*
+ * The second least-significant bit in the object's header identifies if the
+ * value stored at the header is a deferred handle from the last reclaim
+ * attempt.
+ *
+ * As noted above, this is valid because we have room for two bits.
+ */
+#define OBJ_DEFERRED_HANDLE_TAG 2
+#define OBJ_TAG_BITS 2
+#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)
+#else
+#define OBJ_TAG_BITS 1
+#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG
+#endif /* CONFIG_ZPOOL */
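/*
 * Editor's sketch (not part of the patch): a handle stored in an object's
 * header is at least word-aligned, so its low bits are free to carry the
 * tags defined above. obj_tagged() (further down in this patch) tests one
 * tag and strips all tag bits before handing the value back.
 */
#include <stdio.h>

#define OBJ_ALLOCATED_TAG_X        1UL
#define OBJ_DEFERRED_HANDLE_TAG_X  2UL
#define OBJ_TAG_MASK_X (OBJ_ALLOCATED_TAG_X | OBJ_DEFERRED_HANDLE_TAG_X)

int main(void)
{
	unsigned long handle = 0xffff888012345678UL & ~OBJ_TAG_MASK_X;
	unsigned long hdr;

	hdr = handle | OBJ_ALLOCATED_TAG_X;            /* live object */
	printf("allocated? %d\n", !!(hdr & OBJ_ALLOCATED_TAG_X));

	/* zs_free() under reclaim: re-tag the same word as deferred */
	hdr = (hdr | OBJ_DEFERRED_HANDLE_TAG_X) & ~OBJ_ALLOCATED_TAG_X;
	printf("deferred?  %d\n", !!(hdr & OBJ_DEFERRED_HANDLE_TAG_X));
	printf("handle back: %#lx\n", hdr & ~OBJ_TAG_MASK_X);
	return 0;
}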
+
#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
#define HUGE_BITS 1
#define FULLNESS_BITS 2
#define CLASS_BITS 8
-#define ISOLATED_BITS 3
+#define ISOLATED_BITS 5
#define MAGIC_VAL_BITS 8
#define MAX(a, b) ((a) >= (b) ? (a) : (b))
+
+#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(CONFIG_ZSMALLOC_CHAIN_SIZE, UL))
+
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
@@ -222,6 +234,12 @@ struct link_free {
* Handle of allocated object.
*/
unsigned long handle;
+#ifdef CONFIG_ZPOOL
+ /*
+ * Deferred handle of a reclaimed object.
+ */
+ unsigned long deferred_handle;
+#endif
};
};
@@ -272,8 +290,6 @@ struct zspage {
/* links the zspage to the lru list in the pool */
struct list_head lru;
bool under_reclaim;
- /* list of unfreed handles whose objects have been reclaimed */
- unsigned long *deferred_handles;
#endif
struct zs_pool *pool;
@@ -802,42 +818,6 @@ out:
return newfg;
}
-/*
- * We have to decide on how many pages to link together
- * to form a zspage for each size class. This is important
- * to reduce wastage due to unusable space left at end of
- * each zspage which is given as:
- * wastage = Zp % class_size
- * usage = Zp - wastage
- * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
- *
- * For example, for size class of 3/8 * PAGE_SIZE, we should
- * link together 3 PAGE_SIZE sized pages to form a zspage
- * since then we can perfectly fit in 8 such objects.
- */
-static int get_pages_per_zspage(int class_size)
-{
- int i, max_usedpc = 0;
- /* zspage order which gives maximum used size per KB */
- int max_usedpc_order = 1;
-
- for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
- int zspage_size;
- int waste, usedpc;
-
- zspage_size = i * PAGE_SIZE;
- waste = zspage_size % class_size;
- usedpc = (zspage_size - waste) * 100 / zspage_size;
-
- if (usedpc > max_usedpc) {
- max_usedpc = usedpc;
- max_usedpc_order = i;
- }
- }
-
- return max_usedpc_order;
-}
-
static struct zspage *get_zspage(struct page *page)
{
struct zspage *zspage = (struct zspage *)page_private(page);
@@ -897,7 +877,8 @@ static unsigned long handle_to_obj(unsigned long handle)
return *(unsigned long *)handle;
}
-static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
+static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle,
+ int tag)
{
unsigned long handle;
struct zspage *zspage = get_zspage(page);
@@ -908,13 +889,27 @@ static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
} else
handle = *(unsigned long *)obj;
- if (!(handle & OBJ_ALLOCATED_TAG))
+ if (!(handle & tag))
return false;
- *phandle = handle & ~OBJ_ALLOCATED_TAG;
+ /* Clear all tags before returning the handle */
+ *phandle = handle & ~OBJ_TAG_MASK;
return true;
}
+static inline bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
+{
+ return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG);
+}
+
+#ifdef CONFIG_ZPOOL
+static bool obj_stores_deferred_handle(struct page *page, void *obj,
+ unsigned long *phandle)
+{
+ return obj_tagged(page, obj, phandle, OBJ_DEFERRED_HANDLE_TAG);
+}
+#endif
+
static void reset_page(struct page *page)
{
__ClearPageMovable(page);
@@ -946,22 +941,36 @@ unlock:
}
#ifdef CONFIG_ZPOOL
+static unsigned long find_deferred_handle_obj(struct size_class *class,
+ struct page *page, int *obj_idx);
+
/*
* Free all the deferred handles whose objects are freed in zs_free.
*/
-static void free_handles(struct zs_pool *pool, struct zspage *zspage)
+static void free_handles(struct zs_pool *pool, struct size_class *class,
+ struct zspage *zspage)
{
- unsigned long handle = (unsigned long)zspage->deferred_handles;
+ int obj_idx = 0;
+ struct page *page = get_first_page(zspage);
+ unsigned long handle;
- while (handle) {
- unsigned long nxt_handle = handle_to_obj(handle);
+ while (1) {
+ handle = find_deferred_handle_obj(class, page, &obj_idx);
+ if (!handle) {
+ page = get_next_page(page);
+ if (!page)
+ break;
+ obj_idx = 0;
+ continue;
+ }
cache_free_handle(pool, handle);
- handle = nxt_handle;
+ obj_idx++;
}
}
#else
-static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {}
+static inline void free_handles(struct zs_pool *pool, struct size_class *class,
+ struct zspage *zspage) {}
#endif
static void __free_zspage(struct zs_pool *pool, struct size_class *class,
@@ -979,7 +988,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
VM_BUG_ON(fg != ZS_EMPTY);
/* Free all deferred handles from zs_free */
- free_handles(pool, zspage);
+ free_handles(pool, class, zspage);
next = page = get_first_page(zspage);
do {
@@ -1067,7 +1076,6 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
#ifdef CONFIG_ZPOOL
INIT_LIST_HEAD(&zspage->lru);
zspage->under_reclaim = false;
- zspage->deferred_handles = NULL;
#endif
set_freeobj(zspage, 0);
@@ -1568,7 +1576,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
}
EXPORT_SYMBOL_GPL(zs_malloc);
-static void obj_free(int class_size, unsigned long obj)
+static void obj_free(int class_size, unsigned long obj, unsigned long *handle)
{
struct link_free *link;
struct zspage *zspage;
@@ -1582,15 +1590,29 @@ static void obj_free(int class_size, unsigned long obj)
zspage = get_zspage(f_page);
vaddr = kmap_atomic(f_page);
-
- /* Insert this object in containing zspage's freelist */
link = (struct link_free *)(vaddr + f_offset);
- if (likely(!ZsHugePage(zspage)))
- link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
- else
- f_page->index = 0;
+
+ if (handle) {
+#ifdef CONFIG_ZPOOL
+ /* Stores the (deferred) handle in the object's header */
+ *handle |= OBJ_DEFERRED_HANDLE_TAG;
+ *handle &= ~OBJ_ALLOCATED_TAG;
+
+ if (likely(!ZsHugePage(zspage)))
+ link->deferred_handle = *handle;
+ else
+ f_page->index = *handle;
+#endif
+ } else {
+ /* Insert this object in containing zspage's freelist */
+ if (likely(!ZsHugePage(zspage)))
+ link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
+ else
+ f_page->index = 0;
+ set_freeobj(zspage, f_objidx);
+ }
+
kunmap_atomic(vaddr);
- set_freeobj(zspage, f_objidx);
mod_zspage_inuse(zspage, -1);
}
@@ -1615,7 +1637,6 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
zspage = get_zspage(f_page);
class = zspage_class(pool, zspage);
- obj_free(class->size, obj);
class_stat_dec(class, OBJ_USED, 1);
#ifdef CONFIG_ZPOOL
@@ -1624,15 +1645,15 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
* Reclaim needs the handles during writeback. It'll free
* them along with the zspage when it's done with them.
*
- * Record current deferred handle at the memory location
- * whose address is given by handle.
+ * Record current deferred handle in the object's header.
*/
- record_obj(handle, (unsigned long)zspage->deferred_handles);
- zspage->deferred_handles = (unsigned long *)handle;
+ obj_free(class->size, obj, &handle);
spin_unlock(&pool->lock);
return;
}
#endif
+ obj_free(class->size, obj, NULL);
+
fullness = fix_fullness_group(class, zspage);
if (fullness == ZS_EMPTY)
free_zspage(pool, class, zspage);
@@ -1713,11 +1734,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
}
/*
- * Find alloced object in zspage from index object and
+ * Find object with a certain tag in zspage from index object and
* return handle.
*/
-static unsigned long find_alloced_obj(struct size_class *class,
- struct page *page, int *obj_idx)
+static unsigned long find_tagged_obj(struct size_class *class,
+ struct page *page, int *obj_idx, int tag)
{
unsigned int offset;
int index = *obj_idx;
@@ -1728,7 +1749,7 @@ static unsigned long find_alloced_obj(struct size_class *class,
offset += class->size * index;
while (offset < PAGE_SIZE) {
- if (obj_allocated(page, addr + offset, &handle))
+ if (obj_tagged(page, addr + offset, &handle, tag))
break;
offset += class->size;
@@ -1742,6 +1763,28 @@ static unsigned long find_alloced_obj(struct size_class *class,
return handle;
}
+/*
+ * Find alloced object in zspage from index object and
+ * return handle.
+ */
+static unsigned long find_alloced_obj(struct size_class *class,
+ struct page *page, int *obj_idx)
+{
+ return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG);
+}
+
+#ifdef CONFIG_ZPOOL
+/*
+ * Find object storing a deferred handle in header in zspage from index object
+ * and return handle.
+ */
+static unsigned long find_deferred_handle_obj(struct size_class *class,
+ struct page *page, int *obj_idx)
+{
+ return find_tagged_obj(class, page, obj_idx, OBJ_DEFERRED_HANDLE_TAG);
+}
+#endif
+
struct zs_compact_control {
/* Source spage for migration which could be a subpage of zspage */
struct page *s_page;
@@ -1784,7 +1827,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
zs_object_copy(class, free_obj, used_obj);
obj_idx++;
record_obj(handle, free_obj);
- obj_free(class->size, used_obj);
+ obj_free(class->size, used_obj, NULL);
}
/* Remember last position in this iteration */
@@ -1973,7 +2016,6 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
* Page is locked so zspage couldn't be destroyed. For detail, look at
* lock_zspage in free_zspage.
*/
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(PageIsolated(page), page);
zspage = get_zspage(page);
@@ -2005,7 +2047,6 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
if (mode == MIGRATE_SYNC_NO_COPY)
return -EINVAL;
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
/* The page is locked, so this pointer must remain valid */
@@ -2070,7 +2111,6 @@ static void zs_page_putback(struct page *page)
{
struct zspage *zspage;
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
zspage = get_zspage(page);
@@ -2321,6 +2361,27 @@ static int zs_register_shrinker(struct zs_pool *pool)
pool->name);
}
+static int calculate_zspage_chain_size(int class_size)
+{
+ int i, min_waste = INT_MAX;
+ int chain_size = 1;
+
+ if (is_power_of_2(class_size))
+ return chain_size;
+
+ for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
+ int waste;
+
+ waste = (i * PAGE_SIZE) % class_size;
+ if (waste < min_waste) {
+ min_waste = waste;
+ chain_size = i;
+ }
+ }
+
+ return chain_size;
+}
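/*
 * Editor's worked example (not part of the patch): for a 1536-byte class
 * (3/8 of a 4K page), wastage per chain length is 1024, 512, 0, ... bytes,
 * so the function above settles on a 3-page zspage. PAGE_SIZE and the
 * chain limit (the new Kconfig default of 8) are assumptions here, and the
 * power-of-two shortcut is omitted since neither class size triggers it.
 */
#include <limits.h>
#include <stdio.h>

#define PAGE_SIZE_X 4096
#define MAX_CHAIN_X 8

static int chain_size_for(int class_size)
{
	int i, waste, min_waste = INT_MAX, chain_size = 1;

	for (i = 1; i <= MAX_CHAIN_X; i++) {
		waste = (i * PAGE_SIZE_X) % class_size;
		if (waste < min_waste) {
			min_waste = waste;
			chain_size = i;
		}
	}
	return chain_size;
}

int main(void)
{
	printf("class 1536: %d pages\n", chain_size_for(1536));   /* 3 */
	printf("class 2720: %d pages\n", chain_size_for(2720));   /* 2 */
	return 0;
}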
+
/**
* zs_create_pool - Creates an allocation pool to work from.
* @name: pool name to be created
@@ -2365,7 +2426,7 @@ struct zs_pool *zs_create_pool(const char *name)
size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
if (size > ZS_MAX_ALLOC_SIZE)
size = ZS_MAX_ALLOC_SIZE;
- pages_per_zspage = get_pages_per_zspage(size);
+ pages_per_zspage = calculate_zspage_chain_size(size);
objs_per_zspage = pages_per_zspage * PAGE_SIZE / size;
/*
@@ -2478,6 +2539,90 @@ void zs_destroy_pool(struct zs_pool *pool)
EXPORT_SYMBOL_GPL(zs_destroy_pool);
#ifdef CONFIG_ZPOOL
+static void restore_freelist(struct zs_pool *pool, struct size_class *class,
+ struct zspage *zspage)
+{
+ unsigned int obj_idx = 0;
+ unsigned long handle, off = 0; /* off is within-page offset */
+ struct page *page = get_first_page(zspage);
+ struct link_free *prev_free = NULL;
+ void *prev_page_vaddr = NULL;
+
+ /* in case no free object found */
+ set_freeobj(zspage, (unsigned int)(-1UL));
+
+ while (page) {
+ void *vaddr = kmap_atomic(page);
+ struct page *next_page;
+
+ while (off < PAGE_SIZE) {
+ void *obj_addr = vaddr + off;
+
+ /* skip allocated object */
+ if (obj_allocated(page, obj_addr, &handle)) {
+ obj_idx++;
+ off += class->size;
+ continue;
+ }
+
+ /* free deferred handle from reclaim attempt */
+ if (obj_stores_deferred_handle(page, obj_addr, &handle))
+ cache_free_handle(pool, handle);
+
+ if (prev_free)
+ prev_free->next = obj_idx << OBJ_TAG_BITS;
+ else /* first free object found */
+ set_freeobj(zspage, obj_idx);
+
+ prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free);
+ /* if last free object in a previous page, need to unmap */
+ if (prev_page_vaddr) {
+ kunmap_atomic(prev_page_vaddr);
+ prev_page_vaddr = NULL;
+ }
+
+ obj_idx++;
+ off += class->size;
+ }
+
+ /*
+ * Handle the last (full or partial) object on this page.
+ */
+ next_page = get_next_page(page);
+ if (next_page) {
+ if (!prev_free || prev_page_vaddr) {
+ /*
+ * There is no free object in this page, so we can safely
+ * unmap it.
+ */
+ kunmap_atomic(vaddr);
+ } else {
+ /* update prev_page_vaddr since prev_free is on this page */
+ prev_page_vaddr = vaddr;
+ }
+ } else { /* this is the last page */
+ if (prev_free) {
+ /*
+ * Reset OBJ_TAG_BITS bit to last link to tell
+ * whether it's allocated object or not.
+ */
+ prev_free->next = -1UL << OBJ_TAG_BITS;
+ }
+
+ /* unmap previous page (if not done yet) */
+ if (prev_page_vaddr) {
+ kunmap_atomic(prev_page_vaddr);
+ prev_page_vaddr = NULL;
+ }
+
+ kunmap_atomic(vaddr);
+ }
+
+ page = next_page;
+ off %= PAGE_SIZE;
+ }
+}
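/*
 * Editor's sketch (not part of the patch): the freelist that
 * restore_freelist() rebuilds is an index chain, not a pointer chain.
 * Each free slot's link stores (next_free_obj_idx << OBJ_TAG_BITS), so
 * the allocated tag bit stays clear, and the last link holds
 * -1UL << OBJ_TAG_BITS as a terminator. OBJ_TAG_BITS == 2 here matches
 * the CONFIG_ZPOOL case above.
 */
#include <stdio.h>

#define OBJ_TAG_BITS_X 2

int main(void)
{
	/* free object indices discovered while walking the zspage */
	unsigned int free_idx[] = { 1, 4, 7 };
	unsigned long links[3];
	int i;

	for (i = 0; i < 2; i++)
		links[i] = (unsigned long)free_idx[i + 1] << OBJ_TAG_BITS_X;
	links[2] = -1UL << OBJ_TAG_BITS_X;    /* terminator */

	for (i = 0; i < 3; i++)
		printf("slot %u -> link %#lx\n", free_idx[i], links[i]);
	/* set_freeobj(zspage, 1) would then record the head of this chain */
	return 0;
}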
+
static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
{
int i, obj_idx, ret = 0;
@@ -2561,6 +2706,12 @@ next:
return 0;
}
+ /*
+ * Eviction fails on one of the handles, so we need to restore zspage.
+ * We need to rebuild its freelist (and free stored deferred handles),
+ * put it back to the correct size class, and add it to the LRU list.
+ */
+ restore_freelist(pool, class, zspage);
putback_zspage(class, zspage);
list_add(&zspage->lru, &pool->lru);
unlock_zspage(zspage);