summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig32
-rw-r--r--mm/Makefile3
-rw-r--r--mm/backing-dev.c6
-rw-r--r--mm/cma.c14
-rw-r--r--mm/compaction.c113
-rw-r--r--mm/damon/core-test.h84
-rw-r--r--mm/damon/core.c63
-rw-r--r--mm/damon/ops-common.c2
-rw-r--r--mm/damon/paddr.c2
-rw-r--r--mm/damon/sysfs-common.h2
-rw-r--r--mm/damon/sysfs-schemes.c107
-rw-r--r--mm/damon/sysfs.c26
-rw-r--r--mm/damon/vaddr.c25
-rw-r--r--mm/debug_vm_pgtable.c18
-rw-r--r--mm/filemap.c242
-rw-r--r--mm/folio-compat.c2
-rw-r--r--mm/frontswap.c283
-rw-r--r--mm/gup.c108
-rw-r--r--mm/hmm.c1
-rw-r--r--mm/huge_memory.c138
-rw-r--r--mm/hugetlb.c494
-rw-r--r--mm/hugetlb_vmemmap.c34
-rw-r--r--mm/init-mm.c2
-rw-r--r--mm/internal.h78
-rw-r--r--mm/ioremap.c41
-rw-r--r--mm/kfence/core.c123
-rw-r--r--mm/kfence/kfence.h5
-rw-r--r--mm/kfence/kfence_test.c7
-rw-r--r--mm/kfence/report.c3
-rw-r--r--mm/khugepaged.c511
-rw-r--r--mm/kmemleak.c15
-rw-r--r--mm/kmsan/hooks.c4
-rw-r--r--mm/kmsan/shadow.c8
-rw-r--r--mm/ksm.c71
-rw-r--r--mm/madvise.c24
-rw-r--r--mm/mapping_dirty_helpers.c11
-rw-r--r--mm/memblock.c5
-rw-r--r--mm/memcontrol.c138
-rw-r--r--mm/memfd.c58
-rw-r--r--mm/memory-failure.c176
-rw-r--r--mm/memory-tiers.c19
-rw-r--r--mm/memory.c365
-rw-r--r--mm/memory_hotplug.c192
-rw-r--r--mm/mempolicy.c44
-rw-r--r--mm/memtest.c22
-rw-r--r--mm/migrate.c9
-rw-r--r--mm/migrate_device.c31
-rw-r--r--mm/mincore.c1
-rw-r--r--mm/mlock.c13
-rw-r--r--mm/mm_init.c37
-rw-r--r--mm/mmap.c254
-rw-r--r--mm/mmu_gather.c1
-rw-r--r--mm/mmu_notifier.c50
-rw-r--r--mm/mprotect.c8
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/nommu.c55
-rw-r--r--mm/oom_kill.c3
-rw-r--r--mm/page-writeback.c49
-rw-r--r--mm/page_alloc.c150
-rw-r--r--mm/page_ext.c101
-rw-r--r--mm/page_io.c80
-rw-r--r--mm/page_isolation.c8
-rw-r--r--mm/page_owner.c2
-rw-r--r--mm/page_poison.c1
-rw-r--r--mm/page_table_check.c62
-rw-r--r--mm/page_vma_mapped.c12
-rw-r--r--mm/pagewalk.c41
-rw-r--r--mm/pgtable-generic.c97
-rw-r--r--mm/readahead.c13
-rw-r--r--mm/rmap.c145
-rw-r--r--mm/secretmem.c14
-rw-r--r--mm/shmem.c881
-rw-r--r--mm/shmem_quota.c350
-rw-r--r--mm/show_mem.c10
-rw-r--r--mm/slab.c2
-rw-r--r--mm/slab.h2
-rw-r--r--mm/slab_common.c54
-rw-r--r--mm/slub.c58
-rw-r--r--mm/sparse-vmemmap.c3
-rw-r--r--mm/sparse.c3
-rw-r--r--mm/swap.h1
-rw-r--r--mm/swap_state.c23
-rw-r--r--mm/swapfile.c77
-rw-r--r--mm/truncate.c12
-rw-r--r--mm/userfaultfd.c87
-rw-r--r--mm/util.c15
-rw-r--r--mm/vmalloc.c4
-rw-r--r--mm/vmpressure.c8
-rw-r--r--mm/vmscan.c58
-rw-r--r--mm/vmstat.c1
-rw-r--r--mm/workingset.c1
-rw-r--r--mm/z3fold.c27
-rw-r--r--mm/zsmalloc.c93
-rw-r--r--mm/zswap.c393
94 files changed, 4044 insertions, 3079 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 09130434e30d..264a2df5ecf5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -25,7 +25,6 @@ menuconfig SWAP
config ZSWAP
bool "Compressed cache for swap pages"
depends on SWAP
- select FRONTSWAP
select CRYPTO
select ZPOOL
help
@@ -337,6 +336,23 @@ config SLUB_CPU_PARTIAL
which requires the taking of locks that may cause latency spikes.
Typically one would choose no for a realtime system.
+config RANDOM_KMALLOC_CACHES
+ default n
+ depends on SLUB && !SLUB_TINY
+ bool "Randomize slab caches for normal kmalloc"
+ help
+ A hardening feature that creates multiple copies of slab caches for
+ normal kmalloc allocation and makes kmalloc randomly pick one based
+ on code address, which makes the attackers more difficult to spray
+ vulnerable memory objects on the heap for the purpose of exploiting
+ memory vulnerabilities.
+
+ Currently the number of copies is set to 16, a reasonably large value
+ that effectively diverges the memory objects allocated for different
+ subsystems or modules into different caches, at the expense of a
+ limited degree of memory and CPU overhead that relates to hardware and
+ system workload.
+
endmenu # SLAB allocator options
config SHUFFLE_PAGE_ALLOCATOR
@@ -487,7 +503,10 @@ config SPARSEMEM_VMEMMAP
# Select this config option from the architecture Kconfig, if it is preferred
# to enable the feature of HugeTLB/dev_dax vmemmap optimization.
#
-config ARCH_WANT_OPTIMIZE_VMEMMAP
+config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
+ bool
+
+config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
bool
config HAVE_MEMBLOCK_PHYS_MAP
@@ -569,6 +588,9 @@ config MHP_MEMMAP_ON_MEMORY
endif # MEMORY_HOTPLUG
+config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+ bool
+
# Heavily threaded applications may benefit from splitting the mm-wide
# page_table_lock, so that faults on different parts of the user address
# space can be handled with less contention: split it at this NR_CPUS.
@@ -870,9 +892,6 @@ config USE_PERCPU_NUMA_NODE_ID
config HAVE_SETUP_PER_CPU_AREA
bool
-config FRONTSWAP
- bool
-
config CMA
bool "Contiguous Memory Allocator"
depends on MMU
@@ -1144,6 +1163,9 @@ config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY
config IO_MAPPING
bool
+config MEMFD_CREATE
+ bool "Enable memfd_create() system call" if EXPERT
+
config SECRETMEM
default y
bool "Enable memfd_secret() system call" if EXPERT
diff --git a/mm/Makefile b/mm/Makefile
index 678530a07326..ec65984e2ade 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -51,7 +51,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o percpu.o slab_common.o \
- compaction.o show_mem.o\
+ compaction.o show_mem.o shmem_quota.o\
interval_tree.o list_lru.o workingset.o \
debug.o gup.o mmap_lock.o $(mmu-y)
@@ -72,7 +72,6 @@ ifdef CONFIG_MMU
endif
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o
-obj-$(CONFIG_FRONTSWAP) += frontswap.o
obj-$(CONFIG_ZSWAP) += zswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 3ffc3cfa7a14..1e3447bccdb1 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -16,6 +16,7 @@
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>
+#include "internal.h"
struct backing_dev_info noop_backing_dev_info;
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
@@ -34,8 +35,6 @@ LIST_HEAD(bdi_list);
/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;
-#define K(x) ((x) << (PAGE_SHIFT - 10))
-
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -733,9 +732,6 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
might_alloc(gfp);
- if (!memcg_css->parent)
- return &bdi->wb;
-
do {
wb = wb_get_lookup(bdi, memcg_css);
} while (!wb && !cgwb_create(bdi, memcg_css, gfp));
diff --git a/mm/cma.c b/mm/cma.c
index a4cfe995e11e..da2967c6a223 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -267,6 +267,9 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
if (alignment && !is_power_of_2(alignment))
return -EINVAL;
+ if (!IS_ENABLED(CONFIG_NUMA))
+ nid = NUMA_NO_NODE;
+
/* Sanitise input arguments. */
alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES);
if (fixed && base & (alignment - 1)) {
@@ -372,14 +375,15 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
if (ret)
goto free_mem;
- pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M,
- &base);
+ pr_info("Reserved %ld MiB at %pa on node %d\n", (unsigned long)size / SZ_1M,
+ &base, nid);
return 0;
free_mem:
memblock_phys_free(base, size);
err:
- pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
+ pr_err("Failed to reserve %ld MiB on node %d\n", (unsigned long)size / SZ_1M,
+ nid);
return ret;
}
@@ -436,8 +440,8 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
if (!cma || !cma->count || !cma->bitmap)
goto out;
- pr_debug("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma,
- count, align);
+ pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__,
+ (void *)cma, cma->name, count, align);
if (!count)
goto out;
diff --git a/mm/compaction.c b/mm/compaction.c
index dbc9f86b1934..38c8d216c6a3 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -249,11 +249,36 @@ static unsigned long skip_offline_sections(unsigned long start_pfn)
return 0;
}
+
+/*
+ * If the PFN falls into an offline section, return the end PFN of the
+ * next online section in reverse. If the PFN falls into an online section
+ * or if there is no next online section in reverse, return 0.
+ */
+static unsigned long skip_offline_sections_reverse(unsigned long start_pfn)
+{
+ unsigned long start_nr = pfn_to_section_nr(start_pfn);
+
+ if (!start_nr || online_section_nr(start_nr))
+ return 0;
+
+ while (start_nr-- > 0) {
+ if (online_section_nr(start_nr))
+ return section_nr_to_pfn(start_nr) + PAGES_PER_SECTION;
+ }
+
+ return 0;
+}
#else
static unsigned long skip_offline_sections(unsigned long start_pfn)
{
return 0;
}
+
+static unsigned long skip_offline_sections_reverse(unsigned long start_pfn)
+{
+ return 0;
+}
#endif
/*
@@ -438,12 +463,13 @@ static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
{
struct zone *zone = cc->zone;
- pfn = pageblock_end_pfn(pfn);
-
/* Set for isolation rather than compaction */
if (cc->no_set_skip_hint)
return;
+ pfn = pageblock_end_pfn(pfn);
+
+ /* Update where async and sync compaction should restart */
if (pfn > zone->compact_cached_migrate_pfn[0])
zone->compact_cached_migrate_pfn[0] = pfn;
if (cc->mode != MIGRATE_ASYNC &&
@@ -465,7 +491,6 @@ static void update_pageblock_skip(struct compact_control *cc,
set_pageblock_skip(page);
- /* Update where async and sync compaction should restart */
if (pfn < zone->compact_cached_free_pfn)
zone->compact_cached_free_pfn = pfn;
}
@@ -564,7 +589,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
bool strict)
{
int nr_scanned = 0, total_isolated = 0;
- struct page *cursor;
+ struct page *page;
unsigned long flags = 0;
bool locked = false;
unsigned long blockpfn = *start_pfn;
@@ -574,12 +599,11 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (strict)
stride = 1;
- cursor = pfn_to_page(blockpfn);
+ page = pfn_to_page(blockpfn);
/* Isolate free pages. */
- for (; blockpfn < end_pfn; blockpfn += stride, cursor += stride) {
+ for (; blockpfn < end_pfn; blockpfn += stride, page += stride) {
int isolated;
- struct page *page = cursor;
/*
* Periodically drop the lock (if held) regardless of its
@@ -604,7 +628,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (likely(order <= MAX_ORDER)) {
blockpfn += (1UL << order) - 1;
- cursor += (1UL << order) - 1;
+ page += (1UL << order) - 1;
nr_scanned += (1UL << order) - 1;
}
goto isolate_fail;
@@ -641,14 +665,12 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
}
/* Advance to the end of split page */
blockpfn += isolated - 1;
- cursor += isolated - 1;
+ page += isolated - 1;
continue;
isolate_fail:
if (strict)
break;
- else
- continue;
}
@@ -715,8 +737,6 @@ isolate_freepages_range(struct compact_control *cc,
/* Protect pfn from changing by isolate_freepages_block */
unsigned long isolate_start_pfn = pfn;
- block_end_pfn = min(block_end_pfn, end_pfn);
-
/*
* pfn could pass the block_end_pfn if isolated freepage
* is more than pageblock order. In this case, we adjust
@@ -725,9 +745,10 @@ isolate_freepages_range(struct compact_control *cc,
if (pfn >= block_end_pfn) {
block_start_pfn = pageblock_start_pfn(pfn);
block_end_pfn = pageblock_end_pfn(pfn);
- block_end_pfn = min(block_end_pfn, end_pfn);
}
+ block_end_pfn = min(block_end_pfn, end_pfn);
+
if (!pageblock_pfn_to_page(block_start_pfn,
block_end_pfn, cc->zone))
break;
@@ -912,11 +933,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/*
* Check if the pageblock has already been marked skipped.
- * Only the aligned PFN is checked as the caller isolates
+ * Only the first PFN is checked as the caller isolates
* COMPACT_CLUSTER_MAX at a time so the second call must
* not falsely conclude that the block should be skipped.
*/
- if (!valid_page && pageblock_aligned(low_pfn)) {
+ if (!valid_page && (pageblock_aligned(low_pfn) ||
+ low_pfn == cc->zone->zone_start_pfn)) {
if (!isolation_suitable(cc, page)) {
low_pfn = end_pfn;
folio = NULL;
@@ -1075,13 +1097,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
bool migrate_dirty;
/*
- * Only pages without mappings or that have a
- * ->migrate_folio callback are possible to migrate
- * without blocking. However, we can be racing with
- * truncation so it's necessary to lock the page
- * to stabilise the mapping as truncation holds
- * the page lock until after the page is removed
- * from the page cache.
+ * Only folios without mappings or that have
+ * a ->migrate_folio callback are possible to
+ * migrate without blocking. However, we may
+ * be racing with truncation, which can free
+ * the mapping. Truncation holds the folio lock
+ * until after the folio is removed from the page
+ * cache so holding it ourselves is sufficient.
*/
if (!folio_trylock(folio))
goto isolate_fail_put;
@@ -1119,6 +1141,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
skip_updated = true;
if (test_and_set_skip(cc, valid_page) &&
!cc->finish_pageblock) {
+ low_pfn = end_pfn;
goto isolate_abort;
}
}
@@ -1420,10 +1443,8 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn)
isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
/* Skip this pageblock in the future as it's full or nearly full */
- if (start_pfn == end_pfn)
+ if (start_pfn == end_pfn && !cc->no_set_skip_hint)
set_pageblock_skip(page);
-
- return;
}
/* Search orders in round-robin fashion */
@@ -1500,7 +1521,7 @@ static void fast_isolate_freepages(struct compact_control *cc)
spin_lock_irqsave(&cc->zone->lock, flags);
freelist = &area->free_list[MIGRATE_MOVABLE];
- list_for_each_entry_reverse(freepage, freelist, lru) {
+ list_for_each_entry_reverse(freepage, freelist, buddy_list) {
unsigned long pfn;
order_scanned++;
@@ -1529,7 +1550,7 @@ static void fast_isolate_freepages(struct compact_control *cc)
break;
}
- /* Use a minimum pfn if a preferred one was not found */
+ /* Use a maximum candidate pfn if a preferred one was not found */
if (!page && high_pfn) {
page = pfn_to_page(high_pfn);
@@ -1668,8 +1689,15 @@ static void isolate_freepages(struct compact_control *cc)
page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
zone);
- if (!page)
+ if (!page) {
+ unsigned long next_pfn;
+
+ next_pfn = skip_offline_sections_reverse(block_start_pfn);
+ if (next_pfn)
+ block_start_pfn = max(next_pfn, low_pfn);
+
continue;
+ }
/* Check the block is suitable for migration */
if (!suitable_migration_target(cc, page))
@@ -1685,7 +1713,8 @@ static void isolate_freepages(struct compact_control *cc)
/* Update the skip hint if the full pageblock was scanned */
if (isolate_start_pfn == block_end_pfn)
- update_pageblock_skip(cc, page, block_start_pfn);
+ update_pageblock_skip(cc, page, block_start_pfn -
+ pageblock_nr_pages);
/* Are enough freepages isolated? */
if (cc->nr_freepages >= cc->nr_migratepages) {
@@ -1883,7 +1912,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
spin_lock_irqsave(&cc->zone->lock, flags);
freelist = &area->free_list[MIGRATE_MOVABLE];
- list_for_each_entry(freepage, freelist, lru) {
+ list_for_each_entry(freepage, freelist, buddy_list) {
unsigned long free_pfn;
if (nr_scanned++ >= limit) {
@@ -1957,9 +1986,9 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
block_start_pfn = cc->zone->zone_start_pfn;
/*
- * fast_find_migrateblock marks a pageblock skipped so to avoid
- * the isolation_suitable check below, check whether the fast
- * search was successful.
+ * fast_find_migrateblock() has already ensured the pageblock is not
+ * set with a skipped flag, so to avoid the isolation_suitable check
+ * below again, check whether the fast search was successful.
*/
fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail;
@@ -2002,7 +2031,8 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
* before making it "skip" so other compaction instances do
* not scan the same block.
*/
- if (pageblock_aligned(low_pfn) &&
+ if ((pageblock_aligned(low_pfn) ||
+ low_pfn == cc->zone->zone_start_pfn) &&
!fast_find_block && !isolation_suitable(cc, page))
continue;
@@ -2112,7 +2142,7 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat)
return score;
}
-static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low)
+static unsigned int fragmentation_score_wmark(bool low)
{
unsigned int wmark_low;
@@ -2132,7 +2162,7 @@ static bool should_proactive_compact_node(pg_data_t *pgdat)
if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat))
return false;
- wmark_high = fragmentation_score_wmark(pgdat, false);
+ wmark_high = fragmentation_score_wmark(false);
return fragmentation_score_node(pgdat) > wmark_high;
}
@@ -2171,7 +2201,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
return COMPACT_PARTIAL_SKIPPED;
score = fragmentation_score_zone(cc->zone);
- wmark_low = fragmentation_score_wmark(pgdat, true);
+ wmark_low = fragmentation_score_wmark(true);
if (score > wmark_low)
ret = COMPACT_CONTINUE;
@@ -2478,7 +2508,8 @@ rescan:
goto check_drain;
case ISOLATE_SUCCESS:
update_cached = false;
- last_migrated_pfn = iteration_start_pfn;
+ last_migrated_pfn = max(cc->zone->zone_start_pfn,
+ pageblock_start_pfn(cc->migrate_pfn - 1));
}
err = migrate_pages(&cc->migratepages, compaction_alloc,
@@ -2501,7 +2532,7 @@ rescan:
}
/*
* If an ASYNC or SYNC_LIGHT fails to migrate a page
- * within the current order-aligned block and
+ * within the pageblock_order-aligned block and
* fast_find_migrateblock may be used then scan the
* remainder of the pageblock. This will mark the
* pageblock "skip" to avoid rescanning in the near
@@ -2867,7 +2898,7 @@ int compaction_register_node(struct node *node)
void compaction_unregister_node(struct node *node)
{
- return device_remove_file(&node->dev, &dev_attr_compact);
+ device_remove_file(&node->dev, &dev_attr_compact);
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */
diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h
index c11210124344..6cc8b245586d 100644
--- a/mm/damon/core-test.h
+++ b/mm/damon/core-test.h
@@ -320,25 +320,97 @@ static void damon_test_update_monitoring_result(struct kunit *test)
static void damon_test_set_attrs(struct kunit *test)
{
- struct damon_ctx ctx;
+ struct damon_ctx *c = damon_new_ctx();
struct damon_attrs valid_attrs = {
.min_nr_regions = 10, .max_nr_regions = 1000,
.sample_interval = 5000, .aggr_interval = 100000,};
struct damon_attrs invalid_attrs;
- KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &valid_attrs), 0);
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &valid_attrs), 0);
invalid_attrs = valid_attrs;
invalid_attrs.min_nr_regions = 1;
- KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL);
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL);
invalid_attrs = valid_attrs;
invalid_attrs.max_nr_regions = 9;
- KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL);
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL);
invalid_attrs = valid_attrs;
invalid_attrs.aggr_interval = 4999;
- KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL);
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL);
+}
+
+static void damos_test_new_filter(struct kunit *test)
+{
+ struct damos_filter *filter;
+
+ filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
+ KUNIT_EXPECT_EQ(test, filter->type, DAMOS_FILTER_TYPE_ANON);
+ KUNIT_EXPECT_EQ(test, filter->matching, true);
+ KUNIT_EXPECT_PTR_EQ(test, filter->list.prev, &filter->list);
+ KUNIT_EXPECT_PTR_EQ(test, filter->list.next, &filter->list);
+ damos_destroy_filter(filter);
+}
+
+static void damos_test_filter_out(struct kunit *test)
+{
+ struct damon_target *t;
+ struct damon_region *r, *r2;
+ struct damos_filter *f;
+
+ f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true);
+ f->addr_range = (struct damon_addr_range){
+ .start = DAMON_MIN_REGION * 2, .end = DAMON_MIN_REGION * 6};
+
+ t = damon_new_target();
+ r = damon_new_region(DAMON_MIN_REGION * 3, DAMON_MIN_REGION * 5);
+ damon_add_region(r, t);
+
+ /* region in the range */
+ KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
+
+ /* region before the range */
+ r->ar.start = DAMON_MIN_REGION * 1;
+ r->ar.end = DAMON_MIN_REGION * 2;
+ KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
+
+ /* region after the range */
+ r->ar.start = DAMON_MIN_REGION * 6;
+ r->ar.end = DAMON_MIN_REGION * 8;
+ KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
+
+ /* region started before the range */
+ r->ar.start = DAMON_MIN_REGION * 1;
+ r->ar.end = DAMON_MIN_REGION * 4;
+ KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f));
+ /* filter should have split the region */
+ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1);
+ KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2);
+ KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2);
+ r2 = damon_next_region(r);
+ KUNIT_EXPECT_EQ(test, r2->ar.start, DAMON_MIN_REGION * 2);
+ KUNIT_EXPECT_EQ(test, r2->ar.end, DAMON_MIN_REGION * 4);
+ damon_destroy_region(r2, t);
+
+ /* region started in the range */
+ r->ar.start = DAMON_MIN_REGION * 2;
+ r->ar.end = DAMON_MIN_REGION * 8;
+ KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f));
+ /* filter should have split the region */
+ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2);
+ KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6);
+ KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2);
+ r2 = damon_next_region(r);
+ KUNIT_EXPECT_EQ(test, r2->ar.start, DAMON_MIN_REGION * 6);
+ KUNIT_EXPECT_EQ(test, r2->ar.end, DAMON_MIN_REGION * 8);
+ damon_destroy_region(r2, t);
+
+ damon_free_target(t);
+ damos_free_filter(f);
}
static struct kunit_case damon_test_cases[] = {
@@ -353,6 +425,8 @@ static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_set_regions),
KUNIT_CASE(damon_test_update_monitoring_result),
KUNIT_CASE(damon_test_set_attrs),
+ KUNIT_CASE(damos_test_new_filter),
+ KUNIT_CASE(damos_test_filter_out),
{},
};
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 91cff7f2997e..bcd2bd9d6c10 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -273,6 +273,7 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type,
return NULL;
filter->type = type;
filter->matching = matching;
+ INIT_LIST_HEAD(&filter->list);
return filter;
}
@@ -877,6 +878,66 @@ static void damos_update_stat(struct damos *s,
s->stat.sz_applied += sz_applied;
}
+static bool __damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
+ struct damon_region *r, struct damos_filter *filter)
+{
+ bool matched = false;
+ struct damon_target *ti;
+ int target_idx = 0;
+ unsigned long start, end;
+
+ switch (filter->type) {
+ case DAMOS_FILTER_TYPE_TARGET:
+ damon_for_each_target(ti, ctx) {
+ if (ti == t)
+ break;
+ target_idx++;
+ }
+ matched = target_idx == filter->target_idx;
+ break;
+ case DAMOS_FILTER_TYPE_ADDR:
+ start = ALIGN_DOWN(filter->addr_range.start, DAMON_MIN_REGION);
+ end = ALIGN_DOWN(filter->addr_range.end, DAMON_MIN_REGION);
+
+ /* inside the range */
+ if (start <= r->ar.start && r->ar.end <= end) {
+ matched = true;
+ break;
+ }
+ /* outside of the range */
+ if (r->ar.end <= start || end <= r->ar.start) {
+ matched = false;
+ break;
+ }
+ /* start before the range and overlap */
+ if (r->ar.start < start) {
+ damon_split_region_at(t, r, start - r->ar.start);
+ matched = false;
+ break;
+ }
+ /* start inside the range */
+ damon_split_region_at(t, r, end - r->ar.start);
+ matched = true;
+ break;
+ default:
+ break;
+ }
+
+ return matched == filter->matching;
+}
+
+static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
+ struct damon_region *r, struct damos *s)
+{
+ struct damos_filter *filter;
+
+ damos_for_each_filter(filter, s) {
+ if (__damos_filter_out(ctx, t, r, filter))
+ return true;
+ }
+ return false;
+}
+
static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
struct damon_region *r, struct damos *s)
{
@@ -894,6 +955,8 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
goto update_stat;
damon_split_region_at(t, r, sz);
}
+ if (damos_filter_out(c, t, r, s))
+ return;
ktime_get_coarse_ts64(&begin);
if (c->callback.before_damos_apply)
err = c->callback.before_damos_apply(c, t, r, s);
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index e940802a15a4..ac1c3fa80f98 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -54,7 +54,7 @@ void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr
void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- struct folio *folio = damon_get_folio(pmd_pfn(*pmd));
+ struct folio *folio = damon_get_folio(pmd_pfn(pmdp_get(pmd)));
if (!folio)
return;
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 40801e38fcf0..909db25efb35 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -94,7 +94,7 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma,
mmu_notifier_test_young(vma->vm_mm, addr);
} else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- *accessed = pmd_young(*pvmw.pmd) ||
+ *accessed = pmd_young(pmdp_get(pvmw.pmd)) ||
!folio_test_idle(folio) ||
mmu_notifier_test_young(vma->vm_mm, addr);
#else
diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h
index db677eba78fd..fd482a0639b4 100644
--- a/mm/damon/sysfs-common.h
+++ b/mm/damon/sysfs-common.h
@@ -47,7 +47,7 @@ void damon_sysfs_schemes_update_stats(
int damon_sysfs_schemes_update_regions_start(
struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx);
+ struct damon_ctx *ctx, bool total_bytes_only);
int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx);
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 50cf89dcd898..527e7d17eb3b 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -117,6 +117,7 @@ struct damon_sysfs_scheme_regions {
struct kobject kobj;
struct list_head regions_list;
int nr_regions;
+ unsigned long total_bytes;
};
static struct damon_sysfs_scheme_regions *
@@ -128,9 +129,19 @@ damon_sysfs_scheme_regions_alloc(void)
regions->kobj = (struct kobject){};
INIT_LIST_HEAD(&regions->regions_list);
regions->nr_regions = 0;
+ regions->total_bytes = 0;
return regions;
}
+static ssize_t total_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_regions *regions = container_of(kobj,
+ struct damon_sysfs_scheme_regions, kobj);
+
+ return sysfs_emit(buf, "%lu\n", regions->total_bytes);
+}
+
static void damon_sysfs_scheme_regions_rm_dirs(
struct damon_sysfs_scheme_regions *regions)
{
@@ -148,7 +159,11 @@ static void damon_sysfs_scheme_regions_release(struct kobject *kobj)
kfree(container_of(kobj, struct damon_sysfs_scheme_regions, kobj));
}
+static struct kobj_attribute damon_sysfs_scheme_regions_total_bytes_attr =
+ __ATTR_RO_MODE(total_bytes, 0400);
+
static struct attribute *damon_sysfs_scheme_regions_attrs[] = {
+ &damon_sysfs_scheme_regions_total_bytes_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_scheme_regions);
@@ -267,6 +282,8 @@ struct damon_sysfs_scheme_filter {
enum damos_filter_type type;
bool matching;
char *memcg_path;
+ struct damon_addr_range addr_range;
+ int target_idx;
};
static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void)
@@ -278,6 +295,8 @@ static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void)
static const char * const damon_sysfs_scheme_filter_type_strs[] = {
"anon",
"memcg",
+ "addr",
+ "target",
};
static ssize_t type_show(struct kobject *kobj,
@@ -358,6 +377,63 @@ static ssize_t memcg_path_store(struct kobject *kobj,
return count;
}
+static ssize_t addr_start_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%lu\n", filter->addr_range.start);
+}
+
+static ssize_t addr_start_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ int err = kstrtoul(buf, 0, &filter->addr_range.start);
+
+ return err ? err : count;
+}
+
+static ssize_t addr_end_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%lu\n", filter->addr_range.end);
+}
+
+static ssize_t addr_end_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ int err = kstrtoul(buf, 0, &filter->addr_range.end);
+
+ return err ? err : count;
+}
+
+static ssize_t damon_target_idx_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%d\n", filter->target_idx);
+}
+
+static ssize_t damon_target_idx_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ int err = kstrtoint(buf, 0, &filter->target_idx);
+
+ return err ? err : count;
+}
+
static void damon_sysfs_scheme_filter_release(struct kobject *kobj)
{
struct damon_sysfs_scheme_filter *filter = container_of(kobj,
@@ -376,10 +452,22 @@ static struct kobj_attribute damon_sysfs_scheme_filter_matching_attr =
static struct kobj_attribute damon_sysfs_scheme_filter_memcg_path_attr =
__ATTR_RW_MODE(memcg_path, 0600);
+static struct kobj_attribute damon_sysfs_scheme_filter_addr_start_attr =
+ __ATTR_RW_MODE(addr_start, 0600);
+
+static struct kobj_attribute damon_sysfs_scheme_filter_addr_end_attr =
+ __ATTR_RW_MODE(addr_end, 0600);
+
+static struct kobj_attribute damon_sysfs_scheme_filter_damon_target_idx_attr =
+ __ATTR_RW_MODE(damon_target_idx, 0600);
+
static struct attribute *damon_sysfs_scheme_filter_attrs[] = {
&damon_sysfs_scheme_filter_type_attr.attr,
&damon_sysfs_scheme_filter_matching_attr.attr,
&damon_sysfs_scheme_filter_memcg_path_attr.attr,
+ &damon_sysfs_scheme_filter_addr_start_attr.attr,
+ &damon_sysfs_scheme_filter_addr_end_attr.attr,
+ &damon_sysfs_scheme_filter_damon_target_idx_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_scheme_filter);
@@ -1469,7 +1557,17 @@ static int damon_sysfs_set_scheme_filters(struct damos *scheme,
damos_destroy_filter(filter);
return err;
}
+ } else if (filter->type == DAMOS_FILTER_TYPE_ADDR) {
+ if (sysfs_filter->addr_range.end <
+ sysfs_filter->addr_range.start) {
+ damos_destroy_filter(filter);
+ return -EINVAL;
+ }
+ filter->addr_range = sysfs_filter->addr_range;
+ } else if (filter->type == DAMOS_FILTER_TYPE_TARGET) {
+ filter->target_idx = sysfs_filter->target_idx;
}
+
damos_add_filter(scheme, filter);
}
return 0;
@@ -1620,6 +1718,7 @@ void damon_sysfs_schemes_update_stats(
*/
static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback;
static int damon_sysfs_schemes_region_idx;
+static bool damos_regions_upd_total_bytes_only;
/*
* DAMON callback that called before damos apply. While this callback is
@@ -1648,6 +1747,10 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx,
return 0;
sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions;
+ sysfs_regions->total_bytes += r->ar.end - r->ar.start;
+ if (damos_regions_upd_total_bytes_only)
+ return 0;
+
region = damon_sysfs_scheme_region_alloc(r);
list_add_tail(&region->list, &sysfs_regions->regions_list);
sysfs_regions->nr_regions++;
@@ -1678,6 +1781,7 @@ int damon_sysfs_schemes_clear_regions(
sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++];
damon_sysfs_scheme_regions_rm_dirs(
sysfs_scheme->tried_regions);
+ sysfs_scheme->tried_regions->total_bytes = 0;
}
return 0;
}
@@ -1685,10 +1789,11 @@ int damon_sysfs_schemes_clear_regions(
/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */
int damon_sysfs_schemes_update_regions_start(
struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx)
+ struct damon_ctx *ctx, bool total_bytes_only)
{
damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx);
damon_sysfs_schemes_for_damos_callback = sysfs_schemes;
+ damos_regions_upd_total_bytes_only = total_bytes_only;
ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply;
return 0;
}
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 33e1d5c9cb54..b86ba7b0a921 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1000,6 +1000,11 @@ enum damon_sysfs_cmd {
*/
DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS,
/*
+ * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES: Update
+ * tried_regions/total_bytes sysfs files for each scheme.
+ */
+ DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES,
+ /*
* @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: Update schemes tried
* regions
*/
@@ -1021,6 +1026,7 @@ static const char * const damon_sysfs_cmd_strs[] = {
"off",
"commit",
"update_schemes_stats",
+ "update_schemes_tried_bytes",
"update_schemes_tried_regions",
"clear_schemes_tried_regions",
};
@@ -1206,12 +1212,14 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
{
struct damon_target *t, *next;
struct damon_sysfs_kdamond *kdamond;
+ enum damon_sysfs_cmd cmd;
/* damon_sysfs_schemes_update_regions_stop() might not yet called */
kdamond = damon_sysfs_cmd_request.kdamond;
- if (kdamond && damon_sysfs_cmd_request.cmd ==
- DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS &&
- ctx == kdamond->damon_ctx) {
+ cmd = damon_sysfs_cmd_request.cmd;
+ if (kdamond && ctx == kdamond->damon_ctx &&
+ (cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS ||
+ cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES)) {
damon_sysfs_schemes_update_regions_stop(ctx);
mutex_unlock(&damon_sysfs_lock);
}
@@ -1248,14 +1256,15 @@ static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond)
}
static int damon_sysfs_upd_schemes_regions_start(
- struct damon_sysfs_kdamond *kdamond)
+ struct damon_sysfs_kdamond *kdamond, bool total_bytes_only)
{
struct damon_ctx *ctx = kdamond->damon_ctx;
if (!ctx)
return -EINVAL;
return damon_sysfs_schemes_update_regions_start(
- kdamond->contexts->contexts_arr[0]->schemes, ctx);
+ kdamond->contexts->contexts_arr[0]->schemes, ctx,
+ total_bytes_only);
}
static int damon_sysfs_upd_schemes_regions_stop(
@@ -1332,6 +1341,7 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c)
{
struct damon_sysfs_kdamond *kdamond;
static bool damon_sysfs_schemes_regions_updating;
+ bool total_bytes_only = false;
int err = 0;
/* avoid deadlock due to concurrent state_store('off') */
@@ -1348,9 +1358,13 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c)
case DAMON_SYSFS_CMD_COMMIT:
err = damon_sysfs_commit_input(kdamond);
break;
+ case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES:
+ total_bytes_only = true;
+ fallthrough;
case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS:
if (!damon_sysfs_schemes_regions_updating) {
- err = damon_sysfs_upd_schemes_regions_start(kdamond);
+ err = damon_sysfs_upd_schemes_regions_start(kdamond,
+ total_bytes_only);
if (!err) {
damon_sysfs_schemes_regions_updating = true;
goto keep_lock_out;
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 2fcc9731528a..4c81a9dbd044 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -301,16 +301,19 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
pte_t *pte;
+ pmd_t pmde;
spinlock_t *ptl;
- if (pmd_trans_huge(*pmd)) {
+ if (pmd_trans_huge(pmdp_get(pmd))) {
ptl = pmd_lock(walk->mm, pmd);
- if (!pmd_present(*pmd)) {
+ pmde = pmdp_get(pmd);
+
+ if (!pmd_present(pmde)) {
spin_unlock(ptl);
return 0;
}
- if (pmd_trans_huge(*pmd)) {
+ if (pmd_trans_huge(pmde)) {
damon_pmdp_mkold(pmd, walk->vma, addr);
spin_unlock(ptl);
return 0;
@@ -386,6 +389,7 @@ out:
static const struct mm_walk_ops damon_mkold_ops = {
.pmd_entry = damon_mkold_pmd_entry,
.hugetlb_entry = damon_mkold_hugetlb_entry,
+ .walk_lock = PGWALK_RDLOCK,
};
static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
@@ -439,21 +443,25 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
struct damon_young_walk_private *priv = walk->private;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (pmd_trans_huge(*pmd)) {
+ if (pmd_trans_huge(pmdp_get(pmd))) {
+ pmd_t pmde;
+
ptl = pmd_lock(walk->mm, pmd);
- if (!pmd_present(*pmd)) {
+ pmde = pmdp_get(pmd);
+
+ if (!pmd_present(pmde)) {
spin_unlock(ptl);
return 0;
}
- if (!pmd_trans_huge(*pmd)) {
+ if (!pmd_trans_huge(pmde)) {
spin_unlock(ptl);
goto regular_page;
}
- folio = damon_get_folio(pmd_pfn(*pmd));
+ folio = damon_get_folio(pmd_pfn(pmde));
if (!folio)
goto huge_out;
- if (pmd_young(*pmd) || !folio_test_idle(folio) ||
+ if (pmd_young(pmde) || !folio_test_idle(folio) ||
mmu_notifier_test_young(walk->mm,
addr))
priv->young = true;
@@ -525,6 +533,7 @@ out:
static const struct mm_walk_ops damon_young_ops = {
.pmd_entry = damon_young_pmd_entry,
.hugetlb_entry = damon_young_hugetlb_entry,
+ .walk_lock = PGWALK_RDLOCK,
};
static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index b457ca17cef7..48e329ea5ba3 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -302,7 +302,7 @@ static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx)
unsigned long val = idx, *ptr = &val;
pud_t pud;
- if (!has_transparent_hugepage())
+ if (!has_transparent_pud_hugepage())
return;
pr_debug("Validating PUD basic (%pGv)\n", ptr);
@@ -343,7 +343,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
unsigned long vaddr = args->vaddr;
pud_t pud;
- if (!has_transparent_hugepage())
+ if (!has_transparent_pud_hugepage())
return;
page = (args->pud_pfn != ULONG_MAX) ? pfn_to_page(args->pud_pfn) : NULL;
@@ -385,7 +385,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
#ifndef __PAGETABLE_PMD_FOLDED
- pudp_huge_get_and_clear_full(args->mm, vaddr, args->pudp, 1);
+ pudp_huge_get_and_clear_full(args->vma, vaddr, args->pudp, 1);
pud = READ_ONCE(*args->pudp);
WARN_ON(!pud_none(pud));
#endif /* __PAGETABLE_PMD_FOLDED */
@@ -405,7 +405,7 @@ static void __init pud_leaf_tests(struct pgtable_debug_args *args)
{
pud_t pud;
- if (!has_transparent_hugepage())
+ if (!has_transparent_pud_hugepage())
return;
pr_debug("Validating PUD leaf\n");
@@ -732,7 +732,7 @@ static void __init pud_devmap_tests(struct pgtable_debug_args *args)
{
pud_t pud;
- if (!has_transparent_hugepage())
+ if (!has_transparent_pud_hugepage())
return;
pr_debug("Validating PUD devmap\n");
@@ -981,7 +981,7 @@ static void __init pud_thp_tests(struct pgtable_debug_args *args)
{
pud_t pud;
- if (!has_transparent_hugepage())
+ if (!has_transparent_pud_hugepage())
return;
pr_debug("Validating PUD based THP\n");
@@ -1022,8 +1022,7 @@ static void __init destroy_args(struct pgtable_debug_args *args)
/* Free (huge) page */
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
- has_transparent_hugepage() &&
+ has_transparent_pud_hugepage() &&
args->pud_pfn != ULONG_MAX) {
if (args->is_contiguous_page) {
free_contig_range(args->pud_pfn,
@@ -1274,8 +1273,7 @@ static int __init init_args(struct pgtable_debug_args *args)
* if we fail to allocate (huge) pages.
*/
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
- has_transparent_hugepage()) {
+ has_transparent_pud_hugepage()) {
page = debug_vm_pgtable_alloc_huge_page(args,
HPAGE_PUD_SHIFT - PAGE_SHIFT);
if (page) {
diff --git a/mm/filemap.c b/mm/filemap.c
index 9e44a49bbd74..bf6219d9aaac 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1669,46 +1669,47 @@ static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
/*
* Return values:
- * true - folio is locked; mmap_lock is still held.
- * false - folio is not locked.
- * mmap_lock has been released (mmap_read_unlock(), unless flags had both
- * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
- * which case mmap_lock is still held.
- *
- * If neither ALLOW_RETRY nor KILLABLE are set, will always return true
- * with the folio locked and the mmap_lock unperturbed.
+ * 0 - folio is locked.
+ * non-zero - folio is not locked.
+ * mmap_lock or per-VMA lock has been released (mmap_read_unlock() or
+ * vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and
+ * FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held.
+ *
+ * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
+ * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed.
*/
-bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm,
- unsigned int flags)
+vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
{
+ unsigned int flags = vmf->flags;
+
if (fault_flag_allow_retry_first(flags)) {
/*
- * CAUTION! In this case, mmap_lock is not released
- * even though return 0.
+ * CAUTION! In this case, mmap_lock/per-VMA lock is not
+ * released even though returning VM_FAULT_RETRY.
*/
if (flags & FAULT_FLAG_RETRY_NOWAIT)
- return false;
+ return VM_FAULT_RETRY;
- mmap_read_unlock(mm);
+ release_fault_lock(vmf);
if (flags & FAULT_FLAG_KILLABLE)
folio_wait_locked_killable(folio);
else
folio_wait_locked(folio);
- return false;
+ return VM_FAULT_RETRY;
}
if (flags & FAULT_FLAG_KILLABLE) {
bool ret;
ret = __folio_lock_killable(folio);
if (ret) {
- mmap_read_unlock(mm);
- return false;
+ release_fault_lock(vmf);
+ return VM_FAULT_RETRY;
}
} else {
__folio_lock(folio);
}
- return true;
+ return 0;
}
/**
@@ -1855,30 +1856,15 @@ out:
*
* Looks up the page cache entry at @mapping & @index.
*
- * @fgp_flags can be zero or more of these flags:
- *
- * * %FGP_ACCESSED - The folio will be marked accessed.
- * * %FGP_LOCK - The folio is returned locked.
- * * %FGP_CREAT - If no page is present then a new page is allocated using
- * @gfp and added to the page cache and the VM's LRU list.
- * The page is returned locked and with an increased refcount.
- * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
- * page is already in cache. If the page was allocated, unlock it before
- * returning so the caller can do the same dance.
- * * %FGP_WRITE - The page will be written to by the caller.
- * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
- * * %FGP_NOWAIT - Don't get blocked by page lock.
- * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
- *
* If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
* if the %GFP flags specified for %FGP_CREAT are atomic.
*
- * If there is a page cache page, it is returned with an increased refcount.
+ * If this function returns a folio, it is returned with an increased refcount.
*
* Return: The found folio or an ERR_PTR() otherwise.
*/
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp)
+ fgf_t fgp_flags, gfp_t gfp)
{
struct folio *folio;
@@ -1920,7 +1906,9 @@ repeat:
folio_wait_stable(folio);
no_page:
if (!folio && (fgp_flags & FGP_CREAT)) {
+ unsigned order = FGF_GET_ORDER(fgp_flags);
int err;
+
if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
gfp |= __GFP_WRITE;
if (fgp_flags & FGP_NOFS)
@@ -1929,26 +1917,44 @@ no_page:
gfp &= ~GFP_KERNEL;
gfp |= GFP_NOWAIT | __GFP_NOWARN;
}
-
- folio = filemap_alloc_folio(gfp, 0);
- if (!folio)
- return ERR_PTR(-ENOMEM);
-
if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
- /* Init accessed so avoid atomic mark_page_accessed later */
- if (fgp_flags & FGP_ACCESSED)
- __folio_set_referenced(folio);
+ if (!mapping_large_folio_support(mapping))
+ order = 0;
+ if (order > MAX_PAGECACHE_ORDER)
+ order = MAX_PAGECACHE_ORDER;
+ /* If we're not aligned, allocate a smaller folio */
+ if (index & ((1UL << order) - 1))
+ order = __ffs(index);
- err = filemap_add_folio(mapping, folio, index, gfp);
- if (unlikely(err)) {
+ do {
+ gfp_t alloc_gfp = gfp;
+
+ err = -ENOMEM;
+ if (order == 1)
+ order = 0;
+ if (order > 0)
+ alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
+ folio = filemap_alloc_folio(alloc_gfp, order);
+ if (!folio)
+ continue;
+
+ /* Init accessed so avoid atomic mark_page_accessed later */
+ if (fgp_flags & FGP_ACCESSED)
+ __folio_set_referenced(folio);
+
+ err = filemap_add_folio(mapping, folio, index, gfp);
+ if (!err)
+ break;
folio_put(folio);
folio = NULL;
- if (err == -EEXIST)
- goto repeat;
- }
+ } while (order-- > 0);
+ if (err == -EEXIST)
+ goto repeat;
+ if (err)
+ return ERR_PTR(err);
/*
* filemap_add_folio locks the page, and for mmap
* we expect an unlocked page.
@@ -2075,7 +2081,7 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
if (!xa_is_value(folio)) {
if (folio->index < *start)
goto put;
- if (folio->index + folio_nr_pages(folio) - 1 > end)
+ if (folio_next_index(folio) - 1 > end)
goto put;
if (!folio_trylock(folio))
goto put;
@@ -2167,16 +2173,6 @@ out:
}
EXPORT_SYMBOL(filemap_get_folios);
-static inline
-bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max)
-{
- if (!folio_test_large(folio) || folio_test_hugetlb(folio))
- return false;
- if (index >= max)
- return false;
- return index < folio->index + folio_nr_pages(folio) - 1;
-}
-
/**
* filemap_get_folios_contig - Get a batch of contiguous folios
* @mapping: The address_space to search
@@ -2242,7 +2238,7 @@ update_start:
if (folio_test_hugetlb(folio))
*start = folio->index + 1;
else
- *start = folio->index + folio_nr_pages(folio);
+ *start = folio_next_index(folio);
}
out:
rcu_read_unlock();
@@ -2359,7 +2355,7 @@ static void filemap_get_read_batch(struct address_space *mapping,
break;
if (folio_test_readahead(folio))
break;
- xas_advance(&xas, folio->index + folio_nr_pages(folio) - 1);
+ xas_advance(&xas, folio_next_index(folio) - 1);
continue;
put_folio:
folio_put(folio);
@@ -2632,6 +2628,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
int i, error = 0;
bool writably_mapped;
loff_t isize, end_offset;
+ loff_t last_pos = ra->prev_pos;
if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
return 0;
@@ -2682,8 +2679,8 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
* When a read accesses the same folio several times, only
* mark it as accessed the first time.
*/
- if (!pos_same_folio(iocb->ki_pos, ra->prev_pos - 1,
- fbatch.folios[0]))
+ if (!pos_same_folio(iocb->ki_pos, last_pos - 1,
+ fbatch.folios[0]))
folio_mark_accessed(fbatch.folios[0]);
for (i = 0; i < folio_batch_count(&fbatch); i++) {
@@ -2710,7 +2707,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
already_read += copied;
iocb->ki_pos += copied;
- ra->prev_pos = iocb->ki_pos;
+ last_pos = iocb->ki_pos;
if (copied < bytes) {
error = -EFAULT;
@@ -2724,7 +2721,7 @@ put_folios:
} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
file_accessed(filp);
-
+ ra->prev_pos = last_pos;
return already_read ? already_read : error;
}
EXPORT_SYMBOL_GPL(filemap_read);
@@ -3434,10 +3431,10 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
return false;
}
-static struct folio *next_uptodate_page(struct folio *folio,
- struct address_space *mapping,
- struct xa_state *xas, pgoff_t end_pgoff)
+static struct folio *next_uptodate_folio(struct xa_state *xas,
+ struct address_space *mapping, pgoff_t end_pgoff)
{
+ struct folio *folio = xas_next_entry(xas, end_pgoff);
unsigned long max_idx;
do {
@@ -3475,20 +3472,65 @@ skip:
return NULL;
}
-static inline struct folio *first_map_page(struct address_space *mapping,
- struct xa_state *xas,
- pgoff_t end_pgoff)
+/*
+ * Map page range [start_page, start_page + nr_pages) of folio.
+ * start_page is gotten from start by folio_page(folio, start)
+ */
+static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
+ struct folio *folio, unsigned long start,
+ unsigned long addr, unsigned int nr_pages)
{
- return next_uptodate_page(xas_find(xas, end_pgoff),
- mapping, xas, end_pgoff);
-}
+ vm_fault_t ret = 0;
+ struct vm_area_struct *vma = vmf->vma;
+ struct file *file = vma->vm_file;
+ struct page *page = folio_page(folio, start);
+ unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
+ unsigned int count = 0;
+ pte_t *old_ptep = vmf->pte;
-static inline struct folio *next_map_page(struct address_space *mapping,
- struct xa_state *xas,
- pgoff_t end_pgoff)
-{
- return next_uptodate_page(xas_next_entry(xas, end_pgoff),
- mapping, xas, end_pgoff);
+ do {
+ if (PageHWPoison(page + count))
+ goto skip;
+
+ if (mmap_miss > 0)
+ mmap_miss--;
+
+ /*
+ * NOTE: If there're PTE markers, we'll leave them to be
+ * handled in the specific fault path, and it'll prohibit the
+ * fault-around logic.
+ */
+ if (!pte_none(vmf->pte[count]))
+ goto skip;
+
+ count++;
+ continue;
+skip:
+ if (count) {
+ set_pte_range(vmf, folio, page, count, addr);
+ folio_ref_add(folio, count);
+ if (in_range(vmf->address, addr, count))
+ ret = VM_FAULT_NOPAGE;
+ }
+
+ count++;
+ page += count;
+ vmf->pte += count;
+ addr += count * PAGE_SIZE;
+ count = 0;
+ } while (--nr_pages > 0);
+
+ if (count) {
+ set_pte_range(vmf, folio, page, count, addr);
+ folio_ref_add(folio, count);
+ if (in_range(vmf->address, addr, count))
+ ret = VM_FAULT_NOPAGE;
+ }
+
+ vmf->pte = old_ptep;
+ WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
+
+ return ret;
}
vm_fault_t filemap_map_pages(struct vm_fault *vmf,
@@ -3501,12 +3543,11 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
unsigned long addr;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
struct folio *folio;
- struct page *page;
- unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
vm_fault_t ret = 0;
+ int nr_pages = 0;
rcu_read_lock();
- folio = first_map_page(mapping, &xas, end_pgoff);
+ folio = next_uptodate_folio(&xas, mapping, end_pgoff);
if (!folio)
goto out;
@@ -3523,17 +3564,13 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
goto out;
}
do {
-again:
- page = folio_file_page(folio, xas.xa_index);
- if (PageHWPoison(page))
- goto unlock;
-
- if (mmap_miss > 0)
- mmap_miss--;
+ unsigned long end;
addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
vmf->pte += xas.xa_index - last_pgoff;
last_pgoff = xas.xa_index;
+ end = folio->index + folio_nr_pages(folio) - 1;
+ nr_pages = min(end, end_pgoff) - xas.xa_index + 1;
/*
* NOTE: If there're PTE markers, we'll leave them to be
@@ -3543,32 +3580,17 @@ again:
if (!pte_none(ptep_get(vmf->pte)))
goto unlock;
- /* We're about to handle the fault */
- if (vmf->address == addr)
- ret = VM_FAULT_NOPAGE;
+ ret |= filemap_map_folio_range(vmf, folio,
+ xas.xa_index - folio->index, addr, nr_pages);
- do_set_pte(vmf, page, addr);
- /* no need to invalidate: a not-present page won't be cached */
- update_mmu_cache(vma, addr, vmf->pte);
- if (folio_more_pages(folio, xas.xa_index, end_pgoff)) {
- xas.xa_index++;
- folio_ref_inc(folio);
- goto again;
- }
- folio_unlock(folio);
- continue;
unlock:
- if (folio_more_pages(folio, xas.xa_index, end_pgoff)) {
- xas.xa_index++;
- goto again;
- }
folio_unlock(folio);
folio_put(folio);
- } while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL);
+ folio = next_uptodate_folio(&xas, mapping, end_pgoff);
+ } while (folio);
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
rcu_read_unlock();
- WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
return ret;
}
EXPORT_SYMBOL(filemap_map_pages);
@@ -4072,6 +4094,8 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp)
struct address_space * const mapping = folio->mapping;
BUG_ON(!folio_test_locked(folio));
+ if (!folio_needs_release(folio))
+ return true;
if (folio_test_writeback(folio))
return false;
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index c6f056c20503..10c3247542cb 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -92,7 +92,7 @@ EXPORT_SYMBOL(add_to_page_cache_lru);
noinline
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp)
+ fgf_t fgp_flags, gfp_t gfp)
{
struct folio *folio;
diff --git a/mm/frontswap.c b/mm/frontswap.c
deleted file mode 100644
index 2fb5df3384b8..000000000000
--- a/mm/frontswap.c
+++ /dev/null
@@ -1,283 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Frontswap frontend
- *
- * This code provides the generic "frontend" layer to call a matching
- * "backend" driver implementation of frontswap. See
- * Documentation/mm/frontswap.rst for more information.
- *
- * Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
- * Author: Dan Magenheimer
- */
-
-#include <linux/mman.h>
-#include <linux/swap.h>
-#include <linux/swapops.h>
-#include <linux/security.h>
-#include <linux/module.h>
-#include <linux/debugfs.h>
-#include <linux/frontswap.h>
-#include <linux/swapfile.h>
-
-DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key);
-
-/*
- * frontswap_ops are added by frontswap_register_ops, and provide the
- * frontswap "backend" implementation functions. Multiple implementations
- * may be registered, but implementations can never deregister. This
- * is a simple singly-linked list of all registered implementations.
- */
-static const struct frontswap_ops *frontswap_ops __read_mostly;
-
-#ifdef CONFIG_DEBUG_FS
-/*
- * Counters available via /sys/kernel/debug/frontswap (if debugfs is
- * properly configured). These are for information only so are not protected
- * against increment races.
- */
-static u64 frontswap_loads;
-static u64 frontswap_succ_stores;
-static u64 frontswap_failed_stores;
-static u64 frontswap_invalidates;
-
-static inline void inc_frontswap_loads(void)
-{
- data_race(frontswap_loads++);
-}
-static inline void inc_frontswap_succ_stores(void)
-{
- data_race(frontswap_succ_stores++);
-}
-static inline void inc_frontswap_failed_stores(void)
-{
- data_race(frontswap_failed_stores++);
-}
-static inline void inc_frontswap_invalidates(void)
-{
- data_race(frontswap_invalidates++);
-}
-#else
-static inline void inc_frontswap_loads(void) { }
-static inline void inc_frontswap_succ_stores(void) { }
-static inline void inc_frontswap_failed_stores(void) { }
-static inline void inc_frontswap_invalidates(void) { }
-#endif
-
-/*
- * Due to the asynchronous nature of the backends loading potentially
- * _after_ the swap system has been activated, we have chokepoints
- * on all frontswap functions to not call the backend until the backend
- * has registered.
- *
- * This would not guards us against the user deciding to call swapoff right as
- * we are calling the backend to initialize (so swapon is in action).
- * Fortunately for us, the swapon_mutex has been taken by the callee so we are
- * OK. The other scenario where calls to frontswap_store (called via
- * swap_writepage) is racing with frontswap_invalidate_area (called via
- * swapoff) is again guarded by the swap subsystem.
- *
- * While no backend is registered all calls to frontswap_[store|load|
- * invalidate_area|invalidate_page] are ignored or fail.
- *
- * The time between the backend being registered and the swap file system
- * calling the backend (via the frontswap_* functions) is indeterminate as
- * frontswap_ops is not atomic_t (or a value guarded by a spinlock).
- * That is OK as we are comfortable missing some of these calls to the newly
- * registered backend.
- *
- * Obviously the opposite (unloading the backend) must be done after all
- * the frontswap_[store|load|invalidate_area|invalidate_page] start
- * ignoring or failing the requests. However, there is currently no way
- * to unload a backend once it is registered.
- */
-
-/*
- * Register operations for frontswap
- */
-int frontswap_register_ops(const struct frontswap_ops *ops)
-{
- if (frontswap_ops)
- return -EINVAL;
-
- frontswap_ops = ops;
- static_branch_inc(&frontswap_enabled_key);
- return 0;
-}
-
-/*
- * Called when a swap device is swapon'd.
- */
-void frontswap_init(unsigned type, unsigned long *map)
-{
- struct swap_info_struct *sis = swap_info[type];
-
- VM_BUG_ON(sis == NULL);
-
- /*
- * p->frontswap is a bitmap that we MUST have to figure out which page
- * has gone in frontswap. Without it there is no point of continuing.
- */
- if (WARN_ON(!map))
- return;
- /*
- * Irregardless of whether the frontswap backend has been loaded
- * before this function or it will be later, we _MUST_ have the
- * p->frontswap set to something valid to work properly.
- */
- frontswap_map_set(sis, map);
-
- if (!frontswap_enabled())
- return;
- frontswap_ops->init(type);
-}
-
-static bool __frontswap_test(struct swap_info_struct *sis,
- pgoff_t offset)
-{
- if (sis->frontswap_map)
- return test_bit(offset, sis->frontswap_map);
- return false;
-}
-
-static inline void __frontswap_set(struct swap_info_struct *sis,
- pgoff_t offset)
-{
- set_bit(offset, sis->frontswap_map);
- atomic_inc(&sis->frontswap_pages);
-}
-
-static inline void __frontswap_clear(struct swap_info_struct *sis,
- pgoff_t offset)
-{
- clear_bit(offset, sis->frontswap_map);
- atomic_dec(&sis->frontswap_pages);
-}
-
-/*
- * "Store" data from a page to frontswap and associate it with the page's
- * swaptype and offset. Page must be locked and in the swap cache.
- * If frontswap already contains a page with matching swaptype and
- * offset, the frontswap implementation may either overwrite the data and
- * return success or invalidate the page from frontswap and return failure.
- */
-int __frontswap_store(struct page *page)
-{
- int ret = -1;
- swp_entry_t entry = { .val = page_private(page), };
- int type = swp_type(entry);
- struct swap_info_struct *sis = swap_info[type];
- pgoff_t offset = swp_offset(entry);
-
- VM_BUG_ON(!frontswap_ops);
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(sis == NULL);
-
- /*
- * If a dup, we must remove the old page first; we can't leave the
- * old page no matter if the store of the new page succeeds or fails,
- * and we can't rely on the new page replacing the old page as we may
- * not store to the same implementation that contains the old page.
- */
- if (__frontswap_test(sis, offset)) {
- __frontswap_clear(sis, offset);
- frontswap_ops->invalidate_page(type, offset);
- }
-
- ret = frontswap_ops->store(type, offset, page);
- if (ret == 0) {
- __frontswap_set(sis, offset);
- inc_frontswap_succ_stores();
- } else {
- inc_frontswap_failed_stores();
- }
-
- return ret;
-}
-
-/*
- * "Get" data from frontswap associated with swaptype and offset that were
- * specified when the data was put to frontswap and use it to fill the
- * specified page with data. Page must be locked and in the swap cache.
- */
-int __frontswap_load(struct page *page)
-{
- int ret = -1;
- swp_entry_t entry = { .val = page_private(page), };
- int type = swp_type(entry);
- struct swap_info_struct *sis = swap_info[type];
- pgoff_t offset = swp_offset(entry);
- bool exclusive = false;
-
- VM_BUG_ON(!frontswap_ops);
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(sis == NULL);
-
- if (!__frontswap_test(sis, offset))
- return -1;
-
- /* Try loading from each implementation, until one succeeds. */
- ret = frontswap_ops->load(type, offset, page, &exclusive);
- if (ret == 0) {
- inc_frontswap_loads();
- if (exclusive) {
- SetPageDirty(page);
- __frontswap_clear(sis, offset);
- }
- }
- return ret;
-}
-
-/*
- * Invalidate any data from frontswap associated with the specified swaptype
- * and offset so that a subsequent "get" will fail.
- */
-void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
-{
- struct swap_info_struct *sis = swap_info[type];
-
- VM_BUG_ON(!frontswap_ops);
- VM_BUG_ON(sis == NULL);
-
- if (!__frontswap_test(sis, offset))
- return;
-
- frontswap_ops->invalidate_page(type, offset);
- __frontswap_clear(sis, offset);
- inc_frontswap_invalidates();
-}
-
-/*
- * Invalidate all data from frontswap associated with all offsets for the
- * specified swaptype.
- */
-void __frontswap_invalidate_area(unsigned type)
-{
- struct swap_info_struct *sis = swap_info[type];
-
- VM_BUG_ON(!frontswap_ops);
- VM_BUG_ON(sis == NULL);
-
- if (sis->frontswap_map == NULL)
- return;
-
- frontswap_ops->invalidate_area(type);
- atomic_set(&sis->frontswap_pages, 0);
- bitmap_zero(sis->frontswap_map, sis->max);
-}
-
-static int __init init_frontswap(void)
-{
-#ifdef CONFIG_DEBUG_FS
- struct dentry *root = debugfs_create_dir("frontswap", NULL);
- if (root == NULL)
- return -ENXIO;
- debugfs_create_u64("loads", 0444, root, &frontswap_loads);
- debugfs_create_u64("succ_stores", 0444, root, &frontswap_succ_stores);
- debugfs_create_u64("failed_stores", 0444, root,
- &frontswap_failed_stores);
- debugfs_create_u64("invalidates", 0444, root, &frontswap_invalidates);
-#endif
- return 0;
-}
-
-module_init(init_frontswap);
diff --git a/mm/gup.c b/mm/gup.c
index 44c2658cc128..2f8a2d89fde1 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -597,7 +597,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
pte = ptep_get(ptep);
if (!pte_present(pte))
goto no_page;
- if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
+ if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags))
goto no_page;
page = vm_normal_page(vma, address, pte);
@@ -714,7 +714,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
if (likely(!pmd_trans_huge(pmdval)))
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
- if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags))
+ if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags))
return no_page_table(vma, flags);
ptl = pmd_lock(mm, pmd);
@@ -811,7 +811,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
struct follow_page_context *ctx)
{
pgd_t *pgd;
- struct page *page;
struct mm_struct *mm = vma->vm_mm;
ctx->page_mask = 0;
@@ -820,16 +819,10 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
* Call hugetlb_follow_page_mask for hugetlb vmas as it will use
* special hugetlb page table walking code. This eliminates the
* need to check for hugetlb entries in the general walking code.
- *
- * hugetlb_follow_page_mask is only for follow_page() handling here.
- * Ordinary GUP uses follow_hugetlb_page for hugetlb processing.
*/
- if (is_vm_hugetlb_page(vma)) {
- page = hugetlb_follow_page_mask(vma, address, flags);
- if (!page)
- page = no_page_table(vma, flags);
- return page;
- }
+ if (is_vm_hugetlb_page(vma))
+ return hugetlb_follow_page_mask(vma, address, flags,
+ &ctx->page_mask);
pgd = pgd_offset(mm, address);
@@ -851,6 +844,10 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
if (WARN_ON_ONCE(foll_flags & FOLL_PIN))
return NULL;
+ /*
+ * We never set FOLL_HONOR_NUMA_FAULT because callers don't expect
+ * to fail on PROT_NONE-mapped pages.
+ */
page = follow_page_mask(vma, address, foll_flags, &ctx);
if (ctx.pgmap)
put_dev_pagemap(ctx.pgmap);
@@ -1211,7 +1208,7 @@ static long __get_user_pages(struct mm_struct *mm,
if (!vma && in_gate_area(mm, start)) {
ret = get_gate_page(mm, start & PAGE_MASK,
gup_flags, &vma,
- pages ? &pages[i] : NULL);
+ pages ? &page : NULL);
if (ret)
goto out;
ctx.page_mask = 0;
@@ -1225,22 +1222,6 @@ static long __get_user_pages(struct mm_struct *mm,
ret = check_vma_flags(vma, gup_flags);
if (ret)
goto out;
-
- if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages,
- &start, &nr_pages, i,
- gup_flags, locked);
- if (!*locked) {
- /*
- * We've got a VM_FAULT_RETRY
- * and we've lost mmap_lock.
- * We must stop here.
- */
- BUG_ON(gup_flags & FOLL_NOWAIT);
- goto out;
- }
- continue;
- }
}
retry:
/*
@@ -1281,22 +1262,58 @@ retry:
ret = PTR_ERR(page);
goto out;
}
-
- goto next_page;
} else if (IS_ERR(page)) {
ret = PTR_ERR(page);
goto out;
}
- if (pages) {
- pages[i] = page;
- flush_anon_page(vma, page, start);
- flush_dcache_page(page);
- ctx.page_mask = 0;
- }
next_page:
page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
if (page_increm > nr_pages)
page_increm = nr_pages;
+
+ if (pages) {
+ struct page *subpage;
+ unsigned int j;
+
+ /*
+ * This must be a large folio (and doesn't need to
+ * be the whole folio; it can be part of it), do
+ * the refcount work for all the subpages too.
+ *
+ * NOTE: here the page may not be the head page
+ * e.g. when start addr is not thp-size aligned.
+ * try_grab_folio() should have taken care of tail
+ * pages.
+ */
+ if (page_increm > 1) {
+ struct folio *folio;
+
+ /*
+ * Since we already hold refcount on the
+ * large folio, this should never fail.
+ */
+ folio = try_grab_folio(page, page_increm - 1,
+ foll_flags);
+ if (WARN_ON_ONCE(!folio)) {
+ /*
+ * Release the 1st page ref if the
+ * folio is problematic, fail hard.
+ */
+ gup_put_folio(page_folio(page), 1,
+ foll_flags);
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+
+ for (j = 0; j < page_increm; j++) {
+ subpage = nth_page(page, j);
+ pages[i + j] = subpage;
+ flush_anon_page(vma, subpage, start + j * PAGE_SIZE);
+ flush_dcache_page(subpage);
+ }
+ }
+
i += page_increm;
start += page_increm * PAGE_SIZE;
nr_pages -= page_increm;
@@ -2551,7 +2568,14 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
struct page *page;
struct folio *folio;
- if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
+ /*
+ * Always fallback to ordinary GUP on PROT_NONE-mapped pages:
+ * pte_access_permitted() better should reject these pages
+ * either way: otherwise, GUP-fast might succeed in
+ * cases where ordinary GUP would fail due to VMA access
+ * permissions.
+ */
+ if (pte_protnone(pte))
goto pte_unmap;
if (!pte_access_permitted(pte, flags & FOLL_WRITE))
@@ -2576,7 +2600,7 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
if (!folio)
goto pte_unmap;
- if (unlikely(page_is_secretmem(page))) {
+ if (unlikely(folio_is_secretmem(folio))) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
@@ -2970,8 +2994,8 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
pmd_devmap(pmd))) {
- if (pmd_protnone(pmd) &&
- !gup_can_follow_protnone(flags))
+ /* See gup_pte_range() */
+ if (pmd_protnone(pmd))
return 0;
if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
@@ -3151,7 +3175,7 @@ static int internal_get_user_pages_fast(unsigned long start,
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
FOLL_FORCE | FOLL_PIN | FOLL_GET |
FOLL_FAST_ONLY | FOLL_NOFAULT |
- FOLL_PCI_P2PDMA)))
+ FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT)))
return -EINVAL;
if (gup_flags & FOLL_PIN)
diff --git a/mm/hmm.c b/mm/hmm.c
index 855e25e59d8f..277ddcab4947 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -562,6 +562,7 @@ static const struct mm_walk_ops hmm_walk_ops = {
.pte_hole = hmm_vma_walk_hole,
.hugetlb_entry = hmm_vma_walk_hugetlb_entry,
.test_walk = hmm_vma_walk_test,
+ .walk_lock = PGWALK_RDLOCK,
};
/**
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 554f6f82d225..064fbd90822b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -577,25 +577,20 @@ struct deferred_split *get_deferred_split_queue(struct folio *folio)
}
#endif
-void prep_transhuge_page(struct page *page)
+void folio_prep_large_rmappable(struct folio *folio)
{
- struct folio *folio = (struct folio *)page;
-
VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);
INIT_LIST_HEAD(&folio->_deferred_list);
- folio_set_compound_dtor(folio, TRANSHUGE_PAGE_DTOR);
+ folio_set_large_rmappable(folio);
}
-static inline bool is_transparent_hugepage(struct page *page)
+static inline bool is_transparent_hugepage(struct folio *folio)
{
- struct folio *folio;
-
- if (!PageCompound(page))
+ if (!folio_test_large(folio))
return false;
- folio = page_folio(page);
return is_huge_zero_page(&folio->page) ||
- folio->_folio_dtor == TRANSHUGE_PAGE_DTOR;
+ folio_test_large_rmappable(folio);
}
static unsigned long __thp_get_unmapped_area(struct file *filp,
@@ -1467,8 +1462,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
return ERR_PTR(-EFAULT);
- /* Full NUMA hinting faults to serialise migration in fault paths */
- if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags))
+ if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags))
return NULL;
if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page))
@@ -1613,7 +1607,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* If other processes are mapping this folio, we couldn't discard
* the folio unless they all do MADV_FREE so let's skip the folio.
*/
- if (folio_mapcount(folio) != 1)
+ if (folio_estimated_sharers(folio) != 1)
goto out;
if (!folio_trylock(folio))
@@ -1982,7 +1976,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (!ptl)
return 0;
- pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
+ pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
tlb_remove_pud_tlb_entry(tlb, pud, addr);
if (vma_is_special_huge(vma)) {
spin_unlock(ptl);
@@ -2004,7 +1998,7 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
count_vm_event(THP_SPLIT_PUD);
- pudp_huge_clear_flush_notify(vma, haddr, pud);
+ pudp_huge_clear_flush(vma, haddr, pud);
}
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
@@ -2024,11 +2018,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
out:
spin_unlock(ptl);
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above pudp_huge_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_only_end(&range);
+ mmu_notifier_invalidate_range_end(&range);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
@@ -2095,7 +2085,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
count_vm_event(THP_SPLIT_PMD);
if (!vma_is_anonymous(vma)) {
- old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
/*
* We are going to unmap this huge page. So
* just go ahead and zap it
@@ -2125,8 +2115,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (is_huge_zero_pmd(*pmd)) {
/*
* FIXME: Do we want to invalidate secondary mmu by calling
- * mmu_notifier_invalidate_range() see comments below inside
- * __split_huge_pmd() ?
+ * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
+ * inside __split_huge_pmd() ?
*
* We are going from a zero huge page write protected to zero
* small page also write protected so it does not seems useful
@@ -2256,7 +2246,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pte_mksoft_dirty(entry);
if (uffd_wp)
entry = pte_mkuffd_wp(entry);
- page_add_anon_rmap(page + i, vma, addr, false);
+ page_add_anon_rmap(page + i, vma, addr, RMAP_NONE);
}
VM_BUG_ON(!pte_none(ptep_get(pte)));
set_pte_at(mm, addr, pte, entry);
@@ -2305,20 +2295,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
out:
spin_unlock(ptl);
- /*
- * No need to double call mmu_notifier->invalidate_range() callback.
- * They are 3 cases to consider inside __split_huge_pmd_locked():
- * 1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious
- * 2) __split_huge_zero_page_pmd() read only zero page and any write
- * fault will trigger a flush_notify before pointing to a new page
- * (it is fine if the secondary mmu keeps pointing to the old zero
- * page in the meantime)
- * 3) Split a huge pmd into pte pointing to the same page. No need
- * to invalidate secondary tlb entry they are all still valid.
- * any further changes to individual pte will notify. So no need
- * to call mmu_notifier->invalidate_range()
- */
- mmu_notifier_invalidate_range_only_end(&range);
+ mmu_notifier_invalidate_range_end(&range);
}
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
@@ -2425,10 +2402,16 @@ static void lru_add_page_tail(struct page *head, struct page *tail,
}
}
-static void __split_huge_page_tail(struct page *head, int tail,
+static void __split_huge_page_tail(struct folio *folio, int tail,
struct lruvec *lruvec, struct list_head *list)
{
+ struct page *head = &folio->page;
struct page *page_tail = head + tail;
+ /*
+ * Careful: new_folio is not a "real" folio before we cleared PageTail.
+ * Don't pass it around before clear_compound_head().
+ */
+ struct folio *new_folio = (struct folio *)page_tail;
VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
@@ -2470,18 +2453,15 @@ static void __split_huge_page_tail(struct page *head, int tail,
page_tail->index = head->index + tail;
/*
- * page->private should not be set in tail pages with the exception
- * of swap cache pages that store the swp_entry_t in tail pages.
- * Fix up and warn once if private is unexpectedly set.
- *
- * What of 32-bit systems, on which folio->_pincount overlays
- * head[1].private? No problem: THP_SWAP is not enabled on 32-bit, and
- * pincount must be 0 for folio_ref_freeze() to have succeeded.
+ * page->private should not be set in tail pages. Fix up and warn once
+ * if private is unexpectedly set.
*/
- if (!folio_test_swapcache(page_folio(head))) {
- VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail);
+ if (unlikely(page_tail->private)) {
+ VM_WARN_ON_ONCE_PAGE(true, page_tail);
page_tail->private = 0;
}
+ if (folio_test_swapcache(folio))
+ new_folio->swap.val = folio->swap.val + tail;
/* Page flags must be visible before we make the page non-compound. */
smp_wmb();
@@ -2522,16 +2502,14 @@ static void __split_huge_page(struct page *page, struct list_head *list,
struct address_space *swap_cache = NULL;
unsigned long offset = 0;
unsigned int nr = thp_nr_pages(head);
- int i;
+ int i, nr_dropped = 0;
/* complete memcg works before add pages to LRU */
split_page_memcg(head, nr);
- if (PageAnon(head) && PageSwapCache(head)) {
- swp_entry_t entry = { .val = page_private(head) };
-
- offset = swp_offset(entry);
- swap_cache = swap_address_space(entry);
+ if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
+ offset = swp_offset(folio->swap);
+ swap_cache = swap_address_space(folio->swap);
xa_lock(&swap_cache->i_pages);
}
@@ -2541,13 +2519,13 @@ static void __split_huge_page(struct page *page, struct list_head *list,
ClearPageHasHWPoisoned(head);
for (i = nr - 1; i >= 1; i--) {
- __split_huge_page_tail(head, i, lruvec, list);
+ __split_huge_page_tail(folio, i, lruvec, list);
/* Some pages can be beyond EOF: drop them from page cache */
if (head[i].index >= end) {
struct folio *tail = page_folio(head + i);
if (shmem_mapping(head->mapping))
- shmem_uncharge(head->mapping->host, 1);
+ nr_dropped++;
else if (folio_test_clear_dirty(tail))
folio_account_cleaned(tail,
inode_to_wb(folio->mapping->host));
@@ -2584,13 +2562,12 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
local_irq_enable();
+ if (nr_dropped)
+ shmem_uncharge(head->mapping->host, nr_dropped);
remap_page(folio, nr);
- if (PageSwapCache(head)) {
- swp_entry_t entry = { .val = page_private(head) };
-
- split_swap_cluster(entry);
- }
+ if (folio_test_swapcache(folio))
+ split_swap_cluster(folio->swap);
for (i = 0; i < nr; i++) {
struct page *subpage = head + i;
@@ -2698,8 +2675,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
gfp = current_gfp_context(mapping_gfp_mask(mapping) &
GFP_RECLAIM_MASK);
- if (folio_test_private(folio) &&
- !filemap_release_folio(folio, gfp)) {
+ if (!filemap_release_folio(folio, gfp)) {
ret = -EBUSY;
goto out;
}
@@ -2796,10 +2772,9 @@ out:
return ret;
}
-void free_transhuge_page(struct page *page)
+void folio_undo_large_rmappable(struct folio *folio)
{
- struct folio *folio = (struct folio *)page;
- struct deferred_split *ds_queue = get_deferred_split_queue(folio);
+ struct deferred_split *ds_queue;
unsigned long flags;
/*
@@ -2807,15 +2782,16 @@ void free_transhuge_page(struct page *page)
* deferred_list. If folio is not in deferred_list, it's safe
* to check without acquiring the split_queue_lock.
*/
- if (data_race(!list_empty(&folio->_deferred_list))) {
- spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
- if (!list_empty(&folio->_deferred_list)) {
- ds_queue->split_queue_len--;
- list_del(&folio->_deferred_list);
- }
- spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+ if (data_race(list_empty(&folio->_deferred_list)))
+ return;
+
+ ds_queue = get_deferred_split_queue(folio);
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ if (!list_empty(&folio->_deferred_list)) {
+ ds_queue->split_queue_len--;
+ list_del(&folio->_deferred_list);
}
- free_compound_page(page);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}
void deferred_split_folio(struct folio *folio)
@@ -3034,6 +3010,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
struct vm_area_struct *vma = vma_lookup(mm, addr);
struct page *page;
+ struct folio *folio;
if (!vma)
break;
@@ -3050,22 +3027,23 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
if (IS_ERR_OR_NULL(page))
continue;
- if (!is_transparent_hugepage(page))
+ folio = page_folio(page);
+ if (!is_transparent_hugepage(folio))
goto next;
total++;
- if (!can_split_folio(page_folio(page), NULL))
+ if (!can_split_folio(folio, NULL))
goto next;
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto next;
- if (!split_huge_page(page))
+ if (!split_folio(folio))
split++;
- unlock_page(page);
+ folio_unlock(folio);
next:
- put_page(page);
+ folio_put(folio);
cond_resched();
}
mmap_read_unlock(mm);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 64a3239b6407..ba6d39b71cb1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -34,6 +34,7 @@
#include <linux/nospec.h>
#include <linux/delayacct.h>
#include <linux/memory.h>
+#include <linux/mm_inline.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
@@ -967,9 +968,14 @@ pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
}
EXPORT_SYMBOL_GPL(linear_hugepage_index);
-/*
- * Return the size of the pages allocated when backing a VMA. In the majority
- * cases this will be same size as used by the page table entries.
+/**
+ * vma_kernel_pagesize - Page size granularity for this VMA.
+ * @vma: The user mapping.
+ *
+ * Folios in this VMA will be aligned to, and at least the size of the
+ * number of bytes returned by this function.
+ *
+ * Return: The default size of the folios allocated when backing a VMA.
*/
unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
@@ -1483,6 +1489,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio,
for (i = 1; i < nr_pages; i++) {
p = folio_page(folio, i);
+ p->flags &= ~PAGE_FLAGS_CHECK_AT_FREE;
p->mapping = NULL;
clear_compound_head(p);
if (!demote)
@@ -1579,9 +1586,19 @@ static inline void destroy_compound_gigantic_folio(struct folio *folio,
unsigned int order) { }
#endif
+static inline void __clear_hugetlb_destructor(struct hstate *h,
+ struct folio *folio)
+{
+ lockdep_assert_held(&hugetlb_lock);
+
+ folio_clear_hugetlb(folio);
+}
+
/*
- * Remove hugetlb folio from lists, and update dtor so that the folio appears
- * as just a compound page.
+ * Remove hugetlb folio from lists.
+ * If vmemmap exists for the folio, update dtor so that the folio appears
+ * as just a compound page. Otherwise, wait until after allocating vmemmap
+ * to update dtor.
*
* A reference is held on the folio, except in the case of demote.
*
@@ -1612,31 +1629,19 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
}
/*
- * Very subtle
- *
- * For non-gigantic pages set the destructor to the normal compound
- * page dtor. This is needed in case someone takes an additional
- * temporary ref to the page, and freeing is delayed until they drop
- * their reference.
- *
- * For gigantic pages set the destructor to the null dtor. This
- * destructor will never be called. Before freeing the gigantic
- * page destroy_compound_gigantic_folio will turn the folio into a
- * simple group of pages. After this the destructor does not
- * apply.
- *
- * This handles the case where more than one ref is held when and
- * after update_and_free_hugetlb_folio is called.
- *
- * In the case of demote we do not ref count the page as it will soon
- * be turned into a page of smaller size.
+ * We can only clear the hugetlb destructor after allocating vmemmap
+ * pages. Otherwise, someone (memory error handling) may try to write
+ * to tail struct pages.
+ */
+ if (!folio_test_hugetlb_vmemmap_optimized(folio))
+ __clear_hugetlb_destructor(h, folio);
+
+ /*
+ * In the case of demote we do not ref count the page as it will soon
+ * be turned into a page of smaller size.
*/
if (!demote)
folio_ref_unfreeze(folio, 1);
- if (hstate_is_gigantic(h))
- folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
- else
- folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
h->nr_huge_pages--;
h->nr_huge_pages_node[nid]--;
@@ -1673,7 +1678,7 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
h->surplus_huge_pages_node[nid]++;
}
- folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);
+ folio_set_hugetlb(folio);
folio_change_private(folio, NULL);
/*
* We have to set hugetlb_vmemmap_optimized again as above
@@ -1689,10 +1694,10 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
zeroed = folio_put_testzero(folio);
if (unlikely(!zeroed))
/*
- * It is VERY unlikely soneone else has taken a ref on
- * the page. In this case, we simply return as the
- * hugetlb destructor (free_huge_page) will be called
- * when this other ref is dropped.
+ * It is VERY unlikely soneone else has taken a ref
+ * on the folio. In this case, we simply return as
+ * free_huge_folio() will be called when this other ref
+ * is dropped.
*/
return;
@@ -1703,8 +1708,7 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
static void __update_and_free_hugetlb_folio(struct hstate *h,
struct folio *folio)
{
- int i;
- struct page *subpage;
+ bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio);
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
@@ -1735,12 +1739,14 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
if (unlikely(folio_test_hwpoison(folio)))
folio_clear_hugetlb_hwpoison(folio);
- for (i = 0; i < pages_per_huge_page(h); i++) {
- subpage = folio_page(folio, i);
- subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
- 1 << PG_referenced | 1 << PG_dirty |
- 1 << PG_active | 1 << PG_private |
- 1 << PG_writeback);
+ /*
+ * If vmemmap pages were allocated above, then we need to clear the
+ * hugetlb destructor under the hugetlb lock.
+ */
+ if (clear_dtor) {
+ spin_lock_irq(&hugetlb_lock);
+ __clear_hugetlb_destructor(h, folio);
+ spin_unlock_irq(&hugetlb_lock);
}
/*
@@ -1784,11 +1790,10 @@ static void free_hpage_workfn(struct work_struct *work)
node = node->next;
page->mapping = NULL;
/*
- * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate()
- * is going to trigger because a previous call to
- * remove_hugetlb_folio() will call folio_set_compound_dtor
- * (folio, NULL_COMPOUND_DTOR), so do not use page_hstate()
- * directly.
+ * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in
+ * folio_hstate() is going to trigger because a previous call to
+ * remove_hugetlb_folio() will clear the hugetlb bit, so do
+ * not use folio_hstate() directly.
*/
h = size_to_hstate(page_size(page));
@@ -1847,13 +1852,12 @@ struct hstate *size_to_hstate(unsigned long size)
return NULL;
}
-void free_huge_page(struct page *page)
+void free_huge_folio(struct folio *folio)
{
/*
* Can't pass hstate in here because it is called from the
* compound page destructor.
*/
- struct folio *folio = page_folio(page);
struct hstate *h = folio_hstate(folio);
int nid = folio_nid(folio);
struct hugepage_subpool *spool = hugetlb_folio_subpool(folio);
@@ -1908,7 +1912,7 @@ void free_huge_page(struct page *page)
spin_unlock_irqrestore(&hugetlb_lock, flags);
update_and_free_hugetlb_folio(h, folio, true);
} else {
- arch_clear_hugepage_flags(page);
+ arch_clear_hugepage_flags(&folio->page);
enqueue_hugetlb_folio(h, folio);
spin_unlock_irqrestore(&hugetlb_lock, flags);
}
@@ -1928,7 +1932,7 @@ static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
{
hugetlb_vmemmap_optimize(h, &folio->page);
INIT_LIST_HEAD(&folio->lru);
- folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);
+ folio_set_hugetlb(folio);
hugetlb_set_folio_subpool(folio, NULL);
set_hugetlb_cgroup(folio, NULL);
set_hugetlb_cgroup_rsvd(folio, NULL);
@@ -2043,28 +2047,10 @@ int PageHuge(struct page *page)
if (!PageCompound(page))
return 0;
folio = page_folio(page);
- return folio->_folio_dtor == HUGETLB_PAGE_DTOR;
+ return folio_test_hugetlb(folio);
}
EXPORT_SYMBOL_GPL(PageHuge);
-/**
- * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs
- * @folio: The folio to test.
- *
- * Context: Any context. Caller should have a reference on the folio to
- * prevent it from being turned into a tail page.
- * Return: True for hugetlbfs folios, false for anon folios or folios
- * belonging to other filesystems.
- */
-bool folio_test_hugetlb(struct folio *folio)
-{
- if (!folio_test_large(folio))
- return false;
-
- return folio->_folio_dtor == HUGETLB_PAGE_DTOR;
-}
-EXPORT_SYMBOL_GPL(folio_test_hugetlb);
-
/*
* Find and lock address space (mapping) in write mode.
*
@@ -2218,7 +2204,7 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node,
nodes_allowed, node_alloc_noretry);
if (folio) {
- free_huge_page(&folio->page); /* free it into the hugepage allocator */
+ free_huge_folio(folio); /* free it into the hugepage allocator */
return 1;
}
}
@@ -2401,13 +2387,13 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
* We could have raced with the pool size change.
* Double check that and simply deallocate the new page
* if we would end up overcommiting the surpluses. Abuse
- * temporary page to workaround the nasty free_huge_page
+ * temporary page to workaround the nasty free_huge_folio
* codeflow
*/
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
folio_set_hugetlb_temporary(folio);
spin_unlock_irq(&hugetlb_lock);
- free_huge_page(&folio->page);
+ free_huge_folio(folio);
return NULL;
}
@@ -2519,8 +2505,7 @@ static int gather_surplus_pages(struct hstate *h, long delta)
__must_hold(&hugetlb_lock)
{
LIST_HEAD(surplus_list);
- struct folio *folio;
- struct page *page, *tmp;
+ struct folio *folio, *tmp;
int ret;
long i;
long needed, allocated;
@@ -2580,21 +2565,21 @@ retry:
ret = 0;
/* Free the needed pages to the hugetlb pool */
- list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
+ list_for_each_entry_safe(folio, tmp, &surplus_list, lru) {
if ((--needed) < 0)
break;
/* Add the page to the hugetlb allocator */
- enqueue_hugetlb_folio(h, page_folio(page));
+ enqueue_hugetlb_folio(h, folio);
}
free:
spin_unlock_irq(&hugetlb_lock);
/*
* Free unnecessary surplus pages to the buddy allocator.
- * Pages have no ref count, call free_huge_page directly.
+ * Pages have no ref count, call free_huge_folio directly.
*/
- list_for_each_entry_safe(page, tmp, &surplus_list, lru)
- free_huge_page(page);
+ list_for_each_entry_safe(folio, tmp, &surplus_list, lru)
+ free_huge_folio(folio);
spin_lock_irq(&hugetlb_lock);
return ret;
@@ -2808,11 +2793,11 @@ static long vma_del_reservation(struct hstate *h,
* 2) No reservation was in place for the page, so hugetlb_restore_reserve is
* not set. However, alloc_hugetlb_folio always updates the reserve map.
*
- * In case 1, free_huge_page later in the error path will increment the
- * global reserve count. But, free_huge_page does not have enough context
+ * In case 1, free_huge_folio later in the error path will increment the
+ * global reserve count. But, free_huge_folio does not have enough context
* to adjust the reservation map. This case deals primarily with private
* mappings. Adjust the reserve map here to be consistent with global
- * reserve count adjustments to be made by free_huge_page. Make sure the
+ * reserve count adjustments to be made by free_huge_folio. Make sure the
* reserve map indicates there is a reservation present.
*
* In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio.
@@ -2828,7 +2813,7 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
* Rare out of memory condition in reserve map
* manipulation. Clear hugetlb_restore_reserve so
* that global reserve count will not be incremented
- * by free_huge_page. This will make it appear
+ * by free_huge_folio. This will make it appear
* as though the reservation for this folio was
* consumed. This may prevent the task from
* faulting in the folio at a later time. This
@@ -3204,7 +3189,7 @@ static void __init gather_bootmem_prealloc(void)
if (prep_compound_gigantic_folio(folio, huge_page_order(h))) {
WARN_ON(folio_test_reserved(folio));
prep_new_hugetlb_folio(h, folio, folio_nid(folio));
- free_huge_page(page); /* add to the hugepage allocator */
+ free_huge_folio(folio); /* add to the hugepage allocator */
} else {
/* VERY unlikely inflated ref count on a tail page */
free_gigantic_folio(folio, huge_page_order(h));
@@ -3236,7 +3221,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
&node_states[N_MEMORY], NULL);
if (!folio)
break;
- free_huge_page(&folio->page); /* free it into the hugepage allocator */
+ free_huge_folio(folio); /* free it into the hugepage allocator */
}
cond_resched();
}
@@ -3514,7 +3499,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
while (count > persistent_huge_pages(h)) {
/*
* If this allocation races such that we no longer need the
- * page, free_huge_page will handle it by freeing the page
+ * page, free_huge_folio will handle it by freeing the page
* and reducing the surplus.
*/
spin_unlock_irq(&hugetlb_lock);
@@ -3630,7 +3615,7 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
prep_compound_page(subpage, target_hstate->order);
folio_change_private(inner_folio, NULL);
prep_new_hugetlb_folio(target_hstate, inner_folio, nid);
- free_huge_page(subpage);
+ free_huge_folio(inner_folio);
}
mutex_unlock(&target_hstate->resize_lock);
@@ -4747,7 +4732,7 @@ void hugetlb_show_meminfo_node(int nid)
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
{
seq_printf(m, "HugetlbPages:\t%8lu kB\n",
- atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
+ K(atomic_long_read(&mm->hugetlb_usage)));
}
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
@@ -5028,7 +5013,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
src_vma->vm_start,
src_vma->vm_end);
mmu_notifier_invalidate_range_start(&range);
- mmap_assert_write_locked(src);
+ vma_assert_write_locked(src_vma);
raw_write_seqcount_begin(&src->write_protect_seq);
} else {
/*
@@ -5101,15 +5086,12 @@ again:
entry = huge_pte_clear_uffd_wp(entry);
set_huge_pte_at(dst, addr, dst_pte, entry);
} else if (unlikely(is_pte_marker(entry))) {
- /* No swap on hugetlb */
- WARN_ON_ONCE(
- is_swapin_error_entry(pte_to_swp_entry(entry)));
- /*
- * We copy the pte marker only if the dst vma has
- * uffd-wp enabled.
- */
- if (userfaultfd_wp(dst_vma))
- set_huge_pte_at(dst, addr, dst_pte, entry);
+ pte_marker marker = copy_pte_marker(
+ pte_to_swp_entry(entry), dst_vma);
+
+ if (marker)
+ set_huge_pte_at(dst, addr, dst_pte,
+ make_pte_marker(marker));
} else {
entry = huge_ptep_get(src_pte);
pte_folio = page_folio(pte_page(entry));
@@ -5281,9 +5263,9 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
}
if (shared_pmd)
- flush_tlb_range(vma, range.start, range.end);
+ flush_hugetlb_tlb_range(vma, range.start, range.end);
else
- flush_tlb_range(vma, old_end - len, old_end);
+ flush_hugetlb_tlb_range(vma, old_end - len, old_end);
mmu_notifier_invalidate_range_end(&range);
i_mmap_unlock_write(mapping);
hugetlb_vma_unlock_write(vma);
@@ -5690,7 +5672,6 @@ retry_avoidcopy:
/* Break COW or unshare */
huge_ptep_clear_flush(vma, haddr, ptep);
- mmu_notifier_invalidate_range(mm, range.start, range.end);
page_remove_rmap(&old_folio->page, vma, true);
hugepage_add_new_anon_rmap(new_folio, vma, haddr);
if (huge_pte_uffd_wp(pte))
@@ -5721,7 +5702,6 @@ out_release_old:
/*
* Return whether there is a pagecache page to back given address within VMA.
- * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
*/
static bool hugetlbfs_pagecache_present(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
@@ -6066,6 +6046,12 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int need_wait_lock = 0;
unsigned long haddr = address & huge_page_mask(h);
+ /* TODO: Handle faults under the VMA lock */
+ if (flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+ }
+
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
@@ -6090,14 +6076,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
entry = huge_ptep_get(ptep);
- /* PTE markers should be handled the same way as none pte */
- if (huge_pte_none_mostly(entry))
+ if (huge_pte_none_mostly(entry)) {
+ if (is_pte_marker(entry)) {
+ pte_marker marker =
+ pte_marker_get(pte_to_swp_entry(entry));
+
+ if (marker & PTE_MARKER_POISONED) {
+ ret = VM_FAULT_HWPOISON_LARGE;
+ goto out_mutex;
+ }
+ }
+
/*
+ * Other PTE markers should be handled the same way as none PTE.
+ *
* hugetlb_no_page will drop vma lock and hugetlb fault
* mutex internally, which make us return immediately.
*/
return hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
entry, flags);
+ }
ret = 0;
@@ -6253,6 +6251,25 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
int writable;
bool folio_in_pagecache = false;
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
+ ptl = huge_pte_lock(h, dst_mm, dst_pte);
+
+ /* Don't overwrite any existing PTEs (even markers) */
+ if (!huge_pte_none(huge_ptep_get(dst_pte))) {
+ spin_unlock(ptl);
+ return -EEXIST;
+ }
+
+ _dst_pte = make_pte_marker(PTE_MARKER_POISONED);
+ set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+
+ spin_unlock(ptl);
+ return 0;
+ }
+
if (is_continue) {
ret = -EFAULT;
folio = filemap_lock_folio(mapping, idx);
@@ -6422,39 +6439,9 @@ out_release_nounlock:
}
#endif /* CONFIG_USERFAULTFD */
-static void record_subpages(struct page *page, struct vm_area_struct *vma,
- int refs, struct page **pages)
-{
- int nr;
-
- for (nr = 0; nr < refs; nr++) {
- if (likely(pages))
- pages[nr] = nth_page(page, nr);
- }
-}
-
-static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma,
- unsigned int flags, pte_t *pte,
- bool *unshare)
-{
- pte_t pteval = huge_ptep_get(pte);
-
- *unshare = false;
- if (is_swap_pte(pteval))
- return true;
- if (huge_pte_write(pteval))
- return false;
- if (flags & FOLL_WRITE)
- return true;
- if (gup_must_unshare(vma, flags, pte_page(pteval))) {
- *unshare = true;
- return true;
- }
- return false;
-}
-
struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags)
+ unsigned long address, unsigned int flags,
+ unsigned int *page_mask)
{
struct hstate *h = hstate_vma(vma);
struct mm_struct *mm = vma->vm_mm;
@@ -6462,13 +6449,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
struct page *page = NULL;
spinlock_t *ptl;
pte_t *pte, entry;
-
- /*
- * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
- * follow_hugetlb_page().
- */
- if (WARN_ON_ONCE(flags & FOLL_PIN))
- return NULL;
+ int ret;
hugetlb_vma_lock_read(vma);
pte = hugetlb_walk(vma, haddr, huge_page_size(h));
@@ -6478,8 +6459,23 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
ptl = huge_pte_lock(h, mm, pte);
entry = huge_ptep_get(pte);
if (pte_present(entry)) {
- page = pte_page(entry) +
- ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
+ page = pte_page(entry);
+
+ if (!huge_pte_write(entry)) {
+ if (flags & FOLL_WRITE) {
+ page = NULL;
+ goto out;
+ }
+
+ if (gup_must_unshare(vma, flags, page)) {
+ /* Tell the caller to do unsharing */
+ page = ERR_PTR(-EMLINK);
+ goto out;
+ }
+ }
+
+ page += ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
+
/*
* Note that page may be a sub-page, and with vmemmap
* optimizations the page struct may be read only.
@@ -6489,208 +6485,29 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
* try_grab_page() should always be able to get the page here,
* because we hold the ptl lock and have verified pte_present().
*/
- if (try_grab_page(page, flags)) {
- page = NULL;
+ ret = try_grab_page(page, flags);
+
+ if (WARN_ON_ONCE(ret)) {
+ page = ERR_PTR(ret);
goto out;
}
+
+ *page_mask = (1U << huge_page_order(h)) - 1;
}
out:
spin_unlock(ptl);
out_unlock:
hugetlb_vma_unlock_read(vma);
- return page;
-}
-
-long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, unsigned long *position,
- unsigned long *nr_pages, long i, unsigned int flags,
- int *locked)
-{
- unsigned long pfn_offset;
- unsigned long vaddr = *position;
- unsigned long remainder = *nr_pages;
- struct hstate *h = hstate_vma(vma);
- int err = -EFAULT, refs;
- while (vaddr < vma->vm_end && remainder) {
- pte_t *pte;
- spinlock_t *ptl = NULL;
- bool unshare = false;
- int absent;
- struct page *page;
-
- /*
- * If we have a pending SIGKILL, don't keep faulting pages and
- * potentially allocating memory.
- */
- if (fatal_signal_pending(current)) {
- remainder = 0;
- break;
- }
-
- hugetlb_vma_lock_read(vma);
- /*
- * Some archs (sparc64, sh*) have multiple pte_ts to
- * each hugepage. We have to make sure we get the
- * first, for the page indexing below to work.
- *
- * Note that page table lock is not held when pte is null.
- */
- pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
- huge_page_size(h));
- if (pte)
- ptl = huge_pte_lock(h, mm, pte);
- absent = !pte || huge_pte_none(huge_ptep_get(pte));
-
- /*
- * When coredumping, it suits get_dump_page if we just return
- * an error where there's an empty slot with no huge pagecache
- * to back it. This way, we avoid allocating a hugepage, and
- * the sparse dumpfile avoids allocating disk blocks, but its
- * huge holes still show up with zeroes where they need to be.
- */
- if (absent && (flags & FOLL_DUMP) &&
- !hugetlbfs_pagecache_present(h, vma, vaddr)) {
- if (pte)
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
- remainder = 0;
- break;
- }
-
- /*
- * We need call hugetlb_fault for both hugepages under migration
- * (in which case hugetlb_fault waits for the migration,) and
- * hwpoisoned hugepages (in which case we need to prevent the
- * caller from accessing to them.) In order to do this, we use
- * here is_swap_pte instead of is_hugetlb_entry_migration and
- * is_hugetlb_entry_hwpoisoned. This is because it simply covers
- * both cases, and because we can't follow correct pages
- * directly from any kind of swap entries.
- */
- if (absent ||
- __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) {
- vm_fault_t ret;
- unsigned int fault_flags = 0;
-
- if (pte)
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
-
- if (flags & FOLL_WRITE)
- fault_flags |= FAULT_FLAG_WRITE;
- else if (unshare)
- fault_flags |= FAULT_FLAG_UNSHARE;
- if (locked) {
- fault_flags |= FAULT_FLAG_ALLOW_RETRY |
- FAULT_FLAG_KILLABLE;
- if (flags & FOLL_INTERRUPTIBLE)
- fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
- }
- if (flags & FOLL_NOWAIT)
- fault_flags |= FAULT_FLAG_ALLOW_RETRY |
- FAULT_FLAG_RETRY_NOWAIT;
- if (flags & FOLL_TRIED) {
- /*
- * Note: FAULT_FLAG_ALLOW_RETRY and
- * FAULT_FLAG_TRIED can co-exist
- */
- fault_flags |= FAULT_FLAG_TRIED;
- }
- ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
- if (ret & VM_FAULT_ERROR) {
- err = vm_fault_to_errno(ret, flags);
- remainder = 0;
- break;
- }
- if (ret & VM_FAULT_RETRY) {
- if (locked &&
- !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
- *locked = 0;
- *nr_pages = 0;
- /*
- * VM_FAULT_RETRY must not return an
- * error, it will return zero
- * instead.
- *
- * No need to update "position" as the
- * caller will not check it after
- * *nr_pages is set to 0.
- */
- return i;
- }
- continue;
- }
-
- pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
- page = pte_page(huge_ptep_get(pte));
-
- VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
- !PageAnonExclusive(page), page);
-
- /*
- * If subpage information not requested, update counters
- * and skip the same_page loop below.
- */
- if (!pages && !pfn_offset &&
- (vaddr + huge_page_size(h) < vma->vm_end) &&
- (remainder >= pages_per_huge_page(h))) {
- vaddr += huge_page_size(h);
- remainder -= pages_per_huge_page(h);
- i += pages_per_huge_page(h);
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
- continue;
- }
-
- /* vaddr may not be aligned to PAGE_SIZE */
- refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
- (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
-
- if (pages)
- record_subpages(nth_page(page, pfn_offset),
- vma, refs,
- likely(pages) ? pages + i : NULL);
-
- if (pages) {
- /*
- * try_grab_folio() should always succeed here,
- * because: a) we hold the ptl lock, and b) we've just
- * checked that the huge page is present in the page
- * tables. If the huge page is present, then the tail
- * pages must also be present. The ptl prevents the
- * head page and tail pages from being rearranged in
- * any way. As this is hugetlb, the pages will never
- * be p2pdma or not longterm pinable. So this page
- * must be available at this point, unless the page
- * refcount overflowed:
- */
- if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
- flags))) {
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
- remainder = 0;
- err = -ENOMEM;
- break;
- }
- }
-
- vaddr += (refs << PAGE_SHIFT);
- remainder -= refs;
- i += refs;
-
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
- }
- *nr_pages = remainder;
/*
- * setting position is actually required only if remainder is
- * not zero but it's faster not to add a "if (remainder)"
- * branch.
+ * Fixup retval for dump requests: if pagecache doesn't exist,
+ * don't try to allocate a new page but just skip it.
*/
- *position = vaddr;
+ if (!page && (flags & FOLL_DUMP) &&
+ !hugetlbfs_pagecache_present(h, vma, address))
+ page = ERR_PTR(-EFAULT);
- return i ? i : err;
+ return page;
}
long hugetlb_change_protection(struct vm_area_struct *vma,
@@ -6822,8 +6639,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
else
flush_hugetlb_tlb_range(vma, start, end);
/*
- * No need to call mmu_notifier_invalidate_range() we are downgrading
- * page table protection not changing it to point to a new page.
+ * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are
+ * downgrading page table protection not changing it to point to a new
+ * page.
*
* See Documentation/mm/mmu_notifier.rst
*/
@@ -7467,7 +7285,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
/*
- * No need to call mmu_notifier_invalidate_range(), see
+ * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
* Documentation/mm/mmu_notifier.rst.
*/
mmu_notifier_invalidate_range_end(&range);
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index c2007ef5e9b0..4b9734777f69 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -36,14 +36,22 @@ struct vmemmap_remap_walk {
struct list_head *vmemmap_pages;
};
-static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
pmd_t __pmd;
int i;
unsigned long addr = start;
- struct page *page = pmd_page(*pmd);
- pte_t *pgtable = pte_alloc_one_kernel(&init_mm);
+ struct page *head;
+ pte_t *pgtable;
+
+ spin_lock(&init_mm.page_table_lock);
+ head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
+ spin_unlock(&init_mm.page_table_lock);
+ if (!head)
+ return 0;
+
+ pgtable = pte_alloc_one_kernel(&init_mm);
if (!pgtable)
return -ENOMEM;
@@ -53,7 +61,7 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
pte_t entry, *pte;
pgprot_t pgprot = PAGE_KERNEL;
- entry = mk_pte(page + i, pgprot);
+ entry = mk_pte(head + i, pgprot);
pte = pte_offset_kernel(&__pmd, addr);
set_pte_at(&init_mm, addr, pte, entry);
}
@@ -65,8 +73,8 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
* be treated as indepdenent small pages (as they can be freed
* individually).
*/
- if (!PageReserved(page))
- split_page(page, get_order(PMD_SIZE));
+ if (!PageReserved(head))
+ split_page(head, get_order(PMD_SIZE));
/* Make pte visible before pmd. See comment in pmd_install(). */
smp_wmb();
@@ -80,20 +88,6 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
return 0;
}
-static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
-{
- int leaf;
-
- spin_lock(&init_mm.page_table_lock);
- leaf = pmd_leaf(*pmd);
- spin_unlock(&init_mm.page_table_lock);
-
- if (!leaf)
- return 0;
-
- return __split_vmemmap_huge_pmd(pmd, start);
-}
-
static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end,
struct vmemmap_remap_walk *walk)
diff --git a/mm/init-mm.c b/mm/init-mm.c
index efa97b57acfd..cfd367822cdd 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -17,6 +17,8 @@
#define INIT_MM_CONTEXT(name)
#endif
+const struct vm_operations_struct vma_dummy_vm_ops;
+
/*
* For dynamically allocated mm_structs, there is a dynamically sized cpumask
* at the end of the structure, the size of which depends on the maximum CPU
diff --git a/mm/internal.h b/mm/internal.h
index 018f5c342f90..30cf724ddbce 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -62,6 +62,12 @@ void page_writeback_init(void);
#define FOLIO_PAGES_MAPPED (COMPOUND_MAPPED - 1)
/*
+ * Flags passed to __show_mem() and show_free_areas() to suppress output in
+ * various contexts.
+ */
+#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */
+
+/*
* How many individual pages have an elevated _mapcount. Excludes
* the folio's entire_mapcount.
*/
@@ -103,7 +109,7 @@ bool __folio_end_writeback(struct folio *folio);
void deactivate_file_folio(struct folio *folio);
void folio_activate(struct folio *folio);
-void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
+void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *start_vma, unsigned long floor,
unsigned long ceiling, bool mm_wr_locked);
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
@@ -170,6 +176,17 @@ static inline void set_page_refcounted(struct page *page)
set_page_count(page, 1);
}
+/*
+ * Return true if a folio needs ->release_folio() calling upon it.
+ */
+static inline bool folio_needs_release(struct folio *folio)
+{
+ struct address_space *mapping = folio_mapping(folio);
+
+ return folio_has_private(folio) ||
+ (mapping && mapping_release_always(mapping));
+}
+
extern unsigned long highest_memmap_pfn;
/*
@@ -390,17 +407,18 @@ static inline void folio_set_order(struct folio *folio, unsigned int order)
if (WARN_ON_ONCE(!order || !folio_test_large(folio)))
return;
- folio->_folio_order = order;
+ folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order;
#ifdef CONFIG_64BIT
folio->_folio_nr_pages = 1U << order;
#endif
}
+void folio_undo_large_rmappable(struct folio *folio);
+
static inline void prep_compound_head(struct page *page, unsigned int order)
{
struct folio *folio = (struct folio *)page;
- folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
folio_set_order(folio, order);
atomic_set(&folio->_entire_mapcount, -1);
atomic_set(&folio->_nr_pages_mapped, 0);
@@ -689,7 +707,7 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
if (fault_flag_allow_retry_first(flags) &&
!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
fpin = get_file(vmf->vma->vm_file);
- mmap_read_unlock(vmf->vma->vm_mm);
+ release_fault_lock(vmf);
}
return fpin;
}
@@ -924,6 +942,13 @@ int migrate_device_coherent_page(struct page *page);
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
int __must_check try_grab_page(struct page *page, unsigned int flags);
+/*
+ * mm/huge_memory.c
+ */
+struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmd,
+ unsigned int flags);
+
enum {
/* mark page accessed */
FOLL_TOUCH = 1 << 16,
@@ -998,6 +1023,16 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma,
smp_rmb();
/*
+ * During GUP-fast we might not get called on the head page for a
+ * hugetlb page that is mapped using cont-PTE, because GUP-fast does
+ * not work with the abstracted hugetlb PTEs that always point at the
+ * head page. For hugetlb, PageAnonExclusive only applies on the head
+ * page (as it cannot be partially COW-shared), so lookup the head page.
+ */
+ if (unlikely(!PageHead(page) && PageHuge(page)))
+ page = compound_head(page);
+
+ /*
* Note that PageKsm() pages cannot be exclusive, and consequently,
* cannot get pinned.
*/
@@ -1005,6 +1040,7 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma,
}
extern bool mirrored_kernelcore;
+extern bool memblock_has_mirror(void);
static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
{
@@ -1024,21 +1060,39 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
return !(vma->vm_flags & VM_SOFTDIRTY);
}
+static inline void vma_iter_config(struct vma_iterator *vmi,
+ unsigned long index, unsigned long last)
+{
+ MAS_BUG_ON(&vmi->mas, vmi->mas.node != MAS_START &&
+ (vmi->mas.index > index || vmi->mas.last < index));
+ __mas_set_range(&vmi->mas, index, last - 1);
+}
+
/*
* VMA Iterator functions shared between nommu and mmap
*/
-static inline int vma_iter_prealloc(struct vma_iterator *vmi)
+static inline int vma_iter_prealloc(struct vma_iterator *vmi,
+ struct vm_area_struct *vma)
{
- return mas_preallocate(&vmi->mas, GFP_KERNEL);
+ return mas_preallocate(&vmi->mas, vma, GFP_KERNEL);
}
-static inline void vma_iter_clear(struct vma_iterator *vmi,
- unsigned long start, unsigned long end)
+static inline void vma_iter_clear(struct vma_iterator *vmi)
{
- mas_set_range(&vmi->mas, start, end - 1);
mas_store_prealloc(&vmi->mas, NULL);
}
+static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
+ unsigned long start, unsigned long end, gfp_t gfp)
+{
+ __mas_set_range(&vmi->mas, start, end - 1);
+ mas_store_gfp(&vmi->mas, NULL, gfp);
+ if (unlikely(mas_is_err(&vmi->mas)))
+ return -ENOMEM;
+
+ return 0;
+}
+
static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
{
return mas_walk(&vmi->mas);
@@ -1068,8 +1122,7 @@ static inline void vma_iter_store(struct vma_iterator *vmi,
((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
vma_iter_invalidate(vmi);
- vmi->mas.index = vma->vm_start;
- vmi->mas.last = vma->vm_end - 1;
+ __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
mas_store_prealloc(&vmi->mas, vma);
}
@@ -1080,8 +1133,7 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
vma_iter_invalidate(vmi);
- vmi->mas.index = vma->vm_start;
- vmi->mas.last = vma->vm_end - 1;
+ __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
mas_store_gfp(&vmi->mas, vma, gfp);
if (unlikely(mas_is_err(&vmi->mas)))
return -ENOMEM;
diff --git a/mm/ioremap.c b/mm/ioremap.c
index 8652426282cc..3e049dfb28bd 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -10,14 +10,19 @@
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/export.h>
+#include <linux/ioremap.h>
-void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
- unsigned long prot)
+void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size,
+ pgprot_t prot)
{
unsigned long offset, vaddr;
phys_addr_t last_addr;
struct vm_struct *area;
+ /* An early platform driver might end up here */
+ if (WARN_ON_ONCE(!slab_is_available()))
+ return NULL;
+
/* Disallow wrap-around or zero size */
last_addr = phys_addr + size - 1;
if (!size || last_addr < phys_addr)
@@ -28,34 +33,42 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
phys_addr -= offset;
size = PAGE_ALIGN(size + offset);
- if (!ioremap_allowed(phys_addr, size, prot))
- return NULL;
-
- area = get_vm_area_caller(size, VM_IOREMAP,
- __builtin_return_address(0));
+ area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START,
+ IOREMAP_END, __builtin_return_address(0));
if (!area)
return NULL;
vaddr = (unsigned long)area->addr;
area->phys_addr = phys_addr;
- if (ioremap_page_range(vaddr, vaddr + size, phys_addr,
- __pgprot(prot))) {
+ if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
free_vm_area(area);
return NULL;
}
return (void __iomem *)(vaddr + offset);
}
+
+#ifndef ioremap_prot
+void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
+ unsigned long prot)
+{
+ return generic_ioremap_prot(phys_addr, size, __pgprot(prot));
+}
EXPORT_SYMBOL(ioremap_prot);
+#endif
-void iounmap(volatile void __iomem *addr)
+void generic_iounmap(volatile void __iomem *addr)
{
void *vaddr = (void *)((unsigned long)addr & PAGE_MASK);
- if (!iounmap_allowed(vaddr))
- return;
-
- if (is_vmalloc_addr(vaddr))
+ if (is_ioremap_addr(vaddr))
vunmap(vaddr);
}
+
+#ifndef iounmap
+void iounmap(volatile void __iomem *addr)
+{
+ generic_iounmap(addr);
+}
EXPORT_SYMBOL(iounmap);
+#endif
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index dad3c0eb70a0..96fd0411f5c5 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -116,7 +116,15 @@ EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
* backing pages (in __kfence_pool).
*/
static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0);
-struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+struct kfence_metadata *kfence_metadata __read_mostly;
+
+/*
+ * If kfence_metadata is not NULL, it may be accessed by kfence_shutdown_cache().
+ * So introduce kfence_metadata_init to initialize metadata, and then make
+ * kfence_metadata visible after initialization is successful. This prevents
+ * potential UAF or access to uninitialized metadata.
+ */
+static struct kfence_metadata *kfence_metadata_init __read_mostly;
/* Freelist with available objects. */
static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist);
@@ -591,7 +599,7 @@ static unsigned long kfence_init_pool(void)
__folio_set_slab(slab_folio(slab));
#ifdef CONFIG_MEMCG
- slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
+ slab->memcg_data = (unsigned long)&kfence_metadata_init[i / 2 - 1].objcg |
MEMCG_DATA_OBJCGS;
#endif
}
@@ -610,7 +618,7 @@ static unsigned long kfence_init_pool(void)
}
for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
- struct kfence_metadata *meta = &kfence_metadata[i];
+ struct kfence_metadata *meta = &kfence_metadata_init[i];
/* Initialize metadata. */
INIT_LIST_HEAD(&meta->list);
@@ -626,6 +634,12 @@ static unsigned long kfence_init_pool(void)
addr += 2 * PAGE_SIZE;
}
+ /*
+ * Make kfence_metadata visible only when initialization is successful.
+ * Otherwise, if the initialization fails and kfence_metadata is freed,
+ * it may cause UAF in kfence_shutdown_cache().
+ */
+ smp_store_release(&kfence_metadata, kfence_metadata_init);
return 0;
reset_slab:
@@ -672,26 +686,10 @@ static bool __init kfence_init_pool_early(void)
*/
memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
__kfence_pool = NULL;
- return false;
-}
-
-static bool kfence_init_pool_late(void)
-{
- unsigned long addr, free_size;
- addr = kfence_init_pool();
-
- if (!addr)
- return true;
+ memblock_free_late(__pa(kfence_metadata_init), KFENCE_METADATA_SIZE);
+ kfence_metadata_init = NULL;
- /* Same as above. */
- free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool);
-#ifdef CONFIG_CONTIG_ALLOC
- free_contig_range(page_to_pfn(virt_to_page((void *)addr)), free_size / PAGE_SIZE);
-#else
- free_pages_exact((void *)addr, free_size);
-#endif
- __kfence_pool = NULL;
return false;
}
@@ -841,19 +839,30 @@ static void toggle_allocation_gate(struct work_struct *work)
/* === Public interface ===================================================== */
-void __init kfence_alloc_pool(void)
+void __init kfence_alloc_pool_and_metadata(void)
{
if (!kfence_sample_interval)
return;
- /* if the pool has already been initialized by arch, skip the below. */
- if (__kfence_pool)
- return;
-
- __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
-
+ /*
+ * If the pool has already been initialized by arch, there is no need to
+ * re-allocate the memory pool.
+ */
if (!__kfence_pool)
+ __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
+
+ if (!__kfence_pool) {
pr_err("failed to allocate pool\n");
+ return;
+ }
+
+ /* The memory allocated by memblock has been zeroed out. */
+ kfence_metadata_init = memblock_alloc(KFENCE_METADATA_SIZE, PAGE_SIZE);
+ if (!kfence_metadata_init) {
+ pr_err("failed to allocate metadata\n");
+ memblock_free(__kfence_pool, KFENCE_POOL_SIZE);
+ __kfence_pool = NULL;
+ }
}
static void kfence_init_enable(void)
@@ -895,33 +904,69 @@ void __init kfence_init(void)
static int kfence_init_late(void)
{
- const unsigned long nr_pages = KFENCE_POOL_SIZE / PAGE_SIZE;
+ const unsigned long nr_pages_pool = KFENCE_POOL_SIZE / PAGE_SIZE;
+ const unsigned long nr_pages_meta = KFENCE_METADATA_SIZE / PAGE_SIZE;
+ unsigned long addr = (unsigned long)__kfence_pool;
+ unsigned long free_size = KFENCE_POOL_SIZE;
+ int err = -ENOMEM;
+
#ifdef CONFIG_CONTIG_ALLOC
struct page *pages;
- pages = alloc_contig_pages(nr_pages, GFP_KERNEL, first_online_node, NULL);
+ pages = alloc_contig_pages(nr_pages_pool, GFP_KERNEL, first_online_node,
+ NULL);
if (!pages)
return -ENOMEM;
+
__kfence_pool = page_to_virt(pages);
+ pages = alloc_contig_pages(nr_pages_meta, GFP_KERNEL, first_online_node,
+ NULL);
+ if (pages)
+ kfence_metadata_init = page_to_virt(pages);
#else
- if (nr_pages > MAX_ORDER_NR_PAGES) {
+ if (nr_pages_pool > MAX_ORDER_NR_PAGES ||
+ nr_pages_meta > MAX_ORDER_NR_PAGES) {
pr_warn("KFENCE_NUM_OBJECTS too large for buddy allocator\n");
return -EINVAL;
}
+
__kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, GFP_KERNEL);
if (!__kfence_pool)
return -ENOMEM;
+
+ kfence_metadata_init = alloc_pages_exact(KFENCE_METADATA_SIZE, GFP_KERNEL);
#endif
- if (!kfence_init_pool_late()) {
- pr_err("%s failed\n", __func__);
- return -EBUSY;
+ if (!kfence_metadata_init)
+ goto free_pool;
+
+ memzero_explicit(kfence_metadata_init, KFENCE_METADATA_SIZE);
+ addr = kfence_init_pool();
+ if (!addr) {
+ kfence_init_enable();
+ kfence_debugfs_init();
+ return 0;
}
- kfence_init_enable();
- kfence_debugfs_init();
+ pr_err("%s failed\n", __func__);
+ free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool);
+ err = -EBUSY;
- return 0;
+#ifdef CONFIG_CONTIG_ALLOC
+ free_contig_range(page_to_pfn(virt_to_page((void *)kfence_metadata_init)),
+ nr_pages_meta);
+free_pool:
+ free_contig_range(page_to_pfn(virt_to_page((void *)addr)),
+ free_size / PAGE_SIZE);
+#else
+ free_pages_exact((void *)kfence_metadata_init, KFENCE_METADATA_SIZE);
+free_pool:
+ free_pages_exact((void *)addr, free_size);
+#endif
+
+ kfence_metadata_init = NULL;
+ __kfence_pool = NULL;
+ return err;
}
static int kfence_enable_late(void)
@@ -941,6 +986,10 @@ void kfence_shutdown_cache(struct kmem_cache *s)
struct kfence_metadata *meta;
int i;
+ /* Pairs with release in kfence_init_pool(). */
+ if (!smp_load_acquire(&kfence_metadata))
+ return;
+
for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
bool in_use;
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index 392fb273e7bd..f46fbb03062b 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -102,7 +102,10 @@ struct kfence_metadata {
#endif
};
-extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+#define KFENCE_METADATA_SIZE PAGE_ALIGN(sizeof(struct kfence_metadata) * \
+ CONFIG_KFENCE_NUM_OBJECTS)
+
+extern struct kfence_metadata *kfence_metadata;
static inline struct kfence_metadata *addr_to_metadata(unsigned long addr)
{
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index 9e008a336d9f..95b2b84c296d 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -212,7 +212,9 @@ static void test_cache_destroy(void)
static inline size_t kmalloc_cache_alignment(size_t size)
{
- return kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)]->align;
+ /* just to get ->align so no need to pass in the real caller */
+ enum kmalloc_cache_type type = kmalloc_type(GFP_KERNEL, 0);
+ return kmalloc_caches[type][__kmalloc_index(size, false)]->align;
}
/* Must always inline to match stack trace against caller. */
@@ -282,8 +284,9 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
if (is_kfence_address(alloc)) {
struct slab *slab = virt_to_slab(alloc);
+ enum kmalloc_cache_type type = kmalloc_type(GFP_KERNEL, _RET_IP_);
struct kmem_cache *s = test_cache ?:
- kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)];
+ kmalloc_caches[type][__kmalloc_index(size, false)];
/*
* Verify that various helpers return the right values
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index 197430a5be4a..c509aed326ce 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -13,6 +13,7 @@
#include <linux/printk.h>
#include <linux/sched/debug.h>
#include <linux/seq_file.h>
+#include <linux/sprintf.h>
#include <linux/stacktrace.h>
#include <linux/string.h>
#include <trace/events/error_report.h>
@@ -26,8 +27,6 @@
#define ARCH_FUNC_PREFIX ""
#endif
-extern bool no_hash_pointers;
-
/* Helper function to either print to a seq_file or to console. */
__printf(2, 3)
static void seq_con_printf(struct seq_file *seq, const char *fmt, ...)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 78c8d5d8b628..88433cc25d8a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -19,6 +19,7 @@
#include <linux/page_table_check.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
+#include <linux/ksm.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -92,8 +93,6 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
static struct kmem_cache *mm_slot_cache __read_mostly;
-#define MAX_PTE_MAPPED_THP 8
-
struct collapse_control {
bool is_khugepaged;
@@ -107,15 +106,9 @@ struct collapse_control {
/**
* struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
* @slot: hash lookup from mm to mm_slot
- * @nr_pte_mapped_thp: number of pte mapped THP
- * @pte_mapped_thp: address array corresponding pte mapped THP
*/
struct khugepaged_mm_slot {
struct mm_slot slot;
-
- /* pte-mapped THP in this mm */
- int nr_pte_mapped_thp;
- unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
};
/**
@@ -709,6 +702,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
spin_lock(ptl);
ptep_clear(vma->vm_mm, address, _pte);
spin_unlock(ptl);
+ ksm_might_unmap_zero_page(vma->vm_mm, pteval);
}
} else {
src_page = pte_page(pteval);
@@ -902,7 +896,7 @@ static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node,
return false;
}
- prep_transhuge_page(*hpage);
+ folio_prep_large_rmappable((struct folio *)*hpage);
count_vm_event(THP_COLLAPSE_ALLOC);
return true;
}
@@ -1439,51 +1433,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
}
#ifdef CONFIG_SHMEM
-/*
- * Notify khugepaged that given addr of the mm is pte-mapped THP. Then
- * khugepaged should try to collapse the page table.
- *
- * Note that following race exists:
- * (1) khugepaged calls khugepaged_collapse_pte_mapped_thps() for mm_struct A,
- * emptying the A's ->pte_mapped_thp[] array.
- * (2) MADV_COLLAPSE collapses some file extent with target mm_struct B, and
- * retract_page_tables() finds a VMA in mm_struct A mapping the same extent
- * (at virtual address X) and adds an entry (for X) into mm_struct A's
- * ->pte-mapped_thp[] array.
- * (3) khugepaged calls khugepaged_collapse_scan_file() for mm_struct A at X,
- * sees a pte-mapped THP (SCAN_PTE_MAPPED_HUGEPAGE) and adds an entry
- * (for X) into mm_struct A's ->pte-mapped_thp[] array.
- * Thus, it's possible the same address is added multiple times for the same
- * mm_struct. Should this happen, we'll simply attempt
- * collapse_pte_mapped_thp() multiple times for the same address, under the same
- * exclusive mmap_lock, and assuming the first call is successful, subsequent
- * attempts will return quickly (without grabbing any additional locks) when
- * a huge pmd is found in find_pmd_or_thp_or_none(). Since this is a cheap
- * check, and since this is a rare occurrence, the cost of preventing this
- * "multiple-add" is thought to be more expensive than just handling it, should
- * it occur.
- */
-static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
- unsigned long addr)
-{
- struct khugepaged_mm_slot *mm_slot;
- struct mm_slot *slot;
- bool ret = false;
-
- VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
-
- spin_lock(&khugepaged_mm_lock);
- slot = mm_slot_lookup(mm_slots_hash, mm);
- mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
- if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) {
- mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
- ret = true;
- }
- spin_unlock(&khugepaged_mm_lock);
- return ret;
-}
-
-/* hpage must be locked, and mmap_lock must be held in write */
+/* hpage must be locked, and mmap_lock must be held */
static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmdp, struct page *hpage)
{
@@ -1495,7 +1445,7 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
};
VM_BUG_ON(!PageTransHuge(hpage));
- mmap_assert_write_locked(vma->vm_mm);
+ mmap_assert_locked(vma->vm_mm);
if (do_set_pmd(&vmf, hpage))
return SCAN_FAIL;
@@ -1504,48 +1454,6 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
return SCAN_SUCCEED;
}
-/*
- * A note about locking:
- * Trying to take the page table spinlocks would be useless here because those
- * are only used to synchronize:
- *
- * - modifying terminal entries (ones that point to a data page, not to another
- * page table)
- * - installing *new* non-terminal entries
- *
- * Instead, we need roughly the same kind of protection as free_pgtables() or
- * mm_take_all_locks() (but only for a single VMA):
- * The mmap lock together with this VMA's rmap locks covers all paths towards
- * the page table entries we're messing with here, except for hardware page
- * table walks and lockless_pages_from_mm().
- */
-static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmdp)
-{
- pmd_t pmd;
- struct mmu_notifier_range range;
-
- mmap_assert_write_locked(mm);
- if (vma->vm_file)
- lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem);
- /*
- * All anon_vmas attached to the VMA have the same root and are
- * therefore locked by the same lock.
- */
- if (vma->anon_vma)
- lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
-
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
- addr + HPAGE_PMD_SIZE);
- mmu_notifier_invalidate_range_start(&range);
- pmd = pmdp_collapse_flush(vma, addr, pmdp);
- tlb_remove_table_sync_one();
- mmu_notifier_invalidate_range_end(&range);
- mm_dec_nr_ptes(mm);
- page_table_check_pte_clear_range(mm, addr, pmd);
- pte_free(mm, pmd_pgtable(pmd));
-}
-
/**
* collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
* address haddr.
@@ -1561,26 +1469,29 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
bool install_pmd)
{
+ struct mmu_notifier_range range;
+ bool notified = false;
unsigned long haddr = addr & HPAGE_PMD_MASK;
struct vm_area_struct *vma = vma_lookup(mm, haddr);
struct page *hpage;
pte_t *start_pte, *pte;
- pmd_t *pmd;
- spinlock_t *ptl;
- int count = 0, result = SCAN_FAIL;
+ pmd_t *pmd, pgt_pmd;
+ spinlock_t *pml = NULL, *ptl;
+ int nr_ptes = 0, result = SCAN_FAIL;
int i;
- mmap_assert_write_locked(mm);
+ mmap_assert_locked(mm);
+
+ /* First check VMA found, in case page tables are being torn down */
+ if (!vma || !vma->vm_file ||
+ !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
+ return SCAN_VMA_CHECK;
/* Fast check before locking page if already PMD-mapped */
result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
if (result == SCAN_PMD_MAPPED)
return result;
- if (!vma || !vma->vm_file ||
- !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
- return SCAN_VMA_CHECK;
-
/*
* If we are here, we've succeeded in replacing all the native pages
* in the page cache with a single hugepage. If a mm were to fault-in
@@ -1610,41 +1521,24 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto drop_hpage;
}
+ result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
switch (result) {
case SCAN_SUCCEED:
break;
case SCAN_PMD_NONE:
/*
- * In MADV_COLLAPSE path, possible race with khugepaged where
- * all pte entries have been removed and pmd cleared. If so,
- * skip all the pte checks and just update the pmd mapping.
+ * All pte entries have been removed and pmd cleared.
+ * Skip all the pte checks and just update the pmd mapping.
*/
goto maybe_install_pmd;
default:
goto drop_hpage;
}
- /* Lock the vma before taking i_mmap and page table locks */
- vma_start_write(vma);
-
- /*
- * We need to lock the mapping so that from here on, only GUP-fast and
- * hardware page walks can access the parts of the page tables that
- * we're operating on.
- * See collapse_and_free_pmd().
- */
- i_mmap_lock_write(vma->vm_file->f_mapping);
-
- /*
- * This spinlock should be unnecessary: Nobody else should be accessing
- * the page tables under spinlock protection here, only
- * lockless_pages_from_mm() and the hardware page walker can access page
- * tables while all the high-level locks are held in write mode.
- */
result = SCAN_FAIL;
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
- if (!start_pte)
- goto drop_immap;
+ if (!start_pte) /* mmap_lock + page lock should prevent this */
+ goto drop_hpage;
/* step 1: check all mapped PTEs are to the right huge page */
for (i = 0, addr = haddr, pte = start_pte;
@@ -1671,10 +1565,34 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
*/
if (hpage + i != page)
goto abort;
- count++;
}
- /* step 2: adjust rmap */
+ pte_unmap_unlock(start_pte, ptl);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+ haddr, haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ notified = true;
+
+ /*
+ * pmd_lock covers a wider range than ptl, and (if split from mm's
+ * page_table_lock) ptl nests inside pml. The less time we hold pml,
+ * the better; but userfaultfd's mfill_atomic_pte() on a private VMA
+ * inserts a valid as-if-COWed PTE without even looking up page cache.
+ * So page lock of hpage does not protect from it, so we must not drop
+ * ptl before pgt_pmd is removed, so uffd private needs pml taken now.
+ */
+ if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
+ pml = pmd_lock(mm, pmd);
+
+ start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
+ if (!start_pte) /* mmap_lock + page lock should prevent this */
+ goto abort;
+ if (!pml)
+ spin_lock(ptl);
+ else if (ptl != pml)
+ spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+
+ /* step 2: clear page table and adjust rmap */
for (i = 0, addr = haddr, pte = start_pte;
i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
struct page *page;
@@ -1682,189 +1600,164 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
if (pte_none(ptent))
continue;
+ /*
+ * We dropped ptl after the first scan, to do the mmu_notifier:
+ * page lock stops more PTEs of the hpage being faulted in, but
+ * does not stop write faults COWing anon copies from existing
+ * PTEs; and does not stop those being swapped out or migrated.
+ */
+ if (!pte_present(ptent)) {
+ result = SCAN_PTE_NON_PRESENT;
+ goto abort;
+ }
page = vm_normal_page(vma, addr, ptent);
- if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+ if (hpage + i != page)
goto abort;
+
+ /*
+ * Must clear entry, or a racing truncate may re-remove it.
+ * TLB flush can be left until pmdp_collapse_flush() does it.
+ * PTE dirty? Shmem page is already dirty; file is read-only.
+ */
+ ptep_clear(mm, addr, pte);
page_remove_rmap(page, vma, false);
+ nr_ptes++;
}
- pte_unmap_unlock(start_pte, ptl);
+ pte_unmap(start_pte);
+ if (!pml)
+ spin_unlock(ptl);
/* step 3: set proper refcount and mm_counters. */
- if (count) {
- page_ref_sub(hpage, count);
- add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
+ if (nr_ptes) {
+ page_ref_sub(hpage, nr_ptes);
+ add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
}
- /* step 4: remove pte entries */
- /* we make no change to anon, but protect concurrent anon page lookup */
- if (vma->anon_vma)
- anon_vma_lock_write(vma->anon_vma);
+ /* step 4: remove empty page table */
+ if (!pml) {
+ pml = pmd_lock(mm, pmd);
+ if (ptl != pml)
+ spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+ }
+ pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
+ pmdp_get_lockless_sync();
+ if (ptl != pml)
+ spin_unlock(ptl);
+ spin_unlock(pml);
- collapse_and_free_pmd(mm, vma, haddr, pmd);
+ mmu_notifier_invalidate_range_end(&range);
- if (vma->anon_vma)
- anon_vma_unlock_write(vma->anon_vma);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
+ mm_dec_nr_ptes(mm);
+ page_table_check_pte_clear_range(mm, haddr, pgt_pmd);
+ pte_free_defer(mm, pmd_pgtable(pgt_pmd));
maybe_install_pmd:
/* step 5: install pmd entry */
result = install_pmd
? set_huge_pmd(vma, haddr, pmd, hpage)
: SCAN_SUCCEED;
-
+ goto drop_hpage;
+abort:
+ if (nr_ptes) {
+ flush_tlb_mm(mm);
+ page_ref_sub(hpage, nr_ptes);
+ add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
+ }
+ if (start_pte)
+ pte_unmap_unlock(start_pte, ptl);
+ if (pml && pml != ptl)
+ spin_unlock(pml);
+ if (notified)
+ mmu_notifier_invalidate_range_end(&range);
drop_hpage:
unlock_page(hpage);
put_page(hpage);
return result;
-
-abort:
- pte_unmap_unlock(start_pte, ptl);
-drop_immap:
- i_mmap_unlock_write(vma->vm_file->f_mapping);
- goto drop_hpage;
-}
-
-static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot)
-{
- struct mm_slot *slot = &mm_slot->slot;
- struct mm_struct *mm = slot->mm;
- int i;
-
- if (likely(mm_slot->nr_pte_mapped_thp == 0))
- return;
-
- if (!mmap_write_trylock(mm))
- return;
-
- if (unlikely(hpage_collapse_test_exit(mm)))
- goto out;
-
- for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
- collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i], false);
-
-out:
- mm_slot->nr_pte_mapped_thp = 0;
- mmap_write_unlock(mm);
}
-static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
- struct mm_struct *target_mm,
- unsigned long target_addr, struct page *hpage,
- struct collapse_control *cc)
+static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
- int target_result = SCAN_FAIL;
- i_mmap_lock_write(mapping);
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- int result = SCAN_FAIL;
- struct mm_struct *mm = NULL;
- unsigned long addr = 0;
- pmd_t *pmd;
- bool is_target = false;
+ struct mmu_notifier_range range;
+ struct mm_struct *mm;
+ unsigned long addr;
+ pmd_t *pmd, pgt_pmd;
+ spinlock_t *pml;
+ spinlock_t *ptl;
+ bool skipped_uffd = false;
/*
* Check vma->anon_vma to exclude MAP_PRIVATE mappings that
- * got written to. These VMAs are likely not worth investing
- * mmap_write_lock(mm) as PMD-mapping is likely to be split
- * later.
- *
- * Note that vma->anon_vma check is racy: it can be set up after
- * the check but before we took mmap_lock by the fault path.
- * But page lock would prevent establishing any new ptes of the
- * page, so we are safe.
- *
- * An alternative would be drop the check, but check that page
- * table is clear before calling pmdp_collapse_flush() under
- * ptl. It has higher chance to recover THP for the VMA, but
- * has higher cost too. It would also probably require locking
- * the anon_vma.
+ * got written to. These VMAs are likely not worth removing
+ * page tables from, as PMD-mapping is likely to be split later.
*/
- if (READ_ONCE(vma->anon_vma)) {
- result = SCAN_PAGE_ANON;
- goto next;
- }
+ if (READ_ONCE(vma->anon_vma))
+ continue;
+
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
if (addr & ~HPAGE_PMD_MASK ||
- vma->vm_end < addr + HPAGE_PMD_SIZE) {
- result = SCAN_VMA_CHECK;
- goto next;
- }
+ vma->vm_end < addr + HPAGE_PMD_SIZE)
+ continue;
+
mm = vma->vm_mm;
- is_target = mm == target_mm && addr == target_addr;
- result = find_pmd_or_thp_or_none(mm, addr, &pmd);
- if (result != SCAN_SUCCEED)
- goto next;
+ if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
+ continue;
+
+ if (hpage_collapse_test_exit(mm))
+ continue;
/*
- * We need exclusive mmap_lock to retract page table.
- *
- * We use trylock due to lock inversion: we need to acquire
- * mmap_lock while holding page lock. Fault path does it in
- * reverse order. Trylock is a way to avoid deadlock.
- *
- * Also, it's not MADV_COLLAPSE's job to collapse other
- * mappings - let khugepaged take care of them later.
+ * When a vma is registered with uffd-wp, we cannot recycle
+ * the page table because there may be pte markers installed.
+ * Other vmas can still have the same file mapped hugely, but
+ * skip this one: it will always be mapped in small page size
+ * for uffd-wp registered ranges.
*/
- result = SCAN_PTE_MAPPED_HUGEPAGE;
- if ((cc->is_khugepaged || is_target) &&
- mmap_write_trylock(mm)) {
- /* trylock for the same lock inversion as above */
- if (!vma_try_start_write(vma))
- goto unlock_next;
+ if (userfaultfd_wp(vma))
+ continue;
+
+ /* PTEs were notified when unmapped; but now for the PMD? */
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+ addr, addr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+
+ pml = pmd_lock(mm, pmd);
+ ptl = pte_lockptr(mm, pmd);
+ if (ptl != pml)
+ spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
- /*
- * Re-check whether we have an ->anon_vma, because
- * collapse_and_free_pmd() requires that either no
- * ->anon_vma exists or the anon_vma is locked.
- * We already checked ->anon_vma above, but that check
- * is racy because ->anon_vma can be populated under the
- * mmap lock in read mode.
- */
- if (vma->anon_vma) {
- result = SCAN_PAGE_ANON;
- goto unlock_next;
- }
- /*
- * When a vma is registered with uffd-wp, we can't
- * recycle the pmd pgtable because there can be pte
- * markers installed. Skip it only, so the rest mm/vma
- * can still have the same file mapped hugely, however
- * it'll always mapped in small page size for uffd-wp
- * registered ranges.
- */
- if (hpage_collapse_test_exit(mm)) {
- result = SCAN_ANY_PROCESS;
- goto unlock_next;
- }
- if (userfaultfd_wp(vma)) {
- result = SCAN_PTE_UFFD_WP;
- goto unlock_next;
- }
- collapse_and_free_pmd(mm, vma, addr, pmd);
- if (!cc->is_khugepaged && is_target)
- result = set_huge_pmd(vma, addr, pmd, hpage);
- else
- result = SCAN_SUCCEED;
-
-unlock_next:
- mmap_write_unlock(mm);
- goto next;
- }
/*
- * Calling context will handle target mm/addr. Otherwise, let
- * khugepaged try again later.
+ * Huge page lock is still held, so normally the page table
+ * must remain empty; and we have already skipped anon_vma
+ * and userfaultfd_wp() vmas. But since the mmap_lock is not
+ * held, it is still possible for a racing userfaultfd_ioctl()
+ * to have inserted ptes or markers. Now that we hold ptlock,
+ * repeating the anon_vma check protects from one category,
+ * and repeating the userfaultfd_wp() check from another.
*/
- if (!is_target) {
- khugepaged_add_pte_mapped_thp(mm, addr);
- continue;
+ if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
+ skipped_uffd = true;
+ } else {
+ pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
+ pmdp_get_lockless_sync();
+ }
+
+ if (ptl != pml)
+ spin_unlock(ptl);
+ spin_unlock(pml);
+
+ mmu_notifier_invalidate_range_end(&range);
+
+ if (!skipped_uffd) {
+ mm_dec_nr_ptes(mm);
+ page_table_check_pte_clear_range(mm, addr, pgt_pmd);
+ pte_free_defer(mm, pmd_pgtable(pgt_pmd));
}
-next:
- if (is_target)
- target_result = result;
}
- i_mmap_unlock_write(mapping);
- return target_result;
+ i_mmap_unlock_read(mapping);
}
/**
@@ -1955,10 +1848,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
goto xa_locked;
}
}
- if (!shmem_charge(mapping->host, 1)) {
- result = SCAN_FAIL;
- goto xa_locked;
- }
nr_none++;
continue;
}
@@ -2076,8 +1965,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
goto out_unlock;
}
- if (folio_has_private(folio) &&
- !filemap_release_folio(folio, GFP_KERNEL)) {
+ if (!filemap_release_folio(folio, GFP_KERNEL)) {
result = SCAN_PAGE_HAS_PRIVATE;
folio_putback_lru(folio);
goto out_unlock;
@@ -2145,8 +2033,13 @@ xa_unlocked:
*/
try_to_unmap_flush();
- if (result != SCAN_SUCCEED)
+ if (result == SCAN_SUCCEED && nr_none &&
+ !shmem_charge(mapping->host, nr_none))
+ result = SCAN_FAIL;
+ if (result != SCAN_SUCCEED) {
+ nr_none = 0;
goto rollback;
+ }
/*
* The old pages are locked, so they won't change anymore.
@@ -2259,9 +2152,11 @@ immap_locked:
/*
* Remove pte page tables, so we can re-fault the page as huge.
+ * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
*/
- result = retract_page_tables(mapping, start, mm, addr, hpage,
- cc);
+ retract_page_tables(mapping, start);
+ if (cc && !cc->is_khugepaged)
+ result = SCAN_PTE_MAPPED_HUGEPAGE;
unlock_page(hpage);
/*
@@ -2283,8 +2178,8 @@ rollback:
if (nr_none) {
xas_lock_irq(&xas);
mapping->nrpages -= nr_none;
- shmem_uncharge(mapping->host, nr_none);
xas_unlock_irq(&xas);
+ shmem_uncharge(mapping->host, nr_none);
}
list_for_each_entry_safe(page, tmp, &pagelist, lru) {
@@ -2422,16 +2317,6 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
{
BUILD_BUG();
}
-
-static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot)
-{
-}
-
-static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
- unsigned long addr)
-{
- return false;
-}
#endif
static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
@@ -2461,7 +2346,6 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
khugepaged_scan.mm_slot = mm_slot;
}
spin_unlock(&khugepaged_mm_lock);
- khugepaged_collapse_pte_mapped_thps(mm_slot);
mm = slot->mm;
/*
@@ -2514,36 +2398,27 @@ skip:
khugepaged_scan.address);
mmap_read_unlock(mm);
- *result = hpage_collapse_scan_file(mm,
- khugepaged_scan.address,
- file, pgoff, cc);
mmap_locked = false;
+ *result = hpage_collapse_scan_file(mm,
+ khugepaged_scan.address, file, pgoff, cc);
fput(file);
+ if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
+ mmap_read_lock(mm);
+ if (hpage_collapse_test_exit(mm))
+ goto breakouterloop;
+ *result = collapse_pte_mapped_thp(mm,
+ khugepaged_scan.address, false);
+ if (*result == SCAN_PMD_MAPPED)
+ *result = SCAN_SUCCEED;
+ mmap_read_unlock(mm);
+ }
} else {
*result = hpage_collapse_scan_pmd(mm, vma,
- khugepaged_scan.address,
- &mmap_locked,
- cc);
+ khugepaged_scan.address, &mmap_locked, cc);
}
- switch (*result) {
- case SCAN_PTE_MAPPED_HUGEPAGE: {
- pmd_t *pmd;
-
- *result = find_pmd_or_thp_or_none(mm,
- khugepaged_scan.address,
- &pmd);
- if (*result != SCAN_SUCCEED)
- break;
- if (!khugepaged_add_pte_mapped_thp(mm,
- khugepaged_scan.address))
- break;
- } fallthrough;
- case SCAN_SUCCEED:
+
+ if (*result == SCAN_SUCCEED)
++khugepaged_pages_collapsed;
- break;
- default:
- break;
- }
/* move to next address */
khugepaged_scan.address += HPAGE_PMD_SIZE;
@@ -2889,9 +2764,9 @@ handle_result:
case SCAN_PTE_MAPPED_HUGEPAGE:
BUG_ON(mmap_locked);
BUG_ON(*prev);
- mmap_write_lock(mm);
+ mmap_read_lock(mm);
result = collapse_pte_mapped_thp(mm, addr, true);
- mmap_write_unlock(mm);
+ mmap_read_unlock(mm);
goto handle_result;
/* Whitelisted set of results where continuing OK */
case SCAN_PMD_NULL:
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index a2d34226e3c8..2918150e31bd 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -218,7 +218,7 @@ static int kmemleak_enabled = 1;
/* same as above but only for the kmemleak_free() callback */
static int kmemleak_free_enabled = 1;
/* set in the late_initcall if there were no errors */
-static int kmemleak_initialized;
+static int kmemleak_late_initialized;
/* set if a kmemleak warning was issued */
static int kmemleak_warning;
/* set if a fatal kmemleak error has occurred */
@@ -610,7 +610,12 @@ static noinline depot_stack_handle_t set_track_prepare(void)
unsigned long entries[MAX_TRACE];
unsigned int nr_entries;
- if (!kmemleak_initialized)
+ /*
+ * Use object_cache to determine whether kmemleak_init() has
+ * been invoked. stack_depot_early_init() is called before
+ * kmemleak_init() in mm_core_init().
+ */
+ if (!object_cache)
return 0;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
trace_handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
@@ -2052,7 +2057,7 @@ static void kmemleak_disable(void)
kmemleak_enabled = 0;
/* check whether it is too early for a kernel thread */
- if (kmemleak_initialized)
+ if (kmemleak_late_initialized)
schedule_work(&cleanup_work);
else
kmemleak_free_enabled = 0;
@@ -2117,7 +2122,7 @@ void __init kmemleak_init(void)
*/
static int __init kmemleak_late_init(void)
{
- kmemleak_initialized = 1;
+ kmemleak_late_initialized = 1;
debugfs_create_file("kmemleak", 0644, NULL, NULL, &kmemleak_fops);
@@ -2125,7 +2130,7 @@ static int __init kmemleak_late_init(void)
/*
* Some error occurred and kmemleak was disabled. There is a
* small chance that kmemleak_disable() was called immediately
- * after setting kmemleak_initialized and we may end up with
+ * after setting kmemleak_late_initialized and we may end up with
* two clean-up threads but serialized by scan_mutex.
*/
schedule_work(&cleanup_work);
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index ec0da72e65aa..5d6e2dee5692 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -117,7 +117,7 @@ void kmsan_kfree_large(const void *ptr)
page = virt_to_head_page((void *)ptr);
KMSAN_WARN_ON(ptr != page_address(page));
kmsan_internal_poison_memory((void *)ptr,
- PAGE_SIZE << compound_order(page),
+ page_size(page),
GFP_KERNEL,
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
@@ -339,7 +339,7 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size,
* internal KMSAN checks.
*/
while (size > 0) {
- page_offset = addr % PAGE_SIZE;
+ page_offset = offset_in_page(addr);
to_go = min(PAGE_SIZE - page_offset, (u64)size);
kmsan_handle_dma_page((void *)addr, to_go, dir);
addr += to_go;
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
index b8bb95eea5e3..87318f9170f1 100644
--- a/mm/kmsan/shadow.c
+++ b/mm/kmsan/shadow.c
@@ -145,7 +145,7 @@ void *kmsan_get_metadata(void *address, bool is_origin)
return NULL;
if (!page_has_metadata(page))
return NULL;
- off = addr % PAGE_SIZE;
+ off = offset_in_page(addr);
return (is_origin ? origin_ptr_for(page) : shadow_ptr_for(page)) + off;
}
@@ -210,7 +210,7 @@ void kmsan_free_page(struct page *page, unsigned int order)
return;
kmsan_enter_runtime();
kmsan_internal_poison_memory(page_address(page),
- PAGE_SIZE << compound_order(page),
+ page_size(page),
GFP_KERNEL,
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
@@ -281,8 +281,8 @@ void __init kmsan_init_alloc_meta_for_range(void *start, void *end)
struct page *page;
u64 size;
- start = (void *)ALIGN_DOWN((u64)start, PAGE_SIZE);
- size = ALIGN((u64)end - (u64)start, PAGE_SIZE);
+ start = (void *)PAGE_ALIGN_DOWN((u64)start);
+ size = PAGE_ALIGN((u64)end - (u64)start);
shadow = memblock_alloc(size, PAGE_SIZE);
origin = memblock_alloc(size, PAGE_SIZE);
for (u64 addr = 0; addr < size; addr += PAGE_SIZE) {
diff --git a/mm/ksm.c b/mm/ksm.c
index ba266359da55..8d6aee05421d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -242,6 +242,9 @@ static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;
+/* The number of pages scanned */
+static unsigned long ksm_pages_scanned;
+
/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;
@@ -278,6 +281,9 @@ static unsigned int zero_checksum __read_mostly;
/* Whether to merge empty (zeroed) pages with actual zero pages */
static bool ksm_use_zero_pages __read_mostly;
+/* The number of zero pages which is placed by KSM */
+unsigned long ksm_zero_pages;
+
#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
@@ -448,13 +454,20 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
if (is_migration_entry(entry))
page = pfn_swap_entry_to_page(entry);
}
- ret = page && PageKsm(page);
+ /* return 1 if the page is an normal ksm page or KSM-placed zero page */
+ ret = (page && PageKsm(page)) || is_ksm_zero_pte(*pte);
pte_unmap_unlock(pte, ptl);
return ret;
}
static const struct mm_walk_ops break_ksm_ops = {
.pmd_entry = break_ksm_pmd_entry,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static const struct mm_walk_ops break_ksm_lock_vma_ops = {
+ .pmd_entry = break_ksm_pmd_entry,
+ .walk_lock = PGWALK_WRLOCK,
};
/*
@@ -470,16 +483,17 @@ static const struct mm_walk_ops break_ksm_ops = {
* of the process that owns 'vma'. We also do not want to enforce
* protection keys here anyway.
*/
-static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
+static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
{
vm_fault_t ret = 0;
+ const struct mm_walk_ops *ops = lock_vma ?
+ &break_ksm_lock_vma_ops : &break_ksm_ops;
do {
int ksm_page;
cond_resched();
- ksm_page = walk_page_range_vma(vma, addr, addr + 1,
- &break_ksm_ops, NULL);
+ ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL);
if (WARN_ON_ONCE(ksm_page < 0))
return ksm_page;
if (!ksm_page)
@@ -565,7 +579,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item)
mmap_read_lock(mm);
vma = find_mergeable_vma(mm, addr);
if (vma)
- break_ksm(vma, addr);
+ break_ksm(vma, addr, false);
mmap_read_unlock(mm);
}
@@ -871,7 +885,7 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
* in cmp_and_merge_page on one of the rmap_items we would be removing.
*/
static int unmerge_ksm_pages(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned long end, bool lock_vma)
{
unsigned long addr;
int err = 0;
@@ -882,7 +896,7 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
if (signal_pending(current))
err = -ERESTARTSYS;
else
- err = break_ksm(vma, addr);
+ err = break_ksm(vma, addr, lock_vma);
}
return err;
}
@@ -1029,7 +1043,7 @@ static int unmerge_and_remove_all_rmap_items(void)
if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
continue;
err = unmerge_ksm_pages(vma,
- vma->vm_start, vma->vm_end);
+ vma->vm_start, vma->vm_end, false);
if (err)
goto error;
}
@@ -1222,8 +1236,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
page_add_anon_rmap(kpage, vma, addr, RMAP_NONE);
newpte = mk_pte(kpage, vma->vm_page_prot);
} else {
- newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
- vma->vm_page_prot));
+ /*
+ * Use pte_mkdirty to mark the zero page mapped by KSM, and then
+ * we can easily track all KSM-placed zero pages by checking if
+ * the dirty bit in zero page's PTE is set.
+ */
+ newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot)));
+ ksm_zero_pages++;
+ mm->ksm_zero_pages++;
/*
* We're replacing an anonymous page with a zero page, which is
* not anonymous. We need to do proper accounting otherwise we
@@ -2466,8 +2486,9 @@ static void ksm_do_scan(unsigned int scan_npages)
{
struct ksm_rmap_item *rmap_item;
struct page *page;
+ unsigned int npages = scan_npages;
- while (scan_npages-- && likely(!freezing(current))) {
+ while (npages-- && likely(!freezing(current))) {
cond_resched();
rmap_item = scan_get_next_rmap_item(&page);
if (!rmap_item)
@@ -2475,6 +2496,8 @@ static void ksm_do_scan(unsigned int scan_npages)
cmp_and_merge_page(page, rmap_item);
put_page(page);
}
+
+ ksm_pages_scanned += scan_npages - npages;
}
static int ksmd_should_run(void)
@@ -2530,7 +2553,7 @@ static int __ksm_del_vma(struct vm_area_struct *vma)
return 0;
if (vma->anon_vma) {
- err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end);
+ err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true);
if (err)
return err;
}
@@ -2668,7 +2691,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
return 0; /* just ignore the advice */
if (vma->anon_vma) {
- err = unmerge_ksm_pages(vma, start, end);
+ err = unmerge_ksm_pages(vma, start, end, true);
if (err)
return err;
}
@@ -2784,6 +2807,8 @@ struct page *ksm_might_need_to_copy(struct page *page,
anon_vma->root == vma->anon_vma->root) {
return page; /* still no need to copy it */
}
+ if (PageHWPoison(page))
+ return ERR_PTR(-EHWPOISON);
if (!PageUptodate(page))
return page; /* let do_swap_page report the error */
@@ -3082,7 +3107,7 @@ static void wait_while_offlining(void)
#ifdef CONFIG_PROC_FS
long ksm_process_profit(struct mm_struct *mm)
{
- return mm->ksm_merging_pages * PAGE_SIZE -
+ return (long)(mm->ksm_merging_pages + mm->ksm_zero_pages) * PAGE_SIZE -
mm->ksm_rmap_items * sizeof(struct ksm_rmap_item);
}
#endif /* CONFIG_PROC_FS */
@@ -3313,6 +3338,13 @@ static ssize_t max_page_sharing_store(struct kobject *kobj,
}
KSM_ATTR(max_page_sharing);
+static ssize_t pages_scanned_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lu\n", ksm_pages_scanned);
+}
+KSM_ATTR_RO(pages_scanned);
+
static ssize_t pages_shared_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -3351,12 +3383,19 @@ static ssize_t pages_volatile_show(struct kobject *kobj,
}
KSM_ATTR_RO(pages_volatile);
+static ssize_t ksm_zero_pages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%ld\n", ksm_zero_pages);
+}
+KSM_ATTR_RO(ksm_zero_pages);
+
static ssize_t general_profit_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
long general_profit;
- general_profit = ksm_pages_sharing * PAGE_SIZE -
+ general_profit = (ksm_pages_sharing + ksm_zero_pages) * PAGE_SIZE -
ksm_rmap_items * sizeof(struct ksm_rmap_item);
return sysfs_emit(buf, "%ld\n", general_profit);
@@ -3414,10 +3453,12 @@ static struct attribute *ksm_attrs[] = {
&sleep_millisecs_attr.attr,
&pages_to_scan_attr.attr,
&run_attr.attr,
+ &pages_scanned_attr.attr,
&pages_shared_attr.attr,
&pages_sharing_attr.attr,
&pages_unshared_attr.attr,
&pages_volatile_attr.attr,
+ &ksm_zero_pages_attr.attr,
&full_scans_attr.attr,
#ifdef CONFIG_NUMA
&merge_across_nodes_attr.attr,
diff --git a/mm/madvise.c b/mm/madvise.c
index 886f06066622..4dded5d27e7e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -173,9 +173,8 @@ static int madvise_update_vma(struct vm_area_struct *vma,
}
success:
- /*
- * vm_flags is protected by the mmap_lock held in write mode.
- */
+ /* vm_flags is protected by the mmap_lock held in write mode. */
+ vma_start_write(vma);
vm_flags_reset(vma, new_flags);
if (!vma->vm_file || vma_is_anon_shmem(vma)) {
error = replace_anon_vma_name(vma, anon_name);
@@ -218,7 +217,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
ptep = NULL;
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
- vma, addr, false, &splug);
+ vma, addr, &splug);
if (page)
put_page(page);
}
@@ -233,6 +232,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
static const struct mm_walk_ops swapin_walk_ops = {
.pmd_entry = swapin_walk_pmd_entry,
+ .walk_lock = PGWALK_RDLOCK,
};
static void shmem_swapin_range(struct vm_area_struct *vma,
@@ -262,7 +262,7 @@ static void shmem_swapin_range(struct vm_area_struct *vma,
rcu_read_unlock();
page = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
- vma, addr, false, &splug);
+ vma, addr, &splug);
if (page)
put_page(page);
@@ -383,7 +383,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
folio = pfn_folio(pmd_pfn(orig_pmd));
/* Do not interfere with other mappings of this folio */
- if (folio_mapcount(folio) != 1)
+ if (folio_estimated_sharers(folio) != 1)
goto huge_unlock;
if (pageout_anon_only_filter && !folio_test_anon(folio))
@@ -413,6 +413,8 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
folio_clear_referenced(folio);
folio_test_clear_young(folio);
+ if (folio_test_active(folio))
+ folio_set_workingset(folio);
if (pageout) {
if (folio_isolate_lru(folio)) {
if (folio_test_unevictable(folio))
@@ -457,7 +459,7 @@ regular_folio:
if (folio_test_large(folio)) {
int err;
- if (folio_mapcount(folio) != 1)
+ if (folio_estimated_sharers(folio) != 1)
break;
if (pageout_anon_only_filter && !folio_test_anon(folio))
break;
@@ -510,6 +512,8 @@ regular_folio:
*/
folio_clear_referenced(folio);
folio_test_clear_young(folio);
+ if (folio_test_active(folio))
+ folio_set_workingset(folio);
if (pageout) {
if (folio_isolate_lru(folio)) {
if (folio_test_unevictable(folio))
@@ -534,6 +538,7 @@ regular_folio:
static const struct mm_walk_ops cold_walk_ops = {
.pmd_entry = madvise_cold_or_pageout_pte_range,
+ .walk_lock = PGWALK_RDLOCK,
};
static void madvise_cold_page_range(struct mmu_gather *tlb,
@@ -660,7 +665,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
free_swap_and_cache(entry);
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} else if (is_hwpoison_entry(entry) ||
- is_swapin_error_entry(entry)) {
+ is_poisoned_swp_entry(entry)) {
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
}
continue;
@@ -678,7 +683,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (folio_test_large(folio)) {
int err;
- if (folio_mapcount(folio) != 1)
+ if (folio_estimated_sharers(folio) != 1)
break;
if (!folio_trylock(folio))
break;
@@ -757,6 +762,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
static const struct mm_walk_ops madvise_free_walk_ops = {
.pmd_entry = madvise_free_pte_range,
+ .walk_lock = PGWALK_RDLOCK,
};
static int madvise_free_single_vma(struct vm_area_struct *vma,
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index a26dd8bcfcdb..2f8829b3541a 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -288,13 +288,14 @@ EXPORT_SYMBOL_GPL(wp_shared_mapping_range);
* @end: Pointer to the number of the last set bit in @bitmap.
* none set. The value is modified as new bits are set by the function.
*
- * Note: When this function returns there is no guarantee that a CPU has
+ * When this function returns there is no guarantee that a CPU has
* not already dirtied new ptes. However it will not clean any ptes not
* reported in the bitmap. The guarantees are as follows:
- * a) All ptes dirty when the function starts executing will end up recorded
- * in the bitmap.
- * b) All ptes dirtied after that will either remain dirty, be recorded in the
- * bitmap or both.
+ *
+ * * All ptes dirty when the function starts executing will end up recorded
+ * in the bitmap.
+ * * All ptes dirtied after that will either remain dirty, be recorded in the
+ * bitmap or both.
*
* If a caller needs to make sure all dirty ptes are picked up and none
* additional are added, it first needs to write-protect the address-space
diff --git a/mm/memblock.c b/mm/memblock.c
index f9e61e565a53..913b2520a9a0 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -161,6 +161,11 @@ static int memblock_can_resize __initdata_memblock;
static int memblock_memory_in_slab __initdata_memblock;
static int memblock_reserved_in_slab __initdata_memblock;
+bool __init_memblock memblock_has_mirror(void)
+{
+ return system_has_some_mirror;
+}
+
static enum memblock_flags __init_memblock choose_memblock_flags(void)
{
return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e8ca4bdcb03c..b29b850cf399 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -197,7 +197,7 @@ static struct move_charge_struct {
};
/*
- * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
+ * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
* limit reclaim to prevent infinite loops, if they ever occur.
*/
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
@@ -742,6 +742,10 @@ struct memcg_vmstats {
long state[MEMCG_NR_STAT];
unsigned long events[NR_MEMCG_EVENTS];
+ /* Non-hierarchical (CPU aggregated) page state & events */
+ long state_local[MEMCG_NR_STAT];
+ unsigned long events_local[NR_MEMCG_EVENTS];
+
/* Pending child counts during tree propagation */
long state_pending[MEMCG_NR_STAT];
unsigned long events_pending[NR_MEMCG_EVENTS];
@@ -775,11 +779,8 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
/* idx can be of type enum memcg_stat_item or node_stat_item. */
static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
- long x = 0;
- int cpu;
+ long x = READ_ONCE(memcg->vmstats->state_local[idx]);
- for_each_possible_cpu(cpu)
- x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -926,16 +927,12 @@ static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
- long x = 0;
- int cpu;
int index = memcg_events_index(event);
if (index < 0)
return 0;
- for_each_possible_cpu(cpu)
- x += per_cpu(memcg->vmstats_percpu->events[index], cpu);
- return x;
+ return READ_ONCE(memcg->vmstats->events_local[index]);
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -1629,7 +1626,6 @@ static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
WARN_ON_ONCE(seq_buf_has_overflowed(s));
}
-#define K(x) ((x) << (PAGE_SHIFT-10))
/**
* mem_cgroup_print_oom_context: Print OOM information relevant to
* memory controller.
@@ -3036,21 +3032,21 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
return objcg;
}
-struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
+struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
{
struct obj_cgroup *objcg;
if (!memcg_kmem_online())
return NULL;
- if (PageMemcgKmem(page)) {
- objcg = __folio_objcg(page_folio(page));
+ if (folio_memcg_kmem(folio)) {
+ objcg = __folio_objcg(folio);
obj_cgroup_get(objcg);
} else {
struct mem_cgroup *memcg;
rcu_read_lock();
- memcg = __folio_memcg(page_folio(page));
+ memcg = __folio_memcg(folio);
if (memcg)
objcg = __get_obj_cgroup_from_memcg(memcg);
else
@@ -3871,10 +3867,6 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
case _MEMSWAP:
ret = mem_cgroup_resize_max(memcg, nr_pages, true);
break;
- case _KMEM:
- /* kmem.limit_in_bytes is deprecated. */
- ret = -EOPNOTSUPP;
- break;
case _TCP:
ret = memcg_update_tcp_max(memcg, nr_pages);
break;
@@ -5086,12 +5078,6 @@ static struct cftype mem_cgroup_legacy_files[] = {
},
#endif
{
- .name = "kmem.limit_in_bytes",
- .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
- .write = mem_cgroup_write,
- .read_u64 = mem_cgroup_read_u64,
- },
- {
.name = "kmem.usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
.read_u64 = mem_cgroup_read_u64,
@@ -5165,6 +5151,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
* those references are manageable from userspace.
*/
+#define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1)
static DEFINE_IDR(mem_cgroup_idr);
static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
@@ -5526,7 +5513,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
struct memcg_vmstats_percpu *statc;
- long delta, v;
+ long delta, delta_cpu, v;
int i, nid;
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
@@ -5542,19 +5529,23 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
memcg->vmstats->state_pending[i] = 0;
/* Add CPU changes on this level since the last flush */
+ delta_cpu = 0;
v = READ_ONCE(statc->state[i]);
if (v != statc->state_prev[i]) {
- delta += v - statc->state_prev[i];
+ delta_cpu = v - statc->state_prev[i];
+ delta += delta_cpu;
statc->state_prev[i] = v;
}
- if (!delta)
- continue;
-
/* Aggregate counts on this level and propagate upwards */
- memcg->vmstats->state[i] += delta;
- if (parent)
- parent->vmstats->state_pending[i] += delta;
+ if (delta_cpu)
+ memcg->vmstats->state_local[i] += delta_cpu;
+
+ if (delta) {
+ memcg->vmstats->state[i] += delta;
+ if (parent)
+ parent->vmstats->state_pending[i] += delta;
+ }
}
for (i = 0; i < NR_MEMCG_EVENTS; i++) {
@@ -5562,18 +5553,22 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (delta)
memcg->vmstats->events_pending[i] = 0;
+ delta_cpu = 0;
v = READ_ONCE(statc->events[i]);
if (v != statc->events_prev[i]) {
- delta += v - statc->events_prev[i];
+ delta_cpu = v - statc->events_prev[i];
+ delta += delta_cpu;
statc->events_prev[i] = v;
}
- if (!delta)
- continue;
+ if (delta_cpu)
+ memcg->vmstats->events_local[i] += delta_cpu;
- memcg->vmstats->events[i] += delta;
- if (parent)
- parent->vmstats->events_pending[i] += delta;
+ if (delta) {
+ memcg->vmstats->events[i] += delta;
+ if (parent)
+ parent->vmstats->events_pending[i] += delta;
+ }
}
for_each_node_state(nid, N_MEMORY) {
@@ -5591,18 +5586,22 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (delta)
pn->lruvec_stats.state_pending[i] = 0;
+ delta_cpu = 0;
v = READ_ONCE(lstatc->state[i]);
if (v != lstatc->state_prev[i]) {
- delta += v - lstatc->state_prev[i];
+ delta_cpu = v - lstatc->state_prev[i];
+ delta += delta_cpu;
lstatc->state_prev[i] = v;
}
- if (!delta)
- continue;
+ if (delta_cpu)
+ pn->lruvec_stats.state_local[i] += delta_cpu;
- pn->lruvec_stats.state[i] += delta;
- if (ppn)
- ppn->lruvec_stats.state_pending[i] += delta;
+ if (delta) {
+ pn->lruvec_stats.state[i] += delta;
+ if (ppn)
+ ppn->lruvec_stats.state_pending[i] += delta;
+ }
}
}
}
@@ -5648,7 +5647,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
{
struct page *page = vm_normal_page(vma, addr, ptent);
- if (!page || !page_mapped(page))
+ if (!page)
return NULL;
if (PageAnon(page)) {
if (!(mc.flags & MOVE_ANON))
@@ -5657,8 +5656,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
if (!(mc.flags & MOVE_FILE))
return NULL;
}
- if (!get_page_unless_zero(page))
- return NULL;
+ get_page(page);
return page;
}
@@ -5766,7 +5764,7 @@ static int mem_cgroup_move_account(struct page *page,
if (folio_mapped(folio)) {
__mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
__mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
- if (folio_test_transhuge(folio)) {
+ if (folio_test_pmd_mappable(folio)) {
__mod_lruvec_state(from_vec, NR_ANON_THPS,
-nr_pages);
__mod_lruvec_state(to_vec, NR_ANON_THPS,
@@ -5852,25 +5850,20 @@ out:
* @ptent: the pte to be checked
* @target: the pointer the target page or swap ent will be stored(can be NULL)
*
- * Returns
- * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
- * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
- * move charge. if @target is not NULL, the page is stored in target->page
- * with extra refcnt got(Callers should handle it).
- * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
- * target for charge migration. if @target is not NULL, the entry is stored
- * in target->ent.
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is device memory and
- * thus not on the lru.
- * For now we such page is charge like a regular page would be as for all
- * intent and purposes it is just special memory taking the place of a
- * regular page.
- *
- * See Documentations/vm/hmm.txt and include/linux/hmm.h
- *
- * Called with pte lock held.
+ * Context: Called with pte lock held.
+ * Return:
+ * * MC_TARGET_NONE - If the pte is not a target for move charge.
+ * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for
+ * move charge. If @target is not NULL, the page is stored in target->page
+ * with extra refcnt taken (Caller should release it).
+ * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a
+ * target for charge migration. If @target is not NULL, the entry is
+ * stored in target->ent.
+ * * MC_TARGET_DEVICE - Like MC_TARGET_PAGE but page is device memory and
+ * thus not on the lru. For now such page is charged like a regular page
+ * would be as it is just special memory taking the place of a regular page.
+ * See Documentations/vm/hmm.txt and include/linux/hmm.h
*/
-
static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
@@ -6024,6 +6017,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
static const struct mm_walk_ops precharge_walk_ops = {
.pmd_entry = mem_cgroup_count_precharge_pte_range,
+ .walk_lock = PGWALK_RDLOCK,
};
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
@@ -6303,6 +6297,7 @@ put: /* get_mctgt_type() gets & locks the page */
static const struct mm_walk_ops charge_walk_ops = {
.pmd_entry = mem_cgroup_move_charge_pte_range,
+ .walk_lock = PGWALK_RDLOCK,
};
static void mem_cgroup_move_charge(void)
@@ -6696,8 +6691,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
lru_add_drain_all();
reclaimed = try_to_free_mem_cgroup_pages(memcg,
- nr_to_reclaim - nr_reclaimed,
- GFP_KERNEL, reclaim_options);
+ min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX),
+ GFP_KERNEL, reclaim_options);
if (!reclaimed && !nr_retries--)
return -EAGAIN;
@@ -7535,9 +7530,6 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
struct mem_cgroup *memcg;
unsigned short id;
- if (mem_cgroup_disabled())
- return;
-
id = swap_cgroup_record(entry, 0, nr_pages);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
@@ -7787,7 +7779,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
* @objcg: the object cgroup
* @size: size of compressed object
*
- * This forces the charge after obj_cgroup_may_swap() allowed
+ * This forces the charge after obj_cgroup_may_zswap() allowed
* compression and storage in zwap for this cgroup to go ahead.
*/
void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
diff --git a/mm/memfd.c b/mm/memfd.c
index e763e76f1106..1cad1904fc26 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -268,11 +268,33 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
+static int check_sysctl_memfd_noexec(unsigned int *flags)
+{
+#ifdef CONFIG_SYSCTL
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ int sysctl = pidns_memfd_noexec_scope(ns);
+
+ if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
+ if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL)
+ *flags |= MFD_NOEXEC_SEAL;
+ else
+ *flags |= MFD_EXEC;
+ }
+
+ if (!(*flags & MFD_NOEXEC_SEAL) && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) {
+ pr_err_ratelimited(
+ "%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n",
+ current->comm, task_pid_nr(current), sysctl);
+ return -EACCES;
+ }
+#endif
+ return 0;
+}
+
SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
unsigned int, flags)
{
- char comm[TASK_COMM_LEN];
unsigned int *file_seals;
struct file *file;
int fd, error;
@@ -294,35 +316,15 @@ SYSCALL_DEFINE2(memfd_create,
return -EINVAL;
if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
-#ifdef CONFIG_SYSCTL
- int sysctl = MEMFD_NOEXEC_SCOPE_EXEC;
- struct pid_namespace *ns;
-
- ns = task_active_pid_ns(current);
- if (ns)
- sysctl = ns->memfd_noexec_scope;
-
- switch (sysctl) {
- case MEMFD_NOEXEC_SCOPE_EXEC:
- flags |= MFD_EXEC;
- break;
- case MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL:
- flags |= MFD_NOEXEC_SEAL;
- break;
- default:
- pr_warn_once(
- "memfd_create(): MFD_NOEXEC_SEAL is enforced, pid=%d '%s'\n",
- task_pid_nr(current), get_task_comm(comm, current));
- return -EINVAL;
- }
-#else
- flags |= MFD_EXEC;
-#endif
- pr_warn_once(
- "memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL, pid=%d '%s'\n",
- task_pid_nr(current), get_task_comm(comm, current));
+ pr_info_ratelimited(
+ "%s[%d]: memfd_create() called without MFD_EXEC or MFD_NOEXEC_SEAL set\n",
+ current->comm, task_pid_nr(current));
}
+ error = check_sysctl_memfd_noexec(&flags);
+ if (error < 0)
+ return error;
+
/* length includes terminating zero */
len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
if (len <= 0)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e245191e6b04..881c35ef1daa 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -39,7 +39,6 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
-#include <linux/kernel-page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/dax.h>
@@ -50,7 +49,6 @@
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
-#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
@@ -59,7 +57,6 @@
#include <linux/memremap.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
-#include <linux/page-isolation.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include <linux/sysctl.h>
@@ -75,13 +72,15 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
static bool hw_memory_failure __read_mostly = false;
-inline void num_poisoned_pages_inc(unsigned long pfn)
+static DEFINE_MUTEX(mf_mutex);
+
+void num_poisoned_pages_inc(unsigned long pfn)
{
atomic_long_inc(&num_poisoned_pages);
memblk_nr_poison_inc(pfn);
}
-inline void num_poisoned_pages_sub(unsigned long pfn, long i)
+void num_poisoned_pages_sub(unsigned long pfn, long i)
{
atomic_long_sub(i, &num_poisoned_pages);
if (pfn != -1UL)
@@ -363,17 +362,14 @@ void shake_page(struct page *p)
{
if (PageHuge(p))
return;
-
- if (!PageSlab(p)) {
- lru_add_drain_all();
- if (PageLRU(p) || is_free_buddy_page(p))
- return;
- }
-
/*
* TODO: Could shrink slab caches here if a lightweight range-based
* shrinker will be available.
*/
+ if (PageSlab(p))
+ return;
+
+ lru_add_drain_all();
}
EXPORT_SYMBOL_GPL(shake_page);
@@ -614,7 +610,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
pgoff = page_to_pgoff(page);
read_lock(&tasklist_lock);
- for_each_process (tsk) {
+ for_each_process(tsk) {
struct anon_vma_chain *vmac;
struct task_struct *t = task_early_kill(tsk, force_early);
@@ -658,7 +654,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
/*
* Send early kill signal to tasks where a vma covers
* the page but the corrupted page is not necessarily
- * mapped it in its pte.
+ * mapped in its pte.
* Assume applications who requested early kill want
* to be informed of all such data corruptions.
*/
@@ -831,6 +827,7 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
static const struct mm_walk_ops hwp_walk_ops = {
.pmd_entry = hwpoison_pte_range,
.hugetlb_entry = hwpoison_hugetlb_range,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
@@ -939,14 +936,12 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
struct folio *folio = page_folio(p);
int err = mapping->a_ops->error_remove_page(mapping, p);
- if (err != 0) {
+ if (err != 0)
pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
- } else if (folio_has_private(folio) &&
- !filemap_release_folio(folio, GFP_NOIO)) {
+ else if (!filemap_release_folio(folio, GFP_NOIO))
pr_info("%#lx: failed to release buffers\n", pfn);
- } else {
+ else
ret = MF_RECOVERED;
- }
} else {
/*
* If the file system doesn't support it just invalidate
@@ -1192,9 +1187,6 @@ static int me_huge_page(struct page_state *ps, struct page *p)
struct address_space *mapping;
bool extra_pins = false;
- if (!PageHuge(hpage))
- return MF_DELAYED;
-
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, page_to_pfn(p), mapping);
@@ -1394,8 +1386,15 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags)
bool hugetlb = false;
ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, false);
- if (hugetlb)
- return ret;
+ if (hugetlb) {
+ /* Make sure hugetlb demotion did not happen from under us. */
+ if (folio == page_folio(page))
+ return ret;
+ if (ret > 0) {
+ folio_put(folio);
+ folio = page_folio(page);
+ }
+ }
/*
* This check prevents from calling folio_try_get() for any
@@ -1484,8 +1483,13 @@ static int __get_unpoison_page(struct page *page)
bool hugetlb = false;
ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, true);
- if (hugetlb)
- return ret;
+ if (hugetlb) {
+ /* Make sure hugetlb demotion did not happen from under us. */
+ if (folio == page_folio(page))
+ return ret;
+ if (ret > 0)
+ folio_put(folio);
+ }
/*
* PageHWPoisonTakenOff pages are not only marked as PG_hwpoison,
@@ -1813,6 +1817,7 @@ EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
#endif /* CONFIG_FS_DAX */
#ifdef CONFIG_HUGETLB_PAGE
+
/*
* Struct raw_hwp_page represents information about "raw error page",
* constructing singly linked list from ->_hugetlb_hwpoison field of folio.
@@ -1827,16 +1832,49 @@ static inline struct llist_head *raw_hwp_list_head(struct folio *folio)
return (struct llist_head *)&folio->_hugetlb_hwpoison;
}
+bool is_raw_hwpoison_page_in_hugepage(struct page *page)
+{
+ struct llist_head *raw_hwp_head;
+ struct raw_hwp_page *p;
+ struct folio *folio = page_folio(page);
+ bool ret = false;
+
+ if (!folio_test_hwpoison(folio))
+ return false;
+
+ if (!folio_test_hugetlb(folio))
+ return PageHWPoison(page);
+
+ /*
+ * When RawHwpUnreliable is set, kernel lost track of which subpages
+ * are HWPOISON. So return as if ALL subpages are HWPOISONed.
+ */
+ if (folio_test_hugetlb_raw_hwp_unreliable(folio))
+ return true;
+
+ mutex_lock(&mf_mutex);
+
+ raw_hwp_head = raw_hwp_list_head(folio);
+ llist_for_each_entry(p, raw_hwp_head->first, node) {
+ if (page == p->page) {
+ ret = true;
+ break;
+ }
+ }
+
+ mutex_unlock(&mf_mutex);
+
+ return ret;
+}
+
static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
{
- struct llist_head *head;
- struct llist_node *t, *tnode;
+ struct llist_node *head;
+ struct raw_hwp_page *p, *next;
unsigned long count = 0;
- head = raw_hwp_list_head(folio);
- llist_for_each_safe(tnode, t, head->first) {
- struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
-
+ head = llist_del_all(raw_hwp_list_head(folio));
+ llist_for_each_entry_safe(p, next, head, node) {
if (move_flag)
SetPageHWPoison(p->page);
else
@@ -1844,7 +1882,6 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
kfree(p);
count++;
}
- llist_del_all(head);
return count;
}
@@ -1852,7 +1889,7 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
{
struct llist_head *head;
struct raw_hwp_page *raw_hwp;
- struct llist_node *t, *tnode;
+ struct raw_hwp_page *p, *next;
int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0;
/*
@@ -1863,9 +1900,7 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
if (folio_test_hugetlb_raw_hwp_unreliable(folio))
return -EHWPOISON;
head = raw_hwp_list_head(folio);
- llist_for_each_safe(tnode, t, head->first) {
- struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
-
+ llist_for_each_entry_safe(p, next, head->first, node) {
if (p->page == page)
return -EHWPOISON;
}
@@ -1916,6 +1951,8 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
{
if (folio_test_hugetlb_raw_hwp_unreliable(folio))
return;
+ if (folio_test_hugetlb_vmemmap_optimized(folio))
+ return;
folio_clear_hwpoison(folio);
folio_free_raw_hwp(folio, true);
}
@@ -2080,8 +2117,6 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
{
int rc = -ENXIO;
- put_ref_page(pfn, flags);
-
/* device metadata space is not recoverable */
if (!pgmap_pfn_valid(pgmap, pfn))
goto out;
@@ -2104,12 +2139,11 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
out:
/* drop pgmap ref acquired in caller */
put_dev_pagemap(pgmap);
- action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
+ if (rc != -EOPNOTSUPP)
+ action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
return rc;
}
-static DEFINE_MUTEX(mf_mutex);
-
/**
* memory_failure - Handle memory failure of a page.
* @pfn: Page Number of the corrupted page
@@ -2125,7 +2159,7 @@ static DEFINE_MUTEX(mf_mutex);
* detected by a background scrubber)
*
* Must run in process context (e.g. a work queue) with interrupts
- * enabled and no spinlocks hold.
+ * enabled and no spinlocks held.
*
* Return: 0 for successfully handled the memory error,
* -EOPNOTSUPP for hwpoison_filter() filtered the error event,
@@ -2157,6 +2191,7 @@ int memory_failure(unsigned long pfn, int flags)
if (pfn_valid(pfn)) {
pgmap = get_dev_pagemap(pfn, NULL);
+ put_ref_page(pfn, flags);
if (pgmap) {
res = memory_failure_dev_pagemap(pfn, flags,
pgmap);
@@ -2183,8 +2218,6 @@ try_again:
goto unlock_mutex;
}
- hpage = compound_head(p);
-
/*
* We need/can do nothing about count=0 pages.
* 1) it's a free page, and therefore in safe hand:
@@ -2223,13 +2256,14 @@ try_again:
}
}
+ hpage = compound_head(p);
if (PageTransHuge(hpage)) {
/*
* The flag must be set after the refcount is bumped
* otherwise it may race with THP split.
* And the flag can't be set in get_hwpoison_page() since
* it is called by soft offline too and it is just called
- * for !MF_COUNT_INCREASE. So here seems to be the best
+ * for !MF_COUNT_INCREASED. So here seems to be the best
* place.
*
* Don't need care about the above error handling paths for
@@ -2466,7 +2500,7 @@ int unpoison_memory(unsigned long pfn)
{
struct folio *folio;
struct page *p;
- int ret = -EBUSY;
+ int ret = -EBUSY, ghp;
unsigned long count = 1;
bool huge = false;
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -2487,7 +2521,7 @@ int unpoison_memory(unsigned long pfn)
goto unlock_mutex;
}
- if (!folio_test_hwpoison(folio)) {
+ if (!PageHWPoison(p)) {
unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
pfn, &unpoison_rs);
goto unlock_mutex;
@@ -2499,6 +2533,13 @@ int unpoison_memory(unsigned long pfn)
goto unlock_mutex;
}
+ if (folio_test_slab(folio) || PageTable(&folio->page) || folio_test_reserved(folio))
+ goto unlock_mutex;
+
+ /*
+ * Note that folio->_mapcount is overloaded in SLAB, so the simple test
+ * in folio_mapped() has to be done after folio_test_slab() is checked.
+ */
if (folio_mapped(folio)) {
unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
pfn, &unpoison_rs);
@@ -2511,32 +2552,28 @@ int unpoison_memory(unsigned long pfn)
goto unlock_mutex;
}
- if (folio_test_slab(folio) || PageTable(&folio->page) || folio_test_reserved(folio))
- goto unlock_mutex;
-
- ret = get_hwpoison_page(p, MF_UNPOISON);
- if (!ret) {
+ ghp = get_hwpoison_page(p, MF_UNPOISON);
+ if (!ghp) {
if (PageHuge(p)) {
huge = true;
count = folio_free_raw_hwp(folio, false);
- if (count == 0) {
- ret = -EBUSY;
+ if (count == 0)
goto unlock_mutex;
- }
}
ret = folio_test_clear_hwpoison(folio) ? 0 : -EBUSY;
- } else if (ret < 0) {
- if (ret == -EHWPOISON) {
+ } else if (ghp < 0) {
+ if (ghp == -EHWPOISON) {
ret = put_page_back_buddy(p) ? 0 : -EBUSY;
- } else
+ } else {
+ ret = ghp;
unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
pfn, &unpoison_rs);
+ }
} else {
if (PageHuge(p)) {
huge = true;
count = folio_free_raw_hwp(folio, false);
if (count == 0) {
- ret = -EBUSY;
folio_put(folio);
goto unlock_mutex;
}
@@ -2586,10 +2623,10 @@ static bool isolate_page(struct page *page, struct list_head *pagelist)
/*
* If we succeed to isolate the page, we grabbed another refcount on
- * the page, so we can safely drop the one we got from get_any_pages().
+ * the page, so we can safely drop the one we got from get_any_page().
* If we failed to isolate the page, it means that we cannot go further
* and we will return an error, so drop the reference we got from
- * get_any_pages() as well.
+ * get_any_page() as well.
*/
put_page(page);
return isolated;
@@ -2622,7 +2659,7 @@ static int soft_offline_in_use_page(struct page *page)
}
lock_page(page);
- if (!PageHuge(page))
+ if (!huge)
wait_on_page_writeback(page);
if (PageHWPoison(page)) {
unlock_page(page);
@@ -2631,7 +2668,7 @@ static int soft_offline_in_use_page(struct page *page)
return 0;
}
- if (!PageHuge(page) && PageLRU(page) && !PageSwapCache(page))
+ if (!huge && PageLRU(page) && !PageSwapCache(page))
/*
* Try to invalidate first. This should work for
* non dirty unmapped page cache pages.
@@ -2737,10 +2774,13 @@ retry:
if (ret > 0) {
ret = soft_offline_in_use_page(page);
} else if (ret == 0) {
- if (!page_handle_poison(page, true, false) && try_again) {
- try_again = false;
- flags &= ~MF_COUNT_INCREASED;
- goto retry;
+ if (!page_handle_poison(page, true, false)) {
+ if (try_again) {
+ try_again = false;
+ flags &= ~MF_COUNT_INCREASED;
+ goto retry;
+ }
+ ret = -EBUSY;
}
}
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index a516e303e304..37a4f59d9585 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -560,11 +560,11 @@ struct memory_dev_type *alloc_memory_type(int adistance)
}
EXPORT_SYMBOL_GPL(alloc_memory_type);
-void destroy_memory_type(struct memory_dev_type *memtype)
+void put_memory_type(struct memory_dev_type *memtype)
{
kref_put(&memtype->kref, release_memtype);
}
-EXPORT_SYMBOL_GPL(destroy_memory_type);
+EXPORT_SYMBOL_GPL(put_memory_type);
void init_node_memory_type(int node, struct memory_dev_type *memtype)
{
@@ -586,7 +586,7 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype)
*/
if (!node_memory_types[node].map_count) {
node_memory_types[node].memtype = NULL;
- kref_put(&memtype->kref, release_memtype);
+ put_memory_type(memtype);
}
mutex_unlock(&memory_tier_lock);
}
@@ -672,16 +672,16 @@ bool numa_demotion_enabled = false;
#ifdef CONFIG_MIGRATION
#ifdef CONFIG_SYSFS
-static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t demotion_enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%s\n",
numa_demotion_enabled ? "true" : "false");
}
-static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
+static ssize_t demotion_enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
{
ssize_t ret;
@@ -693,8 +693,7 @@ static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
}
static struct kobj_attribute numa_demotion_enabled_attr =
- __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
- numa_demotion_enabled_store);
+ __ATTR_RW(demotion_enabled);
static struct attribute *numa_attrs[] = {
&numa_demotion_enabled_attr.attr,
diff --git a/mm/memory.c b/mm/memory.c
index 36289f33327e..6c264d2f969c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -77,7 +77,6 @@
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
-#include <linux/net_mm.h>
#include <trace/events/kmem.h>
@@ -361,12 +360,10 @@ void free_pgd_range(struct mmu_gather *tlb,
} while (pgd++, addr = next, addr != end);
}
-void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
+void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *vma, unsigned long floor,
unsigned long ceiling, bool mm_wr_locked)
{
- MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
-
do {
unsigned long addr = vma->vm_start;
struct vm_area_struct *next;
@@ -375,7 +372,7 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
* Note: USER_PGTABLES_CEILING may be passed as ceiling and may
* be 0. This will underflow and is okay.
*/
- next = mas_find(&mas, ceiling - 1);
+ next = mas_find(mas, ceiling - 1);
/*
* Hide vma from rmap and truncate_pagecache before freeing
@@ -396,7 +393,7 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
&& !is_vm_hugetlb_page(next)) {
vma = next;
- next = mas_find(&mas, ceiling - 1);
+ next = mas_find(mas, ceiling - 1);
if (mm_wr_locked)
vma_start_write(vma);
unlink_anon_vmas(vma);
@@ -860,8 +857,11 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
return -EBUSY;
return -ENOENT;
} else if (is_pte_marker_entry(entry)) {
- if (is_swapin_error_entry(entry) || userfaultfd_wp(dst_vma))
- set_pte_at(dst_mm, addr, dst_pte, pte);
+ pte_marker marker = copy_pte_marker(entry, dst_vma);
+
+ if (marker)
+ set_pte_at(dst_mm, addr, dst_pte,
+ make_pte_marker(marker));
return 0;
}
if (!userfaultfd_wp(dst_vma))
@@ -1312,7 +1312,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
* Use the raw variant of the seqcount_t write API to avoid
* lockdep complaining about preemptibility.
*/
- mmap_assert_write_locked(src_mm);
+ vma_assert_write_locked(src_vma);
raw_write_seqcount_begin(&src_mm->write_protect_seq);
}
@@ -1434,8 +1434,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
tlb_remove_tlb_entry(tlb, pte, addr);
zap_install_uffd_wp_if_needed(vma, addr, pte, details,
ptent);
- if (unlikely(!page))
+ if (unlikely(!page)) {
+ ksm_might_unmap_zero_page(mm, ptent);
continue;
+ }
delay_rmap = 0;
if (!PageAnon(page)) {
@@ -1501,7 +1503,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
!zap_drop_file_uffd_wp(details))
continue;
} else if (is_hwpoison_entry(entry) ||
- is_swapin_error_entry(entry)) {
+ is_poisoned_swp_entry(entry)) {
if (!should_zap_cows(details))
continue;
} else {
@@ -1692,10 +1694,12 @@ static void unmap_single_vma(struct mmu_gather *tlb,
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
* @tlb: address of the caller's struct mmu_gather
- * @mt: the maple tree
+ * @mas: the maple state
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
+ * @tree_end: The maximum index to check
+ * @mm_wr_locked: lock flag
*
* Unmap all pages in the vma list.
*
@@ -1708,9 +1712,10 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
+void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *vma, unsigned long start_addr,
- unsigned long end_addr, bool mm_wr_locked)
+ unsigned long end_addr, unsigned long tree_end,
+ bool mm_wr_locked)
{
struct mmu_notifier_range range;
struct zap_details details = {
@@ -1718,7 +1723,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
/* Careful - we need to zap private pages too! */
.even_cows = true,
};
- MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
start_addr, end_addr);
@@ -1726,7 +1730,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
do {
unmap_single_vma(tlb, vma, start_addr, end_addr, &details,
mm_wr_locked);
- } while ((vma = mas_find(&mas, end_addr - 1)) != NULL);
+ } while ((vma = mas_find(mas, tree_end - 1)) != NULL);
mmu_notifier_invalidate_range_end(&range);
}
@@ -1866,7 +1870,6 @@ out:
return retval;
}
-#ifdef pte_index
static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
@@ -1881,7 +1884,7 @@ static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
}
/* insert_pages() amortizes the cost of spinlock operations
- * when inserting pages in a loop. Arch *must* define pte_index.
+ * when inserting pages in a loop.
*/
static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
struct page **pages, unsigned long *num, pgprot_t prot)
@@ -1940,7 +1943,6 @@ out:
*num = remaining_pages_total;
return ret;
}
-#endif /* ifdef pte_index */
/**
* vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
@@ -1960,7 +1962,6 @@ out:
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
struct page **pages, unsigned long *num)
{
-#ifdef pte_index
const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
if (addr < vma->vm_start || end_addr >= vma->vm_end)
@@ -1972,18 +1973,6 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
}
/* Defer page refcount checking till we're about to map that page. */
return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
-#else
- unsigned long idx = 0, pgcount = *num;
- int err = -EINVAL;
-
- for (; idx < pgcount; ++idx) {
- err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
- if (err)
- break;
- }
- *num = pgcount - idx;
- return err;
-#endif /* ifdef pte_index */
}
EXPORT_SYMBOL(vm_insert_pages);
@@ -2859,7 +2848,7 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
entry = pte_mkyoung(vmf->orig_pte);
if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
- update_mmu_cache(vma, addr, vmf->pte);
+ update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1);
}
/*
@@ -2928,10 +2917,9 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
*
* We do this without the lock held, so that it can sleep if it needs to.
*/
-static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
+static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio)
{
vm_fault_t ret;
- struct page *page = vmf->page;
unsigned int old_flags = vmf->flags;
vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
@@ -2946,14 +2934,14 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
if (unlikely(!(ret & VM_FAULT_LOCKED))) {
- lock_page(page);
- if (!page->mapping) {
- unlock_page(page);
+ folio_lock(folio);
+ if (!folio->mapping) {
+ folio_unlock(folio);
return 0; /* retry */
}
ret |= VM_FAULT_LOCKED;
} else
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
return ret;
}
@@ -2966,20 +2954,20 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping;
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
bool dirtied;
bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
- dirtied = set_page_dirty(page);
- VM_BUG_ON_PAGE(PageAnon(page), page);
+ dirtied = folio_mark_dirty(folio);
+ VM_BUG_ON_FOLIO(folio_test_anon(folio), folio);
/*
- * Take a local copy of the address_space - page.mapping may be zeroed
- * by truncate after unlock_page(). The address_space itself remains
- * pinned by vma->vm_file's reference. We rely on unlock_page()'s
+ * Take a local copy of the address_space - folio.mapping may be zeroed
+ * by truncate after folio_unlock(). The address_space itself remains
+ * pinned by vma->vm_file's reference. We rely on folio_unlock()'s
* release semantics to prevent the compiler from undoing this copying.
*/
- mapping = page_rmapping(page);
- unlock_page(page);
+ mapping = folio_raw_mapping(folio);
+ folio_unlock(folio);
if (!page_mkwrite)
file_update_time(vma->vm_file);
@@ -3037,7 +3025,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
entry = pte_mkyoung(vmf->orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
pte_unmap_unlock(vmf->pte, vmf->ptl);
count_vm_event(PGREUSE);
}
@@ -3129,6 +3117,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
inc_mm_counter(mm, MM_ANONPAGES);
}
} else {
+ ksm_might_unmap_zero_page(mm, vmf->orig_pte);
inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
@@ -3150,7 +3139,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* that left a window where the new PTE could be loaded into
* some TLBs while the old PTE remains in others.
*/
- ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
+ ptep_clear_flush(vma, vmf->address, vmf->pte);
folio_add_new_anon_rmap(new_folio, vma, vmf->address);
folio_add_lru_vma(new_folio, vma);
/*
@@ -3160,7 +3149,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
*/
BUG_ON(unshare && pte_write(entry));
set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
if (old_folio) {
/*
* Only after switching the pte to the new page may
@@ -3196,11 +3185,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
}
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above ptep_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_only_end(&range);
+ mmu_notifier_invalidate_range_end(&range);
if (new_folio)
folio_put(new_folio);
@@ -3270,6 +3255,11 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
vm_fault_t ret;
pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vmf->vma);
+ return VM_FAULT_RETRY;
+ }
+
vmf->flags |= FAULT_FLAG_MKWRITE;
ret = vma->vm_ops->pfn_mkwrite(vmf);
if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
@@ -3280,36 +3270,42 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
return 0;
}
-static vm_fault_t wp_page_shared(struct vm_fault *vmf)
+static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret = 0;
- get_page(vmf->page);
+ folio_get(folio);
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
vm_fault_t tmp;
pte_unmap_unlock(vmf->pte, vmf->ptl);
- tmp = do_page_mkwrite(vmf);
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ folio_put(folio);
+ vma_end_read(vmf->vma);
+ return VM_FAULT_RETRY;
+ }
+
+ tmp = do_page_mkwrite(vmf, folio);
if (unlikely(!tmp || (tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(vmf->page);
+ folio_put(folio);
return tmp;
}
tmp = finish_mkwrite_fault(vmf);
if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
- unlock_page(vmf->page);
- put_page(vmf->page);
+ folio_unlock(folio);
+ folio_put(folio);
return tmp;
}
} else {
wp_page_reuse(vmf);
- lock_page(vmf->page);
+ folio_lock(folio);
}
ret |= fault_dirty_shared_page(vmf);
- put_page(vmf->page);
+ folio_put(folio);
return ret;
}
@@ -3360,6 +3356,9 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
+ if (vmf->page)
+ folio = page_folio(vmf->page);
+
/*
* Shared mapping: we are guaranteed to have VM_WRITE and
* FAULT_FLAG_WRITE set at this point.
@@ -3374,12 +3373,9 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
*/
if (!vmf->page)
return wp_pfn_shared(vmf);
- return wp_page_shared(vmf);
+ return wp_page_shared(vmf, folio);
}
- if (vmf->page)
- folio = page_folio(vmf->page);
-
/*
* Private mapping: create an exclusive anonymous page copy if reuse
* is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
@@ -3433,6 +3429,12 @@ reuse:
return 0;
}
copy:
+ if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma->anon_vma) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ vma_end_read(vmf->vma);
+ return VM_FAULT_RETRY;
+ }
+
/*
* Ok, we need to copy. Oh, well..
*/
@@ -3496,7 +3498,7 @@ void unmap_mapping_folio(struct folio *folio)
VM_BUG_ON(!folio_test_locked(folio));
first_index = folio->index;
- last_index = folio->index + folio_nr_pages(folio) - 1;
+ last_index = folio_next_index(folio) - 1;
details.even_cows = false;
details.single_folio = folio;
@@ -3583,6 +3585,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
struct folio *folio = page_folio(vmf->page);
struct vm_area_struct *vma = vmf->vma;
struct mmu_notifier_range range;
+ vm_fault_t ret;
/*
* We need a reference to lock the folio because we don't hold
@@ -3595,9 +3598,10 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
if (!folio_try_get(folio))
return 0;
- if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) {
+ ret = folio_lock_or_retry(folio, vmf);
+ if (ret) {
folio_put(folio);
- return VM_FAULT_RETRY;
+ return ret;
}
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
vma->vm_mm, vmf->address & PAGE_MASK,
@@ -3648,7 +3652,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
* none pte. Otherwise it means the pte could have changed, so retry.
*
* This should also cover the case where e.g. the pte changed
- * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR.
+ * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED.
* So is_pte_marker() check is not enough to safely drop the pte.
*/
if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
@@ -3694,8 +3698,8 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
/* Higher priority than uffd-wp when data corrupted */
- if (marker & PTE_MARKER_SWAPIN_ERROR)
- return VM_FAULT_SIGBUS;
+ if (marker & PTE_MARKER_POISONED)
+ return VM_FAULT_HWPOISON;
if (pte_marker_entry_uffd_wp(entry))
return pte_marker_handle_uffd_wp(vmf);
@@ -3722,18 +3726,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
bool exclusive = false;
swp_entry_t entry;
pte_t pte;
- int locked;
vm_fault_t ret = 0;
void *shadow = NULL;
if (!pte_unmap_same(vmf))
goto out;
- if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
- ret = VM_FAULT_RETRY;
- goto out;
- }
-
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
@@ -3743,6 +3741,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vmf->page = pfn_swap_entry_to_page(entry);
ret = remove_device_exclusive_entry(vmf);
} else if (is_device_private_entry(entry)) {
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ /*
+ * migrate_to_ram is not yet ready to operate
+ * under VMA lock.
+ */
+ vma_end_read(vma);
+ ret = VM_FAULT_RETRY;
+ goto out;
+ }
+
vmf->page = pfn_swap_entry_to_page(entry);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
@@ -3806,7 +3814,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio_add_lru(folio);
/* To provide entry to swap_readpage() */
- folio_set_swap_entry(folio, entry);
+ folio->swap = entry;
swap_readpage(page, true, NULL);
folio->private = NULL;
}
@@ -3844,12 +3852,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_release;
}
- locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags);
-
- if (!locked) {
- ret |= VM_FAULT_RETRY;
+ ret |= folio_lock_or_retry(folio, vmf);
+ if (ret & VM_FAULT_RETRY)
goto out_release;
- }
if (swapcache) {
/*
@@ -3860,7 +3865,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* changed.
*/
if (unlikely(!folio_test_swapcache(folio) ||
- page_private(page) != entry.val))
+ page_swap_entry(page).val != entry.val))
goto out_page;
/*
@@ -4027,7 +4032,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
}
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
unlock:
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -4151,7 +4156,7 @@ setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
unlock:
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -4246,7 +4251,6 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
bool write = vmf->flags & FAULT_FLAG_WRITE;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t entry;
- int i;
vm_fault_t ret = VM_FAULT_FALLBACK;
if (!transhuge_vma_suitable(vma, haddr))
@@ -4279,8 +4283,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
if (unlikely(!pmd_none(*vmf->pmd)))
goto out;
- for (i = 0; i < HPAGE_PMD_NR; i++)
- flush_icache_page(vma, page + i);
+ flush_icache_pages(vma, page, HPAGE_PMD_NR);
entry = mk_huge_pmd(page, vma->vm_page_prot);
if (write)
@@ -4313,15 +4316,24 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
}
#endif
-void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
+/**
+ * set_pte_range - Set a range of PTEs to point to pages in a folio.
+ * @vmf: Fault decription.
+ * @folio: The folio that contains @page.
+ * @page: The first page to create a PTE for.
+ * @nr: The number of PTEs to create.
+ * @addr: The first address to create a PTE for.
+ */
+void set_pte_range(struct vm_fault *vmf, struct folio *folio,
+ struct page *page, unsigned int nr, unsigned long addr)
{
struct vm_area_struct *vma = vmf->vma;
bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
bool write = vmf->flags & FAULT_FLAG_WRITE;
- bool prefault = vmf->address != addr;
+ bool prefault = in_range(vmf->address, addr, nr * PAGE_SIZE);
pte_t entry;
- flush_icache_page(vma, page);
+ flush_icache_pages(vma, page, nr);
entry = mk_pte(page, vma->vm_page_prot);
if (prefault && arch_wants_old_prefaulted_pte())
@@ -4335,14 +4347,18 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
entry = pte_mkuffd_wp(entry);
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
- inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, addr);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
+ VM_BUG_ON_FOLIO(nr != 1, folio);
+ folio_add_new_anon_rmap(folio, vma, addr);
+ folio_add_lru_vma(folio, vma);
} else {
- inc_mm_counter(vma->vm_mm, mm_counter_file(page));
- page_add_file_rmap(page, vma, false);
+ add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
+ folio_add_file_rmap_range(folio, page, nr, vma, false);
}
- set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
+ set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
+
+ /* no need to invalidate: a not-present page won't be cached */
+ update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr);
}
static bool vmf_pte_changed(struct vm_fault *vmf)
@@ -4410,11 +4426,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
/* Re-check under ptl */
if (likely(!vmf_pte_changed(vmf))) {
- do_set_pte(vmf, page, vmf->address);
-
- /* no need to invalidate: a not-present page won't be cached */
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ struct folio *folio = page_folio(page);
+ set_pte_range(vmf, folio, page, 1, vmf->address);
ret = 0;
} else {
update_mmu_tlb(vma, vmf->address, vmf->pte);
@@ -4533,6 +4547,7 @@ static inline bool should_fault_around(struct vm_fault *vmf)
static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
vm_fault_t ret = 0;
+ struct folio *folio;
/*
* Let's call ->map_pages() first and use ->fault() as fallback
@@ -4545,14 +4560,20 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
return ret;
}
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vmf->vma);
+ return VM_FAULT_RETRY;
+ }
+
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
ret |= finish_fault(vmf);
- unlock_page(vmf->page);
+ folio = page_folio(vmf->page);
+ folio_unlock(folio);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
- put_page(vmf->page);
+ folio_put(folio);
return ret;
}
@@ -4561,6 +4582,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+ }
+
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
@@ -4599,21 +4625,29 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret, tmp;
+ struct folio *folio;
+
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+ }
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
+ folio = page_folio(vmf->page);
+
/*
* Check if the backing address space wants to know that the page is
* about to become writable
*/
if (vma->vm_ops->page_mkwrite) {
- unlock_page(vmf->page);
- tmp = do_page_mkwrite(vmf);
+ folio_unlock(folio);
+ tmp = do_page_mkwrite(vmf, folio);
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(vmf->page);
+ folio_put(folio);
return tmp;
}
}
@@ -4621,8 +4655,8 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
ret |= finish_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY))) {
- unlock_page(vmf->page);
- put_page(vmf->page);
+ folio_unlock(folio);
+ folio_put(folio);
return ret;
}
@@ -4811,43 +4845,45 @@ out_map:
if (writable)
pte = pte_mkwrite(pte, vma);
ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
- if (vma_is_anonymous(vmf->vma))
+ struct vm_area_struct *vma = vmf->vma;
+ if (vma_is_anonymous(vma))
return do_huge_pmd_anonymous_page(vmf);
- if (vmf->vma->vm_ops->huge_fault)
- return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+ if (vma->vm_ops->huge_fault)
+ return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
return VM_FAULT_FALLBACK;
}
/* `inline' is required to avoid gcc 4.1.2 build error */
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
{
+ struct vm_area_struct *vma = vmf->vma;
const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
vm_fault_t ret;
- if (vma_is_anonymous(vmf->vma)) {
+ if (vma_is_anonymous(vma)) {
if (likely(!unshare) &&
- userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
+ userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd))
return handle_userfault(vmf, VM_UFFD_WP);
return do_huge_pmd_wp_page(vmf);
}
- if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
- if (vmf->vma->vm_ops->huge_fault) {
- ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+ if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+ if (vma->vm_ops->huge_fault) {
+ ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
}
}
/* COW or write-notify handled on pte level: split pmd. */
- __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
return VM_FAULT_FALLBACK;
}
@@ -4856,11 +4892,12 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ struct vm_area_struct *vma = vmf->vma;
/* No support for anonymous transparent PUD pages yet */
- if (vma_is_anonymous(vmf->vma))
+ if (vma_is_anonymous(vma))
return VM_FAULT_FALLBACK;
- if (vmf->vma->vm_ops->huge_fault)
- return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+ if (vma->vm_ops->huge_fault)
+ return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
return VM_FAULT_FALLBACK;
}
@@ -4869,21 +4906,22 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
/* No support for anonymous transparent PUD pages yet */
- if (vma_is_anonymous(vmf->vma))
+ if (vma_is_anonymous(vma))
goto split;
- if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
- if (vmf->vma->vm_ops->huge_fault) {
- ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+ if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+ if (vma->vm_ops->huge_fault) {
+ ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
}
}
split:
/* COW or write-notify not handled on PUD level: split pud.*/
- __split_huge_pud(vmf->vma, vmf->pud, vmf->address);
+ __split_huge_pud(vma, vmf->pud, vmf->address);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
return VM_FAULT_FALLBACK;
}
@@ -4960,7 +4998,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
vmf->flags & FAULT_FLAG_WRITE)) {
- update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
+ update_mmu_cache_range(vmf, vmf->vma, vmf->address,
+ vmf->pte, 1);
} else {
/* Skip spurious TLB flush for retried page fault */
if (vmf->flags & FAULT_FLAG_TRIED)
@@ -4981,10 +5020,10 @@ unlock:
}
/*
- * By the time we get here, we already hold the mm semaphore
- *
- * The mmap_lock may have been released depending on flags and our
- * return value. See filemap_fault() and __folio_lock_or_retry().
+ * On entry, we hold either the VMA lock or the mmap_lock
+ * (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in
+ * the result, the mmap_lock is not held on exit. See filemap_fault()
+ * and __folio_lock_or_retry().
*/
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
@@ -5082,7 +5121,7 @@ retry_pud:
/**
* mm_account_fault - Do page fault accounting
- *
+ * @mm: mm from which memcg should be extracted. It can be NULL.
* @regs: the pt_regs struct pointer. When set to NULL, will skip accounting
* of perf event counters, but we'll still do the per-task accounting to
* the task who triggered this page fault.
@@ -5190,6 +5229,17 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
!is_cow_mapping(vma->vm_flags)))
return VM_FAULT_SIGSEGV;
}
+#ifdef CONFIG_PER_VMA_LOCK
+ /*
+ * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of
+ * the assumption that lock is dropped on VM_FAULT_RETRY.
+ */
+ if (WARN_ON_ONCE((*flags &
+ (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) ==
+ (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)))
+ return VM_FAULT_SIGSEGV;
+#endif
+
return 0;
}
@@ -5258,11 +5308,8 @@ EXPORT_SYMBOL_GPL(handle_mm_fault);
static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
- /* Even if this succeeds, make it clear we *might* have slept */
- if (likely(mmap_read_trylock(mm))) {
- might_sleep();
+ if (likely(mmap_read_trylock(mm)))
return true;
- }
if (regs && !user_mode(regs)) {
unsigned long ip = instruction_pointer(regs);
@@ -5390,31 +5437,21 @@ retry:
if (!vma)
goto inval;
- /* Only anonymous and tcp vmas are supported for now */
- if (!vma_is_anonymous(vma) && !vma_is_tcp(vma))
- goto inval;
-
- /* find_mergeable_anon_vma uses adjacent vmas which are not locked */
- if (!vma->anon_vma && !vma_is_tcp(vma))
- goto inval;
-
if (!vma_start_read(vma))
goto inval;
/*
- * Due to the possibility of userfault handler dropping mmap_lock, avoid
- * it for now and fall back to page fault handling under mmap_lock.
+ * find_mergeable_anon_vma uses adjacent vmas which are not locked.
+ * This check must happen after vma_start_read(); otherwise, a
+ * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA
+ * from its anon_vma.
*/
- if (userfaultfd_armed(vma)) {
- vma_end_read(vma);
- goto inval;
- }
+ if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
+ goto inval_end_read;
/* Check since vm_start/vm_end might change before we lock the VMA */
- if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
- vma_end_read(vma);
- goto inval;
- }
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ goto inval_end_read;
/* Check if the VMA got isolated after we found it */
if (vma->detached) {
@@ -5426,6 +5463,9 @@ retry:
rcu_read_unlock();
return vma;
+
+inval_end_read:
+ vma_end_read(vma);
inval:
rcu_read_unlock();
count_vm_vma_lock_event(VMA_LOCK_ABORT);
@@ -5702,6 +5742,9 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
if (mmap_read_lock_killable(mm))
return 0;
+ /* Untag the address before looking up the VMA */
+ addr = untagged_addr_remote(mm, addr);
+
/* Avoid triggering the temporary warning in __get_user_pages */
if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
return 0;
@@ -6056,19 +6099,19 @@ void __init ptlock_cache_init(void)
SLAB_PANIC, NULL);
}
-bool ptlock_alloc(struct page *page)
+bool ptlock_alloc(struct ptdesc *ptdesc)
{
spinlock_t *ptl;
ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
if (!ptl)
return false;
- page->ptl = ptl;
+ ptdesc->ptl = ptl;
return true;
}
-void ptlock_free(struct page *page)
+void ptlock_free(struct ptdesc *ptdesc)
{
- kmem_cache_free(page_ptl_cachep, page->ptl);
+ kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
}
#endif
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3f231cf1b410..1b03f4ec6fd2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -41,17 +41,83 @@
#include "internal.h"
#include "shuffle.h"
+enum {
+ MEMMAP_ON_MEMORY_DISABLE = 0,
+ MEMMAP_ON_MEMORY_ENABLE,
+ MEMMAP_ON_MEMORY_FORCE,
+};
+
+static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
+
+static inline unsigned long memory_block_memmap_size(void)
+{
+ return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
+}
+
+static inline unsigned long memory_block_memmap_on_memory_pages(void)
+{
+ unsigned long nr_pages = PFN_UP(memory_block_memmap_size());
+
+ /*
+ * In "forced" memmap_on_memory mode, we add extra pages to align the
+ * vmemmap size to cover full pageblocks. That way, we can add memory
+ * even if the vmemmap size is not properly aligned, however, we might waste
+ * memory.
+ */
+ if (memmap_mode == MEMMAP_ON_MEMORY_FORCE)
+ return pageblock_align(nr_pages);
+ return nr_pages;
+}
+
#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
/*
* memory_hotplug.memmap_on_memory parameter
*/
-static bool memmap_on_memory __ro_after_init;
-module_param(memmap_on_memory, bool, 0444);
-MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
+static int set_memmap_mode(const char *val, const struct kernel_param *kp)
+{
+ int ret, mode;
+ bool enabled;
+
+ if (sysfs_streq(val, "force") || sysfs_streq(val, "FORCE")) {
+ mode = MEMMAP_ON_MEMORY_FORCE;
+ } else {
+ ret = kstrtobool(val, &enabled);
+ if (ret < 0)
+ return ret;
+ if (enabled)
+ mode = MEMMAP_ON_MEMORY_ENABLE;
+ else
+ mode = MEMMAP_ON_MEMORY_DISABLE;
+ }
+ *((int *)kp->arg) = mode;
+ if (mode == MEMMAP_ON_MEMORY_FORCE) {
+ unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
+
+ pr_info_once("Memory hotplug will waste %ld pages in each memory block\n",
+ memmap_pages - PFN_UP(memory_block_memmap_size()));
+ }
+ return 0;
+}
+
+static int get_memmap_mode(char *buffer, const struct kernel_param *kp)
+{
+ if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_FORCE)
+ return sprintf(buffer, "force\n");
+ return param_get_bool(buffer, kp);
+}
+
+static const struct kernel_param_ops memmap_mode_ops = {
+ .set = set_memmap_mode,
+ .get = get_memmap_mode,
+};
+module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444);
+MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n"
+ "With value \"force\" it could result in memory wastage due "
+ "to memmap size limitations (Y/N/force)");
static inline bool mhp_memmap_on_memory(void)
{
- return memmap_on_memory;
+ return memmap_mode != MEMMAP_ON_MEMORY_DISABLE;
}
#else
static inline bool mhp_memmap_on_memory(void)
@@ -1247,11 +1313,22 @@ static int online_memory_block(struct memory_block *mem, void *arg)
return device_online(&mem->dev);
}
-bool mhp_supports_memmap_on_memory(unsigned long size)
+#ifndef arch_supports_memmap_on_memory
+static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
+{
+ /*
+ * As default, we want the vmemmap to span a complete PMD such that we
+ * can map the vmemmap using a single PMD if supported by the
+ * architecture.
+ */
+ return IS_ALIGNED(vmemmap_size, PMD_SIZE);
+}
+#endif
+
+static bool mhp_supports_memmap_on_memory(unsigned long size)
{
- unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
- unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
- unsigned long remaining_size = size - vmemmap_size;
+ unsigned long vmemmap_size = memory_block_memmap_size();
+ unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
/*
* Besides having arch support and the feature enabled at runtime, we
@@ -1279,10 +1356,28 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
* altmap as an alternative source of memory, and we do not exactly
* populate a single PMD.
*/
- return mhp_memmap_on_memory() &&
- size == memory_block_size_bytes() &&
- IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
- IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
+ if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
+ return false;
+
+ /*
+ * Make sure the vmemmap allocation is fully contained
+ * so that we always allocate vmemmap memory from altmap area.
+ */
+ if (!IS_ALIGNED(vmemmap_size, PAGE_SIZE))
+ return false;
+
+ /*
+ * start pfn should be pageblock_nr_pages aligned for correctly
+ * setting migrate types
+ */
+ if (!pageblock_aligned(memmap_pages))
+ return false;
+
+ if (memmap_pages == PHYS_PFN(memory_block_size_bytes()))
+ /* No effective hotplugged memory doesn't make sense. */
+ return false;
+
+ return arch_supports_memmap_on_memory(vmemmap_size);
}
/*
@@ -1295,7 +1390,10 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
enum memblock_flags memblock_flags = MEMBLOCK_NONE;
- struct vmem_altmap mhp_altmap = {};
+ struct vmem_altmap mhp_altmap = {
+ .base_pfn = PHYS_PFN(res->start),
+ .end_pfn = PHYS_PFN(res->end),
+ };
struct memory_group *group = NULL;
u64 start, size;
bool new_node = false;
@@ -1339,26 +1437,29 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
* Self hosted memmap array
*/
if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
- if (!mhp_supports_memmap_on_memory(size)) {
- ret = -EINVAL;
- goto error;
+ if (mhp_supports_memmap_on_memory(size)) {
+ mhp_altmap.free = memory_block_memmap_on_memory_pages();
+ params.altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL);
+ if (!params.altmap) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ memcpy(params.altmap, &mhp_altmap, sizeof(mhp_altmap));
}
- mhp_altmap.free = PHYS_PFN(size);
- mhp_altmap.base_pfn = PHYS_PFN(start);
- params.altmap = &mhp_altmap;
+ /* fallback to not using altmap */
}
/* call arch's memory hotadd */
ret = arch_add_memory(nid, start, size, &params);
if (ret < 0)
- goto error;
+ goto error_free;
/* create memory block devices after memory was added */
- ret = create_memory_block_devices(start, size, mhp_altmap.alloc,
- group);
+ ret = create_memory_block_devices(start, size, params.altmap, group);
if (ret) {
arch_remove_memory(start, size, NULL);
- goto error;
+ goto error_free;
}
if (new_node) {
@@ -1395,6 +1496,8 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
walk_memory_blocks(start, size, NULL, online_memory_block);
return ret;
+error_free:
+ kfree(params.altmap);
error:
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
memblock_remove(start, size);
@@ -1843,6 +1946,11 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
do {
pfn = start_pfn;
do {
+ /*
+ * Historically we always checked for any signal and
+ * can't limit it to fatal signals without eventually
+ * breaking user space.
+ */
if (signal_pending(current)) {
ret = -EINTR;
reason = "signal backoff";
@@ -1956,12 +2064,18 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
return 0;
}
-static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
+static int test_has_altmap_cb(struct memory_block *mem, void *arg)
{
+ struct memory_block **mem_ptr = (struct memory_block **)arg;
/*
- * If not set, continue with the next block.
+ * return the memblock if we have altmap
+ * and break callback.
*/
- return mem->nr_vmemmap_pages;
+ if (mem->altmap) {
+ *mem_ptr = mem;
+ return 1;
+ }
+ return 0;
}
static int check_cpu_on_node(int nid)
@@ -2036,10 +2150,9 @@ EXPORT_SYMBOL(try_offline_node);
static int __ref try_remove_memory(u64 start, u64 size)
{
- struct vmem_altmap mhp_altmap = {};
- struct vmem_altmap *altmap = NULL;
- unsigned long nr_vmemmap_pages;
+ struct memory_block *mem;
int rc = 0, nid = NUMA_NO_NODE;
+ struct vmem_altmap *altmap = NULL;
BUG_ON(check_hotplug_memory_range(start, size));
@@ -2061,23 +2174,20 @@ static int __ref try_remove_memory(u64 start, u64 size)
* the same granularity it was added - a single memory block.
*/
if (mhp_memmap_on_memory()) {
- nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
- get_nr_vmemmap_pages_cb);
- if (nr_vmemmap_pages) {
+ rc = walk_memory_blocks(start, size, &mem, test_has_altmap_cb);
+ if (rc) {
if (size != memory_block_size_bytes()) {
pr_warn("Refuse to remove %#llx - %#llx,"
"wrong granularity\n",
start, start + size);
return -EINVAL;
}
-
+ altmap = mem->altmap;
/*
- * Let remove_pmd_table->free_hugepage_table do the
- * right thing if we used vmem_altmap when hot-adding
- * the range.
+ * Mark altmap NULL so that we can add a debug
+ * check on memblock free.
*/
- mhp_altmap.alloc = nr_vmemmap_pages;
- altmap = &mhp_altmap;
+ mem->altmap = NULL;
}
}
@@ -2094,6 +2204,12 @@ static int __ref try_remove_memory(u64 start, u64 size)
arch_remove_memory(start, size, altmap);
+ /* Verify that all vmemmap pages have actually been freed. */
+ if (altmap) {
+ WARN(altmap->alloc, "Altmap not fully unmapped");
+ kfree(altmap);
+ }
+
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
memblock_phys_free(start, size);
memblock_remove(start, size);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index edc25195f5bd..42b5567e3773 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -384,8 +384,10 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
VMA_ITERATOR(vmi, mm, 0);
mmap_write_lock(mm);
- for_each_vma(vmi, vma)
+ for_each_vma(vmi, vma) {
+ vma_start_write(vma);
mpol_rebind_policy(vma->vm_policy, new);
+ }
mmap_write_unlock(mm);
}
@@ -716,6 +718,14 @@ static const struct mm_walk_ops queue_pages_walk_ops = {
.hugetlb_entry = queue_folios_hugetlb,
.pmd_entry = queue_folios_pte_range,
.test_walk = queue_pages_test_walk,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
+ .hugetlb_entry = queue_folios_hugetlb,
+ .pmd_entry = queue_folios_pte_range,
+ .test_walk = queue_pages_test_walk,
+ .walk_lock = PGWALK_WRLOCK,
};
/*
@@ -736,7 +746,7 @@ static const struct mm_walk_ops queue_pages_walk_ops = {
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
nodemask_t *nodes, unsigned long flags,
- struct list_head *pagelist)
+ struct list_head *pagelist, bool lock_vma)
{
int err;
struct queue_pages qp = {
@@ -747,8 +757,10 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
.end = end,
.first = NULL,
};
+ const struct mm_walk_ops *ops = lock_vma ?
+ &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
- err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+ err = walk_page_range(mm, start, end, ops, &qp);
if (!qp.first)
/* whole range in hole */
@@ -768,6 +780,8 @@ static int vma_replace_policy(struct vm_area_struct *vma,
struct mempolicy *old;
struct mempolicy *new;
+ vma_assert_write_locked(vma);
+
pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
vma->vm_start, vma->vm_end, vma->vm_pgoff,
vma->vm_ops, vma->vm_file,
@@ -1074,7 +1088,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
vma = find_vma(mm, 0);
VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
- flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+ flags | MPOL_MF_DISCONTIG_OK, &pagelist, false);
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, alloc_migration_target, NULL,
@@ -1313,8 +1327,12 @@ static long do_mbind(unsigned long start, unsigned long len,
if (err)
goto mpol_out;
+ /*
+ * Lock the VMAs before scanning for pages to migrate, to ensure we don't
+ * miss a concurrently inserted page.
+ */
ret = queue_pages_range(mm, start, end, nmask,
- flags | MPOL_MF_INVERT, &pagelist);
+ flags | MPOL_MF_INVERT, &pagelist, true);
if (ret < 0) {
err = ret;
@@ -1538,6 +1556,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
break;
}
+ vma_start_write(vma);
new->home_node = home_node;
err = mbind_range(&vmi, vma, &prev, start, end, new);
mpol_put(new);
@@ -2176,9 +2195,9 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
mpol_cond_put(pol);
gfp |= __GFP_COMP;
page = alloc_page_interleave(gfp, order, nid);
- if (page && order > 1)
- prep_transhuge_page(page);
folio = (struct folio *)page;
+ if (folio && order > 1)
+ folio_prep_large_rmappable(folio);
goto out;
}
@@ -2189,9 +2208,9 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
gfp |= __GFP_COMP;
page = alloc_pages_preferred_many(gfp, order, node, pol);
mpol_cond_put(pol);
- if (page && order > 1)
- prep_transhuge_page(page);
folio = (struct folio *)page;
+ if (folio && order > 1)
+ folio_prep_large_rmappable(folio);
goto out;
}
@@ -2287,10 +2306,11 @@ EXPORT_SYMBOL(alloc_pages);
struct folio *folio_alloc(gfp_t gfp, unsigned order)
{
struct page *page = alloc_pages(gfp | __GFP_COMP, order);
+ struct folio *folio = (struct folio *)page;
- if (page && order > 1)
- prep_transhuge_page(page);
- return (struct folio *)page;
+ if (folio && order > 1)
+ folio_prep_large_rmappable(folio);
+ return folio;
}
EXPORT_SYMBOL(folio_alloc);
diff --git a/mm/memtest.c b/mm/memtest.c
index 57149dfee438..32f3e9dda837 100644
--- a/mm/memtest.c
+++ b/mm/memtest.c
@@ -3,9 +3,10 @@
#include <linux/types.h>
#include <linux/init.h>
#include <linux/memblock.h>
+#include <linux/seq_file.h>
-bool early_memtest_done;
-phys_addr_t early_memtest_bad_size;
+static bool early_memtest_done;
+static phys_addr_t early_memtest_bad_size;
static u64 patterns[] __initdata = {
/* The first entry has to be 0 to leave memtest with zeroed memory */
@@ -117,3 +118,20 @@ void __init early_memtest(phys_addr_t start, phys_addr_t end)
do_one_pass(patterns[idx], start, end);
}
}
+
+void memtest_report_meminfo(struct seq_file *m)
+{
+ unsigned long early_memtest_bad_size_kb;
+
+ if (!IS_ENABLED(CONFIG_PROC_FS))
+ return;
+
+ if (!early_memtest_done)
+ return;
+
+ early_memtest_bad_size_kb = early_memtest_bad_size >> 10;
+ if (early_memtest_bad_size && !early_memtest_bad_size_kb)
+ early_memtest_bad_size_kb = 1;
+ /* When 0 is reported, it means there actually was a successful test */
+ seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb);
+}
diff --git a/mm/migrate.c b/mm/migrate.c
index 18f58b7e0aff..b7fa020003f3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -684,7 +684,7 @@ int migrate_folio(struct address_space *mapping, struct folio *dst,
}
EXPORT_SYMBOL(migrate_folio);
-#ifdef CONFIG_BLOCK
+#ifdef CONFIG_BUFFER_HEAD
/* Returns true if all buffers are successfully locked */
static bool buffer_migrate_lock_buffers(struct buffer_head *head,
enum migrate_mode mode)
@@ -773,7 +773,7 @@ recheck_buffers:
bh = head;
do {
- set_bh_page(bh, &dst->page, bh_offset(bh));
+ folio_set_bh(bh, dst, bh_offset(bh));
bh = bh->b_this_page;
} while (bh != head);
@@ -837,7 +837,7 @@ int buffer_migrate_folio_norefs(struct address_space *mapping,
return __buffer_migrate_folio(mapping, dst, src, mode, true);
}
EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs);
-#endif
+#endif /* CONFIG_BUFFER_HEAD */
int filemap_migrate_folio(struct address_space *mapping,
struct folio *dst, struct folio *src, enum migrate_mode mode)
@@ -922,8 +922,7 @@ static int fallback_migrate_folio(struct address_space *mapping,
* Buffers may be managed in a filesystem specific way.
* We must have no buffers or drop them.
*/
- if (folio_test_private(src) &&
- !filemap_release_folio(src, GFP_KERNEL))
+ if (!filemap_release_folio(src, GFP_KERNEL))
return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
return migrate_folio(mapping, dst, src, mode);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index df280aa461e2..8ac1f79f754a 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -279,6 +279,7 @@ next:
static const struct mm_walk_ops migrate_vma_walk_ops = {
.pmd_entry = migrate_vma_collect_pmd,
.pte_hole = migrate_vma_collect_hole,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
@@ -658,7 +659,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
if (flush) {
flush_cache_page(vma, addr, pte_pfn(orig_pte));
- ptep_clear_flush_notify(vma, addr, ptep);
+ ptep_clear_flush(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, entry);
update_mmu_cache(vma, addr, ptep);
} else {
@@ -727,13 +728,22 @@ static void __migrate_device_pages(unsigned long *src_pfns,
if (is_device_private_page(newpage) ||
is_device_coherent_page(newpage)) {
- /*
- * For now only support anonymous memory migrating to
- * device private or coherent memory.
- */
if (mapping) {
- src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
+ struct folio *folio;
+
+ folio = page_folio(page);
+
+ /*
+ * For now only support anonymous memory migrating to
+ * device private or coherent memory.
+ *
+ * Try to get rid of swap cache if possible.
+ */
+ if (!folio_test_anon(folio) ||
+ !folio_free_swap(folio)) {
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
}
} else if (is_zone_device_page(newpage)) {
/*
@@ -754,13 +764,8 @@ static void __migrate_device_pages(unsigned long *src_pfns,
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
}
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
- * did already call it.
- */
if (notified)
- mmu_notifier_invalidate_range_only_end(&range);
+ mmu_notifier_invalidate_range_end(&range);
}
/**
diff --git a/mm/mincore.c b/mm/mincore.c
index b7f7a516b26c..dad3622cc963 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -176,6 +176,7 @@ static const struct mm_walk_ops mincore_walk_ops = {
.pmd_entry = mincore_pte_range,
.pte_hole = mincore_unmapped_range,
.hugetlb_entry = mincore_hugetlb,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
diff --git a/mm/mlock.c b/mm/mlock.c
index d7db94519884..06bdfab83b58 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -371,6 +371,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma,
{
static const struct mm_walk_ops mlock_walk_ops = {
.pmd_entry = mlock_pte_range,
+ .walk_lock = PGWALK_WRLOCK_VERIFY,
};
/*
@@ -386,6 +387,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma,
*/
if (newflags & VM_LOCKED)
newflags |= VM_IO;
+ vma_start_write(vma);
vm_flags_reset_once(vma, newflags);
lru_add_drain();
@@ -460,9 +462,9 @@ success:
* It's okay if try_to_unmap_one unmaps a page just after we
* set VM_LOCKED, populate_vma_page_range will bring it back.
*/
-
if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
/* No work to do, and mlocking twice would be wrong */
+ vma_start_write(vma);
vm_flags_reset(vma, newflags);
} else {
mlock_vma_pages_range(vma, start, end, newflags);
@@ -477,7 +479,6 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
{
unsigned long nstart, end, tmp;
struct vm_area_struct *vma, *prev;
- int error;
VMA_ITERATOR(vmi, current->mm, start);
VM_BUG_ON(offset_in_page(start));
@@ -498,6 +499,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
nstart = start;
tmp = vma->vm_start;
for_each_vma_range(vmi, vma, end) {
+ int error;
vm_flags_t newflags;
if (vma->vm_start != tmp)
@@ -511,14 +513,15 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
tmp = end;
error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
if (error)
- break;
+ return error;
+ tmp = vma_iter_end(&vmi);
nstart = tmp;
}
- if (vma_iter_end(&vmi) < end)
+ if (tmp < end)
return -ENOMEM;
- return error;
+ return 0;
}
/*
diff --git a/mm/mm_init.c b/mm/mm_init.c
index a1963c3322af..50f2f34745af 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -79,7 +79,7 @@ void __init mminit_verify_pageflags_layout(void)
int shift, width;
unsigned long or_mask, add_mask;
- shift = 8 * sizeof(unsigned long);
+ shift = BITS_PER_LONG;
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
@@ -154,7 +154,6 @@ early_param("mminit_loglevel", set_mminit_loglevel);
#endif /* CONFIG_DEBUG_MEMORY_INIT */
struct kobject *mm_kobj;
-EXPORT_SYMBOL_GPL(mm_kobj);
#ifdef CONFIG_SMP
s32 vm_committed_as_batch = 32;
@@ -377,6 +376,11 @@ static void __init find_zone_movable_pfns_for_nodes(void)
if (mirrored_kernelcore) {
bool mem_below_4gb_not_mirrored = false;
+ if (!memblock_has_mirror()) {
+ pr_warn("The system has no mirror memory, ignore kernelcore=mirror.\n");
+ goto out;
+ }
+
for_each_mem_region(r) {
if (memblock_is_mirror(r))
continue;
@@ -1020,7 +1024,7 @@ static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
if (!vmemmap_can_optimize(altmap, pgmap))
return pgmap_vmemmap_nr(pgmap);
- return 2 * (PAGE_SIZE / sizeof(struct page));
+ return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page));
}
static void __ref memmap_init_compound(struct page *head,
@@ -1105,7 +1109,6 @@ void __ref memmap_init_zone_device(struct zone *zone,
*/
static void __init adjust_zone_range_for_zone_movable(int nid,
unsigned long zone_type,
- unsigned long node_start_pfn,
unsigned long node_end_pfn,
unsigned long *zone_start_pfn,
unsigned long *zone_end_pfn)
@@ -1222,9 +1225,8 @@ static unsigned long __init zone_spanned_pages_in_node(int nid,
/* Get the start and end of the zone */
*zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
*zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
- adjust_zone_range_for_zone_movable(nid, zone_type,
- node_start_pfn, node_end_pfn,
- zone_start_pfn, zone_end_pfn);
+ adjust_zone_range_for_zone_movable(nid, zone_type, node_end_pfn,
+ zone_start_pfn, zone_end_pfn);
/* Check that this node has pages within the zone's required range */
if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
@@ -1424,9 +1426,9 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l
usemapsize = roundup(zonesize, pageblock_nr_pages);
usemapsize = usemapsize >> pageblock_order;
usemapsize *= NR_PAGEBLOCK_BITS;
- usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
+ usemapsize = roundup(usemapsize, BITS_PER_LONG);
- return usemapsize / 8;
+ return usemapsize / BITS_PER_BYTE;
}
static void __ref setup_usemap(struct zone *zone)
@@ -1681,8 +1683,7 @@ static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
*
* It returns the start and end page frame of a node based on information
* provided by memblock_set_node(). If called for a node
- * with no available memory, a warning is printed and the start and end
- * PFNs will be 0.
+ * with no available memory, the start and end PFNs will be 0.
*/
void __init get_pfn_range_for_nid(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn)
@@ -1737,7 +1738,7 @@ static void __init free_area_init_node(int nid)
}
/* Any regular or high memory on that node ? */
-static void check_for_memory(pg_data_t *pgdat)
+static void __init check_for_memory(pg_data_t *pgdat)
{
enum zone_type zone_type;
@@ -2490,15 +2491,7 @@ void *__init alloc_large_system_hash(const char *tablename,
else
numentries <<= (PAGE_SHIFT - scale);
- /* Make sure we've got at least a 0-order allocation.. */
- if (unlikely(flags & HASH_SMALL)) {
- /* Makes no sense without HASH_EARLY */
- WARN_ON(!(flags & HASH_EARLY));
- if (!(numentries >> *_hash_shift)) {
- numentries = 1UL << *_hash_shift;
- BUG_ON(!numentries);
- }
- } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+ if (unlikely((numentries * bucketsize) < PAGE_SIZE))
numentries = PAGE_SIZE / bucketsize;
}
numentries = roundup_pow_of_two(numentries);
@@ -2778,7 +2771,7 @@ void __init mm_core_init(void)
*/
page_ext_init_flatmem();
mem_debugging_and_hardening_init();
- kfence_alloc_pool();
+ kfence_alloc_pool_and_metadata();
report_meminit();
kmsan_init_shadow();
stack_depot_early_init();
diff --git a/mm/mmap.c b/mm/mmap.c
index 11dcf50cb933..b56a7f0c9f85 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -76,10 +76,10 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
-static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
+static void unmap_region(struct mm_struct *mm, struct ma_state *mas,
struct vm_area_struct *vma, struct vm_area_struct *prev,
struct vm_area_struct *next, unsigned long start,
- unsigned long end, bool mm_wr_locked);
+ unsigned long end, unsigned long tree_end, bool mm_wr_locked);
static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
{
@@ -154,18 +154,6 @@ static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
return mas_prev(&vmi->mas, min);
}
-static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
- unsigned long start, unsigned long end, gfp_t gfp)
-{
- vmi->mas.index = start;
- vmi->mas.last = end - 1;
- mas_store_gfp(&vmi->mas, NULL, gfp);
- if (unlikely(mas_is_err(&vmi->mas)))
- return -ENOMEM;
-
- return 0;
-}
-
/*
* check_brk_limits() - Use platform specific check of range & verify mlock
* limits.
@@ -409,17 +397,17 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
VMA_ITERATOR(vmi, mm, 0);
struct address_space *mapping = NULL;
- if (vma_iter_prealloc(&vmi))
+ vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
+ if (vma_iter_prealloc(&vmi, vma))
return -ENOMEM;
- if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
- i_mmap_lock_write(mapping);
- }
+ vma_start_write(vma);
vma_iter_store(&vmi, vma);
- if (mapping) {
+ if (vma->vm_file) {
+ mapping = vma->vm_file->f_mapping;
+ i_mmap_lock_write(mapping);
__vma_link_file(vma, mapping);
i_mmap_unlock_write(mapping);
}
@@ -474,15 +462,6 @@ static inline void init_vma_prep(struct vma_prepare *vp,
*/
static inline void vma_prepare(struct vma_prepare *vp)
{
- vma_start_write(vp->vma);
- if (vp->adj_next)
- vma_start_write(vp->adj_next);
- /* vp->insert is always a newly created VMA, no need for locking */
- if (vp->remove)
- vma_start_write(vp->remove);
- if (vp->remove2)
- vma_start_write(vp->remove2);
-
if (vp->file) {
uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
@@ -597,6 +576,7 @@ again:
}
if (vp->insert && vp->file)
uprobe_mmap(vp->insert);
+ validate_mm(mm);
}
/*
@@ -615,6 +595,7 @@ static inline int dup_anon_vma(struct vm_area_struct *dst,
* anon pages imported.
*/
if (src->anon_vma && !dst->anon_vma) {
+ vma_assert_write_locked(dst);
dst->anon_vma = src->anon_vma;
return anon_vma_clone(dst, src);
}
@@ -646,10 +627,12 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
bool remove_next = false;
struct vma_prepare vp;
+ vma_start_write(vma);
if (next && (vma != next) && (end == next->vm_end)) {
int ret;
remove_next = true;
+ vma_start_write(next);
ret = dup_anon_vma(vma, next);
if (ret)
return ret;
@@ -662,23 +645,19 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
/* Only handles expanding */
VM_WARN_ON(vma->vm_start < start || vma->vm_end > end);
- if (vma_iter_prealloc(vmi))
+ /* Note: vma iterator must be pointing to 'start' */
+ vma_iter_config(vmi, start, end);
+ if (vma_iter_prealloc(vmi, vma))
goto nomem;
vma_prepare(&vp);
vma_adjust_trans_huge(vma, start, end, 0);
- /* VMA iterator points to previous, so set to start if necessary */
- if (vma_iter_addr(vmi) != start)
- vma_iter_set(vmi, start);
-
vma->vm_start = start;
vma->vm_end = end;
vma->vm_pgoff = pgoff;
- /* Note: mas must be pointing to the expanding VMA */
vma_iter_store(vmi, vma);
vma_complete(&vp, vmi, vma->vm_mm);
- validate_mm(vma->vm_mm);
return 0;
nomem:
@@ -701,24 +680,25 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
WARN_ON((vma->vm_start != start) && (vma->vm_end != end));
- if (vma_iter_prealloc(vmi))
+ if (vma->vm_start < start)
+ vma_iter_config(vmi, vma->vm_start, start);
+ else
+ vma_iter_config(vmi, end, vma->vm_end);
+
+ if (vma_iter_prealloc(vmi, NULL))
return -ENOMEM;
+ vma_start_write(vma);
+
init_vma_prep(&vp, vma);
vma_prepare(&vp);
vma_adjust_trans_huge(vma, start, end, 0);
- if (vma->vm_start < start)
- vma_iter_clear(vmi, vma->vm_start, start);
-
- if (vma->vm_end > end)
- vma_iter_clear(vmi, end, vma->vm_end);
-
+ vma_iter_clear(vmi);
vma->vm_start = start;
vma->vm_end = end;
vma->vm_pgoff = pgoff;
vma_complete(&vp, vmi, vma->vm_mm);
- validate_mm(vma->vm_mm);
return 0;
}
@@ -891,7 +871,6 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
long adj_start = 0;
- validate_mm(mm);
/*
* We later require that vma->vm_flags == vm_flags,
* so this tests vma->vm_flags & VM_SPECIAL, too.
@@ -936,16 +915,21 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
if (!merge_prev && !merge_next)
return NULL; /* Not mergeable. */
+ if (merge_prev)
+ vma_start_write(prev);
+
res = vma = prev;
remove = remove2 = adjust = NULL;
/* Can we merge both the predecessor and the successor? */
if (merge_prev && merge_next &&
is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) {
+ vma_start_write(next);
remove = next; /* case 1 */
vma_end = next->vm_end;
err = dup_anon_vma(prev, next);
if (curr) { /* case 6 */
+ vma_start_write(curr);
remove = curr;
remove2 = next;
if (!next->anon_vma)
@@ -953,6 +937,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
}
} else if (merge_prev) { /* case 2 */
if (curr) {
+ vma_start_write(curr);
err = dup_anon_vma(prev, curr);
if (end == curr->vm_end) { /* case 7 */
remove = curr;
@@ -962,8 +947,10 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
}
}
} else { /* merge_next */
+ vma_start_write(next);
res = next;
if (prev && addr < prev->vm_end) { /* case 4 */
+ vma_start_write(prev);
vma_end = addr;
adjust = next;
adj_start = -(prev->vm_end - addr);
@@ -979,6 +966,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
vma_pgoff = next->vm_pgoff - pglen;
if (curr) { /* case 8 */
vma_pgoff = curr->vm_pgoff;
+ vma_start_write(curr);
remove = curr;
err = dup_anon_vma(next, curr);
}
@@ -989,7 +977,17 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
if (err)
return NULL;
- if (vma_iter_prealloc(vmi))
+ if (vma_start < vma->vm_start || vma_end > vma->vm_end)
+ vma_expanded = true;
+
+ if (vma_expanded) {
+ vma_iter_config(vmi, vma_start, vma_end);
+ } else {
+ vma_iter_config(vmi, adjust->vm_start + adj_start,
+ adjust->vm_end);
+ }
+
+ if (vma_iter_prealloc(vmi, vma))
return NULL;
init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
@@ -998,8 +996,6 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
vma_prepare(&vp);
vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
- if (vma_start < vma->vm_start || vma_end > vma->vm_end)
- vma_expanded = true;
vma->vm_start = vma_start;
vma->vm_end = vma_end;
@@ -1018,10 +1014,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
}
vma_complete(&vp, vmi, mm);
- vma_iter_free(vmi);
- validate_mm(mm);
khugepaged_enter_vma(res, vm_flags);
-
return res;
}
@@ -1196,7 +1189,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
struct mm_struct *mm = current->mm;
int pkey = 0;
- validate_mm(mm);
*populate = 0;
if (!len)
@@ -1943,7 +1935,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
struct vm_area_struct *next;
unsigned long gap_addr;
int error = 0;
- MA_STATE(mas, &mm->mm_mt, 0, 0);
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, address);
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
@@ -1968,7 +1960,11 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/* Check that both stack segments have the same anon_vma? */
}
- if (mas_preallocate(&mas, GFP_KERNEL))
+ if (next)
+ mas_prev_range(&mas, address);
+
+ __mas_set_range(&mas, vma->vm_start, address - 1);
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
return -ENOMEM;
/* We must make sure the anon_vma is allocated. */
@@ -2013,7 +2009,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
/* Overwrite old entry in mtree. */
- mas_set_range(&mas, vma->vm_start, address - 1);
mas_store_prealloc(&mas, vma);
anon_vma_interval_tree_post_update_vma(vma);
spin_unlock(&mm->page_table_lock);
@@ -2025,6 +2020,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma(vma, vma->vm_flags);
mas_destroy(&mas);
+ validate_mm(mm);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -2057,7 +2053,11 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
return -ENOMEM;
}
- if (mas_preallocate(&mas, GFP_KERNEL))
+ if (prev)
+ mas_next_range(&mas, vma->vm_start);
+
+ __mas_set_range(&mas, address, vma->vm_end - 1);
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
return -ENOMEM;
/* We must make sure the anon_vma is allocated. */
@@ -2103,7 +2103,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
vma->vm_start = address;
vma->vm_pgoff -= grow;
/* Overwrite old entry in mtree. */
- mas_set_range(&mas, address, vma->vm_end - 1);
mas_store_prealloc(&mas, vma);
anon_vma_interval_tree_post_update_vma(vma);
spin_unlock(&mm->page_table_lock);
@@ -2115,6 +2114,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma(vma, vma->vm_flags);
mas_destroy(&mas);
+ validate_mm(mm);
return error;
}
@@ -2292,7 +2292,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
remove_vma(vma, false);
}
vm_unacct_memory(nr_accounted);
- validate_mm(mm);
}
/*
@@ -2300,18 +2299,20 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
*
* Called with the mm semaphore held.
*/
-static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
+static void unmap_region(struct mm_struct *mm, struct ma_state *mas,
struct vm_area_struct *vma, struct vm_area_struct *prev,
- struct vm_area_struct *next,
- unsigned long start, unsigned long end, bool mm_wr_locked)
+ struct vm_area_struct *next, unsigned long start,
+ unsigned long end, unsigned long tree_end, bool mm_wr_locked)
{
struct mmu_gather tlb;
+ unsigned long mt_start = mas->index;
lru_add_drain();
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
- unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked);
- free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+ unmap_vmas(&tlb, mas, vma, start, end, tree_end, mm_wr_locked);
+ mas_set(mas, mt_start);
+ free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING,
mm_wr_locked);
tlb_finish_mmu(&tlb);
@@ -2329,8 +2330,6 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct vm_area_struct *new;
int err;
- validate_mm(vma->vm_mm);
-
WARN_ON(vma->vm_start >= addr);
WARN_ON(vma->vm_end <= addr);
@@ -2344,10 +2343,6 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (!new)
return -ENOMEM;
- err = -ENOMEM;
- if (vma_iter_prealloc(vmi))
- goto out_free_vma;
-
if (new_below) {
new->vm_end = addr;
} else {
@@ -2355,6 +2350,11 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
}
+ err = -ENOMEM;
+ vma_iter_config(vmi, new->vm_start, new->vm_end);
+ if (vma_iter_prealloc(vmi, new))
+ goto out_free_vma;
+
err = vma_dup_policy(vma, new);
if (err)
goto out_free_vmi;
@@ -2369,6 +2369,9 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
+ vma_start_write(vma);
+ vma_start_write(new);
+
init_vma_prep(&vp, vma);
vp.insert = new;
vma_prepare(&vp);
@@ -2387,7 +2390,6 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
/* Success. */
if (new_below)
vma_next(vmi);
- validate_mm(vma->vm_mm);
return 0;
out_free_mpol:
@@ -2396,7 +2398,6 @@ out_free_vmi:
vma_iter_free(vmi);
out_free_vma:
vm_area_free(new);
- validate_mm(vma->vm_mm);
return err;
}
@@ -2439,7 +2440,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long locked_vm = 0;
MA_STATE(mas_detach, &mt_detach, 0, 0);
mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
- mt_set_external_lock(&mt_detach, &mm->mmap_lock);
+ mt_on_stack(mt_detach);
/*
* If we need to split any vma, do it now to save pain later.
@@ -2460,22 +2461,17 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
goto map_count_exceeded;
- error = __split_vma(vmi, vma, start, 0);
+ error = __split_vma(vmi, vma, start, 1);
if (error)
goto start_split_failed;
-
- vma = vma_iter_load(vmi);
}
- prev = vma_prev(vmi);
- if (unlikely((!prev)))
- vma_iter_set(vmi, start);
-
/*
* Detach a range of VMAs from the mm. Using next as a temp variable as
* it is always overwritten.
*/
- for_each_vma_range(*vmi, next, end) {
+ next = vma;
+ do {
/* Does it split the end? */
if (next->vm_end > end) {
error = __split_vma(vmi, next, end, 0);
@@ -2483,7 +2479,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
goto end_split_failed;
}
vma_start_write(next);
- mas_set_range(&mas_detach, next->vm_start, next->vm_end - 1);
+ mas_set(&mas_detach, count);
error = mas_store_gfp(&mas_detach, next, GFP_KERNEL);
if (error)
goto munmap_gather_failed;
@@ -2511,34 +2507,31 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
BUG_ON(next->vm_start < start);
BUG_ON(next->vm_start > end);
#endif
- }
-
- if (vma_iter_end(vmi) > end)
- next = vma_iter_load(vmi);
-
- if (!next)
- next = vma_next(vmi);
+ } for_each_vma_range(*vmi, next, end);
#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
/* Make sure no VMAs are about to be lost. */
{
- MA_STATE(test, &mt_detach, start, end - 1);
+ MA_STATE(test, &mt_detach, 0, 0);
struct vm_area_struct *vma_mas, *vma_test;
int test_count = 0;
vma_iter_set(vmi, start);
rcu_read_lock();
- vma_test = mas_find(&test, end - 1);
+ vma_test = mas_find(&test, count - 1);
for_each_vma_range(*vmi, vma_mas, end) {
BUG_ON(vma_mas != vma_test);
test_count++;
- vma_test = mas_next(&test, end - 1);
+ vma_test = mas_next(&test, count - 1);
}
rcu_read_unlock();
BUG_ON(count != test_count);
}
#endif
- vma_iter_set(vmi, start);
+
+ while (vma_iter_addr(vmi) > start)
+ vma_iter_prev_range(vmi);
+
error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
if (error)
goto clear_tree_failed;
@@ -2549,19 +2542,26 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (unlock)
mmap_write_downgrade(mm);
+ prev = vma_iter_prev_range(vmi);
+ next = vma_next(vmi);
+ if (next)
+ vma_iter_prev_range(vmi);
+
/*
* We can free page tables without write-locking mmap_lock because VMAs
* were isolated before we downgraded mmap_lock.
*/
- unmap_region(mm, &mt_detach, vma, prev, next, start, end, !unlock);
+ mas_set(&mas_detach, 1);
+ unmap_region(mm, &mas_detach, vma, prev, next, start, end, count,
+ !unlock);
/* Statistics and freeing VMAs */
- mas_set(&mas_detach, start);
+ mas_set(&mas_detach, 0);
remove_mt(mm, &mas_detach);
- __mt_destroy(&mt_detach);
validate_mm(mm);
if (unlock)
mmap_read_unlock(mm);
+ __mt_destroy(&mt_detach);
return 0;
clear_tree_failed:
@@ -2685,8 +2685,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
next = vma_next(&vmi);
prev = vma_prev(&vmi);
- if (vm_flags & VM_SPECIAL)
+ if (vm_flags & VM_SPECIAL) {
+ if (prev)
+ vma_iter_next_range(&vmi);
goto cannot_expand;
+ }
/* Attempt to expand an old mapping */
/* Check next */
@@ -2707,9 +2710,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
merge_start = prev->vm_start;
vma = prev;
vm_pgoff = prev->vm_pgoff;
+ } else if (prev) {
+ vma_iter_next_range(&vmi);
}
-
/* Actually expand, if possible */
if (vma &&
!vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) {
@@ -2717,9 +2721,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
goto expanded;
}
+ if (vma == prev)
+ vma_iter_set(&vmi, addr);
cannot_expand:
- if (prev)
- vma_iter_next_range(&vmi);
/*
* Determine the object being mapped and call the appropriate
@@ -2732,7 +2736,7 @@ cannot_expand:
goto unacct_error;
}
- vma_iter_set(&vmi, addr);
+ vma_iter_config(&vmi, addr, end);
vma->vm_start = addr;
vma->vm_end = end;
vm_flags_init(vma, vm_flags);
@@ -2759,7 +2763,7 @@ cannot_expand:
if (WARN_ON((addr != vma->vm_start)))
goto close_and_free_vma;
- vma_iter_set(&vmi, addr);
+ vma_iter_config(&vmi, addr, end);
/*
* If vm_flags changed after call_mmap(), we should try merge
* vma again as we may succeed this time.
@@ -2806,17 +2810,15 @@ cannot_expand:
goto close_and_free_vma;
error = -ENOMEM;
- if (vma_iter_prealloc(&vmi))
+ if (vma_iter_prealloc(&vmi, vma))
goto close_and_free_vma;
/* Lock the VMA since it is modified after insertion into VMA tree */
vma_start_write(vma);
- if (vma->vm_file)
- i_mmap_lock_write(vma->vm_file->f_mapping);
-
vma_iter_store(&vmi, vma);
mm->map_count++;
if (vma->vm_file) {
+ i_mmap_lock_write(vma->vm_file->f_mapping);
if (vma->vm_flags & VM_SHARED)
mapping_allow_writable(vma->vm_file->f_mapping);
@@ -2877,9 +2879,10 @@ unmap_and_free_vma:
fput(vma->vm_file);
vma->vm_file = NULL;
+ vma_iter_set(&vmi, vma->vm_end);
/* Undo any partial mapping done by a device driver. */
- unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start,
- vma->vm_end, true);
+ unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
+ vma->vm_end, vma->vm_end, true);
}
if (file && (vm_flags & VM_SHARED))
mapping_unmap_writable(file->f_mapping);
@@ -3049,7 +3052,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct mm_struct *mm = current->mm;
struct vma_prepare vp;
- validate_mm(mm);
/*
* Check against address space limits by the changed size
* Note: This happens *after* clearing old mappings in some code paths.
@@ -3071,9 +3073,12 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (vma && vma->vm_end == addr && !vma_policy(vma) &&
can_vma_merge_after(vma, flags, NULL, NULL,
addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) {
- if (vma_iter_prealloc(vmi))
+ vma_iter_config(vmi, vma->vm_start, addr + len);
+ if (vma_iter_prealloc(vmi, vma))
goto unacct_fail;
+ vma_start_write(vma);
+
init_vma_prep(&vp, vma);
vma_prepare(&vp);
vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
@@ -3086,6 +3091,8 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
goto out;
}
+ if (vma)
+ vma_iter_next_range(vmi);
/* create a vma struct for an anonymous mapping */
vma = vm_area_alloc(mm);
if (!vma)
@@ -3097,10 +3104,12 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
vma->vm_pgoff = addr >> PAGE_SHIFT;
vm_flags_init(vma, flags);
vma->vm_page_prot = vm_get_page_prot(flags);
+ vma_start_write(vma);
if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
goto mas_store_fail;
mm->map_count++;
+ validate_mm(mm);
ksm_add_vma(vma);
out:
perf_event_mmap(vma);
@@ -3109,7 +3118,6 @@ out:
if (flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
vm_flags_set(vma, VM_SOFTDIRTY);
- validate_mm(mm);
return 0;
mas_store_fail:
@@ -3199,7 +3207,7 @@ void exit_mmap(struct mm_struct *mm)
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
- unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX, false);
+ unmap_vmas(&tlb, &mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
mmap_read_unlock(mm);
/*
@@ -3209,7 +3217,8 @@ void exit_mmap(struct mm_struct *mm)
set_bit(MMF_OOM_SKIP, &mm->flags);
mmap_write_lock(mm);
mt_clear_in_rcu(&mm->mm_mt);
- free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
+ mas_set(&mas, vma->vm_end);
+ free_pgtables(&tlb, &mas, vma, FIRST_USER_ADDRESS,
USER_PGTABLES_CEILING, true);
tlb_finish_mmu(&tlb);
@@ -3218,6 +3227,7 @@ void exit_mmap(struct mm_struct *mm)
* enabled, without holding any MM locks besides the unreachable
* mmap_write_lock.
*/
+ mas_set(&mas, vma->vm_end);
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
@@ -3290,7 +3300,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
bool faulted_in_anon_vma = true;
VMA_ITERATOR(vmi, mm, addr);
- validate_mm(mm);
/*
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
@@ -3344,12 +3353,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
- vma_start_write(new_vma);
if (vma_link(mm, new_vma))
goto out_vma_link;
*need_rmap_locks = false;
}
- validate_mm(mm);
return new_vma;
out_vma_link:
@@ -3365,7 +3372,6 @@ out_free_mempol:
out_free_vma:
vm_area_free(new_vma);
out:
- validate_mm(mm);
return NULL;
}
@@ -3502,7 +3508,6 @@ static struct vm_area_struct *__install_special_mapping(
int ret;
struct vm_area_struct *vma;
- validate_mm(mm);
vma = vm_area_alloc(mm);
if (unlikely(vma == NULL))
return ERR_PTR(-ENOMEM);
@@ -3525,12 +3530,10 @@ static struct vm_area_struct *__install_special_mapping(
perf_event_mmap(vma);
- validate_mm(mm);
return vma;
out:
vm_area_free(vma);
- validate_mm(mm);
return ERR_PTR(ret);
}
@@ -3662,6 +3665,12 @@ int mm_take_all_locks(struct mm_struct *mm)
mutex_lock(&mm_all_locks_mutex);
+ /*
+ * vma_start_write() does not have a complement in mm_drop_all_locks()
+ * because vma_start_write() is always asymmetrical; it marks a VMA as
+ * being written to until mmap_write_unlock() or mmap_write_downgrade()
+ * is reached.
+ */
mas_for_each(&mas, vma, ULONG_MAX) {
if (signal_pending(current))
goto out_unlock;
@@ -3758,7 +3767,6 @@ void mm_drop_all_locks(struct mm_struct *mm)
if (vma->vm_file && vma->vm_file->f_mapping)
vm_unlock_mapping(vma->vm_file->f_mapping);
}
- vma_end_write_all(mm);
mutex_unlock(&mm_all_locks_mutex);
}
@@ -3788,7 +3796,7 @@ static int init_user_reserve(void)
{
unsigned long free_kbytes;
- free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
return 0;
@@ -3809,7 +3817,7 @@ static int init_admin_reserve(void)
{
unsigned long free_kbytes;
- free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
return 0;
@@ -3853,7 +3861,7 @@ static int reserve_mem_notifier(struct notifier_block *nb,
break;
case MEM_OFFLINE:
- free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
if (sysctl_user_reserve_kbytes > free_kbytes) {
init_user_reserve();
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index ea9683e12936..4f559f4ddd21 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -63,6 +63,7 @@ static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_
/**
* tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB
* @tlb: the current mmu_gather
+ * @vma: The memory area from which the pages are being removed.
*
* Note that because of how tlb_next_batch() above works, we will
* never start multiple new batches with pending delayed rmaps, so
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 50c0dde1354f..ec3b068cbbe6 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -199,7 +199,7 @@ mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub)
* invalidate_start/end and is colliding.
*
* The locking looks broadly like this:
- * mn_tree_invalidate_start(): mmu_interval_read_begin():
+ * mn_itree_inv_start(): mmu_interval_read_begin():
* spin_lock
* seq = READ_ONCE(interval_sub->invalidate_seq);
* seq == subs->invalidate_seq
@@ -207,7 +207,7 @@ mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub)
* spin_lock
* seq = ++subscriptions->invalidate_seq
* spin_unlock
- * op->invalidate_range():
+ * op->invalidate():
* user_lock
* mmu_interval_set_seq()
* interval_sub->invalidate_seq = seq
@@ -551,7 +551,7 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
static void
mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions,
- struct mmu_notifier_range *range, bool only_end)
+ struct mmu_notifier_range *range)
{
struct mmu_notifier *subscription;
int id;
@@ -559,24 +559,6 @@ mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions,
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
srcu_read_lock_held(&srcu)) {
- /*
- * Call invalidate_range here too to avoid the need for the
- * subsystem of having to register an invalidate_range_end
- * call-back when there is invalidate_range already. Usually a
- * subsystem registers either invalidate_range_start()/end() or
- * invalidate_range(), so this will be no additional overhead
- * (besides the pointer check).
- *
- * We skip call to invalidate_range() if we know it is safe ie
- * call site use mmu_notifier_invalidate_range_only_end() which
- * is safe to do when we know that a call to invalidate_range()
- * already happen under page table lock.
- */
- if (!only_end && subscription->ops->invalidate_range)
- subscription->ops->invalidate_range(subscription,
- range->mm,
- range->start,
- range->end);
if (subscription->ops->invalidate_range_end) {
if (!mmu_notifier_range_blockable(range))
non_block_start();
@@ -589,8 +571,7 @@ mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions,
srcu_read_unlock(&srcu, id);
}
-void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
- bool only_end)
+void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
struct mmu_notifier_subscriptions *subscriptions =
range->mm->notifier_subscriptions;
@@ -600,12 +581,12 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
mn_itree_inv_end(subscriptions);
if (!hlist_empty(&subscriptions->list))
- mn_hlist_invalidate_end(subscriptions, range, only_end);
+ mn_hlist_invalidate_end(subscriptions, range);
lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}
-void __mmu_notifier_invalidate_range(struct mm_struct *mm,
- unsigned long start, unsigned long end)
+void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
struct mmu_notifier *subscription;
int id;
@@ -614,9 +595,10 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
hlist_for_each_entry_rcu(subscription,
&mm->notifier_subscriptions->list, hlist,
srcu_read_lock_held(&srcu)) {
- if (subscription->ops->invalidate_range)
- subscription->ops->invalidate_range(subscription, mm,
- start, end);
+ if (subscription->ops->arch_invalidate_secondary_tlbs)
+ subscription->ops->arch_invalidate_secondary_tlbs(
+ subscription, mm,
+ start, end);
}
srcu_read_unlock(&srcu, id);
}
@@ -635,6 +617,16 @@ int __mmu_notifier_register(struct mmu_notifier *subscription,
mmap_assert_write_locked(mm);
BUG_ON(atomic_read(&mm->mm_users) <= 0);
+ /*
+ * Subsystems should only register for invalidate_secondary_tlbs() or
+ * invalidate_range_start()/end() callbacks, not both.
+ */
+ if (WARN_ON_ONCE(subscription &&
+ (subscription->ops->arch_invalidate_secondary_tlbs &&
+ (subscription->ops->invalidate_range_start ||
+ subscription->ops->invalidate_range_end))))
+ return -EINVAL;
+
if (!mm->notifier_subscriptions) {
/*
* kmalloc cannot be called under mm_take_all_locks(), but we
diff --git a/mm/mprotect.c b/mm/mprotect.c
index b342e0196e01..b94fbb45d5c7 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -213,7 +213,7 @@ static long change_pte_range(struct mmu_gather *tlb,
} else if (is_writable_device_private_entry(entry)) {
/*
* We do not preserve soft-dirtiness. See
- * copy_one_pte() for explanation.
+ * copy_nonpresent_pte() for explanation.
*/
entry = make_readable_device_private_entry(
swp_offset(entry));
@@ -230,10 +230,10 @@ static long change_pte_range(struct mmu_gather *tlb,
newpte = pte_swp_mkuffd_wp(newpte);
} else if (is_pte_marker_entry(entry)) {
/*
- * Ignore swapin errors unconditionally,
+ * Ignore error swap entries unconditionally,
* because any access should sigbus anyway.
*/
- if (is_swapin_error_entry(entry))
+ if (is_poisoned_swp_entry(entry))
continue;
/*
* If this is uffd-wp pte marker and we'd like
@@ -568,6 +568,7 @@ static const struct mm_walk_ops prot_none_walk_ops = {
.pte_entry = prot_none_pte_entry,
.hugetlb_entry = prot_none_hugetlb_entry,
.test_walk = prot_none_test,
+ .walk_lock = PGWALK_WRLOCK,
};
int
@@ -656,6 +657,7 @@ success:
* vm_flags and vm_page_prot are protected by the mmap_lock
* held in write mode.
*/
+ vma_start_write(vma);
vm_flags_reset(vma, newflags);
if (vma_wants_manual_pte_write_upgrade(vma))
mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
diff --git a/mm/mremap.c b/mm/mremap.c
index 11e06e4ab33b..056478c106ee 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -349,7 +349,7 @@ static inline bool move_normal_pud(struct vm_area_struct *vma,
}
#endif
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
{
diff --git a/mm/nommu.c b/mm/nommu.c
index fe19614b9c19..7f9e9e5a0e12 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -583,7 +583,8 @@ static int delete_vma_from_mm(struct vm_area_struct *vma)
{
VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start);
- if (vma_iter_prealloc(&vmi)) {
+ vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
+ if (vma_iter_prealloc(&vmi, vma)) {
pr_warn("Allocation of vma tree for process %d failed\n",
current->pid);
return -ENOMEM;
@@ -591,7 +592,7 @@ static int delete_vma_from_mm(struct vm_area_struct *vma)
cleanup_vma_from_mm(vma);
/* remove from the MM's tree and list */
- vma_iter_clear(&vmi, vma->vm_start, vma->vm_end);
+ vma_iter_clear(&vmi);
return 0;
}
/*
@@ -1003,7 +1004,7 @@ error_free:
enomem:
pr_err("Allocation of length %lu from process %d (%s) failed\n",
len, current->pid, current->comm);
- show_free_areas(0, NULL);
+ show_mem();
return -ENOMEM;
}
@@ -1054,9 +1055,6 @@ unsigned long do_mmap(struct file *file,
if (!vma)
goto error_getting_vma;
- if (vma_iter_prealloc(&vmi))
- goto error_vma_iter_prealloc;
-
region->vm_usage = 1;
region->vm_flags = vm_flags;
region->vm_pgoff = pgoff;
@@ -1198,6 +1196,10 @@ unsigned long do_mmap(struct file *file,
share:
BUG_ON(!vma->vm_region);
+ vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
+ if (vma_iter_prealloc(&vmi, vma))
+ goto error_just_free;
+
setup_vma_to_mm(vma, current->mm);
current->mm->map_count++;
/* add the VMA to the tree */
@@ -1236,22 +1238,14 @@ error_getting_vma:
kmem_cache_free(vm_region_jar, region);
pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
len, current->pid);
- show_free_areas(0, NULL);
+ show_mem();
return -ENOMEM;
error_getting_region:
pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
len, current->pid);
- show_free_areas(0, NULL);
- return -ENOMEM;
-
-error_vma_iter_prealloc:
- kmem_cache_free(vm_region_jar, region);
- vm_area_free(vma);
- pr_warn("Allocation of vma tree for process %d failed\n", current->pid);
- show_free_areas(0, NULL);
+ show_mem();
return -ENOMEM;
-
}
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
@@ -1336,12 +1330,6 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (!new)
goto err_vma_dup;
- if (vma_iter_prealloc(vmi)) {
- pr_warn("Allocation of vma tree for process %d failed\n",
- current->pid);
- goto err_vmi_preallocate;
- }
-
/* most fields are the same, copy all, and then fixup */
*region = *vma->vm_region;
new->vm_region = region;
@@ -1355,6 +1343,13 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
region->vm_pgoff = new->vm_pgoff += npages;
}
+ vma_iter_config(vmi, new->vm_start, new->vm_end);
+ if (vma_iter_prealloc(vmi, vma)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ goto err_vmi_preallocate;
+ }
+
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
@@ -1396,17 +1391,13 @@ static int vmi_shrink_vma(struct vma_iterator *vmi,
/* adjust the VMA's pointers, which may reposition it in the MM's tree
* and list */
- if (vma_iter_prealloc(vmi)) {
- pr_warn("Allocation of vma tree for process %d failed\n",
- current->pid);
- return -ENOMEM;
- }
-
if (from > vma->vm_start) {
- vma_iter_clear(vmi, from, vma->vm_end);
+ if (vma_iter_clear_gfp(vmi, from, vma->vm_end, GFP_KERNEL))
+ return -ENOMEM;
vma->vm_end = from;
} else {
- vma_iter_clear(vmi, vma->vm_start, to);
+ if (vma_iter_clear_gfp(vmi, vma->vm_start, to, GFP_KERNEL))
+ return -ENOMEM;
vma->vm_start = to;
}
@@ -1809,7 +1800,7 @@ static int __meminit init_user_reserve(void)
{
unsigned long free_kbytes;
- free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
return 0;
@@ -1830,7 +1821,7 @@ static int __meminit init_admin_reserve(void)
{
unsigned long free_kbytes;
- free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
return 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 612b5597d3af..44bde56ecd02 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -479,8 +479,6 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
static bool oom_killer_disabled __read_mostly;
-#define K(x) ((x) << (PAGE_SHIFT-10))
-
/*
* task->mm can be NULL if the task is the exited group leader. So to
* determine whether the task is using a particular mm, we examine all the
@@ -994,7 +992,6 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
mmdrop(mm);
put_task_struct(victim);
}
-#undef K
/*
* Kill provided task unless it's secured by setting
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d3f42009bb70..b8d3d7040a50 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1193,7 +1193,7 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb,
* write_bandwidth = ---------------------------------------------------
* period
*
- * @written may have decreased due to folio_account_redirty().
+ * @written may have decreased due to folio_redirty_for_writepage().
* Avoid underflowing @bw calculation.
*/
bw = written - min(written, wb->written_stamp);
@@ -2712,37 +2712,6 @@ bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio)
EXPORT_SYMBOL(filemap_dirty_folio);
/**
- * folio_account_redirty - Manually account for redirtying a page.
- * @folio: The folio which is being redirtied.
- *
- * Most filesystems should call folio_redirty_for_writepage() instead
- * of this fuction. If your filesystem is doing writeback outside the
- * context of a writeback_control(), it can call this when redirtying
- * a folio, to de-account the dirty counters (NR_DIRTIED, WB_DIRTIED,
- * tsk->nr_dirtied), so that they match the written counters (NR_WRITTEN,
- * WB_WRITTEN) in long term. The mismatches will lead to systematic errors
- * in balanced_dirty_ratelimit and the dirty pages position control.
- */
-void folio_account_redirty(struct folio *folio)
-{
- struct address_space *mapping = folio->mapping;
-
- if (mapping && mapping_can_writeback(mapping)) {
- struct inode *inode = mapping->host;
- struct bdi_writeback *wb;
- struct wb_lock_cookie cookie = {};
- long nr = folio_nr_pages(folio);
-
- wb = unlocked_inode_to_wb_begin(inode, &cookie);
- current->nr_dirtied -= nr;
- node_stat_mod_folio(folio, NR_DIRTIED, -nr);
- wb_stat_mod(wb, WB_DIRTIED, -nr);
- unlocked_inode_to_wb_end(inode, &cookie);
- }
-}
-EXPORT_SYMBOL(folio_account_redirty);
-
-/**
* folio_redirty_for_writepage - Decline to write a dirty folio.
* @wbc: The writeback control.
* @folio: The folio.
@@ -2757,13 +2726,23 @@ EXPORT_SYMBOL(folio_account_redirty);
bool folio_redirty_for_writepage(struct writeback_control *wbc,
struct folio *folio)
{
- bool ret;
+ struct address_space *mapping = folio->mapping;
long nr = folio_nr_pages(folio);
+ bool ret;
wbc->pages_skipped += nr;
- ret = filemap_dirty_folio(folio->mapping, folio);
- folio_account_redirty(folio);
+ ret = filemap_dirty_folio(mapping, folio);
+ if (mapping && mapping_can_writeback(mapping)) {
+ struct inode *inode = mapping->host;
+ struct bdi_writeback *wb;
+ struct wb_lock_cookie cookie = {};
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
+ current->nr_dirtied -= nr;
+ node_stat_mod_folio(folio, NR_DIRTIED, -nr);
+ wb_stat_mod(wb, WB_DIRTIED, -nr);
+ unlocked_inode_to_wb_end(inode, &cookie);
+ }
return ret;
}
EXPORT_SYMBOL(folio_redirty_for_writepage);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7d3460c7a480..452459836b71 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -284,17 +284,6 @@ const char * const migratetype_names[MIGRATE_TYPES] = {
#endif
};
-static compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
- [NULL_COMPOUND_DTOR] = NULL,
- [COMPOUND_PAGE_DTOR] = free_compound_page,
-#ifdef CONFIG_HUGETLB_PAGE
- [HUGETLB_PAGE_DTOR] = free_huge_page,
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
-#endif
-};
-
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
static int watermark_boost_factor __read_mostly = 15000;
@@ -371,10 +360,16 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
-static __always_inline
-unsigned long __get_pfnblock_flags_mask(const struct page *page,
- unsigned long pfn,
- unsigned long mask)
+/**
+ * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @mask: mask of bits that the caller is interested in
+ *
+ * Return: pageblock_bits flags
+ */
+unsigned long get_pfnblock_flags_mask(const struct page *page,
+ unsigned long pfn, unsigned long mask)
{
unsigned long *bitmap;
unsigned long bitidx, word_bitidx;
@@ -393,24 +388,10 @@ unsigned long __get_pfnblock_flags_mask(const struct page *page,
return (word >> bitidx) & mask;
}
-/**
- * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @pfn: The target page frame number
- * @mask: mask of bits that the caller is interested in
- *
- * Return: pageblock_bits flags
- */
-unsigned long get_pfnblock_flags_mask(const struct page *page,
- unsigned long pfn, unsigned long mask)
-{
- return __get_pfnblock_flags_mask(page, pfn, mask);
-}
-
static __always_inline int get_pfnblock_migratetype(const struct page *page,
unsigned long pfn)
{
- return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
+ return get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
}
/**
@@ -459,7 +440,7 @@ void set_pageblock_migratetype(struct page *page, int migratetype)
#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
- int ret = 0;
+ int ret;
unsigned seq;
unsigned long pfn = page_to_pfn(page);
unsigned long sp, start_pfn;
@@ -468,8 +449,7 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
seq = zone_span_seqbegin(zone);
start_pfn = zone->zone_start_pfn;
sp = zone->spanned_pages;
- if (!zone_spans_pfn(zone, pfn))
- ret = 1;
+ ret = !zone_spans_pfn(zone, pfn);
} while (zone_span_seqretry(zone, seq));
if (ret)
@@ -539,8 +519,6 @@ out:
static inline unsigned int order_to_pindex(int migratetype, int order)
{
- int base = order;
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (order > PAGE_ALLOC_COSTLY_ORDER) {
VM_BUG_ON(order != pageblock_order);
@@ -550,7 +528,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order)
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif
- return (MIGRATE_PCPTYPES * base) + migratetype;
+ return (MIGRATE_PCPTYPES * order) + migratetype;
}
static inline int pindex_to_order(unsigned int pindex)
@@ -594,19 +572,10 @@ static inline void free_the_page(struct page *page, unsigned int order)
* The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
* in bit 0 of page->compound_head. The rest of bits is pointer to head page.
*
- * The first tail page's ->compound_dtor holds the offset in array of compound
- * page destructors. See compound_page_dtors.
- *
* The first tail page's ->compound_order holds the order of allocation.
* This usage means that zero-order pages may not be compound.
*/
-void free_compound_page(struct page *page)
-{
- mem_cgroup_uncharge(page_folio(page));
- free_the_page(page, compound_order(page));
-}
-
void prep_compound_page(struct page *page, unsigned int order)
{
int i;
@@ -621,10 +590,16 @@ void prep_compound_page(struct page *page, unsigned int order)
void destroy_large_folio(struct folio *folio)
{
- enum compound_dtor_id dtor = folio->_folio_dtor;
+ if (folio_test_hugetlb(folio)) {
+ free_huge_folio(folio);
+ return;
+ }
+
+ if (folio_test_large_rmappable(folio))
+ folio_undo_large_rmappable(folio);
- VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio);
- compound_page_dtors[dtor](&folio->page);
+ mem_cgroup_uncharge(folio);
+ free_the_page(&folio->page, folio_order(folio));
}
static inline void set_buddy_order(struct page *page, unsigned int order)
@@ -824,7 +799,7 @@ static inline void __free_one_page(struct page *page,
* pageblock isolation could cause incorrect freepage or CMA
* accounting or HIGHATOMIC accounting.
*/
- int buddy_mt = get_pageblock_migratetype(buddy);
+ int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn);
if (migratetype != buddy_mt
&& (!migratetype_is_mergeable(migratetype) ||
@@ -900,7 +875,7 @@ int split_free_page(struct page *free_page,
goto out;
}
- mt = get_pageblock_migratetype(free_page);
+ mt = get_pfnblock_migratetype(free_page, free_page_pfn);
if (likely(!is_migrate_isolate(mt)))
__mod_zone_freepage_state(zone, -(1UL << order), mt);
@@ -1132,7 +1107,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
if (compound)
- ClearPageHasHWPoisoned(page);
+ page[1].flags &= ~PAGE_FLAGS_SECOND;
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_page_prepare(page, page + i);
@@ -1210,8 +1185,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
int pindex)
{
unsigned long flags;
- int min_pindex = 0;
- int max_pindex = NR_PCP_LISTS - 1;
unsigned int order;
bool isolated_pageblocks;
struct page *page;
@@ -1234,17 +1207,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* Remove pages from lists in a round-robin fashion. */
do {
- if (++pindex > max_pindex)
- pindex = min_pindex;
+ if (++pindex > NR_PCP_LISTS - 1)
+ pindex = 0;
list = &pcp->lists[pindex];
- if (!list_empty(list))
- break;
-
- if (pindex == max_pindex)
- max_pindex--;
- if (pindex == min_pindex)
- min_pindex++;
- } while (1);
+ } while (list_empty(list));
order = pindex_to_order(pindex);
nr_pages = 1 << order;
@@ -1834,6 +1800,10 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
free_pages = move_freepages_block(zone, page, start_type,
&movable_pages);
+ /* moving whole block can fail due to zone boundary conditions */
+ if (!free_pages)
+ goto single_page;
+
/*
* Determine how many pages are compatible with our allocation.
* For movable allocation, it's the number of movable pages which
@@ -1855,14 +1825,9 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
else
alike_pages = 0;
}
-
- /* moving whole block can fail due to zone boundary conditions */
- if (!free_pages)
- goto single_page;
-
/*
* If a sufficient number of pages in the block are either free or of
- * comparable migratability as our allocation, claim the whole block.
+ * compatible migratability as our allocation, claim the whole block.
*/
if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled)
@@ -1912,8 +1877,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
* Reserve a pageblock for exclusive use of high-order atomic allocations if
* there are no empty page blocks that contain a page with a suitable order
*/
-static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
- unsigned int alloc_order)
+static void reserve_highatomic_pageblock(struct page *page, struct zone *zone)
{
int mt;
unsigned long max_managed, flags;
@@ -2353,10 +2317,10 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
return true;
}
-static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch,
- bool free_high)
+static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high)
{
int min_nr_free, max_nr_free;
+ int batch = READ_ONCE(pcp->batch);
/* Free everything if batch freeing high-order pages. */
if (unlikely(free_high))
@@ -2423,9 +2387,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
high = nr_pcp_high(pcp, zone, free_high);
if (pcp->count >= high) {
- int batch = READ_ONCE(pcp->batch);
-
- free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex);
+ free_pcppages_bulk(zone, nr_pcp_free(pcp, high, free_high), pcp, pindex);
}
}
@@ -3225,7 +3187,7 @@ try_this_zone:
* if the pageblock should be reserved for the future
*/
if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
- reserve_highatomic_pageblock(page, zone, order);
+ reserve_highatomic_pageblock(page, zone);
return page;
} else {
@@ -4508,10 +4470,11 @@ struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
{
struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
preferred_nid, nodemask);
+ struct folio *folio = (struct folio *)page;
- if (page && order > 1)
- prep_transhuge_page(page);
- return (struct folio *)page;
+ if (folio && order > 1)
+ folio_prep_large_rmappable(folio);
+ return folio;
}
EXPORT_SYMBOL(__folio_alloc);
@@ -5139,19 +5102,17 @@ static void __build_all_zonelists(void *data)
unsigned long flags;
/*
- * Explicitly disable this CPU's interrupts before taking seqlock
- * to prevent any IRQ handler from calling into the page allocator
- * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
+ * The zonelist_update_seq must be acquired with irqsave because the
+ * reader can be invoked from IRQ with GFP_ATOMIC.
*/
- local_irq_save(flags);
+ write_seqlock_irqsave(&zonelist_update_seq, flags);
/*
- * Explicitly disable this CPU's synchronous printk() before taking
- * seqlock to prevent any printk() from trying to hold port->lock, for
+ * Also disable synchronous printk() to prevent any printk() from
+ * trying to hold port->lock, for
* tty_insert_flip_string_and_push_buffer() on other CPU might be
* calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
*/
printk_deferred_enter();
- write_seqlock(&zonelist_update_seq);
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
@@ -5188,9 +5149,8 @@ static void __build_all_zonelists(void *data)
#endif
}
- write_sequnlock(&zonelist_update_seq);
printk_deferred_exit();
- local_irq_restore(flags);
+ write_sequnlock_irqrestore(&zonelist_update_seq, flags);
}
static noinline void __init
@@ -5694,9 +5654,9 @@ static void __setup_per_zone_wmarks(void)
struct zone *zone;
unsigned long flags;
- /* Calculate total number of !ZONE_HIGHMEM pages */
+ /* Calculate total number of !ZONE_HIGHMEM and !ZONE_MOVABLE pages */
for_each_zone(zone) {
- if (!is_highmem(zone))
+ if (!is_highmem(zone) && zone_idx(zone) != ZONE_MOVABLE)
lowmem_pages += zone_managed_pages(zone);
}
@@ -5706,15 +5666,15 @@ static void __setup_per_zone_wmarks(void)
spin_lock_irqsave(&zone->lock, flags);
tmp = (u64)pages_min * zone_managed_pages(zone);
do_div(tmp, lowmem_pages);
- if (is_highmem(zone)) {
+ if (is_highmem(zone) || zone_idx(zone) == ZONE_MOVABLE) {
/*
* __GFP_HIGH and PF_MEMALLOC allocations usually don't
- * need highmem pages, so cap pages_min to a small
- * value here.
+ * need highmem and movable zones pages, so cap pages_min
+ * to a small value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
* deltas control async page reclaim, and so should
- * not be capped for highmem.
+ * not be capped for highmem and movable zones.
*/
unsigned long min_pages;
diff --git a/mm/page_ext.c b/mm/page_ext.c
index dc1626be458b..4548fcc66d74 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -90,7 +90,6 @@ static struct page_ext_operations *page_ext_ops[] __initdata = {
unsigned long page_ext_size;
static unsigned long total_usage;
-static struct page_ext *lookup_page_ext(const struct page *page);
bool early_page_ext __meminitdata;
static int __init setup_early_page_ext(char *str)
@@ -137,62 +136,16 @@ static void __init invoke_init_callbacks(void)
}
}
-#ifndef CONFIG_SPARSEMEM
-void __init page_ext_init_flatmem_late(void)
-{
- invoke_init_callbacks();
-}
-#endif
-
static inline struct page_ext *get_entry(void *base, unsigned long index)
{
return base + page_ext_size * index;
}
-/**
- * page_ext_get() - Get the extended information for a page.
- * @page: The page we're interested in.
- *
- * Ensures that the page_ext will remain valid until page_ext_put()
- * is called.
- *
- * Return: NULL if no page_ext exists for this page.
- * Context: Any context. Caller may not sleep until they have called
- * page_ext_put().
- */
-struct page_ext *page_ext_get(struct page *page)
-{
- struct page_ext *page_ext;
-
- rcu_read_lock();
- page_ext = lookup_page_ext(page);
- if (!page_ext) {
- rcu_read_unlock();
- return NULL;
- }
-
- return page_ext;
-}
-
-/**
- * page_ext_put() - Working with page extended information is done.
- * @page_ext: Page extended information received from page_ext_get().
- *
- * The page extended information of the page may not be valid after this
- * function is called.
- *
- * Return: None.
- * Context: Any context with corresponding page_ext_get() is called.
- */
-void page_ext_put(struct page_ext *page_ext)
+#ifndef CONFIG_SPARSEMEM
+void __init page_ext_init_flatmem_late(void)
{
- if (unlikely(!page_ext))
- return;
-
- rcu_read_unlock();
+ invoke_init_callbacks();
}
-#ifndef CONFIG_SPARSEMEM
-
void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
@@ -424,13 +377,14 @@ static int __meminit online_page_ext(unsigned long start_pfn,
return 0;
/* rollback */
+ end = pfn - PAGES_PER_SECTION;
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
__free_page_ext(pfn);
return -ENOMEM;
}
-static int __meminit offline_page_ext(unsigned long start_pfn,
+static void __meminit offline_page_ext(unsigned long start_pfn,
unsigned long nr_pages)
{
unsigned long start, end, pfn;
@@ -454,8 +408,6 @@ static int __meminit offline_page_ext(unsigned long start_pfn,
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
__free_page_ext(pfn);
- return 0;
-
}
static int __meminit page_ext_callback(struct notifier_block *self,
@@ -537,3 +489,46 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
}
#endif
+
+/**
+ * page_ext_get() - Get the extended information for a page.
+ * @page: The page we're interested in.
+ *
+ * Ensures that the page_ext will remain valid until page_ext_put()
+ * is called.
+ *
+ * Return: NULL if no page_ext exists for this page.
+ * Context: Any context. Caller may not sleep until they have called
+ * page_ext_put().
+ */
+struct page_ext *page_ext_get(struct page *page)
+{
+ struct page_ext *page_ext;
+
+ rcu_read_lock();
+ page_ext = lookup_page_ext(page);
+ if (!page_ext) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ return page_ext;
+}
+
+/**
+ * page_ext_put() - Working with page extended information is done.
+ * @page_ext: Page extended information received from page_ext_get().
+ *
+ * The page extended information of the page may not be valid after this
+ * function is called.
+ *
+ * Return: None.
+ * Context: Any context with corresponding page_ext_get() is called.
+ */
+void page_ext_put(struct page_ext *page_ext)
+{
+ if (unlikely(!page_ext))
+ return;
+
+ rcu_read_unlock();
+}
diff --git a/mm/page_io.c b/mm/page_io.c
index 684cd3c7b59b..fe4c21af23f2 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -19,20 +19,19 @@
#include <linux/bio.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
-#include <linux/frontswap.h>
#include <linux/blkdev.h>
#include <linux/psi.h>
#include <linux/uio.h>
#include <linux/sched/task.h>
#include <linux/delayacct.h>
+#include <linux/zswap.h>
#include "swap.h"
static void __end_swap_bio_write(struct bio *bio)
{
- struct page *page = bio_first_page_all(bio);
+ struct folio *folio = bio_first_folio_all(bio);
if (bio->bi_status) {
- SetPageError(page);
/*
* We failed to write the page out to swap-space.
* Re-dirty the page in order to avoid it being reclaimed.
@@ -41,13 +40,13 @@ static void __end_swap_bio_write(struct bio *bio)
*
* Also clear PG_reclaim to avoid folio_rotate_reclaimable()
*/
- set_page_dirty(page);
+ folio_mark_dirty(folio);
pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
(unsigned long long)bio->bi_iter.bi_sector);
- ClearPageReclaim(page);
+ folio_clear_reclaim(folio);
}
- end_page_writeback(page);
+ folio_end_writeback(folio);
}
static void end_swap_bio_write(struct bio *bio)
@@ -58,18 +57,16 @@ static void end_swap_bio_write(struct bio *bio)
static void __end_swap_bio_read(struct bio *bio)
{
- struct page *page = bio_first_page_all(bio);
+ struct folio *folio = bio_first_folio_all(bio);
if (bio->bi_status) {
- SetPageError(page);
- ClearPageUptodate(page);
pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
(unsigned long long)bio->bi_iter.bi_sector);
} else {
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
- unlock_page(page);
+ folio_unlock(folio);
}
static void end_swap_bio_read(struct bio *bio)
@@ -198,7 +195,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
folio_unlock(folio);
return ret;
}
- if (frontswap_store(&folio->page) == 0) {
+ if (zswap_store(folio)) {
folio_start_writeback(folio);
folio_unlock(folio);
folio_end_writeback(folio);
@@ -208,22 +205,22 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
return 0;
}
-static inline void count_swpout_vm_event(struct page *page)
+static inline void count_swpout_vm_event(struct folio *folio)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (unlikely(PageTransHuge(page)))
+ if (unlikely(folio_test_pmd_mappable(folio)))
count_vm_event(THP_SWPOUT);
#endif
- count_vm_events(PSWPOUT, thp_nr_pages(page));
+ count_vm_events(PSWPOUT, folio_nr_pages(folio));
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
+static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio)
{
struct cgroup_subsys_state *css;
struct mem_cgroup *memcg;
- memcg = page_memcg(page);
+ memcg = folio_memcg(folio);
if (!memcg)
return;
@@ -233,7 +230,7 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
rcu_read_unlock();
}
#else
-#define bio_associate_blkg_from_page(bio, page) do { } while (0)
+#define bio_associate_blkg_from_page(bio, folio) do { } while (0)
#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
struct swap_iocb {
@@ -283,7 +280,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret)
}
} else {
for (p = 0; p < sio->pages; p++)
- count_swpout_vm_event(sio->bvec[p].bv_page);
+ count_swpout_vm_event(page_folio(sio->bvec[p].bv_page));
}
for (p = 0; p < sio->pages; p++)
@@ -334,17 +331,18 @@ static void swap_writepage_bdev_sync(struct page *page,
{
struct bio_vec bv;
struct bio bio;
+ struct folio *folio = page_folio(page);
bio_init(&bio, sis->bdev, &bv, 1,
REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc));
bio.bi_iter.bi_sector = swap_page_sector(page);
__bio_add_page(&bio, page, thp_size(page), 0);
- bio_associate_blkg_from_page(&bio, page);
- count_swpout_vm_event(page);
+ bio_associate_blkg_from_page(&bio, folio);
+ count_swpout_vm_event(folio);
- set_page_writeback(page);
- unlock_page(page);
+ folio_start_writeback(folio);
+ folio_unlock(folio);
submit_bio_wait(&bio);
__end_swap_bio_write(&bio);
@@ -354,6 +352,7 @@ static void swap_writepage_bdev_async(struct page *page,
struct writeback_control *wbc, struct swap_info_struct *sis)
{
struct bio *bio;
+ struct folio *folio = page_folio(page);
bio = bio_alloc(sis->bdev, 1,
REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
@@ -362,10 +361,10 @@ static void swap_writepage_bdev_async(struct page *page,
bio->bi_end_io = end_swap_bio_write;
__bio_add_page(bio, page, thp_size(page), 0);
- bio_associate_blkg_from_page(bio, page);
- count_swpout_vm_event(page);
- set_page_writeback(page);
- unlock_page(page);
+ bio_associate_blkg_from_page(bio, folio);
+ count_swpout_vm_event(folio);
+ folio_start_writeback(folio);
+ folio_unlock(folio);
submit_bio(bio);
}
@@ -406,19 +405,17 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
if (ret == sio->len) {
for (p = 0; p < sio->pages; p++) {
- struct page *page = sio->bvec[p].bv_page;
+ struct folio *folio = page_folio(sio->bvec[p].bv_page);
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
}
count_vm_events(PSWPIN, sio->pages);
} else {
for (p = 0; p < sio->pages; p++) {
- struct page *page = sio->bvec[p].bv_page;
+ struct folio *folio = page_folio(sio->bvec[p].bv_page);
- SetPageError(page);
- ClearPageUptodate(page);
- unlock_page(page);
+ folio_unlock(folio);
}
pr_alert_ratelimited("Read-error on swap-device\n");
}
@@ -495,14 +492,15 @@ static void swap_readpage_bdev_async(struct page *page,
void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug)
{
+ struct folio *folio = page_folio(page);
struct swap_info_struct *sis = page_swap_info(page);
- bool workingset = PageWorkingset(page);
+ bool workingset = folio_test_workingset(folio);
unsigned long pflags;
bool in_thrashing;
- VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageUptodate(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_swapcache(folio) && !synchronous, folio);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);
/*
* Count submission time as memory stall and delay. When the device
@@ -515,9 +513,9 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug)
}
delayacct_swapin_start();
- if (frontswap_load(page) == 0) {
- SetPageUptodate(page);
- unlock_page(page);
+ if (zswap_load(folio)) {
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
} else if (data_race(sis->flags & SWP_FS_OPS)) {
swap_readpage_fs(page, plug);
} else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) {
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 6599cc965e21..bcf99ba747a0 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -79,17 +79,17 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
* handle each tail page individually in migration.
*/
if (PageHuge(page) || PageTransCompound(page)) {
- struct page *head = compound_head(page);
+ struct folio *folio = page_folio(page);
unsigned int skip_pages;
if (PageHuge(page)) {
- if (!hugepage_migration_supported(page_hstate(head)))
+ if (!hugepage_migration_supported(folio_hstate(folio)))
return page;
- } else if (!PageLRU(head) && !__PageMovable(head)) {
+ } else if (!folio_test_lru(folio) && !__folio_test_movable(folio)) {
return page;
}
- skip_pages = compound_nr(head) - (page - head);
+ skip_pages = folio_nr_pages(folio) - folio_page_idx(folio, page);
pfn += skip_pages - 1;
continue;
}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index c93baef0148f..4e2723e1b300 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -104,7 +104,7 @@ struct page_ext_operations page_owner_ops = {
static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
{
- return (void *)page_ext + page_owner_ops.offset;
+ return page_ext_data(page_ext, &page_owner_ops);
}
static noinline depot_stack_handle_t save_stack(gfp_t flags)
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 98438985e1ed..b4f456437b7e 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -4,7 +4,6 @@
#include <linux/mm.h>
#include <linux/mmdebug.h>
#include <linux/highmem.h>
-#include <linux/page_ext.h>
#include <linux/poison.h>
#include <linux/ratelimit.h>
#include <linux/kasan.h>
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 93ec7690a0d8..af69c3c8f7c2 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -51,15 +51,14 @@ struct page_ext_operations page_table_check_ops = {
static struct page_table_check *get_page_table_check(struct page_ext *page_ext)
{
BUG_ON(!page_ext);
- return (void *)(page_ext) + page_table_check_ops.offset;
+ return page_ext_data(page_ext, &page_table_check_ops);
}
/*
* An entry is removed from the page table, decrement the counters for that page
* verify that it is of correct type and counters do not become negative.
*/
-static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
- unsigned long pfn, unsigned long pgcnt)
+static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt)
{
struct page_ext *page_ext;
struct page *page;
@@ -95,8 +94,7 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
* verify that it is of correct type and is not being mapped with a different
* type to a different process.
*/
-static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
- unsigned long pfn, unsigned long pgcnt,
+static void page_table_check_set(unsigned long pfn, unsigned long pgcnt,
bool rw)
{
struct page_ext *page_ext;
@@ -151,85 +149,75 @@ void __page_table_check_zero(struct page *page, unsigned int order)
page_ext_put(page_ext);
}
-void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr,
- pte_t pte)
+void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
{
if (&init_mm == mm)
return;
if (pte_user_accessible_page(pte)) {
- page_table_check_clear(mm, addr, pte_pfn(pte),
- PAGE_SIZE >> PAGE_SHIFT);
+ page_table_check_clear(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT);
}
}
EXPORT_SYMBOL(__page_table_check_pte_clear);
-void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr,
- pmd_t pmd)
+void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
{
if (&init_mm == mm)
return;
if (pmd_user_accessible_page(pmd)) {
- page_table_check_clear(mm, addr, pmd_pfn(pmd),
- PMD_SIZE >> PAGE_SHIFT);
+ page_table_check_clear(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT);
}
}
EXPORT_SYMBOL(__page_table_check_pmd_clear);
-void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr,
- pud_t pud)
+void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
{
if (&init_mm == mm)
return;
if (pud_user_accessible_page(pud)) {
- page_table_check_clear(mm, addr, pud_pfn(pud),
- PUD_SIZE >> PAGE_SHIFT);
+ page_table_check_clear(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT);
}
}
EXPORT_SYMBOL(__page_table_check_pud_clear);
-void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
+void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte,
+ unsigned int nr)
{
+ unsigned int i;
+
if (&init_mm == mm)
return;
- __page_table_check_pte_clear(mm, addr, ptep_get(ptep));
- if (pte_user_accessible_page(pte)) {
- page_table_check_set(mm, addr, pte_pfn(pte),
- PAGE_SIZE >> PAGE_SHIFT,
- pte_write(pte));
- }
+ for (i = 0; i < nr; i++)
+ __page_table_check_pte_clear(mm, ptep_get(ptep + i));
+ if (pte_user_accessible_page(pte))
+ page_table_check_set(pte_pfn(pte), nr, pte_write(pte));
}
-EXPORT_SYMBOL(__page_table_check_pte_set);
+EXPORT_SYMBOL(__page_table_check_ptes_set);
-void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, pmd_t pmd)
+void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd)
{
if (&init_mm == mm)
return;
- __page_table_check_pmd_clear(mm, addr, *pmdp);
+ __page_table_check_pmd_clear(mm, *pmdp);
if (pmd_user_accessible_page(pmd)) {
- page_table_check_set(mm, addr, pmd_pfn(pmd),
- PMD_SIZE >> PAGE_SHIFT,
+ page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT,
pmd_write(pmd));
}
}
EXPORT_SYMBOL(__page_table_check_pmd_set);
-void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr,
- pud_t *pudp, pud_t pud)
+void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud)
{
if (&init_mm == mm)
return;
- __page_table_check_pud_clear(mm, addr, *pudp);
+ __page_table_check_pud_clear(mm, *pudp);
if (pud_user_accessible_page(pud)) {
- page_table_check_set(mm, addr, pud_pfn(pud),
- PUD_SIZE >> PAGE_SHIFT,
+ page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT,
pud_write(pud));
}
}
@@ -249,7 +237,7 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm,
if (WARN_ON(!ptep))
return;
for (i = 0; i < PTRS_PER_PTE; i++) {
- __page_table_check_pte_clear(mm, addr, ptep_get(ptep));
+ __page_table_check_pte_clear(mm, ptep_get(ptep));
addr += PAGE_SIZE;
ptep++;
}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 49e0d28f0379..e0b368e545ed 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -73,20 +73,22 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
}
/**
- * check_pte - check if @pvmw->page is mapped at the @pvmw->pte
- * @pvmw: page_vma_mapped_walk struct, includes a pair pte and page for checking
+ * check_pte - check if [pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages) is
+ * mapped at the @pvmw->pte
+ * @pvmw: page_vma_mapped_walk struct, includes a pair pte and pfn range
+ * for checking
*
- * page_vma_mapped_walk() found a place where @pvmw->page is *potentially*
+ * page_vma_mapped_walk() found a place where pfn range is *potentially*
* mapped. check_pte() has to validate this.
*
* pvmw->pte may point to empty PTE, swap PTE or PTE pointing to
* arbitrary page.
*
* If PVMW_MIGRATION flag is set, returns true if @pvmw->pte contains migration
- * entry that points to @pvmw->page or any subpage in case of THP.
+ * entry that points to [pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages)
*
* If PVMW_MIGRATION flag is not set, returns true if pvmw->pte points to
- * pvmw->page or any subpage in case of THP.
+ * [pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages)
*
* Otherwise, return false.
*
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 64437105fe0d..9b2d23fbf4d3 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -48,8 +48,11 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (walk->no_vma) {
/*
* pte_offset_map() might apply user-specific validation.
+ * Indeed, on x86_64 the pmd entries set up by init_espfix_ap()
+ * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear),
+ * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them.
*/
- if (walk->mm == &init_mm)
+ if (walk->mm == &init_mm || addr >= TASK_SIZE)
pte = pte_offset_kernel(pmd, addr);
else
pte = pte_offset_map(pmd, addr);
@@ -397,6 +400,33 @@ static int __walk_page_range(unsigned long start, unsigned long end,
return err;
}
+static inline void process_mm_walk_lock(struct mm_struct *mm,
+ enum page_walk_lock walk_lock)
+{
+ if (walk_lock == PGWALK_RDLOCK)
+ mmap_assert_locked(mm);
+ else
+ mmap_assert_write_locked(mm);
+}
+
+static inline void process_vma_walk_lock(struct vm_area_struct *vma,
+ enum page_walk_lock walk_lock)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+ switch (walk_lock) {
+ case PGWALK_WRLOCK:
+ vma_start_write(vma);
+ break;
+ case PGWALK_WRLOCK_VERIFY:
+ vma_assert_write_locked(vma);
+ break;
+ case PGWALK_RDLOCK:
+ /* PGWALK_RDLOCK is handled by process_mm_walk_lock */
+ break;
+ }
+#endif
+}
+
/**
* walk_page_range - walk page table with caller specific callbacks
* @mm: mm_struct representing the target process of page table walk
@@ -456,7 +486,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
if (!walk.mm)
return -EINVAL;
- mmap_assert_locked(walk.mm);
+ process_mm_walk_lock(walk.mm, ops->walk_lock);
vma = find_vma(walk.mm, start);
do {
@@ -471,6 +501,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
if (ops->pte_hole)
err = ops->pte_hole(start, next, -1, &walk);
} else { /* inside vma */
+ process_vma_walk_lock(vma, ops->walk_lock);
walk.vma = vma;
next = min(end, vma->vm_end);
vma = find_vma(mm, vma->vm_end);
@@ -546,7 +577,8 @@ int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
if (start < vma->vm_start || end > vma->vm_end)
return -EINVAL;
- mmap_assert_locked(walk.mm);
+ process_mm_walk_lock(walk.mm, ops->walk_lock);
+ process_vma_walk_lock(vma, ops->walk_lock);
return __walk_page_range(start, end, &walk);
}
@@ -563,7 +595,8 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
if (!walk.mm)
return -EINVAL;
- mmap_assert_locked(walk.mm);
+ process_mm_walk_lock(walk.mm, ops->walk_lock);
+ process_vma_walk_lock(vma, ops->walk_lock);
return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 4d454953046f..4fcd959dcc4d 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -13,6 +13,7 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mm_inline.h>
+#include <asm/pgalloc.h>
#include <asm/tlb.h>
/*
@@ -230,14 +231,62 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
return pmd;
}
#endif
+
+/* arch define pte_free_defer in asm/pgalloc.h for its own implementation */
+#ifndef pte_free_defer
+static void pte_free_now(struct rcu_head *head)
+{
+ struct page *page;
+
+ page = container_of(head, struct page, rcu_head);
+ pte_free(NULL /* mm not passed and not used */, (pgtable_t)page);
+}
+
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
+{
+ struct page *page;
+
+ page = pgtable;
+ call_rcu(&page->rcu_head, pte_free_now);
+}
+#endif /* pte_free_defer */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \
+ (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RCU))
+/*
+ * See the comment above ptep_get_lockless() in include/linux/pgtable.h:
+ * the barriers in pmdp_get_lockless() cannot guarantee that the value in
+ * pmd_high actually belongs with the value in pmd_low; but holding interrupts
+ * off blocks the TLB flush between present updates, which guarantees that a
+ * successful __pte_offset_map() points to a page from matched halves.
+ */
+static unsigned long pmdp_get_lockless_start(void)
+{
+ unsigned long irqflags;
+
+ local_irq_save(irqflags);
+ return irqflags;
+}
+static void pmdp_get_lockless_end(unsigned long irqflags)
+{
+ local_irq_restore(irqflags);
+}
+#else
+static unsigned long pmdp_get_lockless_start(void) { return 0; }
+static void pmdp_get_lockless_end(unsigned long irqflags) { }
+#endif
+
pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
{
+ unsigned long irqflags;
pmd_t pmdval;
- /* rcu_read_lock() to be added later */
+ rcu_read_lock();
+ irqflags = pmdp_get_lockless_start();
pmdval = pmdp_get_lockless(pmd);
+ pmdp_get_lockless_end(irqflags);
+
if (pmdvalp)
*pmdvalp = pmdval;
if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
@@ -250,7 +299,7 @@ pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
}
return __pte_map(&pmdval, addr);
nomap:
- /* rcu_read_unlock() to be added later */
+ rcu_read_unlock();
return NULL;
}
@@ -266,6 +315,50 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
return pte;
}
+/*
+ * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation
+ * __pte_offset_map_lock() below, is usually called with the pmd pointer for
+ * addr, reached by walking down the mm's pgd, p4d, pud for addr: either while
+ * holding mmap_lock or vma lock for read or for write; or in truncate or rmap
+ * context, while holding file's i_mmap_lock or anon_vma lock for read (or for
+ * write). In a few cases, it may be used with pmd pointing to a pmd_t already
+ * copied to or constructed on the stack.
+ *
+ * When successful, it returns the pte pointer for addr, with its page table
+ * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent
+ * modification by software, with a pointer to that spinlock in ptlp (in some
+ * configs mm->page_table_lock, in SPLIT_PTLOCK configs a spinlock in table's
+ * struct page). pte_unmap_unlock(pte, ptl) to unlock and unmap afterwards.
+ *
+ * But it is unsuccessful, returning NULL with *ptlp unchanged, if there is no
+ * page table at *pmd: if, for example, the page table has just been removed,
+ * or replaced by the huge pmd of a THP. (When successful, *pmd is rechecked
+ * after acquiring the ptlock, and retried internally if it changed: so that a
+ * page table can be safely removed or replaced by THP while holding its lock.)
+ *
+ * pte_offset_map(pmd, addr), and its internal helper __pte_offset_map() above,
+ * just returns the pte pointer for addr, its page table kmapped if necessary;
+ * or NULL if there is no page table at *pmd. It does not attempt to lock the
+ * page table, so cannot normally be used when the page table is to be updated,
+ * or when entries read must be stable. But it does take rcu_read_lock(): so
+ * that even when page table is racily removed, it remains a valid though empty
+ * and disconnected table. Until pte_unmap(pte) unmaps and rcu_read_unlock()s
+ * afterwards.
+ *
+ * pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
+ * but when successful, it also outputs a pointer to the spinlock in ptlp - as
+ * pte_offset_map_lock() does, but in this case without locking it. This helps
+ * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
+ * act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock
+ * pointer for the page table that it returns. In principle, the caller should
+ * recheck *pmd once the lock is taken; in practice, no callsite needs that -
+ * either the mmap_lock for write, or pte_same() check on contents, is enough.
+ *
+ * Note that free_pgtables(), used after unmapping detached vmas, or when
+ * exiting the whole mm, does not take page table lock before freeing a page
+ * table, and may not use RCU at all: "outsiders" like khugepaged should avoid
+ * pte_offset_map() and co once the vma is detached from mm or mm_users is zero.
+ */
pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, spinlock_t **ptlp)
{
diff --git a/mm/readahead.c b/mm/readahead.c
index a9c999aa19af..e815c114de21 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -461,19 +461,6 @@ static int try_context_readahead(struct address_space *mapping,
return 1;
}
-/*
- * There are some parts of the kernel which assume that PMD entries
- * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then,
- * limit the maximum allocation order to PMD size. I'm not aware of any
- * assumptions about maximum order if THP are disabled, but 8 seems like
- * a good order (that's 1MB if you're using 4kB pages)
- */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER
-#else
-#define MAX_PAGECACHE_ORDER 8
-#endif
-
static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
pgoff_t mark, unsigned int order, gfp_t gfp)
{
diff --git a/mm/rmap.c b/mm/rmap.c
index 0c0d8857dfce..ec7f8e6c9e48 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -642,7 +642,8 @@ void try_to_unmap_flush_dirty(void)
#define TLB_FLUSH_BATCH_PENDING_LARGE \
(TLB_FLUSH_BATCH_PENDING_MASK / 2)
-static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval)
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
+ unsigned long uaddr)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
int batch;
@@ -651,7 +652,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval)
if (!pte_accessible(mm, pteval))
return;
- arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
+ arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr);
tlb_ubc->flush_required = true;
/*
@@ -688,17 +689,10 @@ retry:
*/
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
- bool should_defer = false;
-
if (!(flags & TTU_BATCH_FLUSH))
return false;
- /* If remote CPUs need to be flushed then defer batch the flush */
- if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
- should_defer = true;
- put_cpu();
-
- return should_defer;
+ return arch_tlbbatch_should_defer(mm);
}
/*
@@ -723,7 +717,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm)
int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
if (pending != flushed) {
- flush_tlb_mm(mm);
+ arch_flush_tlb_batched_pending(mm);
/*
* If the new TLB flushing is pending during flushing, leave
* mm->tlb_flush_batched as is, to avoid losing flushing.
@@ -733,7 +727,8 @@ void flush_tlb_batched_pending(struct mm_struct *mm)
}
}
#else
-static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval)
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
+ unsigned long uaddr)
{
}
@@ -990,13 +985,6 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
#endif
}
- /*
- * No need to call mmu_notifier_invalidate_range() as we are
- * downgrading page table protection not changing it to point
- * to a new page.
- *
- * See Documentation/mm/mmu_notifier.rst
- */
if (ret)
cleaned++;
}
@@ -1175,14 +1163,14 @@ out:
/**
* __page_check_anon_rmap - sanity check anonymous rmap addition
- * @page: the page to add the mapping to
+ * @folio: The folio containing @page.
+ * @page: the page to check the mapping of
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*/
-static void __page_check_anon_rmap(struct page *page,
+static void __page_check_anon_rmap(struct folio *folio, struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
- struct folio *folio = page_folio(page);
/*
* The page's anon-rmap details (mapping and index) are guaranteed to
* be set up correctly at this point.
@@ -1262,7 +1250,7 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
__page_set_anon_rmap(folio, page, vma, address,
!!(flags & RMAP_EXCLUSIVE));
else
- __page_check_anon_rmap(page, vma, address);
+ __page_check_anon_rmap(folio, page, vma, address);
}
mlock_vma_folio(folio, vma, compound);
@@ -1306,31 +1294,39 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
}
/**
- * page_add_file_rmap - add pte mapping to a file page
- * @page: the page to add the mapping to
+ * folio_add_file_rmap_range - add pte mapping to page range of a folio
+ * @folio: The folio to add the mapping to
+ * @page: The first page to add
+ * @nr_pages: The number of pages which will be mapped
* @vma: the vm area in which the mapping is added
* @compound: charge the page as compound or small page
*
+ * The page range of folio is defined by [first_page, first_page + nr_pages)
+ *
* The caller needs to hold the pte lock.
*/
-void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
- bool compound)
+void folio_add_file_rmap_range(struct folio *folio, struct page *page,
+ unsigned int nr_pages, struct vm_area_struct *vma,
+ bool compound)
{
- struct folio *folio = page_folio(page);
atomic_t *mapped = &folio->_nr_pages_mapped;
- int nr = 0, nr_pmdmapped = 0;
- bool first;
+ unsigned int nr_pmdmapped = 0, first;
+ int nr = 0;
- VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
+ VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio);
/* Is page being mapped by PTE? Is this its first map to be added? */
if (likely(!compound)) {
- first = atomic_inc_and_test(&page->_mapcount);
- nr = first;
- if (first && folio_test_large(folio)) {
- nr = atomic_inc_return_relaxed(mapped);
- nr = (nr < COMPOUND_MAPPED);
- }
+ do {
+ first = atomic_inc_and_test(&page->_mapcount);
+ if (first && folio_test_large(folio)) {
+ first = atomic_inc_return_relaxed(mapped);
+ first = (first < COMPOUND_MAPPED);
+ }
+
+ if (first)
+ nr++;
+ } while (page++, --nr_pages > 0);
} else if (folio_test_pmd_mappable(folio)) {
/* That test is redundant: it's for safety or to optimize out */
@@ -1360,6 +1356,30 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
}
/**
+ * page_add_file_rmap - add pte mapping to a file page
+ * @page: the page to add the mapping to
+ * @vma: the vm area in which the mapping is added
+ * @compound: charge the page as compound or small page
+ *
+ * The caller needs to hold the pte lock.
+ */
+void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
+ bool compound)
+{
+ struct folio *folio = page_folio(page);
+ unsigned int nr_pages;
+
+ VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page);
+
+ if (likely(!compound))
+ nr_pages = 1;
+ else
+ nr_pages = folio_nr_pages(folio);
+
+ folio_add_file_rmap_range(folio, page, nr_pages, vma, compound);
+}
+
+/**
* page_remove_rmap - take down pte mapping from a page
* @page: page to remove mapping from
* @vma: the vm area from which the mapping is removed
@@ -1554,8 +1574,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
hugetlb_vma_unlock_write(vma);
flush_tlb_range(vma,
range.start, range.end);
- mmu_notifier_invalidate_range(mm,
- range.start, range.end);
/*
* The ref count of the PMD page was
* dropped which is part of the way map
@@ -1586,7 +1604,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
pteval = ptep_get_and_clear(mm, address, pvmw.pte);
- set_tlb_ubc_flush_pending(mm, pteval);
+ set_tlb_ubc_flush_pending(mm, pteval, address);
} else {
pteval = ptep_clear_flush(vma, address, pvmw.pte);
}
@@ -1628,11 +1646,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* copied pages.
*/
dec_mm_counter(mm, mm_counter(&folio->page));
- /* We have to invalidate as we cleared the pte */
- mmu_notifier_invalidate_range(mm, address,
- address + PAGE_SIZE);
} else if (folio_test_anon(folio)) {
- swp_entry_t entry = { .val = page_private(subpage) };
+ swp_entry_t entry = page_swap_entry(subpage);
pte_t swp_pte;
/*
* Store the swap location in the pte.
@@ -1642,9 +1657,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
folio_test_swapcache(folio))) {
WARN_ON_ONCE(1);
ret = false;
- /* We have to invalidate as we cleared the pte */
- mmu_notifier_invalidate_range(mm, address,
- address + PAGE_SIZE);
page_vma_mapped_walk_done(&pvmw);
break;
}
@@ -1675,9 +1687,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
if (ref_count == 1 + map_count &&
!folio_test_dirty(folio)) {
- /* Invalidate as we cleared the pte */
- mmu_notifier_invalidate_range(mm,
- address, address + PAGE_SIZE);
dec_mm_counter(mm, MM_ANONPAGES);
goto discard;
}
@@ -1732,9 +1741,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
if (pte_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);
- /* Invalidate as we cleared the pte */
- mmu_notifier_invalidate_range(mm, address,
- address + PAGE_SIZE);
} else {
/*
* This is a locked file-backed folio,
@@ -1750,13 +1756,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
dec_mm_counter(mm, mm_counter_file(&folio->page));
}
discard:
- /*
- * No need to call mmu_notifier_invalidate_range() it has be
- * done above for all cases requiring it to happen under page
- * table lock before mmu_notifier_invalidate_range_end()
- *
- * See Documentation/mm/mmu_notifier.rst
- */
page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
if (vma->vm_flags & VM_LOCKED)
mlock_drain_local();
@@ -1935,8 +1934,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
hugetlb_vma_unlock_write(vma);
flush_tlb_range(vma,
range.start, range.end);
- mmu_notifier_invalidate_range(mm,
- range.start, range.end);
/*
* The ref count of the PMD page was
@@ -1969,7 +1966,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
*/
pteval = ptep_get_and_clear(mm, address, pvmw.pte);
- set_tlb_ubc_flush_pending(mm, pteval);
+ set_tlb_ubc_flush_pending(mm, pteval, address);
} else {
pteval = ptep_clear_flush(vma, address, pvmw.pte);
}
@@ -2041,9 +2038,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* copied pages.
*/
dec_mm_counter(mm, mm_counter(&folio->page));
- /* We have to invalidate as we cleared the pte */
- mmu_notifier_invalidate_range(mm, address,
- address + PAGE_SIZE);
} else {
swp_entry_t entry;
pte_t swp_pte;
@@ -2107,13 +2101,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
*/
}
- /*
- * No need to call mmu_notifier_invalidate_range() it has be
- * done above for all cases requiring it to happen under page
- * table lock before mmu_notifier_invalidate_range_end()
- *
- * See Documentation/mm/mmu_notifier.rst
- */
page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
if (vma->vm_flags & VM_LOCKED)
mlock_drain_local();
@@ -2402,11 +2389,12 @@ out:
/*
* rmap_walk_anon - do something to anonymous page using the object-based
* rmap method
- * @page: the page to be handled
+ * @folio: the folio to be handled
* @rwc: control variable according to each walk type
+ * @locked: caller holds relevant rmap lock
*
- * Find all the mappings of a page using the mapping pointer and the vma chains
- * contained in the anon_vma struct it points to.
+ * Find all the mappings of a folio using the mapping pointer and the vma
+ * chains contained in the anon_vma struct it points to.
*/
static void rmap_walk_anon(struct folio *folio,
struct rmap_walk_control *rwc, bool locked)
@@ -2450,10 +2438,11 @@ static void rmap_walk_anon(struct folio *folio,
/*
* rmap_walk_file - do something to file page using the object-based rmap method
- * @page: the page to be handled
+ * @folio: the folio to be handled
* @rwc: control variable according to each walk type
+ * @locked: caller holds relevant rmap lock
*
- * Find all the mappings of a page using the mapping pointer and the vma chains
+ * Find all the mappings of a folio using the mapping pointer and the vma chains
* contained in the address_space struct it points to.
*/
static void rmap_walk_file(struct folio *folio,
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 86442a15d12f..3afb5ad701e1 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -55,6 +55,7 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf)
gfp_t gfp = vmf->gfp_mask;
unsigned long addr;
struct page *page;
+ struct folio *folio;
vm_fault_t ret;
int err;
@@ -66,23 +67,24 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf)
retry:
page = find_lock_page(mapping, offset);
if (!page) {
- page = alloc_page(gfp | __GFP_ZERO);
- if (!page) {
+ folio = folio_alloc(gfp | __GFP_ZERO, 0);
+ if (!folio) {
ret = VM_FAULT_OOM;
goto out;
}
+ page = &folio->page;
err = set_direct_map_invalid_noflush(page);
if (err) {
- put_page(page);
+ folio_put(folio);
ret = vmf_error(err);
goto out;
}
- __SetPageUptodate(page);
- err = add_to_page_cache_lru(page, mapping, offset, gfp);
+ __folio_mark_uptodate(folio);
+ err = filemap_add_folio(mapping, folio, offset, gfp);
if (unlikely(err)) {
- put_page(page);
+ folio_put(folio);
/*
* If a split of large page was required, it
* already happened when we marked the page invalid
diff --git a/mm/shmem.c b/mm/shmem.c
index 2f2e0e618072..02e62fccc80d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -78,6 +78,7 @@ static struct vfsmount *shm_mnt;
#include <uapi/linux/memfd.h>
#include <linux/rmap.h>
#include <linux/uuid.h>
+#include <linux/quotaops.h>
#include <linux/uaccess.h>
@@ -89,6 +90,9 @@ static struct vfsmount *shm_mnt;
/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20
+/* Pretend that one inode + its dentry occupy this much memory */
+#define BOGO_INODE_SIZE 1024
+
/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128
@@ -116,11 +120,14 @@ struct shmem_options {
int huge;
int seen;
bool noswap;
+ unsigned short quota_types;
+ struct shmem_quota_limits qlimits;
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
#define SHMEM_SEEN_NOSWAP 16
+#define SHMEM_SEEN_QUOTA 32
};
#ifdef CONFIG_TMPFS
@@ -133,7 +140,8 @@ static unsigned long shmem_default_max_inodes(void)
{
unsigned long nr_pages = totalram_pages();
- return min(nr_pages - totalhigh_pages(), nr_pages / 2);
+ return min3(nr_pages - totalhigh_pages(), nr_pages / 2,
+ ULONG_MAX / BOGO_INODE_SIZE);
}
#endif
@@ -199,33 +207,47 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}
-static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
+static int shmem_inode_acct_block(struct inode *inode, long pages)
{
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ int err = -ENOSPC;
if (shmem_acct_block(info->flags, pages))
- return false;
+ return err;
+ might_sleep(); /* when quotas */
if (sbinfo->max_blocks) {
if (percpu_counter_compare(&sbinfo->used_blocks,
sbinfo->max_blocks - pages) > 0)
goto unacct;
+
+ err = dquot_alloc_block_nodirty(inode, pages);
+ if (err)
+ goto unacct;
+
percpu_counter_add(&sbinfo->used_blocks, pages);
+ } else {
+ err = dquot_alloc_block_nodirty(inode, pages);
+ if (err)
+ goto unacct;
}
- return true;
+ return 0;
unacct:
shmem_unacct_blocks(info->flags, pages);
- return false;
+ return err;
}
-static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
+static void shmem_inode_unacct_blocks(struct inode *inode, long pages)
{
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ might_sleep(); /* when quotas */
+ dquot_free_block_nodirty(inode, pages);
+
if (sbinfo->max_blocks)
percpu_counter_sub(&sbinfo->used_blocks, pages);
shmem_unacct_blocks(info->flags, pages);
@@ -254,6 +276,47 @@ bool vma_is_shmem(struct vm_area_struct *vma)
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
+#ifdef CONFIG_TMPFS_QUOTA
+
+static int shmem_enable_quotas(struct super_block *sb,
+ unsigned short quota_types)
+{
+ int type, err = 0;
+
+ sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
+ for (type = 0; type < SHMEM_MAXQUOTAS; type++) {
+ if (!(quota_types & (1 << type)))
+ continue;
+ err = dquot_load_quota_sb(sb, type, QFMT_SHMEM,
+ DQUOT_USAGE_ENABLED |
+ DQUOT_LIMITS_ENABLED);
+ if (err)
+ goto out_err;
+ }
+ return 0;
+
+out_err:
+ pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n",
+ type, err);
+ for (type--; type >= 0; type--)
+ dquot_quota_off(sb, type);
+ return err;
+}
+
+static void shmem_disable_quotas(struct super_block *sb)
+{
+ int type;
+
+ for (type = 0; type < SHMEM_MAXQUOTAS; type++)
+ dquot_quota_off(sb, type);
+}
+
+static struct dquot **shmem_get_dquots(struct inode *inode)
+{
+ return SHMEM_I(inode)->i_dquot;
+}
+#endif /* CONFIG_TMPFS_QUOTA */
+
/*
* shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
* produces a novel ino for the newly allocated inode.
@@ -272,11 +335,11 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
if (!(sb->s_flags & SB_KERNMOUNT)) {
raw_spin_lock(&sbinfo->stat_lock);
if (sbinfo->max_inodes) {
- if (!sbinfo->free_inodes) {
+ if (sbinfo->free_ispace < BOGO_INODE_SIZE) {
raw_spin_unlock(&sbinfo->stat_lock);
return -ENOSPC;
}
- sbinfo->free_inodes--;
+ sbinfo->free_ispace -= BOGO_INODE_SIZE;
}
if (inop) {
ino = sbinfo->next_ino++;
@@ -330,12 +393,12 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
return 0;
}
-static void shmem_free_inode(struct super_block *sb)
+static void shmem_free_inode(struct super_block *sb, size_t freed_ispace)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
if (sbinfo->max_inodes) {
raw_spin_lock(&sbinfo->stat_lock);
- sbinfo->free_inodes++;
+ sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace;
raw_spin_unlock(&sbinfo->stat_lock);
}
}
@@ -343,62 +406,65 @@ static void shmem_free_inode(struct super_block *sb)
/**
* shmem_recalc_inode - recalculate the block usage of an inode
* @inode: inode to recalc
+ * @alloced: the change in number of pages allocated to inode
+ * @swapped: the change in number of pages swapped from inode
*
* We have to calculate the free blocks since the mm can drop
* undirtied hole pages behind our back.
*
* But normally info->alloced == inode->i_mapping->nrpages + info->swapped
* So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
- *
- * It has to be called with the spinlock held.
*/
-static void shmem_recalc_inode(struct inode *inode)
+static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
{
struct shmem_inode_info *info = SHMEM_I(inode);
long freed;
- freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
- if (freed > 0) {
+ spin_lock(&info->lock);
+ info->alloced += alloced;
+ info->swapped += swapped;
+ freed = info->alloced - info->swapped -
+ READ_ONCE(inode->i_mapping->nrpages);
+ /*
+ * Special case: whereas normally shmem_recalc_inode() is called
+ * after i_mapping->nrpages has already been adjusted (up or down),
+ * shmem_writepage() has to raise swapped before nrpages is lowered -
+ * to stop a racing shmem_recalc_inode() from thinking that a page has
+ * been freed. Compensate here, to avoid the need for a followup call.
+ */
+ if (swapped > 0)
+ freed += swapped;
+ if (freed > 0)
info->alloced -= freed;
- inode->i_blocks -= freed * BLOCKS_PER_PAGE;
+ spin_unlock(&info->lock);
+
+ /* The quota case may block */
+ if (freed > 0)
shmem_inode_unacct_blocks(inode, freed);
- }
}
bool shmem_charge(struct inode *inode, long pages)
{
- struct shmem_inode_info *info = SHMEM_I(inode);
- unsigned long flags;
+ struct address_space *mapping = inode->i_mapping;
- if (!shmem_inode_acct_block(inode, pages))
+ if (shmem_inode_acct_block(inode, pages))
return false;
/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
- inode->i_mapping->nrpages += pages;
-
- spin_lock_irqsave(&info->lock, flags);
- info->alloced += pages;
- inode->i_blocks += pages * BLOCKS_PER_PAGE;
- shmem_recalc_inode(inode);
- spin_unlock_irqrestore(&info->lock, flags);
+ xa_lock_irq(&mapping->i_pages);
+ mapping->nrpages += pages;
+ xa_unlock_irq(&mapping->i_pages);
+ shmem_recalc_inode(inode, pages, 0);
return true;
}
void shmem_uncharge(struct inode *inode, long pages)
{
- struct shmem_inode_info *info = SHMEM_I(inode);
- unsigned long flags;
-
+ /* pages argument is currently unused: keep it to help debugging */
/* nrpages adjustment done by __filemap_remove_folio() or caller */
- spin_lock_irqsave(&info->lock, flags);
- info->alloced -= pages;
- inode->i_blocks -= pages * BLOCKS_PER_PAGE;
- shmem_recalc_inode(inode);
- spin_unlock_irqrestore(&info->lock, flags);
-
- shmem_inode_unacct_blocks(inode, pages);
+ shmem_recalc_inode(inode, 0, 0);
}
/*
@@ -806,14 +872,16 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
XA_STATE(xas, &mapping->i_pages, start);
struct page *page;
unsigned long swapped = 0;
+ unsigned long max = end - 1;
rcu_read_lock();
- xas_for_each(&xas, page, end - 1) {
+ xas_for_each(&xas, page, max) {
if (xas_retry(&xas, page))
continue;
if (xa_is_value(page))
swapped++;
-
+ if (xas.xa_index == max)
+ break;
if (need_resched()) {
xas_pause(&xas);
cond_resched_rcu();
@@ -970,7 +1038,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
same_folio = lend < folio_pos(folio) + folio_size(folio);
folio_mark_dirty(folio);
if (!truncate_inode_partial_folio(folio, lstart, lend)) {
- start = folio->index + folio_nr_pages(folio);
+ start = folio_next_index(folio);
if (same_folio)
end = folio->index;
}
@@ -1038,16 +1106,13 @@ whole_folios:
folio_batch_release(&fbatch);
}
- spin_lock_irq(&info->lock);
- info->swapped -= nr_swaps_freed;
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
+ shmem_recalc_inode(inode, 0, -nr_swaps_freed);
}
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
shmem_undo_range(inode, lstart, lend, false);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
inode_inc_iversion(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@ -1059,11 +1124,9 @@ static int shmem_getattr(struct mnt_idmap *idmap,
struct inode *inode = path->dentry->d_inode;
struct shmem_inode_info *info = SHMEM_I(inode);
- if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
- spin_lock_irq(&info->lock);
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
- }
+ if (info->alloced - info->swapped != inode->i_mapping->nrpages)
+ shmem_recalc_inode(inode, 0, 0);
+
if (info->fsflags & FS_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;
if (info->fsflags & FS_IMMUTABLE_FL)
@@ -1073,7 +1136,7 @@ static int shmem_getattr(struct mnt_idmap *idmap,
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- generic_fillattr(idmap, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
if (shmem_is_huge(inode, 0, false, NULL, 0))
stat->blksize = HPAGE_PMD_SIZE;
@@ -1140,13 +1203,28 @@ static int shmem_setattr(struct mnt_idmap *idmap,
}
}
+ if (is_quota_modification(idmap, inode, attr)) {
+ error = dquot_initialize(inode);
+ if (error)
+ return error;
+ }
+
+ /* Transfer quota accounting */
+ if (i_uid_needs_update(idmap, attr, inode) ||
+ i_gid_needs_update(idmap, attr, inode)) {
+ error = dquot_transfer(idmap, inode, attr);
+
+ if (error)
+ return error;
+ }
+
setattr_copy(idmap, inode, attr);
if (attr->ia_valid & ATTR_MODE)
error = posix_acl_chmod(idmap, dentry, inode->i_mode);
if (!error && update_ctime) {
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
if (update_mtime)
- inode->i_mtime = inode->i_ctime;
+ inode->i_mtime = inode_get_ctime(inode);
inode_inc_iversion(inode);
}
return error;
@@ -1156,6 +1234,7 @@ static void shmem_evict_inode(struct inode *inode)
{
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ size_t freed = 0;
if (shmem_mapping(inode->i_mapping)) {
shmem_unacct_size(info->flags, inode->i_size);
@@ -1182,10 +1261,14 @@ static void shmem_evict_inode(struct inode *inode)
}
}
- simple_xattrs_free(&info->xattrs);
+ simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL);
+ shmem_free_inode(inode->i_sb, freed);
WARN_ON(inode->i_blocks);
- shmem_free_inode(inode->i_sb);
clear_inode(inode);
+#ifdef CONFIG_TMPFS_QUOTA
+ dquot_free_inode(inode);
+ dquot_drop(inode);
+#endif
}
static int shmem_find_swap_entries(struct address_space *mapping,
@@ -1429,11 +1512,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (add_to_swap_cache(folio, swap,
__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
NULL) == 0) {
- spin_lock_irq(&info->lock);
- shmem_recalc_inode(inode);
- info->swapped++;
- spin_unlock_irq(&info->lock);
-
+ shmem_recalc_inode(inode, 0, 1);
swap_shmem_alloc(swap);
shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
@@ -1588,13 +1667,14 @@ static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
struct shmem_inode_info *info = SHMEM_I(inode);
struct folio *folio;
int nr;
- int err = -ENOSPC;
+ int err;
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
huge = false;
nr = huge ? HPAGE_PMD_NR : 1;
- if (!shmem_inode_acct_block(inode, nr))
+ err = shmem_inode_acct_block(inode, nr);
+ if (err)
goto failed;
if (huge)
@@ -1640,7 +1720,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
int error;
old = *foliop;
- entry = folio_swap_entry(old);
+ entry = old->swap;
swap_index = swp_offset(entry);
swap_mapping = swap_address_space(entry);
@@ -1661,7 +1741,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
__folio_set_locked(new);
__folio_set_swapbacked(new);
folio_mark_uptodate(new);
- folio_set_swap_entry(new, entry);
+ new->swap = entry;
folio_set_swapcache(new);
/*
@@ -1703,11 +1783,10 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
struct folio *folio, swp_entry_t swap)
{
struct address_space *mapping = inode->i_mapping;
- struct shmem_inode_info *info = SHMEM_I(inode);
swp_entry_t swapin_error;
void *old;
- swapin_error = make_swapin_error_entry();
+ swapin_error = make_poisoned_swp_entry();
old = xa_cmpxchg_irq(&mapping->i_pages, index,
swp_to_radix_entry(swap),
swp_to_radix_entry(swapin_error), 0);
@@ -1716,16 +1795,12 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
folio_wait_writeback(folio);
delete_from_swap_cache(folio);
- spin_lock_irq(&info->lock);
/*
- * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
- * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
- * shmem_evict_inode.
+ * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
+ * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
+ * in shmem_evict_inode().
*/
- info->alloced--;
- info->swapped--;
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
+ shmem_recalc_inode(inode, -1, -1);
swap_free(swap);
}
@@ -1752,7 +1827,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
swap = radix_to_swp_entry(*foliop);
*foliop = NULL;
- if (is_swapin_error_entry(swap))
+ if (is_poisoned_swp_entry(swap))
return -EIO;
si = get_swap_device(swap);
@@ -1783,7 +1858,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
/* We have to do this with folio locked to prevent races */
folio_lock(folio);
if (!folio_test_swapcache(folio) ||
- folio_swap_entry(folio).val != swap.val ||
+ folio->swap.val != swap.val ||
!shmem_confirm_swap(mapping, index, swap)) {
error = -EEXIST;
goto unlock;
@@ -1812,10 +1887,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
if (error)
goto failed;
- spin_lock_irq(&info->lock);
- info->swapped--;
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
+ shmem_recalc_inode(inode, 0, -1);
if (sgp == SGP_WRITE)
folio_mark_accessed(folio);
@@ -1980,13 +2052,9 @@ alloc_nohuge:
charge_mm);
if (error)
goto unacct;
- folio_add_lru(folio);
- spin_lock_irq(&info->lock);
- info->alloced += folio_nr_pages(folio);
- inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio);
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
+ folio_add_lru(folio);
+ shmem_recalc_inode(inode, folio_nr_pages(folio), 0);
alloced = true;
if (folio_test_pmd_mappable(folio) &&
@@ -2035,9 +2103,7 @@ clear:
if (alloced) {
folio_clear_dirty(folio);
filemap_remove_folio(folio);
- spin_lock_irq(&info->lock);
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
+ shmem_recalc_inode(inode, 0, 0);
}
error = -EINVAL;
goto unlock;
@@ -2063,9 +2129,7 @@ unlock:
folio_put(folio);
}
if (error == -ENOSPC && !once++) {
- spin_lock_irq(&info->lock);
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
+ shmem_recalc_inode(inode, 0, 0);
goto repeat;
}
if (error == -EEXIST)
@@ -2326,6 +2390,12 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
+static int shmem_file_open(struct inode *inode, struct file *file)
+{
+ file->f_mode |= FMODE_CAN_ODIRECT;
+ return generic_file_open(inode, file);
+}
+
#ifdef CONFIG_TMPFS_XATTR
static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
@@ -2355,77 +2425,127 @@ static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
#define shmem_initxattrs NULL
#endif
-static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb,
- struct inode *dir, umode_t mode, dev_t dev,
- unsigned long flags)
+static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode)
+{
+ return &SHMEM_I(inode)->dir_offsets;
+}
+
+static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
+ struct super_block *sb,
+ struct inode *dir, umode_t mode,
+ dev_t dev, unsigned long flags)
{
struct inode *inode;
struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
ino_t ino;
+ int err;
+
+ err = shmem_reserve_inode(sb, &ino);
+ if (err)
+ return ERR_PTR(err);
- if (shmem_reserve_inode(sb, &ino))
- return NULL;
inode = new_inode(sb);
- if (inode) {
- inode->i_ino = ino;
- inode_init_owner(idmap, inode, dir, mode);
- inode->i_blocks = 0;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
- inode->i_generation = get_random_u32();
- info = SHMEM_I(inode);
- memset(info, 0, (char *)inode - (char *)info);
- spin_lock_init(&info->lock);
- atomic_set(&info->stop_eviction, 0);
- info->seals = F_SEAL_SEAL;
- info->flags = flags & VM_NORESERVE;
- info->i_crtime = inode->i_mtime;
- info->fsflags = (dir == NULL) ? 0 :
- SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
- if (info->fsflags)
- shmem_set_inode_flags(inode, info->fsflags);
- INIT_LIST_HEAD(&info->shrinklist);
- INIT_LIST_HEAD(&info->swaplist);
- if (sbinfo->noswap)
- mapping_set_unevictable(inode->i_mapping);
- simple_xattrs_init(&info->xattrs);
- cache_no_acl(inode);
- mapping_set_large_folios(inode->i_mapping);
-
- switch (mode & S_IFMT) {
- default:
- inode->i_op = &shmem_special_inode_operations;
- init_special_inode(inode, mode, dev);
- break;
- case S_IFREG:
- inode->i_mapping->a_ops = &shmem_aops;
- inode->i_op = &shmem_inode_operations;
- inode->i_fop = &shmem_file_operations;
- mpol_shared_policy_init(&info->policy,
- shmem_get_sbmpol(sbinfo));
- break;
- case S_IFDIR:
- inc_nlink(inode);
- /* Some things misbehave if size == 0 on a directory */
- inode->i_size = 2 * BOGO_DIRENT_SIZE;
- inode->i_op = &shmem_dir_inode_operations;
- inode->i_fop = &simple_dir_operations;
- break;
- case S_IFLNK:
- /*
- * Must not load anything in the rbtree,
- * mpol_free_shared_policy will not be called.
- */
- mpol_shared_policy_init(&info->policy, NULL);
- break;
- }
+ if (!inode) {
+ shmem_free_inode(sb, 0);
+ return ERR_PTR(-ENOSPC);
+ }
+
+ inode->i_ino = ino;
+ inode_init_owner(idmap, inode, dir, mode);
+ inode->i_blocks = 0;
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ inode->i_generation = get_random_u32();
+ info = SHMEM_I(inode);
+ memset(info, 0, (char *)inode - (char *)info);
+ spin_lock_init(&info->lock);
+ atomic_set(&info->stop_eviction, 0);
+ info->seals = F_SEAL_SEAL;
+ info->flags = flags & VM_NORESERVE;
+ info->i_crtime = inode->i_mtime;
+ info->fsflags = (dir == NULL) ? 0 :
+ SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
+ if (info->fsflags)
+ shmem_set_inode_flags(inode, info->fsflags);
+ INIT_LIST_HEAD(&info->shrinklist);
+ INIT_LIST_HEAD(&info->swaplist);
+ INIT_LIST_HEAD(&info->swaplist);
+ if (sbinfo->noswap)
+ mapping_set_unevictable(inode->i_mapping);
+ simple_xattrs_init(&info->xattrs);
+ cache_no_acl(inode);
+ mapping_set_large_folios(inode->i_mapping);
+
+ switch (mode & S_IFMT) {
+ default:
+ inode->i_op = &shmem_special_inode_operations;
+ init_special_inode(inode, mode, dev);
+ break;
+ case S_IFREG:
+ inode->i_mapping->a_ops = &shmem_aops;
+ inode->i_op = &shmem_inode_operations;
+ inode->i_fop = &shmem_file_operations;
+ mpol_shared_policy_init(&info->policy,
+ shmem_get_sbmpol(sbinfo));
+ break;
+ case S_IFDIR:
+ inc_nlink(inode);
+ /* Some things misbehave if size == 0 on a directory */
+ inode->i_size = 2 * BOGO_DIRENT_SIZE;
+ inode->i_op = &shmem_dir_inode_operations;
+ inode->i_fop = &simple_offset_dir_operations;
+ simple_offset_init(shmem_get_offset_ctx(inode));
+ break;
+ case S_IFLNK:
+ /*
+ * Must not load anything in the rbtree,
+ * mpol_free_shared_policy will not be called.
+ */
+ mpol_shared_policy_init(&info->policy, NULL);
+ break;
+ }
+
+ lockdep_annotate_inode_mutex_key(inode);
+ return inode;
+}
- lockdep_annotate_inode_mutex_key(inode);
- } else
- shmem_free_inode(sb);
+#ifdef CONFIG_TMPFS_QUOTA
+static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
+ struct super_block *sb, struct inode *dir,
+ umode_t mode, dev_t dev, unsigned long flags)
+{
+ int err;
+ struct inode *inode;
+
+ inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
+ if (IS_ERR(inode))
+ return inode;
+
+ err = dquot_initialize(inode);
+ if (err)
+ goto errout;
+
+ err = dquot_alloc_inode(inode);
+ if (err) {
+ dquot_drop(inode);
+ goto errout;
+ }
return inode;
+
+errout:
+ inode->i_flags |= S_NOQUOTA;
+ iput(inode);
+ return ERR_PTR(err);
+}
+#else
+static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
+ struct super_block *sb, struct inode *dir,
+ umode_t mode, dev_t dev, unsigned long flags)
+{
+ return __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
}
+#endif /* CONFIG_TMPFS_QUOTA */
#ifdef CONFIG_USERFAULTFD
int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
@@ -2445,7 +2565,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
int ret;
pgoff_t max_off;
- if (!shmem_inode_acct_block(inode, 1)) {
+ if (shmem_inode_acct_block(inode, 1)) {
/*
* We may have got a page, returned -ENOENT triggering a retry,
* and now we find ourselves with -ENOMEM. Release the page, to
@@ -2527,12 +2647,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
if (ret)
goto out_delete_from_cache;
- spin_lock_irq(&info->lock);
- info->alloced++;
- inode->i_blocks += BLOCKS_PER_PAGE;
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
-
+ shmem_recalc_inode(inode, 1, 0);
folio_unlock(folio);
return 0;
out_delete_from_cache:
@@ -2731,6 +2846,28 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
return retval ? retval : error;
}
+static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ ssize_t ret;
+
+ inode_lock(inode);
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto unlock;
+ ret = file_remove_privs(file);
+ if (ret)
+ goto unlock;
+ ret = file_update_time(file);
+ if (ret)
+ goto unlock;
+ ret = generic_perform_write(iocb, from);
+unlock:
+ inode_unlock(inode);
+ return ret;
+}
+
static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
@@ -2796,7 +2933,8 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
if (*ppos >= i_size_read(inode))
break;
- error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, SGP_READ);
+ error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio,
+ SGP_READ);
if (error) {
if (error == -EINVAL)
error = 0;
@@ -2805,7 +2943,9 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
if (folio) {
folio_unlock(folio);
- if (folio_test_hwpoison(folio)) {
+ if (folio_test_hwpoison(folio) ||
+ (folio_test_large(folio) &&
+ folio_test_has_hwpoisoned(folio))) {
error = -EIO;
break;
}
@@ -2841,7 +2981,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
folio_put(folio);
folio = NULL;
} else {
- n = splice_zeropage_into_pipe(pipe, *ppos, len);
+ n = splice_zeropage_into_pipe(pipe, *ppos, part);
}
if (!n)
@@ -3052,7 +3192,7 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
}
if (sbinfo->max_inodes) {
buf->f_files = sbinfo->max_inodes;
- buf->f_ffree = sbinfo->free_inodes;
+ buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE;
}
/* else leave those fields 0 like simple_statfs */
@@ -3069,27 +3209,32 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode *inode;
- int error = -ENOSPC;
+ int error;
inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
- if (inode) {
- error = simple_acl_create(dir, inode);
- if (error)
- goto out_iput;
- error = security_inode_init_security(inode, dir,
- &dentry->d_name,
- shmem_initxattrs, NULL);
- if (error && error != -EOPNOTSUPP)
- goto out_iput;
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
- error = 0;
- dir->i_size += BOGO_DIRENT_SIZE;
- dir->i_ctime = dir->i_mtime = current_time(dir);
- inode_inc_iversion(dir);
- d_instantiate(dentry, inode);
- dget(dentry); /* Extra count - pin the dentry in core */
- }
+ error = simple_acl_create(dir, inode);
+ if (error)
+ goto out_iput;
+ error = security_inode_init_security(inode, dir,
+ &dentry->d_name,
+ shmem_initxattrs, NULL);
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
+
+ error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
+ if (error)
+ goto out_iput;
+
+ dir->i_size += BOGO_DIRENT_SIZE;
+ dir->i_mtime = inode_set_ctime_current(dir);
+ inode_inc_iversion(dir);
+ d_instantiate(dentry, inode);
+ dget(dentry); /* Extra count - pin the dentry in core */
return error;
+
out_iput:
iput(inode);
return error;
@@ -3100,20 +3245,26 @@ shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
struct file *file, umode_t mode)
{
struct inode *inode;
- int error = -ENOSPC;
+ int error;
inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
- if (inode) {
- error = security_inode_init_security(inode, dir,
- NULL,
- shmem_initxattrs, NULL);
- if (error && error != -EOPNOTSUPP)
- goto out_iput;
- error = simple_acl_create(dir, inode);
- if (error)
- goto out_iput;
- d_tmpfile(file, inode);
+
+ if (IS_ERR(inode)) {
+ error = PTR_ERR(inode);
+ goto err_out;
}
+
+ error = security_inode_init_security(inode, dir,
+ NULL,
+ shmem_initxattrs, NULL);
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
+ error = simple_acl_create(dir, inode);
+ if (error)
+ goto out_iput;
+ d_tmpfile(file, inode);
+
+err_out:
return finish_open_simple(file, error);
out_iput:
iput(inode);
@@ -3159,8 +3310,16 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
goto out;
}
+ ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
+ if (ret) {
+ if (inode->i_nlink)
+ shmem_free_inode(inode->i_sb, 0);
+ goto out;
+ }
+
dir->i_size += BOGO_DIRENT_SIZE;
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ dir->i_mtime = inode_set_ctime_to_ts(dir,
+ inode_set_ctime_current(inode));
inode_inc_iversion(dir);
inc_nlink(inode);
ihold(inode); /* New dentry reference */
@@ -3175,10 +3334,13 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode = d_inode(dentry);
if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
- shmem_free_inode(inode->i_sb);
+ shmem_free_inode(inode->i_sb, 0);
+
+ simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
dir->i_size -= BOGO_DIRENT_SIZE;
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ dir->i_mtime = inode_set_ctime_to_ts(dir,
+ inode_set_ctime_current(inode));
inode_inc_iversion(dir);
drop_nlink(inode);
dput(dentry); /* Undo the count from "create" - this does all the work */
@@ -3235,24 +3397,29 @@ static int shmem_rename2(struct mnt_idmap *idmap,
{
struct inode *inode = d_inode(old_dentry);
int they_are_dirs = S_ISDIR(inode->i_mode);
+ int error;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
if (flags & RENAME_EXCHANGE)
- return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
+ return simple_offset_rename_exchange(old_dir, old_dentry,
+ new_dir, new_dentry);
if (!simple_empty(new_dentry))
return -ENOTEMPTY;
if (flags & RENAME_WHITEOUT) {
- int error;
-
error = shmem_whiteout(idmap, old_dir, old_dentry);
if (error)
return error;
}
+ simple_offset_remove(shmem_get_offset_ctx(old_dir), old_dentry);
+ error = simple_offset_add(shmem_get_offset_ctx(new_dir), old_dentry);
+ if (error)
+ return error;
+
if (d_really_is_positive(new_dentry)) {
(void) shmem_unlink(new_dir, new_dentry);
if (they_are_dirs) {
@@ -3266,9 +3433,7 @@ static int shmem_rename2(struct mnt_idmap *idmap,
old_dir->i_size -= BOGO_DIRENT_SIZE;
new_dir->i_size += BOGO_DIRENT_SIZE;
- old_dir->i_ctime = old_dir->i_mtime =
- new_dir->i_ctime = new_dir->i_mtime =
- inode->i_ctime = current_time(old_dir);
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
inode_inc_iversion(old_dir);
inode_inc_iversion(new_dir);
return 0;
@@ -3288,31 +3453,32 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
VM_NORESERVE);
- if (!inode)
- return -ENOSPC;
+
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
error = security_inode_init_security(inode, dir, &dentry->d_name,
shmem_initxattrs, NULL);
- if (error && error != -EOPNOTSUPP) {
- iput(inode);
- return error;
- }
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
+
+ error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
+ if (error)
+ goto out_iput;
inode->i_size = len-1;
if (len <= SHORT_SYMLINK_LEN) {
inode->i_link = kmemdup(symname, len, GFP_KERNEL);
if (!inode->i_link) {
- iput(inode);
- return -ENOMEM;
+ error = -ENOMEM;
+ goto out_remove_offset;
}
inode->i_op = &shmem_short_symlink_operations;
} else {
inode_nohighmem(inode);
error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
- if (error) {
- iput(inode);
- return error;
- }
+ if (error)
+ goto out_remove_offset;
inode->i_mapping->a_ops = &shmem_aops;
inode->i_op = &shmem_symlink_inode_operations;
memcpy(folio_address(folio), symname, len);
@@ -3322,11 +3488,17 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
folio_put(folio);
}
dir->i_size += BOGO_DIRENT_SIZE;
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
inode_inc_iversion(dir);
d_instantiate(dentry, inode);
dget(dentry);
return 0;
+
+out_remove_offset:
+ simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
+out_iput:
+ iput(inode);
+ return error;
}
static void shmem_put_link(void *arg)
@@ -3394,7 +3566,7 @@ static int shmem_fileattr_set(struct mnt_idmap *idmap,
(fa->flags & SHMEM_FL_USER_MODIFIABLE);
shmem_set_inode_flags(inode, info->fsflags);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_iversion(inode);
return 0;
}
@@ -3414,21 +3586,40 @@ static int shmem_initxattrs(struct inode *inode,
void *fs_info)
{
struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
const struct xattr *xattr;
struct simple_xattr *new_xattr;
+ size_t ispace = 0;
size_t len;
+ if (sbinfo->max_inodes) {
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ ispace += simple_xattr_space(xattr->name,
+ xattr->value_len + XATTR_SECURITY_PREFIX_LEN);
+ }
+ if (ispace) {
+ raw_spin_lock(&sbinfo->stat_lock);
+ if (sbinfo->free_ispace < ispace)
+ ispace = 0;
+ else
+ sbinfo->free_ispace -= ispace;
+ raw_spin_unlock(&sbinfo->stat_lock);
+ if (!ispace)
+ return -ENOSPC;
+ }
+ }
+
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
if (!new_xattr)
- return -ENOMEM;
+ break;
len = strlen(xattr->name) + 1;
new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!new_xattr->name) {
kvfree(new_xattr);
- return -ENOMEM;
+ break;
}
memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
@@ -3439,6 +3630,16 @@ static int shmem_initxattrs(struct inode *inode,
simple_xattr_add(&info->xattrs, new_xattr);
}
+ if (xattr->name != NULL) {
+ if (ispace) {
+ raw_spin_lock(&sbinfo->stat_lock);
+ sbinfo->free_ispace += ispace;
+ raw_spin_unlock(&sbinfo->stat_lock);
+ }
+ simple_xattrs_free(&info->xattrs, NULL);
+ return -ENOMEM;
+ }
+
return 0;
}
@@ -3459,15 +3660,40 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler,
size_t size, int flags)
{
struct shmem_inode_info *info = SHMEM_I(inode);
- int err;
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ struct simple_xattr *old_xattr;
+ size_t ispace = 0;
name = xattr_full_name(handler, name);
- err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
- if (!err) {
- inode->i_ctime = current_time(inode);
+ if (value && sbinfo->max_inodes) {
+ ispace = simple_xattr_space(name, size);
+ raw_spin_lock(&sbinfo->stat_lock);
+ if (sbinfo->free_ispace < ispace)
+ ispace = 0;
+ else
+ sbinfo->free_ispace -= ispace;
+ raw_spin_unlock(&sbinfo->stat_lock);
+ if (!ispace)
+ return -ENOSPC;
+ }
+
+ old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags);
+ if (!IS_ERR(old_xattr)) {
+ ispace = 0;
+ if (old_xattr && sbinfo->max_inodes)
+ ispace = simple_xattr_space(old_xattr->name,
+ old_xattr->size);
+ simple_xattr_free(old_xattr);
+ old_xattr = NULL;
+ inode_set_ctime_current(inode);
inode_inc_iversion(inode);
}
- return err;
+ if (ispace) {
+ raw_spin_lock(&sbinfo->stat_lock);
+ sbinfo->free_ispace += ispace;
+ raw_spin_unlock(&sbinfo->stat_lock);
+ }
+ return PTR_ERR(old_xattr);
}
static const struct xattr_handler shmem_security_xattr_handler = {
@@ -3482,9 +3708,16 @@ static const struct xattr_handler shmem_trusted_xattr_handler = {
.set = shmem_xattr_handler_set,
};
+static const struct xattr_handler shmem_user_xattr_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = shmem_xattr_handler_get,
+ .set = shmem_xattr_handler_set,
+};
+
static const struct xattr_handler *shmem_xattr_handlers[] = {
&shmem_security_xattr_handler,
&shmem_trusted_xattr_handler,
+ &shmem_user_xattr_handler,
NULL
};
@@ -3497,6 +3730,7 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
static const struct inode_operations shmem_short_symlink_operations = {
.getattr = shmem_getattr,
+ .setattr = shmem_setattr,
.get_link = simple_get_link,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
@@ -3505,6 +3739,7 @@ static const struct inode_operations shmem_short_symlink_operations = {
static const struct inode_operations shmem_symlink_inode_operations = {
.getattr = shmem_getattr,
+ .setattr = shmem_setattr,
.get_link = shmem_get_link,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
@@ -3604,6 +3839,13 @@ enum shmem_param {
Opt_inode32,
Opt_inode64,
Opt_noswap,
+ Opt_quota,
+ Opt_usrquota,
+ Opt_grpquota,
+ Opt_usrquota_block_hardlimit,
+ Opt_usrquota_inode_hardlimit,
+ Opt_grpquota_block_hardlimit,
+ Opt_grpquota_inode_hardlimit,
};
static const struct constant_table shmem_param_enums_huge[] = {
@@ -3626,6 +3868,15 @@ const struct fs_parameter_spec shmem_fs_parameters[] = {
fsparam_flag ("inode32", Opt_inode32),
fsparam_flag ("inode64", Opt_inode64),
fsparam_flag ("noswap", Opt_noswap),
+#ifdef CONFIG_TMPFS_QUOTA
+ fsparam_flag ("quota", Opt_quota),
+ fsparam_flag ("usrquota", Opt_usrquota),
+ fsparam_flag ("grpquota", Opt_grpquota),
+ fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit),
+ fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit),
+ fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit),
+ fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit),
+#endif
{}
};
@@ -3636,6 +3887,8 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
unsigned long long size;
char *rest;
int opt;
+ kuid_t kuid;
+ kgid_t kgid;
opt = fs_parse(fc, shmem_fs_parameters, param, &result);
if (opt < 0)
@@ -3657,13 +3910,13 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_nr_blocks:
ctx->blocks = memparse(param->string, &rest);
- if (*rest || ctx->blocks > S64_MAX)
+ if (*rest || ctx->blocks > LONG_MAX)
goto bad_value;
ctx->seen |= SHMEM_SEEN_BLOCKS;
break;
case Opt_nr_inodes:
ctx->inodes = memparse(param->string, &rest);
- if (*rest)
+ if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE)
goto bad_value;
ctx->seen |= SHMEM_SEEN_INODES;
break;
@@ -3671,14 +3924,32 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
ctx->mode = result.uint_32 & 07777;
break;
case Opt_uid:
- ctx->uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(ctx->uid))
+ kuid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(kuid))
+ goto bad_value;
+
+ /*
+ * The requested uid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kuid_has_mapping(fc->user_ns, kuid))
goto bad_value;
+
+ ctx->uid = kuid;
break;
case Opt_gid:
- ctx->gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(ctx->gid))
+ kgid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(kgid))
+ goto bad_value;
+
+ /*
+ * The requested gid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kgid_has_mapping(fc->user_ns, kgid))
goto bad_value;
+
+ ctx->gid = kgid;
break;
case Opt_huge:
ctx->huge = result.uint_32;
@@ -3717,6 +3988,60 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
ctx->noswap = true;
ctx->seen |= SHMEM_SEEN_NOSWAP;
break;
+ case Opt_quota:
+ if (fc->user_ns != &init_user_ns)
+ return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
+ ctx->seen |= SHMEM_SEEN_QUOTA;
+ ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP);
+ break;
+ case Opt_usrquota:
+ if (fc->user_ns != &init_user_ns)
+ return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
+ ctx->seen |= SHMEM_SEEN_QUOTA;
+ ctx->quota_types |= QTYPE_MASK_USR;
+ break;
+ case Opt_grpquota:
+ if (fc->user_ns != &init_user_ns)
+ return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
+ ctx->seen |= SHMEM_SEEN_QUOTA;
+ ctx->quota_types |= QTYPE_MASK_GRP;
+ break;
+ case Opt_usrquota_block_hardlimit:
+ size = memparse(param->string, &rest);
+ if (*rest || !size)
+ goto bad_value;
+ if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
+ return invalfc(fc,
+ "User quota block hardlimit too large.");
+ ctx->qlimits.usrquota_bhardlimit = size;
+ break;
+ case Opt_grpquota_block_hardlimit:
+ size = memparse(param->string, &rest);
+ if (*rest || !size)
+ goto bad_value;
+ if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
+ return invalfc(fc,
+ "Group quota block hardlimit too large.");
+ ctx->qlimits.grpquota_bhardlimit = size;
+ break;
+ case Opt_usrquota_inode_hardlimit:
+ size = memparse(param->string, &rest);
+ if (*rest || !size)
+ goto bad_value;
+ if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
+ return invalfc(fc,
+ "User quota inode hardlimit too large.");
+ ctx->qlimits.usrquota_ihardlimit = size;
+ break;
+ case Opt_grpquota_inode_hardlimit:
+ size = memparse(param->string, &rest);
+ if (*rest || !size)
+ goto bad_value;
+ if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
+ return invalfc(fc,
+ "Group quota inode hardlimit too large.");
+ ctx->qlimits.grpquota_ihardlimit = size;
+ break;
}
return 0;
@@ -3772,21 +4097,17 @@ static int shmem_parse_options(struct fs_context *fc, void *data)
/*
* Reconfigure a shmem filesystem.
- *
- * Note that we disallow change from limited->unlimited blocks/inodes while any
- * are in use; but we must separately disallow unlimited->limited, because in
- * that case we have no record of how much is already in use.
*/
static int shmem_reconfigure(struct fs_context *fc)
{
struct shmem_options *ctx = fc->fs_private;
struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
- unsigned long inodes;
+ unsigned long used_isp;
struct mempolicy *mpol = NULL;
const char *err;
raw_spin_lock(&sbinfo->stat_lock);
- inodes = sbinfo->max_inodes - sbinfo->free_inodes;
+ used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace;
if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
if (!sbinfo->max_blocks) {
@@ -3804,7 +4125,7 @@ static int shmem_reconfigure(struct fs_context *fc)
err = "Cannot retroactively limit inodes";
goto out;
}
- if (ctx->inodes < inodes) {
+ if (ctx->inodes * BOGO_INODE_SIZE < used_isp) {
err = "Too few inodes for current use";
goto out;
}
@@ -3824,6 +4145,24 @@ static int shmem_reconfigure(struct fs_context *fc)
goto out;
}
+ if (ctx->seen & SHMEM_SEEN_QUOTA &&
+ !sb_any_quota_loaded(fc->root->d_sb)) {
+ err = "Cannot enable quota on remount";
+ goto out;
+ }
+
+#ifdef CONFIG_TMPFS_QUOTA
+#define CHANGED_LIMIT(name) \
+ (ctx->qlimits.name## hardlimit && \
+ (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit))
+
+ if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) ||
+ CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) {
+ err = "Cannot change global quota limit on remount";
+ goto out;
+ }
+#endif /* CONFIG_TMPFS_QUOTA */
+
if (ctx->seen & SHMEM_SEEN_HUGE)
sbinfo->huge = ctx->huge;
if (ctx->seen & SHMEM_SEEN_INUMS)
@@ -3832,7 +4171,7 @@ static int shmem_reconfigure(struct fs_context *fc)
sbinfo->max_blocks = ctx->blocks;
if (ctx->seen & SHMEM_SEEN_INODES) {
sbinfo->max_inodes = ctx->inodes;
- sbinfo->free_inodes = ctx->inodes - inodes;
+ sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp;
}
/*
@@ -3861,8 +4200,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
struct mempolicy *mpol;
if (sbinfo->max_blocks != shmem_default_max_blocks())
- seq_printf(seq, ",size=%luk",
- sbinfo->max_blocks << (PAGE_SHIFT - 10));
+ seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks));
if (sbinfo->max_inodes != shmem_default_max_inodes())
seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
if (sbinfo->mode != (0777 | S_ISVTX))
@@ -3915,6 +4253,9 @@ static void shmem_put_super(struct super_block *sb)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+#ifdef CONFIG_TMPFS_QUOTA
+ shmem_disable_quotas(sb);
+#endif
free_percpu(sbinfo->ino_batch);
percpu_counter_destroy(&sbinfo->used_blocks);
mpol_put(sbinfo->mpol);
@@ -3927,12 +4268,13 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
struct shmem_options *ctx = fc->fs_private;
struct inode *inode;
struct shmem_sb_info *sbinfo;
+ int error = -ENOMEM;
/* Round up to L1_CACHE_BYTES to resist false sharing */
sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
L1_CACHE_BYTES), GFP_KERNEL);
if (!sbinfo)
- return -ENOMEM;
+ return error;
sb->s_fs_info = sbinfo;
@@ -3959,7 +4301,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_flags |= SB_NOUSER;
#endif
sbinfo->max_blocks = ctx->blocks;
- sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
+ sbinfo->max_inodes = ctx->inodes;
+ sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE;
if (sb->s_flags & SB_KERNMOUNT) {
sbinfo->ino_batch = alloc_percpu(ino_t);
if (!sbinfo->ino_batch)
@@ -3993,10 +4336,27 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
#endif
uuid_gen(&sb->s_uuid);
+#ifdef CONFIG_TMPFS_QUOTA
+ if (ctx->seen & SHMEM_SEEN_QUOTA) {
+ sb->dq_op = &shmem_quota_operations;
+ sb->s_qcop = &dquot_quotactl_sysfile_ops;
+ sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
+
+ /* Copy the default limits from ctx into sbinfo */
+ memcpy(&sbinfo->qlimits, &ctx->qlimits,
+ sizeof(struct shmem_quota_limits));
+
+ if (shmem_enable_quotas(sb, ctx->quota_types))
+ goto failed;
+ }
+#endif /* CONFIG_TMPFS_QUOTA */
+
inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0,
VM_NORESERVE);
- if (!inode)
+ if (IS_ERR(inode)) {
+ error = PTR_ERR(inode);
goto failed;
+ }
inode->i_uid = sbinfo->uid;
inode->i_gid = sbinfo->gid;
sb->s_root = d_make_root(inode);
@@ -4006,7 +4366,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
failed:
shmem_put_super(sb);
- return -ENOMEM;
+ return error;
}
static int shmem_get_tree(struct fs_context *fc)
@@ -4056,6 +4416,8 @@ static void shmem_destroy_inode(struct inode *inode)
{
if (S_ISREG(inode->i_mode))
mpol_free_shared_policy(&SHMEM_I(inode)->policy);
+ if (S_ISDIR(inode->i_mode))
+ simple_offset_destroy(shmem_get_offset_ctx(inode));
}
static void shmem_init_inode(void *foo)
@@ -4099,12 +4461,12 @@ EXPORT_SYMBOL(shmem_aops);
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
- .open = generic_file_open,
+ .open = shmem_file_open,
.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
.llseek = shmem_file_llseek,
.read_iter = shmem_file_read_iter,
- .write_iter = generic_file_write_iter,
+ .write_iter = shmem_file_write_iter,
.fsync = noop_fsync,
.splice_read = shmem_file_splice_read,
.splice_write = iter_file_splice_write,
@@ -4136,6 +4498,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
.mknod = shmem_mknod,
.rename = shmem_rename2,
.tmpfile = shmem_tmpfile,
+ .get_offset_ctx = shmem_get_offset_ctx,
#endif
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
@@ -4167,6 +4530,9 @@ static const struct super_operations shmem_ops = {
.statfs = shmem_statfs,
.show_options = shmem_show_options,
#endif
+#ifdef CONFIG_TMPFS_QUOTA
+ .get_dquots = shmem_get_dquots,
+#endif
.evict_inode = shmem_evict_inode,
.drop_inode = generic_delete_inode,
.put_super = shmem_put_super,
@@ -4220,7 +4586,7 @@ static struct file_system_type shmem_fs_type = {
#endif
.kill_sb = kill_litter_super,
#ifdef CONFIG_SHMEM
- .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
+ .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME,
#else
.fs_flags = FS_USERNS_MOUNT,
#endif
@@ -4232,6 +4598,14 @@ void __init shmem_init(void)
shmem_init_inodecache();
+#ifdef CONFIG_TMPFS_QUOTA
+ error = register_quota_format(&shmem_quota_format);
+ if (error < 0) {
+ pr_err("Could not register quota format\n");
+ goto out3;
+ }
+#endif
+
error = register_filesystem(&shmem_fs_type);
if (error) {
pr_err("Could not register tmpfs\n");
@@ -4256,6 +4630,10 @@ void __init shmem_init(void)
out1:
unregister_filesystem(&shmem_fs_type);
out2:
+#ifdef CONFIG_TMPFS_QUOTA
+ unregister_quota_format(&shmem_quota_format);
+out3:
+#endif
shmem_destroy_inodecache();
shm_mnt = ERR_PTR(error);
}
@@ -4375,10 +4753,16 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
#define shmem_vm_ops generic_file_vm_ops
#define shmem_anon_vm_ops generic_file_vm_ops
#define shmem_file_operations ramfs_file_operations
-#define shmem_get_inode(idmap, sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
#define shmem_acct_size(flags, size) 0
#define shmem_unacct_size(flags, size) do {} while (0)
+static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir,
+ umode_t mode, dev_t dev, unsigned long flags)
+{
+ struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
+ return inode ? inode : ERR_PTR(-ENOSPC);
+}
+
#endif /* CONFIG_SHMEM */
/* common code */
@@ -4403,9 +4787,10 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l
inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
S_IFREG | S_IRWXUGO, 0, flags);
- if (unlikely(!inode)) {
+
+ if (IS_ERR(inode)) {
shmem_unacct_size(flags, size);
- return ERR_PTR(-ENOSPC);
+ return ERR_CAST(inode);
}
inode->i_flags |= i_flags;
inode->i_size = size;
diff --git a/mm/shmem_quota.c b/mm/shmem_quota.c
new file mode 100644
index 000000000000..062d1c1097ae
--- /dev/null
+++ b/mm/shmem_quota.c
@@ -0,0 +1,350 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * In memory quota format relies on quota infrastructure to store dquot
+ * information for us. While conventional quota formats for file systems
+ * with persistent storage can load quota information into dquot from the
+ * storage on-demand and hence quota dquot shrinker can free any dquot
+ * that is not currently being used, it must be avoided here. Otherwise we
+ * can lose valuable information, user provided limits, because there is
+ * no persistent storage to load the information from afterwards.
+ *
+ * One information that in-memory quota format needs to keep track of is
+ * a sorted list of ids for each quota type. This is done by utilizing
+ * an rb tree which root is stored in mem_dqinfo->dqi_priv for each quota
+ * type.
+ *
+ * This format can be used to support quota on file system without persistent
+ * storage such as tmpfs.
+ *
+ * Author: Lukas Czerner <lczerner@redhat.com>
+ * Carlos Maiolino <cmaiolino@redhat.com>
+ *
+ * Copyright (C) 2023 Red Hat, Inc.
+ */
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/shmem_fs.h>
+
+#include <linux/quotaops.h>
+#include <linux/quota.h>
+
+#ifdef CONFIG_TMPFS_QUOTA
+
+/*
+ * The following constants define the amount of time given a user
+ * before the soft limits are treated as hard limits (usually resulting
+ * in an allocation failure). The timer is started when the user crosses
+ * their soft limit, it is reset when they go below their soft limit.
+ */
+#define SHMEM_MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */
+#define SHMEM_MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */
+
+struct quota_id {
+ struct rb_node node;
+ qid_t id;
+ qsize_t bhardlimit;
+ qsize_t bsoftlimit;
+ qsize_t ihardlimit;
+ qsize_t isoftlimit;
+};
+
+static int shmem_check_quota_file(struct super_block *sb, int type)
+{
+ /* There is no real quota file, nothing to do */
+ return 1;
+}
+
+/*
+ * There is no real quota file. Just allocate rb_root for quota ids and
+ * set limits
+ */
+static int shmem_read_file_info(struct super_block *sb, int type)
+{
+ struct quota_info *dqopt = sb_dqopt(sb);
+ struct mem_dqinfo *info = &dqopt->info[type];
+
+ info->dqi_priv = kzalloc(sizeof(struct rb_root), GFP_NOFS);
+ if (!info->dqi_priv)
+ return -ENOMEM;
+
+ info->dqi_max_spc_limit = SHMEM_QUOTA_MAX_SPC_LIMIT;
+ info->dqi_max_ino_limit = SHMEM_QUOTA_MAX_INO_LIMIT;
+
+ info->dqi_bgrace = SHMEM_MAX_DQ_TIME;
+ info->dqi_igrace = SHMEM_MAX_IQ_TIME;
+ info->dqi_flags = 0;
+
+ return 0;
+}
+
+static int shmem_write_file_info(struct super_block *sb, int type)
+{
+ /* There is no real quota file, nothing to do */
+ return 0;
+}
+
+/*
+ * Free all the quota_id entries in the rb tree and rb_root.
+ */
+static int shmem_free_file_info(struct super_block *sb, int type)
+{
+ struct mem_dqinfo *info = &sb_dqopt(sb)->info[type];
+ struct rb_root *root = info->dqi_priv;
+ struct quota_id *entry;
+ struct rb_node *node;
+
+ info->dqi_priv = NULL;
+ node = rb_first(root);
+ while (node) {
+ entry = rb_entry(node, struct quota_id, node);
+ node = rb_next(&entry->node);
+
+ rb_erase(&entry->node, root);
+ kfree(entry);
+ }
+
+ kfree(root);
+ return 0;
+}
+
+static int shmem_get_next_id(struct super_block *sb, struct kqid *qid)
+{
+ struct mem_dqinfo *info = sb_dqinfo(sb, qid->type);
+ struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node;
+ qid_t id = from_kqid(&init_user_ns, *qid);
+ struct quota_info *dqopt = sb_dqopt(sb);
+ struct quota_id *entry = NULL;
+ int ret = 0;
+
+ if (!sb_has_quota_active(sb, qid->type))
+ return -ESRCH;
+
+ down_read(&dqopt->dqio_sem);
+ while (node) {
+ entry = rb_entry(node, struct quota_id, node);
+
+ if (id < entry->id)
+ node = node->rb_left;
+ else if (id > entry->id)
+ node = node->rb_right;
+ else
+ goto got_next_id;
+ }
+
+ if (!entry) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ if (id > entry->id) {
+ node = rb_next(&entry->node);
+ if (!node) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+ entry = rb_entry(node, struct quota_id, node);
+ }
+
+got_next_id:
+ *qid = make_kqid(&init_user_ns, qid->type, entry->id);
+out_unlock:
+ up_read(&dqopt->dqio_sem);
+ return ret;
+}
+
+/*
+ * Load dquot with limits from existing entry, or create the new entry if
+ * it does not exist.
+ */
+static int shmem_acquire_dquot(struct dquot *dquot)
+{
+ struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type);
+ struct rb_node **n = &((struct rb_root *)info->dqi_priv)->rb_node;
+ struct shmem_sb_info *sbinfo = dquot->dq_sb->s_fs_info;
+ struct rb_node *parent = NULL, *new_node = NULL;
+ struct quota_id *new_entry, *entry;
+ qid_t id = from_kqid(&init_user_ns, dquot->dq_id);
+ struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
+ int ret = 0;
+
+ mutex_lock(&dquot->dq_lock);
+
+ down_write(&dqopt->dqio_sem);
+ while (*n) {
+ parent = *n;
+ entry = rb_entry(parent, struct quota_id, node);
+
+ if (id < entry->id)
+ n = &(*n)->rb_left;
+ else if (id > entry->id)
+ n = &(*n)->rb_right;
+ else
+ goto found;
+ }
+
+ /* We don't have entry for this id yet, create it */
+ new_entry = kzalloc(sizeof(struct quota_id), GFP_NOFS);
+ if (!new_entry) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ new_entry->id = id;
+ if (dquot->dq_id.type == USRQUOTA) {
+ new_entry->bhardlimit = sbinfo->qlimits.usrquota_bhardlimit;
+ new_entry->ihardlimit = sbinfo->qlimits.usrquota_ihardlimit;
+ } else if (dquot->dq_id.type == GRPQUOTA) {
+ new_entry->bhardlimit = sbinfo->qlimits.grpquota_bhardlimit;
+ new_entry->ihardlimit = sbinfo->qlimits.grpquota_ihardlimit;
+ }
+
+ new_node = &new_entry->node;
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, (struct rb_root *)info->dqi_priv);
+ entry = new_entry;
+
+found:
+ /* Load the stored limits from the tree */
+ spin_lock(&dquot->dq_dqb_lock);
+ dquot->dq_dqb.dqb_bhardlimit = entry->bhardlimit;
+ dquot->dq_dqb.dqb_bsoftlimit = entry->bsoftlimit;
+ dquot->dq_dqb.dqb_ihardlimit = entry->ihardlimit;
+ dquot->dq_dqb.dqb_isoftlimit = entry->isoftlimit;
+
+ if (!dquot->dq_dqb.dqb_bhardlimit &&
+ !dquot->dq_dqb.dqb_bsoftlimit &&
+ !dquot->dq_dqb.dqb_ihardlimit &&
+ !dquot->dq_dqb.dqb_isoftlimit)
+ set_bit(DQ_FAKE_B, &dquot->dq_flags);
+ spin_unlock(&dquot->dq_dqb_lock);
+
+ /* Make sure flags update is visible after dquot has been filled */
+ smp_mb__before_atomic();
+ set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
+out_unlock:
+ up_write(&dqopt->dqio_sem);
+ mutex_unlock(&dquot->dq_lock);
+ return ret;
+}
+
+static bool shmem_is_empty_dquot(struct dquot *dquot)
+{
+ struct shmem_sb_info *sbinfo = dquot->dq_sb->s_fs_info;
+ qsize_t bhardlimit;
+ qsize_t ihardlimit;
+
+ if (dquot->dq_id.type == USRQUOTA) {
+ bhardlimit = sbinfo->qlimits.usrquota_bhardlimit;
+ ihardlimit = sbinfo->qlimits.usrquota_ihardlimit;
+ } else if (dquot->dq_id.type == GRPQUOTA) {
+ bhardlimit = sbinfo->qlimits.grpquota_bhardlimit;
+ ihardlimit = sbinfo->qlimits.grpquota_ihardlimit;
+ }
+
+ if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
+ (dquot->dq_dqb.dqb_curspace == 0 &&
+ dquot->dq_dqb.dqb_curinodes == 0 &&
+ dquot->dq_dqb.dqb_bhardlimit == bhardlimit &&
+ dquot->dq_dqb.dqb_ihardlimit == ihardlimit))
+ return true;
+
+ return false;
+}
+/*
+ * Store limits from dquot in the tree unless it's fake. If it is fake
+ * remove the id from the tree since there is no useful information in
+ * there.
+ */
+static int shmem_release_dquot(struct dquot *dquot)
+{
+ struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type);
+ struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node;
+ qid_t id = from_kqid(&init_user_ns, dquot->dq_id);
+ struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
+ struct quota_id *entry = NULL;
+
+ mutex_lock(&dquot->dq_lock);
+ /* Check whether we are not racing with some other dqget() */
+ if (dquot_is_busy(dquot))
+ goto out_dqlock;
+
+ down_write(&dqopt->dqio_sem);
+ while (node) {
+ entry = rb_entry(node, struct quota_id, node);
+
+ if (id < entry->id)
+ node = node->rb_left;
+ else if (id > entry->id)
+ node = node->rb_right;
+ else
+ goto found;
+ }
+
+ /* We should always find the entry in the rb tree */
+ WARN_ONCE(1, "quota id %u from dquot %p, not in rb tree!\n", id, dquot);
+ up_write(&dqopt->dqio_sem);
+ mutex_unlock(&dquot->dq_lock);
+ return -ENOENT;
+
+found:
+ if (shmem_is_empty_dquot(dquot)) {
+ /* Remove entry from the tree */
+ rb_erase(&entry->node, info->dqi_priv);
+ kfree(entry);
+ } else {
+ /* Store the limits in the tree */
+ spin_lock(&dquot->dq_dqb_lock);
+ entry->bhardlimit = dquot->dq_dqb.dqb_bhardlimit;
+ entry->bsoftlimit = dquot->dq_dqb.dqb_bsoftlimit;
+ entry->ihardlimit = dquot->dq_dqb.dqb_ihardlimit;
+ entry->isoftlimit = dquot->dq_dqb.dqb_isoftlimit;
+ spin_unlock(&dquot->dq_dqb_lock);
+ }
+
+ clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
+ up_write(&dqopt->dqio_sem);
+
+out_dqlock:
+ mutex_unlock(&dquot->dq_lock);
+ return 0;
+}
+
+static int shmem_mark_dquot_dirty(struct dquot *dquot)
+{
+ return 0;
+}
+
+static int shmem_dquot_write_info(struct super_block *sb, int type)
+{
+ return 0;
+}
+
+static const struct quota_format_ops shmem_format_ops = {
+ .check_quota_file = shmem_check_quota_file,
+ .read_file_info = shmem_read_file_info,
+ .write_file_info = shmem_write_file_info,
+ .free_file_info = shmem_free_file_info,
+};
+
+struct quota_format_type shmem_quota_format = {
+ .qf_fmt_id = QFMT_SHMEM,
+ .qf_ops = &shmem_format_ops,
+ .qf_owner = THIS_MODULE
+};
+
+const struct dquot_operations shmem_quota_operations = {
+ .acquire_dquot = shmem_acquire_dquot,
+ .release_dquot = shmem_release_dquot,
+ .alloc_dquot = dquot_alloc,
+ .destroy_dquot = dquot_destroy,
+ .write_info = shmem_dquot_write_info,
+ .mark_dirty = shmem_mark_dquot_dirty,
+ .get_next_id = shmem_get_next_id,
+};
+#endif /* CONFIG_TMPFS_QUOTA */
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 01f8e9905817..4b888b18bdde 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -186,7 +186,7 @@ static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx)
* SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
* cpuset.
*/
-void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
+static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
{
unsigned long free_pcp = 0;
int cpu, nid;
@@ -251,9 +251,9 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i
" writeback:%lukB"
" shmem:%lukB"
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- " shmem_thp: %lukB"
- " shmem_pmdmapped: %lukB"
- " anon_thp: %lukB"
+ " shmem_thp:%lukB"
+ " shmem_pmdmapped:%lukB"
+ " anon_thp:%lukB"
#endif
" writeback_tmp:%lukB"
" kernel_stack:%lukB"
@@ -406,7 +406,7 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
struct zone *zone;
printk("Mem-Info:\n");
- __show_free_areas(filter, nodemask, max_zone_idx);
+ show_free_areas(filter, nodemask, max_zone_idx);
for_each_populated_zone(zone) {
diff --git a/mm/slab.c b/mm/slab.c
index 88194391d553..9ad3d0f2d1a5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1670,7 +1670,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
if (freelist_size > KMALLOC_MAX_CACHE_SIZE) {
freelist_cache_size = PAGE_SIZE << get_order(freelist_size);
} else {
- freelist_cache = kmalloc_slab(freelist_size, 0u);
+ freelist_cache = kmalloc_slab(freelist_size, 0u, _RET_IP_);
if (!freelist_cache)
continue;
freelist_cache_size = freelist_cache->size;
diff --git a/mm/slab.h b/mm/slab.h
index 9c0e09d0f81f..799a315695c6 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -282,7 +282,7 @@ void setup_kmalloc_cache_index_table(void);
void create_kmalloc_caches(slab_flags_t);
/* Find the kmalloc slab corresponding for a certain size */
-struct kmem_cache *kmalloc_slab(size_t, gfp_t);
+struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags, unsigned long caller);
void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags,
int node, size_t orig_size,
diff --git a/mm/slab_common.c b/mm/slab_common.c
index d1555ea2981a..cd71f9581e67 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -678,6 +678,11 @@ kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init =
{ /* initialization for https://bugs.llvm.org/show_bug.cgi?id=42570 */ };
EXPORT_SYMBOL(kmalloc_caches);
+#ifdef CONFIG_RANDOM_KMALLOC_CACHES
+unsigned long random_kmalloc_seed __ro_after_init;
+EXPORT_SYMBOL(random_kmalloc_seed);
+#endif
+
/*
* Conversion table for small slabs sizes / 8 to the index in the
* kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -720,7 +725,7 @@ static inline unsigned int size_index_elem(unsigned int bytes)
* Find the kmem_cache structure that serves a given size of
* allocation
*/
-struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
+struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags, unsigned long caller)
{
unsigned int index;
@@ -735,7 +740,7 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
index = fls(size - 1);
}
- return kmalloc_caches[kmalloc_type(flags)][index];
+ return kmalloc_caches[kmalloc_type(flags, caller)][index];
}
size_t kmalloc_size_roundup(size_t size)
@@ -752,8 +757,11 @@ size_t kmalloc_size_roundup(size_t size)
if (size > KMALLOC_MAX_CACHE_SIZE)
return PAGE_SIZE << get_order(size);
- /* The flags don't matter since size_index is common to all. */
- c = kmalloc_slab(size, GFP_KERNEL);
+ /*
+ * The flags don't matter since size_index is common to all.
+ * Neither does the caller for just getting ->object_size.
+ */
+ c = kmalloc_slab(size, GFP_KERNEL, 0);
return c ? c->object_size : 0;
}
EXPORT_SYMBOL(kmalloc_size_roundup);
@@ -776,12 +784,35 @@ EXPORT_SYMBOL(kmalloc_size_roundup);
#define KMALLOC_RCL_NAME(sz)
#endif
+#ifdef CONFIG_RANDOM_KMALLOC_CACHES
+#define __KMALLOC_RANDOM_CONCAT(a, b) a ## b
+#define KMALLOC_RANDOM_NAME(N, sz) __KMALLOC_RANDOM_CONCAT(KMA_RAND_, N)(sz)
+#define KMA_RAND_1(sz) .name[KMALLOC_RANDOM_START + 1] = "kmalloc-rnd-01-" #sz,
+#define KMA_RAND_2(sz) KMA_RAND_1(sz) .name[KMALLOC_RANDOM_START + 2] = "kmalloc-rnd-02-" #sz,
+#define KMA_RAND_3(sz) KMA_RAND_2(sz) .name[KMALLOC_RANDOM_START + 3] = "kmalloc-rnd-03-" #sz,
+#define KMA_RAND_4(sz) KMA_RAND_3(sz) .name[KMALLOC_RANDOM_START + 4] = "kmalloc-rnd-04-" #sz,
+#define KMA_RAND_5(sz) KMA_RAND_4(sz) .name[KMALLOC_RANDOM_START + 5] = "kmalloc-rnd-05-" #sz,
+#define KMA_RAND_6(sz) KMA_RAND_5(sz) .name[KMALLOC_RANDOM_START + 6] = "kmalloc-rnd-06-" #sz,
+#define KMA_RAND_7(sz) KMA_RAND_6(sz) .name[KMALLOC_RANDOM_START + 7] = "kmalloc-rnd-07-" #sz,
+#define KMA_RAND_8(sz) KMA_RAND_7(sz) .name[KMALLOC_RANDOM_START + 8] = "kmalloc-rnd-08-" #sz,
+#define KMA_RAND_9(sz) KMA_RAND_8(sz) .name[KMALLOC_RANDOM_START + 9] = "kmalloc-rnd-09-" #sz,
+#define KMA_RAND_10(sz) KMA_RAND_9(sz) .name[KMALLOC_RANDOM_START + 10] = "kmalloc-rnd-10-" #sz,
+#define KMA_RAND_11(sz) KMA_RAND_10(sz) .name[KMALLOC_RANDOM_START + 11] = "kmalloc-rnd-11-" #sz,
+#define KMA_RAND_12(sz) KMA_RAND_11(sz) .name[KMALLOC_RANDOM_START + 12] = "kmalloc-rnd-12-" #sz,
+#define KMA_RAND_13(sz) KMA_RAND_12(sz) .name[KMALLOC_RANDOM_START + 13] = "kmalloc-rnd-13-" #sz,
+#define KMA_RAND_14(sz) KMA_RAND_13(sz) .name[KMALLOC_RANDOM_START + 14] = "kmalloc-rnd-14-" #sz,
+#define KMA_RAND_15(sz) KMA_RAND_14(sz) .name[KMALLOC_RANDOM_START + 15] = "kmalloc-rnd-15-" #sz,
+#else // CONFIG_RANDOM_KMALLOC_CACHES
+#define KMALLOC_RANDOM_NAME(N, sz)
+#endif
+
#define INIT_KMALLOC_INFO(__size, __short_size) \
{ \
.name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
KMALLOC_RCL_NAME(__short_size) \
KMALLOC_CGROUP_NAME(__short_size) \
KMALLOC_DMA_NAME(__short_size) \
+ KMALLOC_RANDOM_NAME(RANDOM_KMALLOC_CACHES_NR, __short_size) \
.size = __size, \
}
@@ -864,10 +895,9 @@ void __init setup_kmalloc_cache_index_table(void)
static unsigned int __kmalloc_minalign(void)
{
-#ifdef CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC
- if (io_tlb_default_mem.nslabs)
+ if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) &&
+ is_swiotlb_allocated())
return ARCH_KMALLOC_MINALIGN;
-#endif
return dma_get_cache_alignment();
}
@@ -890,6 +920,11 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
flags |= SLAB_CACHE_DMA;
}
+#ifdef CONFIG_RANDOM_KMALLOC_CACHES
+ if (type >= KMALLOC_RANDOM_START && type <= KMALLOC_RANDOM_END)
+ flags |= SLAB_NO_MERGE;
+#endif
+
/*
* If CONFIG_MEMCG_KMEM is enabled, disable cache merging for
* KMALLOC_NORMAL caches.
@@ -941,6 +976,9 @@ void __init create_kmalloc_caches(slab_flags_t flags)
new_kmalloc_cache(2, type, flags);
}
}
+#ifdef CONFIG_RANDOM_KMALLOC_CACHES
+ random_kmalloc_seed = get_random_u64();
+#endif
/* Kmalloc array is now usable */
slab_state = UP;
@@ -976,7 +1014,7 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller
return ret;
}
- s = kmalloc_slab(size, flags);
+ s = kmalloc_slab(size, flags, caller);
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
diff --git a/mm/slub.c b/mm/slub.c
index e3b5d5c0eb3a..f7940048138c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -361,43 +361,51 @@ static struct workqueue_struct *flushwq;
*******************************************************************/
/*
+ * freeptr_t represents a SLUB freelist pointer, which might be encoded
+ * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
+ */
+typedef struct { unsigned long v; } freeptr_t;
+
+/*
* Returns freelist pointer (ptr). With hardening, this is obfuscated
* with an XOR of the address where the pointer is held and a per-cache
* random number.
*/
-static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
- unsigned long ptr_addr)
+static inline freeptr_t freelist_ptr_encode(const struct kmem_cache *s,
+ void *ptr, unsigned long ptr_addr)
{
+ unsigned long encoded;
+
#ifdef CONFIG_SLAB_FREELIST_HARDENED
- /*
- * When CONFIG_KASAN_SW/HW_TAGS is enabled, ptr_addr might be tagged.
- * Normally, this doesn't cause any issues, as both set_freepointer()
- * and get_freepointer() are called with a pointer with the same tag.
- * However, there are some issues with CONFIG_SLUB_DEBUG code. For
- * example, when __free_slub() iterates over objects in a cache, it
- * passes untagged pointers to check_object(). check_object() in turns
- * calls get_freepointer() with an untagged pointer, which causes the
- * freepointer to be restored incorrectly.
- */
- return (void *)((unsigned long)ptr ^ s->random ^
- swab((unsigned long)kasan_reset_tag((void *)ptr_addr)));
+ encoded = (unsigned long)ptr ^ s->random ^ swab(ptr_addr);
#else
- return ptr;
+ encoded = (unsigned long)ptr;
#endif
+ return (freeptr_t){.v = encoded};
}
-/* Returns the freelist pointer recorded at location ptr_addr. */
-static inline void *freelist_dereference(const struct kmem_cache *s,
- void *ptr_addr)
+static inline void *freelist_ptr_decode(const struct kmem_cache *s,
+ freeptr_t ptr, unsigned long ptr_addr)
{
- return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr),
- (unsigned long)ptr_addr);
+ void *decoded;
+
+#ifdef CONFIG_SLAB_FREELIST_HARDENED
+ decoded = (void *)(ptr.v ^ s->random ^ swab(ptr_addr));
+#else
+ decoded = (void *)ptr.v;
+#endif
+ return decoded;
}
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
+ unsigned long ptr_addr;
+ freeptr_t p;
+
object = kasan_reset_tag(object);
- return freelist_dereference(s, object + s->offset);
+ ptr_addr = (unsigned long)object + s->offset;
+ p = *(freeptr_t *)(ptr_addr);
+ return freelist_ptr_decode(s, p, ptr_addr);
}
#ifndef CONFIG_SLUB_TINY
@@ -421,15 +429,15 @@ __no_kmsan_checks
static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
unsigned long freepointer_addr;
- void *p;
+ freeptr_t p;
if (!debug_pagealloc_enabled_static())
return get_freepointer(s, object);
object = kasan_reset_tag(object);
freepointer_addr = (unsigned long)object + s->offset;
- copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p));
- return freelist_ptr(s, p, freepointer_addr);
+ copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p));
+ return freelist_ptr_decode(s, p, freepointer_addr);
}
static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
@@ -441,7 +449,7 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
#endif
freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
- *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
+ *(freeptr_t *)freeptr_addr = freelist_ptr_encode(s, fp, freeptr_addr);
}
/* Loop over all objects in a slab */
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a044a130405b..a2cbe44c48e1 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -358,6 +358,7 @@ int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
return 0;
}
+#ifndef vmemmap_populate_compound_pages
/*
* For compound pages bigger than section size (e.g. x86 1G compound
* pages with 2M subsection size) fill the rest of sections as tail
@@ -446,6 +447,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
return 0;
}
+#endif
+
struct page * __meminit __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap)
diff --git a/mm/sparse.c b/mm/sparse.c
index 297a8b772e8d..77d91e565045 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -172,8 +172,7 @@ static void __section_mark_present(struct mem_section *ms,
#define for_each_present_section_nr(start, section_nr) \
for (section_nr = next_present_section_nr(start-1); \
- ((section_nr != -1) && \
- (section_nr <= __highest_present_section_nr)); \
+ section_nr != -1; \
section_nr = next_present_section_nr(section_nr))
static inline unsigned long first_present_section_nr(void)
diff --git a/mm/swap.h b/mm/swap.h
index 7c033d793f15..8a3c7a0ace4f 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -46,7 +46,6 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma,
unsigned long addr,
- bool do_poll,
struct swap_iocb **plug);
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index f8ea7015bad4..b3b14bd0dd64 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -63,9 +63,8 @@ static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
void show_swap_cache_info(void)
{
printk("%lu pages in swap cache\n", total_swapcache_pages());
- printk("Free swap = %ldkB\n",
- get_nr_swap_pages() << (PAGE_SHIFT - 10));
- printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
+ printk("Free swap = %ldkB\n", K(get_nr_swap_pages()));
+ printk("Total swap = %lukB\n", K(total_swap_pages));
}
void *get_shadow_from_swap_cache(swp_entry_t entry)
@@ -101,6 +100,7 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
folio_ref_add(folio, nr);
folio_set_swapcache(folio);
+ folio->swap = entry;
do {
xas_lock_irq(&xas);
@@ -114,7 +114,6 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
if (shadowp)
*shadowp = old;
}
- set_page_private(folio_page(folio, i), entry.val + i);
xas_store(&xas, folio);
xas_next(&xas);
}
@@ -155,9 +154,9 @@ void __delete_from_swap_cache(struct folio *folio,
for (i = 0; i < nr; i++) {
void *entry = xas_store(&xas, shadow);
VM_BUG_ON_PAGE(entry != folio, entry);
- set_page_private(folio_page(folio, i), 0);
xas_next(&xas);
}
+ folio->swap.val = 0;
folio_clear_swapcache(folio);
address_space->nrpages -= nr;
__node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
@@ -233,7 +232,7 @@ fail:
*/
void delete_from_swap_cache(struct folio *folio)
{
- swp_entry_t entry = folio_swap_entry(folio);
+ swp_entry_t entry = folio->swap;
struct address_space *address_space = swap_address_space(entry);
xa_lock_irq(&address_space->i_pages);
@@ -527,15 +526,14 @@ fail_put_swap:
*/
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma,
- unsigned long addr, bool do_poll,
- struct swap_iocb **plug)
+ unsigned long addr, struct swap_iocb **plug)
{
bool page_was_allocated;
struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
vma, addr, &page_was_allocated);
if (page_was_allocated)
- swap_readpage(retpage, do_poll, plug);
+ swap_readpage(retpage, false, plug);
return retpage;
}
@@ -630,7 +628,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct swap_info_struct *si = swp_swap_info(entry);
struct blk_plug plug;
struct swap_iocb *splug = NULL;
- bool do_poll = true, page_allocated;
+ bool page_allocated;
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address;
@@ -638,7 +636,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
if (!mask)
goto skip;
- do_poll = false;
/* Read a page_cluster sized and aligned cluster around offset. */
start_offset = offset & ~mask;
end_offset = offset | mask;
@@ -670,7 +667,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
lru_add_drain(); /* Push any new pages onto the LRU now */
skip:
/* The page was likely read above, so no need for plugging here */
- return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll, NULL);
+ return read_swap_cache_async(entry, gfp_mask, vma, addr, NULL);
}
int init_swap_address_space(unsigned int type, unsigned long nr_pages)
@@ -838,7 +835,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
skip:
/* The page was likely read above, so no need for plugging here */
return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
- ra_info.win == 1, NULL);
+ NULL);
}
/**
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8e6dde68b389..e52f486834eb 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -35,17 +35,18 @@
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
-#include <linux/frontswap.h>
#include <linux/swapfile.h>
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>
#include <linux/completion.h>
#include <linux/suspend.h>
+#include <linux/zswap.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
+#include "internal.h"
#include "swap.h"
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
@@ -95,7 +96,7 @@ static PLIST_HEAD(swap_active_head);
static struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);
-struct swap_info_struct *swap_info[MAX_SWAPFILES];
+static struct swap_info_struct *swap_info[MAX_SWAPFILES];
static DEFINE_MUTEX(swapon_mutex);
@@ -714,10 +715,8 @@ static void add_to_avail_list(struct swap_info_struct *p)
int nid;
spin_lock(&swap_avail_lock);
- for_each_node(nid) {
- WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
+ for_each_node(nid)
plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
- }
spin_unlock(&swap_avail_lock);
}
@@ -746,7 +745,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
swap_slot_free_notify = NULL;
while (offset <= end) {
arch_swap_invalidate_page(si->type, offset);
- frontswap_invalidate_page(si->type, offset);
+ zswap_invalidate(si->type, offset);
if (swap_slot_free_notify)
swap_slot_free_notify(si->bdev, offset);
offset++;
@@ -1537,7 +1536,7 @@ unlock_out:
static bool folio_swapped(struct folio *folio)
{
- swp_entry_t entry = folio_swap_entry(folio);
+ swp_entry_t entry = folio->swap;
struct swap_info_struct *si = _swap_info_get(entry);
if (!si)
@@ -1746,7 +1745,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
struct page *swapcache;
spinlock_t *ptl;
pte_t *pte, new_pte, old_pte;
- bool hwposioned = false;
+ bool hwpoisoned = PageHWPoison(page);
int ret = 1;
swapcache = page;
@@ -1754,7 +1753,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
if (unlikely(!page))
return -ENOMEM;
else if (unlikely(PTR_ERR(page) == -EHWPOISON))
- hwposioned = true;
+ hwpoisoned = true;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte),
@@ -1765,21 +1764,28 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
old_pte = ptep_get(pte);
- if (unlikely(hwposioned || !PageUptodate(page))) {
+ if (unlikely(hwpoisoned || !PageUptodate(page))) {
swp_entry_t swp_entry;
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
- if (hwposioned) {
+ if (hwpoisoned) {
swp_entry = make_hwpoison_entry(swapcache);
page = swapcache;
} else {
- swp_entry = make_swapin_error_entry();
+ swp_entry = make_poisoned_swp_entry();
}
new_pte = swp_entry_to_pte(swp_entry);
ret = 0;
goto setpte;
}
+ /*
+ * Some architectures may have to restore extra metadata to the page
+ * when reading from swap. This metadata may be indexed by swap entry
+ * so this must be called before swap_free().
+ */
+ arch_swap_restore(entry, page_folio(page));
+
/* See do_swap_page() */
BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
BUG_ON(PageAnon(page) && PageAnonExclusive(page));
@@ -2330,16 +2336,18 @@ static void _enable_swap_info(struct swap_info_struct *p)
* swap_info_struct.
*/
plist_add(&p->list, &swap_active_head);
- add_to_avail_list(p);
+
+ /* add to available list iff swap device is not full */
+ if (p->highest_bit)
+ add_to_avail_list(p);
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
- struct swap_cluster_info *cluster_info,
- unsigned long *frontswap_map)
+ struct swap_cluster_info *cluster_info)
{
- if (IS_ENABLED(CONFIG_FRONTSWAP))
- frontswap_init(p->type, frontswap_map);
+ zswap_swapon(p->type);
+
spin_lock(&swap_lock);
spin_lock(&p->lock);
setup_swap_info(p, prio, swap_map, cluster_info);
@@ -2382,7 +2390,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct swap_info_struct *p = NULL;
unsigned char *swap_map;
struct swap_cluster_info *cluster_info;
- unsigned long *frontswap_map;
struct file *swap_file, *victim;
struct address_space *mapping;
struct inode *inode;
@@ -2507,12 +2514,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->swap_map = NULL;
cluster_info = p->cluster_info;
p->cluster_info = NULL;
- frontswap_map = frontswap_map_get(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
arch_swap_invalidate_area(p->type);
- frontswap_invalidate_area(p->type);
- frontswap_map_set(p, NULL);
+ zswap_swapoff(p->type);
mutex_unlock(&swapon_mutex);
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
@@ -2520,7 +2525,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->cluster_next_cpu = NULL;
vfree(swap_map);
kvfree(cluster_info);
- kvfree(frontswap_map);
/* Destroy swap account information */
swap_cgroup_swapoff(p->type);
exit_swap_address_space(p->type);
@@ -2632,8 +2636,8 @@ static int swap_show(struct seq_file *swap, void *v)
return 0;
}
- bytes = si->pages << (PAGE_SHIFT - 10);
- inuse = READ_ONCE(si->inuse_pages) << (PAGE_SHIFT - 10);
+ bytes = K(si->pages);
+ inuse = K(READ_ONCE(si->inuse_pages));
file = si->swap_file;
len = seq_file_path(swap, file, " \t\n\\");
@@ -2858,8 +2862,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
}
if (last_page > maxpages) {
pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
- maxpages << (PAGE_SHIFT - 10),
- last_page << (PAGE_SHIFT - 10));
+ K(maxpages), K(last_page));
}
if (maxpages > last_page) {
maxpages = last_page + 1;
@@ -2987,7 +2990,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
unsigned long maxpages;
unsigned char *swap_map = NULL;
struct swap_cluster_info *cluster_info = NULL;
- unsigned long *frontswap_map = NULL;
struct page *page = NULL;
struct inode *inode = NULL;
bool inced_nr_rotate_swap = false;
@@ -3127,11 +3129,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = nr_extents;
goto bad_swap_unlock_inode;
}
- /* frontswap enabled? set up bit-per-page map for frontswap */
- if (IS_ENABLED(CONFIG_FRONTSWAP))
- frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages),
- sizeof(long),
- GFP_KERNEL);
if ((swap_flags & SWAP_FLAG_DISCARD) &&
p->bdev && bdev_max_discard_sectors(p->bdev)) {
@@ -3184,16 +3181,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (swap_flags & SWAP_FLAG_PREFER)
prio =
(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
- enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
+ enable_swap_info(p, prio, swap_map, cluster_info);
- pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
- p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
- nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
+ pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n",
+ K(p->pages), name->name, p->prio, nr_extents,
+ K((unsigned long long)span),
(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
(p->flags & SWP_DISCARDABLE) ? "D" : "",
(p->flags & SWP_AREA_DISCARD) ? "s" : "",
- (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
- (frontswap_map) ? "FS" : "");
+ (p->flags & SWP_PAGE_DISCARD) ? "c" : "");
mutex_unlock(&swapon_mutex);
atomic_inc(&proc_poll_event);
@@ -3223,7 +3219,6 @@ bad_swap:
spin_unlock(&swap_lock);
vfree(swap_map);
kvfree(cluster_info);
- kvfree(frontswap_map);
if (inced_nr_rotate_swap)
atomic_dec(&nr_rotate_swap);
if (swap_file)
@@ -3374,7 +3369,7 @@ struct swap_info_struct *swp_swap_info(swp_entry_t entry)
struct swap_info_struct *page_swap_info(struct page *page)
{
- swp_entry_t entry = { .val = page_private(page) };
+ swp_entry_t entry = page_swap_entry(page);
return swp_swap_info(entry);
}
@@ -3389,7 +3384,7 @@ EXPORT_SYMBOL_GPL(swapcache_mapping);
pgoff_t __page_file_index(struct page *page)
{
- swp_entry_t swap = { .val = page_private(page) };
+ swp_entry_t swap = page_swap_entry(page);
return swp_offset(swap);
}
EXPORT_SYMBOL_GPL(__page_file_index);
diff --git a/mm/truncate.c b/mm/truncate.c
index 95d1291d269b..8e3aa9e8618e 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -19,7 +19,6 @@
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
-#include <linux/buffer_head.h> /* grr. try_to_release_page */
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include "internal.h"
@@ -276,7 +275,7 @@ static long mapping_evict_folio(struct address_space *mapping,
if (folio_ref_count(folio) >
folio_nr_pages(folio) + folio_has_private(folio) + 1)
return 0;
- if (folio_has_private(folio) && !filemap_release_folio(folio, 0))
+ if (!filemap_release_folio(folio, 0))
return 0;
return remove_mapping(mapping, folio);
@@ -378,7 +377,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
if (!IS_ERR(folio)) {
same_folio = lend < folio_pos(folio) + folio_size(folio);
if (!truncate_inode_partial_folio(folio, lstart, lend)) {
- start = folio->index + folio_nr_pages(folio);
+ start = folio_next_index(folio);
if (same_folio)
end = folio->index;
}
@@ -573,8 +572,7 @@ static int invalidate_complete_folio2(struct address_space *mapping,
if (folio->mapping != mapping)
return 0;
- if (folio_has_private(folio) &&
- !filemap_release_folio(folio, GFP_KERNEL))
+ if (!filemap_release_folio(folio, GFP_KERNEL))
return 0;
spin_lock(&mapping->host->i_lock);
@@ -657,11 +655,11 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
}
folio_lock(folio);
- VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
- if (folio->mapping != mapping) {
+ if (unlikely(folio->mapping != mapping)) {
folio_unlock(folio);
continue;
}
+ VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
folio_wait_writeback(folio);
if (folio_mapped(folio))
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index b322ac54ea20..96d9eae5c7cc 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -45,6 +45,22 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
return dst_vma;
}
+/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
+static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
+ unsigned long dst_addr)
+{
+ struct inode *inode;
+ pgoff_t offset, max_off;
+
+ if (!dst_vma->vm_file)
+ return false;
+
+ inode = dst_vma->vm_file->f_inode;
+ offset = linear_page_index(dst_vma, dst_addr);
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ return offset >= max_off;
+}
+
/*
* Install PTEs, to map dst_addr (within dst_vma) to page.
*
@@ -64,8 +80,6 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
bool page_in_cache = page_mapping(page);
spinlock_t *ptl;
struct folio *folio;
- struct inode *inode;
- pgoff_t offset, max_off;
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
_dst_pte = pte_mkdirty(_dst_pte);
@@ -81,14 +95,9 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
if (!dst_pte)
goto out;
- if (vma_is_shmem(dst_vma)) {
- /* serialize against truncate with the page table lock */
- inode = dst_vma->vm_file->f_inode;
- offset = linear_page_index(dst_vma, dst_addr);
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (mfill_file_over_size(dst_vma, dst_addr)) {
ret = -EFAULT;
- if (unlikely(offset >= max_off))
- goto out_unlock;
+ goto out_unlock;
}
ret = -EEXIST;
@@ -211,8 +220,6 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
int ret;
- pgoff_t offset, max_off;
- struct inode *inode;
_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
dst_vma->vm_page_prot));
@@ -220,14 +227,9 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
if (!dst_pte)
goto out;
- if (dst_vma->vm_file) {
- /* the shmem MAP_PRIVATE case requires checking the i_size */
- inode = dst_vma->vm_file->f_inode;
- offset = linear_page_index(dst_vma, dst_addr);
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (mfill_file_over_size(dst_vma, dst_addr)) {
ret = -EFAULT;
- if (unlikely(offset >= max_off))
- goto out_unlock;
+ goto out_unlock;
}
ret = -EEXIST;
if (!pte_none(ptep_get(dst_pte)))
@@ -286,6 +288,44 @@ out_release:
goto out;
}
+/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
+static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ uffd_flags_t flags)
+{
+ int ret;
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ pte_t _dst_pte, *dst_pte;
+ spinlock_t *ptl;
+
+ _dst_pte = make_pte_marker(PTE_MARKER_POISONED);
+ ret = -EAGAIN;
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ if (!dst_pte)
+ goto out;
+
+ if (mfill_file_over_size(dst_vma, dst_addr)) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ ret = -EEXIST;
+ /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
+ if (!pte_none(*dst_pte))
+ goto out_unlock;
+
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+ ret = 0;
+out_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+out:
+ return ret;
+}
+
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
pgd_t *pgd;
@@ -481,6 +521,9 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
return mfill_atomic_pte_continue(dst_pmd, dst_vma,
dst_addr, flags);
+ } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
+ return mfill_atomic_pte_poison(dst_pmd, dst_vma,
+ dst_addr, flags);
}
/*
@@ -702,6 +745,14 @@ ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}
+ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, atomic_t *mmap_changing,
+ uffd_flags_t flags)
+{
+ return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+ uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
+}
+
long uffd_wp_range(struct vm_area_struct *dst_vma,
unsigned long start, unsigned long len, bool enable_wp)
{
diff --git a/mm/util.c b/mm/util.c
index 8e7fc6cacab4..f08b655da917 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -396,7 +396,10 @@ static int mmap_is_legacy(struct rlimit *rlim_stack)
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
- if (rlim_stack->rlim_cur == RLIM_INFINITY)
+ /* On parisc the stack always grows up - so a unlimited stack should
+ * not be an indicator to use the legacy memory layout. */
+ if (rlim_stack->rlim_cur == RLIM_INFINITY &&
+ !IS_ENABLED(CONFIG_STACK_GROWSUP))
return 1;
return sysctl_legacy_va_layout;
@@ -734,12 +737,6 @@ void *vcalloc(size_t n, size_t size)
}
EXPORT_SYMBOL(vcalloc);
-/* Neutral page->mapping pointer to address_space or anon_vma or other */
-void *page_rmapping(struct page *page)
-{
- return folio_raw_mapping(page_folio(page));
-}
-
struct anon_vma *folio_anon_vma(struct folio *folio)
{
unsigned long mapping = (unsigned long)folio->mapping;
@@ -770,7 +767,7 @@ struct address_space *folio_mapping(struct folio *folio)
return NULL;
if (unlikely(folio_test_swapcache(folio)))
- return swap_address_space(folio_swap_entry(folio));
+ return swap_address_space(folio->swap);
mapping = folio->mapping;
if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
@@ -1125,7 +1122,7 @@ void page_offline_end(void)
}
EXPORT_SYMBOL(page_offline_end);
-#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO
+#ifndef flush_dcache_folio
void flush_dcache_folio(struct folio *folio)
{
long i, nr = folio_nr_pages(folio);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 93cf99aba335..228a4a5312f2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2979,6 +2979,10 @@ void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
free_vm_area(area);
return NULL;
}
+
+ flush_cache_vmap((unsigned long)area->addr,
+ (unsigned long)area->addr + count * PAGE_SIZE);
+
return area->addr;
}
EXPORT_SYMBOL_GPL(vmap_pfn);
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index b52644771cc4..22c6689d9302 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -244,6 +244,14 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
if (mem_cgroup_disabled())
return;
+ /*
+ * The in-kernel users only care about the reclaim efficiency
+ * for this @memcg rather than the whole subtree, and there
+ * isn't and won't be any in-kernel user in a legacy cgroup.
+ */
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !tree)
+ return;
+
vmpr = memcg_to_vmpressure(memcg);
/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1080209a568b..6f13394b112e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1423,7 +1423,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
}
if (folio_test_swapcache(folio)) {
- swp_entry_t swap = folio_swap_entry(folio);
+ swp_entry_t swap = folio->swap;
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
@@ -2064,7 +2064,7 @@ retry:
* (refcount == 1) it can be freed. Otherwise, leave
* the folio on the LRU so it is swappable.
*/
- if (folio_has_private(folio)) {
+ if (folio_needs_release(folio)) {
if (!filemap_release_folio(folio, sc->gfp_mask))
goto activate_locked;
if (!mapping && folio_ref_count(folio) == 1) {
@@ -2729,9 +2729,9 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
if (unlikely(buffer_heads_over_limit)) {
- if (folio_test_private(folio) && folio_trylock(folio)) {
- if (folio_test_private(folio))
- filemap_release_folio(folio, 0);
+ if (folio_needs_release(folio) &&
+ folio_trylock(folio)) {
+ filemap_release_folio(folio, 0);
folio_unlock(folio);
}
}
@@ -4284,6 +4284,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
static const struct mm_walk_ops mm_walk_ops = {
.test_walk = should_skip_vma,
.p4d_entry = walk_pud_range,
+ .walk_lock = PGWALK_RDLOCK,
};
int err;
@@ -4439,7 +4440,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
int prev, next;
int type, zone;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
-
+restart:
spin_lock_irq(&lruvec->lru_lock);
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
@@ -4450,11 +4451,12 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
- while (!inc_min_seq(lruvec, type, can_swap)) {
- spin_unlock_irq(&lruvec->lru_lock);
- cond_resched();
- spin_lock_irq(&lruvec->lru_lock);
- }
+ if (inc_min_seq(lruvec, type, can_swap))
+ continue;
+
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ goto restart;
}
/*
@@ -4655,6 +4657,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
pte_t *pte = pvmw->pte;
unsigned long addr = pvmw->address;
struct folio *folio = pfn_folio(pvmw->pfn);
+ bool can_swap = !folio_is_file_lru(folio);
struct mem_cgroup *memcg = folio_memcg(folio);
struct pglist_data *pgdat = folio_pgdat(folio);
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
@@ -4703,7 +4706,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
if (!pte_young(ptent))
continue;
- folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap);
+ folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
if (!folio)
continue;
@@ -4853,16 +4856,17 @@ void lru_gen_release_memcg(struct mem_cgroup *memcg)
spin_lock_irq(&pgdat->memcg_lru.lock);
- VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+ if (hlist_nulls_unhashed(&lruvec->lrugen.list))
+ goto unlock;
gen = lruvec->lrugen.gen;
- hlist_nulls_del_rcu(&lruvec->lrugen.list);
+ hlist_nulls_del_init_rcu(&lruvec->lrugen.list);
pgdat->memcg_lru.nr_memcgs[gen]--;
if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
-
+unlock:
spin_unlock_irq(&pgdat->memcg_lru.lock);
}
}
@@ -4889,7 +4893,8 @@ static int lru_gen_memcg_seg(struct lruvec *lruvec)
* the eviction
******************************************************************************/
-static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
+static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc,
+ int tier_idx)
{
bool success;
int gen = folio_lru_gen(folio);
@@ -4939,6 +4944,13 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
return true;
}
+ /* ineligible */
+ if (zone > sc->reclaim_idx || skip_cma(folio, sc)) {
+ gen = folio_inc_gen(lruvec, folio, false);
+ list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
+ return true;
+ }
+
/* waiting for writeback */
if (folio_test_locked(folio) || folio_test_writeback(folio) ||
(type == LRU_GEN_FILE && folio_test_dirty(folio))) {
@@ -4987,7 +4999,8 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
int type, int tier, struct list_head *list)
{
- int gen, zone;
+ int i;
+ int gen;
enum vm_event_item item;
int sorted = 0;
int scanned = 0;
@@ -5003,9 +5016,10 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
gen = lru_gen_from_seq(lrugen->min_seq[type]);
- for (zone = sc->reclaim_idx; zone >= 0; zone--) {
+ for (i = MAX_NR_ZONES; i > 0; i--) {
LIST_HEAD(moved);
int skipped = 0;
+ int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
struct list_head *head = &lrugen->folios[gen][type][zone];
while (!list_empty(head)) {
@@ -5019,7 +5033,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
scanned += delta;
- if (sort_folio(lruvec, folio, tier))
+ if (sort_folio(lruvec, folio, sc, tier))
sorted += delta;
else if (isolate_folio(lruvec, folio, sc)) {
list_add(&folio->lru, list);
@@ -5434,8 +5448,10 @@ restart:
rcu_read_lock();
hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
- if (op)
+ if (op) {
lru_gen_rotate_memcg(lruvec, op);
+ op = 0;
+ }
mem_cgroup_put(memcg);
@@ -5443,7 +5459,7 @@ restart:
memcg = lruvec_memcg(lruvec);
if (!mem_cgroup_tryget(memcg)) {
- op = 0;
+ lru_gen_release_memcg(memcg);
memcg = NULL;
continue;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b731d57996c5..00e81e99c6ee 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -26,7 +26,6 @@
#include <linux/writeback.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
-#include <linux/page_ext.h>
#include <linux/page_owner.h>
#include <linux/sched/isolation.h>
diff --git a/mm/workingset.c b/mm/workingset.c
index 4686ae363000..da58a26d0d4d 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -664,6 +664,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
struct lruvec *lruvec;
int i;
+ mem_cgroup_flush_stats();
lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
pages += lruvec_page_state_local(lruvec,
diff --git a/mm/z3fold.c b/mm/z3fold.c
index e84de91ecccb..7c76b396b74c 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -133,8 +133,6 @@ struct z3fold_header {
* @stale: list of pages marked for freeing
* @pages_nr: number of z3fold pages in the pool.
* @c_handle: cache for z3fold_buddy_slots allocation
- * @zpool: zpool driver
- * @zpool_ops: zpool operations structure with an evict callback
* @compact_wq: workqueue for page layout background optimization
* @release_wq: workqueue for safe page release
* @work: work_struct for safe page release
@@ -480,6 +478,16 @@ static void release_z3fold_page_locked_list(struct kref *ref)
__release_z3fold_page(zhdr, true);
}
+static inline int put_z3fold_locked(struct z3fold_header *zhdr)
+{
+ return kref_put(&zhdr->refcount, release_z3fold_page_locked);
+}
+
+static inline int put_z3fold_locked_list(struct z3fold_header *zhdr)
+{
+ return kref_put(&zhdr->refcount, release_z3fold_page_locked_list);
+}
+
static void free_pages_work(struct work_struct *w)
{
struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);
@@ -666,7 +674,7 @@ static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
return new_zhdr;
out_fail:
- if (new_zhdr && !kref_put(&new_zhdr->refcount, release_z3fold_page_locked)) {
+ if (new_zhdr && !put_z3fold_locked(new_zhdr)) {
add_to_unbuddied(pool, new_zhdr);
z3fold_page_unlock(new_zhdr);
}
@@ -741,7 +749,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
list_del_init(&zhdr->buddy);
spin_unlock(&pool->lock);
- if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
+ if (put_z3fold_locked(zhdr))
return;
if (test_bit(PAGE_STALE, &page->private) ||
@@ -752,7 +760,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
if (!zhdr->foreign_handles && buddy_single(zhdr) &&
zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
- if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
+ if (!put_z3fold_locked(zhdr)) {
clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
}
@@ -878,7 +886,7 @@ lookup:
return zhdr;
out_fail:
- if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
+ if (!put_z3fold_locked(zhdr)) {
add_to_unbuddied(pool, zhdr);
z3fold_page_unlock(zhdr);
}
@@ -1012,8 +1020,7 @@ retry:
if (zhdr) {
bud = get_free_buddy(zhdr, chunks);
if (bud == HEADLESS) {
- if (!kref_put(&zhdr->refcount,
- release_z3fold_page_locked))
+ if (!put_z3fold_locked(zhdr))
z3fold_page_unlock(zhdr);
pr_err("No free chunks in unbuddied\n");
WARN_ON(1);
@@ -1129,7 +1136,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
if (!page_claimed)
free_handle(handle, zhdr);
- if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list))
+ if (put_z3fold_locked_list(zhdr))
return;
if (page_claimed) {
/* the page has not been claimed by us */
@@ -1346,7 +1353,7 @@ static void z3fold_page_putback(struct page *page)
if (!list_empty(&zhdr->buddy))
list_del_init(&zhdr->buddy);
INIT_LIST_HEAD(&page->lru);
- if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
+ if (put_z3fold_locked(zhdr))
return;
if (list_empty(&zhdr->buddy))
add_to_unbuddied(pool, zhdr);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 3f057970504e..b58f957429f0 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -795,8 +795,8 @@ static unsigned long handle_to_obj(unsigned long handle)
return *(unsigned long *)handle;
}
-static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle,
- int tag)
+static inline bool obj_allocated(struct page *page, void *obj,
+ unsigned long *phandle)
{
unsigned long handle;
struct zspage *zspage = get_zspage(page);
@@ -807,7 +807,7 @@ static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle,
} else
handle = *(unsigned long *)obj;
- if (!(handle & tag))
+ if (!(handle & OBJ_ALLOCATED_TAG))
return false;
/* Clear all tags before returning the handle */
@@ -815,11 +815,6 @@ static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle,
return true;
}
-static inline bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
-{
- return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG);
-}
-
static void reset_page(struct page *page)
{
__ClearPageMovable(page);
@@ -1147,6 +1142,11 @@ static bool zspage_full(struct size_class *class, struct zspage *zspage)
return get_zspage_inuse(zspage) == class->objs_per_zspage;
}
+static bool zspage_empty(struct zspage *zspage)
+{
+ return get_zspage_inuse(zspage) == 0;
+}
+
/**
* zs_lookup_class_index() - Returns index of the zsmalloc &size_class
* that hold objects of the provided size.
@@ -1546,11 +1546,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
}
/*
- * Find object with a certain tag in zspage from index object and
+ * Find alloced object in zspage from index object and
* return handle.
*/
-static unsigned long find_tagged_obj(struct size_class *class,
- struct page *page, int *obj_idx, int tag)
+static unsigned long find_alloced_obj(struct size_class *class,
+ struct page *page, int *obj_idx)
{
unsigned int offset;
int index = *obj_idx;
@@ -1561,7 +1561,7 @@ static unsigned long find_tagged_obj(struct size_class *class,
offset += class->size * index;
while (offset < PAGE_SIZE) {
- if (obj_tagged(page, addr + offset, &handle, tag))
+ if (obj_allocated(page, addr + offset, &handle))
break;
offset += class->size;
@@ -1575,35 +1575,14 @@ static unsigned long find_tagged_obj(struct size_class *class,
return handle;
}
-/*
- * Find alloced object in zspage from index object and
- * return handle.
- */
-static unsigned long find_alloced_obj(struct size_class *class,
- struct page *page, int *obj_idx)
-{
- return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG);
-}
-
-struct zs_compact_control {
- /* Source spage for migration which could be a subpage of zspage */
- struct page *s_page;
- /* Destination page for migration which should be a first page
- * of zspage. */
- struct page *d_page;
- /* Starting object index within @s_page which used for live object
- * in the subpage. */
- int obj_idx;
-};
-
-static void migrate_zspage(struct zs_pool *pool, struct size_class *class,
- struct zs_compact_control *cc)
+static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage,
+ struct zspage *dst_zspage)
{
unsigned long used_obj, free_obj;
unsigned long handle;
- struct page *s_page = cc->s_page;
- struct page *d_page = cc->d_page;
- int obj_idx = cc->obj_idx;
+ int obj_idx = 0;
+ struct page *s_page = get_first_page(src_zspage);
+ struct size_class *class = pool->size_class[src_zspage->class];
while (1) {
handle = find_alloced_obj(class, s_page, &obj_idx);
@@ -1615,21 +1594,21 @@ static void migrate_zspage(struct zs_pool *pool, struct size_class *class,
continue;
}
- /* Stop if there is no more space */
- if (zspage_full(class, get_zspage(d_page)))
- break;
-
used_obj = handle_to_obj(handle);
- free_obj = obj_malloc(pool, get_zspage(d_page), handle);
+ free_obj = obj_malloc(pool, dst_zspage, handle);
zs_object_copy(class, free_obj, used_obj);
obj_idx++;
record_obj(handle, free_obj);
obj_free(class->size, used_obj);
- }
- /* Remember last position in this iteration */
- cc->s_page = s_page;
- cc->obj_idx = obj_idx;
+ /* Stop if there is no more space */
+ if (zspage_full(class, dst_zspage))
+ break;
+
+ /* Stop if there are no more objects to migrate */
+ if (zspage_empty(src_zspage))
+ break;
+ }
}
static struct zspage *isolate_src_zspage(struct size_class *class)
@@ -1798,6 +1777,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage,
static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
{
+ struct zs_pool *pool;
struct zspage *zspage;
/*
@@ -1807,9 +1787,10 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
VM_BUG_ON_PAGE(PageIsolated(page), page);
zspage = get_zspage(page);
- migrate_write_lock(zspage);
+ pool = zspage->pool;
+ spin_lock(&pool->lock);
inc_zspage_isolation(zspage);
- migrate_write_unlock(zspage);
+ spin_unlock(&pool->lock);
return true;
}
@@ -1875,12 +1856,12 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
kunmap_atomic(s_addr);
replace_sub_page(class, zspage, newpage, page);
+ dec_zspage_isolation(zspage);
/*
* Since we complete the data copy and set up new zspage structure,
* it's okay to release the pool's lock.
*/
spin_unlock(&pool->lock);
- dec_zspage_isolation(zspage);
migrate_write_unlock(zspage);
get_page(newpage);
@@ -1897,14 +1878,16 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
static void zs_page_putback(struct page *page)
{
+ struct zs_pool *pool;
struct zspage *zspage;
VM_BUG_ON_PAGE(!PageIsolated(page), page);
zspage = get_zspage(page);
- migrate_write_lock(zspage);
+ pool = zspage->pool;
+ spin_lock(&pool->lock);
dec_zspage_isolation(zspage);
- migrate_write_unlock(zspage);
+ spin_unlock(&pool->lock);
}
static const struct movable_operations zsmalloc_mops = {
@@ -2004,7 +1987,6 @@ static unsigned long zs_can_compact(struct size_class *class)
static unsigned long __zs_compact(struct zs_pool *pool,
struct size_class *class)
{
- struct zs_compact_control cc;
struct zspage *src_zspage = NULL;
struct zspage *dst_zspage = NULL;
unsigned long pages_freed = 0;
@@ -2022,7 +2004,6 @@ static unsigned long __zs_compact(struct zs_pool *pool,
if (!dst_zspage)
break;
migrate_write_lock(dst_zspage);
- cc.d_page = get_first_page(dst_zspage);
}
src_zspage = isolate_src_zspage(class);
@@ -2031,9 +2012,7 @@ static unsigned long __zs_compact(struct zs_pool *pool,
migrate_write_lock_nested(src_zspage);
- cc.obj_idx = 0;
- cc.s_page = get_first_page(src_zspage);
- migrate_zspage(pool, class, &cc);
+ migrate_zspage(pool, src_zspage, dst_zspage);
fg = putback_zspage(class, src_zspage);
migrate_write_unlock(src_zspage);
diff --git a/mm/zswap.c b/mm/zswap.c
index 62195f72bf56..412b1409a0d7 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -2,7 +2,7 @@
/*
* zswap.c - zswap driver file
*
- * zswap is a backend for frontswap that takes pages that are in the process
+ * zswap is a cache that takes pages that are in the process
* of being swapped out and attempts to compress and store them in a
* RAM-based memory pool. This can result in a significant I/O reduction on
* the swap device and, in the case where decompressing from RAM is faster
@@ -20,7 +20,6 @@
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
-#include <linux/frontswap.h>
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
@@ -28,7 +27,7 @@
#include <linux/mempool.h>
#include <linux/zpool.h>
#include <crypto/acompress.h>
-
+#include <linux/zswap.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/swapops.h>
@@ -142,6 +141,9 @@ static bool zswap_exclusive_loads_enabled = IS_ENABLED(
CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
+/* Number of zpools in zswap_pool (empirically determined for scalability) */
+#define ZSWAP_NR_ZPOOLS 32
+
/*********************************
* data structures
**********************************/
@@ -161,7 +163,7 @@ struct crypto_acomp_ctx {
* needs to be verified that it's still valid in the tree.
*/
struct zswap_pool {
- struct zpool *zpool;
+ struct zpool *zpools[ZSWAP_NR_ZPOOLS];
struct crypto_acomp_ctx __percpu *acomp_ctx;
struct kref kref;
struct list_head list;
@@ -180,7 +182,7 @@ struct zswap_pool {
* page within zswap.
*
* rbnode - links the entry into red-black tree for the appropriate swap type
- * offset - the swap offset for the entry. Index into the red-black tree.
+ * swpentry - associated swap entry, the offset indexes into the red-black tree
* refcount - the number of outstanding reference to the entry. This is needed
* to protect against premature freeing of the entry by code
* concurrent calls to load, invalidate, and writeback. The lock
@@ -193,6 +195,7 @@ struct zswap_pool {
* pool - the zswap_pool the entry's data is in
* handle - zpool allocation handle that stores the compressed page data
* value - value of the same-value filled pages which have same content
+ * objcg - the obj_cgroup that the compressed memory is charged to
* lru - handle to the pool's lru used to evict pages.
*/
struct zswap_entry {
@@ -248,7 +251,7 @@ static bool zswap_has_pool;
#define zswap_pool_debug(msg, p) \
pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
- zpool_get_type((p)->zpool))
+ zpool_get_type((p)->zpools[0]))
static int zswap_writeback_entry(struct zswap_entry *entry,
struct zswap_tree *tree);
@@ -272,11 +275,13 @@ static void zswap_update_total_size(void)
{
struct zswap_pool *pool;
u64 total = 0;
+ int i;
rcu_read_lock();
list_for_each_entry_rcu(pool, &zswap_pools, list)
- total += zpool_get_total_size(pool->zpool);
+ for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
+ total += zpool_get_total_size(pool->zpools[i]);
rcu_read_unlock();
@@ -365,6 +370,16 @@ static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
return false;
}
+static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
+{
+ int i = 0;
+
+ if (ZSWAP_NR_ZPOOLS > 1)
+ i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
+
+ return entry->pool->zpools[i];
+}
+
/*
* Carries out the common pattern of freeing and entry's zpool allocation,
* freeing the entry itself, and decrementing the number of stored pages.
@@ -381,7 +396,7 @@ static void zswap_free_entry(struct zswap_entry *entry)
spin_lock(&entry->pool->lru_lock);
list_del(&entry->lru);
spin_unlock(&entry->pool->lru_lock);
- zpool_free(entry->pool->zpool, entry->handle);
+ zpool_free(zswap_find_zpool(entry), entry->handle);
zswap_pool_put(entry->pool);
}
zswap_entry_cache_free(entry);
@@ -403,9 +418,9 @@ static void zswap_entry_put(struct zswap_tree *tree,
{
int refcount = --entry->refcount;
- BUG_ON(refcount < 0);
+ WARN_ON_ONCE(refcount < 0);
if (refcount == 0) {
- zswap_rb_erase(&tree->rbroot, entry);
+ WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
zswap_free_entry(entry);
}
}
@@ -590,7 +605,8 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
list_for_each_entry_rcu(pool, &zswap_pools, list) {
if (strcmp(pool->tfm_name, compressor))
continue;
- if (strcmp(zpool_get_type(pool->zpool), type))
+ /* all zpools share the same type */
+ if (strcmp(zpool_get_type(pool->zpools[0]), type))
continue;
/* if we can't get it, it's about to be destroyed */
if (!zswap_pool_get(pool))
@@ -695,6 +711,7 @@ static void shrink_worker(struct work_struct *w)
static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
+ int i;
struct zswap_pool *pool;
char name[38]; /* 'zswap' + 32 char (max) num + \0 */
gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
@@ -715,15 +732,18 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
if (!pool)
return NULL;
- /* unique name for each pool specifically required by zsmalloc */
- snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
+ for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
+ /* unique name for each pool specifically required by zsmalloc */
+ snprintf(name, 38, "zswap%x",
+ atomic_inc_return(&zswap_pools_count));
- pool->zpool = zpool_create_pool(type, name, gfp);
- if (!pool->zpool) {
- pr_err("%s zpool not available\n", type);
- goto error;
+ pool->zpools[i] = zpool_create_pool(type, name, gfp);
+ if (!pool->zpools[i]) {
+ pr_err("%s zpool not available\n", type);
+ goto error;
+ }
}
- pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
+ pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));
strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
@@ -755,8 +775,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
error:
if (pool->acomp_ctx)
free_percpu(pool->acomp_ctx);
- if (pool->zpool)
- zpool_destroy_pool(pool->zpool);
+ while (i--)
+ zpool_destroy_pool(pool->zpools[i]);
kfree(pool);
return NULL;
}
@@ -805,11 +825,14 @@ static struct zswap_pool *__zswap_pool_create_fallback(void)
static void zswap_pool_destroy(struct zswap_pool *pool)
{
+ int i;
+
zswap_pool_debug("destroying", pool);
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
free_percpu(pool->acomp_ctx);
- zpool_destroy_pool(pool->zpool);
+ for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
+ zpool_destroy_pool(pool->zpools[i]);
kfree(pool);
}
@@ -1017,43 +1040,6 @@ static int zswap_enabled_param_set(const char *val,
/*********************************
* writeback code
**********************************/
-/* return enum for zswap_get_swap_cache_page */
-enum zswap_get_swap_ret {
- ZSWAP_SWAPCACHE_NEW,
- ZSWAP_SWAPCACHE_EXIST,
- ZSWAP_SWAPCACHE_FAIL,
-};
-
-/*
- * zswap_get_swap_cache_page
- *
- * This is an adaption of read_swap_cache_async()
- *
- * This function tries to find a page with the given swap entry
- * in the swapper_space address space (the swap cache). If the page
- * is found, it is returned in retpage. Otherwise, a page is allocated,
- * added to the swap cache, and returned in retpage.
- *
- * If success, the swap cache page is returned in retpage
- * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
- * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
- * the new page is added to swapcache and locked
- * Returns ZSWAP_SWAPCACHE_FAIL on error
- */
-static int zswap_get_swap_cache_page(swp_entry_t entry,
- struct page **retpage)
-{
- bool page_was_allocated;
-
- *retpage = __read_swap_cache_async(entry, GFP_KERNEL,
- NULL, 0, &page_was_allocated);
- if (page_was_allocated)
- return ZSWAP_SWAPCACHE_NEW;
- if (!*retpage)
- return ZSWAP_SWAPCACHE_FAIL;
- return ZSWAP_SWAPCACHE_EXIST;
-}
-
/*
* Attempts to free an entry by adding a page to the swap cache,
* decompressing the entry data into the page, and issuing a
@@ -1061,7 +1047,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
*
* This can be thought of as a "resumed writeback" of the page
* to the swap device. We are basically resuming the same swap
- * writeback path that was intercepted with the frontswap_store()
+ * writeback path that was intercepted with the zswap_store()
* in the first place. After the page has been decompressed into
* the swap cache, the compressed version stored by zswap can be
* freed.
@@ -1073,8 +1059,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
struct page *page;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
- struct zpool *pool = entry->pool->zpool;
-
+ struct zpool *pool = zswap_find_zpool(entry);
+ bool page_was_allocated;
u8 *src, *tmp = NULL;
unsigned int dlen;
int ret;
@@ -1089,65 +1075,66 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
}
/* try to allocate swap cache page */
- switch (zswap_get_swap_cache_page(swpentry, &page)) {
- case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
+ page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
+ &page_was_allocated);
+ if (!page) {
ret = -ENOMEM;
goto fail;
+ }
- case ZSWAP_SWAPCACHE_EXIST:
- /* page is already in the swap cache, ignore for now */
+ /* Found an existing page, we raced with load/swapin */
+ if (!page_was_allocated) {
put_page(page);
ret = -EEXIST;
goto fail;
+ }
- case ZSWAP_SWAPCACHE_NEW: /* page is locked */
- /*
- * Having a local reference to the zswap entry doesn't exclude
- * swapping from invalidating and recycling the swap slot. Once
- * the swapcache is secured against concurrent swapping to and
- * from the slot, recheck that the entry is still current before
- * writing.
- */
- spin_lock(&tree->lock);
- if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
- spin_unlock(&tree->lock);
- delete_from_swap_cache(page_folio(page));
- ret = -ENOMEM;
- goto fail;
- }
+ /*
+ * Page is locked, and the swapcache is now secured against
+ * concurrent swapping to and from the slot. Verify that the
+ * swap entry hasn't been invalidated and recycled behind our
+ * backs (our zswap_entry reference doesn't prevent that), to
+ * avoid overwriting a new swap page with old compressed data.
+ */
+ spin_lock(&tree->lock);
+ if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
spin_unlock(&tree->lock);
+ delete_from_swap_cache(page_folio(page));
+ ret = -ENOMEM;
+ goto fail;
+ }
+ spin_unlock(&tree->lock);
- /* decompress */
- acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
- dlen = PAGE_SIZE;
+ /* decompress */
+ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+ dlen = PAGE_SIZE;
- src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
- if (!zpool_can_sleep_mapped(pool)) {
- memcpy(tmp, src, entry->length);
- src = tmp;
- zpool_unmap_handle(pool, entry->handle);
- }
+ src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
+ if (!zpool_can_sleep_mapped(pool)) {
+ memcpy(tmp, src, entry->length);
+ src = tmp;
+ zpool_unmap_handle(pool, entry->handle);
+ }
- mutex_lock(acomp_ctx->mutex);
- sg_init_one(&input, src, entry->length);
- sg_init_table(&output, 1);
- sg_set_page(&output, page, PAGE_SIZE, 0);
- acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
- ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
- dlen = acomp_ctx->req->dlen;
- mutex_unlock(acomp_ctx->mutex);
-
- if (!zpool_can_sleep_mapped(pool))
- kfree(tmp);
- else
- zpool_unmap_handle(pool, entry->handle);
+ mutex_lock(acomp_ctx->mutex);
+ sg_init_one(&input, src, entry->length);
+ sg_init_table(&output, 1);
+ sg_set_page(&output, page, PAGE_SIZE, 0);
+ acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
+ ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
+ dlen = acomp_ctx->req->dlen;
+ mutex_unlock(acomp_ctx->mutex);
- BUG_ON(ret);
- BUG_ON(dlen != PAGE_SIZE);
+ if (!zpool_can_sleep_mapped(pool))
+ kfree(tmp);
+ else
+ zpool_unmap_handle(pool, entry->handle);
- /* page is up to date */
- SetPageUptodate(page);
- }
+ BUG_ON(ret);
+ BUG_ON(dlen != PAGE_SIZE);
+
+ /* page is up to date */
+ SetPageUptodate(page);
/* move it to the tail of the inactive list after end_writeback */
SetPageReclaim(page);
@@ -1158,16 +1145,16 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
zswap_written_back_pages++;
return ret;
+
fail:
if (!zpool_can_sleep_mapped(pool))
kfree(tmp);
/*
- * if we get here due to ZSWAP_SWAPCACHE_EXIST
- * a load may be happening concurrently.
- * it is safe and okay to not free the entry.
- * it is also okay to return !0
- */
+ * If we get here because the page is already in swapcache, a
+ * load may be happening concurrently. It is safe and okay to
+ * not free the entry. It is also okay to return !0.
+ */
return ret;
}
@@ -1201,47 +1188,44 @@ static void zswap_fill_page(void *ptr, unsigned long value)
memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
}
-/*********************************
-* frontswap hooks
-**********************************/
-/* attempts to compress and store an single page */
-static int zswap_frontswap_store(unsigned type, pgoff_t offset,
- struct page *page)
+bool zswap_store(struct folio *folio)
{
+ swp_entry_t swp = folio->swap;
+ int type = swp_type(swp);
+ pgoff_t offset = swp_offset(swp);
+ struct page *page = &folio->page;
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry, *dupentry;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
struct obj_cgroup *objcg = NULL;
struct zswap_pool *pool;
- int ret;
+ struct zpool *zpool;
unsigned int dlen = PAGE_SIZE;
unsigned long handle, value;
char *buf;
u8 *src, *dst;
gfp_t gfp;
+ int ret;
- /* THP isn't supported */
- if (PageTransHuge(page)) {
- ret = -EINVAL;
- goto reject;
- }
+ VM_WARN_ON_ONCE(!folio_test_locked(folio));
+ VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
- if (!zswap_enabled || !tree) {
- ret = -ENODEV;
- goto reject;
- }
+ /* Large folios aren't supported */
+ if (folio_test_large(folio))
+ return false;
+
+ if (!zswap_enabled || !tree)
+ return false;
/*
* XXX: zswap reclaim does not work with cgroups yet. Without a
* cgroup-aware entry LRU, we will push out entries system-wide based on
* local cgroup limits.
*/
- objcg = get_obj_cgroup_from_page(page);
- if (objcg && !obj_cgroup_may_zswap(objcg)) {
- ret = -ENOMEM;
+ objcg = get_obj_cgroup_from_folio(folio);
+ if (objcg && !obj_cgroup_may_zswap(objcg))
goto reject;
- }
/* reclaim space if needed */
if (zswap_is_full()) {
@@ -1251,10 +1235,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
}
if (zswap_pool_reached_full) {
- if (!zswap_can_accept()) {
- ret = -ENOMEM;
+ if (!zswap_can_accept())
goto shrink;
- } else
+ else
zswap_pool_reached_full = false;
}
@@ -1262,7 +1245,6 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
entry = zswap_entry_cache_alloc(GFP_KERNEL);
if (!entry) {
zswap_reject_kmemcache_fail++;
- ret = -ENOMEM;
goto reject;
}
@@ -1279,17 +1261,13 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
kunmap_atomic(src);
}
- if (!zswap_non_same_filled_pages_enabled) {
- ret = -EINVAL;
+ if (!zswap_non_same_filled_pages_enabled)
goto freepage;
- }
/* if entry is successfully added, it keeps the reference */
entry->pool = zswap_pool_current_get();
- if (!entry->pool) {
- ret = -EINVAL;
+ if (!entry->pool)
goto freepage;
- }
/* compress */
acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
@@ -1309,25 +1287,24 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
* synchronous in fact.
* Theoretically, acomp supports users send multiple acomp requests in one
* acomp instance, then get those requests done simultaneously. but in this
- * case, frontswap actually does store and load page by page, there is no
+ * case, zswap actually does store and load page by page, there is no
* existing method to send the second page before the first page is done
- * in one thread doing frontswap.
+ * in one thread doing zwap.
* but in different threads running on different cpu, we have different
* acomp instance, so multiple threads can do (de)compression in parallel.
*/
ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
dlen = acomp_ctx->req->dlen;
- if (ret) {
- ret = -EINVAL;
+ if (ret)
goto put_dstmem;
- }
/* store */
+ zpool = zswap_find_zpool(entry);
gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
- if (zpool_malloc_support_movable(entry->pool->zpool))
+ if (zpool_malloc_support_movable(zpool))
gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
- ret = zpool_malloc(entry->pool->zpool, dlen, gfp, &handle);
+ ret = zpool_malloc(zpool, dlen, gfp, &handle);
if (ret == -ENOSPC) {
zswap_reject_compress_poor++;
goto put_dstmem;
@@ -1336,9 +1313,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
zswap_reject_alloc_fail++;
goto put_dstmem;
}
- buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
+ buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
memcpy(buf, dst, dlen);
- zpool_unmap_handle(entry->pool->zpool, handle);
+ zpool_unmap_handle(zpool, handle);
mutex_unlock(acomp_ctx->mutex);
/* populate entry */
@@ -1356,15 +1333,10 @@ insert_entry:
/* map */
spin_lock(&tree->lock);
- do {
- ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
- if (ret == -EEXIST) {
- zswap_duplicate_entry++;
- /* remove from rbtree */
- zswap_rb_erase(&tree->rbroot, dupentry);
- zswap_entry_put(tree, dupentry);
- }
- } while (ret == -EEXIST);
+ while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
+ zswap_duplicate_entry++;
+ zswap_invalidate_entry(tree, dupentry);
+ }
if (entry->length) {
spin_lock(&entry->pool->lru_lock);
list_add(&entry->lru, &entry->pool->lru);
@@ -1377,7 +1349,7 @@ insert_entry:
zswap_update_total_size();
count_vm_event(ZSWPOUT);
- return 0;
+ return true;
put_dstmem:
mutex_unlock(acomp_ctx->mutex);
@@ -1387,38 +1359,38 @@ freepage:
reject:
if (objcg)
obj_cgroup_put(objcg);
- return ret;
+ return false;
shrink:
pool = zswap_pool_last_get();
if (pool)
queue_work(shrink_wq, &pool->shrink_work);
- ret = -ENOMEM;
goto reject;
}
-/*
- * returns 0 if the page was successfully decompressed
- * return -1 on entry not found or error
-*/
-static int zswap_frontswap_load(unsigned type, pgoff_t offset,
- struct page *page, bool *exclusive)
+bool zswap_load(struct folio *folio)
{
+ swp_entry_t swp = folio->swap;
+ int type = swp_type(swp);
+ pgoff_t offset = swp_offset(swp);
+ struct page *page = &folio->page;
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
u8 *src, *dst, *tmp;
+ struct zpool *zpool;
unsigned int dlen;
- int ret;
+ bool ret;
+
+ VM_WARN_ON_ONCE(!folio_test_locked(folio));
/* find */
spin_lock(&tree->lock);
entry = zswap_entry_find_get(&tree->rbroot, offset);
if (!entry) {
- /* entry was written back */
spin_unlock(&tree->lock);
- return -1;
+ return false;
}
spin_unlock(&tree->lock);
@@ -1426,26 +1398,27 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
dst = kmap_atomic(page);
zswap_fill_page(dst, entry->value);
kunmap_atomic(dst);
- ret = 0;
+ ret = true;
goto stats;
}
- if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+ zpool = zswap_find_zpool(entry);
+ if (!zpool_can_sleep_mapped(zpool)) {
tmp = kmalloc(entry->length, GFP_KERNEL);
if (!tmp) {
- ret = -ENOMEM;
+ ret = false;
goto freeentry;
}
}
/* decompress */
dlen = PAGE_SIZE;
- src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
+ src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
- if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+ if (!zpool_can_sleep_mapped(zpool)) {
memcpy(tmp, src, entry->length);
src = tmp;
- zpool_unmap_handle(entry->pool->zpool, entry->handle);
+ zpool_unmap_handle(zpool, entry->handle);
}
acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
@@ -1454,24 +1427,25 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
sg_init_table(&output, 1);
sg_set_page(&output, page, PAGE_SIZE, 0);
acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
- ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
+ if (crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait))
+ WARN_ON(1);
mutex_unlock(acomp_ctx->mutex);
- if (zpool_can_sleep_mapped(entry->pool->zpool))
- zpool_unmap_handle(entry->pool->zpool, entry->handle);
+ if (zpool_can_sleep_mapped(zpool))
+ zpool_unmap_handle(zpool, entry->handle);
else
kfree(tmp);
- BUG_ON(ret);
+ ret = true;
stats:
count_vm_event(ZSWPIN);
if (entry->objcg)
count_objcg_event(entry->objcg, ZSWPIN);
freeentry:
spin_lock(&tree->lock);
- if (!ret && zswap_exclusive_loads_enabled) {
+ if (ret && zswap_exclusive_loads_enabled) {
zswap_invalidate_entry(tree, entry);
- *exclusive = true;
+ folio_mark_dirty(folio);
} else if (entry->length) {
spin_lock(&entry->pool->lru_lock);
list_move(&entry->lru, &entry->pool->lru);
@@ -1483,8 +1457,7 @@ freeentry:
return ret;
}
-/* frees an entry in zswap */
-static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
+void zswap_invalidate(int type, pgoff_t offset)
{
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry;
@@ -1501,8 +1474,22 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
spin_unlock(&tree->lock);
}
-/* frees all zswap entries for the given swap type */
-static void zswap_frontswap_invalidate_area(unsigned type)
+void zswap_swapon(int type)
+{
+ struct zswap_tree *tree;
+
+ tree = kzalloc(sizeof(*tree), GFP_KERNEL);
+ if (!tree) {
+ pr_err("alloc failed, zswap disabled for swap type %d\n", type);
+ return;
+ }
+
+ tree->rbroot = RB_ROOT;
+ spin_lock_init(&tree->lock);
+ zswap_trees[type] = tree;
+}
+
+void zswap_swapoff(int type)
{
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry, *n;
@@ -1520,29 +1507,6 @@ static void zswap_frontswap_invalidate_area(unsigned type)
zswap_trees[type] = NULL;
}
-static void zswap_frontswap_init(unsigned type)
-{
- struct zswap_tree *tree;
-
- tree = kzalloc(sizeof(*tree), GFP_KERNEL);
- if (!tree) {
- pr_err("alloc failed, zswap disabled for swap type %d\n", type);
- return;
- }
-
- tree->rbroot = RB_ROOT;
- spin_lock_init(&tree->lock);
- zswap_trees[type] = tree;
-}
-
-static const struct frontswap_ops zswap_frontswap_ops = {
- .store = zswap_frontswap_store,
- .load = zswap_frontswap_load,
- .invalidate_page = zswap_frontswap_invalidate_page,
- .invalidate_area = zswap_frontswap_invalidate_area,
- .init = zswap_frontswap_init
-};
-
/*********************************
* debugfs functions
**********************************/
@@ -1619,7 +1583,7 @@ static int zswap_setup(void)
pool = __zswap_pool_create_fallback();
if (pool) {
pr_info("loaded using pool %s/%s\n", pool->tfm_name,
- zpool_get_type(pool->zpool));
+ zpool_get_type(pool->zpools[0]));
list_add(&pool->list, &zswap_pools);
zswap_has_pool = true;
} else {
@@ -1631,16 +1595,11 @@ static int zswap_setup(void)
if (!shrink_wq)
goto fallback_fail;
- ret = frontswap_register_ops(&zswap_frontswap_ops);
- if (ret)
- goto destroy_wq;
if (zswap_debugfs_init())
pr_warn("debugfs initialization failed\n");
zswap_init_state = ZSWAP_INIT_SUCCEED;
return 0;
-destroy_wq:
- destroy_workqueue(shrink_wq);
fallback_fail:
if (pool)
zswap_pool_destroy(pool);