summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig12
-rw-r--r--mm/Makefile4
-rw-r--r--mm/bootmem_info.c2
-rw-r--r--mm/cma.c2
-rw-r--r--mm/compaction.c91
-rw-r--r--mm/damon/Kconfig12
-rw-r--r--mm/damon/core-test.h29
-rw-r--r--mm/damon/core.c298
-rw-r--r--mm/damon/dbgfs.c3
-rw-r--r--mm/damon/lru_sort.c6
-rw-r--r--mm/damon/ops-common.c5
-rw-r--r--mm/damon/paddr.c11
-rw-r--r--mm/damon/reclaim.c2
-rw-r--r--mm/damon/sysfs-common.h2
-rw-r--r--mm/damon/sysfs-schemes.c133
-rw-r--r--mm/damon/sysfs-test.h86
-rw-r--r--mm/damon/sysfs.c130
-rw-r--r--mm/damon/vaddr-test.h2
-rw-r--r--mm/damon/vaddr.c25
-rw-r--r--mm/debug_vm_pgtable.c4
-rw-r--r--mm/filemap.c219
-rw-r--r--mm/gup.c16
-rw-r--r--mm/huge_memory.c134
-rw-r--r--mm/hugetlb.c750
-rw-r--r--mm/hugetlb_cgroup.c20
-rw-r--r--mm/hugetlb_vmemmap.c301
-rw-r--r--mm/hugetlb_vmemmap.h31
-rw-r--r--mm/internal.h147
-rw-r--r--mm/kasan/kasan.h1
-rw-r--r--mm/kasan/kasan_test.c19
-rw-r--r--mm/kasan/kasan_test_module.c2
-rw-r--r--mm/kasan/quarantine.c4
-rw-r--r--mm/kasan/report.c6
-rw-r--r--mm/kasan/report_generic.c6
-rw-r--r--mm/kasan/shadow.c2
-rw-r--r--mm/khugepaged.c139
-rw-r--r--mm/kmemleak.c144
-rw-r--r--mm/kmsan/core.c127
-rw-r--r--mm/kmsan/kmsan_test.c109
-rw-r--r--mm/kmsan/shadow.c9
-rw-r--r--mm/ksm.c116
-rw-r--r--mm/madvise.c33
-rw-r--r--mm/memblock.c49
-rw-r--r--mm/memcontrol.c414
-rw-r--r--mm/memfd.c6
-rw-r--r--mm/memory-failure.c29
-rw-r--r--mm/memory-tiers.c175
-rw-r--r--mm/memory.c315
-rw-r--r--mm/memory_hotplug.c8
-rw-r--r--mm/mempolicy.c1049
-rw-r--r--mm/migrate.c246
-rw-r--r--mm/mlock.c95
-rw-r--r--mm/mm_init.c4
-rw-r--r--mm/mmap.c207
-rw-r--r--mm/mmzone.c6
-rw-r--r--mm/mprotect.c58
-rw-r--r--mm/mremap.c107
-rw-r--r--mm/nommu.c13
-rw-r--r--mm/oom_kill.c16
-rw-r--r--mm/page-writeback.c37
-rw-r--r--mm/page_alloc.c343
-rw-r--r--mm/page_io.c8
-rw-r--r--mm/page_owner.c4
-rw-r--r--mm/percpu.c43
-rw-r--r--mm/readahead.c3
-rw-r--r--mm/rmap.c187
-rw-r--r--mm/shmem.c618
-rw-r--r--mm/show_mem.c11
-rw-r--r--mm/shrinker.c809
-rw-r--r--mm/shrinker_debug.c35
-rw-r--r--mm/slab.h15
-rw-r--r--mm/slab_common.c84
-rw-r--r--mm/slub.c73
-rw-r--r--mm/swap.h9
-rw-r--r--mm/swap_state.c92
-rw-r--r--mm/swapfile.c23
-rw-r--r--mm/util.c5
-rw-r--r--mm/vmalloc.c4
-rw-r--r--mm/vmscan.c734
-rw-r--r--mm/vmstat.c30
-rw-r--r--mm/workingset.c29
-rw-r--r--mm/zsmalloc.c29
-rw-r--r--mm/zswap.c39
83 files changed, 5784 insertions, 3471 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 264a2df5ecf5..89971a894b60 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -130,6 +130,7 @@ config ZSWAP_COMPRESSOR_DEFAULT
choice
prompt "Default allocator"
depends on ZSWAP
+ default ZSWAP_ZPOOL_DEFAULT_ZSMALLOC if MMU
default ZSWAP_ZPOOL_DEFAULT_ZBUD
help
Selects the default allocator for the compressed cache for
@@ -704,6 +705,17 @@ config HUGETLB_PAGE_SIZE_VARIABLE
config CONTIG_ALLOC
def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
+config PCP_BATCH_SCALE_MAX
+ int "Maximum scale factor of PCP (Per-CPU pageset) batch allocate/free"
+ default 5
+ range 0 6
+ help
+ In page allocator, PCP (Per-CPU pageset) is refilled and drained in
+ batches. The batch number is scaled automatically to improve page
+ allocation/free throughput. But too large scale factor may hurt
+ latency. This option sets the upper limit of scale factor to limit
+ the maximum latency.
+
config PHYS_ADDR_T_64BIT
def_bool 64BIT
diff --git a/mm/Makefile b/mm/Makefile
index ec65984e2ade..33873c8aedb3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -48,8 +48,8 @@ endif
obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page-writeback.o folio-compat.o \
- readahead.o swap.o truncate.o vmscan.o shmem.o \
- util.o mmzone.o vmstat.o backing-dev.o \
+ readahead.o swap.o truncate.o vmscan.o shrinker.o \
+ shmem.o util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o percpu.o slab_common.o \
compaction.o show_mem.o shmem_quota.o\
interval_tree.o list_lru.o workingset.o \
diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c
index b1efebfcf94b..fa7cb0c87c03 100644
--- a/mm/bootmem_info.c
+++ b/mm/bootmem_info.c
@@ -34,7 +34,7 @@ void put_page_bootmem(struct page *page)
ClearPagePrivate(page);
set_page_private(page, 0);
INIT_LIST_HEAD(&page->lru);
- kmemleak_free_part(page_to_virt(page), PAGE_SIZE);
+ kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE);
free_reserved_page(page);
}
}
diff --git a/mm/cma.c b/mm/cma.c
index da2967c6a223..2b2494fd6b59 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -505,7 +505,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
*/
if (page) {
for (i = 0; i < count; i++)
- page_kasan_tag_reset(page + i);
+ page_kasan_tag_reset(nth_page(page, i));
}
if (ret && !no_warn) {
diff --git a/mm/compaction.c b/mm/compaction.c
index 38c8d216c6a3..01ba298739dd 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -382,6 +382,7 @@ static void __reset_isolation_suitable(struct zone *zone)
bool source_set = false;
bool free_set = false;
+ /* Only flush if a full compaction finished recently */
if (!zone->compact_blockskip_flush)
return;
@@ -434,9 +435,7 @@ void reset_isolation_suitable(pg_data_t *pgdat)
if (!populated_zone(zone))
continue;
- /* Only flush if a full compaction finished recently */
- if (zone->compact_blockskip_flush)
- __reset_isolation_suitable(zone);
+ __reset_isolation_suitable(zone);
}
}
@@ -626,11 +625,12 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (PageCompound(page)) {
const unsigned int order = compound_order(page);
- if (likely(order <= MAX_ORDER)) {
+ if (blockpfn + (1UL << order) <= end_pfn) {
blockpfn += (1UL << order) - 1;
page += (1UL << order) - 1;
nr_scanned += (1UL << order) - 1;
}
+
goto isolate_fail;
}
@@ -678,8 +678,7 @@ isolate_fail:
spin_unlock_irqrestore(&cc->zone->lock, flags);
/*
- * There is a tiny chance that we have read bogus compound_order(),
- * so be careful to not go outside of the pageblock.
+ * Be careful to not go outside of the pageblock.
*/
if (unlikely(blockpfn > end_pfn))
blockpfn = end_pfn;
@@ -1395,8 +1394,8 @@ move_freelist_head(struct list_head *freelist, struct page *freepage)
{
LIST_HEAD(sublist);
- if (!list_is_last(freelist, &freepage->lru)) {
- list_cut_before(&sublist, freelist, &freepage->lru);
+ if (!list_is_first(&freepage->buddy_list, freelist)) {
+ list_cut_before(&sublist, freelist, &freepage->buddy_list);
list_splice_tail(&sublist, freelist);
}
}
@@ -1412,8 +1411,8 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage)
{
LIST_HEAD(sublist);
- if (!list_is_first(freelist, &freepage->lru)) {
- list_cut_position(&sublist, freelist, &freepage->lru);
+ if (!list_is_last(&freepage->buddy_list, freelist)) {
+ list_cut_position(&sublist, freelist, &freepage->buddy_list);
list_splice_tail(&sublist, freelist);
}
}
@@ -2066,8 +2065,10 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
}
/*
- * order == -1 is expected when compacting via
- * /proc/sys/vm/compact_memory
+ * order == -1 is expected when compacting proactively via
+ * 1. /proc/sys/vm/compact_memory
+ * 2. /sys/devices/system/node/nodex/compact
+ * 3. /proc/sys/vm/compaction_proactiveness
*/
static inline bool is_via_compact_memory(int order)
{
@@ -2377,6 +2378,30 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
return false;
}
+/*
+ * Should we do compaction for target allocation order.
+ * Return COMPACT_SUCCESS if allocation for target order can be already
+ * satisfied
+ * Return COMPACT_SKIPPED if compaction for target order is likely to fail
+ * Return COMPACT_CONTINUE if compaction for target order should be ran
+ */
+static enum compact_result
+compaction_suit_allocation_order(struct zone *zone, unsigned int order,
+ int highest_zoneidx, unsigned int alloc_flags)
+{
+ unsigned long watermark;
+
+ watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
+ if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
+ alloc_flags))
+ return COMPACT_SUCCESS;
+
+ if (!compaction_suitable(zone, order, highest_zoneidx))
+ return COMPACT_SKIPPED;
+
+ return COMPACT_CONTINUE;
+}
+
static enum compact_result
compact_zone(struct compact_control *cc, struct capture_control *capc)
{
@@ -2402,19 +2427,11 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
cc->migratetype = gfp_migratetype(cc->gfp_mask);
if (!is_via_compact_memory(cc->order)) {
- unsigned long watermark;
-
- /* Allocation can already succeed, nothing to do */
- watermark = wmark_pages(cc->zone,
- cc->alloc_flags & ALLOC_WMARK_MASK);
- if (zone_watermark_ok(cc->zone, cc->order, watermark,
- cc->highest_zoneidx, cc->alloc_flags))
- return COMPACT_SUCCESS;
-
- /* Compaction is likely to fail */
- if (!compaction_suitable(cc->zone, cc->order,
- cc->highest_zoneidx))
- return COMPACT_SKIPPED;
+ ret = compaction_suit_allocation_order(cc->zone, cc->order,
+ cc->highest_zoneidx,
+ cc->alloc_flags);
+ if (ret != COMPACT_CONTINUE)
+ return ret;
}
/*
@@ -2913,6 +2930,7 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
int zoneid;
struct zone *zone;
enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx;
+ enum compact_result ret;
for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) {
zone = &pgdat->node_zones[zoneid];
@@ -2920,14 +2938,10 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
if (!populated_zone(zone))
continue;
- /* Allocation can already succeed, check other zones */
- if (zone_watermark_ok(zone, pgdat->kcompactd_max_order,
- min_wmark_pages(zone),
- highest_zoneidx, 0))
- continue;
-
- if (compaction_suitable(zone, pgdat->kcompactd_max_order,
- highest_zoneidx))
+ ret = compaction_suit_allocation_order(zone,
+ pgdat->kcompactd_max_order,
+ highest_zoneidx, ALLOC_WMARK_MIN);
+ if (ret == COMPACT_CONTINUE)
return true;
}
@@ -2950,6 +2964,8 @@ static void kcompactd_do_work(pg_data_t *pgdat)
.ignore_skip_hint = false,
.gfp_mask = GFP_KERNEL,
};
+ enum compact_result ret;
+
trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
cc.highest_zoneidx);
count_compact_event(KCOMPACTD_WAKE);
@@ -2964,12 +2980,9 @@ static void kcompactd_do_work(pg_data_t *pgdat)
if (compaction_deferred(zone, cc.order))
continue;
- /* Allocation can already succeed, nothing to do */
- if (zone_watermark_ok(zone, cc.order,
- min_wmark_pages(zone), zoneid, 0))
- continue;
-
- if (!compaction_suitable(zone, cc.order, zoneid))
+ ret = compaction_suit_allocation_order(zone,
+ cc.order, zoneid, ALLOC_WMARK_MIN);
+ if (ret != COMPACT_CONTINUE)
continue;
if (kthread_should_stop())
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index 436c6b4cb5ec..29f43fbc2eff 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -59,6 +59,18 @@ config DAMON_SYSFS
This builds the sysfs interface for DAMON. The user space can use
the interface for arbitrary data access monitoring.
+config DAMON_SYSFS_KUNIT_TEST
+ bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS
+ depends on DAMON_SYSFS && KUNIT=y
+ default KUNIT_ALL_TESTS
+ help
+ This builds the DAMON sysfs interface Kunit test suite.
+
+ For more information on KUnit and unit tests in general, please refer
+ to the KUnit documentation.
+
+ If unsure, say N.
+
config DAMON_DBGFS
bool "DAMON debugfs interface (DEPRECATED!)"
depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS
diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h
index 6cc8b245586d..649adf91ebc5 100644
--- a/mm/damon/core-test.h
+++ b/mm/damon/core-test.h
@@ -30,7 +30,7 @@ static void damon_test_regions(struct kunit *test)
damon_add_region(r, t);
KUNIT_EXPECT_EQ(test, 1u, damon_nr_regions(t));
- damon_del_region(r, t);
+ damon_destroy_region(r, t);
KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t));
damon_free_target(t);
@@ -94,6 +94,7 @@ static void damon_test_aggregate(struct kunit *test)
for (ir = 0; ir < 3; ir++) {
r = damon_new_region(saddr[it][ir], eaddr[it][ir]);
r->nr_accesses = accesses[it][ir];
+ r->nr_accesses_bp = accesses[it][ir] * 10000;
damon_add_region(r, t);
}
it++;
@@ -147,9 +148,11 @@ static void damon_test_merge_two(struct kunit *test)
t = damon_new_target();
r = damon_new_region(0, 100);
r->nr_accesses = 10;
+ r->nr_accesses_bp = 100000;
damon_add_region(r, t);
r2 = damon_new_region(100, 300);
r2->nr_accesses = 20;
+ r2->nr_accesses_bp = 200000;
damon_add_region(r2, t);
damon_merge_two_regions(t, r, r2);
@@ -196,6 +199,7 @@ static void damon_test_merge_regions_of(struct kunit *test)
for (i = 0; i < ARRAY_SIZE(sa); i++) {
r = damon_new_region(sa[i], ea[i]);
r->nr_accesses = nrs[i];
+ r->nr_accesses_bp = nrs[i] * 10000;
damon_add_region(r, t);
}
@@ -265,6 +269,8 @@ static void damon_test_ops_registration(struct kunit *test)
/* Check double-registration failure again */
KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL);
+
+ damon_destroy_ctx(c);
}
static void damon_test_set_regions(struct kunit *test)
@@ -297,6 +303,7 @@ static void damon_test_update_monitoring_result(struct kunit *test)
struct damon_region *r = damon_new_region(3, 7);
r->nr_accesses = 15;
+ r->nr_accesses_bp = 150000;
r->age = 20;
new_attrs = (struct damon_attrs){
@@ -316,6 +323,8 @@ static void damon_test_update_monitoring_result(struct kunit *test)
damon_update_monitoring_result(r, &old_attrs, &new_attrs);
KUNIT_EXPECT_EQ(test, r->nr_accesses, 150);
KUNIT_EXPECT_EQ(test, r->age, 20);
+
+ damon_free_region(r);
}
static void damon_test_set_attrs(struct kunit *test)
@@ -339,6 +348,23 @@ static void damon_test_set_attrs(struct kunit *test)
invalid_attrs = valid_attrs;
invalid_attrs.aggr_interval = 4999;
KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL);
+
+ damon_destroy_ctx(c);
+}
+
+static void damon_test_moving_sum(struct kunit *test)
+{
+ unsigned int mvsum = 50000, nomvsum = 50000, len_window = 10;
+ unsigned int new_values[] = {10000, 0, 10000, 0, 0, 0, 10000, 0, 0, 0};
+ unsigned int expects[] = {55000, 50000, 55000, 50000, 45000, 40000,
+ 45000, 40000, 35000, 30000};
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(new_values); i++) {
+ mvsum = damon_moving_sum(mvsum, nomvsum, len_window,
+ new_values[i]);
+ KUNIT_EXPECT_EQ(test, mvsum, expects[i]);
+ }
}
static void damos_test_new_filter(struct kunit *test)
@@ -425,6 +451,7 @@ static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_set_regions),
KUNIT_CASE(damon_test_update_monitoring_result),
KUNIT_CASE(damon_test_set_attrs),
+ KUNIT_CASE(damon_test_moving_sum),
KUNIT_CASE(damos_test_new_filter),
KUNIT_CASE(damos_test_filter_out),
{},
diff --git a/mm/damon/core.c b/mm/damon/core.c
index bcd2bd9d6c10..630077d95dc6 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -128,6 +128,7 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end)
region->ar.start = start;
region->ar.end = end;
region->nr_accesses = 0;
+ region->nr_accesses_bp = 0;
INIT_LIST_HEAD(&region->list);
region->age = 0;
@@ -312,7 +313,9 @@ static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota)
}
struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
- enum damos_action action, struct damos_quota *quota,
+ enum damos_action action,
+ unsigned long apply_interval_us,
+ struct damos_quota *quota,
struct damos_watermarks *wmarks)
{
struct damos *scheme;
@@ -322,6 +325,13 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
return NULL;
scheme->pattern = *pattern;
scheme->action = action;
+ scheme->apply_interval_us = apply_interval_us;
+ /*
+ * next_apply_sis will be set when kdamond starts. While kdamond is
+ * running, it will also updated when it is added to the DAMON context,
+ * or damon_attrs are updated.
+ */
+ scheme->next_apply_sis = 0;
INIT_LIST_HEAD(&scheme->filters);
scheme->stat = (struct damos_stat){};
INIT_LIST_HEAD(&scheme->list);
@@ -334,9 +344,21 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
return scheme;
}
+static void damos_set_next_apply_sis(struct damos *s, struct damon_ctx *ctx)
+{
+ unsigned long sample_interval = ctx->attrs.sample_interval ?
+ ctx->attrs.sample_interval : 1;
+ unsigned long apply_interval = s->apply_interval_us ?
+ s->apply_interval_us : ctx->attrs.aggr_interval;
+
+ s->next_apply_sis = ctx->passed_sample_intervals +
+ apply_interval / sample_interval;
+}
+
void damon_add_scheme(struct damon_ctx *ctx, struct damos *s)
{
list_add_tail(&s->list, &ctx->schemes);
+ damos_set_next_apply_sis(s, ctx);
}
static void damon_del_scheme(struct damos *s)
@@ -427,8 +449,10 @@ struct damon_ctx *damon_new_ctx(void)
ctx->attrs.aggr_interval = 100 * 1000;
ctx->attrs.ops_update_interval = 60 * 1000 * 1000;
- ktime_get_coarse_ts64(&ctx->last_aggregation);
- ctx->last_ops_update = ctx->last_aggregation;
+ ctx->passed_sample_intervals = 0;
+ /* These will be set from kdamond_init_intervals_sis() */
+ ctx->next_aggregation_sis = 0;
+ ctx->next_ops_update_sis = 0;
mutex_init(&ctx->kdamond_lock);
@@ -476,20 +500,14 @@ static unsigned int damon_age_for_new_attrs(unsigned int age,
static unsigned int damon_accesses_bp_to_nr_accesses(
unsigned int accesses_bp, struct damon_attrs *attrs)
{
- unsigned int max_nr_accesses =
- attrs->aggr_interval / attrs->sample_interval;
-
- return accesses_bp * max_nr_accesses / 10000;
+ return accesses_bp * damon_max_nr_accesses(attrs) / 10000;
}
/* convert nr_accesses to access ratio in bp (per 10,000) */
static unsigned int damon_nr_accesses_to_accesses_bp(
unsigned int nr_accesses, struct damon_attrs *attrs)
{
- unsigned int max_nr_accesses =
- attrs->aggr_interval / attrs->sample_interval;
-
- return nr_accesses * 10000 / max_nr_accesses;
+ return nr_accesses * 10000 / damon_max_nr_accesses(attrs);
}
static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses,
@@ -506,6 +524,7 @@ static void damon_update_monitoring_result(struct damon_region *r,
{
r->nr_accesses = damon_nr_accesses_for_new_attrs(r->nr_accesses,
old_attrs, new_attrs);
+ r->nr_accesses_bp = r->nr_accesses * 10000;
r->age = damon_age_for_new_attrs(r->age, old_attrs, new_attrs);
}
@@ -541,13 +560,21 @@ static void damon_update_monitoring_results(struct damon_ctx *ctx,
* @ctx: monitoring context
* @attrs: monitoring attributes
*
- * This function should not be called while the kdamond is running.
+ * This function should be called while the kdamond is not running, or an
+ * access check results aggregation is not ongoing (e.g., from
+ * &struct damon_callback->after_aggregation or
+ * &struct damon_callback->after_wmarks_check callbacks).
+ *
* Every time interval is in micro-seconds.
*
* Return: 0 on success, negative error code otherwise.
*/
int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
{
+ unsigned long sample_interval = attrs->sample_interval ?
+ attrs->sample_interval : 1;
+ struct damos *s;
+
if (attrs->min_nr_regions < 3)
return -EINVAL;
if (attrs->min_nr_regions > attrs->max_nr_regions)
@@ -555,8 +582,17 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
if (attrs->sample_interval > attrs->aggr_interval)
return -EINVAL;
+ ctx->next_aggregation_sis = ctx->passed_sample_intervals +
+ attrs->aggr_interval / sample_interval;
+ ctx->next_ops_update_sis = ctx->passed_sample_intervals +
+ attrs->ops_update_interval / sample_interval;
+
damon_update_monitoring_results(ctx, attrs);
ctx->attrs = *attrs;
+
+ damon_for_each_scheme(s, ctx)
+ damos_set_next_apply_sis(s, ctx);
+
return 0;
}
@@ -699,8 +735,7 @@ static int __damon_stop(struct damon_ctx *ctx)
if (tsk) {
get_task_struct(tsk);
mutex_unlock(&ctx->kdamond_lock);
- kthread_stop(tsk);
- put_task_struct(tsk);
+ kthread_stop_put(tsk);
return 0;
}
mutex_unlock(&ctx->kdamond_lock);
@@ -729,38 +764,6 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs)
}
/*
- * damon_check_reset_time_interval() - Check if a time interval is elapsed.
- * @baseline: the time to check whether the interval has elapsed since
- * @interval: the time interval (microseconds)
- *
- * See whether the given time interval has passed since the given baseline
- * time. If so, it also updates the baseline to current time for next check.
- *
- * Return: true if the time interval has passed, or false otherwise.
- */
-static bool damon_check_reset_time_interval(struct timespec64 *baseline,
- unsigned long interval)
-{
- struct timespec64 now;
-
- ktime_get_coarse_ts64(&now);
- if ((timespec64_to_ns(&now) - timespec64_to_ns(baseline)) <
- interval * 1000)
- return false;
- *baseline = now;
- return true;
-}
-
-/*
- * Check whether it is time to flush the aggregated information
- */
-static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx)
-{
- return damon_check_reset_time_interval(&ctx->last_aggregation,
- ctx->attrs.aggr_interval);
-}
-
-/*
* Reset the aggregated monitoring results ('nr_accesses' of each region).
*/
static void kdamond_reset_aggregated(struct damon_ctx *c)
@@ -772,7 +775,7 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
struct damon_region *r;
damon_for_each_region(r, t) {
- trace_damon_aggregated(t, ti, r, damon_nr_regions(t));
+ trace_damon_aggregated(ti, r, damon_nr_regions(t));
r->last_nr_accesses = r->nr_accesses;
r->nr_accesses = 0;
}
@@ -786,12 +789,13 @@ static void damon_split_region_at(struct damon_target *t,
static bool __damos_valid_target(struct damon_region *r, struct damos *s)
{
unsigned long sz;
+ unsigned int nr_accesses = r->nr_accesses_bp / 10000;
sz = damon_sz_region(r);
return s->pattern.min_sz_region <= sz &&
sz <= s->pattern.max_sz_region &&
- s->pattern.min_nr_accesses <= r->nr_accesses &&
- r->nr_accesses <= s->pattern.max_nr_accesses &&
+ s->pattern.min_nr_accesses <= nr_accesses &&
+ nr_accesses <= s->pattern.max_nr_accesses &&
s->pattern.min_age_region <= r->age &&
r->age <= s->pattern.max_age_region;
}
@@ -946,6 +950,33 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
struct timespec64 begin, end;
unsigned long sz_applied = 0;
int err = 0;
+ /*
+ * We plan to support multiple context per kdamond, as DAMON sysfs
+ * implies with 'nr_contexts' file. Nevertheless, only single context
+ * per kdamond is supported for now. So, we can simply use '0' context
+ * index here.
+ */
+ unsigned int cidx = 0;
+ struct damos *siter; /* schemes iterator */
+ unsigned int sidx = 0;
+ struct damon_target *titer; /* targets iterator */
+ unsigned int tidx = 0;
+ bool do_trace = false;
+
+ /* get indices for trace_damos_before_apply() */
+ if (trace_damos_before_apply_enabled()) {
+ damon_for_each_scheme(siter, c) {
+ if (siter == s)
+ break;
+ sidx++;
+ }
+ damon_for_each_target(titer, c) {
+ if (titer == t)
+ break;
+ tidx++;
+ }
+ do_trace = true;
+ }
if (c->ops.apply_scheme) {
if (quota->esz && quota->charged_sz + sz > quota->esz) {
@@ -960,8 +991,11 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
ktime_get_coarse_ts64(&begin);
if (c->callback.before_damos_apply)
err = c->callback.before_damos_apply(c, t, r, s);
- if (!err)
+ if (!err) {
+ trace_damos_before_apply(cidx, sidx, tidx, r,
+ damon_nr_regions(t), do_trace);
sz_applied = c->ops.apply_scheme(c, t, r, s);
+ }
ktime_get_coarse_ts64(&end);
quota->total_charged_ns += timespec64_to_ns(&end) -
timespec64_to_ns(&begin);
@@ -1079,14 +1113,29 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
struct damon_target *t;
struct damon_region *r, *next_r;
struct damos *s;
+ unsigned long sample_interval = c->attrs.sample_interval ?
+ c->attrs.sample_interval : 1;
+ bool has_schemes_to_apply = false;
damon_for_each_scheme(s, c) {
+ if (c->passed_sample_intervals != s->next_apply_sis)
+ continue;
+
+ s->next_apply_sis +=
+ (s->apply_interval_us ? s->apply_interval_us :
+ c->attrs.aggr_interval) / sample_interval;
+
if (!s->wmarks.activated)
continue;
+ has_schemes_to_apply = true;
+
damos_adjust_quota(c, s);
}
+ if (!has_schemes_to_apply)
+ return;
+
damon_for_each_target(t, c) {
damon_for_each_region_safe(r, next_r, t)
damon_do_apply_schemes(c, t, r);
@@ -1103,6 +1152,7 @@ static void damon_merge_two_regions(struct damon_target *t,
l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) /
(sz_l + sz_r);
+ l->nr_accesses_bp = l->nr_accesses * 10000;
l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r);
l->ar.end = r->ar.end;
damon_destroy_region(r, t);
@@ -1174,6 +1224,7 @@ static void damon_split_region_at(struct damon_target *t,
new->age = r->age;
new->last_nr_accesses = r->last_nr_accesses;
+ new->nr_accesses_bp = r->nr_accesses_bp;
damon_insert_region(new, r, damon_next_region(r), t);
}
@@ -1241,18 +1292,6 @@ static void kdamond_split_regions(struct damon_ctx *ctx)
}
/*
- * Check whether it is time to check and apply the operations-related data
- * structures.
- *
- * Returns true if it is.
- */
-static bool kdamond_need_update_operations(struct damon_ctx *ctx)
-{
- return damon_check_reset_time_interval(&ctx->last_ops_update,
- ctx->attrs.ops_update_interval);
-}
-
-/*
* Check whether current monitoring should be stopped
*
* The monitoring is stopped when either the user requested to stop, or all
@@ -1280,12 +1319,10 @@ static bool kdamond_need_stop(struct damon_ctx *ctx)
static unsigned long damos_wmark_metric_value(enum damos_wmark_metric metric)
{
- struct sysinfo i;
-
switch (metric) {
case DAMOS_WMARK_FREE_MEM_RATE:
- si_meminfo(&i);
- return i.freeram * 1000 / i.totalram;
+ return global_zone_page_state(NR_FREE_PAGES) * 1000 /
+ totalram_pages();
default:
break;
}
@@ -1363,6 +1400,25 @@ static int kdamond_wait_activation(struct damon_ctx *ctx)
return -EBUSY;
}
+static void kdamond_init_intervals_sis(struct damon_ctx *ctx)
+{
+ unsigned long sample_interval = ctx->attrs.sample_interval ?
+ ctx->attrs.sample_interval : 1;
+ unsigned long apply_interval;
+ struct damos *scheme;
+
+ ctx->passed_sample_intervals = 0;
+ ctx->next_aggregation_sis = ctx->attrs.aggr_interval / sample_interval;
+ ctx->next_ops_update_sis = ctx->attrs.ops_update_interval /
+ sample_interval;
+
+ damon_for_each_scheme(scheme, ctx) {
+ apply_interval = scheme->apply_interval_us ?
+ scheme->apply_interval_us : ctx->attrs.aggr_interval;
+ scheme->next_apply_sis = apply_interval / sample_interval;
+ }
+}
+
/*
* The monitoring daemon that runs as a kernel thread
*/
@@ -1376,6 +1432,8 @@ static int kdamond_fn(void *data)
pr_debug("kdamond (%d) starts\n", current->pid);
+ kdamond_init_intervals_sis(ctx);
+
if (ctx->ops.init)
ctx->ops.init(ctx);
if (ctx->callback.before_start && ctx->callback.before_start(ctx))
@@ -1384,6 +1442,17 @@ static int kdamond_fn(void *data)
sz_limit = damon_region_sz_limit(ctx);
while (!kdamond_need_stop(ctx)) {
+ /*
+ * ctx->attrs and ctx->next_{aggregation,ops_update}_sis could
+ * be changed from after_wmarks_check() or after_aggregation()
+ * callbacks. Read the values here, and use those for this
+ * iteration. That is, damon_set_attrs() updated new values
+ * are respected from next iteration.
+ */
+ unsigned long next_aggregation_sis = ctx->next_aggregation_sis;
+ unsigned long next_ops_update_sis = ctx->next_ops_update_sis;
+ unsigned long sample_interval = ctx->attrs.sample_interval;
+
if (kdamond_wait_activation(ctx))
break;
@@ -1393,27 +1462,44 @@ static int kdamond_fn(void *data)
ctx->callback.after_sampling(ctx))
break;
- kdamond_usleep(ctx->attrs.sample_interval);
+ kdamond_usleep(sample_interval);
+ ctx->passed_sample_intervals++;
if (ctx->ops.check_accesses)
max_nr_accesses = ctx->ops.check_accesses(ctx);
- if (kdamond_aggregate_interval_passed(ctx)) {
+ if (ctx->passed_sample_intervals == next_aggregation_sis) {
kdamond_merge_regions(ctx,
max_nr_accesses / 10,
sz_limit);
if (ctx->callback.after_aggregation &&
ctx->callback.after_aggregation(ctx))
break;
- if (!list_empty(&ctx->schemes))
- kdamond_apply_schemes(ctx);
+ }
+
+ /*
+ * do kdamond_apply_schemes() after kdamond_merge_regions() if
+ * possible, to reduce overhead
+ */
+ if (!list_empty(&ctx->schemes))
+ kdamond_apply_schemes(ctx);
+
+ sample_interval = ctx->attrs.sample_interval ?
+ ctx->attrs.sample_interval : 1;
+ if (ctx->passed_sample_intervals == next_aggregation_sis) {
+ ctx->next_aggregation_sis = next_aggregation_sis +
+ ctx->attrs.aggr_interval / sample_interval;
+
kdamond_reset_aggregated(ctx);
kdamond_split_regions(ctx);
if (ctx->ops.reset_aggregated)
ctx->ops.reset_aggregated(ctx);
}
- if (kdamond_need_update_operations(ctx)) {
+ if (ctx->passed_sample_intervals == next_ops_update_sis) {
+ ctx->next_ops_update_sis = next_ops_update_sis +
+ ctx->attrs.ops_update_interval /
+ sample_interval;
if (ctx->ops.update)
ctx->ops.update(ctx);
sz_limit = damon_region_sz_limit(ctx);
@@ -1517,6 +1603,76 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t,
return damon_set_regions(t, &addr_range, 1);
}
+/*
+ * damon_moving_sum() - Calculate an inferred moving sum value.
+ * @mvsum: Inferred sum of the last @len_window values.
+ * @nomvsum: Non-moving sum of the last discrete @len_window window values.
+ * @len_window: The number of last values to take care of.
+ * @new_value: New value that will be added to the pseudo moving sum.
+ *
+ * Moving sum (moving average * window size) is good for handling noise, but
+ * the cost of keeping past values can be high for arbitrary window size. This
+ * function implements a lightweight pseudo moving sum function that doesn't
+ * keep the past window values.
+ *
+ * It simply assumes there was no noise in the past, and get the no-noise
+ * assumed past value to drop from @nomvsum and @len_window. @nomvsum is a
+ * non-moving sum of the last window. For example, if @len_window is 10 and we
+ * have 25 values, @nomvsum is the sum of the 11th to 20th values of the 25
+ * values. Hence, this function simply drops @nomvsum / @len_window from
+ * given @mvsum and add @new_value.
+ *
+ * For example, if @len_window is 10 and @nomvsum is 50, the last 10 values for
+ * the last window could be vary, e.g., 0, 10, 0, 10, 0, 10, 0, 0, 0, 20. For
+ * calculating next moving sum with a new value, we should drop 0 from 50 and
+ * add the new value. However, this function assumes it got value 5 for each
+ * of the last ten times. Based on the assumption, when the next value is
+ * measured, it drops the assumed past value, 5 from the current sum, and add
+ * the new value to get the updated pseduo-moving average.
+ *
+ * This means the value could have errors, but the errors will be disappeared
+ * for every @len_window aligned calls. For example, if @len_window is 10, the
+ * pseudo moving sum with 11th value to 19th value would have an error. But
+ * the sum with 20th value will not have the error.
+ *
+ * Return: Pseudo-moving average after getting the @new_value.
+ */
+static unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum,
+ unsigned int len_window, unsigned int new_value)
+{
+ return mvsum - nomvsum / len_window + new_value;
+}
+
+/**
+ * damon_update_region_access_rate() - Update the access rate of a region.
+ * @r: The DAMON region to update for its access check result.
+ * @accessed: Whether the region has accessed during last sampling interval.
+ * @attrs: The damon_attrs of the DAMON context.
+ *
+ * Update the access rate of a region with the region's last sampling interval
+ * access check result.
+ *
+ * Usually this will be called by &damon_operations->check_accesses callback.
+ */
+void damon_update_region_access_rate(struct damon_region *r, bool accessed,
+ struct damon_attrs *attrs)
+{
+ unsigned int len_window = 1;
+
+ /*
+ * sample_interval can be zero, but cannot be larger than
+ * aggr_interval, owing to validation of damon_set_attrs().
+ */
+ if (attrs->sample_interval)
+ len_window = damon_max_nr_accesses(attrs);
+ r->nr_accesses_bp = damon_moving_sum(r->nr_accesses_bp,
+ r->last_nr_accesses * 10000, len_window,
+ accessed ? 10000 : 0);
+
+ if (accessed)
+ r->nr_accesses++;
+}
+
static int __init damon_init(void)
{
damon_region_cache = KMEM_CACHE(damon_region, 0);
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index 124f0f8c97b7..dc0ea1fc30ca 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -278,7 +278,8 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
goto fail;
pos += parsed;
- scheme = damon_new_scheme(&pattern, action, &quota, &wmarks);
+ scheme = damon_new_scheme(&pattern, action, 0, &quota,
+ &wmarks);
if (!scheme)
goto fail;
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index 7b8fce2f67a8..f2e5f9431892 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -158,6 +158,8 @@ static struct damos *damon_lru_sort_new_scheme(
pattern,
/* (de)prioritize on LRU-lists */
action,
+ /* for each aggregation interval */
+ 0,
/* under the quota. */
&quota,
/* (De)activate this according to the watermarks. */
@@ -193,9 +195,7 @@ static int damon_lru_sort_apply_parameters(void)
if (err)
return err;
- /* aggr_interval / sample_interval is the maximum nr_accesses */
- hot_thres = damon_lru_sort_mon_attrs.aggr_interval /
- damon_lru_sort_mon_attrs.sample_interval *
+ hot_thres = damon_max_nr_accesses(&damon_lru_sort_mon_attrs) *
hot_thres_access_freq / 1000;
scheme = damon_lru_sort_new_hot_scheme(hot_thres);
if (!scheme)
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index ac1c3fa80f98..d25d99cb5f2b 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -73,7 +73,6 @@ void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr
int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s)
{
- unsigned int max_nr_accesses;
int freq_subscore;
unsigned int age_in_sec;
int age_in_log, age_subscore;
@@ -81,8 +80,8 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
unsigned int age_weight = s->quota.weight_age;
int hotness;
- max_nr_accesses = c->attrs.aggr_interval / c->attrs.sample_interval;
- freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses;
+ freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE /
+ damon_max_nr_accesses(&c->attrs);
age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000;
for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec;
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 909db25efb35..081e2a325778 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -148,7 +148,8 @@ out:
return accessed;
}
-static void __damon_pa_check_access(struct damon_region *r)
+static void __damon_pa_check_access(struct damon_region *r,
+ struct damon_attrs *attrs)
{
static unsigned long last_addr;
static unsigned long last_folio_sz = PAGE_SIZE;
@@ -157,14 +158,12 @@ static void __damon_pa_check_access(struct damon_region *r)
/* If the region is in the last checked page, reuse the result */
if (ALIGN_DOWN(last_addr, last_folio_sz) ==
ALIGN_DOWN(r->sampling_addr, last_folio_sz)) {
- if (last_accessed)
- r->nr_accesses++;
+ damon_update_region_access_rate(r, last_accessed, attrs);
return;
}
last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz);
- if (last_accessed)
- r->nr_accesses++;
+ damon_update_region_access_rate(r, last_accessed, attrs);
last_addr = r->sampling_addr;
}
@@ -177,7 +176,7 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t) {
- __damon_pa_check_access(r);
+ __damon_pa_check_access(r, &ctx->attrs);
max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
}
}
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index 648d2a85523a..ab974e477d2f 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -142,6 +142,8 @@ static struct damos *damon_reclaim_new_scheme(void)
&pattern,
/* page out those, as soon as found */
DAMOS_PAGEOUT,
+ /* for each aggregation interval */
+ 0,
/* under the quota. */
&damon_reclaim_quota,
/* (De)activate this according to the watermarks. */
diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h
index fd482a0639b4..5ff081226e28 100644
--- a/mm/damon/sysfs-common.h
+++ b/mm/damon/sysfs-common.h
@@ -49,6 +49,8 @@ int damon_sysfs_schemes_update_regions_start(
struct damon_sysfs_schemes *sysfs_schemes,
struct damon_ctx *ctx, bool total_bytes_only);
+bool damos_sysfs_regions_upd_done(void);
+
int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx);
int damon_sysfs_schemes_clear_regions(
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 527e7d17eb3b..45bd0fd4a8b1 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -31,7 +31,7 @@ static struct damon_sysfs_scheme_region *damon_sysfs_scheme_region_alloc(
return NULL;
sysfs_region->kobj = (struct kobject){};
sysfs_region->ar = region->ar;
- sysfs_region->nr_accesses = region->nr_accesses;
+ sysfs_region->nr_accesses = region->nr_accesses_bp / 10000;
sysfs_region->age = region->age;
INIT_LIST_HEAD(&sysfs_region->list);
return sysfs_region;
@@ -113,11 +113,47 @@ static const struct kobj_type damon_sysfs_scheme_region_ktype = {
* scheme regions directory
*/
+/*
+ * enum damos_sysfs_regions_upd_status - Represent DAMOS tried regions update
+ * status
+ * @DAMOS_TRIED_REGIONS_UPD_IDLE: Waiting for next request.
+ * @DAMOS_TRIED_REGIONS_UPD_STARTED: Update started.
+ * @DAMOS_TRIED_REGIONS_UPD_FINISHED: Update finished.
+ *
+ * Each DAMON-based operation scheme (&struct damos) has its own apply
+ * interval, and we need to expose the scheme tried regions based on only
+ * single snapshot. For this, we keep the tried regions update status for each
+ * scheme. The status becomes 'idle' at the beginning.
+ *
+ * Once the tried regions update request is received, the request handling
+ * start function (damon_sysfs_scheme_update_regions_start()) sets the status
+ * of all schemes as 'idle' again, and register ->before_damos_apply() and
+ * ->after_sampling() callbacks.
+ *
+ * Then, the first followup ->before_damos_apply() callback
+ * (damon_sysfs_before_damos_apply()) sets the status 'started'. The first
+ * ->after_sampling() callback (damon_sysfs_after_sampling()) after the call
+ * is called only after the scheme is completely applied
+ * to the given snapshot. Hence the callback knows the situation by showing
+ * 'started' status, and sets the status as 'finished'. Then,
+ * damon_sysfs_before_damos_apply() understands the situation by showing the
+ * 'finished' status and do nothing.
+ *
+ * Finally, the tried regions request handling finisher function
+ * (damon_sysfs_schemes_update_regions_stop()) unregisters the callbacks.
+ */
+enum damos_sysfs_regions_upd_status {
+ DAMOS_TRIED_REGIONS_UPD_IDLE,
+ DAMOS_TRIED_REGIONS_UPD_STARTED,
+ DAMOS_TRIED_REGIONS_UPD_FINISHED,
+};
+
struct damon_sysfs_scheme_regions {
struct kobject kobj;
struct list_head regions_list;
int nr_regions;
unsigned long total_bytes;
+ enum damos_sysfs_regions_upd_status upd_status;
};
static struct damon_sysfs_scheme_regions *
@@ -130,6 +166,7 @@ damon_sysfs_scheme_regions_alloc(void)
INIT_LIST_HEAD(&regions->regions_list);
regions->nr_regions = 0;
regions->total_bytes = 0;
+ regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE;
return regions;
}
@@ -1121,6 +1158,7 @@ struct damon_sysfs_scheme {
struct kobject kobj;
enum damos_action action;
struct damon_sysfs_access_pattern *access_pattern;
+ unsigned long apply_interval_us;
struct damon_sysfs_quotas *quotas;
struct damon_sysfs_watermarks *watermarks;
struct damon_sysfs_scheme_filters *filters;
@@ -1141,7 +1179,7 @@ static const char * const damon_sysfs_damos_action_strs[] = {
};
static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc(
- enum damos_action action)
+ enum damos_action action, unsigned long apply_interval_us)
{
struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme),
GFP_KERNEL);
@@ -1150,6 +1188,7 @@ static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc(
return NULL;
scheme->kobj = (struct kobject){};
scheme->action = action;
+ scheme->apply_interval_us = apply_interval_us;
return scheme;
}
@@ -1353,6 +1392,25 @@ static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr,
return -EINVAL;
}
+static ssize_t apply_interval_us_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme *scheme = container_of(kobj,
+ struct damon_sysfs_scheme, kobj);
+
+ return sysfs_emit(buf, "%lu\n", scheme->apply_interval_us);
+}
+
+static ssize_t apply_interval_us_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme *scheme = container_of(kobj,
+ struct damon_sysfs_scheme, kobj);
+ int err = kstrtoul(buf, 0, &scheme->apply_interval_us);
+
+ return err ? err : count;
+}
+
static void damon_sysfs_scheme_release(struct kobject *kobj)
{
kfree(container_of(kobj, struct damon_sysfs_scheme, kobj));
@@ -1361,8 +1419,12 @@ static void damon_sysfs_scheme_release(struct kobject *kobj)
static struct kobj_attribute damon_sysfs_scheme_action_attr =
__ATTR_RW_MODE(action, 0600);
+static struct kobj_attribute damon_sysfs_scheme_apply_interval_us_attr =
+ __ATTR_RW_MODE(apply_interval_us, 0600);
+
static struct attribute *damon_sysfs_scheme_attrs[] = {
&damon_sysfs_scheme_action_attr.attr,
+ &damon_sysfs_scheme_apply_interval_us_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_scheme);
@@ -1413,7 +1475,11 @@ static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes,
schemes->schemes_arr = schemes_arr;
for (i = 0; i < nr_schemes; i++) {
- scheme = damon_sysfs_scheme_alloc(DAMOS_STAT);
+ /*
+ * apply_interval_us as 0 means same to aggregation interval
+ * (same to before-apply_interval behavior)
+ */
+ scheme = damon_sysfs_scheme_alloc(DAMOS_STAT, 0);
if (!scheme) {
damon_sysfs_schemes_rm_dirs(schemes);
return -ENOMEM;
@@ -1610,8 +1676,8 @@ static struct damos *damon_sysfs_mk_scheme(
.low = sysfs_wmarks->low,
};
- scheme = damon_new_scheme(&pattern, sysfs_scheme->action, &quota,
- &wmarks);
+ scheme = damon_new_scheme(&pattern, sysfs_scheme->action,
+ sysfs_scheme->apply_interval_us, &quota, &wmarks);
if (!scheme)
return NULL;
@@ -1641,6 +1707,7 @@ static void damon_sysfs_update_scheme(struct damos *scheme,
scheme->pattern.max_age_region = access_pattern->age->max;
scheme->action = sysfs_scheme->action;
+ scheme->apply_interval_us = sysfs_scheme->apply_interval_us;
scheme->quota.ms = sysfs_quotas->ms;
scheme->quota.sz = sysfs_quotas->sz;
@@ -1747,6 +1814,10 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx,
return 0;
sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions;
+ if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_FINISHED)
+ return 0;
+ if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_IDLE)
+ sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_STARTED;
sysfs_regions->total_bytes += r->ar.end - r->ar.start;
if (damos_regions_upd_total_bytes_only)
return 0;
@@ -1763,6 +1834,29 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx,
return 0;
}
+/*
+ * DAMON callback that called after each accesses sampling. While this
+ * callback is registered, damon_sysfs_lock should be held to ensure the
+ * regions directories exist.
+ */
+static int damon_sysfs_after_sampling(struct damon_ctx *ctx)
+{
+ struct damon_sysfs_schemes *sysfs_schemes =
+ damon_sysfs_schemes_for_damos_callback;
+ struct damon_sysfs_scheme_regions *sysfs_regions;
+ int i;
+
+ for (i = 0; i < sysfs_schemes->nr; i++) {
+ sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions;
+ if (sysfs_regions->upd_status ==
+ DAMOS_TRIED_REGIONS_UPD_STARTED)
+ sysfs_regions->upd_status =
+ DAMOS_TRIED_REGIONS_UPD_FINISHED;
+ }
+
+ return 0;
+}
+
/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */
int damon_sysfs_schemes_clear_regions(
struct damon_sysfs_schemes *sysfs_schemes,
@@ -1786,6 +1880,16 @@ int damon_sysfs_schemes_clear_regions(
return 0;
}
+static void damos_tried_regions_init_upd_status(
+ struct damon_sysfs_schemes *sysfs_schemes)
+{
+ int i;
+
+ for (i = 0; i < sysfs_schemes->nr; i++)
+ sysfs_schemes->schemes_arr[i]->tried_regions->upd_status =
+ DAMOS_TRIED_REGIONS_UPD_IDLE;
+}
+
/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */
int damon_sysfs_schemes_update_regions_start(
struct damon_sysfs_schemes *sysfs_schemes,
@@ -1793,11 +1897,29 @@ int damon_sysfs_schemes_update_regions_start(
{
damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx);
damon_sysfs_schemes_for_damos_callback = sysfs_schemes;
+ damos_tried_regions_init_upd_status(sysfs_schemes);
damos_regions_upd_total_bytes_only = total_bytes_only;
ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply;
+ ctx->callback.after_sampling = damon_sysfs_after_sampling;
return 0;
}
+bool damos_sysfs_regions_upd_done(void)
+{
+ struct damon_sysfs_schemes *sysfs_schemes =
+ damon_sysfs_schemes_for_damos_callback;
+ struct damon_sysfs_scheme_regions *sysfs_regions;
+ int i;
+
+ for (i = 0; i < sysfs_schemes->nr; i++) {
+ sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions;
+ if (sysfs_regions->upd_status !=
+ DAMOS_TRIED_REGIONS_UPD_FINISHED)
+ return false;
+ }
+ return true;
+}
+
/*
* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. Caller
* should unlock damon_sysfs_lock which held before
@@ -1807,6 +1929,7 @@ int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx)
{
damon_sysfs_schemes_for_damos_callback = NULL;
ctx->callback.before_damos_apply = NULL;
+ ctx->callback.after_sampling = NULL;
damon_sysfs_schemes_region_idx = 0;
return 0;
}
diff --git a/mm/damon/sysfs-test.h b/mm/damon/sysfs-test.h
new file mode 100644
index 000000000000..73bdce2452c1
--- /dev/null
+++ b/mm/damon/sysfs-test.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Data Access Monitor Unit Tests
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#ifdef CONFIG_DAMON_SYSFS_KUNIT_TEST
+
+#ifndef _DAMON_SYSFS_TEST_H
+#define _DAMON_SYSFS_TEST_H
+
+#include <kunit/test.h>
+
+static unsigned int nr_damon_targets(struct damon_ctx *ctx)
+{
+ struct damon_target *t;
+ unsigned int nr_targets = 0;
+
+ damon_for_each_target(t, ctx)
+ nr_targets++;
+
+ return nr_targets;
+}
+
+static int __damon_sysfs_test_get_any_pid(int min, int max)
+{
+ struct pid *pid;
+ int i;
+
+ for (i = min; i <= max; i++) {
+ pid = find_get_pid(i);
+ if (pid) {
+ put_pid(pid);
+ return i;
+ }
+ }
+ return -1;
+}
+
+static void damon_sysfs_test_set_targets(struct kunit *test)
+{
+ struct damon_sysfs_targets *sysfs_targets;
+ struct damon_sysfs_target *sysfs_target;
+ struct damon_ctx *ctx;
+
+ sysfs_targets = damon_sysfs_targets_alloc();
+ sysfs_targets->nr = 1;
+ sysfs_targets->targets_arr = kmalloc_array(1,
+ sizeof(*sysfs_targets->targets_arr), GFP_KERNEL);
+
+ sysfs_target = damon_sysfs_target_alloc();
+ sysfs_target->pid = __damon_sysfs_test_get_any_pid(12, 100);
+ sysfs_target->regions = damon_sysfs_regions_alloc();
+ sysfs_targets->targets_arr[0] = sysfs_target;
+
+ ctx = damon_new_ctx();
+
+ damon_sysfs_set_targets(ctx, sysfs_targets);
+ KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(ctx));
+
+ sysfs_target->pid = __damon_sysfs_test_get_any_pid(
+ sysfs_target->pid + 1, 200);
+ damon_sysfs_set_targets(ctx, sysfs_targets);
+ KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(ctx));
+
+ damon_destroy_ctx(ctx);
+ kfree(sysfs_targets->targets_arr);
+ kfree(sysfs_targets);
+ kfree(sysfs_target);
+}
+
+static struct kunit_case damon_sysfs_test_cases[] = {
+ KUNIT_CASE(damon_sysfs_test_set_targets),
+ {},
+};
+
+static struct kunit_suite damon_sysfs_test_suite = {
+ .name = "damon-sysfs",
+ .test_cases = damon_sysfs_test_cases,
+};
+kunit_test_suite(damon_sysfs_test_suite);
+
+#endif /* _DAMON_SYSFS_TEST_H */
+
+#endif /* CONFIG_DAMON_SYSFS_KUNIT_TEST */
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index b86ba7b0a921..e27846708b5a 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1150,64 +1150,81 @@ destroy_targets_out:
return err;
}
-/*
- * Search a target in a context that corresponds to the sysfs target input.
- *
- * Return: pointer to the target if found, NULL if not found, or negative
- * error code if the search failed.
- */
-static struct damon_target *damon_sysfs_existing_target(
- struct damon_sysfs_target *sys_target, struct damon_ctx *ctx)
+static int damon_sysfs_update_target_pid(struct damon_target *target, int pid)
{
- struct pid *pid;
- struct damon_target *t;
+ struct pid *pid_new;
- if (!damon_target_has_pid(ctx)) {
- /* Up to only one target for paddr could exist */
- damon_for_each_target(t, ctx)
- return t;
- return NULL;
+ pid_new = find_get_pid(pid);
+ if (!pid_new)
+ return -EINVAL;
+
+ if (pid_new == target->pid) {
+ put_pid(pid_new);
+ return 0;
}
- /* ops.id should be DAMON_OPS_VADDR or DAMON_OPS_FVADDR */
- pid = find_get_pid(sys_target->pid);
- if (!pid)
- return ERR_PTR(-EINVAL);
- damon_for_each_target(t, ctx) {
- if (t->pid == pid) {
- put_pid(pid);
- return t;
- }
+ put_pid(target->pid);
+ target->pid = pid_new;
+ return 0;
+}
+
+static int damon_sysfs_update_target(struct damon_target *target,
+ struct damon_ctx *ctx,
+ struct damon_sysfs_target *sys_target)
+{
+ int err;
+
+ if (damon_target_has_pid(ctx)) {
+ err = damon_sysfs_update_target_pid(target, sys_target->pid);
+ if (err)
+ return err;
}
- put_pid(pid);
- return NULL;
+
+ /*
+ * Do monitoring target region boundary update only if one or more
+ * regions are set by the user. This is for keeping current monitoring
+ * target results and range easier, especially for dynamic monitoring
+ * target regions update ops like 'vaddr'.
+ */
+ if (sys_target->regions->nr)
+ err = damon_sysfs_set_regions(target, sys_target->regions);
+ return err;
}
static int damon_sysfs_set_targets(struct damon_ctx *ctx,
struct damon_sysfs_targets *sysfs_targets)
{
- int i, err;
+ struct damon_target *t, *next;
+ int i = 0, err;
/* Multiple physical address space monitoring targets makes no sense */
if (ctx->ops.id == DAMON_OPS_PADDR && sysfs_targets->nr > 1)
return -EINVAL;
- for (i = 0; i < sysfs_targets->nr; i++) {
+ damon_for_each_target_safe(t, next, ctx) {
+ if (i < sysfs_targets->nr) {
+ damon_sysfs_update_target(t, ctx,
+ sysfs_targets->targets_arr[i]);
+ } else {
+ if (damon_target_has_pid(ctx))
+ put_pid(t->pid);
+ damon_destroy_target(t);
+ }
+ i++;
+ }
+
+ for (; i < sysfs_targets->nr; i++) {
struct damon_sysfs_target *st = sysfs_targets->targets_arr[i];
- struct damon_target *t = damon_sysfs_existing_target(st, ctx);
-
- if (IS_ERR(t))
- return PTR_ERR(t);
- if (!t)
- err = damon_sysfs_add_target(st, ctx);
- else
- err = damon_sysfs_set_regions(t, st->regions);
+
+ err = damon_sysfs_add_target(st, ctx);
if (err)
return err;
}
return 0;
}
+static bool damon_sysfs_schemes_regions_updating;
+
static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
{
struct damon_target *t, *next;
@@ -1219,8 +1236,10 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
cmd = damon_sysfs_cmd_request.cmd;
if (kdamond && ctx == kdamond->damon_ctx &&
(cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS ||
- cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES)) {
+ cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES) &&
+ damon_sysfs_schemes_regions_updating) {
damon_sysfs_schemes_update_regions_stop(ctx);
+ damon_sysfs_schemes_regions_updating = false;
mutex_unlock(&damon_sysfs_lock);
}
@@ -1332,15 +1351,15 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond)
/*
* damon_sysfs_cmd_request_callback() - DAMON callback for handling requests.
- * @c: The DAMON context of the callback.
+ * @c: The DAMON context of the callback.
+ * @active: Whether @c is not deactivated due to watermarks.
*
* This function is periodically called back from the kdamond thread for @c.
* Then, it checks if there is a waiting DAMON sysfs request and handles it.
*/
-static int damon_sysfs_cmd_request_callback(struct damon_ctx *c)
+static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active)
{
struct damon_sysfs_kdamond *kdamond;
- static bool damon_sysfs_schemes_regions_updating;
bool total_bytes_only = false;
int err = 0;
@@ -1370,6 +1389,13 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c)
goto keep_lock_out;
}
} else {
+ /*
+ * Continue regions updating if DAMON is till
+ * active and the update for all schemes is not
+ * finished.
+ */
+ if (active && !damos_sysfs_regions_upd_done())
+ goto keep_lock_out;
err = damon_sysfs_upd_schemes_regions_stop(kdamond);
damon_sysfs_schemes_regions_updating = false;
}
@@ -1389,6 +1415,24 @@ keep_lock_out:
return err;
}
+static int damon_sysfs_after_wmarks_check(struct damon_ctx *c)
+{
+ /*
+ * after_wmarks_check() is called back while the context is deactivated
+ * by watermarks.
+ */
+ return damon_sysfs_cmd_request_callback(c, false);
+}
+
+static int damon_sysfs_after_aggregation(struct damon_ctx *c)
+{
+ /*
+ * after_aggregation() is called back only while the context is not
+ * deactivated by watermarks.
+ */
+ return damon_sysfs_cmd_request_callback(c, true);
+}
+
static struct damon_ctx *damon_sysfs_build_ctx(
struct damon_sysfs_context *sys_ctx)
{
@@ -1404,8 +1448,8 @@ static struct damon_ctx *damon_sysfs_build_ctx(
return ERR_PTR(err);
}
- ctx->callback.after_wmarks_check = damon_sysfs_cmd_request_callback;
- ctx->callback.after_aggregation = damon_sysfs_cmd_request_callback;
+ ctx->callback.after_wmarks_check = damon_sysfs_after_wmarks_check;
+ ctx->callback.after_aggregation = damon_sysfs_after_aggregation;
ctx->callback.before_terminate = damon_sysfs_before_terminate;
return ctx;
}
@@ -1807,3 +1851,5 @@ out:
return err;
}
subsys_initcall(damon_sysfs_init);
+
+#include "sysfs-test.h"
diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h
index c4b455b5ee30..dcf1ca6b31cc 100644
--- a/mm/damon/vaddr-test.h
+++ b/mm/damon/vaddr-test.h
@@ -148,6 +148,8 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
KUNIT_EXPECT_EQ(test, r->ar.start, expected[i * 2]);
KUNIT_EXPECT_EQ(test, r->ar.end, expected[i * 2 + 1]);
}
+
+ damon_destroy_target(t);
}
/*
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 4c81a9dbd044..a4d1f63c5b23 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -341,13 +341,14 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
bool referenced = false;
pte_t entry = huge_ptep_get(pte);
struct folio *folio = pfn_folio(pte_pfn(entry));
+ unsigned long psize = huge_page_size(hstate_vma(vma));
folio_get(folio);
if (pte_young(entry)) {
referenced = true;
entry = pte_mkold(entry);
- set_huge_pte_at(mm, addr, pte, entry);
+ set_huge_pte_at(mm, addr, pte, entry, psize);
}
#ifdef CONFIG_MMU_NOTIFIER
@@ -557,23 +558,27 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
* r the region to be checked
*/
static void __damon_va_check_access(struct mm_struct *mm,
- struct damon_region *r, bool same_target)
+ struct damon_region *r, bool same_target,
+ struct damon_attrs *attrs)
{
static unsigned long last_addr;
static unsigned long last_folio_sz = PAGE_SIZE;
static bool last_accessed;
+ if (!mm) {
+ damon_update_region_access_rate(r, false, attrs);
+ return;
+ }
+
/* If the region is in the last checked page, reuse the result */
if (same_target && (ALIGN_DOWN(last_addr, last_folio_sz) ==
ALIGN_DOWN(r->sampling_addr, last_folio_sz))) {
- if (last_accessed)
- r->nr_accesses++;
+ damon_update_region_access_rate(r, last_accessed, attrs);
return;
}
last_accessed = damon_va_young(mm, r->sampling_addr, &last_folio_sz);
- if (last_accessed)
- r->nr_accesses++;
+ damon_update_region_access_rate(r, last_accessed, attrs);
last_addr = r->sampling_addr;
}
@@ -588,15 +593,15 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
mm = damon_get_mm(t);
- if (!mm)
- continue;
same_target = false;
damon_for_each_region(r, t) {
- __damon_va_check_access(mm, r, same_target);
+ __damon_va_check_access(mm, r, same_target,
+ &ctx->attrs);
max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
same_target = true;
}
- mmput(mm);
+ if (mm)
+ mmput(mm);
}
return max_nr_accesses;
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 48e329ea5ba3..e651500e597a 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -1322,8 +1322,8 @@ static int __init debug_vm_pgtable(void)
* true irrespective of the starting protection value for a
* given page table entry.
*
- * Protection based vm_flags combinatins are always linear
- * and increasing i.e starting from VM_NONE and going upto
+ * Protection based vm_flags combinations are always linear
+ * and increasing i.e starting from VM_NONE and going up to
* (VM_SHARED | READ | WRITE | EXEC).
*/
#define VM_FLAGS_START (VM_NONE)
diff --git a/mm/filemap.c b/mm/filemap.c
index 4ea4387053e8..9710f43a89ac 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -131,11 +131,8 @@ static void page_cache_delete(struct address_space *mapping,
mapping_set_update(&xas, mapping);
- /* hugetlb pages are represented by a single entry in the xarray */
- if (!folio_test_hugetlb(folio)) {
- xas_set_order(&xas, folio->index, folio_order(folio));
- nr = folio_nr_pages(folio);
- }
+ xas_set_order(&xas, folio->index, folio_order(folio));
+ nr = folio_nr_pages(folio);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
@@ -234,7 +231,7 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio)
if (free_folio)
free_folio(folio);
- if (folio_test_large(folio) && !folio_test_hugetlb(folio))
+ if (folio_test_large(folio))
refs = folio_nr_pages(folio);
folio_put_refs(folio, refs);
}
@@ -819,7 +816,7 @@ void replace_page_cache_folio(struct folio *old, struct folio *new)
new->mapping = mapping;
new->index = offset;
- mem_cgroup_migrate(old, new);
+ mem_cgroup_replace_folio(old, new);
xas_lock_irq(&xas);
xas_store(&xas, new);
@@ -855,14 +852,15 @@ noinline int __filemap_add_folio(struct address_space *mapping,
if (!huge) {
int error = mem_cgroup_charge(folio, NULL, gfp);
- VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
if (error)
return error;
charged = true;
- xas_set_order(&xas, index, folio_order(folio));
- nr = folio_nr_pages(folio);
}
+ VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
+ xas_set_order(&xas, index, folio_order(folio));
+ nr = folio_nr_pages(folio);
+
gfp &= GFP_RECLAIM_MASK;
folio_ref_add(folio, nr);
folio->mapping = mapping;
@@ -1135,32 +1133,13 @@ static void folio_wake_bit(struct folio *folio, int bit_nr)
wait_queue_head_t *q = folio_waitqueue(folio);
struct wait_page_key key;
unsigned long flags;
- wait_queue_entry_t bookmark;
key.folio = folio;
key.bit_nr = bit_nr;
key.page_match = 0;
- bookmark.flags = 0;
- bookmark.private = NULL;
- bookmark.func = NULL;
- INIT_LIST_HEAD(&bookmark.entry);
-
spin_lock_irqsave(&q->lock, flags);
- __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
-
- while (bookmark.flags & WQ_FLAG_BOOKMARK) {
- /*
- * Take a breather from holding the lock,
- * allow pages that finish wake up asynchronously
- * to acquire the lock and remove themselves
- * from wait queue
- */
- spin_unlock_irqrestore(&q->lock, flags);
- cpu_relax();
- spin_lock_irqsave(&q->lock, flags);
- __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
- }
+ __wake_up_locked_key(q, TASK_NORMAL, &key);
/*
* It's possible to miss clearing waiters here, when we woke our page
@@ -1177,13 +1156,6 @@ static void folio_wake_bit(struct folio *folio, int bit_nr)
spin_unlock_irqrestore(&q->lock, flags);
}
-static void folio_wake(struct folio *folio, int bit)
-{
- if (!folio_test_waiters(folio))
- return;
- folio_wake_bit(folio, bit);
-}
-
/*
* A choice of three behaviors for folio_wait_bit_common():
*/
@@ -1484,29 +1456,6 @@ void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
}
EXPORT_SYMBOL_GPL(folio_add_wait_queue);
-#ifndef clear_bit_unlock_is_negative_byte
-
-/*
- * PG_waiters is the high bit in the same byte as PG_lock.
- *
- * On x86 (and on many other architectures), we can clear PG_lock and
- * test the sign bit at the same time. But if the architecture does
- * not support that special operation, we just do this all by hand
- * instead.
- *
- * The read of PG_waiters has to be after (or concurrently with) PG_locked
- * being cleared, but a memory barrier should be unnecessary since it is
- * in the same byte as PG_locked.
- */
-static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
-{
- clear_bit_unlock(nr, mem);
- /* smp_mb__after_atomic(); */
- return test_bit(PG_waiters, mem);
-}
-
-#endif
-
/**
* folio_unlock - Unlock a locked folio.
* @folio: The folio.
@@ -1522,12 +1471,42 @@ void folio_unlock(struct folio *folio)
BUILD_BUG_ON(PG_waiters != 7);
BUILD_BUG_ON(PG_locked > 7);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0)))
+ if (folio_xor_flags_has_waiters(folio, 1 << PG_locked))
folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_unlock);
/**
+ * folio_end_read - End read on a folio.
+ * @folio: The folio.
+ * @success: True if all reads completed successfully.
+ *
+ * When all reads against a folio have completed, filesystems should
+ * call this function to let the pagecache know that no more reads
+ * are outstanding. This will unlock the folio and wake up any thread
+ * sleeping on the lock. The folio will also be marked uptodate if all
+ * reads succeeded.
+ *
+ * Context: May be called from interrupt or process context. May not be
+ * called from NMI context.
+ */
+void folio_end_read(struct folio *folio, bool success)
+{
+ unsigned long mask = 1 << PG_locked;
+
+ /* Must be in bottom byte for x86 to work */
+ BUILD_BUG_ON(PG_uptodate > 7);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);
+
+ if (likely(success))
+ mask |= 1 << PG_uptodate;
+ if (folio_xor_flags_has_waiters(folio, mask))
+ folio_wake_bit(folio, PG_locked);
+}
+EXPORT_SYMBOL(folio_end_read);
+
+/**
* folio_end_private_2 - Clear PG_private_2 and wake any waiters.
* @folio: The folio.
*
@@ -1588,9 +1567,15 @@ EXPORT_SYMBOL(folio_wait_private_2_killable);
/**
* folio_end_writeback - End writeback against a folio.
* @folio: The folio.
+ *
+ * The folio must actually be under writeback.
+ *
+ * Context: May be called from process or interrupt context.
*/
void folio_end_writeback(struct folio *folio)
{
+ VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);
+
/*
* folio_test_clear_reclaim() could be used here but it is an
* atomic operation and overkill in this particular case. Failing
@@ -1607,14 +1592,11 @@ void folio_end_writeback(struct folio *folio)
* Writeback does not hold a folio reference of its own, relying
* on truncation to wait for the clearing of PG_writeback.
* But here we must make sure that the folio is not freed and
- * reused before the folio_wake().
+ * reused before the folio_wake_bit().
*/
folio_get(folio);
- if (!__folio_end_writeback(folio))
- BUG();
-
- smp_mb__after_atomic();
- folio_wake(folio, PG_writeback);
+ if (__folio_end_writeback(folio))
+ folio_wake_bit(folio, PG_writeback);
acct_reclaim_writeback(folio);
folio_put(folio);
}
@@ -2040,7 +2022,7 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
int idx = folio_batch_count(fbatch) - 1;
folio = fbatch->folios[idx];
- if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
+ if (!xa_is_value(folio))
nr = folio_nr_pages(folio);
*start = indices[idx] + nr;
}
@@ -2104,7 +2086,7 @@ put:
int idx = folio_batch_count(fbatch) - 1;
folio = fbatch->folios[idx];
- if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
+ if (!xa_is_value(folio))
nr = folio_nr_pages(folio);
*start = indices[idx] + nr;
}
@@ -2122,51 +2104,13 @@ put:
* index @start and up to index @end (inclusive). The folios are returned
* in @fbatch with an elevated reference count.
*
- * The first folio may start before @start; if it does, it will contain
- * @start. The final folio may extend beyond @end; if it does, it will
- * contain @end. The folios have ascending indices. There may be gaps
- * between the folios if there are indices which have no folio in the
- * page cache. If folios are added to or removed from the page cache
- * while this is running, they may or may not be found by this call.
- *
* Return: The number of folios which were found.
* We also update @start to index the next folio for the traversal.
*/
unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch)
{
- XA_STATE(xas, &mapping->i_pages, *start);
- struct folio *folio;
-
- rcu_read_lock();
- while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
- /* Skip over shadow, swap and DAX entries */
- if (xa_is_value(folio))
- continue;
- if (!folio_batch_add(fbatch, folio)) {
- unsigned long nr = folio_nr_pages(folio);
-
- if (folio_test_hugetlb(folio))
- nr = 1;
- *start = folio->index + nr;
- goto out;
- }
- }
-
- /*
- * We come here when there is no page beyond @end. We take care to not
- * overflow the index @start as it confuses some of the callers. This
- * breaks the iteration when there is a page at index -1 but that is
- * already broken anyway.
- */
- if (end == (pgoff_t)-1)
- *start = (pgoff_t)-1;
- else
- *start = end + 1;
-out:
- rcu_read_unlock();
-
- return folio_batch_count(fbatch);
+ return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch);
}
EXPORT_SYMBOL(filemap_get_folios);
@@ -2213,9 +2157,6 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
if (!folio_batch_add(fbatch, folio)) {
nr = folio_nr_pages(folio);
-
- if (folio_test_hugetlb(folio))
- nr = 1;
*start = folio->index + nr;
goto out;
}
@@ -2232,10 +2173,7 @@ update_start:
if (nr) {
folio = fbatch->folios[nr - 1];
- if (folio_test_hugetlb(folio))
- *start = folio->index + 1;
- else
- *start = folio_next_index(folio);
+ *start = folio->index + folio_nr_pages(folio);
}
out:
rcu_read_unlock();
@@ -2251,7 +2189,13 @@ EXPORT_SYMBOL(filemap_get_folios_contig);
* @tag: The tag index
* @fbatch: The batch to fill
*
- * Same as filemap_get_folios(), but only returning folios tagged with @tag.
+ * The first folio may start before @start; if it does, it will contain
+ * @start. The final folio may extend beyond @end; if it does, it will
+ * contain @end. The folios have ascending indices. There may be gaps
+ * between the folios if there are indices which have no folio in the
+ * page cache. If folios are added to or removed from the page cache
+ * while this is running, they may or may not be found by this call.
+ * Only returns folios that are tagged with @tag.
*
* Return: The number of folios found.
* Also update @start to index the next folio for traversal.
@@ -2273,9 +2217,6 @@ unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
continue;
if (!folio_batch_add(fbatch, folio)) {
unsigned long nr = folio_nr_pages(folio);
-
- if (folio_test_hugetlb(folio))
- nr = 1;
*start = folio->index + nr;
goto out;
}
@@ -3104,7 +3045,7 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
/*
* NOTE! This will make us return with VM_FAULT_RETRY, but with
- * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
+ * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
* is supposed to work. We have way too many special cases..
*/
if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
@@ -3114,13 +3055,14 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
if (vmf->flags & FAULT_FLAG_KILLABLE) {
if (__folio_lock_killable(folio)) {
/*
- * We didn't have the right flags to drop the mmap_lock,
- * but all fault_handlers only check for fatal signals
- * if we return VM_FAULT_RETRY, so we need to drop the
- * mmap_lock here and return 0 if we don't have a fpin.
+ * We didn't have the right flags to drop the
+ * fault lock, but all fault_handlers only check
+ * for fatal signals if we return VM_FAULT_RETRY,
+ * so we need to drop the fault lock here and
+ * return 0 if we don't have a fpin.
*/
if (*fpin == NULL)
- mmap_read_unlock(vmf->vma->vm_mm);
+ release_fault_lock(vmf);
return 0;
}
} else
@@ -3321,21 +3263,28 @@ retry_find:
VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
/*
- * We have a locked page in the page cache, now we need to check
- * that it's up-to-date. If not, it is going to be due to an error.
+ * We have a locked folio in the page cache, now we need to check
+ * that it's up-to-date. If not, it is going to be due to an error,
+ * or because readahead was otherwise unable to retrieve it.
*/
if (unlikely(!folio_test_uptodate(folio))) {
/*
- * The page was in cache and uptodate and now it is not.
- * Strange but possible since we didn't hold the page lock all
- * the time. Let's drop everything get the invalidate lock and
- * try again.
+ * If the invalidate lock is not held, the folio was in cache
+ * and uptodate and now it is not. Strange but possible since we
+ * didn't hold the page lock all the time. Let's drop
+ * everything, get the invalidate lock and try again.
*/
if (!mapping_locked) {
folio_unlock(folio);
folio_put(folio);
goto retry_find;
}
+
+ /*
+ * OK, the folio is really not uptodate. This can be because the
+ * VMA has the VM_RAND_READ flag set, or because an error
+ * arose. Let's read it in directly.
+ */
goto page_not_uptodate;
}
@@ -3503,7 +3452,7 @@ skip:
if (count) {
set_pte_range(vmf, folio, page, count, addr);
folio_ref_add(folio, count);
- if (in_range(vmf->address, addr, count))
+ if (in_range(vmf->address, addr, count * PAGE_SIZE))
ret = VM_FAULT_NOPAGE;
}
@@ -3517,7 +3466,7 @@ skip:
if (count) {
set_pte_range(vmf, folio, page, count, addr);
folio_ref_add(folio, count);
- if (in_range(vmf->address, addr, count))
+ if (in_range(vmf->address, addr, count * PAGE_SIZE))
ret = VM_FAULT_NOPAGE;
}
@@ -3591,7 +3540,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
vmf->pte += xas.xa_index - last_pgoff;
last_pgoff = xas.xa_index;
- end = folio->index + folio_nr_pages(folio) - 1;
+ end = folio_next_index(folio) - 1;
nr_pages = min(end, end_pgoff) - xas.xa_index + 1;
if (!folio_test_large(folio))
@@ -3669,7 +3618,7 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
*/
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+ if (vma_is_shared_maywrite(vma))
return -EINVAL;
return generic_file_mmap(file, vma);
}
diff --git a/mm/gup.c b/mm/gup.c
index 2f8a2d89fde1..231711efa390 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1471,6 +1471,9 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
long ret, pages_done;
bool must_unlock = false;
+ if (!nr_pages)
+ return 0;
+
/*
* The internal caller expects GUP to manage the lock internally and the
* lock must be released when this returns.
@@ -1595,6 +1598,14 @@ retry:
mmap_read_unlock(mm);
*locked = 0;
}
+
+ /*
+ * Failing to pin anything implies something has gone wrong (except when
+ * FOLL_NOWAIT is specified).
+ */
+ if (WARN_ON_ONCE(pages_done == 0 && !(flags & FOLL_NOWAIT)))
+ return -EFAULT;
+
return pages_done;
}
@@ -2227,12 +2238,11 @@ static bool is_valid_gup_args(struct page **pages, int *locked,
/*
* These flags not allowed to be specified externally to the gup
* interfaces:
- * - FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only
+ * - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only
* - FOLL_REMOTE is internal only and used on follow_page()
* - FOLL_UNLOCKABLE is internal only and used if locked is !NULL
*/
- if (WARN_ON_ONCE(gup_flags & (FOLL_PIN | FOLL_TRIED | FOLL_UNLOCKABLE |
- FOLL_REMOTE | FOLL_FAST_ONLY)))
+ if (WARN_ON_ONCE(gup_flags & INTERNAL_GUP_FLAGS))
return false;
gup_flags |= to_set;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 064fbd90822b..f31f02472396 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -65,7 +65,11 @@ unsigned long transparent_hugepage_flags __read_mostly =
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
-static struct shrinker deferred_split_shrinker;
+static struct shrinker *deferred_split_shrinker;
+static unsigned long deferred_split_count(struct shrinker *shrink,
+ struct shrink_control *sc);
+static unsigned long deferred_split_scan(struct shrinker *shrink,
+ struct shrink_control *sc);
static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
@@ -96,11 +100,11 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
return in_pf;
/*
- * Special VMA and hugetlb VMA.
+ * khugepaged special VMA and hugetlb VMA.
* Must be checked after dax since some dax mappings may have
* VM_MIXEDMAP set.
*/
- if (vm_flags & VM_NO_KHUGEPAGED)
+ if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
return false;
/*
@@ -128,12 +132,18 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
!hugepage_flags_always())))
return false;
- /* Only regular file is valid */
- if (!in_pf && file_thp_enabled(vma))
- return true;
-
- if (!vma_is_anonymous(vma))
+ if (!vma_is_anonymous(vma)) {
+ /*
+ * Trust that ->huge_fault() handlers know what they are doing
+ * in fault path.
+ */
+ if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
+ return true;
+ /* Only regular file is valid in collapse path */
+ if (((!in_pf || smaps)) && file_thp_enabled(vma))
+ return true;
return false;
+ }
if (vma_is_temporary_stack(vma))
return false;
@@ -229,11 +239,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
return 0;
}
-static struct shrinker huge_zero_page_shrinker = {
- .count_objects = shrink_huge_zero_page_count,
- .scan_objects = shrink_huge_zero_page_scan,
- .seeks = DEFAULT_SEEKS,
-};
+static struct shrinker *huge_zero_page_shrinker;
#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
@@ -454,6 +460,38 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
}
#endif /* CONFIG_SYSFS */
+static int __init thp_shrinker_init(void)
+{
+ huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
+ if (!huge_zero_page_shrinker)
+ return -ENOMEM;
+
+ deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
+ SHRINKER_MEMCG_AWARE |
+ SHRINKER_NONSLAB,
+ "thp-deferred_split");
+ if (!deferred_split_shrinker) {
+ shrinker_free(huge_zero_page_shrinker);
+ return -ENOMEM;
+ }
+
+ huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
+ huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
+ shrinker_register(huge_zero_page_shrinker);
+
+ deferred_split_shrinker->count_objects = deferred_split_count;
+ deferred_split_shrinker->scan_objects = deferred_split_scan;
+ shrinker_register(deferred_split_shrinker);
+
+ return 0;
+}
+
+static void __init thp_shrinker_exit(void)
+{
+ shrinker_free(huge_zero_page_shrinker);
+ shrinker_free(deferred_split_shrinker);
+}
+
static int __init hugepage_init(void)
{
int err;
@@ -482,12 +520,9 @@ static int __init hugepage_init(void)
if (err)
goto err_slab;
- err = register_shrinker(&huge_zero_page_shrinker, "thp-zero");
+ err = thp_shrinker_init();
if (err)
- goto err_hzp_shrinker;
- err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split");
- if (err)
- goto err_split_shrinker;
+ goto err_shrinker;
/*
* By default disable transparent hugepages on smaller systems,
@@ -505,10 +540,8 @@ static int __init hugepage_init(void)
return 0;
err_khugepaged:
- unregister_shrinker(&deferred_split_shrinker);
-err_split_shrinker:
- unregister_shrinker(&huge_zero_page_shrinker);
-err_hzp_shrinker:
+ thp_shrinker_exit();
+err_shrinker:
khugepaged_destroy();
err_slab:
hugepage_exit_sysfs(hugepage_kobj);
@@ -1349,7 +1382,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
if (folio_ref_count(folio) == 1) {
pmd_t entry;
- page_move_anon_rmap(page, vma);
+ folio_move_anon_rmap(folio, vma);
+ SetPageAnonExclusive(page);
folio_unlock(folio);
reuse:
if (unlikely(unshare)) {
@@ -1490,9 +1524,9 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
pmd_t oldpmd = vmf->orig_pmd;
pmd_t pmd;
- struct page *page;
+ struct folio *folio;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- int page_nid = NUMA_NO_NODE;
+ int nid = NUMA_NO_NODE;
int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
bool migrated = false, writable = false;
int flags = 0;
@@ -1514,36 +1548,34 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
can_change_pmd_writable(vma, vmf->address, pmd))
writable = true;
- page = vm_normal_page_pmd(vma, haddr, pmd);
- if (!page)
+ folio = vm_normal_folio_pmd(vma, haddr, pmd);
+ if (!folio)
goto out_map;
/* See similar comment in do_numa_page for explanation */
if (!writable)
flags |= TNF_NO_GROUP;
- page_nid = page_to_nid(page);
+ nid = folio_nid(folio);
/*
* For memory tiering mode, cpupid of slow memory page is used
* to record page access time. So use default value.
*/
- if (node_is_toptier(page_nid))
- last_cpupid = page_cpupid_last(page);
- target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
- &flags);
-
+ if (node_is_toptier(nid))
+ last_cpupid = folio_last_cpupid(folio);
+ target_nid = numa_migrate_prep(folio, vma, haddr, nid, &flags);
if (target_nid == NUMA_NO_NODE) {
- put_page(page);
+ folio_put(folio);
goto out_map;
}
spin_unlock(vmf->ptl);
writable = false;
- migrated = migrate_misplaced_page(page, vma, target_nid);
+ migrated = migrate_misplaced_folio(folio, vma, target_nid);
if (migrated) {
flags |= TNF_MIGRATED;
- page_nid = target_nid;
+ nid = target_nid;
} else {
flags |= TNF_MIGRATE_FAIL;
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
@@ -1555,9 +1587,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
}
out:
- if (page_nid != NUMA_NO_NODE)
- task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
- flags);
+ if (nid != NUMA_NO_NODE)
+ task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
return 0;
@@ -1825,7 +1856,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (is_swap_pmd(*pmd)) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
- struct page *page = pfn_swap_entry_to_page(entry);
+ struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
pmd_t newpmd;
VM_BUG_ON(!is_pmd_migration_entry(*pmd));
@@ -1834,7 +1865,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* A protection check is difficult so
* just be safe and disable write
*/
- if (PageAnon(page))
+ if (folio_test_anon(folio))
entry = make_readable_exclusive_migration_entry(swp_offset(entry));
else
entry = make_readable_migration_entry(swp_offset(entry));
@@ -1856,7 +1887,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
#endif
if (prot_numa) {
- struct page *page;
+ struct folio *folio;
bool toptier;
/*
* Avoid trapping faults against the zero page. The read-only
@@ -1869,8 +1900,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (pmd_protnone(*pmd))
goto unlock;
- page = pmd_page(*pmd);
- toptier = node_is_toptier(page_to_nid(page));
+ folio = page_folio(pmd_page(*pmd));
+ toptier = node_is_toptier(folio_nid(folio));
/*
* Skip scanning top tier node if normal numa
* balancing is disabled
@@ -1881,7 +1912,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
!toptier)
- xchg_page_access_time(page, jiffies_to_msecs(jiffies));
+ folio_xchg_access_time(folio,
+ jiffies_to_msecs(jiffies));
}
/*
* In case prot_numa, we are under mmap_read_lock(mm). It's critical
@@ -2483,7 +2515,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail,
if (page_is_idle(head))
set_page_idle(page_tail);
- page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+ folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
/*
* always add to the tail because some iterators expect new
@@ -2828,7 +2860,7 @@ void deferred_split_folio(struct folio *folio)
#ifdef CONFIG_MEMCG
if (memcg)
set_shrinker_bit(memcg, folio_nid(folio),
- deferred_split_shrinker.id);
+ deferred_split_shrinker->id);
#endif
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
@@ -2902,14 +2934,6 @@ next:
return split;
}
-static struct shrinker deferred_split_shrinker = {
- .count_objects = deferred_split_count,
- .scan_objects = deferred_split_scan,
- .seeks = DEFAULT_SEEKS,
- .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
- SHRINKER_NONSLAB,
-};
-
#ifdef CONFIG_DEBUG_FS
static void split_huge_pages_all(void)
{
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ba6d39b71cb1..1169ef2f2176 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -97,6 +97,7 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
+static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
@@ -267,6 +268,10 @@ void hugetlb_vma_lock_read(struct vm_area_struct *vma)
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
down_read(&vma_lock->rw_sema);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ down_read(&resv_map->rw_sema);
}
}
@@ -276,6 +281,10 @@ void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
up_read(&vma_lock->rw_sema);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ up_read(&resv_map->rw_sema);
}
}
@@ -285,6 +294,10 @@ void hugetlb_vma_lock_write(struct vm_area_struct *vma)
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
down_write(&vma_lock->rw_sema);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ down_write(&resv_map->rw_sema);
}
}
@@ -294,17 +307,27 @@ void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
up_write(&vma_lock->rw_sema);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ up_write(&resv_map->rw_sema);
}
}
int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
{
- struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
- if (!__vma_shareable_lock(vma))
- return 1;
+ if (__vma_shareable_lock(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ return down_write_trylock(&vma_lock->rw_sema);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ return down_write_trylock(&resv_map->rw_sema);
+ }
- return down_write_trylock(&vma_lock->rw_sema);
+ return 1;
}
void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
@@ -313,6 +336,10 @@ void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
lockdep_assert_held(&vma_lock->rw_sema);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ lockdep_assert_held(&resv_map->rw_sema);
}
}
@@ -345,6 +372,11 @@ static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
__hugetlb_vma_unlock_write_put(vma_lock);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ /* no free for anon vmas, but still need to unlock */
+ up_write(&resv_map->rw_sema);
}
}
@@ -952,7 +984,7 @@ static long region_count(struct resv_map *resv, long f, long t)
/*
* Convert the address within this vma to the page offset within
- * the mapping, in pagecache page units; huge pages here.
+ * the mapping, huge page units here.
*/
static pgoff_t vma_hugecache_offset(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
@@ -961,13 +993,6 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
(vma->vm_pgoff >> huge_page_order(h));
}
-pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
- unsigned long address)
-{
- return vma_hugecache_offset(hstate_vma(vma), vma, address);
-}
-EXPORT_SYMBOL_GPL(linear_hugepage_index);
-
/**
* vma_kernel_pagesize - Page size granularity for this VMA.
* @vma: The user mapping.
@@ -1068,6 +1093,7 @@ struct resv_map *resv_map_alloc(void)
kref_init(&resv_map->refs);
spin_lock_init(&resv_map->lock);
INIT_LIST_HEAD(&resv_map->regions);
+ init_rwsem(&resv_map->rw_sema);
resv_map->adds_in_progress = 0;
/*
@@ -1138,8 +1164,7 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
- set_vma_private_data(vma, (get_vma_private_data(vma) &
- HPAGE_RESV_MASK) | (unsigned long)map);
+ set_vma_private_data(vma, (unsigned long)map);
}
static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
@@ -1446,7 +1471,7 @@ static int hstate_next_node_to_alloc(struct hstate *h,
}
/*
- * helper for remove_pool_huge_page() - return the previously saved
+ * helper for remove_pool_hugetlb_folio() - return the previously saved
* node ["this node"] from which to free a huge page. Advance the
* next node id whether or not we find a free huge page to free so
* that the next attempt to free addresses the next node.
@@ -1720,7 +1745,12 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
if (folio_test_hugetlb_raw_hwp_unreliable(folio))
return;
- if (hugetlb_vmemmap_restore(h, &folio->page)) {
+ /*
+ * If folio is not vmemmap optimized (!clear_dtor), then the folio
+ * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore_folio
+ * can only be passed hugetlb pages and will BUG otherwise.
+ */
+ if (clear_dtor && hugetlb_vmemmap_restore_folio(h, folio)) {
spin_lock_irq(&hugetlb_lock);
/*
* If we cannot allocate vmemmap pages, just refuse to free the
@@ -1782,22 +1812,22 @@ static void free_hpage_workfn(struct work_struct *work)
node = llist_del_all(&hpage_freelist);
while (node) {
- struct page *page;
+ struct folio *folio;
struct hstate *h;
- page = container_of((struct address_space **)node,
- struct page, mapping);
+ folio = container_of((struct address_space **)node,
+ struct folio, mapping);
node = node->next;
- page->mapping = NULL;
+ folio->mapping = NULL;
/*
* The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in
* folio_hstate() is going to trigger because a previous call to
* remove_hugetlb_folio() will clear the hugetlb bit, so do
* not use folio_hstate() directly.
*/
- h = size_to_hstate(page_size(page));
+ h = size_to_hstate(folio_size(folio));
- __update_and_free_hugetlb_folio(h, page_folio(page));
+ __update_and_free_hugetlb_folio(h, folio);
cond_resched();
}
@@ -1829,13 +1859,93 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
schedule_work(&free_hpage_work);
}
-static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
+static void bulk_vmemmap_restore_error(struct hstate *h,
+ struct list_head *folio_list,
+ struct list_head *non_hvo_folios)
{
- struct page *page, *t_page;
- struct folio *folio;
+ struct folio *folio, *t_folio;
+
+ if (!list_empty(non_hvo_folios)) {
+ /*
+ * Free any restored hugetlb pages so that restore of the
+ * entire list can be retried.
+ * The idea is that in the common case of ENOMEM errors freeing
+ * hugetlb pages with vmemmap we will free up memory so that we
+ * can allocate vmemmap for more hugetlb pages.
+ */
+ list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) {
+ list_del(&folio->lru);
+ spin_lock_irq(&hugetlb_lock);
+ __clear_hugetlb_destructor(h, folio);
+ spin_unlock_irq(&hugetlb_lock);
+ update_and_free_hugetlb_folio(h, folio, false);
+ cond_resched();
+ }
+ } else {
+ /*
+ * In the case where there are no folios which can be
+ * immediately freed, we loop through the list trying to restore
+ * vmemmap individually in the hope that someone elsewhere may
+ * have done something to cause success (such as freeing some
+ * memory). If unable to restore a hugetlb page, the hugetlb
+ * page is made a surplus page and removed from the list.
+ * If are able to restore vmemmap and free one hugetlb page, we
+ * quit processing the list to retry the bulk operation.
+ */
+ list_for_each_entry_safe(folio, t_folio, folio_list, lru)
+ if (hugetlb_vmemmap_restore_folio(h, folio)) {
+ list_del(&folio->lru);
+ spin_lock_irq(&hugetlb_lock);
+ add_hugetlb_folio(h, folio, true);
+ spin_unlock_irq(&hugetlb_lock);
+ } else {
+ list_del(&folio->lru);
+ spin_lock_irq(&hugetlb_lock);
+ __clear_hugetlb_destructor(h, folio);
+ spin_unlock_irq(&hugetlb_lock);
+ update_and_free_hugetlb_folio(h, folio, false);
+ cond_resched();
+ break;
+ }
+ }
+}
- list_for_each_entry_safe(page, t_page, list, lru) {
- folio = page_folio(page);
+static void update_and_free_pages_bulk(struct hstate *h,
+ struct list_head *folio_list)
+{
+ long ret;
+ struct folio *folio, *t_folio;
+ LIST_HEAD(non_hvo_folios);
+
+ /*
+ * First allocate required vmemmmap (if necessary) for all folios.
+ * Carefully handle errors and free up any available hugetlb pages
+ * in an effort to make forward progress.
+ */
+retry:
+ ret = hugetlb_vmemmap_restore_folios(h, folio_list, &non_hvo_folios);
+ if (ret < 0) {
+ bulk_vmemmap_restore_error(h, folio_list, &non_hvo_folios);
+ goto retry;
+ }
+
+ /*
+ * At this point, list should be empty, ret should be >= 0 and there
+ * should only be pages on the non_hvo_folios list.
+ * Do note that the non_hvo_folios list could be empty.
+ * Without HVO enabled, ret will be 0 and there is no need to call
+ * __clear_hugetlb_destructor as this was done previously.
+ */
+ VM_WARN_ON(!list_empty(folio_list));
+ VM_WARN_ON(ret < 0);
+ if (!list_empty(&non_hvo_folios) && ret) {
+ spin_lock_irq(&hugetlb_lock);
+ list_for_each_entry(folio, &non_hvo_folios, lru)
+ __clear_hugetlb_destructor(h, folio);
+ spin_unlock_irq(&hugetlb_lock);
+ }
+
+ list_for_each_entry_safe(folio, t_folio, &non_hvo_folios, lru) {
update_and_free_hugetlb_folio(h, folio, false);
cond_resched();
}
@@ -1899,6 +2009,7 @@ void free_huge_folio(struct folio *folio)
pages_per_huge_page(h), folio);
hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
pages_per_huge_page(h), folio);
+ mem_cgroup_uncharge(folio);
if (restore_reserve)
h->resv_huge_pages++;
@@ -1928,16 +2039,21 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
h->nr_huge_pages_node[nid]++;
}
-static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
+static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio)
{
- hugetlb_vmemmap_optimize(h, &folio->page);
- INIT_LIST_HEAD(&folio->lru);
folio_set_hugetlb(folio);
+ INIT_LIST_HEAD(&folio->lru);
hugetlb_set_folio_subpool(folio, NULL);
set_hugetlb_cgroup(folio, NULL);
set_hugetlb_cgroup_rsvd(folio, NULL);
}
+static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
+{
+ init_new_hugetlb_folio(h, folio);
+ hugetlb_vmemmap_optimize_folio(h, folio);
+}
+
static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid)
{
__prep_new_hugetlb_folio(h, folio);
@@ -2071,20 +2187,6 @@ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
return NULL;
}
-pgoff_t hugetlb_basepage_index(struct page *page)
-{
- struct page *page_head = compound_head(page);
- pgoff_t index = page_index(page_head);
- unsigned long compound_idx;
-
- if (compound_order(page_head) > MAX_ORDER)
- compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
- else
- compound_idx = page - page_head;
-
- return (index << compound_order(page_head)) + compound_idx;
-}
-
static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask,
nodemask_t *node_alloc_noretry)
@@ -2148,16 +2250,9 @@ retry:
return page_folio(page);
}
-/*
- * Common helper to allocate a fresh hugetlb page. All specific allocators
- * should use this function to get new hugetlb pages
- *
- * Note that returned page is 'frozen': ref count of head page and all tail
- * pages is zero.
- */
-static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nmask,
- nodemask_t *node_alloc_noretry)
+static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nmask,
+ nodemask_t *node_alloc_noretry)
{
struct folio *folio;
bool retry = false;
@@ -2170,6 +2265,7 @@ retry:
nid, nmask, node_alloc_noretry);
if (!folio)
return NULL;
+
if (hstate_is_gigantic(h)) {
if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) {
/*
@@ -2184,32 +2280,84 @@ retry:
return NULL;
}
}
- prep_new_hugetlb_folio(h, folio, folio_nid(folio));
return folio;
}
+static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nmask,
+ nodemask_t *node_alloc_noretry)
+{
+ struct folio *folio;
+
+ folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask,
+ node_alloc_noretry);
+ if (folio)
+ init_new_hugetlb_folio(h, folio);
+ return folio;
+}
+
/*
- * Allocates a fresh page to the hugetlb allocator pool in the node interleaved
- * manner.
+ * Common helper to allocate a fresh hugetlb page. All specific allocators
+ * should use this function to get new hugetlb pages
+ *
+ * Note that returned page is 'frozen': ref count of head page and all tail
+ * pages is zero.
*/
-static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
- nodemask_t *node_alloc_noretry)
+static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nmask,
+ nodemask_t *node_alloc_noretry)
{
struct folio *folio;
- int nr_nodes, node;
+
+ folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask,
+ node_alloc_noretry);
+ if (!folio)
+ return NULL;
+
+ prep_new_hugetlb_folio(h, folio, folio_nid(folio));
+ return folio;
+}
+
+static void prep_and_add_allocated_folios(struct hstate *h,
+ struct list_head *folio_list)
+{
+ unsigned long flags;
+ struct folio *folio, *tmp_f;
+
+ /* Send list for bulk vmemmap optimization processing */
+ hugetlb_vmemmap_optimize_folios(h, folio_list);
+
+ /* Add all new pool pages to free lists in one lock cycle */
+ spin_lock_irqsave(&hugetlb_lock, flags);
+ list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
+ __prep_account_new_huge_page(h, folio_nid(folio));
+ enqueue_hugetlb_folio(h, folio);
+ }
+ spin_unlock_irqrestore(&hugetlb_lock, flags);
+}
+
+/*
+ * Allocates a fresh hugetlb page in a node interleaved manner. The page
+ * will later be added to the appropriate hugetlb pool.
+ */
+static struct folio *alloc_pool_huge_folio(struct hstate *h,
+ nodemask_t *nodes_allowed,
+ nodemask_t *node_alloc_noretry)
+{
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+ int nr_nodes, node;
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node,
+ struct folio *folio;
+
+ folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node,
nodes_allowed, node_alloc_noretry);
- if (folio) {
- free_huge_folio(folio); /* free it into the hugepage allocator */
- return 1;
- }
+ if (folio)
+ return folio;
}
- return 0;
+ return NULL;
}
/*
@@ -2219,13 +2367,11 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
* an additional call to free the page to low level allocators.
* Called with hugetlb_lock locked.
*/
-static struct page *remove_pool_huge_page(struct hstate *h,
- nodemask_t *nodes_allowed,
- bool acct_surplus)
+static struct folio *remove_pool_hugetlb_folio(struct hstate *h,
+ nodemask_t *nodes_allowed, bool acct_surplus)
{
int nr_nodes, node;
- struct page *page = NULL;
- struct folio *folio;
+ struct folio *folio = NULL;
lockdep_assert_held(&hugetlb_lock);
for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
@@ -2235,15 +2381,14 @@ static struct page *remove_pool_huge_page(struct hstate *h,
*/
if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
!list_empty(&h->hugepage_freelists[node])) {
- page = list_entry(h->hugepage_freelists[node].next,
- struct page, lru);
- folio = page_folio(page);
+ folio = list_entry(h->hugepage_freelists[node].next,
+ struct folio, lru);
remove_hugetlb_folio(h, folio, acct_surplus);
break;
}
}
- return page;
+ return folio;
}
/*
@@ -2311,17 +2456,23 @@ retry:
* need to adjust max_huge_pages if the page is not freed.
* Attempt to allocate vmemmmap here so that we can take
* appropriate action on failure.
+ *
+ * The folio_test_hugetlb check here is because
+ * remove_hugetlb_folio will clear hugetlb folio flag for
+ * non-vmemmap optimized hugetlb folios.
*/
- rc = hugetlb_vmemmap_restore(h, &folio->page);
- if (!rc) {
- update_and_free_hugetlb_folio(h, folio, false);
- } else {
- spin_lock_irq(&hugetlb_lock);
- add_hugetlb_folio(h, folio, false);
- h->max_huge_pages++;
- spin_unlock_irq(&hugetlb_lock);
- }
+ if (folio_test_hugetlb(folio)) {
+ rc = hugetlb_vmemmap_restore_folio(h, folio);
+ if (rc) {
+ spin_lock_irq(&hugetlb_lock);
+ add_hugetlb_folio(h, folio, false);
+ h->max_huge_pages++;
+ goto out;
+ }
+ } else
+ rc = 0;
+ update_and_free_hugetlb_folio(h, folio, false);
return rc;
}
out:
@@ -2479,24 +2630,6 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
}
-/* mempolicy aware migration callback */
-struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
- unsigned long address)
-{
- struct mempolicy *mpol;
- nodemask_t *nodemask;
- struct folio *folio;
- gfp_t gfp_mask;
- int node;
-
- gfp_mask = htlb_alloc_mask(h);
- node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
- folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
- mpol_cond_put(mpol);
-
- return folio;
-}
-
/*
* Increase the hugetlb pool such that it can accommodate a reservation
* of size 'delta'.
@@ -2597,7 +2730,6 @@ static void return_unused_surplus_pages(struct hstate *h,
unsigned long unused_resv_pages)
{
unsigned long nr_pages;
- struct page *page;
LIST_HEAD(page_list);
lockdep_assert_held(&hugetlb_lock);
@@ -2618,15 +2750,17 @@ static void return_unused_surplus_pages(struct hstate *h,
* evenly across all nodes with memory. Iterate across these nodes
* until we can no longer free unreserved surplus pages. This occurs
* when the nodes with surplus pages have no free pages.
- * remove_pool_huge_page() will balance the freed pages across the
+ * remove_pool_hugetlb_folio() will balance the freed pages across the
* on-line nodes with memory and will handle the hstate accounting.
*/
while (nr_pages--) {
- page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1);
- if (!page)
+ struct folio *folio;
+
+ folio = remove_pool_hugetlb_folio(h, &node_states[N_MEMORY], 1);
+ if (!folio)
goto out;
- list_add(&page->lru, &page_list);
+ list_add(&folio->lru, &page_list);
}
out:
@@ -3008,11 +3142,20 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct folio *folio;
- long map_chg, map_commit;
+ long map_chg, map_commit, nr_pages = pages_per_huge_page(h);
long gbl_chg;
- int ret, idx;
+ int memcg_charge_ret, ret, idx;
struct hugetlb_cgroup *h_cg = NULL;
+ struct mem_cgroup *memcg;
bool deferred_reserve;
+ gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
+
+ memcg = get_mem_cgroup_from_current();
+ memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages);
+ if (memcg_charge_ret == -ENOMEM) {
+ mem_cgroup_put(memcg);
+ return ERR_PTR(-ENOMEM);
+ }
idx = hstate_index(h);
/*
@@ -3021,8 +3164,12 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
* code of zero indicates a reservation exists (no change).
*/
map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
- if (map_chg < 0)
+ if (map_chg < 0) {
+ if (!memcg_charge_ret)
+ mem_cgroup_cancel_charge(memcg, nr_pages);
+ mem_cgroup_put(memcg);
return ERR_PTR(-ENOMEM);
+ }
/*
* Processes that did not create the mapping will have no
@@ -3033,10 +3180,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
*/
if (map_chg || avoid_reserve) {
gbl_chg = hugepage_subpool_get_pages(spool, 1);
- if (gbl_chg < 0) {
- vma_end_reservation(h, vma, addr);
- return ERR_PTR(-ENOSPC);
- }
+ if (gbl_chg < 0)
+ goto out_end_reservation;
/*
* Even though there was no reservation in the region/reserve
@@ -3118,6 +3263,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
pages_per_huge_page(h), folio);
}
+
+ if (!memcg_charge_ret)
+ mem_cgroup_commit_charge(folio, memcg);
+ mem_cgroup_put(memcg);
+
return folio;
out_uncharge_cgroup:
@@ -3129,7 +3279,11 @@ out_uncharge_cgroup_reservation:
out_subpool_put:
if (map_chg || avoid_reserve)
hugepage_subpool_put_pages(spool, 1);
+out_end_reservation:
vma_end_reservation(h, vma, addr);
+ if (!memcg_charge_ret)
+ mem_cgroup_cancel_charge(memcg, nr_pages);
+ mem_cgroup_put(memcg);
return ERR_PTR(-ENOSPC);
}
@@ -3164,6 +3318,16 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
}
found:
+
+ /*
+ * Only initialize the head struct page in memmap_init_reserved_pages,
+ * rest of the struct pages will be initialized by the HugeTLB
+ * subsystem itself.
+ * The head struct page is used to get folio information by the HugeTLB
+ * subsystem like zone id and node id.
+ */
+ memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE),
+ huge_page_size(h) - PAGE_SIZE);
/* Put them into a private list first because mem_map is not up yet */
INIT_LIST_HEAD(&m->list);
list_add(&m->list, &huge_boot_pages);
@@ -3171,29 +3335,102 @@ found:
return 1;
}
+/* Initialize [start_page:end_page_number] tail struct pages of a hugepage */
+static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio,
+ unsigned long start_page_number,
+ unsigned long end_page_number)
+{
+ enum zone_type zone = zone_idx(folio_zone(folio));
+ int nid = folio_nid(folio);
+ unsigned long head_pfn = folio_pfn(folio);
+ unsigned long pfn, end_pfn = head_pfn + end_page_number;
+ int ret;
+
+ for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) {
+ struct page *page = pfn_to_page(pfn);
+
+ __init_single_page(page, pfn, zone, nid);
+ prep_compound_tail((struct page *)folio, pfn - head_pfn);
+ ret = page_ref_freeze(page, 1);
+ VM_BUG_ON(!ret);
+ }
+}
+
+static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
+ struct hstate *h,
+ unsigned long nr_pages)
+{
+ int ret;
+
+ /* Prepare folio head */
+ __folio_clear_reserved(folio);
+ __folio_set_head(folio);
+ ret = folio_ref_freeze(folio, 1);
+ VM_BUG_ON(!ret);
+ /* Initialize the necessary tail struct pages */
+ hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages);
+ prep_compound_head((struct page *)folio, huge_page_order(h));
+}
+
+static void __init prep_and_add_bootmem_folios(struct hstate *h,
+ struct list_head *folio_list)
+{
+ unsigned long flags;
+ struct folio *folio, *tmp_f;
+
+ /* Send list for bulk vmemmap optimization processing */
+ hugetlb_vmemmap_optimize_folios(h, folio_list);
+
+ /* Add all new pool pages to free lists in one lock cycle */
+ spin_lock_irqsave(&hugetlb_lock, flags);
+ list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
+ if (!folio_test_hugetlb_vmemmap_optimized(folio)) {
+ /*
+ * If HVO fails, initialize all tail struct pages
+ * We do not worry about potential long lock hold
+ * time as this is early in boot and there should
+ * be no contention.
+ */
+ hugetlb_folio_init_tail_vmemmap(folio,
+ HUGETLB_VMEMMAP_RESERVE_PAGES,
+ pages_per_huge_page(h));
+ }
+ __prep_account_new_huge_page(h, folio_nid(folio));
+ enqueue_hugetlb_folio(h, folio);
+ }
+ spin_unlock_irqrestore(&hugetlb_lock, flags);
+}
+
/*
* Put bootmem huge pages into the standard lists after mem_map is up.
* Note: This only applies to gigantic (order > MAX_ORDER) pages.
*/
static void __init gather_bootmem_prealloc(void)
{
+ LIST_HEAD(folio_list);
struct huge_bootmem_page *m;
+ struct hstate *h = NULL, *prev_h = NULL;
list_for_each_entry(m, &huge_boot_pages, list) {
struct page *page = virt_to_page(m);
- struct folio *folio = page_folio(page);
- struct hstate *h = m->hstate;
+ struct folio *folio = (void *)page;
+
+ h = m->hstate;
+ /*
+ * It is possible to have multiple huge page sizes (hstates)
+ * in this list. If so, process each size separately.
+ */
+ if (h != prev_h && prev_h != NULL)
+ prep_and_add_bootmem_folios(prev_h, &folio_list);
+ prev_h = h;
VM_BUG_ON(!hstate_is_gigantic(h));
WARN_ON(folio_ref_count(folio) != 1);
- if (prep_compound_gigantic_folio(folio, huge_page_order(h))) {
- WARN_ON(folio_test_reserved(folio));
- prep_new_hugetlb_folio(h, folio, folio_nid(folio));
- free_huge_folio(folio); /* add to the hugepage allocator */
- } else {
- /* VERY unlikely inflated ref count on a tail page */
- free_gigantic_folio(folio, huge_page_order(h));
- }
+
+ hugetlb_folio_init_vmemmap(folio, h,
+ HUGETLB_VMEMMAP_RESERVE_PAGES);
+ init_new_hugetlb_folio(h, folio);
+ list_add(&folio->lru, &folio_list);
/*
* We need to restore the 'stolen' pages to totalram_pages
@@ -3203,7 +3440,10 @@ static void __init gather_bootmem_prealloc(void)
adjust_managed_page_count(page, pages_per_huge_page(h));
cond_resched();
}
+
+ prep_and_add_bootmem_folios(h, &folio_list);
}
+
static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
{
unsigned long i;
@@ -3235,9 +3475,22 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
h->max_huge_pages_node[nid] = i;
}
+/*
+ * NOTE: this routine is called in different contexts for gigantic and
+ * non-gigantic pages.
+ * - For gigantic pages, this is called early in the boot process and
+ * pages are allocated from memblock allocated or something similar.
+ * Gigantic pages are actually added to pools later with the routine
+ * gather_bootmem_prealloc.
+ * - For non-gigantic pages, this is called later in the boot process after
+ * all of mm is up and functional. Pages are allocated from buddy and
+ * then added to hugetlb pools.
+ */
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
unsigned long i;
+ struct folio *folio;
+ LIST_HEAD(folio_list);
nodemask_t *node_alloc_noretry;
bool node_specific_alloc = false;
@@ -3279,14 +3532,25 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
for (i = 0; i < h->max_huge_pages; ++i) {
if (hstate_is_gigantic(h)) {
+ /*
+ * gigantic pages not added to list as they are not
+ * added to pools now.
+ */
if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
break;
- } else if (!alloc_pool_huge_page(h,
- &node_states[N_MEMORY],
- node_alloc_noretry))
- break;
+ } else {
+ folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
+ node_alloc_noretry);
+ if (!folio)
+ break;
+ list_add(&folio->lru, &folio_list);
+ }
cond_resched();
}
+
+ /* list will be empty if hstate_is_gigantic */
+ prep_and_add_allocated_folios(h, &folio_list);
+
if (i < h->max_huge_pages) {
char buf[32];
@@ -3359,15 +3623,15 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
* Collect pages to be freed on a list, and free after dropping lock
*/
for_each_node_mask(i, *nodes_allowed) {
- struct page *page, *next;
+ struct folio *folio, *next;
struct list_head *freel = &h->hugepage_freelists[i];
- list_for_each_entry_safe(page, next, freel, lru) {
+ list_for_each_entry_safe(folio, next, freel, lru) {
if (count >= h->nr_huge_pages)
goto out;
- if (PageHighMem(page))
+ if (folio_test_highmem(folio))
continue;
- remove_hugetlb_folio(h, page_folio(page), false);
- list_add(&page->lru, &page_list);
+ remove_hugetlb_folio(h, folio, false);
+ list_add(&folio->lru, &page_list);
}
}
@@ -3420,8 +3684,9 @@ found:
static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
nodemask_t *nodes_allowed)
{
- unsigned long min_count, ret;
- struct page *page;
+ unsigned long min_count;
+ unsigned long allocated;
+ struct folio *folio;
LIST_HEAD(page_list);
NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
@@ -3452,7 +3717,9 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
if (nid != NUMA_NO_NODE) {
unsigned long old_count = count;
- count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
+ count += persistent_huge_pages(h) -
+ (h->nr_huge_pages_node[nid] -
+ h->surplus_huge_pages_node[nid]);
/*
* User may have specified a large count value which caused the
* above calculation to overflow. In this case, they wanted
@@ -3496,7 +3763,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
break;
}
- while (count > persistent_huge_pages(h)) {
+ allocated = 0;
+ while (count > (persistent_huge_pages(h) + allocated)) {
/*
* If this allocation races such that we no longer need the
* page, free_huge_folio will handle it by freeing the page
@@ -3507,15 +3775,32 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
/* yield cpu to avoid soft lockup */
cond_resched();
- ret = alloc_pool_huge_page(h, nodes_allowed,
+ folio = alloc_pool_huge_folio(h, nodes_allowed,
node_alloc_noretry);
- spin_lock_irq(&hugetlb_lock);
- if (!ret)
+ if (!folio) {
+ prep_and_add_allocated_folios(h, &page_list);
+ spin_lock_irq(&hugetlb_lock);
goto out;
+ }
+
+ list_add(&folio->lru, &page_list);
+ allocated++;
/* Bail for signals. Probably ctrl-c from user */
- if (signal_pending(current))
+ if (signal_pending(current)) {
+ prep_and_add_allocated_folios(h, &page_list);
+ spin_lock_irq(&hugetlb_lock);
goto out;
+ }
+
+ spin_lock_irq(&hugetlb_lock);
+ }
+
+ /* Add allocated pages to the pool */
+ if (!list_empty(&page_list)) {
+ spin_unlock_irq(&hugetlb_lock);
+ prep_and_add_allocated_folios(h, &page_list);
+ spin_lock_irq(&hugetlb_lock);
}
/*
@@ -3541,11 +3826,11 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* Collect pages to be removed on list without dropping lock
*/
while (min_count < persistent_huge_pages(h)) {
- page = remove_pool_huge_page(h, nodes_allowed, 0);
- if (!page)
+ folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0);
+ if (!folio)
break;
- list_add(&page->lru, &page_list);
+ list_add(&folio->lru, &page_list);
}
/* free the pages after dropping lock */
spin_unlock_irq(&hugetlb_lock);
@@ -3580,13 +3865,21 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
remove_hugetlb_folio_for_demote(h, folio, false);
spin_unlock_irq(&hugetlb_lock);
- rc = hugetlb_vmemmap_restore(h, &folio->page);
- if (rc) {
- /* Allocation of vmemmmap failed, we can not demote folio */
- spin_lock_irq(&hugetlb_lock);
- folio_ref_unfreeze(folio, 1);
- add_hugetlb_folio(h, folio, false);
- return rc;
+ /*
+ * If vmemmap already existed for folio, the remove routine above would
+ * have cleared the hugetlb folio flag. Hence the folio is technically
+ * no longer a hugetlb folio. hugetlb_vmemmap_restore_folio can only be
+ * passed hugetlb folios and will BUG otherwise.
+ */
+ if (folio_test_hugetlb(folio)) {
+ rc = hugetlb_vmemmap_restore_folio(h, folio);
+ if (rc) {
+ /* Allocation of vmemmmap failed, we can not demote folio */
+ spin_lock_irq(&hugetlb_lock);
+ folio_ref_unfreeze(folio, 1);
+ add_hugetlb_folio(h, folio, false);
+ return rc;
+ }
}
/*
@@ -4282,7 +4575,7 @@ void __init hugetlb_add_hstate(unsigned int order)
return;
}
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
- BUG_ON(order == 0);
+ BUG_ON(order < order_base_2(__NR_USED_SUBPAGE));
h = &hstates[hugetlb_max_hstate++];
mutex_init(&h->resize_lock);
h->order = order;
@@ -4965,7 +5258,7 @@ bool is_hugetlb_entry_migration(pte_t pte)
return false;
}
-static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
+bool is_hugetlb_entry_hwpoisoned(pte_t pte)
{
swp_entry_t swp;
@@ -4980,7 +5273,7 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
static void
hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
- struct folio *new_folio, pte_t old)
+ struct folio *new_folio, pte_t old, unsigned long sz)
{
pte_t newpte = make_huge_pte(vma, &new_folio->page, 1);
@@ -4988,7 +5281,7 @@ hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long add
hugepage_add_new_anon_rmap(new_folio, vma, addr);
if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old))
newpte = huge_pte_mkuffd_wp(newpte);
- set_huge_pte_at(vma->vm_mm, addr, ptep, newpte);
+ set_huge_pte_at(vma->vm_mm, addr, ptep, newpte, sz);
hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
folio_set_hugetlb_migratable(new_folio);
}
@@ -5065,7 +5358,7 @@ again:
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
if (!userfaultfd_wp(dst_vma))
entry = huge_pte_clear_uffd_wp(entry);
- set_huge_pte_at(dst, addr, dst_pte, entry);
+ set_huge_pte_at(dst, addr, dst_pte, entry, sz);
} else if (unlikely(is_hugetlb_entry_migration(entry))) {
swp_entry_t swp_entry = pte_to_swp_entry(entry);
bool uffd_wp = pte_swp_uffd_wp(entry);
@@ -5080,18 +5373,18 @@ again:
entry = swp_entry_to_pte(swp_entry);
if (userfaultfd_wp(src_vma) && uffd_wp)
entry = pte_swp_mkuffd_wp(entry);
- set_huge_pte_at(src, addr, src_pte, entry);
+ set_huge_pte_at(src, addr, src_pte, entry, sz);
}
if (!userfaultfd_wp(dst_vma))
entry = huge_pte_clear_uffd_wp(entry);
- set_huge_pte_at(dst, addr, dst_pte, entry);
+ set_huge_pte_at(dst, addr, dst_pte, entry, sz);
} else if (unlikely(is_pte_marker(entry))) {
pte_marker marker = copy_pte_marker(
pte_to_swp_entry(entry), dst_vma);
if (marker)
set_huge_pte_at(dst, addr, dst_pte,
- make_pte_marker(marker));
+ make_pte_marker(marker), sz);
} else {
entry = huge_ptep_get(src_pte);
pte_folio = page_folio(pte_page(entry));
@@ -5145,7 +5438,7 @@ again:
goto again;
}
hugetlb_install_folio(dst_vma, dst_pte, addr,
- new_folio, src_pte_old);
+ new_folio, src_pte_old, sz);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
continue;
@@ -5166,7 +5459,7 @@ again:
if (!userfaultfd_wp(dst_vma))
entry = huge_pte_clear_uffd_wp(entry);
- set_huge_pte_at(dst, addr, dst_pte, entry);
+ set_huge_pte_at(dst, addr, dst_pte, entry, sz);
hugetlb_count_add(npages, dst);
}
spin_unlock(src_ptl);
@@ -5184,7 +5477,8 @@ again:
}
static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte)
+ unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte,
+ unsigned long sz)
{
struct hstate *h = hstate_vma(vma);
struct mm_struct *mm = vma->vm_mm;
@@ -5202,7 +5496,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
- set_huge_pte_at(mm, new_addr, dst_pte, pte);
+ set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
if (src_ptl != dst_ptl)
spin_unlock(src_ptl);
@@ -5259,7 +5553,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
if (!dst_pte)
break;
- move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte);
+ move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz);
}
if (shared_pmd)
@@ -5273,9 +5567,9 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
return len + old_addr - old_end;
}
-static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct page *ref_page, zap_flags_t zap_flags)
+void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct page *ref_page, zap_flags_t zap_flags)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
@@ -5337,7 +5631,8 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
if (pte_swp_uffd_wp_any(pte) &&
!(zap_flags & ZAP_FLAG_DROP_MARKER))
set_huge_pte_at(mm, address, ptep,
- make_pte_marker(PTE_MARKER_UFFD_WP));
+ make_pte_marker(PTE_MARKER_UFFD_WP),
+ sz);
else
huge_pte_clear(mm, address, ptep, sz);
spin_unlock(ptl);
@@ -5371,7 +5666,8 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
if (huge_pte_uffd_wp(pte) &&
!(zap_flags & ZAP_FLAG_DROP_MARKER))
set_huge_pte_at(mm, address, ptep,
- make_pte_marker(PTE_MARKER_UFFD_WP));
+ make_pte_marker(PTE_MARKER_UFFD_WP),
+ sz);
hugetlb_count_sub(pages_per_huge_page(h), mm);
page_remove_rmap(page, vma, true);
@@ -5402,16 +5698,25 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
tlb_flush_mmu_tlbonly(tlb);
}
-void __unmap_hugepage_range_final(struct mmu_gather *tlb,
- struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page,
- zap_flags_t zap_flags)
+void __hugetlb_zap_begin(struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end)
{
+ if (!vma->vm_file) /* hugetlbfs_file_mmap error */
+ return;
+
+ adjust_range_if_pmd_sharing_possible(vma, start, end);
hugetlb_vma_lock_write(vma);
- i_mmap_lock_write(vma->vm_file->f_mapping);
+ if (vma->vm_file)
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+}
- /* mmu notification performed in caller */
- __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
+void __hugetlb_zap_end(struct vm_area_struct *vma,
+ struct zap_details *details)
+{
+ zap_flags_t zap_flags = details ? details->zap_flags : 0;
+
+ if (!vma->vm_file) /* hugetlbfs_file_mmap error */
+ return;
if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */
/*
@@ -5424,11 +5729,12 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
* someone else.
*/
__hugetlb_vma_unlock_write_free(vma);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
} else {
- i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
}
+
+ if (vma->vm_file)
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
@@ -5560,8 +5866,10 @@ retry_avoidcopy:
* owner and can reuse this page.
*/
if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) {
- if (!PageAnonExclusive(&old_folio->page))
- page_move_anon_rmap(&old_folio->page, vma);
+ if (!PageAnonExclusive(&old_folio->page)) {
+ folio_move_anon_rmap(old_folio, vma);
+ SetPageAnonExclusive(&old_folio->page);
+ }
if (likely(!unshare))
set_huge_ptep_writable(vma, haddr, ptep);
@@ -5676,7 +5984,7 @@ retry_avoidcopy:
hugepage_add_new_anon_rmap(new_folio, vma, haddr);
if (huge_pte_uffd_wp(pte))
newpte = huge_pte_mkuffd_wp(newpte);
- set_huge_pte_at(mm, haddr, ptep, newpte);
+ set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h));
folio_set_hugetlb_migratable(new_folio);
/* Make the old page be freed below */
new_folio = old_folio;
@@ -5707,7 +6015,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
{
struct address_space *mapping = vma->vm_file->f_mapping;
- pgoff_t idx = vma_hugecache_offset(h, vma, address);
+ pgoff_t idx = linear_page_index(vma, address);
struct folio *folio;
folio = filemap_get_folio(mapping, idx);
@@ -5724,6 +6032,7 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping
struct hstate *h = hstate_inode(inode);
int err;
+ idx <<= huge_page_order(h);
__folio_set_locked(folio);
err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL);
@@ -5831,7 +6140,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* before we get page_table_lock.
*/
new_folio = false;
- folio = filemap_lock_folio(mapping, idx);
+ folio = filemap_lock_hugetlb_folio(h, mapping, idx);
if (IS_ERR(folio)) {
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
@@ -5972,7 +6281,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
*/
if (unlikely(pte_marker_uffd_wp(old_pte)))
new_pte = huge_pte_mkuffd_wp(new_pte);
- set_huge_pte_at(mm, haddr, ptep, new_pte);
+ set_huge_pte_at(mm, haddr, ptep, new_pte, huge_page_size(h));
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
@@ -6140,7 +6449,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, haddr);
- pagecache_folio = filemap_lock_folio(mapping, idx);
+ pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, idx);
if (IS_ERR(pagecache_folio))
pagecache_folio = NULL;
}
@@ -6154,21 +6463,28 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* Handle userfault-wp first, before trying to lock more pages */
if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
(flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
- struct vm_fault vmf = {
- .vma = vma,
- .address = haddr,
- .real_address = address,
- .flags = flags,
- };
+ if (!userfaultfd_wp_async(vma)) {
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = haddr,
+ .real_address = address,
+ .flags = flags,
+ };
- spin_unlock(ptl);
- if (pagecache_folio) {
- folio_unlock(pagecache_folio);
- folio_put(pagecache_folio);
+ spin_unlock(ptl);
+ if (pagecache_folio) {
+ folio_unlock(pagecache_folio);
+ folio_put(pagecache_folio);
+ }
+ hugetlb_vma_unlock_read(vma);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ return handle_userfault(&vmf, VM_UFFD_WP);
}
- hugetlb_vma_unlock_read(vma);
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- return handle_userfault(&vmf, VM_UFFD_WP);
+
+ entry = huge_pte_clear_uffd_wp(entry);
+ set_huge_pte_at(mm, haddr, ptep, entry,
+ huge_page_size(hstate_vma(vma)));
+ /* Fallthrough to CoW */
}
/*
@@ -6226,6 +6542,26 @@ out_mutex:
#ifdef CONFIG_USERFAULTFD
/*
+ * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte().
+ */
+static struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct mempolicy *mpol;
+ nodemask_t *nodemask;
+ struct folio *folio;
+ gfp_t gfp_mask;
+ int node;
+
+ gfp_mask = htlb_alloc_mask(h);
+ node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+ folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
+ mpol_cond_put(mpol);
+
+ return folio;
+}
+
+/*
* Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
* with modifications for hugetlb pages.
*/
@@ -6261,7 +6597,8 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
}
_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
- set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+ set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte,
+ huge_page_size(h));
/* No need to invalidate - it was non-present before */
update_mmu_cache(dst_vma, dst_addr, dst_pte);
@@ -6272,7 +6609,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
if (is_continue) {
ret = -EFAULT;
- folio = filemap_lock_folio(mapping, idx);
+ folio = filemap_lock_hugetlb_folio(h, mapping, idx);
if (IS_ERR(folio))
goto out;
folio_in_pagecache = true;
@@ -6412,7 +6749,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
if (wp_enabled)
_dst_pte = huge_pte_mkuffd_wp(_dst_pte);
- set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+ set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, huge_page_size(h));
hugetlb_count_add(pages_per_huge_page(h), dst_mm);
@@ -6474,7 +6811,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
}
}
- page += ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
+ page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT));
/*
* Note that page may be a sub-page, and with vmemmap
@@ -6598,7 +6935,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
else if (uffd_wp_resolve)
newpte = pte_swp_clear_uffd_wp(newpte);
if (!pte_same(pte, newpte))
- set_huge_pte_at(mm, address, ptep, newpte);
+ set_huge_pte_at(mm, address, ptep, newpte, psize);
} else if (unlikely(is_pte_marker(pte))) {
/* No other markers apply for now. */
WARN_ON_ONCE(!pte_marker_uffd_wp(pte));
@@ -6623,7 +6960,8 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
if (unlikely(uffd_wp))
/* Safe to modify directly (none->non-present). */
set_huge_pte_at(mm, address, ptep,
- make_pte_marker(PTE_MARKER_UFFD_WP));
+ make_pte_marker(PTE_MARKER_UFFD_WP),
+ psize);
}
spin_unlock(ptl);
}
@@ -6806,8 +7144,10 @@ out_err:
*/
if (chg >= 0 && add < 0)
region_abort(resv_map, from, to, regions_needed);
- if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
kref_put(&resv_map->refs, resv_map_release);
+ set_vma_resv_map(vma, NULL);
+ }
return false;
}
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index dedd2edb076e..aa4486bd3904 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -262,12 +262,6 @@ static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
if (hugetlb_cgroup_disabled())
goto done;
- /*
- * We don't charge any cgroup if the compound page have less
- * than 3 pages.
- */
- if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
- goto done;
again:
rcu_read_lock();
h_cg = hugetlb_cgroup_from_task(current);
@@ -397,9 +391,6 @@ static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
if (hugetlb_cgroup_disabled() || !h_cg)
return;
- if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
- return;
-
page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
rsvd),
nr_pages);
@@ -869,15 +860,8 @@ void __init hugetlb_cgroup_file_init(void)
{
struct hstate *h;
- for_each_hstate(h) {
- /*
- * Add cgroup control files only if the huge page consists
- * of more than two normal pages. This is because we use
- * page[2].private for storing cgroup details.
- */
- if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
- __hugetlb_cgroup_file_init(hstate_index(h));
- }
+ for_each_hstate(h)
+ __hugetlb_cgroup_file_init(hstate_index(h));
}
/*
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 4b9734777f69..87818ee7f01d 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -13,6 +13,7 @@
#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
+#include <linux/mmdebug.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"
@@ -26,6 +27,8 @@
* @reuse_addr: the virtual address of the @reuse_page page.
* @vmemmap_pages: the list head of the vmemmap pages that can be freed
* or is mapped from.
+ * @flags: used to modify behavior in vmemmap page table walking
+ * operations.
*/
struct vmemmap_remap_walk {
void (*remap_pte)(pte_t *pte, unsigned long addr,
@@ -34,9 +37,15 @@ struct vmemmap_remap_walk {
struct page *reuse_page;
unsigned long reuse_addr;
struct list_head *vmemmap_pages;
+
+/* Skip the TLB flush when we split the PMD */
+#define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0)
+/* Skip the TLB flush when we remap the PTE */
+#define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1)
+ unsigned long flags;
};
-static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
{
pmd_t __pmd;
int i;
@@ -79,7 +88,8 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
/* Make pte visible before pmd. See comment in pmd_install(). */
smp_wmb();
pmd_populate_kernel(&init_mm, pmd, pgtable);
- flush_tlb_kernel_range(start, start + PMD_SIZE);
+ if (flush)
+ flush_tlb_kernel_range(start, start + PMD_SIZE);
} else {
pte_free_kernel(&init_mm, pgtable);
}
@@ -126,11 +136,20 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
do {
int ret;
- ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
+ ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
+ !(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH));
if (ret)
return ret;
next = pmd_addr_end(addr, end);
+
+ /*
+ * We are only splitting, not remapping the hugetlb vmemmap
+ * pages.
+ */
+ if (!walk->remap_pte)
+ continue;
+
vmemmap_pte_range(pmd, addr, next, walk);
} while (pmd++, addr = next, addr != end);
@@ -197,7 +216,8 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
return ret;
} while (pgd++, addr = next, addr != end);
- flush_tlb_kernel_range(start, end);
+ if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
+ flush_tlb_kernel_range(start, end);
return 0;
}
@@ -250,7 +270,7 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
}
entry = mk_pte(walk->reuse_page, pgprot);
- list_add_tail(&page->lru, walk->vmemmap_pages);
+ list_add(&page->lru, walk->vmemmap_pages);
set_pte_at(&init_mm, addr, pte, entry);
}
@@ -297,6 +317,36 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
}
/**
+ * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
+ * backing PMDs of the directmap into PTEs
+ * @start: start address of the vmemmap virtual address range that we want
+ * to remap.
+ * @end: end address of the vmemmap virtual address range that we want to
+ * remap.
+ * @reuse: reuse address.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+static int vmemmap_remap_split(unsigned long start, unsigned long end,
+ unsigned long reuse)
+{
+ int ret;
+ struct vmemmap_remap_walk walk = {
+ .remap_pte = NULL,
+ .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH,
+ };
+
+ /* See the comment in the vmemmap_remap_free(). */
+ BUG_ON(start - reuse != PAGE_SIZE);
+
+ mmap_read_lock(&init_mm);
+ ret = vmemmap_remap_range(reuse, end, &walk);
+ mmap_read_unlock(&init_mm);
+
+ return ret;
+}
+
+/**
* vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
* to the page which @reuse is mapped to, then free vmemmap
* which the range are mapped to.
@@ -305,22 +355,26 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
* @end: end address of the vmemmap virtual address range that we want to
* remap.
* @reuse: reuse address.
+ * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers
+ * responsibility to free pages.
+ * @flags: modifications to vmemmap_remap_walk flags
*
* Return: %0 on success, negative error code otherwise.
*/
static int vmemmap_remap_free(unsigned long start, unsigned long end,
- unsigned long reuse)
+ unsigned long reuse,
+ struct list_head *vmemmap_pages,
+ unsigned long flags)
{
int ret;
- LIST_HEAD(vmemmap_pages);
struct vmemmap_remap_walk walk = {
.remap_pte = vmemmap_remap_pte,
.reuse_addr = reuse,
- .vmemmap_pages = &vmemmap_pages,
+ .vmemmap_pages = vmemmap_pages,
+ .flags = flags,
};
- int nid = page_to_nid((struct page *)start);
- gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY |
- __GFP_NOWARN;
+ int nid = page_to_nid((struct page *)reuse);
+ gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
/*
* Allocate a new head vmemmap page to avoid breaking a contiguous
@@ -334,7 +388,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
if (walk.reuse_page) {
copy_page(page_to_virt(walk.reuse_page),
(void *)walk.reuse_addr);
- list_add(&walk.reuse_page->lru, &vmemmap_pages);
+ list_add(&walk.reuse_page->lru, vmemmap_pages);
}
/*
@@ -365,22 +419,21 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
walk = (struct vmemmap_remap_walk) {
.remap_pte = vmemmap_restore_pte,
.reuse_addr = reuse,
- .vmemmap_pages = &vmemmap_pages,
+ .vmemmap_pages = vmemmap_pages,
+ .flags = 0,
};
vmemmap_remap_range(reuse, end, &walk);
}
mmap_read_unlock(&init_mm);
- free_vmemmap_page_list(&vmemmap_pages);
-
return ret;
}
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
struct list_head *list)
{
- gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_THISNODE;
+ gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
int nid = page_to_nid((struct page *)start);
struct page *page, *next;
@@ -389,7 +442,7 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
page = alloc_pages_node(nid, gfp_mask, 0);
if (!page)
goto out;
- list_add_tail(&page->lru, list);
+ list_add(&page->lru, list);
}
return 0;
@@ -408,17 +461,19 @@ out:
* @end: end address of the vmemmap virtual address range that we want to
* remap.
* @reuse: reuse address.
+ * @flags: modifications to vmemmap_remap_walk flags
*
* Return: %0 on success, negative error code otherwise.
*/
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
- unsigned long reuse)
+ unsigned long reuse, unsigned long flags)
{
LIST_HEAD(vmemmap_pages);
struct vmemmap_remap_walk walk = {
.remap_pte = vmemmap_restore_pte,
.reuse_addr = reuse,
.vmemmap_pages = &vmemmap_pages,
+ .flags = flags,
};
/* See the comment in the vmemmap_remap_free(). */
@@ -440,23 +495,15 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
-/**
- * hugetlb_vmemmap_restore - restore previously optimized (by
- * hugetlb_vmemmap_optimize()) vmemmap pages which
- * will be reallocated and remapped.
- * @h: struct hstate.
- * @head: the head page whose vmemmap pages will be restored.
- *
- * Return: %0 if @head's vmemmap pages have been reallocated and remapped,
- * negative error code otherwise.
- */
-int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
+static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio, unsigned long flags)
{
int ret;
+ struct page *head = &folio->page;
unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
unsigned long vmemmap_reuse;
- if (!HPageVmemmapOptimized(head))
+ VM_WARN_ON_ONCE(!PageHuge(head));
+ if (!folio_test_hugetlb_vmemmap_optimized(folio))
return 0;
vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
@@ -470,18 +517,77 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
* When a HugeTLB page is freed to the buddy allocator, previously
* discarded vmemmap pages must be allocated and remapping.
*/
- ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse);
+ ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
if (!ret) {
- ClearHPageVmemmapOptimized(head);
+ folio_clear_hugetlb_vmemmap_optimized(folio);
static_branch_dec(&hugetlb_optimize_vmemmap_key);
}
return ret;
}
+/**
+ * hugetlb_vmemmap_restore_folio - restore previously optimized (by
+ * hugetlb_vmemmap_optimize_folio()) vmemmap pages which
+ * will be reallocated and remapped.
+ * @h: struct hstate.
+ * @folio: the folio whose vmemmap pages will be restored.
+ *
+ * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
+ * negative error code otherwise.
+ */
+int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
+{
+ return __hugetlb_vmemmap_restore_folio(h, folio, 0);
+}
+
+/**
+ * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
+ * @h: hstate.
+ * @folio_list: list of folios.
+ * @non_hvo_folios: Output list of folios for which vmemmap exists.
+ *
+ * Return: number of folios for which vmemmap was restored, or an error code
+ * if an error was encountered restoring vmemmap for a folio.
+ * Folios that have vmemmap are moved to the non_hvo_folios
+ * list. Processing of entries stops when the first error is
+ * encountered. The folio that experienced the error and all
+ * non-processed folios will remain on folio_list.
+ */
+long hugetlb_vmemmap_restore_folios(const struct hstate *h,
+ struct list_head *folio_list,
+ struct list_head *non_hvo_folios)
+{
+ struct folio *folio, *t_folio;
+ long restored = 0;
+ long ret = 0;
+
+ list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
+ if (folio_test_hugetlb_vmemmap_optimized(folio)) {
+ ret = __hugetlb_vmemmap_restore_folio(h, folio,
+ VMEMMAP_REMAP_NO_TLB_FLUSH);
+ if (ret)
+ break;
+ restored++;
+ }
+
+ /* Add non-optimized folios to output list */
+ list_move(&folio->lru, non_hvo_folios);
+ }
+
+ if (restored)
+ flush_tlb_all();
+ if (!ret)
+ ret = restored;
+ return ret;
+}
+
/* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
{
+ if (HPageVmemmapOptimized((struct page *)head))
+ return false;
+
if (!READ_ONCE(vmemmap_optimize_enabled))
return false;
@@ -535,39 +641,136 @@ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *h
return true;
}
+static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
+ struct folio *folio,
+ struct list_head *vmemmap_pages,
+ unsigned long flags)
+{
+ int ret = 0;
+ struct page *head = &folio->page;
+ unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
+ unsigned long vmemmap_reuse;
+
+ VM_WARN_ON_ONCE(!PageHuge(head));
+ if (!vmemmap_should_optimize(h, head))
+ return ret;
+
+ static_branch_inc(&hugetlb_optimize_vmemmap_key);
+ /*
+ * Very Subtle
+ * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
+ * immediately after remapping. As a result, subsequent accesses
+ * and modifications to struct pages associated with the hugetlb
+ * page could be to the OLD struct pages. Set the vmemmap optimized
+ * flag here so that it is copied to the new head page. This keeps
+ * the old and new struct pages in sync.
+ * If there is an error during optimization, we will immediately FLUSH
+ * the TLB and clear the flag below.
+ */
+ folio_set_hugetlb_vmemmap_optimized(folio);
+
+ vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
+ vmemmap_reuse = vmemmap_start;
+ vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;
+
+ /*
+ * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
+ * to the page which @vmemmap_reuse is mapped to. Add pages previously
+ * mapping the range to vmemmap_pages list so that they can be freed by
+ * the caller.
+ */
+ ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
+ vmemmap_pages, flags);
+ if (ret) {
+ static_branch_dec(&hugetlb_optimize_vmemmap_key);
+ folio_clear_hugetlb_vmemmap_optimized(folio);
+ }
+
+ return ret;
+}
+
/**
- * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages.
+ * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
* @h: struct hstate.
- * @head: the head page whose vmemmap pages will be optimized.
+ * @folio: the folio whose vmemmap pages will be optimized.
*
- * This function only tries to optimize @head's vmemmap pages and does not
+ * This function only tries to optimize @folio's vmemmap pages and does not
* guarantee that the optimization will succeed after it returns. The caller
- * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages
- * have been optimized.
+ * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
+ * vmemmap pages have been optimized.
*/
-void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
+void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
+{
+ LIST_HEAD(vmemmap_pages);
+
+ __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0);
+ free_vmemmap_page_list(&vmemmap_pages);
+}
+
+static int hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
{
unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
unsigned long vmemmap_reuse;
if (!vmemmap_should_optimize(h, head))
- return;
-
- static_branch_inc(&hugetlb_optimize_vmemmap_key);
+ return 0;
vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
vmemmap_reuse = vmemmap_start;
vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;
/*
- * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
- * to the page which @vmemmap_reuse is mapped to, then free the pages
- * which the range [@vmemmap_start, @vmemmap_end] is mapped to.
+ * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
+ * @vmemmap_end]
*/
- if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse))
- static_branch_dec(&hugetlb_optimize_vmemmap_key);
- else
- SetHPageVmemmapOptimized(head);
+ return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
+}
+
+void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
+{
+ struct folio *folio;
+ LIST_HEAD(vmemmap_pages);
+
+ list_for_each_entry(folio, folio_list, lru) {
+ int ret = hugetlb_vmemmap_split(h, &folio->page);
+
+ /*
+ * Spliting the PMD requires allocating a page, thus lets fail
+ * early once we encounter the first OOM. No point in retrying
+ * as it can be dynamically done on remap with the memory
+ * we get back from the vmemmap deduplication.
+ */
+ if (ret == -ENOMEM)
+ break;
+ }
+
+ flush_tlb_all();
+
+ list_for_each_entry(folio, folio_list, lru) {
+ int ret = __hugetlb_vmemmap_optimize_folio(h, folio,
+ &vmemmap_pages,
+ VMEMMAP_REMAP_NO_TLB_FLUSH);
+
+ /*
+ * Pages to be freed may have been accumulated. If we
+ * encounter an ENOMEM, free what we have and try again.
+ * This can occur in the case that both spliting fails
+ * halfway and head page allocation also failed. In this
+ * case __hugetlb_vmemmap_optimize_folio() would free memory
+ * allowing more vmemmap remaps to occur.
+ */
+ if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
+ flush_tlb_all();
+ free_vmemmap_page_list(&vmemmap_pages);
+ INIT_LIST_HEAD(&vmemmap_pages);
+ __hugetlb_vmemmap_optimize_folio(h, folio,
+ &vmemmap_pages,
+ VMEMMAP_REMAP_NO_TLB_FLUSH);
+ }
+ }
+
+ flush_tlb_all();
+ free_vmemmap_page_list(&vmemmap_pages);
}
static struct ctl_table hugetlb_vmemmap_sysctls[] = {
@@ -586,7 +789,7 @@ static int __init hugetlb_vmemmap_init(void)
const struct hstate *h;
/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
- BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE);
+ BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);
for_each_hstate(h) {
if (hugetlb_vmemmap_optimizable(h)) {
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 25bd0e002431..2fcae92d3359 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -10,15 +10,20 @@
#define _LINUX_HUGETLB_VMEMMAP_H
#include <linux/hugetlb.h>
-#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
-void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);
-
/*
* Reserve one vmemmap page, all vmemmap addresses are mapped to it. See
- * Documentation/vm/vmemmap_dedup.rst.
+ * Documentation/mm/vmemmap_dedup.rst.
*/
#define HUGETLB_VMEMMAP_RESERVE_SIZE PAGE_SIZE
+#define HUGETLB_VMEMMAP_RESERVE_PAGES (HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page))
+
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio);
+long hugetlb_vmemmap_restore_folios(const struct hstate *h,
+ struct list_head *folio_list,
+ struct list_head *non_hvo_folios);
+void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio);
+void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list);
static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h)
{
@@ -38,12 +43,24 @@ static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate
return size > 0 ? size : 0;
}
#else
-static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
+static inline int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
return 0;
}
-static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
+static long hugetlb_vmemmap_restore_folios(const struct hstate *h,
+ struct list_head *folio_list,
+ struct list_head *non_hvo_folios)
+{
+ list_splice_init(folio_list, non_hvo_folios);
+ return 0;
+}
+
+static inline void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
+{
+}
+
+static inline void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
}
diff --git a/mm/internal.h b/mm/internal.h
index 30cf724ddbce..b61034bd50f5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -415,6 +415,15 @@ static inline void folio_set_order(struct folio *folio, unsigned int order)
void folio_undo_large_rmappable(struct folio *folio);
+static inline struct folio *page_rmappable_folio(struct page *page)
+{
+ struct folio *folio = (struct folio *)page;
+
+ if (folio && folio_order(folio) > 1)
+ folio_prep_large_rmappable(folio);
+ return folio;
+}
+
static inline void prep_compound_head(struct page *page, unsigned int order)
{
struct folio *folio = (struct folio *)page;
@@ -586,6 +595,56 @@ extern long faultin_vma_page_range(struct vm_area_struct *vma,
bool write, int *locked);
extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
unsigned long bytes);
+
+/*
+ * NOTE: This function can't tell whether the folio is "fully mapped" in the
+ * range.
+ * "fully mapped" means all the pages of folio is associated with the page
+ * table of range while this function just check whether the folio range is
+ * within the range [start, end). Function caller needs to do page table
+ * check if it cares about the page table association.
+ *
+ * Typical usage (like mlock or madvise) is:
+ * Caller knows at least 1 page of folio is associated with page table of VMA
+ * and the range [start, end) is intersect with the VMA range. Caller wants
+ * to know whether the folio is fully associated with the range. It calls
+ * this function to check whether the folio is in the range first. Then checks
+ * the page table to know whether the folio is fully mapped to the range.
+ */
+static inline bool
+folio_within_range(struct folio *folio, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ pgoff_t pgoff, addr;
+ unsigned long vma_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+
+ VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio);
+ if (start > end)
+ return false;
+
+ if (start < vma->vm_start)
+ start = vma->vm_start;
+
+ if (end > vma->vm_end)
+ end = vma->vm_end;
+
+ pgoff = folio_pgoff(folio);
+
+ /* if folio start address is not in vma range */
+ if (!in_range(pgoff, vma->vm_pgoff, vma_pglen))
+ return false;
+
+ addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+
+ return !(addr < start || end - addr < folio_size(folio));
+}
+
+static inline bool
+folio_within_vma(struct folio *folio, struct vm_area_struct *vma)
+{
+ return folio_within_range(folio, vma, vma->vm_start, vma->vm_end);
+}
+
/*
* mlock_vma_folio() and munlock_vma_folio():
* should be called with vma's mmap_lock held for read or write,
@@ -594,14 +653,10 @@ extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
* mlock is usually called at the end of page_add_*_rmap(), munlock at
* the end of page_remove_rmap(); but new anon folios are managed by
* folio_add_lru_vma() calling mlock_new_folio().
- *
- * @compound is used to include pmd mappings of THPs, but filter out
- * pte mappings of THPs, which cannot be consistently counted: a pte
- * mapping of the THP head cannot be distinguished by the page alone.
*/
void mlock_folio(struct folio *folio);
static inline void mlock_vma_folio(struct folio *folio,
- struct vm_area_struct *vma, bool compound)
+ struct vm_area_struct *vma)
{
/*
* The VM_SPECIAL check here serves two purposes.
@@ -611,17 +666,24 @@ static inline void mlock_vma_folio(struct folio *folio,
* file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may
* still be set while VM_SPECIAL bits are added: so ignore it then.
*/
- if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
- (compound || !folio_test_large(folio)))
+ if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED))
mlock_folio(folio);
}
void munlock_folio(struct folio *folio);
static inline void munlock_vma_folio(struct folio *folio,
- struct vm_area_struct *vma, bool compound)
+ struct vm_area_struct *vma)
{
- if (unlikely(vma->vm_flags & VM_LOCKED) &&
- (compound || !folio_test_large(folio)))
+ /*
+ * munlock if the function is called. Ideally, we should only
+ * do munlock if any page of folio is unmapped from VMA and
+ * cause folio not fully mapped to VMA.
+ *
+ * But it's not easy to confirm that's the situation. So we
+ * always munlock the folio and page reclaim will correct it
+ * if it's wrong.
+ */
+ if (unlikely(vma->vm_flags & VM_LOCKED))
munlock_folio(folio);
}
@@ -930,7 +992,7 @@ void vunmap_range_noflush(unsigned long start, unsigned long end);
void __vunmap_range_noflush(unsigned long start, unsigned long end);
-int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
unsigned long addr, int page_nid, int *flags);
void free_zone_device_page(struct page *page);
@@ -949,6 +1011,13 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmd,
unsigned int flags);
+/*
+ * mm/mmap.c
+ */
+struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
+ struct vm_area_struct *vma,
+ unsigned long delta);
+
enum {
/* mark page accessed */
FOLL_TOUCH = 1 << 16,
@@ -964,6 +1033,9 @@ enum {
FOLL_UNLOCKABLE = 1 << 21,
};
+#define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \
+ FOLL_FAST_ONLY | FOLL_UNLOCKABLE)
+
/*
* Indicates for which pages that are write-protected in the page table,
* whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the
@@ -1154,4 +1226,57 @@ struct vma_prepare {
struct vm_area_struct *remove;
struct vm_area_struct *remove2;
};
+
+void __meminit __init_single_page(struct page *page, unsigned long pfn,
+ unsigned long zone, int nid);
+
+/* shrinker related functions */
+unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
+ int priority);
+
+#ifdef CONFIG_SHRINKER_DEBUG
+static inline __printf(2, 0) int shrinker_debugfs_name_alloc(
+ struct shrinker *shrinker, const char *fmt, va_list ap)
+{
+ shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
+
+ return shrinker->name ? 0 : -ENOMEM;
+}
+
+static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
+{
+ kfree_const(shrinker->name);
+ shrinker->name = NULL;
+}
+
+extern int shrinker_debugfs_add(struct shrinker *shrinker);
+extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
+ int *debugfs_id);
+extern void shrinker_debugfs_remove(struct dentry *debugfs_entry,
+ int debugfs_id);
+#else /* CONFIG_SHRINKER_DEBUG */
+static inline int shrinker_debugfs_add(struct shrinker *shrinker)
+{
+ return 0;
+}
+static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker,
+ const char *fmt, va_list ap)
+{
+ return 0;
+}
+static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
+{
+}
+static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
+ int *debugfs_id)
+{
+ *debugfs_id = -1;
+ return NULL;
+}
+static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
+ int debugfs_id)
+{
+}
+#endif /* CONFIG_SHRINKER_DEBUG */
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index d37831b8511c..8b06bab5c406 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -562,7 +562,6 @@ void kasan_restore_multi_shot(bool enabled);
* code. Declared here to avoid warnings about missing declarations.
*/
-asmlinkage void kasan_unpoison_task_stack_below(const void *watermark);
void __asan_register_globals(void *globals, ssize_t size);
void __asan_unregister_globals(void *globals, ssize_t size);
void __asan_handle_no_return(void);
diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
index b61cc6a42541..8281eb42464b 100644
--- a/mm/kasan/kasan_test.c
+++ b/mm/kasan/kasan_test.c
@@ -5,7 +5,7 @@
* Author: Andrey Ryabinin <a.ryabinin@samsung.com>
*/
-#define pr_fmt(fmt) "kasan_test: " fmt
+#define pr_fmt(fmt) "kasan: test: " fmt
#include <kunit/test.h>
#include <linux/bitops.h>
@@ -91,10 +91,11 @@ static void kasan_test_exit(struct kunit *test)
}
/**
- * KUNIT_EXPECT_KASAN_FAIL() - check that the executed expression produces a
- * KASAN report; causes a test failure otherwise. This relies on a KUnit
- * resource named "kasan_status". Do not use this name for KUnit resources
- * outside of KASAN tests.
+ * KUNIT_EXPECT_KASAN_FAIL - check that the executed expression produces a
+ * KASAN report; causes a KUnit test failure otherwise.
+ *
+ * @test: Currently executing KUnit test.
+ * @expression: Expression that must produce a KASAN report.
*
* For hardware tag-based KASAN, when a synchronous tag fault happens, tag
* checking is auto-disabled. When this happens, this test handler reenables
@@ -1097,11 +1098,9 @@ static void kasan_bitops_test_and_modify(struct kunit *test, int nr, void *addr)
KUNIT_EXPECT_KASAN_FAIL(test, test_and_change_bit(nr, addr));
KUNIT_EXPECT_KASAN_FAIL(test, __test_and_change_bit(nr, addr));
KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = test_bit(nr, addr));
-
-#if defined(clear_bit_unlock_is_negative_byte)
- KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result =
- clear_bit_unlock_is_negative_byte(nr, addr));
-#endif
+ if (nr < 7)
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result =
+ xor_unlock_is_negative_byte(1 << nr, addr));
}
static void kasan_bitops_generic(struct kunit *test)
diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c
index 7be7bed456ef..8b7b3ea2c74e 100644
--- a/mm/kasan/kasan_test_module.c
+++ b/mm/kasan/kasan_test_module.c
@@ -5,7 +5,7 @@
* Author: Andrey Ryabinin <a.ryabinin@samsung.com>
*/
-#define pr_fmt(fmt) "kasan test: %s " fmt, __func__
+#define pr_fmt(fmt) "kasan: test: " fmt
#include <linux/mman.h>
#include <linux/module.h>
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 152dca73f398..ca4529156735 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -8,6 +8,8 @@
* Based on code by Dmitry Chernenkov.
*/
+#define pr_fmt(fmt) "kasan: " fmt
+
#include <linux/gfp.h>
#include <linux/hash.h>
#include <linux/kernel.h>
@@ -414,7 +416,7 @@ static int __init kasan_cpu_quarantine_init(void)
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/kasan:online",
kasan_cpu_online, kasan_cpu_offline);
if (ret < 0)
- pr_err("kasan cpu quarantine register failed [%d]\n", ret);
+ pr_err("cpu quarantine register failed [%d]\n", ret);
return ret;
}
late_initcall(kasan_cpu_quarantine_init);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index ca4b6ff080a6..e77facb62900 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -538,7 +538,7 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty
start_report(&flags, true);
- memset(&info, 0, sizeof(info));
+ __memset(&info, 0, sizeof(info));
info.type = type;
info.access_addr = ptr;
info.access_size = 0;
@@ -576,7 +576,7 @@ bool kasan_report(const void *addr, size_t size, bool is_write,
start_report(&irq_flags, true);
- memset(&info, 0, sizeof(info));
+ __memset(&info, 0, sizeof(info));
info.type = KASAN_REPORT_ACCESS;
info.access_addr = addr;
info.access_size = size;
@@ -621,7 +621,7 @@ void kasan_report_async(void)
}
#endif /* CONFIG_KASAN_HW_TAGS */
-#ifdef CONFIG_KASAN_INLINE
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
/*
* With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high
* canonical half of the address space) cause out-of-bounds shadow memory reads
diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c
index 51a1e8a8877f..99cbcd73cff7 100644
--- a/mm/kasan/report_generic.c
+++ b/mm/kasan/report_generic.c
@@ -220,7 +220,7 @@ static bool __must_check tokenize_frame_descr(const char **frame_descr,
const size_t tok_len = sep - *frame_descr;
if (tok_len + 1 > max_tok_len) {
- pr_err("KASAN internal error: frame description too long: %s\n",
+ pr_err("internal error: frame description too long: %s\n",
*frame_descr);
return false;
}
@@ -233,7 +233,7 @@ static bool __must_check tokenize_frame_descr(const char **frame_descr,
*frame_descr = sep + 1;
if (value != NULL && kstrtoul(token, 10, value)) {
- pr_err("KASAN internal error: not a valid number: %s\n", token);
+ pr_err("internal error: not a valid number: %s\n", token);
return false;
}
@@ -323,7 +323,7 @@ static bool __must_check get_address_stack_frame_info(const void *addr,
frame = (const unsigned long *)(mem_ptr + KASAN_GRANULE_SIZE);
if (frame[0] != KASAN_CURRENT_STACK_FRAME_MAGIC) {
- pr_err("KASAN internal error: frame info validation failed; invalid marker: %lu\n",
+ pr_err("internal error: frame has invalid marker: %lu\n",
frame[0]);
return false;
}
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index dd772f9d0f08..d687f09a7ae3 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -324,7 +324,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
if (!page)
return -ENOMEM;
- memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
+ __memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
spin_lock(&init_mm.page_table_lock);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 88433cc25d8a..064654717843 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -91,7 +91,7 @@ static unsigned int khugepaged_max_ptes_shared __read_mostly;
#define MM_SLOTS_HASH_BITS 10
static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
-static struct kmem_cache *mm_slot_cache __read_mostly;
+static struct kmem_cache *mm_slot_cache __ro_after_init;
struct collapse_control {
bool is_khugepaged;
@@ -524,15 +524,15 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
}
}
-static bool is_refcount_suitable(struct page *page)
+static bool is_refcount_suitable(struct folio *folio)
{
int expected_refcount;
- expected_refcount = total_mapcount(page);
- if (PageSwapCache(page))
- expected_refcount += compound_nr(page);
+ expected_refcount = folio_mapcount(folio);
+ if (folio_test_swapcache(folio))
+ expected_refcount += folio_nr_pages(folio);
- return page_count(page) == expected_refcount;
+ return folio_ref_count(folio) == expected_refcount;
}
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
@@ -542,6 +542,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
struct list_head *compound_pagelist)
{
struct page *page = NULL;
+ struct folio *folio = NULL;
pte_t *_pte;
int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
bool writable = false;
@@ -576,7 +577,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
- VM_BUG_ON_PAGE(!PageAnon(page), page);
+ folio = page_folio(page);
+ VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
if (page_mapcount(page) > 1) {
++shared;
@@ -588,16 +590,15 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
}
}
- if (PageCompound(page)) {
- struct page *p;
- page = compound_head(page);
+ if (folio_test_large(folio)) {
+ struct folio *f;
/*
* Check if we have dealt with the compound page
* already
*/
- list_for_each_entry(p, compound_pagelist, lru) {
- if (page == p)
+ list_for_each_entry(f, compound_pagelist, lru) {
+ if (folio == f)
goto next;
}
}
@@ -608,7 +609,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* is needed to serialize against split_huge_page
* when invoked from the VM.
*/
- if (!trylock_page(page)) {
+ if (!folio_trylock(folio)) {
result = SCAN_PAGE_LOCK;
goto out;
}
@@ -624,8 +625,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* but not from this process. The other process cannot write to
* the page, only trigger CoW.
*/
- if (!is_refcount_suitable(page)) {
- unlock_page(page);
+ if (!is_refcount_suitable(folio)) {
+ folio_unlock(folio);
result = SCAN_PAGE_COUNT;
goto out;
}
@@ -634,27 +635,27 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* Isolate the page to avoid collapsing an hugepage
* currently in use by the VM.
*/
- if (!isolate_lru_page(page)) {
- unlock_page(page);
+ if (!folio_isolate_lru(folio)) {
+ folio_unlock(folio);
result = SCAN_DEL_PAGE_LRU;
goto out;
}
- mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_is_file_lru(page),
- compound_nr(page));
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageLRU(page), page);
+ node_stat_mod_folio(folio,
+ NR_ISOLATED_ANON + folio_is_file_lru(folio),
+ folio_nr_pages(folio));
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- if (PageCompound(page))
- list_add_tail(&page->lru, compound_pagelist);
+ if (folio_test_large(folio))
+ list_add_tail(&folio->lru, compound_pagelist);
next:
/*
* If collapse was initiated by khugepaged, check that there is
* enough young pte to justify collapsing the page
*/
if (cc->is_khugepaged &&
- (pte_young(pteval) || page_is_young(page) ||
- PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+ (pte_young(pteval) || folio_test_young(folio) ||
+ folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
address)))
referenced++;
@@ -668,13 +669,13 @@ next:
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
- trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
referenced, writable, result);
return result;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
- trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
referenced, writable, result);
return result;
}
@@ -887,16 +888,16 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc)
}
#endif
-static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node,
+static bool hpage_collapse_alloc_folio(struct folio **folio, gfp_t gfp, int node,
nodemask_t *nmask)
{
- *hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask);
- if (unlikely(!*hpage)) {
+ *folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, nmask);
+
+ if (unlikely(!*folio)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
return false;
}
- folio_prep_large_rmappable((struct folio *)*hpage);
count_vm_event(THP_COLLAPSE_ALLOC);
return true;
}
@@ -1063,17 +1064,20 @@ static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
int node = hpage_collapse_find_target_node(cc);
struct folio *folio;
- if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask))
+ if (!hpage_collapse_alloc_folio(&folio, gfp, node, &cc->alloc_nmask)) {
+ *hpage = NULL;
return SCAN_ALLOC_HUGE_PAGE_FAIL;
+ }
- folio = page_folio(*hpage);
if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
folio_put(folio);
*hpage = NULL;
return SCAN_CGROUP_CHARGE_FAIL;
}
- count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);
+ count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
+
+ *hpage = folio_page(folio, 0);
return SCAN_SUCCEED;
}
@@ -1247,6 +1251,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
int result = SCAN_FAIL, referenced = 0;
int none_or_zero = 0, shared = 0;
struct page *page = NULL;
+ struct folio *folio = NULL;
unsigned long _address;
spinlock_t *ptl;
int node = NUMA_NO_NODE, unmapped = 0;
@@ -1333,29 +1338,28 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
}
}
- page = compound_head(page);
-
+ folio = page_folio(page);
/*
* Record which node the original page is from and save this
* information to cc->node_load[].
* Khugepaged will allocate hugepage from the node has the max
* hit record.
*/
- node = page_to_nid(page);
+ node = folio_nid(folio);
if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
goto out_unmap;
}
cc->node_load[node]++;
- if (!PageLRU(page)) {
+ if (!folio_test_lru(folio)) {
result = SCAN_PAGE_LRU;
goto out_unmap;
}
- if (PageLocked(page)) {
+ if (folio_test_locked(folio)) {
result = SCAN_PAGE_LOCK;
goto out_unmap;
}
- if (!PageAnon(page)) {
+ if (!folio_test_anon(folio)) {
result = SCAN_PAGE_ANON;
goto out_unmap;
}
@@ -1370,7 +1374,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
* has excessive GUP pins (i.e. 512). Anyway the same check
* will be done again later the risk seems low.
*/
- if (!is_refcount_suitable(page)) {
+ if (!is_refcount_suitable(folio)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
@@ -1380,8 +1384,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
* enough young pte to justify collapsing the page
*/
if (cc->is_khugepaged &&
- (pte_young(pteval) || page_is_young(page) ||
- PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+ (pte_young(pteval) || folio_test_young(folio) ||
+ folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
address)))
referenced++;
}
@@ -1403,7 +1407,7 @@ out_unmap:
*mmap_locked = false;
}
out:
- trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
+ trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
none_or_zero, result, unmapped);
return result;
}
@@ -1473,7 +1477,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
bool notified = false;
unsigned long haddr = addr & HPAGE_PMD_MASK;
struct vm_area_struct *vma = vma_lookup(mm, haddr);
- struct page *hpage;
+ struct folio *folio;
pte_t *start_pte, *pte;
pmd_t *pmd, pgt_pmd;
spinlock_t *pml = NULL, *ptl;
@@ -1506,19 +1510,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
if (userfaultfd_wp(vma))
return SCAN_PTE_UFFD_WP;
- hpage = find_lock_page(vma->vm_file->f_mapping,
+ folio = filemap_lock_folio(vma->vm_file->f_mapping,
linear_page_index(vma, haddr));
- if (!hpage)
+ if (IS_ERR(folio))
return SCAN_PAGE_NULL;
- if (!PageHead(hpage)) {
- result = SCAN_FAIL;
- goto drop_hpage;
- }
-
- if (compound_order(hpage) != HPAGE_PMD_ORDER) {
+ if (folio_order(folio) != HPAGE_PMD_ORDER) {
result = SCAN_PAGE_COMPOUND;
- goto drop_hpage;
+ goto drop_folio;
}
result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
@@ -1532,13 +1531,13 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
*/
goto maybe_install_pmd;
default:
- goto drop_hpage;
+ goto drop_folio;
}
result = SCAN_FAIL;
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
if (!start_pte) /* mmap_lock + page lock should prevent this */
- goto drop_hpage;
+ goto drop_folio;
/* step 1: check all mapped PTEs are to the right huge page */
for (i = 0, addr = haddr, pte = start_pte;
@@ -1563,7 +1562,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
* Note that uprobe, debugger, or MAP_PRIVATE may change the
* page table, but the new page will not be a subpage of hpage.
*/
- if (hpage + i != page)
+ if (folio_page(folio, i) != page)
goto abort;
}
@@ -1578,7 +1577,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
* page_table_lock) ptl nests inside pml. The less time we hold pml,
* the better; but userfaultfd's mfill_atomic_pte() on a private VMA
* inserts a valid as-if-COWed PTE without even looking up page cache.
- * So page lock of hpage does not protect from it, so we must not drop
+ * So page lock of folio does not protect from it, so we must not drop
* ptl before pgt_pmd is removed, so uffd private needs pml taken now.
*/
if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
@@ -1602,7 +1601,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
continue;
/*
* We dropped ptl after the first scan, to do the mmu_notifier:
- * page lock stops more PTEs of the hpage being faulted in, but
+ * page lock stops more PTEs of the folio being faulted in, but
* does not stop write faults COWing anon copies from existing
* PTEs; and does not stop those being swapped out or migrated.
*/
@@ -1611,7 +1610,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto abort;
}
page = vm_normal_page(vma, addr, ptent);
- if (hpage + i != page)
+ if (folio_page(folio, i) != page)
goto abort;
/*
@@ -1630,8 +1629,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
/* step 3: set proper refcount and mm_counters. */
if (nr_ptes) {
- page_ref_sub(hpage, nr_ptes);
- add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
+ folio_ref_sub(folio, nr_ptes);
+ add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes);
}
/* step 4: remove empty page table */
@@ -1655,14 +1654,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
maybe_install_pmd:
/* step 5: install pmd entry */
result = install_pmd
- ? set_huge_pmd(vma, haddr, pmd, hpage)
+ ? set_huge_pmd(vma, haddr, pmd, &folio->page)
: SCAN_SUCCEED;
- goto drop_hpage;
+ goto drop_folio;
abort:
if (nr_ptes) {
flush_tlb_mm(mm);
- page_ref_sub(hpage, nr_ptes);
- add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
+ folio_ref_sub(folio, nr_ptes);
+ add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes);
}
if (start_pte)
pte_unmap_unlock(start_pte, ptl);
@@ -1670,9 +1669,9 @@ abort:
spin_unlock(pml);
if (notified)
mmu_notifier_invalidate_range_end(&range);
-drop_hpage:
- unlock_page(hpage);
- put_page(hpage);
+drop_folio:
+ folio_unlock(folio);
+ folio_put(folio);
return result;
}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 54c2c90d3abc..1eacca03bedd 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -583,6 +583,19 @@ static void __remove_object(struct kmemleak_object *object)
object->del_state |= DELSTATE_REMOVED;
}
+static struct kmemleak_object *__find_and_remove_object(unsigned long ptr,
+ int alias,
+ bool is_phys)
+{
+ struct kmemleak_object *object;
+
+ object = __lookup_object(ptr, alias, is_phys);
+ if (object)
+ __remove_object(object);
+
+ return object;
+}
+
/*
* Look up an object in the object search tree and remove it from both
* object_tree_root (or object_phys_tree_root) and object_list. The
@@ -596,9 +609,7 @@ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int ali
struct kmemleak_object *object;
raw_spin_lock_irqsave(&kmemleak_lock, flags);
- object = __lookup_object(ptr, alias, is_phys);
- if (object)
- __remove_object(object);
+ object = __find_and_remove_object(ptr, alias, is_phys);
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
return object;
@@ -623,27 +634,28 @@ static noinline depot_stack_handle_t set_track_prepare(void)
return trace_handle;
}
-/*
- * Create the metadata (struct kmemleak_object) corresponding to an allocated
- * memory block and add it to the object_list and object_tree_root (or
- * object_phys_tree_root).
- */
-static void __create_object(unsigned long ptr, size_t size,
- int min_count, gfp_t gfp, bool is_phys)
+static struct kmemleak_object *__alloc_object(gfp_t gfp)
{
- unsigned long flags;
- struct kmemleak_object *object, *parent;
- struct rb_node **link, *rb_parent;
- unsigned long untagged_ptr;
- unsigned long untagged_objp;
+ struct kmemleak_object *object;
object = mem_pool_alloc(gfp);
if (!object) {
pr_warn("Cannot allocate a kmemleak_object structure\n");
kmemleak_disable();
- return;
}
+ return object;
+}
+
+static int __link_object(struct kmemleak_object *object, unsigned long ptr,
+ size_t size, int min_count, bool is_phys)
+{
+
+ struct kmemleak_object *parent;
+ struct rb_node **link, *rb_parent;
+ unsigned long untagged_ptr;
+ unsigned long untagged_objp;
+
INIT_LIST_HEAD(&object->object_list);
INIT_LIST_HEAD(&object->gray_list);
INIT_HLIST_HEAD(&object->area_list);
@@ -680,8 +692,6 @@ static void __create_object(unsigned long ptr, size_t size,
/* kernel backtrace */
object->trace_handle = set_track_prepare();
- raw_spin_lock_irqsave(&kmemleak_lock, flags);
-
untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
/*
* Only update min_addr and max_addr with object
@@ -710,16 +720,38 @@ static void __create_object(unsigned long ptr, size_t size,
* be freed while the kmemleak_lock is held.
*/
dump_object_info(parent);
- kmem_cache_free(object_cache, object);
- goto out;
+ return -EEXIST;
}
}
rb_link_node(&object->rb_node, rb_parent, link);
rb_insert_color(&object->rb_node, is_phys ? &object_phys_tree_root :
&object_tree_root);
list_add_tail_rcu(&object->object_list, &object_list);
-out:
+
+ return 0;
+}
+
+/*
+ * Create the metadata (struct kmemleak_object) corresponding to an allocated
+ * memory block and add it to the object_list and object_tree_root (or
+ * object_phys_tree_root).
+ */
+static void __create_object(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp, bool is_phys)
+{
+ struct kmemleak_object *object;
+ unsigned long flags;
+ int ret;
+
+ object = __alloc_object(gfp);
+ if (!object)
+ return;
+
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
+ ret = __link_object(object, ptr, size, min_count, is_phys);
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
+ if (ret)
+ mem_pool_free(object);
}
/* Create kmemleak object which allocated with virtual address. */
@@ -782,16 +814,25 @@ static void delete_object_full(unsigned long ptr)
*/
static void delete_object_part(unsigned long ptr, size_t size, bool is_phys)
{
- struct kmemleak_object *object;
- unsigned long start, end;
+ struct kmemleak_object *object, *object_l, *object_r;
+ unsigned long start, end, flags;
+
+ object_l = __alloc_object(GFP_KERNEL);
+ if (!object_l)
+ return;
- object = find_and_remove_object(ptr, 1, is_phys);
+ object_r = __alloc_object(GFP_KERNEL);
+ if (!object_r)
+ goto out;
+
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
+ object = __find_and_remove_object(ptr, 1, is_phys);
if (!object) {
#ifdef DEBUG
kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
ptr, size);
#endif
- return;
+ goto unlock;
}
/*
@@ -801,14 +842,25 @@ static void delete_object_part(unsigned long ptr, size_t size, bool is_phys)
*/
start = object->pointer;
end = object->pointer + object->size;
- if (ptr > start)
- __create_object(start, ptr - start, object->min_count,
- GFP_KERNEL, is_phys);
- if (ptr + size < end)
- __create_object(ptr + size, end - ptr - size, object->min_count,
- GFP_KERNEL, is_phys);
+ if ((ptr > start) &&
+ !__link_object(object_l, start, ptr - start,
+ object->min_count, is_phys))
+ object_l = NULL;
+ if ((ptr + size < end) &&
+ !__link_object(object_r, ptr + size, end - ptr - size,
+ object->min_count, is_phys))
+ object_r = NULL;
+
+unlock:
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
+ if (object)
+ __delete_object(object);
- __delete_object(object);
+out:
+ if (object_l)
+ mem_pool_free(object_l);
+ if (object_r)
+ mem_pool_free(object_r);
}
static void __paint_it(struct kmemleak_object *object, int color)
@@ -975,7 +1027,7 @@ static void object_no_scan(unsigned long ptr)
void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
gfp_t gfp)
{
- pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
+ pr_debug("%s(0x%px, %zu, %d)\n", __func__, ptr, size, min_count);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
create_object((unsigned long)ptr, size, min_count, gfp);
@@ -996,7 +1048,7 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
{
unsigned int cpu;
- pr_debug("%s(0x%p, %zu)\n", __func__, ptr, size);
+ pr_debug("%s(0x%px, %zu)\n", __func__, ptr, size);
/*
* Percpu allocations are only scanned and not reported as leaks
@@ -1020,7 +1072,7 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
*/
void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp)
{
- pr_debug("%s(0x%p, %zu)\n", __func__, area, size);
+ pr_debug("%s(0x%px, %zu)\n", __func__, area, size);
/*
* A min_count = 2 is needed because vm_struct contains a reference to
@@ -1043,7 +1095,7 @@ EXPORT_SYMBOL_GPL(kmemleak_vmalloc);
*/
void __ref kmemleak_free(const void *ptr)
{
- pr_debug("%s(0x%p)\n", __func__, ptr);
+ pr_debug("%s(0x%px)\n", __func__, ptr);
if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
delete_object_full((unsigned long)ptr);
@@ -1061,7 +1113,7 @@ EXPORT_SYMBOL_GPL(kmemleak_free);
*/
void __ref kmemleak_free_part(const void *ptr, size_t size)
{
- pr_debug("%s(0x%p)\n", __func__, ptr);
+ pr_debug("%s(0x%px)\n", __func__, ptr);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
delete_object_part((unsigned long)ptr, size, false);
@@ -1079,7 +1131,7 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr)
{
unsigned int cpu;
- pr_debug("%s(0x%p)\n", __func__, ptr);
+ pr_debug("%s(0x%px)\n", __func__, ptr);
if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
for_each_possible_cpu(cpu)
@@ -1100,7 +1152,7 @@ void __ref kmemleak_update_trace(const void *ptr)
struct kmemleak_object *object;
unsigned long flags;
- pr_debug("%s(0x%p)\n", __func__, ptr);
+ pr_debug("%s(0x%px)\n", __func__, ptr);
if (!kmemleak_enabled || IS_ERR_OR_NULL(ptr))
return;
@@ -1131,7 +1183,7 @@ EXPORT_SYMBOL(kmemleak_update_trace);
*/
void __ref kmemleak_not_leak(const void *ptr)
{
- pr_debug("%s(0x%p)\n", __func__, ptr);
+ pr_debug("%s(0x%px)\n", __func__, ptr);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
make_gray_object((unsigned long)ptr);
@@ -1149,7 +1201,7 @@ EXPORT_SYMBOL(kmemleak_not_leak);
*/
void __ref kmemleak_ignore(const void *ptr)
{
- pr_debug("%s(0x%p)\n", __func__, ptr);
+ pr_debug("%s(0x%px)\n", __func__, ptr);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
make_black_object((unsigned long)ptr, false);
@@ -1169,7 +1221,7 @@ EXPORT_SYMBOL(kmemleak_ignore);
*/
void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
{
- pr_debug("%s(0x%p)\n", __func__, ptr);
+ pr_debug("%s(0x%px)\n", __func__, ptr);
if (kmemleak_enabled && ptr && size && !IS_ERR(ptr))
add_scan_area((unsigned long)ptr, size, gfp);
@@ -1187,7 +1239,7 @@ EXPORT_SYMBOL(kmemleak_scan_area);
*/
void __ref kmemleak_no_scan(const void *ptr)
{
- pr_debug("%s(0x%p)\n", __func__, ptr);
+ pr_debug("%s(0x%px)\n", __func__, ptr);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
object_no_scan((unsigned long)ptr);
@@ -1203,7 +1255,7 @@ EXPORT_SYMBOL(kmemleak_no_scan);
*/
void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp)
{
- pr_debug("%s(0x%pa, %zu)\n", __func__, &phys, size);
+ pr_debug("%s(0x%px, %zu)\n", __func__, &phys, size);
if (kmemleak_enabled)
/*
@@ -1223,7 +1275,7 @@ EXPORT_SYMBOL(kmemleak_alloc_phys);
*/
void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size)
{
- pr_debug("%s(0x%pa)\n", __func__, &phys);
+ pr_debug("%s(0x%px)\n", __func__, &phys);
if (kmemleak_enabled)
delete_object_part((unsigned long)phys, size, true);
@@ -1237,7 +1289,7 @@ EXPORT_SYMBOL(kmemleak_free_part_phys);
*/
void __ref kmemleak_ignore_phys(phys_addr_t phys)
{
- pr_debug("%s(0x%pa)\n", __func__, &phys);
+ pr_debug("%s(0x%px)\n", __func__, &phys);
if (kmemleak_enabled)
make_black_object((unsigned long)phys, true);
diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c
index 3adb4c1d3b19..c19f47af0424 100644
--- a/mm/kmsan/core.c
+++ b/mm/kmsan/core.c
@@ -83,131 +83,66 @@ depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags,
/* Copy the metadata following the memmove() behavior. */
void kmsan_internal_memmove_metadata(void *dst, void *src, size_t n)
{
+ depot_stack_handle_t prev_old_origin = 0, prev_new_origin = 0;
+ int i, iter, step, src_off, dst_off, oiter_src, oiter_dst;
depot_stack_handle_t old_origin = 0, new_origin = 0;
- int src_slots, dst_slots, i, iter, step, skip_bits;
depot_stack_handle_t *origin_src, *origin_dst;
- void *shadow_src, *shadow_dst;
- u32 *align_shadow_src, shadow;
+ u8 *shadow_src, *shadow_dst;
+ u32 *align_shadow_dst;
bool backwards;
shadow_dst = kmsan_get_metadata(dst, KMSAN_META_SHADOW);
if (!shadow_dst)
return;
KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(dst, n));
+ align_shadow_dst =
+ (u32 *)ALIGN_DOWN((u64)shadow_dst, KMSAN_ORIGIN_SIZE);
shadow_src = kmsan_get_metadata(src, KMSAN_META_SHADOW);
if (!shadow_src) {
- /*
- * @src is untracked: zero out destination shadow, ignore the
- * origins, we're done.
- */
- __memset(shadow_dst, 0, n);
+ /* @src is untracked: mark @dst as initialized. */
+ kmsan_internal_unpoison_memory(dst, n, /*checked*/ false);
return;
}
KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(src, n));
- __memmove(shadow_dst, shadow_src, n);
-
origin_dst = kmsan_get_metadata(dst, KMSAN_META_ORIGIN);
origin_src = kmsan_get_metadata(src, KMSAN_META_ORIGIN);
KMSAN_WARN_ON(!origin_dst || !origin_src);
- src_slots = (ALIGN((u64)src + n, KMSAN_ORIGIN_SIZE) -
- ALIGN_DOWN((u64)src, KMSAN_ORIGIN_SIZE)) /
- KMSAN_ORIGIN_SIZE;
- dst_slots = (ALIGN((u64)dst + n, KMSAN_ORIGIN_SIZE) -
- ALIGN_DOWN((u64)dst, KMSAN_ORIGIN_SIZE)) /
- KMSAN_ORIGIN_SIZE;
- KMSAN_WARN_ON((src_slots < 1) || (dst_slots < 1));
- KMSAN_WARN_ON((src_slots - dst_slots > 1) ||
- (dst_slots - src_slots < -1));
backwards = dst > src;
- i = backwards ? min(src_slots, dst_slots) - 1 : 0;
- iter = backwards ? -1 : 1;
-
- align_shadow_src =
- (u32 *)ALIGN_DOWN((u64)shadow_src, KMSAN_ORIGIN_SIZE);
- for (step = 0; step < min(src_slots, dst_slots); step++, i += iter) {
- KMSAN_WARN_ON(i < 0);
- shadow = align_shadow_src[i];
- if (i == 0) {
- /*
- * If @src isn't aligned on KMSAN_ORIGIN_SIZE, don't
- * look at the first @src % KMSAN_ORIGIN_SIZE bytes
- * of the first shadow slot.
- */
- skip_bits = ((u64)src % KMSAN_ORIGIN_SIZE) * 8;
- shadow = (shadow >> skip_bits) << skip_bits;
+ step = backwards ? -1 : 1;
+ iter = backwards ? n - 1 : 0;
+ src_off = (u64)src % KMSAN_ORIGIN_SIZE;
+ dst_off = (u64)dst % KMSAN_ORIGIN_SIZE;
+
+ /* Copy shadow bytes one by one, updating the origins if necessary. */
+ for (i = 0; i < n; i++, iter += step) {
+ oiter_src = (iter + src_off) / KMSAN_ORIGIN_SIZE;
+ oiter_dst = (iter + dst_off) / KMSAN_ORIGIN_SIZE;
+ if (!shadow_src[iter]) {
+ shadow_dst[iter] = 0;
+ if (!align_shadow_dst[oiter_dst])
+ origin_dst[oiter_dst] = 0;
+ continue;
}
- if (i == src_slots - 1) {
- /*
- * If @src + n isn't aligned on
- * KMSAN_ORIGIN_SIZE, don't look at the last
- * (@src + n) % KMSAN_ORIGIN_SIZE bytes of the
- * last shadow slot.
- */
- skip_bits = (((u64)src + n) % KMSAN_ORIGIN_SIZE) * 8;
- shadow = (shadow << skip_bits) >> skip_bits;
- }
- /*
- * Overwrite the origin only if the corresponding
- * shadow is nonempty.
- */
- if (origin_src[i] && (origin_src[i] != old_origin) && shadow) {
- old_origin = origin_src[i];
- new_origin = kmsan_internal_chain_origin(old_origin);
+ shadow_dst[iter] = shadow_src[iter];
+ old_origin = origin_src[oiter_src];
+ if (old_origin == prev_old_origin)
+ new_origin = prev_new_origin;
+ else {
/*
* kmsan_internal_chain_origin() may return
* NULL, but we don't want to lose the previous
* origin value.
*/
+ new_origin = kmsan_internal_chain_origin(old_origin);
if (!new_origin)
new_origin = old_origin;
}
- if (shadow)
- origin_dst[i] = new_origin;
- else
- origin_dst[i] = 0;
- }
- /*
- * If dst_slots is greater than src_slots (i.e.
- * dst_slots == src_slots + 1), there is an extra origin slot at the
- * beginning or end of the destination buffer, for which we take the
- * origin from the previous slot.
- * This is only done if the part of the source shadow corresponding to
- * slot is non-zero.
- *
- * E.g. if we copy 8 aligned bytes that are marked as uninitialized
- * and have origins o111 and o222, to an unaligned buffer with offset 1,
- * these two origins are copied to three origin slots, so one of then
- * needs to be duplicated, depending on the copy direction (@backwards)
- *
- * src shadow: |uuuu|uuuu|....|
- * src origin: |o111|o222|....|
- *
- * backwards = 0:
- * dst shadow: |.uuu|uuuu|u...|
- * dst origin: |....|o111|o222| - fill the empty slot with o111
- * backwards = 1:
- * dst shadow: |.uuu|uuuu|u...|
- * dst origin: |o111|o222|....| - fill the empty slot with o222
- */
- if (src_slots < dst_slots) {
- if (backwards) {
- shadow = align_shadow_src[src_slots - 1];
- skip_bits = (((u64)dst + n) % KMSAN_ORIGIN_SIZE) * 8;
- shadow = (shadow << skip_bits) >> skip_bits;
- if (shadow)
- /* src_slots > 0, therefore dst_slots is at least 2 */
- origin_dst[dst_slots - 1] =
- origin_dst[dst_slots - 2];
- } else {
- shadow = align_shadow_src[0];
- skip_bits = ((u64)dst % KMSAN_ORIGIN_SIZE) * 8;
- shadow = (shadow >> skip_bits) << skip_bits;
- if (shadow)
- origin_dst[0] = origin_dst[1];
- }
+ origin_dst[oiter_dst] = new_origin;
+ prev_new_origin = new_origin;
+ prev_old_origin = old_origin;
}
}
diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c
index 312989aa2865..07d3a3a5a9c5 100644
--- a/mm/kmsan/kmsan_test.c
+++ b/mm/kmsan/kmsan_test.c
@@ -67,6 +67,17 @@ static bool report_available(void)
return READ_ONCE(observed.available);
}
+/* Reset observed.available, so that the test can trigger another report. */
+static void report_reset(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&observed.lock, flags);
+ WRITE_ONCE(observed.available, false);
+ observed.ignore = false;
+ spin_unlock_irqrestore(&observed.lock, flags);
+}
+
/* Information we expect in a report. */
struct expect_report {
const char *error_type; /* Error type. */
@@ -407,33 +418,25 @@ static void test_printk(struct kunit *test)
KUNIT_EXPECT_TRUE(test, report_matches(&expect));
}
-/*
- * Prevent the compiler from optimizing @var away. Without this, Clang may
- * notice that @var is uninitialized and drop memcpy() calls that use it.
- *
- * There is OPTIMIZER_HIDE_VAR() in linux/compier.h that we cannot use here,
- * because it is implemented as inline assembly receiving @var as a parameter
- * and will enforce a KMSAN check. Same is true for e.g. barrier_data(var).
- */
-#define DO_NOT_OPTIMIZE(var) barrier()
+/* Prevent the compiler from inlining a memcpy() call. */
+static noinline void *memcpy_noinline(volatile void *dst,
+ const volatile void *src, size_t size)
+{
+ return memcpy((void *)dst, (const void *)src, size);
+}
-/*
- * Test case: ensure that memcpy() correctly copies initialized values.
- * Also serves as a regression test to ensure DO_NOT_OPTIMIZE() does not cause
- * extra checks.
- */
+/* Test case: ensure that memcpy() correctly copies initialized values. */
static void test_init_memcpy(struct kunit *test)
{
EXPECTATION_NO_REPORT(expect);
- volatile int src;
- volatile int dst = 0;
+ volatile long long src;
+ volatile long long dst = 0;
- DO_NOT_OPTIMIZE(src);
src = 1;
kunit_info(
test,
"memcpy()ing aligned initialized src to aligned dst (no reports)\n");
- memcpy((void *)&dst, (void *)&src, sizeof(src));
+ memcpy_noinline((void *)&dst, (void *)&src, sizeof(src));
kmsan_check_memory((void *)&dst, sizeof(dst));
KUNIT_EXPECT_TRUE(test, report_matches(&expect));
}
@@ -451,8 +454,7 @@ static void test_memcpy_aligned_to_aligned(struct kunit *test)
kunit_info(
test,
"memcpy()ing aligned uninit src to aligned dst (UMR report)\n");
- DO_NOT_OPTIMIZE(uninit_src);
- memcpy((void *)&dst, (void *)&uninit_src, sizeof(uninit_src));
+ memcpy_noinline((void *)&dst, (void *)&uninit_src, sizeof(uninit_src));
kmsan_check_memory((void *)&dst, sizeof(dst));
KUNIT_EXPECT_TRUE(test, report_matches(&expect));
}
@@ -463,7 +465,7 @@ static void test_memcpy_aligned_to_aligned(struct kunit *test)
*
* Copying aligned 4-byte value to an unaligned one leads to touching two
* aligned 4-byte values. This test case checks that KMSAN correctly reports an
- * error on the first of the two values.
+ * error on the mentioned two values.
*/
static void test_memcpy_aligned_to_unaligned(struct kunit *test)
{
@@ -474,33 +476,65 @@ static void test_memcpy_aligned_to_unaligned(struct kunit *test)
kunit_info(
test,
"memcpy()ing aligned uninit src to unaligned dst (UMR report)\n");
- DO_NOT_OPTIMIZE(uninit_src);
- memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src));
+ kmsan_check_memory((void *)&uninit_src, sizeof(uninit_src));
+ memcpy_noinline((void *)&dst[1], (void *)&uninit_src,
+ sizeof(uninit_src));
kmsan_check_memory((void *)dst, 4);
KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+ report_reset();
+ kmsan_check_memory((void *)&dst[4], sizeof(uninit_src));
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
}
/*
- * Test case: ensure that memcpy() correctly copies uninitialized values between
- * aligned `src` and unaligned `dst`.
+ * Test case: ensure that origin slots do not accidentally get overwritten with
+ * zeroes during memcpy().
*
- * Copying aligned 4-byte value to an unaligned one leads to touching two
- * aligned 4-byte values. This test case checks that KMSAN correctly reports an
- * error on the second of the two values.
+ * Previously, when copying memory from an aligned buffer to an unaligned one,
+ * if there were zero origins corresponding to zero shadow values in the source
+ * buffer, they could have ended up being copied to nonzero shadow values in the
+ * destination buffer:
+ *
+ * memcpy(0xffff888080a00000, 0xffff888080900002, 8)
+ *
+ * src (0xffff888080900002): ..xx .... xx..
+ * src origins: o111 0000 o222
+ * dst (0xffff888080a00000): xx.. ..xx
+ * dst origins: o111 0000
+ * (or 0000 o222)
+ *
+ * (here . stands for an initialized byte, and x for an uninitialized one.
+ *
+ * Ensure that this does not happen anymore, and for both destination bytes
+ * the origin is nonzero (i.e. KMSAN reports an error).
*/
-static void test_memcpy_aligned_to_unaligned2(struct kunit *test)
+static void test_memcpy_initialized_gap(struct kunit *test)
{
- EXPECTATION_UNINIT_VALUE_FN(expect,
- "test_memcpy_aligned_to_unaligned2");
- volatile int uninit_src;
+ EXPECTATION_UNINIT_VALUE_FN(expect, "test_memcpy_initialized_gap");
+ volatile char uninit_src[12];
volatile char dst[8] = { 0 };
kunit_info(
test,
- "memcpy()ing aligned uninit src to unaligned dst - part 2 (UMR report)\n");
- DO_NOT_OPTIMIZE(uninit_src);
- memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src));
- kmsan_check_memory((void *)&dst[4], sizeof(uninit_src));
+ "unaligned 4-byte initialized value gets a nonzero origin after memcpy() - (2 UMR reports)\n");
+
+ uninit_src[0] = 42;
+ uninit_src[1] = 42;
+ uninit_src[4] = 42;
+ uninit_src[5] = 42;
+ uninit_src[6] = 42;
+ uninit_src[7] = 42;
+ uninit_src[10] = 42;
+ uninit_src[11] = 42;
+ memcpy_noinline((void *)&dst[0], (void *)&uninit_src[2], 8);
+
+ kmsan_check_memory((void *)&dst[0], 4);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+ report_reset();
+ kmsan_check_memory((void *)&dst[2], 4);
+ KUNIT_EXPECT_FALSE(test, report_matches(&expect));
+ report_reset();
+ kmsan_check_memory((void *)&dst[4], 4);
KUNIT_EXPECT_TRUE(test, report_matches(&expect));
}
@@ -513,7 +547,6 @@ static void test_memcpy_aligned_to_unaligned2(struct kunit *test)
\
kunit_info(test, \
"memset" #size "() should initialize memory\n"); \
- DO_NOT_OPTIMIZE(uninit); \
memset##size((uint##size##_t *)&uninit, 0, 1); \
kmsan_check_memory((void *)&uninit, sizeof(uninit)); \
KUNIT_EXPECT_TRUE(test, report_matches(&expect)); \
@@ -598,7 +631,7 @@ static struct kunit_case kmsan_test_cases[] = {
KUNIT_CASE(test_init_memcpy),
KUNIT_CASE(test_memcpy_aligned_to_aligned),
KUNIT_CASE(test_memcpy_aligned_to_unaligned),
- KUNIT_CASE(test_memcpy_aligned_to_unaligned2),
+ KUNIT_CASE(test_memcpy_initialized_gap),
KUNIT_CASE(test_memset16),
KUNIT_CASE(test_memset32),
KUNIT_CASE(test_memset64),
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
index 87318f9170f1..b9d05aff313e 100644
--- a/mm/kmsan/shadow.c
+++ b/mm/kmsan/shadow.c
@@ -285,12 +285,17 @@ void __init kmsan_init_alloc_meta_for_range(void *start, void *end)
size = PAGE_ALIGN((u64)end - (u64)start);
shadow = memblock_alloc(size, PAGE_SIZE);
origin = memblock_alloc(size, PAGE_SIZE);
+
+ if (!shadow || !origin)
+ panic("%s: Failed to allocate metadata memory for early boot range of size %llu",
+ __func__, size);
+
for (u64 addr = 0; addr < size; addr += PAGE_SIZE) {
page = virt_to_page_or_null((char *)start + addr);
- shadow_p = virt_to_page_or_null((char *)shadow + addr);
+ shadow_p = virt_to_page((char *)shadow + addr);
set_no_shadow_origin_page(shadow_p);
shadow_page_for(page) = shadow_p;
- origin_p = virt_to_page_or_null((char *)origin + addr);
+ origin_p = virt_to_page((char *)origin + addr);
set_no_shadow_origin_page(origin_p);
origin_page_for(page) = origin_p;
}
diff --git a/mm/ksm.c b/mm/ksm.c
index 981af9c72e7a..7efcc68ccc6e 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -56,6 +56,8 @@
#define DO_NUMA(x) do { } while (0)
#endif
+typedef u8 rmap_age_t;
+
/**
* DOC: Overview
*
@@ -193,6 +195,8 @@ struct ksm_stable_node {
* @node: rb node of this rmap_item in the unstable tree
* @head: pointer to stable_node heading this list in the stable tree
* @hlist: link into hlist of rmap_items hanging off that stable_node
+ * @age: number of scan iterations since creation
+ * @remaining_skips: how many scans to skip
*/
struct ksm_rmap_item {
struct ksm_rmap_item *rmap_list;
@@ -205,6 +209,8 @@ struct ksm_rmap_item {
struct mm_struct *mm;
unsigned long address; /* + low bits used for flags below */
unsigned int oldchecksum; /* when unstable */
+ rmap_age_t age;
+ rmap_age_t remaining_skips;
union {
struct rb_node node; /* when node of unstable tree */
struct { /* when listed from stable tree */
@@ -281,9 +287,16 @@ static unsigned int zero_checksum __read_mostly;
/* Whether to merge empty (zeroed) pages with actual zero pages */
static bool ksm_use_zero_pages __read_mostly;
+/* Skip pages that couldn't be de-duplicated previously */
+/* Default to true at least temporarily, for testing */
+static bool ksm_smart_scan = true;
+
/* The number of zero pages which is placed by KSM */
unsigned long ksm_zero_pages;
+/* The number of pages that have been skipped due to "smart scanning" */
+static unsigned long ksm_pages_skipped;
+
#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
@@ -2305,6 +2318,74 @@ static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot,
return rmap_item;
}
+/*
+ * Calculate skip age for the ksm page age. The age determines how often
+ * de-duplicating has already been tried unsuccessfully. If the age is
+ * smaller, the scanning of this page is skipped for less scans.
+ *
+ * @age: rmap_item age of page
+ */
+static unsigned int skip_age(rmap_age_t age)
+{
+ if (age <= 3)
+ return 1;
+ if (age <= 5)
+ return 2;
+ if (age <= 8)
+ return 4;
+
+ return 8;
+}
+
+/*
+ * Determines if a page should be skipped for the current scan.
+ *
+ * @page: page to check
+ * @rmap_item: associated rmap_item of page
+ */
+static bool should_skip_rmap_item(struct page *page,
+ struct ksm_rmap_item *rmap_item)
+{
+ rmap_age_t age;
+
+ if (!ksm_smart_scan)
+ return false;
+
+ /*
+ * Never skip pages that are already KSM; pages cmp_and_merge_page()
+ * will essentially ignore them, but we still have to process them
+ * properly.
+ */
+ if (PageKsm(page))
+ return false;
+
+ age = rmap_item->age;
+ if (age != U8_MAX)
+ rmap_item->age++;
+
+ /*
+ * Smaller ages are not skipped, they need to get a chance to go
+ * through the different phases of the KSM merging.
+ */
+ if (age < 3)
+ return false;
+
+ /*
+ * Are we still allowed to skip? If not, then don't skip it
+ * and determine how much more often we are allowed to skip next.
+ */
+ if (!rmap_item->remaining_skips) {
+ rmap_item->remaining_skips = skip_age(age);
+ return false;
+ }
+
+ /* Skip this page */
+ ksm_pages_skipped++;
+ rmap_item->remaining_skips--;
+ remove_rmap_item_from_tree(rmap_item);
+ return true;
+}
+
static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
{
struct mm_struct *mm;
@@ -2409,6 +2490,10 @@ next_mm:
if (rmap_item) {
ksm_scan.rmap_list =
&rmap_item->rmap_list;
+
+ if (should_skip_rmap_item(*page, rmap_item))
+ goto next_page;
+
ksm_scan.address += PAGE_SIZE;
} else
put_page(*page);
@@ -3383,6 +3468,13 @@ static ssize_t pages_volatile_show(struct kobject *kobj,
}
KSM_ATTR_RO(pages_volatile);
+static ssize_t pages_skipped_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lu\n", ksm_pages_skipped);
+}
+KSM_ATTR_RO(pages_skipped);
+
static ssize_t ksm_zero_pages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -3449,6 +3541,28 @@ static ssize_t full_scans_show(struct kobject *kobj,
}
KSM_ATTR_RO(full_scans);
+static ssize_t smart_scan_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%u\n", ksm_smart_scan);
+}
+
+static ssize_t smart_scan_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ bool value;
+
+ err = kstrtobool(buf, &value);
+ if (err)
+ return -EINVAL;
+
+ ksm_smart_scan = value;
+ return count;
+}
+KSM_ATTR(smart_scan);
+
static struct attribute *ksm_attrs[] = {
&sleep_millisecs_attr.attr,
&pages_to_scan_attr.attr,
@@ -3458,6 +3572,7 @@ static struct attribute *ksm_attrs[] = {
&pages_sharing_attr.attr,
&pages_unshared_attr.attr,
&pages_volatile_attr.attr,
+ &pages_skipped_attr.attr,
&ksm_zero_pages_attr.attr,
&full_scans_attr.attr,
#ifdef CONFIG_NUMA
@@ -3469,6 +3584,7 @@ static struct attribute *ksm_attrs[] = {
&stable_node_chains_prune_millisecs_attr.attr,
&use_zero_pages_attr.attr,
&general_profit_attr.attr,
+ &smart_scan_attr.attr,
NULL,
};
diff --git a/mm/madvise.c b/mm/madvise.c
index 4dded5d27e7e..cf4d694280e9 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -141,7 +141,6 @@ static int madvise_update_vma(struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
int error;
- pgoff_t pgoff;
VMA_ITERATOR(vmi, mm, start);
if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
@@ -149,30 +148,13 @@ static int madvise_update_vma(struct vm_area_struct *vma,
return 0;
}
- pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- *prev = vma_merge(&vmi, mm, *prev, start, end, new_flags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, anon_name);
- if (*prev) {
- vma = *prev;
- goto success;
- }
+ vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
+ anon_name);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
*prev = vma;
- if (start != vma->vm_start) {
- error = split_vma(&vmi, vma, start, 1);
- if (error)
- return error;
- }
-
- if (end != vma->vm_end) {
- error = split_vma(&vmi, vma, end, 0);
- if (error)
- return error;
- }
-
-success:
/* vm_flags is protected by the mmap_lock held in write mode. */
vma_start_write(vma);
vm_flags_reset(vma, new_flags);
@@ -746,11 +728,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
folio_mark_lazyfree(folio);
}
- if (nr_swap) {
- if (current->mm == mm)
- sync_mm_rss(mm);
+ if (nr_swap)
add_mm_counter(mm, MM_SWAPENTS, nr_swap);
- }
if (start_pte) {
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(start_pte, ptl);
@@ -991,7 +970,7 @@ static long madvise_remove(struct vm_area_struct *vma,
return -EINVAL;
}
- if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
+ if (!vma_is_shared_maywrite(vma))
return -EACCES;
offset = (loff_t)(start - vma->vm_start)
diff --git a/mm/memblock.c b/mm/memblock.c
index 0863222af4a4..5a88d6d24d79 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -892,6 +892,7 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
/**
* memblock_setclr_flag - set or clear flag for a memory region
+ * @type: memblock type to set/clear flag for
* @base: base address of the region
* @size: size of the region
* @set: set or clear the flag
@@ -901,10 +902,9 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
*
* Return: 0 on success, -errno on failure.
*/
-static int __init_memblock memblock_setclr_flag(phys_addr_t base,
- phys_addr_t size, int set, int flag)
+static int __init_memblock memblock_setclr_flag(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size, int set, int flag)
{
- struct memblock_type *type = &memblock.memory;
int i, ret, start_rgn, end_rgn;
ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
@@ -933,7 +933,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base,
*/
int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
{
- return memblock_setclr_flag(base, size, 1, MEMBLOCK_HOTPLUG);
+ return memblock_setclr_flag(&memblock.memory, base, size, 1, MEMBLOCK_HOTPLUG);
}
/**
@@ -945,7 +945,7 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
*/
int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
{
- return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG);
+ return memblock_setclr_flag(&memblock.memory, base, size, 0, MEMBLOCK_HOTPLUG);
}
/**
@@ -962,7 +962,7 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
system_has_some_mirror = true;
- return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR);
+ return memblock_setclr_flag(&memblock.memory, base, size, 1, MEMBLOCK_MIRROR);
}
/**
@@ -982,7 +982,7 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
*/
int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
{
- return memblock_setclr_flag(base, size, 1, MEMBLOCK_NOMAP);
+ return memblock_setclr_flag(&memblock.memory, base, size, 1, MEMBLOCK_NOMAP);
}
/**
@@ -994,7 +994,25 @@ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
*/
int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
{
- return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP);
+ return memblock_setclr_flag(&memblock.memory, base, size, 0, MEMBLOCK_NOMAP);
+}
+
+/**
+ * memblock_reserved_mark_noinit - Mark a reserved memory region with flag
+ * MEMBLOCK_RSRV_NOINIT which results in the struct pages not being initialized
+ * for this region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * struct pages will not be initialized for reserved memory regions marked with
+ * %MEMBLOCK_RSRV_NOINIT.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(&memblock.reserved, base, size, 1,
+ MEMBLOCK_RSRV_NOINIT);
}
static bool should_skip_region(struct memblock_type *type,
@@ -2113,13 +2131,18 @@ static void __init memmap_init_reserved_pages(void)
memblock_set_node(start, end, &memblock.reserved, nid);
}
- /* initialize struct pages for the reserved regions */
+ /*
+ * initialize struct pages for reserved regions that don't have
+ * the MEMBLOCK_RSRV_NOINIT flag set
+ */
for_each_reserved_mem_region(region) {
- nid = memblock_get_region_node(region);
- start = region->base;
- end = start + region->size;
+ if (!memblock_is_reserved_noinit(region)) {
+ nid = memblock_get_region_node(region);
+ start = region->base;
+ end = start + region->size;
- reserve_bootmem_region(start, end, nid);
+ reserve_bootmem_region(start, end, nid);
+ }
}
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d13dde2f8b56..774bd6e21e27 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -249,6 +249,9 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
return container_of(vmpr, struct mem_cgroup, vmpressure);
}
+#define CURRENT_OBJCG_UPDATE_BIT 0
+#define CURRENT_OBJCG_UPDATE_FLAG (1UL << CURRENT_OBJCG_UPDATE_BIT)
+
#ifdef CONFIG_MEMCG_KMEM
static DEFINE_SPINLOCK(objcg_lock);
@@ -704,6 +707,8 @@ static const unsigned int memcg_vm_event_stat[] = {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
THP_FAULT_ALLOC,
THP_COLLAPSE_ALLOC,
+ THP_SWPOUT,
+ THP_SWPOUT_FALLBACK,
#endif
};
@@ -761,6 +766,22 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
return x;
}
+static int memcg_page_state_unit(int item);
+
+/*
+ * Normalize the value passed into memcg_rstat_updated() to be in pages. Round
+ * up non-zero sub-page updates to 1 page as zero page updates are ignored.
+ */
+static int memcg_state_val_in_pages(int idx, int val)
+{
+ int unit = memcg_page_state_unit(idx);
+
+ if (!val || unit == PAGE_SIZE)
+ return val;
+ else
+ return max(val * unit / PAGE_SIZE, 1UL);
+}
+
/**
* __mod_memcg_state - update cgroup memory statistics
* @memcg: the memory cgroup
@@ -773,7 +794,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
return;
__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
- memcg_rstat_updated(memcg, val);
+ memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
}
/* idx can be of type enum memcg_stat_item or node_stat_item. */
@@ -798,7 +819,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
memcg = pn->memcg;
/*
- * The caller from rmap relay on disabled preemption becase they never
+ * The caller from rmap relies on disabled preemption because they never
* update their counter from in-interrupt context. For these two
* counters we check that the update is never performed from an
* interrupt context while other caller need to have disabled interrupt.
@@ -824,7 +845,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
/* Update lruvec */
__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
- memcg_rstat_updated(memcg, val);
+ memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
memcg_stats_unlock();
}
@@ -1068,17 +1089,25 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);
-static __always_inline bool memcg_kmem_bypass(void)
+/**
+ * get_mem_cgroup_from_current - Obtain a reference on current task's memcg.
+ */
+struct mem_cgroup *get_mem_cgroup_from_current(void)
{
- /* Allow remote memcg charging from any context. */
- if (unlikely(active_memcg()))
- return false;
+ struct mem_cgroup *memcg;
- /* Memcg to charge can't be determined. */
- if (!in_task() || !current->mm || (current->flags & PF_KTHREAD))
- return true;
+ if (mem_cgroup_disabled())
+ return NULL;
- return false;
+again:
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(current);
+ if (!css_tryget(&memcg->css)) {
+ rcu_read_unlock();
+ goto again;
+ }
+ rcu_read_unlock();
+ return memcg;
}
/**
@@ -1533,7 +1562,7 @@ static const struct memory_stat memory_stats[] = {
{ "workingset_nodereclaim", WORKINGSET_NODERECLAIM },
};
-/* Translate stat items to the correct unit for memory.stat output */
+/* The actual unit of the state item, not the same as the output unit */
static int memcg_page_state_unit(int item)
{
switch (item) {
@@ -1541,6 +1570,22 @@ static int memcg_page_state_unit(int item)
case MEMCG_ZSWAP_B:
case NR_SLAB_RECLAIMABLE_B:
case NR_SLAB_UNRECLAIMABLE_B:
+ return 1;
+ case NR_KERNEL_STACK_KB:
+ return SZ_1K;
+ default:
+ return PAGE_SIZE;
+ }
+}
+
+/* Translate stat items to the correct unit for memory.stat output */
+static int memcg_page_state_output_unit(int item)
+{
+ /*
+ * Workingset state is actually in pages, but we export it to userspace
+ * as a scalar count of events, so special case it here.
+ */
+ switch (item) {
case WORKINGSET_REFAULT_ANON:
case WORKINGSET_REFAULT_FILE:
case WORKINGSET_ACTIVATE_ANON:
@@ -1549,17 +1594,23 @@ static int memcg_page_state_unit(int item)
case WORKINGSET_RESTORE_FILE:
case WORKINGSET_NODERECLAIM:
return 1;
- case NR_KERNEL_STACK_KB:
- return SZ_1K;
default:
- return PAGE_SIZE;
+ return memcg_page_state_unit(item);
}
}
static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
int item)
{
- return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
+ return memcg_page_state(memcg, item) *
+ memcg_page_state_output_unit(item);
+}
+
+static inline unsigned long memcg_page_state_local_output(
+ struct mem_cgroup *memcg, int item)
+{
+ return memcg_page_state_local(memcg, item) *
+ memcg_page_state_output_unit(item);
}
static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
@@ -2833,7 +2884,12 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
return try_charge_memcg(memcg, gfp_mask, nr_pages);
}
-static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
+/**
+ * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call.
+ * @memcg: memcg previously charged.
+ * @nr_pages: number of pages previously charged.
+ */
+void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
if (mem_cgroup_is_root(memcg))
return;
@@ -2858,6 +2914,22 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
folio->memcg_data = (unsigned long)memcg;
}
+/**
+ * mem_cgroup_commit_charge - commit a previously successful try_charge().
+ * @folio: folio to commit the charge to.
+ * @memcg: memcg previously charged.
+ */
+void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
+{
+ css_get(&memcg->css);
+ commit_charge(folio, memcg);
+
+ local_irq_disable();
+ mem_cgroup_charge_statistics(memcg, folio_nr_pages(folio));
+ memcg_check_events(memcg, folio_nid(folio));
+ local_irq_enable();
+}
+
#ifdef CONFIG_MEMCG_KMEM
/*
* The allocated objcg pointers array is not accounted directly.
@@ -3007,28 +3079,105 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
objcg = rcu_dereference(memcg->objcg);
- if (objcg && obj_cgroup_tryget(objcg))
+ if (likely(objcg && obj_cgroup_tryget(objcg)))
break;
objcg = NULL;
}
return objcg;
}
-__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
+static struct obj_cgroup *current_objcg_update(void)
{
- struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg;
+ struct obj_cgroup *old, *objcg = NULL;
- if (memcg_kmem_bypass())
- return NULL;
+ do {
+ /* Atomically drop the update bit. */
+ old = xchg(&current->objcg, NULL);
+ if (old) {
+ old = (struct obj_cgroup *)
+ ((unsigned long)old & ~CURRENT_OBJCG_UPDATE_FLAG);
+ if (old)
+ obj_cgroup_put(old);
+
+ old = NULL;
+ }
- rcu_read_lock();
- if (unlikely(active_memcg()))
- memcg = active_memcg();
- else
+ /* If new objcg is NULL, no reason for the second atomic update. */
+ if (!current->mm || (current->flags & PF_KTHREAD))
+ return NULL;
+
+ /*
+ * Release the objcg pointer from the previous iteration,
+ * if try_cmpxcg() below fails.
+ */
+ if (unlikely(objcg)) {
+ obj_cgroup_put(objcg);
+ objcg = NULL;
+ }
+
+ /*
+ * Obtain the new objcg pointer. The current task can be
+ * asynchronously moved to another memcg and the previous
+ * memcg can be offlined. So let's get the memcg pointer
+ * and try get a reference to objcg under a rcu read lock.
+ */
+
+ rcu_read_lock();
memcg = mem_cgroup_from_task(current);
- objcg = __get_obj_cgroup_from_memcg(memcg);
- rcu_read_unlock();
+ objcg = __get_obj_cgroup_from_memcg(memcg);
+ rcu_read_unlock();
+
+ /*
+ * Try set up a new objcg pointer atomically. If it
+ * fails, it means the update flag was set concurrently, so
+ * the whole procedure should be repeated.
+ */
+ } while (!try_cmpxchg(&current->objcg, &old, objcg));
+
+ return objcg;
+}
+
+__always_inline struct obj_cgroup *current_obj_cgroup(void)
+{
+ struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
+
+ if (in_task()) {
+ memcg = current->active_memcg;
+ if (unlikely(memcg))
+ goto from_memcg;
+
+ objcg = READ_ONCE(current->objcg);
+ if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG))
+ objcg = current_objcg_update();
+ /*
+ * Objcg reference is kept by the task, so it's safe
+ * to use the objcg by the current task.
+ */
+ return objcg;
+ }
+
+ memcg = this_cpu_read(int_active_memcg);
+ if (unlikely(memcg))
+ goto from_memcg;
+
+ return NULL;
+
+from_memcg:
+ for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
+ /*
+ * Memcg pointer is protected by scope (see set_active_memcg())
+ * and is pinning the corresponding objcg, so objcg can't go
+ * away and can be used within the scope without any additional
+ * protection.
+ */
+ objcg = rcu_dereference_check(memcg->objcg, 1);
+ if (likely(objcg))
+ break;
+ objcg = NULL;
+ }
+
return objcg;
}
@@ -3126,15 +3275,15 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
struct obj_cgroup *objcg;
int ret = 0;
- objcg = get_obj_cgroup_from_current();
+ objcg = current_obj_cgroup();
if (objcg) {
ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
if (!ret) {
+ obj_cgroup_get(objcg);
page->memcg_data = (unsigned long)objcg |
MEMCG_DATA_KMEM;
return 0;
}
- obj_cgroup_put(objcg);
}
return ret;
}
@@ -3761,6 +3910,8 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
objcg->memcg = memcg;
rcu_assign_pointer(memcg->objcg, objcg);
+ obj_cgroup_get(objcg);
+ memcg->orig_objcg = objcg;
static_branch_enable(&memcg_kmem_online_key);
@@ -3867,6 +4018,13 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
case _MEMSWAP:
ret = mem_cgroup_resize_max(memcg, nr_pages, true);
break;
+ case _KMEM:
+ pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
+ "Writing any value to this file has no effect. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
+ ret = 0;
+ break;
case _TCP:
ret = memcg_update_tcp_max(memcg, nr_pages);
break;
@@ -4059,7 +4217,10 @@ static const unsigned int memcg1_stats[] = {
NR_WRITEBACK,
WORKINGSET_REFAULT_ANON,
WORKINGSET_REFAULT_FILE,
+#ifdef CONFIG_SWAP
MEMCG_SWAP,
+ NR_SWAPCACHE,
+#endif
};
static const char *const memcg1_stat_names[] = {
@@ -4074,7 +4235,10 @@ static const char *const memcg1_stat_names[] = {
"writeback",
"workingset_refault_anon",
"workingset_refault_file",
+#ifdef CONFIG_SWAP
"swap",
+ "swapcached",
+#endif
};
/* Universal VM events cgroup1 shows, original sort order */
@@ -4098,11 +4262,8 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
unsigned long nr;
- if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
- continue;
- nr = memcg_page_state_local(memcg, memcg1_stats[i]);
- seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i],
- nr * memcg_page_state_unit(memcg1_stats[i]));
+ nr = memcg_page_state_local_output(memcg, memcg1_stats[i]);
+ seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
@@ -4122,18 +4283,15 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
}
seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
(u64)memory * PAGE_SIZE);
- if (do_memsw_account())
- seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
- (u64)memsw * PAGE_SIZE);
+ seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
+ (u64)memsw * PAGE_SIZE);
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
unsigned long nr;
- if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
- continue;
- nr = memcg_page_state(memcg, memcg1_stats[i]);
+ nr = memcg_page_state_output(memcg, memcg1_stats[i]);
seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
- (u64)nr * memcg_page_state_unit(memcg1_stats[i]));
+ (u64)nr);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
@@ -5078,6 +5236,12 @@ static struct cftype mem_cgroup_legacy_files[] = {
},
#endif
{
+ .name = "kmem.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
.name = "kmem.usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
.read_u64 = mem_cgroup_read_u64,
@@ -5255,6 +5419,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
+ if (memcg->orig_objcg)
+ obj_cgroup_put(memcg->orig_objcg);
+
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
kfree(memcg->vmstats);
@@ -6063,7 +6230,7 @@ static void __mem_cgroup_clear_mc(void)
/* we must uncharge all the leftover precharges from mc.to */
if (mc.precharge) {
- cancel_charge(mc.to, mc.precharge);
+ mem_cgroup_cancel_charge(mc.to, mc.precharge);
mc.precharge = 0;
}
/*
@@ -6071,7 +6238,7 @@ static void __mem_cgroup_clear_mc(void)
* we must uncharge here.
*/
if (mc.moved_charge) {
- cancel_charge(mc.from, mc.moved_charge);
+ mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
mc.moved_charge = 0;
}
/* we must fixup refcnts and charges */
@@ -6351,6 +6518,7 @@ static void mem_cgroup_move_task(void)
mem_cgroup_clear_mc();
}
}
+
#else /* !CONFIG_MMU */
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
{
@@ -6364,8 +6532,39 @@ static void mem_cgroup_move_task(void)
}
#endif
+#ifdef CONFIG_MEMCG_KMEM
+static void mem_cgroup_fork(struct task_struct *task)
+{
+ /*
+ * Set the update flag to cause task->objcg to be initialized lazily
+ * on the first allocation. It can be done without any synchronization
+ * because it's always performed on the current task, so does
+ * current_objcg_update().
+ */
+ task->objcg = (struct obj_cgroup *)CURRENT_OBJCG_UPDATE_FLAG;
+}
+
+static void mem_cgroup_exit(struct task_struct *task)
+{
+ struct obj_cgroup *objcg = task->objcg;
+
+ objcg = (struct obj_cgroup *)
+ ((unsigned long)objcg & ~CURRENT_OBJCG_UPDATE_FLAG);
+ if (objcg)
+ obj_cgroup_put(objcg);
+
+ /*
+ * Some kernel allocations can happen after this point,
+ * but let's ignore them. It can be done without any synchronization
+ * because it's always performed on the current task, so does
+ * current_objcg_update().
+ */
+ task->objcg = NULL;
+}
+#endif
+
#ifdef CONFIG_LRU_GEN
-static void mem_cgroup_attach(struct cgroup_taskset *tset)
+static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
@@ -6383,10 +6582,31 @@ static void mem_cgroup_attach(struct cgroup_taskset *tset)
task_unlock(task);
}
#else
+static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) {}
+#endif /* CONFIG_LRU_GEN */
+
+#ifdef CONFIG_MEMCG_KMEM
+static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
+
+ cgroup_taskset_for_each(task, css, tset) {
+ /* atomically set the update bit */
+ set_bit(CURRENT_OBJCG_UPDATE_BIT, (unsigned long *)&task->objcg);
+ }
+}
+#else
+static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) {}
+#endif /* CONFIG_MEMCG_KMEM */
+
+#if defined(CONFIG_LRU_GEN) || defined(CONFIG_MEMCG_KMEM)
static void mem_cgroup_attach(struct cgroup_taskset *tset)
{
+ mem_cgroup_lru_gen_attach(tset);
+ mem_cgroup_kmem_attach(tset);
}
-#endif /* CONFIG_LRU_GEN */
+#endif
static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
{
@@ -6609,7 +6829,8 @@ static int memory_stat_show(struct seq_file *m, void *v)
static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
int item)
{
- return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item);
+ return lruvec_page_state(lruvec, item) *
+ memcg_page_state_output_unit(item);
}
static int memory_numa_stat_show(struct seq_file *m, void *v)
@@ -6795,9 +7016,15 @@ struct cgroup_subsys memory_cgrp_subsys = {
.css_reset = mem_cgroup_css_reset,
.css_rstat_flush = mem_cgroup_css_rstat_flush,
.can_attach = mem_cgroup_can_attach,
+#if defined(CONFIG_LRU_GEN) || defined(CONFIG_MEMCG_KMEM)
.attach = mem_cgroup_attach,
+#endif
.cancel_attach = mem_cgroup_cancel_attach,
.post_attach = mem_cgroup_move_task,
+#ifdef CONFIG_MEMCG_KMEM
+ .fork = mem_cgroup_fork,
+ .exit = mem_cgroup_exit,
+#endif
.dfl_cftypes = memory_files,
.legacy_cftypes = mem_cgroup_legacy_files,
.early_init = 0,
@@ -6977,20 +7204,13 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
gfp_t gfp)
{
- long nr_pages = folio_nr_pages(folio);
int ret;
- ret = try_charge(memcg, gfp, nr_pages);
+ ret = try_charge(memcg, gfp, folio_nr_pages(folio));
if (ret)
goto out;
- css_get(&memcg->css);
- commit_charge(folio, memcg);
-
- local_irq_disable();
- mem_cgroup_charge_statistics(memcg, nr_pages);
- memcg_check_events(memcg, folio_nid(folio));
- local_irq_enable();
+ mem_cgroup_commit_charge(folio, memcg);
out:
return ret;
}
@@ -7008,6 +7228,41 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
}
/**
+ * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio
+ * @memcg: memcg to charge.
+ * @gfp: reclaim mode.
+ * @nr_pages: number of pages to charge.
+ *
+ * This function is called when allocating a huge page folio to determine if
+ * the memcg has the capacity for it. It does not commit the charge yet,
+ * as the hugetlb folio itself has not been obtained from the hugetlb pool.
+ *
+ * Once we have obtained the hugetlb folio, we can call
+ * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the
+ * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect
+ * of try_charge().
+ *
+ * Returns 0 on success. Otherwise, an error code is returned.
+ */
+int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
+ long nr_pages)
+{
+ /*
+ * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation,
+ * but do not attempt to commit charge later (or cancel on error) either.
+ */
+ if (mem_cgroup_disabled() || !memcg ||
+ !cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
+ !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
+ return -EOPNOTSUPP;
+
+ if (try_charge(memcg, gfp, nr_pages))
+ return -ENOMEM;
+
+ return 0;
+}
+
+/**
* mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
* @folio: folio to charge.
* @mm: mm context of the victim
@@ -7203,16 +7458,17 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list)
}
/**
- * mem_cgroup_migrate - Charge a folio's replacement.
+ * mem_cgroup_replace_folio - Charge a folio's replacement.
* @old: Currently circulating folio.
* @new: Replacement folio.
*
* Charge @new as a replacement folio for @old. @old will
- * be uncharged upon free.
+ * be uncharged upon free. This is only used by the page cache
+ * (in replace_page_cache_folio()).
*
* Both folios must be locked, @new->mapping must be set up.
*/
-void mem_cgroup_migrate(struct folio *old, struct folio *new)
+void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
{
struct mem_cgroup *memcg;
long nr_pages = folio_nr_pages(new);
@@ -7251,6 +7507,44 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
local_irq_restore(flags);
}
+/**
+ * mem_cgroup_migrate - Transfer the memcg data from the old to the new folio.
+ * @old: Currently circulating folio.
+ * @new: Replacement folio.
+ *
+ * Transfer the memcg data from the old folio to the new folio for migration.
+ * The old folio's data info will be cleared. Note that the memory counters
+ * will remain unchanged throughout the process.
+ *
+ * Both folios must be locked, @new->mapping must be set up.
+ */
+void mem_cgroup_migrate(struct folio *old, struct folio *new)
+{
+ struct mem_cgroup *memcg;
+
+ VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
+ VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
+ VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
+ VM_BUG_ON_FOLIO(folio_nr_pages(old) != folio_nr_pages(new), new);
+
+ if (mem_cgroup_disabled())
+ return;
+
+ memcg = folio_memcg(old);
+ /*
+ * Note that it is normal to see !memcg for a hugetlb folio.
+ * For e.g, itt could have been allocated when memory_hugetlb_accounting
+ * was not selected.
+ */
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old);
+ if (!memcg)
+ return;
+
+ /* Transfer the charge and the css ref */
+ commit_charge(new, memcg);
+ old->memcg_data = 0;
+}
+
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
EXPORT_SYMBOL(memcg_sockets_enabled_key);
@@ -7750,7 +8044,7 @@ static struct cftype memsw_files[] = {
*
* This doesn't check for specific headroom, and it is not atomic
* either. But with zswap, the size of the allocation is only known
- * once compression has occured, and this optimistic pre-check avoids
+ * once compression has occurred, and this optimistic pre-check avoids
* spending cycles on compression when there is already no room left
* or zswap is disabled altogether somewhere in the hierarchy.
*/
diff --git a/mm/memfd.c b/mm/memfd.c
index 2dba2cb6f0d0..d3a1ba4208c9 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -315,12 +315,6 @@ SYSCALL_DEFINE2(memfd_create,
if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
return -EINVAL;
- if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
- pr_warn_once(
- "%s[%d]: memfd_create() called without MFD_EXEC or MFD_NOEXEC_SEAL set\n",
- current->comm, task_pid_nr(current));
- }
-
error = check_sysctl_memfd_noexec(&flags);
if (error < 0)
return error;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 4d6e43c88489..660c21859118 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1713,32 +1713,35 @@ static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags);
}
+/*
+ * Only dev_pagemap pages get here, such as fsdax when the filesystem
+ * either do not claim or fails to claim a hwpoison event, or devdax.
+ * The fsdax pages are initialized per base page, and the devdax pages
+ * could be initialized either as base pages, or as compound pages with
+ * vmemmap optimization enabled. Devdax is simplistic in its dealing with
+ * hwpoison, such that, if a subpage of a compound page is poisoned,
+ * simply mark the compound head page is by far sufficient.
+ */
static int mf_generic_kill_procs(unsigned long long pfn, int flags,
struct dev_pagemap *pgmap)
{
- struct page *page = pfn_to_page(pfn);
+ struct folio *folio = pfn_folio(pfn);
LIST_HEAD(to_kill);
dax_entry_t cookie;
int rc = 0;
/*
- * Pages instantiated by device-dax (not filesystem-dax)
- * may be compound pages.
- */
- page = compound_head(page);
-
- /*
* Prevent the inode from being freed while we are interrogating
* the address_space, typically this would be handled by
* lock_page(), but dax pages do not use the page lock. This
* also prevents changes to the mapping of this pfn until
* poison signaling is complete.
*/
- cookie = dax_lock_page(page);
+ cookie = dax_lock_folio(folio);
if (!cookie)
return -EBUSY;
- if (hwpoison_filter(page)) {
+ if (hwpoison_filter(&folio->page)) {
rc = -EOPNOTSUPP;
goto unlock;
}
@@ -1760,7 +1763,7 @@ static int mf_generic_kill_procs(unsigned long long pfn, int flags,
* Use this flag as an indication that the dax page has been
* remapped UC to prevent speculative consumption of poison.
*/
- SetPageHWPoison(page);
+ SetPageHWPoison(&folio->page);
/*
* Unlike System-RAM there is no possibility to swap in a
@@ -1769,11 +1772,11 @@ static int mf_generic_kill_procs(unsigned long long pfn, int flags,
* SIGBUS (i.e. MF_MUST_KILL)
*/
flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
- collect_procs(page, &to_kill, true);
+ collect_procs(&folio->page, &to_kill, true);
- unmap_and_kill(&to_kill, pfn, page->mapping, page->index, flags);
+ unmap_and_kill(&to_kill, pfn, folio->mapping, folio->index, flags);
unlock:
- dax_unlock_page(page, cookie);
+ dax_unlock_folio(folio, cookie);
return rc;
}
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 37a4f59d9585..8d5291add2bc 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -5,6 +5,7 @@
#include <linux/kobject.h>
#include <linux/memory.h>
#include <linux/memory-tiers.h>
+#include <linux/notifier.h>
#include "internal.h"
@@ -36,7 +37,7 @@ struct node_memory_type_map {
static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
-static struct memory_dev_type *default_dram_type;
+struct memory_dev_type *default_dram_type;
static struct bus_type memory_tier_subsys = {
.name = "memory_tiering",
@@ -105,6 +106,13 @@ static int top_tier_adistance;
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */
+static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
+
+static bool default_dram_perf_error;
+static struct node_hmem_attrs default_dram_perf;
+static int default_dram_perf_ref_nid = NUMA_NO_NODE;
+static const char *default_dram_perf_ref_source;
+
static inline struct memory_tier *to_memory_tier(struct device *device)
{
return container_of(device, struct memory_tier, dev);
@@ -115,7 +123,7 @@ static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memti
nodemask_t nodes = NODE_MASK_NONE;
struct memory_dev_type *memtype;
- list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
+ list_for_each_entry(memtype, &memtier->memory_types, tier_sibling)
nodes_or(nodes, nodes, memtype->nodes);
return nodes;
@@ -174,7 +182,7 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
* If the memtype is already part of a memory tier,
* just return that.
*/
- if (!list_empty(&memtype->tier_sibiling)) {
+ if (!list_empty(&memtype->tier_sibling)) {
list_for_each_entry(memtier, &memory_tiers, list) {
if (adistance == memtier->adistance_start)
return memtier;
@@ -218,7 +226,7 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
memtier = new_memtier;
link_memtype:
- list_add(&memtype->tier_sibiling, &memtier->memory_types);
+ list_add(&memtype->tier_sibling, &memtier->memory_types);
return memtier;
}
@@ -527,7 +535,7 @@ static bool clear_node_memory_tier(int node)
memtype = node_memory_types[node].memtype;
node_clear(node, memtype->nodes);
if (nodes_empty(memtype->nodes)) {
- list_del_init(&memtype->tier_sibiling);
+ list_del_init(&memtype->tier_sibling);
if (list_empty(&memtier->memory_types))
destroy_memory_tier(memtier);
}
@@ -553,7 +561,7 @@ struct memory_dev_type *alloc_memory_type(int adistance)
return ERR_PTR(-ENOMEM);
memtype->adistance = adistance;
- INIT_LIST_HEAD(&memtype->tier_sibiling);
+ INIT_LIST_HEAD(&memtype->tier_sibling);
memtype->nodes = NODE_MASK_NONE;
kref_init(&memtype->kref);
return memtype;
@@ -578,13 +586,14 @@ EXPORT_SYMBOL_GPL(init_node_memory_type);
void clear_node_memory_type(int node, struct memory_dev_type *memtype)
{
mutex_lock(&memory_tier_lock);
- if (node_memory_types[node].memtype == memtype)
+ if (node_memory_types[node].memtype == memtype || !memtype)
node_memory_types[node].map_count--;
/*
* If we umapped all the attached devices to this node,
* clear the node memory type.
*/
if (!node_memory_types[node].map_count) {
+ memtype = node_memory_types[node].memtype;
node_memory_types[node].memtype = NULL;
put_memory_type(memtype);
}
@@ -592,6 +601,158 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype)
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);
+static void dump_hmem_attrs(struct node_hmem_attrs *attrs, const char *prefix)
+{
+ pr_info(
+"%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n",
+ prefix, attrs->read_latency, attrs->write_latency,
+ attrs->read_bandwidth, attrs->write_bandwidth);
+}
+
+int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
+ const char *source)
+{
+ int rc = 0;
+
+ mutex_lock(&memory_tier_lock);
+ if (default_dram_perf_error) {
+ rc = -EIO;
+ goto out;
+ }
+
+ if (perf->read_latency + perf->write_latency == 0 ||
+ perf->read_bandwidth + perf->write_bandwidth == 0) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (default_dram_perf_ref_nid == NUMA_NO_NODE) {
+ default_dram_perf = *perf;
+ default_dram_perf_ref_nid = nid;
+ default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL);
+ goto out;
+ }
+
+ /*
+ * The performance of all default DRAM nodes is expected to be
+ * same (that is, the variation is less than 10%). And it
+ * will be used as base to calculate the abstract distance of
+ * other memory nodes.
+ */
+ if (abs(perf->read_latency - default_dram_perf.read_latency) * 10 >
+ default_dram_perf.read_latency ||
+ abs(perf->write_latency - default_dram_perf.write_latency) * 10 >
+ default_dram_perf.write_latency ||
+ abs(perf->read_bandwidth - default_dram_perf.read_bandwidth) * 10 >
+ default_dram_perf.read_bandwidth ||
+ abs(perf->write_bandwidth - default_dram_perf.write_bandwidth) * 10 >
+ default_dram_perf.write_bandwidth) {
+ pr_info(
+"memory-tiers: the performance of DRAM node %d mismatches that of the reference\n"
+"DRAM node %d.\n", nid, default_dram_perf_ref_nid);
+ pr_info(" performance of reference DRAM node %d:\n",
+ default_dram_perf_ref_nid);
+ dump_hmem_attrs(&default_dram_perf, " ");
+ pr_info(" performance of DRAM node %d:\n", nid);
+ dump_hmem_attrs(perf, " ");
+ pr_info(
+" disable default DRAM node performance based abstract distance algorithm.\n");
+ default_dram_perf_error = true;
+ rc = -EINVAL;
+ }
+
+out:
+ mutex_unlock(&memory_tier_lock);
+ return rc;
+}
+
+int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist)
+{
+ if (default_dram_perf_error)
+ return -EIO;
+
+ if (default_dram_perf_ref_nid == NUMA_NO_NODE)
+ return -ENOENT;
+
+ if (perf->read_latency + perf->write_latency == 0 ||
+ perf->read_bandwidth + perf->write_bandwidth == 0)
+ return -EINVAL;
+
+ mutex_lock(&memory_tier_lock);
+ /*
+ * The abstract distance of a memory node is in direct proportion to
+ * its memory latency (read + write) and inversely proportional to its
+ * memory bandwidth (read + write). The abstract distance, memory
+ * latency, and memory bandwidth of the default DRAM nodes are used as
+ * the base.
+ */
+ *adist = MEMTIER_ADISTANCE_DRAM *
+ (perf->read_latency + perf->write_latency) /
+ (default_dram_perf.read_latency + default_dram_perf.write_latency) *
+ (default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) /
+ (perf->read_bandwidth + perf->write_bandwidth);
+ mutex_unlock(&memory_tier_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mt_perf_to_adistance);
+
+/**
+ * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm
+ * @nb: The notifier block which describe the algorithm
+ *
+ * Return: 0 on success, errno on error.
+ *
+ * Every memory tiering abstract distance algorithm provider needs to
+ * register the algorithm with register_mt_adistance_algorithm(). To
+ * calculate the abstract distance for a specified memory node, the
+ * notifier function will be called unless some high priority
+ * algorithm has provided result. The prototype of the notifier
+ * function is as follows,
+ *
+ * int (*algorithm_notifier)(struct notifier_block *nb,
+ * unsigned long nid, void *data);
+ *
+ * Where "nid" specifies the memory node, "data" is the pointer to the
+ * returned abstract distance (that is, "int *adist"). If the
+ * algorithm provides the result, NOTIFY_STOP should be returned.
+ * Otherwise, return_value & %NOTIFY_STOP_MASK == 0 to allow the next
+ * algorithm in the chain to provide the result.
+ */
+int register_mt_adistance_algorithm(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&mt_adistance_algorithms, nb);
+}
+EXPORT_SYMBOL_GPL(register_mt_adistance_algorithm);
+
+/**
+ * unregister_mt_adistance_algorithm() - Unregister memory tiering abstract distance algorithm
+ * @nb: the notifier block which describe the algorithm
+ *
+ * Return: 0 on success, errno on error.
+ */
+int unregister_mt_adistance_algorithm(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&mt_adistance_algorithms, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_mt_adistance_algorithm);
+
+/**
+ * mt_calc_adistance() - Calculate abstract distance with registered algorithms
+ * @node: the node to calculate abstract distance for
+ * @adist: the returned abstract distance
+ *
+ * Return: if return_value & %NOTIFY_STOP_MASK != 0, then some
+ * abstract distance algorithm provides the result, and return it via
+ * @adist. Otherwise, no algorithm can provide the result and @adist
+ * will be kept as it is.
+ */
+int mt_calc_adistance(int node, int *adist)
+{
+ return blocking_notifier_call_chain(&mt_adistance_algorithms, node, adist);
+}
+EXPORT_SYMBOL_GPL(mt_calc_adistance);
+
static int __meminit memtier_hotplug_callback(struct notifier_block *self,
unsigned long action, void *_arg)
{
diff --git a/mm/memory.c b/mm/memory.c
index 6c264d2f969c..1f18ed4a5497 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1,3 +1,4 @@
+
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/memory.c
@@ -471,8 +472,6 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
int i;
- if (current->mm == mm)
- sync_mm_rss(mm);
for (i = 0; i < NR_MM_COUNTERS; i++)
if (rss[i])
add_mm_counter(mm, i, rss[i]);
@@ -691,6 +690,16 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
out:
return pfn_to_page(pfn);
}
+
+struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd)
+{
+ struct page *page = vm_normal_page_pmd(vma, addr, pmd);
+
+ if (page)
+ return page_folio(page);
+ return NULL;
+}
#endif
static void restore_exclusive_pte(struct vm_area_struct *vma,
@@ -1683,7 +1692,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
if (vma->vm_file) {
zap_flags_t zap_flags = details ?
details->zap_flags : 0;
- __unmap_hugepage_range_final(tlb, vma, start, end,
+ __unmap_hugepage_range(tlb, vma, start, end,
NULL, zap_flags);
}
} else
@@ -1728,8 +1737,12 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
start_addr, end_addr);
mmu_notifier_invalidate_range_start(&range);
do {
- unmap_single_vma(tlb, vma, start_addr, end_addr, &details,
+ unsigned long start = start_addr;
+ unsigned long end = end_addr;
+ hugetlb_zap_begin(vma, &start, &end);
+ unmap_single_vma(tlb, vma, start, end, &details,
mm_wr_locked);
+ hugetlb_zap_end(vma, &details);
} while ((vma = mas_find(mas, tree_end - 1)) != NULL);
mmu_notifier_invalidate_range_end(&range);
}
@@ -1753,9 +1766,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, end);
- if (is_vm_hugetlb_page(vma))
- adjust_range_if_pmd_sharing_possible(vma, &range.start,
- &range.end);
+ hugetlb_zap_begin(vma, &range.start, &range.end);
tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
@@ -1766,6 +1777,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
unmap_single_vma(&tlb, vma, address, end, details, false);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
+ hugetlb_zap_end(vma, details);
}
/**
@@ -3003,23 +3015,24 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
* case, all we need to do here is to mark the page as writable and update
* any related book-keeping.
*/
-static inline void wp_page_reuse(struct vm_fault *vmf)
+static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page = vmf->page;
pte_t entry;
VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
- VM_BUG_ON(page && PageAnon(page) && !PageAnonExclusive(page));
- /*
- * Clear the pages cpupid information as the existing
- * information potentially belongs to a now completely
- * unrelated process.
- */
- if (page)
- page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
+ if (folio) {
+ VM_BUG_ON(folio_test_anon(folio) &&
+ !PageAnonExclusive(vmf->page));
+ /*
+ * Clear the folio's cpupid information as the existing
+ * information potentially belongs to a now completely
+ * unrelated process.
+ */
+ folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1);
+ }
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = pte_mkyoung(vmf->orig_pte);
@@ -3031,6 +3044,36 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
}
/*
+ * We could add a bitflag somewhere, but for now, we know that all
+ * vm_ops that have a ->map_pages have been audited and don't need
+ * the mmap_lock to be held.
+ */
+static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+
+ if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK))
+ return 0;
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+}
+
+static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+
+ if (likely(vma->anon_vma))
+ return 0;
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+ }
+ if (__anon_vma_prepare(vma))
+ return VM_FAULT_OOM;
+ return 0;
+}
+
+/*
* Handle the case of a page which we actually need to copy to a new page,
* either due to COW or unsharing.
*
@@ -3057,27 +3100,29 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
pte_t entry;
int page_copied = 0;
struct mmu_notifier_range range;
- int ret;
+ vm_fault_t ret;
delayacct_wpcopy_start();
if (vmf->page)
old_folio = page_folio(vmf->page);
- if (unlikely(anon_vma_prepare(vma)))
- goto oom;
+ ret = vmf_anon_prepare(vmf);
+ if (unlikely(ret))
+ goto out;
if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
new_folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
if (!new_folio)
goto oom;
} else {
+ int err;
new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
vmf->address, false);
if (!new_folio)
goto oom;
- ret = __wp_page_copy_user(&new_folio->page, vmf->page, vmf);
- if (ret) {
+ err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf);
+ if (err) {
/*
* COW failed, if the fault was solved by other,
* it's fine. If not, userspace would re-fault on
@@ -3090,7 +3135,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
folio_put(old_folio);
delayacct_wpcopy_end();
- return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
+ return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
}
kmsan_copy_page_meta(&new_folio->page, vmf->page);
}
@@ -3200,11 +3245,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
oom_free_new:
folio_put(new_folio);
oom:
+ ret = VM_FAULT_OOM;
+out:
if (old_folio)
folio_put(old_folio);
delayacct_wpcopy_end();
- return VM_FAULT_OOM;
+ return ret;
}
/**
@@ -3212,6 +3259,7 @@ oom:
* writeable once the page is prepared
*
* @vmf: structure describing the fault
+ * @folio: the folio of vmf->page
*
* This function handles all that is needed to finish a write page fault in a
* shared mapping due to PTE being read-only once the mapped page is prepared.
@@ -3223,7 +3271,7 @@ oom:
* Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
* we acquired PTE lock.
*/
-vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
+static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio)
{
WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
@@ -3239,7 +3287,7 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
return VM_FAULT_NOPAGE;
}
- wp_page_reuse(vmf);
+ wp_page_reuse(vmf, folio);
return 0;
}
@@ -3255,18 +3303,17 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
vm_fault_t ret;
pte_unmap_unlock(vmf->pte, vmf->ptl);
- if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
- vma_end_read(vmf->vma);
- return VM_FAULT_RETRY;
- }
+ ret = vmf_can_call_fault(vmf);
+ if (ret)
+ return ret;
vmf->flags |= FAULT_FLAG_MKWRITE;
ret = vma->vm_ops->pfn_mkwrite(vmf);
if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
return ret;
- return finish_mkwrite_fault(vmf);
+ return finish_mkwrite_fault(vmf, NULL);
}
- wp_page_reuse(vmf);
+ wp_page_reuse(vmf, NULL);
return 0;
}
@@ -3282,10 +3329,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
vm_fault_t tmp;
pte_unmap_unlock(vmf->pte, vmf->ptl);
- if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ tmp = vmf_can_call_fault(vmf);
+ if (tmp) {
folio_put(folio);
- vma_end_read(vmf->vma);
- return VM_FAULT_RETRY;
+ return tmp;
}
tmp = do_page_mkwrite(vmf, folio);
@@ -3294,14 +3341,14 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
folio_put(folio);
return tmp;
}
- tmp = finish_mkwrite_fault(vmf);
+ tmp = finish_mkwrite_fault(vmf, folio);
if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
folio_unlock(folio);
folio_put(folio);
return tmp;
}
} else {
- wp_page_reuse(vmf);
+ wp_page_reuse(vmf, folio);
folio_lock(folio);
}
ret |= fault_dirty_shared_page(vmf);
@@ -3310,6 +3357,44 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
return ret;
}
+static bool wp_can_reuse_anon_folio(struct folio *folio,
+ struct vm_area_struct *vma)
+{
+ /*
+ * We have to verify under folio lock: these early checks are
+ * just an optimization to avoid locking the folio and freeing
+ * the swapcache if there is little hope that we can reuse.
+ *
+ * KSM doesn't necessarily raise the folio refcount.
+ */
+ if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
+ return false;
+ if (!folio_test_lru(folio))
+ /*
+ * We cannot easily detect+handle references from
+ * remote LRU caches or references to LRU folios.
+ */
+ lru_add_drain();
+ if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
+ return false;
+ if (!folio_trylock(folio))
+ return false;
+ if (folio_test_swapcache(folio))
+ folio_free_swap(folio);
+ if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
+ folio_unlock(folio);
+ return false;
+ }
+ /*
+ * Ok, we've got the only folio reference from our mapping
+ * and the folio is locked, it's dark out, and we're wearing
+ * sunglasses. Hit it.
+ */
+ folio_move_anon_rmap(folio, vma);
+ folio_unlock(folio);
+ return true;
+}
+
/*
* This routine handles present pages, when
* * users try to write to a shared page (FAULT_FLAG_WRITE)
@@ -3338,11 +3423,28 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
struct vm_area_struct *vma = vmf->vma;
struct folio *folio = NULL;
+ pte_t pte;
if (likely(!unshare)) {
if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) {
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- return handle_userfault(vmf, VM_UFFD_WP);
+ if (!userfaultfd_wp_async(vma)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return handle_userfault(vmf, VM_UFFD_WP);
+ }
+
+ /*
+ * Nothing needed (cache flush, TLB invalidations,
+ * etc.) because we're only removing the uffd-wp bit,
+ * which is completely invisible to the user.
+ */
+ pte = pte_clear_uffd_wp(ptep_get(vmf->pte));
+
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+ /*
+ * Update this to be prepared for following up CoW
+ * handling
+ */
+ vmf->orig_pte = pte;
}
/*
@@ -3379,62 +3481,21 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
/*
* Private mapping: create an exclusive anonymous page copy if reuse
* is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
+ *
+ * If we encounter a page that is marked exclusive, we must reuse
+ * the page without further checks.
*/
- if (folio && folio_test_anon(folio)) {
- /*
- * If the page is exclusive to this process we must reuse the
- * page without further checks.
- */
- if (PageAnonExclusive(vmf->page))
- goto reuse;
-
- /*
- * We have to verify under folio lock: these early checks are
- * just an optimization to avoid locking the folio and freeing
- * the swapcache if there is little hope that we can reuse.
- *
- * KSM doesn't necessarily raise the folio refcount.
- */
- if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
- goto copy;
- if (!folio_test_lru(folio))
- /*
- * We cannot easily detect+handle references from
- * remote LRU caches or references to LRU folios.
- */
- lru_add_drain();
- if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
- goto copy;
- if (!folio_trylock(folio))
- goto copy;
- if (folio_test_swapcache(folio))
- folio_free_swap(folio);
- if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
- folio_unlock(folio);
- goto copy;
- }
- /*
- * Ok, we've got the only folio reference from our mapping
- * and the folio is locked, it's dark out, and we're wearing
- * sunglasses. Hit it.
- */
- page_move_anon_rmap(vmf->page, vma);
- folio_unlock(folio);
-reuse:
+ if (folio && folio_test_anon(folio) &&
+ (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) {
+ if (!PageAnonExclusive(vmf->page))
+ SetPageAnonExclusive(vmf->page);
if (unlikely(unshare)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
- wp_page_reuse(vmf);
+ wp_page_reuse(vmf, folio);
return 0;
}
-copy:
- if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma->anon_vma) {
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- vma_end_read(vmf->vma);
- return VM_FAULT_RETRY;
- }
-
/*
* Ok, we need to copy. Oh, well..
*/
@@ -4560,10 +4621,9 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
return ret;
}
- if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
- vma_end_read(vmf->vma);
- return VM_FAULT_RETRY;
- }
+ ret = vmf_can_call_fault(vmf);
+ if (ret)
+ return ret;
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -4582,13 +4642,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
- if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
- vma_end_read(vma);
- return VM_FAULT_RETRY;
- }
-
- if (unlikely(anon_vma_prepare(vma)))
- return VM_FAULT_OOM;
+ ret = vmf_can_call_fault(vmf);
+ if (!ret)
+ ret = vmf_anon_prepare(vmf);
+ if (ret)
+ return ret;
vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
if (!vmf->cow_page)
@@ -4627,10 +4685,9 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
vm_fault_t ret, tmp;
struct folio *folio;
- if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
- vma_end_read(vma);
- return VM_FAULT_RETRY;
- }
+ ret = vmf_can_call_fault(vmf);
+ if (ret)
+ return ret;
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -4716,10 +4773,10 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
return ret;
}
-int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
unsigned long addr, int page_nid, int *flags)
{
- get_page(page);
+ folio_get(folio);
/* Record the current PID acceesing VMA */
vma_set_access_pid_bit(vma);
@@ -4730,14 +4787,14 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
*flags |= TNF_FAULT_LOCAL;
}
- return mpol_misplaced(page, vma, addr);
+ return mpol_misplaced(folio, vma, addr);
}
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page = NULL;
- int page_nid = NUMA_NO_NODE;
+ struct folio *folio = NULL;
+ int nid = NUMA_NO_NODE;
bool writable = false;
int last_cpupid;
int target_nid;
@@ -4768,12 +4825,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
can_change_pte_writable(vma, vmf->address, pte))
writable = true;
- page = vm_normal_page(vma, vmf->address, pte);
- if (!page || is_zone_device_page(page))
+ folio = vm_normal_folio(vma, vmf->address, pte);
+ if (!folio || folio_is_zone_device(folio))
goto out_map;
/* TODO: handle PTE-mapped THP */
- if (PageCompound(page))
+ if (folio_test_large(folio))
goto out_map;
/*
@@ -4788,34 +4845,33 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
flags |= TNF_NO_GROUP;
/*
- * Flag if the page is shared between multiple address spaces. This
+ * Flag if the folio is shared between multiple address spaces. This
* is later used when determining whether to group tasks together
*/
- if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+ if (folio_estimated_sharers(folio) > 1 && (vma->vm_flags & VM_SHARED))
flags |= TNF_SHARED;
- page_nid = page_to_nid(page);
+ nid = folio_nid(folio);
/*
* For memory tiering mode, cpupid of slow memory page is used
* to record page access time. So use default value.
*/
if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
- !node_is_toptier(page_nid))
+ !node_is_toptier(nid))
last_cpupid = (-1 & LAST_CPUPID_MASK);
else
- last_cpupid = page_cpupid_last(page);
- target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
- &flags);
+ last_cpupid = folio_last_cpupid(folio);
+ target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags);
if (target_nid == NUMA_NO_NODE) {
- put_page(page);
+ folio_put(folio);
goto out_map;
}
pte_unmap_unlock(vmf->pte, vmf->ptl);
writable = false;
/* Migrate to the requested node */
- if (migrate_misplaced_page(page, vma, target_nid)) {
- page_nid = target_nid;
+ if (migrate_misplaced_folio(folio, vma, target_nid)) {
+ nid = target_nid;
flags |= TNF_MIGRATED;
} else {
flags |= TNF_MIGRATE_FAIL;
@@ -4831,8 +4887,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
}
out:
- if (page_nid != NUMA_NO_NODE)
- task_numa_fault(last_cpupid, page_nid, 1, flags);
+ if (nid != NUMA_NO_NODE)
+ task_numa_fault(last_cpupid, nid, 1, flags);
return 0;
out_map:
/*
@@ -4869,8 +4925,11 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
if (vma_is_anonymous(vma)) {
if (likely(!unshare) &&
- userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd))
+ userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
+ if (userfaultfd_wp_async(vmf->vma))
+ goto split;
return handle_userfault(vmf, VM_UFFD_WP);
+ }
return do_huge_pmd_wp_page(vmf);
}
@@ -4882,6 +4941,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
}
}
+split:
/* COW or write-notify handled on pte level: split pmd. */
__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
@@ -5733,8 +5793,8 @@ EXPORT_SYMBOL_GPL(generic_access_phys);
/*
* Access another process' address space as given in mm.
*/
-int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
- int len, unsigned int gup_flags)
+static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
{
void *old_buf = buf;
int write = gup_flags & FOLL_WRITE;
@@ -5757,7 +5817,7 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
struct page *page = get_user_page_vma_remote(mm, addr,
gup_flags, &vma);
- if (IS_ERR_OR_NULL(page)) {
+ if (IS_ERR(page)) {
/* We might need to expand the stack to access it */
vma = vma_lookup(mm, addr);
if (!vma) {
@@ -5771,7 +5831,6 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
continue;
}
-
/*
* Check if this is a VM_IO | VM_PFNMAP VMA, which
* we can access using slightly different code.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1b03f4ec6fd2..ab41a511e20a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1689,7 +1689,7 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
*/
if (HPageMigratable(head))
goto found;
- skip = compound_nr(head) - (page - head);
+ skip = compound_nr(head) - (pfn - page_to_pfn(head));
pfn += skip - 1;
}
return -ENOENT;
@@ -2012,12 +2012,16 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
/* reinitialise watermarks and update pcp limits */
init_per_zone_wmark_min();
+ /*
+ * Make sure to mark the node as memory-less before rebuilding the zone
+ * list. Otherwise this node would still appear in the fallback lists.
+ */
+ node_states_clear_node(node, &arg);
if (!populated_zone(zone)) {
zone_pcp_reset(zone);
build_all_zonelists(NULL);
}
- node_states_clear_node(node, &arg);
if (arg.status_change_nid >= 0) {
kcompactd_stop(node);
kswapd_stop(node);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 42b5567e3773..10a590ee1c89 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -25,7 +25,7 @@
* to the last. It would be better if bind would truly restrict
* the allocation to memory nodes instead
*
- * preferred Try a specific node first before normal fallback.
+ * preferred Try a specific node first before normal fallback.
* As a special case NUMA_NO_NODE here means do the allocation
* on the local CPU. This is normally identical to default,
* but useful to set in a VMA when you have a non default
@@ -52,7 +52,7 @@
* on systems with highmem kernel lowmem allocation don't get policied.
* Same with GFP_DMA allocations.
*
- * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
+ * For shmem/tmpfs shared memory the policy is shared between
* all users and remembered even when nobody has memory mapped.
*/
@@ -111,7 +111,8 @@
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
-#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
+#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
+#define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;
@@ -131,22 +132,26 @@ static struct mempolicy default_policy = {
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
/**
- * numa_map_to_online_node - Find closest online node
+ * numa_nearest_node - Find nearest node by state
* @node: Node id to start the search
+ * @state: State to filter the search
*
- * Lookup the next closest node by distance if @nid is not online.
+ * Lookup the closest node by distance if @nid is not in state.
*
- * Return: this @node if it is online, otherwise the closest node by distance
+ * Return: this @node if it is in state, otherwise the closest node by distance
*/
-int numa_map_to_online_node(int node)
+int numa_nearest_node(int node, unsigned int state)
{
int min_dist = INT_MAX, dist, n, min_node;
- if (node == NUMA_NO_NODE || node_online(node))
+ if (state >= NR_NODE_STATES)
+ return -EINVAL;
+
+ if (node == NUMA_NO_NODE || node_state(node, state))
return node;
min_node = node;
- for_each_online_node(n) {
+ for_each_node_state(n, state) {
dist = node_distance(node, n);
if (dist < min_dist) {
min_dist = dist;
@@ -156,7 +161,7 @@ int numa_map_to_online_node(int node)
return min_node;
}
-EXPORT_SYMBOL_GPL(numa_map_to_online_node);
+EXPORT_SYMBOL_GPL(numa_nearest_node);
struct mempolicy *get_task_policy(struct task_struct *p)
{
@@ -263,9 +268,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
{
struct mempolicy *policy;
- pr_debug("setting mode %d flags %d nodes[0] %lx\n",
- mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
-
if (mode == MPOL_DEFAULT) {
if (nodes && !nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
@@ -293,6 +295,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
return ERR_PTR(-EINVAL);
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
+
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!policy)
return ERR_PTR(-ENOMEM);
@@ -305,11 +308,11 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
}
/* Slow path of a mpol destructor. */
-void __mpol_put(struct mempolicy *p)
+void __mpol_put(struct mempolicy *pol)
{
- if (!atomic_dec_and_test(&p->refcnt))
+ if (!atomic_dec_and_test(&pol->refcnt))
return;
- kmem_cache_free(policy_cache, p);
+ kmem_cache_free(policy_cache, pol);
}
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
@@ -366,7 +369,6 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
*
* Called with task's alloc_lock held.
*/
-
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
mpol_rebind_policy(tsk->mempolicy, new);
@@ -377,7 +379,6 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
*
* Call holding a reference to mm. Takes mm->mmap_lock during call.
*/
-
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
struct vm_area_struct *vma;
@@ -416,8 +417,25 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
},
};
-static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
+static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
unsigned long flags);
+static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
+ pgoff_t ilx, int *nid);
+
+static bool strictly_unmovable(unsigned long flags)
+{
+ /*
+ * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
+ * if any misplaced page is found.
+ */
+ return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
+ MPOL_MF_STRICT;
+}
+
+struct migration_mpol { /* for alloc_migration_target_by_mpol() */
+ struct mempolicy *pol;
+ pgoff_t ilx;
+};
struct queue_pages {
struct list_head *pagelist;
@@ -426,6 +444,8 @@ struct queue_pages {
unsigned long start;
unsigned long end;
struct vm_area_struct *first;
+ struct folio *large; /* note last large folio encountered */
+ long nr_failed; /* could not be isolated at this time */
};
/*
@@ -443,63 +463,37 @@ static inline bool queue_folio_required(struct folio *folio,
return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}
-/*
- * queue_folios_pmd() has three possible return values:
- * 0 - folios are placed on the right node or queued successfully, or
- * special page is met, i.e. huge zero page.
- * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
- * specified.
- * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
- * existing folio was already on a node that does not follow the
- * policy.
- */
-static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
- unsigned long end, struct mm_walk *walk)
- __releases(ptl)
+static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
{
- int ret = 0;
struct folio *folio;
struct queue_pages *qp = walk->private;
- unsigned long flags;
if (unlikely(is_pmd_migration_entry(*pmd))) {
- ret = -EIO;
- goto unlock;
+ qp->nr_failed++;
+ return;
}
folio = pfn_folio(pmd_pfn(*pmd));
if (is_huge_zero_page(&folio->page)) {
walk->action = ACTION_CONTINUE;
- goto unlock;
+ return;
}
if (!queue_folio_required(folio, qp))
- goto unlock;
-
- flags = qp->flags;
- /* go to folio migration */
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
- if (!vma_migratable(walk->vma) ||
- migrate_folio_add(folio, qp->pagelist, flags)) {
- ret = 1;
- goto unlock;
- }
- } else
- ret = -EIO;
-unlock:
- spin_unlock(ptl);
- return ret;
+ return;
+ if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
+ !vma_migratable(walk->vma) ||
+ !migrate_folio_add(folio, qp->pagelist, qp->flags))
+ qp->nr_failed++;
}
/*
- * Scan through pages checking if pages follow certain conditions,
- * and move them to the pagelist if they do.
+ * Scan through folios, checking if they satisfy the required conditions,
+ * moving them from LRU to local pagelist for migration if they do (or not).
*
- * queue_folios_pte_range() has three possible return values:
- * 0 - folios are placed on the right node or queued successfully, or
- * special page is met, i.e. zero page.
- * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
- * specified.
- * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already
- * on a node that does not follow the policy.
+ * queue_folios_pte_range() has two possible return values:
+ * 0 - continue walking to scan for more, even if an existing folio on the
+ * wrong node could not be isolated and queued for migration.
+ * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
+ * and an existing folio was on a node that does not follow the policy.
*/
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
@@ -508,14 +502,16 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
struct folio *folio;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
- bool has_unmovable = false;
pte_t *pte, *mapped_pte;
pte_t ptent;
spinlock_t *ptl;
ptl = pmd_trans_huge_lock(pmd, vma);
- if (ptl)
- return queue_folios_pmd(pmd, ptl, addr, end, walk);
+ if (ptl) {
+ queue_folios_pmd(pmd, walk);
+ spin_unlock(ptl);
+ goto out;
+ }
mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
if (!pte) {
@@ -524,8 +520,13 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
}
for (; addr != end; pte++, addr += PAGE_SIZE) {
ptent = ptep_get(pte);
- if (!pte_present(ptent))
+ if (pte_none(ptent))
+ continue;
+ if (!pte_present(ptent)) {
+ if (is_migration_entry(pte_to_swp_entry(ptent)))
+ qp->nr_failed++;
continue;
+ }
folio = vm_normal_folio(vma, addr, ptent);
if (!folio || folio_is_zone_device(folio))
continue;
@@ -537,97 +538,87 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
continue;
if (!queue_folio_required(folio, qp))
continue;
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
- /* MPOL_MF_STRICT must be specified if we get here */
- if (!vma_migratable(vma)) {
- has_unmovable = true;
- break;
- }
-
+ if (folio_test_large(folio)) {
/*
- * Do not abort immediately since there may be
- * temporary off LRU pages in the range. Still
- * need migrate other LRU pages.
+ * A large folio can only be isolated from LRU once,
+ * but may be mapped by many PTEs (and Copy-On-Write may
+ * intersperse PTEs of other, order 0, folios). This is
+ * a common case, so don't mistake it for failure (but
+ * there can be other cases of multi-mapped pages which
+ * this quick check does not help to filter out - and a
+ * search of the pagelist might grow to be prohibitive).
+ *
+ * migrate_pages(&pagelist) returns nr_failed folios, so
+ * check "large" now so that queue_pages_range() returns
+ * a comparable nr_failed folios. This does imply that
+ * if folio could not be isolated for some racy reason
+ * at its first PTE, later PTEs will not give it another
+ * chance of isolation; but keeps the accounting simple.
*/
- if (migrate_folio_add(folio, qp->pagelist, flags))
- has_unmovable = true;
- } else
- break;
+ if (folio == qp->large)
+ continue;
+ qp->large = folio;
+ }
+ if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
+ !vma_migratable(vma) ||
+ !migrate_folio_add(folio, qp->pagelist, flags)) {
+ qp->nr_failed++;
+ if (strictly_unmovable(flags))
+ break;
+ }
}
pte_unmap_unlock(mapped_pte, ptl);
cond_resched();
-
- if (has_unmovable)
- return 1;
-
- return addr != end ? -EIO : 0;
+out:
+ if (qp->nr_failed && strictly_unmovable(flags))
+ return -EIO;
+ return 0;
}
static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- int ret = 0;
#ifdef CONFIG_HUGETLB_PAGE
struct queue_pages *qp = walk->private;
- unsigned long flags = (qp->flags & MPOL_MF_VALID);
+ unsigned long flags = qp->flags;
struct folio *folio;
spinlock_t *ptl;
pte_t entry;
ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
entry = huge_ptep_get(pte);
- if (!pte_present(entry))
+ if (!pte_present(entry)) {
+ if (unlikely(is_hugetlb_entry_migration(entry)))
+ qp->nr_failed++;
goto unlock;
+ }
folio = pfn_folio(pte_pfn(entry));
if (!queue_folio_required(folio, qp))
goto unlock;
-
- if (flags == MPOL_MF_STRICT) {
- /*
- * STRICT alone means only detecting misplaced folio and no
- * need to further check other vma.
- */
- ret = -EIO;
- goto unlock;
- }
-
- if (!vma_migratable(walk->vma)) {
- /*
- * Must be STRICT with MOVE*, otherwise .test_walk() have
- * stopped walking current vma.
- * Detecting misplaced folio but allow migrating folios which
- * have been queued.
- */
- ret = 1;
+ if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
+ !vma_migratable(walk->vma)) {
+ qp->nr_failed++;
goto unlock;
}
-
/*
- * With MPOL_MF_MOVE, we try to migrate only unshared folios. If it
- * is shared it is likely not worth migrating.
+ * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
+ * Choosing not to migrate a shared folio is not counted as a failure.
*
* To check if the folio is shared, ideally we want to make sure
* every page is mapped to the same process. Doing that is very
- * expensive, so check the estimated mapcount of the folio instead.
+ * expensive, so check the estimated sharers of the folio instead.
*/
- if (flags & (MPOL_MF_MOVE_ALL) ||
- (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 &&
- !hugetlb_pmd_shared(pte))) {
- if (!isolate_hugetlb(folio, qp->pagelist) &&
- (flags & MPOL_MF_STRICT))
- /*
- * Failed to isolate folio but allow migrating pages
- * which have been queued.
- */
- ret = 1;
- }
+ if ((flags & MPOL_MF_MOVE_ALL) ||
+ (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte)))
+ if (!isolate_hugetlb(folio, qp->pagelist))
+ qp->nr_failed++;
unlock:
spin_unlock(ptl);
-#else
- BUG();
+ if (qp->nr_failed && strictly_unmovable(flags))
+ return -EIO;
#endif
- return ret;
+ return 0;
}
#ifdef CONFIG_NUMA_BALANCING
@@ -656,12 +647,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
return nr_updated;
}
-#else
-static unsigned long change_prot_numa(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
-{
- return 0;
-}
#endif /* CONFIG_NUMA_BALANCING */
static int queue_pages_test_walk(unsigned long start, unsigned long end,
@@ -700,16 +685,11 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
if (endvma > end)
endvma = end;
- if (flags & MPOL_MF_LAZY) {
- /* Similar to task_numa_work, skip inaccessible VMAs */
- if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
- !(vma->vm_flags & VM_MIXEDMAP))
- change_prot_numa(vma, start, endvma);
- return 1;
- }
-
- /* queue pages from current vma */
- if (flags & MPOL_MF_VALID)
+ /*
+ * Check page nodes, and queue pages to move, in the current vma.
+ * But if no moving, and no strict checking, the scan can be skipped.
+ */
+ if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
return 0;
return 1;
}
@@ -731,22 +711,21 @@ static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
/*
* Walk through page tables and collect pages to be migrated.
*
- * If pages found in a given range are on a set of nodes (determined by
- * @nodes and @flags,) it's isolated and queued to the pagelist which is
- * passed via @private.
+ * If pages found in a given range are not on the required set of @nodes,
+ * and migration is allowed, they are isolated and queued to @pagelist.
*
- * queue_pages_range() has three possible return values:
- * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
- * specified.
- * 0 - queue pages successfully or no misplaced page.
- * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
- * memory range specified by nodemask and maxnode points outside
- * your accessible address space (-EFAULT)
+ * queue_pages_range() may return:
+ * 0 - all pages already on the right node, or successfully queued for moving
+ * (or neither strict checking nor moving requested: only range checking).
+ * >0 - this number of misplaced folios could not be queued for moving
+ * (a hugetlbfs page or a transparent huge page being counted as 1).
+ * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
+ * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
*/
-static int
+static long
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
nodemask_t *nodes, unsigned long flags,
- struct list_head *pagelist, bool lock_vma)
+ struct list_head *pagelist)
{
int err;
struct queue_pages qp = {
@@ -757,7 +736,7 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
.end = end,
.first = NULL,
};
- const struct mm_walk_ops *ops = lock_vma ?
+ const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
err = walk_page_range(mm, start, end, ops, &qp);
@@ -766,7 +745,7 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
/* whole range in hole */
err = -EFAULT;
- return err;
+ return err ? : qp.nr_failed;
}
/*
@@ -774,7 +753,7 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
* This must be called with the mmap_lock held for writing.
*/
static int vma_replace_policy(struct vm_area_struct *vma,
- struct mempolicy *pol)
+ struct mempolicy *pol)
{
int err;
struct mempolicy *old;
@@ -782,11 +761,6 @@ static int vma_replace_policy(struct vm_area_struct *vma,
vma_assert_write_locked(vma);
- pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
- vma->vm_start, vma->vm_end, vma->vm_pgoff,
- vma->vm_ops, vma->vm_file,
- vma->vm_ops ? vma->vm_ops->set_policy : NULL);
-
new = mpol_dup(pol);
if (IS_ERR(new))
return PTR_ERR(new);
@@ -812,10 +786,7 @@ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
{
- struct vm_area_struct *merged;
unsigned long vmstart, vmend;
- pgoff_t pgoff;
- int err;
vmend = min(end, vma->vm_end);
if (start > vma->vm_start) {
@@ -825,31 +796,14 @@ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
vmstart = vma->vm_start;
}
- if (mpol_equal(vma_policy(vma), new_pol)) {
+ if (mpol_equal(vma->vm_policy, new_pol)) {
*prev = vma;
return 0;
}
- pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT);
- merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, new_pol,
- vma->vm_userfaultfd_ctx, anon_vma_name(vma));
- if (merged) {
- *prev = merged;
- return vma_replace_policy(merged, new_pol);
- }
-
- if (vma->vm_start != vmstart) {
- err = split_vma(vmi, vma, vmstart, 1);
- if (err)
- return err;
- }
-
- if (vma->vm_end != vmend) {
- err = split_vma(vmi, vma, vmend, 0);
- if (err)
- return err;
- }
+ vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
*prev = vma;
return vma_replace_policy(vma, new_pol);
@@ -897,18 +851,18 @@ out:
*
* Called with task's alloc_lock held
*/
-static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
+static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
{
nodes_clear(*nodes);
- if (p == &default_policy)
+ if (pol == &default_policy)
return;
- switch (p->mode) {
+ switch (pol->mode) {
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
- *nodes = p->nodes;
+ *nodes = pol->nodes;
break;
case MPOL_LOCAL:
/* return empty node mask for local allocation */
@@ -955,6 +909,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
}
if (flags & MPOL_F_ADDR) {
+ pgoff_t ilx; /* ignored here */
/*
* Do NOT fall back to task policy if the
* vma/shared policy at addr is NULL. We
@@ -966,10 +921,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
mmap_read_unlock(mm);
return -EFAULT;
}
- if (vma->vm_ops && vma->vm_ops->get_policy)
- pol = vma->vm_ops->get_policy(vma, addr);
- else
- pol = vma->vm_policy;
+ pol = __get_vma_policy(vma, addr, &ilx);
} else if (addr)
return -EINVAL;
@@ -1029,16 +981,16 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
}
#ifdef CONFIG_MIGRATION
-static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
+static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
unsigned long flags)
{
/*
- * We try to migrate only unshared folios. If it is shared it
- * is likely not worth migrating.
+ * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
+ * Choosing not to migrate a shared folio is not counted as a failure.
*
* To check if the folio is shared, ideally we want to make sure
* every page is mapped to the same process. Doing that is very
- * expensive, so check the estimated mapcount of the folio instead.
+ * expensive, so check the estimated sharers of the folio instead.
*/
if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
if (folio_isolate_lru(folio)) {
@@ -1046,32 +998,31 @@ static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
node_stat_mod_folio(folio,
NR_ISOLATED_ANON + folio_is_file_lru(folio),
folio_nr_pages(folio));
- } else if (flags & MPOL_MF_STRICT) {
+ } else {
/*
* Non-movable folio may reach here. And, there may be
* temporary off LRU folios or non-LRU movable folios.
* Treat them as unmovable folios since they can't be
- * isolated, so they can't be moved at the moment. It
- * should return -EIO for this case too.
+ * isolated, so they can't be moved at the moment.
*/
- return -EIO;
+ return false;
}
}
-
- return 0;
+ return true;
}
/*
* Migrate pages from one node to a target node.
* Returns error or the number of pages not migrated.
*/
-static int migrate_to_node(struct mm_struct *mm, int source, int dest,
- int flags)
+static long migrate_to_node(struct mm_struct *mm, int source, int dest,
+ int flags)
{
nodemask_t nmask;
struct vm_area_struct *vma;
LIST_HEAD(pagelist);
- int err = 0;
+ long nr_failed;
+ long err = 0;
struct migration_target_control mtc = {
.nid = dest,
.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
@@ -1080,23 +1031,30 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
nodes_clear(nmask);
node_set(source, nmask);
+ VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
+
+ mmap_read_lock(mm);
+ vma = find_vma(mm, 0);
+
/*
- * This does not "check" the range but isolates all pages that
+ * This does not migrate the range, but isolates all pages that
* need migration. Between passing in the full user address
- * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
+ * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
+ * but passes back the count of pages which could not be isolated.
*/
- vma = find_vma(mm, 0);
- VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
- queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
- flags | MPOL_MF_DISCONTIG_OK, &pagelist, false);
+ nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
+ flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+ mmap_read_unlock(mm);
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, alloc_migration_target, NULL,
- (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
if (err)
putback_movable_pages(&pagelist);
}
+ if (err >= 0)
+ err += nr_failed;
return err;
}
@@ -1109,14 +1067,12 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to, int flags)
{
- int busy = 0;
- int err = 0;
+ long nr_failed = 0;
+ long err = 0;
nodemask_t tmp;
lru_cache_disable();
- mmap_read_lock(mm);
-
/*
* Find a 'source' bit set in 'tmp' whose corresponding 'dest'
* bit in 'to' is not also set in 'tmp'. Clear the found 'source'
@@ -1192,59 +1148,58 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
node_clear(source, tmp);
err = migrate_to_node(mm, source, dest, flags);
if (err > 0)
- busy += err;
+ nr_failed += err;
if (err < 0)
break;
}
- mmap_read_unlock(mm);
lru_cache_enable();
if (err < 0)
return err;
- return busy;
-
+ return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
}
/*
- * Allocate a new page for page migration based on vma policy.
- * Start by assuming the page is mapped by the same vma as contains @start.
- * Search forward from there, if not. N.B., this assumes that the
- * list of pages handed to migrate_pages()--which is how we get here--
- * is in virtual address order.
+ * Allocate a new folio for page migration, according to NUMA mempolicy.
*/
-static struct folio *new_folio(struct folio *src, unsigned long start)
+static struct folio *alloc_migration_target_by_mpol(struct folio *src,
+ unsigned long private)
{
- struct vm_area_struct *vma;
- unsigned long address;
- VMA_ITERATOR(vmi, current->mm, start);
- gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
+ struct migration_mpol *mmpol = (struct migration_mpol *)private;
+ struct mempolicy *pol = mmpol->pol;
+ pgoff_t ilx = mmpol->ilx;
+ struct page *page;
+ unsigned int order;
+ int nid = numa_node_id();
+ gfp_t gfp;
- for_each_vma(vmi, vma) {
- address = page_address_in_vma(&src->page, vma);
- if (address != -EFAULT)
- break;
- }
+ order = folio_order(src);
+ ilx += src->index >> order;
if (folio_test_hugetlb(src)) {
- return alloc_hugetlb_folio_vma(folio_hstate(src),
- vma, address);
+ nodemask_t *nodemask;
+ struct hstate *h;
+
+ h = folio_hstate(src);
+ gfp = htlb_alloc_mask(h);
+ nodemask = policy_nodemask(gfp, pol, ilx, &nid);
+ return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp);
}
if (folio_test_large(src))
gfp = GFP_TRANSHUGE;
+ else
+ gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
- /*
- * if !vma, vma_alloc_folio() will use task or system default policy
- */
- return vma_alloc_folio(gfp, folio_order(src), vma, address,
- folio_test_large(src));
+ page = alloc_pages_mpol(gfp, order, pol, ilx, nid);
+ return page_rmappable_folio(page);
}
#else
-static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
+static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
unsigned long flags)
{
- return -EIO;
+ return false;
}
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
@@ -1253,7 +1208,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
return -ENOSYS;
}
-static struct folio *new_folio(struct folio *src, unsigned long start)
+static struct folio *alloc_migration_target_by_mpol(struct folio *src,
+ unsigned long private)
{
return NULL;
}
@@ -1266,10 +1222,11 @@ static long do_mbind(unsigned long start, unsigned long len,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
struct vma_iterator vmi;
+ struct migration_mpol mmpol;
struct mempolicy *new;
unsigned long end;
- int err;
- int ret;
+ long err;
+ long nr_failed;
LIST_HEAD(pagelist);
if (flags & ~(unsigned long)MPOL_MF_VALID)
@@ -1295,9 +1252,6 @@ static long do_mbind(unsigned long start, unsigned long len,
if (IS_ERR(new))
return PTR_ERR(new);
- if (flags & MPOL_MF_LAZY)
- new->flags |= MPOL_F_MOF;
-
/*
* If we are using the default policy then operation
* on discontinuous address spaces is okay after all
@@ -1305,14 +1259,8 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!new)
flags |= MPOL_MF_DISCONTIG_OK;
- pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
- start, start + len, mode, mode_flags,
- nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
-
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
lru_cache_disable();
- }
{
NODEMASK_SCRATCH(scratch);
if (scratch) {
@@ -1328,45 +1276,81 @@ static long do_mbind(unsigned long start, unsigned long len,
goto mpol_out;
/*
- * Lock the VMAs before scanning for pages to migrate, to ensure we don't
- * miss a concurrently inserted page.
+ * Lock the VMAs before scanning for pages to migrate,
+ * to ensure we don't miss a concurrently inserted page.
*/
- ret = queue_pages_range(mm, start, end, nmask,
- flags | MPOL_MF_INVERT, &pagelist, true);
-
- if (ret < 0) {
- err = ret;
- goto up_out;
- }
+ nr_failed = queue_pages_range(mm, start, end, nmask,
+ flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);
- vma_iter_init(&vmi, mm, start);
- prev = vma_prev(&vmi);
- for_each_vma_range(vmi, vma, end) {
- err = mbind_range(&vmi, vma, &prev, start, end, new);
- if (err)
- break;
+ if (nr_failed < 0) {
+ err = nr_failed;
+ nr_failed = 0;
+ } else {
+ vma_iter_init(&vmi, mm, start);
+ prev = vma_prev(&vmi);
+ for_each_vma_range(vmi, vma, end) {
+ err = mbind_range(&vmi, vma, &prev, start, end, new);
+ if (err)
+ break;
+ }
}
- if (!err) {
- int nr_failed = 0;
-
- if (!list_empty(&pagelist)) {
- WARN_ON_ONCE(flags & MPOL_MF_LAZY);
- nr_failed = migrate_pages(&pagelist, new_folio, NULL,
- start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
- if (nr_failed)
- putback_movable_pages(&pagelist);
+ if (!err && !list_empty(&pagelist)) {
+ /* Convert MPOL_DEFAULT's NULL to task or default policy */
+ if (!new) {
+ new = get_task_policy(current);
+ mpol_get(new);
}
+ mmpol.pol = new;
+ mmpol.ilx = 0;
- if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
- err = -EIO;
- } else {
-up_out:
- if (!list_empty(&pagelist))
- putback_movable_pages(&pagelist);
+ /*
+ * In the interleaved case, attempt to allocate on exactly the
+ * targeted nodes, for the first VMA to be migrated; for later
+ * VMAs, the nodes will still be interleaved from the targeted
+ * nodemask, but one by one may be selected differently.
+ */
+ if (new->mode == MPOL_INTERLEAVE) {
+ struct page *page;
+ unsigned int order;
+ unsigned long addr = -EFAULT;
+
+ list_for_each_entry(page, &pagelist, lru) {
+ if (!PageKsm(page))
+ break;
+ }
+ if (!list_entry_is_head(page, &pagelist, lru)) {
+ vma_iter_init(&vmi, mm, start);
+ for_each_vma_range(vmi, vma, end) {
+ addr = page_address_in_vma(page, vma);
+ if (addr != -EFAULT)
+ break;
+ }
+ }
+ if (addr != -EFAULT) {
+ order = compound_order(page);
+ /* We already know the pol, but not the ilx */
+ mpol_cond_put(get_vma_policy(vma, addr, order,
+ &mmpol.ilx));
+ /* Set base from which to increment by index */
+ mmpol.ilx -= page->index >> order;
+ }
+ }
}
mmap_write_unlock(mm);
+
+ if (!err && !list_empty(&pagelist)) {
+ nr_failed |= migrate_pages(&pagelist,
+ alloc_migration_target_by_mpol, NULL,
+ (unsigned long)&mmpol, MIGRATE_SYNC,
+ MR_MEMPOLICY_MBIND, NULL);
+ }
+
+ if (nr_failed && (flags & MPOL_MF_STRICT))
+ err = -EIO;
+ if (!list_empty(&pagelist))
+ putback_movable_pages(&pagelist);
mpol_out:
mpol_put(new);
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
@@ -1544,8 +1528,10 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
* the home node for vmas we already updated before.
*/
old = vma_policy(vma);
- if (!old)
+ if (!old) {
+ prev = vma;
continue;
+ }
if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
err = -EOPNOTSUPP;
break;
@@ -1685,7 +1671,6 @@ out:
out_put:
put_task_struct(task);
goto out;
-
}
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
@@ -1695,7 +1680,6 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}
-
/* Retrieve NUMA policy */
static int kernel_get_mempolicy(int __user *policy,
unsigned long __user *nmask,
@@ -1762,34 +1746,19 @@ bool vma_migratable(struct vm_area_struct *vma)
}
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
- unsigned long addr)
+ unsigned long addr, pgoff_t *ilx)
{
- struct mempolicy *pol = NULL;
-
- if (vma) {
- if (vma->vm_ops && vma->vm_ops->get_policy) {
- pol = vma->vm_ops->get_policy(vma, addr);
- } else if (vma->vm_policy) {
- pol = vma->vm_policy;
-
- /*
- * shmem_alloc_page() passes MPOL_F_SHARED policy with
- * a pseudo vma whose vma->vm_ops=NULL. Take a reference
- * count on these policies which will be dropped by
- * mpol_cond_put() later
- */
- if (mpol_needs_cond_ref(pol))
- mpol_get(pol);
- }
- }
-
- return pol;
+ *ilx = 0;
+ return (vma->vm_ops && vma->vm_ops->get_policy) ?
+ vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
}
/*
- * get_vma_policy(@vma, @addr)
+ * get_vma_policy(@vma, @addr, @order, @ilx)
* @vma: virtual memory area whose policy is sought
* @addr: address in @vma for shared policy lookup
+ * @order: 0, or appropriate huge_page_order for interleaving
+ * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE
*
* Returns effective policy for a VMA at specified address.
* Falls back to current->mempolicy or system default policy, as necessary.
@@ -1798,14 +1767,18 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
* freeing by another task. It is the caller's responsibility to free the
* extra reference for shared policies.
*/
-static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
- unsigned long addr)
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+ unsigned long addr, int order, pgoff_t *ilx)
{
- struct mempolicy *pol = __get_vma_policy(vma, addr);
+ struct mempolicy *pol;
+ pol = __get_vma_policy(vma, addr, ilx);
if (!pol)
pol = get_task_policy(current);
-
+ if (pol->mode == MPOL_INTERLEAVE) {
+ *ilx += vma->vm_pgoff >> order;
+ *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
+ }
return pol;
}
@@ -1815,8 +1788,9 @@ bool vma_policy_mof(struct vm_area_struct *vma)
if (vma->vm_ops && vma->vm_ops->get_policy) {
bool ret = false;
+ pgoff_t ilx; /* ignored here */
- pol = vma->vm_ops->get_policy(vma, vma->vm_start);
+ pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
if (pol && (pol->flags & MPOL_F_MOF))
ret = true;
mpol_cond_put(pol);
@@ -1851,64 +1825,15 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
return zone >= dynamic_policy_zone;
}
-/*
- * Return a nodemask representing a mempolicy for filtering nodes for
- * page allocation
- */
-nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
-{
- int mode = policy->mode;
-
- /* Lower zones don't get a nodemask applied for MPOL_BIND */
- if (unlikely(mode == MPOL_BIND) &&
- apply_policy_zone(policy, gfp_zone(gfp)) &&
- cpuset_nodemask_valid_mems_allowed(&policy->nodes))
- return &policy->nodes;
-
- if (mode == MPOL_PREFERRED_MANY)
- return &policy->nodes;
-
- return NULL;
-}
-
-/*
- * Return the preferred node id for 'prefer' mempolicy, and return
- * the given id for all other policies.
- *
- * policy_node() is always coupled with policy_nodemask(), which
- * secures the nodemask limit for 'bind' and 'prefer-many' policy.
- */
-static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
-{
- if (policy->mode == MPOL_PREFERRED) {
- nd = first_node(policy->nodes);
- } else {
- /*
- * __GFP_THISNODE shouldn't even be used with the bind policy
- * because we might easily break the expectation to stay on the
- * requested node and not break the policy.
- */
- WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
- }
-
- if ((policy->mode == MPOL_BIND ||
- policy->mode == MPOL_PREFERRED_MANY) &&
- policy->home_node != NUMA_NO_NODE)
- return policy->home_node;
-
- return nd;
-}
-
/* Do dynamic interleaving for a process */
-static unsigned interleave_nodes(struct mempolicy *policy)
+static unsigned int interleave_nodes(struct mempolicy *policy)
{
- unsigned next;
- struct task_struct *me = current;
+ unsigned int nid;
- next = next_node_in(me->il_prev, policy->nodes);
- if (next < MAX_NUMNODES)
- me->il_prev = next;
- return next;
+ nid = next_node_in(current->il_prev, policy->nodes);
+ if (nid < MAX_NUMNODES)
+ current->il_prev = nid;
+ return nid;
}
/*
@@ -1959,11 +1884,11 @@ unsigned int mempolicy_slab_node(void)
}
/*
- * Do static interleaving for a VMA with known offset @n. Returns the n'th
- * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
- * number of present nodes.
+ * Do static interleaving for interleave index @ilx. Returns the ilx'th
+ * node in pol->nodes (starting from ilx=0), wrapping around if ilx
+ * exceeds the number of present nodes.
*/
-static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
+static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
nodemask_t nodemask = pol->nodes;
unsigned int target, nnodes;
@@ -1981,33 +1906,54 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
nnodes = nodes_weight(nodemask);
if (!nnodes)
return numa_node_id();
- target = (unsigned int)n % nnodes;
+ target = ilx % nnodes;
nid = first_node(nodemask);
for (i = 0; i < target; i++)
nid = next_node(nid, nodemask);
return nid;
}
-/* Determine a node number for interleave */
-static inline unsigned interleave_nid(struct mempolicy *pol,
- struct vm_area_struct *vma, unsigned long addr, int shift)
+/*
+ * Return a nodemask representing a mempolicy for filtering nodes for
+ * page allocation, together with preferred node id (or the input node id).
+ */
+static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
+ pgoff_t ilx, int *nid)
{
- if (vma) {
- unsigned long off;
+ nodemask_t *nodemask = NULL;
+ switch (pol->mode) {
+ case MPOL_PREFERRED:
+ /* Override input node id */
+ *nid = first_node(pol->nodes);
+ break;
+ case MPOL_PREFERRED_MANY:
+ nodemask = &pol->nodes;
+ if (pol->home_node != NUMA_NO_NODE)
+ *nid = pol->home_node;
+ break;
+ case MPOL_BIND:
+ /* Restrict to nodemask (but not on lower zones) */
+ if (apply_policy_zone(pol, gfp_zone(gfp)) &&
+ cpuset_nodemask_valid_mems_allowed(&pol->nodes))
+ nodemask = &pol->nodes;
+ if (pol->home_node != NUMA_NO_NODE)
+ *nid = pol->home_node;
/*
- * for small pages, there is no difference between
- * shift and PAGE_SHIFT, so the bit-shift is safe.
- * for huge pages, since vm_pgoff is in units of small
- * pages, we need to shift off the always 0 bits to get
- * a useful offset.
+ * __GFP_THISNODE shouldn't even be used with the bind policy
+ * because we might easily break the expectation to stay on the
+ * requested node and not break the policy.
*/
- BUG_ON(shift < PAGE_SHIFT);
- off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
- off += (addr - vma->vm_start) >> shift;
- return offset_il_node(pol, off);
- } else
- return interleave_nodes(pol);
+ WARN_ON_ONCE(gfp & __GFP_THISNODE);
+ break;
+ case MPOL_INTERLEAVE:
+ /* Override input node id */
+ *nid = (ilx == NO_INTERLEAVE_INDEX) ?
+ interleave_nodes(pol) : interleave_nid(pol, ilx);
+ break;
+ }
+
+ return nodemask;
}
#ifdef CONFIG_HUGETLBFS
@@ -2023,27 +1969,16 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
* to the struct mempolicy for conditional unref after allocation.
* If the effective policy is 'bind' or 'prefer-many', returns a pointer
* to the mempolicy's @nodemask for filtering the zonelist.
- *
- * Must be protected by read_mems_allowed_begin()
*/
int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
- struct mempolicy **mpol, nodemask_t **nodemask)
+ struct mempolicy **mpol, nodemask_t **nodemask)
{
+ pgoff_t ilx;
int nid;
- int mode;
- *mpol = get_vma_policy(vma, addr);
- *nodemask = NULL;
- mode = (*mpol)->mode;
-
- if (unlikely(mode == MPOL_INTERLEAVE)) {
- nid = interleave_nid(*mpol, vma, addr,
- huge_page_shift(hstate_vma(vma)));
- } else {
- nid = policy_node(gfp_flags, *mpol, numa_node_id());
- if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
- *nodemask = &(*mpol)->nodes;
- }
+ nid = numa_node_id();
+ *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
+ *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
return nid;
}
@@ -2121,27 +2056,8 @@ bool mempolicy_in_oom_domain(struct task_struct *tsk,
return ret;
}
-/* Allocate a page in interleaved policy.
- Own path because it needs to do special accounting. */
-static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
- unsigned nid)
-{
- struct page *page;
-
- page = __alloc_pages(gfp, order, nid, NULL);
- /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
- if (!static_branch_likely(&vm_numa_stat_key))
- return page;
- if (page && page_to_nid(page) == nid) {
- preempt_disable();
- __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
- preempt_enable();
- }
- return page;
-}
-
static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
- int nid, struct mempolicy *pol)
+ int nid, nodemask_t *nodemask)
{
struct page *page;
gfp_t preferred_gfp;
@@ -2154,7 +2070,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
*/
preferred_gfp = gfp | __GFP_NOWARN;
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
- page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
+ page = __alloc_pages(preferred_gfp, order, nid, nodemask);
if (!page)
page = __alloc_pages(gfp, order, nid, NULL);
@@ -2162,61 +2078,29 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
}
/**
- * vma_alloc_folio - Allocate a folio for a VMA.
+ * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
* @gfp: GFP flags.
- * @order: Order of the folio.
- * @vma: Pointer to VMA or NULL if not available.
- * @addr: Virtual address of the allocation. Must be inside @vma.
- * @hugepage: For hugepages try only the preferred node if possible.
+ * @order: Order of the page allocation.
+ * @pol: Pointer to the NUMA mempolicy.
+ * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
+ * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
*
- * Allocate a folio for a specific address in @vma, using the appropriate
- * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock
- * of the mm_struct of the VMA to prevent it from going away. Should be
- * used for all allocations for folios that will be mapped into user space.
- *
- * Return: The folio on success or NULL if allocation fails.
+ * Return: The page on success or NULL if allocation fails.
*/
-struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
- unsigned long addr, bool hugepage)
+struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
+ struct mempolicy *pol, pgoff_t ilx, int nid)
{
- struct mempolicy *pol;
- int node = numa_node_id();
- struct folio *folio;
- int preferred_nid;
- nodemask_t *nmask;
-
- pol = get_vma_policy(vma, addr);
-
- if (pol->mode == MPOL_INTERLEAVE) {
- struct page *page;
- unsigned nid;
-
- nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
- mpol_cond_put(pol);
- gfp |= __GFP_COMP;
- page = alloc_page_interleave(gfp, order, nid);
- folio = (struct folio *)page;
- if (folio && order > 1)
- folio_prep_large_rmappable(folio);
- goto out;
- }
+ nodemask_t *nodemask;
+ struct page *page;
- if (pol->mode == MPOL_PREFERRED_MANY) {
- struct page *page;
+ nodemask = policy_nodemask(gfp, pol, ilx, &nid);
- node = policy_node(gfp, pol, node);
- gfp |= __GFP_COMP;
- page = alloc_pages_preferred_many(gfp, order, node, pol);
- mpol_cond_put(pol);
- folio = (struct folio *)page;
- if (folio && order > 1)
- folio_prep_large_rmappable(folio);
- goto out;
- }
-
- if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
- int hpage_node = node;
+ if (pol->mode == MPOL_PREFERRED_MANY)
+ return alloc_pages_preferred_many(gfp, order, nid, nodemask);
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ /* filter "hugepage" allocation, unless from alloc_pages() */
+ order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
/*
* For hugepage allocation and non-interleave policy which
* allows the current node (or other explicitly preferred
@@ -2227,39 +2111,68 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
* If the policy is interleave or does not allow the current
* node in its nodemask, we allocate the standard way.
*/
- if (pol->mode == MPOL_PREFERRED)
- hpage_node = first_node(pol->nodes);
-
- nmask = policy_nodemask(gfp, pol);
- if (!nmask || node_isset(hpage_node, *nmask)) {
- mpol_cond_put(pol);
+ if (pol->mode != MPOL_INTERLEAVE &&
+ (!nodemask || node_isset(nid, *nodemask))) {
/*
* First, try to allocate THP only on local node, but
* don't reclaim unnecessarily, just compact.
*/
- folio = __folio_alloc_node(gfp | __GFP_THISNODE |
- __GFP_NORETRY, order, hpage_node);
-
+ page = __alloc_pages_node(nid,
+ gfp | __GFP_THISNODE | __GFP_NORETRY, order);
+ if (page || !(gfp & __GFP_DIRECT_RECLAIM))
+ return page;
/*
* If hugepage allocations are configured to always
* synchronous compact or the vma has been madvised
* to prefer hugepage backing, retry allowing remote
* memory with both reclaim and compact as well.
*/
- if (!folio && (gfp & __GFP_DIRECT_RECLAIM))
- folio = __folio_alloc(gfp, order, hpage_node,
- nmask);
+ }
+ }
- goto out;
+ page = __alloc_pages(gfp, order, nid, nodemask);
+
+ if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) {
+ /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
+ if (static_branch_likely(&vm_numa_stat_key) &&
+ page_to_nid(page) == nid) {
+ preempt_disable();
+ __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
+ preempt_enable();
}
}
- nmask = policy_nodemask(gfp, pol);
- preferred_nid = policy_node(gfp, pol, node);
- folio = __folio_alloc(gfp, order, preferred_nid, nmask);
+ return page;
+}
+
+/**
+ * vma_alloc_folio - Allocate a folio for a VMA.
+ * @gfp: GFP flags.
+ * @order: Order of the folio.
+ * @vma: Pointer to VMA.
+ * @addr: Virtual address of the allocation. Must be inside @vma.
+ * @hugepage: Unused (was: For hugepages try only preferred node if possible).
+ *
+ * Allocate a folio for a specific address in @vma, using the appropriate
+ * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the
+ * VMA to prevent it from going away. Should be used for all allocations
+ * for folios that will be mapped into user space, excepting hugetlbfs, and
+ * excepting where direct use of alloc_pages_mpol() is more appropriate.
+ *
+ * Return: The folio on success or NULL if allocation fails.
+ */
+struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
+ unsigned long addr, bool hugepage)
+{
+ struct mempolicy *pol;
+ pgoff_t ilx;
+ struct page *page;
+
+ pol = get_vma_policy(vma, addr, order, &ilx);
+ page = alloc_pages_mpol(gfp | __GFP_COMP, order,
+ pol, ilx, numa_node_id());
mpol_cond_put(pol);
-out:
- return folio;
+ return page_rmappable_folio(page);
}
EXPORT_SYMBOL(vma_alloc_folio);
@@ -2277,40 +2190,25 @@ EXPORT_SYMBOL(vma_alloc_folio);
* flags are used.
* Return: The page on success or NULL if allocation fails.
*/
-struct page *alloc_pages(gfp_t gfp, unsigned order)
+struct page *alloc_pages(gfp_t gfp, unsigned int order)
{
struct mempolicy *pol = &default_policy;
- struct page *page;
-
- if (!in_interrupt() && !(gfp & __GFP_THISNODE))
- pol = get_task_policy(current);
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
*/
- if (pol->mode == MPOL_INTERLEAVE)
- page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
- else if (pol->mode == MPOL_PREFERRED_MANY)
- page = alloc_pages_preferred_many(gfp, order,
- policy_node(gfp, pol, numa_node_id()), pol);
- else
- page = __alloc_pages(gfp, order,
- policy_node(gfp, pol, numa_node_id()),
- policy_nodemask(gfp, pol));
+ if (!in_interrupt() && !(gfp & __GFP_THISNODE))
+ pol = get_task_policy(current);
- return page;
+ return alloc_pages_mpol(gfp, order,
+ pol, NO_INTERLEAVE_INDEX, numa_node_id());
}
EXPORT_SYMBOL(alloc_pages);
-struct folio *folio_alloc(gfp_t gfp, unsigned order)
+struct folio *folio_alloc(gfp_t gfp, unsigned int order)
{
- struct page *page = alloc_pages(gfp | __GFP_COMP, order);
- struct folio *folio = (struct folio *)page;
-
- if (folio && order > 1)
- folio_prep_large_rmappable(folio);
- return folio;
+ return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order));
}
EXPORT_SYMBOL(folio_alloc);
@@ -2379,6 +2277,8 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
unsigned long nr_pages, struct page **page_array)
{
struct mempolicy *pol = &default_policy;
+ nodemask_t *nodemask;
+ int nid;
if (!in_interrupt() && !(gfp & __GFP_THISNODE))
pol = get_task_policy(current);
@@ -2391,14 +2291,15 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
return alloc_pages_bulk_array_preferred_many(gfp,
numa_node_id(), pol, nr_pages, page_array);
- return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
- policy_nodemask(gfp, pol), nr_pages, NULL,
- page_array);
+ nid = numa_node_id();
+ nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
+ return __alloc_pages_bulk(gfp, nid, nodemask,
+ nr_pages, NULL, page_array);
}
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
- struct mempolicy *pol = mpol_dup(vma_policy(src));
+ struct mempolicy *pol = mpol_dup(src->vm_policy);
if (IS_ERR(pol))
return PTR_ERR(pol);
@@ -2483,8 +2384,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
* lookup first element intersecting start-end. Caller holds sp->lock for
* reading or for writing
*/
-static struct sp_node *
-sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
+static struct sp_node *sp_lookup(struct shared_policy *sp,
+ pgoff_t start, pgoff_t end)
{
struct rb_node *n = sp->root.rb_node;
@@ -2535,13 +2436,11 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new)
}
rb_link_node(&new->nd, parent, p);
rb_insert_color(&new->nd, &sp->root);
- pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
- new->policy ? new->policy->mode : 0);
}
/* Find shared policy intersecting idx */
-struct mempolicy *
-mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
+struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
+ pgoff_t idx)
{
struct mempolicy *pol = NULL;
struct sp_node *sn;
@@ -2565,39 +2464,38 @@ static void sp_free(struct sp_node *n)
}
/**
- * mpol_misplaced - check whether current page node is valid in policy
+ * mpol_misplaced - check whether current folio node is valid in policy
*
- * @page: page to be checked
- * @vma: vm area where page mapped
- * @addr: virtual address where page mapped
+ * @folio: folio to be checked
+ * @vma: vm area where folio mapped
+ * @addr: virtual address in @vma for shared policy lookup and interleave policy
*
- * Lookup current policy node id for vma,addr and "compare to" page's
+ * Lookup current policy node id for vma,addr and "compare to" folio's
* node id. Policy determination "mimics" alloc_page_vma().
* Called from fault path where we know the vma and faulting address.
*
* Return: NUMA_NO_NODE if the page is in a node that is valid for this
- * policy, or a suitable node ID to allocate a replacement page from.
+ * policy, or a suitable node ID to allocate a replacement folio from.
*/
-int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
+int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
+ unsigned long addr)
{
struct mempolicy *pol;
+ pgoff_t ilx;
struct zoneref *z;
- int curnid = page_to_nid(page);
- unsigned long pgoff;
+ int curnid = folio_nid(folio);
int thiscpu = raw_smp_processor_id();
int thisnid = cpu_to_node(thiscpu);
int polnid = NUMA_NO_NODE;
int ret = NUMA_NO_NODE;
- pol = get_vma_policy(vma, addr);
+ pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
if (!(pol->flags & MPOL_F_MOF))
goto out;
switch (pol->mode) {
case MPOL_INTERLEAVE:
- pgoff = vma->vm_pgoff;
- pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
- polnid = offset_il_node(pol, pgoff);
+ polnid = interleave_nid(pol, ilx);
break;
case MPOL_PREFERRED:
@@ -2638,11 +2536,12 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
BUG();
}
- /* Migrate the page towards the node whose CPU is referencing it */
+ /* Migrate the folio towards the node whose CPU is referencing it */
if (pol->flags & MPOL_F_MORON) {
polnid = thisnid;
- if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
+ if (!should_numa_migrate_memory(current, folio, curnid,
+ thiscpu))
goto out;
}
@@ -2673,7 +2572,6 @@ void mpol_put_task_policy(struct task_struct *task)
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
- pr_debug("deleting %lx-l%lx\n", n->start, n->end);
rb_erase(&n->nd, &sp->root);
sp_free(n);
}
@@ -2708,8 +2606,8 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
}
/* Replace a policy range. */
-static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
- unsigned long end, struct sp_node *new)
+static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
+ pgoff_t end, struct sp_node *new)
{
struct sp_node *n;
struct sp_node *n_new = NULL;
@@ -2792,30 +2690,30 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
rwlock_init(&sp->lock);
if (mpol) {
- struct vm_area_struct pvma;
- struct mempolicy *new;
+ struct sp_node *sn;
+ struct mempolicy *npol;
NODEMASK_SCRATCH(scratch);
if (!scratch)
goto put_mpol;
- /* contextualize the tmpfs mount point mempolicy */
- new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
- if (IS_ERR(new))
+
+ /* contextualize the tmpfs mount point mempolicy to this file */
+ npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
+ if (IS_ERR(npol))
goto free_scratch; /* no valid nodemask intersection */
task_lock(current);
- ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
+ ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
task_unlock(current);
if (ret)
- goto put_new;
-
- /* Create pseudo-vma that contains just the policy */
- vma_init(&pvma, NULL);
- pvma.vm_end = TASK_SIZE; /* policy covers entire file */
- mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
-
-put_new:
- mpol_put(new); /* drop initial ref */
+ goto put_npol;
+
+ /* alloc node covering entire file; adds ref to file's npol */
+ sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
+ if (sn)
+ sp_insert(sp, sn);
+put_npol:
+ mpol_put(npol); /* drop initial ref on file's npol */
free_scratch:
NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
@@ -2823,46 +2721,40 @@ put_mpol:
}
}
-int mpol_set_shared_policy(struct shared_policy *info,
- struct vm_area_struct *vma, struct mempolicy *npol)
+int mpol_set_shared_policy(struct shared_policy *sp,
+ struct vm_area_struct *vma, struct mempolicy *pol)
{
int err;
struct sp_node *new = NULL;
unsigned long sz = vma_pages(vma);
- pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
- vma->vm_pgoff,
- sz, npol ? npol->mode : -1,
- npol ? npol->flags : -1,
- npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
-
- if (npol) {
- new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
+ if (pol) {
+ new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
if (!new)
return -ENOMEM;
}
- err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
+ err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
if (err && new)
sp_free(new);
return err;
}
/* Free a backing policy store on inode delete. */
-void mpol_free_shared_policy(struct shared_policy *p)
+void mpol_free_shared_policy(struct shared_policy *sp)
{
struct sp_node *n;
struct rb_node *next;
- if (!p->root.rb_node)
+ if (!sp->root.rb_node)
return;
- write_lock(&p->lock);
- next = rb_first(&p->root);
+ write_lock(&sp->lock);
+ next = rb_first(&sp->root);
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
- sp_delete(p, n);
+ sp_delete(sp, n);
}
- write_unlock(&p->lock);
+ write_unlock(&sp->lock);
}
#ifdef CONFIG_NUMA_BALANCING
@@ -2912,7 +2804,6 @@ static inline void __init check_numabalancing_enable(void)
}
#endif /* CONFIG_NUMA_BALANCING */
-/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
nodemask_t interleave_nodes;
@@ -2975,7 +2866,6 @@ void numa_default_policy(void)
/*
* Parse and format mempolicy from/to strings
*/
-
static const char * const policy_modes[] =
{
[MPOL_DEFAULT] = "default",
@@ -2986,7 +2876,6 @@ static const char * const policy_modes[] =
[MPOL_PREFERRED_MANY] = "prefer (many)",
};
-
#ifdef CONFIG_TMPFS
/**
* mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
diff --git a/mm/migrate.c b/mm/migrate.c
index b7fa020003f3..35a88334bb3c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -157,8 +157,8 @@ void putback_movable_pages(struct list_head *l)
list_del(&folio->lru);
/*
* We isolated non-lru movable folio so here we can use
- * __PageMovable because LRU folio's mapping cannot have
- * PAGE_MAPPING_MOVABLE.
+ * __folio_test_movable because LRU folio's mapping cannot
+ * have PAGE_MAPPING_MOVABLE.
*/
if (unlikely(__folio_test_movable(folio))) {
VM_BUG_ON_FOLIO(!folio_test_isolated(folio), folio);
@@ -243,15 +243,18 @@ static bool remove_migration_pte(struct folio *folio,
#ifdef CONFIG_HUGETLB_PAGE
if (folio_test_hugetlb(folio)) {
- unsigned int shift = huge_page_shift(hstate_vma(vma));
+ struct hstate *h = hstate_vma(vma);
+ unsigned int shift = huge_page_shift(h);
+ unsigned long psize = huge_page_size(h);
pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
if (folio_test_anon(folio))
- hugepage_add_anon_rmap(new, vma, pvmw.address,
+ hugepage_add_anon_rmap(folio, vma, pvmw.address,
rmap_flags);
else
page_dup_file_rmap(new, true);
- set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
+ set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte,
+ psize);
} else
#endif
{
@@ -521,7 +524,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
int expected_count;
xas_lock_irq(&xas);
- expected_count = 2 + folio_has_private(src);
+ expected_count = folio_expected_refs(mapping, src);
if (!folio_ref_freeze(src, expected_count)) {
xas_unlock_irq(&xas);
return -EAGAIN;
@@ -530,11 +533,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
dst->index = src->index;
dst->mapping = src->mapping;
- folio_get(dst);
+ folio_ref_add(dst, folio_nr_pages(dst));
xas_store(&xas, dst);
- folio_ref_unfreeze(src, expected_count - 1);
+ folio_ref_unfreeze(src, expected_count - folio_nr_pages(src));
xas_unlock_irq(&xas);
@@ -585,20 +588,20 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
* Copy NUMA information to the new page, to prevent over-eager
* future migrations of this same page.
*/
- cpupid = page_cpupid_xchg_last(&folio->page, -1);
+ cpupid = folio_xchg_last_cpupid(folio, -1);
/*
* For memory tiering mode, when migrate between slow and fast
* memory node, reset cpupid, because that is used to record
* page access time in slow memory node.
*/
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) {
- bool f_toptier = node_is_toptier(page_to_nid(&folio->page));
- bool t_toptier = node_is_toptier(page_to_nid(&newfolio->page));
+ bool f_toptier = node_is_toptier(folio_nid(folio));
+ bool t_toptier = node_is_toptier(folio_nid(newfolio));
if (f_toptier != t_toptier)
cpupid = -1;
}
- page_cpupid_xchg_last(&newfolio->page, cpupid);
+ folio_xchg_last_cpupid(newfolio, cpupid);
folio_migrate_ksm(newfolio, folio);
/*
@@ -630,8 +633,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
folio_copy_owner(newfolio, folio);
- if (!folio_test_hugetlb(folio))
- mem_cgroup_migrate(folio, newfolio);
+ mem_cgroup_migrate(folio, newfolio);
}
EXPORT_SYMBOL(folio_migrate_flags);
@@ -943,7 +945,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
enum migrate_mode mode)
{
int rc = -EAGAIN;
- bool is_lru = !__PageMovable(&src->page);
+ bool is_lru = !__folio_test_movable(src);
VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);
@@ -990,7 +992,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
* src is freed; but stats require that PageAnon be left as PageAnon.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- if (__PageMovable(&src->page)) {
+ if (__folio_test_movable(src)) {
VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
/*
@@ -1025,22 +1027,28 @@ union migration_ptr {
struct anon_vma *anon_vma;
struct address_space *mapping;
};
+
+enum {
+ PAGE_WAS_MAPPED = BIT(0),
+ PAGE_WAS_MLOCKED = BIT(1),
+};
+
static void __migrate_folio_record(struct folio *dst,
- unsigned long page_was_mapped,
+ unsigned long old_page_state,
struct anon_vma *anon_vma)
{
union migration_ptr ptr = { .anon_vma = anon_vma };
dst->mapping = ptr.mapping;
- dst->private = (void *)page_was_mapped;
+ dst->private = (void *)old_page_state;
}
static void __migrate_folio_extract(struct folio *dst,
- int *page_was_mappedp,
+ int *old_page_state,
struct anon_vma **anon_vmap)
{
union migration_ptr ptr = { .mapping = dst->mapping };
*anon_vmap = ptr.anon_vma;
- *page_was_mappedp = (unsigned long)dst->private;
+ *old_page_state = (unsigned long)dst->private;
dst->mapping = NULL;
dst->private = NULL;
}
@@ -1082,7 +1090,7 @@ static void migrate_folio_done(struct folio *src,
/*
* Compaction can migrate also non-LRU pages which are
* not accounted to NR_ISOLATED_*. They can be recognized
- * as __PageMovable
+ * as __folio_test_movable
*/
if (likely(!__folio_test_movable(src)))
mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
@@ -1101,9 +1109,9 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
{
struct folio *dst;
int rc = -EAGAIN;
- int page_was_mapped = 0;
+ int old_page_state = 0;
struct anon_vma *anon_vma = NULL;
- bool is_lru = !__PageMovable(&src->page);
+ bool is_lru = !__folio_test_movable(src);
bool locked = false;
bool dst_locked = false;
@@ -1155,6 +1163,8 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
folio_lock(src);
}
locked = true;
+ if (folio_test_mlocked(src))
+ old_page_state |= PAGE_WAS_MLOCKED;
if (folio_test_writeback(src)) {
/*
@@ -1204,7 +1214,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
dst_locked = true;
if (unlikely(!is_lru)) {
- __migrate_folio_record(dst, page_was_mapped, anon_vma);
+ __migrate_folio_record(dst, old_page_state, anon_vma);
return MIGRATEPAGE_UNMAP;
}
@@ -1230,11 +1240,11 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
VM_BUG_ON_FOLIO(folio_test_anon(src) &&
!folio_test_ksm(src) && !anon_vma, src);
try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0);
- page_was_mapped = 1;
+ old_page_state |= PAGE_WAS_MAPPED;
}
if (!folio_mapped(src)) {
- __migrate_folio_record(dst, page_was_mapped, anon_vma);
+ __migrate_folio_record(dst, old_page_state, anon_vma);
return MIGRATEPAGE_UNMAP;
}
@@ -1246,7 +1256,8 @@ out:
if (rc == -EAGAIN)
ret = NULL;
- migrate_folio_undo_src(src, page_was_mapped, anon_vma, locked, ret);
+ migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
+ anon_vma, locked, ret);
migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private);
return rc;
@@ -1259,12 +1270,12 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
struct list_head *ret)
{
int rc;
- int page_was_mapped = 0;
+ int old_page_state = 0;
struct anon_vma *anon_vma = NULL;
- bool is_lru = !__PageMovable(&src->page);
+ bool is_lru = !__folio_test_movable(src);
struct list_head *prev;
- __migrate_folio_extract(dst, &page_was_mapped, &anon_vma);
+ __migrate_folio_extract(dst, &old_page_state, &anon_vma);
prev = dst->lru.prev;
list_del(&dst->lru);
@@ -1285,10 +1296,10 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
* isolated from the unevictable LRU: but this case is the easiest.
*/
folio_add_lru(dst);
- if (page_was_mapped)
+ if (old_page_state & PAGE_WAS_MLOCKED)
lru_add_drain();
- if (page_was_mapped)
+ if (old_page_state & PAGE_WAS_MAPPED)
remove_migration_ptes(src, dst, false);
out_unlock_both:
@@ -1320,11 +1331,12 @@ out:
*/
if (rc == -EAGAIN) {
list_add(&dst->lru, prev);
- __migrate_folio_record(dst, page_was_mapped, anon_vma);
+ __migrate_folio_record(dst, old_page_state, anon_vma);
return rc;
}
- migrate_folio_undo_src(src, page_was_mapped, anon_vma, true, ret);
+ migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
+ anon_vma, true, ret);
migrate_folio_undo_dst(dst, true, put_new_folio, private);
return rc;
@@ -1493,6 +1505,7 @@ struct migrate_pages_stats {
int nr_thp_succeeded; /* THP migrated successfully */
int nr_thp_failed; /* THP failed to be migrated */
int nr_thp_split; /* THP split before migrating */
+ int nr_split; /* Large folio (include THP) split before migrating */
};
/*
@@ -1612,6 +1625,7 @@ static int migrate_pages_batch(struct list_head *from,
int nr_retry_pages = 0;
int pass = 0;
bool is_thp = false;
+ bool is_large = false;
struct folio *folio, *folio2, *dst = NULL, *dst2;
int rc, rc_saved = 0, nr_pages;
LIST_HEAD(unmap_folios);
@@ -1627,7 +1641,8 @@ static int migrate_pages_batch(struct list_head *from,
nr_retry_pages = 0;
list_for_each_entry_safe(folio, folio2, from, lru) {
- is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
+ is_large = folio_test_large(folio);
+ is_thp = is_large && folio_test_pmd_mappable(folio);
nr_pages = folio_nr_pages(folio);
cond_resched();
@@ -1647,6 +1662,7 @@ static int migrate_pages_batch(struct list_head *from,
stats->nr_thp_failed++;
if (!try_split_folio(folio, split_folios)) {
stats->nr_thp_split++;
+ stats->nr_split++;
continue;
}
stats->nr_failed_pages += nr_pages;
@@ -1675,11 +1691,12 @@ static int migrate_pages_batch(struct list_head *from,
nr_failed++;
stats->nr_thp_failed += is_thp;
/* Large folio NUMA faulting doesn't split to retry. */
- if (folio_test_large(folio) && !nosplit) {
+ if (is_large && !nosplit) {
int ret = try_split_folio(folio, split_folios);
if (!ret) {
stats->nr_thp_split += is_thp;
+ stats->nr_split++;
break;
} else if (reason == MR_LONGTERM_PIN &&
ret == -EAGAIN) {
@@ -1792,12 +1809,12 @@ out:
dst = list_first_entry(&dst_folios, struct folio, lru);
dst2 = list_next_entry(dst, lru);
list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
- int page_was_mapped = 0;
+ int old_page_state = 0;
struct anon_vma *anon_vma = NULL;
- __migrate_folio_extract(dst, &page_was_mapped, &anon_vma);
- migrate_folio_undo_src(folio, page_was_mapped, anon_vma,
- true, ret_folios);
+ __migrate_folio_extract(dst, &old_page_state, &anon_vma);
+ migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED,
+ anon_vma, true, ret_folios);
list_del(&dst->lru);
migrate_folio_undo_dst(dst, true, put_new_folio, private);
dst = dst2;
@@ -1825,6 +1842,7 @@ static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio,
stats->nr_succeeded += astats.nr_succeeded;
stats->nr_thp_succeeded += astats.nr_thp_succeeded;
stats->nr_thp_split += astats.nr_thp_split;
+ stats->nr_split += astats.nr_split;
if (rc < 0) {
stats->nr_failed_pages += astats.nr_failed_pages;
stats->nr_thp_failed += astats.nr_thp_failed;
@@ -1832,7 +1850,11 @@ static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio,
return rc;
}
stats->nr_thp_failed += astats.nr_thp_split;
- nr_failed += astats.nr_thp_split;
+ /*
+ * Do not count rc, as pages will be retried below.
+ * Count nr_split only, since it includes nr_thp_split.
+ */
+ nr_failed += astats.nr_split;
/*
* Fall back to migrate all failed folios one by one synchronously. All
* failed folios except split THPs will be retried, so their failure
@@ -1967,7 +1989,8 @@ out:
count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split);
trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages,
stats.nr_thp_succeeded, stats.nr_thp_failed,
- stats.nr_thp_split, mode, reason);
+ stats.nr_thp_split, stats.nr_split, mode,
+ reason);
if (ret_succeeded)
*ret_succeeded = stats.nr_succeeded;
@@ -2026,8 +2049,7 @@ static int store_status(int __user *status, int start, int value, int nr)
return 0;
}
-static int do_move_pages_to_node(struct mm_struct *mm,
- struct list_head *pagelist, int node)
+static int do_move_pages_to_node(struct list_head *pagelist, int node)
{
int err;
struct migration_target_control mtc = {
@@ -2057,8 +2079,8 @@ static int add_page_for_migration(struct mm_struct *mm, const void __user *p,
struct vm_area_struct *vma;
unsigned long addr;
struct page *page;
+ struct folio *folio;
int err;
- bool isolated;
mmap_read_lock(mm);
addr = (unsigned long)untagged_addr_remote(mm, p);
@@ -2079,51 +2101,44 @@ static int add_page_for_migration(struct mm_struct *mm, const void __user *p,
if (!page)
goto out;
- if (is_zone_device_page(page))
- goto out_putpage;
+ folio = page_folio(page);
+ if (folio_is_zone_device(folio))
+ goto out_putfolio;
err = 0;
- if (page_to_nid(page) == node)
- goto out_putpage;
+ if (folio_nid(folio) == node)
+ goto out_putfolio;
err = -EACCES;
if (page_mapcount(page) > 1 && !migrate_all)
- goto out_putpage;
+ goto out_putfolio;
- if (PageHuge(page)) {
- if (PageHead(page)) {
- isolated = isolate_hugetlb(page_folio(page), pagelist);
- err = isolated ? 1 : -EBUSY;
- }
+ err = -EBUSY;
+ if (folio_test_hugetlb(folio)) {
+ if (isolate_hugetlb(folio, pagelist))
+ err = 1;
} else {
- struct page *head;
-
- head = compound_head(page);
- isolated = isolate_lru_page(head);
- if (!isolated) {
- err = -EBUSY;
- goto out_putpage;
- }
+ if (!folio_isolate_lru(folio))
+ goto out_putfolio;
err = 1;
- list_add_tail(&head->lru, pagelist);
- mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON + page_is_file_lru(head),
- thp_nr_pages(head));
+ list_add_tail(&folio->lru, pagelist);
+ node_stat_mod_folio(folio,
+ NR_ISOLATED_ANON + folio_is_file_lru(folio),
+ folio_nr_pages(folio));
}
-out_putpage:
+out_putfolio:
/*
- * Either remove the duplicate refcount from
- * isolate_lru_page() or drop the page ref if it was
- * not isolated.
+ * Either remove the duplicate refcount from folio_isolate_lru()
+ * or drop the folio ref if it was not isolated.
*/
- put_page(page);
+ folio_put(folio);
out:
mmap_read_unlock(mm);
return err;
}
-static int move_pages_and_store_status(struct mm_struct *mm, int node,
+static int move_pages_and_store_status(int node,
struct list_head *pagelist, int __user *status,
int start, int i, unsigned long nr_pages)
{
@@ -2132,7 +2147,7 @@ static int move_pages_and_store_status(struct mm_struct *mm, int node,
if (list_empty(pagelist))
return 0;
- err = do_move_pages_to_node(mm, pagelist, node);
+ err = do_move_pages_to_node(pagelist, node);
if (err) {
/*
* Positive err means the number of failed
@@ -2159,6 +2174,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
const int __user *nodes,
int __user *status, int flags)
{
+ compat_uptr_t __user *compat_pages = (void __user *)pages;
int current_node = NUMA_NO_NODE;
LIST_HEAD(pagelist);
int start, i;
@@ -2171,8 +2187,17 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
int node;
err = -EFAULT;
- if (get_user(p, pages + i))
- goto out_flush;
+ if (in_compat_syscall()) {
+ compat_uptr_t cp;
+
+ if (get_user(cp, compat_pages + i))
+ goto out_flush;
+
+ p = compat_ptr(cp);
+ } else {
+ if (get_user(p, pages + i))
+ goto out_flush;
+ }
if (get_user(node, nodes + i))
goto out_flush;
@@ -2190,7 +2215,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
current_node = node;
start = i;
} else if (node != current_node) {
- err = move_pages_and_store_status(mm, current_node,
+ err = move_pages_and_store_status(current_node,
&pagelist, status, start, i, nr_pages);
if (err)
goto out;
@@ -2225,7 +2250,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
if (err)
goto out_flush;
- err = move_pages_and_store_status(mm, current_node, &pagelist,
+ err = move_pages_and_store_status(current_node, &pagelist,
status, start, i, nr_pages);
if (err) {
/* We have accounted for page i */
@@ -2237,7 +2262,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
}
out_flush:
/* Make sure we do not overwrite the existing error */
- err1 = move_pages_and_store_status(mm, current_node, &pagelist,
+ err1 = move_pages_and_store_status(current_node, &pagelist,
status, start, i, nr_pages);
if (err >= 0)
err = err1;
@@ -2478,16 +2503,9 @@ static struct folio *alloc_misplaced_dst_folio(struct folio *src,
return __folio_alloc_node(gfp, order, nid);
}
-static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+static int numamigrate_isolate_folio(pg_data_t *pgdat, struct folio *folio)
{
- int nr_pages = thp_nr_pages(page);
- int order = compound_order(page);
-
- VM_BUG_ON_PAGE(order && !PageTransHuge(page), page);
-
- /* Do not migrate THP mapped by multiple processes */
- if (PageTransHuge(page) && total_mapcount(page) > 1)
- return 0;
+ int nr_pages = folio_nr_pages(folio);
/* Avoid migrating to a node that is nearly full */
if (!migrate_balanced_pgdat(pgdat, nr_pages)) {
@@ -2499,75 +2517,79 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
if (managed_zone(pgdat->node_zones + z))
break;
}
- wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE);
+ wakeup_kswapd(pgdat->node_zones + z, 0,
+ folio_order(folio), ZONE_MOVABLE);
return 0;
}
- if (!isolate_lru_page(page))
+ if (!folio_isolate_lru(folio))
return 0;
- mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page),
+ node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio),
nr_pages);
/*
- * Isolating the page has taken another reference, so the
- * caller's reference can be safely dropped without the page
+ * Isolating the folio has taken another reference, so the
+ * caller's reference can be safely dropped without the folio
* disappearing underneath us during migration.
*/
- put_page(page);
+ folio_put(folio);
return 1;
}
/*
- * Attempt to migrate a misplaced page to the specified destination
+ * Attempt to migrate a misplaced folio to the specified destination
* node. Caller is expected to have an elevated reference count on
- * the page that will be dropped by this function before returning.
+ * the folio that will be dropped by this function before returning.
*/
-int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
- int node)
+int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
+ int node)
{
pg_data_t *pgdat = NODE_DATA(node);
int isolated;
int nr_remaining;
unsigned int nr_succeeded;
LIST_HEAD(migratepages);
- int nr_pages = thp_nr_pages(page);
+ int nr_pages = folio_nr_pages(folio);
/*
- * Don't migrate file pages that are mapped in multiple processes
+ * Don't migrate file folios that are mapped in multiple processes
* with execute permissions as they are probably shared libraries.
+ * To check if the folio is shared, ideally we want to make sure
+ * every page is mapped to the same process. Doing that is very
+ * expensive, so check the estimated mapcount of the folio instead.
*/
- if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
+ if (folio_estimated_sharers(folio) != 1 && folio_is_file_lru(folio) &&
(vma->vm_flags & VM_EXEC))
goto out;
/*
- * Also do not migrate dirty pages as not all filesystems can move
- * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
+ * Also do not migrate dirty folios as not all filesystems can move
+ * dirty folios in MIGRATE_ASYNC mode which is a waste of cycles.
*/
- if (page_is_file_lru(page) && PageDirty(page))
+ if (folio_is_file_lru(folio) && folio_test_dirty(folio))
goto out;
- isolated = numamigrate_isolate_page(pgdat, page);
+ isolated = numamigrate_isolate_folio(pgdat, folio);
if (!isolated)
goto out;
- list_add(&page->lru, &migratepages);
+ list_add(&folio->lru, &migratepages);
nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio,
NULL, node, MIGRATE_ASYNC,
MR_NUMA_MISPLACED, &nr_succeeded);
if (nr_remaining) {
if (!list_empty(&migratepages)) {
- list_del(&page->lru);
- mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_lru(page), -nr_pages);
- putback_lru_page(page);
+ list_del(&folio->lru);
+ node_stat_mod_folio(folio, NR_ISOLATED_ANON +
+ folio_is_file_lru(folio), -nr_pages);
+ folio_putback_lru(folio);
}
isolated = 0;
}
if (nr_succeeded) {
count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
- if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node))
+ if (!node_is_toptier(folio_nid(folio)) && node_is_toptier(node))
mod_node_page_state(pgdat, PGPROMOTE_SUCCESS,
nr_succeeded);
}
@@ -2575,7 +2597,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
return isolated;
out:
- put_page(page);
+ folio_put(folio);
return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
diff --git a/mm/mlock.c b/mm/mlock.c
index 06bdfab83b58..086546ac5766 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -305,6 +305,62 @@ void munlock_folio(struct folio *folio)
local_unlock(&mlock_fbatch.lock);
}
+static inline unsigned int folio_mlock_step(struct folio *folio,
+ pte_t *pte, unsigned long addr, unsigned long end)
+{
+ unsigned int count, i, nr = folio_nr_pages(folio);
+ unsigned long pfn = folio_pfn(folio);
+ pte_t ptent = ptep_get(pte);
+
+ if (!folio_test_large(folio))
+ return 1;
+
+ count = pfn + nr - pte_pfn(ptent);
+ count = min_t(unsigned int, count, (end - addr) >> PAGE_SHIFT);
+
+ for (i = 0; i < count; i++, pte++) {
+ pte_t entry = ptep_get(pte);
+
+ if (!pte_present(entry))
+ break;
+ if (pte_pfn(entry) - pfn >= nr)
+ break;
+ }
+
+ return i;
+}
+
+static inline bool allow_mlock_munlock(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, unsigned int step)
+{
+ /*
+ * For unlock, allow munlock large folio which is partially
+ * mapped to VMA. As it's possible that large folio is
+ * mlocked and VMA is split later.
+ *
+ * During memory pressure, such kind of large folio can
+ * be split. And the pages are not in VM_LOCKed VMA
+ * can be reclaimed.
+ */
+ if (!(vma->vm_flags & VM_LOCKED))
+ return true;
+
+ /* folio_within_range() cannot take KSM, but any small folio is OK */
+ if (!folio_test_large(folio))
+ return true;
+
+ /* folio not in range [start, end), skip mlock */
+ if (!folio_within_range(folio, vma, start, end))
+ return false;
+
+ /* folio is not fully mapped, skip mlock */
+ if (step != folio_nr_pages(folio))
+ return false;
+
+ return true;
+}
+
static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
@@ -314,6 +370,8 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
pte_t *start_pte, *pte;
pte_t ptent;
struct folio *folio;
+ unsigned int step = 1;
+ unsigned long start = addr;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -334,6 +392,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
walk->action = ACTION_AGAIN;
return 0;
}
+
for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
ptent = ptep_get(pte);
if (!pte_present(ptent))
@@ -341,12 +400,19 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
folio = vm_normal_folio(vma, addr, ptent);
if (!folio || folio_is_zone_device(folio))
continue;
- if (folio_test_large(folio))
- continue;
+
+ step = folio_mlock_step(folio, pte, addr, end);
+ if (!allow_mlock_munlock(folio, vma, start, end, step))
+ goto next_entry;
+
if (vma->vm_flags & VM_LOCKED)
mlock_folio(folio);
else
munlock_folio(folio);
+
+next_entry:
+ pte += step - 1;
+ addr += (step - 1) << PAGE_SHIFT;
}
pte_unmap(start_pte);
out:
@@ -414,7 +480,6 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long end, vm_flags_t newflags)
{
struct mm_struct *mm = vma->vm_mm;
- pgoff_t pgoff;
int nr_pages;
int ret = 0;
vm_flags_t oldflags = vma->vm_flags;
@@ -425,28 +490,12 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
goto out;
- pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- *prev = vma_merge(vmi, mm, *prev, start, end, newflags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, anon_vma_name(vma));
- if (*prev) {
- vma = *prev;
- goto success;
- }
-
- if (start != vma->vm_start) {
- ret = split_vma(vmi, vma, start, 1);
- if (ret)
- goto out;
- }
-
- if (end != vma->vm_end) {
- ret = split_vma(vmi, vma, end, 0);
- if (ret)
- goto out;
+ vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ goto out;
}
-success:
/*
* Keep track of amount of locked VM.
*/
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 50f2f34745af..077bfe393b5e 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -555,7 +555,7 @@ out:
node_states[N_MEMORY] = saved_node_state;
}
-static void __meminit __init_single_page(struct page *page, unsigned long pfn,
+void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid)
{
mm_zero_struct_page(page);
@@ -1871,8 +1871,6 @@ void __init free_area_init(unsigned long *max_zone_pfn)
pg_data_t *pgdat;
if (!node_online(nid)) {
- pr_info("Initializing node %d as memoryless\n", nid);
-
/* Allocator not initialized yet */
pgdat = arch_alloc_nodedata(nid);
if (!pgdat)
diff --git a/mm/mmap.c b/mm/mmap.c
index b56a7f0c9f85..1971bfffcc03 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -107,7 +107,7 @@ void vma_set_page_prot(struct vm_area_struct *vma)
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
struct file *file, struct address_space *mapping)
{
- if (vma->vm_flags & VM_SHARED)
+ if (vma_is_shared_maywrite(vma))
mapping_unmap_writable(mapping);
flush_dcache_mmap_lock(mapping);
@@ -384,7 +384,7 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm,
static void __vma_link_file(struct vm_area_struct *vma,
struct address_space *mapping)
{
- if (vma->vm_flags & VM_SHARED)
+ if (vma_is_shared_maywrite(vma))
mapping_allow_writable(mapping);
flush_dcache_mmap_lock(mapping);
@@ -583,11 +583,12 @@ again:
* dup_anon_vma() - Helper function to duplicate anon_vma
* @dst: The destination VMA
* @src: The source VMA
+ * @dup: Pointer to the destination VMA when successful.
*
* Returns: 0 on success.
*/
static inline int dup_anon_vma(struct vm_area_struct *dst,
- struct vm_area_struct *src)
+ struct vm_area_struct *src, struct vm_area_struct **dup)
{
/*
* Easily overlooked: when mprotect shifts the boundary, make sure the
@@ -595,9 +596,15 @@ static inline int dup_anon_vma(struct vm_area_struct *dst,
* anon pages imported.
*/
if (src->anon_vma && !dst->anon_vma) {
+ int ret;
+
vma_assert_write_locked(dst);
dst->anon_vma = src->anon_vma;
- return anon_vma_clone(dst, src);
+ ret = anon_vma_clone(dst, src);
+ if (ret)
+ return ret;
+
+ *dup = dst;
}
return 0;
@@ -624,6 +631,7 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long start, unsigned long end, pgoff_t pgoff,
struct vm_area_struct *next)
{
+ struct vm_area_struct *anon_dup = NULL;
bool remove_next = false;
struct vma_prepare vp;
@@ -633,7 +641,7 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
remove_next = true;
vma_start_write(next);
- ret = dup_anon_vma(vma, next);
+ ret = dup_anon_vma(vma, next, &anon_dup);
if (ret)
return ret;
}
@@ -661,6 +669,8 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
return 0;
nomem:
+ if (anon_dup)
+ unlink_anon_vmas(anon_dup);
return -ENOMEM;
}
@@ -850,16 +860,17 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
* **** is not represented - it will be merged and the vma containing the
* area is returned, or the function will return NULL
*/
-struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
- struct vm_area_struct *prev, unsigned long addr,
- unsigned long end, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file,
- pgoff_t pgoff, struct mempolicy *policy,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- struct anon_vma_name *anon_name)
+static struct vm_area_struct
+*vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
+ struct vm_area_struct *prev, unsigned long addr, unsigned long end,
+ unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file,
+ pgoff_t pgoff, struct mempolicy *policy,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ struct anon_vma_name *anon_name)
{
struct vm_area_struct *curr, *next, *res;
struct vm_area_struct *vma, *adjust, *remove, *remove2;
+ struct vm_area_struct *anon_dup = NULL;
struct vma_prepare vp;
pgoff_t vma_pgoff;
int err = 0;
@@ -927,18 +938,23 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
vma_start_write(next);
remove = next; /* case 1 */
vma_end = next->vm_end;
- err = dup_anon_vma(prev, next);
+ err = dup_anon_vma(prev, next, &anon_dup);
if (curr) { /* case 6 */
vma_start_write(curr);
remove = curr;
remove2 = next;
+ /*
+ * Note that the dup_anon_vma below cannot overwrite err
+ * since the first caller would do nothing unless next
+ * has an anon_vma.
+ */
if (!next->anon_vma)
- err = dup_anon_vma(prev, curr);
+ err = dup_anon_vma(prev, curr, &anon_dup);
}
} else if (merge_prev) { /* case 2 */
if (curr) {
vma_start_write(curr);
- err = dup_anon_vma(prev, curr);
+ err = dup_anon_vma(prev, curr, &anon_dup);
if (end == curr->vm_end) { /* case 7 */
remove = curr;
} else { /* case 5 */
@@ -954,7 +970,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
vma_end = addr;
adjust = next;
adj_start = -(prev->vm_end - addr);
- err = dup_anon_vma(next, prev);
+ err = dup_anon_vma(next, prev, &anon_dup);
} else {
/*
* Note that cases 3 and 8 are the ONLY ones where prev
@@ -968,14 +984,14 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
vma_pgoff = curr->vm_pgoff;
vma_start_write(curr);
remove = curr;
- err = dup_anon_vma(next, curr);
+ err = dup_anon_vma(next, curr, &anon_dup);
}
}
}
/* Error in anon_vma clone. */
if (err)
- return NULL;
+ goto anon_vma_fail;
if (vma_start < vma->vm_start || vma_end > vma->vm_end)
vma_expanded = true;
@@ -988,7 +1004,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
}
if (vma_iter_prealloc(vmi, vma))
- return NULL;
+ goto prealloc_fail;
init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
@@ -1016,6 +1032,15 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
vma_complete(&vp, vmi, mm);
khugepaged_enter_vma(res, vm_flags);
return res;
+
+prealloc_fail:
+ if (anon_dup)
+ unlink_anon_vmas(anon_dup);
+
+anon_vma_fail:
+ vma_iter_set(vmi, addr);
+ vma_iter_load(vmi);
+ return NULL;
}
/*
@@ -1198,7 +1223,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
* Does the application expect PROT_READ to imply PROT_EXEC?
*
* (the exception is when the underlying filesystem is noexec
- * mounted, in which case we dont add PROT_EXEC.)
+ * mounted, in which case we don't add PROT_EXEC.)
*/
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
if (!(file && path_noexec(&file->f_path)))
@@ -1924,9 +1949,9 @@ static int acct_stack_growth(struct vm_area_struct *vma,
return 0;
}
-#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
+#if defined(CONFIG_STACK_GROWSUP)
/*
- * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+ * PA-RISC uses this for its stack.
* vma is the last one with address > vma->vm_end. Have to extend vma.
*/
static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
@@ -2023,7 +2048,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
validate_mm(mm);
return error;
}
-#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
+#endif /* CONFIG_STACK_GROWSUP */
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
@@ -2159,8 +2184,6 @@ struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned lon
#else
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
{
- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
- return -EINVAL;
return expand_downwards(vma, address);
}
@@ -2323,8 +2346,8 @@ static void unmap_region(struct mm_struct *mm, struct ma_state *mas,
* has already been checked or doesn't make sense to fail.
* VMA Iterator will point to the end VMA.
*/
-int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long addr, int new_below)
+static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long addr, int new_below)
{
struct vma_prepare vp;
struct vm_area_struct *new;
@@ -2405,8 +2428,8 @@ out_free_vma:
* Split a vma into two pieces at address 'addr', a new vma is allocated
* either for the first part or the tail.
*/
-int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long addr, int new_below)
+static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long addr, int new_below)
{
if (vma->vm_mm->map_count >= sysctl_max_map_count)
return -ENOMEM;
@@ -2415,6 +2438,85 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
}
/*
+ * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd
+ * context and anonymous VMA name within the range [start, end).
+ *
+ * As a result, we might be able to merge the newly modified VMA range with an
+ * adjacent VMA with identical properties.
+ *
+ * If no merge is possible and the range does not span the entirety of the VMA,
+ * we then need to split the VMA to accommodate the change.
+ *
+ * The function returns either the merged VMA, the original VMA if a split was
+ * required instead, or an error if the split failed.
+ */
+struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long vm_flags,
+ struct mempolicy *policy,
+ struct vm_userfaultfd_ctx uffd_ctx,
+ struct anon_vma_name *anon_name)
+{
+ pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ struct vm_area_struct *merged;
+
+ merged = vma_merge(vmi, vma->vm_mm, prev, start, end, vm_flags,
+ vma->anon_vma, vma->vm_file, pgoff, policy,
+ uffd_ctx, anon_name);
+ if (merged)
+ return merged;
+
+ if (vma->vm_start < start) {
+ int err = split_vma(vmi, vma, start, 1);
+
+ if (err)
+ return ERR_PTR(err);
+ }
+
+ if (vma->vm_end > end) {
+ int err = split_vma(vmi, vma, end, 0);
+
+ if (err)
+ return ERR_PTR(err);
+ }
+
+ return vma;
+}
+
+/*
+ * Attempt to merge a newly mapped VMA with those adjacent to it. The caller
+ * must ensure that [start, end) does not overlap any existing VMA.
+ */
+static struct vm_area_struct
+*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev,
+ struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, pgoff_t pgoff)
+{
+ return vma_merge(vmi, vma->vm_mm, prev, start, end, vma->vm_flags,
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+}
+
+/*
+ * Expand vma by delta bytes, potentially merging with an immediately adjacent
+ * VMA with identical properties.
+ */
+struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
+ struct vm_area_struct *vma,
+ unsigned long delta)
+{
+ pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma);
+
+ /* vma is specified as prev, so case 1 or 2 will apply. */
+ return vma_merge(vmi, vma->vm_mm, vma, vma->vm_end, vma->vm_end + delta,
+ vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff,
+ vma_policy(vma), vma->vm_userfaultfd_ctx,
+ anon_vma_name(vma));
+}
+
+/*
* do_vmi_align_munmap() - munmap the aligned region from @start to @end.
* @vmi: The vma iterator
* @vma: The starting vm_area_struct
@@ -2650,6 +2752,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long charged = 0;
unsigned long end = addr + len;
unsigned long merge_start = addr, merge_end = end;
+ bool writable_file_mapping = false;
pgoff_t vm_pgoff;
int error;
VMA_ITERATOR(vmi, mm, addr);
@@ -2744,17 +2847,19 @@ cannot_expand:
vma->vm_pgoff = pgoff;
if (file) {
- if (vm_flags & VM_SHARED) {
- error = mapping_map_writable(file->f_mapping);
- if (error)
- goto free_vma;
- }
-
vma->vm_file = get_file(file);
error = call_mmap(file, vma);
if (error)
goto unmap_and_free_vma;
+ if (vma_is_shared_maywrite(vma)) {
+ error = mapping_map_writable(file->f_mapping);
+ if (error)
+ goto close_and_free_vma;
+
+ writable_file_mapping = true;
+ }
+
/*
* Expansion is handled above, merging is handled below.
* Drivers should not alter the address of the VMA.
@@ -2769,10 +2874,9 @@ cannot_expand:
* vma again as we may succeed this time.
*/
if (unlikely(vm_flags != vma->vm_flags && prev)) {
- merge = vma_merge(&vmi, mm, prev, vma->vm_start,
- vma->vm_end, vma->vm_flags, NULL,
- vma->vm_file, vma->vm_pgoff, NULL,
- NULL_VM_UFFD_CTX, NULL);
+ merge = vma_merge_new_vma(&vmi, prev, vma,
+ vma->vm_start, vma->vm_end,
+ vma->vm_pgoff);
if (merge) {
/*
* ->mmap() can change vma->vm_file and fput
@@ -2819,7 +2923,7 @@ cannot_expand:
mm->map_count++;
if (vma->vm_file) {
i_mmap_lock_write(vma->vm_file->f_mapping);
- if (vma->vm_flags & VM_SHARED)
+ if (vma_is_shared_maywrite(vma))
mapping_allow_writable(vma->vm_file->f_mapping);
flush_dcache_mmap_lock(vma->vm_file->f_mapping);
@@ -2836,7 +2940,7 @@ cannot_expand:
/* Once vma denies write, undo our temporary denial count */
unmap_writable:
- if (file && vm_flags & VM_SHARED)
+ if (writable_file_mapping)
mapping_unmap_writable(file->f_mapping);
file = vma->vm_file;
ksm_add_vma(vma);
@@ -2884,7 +2988,7 @@ unmap_and_free_vma:
unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
vma->vm_end, vma->vm_end, true);
}
- if (file && (vm_flags & VM_SHARED))
+ if (writable_file_mapping)
mapping_unmap_writable(file->f_mapping);
free_vma:
vm_area_free(vma);
@@ -3143,13 +3247,13 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
if (!len)
return 0;
- if (mmap_write_lock_killable(mm))
- return -EINTR;
-
/* Until we need other flags, refuse anything except VM_EXEC. */
if ((flags & (~VM_EXEC)) != 0)
return -EINVAL;
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+
ret = check_brk_limits(addr, len);
if (ret)
goto limits_failed;
@@ -3174,12 +3278,6 @@ limits_failed:
}
EXPORT_SYMBOL(vm_brk_flags);
-int vm_brk(unsigned long addr, unsigned long len)
-{
- return vm_brk_flags(addr, len, 0);
-}
-EXPORT_SYMBOL(vm_brk);
-
/* Release all mmaps. */
void exit_mmap(struct mm_struct *mm)
{
@@ -3278,7 +3376,8 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
}
if (vma_link(mm, vma)) {
- vm_unacct_memory(charged);
+ if (vma->vm_flags & VM_ACCOUNT)
+ vm_unacct_memory(charged);
return -ENOMEM;
}
@@ -3313,9 +3412,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
if (new_vma && new_vma->vm_start < addr + len)
return NULL; /* should never get here */
- new_vma = vma_merge(&vmi, mm, prev, addr, addr + len, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+ new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff);
if (new_vma) {
/*
* Source vma may have been merged into new_vma
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 68e1511be12d..b594d3f268fe 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -93,19 +93,19 @@ void lruvec_init(struct lruvec *lruvec)
}
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
-int page_cpupid_xchg_last(struct page *page, int cpupid)
+int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
unsigned long old_flags, flags;
int last_cpupid;
- old_flags = READ_ONCE(page->flags);
+ old_flags = READ_ONCE(folio->flags);
do {
flags = old_flags;
last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
- } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
+ } while (unlikely(!try_cmpxchg(&folio->flags, &old_flags, flags)));
return last_cpupid;
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index b94fbb45d5c7..81991102f785 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -114,7 +114,7 @@ static long change_pte_range(struct mmu_gather *tlb,
* pages. See similar comment in change_huge_pmd.
*/
if (prot_numa) {
- struct page *page;
+ struct folio *folio;
int nid;
bool toptier;
@@ -122,13 +122,14 @@ static long change_pte_range(struct mmu_gather *tlb,
if (pte_protnone(oldpte))
continue;
- page = vm_normal_page(vma, addr, oldpte);
- if (!page || is_zone_device_page(page) || PageKsm(page))
+ folio = vm_normal_folio(vma, addr, oldpte);
+ if (!folio || folio_is_zone_device(folio) ||
+ folio_test_ksm(folio))
continue;
/* Also skip shared copy-on-write pages */
if (is_cow_mapping(vma->vm_flags) &&
- page_count(page) != 1)
+ folio_ref_count(folio) != 1)
continue;
/*
@@ -136,14 +137,15 @@ static long change_pte_range(struct mmu_gather *tlb,
* it cannot move them all from MIGRATE_ASYNC
* context.
*/
- if (page_is_file_lru(page) && PageDirty(page))
+ if (folio_is_file_lru(folio) &&
+ folio_test_dirty(folio))
continue;
/*
* Don't mess with PTEs if page is already on the node
* a single-threaded process is running on.
*/
- nid = page_to_nid(page);
+ nid = folio_nid(folio);
if (target_node == nid)
continue;
toptier = node_is_toptier(nid);
@@ -157,7 +159,7 @@ static long change_pte_range(struct mmu_gather *tlb,
continue;
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
!toptier)
- xchg_page_access_time(page,
+ folio_xchg_access_time(folio,
jiffies_to_msecs(jiffies));
}
@@ -581,7 +583,6 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
long nrpages = (end - start) >> PAGE_SHIFT;
unsigned int mm_cp_flags = 0;
unsigned long charged = 0;
- pgoff_t pgoff;
int error;
if (newflags == oldflags) {
@@ -608,8 +609,11 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
/*
* If we make a private mapping writable we increase our commit;
* but (without finer accounting) cannot reduce our commit if we
- * make it unwritable again. hugetlb mapping were accounted for
- * even if read-only so there is no need to account for them here
+ * make it unwritable again except in the anonymous case where no
+ * anon_vma has yet to be assigned.
+ *
+ * hugetlb mapping were accounted for even if read-only so there is
+ * no need to account for them here.
*/
if (newflags & VM_WRITE) {
/* Check space limits when area turns into data. */
@@ -623,36 +627,19 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
return -ENOMEM;
newflags |= VM_ACCOUNT;
}
+ } else if ((oldflags & VM_ACCOUNT) && vma_is_anonymous(vma) &&
+ !vma->anon_vma) {
+ newflags &= ~VM_ACCOUNT;
}
- /*
- * First try to merge with previous and/or next vma.
- */
- pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- *pprev = vma_merge(vmi, mm, *pprev, start, end, newflags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, anon_vma_name(vma));
- if (*pprev) {
- vma = *pprev;
- VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
- goto success;
+ vma = vma_modify_flags(vmi, *pprev, vma, start, end, newflags);
+ if (IS_ERR(vma)) {
+ error = PTR_ERR(vma);
+ goto fail;
}
*pprev = vma;
- if (start != vma->vm_start) {
- error = split_vma(vmi, vma, start, 1);
- if (error)
- goto fail;
- }
-
- if (end != vma->vm_end) {
- error = split_vma(vmi, vma, end, 0);
- if (error)
- goto fail;
- }
-
-success:
/*
* vm_flags and vm_page_prot are protected by the mmap_lock
* held in write mode.
@@ -665,6 +652,9 @@ success:
change_protection(tlb, vma, start, end, mm_cp_flags);
+ if ((oldflags & VM_ACCOUNT) && !(newflags & VM_ACCOUNT))
+ vm_unacct_memory(nrpages);
+
/*
* Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
* fault on access.
diff --git a/mm/mremap.c b/mm/mremap.c
index 382e81c33fc4..38d98465f3d8 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -489,10 +489,62 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
return moved;
}
+/*
+ * A helper to check if aligning down is OK. The aligned address should fall
+ * on *no mapping*. For the stack moving down, that's a special move within
+ * the VMA that is created to span the source and destination of the move,
+ * so we make an exception for it.
+ */
+static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align,
+ unsigned long mask, bool for_stack)
+{
+ unsigned long addr_masked = addr_to_align & mask;
+
+ /*
+ * If @addr_to_align of either source or destination is not the beginning
+ * of the corresponding VMA, we can't align down or we will destroy part
+ * of the current mapping.
+ */
+ if (!for_stack && vma->vm_start != addr_to_align)
+ return false;
+
+ /* In the stack case we explicitly permit in-VMA alignment. */
+ if (for_stack && addr_masked >= vma->vm_start)
+ return true;
+
+ /*
+ * Make sure the realignment doesn't cause the address to fall on an
+ * existing mapping.
+ */
+ return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL;
+}
+
+/* Opportunistically realign to specified boundary for faster copy. */
+static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma,
+ unsigned long *new_addr, struct vm_area_struct *new_vma,
+ unsigned long mask, bool for_stack)
+{
+ /* Skip if the addresses are already aligned. */
+ if ((*old_addr & ~mask) == 0)
+ return;
+
+ /* Only realign if the new and old addresses are mutually aligned. */
+ if ((*old_addr & ~mask) != (*new_addr & ~mask))
+ return;
+
+ /* Ensure realignment doesn't cause overlap with existing mappings. */
+ if (!can_align_down(old_vma, *old_addr, mask, for_stack) ||
+ !can_align_down(new_vma, *new_addr, mask, for_stack))
+ return;
+
+ *old_addr = *old_addr & mask;
+ *new_addr = *new_addr & mask;
+}
+
unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len,
- bool need_rmap_locks)
+ bool need_rmap_locks, bool for_stack)
{
unsigned long extent, old_end;
struct mmu_notifier_range range;
@@ -508,6 +560,14 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
return move_hugetlb_page_tables(vma, new_vma, old_addr,
new_addr, len);
+ /*
+ * If possible, realign addresses to PMD boundary for faster copy.
+ * Only realign if the mremap copying hits a PMD boundary.
+ */
+ if (len >= PMD_SIZE - (old_addr & ~PMD_MASK))
+ try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK,
+ for_stack);
+
flush_cache_range(vma, old_addr, old_end);
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
old_addr, old_end);
@@ -577,6 +637,13 @@ again:
mmu_notifier_invalidate_range_end(&range);
+ /*
+ * Prevent negative return values when {old,new}_addr was realigned
+ * but we broke out of the above loop for the first PMD itself.
+ */
+ if (len + old_addr < old_end)
+ return 0;
+
return len + old_addr - old_end; /* how much done */
}
@@ -646,7 +713,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
}
moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
- need_rmap_locks);
+ need_rmap_locks, false);
if (moved_len < old_len) {
err = -ENOMEM;
} else if (vma->vm_ops && vma->vm_ops->mremap) {
@@ -660,7 +727,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
* and then proceed to unmap new area instead of old.
*/
move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
- true);
+ true, false);
vma = new_vma;
old_len = new_len;
old_addr = new_addr;
@@ -1029,36 +1096,34 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/* old_len exactly to the end of the area..
*/
if (old_len == vma->vm_end - addr) {
+ unsigned long delta = new_len - old_len;
+
/* can we just expand the current mapping? */
- if (vma_expandable(vma, new_len - old_len)) {
- long pages = (new_len - old_len) >> PAGE_SHIFT;
- unsigned long extension_start = addr + old_len;
- unsigned long extension_end = addr + new_len;
- pgoff_t extension_pgoff = vma->vm_pgoff +
- ((extension_start - vma->vm_start) >> PAGE_SHIFT);
- VMA_ITERATOR(vmi, mm, extension_start);
+ if (vma_expandable(vma, delta)) {
+ long pages = delta >> PAGE_SHIFT;
+ VMA_ITERATOR(vmi, mm, vma->vm_end);
+ long charged = 0;
if (vma->vm_flags & VM_ACCOUNT) {
if (security_vm_enough_memory_mm(mm, pages)) {
ret = -ENOMEM;
goto out;
}
+ charged = pages;
}
/*
- * Function vma_merge() is called on the extension we
- * are adding to the already existing vma, vma_merge()
- * will merge this extension with the already existing
- * vma (expand operation itself) and possibly also with
- * the next vma if it becomes adjacent to the expanded
- * vma and otherwise compatible.
+ * Function vma_merge_extend() is called on the
+ * extension we are adding to the already existing vma,
+ * vma_merge_extend() will merge this extension with the
+ * already existing vma (expand operation itself) and
+ * possibly also with the next vma if it becomes
+ * adjacent to the expanded vma and otherwise
+ * compatible.
*/
- vma = vma_merge(&vmi, mm, vma, extension_start,
- extension_end, vma->vm_flags, vma->anon_vma,
- vma->vm_file, extension_pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+ vma = vma_merge_extend(&vmi, vma, delta);
if (!vma) {
- vm_unacct_memory(pages);
+ vm_unacct_memory(charged);
ret = -ENOMEM;
goto out;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 7f9e9e5a0e12..b6dc558d3144 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1305,8 +1305,8 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
* split a vma into two pieces at address 'addr', a new vma is allocated either
* for the first part or the tail.
*/
-int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long addr, int new_below)
+static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long addr, int new_below)
{
struct vm_area_struct *new;
struct vm_region *region;
@@ -1531,11 +1531,6 @@ void exit_mmap(struct mm_struct *mm)
mmap_write_unlock(mm);
}
-int vm_brk(unsigned long addr, unsigned long len)
-{
- return -ENOMEM;
-}
-
/*
* expand (or shrink) an existing mapping, potentially moving it at the same
* time (controlled by the MREMAP_MAYMOVE flag and available VM space)
@@ -1651,8 +1646,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
}
EXPORT_SYMBOL(filemap_map_pages);
-int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
- int len, unsigned int gup_flags)
+static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
{
struct vm_area_struct *vma;
int write = gup_flags & FOLL_WRITE;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 44bde56ecd02..9e6071fde34a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -437,7 +437,7 @@ static void dump_tasks(struct oom_control *oc)
}
}
-static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
+static void dump_oom_victim(struct oom_control *oc, struct task_struct *victim)
{
/* one line summary of the oom killer context. */
pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
@@ -449,7 +449,7 @@ static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
from_kuid(&init_user_ns, task_uid(victim)));
}
-static void dump_header(struct oom_control *oc, struct task_struct *p)
+static void dump_header(struct oom_control *oc)
{
pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
@@ -467,8 +467,6 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
}
if (sysctl_oom_dump_tasks)
dump_tasks(oc);
- if (p)
- dump_oom_summary(oc, p);
}
/*
@@ -1029,8 +1027,10 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
}
task_unlock(victim);
- if (__ratelimit(&oom_rs))
- dump_header(oc, victim);
+ if (__ratelimit(&oom_rs)) {
+ dump_header(oc);
+ dump_oom_victim(oc, victim);
+ }
/*
* Do we need to kill the entire memory cgroup?
@@ -1072,7 +1072,7 @@ static void check_panic_on_oom(struct oom_control *oc)
/* Do not panic for oom kills triggered by sysrq */
if (is_sysrq_oom(oc))
return;
- dump_header(oc, NULL);
+ dump_header(oc);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
@@ -1155,7 +1155,7 @@ bool out_of_memory(struct oom_control *oc)
select_bad_process(oc);
/* Found nothing?!?! */
if (!oc->chosen) {
- dump_header(oc, NULL);
+ dump_header(oc);
pr_warn("Out of memory and no killable processes...\n");
/*
* If we got here due to an actual allocation at the
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b8d3d7040a50..46f2f5d3d183 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2679,7 +2679,7 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
* @folio: Folio to be marked as dirty.
*
* Filesystems which do not use buffer heads should call this function
- * from their set_page_dirty address space operation. It ignores the
+ * from their dirty_folio address space operation. It ignores the
* contents of folio_get_private(), so if the filesystem marks individual
* blocks as dirty, the filesystem should handle that itself.
*
@@ -2953,19 +2953,16 @@ bool __folio_end_writeback(struct folio *folio)
unsigned long flags;
xa_lock_irqsave(&mapping->i_pages, flags);
- ret = folio_test_clear_writeback(folio);
- if (ret) {
- __xa_clear_mark(&mapping->i_pages, folio_index(folio),
- PAGECACHE_TAG_WRITEBACK);
- if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
- struct bdi_writeback *wb = inode_to_wb(inode);
-
- wb_stat_mod(wb, WB_WRITEBACK, -nr);
- __wb_writeout_add(wb, nr);
- if (!mapping_tagged(mapping,
- PAGECACHE_TAG_WRITEBACK))
- wb_inode_writeback_end(wb);
- }
+ ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback);
+ __xa_clear_mark(&mapping->i_pages, folio_index(folio),
+ PAGECACHE_TAG_WRITEBACK);
+ if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
+ struct bdi_writeback *wb = inode_to_wb(inode);
+
+ wb_stat_mod(wb, WB_WRITEBACK, -nr);
+ __wb_writeout_add(wb, nr);
+ if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
+ wb_inode_writeback_end(wb);
}
if (mapping->host && !mapping_tagged(mapping,
@@ -2974,14 +2971,14 @@ bool __folio_end_writeback(struct folio *folio)
xa_unlock_irqrestore(&mapping->i_pages, flags);
} else {
- ret = folio_test_clear_writeback(folio);
- }
- if (ret) {
- lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr);
- zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
- node_stat_mod_folio(folio, NR_WRITTEN, nr);
+ ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback);
}
+
+ lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr);
+ zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
+ node_stat_mod_folio(folio, NR_WRITTEN, nr);
folio_memcg_unlock(folio);
+
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 95546f376302..733732e7e0ba 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -52,6 +52,7 @@
#include <linux/psi.h>
#include <linux/khugepaged.h>
#include <linux/delayacct.h>
+#include <linux/cacheinfo.h>
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"
@@ -1078,6 +1079,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
int bad = 0;
bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
bool init = want_init_on_free();
+ bool compound = PageCompound(page);
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -1096,16 +1098,15 @@ static __always_inline bool free_pages_prepare(struct page *page,
return false;
}
+ VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
+
/*
* Check tail pages before head page information is cleared to
* avoid checking PageCompound for order-0 pages.
*/
if (unlikely(order)) {
- bool compound = PageCompound(page);
int i;
- VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
-
if (compound)
page[1].flags &= ~PAGE_FLAGS_SECOND;
for (i = 1; i < (1 << order); i++) {
@@ -2156,6 +2157,40 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
return i;
}
+/*
+ * Called from the vmstat counter updater to decay the PCP high.
+ * Return whether there are addition works to do.
+ */
+int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
+{
+ int high_min, to_drain, batch;
+ int todo = 0;
+
+ high_min = READ_ONCE(pcp->high_min);
+ batch = READ_ONCE(pcp->batch);
+ /*
+ * Decrease pcp->high periodically to try to free possible
+ * idle PCP pages. And, avoid to free too many pages to
+ * control latency. This caps pcp->high decrement too.
+ */
+ if (pcp->high > high_min) {
+ pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
+ pcp->high - (pcp->high >> 3), high_min);
+ if (pcp->high > high_min)
+ todo++;
+ }
+
+ to_drain = pcp->count - pcp->high;
+ if (to_drain > 0) {
+ spin_lock(&pcp->lock);
+ free_pcppages_bulk(zone, to_drain, pcp, 0);
+ spin_unlock(&pcp->lock);
+ todo++;
+ }
+
+ return todo;
+}
+
#ifdef CONFIG_NUMA
/*
* Called from the vmstat counter updater to drain pagesets of this
@@ -2317,14 +2352,13 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
return true;
}
-static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high)
+static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high)
{
int min_nr_free, max_nr_free;
- int batch = READ_ONCE(pcp->batch);
- /* Free everything if batch freeing high-order pages. */
+ /* Free as much as possible if batch freeing high-order pages. */
if (unlikely(free_high))
- return pcp->count;
+ return min(pcp->count, batch << CONFIG_PCP_BATCH_SCALE_MAX);
/* Check for PCP disabled or boot pageset */
if (unlikely(high < batch))
@@ -2335,59 +2369,107 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high)
max_nr_free = high - batch;
/*
- * Double the number of pages freed each time there is subsequent
- * freeing of pages without any allocation.
+ * Increase the batch number to the number of the consecutive
+ * freed pages to reduce zone lock contention.
*/
- batch <<= pcp->free_factor;
- if (batch < max_nr_free)
- pcp->free_factor++;
- batch = clamp(batch, min_nr_free, max_nr_free);
+ batch = clamp_t(int, pcp->free_count, min_nr_free, max_nr_free);
return batch;
}
static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
- bool free_high)
+ int batch, bool free_high)
{
- int high = READ_ONCE(pcp->high);
+ int high, high_min, high_max;
+
+ high_min = READ_ONCE(pcp->high_min);
+ high_max = READ_ONCE(pcp->high_max);
+ high = pcp->high = clamp(pcp->high, high_min, high_max);
- if (unlikely(!high || free_high))
+ if (unlikely(!high))
return 0;
- if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
- return high;
+ if (unlikely(free_high)) {
+ pcp->high = max(high - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
+ high_min);
+ return 0;
+ }
/*
* If reclaim is active, limit the number of pages that can be
* stored on pcp lists
*/
- return min(READ_ONCE(pcp->batch) << 2, high);
+ if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) {
+ int free_count = max_t(int, pcp->free_count, batch);
+
+ pcp->high = max(high - free_count, high_min);
+ return min(batch << 2, pcp->high);
+ }
+
+ if (high_min == high_max)
+ return high;
+
+ if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) {
+ int free_count = max_t(int, pcp->free_count, batch);
+
+ pcp->high = max(high - free_count, high_min);
+ high = max(pcp->count, high_min);
+ } else if (pcp->count >= high) {
+ int need_high = pcp->free_count + batch;
+
+ /* pcp->high should be large enough to hold batch freed pages */
+ if (pcp->high < need_high)
+ pcp->high = clamp(need_high, high_min, high_max);
+ }
+
+ return high;
}
static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
struct page *page, int migratetype,
unsigned int order)
{
- int high;
+ int high, batch;
int pindex;
- bool free_high;
+ bool free_high = false;
+ /*
+ * On freeing, reduce the number of pages that are batch allocated.
+ * See nr_pcp_alloc() where alloc_factor is increased for subsequent
+ * allocations.
+ */
+ pcp->alloc_factor >>= 1;
__count_vm_events(PGFREE, 1 << order);
pindex = order_to_pindex(migratetype, order);
list_add(&page->pcp_list, &pcp->lists[pindex]);
pcp->count += 1 << order;
+ batch = READ_ONCE(pcp->batch);
/*
* As high-order pages other than THP's stored on PCP can contribute
* to fragmentation, limit the number stored when PCP is heavily
* freeing without allocation. The remainder after bulk freeing
* stops will be drained from vmstat refresh context.
*/
- free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER);
-
- high = nr_pcp_high(pcp, zone, free_high);
+ if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
+ free_high = (pcp->free_count >= batch &&
+ (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
+ (!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
+ pcp->count >= READ_ONCE(batch)));
+ pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
+ } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
+ pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
+ }
+ if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
+ pcp->free_count += (1 << order);
+ high = nr_pcp_high(pcp, zone, batch, free_high);
if (pcp->count >= high) {
- free_pcppages_bulk(zone, nr_pcp_free(pcp, high, free_high), pcp, pindex);
+ free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
+ pcp, pindex);
+ if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
+ zone_watermark_ok(zone, 0, high_wmark_pages(zone),
+ ZONE_MOVABLE, 0))
+ clear_bit(ZONE_BELOW_HIGH, &zone->flags);
}
}
@@ -2671,6 +2753,56 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
return page;
}
+static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order)
+{
+ int high, base_batch, batch, max_nr_alloc;
+ int high_max, high_min;
+
+ base_batch = READ_ONCE(pcp->batch);
+ high_min = READ_ONCE(pcp->high_min);
+ high_max = READ_ONCE(pcp->high_max);
+ high = pcp->high = clamp(pcp->high, high_min, high_max);
+
+ /* Check for PCP disabled or boot pageset */
+ if (unlikely(high < base_batch))
+ return 1;
+
+ if (order)
+ batch = base_batch;
+ else
+ batch = (base_batch << pcp->alloc_factor);
+
+ /*
+ * If we had larger pcp->high, we could avoid to allocate from
+ * zone.
+ */
+ if (high_min != high_max && !test_bit(ZONE_BELOW_HIGH, &zone->flags))
+ high = pcp->high = min(high + batch, high_max);
+
+ if (!order) {
+ max_nr_alloc = max(high - pcp->count - base_batch, base_batch);
+ /*
+ * Double the number of pages allocated each time there is
+ * subsequent allocation of order-0 pages without any freeing.
+ */
+ if (batch <= max_nr_alloc &&
+ pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX)
+ pcp->alloc_factor++;
+ batch = min(batch, max_nr_alloc);
+ }
+
+ /*
+ * Scale batch relative to order if batch implies free pages
+ * can be stored on the PCP. Batch can be 1 for small zones or
+ * for boot pagesets which should never store free pages as
+ * the pages may belong to arbitrary zones.
+ */
+ if (batch > 1)
+ batch = max(batch >> order, 2);
+
+ return batch;
+}
+
/* Remove page from the per-cpu list, caller must protect the list */
static inline
struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
@@ -2683,18 +2815,9 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
do {
if (list_empty(list)) {
- int batch = READ_ONCE(pcp->batch);
+ int batch = nr_pcp_alloc(pcp, zone, order);
int alloced;
- /*
- * Scale batch relative to order if batch implies
- * free pages can be stored on the PCP. Batch can
- * be 1 for small zones or for boot pagesets which
- * should never store free pages as the pages may
- * belong to arbitrary zones.
- */
- if (batch > 1)
- batch = max(batch >> order, 2);
alloced = rmqueue_bulk(zone, order,
batch, list,
migratetype, alloc_flags);
@@ -2735,7 +2858,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
* See nr_pcp_free() where free_factor is increased for subsequent
* frees.
*/
- pcp->free_factor >>= 1;
+ pcp->free_count >>= 1;
list = &pcp->lists[order_to_pindex(migratetype, order)];
page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
pcp_spin_unlock(pcp);
@@ -3115,6 +3238,25 @@ retry:
}
}
+ /*
+ * Detect whether the number of free pages is below high
+ * watermark. If so, we will decrease pcp->high and free
+ * PCP pages in free path to reduce the possibility of
+ * premature page reclaiming. Detection is done here to
+ * avoid to do that in hotter free path.
+ */
+ if (test_bit(ZONE_BELOW_HIGH, &zone->flags))
+ goto check_alloc_wmark;
+
+ mark = high_wmark_pages(zone);
+ if (zone_watermark_fast(zone, order, mark,
+ ac->highest_zoneidx, alloc_flags,
+ gfp_mask))
+ goto try_this_zone;
+ else
+ set_bit(ZONE_BELOW_HIGH, &zone->flags);
+
+check_alloc_wmark:
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
if (!zone_watermark_fast(zone, order, mark,
ac->highest_zoneidx, alloc_flags,
@@ -4456,12 +4598,8 @@ struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
nodemask_t *nodemask)
{
struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
- preferred_nid, nodemask);
- struct folio *folio = (struct folio *)page;
-
- if (folio && order > 1)
- folio_prep_large_rmappable(folio);
- return folio;
+ preferred_nid, nodemask);
+ return page_rmappable_folio(page);
}
EXPORT_SYMBOL(__folio_alloc);
@@ -4878,8 +5016,11 @@ int find_next_best_node(int node, nodemask_t *used_node_mask)
int min_val = INT_MAX;
int best_node = NUMA_NO_NODE;
- /* Use the local node if we haven't already */
- if (!node_isset(node, *used_node_mask)) {
+ /*
+ * Use the local node if we haven't already, but for memoryless local
+ * node, we should skip it and fall back to other nodes.
+ */
+ if (!node_isset(node, *used_node_mask) && node_state(node, N_MEMORY)) {
node_set(node, *used_node_mask);
return node;
}
@@ -5255,14 +5396,15 @@ static int zone_batchsize(struct zone *zone)
}
static int percpu_pagelist_high_fraction;
-static int zone_highsize(struct zone *zone, int batch, int cpu_online)
+static int zone_highsize(struct zone *zone, int batch, int cpu_online,
+ int high_fraction)
{
#ifdef CONFIG_MMU
int high;
int nr_split_cpus;
unsigned long total_pages;
- if (!percpu_pagelist_high_fraction) {
+ if (!high_fraction) {
/*
* By default, the high value of the pcp is based on the zone
* low watermark so that if they are full then background
@@ -5275,15 +5417,15 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online)
* value is based on a fraction of the managed pages in the
* zone.
*/
- total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
+ total_pages = zone_managed_pages(zone) / high_fraction;
}
/*
* Split the high value across all online CPUs local to the zone. Note
* that early in boot that CPUs may not be online yet and that during
* CPU hotplug that the cpumask is not yet updated when a CPU is being
- * onlined. For memory nodes that have no CPUs, split pcp->high across
- * all online CPUs to mitigate the risk that reclaim is triggered
+ * onlined. For memory nodes that have no CPUs, split the high value
+ * across all online CPUs to mitigate the risk that reclaim is triggered
* prematurely due to pages stored on pcp lists.
*/
nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
@@ -5311,19 +5453,21 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online)
* However, guaranteeing these relations at all times would require e.g. write
* barriers here but also careful usage of read barriers at the read side, and
* thus be prone to error and bad for performance. Thus the update only prevents
- * store tearing. Any new users of pcp->batch and pcp->high should ensure they
- * can cope with those fields changing asynchronously, and fully trust only the
- * pcp->count field on the local CPU with interrupts disabled.
+ * store tearing. Any new users of pcp->batch, pcp->high_min and pcp->high_max
+ * should ensure they can cope with those fields changing asynchronously, and
+ * fully trust only the pcp->count field on the local CPU with interrupts
+ * disabled.
*
* mutex_is_locked(&pcp_batch_high_lock) required when calling this function
* outside of boot time (or some other assurance that no concurrent updaters
* exist).
*/
-static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
- unsigned long batch)
+static void pageset_update(struct per_cpu_pages *pcp, unsigned long high_min,
+ unsigned long high_max, unsigned long batch)
{
WRITE_ONCE(pcp->batch, batch);
- WRITE_ONCE(pcp->high, high);
+ WRITE_ONCE(pcp->high_min, high_min);
+ WRITE_ONCE(pcp->high_max, high_max);
}
static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
@@ -5343,20 +5487,21 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
* need to be as careful as pageset_update() as nobody can access the
* pageset yet.
*/
- pcp->high = BOOT_PAGESET_HIGH;
+ pcp->high_min = BOOT_PAGESET_HIGH;
+ pcp->high_max = BOOT_PAGESET_HIGH;
pcp->batch = BOOT_PAGESET_BATCH;
- pcp->free_factor = 0;
+ pcp->free_count = 0;
}
-static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
- unsigned long batch)
+static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min,
+ unsigned long high_max, unsigned long batch)
{
struct per_cpu_pages *pcp;
int cpu;
for_each_possible_cpu(cpu) {
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
- pageset_update(pcp, high, batch);
+ pageset_update(pcp, high_min, high_max, batch);
}
}
@@ -5366,19 +5511,34 @@ static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long h
*/
static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
{
- int new_high, new_batch;
+ int new_high_min, new_high_max, new_batch;
new_batch = max(1, zone_batchsize(zone));
- new_high = zone_highsize(zone, new_batch, cpu_online);
+ if (percpu_pagelist_high_fraction) {
+ new_high_min = zone_highsize(zone, new_batch, cpu_online,
+ percpu_pagelist_high_fraction);
+ /*
+ * PCP high is tuned manually, disable auto-tuning via
+ * setting high_min and high_max to the manual value.
+ */
+ new_high_max = new_high_min;
+ } else {
+ new_high_min = zone_highsize(zone, new_batch, cpu_online, 0);
+ new_high_max = zone_highsize(zone, new_batch, cpu_online,
+ MIN_PERCPU_PAGELIST_HIGH_FRACTION);
+ }
- if (zone->pageset_high == new_high &&
+ if (zone->pageset_high_min == new_high_min &&
+ zone->pageset_high_max == new_high_max &&
zone->pageset_batch == new_batch)
return;
- zone->pageset_high = new_high;
+ zone->pageset_high_min = new_high_min;
+ zone->pageset_high_max = new_high_max;
zone->pageset_batch = new_batch;
- __zone_set_pageset_high_and_batch(zone, new_high, new_batch);
+ __zone_set_pageset_high_and_batch(zone, new_high_min, new_high_max,
+ new_batch);
}
void __meminit setup_zone_pageset(struct zone *zone)
@@ -5413,6 +5573,39 @@ static void zone_pcp_update(struct zone *zone, int cpu_online)
mutex_unlock(&pcp_batch_high_lock);
}
+static void zone_pcp_update_cacheinfo(struct zone *zone)
+{
+ int cpu;
+ struct per_cpu_pages *pcp;
+ struct cpu_cacheinfo *cci;
+
+ for_each_online_cpu(cpu) {
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ cci = get_cpu_cacheinfo(cpu);
+ /*
+ * If data cache slice of CPU is large enough, "pcp->batch"
+ * pages can be preserved in PCP before draining PCP for
+ * consecutive high-order pages freeing without allocation.
+ * This can reduce zone lock contention without hurting
+ * cache-hot pages sharing.
+ */
+ spin_lock(&pcp->lock);
+ if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
+ pcp->flags |= PCPF_FREE_HIGH_BATCH;
+ else
+ pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
+ spin_unlock(&pcp->lock);
+ }
+}
+
+void setup_pcp_cacheinfo(void)
+{
+ struct zone *zone;
+
+ for_each_populated_zone(zone)
+ zone_pcp_update_cacheinfo(zone);
+}
+
/*
* Allocate per cpu pagesets and initialize them.
* Before this call only boot pagesets were available.
@@ -5454,7 +5647,8 @@ __meminit void zone_pcp_init(struct zone *zone)
*/
zone->per_cpu_pageset = &boot_pageset;
zone->per_cpu_zonestats = &boot_zonestats;
- zone->pageset_high = BOOT_PAGESET_HIGH;
+ zone->pageset_high_min = BOOT_PAGESET_HIGH;
+ zone->pageset_high_max = BOOT_PAGESET_HIGH;
zone->pageset_batch = BOOT_PAGESET_BATCH;
if (populated_zone(zone))
@@ -6356,13 +6550,14 @@ EXPORT_SYMBOL(free_contig_range);
void zone_pcp_disable(struct zone *zone)
{
mutex_lock(&pcp_batch_high_lock);
- __zone_set_pageset_high_and_batch(zone, 0, 1);
+ __zone_set_pageset_high_and_batch(zone, 0, 0, 1);
__drain_all_pages(zone, true);
}
void zone_pcp_enable(struct zone *zone)
{
- __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
+ __zone_set_pageset_high_and_batch(zone, zone->pageset_high_min,
+ zone->pageset_high_max, zone->pageset_batch);
mutex_unlock(&pcp_batch_high_lock);
}
@@ -6462,28 +6657,24 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page,
int migratetype)
{
unsigned long size = 1 << high;
- struct page *current_buddy, *next_page;
+ struct page *current_buddy;
while (high > low) {
high--;
size >>= 1;
if (target >= &page[size]) {
- next_page = page + size;
current_buddy = page;
+ page = page + size;
} else {
- next_page = page;
current_buddy = page + size;
}
if (set_page_guard(zone, current_buddy, high, migratetype))
continue;
- if (current_buddy != target) {
- add_to_free_list(current_buddy, zone, high, migratetype);
- set_buddy_order(current_buddy, high);
- page = next_page;
- }
+ add_to_free_list(current_buddy, zone, high, migratetype);
+ set_buddy_order(current_buddy, high);
}
}
diff --git a/mm/page_io.c b/mm/page_io.c
index fe4c21af23f2..cb559ae324c6 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -208,8 +208,10 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
static inline void count_swpout_vm_event(struct folio *folio)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (unlikely(folio_test_pmd_mappable(folio)))
+ if (unlikely(folio_test_pmd_mappable(folio))) {
+ count_memcg_folio_events(folio, THP_SWPOUT, 1);
count_vm_event(THP_SWPOUT);
+ }
#endif
count_vm_events(PSWPOUT, folio_nr_pages(folio));
}
@@ -278,9 +280,6 @@ static void sio_write_complete(struct kiocb *iocb, long ret)
set_page_dirty(page);
ClearPageReclaim(page);
}
- } else {
- for (p = 0; p < sio->pages; p++)
- count_swpout_vm_event(page_folio(sio->bvec[p].bv_page));
}
for (p = 0; p < sio->pages; p++)
@@ -296,6 +295,7 @@ static void swap_writepage_fs(struct page *page, struct writeback_control *wbc)
struct file *swap_file = sis->swap_file;
loff_t pos = page_file_offset(page);
+ count_swpout_vm_event(page_folio(page));
set_page_writeback(page);
unlock_page(page);
if (wbc->swap_plug)
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 4e2723e1b300..4f13ce7d2452 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -408,11 +408,11 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
return -ENOMEM;
ret = scnprintf(kbuf, count,
- "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns, free_ts %llu ns\n",
+ "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns\n",
page_owner->order, page_owner->gfp_mask,
&page_owner->gfp_mask, page_owner->pid,
page_owner->tgid, page_owner->comm,
- page_owner->ts_nsec, page_owner->free_ts_nsec);
+ page_owner->ts_nsec);
/* Print information relevant to grouping pages by mobility */
pageblock_mt = get_pageblock_migratetype(page);
diff --git a/mm/percpu.c b/mm/percpu.c
index a7665de8485f..7b97d31df767 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1628,14 +1628,12 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
if (!memcg_kmem_online() || !(gfp & __GFP_ACCOUNT))
return true;
- objcg = get_obj_cgroup_from_current();
+ objcg = current_obj_cgroup();
if (!objcg)
return true;
- if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) {
- obj_cgroup_put(objcg);
+ if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size)))
return false;
- }
*objcgp = objcg;
return true;
@@ -1649,6 +1647,7 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
return;
if (likely(chunk && chunk->obj_cgroups)) {
+ obj_cgroup_get(objcg);
chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
rcu_read_lock();
@@ -1657,7 +1656,6 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
rcu_read_unlock();
} else {
obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
- obj_cgroup_put(objcg);
}
}
@@ -2245,6 +2243,37 @@ static void pcpu_balance_workfn(struct work_struct *work)
}
/**
+ * pcpu_alloc_size - the size of the dynamic percpu area
+ * @ptr: pointer to the dynamic percpu area
+ *
+ * Returns the size of the @ptr allocation. This is undefined for statically
+ * defined percpu variables as there is no corresponding chunk->bound_map.
+ *
+ * RETURNS:
+ * The size of the dynamic percpu area.
+ *
+ * CONTEXT:
+ * Can be called from atomic context.
+ */
+size_t pcpu_alloc_size(void __percpu *ptr)
+{
+ struct pcpu_chunk *chunk;
+ unsigned long bit_off, end;
+ void *addr;
+
+ if (!ptr)
+ return 0;
+
+ addr = __pcpu_ptr_to_addr(ptr);
+ /* No pcpu_lock here: ptr has not been freed, so chunk is still alive */
+ chunk = pcpu_chunk_addr_search(addr);
+ bit_off = (addr - chunk->base_addr) / PCPU_MIN_ALLOC_SIZE;
+ end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
+ bit_off + 1);
+ return (end - bit_off) * PCPU_MIN_ALLOC_SIZE;
+}
+
+/**
* free_percpu - free percpu area
* @ptr: pointer to area to free
*
@@ -2267,12 +2296,10 @@ void free_percpu(void __percpu *ptr)
kmemleak_free_percpu(ptr);
addr = __pcpu_ptr_to_addr(ptr);
-
- spin_lock_irqsave(&pcpu_lock, flags);
-
chunk = pcpu_chunk_addr_search(addr);
off = addr - chunk->base_addr;
+ spin_lock_irqsave(&pcpu_lock, flags);
size = pcpu_free_area(chunk, off);
pcpu_memcg_free_hook(chunk, off, size);
diff --git a/mm/readahead.c b/mm/readahead.c
index e815c114de21..6925e6959fd3 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -735,7 +735,8 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
*/
ret = -EINVAL;
if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
- !S_ISREG(file_inode(f.file)->i_mode))
+ (!S_ISREG(file_inode(f.file)->i_mode) &&
+ !S_ISBLK(file_inode(f.file)->i_mode)))
goto out;
ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
diff --git a/mm/rmap.c b/mm/rmap.c
index ec7f8e6c9e48..7a27a2b41802 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -798,6 +798,7 @@ struct folio_referenced_arg {
unsigned long vm_flags;
struct mem_cgroup *memcg;
};
+
/*
* arg: folio_referenced_arg will be passed
*/
@@ -807,17 +808,33 @@ static bool folio_referenced_one(struct folio *folio,
struct folio_referenced_arg *pra = arg;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
int referenced = 0;
+ unsigned long start = address, ptes = 0;
while (page_vma_mapped_walk(&pvmw)) {
address = pvmw.address;
- if ((vma->vm_flags & VM_LOCKED) &&
- (!folio_test_large(folio) || !pvmw.pte)) {
- /* Restore the mlock which got missed */
- mlock_vma_folio(folio, vma, !pvmw.pte);
- page_vma_mapped_walk_done(&pvmw);
- pra->vm_flags |= VM_LOCKED;
- return false; /* To break the loop */
+ if (vma->vm_flags & VM_LOCKED) {
+ if (!folio_test_large(folio) || !pvmw.pte) {
+ /* Restore the mlock which got missed */
+ mlock_vma_folio(folio, vma);
+ page_vma_mapped_walk_done(&pvmw);
+ pra->vm_flags |= VM_LOCKED;
+ return false; /* To break the loop */
+ }
+ /*
+ * For large folio fully mapped to VMA, will
+ * be handled after the pvmw loop.
+ *
+ * For large folio cross VMA boundaries, it's
+ * expected to be picked by page reclaim. But
+ * should skip reference of pages which are in
+ * the range of VM_LOCKED vma. As page reclaim
+ * should just count the reference of pages out
+ * the range of VM_LOCKED vma.
+ */
+ ptes++;
+ pra->mapcount--;
+ continue;
}
if (pvmw.pte) {
@@ -842,6 +859,23 @@ static bool folio_referenced_one(struct folio *folio,
pra->mapcount--;
}
+ if ((vma->vm_flags & VM_LOCKED) &&
+ folio_test_large(folio) &&
+ folio_within_vma(folio, vma)) {
+ unsigned long s_align, e_align;
+
+ s_align = ALIGN_DOWN(start, PMD_SIZE);
+ e_align = ALIGN_DOWN(start + folio_size(folio) - 1, PMD_SIZE);
+
+ /* folio doesn't cross page table boundary and fully mapped */
+ if ((s_align == e_align) && (ptes == folio_nr_pages(folio))) {
+ /* Restore the mlock which got missed */
+ mlock_vma_folio(folio, vma);
+ pra->vm_flags |= VM_LOCKED;
+ return false; /* To break the loop */
+ }
+ }
+
if (referenced)
folio_clear_idle(folio);
if (folio_test_clear_young(folio))
@@ -1094,19 +1128,17 @@ int folio_total_mapcount(struct folio *folio)
}
/**
- * page_move_anon_rmap - move a page to our anon_vma
- * @page: the page to move to our anon_vma
- * @vma: the vma the page belongs to
+ * folio_move_anon_rmap - move a folio to our anon_vma
+ * @folio: The folio to move to our anon_vma
+ * @vma: The vma the folio belongs to
*
- * When a page belongs exclusively to one process after a COW event,
- * that page can be moved into the anon_vma that belongs to just that
- * process, so the rmap code will not search the parent or sibling
- * processes.
+ * When a folio belongs exclusively to one process after a COW event,
+ * that folio can be moved into the anon_vma that belongs to just that
+ * process, so the rmap code will not search the parent or sibling processes.
*/
-void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
+void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma)
{
void *anon_vma = vma->anon_vma;
- struct folio *folio = page_folio(page);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_VMA(!anon_vma, vma);
@@ -1118,31 +1150,25 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
* folio_test_anon()) will not see one without the other.
*/
WRITE_ONCE(folio->mapping, anon_vma);
- SetPageAnonExclusive(page);
}
/**
- * __page_set_anon_rmap - set up new anonymous rmap
- * @folio: Folio which contains page.
- * @page: Page to add to rmap.
- * @vma: VM area to add page to.
+ * __folio_set_anon - set up a new anonymous rmap for a folio
+ * @folio: The folio to set up the new anonymous rmap for.
+ * @vma: VM area to add the folio to.
* @address: User virtual address of the mapping
- * @exclusive: the page is exclusively owned by the current process
+ * @exclusive: Whether the folio is exclusive to the process.
*/
-static void __page_set_anon_rmap(struct folio *folio, struct page *page,
- struct vm_area_struct *vma, unsigned long address, int exclusive)
+static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma,
+ unsigned long address, bool exclusive)
{
struct anon_vma *anon_vma = vma->anon_vma;
BUG_ON(!anon_vma);
- if (folio_test_anon(folio))
- goto out;
-
/*
- * If the page isn't exclusively mapped into this vma,
- * we must use the _oldest_ possible anon_vma for the
- * page mapping!
+ * If the folio isn't exclusive to this vma, we must use the _oldest_
+ * possible anon_vma for the folio mapping!
*/
if (!exclusive)
anon_vma = anon_vma->root;
@@ -1156,9 +1182,6 @@ static void __page_set_anon_rmap(struct folio *folio, struct page *page,
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma);
folio->index = linear_page_index(vma, address);
-out:
- if (exclusive)
- SetPageAnonExclusive(page);
}
/**
@@ -1207,7 +1230,7 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
atomic_t *mapped = &folio->_nr_pages_mapped;
int nr = 0, nr_pmdmapped = 0;
bool compound = flags & RMAP_COMPOUND;
- bool first = true;
+ bool first;
/* Is page being mapped by PTE? Is this its first map to be added? */
if (likely(!compound)) {
@@ -1236,24 +1259,40 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
}
}
- VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
- VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
-
if (nr_pmdmapped)
__lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped);
if (nr)
__lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);
- if (likely(!folio_test_ksm(folio))) {
- /* address might be in next vma when migration races vma_merge */
- if (first)
- __page_set_anon_rmap(folio, page, vma, address,
- !!(flags & RMAP_EXCLUSIVE));
- else
- __page_check_anon_rmap(folio, page, vma, address);
+ if (unlikely(!folio_test_anon(folio))) {
+ VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+ /*
+ * For a PTE-mapped large folio, we only know that the single
+ * PTE is exclusive. Further, __folio_set_anon() might not get
+ * folio->index right when not given the address of the head
+ * page.
+ */
+ VM_WARN_ON_FOLIO(folio_test_large(folio) && !compound, folio);
+ __folio_set_anon(folio, vma, address,
+ !!(flags & RMAP_EXCLUSIVE));
+ } else if (likely(!folio_test_ksm(folio))) {
+ __page_check_anon_rmap(folio, page, vma, address);
}
+ if (flags & RMAP_EXCLUSIVE)
+ SetPageAnonExclusive(page);
+ /* While PTE-mapping a THP we have a PMD and a PTE mapping. */
+ VM_WARN_ON_FOLIO((atomic_read(&page->_mapcount) > 0 ||
+ (folio_test_large(folio) && folio_entire_mapcount(folio) > 1)) &&
+ PageAnonExclusive(page), folio);
- mlock_vma_folio(folio, vma, compound);
+ /*
+ * For large folio, only mlock it if it's fully mapped to VMA. It's
+ * not easy to check whether the large folio is fully mapped to VMA
+ * here. Only mlock normal 4K folio and leave page reclaim to handle
+ * large folio.
+ */
+ if (!folio_test_large(folio))
+ mlock_vma_folio(folio, vma);
}
/**
@@ -1290,7 +1329,8 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
}
__lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);
- __page_set_anon_rmap(folio, &folio->page, vma, address, 1);
+ __folio_set_anon(folio, vma, address, true);
+ SetPageAnonExclusive(&folio->page);
}
/**
@@ -1352,7 +1392,9 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page,
if (nr)
__lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr);
- mlock_vma_folio(folio, vma, compound);
+ /* See comments in page_add_anon_rmap() */
+ if (!folio_test_large(folio))
+ mlock_vma_folio(folio, vma);
}
/**
@@ -1463,7 +1505,7 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
* it's only reliable while mapped.
*/
- munlock_vma_folio(folio, vma, compound);
+ munlock_vma_folio(folio, vma);
}
/*
@@ -1480,6 +1522,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
unsigned long pfn;
+ unsigned long hsz = 0;
/*
* When racing against e.g. zap_pte_range() on another cpu,
@@ -1511,6 +1554,9 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
adjust_range_if_pmd_sharing_possible(vma, &range.start,
&range.end);
+
+ /* We need the huge page size for set_huge_pte_at() */
+ hsz = huge_page_size(hstate_vma(vma));
}
mmu_notifier_invalidate_range_start(&range);
@@ -1524,7 +1570,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
if (!(flags & TTU_IGNORE_MLOCK) &&
(vma->vm_flags & VM_LOCKED)) {
/* Restore the mlock which got missed */
- mlock_vma_folio(folio, vma, false);
+ if (!folio_test_large(folio))
+ mlock_vma_folio(folio, vma);
page_vma_mapped_walk_done(&pvmw);
ret = false;
break;
@@ -1628,7 +1675,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (folio_test_hugetlb(folio)) {
hugetlb_count_sub(folio_nr_pages(folio), mm);
- set_huge_pte_at(mm, address, pvmw.pte, pteval);
+ set_huge_pte_at(mm, address, pvmw.pte, pteval,
+ hsz);
} else {
dec_mm_counter(mm, mm_counter(&folio->page));
set_pte_at(mm, address, pvmw.pte, pteval);
@@ -1820,6 +1868,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
unsigned long pfn;
+ unsigned long hsz = 0;
/*
* When racing against e.g. zap_pte_range() on another cpu,
@@ -1855,6 +1904,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
*/
adjust_range_if_pmd_sharing_possible(vma, &range.start,
&range.end);
+
+ /* We need the huge page size for set_huge_pte_at() */
+ hsz = huge_page_size(hstate_vma(vma));
}
mmu_notifier_invalidate_range_start(&range);
@@ -2020,7 +2072,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (folio_test_hugetlb(folio)) {
hugetlb_count_sub(folio_nr_pages(folio), mm);
- set_huge_pte_at(mm, address, pvmw.pte, pteval);
+ set_huge_pte_at(mm, address, pvmw.pte, pteval,
+ hsz);
} else {
dec_mm_counter(mm, mm_counter(&folio->page));
set_pte_at(mm, address, pvmw.pte, pteval);
@@ -2044,7 +2097,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
if (folio_test_hugetlb(folio))
- set_huge_pte_at(mm, address, pvmw.pte, pteval);
+ set_huge_pte_at(mm, address, pvmw.pte,
+ pteval, hsz);
else
set_pte_at(mm, address, pvmw.pte, pteval);
ret = false;
@@ -2058,7 +2112,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
if (anon_exclusive &&
page_try_share_anon_rmap(subpage)) {
if (folio_test_hugetlb(folio))
- set_huge_pte_at(mm, address, pvmw.pte, pteval);
+ set_huge_pte_at(mm, address, pvmw.pte,
+ pteval, hsz);
else
set_pte_at(mm, address, pvmw.pte, pteval);
ret = false;
@@ -2090,7 +2145,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
if (pte_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
if (folio_test_hugetlb(folio))
- set_huge_pte_at(mm, address, pvmw.pte, swp_pte);
+ set_huge_pte_at(mm, address, pvmw.pte, swp_pte,
+ hsz);
else
set_pte_at(mm, address, pvmw.pte, swp_pte);
trace_set_migration_pte(address, pte_val(swp_pte),
@@ -2527,22 +2583,16 @@ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc)
*
* RMAP_COMPOUND is ignored.
*/
-void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
+void hugepage_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
unsigned long address, rmap_t flags)
{
- struct folio *folio = page_folio(page);
- struct anon_vma *anon_vma = vma->anon_vma;
- int first;
+ VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
- BUG_ON(!folio_test_locked(folio));
- BUG_ON(!anon_vma);
- /* address might be in next vma when migration races vma_merge */
- first = atomic_inc_and_test(&folio->_entire_mapcount);
- VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
- VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
- if (first)
- __page_set_anon_rmap(folio, page, vma, address,
- !!(flags & RMAP_EXCLUSIVE));
+ atomic_inc(&folio->_entire_mapcount);
+ if (flags & RMAP_EXCLUSIVE)
+ SetPageAnonExclusive(&folio->page);
+ VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 &&
+ PageAnonExclusive(&folio->page), folio);
}
void hugepage_add_new_anon_rmap(struct folio *folio,
@@ -2552,6 +2602,7 @@ void hugepage_add_new_anon_rmap(struct folio *folio,
/* increment count (starts at -1) */
atomic_set(&folio->_entire_mapcount, 0);
folio_clear_hugetlb_restore_reserve(folio);
- __page_set_anon_rmap(folio, &folio->page, vma, address, 1);
+ __folio_set_anon(folio, vma, address, true);
+ SetPageAnonExclusive(&folio->page);
}
#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/shmem.c b/mm/shmem.c
index 69595d341882..91e2620148b2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -42,7 +42,7 @@
#include <linux/iversion.h>
#include "swap.h"
-static struct vfsmount *shm_mnt;
+static struct vfsmount *shm_mnt __ro_after_init;
#ifdef CONFIG_SHMEM
/*
@@ -146,9 +146,8 @@ static unsigned long shmem_default_max_inodes(void)
#endif
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
- struct folio **foliop, enum sgp_type sgp,
- gfp_t gfp, struct vm_area_struct *vma,
- vm_fault_t *fault_type);
+ struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
+ struct mm_struct *fault_mm, vm_fault_t *fault_type);
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
@@ -189,10 +188,10 @@ static inline int shmem_reacct_size(unsigned long flags,
/*
* ... whereas tmpfs objects are accounted incrementally as
* pages are allocated, in order to allow large sparse files.
- * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
+ * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM,
* so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
*/
-static inline int shmem_acct_block(unsigned long flags, long pages)
+static inline int shmem_acct_blocks(unsigned long flags, long pages)
{
if (!(flags & VM_NORESERVE))
return 0;
@@ -207,26 +206,26 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}
-static int shmem_inode_acct_block(struct inode *inode, long pages)
+static int shmem_inode_acct_blocks(struct inode *inode, long pages)
{
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
int err = -ENOSPC;
- if (shmem_acct_block(info->flags, pages))
+ if (shmem_acct_blocks(info->flags, pages))
return err;
might_sleep(); /* when quotas */
if (sbinfo->max_blocks) {
- if (percpu_counter_compare(&sbinfo->used_blocks,
- sbinfo->max_blocks - pages) > 0)
+ if (!percpu_counter_limited_add(&sbinfo->used_blocks,
+ sbinfo->max_blocks, pages))
goto unacct;
err = dquot_alloc_block_nodirty(inode, pages);
- if (err)
+ if (err) {
+ percpu_counter_sub(&sbinfo->used_blocks, pages);
goto unacct;
-
- percpu_counter_add(&sbinfo->used_blocks, pages);
+ }
} else {
err = dquot_alloc_block_nodirty(inode, pages);
if (err)
@@ -447,7 +446,7 @@ bool shmem_charge(struct inode *inode, long pages)
{
struct address_space *mapping = inode->i_mapping;
- if (shmem_inode_acct_block(inode, pages))
+ if (shmem_inode_acct_blocks(inode, pages))
return false;
/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
@@ -756,16 +755,14 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
- * Like filemap_add_folio, but error if expected item has gone.
+ * Somewhat like filemap_add_folio, but error if expected item has gone.
*/
static int shmem_add_to_page_cache(struct folio *folio,
struct address_space *mapping,
- pgoff_t index, void *expected, gfp_t gfp,
- struct mm_struct *charge_mm)
+ pgoff_t index, void *expected, gfp_t gfp)
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
long nr = folio_nr_pages(folio);
- int error;
VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
@@ -776,16 +773,7 @@ static int shmem_add_to_page_cache(struct folio *folio,
folio->mapping = mapping;
folio->index = index;
- if (!folio_test_swapcache(folio)) {
- error = mem_cgroup_charge(folio, charge_mm, gfp);
- if (error) {
- if (folio_test_pmd_mappable(folio)) {
- count_vm_event(THP_FILE_FALLBACK);
- count_vm_event(THP_FILE_FALLBACK_CHARGE);
- }
- goto error;
- }
- }
+ gfp &= GFP_RECLAIM_MASK;
folio_throttle_swaprate(folio, gfp);
do {
@@ -801,31 +789,26 @@ static int shmem_add_to_page_cache(struct folio *folio,
xas_store(&xas, folio);
if (xas_error(&xas))
goto unlock;
- if (folio_test_pmd_mappable(folio)) {
- count_vm_event(THP_FILE_ALLOC);
+ if (folio_test_pmd_mappable(folio))
__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
- }
- mapping->nrpages += nr;
__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
__lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
+ mapping->nrpages += nr;
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
if (xas_error(&xas)) {
- error = xas_error(&xas);
- goto error;
+ folio->mapping = NULL;
+ folio_ref_sub(folio, nr);
+ return xas_error(&xas);
}
return 0;
-error:
- folio->mapping = NULL;
- folio_ref_sub(folio, nr);
- return error;
}
/*
- * Like delete_from_page_cache, but substitutes swap for @folio.
+ * Somewhat like filemap_remove_folio, but substitutes swap for @folio.
*/
static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
{
@@ -887,7 +870,6 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
cond_resched_rcu();
}
}
-
rcu_read_unlock();
return swapped << PAGE_SHIFT;
@@ -1112,7 +1094,7 @@ whole_folios:
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
shmem_undo_range(inode, lstart, lend, false);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
inode_inc_iversion(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@ -1213,7 +1195,6 @@ static int shmem_setattr(struct mnt_idmap *idmap,
if (i_uid_needs_update(idmap, attr, inode) ||
i_gid_needs_update(idmap, attr, inode)) {
error = dquot_transfer(idmap, inode, attr);
-
if (error)
return error;
}
@@ -1224,7 +1205,7 @@ static int shmem_setattr(struct mnt_idmap *idmap,
if (!error && update_ctime) {
inode_set_ctime_current(inode);
if (update_mtime)
- inode->i_mtime = inode_get_ctime(inode);
+ inode_set_mtime_to_ts(inode, inode_get_ctime(inode));
inode_inc_iversion(inode);
}
return error;
@@ -1326,10 +1307,8 @@ static int shmem_unuse_swap_entries(struct inode *inode,
if (!xa_is_value(folio))
continue;
- error = shmem_swapin_folio(inode, indices[i],
- &folio, SGP_CACHE,
- mapping_gfp_mask(mapping),
- NULL, NULL);
+ error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
+ mapping_gfp_mask(mapping), NULL, NULL);
if (error == 0) {
folio_unlock(folio);
folio_put(folio);
@@ -1565,38 +1544,20 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
return NULL;
}
#endif /* CONFIG_NUMA && CONFIG_TMPFS */
-#ifndef CONFIG_NUMA
-#define vm_policy vm_private_data
-#endif
-static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
- struct shmem_inode_info *info, pgoff_t index)
-{
- /* Create a pseudo vma that just contains the policy */
- vma_init(vma, NULL);
- /* Bias interleave by inode number to distribute better across nodes */
- vma->vm_pgoff = index + info->vfs_inode.i_ino;
- vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
-}
-
-static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
-{
- /* Drop reference taken by mpol_shared_policy_lookup() */
- mpol_cond_put(vma->vm_policy);
-}
+static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+ pgoff_t index, unsigned int order, pgoff_t *ilx);
-static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
- struct vm_area_struct pvma;
+ struct mempolicy *mpol;
+ pgoff_t ilx;
struct page *page;
- struct vm_fault vmf = {
- .vma = &pvma,
- };
- shmem_pseudo_vma_init(&pvma, info, index);
- page = swap_cluster_readahead(swap, gfp, &vmf);
- shmem_pseudo_vma_destroy(&pvma);
+ mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
+ page = swap_cluster_readahead(swap, gfp, mpol, ilx);
+ mpol_cond_put(mpol);
if (!page)
return NULL;
@@ -1630,67 +1591,126 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
- struct vm_area_struct pvma;
- struct address_space *mapping = info->vfs_inode.i_mapping;
- pgoff_t hindex;
- struct folio *folio;
+ struct mempolicy *mpol;
+ pgoff_t ilx;
+ struct page *page;
- hindex = round_down(index, HPAGE_PMD_NR);
- if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
- XA_PRESENT))
- return NULL;
+ mpol = shmem_get_pgoff_policy(info, index, HPAGE_PMD_ORDER, &ilx);
+ page = alloc_pages_mpol(gfp, HPAGE_PMD_ORDER, mpol, ilx, numa_node_id());
+ mpol_cond_put(mpol);
- shmem_pseudo_vma_init(&pvma, info, hindex);
- folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
- shmem_pseudo_vma_destroy(&pvma);
- if (!folio)
- count_vm_event(THP_FILE_FALLBACK);
- return folio;
+ return page_rmappable_folio(page);
}
static struct folio *shmem_alloc_folio(gfp_t gfp,
- struct shmem_inode_info *info, pgoff_t index)
+ struct shmem_inode_info *info, pgoff_t index)
{
- struct vm_area_struct pvma;
- struct folio *folio;
+ struct mempolicy *mpol;
+ pgoff_t ilx;
+ struct page *page;
- shmem_pseudo_vma_init(&pvma, info, index);
- folio = vma_alloc_folio(gfp, 0, &pvma, 0, false);
- shmem_pseudo_vma_destroy(&pvma);
+ mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
+ page = alloc_pages_mpol(gfp, 0, mpol, ilx, numa_node_id());
+ mpol_cond_put(mpol);
- return folio;
+ return (struct folio *)page;
}
-static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
- pgoff_t index, bool huge)
+static struct folio *shmem_alloc_and_add_folio(gfp_t gfp,
+ struct inode *inode, pgoff_t index,
+ struct mm_struct *fault_mm, bool huge)
{
+ struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
struct folio *folio;
- int nr;
- int err;
+ long pages;
+ int error;
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
huge = false;
- nr = huge ? HPAGE_PMD_NR : 1;
- err = shmem_inode_acct_block(inode, nr);
- if (err)
- goto failed;
+ if (huge) {
+ pages = HPAGE_PMD_NR;
+ index = round_down(index, HPAGE_PMD_NR);
+
+ /*
+ * Check for conflict before waiting on a huge allocation.
+ * Conflict might be that a huge page has just been allocated
+ * and added to page cache by a racing thread, or that there
+ * is already at least one small page in the huge extent.
+ * Be careful to retry when appropriate, but not forever!
+ * Elsewhere -EEXIST would be the right code, but not here.
+ */
+ if (xa_find(&mapping->i_pages, &index,
+ index + HPAGE_PMD_NR - 1, XA_PRESENT))
+ return ERR_PTR(-E2BIG);
- if (huge)
folio = shmem_alloc_hugefolio(gfp, info, index);
- else
+ if (!folio)
+ count_vm_event(THP_FILE_FALLBACK);
+ } else {
+ pages = 1;
folio = shmem_alloc_folio(gfp, info, index);
- if (folio) {
- __folio_set_locked(folio);
- __folio_set_swapbacked(folio);
- return folio;
}
+ if (!folio)
+ return ERR_PTR(-ENOMEM);
- err = -ENOMEM;
- shmem_inode_unacct_blocks(inode, nr);
-failed:
- return ERR_PTR(err);
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
+
+ gfp &= GFP_RECLAIM_MASK;
+ error = mem_cgroup_charge(folio, fault_mm, gfp);
+ if (error) {
+ if (xa_find(&mapping->i_pages, &index,
+ index + pages - 1, XA_PRESENT)) {
+ error = -EEXIST;
+ } else if (huge) {
+ count_vm_event(THP_FILE_FALLBACK);
+ count_vm_event(THP_FILE_FALLBACK_CHARGE);
+ }
+ goto unlock;
+ }
+
+ error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp);
+ if (error)
+ goto unlock;
+
+ error = shmem_inode_acct_blocks(inode, pages);
+ if (error) {
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ long freed;
+ /*
+ * Try to reclaim some space by splitting a few
+ * large folios beyond i_size on the filesystem.
+ */
+ shmem_unused_huge_shrink(sbinfo, NULL, 2);
+ /*
+ * And do a shmem_recalc_inode() to account for freed pages:
+ * except our folio is there in cache, so not quite balanced.
+ */
+ spin_lock(&info->lock);
+ freed = pages + info->alloced - info->swapped -
+ READ_ONCE(mapping->nrpages);
+ if (freed > 0)
+ info->alloced -= freed;
+ spin_unlock(&info->lock);
+ if (freed > 0)
+ shmem_inode_unacct_blocks(inode, freed);
+ error = shmem_inode_acct_blocks(inode, pages);
+ if (error) {
+ filemap_remove_folio(folio);
+ goto unlock;
+ }
+ }
+
+ shmem_recalc_inode(inode, pages, 0);
+ folio_add_lru(folio);
+ return folio;
+
+unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+ return ERR_PTR(error);
}
/*
@@ -1812,12 +1832,11 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
*/
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct folio **foliop, enum sgp_type sgp,
- gfp_t gfp, struct vm_area_struct *vma,
+ gfp_t gfp, struct mm_struct *fault_mm,
vm_fault_t *fault_type)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
- struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
struct swap_info_struct *si;
struct folio *folio = NULL;
swp_entry_t swap;
@@ -1845,10 +1864,10 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
- count_memcg_event_mm(charge_mm, PGMAJFAULT);
+ count_memcg_event_mm(fault_mm, PGMAJFAULT);
}
/* Here we actually start the io */
- folio = shmem_swapin(swap, gfp, info, index);
+ folio = shmem_swapin_cluster(swap, gfp, info, index);
if (!folio) {
error = -ENOMEM;
goto failed;
@@ -1882,8 +1901,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
}
error = shmem_add_to_page_cache(folio, mapping, index,
- swp_to_radix_entry(swap), gfp,
- charge_mm);
+ swp_to_radix_entry(swap), gfp);
if (error)
goto failed;
@@ -1921,37 +1939,29 @@ unlock:
* vm. If we swap it in we mark it dirty since we also free the swap
* entry since a page cannot live in both the swap and page cache.
*
- * vma, vmf, and fault_type are only supplied by shmem_fault:
- * otherwise they are NULL.
+ * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
*/
static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
- struct vm_area_struct *vma, struct vm_fault *vmf,
- vm_fault_t *fault_type)
+ struct vm_fault *vmf, vm_fault_t *fault_type)
{
- struct address_space *mapping = inode->i_mapping;
- struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_sb_info *sbinfo;
- struct mm_struct *charge_mm;
+ struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
+ struct mm_struct *fault_mm;
struct folio *folio;
- pgoff_t hindex;
- gfp_t huge_gfp;
int error;
- int once = 0;
- int alloced = 0;
+ bool alloced;
if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
return -EFBIG;
repeat:
if (sgp <= SGP_CACHE &&
- ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
+ ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode))
return -EINVAL;
- }
- sbinfo = SHMEM_SB(inode->i_sb);
- charge_mm = vma ? vma->vm_mm : NULL;
+ alloced = false;
+ fault_mm = vma ? vma->vm_mm : NULL;
- folio = filemap_get_entry(mapping, index);
+ folio = filemap_get_entry(inode->i_mapping, index);
if (folio && vma && userfaultfd_minor(vma)) {
if (!xa_is_value(folio))
folio_put(folio);
@@ -1961,7 +1971,7 @@ repeat:
if (xa_is_value(folio)) {
error = shmem_swapin_folio(inode, index, &folio,
- sgp, gfp, vma, fault_type);
+ sgp, gfp, fault_mm, fault_type);
if (error == -EEXIST)
goto repeat;
@@ -1973,7 +1983,7 @@ repeat:
folio_lock(folio);
/* Has the folio been truncated or swapped out? */
- if (unlikely(folio->mapping != mapping)) {
+ if (unlikely(folio->mapping != inode->i_mapping)) {
folio_unlock(folio);
folio_put(folio);
goto repeat;
@@ -2008,58 +2018,38 @@ repeat:
return 0;
}
- if (!shmem_is_huge(inode, index, false,
- vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0))
- goto alloc_nohuge;
+ if (shmem_is_huge(inode, index, false, fault_mm,
+ vma ? vma->vm_flags : 0)) {
+ gfp_t huge_gfp;
- huge_gfp = vma_thp_gfp_mask(vma);
- huge_gfp = limit_gfp_mask(huge_gfp, gfp);
- folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true);
- if (IS_ERR(folio)) {
-alloc_nohuge:
- folio = shmem_alloc_and_acct_folio(gfp, inode, index, false);
+ huge_gfp = vma_thp_gfp_mask(vma);
+ huge_gfp = limit_gfp_mask(huge_gfp, gfp);
+ folio = shmem_alloc_and_add_folio(huge_gfp,
+ inode, index, fault_mm, true);
+ if (!IS_ERR(folio)) {
+ count_vm_event(THP_FILE_ALLOC);
+ goto alloced;
+ }
+ if (PTR_ERR(folio) == -EEXIST)
+ goto repeat;
}
- if (IS_ERR(folio)) {
- int retry = 5;
+ folio = shmem_alloc_and_add_folio(gfp, inode, index, fault_mm, false);
+ if (IS_ERR(folio)) {
error = PTR_ERR(folio);
+ if (error == -EEXIST)
+ goto repeat;
folio = NULL;
- if (error != -ENOSPC)
- goto unlock;
- /*
- * Try to reclaim some space by splitting a large folio
- * beyond i_size on the filesystem.
- */
- while (retry--) {
- int ret;
-
- ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
- if (ret == SHRINK_STOP)
- break;
- if (ret)
- goto alloc_nohuge;
- }
goto unlock;
}
- hindex = round_down(index, folio_nr_pages(folio));
-
- if (sgp == SGP_WRITE)
- __folio_set_referenced(folio);
-
- error = shmem_add_to_page_cache(folio, mapping, hindex,
- NULL, gfp & GFP_RECLAIM_MASK,
- charge_mm);
- if (error)
- goto unacct;
-
- folio_add_lru(folio);
- shmem_recalc_inode(inode, folio_nr_pages(folio), 0);
+alloced:
alloced = true;
-
if (folio_test_pmd_mappable(folio) &&
DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
folio_next_index(folio) - 1) {
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ struct shmem_inode_info *info = SHMEM_I(inode);
/*
* Part of the large folio is beyond i_size: subject
* to shrink under memory pressure.
@@ -2077,6 +2067,8 @@ alloc_nohuge:
spin_unlock(&sbinfo->shrinklist_lock);
}
+ if (sgp == SGP_WRITE)
+ folio_set_referenced(folio);
/*
* Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
*/
@@ -2100,11 +2092,6 @@ clear:
/* Perhaps the file has been truncated since we checked */
if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
- if (alloced) {
- folio_clear_dirty(folio);
- filemap_remove_folio(folio);
- shmem_recalc_inode(inode, 0, 0);
- }
error = -EINVAL;
goto unlock;
}
@@ -2115,25 +2102,14 @@ out:
/*
* Error recovery.
*/
-unacct:
- shmem_inode_unacct_blocks(inode, folio_nr_pages(folio));
-
- if (folio_test_large(folio)) {
- folio_unlock(folio);
- folio_put(folio);
- goto alloc_nohuge;
- }
unlock:
+ if (alloced)
+ filemap_remove_folio(folio);
+ shmem_recalc_inode(inode, 0, 0);
if (folio) {
folio_unlock(folio);
folio_put(folio);
}
- if (error == -ENOSPC && !once++) {
- shmem_recalc_inode(inode, 0, 0);
- goto repeat;
- }
- if (error == -EEXIST)
- goto repeat;
return error;
}
@@ -2141,7 +2117,7 @@ int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
enum sgp_type sgp)
{
return shmem_get_folio_gfp(inode, index, foliop, sgp,
- mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
+ mapping_gfp_mask(inode->i_mapping), NULL, NULL);
}
/*
@@ -2149,87 +2125,99 @@ int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
* entry unconditionally - even if something else had already woken the
* target.
*/
-static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
+static int synchronous_wake_function(wait_queue_entry_t *wait,
+ unsigned int mode, int sync, void *key)
{
int ret = default_wake_function(wait, mode, sync, key);
list_del_init(&wait->entry);
return ret;
}
+/*
+ * Trinity finds that probing a hole which tmpfs is punching can
+ * prevent the hole-punch from ever completing: which in turn
+ * locks writers out with its hold on i_rwsem. So refrain from
+ * faulting pages into the hole while it's being punched. Although
+ * shmem_undo_range() does remove the additions, it may be unable to
+ * keep up, as each new page needs its own unmap_mapping_range() call,
+ * and the i_mmap tree grows ever slower to scan if new vmas are added.
+ *
+ * It does not matter if we sometimes reach this check just before the
+ * hole-punch begins, so that one fault then races with the punch:
+ * we just need to make racing faults a rare case.
+ *
+ * The implementation below would be much simpler if we just used a
+ * standard mutex or completion: but we cannot take i_rwsem in fault,
+ * and bloating every shmem inode for this unlikely case would be sad.
+ */
+static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode)
+{
+ struct shmem_falloc *shmem_falloc;
+ struct file *fpin = NULL;
+ vm_fault_t ret = 0;
+
+ spin_lock(&inode->i_lock);
+ shmem_falloc = inode->i_private;
+ if (shmem_falloc &&
+ shmem_falloc->waitq &&
+ vmf->pgoff >= shmem_falloc->start &&
+ vmf->pgoff < shmem_falloc->next) {
+ wait_queue_head_t *shmem_falloc_waitq;
+ DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
+
+ ret = VM_FAULT_NOPAGE;
+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
+ shmem_falloc_waitq = shmem_falloc->waitq;
+ prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode->i_lock);
+ schedule();
+
+ /*
+ * shmem_falloc_waitq points into the shmem_fallocate()
+ * stack of the hole-punching task: shmem_falloc_waitq
+ * is usually invalid by the time we reach here, but
+ * finish_wait() does not dereference it in that case;
+ * though i_lock needed lest racing with wake_up_all().
+ */
+ spin_lock(&inode->i_lock);
+ finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
+ }
+ spin_unlock(&inode->i_lock);
+ if (fpin) {
+ fput(fpin);
+ ret = VM_FAULT_RETRY;
+ }
+ return ret;
+}
+
static vm_fault_t shmem_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = vmf->vma;
- struct inode *inode = file_inode(vma->vm_file);
+ struct inode *inode = file_inode(vmf->vma->vm_file);
gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
struct folio *folio = NULL;
+ vm_fault_t ret = 0;
int err;
- vm_fault_t ret = VM_FAULT_LOCKED;
/*
* Trinity finds that probing a hole which tmpfs is punching can
- * prevent the hole-punch from ever completing: which in turn
- * locks writers out with its hold on i_rwsem. So refrain from
- * faulting pages into the hole while it's being punched. Although
- * shmem_undo_range() does remove the additions, it may be unable to
- * keep up, as each new page needs its own unmap_mapping_range() call,
- * and the i_mmap tree grows ever slower to scan if new vmas are added.
- *
- * It does not matter if we sometimes reach this check just before the
- * hole-punch begins, so that one fault then races with the punch:
- * we just need to make racing faults a rare case.
- *
- * The implementation below would be much simpler if we just used a
- * standard mutex or completion: but we cannot take i_rwsem in fault,
- * and bloating every shmem inode for this unlikely case would be sad.
+ * prevent the hole-punch from ever completing: noted in i_private.
*/
if (unlikely(inode->i_private)) {
- struct shmem_falloc *shmem_falloc;
-
- spin_lock(&inode->i_lock);
- shmem_falloc = inode->i_private;
- if (shmem_falloc &&
- shmem_falloc->waitq &&
- vmf->pgoff >= shmem_falloc->start &&
- vmf->pgoff < shmem_falloc->next) {
- struct file *fpin;
- wait_queue_head_t *shmem_falloc_waitq;
- DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
-
- ret = VM_FAULT_NOPAGE;
- fpin = maybe_unlock_mmap_for_io(vmf, NULL);
- if (fpin)
- ret = VM_FAULT_RETRY;
-
- shmem_falloc_waitq = shmem_falloc->waitq;
- prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
- TASK_UNINTERRUPTIBLE);
- spin_unlock(&inode->i_lock);
- schedule();
-
- /*
- * shmem_falloc_waitq points into the shmem_fallocate()
- * stack of the hole-punching task: shmem_falloc_waitq
- * is usually invalid by the time we reach here, but
- * finish_wait() does not dereference it in that case;
- * though i_lock needed lest racing with wake_up_all().
- */
- spin_lock(&inode->i_lock);
- finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
- spin_unlock(&inode->i_lock);
-
- if (fpin)
- fput(fpin);
+ ret = shmem_falloc_wait(vmf, inode);
+ if (ret)
return ret;
- }
- spin_unlock(&inode->i_lock);
}
+ WARN_ON_ONCE(vmf->page != NULL);
err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
- gfp, vma, vmf, &ret);
+ gfp, vmf, &ret);
if (err)
return vmf_error(err);
- if (folio)
+ if (folio) {
vmf->page = folio_file_page(folio, vmf->pgoff);
+ ret |= VM_FAULT_LOCKED;
+ }
return ret;
}
@@ -2330,15 +2318,41 @@ static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
}
static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
- unsigned long addr)
+ unsigned long addr, pgoff_t *ilx)
{
struct inode *inode = file_inode(vma->vm_file);
pgoff_t index;
+ /*
+ * Bias interleave by inode number to distribute better across nodes;
+ * but this interface is independent of which page order is used, so
+ * supplies only that bias, letting caller apply the offset (adjusted
+ * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()).
+ */
+ *ilx = inode->i_ino;
index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
}
-#endif
+
+static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+ pgoff_t index, unsigned int order, pgoff_t *ilx)
+{
+ struct mempolicy *mpol;
+
+ /* Bias interleave by inode number to distribute better across nodes */
+ *ilx = info->vfs_inode.i_ino + (index >> order);
+
+ mpol = mpol_shared_policy_lookup(&info->policy, index);
+ return mpol ? mpol : get_task_policy(current);
+}
+#else
+static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+ pgoff_t index, unsigned int order, pgoff_t *ilx)
+{
+ *ilx = 0;
+ return NULL;
+}
+#endif /* CONFIG_NUMA */
int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
@@ -2374,7 +2388,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
struct shmem_inode_info *info = SHMEM_I(inode);
int ret;
- ret = seal_check_future_write(info->seals, vma);
+ ret = seal_check_write(info->seals, vma);
if (ret)
return ret;
@@ -2445,7 +2459,6 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
if (err)
return ERR_PTR(err);
-
inode = new_inode(sb);
if (!inode) {
shmem_free_inode(sb, 0);
@@ -2455,7 +2468,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
inode->i_ino = ino;
inode_init_owner(idmap, inode, dir, mode);
inode->i_blocks = 0;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_generation = get_random_u32();
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
@@ -2463,18 +2476,17 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
atomic_set(&info->stop_eviction, 0);
info->seals = F_SEAL_SEAL;
info->flags = flags & VM_NORESERVE;
- info->i_crtime = inode->i_mtime;
+ info->i_crtime = inode_get_mtime(inode);
info->fsflags = (dir == NULL) ? 0 :
SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
if (info->fsflags)
shmem_set_inode_flags(inode, info->fsflags);
INIT_LIST_HEAD(&info->shrinklist);
INIT_LIST_HEAD(&info->swaplist);
- INIT_LIST_HEAD(&info->swaplist);
- if (sbinfo->noswap)
- mapping_set_unevictable(inode->i_mapping);
simple_xattrs_init(&info->xattrs);
cache_no_acl(inode);
+ if (sbinfo->noswap)
+ mapping_set_unevictable(inode->i_mapping);
mapping_set_large_folios(inode->i_mapping);
switch (mode & S_IFMT) {
@@ -2565,7 +2577,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
int ret;
pgoff_t max_off;
- if (shmem_inode_acct_block(inode, 1)) {
+ if (shmem_inode_acct_blocks(inode, 1)) {
/*
* We may have got a page, returned -ENOENT triggering a retry,
* and now we find ourselves with -ENOMEM. Release the page, to
@@ -2637,8 +2649,10 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
if (unlikely(pgoff >= max_off))
goto out_release;
- ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
- gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm);
+ ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp);
+ if (ret)
+ goto out_release;
+ ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp);
if (ret)
goto out_release;
@@ -2686,7 +2700,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
}
ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
-
if (ret)
return ret;
@@ -3218,8 +3231,7 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
error = simple_acl_create(dir, inode);
if (error)
goto out_iput;
- error = security_inode_init_security(inode, dir,
- &dentry->d_name,
+ error = security_inode_init_security(inode, dir, &dentry->d_name,
shmem_initxattrs, NULL);
if (error && error != -EOPNOTSUPP)
goto out_iput;
@@ -3229,7 +3241,7 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
goto out_iput;
dir->i_size += BOGO_DIRENT_SIZE;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_inc_iversion(dir);
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
@@ -3248,14 +3260,11 @@ shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
int error;
inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
-
if (IS_ERR(inode)) {
error = PTR_ERR(inode);
goto err_out;
}
-
- error = security_inode_init_security(inode, dir,
- NULL,
+ error = security_inode_init_security(inode, dir, NULL,
shmem_initxattrs, NULL);
if (error && error != -EOPNOTSUPP)
goto out_iput;
@@ -3292,7 +3301,8 @@ static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
/*
* Link a file..
*/
-static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+static int shmem_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
{
struct inode *inode = d_inode(old_dentry);
int ret = 0;
@@ -3318,12 +3328,12 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
}
dir->i_size += BOGO_DIRENT_SIZE;
- dir->i_mtime = inode_set_ctime_to_ts(dir,
- inode_set_ctime_current(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
inode_inc_iversion(dir);
inc_nlink(inode);
ihold(inode); /* New dentry reference */
- dget(dentry); /* Extra pinning count for the created dentry */
+ dget(dentry); /* Extra pinning count for the created dentry */
d_instantiate(dentry, inode);
out:
return ret;
@@ -3339,11 +3349,11 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
dir->i_size -= BOGO_DIRENT_SIZE;
- dir->i_mtime = inode_set_ctime_to_ts(dir,
- inode_set_ctime_current(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
inode_inc_iversion(dir);
drop_nlink(inode);
- dput(dentry); /* Undo the count from "create" - this does all the work */
+ dput(dentry); /* Undo the count from "create" - does all the work */
return 0;
}
@@ -3453,7 +3463,6 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
VM_NORESERVE);
-
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -3488,7 +3497,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
folio_put(folio);
}
dir->i_size += BOGO_DIRENT_SIZE;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_inc_iversion(dir);
d_instantiate(dentry, inode);
dget(dentry);
@@ -3507,8 +3516,7 @@ static void shmem_put_link(void *arg)
folio_put(arg);
}
-static const char *shmem_get_link(struct dentry *dentry,
- struct inode *inode,
+static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
struct delayed_call *done)
{
struct folio *folio = NULL;
@@ -3582,8 +3590,7 @@ static int shmem_fileattr_set(struct mnt_idmap *idmap,
* Callback for security_inode_init_security() for acquiring xattrs.
*/
static int shmem_initxattrs(struct inode *inode,
- const struct xattr *xattr_array,
- void *fs_info)
+ const struct xattr *xattr_array, void *fs_info)
{
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
@@ -3714,7 +3721,7 @@ static const struct xattr_handler shmem_user_xattr_handler = {
.set = shmem_xattr_handler_set,
};
-static const struct xattr_handler *shmem_xattr_handlers[] = {
+static const struct xattr_handler * const shmem_xattr_handlers[] = {
&shmem_security_xattr_handler,
&shmem_trusted_xattr_handler,
&shmem_user_xattr_handler,
@@ -3767,7 +3774,6 @@ static struct dentry *shmem_find_alias(struct inode *inode)
return alias ?: d_find_any_alias(inode);
}
-
static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type)
{
@@ -4351,8 +4357,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
}
#endif /* CONFIG_TMPFS_QUOTA */
- inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0,
- VM_NORESERVE);
+ inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL,
+ S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
if (IS_ERR(inode)) {
error = PTR_ERR(inode);
goto failed;
@@ -4394,7 +4400,7 @@ static const struct fs_context_operations shmem_fs_context_ops = {
#endif
};
-static struct kmem_cache *shmem_inode_cachep;
+static struct kmem_cache *shmem_inode_cachep __ro_after_init;
static struct inode *shmem_alloc_inode(struct super_block *sb)
{
@@ -4426,14 +4432,14 @@ static void shmem_init_inode(void *foo)
inode_init_once(&info->vfs_inode);
}
-static void shmem_init_inodecache(void)
+static void __init shmem_init_inodecache(void)
{
shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
sizeof(struct shmem_inode_info),
0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
}
-static void shmem_destroy_inodecache(void)
+static void __init shmem_destroy_inodecache(void)
{
kmem_cache_destroy(shmem_inode_cachep);
}
@@ -4585,11 +4591,7 @@ static struct file_system_type shmem_fs_type = {
.parameters = shmem_fs_parameters,
#endif
.kill_sb = kill_litter_super,
-#ifdef CONFIG_SHMEM
.fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
-#else
- .fs_flags = FS_USERNS_MOUNT,
-#endif
};
void __init shmem_init(void)
@@ -4655,11 +4657,9 @@ static ssize_t shmem_enabled_show(struct kobject *kobj,
for (i = 0; i < ARRAY_SIZE(values); i++) {
len += sysfs_emit_at(buf, len,
- shmem_huge == values[i] ? "%s[%s]" : "%s%s",
- i ? " " : "",
- shmem_format_huge(values[i]));
+ shmem_huge == values[i] ? "%s[%s]" : "%s%s",
+ i ? " " : "", shmem_format_huge(values[i]));
}
-
len += sysfs_emit_at(buf, len, "\n");
return len;
@@ -4756,8 +4756,9 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
#define shmem_acct_size(flags, size) 0
#define shmem_unacct_size(flags, size) do {} while (0)
-static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir,
- umode_t mode, dev_t dev, unsigned long flags)
+static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
+ struct super_block *sb, struct inode *dir,
+ umode_t mode, dev_t dev, unsigned long flags)
{
struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
return inode ? inode : ERR_PTR(-ENOSPC);
@@ -4767,8 +4768,8 @@ static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct supe
/* common code */
-static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
- unsigned long flags, unsigned int i_flags)
+static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
+ loff_t size, unsigned long flags, unsigned int i_flags)
{
struct inode *inode;
struct file *res;
@@ -4787,7 +4788,6 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l
inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
S_IFREG | S_IRWXUGO, 0, flags);
-
if (IS_ERR(inode)) {
shmem_unacct_size(flags, size);
return ERR_CAST(inode);
@@ -4897,7 +4897,7 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping,
BUG_ON(!shmem_mapping(mapping));
error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
- gfp, NULL, NULL, NULL);
+ gfp, NULL, NULL);
if (error)
return ERR_PTR(error);
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 4b888b18bdde..ba0808d6917f 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -34,13 +34,8 @@ long si_mem_available(void)
long available;
unsigned long pagecache;
unsigned long wmark_low = 0;
- unsigned long pages[NR_LRU_LISTS];
unsigned long reclaimable;
struct zone *zone;
- int lru;
-
- for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
- pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
for_each_zone(zone)
wmark_low += low_wmark_pages(zone);
@@ -56,7 +51,8 @@ long si_mem_available(void)
* start swapping or thrashing. Assume at least half of the page
* cache, or the low watermark worth of cache, needs to stay.
*/
- pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+ pagecache = global_node_page_state(NR_ACTIVE_FILE) +
+ global_node_page_state(NR_INACTIVE_FILE);
pagecache -= min(pagecache / 2, wmark_low);
available += pagecache;
@@ -67,7 +63,8 @@ long si_mem_available(void)
*/
reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
- available += reclaimable - min(reclaimable / 2, wmark_low);
+ reclaimable -= min(reclaimable / 2, wmark_low);
+ available += reclaimable;
if (available < 0)
available = 0;
diff --git a/mm/shrinker.c b/mm/shrinker.c
new file mode 100644
index 000000000000..dd91eab43ed3
--- /dev/null
+++ b/mm/shrinker.c
@@ -0,0 +1,809 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/memcontrol.h>
+#include <linux/rwsem.h>
+#include <linux/shrinker.h>
+#include <linux/rculist.h>
+#include <trace/events/vmscan.h>
+
+#include "internal.h"
+
+LIST_HEAD(shrinker_list);
+DEFINE_MUTEX(shrinker_mutex);
+
+#ifdef CONFIG_MEMCG
+static int shrinker_nr_max;
+
+static inline int shrinker_unit_size(int nr_items)
+{
+ return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *));
+}
+
+static inline void shrinker_unit_free(struct shrinker_info *info, int start)
+{
+ struct shrinker_info_unit **unit;
+ int nr, i;
+
+ if (!info)
+ return;
+
+ unit = info->unit;
+ nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);
+
+ for (i = start; i < nr; i++) {
+ if (!unit[i])
+ break;
+
+ kfree(unit[i]);
+ unit[i] = NULL;
+ }
+}
+
+static inline int shrinker_unit_alloc(struct shrinker_info *new,
+ struct shrinker_info *old, int nid)
+{
+ struct shrinker_info_unit *unit;
+ int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
+ int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0;
+ int i;
+
+ for (i = start; i < nr; i++) {
+ unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid);
+ if (!unit) {
+ shrinker_unit_free(new, start);
+ return -ENOMEM;
+ }
+
+ new->unit[i] = unit;
+ }
+
+ return 0;
+}
+
+void free_shrinker_info(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup_per_node *pn;
+ struct shrinker_info *info;
+ int nid;
+
+ for_each_node(nid) {
+ pn = memcg->nodeinfo[nid];
+ info = rcu_dereference_protected(pn->shrinker_info, true);
+ shrinker_unit_free(info, 0);
+ kvfree(info);
+ rcu_assign_pointer(pn->shrinker_info, NULL);
+ }
+}
+
+int alloc_shrinker_info(struct mem_cgroup *memcg)
+{
+ struct shrinker_info *info;
+ int nid, ret = 0;
+ int array_size = 0;
+
+ mutex_lock(&shrinker_mutex);
+ array_size = shrinker_unit_size(shrinker_nr_max);
+ for_each_node(nid) {
+ info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid);
+ if (!info)
+ goto err;
+ info->map_nr_max = shrinker_nr_max;
+ if (shrinker_unit_alloc(info, NULL, nid))
+ goto err;
+ rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
+ }
+ mutex_unlock(&shrinker_mutex);
+
+ return ret;
+
+err:
+ mutex_unlock(&shrinker_mutex);
+ free_shrinker_info(memcg);
+ return -ENOMEM;
+}
+
+static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
+ int nid)
+{
+ return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
+ lockdep_is_held(&shrinker_mutex));
+}
+
+static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size,
+ int old_size, int new_nr_max)
+{
+ struct shrinker_info *new, *old;
+ struct mem_cgroup_per_node *pn;
+ int nid;
+
+ for_each_node(nid) {
+ pn = memcg->nodeinfo[nid];
+ old = shrinker_info_protected(memcg, nid);
+ /* Not yet online memcg */
+ if (!old)
+ return 0;
+
+ /* Already expanded this shrinker_info */
+ if (new_nr_max <= old->map_nr_max)
+ continue;
+
+ new = kvmalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid);
+ if (!new)
+ return -ENOMEM;
+
+ new->map_nr_max = new_nr_max;
+
+ memcpy(new->unit, old->unit, old_size);
+ if (shrinker_unit_alloc(new, old, nid)) {
+ kvfree(new);
+ return -ENOMEM;
+ }
+
+ rcu_assign_pointer(pn->shrinker_info, new);
+ kvfree_rcu(old, rcu);
+ }
+
+ return 0;
+}
+
+static int expand_shrinker_info(int new_id)
+{
+ int ret = 0;
+ int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS);
+ int new_size, old_size = 0;
+ struct mem_cgroup *memcg;
+
+ if (!root_mem_cgroup)
+ goto out;
+
+ lockdep_assert_held(&shrinker_mutex);
+
+ new_size = shrinker_unit_size(new_nr_max);
+ old_size = shrinker_unit_size(shrinker_nr_max);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ ret = expand_one_shrinker_info(memcg, new_size, old_size,
+ new_nr_max);
+ if (ret) {
+ mem_cgroup_iter_break(NULL, memcg);
+ goto out;
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+out:
+ if (!ret)
+ shrinker_nr_max = new_nr_max;
+
+ return ret;
+}
+
+static inline int shrinker_id_to_index(int shrinker_id)
+{
+ return shrinker_id / SHRINKER_UNIT_BITS;
+}
+
+static inline int shrinker_id_to_offset(int shrinker_id)
+{
+ return shrinker_id % SHRINKER_UNIT_BITS;
+}
+
+static inline int calc_shrinker_id(int index, int offset)
+{
+ return index * SHRINKER_UNIT_BITS + offset;
+}
+
+void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
+{
+ if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
+ struct shrinker_info *info;
+ struct shrinker_info_unit *unit;
+
+ rcu_read_lock();
+ info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+ unit = info->unit[shrinker_id_to_index(shrinker_id)];
+ if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
+ /* Pairs with smp mb in shrink_slab() */
+ smp_mb__before_atomic();
+ set_bit(shrinker_id_to_offset(shrinker_id), unit->map);
+ }
+ rcu_read_unlock();
+ }
+}
+
+static DEFINE_IDR(shrinker_idr);
+
+static int shrinker_memcg_alloc(struct shrinker *shrinker)
+{
+ int id, ret = -ENOMEM;
+
+ if (mem_cgroup_disabled())
+ return -ENOSYS;
+
+ mutex_lock(&shrinker_mutex);
+ id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
+ if (id < 0)
+ goto unlock;
+
+ if (id >= shrinker_nr_max) {
+ if (expand_shrinker_info(id)) {
+ idr_remove(&shrinker_idr, id);
+ goto unlock;
+ }
+ }
+ shrinker->id = id;
+ ret = 0;
+unlock:
+ mutex_unlock(&shrinker_mutex);
+ return ret;
+}
+
+static void shrinker_memcg_remove(struct shrinker *shrinker)
+{
+ int id = shrinker->id;
+
+ BUG_ON(id < 0);
+
+ lockdep_assert_held(&shrinker_mutex);
+
+ idr_remove(&shrinker_idr, id);
+}
+
+static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
+ struct mem_cgroup *memcg)
+{
+ struct shrinker_info *info;
+ struct shrinker_info_unit *unit;
+ long nr_deferred;
+
+ rcu_read_lock();
+ info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+ unit = info->unit[shrinker_id_to_index(shrinker->id)];
+ nr_deferred = atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0);
+ rcu_read_unlock();
+
+ return nr_deferred;
+}
+
+static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
+ struct mem_cgroup *memcg)
+{
+ struct shrinker_info *info;
+ struct shrinker_info_unit *unit;
+ long nr_deferred;
+
+ rcu_read_lock();
+ info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+ unit = info->unit[shrinker_id_to_index(shrinker->id)];
+ nr_deferred =
+ atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]);
+ rcu_read_unlock();
+
+ return nr_deferred;
+}
+
+void reparent_shrinker_deferred(struct mem_cgroup *memcg)
+{
+ int nid, index, offset;
+ long nr;
+ struct mem_cgroup *parent;
+ struct shrinker_info *child_info, *parent_info;
+ struct shrinker_info_unit *child_unit, *parent_unit;
+
+ parent = parent_mem_cgroup(memcg);
+ if (!parent)
+ parent = root_mem_cgroup;
+
+ /* Prevent from concurrent shrinker_info expand */
+ mutex_lock(&shrinker_mutex);
+ for_each_node(nid) {
+ child_info = shrinker_info_protected(memcg, nid);
+ parent_info = shrinker_info_protected(parent, nid);
+ for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) {
+ child_unit = child_info->unit[index];
+ parent_unit = parent_info->unit[index];
+ for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) {
+ nr = atomic_long_read(&child_unit->nr_deferred[offset]);
+ atomic_long_add(nr, &parent_unit->nr_deferred[offset]);
+ }
+ }
+ }
+ mutex_unlock(&shrinker_mutex);
+}
+#else
+static int shrinker_memcg_alloc(struct shrinker *shrinker)
+{
+ return -ENOSYS;
+}
+
+static void shrinker_memcg_remove(struct shrinker *shrinker)
+{
+}
+
+static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
+ struct mem_cgroup *memcg)
+{
+ return 0;
+}
+
+static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
+ struct mem_cgroup *memcg)
+{
+ return 0;
+}
+#endif /* CONFIG_MEMCG */
+
+static long xchg_nr_deferred(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ int nid = sc->nid;
+
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+ nid = 0;
+
+ if (sc->memcg &&
+ (shrinker->flags & SHRINKER_MEMCG_AWARE))
+ return xchg_nr_deferred_memcg(nid, shrinker,
+ sc->memcg);
+
+ return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+}
+
+
+static long add_nr_deferred(long nr, struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ int nid = sc->nid;
+
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+ nid = 0;
+
+ if (sc->memcg &&
+ (shrinker->flags & SHRINKER_MEMCG_AWARE))
+ return add_nr_deferred_memcg(nr, nid, shrinker,
+ sc->memcg);
+
+ return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
+}
+
+#define SHRINK_BATCH 128
+
+static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
+ struct shrinker *shrinker, int priority)
+{
+ unsigned long freed = 0;
+ unsigned long long delta;
+ long total_scan;
+ long freeable;
+ long nr;
+ long new_nr;
+ long batch_size = shrinker->batch ? shrinker->batch
+ : SHRINK_BATCH;
+ long scanned = 0, next_deferred;
+
+ freeable = shrinker->count_objects(shrinker, shrinkctl);
+ if (freeable == 0 || freeable == SHRINK_EMPTY)
+ return freeable;
+
+ /*
+ * copy the current shrinker scan count into a local variable
+ * and zero it so that other concurrent shrinker invocations
+ * don't also do this scanning work.
+ */
+ nr = xchg_nr_deferred(shrinker, shrinkctl);
+
+ if (shrinker->seeks) {
+ delta = freeable >> priority;
+ delta *= 4;
+ do_div(delta, shrinker->seeks);
+ } else {
+ /*
+ * These objects don't require any IO to create. Trim
+ * them aggressively under memory pressure to keep
+ * them from causing refetches in the IO caches.
+ */
+ delta = freeable / 2;
+ }
+
+ total_scan = nr >> priority;
+ total_scan += delta;
+ total_scan = min(total_scan, (2 * freeable));
+
+ trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+ freeable, delta, total_scan, priority);
+
+ /*
+ * Normally, we should not scan less than batch_size objects in one
+ * pass to avoid too frequent shrinker calls, but if the slab has less
+ * than batch_size objects in total and we are really tight on memory,
+ * we will try to reclaim all available objects, otherwise we can end
+ * up failing allocations although there are plenty of reclaimable
+ * objects spread over several slabs with usage less than the
+ * batch_size.
+ *
+ * We detect the "tight on memory" situations by looking at the total
+ * number of objects we want to scan (total_scan). If it is greater
+ * than the total number of objects on slab (freeable), we must be
+ * scanning at high prio and therefore should try to reclaim as much as
+ * possible.
+ */
+ while (total_scan >= batch_size ||
+ total_scan >= freeable) {
+ unsigned long ret;
+ unsigned long nr_to_scan = min(batch_size, total_scan);
+
+ shrinkctl->nr_to_scan = nr_to_scan;
+ shrinkctl->nr_scanned = nr_to_scan;
+ ret = shrinker->scan_objects(shrinker, shrinkctl);
+ if (ret == SHRINK_STOP)
+ break;
+ freed += ret;
+
+ count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
+ total_scan -= shrinkctl->nr_scanned;
+ scanned += shrinkctl->nr_scanned;
+
+ cond_resched();
+ }
+
+ /*
+ * The deferred work is increased by any new work (delta) that wasn't
+ * done, decreased by old deferred work that was done now.
+ *
+ * And it is capped to two times of the freeable items.
+ */
+ next_deferred = max_t(long, (nr + delta - scanned), 0);
+ next_deferred = min(next_deferred, (2 * freeable));
+
+ /*
+ * move the unused scan count back into the shrinker in a
+ * manner that handles concurrent updates.
+ */
+ new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
+
+ trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
+ return freed;
+}
+
+#ifdef CONFIG_MEMCG
+static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
+ struct mem_cgroup *memcg, int priority)
+{
+ struct shrinker_info *info;
+ unsigned long ret, freed = 0;
+ int offset, index = 0;
+
+ if (!mem_cgroup_online(memcg))
+ return 0;
+
+ /*
+ * lockless algorithm of memcg shrink.
+ *
+ * The shrinker_info may be freed asynchronously via RCU in the
+ * expand_one_shrinker_info(), so the rcu_read_lock() needs to be used
+ * to ensure the existence of the shrinker_info.
+ *
+ * The shrinker_info_unit is never freed unless its corresponding memcg
+ * is destroyed. Here we already hold the refcount of memcg, so the
+ * memcg will not be destroyed, and of course shrinker_info_unit will
+ * not be freed.
+ *
+ * So in the memcg shrink:
+ * step 1: use rcu_read_lock() to guarantee existence of the
+ * shrinker_info.
+ * step 2: after getting shrinker_info_unit we can safely release the
+ * RCU lock.
+ * step 3: traverse the bitmap and calculate shrinker_id
+ * step 4: use rcu_read_lock() to guarantee existence of the shrinker.
+ * step 5: use shrinker_id to find the shrinker, then use
+ * shrinker_try_get() to guarantee existence of the shrinker,
+ * then we can release the RCU lock to do do_shrink_slab() that
+ * may sleep.
+ * step 6: do shrinker_put() paired with step 5 to put the refcount,
+ * if the refcount reaches 0, then wake up the waiter in
+ * shrinker_free() by calling complete().
+ * Note: here is different from the global shrink, we don't
+ * need to acquire the RCU lock to guarantee existence of
+ * the shrinker, because we don't need to use this
+ * shrinker to traverse the next shrinker in the bitmap.
+ * step 7: we have already exited the read-side of rcu critical section
+ * before calling do_shrink_slab(), the shrinker_info may be
+ * released in expand_one_shrinker_info(), so go back to step 1
+ * to reacquire the shrinker_info.
+ */
+again:
+ rcu_read_lock();
+ info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+ if (unlikely(!info))
+ goto unlock;
+
+ if (index < shrinker_id_to_index(info->map_nr_max)) {
+ struct shrinker_info_unit *unit;
+
+ unit = info->unit[index];
+
+ rcu_read_unlock();
+
+ for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) {
+ struct shrink_control sc = {
+ .gfp_mask = gfp_mask,
+ .nid = nid,
+ .memcg = memcg,
+ };
+ struct shrinker *shrinker;
+ int shrinker_id = calc_shrinker_id(index, offset);
+
+ rcu_read_lock();
+ shrinker = idr_find(&shrinker_idr, shrinker_id);
+ if (unlikely(!shrinker || !shrinker_try_get(shrinker))) {
+ clear_bit(offset, unit->map);
+ rcu_read_unlock();
+ continue;
+ }
+ rcu_read_unlock();
+
+ /* Call non-slab shrinkers even though kmem is disabled */
+ if (!memcg_kmem_online() &&
+ !(shrinker->flags & SHRINKER_NONSLAB))
+ continue;
+
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY) {
+ clear_bit(offset, unit->map);
+ /*
+ * After the shrinker reported that it had no objects to
+ * free, but before we cleared the corresponding bit in
+ * the memcg shrinker map, a new object might have been
+ * added. To make sure, we have the bit set in this
+ * case, we invoke the shrinker one more time and reset
+ * the bit if it reports that it is not empty anymore.
+ * The memory barrier here pairs with the barrier in
+ * set_shrinker_bit():
+ *
+ * list_lru_add() shrink_slab_memcg()
+ * list_add_tail() clear_bit()
+ * <MB> <MB>
+ * set_bit() do_shrink_slab()
+ */
+ smp_mb__after_atomic();
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY)
+ ret = 0;
+ else
+ set_shrinker_bit(memcg, nid, shrinker_id);
+ }
+ freed += ret;
+ shrinker_put(shrinker);
+ }
+
+ index++;
+ goto again;
+ }
+unlock:
+ rcu_read_unlock();
+ return freed;
+}
+#else /* !CONFIG_MEMCG */
+static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
+ struct mem_cgroup *memcg, int priority)
+{
+ return 0;
+}
+#endif /* CONFIG_MEMCG */
+
+/**
+ * shrink_slab - shrink slab caches
+ * @gfp_mask: allocation context
+ * @nid: node whose slab caches to target
+ * @memcg: memory cgroup whose slab caches to target
+ * @priority: the reclaim priority
+ *
+ * Call the shrink functions to age shrinkable caches.
+ *
+ * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
+ * unaware shrinkers will receive a node id of 0 instead.
+ *
+ * @memcg specifies the memory cgroup to target. Unaware shrinkers
+ * are called only if it is the root cgroup.
+ *
+ * @priority is sc->priority, we take the number of objects and >> by priority
+ * in order to get the scan target.
+ *
+ * Returns the number of reclaimed slab objects.
+ */
+unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
+ int priority)
+{
+ unsigned long ret, freed = 0;
+ struct shrinker *shrinker;
+
+ /*
+ * The root memcg might be allocated even though memcg is disabled
+ * via "cgroup_disable=memory" boot parameter. This could make
+ * mem_cgroup_is_root() return false, then just run memcg slab
+ * shrink, but skip global shrink. This may result in premature
+ * oom.
+ */
+ if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
+ return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
+
+ /*
+ * lockless algorithm of global shrink.
+ *
+ * In the unregistration setp, the shrinker will be freed asynchronously
+ * via RCU after its refcount reaches 0. So both rcu_read_lock() and
+ * shrinker_try_get() can be used to ensure the existence of the shrinker.
+ *
+ * So in the global shrink:
+ * step 1: use rcu_read_lock() to guarantee existence of the shrinker
+ * and the validity of the shrinker_list walk.
+ * step 2: use shrinker_try_get() to try get the refcount, if successful,
+ * then the existence of the shrinker can also be guaranteed,
+ * so we can release the RCU lock to do do_shrink_slab() that
+ * may sleep.
+ * step 3: *MUST* to reacquire the RCU lock before calling shrinker_put(),
+ * which ensures that neither this shrinker nor the next shrinker
+ * will be freed in the next traversal operation.
+ * step 4: do shrinker_put() paired with step 2 to put the refcount,
+ * if the refcount reaches 0, then wake up the waiter in
+ * shrinker_free() by calling complete().
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
+ struct shrink_control sc = {
+ .gfp_mask = gfp_mask,
+ .nid = nid,
+ .memcg = memcg,
+ };
+
+ if (!shrinker_try_get(shrinker))
+ continue;
+
+ rcu_read_unlock();
+
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY)
+ ret = 0;
+ freed += ret;
+
+ rcu_read_lock();
+ shrinker_put(shrinker);
+ }
+
+ rcu_read_unlock();
+ cond_resched();
+ return freed;
+}
+
+struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
+{
+ struct shrinker *shrinker;
+ unsigned int size;
+ va_list ap;
+ int err;
+
+ shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL);
+ if (!shrinker)
+ return NULL;
+
+ va_start(ap, fmt);
+ err = shrinker_debugfs_name_alloc(shrinker, fmt, ap);
+ va_end(ap);
+ if (err)
+ goto err_name;
+
+ shrinker->flags = flags | SHRINKER_ALLOCATED;
+ shrinker->seeks = DEFAULT_SEEKS;
+
+ if (flags & SHRINKER_MEMCG_AWARE) {
+ err = shrinker_memcg_alloc(shrinker);
+ if (err == -ENOSYS) {
+ /* Memcg is not supported, fallback to non-memcg-aware shrinker. */
+ shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
+ goto non_memcg;
+ }
+
+ if (err)
+ goto err_flags;
+
+ return shrinker;
+ }
+
+non_memcg:
+ /*
+ * The nr_deferred is available on per memcg level for memcg aware
+ * shrinkers, so only allocate nr_deferred in the following cases:
+ * - non-memcg-aware shrinkers
+ * - !CONFIG_MEMCG
+ * - memcg is disabled by kernel command line
+ */
+ size = sizeof(*shrinker->nr_deferred);
+ if (flags & SHRINKER_NUMA_AWARE)
+ size *= nr_node_ids;
+
+ shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
+ if (!shrinker->nr_deferred)
+ goto err_flags;
+
+ return shrinker;
+
+err_flags:
+ shrinker_debugfs_name_free(shrinker);
+err_name:
+ kfree(shrinker);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(shrinker_alloc);
+
+void shrinker_register(struct shrinker *shrinker)
+{
+ if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) {
+ pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker");
+ return;
+ }
+
+ mutex_lock(&shrinker_mutex);
+ list_add_tail_rcu(&shrinker->list, &shrinker_list);
+ shrinker->flags |= SHRINKER_REGISTERED;
+ shrinker_debugfs_add(shrinker);
+ mutex_unlock(&shrinker_mutex);
+
+ init_completion(&shrinker->done);
+ /*
+ * Now the shrinker is fully set up, take the first reference to it to
+ * indicate that lookup operations are now allowed to use it via
+ * shrinker_try_get().
+ */
+ refcount_set(&shrinker->refcount, 1);
+}
+EXPORT_SYMBOL_GPL(shrinker_register);
+
+static void shrinker_free_rcu_cb(struct rcu_head *head)
+{
+ struct shrinker *shrinker = container_of(head, struct shrinker, rcu);
+
+ kfree(shrinker->nr_deferred);
+ kfree(shrinker);
+}
+
+void shrinker_free(struct shrinker *shrinker)
+{
+ struct dentry *debugfs_entry = NULL;
+ int debugfs_id;
+
+ if (!shrinker)
+ return;
+
+ if (shrinker->flags & SHRINKER_REGISTERED) {
+ /* drop the initial refcount */
+ shrinker_put(shrinker);
+ /*
+ * Wait for all lookups of the shrinker to complete, after that,
+ * no shrinker is running or will run again, then we can safely
+ * free it asynchronously via RCU and safely free the structure
+ * where the shrinker is located, such as super_block etc.
+ */
+ wait_for_completion(&shrinker->done);
+ }
+
+ mutex_lock(&shrinker_mutex);
+ if (shrinker->flags & SHRINKER_REGISTERED) {
+ /*
+ * Now we can safely remove it from the shrinker_list and then
+ * free it.
+ */
+ list_del_rcu(&shrinker->list);
+ debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
+ shrinker->flags &= ~SHRINKER_REGISTERED;
+ }
+
+ shrinker_debugfs_name_free(shrinker);
+
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ shrinker_memcg_remove(shrinker);
+ mutex_unlock(&shrinker_mutex);
+
+ if (debugfs_entry)
+ shrinker_debugfs_remove(debugfs_entry, debugfs_id);
+
+ call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
+}
+EXPORT_SYMBOL_GPL(shrinker_free);
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index 3ab53fad8876..12ea5486a3e9 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -6,8 +6,10 @@
#include <linux/shrinker.h>
#include <linux/memcontrol.h>
+#include "internal.h"
+
/* defined in vmscan.c */
-extern struct rw_semaphore shrinker_rwsem;
+extern struct mutex shrinker_mutex;
extern struct list_head shrinker_list;
static DEFINE_IDA(shrinker_debugfs_ida);
@@ -49,17 +51,12 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
struct mem_cgroup *memcg;
unsigned long total;
bool memcg_aware;
- int ret, nid;
+ int ret = 0, nid;
count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
if (!count_per_node)
return -ENOMEM;
- ret = down_read_killable(&shrinker_rwsem);
- if (ret) {
- kfree(count_per_node);
- return ret;
- }
rcu_read_lock();
memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE;
@@ -92,7 +89,6 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
rcu_read_unlock();
- up_read(&shrinker_rwsem);
kfree(count_per_node);
return ret;
@@ -117,7 +113,6 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file,
struct mem_cgroup *memcg = NULL;
int nid;
char kbuf[72];
- ssize_t ret;
read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1);
if (copy_from_user(kbuf, buf, read_len))
@@ -146,12 +141,6 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file,
return -EINVAL;
}
- ret = down_read_killable(&shrinker_rwsem);
- if (ret) {
- mem_cgroup_put(memcg);
- return ret;
- }
-
sc.nid = nid;
sc.memcg = memcg;
sc.nr_to_scan = nr_to_scan;
@@ -159,7 +148,6 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file,
shrinker->scan_objects(shrinker, &sc);
- up_read(&shrinker_rwsem);
mem_cgroup_put(memcg);
return size;
@@ -177,7 +165,7 @@ int shrinker_debugfs_add(struct shrinker *shrinker)
char buf[128];
int id;
- lockdep_assert_held(&shrinker_rwsem);
+ lockdep_assert_held(&shrinker_mutex);
/* debugfs isn't initialized yet, add debugfs entries later. */
if (!shrinker_debugfs_root)
@@ -220,7 +208,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
if (!new)
return -ENOMEM;
- down_write(&shrinker_rwsem);
+ mutex_lock(&shrinker_mutex);
old = shrinker->name;
shrinker->name = new;
@@ -238,7 +226,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
shrinker->debugfs_entry = entry;
}
- up_write(&shrinker_rwsem);
+ mutex_unlock(&shrinker_mutex);
kfree_const(old);
@@ -251,10 +239,7 @@ struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
{
struct dentry *entry = shrinker->debugfs_entry;
- lockdep_assert_held(&shrinker_rwsem);
-
- kfree_const(shrinker->name);
- shrinker->name = NULL;
+ lockdep_assert_held(&shrinker_mutex);
*debugfs_id = entry ? shrinker->debugfs_id : -1;
shrinker->debugfs_entry = NULL;
@@ -280,14 +265,14 @@ static int __init shrinker_debugfs_init(void)
shrinker_debugfs_root = dentry;
/* Create debugfs entries for shrinkers registered at boot */
- down_write(&shrinker_rwsem);
+ mutex_lock(&shrinker_mutex);
list_for_each_entry(shrinker, &shrinker_list, list)
if (!shrinker->debugfs_entry) {
ret = shrinker_debugfs_add(shrinker);
if (ret)
break;
}
- up_write(&shrinker_rwsem);
+ mutex_unlock(&shrinker_mutex);
return ret;
}
diff --git a/mm/slab.h b/mm/slab.h
index 799a315695c6..3d07fb428393 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -484,7 +484,12 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))
return true;
- objcg = get_obj_cgroup_from_current();
+ /*
+ * The obtained objcg pointer is safe to use within the current scope,
+ * defined by current task or set_active_memcg() pair.
+ * obj_cgroup_get() is used to get a permanent reference.
+ */
+ objcg = current_obj_cgroup();
if (!objcg)
return true;
@@ -497,17 +502,14 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
css_put(&memcg->css);
if (ret)
- goto out;
+ return false;
}
if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s)))
- goto out;
+ return false;
*objcgp = objcg;
return true;
-out:
- obj_cgroup_put(objcg);
- return false;
}
static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
@@ -542,7 +544,6 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
obj_cgroup_uncharge(objcg, obj_full_size(s));
}
}
- obj_cgroup_put(objcg);
}
static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
diff --git a/mm/slab_common.c b/mm/slab_common.c
index cd71f9581e67..8d431193c273 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -479,7 +479,7 @@ void slab_kmem_cache_release(struct kmem_cache *s)
void kmem_cache_destroy(struct kmem_cache *s)
{
- int refcnt;
+ int err = -EBUSY;
bool rcu_set;
if (unlikely(!s) || !kasan_check_byte(s))
@@ -490,17 +490,17 @@ void kmem_cache_destroy(struct kmem_cache *s)
rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
- refcnt = --s->refcount;
- if (refcnt)
+ s->refcount--;
+ if (s->refcount)
goto out_unlock;
- WARN(shutdown_cache(s),
- "%s %s: Slab cache still has objects when called from %pS",
+ err = shutdown_cache(s);
+ WARN(err, "%s %s: Slab cache still has objects when called from %pS",
__func__, s->name, (void *)_RET_IP_);
out_unlock:
mutex_unlock(&slab_mutex);
cpus_read_unlock();
- if (!refcnt && !rcu_set)
+ if (!err && !rcu_set)
kmem_cache_release(s);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -528,26 +528,6 @@ bool slab_is_available(void)
}
#ifdef CONFIG_PRINTK
-/**
- * kmem_valid_obj - does the pointer reference a valid slab object?
- * @object: pointer to query.
- *
- * Return: %true if the pointer is to a not-yet-freed object from
- * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer
- * is to an already-freed object, and %false otherwise.
- */
-bool kmem_valid_obj(void *object)
-{
- struct folio *folio;
-
- /* Some arches consider ZERO_SIZE_PTR to be a valid address. */
- if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
- return false;
- folio = virt_to_folio(object);
- return folio_test_slab(folio);
-}
-EXPORT_SYMBOL_GPL(kmem_valid_obj);
-
static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
if (__kfence_obj_info(kpp, object, slab))
@@ -566,11 +546,11 @@ static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *
* and, if available, the slab name, return address, and stack trace from
* the allocation and last free path of that object.
*
- * This function will splat if passed a pointer to a non-slab object.
- * If you are not sure what type of object you have, you should instead
- * use mem_dump_obj().
+ * Return: %true if the pointer is to a not-yet-freed object from
+ * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer
+ * is to an already-freed object, and %false otherwise.
*/
-void kmem_dump_obj(void *object)
+bool kmem_dump_obj(void *object)
{
char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc";
int i;
@@ -578,13 +558,13 @@ void kmem_dump_obj(void *object)
unsigned long ptroffset;
struct kmem_obj_info kp = { };
- if (WARN_ON_ONCE(!virt_addr_valid(object)))
- return;
+ /* Some arches consider ZERO_SIZE_PTR to be a valid address. */
+ if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
+ return false;
slab = virt_to_slab(object);
- if (WARN_ON_ONCE(!slab)) {
- pr_cont(" non-slab memory.\n");
- return;
- }
+ if (!slab)
+ return false;
+
kmem_obj_info(&kp, object, slab);
if (kp.kp_slab_cache)
pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
@@ -621,6 +601,7 @@ void kmem_dump_obj(void *object)
pr_info(" %pS\n", kp.kp_free_stack[i]);
}
+ return true;
}
EXPORT_SYMBOL_GPL(kmem_dump_obj);
#endif
@@ -745,24 +726,24 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags, unsigned long caller)
size_t kmalloc_size_roundup(size_t size)
{
- struct kmem_cache *c;
+ if (size && size <= KMALLOC_MAX_CACHE_SIZE) {
+ /*
+ * The flags don't matter since size_index is common to all.
+ * Neither does the caller for just getting ->object_size.
+ */
+ return kmalloc_slab(size, GFP_KERNEL, 0)->object_size;
+ }
- /* Short-circuit the 0 size case. */
- if (unlikely(size == 0))
- return 0;
- /* Short-circuit saturated "too-large" case. */
- if (unlikely(size == SIZE_MAX))
- return SIZE_MAX;
/* Above the smaller buckets, size is a multiple of page size. */
- if (size > KMALLOC_MAX_CACHE_SIZE)
+ if (size && size <= KMALLOC_MAX_SIZE)
return PAGE_SIZE << get_order(size);
/*
- * The flags don't matter since size_index is common to all.
- * Neither does the caller for just getting ->object_size.
+ * Return 'size' for 0 - kmalloc() returns ZERO_SIZE_PTR
+ * and very large size - kmalloc() may fail.
*/
- c = kmalloc_slab(size, GFP_KERNEL, 0);
- return c ? c->object_size : 0;
+ return size;
+
}
EXPORT_SYMBOL(kmalloc_size_roundup);
@@ -895,10 +876,13 @@ void __init setup_kmalloc_cache_index_table(void)
static unsigned int __kmalloc_minalign(void)
{
+ unsigned int minalign = dma_get_cache_alignment();
+
if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) &&
is_swiotlb_allocated())
- return ARCH_KMALLOC_MINALIGN;
- return dma_get_cache_alignment();
+ minalign = ARCH_KMALLOC_MINALIGN;
+
+ return max(minalign, arch_slab_minalign());
}
void __init
diff --git a/mm/slub.c b/mm/slub.c
index f7940048138c..63d281dfacdb 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4110,17 +4110,12 @@ static unsigned int slub_min_objects;
* the smallest order which will fit the object.
*/
static inline unsigned int calc_slab_order(unsigned int size,
- unsigned int min_objects, unsigned int max_order,
+ unsigned int min_order, unsigned int max_order,
unsigned int fract_leftover)
{
- unsigned int min_order = slub_min_order;
unsigned int order;
- if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
- return get_order(size * MAX_OBJS_PER_PAGE) - 1;
-
- for (order = max(min_order, (unsigned int)get_order(min_objects * size));
- order <= max_order; order++) {
+ for (order = min_order; order <= max_order; order++) {
unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
unsigned int rem;
@@ -4139,16 +4134,8 @@ static inline int calculate_order(unsigned int size)
unsigned int order;
unsigned int min_objects;
unsigned int max_objects;
- unsigned int nr_cpus;
+ unsigned int min_order;
- /*
- * Attempt to find best configuration for a slab. This
- * works by first attempting to generate a layout with
- * the best configuration and backing off gradually.
- *
- * First we increase the acceptable waste in a slab. Then
- * we reduce the minimum objects required in a slab.
- */
min_objects = slub_min_objects;
if (!min_objects) {
/*
@@ -4160,40 +4147,46 @@ static inline int calculate_order(unsigned int size)
* order on systems that appear larger than they are, and too
* low order on systems that appear smaller than they are.
*/
- nr_cpus = num_present_cpus();
+ unsigned int nr_cpus = num_present_cpus();
if (nr_cpus <= 1)
nr_cpus = nr_cpu_ids;
min_objects = 4 * (fls(nr_cpus) + 1);
}
- max_objects = order_objects(slub_max_order, size);
+ /* min_objects can't be 0 because get_order(0) is undefined */
+ max_objects = max(order_objects(slub_max_order, size), 1U);
min_objects = min(min_objects, max_objects);
- while (min_objects > 1) {
- unsigned int fraction;
-
- fraction = 16;
- while (fraction >= 4) {
- order = calc_slab_order(size, min_objects,
- slub_max_order, fraction);
- if (order <= slub_max_order)
- return order;
- fraction /= 2;
- }
- min_objects--;
- }
+ min_order = max_t(unsigned int, slub_min_order,
+ get_order(min_objects * size));
+ if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
+ return get_order(size * MAX_OBJS_PER_PAGE) - 1;
/*
- * We were unable to place multiple objects in a slab. Now
- * lets see if we can place a single object there.
+ * Attempt to find best configuration for a slab. This works by first
+ * attempting to generate a layout with the best possible configuration
+ * and backing off gradually.
+ *
+ * We start with accepting at most 1/16 waste and try to find the
+ * smallest order from min_objects-derived/slub_min_order up to
+ * slub_max_order that will satisfy the constraint. Note that increasing
+ * the order can only result in same or less fractional waste, not more.
+ *
+ * If that fails, we increase the acceptable fraction of waste and try
+ * again. The last iteration with fraction of 1/2 would effectively
+ * accept any waste and give us the order determined by min_objects, as
+ * long as at least single object fits within slub_max_order.
*/
- order = calc_slab_order(size, 1, slub_max_order, 1);
- if (order <= slub_max_order)
- return order;
+ for (unsigned int fraction = 16; fraction > 1; fraction /= 2) {
+ order = calc_slab_order(size, min_order, slub_max_order,
+ fraction);
+ if (order <= slub_max_order)
+ return order;
+ }
/*
* Doh this slab cannot be placed using slub_max_order.
*/
- order = calc_slab_order(size, 1, MAX_ORDER, 1);
+ order = get_order(size);
if (order <= MAX_ORDER)
return order;
return -ENOSYS;
@@ -4711,6 +4704,9 @@ static int __init setup_slub_min_order(char *str)
{
get_option(&str, (int *)&slub_min_order);
+ if (slub_min_order > slub_max_order)
+ slub_max_order = slub_min_order;
+
return 1;
}
@@ -4721,6 +4717,9 @@ static int __init setup_slub_max_order(char *str)
get_option(&str, (int *)&slub_max_order);
slub_max_order = min_t(unsigned int, slub_max_order, MAX_ORDER);
+ if (slub_min_order > slub_max_order)
+ slub_min_order = slub_max_order;
+
return 1;
}
diff --git a/mm/swap.h b/mm/swap.h
index 8a3c7a0ace4f..73c332ee4d91 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -2,6 +2,8 @@
#ifndef _MM_SWAP_H
#define _MM_SWAP_H
+struct mempolicy;
+
#ifdef CONFIG_SWAP
#include <linux/blk_types.h> /* for bio_end_io_t */
@@ -48,11 +50,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
unsigned long addr,
struct swap_iocb **plug);
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_area_struct *vma,
- unsigned long addr,
+ struct mempolicy *mpol, pgoff_t ilx,
bool *new_page_allocated);
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
- struct vm_fault *vmf);
+ struct mempolicy *mpol, pgoff_t ilx);
struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
struct vm_fault *vmf);
@@ -80,7 +81,7 @@ static inline void show_swap_cache_info(void)
}
static inline struct page *swap_cluster_readahead(swp_entry_t entry,
- gfp_t gfp_mask, struct vm_fault *vmf)
+ gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx)
{
return NULL;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b3b14bd0dd64..85d9e5806a6a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -10,6 +10,7 @@
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
+#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
@@ -109,9 +110,9 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
goto unlock;
for (i = 0; i < nr; i++) {
VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio);
- old = xas_load(&xas);
- if (xa_is_value(old)) {
- if (shadowp)
+ if (shadowp) {
+ old = xas_load(&xas);
+ if (xa_is_value(old))
*shadowp = old;
}
xas_store(&xas, folio);
@@ -410,8 +411,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
}
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_area_struct *vma, unsigned long addr,
- bool *new_page_allocated)
+ struct mempolicy *mpol, pgoff_t ilx,
+ bool *new_page_allocated)
{
struct swap_info_struct *si;
struct folio *folio;
@@ -453,7 +454,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
* cause any racers to loop around until we add it to cache.
*/
- folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false);
+ folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0,
+ mpol, ilx, numa_node_id());
if (!folio)
goto fail_put_swap;
@@ -528,14 +530,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma,
unsigned long addr, struct swap_iocb **plug)
{
- bool page_was_allocated;
- struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
- vma, addr, &page_was_allocated);
+ bool page_allocated;
+ struct mempolicy *mpol;
+ pgoff_t ilx;
+ struct page *page;
- if (page_was_allocated)
- swap_readpage(retpage, false, plug);
+ mpol = get_vma_policy(vma, addr, 0, &ilx);
+ page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+ &page_allocated);
+ mpol_cond_put(mpol);
- return retpage;
+ if (page_allocated)
+ swap_readpage(page, false, plug);
+ return page;
}
static unsigned int __swapin_nr_pages(unsigned long prev_offset,
@@ -603,7 +610,8 @@ static unsigned long swapin_nr_pages(unsigned long offset)
* swap_cluster_readahead - swap in pages in hope we need them soon
* @entry: swap entry of this memory
* @gfp_mask: memory allocation flags
- * @vmf: fault information
+ * @mpol: NUMA memory allocation policy to be applied
+ * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
*
* Returns the struct page for entry and addr, after queueing swapin.
*
@@ -612,13 +620,12 @@ static unsigned long swapin_nr_pages(unsigned long offset)
* because it doesn't cost us any seek time. We also make sure to queue
* the 'original' request together with the readahead ones...
*
- * This has been extended to use the NUMA policies from the mm triggering
- * the readahead.
- *
- * Caller must hold read mmap_lock if vmf->vma is not NULL.
+ * Note: it is intentional that the same NUMA policy and interleave index
+ * are used for every page of the readahead: neighbouring pages on swap
+ * are fairly likely to have been swapped out from the same node.
*/
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_fault *vmf)
+ struct mempolicy *mpol, pgoff_t ilx)
{
struct page *page;
unsigned long entry_offset = swp_offset(entry);
@@ -629,8 +636,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct blk_plug plug;
struct swap_iocb *splug = NULL;
bool page_allocated;
- struct vm_area_struct *vma = vmf->vma;
- unsigned long addr = vmf->address;
mask = swapin_nr_pages(offset) - 1;
if (!mask)
@@ -648,8 +653,8 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
for (offset = start_offset; offset <= end_offset ; offset++) {
/* Ok, do the async read-ahead now */
page = __read_swap_cache_async(
- swp_entry(swp_type(entry), offset),
- gfp_mask, vma, addr, &page_allocated);
+ swp_entry(swp_type(entry), offset),
+ gfp_mask, mpol, ilx, &page_allocated);
if (!page)
continue;
if (page_allocated) {
@@ -663,11 +668,14 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
}
blk_finish_plug(&plug);
swap_read_unplug(splug);
-
lru_add_drain(); /* Push any new pages onto the LRU now */
skip:
/* The page was likely read above, so no need for plugging here */
- return read_swap_cache_async(entry, gfp_mask, vma, addr, NULL);
+ page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+ &page_allocated);
+ if (unlikely(page_allocated))
+ swap_readpage(page, false, NULL);
+ return page;
}
int init_swap_address_space(unsigned int type, unsigned long nr_pages)
@@ -765,8 +773,10 @@ static void swap_ra_info(struct vm_fault *vmf,
/**
* swap_vma_readahead - swap in pages in hope we need them soon
- * @fentry: swap entry of this memory
+ * @targ_entry: swap entry of the targeted memory
* @gfp_mask: memory allocation flags
+ * @mpol: NUMA memory allocation policy to be applied
+ * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
* @vmf: fault information
*
* Returns the struct page for entry and addr, after queueing swapin.
@@ -777,16 +787,17 @@ static void swap_ra_info(struct vm_fault *vmf,
* Caller must hold read mmap_lock if vmf->vma is not NULL.
*
*/
-static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
+static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
+ struct mempolicy *mpol, pgoff_t targ_ilx,
struct vm_fault *vmf)
{
struct blk_plug plug;
struct swap_iocb *splug = NULL;
- struct vm_area_struct *vma = vmf->vma;
struct page *page;
pte_t *pte = NULL, pentry;
unsigned long addr;
swp_entry_t entry;
+ pgoff_t ilx;
unsigned int i;
bool page_allocated;
struct vma_swap_readahead ra_info = {
@@ -798,9 +809,10 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
goto skip;
addr = vmf->address - (ra_info.offset * PAGE_SIZE);
+ ilx = targ_ilx - ra_info.offset;
blk_start_plug(&plug);
- for (i = 0; i < ra_info.nr_pte; i++, addr += PAGE_SIZE) {
+ for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) {
if (!pte++) {
pte = pte_offset_map(vmf->pmd, addr);
if (!pte)
@@ -814,8 +826,8 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
continue;
pte_unmap(pte);
pte = NULL;
- page = __read_swap_cache_async(entry, gfp_mask, vma,
- addr, &page_allocated);
+ page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+ &page_allocated);
if (!page)
continue;
if (page_allocated) {
@@ -834,8 +846,11 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
lru_add_drain();
skip:
/* The page was likely read above, so no need for plugging here */
- return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
- NULL);
+ page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
+ &page_allocated);
+ if (unlikely(page_allocated))
+ swap_readpage(page, false, NULL);
+ return page;
}
/**
@@ -853,9 +868,16 @@ skip:
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_fault *vmf)
{
- return swap_use_vma_readahead() ?
- swap_vma_readahead(entry, gfp_mask, vmf) :
- swap_cluster_readahead(entry, gfp_mask, vmf);
+ struct mempolicy *mpol;
+ pgoff_t ilx;
+ struct page *page;
+
+ mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
+ page = swap_use_vma_readahead() ?
+ swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) :
+ swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
+ mpol_cond_put(mpol);
+ return page;
}
#ifdef CONFIG_SYSFS
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e52f486834eb..4bc70f459164 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2530,11 +2530,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
exit_swap_address_space(p->type);
inode = mapping->host;
- if (S_ISBLK(inode->i_mode)) {
- struct block_device *bdev = I_BDEV(inode);
-
- set_blocksize(bdev, old_block_size);
- blkdev_put(bdev, p);
+ if (p->bdev_handle) {
+ set_blocksize(p->bdev, old_block_size);
+ bdev_release(p->bdev_handle);
+ p->bdev_handle = NULL;
}
inode_lock(inode);
@@ -2764,13 +2763,14 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
int error;
if (S_ISBLK(inode->i_mode)) {
- p->bdev = blkdev_get_by_dev(inode->i_rdev,
+ p->bdev_handle = bdev_open_by_dev(inode->i_rdev,
BLK_OPEN_READ | BLK_OPEN_WRITE, p, NULL);
- if (IS_ERR(p->bdev)) {
- error = PTR_ERR(p->bdev);
- p->bdev = NULL;
+ if (IS_ERR(p->bdev_handle)) {
+ error = PTR_ERR(p->bdev_handle);
+ p->bdev_handle = NULL;
return error;
}
+ p->bdev = p->bdev_handle->bdev;
p->old_block_size = block_size(p->bdev);
error = set_blocksize(p->bdev, PAGE_SIZE);
if (error < 0)
@@ -3206,9 +3206,10 @@ bad_swap:
p->percpu_cluster = NULL;
free_percpu(p->cluster_next_cpu);
p->cluster_next_cpu = NULL;
- if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
+ if (p->bdev_handle) {
set_blocksize(p->bdev, p->old_block_size);
- blkdev_put(p->bdev, p);
+ bdev_release(p->bdev_handle);
+ p->bdev_handle = NULL;
}
inode = NULL;
destroy_swap_extents(p);
diff --git a/mm/util.c b/mm/util.c
index 8cbbfd3a3d59..aa01f6ea5a75 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -799,6 +799,7 @@ void folio_copy(struct folio *dst, struct folio *src)
cond_resched();
}
}
+EXPORT_SYMBOL(folio_copy);
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
int sysctl_overcommit_ratio __read_mostly = 50;
@@ -1060,10 +1061,8 @@ void mem_dump_obj(void *object)
{
const char *type;
- if (kmem_valid_obj(object)) {
- kmem_dump_obj(object);
+ if (kmem_dump_obj(object))
return;
- }
if (vmalloc_dump_obj(object))
return;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ef8599d394fd..d12a17fc0c17 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -111,7 +111,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t entry = pfn_pte(pfn, prot);
entry = arch_make_huge_pte(entry, ilog2(size), 0);
- set_huge_pte_at(&init_mm, addr, pte, entry);
+ set_huge_pte_at(&init_mm, addr, pte, entry, size);
pfn += PFN_DOWN(size);
continue;
}
@@ -3809,7 +3809,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
if (flags & VMAP_RAM)
copied = vmap_ram_vread_iter(iter, addr, n, flags);
- else if (!(vm->flags & VM_IOREMAP))
+ else if (!(vm && (vm->flags & VM_IOREMAP)))
copied = aligned_vread_iter(iter, addr, n);
else /* IOREMAP area is treated as memory hole */
copied = zero_iter(iter, n);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6f13394b112e..506f8220c5fe 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -35,7 +35,6 @@
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
-#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -188,246 +187,7 @@ struct scan_control {
*/
int vm_swappiness = 60;
-LIST_HEAD(shrinker_list);
-DECLARE_RWSEM(shrinker_rwsem);
-
#ifdef CONFIG_MEMCG
-static int shrinker_nr_max;
-
-/* The shrinker_info is expanded in a batch of BITS_PER_LONG */
-static inline int shrinker_map_size(int nr_items)
-{
- return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
-}
-
-static inline int shrinker_defer_size(int nr_items)
-{
- return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
-}
-
-static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
- int nid)
-{
- return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
- lockdep_is_held(&shrinker_rwsem));
-}
-
-static int expand_one_shrinker_info(struct mem_cgroup *memcg,
- int map_size, int defer_size,
- int old_map_size, int old_defer_size,
- int new_nr_max)
-{
- struct shrinker_info *new, *old;
- struct mem_cgroup_per_node *pn;
- int nid;
- int size = map_size + defer_size;
-
- for_each_node(nid) {
- pn = memcg->nodeinfo[nid];
- old = shrinker_info_protected(memcg, nid);
- /* Not yet online memcg */
- if (!old)
- return 0;
-
- /* Already expanded this shrinker_info */
- if (new_nr_max <= old->map_nr_max)
- continue;
-
- new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
- if (!new)
- return -ENOMEM;
-
- new->nr_deferred = (atomic_long_t *)(new + 1);
- new->map = (void *)new->nr_deferred + defer_size;
- new->map_nr_max = new_nr_max;
-
- /* map: set all old bits, clear all new bits */
- memset(new->map, (int)0xff, old_map_size);
- memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
- /* nr_deferred: copy old values, clear all new values */
- memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
- memset((void *)new->nr_deferred + old_defer_size, 0,
- defer_size - old_defer_size);
-
- rcu_assign_pointer(pn->shrinker_info, new);
- kvfree_rcu(old, rcu);
- }
-
- return 0;
-}
-
-void free_shrinker_info(struct mem_cgroup *memcg)
-{
- struct mem_cgroup_per_node *pn;
- struct shrinker_info *info;
- int nid;
-
- for_each_node(nid) {
- pn = memcg->nodeinfo[nid];
- info = rcu_dereference_protected(pn->shrinker_info, true);
- kvfree(info);
- rcu_assign_pointer(pn->shrinker_info, NULL);
- }
-}
-
-int alloc_shrinker_info(struct mem_cgroup *memcg)
-{
- struct shrinker_info *info;
- int nid, size, ret = 0;
- int map_size, defer_size = 0;
-
- down_write(&shrinker_rwsem);
- map_size = shrinker_map_size(shrinker_nr_max);
- defer_size = shrinker_defer_size(shrinker_nr_max);
- size = map_size + defer_size;
- for_each_node(nid) {
- info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
- if (!info) {
- free_shrinker_info(memcg);
- ret = -ENOMEM;
- break;
- }
- info->nr_deferred = (atomic_long_t *)(info + 1);
- info->map = (void *)info->nr_deferred + defer_size;
- info->map_nr_max = shrinker_nr_max;
- rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
- }
- up_write(&shrinker_rwsem);
-
- return ret;
-}
-
-static int expand_shrinker_info(int new_id)
-{
- int ret = 0;
- int new_nr_max = round_up(new_id + 1, BITS_PER_LONG);
- int map_size, defer_size = 0;
- int old_map_size, old_defer_size = 0;
- struct mem_cgroup *memcg;
-
- if (!root_mem_cgroup)
- goto out;
-
- lockdep_assert_held(&shrinker_rwsem);
-
- map_size = shrinker_map_size(new_nr_max);
- defer_size = shrinker_defer_size(new_nr_max);
- old_map_size = shrinker_map_size(shrinker_nr_max);
- old_defer_size = shrinker_defer_size(shrinker_nr_max);
-
- memcg = mem_cgroup_iter(NULL, NULL, NULL);
- do {
- ret = expand_one_shrinker_info(memcg, map_size, defer_size,
- old_map_size, old_defer_size,
- new_nr_max);
- if (ret) {
- mem_cgroup_iter_break(NULL, memcg);
- goto out;
- }
- } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
-out:
- if (!ret)
- shrinker_nr_max = new_nr_max;
-
- return ret;
-}
-
-void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
-{
- if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
- struct shrinker_info *info;
-
- rcu_read_lock();
- info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
- if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
- /* Pairs with smp mb in shrink_slab() */
- smp_mb__before_atomic();
- set_bit(shrinker_id, info->map);
- }
- rcu_read_unlock();
- }
-}
-
-static DEFINE_IDR(shrinker_idr);
-
-static int prealloc_memcg_shrinker(struct shrinker *shrinker)
-{
- int id, ret = -ENOMEM;
-
- if (mem_cgroup_disabled())
- return -ENOSYS;
-
- down_write(&shrinker_rwsem);
- /* This may call shrinker, so it must use down_read_trylock() */
- id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
- if (id < 0)
- goto unlock;
-
- if (id >= shrinker_nr_max) {
- if (expand_shrinker_info(id)) {
- idr_remove(&shrinker_idr, id);
- goto unlock;
- }
- }
- shrinker->id = id;
- ret = 0;
-unlock:
- up_write(&shrinker_rwsem);
- return ret;
-}
-
-static void unregister_memcg_shrinker(struct shrinker *shrinker)
-{
- int id = shrinker->id;
-
- BUG_ON(id < 0);
-
- lockdep_assert_held(&shrinker_rwsem);
-
- idr_remove(&shrinker_idr, id);
-}
-
-static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
- struct mem_cgroup *memcg)
-{
- struct shrinker_info *info;
-
- info = shrinker_info_protected(memcg, nid);
- return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
-}
-
-static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
- struct mem_cgroup *memcg)
-{
- struct shrinker_info *info;
-
- info = shrinker_info_protected(memcg, nid);
- return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
-}
-
-void reparent_shrinker_deferred(struct mem_cgroup *memcg)
-{
- int i, nid;
- long nr;
- struct mem_cgroup *parent;
- struct shrinker_info *child_info, *parent_info;
-
- parent = parent_mem_cgroup(memcg);
- if (!parent)
- parent = root_mem_cgroup;
-
- /* Prevent from concurrent shrinker_info expand */
- down_read(&shrinker_rwsem);
- for_each_node(nid) {
- child_info = shrinker_info_protected(memcg, nid);
- parent_info = shrinker_info_protected(parent, nid);
- for (i = 0; i < child_info->map_nr_max; i++) {
- nr = atomic_long_read(&child_info->nr_deferred[i]);
- atomic_long_add(nr, &parent_info->nr_deferred[i]);
- }
- }
- up_read(&shrinker_rwsem);
-}
/* Returns true for reclaim through cgroup limits or cgroup interfaces. */
static bool cgroup_reclaim(struct scan_control *sc)
@@ -468,27 +228,6 @@ static bool writeback_throttling_sane(struct scan_control *sc)
return false;
}
#else
-static int prealloc_memcg_shrinker(struct shrinker *shrinker)
-{
- return -ENOSYS;
-}
-
-static void unregister_memcg_shrinker(struct shrinker *shrinker)
-{
-}
-
-static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
- struct mem_cgroup *memcg)
-{
- return 0;
-}
-
-static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
- struct mem_cgroup *memcg)
-{
- return 0;
-}
-
static bool cgroup_reclaim(struct scan_control *sc)
{
return false;
@@ -557,39 +296,6 @@ static void flush_reclaim_state(struct scan_control *sc)
}
}
-static long xchg_nr_deferred(struct shrinker *shrinker,
- struct shrink_control *sc)
-{
- int nid = sc->nid;
-
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
- nid = 0;
-
- if (sc->memcg &&
- (shrinker->flags & SHRINKER_MEMCG_AWARE))
- return xchg_nr_deferred_memcg(nid, shrinker,
- sc->memcg);
-
- return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
-}
-
-
-static long add_nr_deferred(long nr, struct shrinker *shrinker,
- struct shrink_control *sc)
-{
- int nid = sc->nid;
-
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
- nid = 0;
-
- if (sc->memcg &&
- (shrinker->flags & SHRINKER_MEMCG_AWARE))
- return add_nr_deferred_memcg(nr, nid, shrinker,
- sc->memcg);
-
- return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
-}
-
static bool can_demote(int nid, struct scan_control *sc)
{
if (!numa_demotion_enabled)
@@ -671,413 +377,6 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
return size;
}
-/*
- * Add a shrinker callback to be called from the vm.
- */
-static int __prealloc_shrinker(struct shrinker *shrinker)
-{
- unsigned int size;
- int err;
-
- if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
- err = prealloc_memcg_shrinker(shrinker);
- if (err != -ENOSYS)
- return err;
-
- shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
- }
-
- size = sizeof(*shrinker->nr_deferred);
- if (shrinker->flags & SHRINKER_NUMA_AWARE)
- size *= nr_node_ids;
-
- shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
- if (!shrinker->nr_deferred)
- return -ENOMEM;
-
- return 0;
-}
-
-#ifdef CONFIG_SHRINKER_DEBUG
-int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
-{
- va_list ap;
- int err;
-
- va_start(ap, fmt);
- shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
- va_end(ap);
- if (!shrinker->name)
- return -ENOMEM;
-
- err = __prealloc_shrinker(shrinker);
- if (err) {
- kfree_const(shrinker->name);
- shrinker->name = NULL;
- }
-
- return err;
-}
-#else
-int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
-{
- return __prealloc_shrinker(shrinker);
-}
-#endif
-
-void free_prealloced_shrinker(struct shrinker *shrinker)
-{
-#ifdef CONFIG_SHRINKER_DEBUG
- kfree_const(shrinker->name);
- shrinker->name = NULL;
-#endif
- if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
- down_write(&shrinker_rwsem);
- unregister_memcg_shrinker(shrinker);
- up_write(&shrinker_rwsem);
- return;
- }
-
- kfree(shrinker->nr_deferred);
- shrinker->nr_deferred = NULL;
-}
-
-void register_shrinker_prepared(struct shrinker *shrinker)
-{
- down_write(&shrinker_rwsem);
- list_add_tail(&shrinker->list, &shrinker_list);
- shrinker->flags |= SHRINKER_REGISTERED;
- shrinker_debugfs_add(shrinker);
- up_write(&shrinker_rwsem);
-}
-
-static int __register_shrinker(struct shrinker *shrinker)
-{
- int err = __prealloc_shrinker(shrinker);
-
- if (err)
- return err;
- register_shrinker_prepared(shrinker);
- return 0;
-}
-
-#ifdef CONFIG_SHRINKER_DEBUG
-int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
-{
- va_list ap;
- int err;
-
- va_start(ap, fmt);
- shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
- va_end(ap);
- if (!shrinker->name)
- return -ENOMEM;
-
- err = __register_shrinker(shrinker);
- if (err) {
- kfree_const(shrinker->name);
- shrinker->name = NULL;
- }
- return err;
-}
-#else
-int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
-{
- return __register_shrinker(shrinker);
-}
-#endif
-EXPORT_SYMBOL(register_shrinker);
-
-/*
- * Remove one
- */
-void unregister_shrinker(struct shrinker *shrinker)
-{
- struct dentry *debugfs_entry;
- int debugfs_id;
-
- if (!(shrinker->flags & SHRINKER_REGISTERED))
- return;
-
- down_write(&shrinker_rwsem);
- list_del(&shrinker->list);
- shrinker->flags &= ~SHRINKER_REGISTERED;
- if (shrinker->flags & SHRINKER_MEMCG_AWARE)
- unregister_memcg_shrinker(shrinker);
- debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
- up_write(&shrinker_rwsem);
-
- shrinker_debugfs_remove(debugfs_entry, debugfs_id);
-
- kfree(shrinker->nr_deferred);
- shrinker->nr_deferred = NULL;
-}
-EXPORT_SYMBOL(unregister_shrinker);
-
-/**
- * synchronize_shrinkers - Wait for all running shrinkers to complete.
- *
- * This is equivalent to calling unregister_shrink() and register_shrinker(),
- * but atomically and with less overhead. This is useful to guarantee that all
- * shrinker invocations have seen an update, before freeing memory, similar to
- * rcu.
- */
-void synchronize_shrinkers(void)
-{
- down_write(&shrinker_rwsem);
- up_write(&shrinker_rwsem);
-}
-EXPORT_SYMBOL(synchronize_shrinkers);
-
-#define SHRINK_BATCH 128
-
-static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
- struct shrinker *shrinker, int priority)
-{
- unsigned long freed = 0;
- unsigned long long delta;
- long total_scan;
- long freeable;
- long nr;
- long new_nr;
- long batch_size = shrinker->batch ? shrinker->batch
- : SHRINK_BATCH;
- long scanned = 0, next_deferred;
-
- freeable = shrinker->count_objects(shrinker, shrinkctl);
- if (freeable == 0 || freeable == SHRINK_EMPTY)
- return freeable;
-
- /*
- * copy the current shrinker scan count into a local variable
- * and zero it so that other concurrent shrinker invocations
- * don't also do this scanning work.
- */
- nr = xchg_nr_deferred(shrinker, shrinkctl);
-
- if (shrinker->seeks) {
- delta = freeable >> priority;
- delta *= 4;
- do_div(delta, shrinker->seeks);
- } else {
- /*
- * These objects don't require any IO to create. Trim
- * them aggressively under memory pressure to keep
- * them from causing refetches in the IO caches.
- */
- delta = freeable / 2;
- }
-
- total_scan = nr >> priority;
- total_scan += delta;
- total_scan = min(total_scan, (2 * freeable));
-
- trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
- freeable, delta, total_scan, priority);
-
- /*
- * Normally, we should not scan less than batch_size objects in one
- * pass to avoid too frequent shrinker calls, but if the slab has less
- * than batch_size objects in total and we are really tight on memory,
- * we will try to reclaim all available objects, otherwise we can end
- * up failing allocations although there are plenty of reclaimable
- * objects spread over several slabs with usage less than the
- * batch_size.
- *
- * We detect the "tight on memory" situations by looking at the total
- * number of objects we want to scan (total_scan). If it is greater
- * than the total number of objects on slab (freeable), we must be
- * scanning at high prio and therefore should try to reclaim as much as
- * possible.
- */
- while (total_scan >= batch_size ||
- total_scan >= freeable) {
- unsigned long ret;
- unsigned long nr_to_scan = min(batch_size, total_scan);
-
- shrinkctl->nr_to_scan = nr_to_scan;
- shrinkctl->nr_scanned = nr_to_scan;
- ret = shrinker->scan_objects(shrinker, shrinkctl);
- if (ret == SHRINK_STOP)
- break;
- freed += ret;
-
- count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
- total_scan -= shrinkctl->nr_scanned;
- scanned += shrinkctl->nr_scanned;
-
- cond_resched();
- }
-
- /*
- * The deferred work is increased by any new work (delta) that wasn't
- * done, decreased by old deferred work that was done now.
- *
- * And it is capped to two times of the freeable items.
- */
- next_deferred = max_t(long, (nr + delta - scanned), 0);
- next_deferred = min(next_deferred, (2 * freeable));
-
- /*
- * move the unused scan count back into the shrinker in a
- * manner that handles concurrent updates.
- */
- new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
-
- trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
- return freed;
-}
-
-#ifdef CONFIG_MEMCG
-static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
- struct mem_cgroup *memcg, int priority)
-{
- struct shrinker_info *info;
- unsigned long ret, freed = 0;
- int i;
-
- if (!mem_cgroup_online(memcg))
- return 0;
-
- if (!down_read_trylock(&shrinker_rwsem))
- return 0;
-
- info = shrinker_info_protected(memcg, nid);
- if (unlikely(!info))
- goto unlock;
-
- for_each_set_bit(i, info->map, info->map_nr_max) {
- struct shrink_control sc = {
- .gfp_mask = gfp_mask,
- .nid = nid,
- .memcg = memcg,
- };
- struct shrinker *shrinker;
-
- shrinker = idr_find(&shrinker_idr, i);
- if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
- if (!shrinker)
- clear_bit(i, info->map);
- continue;
- }
-
- /* Call non-slab shrinkers even though kmem is disabled */
- if (!memcg_kmem_online() &&
- !(shrinker->flags & SHRINKER_NONSLAB))
- continue;
-
- ret = do_shrink_slab(&sc, shrinker, priority);
- if (ret == SHRINK_EMPTY) {
- clear_bit(i, info->map);
- /*
- * After the shrinker reported that it had no objects to
- * free, but before we cleared the corresponding bit in
- * the memcg shrinker map, a new object might have been
- * added. To make sure, we have the bit set in this
- * case, we invoke the shrinker one more time and reset
- * the bit if it reports that it is not empty anymore.
- * The memory barrier here pairs with the barrier in
- * set_shrinker_bit():
- *
- * list_lru_add() shrink_slab_memcg()
- * list_add_tail() clear_bit()
- * <MB> <MB>
- * set_bit() do_shrink_slab()
- */
- smp_mb__after_atomic();
- ret = do_shrink_slab(&sc, shrinker, priority);
- if (ret == SHRINK_EMPTY)
- ret = 0;
- else
- set_shrinker_bit(memcg, nid, i);
- }
- freed += ret;
-
- if (rwsem_is_contended(&shrinker_rwsem)) {
- freed = freed ? : 1;
- break;
- }
- }
-unlock:
- up_read(&shrinker_rwsem);
- return freed;
-}
-#else /* CONFIG_MEMCG */
-static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
- struct mem_cgroup *memcg, int priority)
-{
- return 0;
-}
-#endif /* CONFIG_MEMCG */
-
-/**
- * shrink_slab - shrink slab caches
- * @gfp_mask: allocation context
- * @nid: node whose slab caches to target
- * @memcg: memory cgroup whose slab caches to target
- * @priority: the reclaim priority
- *
- * Call the shrink functions to age shrinkable caches.
- *
- * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
- * unaware shrinkers will receive a node id of 0 instead.
- *
- * @memcg specifies the memory cgroup to target. Unaware shrinkers
- * are called only if it is the root cgroup.
- *
- * @priority is sc->priority, we take the number of objects and >> by priority
- * in order to get the scan target.
- *
- * Returns the number of reclaimed slab objects.
- */
-static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
- struct mem_cgroup *memcg,
- int priority)
-{
- unsigned long ret, freed = 0;
- struct shrinker *shrinker;
-
- /*
- * The root memcg might be allocated even though memcg is disabled
- * via "cgroup_disable=memory" boot parameter. This could make
- * mem_cgroup_is_root() return false, then just run memcg slab
- * shrink, but skip global shrink. This may result in premature
- * oom.
- */
- if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
- return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
-
- if (!down_read_trylock(&shrinker_rwsem))
- goto out;
-
- list_for_each_entry(shrinker, &shrinker_list, list) {
- struct shrink_control sc = {
- .gfp_mask = gfp_mask,
- .nid = nid,
- .memcg = memcg,
- };
-
- ret = do_shrink_slab(&sc, shrinker, priority);
- if (ret == SHRINK_EMPTY)
- ret = 0;
- freed += ret;
- /*
- * Bail out if someone want to register a new shrinker to
- * prevent the registration from being stalled for long periods
- * by parallel ongoing shrinking.
- */
- if (rwsem_is_contended(&shrinker_rwsem)) {
- freed = freed ? : 1;
- break;
- }
- }
-
- up_read(&shrinker_rwsem);
-out:
- cond_resched();
- return freed;
-}
-
static unsigned long drop_slab_node(int nid)
{
unsigned long freed = 0;
@@ -1915,6 +1214,7 @@ retry:
folio_list))
goto activate_locked;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1);
count_vm_event(THP_SWPOUT_FALLBACK);
#endif
if (!add_to_swap(folio))
@@ -2271,7 +1571,7 @@ static bool skip_cma(struct folio *folio, struct scan_control *sc)
{
return !current_is_kswapd() &&
gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE &&
- get_pageblock_migratetype(&folio->page) == MIGRATE_CMA;
+ folio_migratetype(folio) == MIGRATE_CMA;
}
#else
static bool skip_cma(struct folio *folio, struct scan_control *sc)
@@ -2389,8 +1689,7 @@ move:
}
*nr_scanned = total_scan;
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
- total_scan, skipped, nr_taken,
- sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru);
+ total_scan, skipped, nr_taken, lru);
update_lru_sizes(lruvec, lru, nr_zone_taken);
return nr_taken;
}
@@ -2909,7 +2208,7 @@ enum scan_balance {
SCAN_FILE,
};
-static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
+static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
{
unsigned long file;
struct lruvec *target_lruvec;
@@ -5005,6 +4304,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
int sorted = 0;
int scanned = 0;
int isolated = 0;
+ int skipped = 0;
int remaining = MAX_LRU_BATCH;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
@@ -5018,7 +4318,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
for (i = MAX_NR_ZONES; i > 0; i--) {
LIST_HEAD(moved);
- int skipped = 0;
+ int skipped_zone = 0;
int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
struct list_head *head = &lrugen->folios[gen][type][zone];
@@ -5040,16 +4340,17 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
isolated += delta;
} else {
list_move(&folio->lru, &moved);
- skipped += delta;
+ skipped_zone += delta;
}
- if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH)
+ if (!--remaining || max(isolated, skipped_zone) >= MIN_LRU_BATCH)
break;
}
- if (skipped) {
+ if (skipped_zone) {
list_splice(&moved, head);
- __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
+ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped_zone);
+ skipped += skipped_zone;
}
if (!remaining || isolated >= MIN_LRU_BATCH)
@@ -5064,6 +4365,9 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
__count_memcg_events(memcg, item, isolated);
__count_memcg_events(memcg, PGREFILL, sorted);
__count_vm_events(PGSCAN_ANON + type, isolated);
+ trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH,
+ scanned, skipped, isolated,
+ type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
/*
* There might not be eligible folios due to reclaim_idx. Check the
@@ -5194,6 +4498,9 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
retry:
reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
sc->nr_reclaimed += reclaimed;
+ trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
+ scanned, reclaimed, &stat, sc->priority,
+ type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
list_for_each_entry_safe_reverse(folio, next, &list, lru) {
if (!folio_evictable(folio)) {
@@ -6535,7 +5842,7 @@ again:
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
- prepare_scan_count(pgdat, sc);
+ prepare_scan_control(pgdat, sc);
shrink_node_memcgs(pgdat, sc);
@@ -7892,8 +7199,9 @@ void __meminit kswapd_run(int nid)
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
+ pr_err("Failed to start kswapd on node %d,ret=%ld\n",
+ nid, PTR_ERR(pgdat->kswapd));
BUG_ON(system_state < SYSTEM_RUNNING);
- pr_err("Failed to start kswapd on node %d\n", nid);
pgdat->kswapd = NULL;
}
}
@@ -8026,6 +7334,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
cond_resched();
psi_memstall_enter(&pflags);
+ delayacct_freepages_start();
fs_reclaim_acquire(sc.gfp_mask);
/*
* We need to be able to allocate from the reserves for RECLAIM_UNMAP
@@ -8048,6 +7357,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);
psi_memstall_leave(&pflags);
+ delayacct_freepages_end();
trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 00e81e99c6ee..359460deb377 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -559,8 +559,10 @@ static inline void mod_zone_state(struct zone *zone,
{
struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
s8 __percpu *p = pcp->vm_stat_diff + item;
- long o, n, t, z;
+ long n, t, z;
+ s8 o;
+ o = this_cpu_read(*p);
do {
z = 0; /* overflow to zone counters */
@@ -576,8 +578,7 @@ static inline void mod_zone_state(struct zone *zone,
*/
t = this_cpu_read(pcp->stat_threshold);
- o = this_cpu_read(*p);
- n = delta + o;
+ n = delta + (long)o;
if (abs(n) > t) {
int os = overstep_mode * (t >> 1) ;
@@ -586,7 +587,7 @@ static inline void mod_zone_state(struct zone *zone,
z = n + os;
n = -os;
}
- } while (this_cpu_cmpxchg(*p, o, n) != o);
+ } while (!this_cpu_try_cmpxchg(*p, &o, n));
if (z)
zone_page_state_add(z, zone, item);
@@ -616,7 +617,8 @@ static inline void mod_node_state(struct pglist_data *pgdat,
{
struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
s8 __percpu *p = pcp->vm_node_stat_diff + item;
- long o, n, t, z;
+ long n, t, z;
+ s8 o;
if (vmstat_item_in_bytes(item)) {
/*
@@ -629,6 +631,7 @@ static inline void mod_node_state(struct pglist_data *pgdat,
delta >>= PAGE_SHIFT;
}
+ o = this_cpu_read(*p);
do {
z = 0; /* overflow to node counters */
@@ -644,8 +647,7 @@ static inline void mod_node_state(struct pglist_data *pgdat,
*/
t = this_cpu_read(pcp->stat_threshold);
- o = this_cpu_read(*p);
- n = delta + o;
+ n = delta + (long)o;
if (abs(n) > t) {
int os = overstep_mode * (t >> 1) ;
@@ -654,7 +656,7 @@ static inline void mod_node_state(struct pglist_data *pgdat,
z = n + os;
n = -os;
}
- } while (this_cpu_cmpxchg(*p, o, n) != o);
+ } while (!this_cpu_try_cmpxchg(*p, &o, n));
if (z)
node_page_state_add(z, pgdat, item);
@@ -814,9 +816,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
for_each_populated_zone(zone) {
struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
-#ifdef CONFIG_NUMA
struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
-#endif
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
int v;
@@ -832,10 +832,12 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
#endif
}
}
-#ifdef CONFIG_NUMA
if (do_pagesets) {
cond_resched();
+
+ changes += decay_pcp_high(zone, this_cpu_ptr(pcp));
+#ifdef CONFIG_NUMA
/*
* Deal with draining the remote pageset of this
* processor
@@ -855,15 +857,17 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
continue;
}
- if (__this_cpu_dec_return(pcp->expire))
+ if (__this_cpu_dec_return(pcp->expire)) {
+ changes++;
continue;
+ }
if (__this_cpu_read(pcp->count)) {
drain_zone_pages(zone, this_cpu_ptr(pcp));
changes++;
}
- }
#endif
+ }
}
for_each_online_pgdat(pgdat) {
diff --git a/mm/workingset.c b/mm/workingset.c
index da58a26d0d4d..b192e44a0e7c 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -763,13 +763,6 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
NULL);
}
-static struct shrinker workingset_shadow_shrinker = {
- .count_objects = count_shadow_nodes,
- .scan_objects = scan_shadow_nodes,
- .seeks = 0, /* ->count reports only fully expendable nodes */
- .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
-};
-
/*
* Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
* i_pages lock.
@@ -778,9 +771,10 @@ static struct lock_class_key shadow_nodes_key;
static int __init workingset_init(void)
{
+ struct shrinker *workingset_shadow_shrinker;
unsigned int timestamp_bits;
unsigned int max_order;
- int ret;
+ int ret = -ENOMEM;
BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
/*
@@ -797,17 +791,26 @@ static int __init workingset_init(void)
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
timestamp_bits, max_order, bucket_order);
- ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow");
- if (ret)
+ workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
+ SHRINKER_MEMCG_AWARE,
+ "mm-shadow");
+ if (!workingset_shadow_shrinker)
goto err;
+
ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
- &workingset_shadow_shrinker);
+ workingset_shadow_shrinker);
if (ret)
goto err_list_lru;
- register_shrinker_prepared(&workingset_shadow_shrinker);
+
+ workingset_shadow_shrinker->count_objects = count_shadow_nodes;
+ workingset_shadow_shrinker->scan_objects = scan_shadow_nodes;
+ /* ->count reports only fully expendable nodes */
+ workingset_shadow_shrinker->seeks = 0;
+
+ shrinker_register(workingset_shadow_shrinker);
return 0;
err_list_lru:
- free_prealloced_shrinker(&workingset_shadow_shrinker);
+ shrinker_free(workingset_shadow_shrinker);
err:
return ret;
}
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b58f957429f0..b1c0dad7f4cf 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -229,7 +229,7 @@ struct zs_pool {
struct zs_pool_stats stats;
/* Compact classes */
- struct shrinker shrinker;
+ struct shrinker *shrinker;
#ifdef CONFIG_ZSMALLOC_STAT
struct dentry *stat_dentry;
@@ -1839,7 +1839,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
* Here, any user cannot access all objects in the zspage so let's move.
*/
d_addr = kmap_atomic(newpage);
- memcpy(d_addr, s_addr, PAGE_SIZE);
+ copy_page(d_addr, s_addr);
kunmap_atomic(d_addr);
for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE;
@@ -2086,8 +2086,7 @@ static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
struct shrink_control *sc)
{
unsigned long pages_freed;
- struct zs_pool *pool = container_of(shrinker, struct zs_pool,
- shrinker);
+ struct zs_pool *pool = shrinker->private_data;
/*
* Compact classes and calculate compaction delta.
@@ -2105,8 +2104,7 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker,
int i;
struct size_class *class;
unsigned long pages_to_free = 0;
- struct zs_pool *pool = container_of(shrinker, struct zs_pool,
- shrinker);
+ struct zs_pool *pool = shrinker->private_data;
for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
class = pool->size_class[i];
@@ -2121,18 +2119,23 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker,
static void zs_unregister_shrinker(struct zs_pool *pool)
{
- unregister_shrinker(&pool->shrinker);
+ shrinker_free(pool->shrinker);
}
static int zs_register_shrinker(struct zs_pool *pool)
{
- pool->shrinker.scan_objects = zs_shrinker_scan;
- pool->shrinker.count_objects = zs_shrinker_count;
- pool->shrinker.batch = 0;
- pool->shrinker.seeks = DEFAULT_SEEKS;
+ pool->shrinker = shrinker_alloc(0, "mm-zspool:%s", pool->name);
+ if (!pool->shrinker)
+ return -ENOMEM;
+
+ pool->shrinker->scan_objects = zs_shrinker_scan;
+ pool->shrinker->count_objects = zs_shrinker_count;
+ pool->shrinker->batch = 0;
+ pool->shrinker->private_data = pool;
- return register_shrinker(&pool->shrinker, "mm-zspool:%s",
- pool->name);
+ shrinker_register(pool->shrinker);
+
+ return 0;
}
static int calculate_zspage_chain_size(int class_size)
diff --git a/mm/zswap.c b/mm/zswap.c
index 412b1409a0d7..74411dfdad92 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -24,6 +24,7 @@
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>
+#include <linux/mempolicy.h>
#include <linux/mempool.h>
#include <linux/zpool.h>
#include <crypto/acompress.h>
@@ -61,6 +62,8 @@ static u64 zswap_pool_limit_hit;
static u64 zswap_written_back_pages;
/* Store failed due to a reclaim failure after pool limit was reached */
static u64 zswap_reject_reclaim_fail;
+/* Store failed due to compression algorithm failure */
+static u64 zswap_reject_compress_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
/* Store failed because underlying allocator could not get memory */
@@ -1057,6 +1060,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
{
swp_entry_t swpentry = entry->swpentry;
struct page *page;
+ struct mempolicy *mpol;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
struct zpool *pool = zswap_find_zpool(entry);
@@ -1075,8 +1079,9 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
}
/* try to allocate swap cache page */
- page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
- &page_was_allocated);
+ mpol = get_task_policy(current);
+ page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
+ NO_INTERLEAVE_INDEX, &page_was_allocated);
if (!page) {
ret = -ENOMEM;
goto fail;
@@ -1219,6 +1224,19 @@ bool zswap_store(struct folio *folio)
return false;
/*
+ * If this is a duplicate, it must be removed before attempting to store
+ * it, otherwise, if the store fails the old page won't be removed from
+ * the tree, and it might be written back overriding the new data.
+ */
+ spin_lock(&tree->lock);
+ dupentry = zswap_rb_search(&tree->rbroot, offset);
+ if (dupentry) {
+ zswap_duplicate_entry++;
+ zswap_invalidate_entry(tree, dupentry);
+ }
+ spin_unlock(&tree->lock);
+
+ /*
* XXX: zswap reclaim does not work with cgroups yet. Without a
* cgroup-aware entry LRU, we will push out entries system-wide based on
* local cgroup limits.
@@ -1296,8 +1314,10 @@ bool zswap_store(struct folio *folio)
ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
dlen = acomp_ctx->req->dlen;
- if (ret)
+ if (ret) {
+ zswap_reject_compress_fail++;
goto put_dstmem;
+ }
/* store */
zpool = zswap_find_zpool(entry);
@@ -1333,7 +1353,14 @@ insert_entry:
/* map */
spin_lock(&tree->lock);
+ /*
+ * A duplicate entry should have been removed at the beginning of this
+ * function. Since the swap entry should be pinned, if a duplicate is
+ * found again here it means that something went wrong in the swap
+ * cache.
+ */
while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
+ WARN_ON(1);
zswap_duplicate_entry++;
zswap_invalidate_entry(tree, dupentry);
}
@@ -1363,8 +1390,8 @@ reject:
shrink:
pool = zswap_pool_last_get();
- if (pool)
- queue_work(shrink_wq, &pool->shrink_work);
+ if (pool && !queue_work(shrink_wq, &pool->shrink_work))
+ zswap_pool_put(pool);
goto reject;
}
@@ -1530,6 +1557,8 @@ static int zswap_debugfs_init(void)
zswap_debugfs_root, &zswap_reject_alloc_fail);
debugfs_create_u64("reject_kmemcache_fail", 0444,
zswap_debugfs_root, &zswap_reject_kmemcache_fail);
+ debugfs_create_u64("reject_compress_fail", 0444,
+ zswap_debugfs_root, &zswap_reject_compress_fail);
debugfs_create_u64("reject_compress_poor", 0444,
zswap_debugfs_root, &zswap_reject_compress_poor);
debugfs_create_u64("written_back_pages", 0444,