Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig                 |  35
-rw-r--r--  mm/damon/Kconfig           |   3
-rw-r--r--  mm/damon/core.c            |  31
-rw-r--r--  mm/damon/modules-common.c  |   2
-rw-r--r--  mm/damon/modules-common.h  |   2
-rw-r--r--  mm/damon/ops-common.c      |   2
-rw-r--r--  mm/damon/ops-common.h      |   2
-rw-r--r--  mm/damon/paddr.c           |   2
-rw-r--r--  mm/damon/sysfs-common.c    |   2
-rw-r--r--  mm/damon/sysfs-common.h    |   2
-rw-r--r--  mm/damon/sysfs-schemes.c   |   1
-rw-r--r--  mm/damon/vaddr.c           |   2
-rw-r--r--  mm/execmem.c               |  40
-rw-r--r--  mm/filemap.c               |  43
-rw-r--r--  mm/gup.c                   |  16
-rw-r--r--  mm/hugetlb.c               | 132
-rw-r--r--  mm/kasan/report.c          |  47
-rw-r--r--  mm/khugepaged.c            |  35
-rw-r--r--  mm/kmemleak.c              |  14
-rw-r--r--  mm/kmsan/kmsan_test.c      |   1
-rw-r--r--  mm/ksm.c                   |   6
-rw-r--r--  mm/madvise.c               |   7
-rw-r--r--  mm/memcontrol.c            | 125
-rw-r--r--  mm/memory-failure.c        |   4
-rw-r--r--  mm/memory.c                |  26
-rw-r--r--  mm/mempolicy.c             |   4
-rw-r--r--  mm/migrate.c               |  14
-rw-r--r--  mm/mm_init.c               |   2
-rw-r--r--  mm/mmu_gather.c            |   1
-rw-r--r--  mm/mremap.c                |   2
-rw-r--r--  mm/page-writeback.c        |  10
-rw-r--r--  mm/rmap.c                  |  46
-rw-r--r--  mm/secretmem.c             |  22
-rw-r--r--  mm/shmem.c                 |  29
-rw-r--r--  mm/slub.c                  |  15
-rw-r--r--  mm/swap.h                  |  23
-rw-r--r--  mm/truncate.c              |   2
-rw-r--r--  mm/userfaultfd.c           |  33
-rw-r--r--  mm/util.c                  |  40
-rw-r--r--  mm/vma.c                   |  50
-rw-r--r--  mm/vma.h                   |  54
-rw-r--r--  mm/vmalloc.c               |  22
-rw-r--r--  mm/vmscan.c                |  20
-rw-r--r--  mm/vmstat.c                |   1
-rw-r--r--  mm/zpdesc.h                |   4
-rw-r--r--  mm/zsmalloc.c              |   3
46 files changed, 654 insertions, 325 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index bd08e151fa1b..781be3240e21 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -993,6 +993,40 @@ config CMA_AREAS
If unsure, leave the default value "8" in UMA and "20" in NUMA.
+#
+# Select this config option from the architecture Kconfig, if available, to set
+# the max page order for physically contiguous allocations.
+#
+config ARCH_FORCE_MAX_ORDER
+ int
+
+#
+# When ARCH_FORCE_MAX_ORDER is not defined,
+# the default page block order is MAX_PAGE_ORDER (10) as per
+# include/linux/mmzone.h.
+#
+config PAGE_BLOCK_ORDER
+ int "Page Block Order"
+ range 1 10 if ARCH_FORCE_MAX_ORDER = 0
+ default 10 if ARCH_FORCE_MAX_ORDER = 0
+ range 1 ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
+ default ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
+ help
+ The page block order refers to the power of two number of pages that
+ are physically contiguous and can have a migrate type associated to
+ them. The maximum size of the page block order is limited by
+ ARCH_FORCE_MAX_ORDER.
+
+ This config allows overriding the default page block order when the
+ page block order is required to be smaller than ARCH_FORCE_MAX_ORDER
+ or MAX_PAGE_ORDER.
+
+ Reducing pageblock order can negatively impact THP generation
+ success rate. If your workloads uses THP heavily, please use this
+ option with caution.
+
+ Don't change if unsure.
+
config MEM_SOFT_DIRTY
bool "Track memory changes"
depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
@@ -1321,6 +1355,7 @@ config NUMA_MEMBLKS
config NUMA_EMU
bool "NUMA emulation"
depends on NUMA_MEMBLKS
+ depends on X86 || GENERIC_ARCH_NUMA
help
Enable NUMA emulation. A flat machine will be split
into virtual nodes when booted with "numa=fake=N", where N is the
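
A side note on the PAGE_BLOCK_ORDER help text above: the pageblock is the granularity at which migrate types are tracked, and its size is simply 2^order base pages. A small illustrative C program (not part of the patch; a 4 KiB base page is assumed) shows what orders in the allowed range correspond to:

#include <stdio.h>

int main(void)
{
        const unsigned long page_size = 4096;  /* assumed base page size */

        /* pageblock_nr_pages = 1 << pageblock_order; size = nr_pages * PAGE_SIZE */
        for (unsigned int order = 5; order <= 10; order++) {
                unsigned long nr_pages = 1UL << order;
                printf("order %2u -> %5lu pages (%4lu KiB)\n",
                       order, nr_pages, nr_pages * page_size / 1024);
        }
        return 0;
}

Choosing a lower order therefore trades THP allocation success for finer-grained migratetype management, as the help text warns.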
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index c213cf8b5638..c93d0c56b963 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -28,6 +28,7 @@ config DAMON_VADDR
bool "Data access monitoring operations for virtual address spaces"
depends on DAMON && MMU
select PAGE_IDLE_FLAG
+ default DAMON
help
This builds the default data access monitoring operations for DAMON
that work for virtual address spaces.
@@ -36,6 +37,7 @@ config DAMON_PADDR
bool "Data access monitoring operations for the physical address space"
depends on DAMON && MMU
select PAGE_IDLE_FLAG
+ default DAMON
help
This builds the default data access monitoring operations for DAMON
that works for the physical address space.
@@ -55,6 +57,7 @@ config DAMON_VADDR_KUNIT_TEST
config DAMON_SYSFS
bool "DAMON sysfs interface"
depends on DAMON && SYSFS
+ default DAMON
help
This builds the sysfs interface for DAMON. The user space can use
the interface for arbitrary data access monitoring.
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 0bb71e2ab713..339116ea30e3 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -754,6 +754,19 @@ static struct damos_quota_goal *damos_nth_quota_goal(
return NULL;
}
+static void damos_commit_quota_goal_union(
+ struct damos_quota_goal *dst, struct damos_quota_goal *src)
+{
+ switch (dst->metric) {
+ case DAMOS_QUOTA_NODE_MEM_USED_BP:
+ case DAMOS_QUOTA_NODE_MEM_FREE_BP:
+ dst->nid = src->nid;
+ break;
+ default:
+ break;
+ }
+}
+
static void damos_commit_quota_goal(
struct damos_quota_goal *dst, struct damos_quota_goal *src)
{
@@ -762,6 +775,7 @@ static void damos_commit_quota_goal(
if (dst->metric == DAMOS_QUOTA_USER_INPUT)
dst->current_value = src->current_value;
/* keep last_psi_total as is, since it will be updated in next cycle */
+ damos_commit_quota_goal_union(dst, src);
}
/**
@@ -795,6 +809,7 @@ int damos_commit_quota_goals(struct damos_quota *dst, struct damos_quota *src)
src_goal->metric, src_goal->target_value);
if (!new_goal)
return -ENOMEM;
+ damos_commit_quota_goal_union(new_goal, src_goal);
damos_add_quota_goal(dst, new_goal);
}
return 0;
@@ -1093,9 +1108,17 @@ static int damon_commit_targets(
if (err)
return err;
} else {
+ struct damos *s;
+
if (damon_target_has_pid(dst))
put_pid(dst_target->pid);
damon_destroy_target(dst_target);
+ damon_for_each_scheme(s, dst) {
+ if (s->quota.charge_target_from == dst_target) {
+ s->quota.charge_target_from = NULL;
+ s->quota.charge_addr_from = 0;
+ }
+ }
}
}
@@ -1441,6 +1464,7 @@ static unsigned long damon_get_intervals_score(struct damon_ctx *c)
}
}
target_access_events = max_access_events * goal_bp / 10000;
+ target_access_events = target_access_events ? : 1;
return access_events * 10000 / target_access_events;
}
@@ -2347,9 +2371,8 @@ static void kdamond_usleep(unsigned long usecs)
*
* If there is a &struct damon_call_control request that registered via
* &damon_call() on @ctx, do or cancel the invocation of the function depending
- * on @cancel. @cancel is set when the kdamond is deactivated by DAMOS
- * watermarks, or the kdamond is already out of the main loop and therefore
- * will be terminated.
+ * on @cancel. @cancel is set when the kdamond is already out of the main loop
+ * and therefore will be terminated.
*/
static void kdamond_call(struct damon_ctx *ctx, bool cancel)
{
@@ -2397,7 +2420,7 @@ static int kdamond_wait_activation(struct damon_ctx *ctx)
if (ctx->callback.after_wmarks_check &&
ctx->callback.after_wmarks_check(ctx))
break;
- kdamond_call(ctx, true);
+ kdamond_call(ctx, false);
damos_walk_cancel(ctx);
}
return -EBUSY;
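
Among the core.c changes above, the intervals-score hunk clamps target_access_events to at least 1 before it is used as a divisor. The clamp uses the GNU C binary conditional `x ? : 1`; a minimal standalone sketch of the same guard (illustrative only, compiles with gcc/clang):

#include <stdio.h>

int main(void)
{
        unsigned long access_events = 42;
        unsigned long target_access_events = 0;  /* e.g. max_events * goal_bp / 10000 rounded down to 0 */

        /* GNU C "a ? : b" evaluates to a when a is non-zero, otherwise b,
         * so a zero target is clamped to 1 before the division. */
        target_access_events = target_access_events ? : 1;

        printf("score = %lu\n", access_events * 10000 / target_access_events);
        return 0;
}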
diff --git a/mm/damon/modules-common.c b/mm/damon/modules-common.c
index 7cf96574cde7..86d58f8c4f63 100644
--- a/mm/damon/modules-common.c
+++ b/mm/damon/modules-common.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Common Primitives for DAMON Modules
+ * Common Code for DAMON Modules
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h
index f49cdb417005..f103ad556368 100644
--- a/mm/damon/modules-common.h
+++ b/mm/damon/modules-common.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * Common Primitives for DAMON Modules
+ * Common Code for DAMON Modules
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index 0db1fc70c84d..b43620fee6bb 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Common Primitives for Data Access Monitoring
+ * Common Code for Data Access Monitoring
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index 18d837d11bce..cc9f5da9c012 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * Common Primitives for Data Access Monitoring
+ * Common Code for Data Access Monitoring
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index e8464f7e0014..4102a8c5f992 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * DAMON Primitives for The Physical Address Space
+ * DAMON Code for The Physical Address Space
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c
index 70edf45c2174..ffaf285e241a 100644
--- a/mm/damon/sysfs-common.c
+++ b/mm/damon/sysfs-common.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Common Primitives for DAMON Sysfs Interface
+ * Common Code for DAMON Sysfs Interface
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h
index 70d84bdc9f5f..2099adee11d0 100644
--- a/mm/damon/sysfs-common.h
+++ b/mm/damon/sysfs-common.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * Common Primitives for DAMON Sysfs Interface
+ * Common Code for DAMON Sysfs Interface
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 0f6c9e1fec0b..30ae7518ffbf 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -472,6 +472,7 @@ static ssize_t memcg_path_store(struct kobject *kobj,
return -ENOMEM;
strscpy(path, buf, count + 1);
+ kfree(filter->memcg_path);
filter->memcg_path = path;
return count;
}
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index e6d99106a7f9..46554e49a478 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * DAMON Primitives for Virtual Address Spaces
+ * DAMON Code for Virtual Address Spaces
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/execmem.c b/mm/execmem.c
index 9720ac2dfa41..2b683e7d864d 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -254,34 +254,6 @@ out_unlock:
return ptr;
}
-static bool execmem_cache_rox = false;
-
-void execmem_cache_make_ro(void)
-{
- struct maple_tree *free_areas = &execmem_cache.free_areas;
- struct maple_tree *busy_areas = &execmem_cache.busy_areas;
- MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
- MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
- struct mutex *mutex = &execmem_cache.mutex;
- void *area;
-
- execmem_cache_rox = true;
-
- mutex_lock(mutex);
-
- mas_for_each(&mas_free, area, ULONG_MAX) {
- unsigned long pages = mas_range_len(&mas_free) >> PAGE_SHIFT;
- set_memory_ro(mas_free.index, pages);
- }
-
- mas_for_each(&mas_busy, area, ULONG_MAX) {
- unsigned long pages = mas_range_len(&mas_busy) >> PAGE_SHIFT;
- set_memory_ro(mas_busy.index, pages);
- }
-
- mutex_unlock(mutex);
-}
-
static int execmem_cache_populate(struct execmem_range *range, size_t size)
{
unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
@@ -302,15 +274,9 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
/* fill memory with instructions that will trap */
execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
- if (execmem_cache_rox) {
- err = set_memory_rox((unsigned long)p, vm->nr_pages);
- if (err)
- goto err_free_mem;
- } else {
- err = set_memory_x((unsigned long)p, vm->nr_pages);
- if (err)
- goto err_free_mem;
- }
+ err = set_memory_rox((unsigned long)p, vm->nr_pages);
+ if (err)
+ goto err_free_mem;
err = execmem_cache_add(p, alloc_size);
if (err)
diff --git a/mm/filemap.c b/mm/filemap.c
index 09d005848f0d..bada249b9fb7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -142,7 +142,7 @@ static void page_cache_delete(struct address_space *mapping,
xas_init_marks(&xas);
folio->mapping = NULL;
- /* Leave page->index set: truncation lookup relies upon it */
+ /* Leave folio->index set: truncation lookup relies upon it */
mapping->nrpages -= nr;
}
@@ -949,7 +949,7 @@ unlock:
return 0;
error:
folio->mapping = NULL;
- /* Leave page->index set: truncation relies upon it */
+ /* Leave folio->index set: truncation relies upon it */
folio_put_refs(folio, nr);
return xas_error(&xas);
}
@@ -1589,13 +1589,30 @@ int folio_wait_private_2_killable(struct folio *folio)
}
EXPORT_SYMBOL(folio_wait_private_2_killable);
+static void filemap_end_dropbehind(struct folio *folio)
+{
+ struct address_space *mapping = folio->mapping;
+
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return;
+ if (!folio_test_clear_dropbehind(folio))
+ return;
+ if (mapping)
+ folio_unmap_invalidate(mapping, folio, 0);
+}
+
/*
* If folio was marked as dropbehind, then pages should be dropped when writeback
* completes. Do that now. If we fail, it's likely because of a big folio -
* just reset dropbehind for that case and latter completions should invalidate.
*/
-static void folio_end_dropbehind_write(struct folio *folio)
+static void filemap_end_dropbehind_write(struct folio *folio)
{
+ if (!folio_test_dropbehind(folio))
+ return;
+
/*
* Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
* but can happen if normal writeback just happens to find dirty folios
@@ -1604,8 +1621,7 @@ static void folio_end_dropbehind_write(struct folio *folio)
* invalidation in that case.
*/
if (in_task() && folio_trylock(folio)) {
- if (folio->mapping)
- folio_unmap_invalidate(folio->mapping, folio, 0);
+ filemap_end_dropbehind(folio);
folio_unlock(folio);
}
}
@@ -1620,8 +1636,6 @@ static void folio_end_dropbehind_write(struct folio *folio)
*/
void folio_end_writeback(struct folio *folio)
{
- bool folio_dropbehind = false;
-
VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);
/*
@@ -1643,14 +1657,11 @@ void folio_end_writeback(struct folio *folio)
* reused before the folio_wake_bit().
*/
folio_get(folio);
- if (!folio_test_dirty(folio))
- folio_dropbehind = folio_test_clear_dropbehind(folio);
if (__folio_end_writeback(folio))
folio_wake_bit(folio, PG_writeback);
- acct_reclaim_writeback(folio);
- if (folio_dropbehind)
- folio_end_dropbehind_write(folio);
+ filemap_end_dropbehind_write(folio);
+ acct_reclaim_writeback(folio);
folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);
@@ -2635,16 +2646,14 @@ static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
return (pos1 >> shift == pos2 >> shift);
}
-static void filemap_end_dropbehind_read(struct address_space *mapping,
- struct folio *folio)
+static void filemap_end_dropbehind_read(struct folio *folio)
{
if (!folio_test_dropbehind(folio))
return;
if (folio_test_writeback(folio) || folio_test_dirty(folio))
return;
if (folio_trylock(folio)) {
- if (folio_test_clear_dropbehind(folio))
- folio_unmap_invalidate(mapping, folio, 0);
+ filemap_end_dropbehind(folio);
folio_unlock(folio);
}
}
@@ -2765,7 +2774,7 @@ put_folios:
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
- filemap_end_dropbehind_read(mapping, folio);
+ filemap_end_dropbehind_read(folio);
folio_put(folio);
}
folio_batch_init(&fbatch);
diff --git a/mm/gup.c b/mm/gup.c
index 329c5f7acc7a..3c39cbbeebef 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2303,13 +2303,13 @@ static void pofs_unpin(struct pages_or_folios *pofs)
/*
* Returns the number of collected folios. Return value is always >= 0.
*/
-static void collect_longterm_unpinnable_folios(
+static unsigned long collect_longterm_unpinnable_folios(
struct list_head *movable_folio_list,
struct pages_or_folios *pofs)
{
+ unsigned long i, collected = 0;
struct folio *prev_folio = NULL;
bool drain_allow = true;
- unsigned long i;
for (i = 0; i < pofs->nr_entries; i++) {
struct folio *folio = pofs_get_folio(pofs, i);
@@ -2321,6 +2321,8 @@ static void collect_longterm_unpinnable_folios(
if (folio_is_longterm_pinnable(folio))
continue;
+ collected++;
+
if (folio_is_device_coherent(folio))
continue;
@@ -2342,6 +2344,8 @@ static void collect_longterm_unpinnable_folios(
NR_ISOLATED_ANON + folio_is_file_lru(folio),
folio_nr_pages(folio));
}
+
+ return collected;
}
/*
@@ -2418,9 +2422,11 @@ static long
check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
{
LIST_HEAD(movable_folio_list);
+ unsigned long collected;
- collect_longterm_unpinnable_folios(&movable_folio_list, pofs);
- if (list_empty(&movable_folio_list))
+ collected = collect_longterm_unpinnable_folios(&movable_folio_list,
+ pofs);
+ if (!collected)
return 0;
return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
@@ -3299,7 +3305,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end,
* include/asm-generic/tlb.h for more details.
*
* We do not adopt an rcu_read_lock() here as we also want to block IPIs
- * that come from THPs splitting.
+ * that come from callers of tlb_remove_table_sync_one().
*/
local_irq_save(flags);
gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned);
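
The gup.c hunks make collect_longterm_unpinnable_folios() return how many folios were collected rather than having the caller test list_empty(): device-coherent folios are counted but never placed on movable_folio_list, so an empty list does not imply there was nothing to migrate. A minimal userspace model of that distinction (names invented for illustration):

#include <stdbool.h>
#include <stdio.h>

struct item {
        bool handled_in_place;  /* analogue of a device-coherent folio */
};

/* Count every item that needs work, even those that never reach the list. */
static unsigned long collect(const struct item *items, int n, int *listed)
{
        unsigned long collected = 0;

        for (int i = 0; i < n; i++) {
                collected++;
                if (items[i].handled_in_place)
                        continue;       /* handled elsewhere, not added to the list */
                (*listed)++;
        }
        return collected;
}

int main(void)
{
        const struct item items[2] = {
                { .handled_in_place = true },
                { .handled_in_place = true },
        };
        int listed = 0;
        unsigned long collected = collect(items, 2, &listed);

        /* An "is the list empty?" check would wrongly report nothing to do here. */
        printf("collected=%lu listed=%d\n", collected, listed);
        return 0;
}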
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 32ab14aa4074..a0d285d20992 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -121,7 +121,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
- unsigned long start, unsigned long end);
+ unsigned long start, unsigned long end, bool take_locks);
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
static void hugetlb_free_folio(struct folio *folio)
@@ -2340,12 +2340,15 @@ struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid,
struct folio *folio;
spin_lock_irq(&hugetlb_lock);
+ if (!h->resv_huge_pages) {
+ spin_unlock_irq(&hugetlb_lock);
+ return NULL;
+ }
+
folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid,
nmask);
- if (folio) {
- VM_BUG_ON(!h->resv_huge_pages);
+ if (folio)
h->resv_huge_pages--;
- }
spin_unlock_irq(&hugetlb_lock);
return folio;
@@ -2787,20 +2790,24 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
/*
* alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve
* the old one
- * @h: struct hstate old page belongs to
* @old_folio: Old folio to dissolve
* @list: List to isolate the page in case we need to
* Returns 0 on success, otherwise negated error.
*/
-static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
- struct folio *old_folio, struct list_head *list)
+static int alloc_and_dissolve_hugetlb_folio(struct folio *old_folio,
+ struct list_head *list)
{
- gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+ gfp_t gfp_mask;
+ struct hstate *h;
int nid = folio_nid(old_folio);
struct folio *new_folio = NULL;
int ret = 0;
retry:
+ /*
+ * The old_folio might have been dissolved from under our feet, so make sure
+ * to carefully check the state under the lock.
+ */
spin_lock_irq(&hugetlb_lock);
if (!folio_test_hugetlb(old_folio)) {
/*
@@ -2829,8 +2836,10 @@ retry:
cond_resched();
goto retry;
} else {
+ h = folio_hstate(old_folio);
if (!new_folio) {
spin_unlock_irq(&hugetlb_lock);
+ gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid,
NULL, NULL);
if (!new_folio)
@@ -2874,35 +2883,24 @@ free_new:
int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list)
{
- struct hstate *h;
int ret = -EBUSY;
- /*
- * The page might have been dissolved from under our feet, so make sure
- * to carefully check the state under the lock.
- * Return success when racing as if we dissolved the page ourselves.
- */
- spin_lock_irq(&hugetlb_lock);
- if (folio_test_hugetlb(folio)) {
- h = folio_hstate(folio);
- } else {
- spin_unlock_irq(&hugetlb_lock);
+ /* Not to disrupt normal path by vainly holding hugetlb_lock */
+ if (!folio_test_hugetlb(folio))
return 0;
- }
- spin_unlock_irq(&hugetlb_lock);
/*
* Fence off gigantic pages as there is a cyclic dependency between
* alloc_contig_range and them. Return -ENOMEM as this has the effect
* of bailing out right away without further retrying.
*/
- if (hstate_is_gigantic(h))
+ if (folio_order(folio) > MAX_PAGE_ORDER)
return -ENOMEM;
if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list))
ret = 0;
else if (!folio_ref_count(folio))
- ret = alloc_and_dissolve_hugetlb_folio(h, folio, list);
+ ret = alloc_and_dissolve_hugetlb_folio(folio, list);
return ret;
}
@@ -2916,7 +2914,6 @@ int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list)
*/
int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
{
- struct hstate *h;
struct folio *folio;
int ret = 0;
@@ -2925,23 +2922,9 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
while (start_pfn < end_pfn) {
folio = pfn_folio(start_pfn);
- /*
- * The folio might have been dissolved from under our feet, so make sure
- * to carefully check the state under the lock.
- */
- spin_lock_irq(&hugetlb_lock);
- if (folio_test_hugetlb(folio)) {
- h = folio_hstate(folio);
- } else {
- spin_unlock_irq(&hugetlb_lock);
- start_pfn++;
- continue;
- }
- spin_unlock_irq(&hugetlb_lock);
-
- if (!folio_ref_count(folio)) {
- ret = alloc_and_dissolve_hugetlb_folio(h, folio,
- &isolate_list);
+ /* Not to disrupt normal path by vainly holding hugetlb_lock */
+ if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) {
+ ret = alloc_and_dissolve_hugetlb_folio(folio, &isolate_list);
if (ret)
break;
@@ -3741,7 +3724,7 @@ static void __init report_hugepages(void)
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
- buf, h->free_huge_pages);
+ buf, h->nr_huge_pages);
if (nrinvalid)
pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n",
buf, nrinvalid, nrinvalid > 1 ? "s" : "");
@@ -5426,26 +5409,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
{
if (addr & ~(huge_page_mask(hstate_vma(vma))))
return -EINVAL;
+ return 0;
+}
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
/*
* PMD sharing is only possible for PUD_SIZE-aligned address ranges
* in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
* split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+ * This function is called in the middle of a VMA split operation, with
+ * MM, VMA and rmap all write-locked to prevent concurrent page table
+ * walks (except hardware and gup_fast()).
*/
+ vma_assert_write_locked(vma);
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
if (addr & ~PUD_MASK) {
- /*
- * hugetlb_vm_op_split is called right before we attempt to
- * split the VMA. We will need to unshare PMDs in the old and
- * new VMAs, so let's unshare before we split.
- */
unsigned long floor = addr & PUD_MASK;
unsigned long ceil = floor + PUD_SIZE;
- if (floor >= vma->vm_start && ceil <= vma->vm_end)
- hugetlb_unshare_pmds(vma, floor, ceil);
+ if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+ /*
+ * Locking:
+ * Use take_locks=false here.
+ * The file rmap lock is already held.
+ * The hugetlb VMA lock can't be taken when we already
+ * hold the file rmap lock, and we don't need it because
+ * its purpose is to synchronize against concurrent page
+ * table walks, which are not possible thanks to the
+ * locks held by our caller.
+ */
+ hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+ }
}
-
- return 0;
}
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -7615,6 +7612,13 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
pud_clear(pud);
+ /*
+ * Once our caller drops the rmap lock, some other process might be
+ * using this page table as a normal, non-hugetlb page table.
+ * Wait for pending gup_fast() in other threads to finish before letting
+ * that happen.
+ */
+ tlb_remove_table_sync_one();
ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
mm_dec_nr_pmds(mm);
return 1;
@@ -7885,9 +7889,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
spin_unlock_irq(&hugetlb_lock);
}
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool take_locks)
{
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
@@ -7911,8 +7922,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
- hugetlb_vma_lock_write(vma);
- i_mmap_lock_write(vma->vm_file->f_mapping);
+ if (take_locks) {
+ hugetlb_vma_lock_write(vma);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ } else {
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+ }
for (address = start; address < end; address += PUD_SIZE) {
ptep = hugetlb_walk(vma, address, sz);
if (!ptep)
@@ -7922,8 +7937,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
spin_unlock(ptl);
}
flush_hugetlb_tlb_range(vma, start, end);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
- hugetlb_vma_unlock_write(vma);
+ if (take_locks) {
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
+ }
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
* Documentation/mm/mmu_notifier.rst.
@@ -7938,7 +7955,8 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
- ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+ ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+ /* take_locks = */ true);
}
/*
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 8357e1a33699..62c01b4527eb 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -370,36 +370,6 @@ static inline bool init_task_stack_addr(const void *addr)
sizeof(init_thread_union.stack));
}
-/*
- * This function is invoked with report_lock (a raw_spinlock) held. A
- * PREEMPT_RT kernel cannot call find_vm_area() as it will acquire a sleeping
- * rt_spinlock.
- *
- * For !RT kernel, the PROVE_RAW_LOCK_NESTING config option will print a
- * lockdep warning for this raw_spinlock -> spinlock dependency. This config
- * option is enabled by default to ensure better test coverage to expose this
- * kind of RT kernel problem. This lockdep splat, however, can be suppressed
- * by using DEFINE_WAIT_OVERRIDE_MAP() if it serves a useful purpose and the
- * invalid PREEMPT_RT case has been taken care of.
- */
-static inline struct vm_struct *kasan_find_vm_area(void *addr)
-{
- static DEFINE_WAIT_OVERRIDE_MAP(vmalloc_map, LD_WAIT_SLEEP);
- struct vm_struct *va;
-
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- return NULL;
-
- /*
- * Suppress lockdep warning and fetch vmalloc area of the
- * offending address.
- */
- lock_map_acquire_try(&vmalloc_map);
- va = find_vm_area(addr);
- lock_map_release(&vmalloc_map);
- return va;
-}
-
static void print_address_description(void *addr, u8 tag,
struct kasan_report_info *info)
{
@@ -429,19 +399,10 @@ static void print_address_description(void *addr, u8 tag,
}
if (is_vmalloc_addr(addr)) {
- struct vm_struct *va = kasan_find_vm_area(addr);
-
- if (va) {
- pr_err("The buggy address belongs to the virtual mapping at\n"
- " [%px, %px) created by:\n"
- " %pS\n",
- va->addr, va->addr + va->size, va->caller);
- pr_err("\n");
-
- page = vmalloc_to_page(addr);
- } else {
- pr_err("The buggy address %px belongs to a vmalloc virtual mapping\n", addr);
- }
+ pr_err("The buggy address belongs to a");
+ if (!vmalloc_dump_obj(addr))
+ pr_cont(" vmalloc virtual mapping\n");
+ page = vmalloc_to_page(addr);
}
if (page) {
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index cdf5a581368b..15203ea7d007 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -548,19 +548,6 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
}
}
-static bool is_refcount_suitable(struct folio *folio)
-{
- int expected_refcount = folio_mapcount(folio);
-
- if (!folio_test_anon(folio) || folio_test_swapcache(folio))
- expected_refcount += folio_nr_pages(folio);
-
- if (folio_test_private(folio))
- expected_refcount++;
-
- return folio_ref_count(folio) == expected_refcount;
-}
-
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte,
@@ -652,7 +639,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* but not from this process. The other process cannot write to
* the page, only trigger CoW.
*/
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
folio_unlock(folio);
result = SCAN_PAGE_COUNT;
goto out;
@@ -1402,7 +1389,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
* has excessive GUP pins (i.e. 512). Anyway the same check
* will be done again later the risk seems low.
*/
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
@@ -2293,6 +2280,17 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
continue;
}
+ if (!folio_try_get(folio)) {
+ xas_reset(&xas);
+ continue;
+ }
+
+ if (unlikely(folio != xas_reload(&xas))) {
+ folio_put(folio);
+ xas_reset(&xas);
+ continue;
+ }
+
if (folio_order(folio) == HPAGE_PMD_ORDER &&
folio->index == start) {
/* Maybe PMD-mapped */
@@ -2303,23 +2301,27 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
* it's safe to skip LRU and refcount checks before
* returning.
*/
+ folio_put(folio);
break;
}
node = folio_nid(folio);
if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
+ folio_put(folio);
break;
}
cc->node_load[node]++;
if (!folio_test_lru(folio)) {
result = SCAN_PAGE_LRU;
+ folio_put(folio);
break;
}
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
+ folio_put(folio);
break;
}
@@ -2331,6 +2333,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
*/
present += folio_nr_pages(folio);
+ folio_put(folio);
if (need_resched()) {
xas_pause(&xas);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index da9cee34ee1b..8d588e685311 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1247,6 +1247,20 @@ void __ref kmemleak_transient_leak(const void *ptr)
EXPORT_SYMBOL(kmemleak_transient_leak);
/**
+ * kmemleak_ignore_percpu - similar to kmemleak_ignore but taking a percpu
+ * address argument
+ * @ptr: percpu address of the object
+ */
+void __ref kmemleak_ignore_percpu(const void __percpu *ptr)
+{
+ pr_debug("%s(0x%px)\n", __func__, ptr);
+
+ if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr))
+ make_black_object((unsigned long)ptr, OBJECT_PERCPU);
+}
+EXPORT_SYMBOL_GPL(kmemleak_ignore_percpu);
+
+/**
* kmemleak_ignore - ignore an allocated object
* @ptr: pointer to beginning of the object
*
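
For the kmemleak_ignore_percpu() helper added above, a hypothetical kernel-side usage sketch (the percpu object and init function are invented for illustration, and the declaration is assumed to live in <linux/kmemleak.h>; the call pattern mirrors the non-percpu kmemleak_ignore()):

#include <linux/percpu.h>
#include <linux/kmemleak.h>

static int __percpu *example_counters;  /* hypothetical per-CPU object */

static int __init example_init(void)
{
        example_counters = alloc_percpu(int);
        if (!example_counters)
                return -ENOMEM;
        /* The object is only reachable via structures kmemleak cannot scan,
         * so mark it black instead of letting it show up as a false positive. */
        kmemleak_ignore_percpu(example_counters);
        return 0;
}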
diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c
index 9733a22c46c1..c6c5b2bbede0 100644
--- a/mm/kmsan/kmsan_test.c
+++ b/mm/kmsan/kmsan_test.c
@@ -732,3 +732,4 @@ kunit_test_suites(&kmsan_test_suite);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Alexander Potapenko <glider@google.com>");
+MODULE_DESCRIPTION("Test cases for KMSAN");
diff --git a/mm/ksm.c b/mm/ksm.c
index 8583fb91ef13..a9d3e719e089 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -3669,10 +3669,10 @@ static ssize_t advisor_mode_show(struct kobject *kobj,
{
const char *output;
- if (ksm_advisor == KSM_ADVISOR_NONE)
- output = "[none] scan-time";
- else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
+ if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
output = "none [scan-time]";
+ else
+ output = "[none] scan-time";
return sysfs_emit(buf, "%s\n", output);
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 8433ac9b27e0..1d44a35ae85c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -508,6 +508,7 @@ restart:
pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;
@@ -741,6 +742,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
start_pte = pte;
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;
@@ -1881,7 +1883,9 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
/* Drop and reacquire lock to unwind race. */
madvise_finish_tlb(&madv_behavior);
madvise_unlock(mm, behavior);
- madvise_lock(mm, behavior);
+ ret = madvise_lock(mm, behavior);
+ if (ret)
+ goto out;
madvise_init_tlb(&madv_behavior, mm);
continue;
}
@@ -1892,6 +1896,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
madvise_finish_tlb(&madv_behavior);
madvise_unlock(mm, behavior);
+out:
ret = (total_len - iov_iter_count(iter)) ? : ret;
return ret;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b90aa3075950..70fdeda1120b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -531,7 +531,7 @@ struct memcg_vmstats {
unsigned long events_pending[NR_MEMCG_EVENTS];
/* Stats updates since the last flush */
- atomic64_t stats_updates;
+ atomic_t stats_updates;
};
/*
@@ -557,7 +557,7 @@ static u64 flush_last_time;
static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
{
- return atomic64_read(&vmstats->stats_updates) >
+ return atomic_read(&vmstats->stats_updates) >
MEMCG_CHARGE_BATCH * num_online_cpus();
}
@@ -571,7 +571,9 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val,
if (!val)
return;
- css_rstat_updated(&memcg->css, cpu);
+ /* TODO: add to cgroup update tree once it is nmi-safe. */
+ if (!in_nmi())
+ css_rstat_updated(&memcg->css, cpu);
statc_pcpu = memcg->vmstats_percpu;
for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) {
statc = this_cpu_ptr(statc_pcpu);
@@ -589,7 +591,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val,
continue;
stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0);
- atomic64_add(stats_updates, &statc->vmstats->stats_updates);
+ atomic_add(stats_updates, &statc->vmstats->stats_updates);
}
}
@@ -597,7 +599,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
{
bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats);
- trace_memcg_flush_stats(memcg, atomic64_read(&memcg->vmstats->stats_updates),
+ trace_memcg_flush_stats(memcg, atomic_read(&memcg->vmstats->stats_updates),
force, needs_flush);
if (!force && !needs_flush)
@@ -2513,17 +2515,47 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
folio->memcg_data = (unsigned long)memcg;
}
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static inline void account_slab_nmi_safe(struct mem_cgroup *memcg,
+ struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
+{
+ struct lruvec *lruvec;
+
+ if (likely(!in_nmi())) {
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
+ } else {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id];
+
+ /* TODO: add to cgroup update tree once it is nmi-safe. */
+ if (idx == NR_SLAB_RECLAIMABLE_B)
+ atomic_add(nr, &pn->slab_reclaimable);
+ else
+ atomic_add(nr, &pn->slab_unreclaimable);
+ }
+}
+#else
+static inline void account_slab_nmi_safe(struct mem_cgroup *memcg,
+ struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
+{
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
+}
+#endif
+
static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
struct mem_cgroup *memcg;
- struct lruvec *lruvec;
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
- mod_memcg_lruvec_state(lruvec, idx, nr);
+ account_slab_nmi_safe(memcg, pgdat, idx, nr);
rcu_read_unlock();
}
@@ -2648,6 +2680,9 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
+ if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi())
+ return NULL;
+
if (in_task()) {
memcg = current->active_memcg;
if (unlikely(memcg))
@@ -2710,6 +2745,23 @@ struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
return objcg;
}
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
+{
+ if (likely(!in_nmi())) {
+ mod_memcg_state(memcg, MEMCG_KMEM, val);
+ } else {
+ /* TODO: add to cgroup update tree once it is nmi-safe. */
+ atomic_add(val, &memcg->kmem_stat);
+ }
+}
+#else
+static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
+{
+ mod_memcg_state(memcg, MEMCG_KMEM, val);
+}
+#endif
+
/*
* obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
* @objcg: object cgroup to uncharge
@@ -2722,7 +2774,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
memcg = get_mem_cgroup_from_objcg(objcg);
- mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
+ account_kmem_nmi_safe(memcg, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
if (!mem_cgroup_is_root(memcg))
refill_stock(memcg, nr_pages);
@@ -2750,7 +2802,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
if (ret)
goto out;
- mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
+ account_kmem_nmi_safe(memcg, nr_pages);
memcg1_account_kmem(memcg, nr_pages);
out:
css_put(&memcg->css);
@@ -3961,6 +4013,53 @@ static void mem_cgroup_stat_aggregate(struct aggregate_control *ac)
}
}
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
+ int cpu)
+{
+ int nid;
+
+ if (atomic_read(&memcg->kmem_stat)) {
+ int kmem = atomic_xchg(&memcg->kmem_stat, 0);
+ int index = memcg_stats_index(MEMCG_KMEM);
+
+ memcg->vmstats->state[index] += kmem;
+ if (parent)
+ parent->vmstats->state_pending[index] += kmem;
+ }
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ struct lruvec_stats *lstats = pn->lruvec_stats;
+ struct lruvec_stats *plstats = NULL;
+
+ if (parent)
+ plstats = parent->nodeinfo[nid]->lruvec_stats;
+
+ if (atomic_read(&pn->slab_reclaimable)) {
+ int slab = atomic_xchg(&pn->slab_reclaimable, 0);
+ int index = memcg_stats_index(NR_SLAB_RECLAIMABLE_B);
+
+ lstats->state[index] += slab;
+ if (plstats)
+ plstats->state_pending[index] += slab;
+ }
+ if (atomic_read(&pn->slab_unreclaimable)) {
+ int slab = atomic_xchg(&pn->slab_unreclaimable, 0);
+ int index = memcg_stats_index(NR_SLAB_UNRECLAIMABLE_B);
+
+ lstats->state[index] += slab;
+ if (plstats)
+ plstats->state_pending[index] += slab;
+ }
+ }
+}
+#else
+static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
+ int cpu)
+{}
+#endif
+
static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -3969,6 +4068,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
struct aggregate_control ac;
int nid;
+ flush_nmi_stats(memcg, parent, cpu);
+
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
ac = (struct aggregate_control) {
@@ -4018,8 +4119,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
}
WRITE_ONCE(statc->stats_updates, 0);
/* We are in a per-cpu loop here, only do the atomic write once */
- if (atomic64_read(&memcg->vmstats->stats_updates))
- atomic64_set(&memcg->vmstats->stats_updates, 0);
+ if (atomic_read(&memcg->vmstats->stats_updates))
+ atomic_set(&memcg->vmstats->stats_updates, 0);
}
static void mem_cgroup_fork(struct task_struct *task)
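
The memcontrol.c changes above all follow one pattern: contexts where the normal accounting path is unsafe (in_nmi()) add the delta to a plain atomic counter (kmem_stat, slab_reclaimable, slab_unreclaimable), and the rstat flush later folds that counter back into the regular statistics. A minimal userspace model of this deferred-accounting pattern using C11 atomics (illustrative only):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static long stats_kmem;                 /* regular, flush-time aggregated stat */
static atomic_long deferred_kmem;       /* fallback accumulator for unsafe contexts */

static void account_kmem(long val, bool nmi_context)
{
        if (!nmi_context)
                stats_kmem += val;                      /* normal accounting path */
        else
                atomic_fetch_add(&deferred_kmem, val);  /* lock-free deferral */
}

static void flush_stats(void)
{
        /* Fold whatever the deferred path accumulated into the real stat. */
        stats_kmem += atomic_exchange(&deferred_kmem, 0);
}

int main(void)
{
        account_kmem(4, false);
        account_kmem(2, true);  /* pretend this ran from NMI context */
        flush_stats();
        printf("kmem = %ld\n", stats_kmem);     /* prints 6 */
        return 0;
}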
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b91a33fb6c69..225dddff091d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1561,6 +1561,10 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
return ret;
}
+/*
+ * The caller must guarantee the folio isn't large folio, except hugetlb.
+ * try_to_unmap() can't handle it.
+ */
int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
diff --git a/mm/memory.c b/mm/memory.c
index 5cb48f262ab0..b0cda5aab398 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -358,6 +358,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
{
struct unlink_vma_file_batch vb;
+ tlb_free_vmas(tlb);
+
do {
unsigned long addr = vma->vm_start;
struct vm_area_struct *next;
@@ -4313,26 +4315,6 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
-{
- struct swap_info_struct *si = swp_swap_info(entry);
- pgoff_t offset = swp_offset(entry);
- int i;
-
- /*
- * While allocating a large folio and doing swap_read_folio, which is
- * the case the being faulted pte doesn't have swapcache. We need to
- * ensure all PTEs have no cache as well, otherwise, we might go to
- * swap devices while the content is in swapcache.
- */
- for (i = 0; i < max_nr; i++) {
- if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
- return i;
- }
-
- return i;
-}
-
/*
* Check if the PTEs within a range are contiguous swap entries
* and have consistent swapcache, zeromap.
@@ -4668,8 +4650,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/*
* KSM sometimes has to copy on read faults, for example, if
- * page->index of !PageKSM() pages would be nonlinear inside the
- * anon VMA -- PageKSM() is lost on actual swapout.
+ * folio->index of non-ksm folios would be nonlinear inside the
+ * anon VMA -- the ksm flag is lost on actual swapout.
*/
folio = ksm_might_need_to_copy(folio, vma, vmf->address);
if (unlikely(!folio)) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 72fd72e156b1..3b1dfd08338b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -3708,15 +3708,13 @@ static void wi_state_free(void)
lockdep_is_held(&wi_state_lock));
if (!old_wi_state) {
mutex_unlock(&wi_state_lock);
- goto out;
+ return;
}
rcu_assign_pointer(wi_state, NULL);
mutex_unlock(&wi_state_lock);
synchronize_rcu();
kfree(old_wi_state);
-out:
- kfree(&wi_group->wi_kobj);
}
static struct kobj_attribute wi_auto_attr =
diff --git a/mm/migrate.c b/mm/migrate.c
index 8cf0f9c9599d..2c88f3b33833 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2399,6 +2399,7 @@ set_status:
static int get_compat_pages_array(const void __user *chunk_pages[],
const void __user * __user *pages,
+ unsigned long chunk_offset,
unsigned long chunk_nr)
{
compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
@@ -2406,7 +2407,7 @@ static int get_compat_pages_array(const void __user *chunk_pages[],
int i;
for (i = 0; i < chunk_nr; i++) {
- if (get_user(p, pages32 + i))
+ if (get_user(p, pages32 + chunk_offset + i))
return -EFAULT;
chunk_pages[i] = compat_ptr(p);
}
@@ -2425,27 +2426,28 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
#define DO_PAGES_STAT_CHUNK_NR 16UL
const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
int chunk_status[DO_PAGES_STAT_CHUNK_NR];
+ unsigned long chunk_offset = 0;
while (nr_pages) {
unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR);
if (in_compat_syscall()) {
if (get_compat_pages_array(chunk_pages, pages,
- chunk_nr))
+ chunk_offset, chunk_nr))
break;
} else {
- if (copy_from_user(chunk_pages, pages,
+ if (copy_from_user(chunk_pages, pages + chunk_offset,
chunk_nr * sizeof(*chunk_pages)))
break;
}
do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
- if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
+ if (copy_to_user(status + chunk_offset, chunk_status,
+ chunk_nr * sizeof(*status)))
break;
- pages += chunk_nr;
- status += chunk_nr;
+ chunk_offset += chunk_nr;
nr_pages -= chunk_nr;
}
return nr_pages ? -EFAULT : 0;
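
The migrate.c fix above affects the status-query mode of move_pages(2) when more than DO_PAGES_STAT_CHUNK_NR (16) entries are passed: with the old compat code, every chunk after the first re-read the start of the user array. A userspace sketch that exercises this chunked path (assumes libnuma's <numaif.h>; link with -lnuma):

#include <numaif.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define NPAGES 64       /* > 16, so do_pages_stat() has to work in chunks */

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        void *pages[NPAGES];
        int status[NPAGES];
        char *buf = mmap(NULL, NPAGES * psz, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        for (int i = 0; i < NPAGES; i++) {
                buf[i * psz] = 1;               /* fault the page in */
                pages[i] = buf + i * psz;
        }

        /* nodes == NULL: do not migrate, just report each page's node in status[]. */
        if (move_pages(0, NPAGES, pages, NULL, status, 0)) {
                perror("move_pages");
                return 1;
        }
        for (int i = 0; i < NPAGES; i++)
                printf("page %2d -> node %d\n", i, status[i]);
        return 0;
}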
diff --git a/mm/mm_init.c b/mm/mm_init.c
index f0bd0830daad..f2944748f526 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1509,7 +1509,7 @@ static inline void setup_usemap(struct zone *zone) {}
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
void __init set_pageblock_order(void)
{
- unsigned int order = MAX_PAGE_ORDER;
+ unsigned int order = PAGE_BLOCK_ORDER;
/* Check that pageblock_nr_pages has not already been setup */
if (pageblock_order)
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index db7ba4a725d6..b49cc6385f1f 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -424,6 +424,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
tlb->page_size = 0;
#endif
+ tlb->vma_pfn = 0;
__tlb_reset_range(tlb);
inc_tlb_flush_pending(tlb->mm);
diff --git a/mm/mremap.c b/mm/mremap.c
index 83e359754961..60f6b8d0d5f0 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -237,6 +237,8 @@ static int move_ptes(struct pagetable_move_control *pmc,
for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
new_pte++, new_addr += PAGE_SIZE) {
+ VM_WARN_ON_ONCE(!pte_none(*new_pte));
+
if (pte_none(ptep_get(old_pte)))
continue;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b603a59cf8f7..72b0ff0d4bae 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -608,7 +608,7 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc);
*/
static void writeout_period(struct timer_list *t)
{
- struct wb_domain *dom = from_timer(dom, t, period_timer);
+ struct wb_domain *dom = timer_container_of(dom, t, period_timer);
int miss_periods = (jiffies - dom->period_time) /
VM_COMPLETIONS_PERIOD_LEN;
@@ -2203,7 +2203,7 @@ static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int
void laptop_mode_timer_fn(struct timer_list *t)
{
struct backing_dev_info *backing_dev_info =
- from_timer(backing_dev_info, t, laptop_mode_wb_timer);
+ timer_container_of(backing_dev_info, t, laptop_mode_wb_timer);
wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER);
}
@@ -2565,11 +2565,11 @@ struct folio *writeback_iter(struct address_space *mapping,
if (!folio) {
/*
* To avoid deadlocks between range_cyclic writeback and callers
- * that hold pages in PageWriteback to aggregate I/O until
+ * that hold folios in writeback to aggregate I/O until
* the writeback iteration finishes, we do not loop back to the
- * start of the file. Doing so causes a page lock/page
+ * start of the file. Doing so causes a folio lock/folio
* writeback access order inversion - we should only ever lock
- * multiple pages in ascending page->index order, and looping
+ * multiple folios in ascending folio->index order, and looping
* back to the start of the file violates that rule and causes
* deadlocks.
*/
diff --git a/mm/rmap.c b/mm/rmap.c
index fb63d9256f09..1320b88fab74 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1845,23 +1845,32 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
#endif
}
-/* We support batch unmapping of PTEs for lazyfree large folios */
-static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
- struct folio *folio, pte_t *ptep)
+static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
+ struct page_vma_mapped_walk *pvmw,
+ enum ttu_flags flags, pte_t pte)
{
const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
- int max_nr = folio_nr_pages(folio);
- pte_t pte = ptep_get(ptep);
+ unsigned long end_addr, addr = pvmw->address;
+ struct vm_area_struct *vma = pvmw->vma;
+ unsigned int max_nr;
+
+ if (flags & TTU_HWPOISON)
+ return 1;
+ if (!folio_test_large(folio))
+ return 1;
+ /* We may only batch within a single VMA and a single page table. */
+ end_addr = pmd_addr_end(addr, vma->vm_end);
+ max_nr = (end_addr - addr) >> PAGE_SHIFT;
+
+ /* We only support lazyfree batching for now ... */
if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
- return false;
+ return 1;
if (pte_unused(pte))
- return false;
- if (pte_pfn(pte) != folio_pfn(folio))
- return false;
+ return 1;
- return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
- NULL, NULL) == max_nr;
+ return folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
+ NULL, NULL, NULL);
}
/*
@@ -2024,9 +2033,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
if (pte_dirty(pteval))
folio_mark_dirty(folio);
} else if (likely(pte_present(pteval))) {
- if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
- can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
- nr_pages = folio_nr_pages(folio);
+ nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
end_addr = address + nr_pages * PAGE_SIZE;
flush_cache_range(vma, address, end_addr);
@@ -2206,13 +2213,16 @@ discard:
hugetlb_remove_rmap(folio);
} else {
folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
- folio_ref_sub(folio, nr_pages - 1);
}
if (vma->vm_flags & VM_LOCKED)
mlock_drain_local();
- folio_put(folio);
- /* We have already batched the entire folio */
- if (nr_pages > 1)
+ folio_put_refs(folio, nr_pages);
+
+ /*
+ * If we are sure that we batched the entire folio and cleared
+ * all PTEs, we can just optimize and stop right here.
+ */
+ if (nr_pages == folio_nr_pages(folio))
goto walk_done;
continue;
walk_abort:
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 589b26c2d553..e042a4a0bc0c 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -195,18 +195,11 @@ static struct file *secretmem_file_create(unsigned long flags)
struct file *file;
struct inode *inode;
const char *anon_name = "[secretmem]";
- int err;
- inode = alloc_anon_inode(secretmem_mnt->mnt_sb);
+ inode = anon_inode_make_secure_inode(secretmem_mnt->mnt_sb, anon_name, NULL);
if (IS_ERR(inode))
return ERR_CAST(inode);
- err = security_inode_init_security_anon(inode, &QSTR(anon_name), NULL);
- if (err) {
- file = ERR_PTR(err);
- goto err_free_inode;
- }
-
file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
O_RDWR, &secretmem_fops);
if (IS_ERR(file))
@@ -268,7 +261,15 @@ err_put_fd:
static int secretmem_init_fs_context(struct fs_context *fc)
{
- return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM;
+ struct pseudo_fs_context *ctx;
+
+ ctx = init_pseudo(fc, SECRETMEM_MAGIC);
+ if (!ctx)
+ return -ENOMEM;
+
+ fc->s_iflags |= SB_I_NOEXEC;
+ fc->s_iflags |= SB_I_NODEV;
+ return 0;
}
static struct file_system_type secretmem_fs = {
@@ -286,9 +287,6 @@ static int __init secretmem_init(void)
if (IS_ERR(secretmem_mnt))
return PTR_ERR(secretmem_mnt);
- /* prevent secretmem mappings from ever getting PROT_EXEC */
- secretmem_mnt->mnt_flags |= MNT_NOEXEC;
-
return 0;
}
fs_initcall(secretmem_init);
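
The secretmem changes above move the exec/dev restrictions to superblock flags (SB_I_NOEXEC, SB_I_NODEV) and switch to anon_inode_make_secure_inode(); the userspace interface is unchanged. A minimal memfd_secret(2) usage sketch (requires a kernel with secretmem enabled, e.g. booted with secretmem.enable=1 where it defaults off, and headers that define SYS_memfd_secret):

#include <sys/syscall.h>
#include <sys/mman.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
        int fd = syscall(SYS_memfd_secret, 0);

        if (fd < 0) {
                perror("memfd_secret");
                return 1;
        }
        if (ftruncate(fd, 4096) < 0) {
                perror("ftruncate");
                return 1;
        }
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        strcpy(p, "not visible via the kernel's direct map");
        puts(p);
        munmap(p, 4096);
        close(fd);
        return 0;
}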
diff --git a/mm/shmem.c b/mm/shmem.c
index 858cee02ca49..3a5a65b1f41a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1446,8 +1446,6 @@ static int shmem_unuse_swap_entries(struct inode *inode,
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
- if (!xa_is_value(folio))
- continue;
error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
mapping_gfp_mask(mapping), NULL, NULL);
if (error == 0) {
@@ -1505,6 +1503,7 @@ int shmem_unuse(unsigned int type)
return 0;
mutex_lock(&shmem_swaplist_mutex);
+start_over:
list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
if (!info->swapped) {
list_del_init(&info->swaplist);
@@ -1523,13 +1522,15 @@ int shmem_unuse(unsigned int type)
cond_resched();
mutex_lock(&shmem_swaplist_mutex);
- next = list_next_entry(info, swaplist);
- if (!info->swapped)
- list_del_init(&info->swaplist);
if (atomic_dec_and_test(&info->stop_eviction))
wake_up_var(&info->stop_eviction);
if (error)
break;
+ if (list_empty(&info->swaplist))
+ goto start_over;
+ next = list_next_entry(info, swaplist);
+ if (!info->swapped)
+ list_del_init(&info->swaplist);
}
mutex_unlock(&shmem_swaplist_mutex);
@@ -1643,8 +1644,8 @@ try_split:
BUG_ON(folio_mapped(folio));
return swap_writeout(folio, wbc);
}
-
- list_del_init(&info->swaplist);
+ if (!info->swapped)
+ list_del_init(&info->swaplist);
mutex_unlock(&shmem_swaplist_mutex);
if (nr_pages > 1)
goto try_split;
@@ -2258,6 +2259,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
folio = swap_cache_get_folio(swap, NULL, 0);
order = xa_get_order(&mapping->i_pages, index);
if (!folio) {
+ int nr_pages = 1 << order;
bool fallback_order0 = false;
/* Or update major stats only when swapin succeeds?? */
@@ -2271,9 +2273,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
* If uffd is active for the vma, we need per-page fault
* fidelity to maintain the uffd semantics, then fallback
* to swapin order-0 folio, as well as for zswap case.
+ * Any existing sub folio in the swap cache also blocks
+ * mTHP swapin.
*/
if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) ||
- !zswap_never_enabled()))
+ !zswap_never_enabled() ||
+ non_swapcache_batch(swap, nr_pages) != nr_pages))
fallback_order0 = true;
/* Skip swapcache for synchronous device. */
@@ -2331,6 +2336,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
*/
split_order = shmem_split_large_entry(inode, index, swap, gfp);
if (split_order < 0) {
+ folio_put(folio);
+ folio = NULL;
error = split_order;
goto failed;
}
@@ -5805,12 +5812,12 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
if (size < 0 || size > MAX_LFS_FILESIZE)
return ERR_PTR(-EINVAL);
- if (shmem_acct_size(flags, size))
- return ERR_PTR(-ENOMEM);
-
if (is_idmapped_mnt(mnt))
return ERR_PTR(-EINVAL);
+ if (shmem_acct_size(flags, size))
+ return ERR_PTR(-ENOMEM);
+
inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
S_IFREG | S_IRWXUGO, 0, flags);
if (IS_ERR(inode)) {
diff --git a/mm/slub.c b/mm/slub.c
index be8b09e09d30..31e11ef256f9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2084,10 +2084,11 @@ prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
slab = virt_to_slab(p);
if (!slab_obj_exts(slab) &&
- WARN(alloc_slab_obj_exts(slab, s, flags, false),
- "%s, %s: Failed to create slab extension vector!\n",
- __func__, s->name))
+ alloc_slab_obj_exts(slab, s, flags, false)) {
+ pr_warn_once("%s, %s: Failed to create slab extension vector!\n",
+ __func__, s->name);
return NULL;
+ }
return slab_obj_exts(slab) + obj_to_index(s, slab, p);
}
@@ -4968,14 +4969,16 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
* We want to attempt a large physically contiguous block first because
* it is less likely to fragment multiple larger blocks and therefore
* contribute to a long term fragmentation less than vmalloc fallback.
- * However make sure that larger requests are not too disruptive - no
- * OOM killer and no allocation failure warnings as we have a fallback.
+ * However make sure that larger requests are not too disruptive - i.e.
+ * do not direct reclaim unless physically continuous memory is preferred
+ * (__GFP_RETRY_MAYFAIL mode). We still kick in kswapd/kcompactd to
+ * start working in the background
*/
if (size > PAGE_SIZE) {
flags |= __GFP_NOWARN;
if (!(flags & __GFP_RETRY_MAYFAIL))
- flags |= __GFP_NORETRY;
+ flags &= ~__GFP_DIRECT_RECLAIM;
/* nofail semantic is implemented by the vmalloc fallback */
flags &= ~__GFP_NOFAIL;
diff --git a/mm/swap.h b/mm/swap.h
index 2269eb9df0af..9096082a915e 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -106,6 +106,25 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
return find_next_bit(sis->zeromap, end, start) - start;
}
+static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+{
+ struct swap_info_struct *si = swp_swap_info(entry);
+ pgoff_t offset = swp_offset(entry);
+ int i;
+
+ /*
+ * While allocating a large folio and doing mTHP swapin, we need to
+ * ensure all entries are not cached, otherwise, the mTHP folio will
+ * be in conflict with the folio in swap cache.
+ */
+ for (i = 0; i < max_nr; i++) {
+ if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
+ return i;
+ }
+
+ return i;
+}
+
#else /* CONFIG_SWAP */
struct swap_iocb;
static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
@@ -199,6 +218,10 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
return 0;
}
+static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+{
+ return 0;
+}
#endif /* CONFIG_SWAP */
/**
diff --git a/mm/truncate.c b/mm/truncate.c
index f2aaf99f2990..91eb92a5ce4f 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -425,7 +425,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
- /* We rely upon deletion not changing page->index */
+ /* We rely upon deletion not changing folio->index */
if (xa_is_value(folio))
continue;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index bc473ad21202..8253978ee0fb 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1084,8 +1084,18 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
pte_t orig_dst_pte, pte_t orig_src_pte,
pmd_t *dst_pmd, pmd_t dst_pmdval,
spinlock_t *dst_ptl, spinlock_t *src_ptl,
- struct folio *src_folio)
+ struct folio *src_folio,
+ struct swap_info_struct *si, swp_entry_t entry)
{
+ /*
+ * Check if the folio still belongs to the target swap entry after
+	 * acquiring the lock. The folio can be freed from the swap cache
+	 * while it is not locked.
+ */
+ if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
+ entry.val != src_folio->swap.val))
+ return -EAGAIN;
+
double_pt_lock(dst_ptl, src_ptl);
if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
@@ -1102,6 +1112,25 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
if (src_folio) {
folio_move_anon_rmap(src_folio, dst_vma);
src_folio->index = linear_page_index(dst_vma, dst_addr);
+ } else {
+ /*
+ * Check if the swap entry is cached after acquiring the src_pte
+ * lock. Otherwise, we might miss a newly loaded swap cache folio.
+ *
+		 * Check swap_map directly to minimize overhead; READ_ONCE is
+		 * sufficient. We are trying to catch a newly added swap cache
+		 * folio. The only possible case is a folio that was swapped in
+		 * and out again, staying in the swap cache and reusing the same
+		 * entry, before the PTE check above. The PTL is acquired and
+		 * released twice, each time after the swap_map flag is updated,
+		 * so holding the PTL here ensures we see the updated value.
+		 * False positives are possible, e.g. a SWP_SYNCHRONOUS_IO swapin
+		 * may set the flag without touching the cache, or during the
+		 * tiny synchronization window between swap cache and swap_map,
+		 * but they clear quickly and the worst result is retry jitter.
+ */
+ if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
+ double_pt_unlock(dst_ptl, src_ptl);
+ return -EAGAIN;
+ }
}
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
@@ -1412,7 +1441,7 @@ retry:
}
err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
- dst_ptl, src_ptl, src_folio);
+ dst_ptl, src_ptl, src_folio, si, entry);
}
out:
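
Condensed, the re-validation added to move_swap_pte() above follows the usual
check-under-lock pattern; this sketch trims arguments and unrelated logic and
is not the complete function:

	/* Folio case: re-check the folio's swap entry after it was locked. */
	if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
				  entry.val != src_folio->swap.val))
		return -EAGAIN;

	double_pt_lock(dst_ptl, src_ptl);
	/* ... PTE stability checks ... */

	/* Bare swap entry case: re-check swap_map under the PTL. */
	if (!src_folio &&
	    (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE)) {
		double_pt_unlock(dst_ptl, src_ptl);
		return -EAGAIN;
	}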
diff --git a/mm/util.c b/mm/util.c
index 448117da071f..0b270c43d7d1 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1131,3 +1131,43 @@ void flush_dcache_folio(struct folio *folio)
}
EXPORT_SYMBOL(flush_dcache_folio);
#endif
+
+/**
+ * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an
+ * existing VMA
+ * @file: The file which possesses an f_op->mmap_prepare() hook
+ * @vma: The VMA to apply the .mmap_prepare() hook to.
+ *
+ * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain
+ * 'wrapper' file systems invoke a nested mmap hook of an underlying file.
+ *
+ * Until all filesystems are converted to use .mmap_prepare(), we must be
+ * conservative and continue to invoke these 'wrapper' filesystems using the
+ * deprecated .mmap() hook.
+ *
+ * However, we have a problem if the underlying file system possesses an
+ * .mmap_prepare() hook, as we are in a different context when we invoke the
+ * .mmap() hook, already having a VMA to deal with.
+ *
+ * compat_vma_mmap_prepare() is a compatibility function that takes VMA state,
+ * establishes a struct vm_area_desc descriptor, passes it to the underlying
+ * .mmap_prepare() hook, and applies any changes performed by it.
+ *
+ * Once the conversion of filesystems is complete this function will no longer
+ * be required and will be removed.
+ *
+ * Returns: 0 on success, or an error code on failure.
+ */
+int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma)
+{
+ struct vm_area_desc desc;
+ int err;
+
+ err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc));
+ if (err)
+ return err;
+ set_vma_from_desc(vma, &desc);
+
+ return 0;
+}
+EXPORT_SYMBOL(compat_vma_mmap_prepare);
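
As a usage illustration, a 'wrapper' file system that still implements the
legacy .mmap() hook could forward to its underlying file as sketched below;
wrapper_mmap() and wrapper_backing_file() are hypothetical names used only for
this example:

static int wrapper_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* hypothetical helper returning the wrapped (underlying) file */
	struct file *backing = wrapper_backing_file(file);

	/* underlying fs already converted: reuse its .mmap_prepare() hook */
	if (backing->f_op->mmap_prepare)
		return compat_vma_mmap_prepare(backing, vma);

	/* otherwise fall back to the deprecated .mmap() hook */
	return call_mmap(backing, vma);
}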
diff --git a/mm/vma.c b/mm/vma.c
index 1c6595f282e5..fef67a66a095 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -169,6 +169,9 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
vp->file = vma->vm_file;
if (vp->file)
vp->mapping = vma->vm_file->f_mapping;
+
+ if (vmg && vmg->skip_vma_uprobe)
+ vp->skip_vma_uprobe = true;
}
/*
@@ -358,10 +361,13 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
if (vp->file) {
i_mmap_unlock_write(vp->mapping);
- uprobe_mmap(vp->vma);
- if (vp->adj_next)
- uprobe_mmap(vp->adj_next);
+ if (!vp->skip_vma_uprobe) {
+ uprobe_mmap(vp->vma);
+
+ if (vp->adj_next)
+ uprobe_mmap(vp->adj_next);
+ }
}
if (vp->remove) {
@@ -539,7 +545,14 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
init_vma_prep(&vp, vma);
vp.insert = new;
vma_prepare(&vp);
+
+ /*
+ * Get rid of huge pages and shared page tables straddling the split
+ * boundary.
+ */
vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_split(vma, addr);
if (new_below) {
vma->vm_start = addr;
@@ -954,26 +967,9 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
err = dup_anon_vma(next, middle, &anon_dup);
}
- if (err)
+ if (err || commit_merge(vmg))
goto abort;
- err = commit_merge(vmg);
- if (err) {
- VM_WARN_ON(err != -ENOMEM);
-
- if (anon_dup)
- unlink_anon_vmas(anon_dup);
-
- /*
- * We've cleaned up any cloned anon_vma's, no VMAs have been
- * modified, no harm no foul if the user requests that we not
- * report this and just give up, leaving the VMAs unmerged.
- */
- if (!vmg->give_up_on_oom)
- vmg->state = VMA_MERGE_ERROR_NOMEM;
- return NULL;
- }
-
khugepaged_enter_vma(vmg->target, vmg->flags);
vmg->state = VMA_MERGE_SUCCESS;
return vmg->target;
@@ -982,6 +978,9 @@ abort:
vma_iter_set(vmg->vmi, start);
vma_iter_load(vmg->vmi);
+ if (anon_dup)
+ unlink_anon_vmas(anon_dup);
+
/*
* This means we have failed to clone anon_vma's correctly, but no
* actual changes to VMAs have occurred, so no harm no foul - if the
@@ -1823,6 +1822,14 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
faulted_in_anon_vma = false;
}
+ /*
+ * If the VMA we are copying might contain a uprobe PTE, ensure
+ * that we do not establish one upon merge. Otherwise, when mremap()
+ * moves page tables, it will orphan the newly created PTE.
+ */
+ if (vma->vm_file)
+ vmg.skip_vma_uprobe = true;
+
new_vma = find_vma_prev(mm, addr, &vmg.prev);
if (new_vma && new_vma->vm_start < addr + len)
return NULL; /* should never get here */
@@ -3106,7 +3113,6 @@ int __vm_munmap(unsigned long start, size_t len, bool unlock)
return ret;
}
-
/* Insert vm structure into process list sorted by address
* and into the inode's i_mmap tree. If vm_file is non-NULL
* then i_mmap_rwsem is taken here.
diff --git a/mm/vma.h b/mm/vma.h
index 9a8af9be29a8..f47112a352db 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -19,6 +19,8 @@ struct vma_prepare {
struct vm_area_struct *insert;
struct vm_area_struct *remove;
struct vm_area_struct *remove2;
+
+ bool skip_vma_uprobe :1;
};
struct unlink_vma_file_batch {
@@ -120,6 +122,11 @@ struct vma_merge_struct {
*/
bool give_up_on_oom :1;
+ /*
+	 * If set, skip uprobe_mmap() on the merged VMA.
+ */
+ bool skip_vma_uprobe :1;
+
/* Internal flags set during merge process: */
/*
@@ -215,6 +222,53 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
return 0;
}
+
+/*
+ * Temporary helper functions for file systems that wrap an invocation of
+ * f_op->mmap() but whose underlying file system might implement
+ * f_op->mmap_prepare().
+ */
+
+static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma,
+ struct vm_area_desc *desc)
+{
+ desc->mm = vma->vm_mm;
+ desc->start = vma->vm_start;
+ desc->end = vma->vm_end;
+
+ desc->pgoff = vma->vm_pgoff;
+ desc->file = vma->vm_file;
+ desc->vm_flags = vma->vm_flags;
+ desc->page_prot = vma->vm_page_prot;
+
+ desc->vm_ops = NULL;
+ desc->private_data = NULL;
+
+ return desc;
+}
+
+static inline void set_vma_from_desc(struct vm_area_struct *vma,
+ struct vm_area_desc *desc)
+{
+ /*
+ * Since we're invoking .mmap_prepare() despite having a partially
+ * established VMA, we must take care to handle setting fields
+ * correctly.
+ */
+
+ /* Mutable fields. Populated with initial state. */
+ vma->vm_pgoff = desc->pgoff;
+ if (vma->vm_file != desc->file)
+ vma_set_file(vma, desc->file);
+ if (vma->vm_flags != desc->vm_flags)
+ vm_flags_set(vma, desc->vm_flags);
+ vma->vm_page_prot = desc->page_prot;
+
+ /* User-defined fields. */
+ vma->vm_ops = desc->vm_ops;
+ vma->vm_private_data = desc->private_data;
+}
+
int
do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct mm_struct *mm, unsigned long start,
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ab986dd09b6a..6dbcdceecae1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -514,6 +514,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
+ int err = 0;
pte_t *pte;
/*
@@ -530,12 +531,18 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
do {
struct page *page = pages[*nr];
- if (WARN_ON(!pte_none(ptep_get(pte))))
- return -EBUSY;
- if (WARN_ON(!page))
- return -ENOMEM;
- if (WARN_ON(!pfn_valid(page_to_pfn(page))))
- return -EINVAL;
+ if (WARN_ON(!pte_none(ptep_get(pte)))) {
+ err = -EBUSY;
+ break;
+ }
+ if (WARN_ON(!page)) {
+ err = -ENOMEM;
+ break;
+ }
+ if (WARN_ON(!pfn_valid(page_to_pfn(page)))) {
+ err = -EINVAL;
+ break;
+ }
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*nr)++;
@@ -543,7 +550,8 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
- return 0;
+
+ return err;
}
static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f2858cc92c2c..424412680cfc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1138,6 +1138,14 @@ retry:
goto keep;
if (folio_contain_hwpoisoned_page(folio)) {
+ /*
+			 * unmap_poisoned_folio() can't handle a large
+			 * folio; just skip it. memory_failure() will
+			 * handle it if the UCE is triggered again.
+ */
+ if (folio_test_large(folio))
+ goto keep_locked;
+
unmap_poisoned_folio(folio, folio_pfn(folio), false);
folio_unlock(folio);
folio_put(folio);
@@ -1197,8 +1205,10 @@ retry:
* 2) Global or new memcg reclaim encounters a folio that is
* not marked for immediate reclaim, or the caller does not
* have __GFP_FS (or __GFP_IO if it's simply going to swap,
- * not to fs). In this case mark the folio for immediate
- * reclaim and continue scanning.
+ * not to fs), or the folio belongs to a mapping where
+ * waiting on writeback during reclaim may lead to a deadlock.
+ * In this case mark the folio for immediate reclaim and
+ * continue scanning.
*
* Require may_enter_fs() because we would wait on fs, which
* may not have submitted I/O yet. And the loop driver might
@@ -1223,6 +1233,8 @@ retry:
* takes to write them to disk.
*/
if (folio_test_writeback(folio)) {
+ mapping = folio_mapping(folio);
+
/* Case 1 above */
if (current_is_kswapd() &&
folio_test_reclaim(folio) &&
@@ -1233,7 +1245,9 @@ retry:
/* Case 2 above */
} else if (writeback_throttling_sane(sc) ||
!folio_test_reclaim(folio) ||
- !may_enter_fs(folio, sc->gfp_mask)) {
+ !may_enter_fs(folio, sc->gfp_mask) ||
+ (mapping &&
+ mapping_writeback_may_deadlock_on_reclaim(mapping))) {
/*
* This is slightly racy -
* folio_end_writeback() might have
diff --git a/mm/vmstat.c b/mm/vmstat.c
index d888c248d99f..a78d70ddeacd 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1201,7 +1201,6 @@ const char * const vmstat_text[] = {
"nr_zone_unevictable",
"nr_zone_write_pending",
"nr_mlock",
- "nr_bounce",
#if IS_ENABLED(CONFIG_ZSMALLOC)
"nr_zspages",
#endif
diff --git a/mm/zpdesc.h b/mm/zpdesc.h
index 57e7a4d6c6ca..d3df316e5bb7 100644
--- a/mm/zpdesc.h
+++ b/mm/zpdesc.h
@@ -54,8 +54,8 @@ struct zpdesc {
ZPDESC_MATCH(flags, flags);
ZPDESC_MATCH(lru, lru);
ZPDESC_MATCH(mapping, movable_ops);
-ZPDESC_MATCH(index, next);
-ZPDESC_MATCH(index, handle);
+ZPDESC_MATCH(__folio_index, next);
+ZPDESC_MATCH(__folio_index, handle);
ZPDESC_MATCH(private, zspage);
ZPDESC_MATCH(page_type, first_obj_offset);
ZPDESC_MATCH(_refcount, _refcount);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 999b513c7fdf..f3e2215f95eb 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1043,6 +1043,9 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
if (!zspage)
return NULL;
+ if (!IS_ENABLED(CONFIG_COMPACTION))
+ gfp &= ~__GFP_MOVABLE;
+
zspage->magic = ZSPAGE_MAGIC;
zspage->pool = pool;
zspage->class = class->index;